Compressing stories. Also fixing compression of feed_pages bootstrap.

This commit is contained in:
Samuel Clay 2010-08-29 13:23:50 -04:00
parent e26ee9dbcc
commit c9d06f9358
3 changed files with 38 additions and 10 deletions

View file

@ -332,10 +332,10 @@ class Feed(models.Model):
# logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story.story_content), len(story_content)))
original_content = None
if existing_story.get('story_original_content'):
original_content = existing_story.get('story_original_content')
if existing_story.get('story_original_content_z'):
original_content = zlib.decompress(existing_story.get('story_original_content_z'))
else:
original_content = existing_story.get('story_content')
original_content = zlib.decompress(existing_story.get('story_content_z'))
# print 'Type: %s %s' % (type(original_content), type(story_content))
if len(story_content) > 10:
diff = HTMLDiff(unicode(original_content), story_content)
@ -462,7 +462,7 @@ class Feed(models.Model):
story['story_date'] = story_db.story_date
story['story_authors'] = story_db.story_author_name
story['story_title'] = story_db.story_title
story['story_content'] = story_db.story_content
story['story_content'] = story_db.story_content_z and zlib.decompress(story_db.story_content_z)
story['story_permalink'] = story_db.story_permalink
story['story_feed_id'] = self.pk
story['id'] = story_db.id
@ -720,7 +720,9 @@ class MStory(mongo.Document):
story_date = mongo.DateTimeField()
story_title = mongo.StringField(max_length=1024)
story_content = mongo.StringField()
story_content_z = mongo.BinaryField()
story_original_content = mongo.StringField()
story_original_content_z = mongo.BinaryField()
story_content_type = mongo.StringField(max_length=255)
story_author_name = mongo.StringField()
story_permalink = mongo.StringField()
@ -734,6 +736,15 @@ class MStory(mongo.Document):
'ordering': ['-story_date'],
'allow_inheritance': False,
}
def save(self, *args, **kwargs):
    """Compress the story text fields with zlib, then save the document.

    For each of ``story_content`` and ``story_original_content`` that holds
    a non-empty value, the zlib-compressed bytes are written to the matching
    ``*_z`` field and the plain-text field is reset to ``None``. Empty or
    unset fields are left untouched, so an already-compressed document can
    be saved again without losing its ``*_z`` data.

    Unicode text is encoded as UTF-8 before compression. Passing a unicode
    object straight to ``zlib.compress`` makes Python 2 encode it with the
    default ASCII codec, which raises ``UnicodeEncodeError`` as soon as a
    story contains a non-ASCII character. Readers that decompress the
    ``*_z`` fields therefore get UTF-8 encoded bytes back.

    All positional and keyword arguments are forwarded unchanged to the
    parent class's ``save``.
    """
    for field_name in ('story_content', 'story_original_content'):
        text = getattr(self, field_name)
        if not text:
            continue
        if isinstance(text, unicode):
            # zlib works on bytes; encode explicitly instead of relying on
            # the implicit ASCII conversion.
            text = text.encode('utf-8')
        setattr(self, field_name + '_z', zlib.compress(text))
        setattr(self, field_name, None)
    super(MStory, self).save(*args, **kwargs)
class FeedUpdateHistory(models.Model):
fetch_date = models.DateTimeField(default=datetime.datetime.now)

View file

@ -109,13 +109,13 @@ def bootstrap_classifiers():
def bootstrap_feedpages():
print "Mongo DB feed_pages: %s" % MFeedPage.objects().count()
db.feed_pages.drop()
# db.feed_pages.drop()
print "Dropped! Mongo DB feed_pages: %s" % MFeedPage.objects().count()
print "FeedPages: %s" % MFeedPage.objects().count()
pprint(db.feed_pages.index_information())
feeds = Feed.objects.all().order_by('-average_stories_per_month')
feeds = Feed.objects.filter(average_stories_per_month=0).order_by('-average_stories_per_month')
feed_count = feeds.count()
i = 0
for feed in feeds:
@ -127,13 +127,31 @@ def bootstrap_feedpages():
if feed_page:
del feed_page[0]['id']
feed_page[0]['feed_id'] = feed.pk
MFeedPage(**feed_page[0]).save()
try:
MFeedPage(**feed_page[0]).save()
except:
print '\n\n!\n\n'
continue
print "\nMongo DB feed_pages: %s" % MFeedPage.objects().count()
def compress_stories():
count = MStory.objects().count()
print "Mongo DB stories: %s" % count
p = 0.0
i = 0
for story in MStory.objects():
i += 1.0
if round(i / count * 100) != p:
p = round(i / count * 100)
print '%s%%' % p
story.save()
# Script entry point: only the steps that are not commented out below run
# when this file is executed directly.
if __name__ == '__main__':
# bootstrap_stories()
# bootstrap_userstories()
# bootstrap_classifiers()
# NOTE(review): the two consecutive bootstrap_feedpages() lines below look
# like the removed and added sides of one diff line rather than two separate
# calls -- confirm against the actual file.
bootstrap_feedpages()
bootstrap_feedpages()
# Walks every MStory document and saves it again (see compress_stories).
compress_stories()

View file

@ -93,8 +93,7 @@ class ProcessFeed:
ENTRY_SAME:0,
ENTRY_ERR:0}
logging.debug(u' ---> [%d] Processing %s' % (self.feed.id,
self.feed.feed_title))
# logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))
if hasattr(self.fpf, 'status'):
if self.options['verbose']: