Compressing stories. Also fixing compression of feed_pages bootstrap.

This commit is contained in:
Samuel Clay 2010-08-29 13:23:50 -04:00
parent e26ee9dbcc
commit c9d06f9358
3 changed files with 38 additions and 10 deletions

View file

@ -332,10 +332,10 @@ class Feed(models.Model):
# logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story.story_content), len(story_content))) # logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story.story_content), len(story_content)))
original_content = None original_content = None
if existing_story.get('story_original_content'): if existing_story.get('story_original_content_z'):
original_content = existing_story.get('story_original_content') original_content = zlib.decompress(existing_story.get('story_original_content_z'))
else: else:
original_content = existing_story.get('story_content') original_content = zlib.decompress(existing_story.get('story_content_z'))
# print 'Type: %s %s' % (type(original_content), type(story_content)) # print 'Type: %s %s' % (type(original_content), type(story_content))
if len(story_content) > 10: if len(story_content) > 10:
diff = HTMLDiff(unicode(original_content), story_content) diff = HTMLDiff(unicode(original_content), story_content)
@ -462,7 +462,7 @@ class Feed(models.Model):
story['story_date'] = story_db.story_date story['story_date'] = story_db.story_date
story['story_authors'] = story_db.story_author_name story['story_authors'] = story_db.story_author_name
story['story_title'] = story_db.story_title story['story_title'] = story_db.story_title
story['story_content'] = story_db.story_content story['story_content'] = story_db.story_content_z and zlib.decompress(story_db.story_content_z)
story['story_permalink'] = story_db.story_permalink story['story_permalink'] = story_db.story_permalink
story['story_feed_id'] = self.pk story['story_feed_id'] = self.pk
story['id'] = story_db.id story['id'] = story_db.id
@ -720,7 +720,9 @@ class MStory(mongo.Document):
story_date = mongo.DateTimeField() story_date = mongo.DateTimeField()
story_title = mongo.StringField(max_length=1024) story_title = mongo.StringField(max_length=1024)
story_content = mongo.StringField() story_content = mongo.StringField()
story_content_z = mongo.BinaryField()
story_original_content = mongo.StringField() story_original_content = mongo.StringField()
story_original_content_z = mongo.BinaryField()
story_content_type = mongo.StringField(max_length=255) story_content_type = mongo.StringField(max_length=255)
story_author_name = mongo.StringField() story_author_name = mongo.StringField()
story_permalink = mongo.StringField() story_permalink = mongo.StringField()
@ -734,6 +736,15 @@ class MStory(mongo.Document):
'ordering': ['-story_date'], 'ordering': ['-story_date'],
'allow_inheritance': False, 'allow_inheritance': False,
} }
def save(self, *args, **kwargs):
    """Compress the story body fields, then persist the document.

    Non-empty ``story_content`` and ``story_original_content`` values are
    zlib-compressed into ``story_content_z`` and
    ``story_original_content_z`` respectively, and the plain-text
    attributes are reset to ``None``. Empty or unset text fields are left
    untouched, so saving a story whose text was already moved into the
    ``*_z`` fields does not disturb them.

    Unicode text is encoded as UTF-8 before compression, so the bytes
    produced by ``zlib.decompress`` on a ``*_z`` field are UTF-8 encoded.

    All positional and keyword arguments are forwarded unchanged to the
    parent class's ``save``; its result is returned.
    """
    def _to_bytes(text):
        # zlib.compress only accepts byte strings. Handing it a unicode
        # object triggers an implicit ASCII encode, which raises
        # UnicodeEncodeError on the first non-ASCII character, so encode
        # explicitly instead.
        if isinstance(text, unicode):
            return text.encode('utf-8')
        return text

    if self.story_content:
        self.story_content_z = zlib.compress(_to_bytes(self.story_content))
        self.story_content = None
    if self.story_original_content:
        self.story_original_content_z = zlib.compress(
            _to_bytes(self.story_original_content))
        self.story_original_content = None
    return super(MStory, self).save(*args, **kwargs)
class FeedUpdateHistory(models.Model): class FeedUpdateHistory(models.Model):
fetch_date = models.DateTimeField(default=datetime.datetime.now) fetch_date = models.DateTimeField(default=datetime.datetime.now)

View file

@ -109,13 +109,13 @@ def bootstrap_classifiers():
def bootstrap_feedpages(): def bootstrap_feedpages():
print "Mongo DB feed_pages: %s" % MFeedPage.objects().count() print "Mongo DB feed_pages: %s" % MFeedPage.objects().count()
db.feed_pages.drop() # db.feed_pages.drop()
print "Dropped! Mongo DB feed_pages: %s" % MFeedPage.objects().count() print "Dropped! Mongo DB feed_pages: %s" % MFeedPage.objects().count()
print "FeedPages: %s" % MFeedPage.objects().count() print "FeedPages: %s" % MFeedPage.objects().count()
pprint(db.feed_pages.index_information()) pprint(db.feed_pages.index_information())
feeds = Feed.objects.all().order_by('-average_stories_per_month') feeds = Feed.objects.filter(average_stories_per_month=0).order_by('-average_stories_per_month')
feed_count = feeds.count() feed_count = feeds.count()
i = 0 i = 0
for feed in feeds: for feed in feeds:
@ -127,13 +127,31 @@ def bootstrap_feedpages():
if feed_page: if feed_page:
del feed_page[0]['id'] del feed_page[0]['id']
feed_page[0]['feed_id'] = feed.pk feed_page[0]['feed_id'] = feed.pk
MFeedPage(**feed_page[0]).save() try:
MFeedPage(**feed_page[0]).save()
except:
print '\n\n!\n\n'
continue
print "\nMongo DB feed_pages: %s" % MFeedPage.objects().count() print "\nMongo DB feed_pages: %s" % MFeedPage.objects().count()
def compress_stories():
    """Re-save every MStory document, printing progress as it goes.

    Prints the total number of stories first, then a percentage line each
    time the rounded percentage of stories visited changes. Every story is
    passed through ``save()``.
    """
    total = MStory.objects().count()
    print("Mongo DB stories: %s" % total)

    last_percent = 0.0
    for position, story in enumerate(MStory.objects(), 1):
        # float() keeps the division fractional under Python 2.
        percent = round(float(position) / total * 100)
        if percent != last_percent:
            last_percent = percent
            print('%s%%' % last_percent)
        story.save()
if __name__ == '__main__': if __name__ == '__main__':
# bootstrap_stories() # bootstrap_stories()
# bootstrap_userstories() # bootstrap_userstories()
# bootstrap_classifiers() # bootstrap_classifiers()
bootstrap_feedpages() bootstrap_feedpages()
compress_stories()

View file

@ -93,8 +93,7 @@ class ProcessFeed:
ENTRY_SAME:0, ENTRY_SAME:0,
ENTRY_ERR:0} ENTRY_ERR:0}
logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))
self.feed.feed_title))
if hasattr(self.fpf, 'status'): if hasattr(self.fpf, 'status'):
if self.options['verbose']: if self.options['verbose']: