diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py index 5f7562574..04ac24cd4 100644 --- a/apps/rss_feeds/models.py +++ b/apps/rss_feeds/models.py @@ -332,10 +332,10 @@ class Feed(models.Model): # logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story.story_content), len(story_content))) original_content = None - if existing_story.get('story_original_content'): - original_content = existing_story.get('story_original_content') + if existing_story.get('story_original_content_z'): + original_content = zlib.decompress(existing_story.get('story_original_content_z')) else: - original_content = existing_story.get('story_content') + original_content = zlib.decompress(existing_story.get('story_content_z')) # print 'Type: %s %s' % (type(original_content), type(story_content)) if len(story_content) > 10: diff = HTMLDiff(unicode(original_content), story_content) @@ -462,7 +462,7 @@ class Feed(models.Model): story['story_date'] = story_db.story_date story['story_authors'] = story_db.story_author_name story['story_title'] = story_db.story_title - story['story_content'] = story_db.story_content + story['story_content'] = story_db.story_content_z and zlib.decompress(story_db.story_content_z) story['story_permalink'] = story_db.story_permalink story['story_feed_id'] = self.pk story['id'] = story_db.id @@ -720,7 +720,9 @@ class MStory(mongo.Document): story_date = mongo.DateTimeField() story_title = mongo.StringField(max_length=1024) story_content = mongo.StringField() + story_content_z = mongo.BinaryField() story_original_content = mongo.StringField() + story_original_content_z = mongo.BinaryField() story_content_type = mongo.StringField(max_length=255) story_author_name = mongo.StringField() story_permalink = mongo.StringField() @@ -734,6 +736,15 @@ class MStory(mongo.Document): 'ordering': ['-story_date'], 'allow_inheritance': False, } + + def save(self, *args, **kwargs): + if self.story_content: + self.story_content_z = zlib.compress(self.story_content) + self.story_content = None + if self.story_original_content: + self.story_original_content_z = zlib.compress(self.story_original_content) + self.story_original_content = None + super(MStory, self).save(*args, **kwargs) class FeedUpdateHistory(models.Model): fetch_date = models.DateTimeField(default=datetime.datetime.now) diff --git a/utils/bootstrap_mongo.py b/utils/bootstrap_mongo.py index 7b4674d09..0763d4de3 100644 --- a/utils/bootstrap_mongo.py +++ b/utils/bootstrap_mongo.py @@ -109,13 +109,13 @@ def bootstrap_classifiers(): def bootstrap_feedpages(): print "Mongo DB feed_pages: %s" % MFeedPage.objects().count() - db.feed_pages.drop() + # db.feed_pages.drop() print "Dropped! Mongo DB feed_pages: %s" % MFeedPage.objects().count() print "FeedPages: %s" % MFeedPage.objects().count() pprint(db.feed_pages.index_information()) - feeds = Feed.objects.all().order_by('-average_stories_per_month') + feeds = Feed.objects.filter(average_stories_per_month=0).order_by('-average_stories_per_month') feed_count = feeds.count() i = 0 for feed in feeds: @@ -127,13 +127,31 @@ def bootstrap_feedpages(): if feed_page: del feed_page[0]['id'] feed_page[0]['feed_id'] = feed.pk - MFeedPage(**feed_page[0]).save() + try: + MFeedPage(**feed_page[0]).save() + except: + print '\n\n!\n\n' + continue print "\nMongo DB feed_pages: %s" % MFeedPage.objects().count() +def compress_stories(): + count = MStory.objects().count() + print "Mongo DB stories: %s" % count + p = 0.0 + i = 0 + for story in MStory.objects(): + i += 1.0 + if round(i / count * 100) != p: + p = round(i / count * 100) + print '%s%%' % p + story.save() + + if __name__ == '__main__': # bootstrap_stories() # bootstrap_userstories() # bootstrap_classifiers() - bootstrap_feedpages() \ No newline at end of file + bootstrap_feedpages() + compress_stories() \ No newline at end of file diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py index 9df9074a7..35cf198f8 100644 --- a/utils/feed_fetcher.py +++ b/utils/feed_fetcher.py @@ -93,8 +93,7 @@ class ProcessFeed: ENTRY_SAME:0, ENTRY_ERR:0} - logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, - self.feed.feed_title)) + # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title)) if hasattr(self.fpf, 'status'): if self.options['verbose']: