From 49d8b6d505abcdcdffec13c05d70699d652dcab8 Mon Sep 17 00:00:00 2001
From: Samuel Clay
Date: Wed, 25 Aug 2010 19:10:55 -0400
Subject: [PATCH] Huge fix for exception feeds. Now tries to fetch the correct
 URL from the feed_link or the feed_address (if the RSS URL is actually HTML),
 and uses that to figure out the right feed. Also merges feeds if a duplicate
 is found during this process. Really nifty. Let's hope nobody dies.

---
 apps/rss_feeds/models.py | 143 ++++++++++++++++++++++++++++++++++++++-
 utils/feed_fetcher.py    |   5 +-
 2 files changed, 146 insertions(+), 2 deletions(-)

diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py
index f35c088ad..eab059362 100644
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -15,12 +15,14 @@ from django.db import models
 from django.db import IntegrityError
 from django.core.cache import cache
 from utils import json
+from utils import feedfinder
 from utils.feed_functions import levenshtein_distance
 from utils.story_functions import format_story_link_date__short
 from utils.story_functions import format_story_link_date__long
 from utils.story_functions import pre_process_story
 from utils.compressed_textfield import StoryField
 from utils.diff import HTMLDiff
+from utils import log as logging
 
 ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
 
@@ -69,13 +71,40 @@ class Feed(models.Model):
         self.count_stories(lock=lock)
         self.save_popular_authors(lock=lock)
         self.save_popular_tags(lock=lock)
+
+    def check_feed_address_for_feed_link(self):
+        feed_address = None
+
+        if not feedfinder.isFeed(self.feed_address):
+            feed_address = feedfinder.feed(self.feed_address)
+            if not feed_address:
+                feed_address = feedfinder.feed(self.feed_link)
+        else:
+            feed_address_from_link = feedfinder.feed(self.feed_link)
+            if feed_address_from_link != self.feed_address:
+                feed_address = feed_address_from_link
+
+        if feed_address:
+            try:
+                self.feed_address = feed_address
+                self.next_scheduled_update = datetime.datetime.now()
+                self.has_exception = False
+                self.save()
+            except IntegrityError:
+                # Another feed already owns this address, so merge into it.
+                original_feed = Feed.objects.get(feed_address=feed_address)
+                original_feed.has_exception = False
+                original_feed.save()
+                merge_feeds(original_feed.pk, self.pk)
+
+        return bool(feed_address)
 
     def save_feed_history(self, status_code, message, exception=None):
         FeedFetchHistory.objects.create(feed=self, 
                                         status_code=status_code,
                                         message=message,
                                         exception=exception)
-        old_fetch_histories = self.feed_fetch_history.all()[10:]
+        old_fetch_histories = self.feed_fetch_history.all().order_by('-fetch_date')[10:]
         for history in old_fetch_histories:
             history.delete()
 
@@ -545,6 +574,11 @@
         self.next_scheduled_update = next_scheduled_update
 
         self.save(lock=lock)
+
+    def reset_next_scheduled_update(self, lock=None):
+        self.next_scheduled_update = datetime.datetime.now()
+
+        self.save(lock=lock)
 
     def calculate_collocations_story_content(self,
                                              collocation_measures=TrigramAssocMeasures,
@@ -724,3 +758,110 @@ class DuplicateFeed(models.Model):
     duplicate_address = models.CharField(max_length=255, unique=True)
     feed = models.ForeignKey(Feed, related_name='duplicate_addresses')
 
+
+def merge_feeds(original_feed_id, duplicate_feed_id):
+    from apps.reader.models import UserSubscription, UserSubscriptionFolders, MUserStory
+    from apps.analyzer.models import MClassifierTitle, MClassifierAuthor, MClassifierFeed, MClassifierTag
+    try:
+        original_feed = Feed.objects.get(pk=original_feed_id)
+        duplicate_feed = Feed.objects.get(pk=duplicate_feed_id)
+    except Feed.DoesNotExist:
logging.info(" ***> Already deleted feed: %s" % duplicate_feed_id) + return + + logging.info(" ---> Feed: [%s - %s] %s - %s" % (original_feed_id, duplicate_feed_id, + original_feed, original_feed.feed_link)) + logging.info(" --> %s" % original_feed.feed_address) + logging.info(" --> %s" % duplicate_feed.feed_address) + + user_subs = UserSubscription.objects.filter(feed=duplicate_feed) + for user_sub in user_subs: + # Rewrite feed in subscription folders + try: + user_sub_folders = UserSubscriptionFolders.objects.get(user=user_sub.user) + except Exception, e: + logging.info(" *** ---> UserSubscriptionFolders error: %s" % e) + continue + + # Switch to original feed for the user subscription + logging.info(" ===> %s " % user_sub.user) + user_sub.feed = original_feed + user_sub.needs_unread_recalc = True + try: + user_sub.save() + folders = json.decode(user_sub_folders.folders) + folders = rewrite_folders(folders, original_feed, duplicate_feed) + user_sub_folders.folders = json.encode(folders) + user_sub_folders.save() + except IntegrityError: + logging.info(" !!!!> %s already subscribed" % user_sub.user) + user_sub.delete() + + # Switch read stories + user_stories = MUserStory.objects(feed_id=duplicate_feed.pk) + logging.info(" ---> %s read stories" % user_stories.count()) + for user_story in user_stories: + user_story.feed_id = original_feed.pk + duplicate_story = user_story.story + original_story = MStory.objects(story_guid=duplicate_story.story_guid, + story_feed_id=original_feed.pk) + + if original_story: + user_story.story = original_story[0] + else: + logging.info(" ***> Can't find original story: %s" % duplicate_story) + try: + user_story.save() + except IntegrityError: + logging.info(" ***> Story already saved: %s" % user_story) + + def delete_story_feed(model, feed_field='feed_id'): + duplicate_stories = model.objects(**{feed_field: duplicate_feed.pk}) + # if duplicate_stories.count(): + # logging.info(" ---> Deleting %s %s" % (duplicate_stories.count(), model)) + duplicate_stories.delete() + + def switch_feed(model): + duplicates = model.objects(feed_id=duplicate_feed.pk) + if duplicates.count(): + logging.info(" ---> Switching %s %s" % (duplicates.count(), model)) + for duplicate in duplicates: + duplicate.feed_id = original_feed.pk + try: + duplicate.save() + pass + except IntegrityError: + logging.info(" !!!!> %s already exists" % duplicate) + duplicate.delete() + + delete_story_feed(MStory, 'story_feed_id') + switch_feed(MClassifierTitle) + switch_feed(MClassifierAuthor) + switch_feed(MClassifierFeed) + switch_feed(MClassifierTag) + + try: + DuplicateFeed.objects.create( + duplicate_address=duplicate_feed.feed_address, + feed=original_feed + ) + except IntegrityError: + pass + + duplicate_feed.delete() + + +def rewrite_folders(folders, original_feed, duplicate_feed): + new_folders = [] + + for k, folder in enumerate(folders): + if isinstance(folder, int): + if folder == duplicate_feed.pk: + # logging.info(" ===> Rewrote %s'th item: %s" % (k+1, folders)) + new_folders.append(original_feed.pk) + else: + new_folders.append(folder) + elif isinstance(folder, dict): + for f_k, f_v in folder.items(): + new_folders.append({f_k: rewrite_folders(f_v, original_feed, duplicate_feed)}) + + return new_folders \ No newline at end of file diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py index e5b41dfed..3e5b3d3d3 100644 --- a/utils/feed_fetcher.py +++ b/utils/feed_fetcher.py @@ -127,7 +127,9 @@ class ProcessFeed: return FEED_ERRPARSE, ret_values elif self.fpf.bozo and 
         elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
             if not self.fpf.entries:
-                self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception)
+                fixed_feed = self.feed.check_feed_address_for_feed_link()
+                if not fixed_feed:
+                    self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception)
                 return FEED_ERRPARSE, ret_values
 
         # the feed has changed (or it is the first time we parse it)
@@ -235,6 +237,7 @@ class Dispatcher:
         # Close the DB so the connection can be re-opened on a per-process basis
         from django.db import connection
         connection.close()
+        delta = None
 
         MONGO_DB = settings.MONGO_DB
         db = pymongo.Connection(host=MONGO_DB['HOST'], port=MONGO_DB['PORT'])[MONGO_DB['NAME']]
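
Illustrative note, not part of the patch: a minimal sketch of the recovery
path wired up above. Feed, check_feed_address_for_feed_link(), and
save_feed_history() come straight from the diff; the feed URL is hypothetical.

    from apps.rss_feeds.models import Feed

    # A broken feed whose feed_address points at an HTML page instead of XML.
    feed = Feed.objects.get(feed_address='http://example.com/blog')  # hypothetical URL

    # ProcessFeed.process() hits a SAXException with zero entries, so it now
    # asks the feed to repair itself before recording an error:
    fixed_feed = feed.check_feed_address_for_feed_link()

    if not fixed_feed:
        # feedfinder came up empty; the 503 history is recorded as before.
        feed.save_feed_history(503, 'SAX Exception', None)
    # Otherwise feedfinder located a real feed URL: either feed.feed_address
    # was rewritten and the feed rescheduled for an immediate refetch, or the
    # discovered address already belonged to another Feed row, in which case
    # merge_feeds() moved the subscriptions, read stories, and classifiers
    # over and deleted this duplicate.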