Huge fix for exception feeds. When the RSS URL turns out to be HTML, the fetcher now tries to discover the correct feed URL from the feed_address or the feed_link, and uses that to repair the feed. If the discovered address already belongs to another feed, the two feeds are merged. Really nifty. Let's hope nobody dies.
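
For orientation, a condensed sketch of the recovery path this commit adds. It is illustrative only: it assumes the Feed model, merge_feeds, and utils.feedfinder shown in the diff below, and omits the branch where the stored address already parses but the site advertises a different feed URL.

    from django.db import IntegrityError

    def recover_feed_address(feed):
        # If the stored address doesn't parse as a feed, it is probably HTML:
        # ask feedfinder to discover the real feed URL, first from the address
        # page itself, then from the site's feed_link.
        if feedfinder.isFeed(feed.feed_address):
            return False
        discovered = feedfinder.feed(feed.feed_address) or feedfinder.feed(feed.feed_link)
        if not discovered:
            return False
        feed.feed_address = discovered
        feed.has_exception = False
        try:
            feed.save()
        except IntegrityError:
            # feed_address is unique: another Feed row already owns the
            # discovered URL, so fold this duplicate into the original.
            original = Feed.objects.get(feed_address=discovered)
            merge_feeds(original.pk, feed.pk)
        return True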

Samuel Clay 2010-08-25 19:10:55 -04:00
parent e01bb9eb74
commit 49d8b6d505
2 changed files with 146 additions and 2 deletions


@@ -15,12 +15,14 @@ from django.db import models
 from django.db import IntegrityError
 from django.core.cache import cache
 from utils import json
+from utils import feedfinder
 from utils.feed_functions import levenshtein_distance
 from utils.story_functions import format_story_link_date__short
 from utils.story_functions import format_story_link_date__long
 from utils.story_functions import pre_process_story
 from utils.compressed_textfield import StoryField
 from utils.diff import HTMLDiff
+from utils import log as logging
 
 ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
@@ -69,13 +71,39 @@ class Feed(models.Model):
         self.count_stories(lock=lock)
         self.save_popular_authors(lock=lock)
         self.save_popular_tags(lock=lock)
+
+    def check_feed_address_for_feed_link(self):
+        feed_address = None
+        if not feedfinder.isFeed(self.feed_address):
+            # feed_address is not a feed (likely an HTML page): discover the
+            # real feed URL from that page, falling back to the site link.
+            feed_address = feedfinder.feed(self.feed_address)
+            if not feed_address:
+                feed_address = feedfinder.feed(self.feed_link)
+        else:
+            # feed_address parses as a feed, but the site may advertise a
+            # different, canonical one.
+            feed_address_from_link = feedfinder.feed(self.feed_link)
+            if feed_address_from_link != self.feed_address:
+                feed_address = feed_address_from_link
+
+        if feed_address:
+            self.feed_address = feed_address
+            self.next_scheduled_update = datetime.datetime.now()
+            self.has_exception = False
+            try:
+                self.save()
+            except IntegrityError:
+                # Another feed already owns this address: clear its exception
+                # and fold this duplicate into it.
+                original_feed = Feed.objects.get(feed_address=feed_address)
+                original_feed.has_exception = False
+                original_feed.save()
+                merge_feeds(original_feed.pk, self.pk)
+
+        return bool(feed_address)
+
     def save_feed_history(self, status_code, message, exception=None):
         FeedFetchHistory.objects.create(feed=self,
                                         status_code=status_code,
                                         message=message,
                                         exception=exception)
-        old_fetch_histories = self.feed_fetch_history.all()[10:]
+        old_fetch_histories = self.feed_fetch_history.all().order_by('-fetch_date')[10:]
         for history in old_fetch_histories:
             history.delete()
@@ -545,6 +573,11 @@ class Feed(models.Model):
         self.next_scheduled_update = next_scheduled_update
         self.save(lock=lock)
+
+    def reset_next_scheduled_update(self, lock=None):
+        # Schedule an immediate refetch, e.g. after the feed address was fixed.
+        self.next_scheduled_update = datetime.datetime.now()
+        self.save(lock=lock)
 
     def calculate_collocations_story_content(self,
                                              collocation_measures=TrigramAssocMeasures,
@@ -724,3 +757,111 @@ class DuplicateFeed(models.Model):
     duplicate_address = models.CharField(max_length=255, unique=True)
     feed = models.ForeignKey(Feed, related_name='duplicate_addresses')
+
+
+def merge_feeds(original_feed_id, duplicate_feed_id):
+    from apps.reader.models import UserSubscription, UserSubscriptionFolders, MUserStory
+    from apps.analyzer.models import MClassifierTitle, MClassifierAuthor, MClassifierFeed, MClassifierTag
+    try:
+        original_feed = Feed.objects.get(pk=original_feed_id)
+        duplicate_feed = Feed.objects.get(pk=duplicate_feed_id)
+    except Feed.DoesNotExist:
+        logging.info(" ***> Already deleted feed: %s" % duplicate_feed_id)
+        return
+
+    logging.info(" ---> Feed: [%s - %s] %s - %s" % (original_feed_id, duplicate_feed_id,
+                                                    original_feed, original_feed.feed_link))
+    logging.info(" --> %s" % original_feed.feed_address)
+    logging.info(" --> %s" % duplicate_feed.feed_address)
+
+    user_subs = UserSubscription.objects.filter(feed=duplicate_feed)
+    for user_sub in user_subs:
+        # Rewrite feed in subscription folders
+        try:
+            user_sub_folders = UserSubscriptionFolders.objects.get(user=user_sub.user)
+        except Exception, e:
+            logging.info(" *** ---> UserSubscriptionFolders error: %s" % e)
+            continue
+
+        # Switch to original feed for the user subscription
+        logging.info(" ===> %s " % user_sub.user)
+        user_sub.feed = original_feed
+        user_sub.needs_unread_recalc = True
+        try:
+            user_sub.save()
+            folders = json.decode(user_sub_folders.folders)
+            folders = rewrite_folders(folders, original_feed, duplicate_feed)
+            user_sub_folders.folders = json.encode(folders)
+            user_sub_folders.save()
+        except IntegrityError:
+            logging.info(" !!!!> %s already subscribed" % user_sub.user)
+            user_sub.delete()
+
+    # Switch read stories
+    user_stories = MUserStory.objects(feed_id=duplicate_feed.pk)
+    logging.info(" ---> %s read stories" % user_stories.count())
+    for user_story in user_stories:
+        user_story.feed_id = original_feed.pk
+        duplicate_story = user_story.story
+        original_story = MStory.objects(story_guid=duplicate_story.story_guid,
+                                        story_feed_id=original_feed.pk)
+        if original_story:
+            user_story.story = original_story[0]
+        else:
+            logging.info(" ***> Can't find original story: %s" % duplicate_story)
+        try:
+            user_story.save()
+        except IntegrityError:
+            logging.info(" ***> Story already saved: %s" % user_story)
+
+    def delete_story_feed(model, feed_field='feed_id'):
+        duplicate_stories = model.objects(**{feed_field: duplicate_feed.pk})
+        # if duplicate_stories.count():
+        #     logging.info(" ---> Deleting %s %s" % (duplicate_stories.count(), model))
+        duplicate_stories.delete()
+
+    def switch_feed(model):
+        duplicates = model.objects(feed_id=duplicate_feed.pk)
+        if duplicates.count():
+            logging.info(" ---> Switching %s %s" % (duplicates.count(), model))
+        for duplicate in duplicates:
+            duplicate.feed_id = original_feed.pk
+            try:
+                duplicate.save()
+            except IntegrityError:
+                logging.info(" !!!!> %s already exists" % duplicate)
+                duplicate.delete()
+
+    delete_story_feed(MStory, 'story_feed_id')
+    switch_feed(MClassifierTitle)
+    switch_feed(MClassifierAuthor)
+    switch_feed(MClassifierFeed)
+    switch_feed(MClassifierTag)
+
+    try:
+        DuplicateFeed.objects.create(
+            duplicate_address=duplicate_feed.feed_address,
+            feed=original_feed
+        )
+    except IntegrityError:
+        pass
+
+    duplicate_feed.delete()
+
+
+def rewrite_folders(folders, original_feed, duplicate_feed):
+    new_folders = []
+    for k, folder in enumerate(folders):
+        if isinstance(folder, int):
+            if folder == duplicate_feed.pk:
+                # logging.info(" ===> Rewrote %s'th item: %s" % (k+1, folders))
+                new_folders.append(original_feed.pk)
+            else:
+                new_folders.append(folder)
+        elif isinstance(folder, dict):
+            for f_k, f_v in folder.items():
+                new_folders.append({f_k: rewrite_folders(f_v, original_feed, duplicate_feed)})
+    return new_folders
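
To make the folder rewrite concrete, here is a toy run of rewrite_folders with hypothetical feed pks (FakeFeed is a stand-in; the real arguments are Feed instances):

    class FakeFeed(object):
        def __init__(self, pk):
            self.pk = pk

    original, duplicate = FakeFeed(7), FakeFeed(42)
    # Folder payloads, as stored in UserSubscriptionFolders.folders, are nested
    # lists of feed pks and {folder_name: nested_list} dicts.
    folders = [7, 42, {'Tech': [42, 99, {'Python': [42]}]}]
    print(rewrite_folders(folders, original, duplicate))
    # -> [7, 7, {'Tech': [7, 99, {'Python': [7]}]}]

Note that a user subscribed to both feeds ends up with the original pk listed twice: the IntegrityError branch in merge_feeds drops the duplicate subscription itself, but the folder list keeps both entries.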


@@ -127,7 +127,9 @@ class ProcessFeed:
             return FEED_ERRPARSE, ret_values
         elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
             if not self.fpf.entries:
-                self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception)
+                fixed_feed = self.feed.check_feed_address_for_feed_link()
+                if not fixed_feed:
+                    self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception)
                 return FEED_ERRPARSE, ret_values
 
         # the feed has changed (or it is the first time we parse it)
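
The ordering in that branch matters: the fetcher tries to repair the address first and only records a 503 when the repair fails, so a recovered feed keeps a clean history. A small stub sketch of that contract (StubFeed and on_sax_exception are illustrative, not the real classes):

    class StubFeed(object):
        def __init__(self, fixable):
            self.fixable = fixable
            self.history = []
        def check_feed_address_for_feed_link(self):
            return self.fixable  # the real method returns True when it rewrote feed_address
        def save_feed_history(self, code, message, exception=None):
            self.history.append((code, message))

    def on_sax_exception(feed, bozo_exception):
        if not feed.check_feed_address_for_feed_link():
            feed.save_feed_history(503, 'SAX Exception', bozo_exception)

    broken, healed = StubFeed(False), StubFeed(True)
    on_sax_exception(broken, Exception('not XML'))
    on_sax_exception(healed, Exception('not XML'))
    print(broken.history)  # [(503, 'SAX Exception')] -- failure recorded
    print(healed.history)  # [] -- repaired feeds skip the error history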
@@ -235,6 +237,7 @@ class Dispatcher:
         # Close the DB so the connection can be re-opened on a per-process basis
         from django.db import connection
         connection.close()
+        delta = None
 
         MONGO_DB = settings.MONGO_DB
         db = pymongo.Connection(host=MONGO_DB['HOST'], port=MONGO_DB['PORT'])[MONGO_DB['NAME']]
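
The lone `delta = None` added to Dispatcher looks minor, but pre-initializing the local means error paths that never assign it can still reference it later without raising UnboundLocalError. A minimal, self-contained illustration of the failure mode it guards against (hypothetical function, not the real dispatcher):

    import datetime

    def fetch_and_time(should_fail):
        delta = None  # without this, the failure path leaves `delta` unbound
        start = datetime.datetime.now()
        try:
            if should_fail:
                raise ValueError("simulated fetch error")
            delta = datetime.datetime.now() - start
        except ValueError:
            pass
        print("fetch took: %s" % delta)  # None on failure, a timedelta on success

    fetch_and_time(False)
    fetch_and_time(True)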