Mirror of https://github.com/samuelclay/NewsBlur.git
Huge fix for exception feeds. Now tries to fetch the correct URL from the feed_link or the feed_address (if the RSS URL is actually HTML), and uses that to figure out the right feed. Also merges feeds if a duplicate is found during this process. Really nifty. Let's hope nobody dies.
parent e01bb9eb74
commit 49d8b6d505
2 changed files with 146 additions and 2 deletions
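
For context, a minimal sketch of the auto-discovery step the fix leans on, assuming the bundled utils.feedfinder module (Mark Pilgrim's feedfinder); the URL and output are hypothetical:

    from utils import feedfinder

    address = "http://example.com/blog"  # a feed_address that actually serves HTML
    if not feedfinder.isFeed(address):
        # feedfinder.feed() fetches the page, looks for feed links
        # (e.g. <link rel="alternate">) and returns the best candidate, or None.
        print feedfinder.feed(address)  # e.g. http://example.com/blog/rss.xml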
apps/rss_feeds/models.py

@@ -15,12 +15,14 @@ from django.db import models
from django.db import IntegrityError
from django.core.cache import cache
from utils import json
from utils import feedfinder
from utils.feed_functions import levenshtein_distance
from utils.story_functions import format_story_link_date__short
from utils.story_functions import format_story_link_date__long
from utils.story_functions import pre_process_story
from utils.compressed_textfield import StoryField
from utils.diff import HTMLDiff
from utils import log as logging

ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)
@@ -69,13 +71,39 @@ class Feed(models.Model):
        self.count_stories(lock=lock)
        self.save_popular_authors(lock=lock)
        self.save_popular_tags(lock=lock)

    def check_feed_address_for_feed_link(self):
        feed_address = None

        if not feedfinder.isFeed(self.feed_address):
            # The stored address isn't a feed (probably HTML), so try to
            # discover one from the address, then fall back to the site link.
            feed_address = feedfinder.feed(self.feed_address)
            if not feed_address:
                feed_address = feedfinder.feed(self.feed_link)
        else:
            feed_address_from_link = feedfinder.feed(self.feed_link)
            if feed_address_from_link != self.feed_address:
                feed_address = feed_address_from_link

        if feed_address:
            try:
                self.feed_address = feed_address
                self.next_scheduled_update = datetime.datetime.now()
                self.has_exception = False
                self.save()
            except IntegrityError:
                # feed_address is unique; another feed already owns this
                # address, so merge this feed into that one.
                original_feed = Feed.objects.get(feed_address=feed_address)
                original_feed.has_exception = False
                original_feed.save()
                merge_feeds(original_feed.pk, self.pk)

        return bool(feed_address)

    def save_feed_history(self, status_code, message, exception=None):
        FeedFetchHistory.objects.create(feed=self,
                                        status_code=status_code,
                                        message=message,
                                        exception=exception)
        old_fetch_histories = self.feed_fetch_history.all().order_by('-fetch_date')[10:]
        for history in old_fetch_histories:
            history.delete()
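
To illustrate the duplicate path above (a hedged sketch; the pks are hypothetical): if the discovered address already belongs to another Feed row, save() trips the unique constraint on feed_address and the exception feed is merged away.

    # Hypothetical: feed 101 is broken (its address serves HTML); feed 42
    # already owns the real RSS URL that feedfinder discovers for feed 101.
    broken_feed = Feed.objects.get(pk=101)
    fixed = broken_feed.check_feed_address_for_feed_link()
    # The save() inside raises IntegrityError because feed 42 has the same
    # feed_address, so merge_feeds(42, 101) folds feed 101 into feed 42,
    # and fixed comes back True.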
@@ -545,6 +573,11 @@ class Feed(models.Model):
        self.next_scheduled_update = next_scheduled_update

        self.save(lock=lock)

    def reset_next_scheduled_update(self, lock=None):
        self.next_scheduled_update = datetime.datetime.now()

        self.save(lock=lock)

    def calculate_collocations_story_content(self,
                                             collocation_measures=TrigramAssocMeasures,
@@ -724,3 +757,111 @@ class DuplicateFeed(models.Model):
    duplicate_address = models.CharField(max_length=255, unique=True)
    feed = models.ForeignKey(Feed, related_name='duplicate_addresses')


def merge_feeds(original_feed_id, duplicate_feed_id):
    from apps.reader.models import UserSubscription, UserSubscriptionFolders, MUserStory
    from apps.analyzer.models import MClassifierTitle, MClassifierAuthor, MClassifierFeed, MClassifierTag
    try:
        original_feed = Feed.objects.get(pk=original_feed_id)
        duplicate_feed = Feed.objects.get(pk=duplicate_feed_id)
    except Feed.DoesNotExist:
        logging.info(" ***> Already deleted feed: %s" % duplicate_feed_id)
        return

    logging.info(" ---> Feed: [%s - %s] %s - %s" % (original_feed_id, duplicate_feed_id,
                                                    original_feed, original_feed.feed_link))
    logging.info("            --> %s" % original_feed.feed_address)
    logging.info("            --> %s" % duplicate_feed.feed_address)

    user_subs = UserSubscription.objects.filter(feed=duplicate_feed)
    for user_sub in user_subs:
        # Rewrite feed in subscription folders
        try:
            user_sub_folders = UserSubscriptionFolders.objects.get(user=user_sub.user)
        except Exception, e:
            logging.info(" *** ---> UserSubscriptionFolders error: %s" % e)
            continue

        # Switch to original feed for the user subscription
        logging.info("      ===> %s " % user_sub.user)
        user_sub.feed = original_feed
        user_sub.needs_unread_recalc = True
        try:
            user_sub.save()
            folders = json.decode(user_sub_folders.folders)
            folders = rewrite_folders(folders, original_feed, duplicate_feed)
            user_sub_folders.folders = json.encode(folders)
            user_sub_folders.save()
        except IntegrityError:
            logging.info("      !!!!> %s already subscribed" % user_sub.user)
            user_sub.delete()

    # Switch read stories
    user_stories = MUserStory.objects(feed_id=duplicate_feed.pk)
    logging.info(" ---> %s read stories" % user_stories.count())
    for user_story in user_stories:
        user_story.feed_id = original_feed.pk
        duplicate_story = user_story.story
        original_story = MStory.objects(story_guid=duplicate_story.story_guid,
                                        story_feed_id=original_feed.pk)

        if original_story:
            user_story.story = original_story[0]
        else:
            logging.info(" ***> Can't find original story: %s" % duplicate_story)
        try:
            user_story.save()
        except IntegrityError:
            logging.info(" ***> Story already saved: %s" % user_story)

    def delete_story_feed(model, feed_field='feed_id'):
        duplicate_stories = model.objects(**{feed_field: duplicate_feed.pk})
        # if duplicate_stories.count():
        #     logging.info(" ---> Deleting %s %s" % (duplicate_stories.count(), model))
        duplicate_stories.delete()

    def switch_feed(model):
        duplicates = model.objects(feed_id=duplicate_feed.pk)
        if duplicates.count():
            logging.info(" ---> Switching %s %s" % (duplicates.count(), model))
        for duplicate in duplicates:
            duplicate.feed_id = original_feed.pk
            try:
                duplicate.save()
            except IntegrityError:
                logging.info("      !!!!> %s already exists" % duplicate)
                duplicate.delete()

    delete_story_feed(MStory, 'story_feed_id')
    switch_feed(MClassifierTitle)
    switch_feed(MClassifierAuthor)
    switch_feed(MClassifierFeed)
    switch_feed(MClassifierTag)

    try:
        DuplicateFeed.objects.create(
            duplicate_address=duplicate_feed.feed_address,
            feed=original_feed
        )
    except IntegrityError:
        pass

    duplicate_feed.delete()


def rewrite_folders(folders, original_feed, duplicate_feed):
    new_folders = []

    for k, folder in enumerate(folders):
        if isinstance(folder, int):
            if folder == duplicate_feed.pk:
                # logging.info(" ===> Rewrote %s'th item: %s" % (k+1, folders))
                new_folders.append(original_feed.pk)
            else:
                new_folders.append(folder)
        elif isinstance(folder, dict):
            # Folders are dicts mapping a folder name to a nested list; recurse.
            for f_k, f_v in folder.items():
                new_folders.append({f_k: rewrite_folders(f_v, original_feed, duplicate_feed)})

    return new_folders
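
A worked example of rewrite_folders, assuming original_feed.pk == 2 and duplicate_feed.pk == 4 (hypothetical values): feed ids are rewritten both at the top level and inside nested folder dicts.

    folders = [1, 4, {'Tech': [3, 4, {'Linux': [4, 5]}]}]
    rewrite_folders(folders, original_feed, duplicate_feed)
    # -> [1, 2, {'Tech': [3, 2, {'Linux': [2, 5]}]}]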
utils/feed_fetcher.py

@@ -127,7 +127,9 @@ class ProcessFeed:
            return FEED_ERRPARSE, ret_values
        elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
            if not self.fpf.entries:
                # Before giving up on a SAX parse error, see if the feed
                # address actually points at HTML and can be fixed.
                fixed_feed = self.feed.check_feed_address_for_feed_link()
                if not fixed_feed:
                    self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception)
                return FEED_ERRPARSE, ret_values

        # the feed has changed (or it is the first time we parse it)
@@ -235,6 +237,7 @@ class Dispatcher:
        # Close the DB so the connection can be re-opened on a per-process basis
        from django.db import connection
        connection.close()
        delta = None

        MONGO_DB = settings.MONGO_DB
        db = pymongo.Connection(host=MONGO_DB['HOST'], port=MONGO_DB['PORT'])[MONGO_DB['NAME']]
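
A minimal sketch of the per-process connection pattern above, assuming Python 2-era multiprocessing and old-style pymongo; the host, port, and names are hypothetical. Forked workers must not reuse the parent's database sockets, so each child closes the inherited Django connection (Django reconnects lazily) and opens its own MongoDB connection.

    from multiprocessing import Process
    import pymongo

    def process_feeds(feed_ids):
        from django.db import connection
        connection.close()  # drop the socket inherited from the parent process
        db = pymongo.Connection(host='127.0.0.1', port=27017)['newsblur']
        # ... fetch each feed in feed_ids, writing stories through db ...

    Process(target=process_feeds, args=([42, 43],)).start()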