Fixing the out-of-date dupe bug.

Samuel Clay 2009-12-18 20:47:44 +00:00
parent f518e10ea1
commit 8bf5fc8dca
8 changed files with 77 additions and 61 deletions

View file

@@ -3,7 +3,7 @@
"pk": 4,
"model": "rss_feeds.feed",
"fields": {
"feed_address": "%(NEWSBLUR_DIR)s/apps/rss_feeds/fixtures/gothamist1.xml",
"feed_address": "%(NEWSBLUR_DIR)s/apps/rss_feeds/fixtures/gothamist_aug_2009_1.xml",
"days_to_trim": 90,
"feed_link": "http://gothamist.com",
"num_subscribers": 0,

View file

@@ -3,7 +3,7 @@
"pk": 4,
"model": "rss_feeds.feed",
"fields": {
"feed_address": "%(NEWSBLUR_DIR)s/apps/rss_feeds/fixtures/gothamist2.xml",
"feed_address": "%(NEWSBLUR_DIR)s/apps/rss_feeds/fixtures/gothamist_aug_2009_2.xml",
"days_to_trim": 90,
"feed_link": "http://gothamist.com",
"num_subscribers": 0,

View file

@@ -11,6 +11,7 @@ from django.utils.http import urlquote
from django.utils.safestring import mark_safe
from utils.story_functions import format_story_link_date__short
from utils.story_functions import format_story_link_date__long
from utils.story_functions import pre_process_story
from django.db.models import Q
import settings
import logging
@@ -78,7 +79,7 @@ class Feed(models.Model):
}
for story in stories:
story = self._pre_process_story(story)
story = pre_process_story(story)
if story.get('title'):
story_contents = story.get('content')
@@ -87,15 +88,16 @@ class Feed(models.Model):
else:
story_content = story.get('summary')
existing_story, story_has_changed = self._exists_story(story, story_content, existing_stories)
story_author, _ = self._save_story_author(story.get('author'))
if existing_story is None:
pub_date = datetime.datetime.timetuple(story.get('published'))
# logging.debug('- New story: %s %s' % (pub_date, story.get('title')))
s = Story(story_feed = self,
story_date = story.get('published'),
story_title = story.get('title'),
story_content = story_content,
story_author = story.get('author'),
story_author = story_author,
story_permalink = story.get('link'),
story_guid = story.get('id') or story.get('link')
)
@@ -104,7 +106,7 @@ class Feed(models.Model):
s.save(force_insert=True)
except IntegrityError, e:
ret_values[ENTRY_ERR] += 1
logging.error('Saving new story, IntegrityError: %s - %s: %s' % (self.feed_title, story.get('title'), e))
print('Saving new story, IntegrityError: %s - %s: %s' % (self.feed_title, story.get('title'), e))
elif existing_story and story_has_changed:
# update story
logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story['story_content']), len(story_content)))
@@ -127,7 +129,7 @@ class Feed(models.Model):
story_title = story.get('title'),
story_content = diff.getDiff(),
story_original_content = original_content,
story_author = story.get('author'),
story_author = story_author,
story_permalink = story.get('link'),
story_guid = story.get('id') or story.get('link')
)
@@ -136,14 +138,17 @@ class Feed(models.Model):
s.save(force_update=True)
except IntegrityError, e:
ret_values[ENTRY_ERR] += 1
logging.error('Saving updated story, IntegrityError: %s - %s' % (self.feed_title, story.get('title')))
print('Saving updated story, IntegrityError: %s - %s' % (self.feed_title, story.get('title')))
else:
ret_values[ENTRY_SAME] += 1
# logging.debug("Unchanged story: %s " % story.get('title'))
return ret_values
def _save_story_author(self, author):
author, created = StoryAuthor.objects.get_or_create(feed=self, author_name=author)
return author, created
def trim_feed(self):
date_diff = datetime.datetime.now() - datetime.timedelta(self.days_to_trim)
stories = Story.objects.filter(story_feed=self, story_date__lte=date_diff)
@@ -213,32 +218,6 @@ class Feed(models.Model):
break
return story_in_system, story_has_changed
def _pre_process_story(self, entry):
date_published = entry.get('published', entry.get('updated'))
if not date_published:
date_published = str(datetime.datetime.now())
date_published = dateutil_parse(date_published)
# Change the date to UTC and remove timezone info since
# MySQL doesn't support it.
timezone_diff = datetime.datetime.utcnow() - datetime.datetime.now()
date_published_offset = date_published.utcoffset()
if date_published_offset:
date_published = (date_published - date_published_offset
- timezone_diff).replace(tzinfo=None)
else:
date_published = date_published.replace(tzinfo=None)
entry['published'] = date_published
entry_link = entry.get('link', '')
protocol_index = entry_link.find("://")
if protocol_index != -1:
entry['link'] = (entry_link[:protocol_index+3]
+ urlquote(entry_link[protocol_index+3:]))
else:
entry['link'] = urlquote(entry_link)
return entry
class Meta:
db_table="feeds"
@@ -253,6 +232,10 @@ class Tag(models.Model):
def save(self):
super(Tag, self).save()
class StoryAuthor(models.Model):
feed = models.ForeignKey(Feed)
author_name = models.CharField(max_length=255, null=True, blank=True)
class Story(models.Model):
'''A feed item'''
story_feed = models.ForeignKey(Feed)
@@ -262,7 +245,7 @@ class Story(models.Model):
story_original_content = models.TextField(null=True, blank=True)
story_content_type = models.CharField(max_length=255, null=True,
blank=True)
story_author = models.CharField(max_length=255, null=True, blank=True)
story_author = models.ForeignKey(StoryAuthor)
story_permalink = models.CharField(max_length=1000)
story_guid = models.CharField(max_length=1000)
story_past_trim_date = models.BooleanField(default=False)
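
Alongside the dupe fix, this file normalizes authors: Story.story_author changes from a free-text CharField to a ForeignKey on the new StoryAuthor model, and _save_story_author deduplicates author strings per feed through get_or_create. A minimal sketch of that behavior, assuming the models live in apps.rss_feeds.models (as the fixture paths suggest) and that a feed row already exists; the author name is invented for illustration:

    from apps.rss_feeds.models import Feed, StoryAuthor

    feed = Feed.objects.get(pk=4)  # e.g. the Gothamist feed loaded from the fixture above

    # First call creates the row...
    author, created = StoryAuthor.objects.get_or_create(feed=feed, author_name='Jane Doe')
    assert created

    # ...later stories by the same author on the same feed reuse it, so the author
    # string is stored once and Story.story_author just points at that row.
    same_author, created = StoryAuthor.objects.get_or_create(feed=feed, author_name='Jane Doe')
    assert not created and same_author.pk == author.pk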

View file

@@ -11,7 +11,7 @@ class FeedTest(TestCase):
self.client = Client()
def test_load_feeds__gawker(self):
self.client.login(userame='conesus', password='test')
self.client.login(username='conesus', password='test')
management.call_command('loaddata', 'gawker1.json', verbosity=0)
response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
@@ -28,24 +28,24 @@ class FeedTest(TestCase):
self.assertEquals(len(stories), 38)
def test_load_feeds__gothamist(self):
self.client.login(userame='conesus', password='test')
self.client.login(username='conesus', password='test')
management.call_command('loaddata', 'gothamist1.json', verbosity=0)
management.call_command('loaddata', 'gothamist_aug_2009_1.json', verbosity=0)
response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True })
management.call_command('loaddata', 'gothamist2.json', verbosity=0)
response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True })
response = self.client.get('/reader/load_single_feed', { "feed_id": 4 })
# print [c['story_title'] for c in json.decode(response.content)]
stories = json.decode(response.content)
self.assertEquals(len(stories), 42)
management.call_command('loaddata', 'gothamist_aug_2009_2.json', verbosity=0)
response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True })
response = self.client.get('/reader/load_single_feed', { "feed_id": 4 })
print [c['story_title'] for c in json.decode(response.content)]
stories = json.decode(response.content)
# Test: 1 changed char in title
self.assertEquals(len(stories), 42)
def test_load_feeds__slashdot(self):
self.client.login(userame='conesus', password='test')
self.client.login(username='conesus', password='test')
management.call_command('loaddata', 'slashdot1.json', verbosity=0)
response = self.client.get('/reader/refresh_feed', { "feed_id": 5, "force": True })
@@ -55,7 +55,7 @@ class FeedTest(TestCase):
response = self.client.get('/reader/load_single_feed', { "feed_id": 5 })
pprint([c['story_title'] for c in json.decode(response.content)])
# pprint([c['story_title'] for c in json.decode(response.content)])
stories = json.decode(response.content)
# Test: 1 changed char in title

View file

@@ -4,6 +4,8 @@ from apps.reader.models import UserSubscription, UserSubscriptionFolders, UserSt
from apps.rss_feeds.importer import PageImporter
from utils import feedparser, threadpool
from django.db import transaction
from utils.dateutil.parser import parse as dateutil_parse
from utils.story_functions import pre_process_story
import sys
import time
import logging
@@ -170,19 +172,21 @@ class ProcessFeed:
# Compare new stories to existing stories, adding and updating
try:
num_entries = len(self.fpf.entries)
existing_stories = Story.objects.filter(
story_feed=self.feed
).order_by('-story_date').values()[:num_entries*2]
ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories)
except:
(etype, eobj, etb) = sys.exc_info()
print '[%d] ! -------------------------' % (self.feed.id,)
# print traceback.format_exception(etype, eobj, etb)
traceback.print_exception(etype, eobj, etb)
print '[%d] ! -------------------------' % (self.feed.id,)
num_entries = len(self.fpf.entries)
start_date = datetime.datetime.now()
end_date = datetime.datetime.now()
for entry in self.fpf.entries:
story = pre_process_story(entry)
if story.get('published') < start_date or not start_date:
start_date = story.get('published')
if story.get('published') > end_date or not end_date:
end_date = story.get('published')
existing_stories = Story.objects.filter(
story_feed=self.feed,
story_date__gte=start_date,
story_date__lte=end_date,
).order_by('-story_date').values()[:100]
ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories)
return FEED_OK, ret_values
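
This hunk is the out-of-date dupe fix itself. The old code compared incoming entries against the newest num_entries*2 stories ordered by date, so a feed that resurfaces entries older than that window (as the second Gothamist snapshot does) would not find them and would save them again. The new code runs each entry through pre_process_story to get a normalized published date, then limits the existing-story lookup to the date range the fetched batch actually spans. A condensed sketch of the new selection, written as it would sit inside the ProcessFeed method shown above (min/max replaces the running comparison in the loop; self.feed and self.fpf come from the surrounding class):

    import datetime

    from apps.rss_feeds.models import Story
    from utils.story_functions import pre_process_story

    # Normalize dates up front so the comparisons below are datetime vs. datetime.
    entries = [pre_process_story(entry) for entry in self.fpf.entries]
    published = [entry.get('published') for entry in entries] or [datetime.datetime.now()]
    start_date, end_date = min(published), max(published)

    # Only stories inside the window covered by this fetch can collide with the
    # incoming entries, so older stories that reappear in the feed are matched
    # against their existing rows instead of being inserted a second time.
    existing_stories = Story.objects.filter(
        story_feed=self.feed,
        story_date__gte=start_date,
        story_date__lte=end_date,
    ).order_by('-story_date').values()[:100]

    ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories)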

View file

@@ -1,5 +1,7 @@
from django.utils.dateformat import DateFormat
import datetime
from utils.dateutil.parser import parse as dateutil_parse
from django.utils.http import urlquote
def format_story_link_date__short(date):
parsed_date, date_tuple, today_tuple, yesterday_tuple = _extract_date_tuples(date)
@@ -26,4 +28,31 @@ def _extract_date_tuples(date):
today = datetime.datetime.today()
yesterday_tuple = datetime.datetime.timetuple(today - datetime.timedelta(1))[:3]
return parsed_date, date_tuple, today_tuple, yesterday_tuple
def pre_process_story(entry):
date_published = entry.get('published', entry.get('updated'))
if not date_published:
date_published = str(datetime.datetime.now())
if not isinstance(date_published, datetime.datetime):
date_published = dateutil_parse(date_published)
# Change the date to UTC and remove timezone info since
# MySQL doesn't support it.
timezone_diff = datetime.datetime.utcnow() - datetime.datetime.now()
date_published_offset = date_published.utcoffset()
if date_published_offset:
date_published = (date_published - date_published_offset
- timezone_diff).replace(tzinfo=None)
else:
date_published = date_published.replace(tzinfo=None)
entry['published'] = date_published
entry_link = entry.get('link', '')
protocol_index = entry_link.find("://")
if protocol_index != -1:
entry['link'] = (entry_link[:protocol_index+3]
+ urlquote(entry_link[protocol_index+3:]))
else:
entry['link'] = urlquote(entry_link)
return entry
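
pre_process_story moves here from Feed._pre_process_story so that both Feed.add_update_stories and the new date-window logic in ProcessFeed can share it: it parses the published/updated date, reduces it to a naive UTC datetime (MySQL cannot store timezone info), and percent-quotes everything after the protocol in the entry link. A small usage sketch on a hand-built entry dict; a feedparser entry behaves the same way for the keys used here, and the values are invented for illustration:

    import datetime

    from utils.story_functions import pre_process_story

    entry = {
        'title': 'Example story',
        'link': 'http://example.com/2009/12/18/an example',   # note the unquoted space
        'published': 'Fri, 18 Dec 2009 20:47:44 +0000',       # a string, as feedparser often provides
    }
    entry = pre_process_story(entry)

    # The date string is parsed and stripped down to a naive datetime.
    assert isinstance(entry['published'], datetime.datetime)
    assert entry['published'].tzinfo is None

    # Everything after "://" is run through urlquote, so unsafe characters
    # cannot break permalink/guid matching against existing stories later on.
    print entry['link']   # http://example.com/2009/12/18/an%20example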