mirror of
https://github.com/viq/NewsBlur.git
synced 2025-11-01 09:09:16 +00:00
Fixing the out-of-date dupe bug.
This commit is contained in:
parent
f518e10ea1
commit
8bf5fc8dca
8 changed files with 77 additions and 61 deletions
|
|
@ -3,7 +3,7 @@
|
|||
"pk": 4,
|
||||
"model": "rss_feeds.feed",
|
||||
"fields": {
|
||||
"feed_address": "%(NEWSBLUR_DIR)s/apps/rss_feeds/fixtures/gothamist1.xml",
|
||||
"feed_address": "%(NEWSBLUR_DIR)s/apps/rss_feeds/fixtures/gothamist_aug_2009_1.xml",
|
||||
"days_to_trim": 90,
|
||||
"feed_link": "http://gothamist.com",
|
||||
"num_subscribers": 0,
|
||||
|
|
@ -3,7 +3,7 @@
|
|||
"pk": 4,
|
||||
"model": "rss_feeds.feed",
|
||||
"fields": {
|
||||
"feed_address": "%(NEWSBLUR_DIR)s/apps/rss_feeds/fixtures/gothamist2.xml",
|
||||
"feed_address": "%(NEWSBLUR_DIR)s/apps/rss_feeds/fixtures/gothamist_aug_2009_2.xml",
|
||||
"days_to_trim": 90,
|
||||
"feed_link": "http://gothamist.com",
|
||||
"num_subscribers": 0,
|
||||
|
|
@ -11,6 +11,7 @@ from django.utils.http import urlquote
|
|||
from django.utils.safestring import mark_safe
|
||||
from utils.story_functions import format_story_link_date__short
|
||||
from utils.story_functions import format_story_link_date__long
|
||||
from utils.story_functions import pre_process_story
|
||||
from django.db.models import Q
|
||||
import settings
|
||||
import logging
|
||||
|
|
@ -78,7 +79,7 @@ class Feed(models.Model):
|
|||
}
|
||||
|
||||
for story in stories:
|
||||
story = self._pre_process_story(story)
|
||||
story = pre_process_story(story)
|
||||
|
||||
if story.get('title'):
|
||||
story_contents = story.get('content')
|
||||
|
|
@ -87,15 +88,16 @@ class Feed(models.Model):
|
|||
else:
|
||||
story_content = story.get('summary')
|
||||
existing_story, story_has_changed = self._exists_story(story, story_content, existing_stories)
|
||||
story_author, _ = self._save_story_author(story.get('author'))
|
||||
if existing_story is None:
|
||||
pub_date = datetime.datetime.timetuple(story.get('published'))
|
||||
# logging.debug('- New story: %s %s' % (pub_date, story.get('title')))
|
||||
|
||||
|
||||
s = Story(story_feed = self,
|
||||
story_date = story.get('published'),
|
||||
story_title = story.get('title'),
|
||||
story_content = story_content,
|
||||
story_author = story.get('author'),
|
||||
story_author = story_author,
|
||||
story_permalink = story.get('link'),
|
||||
story_guid = story.get('id') or story.get('link')
|
||||
)
|
||||
|
|
@ -104,7 +106,7 @@ class Feed(models.Model):
|
|||
s.save(force_insert=True)
|
||||
except IntegrityError, e:
|
||||
ret_values[ENTRY_ERR] += 1
|
||||
logging.error('Saving new story, IntegrityError: %s - %s: %s' % (self.feed_title, story.get('title'), e))
|
||||
print('Saving new story, IntegrityError: %s - %s: %s' % (self.feed_title, story.get('title'), e))
|
||||
elif existing_story and story_has_changed:
|
||||
# update story
|
||||
logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story['story_content']), len(story_content)))
|
||||
|
|
@ -127,7 +129,7 @@ class Feed(models.Model):
|
|||
story_title = story.get('title'),
|
||||
story_content = diff.getDiff(),
|
||||
story_original_content = original_content,
|
||||
story_author = story.get('author'),
|
||||
story_author = story_author,
|
||||
story_permalink = story.get('link'),
|
||||
story_guid = story.get('id') or story.get('link')
|
||||
)
|
||||
|
|
@ -136,14 +138,17 @@ class Feed(models.Model):
|
|||
s.save(force_update=True)
|
||||
except IntegrityError, e:
|
||||
ret_values[ENTRY_ERR] += 1
|
||||
logging.error('Saving updated story, IntegrityError: %s - %s' % (self.feed_title, story.get('title')))
|
||||
print('Saving updated story, IntegrityError: %s - %s' % (self.feed_title, story.get('title')))
|
||||
else:
|
||||
ret_values[ENTRY_SAME] += 1
|
||||
# logging.debug("Unchanged story: %s " % story.get('title'))
|
||||
|
||||
return ret_values
|
||||
|
||||
|
||||
def _save_story_author(self, author):
    """Fetch, or create if missing, the StoryAuthor row for this feed.

    The lookup is keyed on this feed together with ``author`` as the
    ``author_name``.

    Returns the ``(story_author, created)`` pair from ``get_or_create``.
    """
    return StoryAuthor.objects.get_or_create(feed=self,
                                             author_name=author)
|
||||
|
||||
def trim_feed(self):
|
||||
date_diff = datetime.datetime.now() - datetime.timedelta(self.days_to_trim)
|
||||
stories = Story.objects.filter(story_feed=self, story_date__lte=date_diff)
|
||||
|
|
@ -213,32 +218,6 @@ class Feed(models.Model):
|
|||
break
|
||||
|
||||
return story_in_system, story_has_changed
|
||||
|
||||
def _pre_process_story(self, entry):
    """Normalize a parsed feed entry's date and link in place.

    ``entry['published']`` is replaced by a naive ``datetime`` parsed from
    the entry's ``published`` value (``updated`` is used when the
    ``published`` key is absent, and the current time when neither yields
    a value). ``entry['link']`` is replaced by its URL-quoted form, with
    any ``scheme://`` prefix left untouched.

    Returns the same ``entry`` object.
    """
    raw_date = entry.get('published', entry.get('updated'))
    if not raw_date:
        raw_date = str(datetime.datetime.now())
    parsed = dateutil_parse(raw_date)

    # MySQL doesn't support timezone info, so tzinfo is always stripped.
    # A date carrying a non-zero UTC offset is first shifted by that
    # offset and by the server's own (UTC - local) difference.
    server_diff = datetime.datetime.utcnow() - datetime.datetime.now()
    entry_offset = parsed.utcoffset()
    if entry_offset:
        parsed = parsed - entry_offset - server_diff
    entry['published'] = parsed.replace(tzinfo=None)

    # Quote only what follows the protocol separator; a link without one
    # is quoted in full.
    link = entry.get('link', '')
    scheme_end = link.find("://")
    if scheme_end == -1:
        entry['link'] = urlquote(link)
    else:
        split_at = scheme_end + 3
        entry['link'] = link[:split_at] + urlquote(link[split_at:])
    return entry
|
||||
|
||||
class Meta:
|
||||
db_table="feeds"
|
||||
|
|
@ -253,6 +232,10 @@ class Tag(models.Model):
|
|||
def save(self, *args, **kwargs):
    """Save the tag, forwarding every argument to ``Model.save``.

    The override previously accepted no arguments, so a call such as
    ``tag.save(force_insert=True)`` raised ``TypeError``. Passing
    ``*args``/``**kwargs`` through keeps the override transparent to
    callers while plain ``tag.save()`` behaves exactly as before.
    """
    super(Tag, self).save(*args, **kwargs)
|
||||
|
||||
class StoryAuthor(models.Model):
    '''A story author name, scoped to a single feed'''
    # Feed this author record belongs to.
    feed = models.ForeignKey(Feed)
    # Author name; may be NULL or blank.
    author_name = models.CharField(
        max_length=255,
        null=True,
        blank=True
    )
|
||||
|
||||
class Story(models.Model):
|
||||
'''A feed item'''
|
||||
story_feed = models.ForeignKey(Feed)
|
||||
|
|
@ -262,7 +245,7 @@ class Story(models.Model):
|
|||
story_original_content = models.TextField(null=True, blank=True)
|
||||
story_content_type = models.CharField(max_length=255, null=True,
|
||||
blank=True)
|
||||
story_author = models.CharField(max_length=255, null=True, blank=True)
|
||||
story_author = models.ForeignKey(StoryAuthor)
|
||||
story_permalink = models.CharField(max_length=1000)
|
||||
story_guid = models.CharField(max_length=1000)
|
||||
story_past_trim_date = models.BooleanField(default=False)
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ class FeedTest(TestCase):
|
|||
self.client = Client()
|
||||
|
||||
def test_load_feeds__gawker(self):
|
||||
self.client.login(userame='conesus', password='test')
|
||||
self.client.login(username='conesus', password='test')
|
||||
|
||||
management.call_command('loaddata', 'gawker1.json', verbosity=0)
|
||||
response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
|
||||
|
|
@ -28,24 +28,24 @@ class FeedTest(TestCase):
|
|||
self.assertEquals(len(stories), 38)
|
||||
|
||||
def test_load_feeds__gothamist(self):
|
||||
self.client.login(userame='conesus', password='test')
|
||||
self.client.login(username='conesus', password='test')
|
||||
|
||||
management.call_command('loaddata', 'gothamist1.json', verbosity=0)
|
||||
management.call_command('loaddata', 'gothamist_aug_2009_1.json', verbosity=0)
|
||||
response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True })
|
||||
|
||||
management.call_command('loaddata', 'gothamist2.json', verbosity=0)
|
||||
response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True })
|
||||
|
||||
response = self.client.get('/reader/load_single_feed', { "feed_id": 4 })
|
||||
|
||||
# print [c['story_title'] for c in json.decode(response.content)]
|
||||
stories = json.decode(response.content)
|
||||
self.assertEquals(len(stories), 42)
|
||||
|
||||
management.call_command('loaddata', 'gothamist_aug_2009_2.json', verbosity=0)
|
||||
response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True })
|
||||
response = self.client.get('/reader/load_single_feed', { "feed_id": 4 })
|
||||
print [c['story_title'] for c in json.decode(response.content)]
|
||||
stories = json.decode(response.content)
|
||||
# Test: 1 changed char in title
|
||||
self.assertEquals(len(stories), 42)
|
||||
|
||||
def test_load_feeds__slashdot(self):
|
||||
self.client.login(userame='conesus', password='test')
|
||||
self.client.login(username='conesus', password='test')
|
||||
|
||||
management.call_command('loaddata', 'slashdot1.json', verbosity=0)
|
||||
response = self.client.get('/reader/refresh_feed', { "feed_id": 5, "force": True })
|
||||
|
|
@ -55,7 +55,7 @@ class FeedTest(TestCase):
|
|||
|
||||
response = self.client.get('/reader/load_single_feed', { "feed_id": 5 })
|
||||
|
||||
pprint([c['story_title'] for c in json.decode(response.content)])
|
||||
# pprint([c['story_title'] for c in json.decode(response.content)])
|
||||
stories = json.decode(response.content)
|
||||
|
||||
# Test: 1 changed char in title
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@ from apps.reader.models import UserSubscription, UserSubscriptionFolders, UserSt
|
|||
from apps.rss_feeds.importer import PageImporter
|
||||
from utils import feedparser, threadpool
|
||||
from django.db import transaction
|
||||
from utils.dateutil.parser import parse as dateutil_parse
|
||||
from utils.story_functions import pre_process_story
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
|
|
@ -170,19 +172,21 @@ class ProcessFeed:
|
|||
|
||||
|
||||
# Compare new stories to existing stories, adding and updating
|
||||
try:
|
||||
num_entries = len(self.fpf.entries)
|
||||
existing_stories = Story.objects.filter(
|
||||
story_feed=self.feed
|
||||
).order_by('-story_date').values()[:num_entries*2]
|
||||
|
||||
ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories)
|
||||
except:
|
||||
(etype, eobj, etb) = sys.exc_info()
|
||||
print '[%d] ! -------------------------' % (self.feed.id,)
|
||||
# print traceback.format_exception(etype, eobj, etb)
|
||||
traceback.print_exception(etype, eobj, etb)
|
||||
print '[%d] ! -------------------------' % (self.feed.id,)
|
||||
num_entries = len(self.fpf.entries)
|
||||
start_date = datetime.datetime.now()
|
||||
end_date = datetime.datetime.now()
|
||||
for entry in self.fpf.entries:
|
||||
story = pre_process_story(entry)
|
||||
if story.get('published') < start_date or not start_date:
|
||||
start_date = story.get('published')
|
||||
if story.get('published') > end_date or not end_date:
|
||||
end_date = story.get('published')
|
||||
existing_stories = Story.objects.filter(
|
||||
story_feed=self.feed,
|
||||
story_date__gte=start_date,
|
||||
story_date__lte=end_date,
|
||||
).order_by('-story_date').values()[:100]
|
||||
ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories)
|
||||
|
||||
return FEED_OK, ret_values
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
from django.utils.dateformat import DateFormat
|
||||
import datetime
|
||||
from utils.dateutil.parser import parse as dateutil_parse
|
||||
from django.utils.http import urlquote
|
||||
|
||||
def format_story_link_date__short(date):
|
||||
parsed_date, date_tuple, today_tuple, yesterday_tuple = _extract_date_tuples(date)
|
||||
|
|
@ -26,4 +28,31 @@ def _extract_date_tuples(date):
|
|||
today = datetime.datetime.today()
|
||||
yesterday_tuple = datetime.datetime.timetuple(today - datetime.timedelta(1))[:3]
|
||||
|
||||
return parsed_date, date_tuple, today_tuple, yesterday_tuple
|
||||
return parsed_date, date_tuple, today_tuple, yesterday_tuple
|
||||
|
||||
def pre_process_story(entry):
    """Normalize a parsed feed entry's publication date and link in place.

    ``entry['published']`` becomes a naive ``datetime`` taken from the
    entry's ``published`` value, then ``updated``, then the current time.
    ``entry['link']`` is URL-quoted after any ``scheme://`` prefix.

    The function is safe to call repeatedly on the same entry: a date
    that is already a ``datetime`` is not re-parsed, and percent-escapes
    already present in the link are not escaped a second time.

    Returns the same (mutated) ``entry``.
    """
    import time

    # Fall back to 'updated' when 'published' is missing *or* empty.
    date_published = entry.get('published') or entry.get('updated')
    if not date_published:
        date_published = datetime.datetime.now()
    elif not isinstance(date_published, datetime.datetime):
        date_published = dateutil_parse(date_published)

    # MySQL doesn't support timezone info, so tzinfo is always removed.
    # A date with a non-zero UTC offset is first shifted by that offset
    # (giving UTC) and then by the server's (UTC - local) difference, so
    # the stored value is the server's local wall-clock time.
    # NOTE(review): dates with a zero or missing offset are stored
    # unshifted -- confirm whether that asymmetry is intended.
    date_published_offset = date_published.utcoffset()
    if date_published_offset:
        # Use a single clock reading: separate utcnow()/now() calls differ
        # by a few microseconds, which leaked into the stored date.
        now_ts = time.time()
        timezone_diff = (datetime.datetime.utcfromtimestamp(now_ts)
                         - datetime.datetime.fromtimestamp(now_ts))
        date_published = (date_published - date_published_offset
                          - timezone_diff)
    entry['published'] = date_published.replace(tzinfo=None)

    # Quote the link after the protocol separator. '%' is kept safe so
    # existing escapes survive a second pass unchanged instead of turning
    # '%20' into '%2520'. A missing/None link is treated as empty.
    entry_link = entry.get('link') or ''
    protocol_index = entry_link.find("://")
    if protocol_index != -1:
        split_at = protocol_index + 3
        entry['link'] = (entry_link[:split_at]
                         + urlquote(entry_link[split_at:], safe='/%'))
    else:
        entry['link'] = urlquote(entry_link, safe='/%')
    return entry
|
||||
Loading…
Add table
Reference in a new issue