Don't collide stories that are too short to be collided.

This commit is contained in:
Samuel Clay 2013-01-28 16:45:48 -08:00
parent 4f6edc2acf
commit 79c6ea09cd
2 changed files with 10 additions and 4 deletions

View file

@@ -791,7 +791,7 @@ class Feed(models.Model):
logging.debug(" ---> [%-30s] ~FBChecking ~SB%s~SN new/updated against ~SB%s~SN stories" % (
self.title[:30],
len(stories),
len(existing_stories)))
len(existing_stories.keys())))
for story in stories:
if not story.get('title'):
@@ -1112,12 +1112,13 @@ class Feed(models.Model):
story_in_system = None
story_has_changed = False
story_link = self.get_permalink(story)
existing_stories_guids = existing_stories.keys()
# story_pub_date = story.get('published')
# story_published_now = story.get('published_now', False)
# start_date = story_pub_date - datetime.timedelta(hours=8)
# end_date = story_pub_date + datetime.timedelta(hours=8)
for existing_story in existing_stories:
for existing_story in existing_stories.values():
content_ratio = 0
# existing_story_pub_date = existing_story.story_date
# print 'Story pub date: %s %s' % (story_published_now, story_pub_date)
@@ -1135,7 +1136,10 @@ class Feed(models.Model):
if isinstance(existing_story.id, unicode):
existing_story.story_guid = existing_story.id
if story.get('guid') and story.get('guid') == existing_story.story_guid:
if (story.get('guid') in existing_stories_guids and
story.get('guid') != existing_story.story_guid):
continue
elif story.get('guid') == existing_story.story_guid:
story_in_system = existing_story
# Title distance + content distance, checking if story changed
@@ -1167,8 +1171,10 @@ class Feed(models.Model):
if story_in_system and not story_has_changed:
if story_content != existing_story_content:
# print "Content difference - %s/%s" % (story_content, existing_story_content)
story_has_changed = True
if story_link != existing_story.story_permalink:
# print "Permalink difference - %s/%s" % (story_link, existing_story.story_permalink)
story_has_changed = True
# if story_pub_date != existing_story.story_date:
# story_has_changed = True

View file

@@ -216,7 +216,7 @@ class ProcessFeed:
stories.append(story)
story_guids.append(story.get('guid'))
existing_stories = list(MStory.objects(
existing_stories = dict((s.story_guid, s) for s in MStory.objects(
# story_guid__in=story_guids,
story_date__gte=start_date,
story_feed_id=self.feed.pk