mirror of
https://github.com/viq/NewsBlur.git
synced 2025-09-18 21:43:31 +00:00
Require de-duplicated stories to be published within a day of each other and to have titles that are at least 75% similar.
This commit is contained in:
parent
28dfd96112
commit
281b8a3fd3
1 changed file with 10 additions and 2 deletions
|
@@ -1283,7 +1283,7 @@ class Feed(models.Model):
|
|||
story_has_changed = False
|
||||
story_link = self.get_permalink(story)
|
||||
existing_stories_guids = existing_stories.keys()
|
||||
# story_pub_date = story.get('published')
|
||||
story_pub_date = story.get('published')
|
||||
# story_published_now = story.get('published_now', False)
|
||||
# start_date = story_pub_date - datetime.timedelta(hours=8)
|
||||
# end_date = story_pub_date + datetime.timedelta(hours=8)
|
||||
|
@@ -1315,10 +1315,18 @@ class Feed(models.Model):
|
|||
story.get('guid') != existing_story.story_guid):
|
||||
# Story coming up later
|
||||
continue
|
||||
|
||||
# Title distance + content distance, checking if story changed
|
||||
story_title_difference = abs(levenshtein_distance(story.get('title'),
|
||||
existing_story.story_title))
|
||||
|
||||
|
||||
title_ratio = difflib.SequenceMatcher(None, story.get('title', ""),
|
||||
existing_story.story_title).ratio()
|
||||
if title_ratio < .75: continue
|
||||
|
||||
story_timedelta = existing_story.story_date - story_pub_date
|
||||
if abs(story_timedelta.days) >= 1: continue
|
||||
|
||||
seq = difflib.SequenceMatcher(None, story_content, existing_story_content)
|
||||
|
||||
if (seq
|
||||
|
|
Loading…
Add table
Reference in a new issue