mirror of
https://github.com/viq/NewsBlur.git
synced 2025-04-13 09:38:09 +00:00
Woo - Checking for content and title differences to resolve duplicate entries. Integration tests prove a number of differences are successfully resolved.
This commit is contained in:
parent
bdd91d714d
commit
9e522ca528
6 changed files with 90 additions and 82 deletions
|
@ -14,11 +14,11 @@
|
|||
<item>
|
||||
<title>Public Advocate Hopefuls Debate Each Other, Defend The Job</title>
|
||||
|
||||
<link>http://feeds.gothamistllc.com/click.phdo?i=8a845cced28e85b43ca559267d509c78</link>
|
||||
<link>http://feeds.gothamistllc.com/click.phdo?i=yatta</link>
|
||||
<pheedo:origLink>http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php</pheedo:origLink>
|
||||
<guid isPermaLink="false">http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php</guid>
|
||||
<guid isPermaLink="false">http://gothamist.com/2009/08/29/something_different.php</guid>
|
||||
<comments>http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php#comments</comments>
|
||||
<description><p><span class="mt-enclosure mt-enclosure-image" style="display: inline;"> <img alt="2009_08_pubadv.jpg" src="http://gothamist.com/attachments/jen/2009_08_pubadv.jpg" width="290" height="224" class="image-left" /> </span>Last night, the Democratic candidates for NYC Public Advocates squared off for a WABC 7-televised debate, and it got good fast: Within 10 minutes, City Councilman Bill de Blasio <a href="http://www.nytimes.com/2009/08/29/nyregion/29debate.html?ref=nyregion">attacked frontrunner and former Public Advocate Mark Green</a>, "I have to say, with all due respect to Mark Green, he has a little amnesia. <strong>He did not stand up to Rudy Giuliani when it mattered most, when Rudy Giuliani <a href="http://www.nytimes.com/2001/09/24/nyregion/24RUDY.html">wanted to stay on for 90 days</a> at the end of 2001.</strong> Mark <a href="http://www.nytimes.com/2001/10/02/nyregion/giuliani-s-quest-for-a-term-extension-hits-a-wall-in-albany.html?scp=3&sq=2001%20giuliani%20mark%20green%2090%20days&st=cse">caved in to him for political reasons</a>.&#8221; Green retorted, "There was not any public official in the eight years of Rudy Giuliani who stood up to him more often and more successfully." Green <a href="http://www.politickerny.com/5072/green-says-hes-not-interested-running-mayor-de-blasio-attacks">also told moderator Diane Williams</a>, "<strong>Diane, let Bill attack me one more time because he needs it emotionally.</strong>"</p>
|
||||
<description><p><span class="mt-enclosure mt-enclosure-image" style="display: inline;"> <img alt="2009_08_pubadv.jpg" src="http://gothamist.com/attachments/jen/2009_08_pubadv.jpg" width="290" height="224" class="image-left" /> </span>Last night, the Democratic candidates for NYC Public Advocates squared off for a WABC 7-televised debate, and it got good fast: Within 10 minutes, City Councilman Somebody Else Entirely <a href="http://www.nytimes.com/2009/08/29/nyregion/29debate.html?ref=nyregion">attacked frontrunner and former Public Advocate Mark Green</a>, "I have to say, with all due respect to Mark Green, he has a little amnesia. <strong>He did not stand up to Rudy Giuliani when it mattered most, when Rudy Giuliani <a href="http://www.nytimes.com/2001/09/24/nyregion/24RUDY.html">wanted to stay on for 90 days</a> at the end of 2001.</strong> Mark <a href="http://www.nytimes.com/2001/10/02/nyregion/giuliani-s-quest-for-a-term-extension-hits-a-wall-in-albany.html?scp=3&sq=2001%20giuliani%20mark%20green%2090%20days&st=cse">caved in to him for political reasons</a>.&#8221; Green retorted, "There was not any public official in the eight years of Rudy Giuliani who stood up to him more often and more successfully." Green <a href="http://www.politickerny.com/5072/green-says-hes-not-interested-running-mayor-de-blasio-attacks">also told moderator Diane Williams</a>, "<strong>Diane, let Bill attack me one more time because he needs it emotionally.</strong>"</p>
|
||||
|
||||
<p>Of course, a big question that loomed over the debate is whether there needs to be a Public Advocate at all (City Councilman Simcha Felder <a href="http://www.politickerny.com/4693/felders-argument-eliminating-public-advocate">released a report on why the city should do away</a> with the position, which is second highest elected position in the city). Civil rights lawyer Norman Siegel said, "You need a public advocate to go and fight. When there is a slush fund in the City Council, you need someone from the outside like me to take on the insiders," while Green said, "I was the public advocate. I don't remember this question being asked very often, if ever, when I was there." </p>
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ from apps.rss_feeds.models import Feed, Story
|
|||
from django.core.cache import cache
|
||||
from apps.reader.models import UserSubscription, UserSubscriptionFolders, UserStory
|
||||
from optparse import OptionParser, make_option
|
||||
from utils.management_functions import daemonize
|
||||
import os
|
||||
import logging
|
||||
import errno
|
||||
|
@ -29,25 +30,4 @@ class Command(BaseCommand):
|
|||
)
|
||||
for us in usersubs:
|
||||
us.count_unread()
|
||||
cache.delete('usersub:%s' % us.user_id)
|
||||
|
||||
def daemonize():
|
||||
"""
|
||||
Detach from the terminal and continue as a daemon.
|
||||
"""
|
||||
# swiped from twisted/scripts/twistd.py
|
||||
# See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16
|
||||
if os.fork(): # launch child and...
|
||||
os._exit(0) # kill off parent
|
||||
os.setsid()
|
||||
if os.fork(): # launch child and...
|
||||
os._exit(0) # kill off parent again.
|
||||
os.umask(077)
|
||||
null = os.open("/dev/null", os.O_RDWR)
|
||||
for i in range(3):
|
||||
try:
|
||||
os.dup2(null, i)
|
||||
except OSError, e:
|
||||
if e.errno != errno.EBADF:
|
||||
raise
|
||||
os.close(null)
|
||||
cache.delete('usersub:%s' % us.user_id)
|
|
@ -3,10 +3,9 @@ from django.core.handlers.wsgi import WSGIHandler
|
|||
from apps.rss_feeds.models import Feed, Story
|
||||
from optparse import OptionParser, make_option
|
||||
from utils import feed_fetcher
|
||||
from utils.management_functions import daemonize
|
||||
import logging
|
||||
import os
|
||||
import socket
|
||||
import errno
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
|
@ -36,24 +35,4 @@ class Command(BaseCommand):
|
|||
|
||||
disp.poll()
|
||||
|
||||
|
||||
def daemonize():
|
||||
"""
|
||||
Detach from the terminal and continue as a daemon.
|
||||
"""
|
||||
# swiped from twisted/scripts/twistd.py
|
||||
# See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16
|
||||
if os.fork(): # launch child and...
|
||||
os._exit(0) # kill off parent
|
||||
os.setsid()
|
||||
if os.fork(): # launch child and...
|
||||
os._exit(0) # kill off parent again.
|
||||
os.umask(077)
|
||||
null = os.open("/dev/null", os.O_RDWR)
|
||||
for i in range(3):
|
||||
try:
|
||||
os.dup2(null, i)
|
||||
except OSError, e:
|
||||
if e.errno != errno.EBADF:
|
||||
raise
|
||||
os.close(null)
|
||||
|
|
@ -13,6 +13,7 @@ from utils.story_functions import format_story_link_date__long
|
|||
from django.db.models import Q
|
||||
import settings
|
||||
import logging
|
||||
import difflib
|
||||
from utils.diff import HTMLDiff
|
||||
|
||||
USER_AGENT = 'NewsBlur v1.0 - newsblur.com'
|
||||
|
@ -83,7 +84,7 @@ class Feed(models.Model):
|
|||
story_content = story_contents[0]['value']
|
||||
else:
|
||||
story_content = story.get('summary')
|
||||
existing_story, is_different = self._exists_story(story, story_content, existing_stories)
|
||||
existing_story, story_has_changed = self._exists_story(story, story_content, existing_stories)
|
||||
if existing_story is None:
|
||||
pub_date = datetime.datetime.timetuple(story.get('published'))
|
||||
# logging.debug('- New story: %s %s' % (pub_date, story.get('title')))
|
||||
|
@ -101,7 +102,7 @@ class Feed(models.Model):
|
|||
except:
|
||||
ret_values[ENTRY_ERR] += 1
|
||||
pass
|
||||
elif existing_story and is_different:
|
||||
elif existing_story and story_has_changed:
|
||||
# update story
|
||||
logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story['story_content']), len(story_content)))
|
||||
|
||||
|
@ -161,30 +162,47 @@ class Feed(models.Model):
|
|||
return stories
|
||||
|
||||
def _exists_story(self, story=None, story_content=None, existing_stories=None):
|
||||
same_story = None
|
||||
is_different = False
|
||||
story_in_system = None
|
||||
story_has_changed = False
|
||||
story_pub_date = story.get('published')
|
||||
start_date = story_pub_date - datetime.timedelta(hours=8)
|
||||
end_date = story_pub_date + datetime.timedelta(hours=8)
|
||||
|
||||
|
||||
for existing_story in existing_stories:
|
||||
content_ratio = 0
|
||||
|
||||
if story_pub_date > start_date and story_pub_date < end_date:
|
||||
if story.get('link') == existing_story['story_permalink']:
|
||||
same_story = existing_story
|
||||
|
||||
story_in_system = existing_story
|
||||
|
||||
# Title distance + content distance, checking if story changed
|
||||
story_title_difference = levenshtein_distance(story.get('title'),
|
||||
existing_story['story_title'])
|
||||
if same_story and story_title_difference < 10:
|
||||
same_story = existing_story
|
||||
if story_title_difference > 0:
|
||||
is_different = True
|
||||
|
||||
if same_story:
|
||||
if story_content != existing_story['story_content']:
|
||||
is_different = True
|
||||
seq = difflib.SequenceMatcher(None, story_content, existing_story['story_content'])
|
||||
|
||||
if seq.real_quick_ratio() > .9 and seq.quick_ratio() > .95:
|
||||
content_ratio = seq.ratio()
|
||||
|
||||
if story_title_difference > 0 and story_title_difference < 5 and content_ratio > .98:
|
||||
story_in_system = existing_story
|
||||
if story_title_difference > 0 or content_ratio < 1.0:
|
||||
# print "Title difference - %s/%s (%s): %s" % (story.get('title'), existing_story['story_title'], story_title_difference, content_ratio)
|
||||
story_has_changed = True
|
||||
break
|
||||
|
||||
# More restrictive content distance, still no story match
|
||||
if not story_in_system and content_ratio > .99:
|
||||
# print "Content difference - %s/%s (%s): %s" % (story.get('title'), existing_story['story_title'], story_title_difference, content_ratio)
|
||||
story_in_system = existing_story
|
||||
story_has_changed = True
|
||||
break
|
||||
|
||||
return same_story, is_different
|
||||
|
||||
if story_in_system:
|
||||
if story_content != existing_story['story_content']:
|
||||
story_has_changed = True
|
||||
break
|
||||
|
||||
return story_in_system, story_has_changed
|
||||
|
||||
def _pre_process_story(self, entry):
|
||||
date_published = entry.get('published', entry.get('updated'))
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from django.utils import simplejson as json
|
||||
from utils.json import decode
|
||||
from django.test.client import Client
|
||||
from django.test import TestCase
|
||||
from django.core import management
|
||||
|
@ -9,20 +9,24 @@ class FeedTest(TestCase):
|
|||
def setUp(self):
|
||||
self.client = Client()
|
||||
|
||||
# def test_load_feeds__changed_story_title(self):
|
||||
# self.client.login(userame='conesus', password='test')
|
||||
#
|
||||
# management.call_command('loaddata', 'gawker1.json', verbosity=0)
|
||||
# response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
|
||||
#
|
||||
# management.call_command('loaddata', 'gawker2.json', verbosity=0)
|
||||
# response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
|
||||
#
|
||||
# response = self.client.get('/reader/load_single_feed', { "feed_id": 1 })
|
||||
# print [c['story_title'] for c in json.loads(response.content)]
|
||||
# # print json.loads(response.content)[0]
|
||||
|
||||
def test_load_feeds__gothamist__changed_story_title(self):
|
||||
def test_load_feeds__gawker(self):
|
||||
self.client.login(userame='conesus', password='test')
|
||||
|
||||
management.call_command('loaddata', 'gawker1.json', verbosity=0)
|
||||
response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
|
||||
|
||||
management.call_command('loaddata', 'gawker2.json', verbosity=0)
|
||||
response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
|
||||
|
||||
response = self.client.get('/reader/load_single_feed', { "feed_id": 1 })
|
||||
|
||||
# print [c['story_title'] for c in json.loads(response.content)]
|
||||
stories = decode(response.content)
|
||||
|
||||
# Test: 1 changed char in content
|
||||
self.assertEquals(len(stories), 38)
|
||||
|
||||
def test_load_feeds__gothamist(self):
|
||||
self.client.login(userame='conesus', password='test')
|
||||
|
||||
management.call_command('loaddata', 'gothamist1.json', verbosity=0)
|
||||
|
@ -32,5 +36,9 @@ class FeedTest(TestCase):
|
|||
response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True })
|
||||
|
||||
response = self.client.get('/reader/load_single_feed', { "feed_id": 4 })
|
||||
print [c['story_title'] for c in json.loads(response.content)]
|
||||
# print json.loads(response.content)[0]
|
||||
|
||||
# print [c['story_title'] for c in json.loads(response.content)]
|
||||
stories = decode(response.content)
|
||||
|
||||
# Test: 1 changed char in title
|
||||
self.assertEquals(len(stories), 42)
|
23
utils/management_functions.py
Normal file
23
utils/management_functions.py
Normal file
|
@ -0,0 +1,23 @@
|
|||
import os
|
||||
import errno
|
||||
|
||||
def daemonize():
    """
    Detach from the terminal and continue as a daemon.

    Forks twice, calling ``os.setsid()`` between the forks and exiting the
    parent each time, then sets the umask to 077 and points file
    descriptors 0, 1 and 2 at /dev/null.

    Raises:
        OSError: if redirecting a standard descriptor fails for any reason
            other than EBADF.
    """
    # swiped from twisted/scripts/twistd.py
    # See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16
    if os.fork():    # launch child and...
        os._exit(0)  # kill off parent
    os.setsid()
    if os.fork():    # launch child and...
        os._exit(0)  # kill off parent again.
    # `0o077` is the octal spelling accepted by both Python 2.6+ and 3.
    os.umask(0o077)
    null = os.open("/dev/null", os.O_RDWR)
    try:
        for fd in range(3):
            try:
                os.dup2(null, fd)
            except OSError as e:
                if e.errno != errno.EBADF:
                    raise
    finally:
        # If the process was started with a standard descriptor closed,
        # os.open() can hand back 0, 1 or 2. Closing it then would undo the
        # redirection just set up, so only a spare descriptor is closed.
        if null > 2:
            os.close(null)
|
Loading…
Add table
Reference in a new issue