Woo - Checking for content and title differences to resolve duplicate entries. Integration tests prove a number of differences are successfully resolved.

This commit is contained in:
Samuel Clay 2009-08-30 00:43:13 +00:00
parent bdd91d714d
commit 9e522ca528
6 changed files with 90 additions and 82 deletions

View file

@ -14,11 +14,11 @@
<item> <item>
<title>Public Advocate Hopefuls Debate Each Other, Defend The Job</title> <title>Public Advocate Hopefuls Debate Each Other, Defend The Job</title>
<link>http://feeds.gothamistllc.com/click.phdo?i=8a845cced28e85b43ca559267d509c78</link> <link>http://feeds.gothamistllc.com/click.phdo?i=yatta</link>
<pheedo:origLink>http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php</pheedo:origLink> <pheedo:origLink>http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php</pheedo:origLink>
<guid isPermaLink="false">http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php</guid> <guid isPermaLink="false">http://gothamist.com/2009/08/29/something_different.php</guid>
<comments>http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php#comments</comments> <comments>http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php#comments</comments>
<description>&lt;p&gt;&lt;span class=&quot;mt-enclosure mt-enclosure-image&quot; style=&quot;display: inline;&quot;&gt; &lt;img alt=&quot;2009_08_pubadv.jpg&quot; src=&quot;http://gothamist.com/attachments/jen/2009_08_pubadv.jpg&quot; width=&quot;290&quot; height=&quot;224&quot; class=&quot;image-left&quot; /&gt; &lt;/span&gt;Last night, the Democratic candidates for NYC Public Advocates squared off for a WABC 7-televised debate, and it got good fast: Within 10 minutes, City Councilman Bill de Blasio &lt;a href=&quot;http://www.nytimes.com/2009/08/29/nyregion/29debate.html?ref=nyregion&quot;&gt;attacked frontrunner and former Public Advocate Mark Green&lt;/a&gt;, &quot;I have to say, with all due respect to Mark Green, he has a little amnesia. &lt;strong&gt;He did not stand up to Rudy Giuliani when it mattered most, when Rudy Giuliani &lt;a href=&quot;http://www.nytimes.com/2001/09/24/nyregion/24RUDY.html&quot;&gt;wanted to stay on for 90 days&lt;/a&gt; at the end of 2001.&lt;/strong&gt; Mark &lt;a href=&quot;http://www.nytimes.com/2001/10/02/nyregion/giuliani-s-quest-for-a-term-extension-hits-a-wall-in-albany.html?scp=3&amp;sq=2001%20giuliani%20mark%20green%2090%20days&amp;st=cse&quot;&gt;caved in to him for political reasons&lt;/a&gt;.&amp;#8221; Green retorted, &quot;There was not any public official in the eight years of Rudy Giuliani who stood up to him more often and more successfully.&quot; Green &lt;a href=&quot;http://www.politickerny.com/5072/green-says-hes-not-interested-running-mayor-de-blasio-attacks&quot;&gt;also told moderator Diane Williams&lt;/a&gt;, &quot;&lt;strong&gt;Diane, let Bill attack me one more time because he needs it emotionally.&lt;/strong&gt;&quot;&lt;/p&gt; <description>&lt;p&gt;&lt;span class=&quot;mt-enclosure mt-enclosure-image&quot; style=&quot;display: inline;&quot;&gt; &lt;img alt=&quot;2009_08_pubadv.jpg&quot; src=&quot;http://gothamist.com/attachments/jen/2009_08_pubadv.jpg&quot; width=&quot;290&quot; height=&quot;224&quot; 
class=&quot;image-left&quot; /&gt; &lt;/span&gt;Last night, the Democratic candidates for NYC Public Advocates squared off for a WABC 7-televised debate, and it got good fast: Within 10 minutes, City Councilman Somebody Else Entirely &lt;a href=&quot;http://www.nytimes.com/2009/08/29/nyregion/29debate.html?ref=nyregion&quot;&gt;attacked frontrunner and former Public Advocate Mark Green&lt;/a&gt;, &quot;I have to say, with all due respect to Mark Green, he has a little amnesia. &lt;strong&gt;He did not stand up to Rudy Giuliani when it mattered most, when Rudy Giuliani &lt;a href=&quot;http://www.nytimes.com/2001/09/24/nyregion/24RUDY.html&quot;&gt;wanted to stay on for 90 days&lt;/a&gt; at the end of 2001.&lt;/strong&gt; Mark &lt;a href=&quot;http://www.nytimes.com/2001/10/02/nyregion/giuliani-s-quest-for-a-term-extension-hits-a-wall-in-albany.html?scp=3&amp;sq=2001%20giuliani%20mark%20green%2090%20days&amp;st=cse&quot;&gt;caved in to him for political reasons&lt;/a&gt;.&amp;#8221; Green retorted, &quot;There was not any public official in the eight years of Rudy Giuliani who stood up to him more often and more successfully.&quot; Green &lt;a href=&quot;http://www.politickerny.com/5072/green-says-hes-not-interested-running-mayor-de-blasio-attacks&quot;&gt;also told moderator Diane Williams&lt;/a&gt;, &quot;&lt;strong&gt;Diane, let Bill attack me one more time because he needs it emotionally.&lt;/strong&gt;&quot;&lt;/p&gt;
&lt;p&gt;Of course, a big question that loomed over the debate is whether there needs to be a Public Advocate at all (City Councilman Simcha Felder &lt;a href=&quot;http://www.politickerny.com/4693/felders-argument-eliminating-public-advocate&quot;&gt;released a report on why the city should do away&lt;/a&gt; with the position, which is second highest elected position in the city). Civil rights lawyer Norman Siegel said, &quot;You need a public advocate to go and fight. When there is a slush fund in the City Council, you need someone from the outside like me to take on the insiders,&quot; while Green said, &quot;I was the public advocate. I don't remember this question being asked very often, if ever, when I was there.&quot; &lt;/p&gt; &lt;p&gt;Of course, a big question that loomed over the debate is whether there needs to be a Public Advocate at all (City Councilman Simcha Felder &lt;a href=&quot;http://www.politickerny.com/4693/felders-argument-eliminating-public-advocate&quot;&gt;released a report on why the city should do away&lt;/a&gt; with the position, which is second highest elected position in the city). Civil rights lawyer Norman Siegel said, &quot;You need a public advocate to go and fight. When there is a slush fund in the City Council, you need someone from the outside like me to take on the insiders,&quot; while Green said, &quot;I was the public advocate. I don't remember this question being asked very often, if ever, when I was there.&quot; &lt;/p&gt;

View file

@ -4,6 +4,7 @@ from apps.rss_feeds.models import Feed, Story
from django.core.cache import cache from django.core.cache import cache
from apps.reader.models import UserSubscription, UserSubscriptionFolders, UserStory from apps.reader.models import UserSubscription, UserSubscriptionFolders, UserStory
from optparse import OptionParser, make_option from optparse import OptionParser, make_option
from utils.management_functions import daemonize
import os import os
import logging import logging
import errno import errno
@ -30,24 +31,3 @@ class Command(BaseCommand):
for us in usersubs: for us in usersubs:
us.count_unread() us.count_unread()
cache.delete('usersub:%s' % us.user_id) cache.delete('usersub:%s' % us.user_id)
def daemonize():
    """
    Detach from the terminal and continue as a daemon.

    Forks twice with a ``setsid()`` in between, so the surviving process
    is no longer a session leader, then points stdin, stdout and stderr
    at ``/dev/null``.  Only the final grandchild returns from this call;
    the intermediate parents exit immediately via ``os._exit(0)``.

    Raises:
        OSError: if forking, creating the session, opening ``/dev/null``
            or redirecting a descriptor fails (``EBADF`` from ``dup2`` is
            deliberately ignored).
    """
    # swiped from twisted/scripts/twistd.py
    # See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16
    if os.fork():    # launch child and...
        os._exit(0)  # kill off parent
    os.setsid()
    if os.fork():    # launch child and...
        os._exit(0)  # kill off parent again.
    # 0o077 is the same mask as the old ``077`` literal, spelled so that it
    # parses on both Python 2.6+ and Python 3.
    os.umask(0o077)
    null = os.open("/dev/null", os.O_RDWR)
    try:
        for fd in range(3):
            try:
                os.dup2(null, fd)
            except OSError as e:
                if e.errno != errno.EBADF:
                    raise
    finally:
        # If a standard stream was already closed, os.open() may have handed
        # back descriptor 0, 1 or 2; closing it would undo the redirection.
        if null > 2:
            os.close(null)

View file

@ -3,10 +3,9 @@ from django.core.handlers.wsgi import WSGIHandler
from apps.rss_feeds.models import Feed, Story from apps.rss_feeds.models import Feed, Story
from optparse import OptionParser, make_option from optparse import OptionParser, make_option
from utils import feed_fetcher from utils import feed_fetcher
from utils.management_functions import daemonize
import logging import logging
import os
import socket import socket
import errno
class Command(BaseCommand): class Command(BaseCommand):
@ -37,23 +36,3 @@ class Command(BaseCommand):
disp.poll() disp.poll()
def daemonize():
    """
    Detach from the terminal and continue as a daemon.

    Forks twice with a ``setsid()`` in between, so the surviving process
    is no longer a session leader, then points stdin, stdout and stderr
    at ``/dev/null``.  Only the final grandchild returns from this call;
    the intermediate parents exit immediately via ``os._exit(0)``.

    Raises:
        OSError: if forking, creating the session, opening ``/dev/null``
            or redirecting a descriptor fails (``EBADF`` from ``dup2`` is
            deliberately ignored).
    """
    # swiped from twisted/scripts/twistd.py
    # See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16
    if os.fork():    # launch child and...
        os._exit(0)  # kill off parent
    os.setsid()
    if os.fork():    # launch child and...
        os._exit(0)  # kill off parent again.
    # 0o077 is the same mask as the old ``077`` literal, spelled so that it
    # parses on both Python 2.6+ and Python 3.
    os.umask(0o077)
    null = os.open("/dev/null", os.O_RDWR)
    try:
        for fd in range(3):
            try:
                os.dup2(null, fd)
            except OSError as e:
                if e.errno != errno.EBADF:
                    raise
    finally:
        # If a standard stream was already closed, os.open() may have handed
        # back descriptor 0, 1 or 2; closing it would undo the redirection.
        if null > 2:
            os.close(null)

View file

@ -13,6 +13,7 @@ from utils.story_functions import format_story_link_date__long
from django.db.models import Q from django.db.models import Q
import settings import settings
import logging import logging
import difflib
from utils.diff import HTMLDiff from utils.diff import HTMLDiff
USER_AGENT = 'NewsBlur v1.0 - newsblur.com' USER_AGENT = 'NewsBlur v1.0 - newsblur.com'
@ -83,7 +84,7 @@ class Feed(models.Model):
story_content = story_contents[0]['value'] story_content = story_contents[0]['value']
else: else:
story_content = story.get('summary') story_content = story.get('summary')
existing_story, is_different = self._exists_story(story, story_content, existing_stories) existing_story, story_has_changed = self._exists_story(story, story_content, existing_stories)
if existing_story is None: if existing_story is None:
pub_date = datetime.datetime.timetuple(story.get('published')) pub_date = datetime.datetime.timetuple(story.get('published'))
# logging.debug('- New story: %s %s' % (pub_date, story.get('title'))) # logging.debug('- New story: %s %s' % (pub_date, story.get('title')))
@ -101,7 +102,7 @@ class Feed(models.Model):
except: except:
ret_values[ENTRY_ERR] += 1 ret_values[ENTRY_ERR] += 1
pass pass
elif existing_story and is_different: elif existing_story and story_has_changed:
# update story # update story
logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story['story_content']), len(story_content))) logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story['story_content']), len(story_content)))
@ -161,30 +162,47 @@ class Feed(models.Model):
return stories return stories
def _exists_story(self, story=None, story_content=None, existing_stories=None): def _exists_story(self, story=None, story_content=None, existing_stories=None):
same_story = None story_in_system = None
is_different = False story_has_changed = False
story_pub_date = story.get('published') story_pub_date = story.get('published')
start_date = story_pub_date - datetime.timedelta(hours=8) start_date = story_pub_date - datetime.timedelta(hours=8)
end_date = story_pub_date + datetime.timedelta(hours=8) end_date = story_pub_date + datetime.timedelta(hours=8)
for existing_story in existing_stories: for existing_story in existing_stories:
content_ratio = 0
if story_pub_date > start_date and story_pub_date < end_date: if story_pub_date > start_date and story_pub_date < end_date:
if story.get('link') == existing_story['story_permalink']: if story.get('link') == existing_story['story_permalink']:
same_story = existing_story story_in_system = existing_story
# Title distance + content distance, checking if story changed
story_title_difference = levenshtein_distance(story.get('title'), story_title_difference = levenshtein_distance(story.get('title'),
existing_story['story_title']) existing_story['story_title'])
if same_story and story_title_difference < 10: seq = difflib.SequenceMatcher(None, story_content, existing_story['story_content'])
same_story = existing_story
if story_title_difference > 0:
is_different = True
if same_story: if seq.real_quick_ratio() > .9 and seq.quick_ratio() > .95:
if story_content != existing_story['story_content']: content_ratio = seq.ratio()
is_different = True
if story_title_difference > 0 and story_title_difference < 5 and content_ratio > .98:
story_in_system = existing_story
if story_title_difference > 0 or content_ratio < 1.0:
# print "Title difference - %s/%s (%s): %s" % (story.get('title'), existing_story['story_title'], story_title_difference, content_ratio)
story_has_changed = True
break break
return same_story, is_different # More restrictive content distance, still no story match
if not story_in_system and content_ratio > .99:
# print "Content difference - %s/%s (%s): %s" % (story.get('title'), existing_story['story_title'], story_title_difference, content_ratio)
story_in_system = existing_story
story_has_changed = True
break
if story_in_system:
if story_content != existing_story['story_content']:
story_has_changed = True
break
return story_in_system, story_has_changed
def _pre_process_story(self, entry): def _pre_process_story(self, entry):
date_published = entry.get('published', entry.get('updated')) date_published = entry.get('published', entry.get('updated'))

View file

@ -1,4 +1,4 @@
from django.utils import simplejson as json from utils.json import decode
from django.test.client import Client from django.test.client import Client
from django.test import TestCase from django.test import TestCase
from django.core import management from django.core import management
@ -9,20 +9,24 @@ class FeedTest(TestCase):
def setUp(self): def setUp(self):
self.client = Client() self.client = Client()
# def test_load_feeds__changed_story_title(self): def test_load_feeds__gawker(self):
# self.client.login(userame='conesus', password='test') self.client.login(userame='conesus', password='test')
#
# management.call_command('loaddata', 'gawker1.json', verbosity=0)
# response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
#
# management.call_command('loaddata', 'gawker2.json', verbosity=0)
# response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
#
# response = self.client.get('/reader/load_single_feed', { "feed_id": 1 })
# print [c['story_title'] for c in json.loads(response.content)]
# # print json.loads(response.content)[0]
def test_load_feeds__gothamist__changed_story_title(self): management.call_command('loaddata', 'gawker1.json', verbosity=0)
response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
management.call_command('loaddata', 'gawker2.json', verbosity=0)
response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
response = self.client.get('/reader/load_single_feed', { "feed_id": 1 })
# print [c['story_title'] for c in json.loads(response.content)]
stories = decode(response.content)
# Test: 1 changed char in content
self.assertEquals(len(stories), 38)
def test_load_feeds__gothamist(self):
self.client.login(userame='conesus', password='test') self.client.login(userame='conesus', password='test')
management.call_command('loaddata', 'gothamist1.json', verbosity=0) management.call_command('loaddata', 'gothamist1.json', verbosity=0)
@ -32,5 +36,9 @@ class FeedTest(TestCase):
response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True }) response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True })
response = self.client.get('/reader/load_single_feed', { "feed_id": 4 }) response = self.client.get('/reader/load_single_feed', { "feed_id": 4 })
print [c['story_title'] for c in json.loads(response.content)]
# print json.loads(response.content)[0] # print [c['story_title'] for c in json.loads(response.content)]
stories = decode(response.content)
# Test: 1 changed char in title
self.assertEquals(len(stories), 42)

View file

@ -0,0 +1,23 @@
import os
import errno
def daemonize():
    """
    Detach from the terminal and continue as a daemon.

    Forks twice with a ``setsid()`` in between, so the surviving process
    is no longer a session leader, then points stdin, stdout and stderr
    at ``/dev/null``.  Only the final grandchild returns from this call;
    the intermediate parents exit immediately via ``os._exit(0)``.

    Raises:
        OSError: if forking, creating the session, opening ``/dev/null``
            or redirecting a descriptor fails (``EBADF`` from ``dup2`` is
            deliberately ignored).
    """
    # swiped from twisted/scripts/twistd.py
    # See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16
    if os.fork():    # launch child and...
        os._exit(0)  # kill off parent
    os.setsid()
    if os.fork():    # launch child and...
        os._exit(0)  # kill off parent again.
    # 0o077 is the same mask as the old ``077`` literal, spelled so that it
    # parses on both Python 2.6+ and Python 3.
    os.umask(0o077)
    null = os.open("/dev/null", os.O_RDWR)
    try:
        for fd in range(3):
            try:
                os.dup2(null, fd)
            except OSError as e:
                if e.errno != errno.EBADF:
                    raise
    finally:
        # If a standard stream was already closed, os.open() may have handed
        # back descriptor 0, 1 or 2; closing it would undo the redirection.
        if null > 2:
            os.close(null)