Woo - Checking for content and title differences to resolve duplicate entries. Integration tests prove a number of differences are successfully resolved.

This commit is contained in:
Samuel Clay 2009-08-30 00:43:13 +00:00
parent bdd91d714d
commit 9e522ca528
6 changed files with 90 additions and 82 deletions

View file

@ -14,11 +14,11 @@
<item>
<title>Public Advocate Hopefuls Debate Each Other, Defend The Job</title>
<link>http://feeds.gothamistllc.com/click.phdo?i=8a845cced28e85b43ca559267d509c78</link>
<link>http://feeds.gothamistllc.com/click.phdo?i=yatta</link>
<pheedo:origLink>http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php</pheedo:origLink>
<guid isPermaLink="false">http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php</guid>
<guid isPermaLink="false">http://gothamist.com/2009/08/29/something_different.php</guid>
<comments>http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php#comments</comments>
<description>&lt;p&gt;&lt;span class=&quot;mt-enclosure mt-enclosure-image&quot; style=&quot;display: inline;&quot;&gt; &lt;img alt=&quot;2009_08_pubadv.jpg&quot; src=&quot;http://gothamist.com/attachments/jen/2009_08_pubadv.jpg&quot; width=&quot;290&quot; height=&quot;224&quot; class=&quot;image-left&quot; /&gt; &lt;/span&gt;Last night, the Democratic candidates for NYC Public Advocates squared off for a WABC 7-televised debate, and it got good fast: Within 10 minutes, City Councilman Bill de Blasio &lt;a href=&quot;http://www.nytimes.com/2009/08/29/nyregion/29debate.html?ref=nyregion&quot;&gt;attacked frontrunner and former Public Advocate Mark Green&lt;/a&gt;, &quot;I have to say, with all due respect to Mark Green, he has a little amnesia. &lt;strong&gt;He did not stand up to Rudy Giuliani when it mattered most, when Rudy Giuliani &lt;a href=&quot;http://www.nytimes.com/2001/09/24/nyregion/24RUDY.html&quot;&gt;wanted to stay on for 90 days&lt;/a&gt; at the end of 2001.&lt;/strong&gt; Mark &lt;a href=&quot;http://www.nytimes.com/2001/10/02/nyregion/giuliani-s-quest-for-a-term-extension-hits-a-wall-in-albany.html?scp=3&amp;sq=2001%20giuliani%20mark%20green%2090%20days&amp;st=cse&quot;&gt;caved in to him for political reasons&lt;/a&gt;.&amp;#8221; Green retorted, &quot;There was not any public official in the eight years of Rudy Giuliani who stood up to him more often and more successfully.&quot; Green &lt;a href=&quot;http://www.politickerny.com/5072/green-says-hes-not-interested-running-mayor-de-blasio-attacks&quot;&gt;also told moderator Diane Williams&lt;/a&gt;, &quot;&lt;strong&gt;Diane, let Bill attack me one more time because he needs it emotionally.&lt;/strong&gt;&quot;&lt;/p&gt;
<description>&lt;p&gt;&lt;span class=&quot;mt-enclosure mt-enclosure-image&quot; style=&quot;display: inline;&quot;&gt; &lt;img alt=&quot;2009_08_pubadv.jpg&quot; src=&quot;http://gothamist.com/attachments/jen/2009_08_pubadv.jpg&quot; width=&quot;290&quot; height=&quot;224&quot; class=&quot;image-left&quot; /&gt; &lt;/span&gt;Last night, the Democratic candidates for NYC Public Advocates squared off for a WABC 7-televised debate, and it got good fast: Within 10 minutes, City Councilman Somebody Else Entirely &lt;a href=&quot;http://www.nytimes.com/2009/08/29/nyregion/29debate.html?ref=nyregion&quot;&gt;attacked frontrunner and former Public Advocate Mark Green&lt;/a&gt;, &quot;I have to say, with all due respect to Mark Green, he has a little amnesia. &lt;strong&gt;He did not stand up to Rudy Giuliani when it mattered most, when Rudy Giuliani &lt;a href=&quot;http://www.nytimes.com/2001/09/24/nyregion/24RUDY.html&quot;&gt;wanted to stay on for 90 days&lt;/a&gt; at the end of 2001.&lt;/strong&gt; Mark &lt;a href=&quot;http://www.nytimes.com/2001/10/02/nyregion/giuliani-s-quest-for-a-term-extension-hits-a-wall-in-albany.html?scp=3&amp;sq=2001%20giuliani%20mark%20green%2090%20days&amp;st=cse&quot;&gt;caved in to him for political reasons&lt;/a&gt;.&amp;#8221; Green retorted, &quot;There was not any public official in the eight years of Rudy Giuliani who stood up to him more often and more successfully.&quot; Green &lt;a href=&quot;http://www.politickerny.com/5072/green-says-hes-not-interested-running-mayor-de-blasio-attacks&quot;&gt;also told moderator Diane Williams&lt;/a&gt;, &quot;&lt;strong&gt;Diane, let Bill attack me one more time because he needs it emotionally.&lt;/strong&gt;&quot;&lt;/p&gt;
&lt;p&gt;Of course, a big question that loomed over the debate is whether there needs to be a Public Advocate at all (City Councilman Simcha Felder &lt;a href=&quot;http://www.politickerny.com/4693/felders-argument-eliminating-public-advocate&quot;&gt;released a report on why the city should do away&lt;/a&gt; with the position, which is second highest elected position in the city). Civil rights lawyer Norman Siegel said, &quot;You need a public advocate to go and fight. When there is a slush fund in the City Council, you need someone from the outside like me to take on the insiders,&quot; while Green said, &quot;I was the public advocate. I don't remember this question being asked very often, if ever, when I was there.&quot; &lt;/p&gt;

View file

@ -4,6 +4,7 @@ from apps.rss_feeds.models import Feed, Story
from django.core.cache import cache
from apps.reader.models import UserSubscription, UserSubscriptionFolders, UserStory
from optparse import OptionParser, make_option
from utils.management_functions import daemonize
import os
import logging
import errno
@ -30,24 +31,3 @@ class Command(BaseCommand):
for us in usersubs:
us.count_unread()
cache.delete('usersub:%s' % us.user_id)
def daemonize():
    """
    Detach from the terminal and continue as a daemon.

    Forks twice, exiting the parent each time, and calls os.setsid() in
    between so the surviving process runs in its own session.  The umask
    is then set to 077 and file descriptors 0, 1 and 2 are pointed at
    /dev/null.

    Takes no arguments and returns None.  Only the final (grand)child
    process returns from this call; both intermediate parents leave via
    os._exit(0).  An OSError from os.dup2() is re-raised unless its errno
    is EBADF.
    """
    # swiped from twisted/scripts/twistd.py
    # See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16
    if os.fork():    # launch child and...
        os._exit(0)  # kill off parent
    os.setsid()
    if os.fork():    # launch child and...
        os._exit(0)  # kill off parent again.
    # 0o077 is the same value as the old-style literal 077, spelled so that
    # it parses on both Python 2.6+ and Python 3.
    os.umask(0o077)
    null = os.open("/dev/null", os.O_RDWR)
    try:
        for fd in range(3):
            try:
                os.dup2(null, fd)
            except OSError as e:
                # EBADF is tolerated; anything else is a real failure.
                if e.errno != errno.EBADF:
                    raise
    finally:
        # os.open() hands back the lowest free descriptor, so if one of the
        # standard streams was already closed `null` may itself be 0, 1 or
        # 2.  Closing it in that case would undo the redirection above.
        if null > 2:
            os.close(null)

View file

@ -3,10 +3,9 @@ from django.core.handlers.wsgi import WSGIHandler
from apps.rss_feeds.models import Feed, Story
from optparse import OptionParser, make_option
from utils import feed_fetcher
from utils.management_functions import daemonize
import logging
import os
import socket
import errno
class Command(BaseCommand):
@ -37,23 +36,3 @@ class Command(BaseCommand):
disp.poll()
def daemonize():
    """
    Detach from the terminal and continue as a daemon.

    Forks twice, exiting the parent each time, and calls os.setsid() in
    between so the surviving process runs in its own session.  The umask
    is then set to 077 and file descriptors 0, 1 and 2 are pointed at
    /dev/null.

    Takes no arguments and returns None.  Only the final (grand)child
    process returns from this call; both intermediate parents leave via
    os._exit(0).  An OSError from os.dup2() is re-raised unless its errno
    is EBADF.
    """
    # swiped from twisted/scripts/twistd.py
    # See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16
    if os.fork():    # launch child and...
        os._exit(0)  # kill off parent
    os.setsid()
    if os.fork():    # launch child and...
        os._exit(0)  # kill off parent again.
    # 0o077 is the same value as the old-style literal 077, spelled so that
    # it parses on both Python 2.6+ and Python 3.
    os.umask(0o077)
    null = os.open("/dev/null", os.O_RDWR)
    try:
        for fd in range(3):
            try:
                os.dup2(null, fd)
            except OSError as e:
                # EBADF is tolerated; anything else is a real failure.
                if e.errno != errno.EBADF:
                    raise
    finally:
        # os.open() hands back the lowest free descriptor, so if one of the
        # standard streams was already closed `null` may itself be 0, 1 or
        # 2.  Closing it in that case would undo the redirection above.
        if null > 2:
            os.close(null)

View file

@ -13,6 +13,7 @@ from utils.story_functions import format_story_link_date__long
from django.db.models import Q
import settings
import logging
import difflib
from utils.diff import HTMLDiff
USER_AGENT = 'NewsBlur v1.0 - newsblur.com'
@ -83,7 +84,7 @@ class Feed(models.Model):
story_content = story_contents[0]['value']
else:
story_content = story.get('summary')
existing_story, is_different = self._exists_story(story, story_content, existing_stories)
existing_story, story_has_changed = self._exists_story(story, story_content, existing_stories)
if existing_story is None:
pub_date = datetime.datetime.timetuple(story.get('published'))
# logging.debug('- New story: %s %s' % (pub_date, story.get('title')))
@ -101,7 +102,7 @@ class Feed(models.Model):
except:
ret_values[ENTRY_ERR] += 1
pass
elif existing_story and is_different:
elif existing_story and story_has_changed:
# update story
logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story['story_content']), len(story_content)))
@ -161,30 +162,47 @@ class Feed(models.Model):
return stories
# Scan `existing_stories` for an entry that matches the incoming `story`.
# Returns a pair: the matching existing story (or None) and a flag saying
# whether the incoming story differs from it.
# NOTE(review): this span carries two revisions of the method at once
# (same_story/is_different next to story_in_system/story_has_changed, and
# two return statements) and has lost its indentation; confirm which lines
# are current before relying on any of it.
def _exists_story(self, story=None, story_content=None, existing_stories=None):
same_story = None
is_different = False
story_in_system = None
story_has_changed = False
story_pub_date = story.get('published')
# Window of 8 hours on either side of the incoming story's publish date.
start_date = story_pub_date - datetime.timedelta(hours=8)
end_date = story_pub_date + datetime.timedelta(hours=8)
for existing_story in existing_stories:
content_ratio = 0
# NOTE(review): both bounds are derived from story_pub_date itself, so this
# comparison cannot fail; presumably the existing story's date was meant to
# be tested against the window - confirm.
if story_pub_date > start_date and story_pub_date < end_date:
# An identical link is taken as the same story.
if story.get('link') == existing_story['story_permalink']:
same_story = existing_story
story_in_system = existing_story
# Title distance + content distance, checking if story changed
story_title_difference = levenshtein_distance(story.get('title'),
existing_story['story_title'])
if same_story and story_title_difference < 10:
same_story = existing_story
if story_title_difference > 0:
is_different = True
# Similarity matcher between the incoming content and the stored content.
seq = difflib.SequenceMatcher(None, story_content, existing_story['story_content'])
if same_story:
if story_content != existing_story['story_content']:
is_different = True
# real_quick_ratio() and quick_ratio() are cheap upper bounds on ratio();
# the full ratio() is only computed once both are already high.
if seq.real_quick_ratio() > .9 and seq.quick_ratio() > .95:
content_ratio = seq.ratio()
# A title distance of 1-4 together with content similarity above .98 is
# accepted as a match.
if story_title_difference > 0 and story_title_difference < 5 and content_ratio > .98:
story_in_system = existing_story
if story_title_difference > 0 or content_ratio < 1.0:
# print "Title difference - %s/%s (%s): %s" % (story.get('title'), existing_story['story_title'], story_title_difference, content_ratio)
story_has_changed = True
break
# More restrictive content distance, still no story match
if not story_in_system and content_ratio > .99:
# print "Content difference - %s/%s (%s): %s" % (story.get('title'), existing_story['story_title'], story_title_difference, content_ratio)
story_in_system = existing_story
story_has_changed = True
break
return same_story, is_different
# A match was found: flag it as changed when the content is not identical.
if story_in_system:
if story_content != existing_story['story_content']:
story_has_changed = True
break
return story_in_system, story_has_changed
def _pre_process_story(self, entry):
date_published = entry.get('published', entry.get('updated'))

View file

@ -1,4 +1,4 @@
from django.utils import simplejson as json
from utils.json import decode
from django.test.client import Client
from django.test import TestCase
from django.core import management
@ -9,20 +9,24 @@ class FeedTest(TestCase):
# Give each test its own Django test Client for issuing requests.
def setUp(self):
self.client = Client()
# def test_load_feeds__changed_story_title(self):
# self.client.login(userame='conesus', password='test')
#
# management.call_command('loaddata', 'gawker1.json', verbosity=0)
# response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
#
# management.call_command('loaddata', 'gawker2.json', verbosity=0)
# response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
#
# response = self.client.get('/reader/load_single_feed', { "feed_id": 1 })
# print [c['story_title'] for c in json.loads(response.content)]
# # print json.loads(response.content)[0]
# Loads the gawker1 fixture and refreshes feed 1, loads gawker2 and
# refreshes again, then expects load_single_feed to report 38 stories.
# NOTE(review): two `def` lines appear back to back here; this looks like
# residue from the diff view - confirm which method name is current.
def test_load_feeds__gawker(self):
# NOTE(review): `userame` looks like a typo for `username` - confirm; as
# written the credentials passed to login() carry no `username` key.
self.client.login(userame='conesus', password='test')
def test_load_feeds__gothamist__changed_story_title(self):
management.call_command('loaddata', 'gawker1.json', verbosity=0)
response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
management.call_command('loaddata', 'gawker2.json', verbosity=0)
response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
response = self.client.get('/reader/load_single_feed', { "feed_id": 1 })
# print [c['story_title'] for c in json.loads(response.content)]
stories = decode(response.content)
# Test: 1 changed char in content
self.assertEquals(len(stories), 38)
def test_load_feeds__gothamist(self):
self.client.login(userame='conesus', password='test')
management.call_command('loaddata', 'gothamist1.json', verbosity=0)
@ -32,5 +36,9 @@ class FeedTest(TestCase):
response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True })
response = self.client.get('/reader/load_single_feed', { "feed_id": 4 })
print [c['story_title'] for c in json.loads(response.content)]
# print json.loads(response.content)[0]
# print [c['story_title'] for c in json.loads(response.content)]
stories = decode(response.content)
# Test: 1 changed char in title
self.assertEquals(len(stories), 42)

View file

@ -0,0 +1,23 @@
import os
import errno
def daemonize():
    """
    Detach from the terminal and continue as a daemon.

    Forks twice, exiting the parent each time, and calls os.setsid() in
    between so the surviving process runs in its own session.  The umask
    is then set to 077 and file descriptors 0, 1 and 2 are pointed at
    /dev/null.

    Takes no arguments and returns None.  Only the final (grand)child
    process returns from this call; both intermediate parents leave via
    os._exit(0).  An OSError from os.dup2() is re-raised unless its errno
    is EBADF.
    """
    # swiped from twisted/scripts/twistd.py
    # See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16
    if os.fork():    # launch child and...
        os._exit(0)  # kill off parent
    os.setsid()
    if os.fork():    # launch child and...
        os._exit(0)  # kill off parent again.
    # 0o077 is the same value as the old-style literal 077, spelled so that
    # it parses on both Python 2.6+ and Python 3.
    os.umask(0o077)
    null = os.open("/dev/null", os.O_RDWR)
    try:
        for fd in range(3):
            try:
                os.dup2(null, fd)
            except OSError as e:
                # EBADF is tolerated; anything else is a real failure.
                if e.errno != errno.EBADF:
                    raise
    finally:
        # os.open() hands back the lowest free descriptor, so if one of the
        # standard streams was already closed `null` may itself be 0, 1 or
        # 2.  Closing it in that case would undo the redirection above.
        if null > 2:
            os.close(null)