mirror of
https://github.com/viq/NewsBlur.git
synced 2025-08-05 16:49:45 +00:00
Woo - Checking for content and title differences to resolve duplicate entries. Integration tests prove a number of differences are successfully resolved.
This commit is contained in:
parent
bdd91d714d
commit
9e522ca528
6 changed files with 90 additions and 82 deletions
|
@ -14,11 +14,11 @@
|
||||||
<item>
|
<item>
|
||||||
<title>Public Advocate Hopefuls Debate Each Other, Defend The Job</title>
|
<title>Public Advocate Hopefuls Debate Each Other, Defend The Job</title>
|
||||||
|
|
||||||
<link>http://feeds.gothamistllc.com/click.phdo?i=8a845cced28e85b43ca559267d509c78</link>
|
<link>http://feeds.gothamistllc.com/click.phdo?i=yatta</link>
|
||||||
<pheedo:origLink>http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php</pheedo:origLink>
|
<pheedo:origLink>http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php</pheedo:origLink>
|
||||||
<guid isPermaLink="false">http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php</guid>
|
<guid isPermaLink="false">http://gothamist.com/2009/08/29/something_different.php</guid>
|
||||||
<comments>http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php#comments</comments>
|
<comments>http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php#comments</comments>
|
||||||
<description><p><span class="mt-enclosure mt-enclosure-image" style="display: inline;"> <img alt="2009_08_pubadv.jpg" src="http://gothamist.com/attachments/jen/2009_08_pubadv.jpg" width="290" height="224" class="image-left" /> </span>Last night, the Democratic candidates for NYC Public Advocates squared off for a WABC 7-televised debate, and it got good fast: Within 10 minutes, City Councilman Bill de Blasio <a href="http://www.nytimes.com/2009/08/29/nyregion/29debate.html?ref=nyregion">attacked frontrunner and former Public Advocate Mark Green</a>, "I have to say, with all due respect to Mark Green, he has a little amnesia. <strong>He did not stand up to Rudy Giuliani when it mattered most, when Rudy Giuliani <a href="http://www.nytimes.com/2001/09/24/nyregion/24RUDY.html">wanted to stay on for 90 days</a> at the end of 2001.</strong> Mark <a href="http://www.nytimes.com/2001/10/02/nyregion/giuliani-s-quest-for-a-term-extension-hits-a-wall-in-albany.html?scp=3&sq=2001%20giuliani%20mark%20green%2090%20days&st=cse">caved in to him for political reasons</a>.&#8221; Green retorted, "There was not any public official in the eight years of Rudy Giuliani who stood up to him more often and more successfully." Green <a href="http://www.politickerny.com/5072/green-says-hes-not-interested-running-mayor-de-blasio-attacks">also told moderator Diane Williams</a>, "<strong>Diane, let Bill attack me one more time because he needs it emotionally.</strong>"</p>
|
<description><p><span class="mt-enclosure mt-enclosure-image" style="display: inline;"> <img alt="2009_08_pubadv.jpg" src="http://gothamist.com/attachments/jen/2009_08_pubadv.jpg" width="290" height="224" class="image-left" /> </span>Last night, the Democratic candidates for NYC Public Advocates squared off for a WABC 7-televised debate, and it got good fast: Within 10 minutes, City Councilman Somebody Else Entirely <a href="http://www.nytimes.com/2009/08/29/nyregion/29debate.html?ref=nyregion">attacked frontrunner and former Public Advocate Mark Green</a>, "I have to say, with all due respect to Mark Green, he has a little amnesia. <strong>He did not stand up to Rudy Giuliani when it mattered most, when Rudy Giuliani <a href="http://www.nytimes.com/2001/09/24/nyregion/24RUDY.html">wanted to stay on for 90 days</a> at the end of 2001.</strong> Mark <a href="http://www.nytimes.com/2001/10/02/nyregion/giuliani-s-quest-for-a-term-extension-hits-a-wall-in-albany.html?scp=3&sq=2001%20giuliani%20mark%20green%2090%20days&st=cse">caved in to him for political reasons</a>.&#8221; Green retorted, "There was not any public official in the eight years of Rudy Giuliani who stood up to him more often and more successfully." Green <a href="http://www.politickerny.com/5072/green-says-hes-not-interested-running-mayor-de-blasio-attacks">also told moderator Diane Williams</a>, "<strong>Diane, let Bill attack me one more time because he needs it emotionally.</strong>"</p>
|
||||||
|
|
||||||
<p>Of course, a big question that loomed over the debate is whether there needs to be a Public Advocate at all (City Councilman Simcha Felder <a href="http://www.politickerny.com/4693/felders-argument-eliminating-public-advocate">released a report on why the city should do away</a> with the position, which is second highest elected position in the city). Civil rights lawyer Norman Siegel said, "You need a public advocate to go and fight. When there is a slush fund in the City Council, you need someone from the outside like me to take on the insiders," while Green said, "I was the public advocate. I don't remember this question being asked very often, if ever, when I was there." </p>
|
<p>Of course, a big question that loomed over the debate is whether there needs to be a Public Advocate at all (City Councilman Simcha Felder <a href="http://www.politickerny.com/4693/felders-argument-eliminating-public-advocate">released a report on why the city should do away</a> with the position, which is second highest elected position in the city). Civil rights lawyer Norman Siegel said, "You need a public advocate to go and fight. When there is a slush fund in the City Council, you need someone from the outside like me to take on the insiders," while Green said, "I was the public advocate. I don't remember this question being asked very often, if ever, when I was there." </p>
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,7 @@ from apps.rss_feeds.models import Feed, Story
|
||||||
from django.core.cache import cache
|
from django.core.cache import cache
|
||||||
from apps.reader.models import UserSubscription, UserSubscriptionFolders, UserStory
|
from apps.reader.models import UserSubscription, UserSubscriptionFolders, UserStory
|
||||||
from optparse import OptionParser, make_option
|
from optparse import OptionParser, make_option
|
||||||
|
from utils.management_functions import daemonize
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
import errno
|
import errno
|
||||||
|
@ -30,24 +31,3 @@ class Command(BaseCommand):
|
||||||
for us in usersubs:
|
for us in usersubs:
|
||||||
us.count_unread()
|
us.count_unread()
|
||||||
cache.delete('usersub:%s' % us.user_id)
|
cache.delete('usersub:%s' % us.user_id)
|
||||||
|
|
||||||
def daemonize():
|
|
||||||
"""
|
|
||||||
Detach from the terminal and continue as a daemon.
|
|
||||||
"""
|
|
||||||
# swiped from twisted/scripts/twistd.py
|
|
||||||
# See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16
|
|
||||||
if os.fork(): # launch child and...
|
|
||||||
os._exit(0) # kill off parent
|
|
||||||
os.setsid()
|
|
||||||
if os.fork(): # launch child and...
|
|
||||||
os._exit(0) # kill off parent again.
|
|
||||||
os.umask(077)
|
|
||||||
null = os.open("/dev/null", os.O_RDWR)
|
|
||||||
for i in range(3):
|
|
||||||
try:
|
|
||||||
os.dup2(null, i)
|
|
||||||
except OSError, e:
|
|
||||||
if e.errno != errno.EBADF:
|
|
||||||
raise
|
|
||||||
os.close(null)
|
|
|
@ -3,10 +3,9 @@ from django.core.handlers.wsgi import WSGIHandler
|
||||||
from apps.rss_feeds.models import Feed, Story
|
from apps.rss_feeds.models import Feed, Story
|
||||||
from optparse import OptionParser, make_option
|
from optparse import OptionParser, make_option
|
||||||
from utils import feed_fetcher
|
from utils import feed_fetcher
|
||||||
|
from utils.management_functions import daemonize
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
import socket
|
import socket
|
||||||
import errno
|
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
|
@ -37,23 +36,3 @@ class Command(BaseCommand):
|
||||||
disp.poll()
|
disp.poll()
|
||||||
|
|
||||||
|
|
||||||
def daemonize():
|
|
||||||
"""
|
|
||||||
Detach from the terminal and continue as a daemon.
|
|
||||||
"""
|
|
||||||
# swiped from twisted/scripts/twistd.py
|
|
||||||
# See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16
|
|
||||||
if os.fork(): # launch child and...
|
|
||||||
os._exit(0) # kill off parent
|
|
||||||
os.setsid()
|
|
||||||
if os.fork(): # launch child and...
|
|
||||||
os._exit(0) # kill off parent again.
|
|
||||||
os.umask(077)
|
|
||||||
null = os.open("/dev/null", os.O_RDWR)
|
|
||||||
for i in range(3):
|
|
||||||
try:
|
|
||||||
os.dup2(null, i)
|
|
||||||
except OSError, e:
|
|
||||||
if e.errno != errno.EBADF:
|
|
||||||
raise
|
|
||||||
os.close(null)
|
|
|
@ -13,6 +13,7 @@ from utils.story_functions import format_story_link_date__long
|
||||||
from django.db.models import Q
|
from django.db.models import Q
|
||||||
import settings
|
import settings
|
||||||
import logging
|
import logging
|
||||||
|
import difflib
|
||||||
from utils.diff import HTMLDiff
|
from utils.diff import HTMLDiff
|
||||||
|
|
||||||
USER_AGENT = 'NewsBlur v1.0 - newsblur.com'
|
USER_AGENT = 'NewsBlur v1.0 - newsblur.com'
|
||||||
|
@ -83,7 +84,7 @@ class Feed(models.Model):
|
||||||
story_content = story_contents[0]['value']
|
story_content = story_contents[0]['value']
|
||||||
else:
|
else:
|
||||||
story_content = story.get('summary')
|
story_content = story.get('summary')
|
||||||
existing_story, is_different = self._exists_story(story, story_content, existing_stories)
|
existing_story, story_has_changed = self._exists_story(story, story_content, existing_stories)
|
||||||
if existing_story is None:
|
if existing_story is None:
|
||||||
pub_date = datetime.datetime.timetuple(story.get('published'))
|
pub_date = datetime.datetime.timetuple(story.get('published'))
|
||||||
# logging.debug('- New story: %s %s' % (pub_date, story.get('title')))
|
# logging.debug('- New story: %s %s' % (pub_date, story.get('title')))
|
||||||
|
@ -101,7 +102,7 @@ class Feed(models.Model):
|
||||||
except:
|
except:
|
||||||
ret_values[ENTRY_ERR] += 1
|
ret_values[ENTRY_ERR] += 1
|
||||||
pass
|
pass
|
||||||
elif existing_story and is_different:
|
elif existing_story and story_has_changed:
|
||||||
# update story
|
# update story
|
||||||
logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story['story_content']), len(story_content)))
|
logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story['story_content']), len(story_content)))
|
||||||
|
|
||||||
|
@ -161,30 +162,47 @@ class Feed(models.Model):
|
||||||
return stories
|
return stories
|
||||||
|
|
||||||
def _exists_story(self, story=None, story_content=None, existing_stories=None):
|
def _exists_story(self, story=None, story_content=None, existing_stories=None):
|
||||||
same_story = None
|
story_in_system = None
|
||||||
is_different = False
|
story_has_changed = False
|
||||||
story_pub_date = story.get('published')
|
story_pub_date = story.get('published')
|
||||||
start_date = story_pub_date - datetime.timedelta(hours=8)
|
start_date = story_pub_date - datetime.timedelta(hours=8)
|
||||||
end_date = story_pub_date + datetime.timedelta(hours=8)
|
end_date = story_pub_date + datetime.timedelta(hours=8)
|
||||||
|
|
||||||
for existing_story in existing_stories:
|
for existing_story in existing_stories:
|
||||||
|
content_ratio = 0
|
||||||
|
|
||||||
if story_pub_date > start_date and story_pub_date < end_date:
|
if story_pub_date > start_date and story_pub_date < end_date:
|
||||||
if story.get('link') == existing_story['story_permalink']:
|
if story.get('link') == existing_story['story_permalink']:
|
||||||
same_story = existing_story
|
story_in_system = existing_story
|
||||||
|
|
||||||
|
# Title distance + content distance, checking if story changed
|
||||||
story_title_difference = levenshtein_distance(story.get('title'),
|
story_title_difference = levenshtein_distance(story.get('title'),
|
||||||
existing_story['story_title'])
|
existing_story['story_title'])
|
||||||
if same_story and story_title_difference < 10:
|
seq = difflib.SequenceMatcher(None, story_content, existing_story['story_content'])
|
||||||
same_story = existing_story
|
|
||||||
if story_title_difference > 0:
|
|
||||||
is_different = True
|
|
||||||
|
|
||||||
if same_story:
|
if seq.real_quick_ratio() > .9 and seq.quick_ratio() > .95:
|
||||||
if story_content != existing_story['story_content']:
|
content_ratio = seq.ratio()
|
||||||
is_different = True
|
|
||||||
|
if story_title_difference > 0 and story_title_difference < 5 and content_ratio > .98:
|
||||||
|
story_in_system = existing_story
|
||||||
|
if story_title_difference > 0 or content_ratio < 1.0:
|
||||||
|
# print "Title difference - %s/%s (%s): %s" % (story.get('title'), existing_story['story_title'], story_title_difference, content_ratio)
|
||||||
|
story_has_changed = True
|
||||||
break
|
break
|
||||||
|
|
||||||
return same_story, is_different
|
# More restrictive content distance, still no story match
|
||||||
|
if not story_in_system and content_ratio > .99:
|
||||||
|
# print "Content difference - %s/%s (%s): %s" % (story.get('title'), existing_story['story_title'], story_title_difference, content_ratio)
|
||||||
|
story_in_system = existing_story
|
||||||
|
story_has_changed = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if story_in_system:
|
||||||
|
if story_content != existing_story['story_content']:
|
||||||
|
story_has_changed = True
|
||||||
|
break
|
||||||
|
|
||||||
|
return story_in_system, story_has_changed
|
||||||
|
|
||||||
def _pre_process_story(self, entry):
|
def _pre_process_story(self, entry):
|
||||||
date_published = entry.get('published', entry.get('updated'))
|
date_published = entry.get('published', entry.get('updated'))
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from django.utils import simplejson as json
|
from utils.json import decode
|
||||||
from django.test.client import Client
|
from django.test.client import Client
|
||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
from django.core import management
|
from django.core import management
|
||||||
|
@ -9,20 +9,24 @@ class FeedTest(TestCase):
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.client = Client()
|
self.client = Client()
|
||||||
|
|
||||||
# def test_load_feeds__changed_story_title(self):
|
def test_load_feeds__gawker(self):
|
||||||
# self.client.login(userame='conesus', password='test')
|
self.client.login(userame='conesus', password='test')
|
||||||
#
|
|
||||||
# management.call_command('loaddata', 'gawker1.json', verbosity=0)
|
|
||||||
# response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
|
|
||||||
#
|
|
||||||
# management.call_command('loaddata', 'gawker2.json', verbosity=0)
|
|
||||||
# response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
|
|
||||||
#
|
|
||||||
# response = self.client.get('/reader/load_single_feed', { "feed_id": 1 })
|
|
||||||
# print [c['story_title'] for c in json.loads(response.content)]
|
|
||||||
# # print json.loads(response.content)[0]
|
|
||||||
|
|
||||||
def test_load_feeds__gothamist__changed_story_title(self):
|
management.call_command('loaddata', 'gawker1.json', verbosity=0)
|
||||||
|
response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
|
||||||
|
|
||||||
|
management.call_command('loaddata', 'gawker2.json', verbosity=0)
|
||||||
|
response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True })
|
||||||
|
|
||||||
|
response = self.client.get('/reader/load_single_feed', { "feed_id": 1 })
|
||||||
|
|
||||||
|
# print [c['story_title'] for c in json.loads(response.content)]
|
||||||
|
stories = decode(response.content)
|
||||||
|
|
||||||
|
# Test: 1 changed char in content
|
||||||
|
self.assertEquals(len(stories), 38)
|
||||||
|
|
||||||
|
def test_load_feeds__gothamist(self):
|
||||||
self.client.login(userame='conesus', password='test')
|
self.client.login(userame='conesus', password='test')
|
||||||
|
|
||||||
management.call_command('loaddata', 'gothamist1.json', verbosity=0)
|
management.call_command('loaddata', 'gothamist1.json', verbosity=0)
|
||||||
|
@ -32,5 +36,9 @@ class FeedTest(TestCase):
|
||||||
response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True })
|
response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True })
|
||||||
|
|
||||||
response = self.client.get('/reader/load_single_feed', { "feed_id": 4 })
|
response = self.client.get('/reader/load_single_feed', { "feed_id": 4 })
|
||||||
print [c['story_title'] for c in json.loads(response.content)]
|
|
||||||
# print json.loads(response.content)[0]
|
# print [c['story_title'] for c in json.loads(response.content)]
|
||||||
|
stories = decode(response.content)
|
||||||
|
|
||||||
|
# Test: 1 changed char in title
|
||||||
|
self.assertEquals(len(stories), 42)
|
23
utils/management_functions.py
Normal file
23
utils/management_functions.py
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
import os
|
||||||
|
import errno
|
||||||
|
|
||||||
|
def daemonize():
|
||||||
|
"""
|
||||||
|
Detach from the terminal and continue as a daemon.
|
||||||
|
"""
|
||||||
|
# swiped from twisted/scripts/twistd.py
|
||||||
|
# See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16
|
||||||
|
if os.fork(): # launch child and...
|
||||||
|
os._exit(0) # kill off parent
|
||||||
|
os.setsid()
|
||||||
|
if os.fork(): # launch child and...
|
||||||
|
os._exit(0) # kill off parent again.
|
||||||
|
os.umask(077)
|
||||||
|
null = os.open("/dev/null", os.O_RDWR)
|
||||||
|
for i in range(3):
|
||||||
|
try:
|
||||||
|
os.dup2(null, i)
|
||||||
|
except OSError, e:
|
||||||
|
if e.errno != errno.EBADF:
|
||||||
|
raise
|
||||||
|
os.close(null)
|
Loading…
Add table
Reference in a new issue