diff --git a/apps/rss_feeds/fixtures/gothamist2.xml b/apps/rss_feeds/fixtures/gothamist2.xml index 8407f5a9a..342404b62 100644 --- a/apps/rss_feeds/fixtures/gothamist2.xml +++ b/apps/rss_feeds/fixtures/gothamist2.xml @@ -14,11 +14,11 @@ Public Advocate Hopefuls Debate Each Other, Defend The Job - http://feeds.gothamistllc.com/click.phdo?i=8a845cced28e85b43ca559267d509c78 + http://feeds.gothamistllc.com/click.phdo?i=yatta http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php - http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php + http://gothamist.com/2009/08/29/something_different.php http://gothamist.com/2009/08/29/public_advocate_hopefuls_debate_eac.php#comments - <p><span class="mt-enclosure mt-enclosure-image" style="display: inline;"> <img alt="2009_08_pubadv.jpg" src="http://gothamist.com/attachments/jen/2009_08_pubadv.jpg" width="290" height="224" class="image-left" /> </span>Last night, the Democratic candidates for NYC Public Advocates squared off for a WABC 7-televised debate, and it got good fast: Within 10 minutes, City Councilman Bill de Blasio <a href="http://www.nytimes.com/2009/08/29/nyregion/29debate.html?ref=nyregion">attacked frontrunner and former Public Advocate Mark Green</a>, "I have to say, with all due respect to Mark Green, he has a little amnesia. <strong>He did not stand up to Rudy Giuliani when it mattered most, when Rudy Giuliani <a href="http://www.nytimes.com/2001/09/24/nyregion/24RUDY.html">wanted to stay on for 90 days</a> at the end of 2001.</strong> Mark <a href="http://www.nytimes.com/2001/10/02/nyregion/giuliani-s-quest-for-a-term-extension-hits-a-wall-in-albany.html?scp=3&sq=2001%20giuliani%20mark%20green%2090%20days&st=cse">caved in to him for political reasons</a>.&#8221; Green retorted, "There was not any public official in the eight years of Rudy Giuliani who stood up to him more often and more successfully." Green <a href="http://www.politickerny.com/5072/green-says-hes-not-interested-running-mayor-de-blasio-attacks">also told moderator Diane Williams</a>, "<strong>Diane, let Bill attack me one more time because he needs it emotionally.</strong>"</p> + <p><span class="mt-enclosure mt-enclosure-image" style="display: inline;"> <img alt="2009_08_pubadv.jpg" src="http://gothamist.com/attachments/jen/2009_08_pubadv.jpg" width="290" height="224" class="image-left" /> </span>Last night, the Democratic candidates for NYC Public Advocates squared off for a WABC 7-televised debate, and it got good fast: Within 10 minutes, City Councilman Somebody Else Entirely <a href="http://www.nytimes.com/2009/08/29/nyregion/29debate.html?ref=nyregion">attacked frontrunner and former Public Advocate Mark Green</a>, "I have to say, with all due respect to Mark Green, he has a little amnesia. <strong>He did not stand up to Rudy Giuliani when it mattered most, when Rudy Giuliani <a href="http://www.nytimes.com/2001/09/24/nyregion/24RUDY.html">wanted to stay on for 90 days</a> at the end of 2001.</strong> Mark <a href="http://www.nytimes.com/2001/10/02/nyregion/giuliani-s-quest-for-a-term-extension-hits-a-wall-in-albany.html?scp=3&sq=2001%20giuliani%20mark%20green%2090%20days&st=cse">caved in to him for political reasons</a>.&#8221; Green retorted, "There was not any public official in the eight years of Rudy Giuliani who stood up to him more often and more successfully." Green <a href="http://www.politickerny.com/5072/green-says-hes-not-interested-running-mayor-de-blasio-attacks">also told moderator Diane Williams</a>, "<strong>Diane, let Bill attack me one more time because he needs it emotionally.</strong>"</p> <p>Of course, a big question that loomed over the debate is whether there needs to be a Public Advocate at all (City Councilman Simcha Felder <a href="http://www.politickerny.com/4693/felders-argument-eliminating-public-advocate">released a report on why the city should do away</a> with the position, which is second highest elected position in the city). Civil rights lawyer Norman Siegel said, "You need a public advocate to go and fight. When there is a slush fund in the City Council, you need someone from the outside like me to take on the insiders," while Green said, "I was the public advocate. I don't remember this question being asked very often, if ever, when I was there." </p> diff --git a/apps/rss_feeds/management/commands/refresh_feed.py b/apps/rss_feeds/management/commands/refresh_feed.py index 6148481d6..3384a56a1 100644 --- a/apps/rss_feeds/management/commands/refresh_feed.py +++ b/apps/rss_feeds/management/commands/refresh_feed.py @@ -4,6 +4,7 @@ from apps.rss_feeds.models import Feed, Story from django.core.cache import cache from apps.reader.models import UserSubscription, UserSubscriptionFolders, UserStory from optparse import OptionParser, make_option +from utils.management_functions import daemonize import os import logging import errno @@ -29,25 +30,4 @@ class Command(BaseCommand): ) for us in usersubs: us.count_unread() - cache.delete('usersub:%s' % us.user_id) - -def daemonize(): - """ - Detach from the terminal and continue as a daemon. - """ - # swiped from twisted/scripts/twistd.py - # See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16 - if os.fork(): # launch child and... - os._exit(0) # kill off parent - os.setsid() - if os.fork(): # launch child and... - os._exit(0) # kill off parent again. - os.umask(077) - null = os.open("/dev/null", os.O_RDWR) - for i in range(3): - try: - os.dup2(null, i) - except OSError, e: - if e.errno != errno.EBADF: - raise - os.close(null) \ No newline at end of file + cache.delete('usersub:%s' % us.user_id) \ No newline at end of file diff --git a/apps/rss_feeds/management/commands/refresh_feeds.py b/apps/rss_feeds/management/commands/refresh_feeds.py index 7ef36b51a..1afd12d4d 100644 --- a/apps/rss_feeds/management/commands/refresh_feeds.py +++ b/apps/rss_feeds/management/commands/refresh_feeds.py @@ -3,10 +3,9 @@ from django.core.handlers.wsgi import WSGIHandler from apps.rss_feeds.models import Feed, Story from optparse import OptionParser, make_option from utils import feed_fetcher +from utils.management_functions import daemonize import logging -import os import socket -import errno class Command(BaseCommand): @@ -36,24 +35,4 @@ class Command(BaseCommand): disp.poll() - -def daemonize(): - """ - Detach from the terminal and continue as a daemon. - """ - # swiped from twisted/scripts/twistd.py - # See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16 - if os.fork(): # launch child and... - os._exit(0) # kill off parent - os.setsid() - if os.fork(): # launch child and... - os._exit(0) # kill off parent again. - os.umask(077) - null = os.open("/dev/null", os.O_RDWR) - for i in range(3): - try: - os.dup2(null, i) - except OSError, e: - if e.errno != errno.EBADF: - raise - os.close(null) \ No newline at end of file + \ No newline at end of file diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py index 09e7e4450..5d3dccea1 100644 --- a/apps/rss_feeds/models.py +++ b/apps/rss_feeds/models.py @@ -13,6 +13,7 @@ from utils.story_functions import format_story_link_date__long from django.db.models import Q import settings import logging +import difflib from utils.diff import HTMLDiff USER_AGENT = 'NewsBlur v1.0 - newsblur.com' @@ -83,7 +84,7 @@ class Feed(models.Model): story_content = story_contents[0]['value'] else: story_content = story.get('summary') - existing_story, is_different = self._exists_story(story, story_content, existing_stories) + existing_story, story_has_changed = self._exists_story(story, story_content, existing_stories) if existing_story is None: pub_date = datetime.datetime.timetuple(story.get('published')) # logging.debug('- New story: %s %s' % (pub_date, story.get('title'))) @@ -101,7 +102,7 @@ class Feed(models.Model): except: ret_values[ENTRY_ERR] += 1 pass - elif existing_story and is_different: + elif existing_story and story_has_changed: # update story logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story['story_content']), len(story_content))) @@ -161,30 +162,47 @@ class Feed(models.Model): return stories def _exists_story(self, story=None, story_content=None, existing_stories=None): - same_story = None - is_different = False + story_in_system = None + story_has_changed = False story_pub_date = story.get('published') start_date = story_pub_date - datetime.timedelta(hours=8) end_date = story_pub_date + datetime.timedelta(hours=8) - + for existing_story in existing_stories: + content_ratio = 0 + if story_pub_date > start_date and story_pub_date < end_date: if story.get('link') == existing_story['story_permalink']: - same_story = existing_story - + story_in_system = existing_story + + # Title distance + content distance, checking if story changed story_title_difference = levenshtein_distance(story.get('title'), existing_story['story_title']) - if same_story and story_title_difference < 10: - same_story = existing_story - if story_title_difference > 0: - is_different = True - - if same_story: - if story_content != existing_story['story_content']: - is_different = True + seq = difflib.SequenceMatcher(None, story_content, existing_story['story_content']) + + if seq.real_quick_ratio() > .9 and seq.quick_ratio() > .95: + content_ratio = seq.ratio() + + if story_title_difference > 0 and story_title_difference < 5 and content_ratio > .98: + story_in_system = existing_story + if story_title_difference > 0 or content_ratio < 1.0: + # print "Title difference - %s/%s (%s): %s" % (story.get('title'), existing_story['story_title'], story_title_difference, content_ratio) + story_has_changed = True + break + + # More restrictive content distance, still no story match + if not story_in_system and content_ratio > .99: + # print "Content difference - %s/%s (%s): %s" % (story.get('title'), existing_story['story_title'], story_title_difference, content_ratio) + story_in_system = existing_story + story_has_changed = True break - - return same_story, is_different + + if story_in_system: + if story_content != existing_story['story_content']: + story_has_changed = True + break + + return story_in_system, story_has_changed def _pre_process_story(self, entry): date_published = entry.get('published', entry.get('updated')) diff --git a/apps/rss_feeds/tests.py b/apps/rss_feeds/tests.py index 2172e90dc..9efe0dbfb 100644 --- a/apps/rss_feeds/tests.py +++ b/apps/rss_feeds/tests.py @@ -1,4 +1,4 @@ -from django.utils import simplejson as json +from utils.json import decode from django.test.client import Client from django.test import TestCase from django.core import management @@ -9,20 +9,24 @@ class FeedTest(TestCase): def setUp(self): self.client = Client() - # def test_load_feeds__changed_story_title(self): - # self.client.login(userame='conesus', password='test') - # - # management.call_command('loaddata', 'gawker1.json', verbosity=0) - # response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True }) - # - # management.call_command('loaddata', 'gawker2.json', verbosity=0) - # response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True }) - # - # response = self.client.get('/reader/load_single_feed', { "feed_id": 1 }) - # print [c['story_title'] for c in json.loads(response.content)] - # # print json.loads(response.content)[0] - - def test_load_feeds__gothamist__changed_story_title(self): + def test_load_feeds__gawker(self): + self.client.login(userame='conesus', password='test') + + management.call_command('loaddata', 'gawker1.json', verbosity=0) + response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True }) + + management.call_command('loaddata', 'gawker2.json', verbosity=0) + response = self.client.get('/reader/refresh_feed', { "feed_id": 1, "force": True }) + + response = self.client.get('/reader/load_single_feed', { "feed_id": 1 }) + + # print [c['story_title'] for c in json.loads(response.content)] + stories = decode(response.content) + + # Test: 1 changed char in content + self.assertEquals(len(stories), 38) + + def test_load_feeds__gothamist(self): self.client.login(userame='conesus', password='test') management.call_command('loaddata', 'gothamist1.json', verbosity=0) @@ -32,5 +36,9 @@ class FeedTest(TestCase): response = self.client.get('/reader/refresh_feed', { "feed_id": 4, "force": True }) response = self.client.get('/reader/load_single_feed', { "feed_id": 4 }) - print [c['story_title'] for c in json.loads(response.content)] - # print json.loads(response.content)[0] + + # print [c['story_title'] for c in json.loads(response.content)] + stories = decode(response.content) + + # Test: 1 changed char in title + self.assertEquals(len(stories), 42) \ No newline at end of file diff --git a/utils/management_functions.py b/utils/management_functions.py new file mode 100644 index 000000000..803109d08 --- /dev/null +++ b/utils/management_functions.py @@ -0,0 +1,23 @@ +import os +import errno + +def daemonize(): + """ + Detach from the terminal and continue as a daemon. + """ + # swiped from twisted/scripts/twistd.py + # See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16 + if os.fork(): # launch child and... + os._exit(0) # kill off parent + os.setsid() + if os.fork(): # launch child and... + os._exit(0) # kill off parent again. + os.umask(077) + null = os.open("/dev/null", os.O_RDWR) + for i in range(3): + try: + os.dup2(null, i) + except OSError, e: + if e.errno != errno.EBADF: + raise + os.close(null) \ No newline at end of file