Fixing bug where publisher-disabled original pages were not properly disabled.

Samuel Clay 2012-09-05 11:32:12 -07:00
parent 5216fa88b0
commit 9d379377d0
3 changed files with 23 additions and 21 deletions

View file

@@ -34,14 +34,6 @@ from utils.diff import HTMLDiff
 ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)

-# Also change in reader_utils.js.
-BROKEN_PAGE_URLS = [
-    'nytimes.com',
-    'stackoverflow.com',
-    'stackexchange.com',
-    'twitter.com',
-    'rankexploits',
-]


 class Feed(models.Model):
     feed_address = models.URLField(max_length=255, db_index=True)
@@ -139,12 +131,9 @@ class Feed(models.Model):
         feed['exception_type'] = None
         feed['exception_code'] = self.exception_code
-        if self.feed_link:
-            for broken_page in BROKEN_PAGE_URLS:
-                if broken_page in self.feed_link:
-                    feed['disabled_page'] = True
-                    break
-
+        if not self.has_page:
+            feed['disabled_page'] = True
+        print feed

         if full:
             feed['feed_tags'] = json.decode(self.data.popular_tags) if self.data.popular_tags else []
             feed['feed_authors'] = json.decode(self.data.popular_authors) if self.data.popular_authors else []
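Read together, the two hunks above relocate the disabled-page decision: instead of rescanning feed_link against BROKEN_PAGE_URLS every time this feed dict is built, the code now reads the has_page flag that save_no_page() persists (last hunk of the next file). A minimal sketch of the new flow, assuming a hypothetical stand-in for the Feed model that keeps only the attributes this diff touches, with a made-up name for the dict-building method:

class FeedSketch(object):
    # Hypothetical stand-in: only feed_link and has_page come from the
    # diff; the real Feed model carries many more fields.
    def __init__(self, feed_link, has_page=True):
        self.feed_link = feed_link
        self.has_page = has_page  # flipped to False by save_no_page()

    def build_feed_dict(self):  # made-up name for the method shown above
        feed = {'feed_link': self.feed_link}
        # The per-call BROKEN_PAGE_URLS loop is gone; the verdict was made
        # once at fetch time and stored on the feed.
        if not self.has_page:
            feed['disabled_page'] = True
        return feed

# A feed whose page fetch was refused now reports its page as disabled.
assert FeedSketch('http://twitter.com/newsblur', has_page=False).build_feed_dict()['disabled_page']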

View file

@@ -19,6 +19,15 @@ BROKEN_PAGES = [
     '[]',
 ]
+
+# Also change in reader_utils.js.
+BROKEN_PAGE_URLS = [
+    'nytimes.com',
+    'stackoverflow.com',
+    'stackexchange.com',
+    'twitter.com',
+    'rankexploits',
+]

 class PageImporter(object):

     def __init__(self, feed):
@@ -47,11 +56,17 @@ class PageImporter(object):
         if not feed_link:
             self.save_no_page()
             return

-        if feed_link.startswith('www'):
-            self.feed.feed_link = 'http://' + feed_link
         try:
+            if feed_link.startswith('www'):
+                self.feed.feed_link = 'http://' + feed_link
-            if feed_link.startswith('http'):
+            if any(feed_link.startswith(s) for s in BROKEN_PAGES):
+                self.save_no_page()
+                return
+            elif any(s in feed_link.lower() for s in BROKEN_PAGE_URLS):
+                self.save_no_page()
+                return
+            elif feed_link.startswith('http'):
                 if urllib_fallback:
                     request = urllib2.Request(feed_link, headers=self.headers)
                     response = urllib2.urlopen(request)
@@ -66,9 +81,6 @@ class PageImporter(object):
                     data = response.text
                 except (LookupError, TypeError):
                     data = response.content
-            elif any(feed_link.startswith(s) for s in BROKEN_PAGES):
-                self.save_no_page()
-                return
             else:
                 try:
                     data = open(feed_link, 'r').read()
@@ -112,6 +124,7 @@ class PageImporter(object):
         return html

     def save_no_page(self):
         logging.debug(' --->> [%-30s] ~FYNo original page: %s' % (self.feed, self.feed.feed_link))
+        self.feed.has_page = False
         self.feed.save()
         self.feed.save_page_history(404, "Feed has no original page.")
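The importer now applies the two blocklists with different matching rules: BROKEN_PAGES entries are prefix matches (feed_link.startswith(s)), while the relocated BROKEN_PAGE_URLS entries are case-insensitive substring matches (s in feed_link.lower()), so 'twitter.com' also catches a link like http://www.Twitter.com/newsblur. Either hit routes to save_no_page(), which as of the last hunk persists has_page = False. A standalone sketch of just this filter, using only the list entries visible in the diff and a hypothetical helper name:

BROKEN_PAGES = ['[]']          # prefix matches; sole entry visible in the hunk above
BROKEN_PAGE_URLS = [           # substring matches on the lowercased link
    'nytimes.com',
    'stackoverflow.com',
    'stackexchange.com',
    'twitter.com',
    'rankexploits',
]

def is_broken_page(feed_link):
    # Hypothetical helper mirroring the inline checks in the diff:
    # prefix check first, then the case-insensitive domain check.
    if any(feed_link.startswith(s) for s in BROKEN_PAGES):
        return True
    return any(s in feed_link.lower() for s in BROKEN_PAGE_URLS)

assert is_broken_page('http://www.Twitter.com/newsblur')
assert not is_broken_page('http://example.com/blog')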

BIN
dump.rdb

Binary file not shown.