Merge pull request #835 from sv0/text_importer

Text importer
This commit is contained in:
Samuel Clay 2015-11-30 16:03:50 -08:00
commit 53e4998146
3 changed files with 78 additions and 86 deletions

View file

@ -24,6 +24,7 @@ from django.core.urlresolvers import reverse
from django.contrib.auth.models import User from django.contrib.auth.models import User
from django.contrib.sites.models import Site from django.contrib.sites.models import Site
from django.template.defaultfilters import slugify from django.template.defaultfilters import slugify
from django.utils.encoding import smart_str
from mongoengine.queryset import OperationError, Q, NotUniqueError from mongoengine.queryset import OperationError, Q, NotUniqueError
from mongoengine.base import ValidationError from mongoengine.base import ValidationError
from vendor.timezones.utilities import localtime_for_timezone from vendor.timezones.utilities import localtime_for_timezone
@ -84,7 +85,6 @@ class Feed(models.Model):
s3_icon = models.NullBooleanField(default=False, blank=True, null=True) s3_icon = models.NullBooleanField(default=False, blank=True, null=True)
search_indexed = models.NullBooleanField(default=None, null=True, blank=True) search_indexed = models.NullBooleanField(default=None, null=True, blank=True)
class Meta: class Meta:
db_table="feeds" db_table="feeds"
ordering=["feed_title"] ordering=["feed_title"]
@ -1900,13 +1900,13 @@ class MStory(mongo.Document):
self.story_hash = self.feed_guid_hash self.story_hash = self.feed_guid_hash
if self.story_content: if self.story_content:
self.story_content_z = zlib.compress(self.story_content) self.story_content_z = zlib.compress(smart_str(self.story_content))
self.story_content = None self.story_content = None
if self.story_original_content: if self.story_original_content:
self.story_original_content_z = zlib.compress(self.story_original_content) self.story_original_content_z = zlib.compress(smart_str(self.story_original_content))
self.story_original_content = None self.story_original_content = None
if self.story_latest_content: if self.story_latest_content:
self.story_latest_content_z = zlib.compress(self.story_latest_content) self.story_latest_content_z = zlib.compress(smart_str(self.story_latest_content))
self.story_latest_content = None self.story_latest_content = None
if self.story_title and len(self.story_title) > story_title_max: if self.story_title and len(self.story_title) > story_title_max:
self.story_title = self.story_title[:story_title_max] self.story_title = self.story_title[:story_title_max]

View file

@ -7,9 +7,10 @@ from django.conf import settings
from apps.rss_feeds.models import Feed, MStory from apps.rss_feeds.models import Feed, MStory
from mongoengine.connection import connect, disconnect from mongoengine.connection import connect, disconnect
class FeedTest(TestCase): class FeedTest(TestCase):
fixtures = ['rss_feeds.json'] fixtures = ['rss_feeds.json']
def setUp(self): def setUp(self):
disconnect() disconnect()
settings.MONGODB = connect('test_newsblur') settings.MONGODB = connect('test_newsblur')
@ -17,168 +18,167 @@ class FeedTest(TestCase):
def tearDown(self): def tearDown(self):
settings.MONGODB.drop_database('test_newsblur') settings.MONGODB.drop_database('test_newsblur')
def test_load_feeds__gawker(self): def test_load_feeds__gawker(self):
self.client.login(username='conesus', password='test') self.client.login(username='conesus', password='test')
management.call_command('loaddata', 'gawker1.json', verbosity=0) management.call_command('loaddata', 'gawker1.json', verbosity=0)
feed = Feed.objects.get(feed_link__contains='gawker') feed = Feed.objects.get(feed_link__contains='gawker')
stories = MStory.objects(story_feed_id=feed.pk) stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 0) self.assertEquals(stories.count(), 0)
feed.update(force=True) feed.update(force=True)
stories = MStory.objects(story_feed_id=feed.pk) stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 38) self.assertEquals(stories.count(), 38)
management.call_command('loaddata', 'gawker2.json', verbosity=0) management.call_command('loaddata', 'gawker2.json', verbosity=0)
feed.update(force=True) feed.update(force=True)
# Test: 1 changed char in content # Test: 1 changed char in content
stories = MStory.objects(story_feed_id=feed.pk) stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 38) self.assertEquals(stories.count(), 38)
url = reverse('load-single-feed', kwargs=dict(feed_id=1)) url = reverse('load-single-feed', kwargs=dict(feed_id=1))
response = self.client.get(url) response = self.client.get(url)
feed = json.decode(response.content) feed = json.decode(response.content)
self.assertEquals(len(feed['stories']), 6) self.assertEquals(len(feed['stories']), 6)
def test_load_feeds__gothamist(self): def test_load_feeds__gothamist(self):
self.client.login(username='conesus', password='test') self.client.login(username='conesus', password='test')
management.call_command('loaddata', 'gothamist_aug_2009_1.json', verbosity=0) management.call_command('loaddata', 'gothamist_aug_2009_1.json', verbosity=0)
feed = Feed.objects.get(feed_link__contains='gothamist') feed = Feed.objects.get(feed_link__contains='gothamist')
stories = MStory.objects(story_feed_id=feed.pk) stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 0) self.assertEquals(stories.count(), 0)
feed.update(force=True) feed.update(force=True)
stories = MStory.objects(story_feed_id=feed.pk) stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 42) self.assertEquals(stories.count(), 42)
url = reverse('load-single-feed', kwargs=dict(feed_id=4)) url = reverse('load-single-feed', kwargs=dict(feed_id=4))
response = self.client.get(url) response = self.client.get(url)
content = json.decode(response.content) content = json.decode(response.content)
self.assertEquals(len(content['stories']), 6) self.assertEquals(len(content['stories']), 6)
management.call_command('loaddata', 'gothamist_aug_2009_2.json', verbosity=0) management.call_command('loaddata', 'gothamist_aug_2009_2.json', verbosity=0)
feed.update(force=True) feed.update(force=True)
stories = MStory.objects(story_feed_id=feed.pk) stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 42) self.assertEquals(stories.count(), 42)
url = reverse('load-single-feed', kwargs=dict(feed_id=4)) url = reverse('load-single-feed', kwargs=dict(feed_id=4))
response = self.client.get(url) response = self.client.get(url)
# print [c['story_title'] for c in json.decode(response.content)] # print [c['story_title'] for c in json.decode(response.content)]
content = json.decode(response.content) content = json.decode(response.content)
# Test: 1 changed char in title # Test: 1 changed char in title
self.assertEquals(len(content['stories']), 6) self.assertEquals(len(content['stories']), 6)
def test_load_feeds__slashdot(self): def test_load_feeds__slashdot(self):
self.client.login(username='conesus', password='test') self.client.login(username='conesus', password='test')
old_story_guid = "tag:google.com,2005:reader/item/4528442633bc7b2b" old_story_guid = "tag:google.com,2005:reader/item/4528442633bc7b2b"
management.call_command('loaddata', 'slashdot1.json', verbosity=0) management.call_command('loaddata', 'slashdot1.json', verbosity=0)
feed = Feed.objects.get(feed_link__contains='slashdot') feed = Feed.objects.get(feed_link__contains='slashdot')
stories = MStory.objects(story_feed_id=feed.pk) stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 0) self.assertEquals(stories.count(), 0)
management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False) management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
stories = MStory.objects(story_feed_id=feed.pk) stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 38) self.assertEquals(stories.count(), 38)
response = self.client.get(reverse('load-feeds')) response = self.client.get(reverse('load-feeds'))
content = json.decode(response.content) content = json.decode(response.content)
self.assertEquals(content['feeds']['5']['nt'], 38) self.assertEquals(content['feeds']['5']['nt'], 38)
self.client.post(reverse('mark-story-as-read'), {'story_id': old_story_guid, 'feed_id': 5}) self.client.post(reverse('mark-story-as-read'), {'story_id': old_story_guid, 'feed_id': 5})
response = self.client.get(reverse('refresh-feeds')) response = self.client.get(reverse('refresh-feeds'))
content = json.decode(response.content) content = json.decode(response.content)
self.assertEquals(content['feeds']['5']['nt'], 37) self.assertEquals(content['feeds']['5']['nt'], 37)
management.call_command('loaddata', 'slashdot2.json', verbosity=0) management.call_command('loaddata', 'slashdot2.json', verbosity=0)
management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False) management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
stories = MStory.objects(story_feed_id=feed.pk) stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 38) self.assertEquals(stories.count(), 38)
url = reverse('load-single-feed', kwargs=dict(feed_id=5)) url = reverse('load-single-feed', kwargs=dict(feed_id=5))
response = self.client.get(url) response = self.client.get(url)
# pprint([c['story_title'] for c in json.decode(response.content)]) # pprint([c['story_title'] for c in json.decode(response.content)])
feed = json.decode(response.content) feed = json.decode(response.content)
# Test: 1 changed char in title # Test: 1 changed char in title
self.assertEquals(len(feed['stories']), 6) self.assertEquals(len(feed['stories']), 6)
response = self.client.get(reverse('refresh-feeds')) response = self.client.get(reverse('refresh-feeds'))
content = json.decode(response.content) content = json.decode(response.content)
self.assertEquals(content['feeds']['5']['nt'], 37) self.assertEquals(content['feeds']['5']['nt'], 37)
def test_load_feeds__motherjones(self): def test_load_feeds__motherjones(self):
self.client.login(username='conesus', password='test') self.client.login(username='conesus', password='test')
management.call_command('loaddata', 'motherjones1.json', verbosity=0) management.call_command('loaddata', 'motherjones1.json', verbosity=0)
feed = Feed.objects.get(feed_link__contains='motherjones') feed = Feed.objects.get(feed_link__contains='motherjones')
stories = MStory.objects(story_feed_id=feed.pk) stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 0) self.assertEquals(stories.count(), 0)
management.call_command('refresh_feed', force=1, feed=feed.pk, single_threaded=True, daemonize=False) management.call_command('refresh_feed', force=1, feed=feed.pk, single_threaded=True, daemonize=False)
stories = MStory.objects(story_feed_id=feed.pk) stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 10) self.assertEquals(stories.count(), 10)
response = self.client.get(reverse('load-feeds')) response = self.client.get(reverse('load-feeds'))
content = json.decode(response.content) content = json.decode(response.content)
self.assertEquals(content['feeds'][str(feed.pk)]['nt'], 10) self.assertEquals(content['feeds'][str(feed.pk)]['nt'], 10)
self.client.post(reverse('mark-story-as-read'), {'story_id': stories[0].story_guid, 'feed_id': feed.pk}) self.client.post(reverse('mark-story-as-read'), {'story_id': stories[0].story_guid, 'feed_id': feed.pk})
response = self.client.get(reverse('refresh-feeds')) response = self.client.get(reverse('refresh-feeds'))
content = json.decode(response.content) content = json.decode(response.content)
self.assertEquals(content['feeds'][str(feed.pk)]['nt'], 9) self.assertEquals(content['feeds'][str(feed.pk)]['nt'], 9)
management.call_command('loaddata', 'motherjones2.json', verbosity=0) management.call_command('loaddata', 'motherjones2.json', verbosity=0)
management.call_command('refresh_feed', force=1, feed=feed.pk, single_threaded=True, daemonize=False) management.call_command('refresh_feed', force=1, feed=feed.pk, single_threaded=True, daemonize=False)
stories = MStory.objects(story_feed_id=feed.pk) stories = MStory.objects(story_feed_id=feed.pk)
self.assertEquals(stories.count(), 10) self.assertEquals(stories.count(), 10)
url = reverse('load-single-feed', kwargs=dict(feed_id=feed.pk)) url = reverse('load-single-feed', kwargs=dict(feed_id=feed.pk))
response = self.client.get(url) response = self.client.get(url)
# pprint([c['story_title'] for c in json.decode(response.content)]) # pprint([c['story_title'] for c in json.decode(response.content)])
feed = json.decode(response.content) feed = json.decode(response.content)
# Test: 1 changed char in title # Test: 1 changed char in title
self.assertEquals(len(feed['stories']), 6) self.assertEquals(len(feed['stories']), 6)
response = self.client.get(reverse('refresh-feeds')) response = self.client.get(reverse('refresh-feeds'))
content = json.decode(response.content) content = json.decode(response.content)
self.assertEquals(content['feeds'][str(feed['feed_id'])]['nt'], 9) self.assertEquals(content['feeds'][str(feed['feed_id'])]['nt'], 9)
def test_load_feeds__brokelyn__invalid_xml(self): def test_load_feeds__brokelyn__invalid_xml(self):
self.client.login(username='conesus', password='test') self.client.login(username='conesus', password='test')
management.call_command('loaddata', 'brokelyn.json', verbosity=0) management.call_command('loaddata', 'brokelyn.json', verbosity=0)
management.call_command('refresh_feed', force=1, feed=6, single_threaded=True, daemonize=False) management.call_command('refresh_feed', force=1, feed=6, single_threaded=True, daemonize=False)
url = reverse('load-single-feed', kwargs=dict(feed_id=6)) url = reverse('load-single-feed', kwargs=dict(feed_id=6))
response = self.client.get(url) response = self.client.get(url)
# pprint([c['story_title'] for c in json.decode(response.content)]) # pprint([c['story_title'] for c in json.decode(response.content)])
feed = json.decode(response.content) feed = json.decode(response.content)
# Test: 1 changed char in title # Test: 1 changed char in title
self.assertEquals(len(feed['stories']), 6) self.assertEquals(len(feed['stories']), 6)
def test_all_feeds(self): def test_all_feeds(self):
pass pass

View file

@ -1,7 +1,6 @@
import requests import requests
import zlib import zlib
from requests.packages.urllib3.exceptions import LocationParseError from requests.packages.urllib3.exceptions import LocationParseError
from django.conf import settings
from socket import error as SocketError from socket import error as SocketError
from mongoengine.queryset import NotUniqueError from mongoengine.queryset import NotUniqueError
from vendor.readability import readability from vendor.readability import readability
@ -9,38 +8,41 @@ from utils import log as logging
from utils.feed_functions import timelimit, TimeoutError from utils.feed_functions import timelimit, TimeoutError
from OpenSSL.SSL import Error as OpenSSLError from OpenSSL.SSL import Error as OpenSSLError
from pyasn1.error import PyAsn1Error from pyasn1.error import PyAsn1Error
from django.utils.encoding import smart_str
BROKEN_URLS = [ BROKEN_URLS = [
"gamespot.com", "gamespot.com",
] ]
class TextImporter: class TextImporter:
def __init__(self, story=None, feed=None, story_url=None, request=None, debug=False): def __init__(self, story=None, feed=None, story_url=None, request=None, debug=False):
self.story = story self.story = story
self.story_url = story_url self.story_url = story_url
self.feed = feed self.feed = feed
self.request = request self.request = request
self.debug = debug self.debug = debug
@property @property
def headers(self): def headers(self):
num_subscribers = getattr(self.feed, 'num_subscribers', 0)
return { return {
'User-Agent': 'NewsBlur Content Fetcher - %s subscriber%s - %s ' 'User-Agent': 'NewsBlur Content Fetcher - %s subscriber%s - %s '
'(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) ' '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 ' 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
'Safari/534.48.3)' % ( 'Safari/534.48.3)' % (
self.feed.num_subscribers, num_subscribers,
's' if self.feed.num_subscribers != 1 else '', 's' if num_subscribers != 1 else '',
self.feed.permalink, getattr(self.feed, 'permalink', '')
), ),
} }
def fetch(self, skip_save=False, return_document=False): def fetch(self, skip_save=False, return_document=False):
if self.story_url and any(broken_url in self.story_url for broken_url in BROKEN_URLS): if self.story_url and any(broken_url in self.story_url for broken_url in BROKEN_URLS):
logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: banned") logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: banned")
return return
try: try:
resp = self.fetch_request() resp = self.fetch_request()
except TimeoutError: except TimeoutError:
@ -49,56 +51,46 @@ class TextImporter:
except requests.exceptions.TooManyRedirects: except requests.exceptions.TooManyRedirects:
logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects") logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
resp = None resp = None
if not resp: if not resp:
return return
try: text = resp.text
text = resp.text original_text_doc = readability.Document(text, url=resp.url,
except (LookupError, TypeError):
text = resp.content
charset_declared = 'charset' in resp.headers.get('content-type', "")
if resp.encoding and resp.encoding != 'utf-8' and not charset_declared:
try:
text = text.encode(resp.encoding)
except (LookupError, UnicodeEncodeError):
pass
original_text_doc = readability.Document(text, url=resp.url,
debug=self.debug, debug=self.debug,
positive_keywords=["postContent", "postField"]) positive_keywords=["postContent", "postField"])
try: try:
content = original_text_doc.summary(html_partial=True) content = original_text_doc.summary(html_partial=True)
except readability.Unparseable: except readability.Unparseable:
return return
try: try:
title = original_text_doc.title() title = original_text_doc.title()
except TypeError: except TypeError:
title = "" title = ""
url = resp.url url = resp.url
if content: if content:
if self.story and not skip_save: if self.story and not skip_save:
self.story.original_text_z = zlib.compress(content) self.story.original_text_z = zlib.compress(smart_str(content))
try: try:
self.story.save() self.story.save()
except NotUniqueError: except NotUniqueError:
pass pass
logging.user(self.request, ("~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % ( logging.user(self.request, ("~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
len(unicode(content)), len(content),
self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z)) self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
)), warn_color=False) )), warn_color=False)
else: else:
logging.user(self.request, ("~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % ( logging.user(self.request, ("~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % (
self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z)) self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
)), warn_color=False) )), warn_color=False)
if return_document: if return_document:
return dict(content=content, title=title, url=url, doc=original_text_doc) return dict(content=content, title=title, url=url, doc=original_text_doc)
return content return content
@timelimit(10) @timelimit(10)
def fetch_request(self): def fetch_request(self):
url = self.story_url url = self.story_url
@ -107,7 +99,7 @@ class TextImporter:
try: try:
r = requests.get(url, headers=self.headers, verify=False) r = requests.get(url, headers=self.headers, verify=False)
r.connection.close() r.connection.close()
except (AttributeError, SocketError, requests.ConnectionError, except (AttributeError, SocketError, requests.ConnectionError,
requests.models.MissingSchema, requests.sessions.InvalidSchema, requests.models.MissingSchema, requests.sessions.InvalidSchema,
requests.sessions.TooManyRedirects, requests.sessions.TooManyRedirects,
requests.models.InvalidURL, requests.models.InvalidURL,