diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py
index e333db538..2d0af8a18 100644
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -24,6 +24,7 @@ from django.core.urlresolvers import reverse
 from django.contrib.auth.models import User
 from django.contrib.sites.models import Site
 from django.template.defaultfilters import slugify
+from django.utils.encoding import smart_str
 from mongoengine.queryset import OperationError, Q, NotUniqueError
 from mongoengine.base import ValidationError
 from vendor.timezones.utilities import localtime_for_timezone
@@ -84,7 +85,6 @@ class Feed(models.Model):
     s3_icon = models.NullBooleanField(default=False, blank=True, null=True)
     search_indexed = models.NullBooleanField(default=None, null=True, blank=True)
-    
 
     class Meta:
         db_table="feeds"
         ordering=["feed_title"]
@@ -1900,13 +1900,13 @@ class MStory(mongo.Document):
         self.story_hash = self.feed_guid_hash
 
         if self.story_content:
-            self.story_content_z = zlib.compress(self.story_content)
+            self.story_content_z = zlib.compress(smart_str(self.story_content))
             self.story_content = None
         if self.story_original_content:
-            self.story_original_content_z = zlib.compress(self.story_original_content)
+            self.story_original_content_z = zlib.compress(smart_str(self.story_original_content))
             self.story_original_content = None
         if self.story_latest_content:
-            self.story_latest_content_z = zlib.compress(self.story_latest_content)
+            self.story_latest_content_z = zlib.compress(smart_str(self.story_latest_content))
             self.story_latest_content = None
         if self.story_title and len(self.story_title) > story_title_max:
             self.story_title = self.story_title[:story_title_max]
diff --git a/apps/rss_feeds/tests.py b/apps/rss_feeds/tests.py
index 74e7c0c7b..bf4152a40 100644
--- a/apps/rss_feeds/tests.py
+++ b/apps/rss_feeds/tests.py
@@ -7,9 +7,10 @@ from django.conf import settings
 from apps.rss_feeds.models import Feed, MStory
 from mongoengine.connection import connect, disconnect
 
+
 class FeedTest(TestCase):
     fixtures = ['rss_feeds.json']
-    
+
     def setUp(self):
         disconnect()
         settings.MONGODB = connect('test_newsblur')
@@ -17,168 +18,167 @@ class FeedTest(TestCase):
     def tearDown(self):
         settings.MONGODB.drop_database('test_newsblur')
-        
+
     def test_load_feeds__gawker(self):
         self.client.login(username='conesus', password='test')
-        
+
         management.call_command('loaddata', 'gawker1.json', verbosity=0)
-        
+
         feed = Feed.objects.get(feed_link__contains='gawker')
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 0)
-        
+
         feed.update(force=True)
-        
+
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 38)
-        
+
         management.call_command('loaddata', 'gawker2.json', verbosity=0)
-        
+
         feed.update(force=True)
-        
+
         # Test: 1 changed char in content
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 38)
-        
+
         url = reverse('load-single-feed', kwargs=dict(feed_id=1))
         response = self.client.get(url)
         feed = json.decode(response.content)
         self.assertEquals(len(feed['stories']), 6)
-        
+
     def test_load_feeds__gothamist(self):
         self.client.login(username='conesus', password='test')
-        
+
         management.call_command('loaddata', 'gothamist_aug_2009_1.json', verbosity=0)
         feed = Feed.objects.get(feed_link__contains='gothamist')
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 0)
-        
+
         feed.update(force=True)
-        
+
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 42)
-        
+
         url = reverse('load-single-feed', kwargs=dict(feed_id=4))
         response = self.client.get(url)
         content = json.decode(response.content)
         self.assertEquals(len(content['stories']), 6)
-        
+
         management.call_command('loaddata', 'gothamist_aug_2009_2.json', verbosity=0)
         feed.update(force=True)
-        
+
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 42)
-        
+
         url = reverse('load-single-feed', kwargs=dict(feed_id=4))
         response = self.client.get(url)
         # print [c['story_title'] for c in json.decode(response.content)]
         content = json.decode(response.content)
         # Test: 1 changed char in title
         self.assertEquals(len(content['stories']), 6)
-        
+
     def test_load_feeds__slashdot(self):
         self.client.login(username='conesus', password='test')
-        
+
         old_story_guid = "tag:google.com,2005:reader/item/4528442633bc7b2b"
-        
+
         management.call_command('loaddata', 'slashdot1.json', verbosity=0)
-        
+
         feed = Feed.objects.get(feed_link__contains='slashdot')
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 0)
-        
+
         management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
-        
+
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 38)
-        
+
         response = self.client.get(reverse('load-feeds'))
         content = json.decode(response.content)
         self.assertEquals(content['feeds']['5']['nt'], 38)
         self.client.post(reverse('mark-story-as-read'), {'story_id': old_story_guid, 
                                                          'feed_id': 5})
-        
+
         response = self.client.get(reverse('refresh-feeds'))
         content = json.decode(response.content)
         self.assertEquals(content['feeds']['5']['nt'], 37)
-        
+
         management.call_command('loaddata', 'slashdot2.json', verbosity=0)
         management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
-        
+
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 38)
-        
+
         url = reverse('load-single-feed', kwargs=dict(feed_id=5))
         response = self.client.get(url)
-        
+
         # pprint([c['story_title'] for c in json.decode(response.content)])
         feed = json.decode(response.content)
-        
+
         # Test: 1 changed char in title
         self.assertEquals(len(feed['stories']), 6)
-        
+
         response = self.client.get(reverse('refresh-feeds'))
         content = json.decode(response.content)
         self.assertEquals(content['feeds']['5']['nt'], 37)
-        
-        
+
     def test_load_feeds__motherjones(self):
         self.client.login(username='conesus', password='test')
-        
+
         management.call_command('loaddata', 'motherjones1.json', verbosity=0)
-        
+
         feed = Feed.objects.get(feed_link__contains='motherjones')
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 0)
-        
+
         management.call_command('refresh_feed', force=1, feed=feed.pk, single_threaded=True, daemonize=False)
-        
+
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 10)
-        
+
         response = self.client.get(reverse('load-feeds'))
         content = json.decode(response.content)
         self.assertEquals(content['feeds'][str(feed.pk)]['nt'], 10)
         self.client.post(reverse('mark-story-as-read'), {'story_id': stories[0].story_guid, 
                                                          'feed_id': feed.pk})
-        
+
         response = self.client.get(reverse('refresh-feeds'))
         content = json.decode(response.content)
         self.assertEquals(content['feeds'][str(feed.pk)]['nt'], 9)
-        
+
         management.call_command('loaddata', 'motherjones2.json', verbosity=0)
         management.call_command('refresh_feed', force=1, feed=feed.pk, single_threaded=True, daemonize=False)
-        
+
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 10)
-        
+
         url = reverse('load-single-feed', kwargs=dict(feed_id=feed.pk))
         response = self.client.get(url)
-        
+
         # pprint([c['story_title'] for c in json.decode(response.content)])
         feed = json.decode(response.content)
-        
+
         # Test: 1 changed char in title
         self.assertEquals(len(feed['stories']), 6)
-        
+
         response = self.client.get(reverse('refresh-feeds'))
         content = json.decode(response.content)
         self.assertEquals(content['feeds'][str(feed['feed_id'])]['nt'], 9)
 
 
     def test_load_feeds__brokelyn__invalid_xml(self):
         self.client.login(username='conesus', password='test')
-        
+
         management.call_command('loaddata', 'brokelyn.json', verbosity=0)
         management.call_command('refresh_feed', force=1, feed=6, single_threaded=True, daemonize=False)
-        
+
         url = reverse('load-single-feed', kwargs=dict(feed_id=6))
         response = self.client.get(url)
-        
+
         # pprint([c['story_title'] for c in json.decode(response.content)])
         feed = json.decode(response.content)
-        
+
         # Test: 1 changed char in title
         self.assertEquals(len(feed['stories']), 6)
-        
+
     def test_all_feeds(self):
-        pass
\ No newline at end of file
+        pass
diff --git a/apps/rss_feeds/text_importer.py b/apps/rss_feeds/text_importer.py
index f8359d107..6548f0a7f 100644
--- a/apps/rss_feeds/text_importer.py
+++ b/apps/rss_feeds/text_importer.py
@@ -1,7 +1,6 @@
 import requests
 import zlib
 from requests.packages.urllib3.exceptions import LocationParseError
-from django.conf import settings
 from socket import error as SocketError
 from mongoengine.queryset import NotUniqueError
 from vendor.readability import readability
@@ -9,38 +8,41 @@ from utils import log as logging
 from utils.feed_functions import timelimit, TimeoutError
 from OpenSSL.SSL import Error as OpenSSLError
 from pyasn1.error import PyAsn1Error
+from django.utils.encoding import smart_str
 
 
 BROKEN_URLS = [
     "gamespot.com",
 ]
 
+
 class TextImporter:
-    
+
     def __init__(self, story=None, feed=None, story_url=None, request=None, debug=False):
         self.story = story
         self.story_url = story_url
         self.feed = feed
         self.request = request
         self.debug = debug
-    
+
     @property
     def headers(self):
+        num_subscribers = getattr(self.feed, 'num_subscribers', 0)
         return {
             'User-Agent': 'NewsBlur Content Fetcher - %s subscriber%s - %s '
                           '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
                           'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
                           'Safari/534.48.3)' % (
-                self.feed.num_subscribers,
-                's' if self.feed.num_subscribers != 1 else '',
-                self.feed.permalink,
-            ),
+                              num_subscribers,
+                              's' if num_subscribers != 1 else '',
+                              getattr(self.feed, 'permalink', '')
+                          ),
         }
-    
+
     def fetch(self, skip_save=False, return_document=False):
         if self.story_url and any(broken_url in self.story_url for broken_url in BROKEN_URLS):
             logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: banned")
             return
-        
+
         try:
             resp = self.fetch_request()
         except TimeoutError:
@@ -49,56 +51,46 @@ class TextImporter:
         except requests.exceptions.TooManyRedirects:
             logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
             resp = None
-        
+
         if not resp:
             return
-        
-        try:
-            text = resp.text
-        except (LookupError, TypeError):
-            text = resp.content
-        
-        charset_declared = 'charset' in resp.headers.get('content-type', "")
-        if resp.encoding and resp.encoding != 'utf-8' and not charset_declared:
-            try:
-                text = text.encode(resp.encoding)
-            except (LookupError, UnicodeEncodeError):
-                pass
-        original_text_doc = readability.Document(text, url=resp.url,
+
+        text = resp.text
+        original_text_doc = readability.Document(text, url=resp.url,
                                                  debug=self.debug,
                                                  positive_keywords=["postContent", "postField"])
         try:
             content = original_text_doc.summary(html_partial=True)
         except readability.Unparseable:
             return
-        
+
         try:
             title = original_text_doc.title()
         except TypeError:
             title = ""
 
         url = resp.url
-        
+
         if content:
             if self.story and not skip_save:
-                self.story.original_text_z = zlib.compress(content)
+                self.story.original_text_z = zlib.compress(smart_str(content))
                 try:
                     self.story.save()
                 except NotUniqueError:
                     pass
             logging.user(self.request, ("~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
-                len(unicode(content)),
+                len(content),
                 self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
             )), warn_color=False)
         else:
             logging.user(self.request, ("~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % (
                 self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
             )), warn_color=False)
-        
+
         if return_document:
             return dict(content=content, title=title, url=url, doc=original_text_doc)
         return content
-    
+
     @timelimit(10)
     def fetch_request(self):
         url = self.story_url
@@ -107,7 +99,7 @@ class TextImporter:
         try:
             r = requests.get(url, headers=self.headers, verify=False)
             r.connection.close()
-        except (AttributeError, SocketError, requests.ConnectionError, 
+        except (AttributeError, SocketError, requests.ConnectionError,
                 requests.models.MissingSchema, requests.sessions.InvalidSchema,
                 requests.sessions.TooManyRedirects,
                 requests.models.InvalidURL,
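
A note on the recurring `smart_str()` wrapper above: on the Python 2 / Django 1.x stack this code runs on, `zlib.compress()` accepts only bytestrings, so a `unicode` story body is implicitly encoded as ASCII and raises `UnicodeEncodeError` on the first non-ASCII character. A minimal sketch of the failure and the fix, with a made-up story string:

```python
# Python 2 sketch: why MStory.save() and TextImporter.fetch() now call
# zlib.compress(smart_str(...)) instead of zlib.compress(...).
import zlib

from django.utils.encoding import smart_str  # unicode -> UTF-8 bytestring

story_content = u'Le caf\xe9 est ouvert'  # hypothetical story with non-ASCII text

try:
    zlib.compress(story_content)  # implicit ASCII encode of the unicode argument
except UnicodeEncodeError as e:
    print 'unpatched call fails: %s' % e

# The patched form encodes to UTF-8 bytes first, so compression round-trips.
compressed = zlib.compress(smart_str(story_content))
assert zlib.decompress(compressed).decode('utf-8') == story_content
```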
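
The `TextImporter.headers` change is defensive rather than encoding-related: the importer can be constructed with `feed=None` (for instance when fetching by `story_url` alone), and the old property dereferenced `self.feed.num_subscribers` unconditionally. A sketch of the `getattr()` fallback pattern, using a hypothetical stand-in class rather than the real importer:

```python
# Python 2 sketch of the getattr() fallbacks in TextImporter.headers.
# FakeImporter is a stand-in; only the attribute-access pattern matters.
class FakeImporter(object):
    def __init__(self, feed=None):
        self.feed = feed

    @property
    def headers(self):
        # getattr() tolerates feed=None as well as feeds missing the attribute
        num_subscribers = getattr(self.feed, 'num_subscribers', 0)
        return {
            'User-Agent': 'NewsBlur Content Fetcher - %s subscriber%s - %s' % (
                num_subscribers,
                's' if num_subscribers != 1 else '',
                getattr(self.feed, 'permalink', ''),
            ),
        }

print FakeImporter(feed=None).headers['User-Agent']
# -> NewsBlur Content Fetcher - 0 subscribers -
```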
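
Dropping the charset juggling in `fetch()` delegates decoding entirely to `requests`: `resp.text` returns unicode decoded with the declared or guessed encoding, so the deleted branch that re-encoded the already-decoded text is no longer attempted. A rough illustration with a hand-built `Response` (no network; poking `_content` like this is for demonstration only):

```python
# Sketch only: build a requests Response by hand to show what resp.text yields.
import requests

resp = requests.models.Response()
resp._content = u'Le caf\xe9'.encode('utf-8')   # raw body bytes as fetched
resp.headers['content-type'] = 'text/html'      # note: no charset declared
resp.encoding = 'utf-8'                         # normally detected by requests

# resp.text decodes the bytes with resp.encoding, yielding unicode that can be
# handed straight to readability.Document -- no manual re-encoding step needed.
print resp.text
```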