Mirror of https://github.com/samuelclay/NewsBlur.git
commit 53e4998146
3 changed files with 78 additions and 86 deletions
@@ -24,6 +24,7 @@ from django.core.urlresolvers import reverse
 from django.contrib.auth.models import User
 from django.contrib.sites.models import Site
 from django.template.defaultfilters import slugify
+from django.utils.encoding import smart_str
 from mongoengine.queryset import OperationError, Q, NotUniqueError
 from mongoengine.base import ValidationError
 from vendor.timezones.utilities import localtime_for_timezone
@@ -84,7 +85,6 @@ class Feed(models.Model):
     s3_icon = models.NullBooleanField(default=False, blank=True, null=True)
     search_indexed = models.NullBooleanField(default=None, null=True, blank=True)
-
 
     class Meta:
         db_table = "feeds"
         ordering = ["feed_title"]
@@ -1900,13 +1900,13 @@ class MStory(mongo.Document):
             self.story_hash = self.feed_guid_hash
 
         if self.story_content:
-            self.story_content_z = zlib.compress(self.story_content)
+            self.story_content_z = zlib.compress(smart_str(self.story_content))
             self.story_content = None
         if self.story_original_content:
-            self.story_original_content_z = zlib.compress(self.story_original_content)
+            self.story_original_content_z = zlib.compress(smart_str(self.story_original_content))
             self.story_original_content = None
         if self.story_latest_content:
-            self.story_latest_content_z = zlib.compress(self.story_latest_content)
+            self.story_latest_content_z = zlib.compress(smart_str(self.story_latest_content))
             self.story_latest_content = None
         if self.story_title and len(self.story_title) > story_title_max:
             self.story_title = self.story_title[:story_title_max]
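Why the smart_str() wrapping matters: on Python 2, zlib.compress() accepts only byte strings, so handing it a unicode story body makes the interpreter coerce it through the ascii codec, which raises UnicodeEncodeError on the first non-ASCII character. Django's smart_str() encodes to UTF-8 bytes first. A minimal sketch of the failure and the fix (the story text is a made-up value):

    # -*- coding: utf-8 -*-
    import zlib
    from django.utils.encoding import smart_str

    story_content = u'Caf\xe9 society'  # hypothetical story body with one non-ASCII char

    try:
        zlib.compress(story_content)  # unicode coerced via the ascii codec: crashes
    except UnicodeEncodeError:
        pass  # this is the crash the commit fixes

    story_content_z = zlib.compress(smart_str(story_content))  # UTF-8 bytes compress fine
    assert zlib.decompress(story_content_z).decode('utf-8') == story_content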

@@ -7,9 +7,10 @@ from django.conf import settings
 from apps.rss_feeds.models import Feed, MStory
 from mongoengine.connection import connect, disconnect
 
 
 class FeedTest(TestCase):
     fixtures = ['rss_feeds.json']
 
     def setUp(self):
         disconnect()
+        settings.MONGODB = connect('test_newsblur')
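The new setUp() line points every test at a disposable Mongo database, and tearDown() (below) drops it, so each test reloads its fixtures against a clean slate. A stripped-down sketch of the same pattern, with only the database name taken from the diff:

    from django.conf import settings
    from django.test import TestCase
    from mongoengine.connection import connect, disconnect

    class IsolatedMongoTest(TestCase):  # hypothetical test case, same pattern as FeedTest
        def setUp(self):
            disconnect()                                 # clear any leftover connection
            settings.MONGODB = connect('test_newsblur')  # fresh throwaway database

        def tearDown(self):
            settings.MONGODB.drop_database('test_newsblur')  # leave nothing behind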
@@ -17,168 +18,167 @@ class FeedTest(TestCase):
 
     def tearDown(self):
         settings.MONGODB.drop_database('test_newsblur')
 
     def test_load_feeds__gawker(self):
         self.client.login(username='conesus', password='test')
 
         management.call_command('loaddata', 'gawker1.json', verbosity=0)
 
         feed = Feed.objects.get(feed_link__contains='gawker')
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 0)
 
         feed.update(force=True)
 
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 38)
 
         management.call_command('loaddata', 'gawker2.json', verbosity=0)
 
         feed.update(force=True)
 
         # Test: 1 changed char in content
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 38)
 
         url = reverse('load-single-feed', kwargs=dict(feed_id=1))
         response = self.client.get(url)
         feed = json.decode(response.content)
         self.assertEquals(len(feed['stories']), 6)
 
     def test_load_feeds__gothamist(self):
         self.client.login(username='conesus', password='test')
 
         management.call_command('loaddata', 'gothamist_aug_2009_1.json', verbosity=0)
         feed = Feed.objects.get(feed_link__contains='gothamist')
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 0)
 
         feed.update(force=True)
 
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 42)
 
         url = reverse('load-single-feed', kwargs=dict(feed_id=4))
         response = self.client.get(url)
         content = json.decode(response.content)
         self.assertEquals(len(content['stories']), 6)
 
         management.call_command('loaddata', 'gothamist_aug_2009_2.json', verbosity=0)
         feed.update(force=True)
 
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 42)
 
         url = reverse('load-single-feed', kwargs=dict(feed_id=4))
         response = self.client.get(url)
         # print [c['story_title'] for c in json.decode(response.content)]
         content = json.decode(response.content)
         # Test: 1 changed char in title
         self.assertEquals(len(content['stories']), 6)
 
     def test_load_feeds__slashdot(self):
         self.client.login(username='conesus', password='test')
 
         old_story_guid = "tag:google.com,2005:reader/item/4528442633bc7b2b"
 
         management.call_command('loaddata', 'slashdot1.json', verbosity=0)
 
         feed = Feed.objects.get(feed_link__contains='slashdot')
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 0)
 
         management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
 
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 38)
 
         response = self.client.get(reverse('load-feeds'))
         content = json.decode(response.content)
         self.assertEquals(content['feeds']['5']['nt'], 38)
 
         self.client.post(reverse('mark-story-as-read'), {'story_id': old_story_guid, 'feed_id': 5})
 
         response = self.client.get(reverse('refresh-feeds'))
         content = json.decode(response.content)
         self.assertEquals(content['feeds']['5']['nt'], 37)
 
         management.call_command('loaddata', 'slashdot2.json', verbosity=0)
         management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
 
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 38)
 
         url = reverse('load-single-feed', kwargs=dict(feed_id=5))
         response = self.client.get(url)
 
         # pprint([c['story_title'] for c in json.decode(response.content)])
         feed = json.decode(response.content)
 
         # Test: 1 changed char in title
         self.assertEquals(len(feed['stories']), 6)
 
         response = self.client.get(reverse('refresh-feeds'))
         content = json.decode(response.content)
         self.assertEquals(content['feeds']['5']['nt'], 37)
 
     def test_load_feeds__motherjones(self):
         self.client.login(username='conesus', password='test')
 
         management.call_command('loaddata', 'motherjones1.json', verbosity=0)
 
         feed = Feed.objects.get(feed_link__contains='motherjones')
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 0)
 
         management.call_command('refresh_feed', force=1, feed=feed.pk, single_threaded=True, daemonize=False)
 
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 10)
 
         response = self.client.get(reverse('load-feeds'))
         content = json.decode(response.content)
         self.assertEquals(content['feeds'][str(feed.pk)]['nt'], 10)
 
         self.client.post(reverse('mark-story-as-read'), {'story_id': stories[0].story_guid, 'feed_id': feed.pk})
 
         response = self.client.get(reverse('refresh-feeds'))
         content = json.decode(response.content)
         self.assertEquals(content['feeds'][str(feed.pk)]['nt'], 9)
 
         management.call_command('loaddata', 'motherjones2.json', verbosity=0)
         management.call_command('refresh_feed', force=1, feed=feed.pk, single_threaded=True, daemonize=False)
 
         stories = MStory.objects(story_feed_id=feed.pk)
         self.assertEquals(stories.count(), 10)
 
         url = reverse('load-single-feed', kwargs=dict(feed_id=feed.pk))
         response = self.client.get(url)
 
         # pprint([c['story_title'] for c in json.decode(response.content)])
         feed = json.decode(response.content)
 
         # Test: 1 changed char in title
         self.assertEquals(len(feed['stories']), 6)
 
         response = self.client.get(reverse('refresh-feeds'))
         content = json.decode(response.content)
         self.assertEquals(content['feeds'][str(feed['feed_id'])]['nt'], 9)
 
     def test_load_feeds__brokelyn__invalid_xml(self):
         self.client.login(username='conesus', password='test')
 
         management.call_command('loaddata', 'brokelyn.json', verbosity=0)
         management.call_command('refresh_feed', force=1, feed=6, single_threaded=True, daemonize=False)
 
         url = reverse('load-single-feed', kwargs=dict(feed_id=6))
         response = self.client.get(url)
 
         # pprint([c['story_title'] for c in json.decode(response.content)])
         feed = json.decode(response.content)
 
         # Test: 1 changed char in title
         self.assertEquals(len(feed['stories']), 6)
 
     def test_all_feeds(self):
-        pass
         pass
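A note on the 'nt' assertions throughout these tests: NewsBlur buckets unread counts by intelligence classification as 'ps', 'nt', and 'ng' (positive, neutral, negative), which is why marking a single story as read is expected to drop the neutral count from 38 to 37. An illustrative payload shaped like what the tests decode, not captured output:

    content = {'feeds': {'5': {'ps': 0, 'nt': 38, 'ng': 0}}}  # illustrative response body
    assert content['feeds']['5']['nt'] == 38                  # before mark-story-as-read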

@@ -1,7 +1,6 @@
 import requests
 import zlib
 from requests.packages.urllib3.exceptions import LocationParseError
 from django.conf import settings
 from socket import error as SocketError
 from mongoengine.queryset import NotUniqueError
 from vendor.readability import readability
@@ -9,38 +8,41 @@ from utils import log as logging
 from utils.feed_functions import timelimit, TimeoutError
 from OpenSSL.SSL import Error as OpenSSLError
 from pyasn1.error import PyAsn1Error
+from django.utils.encoding import smart_str
 
 BROKEN_URLS = [
     "gamespot.com",
 ]
 
 
 class TextImporter:
 
     def __init__(self, story=None, feed=None, story_url=None, request=None, debug=False):
         self.story = story
         self.story_url = story_url
         self.feed = feed
         self.request = request
         self.debug = debug
 
     @property
     def headers(self):
+        num_subscribers = getattr(self.feed, 'num_subscribers', 0)
         return {
             'User-Agent': 'NewsBlur Content Fetcher - %s subscriber%s - %s '
                           '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
                           'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
                           'Safari/534.48.3)' % (
-                              self.feed.num_subscribers,
-                              's' if self.feed.num_subscribers != 1 else '',
-                              self.feed.permalink,
-                          ),
+                              num_subscribers,
+                              's' if num_subscribers != 1 else '',
+                              getattr(self.feed, 'permalink', '')
+                          ),
         }
 
     def fetch(self, skip_save=False, return_document=False):
         if self.story_url and any(broken_url in self.story_url for broken_url in BROKEN_URLS):
             logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: banned")
             return
 
         try:
             resp = self.fetch_request()
         except TimeoutError:
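The getattr() rewrite in headers is what lets TextImporter run with story_url alone, i.e. feed=None: the old code dereferenced self.feed.num_subscribers and crashed with AttributeError before the request was ever sent. The guard in isolation:

    feed = None  # TextImporter(story_url=...) without a Feed instance
    num_subscribers = getattr(feed, 'num_subscribers', 0)  # 0 instead of AttributeError
    permalink = getattr(feed, 'permalink', '')             # likewise for the feed URL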
@@ -49,56 +51,46 @@ class TextImporter:
         except requests.exceptions.TooManyRedirects:
             logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
             resp = None
 
         if not resp:
             return
 
-        try:
-            text = resp.text
-        except (LookupError, TypeError):
-            text = resp.content
-
-        charset_declared = 'charset' in resp.headers.get('content-type', "")
-        if resp.encoding and resp.encoding != 'utf-8' and not charset_declared:
-            try:
-                text = text.encode(resp.encoding)
-            except (LookupError, UnicodeEncodeError):
-                pass
-        original_text_doc = readability.Document(text, url=resp.url,
+        text = resp.text
+        original_text_doc = readability.Document(text, url=resp.url,
                                                  debug=self.debug,
                                                  positive_keywords=["postContent", "postField"])
         try:
             content = original_text_doc.summary(html_partial=True)
         except readability.Unparseable:
             return
 
         try:
             title = original_text_doc.title()
         except TypeError:
             title = ""
         url = resp.url
 
         if content:
             if self.story and not skip_save:
-                self.story.original_text_z = zlib.compress(content)
+                self.story.original_text_z = zlib.compress(smart_str(content))
                 try:
                     self.story.save()
                 except NotUniqueError:
                     pass
             logging.user(self.request, ("~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
-                len(unicode(content)),
+                len(content),
                 self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
             )), warn_color=False)
         else:
             logging.user(self.request, ("~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % (
                 self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
             )), warn_color=False)
 
         if return_document:
             return dict(content=content, title=title, url=url, doc=original_text_doc)
 
         return content
 
     @timelimit(10)
     def fetch_request(self):
         url = self.story_url
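The deleted block above re-encoded the body by hand whenever the server declared no charset; the replacement simply trusts requests, whose .text property decodes .content using the header-declared encoding and falls back to charset detection when none is given. In isolation (placeholder URL):

    import requests

    resp = requests.get('http://example.com/article')  # placeholder URL
    raw = resp.content  # raw bytes exactly as they came over the wire
    text = resp.text    # decoded str: resp.encoding if declared, detected otherwise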
@@ -107,7 +99,7 @@ class TextImporter:
         try:
             r = requests.get(url, headers=self.headers, verify=False)
             r.connection.close()
         except (AttributeError, SocketError, requests.ConnectionError,
                 requests.models.MissingSchema, requests.sessions.InvalidSchema,
                 requests.sessions.TooManyRedirects,
                 requests.models.InvalidURL,
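fetch_request() stays wrapped in @timelimit(10), imported from utils.feed_functions at the top of the file, so a hung server costs at most ten seconds before fetch() catches TimeoutError and gives up. A rough sketch of that contract, not the decorator's actual implementation:

    import requests
    from utils.feed_functions import timelimit, TimeoutError  # as imported in this file

    @timelimit(10)
    def fetch_request(url, headers):  # hypothetical free-function version
        return requests.get(url, headers=headers, verify=False)

    try:
        resp = fetch_request('http://example.com/feed', headers={})  # placeholder URL
    except TimeoutError:
        resp = None  # handled the same as any other failed fetch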