Merge pull request #835 from sv0/text_importer

Text importer
This commit is contained in:
Samuel Clay 2015-11-30 16:03:50 -08:00
commit 53e4998146
3 changed files with 78 additions and 86 deletions

View file

@ -24,6 +24,7 @@ from django.core.urlresolvers import reverse
from django.contrib.auth.models import User from django.contrib.auth.models import User
from django.contrib.sites.models import Site from django.contrib.sites.models import Site
from django.template.defaultfilters import slugify from django.template.defaultfilters import slugify
from django.utils.encoding import smart_str
from mongoengine.queryset import OperationError, Q, NotUniqueError from mongoengine.queryset import OperationError, Q, NotUniqueError
from mongoengine.base import ValidationError from mongoengine.base import ValidationError
from vendor.timezones.utilities import localtime_for_timezone from vendor.timezones.utilities import localtime_for_timezone
@ -84,7 +85,6 @@ class Feed(models.Model):
s3_icon = models.NullBooleanField(default=False, blank=True, null=True) s3_icon = models.NullBooleanField(default=False, blank=True, null=True)
search_indexed = models.NullBooleanField(default=None, null=True, blank=True) search_indexed = models.NullBooleanField(default=None, null=True, blank=True)
class Meta: class Meta:
db_table="feeds" db_table="feeds"
ordering=["feed_title"] ordering=["feed_title"]
@ -1900,13 +1900,13 @@ class MStory(mongo.Document):
self.story_hash = self.feed_guid_hash self.story_hash = self.feed_guid_hash
if self.story_content: if self.story_content:
self.story_content_z = zlib.compress(self.story_content) self.story_content_z = zlib.compress(smart_str(self.story_content))
self.story_content = None self.story_content = None
if self.story_original_content: if self.story_original_content:
self.story_original_content_z = zlib.compress(self.story_original_content) self.story_original_content_z = zlib.compress(smart_str(self.story_original_content))
self.story_original_content = None self.story_original_content = None
if self.story_latest_content: if self.story_latest_content:
self.story_latest_content_z = zlib.compress(self.story_latest_content) self.story_latest_content_z = zlib.compress(smart_str(self.story_latest_content))
self.story_latest_content = None self.story_latest_content = None
if self.story_title and len(self.story_title) > story_title_max: if self.story_title and len(self.story_title) > story_title_max:
self.story_title = self.story_title[:story_title_max] self.story_title = self.story_title[:story_title_max]

View file

@ -7,6 +7,7 @@ from django.conf import settings
from apps.rss_feeds.models import Feed, MStory from apps.rss_feeds.models import Feed, MStory
from mongoengine.connection import connect, disconnect from mongoengine.connection import connect, disconnect
class FeedTest(TestCase): class FeedTest(TestCase):
fixtures = ['rss_feeds.json'] fixtures = ['rss_feeds.json']
@ -121,7 +122,6 @@ class FeedTest(TestCase):
content = json.decode(response.content) content = json.decode(response.content)
self.assertEquals(content['feeds']['5']['nt'], 37) self.assertEquals(content['feeds']['5']['nt'], 37)
def test_load_feeds__motherjones(self): def test_load_feeds__motherjones(self):
self.client.login(username='conesus', password='test') self.client.login(username='conesus', password='test')

View file

@ -1,7 +1,6 @@
import requests import requests
import zlib import zlib
from requests.packages.urllib3.exceptions import LocationParseError from requests.packages.urllib3.exceptions import LocationParseError
from django.conf import settings
from socket import error as SocketError from socket import error as SocketError
from mongoengine.queryset import NotUniqueError from mongoengine.queryset import NotUniqueError
from vendor.readability import readability from vendor.readability import readability
@ -9,11 +8,13 @@ from utils import log as logging
from utils.feed_functions import timelimit, TimeoutError from utils.feed_functions import timelimit, TimeoutError
from OpenSSL.SSL import Error as OpenSSLError from OpenSSL.SSL import Error as OpenSSLError
from pyasn1.error import PyAsn1Error from pyasn1.error import PyAsn1Error
from django.utils.encoding import smart_str
BROKEN_URLS = [ BROKEN_URLS = [
"gamespot.com", "gamespot.com",
] ]
class TextImporter: class TextImporter:
def __init__(self, story=None, feed=None, story_url=None, request=None, debug=False): def __init__(self, story=None, feed=None, story_url=None, request=None, debug=False):
@ -25,14 +26,15 @@ class TextImporter:
@property @property
def headers(self): def headers(self):
num_subscribers = getattr(self.feed, 'num_subscribers', 0)
return { return {
'User-Agent': 'NewsBlur Content Fetcher - %s subscriber%s - %s ' 'User-Agent': 'NewsBlur Content Fetcher - %s subscriber%s - %s '
'(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) ' '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 ' 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
'Safari/534.48.3)' % ( 'Safari/534.48.3)' % (
self.feed.num_subscribers, num_subscribers,
's' if self.feed.num_subscribers != 1 else '', 's' if num_subscribers != 1 else '',
self.feed.permalink, getattr(self.feed, 'permalink', '')
), ),
} }
@ -53,17 +55,7 @@ class TextImporter:
if not resp: if not resp:
return return
try:
text = resp.text text = resp.text
except (LookupError, TypeError):
text = resp.content
charset_declared = 'charset' in resp.headers.get('content-type', "")
if resp.encoding and resp.encoding != 'utf-8' and not charset_declared:
try:
text = text.encode(resp.encoding)
except (LookupError, UnicodeEncodeError):
pass
original_text_doc = readability.Document(text, url=resp.url, original_text_doc = readability.Document(text, url=resp.url,
debug=self.debug, debug=self.debug,
positive_keywords=["postContent", "postField"]) positive_keywords=["postContent", "postField"])
@ -80,13 +72,13 @@ class TextImporter:
if content: if content:
if self.story and not skip_save: if self.story and not skip_save:
self.story.original_text_z = zlib.compress(content) self.story.original_text_z = zlib.compress(smart_str(content))
try: try:
self.story.save() self.story.save()
except NotUniqueError: except NotUniqueError:
pass pass
logging.user(self.request, ("~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % ( logging.user(self.request, ("~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
len(unicode(content)), len(content),
self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z)) self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
)), warn_color=False) )), warn_color=False)
else: else: