NewsBlur/apps/rss_feeds/text_importer.py

115 lines
4.4 KiB
Python

import requests
import zlib
from requests.packages.urllib3.exceptions import LocationParseError
from django.conf import settings
from socket import error as SocketError
from mongoengine.queryset import NotUniqueError
from vendor.readability import readability
from utils import log as logging
from utils.feed_functions import timelimit, TimeoutError
from OpenSSL.SSL import Error as OpenSSLError
from pyasn1.error import PyAsn1Error
# Hostnames whose pages are known to break or block original-text fetching.
# fetch() refuses outright to request any URL containing one of these.
BROKEN_URLS = [
    "gamespot.com",
]
class TextImporter:
    """Fetches the full web page behind a story and extracts the readable
    article body with readability, optionally compressing and saving the
    result back onto the story document."""

    def __init__(self, story=None, feed=None, story_url=None, request=None, debug=False):
        # story: MStory-like document; may be None when only a URL is given.
        # story_url: explicit URL override; falls back to story.story_permalink.
        self.story = story
        self.story_url = story_url
        self.feed = feed
        self.request = request
        self.debug = debug

    @property
    def headers(self):
        """Request headers advertising NewsBlur and this feed's subscriber count."""
        return {
            'User-Agent': 'NewsBlur Content Fetcher - %s subscriber%s - %s '
                          '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
                          'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
                          'Safari/534.48.3)' % (
                              self.feed.num_subscribers,
                              's' if self.feed.num_subscribers != 1 else '',
                              self.feed.permalink,
                          ),
        }

    def fetch(self, skip_save=False, return_document=False):
        """Fetch and extract the original story text.

        Returns the extracted HTML content string, a dict of
        (content, title, url, doc) when return_document=True, or None on
        any failure. Saves compressed content onto self.story unless
        skip_save is set.
        """
        # Resolve the fetch URL the same way fetch_request() does, so the
        # banned-domain check also covers story permalinks and never runs
        # `in` against None (previously raised TypeError when story_url
        # was None and only a story was supplied).
        url = self.story_url
        if self.story and not url:
            url = self.story.story_permalink
        if url and any(broken_url in url for broken_url in BROKEN_URLS):
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: banned")
            return

        try:
            resp = self.fetch_request()
        except TimeoutError:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
            resp = None
        except requests.exceptions.TooManyRedirects:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
            resp = None
        if not resp:
            return

        try:
            text = resp.text
        except (LookupError, TypeError):
            # LookupError: response claims an unknown codec;
            # TypeError: detection failed entirely. Use the raw bytes.
            text = resp.content

        # If the server never declared a charset and requests guessed a
        # non-utf-8 encoding, hand readability the re-encoded bytes rather
        # than the possibly mis-decoded text.
        charset_declared = 'charset' in resp.headers.get('content-type', "")
        if resp.encoding and resp.encoding != 'utf-8' and not charset_declared:
            try:
                text = text.encode(resp.encoding)
            except (LookupError, UnicodeEncodeError):
                pass

        original_text_doc = readability.Document(text, url=resp.url,
                                                 debug=self.debug,
                                                 positive_keywords=["postContent", "postField"])
        try:
            content = original_text_doc.summary(html_partial=True)
        except readability.Unparseable:
            return

        try:
            # readability raises TypeError on pages with no usable <title>.
            title = original_text_doc.title()
        except TypeError:
            title = ""

        url = resp.url

        if content:
            if self.story and not skip_save:
                self.story.original_text_z = zlib.compress(content)
                try:
                    self.story.save()
                except NotUniqueError:
                    # A concurrent fetcher already saved this story; the
                    # content is there, so this is not an error.
                    pass
            logging.user(self.request, ("~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
                len(unicode(content)),
                self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
            )), warn_color=False)
        else:
            logging.user(self.request, ("~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % (
                self.story and self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
            )), warn_color=False)

        if return_document:
            return dict(content=content, title=title, url=url, doc=original_text_doc)

        return content

    @timelimit(10)
    def fetch_request(self):
        """GET the story page (10 second hard limit via @timelimit).

        Returns the requests Response, or None on any anticipated
        connection/URL error (which is logged, not raised).
        """
        url = self.story_url
        if self.story and not url:
            url = self.story.story_permalink
        try:
            # verify=False: story pages routinely sit behind broken SSL
            # certs and this fetch is best-effort.
            r = requests.get(url, headers=self.headers, verify=False)
            r.connection.close()
        except (AttributeError, SocketError, requests.ConnectionError,
                requests.models.MissingSchema, requests.sessions.InvalidSchema,
                LocationParseError, OpenSSLError, PyAsn1Error) as e:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e)
            return
        return r