NewsBlur/apps/rss_feeds/text_importer.py


import requests
import urllib3
import zlib
from simplejson.decoder import JSONDecodeError
from requests.packages.urllib3.exceptions import LocationParseError
from socket import error as SocketError
from mongoengine.queryset import NotUniqueError
from vendor.readability import readability
from lxml.etree import ParserError
from utils import log as logging
from utils.feed_functions import timelimit, TimeoutError
from OpenSSL.SSL import Error as OpenSSLError
from pyasn1.error import PyAsn1Error
from django.utils.encoding import smart_str
from django.conf import settings
from django.utils.encoding import smart_bytes
from django.contrib.sites.models import Site
from bs4 import BeautifulSoup
from urllib.parse import urljoin
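
# Domains for which original-text fetching is skipped entirely; requests to
# these sites are logged as "banned" in fetch() below.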
BROKEN_URLS = [
    "gamespot.com",
    'thedailyskip.com',
]


class TextImporter:
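    """Fetches the full original text of a story, using the Mercury parser
    service when enabled and falling back to readability-based extraction."""
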
    def __init__(self, story=None, feed=None, story_url=None, request=None, debug=False):
        self.story = story
        self.story_url = story_url
        if self.story and not self.story_url:
            self.story_url = self.story.story_permalink
        self.feed = feed
        self.request = request
        self.debug = debug
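    # Headers sent with every fetch, identifying NewsBlur, the feed's
    # subscriber count, its permalink, and an optional fake user agent.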
    @property
    def headers(self):
        num_subscribers = getattr(self.feed, 'num_subscribers', 0)
        return {
            'User-Agent': 'NewsBlur Content Fetcher - %s subscriber%s - %s %s' % (
                num_subscribers,
                's' if num_subscribers != 1 else '',
                getattr(self.feed, 'permalink', ''),
                getattr(self.feed, 'fake_user_agent', ''),
            ),
        }
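    # Entry point: try the Mercury parser first, then fall back to a direct
    # readability extraction if Mercury is disabled or comes up empty.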
    def fetch(self, skip_save=False, return_document=False, use_mercury=True):
        if self.story_url and any(broken_url in self.story_url for broken_url in BROKEN_URLS):
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: banned")
            return

        results = None
        if use_mercury:
            results = self.fetch_mercury(skip_save=skip_save, return_document=return_document)
        if not results:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY with Mercury, trying readability...", warn_color=False)
            results = self.fetch_manually(skip_save=skip_save, return_document=return_document)
        return results
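
    # Fetch the story through the Mercury parser proxy endpoint (see
    # fetch_request) and unpack its JSON payload into content/title/url/image.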
    def fetch_mercury(self, skip_save=False, return_document=False):
        try:
            resp = self.fetch_request(use_mercury=True)
        except TimeoutError:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
            resp = None
        except requests.exceptions.TooManyRedirects:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
            resp = None
        if not resp:
            return

        try:
            doc = resp.json()
        except JSONDecodeError:
            doc = None
        if not doc or doc.get('error', False):
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % (
                doc and doc.get('messages', None) or "[unknown mercury error]"))
            return

        text = doc['content']
        title = doc['title']
        url = doc['url']
        image = doc['lead_image_url']
        if image and ('http://' in image[1:] or 'https://' in image[1:]):
            logging.user(self.request, "~SN~FRRemoving broken image from text: %s" % image)
            image = None

        return self.process_content(text, title, url, image, skip_save=skip_save, return_document=return_document)
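
    # Fetch the page directly and extract the article body with readability.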
    def fetch_manually(self, skip_save=False, return_document=False):
        try:
            resp = self.fetch_request(use_mercury=False)
        except TimeoutError:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
            resp = None
        except requests.exceptions.TooManyRedirects:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
            resp = None
        if not resp:
            return

        try:
            text = resp.text
        except (LookupError, TypeError):
            text = resp.content

        # if self.debug:
        #     logging.user(self.request, "~FBOriginal text's website: %s" % text)

        # if resp.encoding and resp.encoding != 'utf-8':
        #     try:
        #         text = text.encode(resp.encoding)
        #     except (LookupError, UnicodeEncodeError):
        #         pass

        if text:
            text = text.replace("\xc2\xa0", " ")  # Non-breaking space, mangled when the encoding is not utf-8
            text = text.replace("\\u00a0", " ")   # Escaped non-breaking space, mangled the same way

        original_text_doc = readability.Document(text, url=resp.url,
                                                 positive_keywords="post, entry, postProp, article, postContent, postField")
        try:
            content = original_text_doc.summary(html_partial=True)
        except (readability.Unparseable, ParserError) as e:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e)
            return

        try:
            title = original_text_doc.title()
        except TypeError:
            title = ""

        url = resp.url
        return self.process_content(content, title, url, image=None, skip_save=skip_save,
                                    return_document=return_document, original_text_doc=original_text_doc)
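
    # Keep the extracted text only if it beats the story's existing content
    # in length; optionally compress and save it onto the story.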
    def process_content(self, content, title, url, image, skip_save=False, return_document=False,
                        original_text_doc=None):
        original_story_content = self.story and self.story.story_content_z and zlib.decompress(self.story.story_content_z)
        if not original_story_content:
            original_story_content = ""
        story_image_urls = self.story and self.story.image_urls
        if not story_image_urls:
            story_image_urls = []

        content = self.add_hero_image(content, story_image_urls)

        if content and len(content) > len(original_story_content):
            if self.story and not skip_save:
                self.story.original_text_z = zlib.compress(smart_bytes(content))
                try:
                    self.story.save()
                except NotUniqueError as e:
                    logging.user(self.request, ("~SN~FYFetched ~FGoriginal text~FY: %s" % (e)), warn_color=False)
            logging.user(self.request, ("~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
                len(content),
                len(original_story_content)
            )), warn_color=False)
        else:
            logging.user(self.request, ("~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % (
                len(original_story_content)
            )), warn_color=False)
            return

        if content:
            content = self.rewrite_content(content)

        if return_document:
            return dict(content=content, title=title, url=url, doc=original_text_doc, image=image)
        return content
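    # Prepend the first story image missing from the extracted text, so
    # text-only extractions still get a hero image.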
    def add_hero_image(self, content, image_urls):
        # The original story needs images before a hero image can be added to
        # extracted text that may not have any of its own.
        if not len(image_urls):
            return content

        content_soup = BeautifulSoup(content, features="lxml")
        content_imgs = content_soup.findAll('img')
        for img in content_imgs:
            # Since NewsBlur proxies all http images over https, the url can
            # change, so also match urls that are https in the original text
            # but http in the feed.
            if not img.get('src'):
                continue
            if img.get('src') in image_urls:
                image_urls.remove(img.get('src'))
            elif img.get('src').replace('https:', 'http:') in image_urls:
                image_urls.remove(img.get('src').replace('https:', 'http:'))

        if len(image_urls):
            image_content = f'<img src="{image_urls[0]}">'
            content = f"{image_content}\n {content}"

        return content
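
    # Clean extracted HTML: unwrap <noscript> fallbacks and absolutize
    # relative image URLs against the story's URL.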
    def rewrite_content(self, content):
        soup = BeautifulSoup(content, features="lxml")
        for noscript in soup.findAll('noscript'):
            if len(noscript.contents) > 0:
                noscript.replaceWith(noscript.contents[0])
        content = str(soup)

        # In BeautifulSoup, `'src' in img` checks a tag's children rather than
        # its attributes, so look in img.attrs for the src attribute.
        images = set([img['src'] for img in soup.findAll('img') if 'src' in img.attrs])
        for image_url in images:
            # self.story_url mirrors the story's permalink (see __init__) and,
            # unlike self.story, is also set when only a URL was given.
            abs_image_url = urljoin(self.story_url, image_url)
            content = content.replace(image_url, abs_image_url)
        return content
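
    # HTTP fetch with a hard 10-second time limit. With Mercury enabled, the
    # request is proxied through this site's original_text_fetcher endpoint,
    # authenticated by MERCURY_PARSER_API_KEY.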
    @timelimit(10)
    def fetch_request(self, use_mercury=True):
        headers = self.headers
        url = self.story_url
        if use_mercury:
            mercury_api_key = getattr(settings, 'MERCURY_PARSER_API_KEY', 'abc123')
            headers["content-type"] = "application/json"
            headers["x-api-key"] = mercury_api_key
            domain = Site.objects.get_current().domain
            url = f"https://{domain}/rss_feeds/original_text_fetcher?url={url}"

        try:
            r = requests.get(url, headers=headers, timeout=15)
            r.connection.close()
        except (AttributeError, SocketError, requests.ConnectionError,
                requests.models.MissingSchema, requests.sessions.InvalidSchema,
                requests.sessions.TooManyRedirects,
                requests.models.InvalidURL,
                requests.models.ChunkedEncodingError,
                requests.models.ContentDecodingError,
                requests.adapters.ReadTimeout,
                urllib3.exceptions.LocationValueError,
                LocationParseError, OpenSSLError, PyAsn1Error) as e:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e)
            return
        return r
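

# A minimal usage sketch (illustrative only; the URL and variable names below
# are hypothetical, not part of this module):
#
#     importer = TextImporter(story_url="https://example.com/article")
#     text = importer.fetch(skip_save=True, use_mercury=False)
#
# When constructed with a story instead of a bare URL, the winning extraction
# is compressed into story.original_text_z on save unless skip_save=True.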