NewsBlur-viq/apps/rss_feeds/text_importer.py

247 lines
10 KiB
Python
Raw Normal View History

import requests
import urllib3
import zlib
2021-08-03 16:05:28 -04:00
import readability
2018-08-09 09:47:10 -04:00
from simplejson.decoder import JSONDecodeError
from requests.packages.urllib3.exceptions import LocationParseError
from socket import error as SocketError
from mongoengine.queryset import NotUniqueError
from lxml.etree import ParserError
from utils import log as logging
from utils.feed_functions import timelimit, TimeoutError
from OpenSSL.SSL import Error as OpenSSLError
2014-05-27 13:08:21 -07:00
from pyasn1.error import PyAsn1Error
from django.utils.encoding import smart_str
from django.conf import settings
2021-05-12 21:20:05 -04:00
from django.utils.encoding import smart_bytes
from django.contrib.sites.models import Site
2020-06-15 02:54:37 -04:00
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Domains whose pages are known to break the original-text extractor;
# TextImporter.fetch refuses outright to fetch any story URL containing one.
BROKEN_URLS = [
"gamespot.com",
'thedailyskip.com',
]
2015-11-27 08:18:34 +01:00
class TextImporter:
    """Fetch the full "original text" of a story from its permalink.

    Tries the Mercury parser service first, then falls back to a local
    readability extraction. The extracted text is stored (compressed) on the
    story when it is longer than the feed-provided content.
    """

    def __init__(self, story=None, feed=None, story_url=None, request=None, debug=False):
        self.story = story
        self.story_url = story_url
        # When constructed from a story, its permalink is the fetch URL.
        if self.story and not self.story_url:
            self.story_url = self.story.story_permalink
        self.feed = feed
        self.request = request
        self.debug = debug

    @property
    def headers(self):
        """HTTP headers identifying NewsBlur (and subscriber count) to origin servers."""
        num_subscribers = getattr(self.feed, 'num_subscribers', 0)
        return {
            'User-Agent': 'NewsBlur Content Fetcher - %s subscriber%s - %s %s' % (
                num_subscribers,
                's' if num_subscribers != 1 else '',
                getattr(self.feed, 'permalink', ''),
                getattr(self.feed, 'fake_user_agent', ''),
            ),
        }

    def fetch(self, skip_save=False, return_document=False, use_mercury=True):
        """Fetch original text, preferring Mercury, falling back to readability.

        Returns the processed content (or a dict when return_document=True),
        or None when the URL is banned or every fetch strategy failed.
        """
        if self.story_url and any(broken_url in self.story_url for broken_url in BROKEN_URLS):
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: banned")
            return
        results = None
        if use_mercury:
            results = self.fetch_mercury(skip_save=skip_save, return_document=return_document)
        if not results:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY with Mercury, trying readability...", warn_color=False)
            results = self.fetch_manually(skip_save=skip_save, return_document=return_document)
        return results

    def _attempt_fetch(self, use_mercury):
        """Run fetch_request, logging and returning None on timeouts/redirect loops.

        Shared by fetch_mercury and fetch_manually, which previously duplicated
        this try/except verbatim.
        """
        try:
            return self.fetch_request(use_mercury=use_mercury)
        except TimeoutError:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
        except requests.exceptions.TooManyRedirects:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
        return None

    def fetch_mercury(self, skip_save=False, return_document=False):
        """Fetch via the Mercury parser proxy endpoint; returns processed content or None."""
        resp = self._attempt_fetch(use_mercury=True)
        if not resp:
            return
        try:
            doc = resp.json()
        except JSONDecodeError:
            doc = None
        if not doc or doc.get('error', False):
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % (doc and doc.get('messages', None) or "[unknown mercury error]"))
            return
        text = doc['content']
        title = doc['title']
        url = doc['url']
        image = doc['lead_image_url']
        # A second scheme appearing past position 0 means a mangled/concatenated
        # image URL; drop it rather than embed a broken image.
        if image and ('http://' in image[1:] or 'https://' in image[1:]):
            logging.user(self.request, "~SN~FRRemoving broken image from text: %s" % image)
            image = None
        return self.process_content(text, title, url, image, skip_save=skip_save, return_document=return_document)

    def fetch_manually(self, skip_save=False, return_document=False):
        """Fetch the page directly and extract its text with readability."""
        resp = self._attempt_fetch(use_mercury=False)
        if not resp:
            return
        try:
            text = resp.text
        except (LookupError, TypeError):
            # Unknown/invalid charset: fall back to the raw bytes.
            text = resp.content
        if text:
            text = text.replace("\xc2\xa0", " ")  # Non-breaking space, is mangled when encoding is not utf-8
            text = text.replace("\\u00a0", " ")  # Non-breaking space, is mangled when encoding is not utf-8
        original_text_doc = readability.Document(text, url=resp.url,
                                                 positive_keywords="post, entry, postProp, article, postContent, postField")
        try:
            content = original_text_doc.summary(html_partial=True)
        except (readability.Unparseable, ParserError) as e:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e)
            return
        try:
            title = original_text_doc.title()
        except TypeError:
            title = ""
        url = resp.url
        return self.process_content(content, title, url, image=None, skip_save=skip_save, return_document=return_document,
                                    original_text_doc=original_text_doc)

    def process_content(self, content, title, url, image, skip_save=False, return_document=False, original_text_doc=None):
        """Store `content` on the story if it beats the feed-provided copy in length.

        Returns the rewritten content (or a dict of content/title/url/doc/image
        when return_document=True); returns None when the extraction was no
        longer than what the feed already provided.
        """
        original_story_content = self.story and self.story.story_content_z and zlib.decompress(self.story.story_content_z)
        if not original_story_content:
            original_story_content = ""
        story_image_urls = self.story and self.story.image_urls
        if not story_image_urls:
            story_image_urls = []
        content = self.add_hero_image(content, story_image_urls)
        # Only keep the extraction when it is longer than the feed's own content.
        if content and len(content) > len(original_story_content):
            if self.story and not skip_save:
                self.story.original_text_z = zlib.compress(smart_bytes(content))
                try:
                    self.story.save()
                except NotUniqueError as e:
                    # Another fetcher saved the same story concurrently; best-effort, ignore.
                    logging.user(self.request, ("~SN~FYFetched ~FGoriginal text~FY: %s" % (e)), warn_color=False)
            logging.user(self.request, ("~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
                len(content),
                len(original_story_content)
            )), warn_color=False)
        else:
            logging.user(self.request, ("~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % (
                len(original_story_content)
            )), warn_color=False)
            return
        if content:
            content = self.rewrite_content(content)
        if return_document:
            return dict(content=content, title=title, url=url, doc=original_text_doc, image=image)
        return content

    def add_hero_image(self, content, image_urls):
        """Prepend the story's first unused image so image-less extractions keep a hero image."""
        # Need to have images in the original story to add to the text that may not have any images
        if not len(image_urls):
            return content
        content_soup = BeautifulSoup(content, features="lxml")
        content_imgs = content_soup.findAll('img')
        for img in content_imgs:
            # Since NewsBlur proxies all http images over https, the url can change, so acknowledge urls
            # that are https on the original text but http on the feed
            if not img.get('src'):
                continue
            if img.get('src') in image_urls:
                image_urls.remove(img.get('src'))
            elif img.get('src').replace('https:', 'http:') in image_urls:
                image_urls.remove(img.get('src').replace('https:', 'http:'))
        if len(image_urls):
            image_content = f'<img src="{image_urls[0]}">'
            content = f"{image_content}\n {content}"
        return content

    def rewrite_content(self, content):
        """Unwrap <noscript> fallbacks and absolutize relative image URLs."""
        soup = BeautifulSoup(content, features="lxml")
        for noscript in soup.findAll('noscript'):
            if len(noscript.contents) > 0:
                noscript.replaceWith(noscript.contents[0])
        content = str(soup)
        # BUGFIX: `'src' in img` tested the tag's *contents* (children), not its
        # attributes, so the set was always empty and no URL was ever rewritten.
        # Read the attribute explicitly instead.
        images = set(img.get('src') for img in soup.findAll('img') if img.get('src'))
        # BUGFIX: fall back to story_url so a story-less (story_url-only)
        # import doesn't crash on self.story.story_permalink.
        base_url = self.story.story_permalink if self.story else self.story_url
        for image_url in images:
            abs_image_url = urljoin(base_url, image_url)
            content = content.replace(image_url, abs_image_url)
        return content

    @timelimit(10)
    def fetch_request(self, use_mercury=True):
        """GET the story URL, either directly or through the site's Mercury proxy.

        Returns the `requests` response, or None on any of the many known
        network/URL failures. Hard-limited to 10 seconds by @timelimit.
        """
        headers = self.headers
        url = self.story_url
        if use_mercury:
            mercury_api_key = getattr(settings, 'MERCURY_PARSER_API_KEY', 'abc123')
            headers["content-type"] = "application/json"
            headers["x-api-key"] = mercury_api_key
            domain = Site.objects.get_current().domain
            url = f"https://{domain}/rss_feeds/original_text_fetcher?url={url}"
        try:
            r = requests.get(url, headers=headers, timeout=15)
            # Proactively release the connection back to the pool.
            r.connection.close()
        except (AttributeError, SocketError, requests.ConnectionError,
                requests.models.MissingSchema, requests.sessions.InvalidSchema,
                requests.sessions.TooManyRedirects,
                requests.models.InvalidURL,
                requests.models.ChunkedEncodingError,
                requests.models.ContentDecodingError,
                requests.adapters.ReadTimeout,
                urllib3.exceptions.LocationValueError,
                LocationParseError, OpenSSLError, PyAsn1Error) as e:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e)
            return
        return r