# NewsBlur-viq/apps/rss_feeds/text_importer.py
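"""Fetch the full original text of a story's web page.

Tries a hosted Mercury Parser service first and falls back to local
readability-based extraction. Powers NewsBlur's original-text ("Text") view.
"""
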
import zlib
from socket import error as SocketError
from urllib.parse import urljoin

import requests
import urllib3
from bs4 import BeautifulSoup
from django.conf import settings
from django.contrib.sites.models import Site
from django.utils.encoding import smart_bytes, smart_str
from lxml.etree import ParserError
from mongoengine.queryset import NotUniqueError
from OpenSSL.SSL import Error as OpenSSLError
from pyasn1.error import PyAsn1Error
from requests.packages.urllib3.exceptions import LocationParseError
from simplejson.decoder import JSONDecodeError

from utils import log as logging
from utils.feed_functions import TimeoutError, timelimit
from vendor import readability
from vendor.readability.readability import Unparseable

# Domains for which original-text fetching is known to fail; requests to them
# are skipped outright ("banned").
BROKEN_URLS = [
    "gamespot.com",
    "thedailyskip.com",
]


class TextImporter:
    """Fetches and extracts the original full-text content for a story."""

    def __init__(self, story=None, feed=None, story_url=None, request=None, debug=False):
        self.story = story
        self.story_url = story_url
        # Fall back to the story's permalink when no explicit URL is given.
        if self.story and not self.story_url:
            self.story_url = self.story.story_permalink
        self.feed = feed
        self.request = request
        self.debug = debug

    @property
    def headers(self):
        # Identify the fetcher and the feed's audience in the User-Agent,
        # e.g. "NewsBlur Content Fetcher - 42 subscribers - <permalink> <agent>".
        num_subscribers = getattr(self.feed, "num_subscribers", 0)
        return {
            "User-Agent": "NewsBlur Content Fetcher - %s subscriber%s - %s %s"
            % (
                num_subscribers,
                "s" if num_subscribers != 1 else "",
                getattr(self.feed, "permalink", ""),
                getattr(self.feed, "fake_user_agent", ""),
            ),
        }

    def fetch(self, skip_save=False, return_document=False, use_mercury=True):
        if self.story_url and any(broken_url in self.story_url for broken_url in BROKEN_URLS):
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: banned")
            return

        if use_mercury:
            results = self.fetch_mercury(skip_save=skip_save, return_document=return_document)

        # Short-circuits: when use_mercury is False, fetch_manually runs without
        # the (unassigned) results variable ever being read.
        if not use_mercury or not results:
            logging.user(
                self.request,
                "~SN~FRFailed~FY to fetch ~FGoriginal text~FY with Mercury, trying readability...",
                warn_color=False,
            )
            results = self.fetch_manually(skip_save=skip_save, return_document=return_document)

        return results

    def fetch_mercury(self, skip_save=False, return_document=False):
        try:
            resp = self.fetch_request(use_mercury=True)
        except TimeoutError:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
            resp = None
        except requests.exceptions.TooManyRedirects:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
            resp = None

        if not resp:
            return

        try:
            doc = resp.json()
        except JSONDecodeError:
            doc = None

        if not doc or doc.get("error", False):
            logging.user(
                self.request,
                "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s"
                % (doc and doc.get("messages", None) or "[unknown mercury error]"),
            )
            return

        text = doc["content"]
        title = doc["title"]
        url = doc["url"]
        image = doc["lead_image_url"]
        # A second scheme after position 0 means two URLs were mashed together;
        # drop the image rather than embed a broken link.
        if image and ("http://" in image[1:] or "https://" in image[1:]):
            logging.user(self.request, "~SN~FRRemoving broken image from text: %s" % image)
            image = None

        return self.process_content(
            text, title, url, image, skip_save=skip_save, return_document=return_document
        )

    def fetch_manually(self, skip_save=False, return_document=False):
        try:
            resp = self.fetch_request(use_mercury=False)
        except TimeoutError:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
            resp = None
        except requests.exceptions.TooManyRedirects:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
            resp = None

        if not resp:
            return

        @timelimit(5)
        def extract_text(resp):
            # resp.text can stall on charset detection for some responses,
            # hence the time limit and the fallback to raw bytes.
            try:
                text = resp.text
            except (LookupError, TypeError):
                text = resp.content
            return text

        try:
            text = extract_text(resp)
        except TimeoutError:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out on resp.text")
            return

        # if self.debug:
        #     logging.user(self.request, "~FBOriginal text's website: %s" % text)

        # if resp.encoding and resp.encoding != 'utf-8':
        #     try:
        #         text = text.encode(resp.encoding)
        #     except (LookupError, UnicodeEncodeError):
        #         pass
        if text:
            text = text.replace("\xc2\xa0", " ")  # Non-breaking space; mangled when the encoding is not utf-8
            text = text.replace("\\u00a0", " ")  # Non-breaking space; mangled when the encoding is not utf-8

        original_text_doc = readability.Document(
            text, url=resp.url, positive_keywords="post, entry, postProp, article, postContent, postField"
        )
        try:
            content = original_text_doc.summary(html_partial=True)
        except (ParserError, Unparseable) as e:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e)
            return

        try:
            title = original_text_doc.title()
        except TypeError:
            title = ""
        url = resp.url

        return self.process_content(
            content,
            title,
            url,
            image=None,
            skip_save=skip_save,
            return_document=return_document,
            original_text_doc=original_text_doc,
        )

    def process_content(
        self, content, title, url, image, skip_save=False, return_document=False, original_text_doc=None
    ):
        original_story_content = (
            self.story and self.story.story_content_z and zlib.decompress(self.story.story_content_z)
        )
        if not original_story_content:
            original_story_content = ""
        story_image_urls = self.story and self.story.image_urls
        if not story_image_urls:
            story_image_urls = []

        content = self.add_hero_image(content, story_image_urls)

        if content:
            content = self.rewrite_content(content)

        # Only keep the fetched text when it beats what the feed already provided;
        # newsletters always win, since their stories are the email body itself.
        full_content_is_longer = False
        if self.feed and self.feed.is_newsletter:
            full_content_is_longer = True
        elif content and len(content) > len(original_story_content):
            full_content_is_longer = True

        if content and full_content_is_longer:
            if self.story and not skip_save:
                self.story.original_text_z = zlib.compress(smart_bytes(content))
                try:
                    self.story.save()
                except NotUniqueError as e:
                    logging.user(
                        self.request, ("~SN~FYFetched ~FGoriginal text~FY: %s" % (e)), warn_color=False
                    )
            logging.user(
                self.request,
                (
                    "~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes"
                    % (len(content), len(original_story_content))
                ),
                warn_color=False,
            )
        else:
            logging.user(
                self.request,
                (
                    "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes"
                    % (len(original_story_content))
                ),
                warn_color=False,
            )
            return

        if return_document:
            return dict(content=content, title=title, url=url, doc=original_text_doc, image=image)
        return content

    def add_hero_image(self, content, image_urls):
        # Only applies when the original story has images that the extracted
        # text may be missing.
        if not len(image_urls):
            return content

        content_soup = BeautifulSoup(content, features="lxml")

        content_imgs = content_soup.findAll("img")
        for img in content_imgs:
            # Since NewsBlur proxies all http images over https, the url can change, so acknowledge urls
            # that are https on the original text but http on the feed
            if not img.get("src"):
                continue
            if img.get("src") in image_urls:
                image_urls.remove(img.get("src"))
            elif img.get("src").replace("https:", "http:") in image_urls:
                image_urls.remove(img.get("src").replace("https:", "http:"))

        # Any story image not already present in the extracted text becomes the hero image.
        if len(image_urls):
            image_content = f'<img src="{image_urls[0]}">'
            content = f"{image_content}\n {content}"
        return content

    def rewrite_content(self, content):
        soup = BeautifulSoup(content, features="lxml")

        # Unwrap <noscript> fallbacks so lazy-loaded images survive extraction.
        for noscript in soup.findAll("noscript"):
            if len(noscript.contents) > 0:
                noscript.replaceWith(noscript.contents[0])

        content = str(soup)

        # Resolve relative image URLs against the story's permalink.
        images = set([img.attrs["src"] for img in soup.findAll("img") if "src" in img.attrs])
        for image_url in images:
            abs_image_url = urljoin(self.story_url, image_url)
            content = content.replace(image_url, abs_image_url)

        return content

    @timelimit(10)
    def fetch_request(self, use_mercury=True):
        headers = self.headers
        url = self.story_url

        if use_mercury:
            # Proxy the fetch through NewsBlur's Mercury-backed original-text
            # endpoint rather than hitting the story URL directly.
            mercury_api_key = getattr(settings, "MERCURY_PARSER_API_KEY", "abc123")
            headers["content-type"] = "application/json"
            headers["x-api-key"] = mercury_api_key
            domain = Site.objects.get_current().domain
            protocol = "https"
            if settings.DOCKERBUILD:
                domain = "haproxy"
                protocol = "http"
            url = f"{protocol}://{domain}/rss_feeds/original_text_fetcher?url={url}"

        try:
            r = requests.get(url, headers=headers, timeout=15)
            r.connection.close()
        except (
            AttributeError,
            SocketError,
            requests.ConnectionError,
            requests.models.MissingSchema,
            requests.sessions.InvalidSchema,
            requests.sessions.TooManyRedirects,
            requests.models.InvalidURL,
            requests.models.ChunkedEncodingError,
            requests.models.ContentDecodingError,
            requests.adapters.ReadTimeout,
            urllib3.exceptions.LocationValueError,
            LocationParseError,
            OpenSSLError,
            PyAsn1Error,
        ) as e:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e)
            return
        return r
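

# A minimal usage sketch (illustrative only; `story` and `feed` are assumed to be
# existing MStory/Feed instances from the surrounding app):
#
#   importer = TextImporter(story=story, feed=feed, debug=True)
#   text = importer.fetch(skip_save=True)        # extracted HTML, or None on failure
#   doc = importer.fetch(return_document=True)   # dict(content, title, url, doc, image)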