import requests
import urllib3
import zlib
from simplejson.decoder import JSONDecodeError
from requests.packages.urllib3.exceptions import LocationParseError
from socket import error as SocketError
from mongoengine.queryset import NotUniqueError
from vendor.readability import readability
from lxml.etree import ParserError
from utils import log as logging
from utils.feed_functions import timelimit, TimeoutError
from OpenSSL.SSL import Error as OpenSSLError
from pyasn1.error import PyAsn1Error
from django.utils.encoding import smart_str, smart_bytes
from django.conf import settings
from django.contrib.sites.models import Site
from bs4 import BeautifulSoup
from urllib.parse import urljoin

BROKEN_URLS = [
    "gamespot.com",
    'thedailyskip.com',
]


class TextImporter:
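    """Fetch the "original text" of a story.

    Tries the Mercury parser proxy first (see fetch_request), then falls back to a
    local readability parse of the story's page.

    Illustrative usage (the real call sites live elsewhere in NewsBlur):

        original_text = TextImporter(story=story, feed=feed, request=request).fetch()
    """
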
    def __init__(self, story=None, feed=None, story_url=None, request=None, debug=False):
        self.story = story
        self.story_url = story_url
        if self.story and not self.story_url:
            self.story_url = self.story.story_permalink
        self.feed = feed
        self.request = request
        self.debug = debug

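    # Identify the fetcher to origin servers; the User-Agent carries the feed's
    # subscriber count so publishers can see how many readers a fetch represents.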
    @property
    def headers(self):
        num_subscribers = getattr(self.feed, 'num_subscribers', 0)
        return {
            'User-Agent': 'NewsBlur Content Fetcher - %s subscriber%s - %s %s' % (
                num_subscribers,
                's' if num_subscribers != 1 else '',
                getattr(self.feed, 'permalink', ''),
                getattr(self.feed, 'fake_user_agent', ''),
            ),
        }

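    # Try Mercury first (when enabled), then fall back to a local readability
    # parse if Mercury is disabled or returns nothing.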
    def fetch(self, skip_save=False, return_document=False, use_mercury=True):
        if self.story_url and any(broken_url in self.story_url for broken_url in BROKEN_URLS):
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: banned")
            return

        if use_mercury:
            results = self.fetch_mercury(skip_save=skip_save, return_document=return_document)

        if not use_mercury or not results:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY with Mercury, trying readability...", warn_color=False)
            results = self.fetch_manually(skip_save=skip_save, return_document=return_document)

        return results

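    # Ask the Mercury parser proxy for a JSON document describing the story and
    # hand the result to process_content. The proxy response is expected to look
    # roughly like this (only the fields used below; the exact schema comes from
    # the Mercury parser itself):
    #   {"title": "...", "content": "<div>...</div>", "url": "...",
    #    "lead_image_url": "...", "error": false, "messages": null}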
    def fetch_mercury(self, skip_save=False, return_document=False):
        try:
            resp = self.fetch_request(use_mercury=True)
        except TimeoutError:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
            resp = None
        except requests.exceptions.TooManyRedirects:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
            resp = None

        if not resp:
            return

        try:
            doc = resp.json()
        except JSONDecodeError:
            doc = None
        if not doc or doc.get('error', False):
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % (doc and doc.get('messages', None) or "[unknown mercury error]"))
            return

        text = doc['content']
        title = doc['title']
        url = doc['url']
        image = doc['lead_image_url']

        if image and ('http://' in image[1:] or 'https://' in image[1:]):
            logging.user(self.request, "~SN~FRRemoving broken image from text: %s" % image)
            image = None

        return self.process_content(text, title, url, image, skip_save=skip_save, return_document=return_document)

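    # Fallback path: fetch the page directly and run the bundled readability
    # parser over it to extract the article body and title.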
    def fetch_manually(self, skip_save=False, return_document=False):
        try:
            resp = self.fetch_request(use_mercury=False)
        except TimeoutError:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: timed out")
            resp = None
        except requests.exceptions.TooManyRedirects:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: too many redirects")
            resp = None

        if not resp:
            return

        try:
            text = resp.text
        except (LookupError, TypeError):
            text = resp.content

        # if self.debug:
        #     logging.user(self.request, "~FBOriginal text's website: %s" % text)

        # if resp.encoding and resp.encoding != 'utf-8':
        #     try:
        #         text = text.encode(resp.encoding)
        #     except (LookupError, UnicodeEncodeError):
        #         pass

        if text:
            text = text.replace("\xc2\xa0", " ")  # Non-breaking space, gets mangled when the encoding is not utf-8
            text = text.replace("\\u00a0", " ")   # Escaped non-breaking space, mangled the same way

        original_text_doc = readability.Document(text, url=resp.url,
                                                 positive_keywords="post, entry, postProp, article, postContent, postField")
        try:
            content = original_text_doc.summary(html_partial=True)
        except (readability.Unparseable, ParserError) as e:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e)
            return

        try:
            title = original_text_doc.title()
        except TypeError:
            title = ""

        url = resp.url

        return self.process_content(content, title, url, image=None, skip_save=skip_save, return_document=return_document,
                                    original_text_doc=original_text_doc)

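    # Decide whether the extracted text is an improvement over the stored story
    # content; if so, optionally save it, rewrite it, and return it.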
    def process_content(self, content, title, url, image, skip_save=False, return_document=False, original_text_doc=None):
        original_story_content = self.story and self.story.story_content_z and zlib.decompress(self.story.story_content_z)
        if not original_story_content:
            original_story_content = ""
        story_image_urls = self.story and self.story.image_urls
        if not story_image_urls:
            story_image_urls = []

        content = self.add_hero_image(content, story_image_urls)

        if content and len(content) > len(original_story_content):
            if self.story and not skip_save:
                self.story.original_text_z = zlib.compress(smart_bytes(content))
                try:
                    self.story.save()
                except NotUniqueError as e:
                    logging.user(self.request, ("~SN~FYFetched ~FGoriginal text~FY: %s" % (e)), warn_color=False)
                    pass
            logging.user(self.request, ("~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
                len(content),
                len(original_story_content)
            )), warn_color=False)
        else:
            logging.user(self.request, ("~SN~FRFailed~FY to fetch ~FGoriginal text~FY: was ~SB%s bytes" % (
                len(original_story_content)
            )), warn_color=False)
            return

        if content:
            content = self.rewrite_content(content)

        if return_document:
            return dict(content=content, title=title, url=url, doc=original_text_doc, image=image)

        return content

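    # Prepend the story's first image as a "hero" image when the extracted text
    # does not already include any of the story's images.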
    def add_hero_image(self, content, image_urls):
        # The original story needs to have images to offer; the extracted text may not include any of them
        if not len(image_urls):
            return content

        content_soup = BeautifulSoup(content, features="lxml")

        content_imgs = content_soup.findAll('img')
        for img in content_imgs:
            # Since NewsBlur proxies all http images over https, the url can change, so also match urls
            # that are https in the original text but http on the feed
            if not img.get('src'):
                continue
            if img.get('src') in image_urls:
                image_urls.remove(img.get('src'))
            elif img.get('src').replace('https:', 'http:') in image_urls:
                image_urls.remove(img.get('src').replace('https:', 'http:'))

        if len(image_urls):
            image_content = f'<img src="{image_urls[0]}">'
            content = f"{image_content}\n {content}"

        return content

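    # Unwrap <noscript> fallbacks and rewrite relative image urls to absolute
    # urls based on the story's permalink.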
    def rewrite_content(self, content):
        soup = BeautifulSoup(content, features="lxml")

        for noscript in soup.findAll('noscript'):
            if len(noscript.contents) > 0:
                noscript.replaceWith(noscript.contents[0])

        content = str(soup)

        # Use img.get('src') rather than `'src' in img`: on a bs4 Tag, `in` checks
        # the tag's contents, not its attributes, so the old test never matched.
        images = set([img['src'] for img in soup.findAll('img') if img.get('src')])
        for image_url in images:
            # Fall back to story_url when there is no story object (url-only fetches).
            base_url = self.story.story_permalink if self.story else self.story_url
            abs_image_url = urljoin(base_url, image_url)
            content = content.replace(image_url, abs_image_url)

        return content

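    # Perform the actual HTTP request, either directly against the story url or
    # through the Mercury parser proxy mounted on this site. Limited to 10 seconds.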
    @timelimit(10)
    def fetch_request(self, use_mercury=True):
        headers = self.headers
        url = self.story_url

        if use_mercury:
            mercury_api_key = getattr(settings, 'MERCURY_PARSER_API_KEY', 'abc123')
            headers["content-type"] = "application/json"
            headers["x-api-key"] = mercury_api_key
            domain = Site.objects.get_current().domain
            url = f"https://{domain}/rss_feeds/original_text_fetcher?url={url}"

        try:
            r = requests.get(url, headers=headers, timeout=15)
            r.connection.close()
        except (AttributeError, SocketError, requests.ConnectionError,
                requests.models.MissingSchema, requests.sessions.InvalidSchema,
                requests.sessions.TooManyRedirects,
                requests.models.InvalidURL,
                requests.models.ChunkedEncodingError,
                requests.models.ContentDecodingError,
                requests.adapters.ReadTimeout,
                urllib3.exceptions.LocationValueError,
                LocationParseError, OpenSSLError, PyAsn1Error) as e:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e)
            return
        return r