mirror of https://github.com/samuelclay/NewsBlur.git
synced 2025-08-05 16:58:59 +00:00

* django3.0: (27 commits)
  Removing log override
  Moving logging over to the newsblur log.
  Fixing search indexer background task for new celery.
  Attempting to add gunicorn errors to console/log.
  Better handling of missing subs.
  Handling missing user sub on feed delete.
  Correct encoding for strings on systems that don't have utf-8 as default encoding.
  Writing in the real urllib3 dependency for requests.
  Upgrading requests due to urllib3 incompatibility.
  Login required should use the next parameter.
  Upgrading django oauth toolkit for django 1.11.
  Handling newsletters with multiple recipients.
  Extracting image urls sometimes fails.
  Handling ajax errors in json views.
  Adding timeouts to most outbound requests.
  Sentry SDK 0.19.4.
  Removing imperfect proxy warning for every story.
  Found four more GET/POST crosses.
  Feed unread count may need a POST.
  Namespacing settings.
  ...
345 lines
13 KiB
Python
import requests
import re
import urllib.parse
import traceback
import feedparser
import time
import urllib.request, urllib.error, urllib.parse
import http.client
import zlib
from mongoengine.queryset import NotUniqueError
from socket import error as SocketError
from boto.s3.key import Key
from django.conf import settings
from django.utils.text import compress_string
from utils import log as logging
from apps.rss_feeds.models import MFeedPage
from utils.feed_functions import timelimit, TimeoutError
from OpenSSL.SSL import Error as OpenSSLError
from pyasn1.error import PyAsn1Error
# from utils.feed_functions import mail_feed_error_to_admin

BROKEN_PAGES = [
    'tag:',
    'info:',
    'uuid:',
    'urn:',
    '[]',
]

# Also change in reader_utils.js.
BROKEN_PAGE_URLS = [
    'nytimes.com',
    'github.com',
    'washingtonpost.com',
    'stackoverflow.com',
    'stackexchange.com',
    'twitter.com',
    'rankexploits',
    'gamespot.com',
    'espn.com',
    'royalroad.com',
]


class PageImporter(object):
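    """
    Fetches and stores the original HTML page behind a feed, or behind a
    single story, for NewsBlur's original-page view.

    Illustrative usage, sketched from the constructor and public methods
    rather than taken from any particular caller:

        PageImporter(feed).fetch_page()
        PageImporter(feed, story=story, request=request).fetch_story()
    """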

    def __init__(self, feed, story=None, request=None):
        self.feed = feed
        self.story = story
        self.request = request

    @property
    def headers(self):
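        """Build the User-Agent header sent with every fetch, identifying the
        NewsBlur page fetcher, the feed's subscriber count, its permalink, and
        the feed's fake_user_agent."""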
        return {
            'User-Agent': 'NewsBlur Page Fetcher - %s subscriber%s - %s %s' % (
                self.feed.num_subscribers,
                's' if self.feed.num_subscribers != 1 else '',
                self.feed.permalink,
                self.feed.fake_user_agent,
            ),
        }

    def fetch_page(self, urllib_fallback=False, requests_exception=None):
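        """Fetch the feed's page under a hard time limit, logging (rather than
        raising) if the fetch times out."""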
        try:
            self.fetch_page_timeout(urllib_fallback=urllib_fallback, requests_exception=requests_exception)
        except TimeoutError:
            logging.user(self.request, ' ***> [%-30s] ~FBPage fetch ~SN~FRfailed~FB due to timeout' % (self.feed.log_title[:30]))

    @timelimit(10)
    def fetch_page_timeout(self, urllib_fallback=False, requests_exception=None):
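        """
        Fetch and store the feed's original page, limited to 10 seconds.

        Feeds without a usable link, or whose link matches BROKEN_PAGES or
        BROKEN_PAGE_URLS, are recorded as having no page. HTTP links are
        fetched with requests (or urllib when falling back); fetch failures
        are written to the feed's page history, and unexpected requests
        errors trigger a single retry through the urllib fallback.
        """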
        html = None
        feed_link = self.feed.feed_link
        if not feed_link:
            self.save_no_page()
            return

        if feed_link.startswith('www'):
            self.feed.feed_link = 'http://' + feed_link
        try:
            if any(feed_link.startswith(s) for s in BROKEN_PAGES):
                self.save_no_page()
                return
            elif any(s in feed_link.lower() for s in BROKEN_PAGE_URLS):
                self.save_no_page()
                return
            elif feed_link.startswith('http'):
                if urllib_fallback:
                    request = urllib.request.Request(feed_link, headers=self.headers)
                    response = urllib.request.urlopen(request)
                    time.sleep(0.01)  # Grrr, GIL.
                    data = response.read()
                else:
                    try:
                        response = requests.get(feed_link, headers=self.headers, timeout=10)
                        response.connection.close()
                    except requests.exceptions.TooManyRedirects:
                        response = requests.get(feed_link, timeout=10)
                    except (AttributeError, SocketError, OpenSSLError, PyAsn1Error, TypeError,
                            requests.adapters.ReadTimeout) as e:
                        logging.debug(' ***> [%-30s] Page fetch failed using requests: %s' % (self.feed.log_title[:30], e))
                        self.save_no_page()
                        return
                    # try:
                    data = response.content
                    # except (LookupError, TypeError):
                    #     data = response.content

                    # if response.encoding and response.encoding != 'utf-8':
                    #     try:
                    #         data = data.encode(response.encoding)
                    #     except LookupError:
                    #         pass
            else:
                try:
                    with open(feed_link, 'r') as page_file:
                        data = page_file.read()
                except IOError:
                    self.feed.feed_link = 'http://' + feed_link
                    self.fetch_page(urllib_fallback=True)
                    return
            if data:
                html = self.rewrite_page(data)
                if html:
                    self.save_page(html)
                else:
                    self.save_no_page()
                    return
            else:
                self.save_no_page()
                return
        except (ValueError, urllib.error.URLError, http.client.BadStatusLine, http.client.InvalidURL,
                requests.exceptions.ConnectionError) as e:
            self.feed.save_page_history(401, "Bad URL", e)
            fp = feedparser.parse(self.feed.feed_address)
            feed_link = fp.feed.get('link', "")
            self.feed.save()
            logging.debug(' ***> [%-30s] Page fetch failed: %s' % (self.feed.log_title[:30], e))
        except (urllib.error.HTTPError) as e:
            self.feed.save_page_history(e.code, e.msg, e.fp.read())
        except (http.client.IncompleteRead) as e:
            self.feed.save_page_history(500, "IncompleteRead", e)
        except (requests.exceptions.RequestException,
                requests.packages.urllib3.exceptions.HTTPError) as e:
            logging.debug(' ***> [%-30s] Page fetch failed using requests: %s' % (self.feed.log_title[:30], e))
            # mail_feed_error_to_admin(self.feed, e, local_vars=locals())
            return self.fetch_page(urllib_fallback=True, requests_exception=e)
        except Exception as e:
            logging.debug('[%d] ! -------------------------' % (self.feed.id,))
            tb = traceback.format_exc()
            logging.debug(tb)
            logging.debug('[%d] ! -------------------------' % (self.feed.id,))
            self.feed.save_page_history(500, "Error", tb)
            # mail_feed_error_to_admin(self.feed, e, local_vars=locals())
            if (not settings.DEBUG and hasattr(settings, 'RAVEN_CLIENT') and
                    settings.RAVEN_CLIENT):
                settings.RAVEN_CLIENT.captureException()
            if not urllib_fallback:
                self.fetch_page(urllib_fallback=True)
        else:
            self.feed.save_page_history(200, "OK")

        return html

    def fetch_story(self):
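        """Fetch the story's original page, swallowing timeouts and redirect
        loops, and return the rewritten HTML (or None on failure)."""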
        html = None
        try:
            html = self._fetch_story()
        except TimeoutError:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal story~FY: timed out")
        except requests.exceptions.TooManyRedirects:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal story~FY: too many redirects")

        return html

    @timelimit(10)
    def _fetch_story(self):
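        """
        Fetch the story's permalink within a 10-second time limit, decode it,
        rewrite it with a <base> tag, and compress it onto the story.

        Permalinks that are missing, non-HTTP, or on known-broken hosts are
        skipped. A failed request with custom headers is retried once without
        them.
        """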
        html = None
        story_permalink = self.story.story_permalink

        if not self.feed:
            return
        if any(story_permalink.startswith(s) for s in BROKEN_PAGES):
            return
        if any(s in story_permalink.lower() for s in BROKEN_PAGE_URLS):
            return
        if not story_permalink.startswith('http'):
            return

        try:
            response = requests.get(story_permalink, headers=self.headers, timeout=10)
            response.connection.close()
        except (AttributeError, SocketError, OpenSSLError, PyAsn1Error,
                requests.exceptions.ConnectionError,
                requests.exceptions.TooManyRedirects,
                requests.adapters.ReadTimeout) as e:
            try:
                response = requests.get(story_permalink, timeout=10)
            except (AttributeError, SocketError, OpenSSLError, PyAsn1Error,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.TooManyRedirects,
                    requests.adapters.ReadTimeout) as e:
                logging.debug(' ***> [%-30s] Original story fetch failed using requests: %s' % (self.feed.log_title[:30], e))
                return
        try:
            data = response.text
        except (LookupError, TypeError):
            data = response.content

        if response.encoding and response.encoding != 'utf-8':
            try:
                data = data.encode(response.encoding)
            except (LookupError, UnicodeEncodeError):
                pass

        if data:
            data = data.replace("\xc2\xa0", " ")  # Non-breaking space gets mangled when the encoding is not utf-8
            data = data.replace("\\u00a0", " ")  # Same for the escaped form of the non-breaking space
            html = self.rewrite_page(data)
            if not html:
                return
            self.save_story(html)

        return html

    def save_story(self, html):
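        """Compress the rewritten HTML onto the story, ignoring duplicate-key
        errors on save."""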
        self.story.original_page_z = zlib.compress(html)
        try:
            self.story.save()
        except NotUniqueError:
            pass

    def save_no_page(self):
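        """Mark the feed as having no original page and record a 404 in its
        page fetch history."""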
        logging.debug(' ---> [%-30s] ~FYNo original page: %s' % (self.feed.log_title[:30], self.feed.feed_link))
        self.feed.has_page = False
        self.feed.save()
        self.feed.save_page_history(404, "Feed has no original page.")

    def rewrite_page(self, response):
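        """Inject a <base href> pointing at the feed's link into the fetched
        HTML so relative URLs resolve, returning None if the HTML can't be
        rewritten."""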
        BASE_RE = re.compile(r'<head(.*?\>)', re.I)
        base_code = '<base href="%s" />' % (self.feed.feed_link,)
        try:
            html = BASE_RE.sub(r'<head\1 ' + base_code, response)
        except Exception:
            # response = response.decode('latin1').encode('utf-8')
            # html = BASE_RE.sub(r'<head\1 ' + base_code, response)
            return None

        if '<base href' not in html:
            html = "%s %s" % (base_code, html)

        # html = self.fix_urls(html)

        return html.strip()

    def fix_urls(self, document):
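        """Rewrite relative href/src attributes in the document to absolute
        URLs against the feed's link. (The call from rewrite_page is currently
        commented out.)"""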
        # BEWARE: This will rewrite URLs inside of <script> tags. You know, like
        # Google Analytics. Ugh.

        FIND_RE = re.compile(r'\b(href|src)\s*=\s*("[^"]*"|\'[^\']*\'|[^"\'<>=\s]+)')
        ret = []
        last_end = 0

        for match in FIND_RE.finditer(document):
            url = match.group(2)
            if url[0] in "\"'":
                url = url.strip(url[0])
            parsed = urllib.parse.urlparse(url)
            if parsed.scheme == parsed.netloc == '':  # relative to domain
                url = urllib.parse.urljoin(self.feed.feed_link, url)
                ret.append(document[last_end:match.start(2)])
                ret.append('"%s"' % (url,))
                last_end = match.end(2)
        ret.append(document[last_end:])

        return ''.join(ret)

    def save_page(self, html):
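        """
        Persist the rewritten page HTML, preferring the original-page node
        server, then S3, then MongoDB (MFeedPage), depending on the
        BACKED_BY_AWS settings. Pages under 100 characters are ignored.
        """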
        saved = False

        if not html or len(html) < 100:
            return

        if settings.BACKED_BY_AWS.get('pages_on_node'):
            saved = self.save_page_node(html)
            if saved and self.feed.s3_page and settings.BACKED_BY_AWS.get('pages_on_s3'):
                self.delete_page_s3()

        if settings.BACKED_BY_AWS.get('pages_on_s3') and not saved:
            saved = self.save_page_s3(html)

        if not saved:
            try:
                feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
                # feed_page.page_data = html.encode('utf-8')
                if feed_page.page() == html:
                    logging.debug(' ---> [%-30s] ~FYNo change in page data: %s' % (self.feed.log_title[:30], self.feed.feed_link))
                else:
                    # logging.debug(' ---> [%-30s] ~FYChange in page data: %s (%s/%s %s/%s)' % (self.feed.log_title[:30], self.feed.feed_link, type(html), type(feed_page.page()), len(html), len(feed_page.page())))
                    feed_page.page_data = zlib.compress(html)
                    feed_page.save()
            except MFeedPage.DoesNotExist:
                feed_page = MFeedPage.objects.create(feed_id=self.feed.pk,
                                                     page_data=zlib.compress(html))
            return feed_page

    def save_page_node(self, html):
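        """POST the gzip-compressed page HTML to the original-page node server
        (ORIGINAL_PAGE_SERVER) and return True on a 200 response."""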
        url = "http://%s/original_page/%s" % (
            settings.ORIGINAL_PAGE_SERVER,
            self.feed.pk,
        )
        response = requests.post(url, files={
            'original_page': compress_string(html),
            # 'original_page': html,
        })
        if response.status_code == 200:
            return True

    def save_page_s3(self, html):
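        """Upload the gzip-compressed page HTML to the S3 pages bucket as a
        public-read object, delete any MongoDB copy, and flag the feed as
        served from S3."""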
        k = Key(settings.S3_CONN.get_bucket(settings.S3_PAGES_BUCKET_NAME))
        k.key = self.feed.s3_pages_key
        k.set_metadata('Content-Encoding', 'gzip')
        k.set_metadata('Content-Type', 'text/html')
        k.set_metadata('Access-Control-Allow-Origin', '*')
        k.set_contents_from_string(compress_string(html))
        k.set_acl('public-read')

        try:
            feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
            feed_page.delete()
            logging.debug(' ---> [%-30s] ~FYTransferring page data to S3...' % (self.feed.log_title[:30]))
        except MFeedPage.DoesNotExist:
            pass

        if not self.feed.s3_page:
            self.feed.s3_page = True
            self.feed.save()

        return True

    def delete_page_s3(self):
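        """Delete the feed's page object from S3 and clear the feed's s3_page
        flag."""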
        k = Key(settings.S3_CONN.get_bucket(settings.S3_PAGES_BUCKET_NAME))
        k.key = self.feed.s3_pages_key
        k.delete()

        self.feed.s3_page = False
        self.feed.save()