import requests
import re
import urllib.parse
import traceback
from vendor import feedparser
import time
import urllib.request, urllib.error, urllib.parse
import http.client
import zlib
from mongoengine.queryset import NotUniqueError
from socket import error as SocketError
from boto.s3.key import Key
from django.conf import settings
from django.utils.text import compress_string
from utils import log as logging
from apps.rss_feeds.models import MFeedPage
from utils.feed_functions import timelimit, TimeoutError
from OpenSSL.SSL import Error as OpenSSLError
from pyasn1.error import PyAsn1Error
# from utils.feed_functions import mail_feed_error_to_admin

BROKEN_PAGES = [
    'tag:',
    'info:',
    'uuid:',
    'urn:',
    '[]',
]

# Also change in reader_utils.js.
BROKEN_PAGE_URLS = [
    'nytimes.com',
    'github.com',
    'washingtonpost.com',
    'stackoverflow.com',
    'stackexchange.com',
    'twitter.com',
    'rankexploits',
    'gamespot.com',
    'espn.com',
    'royalroad.com',
]


# PageImporter fetches the HTML behind a feed's site link (or a single story's
# permalink), injects a <base href> so relative URLs still resolve, and stores
# the result on the original-page node server, S3, or MongoDB (MFeedPage).
class PageImporter(object):

    def __init__(self, feed, story=None, request=None):
        self.feed = feed
        self.story = story
        self.request = request

    @property
    def headers(self):
        return {
            'User-Agent': 'NewsBlur Page Fetcher - %s subscriber%s - %s %s' % (
                self.feed.num_subscribers,
                's' if self.feed.num_subscribers != 1 else '',
                self.feed.permalink,
                self.feed.fake_user_agent,
            ),
        }
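
    # Illustrative example of the header this property builds (the values below
    # are assumptions, not taken from the source): a feed with 25 subscribers
    # might send
    #   {'User-Agent': 'NewsBlur Page Fetcher - 25 subscribers - '
    #                  'https://www.newsblur.com/site/42/example Mozilla/5.0 ...'}
    # where the trailing browser token comes from self.feed.fake_user_agent.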

    def fetch_page(self, urllib_fallback=False, requests_exception=None):
        try:
            self.fetch_page_timeout(urllib_fallback=urllib_fallback, requests_exception=requests_exception)
        except TimeoutError:
            logging.user(self.request, ' ***> [%-30s] ~FBPage fetch ~SN~FRfailed~FB due to timeout' % (self.feed.log_title[:30]))
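
    # fetch_page_timeout() below is capped at 10 seconds by @timelimit. It tries
    # requests first; on requests-level failures it re-enters itself through
    # fetch_page(urllib_fallback=True) to retry with urllib.request, and records
    # the outcome via self.feed.save_page_history().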
    @timelimit(10)
    def fetch_page_timeout(self, urllib_fallback=False, requests_exception=None):
        html = None
        feed_link = self.feed.feed_link
        if not feed_link:
            self.save_no_page()
            return

        if feed_link.startswith('www'):
            self.feed.feed_link = 'http://' + feed_link
        try:
            if any(feed_link.startswith(s) for s in BROKEN_PAGES):
                self.save_no_page()
                return
            elif any(s in feed_link.lower() for s in BROKEN_PAGE_URLS):
                self.save_no_page()
                return
            elif feed_link.startswith('http'):
                if urllib_fallback:
                    request = urllib.request.Request(feed_link, headers=self.headers)
                    response = urllib.request.urlopen(request)
                    time.sleep(0.01)  # Grrr, GIL.
                    data = response.read()
                else:
                    try:
                        response = requests.get(feed_link, headers=self.headers)
                        response.connection.close()
                    except requests.exceptions.TooManyRedirects:
                        response = requests.get(feed_link)
                    except (AttributeError, SocketError, OpenSSLError, PyAsn1Error, TypeError) as e:
                        logging.debug(' ***> [%-30s] Page fetch failed using requests: %s' % (self.feed.log_title[:30], e))
                        self.save_no_page()
                        return
                    # try:
                    data = response.content
                    # except (LookupError, TypeError):
                    #     data = response.content

                    # if response.encoding and response.encoding != 'utf-8':
                    #     try:
                    #         data = data.encode(response.encoding)
                    #     except LookupError:
                    #         pass
            else:
                try:
                    data = open(feed_link, 'r').read()
                except IOError:
                    self.feed.feed_link = 'http://' + feed_link
                    self.fetch_page(urllib_fallback=True)
                    return
            if data:
                html = self.rewrite_page(data)
                if html:
                    self.save_page(html)
                else:
                    self.save_no_page()
                    return
            else:
                self.save_no_page()
                return
        except (ValueError, urllib.error.URLError, http.client.BadStatusLine, http.client.InvalidURL,
                requests.exceptions.ConnectionError) as e:
            self.feed.save_page_history(401, "Bad URL", e)
            fp = feedparser.parse(self.feed.feed_address)
            feed_link = fp.feed.get('link', "")
            self.feed.save()
            logging.debug(' ***> [%-30s] Page fetch failed: %s' % (self.feed.log_title[:30], e))
        except (urllib.error.HTTPError) as e:
            self.feed.save_page_history(e.code, e.msg, e.fp.read())
        except (http.client.IncompleteRead) as e:
            self.feed.save_page_history(500, "IncompleteRead", e)
        except (requests.exceptions.RequestException,
                requests.packages.urllib3.exceptions.HTTPError) as e:
            logging.debug(' ***> [%-30s] Page fetch failed using requests: %s' % (self.feed.log_title[:30], e))
            # mail_feed_error_to_admin(self.feed, e, local_vars=locals())
            return self.fetch_page(urllib_fallback=True, requests_exception=e)
        except Exception as e:
            logging.debug('[%d] ! -------------------------' % (self.feed.id,))
            tb = traceback.format_exc()
            logging.debug(tb)
            logging.debug('[%d] ! -------------------------' % (self.feed.id,))
            self.feed.save_page_history(500, "Error", tb)
            # mail_feed_error_to_admin(self.feed, e, local_vars=locals())
            if (not settings.DEBUG and hasattr(settings, 'RAVEN_CLIENT') and
                    settings.RAVEN_CLIENT):
                settings.RAVEN_CLIENT.captureException()
            if not urllib_fallback:
                self.fetch_page(urllib_fallback=True)
        else:
            self.feed.save_page_history(200, "OK")

        return html

    def fetch_story(self):
        html = None
        try:
            html = self._fetch_story()
        except TimeoutError:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal story~FY: timed out")
        except requests.exceptions.TooManyRedirects:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal story~FY: too many redirects")

        return html

    @timelimit(10)
    def _fetch_story(self):
        html = None
        story_permalink = self.story.story_permalink

        if not self.feed:
            return
        if any(story_permalink.startswith(s) for s in BROKEN_PAGES):
            return
        if any(s in story_permalink.lower() for s in BROKEN_PAGE_URLS):
            return
        if not story_permalink.startswith('http'):
            return

        try:
            response = requests.get(story_permalink, headers=self.headers)
            response.connection.close()
        except (AttributeError, SocketError, OpenSSLError, PyAsn1Error,
                requests.exceptions.ConnectionError, requests.exceptions.TooManyRedirects) as e:
            try:
                response = requests.get(story_permalink)
            except (AttributeError, SocketError, OpenSSLError, PyAsn1Error,
                    requests.exceptions.ConnectionError, requests.exceptions.TooManyRedirects) as e:
                logging.debug(' ***> [%-30s] Original story fetch failed using requests: %s' % (self.feed.log_title[:30], e))
                return
        try:
            data = response.text
        except (LookupError, TypeError):
            data = response.content

        if response.encoding and response.encoding != 'utf-8':
            try:
                data = data.encode(response.encoding)
            except (LookupError, UnicodeEncodeError):
                pass

        if data:
            data = data.replace("\xc2\xa0", " ")  # Non-breaking space is mangled when the encoding is not utf-8
            data = data.replace("\\u00a0", " ")  # Non-breaking space is mangled when the encoding is not utf-8
            html = self.rewrite_page(data)
            if not html:
                return
            self.save_story(html)

        return html

    def save_story(self, html):
        # zlib.compress() requires bytes under Python 3, so encode str pages first.
        if isinstance(html, str):
            html = html.encode('utf-8')
        self.story.original_page_z = zlib.compress(html)
        try:
            self.story.save()
        except NotUniqueError:
            pass

    def save_no_page(self):
        logging.debug(' ---> [%-30s] ~FYNo original page: %s' % (self.feed.log_title[:30], self.feed.feed_link))
        self.feed.has_page = False
        self.feed.save()
        self.feed.save_page_history(404, "Feed has no original page.")

    def rewrite_page(self, response):
        BASE_RE = re.compile(r'<head(.*?\>)', re.I)
        base_code = '<base href="%s" />' % (self.feed.feed_link,)
        try:
            html = BASE_RE.sub(r'<head\1 ' + base_code, response)
        except:
            # response = response.decode('latin1').encode('utf-8')
            # html = BASE_RE.sub(r'<head\1 ' + base_code, response)
            return None

        if '<base href' not in html:
            html = "%s %s" % (base_code, html)

        # html = self.fix_urls(html)

        return html.strip()
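
    # Illustrative example (values are assumptions, not from the source): with a
    # feed_link of "https://example.com/blog/", rewrite_page() turns
    #   '<html><head><title>t</title></head><body>...</body></html>'
    # into
    #   '<html><head> <base href="https://example.com/blog/" /><title>t</title></head><body>...</body></html>'
    # so relative URLs in the stored copy resolve against the feed's site.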

    def fix_urls(self, document):
        # BEWARE: This will rewrite URLs inside of <script> tags. You know, like
        # Google Analytics. Ugh.

        FIND_RE = re.compile(r'\b(href|src)\s*=\s*("[^"]*"|\'[^\']*\'|[^"\'<>=\s]+)')
        ret = []
        last_end = 0

        for match in FIND_RE.finditer(document):
            url = match.group(2)
            if url[0] in "\"'":
                url = url.strip(url[0])
            parsed = urllib.parse.urlparse(url)
            if parsed.scheme == parsed.netloc == '':  # relative to domain
                url = urllib.parse.urljoin(self.feed.feed_link, url)
                ret.append(document[last_end:match.start(2)])
                ret.append('"%s"' % (url,))
                last_end = match.end(2)
        ret.append(document[last_end:])

        return ''.join(ret)
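
    # Illustrative example (values are assumptions, not from the source): with a
    # feed_link of "https://example.com/",
    #   fix_urls('<a href="/about">About</a> <img src=logo.png>')
    # returns
    #   '<a href="https://example.com/about">About</a> <img src="https://example.com/logo.png">'
    # Absolute URLs pass through untouched. Note that the call site in
    # rewrite_page() above is currently commented out.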

    def save_page(self, html):
        saved = False

        if not html or len(html) < 100:
            return

        # The storage backends below (node page server, S3, MongoDB) all compress
        # the page, which requires bytes, so encode str pages first.
        if isinstance(html, str):
            html = html.encode('utf-8')

        if settings.BACKED_BY_AWS.get('pages_on_node'):
            saved = self.save_page_node(html)
            if saved and self.feed.s3_page and settings.BACKED_BY_AWS.get('pages_on_s3'):
                self.delete_page_s3()

        if settings.BACKED_BY_AWS.get('pages_on_s3') and not saved:
            saved = self.save_page_s3(html)

        if not saved:
            try:
                feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
                # feed_page.page_data = html.encode('utf-8')
                if feed_page.page() == html:
                    logging.debug(' ---> [%-30s] ~FYNo change in page data: %s' % (self.feed.log_title[:30], self.feed.feed_link))
                else:
                    # logging.debug(' ---> [%-30s] ~FYChange in page data: %s (%s/%s %s/%s)' % (self.feed.log_title[:30], self.feed.feed_link, type(html), type(feed_page.page()), len(html), len(feed_page.page())))
                    feed_page.page_data = zlib.compress(html)
                    feed_page.save()
            except MFeedPage.DoesNotExist:
                feed_page = MFeedPage.objects.create(feed_id=self.feed.pk,
                                                     page_data=zlib.compress(html))
            return feed_page

    def save_page_node(self, html):
        url = "http://%s/original_page/%s" % (
            settings.ORIGINAL_PAGE_SERVER,
            self.feed.pk,
        )
        response = requests.post(url, files={
            'original_page': compress_string(html),
            # 'original_page': html,
        })
        if response.status_code == 200:
            return True

    def save_page_s3(self, html):
        k = Key(settings.S3_CONN.get_bucket(settings.S3_PAGES_BUCKET_NAME))
        k.key = self.feed.s3_pages_key
        k.set_metadata('Content-Encoding', 'gzip')
        k.set_metadata('Content-Type', 'text/html')
        k.set_metadata('Access-Control-Allow-Origin', '*')
        k.set_contents_from_string(compress_string(html))
        k.set_acl('public-read')

        try:
            feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
            feed_page.delete()
            logging.debug(' ---> [%-30s] ~FYTransferring page data to S3...' % (self.feed.log_title[:30]))
        except MFeedPage.DoesNotExist:
            pass

        if not self.feed.s3_page:
            self.feed.s3_page = True
            self.feed.save()

        return True

    def delete_page_s3(self):
        k = Key(settings.S3_CONN.get_bucket(settings.S3_PAGES_BUCKET_NAME))
        k.key = self.feed.s3_pages_key
        k.delete()

        self.feed.s3_page = False
        self.feed.save()
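
# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how this importer is typically driven: build a
# PageImporter around a feed (and optionally a story) and call fetch_page()
# or fetch_story(). The Feed/MStory lookups below are assumptions added for
# illustration; in the project itself the feed fetchers wire this up.
#
# from apps.rss_feeds.models import Feed, MStory
#
# feed = Feed.objects.get(pk=42)              # hypothetical feed id
# PageImporter(feed).fetch_page()             # fetch and store the site's page
#
# story = MStory.objects(story_feed_id=feed.pk).first()
# if story:
#     PageImporter(feed, story=story).fetch_story()   # store one story's page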