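# Fetches the original HTML page behind a feed (or an individual story),
# injects a <base href> tag so relative URLs resolve against the feed link,
# and stores the result in MongoDB, on S3, or on the original page server,
# depending on settings.BACKED_BY_AWS.
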
import requests
import re
import urllib.parse
import traceback
from vendor import feedparser
import time
import urllib.request
import urllib.error
import http.client
import zlib
from mongoengine.queryset import NotUniqueError
from socket import error as SocketError
from boto.s3.key import Key
from django.conf import settings
from django.utils.text import compress_string
from utils import log as logging
from apps.rss_feeds.models import MFeedPage
from utils.feed_functions import timelimit, TimeoutError
from OpenSSL.SSL import Error as OpenSSLError
from pyasn1.error import PyAsn1Error
# from utils.feed_functions import mail_feed_error_to_admin
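
# Feed links starting with these prefixes (BROKEN_PAGES) or containing these
# hosts (BROKEN_PAGE_URLS) never get an original page fetched.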
BROKEN_PAGES = [
    'tag:',
    'info:',
    'uuid:',
    'urn:',
    '[]',
]

# Also change in reader_utils.js.
BROKEN_PAGE_URLS = [
    'nytimes.com',
    'github.com',
    'washingtonpost.com',
    'stackoverflow.com',
    'stackexchange.com',
    'twitter.com',
    'rankexploits',
    'gamespot.com',
    'espn.com',
    'royalroad.com',
]
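
# Imports the original page for a feed (fetch_page) or for a single story
# (fetch_story) and persists it via save_page / save_story.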
class PageImporter(object):

    def __init__(self, feed, story=None, request=None):
        self.feed = feed
        self.story = story
        self.request = request
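
    # Request headers sent with every fetch; the User-Agent advertises the
    # feed's subscriber count and permalink.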
    @property
    def headers(self):
        return {
            'User-Agent': 'NewsBlur Page Fetcher - %s subscriber%s - %s '
                          '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
                          'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
                          'Safari/534.48.3)' % (
                self.feed.num_subscribers,
                's' if self.feed.num_subscribers != 1 else '',
                self.feed.permalink,
            ),
        }
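
    # Public entry point: fetch the feed's original page, logging a failure if
    # the 10-second time limit in fetch_page_timeout is exceeded.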
    def fetch_page(self, urllib_fallback=False, requests_exception=None):
        try:
            self.fetch_page_timeout(urllib_fallback=urllib_fallback, requests_exception=requests_exception)
        except TimeoutError:
            logging.user(self.request, ' ***> [%-30s] ~FBPage fetch ~SN~FRfailed~FB due to timeout' % (self.feed.log_title[:30]))
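
    # Does the actual page fetch: skips broken or blocked links, downloads the
    # page with requests (or urllib when falling back), rewrites it, and saves
    # it. Fetch errors are recorded in the feed's page history.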
    @timelimit(10)
    def fetch_page_timeout(self, urllib_fallback=False, requests_exception=None):
        html = None
        feed_link = self.feed.feed_link
        if not feed_link:
            self.save_no_page()
            return

        if feed_link.startswith('www'):
            self.feed.feed_link = 'http://' + feed_link
        try:
            # Skip non-HTTP identifiers and known-broken sites entirely.
            if any(feed_link.startswith(s) for s in BROKEN_PAGES):
                self.save_no_page()
                return
            elif any(s in feed_link.lower() for s in BROKEN_PAGE_URLS):
                self.save_no_page()
                return
            elif feed_link.startswith('http'):
                if urllib_fallback:
                    # Fallback path, used after a requests-based fetch has failed.
                    request = urllib.request.Request(feed_link, headers=self.headers)
                    response = urllib.request.urlopen(request)
                    time.sleep(0.01) # Grrr, GIL.
                    data = response.read()
                else:
                    try:
                        response = requests.get(feed_link, headers=self.headers)
                        response.connection.close()
                    except requests.exceptions.TooManyRedirects:
                        response = requests.get(feed_link)
                    except (AttributeError, SocketError, OpenSSLError, PyAsn1Error, TypeError) as e:
                        logging.debug(' ***> [%-30s] Page fetch failed using requests: %s' % (self.feed.log_title[:30], e))
                        self.save_no_page()
                        return
                    # try:
                    data = response.content
                    # except (LookupError, TypeError):
                    #     data = response.content

                    # if response.encoding and response.encoding != 'utf-8':
                    #     try:
                    #         data = data.encode(response.encoding)
                    #     except LookupError:
                    #         pass
            else:
                # Not an http(s) URL: try it as a local file, then retry as http://.
                try:
                    data = open(feed_link, 'r').read()
                except IOError:
                    self.feed.feed_link = 'http://' + feed_link
                    self.fetch_page(urllib_fallback=True)
                    return
            if data:
                html = self.rewrite_page(data)
                if html:
                    self.save_page(html)
                else:
                    self.save_no_page()
                    return
            else:
                self.save_no_page()
                return
        except (ValueError, urllib.error.URLError, http.client.BadStatusLine, http.client.InvalidURL,
                requests.exceptions.ConnectionError) as e:
            self.feed.save_page_history(401, "Bad URL", e)
            fp = feedparser.parse(self.feed.feed_address)
            feed_link = fp.feed.get('link', "")
            self.feed.save()
            logging.debug(' ***> [%-30s] Page fetch failed: %s' % (self.feed.log_title[:30], e))
        except (urllib.error.HTTPError) as e:
            self.feed.save_page_history(e.code, e.msg, e.fp.read())
        except (http.client.IncompleteRead) as e:
            self.feed.save_page_history(500, "IncompleteRead", e)
        except (requests.exceptions.RequestException,
                requests.packages.urllib3.exceptions.HTTPError) as e:
            logging.debug(' ***> [%-30s] Page fetch failed using requests: %s' % (self.feed.log_title[:30], e))
            # mail_feed_error_to_admin(self.feed, e, local_vars=locals())
            return self.fetch_page(urllib_fallback=True, requests_exception=e)
        except Exception as e:
            logging.debug('[%d] ! -------------------------' % (self.feed.id,))
            tb = traceback.format_exc()
            logging.debug(tb)
            logging.debug('[%d] ! -------------------------' % (self.feed.id,))
            self.feed.save_page_history(500, "Error", tb)
            # mail_feed_error_to_admin(self.feed, e, local_vars=locals())
            if (not settings.DEBUG and hasattr(settings, 'RAVEN_CLIENT') and
                    settings.RAVEN_CLIENT):
                settings.RAVEN_CLIENT.captureException()
            if not urllib_fallback:
                self.fetch_page(urllib_fallback=True)
        else:
            self.feed.save_page_history(200, "OK")

        return html
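
    # Public entry point: fetch the original page for self.story, swallowing
    # timeouts and redirect loops.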
    def fetch_story(self):
        html = None
        try:
            html = self._fetch_story()
        except TimeoutError:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal story~FY: timed out")
        except requests.exceptions.TooManyRedirects:
            logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal story~FY: too many redirects")

        return html
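
    # Downloads the story's permalink, normalizes its encoding, rewrites it,
    # and saves it on the story. Returns the rewritten HTML, or None.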
    @timelimit(10)
    def _fetch_story(self):
        html = None
        story_permalink = self.story.story_permalink

        if not self.feed:
            return
        if any(story_permalink.startswith(s) for s in BROKEN_PAGES):
            return
        if any(s in story_permalink.lower() for s in BROKEN_PAGE_URLS):
            return
        if not story_permalink.startswith('http'):
            return

        try:
            response = requests.get(story_permalink, headers=self.headers)
            response.connection.close()
        except (AttributeError, SocketError, OpenSSLError, PyAsn1Error, requests.exceptions.ConnectionError, requests.exceptions.TooManyRedirects) as e:
            try:
                response = requests.get(story_permalink)
            except (AttributeError, SocketError, OpenSSLError, PyAsn1Error, requests.exceptions.ConnectionError, requests.exceptions.TooManyRedirects) as e:
                logging.debug(' ***> [%-30s] Original story fetch failed using requests: %s' % (self.feed.log_title[:30], e))
                return
        try:
            data = response.text
        except (LookupError, TypeError):
            data = response.content

        if response.encoding and response.encoding != 'utf-8':
            try:
                data = data.encode(response.encoding)
            except (LookupError, UnicodeEncodeError):
                pass

        if data:
            data = data.replace("\xc2\xa0", " ") # Non-breaking space, is mangled when encoding is not utf-8
            data = data.replace("\\u00a0", " ") # Non-breaking space, is mangled when encoding is not utf-8
            html = self.rewrite_page(data)
            if not html:
                return
            self.save_story(html)

        return html
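
    # Compresses the rewritten page onto the story; duplicate saves are ignored.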
    def save_story(self, html):
        self.story.original_page_z = zlib.compress(html)
        try:
            self.story.save()
        except NotUniqueError:
            pass
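
    # Marks the feed as having no original page and records a 404 in its page
    # history.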
    def save_no_page(self):
        logging.debug(' ---> [%-30s] ~FYNo original page: %s' % (self.feed.log_title[:30], self.feed.feed_link))
        self.feed.has_page = False
        self.feed.save()
        self.feed.save_page_history(404, "Feed has no original page.")
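
    # Injects a <base href="..."> tag pointing at the feed link so relative
    # URLs in the fetched HTML resolve against the feed's site.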
    def rewrite_page(self, response):
        BASE_RE = re.compile(r'<head(.*?\>)', re.I)
        base_code = '<base href="%s" />' % (self.feed.feed_link,)
        try:
            html = BASE_RE.sub(r'<head\1 '+base_code, response)
        except:
            # response = response.decode('latin1').encode('utf-8')
            # html = BASE_RE.sub(r'<head\1 '+base_code, response)
            return None

        if '<base href' not in html:
            html = "%s %s" % (base_code, html)

        # html = self.fix_urls(html)

        return html.strip()
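
    # Rewrites relative href/src attributes into absolute URLs based on the
    # feed link (the call in rewrite_page is currently commented out).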
    def fix_urls(self, document):
        # BEWARE: This will rewrite URLs inside of <script> tags. You know, like
        # Google Analytics. Ugh.

        FIND_RE = re.compile(r'\b(href|src)\s*=\s*("[^"]*"|\'[^\']*\'|[^"\'<>=\s]+)')
        ret = []
        last_end = 0

        for match in FIND_RE.finditer(document):
            url = match.group(2)
            if url[0] in "\"'":
                url = url.strip(url[0])
            parsed = urllib.parse.urlparse(url)
            if parsed.scheme == parsed.netloc == '': #relative to domain
                url = urllib.parse.urljoin(self.feed.feed_link, url)
                ret.append(document[last_end:match.start(2)])
                ret.append('"%s"' % (url,))
                last_end = match.end(2)
        ret.append(document[last_end:])

        return ''.join(ret)
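
    # Stores the rewritten page: on the original page server and/or S3 when
    # configured via settings.BACKED_BY_AWS, otherwise in MFeedPage (MongoDB).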
    def save_page(self, html):
        saved = False

        if not html or len(html) < 100:
            return

        if settings.BACKED_BY_AWS.get('pages_on_node'):
            saved = self.save_page_node(html)
            if saved and self.feed.s3_page and settings.BACKED_BY_AWS.get('pages_on_s3'):
                self.delete_page_s3()

        if settings.BACKED_BY_AWS.get('pages_on_s3') and not saved:
            saved = self.save_page_s3(html)

        if not saved:
            try:
                feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
                # feed_page.page_data = html.encode('utf-8')
                if feed_page.page() == html:
                    logging.debug(' ---> [%-30s] ~FYNo change in page data: %s' % (self.feed.log_title[:30], self.feed.feed_link))
                else:
                    # logging.debug(' ---> [%-30s] ~FYChange in page data: %s (%s/%s %s/%s)' % (self.feed.log_title[:30], self.feed.feed_link, type(html), type(feed_page.page()), len(html), len(feed_page.page())))
                    feed_page.page_data = zlib.compress(html)
                    feed_page.save()
            except MFeedPage.DoesNotExist:
                feed_page = MFeedPage.objects.create(feed_id=self.feed.pk, page_data=html)
            return feed_page
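
    # POSTs the gzipped page to the original page server; returns True on a
    # 200 response.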
    def save_page_node(self, html):
        url = "http://%s/original_page/%s" % (
            settings.ORIGINAL_PAGE_SERVER,
            self.feed.pk,
        )
        response = requests.post(url, files={
            'original_page': compress_string(html),
            # 'original_page': html,
        })
        if response.status_code == 200:
            return True
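
    # Uploads the gzipped page to the S3 pages bucket, deletes any MongoDB
    # copy, and flags the feed as having its page on S3.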
    def save_page_s3(self, html):
        k = Key(settings.S3_CONN.get_bucket(settings.S3_PAGES_BUCKET_NAME))
        k.key = self.feed.s3_pages_key
        k.set_metadata('Content-Encoding', 'gzip')
        k.set_metadata('Content-Type', 'text/html')
        k.set_metadata('Access-Control-Allow-Origin', '*')
        k.set_contents_from_string(compress_string(html))
        k.set_acl('public-read')

        try:
            feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
            feed_page.delete()
            logging.debug(' ---> [%-30s] ~FYTransferring page data to S3...' % (self.feed.log_title[:30]))
        except MFeedPage.DoesNotExist:
            pass

        if not self.feed.s3_page:
            self.feed.s3_page = True
            self.feed.save()

        return True
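
    # Deletes the page from S3 and clears the feed's s3_page flag.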
    def delete_page_s3(self):
        k = Key(settings.S3_CONN.get_bucket(settings.S3_PAGES_BUCKET_NAME))
        k.key = self.feed.s3_pages_key
        k.delete()

        self.feed.s3_page = False
        self.feed.save()