import requests
import re
import urlparse
import traceback
import feedparser
import time
import urllib2
import httplib
from socket import error as SocketError
from boto.s3.key import Key
from django.conf import settings
from django.utils.text import compress_string
from utils import log as logging
from apps.rss_feeds.models import MFeedPage
from utils.feed_functions import timelimit
from OpenSSL.SSL import Error as OpenSSLError
from pyasn1.error import PyAsn1Error
# from utils.feed_functions import mail_feed_error_to_admin

# URI schemes that don't point at a fetchable web page.
BROKEN_PAGES = [
    'tag:',
    'info:',
    'uuid:',
    'urn:',
    '[]',
]

# Domains whose pages shouldn't be fetched. Also change in reader_utils.js.
BROKEN_PAGE_URLS = [
    'nytimes.com',
    'github.com',
    'washingtonpost.com',
    'stackoverflow.com',
    'stackexchange.com',
    'twitter.com',
    'rankexploits',
]


class PageImporter(object):

    def __init__(self, feed):
        self.feed = feed

    @property
    def headers(self):
        return {
            'User-Agent': 'NewsBlur Page Fetcher - %s subscriber%s - %s '
                          '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
                          'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
                          'Safari/534.48.3)' % (
                self.feed.num_subscribers,
                's' if self.feed.num_subscribers != 1 else '',
                self.feed.permalink,
            ),
        }

    @timelimit(15)
    def fetch_page(self, urllib_fallback=False, requests_exception=None):
        html = None
        feed_link = self.feed.feed_link
        if not feed_link:
            self.save_no_page()
            return

        if feed_link.startswith('www'):
            self.feed.feed_link = 'http://' + feed_link
        try:
            if any(feed_link.startswith(s) for s in BROKEN_PAGES):
                self.save_no_page()
                return
            elif any(s in feed_link.lower() for s in BROKEN_PAGE_URLS):
                self.save_no_page()
                return
            elif feed_link.startswith('http'):
                if urllib_fallback:
                    request = urllib2.Request(feed_link, headers=self.headers)
                    response = urllib2.urlopen(request)
                    time.sleep(0.01) # Grrr, GIL.
                    data = response.read()
                else:
                    try:
                        response = requests.get(feed_link, headers=self.headers)
                        response.connection.close()
                    except requests.exceptions.TooManyRedirects:
                        response = requests.get(feed_link)
                    except (AttributeError, SocketError, OpenSSLError, PyAsn1Error), e:
                        logging.debug(' ***> [%-30s] Page fetch failed using requests: %s' % (self.feed, e))
                        self.save_no_page()
                        return
                    try:
                        data = response.text
                    except (LookupError, TypeError):
                        # Fall back to raw bytes if the declared encoding is unusable.
                        data = response.content
                    # Re-encode non-utf-8 responses back to their declared charset.
                    if response.encoding and response.encoding != 'utf-8':
                        try:
                            data = data.encode(response.encoding)
                        except LookupError:
                            pass
            else:
                # Not a URL: treat the feed link as a local file path.
                try:
                    data = open(feed_link, 'r').read()
                except IOError:
                    self.feed.feed_link = 'http://' + feed_link
                    self.fetch_page(urllib_fallback=True)
                    return
            if data:
                html = self.rewrite_page(data)
                self.save_page(html)
            else:
                self.save_no_page()
                return
        except (ValueError, urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL,
                requests.exceptions.ConnectionError), e:
            self.feed.save_page_history(401, "Bad URL", e)
            # Refresh the stored feed link from the feed itself before saving.
            fp = feedparser.parse(self.feed.feed_address)
            feed_link = fp.feed.get('link', "")
            if feed_link:
                self.feed.feed_link = feed_link
            self.feed.save()
        except urllib2.HTTPError, e:
            self.feed.save_page_history(e.code, e.msg, e.fp.read())
        except httplib.IncompleteRead, e:
            self.feed.save_page_history(500, "IncompleteRead", e)
        except (requests.exceptions.RequestException,
                requests.packages.urllib3.exceptions.HTTPError), e:
            logging.debug(' ***> [%-30s] Page fetch failed using requests: %s' % (self.feed, e))
            # mail_feed_error_to_admin(self.feed, e, local_vars=locals())
            # requests failed, so retry the whole fetch with urllib2.
            return self.fetch_page(urllib_fallback=True, requests_exception=e)
        except Exception, e:
            logging.debug('[%d] ! -------------------------' % (self.feed.id,))
            tb = traceback.format_exc()
            logging.debug(tb)
            logging.debug('[%d] ! -------------------------' % (self.feed.id,))
            self.feed.save_page_history(500, "Error", tb)
            # mail_feed_error_to_admin(self.feed, e, local_vars=locals())
            if (not settings.DEBUG and hasattr(settings, 'RAVEN_CLIENT') and
                    settings.RAVEN_CLIENT):
                settings.RAVEN_CLIENT.captureException()
            if not urllib_fallback:
                self.fetch_page(urllib_fallback=True)
        else:
            self.feed.save_page_history(200, "OK")

        return html
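
    # A minimal usage sketch (hypothetical; the real call sites live in the
    # feed fetching pipeline), assuming `feed` is a Feed with feed_link set:
    #
    #   importer = PageImporter(feed)
    #   html = importer.fetch_page()
    #
    # On success the rewritten page is saved via save_page() and returned;
    # if requests fails, fetch_page() retries itself once with urllib2.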

    def save_no_page(self):
        logging.debug(' ---> [%-30s] ~FYNo original page: %s' % (self.feed, self.feed.feed_link))
        self.feed.has_page = False
        self.feed.save()
        self.feed.save_page_history(404, "Feed has no original page.")

    def rewrite_page(self, response):
        # Inject a <base> tag into the page's <head> so relative URLs in the
        # fetched page resolve against the original site.
        BASE_RE = re.compile(r'<head(.*?\>)', re.I)
        base_code = u'<base href="%s" />' % (self.feed.feed_link,)
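        # e.g. with feed_link == 'http://example.com/blog/' (hypothetical),
        # base_code == u'<base href="http://example.com/blog/" />', ready to
        # be substituted in just after the page's opening <head> tag.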