import requests import re import urlparse import traceback import feedparser import time import urllib2 import httplib from requests.models import ConnectionError from django.conf import settings from utils import log as logging from apps.rss_feeds.models import MFeedPage from utils.feed_functions import timelimit, mail_feed_error_to_admin BROKEN_PAGES = [ 'tag:', 'info:', 'uuid:', 'urn:', '[]', ] class PageImporter(object): def __init__(self, feed): self.feed = feed @property def headers(self): s = requests.session() s.config['keep_alive'] = False return { 'User-Agent': 'NewsBlur Page Fetcher (%s subscriber%s) - %s ' '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) ' 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 ' 'Safari/534.48.3)' % ( self.feed.num_subscribers, 's' if self.feed.num_subscribers != 1 else '', settings.NEWSBLUR_URL ), 'Connection': 'close', } @timelimit(15) def fetch_page(self, urllib_fallback=False): feed_link = self.feed.feed_link if not feed_link: self.save_no_page() return try: if feed_link.startswith('www'): self.feed.feed_link = 'http://' + feed_link if feed_link.startswith('http'): if urllib_fallback: request = urllib2.Request(feed_link, headers=self.headers) response = urllib2.urlopen(request) time.sleep(0.01) # Grrr, GIL. data = response.read() else: response = requests.get(feed_link, headers=self.headers) data = response.content elif any(feed_link.startswith(s) for s in BROKEN_PAGES): self.save_no_page() return else: data = open(feed_link, 'r').read() html = self.rewrite_page(data) self.save_page(html) except (ValueError, urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL), e: self.feed.save_page_history(401, "Bad URL", e) fp = feedparser.parse(self.feed.feed_address) feed_link = fp.feed.get('link', "") self.feed.save() except (urllib2.HTTPError), e: self.feed.save_page_history(e.code, e.msg, e.fp.read()) except (httplib.IncompleteRead), e: self.feed.save_page_history(500, "IncompleteRead", e) except Exception, e: logging.debug('[%d] ! -------------------------' % (self.feed.id,)) tb = traceback.format_exc() logging.debug(tb) logging.debug('[%d] ! -------------------------' % (self.feed.id,)) self.feed.save_page_history(500, "Error", tb) mail_feed_error_to_admin(self.feed, e, locals()) if not urllib_fallback: self.fetch_page(urllib_fallback=True) else: self.feed.save_page_history(200, "OK") def save_no_page(self): self.feed.has_page = False self.feed.save() self.feed.save_page_history(404, "Feed has no original page.") def rewrite_page(self, response): BASE_RE = re.compile(r'
)', re.I) base_code = u'