import requests
import re
import urlparse
import traceback
import feedparser
import time
from django.conf import settings
from utils import log as logging
from apps.rss_feeds.models import MFeedPage
from utils.feed_functions import timelimit, mail_feed_error_to_admin
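
# Prefixes that mark a URL as unfetchable: feed entry identifiers (tag:,
# urn:, uuid:, info: URIs) rather than real web addresses, plus the empty
# "[]" placeholder. Feeds whose link starts with one of these get no page.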
BROKEN_PAGES = [
    'tag:',
    'info:',
    'uuid:',
    'urn:',
    '[]',
]

class PageImporter(object):

    def __init__(self, url, feed):
        self.url = url
        self.feed = feed
        self.setup_headers()

    def setup_headers(self):
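        # Build a User-Agent that reports the feed's subscriber count and this
        # NewsBlur instance's URL alongside a browser-like UA string. Note that
        # the session configured here is not reused below; fetch_page calls
        # requests.get directly with these headers.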
        s = requests.session()
        s.config['keep_alive'] = False
        self.headers = {
            'User-Agent': 'NewsBlur Page Fetcher (%s subscriber%s) - %s (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)' % (
                self.feed.num_subscribers,
                's' if self.feed.num_subscribers != 1 else '',
                settings.NEWSBLUR_URL
            ),
        }

    @timelimit(15)
    def fetch_page(self):
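        # Fetch the feed's original page over HTTP (or from a local file
        # path), inject a <base> tag, and store it. URLs matching the
        # BROKEN_PAGES prefixes are recorded as having no page.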
        if not self.url:
            self.save_no_page()
            return

        try:
            if self.url.startswith('http'):
                response = requests.get(self.url, headers=self.headers)
                time.sleep(0.01) # Grrr, GIL.
                data = response.content
            elif any(self.url.startswith(s) for s in BROKEN_PAGES):
                self.save_no_page()
                return
            else:
                data = open(self.url, 'r').read()
            html = self.rewrite_page(data)
            self.save_page(html)
        # Dead handlers from the earlier urllib2-based fetch path, kept
        # commented out for reference:
        # except (ValueError, urllib2.URLError, httplib.BadStatusLine, httplib.InvalidURL), e:
        #     self.feed.save_page_history(401, "Bad URL", e)
        #     fp = feedparser.parse(self.feed.feed_address)
        #     self.feed.feed_link = fp.feed.get('link', "")
        #     self.feed.save()
        # except (urllib2.HTTPError), e:
        #     self.feed.save_page_history(e.code, e.msg, e.fp.read())
        #     return
        # except (httplib.IncompleteRead), e:
        #     self.feed.save_page_history(500, "IncompleteRead", e)
        #     return
        except Exception, e:
            logging.debug('[%d] ! -------------------------' % (self.feed.id,))
            tb = traceback.format_exc()
            logging.debug(tb)
            logging.debug('[%d] ! -------------------------' % (self.feed.id,))
            self.feed.save_page_history(500, "Error", tb)
            mail_feed_error_to_admin(self.feed, e)
            return

        self.feed.save_page_history(200, "OK")

    def save_no_page(self):
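        # Mark the feed as having no original page so future fetches can skip
        # it, and record the 404 in the page-fetch history.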
        self.feed.has_page = False
        self.feed.save()
        self.feed.save_page_history(404, "Feed has no original page.")

    def rewrite_page(self, response):
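        # Inject a <base href> pointing at the feed's site so relative links
        # and assets in the stored copy resolve against the original domain.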
        BASE_RE = re.compile(r'<head(.*?\>)', re.I)
        base_code = u'<base href="%s" />' % (self.feed.feed_link,)
        try:
            html = BASE_RE.sub(r'<head\1 ' + base_code, response)
        except:
            # If non-ASCII bytes trip up the unicode substitution, re-encode
            # the page from latin1 to utf-8 and retry.
            response = response.decode('latin1').encode('utf-8')
            html = BASE_RE.sub(r'<head\1 ' + base_code, response)

        if '<base href' not in html:
            html = "%s %s" % (base_code, html)

        # html = self.fix_urls(html)

        return html.strip()

    def fix_urls(self, document):
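        # Note: currently unused; the call site in rewrite_page is commented
        # out. Rewrites relative href/src values to absolute URLs, so that
        # (for a hypothetical feed_link of http://example.com/) href="/story"
        # would become href="http://example.com/story".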
        # BEWARE: This will rewrite URLs inside of <script> tags. You know, like
        # Google Analytics. Ugh.
        FIND_RE = re.compile(r'\b(href|src)\s*=\s*("[^"]*"|\'[^\']*\'|[^"\'<>=\s]+)')
        ret = []
        last_end = 0
        for match in FIND_RE.finditer(document):
            url = match.group(2)
            if url[0] in "\"'":
                url = url.strip(url[0])
            parsed = urlparse.urlparse(url)
            if parsed.scheme == parsed.netloc == '': # relative to domain
                url = urlparse.urljoin(self.feed.feed_link, url)
            ret.append(document[last_end:match.start(2)])
            ret.append('"%s"' % (url,))
            last_end = match.end(2)
        ret.append(document[last_end:])
        return ''.join(ret)

    def save_page(self, html):
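        # Persist the rewritten page; responses of 100 bytes or fewer are
        # skipped as likely error or placeholder pages.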
        if html and len(html) > 100:
            feed_page, _ = MFeedPage.objects.get_or_create(feed_id=self.feed.pk)
            feed_page.page_data = html
            feed_page.save()
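
# A minimal usage sketch (assumes an existing Feed document with the
# feed_link, num_subscribers, has_page, and save_page_history members
# used above):
#
#     importer = PageImporter(feed.feed_link, feed)
#     importer.fetch_page()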