""" feedfinder: Find the Web feed for a Web page
http : / / www . aaronsw . com / 2002 / feedfinder /
Usage :
feed ( uri ) - returns feed found for a URI
feeds ( uri ) - returns all feeds found for a URI
>> > import feedfinder
>> > feedfinder . feed ( ' scripting.com ' )
' http://scripting.com/rss.xml '
>> >
>> > feedfinder . feeds ( ' scripting.com ' )
[ ' http://delong.typepad.com/sdj/atom.xml ' ,
' http://delong.typepad.com/sdj/index.rdf ' ,
' http://delong.typepad.com/sdj/rss.xml ' ]
>> >
Can also use from the command line . Feeds are returned one per line :
$ python feedfinder . py diveintomark . org
http : / / diveintomark . org / xml / atom . xml
How it works :
0. At every step , feeds are minimally verified to make sure they are really feeds .
1. If the URI points to a feed , it is simply returned ; otherwise
the page is downloaded and the real fun begins .
2. Feeds pointed to by LINK tags in the header of the page ( autodiscovery )
3. < A > links to feeds on the same server ending in " .rss " , " .rdf " , " .xml " , or
" .atom "
4. < A > links to feeds on the same server containing " rss " , " rdf " , " xml " , or " atom "
5. < A > links to feeds on external servers ending in " .rss " , " .rdf " , " .xml " , or
" .atom "
6. < A > links to feeds on external servers containing " rss " , " rdf " , " xml " , or " atom "
7. Try some guesses about common places for feeds ( index . xml , atom . xml , etc . ) .
8. As a last ditch effort , we search Syndic8 for feeds matching the URI
"""
__version__ = " 1.371 "
__date__ = " 2006-04-24 "
__maintainer__ = " Aaron Swartz (me@aaronsw.com) "
__author__ = " Mark Pilgrim (http://diveintomark.org) "
__copyright__ = " Copyright 2002-4, Mark Pilgrim; 2006 Aaron Swartz "
__license__ = " Python "
__credits__ = """ Abe Fettig for a patch to sort Syndic8 feeds by popularity
Also Jason Diamond , Brian Lalor for bug reporting and patches """
_debug = 0

import re
import sys
import urllib.error
import urllib.parse
import urllib.request
import urllib.robotparser
from functools import cmp_to_key

from io import StringIO
import requests
import sgmllib
from lxml import etree

# XML-RPC support allows feedfinder to query Syndic8 for possible matches.
# Python 2.3 now comes with this module by default, otherwise you can download it
try:
    import xmlrpc.client  # http://www.pythonware.com/products/xmlrpc/
except ImportError:
    xmlrpc = None  # without it, the Syndic8 lookup in getFeedsFromSyndic8 fails quietly

if not dict:

    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc


def _debuglog(message):
    if _debug:
        print(message)


class URLGatekeeper:
    """a class to track robots.txt rules across multiple servers"""

    def __init__(self):
        self.rpcache = {}  # a dictionary of RobotFileParser objects, by domain
        self.urlopener = urllib.request.build_opener()
        self.urlopener.version = "NewsBlur Feed Finder (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)"
        _debuglog(self.urlopener.version)
        self.urlopener.addheaders = [("User-Agent", self.urlopener.version)]
        # self.urlopener.addheaders = [('User-Agent', self.urlopener.version), ('Accept', '*')]
        # urllib.robotparser.URLopener.version = self.urlopener.version
        # urllib.robotparser.URLopener.addheaders = self.urlopener.addheaders

    def _getrp(self, url):
        protocol, domain = urllib.parse.urlparse(url)[:2]
        if domain in self.rpcache:
            return self.rpcache[domain]
        baseurl = "%s://%s" % (protocol, domain)
        robotsurl = urllib.parse.urljoin(baseurl, "robots.txt")
        _debuglog("fetching %s" % robotsurl)
        rp = urllib.robotparser.RobotFileParser(robotsurl)
        try:
            rp.read()
        except:
            pass
        self.rpcache[domain] = rp
        return rp

    def can_fetch(self, url):
        rp = self._getrp(url)
        allow = rp.can_fetch(self.urlopener.version, url)
        _debuglog("gatekeeper of %s says %s" % (url, allow))
        return allow

    def get(self, url, check=False):
        if check and not self.can_fetch(url):
            return ""
        try:
            return requests.get(url, headers=dict(self.urlopener.addheaders)).text
        except:
            return ""


_gatekeeper = URLGatekeeper()
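
# A minimal usage sketch of the shared gatekeeper (the URLs below are purely
# illustrative, and network access is assumed):
#
#   html = _gatekeeper.get("http://example.com/", check=True)   # "" if robots.txt disallows, or on error
#   if _gatekeeper.can_fetch("http://example.com/rss.xml"):
#       body = _gatekeeper.get("http://example.com/rss.xml")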


class BaseParser(sgmllib.SGMLParser):
    def __init__(self, baseuri):
        sgmllib.SGMLParser.__init__(self)
        self.links = []
        self.baseuri = baseuri

    def normalize_attrs(self, attrs):
        def cleanattr(v):
            v = sgmllib.charref.sub(lambda m: chr(int(m.groups()[0])), v)
            if not v:
                return
            v = v.strip()
            v = (
                v.replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&apos;", "'")
                .replace("&quot;", '"')
                .replace("&amp;", "&")
            )
            return v

        attrs = [(k.lower(), cleanattr(v)) for k, v in attrs if cleanattr(v)]
        attrs = [(k, k in ("rel", "type") and v.lower() or v) for k, v in attrs if cleanattr(v)]
        return attrs

    def do_base(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if "href" not in attrsD:
            return
        self.baseuri = attrsD["href"]

    def error(self, *a, **kw):
        pass  # we're not picky


class LinkParser(BaseParser):
    FEED_TYPES = (
        "application/rss+xml",
        "text/xml",
        "application/atom+xml",
        "application/x.atom+xml",
        "application/x-atom+xml",
    )

    def do_link(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if "rel" not in attrsD:
            return
        rels = attrsD["rel"].split()
        if "alternate" not in rels:
            return
        if attrsD.get("type") not in self.FEED_TYPES:
            return
        if "href" not in attrsD:
            return
        self.links.append(urllib.parse.urljoin(self.baseuri, attrsD["href"]))
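
# For reference, the autodiscovery markup this parser picks up looks like the
# following (example values, not taken from any particular site):
#
#   <link rel="alternate" type="application/atom+xml" href="/feeds/atom.xml">
#
# Relative hrefs are resolved against the page's base URI, so on a page at
# http://example.com/ the link above would be reported as
# http://example.com/feeds/atom.xml.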


class ALinkParser(BaseParser):
    def start_a(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if "href" not in attrsD:
            return
        self.links.append(urllib.parse.urljoin(self.baseuri, attrsD["href"]))


def makeFullURI(uri):
    if not uri:
        return
    uri = uri.strip()
    if uri.startswith("feed://"):
        uri = "http://" + uri.split("feed://", 1).pop()
    for x in ["http", "https"]:
        if uri.startswith("%s://" % x):
            return uri
    return "http://%s" % uri


def getLinks(data, baseuri):
    p = LinkParser(baseuri)
    p.feed(data)
    return p.links


def getLinksLXML(data, baseuri):
    parser = etree.HTMLParser(recover=True)
    tree = etree.parse(StringIO(data), parser)
    links = []
    for link in tree.findall(".//link"):
        if link.attrib.get("type") in LinkParser.FEED_TYPES:
            href = link.attrib["href"]
            if href:
                links.append(href)
    return links


def getALinks(data, baseuri):
    p = ALinkParser(baseuri)
    p.feed(data)
    return p.links


def getLocalLinks(links, baseuri):
    found_links = []
    if not baseuri:
        return found_links
    baseuri = baseuri.lower()
    for l in links:
        try:
            if l.lower().startswith(baseuri):
                found_links.append(l)
        except (AttributeError, UnicodeDecodeError):
            pass
    return found_links
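
# Illustrative behaviour (hypothetical URLs): with baseuri = "http://example.com",
# a link list of ["http://example.com/rss.xml", "http://other.org/feed"] is
# filtered down to ["http://example.com/rss.xml"] -- only links that share the
# page's own prefix survive.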


def isFeedLink(link):
    link = link.lower()
    # ".atom" is five characters, so it cannot be caught by the four-character slice
    return link[-4:] in (".rss", ".rdf", ".xml") or link.endswith(".atom")


def isXMLRelatedLink(link):
    link = link.lower()
    return link.count("rss") + link.count("rdf") + link.count("xml") + link.count("atom")


r_brokenRedirect = re.compile("<newLocation[^>]*>(.*?)</newLocation>", re.S)


def tryBrokenRedirect(data):
    if "<newLocation" in data:
        newuris = r_brokenRedirect.findall(data)
        if newuris and newuris[0]:
            return newuris[0].strip()


def couldBeFeedData(data):
    data = data.lower()
    if data.count("<html"):
        return 0
    return data.count("<rss") + data.count("<rdf") + data.count("<feed")
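
# Rough illustration of the heuristic above: any document containing "<html"
# scores 0, while '<?xml version="1.0"?><rss version="2.0">...' scores 1 thanks
# to the single "<rss" occurrence.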


def isFeed(uri):
    _debuglog("seeing if %s is a feed" % uri)
    protocol = urllib.parse.urlparse(uri)
    if protocol[0] not in ("http", "https"):
        return 0
    try:
        data = _gatekeeper.get(uri, check=False)
    except (KeyError, UnicodeDecodeError):
        return False
    count = couldBeFeedData(data)
    return count


def cmp_(a, b):
    return (a > b) - (a < b)


def sortFeeds(feed1Info, feed2Info):
    return cmp_(feed2Info["headlines_rank"], feed1Info["headlines_rank"])


def getFeedsFromSyndic8(uri):
    feeds = []
    try:
        server = xmlrpc.client.Server("http://www.syndic8.com/xmlrpc.php")
        feedids = server.syndic8.FindFeeds(uri)
        infolist = server.syndic8.GetFeedInfo(feedids, ["headlines_rank", "status", "dataurl"])
        # list.sort() no longer accepts a bare comparison function in Python 3,
        # so wrap the comparator with functools.cmp_to_key
        infolist.sort(key=cmp_to_key(sortFeeds))
        feeds = [f["dataurl"] for f in infolist if f["status"] == "Syndicated"]
        _debuglog("found %s feeds through Syndic8" % len(feeds))
    except:
        pass
    return feeds


def feeds(uri, all=False, querySyndic8=False, _recurs=None):
    if _recurs is None:
        _recurs = [uri]
    fulluri = makeFullURI(uri)
    try:
        data = _gatekeeper.get(fulluri, check=False)
    except:
        return []
    # is this already a feed?
    if couldBeFeedData(data):
        return [fulluri]
    newuri = tryBrokenRedirect(data)
    if newuri and newuri not in _recurs:
        _recurs.append(newuri)
        return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)
    # nope, it's a page, try LINK tags first
    _debuglog("looking for LINK tags")
    try:
        outfeeds = getLinks(data, fulluri)
    except:
        outfeeds = []
    if not outfeeds:
        _debuglog("using lxml to look for LINK tags")
        try:
            outfeeds = getLinksLXML(data, fulluri)
        except:
            outfeeds = []
    _debuglog("found %s feeds through LINK tags" % len(outfeeds))
    outfeeds = list(filter(isFeed, outfeeds))
    if all or not outfeeds:
        # no LINK tags, look for regular <A> links that point to feeds
        _debuglog("no LINK tags, looking at A tags")
        try:
            links = getALinks(data, fulluri)
        except:
            links = []
        _debuglog("no LINK tags, looking at local links")
        locallinks = getLocalLinks(links, fulluri)
        # look for obvious feed links on the same server
        outfeeds.extend(list(filter(isFeed, list(filter(isFeedLink, locallinks)))))
    if all or not outfeeds:
        # look harder for feed links on the same server
        outfeeds.extend(list(filter(isFeed, list(filter(isXMLRelatedLink, locallinks)))))
    if all or not outfeeds:
        # look for obvious feed links on another server
        outfeeds.extend(list(filter(isFeed, list(filter(isFeedLink, links)))))
    if all or not outfeeds:
        # look harder for feed links on another server
        outfeeds.extend(list(filter(isFeed, list(filter(isXMLRelatedLink, links)))))
    if all or not outfeeds:
        _debuglog("no A tags, guessing")
        suffixes = [  # filenames used by popular software:
            "feed/",  # obvious
            "atom.xml",  # blogger, TypePad
            "index.atom",  # MT, apparently
            "index.rdf",  # MT
            "rss.xml",  # Dave Winer/Manila
            "index.xml",  # MT
            "index.rss",  # Slash
        ]
        outfeeds.extend(list(filter(isFeed, [urllib.parse.urljoin(fulluri, x) for x in suffixes])))
    if (all or not outfeeds) and querySyndic8:
        # still no luck, search Syndic8 for feeds (requires xmlrpclib)
        _debuglog("still no luck, searching Syndic8")
        outfeeds.extend(getFeedsFromSyndic8(uri))
    if hasattr(__builtins__, "set") or "set" in __builtins__:
        outfeeds = list(set(outfeeds))
    return outfeeds


getFeeds = feeds  # backwards-compatibility
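
# The discovery cascade in practice (illustrative session; actual output
# depends on the live site being probed):
#
#   >>> feeds("scripting.com")            # stops at the first technique that yields feeds
#   ['http://scripting.com/rss.xml']
#   >>> feeds("scripting.com", all=True)  # keeps collecting candidates from every step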


def feed(uri):
    # todo: give preference to certain feed formats
    feedlist = feeds(uri)
    if feedlist:
        feeds_no_comments = [f for f in feedlist if "comments" not in f.lower()]
        if feeds_no_comments:
            return feeds_no_comments[0]
        return feedlist[0]
    else:
        return None
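
# feed() simply takes the first result from feeds(), preferring URLs that do
# not contain "comments" so that a post's comment feed is not chosen over the
# main feed. For example (hypothetical URLs), given
# ["http://example.com/comments/feed/", "http://example.com/feed/"] it would
# return "http://example.com/feed/".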


##### test harness ######


def test():
    uri = "http://diveintomark.org/tests/client/autodiscovery/html4-001.html"
    failed = []
    count = 0
    while 1:
        data = _gatekeeper.get(uri)
        if data.find("Atom autodiscovery test") == -1:
            break
        sys.stdout.write(".")
        sys.stdout.flush()
        count += 1
        links = getLinks(data, uri)
        if not links:
            print("\n*** FAILED ***", uri, "could not find link")
            failed.append(uri)
        elif len(links) > 1:
            print("\n*** FAILED ***", uri, "found too many links")
            failed.append(uri)
        else:
            # urlopen().read() returns bytes in Python 3; decode before string searches
            atomdata = urllib.request.urlopen(links[0]).read().decode("utf-8", "ignore")
            if atomdata.find('<link rel="alternate"') == -1:
                print("\n*** FAILED ***", uri, "retrieved something that is not a feed")
                failed.append(uri)
            else:
                backlink = atomdata.split('href="').pop().split('"')[0]
                if backlink != uri:
                    print("\n*** FAILED ***", uri, "retrieved wrong feed")
                    failed.append(uri)
        if data.find('<link rel="next" href="') == -1:
            break
        uri = urllib.parse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
    print()
    print(count, "tests executed,", len(failed), "failed")


if __name__ == "__main__":
    args = sys.argv[1:]
    if args and args[0] == "--debug":
        _debug = 1
        args.pop(0)
    if args:
        uri = args[0]
    else:
        uri = "http://diveintomark.org/"
    if uri == "test":
        test()
    else:
        print("\n".join(getFeeds(uri)))