NewsBlur-viq/utils/feedfinder_pilgrim.py

"""feedfinder: Find the Web feed for a Web page
http://www.aaronsw.com/2002/feedfinder/
Usage:
feed(uri) - returns feed found for a URI
feeds(uri) - returns all feeds found for a URI
>>> import feedfinder
>>> feedfinder.feed('scripting.com')
'http://scripting.com/rss.xml'
>>>
>>> feedfinder.feeds('scripting.com')
['http://delong.typepad.com/sdj/atom.xml',
'http://delong.typepad.com/sdj/index.rdf',
'http://delong.typepad.com/sdj/rss.xml']
>>>
Can also be used from the command line; feeds are returned one per line:
$ python feedfinder.py diveintomark.org
http://diveintomark.org/xml/atom.xml
How it works:
0. At every step, feeds are minimally verified to make sure they are really feeds.
1. If the URI points to a feed, it is simply returned; otherwise
the page is downloaded and the real fun begins.
2. Feeds pointed to by LINK tags in the header of the page (autodiscovery)
3. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml", or
".atom"
4. <A> links to feeds on the same server containing "rss", "rdf", "xml", or "atom"
5. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml", or
".atom"
6. <A> links to feeds on external servers containing "rss", "rdf", "xml", or "atom"
7. Try some guesses about common places for feeds (index.xml, atom.xml, etc.).
8. As a last ditch effort, we search Syndic8 for feeds matching the URI
"""
__version__ = "1.371"
__date__ = "2006-04-24"
__maintainer__ = "Aaron Swartz (me@aaronsw.com)"
__author__ = "Mark Pilgrim (http://diveintomark.org)"
__copyright__ = "Copyright 2002-4, Mark Pilgrim; 2006 Aaron Swartz"
__license__ = "Python"
__credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity
Also Jason Diamond, Brian Lalor for bug reporting and patches"""

_debug = 0

import functools
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
import urllib.robotparser
from io import StringIO

import requests
import sgmllib
from lxml import etree

# XML-RPC support allows feedfinder to query Syndic8 for possible matches.
# xmlrpc.client ships with the standard library, so the fallback below is just a safety net.
try:
    import xmlrpc.client  # http://www.pythonware.com/products/xmlrpc/
except ImportError:
    xmlrpc = None

if not dict:
    # legacy shim for very old Pythons that lacked the dict() constructor
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc


def _debuglog(message):
    if _debug:
        print(message)


class URLGatekeeper:
    """a class to track robots.txt rules across multiple servers"""

    def __init__(self):
        self.rpcache = {}  # a dictionary of RobotFileParser objects, by domain
        self.urlopener = urllib.request.build_opener()
        self.urlopener.version = "NewsBlur Feed Finder (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)"
        _debuglog(self.urlopener.version)
        self.urlopener.addheaders = [("User-Agent", self.urlopener.version)]
        # self.urlopener.addheaders = [('User-Agent', self.urlopener.version), ('Accept', '*')]
        # urllib.robotparser.URLopener.version = self.urlopener.version
        # urllib.robotparser.URLopener.addheaders = self.urlopener.addheaders

    def _getrp(self, url):
        protocol, domain = urllib.parse.urlparse(url)[:2]
        if domain in self.rpcache:
            return self.rpcache[domain]
        baseurl = "%s://%s" % (protocol, domain)
        robotsurl = urllib.parse.urljoin(baseurl, "robots.txt")
        _debuglog("fetching %s" % robotsurl)
        rp = urllib.robotparser.RobotFileParser(robotsurl)
        try:
            rp.read()
        except:
            pass
        self.rpcache[domain] = rp
        return rp

    def can_fetch(self, url):
        rp = self._getrp(url)
        allow = rp.can_fetch(self.urlopener.version, url)
        _debuglog("gatekeeper of %s says %s" % (url, allow))
        return allow

    def get(self, url, check=False):
        if check and not self.can_fetch(url):
            return ""
        try:
            return requests.get(url, headers=dict(self.urlopener.addheaders)).text
        except:
            return ""


_gatekeeper = URLGatekeeper()
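# The gatekeeper above is shared by every lookup in this module, so robots.txt for a
# given domain is fetched and parsed at most once per process (see URLGatekeeper.rpcache).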


class BaseParser(sgmllib.SGMLParser):
    def __init__(self, baseuri):
        sgmllib.SGMLParser.__init__(self)
        self.links = []
        self.baseuri = baseuri

    def normalize_attrs(self, attrs):
        def cleanattr(v):
            v = sgmllib.charref.sub(lambda m: chr(int(m.groups()[0])), v)
            if not v:
                return
            v = v.strip()
            v = (
                v.replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&apos;", "'")
                .replace("&quot;", '"')
                .replace("&amp;", "&")
            )
            return v

        attrs = [(k.lower(), cleanattr(v)) for k, v in attrs if cleanattr(v)]
        attrs = [(k, k in ("rel", "type") and v.lower() or v) for k, v in attrs if cleanattr(v)]
        return attrs

    def do_base(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if "href" not in attrsD:
            return
        self.baseuri = attrsD["href"]

    def error(self, *a, **kw):
        pass  # we're not picky
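

# LinkParser collects autodiscovery <link rel="alternate"> tags whose type is a known
# feed MIME type; ALinkParser below collects every <a href> on the page for the
# fuzzier passes in feeds().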
class LinkParser(BaseParser):
    FEED_TYPES = (
        "application/rss+xml",
        "text/xml",
        "application/atom+xml",
        "application/x.atom+xml",
        "application/x-atom+xml",
    )

    def do_link(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if "rel" not in attrsD:
            return
        rels = attrsD["rel"].split()
        if "alternate" not in rels:
            return
        if attrsD.get("type") not in self.FEED_TYPES:
            return
        if "href" not in attrsD:
            return
        self.links.append(urllib.parse.urljoin(self.baseuri, attrsD["href"]))


class ALinkParser(BaseParser):
    def start_a(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if "href" not in attrsD:
            return
        self.links.append(urllib.parse.urljoin(self.baseuri, attrsD["href"]))
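

# makeFullURI normalizes whatever the user typed into something fetchable, e.g.
# "feed://example.com/rss" -> "http://example.com/rss" and "scripting.com" -> "http://scripting.com".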
def makeFullURI(uri):
    if not uri:
        return
    uri = uri.strip()
    if uri.startswith("feed://"):
        uri = "http://" + uri.split("feed://", 1).pop()
    for x in ["http", "https"]:
        if uri.startswith("%s://" % x):
            return uri
    return "http://%s" % uri


def getLinks(data, baseuri):
    p = LinkParser(baseuri)
    p.feed(data)
    return p.links


def getLinksLXML(data, baseuri):
    parser = etree.HTMLParser(recover=True)
    tree = etree.parse(StringIO(data), parser)
    links = []
    for link in tree.findall(".//link"):
        if link.attrib.get("type") in LinkParser.FEED_TYPES:
            href = link.attrib.get("href")  # skip <link> tags without an href
            if href:
                links.append(href)
    return links


def getALinks(data, baseuri):
    p = ALinkParser(baseuri)
    p.feed(data)
    return p.links


def getLocalLinks(links, baseuri):
    found_links = []
    if not baseuri:
        return found_links
    baseuri = baseuri.lower()
    for l in links:
        try:
            if l.lower().startswith(baseuri):
                found_links.append(l)
        except (AttributeError, UnicodeDecodeError):
            pass
    return found_links


def isFeedLink(link):
    # use endswith so the five-character ".atom" suffix matches too
    return link.lower().endswith((".rss", ".rdf", ".xml", ".atom"))


def isXMLRelatedLink(link):
    link = link.lower()
    return link.count("rss") + link.count("rdf") + link.count("xml") + link.count("atom")
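

# isFeedLink only matches obvious feed-style suffixes, while isXMLRelatedLink matches the
# substrings anywhere in the URL; feeds() uses the latter as its "look harder" pass.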


r_brokenRedirect = re.compile("<newLocation[^>]*>(.*?)</newLocation>", re.S)


def tryBrokenRedirect(data):
    if "<newLocation" in data:
        newuris = r_brokenRedirect.findall(data)
        if newuris and newuris[0]:
            return newuris[0].strip()
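

# couldBeFeedData is the cheap sanity check used throughout: anything containing <html>
# scores 0, otherwise the count of <rss / <rdf / <feed markers is returned
# (truthy means "probably a feed").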
def couldBeFeedData(data):
    data = data.lower()
    if data.count("<html"):
        return 0
    return data.count("<rss") + data.count("<rdf") + data.count("<feed")


def isFeed(uri):
    _debuglog("seeing if %s is a feed" % uri)
    protocol = urllib.parse.urlparse(uri)
    if protocol[0] not in ("http", "https"):
        return 0
    try:
        data = _gatekeeper.get(uri, check=False)
    except (KeyError, UnicodeDecodeError):
        return False
    count = couldBeFeedData(data)
    return count


def cmp_(a, b):
    return (a > b) - (a < b)


def sortFeeds(feed1Info, feed2Info):
    return cmp_(feed2Info["headlines_rank"], feed1Info["headlines_rank"])
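

# getFeedsFromSyndic8 asks Syndic8's XML-RPC service for feeds matching the URI, keeps
# only entries whose status is "Syndicated", and returns them ordered by headlines_rank
# (via sortFeeds above).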
def getFeedsFromSyndic8(uri):
    feeds = []
    try:
        server = xmlrpc.client.Server("http://www.syndic8.com/xmlrpc.php")
        feedids = server.syndic8.FindFeeds(uri)
        infolist = server.syndic8.GetFeedInfo(feedids, ["headlines_rank", "status", "dataurl"])
        # list.sort() no longer accepts a bare comparison function in Python 3, so wrap sortFeeds
        infolist.sort(key=functools.cmp_to_key(sortFeeds))
        feeds = [f["dataurl"] for f in infolist if f["status"] == "Syndicated"]
        _debuglog("found %s feeds through Syndic8" % len(feeds))
    except:
        pass
    return feeds
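

# feeds() implements the cascade described in the module docstring: LINK-tag autodiscovery
# first (getLinks/getLinksLXML), then same-server and external <A> links (getALinks,
# getLocalLinks, isFeedLink, isXMLRelatedLink), then guesses at common feed filenames,
# and finally Syndic8 when querySyndic8 is set.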
def feeds(uri, all=False, querySyndic8=False, _recurs=None):
    if _recurs is None:
        _recurs = [uri]
    fulluri = makeFullURI(uri)
    try:
        data = _gatekeeper.get(fulluri, check=False)
    except:
        return []
    # is this already a feed?
    if couldBeFeedData(data):
        return [fulluri]
    newuri = tryBrokenRedirect(data)
    if newuri and newuri not in _recurs:
        _recurs.append(newuri)
        return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)
    # nope, it's a page, try LINK tags first
    _debuglog("looking for LINK tags")
    try:
        outfeeds = getLinks(data, fulluri)
    except:
        outfeeds = []
    if not outfeeds:
        _debuglog("using lxml to look for LINK tags")
        try:
            outfeeds = getLinksLXML(data, fulluri)
        except:
            outfeeds = []
    _debuglog("found %s feeds through LINK tags" % len(outfeeds))
    outfeeds = list(filter(isFeed, outfeeds))
    if all or not outfeeds:
        # no LINK tags, look for regular <A> links that point to feeds
        _debuglog("no LINK tags, looking at A tags")
        try:
            links = getALinks(data, fulluri)
        except:
            links = []
        _debuglog("no LINK tags, looking at local links")
        locallinks = getLocalLinks(links, fulluri)
        # look for obvious feed links on the same server
        outfeeds.extend(list(filter(isFeed, list(filter(isFeedLink, locallinks)))))
        if all or not outfeeds:
            # look harder for feed links on the same server
            outfeeds.extend(list(filter(isFeed, list(filter(isXMLRelatedLink, locallinks)))))
        if all or not outfeeds:
            # look for obvious feed links on another server
            outfeeds.extend(list(filter(isFeed, list(filter(isFeedLink, links)))))
        if all or not outfeeds:
            # look harder for feed links on another server
            outfeeds.extend(list(filter(isFeed, list(filter(isXMLRelatedLink, links)))))
    if all or not outfeeds:
        _debuglog("no A tags, guessing")
        suffixes = [  # filenames used by popular software:
            "feed/",  # obvious
            "atom.xml",  # blogger, TypePad
            "index.atom",  # MT, apparently
            "index.rdf",  # MT
            "rss.xml",  # Dave Winer/Manila
            "index.xml",  # MT
            "index.rss",  # Slash
        ]
        outfeeds.extend(list(filter(isFeed, [urllib.parse.urljoin(fulluri, x) for x in suffixes])))
    if (all or not outfeeds) and querySyndic8:
        # still no luck, search Syndic8 for feeds (requires xmlrpc.client)
        _debuglog("still no luck, searching Syndic8")
        outfeeds.extend(getFeedsFromSyndic8(uri))
    if hasattr(__builtins__, "set") or "set" in __builtins__:
        outfeeds = list(set(outfeeds))
    return outfeeds


getFeeds = feeds  # backwards-compatibility
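

# feed() returns a single "best" URL: the first discovered feed whose URL does not
# contain "comments", falling back to the first result otherwise.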
def feed(uri):
    # todo: give preference to certain feed formats
    feedlist = feeds(uri)
    if feedlist:
        feeds_no_comments = [f for f in feedlist if "comments" not in f.lower()]
        if feeds_no_comments:
            return feeds_no_comments[0]
        return feedlist[0]
    else:
        return None


##### test harness ######


def test():
    uri = "http://diveintomark.org/tests/client/autodiscovery/html4-001.html"
    failed = []
    count = 0
    while 1:
        data = _gatekeeper.get(uri)
        if data.find("Atom autodiscovery test") == -1:
            break
        sys.stdout.write(".")
        sys.stdout.flush()
        count += 1
        links = getLinks(data, uri)
        if not links:
            print("\n*** FAILED ***", uri, "could not find link")
            failed.append(uri)
        elif len(links) > 1:
            print("\n*** FAILED ***", uri, "found too many links")
            failed.append(uri)
        else:
            # urlopen() returns bytes, so decode before comparing against str patterns
            atomdata = urllib.request.urlopen(links[0]).read().decode("utf-8", "ignore")
            if atomdata.find('<link rel="alternate"') == -1:
                print("\n*** FAILED ***", uri, "retrieved something that is not a feed")
                failed.append(uri)
            else:
                backlink = atomdata.split('href="').pop().split('"')[0]
                if backlink != uri:
                    print("\n*** FAILED ***", uri, "retrieved wrong feed")
                    failed.append(uri)
        if data.find('<link rel="next" href="') == -1:
            break
        uri = urllib.parse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
    print()
    print(count, "tests executed,", len(failed), "failed")


if __name__ == "__main__":
    args = sys.argv[1:]
    if args and args[0] == "--debug":
        _debug = 1
        args.pop(0)
    if args:
        uri = args[0]
    else:
        uri = "http://diveintomark.org/"
    if uri == "test":
        test()
    else:
        print("\n".join(getFeeds(uri)))