#!/usr/bin/env python
# -*- coding: utf-8 -*-

__version__ = "0.0.3"

# Allow setup tooling to import the version without pulling in the dependencies.
try:
    __FEEDFINDER2_SETUP__
except NameError:
    __FEEDFINDER2_SETUP__ = False

if not __FEEDFINDER2_SETUP__:
    __all__ = ["find_feeds"]

    import logging

    import requests
    from bs4 import BeautifulSoup
    from six.moves.urllib import parse as urlparse


def coerce_url(url):
    # Normalize the URL: strip whitespace, rewrite feed:// and default to http://.
    url = url.strip()
    if url.startswith("feed://"):
        return "http://{0}".format(url[7:])
    for proto in ["http://", "https://"]:
        if url.startswith(proto):
            return url
    return "http://{0}".format(url)


class FeedFinder(object):
    def __init__(self, user_agent=None):
        if user_agent is None:
            user_agent = "NewsBlur Feed Finder"
        self.user_agent = user_agent

    def get_feed(self, url, skip_user_agent=False):
        try:
            r = requests.get(
                url,
                headers={"User-Agent": self.user_agent if not skip_user_agent else None},
                timeout=15,
            )
        except Exception as e:
            logging.warning("Error while getting '{0}'".format(url))
            logging.warning("{0}".format(e))
            return None
        # Some servers reject the custom User-Agent; retry once without it.
        if not skip_user_agent and r.status_code in [403, 204]:
            return self.get_feed(url, skip_user_agent=True)
        return r.text

    def is_feed_data(self, text):
        data = text.lower()
        # A document that opens with <html> is a page, not a feed.
        if data and data[:100].count("<html"):
            return False
        # Count feed markers: RSS, RDF, Atom, and JSON Feed (jsonfeed.org).
        return data.count("<rss") + data.count("<rdf") + data.count("<feed") + data.count("jsonfeed.org")

    def is_feed(self, url):
        text = self.get_feed(url)
        if text is None:
            return False
        return self.is_feed_data(text)

    def is_feed_url(self, url):
        return any(map(url.lower().endswith, [".rss", ".rdf", ".xml", ".atom", ".json"]))

    def is_feedlike_url(self, url):
        return any(map(url.lower().count, ["rss", "rdf", "xml", "atom", "feed", "json"]))


def find_feeds(url, check_all=False, user_agent=None):
    finder = FeedFinder(user_agent=user_agent)

    # Format the URL properly.
    url = coerce_url(url)

    # Download the requested URL.
    feed_text = finder.get_feed(url)
    if feed_text is None:
        return []

    # Check if it is already a feed.
    if finder.is_feed_data(feed_text):
        return [url]

    # Look for <link> tags.
    logging.info("Looking for <link> tags.")
    try:
        tree = BeautifulSoup(feed_text, features="lxml")
    except ValueError:
        return []
    links = []
    for link in tree.findAll("link"):
        if link.get("type") in [
            "application/rss+xml",
            "text/xml",
            "application/atom+xml",
            "application/x.atom+xml",
            "application/x-atom+xml",
            "application/json",
        ]:
            links.append(urlparse.urljoin(url, link.get("href", "")))

    # Check the detected links.
    urls = list(filter(finder.is_feed, links))
    logging.info("Found {0} feed <link> tags.".format(len(urls)))
    if len(urls) and not check_all:
        return sort_urls(urls)

    # Look for <a> tags.
    logging.info("Looking for <a> tags.")
    local, remote = [], []
    for a in tree.findAll("a"):
        href = a.get("href", None)
        if href is None:
            continue
        if "://" not in href and finder.is_feed_url(href):
            local.append(href)
        if finder.is_feedlike_url(href):
            remote.append(href)

    # Check the local URLs.
    local = [urlparse.urljoin(url, l) for l in local]
    urls += list(filter(finder.is_feed, local))
    logging.info("Found {0} local <a> links to feeds.".format(len(urls)))
    if len(urls) and not check_all:
        return sort_urls(urls)

    # Check the remote URLs.
    remote = [urlparse.urljoin(url, l) for l in remote]
    urls += list(filter(finder.is_feed, remote))
    logging.info("Found {0} remote <a> links to feeds.".format(len(urls)))
    if len(urls) and not check_all:
        return sort_urls(urls)

    # Guessing potential URLs.
    if not any(ignored_domain in url for ignored_domain in ["openrss", "feedburner"]):
        fns = ["atom.xml", "index.atom", "index.rdf", "rss.xml", "index.xml", "index.rss", "index.json"]
        urls += list(filter(finder.is_feed, [urlparse.urljoin(url, f) for f in fns]))

    return sort_urls(urls)


def url_feed_prob(url):
    # Rank candidate URLs: penalize comment and georss feeds, prefer earlier keywords.
    if "comments" in url:
        return -2
    if "georss" in url:
        return -1
    kw = ["atom", "rss", "rdf", ".xml", "feed", "json"]
    for p, t in zip(list(range(len(kw), 0, -1)), kw):
        if t in url:
            return p
    return 0


def sort_urls(feeds):
    return sorted(list(set(feeds)), key=url_feed_prob, reverse=True)


if __name__ == "__main__":
    print(find_feeds("www.preposterousuniverse.com/blog/"))
    print(find_feeds("http://xkcd.com"))
    print(find_feeds("dan.iel.fm/atom.xml"))
    print(find_feeds("dan.iel.fm", check_all=True))
    print(find_feeds("kapadia.github.io"))
    print(find_feeds("blog.jonathansick.ca"))
    print(find_feeds("asdasd"))