#!/usr/bin/env python
# -*- coding: utf-8 -*-
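"""Find feed URLs (RSS, Atom, RDF, or JSON Feed) for a web page: check
whether the page itself is a feed, then look at <link> tags, feed-looking
<a> links, and finally a handful of commonly used feed paths."""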

__version__ = "0.0.3"

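# __FEEDFINDER2_SETUP__ may be defined externally (e.g. by setup.py) before
# this module is imported, so that __version__ can be read without the
# runtime dependencies below being installed.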
try:
    __FEEDFINDER2_SETUP__
except NameError:
    __FEEDFINDER2_SETUP__ = False

if not __FEEDFINDER2_SETUP__:
    __all__ = ["find_feeds"]

    import logging
    import requests
    from bs4 import BeautifulSoup
    from six.moves.urllib import parse as urlparse


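# Normalize a user-supplied URL: strip whitespace, rewrite feed:// to http://,
# and default to http:// when no scheme is given.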
def coerce_url(url):
    url = url.strip()
    if url.startswith("feed://"):
        return "http://{0}".format(url[7:])
    for proto in ["http://", "https://"]:
        if url.startswith(proto):
            return url
    return "http://{0}".format(url)


class FeedFinder(object):

    def __init__(self, user_agent=None):
        if user_agent is None:
            user_agent = "NewsBlur Feed Finder"
        self.user_agent = user_agent

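    # Fetch ``url`` and return the response body, or None if the request
    # raises.  A 403 response is retried once without the User-Agent header,
    # presumably for servers that reject unfamiliar agents.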
    def get_feed(self, url, skip_user_agent=False):
        try:
            r = requests.get(
                url,
                headers={"User-Agent": self.user_agent if not skip_user_agent else None},
                timeout=15,
            )
        except Exception as e:
            logging.warning("Error while getting '{0}'".format(url))
            logging.warning("{0}".format(e))
            return None
        if not skip_user_agent and r.status_code == 403:
            return self.get_feed(url, skip_user_agent=True)
        return r.text

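    # Treat the text as a feed if it is not an HTML document and it mentions
    # at least one feed marker (<rss, <rdf, <feed, or jsonfeed.org); callers
    # rely on the truthiness of the returned count.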
    def is_feed_data(self, text):
        data = text.lower()
        if data and data[:100].count("<html"):
            return False
        return (data.count("<rss") + data.count("<rdf")
                + data.count("<feed") + data.count("jsonfeed.org"))

    def is_feed(self, url):
        text = self.get_feed(url)
        if text is None:
            return False
        return self.is_feed_data(text)

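    # is_feed_url requires a feed-style file extension; is_feedlike_url only
    # requires a feed keyword to appear somewhere in the URL.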
    def is_feed_url(self, url):
        return any(map(url.lower().endswith,
                       [".rss", ".rdf", ".xml", ".atom", ".json"]))

    def is_feedlike_url(self, url):
        return any(map(url.lower().count,
                       ["rss", "rdf", "xml", "atom", "feed", "json"]))


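# Public entry point: return candidate feed URLs for ``url``, best guesses
# first.  With check_all=True, keep searching later stages even after feeds
# have already been found.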
def find_feeds(url, check_all=False, user_agent=None):
    finder = FeedFinder(user_agent=user_agent)

    # Format the URL properly.
    url = coerce_url(url)

    # Download the requested URL.
    feed_text = finder.get_feed(url)
    if feed_text is None:
        return []

    # Check if it is already a feed.
    if finder.is_feed_data(feed_text):
        return [url]

    # Look for <link> tags.
    logging.info("Looking for <link> tags.")
    try:
        tree = BeautifulSoup(feed_text, features="lxml")
    except ValueError:
        return []
    links = []
    for link in tree.findAll("link"):
        if link.get("type") in ["application/rss+xml",
                                "text/xml",
                                "application/atom+xml",
                                "application/x.atom+xml",
                                "application/x-atom+xml",
                                "application/json"]:
            links.append(urlparse.urljoin(url, link.get("href", "")))

    # Check the detected links.
    urls = list(filter(finder.is_feed, links))
    logging.info("Found {0} feed <link> tags.".format(len(urls)))
    if len(urls) and not check_all:
        return sort_urls(urls)

    # Look for <a> tags.
    logging.info("Looking for <a> tags.")
    local, remote = [], []
    for a in tree.findAll("a"):
        href = a.get("href", None)
        if href is None:
            continue
        if "://" not in href and finder.is_feed_url(href):
            local.append(href)
        if finder.is_feedlike_url(href):
            remote.append(href)

    # Check the local URLs.
    local = [urlparse.urljoin(url, l) for l in local]
    urls += list(filter(finder.is_feed, local))
    logging.info("Found {0} local <a> links to feeds.".format(len(urls)))
    if len(urls) and not check_all:
        return sort_urls(urls)

    # Check the remote URLs.
    remote = [urlparse.urljoin(url, l) for l in remote]
    urls += list(filter(finder.is_feed, remote))
    logging.info("Found {0} remote <a> links to feeds.".format(len(urls)))
    if len(urls) and not check_all:
        return sort_urls(urls)

    # Guess potential feed URLs from common paths.
    fns = ["atom.xml", "index.atom", "index.rdf", "rss.xml", "index.xml",
           "index.rss", "index.json"]
    urls += list(filter(finder.is_feed, [urlparse.urljoin(url, f)
                                         for f in fns]))
    return sort_urls(urls)


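# Heuristic score for ranking candidate feed URLs: comment feeds and GeoRSS
# are demoted, otherwise the URL scores by the highest-priority keyword in
# ``kw`` that it contains (earlier keywords score higher).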
def url_feed_prob(url):
    if "comments" in url:
        return -2
    if "georss" in url:
        return -1
    kw = ["atom", "rss", "rdf", ".xml", "feed", "json"]
    for p, t in zip(list(range(len(kw), 0, -1)), kw):
        if t in url:
            return p
    return 0


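# De-duplicate candidate feeds and order them highest-scoring first.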
def sort_urls(feeds):
    return sorted(list(set(feeds)), key=url_feed_prob, reverse=True)


if __name__ == "__main__":
    print(find_feeds("www.preposterousuniverse.com/blog/"))
    print(find_feeds("http://xkcd.com"))
    print(find_feeds("dan.iel.fm/atom.xml"))
    print(find_feeds("dan.iel.fm", check_all=True))
    print(find_feeds("kapadia.github.io"))
    print(find_feeds("blog.jonathansick.ca"))
    print(find_feeds("asdasd"))