diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py index 0940de73c..b6edad48d 100644 --- a/apps/rss_feeds/models.py +++ b/apps/rss_feeds/models.py @@ -1,4 +1,5 @@ import difflib +import requests import datetime import time import random @@ -462,6 +463,12 @@ class Feed(models.Model): feed = cls.objects.create(feed_address=url) feed = feed.update(requesting_user_id=user.pk if user else None) + # Check for JSON feed + if not feed and fetch and create: + r = requests.get(url) + if 'application/json' in r.headers.get('Content-Type'): + feed = cls.objects.create(feed_address=url) + feed = feed.update() # Still nothing? Maybe the URL has some clues. if not feed and fetch and len(found_feed_urls): @@ -1103,7 +1110,7 @@ class Feed(models.Model): if getattr(settings, 'TEST_DEBUG', False): print " ---> Testing feed fetch: %s" % self.log_title options['force'] = False - options['force_fp'] = True + # options['force_fp'] = True # No, why would this be needed? original_feed_address = self.feed_address original_feed_link = self.feed_link self.feed_address = self.feed_address.replace("%(NEWSBLUR_DIR)s", settings.NEWSBLUR_DIR) diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py index 7ecef86bd..21246f0a9 100644 --- a/utils/feed_fetcher.py +++ b/utils/feed_fetcher.py @@ -34,6 +34,7 @@ from django.utils.encoding import smart_unicode from utils import json_functions as json from celery.exceptions import SoftTimeLimitExceeded from utils.twitter_fetcher import TwitterFetcher +from utils.json_fetcher import JSONFetcher # from utils.feed_functions import mail_feed_error_to_admin @@ -63,7 +64,7 @@ class FetchFeed: datetime.datetime.now() - self.feed.last_update) logging.debug(log_msg) - etag=self.feed.etag + etag = self.feed.etag modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None address = self.feed.feed_address @@ -126,7 +127,16 @@ class FetchFeed: if raw_feed.status_code >= 400: logging.debug(" ***> [%-30s] ~FRFeed fetch was %s status code, trying fake user agent: %s" % (self.feed.log_title[:30], raw_feed.status_code, raw_feed.headers)) raw_feed = requests.get(address, headers=self.feed.fetch_headers(fake=True)) - if raw_feed.content and raw_feed.status_code < 400: + + if raw_feed.content and 'application/json' in raw_feed.headers.get('Content-Type', ""): + # JSON Feed + json_feed = self.fetch_json_feed(address, raw_feed) + if not json_feed: + logging.debug(u' ***> [%-30s] ~FRJSON fetch failed: %s' % + (self.feed.log_title[:30], address)) + return FEED_ERRHTTP, None + self.fpf = feedparser.parse(json_feed) + elif raw_feed.content and raw_feed.status_code < 400: response_headers = raw_feed.headers response_headers['Content-Location'] = raw_feed.url self.raw_feed = smart_unicode(raw_feed.content) @@ -176,6 +186,10 @@ class FetchFeed: twitter_fetcher = TwitterFetcher(self.feed, self.options) return twitter_fetcher.fetch(address) + def fetch_json_feed(self, address, headers): + json_fetcher = JSONFetcher(self.feed, self.options) + return json_fetcher.fetch(address, headers) + def fetch_youtube(self, address): username = None channel_id = None @@ -380,7 +394,7 @@ class ProcessFeed: logging.debug(" ---> [%-30s] ~SB~FRFeed is Non-XML. No feedparser feed either!" % (self.feed.log_title[:30])) self.feed.save_feed_history(551, "Broken feed") return FEED_ERRHTTP, ret_values - + if self.fpf and not self.fpf.entries: if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType): logging.debug(" ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.log_title[:30], len(self.fpf.entries))) @@ -659,6 +673,7 @@ class Dispatcher: ffeed = FetchFeed(feed_id, self.options) ret_feed, fetched_feed = ffeed.fetch() + feed_fetch_duration = time.time() - start_duration raw_feed = ffeed.raw_feed diff --git a/utils/feedfinder2.py b/utils/feedfinder2.py index 87dd649b7..cf897e4a8 100755 --- a/utils/feedfinder2.py +++ b/utils/feedfinder2.py @@ -51,7 +51,7 @@ class FeedFinder(object): data = text.lower() if data and data[:100].count(" [%-30s] ~FRJSON fetch failed: %s' % + (self.feed.log_title[:30], address)) + return + + data = {} + data['title'] = json_feed.get('title', '[Untitled]') + data['link'] = json_feed.get('home_page_url', None) + data['description'] = json_feed.get('title', "") + data['lastBuildDate'] = datetime.datetime.utcnow() + data['generator'] = 'NewsBlur JSON Feed - %s' % settings.NEWSBLUR_URL + data['docs'] = None + data['feed_url'] = json_feed.get('feed_url') + + rss = feedgenerator.Atom1Feed(**data) + + for item in json_feed.get('items', []): + story_data = self.json_feed_story(item) + rss.add_item(**story_data) + + return rss.writeString('utf-8') + + def json_feed_story(self, item): + date_published = datetime.datetime.now() + pubdate = item.get('date_published', None) + if pubdate: + date_published = dateutil.parser.parse(pubdate) + story = { + 'title': item.get('title', None), + 'link': item.get('url', None), + 'description': item.get('content_html', item.get('content_text', None)), + 'author_name': item.get('author', {}).get('name', None), + 'categories': item.get('tags', []), + 'unique_id': item.get('id', item.get('url', None)), + 'pubdate': date_published, + } + + return story \ No newline at end of file