Adding support for JSON Feeds.

This commit is contained in:
Samuel Clay 2017-05-22 16:46:56 -07:00
parent 6c24e09a78
commit c1834703d9
4 changed files with 89 additions and 10 deletions

View file

@ -1,4 +1,5 @@
import difflib
import requests
import datetime
import time
import random
@ -462,6 +463,12 @@ class Feed(models.Model):
feed = cls.objects.create(feed_address=url)
feed = feed.update(requesting_user_id=user.pk if user else None)
# Check for JSON feed
if not feed and fetch and create:
r = requests.get(url)
if 'application/json' in r.headers.get('Content-Type'):
feed = cls.objects.create(feed_address=url)
feed = feed.update()
# Still nothing? Maybe the URL has some clues.
if not feed and fetch and len(found_feed_urls):
@ -1103,7 +1110,7 @@ class Feed(models.Model):
if getattr(settings, 'TEST_DEBUG', False):
print " ---> Testing feed fetch: %s" % self.log_title
options['force'] = False
options['force_fp'] = True
# options['force_fp'] = True # No, why would this be needed?
original_feed_address = self.feed_address
original_feed_link = self.feed_link
self.feed_address = self.feed_address.replace("%(NEWSBLUR_DIR)s", settings.NEWSBLUR_DIR)

View file

@ -34,6 +34,7 @@ from django.utils.encoding import smart_unicode
from utils import json_functions as json
from celery.exceptions import SoftTimeLimitExceeded
from utils.twitter_fetcher import TwitterFetcher
from utils.json_fetcher import JSONFetcher
# from utils.feed_functions import mail_feed_error_to_admin
@ -63,7 +64,7 @@ class FetchFeed:
datetime.datetime.now() - self.feed.last_update)
logging.debug(log_msg)
etag=self.feed.etag
etag = self.feed.etag
modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
address = self.feed.feed_address
@ -126,7 +127,16 @@ class FetchFeed:
if raw_feed.status_code >= 400:
logging.debug(" ***> [%-30s] ~FRFeed fetch was %s status code, trying fake user agent: %s" % (self.feed.log_title[:30], raw_feed.status_code, raw_feed.headers))
raw_feed = requests.get(address, headers=self.feed.fetch_headers(fake=True))
if raw_feed.content and raw_feed.status_code < 400:
if raw_feed.content and 'application/json' in raw_feed.headers.get('Content-Type', ""):
# JSON Feed
json_feed = self.fetch_json_feed(address, raw_feed)
if not json_feed:
logging.debug(u' ***> [%-30s] ~FRJSON fetch failed: %s' %
(self.feed.log_title[:30], address))
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(json_feed)
elif raw_feed.content and raw_feed.status_code < 400:
response_headers = raw_feed.headers
response_headers['Content-Location'] = raw_feed.url
self.raw_feed = smart_unicode(raw_feed.content)
@ -176,6 +186,10 @@ class FetchFeed:
twitter_fetcher = TwitterFetcher(self.feed, self.options)
return twitter_fetcher.fetch(address)
def fetch_json_feed(self, address, headers):
    """Hand the response off to a JSONFetcher, which converts the JSON
    Feed body into an Atom string (or None on failure)."""
    return JSONFetcher(self.feed, self.options).fetch(address, headers)
def fetch_youtube(self, address):
username = None
channel_id = None
@ -659,6 +673,7 @@ class Dispatcher:
ffeed = FetchFeed(feed_id, self.options)
ret_feed, fetched_feed = ffeed.fetch()
feed_fetch_duration = time.time() - start_duration
raw_feed = ffeed.raw_feed

View file

@ -51,7 +51,7 @@ class FeedFinder(object):
data = text.lower()
if data and data[:100].count("<html"):
return False
return data.count("<rss")+data.count("<rdf")+data.count("<feed")
return data.count("<rss")+data.count("<rdf")+data.count("<feed")+data.count("jsonfeed.org")
def is_feed(self, url):
text = self.get_feed(url)
@ -61,11 +61,11 @@ class FeedFinder(object):
def is_feed_url(self, url):
return any(map(url.lower().endswith,
[".rss", ".rdf", ".xml", ".atom"]))
[".rss", ".rdf", ".xml", ".atom", ".json"]))
def is_feedlike_url(self, url):
return any(map(url.lower().count,
["rss", "rdf", "xml", "atom", "feed"]))
["rss", "rdf", "xml", "atom", "feed", "json"]))
def find_feeds(url, check_all=False, user_agent=None):
@ -92,7 +92,8 @@ def find_feeds(url, check_all=False, user_agent=None):
"text/xml",
"application/atom+xml",
"application/x.atom+xml",
"application/x-atom+xml"]:
"application/x-atom+xml",
"application/json"]:
links.append(urlparse.urljoin(url, link.get("href", "")))
# Check the detected links.
@ -129,7 +130,7 @@ def find_feeds(url, check_all=False, user_agent=None):
# Guessing potential URLs.
fns = ["atom.xml", "index.atom", "index.rdf", "rss.xml", "index.xml",
"index.rss"]
"index.rss", "index.json"]
urls += list(filter(finder.is_feed, [urlparse.urljoin(url, f)
for f in fns]))
return sort_urls(urls)
@ -140,7 +141,7 @@ def url_feed_prob(url):
return -2
if "georss" in url:
return -1
kw = ["atom", "rss", "rdf", ".xml", "feed"]
kw = ["atom", "rss", "rdf", ".xml", "feed", "json"]
for p, t in zip(range(len(kw), 0, -1), kw):
if t in url:
return p

56
utils/json_fetcher.py Normal file
View file

@ -0,0 +1,56 @@
import datetime
import dateutil.parser
from django.conf import settings
from django.utils import feedgenerator
from utils import log as logging
from utils.json_functions import decode
class JSONFetcher:
    """Converts a JSON Feed (https://jsonfeed.org) HTTP response into an
    Atom XML string so the rest of the fetch pipeline can hand it to
    feedparser like any other feed."""

    def __init__(self, feed, options=None):
        # feed: the Feed model instance being fetched.
        # options: optional dict of fetch options (defaults to empty).
        self.feed = feed
        self.options = options or {}

    def fetch(self, address, raw_feed):
        """Decode ``raw_feed`` (a requests Response whose body is JSON Feed
        data) and return an Atom document string.

        Returns None when the body cannot be decoded as JSON, after logging
        the failure.
        """
        if not address:
            address = self.feed.feed_address

        json_feed = decode(raw_feed.content)
        if not json_feed:
            logging.debug(u' ***> [%-30s] ~FRJSON fetch failed: %s' %
                          (self.feed.log_title[:30], address))
            return

        data = {}
        data['title'] = json_feed.get('title', '[Untitled]')
        data['link'] = json_feed.get('home_page_url', None)
        # JSON Feed v1 has its own `description` field; the original code
        # mistakenly reused `title` here. Fall back to the title so feeds
        # without a description keep the old behavior.
        data['description'] = json_feed.get('description', json_feed.get('title', ""))
        data['lastBuildDate'] = datetime.datetime.utcnow()
        data['generator'] = 'NewsBlur JSON Feed - %s' % settings.NEWSBLUR_URL
        data['docs'] = None
        data['feed_url'] = json_feed.get('feed_url')
        rss = feedgenerator.Atom1Feed(**data)

        # `items` can be present-but-null in malformed feeds; `.get`'s
        # default only covers a *missing* key, so guard explicitly.
        for item in json_feed.get('items') or []:
            story_data = self.json_feed_story(item)
            rss.add_item(**story_data)

        return rss.writeString('utf-8')

    def json_feed_story(self, item):
        """Map one JSON Feed item dict onto ``feedgenerator.add_item``
        keyword arguments."""
        date_published = datetime.datetime.now()
        pubdate = item.get('date_published', None)
        if pubdate:
            date_published = dateutil.parser.parse(pubdate)
        # `author` may be present but null; the old `.get('author', {})`
        # crashed with AttributeError in that case.
        author = item.get('author') or {}
        story = {
            'title': item.get('title', None),
            'link': item.get('url', None),
            'description': item.get('content_html', item.get('content_text', None)),
            'author_name': author.get('name', None),
            'categories': item.get('tags', []),
            'unique_id': item.get('id', item.get('url', None)),
            'pubdate': date_published,
        }
        return story