Mirror of https://github.com/samuelclay/NewsBlur.git (synced 2025-08-31 21:41:33 +00:00).
Commit: Adding support for JSON Feeds.
This commit is contained in:
parent
6c24e09a78
commit
c1834703d9
4 changed files with 89 additions and 10 deletions
|
@ -1,4 +1,5 @@
|
|||
import difflib
|
||||
import requests
|
||||
import datetime
|
||||
import time
|
||||
import random
|
||||
|
@ -462,6 +463,12 @@ class Feed(models.Model):
|
|||
feed = cls.objects.create(feed_address=url)
|
||||
feed = feed.update(requesting_user_id=user.pk if user else None)
|
||||
|
||||
# Check for JSON feed
|
||||
if not feed and fetch and create:
|
||||
r = requests.get(url)
|
||||
if 'application/json' in r.headers.get('Content-Type'):
|
||||
feed = cls.objects.create(feed_address=url)
|
||||
feed = feed.update()
|
||||
|
||||
# Still nothing? Maybe the URL has some clues.
|
||||
if not feed and fetch and len(found_feed_urls):
|
||||
|
@ -1103,7 +1110,7 @@ class Feed(models.Model):
|
|||
if getattr(settings, 'TEST_DEBUG', False):
|
||||
print " ---> Testing feed fetch: %s" % self.log_title
|
||||
options['force'] = False
|
||||
options['force_fp'] = True
|
||||
# options['force_fp'] = True # No, why would this be needed?
|
||||
original_feed_address = self.feed_address
|
||||
original_feed_link = self.feed_link
|
||||
self.feed_address = self.feed_address.replace("%(NEWSBLUR_DIR)s", settings.NEWSBLUR_DIR)
|
||||
|
|
|
@ -34,6 +34,7 @@ from django.utils.encoding import smart_unicode
|
|||
from utils import json_functions as json
|
||||
from celery.exceptions import SoftTimeLimitExceeded
|
||||
from utils.twitter_fetcher import TwitterFetcher
|
||||
from utils.json_fetcher import JSONFetcher
|
||||
# from utils.feed_functions import mail_feed_error_to_admin
|
||||
|
||||
|
||||
|
@ -63,7 +64,7 @@ class FetchFeed:
|
|||
datetime.datetime.now() - self.feed.last_update)
|
||||
logging.debug(log_msg)
|
||||
|
||||
etag=self.feed.etag
|
||||
etag = self.feed.etag
|
||||
modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
|
||||
address = self.feed.feed_address
|
||||
|
||||
|
@ -126,7 +127,16 @@ class FetchFeed:
|
|||
if raw_feed.status_code >= 400:
|
||||
logging.debug(" ***> [%-30s] ~FRFeed fetch was %s status code, trying fake user agent: %s" % (self.feed.log_title[:30], raw_feed.status_code, raw_feed.headers))
|
||||
raw_feed = requests.get(address, headers=self.feed.fetch_headers(fake=True))
|
||||
if raw_feed.content and raw_feed.status_code < 400:
|
||||
|
||||
if raw_feed.content and 'application/json' in raw_feed.headers.get('Content-Type', ""):
|
||||
# JSON Feed
|
||||
json_feed = self.fetch_json_feed(address, raw_feed)
|
||||
if not json_feed:
|
||||
logging.debug(u' ***> [%-30s] ~FRJSON fetch failed: %s' %
|
||||
(self.feed.log_title[:30], address))
|
||||
return FEED_ERRHTTP, None
|
||||
self.fpf = feedparser.parse(json_feed)
|
||||
elif raw_feed.content and raw_feed.status_code < 400:
|
||||
response_headers = raw_feed.headers
|
||||
response_headers['Content-Location'] = raw_feed.url
|
||||
self.raw_feed = smart_unicode(raw_feed.content)
|
||||
|
@ -176,6 +186,10 @@ class FetchFeed:
|
|||
twitter_fetcher = TwitterFetcher(self.feed, self.options)
|
||||
return twitter_fetcher.fetch(address)
|
||||
|
||||
def fetch_json_feed(self, address, headers):
    # Delegate JSON Feed parsing to the dedicated fetcher.  NOTE(review):
    # despite the parameter name, the visible caller passes the full
    # requests response here (JSONFetcher reads ``.content`` from it) —
    # confirm before relying on it being a headers mapping.
    return JSONFetcher(self.feed, self.options).fetch(address, headers)
|
||||
|
||||
def fetch_youtube(self, address):
|
||||
username = None
|
||||
channel_id = None
|
||||
|
@ -659,6 +673,7 @@ class Dispatcher:
|
|||
|
||||
ffeed = FetchFeed(feed_id, self.options)
|
||||
ret_feed, fetched_feed = ffeed.fetch()
|
||||
|
||||
feed_fetch_duration = time.time() - start_duration
|
||||
raw_feed = ffeed.raw_feed
|
||||
|
||||
|
|
|
@ -51,7 +51,7 @@ class FeedFinder(object):
|
|||
data = text.lower()
|
||||
if data and data[:100].count("<html"):
|
||||
return False
|
||||
return data.count("<rss")+data.count("<rdf")+data.count("<feed")
|
||||
return data.count("<rss")+data.count("<rdf")+data.count("<feed")+data.count("jsonfeed.org")
|
||||
|
||||
def is_feed(self, url):
|
||||
text = self.get_feed(url)
|
||||
|
@ -61,11 +61,11 @@ class FeedFinder(object):
|
|||
|
||||
def is_feed_url(self, url):
    # A URL "looks like" a feed when it ends with a known feed extension.
    # str.endswith accepts a tuple, so one call covers every suffix.
    return url.lower().endswith((".rss", ".rdf", ".xml", ".atom", ".json"))
|
||||
|
||||
def is_feedlike_url(self, url):
    # Weaker heuristic than is_feed_url: any feed-ish keyword anywhere
    # in the URL counts, not just as a trailing extension.
    lowered = url.lower()
    return any(keyword in lowered
               for keyword in ("rss", "rdf", "xml", "atom", "feed", "json"))
|
||||
|
||||
|
||||
def find_feeds(url, check_all=False, user_agent=None):
|
||||
|
@ -92,7 +92,8 @@ def find_feeds(url, check_all=False, user_agent=None):
|
|||
"text/xml",
|
||||
"application/atom+xml",
|
||||
"application/x.atom+xml",
|
||||
"application/x-atom+xml"]:
|
||||
"application/x-atom+xml",
|
||||
"application/json"]:
|
||||
links.append(urlparse.urljoin(url, link.get("href", "")))
|
||||
|
||||
# Check the detected links.
|
||||
|
@ -129,7 +130,7 @@ def find_feeds(url, check_all=False, user_agent=None):
|
|||
|
||||
# Guessing potential URLs.
|
||||
fns = ["atom.xml", "index.atom", "index.rdf", "rss.xml", "index.xml",
|
||||
"index.rss"]
|
||||
"index.rss", "index.json"]
|
||||
urls += list(filter(finder.is_feed, [urlparse.urljoin(url, f)
|
||||
for f in fns]))
|
||||
return sort_urls(urls)
|
||||
|
@ -140,7 +141,7 @@ def url_feed_prob(url):
|
|||
return -2
|
||||
if "georss" in url:
|
||||
return -1
|
||||
kw = ["atom", "rss", "rdf", ".xml", "feed"]
|
||||
kw = ["atom", "rss", "rdf", ".xml", "feed", "json"]
|
||||
for p, t in zip(range(len(kw), 0, -1), kw):
|
||||
if t in url:
|
||||
return p
|
||||
|
|
56
utils/json_fetcher.py
Normal file
56
utils/json_fetcher.py
Normal file
|
@ -0,0 +1,56 @@
|
|||
import datetime
|
||||
import dateutil.parser
|
||||
from django.conf import settings
|
||||
from django.utils import feedgenerator
|
||||
from utils import log as logging
|
||||
from utils.json_functions import decode
|
||||
|
||||
class JSONFetcher:
    """Fetch a JSON Feed (https://jsonfeed.org) and convert it to Atom XML.

    The caller hands us an already-downloaded response; we decode its JSON
    body and rebuild it as an Atom document string so the existing
    feedparser-based pipeline can consume it unchanged.
    """

    def __init__(self, feed, options=None):
        # feed: the owning Feed model instance (used for address/log title).
        # options: optional fetch options dict; defaults to empty.
        self.feed = feed
        self.options = options or {}

    def fetch(self, address, raw_feed):
        """Return an Atom XML string built from ``raw_feed``.

        address  -- feed URL, for logging; falls back to the Feed's address.
        raw_feed -- object exposing ``.content`` with the JSON body
                    (in practice a requests response — the visible caller
                    passes one).
        Returns None when the body cannot be decoded as JSON.
        """
        if not address:
            address = self.feed.feed_address

        json_feed = decode(raw_feed.content)
        if not json_feed:
            logging.debug(u' ***> [%-30s] ~FRJSON fetch failed: %s' %
                          (self.feed.log_title[:30], address))
            return

        data = {}
        data['title'] = json_feed.get('title', '[Untitled]')
        data['link'] = json_feed.get('home_page_url', None)
        # BUG FIX: JSON Feed v1 defines a top-level "description" field;
        # the original code mistakenly reused the title here.
        data['description'] = json_feed.get('description', "")
        data['lastBuildDate'] = datetime.datetime.utcnow()
        data['generator'] = 'NewsBlur JSON Feed - %s' % settings.NEWSBLUR_URL
        data['docs'] = None
        data['feed_url'] = json_feed.get('feed_url')

        rss = feedgenerator.Atom1Feed(**data)

        for item in json_feed.get('items', []):
            story_data = self.json_feed_story(item)
            rss.add_item(**story_data)

        return rss.writeString('utf-8')

    def json_feed_story(self, item):
        """Map one JSON Feed item dict onto feedgenerator ``add_item`` kwargs.

        Missing fields become None/empty; a missing or unparseable
        ``date_published`` falls back to the current time.
        """
        date_published = datetime.datetime.now()
        pubdate = item.get('date_published', None)
        if pubdate:
            try:
                date_published = dateutil.parser.parse(pubdate)
            except (ValueError, OverflowError):
                # Malformed date in one item: keep the "now" fallback
                # rather than failing the whole feed fetch.
                pass
        story = {
            'title': item.get('title', None),
            'link': item.get('url', None),
            'description': item.get('content_html', item.get('content_text', None)),
            # "author" may be present but null in the JSON — guard before .get().
            'author_name': (item.get('author') or {}).get('name', None),
            'categories': item.get('tags', []),
            'unique_id': item.get('id', item.get('url', None)),
            'pubdate': date_published,
        }

        return story
|
Loading…
Add table
Reference in a new issue