mirror of
https://github.com/samuelclay/NewsBlur.git
synced 2025-08-05 16:58:59 +00:00
78 lines
No EOL
2.6 KiB
Python
78 lines
No EOL
2.6 KiB
Python
# Originally adapted from IanLewis / dlife
|
|
|
|
from django.shortcuts import get_list_or_404
|
|
from apps.rss_feeds.models import Feed, Story
|
|
from utils import feedparser
|
|
from utils.dateutil.parser import parse as dateutil_parse
|
|
from django.utils.http import urlquote
|
|
from django.db.models import Q
|
|
|
|
import time
|
|
|
|
class FeedInjest(object):
    '''
    Pulls the entries of a single feed with feedparser and stores them
    as Story rows.

    The methods are independent steps: update() fetches the entries,
    pre_process() normalises one entry in place, include_story() checks
    whether a matching story is already stored for this feed, and
    save_story() writes one entry to the database.
    '''

    # Feed object this instance works on; replaced per instance in __init__.
    feed = None

    def __init__(self, feed):
        '''
        Remember the feed whose entries will be fetched and stored.
        '''
        self.feed = feed

    def update(self):
        '''
        Parse the feed found at self.feed.feed_address and return its
        entries as a new list.

        As a side effect the feed's feed_last_update attribute is set to
        the current time.time() value and the feed is saved.
        '''
        feed_items = feedparser.parse(self.feed.feed_address)
        stories = list(feed_items['entries'])
        self.feed.feed_last_update = time.time()
        self.feed.save()
        return stories

    @staticmethod
    def _story_content(story):
        '''
        Return the body text of an entry, or None if it has none.

        The 'value' of the first item of story['content'] is used when
        that list is present and non-empty; otherwise story['summary']
        is returned (None when that is missing too).
        '''
        contents = story.get('content')
        # A truthiness test (rather than "is not None") also covers an
        # empty content list, which would otherwise fail on [0].
        if contents:
            return contents[0]['value']
        return story.get('summary')

    def save_story(self, story):
        '''
        Create and save a Story for this feed from a single entry.

        The date, title, author and link are read from the entry's
        'published', 'title', 'author' and 'link' keys; missing keys are
        stored as None.
        '''
        story_content = self._story_content(story)
        # Debug output; written so that it parses on Python 2 and Python 3
        # while producing the same text as "print 'Story: ', value".
        print('Story:  %s' % (story_content,))

        s = Story(
            story_feed=self.feed,
            story_date=story.get('published'),
            story_title=story.get('title'),
            story_content=story_content,
            story_author=story.get('author'),
            story_permalink=story.get('link'),
        )
        s.save()

    def include_story(self, entry):
        '''
        Return True when no stored story of this feed has the same
        published date or the same permalink as the entry.

        Reads entry['published'] and entry['link'], so both keys must be
        present.
        '''
        matching = Story.objects.filter(
            Q(story_date=entry['published']) | Q(story_permalink=entry['link'])
        ).filter(
            story_feed=self.feed
        )
        return matching.count() == 0

    @staticmethod
    def _naive_utc(moment):
        '''
        Return the datetime shifted to UTC with its tzinfo removed.

        A datetime that reports no UTC offset is returned unshifted.
        '''
        offset = moment.utcoffset()
        if offset is not None:
            moment = moment - offset
        return moment.replace(tzinfo=None)

    def pre_process(self, entry):
        '''
        A hook is used to clean up feed entry data before it is processed.
        This hook can be used to clean up dates and/or media data
        before being processed.

        The entry is modified in place: entry['published'] becomes a
        datetime without tzinfo, converted to UTC when the parsed date
        carries an offset, and the part of entry['link'] after the
        protocol is passed through urlquote. Nothing is returned.
        '''
        # Imported here because the module itself only imports time.
        import datetime

        # "or" also falls back to 'updated' when 'published' is present
        # but empty.
        raw_date = entry.get('published') or entry.get('updated')
        if raw_date:
            date_published = dateutil_parse(raw_date)
        else:
            # No usable date in the entry: use the current UTC time.
            date_published = datetime.datetime.utcnow()

        # Change the date to UTC and remove timezone info since MySQL doesn't
        # support it.
        entry['published'] = self._naive_utc(date_published)

        link = entry['link']
        protocol_index = link.find("://")
        if protocol_index != -1:
            # Keep "scheme://" untouched and quote only what follows it.
            rest_start = protocol_index + 3
            entry['link'] = link[:rest_start] + urlquote(link[rest_start:])
        else:
            entry['link'] = urlquote(link)