NewsBlur (mirror of https://github.com/samuelclay/NewsBlur.git)
Fixing feedfinder BeautifulSoup issues, unicode issues in feed fetching, and unicode comparison issues with page_data (since it wasn't being decompressed when compared).
parent 6edbf67a50
commit 2d6343b7ae
7 changed files with 41 additions and 28 deletions
@@ -529,18 +529,17 @@ class Feed(models.Model):
         def _1():
             feed_address = None
             feed = self
+            found_feed_urls = []
             try:
-                is_feed = feedfinder.isFeed(self.feed_address)
+                logging.debug(" ---> Checking: %s" % self.feed_address)
+                found_feed_urls = feedfinder.find_feeds(self.feed_address)
+                feed_address = found_feed_urls[0]
             except KeyError:
                 is_feed = False
-            if not is_feed:
-                feed_address = feedfinder.feed(self.feed_address)
-                if not feed_address and self.feed_link:
-                    feed_address = feedfinder.feed(self.feed_link)
-            else:
-                feed_address_from_link = feedfinder.feed(self.feed_link)
-                if feed_address_from_link != self.feed_address:
-                    feed_address = feed_address_from_link
+            if not len(found_feed_urls) and self.feed_link:
+                found_feed_urls = feedfinder.find_feeds(self.feed_link)
+                if len(found_feed_urls) and found_feed_urls[0] != self.feed_address:
+                    feed_address = found_feed_urls[0]

             if feed_address:
                 if (feed_address.endswith('feedburner.com/atom.xml') or
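
This hunk swaps the old feedfinder isFeed()/feed() probing for feedfinder2-style find_feeds(), which returns a list of candidate feed URLs in one pass. A minimal sketch of the same lookup-with-fallback flow, assuming the feedfinder2 package (NewsBlur imports its own vendored copy as feedfinder; the resolve_feed_address helper below is hypothetical, not NewsBlur code):

from feedfinder2 import find_feeds  # assumes the feedfinder2 package is importable

def resolve_feed_address(feed_address, feed_link=None):
    # Probe the stored feed address first; find_feeds() returns a list of
    # candidate feed URLs, best match first.
    found = find_feeds(feed_address)
    # Fall back to the site link when the address itself yields nothing.
    if not found and feed_link:
        found = find_feeds(feed_link)
    return found[0] if found else None
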
@@ -608,7 +607,6 @@ class Feed(models.Model):
         self.save()

     def count_errors_in_history(self, exception_type='feed', status_code=None, fetch_history=None):
-        logging.debug(' ---> [%-30s] Counting errors in history...' % (unicode(self)[:30]))
         if not fetch_history:
             fetch_history = MFetchHistory.feed(self.pk)
         fh = fetch_history[exception_type + '_fetch_history']

@@ -633,6 +631,9 @@ class Feed(models.Model):
             self.has_page_exception = False
             self.save()

+        logging.debug(' ---> [%-30s] ~FBCounting any errors in history: %s (%s non errors)' %
+                      (unicode(self)[:30], len(errors), len(non_errors)))
+
         return errors, non_errors

     def count_redirects_in_history(self, fetch_type='feed', fetch_history=None):
@@ -1869,9 +1870,12 @@ class MFeedPage(mongo.Document):

     def save(self, *args, **kwargs):
         if self.page_data:
-            self.page_data = zlib.compress(self.page_data)
+            self.page_data = zlib.compress(self.page_data).decode('utf-8')
         return super(MFeedPage, self).save(*args, **kwargs)
+
+    def page(self):
+        return zlib.decompress(self.page_data)

     @classmethod
     def get_data(cls, feed_id):
         data = None
@@ -98,16 +98,16 @@ class PageImporter(object):
                     logging.debug(' ***> [%-30s] Page fetch failed using requests: %s' % (self.feed, e))
                     self.save_no_page()
                     return
-                try:
-                    data = response.text
-                except (LookupError, TypeError):
-                    data = response.content
+                # try:
+                data = response.content
+                # except (LookupError, TypeError):
+                #     data = response.content

-                if response.encoding and response.encoding != 'utf-8':
-                    try:
-                        data = data.encode(response.encoding)
-                    except LookupError:
-                        pass
+                # if response.encoding and response.encoding != 'utf-8':
+                #     try:
+                #         data = data.encode(response.encoding)
+                #     except LookupError:
+                #         pass
         else:
             try:
                 data = open(feed_link, 'r').read()
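
The page importer now keeps response.content (raw bytes) and comments out both the response.text fallback and the explicit re-encoding block; response.text decodes with whatever encoding requests guessed, which appears to be the source of the unicode issues the commit message mentions. A hedged sketch of the difference, using only the public requests API (the URL is just a placeholder):

import requests

response = requests.get('http://example.com/')  # placeholder URL

raw = response.content   # exact bytes as sent by the server
guessed = response.text  # bytes decoded using response.encoding, which may be a guess

# When a decoded string is really needed, decode explicitly; fall back to
# apparent_encoding (charset detection) when no encoding was declared.
encoding = response.encoding or response.apparent_encoding
page = response.content.decode(encoding, errors='replace')
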
@@ -270,8 +270,12 @@ class PageImporter(object):
         if not saved:
             try:
                 feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
-                feed_page.page_data = html
-                feed_page.save()
+                # feed_page.page_data = html.encode('utf-8')
+                if feed_page.page() == html:
+                    logging.debug(' ---> [%-30s] ~FYNo change in page data: %s' % (self.feed.title[:30], self.feed.feed_link))
+                else:
+                    feed_page.page_data = html
+                    feed_page.save()
             except MFeedPage.DoesNotExist:
                 feed_page = MFeedPage.objects.create(feed_id=self.feed.pk, page_data=html)
         return feed_page
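
Together with the new MFeedPage.page() helper above, this comparison is the "wasn't being decompressed when compared" fix from the commit message: page_data is stored zlib-compressed, so comparing the stored blob directly against freshly fetched HTML can never match. A standalone sketch of the round trip (plain zlib, not NewsBlur code):

import zlib

def store(html_bytes):
    # The document model stores the page compressed.
    return zlib.compress(html_bytes)

def unchanged(stored_blob, fresh_html_bytes):
    # Compare the decompressed payload, never the compressed blob itself.
    return zlib.decompress(stored_blob) == fresh_html_bytes

blob = store(b'<html>hello</html>')
assert unchanged(blob, b'<html>hello</html>')
assert not unchanged(blob, b'<html>changed</html>')
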
@@ -30,6 +30,10 @@ class TaskFeeds(Task):
         now_timestamp = int(now.strftime("%s"))
         queued_feeds = r.zrangebyscore('scheduled_updates', 0, now_timestamp)
         r.zremrangebyscore('scheduled_updates', 0, now_timestamp)
+        if not queued_feeds:
+            logging.debug(" ---> ~SN~FB~BMNo feeds to queue! Exiting...")
+            return
+
         r.sadd('queued_feeds', *queued_feeds)
         logging.debug(" ---> ~SN~FBQueuing ~SB%s~SN stale feeds (~SB%s~SN/~FG%s~FB~SN/%s tasked/queued/scheduled)" % (
             len(queued_feeds),
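
The added guard matters because an empty result from zrangebyscore() would make r.sadd('queued_feeds') be called with no members, which the Redis server rejects. A hedged sketch of the same schedule-to-queue pattern with redis-py (key names copied from the hunk, connection details assumed):

import time
import redis  # assumes the redis-py client and a reachable Redis instance

r = redis.Redis()
now_timestamp = int(time.time())

# Members of the sorted set are scored by their scheduled fetch time, so a
# range query up to "now" returns everything that is due, and the matching
# zremrangebyscore keeps them from being picked up twice.
queued_feeds = r.zrangebyscore('scheduled_updates', 0, now_timestamp)
r.zremrangebyscore('scheduled_updates', 0, now_timestamp)

if queued_feeds:
    # Only enqueue when something is due; SADD with zero members is an error.
    r.sadd('queued_feeds', *queued_feeds)
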
@@ -396,14 +396,14 @@ def exception_change_feed_link(request):
     if not feed.known_good and (feed.has_page_exception or feed.has_feed_exception):
         # Fix broken feed
         logging.user(request, "~FRFixing feed exception by link: ~SB%s~SN to ~SB%s" % (feed.feed_link, feed_link))
-        feed_address = feedfinder.feed(feed_link)
-        if feed_address:
+        found_feed_urls = feedfinder.find_feeds(feed_link)
+        if len(found_feed_urls):
             code = 1
             feed.has_page_exception = False
             feed.active = True
             feed.fetched_once = False
             feed.feed_link = feed_link
-            feed.feed_address = feed_address
+            feed.feed_address = found_feed_urls[0]
             duplicate_feed = feed.schedule_feed_fetch_immediately()
             if duplicate_feed:
                 new_feed = Feed.objects.get(pk=duplicate_feed.pk)
@@ -1,6 +1,7 @@
 [program:celerybeat]
 command=/srv/newsblur/manage.py celerybeat --schedule=/srv/newsblur/data/celerybeat-schedule.db --loglevel=INFO
 directory=/srv/newsblur
+environment=PATH="/srv/newsblur/venv/bin"
 user=sclay
 numprocs=1
 stdout_logfile=/var/log/celerybeat.log
@@ -131,7 +131,7 @@ class FetchFeed:
                 headers['If-Modified-Since'] = modified_header
             raw_feed = requests.get(address, headers=headers)
             if raw_feed.text:
-                self.fpf = feedparser.parse(raw_feed.text)
+                self.fpf = feedparser.parse(raw_feed.content)
         except Exception, e:
             logging.debug(" ---> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.title[:30], e))

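
Feeding feedparser the raw bytes (raw_feed.content) lets it read the encoding declared in the XML prolog itself, instead of parsing text that requests may already have decoded with the wrong charset. A small sketch using the public requests and feedparser APIs (the feed URL is a placeholder):

import feedparser
import requests

raw_feed = requests.get('http://example.com/feed.xml')  # placeholder URL

# feedparser.parse() accepts a bytes payload and honors the
# <?xml version="1.0" encoding="..."?> declaration on its own.
parsed = feedparser.parse(raw_feed.content)
for entry in parsed.entries:
    print(entry.get('title'))
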
@@ -85,7 +85,7 @@ def find_feeds(url, check_all=False, user_agent=None):
     logging.info("Looking for <link> tags.")
     tree = BeautifulSoup(text)
     links = []
-    for link in tree.find_all("link"):
+    for link in tree.findAll("link"):
         if link.get("type") in ["application/rss+xml",
                                 "text/xml",
                                 "application/atom+xml",

@@ -102,7 +102,7 @@ def find_feeds(url, check_all=False, user_agent=None):
     # Look for <a> tags.
     logging.info("Looking for <a> tags.")
     local, remote = [], []
-    for a in tree.find_all("a"):
+    for a in tree.findAll("a"):
         href = a.get("href", None)
         if href is None:
             continue
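
find_all() is the BeautifulSoup 4 spelling; the BeautifulSoup available to NewsBlur at this point only knows the 3.x findAll(), which appears to be the "feedfinder beautifulsoup issue" in the commit title. A version-tolerant sketch (standalone, assuming either bs4 or the old BeautifulSoup module is importable):

try:
    from bs4 import BeautifulSoup            # BeautifulSoup 4: find_all(), with findAll kept as an alias
except ImportError:
    from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3: findAll() only

def all_tags(tree, name):
    # Prefer findAll(), which exists in both 3.x and (as an alias) 4.x.
    finder = getattr(tree, 'findAll', None) or getattr(tree, 'find_all')
    return finder(name)

html = '<link rel="alternate" type="application/rss+xml" href="/rss">'
tree = BeautifulSoup(html)
for link in all_tags(tree, 'link'):
    print(link.get('type'))
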