Fixing feedfinder BeautifulSoup issues, unicode issues in feeds, and unicode comparison issues with page_data (since it wasn't being decompressed when compared).

Samuel Clay 2016-02-05 14:43:31 -08:00
parent 6edbf67a50
commit 2d6343b7ae
7 changed files with 41 additions and 28 deletions

View file

@@ -529,18 +529,17 @@ class Feed(models.Model):
def _1():
feed_address = None
feed = self
found_feed_urls = []
try:
is_feed = feedfinder.isFeed(self.feed_address)
logging.debug(" ---> Checking: %s" % self.feed_address)
found_feed_urls = feedfinder.find_feeds(self.feed_address)
feed_address = found_feed_urls[0]
except KeyError:
is_feed = False
if not is_feed:
feed_address = feedfinder.feed(self.feed_address)
if not feed_address and self.feed_link:
feed_address = feedfinder.feed(self.feed_link)
else:
feed_address_from_link = feedfinder.feed(self.feed_link)
if feed_address_from_link != self.feed_address:
feed_address = feed_address_from_link
if not len(found_feed_urls) and self.feed_link:
found_feed_urls = feedfinder.find_feeds(self.feed_link)
if len(found_feed_urls) and found_feed_urls[0] != self.feed_address:
feed_address = found_feed_urls[0]
if feed_address:
if (feed_address.endswith('feedburner.com/atom.xml') or
@@ -608,7 +607,6 @@ class Feed(models.Model):
self.save()
def count_errors_in_history(self, exception_type='feed', status_code=None, fetch_history=None):
logging.debug(' ---> [%-30s] Counting errors in history...' % (unicode(self)[:30]))
if not fetch_history:
fetch_history = MFetchHistory.feed(self.pk)
fh = fetch_history[exception_type + '_fetch_history']
@@ -633,6 +631,9 @@ class Feed(models.Model):
self.has_page_exception = False
self.save()
logging.debug(' ---> [%-30s] ~FBCounting any errors in history: %s (%s non errors)' %
(unicode(self)[:30], len(errors), len(non_errors)))
return errors, non_errors
def count_redirects_in_history(self, fetch_type='feed', fetch_history=None):
@@ -1869,9 +1870,12 @@ class MFeedPage(mongo.Document):
def save(self, *args, **kwargs):
if self.page_data:
self.page_data = zlib.compress(self.page_data)
self.page_data = zlib.compress(self.page_data).decode('utf-8')
return super(MFeedPage, self).save(*args, **kwargs)
def page(self):
return zlib.decompress(self.page_data)
@classmethod
def get_data(cls, feed_id):
data = None
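
In rough terms, the old isFeed()/feed() calls collapse into a single find_feeds() lookup that returns a list of candidate URLs, tried first on the feed address and then on the site link. A minimal sketch of that flow outside the model, assuming the vendored feedfinder exposes find_feeds() the way the feedfinder2 package does (the helper name is illustrative, not NewsBlur's):

    from feedfinder2 import find_feeds  # assumption: same API as the vendored module

    def lookup_feed_address(feed_address, feed_link=None):
        # Sketch: try the stored address first, then the site link as a fallback.
        new_address = None
        found_feed_urls = []
        try:
            found_feed_urls = find_feeds(feed_address)
            if found_feed_urls:
                new_address = found_feed_urls[0]
        except KeyError:
            pass
        if not found_feed_urls and feed_link:
            found_feed_urls = find_feeds(feed_link)
            # Only report an address that actually differs from the one on record.
            if found_feed_urls and found_feed_urls[0] != feed_address:
                new_address = found_feed_urls[0]
        return new_address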

View file

@@ -98,16 +98,16 @@ class PageImporter(object):
logging.debug(' ***> [%-30s] Page fetch failed using requests: %s' % (self.feed, e))
self.save_no_page()
return
try:
data = response.text
except (LookupError, TypeError):
data = response.content
# try:
data = response.content
# except (LookupError, TypeError):
# data = response.content
if response.encoding and response.encoding != 'utf-8':
try:
data = data.encode(response.encoding)
except LookupError:
pass
# if response.encoding and response.encoding != 'utf-8':
# try:
# data = data.encode(response.encoding)
# except LookupError:
# pass
else:
try:
data = open(feed_link, 'r').read()
@@ -270,8 +270,12 @@ class PageImporter(object):
if not saved:
try:
feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
feed_page.page_data = html
feed_page.save()
# feed_page.page_data = html.encode('utf-8')
if feed_page.page() == html:
logging.debug(' ---> [%-30s] ~FYNo change in page data: %s' % (self.feed.title[:30], self.feed.feed_link))
else:
feed_page.page_data = html
feed_page.save()
except MFeedPage.DoesNotExist:
feed_page = MFeedPage.objects.create(feed_id=self.feed.pk, page_data=html)
return feed_page
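
The comparison above only works because page() decompresses the stored blob first; comparing the raw page_data field against freshly fetched HTML would always look like a change. A minimal sketch of the idea with plain zlib (the literals are placeholders):

    import zlib

    incoming_html = b"<html>...</html>"              # what the importer just fetched
    stored_page_data = zlib.compress(incoming_html)  # what MFeedPage keeps after save()

    # Compressed bytes never equal the raw HTML, so this comparison is useless:
    assert stored_page_data != incoming_html

    # Decompressing first, as page() does, makes the no-change check meaningful:
    assert zlib.decompress(stored_page_data) == incoming_html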

View file

@@ -30,6 +30,10 @@ class TaskFeeds(Task):
now_timestamp = int(now.strftime("%s"))
queued_feeds = r.zrangebyscore('scheduled_updates', 0, now_timestamp)
r.zremrangebyscore('scheduled_updates', 0, now_timestamp)
if not queued_feeds:
logging.debug(" ---> ~SN~FB~BMNo feeds to queue! Exiting...")
return
r.sadd('queued_feeds', *queued_feeds)
logging.debug(" ---> ~SN~FBQueuing ~SB%s~SN stale feeds (~SB%s~SN/~FG%s~FB~SN/%s tasked/queued/scheduled)" % (
len(queued_feeds),
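
The early return exists because the following sadd unpacks queued_feeds: with an empty result the client would issue SADD with no members, which Redis rejects. A minimal sketch of the guard with redis-py (key names come from the diff, the connection is illustrative):

    import time
    import redis

    def queue_stale_feeds(r):
        # Pull everything that is due, but skip the unpacked sadd when
        # nothing is due, since SADD requires at least one member.
        now_timestamp = int(time.time())
        queued_feeds = r.zrangebyscore('scheduled_updates', 0, now_timestamp)
        r.zremrangebyscore('scheduled_updates', 0, now_timestamp)
        if not queued_feeds:
            return []
        r.sadd('queued_feeds', *queued_feeds)
        return queued_feeds

    queue_stale_feeds(redis.Redis())  # illustrative usage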

View file

@@ -396,14 +396,14 @@ def exception_change_feed_link(request):
if not feed.known_good and (feed.has_page_exception or feed.has_feed_exception):
# Fix broken feed
logging.user(request, "~FRFixing feed exception by link: ~SB%s~SN to ~SB%s" % (feed.feed_link, feed_link))
feed_address = feedfinder.feed(feed_link)
if feed_address:
found_feed_urls = feedfinder.find_feeds(feed_link)
if len(found_feed_urls):
code = 1
feed.has_page_exception = False
feed.active = True
feed.fetched_once = False
feed.feed_link = feed_link
feed.feed_address = feed_address
feed.feed_address = found_feed_urls[0]
duplicate_feed = feed.schedule_feed_fetch_immediately()
if duplicate_feed:
new_feed = Feed.objects.get(pk=duplicate_feed.pk)

View file

@@ -1,6 +1,7 @@
[program:celerybeat]
command=/srv/newsblur/manage.py celerybeat --schedule=/srv/newsblur/data/celerybeat-schedule.db --loglevel=INFO
directory=/srv/newsblur
environment=PATH="/srv/newsblur/venv/bin"
user=sclay
numprocs=1
stdout_logfile=/var/log/celerybeat.log

View file

@@ -131,7 +131,7 @@ class FetchFeed:
headers['If-Modified-Since'] = modified_header
raw_feed = requests.get(address, headers=headers)
if raw_feed.text:
self.fpf = feedparser.parse(raw_feed.text)
self.fpf = feedparser.parse(raw_feed.content)
except Exception, e:
logging.debug(" ---> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.title[:30], e))

View file

@@ -85,7 +85,7 @@ def find_feeds(url, check_all=False, user_agent=None):
logging.info("Looking for <link> tags.")
tree = BeautifulSoup(text)
links = []
for link in tree.find_all("link"):
for link in tree.findAll("link"):
if link.get("type") in ["application/rss+xml",
"text/xml",
"application/atom+xml",
@@ -102,7 +102,7 @@ def find_feeds(url, check_all=False, user_agent=None):
# Look for <a> tags.
logging.info("Looking for <a> tags.")
local, remote = [], []
for a in tree.find_all("a"):
for a in tree.findAll("a"):
href = a.get("href", None)
if href is None:
continue
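
findAll is the BeautifulSoup 3 spelling; find_all only exists in BeautifulSoup 4, where findAll is kept as an alias, so the camelCase name works with either. A minimal sketch against the 3.x import path, assuming that is the version installed alongside this vendored feedfinder:

    from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3.x import path (assumption)

    html = '<link rel="alternate" type="application/rss+xml" href="/rss">'
    tree = BeautifulSoup(html)

    # BeautifulSoup 3 has no find_all(); findAll() works on both 3.x and 4.x.
    feed_links = [link.get("href") for link in tree.findAll("link")
                  if link.get("type") in ("application/rss+xml",
                                          "text/xml",
                                          "application/atom+xml")]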