From 2d6343b7ae01416ec14cc4dcca250c1661446c06 Mon Sep 17 00:00:00 2001
From: Samuel Clay
Date: Fri, 5 Feb 2016 14:43:31 -0800
Subject: [PATCH] Fixing feedfinder beautifulsoup issues, unicode issues in
 feed, and unicode comparison issues with page_data (since it wasn't being
 decompressed when compared).

---
 apps/rss_feeds/models.py          | 26 +++++++++++++++-----------
 apps/rss_feeds/page_importer.py   | 26 +++++++++++++++-----------
 apps/rss_feeds/tasks.py           |  4 ++++
 apps/rss_feeds/views.py           |  6 +++---
 config/supervisor_celerybeat.conf |  1 +
 utils/feed_fetcher.py             |  2 +-
 utils/feedfinder2.py              |  4 ++--
 7 files changed, 41 insertions(+), 28 deletions(-)

diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py
index 5bcfa9dc0..7ec296e14 100644
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -529,18 +529,17 @@ class Feed(models.Model):
         def _1():
             feed_address = None
             feed = self
+            found_feed_urls = []
             try:
-                is_feed = feedfinder.isFeed(self.feed_address)
+                logging.debug(" ---> Checking: %s" % self.feed_address)
+                found_feed_urls = feedfinder.find_feeds(self.feed_address)
+                feed_address = found_feed_urls[0]
             except KeyError:
                 is_feed = False
-            if not is_feed:
-                feed_address = feedfinder.feed(self.feed_address)
-                if not feed_address and self.feed_link:
-                    feed_address = feedfinder.feed(self.feed_link)
-            else:
-                feed_address_from_link = feedfinder.feed(self.feed_link)
-                if feed_address_from_link != self.feed_address:
-                    feed_address = feed_address_from_link
+            if not len(found_feed_urls) and self.feed_link:
+                found_feed_urls = feedfinder.find_feeds(self.feed_link)
+            if len(found_feed_urls) and found_feed_urls[0] != self.feed_address:
+                feed_address = found_feed_urls[0]
 
             if feed_address:
                 if (feed_address.endswith('feedburner.com/atom.xml') or
@@ -608,7 +607,6 @@ class Feed(models.Model):
         self.save()
 
     def count_errors_in_history(self, exception_type='feed', status_code=None, fetch_history=None):
-        logging.debug(' ---> [%-30s] Counting errors in history...' % (unicode(self)[:30]))
         if not fetch_history:
             fetch_history = MFetchHistory.feed(self.pk)
         fh = fetch_history[exception_type + '_fetch_history']
@@ -633,6 +631,9 @@ class Feed(models.Model):
                 self.has_page_exception = False
                 self.save()
 
+        logging.debug(' ---> [%-30s] ~FBCounting any errors in history: %s (%s non errors)' %
+                      (unicode(self)[:30], len(errors), len(non_errors)))
+
         return errors, non_errors
 
     def count_redirects_in_history(self, fetch_type='feed', fetch_history=None):
@@ -1869,9 +1870,12 @@ class MFeedPage(mongo.Document):
     def save(self, *args, **kwargs):
         if self.page_data:
-            self.page_data = zlib.compress(self.page_data)
+            self.page_data = zlib.compress(self.page_data).decode('utf-8')
 
         return super(MFeedPage, self).save(*args, **kwargs)
 
+    def page(self):
+        return zlib.decompress(self.page_data)
+
     @classmethod
     def get_data(cls, feed_id):
         data = None
diff --git a/apps/rss_feeds/page_importer.py b/apps/rss_feeds/page_importer.py
index 943c749f8..d4519a834 100644
--- a/apps/rss_feeds/page_importer.py
+++ b/apps/rss_feeds/page_importer.py
@@ -98,16 +98,16 @@ class PageImporter(object):
                 logging.debug(' ***> [%-30s] Page fetch failed using requests: %s' % (self.feed, e))
                 self.save_no_page()
                 return
-            try:
-                data = response.text
-            except (LookupError, TypeError):
-                data = response.content
+            # try:
+            data = response.content
+            # except (LookupError, TypeError):
+            #     data = response.content
 
-            if response.encoding and response.encoding != 'utf-8':
-                try:
-                    data = data.encode(response.encoding)
-                except LookupError:
-                    pass
+            # if response.encoding and response.encoding != 'utf-8':
+            #     try:
+            #         data = data.encode(response.encoding)
+            #     except LookupError:
+            #         pass
         else:
             try:
                 data = open(feed_link, 'r').read()
@@ -270,8 +270,12 @@ class PageImporter(object):
         if not saved:
             try:
                 feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
-                feed_page.page_data = html
-                feed_page.save()
+                # feed_page.page_data = html.encode('utf-8')
+                if feed_page.page() == html:
+                    logging.debug(' ---> [%-30s] ~FYNo change in page data: %s' % (self.feed.title[:30], self.feed.feed_link))
+                else:
+                    feed_page.page_data = html
+                    feed_page.save()
             except MFeedPage.DoesNotExist:
                 feed_page = MFeedPage.objects.create(feed_id=self.feed.pk, page_data=html)
         return feed_page
diff --git a/apps/rss_feeds/tasks.py b/apps/rss_feeds/tasks.py
index b541d2abc..78a3d0805 100644
--- a/apps/rss_feeds/tasks.py
+++ b/apps/rss_feeds/tasks.py
@@ -30,6 +30,10 @@ class TaskFeeds(Task):
         now_timestamp = int(now.strftime("%s"))
         queued_feeds = r.zrangebyscore('scheduled_updates', 0, now_timestamp)
         r.zremrangebyscore('scheduled_updates', 0, now_timestamp)
+        if not queued_feeds:
+            logging.debug(" ---> ~SN~FB~BMNo feeds to queue! Exiting...")
+            return
+
         r.sadd('queued_feeds', *queued_feeds)
         logging.debug(" ---> ~SN~FBQueuing ~SB%s~SN stale feeds (~SB%s~SN/~FG%s~FB~SN/%s tasked/queued/scheduled)" % (
             len(queued_feeds),
diff --git a/apps/rss_feeds/views.py b/apps/rss_feeds/views.py
index fe2d2f5cd..a8188335e 100644
--- a/apps/rss_feeds/views.py
+++ b/apps/rss_feeds/views.py
@@ -396,14 +396,14 @@ def exception_change_feed_link(request):
     if not feed.known_good and (feed.has_page_exception or feed.has_feed_exception):
         # Fix broken feed
         logging.user(request, "~FRFixing feed exception by link: ~SB%s~SN to ~SB%s" % (feed.feed_link, feed_link))
-        feed_address = feedfinder.feed(feed_link)
-        if feed_address:
+        found_feed_urls = feedfinder.find_feeds(feed_link)
+        if len(found_feed_urls):
             code = 1
             feed.has_page_exception = False
             feed.active = True
             feed.fetched_once = False
             feed.feed_link = feed_link
-            feed.feed_address = feed_address
+            feed.feed_address = found_feed_urls[0]
             duplicate_feed = feed.schedule_feed_fetch_immediately()
             if duplicate_feed:
                 new_feed = Feed.objects.get(pk=duplicate_feed.pk)
diff --git a/config/supervisor_celerybeat.conf b/config/supervisor_celerybeat.conf
index 4200d224c..72b95f735 100644
--- a/config/supervisor_celerybeat.conf
+++ b/config/supervisor_celerybeat.conf
@@ -1,6 +1,7 @@
 [program:celerybeat]
 command=/srv/newsblur/manage.py celerybeat --schedule=/srv/newsblur/data/celerybeat-schedule.db --loglevel=INFO
 directory=/srv/newsblur
+environment=PATH="/srv/newsblur/venv/bin"
 user=sclay
 numprocs=1
 stdout_logfile=/var/log/celerybeat.log
diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py
index 988c6ccdf..24a24074f 100644
--- a/utils/feed_fetcher.py
+++ b/utils/feed_fetcher.py
@@ -131,7 +131,7 @@ class FetchFeed:
                     headers['If-Modified-Since'] = modified_header
                 raw_feed = requests.get(address, headers=headers)
                 if raw_feed.text:
-                    self.fpf = feedparser.parse(raw_feed.text)
+                    self.fpf = feedparser.parse(raw_feed.content)
             except Exception, e:
                 logging.debug(" ---> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" %
                               (self.feed.title[:30], e))
diff --git a/utils/feedfinder2.py b/utils/feedfinder2.py
index 2d8d75760..bc50feaeb 100755
--- a/utils/feedfinder2.py
+++ b/utils/feedfinder2.py
@@ -85,7 +85,7 @@ def find_feeds(url, check_all=False, user_agent=None):
     logging.info("Looking for <link> tags.")
     tree = BeautifulSoup(text)
     links = []
-    for link in tree.find_all("link"):
+    for link in tree.findAll("link"):
        if link.get("type") in ["application/rss+xml",
                                 "text/xml",
                                 "application/atom+xml",
@@ -102,7 +102,7 @@ def find_feeds(url, check_all=False, user_agent=None):
     # Look for <a> tags.
     logging.info("Looking for <a> tags.")
     local, remote = [], []
-    for a in tree.find_all("a"):
+    for a in tree.findAll("a"):
         href = a.get("href", None)
         if href is None:
             continue