diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py
index 5bcfa9dc0..7ec296e14 100644
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -529,18 +529,18 @@ class Feed(models.Model):
def _1():
feed_address = None
feed = self
+ found_feed_urls = []
try:
- is_feed = feedfinder.isFeed(self.feed_address)
+ logging.debug(" ---> Checking: %s" % self.feed_address)
+ found_feed_urls = feedfinder.find_feeds(self.feed_address)
+ if found_feed_urls:
+     feed_address = found_feed_urls[0]
except KeyError:
- is_feed = False
+ pass
- if not is_feed:
- feed_address = feedfinder.feed(self.feed_address)
- if not feed_address and self.feed_link:
- feed_address = feedfinder.feed(self.feed_link)
- else:
- feed_address_from_link = feedfinder.feed(self.feed_link)
- if feed_address_from_link != self.feed_address:
- feed_address = feed_address_from_link
+ if not found_feed_urls and self.feed_link:
+     found_feed_urls = feedfinder.find_feeds(self.feed_link)
+     if found_feed_urls and found_feed_urls[0] != self.feed_address:
+         feed_address = found_feed_urls[0]
if feed_address:
if (feed_address.endswith('feedburner.com/atom.xml') or
@@ -608,7 +607,6 @@ class Feed(models.Model):
self.save()
def count_errors_in_history(self, exception_type='feed', status_code=None, fetch_history=None):
- logging.debug(' ---> [%-30s] Counting errors in history...' % (unicode(self)[:30]))
if not fetch_history:
fetch_history = MFetchHistory.feed(self.pk)
fh = fetch_history[exception_type + '_fetch_history']
@@ -633,6 +631,9 @@ class Feed(models.Model):
self.has_page_exception = False
self.save()
+ logging.debug(' ---> [%-30s] ~FBCounting any errors in history: %s (%s non errors)' %
+ (unicode(self)[:30], len(errors), len(non_errors)))
+
return errors, non_errors
def count_redirects_in_history(self, fetch_type='feed', fetch_history=None):
@@ -1869,9 +1870,12 @@ class MFeedPage(mongo.Document):
def save(self, *args, **kwargs):
if self.page_data:
self.page_data = zlib.compress(self.page_data)
return super(MFeedPage, self).save(*args, **kwargs)
+ def page(self):
+ return zlib.decompress(self.page_data)
+
@classmethod
def get_data(cls, feed_id):
data = None
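Note on the `MFeedPage.save()` change above: `zlib.compress()` returns raw DEFLATE bytes, which will almost never decode as UTF-8, so the blob has to be stored as-is for the new `page()` helper to decompress it. A minimal round-trip sketch (standalone illustration, not NewsBlur code):

```python
import zlib

html = b"<html>...</html>"

# zlib output is opaque binary: store it untouched, decompress on read.
page_data = zlib.compress(html)
assert zlib.decompress(page_data) == html
```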
diff --git a/apps/rss_feeds/page_importer.py b/apps/rss_feeds/page_importer.py
index 943c749f8..d4519a834 100644
--- a/apps/rss_feeds/page_importer.py
+++ b/apps/rss_feeds/page_importer.py
@@ -98,16 +98,8 @@ class PageImporter(object):
logging.debug(' ***> [%-30s] Page fetch failed using requests: %s' % (self.feed, e))
self.save_no_page()
return
- try:
-     data = response.text
- except (LookupError, TypeError):
-     data = response.content
+ data = response.content
- if response.encoding and response.encoding != 'utf-8':
-     try:
-         data = data.encode(response.encoding)
-     except LookupError:
-         pass
else:
try:
data = open(feed_link, 'r').read()
@@ -270,8 +270,11 @@
if not saved:
try:
feed_page = MFeedPage.objects.get(feed_id=self.feed.pk)
- feed_page.page_data = html
- feed_page.save()
+ if feed_page.page() == html:
+ logging.debug(' ---> [%-30s] ~FYNo change in page data: %s' % (self.feed.title[:30], self.feed.feed_link))
+ else:
+ feed_page.page_data = html
+ feed_page.save()
except MFeedPage.DoesNotExist:
feed_page = MFeedPage.objects.create(feed_id=self.feed.pk, page_data=html)
return feed_page
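The save path above now decompresses the stored copy and skips the Mongo write when the fetched HTML is unchanged. A sketch of that idea, with hypothetical stand-ins for `MFeedPage.page_data` and the fetched document:

```python
import zlib

def store_page(stored_blob, html):
    """Return the blob to keep, compressing only when the page changed.

    `stored_blob` and `html` are hypothetical stand-ins; this is a sketch
    of the skip-if-unchanged logic, not the NewsBlur implementation.
    """
    if stored_blob is not None and zlib.decompress(stored_blob) == html:
        return stored_blob  # unchanged: skip the compress + save round-trip
    return zlib.compress(html)
```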
diff --git a/apps/rss_feeds/tasks.py b/apps/rss_feeds/tasks.py
index b541d2abc..78a3d0805 100644
--- a/apps/rss_feeds/tasks.py
+++ b/apps/rss_feeds/tasks.py
@@ -30,6 +30,10 @@ class TaskFeeds(Task):
now_timestamp = int(now.strftime("%s"))
queued_feeds = r.zrangebyscore('scheduled_updates', 0, now_timestamp)
r.zremrangebyscore('scheduled_updates', 0, now_timestamp)
+ if not queued_feeds:
+ logging.debug(" ---> ~SN~FB~BMNo feeds to queue! Exiting...")
+ return
+
r.sadd('queued_feeds', *queued_feeds)
logging.debug(" ---> ~SN~FBQueuing ~SB%s~SN stale feeds (~SB%s~SN/~FG%s~FB~SN/%s tasked/queued/scheduled)" % (
len(queued_feeds),
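The early return added above matters because Redis refuses `SADD` with zero members, so draining an empty `scheduled_updates` window would make `r.sadd('queued_feeds', *queued_feeds)` raise. A minimal sketch of the drain-and-queue step, assuming a local redis-py client:

```python
import time

import redis

r = redis.Redis()  # assumed local instance, for illustration only

now_timestamp = int(time.time())
queued_feeds = r.zrangebyscore('scheduled_updates', 0, now_timestamp)
r.zremrangebyscore('scheduled_updates', 0, now_timestamp)

if queued_feeds:
    # Only call SADD when there is at least one member to add.
    r.sadd('queued_feeds', *queued_feeds)
```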
diff --git a/apps/rss_feeds/views.py b/apps/rss_feeds/views.py
index fe2d2f5cd..a8188335e 100644
--- a/apps/rss_feeds/views.py
+++ b/apps/rss_feeds/views.py
@@ -396,14 +396,14 @@ def exception_change_feed_link(request):
if not feed.known_good and (feed.has_page_exception or feed.has_feed_exception):
# Fix broken feed
logging.user(request, "~FRFixing feed exception by link: ~SB%s~SN to ~SB%s" % (feed.feed_link, feed_link))
- feed_address = feedfinder.feed(feed_link)
- if feed_address:
+ found_feed_urls = feedfinder.find_feeds(feed_link)
+ if found_feed_urls:
code = 1
feed.has_page_exception = False
feed.active = True
feed.fetched_once = False
feed.feed_link = feed_link
- feed.feed_address = feed_address
+ feed.feed_address = found_feed_urls[0]
duplicate_feed = feed.schedule_feed_fetch_immediately()
if duplicate_feed:
new_feed = Feed.objects.get(pk=duplicate_feed.pk)
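Both this view and the `models.py` change rely on the feedfinder2 contract: `find_feeds()` returns a list of zero or more candidate feed URLs, where the old `feedfinder.feed()` returned a single URL or `None`. A usage sketch (the URL and import alias are illustrative):

```python
from utils import feedfinder2 as feedfinder  # assumed import alias

found_feed_urls = feedfinder.find_feeds("http://example.com/blog/")
if found_feed_urls:
    feed_address = found_feed_urls[0]  # first candidate wins
else:
    feed_address = None  # nothing discovered at this URL
```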
diff --git a/config/supervisor_celerybeat.conf b/config/supervisor_celerybeat.conf
index 4200d224c..72b95f735 100644
--- a/config/supervisor_celerybeat.conf
+++ b/config/supervisor_celerybeat.conf
@@ -1,6 +1,7 @@
[program:celerybeat]
command=/srv/newsblur/manage.py celerybeat --schedule=/srv/newsblur/data/celerybeat-schedule.db --loglevel=INFO
directory=/srv/newsblur
+environment=PATH="/srv/newsblur/venv/bin"
user=sclay
numprocs=1
stdout_logfile=/var/log/celerybeat.log
diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py
index 988c6ccdf..24a24074f 100644
--- a/utils/feed_fetcher.py
+++ b/utils/feed_fetcher.py
@@ -131,7 +131,7 @@ class FetchFeed:
headers['If-Modified-Since'] = modified_header
raw_feed = requests.get(address, headers=headers)
if raw_feed.text:
- self.fpf = feedparser.parse(raw_feed.text)
+ self.fpf = feedparser.parse(raw_feed.content)
except Exception, e:
logging.debug(" ---> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.title[:30], e))
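Passing `raw_feed.content` instead of `raw_feed.text` hands feedparser the undecoded bytes, so it can sniff the encoding itself (XML declaration, BOM) rather than trust requests' guess. A sketch with an illustrative URL:

```python
import feedparser
import requests

raw_feed = requests.get("http://example.com/rss.xml")

# .content is the raw byte payload; letting feedparser decode it avoids
# the mojibake a wrong requests-side decode in .text can introduce.
fpf = feedparser.parse(raw_feed.content)
```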
diff --git a/utils/feedfinder2.py b/utils/feedfinder2.py
index 2d8d75760..bc50feaeb 100755
--- a/utils/feedfinder2.py
+++ b/utils/feedfinder2.py
@@ -85,7 +85,7 @@ def find_feeds(url, check_all=False, user_agent=None):
logging.info("Looking for tags.")
tree = BeautifulSoup(text)
links = []
- for link in tree.find_all("link"):
+ for link in tree.findAll("link"):
if link.get("type") in ["application/rss+xml",
"text/xml",
"application/atom+xml",
@@ -102,7 +102,7 @@ def find_feeds(url, check_all=False, user_agent=None):
# Look for tags.
logging.info("Looking for tags.")
local, remote = [], []
- for a in tree.find_all("a"):
+ for a in tree.findAll("a"):
href = a.get("href", None)
if href is None:
continue