From 88f2a69a93409f45760ac6ac469826ecae0c7f6a Mon Sep 17 00:00:00 2001
From: Samuel Clay
Date: Tue, 6 Aug 2013 13:18:55 -0700
Subject: [PATCH] Fixing a dozen text and feed fetching bugs.

---
 apps/rss_feeds/models.py        |  6 +++++-
 apps/rss_feeds/text_importer.py | 16 ++++++++++++----
 apps/social/models.py           | 10 ++++++----
 fabfile.py                      |  2 +-
 utils/feed_fetch.sh             |  9 ---------
 utils/feed_fetch_silent.sh      |  9 ---------
 utils/feed_fetcher.py           | 13 +++----------
 utils/story_functions.py        |  2 +-
 8 files changed, 28 insertions(+), 39 deletions(-)
 delete mode 100755 utils/feed_fetch.sh
 delete mode 100755 utils/feed_fetch_silent.sh

diff --git a/apps/rss_feeds/models.py b/apps/rss_feeds/models.py
index d75533a91..76fd8344b 100644
--- a/apps/rss_feeds/models.py
+++ b/apps/rss_feeds/models.py
@@ -1831,7 +1831,11 @@ class MStory(mongo.Document):
         if not story_content:
             return
 
-        soup = BeautifulSoup(story_content)
+        try:
+            soup = BeautifulSoup(story_content)
+        except ValueError:
+            return
+
         images = soup.findAll('img')
         if not images:
             return
diff --git a/apps/rss_feeds/text_importer.py b/apps/rss_feeds/text_importer.py
index 0bed8da29..3c4128a9d 100644
--- a/apps/rss_feeds/text_importer.py
+++ b/apps/rss_feeds/text_importer.py
@@ -2,6 +2,7 @@ import requests
 import zlib
 from django.conf import settings
 from socket import error as SocketError
+from mongoengine.queryset import NotUniqueError
 from vendor.readability import readability
 from utils import log as logging
 from utils.feed_functions import timelimit, TimeoutError
@@ -45,15 +46,21 @@ class TextImporter:
         if resp.encoding and resp.encoding != 'utf-8':
             try:
                 text = text.encode(resp.encoding)
-            except LookupError:
+            except (LookupError, UnicodeEncodeError):
                 pass
         original_text_doc = readability.Document(text, url=resp.url,
                                                  debug=settings.DEBUG)
-        content = original_text_doc.summary(html_partial=True)
+        try:
+            content = original_text_doc.summary(html_partial=True)
+        except readability.Unparseable:
+            return
         if content:
             if not skip_save:
                 self.story.original_text_z = zlib.compress(content)
-                self.story.save()
+                try:
+                    self.story.save()
+                except NotUniqueError:
+                    pass
             logging.user(self.request, ("~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
                 len(unicode(content)),
                 self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
@@ -69,7 +76,8 @@ class TextImporter:
     def fetch_request(self):
         try:
             r = requests.get(self.story.story_permalink, headers=self.headers, verify=False)
-        except (AttributeError, SocketError, requests.ConnectionError), e:
+        except (AttributeError, SocketError, requests.ConnectionError,
+                requests.models.MissingSchema, requests.sessions.InvalidSchema), e:
             logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e)
             return
         return r
diff --git a/apps/social/models.py b/apps/social/models.py
index 74359c77e..51dec828f 100644
--- a/apps/social/models.py
+++ b/apps/social/models.py
@@ -1873,10 +1873,11 @@ class MSharedStory(mongo.Document):
             'story_feed': story_feed,
             'mute_url': mute_url,
         }
-        
+        story_title = self.story_title.replace('\n', ' ')
+        
         text = render_to_string('mail/email_reply.txt', data)
         html = pynliner.fromString(render_to_string('mail/email_reply.xhtml', data))
-        subject = "%s replied to you on \"%s\" on NewsBlur" % (reply_user.username, self.story_title)
+        subject = "%s replied to you on \"%s\" on NewsBlur" % (reply_user.username, story_title)
         msg = EmailMultiAlternatives(subject, text,
                                      from_email='NewsBlur <%s>' % settings.HELLO_EMAIL,
                                      to=['%s <%s>' % (user.username, user.email)])
@@ -1936,10 +1937,11 @@ class MSharedStory(mongo.Document):
             'story_feed': story_feed,
             'mute_url': mute_url,
         }
-        
+        story_title = self.story_title.replace('\n', ' ')
+        
         text = render_to_string('mail/email_reshare.txt', data)
         html = pynliner.fromString(render_to_string('mail/email_reshare.xhtml', data))
-        subject = "%s re-shared \"%s\" from you on NewsBlur" % (reshare_user.username, self.story_title)
+        subject = "%s re-shared \"%s\" from you on NewsBlur" % (reshare_user.username, story_title)
         msg = EmailMultiAlternatives(subject, text,
                                      from_email='NewsBlur <%s>' % settings.HELLO_EMAIL,
                                      to=['%s <%s>' % (original_user.username, original_user.email)])
diff --git a/fabfile.py b/fabfile.py
index 15f1a2137..957d788ad 100644
--- a/fabfile.py
+++ b/fabfile.py
@@ -1172,7 +1172,7 @@ def staging_full():
     run('curl -s http://dev.newsblur.com > /dev/null')
     run('curl -s http://dev.newsblur.com/m/ > /dev/null')
 
-@parallel
+# @parallel
 def celery():
     celery_slow()
 
diff --git a/utils/feed_fetch.sh b/utils/feed_fetch.sh
deleted file mode 100755
index 673eaf76e..000000000
--- a/utils/feed_fetch.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/sh
-
-ps aux | grep refresh_feeds | egrep -v grep | awk '{print $2}' | xargs kill > /dev/null 2>&1
-python /home/conesus/newsblur/manage.py refresh_feeds -s &
-python /home/conesus/newsblur/manage.py refresh_feeds -s &
-python /home/conesus/newsblur/manage.py refresh_feeds -s &
-python /home/conesus/newsblur/manage.py refresh_feeds -s &
-python /home/conesus/newsblur/manage.py refresh_feeds -s &
-python /home/conesus/newsblur/manage.py refresh_feeds -s &
diff --git a/utils/feed_fetch_silent.sh b/utils/feed_fetch_silent.sh
deleted file mode 100755
index 1f3aff2a9..000000000
--- a/utils/feed_fetch_silent.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/sh
-
-ps aux | grep refresh_feeds | egrep -v grep | awk '{print $2}' | xargs kill > /dev/null 2>&1
-python /home/conesus/newsblur/manage.py refresh_feeds -s > /dev/null 2>&1 &
-python /home/conesus/newsblur/manage.py refresh_feeds -s > /dev/null 2>&1 &
-python /home/conesus/newsblur/manage.py refresh_feeds -s > /dev/null 2>&1 &
-python /home/conesus/newsblur/manage.py refresh_feeds -s > /dev/null 2>&1 &
-
-
diff --git a/utils/feed_fetcher.py b/utils/feed_fetcher.py
index c98aa6380..3ab2393b4 100644
--- a/utils/feed_fetcher.py
+++ b/utils/feed_fetcher.py
@@ -91,15 +91,10 @@ class FetchFeed:
                                          agent=USER_AGENT,
                                          etag=etag,
                                          modified=modified)
-        except (TypeError, ValueError), e:
-            logging.debug(u'   ***> [%-30s] ~FR%s, turning off microformats.' %
+        except (TypeError, ValueError, KeyError), e:
+            logging.debug(u'   ***> [%-30s] ~FR%s, turning off headers.' %
                           (self.feed.title[:30], e))
-            feedparser.PARSE_MICROFORMATS = False
-            self.fpf = feedparser.parse(address,
-                                        agent=USER_AGENT,
-                                        etag=etag,
-                                        modified=modified)
-            feedparser.PARSE_MICROFORMATS = True
+            self.fpf = feedparser.parse(address, agent=USER_AGENT)
 
         logging.debug(u'   ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (
                        self.feed.title[:30], time.time() - start))
@@ -396,8 +391,6 @@ class Dispatcher:
                     if self.options['verbose']:
                         logging.debug(u'   ---> [%-30s] ~FBTIME: unread count in ~FM%.4ss' % (
                               feed.title[:30], time.time() - start))
-            except KeyboardInterrupt:
-                break
             except urllib2.HTTPError, e:
                 logging.debug('   ---> [%-30s] ~FRFeed throws HTTP error: ~SB%s' % (unicode(feed_id)[:30], e.fp.read()))
                 feed.save_feed_history(e.code, e.msg, e.fp.read())
diff --git a/utils/story_functions.py b/utils/story_functions.py
index 9b22529c4..1441b4018 100644
--- a/utils/story_functions.py
+++ b/utils/story_functions.py
@@ -78,7 +78,7 @@ def pre_process_story(entry):
         entry['guid'] = unicode(entry['guid'])
 
     # Normalize story content/summary
-    summary = entry.get('summary', '')
+    summary = entry.get('summary') or ""
     content = ""
     if not summary and 'summary_detail' in entry:
         summary = entry['summary_detail'].get('value', '')