Fixing a dozen text and feed fetching bugs.

This commit is contained in:
Samuel Clay 2013-08-06 13:18:55 -07:00
parent 0d32ae0623
commit 88f2a69a93
8 changed files with 28 additions and 39 deletions

View file

@@ -1831,7 +1831,11 @@ class MStory(mongo.Document):
if not story_content:
return
soup = BeautifulSoup(story_content)
try:
soup = BeautifulSoup(story_content)
except ValueError:
return
images = soup.findAll('img')
if not images:
return

View file

@@ -2,6 +2,7 @@ import requests
import zlib
from django.conf import settings
from socket import error as SocketError
from mongoengine.queryset import NotUniqueError
from vendor.readability import readability
from utils import log as logging
from utils.feed_functions import timelimit, TimeoutError
@@ -45,15 +46,21 @@ class TextImporter:
if resp.encoding and resp.encoding != 'utf-8':
try:
text = text.encode(resp.encoding)
except LookupError:
except (LookupError, UnicodeEncodeError):
pass
original_text_doc = readability.Document(text, url=resp.url, debug=settings.DEBUG)
content = original_text_doc.summary(html_partial=True)
try:
content = original_text_doc.summary(html_partial=True)
except readability.Unparseable:
return
if content:
if not skip_save:
self.story.original_text_z = zlib.compress(content)
self.story.save()
try:
self.story.save()
except NotUniqueError:
pass
logging.user(self.request, ("~SN~FYFetched ~FGoriginal text~FY: now ~SB%s bytes~SN vs. was ~SB%s bytes" % (
len(unicode(content)),
self.story.story_content_z and len(zlib.decompress(self.story.story_content_z))
@@ -69,7 +76,8 @@ class TextImporter:
def fetch_request(self):
try:
r = requests.get(self.story.story_permalink, headers=self.headers, verify=False)
except (AttributeError, SocketError, requests.ConnectionError), e:
except (AttributeError, SocketError, requests.ConnectionError,
requests.models.MissingSchema, requests.sessions.InvalidSchema), e:
logging.user(self.request, "~SN~FRFailed~FY to fetch ~FGoriginal text~FY: %s" % e)
return
return r

View file

@@ -1873,10 +1873,11 @@ class MSharedStory(mongo.Document):
'story_feed': story_feed,
'mute_url': mute_url,
}
story_title = self.story_title.replace('\n', ' ')
text = render_to_string('mail/email_reply.txt', data)
html = pynliner.fromString(render_to_string('mail/email_reply.xhtml', data))
subject = "%s replied to you on \"%s\" on NewsBlur" % (reply_user.username, self.story_title)
subject = "%s replied to you on \"%s\" on NewsBlur" % (reply_user.username, story_title)
msg = EmailMultiAlternatives(subject, text,
from_email='NewsBlur <%s>' % settings.HELLO_EMAIL,
to=['%s <%s>' % (user.username, user.email)])
@@ -1936,10 +1937,11 @@ class MSharedStory(mongo.Document):
'story_feed': story_feed,
'mute_url': mute_url,
}
story_title = self.story_title.replace('\n', ' ')
text = render_to_string('mail/email_reshare.txt', data)
html = pynliner.fromString(render_to_string('mail/email_reshare.xhtml', data))
subject = "%s re-shared \"%s\" from you on NewsBlur" % (reshare_user.username, self.story_title)
subject = "%s re-shared \"%s\" from you on NewsBlur" % (reshare_user.username, story_title)
msg = EmailMultiAlternatives(subject, text,
from_email='NewsBlur <%s>' % settings.HELLO_EMAIL,
to=['%s <%s>' % (original_user.username, original_user.email)])

2
fabfile.py vendored
View file

@@ -1172,7 +1172,7 @@ def staging_full():
run('curl -s http://dev.newsblur.com > /dev/null')
run('curl -s http://dev.newsblur.com/m/ > /dev/null')
@parallel
# @parallel
def celery():
celery_slow()

View file

@@ -1,9 +0,0 @@
#!/bin/sh
ps aux | grep refresh_feeds | egrep -v grep | awk '{print $2}' | xargs kill > /dev/null 2>&1
python /home/conesus/newsblur/manage.py refresh_feeds -s &
python /home/conesus/newsblur/manage.py refresh_feeds -s &
python /home/conesus/newsblur/manage.py refresh_feeds -s &
python /home/conesus/newsblur/manage.py refresh_feeds -s &
python /home/conesus/newsblur/manage.py refresh_feeds -s &
python /home/conesus/newsblur/manage.py refresh_feeds -s &

View file

@@ -1,9 +0,0 @@
#!/bin/sh
ps aux | grep refresh_feeds | egrep -v grep | awk '{print $2}' | xargs kill > /dev/null 2>&1
python /home/conesus/newsblur/manage.py refresh_feeds -s > /dev/null 2>&1 &
python /home/conesus/newsblur/manage.py refresh_feeds -s > /dev/null 2>&1 &
python /home/conesus/newsblur/manage.py refresh_feeds -s > /dev/null 2>&1 &
python /home/conesus/newsblur/manage.py refresh_feeds -s > /dev/null 2>&1 &

View file

@@ -91,15 +91,10 @@ class FetchFeed:
agent=USER_AGENT,
etag=etag,
modified=modified)
except (TypeError, ValueError), e:
logging.debug(u' ***> [%-30s] ~FR%s, turning off microformats.' %
except (TypeError, ValueError, KeyError), e:
logging.debug(u' ***> [%-30s] ~FR%s, turning off headers.' %
(self.feed.title[:30], e))
feedparser.PARSE_MICROFORMATS = False
self.fpf = feedparser.parse(address,
agent=USER_AGENT,
etag=etag,
modified=modified)
feedparser.PARSE_MICROFORMATS = True
self.fpf = feedparser.parse(address, agent=USER_AGENT)
logging.debug(u' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (
self.feed.title[:30], time.time() - start))
@@ -396,8 +391,6 @@ class Dispatcher:
if self.options['verbose']:
logging.debug(u' ---> [%-30s] ~FBTIME: unread count in ~FM%.4ss' % (
feed.title[:30], time.time() - start))
except KeyboardInterrupt:
break
except urllib2.HTTPError, e:
logging.debug(' ---> [%-30s] ~FRFeed throws HTTP error: ~SB%s' % (unicode(feed_id)[:30], e.fp.read()))
feed.save_feed_history(e.code, e.msg, e.fp.read())

View file

@@ -78,7 +78,7 @@ def pre_process_story(entry):
entry['guid'] = unicode(entry['guid'])
# Normalize story content/summary
summary = entry.get('summary', '')
summary = entry.get('summary') or ""
content = ""
if not summary and 'summary_detail' in entry:
summary = entry['summary_detail'].get('value', '')