diff --git a/apps/rss_feeds/icon_importer.py b/apps/rss_feeds/icon_importer.py index e3b616df4..6405fdfda 100644 --- a/apps/rss_feeds/icon_importer.py +++ b/apps/rss_feeds/icon_importer.py @@ -170,7 +170,10 @@ class IconImporter(object): compressed_content = key.get_contents_as_string() stream = StringIO(compressed_content) gz = gzip.GzipFile(fileobj=stream) - content = gz.read() + try: + content = gz.read() + except IOError: + content = None else: content = MFeedPage.get_data(feed_id=self.feed.pk) url = self._url_from_html(content) @@ -197,6 +200,9 @@ class IconImporter(object): def get_image_from_url(self, url): # print 'Requesting: %s' % url + if not url: + return None, None + @timelimit(30) def _1(url): try: diff --git a/apps/rss_feeds/page_importer.py b/apps/rss_feeds/page_importer.py index 2b2c1b637..0ac2ee00c 100644 --- a/apps/rss_feeds/page_importer.py +++ b/apps/rss_feeds/page_importer.py @@ -80,6 +80,9 @@ class PageImporter(object): response = requests.get(feed_link, headers=self.headers) except requests.exceptions.TooManyRedirects: response = requests.get(feed_link) + except AttributeError: + self.save_no_page() + return try: data = response.text except (LookupError, TypeError): diff --git a/utils/story_functions.py b/utils/story_functions.py index bc62dc8ed..fb4c16abb 100644 --- a/utils/story_functions.py +++ b/utils/story_functions.py @@ -2,6 +2,7 @@ import datetime import struct from HTMLParser import HTMLParser from lxml.html.diff import tokenize, fixup_ins_del_tags, htmldiff_tokens +from lxml.etree import ParserError from itertools import chain from django.utils.dateformat import DateFormat from django.utils.html import strip_tags as strip_tags_django @@ -250,8 +251,12 @@ def image_size(datastream): return content_type, width, height def htmldiff(old_html, new_html): - old_html_tokens = tokenize(old_html, include_hrefs=False) - new_html_tokens = tokenize(new_html, include_hrefs=False) + try: + old_html_tokens = tokenize(old_html, include_hrefs=False) + new_html_tokens = tokenize(new_html, include_hrefs=False) + except (KeyError, ParserError): + return new_html + result = htmldiff_tokens(old_html_tokens, new_html_tokens) result = ''.join(result).strip()