Fixing a bunch of feed fetch errors.

This commit is contained in:
Samuel Clay 2012-10-01 19:31:33 -07:00
parent ee72253695
commit 4a7516cb83
3 changed files with 17 additions and 3 deletions

View file

@@ -170,7 +170,10 @@ class IconImporter(object):
compressed_content = key.get_contents_as_string()
stream = StringIO(compressed_content)
gz = gzip.GzipFile(fileobj=stream)
content = gz.read()
try:
content = gz.read()
except IOError:
content = None
else:
content = MFeedPage.get_data(feed_id=self.feed.pk)
url = self._url_from_html(content)
@@ -197,6 +200,9 @@ class IconImporter(object):
def get_image_from_url(self, url):
# print 'Requesting: %s' % url
if not url:
return None, None
@timelimit(30)
def _1(url):
try:

View file

@@ -80,6 +80,9 @@ class PageImporter(object):
response = requests.get(feed_link, headers=self.headers)
except requests.exceptions.TooManyRedirects:
response = requests.get(feed_link)
except AttributeError:
self.save_no_page()
return
try:
data = response.text
except (LookupError, TypeError):

View file

@@ -2,6 +2,7 @@ import datetime
import struct
from HTMLParser import HTMLParser
from lxml.html.diff import tokenize, fixup_ins_del_tags, htmldiff_tokens
from lxml.etree import ParserError
from itertools import chain
from django.utils.dateformat import DateFormat
from django.utils.html import strip_tags as strip_tags_django
@@ -250,8 +251,12 @@ def image_size(datastream):
return content_type, width, height
def htmldiff(old_html, new_html):
old_html_tokens = tokenize(old_html, include_hrefs=False)
new_html_tokens = tokenize(new_html, include_hrefs=False)
try:
old_html_tokens = tokenize(old_html, include_hrefs=False)
new_html_tokens = tokenize(new_html, include_hrefs=False)
except (KeyError, ParserError):
return new_html
result = htmldiff_tokens(old_html_tokens, new_html_tokens)
result = ''.join(result).strip()