Fixing a bunch of feed fetch errors.

This commit is contained in:
Samuel Clay 2012-10-01 19:31:33 -07:00
parent ee72253695
commit 4a7516cb83
3 changed files with 17 additions and 3 deletions

View file

@@ -170,7 +170,10 @@ class IconImporter(object):
compressed_content = key.get_contents_as_string() compressed_content = key.get_contents_as_string()
stream = StringIO(compressed_content) stream = StringIO(compressed_content)
gz = gzip.GzipFile(fileobj=stream) gz = gzip.GzipFile(fileobj=stream)
content = gz.read() try:
content = gz.read()
except IOError:
content = None
else: else:
content = MFeedPage.get_data(feed_id=self.feed.pk) content = MFeedPage.get_data(feed_id=self.feed.pk)
url = self._url_from_html(content) url = self._url_from_html(content)
@@ -197,6 +200,9 @@ class IconImporter(object):
def get_image_from_url(self, url): def get_image_from_url(self, url):
# print 'Requesting: %s' % url # print 'Requesting: %s' % url
if not url:
return None, None
@timelimit(30) @timelimit(30)
def _1(url): def _1(url):
try: try:

View file

@@ -80,6 +80,9 @@ class PageImporter(object):
response = requests.get(feed_link, headers=self.headers) response = requests.get(feed_link, headers=self.headers)
except requests.exceptions.TooManyRedirects: except requests.exceptions.TooManyRedirects:
response = requests.get(feed_link) response = requests.get(feed_link)
except AttributeError:
self.save_no_page()
return
try: try:
data = response.text data = response.text
except (LookupError, TypeError): except (LookupError, TypeError):

View file

@@ -2,6 +2,7 @@ import datetime
import struct import struct
from HTMLParser import HTMLParser from HTMLParser import HTMLParser
from lxml.html.diff import tokenize, fixup_ins_del_tags, htmldiff_tokens from lxml.html.diff import tokenize, fixup_ins_del_tags, htmldiff_tokens
from lxml.etree import ParserError
from itertools import chain from itertools import chain
from django.utils.dateformat import DateFormat from django.utils.dateformat import DateFormat
from django.utils.html import strip_tags as strip_tags_django from django.utils.html import strip_tags as strip_tags_django
@@ -250,8 +251,12 @@ def image_size(datastream):
return content_type, width, height return content_type, width, height
def htmldiff(old_html, new_html): def htmldiff(old_html, new_html):
old_html_tokens = tokenize(old_html, include_hrefs=False) try:
new_html_tokens = tokenize(new_html, include_hrefs=False) old_html_tokens = tokenize(old_html, include_hrefs=False)
new_html_tokens = tokenize(new_html, include_hrefs=False)
except (KeyError, ParserError):
return new_html
result = htmldiff_tokens(old_html_tokens, new_html_tokens) result = htmldiff_tokens(old_html_tokens, new_html_tokens)
result = ''.join(result).strip() result = ''.join(result).strip()