diff --git a/utils/feedparser.py b/utils/feedparser.py
index 8fdf5a0eb..39230240f 100755
--- a/utils/feedparser.py
+++ b/utils/feedparser.py
@@ -9,7 +9,7 @@
 Required: Python 2.4 or later
 Recommended: iconv_codec <http://cjkpython.i18n.org/>
 """
 
-__version__ = "5.1.1"
+__version__ = "5.1.2"
 __license__ = """
 Copyright (c) 2010-2012 Kurt McKee <contactme@kurtmckee.org>
 Copyright (c) 2002-2008 Mark Pilgrim
@@ -131,9 +131,10 @@ else:
 # https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
 # Many more will likely need to be added!
 ACCEPTABLE_URI_SCHEMES = (
-    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto',
-    'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp',
-    'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
+    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
+    'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
+    'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
+    'wais',
     # Additional common-but-unofficial schemes
     'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
     'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
@@ -283,15 +284,6 @@ except ImportError:
     BeautifulSoup = None
     PARSE_MICROFORMATS = False
 
-try:
-    # the utf_32 codec was introduced in Python 2.6; it's necessary to
-    # check this as long as feedparser supports Python 2.4 and 2.5
-    codecs.lookup('utf_32')
-except LookupError:
-    _UTF32_AVAILABLE = False
-else:
-    _UTF32_AVAILABLE = True
-
 # ---------- don't touch these ----------
 class ThingsNobodyCaresAboutButMe(Exception): pass
 class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
@@ -1721,6 +1713,8 @@ class _FeedParserMixin:
         self.push('itunes_image', 0)
         if attrsD.get('href'):
             self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
+        elif attrsD.get('url'):
+            self._getContext()['image'] = FeedParserDict({'href': attrsD.get('url')})
     _start_itunes_link = _start_itunes_image
 
     def _end_itunes_block(self):
@@ -2554,7 +2548,7 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
         self.baseuri = baseuri
 
     def resolveURI(self, uri):
-        return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip()))
+        return _makeSafeAbsoluteURI(self.baseuri, uri.strip())
 
     def unknown_starttag(self, tag, attrs):
         attrs = self.normalize_attrs(attrs)
@@ -2607,8 +2601,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
       'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
       'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
       'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
-      'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript',
-      'object', 'embed', 'iframe', 'param'])
+      'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'])
 
     acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey',
       'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
@@ -3010,11 +3003,14 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
     # try to open with native open function (if url_file_stream_or_string is a filename)
     try:
         return open(url_file_stream_or_string, 'rb')
-    except (IOError, UnicodeEncodeError):
+    except (IOError, UnicodeEncodeError, TypeError):
         # if url_file_stream_or_string is a unicode object that
         # cannot be converted to the encoding returned by
         # sys.getfilesystemencoding(), a UnicodeEncodeError
         # will be thrown
+        # If url_file_stream_or_string is a string that contains NULL
+        # (such as an XML document encoded in UTF-32), TypeError will
+        # be thrown.
         pass # treat url_file_stream_or_string as string
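Aside: the new TypeError handler is easy to reproduce in isolation. In Python 2, open() raises TypeError when its "filename" contains a NUL byte, and any UTF-32-encoded document passed in as a string is full of NUL bytes. A standalone sketch (my own example data, not part of the patch):

    data = u'<?xml version="1.0"?><feed/>'.encode('utf-32le')
    try:
        open(data, 'rb')
    except TypeError:
        # feedparser now falls through and treats the argument
        # as the document itself instead of crashing here.
        pass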
 
@@ -3452,7 +3448,7 @@ _rfc822_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
 _rfc822_month = "(?P<month>%s)(?:[a-z]*,?)" % ('|'.join(_rfc822_months))
 # The year may be 2 or 4 digits; capture the century if it exists
 _rfc822_year = "(?P<year>(?:\d{2})?\d{2})"
-_rfc822_day = "(?P<day>\d{2})"
+_rfc822_day = "(?P<day> *\d{1,2})"
 _rfc822_date = "%s %s %s" % (_rfc822_day, _rfc822_month, _rfc822_year)
 
 _rfc822_hour = "(?P<hour>\d{2}):(?P<minute>\d{2})(?::(?P<second>\d{2}))?"
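Aside: the widened day pattern accepts the single-digit and space-padded days that appear in real-world RFC 822 dates; 5.1.1 required exactly two digits. A standalone check mirroring the pattern above:

    import re
    day = re.compile("(?P<day> *\d{1,2})")
    for s in ("05", "5", " 5"):
        assert day.match(s).group("day") == s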
@@ -3561,217 +3557,283 @@ def _parse_date(dateString):
             return date9tuple
     return None
 
-def _getCharacterEncoding(http_headers, xml_data):
-    '''Get the character encoding of the XML document
+# Each marker represents some of the characters of the opening XML
+# processing instruction ('<?xm') in the specified encoding.
+EBCDIC_MARKER = _l2bytes([0x4C, 0x6F, 0xA7, 0x94])
+UTF16BE_MARKER = _l2bytes([0x00, 0x3C, 0x00, 0x3F])
+UTF16LE_MARKER = _l2bytes([0x3C, 0x00, 0x3F, 0x00])
+UTF32BE_MARKER = _l2bytes([0x00, 0x00, 0x00, 0x3C])
+UTF32LE_MARKER = _l2bytes([0x3C, 0x00, 0x00, 0x00])
+
+ZERO_BYTES = _l2bytes([0x00, 0x00])
+
+# Match the opening XML declaration.
+# Example: <?xml version="1.0" encoding="utf-8"?>
+RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')
+
+# Capture the value of the XML processing instruction's encoding attribute.
+# Example: <?xml version="1.0" encoding="utf-8"?>
+RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'))
+
+def convert_to_utf8(http_headers, data):
+    '''Detect and convert the character encoding to UTF-8.
 
     http_headers is a dictionary
-    xml_data is a raw string (not Unicode)
+    data is a raw string (not Unicode)'''
 
-    This is so much trickier than it sounds, it's not even funny.
-    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
-    is application/xml, application/*+xml,
-    application/xml-external-parsed-entity, or application/xml-dtd,
-    the encoding given in the charset parameter of the HTTP Content-Type
-    takes precedence over the encoding given in the XML prefix within the
-    document, and defaults to 'utf-8' if neither are specified.  But, if
-    the HTTP Content-Type is text/xml, text/*+xml, or
-    text/xml-external-parsed-entity, the encoding given in the XML prefix
-    within the document is ALWAYS IGNORED and only the encoding given in
-    the charset parameter of the HTTP Content-Type header should be
-    respected, and it defaults to 'us-ascii' if not specified.
+    # This is so much trickier than it sounds, it's not even funny.
+    # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
+    # is application/xml, application/*+xml,
+    # application/xml-external-parsed-entity, or application/xml-dtd,
+    # the encoding given in the charset parameter of the HTTP Content-Type
+    # takes precedence over the encoding given in the XML prefix within the
+    # document, and defaults to 'utf-8' if neither are specified.  But, if
+    # the HTTP Content-Type is text/xml, text/*+xml, or
+    # text/xml-external-parsed-entity, the encoding given in the XML prefix
+    # within the document is ALWAYS IGNORED and only the encoding given in
+    # the charset parameter of the HTTP Content-Type header should be
+    # respected, and it defaults to 'us-ascii' if not specified.
 
-    Furthermore, discussion on the atom-syntax mailing list with the
-    author of RFC 3023 leads me to the conclusion that any document
-    served with a Content-Type of text/* and no charset parameter
-    must be treated as us-ascii.  (We now do this.)  And also that it
-    must always be flagged as non-well-formed.  (We now do this too.)
+    # Furthermore, discussion on the atom-syntax mailing list with the
+    # author of RFC 3023 leads me to the conclusion that any document
+    # served with a Content-Type of text/* and no charset parameter
+    # must be treated as us-ascii.  (We now do this.)  And also that it
+    # must always be flagged as non-well-formed.  (We now do this too.)
 
-    If Content-Type is unspecified (input was local file or non-HTTP source)
-    or unrecognized (server just got it totally wrong), then go by the
-    encoding given in the XML prefix of the document and default to
-    'iso-8859-1' as per the HTTP specification (RFC 2616).
+    # If Content-Type is unspecified (input was local file or non-HTTP source)
+    # or unrecognized (server just got it totally wrong), then go by the
+    # encoding given in the XML prefix of the document and default to
+    # 'iso-8859-1' as per the HTTP specification (RFC 2616).
 
-    Then, assuming we didn't find a character encoding in the HTTP headers
-    (and the HTTP Content-type allowed us to look in the body), we need
-    to sniff the first few bytes of the XML data and try to determine
-    whether the encoding is ASCII-compatible.  Section F of the XML
-    specification shows the way here:
-    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
-
-    If the sniffed encoding is not ASCII-compatible, we need to make it
-    ASCII compatible so that we can sniff further into the XML declaration
-    to find the encoding attribute, which will tell us the true encoding.
-
-    Of course, none of this guarantees that we will be able to parse the
-    feed in the declared character encoding (assuming it was declared
-    correctly, which many are not).  iconv_codec can help a lot;
-    you should definitely install it if you can.
-    http://cjkpython.i18n.org/
-    '''
-
-    def _parseHTTPContentType(content_type):
-        '''takes HTTP Content-Type header and returns (content type, charset)
-
-        If no charset is specified, returns (content type, '')
-        If no content type is specified, returns ('', '')
-        Both return parameters are guaranteed to be lowercase strings
-        '''
-        content_type = content_type or ''
-        content_type, params = cgi.parse_header(content_type)
-        charset = params.get('charset', '').replace("'", "")
-        if not isinstance(charset, unicode):
-            charset = charset.decode('utf-8', 'ignore')
-        return content_type, charset
-
-    sniffed_xml_encoding = u''
-    xml_encoding = u''
-    true_encoding = u''
-    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
-    # Must sniff for non-ASCII-compatible character encodings before
-    # searching for XML declaration.  This heuristic is defined in
-    # section F of the XML specification:
+    # Then, assuming we didn't find a character encoding in the HTTP headers
+    # (and the HTTP Content-type allowed us to look in the body), we need
+    # to sniff the first few bytes of the XML data and try to determine
+    # whether the encoding is ASCII-compatible.  Section F of the XML
+    # specification shows the way here:
     # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+
+    # If the sniffed encoding is not ASCII-compatible, we need to make it
+    # ASCII compatible so that we can sniff further into the XML declaration
+    # to find the encoding attribute, which will tell us the true encoding.
+
+    # Of course, none of this guarantees that we will be able to parse the
+    # feed in the declared character encoding (assuming it was declared
+    # correctly, which many are not).  iconv_codec can help a lot;
+    # you should definitely install it if you can.
+    # http://cjkpython.i18n.org/
+
+    bom_encoding = u''
+    xml_encoding = u''
+    rfc3023_encoding = u''
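Aside: what RE_XML_PI_ENCODING captures, shown here on a str rather than the bytes the module actually operates on (standalone illustration):

    import re
    pi_encoding = re.compile(r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')
    m = pi_encoding.match('<?xml version="1.0" encoding="ISO-8859-1"?>')
    assert m.group(1) == 'ISO-8859-1'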
+
+    # Look at the first few bytes of the document to guess what
+    # its encoding may be. We only need to decode enough of the
+    # document that we can use an ASCII-compatible regular
+    # expression to search for an XML encoding declaration.
+    # The heuristic follows the XML specification, section F:
+    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+    # Check for BOMs first.
+    if data[:4] == codecs.BOM_UTF32_BE:
+        bom_encoding = u'utf-32be'
+        data = data[4:]
+    elif data[:4] == codecs.BOM_UTF32_LE:
+        bom_encoding = u'utf-32le'
+        data = data[4:]
+    elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
+        bom_encoding = u'utf-16be'
+        data = data[2:]
+    elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
+        bom_encoding = u'utf-16le'
+        data = data[2:]
+    elif data[:3] == codecs.BOM_UTF8:
+        bom_encoding = u'utf-8'
+        data = data[3:]
+    # Check for the characters '<?xm' in several encodings.
+    elif data[:4] == EBCDIC_MARKER:
+        bom_encoding = u'cp037'
+    elif data[:4] == UTF16BE_MARKER:
+        bom_encoding = u'utf-16be'
+    elif data[:4] == UTF16LE_MARKER:
+        bom_encoding = u'utf-16le'
+    elif data[:4] == UTF32BE_MARKER:
+        bom_encoding = u'utf-32be'
+    elif data[:4] == UTF32LE_MARKER:
+        bom_encoding = u'utf-32le'
+
+    tempdata = data
     try:
-        if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
-            # EBCDIC
-            xml_data = _ebcdic_to_ascii(xml_data)
-        elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
-            # UTF-16BE
-            sniffed_xml_encoding = u'utf-16be'
-            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
-        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
-            # UTF-16BE with BOM
-            sniffed_xml_encoding = u'utf-16be'
-            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
-            # UTF-16LE
-            sniffed_xml_encoding = u'utf-16le'
-            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
-        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
-            # UTF-16LE with BOM
-            sniffed_xml_encoding = u'utf-16le'
-            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
-            # UTF-32BE
-            sniffed_xml_encoding = u'utf-32be'
-            if _UTF32_AVAILABLE:
-                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
-            # UTF-32LE
-            sniffed_xml_encoding = u'utf-32le'
-            if _UTF32_AVAILABLE:
-                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
-            # UTF-32BE with BOM
-            sniffed_xml_encoding = u'utf-32be'
-            if _UTF32_AVAILABLE:
-                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
-            # UTF-32LE with BOM
-            sniffed_xml_encoding = u'utf-32le'
-            if _UTF32_AVAILABLE:
-                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
-        elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
-            # UTF-8 with BOM
-            sniffed_xml_encoding = u'utf-8'
-            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
-        else:
-            # ASCII-compatible
-            pass
-        xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data)
-    except UnicodeDecodeError:
+        if bom_encoding:
+            tempdata = data.decode(bom_encoding).encode('utf-8')
+    except (UnicodeDecodeError, LookupError):
+        # feedparser recognizes UTF-32 encodings that aren't
+        # available in Python 2.4 and 2.5, so it's possible to
+        # encounter a LookupError during decoding.
         xml_encoding_match = None
+    else:
+        xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
+
     if xml_encoding_match:
         xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
-        if sniffed_xml_encoding and (xml_encoding in (u'iso-10646-ucs-2', u'ucs-2', u'csunicode', u'iso-10646-ucs-4', u'ucs-4', u'csucs4', u'utf-16', u'utf-32', u'utf_16', u'utf_32', u'utf16', u'u16')):
-            xml_encoding = sniffed_xml_encoding
+        # Normalize the xml_encoding if necessary.
+        if bom_encoding and (xml_encoding in (
+            u'u16', u'utf-16', u'utf16', u'utf_16',
+            u'u32', u'utf-32', u'utf32', u'utf_32',
+            u'iso-10646-ucs-2', u'iso-10646-ucs-4',
+            u'csucs4', u'csunicode', u'ucs-2', u'ucs-4'
+        )):
+            xml_encoding = bom_encoding
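Aside: the normalization step matters because declarations like 'utf-16' carry no byte order, while the BOM does. A sketch of the case it handles (example data is mine):

    import codecs
    body = u'<?xml version="1.0" encoding="utf-16"?><feed/>'
    data = codecs.BOM_UTF16_LE + body.encode('utf-16-le')
    # data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES,
    # so bom_encoding is u'utf-16le'; the declared 'utf-16' is in
    # the ambiguous list above and is replaced by u'utf-16le'.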
+
+    # Find the HTTP Content-Type and, hopefully, a character
+    # encoding provided by the server. The Content-Type is used
+    # to choose the "correct" encoding among the BOM encoding,
+    # XML declaration encoding, and HTTP encoding, following the
+    # heuristic defined in RFC 3023.
+    http_content_type = http_headers.get('content-type') or ''
+    http_content_type, params = cgi.parse_header(http_content_type)
+    http_encoding = params.get('charset', '').replace("'", "")
+    if not isinstance(http_encoding, unicode):
+        http_encoding = http_encoding.decode('utf-8', 'ignore')
+
     acceptable_content_type = 0
-    application_content_types = (u'application/xml', u'application/xml-dtd', u'application/xml-external-parsed-entity')
+    application_content_types = (u'application/xml', u'application/xml-dtd',
+                                 u'application/xml-external-parsed-entity')
     text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
     if (http_content_type in application_content_types) or \
-       (http_content_type.startswith(u'application/') and http_content_type.endswith(u'+xml')):
+            (http_content_type.startswith(u'application/') and
+             http_content_type.endswith(u'+xml')):
         acceptable_content_type = 1
-        true_encoding = http_encoding or xml_encoding or u'utf-8'
+        rfc3023_encoding = http_encoding or xml_encoding or u'utf-8'
     elif (http_content_type in text_content_types) or \
-         (http_content_type.startswith(u'text/')) and http_content_type.endswith(u'+xml'):
+            (http_content_type.startswith(u'text/') and
+             http_content_type.endswith(u'+xml')):
         acceptable_content_type = 1
-        true_encoding = http_encoding or u'us-ascii'
+        rfc3023_encoding = http_encoding or u'us-ascii'
     elif http_content_type.startswith(u'text/'):
-        true_encoding = http_encoding or u'us-ascii'
+        rfc3023_encoding = http_encoding or u'us-ascii'
     elif http_headers and 'content-type' not in http_headers:
-        true_encoding = xml_encoding or u'iso-8859-1'
+        rfc3023_encoding = xml_encoding or u'iso-8859-1'
     else:
-        true_encoding = xml_encoding or u'utf-8'
-    # some feeds claim to be gb2312 but are actually gb18030.
-    # apparently MSIE and Firefox both do the following switch:
-    if true_encoding.lower() == u'gb2312':
-        true_encoding = u'gb18030'
-    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
+        rfc3023_encoding = xml_encoding or u'utf-8'
+    # gb18030 is a superset of gb2312, so always replace gb2312
+    # with gb18030 for greater compatibility.
+    if rfc3023_encoding.lower() == u'gb2312':
+        rfc3023_encoding = u'gb18030'
+    if xml_encoding.lower() == u'gb2312':
+        xml_encoding = u'gb18030'
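Aside: a toy model of the RFC 3023 precedence implemented above (my own simplification, not the patch's API):

    def choose_encoding(content_type, http_charset, xml_encoding):
        if content_type.startswith('application/'):
            return http_charset or xml_encoding or 'utf-8'
        if content_type.startswith('text/'):
            # in-document declarations are ignored for text/* types
            return http_charset or 'us-ascii'
        return xml_encoding or 'utf-8'

    assert choose_encoding('application/atom+xml', '', 'koi8-r') == 'koi8-r'
    assert choose_encoding('text/xml', '', 'koi8-r') == 'us-ascii'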
 
-def _toUTF8(data, encoding):
-    '''Changes an XML data stream on the fly to specify a new encoding
+    # there are four encodings to keep track of:
+    # - http_encoding is the encoding declared in the Content-Type HTTP header
+    # - xml_encoding is the encoding declared in the <?xml declaration
+    # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
+    # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
+
+    error = None
 
-    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
-    encoding is a string recognized by encodings.aliases
-    '''
-    # strip Byte Order Mark (if present)
-    if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
-        encoding = 'utf-16be'
-        data = data[2:]
-    elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
-        encoding = 'utf-16le'
-        data = data[2:]
-    elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
-        encoding = 'utf-8'
-        data = data[3:]
-    elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
-        encoding = 'utf-32be'
-        data = data[4:]
-    elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
-        encoding = 'utf-32le'
-        data = data[4:]
-    newdata = unicode(data, encoding)
-    declmatch = re.compile('^<\?xml[^>]*?>')
-    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
-    if declmatch.search(newdata):
-        newdata = declmatch.sub(newdecl, newdata)
-    else:
-        newdata = newdecl + u'\n' + newdata
-    return newdata.encode('utf-8')
+    if http_headers and (not acceptable_content_type):
+        if 'content-type' in http_headers:
+            msg = '%s is not an XML media type' % http_headers['content-type']
+        else:
+            msg = 'no Content-type specified'
+        error = NonXMLContentType(msg)
 
-def _stripDoctype(data):
-    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
+    # determine character encoding
+    known_encoding = 0
+    chardet_encoding = None
+    tried_encodings = []
+    if chardet:
+        chardet_encoding = unicode(chardet.detect(data)['encoding'], 'ascii', 'ignore')
+    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
+    for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
+                              chardet_encoding, u'utf-8', u'windows-1252', u'iso-8859-2'):
+        if not proposed_encoding:
+            continue
+        if proposed_encoding in tried_encodings:
+            continue
+        tried_encodings.append(proposed_encoding)
+        try:
+            data = data.decode(proposed_encoding)
+        except (UnicodeDecodeError, LookupError):
+            pass
+        else:
+            known_encoding = 1
+            # Update the encoding in the opening XML processing instruction.
+            new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
+            if RE_XML_DECLARATION.search(data):
+                data = RE_XML_DECLARATION.sub(new_declaration, data)
+            else:
+                data = new_declaration + u'\n' + data
+            data = data.encode('utf-8')
+            break
+    # if still no luck, give up
+    if not known_encoding:
+        error = CharacterEncodingUnknown(
+            'document encoding unknown, I tried ' +
+            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
+            (rfc3023_encoding, xml_encoding))
+        rfc3023_encoding = u''
+    elif proposed_encoding != rfc3023_encoding:
+        error = CharacterEncodingOverride(
+            'document declared as %s, but parsed as %s' %
+            (rfc3023_encoding, proposed_encoding))
+        rfc3023_encoding = proposed_encoding
+
+    return data, rfc3023_encoding, error
+
+# Match XML entity declarations.
+# Example: <!ENTITY copyright "(C)">
+RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
+
+# Match XML DOCTYPE declarations.
+# Example: <!DOCTYPE feed [ ]>
+RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
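Aside: the decode-and-retry loop above reduces to a simple pattern: try each candidate once, in priority order, and stop at the first that decodes. A standalone sketch of that core logic:

    def first_working_encoding(data, candidates):
        tried = []
        for enc in candidates:
            if not enc or enc in tried:
                continue
            tried.append(enc)
            try:
                return data.decode(enc), enc
            except (UnicodeDecodeError, LookupError):
                continue
        return None, u''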
+
+# Match safe entity declarations.
+# This will allow hexadecimal character references through,
+# as well as text, but not arbitrary nested entities.
+# Example: cubed "&#179;"
+# Example: copyright "(C)"
+# Forbidden: explode1 "&explode2;&explode2;"
+RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
+
+def replace_doctype(data):
+    '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)
 
     rss_version may be 'rss091n' or None
-    stripped_data is the same XML document, minus the DOCTYPE
+    stripped_data is the same XML document with a replaced DOCTYPE
     '''
+
+    # Divide the document into two groups by finding the location
+    # of the first element that doesn't begin with '<?' or '<!'.
+    start = re.search(_s2bytes('<\w'), data)
+    start = start and start.start() or -1
+    head, data = data[:start+1], data[start+1:]
-    start = re.search(_s2bytes('<\w'), data)
-    start = start and start.start() or -1
-    head,data = data[:start+1], data[start+1:]
 
-    entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
-    entity_results=entity_pattern.findall(head)
-    head = entity_pattern.sub(_s2bytes(''), head)
-    doctype_pattern = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
-    doctype_results = doctype_pattern.findall(head)
+    # Save and then remove all of the ENTITY declarations.
+    entity_results = RE_ENTITY_PATTERN.findall(head)
+    head = RE_ENTITY_PATTERN.sub(_s2bytes(''), head)
+
+    # Find the DOCTYPE declaration and check the feed type.
+    doctype_results = RE_DOCTYPE_PATTERN.findall(head)
     doctype = doctype_results and doctype_results[0] or _s2bytes('')
-    if doctype.lower().count(_s2bytes('netscape')):
+    if _s2bytes('netscape') in doctype.lower():
         version = u'rss091n'
     else:
         version = None
 
-    # only allow in 'safe' inline entity definitions
-    replacement=_s2bytes('')
-    if len(doctype_results)==1 and entity_results:
-        safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
-        safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
+    # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
+    replacement = _s2bytes('')
+    if len(doctype_results) == 1 and entity_results:
+        match_safe_entities = lambda e: RE_SAFE_ENTITY_PATTERN.match(e)
+        safe_entities = filter(match_safe_entities, entity_results)
         if safe_entities:
-            replacement=_s2bytes('<!DOCTYPE feed [\n  <!ENTITY') + _s2bytes('>\n  <!ENTITY').join(safe_entities) + _s2bytes('>\n]>')
-    data = doctype_pattern.sub(replacement, head) + data
+            replacement = _s2bytes('<!DOCTYPE feed [\n<!ENTITY') + _s2bytes('>\n<!ENTITY').join(safe_entities) + _s2bytes('>\n]>')
+    data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data
 
-    return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)])
+    # Precompute the safe entities for the loose parser.
+    safe_entities = dict((k.decode('utf-8'), v.decode('utf-8'))
+                      for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement))
+    return version, data, safe_entities
 
 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
     '''Parse a feed from a URL, file, stream, or string.
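Aside: what RE_SAFE_ENTITY_PATTERN accepts and rejects, applied to the strings that RE_ENTITY_PATTERN's capture group would actually produce (standalone check):

    import re
    safe = re.compile(r'\s+(\w+)\s+"(&#\w+;|[^&"]*)"')
    assert safe.match(' cubed "&#179;"')
    assert safe.match(' copyright "(C)"')
    # A nested entity reference could be expanded recursively
    # (a "billion laughs" attack), so it is rejected:
    assert not safe.match(' explode1 "&explode2;&explode2;"')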
@@ -3822,24 +3884,25 @@
         try:
             data = gzip.GzipFile(fileobj=_StringIO(data)).read()
         except (IOError, struct.error), e:
-            # IOError can occur if the gzip header is bad
-            # struct.error can occur if the data is damaged
-            # Some feeds claim to be gzipped but they're not, so
-            # we get garbage.  Ideally, we should re-request the
-            # feed without the 'Accept-encoding: gzip' header,
-            # but we don't.
+            # IOError can occur if the gzip header is bad.
+            # struct.error can occur if the data is damaged.
             result['bozo'] = 1
             result['bozo_exception'] = e
-            data = None
+            if isinstance(e, struct.error):
+                # A gzip header was found but the data is corrupt.
+                # Ideally, we should re-request the feed without the
+                # 'Accept-encoding: gzip' header, but we don't.
+                data = None
     elif zlib and 'deflate' in http_headers.get('content-encoding', ''):
         try:
             data = zlib.decompress(data)
         except zlib.error, e:
-            data = zlib.decompress(data, -zlib.MAX_WBITS)
-        except zlib.error, e:
-            result['bozo'] = 1
-            result['bozo_exception'] = e
-            data = None
+            try:
+                # The data may have no headers and no checksum.
+                data = zlib.decompress(data, -15)
+            except zlib.error, e:
+                result['bozo'] = 1
+                result['bozo_exception'] = e
 
     # save HTTP headers
     if http_headers:
@@ -3868,25 +3931,22 @@
     if data is None:
         return result
 
-    # there are four encodings to keep track of:
-    # - http_encoding is the encoding declared in the Content-Type HTTP header
-    # - xml_encoding is the encoding declared in the <?xml declaration
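The caller-visible effect of the new pipeline: encoding problems are still reported through the bozo fields, now derived from the RFC 3023 result. A typical check (hypothetical feed URL):

    import feedparser
    d = feedparser.parse('http://example.org/feed.xml')
    if d.bozo and isinstance(d.bozo_exception, feedparser.CharacterEncodingOverride):
        # The declared encoding was wrong; d.encoding records the
        # encoding the document was actually parsed with.
        print d.encoding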