diff --git a/utils/feedparser.py b/utils/feedparser.py
index 8fdf5a0eb..39230240f 100755
--- a/utils/feedparser.py
+++ b/utils/feedparser.py
@@ -9,7 +9,7 @@ Required: Python 2.4 or later
Recommended: iconv_codec
"""
-__version__ = "5.1.1"
+__version__ = "5.1.2"
__license__ = """
Copyright (c) 2010-2012 Kurt McKee
Copyright (c) 2002-2008 Mark Pilgrim
@@ -131,9 +131,10 @@ else:
# https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
# Many more will likely need to be added!
ACCEPTABLE_URI_SCHEMES = (
- 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto',
- 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp',
- 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
+ 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
+ 'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
+ 'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
+ 'wais',
# Additional common-but-unofficial schemes
'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
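
The practical effect of adding `magnet` above: URIs whose scheme is missing from this whitelist are discarded during sanitizing. A minimal sketch of that check (illustrative only, not feedparser's actual `_makeSafeAbsoluteURI`):

```python
import urlparse  # urllib.parse in Python 3

SCHEMES = ('http', 'https', 'magnet')  # abridged whitelist

def is_acceptable_uri(uri):
    # Relative URIs have no scheme and are always allowed.
    scheme = urlparse.urlparse(uri)[0]
    return not scheme or scheme in SCHEMES

print(is_acceptable_uri('magnet:?xt=urn:btih:abc'))  # True as of 5.1.2
print(is_acceptable_uri('javascript:alert(1)'))      # False
```
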
@@ -283,15 +284,6 @@ except ImportError:
BeautifulSoup = None
PARSE_MICROFORMATS = False
-try:
- # the utf_32 codec was introduced in Python 2.6; it's necessary to
- # check this as long as feedparser supports Python 2.4 and 2.5
- codecs.lookup('utf_32')
-except LookupError:
- _UTF32_AVAILABLE = False
-else:
- _UTF32_AVAILABLE = True
-
# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
@@ -1721,6 +1713,8 @@ class _FeedParserMixin:
self.push('itunes_image', 0)
if attrsD.get('href'):
self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
+ elif attrsD.get('url'):
+ self._getContext()['image'] = FeedParserDict({'href': attrsD.get('url')})
_start_itunes_link = _start_itunes_image
def _end_itunes_block(self):
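
This fallback tolerates podcast feeds that put the artwork location in a `url` attribute instead of `href`. A quick check against the patched module (assuming it is importable as `feedparser`):

```python
import feedparser

xml = '''<rss version="2.0"
  xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd">
<channel><itunes:image url="http://example.org/cover.jpg" /></channel>
</rss>'''

result = feedparser.parse(xml)
print(result.feed.image.href)  # http://example.org/cover.jpg
```
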
@@ -2554,7 +2548,7 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
self.baseuri = baseuri
def resolveURI(self, uri):
- return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip()))
+ return _makeSafeAbsoluteURI(self.baseuri, uri.strip())
def unknown_starttag(self, tag, attrs):
attrs = self.normalize_attrs(attrs)
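
The old call joined the base and relative URIs before passing a single argument; `_makeSafeAbsoluteURI` is now expected to perform the join itself. A sketch of that two-argument contract (illustrative, not the library's exact implementation):

```python
import urlparse  # urllib.parse in Python 3

def make_safe_absolute_uri(base, rel=None):
    # Join first, then apply the scheme whitelist to the result.
    uri = urlparse.urljoin(base, rel or u'')
    scheme = urlparse.urlparse(uri)[0]
    if not scheme or scheme in ('http', 'https'):
        return uri
    return u''

print(make_safe_absolute_uri('http://example.org/a/', 'b/c.html'))
# http://example.org/a/b/c.html
```
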
@@ -2607,8 +2601,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
- 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript',
- 'object', 'embed', 'iframe', 'param'])
+ 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'])
acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey',
'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
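
With `object`, `embed`, `iframe`, and `param` removed from the whitelist, embedded players are now stripped from sanitized content. Roughly, via the internal `_sanitizeHTML` helper (signature as in 5.1.x):

```python
import feedparser

html = u'before <iframe src="http://example.org/player"></iframe> after'
print(feedparser._sanitizeHTML(html, 'utf-8', 'text/html'))
# roughly: u'before  after' -- the iframe element is dropped
```
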
@@ -3010,11 +3003,14 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
# try to open with native open function (if url_file_stream_or_string is a filename)
try:
return open(url_file_stream_or_string, 'rb')
- except (IOError, UnicodeEncodeError):
+ except (IOError, UnicodeEncodeError, TypeError):
# if url_file_stream_or_string is a unicode object that
# cannot be converted to the encoding returned by
# sys.getfilesystemencoding(), a UnicodeEncodeError
# will be thrown
+ # If url_file_stream_or_string is a string that contains NULL
+ # (such as an XML document encoded in UTF-32), TypeError will
+ # be thrown.
pass
# treat url_file_stream_or_string as string
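
The new `TypeError` arm matters because Python 2's `open()` rejects "filenames" containing NUL bytes, which is exactly what a UTF-32 document passed in as a string looks like:

```python
data = '<\x00\x00\x00?\x00\x00\x00'  # UTF-32LE for '<?'
try:
    open(data, 'rb')
except TypeError:
    # Python 2: "file() argument 1 must be encoded string without NULL bytes"
    pass  # fall through and treat the input as document content
```
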
@@ -3452,7 +3448,7 @@ _rfc822_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
_rfc822_month = "(?P%s)(?:[a-z]*,?)" % ('|'.join(_rfc822_months))
# The year may be 2 or 4 digits; capture the century if it exists
_rfc822_year = "(?P(?:\d{2})?\d{2})"
-_rfc822_day = "(?P\d{2})"
+_rfc822_day = "(?P *\d{1,2})"
_rfc822_date = "%s %s %s" % (_rfc822_day, _rfc822_month, _rfc822_year)
_rfc822_hour = "(?P\d{2}):(?P\d{2})(?::(?P\d{2}))?"
@@ -3561,217 +3557,283 @@ def _parse_date(dateString):
return date9tuple
return None
-def _getCharacterEncoding(http_headers, xml_data):
- '''Get the character encoding of the XML document
+# Each marker represents some of the characters of the opening XML
+# processing instruction ('<?xm') in different encodings.
+EBCDIC_MARKER = _l2bytes([0x4C, 0x6F, 0xA7, 0x94])
+UTF16BE_MARKER = _l2bytes([0x00, 0x3C, 0x00, 0x3F])
+UTF16LE_MARKER = _l2bytes([0x3C, 0x00, 0x3F, 0x00])
+UTF32BE_MARKER = _l2bytes([0x00, 0x00, 0x00, 0x3C])
+UTF32LE_MARKER = _l2bytes([0x3C, 0x00, 0x00, 0x00])
+
+ZERO_BYTES = _l2bytes([0x00, 0x00])
+
+# Match the opening XML declaration.
+# Example: <?xml version="1.0" encoding="utf-8"?>
+RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')
+
+# Capture the value of the XML processing instruction's encoding attribute.
+# Example: <?xml version="1.0" encoding="utf-8"?>
+RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'))
+
+def convert_to_utf8(http_headers, data):
+ '''Detect and convert the character encoding to UTF-8.
http_headers is a dictionary
- xml_data is a raw string (not Unicode)
+ data is a raw string (not Unicode)'''
- This is so much trickier than it sounds, it's not even funny.
- According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
- is application/xml, application/*+xml,
- application/xml-external-parsed-entity, or application/xml-dtd,
- the encoding given in the charset parameter of the HTTP Content-Type
- takes precedence over the encoding given in the XML prefix within the
- document, and defaults to 'utf-8' if neither are specified. But, if
- the HTTP Content-Type is text/xml, text/*+xml, or
- text/xml-external-parsed-entity, the encoding given in the XML prefix
- within the document is ALWAYS IGNORED and only the encoding given in
- the charset parameter of the HTTP Content-Type header should be
- respected, and it defaults to 'us-ascii' if not specified.
+ # This is so much trickier than it sounds, it's not even funny.
+ # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
+ # is application/xml, application/*+xml,
+ # application/xml-external-parsed-entity, or application/xml-dtd,
+ # the encoding given in the charset parameter of the HTTP Content-Type
+ # takes precedence over the encoding given in the XML prefix within the
+ # document, and defaults to 'utf-8' if neither are specified. But, if
+ # the HTTP Content-Type is text/xml, text/*+xml, or
+ # text/xml-external-parsed-entity, the encoding given in the XML prefix
+ # within the document is ALWAYS IGNORED and only the encoding given in
+ # the charset parameter of the HTTP Content-Type header should be
+ # respected, and it defaults to 'us-ascii' if not specified.
- Furthermore, discussion on the atom-syntax mailing list with the
- author of RFC 3023 leads me to the conclusion that any document
- served with a Content-Type of text/* and no charset parameter
- must be treated as us-ascii. (We now do this.) And also that it
- must always be flagged as non-well-formed. (We now do this too.)
+ # Furthermore, discussion on the atom-syntax mailing list with the
+ # author of RFC 3023 leads me to the conclusion that any document
+ # served with a Content-Type of text/* and no charset parameter
+ # must be treated as us-ascii. (We now do this.) And also that it
+ # must always be flagged as non-well-formed. (We now do this too.)
- If Content-Type is unspecified (input was local file or non-HTTP source)
- or unrecognized (server just got it totally wrong), then go by the
- encoding given in the XML prefix of the document and default to
- 'iso-8859-1' as per the HTTP specification (RFC 2616).
+ # If Content-Type is unspecified (input was local file or non-HTTP source)
+ # or unrecognized (server just got it totally wrong), then go by the
+ # encoding given in the XML prefix of the document and default to
+ # 'iso-8859-1' as per the HTTP specification (RFC 2616).
- Then, assuming we didn't find a character encoding in the HTTP headers
- (and the HTTP Content-type allowed us to look in the body), we need
- to sniff the first few bytes of the XML data and try to determine
- whether the encoding is ASCII-compatible. Section F of the XML
- specification shows the way here:
- http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
-
- If the sniffed encoding is not ASCII-compatible, we need to make it
- ASCII compatible so that we can sniff further into the XML declaration
- to find the encoding attribute, which will tell us the true encoding.
-
- Of course, none of this guarantees that we will be able to parse the
- feed in the declared character encoding (assuming it was declared
- correctly, which many are not). iconv_codec can help a lot;
- you should definitely install it if you can.
- http://cjkpython.i18n.org/
- '''
-
- def _parseHTTPContentType(content_type):
- '''takes HTTP Content-Type header and returns (content type, charset)
-
- If no charset is specified, returns (content type, '')
- If no content type is specified, returns ('', '')
- Both return parameters are guaranteed to be lowercase strings
- '''
- content_type = content_type or ''
- content_type, params = cgi.parse_header(content_type)
- charset = params.get('charset', '').replace("'", "")
- if not isinstance(charset, unicode):
- charset = charset.decode('utf-8', 'ignore')
- return content_type, charset
-
- sniffed_xml_encoding = u''
- xml_encoding = u''
- true_encoding = u''
- http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
- # Must sniff for non-ASCII-compatible character encodings before
- # searching for XML declaration. This heuristic is defined in
- # section F of the XML specification:
+ # Then, assuming we didn't find a character encoding in the HTTP headers
+ # (and the HTTP Content-type allowed us to look in the body), we need
+ # to sniff the first few bytes of the XML data and try to determine
+ # whether the encoding is ASCII-compatible. Section F of the XML
+ # specification shows the way here:
# http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+
+ # If the sniffed encoding is not ASCII-compatible, we need to make it
+ # ASCII compatible so that we can sniff further into the XML declaration
+ # to find the encoding attribute, which will tell us the true encoding.
+
+ # Of course, none of this guarantees that we will be able to parse the
+ # feed in the declared character encoding (assuming it was declared
+ # correctly, which many are not). iconv_codec can help a lot;
+ # you should definitely install it if you can.
+ # http://cjkpython.i18n.org/
+
+ bom_encoding = u''
+ xml_encoding = u''
+ rfc3023_encoding = u''
+
+ # Look at the first few bytes of the document to guess what
+ # its encoding may be. We only need to decode enough of the
+ # document that we can use an ASCII-compatible regular
+ # expression to search for an XML encoding declaration.
+ # The heuristic follows the XML specification, section F:
+ # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+ # Check for BOMs first.
+ if data[:4] == codecs.BOM_UTF32_BE:
+ bom_encoding = u'utf-32be'
+ data = data[4:]
+ elif data[:4] == codecs.BOM_UTF32_LE:
+ bom_encoding = u'utf-32le'
+ data = data[4:]
+ elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
+ bom_encoding = u'utf-16be'
+ data = data[2:]
+ elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
+ bom_encoding = u'utf-16le'
+ data = data[2:]
+ elif data[:3] == codecs.BOM_UTF8:
+ bom_encoding = u'utf-8'
+ data = data[3:]
+    # Check for the characters '<?xm' in several encodings.
+    elif data[:4] == EBCDIC_MARKER:
+        bom_encoding = u'cp037'
+    elif data[:4] == UTF16BE_MARKER:
+        bom_encoding = u'utf-16be'
+    elif data[:4] == UTF16LE_MARKER:
+        bom_encoding = u'utf-16le'
+    elif data[:4] == UTF32BE_MARKER:
+        bom_encoding = u'utf-32be'
+    elif data[:4] == UTF32LE_MARKER:
+        bom_encoding = u'utf-32le'
+
+    tempdata = data
    try:
-        if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
-            # EBCDIC
-            xml_data = _ebcdic_to_ascii(xml_data)
-        elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
-            # UTF-16BE
-            sniffed_xml_encoding = u'utf-16be'
-            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
-        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
- # UTF-16BE with BOM
- sniffed_xml_encoding = u'utf-16be'
- xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
- elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
- # UTF-16LE
- sniffed_xml_encoding = u'utf-16le'
- xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
- elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
- # UTF-16LE with BOM
- sniffed_xml_encoding = u'utf-16le'
- xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
- elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
- # UTF-32BE
- sniffed_xml_encoding = u'utf-32be'
- if _UTF32_AVAILABLE:
- xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
- elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
- # UTF-32LE
- sniffed_xml_encoding = u'utf-32le'
- if _UTF32_AVAILABLE:
- xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
- elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
- # UTF-32BE with BOM
- sniffed_xml_encoding = u'utf-32be'
- if _UTF32_AVAILABLE:
- xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
- elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
- # UTF-32LE with BOM
- sniffed_xml_encoding = u'utf-32le'
- if _UTF32_AVAILABLE:
- xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
- elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
- # UTF-8 with BOM
- sniffed_xml_encoding = u'utf-8'
- xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
- else:
- # ASCII-compatible
- pass
- xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data)
- except UnicodeDecodeError:
+        if bom_encoding:
+            tempdata = data.decode(bom_encoding).encode('utf-8')
+    except (UnicodeDecodeError, LookupError):
+        # feedparser recognizes UTF-32 encodings that aren't
+        # available in Python 2.4 and 2.5, so it's possible to
+        # encounter a LookupError during decoding.
xml_encoding_match = None
+ else:
+ xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
+
if xml_encoding_match:
xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
- if sniffed_xml_encoding and (xml_encoding in (u'iso-10646-ucs-2', u'ucs-2', u'csunicode', u'iso-10646-ucs-4', u'ucs-4', u'csucs4', u'utf-16', u'utf-32', u'utf_16', u'utf_32', u'utf16', u'u16')):
- xml_encoding = sniffed_xml_encoding
+ # Normalize the xml_encoding if necessary.
+ if bom_encoding and (xml_encoding in (
+ u'u16', u'utf-16', u'utf16', u'utf_16',
+ u'u32', u'utf-32', u'utf32', u'utf_32',
+ u'iso-10646-ucs-2', u'iso-10646-ucs-4',
+ u'csucs4', u'csunicode', u'ucs-2', u'ucs-4'
+ )):
+ xml_encoding = bom_encoding
+
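
For example, a UTF-16LE document whose declaration only says `utf-16` is ambiguous about byte order; the sniffed BOM settles it:

```python
import codecs

doc = u'<?xml version="1.0" encoding="utf-16"?><feed/>'
data = codecs.BOM_UTF16_LE + doc.encode('utf-16-le')
# Sniffing sets bom_encoding = u'utf-16le'; the declared 'utf-16' is in
# the ambiguous list above, so xml_encoding becomes u'utf-16le' as well.
```
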
+ # Find the HTTP Content-Type and, hopefully, a character
+ # encoding provided by the server. The Content-Type is used
+ # to choose the "correct" encoding among the BOM encoding,
+ # XML declaration encoding, and HTTP encoding, following the
+ # heuristic defined in RFC 3023.
+ http_content_type = http_headers.get('content-type') or ''
+ http_content_type, params = cgi.parse_header(http_content_type)
+ http_encoding = params.get('charset', '').replace("'", "")
+ if not isinstance(http_encoding, unicode):
+ http_encoding = http_encoding.decode('utf-8', 'ignore')
+
acceptable_content_type = 0
- application_content_types = (u'application/xml', u'application/xml-dtd', u'application/xml-external-parsed-entity')
+ application_content_types = (u'application/xml', u'application/xml-dtd',
+ u'application/xml-external-parsed-entity')
text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
if (http_content_type in application_content_types) or \
- (http_content_type.startswith(u'application/') and http_content_type.endswith(u'+xml')):
+ (http_content_type.startswith(u'application/') and
+ http_content_type.endswith(u'+xml')):
acceptable_content_type = 1
- true_encoding = http_encoding or xml_encoding or u'utf-8'
+ rfc3023_encoding = http_encoding or xml_encoding or u'utf-8'
elif (http_content_type in text_content_types) or \
- (http_content_type.startswith(u'text/')) and http_content_type.endswith(u'+xml'):
+ (http_content_type.startswith(u'text/') and
+ http_content_type.endswith(u'+xml')):
acceptable_content_type = 1
- true_encoding = http_encoding or u'us-ascii'
+ rfc3023_encoding = http_encoding or u'us-ascii'
elif http_content_type.startswith(u'text/'):
- true_encoding = http_encoding or u'us-ascii'
+ rfc3023_encoding = http_encoding or u'us-ascii'
elif http_headers and 'content-type' not in http_headers:
- true_encoding = xml_encoding or u'iso-8859-1'
+ rfc3023_encoding = xml_encoding or u'iso-8859-1'
else:
- true_encoding = xml_encoding or u'utf-8'
- # some feeds claim to be gb2312 but are actually gb18030.
- # apparently MSIE and Firefox both do the following switch:
- if true_encoding.lower() == u'gb2312':
- true_encoding = u'gb18030'
- return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
+ rfc3023_encoding = xml_encoding or u'utf-8'
+ # gb18030 is a superset of gb2312, so always replace gb2312
+ # with gb18030 for greater compatibility.
+ if rfc3023_encoding.lower() == u'gb2312':
+ rfc3023_encoding = u'gb18030'
+ if xml_encoding.lower() == u'gb2312':
+ xml_encoding = u'gb18030'
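
Condensed, the precedence rules above amount to the following (a sketch that ignores the `acceptable_content_type` bookkeeping and approximates a missing Content-Type header with an empty string):

```python
def rfc3023_choice(content_type, http_charset, xml_encoding):
    if content_type.startswith(u'application/'):
        return http_charset or xml_encoding or u'utf-8'
    if content_type.startswith(u'text/'):
        return http_charset or u'us-ascii'    # XML declaration ignored
    if not content_type:
        return xml_encoding or u'iso-8859-1'  # RFC 2616 default
    return xml_encoding or u'utf-8'

print(rfc3023_choice(u'text/xml', u'', u'utf-8'))                   # us-ascii
print(rfc3023_choice(u'application/atom+xml', u'', u'utf-8'))       # utf-8
print(rfc3023_choice(u'application/atom+xml', u'iso-8859-9', u''))  # iso-8859-9
```
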
-def _toUTF8(data, encoding):
- '''Changes an XML data stream on the fly to specify a new encoding
+ # there are four encodings to keep track of:
+ # - http_encoding is the encoding declared in the Content-Type HTTP header
+    # - xml_encoding is the encoding declared in the <?xml declaration
+    # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
+    # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
+    error = None
-
-    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
-    encoding is a string recognized by encodings.aliases
-    '''
-    # strip Byte Order Mark (if present)
-    if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
- encoding = 'utf-16be'
- data = data[2:]
- elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
- encoding = 'utf-16le'
- data = data[2:]
- elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
- encoding = 'utf-8'
- data = data[3:]
- elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
- encoding = 'utf-32be'
- data = data[4:]
- elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
- encoding = 'utf-32le'
- data = data[4:]
- newdata = unicode(data, encoding)
- declmatch = re.compile('^<\?xml[^>]*?>')
-    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
- if declmatch.search(newdata):
- newdata = declmatch.sub(newdecl, newdata)
- else:
- newdata = newdecl + u'\n' + newdata
- return newdata.encode('utf-8')
+ if http_headers and (not acceptable_content_type):
+ if 'content-type' in http_headers:
+ msg = '%s is not an XML media type' % http_headers['content-type']
+ else:
+ msg = 'no Content-type specified'
+ error = NonXMLContentType(msg)
-def _stripDoctype(data):
- '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
+ # determine character encoding
+ known_encoding = 0
+ chardet_encoding = None
+ tried_encodings = []
+ if chardet:
+ chardet_encoding = unicode(chardet.detect(data)['encoding'], 'ascii', 'ignore')
+ # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
+ for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
+ chardet_encoding, u'utf-8', u'windows-1252', u'iso-8859-2'):
+ if not proposed_encoding:
+ continue
+ if proposed_encoding in tried_encodings:
+ continue
+ tried_encodings.append(proposed_encoding)
+ try:
+ data = data.decode(proposed_encoding)
+ except (UnicodeDecodeError, LookupError):
+ pass
+ else:
+ known_encoding = 1
+ # Update the encoding in the opening XML processing instruction.
+            new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
+            if RE_XML_DECLARATION.search(data):
+                data = RE_XML_DECLARATION.sub(new_declaration, data)
+            else:
+                data = new_declaration + u'\n' + data
+            data = data.encode('utf-8')
+            break
+ # if still no luck, give up
+ if not known_encoding:
+ error = CharacterEncodingUnknown(
+ 'document encoding unknown, I tried ' +
+ '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
+ (rfc3023_encoding, xml_encoding))
+ rfc3023_encoding = u''
+ elif proposed_encoding != rfc3023_encoding:
+ error = CharacterEncodingOverride(
+ 'document declared as %s, but parsed as %s' %
+ (rfc3023_encoding, proposed_encoding))
+ rfc3023_encoding = proposed_encoding
+
+ return data, rfc3023_encoding, error
+
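
A usage sketch for the new helper, with names as defined above:

```python
headers = {'content-type': 'application/atom+xml'}
doc = '<?xml version="1.0" encoding="iso-8859-1"?><feed>caf\xe9</feed>'
data, encoding, error = convert_to_utf8(headers, doc)
print(encoding)                    # u'iso-8859-1' (no HTTP charset, declaration used)
print("encoding='utf-8'" in data)  # True: the declaration was rewritten
print(error)                       # None: declared and actual encodings agree
```
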
+# Match XML entity declarations.
+# Example: <!ENTITY copyright "(C)">
+RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
+
+# Match XML DOCTYPE declarations.
+# Example: <!DOCTYPE feed [ ]>
+RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
+
+# Match safe entity declarations.
+# This will allow hexadecimal character references through,
+# as well as text, but not arbitrary nested entities.
+# Example: <!ENTITY cubed "&#179;">
+# Example: <!ENTITY copyright "(C)">
+# Forbidden: <!ENTITY explode1 "&explode2;&explode2;">
+RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
+
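
Given those patterns, a quick sanity check of what counts as safe (in Python 2 the byte patterns match plain strings directly):

```python
print(RE_SAFE_ENTITY_PATTERN.match(' cubed "&#179;"') is not None)
# True: numeric character references pass
print(RE_SAFE_ENTITY_PATTERN.match(' explode1 "&explode2;"') is not None)
# False: references to other entities are rejected
```
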
+def replace_doctype(data):
+ '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)
rss_version may be 'rss091n' or None
- stripped_data is the same XML document, minus the DOCTYPE
+ stripped_data is the same XML document with a replaced DOCTYPE
'''
+
+ # Divide the document into two groups by finding the location
+    # of the first element that doesn't begin with '<?' or '<!'.
    start = re.search(_s2bytes('<\w'), data)
    start = start and start.start() or -1
    head, data = data[:start+1], data[start+1:]

-    entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
- entity_results=entity_pattern.findall(head)
- head = entity_pattern.sub(_s2bytes(''), head)
-    doctype_pattern = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
- doctype_results = doctype_pattern.findall(head)
+ # Save and then remove all of the ENTITY declarations.
+ entity_results = RE_ENTITY_PATTERN.findall(head)
+ head = RE_ENTITY_PATTERN.sub(_s2bytes(''), head)
+
+ # Find the DOCTYPE declaration and check the feed type.
+ doctype_results = RE_DOCTYPE_PATTERN.findall(head)
doctype = doctype_results and doctype_results[0] or _s2bytes('')
- if doctype.lower().count(_s2bytes('netscape')):
+ if _s2bytes('netscape') in doctype.lower():
version = u'rss091n'
else:
version = None
- # only allow in 'safe' inline entity definitions
- replacement=_s2bytes('')
- if len(doctype_results)==1 and entity_results:
-        safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
- safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
+ # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
+ replacement = _s2bytes('')
+ if len(doctype_results) == 1 and entity_results:
+ match_safe_entities = lambda e: RE_SAFE_ENTITY_PATTERN.match(e)
+ safe_entities = filter(match_safe_entities, entity_results)
if safe_entities:
-            replacement=_s2bytes('<!DOCTYPE feed [\n  <!ENTITY') + _s2bytes('>\n  <!ENTITY').join(safe_entities) + _s2bytes('>\n]>')
- data = doctype_pattern.sub(replacement, head) + data
+            replacement = _s2bytes('<!DOCTYPE feed [\n<!ENTITY') \
+                        + _s2bytes('>\n<!ENTITY').join(safe_entities) \
+                        + _s2bytes('>\n]>')
+ data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data
- return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)])
+ # Precompute the safe entities for the loose parser.
+ safe_entities = dict((k.decode('utf-8'), v.decode('utf-8'))
+ for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement))
+ return version, data, safe_entities
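
A usage sketch, again with the names defined above: a Netscape RSS 0.91 DOCTYPE carrying one safe entity declaration:

```python
doc = ('<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN" [\n'
       '<!ENTITY copyright "(C)">\n'
       ']>\n'
       '<rss version="0.91"></rss>')
version, data, entities = replace_doctype(doc)
print(version)   # u'rss091n'
print(entities)  # {u'copyright': u'(C)'}
```
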
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
'''Parse a feed from a URL, file, stream, or string.
@@ -3822,24 +3884,25 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
try:
data = gzip.GzipFile(fileobj=_StringIO(data)).read()
except (IOError, struct.error), e:
- # IOError can occur if the gzip header is bad
- # struct.error can occur if the data is damaged
- # Some feeds claim to be gzipped but they're not, so
- # we get garbage. Ideally, we should re-request the
- # feed without the 'Accept-encoding: gzip' header,
- # but we don't.
+ # IOError can occur if the gzip header is bad.
+ # struct.error can occur if the data is damaged.
result['bozo'] = 1
result['bozo_exception'] = e
- data = None
+ if isinstance(e, struct.error):
+ # A gzip header was found but the data is corrupt.
+ # Ideally, we should re-request the feed without the
+ # 'Accept-encoding: gzip' header, but we don't.
+ data = None
elif zlib and 'deflate' in http_headers.get('content-encoding', ''):
try:
data = zlib.decompress(data)
except zlib.error, e:
- data = zlib.decompress(data, -zlib.MAX_WBITS)
- except zlib.error, e:
- result['bozo'] = 1
- result['bozo_exception'] = e
- data = None
+ try:
+ # The data may have no headers and no checksum.
+ data = zlib.decompress(data, -15)
+ except zlib.error, e:
+ result['bozo'] = 1
+ result['bozo_exception'] = e
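
The fallback covers servers that send raw DEFLATE streams without the zlib wrapper; a negative window size tells zlib to expect no header and no checksum:

```python
import zlib

raw = zlib.compress('<feed/>')[2:-4]  # strip the zlib header and adler32 checksum
try:
    zlib.decompress(raw)              # raises zlib.error: incorrect header check
except zlib.error:
    print(zlib.decompress(raw, -15))  # '<feed/>'
```
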
# save HTTP headers
if http_headers:
@@ -3868,25 +3931,22 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
if data is None:
return result
- # there are four encodings to keep track of:
- # - http_encoding is the encoding declared in the Content-Type HTTP header
-    # - xml_encoding is the encoding declared in the <?xml declaration