Mirror of https://github.com/samuelclay/NewsBlur.git, synced 2025-08-31 21:41:33 +00:00

Upgrading feedparser from 5.1.1 to 5.1.2. Because I'm some kind of sadist.

This commit is contained in:
parent 332d85db91
commit 3a95882c48

1 changed file with 283 additions and 308 deletions
@@ -9,7 +9,7 @@ Required: Python 2.4 or later
 Recommended: iconv_codec <http://cjkpython.i18n.org/>
 """
 
-__version__ = "5.1.1"
+__version__ = "5.1.2"
 __license__ = """
 Copyright (c) 2010-2012 Kurt McKee <contactme@kurtmckee.org>
 Copyright (c) 2002-2008 Mark Pilgrim
@@ -131,9 +131,10 @@ else:
 # https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
 # Many more will likely need to be added!
 ACCEPTABLE_URI_SCHEMES = (
-    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto',
-    'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp',
-    'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
+    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
+    'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
+    'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
+    'wais',
     # Additional common-but-unofficial schemes
     'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
     'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
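The only functional change here is the new 'magnet' entry. feedparser consults this tuple when deciding whether a link's scheme is safe to keep. A rough sketch of that kind of check (illustrative only, not feedparser's actual helper):

```python
try:
    from urllib.parse import urlparse   # Python 3
except ImportError:
    from urlparse import urlparse       # Python 2, the diff's era

ACCEPTABLE_URI_SCHEMES = ('http', 'https', 'magnet')  # abbreviated

def make_safe_uri(uri):
    # Scheme-less (relative) URIs pass through; anything with a
    # scheme must appear in the whitelist or it is dropped.
    scheme = urlparse(uri)[0]
    if scheme and scheme not in ACCEPTABLE_URI_SCHEMES:
        return u''
    return uri

print(make_safe_uri('magnet:?xt=urn:btih:abc'))  # kept as of 5.1.2
```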
@@ -283,15 +284,6 @@ except ImportError:
     BeautifulSoup = None
     PARSE_MICROFORMATS = False
 
-try:
-    # the utf_32 codec was introduced in Python 2.6; it's necessary to
-    # check this as long as feedparser supports Python 2.4 and 2.5
-    codecs.lookup('utf_32')
-except LookupError:
-    _UTF32_AVAILABLE = False
-else:
-    _UTF32_AVAILABLE = True
-
 # ---------- don't touch these ----------
 class ThingsNobodyCaresAboutButMe(Exception): pass
 class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
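The deleted block probed for the utf_32 codec once at import time (the codec is missing on Python 2.4/2.5). 5.1.2 drops the `_UTF32_AVAILABLE` flag and instead catches LookupError wherever a decode is attempted, as the new convert_to_utf8() later in this diff does. The two styles side by side (a sketch, not feedparser API):

```python
import codecs

# 5.1.1 style: probe once at import time.
try:
    codecs.lookup('utf_32')
    UTF32_AVAILABLE = True
except LookupError:
    UTF32_AVAILABLE = False

# 5.1.2 style: just try the decode and treat a missing codec
# like any other failed candidate encoding.
def try_decode(data, encoding):
    try:
        return data.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        return None
```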
@@ -1721,6 +1713,8 @@ class _FeedParserMixin:
         self.push('itunes_image', 0)
         if attrsD.get('href'):
             self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
+        elif attrsD.get('url'):
+            self._getContext()['image'] = FeedParserDict({'href': attrsD.get('url')})
     _start_itunes_link = _start_itunes_image
 
     def _end_itunes_block(self):
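With this change, feeds that publish `<itunes:image url="...">` (instead of the documented href attribute) still yield feed.image.href. A quick check, assuming feedparser 5.1.2 is importable:

```python
import feedparser

xml = '''<rss version="2.0"
  xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd">
  <channel>
    <title>demo</title>
    <itunes:image url="http://example.com/cover.jpg"/>
  </channel>
</rss>'''

result = feedparser.parse(xml)
print(result.feed.image.href)   # http://example.com/cover.jpg
```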
@@ -2554,7 +2548,7 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
         self.baseuri = baseuri
 
     def resolveURI(self, uri):
-        return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip()))
+        return _makeSafeAbsoluteURI(self.baseuri, uri.strip())
 
     def unknown_starttag(self, tag, attrs):
         attrs = self.normalize_attrs(attrs)
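_makeSafeAbsoluteURI grew a two-argument form that joins and then vets the result, so resolveURI() no longer needs its own _urljoin call. The join step alone is not a safety check, which is why the scheme test matters:

```python
try:
    from urllib.parse import urljoin   # Python 3
except ImportError:
    from urlparse import urljoin       # Python 2

print(urljoin('http://example.com/feed/', 'ep1.mp3'))
# http://example.com/feed/ep1.mp3
print(urljoin('http://example.com/feed/', 'javascript:alert(1)'))
# javascript:alert(1) -- survives the join untouched; only the
# scheme whitelist inside _makeSafeAbsoluteURI rejects it
```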
@@ -2607,8 +2601,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
       'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
       'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
       'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
-      'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript',
-      'object', 'embed', 'iframe', 'param'])
+      'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'])
 
     acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey',
       'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
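Dropping 'object', 'embed', 'iframe', and 'param' from the whitelist means embedded players in entry content are now stripped by the sanitizer. One way to see it, using feedparser's internal sanitizer entry point (a private function, so subject to change):

```python
import feedparser

html = 'before <iframe src="http://example.com/player"></iframe> after'
# With the 5.1.2 whitelist the iframe element is removed outright;
# only its (empty) text content would survive.
print(feedparser._sanitizeHTML(html, 'utf-8', 'text/html'))
# roughly: 'before  after'
```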
@@ -3010,11 +3003,14 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
     # try to open with native open function (if url_file_stream_or_string is a filename)
     try:
         return open(url_file_stream_or_string, 'rb')
-    except (IOError, UnicodeEncodeError):
+    except (IOError, UnicodeEncodeError, TypeError):
         # if url_file_stream_or_string is a unicode object that
         # cannot be converted to the encoding returned by
         # sys.getfilesystemencoding(), a UnicodeEncodeError
         # will be thrown
+        # If url_file_stream_or_string is a string that contains NULL
+        # (such as an XML document encoded in UTF-32), TypeError will
+        # be thrown.
         pass
 
     # treat url_file_stream_or_string as string
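The new TypeError case covers byte strings with embedded NULs: a UTF-32 document passed to parse() as a string starts with NUL bytes, and on Python 2 open() refuses such a "filename" with TypeError rather than IOError. A minimal reproduction (Python 2; Python 3 raises ValueError instead):

```python
data = '\x00\x00\xfe\xff<feed/>'      # UTF-32-ish bytes, not a filename
try:
    open(data, 'rb')
except (IOError, UnicodeEncodeError, TypeError) as e:
    # On Python 2.7 this prints TypeError, which previously
    # escaped _open_resource() and crashed parse().
    print(type(e).__name__)
```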
@@ -3452,7 +3448,7 @@ _rfc822_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
 _rfc822_month = "(?P<month>%s)(?:[a-z]*,?)" % ('|'.join(_rfc822_months))
 # The year may be 2 or 4 digits; capture the century if it exists
 _rfc822_year = "(?P<year>(?:\d{2})?\d{2})"
-_rfc822_day = "(?P<day>\d{2})"
+_rfc822_day = "(?P<day> *\d{1,2})"
 _rfc822_date = "%s %s %s" % (_rfc822_day, _rfc822_month, _rfc822_year)
 
 _rfc822_hour = "(?P<hour>\d{2}):(?P<minute>\d{2})(?::(?P<second>\d{2}))?"
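The old pattern required exactly two digits, so RFC 822 dates like "Sun, 5 Apr 2012 ..." failed to parse. The relaxed pattern accepts one or two digits, optionally padded with spaces:

```python
import re

old_day = re.compile(r"(?P<day>\d{2})$")
new_day = re.compile(r"(?P<day> *\d{1,2})$")

for day in ('05', '5', ' 5'):
    print(repr(day), bool(old_day.match(day)), bool(new_day.match(day)))
# '05' True True
# '5'  False True
# ' 5' False True
```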
@@ -3561,217 +3557,283 @@ def _parse_date(dateString):
         return date9tuple
     return None
 
-def _getCharacterEncoding(http_headers, xml_data):
-    '''Get the character encoding of the XML document
+# Each marker represents some of the characters of the opening XML
+# processing instruction ('<?xm') in the specified encoding.
+EBCDIC_MARKER = _l2bytes([0x4C, 0x6F, 0xA7, 0x94])
+UTF16BE_MARKER = _l2bytes([0x00, 0x3C, 0x00, 0x3F])
+UTF16LE_MARKER = _l2bytes([0x3C, 0x00, 0x3F, 0x00])
+UTF32BE_MARKER = _l2bytes([0x00, 0x00, 0x00, 0x3C])
+UTF32LE_MARKER = _l2bytes([0x3C, 0x00, 0x00, 0x00])
+
+ZERO_BYTES = _l2bytes([0x00, 0x00])
+
+# Match the opening XML declaration.
+# Example: <?xml version="1.0" encoding="utf-8"?>
+RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')
+
+# Capture the value of the XML processing instruction's encoding attribute.
+# Example: <?xml version="1.0" encoding="utf-8"?>
+RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'))
+
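RE_XML_PI_ENCODING is what later pulls the declared encoding out of the (possibly transcoded) document prefix. Its behavior on a typical declaration — the `_s2bytes` wrapper only turns the pattern into a byte string on Python 3, so a plain str suffices for this sketch:

```python
import re

RE_XML_PI_ENCODING = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')

m = RE_XML_PI_ENCODING.match('<?xml version="1.0" encoding="UTF-8"?>')
print(m.group(1))   # UTF-8 (lower-cased later by convert_to_utf8)
```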
+def convert_to_utf8(http_headers, data):
+    '''Detect and convert the character encoding to UTF-8.
+
     http_headers is a dictionary
-    xml_data is a raw string (not Unicode)
+    data is a raw string (not Unicode)'''
 
-    This is so much trickier than it sounds, it's not even funny.
-    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
-    is application/xml, application/*+xml,
-    application/xml-external-parsed-entity, or application/xml-dtd,
-    the encoding given in the charset parameter of the HTTP Content-Type
-    takes precedence over the encoding given in the XML prefix within the
-    document, and defaults to 'utf-8' if neither are specified. But, if
-    the HTTP Content-Type is text/xml, text/*+xml, or
-    text/xml-external-parsed-entity, the encoding given in the XML prefix
-    within the document is ALWAYS IGNORED and only the encoding given in
-    the charset parameter of the HTTP Content-Type header should be
-    respected, and it defaults to 'us-ascii' if not specified.
+    # This is so much trickier than it sounds, it's not even funny.
+    # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
+    # is application/xml, application/*+xml,
+    # application/xml-external-parsed-entity, or application/xml-dtd,
+    # the encoding given in the charset parameter of the HTTP Content-Type
+    # takes precedence over the encoding given in the XML prefix within the
+    # document, and defaults to 'utf-8' if neither are specified. But, if
+    # the HTTP Content-Type is text/xml, text/*+xml, or
+    # text/xml-external-parsed-entity, the encoding given in the XML prefix
+    # within the document is ALWAYS IGNORED and only the encoding given in
+    # the charset parameter of the HTTP Content-Type header should be
+    # respected, and it defaults to 'us-ascii' if not specified.
 
-    Furthermore, discussion on the atom-syntax mailing list with the
-    author of RFC 3023 leads me to the conclusion that any document
-    served with a Content-Type of text/* and no charset parameter
-    must be treated as us-ascii. (We now do this.) And also that it
-    must always be flagged as non-well-formed. (We now do this too.)
+    # Furthermore, discussion on the atom-syntax mailing list with the
+    # author of RFC 3023 leads me to the conclusion that any document
+    # served with a Content-Type of text/* and no charset parameter
+    # must be treated as us-ascii. (We now do this.) And also that it
+    # must always be flagged as non-well-formed. (We now do this too.)
 
-    If Content-Type is unspecified (input was local file or non-HTTP source)
-    or unrecognized (server just got it totally wrong), then go by the
-    encoding given in the XML prefix of the document and default to
-    'iso-8859-1' as per the HTTP specification (RFC 2616).
+    # If Content-Type is unspecified (input was local file or non-HTTP source)
+    # or unrecognized (server just got it totally wrong), then go by the
+    # encoding given in the XML prefix of the document and default to
+    # 'iso-8859-1' as per the HTTP specification (RFC 2616).
 
-    Then, assuming we didn't find a character encoding in the HTTP headers
-    (and the HTTP Content-type allowed us to look in the body), we need
-    to sniff the first few bytes of the XML data and try to determine
-    whether the encoding is ASCII-compatible. Section F of the XML
-    specification shows the way here:
-    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
-
-    If the sniffed encoding is not ASCII-compatible, we need to make it
-    ASCII compatible so that we can sniff further into the XML declaration
-    to find the encoding attribute, which will tell us the true encoding.
-
-    Of course, none of this guarantees that we will be able to parse the
-    feed in the declared character encoding (assuming it was declared
-    correctly, which many are not). iconv_codec can help a lot;
-    you should definitely install it if you can.
-    http://cjkpython.i18n.org/
-    '''
-
-    def _parseHTTPContentType(content_type):
-        '''takes HTTP Content-Type header and returns (content type, charset)
-
-        If no charset is specified, returns (content type, '')
-        If no content type is specified, returns ('', '')
-        Both return parameters are guaranteed to be lowercase strings
-        '''
-        content_type = content_type or ''
-        content_type, params = cgi.parse_header(content_type)
-        charset = params.get('charset', '').replace("'", "")
-        if not isinstance(charset, unicode):
-            charset = charset.decode('utf-8', 'ignore')
-        return content_type, charset
-
-    sniffed_xml_encoding = u''
-    xml_encoding = u''
-    true_encoding = u''
-    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
-    # Must sniff for non-ASCII-compatible character encodings before
-    # searching for XML declaration. This heuristic is defined in
-    # section F of the XML specification:
+    # Then, assuming we didn't find a character encoding in the HTTP headers
+    # (and the HTTP Content-type allowed us to look in the body), we need
+    # to sniff the first few bytes of the XML data and try to determine
+    # whether the encoding is ASCII-compatible. Section F of the XML
+    # specification shows the way here:
     # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
 
+    # If the sniffed encoding is not ASCII-compatible, we need to make it
+    # ASCII compatible so that we can sniff further into the XML declaration
+    # to find the encoding attribute, which will tell us the true encoding.
+
+    # Of course, none of this guarantees that we will be able to parse the
+    # feed in the declared character encoding (assuming it was declared
+    # correctly, which many are not). iconv_codec can help a lot;
+    # you should definitely install it if you can.
+    # http://cjkpython.i18n.org/
+
+    bom_encoding = u''
+    xml_encoding = u''
+    rfc3023_encoding = u''
+
+    # Look at the first few bytes of the document to guess what
+    # its encoding may be. We only need to decode enough of the
+    # document that we can use an ASCII-compatible regular
+    # expression to search for an XML encoding declaration.
+    # The heuristic follows the XML specification, section F:
+    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+    # Check for BOMs first.
+    if data[:4] == codecs.BOM_UTF32_BE:
+        bom_encoding = u'utf-32be'
+        data = data[4:]
+    elif data[:4] == codecs.BOM_UTF32_LE:
+        bom_encoding = u'utf-32le'
+        data = data[4:]
+    elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
+        bom_encoding = u'utf-16be'
+        data = data[2:]
+    elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
+        bom_encoding = u'utf-16le'
+        data = data[2:]
+    elif data[:3] == codecs.BOM_UTF8:
+        bom_encoding = u'utf-8'
+        data = data[3:]
+    # Check for the characters '<?xm' in several encodings.
+    elif data[:4] == EBCDIC_MARKER:
+        bom_encoding = u'cp037'
+    elif data[:4] == UTF16BE_MARKER:
+        bom_encoding = u'utf-16be'
+    elif data[:4] == UTF16LE_MARKER:
+        bom_encoding = u'utf-16le'
+    elif data[:4] == UTF32BE_MARKER:
+        bom_encoding = u'utf-32be'
+    elif data[:4] == UTF32LE_MARKER:
+        bom_encoding = u'utf-32le'
+
+    tempdata = data
     try:
-        if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
-            # In all forms of EBCDIC, these four bytes correspond
-            # to the string '<?xm'; try decoding using CP037
-            sniffed_xml_encoding = u'cp037'
-            xml_data = xml_data.decode('cp037').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
-            # UTF-16BE
-            sniffed_xml_encoding = u'utf-16be'
-            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
-        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
-            # UTF-16BE with BOM
-            sniffed_xml_encoding = u'utf-16be'
-            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
-            # UTF-16LE
-            sniffed_xml_encoding = u'utf-16le'
-            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
-        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
-            # UTF-16LE with BOM
-            sniffed_xml_encoding = u'utf-16le'
-            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
-            # UTF-32BE
-            sniffed_xml_encoding = u'utf-32be'
-            if _UTF32_AVAILABLE:
-                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
-            # UTF-32LE
-            sniffed_xml_encoding = u'utf-32le'
-            if _UTF32_AVAILABLE:
-                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
-            # UTF-32BE with BOM
-            sniffed_xml_encoding = u'utf-32be'
-            if _UTF32_AVAILABLE:
-                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
-            # UTF-32LE with BOM
-            sniffed_xml_encoding = u'utf-32le'
-            if _UTF32_AVAILABLE:
-                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
-        elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
-            # UTF-8 with BOM
-            sniffed_xml_encoding = u'utf-8'
-            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
-        else:
-            # ASCII-compatible
-            pass
-        xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data)
-    except UnicodeDecodeError:
+        if bom_encoding:
+            tempdata = data.decode(bom_encoding).encode('utf-8')
+    except (UnicodeDecodeError, LookupError):
+        # feedparser recognizes UTF-32 encodings that aren't
+        # available in Python 2.4 and 2.5, so it's possible to
+        # encounter a LookupError during decoding.
         xml_encoding_match = None
+    else:
+        xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
+
     if xml_encoding_match:
         xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
-        if sniffed_xml_encoding and (xml_encoding in (u'iso-10646-ucs-2', u'ucs-2', u'csunicode', u'iso-10646-ucs-4', u'ucs-4', u'csucs4', u'utf-16', u'utf-32', u'utf_16', u'utf_32', u'utf16', u'u16')):
-            xml_encoding = sniffed_xml_encoding
+        # Normalize the xml_encoding if necessary.
+        if bom_encoding and (xml_encoding in (
+                u'u16', u'utf-16', u'utf16', u'utf_16',
+                u'u32', u'utf-32', u'utf32', u'utf_32',
+                u'iso-10646-ucs-2', u'iso-10646-ucs-4',
+                u'csucs4', u'csunicode', u'ucs-2', u'ucs-4'
+            )):
+            xml_encoding = bom_encoding
+
+    # Find the HTTP Content-Type and, hopefully, a character
+    # encoding provided by the server. The Content-Type is used
+    # to choose the "correct" encoding among the BOM encoding,
+    # XML declaration encoding, and HTTP encoding, following the
+    # heuristic defined in RFC 3023.
+    http_content_type = http_headers.get('content-type') or ''
+    http_content_type, params = cgi.parse_header(http_content_type)
+    http_encoding = params.get('charset', '').replace("'", "")
+    if not isinstance(http_encoding, unicode):
+        http_encoding = http_encoding.decode('utf-8', 'ignore')
+
     acceptable_content_type = 0
-    application_content_types = (u'application/xml', u'application/xml-dtd', u'application/xml-external-parsed-entity')
+    application_content_types = (u'application/xml', u'application/xml-dtd',
+                                 u'application/xml-external-parsed-entity')
     text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
     if (http_content_type in application_content_types) or \
-            (http_content_type.startswith(u'application/') and http_content_type.endswith(u'+xml')):
+            (http_content_type.startswith(u'application/') and
+             http_content_type.endswith(u'+xml')):
         acceptable_content_type = 1
-        true_encoding = http_encoding or xml_encoding or u'utf-8'
+        rfc3023_encoding = http_encoding or xml_encoding or u'utf-8'
     elif (http_content_type in text_content_types) or \
-            (http_content_type.startswith(u'text/')) and http_content_type.endswith(u'+xml'):
+            (http_content_type.startswith(u'text/') and
+             http_content_type.endswith(u'+xml')):
         acceptable_content_type = 1
-        true_encoding = http_encoding or u'us-ascii'
+        rfc3023_encoding = http_encoding or u'us-ascii'
     elif http_content_type.startswith(u'text/'):
-        true_encoding = http_encoding or u'us-ascii'
+        rfc3023_encoding = http_encoding or u'us-ascii'
     elif http_headers and 'content-type' not in http_headers:
-        true_encoding = xml_encoding or u'iso-8859-1'
+        rfc3023_encoding = xml_encoding or u'iso-8859-1'
     else:
-        true_encoding = xml_encoding or u'utf-8'
+        rfc3023_encoding = xml_encoding or u'utf-8'
-    # some feeds claim to be gb2312 but are actually gb18030.
-    # apparently MSIE and Firefox both do the following switch:
+    # gb18030 is a superset of gb2312, so always replace gb2312
+    # with gb18030 for greater compatibility.
-    if true_encoding.lower() == u'gb2312':
-        true_encoding = u'gb18030'
+    if rfc3023_encoding.lower() == u'gb2312':
+        rfc3023_encoding = u'gb18030'
-    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
-
-def _toUTF8(data, encoding):
-    '''Changes an XML data stream on the fly to specify a new encoding
-
-    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
-    encoding is a string recognized by encodings.aliases
-    '''
-    # strip Byte Order Mark (if present)
-    if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
-        encoding = 'utf-16be'
-        data = data[2:]
-    elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
-        encoding = 'utf-16le'
-        data = data[2:]
-    elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
-        encoding = 'utf-8'
-        data = data[3:]
-    elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
-        encoding = 'utf-32be'
-        data = data[4:]
-    elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
-        encoding = 'utf-32le'
-        data = data[4:]
-    newdata = unicode(data, encoding)
-    declmatch = re.compile('^<\?xml[^>]*?>')
-    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
-    if declmatch.search(newdata):
-        newdata = declmatch.sub(newdecl, newdata)
-    else:
-        newdata = newdecl + u'\n' + newdata
-    return newdata.encode('utf-8')
-
-def _stripDoctype(data):
-    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
+    if xml_encoding.lower() == u'gb2312':
+        xml_encoding = u'gb18030'
+
+    # there are four encodings to keep track of:
+    # - http_encoding is the encoding declared in the Content-Type HTTP header
+    # - xml_encoding is the encoding declared in the <?xml declaration
+    # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
+    # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
+    error = None
+
+    if http_headers and (not acceptable_content_type):
+        if 'content-type' in http_headers:
+            msg = '%s is not an XML media type' % http_headers['content-type']
+        else:
+            msg = 'no Content-type specified'
+        error = NonXMLContentType(msg)
+
+    # determine character encoding
+    known_encoding = 0
+    chardet_encoding = None
+    tried_encodings = []
+    if chardet:
+        chardet_encoding = unicode(chardet.detect(data)['encoding'], 'ascii', 'ignore')
+    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
+    for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
+            chardet_encoding, u'utf-8', u'windows-1252', u'iso-8859-2'):
+        if not proposed_encoding:
+            continue
+        if proposed_encoding in tried_encodings:
+            continue
+        tried_encodings.append(proposed_encoding)
+        try:
+            data = data.decode(proposed_encoding)
+        except (UnicodeDecodeError, LookupError):
+            pass
+        else:
+            known_encoding = 1
+            # Update the encoding in the opening XML processing instruction.
+            new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
+            if RE_XML_DECLARATION.search(data):
+                data = RE_XML_DECLARATION.sub(new_declaration, data)
+            else:
+                data = new_declaration + u'\n' + data
+            data = data.encode('utf-8')
+            break
+    # if still no luck, give up
+    if not known_encoding:
+        error = CharacterEncodingUnknown(
+            'document encoding unknown, I tried ' +
+            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
+            (rfc3023_encoding, xml_encoding))
+        rfc3023_encoding = u''
+    elif proposed_encoding != rfc3023_encoding:
+        error = CharacterEncodingOverride(
+            'document declared as %s, but parsed as %s' %
+            (rfc3023_encoding, proposed_encoding))
+        rfc3023_encoding = proposed_encoding
+
+    return data, rfc3023_encoding, error
+
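The net effect of convert_to_utf8(): one pass that sniffs a BOM or marker, decodes against a prioritized list of candidates, rewrites the XML declaration, and re-encodes to UTF-8, returning the data plus the chosen encoding and at most one bozo error. The candidate-trial idea in miniature:

```python
def try_encodings(data, candidates):
    # Mirror of the loop above: the first encoding that decodes wins;
    # LookupError covers codecs this Python build doesn't have.
    tried = []
    for enc in candidates:
        if not enc or enc in tried:
            continue
        tried.append(enc)
        try:
            return data.decode(enc).encode('utf-8'), enc
        except (UnicodeDecodeError, LookupError):
            pass
    return None, ''

utf8_data, enc = try_encodings(b'<feed/>', ['utf-8', 'windows-1252'])
print(enc)   # utf-8
```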
+# Match XML entity declarations.
+# Example: <!ENTITY copyright "(C)">
+RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
+
+# Match XML DOCTYPE declarations.
+# Example: <!DOCTYPE feed [ ]>
+RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
+
+# Match safe entity declarations.
+# This will allow hexadecimal character references through,
+# as well as text, but not arbitrary nested entities.
+# Example: cubed "&#179;"
+# Example: copyright "(C)"
+# Forbidden: explode1 "&explode2;&explode2;"
+RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
+
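RE_SAFE_ENTITY_PATTERN is the gatekeeper for which DOCTYPE-declared entities survive: plain text and numeric character references pass, while anything referencing another entity does not (that shape is what billion-laughs expansion attacks rely on). Against the examples in the comments above:

```python
import re

RE_SAFE_ENTITY_PATTERN = re.compile('\s+(\w+)\s+"(&#\w+;|[^&"]*)"')

for decl in (' cubed "&#179;"', ' copyright "(C)"',
             ' explode1 "&explode2;&explode2;"'):
    print(decl.strip(), '->', bool(RE_SAFE_ENTITY_PATTERN.match(decl)))
# cubed -> True, copyright -> True, explode1 -> False
```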
+def replace_doctype(data):
+    '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)
+
     rss_version may be 'rss091n' or None
-    stripped_data is the same XML document, minus the DOCTYPE
+    stripped_data is the same XML document with a replaced DOCTYPE
     '''
+
+    # Divide the document into two groups by finding the location
+    # of the first element that doesn't begin with '<?' or '<!'.
     start = re.search(_s2bytes('<\w'), data)
     start = start and start.start() or -1
-    head,data = data[:start+1], data[start+1:]
+    head, data = data[:start+1], data[start+1:]
+
-    entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
-    entity_results=entity_pattern.findall(head)
-    head = entity_pattern.sub(_s2bytes(''), head)
+    # Save and then remove all of the ENTITY declarations.
+    entity_results = RE_ENTITY_PATTERN.findall(head)
+    head = RE_ENTITY_PATTERN.sub(_s2bytes(''), head)
-    doctype_pattern = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
-    doctype_results = doctype_pattern.findall(head)
+
+    # Find the DOCTYPE declaration and check the feed type.
+    doctype_results = RE_DOCTYPE_PATTERN.findall(head)
     doctype = doctype_results and doctype_results[0] or _s2bytes('')
-    if doctype.lower().count(_s2bytes('netscape')):
+    if _s2bytes('netscape') in doctype.lower():
         version = u'rss091n'
     else:
         version = None
 
-    # only allow in 'safe' inline entity definitions
+    # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
-    replacement=_s2bytes('')
+    replacement = _s2bytes('')
-    if len(doctype_results)==1 and entity_results:
+    if len(doctype_results) == 1 and entity_results:
-        safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
-        safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
+        match_safe_entities = lambda e: RE_SAFE_ENTITY_PATTERN.match(e)
+        safe_entities = filter(match_safe_entities, entity_results)
         if safe_entities:
-            replacement=_s2bytes('<!DOCTYPE feed [\n <!ENTITY') + _s2bytes('>\n <!ENTITY ').join(safe_entities) + _s2bytes('>\n]>')
-    data = doctype_pattern.sub(replacement, head) + data
+            replacement = _s2bytes('<!DOCTYPE feed [\n<!ENTITY') \
+                + _s2bytes('>\n<!ENTITY ').join(safe_entities) \
+                + _s2bytes('>\n]>')
+    data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data
+
-    return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)])
+    # Precompute the safe entities for the loose parser.
+    safe_entities = dict((k.decode('utf-8'), v.decode('utf-8'))
+                         for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement))
+    return version, data, safe_entities
 
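Putting replace_doctype together: it reports the Netscape RSS 0.91 variant, strips the original DOCTYPE, and re-issues only the safe entities so the strict parser can still resolve them. A sketch of its observable behavior, assuming 5.1.2's module-level function (plain str works on Python 2; byte strings would be needed on Python 3):

```python
import feedparser

doc = ('<?xml version="1.0"?>\n'
       '<!DOCTYPE feed [\n'
       '<!ENTITY copyright "(C) 2012">\n'
       ']>\n'
       '<feed><title>&copyright;</title></feed>')

version, data, entities = feedparser.replace_doctype(doc)
print(version)    # None (not the Netscape RSS 0.91 DTD)
print(entities)   # {u'copyright': u'(C) 2012'}
```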
 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
     '''Parse a feed from a URL, file, stream, or string.
@@ -3822,24 +3884,25 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
         try:
             data = gzip.GzipFile(fileobj=_StringIO(data)).read()
         except (IOError, struct.error), e:
-            # IOError can occur if the gzip header is bad
-            # struct.error can occur if the data is damaged
+            # IOError can occur if the gzip header is bad.
+            # struct.error can occur if the data is damaged.
-            # Some feeds claim to be gzipped but they're not, so
-            # we get garbage. Ideally, we should re-request the
-            # feed without the 'Accept-encoding: gzip' header,
-            # but we don't.
             result['bozo'] = 1
             result['bozo_exception'] = e
-            data = None
+            if isinstance(e, struct.error):
+                # A gzip header was found but the data is corrupt.
+                # Ideally, we should re-request the feed without the
+                # 'Accept-encoding: gzip' header, but we don't.
+                data = None
     elif zlib and 'deflate' in http_headers.get('content-encoding', ''):
         try:
             data = zlib.decompress(data)
         except zlib.error, e:
-            data = zlib.decompress(data, -zlib.MAX_WBITS)
-        except zlib.error, e:
-            result['bozo'] = 1
-            result['bozo_exception'] = e
-            data = None
+            try:
+                # The data may have no headers and no checksum.
+                data = zlib.decompress(data, -15)
+            except zlib.error, e:
+                result['bozo'] = 1
+                result['bozo_exception'] = e
 
     # save HTTP headers
     if http_headers:
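The deflate branch now retries with a negative window-bits value before giving up: some servers send raw DEFLATE streams (no zlib header, no checksum) under Content-Encoding: deflate, and -15 tells zlib to expect exactly that. Demonstrating with the standard library:

```python
import zlib

payload = b'<feed/>'
raw = zlib.compress(payload)[2:-4]    # strip zlib header and checksum

try:
    zlib.decompress(raw)              # fails: no header
except zlib.error:
    print(zlib.decompress(raw, -15))  # b'<feed/>'
```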
@@ -3868,25 +3931,22 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
     if data is None:
         return result
 
-    # there are four encodings to keep track of:
-    # - http_encoding is the encoding declared in the Content-Type HTTP header
-    # - xml_encoding is the encoding declared in the <?xml declaration
-    # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
-    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
-    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
-        _getCharacterEncoding(http_headers, data)
-    if http_headers and (not acceptable_content_type):
-        if 'content-type' in http_headers:
-            bozo_message = '%s is not an XML media type' % http_headers['content-type']
-        else:
-            bozo_message = 'no Content-type specified'
+    # Stop processing if the server sent HTTP 304 Not Modified.
+    if getattr(f, 'code', 0) == 304:
+        result['version'] = u''
+        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
+            'so the server sent no data. This is a feature, not a bug!'
+        return result
+
+    data, result['encoding'], error = convert_to_utf8(http_headers, data)
+    use_strict_parser = result['encoding'] and True or False
+    if error is not None:
         result['bozo'] = 1
-        result['bozo_exception'] = NonXMLContentType(bozo_message)
+        result['bozo_exception'] = error
 
-    if data is not None:
-        result['version'], data, entities = _stripDoctype(data)
+    result['version'], data, entities = replace_doctype(data)
 
-    # ensure that baseuri is an absolute uri using an acceptable URI scheme
+    # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
     contentloc = http_headers.get('content-location', u'')
     href = result.get('href', u'')
     baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href
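The encoding bookkeeping that used to live inline in parse() (and below, in the hunk that follows) is now a single call: convert_to_utf8() hands back the UTF-8 bytes, the chosen encoding, and at most one error, and parse() just copies them into the result. Checking the bozo flag afterward, assuming 5.1.2:

```python
import feedparser

result = feedparser.parse('<feed xmlns="http://www.w3.org/2005/Atom">'
                          '<title>ok</title></feed>')
print(result.bozo)       # 0 for well-formed input
print(result.encoding)   # u'utf-8'
```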
@@ -3895,91 +3955,6 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
     if not isinstance(baselang, unicode) and baselang is not None:
         baselang = baselang.decode('utf-8', 'ignore')
 
-    # if server sent 304, we're done
-    if getattr(f, 'code', 0) == 304:
-        result['version'] = u''
-        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
-            'so the server sent no data. This is a feature, not a bug!'
-        return result
-
-    # if there was a problem downloading, we're done
-    if data is None:
-        return result
-
-    # determine character encoding
-    use_strict_parser = 0
-    known_encoding = 0
-    tried_encodings = []
-    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
-    for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
-        if not proposed_encoding:
-            continue
-        if proposed_encoding in tried_encodings:
-            continue
-        tried_encodings.append(proposed_encoding)
-        try:
-            data = _toUTF8(data, proposed_encoding)
-        except (UnicodeDecodeError, LookupError):
-            pass
-        else:
-            known_encoding = use_strict_parser = 1
-            break
-    # if no luck and we have auto-detection library, try that
-    if (not known_encoding) and chardet:
-        proposed_encoding = unicode(chardet.detect(data)['encoding'], 'ascii', 'ignore')
-        if proposed_encoding and (proposed_encoding not in tried_encodings):
-            tried_encodings.append(proposed_encoding)
-            try:
-                data = _toUTF8(data, proposed_encoding)
-            except (UnicodeDecodeError, LookupError):
-                pass
-            else:
-                known_encoding = use_strict_parser = 1
-    # if still no luck and we haven't tried utf-8 yet, try that
-    if (not known_encoding) and (u'utf-8' not in tried_encodings):
-        proposed_encoding = u'utf-8'
-        tried_encodings.append(proposed_encoding)
-        try:
-            data = _toUTF8(data, proposed_encoding)
-        except UnicodeDecodeError:
-            pass
-        else:
-            known_encoding = use_strict_parser = 1
-    # if still no luck and we haven't tried windows-1252 yet, try that
-    if (not known_encoding) and (u'windows-1252' not in tried_encodings):
-        proposed_encoding = u'windows-1252'
-        tried_encodings.append(proposed_encoding)
-        try:
-            data = _toUTF8(data, proposed_encoding)
-        except UnicodeDecodeError:
-            pass
-        else:
-            known_encoding = use_strict_parser = 1
-    # if still no luck and we haven't tried iso-8859-2 yet, try that.
-    if (not known_encoding) and (u'iso-8859-2' not in tried_encodings):
-        proposed_encoding = u'iso-8859-2'
-        tried_encodings.append(proposed_encoding)
-        try:
-            data = _toUTF8(data, proposed_encoding)
-        except UnicodeDecodeError:
-            pass
-        else:
-            known_encoding = use_strict_parser = 1
-    # if still no luck, give up
-    if not known_encoding:
-        result['bozo'] = 1
-        result['bozo_exception'] = CharacterEncodingUnknown( \
-            'document encoding unknown, I tried ' + \
-            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \
-            (result['encoding'], xml_encoding))
-        result['encoding'] = u''
-    elif proposed_encoding != result['encoding']:
-        result['bozo'] = 1
-        result['bozo_exception'] = CharacterEncodingOverride( \
-            'document declared as %s, but parsed as %s' % \
-            (result['encoding'], proposed_encoding))
-        result['encoding'] = proposed_encoding
 
     if not _XML_AVAILABLE:
         use_strict_parser = 0
     if use_strict_parser:
@@ -3998,7 +3973,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
         source.setByteStream(_StringIO(data))
         try:
             saxparser.parse(source)
-        except xml.sax.SAXParseException, e:
+        except xml.sax.SAXException, e:
             result['bozo'] = 1
             result['bozo_exception'] = feedparser.exc or e
             use_strict_parser = 0
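SAXParseException is a subclass of SAXException, so the new except clause is strictly broader: it also catches errors raised inside the handlers (wrapped by the SAX machinery), not just well-formedness failures, and lets parse() fall back to the loose parser in either case. The hierarchy:

```python
import xml.sax

# SAXParseException (well-formedness errors, with line/column info)
# subclasses SAXException (anything a handler or the parser raises).
print(issubclass(xml.sax.SAXParseException, xml.sax.SAXException))  # True
```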
@@ -4009,4 +3984,4 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
     result['entries'] = feedparser.entries
     result['version'] = result['version'] or feedparser.version
     result['namespaces'] = feedparser.namespacesInUse
     return result