NewsBlur/vendor/feedvalidator/uri.py

234 lines
4.7 KiB
Python
Raw Normal View History

"""$Id: uri.py 511 2006-03-07 05:19:10Z rubys $"""
# This is working code, with tests, but not yet integrated into validation.
# (Change unique in validators.py to use Uri(self.value), rather than the
# plain value.)
# Ideally, this would be part of the core Python classes.
# It's probably not ready for deployment, but having it here helps establish
# the test case as a repository for any pathological cases that people
# suggest.
from urlparse import urljoin
from urllib import quote, quote_plus, unquote, unquote_plus
from unicodedata import normalize
from codecs import lookup
import re
# UTF-8 encode/decode callables; each returns an (output, length) pair.
enc, dec = lookup('UTF-8')[:2]

# Character classes used when deciding which octets may stay unescaped.
# The sets mirror the gen-delims / sub-delims / pchar productions of the
# generic URI syntax (unreserved punctuation is '-._~').
GENDELIMS = ':/?#[]@'
SUBDELIMS = "!$&'()*+,;="
RESERVED = GENDELIMS + SUBDELIMS
PCHAR = '-._~' + SUBDELIMS + ':@'
# Well-known port for each scheme; an explicit port equal to the scheme's
# default is dropped during authority normalisation.
default_port = {
    'ftp': 21,
    'telnet': 23,
    'http': 80,
    'gopher': 70,
    'news': 119,
    'nntp': 119,
    'prospero': 191,
    'https': 443,
    'snews': 563,
    'snntp': 563,
}
class BadUri(Exception):
    """Raised when a string cannot be parsed as a URI reference."""
def _n(s):
    """Return the UTF-8 octets of *s* after NFC normalisation.

    *s* is decoded as UTF-8, normalised to Unicode form NFC and encoded
    back to UTF-8.  Invalid UTF-8 input makes the decode step raise a
    UnicodeError.
    """
    text = dec(s)[0]
    composed = normalize('NFC', text)
    return enc(composed)[0]
# One URI octet: either a literal character or a %XX escape.
octetRe = re.compile('([^%]|%[a-fA-F0-9]{2})')


def asOctets(s):
    """Iterate over *s* one octet at a time.

    Yields ``(text, octet)`` pairs.  For a literal character both items
    are that character.  For a ``%XX`` escape, ``text`` is the escape
    with its hex digits upper-cased and ``octet`` is the decoded
    character.

    Raises BadUri when a ``%`` is not followed by two hex digits.
    """
    if not s:
        return

    # Match at an advancing offset rather than re-slicing the remaining
    # string after every octet, which copied the tail each time and made
    # the scan quadratic in the length of the input.
    pos = 0
    end = len(s)
    while pos < end:
        m = octetRe.match(s, pos)
        if not m:
            raise BadUri()

        c = m.group(1)
        if c[0] == '%':
            yield (c.upper(), chr(int(c[1:], 0x10)))
        else:
            yield (c, c)

        pos = m.end(1)
def _qnu(s, safe=''):
    """Return *s* with its percent-encoding put into a canonical form.

    Every octet of *s* is decoded, runs of octets are NFC-normalised and
    then re-quoted, leaving only the characters in *safe* unescaped.
    Reserved characters that are also in *safe* are emitted exactly as
    they appeared (literal, or as an upper-cased ``%XX`` escape), so an
    escaped delimiter is never confused with a literal one.

    Returns None when *s* is None.  Raises BadUri for a malformed escape
    and a UnicodeError for non-ASCII text or invalid UTF-8 octets.
    """
    if s is None:
        return None
    # unquote{,_plus} leave high-bit octets unconverted in Unicode strings
    # This conversion will, correctly, cause UnicodeEncodeError if there are
    # non-ASCII characters present in the string
    s = str(s)

    out = []
    pending = []
    for (c, x) in asOctets(s):
        if x in RESERVED and x in safe:
            # Flush what has been collected, then keep the delimiter in
            # the form the author wrote it.
            out.append(quote(_n(''.join(pending)), safe))
            pending = []
            out.append(c)
        else:
            pending.append(x)

    # The pending octets are already decoded by asOctets().  They must not
    # be unquoted a second time: doing so turned an escaped percent sign
    # followed by hex digits (e.g. "%2541") into a different character.
    out.append(quote(_n(''.join(pending)), safe))

    return ''.join(out)
def _normPort(netloc,defPort):
nl = netloc.lower()
p = defPort
i = nl.find(':')
if i >= 0:
ps = nl[i + 1:]
if ps:
if not(ps.isdigit()):
return netloc
p = int(ps)
nl = nl[:i]
if nl and nl[-1] == '.' and nl.rfind('.', 0, -2) >= 0:
nl = nl[:-1]
if p != defPort:
nl = nl + ':' + str(p)
return nl
def _normAuth(auth, port):
    """Normalise an authority of the form ``[userinfo@]host[:port]``.

    The host and port are normalised by _normPort() using *port* as the
    scheme's default.  The userinfo is kept verbatim, except that an
    empty userinfo or a bare ``:`` is dropped together with the ``@``.
    """
    at = auth.rfind('@')
    if at < 0:
        # No userinfo at all.
        return _normPort(auth, port)

    userinfo = auth[:at]
    hostport = _normPort(auth[at + 1:], port)
    if userinfo and userinfo != ':':
        return userinfo + '@' + hostport
    return hostport
def _normPath(p):
    """Remove dot segments from *p* and canonicalise each segment's escapes.

    ``.`` and ``..`` segments are resolved for paths that start with
    ``/``; a path whose first segment is non-empty is left structurally
    untouched.  An empty path becomes ``/``.  Each segment is then passed
    through _qnu() with PCHAR as the safe set.
    """
    segs = p.split(u'/')

    pos = 0
    if segs and segs[0]:
        # Path does not start with '/': skip dot-segment removal entirely.
        pos = len(segs)

    while pos < len(segs):
        seg = segs[pos]
        if seg != '.' and seg != '..':
            pos += 1
            continue

        # Drop the dot segment itself; when it is the final segment keep
        # an empty one in its place so the trailing slash survives.
        if pos < len(segs) - 1:
            del segs[pos]
        else:
            segs[pos] = ''

        # '..' additionally removes the segment before it, but never the
        # empty segment that stands for the leading slash.
        if seg == '..' and (pos > 1 or (pos > 0 and segs[0])):
            pos -= 1
            del segs[pos]

    if segs == ['']:
        segs = ['', '']
    return u'/'.join([_qnu(seg, PCHAR) for seg in segs])
# From RFC 2396bis, with added end-of-string marker
# Groups: 2 = scheme, 4 = authority, 5 = path, 7 = query, 9 = fragment.
# Written as a raw string so that "\?" reaches the regex engine without
# being an invalid string escape; the compiled pattern is unchanged.
uriRe = re.compile(r'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?$')
def _canonical(s):
    """Return the canonical form of the URI reference *s*.

    The scheme is lower-cased, the authority is normalised against the
    scheme's default port, dot segments are removed from the path and
    percent-encoding is made uniform in path, query and fragment.

    Returns None for a reference that has neither a scheme nor an
    authority and whose path does not start with ``/``; such relative
    references are not normalised.  Raises BadUri when *s* does not
    match the URI grammar.
    """
    match = uriRe.match(s)
    if match is None:
        raise BadUri()

    rawScheme, rawAuth, rawPath = match.group(2, 4, 5)

    # Check for a relative URI
    scheme = None
    if rawScheme is not None:
        scheme = rawScheme.lower()

    if rawAuth is not None:
        # dict.get() yields None for a scheme without a known default port.
        authority = _normAuth(rawAuth, default_port.get(scheme))
        path = _normPath(rawPath)
    else:
        authority = None

        # Don't try to normalise URI references with relative paths
        if scheme is None and not rawPath.startswith('/'):
            return None

        if scheme == 'mailto':
            # XXX From RFC 2368, mailto equivalence needs to be subtler than this
            path = rawPath
            at = rawPath.find('@')
            if at > 0:
                tail = rawPath.find('?')
                if tail < 0:
                    tail = len(rawPath)
                local = _qnu(rawPath[:at])
                domain = _qnu(rawPath[at + 1:].lower())
                path = local + '@' + domain + _qnu(rawPath[tail:])
        elif scheme is None or rawPath.startswith('/'):
            path = _normPath(rawPath)
        else:
            path = _qnu(rawPath, PCHAR + '/')

    query = _qnu(match.group(7), PCHAR + "/?")
    fragment = _qnu(match.group(9), PCHAR + "/?")

    # Reassemble only the components that were present in the input.
    pieces = []
    if scheme is not None:
        pieces.append(scheme + ':')
    if authority is not None:
        pieces.append('//' + authority)
    pieces.append(path)
    if query is not None:
        pieces.append('?' + query)
    if fragment is not None:
        pieces.append('#' + fragment)
    return u''.join(pieces)
class Uri:
    """A Uri wraps a string and performs equality testing according to the
    rules for URI equivalence.

    ``s`` holds the original string and ``n`` its canonical form, or None
    when the reference could not be normalised (a relative path).
    Constructing a Uri raises whatever _canonical() raises for malformed
    input.
    """

    def __init__(self, s):
        self.s = s
        self.n = _canonical(s)

    def _key(self):
        # References without a canonical form fall back to their literal
        # text; otherwise every such reference would compare equal to
        # every other one (None == None).  The flag keeps the two kinds
        # of key from ever colliding.
        if self.n is None:
            return (False, self.s)
        return (True, self.n)

    def __str__(self):
        return self.s

    def __repr__(self):
        return repr(self.s)

    def __eq__(self, a):
        if not isinstance(a, Uri):
            return NotImplemented
        return self._key() == a._key()

    def __ne__(self, a):
        # Python 2 does not derive != from ==, so define it explicitly.
        result = self.__eq__(a)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Equal Uris must hash alike so they can be used in sets and as
        # dictionary keys.
        return hash(self._key())
def canonicalForm(u):
    """Give the canonical form for a URI, so char-by-char comparisons become valid tests for equivalence.

    Returns None instead of raising when *u* is malformed (BadUri) or
    contains text that cannot be encoded or decoded (UnicodeError).
    """
    try:
        canonical = _canonical(u)
    except (BadUri, UnicodeError):
        canonical = None
    return canonical