NewsBlur/vendor/feedvalidator/uri.py

"""$Id: uri.py 511 2006-03-07 05:19:10Z rubys $"""

# This is working code, with tests, but not yet integrated into validation.
# (Change unique in validators.py to use Uri(self.value), rather than the
#  plain value.)
# Ideally, this would be part of the core Python classes.
# It's probably not ready for deployment, but having it here helps establish
#  the test case as a repository for any pathological cases that people
#  suggest.

from urlparse import urljoin
from urllib import quote, quote_plus, unquote, unquote_plus

from unicodedata import normalize
from codecs import lookup

import re

(enc, dec) = lookup('UTF-8')[:2]

SUBDELIMS='!$&\'()*+,;='
PCHAR='-._~' + SUBDELIMS + ':@'
GENDELIMS=':/?#[]@'
RESERVED=GENDELIMS + SUBDELIMS

default_port = {
  'ftp': 21,
  'telnet': 23,
  'http': 80,
  'gopher': 70,
  'news': 119,
  'nntp': 119,
  'prospero': 191,
  'https': 443,
  'snews': 563,
  'snntp': 563,
}

class BadUri(Exception):
  pass

def _n(s):
  return enc(normalize('NFC', dec(s)[0]))[0]

octetRe = re.compile('([^%]|%[a-fA-F0-9]{2})')

def asOctets(s):
  while (s):
    m = octetRe.match(s)

    if not(m):
      raise BadUri()

    c = m.group(1)
    if (c[0] == '%'):
      yield(c.upper(), chr(int(c[1:], 0x10)))
    else:
      yield(c, c)

    s = s[m.end(1):]
  
def _qnu(s,safe=''):
  if s == None:
    return None
  # unquote{,_plus} leave high-bit octets unconverted in Unicode strings
  # This conversion will, correctly, cause UnicodeEncodeError if there are
  #  non-ASCII characters present in the string
  s = str(s)

  res = ''
  b = ''
  for (c,x) in asOctets(s):
    if x in RESERVED and x in safe:
      res += quote(_n(unquote(b)), safe)
      b = ''
      res += c
    else:
      b += x
    
  res += quote(_n(unquote(b)), safe)

  return res

def _normPort(netloc,defPort):
  nl = netloc.lower()
  p = defPort
  i = nl.find(':')
  if i >= 0:
    ps = nl[i + 1:]
    if ps:
      if not(ps.isdigit()):
        return netloc
      p = int(ps)
    nl = nl[:i]

  if nl and nl[-1] == '.' and nl.rfind('.', 0, -2) >= 0:
    nl = nl[:-1]

  if p != defPort:
    nl = nl + ':' + str(p)
  return nl

def _normAuth(auth,port):
  i = auth.rfind('@')
  if i >= 0:
    c = auth[:i]
    if c == ':':
      c = ''
    h = auth[i + 1:]
  else:
    c = None
    h = auth

  if c:
    return c + '@' + _normPort(h,port)
  else:
    return _normPort(h,port)

def _normPath(p):
  l = p.split(u'/')
  i = 0
  if l and l[0]:
    i = len(l)
  while i < len(l):
    c = l[i]
    if (c == '.'):
      if i < len(l) - 1:
        del l[i]
      else:
        l[i] = ''
    elif (c == '..'):
      if i < len(l) - 1:
        del l[i]
      else:
        l[i] = ''
      if i > 1 or (i > 0 and l[0]):
        i -= 1
        del l[i]
    else:
      i += 1
  if l == ['']:
    l = ['', '']
  return u'/'.join([_qnu(c, PCHAR) for c in l])

# From RFC 2396bis, with added end-of-string marker
uriRe = re.compile('^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?$')

def _canonical(s):
  m = uriRe.match(s)
  if not(m):
    raise BadUri()
  
  # Check for a relative URI
  if m.group(2) is None:
    scheme = None
  else:
    scheme = m.group(2).lower()

  if m.group(4) is None:
    authority = None

    p = m.group(5)

    # Don't try to normalise URI references with relative paths
    if scheme is None and not p.startswith('/'):
      return None

    if scheme == 'mailto':
      # XXX From RFC 2368, mailto equivalence needs to be subtler than this
      i = p.find('@')
      if i > 0:
        j = p.find('?')
        if j < 0:
          j = len(p)
        p = _qnu(p[:i]) + '@' + _qnu(p[i + 1:].lower()) + _qnu(p[j:])
      path = p
    else:
      if scheme is None or p.startswith('/'):
        path = _normPath(p)
      else:
        path = _qnu(p, PCHAR + '/')
  else:
    a = m.group(4)
    p = m.group(5)
    if scheme in default_port:
      a = _normAuth(a, default_port[scheme])
    else:
      a = _normAuth(a, None)

    authority = a
    path = _normPath(p)

  query = _qnu(m.group(7), PCHAR + "/?")
  fragment = _qnu(m.group(9), PCHAR + "/?")

  s = u''
  if scheme != None:
    s += scheme + ':'

  if authority != None:
    s += '//' + authority

  s += path
  if query != None:
    s += '?' + query
  if fragment != None:
    s += '#' + fragment
  return s

class Uri:
  """A Uri wraps a string and performs equality testing according to the
   rules for URI equivalence. """
  def __init__(self,s):
    self.s = s
    self.n = _canonical(s)

  def __str__(self):
    return self.s

  def __repr__(self):
    return repr(self.s)

  def __eq__(self, a):
    return self.n == a.n

def canonicalForm(u):
  """Give the canonical form for a URI, so char-by-char comparisons become valid tests for equivalence."""
  try:
    return _canonical(u)
  except BadUri:
    return None
  except UnicodeError:
    return None
Adding feedvalidator as a vendorized package, for some day. 2014-12-17 18:24:58 -08:00			`"""$Id: uri.py 511 2006-03-07 05:19:10Z rubys $"""`

			`# This is working code, with tests, but not yet integrated into validation.`
			`# (Change unique in validators.py to use Uri(self.value), rather than the`
			`# plain value.)`
			`# Ideally, this would be part of the core Python classes.`
			`# It's probably not ready for deployment, but having it here helps establish`
			`# the test case as a repository for any pathological cases that people`
			`# suggest.`

			`from urlparse import urljoin`
			`from urllib import quote, quote_plus, unquote, unquote_plus`

			`from unicodedata import normalize`
			`from codecs import lookup`

			`import re`

			`(enc, dec) = lookup('UTF-8')[:2]`

			`SUBDELIMS='!$&\'()*+,;='`
			`PCHAR='-._~' + SUBDELIMS + ':@'`
			`GENDELIMS=':/?#[]@'`
			`RESERVED=GENDELIMS + SUBDELIMS`

			`default_port = {`
			`'ftp': 21,`
			`'telnet': 23,`
			`'http': 80,`
			`'gopher': 70,`
			`'news': 119,`
			`'nntp': 119,`
			`'prospero': 191,`
			`'https': 443,`
			`'snews': 563,`
			`'snntp': 563,`
			`}`

			`class BadUri(Exception):`
			`pass`

			`def _n(s):`
			`return enc(normalize('NFC', dec(s)[0]))[0]`

			`octetRe = re.compile('([^%]\|%[a-fA-F0-9]{2})')`

			`def asOctets(s):`
			`while (s):`
			`m = octetRe.match(s)`

			`if not(m):`
			`raise BadUri()`

			`c = m.group(1)`
			`if (c[0] == '%'):`
			`yield(c.upper(), chr(int(c[1:], 0x10)))`
			`else:`
			`yield(c, c)`

			`s = s[m.end(1):]`

			`def _qnu(s,safe=''):`
			`if s == None:`
			`return None`
			`# unquote{,_plus} leave high-bit octets unconverted in Unicode strings`
			`# This conversion will, correctly, cause UnicodeEncodeError if there are`
			`# non-ASCII characters present in the string`
			`s = str(s)`

			`res = ''`
			`b = ''`
			`for (c,x) in asOctets(s):`
			`if x in RESERVED and x in safe:`
			`res += quote(_n(unquote(b)), safe)`
			`b = ''`
			`res += c`
			`else:`
			`b += x`

			`res += quote(_n(unquote(b)), safe)`

			`return res`

			`def _normPort(netloc,defPort):`
			`nl = netloc.lower()`
			`p = defPort`
			`i = nl.find(':')`
			`if i >= 0:`
			`ps = nl[i + 1:]`
			`if ps:`
			`if not(ps.isdigit()):`
			`return netloc`
			`p = int(ps)`
			`nl = nl[:i]`

			`if nl and nl[-1] == '.' and nl.rfind('.', 0, -2) >= 0:`
			`nl = nl[:-1]`

			`if p != defPort:`
			`nl = nl + ':' + str(p)`
			`return nl`

			`def _normAuth(auth,port):`
			`i = auth.rfind('@')`
			`if i >= 0:`
			`c = auth[:i]`
			`if c == ':':`
			`c = ''`
			`h = auth[i + 1:]`
			`else:`
			`c = None`
			`h = auth`

			`if c:`
			`return c + '@' + _normPort(h,port)`
			`else:`
			`return _normPort(h,port)`

			`def _normPath(p):`
			`l = p.split(u'/')`
			`i = 0`
			`if l and l[0]:`
			`i = len(l)`
			`while i < len(l):`
			`c = l[i]`
			`if (c == '.'):`
			`if i < len(l) - 1:`
			`del l[i]`
			`else:`
			`l[i] = ''`
			`elif (c == '..'):`
			`if i < len(l) - 1:`
			`del l[i]`
			`else:`
			`l[i] = ''`
			`if i > 1 or (i > 0 and l[0]):`
			`i -= 1`
			`del l[i]`
			`else:`
			`i += 1`
			`if l == ['']:`
			`l = ['', '']`
			`return u'/'.join([_qnu(c, PCHAR) for c in l])`

			`# From RFC 2396bis, with added end-of-string marker`
			`uriRe = re.compile('^(([^:/?#]+):)?(//([^/?#]))?([^?#])(\?([^#]))?(#(.))?$')`

			`def _canonical(s):`
			`m = uriRe.match(s)`
			`if not(m):`
			`raise BadUri()`

			`# Check for a relative URI`
			`if m.group(2) is None:`
			`scheme = None`
			`else:`
			`scheme = m.group(2).lower()`

			`if m.group(4) is None:`
			`authority = None`

			`p = m.group(5)`

			`# Don't try to normalise URI references with relative paths`
			`if scheme is None and not p.startswith('/'):`
			`return None`

			`if scheme == 'mailto':`
			`# XXX From RFC 2368, mailto equivalence needs to be subtler than this`
			`i = p.find('@')`
			`if i > 0:`
			`j = p.find('?')`
			`if j < 0:`
			`j = len(p)`
			`p = _qnu(p[:i]) + '@' + _qnu(p[i + 1:].lower()) + _qnu(p[j:])`
			`path = p`
			`else:`
			`if scheme is None or p.startswith('/'):`
			`path = _normPath(p)`
			`else:`
			`path = _qnu(p, PCHAR + '/')`
			`else:`
			`a = m.group(4)`
			`p = m.group(5)`
			`if scheme in default_port:`
			`a = _normAuth(a, default_port[scheme])`
			`else:`
			`a = _normAuth(a, None)`

			`authority = a`
			`path = _normPath(p)`

			`query = _qnu(m.group(7), PCHAR + "/?")`
			`fragment = _qnu(m.group(9), PCHAR + "/?")`

			`s = u''`
			`if scheme != None:`
			`s += scheme + ':'`

			`if authority != None:`
			`s += '//' + authority`

			`s += path`
			`if query != None:`
			`s += '?' + query`
			`if fragment != None:`
			`s += '#' + fragment`
			`return s`

			`class Uri:`
			`"""A Uri wraps a string and performs equality testing according to the`
			`rules for URI equivalence. """`
			`def __init__(self,s):`
			`self.s = s`
			`self.n = _canonical(s)`

			`def __str__(self):`
			`return self.s`

			`def __repr__(self):`
			`return repr(self.s)`

			`def __eq__(self, a):`
			`return self.n == a.n`

			`def canonicalForm(u):`
			`"""Give the canonical form for a URI, so char-by-char comparisons become valid tests for equivalence."""`
			`try:`
			`return _canonical(u)`
			`except BadUri:`
			`return None`
			`except UnicodeError:`
			`return None`