#!/usr/bin/env python
#
# Copyright 2009 Facebook
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""Escaping/unescaping methods for HTML, JSON, URLs, and others.

Also includes a few other miscellaneous string manipulation functions that
have crept in over time.
"""

import html.entities
import re
import sys
import urllib.parse
from urllib.parse import parse_qs

# json module is in the standard library as of python 2.6; fall back to
# simplejson if present for older versions.
try:
    import json

    assert hasattr(json, "loads") and hasattr(json, "dumps")
    _json_decode = json.loads
    _json_encode = json.dumps
except Exception:
    try:
        import simplejson

        _json_decode = lambda s: simplejson.loads(_unicode(s))
        _json_encode = lambda v: simplejson.dumps(v)
    except ImportError:
        try:
            # For Google AppEngine
            from django.utils import simplejson

            _json_decode = lambda s: simplejson.loads(_unicode(s))
            _json_encode = lambda v: simplejson.dumps(v)
        except ImportError:

            def _json_decode(s):
                raise NotImplementedError(
                    "A JSON parser is required, e.g., simplejson at "
                    "http://pypi.python.org/pypi/simplejson/"
                )

            _json_encode = _json_decode


_XHTML_ESCAPE_RE = re.compile('[&<>"]')
_XHTML_ESCAPE_DICT = {"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;"}


def xhtml_escape(value):
    """Escapes a string so it is valid within XML or XHTML."""
    return _XHTML_ESCAPE_RE.sub(lambda match: _XHTML_ESCAPE_DICT[match.group(0)], to_basestring(value))


def xhtml_unescape(value):
    """Un-escapes an XML-escaped string."""
    return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value))

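
# Illustrative round-trip (a sketch added for this edition, not part of the
# original module), assuming the entity mappings defined above:
#
#     >>> xhtml_escape('<a href="x">&</a>')
#     '&lt;a href=&quot;x&quot;&gt;&amp;&lt;/a&gt;'
#     >>> xhtml_unescape('&lt;b&gt;caf&eacute;&lt;/b&gt;')
#     '<b>café</b>'
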
def json_encode(value):
    """JSON-encodes the given Python object."""
    # JSON permits but does not require forward slashes to be escaped.
    # This is useful when json data is emitted in a <script> tag
    # in HTML, as it prevents </script> tags from prematurely terminating
    # the javascript.  Some json libraries do this escaping by default,
    # although python's standard library does not, so we do it here.
    # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
    return _json_encode(recursive_unicode(value)).replace("</", "<\\/")


def json_decode(value):
    """Returns Python objects for the given JSON string."""
    return _json_decode(to_basestring(value))

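
# Sketch of the </ escaping described above (added for illustration, not from
# the original file):
#
#     >>> json_encode({"html": "</script>"})
#     '{"html": "<\\/script>"}'
#     >>> json_decode('{"a": [1, 2]}')
#     {'a': [1, 2]}
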
def squeeze(value):
    """Replace all sequences of whitespace chars with a single space."""
    return re.sub(r"[\x00-\x20]+", " ", value).strip()

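
# For example (illustrative, not part of the original file):
#
#     >>> squeeze("  hello \t\n  world ")
#     'hello world'
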
def url_escape(value):
    """Returns a valid URL-encoded version of the given value."""
    return urllib.parse.quote_plus(utf8(value))

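
# For example (illustrative; url_unescape below reverses it):
#
#     >>> url_escape("a b&c")
#     'a+b%26c'
#     >>> url_unescape("a+b%26c")
#     'a b&c'
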
# python 3 changed things around enough that we need two separate
# implementations of url_unescape.  We also need our own implementation
# of parse_qs since python 3's version insists on decoding everything.
# (Note: the imports at the top of this module are python 3 only, so the
# python 2 branch below is vestigial and never taken.)
if sys.version_info[0] < 3:

    def url_unescape(value, encoding="utf-8"):
        """Decodes the given value from a URL.

        The argument may be either a byte or unicode string.

        If encoding is None, the result will be a byte string.  Otherwise,
        the result is a unicode string in the specified encoding.
        """
        if encoding is None:
            return urllib.parse.unquote_plus(utf8(value))
        else:
            return str(urllib.parse.unquote_plus(utf8(value)), encoding)

    parse_qs_bytes = parse_qs
else:

    def url_unescape(value, encoding="utf-8"):
        """Decodes the given value from a URL.

        The argument may be either a byte or unicode string.

        If encoding is None, the result will be a byte string.  Otherwise,
        the result is a unicode string in the specified encoding.
        """
        if encoding is None:
            return urllib.parse.unquote_to_bytes(value)
        else:
            return urllib.parse.unquote_plus(to_basestring(value), encoding=encoding)

    def parse_qs_bytes(qs, keep_blank_values=False, strict_parsing=False):
        """Parses a query string like urlparse.parse_qs, but returns the
        values as byte strings.

        Keys still become type str (interpreted as latin1 in python3!)
        because it's too painful to keep them as byte strings in
        python3 and in practice they're nearly always ascii anyway.
        """
        # This is gross, but python3 doesn't give us another way.
        # Latin1 is the universal donor of character encodings.
        result = parse_qs(qs, keep_blank_values, strict_parsing, encoding="latin1", errors="strict")
        encoded = {}
        for k, v in result.items():
            encoded[k] = [i.encode("latin1") for i in v]
        return encoded

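
# Sketch of the latin1 round-trip described above (illustrative, not from the
# original file): keys come back as str, values as undecoded bytes.
#
#     >>> parse_qs_bytes("a=1&a=2&b=%FF")
#     {'a': [b'1', b'2'], 'b': [b'\xff']}
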
_UTF8_TYPES = (bytes, type(None))


def utf8(value):
    """Converts a string argument to a byte string.

    If the argument is already a byte string or None, it is returned unchanged.
    Otherwise it must be a unicode string and is encoded as utf8.
    """
    if isinstance(value, _UTF8_TYPES):
        return value
    assert isinstance(value, str)
    return value.encode("utf-8")


_TO_UNICODE_TYPES = (str, type(None))


def to_unicode(value):
    """Converts a string argument to a unicode string.

    If the argument is already a unicode string or None, it is returned
    unchanged.  Otherwise it must be a byte string and is decoded as utf8.
    """
    if isinstance(value, _TO_UNICODE_TYPES):
        return value
    assert isinstance(value, bytes)
    return value.decode("utf-8")

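
# Both helpers pass their own type (and None) through unchanged
# (illustrative, not part of the original file):
#
#     >>> utf8("caf\u00e9")
#     b'caf\xc3\xa9'
#     >>> to_unicode(b'caf\xc3\xa9')
#     'café'
#     >>> utf8(None) is None
#     True
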
# to_unicode was previously named _unicode not because it was private,
# but to avoid conflicts with the built-in unicode() function/type
_unicode = to_unicode

# When dealing with the standard library across python 2 and 3 it is
# sometimes useful to have a direct conversion to the native string type.
# On python 3 the native string type is unicode; the original python 2
# code compared str against unicode here, which the 2to3 conversion
# turned into the tautological ``if str is str:``.
native_str = to_unicode

_BASESTRING_TYPES = (str, type(None))


def to_basestring(value):
    """Converts a string argument to a subclass of basestring.

    In python2, byte and unicode strings are mostly interchangeable,
    so functions that deal with a user-supplied argument in combination
    with ascii string constants can use either and should return the type
    the user supplied.  In python3, the two types are not interchangeable,
    so this method is needed to convert byte strings to unicode.
    """
    if isinstance(value, _BASESTRING_TYPES):
        return value
    assert isinstance(value, bytes)
    return value.decode("utf-8")

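
# For example (illustrative, not from the original file):
#
#     >>> to_basestring(b"abc")
#     'abc'
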
def recursive_unicode(obj):
    """Walks a simple data structure, converting byte strings to unicode.

    Supports lists, tuples, and dictionaries.
    """
    if isinstance(obj, dict):
        return dict((recursive_unicode(k), recursive_unicode(v)) for (k, v) in obj.items())
    elif isinstance(obj, list):
        return list(recursive_unicode(i) for i in obj)
    elif isinstance(obj, tuple):
        return tuple(recursive_unicode(i) for i in obj)
    elif isinstance(obj, bytes):
        return to_unicode(obj)
    else:
        return obj

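
# For example (illustrative, not from the original file); non-string leaves
# pass through untouched:
#
#     >>> recursive_unicode({b"k": [b"v", 1, (b"t",)]})
#     {'k': ['v', 1, ('t',)]}
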
# I originally used the regex from
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# but it gets all exponential on certain patterns (such as too many trailing
# dots), causing the regex matcher to never return.
# This regex should avoid those problems.
_URL_RE = re.compile(
    r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)"""
)


def linkify(
    text, shorten=False, extra_params="", require_protocol=False, permitted_protocols=["http", "https"]
):
    """Converts plain text into HTML with links.

    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``

    Parameters:

    shorten: Long urls will be shortened for display.

    extra_params: Extra text to include in the link tag, or a callable
        taking the link as an argument and returning the extra text
        e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
        or::

            def extra_params_cb(url):
                if url.startswith("http://example.com"):
                    return 'class="internal"'
                else:
                    return 'class="external" rel="nofollow"'
            linkify(text, extra_params=extra_params_cb)

    require_protocol: Only linkify urls which include a protocol. If this is
        False, urls such as www.facebook.com will also be linkified.

    permitted_protocols: List (or set) of protocols which should be linkified,
        e.g. linkify(text, permitted_protocols=["http", "ftp", "mailto"]).
        It is very unsafe to include protocols such as "javascript".
    """
    if extra_params and not callable(extra_params):
        extra_params = " " + extra_params.strip()

    def make_link(m):
        url = m.group(1)
        proto = m.group(2)
        if require_protocol and not proto:
            return url  # no protocol, no linkify

        if proto and proto not in permitted_protocols:
            return url  # bad protocol, no linkify

        href = m.group(1)
        if not proto:
            href = "http://" + href  # no proto specified, use http

        if callable(extra_params):
            params = " " + extra_params(href).strip()
        else:
            params = extra_params

        # clip long urls. max_len is just an approximation
        max_len = 30
        if shorten and len(url) > max_len:
            before_clip = url
            if proto:
                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
            else:
                proto_len = 0

            parts = url[proto_len:].split("/")
            if len(parts) > 1:
                # Grab the whole host part plus the first bit of the path
                # The path is usually not that interesting once shortened
                # (no more slug, etc), so it really just provides a little
                # extra indication of shortening.
                url = url[:proto_len] + parts[0] + "/" + parts[1][:8].split("?")[0].split(".")[0]

            if len(url) > max_len * 1.5:  # still too long
                url = url[:max_len]

            if url != before_clip:
                amp = url.rfind("&")
                # avoid splitting html char entities
                if amp > max_len - 5:
                    url = url[:amp]
                url += "..."

                if len(url) >= len(before_clip):
                    url = before_clip
                else:
                    # full url is visible on mouse-over (for those who don't
                    # have a status bar, such as Safari by default)
                    params += ' title="%s"' % href

        return '<a href="%s"%s>%s</a>' % (href, params, url)

    # First HTML-escape so that our strings are all safe.
    # The regex is modified to avoid character entities other than &amp; so
    # that we won't pick up &quot;, etc.
    text = _unicode(xhtml_escape(text))
    return _URL_RE.sub(make_link, text)

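
# Sketch of typical output (illustrative, not from the original file);
# protocols outside permitted_protocols are left unlinked:
#
#     >>> linkify("Go to www.example.com now")
#     'Go to <a href="http://www.example.com">www.example.com</a> now'
#     >>> linkify("see javascript://alert(1)")
#     'see javascript://alert(1)'
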
def _convert_entity(m):
    if m.group(1) == "#":
        try:
            return chr(int(m.group(2)))
        except ValueError:
            return "&#%s;" % m.group(2)
    try:
        return _HTML_UNICODE_MAP[m.group(2)]
    except KeyError:
        return "&%s;" % m.group(2)


def _build_unicode_map():
    unicode_map = {}
    for name, value in html.entities.name2codepoint.items():
        unicode_map[name] = chr(value)
    return unicode_map


_HTML_UNICODE_MAP = _build_unicode_map()