NewsBlur/utils/scrubber/__init__.py
"""
Whitelisting HTML sanitizer.
Copyright (c) 2009-2010 Lefora <samuel@lefora.com>
See LICENSE for license details.
"""
__author__ = "Samuel Stauffer <samuel@lefora.com>"
__version__ = "1.6.1"
__license__ = "BSD"
__all__ = ["Scrubber", "SelectiveScriptScrubber", "ScrubberWarning", "UnapprovedJavascript", "urlize"]
import re
import string
from itertools import chain
from urllib.parse import urljoin
from bs4 import BeautifulSoup, Comment
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
"""Converts any URLs in text into clickable links.
    If trim_url_limit is not None, the URLs in link text longer than this limit
    will be truncated to trim_url_limit-3 characters and appended with an ellipsis.
If nofollow is True, the URLs in link text will get a rel="nofollow"
attribute.
If autoescape is True, the link text and URLs will get autoescaped.
*Modified from Django*
"""
from urllib.parse import quote as urlquote
LEADING_PUNCTUATION = ["(", "<", "&lt;"]
TRAILING_PUNCTUATION = [".", ",", ")", ">", "\n", "&gt;"]
word_split_re = re.compile(r"([\s\xa0]+|&nbsp;)") # a0 == NBSP
punctuation_re = re.compile(
"^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$"
% (
"|".join([re.escape(x) for x in LEADING_PUNCTUATION]),
"|".join([re.escape(x) for x in TRAILING_PUNCTUATION]),
)
)
simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$")
# del x # Temporary variable
def escape(html):
return (
html.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
.replace('"', "&quot;")
.replace("'", "&#39;")
)
    trim_url = (
        lambda x, limit=trim_url_limit: x
        if limit is None or len(x) <= limit
        else "%s..." % x[: max(0, limit - 3)]
    )
words = word_split_re.split(text)
nofollow_attr = nofollow and ' rel="nofollow"' or ""
for i, word in enumerate(words):
match = None
if "." in word or "@" in word or ":" in word:
match = punctuation_re.match(word.replace("\u2019", "'"))
if match:
lead, middle, trail = match.groups()
middle = middle.encode("utf-8")
middle = middle.decode("utf-8") # Bytes to str
# Make URL we want to point to.
url = None
if middle.startswith("http://") or middle.startswith("https://"):
url = urlquote(middle, safe="%/&=:;#?+*")
elif middle.startswith("www.") or (
"@" not in middle
and middle
and middle[0] in string.ascii_letters + string.digits
and (middle.endswith(".org") or middle.endswith(".net") or middle.endswith(".com"))
):
url = urlquote("http://%s" % middle, safe="%/&=:;#?+*")
elif "@" in middle and not ":" in middle and simple_email_re.match(middle):
url = "mailto:%s" % middle
nofollow_attr = ""
# Make link.
if url:
trimmed = trim_url(middle)
if autoescape:
lead, trail = escape(lead), escape(trail)
url, trimmed = escape(url), escape(trimmed)
middle = '<a href="%s"%s>%s</a>' % (url, nofollow_attr, trimmed)
words[i] = "%s%s%s" % (lead, middle, trail)
elif autoescape:
words[i] = escape(word)
elif autoescape:
words[i] = escape(word)
return "".join(words)
class ScrubberWarning(object):
pass
class Scrubber(object):
allowed_tags = set(
(
"a",
"abbr",
"acronym",
"b",
"bdo",
"big",
"blockquote",
"br",
"center",
"cite",
"code",
"dd",
"del",
"dfn",
"div",
"dl",
"dt",
"em",
"embed",
"font",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"hr",
"i",
"img",
"ins",
"kbd",
"li",
"object",
"ol",
"param",
"pre",
"p",
"q",
"s",
"samp",
"small",
"span",
"strike",
"strong",
"sub",
"sup",
"table",
"tbody",
"td",
"th",
"thead",
"tr",
"tt",
"ul",
"u",
"var",
"wbr",
)
)
disallowed_tags_save_content = set(
(
"blink",
"body",
"html",
)
)
allowed_attributes = set(
(
"align",
"alt",
"border",
"cite",
"class",
"dir",
"height",
"href",
"src",
"style",
"title",
"type",
"width",
"face",
"size", # font tags
"flashvars", # Not sure about flashvars - if any harm can come from it
"classid", # FF needs the classid on object tags for flash
"name",
"value",
"quality",
"data",
"scale", # for flash embed param tags, could limit to just param if this is harmful
"salign",
"align",
"wmode",
)
) # Bad attributes: 'allowscriptaccess', 'xmlns', 'target'
normalized_tag_replacements = {"b": "strong", "i": "em"}
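    # Subclasses usually customize these class-level whitelists; a hypothetical
    # sketch (CommentScrubber is not part of this module):
    #   class CommentScrubber(Scrubber):
    #       allowed_tags = Scrubber.allowed_tags - set(("img", "object", "embed", "param"))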
def __init__(self, base_url=None, autolink=True, nofollow=True, remove_comments=True):
self.base_url = base_url
self.autolink = autolink and bool(urlize)
self.nofollow = nofollow
self.remove_comments = remove_comments
self.allowed_tags = self.__class__.allowed_tags.copy()
self.disallowed_tags_save_content = self.__class__.disallowed_tags_save_content.copy()
self.allowed_attributes = self.__class__.allowed_attributes.copy()
self.normalized_tag_replacements = self.__class__.normalized_tag_replacements.copy()
self.warnings = []
        # Find all _scrub_tag_<name> methods
self.tag_scrubbers = {}
for k in chain(*[cls.__dict__ for cls in self.__class__.__mro__]):
if k.startswith("_scrub_tag_"):
self.tag_scrubbers[k[11:]] = [getattr(self, k)]
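        # Subclasses can add per-tag handling by defining extra _scrub_tag_<tagname>
        # methods; the loop above discovers them through the MRO
        # (e.g. SelectiveScriptScrubber._scrub_tag_script below).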
def autolink_soup(self, soup):
"""Autolink urls in text nodes that aren't already linked (inside anchor tags)."""
def _autolink(node):
if isinstance(node, str):
text = node
text2 = urlize(text, nofollow=self.nofollow)
if text != text2:
                    node.replace_with(text2)
else:
if node.name == "a":
return
for child in node.contents:
_autolink(child)
_autolink(soup)
def strip_disallowed(self, soup):
"""Remove nodes and attributes from the soup that aren't specifically allowed."""
toremove = []
        for node in soup.descendants:
if self.remove_comments and isinstance(node, Comment):
toremove.append((False, node))
continue
if isinstance(node, str):
continue
# Remove disallowed tags
if node.name not in self.allowed_tags:
toremove.append((node.name in self.disallowed_tags_save_content, node))
continue
# Remove disallowed attributes
attrs = {}
if hasattr(node, "attrs") and isinstance(node.attrs, dict):
for k, v in list(node.attrs.items()):
if not v:
continue
if k.lower() not in self.allowed_attributes:
continue
# TODO: This probably needs to be more robust
if isinstance(v, str):
v2 = v.lower()
if any(x in v2 for x in ("javascript:", "vbscript:", "expression(")):
continue
attrs[k] = v
node.attrs = attrs
self._remove_nodes(toremove)
def normalize_html(self, soup):
"""Convert tags to a standard set. (e.g. convert 'b' tags to 'strong')"""
        for node in soup.find_all(list(self.normalized_tag_replacements.keys())):
node.name = self.normalized_tag_replacements[node.name]
# for node in soup.findAll('br', clear="all"):
# node.extract()
def _remove_nodes(self, nodes):
"""Remove a list of nodes from the soup."""
        for keep_contents, node in nodes:
            if keep_contents and node.contents:
idx = node.parent.contents.index(node)
for n in reversed(
list(node.contents)
): # Copy the contents list to avoid modifying while traversing
node.parent.insert(idx, n)
node.extract()
def _clean_path(self, node, attrname):
url = node.get(attrname)
if url and "://" not in url and not url.startswith("mailto:"):
if url[0] not in ("/", ".") and not self.base_url:
node[attrname] = "http://" + url
elif not url.startswith("http") and self.base_url:
node[attrname] = urljoin(self.base_url, url)
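        # Illustrative (made-up paths): src="images/x.png" with
        # base_url="http://example.com/post/" becomes
        # "http://example.com/post/images/x.png"; a schemeless "example.org/p"
        # with no base_url becomes "http://example.org/p".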
def _scrub_tag_a(self, a):
if self.nofollow:
a["rel"] = ["nofollow"]
if not a.get("class", None):
a["class"] = ["external"]
self._clean_path(a, "href")
def _scrub_tag_img(self, img):
try:
if img["src"].lower().startswith("chrome://"):
return True
except KeyError:
return True
# Make sure images always have an 'alt' attribute
img["alt"] = img.get("alt", "")
self._clean_path(img, "src")
def _scrub_tag_font(self, node):
attrs = {}
if hasattr(node, "attrs") and isinstance(node.attrs, dict):
for k, v in list(node.attrs.items()):
if k.lower() == "size" and v.startswith("+"):
# Remove "size=+0"
continue
attrs[k] = v
node.attrs = attrs
if len(node.attrs) == 0:
            # IE renders font tags with no attributes differently than other browsers, so remove them
return "keep_contents"
def _scrub_html_pre(self, html):
"""Process the html before sanitization"""
return html
def _scrub_html_post(self, html):
"""Process the html after sanitization"""
return html
def _scrub_soup(self, soup):
self.strip_disallowed(soup)
if self.autolink:
self.autolink_soup(soup)
toremove = []
for tag_name, scrubbers in list(self.tag_scrubbers.items()):
for node in soup.find_all(tag_name):
for scrub in scrubbers:
remove = scrub(node)
if remove:
# Remove the node from the tree
toremove.append((remove == "keep_contents", node))
break
self._remove_nodes(toremove)
self.normalize_html(soup)
def scrub(self, html):
"""Return a sanitized version of the given html."""
self.warnings = []
html = self._scrub_html_pre(html)
soup = BeautifulSoup(html, features="lxml")
self._scrub_soup(soup)
html = str(soup)
return self._scrub_html_post(html)
class UnapprovedJavascript(ScrubberWarning):
def __init__(self, src):
self.src = src
self.path = src[: src.rfind("/")]
class SelectiveScriptScrubber(Scrubber):
allowed_tags = Scrubber.allowed_tags | set(("script", "noscript", "iframe"))
allowed_attributes = Scrubber.allowed_attributes | set(("scrolling", "frameborder"))
def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
self.allowed_script_srcs = set(
(
"http://www.statcounter.com/counter/counter_xhtml.js",
# 'http://www.google-analytics.com/urchin.js',
"http://pub.mybloglog.com/",
"http://rpc.bloglines.com/blogroll",
"http://widget.blogrush.com/show.js",
"http://re.adroll.com/",
"http://widgetserver.com/",
"http://pagead2.googlesyndication.com/pagead/show_ads.js", # are there pageadX for all kinds of numbers?
)
)
self.allowed_script_line_res = set(
re.compile(text)
for text in (
r"^(var )?sc_project\=\d+;$",
r"^(var )?sc_invisible\=\d;$",
r"^(var )?sc_partition\=\d+;$",
r'^(var )?sc_security\="[A-Za-z0-9]+";$',
# """^_uacct \= "[^"]+";$""",
# """^urchinTracker\(\);$""",
r'^blogrush_feed = "[^"]+";$',
# """^!--$""",
# """^//-->$""",
)
)
self.allowed_iframe_srcs = set(
re.compile(text)
for text in (
r"^http://www\.google\.com/calendar/embed\?[\w&;=\%]+$", # Google Calendar
r"^https?://www\.youtube\.com/", # YouTube
r"^http://player\.vimeo\.com/", # Vimeo
)
)
def _scrub_tag_script(self, script):
src = script.get("src", None)
if src:
for asrc in self.allowed_script_srcs:
# TODO: It could be dangerous to only check "start" of string
# as there could be browser bugs using crafted urls
if src.startswith(asrc):
script.contents = []
break
else:
self.warnings.append(UnapprovedJavascript(src))
script.extract()
elif script.get("type", "") != "text/javascript":
script.extract()
else:
            for line in (script.string or "").splitlines():  # .string is None for empty script tags
line = line.strip()
if not line:
continue
line_match = any(line_re.match(line) for line_re in self.allowed_script_line_res)
if not line_match:
script.extract()
break
def _scrub_tag_iframe(self, iframe):
src = iframe.get("src", None)
if not src or not any(asrc.match(src) for asrc in self.allowed_iframe_srcs):
iframe.extract()
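

if __name__ == "__main__":
    # Minimal usage sketch (illustrative; the markup and URLs below are made up).
    # Scrubber strips disallowed tags/attributes and resolves relative paths;
    # SelectiveScriptScrubber additionally keeps only whitelisted iframes.
    basic = Scrubber(base_url="http://example.com/post/")
    print(basic.scrub('<p onclick="alert(1)">Hi <img src="logo.png"></p>'))
    selective = SelectiveScriptScrubber()
    print(
        selective.scrub(
            '<iframe src="https://www.youtube.com/embed/abc123"></iframe>'
            '<iframe src="http://evil.example/"></iframe>'
        )
    )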