# NewsBlur/utils/scrubber/__init__.py
"""
Whitelisting HTML sanitizer.
Copyright (c) 2009-2010 Lefora <samuel@lefora.com>
See LICENSE for license details.
"""
__author__ = "Samuel Stauffer <samuel@lefora.com>"
__version__ = "1.6.1"
__license__ = "BSD"
__all__ = ['Scrubber', 'SelectiveScriptScrubber', 'ScrubberWarning', 'UnapprovedJavascript', 'urlize']
import re, string
from urllib.parse import urljoin
from itertools import chain
from bs4 import BeautifulSoup, Comment
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
"""Converts any URLs in text into clickable links.
    If trim_url_limit is not None, URLs in the link text longer than this limit
    will be truncated to trim_url_limit-3 characters and appended with an ellipsis.
If nofollow is True, the URLs in link text will get a rel="nofollow"
attribute.
If autoescape is True, the link text and URLs will get autoescaped.
*Modified from Django*
"""
from urllib.parse import quote as urlquote
LEADING_PUNCTUATION = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']
word_split_re = re.compile(r'([\s\xa0]+|&nbsp;)') # a0 == NBSP
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
# del x # Temporary variable
def escape(html):
return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;')
    trim_url = lambda x, limit=trim_url_limit: x if (limit is None or len(x) <= limit) else '%s...' % x[:max(0, limit - 3)]
words = word_split_re.split(text)
    nofollow_attr = ' rel="nofollow"' if nofollow else ''
for i, word in enumerate(words):
match = None
if '.' in word or '@' in word or ':' in word:
match = punctuation_re.match(word.replace('\u2019', "'"))
if match:
lead, middle, trail = match.groups()
# Make URL we want to point to.
url = None
if middle.startswith('http://') or middle.startswith('https://'):
url = urlquote(middle, safe='%/&=:;#?+*')
elif middle.startswith('www.') or ('@' not in middle and \
middle and middle[0] in string.ascii_letters + string.digits and \
(middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
url = urlquote('http://%s' % middle, safe='%/&=:;#?+*')
            elif '@' in middle and ':' not in middle and simple_email_re.match(middle):
url = 'mailto:%s' % middle
nofollow_attr = ''
# Make link.
if url:
trimmed = trim_url(middle)
if autoescape:
lead, trail = escape(lead), escape(trail)
url, trimmed = escape(url), escape(trimmed)
middle = '<a href="%s"%s>%s</a>' % (url, nofollow_attr, trimmed)
words[i] = '%s%s%s' % (lead, middle, trail)
elif autoescape:
words[i] = escape(word)
elif autoescape:
words[i] = escape(word)
return "".join(words)
class ScrubberWarning(object):
pass
class Scrubber(object):
allowed_tags = set((
'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'blockquote', 'br',
'center', 'cite', 'code',
'dd', 'del', 'dfn', 'div', 'dl', 'dt', 'em', 'embed', 'font',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',
'kbd', 'li', 'object', 'ol', 'param', 'pre', 'p', 'q',
's', 'samp', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
'table', 'tbody', 'td', 'th', 'thead', 'tr', 'tt', 'ul', 'u',
'var', 'wbr',
))
disallowed_tags_save_content = set((
'blink', 'body', 'html',
))
allowed_attributes = set((
'align', 'alt', 'border', 'cite', 'class', 'dir',
'height', 'href', 'src', 'style', 'title', 'type', 'width',
'face', 'size', # font tags
'flashvars', # Not sure about flashvars - if any harm can come from it
'classid', # FF needs the classid on object tags for flash
'name', 'value', 'quality', 'data', 'scale', # for flash embed param tags, could limit to just param if this is harmful
'salign', 'align', 'wmode',
)) # Bad attributes: 'allowscriptaccess', 'xmlns', 'target'
normalized_tag_replacements = {'b': 'strong', 'i': 'em'}
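
    # The whitelist sets above can be widened or narrowed by overriding them in a
    # subclass (as SelectiveScriptScrubber does below) or by mutating the
    # per-instance copies made in __init__.  Illustrative sketch only; the
    # "VideoScrubber" name is hypothetical and not part of this module:
    #
    #     class VideoScrubber(Scrubber):
    #         allowed_tags = Scrubber.allowed_tags | set(('video', 'source'))
    #         allowed_attributes = Scrubber.allowed_attributes | set(('controls', 'poster'))
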
def __init__(self, base_url=None, autolink=True, nofollow=True, remove_comments=True):
self.base_url = base_url
self.autolink = autolink and bool(urlize)
self.nofollow = nofollow
self.remove_comments = remove_comments
self.allowed_tags = self.__class__.allowed_tags.copy()
self.disallowed_tags_save_content = self.__class__.disallowed_tags_save_content.copy()
self.allowed_attributes = self.__class__.allowed_attributes.copy()
self.normalized_tag_replacements = self.__class__.normalized_tag_replacements.copy()
self.warnings = []
        # Find all _scrub_tag_<name> methods
self.tag_scrubbers = {}
for k in chain(*[cls.__dict__ for cls in self.__class__.__mro__]):
if k.startswith('_scrub_tag_'):
self.tag_scrubbers[k[11:]] = [getattr(self, k)]
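
    # Thanks to the naming convention above, a subclass only has to define a
    # method called _scrub_tag_<tagname> for it to be run against every matching
    # tag by _scrub_soup.  Illustrative sketch; _scrub_tag_table is a hypothetical
    # addition, not part of this module:
    #
    #     def _scrub_tag_table(self, table):
    #         table['border'] = '0'  # force a plain border on incoming tables
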
def autolink_soup(self, soup):
"""Autolink urls in text nodes that aren't already linked (inside anchor tags)."""
def _autolink(node):
if isinstance(node, str):
text = node
text2 = urlize(text, nofollow=self.nofollow)
if text != text2:
                    node.replace_with(text2)
else:
if node.name == "a":
return
for child in node.contents:
_autolink(child)
_autolink(soup)
def strip_disallowed(self, soup):
"""Remove nodes and attributes from the soup that aren't specifically allowed."""
toremove = []
        for node in soup.descendants:
if self.remove_comments and isinstance(node, Comment):
toremove.append((False, node))
continue
if isinstance(node, str):
continue
# Remove disallowed tags
if node.name not in self.allowed_tags:
toremove.append((node.name in self.disallowed_tags_save_content, node))
continue
# Remove disallowed attributes
attrs = {}
if hasattr(node, 'attrs') and isinstance(node.attrs, dict):
for k, v in list(node.attrs.items()):
if not v:
continue
if k.lower() not in self.allowed_attributes:
continue
# TODO: This probably needs to be more robust
if isinstance(v, str):
v2 = v.lower()
if any(x in v2 for x in ('javascript:', 'vbscript:', 'expression(')):
continue
attrs[k] = v
node.attrs = attrs
self._remove_nodes(toremove)
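
        # Illustrative sketch of this pass on a made-up fragment (not from the
        # library's tests):
        #
        #     soup = BeautifulSoup('<p onclick="x()">hi<script>x()</script></p>',
        #                          features="lxml")
        #     Scrubber().strip_disallowed(soup)
        #     # str(soup) is now '<p>hi</p>': the script element is removed outright,
        #     # the onclick attribute is dropped, and the html/body wrappers that
        #     # lxml adds are unwrapped via disallowed_tags_save_content.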
def normalize_html(self, soup):
"""Convert tags to a standard set. (e.g. convert 'b' tags to 'strong')"""
        for node in soup.find_all(list(self.normalized_tag_replacements.keys())):
node.name = self.normalized_tag_replacements[node.name]
# for node in soup.findAll('br', clear="all"):
# node.extract()
def _remove_nodes(self, nodes):
"""Remove a list of nodes from the soup."""
        for keep_contents, node in nodes:
            if keep_contents and node.contents:
idx = node.parent.contents.index(node)
for n in reversed(list(node.contents)): # Copy the contents list to avoid modifying while traversing
node.parent.insert(idx, n)
node.extract()
def _clean_path(self, node, attrname):
url = node.get(attrname)
if url and '://' not in url and not url.startswith('mailto:'):
if url[0] not in ('/', '.') and not self.base_url:
node[attrname] = "http://" + url
elif not url.startswith('http') and self.base_url:
node[attrname] = urljoin(self.base_url, url)
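
        # Illustrative sketch (made-up URLs): with base_url="http://example.com/feed/",
        # a relative value such as "images/pic.jpg" is resolved by urljoin to
        # "http://example.com/feed/images/pic.jpg"; with no base_url configured, a
        # bare "example.org/page" is prefixed to "http://example.org/page".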
def _scrub_tag_a(self, a):
if self.nofollow:
a['rel'] = ["nofollow"]
if not a.get('class', None):
a['class'] = ["external"]
self._clean_path(a, 'href')
def _scrub_tag_img(self, img):
try:
if img['src'].lower().startswith('chrome://'):
return True
except KeyError:
return True
# Make sure images always have an 'alt' attribute
img['alt'] = img.get('alt', '')
self._clean_path(img, 'src')
def _scrub_tag_font(self, node):
attrs = {}
if hasattr(node, 'attrs') and isinstance(node.attrs, dict):
for k, v in list(node.attrs.items()):
if k.lower() == 'size' and v.startswith('+'):
# Remove "size=+0"
continue
attrs[k] = v
node.attrs = attrs
if len(node.attrs) == 0:
            # IE renders font tags with no attributes differently than other browsers, so remove them
return "keep_contents"
def _scrub_html_pre(self, html):
"""Process the html before sanitization"""
return html
def _scrub_html_post(self, html):
"""Process the html after sanitization"""
return html
def _scrub_soup(self, soup):
self.strip_disallowed(soup)
if self.autolink:
self.autolink_soup(soup)
toremove = []
for tag_name, scrubbers in list(self.tag_scrubbers.items()):
for node in soup.find_all(tag_name):
for scrub in scrubbers:
remove = scrub(node)
if remove:
# Remove the node from the tree
toremove.append((remove == "keep_contents", node))
break
self._remove_nodes(toremove)
self.normalize_html(soup)
def scrub(self, html):
"""Return a sanitized version of the given html."""
self.warnings = []
html = self._scrub_html_pre(html)
soup = BeautifulSoup(html, features="lxml")
self._scrub_soup(soup)
html = str(soup)
return self._scrub_html_post(html)
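
    # Typical call (illustrative; the URL and markup are made up):
    #
    #     Scrubber(base_url="http://example.com/").scrub('<p onclick="x()">hi</p>')
    #     # -> '<p>hi</p>'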
class UnapprovedJavascript(ScrubberWarning):
def __init__(self, src):
self.src = src
self.path = src[:src.rfind('/')]
class SelectiveScriptScrubber(Scrubber):
allowed_tags = Scrubber.allowed_tags | set(('script', 'noscript', 'iframe'))
allowed_attributes = Scrubber.allowed_attributes | set(('scrolling', 'frameborder'))
def __init__(self, *args, **kwargs):
super(SelectiveScriptScrubber, self).__init__(*args, **kwargs)
self.allowed_script_srcs = set((
'http://www.statcounter.com/counter/counter_xhtml.js',
# 'http://www.google-analytics.com/urchin.js',
'http://pub.mybloglog.com/',
'http://rpc.bloglines.com/blogroll',
'http://widget.blogrush.com/show.js',
'http://re.adroll.com/',
'http://widgetserver.com/',
'http://pagead2.googlesyndication.com/pagead/show_ads.js', # are there pageadX for all kinds of numbers?
))
self.allowed_script_line_res = set(re.compile(text) for text in (
r"^(var )?sc_project\=\d+;$",
r"^(var )?sc_invisible\=\d;$",
r"^(var )?sc_partition\=\d+;$",
r'^(var )?sc_security\="[A-Za-z0-9]+";$',
# """^_uacct \= "[^"]+";$""",
# """^urchinTracker\(\);$""",
r'^blogrush_feed = "[^"]+";$',
# """^!--$""",
# """^//-->$""",
))
self.allowed_iframe_srcs = set(re.compile(text) for text in (
r'^http://www\.google\.com/calendar/embed\?[\w&;=\%]+$', # Google Calendar
r'^https?://www\.youtube\.com/', # YouTube
r'^http://player\.vimeo\.com/', # Vimeo
))
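
        # Illustrative sketch (made-up src values): an embed like
        # <iframe src="https://www.youtube.com/embed/abc123"></iframe> matches the
        # YouTube pattern above and is kept, while
        # <iframe src="http://evil.example.com/"></iframe> matches nothing and is
        # extracted by _scrub_tag_iframe below.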
def _scrub_tag_script(self, script):
src = script.get('src', None)
if src:
for asrc in self.allowed_script_srcs:
# TODO: It could be dangerous to only check "start" of string
# as there could be browser bugs using crafted urls
if src.startswith(asrc):
script.contents = []
break
else:
self.warnings.append(UnapprovedJavascript(src))
script.extract()
elif script.get('type', '') != 'text/javascript':
script.extract()
else:
for line in script.string.splitlines():
line = line.strip()
if not line:
continue
line_match = any(line_re.match(line) for line_re in self.allowed_script_line_res)
if not line_match:
script.extract()
break
def _scrub_tag_iframe(self, iframe):
src = iframe.get('src', None)
if not src or not any(asrc.match(src) for asrc in self.allowed_iframe_srcs):
iframe.extract()
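

if __name__ == "__main__":
    # Minimal smoke test / usage sketch (not part of the original library); the
    # sample markup and base_url are made-up values for illustration only.
    sample = ('<p onclick="evil()">Read <b>more</b>'
              '<script>alert(1)</script><img src="images/logo.png"></p>')
    scrubber = Scrubber(base_url="http://example.com/feed/")
    print(scrubber.scrub(sample))
    # Expected effects: the onclick attribute and the <script> element are gone,
    # <b> is normalized to <strong>, the image gains an empty alt attribute, and
    # its relative src is resolved against base_url by _clean_path.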