""" Whitelisting HTML sanitizer. Copyright (c) 2009-2010 Lefora See LICENSE for license details. """ __author__ = "Samuel Stauffer " __version__ = "1.6.1" __license__ = "BSD" __all__ = ['Scrubber', 'SelectiveScriptScrubber', 'ScrubberWarning', 'UnapprovedJavascript', 'urlize'] import re, string from urllib.parse import urljoin from itertools import chain from bs4 import BeautifulSoup, Comment def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False): """Converts any URLs in text into clickable links. If trim_url_limit is not None, the URLs in link text longer than this limit will truncated to trim_url_limit-3 characters and appended with an elipsis. If nofollow is True, the URLs in link text will get a rel="nofollow" attribute. If autoescape is True, the link text and URLs will get autoescaped. *Modified from Django* """ from urllib.parse import quote as urlquote LEADING_PUNCTUATION = ['(', '<', '<'] TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '>'] word_split_re = re.compile(r'([\s\xa0]+| )') # a0 == NBSP punctuation_re = re.compile('^(?P(?:%s)*)(?P.*?)(?P(?:%s)*)$' % \ ('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]), '|'.join([re.escape(x) for x in TRAILING_PUNCTUATION]))) simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$') # del x # Temporary variable def escape(html): return html.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''') trim_url = lambda x, limit=trim_url_limit: limit is not None and (len(x) > limit and ('%s...' % x[:max(0, limit - 3)])) or x words = word_split_re.split(text) nofollow_attr = nofollow and ' rel="nofollow"' or '' for i, word in enumerate(words): match = None if '.' in word or '@' in word or ':' in word: match = punctuation_re.match(word.replace('\u2019', "'")) if match: lead, middle, trail = match.groups() middle = middle.encode('utf-8') middle = middle.decode('utf-8') # Bytes to str # Make URL we want to point to. url = None if middle.startswith('http://') or middle.startswith('https://'): url = urlquote(middle, safe='%/&=:;#?+*') elif middle.startswith('www.') or ('@' not in middle and \ middle and middle[0] in string.ascii_letters + string.digits and \ (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))): url = urlquote('http://%s' % middle, safe='%/&=:;#?+*') elif '@' in middle and not ':' in middle and simple_email_re.match(middle): url = 'mailto:%s' % middle nofollow_attr = '' # Make link. 


class ScrubberWarning(object):
    pass


class Scrubber(object):
    allowed_tags = set((
        'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'blockquote', 'br',
        'center', 'cite', 'code', 'dd', 'del', 'dfn', 'div', 'dl', 'dt', 'em',
        'embed', 'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img',
        'ins', 'kbd', 'li', 'object', 'ol', 'param', 'pre', 'p', 'q', 's',
        'samp', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'th', 'thead', 'tr', 'tt', 'ul', 'u', 'var', 'wbr',
    ))
    disallowed_tags_save_content = set((
        'blink', 'body', 'html',
    ))
    allowed_attributes = set((
        'align', 'alt', 'border', 'cite', 'class', 'dir', 'height', 'href',
        'src', 'style', 'title', 'type', 'width',
        'face', 'size',  # font tags
        'flashvars',  # Not sure about flashvars - if any harm can come from it
        'classid',  # FF needs the classid on object tags for flash
        'name', 'value', 'quality', 'data', 'scale',  # for flash embed/param tags, could limit to just param if this is harmful
        'salign', 'align', 'wmode',
    ))
    # Bad attributes: 'allowscriptaccess', 'xmlns', 'target'
    normalized_tag_replacements = {'b': 'strong', 'i': 'em'}

    def __init__(self, base_url=None, autolink=True, nofollow=True, remove_comments=True):
        self.base_url = base_url
        self.autolink = autolink and bool(urlize)
        self.nofollow = nofollow
        self.remove_comments = remove_comments

        self.allowed_tags = self.__class__.allowed_tags.copy()
        self.disallowed_tags_save_content = self.__class__.disallowed_tags_save_content.copy()
        self.allowed_attributes = self.__class__.allowed_attributes.copy()
        self.normalized_tag_replacements = self.__class__.normalized_tag_replacements.copy()

        self.warnings = []

        # Find all _scrub_tag_* methods
        self.tag_scrubbers = {}
        for k in chain(*[cls.__dict__ for cls in self.__class__.__mro__]):
            if k.startswith('_scrub_tag_'):
                self.tag_scrubbers[k[11:]] = [getattr(self, k)]

    def autolink_soup(self, soup):
        """Autolink urls in text nodes that aren't already linked (inside anchor tags)."""
        def _autolink(node):
            if isinstance(node, str):
                text = node
                text2 = urlize(text, nofollow=self.nofollow)
                if text != text2:
                    node.replaceWith(text2)
            else:
                if node.name == "a":
                    return
                for child in node.contents:
                    _autolink(child)
        _autolink(soup)

    def strip_disallowed(self, soup):
        """Remove nodes and attributes from the soup that aren't specifically allowed."""
        toremove = []
        for node in soup.recursiveChildGenerator():
            if self.remove_comments and isinstance(node, Comment):
                toremove.append((False, node))
                continue

            if isinstance(node, str):
                continue

            # Remove disallowed tags
            if node.name not in self.allowed_tags:
                toremove.append((node.name in self.disallowed_tags_save_content, node))
                continue

            # Remove disallowed attributes
            attrs = {}
            if hasattr(node, 'attrs') and isinstance(node.attrs, dict):
                for k, v in list(node.attrs.items()):
                    if not v:
                        continue
                    if k.lower() not in self.allowed_attributes:
                        continue
                    # TODO: This probably needs to be more robust
                    if isinstance(v, str):
                        v2 = v.lower()
                        if any(x in v2 for x in ('javascript:', 'vbscript:', 'expression(')):
                            continue
                    attrs[k] = v
                node.attrs = attrs

        self._remove_nodes(toremove)

    def normalize_html(self, soup):
        """Convert tags to a standard set (e.g. convert 'b' tags to 'strong')."""
        for node in soup.findAll(list(self.normalized_tag_replacements.keys())):
            node.name = self.normalized_tag_replacements[node.name]
        # for node in soup.findAll('br', clear="all"):
        #     node.extract()

    def _remove_nodes(self, nodes):
        """Remove a list of nodes from the soup."""
        for keep_contents, node in nodes:
            if keep_contents and node.contents:
                idx = node.parent.contents.index(node)
                # Copy the contents list to avoid modifying it while traversing
                for n in reversed(list(node.contents)):
                    node.parent.insert(idx, n)
            node.extract()

    def _clean_path(self, node, attrname):
        url = node.get(attrname)
        if url and '://' not in url and not url.startswith('mailto:'):
            if url[0] not in ('/', '.') and not self.base_url:
                node[attrname] = "http://" + url
            elif not url.startswith('http') and self.base_url:
                node[attrname] = urljoin(self.base_url, url)

    def _scrub_tag_a(self, a):
        if self.nofollow:
            a['rel'] = ["nofollow"]
        if not a.get('class', None):
            a['class'] = ["external"]
        self._clean_path(a, 'href')

    def _scrub_tag_img(self, img):
        try:
            if img['src'].lower().startswith('chrome://'):
                return True
        except KeyError:
            return True

        # Make sure images always have an 'alt' attribute
        img['alt'] = img.get('alt', '')
        self._clean_path(img, 'src')

    def _scrub_tag_font(self, node):
        attrs = {}
        if hasattr(node, 'attrs') and isinstance(node.attrs, dict):
            for k, v in list(node.attrs.items()):
                if k.lower() == 'size' and v.startswith('+'):  # Remove "size=+0"
                    continue
                attrs[k] = v
            node.attrs = attrs

        if len(node.attrs) == 0:
            # IE renders font tags with no attributes differently than other browsers, so remove them
            return "keep_contents"

    def _scrub_html_pre(self, html):
        """Process the html before sanitization."""
        return html

    def _scrub_html_post(self, html):
        """Process the html after sanitization."""
        return html

    def _scrub_soup(self, soup):
        self.strip_disallowed(soup)

        if self.autolink:
            self.autolink_soup(soup)

        toremove = []
        for tag_name, scrubbers in list(self.tag_scrubbers.items()):
            for node in soup.find_all(tag_name):
                for scrub in scrubbers:
                    remove = scrub(node)
                    if remove:
                        # Remove the node from the tree
                        toremove.append((remove == "keep_contents", node))
                        break
        self._remove_nodes(toremove)

        self.normalize_html(soup)

    def scrub(self, html):
        """Return a sanitized version of the given html."""
        self.warnings = []
        html = self._scrub_html_pre(html)
        soup = BeautifulSoup(html, features="lxml")
        self._scrub_soup(soup)
        html = str(soup)
        return self._scrub_html_post(html)
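

# Example of extending the scrubber (an illustrative sketch, not part of the
# original module; the class name is hypothetical): _scrub_tag_* methods are
# discovered automatically in Scrubber.__init__, so a subclass only needs to
# define one per tag. Returning a truthy value removes the matched node, and
# returning "keep_contents" removes the tag but keeps its children
# (see _scrub_soup and _remove_nodes above).
class ExampleTableScrubber(Scrubber):
    """Hypothetical subclass that unwraps 'table' tags, keeping their content."""

    def _scrub_tag_table(self, table):
        return "keep_contents"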


class UnapprovedJavascript(ScrubberWarning):
    def __init__(self, src):
        self.src = src
        self.path = src[:src.rfind('/')]


class SelectiveScriptScrubber(Scrubber):
    allowed_tags = Scrubber.allowed_tags | set(('script', 'noscript', 'iframe'))
    allowed_attributes = Scrubber.allowed_attributes | set(('scrolling', 'frameborder'))

    def __init__(self, *args, **kwargs):
        super(SelectiveScriptScrubber, self).__init__(*args, **kwargs)

        self.allowed_script_srcs = set((
            'http://www.statcounter.com/counter/counter_xhtml.js',
            # 'http://www.google-analytics.com/urchin.js',
            'http://pub.mybloglog.com/',
            'http://rpc.bloglines.com/blogroll',
            'http://widget.blogrush.com/show.js',
            'http://re.adroll.com/',
            'http://widgetserver.com/',
            'http://pagead2.googlesyndication.com/pagead/show_ads.js',  # are there pageadX for all kinds of numbers?
        ))

        self.allowed_script_line_res = set(re.compile(text) for text in (
            r"^(var )?sc_project\=\d+;$",
            r"^(var )?sc_invisible\=\d;$",
            r"^(var )?sc_partition\=\d+;$",
            r'^(var )?sc_security\="[A-Za-z0-9]+";$',
            # """^_uacct \= "[^"]+";$""",
            # """^urchinTracker\(\);$""",
            r'^blogrush_feed = "[^"]+";$',
            # """^<!--$""",
            # """^//-->$""",
        ))

        self.allowed_iframe_srcs = set(re.compile(text) for text in (
            r'^http://www\.google\.com/calendar/embed\?[\w&;=\%]+$',  # Google Calendar
            r'^https?://www\.youtube\.com/',  # YouTube
            r'^http://player\.vimeo\.com/',  # Vimeo
        ))

    def _scrub_tag_script(self, script):
        src = script.get('src', None)
        if src:
            for asrc in self.allowed_script_srcs:
                # TODO: It could be dangerous to only check the "start" of the string
                # as there could be browser bugs using crafted urls
                if src.startswith(asrc):
                    script.contents = []
                    break
            else:
                self.warnings.append(UnapprovedJavascript(src))
                script.extract()
        elif script.get('type', '') != 'text/javascript':
            script.extract()
        else:
            for line in script.string.splitlines():
                line = line.strip()
                if not line:
                    continue

                line_match = any(line_re.match(line) for line_re in self.allowed_script_line_res)
                if not line_match:
                    script.extract()
                    break

    def _scrub_tag_iframe(self, iframe):
        src = iframe.get('src', None)
        if not src or not any(asrc.match(src) for asrc in self.allowed_iframe_srcs):
            iframe.extract()
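

# A minimal smoke test (an illustrative sketch, not part of the original
# module; assumes bs4 and lxml are installed). Running the file directly
# should show the script tag and the onclick attribute stripped, 'b'
# normalized to 'strong', and the paragraph text kept.
if __name__ == "__main__":
    _scrubber = Scrubber()
    _dirty = '<p onclick="evil()">Hello <script>alert(1)</script><b>world</b></p>'
    print(_scrubber.scrub(_dirty))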