653 lines
19 KiB
Python
653 lines
19 KiB
Python
# flake8: noqa
|
|
"""
|
|
Shim module between Bleach and html5lib. This makes it easier to upgrade the
|
|
html5lib library without having to change a lot of code.
|
|
"""
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
import re
|
|
import string
|
|
import warnings
|
|
|
|
import six
|
|
|
|
# ignore html5lib deprecation warnings to use bleach; we are bleach
# apply before we import submodules that import html5lib
# NOTE: the filter is keyed on the vendored module path
# ("bleach._vendor.html5lib"), so warnings from other html5lib
# installations are unaffected.
warnings.filterwarnings(
    "ignore",
    message="html5lib's sanitizer is deprecated",
    category=DeprecationWarning,
    module="bleach._vendor.html5lib",
)
|
|
|
|
from bleach._vendor.html5lib import ( # noqa: E402 module level import not at top of file
|
|
HTMLParser,
|
|
getTreeWalker,
|
|
)
|
|
from bleach._vendor.html5lib import (
|
|
constants,
|
|
) # noqa: E402 module level import not at top of file
|
|
from bleach._vendor.html5lib.constants import ( # noqa: E402 module level import not at top of file
|
|
namespaces,
|
|
prefixes,
|
|
)
|
|
from bleach._vendor.html5lib.constants import (
|
|
_ReparseException as ReparseException,
|
|
) # noqa: E402 module level import not at top of file
|
|
from bleach._vendor.html5lib.filters.base import (
|
|
Filter,
|
|
) # noqa: E402 module level import not at top of file
|
|
from bleach._vendor.html5lib.filters.sanitizer import (
|
|
allowed_protocols,
|
|
) # noqa: E402 module level import not at top of file
|
|
from bleach._vendor.html5lib.filters.sanitizer import (
|
|
Filter as SanitizerFilter,
|
|
) # noqa: E402 module level import not at top of file
|
|
from bleach._vendor.html5lib._inputstream import (
|
|
HTMLInputStream,
|
|
) # noqa: E402 module level import not at top of file
|
|
from bleach._vendor.html5lib.serializer import (
|
|
HTMLSerializer,
|
|
) # noqa: E402 module level import not at top of file
|
|
from bleach._vendor.html5lib._tokenizer import (
|
|
attributeMap,
|
|
HTMLTokenizer,
|
|
) # noqa: E402 module level import not at top of file
|
|
from bleach._vendor.html5lib._trie import (
|
|
Trie,
|
|
) # noqa: E402 module level import not at top of file
|
|
|
|
|
|
#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
#: (used by match_entity for prefix lookups of named entities)
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
#: Set of the three tag-like token types (start/end/empty tags)
TAG_TOKEN_TYPES = {
    constants.tokenTypes["StartTag"],
    constants.tokenTypes["EndTag"],
    constants.tokenTypes["EmptyTag"],
}
#: Token type for character data
CHARACTERS_TYPE = constants.tokenTypes["Characters"]
#: Token type emitted by the tokenizer for parse errors
PARSEERROR_TYPE = constants.tokenTypes["ParseError"]
|
|
|
|
|
|
#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
#:
#: NOTE(review): this deliberately includes every element (even "script" and
#: "style"); presumably allowed-tag filtering happens elsewhere -- confirm
#: against the sanitizer before narrowing this list.
HTML_TAGS = [
    "a",
    "abbr",
    "address",
    "area",
    "article",
    "aside",
    "audio",
    "b",
    "base",
    "bdi",
    "bdo",
    "blockquote",
    "body",
    "br",
    "button",
    "canvas",
    "caption",
    "cite",
    "code",
    "col",
    "colgroup",
    "data",
    "datalist",
    "dd",
    "del",
    "details",
    "dfn",
    "dialog",
    "div",
    "dl",
    "dt",
    "em",
    "embed",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "i",
    "iframe",
    "img",
    "input",
    "ins",
    "kbd",
    "keygen",
    "label",
    "legend",
    "li",
    "link",
    "map",
    "mark",
    "menu",
    "meta",
    "meter",
    "nav",
    "noscript",
    "object",
    "ol",
    "optgroup",
    "option",
    "output",
    "p",
    "param",
    "picture",
    "pre",
    "progress",
    "q",
    "rp",
    "rt",
    "ruby",
    "s",
    "samp",
    "script",
    "section",
    "select",
    "slot",
    "small",
    "source",
    "span",
    "strong",
    "style",
    "sub",
    "summary",
    "sup",
    "table",
    "tbody",
    "td",
    "template",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "time",
    "title",
    "tr",
    "track",
    "u",
    "ul",
    "var",
    "video",
    "wbr",
]
|
|
|
|
|
|
class InputStreamWithMemory(object):
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """

    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        # Delegate reset/position straight through to the wrapped stream.
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        # Characters consumed since the last "<" (see start_tag()).
        self._buffer = []

    @property
    def errors(self):
        # Pass-through to the wrapped stream.
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        # Pass-through to the wrapped stream.
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        # Pass-through to the wrapped stream.
        return self._inner_stream.changeEncoding

    def char(self):
        """Read one character from the wrapped stream, recording it."""
        nxt = self._inner_stream.char()
        # char() can return None at EOF, so don't record that.
        if nxt:
            self._buffer.append(nxt)
        return nxt

    def charsUntil(self, characters, opposite=False):
        """Read a run of characters, recording each one in the buffer."""
        consumed = self._inner_stream.charsUntil(characters, opposite=opposite)
        # Extending with a string appends it character by character.
        self._buffer.extend(consumed)
        return consumed

    def unget(self, char):
        """Push a character back onto the stream, dropping it from the buffer."""
        if self._buffer:
            self._buffer.pop()
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        # "" is already unicode here thanks to the module-level
        # unicode_literals import, matching six.text_type("") on both
        # Python 2 and 3.
        return "".join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ["<"]
|
|
|
|
|
|
class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        """
        :arg consume_entities: whether to consume character entities like
            the stock html5lib tokenizer does (True) or leave them as-is
            in the stream (False, the default)
        :arg kwargs: passed through to ``HTMLTokenizer.__init__``

        """
        super(BleachHTMLTokenizer, self).__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

    def __iter__(self):
        """Yield tokens from the html5lib tokenizer, holding back each
        ParseError token so it can be reconciled with the token that
        follows it.
        """
        last_error_token = None

        for token in super(BleachHTMLTokenizer, self).__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (OrderedDict 3.7+ and dict otherwise)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    last_error_token = None
                    yield token

                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = CHARACTERS_TYPE

                    last_error_token = None
                    yield token

                elif token["type"] == PARSEERROR_TYPE:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    # Neither special case applies: emit the held error and
                    # the current token unchanged.
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == PARSEERROR_TYPE:
                last_error_token = token
                continue

            yield token

        # Flush a trailing held error token at end of stream.
        if last_error_token:
            yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        """Consume a character entity, or put the "&" back if this
        tokenizer was built with ``consume_entities=False``.
        """
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super(BleachHTMLTokenizer, self).consumeEntity(
                allowedChar, fromAttribute
            )

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            # Append the "&" to the value of the attribute currently being
            # tokenized.
            self.currentToken["data"][-1][1] += "&"

        else:
            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": "&"})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super(BleachHTMLTokenizer, self).tagOpenState()

    def emitCurrentToken(self):
        """Emit the current token, converting disallowed tags to
        Characters tokens (stripped or escaped per ``self.parser.strip``).
        """
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                # If we're stripping the token, we just throw in an empty
                # string token.
                new_data = ""

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": CHARACTERS_TYPE, "data": new_data}

            self.currentToken = new_token
            self.tokenQueue.append(new_token)
            # Return to the data state rather than letting the superclass
            # continue tag processing for a token we've replaced.
            self.state = self.dataState
            return

        super(BleachHTMLTokenizer, self).emitCurrentToken()
|
|
|
|
|
|
class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        # Lowercase the allowed tags so comparisons against lowercased
        # token names are case-insensitive.
        self.tags = [tag.lower() for tag in tags] if tags is not None else None
        self.strip = strip
        self.consume_entities = consume_entities
        super(BleachHTMLParser, self).__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        # parser=self lets the tokenizer consult self.tags and self.strip
        # when deciding what to do with disallowed tags.
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            # html5lib signals that parsing must restart from scratch
            # (e.g. after detecting a different encoding); reset and rerun.
            self.reset()
            self.mainLoop()
|
|
|
|
|
|
def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] != "#":
        # Named entity: look it up; unknown names yield None (an
        # ambiguous ampersand).
        return ENTITIES.get(value, None)

    # Numeric entity: "#x.."/"#X.." is hexadecimal, plain "#.." is decimal.
    if value[1] in ("x", "X"):
        codepoint = int(value[2:], 16)
    else:
        codepoint = int(value[1:], 10)
    return six.unichr(codepoint)
|
|
|
|
|
|
def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    # Fast path: no ampersand means there can be no entities.
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    # Skip past the entity: len(entity) plus 2 for the
                    # leading "&" and the trailing ";".
                    remainder = part[len(entity) + 2 :]
                    # Fix: test the remainder (which may be empty) rather
                    # than ``part`` (always truthy here since it starts
                    # with "&"), so we don't append empty strings.
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)
|
|
|
|
|
|
def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with
    a ";". This ignores ambiguous character entities that have no ";" at the
    end.

    :arg stream: the character stream

    :returns: ``None`` or the entity string without "&" or ";"

    :raises ValueError: if the stream doesn't begin with "&"

    """
    # Nix the & at the beginning
    if stream[0] != "&":
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ""
    end_characters = "<&=;" + string.whitespace

    # Handle number entities
    if stream and stream[0] == "#":
        possible_entity = "#"
        stream.pop(0)

        if stream and stream[0] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            possible_entity += stream.pop(0)
        else:
            allowed = "0123456789"

        # Remember where the digits start so we can tell whether any were
        # actually consumed below.
        prefix_len = len(possible_entity)

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        # Require at least one digit after the "#"/"#x" prefix: "&#;" and
        # "&#x;" are not numeric entities, and returning them would make
        # convert_entity() raise ValueError on int("").
        if len(possible_entity) > prefix_len and stream and stream[0] == ";":
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        # Stop as soon as the accumulated prefix can't be extended into
        # any known named entity.
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            break
        possible_entity += c

    if possible_entity and stream and stream[0] == ";":
        return possible_entity

    return None
|
|
|
|
|
|
AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    # Splitting on a captured "&" gives text chunks at even indices and
    # the literal "&" separators at odd indices, e.g.
    # "a&b" -> ["a", "&", "b"].
    pieces = AMP_SPLIT_RE.split(text)
    # The leading chunk never follows an ampersand; emit it unchanged.
    yield pieces[0]
    # Every later even-indexed chunk came right after an "&"; re-attach it.
    for chunk in pieces[2::2]:
        yield "&" + chunk
|
|
|
|
|
|
class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values

        :arg stoken: serialized attribute-value string token

        :returns: generator of string parts with bare ampersands escaped
            to ``&amp;`` and unambiguous entities left intact

        """
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        #
        # Fix: the corrupted no-op replace("&", "&") is restored to the
        # intended un-escaping of "&amp;".
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield "&" + entity + ";"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            # Fix: restore the intended escaping of bare "&" to "&amp;";
            # the corrupted version was a no-op replace("&", "&").
            yield part.replace("&", "&amp;")

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        # Track whether we're inside a tag and whether the previous token
        # was "=" (i.e. the current token is an attribute value).
        in_tag = False
        after_equals = False

        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    # Unquoted attribute value token: escape bare ampersands
                    # in it instead of yielding it verbatim.
                    if stoken != '"':
                        for part in self.escape_base_amp(stoken):
                            yield part

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith("<"):
                    in_tag = True
                yield stoken
|