Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
578
venv/Lib/site-packages/bleach/linkifier.py
Normal file
578
venv/Lib/site-packages/bleach/linkifier.py
Normal file
|
@ -0,0 +1,578 @@
|
|||
from __future__ import unicode_literals
|
||||
import re
|
||||
import six
|
||||
|
||||
from bleach import callbacks as linkify_callbacks
|
||||
from bleach import html5lib_shim
|
||||
from bleach.utils import alphabetize_attributes, force_unicode
|
||||
|
||||
|
||||
#: List of default callbacks; used as the ``callbacks`` default by both
#: ``Linker`` and ``LinkifyFilter`` in this module
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
|
||||
|
||||
|
||||
#: Top-level domains known to the default URL and email regexes.
#:
#: The whitespace-separated literal is written in alphabetical order and then
#: reversed, so that ``com`` comes before ``co`` -- this makes sure that .com
#: doesn't get matched by .co first.
TLDS = list(
    reversed(
        """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
xn xxx ye yt yu za zm zw""".split()
    )
)
|
||||
|
||||
|
||||
def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Compile the regex the linkifier uses to find URLs in text

    To recognize a different set of tlds or protocols, build your own regex
    and pass it in place of the default ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = linkifier.Linker(url_re=my_url_re)

    :arg list tlds: top-level domains to accept; joined (sorted) into a
        regex alternation as-is

    :arg list protocols: protocols to accept before ``:``; joined (sorted)
        into a regex alternation as-is

    :returns: compiled regex (case-insensitive, verbose, unicode)

    """
    protocol_alternation = "|".join(sorted(protocols))
    tld_alternation = "|".join(sorted(tlds))

    # Literal braces are doubled because the pattern goes through str.format
    pattern = r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format(
        protocol_alternation, tld_alternation
    )

    flags = re.IGNORECASE | re.VERBOSE | re.UNICODE
    return re.compile(pattern, flags)
|
||||
|
||||
|
||||
#: Default url matching regex, built from ``TLDS`` and the default protocols
URL_RE = build_url_re()


#: Matches a leading ``scheme:`` followed by up to three slashes; used by
#: ``LinkifyFilter.handle_links`` to decide whether a matched url already
#: carries a protocol
PROTO_RE = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)
|
||||
|
||||
|
||||
def build_email_re(tlds=TLDS):
    """Compile the regex the linkifier uses to find email addresses in text

    To recognize a different set of tlds, build your own regex and pass it in
    place of the default ``email_re``::

        from bleach import linkifier

        my_email_re = linkifier.build_email_re(my_tlds_list)

        linker = linkifier.Linker(email_re=my_email_re, parse_email=True)

    :arg list tlds: top-level domains to accept; joined into a regex
        alternation in the order given

    :returns: compiled regex (case-insensitive, multiline, verbose)

    """
    tld_alternation = "|".join(tlds)

    # Literal braces are doubled because the pattern goes through str.format
    pattern = r"""(?<!//)
        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
        """.format(
        tld_alternation
    )

    flags = re.IGNORECASE | re.MULTILINE | re.VERBOSE
    return re.compile(pattern, flags)
|
||||
|
||||
|
||||
#: Default email matching regex, built from ``TLDS``
EMAIL_RE = build_email_re()
|
||||
|
||||
|
||||
class Linker(object):
    """Turn URL-like strings in an HTML fragment into links

    Strings that look like URLs, domain names and (optionally) email
    addresses in text that may be an HTML fragment are converted to links,
    while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """

    def __init__(
        self,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
        recognized_tags=html5lib_shim.HTML_TAGS,
    ):
        """Set up the parser, tree walker and serializer used by ``linkify``

        :arg list callbacks: list of callbacks to run when adjusting tag
            attributes; defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to
            skip linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :arg list-of-strings recognized_tags: the list of tags that linkify
            knows about; everything else gets escaped

        """
        # Settings that are handed to LinkifyFilter on every linkify() call
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # The parser/tokenizer allows every tag in recognized_tags and
        # escapes anything not in that list.
        parser_options = dict(
            tags=recognized_tags,
            strip=False,
            consume_entities=True,
            namespaceHTMLElements=False,
        )
        self.parser = html5lib_shim.BleachHTMLParser(**parser_options)

        self.walker = html5lib_shim.getTreeWalker("etree")

        serializer_options = dict(
            quote_attr_values="always",
            omit_optional_tags=False,
            # linkify does not sanitize
            sanitize=False,
            # LinkifyFilter orders attributes itself (alphabetize_attributes),
            # so the serializer is told not to
            alphabetical_attributes=False,
        )
        self.serializer = html5lib_shim.BleachHTMLSerializer(**serializer_options)

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if isinstance(text, six.string_types):
            text = force_unicode(text)
        else:
            raise TypeError("argument must be of text type")

        # Nothing to parse -- short-circuit on empty input
        if not text:
            return ""

        fragment = self.parser.parseFragment(text)
        token_stream = self.walker(fragment)

        linkified = LinkifyFilter(
            source=token_stream,
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(linkified)
|
||||
|
||||
|
||||
class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links (only when ``parse_email`` is true)
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """

    def __init__(
        self,
        source,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
    ):
        """Creates a LinkifyFilter instance

        :arg TreeWalker source: stream

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        """
        super(LinkifyFilter, self).__init__(source)

        # Falsy values (``None``, empty list) are normalized to an empty list
        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or []
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case of
        ``None``, we stop going through callbacks and return that and the link
        gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs

    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream

        :arg list token_list: tokens to scan

        :returns: the ``data`` of every ``Characters`` and ``SpaceCharacters``
            token, concatenated in order

        """
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.
        return "".join(
            token["data"]
            for token in token_list
            if token["type"] in ("Characters", "SpaceCharacters")
        )

    def _build_link_tokens(self, attrs):
        """Builds the tokens for a newly created ``a`` element

        :arg dict attrs: attrs returned by the callbacks; the ``_text``
            fauxttribute (if present) is removed and used as the link text

        :returns: list of ``StartTag``, ``Characters`` and ``EndTag`` tokens

        """
        _text = attrs.pop("_text", "")
        attrs = alphabetize_attributes(attrs)
        return [
            {"type": "StartTag", "name": "a", "data": attrs},
            {"type": "Characters", "data": force_unicode(_text)},
            {"type": "EndTag", "name": "a"},
        ]

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens

        :arg iterable src_iter: token stream

        :returns: generator of tokens in which every email address found in a
            ``Characters`` token is either wrapped in an ``a`` element or, if
            the callbacks return ``None``, left as plain text; all other
            tokens pass through unchanged

        """
        for token in src_iter:
            if token["type"] != "Characters":
                yield token
                continue

            text = token["data"]
            new_tokens = []
            end = 0

            # For each email address we find in the text
            for match in self.email_re.finditer(text):
                if match.start() > end:
                    # Text between the previous match and this one
                    new_tokens.append(
                        {"type": "Characters", "data": text[end : match.start()]}
                    )

                address = match.group(0)

                # Run attributes through the callbacks to see what we
                # should do with this match
                attrs = {
                    (None, "href"): "mailto:%s" % address,
                    "_text": address,
                }
                attrs = self.apply_callbacks(attrs, True)

                if attrs is None:
                    # Just add the text--but not as a link
                    new_tokens.append({"type": "Characters", "data": address})
                else:
                    # Add an "a" tag for the new link
                    new_tokens.extend(self._build_link_tokens(attrs))

                end = match.end()

            if not new_tokens:
                # No email address in this token--pass it through untouched
                yield token
                continue

            # Yield the adjusted set of tokens, including any trailing text
            if end < len(text):
                new_tokens.append({"type": "Characters", "data": text[end:]})

            for new_token in new_tokens:
                yield new_token

    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        :arg str fragment: the text matched by the url regex

        :returns: ``(url, prefix, suffix)`` where ``prefix`` holds the
            stripped leading parentheses and ``suffix`` the stripped trailing
            parentheses, commas and periods, so that
            ``prefix + url + suffix`` equals the original fragment

        """
        prefix = suffix = ""

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith("("):
                prefix = prefix + "("
                fragment = fragment[1:]

                if fragment.endswith(")"):
                    suffix = ")" + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            #     "i looked at the site (at http://example.com)"

            if fragment.endswith(")") and "(" not in fragment:
                fragment = fragment[:-1]
                suffix = ")" + suffix
                continue

            # Handle commas
            if fragment.endswith(","):
                fragment = fragment[:-1]
                suffix = "," + suffix
                continue

            # Handle periods
            if fragment.endswith("."):
                fragment = fragment[:-1]
                suffix = "." + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix

    def handle_links(self, src_iter):
        """Handle links in character tokens

        :arg iterable src_iter: token stream

        :returns: generator of tokens in which every url found in a
            ``Characters`` token outside of an ``a`` element is either wrapped
            in an ``a`` element or, if the callbacks return ``None``, left as
            plain text; all other tokens pass through unchanged

        """
        in_a = False  # happens, if parse_email=True and if a mail was found
        for token in src_iter:
            if in_a:
                # Never linkify inside an "a" element; just watch for its end
                if token["type"] == "EndTag" and token["name"] == "a":
                    in_a = False
                yield token
                continue
            elif token["type"] == "StartTag" and token["name"] == "a":
                in_a = True
                yield token
                continue

            if token["type"] != "Characters":
                yield token
                continue

            text = token["data"]
            new_tokens = []
            end = 0

            for match in self.url_re.finditer(text):
                if match.start() > end:
                    # Text between the previous match and this one
                    new_tokens.append(
                        {"type": "Characters", "data": text[end : match.start()]}
                    )

                # Sometimes we pick up too much in the url match, so look for
                # bits we should drop and remove them from the match
                url, prefix, suffix = self.strip_non_url_bits(match.group(0))

                # If there's no protocol, add one
                if PROTO_RE.search(url):
                    href = url
                else:
                    href = "http://%s" % url

                attrs = {(None, "href"): href, "_text": url}
                attrs = self.apply_callbacks(attrs, True)

                if attrs is None:
                    # Just add the text
                    new_tokens.append(
                        {"type": "Characters", "data": prefix + url + suffix}
                    )
                else:
                    # Add the "a" tag! The stripped prefix/suffix stay
                    # outside of the link as plain text.
                    if prefix:
                        new_tokens.append({"type": "Characters", "data": prefix})

                    new_tokens.extend(self._build_link_tokens(attrs))

                    if suffix:
                        new_tokens.append({"type": "Characters", "data": suffix})

                end = match.end()

            if not new_tokens:
                # No url in this token--pass it through untouched
                yield token
                continue

            # Yield the adjusted set of tokens, including any trailing text
            if end < len(text):
                new_tokens.append({"type": "Characters", "data": text[end:]})

            for new_token in new_tokens:
                yield new_token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        :arg list token_buffer: the start "a" token, everything inside the
            element and the end "a" token, in stream order

        """
        a_token = token_buffer[0]
        if a_token["data"]:
            attrs = a_token["data"]
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs["_text"] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {"type": "Characters", "data": text}

        else:
            new_text = attrs.pop("_text", "")
            a_token["data"] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {"type": "Characters", "data": force_unicode(new_text)}
                yield token_buffer[-1]

    def __iter__(self):
        """Yields the linkified token stream

        Existing ``a`` elements are buffered and run through the callbacks,
        text inside ``skip_tags`` elements is passed through untouched, and
        all other ``Characters`` tokens are linkified.

        """
        in_a = False

        # Name of the outermost skip tag we are currently inside (or None) and
        # how many elements with that name are currently open. Tracking the
        # depth keeps us in skip mode until the *outermost* skip tag closes,
        # even if skip tags are nested inside of it.
        in_skip_tag = None
        skip_depth = 0

        token_buffer = []

        for token in super(LinkifyFilter, self).__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token["type"] == "EndTag" and token["name"] == "a":
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    for new_token in self.handle_a_tag(token_buffer):
                        yield new_token

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.append(token)
                continue

            if token["type"] in ["StartTag", "EmptyTag"]:
                if token["name"] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag. An EmptyTag has no end tag,
                    # so it must not start that mode.
                    if token["type"] == "StartTag":
                        if in_skip_tag is None:
                            in_skip_tag = token["name"]
                            skip_depth = 1
                        elif token["name"] == in_skip_tag:
                            skip_depth += 1

                elif token["name"] == "a":
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token["type"] == "EndTag" and token["name"] == in_skip_tag:
                    skip_depth -= 1
                    if skip_depth == 0:
                        in_skip_tag = None

            elif token["type"] == "Characters":
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for new_token in new_stream:
                    yield new_token

                # We've already yielded this token, so continue
                continue

            yield token

        # If the stream ended inside an "a" element that was never closed,
        # emit the buffered tokens unchanged rather than silently dropping them
        for buffered_token in token_buffer:
            yield buffered_token
|
Loading…
Add table
Add a link
Reference in a new issue