Uploaded Test files
This commit is contained in:
parent f584ad9d97
commit 2e81cb7d99

16627 changed files with 2065359 additions and 102444 deletions
40
venv/Lib/site-packages/bleach/_vendor/README.rst
Normal file
@@ -0,0 +1,40 @@
=======================
Vendored library policy
=======================

To simplify Bleach development, we're now vendoring certain libraries that
we use.

Vendored libraries must follow these rules:

1. Vendored libraries must be pure Python--no compiling.
2. Source code for the library is included in this directory.
3. License must be included in this repo and in the Bleach distribution.
4. Requirements of the library become requirements of Bleach.
5. No modifications to the library may be made.


Adding/Updating a vendored library
==================================

To vendor a library or update a version:

1. Update ``vendor.txt`` with the library, version, and hash. You can use
   `hashin <https://pypi.org/project/hashin/>`_ (see the sketch below).
2. Remove all old files and directories of the old version.
3. Run ``pip_install_vendor.sh`` and check in everything it produces, including
   the ``.dist-info`` directory and contents.


Reviewing a change involving a vendored library
===============================================

To verify a vendored library addition/update:

1. Pull down the branch.
2. Delete all the old files and directories of the old version.
3. Run ``pip_install_vendor.sh``.
4. Run ``git diff`` and verify there are no changes.


NB: the current ``vendor.txt`` was generated with pip 20.2.3, which might be necessary to reproduce the dist-info.
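
A minimal sketch of the pinned, hashed requirement format that ``hashin``
writes into ``vendor.txt`` (the package, version, and file name below are
illustrative, not taken from this repo):

.. code-block:: python

   import hashlib

   def pin_line(package, version, artifact_path):
       # sha256 of the downloaded sdist/wheel, in pip's --hash format
       digest = hashlib.sha256(open(artifact_path, "rb").read()).hexdigest()
       return "%s==%s --hash=sha256:%s" % (package, version, digest)

   # e.g. pin_line("html5lib", "1.1", "html5lib-1.1-py2.py3-none-any.whl")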
0
venv/Lib/site-packages/bleach/_vendor/__init__.py
Normal file
Binary file not shown.
@@ -0,0 +1,66 @@
Credits
=======

``html5lib`` is written and maintained by:

- James Graham
- Sam Sneddon
- Łukasz Langa
- Will Kahn-Greene


Patches and suggestions
-----------------------
(In chronological order, by first commit:)

- Anne van Kesteren
- Lachlan Hunt
- lantis63
- Sam Ruby
- Thomas Broyer
- Tim Fletcher
- Mark Pilgrim
- Ryan King
- Philip Taylor
- Edward Z. Yang
- fantasai
- Philip Jägenstedt
- Ms2ger
- Mohammad Taha Jahangir
- Andy Wingo
- Andreas Madsack
- Karim Valiev
- Juan Carlos Garcia Segovia
- Mike West
- Marc DM
- Simon Sapin
- Michael[tm] Smith
- Ritwik Gupta
- Marc Abramowitz
- Tony Lopes
- lilbludevil
- Kevin
- Drew Hubl
- Austin Kumbera
- Jim Baker
- Jon Dufresne
- Donald Stufft
- Alex Gaynor
- Nik Nyby
- Jakub Wilk
- Sigmund Cherem
- Gabi Davar
- Florian Mounier
- neumond
- Vitalik Verhovodov
- Kovid Goyal
- Adam Chainz
- John Vandenberg
- Eric Amorde
- Benedikt Morbach
- Jonathan Vanasco
- Tom Most
- Ville Skyttä
- Hugo van Kemenade
- Mark Vasilkov
@@ -0,0 +1 @@
pip
@@ -0,0 +1,552 @@
Metadata-Version: 2.1
Name: html5lib
Version: 1.1
Summary: HTML parser based on the WHATWG HTML specification
Home-page: https://github.com/html5lib/html5lib-python
Maintainer: James Graham
Maintainer-email: james@hoppipolla.co.uk
License: MIT License
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Text Processing :: Markup :: HTML
Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*
Requires-Dist: six (>=1.9)
Requires-Dist: webencodings
Provides-Extra: all
Requires-Dist: genshi ; extra == 'all'
Requires-Dist: chardet (>=2.2) ; extra == 'all'
Requires-Dist: lxml ; (platform_python_implementation == 'CPython') and extra == 'all'
Provides-Extra: chardet
Requires-Dist: chardet (>=2.2) ; extra == 'chardet'
Provides-Extra: genshi
Requires-Dist: genshi ; extra == 'genshi'
Provides-Extra: lxml
Requires-Dist: lxml ; (platform_python_implementation == 'CPython') and extra == 'lxml'

html5lib
========

.. image:: https://travis-ci.org/html5lib/html5lib-python.svg?branch=master
    :target: https://travis-ci.org/html5lib/html5lib-python


html5lib is a pure-python library for parsing HTML. It is designed to
conform to the WHATWG HTML specification, as is implemented by all major
web browsers.


Usage
-----

Simple usage follows this pattern:

.. code-block:: python

  import html5lib
  with open("mydocument.html", "rb") as f:
      document = html5lib.parse(f)

or:

.. code-block:: python

  import html5lib
  document = html5lib.parse("<p>Hello World!")

By default, the ``document`` will be an ``xml.etree`` element instance.
Whenever possible, html5lib chooses the accelerated ``ElementTree``
implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).

Two other tree types are supported: ``xml.dom.minidom`` and
``lxml.etree``. To use an alternative format, specify the name of
a treebuilder:

.. code-block:: python

  import html5lib
  with open("mydocument.html", "rb") as f:
      lxml_etree_document = html5lib.parse(f, treebuilder="lxml")

When using with ``urllib2`` (Python 2), the charset from HTTP should be
passed into html5lib as follows:

.. code-block:: python

  from contextlib import closing
  from urllib2 import urlopen
  import html5lib

  with closing(urlopen("http://example.com/")) as f:
      document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))

When using with ``urllib.request`` (Python 3), the charset from HTTP
should be passed into html5lib as follows:

.. code-block:: python

  from urllib.request import urlopen
  import html5lib

  with urlopen("http://example.com/") as f:
      document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())

To have more control over the parser, create a parser object explicitly.
For instance, to make the parser raise exceptions on parse errors, use:

.. code-block:: python

  import html5lib
  with open("mydocument.html", "rb") as f:
      parser = html5lib.HTMLParser(strict=True)
      document = parser.parse(f)

When you're instantiating parser objects explicitly, pass a treebuilder
class as the ``tree`` keyword argument to use an alternative document
format:

.. code-block:: python

  import html5lib
  parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
  minidom_document = parser.parse("<p>Hello World!")

More documentation is available at https://html5lib.readthedocs.io/.


Installation
------------

html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install:

.. code-block:: bash

    $ pip install html5lib

The goal is to support a (non-strict) superset of the versions that `pip
supports
<https://pip.pypa.io/en/stable/installing/#python-and-os-compatibility>`_.

Optional Dependencies
---------------------

The following third-party libraries may be used for additional
functionality:

- ``lxml`` is supported as a tree format (for both building and
  walking) under CPython (but *not* PyPy where it is known to cause
  segfaults);

- ``genshi`` has a treewalker (but not builder); and

- ``chardet`` can be used as a fallback when character encoding cannot
  be determined.


Bugs
----

Please report any bugs on the `issue tracker
<https://github.com/html5lib/html5lib-python/issues>`_.


Tests
-----

Unit tests require the ``pytest`` and ``mock`` libraries and can be
run using the ``py.test`` command in the root directory.

Test data are contained in a separate `html5lib-tests
<https://github.com/html5lib/html5lib-tests>`_ repository and included
as a submodule, thus for git checkouts they must be initialized::

  $ git submodule init
  $ git submodule update

If you have all compatible Python implementations available on your
system, you can run tests on all of them using the ``tox`` utility,
which can be found on PyPI.


Questions?
----------

There's a mailing list available for support on Google Groups,
`html5lib-discuss <http://groups.google.com/group/html5lib-discuss>`_,
though you may get a quicker response asking on IRC in `#whatwg on
irc.freenode.net <http://wiki.whatwg.org/wiki/IRC>`_.

Change Log
----------

1.1
~~~

UNRELEASED

Breaking changes:

* Drop support for Python 3.3. (#358)
* Drop support for Python 3.4. (#421)

Deprecations:

* Deprecate the ``html5lib`` sanitizer (``html5lib.serialize(sanitize=True)`` and
  ``html5lib.filters.sanitizer``). We recommend users migrate to `Bleach
  <https://github.com/mozilla/bleach>`_. Please let us know if Bleach doesn't suffice for your
  use. (#443)

Other changes:

* Try to import from ``collections.abc`` to remove DeprecationWarning and ensure
  ``html5lib`` keeps working in future Python versions. (#403)
* Drop optional ``datrie`` dependency. (#442)


1.0.1
~~~~~

Released on December 7, 2017

Breaking changes:

* Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!)
* Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!)

Features:

* Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most,
  Will Kahn-Greene!)
* Add iframe seamless boolean attribute. (Thank you, Ritwik Gupta!)
* Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!)
* Support Python 3.6. (#333) (Thank you, Jon Dufresne!)
* Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!)
* Improve testing and CI and add code coverage. (#323, #334) (Thank you, Jon
  Dufresne, John Vandenberg, Sam Sneddon, Will Kahn-Greene!)
* Semver-compliant version number.

Bug fixes:

* Add support for setuptools < 18.5 to support environment markers. (Thank you,
  John Vandenberg!)
* Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!)
* Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank
  you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!)
* Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will
  Kahn-Greene!)
* Include license file in generated wheel package. (#350) (Thank you, Jon
  Dufresne!)
* Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!)
* Allow uppercase hex characters in CSS colour check. (#377) (Thank you,
  Komal Dembla, Hugo!)


1.0
~~~

Released and unreleased on December 7, 2017. Badly packaged release.


0.999999999/1.0b10
~~~~~~~~~~~~~~~~~~

Released on July 15, 2016

* Fix attribute order going to the tree builder to be document order
  instead of reverse document order(!).


0.99999999/1.0b9
~~~~~~~~~~~~~~~~

Released on July 14, 2016

* **Added ordereddict as a mandatory dependency on Python 2.6.**

* Added ``lxml``, ``genshi``, ``datrie``, ``charade``, and ``all``
  extras that will do the right thing based on the specific
  interpreter implementation.

* Now requires the ``mock`` package for the testsuite.

* Cease supporting DATrie under PyPy.

* **Remove PullDOM support, as this hasn't ever been properly
  tested, doesn't entirely work, and as far as I can tell is
  completely unused by anyone.**

* Move testsuite to ``py.test``.

* **Fix #124: move to webencodings for decoding the input byte stream;
  this makes html5lib compliant with the Encoding Standard, and
  introduces a required dependency on webencodings.**

* **Cease supporting Python 3.2 (in both CPython and PyPy forms).**

* **Fix comments containing double-dash with lxml 3.5 and above.**

* **Use scripting disabled by default (as we don't implement
  scripting).**

* **Fix #11, avoiding the XSS bug potentially caused by serializer
  allowing attribute values to be escaped out of in old browser versions,
  changing the quote_attr_values option on serializer to take one of
  three values, "always" (the old True value), "legacy" (the new option,
  and the new default), and "spec" (the old False value, and the old
  default).**

* **Fix #72 by rewriting the sanitizer to apply only to treewalkers
  (instead of the tokenizer); as such, this will require amending all
  callers of it to use it via the treewalker API.**

* **Drop support of charade, now that chardet is supported once more.**

* **Replace the charset keyword argument on parse and related methods
  with a set of keyword arguments: override_encoding, transport_encoding,
  same_origin_parent_encoding, likely_encoding, and default_encoding.**

* **Move filters._base, treebuilder._base, and treewalkers._base to .base
  to clarify their status as public.**

* **Get rid of the sanitizer package. Merge sanitizer.sanitize into the
  sanitizer.htmlsanitizer module and move that to sanitizer. This means
  anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no
  code changes.**

* **Rename treewalkers.lxmletree to .etree_lxml and
  treewalkers.genshistream to .genshi to have a consistent API.**

* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer,
  utils) to be underscore prefixed to clarify their status as private.


0.9999999/1.0b8
~~~~~~~~~~~~~~~

Released on September 10, 2015

* Fix #195: fix the sanitizer to drop broken URLs (it threw an
  exception between 0.9999 and 0.999999).


0.999999/1.0b7
~~~~~~~~~~~~~~

Released on July 7, 2015

* Fix #189: fix the sanitizer to allow relative URLs again (as it did
  prior to 0.9999/1.0b5).


0.99999/1.0b6
~~~~~~~~~~~~~

Released on April 30, 2015

* Fix #188: fix the sanitizer to not throw an exception when sanitizing
  bogus data URLs.


0.9999/1.0b5
~~~~~~~~~~~~

Released on April 29, 2015

* Fix #153: Sanitizer fails to treat some attributes as URLs. Despite how
  this sounds, this has no known security implications. No known version
  of IE (5.5 to current), Firefox (3 to current), Safari (6 to current),
  Chrome (1 to current), or Opera (12 to current) will run any script
  provided in these attributes.

* Pass error message to the ParseError exception in strict parsing mode.

* Allow data URIs in the sanitizer, with a whitelist of content-types.

* Add support for Python implementations that don't support lone
  surrogates (read: Jython). Fixes #2.

* Remove localization of error messages. This functionality was totally
  unused (and untested that everything was localizable), so we may as
  well follow numerous browsers in not supporting translating technical
  strings.

* Expose treewalkers.pprint as a public API.

* Add a documentEncoding property to HTML5Parser, fix #121.


0.999
~~~~~

Released on December 23, 2013

* Fix #127: add work-around for CPython issue #20007: .read(0) on
  http.client.HTTPResponse drops the rest of the content.

* Fix #115: lxml treewalker can now deal with fragments containing, at
  their root level, text nodes with non-ASCII characters on Python 2.


0.99
~~~~

Released on September 10, 2013

* No library changes from 1.0b3; released as 0.99 as pip has changed
  behaviour from 1.4 to avoid installing pre-release versions per
  PEP 440.


1.0b3
~~~~~

Released on July 24, 2013

* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any
  implementation using it should be moved to
  ``NonRecursiveTreeWalker``, as everything bundled with html5lib has
  for years.

* Fix #67 so that ``BufferedStream`` correctly returns a bytes
  object, thereby fixing any case where html5lib is passed a
  non-seekable RawIOBase-like object.


1.0b2
~~~~~

Released on June 27, 2013

* Removed reordering of attributes within the serializer. There is now
  an ``alphabetical_attributes`` option which preserves the previous
  behaviour through a new filter. This allows attribute order to be
  preserved through html5lib if the tree builder preserves order.

* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
  ``treeadapters.sax.to_sax`` which is generic and supports any
  treewalker; it also resolves all known bugs with ``dom2sax``.

* Fix treewalker assertions on hitting bytes strings on
  Python 2. Previous to 1.0b1, treewalkers coped with mixed
  bytes/unicode data on Python 2; this reintroduces this prior
  behaviour on Python 2. Behaviour is unchanged on Python 3.


1.0b1
~~~~~

Released on May 17, 2013

* Implementation updated to implement the `HTML specification
  <http://www.whatwg.org/specs/web-apps/current-work/>`_ as of 5th May
  2013 (`SVN <http://svn.whatwg.org/webapps/>`_ revision r7867).

* Python 3.2+ supported in a single codebase using the ``six`` library.

* Removed support for Python 2.5 and older.

* Removed the deprecated Beautiful Soup 3 treebuilder.
  ``beautifulsoup4`` can use ``html5lib`` as a parser instead. Note that
  since it doesn't support namespaces, foreign content like SVG and
  MathML is parsed incorrectly.

* Removed ``simpletree`` from the package. The default tree builder is
  now ``etree`` (using the ``xml.etree.cElementTree`` implementation if
  available, and ``xml.etree.ElementTree`` otherwise).

* Removed the ``XHTMLSerializer`` as it never actually guaranteed its
  output was well-formed XML, and hence provided little of use.

* Removed default DOM treebuilder, so ``html5lib.treebuilders.dom`` is no
  longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will
  return the default DOM treebuilder, which uses ``xml.dom.minidom``.

* Optional heuristic character encoding detection now based on
  ``charade`` for Python 2.6 - 3.3 compatibility.

* Optional ``Genshi`` treewalker support fixed.

* Many bugfixes, including:

  * #33: null in attribute value breaks XML AttValue;

  * #4: nested, indirect descendant, <button> causes infinite loop;

  * `Google Code 215
    <http://code.google.com/p/html5lib/issues/detail?id=215>`_: Properly
    detect seekable streams;

  * `Google Code 206
    <http://code.google.com/p/html5lib/issues/detail?id=206>`_: add
    support for <video preload=...>, <audio preload=...>;

  * `Google Code 205
    <http://code.google.com/p/html5lib/issues/detail?id=205>`_: add
    support for <video poster=...>;

  * `Google Code 202
    <http://code.google.com/p/html5lib/issues/detail?id=202>`_: Unicode
    file breaks InputStream.

* Source code is now mostly PEP 8 compliant.

* Test harness has been improved and now depends on ``nose``.

* Documentation updated and moved to https://html5lib.readthedocs.io/.


0.95
~~~~

Released on February 11, 2012


0.90
~~~~

Released on January 17, 2010


0.11.1
~~~~~~

Released on June 12, 2008


0.11
~~~~

Released on June 10, 2008


0.10
~~~~

Released on October 7, 2007


0.9
~~~

Released on March 11, 2007


0.2
~~~

Released on January 8, 2007
@@ -0,0 +1,41 @@
html5lib-1.1.dist-info/AUTHORS.rst,sha256=DrNAMifoDpuQyJn-KW-H6K8Tt2a5rKnV2UF4-DRrGUI,983
html5lib-1.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
html5lib-1.1.dist-info/LICENSE,sha256=FqOZkWGekvGGgJMtoqkZn999ld8-yu3FLqBiGKq6_W8,1084
html5lib-1.1.dist-info/METADATA,sha256=Y3w-nd_22HQnQRy3yypVsV_ke2FF94uUD4-vGpc2DnI,16076
html5lib-1.1.dist-info/RECORD,,
html5lib-1.1.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
html5lib-1.1.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
html5lib-1.1.dist-info/top_level.txt,sha256=XEX6CHpskSmvjJB4tP6m4Q5NYXhIf_0ceMc0PNbzJPQ,9
html5lib/__init__.py,sha256=pWnYcfZ69wNLrdQL7bpr49FUi8O8w0KhKCOHsyRgYGQ,1143
html5lib/_ihatexml.py,sha256=ifOwF7pXqmyThIXc3boWc96s4MDezqRrRVp7FwDYUFs,16728
html5lib/_inputstream.py,sha256=IKuMiY8rzb7pqIGCpbvTqsxysLEpgEHWYvYEFu4LUAI,32300
html5lib/_tokenizer.py,sha256=WvJQa2Mli4NtTmhLXkX8Jy5FcWttqCaiDTiKyaw8D-k,77028
html5lib/_trie/__init__.py,sha256=nqfgO910329BEVJ5T4psVwQtjd2iJyEXQ2-X8c1YxwU,109
html5lib/_trie/_base.py,sha256=CaybYyMro8uERQYjby2tTeSUatnWDfWroUN9N7ety5w,1013
html5lib/_trie/py.py,sha256=zg7RZSHxJ8mLmuI_7VEIV8AomISrgkvqCP477AgXaG0,1763
html5lib/_utils.py,sha256=AxAJSG15eyarCgKMnlUwzs1X6jFHXqEvhlYEOxAFmis,4919
html5lib/constants.py,sha256=Ll-yzLU_jcjyAI_h57zkqZ7aQWE5t5xA4y_jQgoUUhw,83464
html5lib/filters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
html5lib/filters/alphabeticalattributes.py,sha256=lViZc2JMCclXi_5gduvmdzrRxtO5Xo9ONnbHBVCsykU,919
html5lib/filters/base.py,sha256=z-IU9ZAYjpsVsqmVt7kuWC63jR11hDMr6CVrvuao8W0,286
html5lib/filters/inject_meta_charset.py,sha256=egDXUEHXmAG9504xz0K6ALDgYkvUrC2q15YUVeNlVQg,2945
html5lib/filters/lint.py,sha256=upXATs6By7cot7o0bnNqR15sPq2Fn6Vnjvoy3gyO_rY,3631
html5lib/filters/optionaltags.py,sha256=8lWT75J0aBOHmPgfmqTHSfPpPMp01T84NKu0CRedxcE,10588
html5lib/filters/sanitizer.py,sha256=XGNSdzIqDTaHot1V-rRj1V_XOolApJ7n95tHP9JcgNU,26885
html5lib/filters/whitespace.py,sha256=8eWqZxd4UC4zlFGW6iyY6f-2uuT8pOCSALc3IZt7_t4,1214
html5lib/html5parser.py,sha256=w5hZJh0cvD3g4CS196DiTmuGpSKCMYe1GS46-yf_WZQ,117174
html5lib/serializer.py,sha256=K2kfoLyMPMFPfdusfR30SrxNkf0mJB92-P5_RntyaaI,15747
html5lib/treeadapters/__init__.py,sha256=18hyI-at2aBsdKzpwRwa5lGF1ipgctaTYXoU9En2ZQg,650
html5lib/treeadapters/genshi.py,sha256=CH27pAsDKmu4ZGkAUrwty7u0KauGLCZRLPMzaO3M5vo,1715
html5lib/treeadapters/sax.py,sha256=BKS8woQTnKiqeffHsxChUqL4q2ZR_wb5fc9MJ3zQC8s,1776
html5lib/treebuilders/__init__.py,sha256=AysSJyvPfikCMMsTVvaxwkgDieELD5dfR8FJIAuq7hY,3592
html5lib/treebuilders/base.py,sha256=oeZNGEB-kt90YJGVH05gb5a8E7ids2AbYwGRsVCieWk,14553
html5lib/treebuilders/dom.py,sha256=22whb0C71zXIsai5mamg6qzBEiigcBIvaDy4Asw3at0,8925
html5lib/treebuilders/etree.py,sha256=EbmHx-wQ-11MVucTPtF7Ul92-mQGN3Udu_KfDn-Ifhk,12824
html5lib/treebuilders/etree_lxml.py,sha256=OazDHZGO_q4FnVs4Dhs4hzzn2JwGAOs-rfV8LAlUGW4,14754
html5lib/treewalkers/__init__.py,sha256=OBPtc1TU5mGyy18QDMxKEyYEz0wxFUUNj5v0-XgmYhY,5719
html5lib/treewalkers/base.py,sha256=ouiOsuSzvI0KgzdWP8PlxIaSNs9falhbiinAEc_UIJY,7476
html5lib/treewalkers/dom.py,sha256=EHyFR8D8lYNnyDU9lx_IKigVJRyecUGua0mOi7HBukc,1413
html5lib/treewalkers/etree.py,sha256=gkD4tfEfRWPsEGvgHHJxZmKZXUvBzVVGz3v5C_MIiOE,4539
html5lib/treewalkers/etree_lxml.py,sha256=eLedbn6nPjlpebibsWVijey7WEpzDwxU3ubwUoudBuA,6345
html5lib/treewalkers/genshi.py,sha256=4D2PECZ5n3ZN3qu3jMl9yY7B81jnQApBQSVlfaIuYbA,2309
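
Each ``RECORD`` row above is ``path,sha256=<digest>,size``, where the digest
is an unpadded urlsafe-base64 SHA-256 of the file's bytes. A minimal sketch of
recomputing one entry for verification (the helper name is ours):

.. code-block:: python

   import base64
   import hashlib

   def record_entry(path):
       # path,sha256=<urlsafe-b64 digest without '=' padding>,<size in bytes>
       data = open(path, "rb").read()
       digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest())
       return "%s,sha256=%s,%d" % (path, digest.rstrip(b"=").decode("ascii"), len(data))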
@@ -0,0 +1,6 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.34.2)
Root-Is-Purelib: true
Tag: py2-none-any
Tag: py3-none-any
@@ -0,0 +1 @@
html5lib
35
venv/Lib/site-packages/bleach/_vendor/html5lib/__init__.py
Normal file
@@ -0,0 +1,35 @@
"""
HTML parsing library based on the `WHATWG HTML specification
<https://whatwg.org/html>`_. The parser is designed to be compatible with
existing HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers.

Example usage::

    import html5lib
    with open("my_document.html", "rb") as f:
        tree = html5lib.parse(f)

For convenience, this module re-exports the following names:

* :func:`~.html5parser.parse`
* :func:`~.html5parser.parseFragment`
* :class:`~.html5parser.HTMLParser`
* :func:`~.treebuilders.getTreeBuilder`
* :func:`~.treewalkers.getTreeWalker`
* :func:`~.serializer.serialize`
"""

from __future__ import absolute_import, division, unicode_literals

from .html5parser import HTMLParser, parse, parseFragment
from .treebuilders import getTreeBuilder
from .treewalkers import getTreeWalker
from .serializer import serialize

__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
           "getTreeWalker", "serialize"]

# this has to be at the top level, see how setup.py parses this
#: Distribution version number.
__version__ = "1.1"
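
The re-exports above cover the whole parse/walk/serialize pipeline; a minimal
round-trip sketch, assuming the vendored module is importable as ``html5lib``
(exact output markup may differ in detail):

.. code-block:: python

   import html5lib

   # parse to the default xml.etree tree, then re-emit it as HTML
   doc = html5lib.parse("<p>Hello <em>world</em>!")
   print(html5lib.serialize(doc, tree="etree"))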
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
289
venv/Lib/site-packages/bleach/_vendor/html5lib/_ihatexml.py
Normal file
@@ -0,0 +1,289 @@
from __future__ import absolute_import, division, unicode_literals

import re
import warnings

from .constants import DataLossWarning

baseChar = """
[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""

ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""

combiningCharacter = """
[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
#x3099 | #x309A"""

digit = """
[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""

extender = """
#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
#[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""

letter = " | ".join([baseChar, ideographic])

# Without the
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
                   extender])
nameFirst = " | ".join([letter, "_"])

reChar = re.compile(r"#x([\d|A-F]{4,4})")
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")


def charStringToList(chars):
    charRanges = [item.strip() for item in chars.split(" | ")]
    rv = []
    for item in charRanges:
        foundMatch = False
        for regexp in (reChar, reCharRange):
            match = regexp.match(item)
            if match is not None:
                rv.append([hexToInt(item) for item in match.groups()])
                if len(rv[-1]) == 1:
                    rv[-1] = rv[-1] * 2
                foundMatch = True
                break
        if not foundMatch:
            assert len(item) == 1

            rv.append([ord(item)] * 2)
    rv = normaliseCharList(rv)
    return rv


def normaliseCharList(charList):
    charList = sorted(charList)
    for item in charList:
        assert item[1] >= item[0]
    rv = []
    i = 0
    while i < len(charList):
        j = 1
        rv.append(charList[i])
        while i + j < len(charList) and charList[i + j][0] <= rv[-1][1] + 1:
            rv[-1][1] = charList[i + j][1]
            j += 1
        i += j
    return rv
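# Illustration (not part of the vendored file): sorted, adjacent or
# overlapping [lo, hi] ranges are coalesced into one range, e.g.
#   normaliseCharList([[0x41, 0x5A], [0x5B, 0x60], [0x61, 0x7A]])
#   -> [[0x41, 0x7A]]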

# We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)


def missingRanges(charList):
    rv = []
    if charList[0] != 0:
        rv.append([0, charList[0][0] - 1])
    for i, item in enumerate(charList[:-1]):
        rv.append([item[1] + 1, charList[i + 1][0] - 1])
    if charList[-1][1] != max_unicode:
        rv.append([charList[-1][1] + 1, max_unicode])
    return rv


def listToRegexpStr(charList):
    rv = []
    for item in charList:
        if item[0] == item[1]:
            rv.append(escapeRegexp(chr(item[0])))
        else:
            rv.append(escapeRegexp(chr(item[0])) + "-" +
                      escapeRegexp(chr(item[1])))
    return "[%s]" % "".join(rv)


def hexToInt(hex_str):
    return int(hex_str, 16)


def escapeRegexp(string):
    specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
                         "[", "]", "|", "(", ")", "-")
    for char in specialCharacters:
        string = string.replace(char, "\\" + char)

    return string

# output from the above
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')  # noqa

nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')  # noqa

# Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")


class InfosetFilter(object):
    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")

    def __init__(self,
                 dropXmlnsLocalName=False,
                 dropXmlnsAttrNs=False,
                 preventDoubleDashComments=False,
                 preventDashAtCommentEnd=False,
                 replaceFormFeedCharacters=True,
                 preventSingleQuotePubid=False):

        self.dropXmlnsLocalName = dropXmlnsLocalName
        self.dropXmlnsAttrNs = dropXmlnsAttrNs

        self.preventDoubleDashComments = preventDoubleDashComments
        self.preventDashAtCommentEnd = preventDashAtCommentEnd

        self.replaceFormFeedCharacters = replaceFormFeedCharacters

        self.preventSingleQuotePubid = preventSingleQuotePubid

        self.replaceCache = {}

    def coerceAttribute(self, name, namespace=None):
        if self.dropXmlnsLocalName and name.startswith("xmlns:"):
            warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
            return None
        elif (self.dropXmlnsAttrNs and
              namespace == "http://www.w3.org/2000/xmlns/"):
            warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
            return None
        else:
            return self.toXmlName(name)

    def coerceElement(self, name):
        return self.toXmlName(name)

    def coerceComment(self, data):
        if self.preventDoubleDashComments:
            while "--" in data:
                warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
                data = data.replace("--", "- -")
            if data.endswith("-"):
                warnings.warn("Comments cannot end in a dash", DataLossWarning)
                data += " "
        return data

    def coerceCharacters(self, data):
        if self.replaceFormFeedCharacters:
            for _ in range(data.count("\x0C")):
                warnings.warn("Text cannot contain U+000C", DataLossWarning)
            data = data.replace("\x0C", " ")
        # Other non-xml characters
        return data

    def coercePubid(self, data):
        dataOutput = data
        for char in nonPubidCharRegexp.findall(data):
            warnings.warn("Coercing non-XML pubid", DataLossWarning)
            replacement = self.getReplacementCharacter(char)
            dataOutput = dataOutput.replace(char, replacement)
        if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
            warnings.warn("Pubid cannot contain single quote", DataLossWarning)
            dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
        return dataOutput

    def toXmlName(self, name):
        nameFirst = name[0]
        nameRest = name[1:]
        m = nonXmlNameFirstBMPRegexp.match(nameFirst)
        if m:
            warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
            nameFirstOutput = self.getReplacementCharacter(nameFirst)
        else:
            nameFirstOutput = nameFirst

        nameRestOutput = nameRest
        replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
        for char in replaceChars:
            warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
            replacement = self.getReplacementCharacter(char)
            nameRestOutput = nameRestOutput.replace(char, replacement)
        return nameFirstOutput + nameRestOutput

    def getReplacementCharacter(self, char):
        if char in self.replaceCache:
            replacement = self.replaceCache[char]
        else:
            replacement = self.escapeChar(char)
        return replacement

    def fromXmlName(self, name):
        for item in set(self.replacementRegexp.findall(name)):
            name = name.replace(item, self.unescapeChar(item))
        return name

    def escapeChar(self, char):
        replacement = "U%05X" % ord(char)
        self.replaceCache[char] = replacement
        return replacement

    def unescapeChar(self, charcode):
        return chr(int(charcode[1:], 16))
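
``InfosetFilter`` coerces HTML-legal but XML-illegal constructs into XML-safe
ones using reversible ``U`` + five-hex-digit escapes. A quick behavioral
sketch (the import path follows the vendored layout above):

.. code-block:: python

   from bleach._vendor.html5lib._ihatexml import InfosetFilter

   f = InfosetFilter(preventDoubleDashComments=True)
   f.coerceComment("a--b")     # 'a- -b': "--" is illegal inside XML comments
   name = f.toXmlName("1bad")  # 'U00031bad': leading digit escaped
   f.fromXmlName(name)         # '1bad': the escape round-trips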
918
venv/Lib/site-packages/bleach/_vendor/html5lib/_inputstream.py
Normal file
@@ -0,0 +1,918 @@
from __future__ import absolute_import, division, unicode_literals

from six import text_type
from six.moves import http_client, urllib

import codecs
import re
from io import BytesIO, StringIO

import webencodings

from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import _ReparseException
from . import _utils

# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])


invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"  # noqa

if _utils.supports_lone_surrogates:
    # Use one extra step of indirection and create surrogates with
    # eval. Not using this indirection would introduce an illegal
    # unicode literal on platforms not supporting such lone
    # surrogates.
    assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
                                    eval('"\\uD800-\\uDFFF"') +  # pylint:disable=eval-used
                                    "]")
else:
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)

non_bmp_invalid_codepoints = {0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                              0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
                              0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
                              0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
                              0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                              0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                              0x10FFFE, 0x10FFFF}

ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")

# Cache for charsUntil()
charsUntilRegEx = {}


class BufferedStream(object):
    """Buffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    """

    def __init__(self, stream):
        self.stream = stream
        self.buffer = []
        self.position = [-1, 0]  # chunk number, offset

    def tell(self):
        pos = 0
        for chunk in self.buffer[:self.position[0]]:
            pos += len(chunk)
        pos += self.position[1]
        return pos

    def seek(self, pos):
        assert pos <= self._bufferedBytes()
        offset = pos
        i = 0
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) and
              self.position[1] == len(self.buffer[-1])):
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        return sum([len(item) for item in self.buffer])

    def _readStream(self, bytes):
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]

            if remainingBytes <= len(bufferedData) - bufferOffset:
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            bufferOffset = 0

        if remainingBytes:
            rv.append(self._readStream(remainingBytes))

        return b"".join(rv)
|
||||
|
||||
|
||||
def HTMLInputStream(source, **kwargs):
|
||||
# Work around Python bug #20007: read(0) closes the connection.
|
||||
# http://bugs.python.org/issue20007
|
||||
if (isinstance(source, http_client.HTTPResponse) or
|
||||
# Also check for addinfourl wrapping HTTPResponse
|
||||
(isinstance(source, urllib.response.addbase) and
|
||||
isinstance(source.fp, http_client.HTTPResponse))):
|
||||
isUnicode = False
|
||||
elif hasattr(source, "read"):
|
||||
isUnicode = isinstance(source.read(0), text_type)
|
||||
else:
|
||||
isUnicode = isinstance(source, text_type)
|
||||
|
||||
if isUnicode:
|
||||
encodings = [x for x in kwargs if x.endswith("_encoding")]
|
||||
if encodings:
|
||||
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
|
||||
|
||||
return HTMLUnicodeInputStream(source, **kwargs)
|
||||
else:
|
||||
return HTMLBinaryInputStream(source, **kwargs)
|
||||
|
||||
|
||||
class HTMLUnicodeInputStream(object):
|
||||
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
||||
|
||||
This class takes care of character encoding and removing or replacing
|
||||
incorrect byte-sequences and also provides column and line tracking.
|
||||
|
||||
"""
|
||||
|
||||
_defaultChunkSize = 10240
|
||||
|
||||
def __init__(self, source):
|
||||
"""Initialises the HTMLInputStream.
|
||||
|
||||
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||
for use by html5lib.
|
||||
|
||||
source can be either a file-object, local filename or a string.
|
||||
|
||||
The optional encoding parameter must be a string that indicates
|
||||
the encoding. If specified, that encoding will be used,
|
||||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
|
||||
"""
|
||||
|
||||
if not _utils.supports_lone_surrogates:
|
||||
# Such platforms will have already checked for such
|
||||
# surrogate errors, so no need to do this checking.
|
||||
self.reportCharacterErrors = None
|
||||
elif len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2

        # List of where new lines occur
        self.newLines = [0]

        self.charEncoding = (lookupEncoding("utf-8"), "certain")
        self.dataStream = self.openStream(source)

        self.reset()

    def reset(self):
        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0

        # Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = StringIO(source)

        return stream

    def _position(self, offset):
        chunk = self.chunk
        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            positionColumn = self.prevNumCols + offset
        else:
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        line, col = self._position(self.chunkOffset)
        return (line + 1, col)

    def char(self):
        """ Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        """
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

        return char

    def readChunk(self, chunkSize=None):
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False

        if len(data) > 1:
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        if self.reportCharacterErrors:
            self.reportCharacterErrors(data)

        # Replace invalid characters
        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True

    def characterErrorsUCS4(self, data):
        for _ in range(len(invalid_unicode_re.findall(data))):
            self.errors.append("invalid-codepoint")

    def characterErrorsUCS2(self, data):
        # Someone picked the wrong compile option
        # You lose
        skip = False
        for match in invalid_unicode_re.finditer(data):
            if skip:
                # this match is the low half of a surrogate pair that was
                # already handled; reset the flag and move on
                skip = False
                continue
            codepoint = ord(match.group())
            pos = match.start()
            # Pretty sure there should be endianness issues here
            if _utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
                char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                skip = True
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                self.errors.append("invalid-codepoint")
            else:
                skip = False
                self.errors.append("invalid-codepoint")

    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
        """

        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                for c in characters:
                    assert(ord(c) < 128)
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
                regex = "^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
            # If the whole remainder of the chunk matched,
            # use it all and read the next chunk
            rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = "".join(rv)
        return r

    def unget(self, char):
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not EOF:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char


class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    def __init__(self, source, override_encoding=None, transport_encoding=None,
                 same_origin_parent_encoding=None, likely_encoding=None,
                 default_encoding="windows-1252", useChardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        """
        # Raw Stream - for unicode objects this will encode to utf-8 and set
        # self.charEncoding as appropriate
        self.rawStream = self.openStream(source)

        HTMLUnicodeInputStream.__init__(self, self.rawStream)

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 1024
        # Number of bytes to use when detecting encoding using chardet
        self.numBytesChardet = 100
        # Things from args
        self.override_encoding = override_encoding
        self.transport_encoding = transport_encoding
        self.same_origin_parent_encoding = same_origin_parent_encoding
        self.likely_encoding = likely_encoding
        self.default_encoding = default_encoding

        # Determine encoding
        self.charEncoding = self.determineEncoding(useChardet)
        assert self.charEncoding[0] is not None

        # Call superclass
        self.reset()

    def reset(self):
        self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)

        try:
            stream.seek(stream.tell())
        except Exception:
            stream = BufferedStream(stream)

        return stream

    def determineEncoding(self, chardet=True):
        # BOMs take precedence over everything
        # This will also read past the BOM if present
        charEncoding = self.detectBOM(), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # If we've been overridden, we've been overridden
        charEncoding = lookupEncoding(self.override_encoding), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # Now check the transport layer
        charEncoding = lookupEncoding(self.transport_encoding), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # Look for meta elements with encoding information
        charEncoding = self.detectEncodingMeta(), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Parent document encoding
        charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
        if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
            return charEncoding

        # "likely" encoding
        charEncoding = lookupEncoding(self.likely_encoding), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Guess with chardet, if available
        if chardet:
            try:
                from chardet.universaldetector import UniversalDetector
            except ImportError:
                pass
            else:
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = lookupEncoding(detector.result['encoding'])
                self.rawStream.seek(0)
                if encoding is not None:
                    return encoding, "tentative"

        # Try the default encoding
        charEncoding = lookupEncoding(self.default_encoding), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Fallback to html5lib's default if even that hasn't worked
        return lookupEncoding("windows-1252"), "tentative"

    def changeEncoding(self, newEncoding):
        assert self.charEncoding[1] != "certain"
        newEncoding = lookupEncoding(newEncoding)
        if newEncoding is None:
            return
        if newEncoding.name in ("utf-16be", "utf-16le"):
            newEncoding = lookupEncoding("utf-8")
            assert newEncoding is not None
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            self.rawStream.seek(0)
            self.charEncoding = (newEncoding, "certain")
            self.reset()
            raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))

    def detectBOM(self):
        """Attempts to detect a BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
            codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
        }

        # Go to beginning of file and read in 4 bytes
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])  # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)  # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2])  # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        if encoding:
            self.rawStream.seek(seek)
            return lookupEncoding(encoding)
        else:
            self.rawStream.seek(0)
            return None

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        """
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
            encoding = lookupEncoding("utf-8")

        return encoding


class EncodingBytes(bytes):
    """String-like object with an associated position and various extra methods.
    If the position is ever greater than the string length then an exception is
    raised"""
    def __new__(self, value):
        assert isinstance(value, bytes)
        return bytes.__new__(self, value.lower())

    def __init__(self, value):
        # pylint:disable=unused-argument
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        p = self._position = self._position + 1
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        return self[p:p + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        p = self._position
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        self._position = p = p - 1
        return self[p:p + 1]

    def setPosition(self, position):
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    def getPosition(self):
        if self._position >= len(self):
            raise StopIteration
        if self._position >= 0:
            return self._position
        else:
            return None

    position = property(getPosition, setPosition)

    def getCurrentByte(self):
        return self[self.position:self.position + 1]

    currentByte = property(getCurrentByte)

    def skip(self, chars=spaceCharactersBytes):
        """Skip past a list of characters"""
        p = self.position  # use property for the error-checking
        while p < len(self):
            c = self[p:p + 1]
            if c not in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def skipUntil(self, chars):
        p = self.position
        while p < len(self):
            c = self[p:p + 1]
            if c in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def matchBytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        rv = self.startswith(bytes, self.position)
        if rv:
            self.position += len(bytes)
        return rv

    def jumpTo(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match"""
        try:
            self._position = self.index(bytes, self.position) + len(bytes) - 1
        except ValueError:
            raise StopIteration
        return True


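# A minimal usage sketch of EncodingBytes (illustrative only; the byte
# string below is made up, not part of the vendored file):
#
#     data = EncodingBytes(b"<META Charset=UTF-8>")  # lowercased on construction
#     data.jumpTo(b"charset")   # position lands on the last byte of the match
#     data.position += 1
#     data.skip()               # skips any whitespace and returns b"="

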
class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """string - the data to work on for encoding detection"""
        self.data = EncodingBytes(data)
        self.encoding = None

    def getEncoding(self):
        if b"<meta" not in self.data:
            return None

        methodDispatch = (
            (b"<!--", self.handleComment),
            (b"<meta", self.handleMeta),
            (b"</", self.handlePossibleEndTag),
            (b"<!", self.handleOther),
            (b"<?", self.handleOther),
            (b"<", self.handlePossibleStartTag))
        for _ in self.data:
            keepParsing = True
            try:
                self.data.jumpTo(b"<")
            except StopIteration:
                break
            for key, method in methodDispatch:
                if self.data.matchBytes(key):
                    try:
                        keepParsing = method()
                        break
                    except StopIteration:
                        keepParsing = False
                        break
            if not keepParsing:
                break

        return self.encoding

    def handleComment(self):
        """Skip over comments"""
        return self.data.jumpTo(b"-->")

    def handleMeta(self):
        if self.data.currentByte not in spaceCharactersBytes:
            # <meta is not followed by a space, so just keep going
            return True
        # We have a valid meta element we want to search for attributes
        hasPragma = False
        pendingEncoding = None
        while True:
            # Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            else:
                if attr[0] == b"http-equiv":
                    hasPragma = attr[1] == b"content-type"
                    if hasPragma and pendingEncoding is not None:
                        self.encoding = pendingEncoding
                        return False
                elif attr[0] == b"charset":
                    tentativeEncoding = attr[1]
                    codec = lookupEncoding(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
                elif attr[0] == b"content":
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    if tentativeEncoding is not None:
                        codec = lookupEncoding(tentativeEncoding)
                        if codec is not None:
                            if hasPragma:
                                self.encoding = codec
                                return False
                            else:
                                pendingEncoding = codec

    def handlePossibleStartTag(self):
        return self.handlePossibleTag(False)

    def handlePossibleEndTag(self):
        next(self.data)
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        data = self.data
        if data.currentByte not in asciiLettersBytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to
            # handleOther
            if endTag:
                data.previous()
                self.handleOther()
            return True

        c = data.skipUntil(spacesAngleBrackets)
        if c == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
        else:
            # Read all attributes
            attr = self.getAttribute()
            while attr is not None:
                attr = self.getAttribute()
        return True

    def handleOther(self):
        return self.data.jumpTo(b">")

    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Step 2
        if c in (b">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        # Step 4 attribute name
        while True:
            if c == b"=" and attrName:
                break
            elif c in spaceCharactersBytes:
                # Step 6!
                c = data.skip()
                break
            elif c in (b"/", b">"):
                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                attrName.append(c.lower())
            elif c is None:
                return None
            else:
                attrName.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            data.previous()
            return b"".join(attrName), b""
        # Step 8
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            quoteChar = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quoteChar:
                    next(data)
                    return b"".join(attrName), b"".join(attrValue)
                # 10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                # 10.5
                else:
                    attrValue.append(c)
        elif c == b">":
            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
            return None
        else:
            attrValue.append(c)
        # Step 11
        while True:
            c = next(data)
            if c in spacesAngleBrackets:
                return b"".join(attrName), b"".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
                return None
            else:
                attrValue.append(c)


class ContentAttrParser(object):
    def __init__(self, data):
        assert isinstance(data, bytes)
        self.data = data

    def parse(self):
        try:
            # Check if the attr name is charset
            # otherwise return
            self.data.jumpTo(b"charset")
            self.data.position += 1
            self.data.skip()
            if not self.data.currentByte == b"=":
                # If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            # Look for an encoding between matching quote marks
            if self.data.currentByte in (b'"', b"'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
                if self.data.jumpTo(quoteMark):
                    return self.data[oldPosition:self.data.position]
                else:
                    return None
            else:
                # Unquoted value
                oldPosition = self.data.position
                try:
                    self.data.skipUntil(spaceCharactersBytes)
                    return self.data[oldPosition:self.data.position]
                except StopIteration:
                    # Return the whole remaining value
                    return self.data[oldPosition:]
        except StopIteration:
            return None


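# For illustration, parsing a typical pragma value (hypothetical input,
# not part of the vendored file):
#
#     ContentAttrParser(EncodingBytes(b"text/html; charset=utf-8")).parse()
#     # returns b"utf-8"

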
def lookupEncoding(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    if isinstance(encoding, bytes):
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            return None

    if encoding is not None:
        try:
            return webencodings.lookup(encoding)
        except AttributeError:
            return None
    else:
        return None
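A quick sketch of lookupEncoding's behaviour, assuming the vendored webencodings module resolves WHATWG encoding labels (illustrative):

    >>> lookupEncoding("UTF8").name
    'utf-8'
    >>> lookupEncoding(b"latin1").name
    'windows-1252'
    >>> lookupEncoding("no-such-encoding") is None
    True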
1735
venv/Lib/site-packages/bleach/_vendor/html5lib/_tokenizer.py
Normal file
File diff suppressed because it is too large
@@ -0,0 +1,5 @@
from __future__ import absolute_import, division, unicode_literals

from .py import Trie

__all__ = ["Trie"]
@@ -0,0 +1,40 @@
from __future__ import absolute_import, division, unicode_literals

try:
    from collections.abc import Mapping
except ImportError:  # Python 2.7
    from collections import Mapping


class Trie(Mapping):
    """Abstract base class for tries"""

    def keys(self, prefix=None):
        # pylint:disable=arguments-differ
        keys = super(Trie, self).keys()

        if prefix is None:
            return set(keys)

        return {x for x in keys if x.startswith(prefix)}

    def has_keys_with_prefix(self, prefix):
        for key in self.keys():
            if key.startswith(prefix):
                return True

        return False

    def longest_prefix(self, prefix):
        if prefix in self:
            return prefix

        for i in range(1, len(prefix) + 1):
            if prefix[:-i] in self:
                return prefix[:-i]

        raise KeyError(prefix)

    def longest_prefix_item(self, prefix):
        lprefix = self.longest_prefix(prefix)
        return (lprefix, self[lprefix])
67
venv/Lib/site-packages/bleach/_vendor/html5lib/_trie/py.py
Normal file
@@ -0,0 +1,67 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type

from bisect import bisect_left

from ._base import Trie as ABCTrie


class Trie(ABCTrie):
    def __init__(self, data):
        if not all(isinstance(x, text_type) for x in data.keys()):
            raise TypeError("All keys must be strings")

        self._data = data
        self._keys = sorted(data.keys())
        self._cachestr = ""
        self._cachepoints = (0, len(data))

    def __contains__(self, key):
        return key in self._data

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        return iter(self._data)

    def __getitem__(self, key):
        return self._data[key]

    def keys(self, prefix=None):
        if prefix is None or prefix == "" or not self._keys:
            return set(self._keys)

        if prefix.startswith(self._cachestr):
            lo, hi = self._cachepoints
            start = i = bisect_left(self._keys, prefix, lo, hi)
        else:
            start = i = bisect_left(self._keys, prefix)

        keys = set()
        if start == len(self._keys):
            return keys

        while self._keys[i].startswith(prefix):
            keys.add(self._keys[i])
            i += 1

        self._cachestr = prefix
        self._cachepoints = (start, i)

        return keys

    def has_keys_with_prefix(self, prefix):
        if prefix in self._data:
            return True

        if prefix.startswith(self._cachestr):
            lo, hi = self._cachepoints
            i = bisect_left(self._keys, prefix, lo, hi)
        else:
            i = bisect_left(self._keys, prefix)

        if i == len(self._keys):
            return False

        return self._keys[i].startswith(prefix)
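For reference, a small usage sketch of this Trie with made-up data:

    >>> t = Trie({"abc": 1, "abd": 2, "xyz": 3})
    >>> sorted(t.keys("ab"))
    ['abc', 'abd']
    >>> t.has_keys_with_prefix("x")
    True
    >>> t.longest_prefix_item("abcdef")
    ('abc', 1)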
159
venv/Lib/site-packages/bleach/_vendor/html5lib/_utils.py
Normal file
@@ -0,0 +1,159 @@
from __future__ import absolute_import, division, unicode_literals

from types import ModuleType

try:
    from collections.abc import Mapping
except ImportError:
    from collections import Mapping

from six import text_type, PY3

if PY3:
    import xml.etree.ElementTree as default_etree
else:
    try:
        import xml.etree.cElementTree as default_etree
    except ImportError:
        import xml.etree.ElementTree as default_etree


__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
           "surrogatePairToCodepoint", "moduleFactoryFactory",
           "supports_lone_surrogates"]


# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
# caught by the below test. In general this would be any platform
# using UTF-16 as its encoding of unicode strings, such as
# Jython. This is because UTF-16 itself is based on the use of such
# surrogates, and there is no mechanism to further escape such
# escapes.
try:
    _x = eval('"\\uD800"')  # pylint:disable=eval-used
    if not isinstance(_x, text_type):
        # We need this with u"" because of http://bugs.jython.org/issue2039
        _x = eval('u"\\uD800"')  # pylint:disable=eval-used
        assert isinstance(_x, text_type)
except Exception:
    supports_lone_surrogates = False
else:
    supports_lone_surrogates = True


class MethodDispatcher(dict):
    """Dict with 2 special properties:

    On initiation, keys that are lists, sets or tuples are converted to
    multiple keys so accessing any one of the items in the original
    list-like object returns the matching value

    md = MethodDispatcher({("foo", "bar"):"baz"})
    md["foo"] == "baz"

    A default value which can be set through the default attribute.
    """

    def __init__(self, items=()):
        _dictEntries = []
        for name, value in items:
            if isinstance(name, (list, tuple, frozenset, set)):
                for item in name:
                    _dictEntries.append((item, value))
            else:
                _dictEntries.append((name, value))
        dict.__init__(self, _dictEntries)
        assert len(self) == len(_dictEntries)
        self.default = None

    def __getitem__(self, key):
        return dict.get(self, key, self.default)

    def __get__(self, instance, owner=None):
        return BoundMethodDispatcher(instance, self)


class BoundMethodDispatcher(Mapping):
    """Wraps a MethodDispatcher, binding its return values to `instance`"""
    def __init__(self, instance, dispatcher):
        self.instance = instance
        self.dispatcher = dispatcher

    def __getitem__(self, key):
        # see https://docs.python.org/3/reference/datamodel.html#object.__get__
        # on a function, __get__ is used to bind a function to an
        # instance as a bound method
        return self.dispatcher[key].__get__(self.instance)

    def get(self, key, default):
        if key in self.dispatcher:
            return self[key]
        else:
            return default

    def __iter__(self):
        return iter(self.dispatcher)

    def __len__(self):
        return len(self.dispatcher)

    def __contains__(self, key):
        return key in self.dispatcher


# Some utility functions to deal with weirdness around UCS2 vs UCS4
# python builds

def isSurrogatePair(data):
    return (len(data) == 2 and
            ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
            ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)


def surrogatePairToCodepoint(data):
    char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
                (ord(data[1]) - 0xDC00))
    return char_val
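# Worked example (illustrative): U+1F600 is the UTF-16 surrogate pair
# D83D DE00, so surrogatePairToCodepoint("\uD83D\uDE00") computes
#     0x10000 + (0xD83D - 0xD800) * 0x400 + (0xDE00 - 0xDC00)
#   = 0x10000 + 0x3D * 0x400 + 0x200 = 0x1F600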

# Module Factory Factory (no, this isn't Java, I know)
# Here to stop this being duplicated all over the place.


def moduleFactoryFactory(factory):
    moduleCache = {}

    def moduleFactory(baseModule, *args, **kwargs):
        if isinstance(ModuleType.__name__, type("")):
            name = "_%s_factory" % baseModule.__name__
        else:
            name = b"_%s_factory" % baseModule.__name__

        kwargs_tuple = tuple(kwargs.items())

        try:
            return moduleCache[name][args][kwargs_tuple]
        except KeyError:
            mod = ModuleType(name)
            objs = factory(baseModule, *args, **kwargs)
            mod.__dict__.update(objs)
            # build up the nested cache keyed on name, then args, then kwargs
            if name not in moduleCache:
                moduleCache[name] = {}
            if args not in moduleCache[name]:
                moduleCache[name][args] = {}
            moduleCache[name][args][kwargs_tuple] = mod
            return mod

    return moduleFactory


def memoize(func):
    cache = {}

    def wrapped(*args, **kwargs):
        key = (tuple(args), tuple(kwargs.items()))
        if key not in cache:
            cache[key] = func(*args, **kwargs)
        return cache[key]

    return wrapped
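A small sketch of the memoize decorator above, applied to a hypothetical function:

    @memoize
    def fib(n):
        return n if n < 2 else fib(n - 1) + fib(n - 2)

    fib(100)  # fast: each fib(k) is computed once, then served from the cache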
2946
venv/Lib/site-packages/bleach/_vendor/html5lib/constants.py
Normal file
File diff suppressed because it is too large
@@ -0,0 +1,29 @@
from __future__ import absolute_import, division, unicode_literals

from . import base

from collections import OrderedDict


def _attr_key(attr):
    """Return an appropriate key for an attribute for sorting

    Attributes have a namespace that can be either ``None`` or a string. We
    can't compare the two because they're different types, so we convert
    ``None`` to an empty string first.

    """
    return (attr[0][0] or ''), attr[0][1]


class Filter(base.Filter):
    """Alphabetizes attributes for elements"""
    def __iter__(self):
        for token in base.Filter.__iter__(self):
            if token["type"] in ("StartTag", "EmptyTag"):
                attrs = OrderedDict()
                for name, value in sorted(token["data"].items(),
                                          key=_attr_key):
                    attrs[name] = value
                token["data"] = attrs
            yield token
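A sketch of wiring this filter into serialization; the import paths follow html5lib's public layout and the input markup is made up:

    import html5lib
    from html5lib.filters.alphabeticalattributes import Filter
    from html5lib.serializer import HTMLSerializer

    dom = html5lib.parse('<p b="2" a="1">hi</p>')
    walker = html5lib.getTreeWalker("etree")
    print(HTMLSerializer().render(Filter(walker(dom))))
    # the attributes of <p> are emitted in alphabetical order: a first, then b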
@@ -0,0 +1,12 @@
from __future__ import absolute_import, division, unicode_literals


class Filter(object):
    def __init__(self, source):
        self.source = source

    def __iter__(self):
        return iter(self.source)

    def __getattr__(self, name):
        return getattr(self.source, name)
@@ -0,0 +1,73 @@
from __future__ import absolute_import, division, unicode_literals

from . import base


class Filter(base.Filter):
    """Injects ``<meta charset=ENCODING>`` tag into head of document"""
    def __init__(self, source, encoding):
        """Creates a Filter

        :arg source: the source token stream

        :arg encoding: the encoding to set

        """
        base.Filter.__init__(self, source)
        self.encoding = encoding

    def __iter__(self):
        state = "pre_head"
        meta_found = (self.encoding is None)
        pending = []

        for token in base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag":
                if token["name"].lower() == "head":
                    state = "in_head"

            elif type == "EmptyTag":
                if token["name"].lower() == "meta":
                    # replace charset with actual encoding
                    has_http_equiv_content_type = False
                    for (namespace, name), value in token["data"].items():
                        if namespace is not None:
                            continue
                        elif name.lower() == 'charset':
                            token["data"][(namespace, name)] = self.encoding
                            meta_found = True
                            break
                        elif name == 'http-equiv' and value.lower() == 'content-type':
                            has_http_equiv_content_type = True
                    else:
                        if has_http_equiv_content_type and (None, "content") in token["data"]:
                            token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
                            meta_found = True

                elif token["name"].lower() == "head" and not meta_found:
                    # insert meta into empty head
                    yield {"type": "StartTag", "name": "head",
                           "data": token["data"]}
                    yield {"type": "EmptyTag", "name": "meta",
                           "data": {(None, "charset"): self.encoding}}
                    yield {"type": "EndTag", "name": "head"}
                    meta_found = True
                    continue

            elif type == "EndTag":
                if token["name"].lower() == "head" and pending:
                    # insert meta into head (if necessary) and flush pending queue
                    yield pending.pop(0)
                    if not meta_found:
                        yield {"type": "EmptyTag", "name": "meta",
                               "data": {(None, "charset"): self.encoding}}
                    while pending:
                        yield pending.pop(0)
                    meta_found = True
                    state = "post_head"

            if state == "in_head":
                pending.append(token)
            else:
                yield token
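A sketch of the filter in use (illustrative; import paths follow html5lib's public layout, and the input document is made up):

    import html5lib
    from html5lib.filters.inject_meta_charset import Filter
    from html5lib.serializer import HTMLSerializer

    dom = html5lib.parse("<html><head><title>t</title></head><body>hi</body></html>")
    walker = html5lib.getTreeWalker("etree")
    print(HTMLSerializer().render(Filter(walker(dom), "utf-8")))
    # the serialized head now starts with <meta charset=utf-8>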
@@ -0,0 +1,93 @@
from __future__ import absolute_import, division, unicode_literals

from six import text_type

from . import base
from ..constants import namespaces, voidElements

from ..constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters)


class Filter(base.Filter):
    """Lints the token stream for errors

    If it finds any errors, it'll raise an ``AssertionError``.

    """
    def __init__(self, source, require_matching_tags=True):
        """Creates a Filter

        :arg source: the source token stream

        :arg require_matching_tags: whether or not to require matching tags

        """
        super(Filter, self).__init__(source)
        self.require_matching_tags = require_matching_tags

    def __iter__(self):
        open_elements = []
        for token in base.Filter.__iter__(self):
            type = token["type"]
            if type in ("StartTag", "EmptyTag"):
                namespace = token["namespace"]
                name = token["name"]
                assert namespace is None or isinstance(namespace, text_type)
                assert namespace != ""
                assert isinstance(name, text_type)
                assert name != ""
                assert isinstance(token["data"], dict)
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    assert type == "EmptyTag"
                else:
                    assert type == "StartTag"
                if type == "StartTag" and self.require_matching_tags:
                    open_elements.append((namespace, name))
                for (namespace, name), value in token["data"].items():
                    assert namespace is None or isinstance(namespace, text_type)
                    assert namespace != ""
                    assert isinstance(name, text_type)
                    assert name != ""
                    assert isinstance(value, text_type)

            elif type == "EndTag":
                namespace = token["namespace"]
                name = token["name"]
                assert namespace is None or isinstance(namespace, text_type)
                assert namespace != ""
                assert isinstance(name, text_type)
                assert name != ""
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
                elif self.require_matching_tags:
                    start = open_elements.pop()
                    assert start == (namespace, name)

            elif type == "Comment":
                data = token["data"]
                assert isinstance(data, text_type)

            elif type in ("Characters", "SpaceCharacters"):
                data = token["data"]
                assert isinstance(data, text_type)
                assert data != ""
                if type == "SpaceCharacters":
                    assert data.strip(spaceCharacters) == ""

            elif type == "Doctype":
                name = token["name"]
                assert name is None or isinstance(name, text_type)
                assert token["publicId"] is None or isinstance(token["publicId"], text_type)
                assert token["systemId"] is None or isinstance(token["systemId"], text_type)

            elif type == "Entity":
                assert isinstance(token["name"], text_type)

            elif type == "SerializerError":
                assert isinstance(token["data"], text_type)

            else:
                assert False, "Unknown token type: %(type)s" % {"type": type}

            yield token
@@ -0,0 +1,207 @@
from __future__ import absolute_import, division, unicode_literals

from . import base


class Filter(base.Filter):
    """Removes optional tags from the token stream"""
    def slider(self):
        previous1 = previous2 = None
        for token in self.source:
            if previous1 is not None:
                yield previous2, previous1, token
            previous2 = previous1
            previous1 = token
        if previous1 is not None:
            yield previous2, previous1, None

    def __iter__(self):
        for previous, token, next in self.slider():
            type = token["type"]
            if type == "StartTag":
                if (token["data"] or
                        not self.is_optional_start(token["name"], previous, next)):
                    yield token
            elif type == "EndTag":
                if not self.is_optional_end(token["name"], next):
                    yield token
            else:
                yield token

    def is_optional_start(self, tagname, previous, next):
        type = next and next["type"] or None
        if tagname in 'html':
            # An html element's start tag may be omitted if the first thing
            # inside the html element is not a space character or a comment.
            return type not in ("Comment", "SpaceCharacters")
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
            # XXX: we also omit the start tag if the head element is empty
            if type in ("StartTag", "EmptyTag"):
                return True
            elif type == "EndTag":
                return next["name"] == "head"
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
            # except if the first thing inside the body element is a script
            # or style element and the node immediately preceding the body
            # element is a head element whose end tag has been omitted.
            if type in ("Comment", "SpaceCharacters"):
                return False
            elif type == "StartTag":
                # XXX: we do not look at the preceding event, so we never omit
                # the body element's start tag if it's followed by a script or
                # a style element.
                return next["name"] not in ('script', 'style')
            else:
                return True
        elif tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first thing
            # inside the colgroup element is a col element, and if the element
            # is not immediately preceded by another colgroup element whose
            # end tag has been omitted.
            if type in ("StartTag", "EmptyTag"):
                # XXX: we do not look at the preceding event, so instead we never
                # omit the colgroup element's end tag when it is immediately
                # followed by another colgroup element. See is_optional_end.
                return next["name"] == "col"
            else:
                return False
        elif tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element is
            # not immediately preceded by a tbody, thead, or tfoot element
            # whose end tag has been omitted.
            if type == "StartTag":
                # omit the thead and tfoot elements' end tag when they are
                # immediately followed by a tbody element. See is_optional_end.
                if previous and previous['type'] == 'EndTag' and \
                        previous['name'] in ('tbody', 'thead', 'tfoot'):
                    return False
                return next["name"] == 'tr'
            else:
                return False
        return False

    def is_optional_end(self, tagname, next):
        type = next and next["type"] or None
        if tagname in ('html', 'head', 'body'):
            # An html element's end tag may be omitted if the html element
            # is not immediately followed by a space character or a comment.
            return type not in ("Comment", "SpaceCharacters")
        elif tagname in ('li', 'optgroup', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] == tagname
            else:
                return type == "EndTag" or type is None
        elif tagname in ('dt', 'dd'):
            # A dt element's end tag may be omitted if the dt element is
            # immediately followed by another dt element or a dd element.
            # A dd element's end tag may be omitted if the dd element is
            # immediately followed by another dd element or a dt element,
            # or if there is no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('dt', 'dd')
            elif tagname == 'dd':
                return type == "EndTag" or type is None
            else:
                return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
            # immediately followed by an address, article, aside,
            # blockquote, datagrid, dialog, dir, div, dl, fieldset,
            # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
            # nav, ol, p, pre, section, table, or ul element, or if
            # there is no more content in the parent element.
            if type in ("StartTag", "EmptyTag"):
                return next["name"] in ('address', 'article', 'aside',
                                        'blockquote', 'datagrid', 'dialog',
                                        'dir', 'div', 'dl', 'fieldset', 'footer',
                                        'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                                        'header', 'hr', 'menu', 'nav', 'ol',
                                        'p', 'pre', 'section', 'table', 'ul')
            else:
                return type == "EndTag" or type is None
        elif tagname == 'option':
            # An option element's end tag may be omitted if the option
            # element is immediately followed by another option element,
            # or if it is immediately followed by an optgroup
            # element, or if there is no more content in the parent
            # element.
            if type == "StartTag":
                return next["name"] in ('option', 'optgroup')
            else:
                return type == "EndTag" or type is None
        elif tagname in ('rt', 'rp'):
            # An rt element's end tag may be omitted if the rt element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            # An rp element's end tag may be omitted if the rp element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('rt', 'rp')
            else:
                return type == "EndTag" or type is None
        elif tagname == 'colgroup':
            # A colgroup element's end tag may be omitted if the colgroup
            # element is not immediately followed by a space character or
            # a comment.
            if type in ("Comment", "SpaceCharacters"):
                return False
            elif type == "StartTag":
                # XXX: we also look for an immediately following colgroup
                # element. See is_optional_start.
                return next["name"] != 'colgroup'
            else:
                return True
        elif tagname in ('thead', 'tbody'):
            # A thead element's end tag may be omitted if the thead element
            # is immediately followed by a tbody or tfoot element.
            # A tbody element's end tag may be omitted if the tbody element
            # is immediately followed by a tbody or tfoot element, or if
            # there is no more content in the parent element.
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            if type == "StartTag":
                return next["name"] in ['tbody', 'tfoot']
            elif tagname == 'tbody':
                return type == "EndTag" or type is None
            else:
                return False
        elif tagname == 'tfoot':
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            if type == "StartTag":
                return next["name"] == 'tbody'
            else:
                return type == "EndTag" or type is None
        elif tagname in ('td', 'th'):
            # A td element's end tag may be omitted if the td element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            # A th element's end tag may be omitted if the th element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('td', 'th')
            else:
                return type == "EndTag" or type is None
        return False
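For illustration, this is the filter the serializer applies when its omit_optional_tags option is on; wiring it by hand on made-up input:

    import html5lib
    from html5lib.filters.optionaltags import Filter
    from html5lib.serializer import HTMLSerializer

    dom = html5lib.parse("<table><tr><td>1</td><td>2</td></tr></table>")
    walker = html5lib.getTreeWalker("etree")
    # omit_optional_tags=False stops the serializer from applying the filter
    # a second time; the manual Filter below already drops the omissible tags.
    print(HTMLSerializer(omit_optional_tags=False).render(Filter(walker(dom))))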
@@ -0,0 +1,916 @@
"""Deprecated from html5lib 1.1.

See `here <https://github.com/html5lib/html5lib-python/issues/443>`_ for
information about its deprecation; `Bleach <https://github.com/mozilla/bleach>`_
is recommended as a replacement. Please let us know in the aforementioned issue
if Bleach is unsuitable for your needs.

"""
from __future__ import absolute_import, division, unicode_literals

import re
import warnings
from xml.sax.saxutils import escape, unescape

from six.moves import urllib_parse as urlparse

from . import base
from ..constants import namespaces, prefixes

__all__ = ["Filter"]


_deprecation_msg = (
    "html5lib's sanitizer is deprecated; see " +
    "https://github.com/html5lib/html5lib-python/issues/443 and please let " +
    "us know if Bleach is unsuitable for your needs"
)

warnings.warn(_deprecation_msg, DeprecationWarning)

allowed_elements = frozenset((
    (namespaces['html'], 'a'),
    (namespaces['html'], 'abbr'),
    (namespaces['html'], 'acronym'),
    (namespaces['html'], 'address'),
    (namespaces['html'], 'area'),
    (namespaces['html'], 'article'),
    (namespaces['html'], 'aside'),
    (namespaces['html'], 'audio'),
    (namespaces['html'], 'b'),
    (namespaces['html'], 'big'),
    (namespaces['html'], 'blockquote'),
    (namespaces['html'], 'br'),
    (namespaces['html'], 'button'),
    (namespaces['html'], 'canvas'),
    (namespaces['html'], 'caption'),
    (namespaces['html'], 'center'),
    (namespaces['html'], 'cite'),
    (namespaces['html'], 'code'),
    (namespaces['html'], 'col'),
    (namespaces['html'], 'colgroup'),
    (namespaces['html'], 'command'),
    (namespaces['html'], 'datagrid'),
    (namespaces['html'], 'datalist'),
    (namespaces['html'], 'dd'),
    (namespaces['html'], 'del'),
    (namespaces['html'], 'details'),
    (namespaces['html'], 'dfn'),
    (namespaces['html'], 'dialog'),
    (namespaces['html'], 'dir'),
    (namespaces['html'], 'div'),
    (namespaces['html'], 'dl'),
    (namespaces['html'], 'dt'),
    (namespaces['html'], 'em'),
    (namespaces['html'], 'event-source'),
    (namespaces['html'], 'fieldset'),
    (namespaces['html'], 'figcaption'),
    (namespaces['html'], 'figure'),
    (namespaces['html'], 'footer'),
    (namespaces['html'], 'font'),
    (namespaces['html'], 'form'),
    (namespaces['html'], 'header'),
    (namespaces['html'], 'h1'),
    (namespaces['html'], 'h2'),
    (namespaces['html'], 'h3'),
    (namespaces['html'], 'h4'),
    (namespaces['html'], 'h5'),
    (namespaces['html'], 'h6'),
    (namespaces['html'], 'hr'),
    (namespaces['html'], 'i'),
    (namespaces['html'], 'img'),
    (namespaces['html'], 'input'),
    (namespaces['html'], 'ins'),
    (namespaces['html'], 'keygen'),
    (namespaces['html'], 'kbd'),
    (namespaces['html'], 'label'),
    (namespaces['html'], 'legend'),
    (namespaces['html'], 'li'),
    (namespaces['html'], 'm'),
    (namespaces['html'], 'map'),
    (namespaces['html'], 'menu'),
    (namespaces['html'], 'meter'),
    (namespaces['html'], 'multicol'),
    (namespaces['html'], 'nav'),
    (namespaces['html'], 'nextid'),
    (namespaces['html'], 'ol'),
    (namespaces['html'], 'output'),
    (namespaces['html'], 'optgroup'),
    (namespaces['html'], 'option'),
    (namespaces['html'], 'p'),
    (namespaces['html'], 'pre'),
    (namespaces['html'], 'progress'),
    (namespaces['html'], 'q'),
    (namespaces['html'], 's'),
    (namespaces['html'], 'samp'),
    (namespaces['html'], 'section'),
    (namespaces['html'], 'select'),
    (namespaces['html'], 'small'),
    (namespaces['html'], 'sound'),
    (namespaces['html'], 'source'),
    (namespaces['html'], 'spacer'),
    (namespaces['html'], 'span'),
    (namespaces['html'], 'strike'),
    (namespaces['html'], 'strong'),
    (namespaces['html'], 'sub'),
    (namespaces['html'], 'sup'),
    (namespaces['html'], 'table'),
    (namespaces['html'], 'tbody'),
    (namespaces['html'], 'td'),
    (namespaces['html'], 'textarea'),
    (namespaces['html'], 'time'),
    (namespaces['html'], 'tfoot'),
    (namespaces['html'], 'th'),
    (namespaces['html'], 'thead'),
    (namespaces['html'], 'tr'),
    (namespaces['html'], 'tt'),
    (namespaces['html'], 'u'),
    (namespaces['html'], 'ul'),
    (namespaces['html'], 'var'),
    (namespaces['html'], 'video'),
    (namespaces['mathml'], 'maction'),
    (namespaces['mathml'], 'math'),
    (namespaces['mathml'], 'merror'),
    (namespaces['mathml'], 'mfrac'),
    (namespaces['mathml'], 'mi'),
    (namespaces['mathml'], 'mmultiscripts'),
    (namespaces['mathml'], 'mn'),
    (namespaces['mathml'], 'mo'),
    (namespaces['mathml'], 'mover'),
    (namespaces['mathml'], 'mpadded'),
    (namespaces['mathml'], 'mphantom'),
    (namespaces['mathml'], 'mprescripts'),
    (namespaces['mathml'], 'mroot'),
    (namespaces['mathml'], 'mrow'),
    (namespaces['mathml'], 'mspace'),
    (namespaces['mathml'], 'msqrt'),
    (namespaces['mathml'], 'mstyle'),
    (namespaces['mathml'], 'msub'),
    (namespaces['mathml'], 'msubsup'),
    (namespaces['mathml'], 'msup'),
    (namespaces['mathml'], 'mtable'),
    (namespaces['mathml'], 'mtd'),
    (namespaces['mathml'], 'mtext'),
    (namespaces['mathml'], 'mtr'),
    (namespaces['mathml'], 'munder'),
    (namespaces['mathml'], 'munderover'),
    (namespaces['mathml'], 'none'),
    (namespaces['svg'], 'a'),
    (namespaces['svg'], 'animate'),
    (namespaces['svg'], 'animateColor'),
    (namespaces['svg'], 'animateMotion'),
    (namespaces['svg'], 'animateTransform'),
    (namespaces['svg'], 'clipPath'),
    (namespaces['svg'], 'circle'),
    (namespaces['svg'], 'defs'),
    (namespaces['svg'], 'desc'),
    (namespaces['svg'], 'ellipse'),
    (namespaces['svg'], 'font-face'),
    (namespaces['svg'], 'font-face-name'),
    (namespaces['svg'], 'font-face-src'),
    (namespaces['svg'], 'g'),
    (namespaces['svg'], 'glyph'),
    (namespaces['svg'], 'hkern'),
    (namespaces['svg'], 'linearGradient'),
    (namespaces['svg'], 'line'),
    (namespaces['svg'], 'marker'),
    (namespaces['svg'], 'metadata'),
    (namespaces['svg'], 'missing-glyph'),
    (namespaces['svg'], 'mpath'),
    (namespaces['svg'], 'path'),
    (namespaces['svg'], 'polygon'),
    (namespaces['svg'], 'polyline'),
    (namespaces['svg'], 'radialGradient'),
    (namespaces['svg'], 'rect'),
    (namespaces['svg'], 'set'),
    (namespaces['svg'], 'stop'),
    (namespaces['svg'], 'svg'),
    (namespaces['svg'], 'switch'),
    (namespaces['svg'], 'text'),
    (namespaces['svg'], 'title'),
    (namespaces['svg'], 'tspan'),
    (namespaces['svg'], 'use'),
))

allowed_attributes = frozenset((
    # HTML attributes
    (None, 'abbr'),
    (None, 'accept'),
    (None, 'accept-charset'),
    (None, 'accesskey'),
    (None, 'action'),
    (None, 'align'),
    (None, 'alt'),
    (None, 'autocomplete'),
    (None, 'autofocus'),
    (None, 'axis'),
    (None, 'background'),
    (None, 'balance'),
    (None, 'bgcolor'),
    (None, 'bgproperties'),
    (None, 'border'),
    (None, 'bordercolor'),
    (None, 'bordercolordark'),
    (None, 'bordercolorlight'),
    (None, 'bottompadding'),
    (None, 'cellpadding'),
    (None, 'cellspacing'),
    (None, 'ch'),
    (None, 'challenge'),
    (None, 'char'),
    (None, 'charoff'),
    (None, 'choff'),
    (None, 'charset'),
    (None, 'checked'),
    (None, 'cite'),
    (None, 'class'),
    (None, 'clear'),
    (None, 'color'),
    (None, 'cols'),
    (None, 'colspan'),
    (None, 'compact'),
    (None, 'contenteditable'),
    (None, 'controls'),
    (None, 'coords'),
    (None, 'data'),
    (None, 'datafld'),
    (None, 'datapagesize'),
    (None, 'datasrc'),
    (None, 'datetime'),
    (None, 'default'),
    (None, 'delay'),
    (None, 'dir'),
    (None, 'disabled'),
    (None, 'draggable'),
    (None, 'dynsrc'),
    (None, 'enctype'),
    (None, 'end'),
    (None, 'face'),
    (None, 'for'),
    (None, 'form'),
    (None, 'frame'),
    (None, 'galleryimg'),
    (None, 'gutter'),
    (None, 'headers'),
    (None, 'height'),
    (None, 'hidefocus'),
    (None, 'hidden'),
    (None, 'high'),
    (None, 'href'),
    (None, 'hreflang'),
    (None, 'hspace'),
    (None, 'icon'),
    (None, 'id'),
    (None, 'inputmode'),
    (None, 'ismap'),
    (None, 'keytype'),
    (None, 'label'),
    (None, 'leftspacing'),
    (None, 'lang'),
    (None, 'list'),
    (None, 'longdesc'),
    (None, 'loop'),
    (None, 'loopcount'),
    (None, 'loopend'),
    (None, 'loopstart'),
    (None, 'low'),
    (None, 'lowsrc'),
    (None, 'max'),
    (None, 'maxlength'),
    (None, 'media'),
    (None, 'method'),
    (None, 'min'),
    (None, 'multiple'),
    (None, 'name'),
    (None, 'nohref'),
    (None, 'noshade'),
    (None, 'nowrap'),
    (None, 'open'),
    (None, 'optimum'),
    (None, 'pattern'),
    (None, 'ping'),
    (None, 'point-size'),
    (None, 'poster'),
    (None, 'pqg'),
    (None, 'preload'),
    (None, 'prompt'),
    (None, 'radiogroup'),
    (None, 'readonly'),
    (None, 'rel'),
    (None, 'repeat-max'),
    (None, 'repeat-min'),
    (None, 'replace'),
    (None, 'required'),
    (None, 'rev'),
    (None, 'rightspacing'),
    (None, 'rows'),
    (None, 'rowspan'),
    (None, 'rules'),
    (None, 'scope'),
    (None, 'selected'),
    (None, 'shape'),
    (None, 'size'),
    (None, 'span'),
    (None, 'src'),
    (None, 'start'),
    (None, 'step'),
    (None, 'style'),
    (None, 'summary'),
    (None, 'suppress'),
    (None, 'tabindex'),
    (None, 'target'),
    (None, 'template'),
    (None, 'title'),
    (None, 'toppadding'),
    (None, 'type'),
    (None, 'unselectable'),
    (None, 'usemap'),
    (None, 'urn'),
    (None, 'valign'),
    (None, 'value'),
    (None, 'variable'),
    (None, 'volume'),
    (None, 'vspace'),
    (None, 'vrml'),
    (None, 'width'),
    (None, 'wrap'),
    (namespaces['xml'], 'lang'),
    # MathML attributes
    (None, 'actiontype'),
    (None, 'align'),
    (None, 'columnalign'),
    (None, 'columnalign'),
    (None, 'columnalign'),
    (None, 'columnlines'),
    (None, 'columnspacing'),
    (None, 'columnspan'),
    (None, 'depth'),
    (None, 'display'),
    (None, 'displaystyle'),
    (None, 'equalcolumns'),
    (None, 'equalrows'),
    (None, 'fence'),
    (None, 'fontstyle'),
    (None, 'fontweight'),
    (None, 'frame'),
    (None, 'height'),
    (None, 'linethickness'),
    (None, 'lspace'),
    (None, 'mathbackground'),
    (None, 'mathcolor'),
    (None, 'mathvariant'),
    (None, 'mathvariant'),
    (None, 'maxsize'),
    (None, 'minsize'),
    (None, 'other'),
    (None, 'rowalign'),
    (None, 'rowalign'),
    (None, 'rowalign'),
    (None, 'rowlines'),
    (None, 'rowspacing'),
    (None, 'rowspan'),
    (None, 'rspace'),
    (None, 'scriptlevel'),
    (None, 'selection'),
    (None, 'separator'),
    (None, 'stretchy'),
    (None, 'width'),
    (None, 'width'),
    (namespaces['xlink'], 'href'),
    (namespaces['xlink'], 'show'),
    (namespaces['xlink'], 'type'),
    # SVG attributes
    (None, 'accent-height'),
    (None, 'accumulate'),
    (None, 'additive'),
    (None, 'alphabetic'),
    (None, 'arabic-form'),
    (None, 'ascent'),
    (None, 'attributeName'),
    (None, 'attributeType'),
    (None, 'baseProfile'),
    (None, 'bbox'),
    (None, 'begin'),
    (None, 'by'),
    (None, 'calcMode'),
    (None, 'cap-height'),
    (None, 'class'),
    (None, 'clip-path'),
    (None, 'color'),
    (None, 'color-rendering'),
    (None, 'content'),
    (None, 'cx'),
    (None, 'cy'),
    (None, 'd'),
    (None, 'dx'),
    (None, 'dy'),
    (None, 'descent'),
    (None, 'display'),
    (None, 'dur'),
    (None, 'end'),
    (None, 'fill'),
    (None, 'fill-opacity'),
    (None, 'fill-rule'),
    (None, 'font-family'),
    (None, 'font-size'),
    (None, 'font-stretch'),
    (None, 'font-style'),
    (None, 'font-variant'),
    (None, 'font-weight'),
    (None, 'from'),
    (None, 'fx'),
    (None, 'fy'),
    (None, 'g1'),
    (None, 'g2'),
    (None, 'glyph-name'),
    (None, 'gradientUnits'),
    (None, 'hanging'),
    (None, 'height'),
    (None, 'horiz-adv-x'),
    (None, 'horiz-origin-x'),
    (None, 'id'),
    (None, 'ideographic'),
    (None, 'k'),
    (None, 'keyPoints'),
    (None, 'keySplines'),
    (None, 'keyTimes'),
    (None, 'lang'),
    (None, 'marker-end'),
    (None, 'marker-mid'),
    (None, 'marker-start'),
    (None, 'markerHeight'),
    (None, 'markerUnits'),
    (None, 'markerWidth'),
    (None, 'mathematical'),
    (None, 'max'),
    (None, 'min'),
    (None, 'name'),
    (None, 'offset'),
    (None, 'opacity'),
    (None, 'orient'),
    (None, 'origin'),
    (None, 'overline-position'),
    (None, 'overline-thickness'),
    (None, 'panose-1'),
    (None, 'path'),
    (None, 'pathLength'),
    (None, 'points'),
    (None, 'preserveAspectRatio'),
    (None, 'r'),
    (None, 'refX'),
    (None, 'refY'),
    (None, 'repeatCount'),
    (None, 'repeatDur'),
|
||||
(None, 'requiredExtensions'),
|
||||
(None, 'requiredFeatures'),
|
||||
(None, 'restart'),
|
||||
(None, 'rotate'),
|
||||
(None, 'rx'),
|
||||
(None, 'ry'),
|
||||
(None, 'slope'),
|
||||
(None, 'stemh'),
|
||||
(None, 'stemv'),
|
||||
(None, 'stop-color'),
|
||||
(None, 'stop-opacity'),
|
||||
(None, 'strikethrough-position'),
|
||||
(None, 'strikethrough-thickness'),
|
||||
(None, 'stroke'),
|
||||
(None, 'stroke-dasharray'),
|
||||
(None, 'stroke-dashoffset'),
|
||||
(None, 'stroke-linecap'),
|
||||
(None, 'stroke-linejoin'),
|
||||
(None, 'stroke-miterlimit'),
|
||||
(None, 'stroke-opacity'),
|
||||
(None, 'stroke-width'),
|
||||
(None, 'systemLanguage'),
|
||||
(None, 'target'),
|
||||
(None, 'text-anchor'),
|
||||
(None, 'to'),
|
||||
(None, 'transform'),
|
||||
(None, 'type'),
|
||||
(None, 'u1'),
|
||||
(None, 'u2'),
|
||||
(None, 'underline-position'),
|
||||
(None, 'underline-thickness'),
|
||||
(None, 'unicode'),
|
||||
(None, 'unicode-range'),
|
||||
(None, 'units-per-em'),
|
||||
(None, 'values'),
|
||||
(None, 'version'),
|
||||
(None, 'viewBox'),
|
||||
(None, 'visibility'),
|
||||
(None, 'width'),
|
||||
(None, 'widths'),
|
||||
(None, 'x'),
|
||||
(None, 'x-height'),
|
||||
(None, 'x1'),
|
||||
(None, 'x2'),
|
||||
(namespaces['xlink'], 'actuate'),
|
||||
(namespaces['xlink'], 'arcrole'),
|
||||
(namespaces['xlink'], 'href'),
|
||||
(namespaces['xlink'], 'role'),
|
||||
(namespaces['xlink'], 'show'),
|
||||
(namespaces['xlink'], 'title'),
|
||||
(namespaces['xlink'], 'type'),
|
||||
(namespaces['xml'], 'base'),
|
||||
(namespaces['xml'], 'lang'),
|
||||
(namespaces['xml'], 'space'),
|
||||
(None, 'y'),
|
||||
(None, 'y1'),
|
||||
(None, 'y2'),
|
||||
(None, 'zoomAndPan'),
|
||||
))
|
||||
|
||||
attr_val_is_uri = frozenset((
|
||||
(None, 'href'),
|
||||
(None, 'src'),
|
||||
(None, 'cite'),
|
||||
(None, 'action'),
|
||||
(None, 'longdesc'),
|
||||
(None, 'poster'),
|
||||
(None, 'background'),
|
||||
(None, 'datasrc'),
|
||||
(None, 'dynsrc'),
|
||||
(None, 'lowsrc'),
|
||||
(None, 'ping'),
|
||||
(namespaces['xlink'], 'href'),
|
||||
(namespaces['xml'], 'base'),
|
||||
))
|
||||
|
||||
svg_attr_val_allows_ref = frozenset((
|
||||
(None, 'clip-path'),
|
||||
(None, 'color-profile'),
|
||||
(None, 'cursor'),
|
||||
(None, 'fill'),
|
||||
(None, 'filter'),
|
||||
(None, 'marker'),
|
||||
(None, 'marker-start'),
|
||||
(None, 'marker-mid'),
|
||||
(None, 'marker-end'),
|
||||
(None, 'mask'),
|
||||
(None, 'stroke'),
|
||||
))
|
||||
|
||||
svg_allow_local_href = frozenset((
|
||||
(None, 'altGlyph'),
|
||||
(None, 'animate'),
|
||||
(None, 'animateColor'),
|
||||
(None, 'animateMotion'),
|
||||
(None, 'animateTransform'),
|
||||
(None, 'cursor'),
|
||||
(None, 'feImage'),
|
||||
(None, 'filter'),
|
||||
(None, 'linearGradient'),
|
||||
(None, 'pattern'),
|
||||
(None, 'radialGradient'),
|
||||
(None, 'textpath'),
|
||||
(None, 'tref'),
|
||||
(None, 'set'),
|
||||
(None, 'use')
|
||||
))
|
||||
|
||||
allowed_css_properties = frozenset((
|
||||
'azimuth',
|
||||
'background-color',
|
||||
'border-bottom-color',
|
||||
'border-collapse',
|
||||
'border-color',
|
||||
'border-left-color',
|
||||
'border-right-color',
|
||||
'border-top-color',
|
||||
'clear',
|
||||
'color',
|
||||
'cursor',
|
||||
'direction',
|
||||
'display',
|
||||
'elevation',
|
||||
'float',
|
||||
'font',
|
||||
'font-family',
|
||||
'font-size',
|
||||
'font-style',
|
||||
'font-variant',
|
||||
'font-weight',
|
||||
'height',
|
||||
'letter-spacing',
|
||||
'line-height',
|
||||
'overflow',
|
||||
'pause',
|
||||
'pause-after',
|
||||
'pause-before',
|
||||
'pitch',
|
||||
'pitch-range',
|
||||
'richness',
|
||||
'speak',
|
||||
'speak-header',
|
||||
'speak-numeral',
|
||||
'speak-punctuation',
|
||||
'speech-rate',
|
||||
'stress',
|
||||
'text-align',
|
||||
'text-decoration',
|
||||
'text-indent',
|
||||
'unicode-bidi',
|
||||
'vertical-align',
|
||||
'voice-family',
|
||||
'volume',
|
||||
'white-space',
|
||||
'width',
|
||||
))
|
||||
|
||||
allowed_css_keywords = frozenset((
|
||||
'auto',
|
||||
'aqua',
|
||||
'black',
|
||||
'block',
|
||||
'blue',
|
||||
'bold',
|
||||
'both',
|
||||
'bottom',
|
||||
'brown',
|
||||
'center',
|
||||
'collapse',
|
||||
'dashed',
|
||||
'dotted',
|
||||
'fuchsia',
|
||||
'gray',
|
||||
'green',
|
||||
'!important',
|
||||
'italic',
|
||||
'left',
|
||||
'lime',
|
||||
'maroon',
|
||||
'medium',
|
||||
'none',
|
||||
'navy',
|
||||
'normal',
|
||||
'nowrap',
|
||||
'olive',
|
||||
'pointer',
|
||||
'purple',
|
||||
'red',
|
||||
'right',
|
||||
'solid',
|
||||
'silver',
|
||||
'teal',
|
||||
'top',
|
||||
'transparent',
|
||||
'underline',
|
||||
'white',
|
||||
'yellow',
|
||||
))
|
||||
|
||||
allowed_svg_properties = frozenset((
|
||||
'fill',
|
||||
'fill-opacity',
|
||||
'fill-rule',
|
||||
'stroke',
|
||||
'stroke-width',
|
||||
'stroke-linecap',
|
||||
'stroke-linejoin',
|
||||
'stroke-opacity',
|
||||
))
|
||||
|
||||
allowed_protocols = frozenset((
|
||||
'ed2k',
|
||||
'ftp',
|
||||
'http',
|
||||
'https',
|
||||
'irc',
|
||||
'mailto',
|
||||
'news',
|
||||
'gopher',
|
||||
'nntp',
|
||||
'telnet',
|
||||
'webcal',
|
||||
'xmpp',
|
||||
'callto',
|
||||
'feed',
|
||||
'urn',
|
||||
'aim',
|
||||
'rsync',
|
||||
'tag',
|
||||
'ssh',
|
||||
'sftp',
|
||||
'rtsp',
|
||||
'afs',
|
||||
'data',
|
||||
))
|
||||
|
||||
allowed_content_types = frozenset((
|
||||
'image/png',
|
||||
'image/jpeg',
|
||||
'image/gif',
|
||||
'image/webp',
|
||||
'image/bmp',
|
||||
'text/plain',
|
||||
))
|
||||
|
||||
|
||||
data_content_type = re.compile(r'''
|
||||
^
|
||||
# Match a content type <application>/<type>
|
||||
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
|
||||
# Match any character set and encoding
|
||||
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|
||||
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
|
||||
# Assume the rest is data
|
||||
,.*
|
||||
$
|
||||
''',
|
||||
re.VERBOSE)
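
# Illustrative sketch (added, not part of the vendored file): how the regex
# above splits a ``data:`` URI. Only the ``re`` import already present in
# this module is assumed. The match extracts the content type, which is then
# checked against ``allowed_content_types`` in ``allowed_token`` below.
#
#     >>> m = data_content_type.match('image/png;base64,iVBORw0KGgo=')
#     >>> m.group('content_type')
#     'image/png'
#     >>> m2 = data_content_type.match('text/html,<b>hi</b>')
#     >>> m2.group('content_type')  # matches the regex, but 'text/html' is
#     'text/html'                   # not in allowed_content_types, so the
#                                   # attribute would still be dropped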


class Filter(base.Filter):
    """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""
    def __init__(self,
                 source,
                 allowed_elements=allowed_elements,
                 allowed_attributes=allowed_attributes,
                 allowed_css_properties=allowed_css_properties,
                 allowed_css_keywords=allowed_css_keywords,
                 allowed_svg_properties=allowed_svg_properties,
                 allowed_protocols=allowed_protocols,
                 allowed_content_types=allowed_content_types,
                 attr_val_is_uri=attr_val_is_uri,
                 svg_attr_val_allows_ref=svg_attr_val_allows_ref,
                 svg_allow_local_href=svg_allow_local_href):
        """Creates a Filter

        :arg allowed_elements: set of elements to allow--everything else will
            be escaped

        :arg allowed_attributes: set of attributes to allow in
            elements--everything else will be stripped

        :arg allowed_css_properties: set of CSS properties to allow--everything
            else will be stripped

        :arg allowed_css_keywords: set of CSS keywords to allow--everything
            else will be stripped

        :arg allowed_svg_properties: set of SVG properties to allow--everything
            else will be removed

        :arg allowed_protocols: set of allowed protocols for URIs

        :arg allowed_content_types: set of allowed content types for ``data`` URIs.

        :arg attr_val_is_uri: set of attributes that have URI values--values
            that have a scheme not listed in ``allowed_protocols`` are removed

        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
            references

        :arg svg_allow_local_href: set of SVG elements that can have local
            hrefs--these are removed

        """
        super(Filter, self).__init__(source)

        warnings.warn(_deprecation_msg, DeprecationWarning)

        self.allowed_elements = allowed_elements
        self.allowed_attributes = allowed_attributes
        self.allowed_css_properties = allowed_css_properties
        self.allowed_css_keywords = allowed_css_keywords
        self.allowed_svg_properties = allowed_svg_properties
        self.allowed_protocols = allowed_protocols
        self.allowed_content_types = allowed_content_types
        self.attr_val_is_uri = attr_val_is_uri
        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
        self.svg_allow_local_href = svg_allow_local_href

    def __iter__(self):
        for token in base.Filter.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
    # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
    # ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
    # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
    # allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => &lt;script> do_nasty_stuff() &lt;/script>
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def sanitize_token(self, token):

        # accommodate filters which use token_type differently
        token_type = token["type"]
        if token_type in ("StartTag", "EndTag", "EmptyTag"):
            name = token["name"]
            namespace = token["namespace"]
            if ((namespace, name) in self.allowed_elements or
                    (namespace is None and
                     (namespaces["html"], name) in self.allowed_elements)):
                return self.allowed_token(token)
            else:
                return self.disallowed_token(token)
        elif token_type == "Comment":
            pass
        else:
            return token

    def allowed_token(self, token):
        if "data" in token:
            attrs = token["data"]
            attr_names = set(attrs.keys())

            # Remove forbidden attributes
            for to_remove in (attr_names - self.allowed_attributes):
                del token["data"][to_remove]
                attr_names.remove(to_remove)

            # Remove attributes with disallowed URL values
            for attr in (attr_names & self.attr_val_is_uri):
                assert attr in attrs
                # I don't have a clue where this regexp comes from or why it matches those
                # characters, nor why we call unescape. I just know it's always been here.
                # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
                # this will do is remove *more* than it otherwise would.
                val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
                try:
                    uri = urlparse.urlparse(val_unescaped)
                except ValueError:
                    uri = None
                    del attrs[attr]
                if uri and uri.scheme:
                    if uri.scheme not in self.allowed_protocols:
                        del attrs[attr]
                    if uri.scheme == 'data':
                        m = data_content_type.match(uri.path)
                        if not m:
                            del attrs[attr]
                        elif m.group('content_type') not in self.allowed_content_types:
                            del attrs[attr]

            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))
            if (token["name"] in self.svg_allow_local_href and
                    (namespaces['xlink'], 'href') in attrs and re.search(r'^\s*[^#\s].*',
                                                                         attrs[(namespaces['xlink'], 'href')])):
                del attrs[(namespaces['xlink'], 'href')]
            if (None, 'style') in attrs:
                attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
            token["data"] = attrs
        return token

    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
        else:
            token["data"] = "<%s>" % token["name"]
        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        # disallow urls
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                                'padding']:
                for keyword in value.split():
                    if keyword not in self.allowed_css_keywords and \
                            not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
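
# Illustrative sketch (added, not part of the vendored file): what
# sanitize_css lets through. ``url(...)`` values are blanked out, unknown
# properties are dropped, and shorthand properties are checked keyword by
# keyword against allowed_css_keywords.
#
#     >>> f = Filter([])  # any iterable token source works for this demo
#     >>> f.sanitize_css('color: red; behavior: url(evil.htc)')
#     'color: red;'
#     >>> f.sanitize_css('margin: 0 auto;')
#     'margin: 0 auto;'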

@@ -0,0 +1,38 @@
from __future__ import absolute_import, division, unicode_literals

import re

from . import base
from ..constants import rcdataElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)

SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)


class Filter(base.Filter):
    """Collapses whitespace except in pre, textarea, and script elements"""
    spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))

    def __iter__(self):
        preserve = 0
        for token in base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag" \
                    and (preserve or token["name"] in self.spacePreserveElements):
                preserve += 1

            elif type == "EndTag" and preserve:
                preserve -= 1

            elif not preserve and type == "SpaceCharacters" and token["data"]:
                # Test on token["data"] above to not introduce spaces where there were not
                token["data"] = " "

            elif not preserve and type == "Characters":
                token["data"] = collapse_spaces(token["data"])

            yield token


def collapse_spaces(text):
    return SPACES_REGEX.sub(' ', text)
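
# Illustrative sketch (added, not part of the vendored file): the filter
# squeezes runs of HTML space characters (space, tab, newline, form feed,
# carriage return) down to a single space.
#
#     >>> collapse_spaces("a \t\n  b")
#     'a b'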
2795
venv/Lib/site-packages/bleach/_vendor/html5lib/html5parser.py
Normal file
File diff suppressed because it is too large
409
venv/Lib/site-packages/bleach/_vendor/html5lib/serializer.py
Normal file

@@ -0,0 +1,409 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type

import re

from codecs import register_error, xmlcharrefreplace_errors

from .constants import voidElements, booleanAttributes, spaceCharacters
from .constants import rcdataElements, entities, xmlEntities
from . import treewalkers, _utils
from xml.sax.saxutils import escape

_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
                                   "\u3000]")


_encode_entity_map = {}
_is_ucs4 = len("\U0010FFFF") == 1
for k, v in list(entities.items()):
    # skip multi-character entities
    if ((_is_ucs4 and len(v) > 1) or
            (not _is_ucs4 and len(v) > 2)):
        continue
    if v != "&":
        if len(v) == 2:
            v = _utils.surrogatePairToCodepoint(v)
        else:
            v = ord(v)
        if v not in _encode_entity_map or k.islower():
            # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
            _encode_entity_map[v] = k


def htmlentityreplace_errors(exc):
    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        res = []
        codepoints = []
        skip = False
        for i, c in enumerate(exc.object[exc.start:exc.end]):
            if skip:
                skip = False
                continue
            index = i + exc.start
            if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
                codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
                skip = True
            else:
                codepoint = ord(c)
            codepoints.append(codepoint)
        for cp in codepoints:
            e = _encode_entity_map.get(cp)
            if e:
                res.append("&")
                res.append(e)
                if not e.endswith(";"):
                    res.append(";")
            else:
                res.append("&#x%s;" % (hex(cp)[2:]))
        return ("".join(res), exc.end)
    else:
        return xmlcharrefreplace_errors(exc)


register_error("htmlentityreplace", htmlentityreplace_errors)
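
# Illustrative sketch (added, not part of the vendored file): once
# registered, the error handler turns characters that don't fit the target
# encoding into named or numeric character references.
#
#     >>> "snowman: \u2603".encode("ascii", "htmlentityreplace")
#     b'snowman: &#x2603;'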


def serialize(input, tree="etree", encoding=None, **serializer_opts):
    """Serializes the input token stream using the specified treewalker

    :arg input: the token stream to serialize

    :arg tree: the treewalker to use

    :arg encoding: the encoding to use

    :arg serializer_opts: any options to pass to the
        :py:class:`html5lib.serializer.HTMLSerializer` that gets created

    :returns: the tree serialized as a string

    Example:

    >>> from html5lib.html5parser import parse
    >>> from html5lib.serializer import serialize
    >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
    >>> serialize(token_stream, omit_optional_tags=False)
    '<html><head></head><body><p>Hi!</p></body></html>'

    """
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    s = HTMLSerializer(**serializer_opts)
    return s.render(walker(input), encoding)


class HTMLSerializer(object):

    # attribute quoting options
    quote_attr_values = "legacy"  # be secure by default
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer

        :arg inject_meta_charset: Whether or not to inject the meta charset.

            Defaults to ``True``.

        :arg quote_attr_values: Whether to quote attribute values that don't
            require quoting per legacy browser behavior (``"legacy"``), when
            required by the standard (``"spec"``), or always (``"always"``).

            Defaults to ``"legacy"``.

        :arg quote_char: Use given quote character for attribute quoting.

            Defaults to ``"`` which will use double quotes unless attribute
            value contains a double quote, in which case single quotes are
            used.

        :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
            values.

            Defaults to ``False``.

        :arg escape_rcdata: Whether to escape characters that need to be
            escaped within normal elements within rcdata elements such as
            style.

            Defaults to ``False``.

        :arg resolve_entities: Whether to resolve named character entities that
            appear in the source tree. The XML predefined entities &lt; &gt;
            &amp; &quot; &apos; are unaffected by this setting.

            Defaults to ``True``.

        :arg strip_whitespace: Whether to remove semantically meaningless
            whitespace. (This compresses all whitespace to a single space
            except within ``pre``.)

            Defaults to ``False``.

        :arg minimize_boolean_attributes: Shortens boolean attributes to give
            just the attribute value, for example::

              <input disabled="disabled">

            becomes::

              <input disabled>

            Defaults to ``True``.

        :arg use_trailing_solidus: Includes a close-tag slash at the end of the
            start tag of void elements (empty elements whose end tag is
            forbidden). E.g. ``<hr/>``.

            Defaults to ``False``.

        :arg space_before_trailing_solidus: Places a space immediately before
            the closing slash in a tag using a trailing solidus. E.g.
            ``<hr />``. Requires ``use_trailing_solidus=True``.

            Defaults to ``True``.

        :arg sanitize: Strip all unsafe or unknown constructs from output.
            See :py:class:`html5lib.filters.sanitizer.Filter`.

            Defaults to ``False``.

        :arg omit_optional_tags: Omit start/end tags that are optional.

            Defaults to ``True``.

        :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.

            Defaults to ``False``.

        """
        unexpected_args = frozenset(kwargs) - frozenset(self.options)
        if len(unexpected_args) > 0:
            raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        assert(isinstance(string, text_type))
        if self.encoding:
            return string.encode(self.encoding, "htmlentityreplace")
        else:
            return string

    def encodeStrict(self, string):
        assert(isinstance(string, text_type))
        if self.encoding:
            return string.encode(self.encoding, "strict")
        else:
            return string

    def serialize(self, treewalker, encoding=None):
        # pylint:disable=too-many-nested-blocks
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from .filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # Alphabetical attributes is here under the assumption that none of
        # the later filters add or change order of attributes; it needs to be
        # before the sanitizer so escaped elements come out correctly
        if self.alphabetical_attributes:
            from .filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiently of this latter filter
        if self.strip_whitespace:
            from .filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from .filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from .filters.optionaltags import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            type = token["type"]
            if type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    if token["systemId"].find('"') >= 0:
                        if token["systemId"].find("'") >= 0:
                            self.serializeError("System identifier contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif type in ("Characters", "SpaceCharacters"):
                if type == "SpaceCharacters" or in_cdata:
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                for (_, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(' ')

                    yield self.encodeStrict(k)
                    if not self.minimize_boolean_attributes or \
                        (k not in booleanAttributes.get(name, tuple()) and
                         k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict("=")
                        if self.quote_attr_values == "always" or len(v) == 0:
                            quote_attr = True
                        elif self.quote_attr_values == "spec":
                            quote_attr = _quoteAttributeSpec.search(v) is not None
                        elif self.quote_attr_values == "legacy":
                            quote_attr = _quoteAttributeLegacy.search(v) is not None
                        else:
                            raise ValueError("quote_attr_values must be one of: "
                                             "'always', 'spec', or 'legacy'")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % token["data"])

            elif type == "Entity":
                name = token["name"]
                key = name + ";"
                if key not in entities:
                    self.serializeError("Entity %s not recognized" % name)
                if self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serializes the stream from the treewalker into a string

        :arg treewalker: the treewalker to serialize

        :arg encoding: the string encoding to use

        :returns: the serialized tree

        Example:

        >>> from html5lib import parse, getTreeWalker
        >>> from html5lib.serializer import HTMLSerializer
        >>> token_stream = parse('<html><body>Hi!</body></html>')
        >>> walker = getTreeWalker('etree')
        >>> serializer = HTMLSerializer(omit_optional_tags=False)
        >>> serializer.render(walker(token_stream))
        '<html><head></head><body>Hi!</body></html>'

        """
        if encoding:
            return b"".join(list(self.serialize(treewalker, encoding)))
        else:
            return "".join(list(self.serialize(treewalker)))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError


class SerializeError(Exception):
    """Error in serialized tree"""
    pass

@@ -0,0 +1,30 @@
"""Tree adapters let you convert from one tree structure to another

Example:

.. code-block:: python

   import html5lib
   from html5lib.treeadapters import genshi

   doc = '<html><body>Hi!</body></html>'
   treebuilder = html5lib.getTreeBuilder('etree')
   parser = html5lib.HTMLParser(tree=treebuilder)
   tree = parser.parse(doc)
   TreeWalker = html5lib.getTreeWalker('etree')

   genshi_tree = genshi.to_genshi(TreeWalker(tree))

"""
from __future__ import absolute_import, division, unicode_literals

from . import sax

__all__ = ["sax"]

try:
    from . import genshi  # noqa
except ImportError:
    pass
else:
    __all__.append("genshi")
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,54 @@
from __future__ import absolute_import, division, unicode_literals

from genshi.core import QName, Attrs
from genshi.core import START, END, TEXT, COMMENT, DOCTYPE


def to_genshi(walker):
    """Convert a tree to a genshi tree

    :arg walker: the treewalker to use to walk the tree to convert it

    :returns: generator of genshi nodes

    """
    text = []
    for token in walker:
        type = token["type"]
        if type in ("Characters", "SpaceCharacters"):
            text.append(token["data"])
        elif text:
            yield TEXT, "".join(text), (None, -1, -1)
            text = []

        if type in ("StartTag", "EmptyTag"):
            if token["namespace"]:
                name = "{%s}%s" % (token["namespace"], token["name"])
            else:
                name = token["name"]
            attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value)
                           for attr, value in token["data"].items()])
            yield (START, (QName(name), attrs), (None, -1, -1))
            if type == "EmptyTag":
                type = "EndTag"

        if type == "EndTag":
            if token["namespace"]:
                name = "{%s}%s" % (token["namespace"], token["name"])
            else:
                name = token["name"]

            yield END, QName(name), (None, -1, -1)

        elif type == "Comment":
            yield COMMENT, token["data"], (None, -1, -1)

        elif type == "Doctype":
            yield DOCTYPE, (token["name"], token["publicId"],
                            token["systemId"]), (None, -1, -1)

        else:
            pass  # FIXME: What to do?

    if text:
        yield TEXT, "".join(text), (None, -1, -1)

@@ -0,0 +1,50 @@
from __future__ import absolute_import, division, unicode_literals

from xml.sax.xmlreader import AttributesNSImpl

from ..constants import adjustForeignAttributes, unadjustForeignAttributes

prefix_mapping = {}
for prefix, localName, namespace in adjustForeignAttributes.values():
    if prefix is not None:
        prefix_mapping[prefix] = namespace


def to_sax(walker, handler):
    """Call SAX-like content handler based on treewalker walker

    :arg walker: the treewalker to use to walk the tree to convert it

    :arg handler: SAX handler to use

    """
    handler.startDocument()
    for prefix, namespace in prefix_mapping.items():
        handler.startPrefixMapping(prefix, namespace)

    for token in walker:
        type = token["type"]
        if type == "Doctype":
            continue
        elif type in ("StartTag", "EmptyTag"):
            attrs = AttributesNSImpl(token["data"],
                                     unadjustForeignAttributes)
            handler.startElementNS((token["namespace"], token["name"]),
                                   token["name"],
                                   attrs)
            if type == "EmptyTag":
                handler.endElementNS((token["namespace"], token["name"]),
                                     token["name"])
        elif type == "EndTag":
            handler.endElementNS((token["namespace"], token["name"]),
                                 token["name"])
        elif type in ("Characters", "SpaceCharacters"):
            handler.characters(token["data"])
        elif type == "Comment":
            pass
        else:
            assert False, "Unknown token type"

    for prefix, namespace in prefix_mapping.items():
        handler.endPrefixMapping(prefix)
    handler.endDocument()
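
# Illustrative sketch (added, not part of the vendored file): driving
# to_sax with the stdlib's no-op ContentHandler. Assumes html5lib itself is
# importable, as in the treeadapters package docstring above.
#
#     >>> import html5lib
#     >>> from xml.sax.handler import ContentHandler
#     >>> tree = html5lib.parse('<html><body>Hi!</body></html>')
#     >>> walker = html5lib.getTreeWalker('etree')
#     >>> to_sax(walker(tree), ContentHandler())  # base handler ignores events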

@@ -0,0 +1,88 @@
"""A collection of modules for building different kinds of trees from HTML
documents.

To create a treebuilder for a new type of tree, you need to
implement several things:

1. A set of classes for various types of elements: Document, Doctype, Comment,
   Element. These must implement the interface of ``base.treebuilders.Node``
   (although comment nodes have a different signature for their constructor,
   see ``treebuilders.etree.Comment``) Textual content may also be implemented
   as another node type, or not, as your tree implementation requires.

2. A treebuilder object (called ``TreeBuilder`` by convention) that inherits
   from ``treebuilders.base.TreeBuilder``. This has 4 required attributes:

   * ``documentClass`` - the class to use for the bottommost node of a document
   * ``elementClass`` - the class to use for HTML Elements
   * ``commentClass`` - the class to use for comments
   * ``doctypeClass`` - the class to use for doctypes

   It also has one required method:

   * ``getDocument`` - Returns the root node of the complete document tree

3. If you wish to run the unit tests, you must also create a ``testSerializer``
   method on your treebuilder which accepts a node and returns a string
   containing Node and its children serialized according to the format used in
   the unittests

"""

from __future__ import absolute_import, division, unicode_literals

from .._utils import default_etree

treeBuilderCache = {}


def getTreeBuilder(treeType, implementation=None, **kwargs):
    """Get a TreeBuilder class for various types of trees with built-in support

    :arg treeType: the name of the tree type required (case-insensitive). Supported
        values are:

        * "dom" - A generic builder for DOM implementations, defaulting to a
          xml.dom.minidom based implementation.
        * "etree" - A generic builder for tree implementations exposing an
          ElementTree-like interface, defaulting to xml.etree.cElementTree if
          available and xml.etree.ElementTree if not.
        * "lxml" - A etree-based builder for lxml.etree, handling limitations
          of lxml's implementation.

    :arg implementation: (Currently applies to the "etree" and "dom" tree
        types). A module implementing the tree type e.g. xml.etree.ElementTree
        or xml.etree.cElementTree.

    :arg kwargs: Any additional options to pass to the TreeBuilder when
        creating it.

    Example:

    >>> from html5lib.treebuilders import getTreeBuilder
    >>> builder = getTreeBuilder('etree')

    """

    treeType = treeType.lower()
    if treeType not in treeBuilderCache:
        if treeType == "dom":
            from . import dom
            # Come up with a sane default (pref. from the stdlib)
            if implementation is None:
                from xml.dom import minidom
                implementation = minidom
            # NEVER cache here, caching is done in the dom submodule
            return dom.getDomModule(implementation, **kwargs).TreeBuilder
        elif treeType == "lxml":
            from . import etree_lxml
            treeBuilderCache[treeType] = etree_lxml.TreeBuilder
        elif treeType == "etree":
            from . import etree
            if implementation is None:
                implementation = default_etree
            # NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeBuilder
        else:
            raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
    return treeBuilderCache.get(treeType)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,417 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type

from ..constants import scopingElements, tableInsertModeElements, namespaces

# The scope markers are inserted when entering object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, object elements, and marquees.
Marker = None

listElementsMap = {
    None: (frozenset(scopingElements), False),
    "button": (frozenset(scopingElements | {(namespaces["html"], "button")}), False),
    "list": (frozenset(scopingElements | {(namespaces["html"], "ol"),
                                          (namespaces["html"], "ul")}), False),
    "table": (frozenset([(namespaces["html"], "html"),
                         (namespaces["html"], "table")]), False),
    "select": (frozenset([(namespaces["html"], "optgroup"),
                          (namespaces["html"], "option")]), True)
}


class Node(object):
    """Represents an item in the tree"""
    def __init__(self, name):
        """Creates a Node

        :arg name: The tag name associated with the node

        """
        # The tag name associated with the node
        self.name = name
        # The parent of the current node (or None for the document node)
        self.parent = None
        # The value of the current node (applies to text nodes and comments)
        self.value = None
        # A dict holding name -> value pairs for attributes of the node
        self.attributes = {}
        # A list of child nodes of the current node. This must include all
        # elements but not necessarily other node types.
        self.childNodes = []
        # A list of miscellaneous flags that can be set on the node.
        self._flags = []

    def __str__(self):
        attributesStr = " ".join(["%s=\"%s\"" % (name, value)
                                  for name, value in
                                  self.attributes.items()])
        if attributesStr:
            return "<%s %s>" % (self.name, attributesStr)
        else:
            return "<%s>" % (self.name)

    def __repr__(self):
        return "<%s>" % (self.name)

    def appendChild(self, node):
        """Insert node as a child of the current node

        :arg node: the node to insert

        """
        raise NotImplementedError

    def insertText(self, data, insertBefore=None):
        """Insert data as text in the current node, positioned before the
        start of node insertBefore or to the end of the node's text.

        :arg data: the data to insert

        :arg insertBefore: True if you want to insert the text before the node
            and False if you want to insert it after the node

        """
        raise NotImplementedError

    def insertBefore(self, node, refNode):
        """Insert node as a child of the current node, before refNode in the
        list of child nodes. Raises ValueError if refNode is not a child of
        the current node

        :arg node: the node to insert

        :arg refNode: the child node to insert the node before

        """
        raise NotImplementedError

    def removeChild(self, node):
        """Remove node from the children of the current node

        :arg node: the child node to remove

        """
        raise NotImplementedError

    def reparentChildren(self, newParent):
        """Move all the children of the current node to newParent.
        This is needed so that trees that don't store text as nodes move the
        text in the correct way

        :arg newParent: the node to move all this node's children to

        """
        # XXX - should this method be made more general?
        for child in self.childNodes:
            newParent.appendChild(child)
        self.childNodes = []

    def cloneNode(self):
        """Return a shallow copy of the current node i.e. a node with the same
        name and attributes but with no parent or child nodes
        """
        raise NotImplementedError

    def hasContent(self):
        """Return true if the node has children or text, false otherwise
        """
        raise NotImplementedError


class ActiveFormattingElements(list):
    def append(self, node):
        equalCount = 0
        if node != Marker:
            for element in self[::-1]:
                if element == Marker:
                    break
                if self.nodesEqual(element, node):
                    equalCount += 1
                if equalCount == 3:
                    self.remove(element)
                    break
        list.append(self, node)

    def nodesEqual(self, node1, node2):
        if not node1.nameTuple == node2.nameTuple:
            return False

        if not node1.attributes == node2.attributes:
            return False

        return True
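
    # Note (added, not in the vendored file): append() above implements the
    # HTML spec's "Noah's Ark" clause -- at most three entries with the same
    # name, namespace, and attributes may sit between markers, so a fourth
    # matching element evicts the oldest match before being appended.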


class TreeBuilder(object):
    """Base treebuilder implementation

    * documentClass - the class to use for the bottommost node of a document
    * elementClass - the class to use for HTML Elements
    * commentClass - the class to use for comments
    * doctypeClass - the class to use for doctypes

    """
    # pylint:disable=not-callable

    # Document class
    documentClass = None

    # The class to use for creating a node
    elementClass = None

    # The class to use for creating comments
    commentClass = None

    # The class to use for creating doctypes
    doctypeClass = None

    # Fragment class
    fragmentClass = None

    def __init__(self, namespaceHTMLElements):
        """Create a TreeBuilder

        :arg namespaceHTMLElements: whether or not to namespace HTML elements

        """
        if namespaceHTMLElements:
            self.defaultNamespace = "http://www.w3.org/1999/xhtml"
        else:
            self.defaultNamespace = None
        self.reset()

    def reset(self):
        self.openElements = []
        self.activeFormattingElements = ActiveFormattingElements()

        # XXX - rename these to headElement, formElement
        self.headPointer = None
        self.formPointer = None

        self.insertFromTable = False

        self.document = self.documentClass()

    def elementInScope(self, target, variant=None):

        # If we pass a node in we match that. if we pass a string
        # match any node with that name
        exactNode = hasattr(target, "nameTuple")
        if not exactNode:
            if isinstance(target, text_type):
                target = (namespaces["html"], target)
            assert isinstance(target, tuple)

        listElements, invert = listElementsMap[variant]

        for node in reversed(self.openElements):
            if exactNode and node == target:
                return True
            elif not exactNode and node.nameTuple == target:
                return True
            elif (invert ^ (node.nameTuple in listElements)):
                return False

        assert False  # We should never reach this point

    def reconstructActiveFormattingElements(self):
        # Within this algorithm the order of steps described in the
        # specification is not quite the same as the order of steps in the
        # code. It should still do the same though.

        # Step 1: stop the algorithm when there's nothing to do.
        if not self.activeFormattingElements:
            return

        # Step 2 and step 3: we start with the last element. So i is -1.
        i = len(self.activeFormattingElements) - 1
        entry = self.activeFormattingElements[i]
        if entry == Marker or entry in self.openElements:
            return

        # Step 6
        while entry != Marker and entry not in self.openElements:
            if i == 0:
                # This will be reset to 0 below
                i = -1
                break
            i -= 1
            # Step 5: let entry be one earlier in the list.
            entry = self.activeFormattingElements[i]

        while True:
            # Step 7
            i += 1

            # Step 8
            entry = self.activeFormattingElements[i]
            clone = entry.cloneNode()  # Mainly to get a new copy of the attributes

            # Step 9
            element = self.insertElement({"type": "StartTag",
                                          "name": clone.name,
                                          "namespace": clone.namespace,
                                          "data": clone.attributes})

            # Step 10
            self.activeFormattingElements[i] = element

            # Step 11
            if element == self.activeFormattingElements[-1]:
                break

    def clearActiveFormattingElements(self):
        entry = self.activeFormattingElements.pop()
        while self.activeFormattingElements and entry != Marker:
            entry = self.activeFormattingElements.pop()

    def elementInActiveFormattingElements(self, name):
        """Check if an element exists between the end of the active
        formatting elements and the last marker. If it does, return it, else
        return false"""

        for item in self.activeFormattingElements[::-1]:
            # Check for Marker first because if it's a Marker it doesn't have a
            # name attribute.
            if item == Marker:
                break
            elif item.name == name:
                return item
        return False

    def insertRoot(self, token):
        element = self.createElement(token)
        self.openElements.append(element)
        self.document.appendChild(element)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = self.doctypeClass(name, publicId, systemId)
        self.document.appendChild(doctype)

    def insertComment(self, token, parent=None):
        if parent is None:
            parent = self.openElements[-1]
        parent.appendChild(self.commentClass(token["data"]))

    def createElement(self, token):
        """Create an element but don't insert it anywhere"""
        name = token["name"]
        namespace = token.get("namespace", self.defaultNamespace)
        element = self.elementClass(name, namespace)
        element.attributes = token["data"]
        return element

    def _getInsertFromTable(self):
        return self._insertFromTable

    def _setInsertFromTable(self, value):
        """Switch the function used to insert an element from the
        normal one to the misnested table one and back again"""
        self._insertFromTable = value
        if value:
            self.insertElement = self.insertElementTable
        else:
            self.insertElement = self.insertElementNormal

    insertFromTable = property(_getInsertFromTable, _setInsertFromTable)

    def insertElementNormal(self, token):
        name = token["name"]
        assert isinstance(name, text_type), "Element %s not unicode" % name
        namespace = token.get("namespace", self.defaultNamespace)
        element = self.elementClass(name, namespace)
        element.attributes = token["data"]
        self.openElements[-1].appendChild(element)
        self.openElements.append(element)
        return element

    def insertElementTable(self, token):
        """Create an element and insert it into the tree"""
        element = self.createElement(token)
        if self.openElements[-1].name not in tableInsertModeElements:
            return self.insertElementNormal(token)
        else:
            # We should be in the InTable mode. This means we want to do
            # special magic element rearranging
            parent, insertBefore = self.getTableMisnestedNodePosition()
            if insertBefore is None:
                parent.appendChild(element)
            else:
                parent.insertBefore(element, insertBefore)
            self.openElements.append(element)
        return element

    def insertText(self, data, parent=None):
        """Insert text data."""
        if parent is None:
            parent = self.openElements[-1]

        if (not self.insertFromTable or (self.insertFromTable and
                                         self.openElements[-1].name
                                         not in tableInsertModeElements)):
            parent.insertText(data)
        else:
            # We should be in the InTable mode. This means we want to do
            # special magic element rearranging
            parent, insertBefore = self.getTableMisnestedNodePosition()
            parent.insertText(data, insertBefore)

    def getTableMisnestedNodePosition(self):
        """Get the foster parent element, and sibling to insert before
        (or None) when inserting a misnested table node"""
        # The foster parent element is the one which comes before the most
        # recently opened table element
        # XXX - this is really inelegant
        lastTable = None
        fosterParent = None
        insertBefore = None
        for elm in self.openElements[::-1]:
            if elm.name == "table":
                lastTable = elm
                break
        if lastTable:
            # XXX - we should really check that this parent is actually a
            # node here
            if lastTable.parent:
                fosterParent = lastTable.parent
                insertBefore = lastTable
            else:
                fosterParent = self.openElements[
                    self.openElements.index(lastTable) - 1]
        else:
            fosterParent = self.openElements[0]
        return fosterParent, insertBefore
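
    # Illustrative note (added, not in the vendored file): with open elements
    # [html, body, table], getTableMisnestedNodePosition() returns the pair
    # (body-node, table-node), so misnested content is "foster parented" into
    # <body> immediately before the <table>, matching the HTML parsing spec.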

    def generateImpliedEndTags(self, exclude=None):
        name = self.openElements[-1].name
        # XXX td, th and tr are not actually needed
        if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
                name != exclude):
            self.openElements.pop()
            # XXX This is not entirely what the specification says. We should
            # investigate it more closely.
            self.generateImpliedEndTags(exclude)

    def getDocument(self):
        """Return the final tree"""
        return self.document

    def getFragment(self):
        """Return the final fragment"""
        # assert self.innerHTML
        fragment = self.fragmentClass()
        self.openElements[0].reparentChildren(fragment)
        return fragment

    def testSerializer(self, node):
        """Serialize the subtree of node in the format required by unit tests

        :arg node: the node from which to start serializing

        """
        raise NotImplementedError
|
@ -0,0 +1,239 @@
from __future__ import absolute_import, division, unicode_literals


try:
    from collections.abc import MutableMapping
except ImportError:  # Python 2.7
    from collections import MutableMapping
from xml.dom import minidom, Node
import weakref

from . import base
from .. import constants
from ..constants import namespaces
from .._utils import moduleFactoryFactory


def getDomBuilder(DomImplementation):
    Dom = DomImplementation

    class AttrList(MutableMapping):
        def __init__(self, element):
            self.element = element

        def __iter__(self):
            return iter(self.element.attributes.keys())

        def __setitem__(self, name, value):
            if isinstance(name, tuple):
                raise NotImplementedError
            else:
                attr = self.element.ownerDocument.createAttribute(name)
                attr.value = value
                self.element.attributes[name] = attr

        def __len__(self):
            return len(self.element.attributes)

        def items(self):
            return list(self.element.attributes.items())

        def values(self):
            return list(self.element.attributes.values())

        def __getitem__(self, name):
            if isinstance(name, tuple):
                raise NotImplementedError
            else:
                return self.element.attributes[name].value

        def __delitem__(self, name):
            if isinstance(name, tuple):
                raise NotImplementedError
            else:
                del self.element.attributes[name]

    class NodeBuilder(base.Node):
        def __init__(self, element):
            base.Node.__init__(self, element.nodeName)
            self.element = element

        namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
                             self.element.namespaceURI or None)

        def appendChild(self, node):
            node.parent = self
            self.element.appendChild(node.element)

        def insertText(self, data, insertBefore=None):
            text = self.element.ownerDocument.createTextNode(data)
            if insertBefore:
                self.element.insertBefore(text, insertBefore.element)
            else:
                self.element.appendChild(text)

        def insertBefore(self, node, refNode):
            self.element.insertBefore(node.element, refNode.element)
            node.parent = self

        def removeChild(self, node):
            if node.element.parentNode == self.element:
                self.element.removeChild(node.element)
            node.parent = None

        def reparentChildren(self, newParent):
            while self.element.hasChildNodes():
                child = self.element.firstChild
                self.element.removeChild(child)
                newParent.element.appendChild(child)
            self.childNodes = []

        def getAttributes(self):
            return AttrList(self.element)

        def setAttributes(self, attributes):
            if attributes:
                for name, value in list(attributes.items()):
                    if isinstance(name, tuple):
                        if name[0] is not None:
                            qualifiedName = (name[0] + ":" + name[1])
                        else:
                            qualifiedName = name[1]
                        self.element.setAttributeNS(name[2], qualifiedName,
                                                    value)
                    else:
                        self.element.setAttribute(
                            name, value)
        attributes = property(getAttributes, setAttributes)

        def cloneNode(self):
            return NodeBuilder(self.element.cloneNode(False))

        def hasContent(self):
            return self.element.hasChildNodes()

        def getNameTuple(self):
            if self.namespace is None:
                return namespaces["html"], self.name
            else:
                return self.namespace, self.name

        nameTuple = property(getNameTuple)

    class TreeBuilder(base.TreeBuilder):  # pylint:disable=unused-variable
        def documentClass(self):
            self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
            return weakref.proxy(self)

        def insertDoctype(self, token):
            name = token["name"]
            publicId = token["publicId"]
            systemId = token["systemId"]

            domimpl = Dom.getDOMImplementation()
            doctype = domimpl.createDocumentType(name, publicId, systemId)
            self.document.appendChild(NodeBuilder(doctype))
            if Dom == minidom:
                doctype.ownerDocument = self.dom

        def elementClass(self, name, namespace=None):
            if namespace is None and self.defaultNamespace is None:
                node = self.dom.createElement(name)
            else:
                node = self.dom.createElementNS(namespace, name)

            return NodeBuilder(node)

        def commentClass(self, data):
            return NodeBuilder(self.dom.createComment(data))

        def fragmentClass(self):
            return NodeBuilder(self.dom.createDocumentFragment())

        def appendChild(self, node):
            self.dom.appendChild(node.element)

        def testSerializer(self, element):
            return testSerializer(element)

        def getDocument(self):
            return self.dom

        def getFragment(self):
            return base.TreeBuilder.getFragment(self).element

        def insertText(self, data, parent=None):
            if parent != self:
                base.TreeBuilder.insertText(self, data, parent)
            else:
                # HACK: allow text nodes as children of the document node
                if hasattr(self.dom, '_child_node_types'):
                    # pylint:disable=protected-access
                    if Node.TEXT_NODE not in self.dom._child_node_types:
                        self.dom._child_node_types = list(self.dom._child_node_types)
                        self.dom._child_node_types.append(Node.TEXT_NODE)
                self.dom.appendChild(self.dom.createTextNode(data))

        implementation = DomImplementation
        name = None

    def testSerializer(element):
        element.normalize()
        rv = []

        def serializeElement(element, indent=0):
            if element.nodeType == Node.DOCUMENT_TYPE_NODE:
                if element.name:
                    if element.publicId or element.systemId:
                        publicId = element.publicId or ""
                        systemId = element.systemId or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (' ' * indent, element.name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
            elif element.nodeType == Node.DOCUMENT_NODE:
                rv.append("#document")
            elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
                rv.append("#document-fragment")
            elif element.nodeType == Node.COMMENT_NODE:
                rv.append("|%s<!-- %s -->" % (' ' * indent, element.nodeValue))
            elif element.nodeType == Node.TEXT_NODE:
                rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
            else:
                if (hasattr(element, "namespaceURI") and
                        element.namespaceURI is not None):
                    name = "%s %s" % (constants.prefixes[element.namespaceURI],
                                      element.nodeName)
                else:
                    name = element.nodeName
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.hasAttributes():
                    attributes = []
                    for i in range(len(element.attributes)):
                        attr = element.attributes.item(i)
                        name = attr.nodeName
                        value = attr.value
                        ns = attr.namespaceURI
                        if ns:
                            name = "%s %s" % (constants.prefixes[ns], attr.localName)
                        else:
                            name = attr.nodeName
                        attributes.append((name, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
            indent += 2
            for child in element.childNodes:
                serializeElement(child, indent)
        serializeElement(element, 0)

        return "\n".join(rv)

    return locals()


# The actual means to get a module!
getDomModule = moduleFactoryFactory(getDomBuilder)
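In normal use this factory is reached through html5lib's treebuilder registry rather than called directly. A minimal sketch, assuming the vendored html5lib is importable as a top-level package:

    import html5lib

    # treebuilder="dom" selects the minidom-backed builder defined above.
    document = html5lib.parse("<p>hello</p>", treebuilder="dom")
    print(document.toxml())  # a regular xml.dom.minidom Document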
@ -0,0 +1,343 @@
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access

from six import text_type

import re

from copy import copy

from . import base
from .. import _ihatexml
from .. import constants
from ..constants import namespaces
from .._utils import moduleFactoryFactory

tag_regexp = re.compile("{([^}]*)}(.*)")


def getETreeBuilder(ElementTreeImplementation, fullTree=False):
    ElementTree = ElementTreeImplementation
    ElementTreeCommentType = ElementTree.Comment("asd").tag

    class Element(base.Node):
        def __init__(self, name, namespace=None):
            self._name = name
            self._namespace = namespace
            self._element = ElementTree.Element(self._getETreeTag(name,
                                                                  namespace))
            if namespace is None:
                self.nameTuple = namespaces["html"], self._name
            else:
                self.nameTuple = self._namespace, self._name
            self.parent = None
            self._childNodes = []
            self._flags = []

        def _getETreeTag(self, name, namespace):
            if namespace is None:
                etree_tag = name
            else:
                etree_tag = "{%s}%s" % (namespace, name)
            return etree_tag

        def _setName(self, name):
            self._name = name
            self._element.tag = self._getETreeTag(self._name, self._namespace)

        def _getName(self):
            return self._name

        name = property(_getName, _setName)

        def _setNamespace(self, namespace):
            self._namespace = namespace
            self._element.tag = self._getETreeTag(self._name, self._namespace)

        def _getNamespace(self):
            return self._namespace

        namespace = property(_getNamespace, _setNamespace)

        def _getAttributes(self):
            return self._element.attrib

        def _setAttributes(self, attributes):
            el_attrib = self._element.attrib
            el_attrib.clear()
            if attributes:
                # calling .items _always_ allocates, and the above truthy check is cheaper than the
                # allocation on average
                for key, value in attributes.items():
                    if isinstance(key, tuple):
                        name = "{%s}%s" % (key[2], key[1])
                    else:
                        name = key
                    el_attrib[name] = value

        attributes = property(_getAttributes, _setAttributes)

        def _getChildNodes(self):
            return self._childNodes

        def _setChildNodes(self, value):
            del self._element[:]
            self._childNodes = []
            for element in value:
                self.insertChild(element)

        childNodes = property(_getChildNodes, _setChildNodes)

        def hasContent(self):
            """Return true if the node has children or text"""
            return bool(self._element.text or len(self._element))

        def appendChild(self, node):
            self._childNodes.append(node)
            self._element.append(node._element)
            node.parent = self

        def insertBefore(self, node, refNode):
            index = list(self._element).index(refNode._element)
            self._element.insert(index, node._element)
            node.parent = self

        def removeChild(self, node):
            self._childNodes.remove(node)
            self._element.remove(node._element)
            node.parent = None

        def insertText(self, data, insertBefore=None):
            if not len(self._element):
                if not self._element.text:
                    self._element.text = ""
                self._element.text += data
            elif insertBefore is None:
                # Insert the text as the tail of the last child element
                if not self._element[-1].tail:
                    self._element[-1].tail = ""
                self._element[-1].tail += data
            else:
                # Insert the text before the specified node
                children = list(self._element)
                index = children.index(insertBefore._element)
                if index > 0:
                    if not self._element[index - 1].tail:
                        self._element[index - 1].tail = ""
                    self._element[index - 1].tail += data
                else:
                    if not self._element.text:
                        self._element.text = ""
                    self._element.text += data

        def cloneNode(self):
            element = type(self)(self.name, self.namespace)
            if self._element.attrib:
                element._element.attrib = copy(self._element.attrib)
            return element

        def reparentChildren(self, newParent):
            if newParent.childNodes:
                newParent.childNodes[-1]._element.tail += self._element.text
            else:
                if not newParent._element.text:
                    newParent._element.text = ""
                if self._element.text is not None:
                    newParent._element.text += self._element.text
            self._element.text = ""
            base.Node.reparentChildren(self, newParent)

    class Comment(Element):
        def __init__(self, data):
            # Don't call the Element constructor; set the properties
            # directly on the wrapper element
            self._element = ElementTree.Comment(data)
            self.parent = None
            self._childNodes = []
            self._flags = []

        def _getData(self):
            return self._element.text

        def _setData(self, value):
            self._element.text = value

        data = property(_getData, _setData)

    class DocumentType(Element):
        def __init__(self, name, publicId, systemId):
            Element.__init__(self, "<!DOCTYPE>")
            self._element.text = name
            self.publicId = publicId
            self.systemId = systemId

        def _getPublicId(self):
            return self._element.get("publicId", "")

        def _setPublicId(self, value):
            if value is not None:
                self._element.set("publicId", value)

        publicId = property(_getPublicId, _setPublicId)

        def _getSystemId(self):
            return self._element.get("systemId", "")

        def _setSystemId(self, value):
            if value is not None:
                self._element.set("systemId", value)

        systemId = property(_getSystemId, _setSystemId)

    class Document(Element):
        def __init__(self):
            Element.__init__(self, "DOCUMENT_ROOT")

    class DocumentFragment(Element):
        def __init__(self):
            Element.__init__(self, "DOCUMENT_FRAGMENT")

    def testSerializer(element):
        rv = []

        def serializeElement(element, indent=0):
            if not hasattr(element, "tag"):
                element = element.getroot()
            if element.tag == "<!DOCTYPE>":
                if element.get("publicId") or element.get("systemId"):
                    publicId = element.get("publicId") or ""
                    systemId = element.get("systemId") or ""
                    rv.append("""<!DOCTYPE %s "%s" "%s">""" %
                              (element.text, publicId, systemId))
                else:
                    rv.append("<!DOCTYPE %s>" % (element.text,))
            elif element.tag == "DOCUMENT_ROOT":
                rv.append("#document")
                if element.text is not None:
                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
                if element.tail is not None:
                    raise TypeError("Document node cannot have tail")
                if hasattr(element, "attrib") and len(element.attrib):
                    raise TypeError("Document node cannot have attributes")
            elif element.tag == ElementTreeCommentType:
                rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
            else:
                assert isinstance(element.tag, text_type), \
                    "Expected unicode, got %s, %s" % (type(element.tag), element.tag)
                nsmatch = tag_regexp.match(element.tag)

                if nsmatch is None:
                    name = element.tag
                else:
                    ns, name = nsmatch.groups()
                    prefix = constants.prefixes[ns]
                    name = "%s %s" % (prefix, name)
                rv.append("|%s<%s>" % (' ' * indent, name))

                if hasattr(element, "attrib"):
                    attributes = []
                    for name, value in element.attrib.items():
                        nsmatch = tag_regexp.match(name)
                        if nsmatch is not None:
                            ns, name = nsmatch.groups()
                            prefix = constants.prefixes[ns]
                            attr_string = "%s %s" % (prefix, name)
                        else:
                            attr_string = name
                        attributes.append((attr_string, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
                if element.text:
                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            indent += 2
            for child in element:
                serializeElement(child, indent)
            if element.tail:
                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
        serializeElement(element, 0)

        return "\n".join(rv)

    def tostring(element):  # pylint:disable=unused-variable
        """Serialize an element and its child nodes to a string"""
        rv = []
        filter = _ihatexml.InfosetFilter()

        def serializeElement(element):
            if isinstance(element, ElementTree.ElementTree):
                element = element.getroot()

            if element.tag == "<!DOCTYPE>":
                if element.get("publicId") or element.get("systemId"):
                    publicId = element.get("publicId") or ""
                    systemId = element.get("systemId") or ""
                    rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
                              (element.text, publicId, systemId))
                else:
                    rv.append("<!DOCTYPE %s>" % (element.text,))
            elif element.tag == "DOCUMENT_ROOT":
                if element.text is not None:
                    rv.append(element.text)
                if element.tail is not None:
                    raise TypeError("Document node cannot have tail")
                if hasattr(element, "attrib") and len(element.attrib):
                    raise TypeError("Document node cannot have attributes")

                for child in element:
                    serializeElement(child)

            elif element.tag == ElementTreeCommentType:
                rv.append("<!--%s-->" % (element.text,))
            else:
                # This is assumed to be an ordinary element
                if not element.attrib:
                    rv.append("<%s>" % (filter.fromXmlName(element.tag),))
                else:
                    attr = " ".join(["%s=\"%s\"" % (
                        filter.fromXmlName(name), value)
                        for name, value in element.attrib.items()])
                    rv.append("<%s %s>" % (element.tag, attr))
                if element.text:
                    rv.append(element.text)

                for child in element:
                    serializeElement(child)

                rv.append("</%s>" % (element.tag,))

            if element.tail:
                rv.append(element.tail)

        serializeElement(element)

        return "".join(rv)

    class TreeBuilder(base.TreeBuilder):  # pylint:disable=unused-variable
        documentClass = Document
        doctypeClass = DocumentType
        elementClass = Element
        commentClass = Comment
        fragmentClass = DocumentFragment
        implementation = ElementTreeImplementation

        def testSerializer(self, element):
            return testSerializer(element)

        def getDocument(self):
            if fullTree:
                return self.document._element
            else:
                if self.defaultNamespace is not None:
                    return self.document._element.find(
                        "{%s}html" % self.defaultNamespace)
                else:
                    return self.document._element.find("html")

        def getFragment(self):
            return base.TreeBuilder.getFragment(self)._element

    return locals()


getETreeModule = moduleFactoryFactory(getETreeBuilder)
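A minimal usage sketch, assuming the vendored html5lib is importable as a top-level package. "etree" is html5lib's default treebuilder, so the parse result is a plain xml.etree.ElementTree element:

    import html5lib

    # With fullTree left False, getDocument() returns the <html> element.
    root = html5lib.parse("<p class=x>hello</p>")
    print(root.tag)  # {http://www.w3.org/1999/xhtml}html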
@ -0,0 +1,392 @@
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:

Text or comments as siblings of the root element
Doctypes with no name

When any of these things occur, we emit a DataLossWarning
"""

from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access

import warnings
import re
import sys

try:
    from collections.abc import MutableMapping
except ImportError:
    from collections import MutableMapping

from . import base
from ..constants import DataLossWarning
from .. import constants
from . import etree as etree_builders
from .. import _ihatexml

import lxml.etree as etree
from six import PY3, binary_type


fullTree = True
tag_regexp = re.compile("{([^}]*)}(.*)")

comment_type = etree.Comment("asd").tag


class DocumentType(object):
    def __init__(self, name, publicId, systemId):
        self.name = name
        self.publicId = publicId
        self.systemId = systemId


class Document(object):
    def __init__(self):
        self._elementTree = None
        self._childNodes = []

    def appendChild(self, element):
        last = self._elementTree.getroot()
        for last in self._elementTree.getroot().itersiblings():
            pass

        last.addnext(element._element)

    def _getChildNodes(self):
        return self._childNodes

    childNodes = property(_getChildNodes)


def testSerializer(element):
    rv = []
    infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)

    def serializeElement(element, indent=0):
        if not hasattr(element, "tag"):
            if hasattr(element, "getroot"):
                # Full tree case
                rv.append("#document")
                if element.docinfo.internalDTD:
                    if not (element.docinfo.public_id or
                            element.docinfo.system_url):
                        dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                    else:
                        dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
                            element.docinfo.root_name,
                            element.docinfo.public_id,
                            element.docinfo.system_url)
                    rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
                next_element = element.getroot()
                while next_element.getprevious() is not None:
                    next_element = next_element.getprevious()
                while next_element is not None:
                    serializeElement(next_element, indent + 2)
                    next_element = next_element.getnext()
            elif isinstance(element, str) or isinstance(element, bytes):
                # Text in a fragment
                assert isinstance(element, str) or sys.version_info[0] == 2
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                # Fragment case
                rv.append("#document-fragment")
                for next_element in element:
                    serializeElement(next_element, indent + 2)
        elif element.tag == comment_type:
            rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
        else:
            assert isinstance(element, etree._Element)
            nsmatch = etree_builders.tag_regexp.match(element.tag)
            if nsmatch is not None:
                ns = nsmatch.group(1)
                tag = nsmatch.group(2)
                prefix = constants.prefixes[ns]
                rv.append("|%s<%s %s>" % (' ' * indent, prefix,
                                          infosetFilter.fromXmlName(tag)))
            else:
                rv.append("|%s<%s>" % (' ' * indent,
                                       infosetFilter.fromXmlName(element.tag)))

            if hasattr(element, "attrib"):
                attributes = []
                for name, value in element.attrib.items():
                    nsmatch = tag_regexp.match(name)
                    if nsmatch is not None:
                        ns, name = nsmatch.groups()
                        name = infosetFilter.fromXmlName(name)
                        prefix = constants.prefixes[ns]
                        attr_string = "%s %s" % (prefix, name)
                    else:
                        attr_string = infosetFilter.fromXmlName(name)
                    attributes.append((attr_string, value))

                for name, value in sorted(attributes):
                    rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))

            if element.text:
                rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            indent += 2
            for child in element:
                serializeElement(child, indent)
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
    serializeElement(element, 0)

    return "\n".join(rv)


def tostring(element):
    """Serialize an element and its child nodes to a string"""
    rv = []

    def serializeElement(element):
        if not hasattr(element, "tag"):
            if element.docinfo.internalDTD:
                if element.docinfo.doctype:
                    dtd_str = element.docinfo.doctype
                else:
                    dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                rv.append(dtd_str)
            serializeElement(element.getroot())

        elif element.tag == comment_type:
            rv.append("<!--%s-->" % (element.text,))

        else:
            # This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>" % (element.tag,))
            else:
                attr = " ".join(["%s=\"%s\"" % (name, value)
                                 for name, value in element.attrib.items()])
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)

            for child in element:
                serializeElement(child)

            rv.append("</%s>" % (element.tag,))

        if hasattr(element, "tail") and element.tail:
            rv.append(element.tail)

    serializeElement(element)

    return "".join(rv)


class TreeBuilder(base.TreeBuilder):
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None
    commentClass = None
    fragmentClass = Document
    implementation = etree

    def __init__(self, namespaceHTMLElements, fullTree=False):
        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
        infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
        self.namespaceHTMLElements = namespaceHTMLElements

        class Attributes(MutableMapping):
            def __init__(self, element):
                self._element = element

            def _coerceKey(self, key):
                if isinstance(key, tuple):
                    name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
                else:
                    name = infosetFilter.coerceAttribute(key)
                return name

            def __getitem__(self, key):
                value = self._element._element.attrib[self._coerceKey(key)]
                if not PY3 and isinstance(value, binary_type):
                    value = value.decode("ascii")
                return value

            def __setitem__(self, key, value):
                self._element._element.attrib[self._coerceKey(key)] = value

            def __delitem__(self, key):
                del self._element._element.attrib[self._coerceKey(key)]

            def __iter__(self):
                return iter(self._element._element.attrib)

            def __len__(self):
                return len(self._element._element.attrib)

            def clear(self):
                return self._element._element.attrib.clear()

        class Element(builder.Element):
            def __init__(self, name, namespace):
                name = infosetFilter.coerceElement(name)
                builder.Element.__init__(self, name, namespace=namespace)
                self._attributes = Attributes(self)

            def _setName(self, name):
                self._name = infosetFilter.coerceElement(name)
                self._element.tag = self._getETreeTag(
                    self._name, self._namespace)

            def _getName(self):
                return infosetFilter.fromXmlName(self._name)

            name = property(_getName, _setName)

            def _getAttributes(self):
                return self._attributes

            def _setAttributes(self, value):
                attributes = self.attributes
                attributes.clear()
                attributes.update(value)

            attributes = property(_getAttributes, _setAttributes)

            def insertText(self, data, insertBefore=None):
                data = infosetFilter.coerceCharacters(data)
                builder.Element.insertText(self, data, insertBefore)

            def cloneNode(self):
                element = type(self)(self.name, self.namespace)
                if self._element.attrib:
                    element._element.attrib.update(self._element.attrib)
                return element

        class Comment(builder.Comment):
            def __init__(self, data):
                data = infosetFilter.coerceComment(data)
                builder.Comment.__init__(self, data)

            def _setData(self, data):
                data = infosetFilter.coerceComment(data)
                self._element.text = data

            def _getData(self):
                return self._element.text

            data = property(_getData, _setData)

        self.elementClass = Element
        self.commentClass = Comment
        # self.fragmentClass = builder.DocumentFragment
        base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def reset(self):
        base.TreeBuilder.reset(self)
        self.insertComment = self.insertCommentInitial
        self.initial_comments = []
        self.doctype = None

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        if fullTree:
            return self.document._elementTree
        else:
            return self.document._elementTree.getroot()

    def getFragment(self):
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        fragment.extend(list(element))
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if not name:
            warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
            self.doctype = None
        else:
            coercedName = self.infosetFilter.coerceElement(name)
            if coercedName != name:
                warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)

            doctype = self.doctypeClass(coercedName, publicId, systemId)
            self.doctype = doctype

    def insertCommentInitial(self, data, parent=None):
        assert parent is None or parent is self.document
        assert self.document._elementTree is None
        self.initial_comments.append(data)

    def insertCommentMain(self, data, parent=None):
        if (parent == self.document and
                self.document._elementTree.getroot()[-1].tag == comment_type):
            warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
        super(TreeBuilder, self).insertComment(data, parent)

    def insertRoot(self, token):
        # Because of the way libxml2 works, it doesn't seem to be possible to
        # alter information like the doctype after the tree has been parsed.
        # Therefore we need to use the built-in parser to create our initial
        # tree, after which we can add elements like normal
        docStr = ""
        if self.doctype:
            assert self.doctype.name
            docStr += "<!DOCTYPE %s" % self.doctype.name
            if (self.doctype.publicId is not None or
                    self.doctype.systemId is not None):
                docStr += (' PUBLIC "%s" ' %
                           (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
                if self.doctype.systemId:
                    sysid = self.doctype.systemId
                    if sysid.find("'") >= 0 and sysid.find('"') >= 0:
                        warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
                        sysid = sysid.replace("'", 'U00027')
                    if sysid.find("'") >= 0:
                        docStr += '"%s"' % sysid
                    else:
                        docStr += "'%s'" % sysid
                else:
                    docStr += "''"
            docStr += ">"
            if self.doctype.name != token["name"]:
                warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
        docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
        root = etree.fromstring(docStr)

        # Append the initial comments:
        for comment_token in self.initial_comments:
            comment = self.commentClass(comment_token["data"])
            root.addprevious(comment._element)

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Give the root element the right name
        name = token["name"]
        namespace = token.get("namespace", self.defaultNamespace)
        if namespace is None:
            etree_tag = name
        else:
            etree_tag = "{%s}%s" % (namespace, name)
        root.tag = etree_tag

        # Add the root element to the internal child/open data structures
        root_element = self.elementClass(name, namespace)
        root_element._element = root
        self.document._childNodes.append(root_element)
        self.openElements.append(root_element)

        # Reset to the default insert comment function
        self.insertComment = self.insertCommentMain
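A minimal sketch, assuming the vendored html5lib and lxml are both importable. Trees that lxml cannot represent exactly surface as DataLossWarning, as the module docstring describes:

    import warnings
    import html5lib

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        tree = html5lib.parse("<!DOCTYPE html><p>x</p>", treebuilder="lxml")
    # tree is an lxml ElementTree; `caught` collects any DataLossWarning
    # raised for constructs lxml cannot represent (none for this input).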
@ -0,0 +1,154 @@
"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.

To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method which takes a tree as sole argument and
returns an iterator which generates tokens.
"""

from __future__ import absolute_import, division, unicode_literals

from .. import constants
from .._utils import default_etree

__all__ = ["getTreeWalker", "pprint"]

treeWalkerCache = {}


def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

    :arg str treeType: the name of the tree type required (case-insensitive).
        Supported values are:

        * "dom": The xml.dom.minidom DOM implementation
        * "etree": A generic walker for tree implementations exposing an
          elementtree-like interface (known to work with ElementTree,
          cElementTree and lxml.etree).
        * "lxml": Optimized walker for lxml.etree
        * "genshi": a Genshi stream

    :arg implementation: A module implementing the tree type e.g.
        xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
        tree type only).

    :arg kwargs: keyword arguments passed to the etree walker--for other
        walkers, this has no effect

    :returns: a TreeWalker class

    """

    treeType = treeType.lower()
    if treeType not in treeWalkerCache:
        if treeType == "dom":
            from . import dom
            treeWalkerCache[treeType] = dom.TreeWalker
        elif treeType == "genshi":
            from . import genshi
            treeWalkerCache[treeType] = genshi.TreeWalker
        elif treeType == "lxml":
            from . import etree_lxml
            treeWalkerCache[treeType] = etree_lxml.TreeWalker
        elif treeType == "etree":
            from . import etree
            if implementation is None:
                implementation = default_etree
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeWalker
    return treeWalkerCache.get(treeType)


def concatenateCharacterTokens(tokens):
    pendingCharacters = []
    for token in tokens:
        type = token["type"]
        if type in ("Characters", "SpaceCharacters"):
            pendingCharacters.append(token["data"])
        else:
            if pendingCharacters:
                yield {"type": "Characters", "data": "".join(pendingCharacters)}
                pendingCharacters = []
            yield token
    if pendingCharacters:
        yield {"type": "Characters", "data": "".join(pendingCharacters)}


def pprint(walker):
    """Pretty printer for tree walkers

    Takes a TreeWalker instance and pretty prints the output of walking the tree.

    :arg walker: a TreeWalker instance

    """
    output = []
    indent = 0
    for token in concatenateCharacterTokens(walker):
        type = token["type"]
        if type in ("StartTag", "EmptyTag"):
            # tag name
            if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
                if token["namespace"] in constants.prefixes:
                    ns = constants.prefixes[token["namespace"]]
                else:
                    ns = token["namespace"]
                name = "%s %s" % (ns, token["name"])
            else:
                name = token["name"]
            output.append("%s<%s>" % (" " * indent, name))
            indent += 2
            # attributes (sorted for consistent ordering)
            attrs = token["data"]
            for (namespace, localname), value in sorted(attrs.items()):
                if namespace:
                    if namespace in constants.prefixes:
                        ns = constants.prefixes[namespace]
                    else:
                        ns = namespace
                    name = "%s %s" % (ns, localname)
                else:
                    name = localname
                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
            # self-closing
            if type == "EmptyTag":
                indent -= 2

        elif type == "EndTag":
            indent -= 2

        elif type == "Comment":
            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))

        elif type == "Doctype":
            if token["name"]:
                if token["publicId"]:
                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["publicId"],
                                   token["systemId"] if token["systemId"] else ""))
                elif token["systemId"]:
                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["systemId"]))
                else:
                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
                                                       token["name"]))
            else:
                output.append("%s<!DOCTYPE >" % (" " * indent,))

        elif type == "Characters":
            output.append("%s\"%s\"" % (" " * indent, token["data"]))

        elif type == "SpaceCharacters":
            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"

        else:
            raise ValueError("Unknown token type, %s" % type)

    return "\n".join(output)
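A minimal sketch of getTreeWalker() and pprint() together, assuming the vendored html5lib is importable as a top-level package:

    import html5lib
    from html5lib.treewalkers import getTreeWalker, pprint

    tree = html5lib.parse("<p id=a>hi</p>")
    TreeWalker = getTreeWalker("etree")
    print(pprint(TreeWalker(tree)))
    # Roughly:
    # <html>
    #   <head>
    #   <body>
    #     <p>
    #       id="a"
    #       "hi"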
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,252 @@
from __future__ import absolute_import, division, unicode_literals

from xml.dom import Node
from ..constants import namespaces, voidElements, spaceCharacters

__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
           "TreeWalker", "NonRecursiveTreeWalker"]

DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"

spaceCharacters = "".join(spaceCharacters)


class TreeWalker(object):
    """Walks a tree yielding tokens

    Tokens are dicts that all have a ``type`` field specifying the type of the
    token.

    """
    def __init__(self, tree):
        """Creates a TreeWalker

        :arg tree: the tree to walk

        """
        self.tree = tree

    def __iter__(self):
        raise NotImplementedError

    def error(self, msg):
        """Generates an error token with the given message

        :arg msg: the error message

        :returns: SerializeError token

        """
        return {"type": "SerializeError", "data": msg}

    def emptyTag(self, namespace, name, attrs, hasChildren=False):
        """Generates an EmptyTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :arg hasChildren: whether or not to yield a SerializationError because
            this tag shouldn't have children

        :returns: EmptyTag token

        """
        yield {"type": "EmptyTag", "name": name,
               "namespace": namespace,
               "data": attrs}
        if hasChildren:
            yield self.error("Void element has children")

    def startTag(self, namespace, name, attrs):
        """Generates a StartTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :returns: StartTag token

        """
        return {"type": "StartTag",
                "name": name,
                "namespace": namespace,
                "data": attrs}

    def endTag(self, namespace, name):
        """Generates an EndTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :returns: EndTag token

        """
        return {"type": "EndTag",
                "name": name,
                "namespace": namespace}

    def text(self, data):
        """Generates SpaceCharacters and Characters tokens

        Depending on what's in the data, this generates one or more
        ``SpaceCharacters`` and ``Characters`` tokens.

        For example:

            >>> from html5lib.treewalkers.base import TreeWalker
            >>> # Give it an empty tree just so it instantiates
            >>> walker = TreeWalker([])
            >>> list(walker.text(''))
            []
            >>> list(walker.text(' '))
            [{u'data': ' ', u'type': u'SpaceCharacters'}]
            >>> list(walker.text(' abc '))  # doctest: +NORMALIZE_WHITESPACE
            [{u'data': ' ', u'type': u'SpaceCharacters'},
            {u'data': u'abc', u'type': u'Characters'},
            {u'data': u' ', u'type': u'SpaceCharacters'}]

        :arg data: the text data

        :returns: one or more ``SpaceCharacters`` and ``Characters`` tokens

        """
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data) - len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        data = middle
        middle = data.rstrip(spaceCharacters)
        right = data[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if right:
            yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        """Generates a Comment token

        :arg data: the comment

        :returns: Comment token

        """
        return {"type": "Comment", "data": data}

    def doctype(self, name, publicId=None, systemId=None):
        """Generates a Doctype token

        :arg name: the doctype name

        :arg publicId: the doctype public identifier

        :arg systemId: the doctype system identifier

        :returns: the Doctype token

        """
        return {"type": "Doctype",
                "name": name,
                "publicId": publicId,
                "systemId": systemId}

    def entity(self, name):
        """Generates an Entity token

        :arg name: the entity name

        :returns: an Entity token

        """
        return {"type": "Entity", "name": name}

    def unknown(self, nodeType):
        """Handles unknown node types"""
        return self.error("Unknown node type: " + nodeType)


class NonRecursiveTreeWalker(TreeWalker):
    def getNodeDetails(self, node):
        raise NotImplementedError

    def getFirstChild(self, node):
        raise NotImplementedError

    def getNextSibling(self, node):
        raise NotImplementedError

    def getParentNode(self, node):
        raise NotImplementedError

    def __iter__(self):
        currentNode = self.tree
        while currentNode is not None:
            details = self.getNodeDetails(currentNode)
            type, details = details[0], details[1:]
            hasChildren = False

            if type == DOCTYPE:
                yield self.doctype(*details)

            elif type == TEXT:
                for token in self.text(*details):
                    yield token

            elif type == ELEMENT:
                namespace, name, attributes, hasChildren = details
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    for token in self.emptyTag(namespace, name, attributes,
                                               hasChildren):
                        yield token
                    hasChildren = False
                else:
                    yield self.startTag(namespace, name, attributes)

            elif type == COMMENT:
                yield self.comment(details[0])

            elif type == ENTITY:
                yield self.entity(details[0])

            elif type == DOCUMENT:
                hasChildren = True

            else:
                yield self.unknown(details[0])

            if hasChildren:
                firstChild = self.getFirstChild(currentNode)
            else:
                firstChild = None

            if firstChild is not None:
                currentNode = firstChild
            else:
                while currentNode is not None:
                    details = self.getNodeDetails(currentNode)
                    type, details = details[0], details[1:]
                    if type == ELEMENT:
                        namespace, name, attributes, hasChildren = details
                        if (namespace and namespace != namespaces["html"]) or name not in voidElements:
                            yield self.endTag(namespace, name)
                    if self.tree is currentNode:
                        currentNode = None
                        break
                    nextSibling = self.getNextSibling(currentNode)
                    if nextSibling is not None:
                        currentNode = nextSibling
                        break
                    else:
                        currentNode = self.getParentNode(currentNode)
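The token helpers can be exercised directly, as the text() doctest above hints. A minimal sketch, assuming the vendored html5lib is importable as a top-level package:

    from html5lib.treewalkers.base import TreeWalker

    walker = TreeWalker([])  # an empty "tree" is enough to instantiate
    print(walker.startTag(None, "p", {}))
    # {'type': 'StartTag', 'name': 'p', 'namespace': None, 'data': {}}
    print(list(walker.text(" spam ")))
    # SpaceCharacters, Characters, SpaceCharacters, as in the docstring above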
@ -0,0 +1,43 @@
from __future__ import absolute_import, division, unicode_literals

from xml.dom import Node

from . import base


class TreeWalker(base.NonRecursiveTreeWalker):
    def getNodeDetails(self, node):
        if node.nodeType == Node.DOCUMENT_TYPE_NODE:
            return base.DOCTYPE, node.name, node.publicId, node.systemId

        elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
            return base.TEXT, node.nodeValue

        elif node.nodeType == Node.ELEMENT_NODE:
            attrs = {}
            for attr in list(node.attributes.keys()):
                attr = node.getAttributeNode(attr)
                if attr.namespaceURI:
                    attrs[(attr.namespaceURI, attr.localName)] = attr.value
                else:
                    attrs[(None, attr.name)] = attr.value
            return (base.ELEMENT, node.namespaceURI, node.nodeName,
                    attrs, node.hasChildNodes())

        elif node.nodeType == Node.COMMENT_NODE:
            return base.COMMENT, node.nodeValue

        elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
            return (base.DOCUMENT,)

        else:
            return base.UNKNOWN, node.nodeType

    def getFirstChild(self, node):
        return node.firstChild

    def getNextSibling(self, node):
        return node.nextSibling

    def getParentNode(self, node):
        return node.parentNode
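A minimal sketch walking a minidom tree, assuming the vendored html5lib is importable as a top-level package. The document node itself emits no token; only its descendants do:

    import xml.dom.minidom
    from html5lib.treewalkers import getTreeWalker

    doc = xml.dom.minidom.parseString("<p>hi</p>")
    for token in getTreeWalker("dom")(doc):
        print(token["type"], token.get("name"))
    # StartTag p / Characters None / EndTag p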
@ -0,0 +1,131 @@
from __future__ import absolute_import, division, unicode_literals

from collections import OrderedDict
import re

from six import string_types

from . import base
from .._utils import moduleFactoryFactory

tag_regexp = re.compile("{([^}]*)}(.*)")


def getETreeBuilder(ElementTreeImplementation):
    ElementTree = ElementTreeImplementation
    ElementTreeCommentType = ElementTree.Comment("asd").tag

    class TreeWalker(base.NonRecursiveTreeWalker):  # pylint:disable=unused-variable
        """Given the particular ElementTree representation, this implementation,
        to avoid using recursion, returns "nodes" as tuples with the following
        content:

        1. The current element

        2. The index of the element relative to its parent

        3. A stack of ancestor elements

        4. A flag "text", "tail" or None to indicate if the current node is a
           text node; either the text or tail of the current element (1)
        """
        def getNodeDetails(self, node):
            if isinstance(node, tuple):  # It might be the root Element
                elt, _, _, flag = node
                if flag in ("text", "tail"):
                    return base.TEXT, getattr(elt, flag)
                else:
                    node = elt

            if not hasattr(node, "tag"):
                node = node.getroot()

            if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
                return (base.DOCUMENT,)

            elif node.tag == "<!DOCTYPE>":
                return (base.DOCTYPE, node.text,
                        node.get("publicId"), node.get("systemId"))

            elif node.tag == ElementTreeCommentType:
                return base.COMMENT, node.text

            else:
                assert isinstance(node.tag, string_types), type(node.tag)
                # This is assumed to be an ordinary element
                match = tag_regexp.match(node.tag)
                if match:
                    namespace, tag = match.groups()
                else:
                    namespace = None
                    tag = node.tag
                attrs = OrderedDict()
                for name, value in list(node.attrib.items()):
                    match = tag_regexp.match(name)
                    if match:
                        attrs[(match.group(1), match.group(2))] = value
                    else:
                        attrs[(None, name)] = value
                return (base.ELEMENT, namespace, tag,
                        attrs, len(node) or node.text)

        def getFirstChild(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                element, key, parents, flag = node, None, [], None

            if flag in ("text", "tail"):
                return None
            else:
                if element.text:
                    return element, key, parents, "text"
                elif len(element):
                    parents.append(element)
                    return element[0], 0, parents, None
                else:
                    return None

        def getNextSibling(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                return None

            if flag == "text":
                if len(element):
                    parents.append(element)
                    return element[0], 0, parents, None
                else:
                    return None
            else:
                if element.tail and flag != "tail":
                    return element, key, parents, "tail"
                elif key < len(parents[-1]) - 1:
                    return parents[-1][key + 1], key + 1, parents, None
                else:
                    return None

        def getParentNode(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                return None

            if flag == "text":
                if not parents:
                    return element
                else:
                    return element, key, parents, None
            else:
                parent = parents.pop()
                if not parents:
                    return parent
                else:
                    assert list(parents[-1]).count(parent) == 1
                    return parent, list(parents[-1]).index(parent), parents, None

    return locals()


getETreeModule = moduleFactoryFactory(getETreeBuilder)
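A minimal sketch of the etree walker over a plain ElementTree element, assuming the vendored html5lib is importable as a top-level package. Note how text and tail both surface as Characters tokens:

    import xml.etree.ElementTree as ElementTree
    from html5lib.treewalkers import getTreeWalker

    root = ElementTree.fromstring("<div><p>a</p>tail</div>")
    walker = getTreeWalker("etree", implementation=ElementTree)
    for token in walker(root):
        print(token)
    # StartTag div, StartTag p, Characters a, EndTag p,
    # Characters tail, EndTag div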
@ -0,0 +1,215 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type

from collections import OrderedDict

from lxml import etree
from ..treebuilders.etree import tag_regexp

from . import base

from .. import _ihatexml


def ensure_str(s):
    if s is None:
        return None
    elif isinstance(s, text_type):
        return s
    else:
        return s.decode("ascii", "strict")


class Root(object):
    def __init__(self, et):
        self.elementtree = et
        self.children = []

        try:
            if et.docinfo.internalDTD:
                self.children.append(Doctype(self,
                                             ensure_str(et.docinfo.root_name),
                                             ensure_str(et.docinfo.public_id),
                                             ensure_str(et.docinfo.system_url)))
        except AttributeError:
            pass

        try:
            node = et.getroot()
        except AttributeError:
            node = et

        while node.getprevious() is not None:
            node = node.getprevious()
        while node is not None:
            self.children.append(node)
            node = node.getnext()

        self.text = None
        self.tail = None

    def __getitem__(self, key):
        return self.children[key]

    def getnext(self):
        return None

    def __len__(self):
        return 1


class Doctype(object):
    def __init__(self, root_node, name, public_id, system_id):
        self.root_node = root_node
        self.name = name
        self.public_id = public_id
        self.system_id = system_id

        self.text = None
        self.tail = None

    def getnext(self):
        return self.root_node.children[1]


class FragmentRoot(Root):
    def __init__(self, children):
        self.children = [FragmentWrapper(self, child) for child in children]
        self.text = self.tail = None

    def getnext(self):
        return None


class FragmentWrapper(object):
    def __init__(self, fragment_root, obj):
        self.root_node = fragment_root
        self.obj = obj
        if hasattr(self.obj, 'text'):
            self.text = ensure_str(self.obj.text)
        else:
            self.text = None
        if hasattr(self.obj, 'tail'):
            self.tail = ensure_str(self.obj.tail)
        else:
            self.tail = None

    def __getattr__(self, name):
        return getattr(self.obj, name)

    def getnext(self):
        siblings = self.root_node.children
        idx = siblings.index(self)
        if idx < len(siblings) - 1:
            return siblings[idx + 1]
        else:
            return None

    def __getitem__(self, key):
        return self.obj[key]

    def __bool__(self):
        return bool(self.obj)

    def getparent(self):
        return None

    def __str__(self):
        return str(self.obj)

    def __unicode__(self):
        return str(self.obj)

    def __len__(self):
        return len(self.obj)


class TreeWalker(base.NonRecursiveTreeWalker):
    def __init__(self, tree):
        # pylint:disable=redefined-variable-type
        if isinstance(tree, list):
            self.fragmentChildren = set(tree)
            tree = FragmentRoot(tree)
        else:
            self.fragmentChildren = set()
            tree = Root(tree)
        base.NonRecursiveTreeWalker.__init__(self, tree)
        self.filter = _ihatexml.InfosetFilter()

    def getNodeDetails(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
            return base.TEXT, ensure_str(getattr(node, key))

        elif isinstance(node, Root):
            return (base.DOCUMENT,)

        elif isinstance(node, Doctype):
            return base.DOCTYPE, node.name, node.public_id, node.system_id

        elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
            return base.TEXT, ensure_str(node.obj)

        elif node.tag == etree.Comment:
            return base.COMMENT, ensure_str(node.text)

        elif node.tag == etree.Entity:
            return base.ENTITY, ensure_str(node.text)[1:-1]  # strip &;

        else:
            # This is assumed to be an ordinary element
            match = tag_regexp.match(ensure_str(node.tag))
            if match:
                namespace, tag = match.groups()
            else:
                namespace = None
                tag = ensure_str(node.tag)
            attrs = OrderedDict()
            for name, value in list(node.attrib.items()):
                name = ensure_str(name)
                value = ensure_str(value)
                match = tag_regexp.match(name)
                if match:
                    attrs[(match.group(1), match.group(2))] = value
                else:
                    attrs[(None, name)] = value
            return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
                    attrs, len(node) > 0 or node.text)

    def getFirstChild(self, node):
        assert not isinstance(node, tuple), "Text nodes have no children"

        assert len(node) or node.text, "Node has no children"
        if node.text:
            return (node, "text")
        else:
            return node[0]

    def getNextSibling(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
            if key == "text":
                # XXX: we cannot use a "bool(node) and node[0] or None" construct here
                # because node[0] might evaluate to False if it has no child element
                if len(node):
                    return node[0]
                else:
                    return None
            else:  # tail
                return node.getnext()

        return (node, "tail") if node.tail else node.getnext()

    def getParentNode(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
            if key == "text":
                return node
            # else: fallback to "normal" processing
        elif node in self.fragmentChildren:
            return None

        return node.getparent()
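A minimal sketch pairing the lxml treebuilder with this walker, assuming the vendored html5lib and lxml are both importable:

    import html5lib
    from html5lib.treewalkers import getTreeWalker

    tree = html5lib.parse("<p>hi</p>", treebuilder="lxml")
    for token in getTreeWalker("lxml")(tree):
        print(token["type"])
    # A Doctype token appears only if the document declared one.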
@ -0,0 +1,69 @@
from __future__ import absolute_import, division, unicode_literals

from genshi.core import QName
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT

from . import base

from ..constants import voidElements, namespaces


class TreeWalker(base.TreeWalker):
    def __iter__(self):
        # Buffer the events so we can pass in the following one
        previous = None
        for event in self.tree:
            if previous is not None:
                for token in self.tokens(previous, event):
                    yield token
            previous = event

        # Don't forget the final event!
        if previous is not None:
            for token in self.tokens(previous, None):
                yield token

    def tokens(self, event, next):
        kind, data, _ = event
        if kind == START:
            tag, attribs = data
            name = tag.localname
            namespace = tag.namespace
            converted_attribs = {}
            for k, v in attribs:
                if isinstance(k, QName):
                    converted_attribs[(k.namespace, k.localname)] = v
                else:
                    converted_attribs[(None, k)] = v

            if namespace == namespaces["html"] and name in voidElements:
                for token in self.emptyTag(namespace, name, converted_attribs,
                                           not next or next[0] != END or
                                           next[1] != tag):
                    yield token
            else:
                yield self.startTag(namespace, name, converted_attribs)

        elif kind == END:
            name = data.localname
            namespace = data.namespace
            if namespace != namespaces["html"] or name not in voidElements:
                yield self.endTag(namespace, name)

        elif kind == COMMENT:
            yield self.comment(data)

        elif kind == TEXT:
            for token in self.text(data):
                yield token

        elif kind == DOCTYPE:
            yield self.doctype(*data)

        elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS,
                      START_CDATA, END_CDATA, PI):
            pass

        else:
            yield self.unknown(kind)
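A minimal sketch, assuming Genshi is installed (it is an optional dependency, not vendored here) and that genshi.input.HTML is available to build an event stream:

    from genshi.input import HTML
    from html5lib.treewalkers import getTreeWalker

    stream = HTML(u"<p>hi</p>")  # a Genshi event stream
    for token in getTreeWalker("genshi")(stream):
        print(token["type"], token.get("name"))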
@ -0,0 +1,4 @@
#!/bin/bash


pip install --no-binary all --no-compile --no-deps -r vendor.txt --target .
3
venv/Lib/site-packages/bleach/_vendor/vendor.txt
Normal file
@ -0,0 +1,3 @@
html5lib==1.1 \
    --hash=sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d \
    --hash=sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f