Created starter files for the project.

This commit is contained in:
Batuhan Berk Başoğlu 2020-10-02 21:26:03 -04:00
commit 73f0c0db42
1992 changed files with 769897 additions and 0 deletions

View file

@ -0,0 +1,154 @@
"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.
To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method which takes a tree as sole argument and
returns an iterator which generates tokens.
"""
from __future__ import absolute_import, division, unicode_literals
from .. import constants
from .._utils import default_etree
__all__ = ["getTreeWalker", "pprint"]
treeWalkerCache = {}
def getTreeWalker(treeType, implementation=None, **kwargs):
"""Get a TreeWalker class for various types of tree with built-in support
:arg str treeType: the name of the tree type required (case-insensitive).
Supported values are:
* "dom": The xml.dom.minidom DOM implementation
* "etree": A generic walker for tree implementations exposing an
elementtree-like interface (known to work with ElementTree,
cElementTree and lxml.etree).
* "lxml": Optimized walker for lxml.etree
* "genshi": a Genshi stream
:arg implementation: A module implementing the tree type e.g.
xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
tree type only).
:arg kwargs: keyword arguments passed to the etree walker--for other
walkers, this has no effect
:returns: a TreeWalker class
"""
treeType = treeType.lower()
if treeType not in treeWalkerCache:
if treeType == "dom":
from . import dom
treeWalkerCache[treeType] = dom.TreeWalker
elif treeType == "genshi":
from . import genshi
treeWalkerCache[treeType] = genshi.TreeWalker
elif treeType == "lxml":
from . import etree_lxml
treeWalkerCache[treeType] = etree_lxml.TreeWalker
elif treeType == "etree":
from . import etree
if implementation is None:
implementation = default_etree
# XXX: NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeWalker
return treeWalkerCache.get(treeType)
def concatenateCharacterTokens(tokens):
pendingCharacters = []
for token in tokens:
type = token["type"]
if type in ("Characters", "SpaceCharacters"):
pendingCharacters.append(token["data"])
else:
if pendingCharacters:
yield {"type": "Characters", "data": "".join(pendingCharacters)}
pendingCharacters = []
yield token
if pendingCharacters:
yield {"type": "Characters", "data": "".join(pendingCharacters)}
def pprint(walker):
"""Pretty printer for tree walkers
Takes a TreeWalker instance and pretty prints the output of walking the tree.
:arg walker: a TreeWalker instance
"""
output = []
indent = 0
for token in concatenateCharacterTokens(walker):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
# tag name
if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
if token["namespace"] in constants.prefixes:
ns = constants.prefixes[token["namespace"]]
else:
ns = token["namespace"]
name = "%s %s" % (ns, token["name"])
else:
name = token["name"]
output.append("%s<%s>" % (" " * indent, name))
indent += 2
# attributes (sorted for consistent ordering)
attrs = token["data"]
for (namespace, localname), value in sorted(attrs.items()):
if namespace:
if namespace in constants.prefixes:
ns = constants.prefixes[namespace]
else:
ns = namespace
name = "%s %s" % (ns, localname)
else:
name = localname
output.append("%s%s=\"%s\"" % (" " * indent, name, value))
# self-closing
if type == "EmptyTag":
indent -= 2
elif type == "EndTag":
indent -= 2
elif type == "Comment":
output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
elif type == "Doctype":
if token["name"]:
if token["publicId"]:
output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
(" " * indent,
token["name"],
token["publicId"],
token["systemId"] if token["systemId"] else ""))
elif token["systemId"]:
output.append("""%s<!DOCTYPE %s "" "%s">""" %
(" " * indent,
token["name"],
token["systemId"]))
else:
output.append("%s<!DOCTYPE %s>" % (" " * indent,
token["name"]))
else:
output.append("%s<!DOCTYPE >" % (" " * indent,))
elif type == "Characters":
output.append("%s\"%s\"" % (" " * indent, token["data"]))
elif type == "SpaceCharacters":
assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
else:
raise ValueError("Unknown token type, %s" % type)
return "\n".join(output)

View file

@ -0,0 +1,252 @@
from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node
from ..constants import namespaces, voidElements, spaceCharacters
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
"TreeWalker", "NonRecursiveTreeWalker"]
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"
spaceCharacters = "".join(spaceCharacters)
class TreeWalker(object):
"""Walks a tree yielding tokens
Tokens are dicts that all have a ``type`` field specifying the type of the
token.
"""
def __init__(self, tree):
"""Creates a TreeWalker
:arg tree: the tree to walk
"""
self.tree = tree
def __iter__(self):
raise NotImplementedError
def error(self, msg):
"""Generates an error token with the given message
:arg msg: the error message
:returns: SerializeError token
"""
return {"type": "SerializeError", "data": msg}
def emptyTag(self, namespace, name, attrs, hasChildren=False):
"""Generates an EmptyTag token
:arg namespace: the namespace of the token--can be ``None``
:arg name: the name of the element
:arg attrs: the attributes of the element as a dict
:arg hasChildren: whether or not to yield a SerializationError because
this tag shouldn't have children
:returns: EmptyTag token
"""
yield {"type": "EmptyTag", "name": name,
"namespace": namespace,
"data": attrs}
if hasChildren:
yield self.error("Void element has children")
def startTag(self, namespace, name, attrs):
"""Generates a StartTag token
:arg namespace: the namespace of the token--can be ``None``
:arg name: the name of the element
:arg attrs: the attributes of the element as a dict
:returns: StartTag token
"""
return {"type": "StartTag",
"name": name,
"namespace": namespace,
"data": attrs}
def endTag(self, namespace, name):
"""Generates an EndTag token
:arg namespace: the namespace of the token--can be ``None``
:arg name: the name of the element
:returns: EndTag token
"""
return {"type": "EndTag",
"name": name,
"namespace": namespace}
def text(self, data):
"""Generates SpaceCharacters and Characters tokens
Depending on what's in the data, this generates one or more
``SpaceCharacters`` and ``Characters`` tokens.
For example:
>>> from html5lib.treewalkers.base import TreeWalker
>>> # Give it an empty tree just so it instantiates
>>> walker = TreeWalker([])
>>> list(walker.text(''))
[]
>>> list(walker.text(' '))
[{u'data': ' ', u'type': u'SpaceCharacters'}]
>>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE
[{u'data': ' ', u'type': u'SpaceCharacters'},
{u'data': u'abc', u'type': u'Characters'},
{u'data': u' ', u'type': u'SpaceCharacters'}]
:arg data: the text data
:returns: one or more ``SpaceCharacters`` and ``Characters`` tokens
"""
data = data
middle = data.lstrip(spaceCharacters)
left = data[:len(data) - len(middle)]
if left:
yield {"type": "SpaceCharacters", "data": left}
data = middle
middle = data.rstrip(spaceCharacters)
right = data[len(middle):]
if middle:
yield {"type": "Characters", "data": middle}
if right:
yield {"type": "SpaceCharacters", "data": right}
def comment(self, data):
"""Generates a Comment token
:arg data: the comment
:returns: Comment token
"""
return {"type": "Comment", "data": data}
def doctype(self, name, publicId=None, systemId=None):
"""Generates a Doctype token
:arg name:
:arg publicId:
:arg systemId:
:returns: the Doctype token
"""
return {"type": "Doctype",
"name": name,
"publicId": publicId,
"systemId": systemId}
def entity(self, name):
"""Generates an Entity token
:arg name: the entity name
:returns: an Entity token
"""
return {"type": "Entity", "name": name}
def unknown(self, nodeType):
"""Handles unknown node types"""
return self.error("Unknown node type: " + nodeType)
class NonRecursiveTreeWalker(TreeWalker):
def getNodeDetails(self, node):
raise NotImplementedError
def getFirstChild(self, node):
raise NotImplementedError
def getNextSibling(self, node):
raise NotImplementedError
def getParentNode(self, node):
raise NotImplementedError
def __iter__(self):
currentNode = self.tree
while currentNode is not None:
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
hasChildren = False
if type == DOCTYPE:
yield self.doctype(*details)
elif type == TEXT:
for token in self.text(*details):
yield token
elif type == ELEMENT:
namespace, name, attributes, hasChildren = details
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
for token in self.emptyTag(namespace, name, attributes,
hasChildren):
yield token
hasChildren = False
else:
yield self.startTag(namespace, name, attributes)
elif type == COMMENT:
yield self.comment(details[0])
elif type == ENTITY:
yield self.entity(details[0])
elif type == DOCUMENT:
hasChildren = True
else:
yield self.unknown(details[0])
if hasChildren:
firstChild = self.getFirstChild(currentNode)
else:
firstChild = None
if firstChild is not None:
currentNode = firstChild
else:
while currentNode is not None:
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
if type == ELEMENT:
namespace, name, attributes, hasChildren = details
if (namespace and namespace != namespaces["html"]) or name not in voidElements:
yield self.endTag(namespace, name)
if self.tree is currentNode:
currentNode = None
break
nextSibling = self.getNextSibling(currentNode)
if nextSibling is not None:
currentNode = nextSibling
break
else:
currentNode = self.getParentNode(currentNode)

View file

@ -0,0 +1,43 @@
from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node
from . import base
class TreeWalker(base.NonRecursiveTreeWalker):
def getNodeDetails(self, node):
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
return base.DOCTYPE, node.name, node.publicId, node.systemId
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
return base.TEXT, node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE:
attrs = {}
for attr in list(node.attributes.keys()):
attr = node.getAttributeNode(attr)
if attr.namespaceURI:
attrs[(attr.namespaceURI, attr.localName)] = attr.value
else:
attrs[(None, attr.name)] = attr.value
return (base.ELEMENT, node.namespaceURI, node.nodeName,
attrs, node.hasChildNodes())
elif node.nodeType == Node.COMMENT_NODE:
return base.COMMENT, node.nodeValue
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
return (base.DOCUMENT,)
else:
return base.UNKNOWN, node.nodeType
def getFirstChild(self, node):
return node.firstChild
def getNextSibling(self, node):
return node.nextSibling
def getParentNode(self, node):
return node.parentNode

View file

@ -0,0 +1,131 @@
from __future__ import absolute_import, division, unicode_literals
from collections import OrderedDict
import re
from pip._vendor.six import string_types
from . import base
from .._utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)")
def getETreeBuilder(ElementTreeImplementation):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
class TreeWalker(base.NonRecursiveTreeWalker): # pylint:disable=unused-variable
"""Given the particular ElementTree representation, this implementation,
to avoid using recursion, returns "nodes" as tuples with the following
content:
1. The current element
2. The index of the element relative to its parent
3. A stack of ancestor elements
4. A flag "text", "tail" or None to indicate if the current node is a
text node; either the text or tail of the current element (1)
"""
def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element
elt, _, _, flag = node
if flag in ("text", "tail"):
return base.TEXT, getattr(elt, flag)
else:
node = elt
if not(hasattr(node, "tag")):
node = node.getroot()
if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
return (base.DOCUMENT,)
elif node.tag == "<!DOCTYPE>":
return (base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
elif node.tag == ElementTreeCommentType:
return base.COMMENT, node.text
else:
assert isinstance(node.tag, string_types), type(node.tag)
# This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = node.tag
attrs = OrderedDict()
for name, value in list(node.attrib.items()):
match = tag_regexp.match(name)
if match:
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
return (base.ELEMENT, namespace, tag,
attrs, len(node) or node.text)
def getFirstChild(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
element, key, parents, flag = node, None, [], None
if flag in ("text", "tail"):
return None
else:
if element.text:
return element, key, parents, "text"
elif len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
def getNextSibling(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
return None
if flag == "text":
if len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
else:
if element.tail and flag != "tail":
return element, key, parents, "tail"
elif key < len(parents[-1]) - 1:
return parents[-1][key + 1], key + 1, parents, None
else:
return None
def getParentNode(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
return None
if flag == "text":
if not parents:
return element
else:
return element, key, parents, None
else:
parent = parents.pop()
if not parents:
return parent
else:
assert list(parents[-1]).count(parent) == 1
return parent, list(parents[-1]).index(parent), parents, None
return locals()
getETreeModule = moduleFactoryFactory(getETreeBuilder)

View file

@ -0,0 +1,215 @@
from __future__ import absolute_import, division, unicode_literals
from pip._vendor.six import text_type
from collections import OrderedDict
from lxml import etree
from ..treebuilders.etree import tag_regexp
from . import base
from .. import _ihatexml
def ensure_str(s):
if s is None:
return None
elif isinstance(s, text_type):
return s
else:
return s.decode("ascii", "strict")
class Root(object):
def __init__(self, et):
self.elementtree = et
self.children = []
try:
if et.docinfo.internalDTD:
self.children.append(Doctype(self,
ensure_str(et.docinfo.root_name),
ensure_str(et.docinfo.public_id),
ensure_str(et.docinfo.system_url)))
except AttributeError:
pass
try:
node = et.getroot()
except AttributeError:
node = et
while node.getprevious() is not None:
node = node.getprevious()
while node is not None:
self.children.append(node)
node = node.getnext()
self.text = None
self.tail = None
def __getitem__(self, key):
return self.children[key]
def getnext(self):
return None
def __len__(self):
return 1
class Doctype(object):
def __init__(self, root_node, name, public_id, system_id):
self.root_node = root_node
self.name = name
self.public_id = public_id
self.system_id = system_id
self.text = None
self.tail = None
def getnext(self):
return self.root_node.children[1]
class FragmentRoot(Root):
def __init__(self, children):
self.children = [FragmentWrapper(self, child) for child in children]
self.text = self.tail = None
def getnext(self):
return None
class FragmentWrapper(object):
def __init__(self, fragment_root, obj):
self.root_node = fragment_root
self.obj = obj
if hasattr(self.obj, 'text'):
self.text = ensure_str(self.obj.text)
else:
self.text = None
if hasattr(self.obj, 'tail'):
self.tail = ensure_str(self.obj.tail)
else:
self.tail = None
def __getattr__(self, name):
return getattr(self.obj, name)
def getnext(self):
siblings = self.root_node.children
idx = siblings.index(self)
if idx < len(siblings) - 1:
return siblings[idx + 1]
else:
return None
def __getitem__(self, key):
return self.obj[key]
def __bool__(self):
return bool(self.obj)
def getparent(self):
return None
def __str__(self):
return str(self.obj)
def __unicode__(self):
return str(self.obj)
def __len__(self):
return len(self.obj)
class TreeWalker(base.NonRecursiveTreeWalker):
def __init__(self, tree):
# pylint:disable=redefined-variable-type
if isinstance(tree, list):
self.fragmentChildren = set(tree)
tree = FragmentRoot(tree)
else:
self.fragmentChildren = set()
tree = Root(tree)
base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = _ihatexml.InfosetFilter()
def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
return base.TEXT, ensure_str(getattr(node, key))
elif isinstance(node, Root):
return (base.DOCUMENT,)
elif isinstance(node, Doctype):
return base.DOCTYPE, node.name, node.public_id, node.system_id
elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
return base.TEXT, ensure_str(node.obj)
elif node.tag == etree.Comment:
return base.COMMENT, ensure_str(node.text)
elif node.tag == etree.Entity:
return base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
else:
# This is assumed to be an ordinary element
match = tag_regexp.match(ensure_str(node.tag))
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = ensure_str(node.tag)
attrs = OrderedDict()
for name, value in list(node.attrib.items()):
name = ensure_str(name)
value = ensure_str(value)
match = tag_regexp.match(name)
if match:
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
attrs, len(node) > 0 or node.text)
def getFirstChild(self, node):
assert not isinstance(node, tuple), "Text nodes have no children"
assert len(node) or node.text, "Node has no children"
if node.text:
return (node, "text")
else:
return node[0]
def getNextSibling(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
if key == "text":
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
# because node[0] might evaluate to False if it has no child element
if len(node):
return node[0]
else:
return None
else: # tail
return node.getnext()
return (node, "tail") if node.tail else node.getnext()
def getParentNode(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
if key == "text":
return node
# else: fallback to "normal" processing
elif node in self.fragmentChildren:
return None
return node.getparent()

View file

@ -0,0 +1,69 @@
from __future__ import absolute_import, division, unicode_literals
from genshi.core import QName
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from . import base
from ..constants import voidElements, namespaces
class TreeWalker(base.TreeWalker):
def __iter__(self):
# Buffer the events so we can pass in the following one
previous = None
for event in self.tree:
if previous is not None:
for token in self.tokens(previous, event):
yield token
previous = event
# Don't forget the final event!
if previous is not None:
for token in self.tokens(previous, None):
yield token
def tokens(self, event, next):
kind, data, _ = event
if kind == START:
tag, attribs = data
name = tag.localname
namespace = tag.namespace
converted_attribs = {}
for k, v in attribs:
if isinstance(k, QName):
converted_attribs[(k.namespace, k.localname)] = v
else:
converted_attribs[(None, k)] = v
if namespace == namespaces["html"] and name in voidElements:
for token in self.emptyTag(namespace, name, converted_attribs,
not next or next[0] != END or
next[1] != tag):
yield token
else:
yield self.startTag(namespace, name, converted_attribs)
elif kind == END:
name = data.localname
namespace = data.namespace
if namespace != namespaces["html"] or name not in voidElements:
yield self.endTag(namespace, name)
elif kind == COMMENT:
yield self.comment(data)
elif kind == TEXT:
for token in self.text(data):
yield token
elif kind == DOCTYPE:
yield self.doctype(*data)
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS,
START_CDATA, END_CDATA, PI):
pass
else:
yield self.unknown(kind)