Uploaded Test files

This commit is contained in:
Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions

View file

@ -0,0 +1,58 @@
r"""
Parso is a Python parser that supports error recovery and round-trip parsing
for different Python versions (in multiple Python versions). Parso is also able
to list multiple syntax errors in your python file.
Parso has been battle-tested by jedi_. It was pulled out of jedi to be useful
for other projects as well.
Parso consists of a small API to parse Python and analyse the syntax tree.
.. _jedi: https://github.com/davidhalter/jedi
A simple example:
>>> import parso
>>> module = parso.parse('hello + 1', version="3.6")
>>> expr = module.children[0]
>>> expr
PythonNode(arith_expr, [<Name: hello@1,0>, <Operator: +>, <Number: 1>])
>>> print(expr.get_code())
hello + 1
>>> name = expr.children[0]
>>> name
<Name: hello@1,0>
>>> name.end_pos
(1, 5)
>>> expr.end_pos
(1, 9)
To list multiple issues:
>>> grammar = parso.load_grammar()
>>> module = grammar.parse('foo +\nbar\ncontinue')
>>> error1, error2 = grammar.iter_errors(module)
>>> error1.message
'SyntaxError: invalid syntax'
>>> error2.message
"SyntaxError: 'continue' not properly in loop"
"""
from parso.parser import ParserSyntaxError
from parso.grammar import Grammar, load_grammar
from parso.utils import split_lines, python_bytes_to_unicode
__version__ = '0.7.1'
def parse(code=None, **kwargs):
"""
A utility function to avoid loading grammars.
Params are documented in :py:meth:`parso.Grammar.parse`.
:param str version: The version used by :py:func:`parso.load_grammar`.
"""
version = kwargs.pop('version', None)
grammar = load_grammar(version=version)
return grammar.parse(code, **kwargs)

View file

@ -0,0 +1,19 @@
from typing import Any, Optional, Union
from parso.grammar import Grammar as Grammar, load_grammar as load_grammar
from parso.parser import ParserSyntaxError as ParserSyntaxError
from parso.utils import python_bytes_to_unicode as python_bytes_to_unicode, split_lines as split_lines
__version__: str = ...
def parse(
code: Optional[Union[str, bytes]],
*,
version: Optional[str] = None,
error_recovery: bool = True,
path: Optional[str] = None,
start_symbol: Optional[str] = None,
cache: bool = False,
diff_cache: bool = False,
cache_path: Optional[str] = None,
) -> Any: ...

View file

@ -0,0 +1,101 @@
"""
To ensure compatibility from Python ``2.7`` - ``3.3``, a module has been
created. Clearly there is huge need to use conforming syntax.
"""
import os
import sys
import platform
# unicode function
try:
unicode = unicode
except NameError:
unicode = str
is_pypy = platform.python_implementation() == 'PyPy'
def use_metaclass(meta, *bases):
""" Create a class with a metaclass. """
if not bases:
bases = (object,)
return meta("HackClass", bases, {})
try:
encoding = sys.stdout.encoding
if encoding is None:
encoding = 'utf-8'
except AttributeError:
encoding = 'ascii'
def u(string):
"""Cast to unicode DAMMIT!
Written because Python2 repr always implicitly casts to a string, so we
have to cast back to a unicode (and we know that we always deal with valid
unicode, because we check that in the beginning).
"""
if sys.version_info.major >= 3:
return str(string)
if not isinstance(string, unicode):
return unicode(str(string), 'UTF-8')
return string
try:
# Python 3.3+
FileNotFoundError = FileNotFoundError
except NameError:
# Python 2.7 (both IOError + OSError)
FileNotFoundError = EnvironmentError
try:
# Python 3.3+
PermissionError = PermissionError
except NameError:
# Python 2.7 (both IOError + OSError)
PermissionError = EnvironmentError
def utf8_repr(func):
"""
``__repr__`` methods in Python 2 don't allow unicode objects to be
returned. Therefore cast them to utf-8 bytes in this decorator.
"""
def wrapper(self):
result = func(self)
if isinstance(result, unicode):
return result.encode('utf-8')
else:
return result
if sys.version_info.major >= 3:
return func
else:
return wrapper
if sys.version_info < (3, 5):
"""
A super-minimal shim around listdir that behave like
scandir for the information we need.
"""
class _DirEntry:
def __init__(self, name, basepath):
self.name = name
self.basepath = basepath
@property
def path(self):
return os.path.join(self.basepath, self.name)
def stat(self):
# won't follow symlinks
return os.lstat(os.path.join(self.basepath, self.name))
def scandir(dir):
return [_DirEntry(name, dir) for name in os.listdir(dir)]
else:
from os import scandir

View file

@ -0,0 +1,276 @@
import time
import os
import sys
import hashlib
import gc
import shutil
import platform
import errno
import logging
import warnings
try:
import cPickle as pickle
except:
import pickle
from parso._compatibility import FileNotFoundError, PermissionError, scandir
from parso.file_io import FileIO
LOG = logging.getLogger(__name__)
_CACHED_FILE_MINIMUM_SURVIVAL = 60 * 10 # 10 minutes
"""
Cached files should survive at least a few minutes.
"""
_CACHED_FILE_MAXIMUM_SURVIVAL = 60 * 60 * 24 * 30
"""
Maximum time for a cached file to survive if it is not
accessed within.
"""
_CACHED_SIZE_TRIGGER = 600
"""
This setting limits the amount of cached files. It's basically a way to start
garbage collection.
The reasoning for this limit being as big as it is, is the following:
Numpy, Pandas, Matplotlib and Tensorflow together use about 500 files. This
makes Jedi use ~500mb of memory. Since we might want a bit more than those few
libraries, we just increase it a bit.
"""
_PICKLE_VERSION = 33
"""
Version number (integer) for file system cache.
Increment this number when there are any incompatible changes in
the parser tree classes. For example, the following changes
are regarded as incompatible.
- A class name is changed.
- A class is moved to another module.
- A __slot__ of a class is changed.
"""
_VERSION_TAG = '%s-%s%s-%s' % (
platform.python_implementation(),
sys.version_info[0],
sys.version_info[1],
_PICKLE_VERSION
)
"""
Short name for distinguish Python implementations and versions.
It's like `sys.implementation.cache_tag` but for Python2
we generate something similar. See:
http://docs.python.org/3/library/sys.html#sys.implementation
"""
def _get_default_cache_path():
if platform.system().lower() == 'windows':
dir_ = os.path.join(os.getenv('LOCALAPPDATA')
or os.path.expanduser('~'), 'Parso', 'Parso')
elif platform.system().lower() == 'darwin':
dir_ = os.path.join('~', 'Library', 'Caches', 'Parso')
else:
dir_ = os.path.join(os.getenv('XDG_CACHE_HOME') or '~/.cache', 'parso')
return os.path.expanduser(dir_)
_default_cache_path = _get_default_cache_path()
"""
The path where the cache is stored.
On Linux, this defaults to ``~/.cache/parso/``, on OS X to
``~/Library/Caches/Parso/`` and on Windows to ``%LOCALAPPDATA%\\Parso\\Parso\\``.
On Linux, if environment variable ``$XDG_CACHE_HOME`` is set,
``$XDG_CACHE_HOME/parso`` is used instead of the default one.
"""
_CACHE_CLEAR_THRESHOLD = 60 * 60 * 24
def _get_cache_clear_lock(cache_path = None):
"""
The path where the cache lock is stored.
Cache lock will prevent continous cache clearing and only allow garbage
collection once a day (can be configured in _CACHE_CLEAR_THRESHOLD).
"""
cache_path = cache_path or _get_default_cache_path()
return FileIO(os.path.join(cache_path, "PARSO-CACHE-LOCK"))
parser_cache = {}
class _NodeCacheItem(object):
def __init__(self, node, lines, change_time=None):
self.node = node
self.lines = lines
if change_time is None:
change_time = time.time()
self.change_time = change_time
self.last_used = change_time
def load_module(hashed_grammar, file_io, cache_path=None):
"""
Returns a module or None, if it fails.
"""
p_time = file_io.get_last_modified()
if p_time is None:
return None
try:
module_cache_item = parser_cache[hashed_grammar][file_io.path]
if p_time <= module_cache_item.change_time:
module_cache_item.last_used = time.time()
return module_cache_item.node
except KeyError:
return _load_from_file_system(
hashed_grammar,
file_io.path,
p_time,
cache_path=cache_path
)
def _load_from_file_system(hashed_grammar, path, p_time, cache_path=None):
cache_path = _get_hashed_path(hashed_grammar, path, cache_path=cache_path)
try:
try:
if p_time > os.path.getmtime(cache_path):
# Cache is outdated
return None
except OSError as e:
if e.errno == errno.ENOENT:
# In Python 2 instead of an IOError here we get an OSError.
raise FileNotFoundError
else:
raise
with open(cache_path, 'rb') as f:
gc.disable()
try:
module_cache_item = pickle.load(f)
finally:
gc.enable()
except FileNotFoundError:
return None
else:
_set_cache_item(hashed_grammar, path, module_cache_item)
LOG.debug('pickle loaded: %s', path)
return module_cache_item.node
def _set_cache_item(hashed_grammar, path, module_cache_item):
if sum(len(v) for v in parser_cache.values()) >= _CACHED_SIZE_TRIGGER:
# Garbage collection of old cache files.
# We are basically throwing everything away that hasn't been accessed
# in 10 minutes.
cutoff_time = time.time() - _CACHED_FILE_MINIMUM_SURVIVAL
for key, path_to_item_map in parser_cache.items():
parser_cache[key] = {
path: node_item
for path, node_item in path_to_item_map.items()
if node_item.last_used > cutoff_time
}
parser_cache.setdefault(hashed_grammar, {})[path] = module_cache_item
def try_to_save_module(hashed_grammar, file_io, module, lines, pickling=True, cache_path=None):
path = file_io.path
try:
p_time = None if path is None else file_io.get_last_modified()
except OSError:
p_time = None
pickling = False
item = _NodeCacheItem(module, lines, p_time)
_set_cache_item(hashed_grammar, path, item)
if pickling and path is not None:
try:
_save_to_file_system(hashed_grammar, path, item, cache_path=cache_path)
except PermissionError:
# It's not really a big issue if the cache cannot be saved to the
# file system. It's still in RAM in that case. However we should
# still warn the user that this is happening.
warnings.warn(
'Tried to save a file to %s, but got permission denied.',
Warning
)
else:
_remove_cache_and_update_lock(cache_path=cache_path)
def _save_to_file_system(hashed_grammar, path, item, cache_path=None):
with open(_get_hashed_path(hashed_grammar, path, cache_path=cache_path), 'wb') as f:
pickle.dump(item, f, pickle.HIGHEST_PROTOCOL)
def clear_cache(cache_path=None):
if cache_path is None:
cache_path = _default_cache_path
shutil.rmtree(cache_path)
parser_cache.clear()
def clear_inactive_cache(
cache_path=None,
inactivity_threshold=_CACHED_FILE_MAXIMUM_SURVIVAL,
):
if cache_path is None:
cache_path = _get_default_cache_path()
if not os.path.exists(cache_path):
return False
for version_path in os.listdir(cache_path):
version_path = os.path.join(cache_path, version_path)
if not os.path.isdir(version_path):
continue
for file in scandir(version_path):
if (
file.stat().st_atime + _CACHED_FILE_MAXIMUM_SURVIVAL
<= time.time()
):
try:
os.remove(file.path)
except OSError: # silently ignore all failures
continue
else:
return True
def _remove_cache_and_update_lock(cache_path = None):
lock = _get_cache_clear_lock(cache_path=cache_path)
clear_lock_time = lock.get_last_modified()
if (
clear_lock_time is None # first time
or clear_lock_time + _CACHE_CLEAR_THRESHOLD <= time.time()
):
if not lock._touch():
# First make sure that as few as possible other cleanup jobs also
# get started. There is still a race condition but it's probably
# not a big problem.
return False
clear_inactive_cache(cache_path = cache_path)
def _get_hashed_path(hashed_grammar, path, cache_path=None):
directory = _get_cache_directory_path(cache_path=cache_path)
file_hash = hashlib.sha256(path.encode("utf-8")).hexdigest()
return os.path.join(directory, '%s-%s.pkl' % (hashed_grammar, file_hash))
def _get_cache_directory_path(cache_path=None):
if cache_path is None:
cache_path = _default_cache_path
directory = os.path.join(cache_path, _VERSION_TAG)
if not os.path.exists(directory):
os.makedirs(directory)
return directory

View file

@ -0,0 +1,47 @@
import os
from parso._compatibility import FileNotFoundError
class FileIO(object):
def __init__(self, path):
self.path = path
def read(self): # Returns bytes/str
# We would like to read unicode here, but we cannot, because we are not
# sure if it is a valid unicode file. Therefore just read whatever is
# here.
with open(self.path, 'rb') as f:
return f.read()
def get_last_modified(self):
"""
Returns float - timestamp or None, if path doesn't exist.
"""
try:
return os.path.getmtime(self.path)
except OSError:
# Might raise FileNotFoundError, OSError for Python 2
return None
def _touch(self):
try:
os.utime(self.path, None)
except FileNotFoundError:
try:
file = open(self.path, 'a')
file.close()
except (OSError, IOError): # TODO Maybe log this?
return False
return True
def __repr__(self):
return '%s(%s)' % (self.__class__.__name__, self.path)
class KnownContentFileIO(FileIO):
def __init__(self, path, content):
super(KnownContentFileIO, self).__init__(path)
self._content = content
def read(self):
return self._content

View file

@ -0,0 +1,260 @@
import hashlib
import os
from parso._compatibility import FileNotFoundError, is_pypy
from parso.pgen2 import generate_grammar
from parso.utils import split_lines, python_bytes_to_unicode, parse_version_string
from parso.python.diff import DiffParser
from parso.python.tokenize import tokenize_lines, tokenize
from parso.python.token import PythonTokenTypes
from parso.cache import parser_cache, load_module, try_to_save_module
from parso.parser import BaseParser
from parso.python.parser import Parser as PythonParser
from parso.python.errors import ErrorFinderConfig
from parso.python import pep8
from parso.file_io import FileIO, KnownContentFileIO
from parso.normalizer import RefactoringNormalizer
_loaded_grammars = {}
class Grammar(object):
"""
:py:func:`parso.load_grammar` returns instances of this class.
Creating custom none-python grammars by calling this is not supported, yet.
"""
#:param text: A BNF representation of your grammar.
_error_normalizer_config = None
_token_namespace = None
_default_normalizer_config = pep8.PEP8NormalizerConfig()
def __init__(self, text, tokenizer, parser=BaseParser, diff_parser=None):
self._pgen_grammar = generate_grammar(
text,
token_namespace=self._get_token_namespace()
)
self._parser = parser
self._tokenizer = tokenizer
self._diff_parser = diff_parser
self._hashed = hashlib.sha256(text.encode("utf-8")).hexdigest()
def parse(self, code=None, **kwargs):
"""
If you want to parse a Python file you want to start here, most likely.
If you need finer grained control over the parsed instance, there will be
other ways to access it.
:param str code: A unicode or bytes string. When it's not possible to
decode bytes to a string, returns a
:py:class:`UnicodeDecodeError`.
:param bool error_recovery: If enabled, any code will be returned. If
it is invalid, it will be returned as an error node. If disabled,
you will get a ParseError when encountering syntax errors in your
code.
:param str start_symbol: The grammar rule (nonterminal) that you want
to parse. Only allowed to be used when error_recovery is False.
:param str path: The path to the file you want to open. Only needed for caching.
:param bool cache: Keeps a copy of the parser tree in RAM and on disk
if a path is given. Returns the cached trees if the corresponding
files on disk have not changed. Note that this stores pickle files
on your file system (e.g. for Linux in ``~/.cache/parso/``).
:param bool diff_cache: Diffs the cached python module against the new
code and tries to parse only the parts that have changed. Returns
the same (changed) module that is found in cache. Using this option
requires you to not do anything anymore with the cached modules
under that path, because the contents of it might change. This
option is still somewhat experimental. If you want stability,
please don't use it.
:param bool cache_path: If given saves the parso cache in this
directory. If not given, defaults to the default cache places on
each platform.
:return: A subclass of :py:class:`parso.tree.NodeOrLeaf`. Typically a
:py:class:`parso.python.tree.Module`.
"""
if 'start_pos' in kwargs:
raise TypeError("parse() got an unexpected keyword argument.")
return self._parse(code=code, **kwargs)
def _parse(self, code=None, error_recovery=True, path=None,
start_symbol=None, cache=False, diff_cache=False,
cache_path=None, file_io=None, start_pos=(1, 0)):
"""
Wanted python3.5 * operator and keyword only arguments. Therefore just
wrap it all.
start_pos here is just a parameter internally used. Might be public
sometime in the future.
"""
if code is None and path is None and file_io is None:
raise TypeError("Please provide either code or a path.")
if start_symbol is None:
start_symbol = self._start_nonterminal
if error_recovery and start_symbol != 'file_input':
raise NotImplementedError("This is currently not implemented.")
if file_io is None:
if code is None:
file_io = FileIO(path)
else:
file_io = KnownContentFileIO(path, code)
if cache and file_io.path is not None:
module_node = load_module(self._hashed, file_io, cache_path=cache_path)
if module_node is not None:
return module_node
if code is None:
code = file_io.read()
code = python_bytes_to_unicode(code)
lines = split_lines(code, keepends=True)
if diff_cache:
if self._diff_parser is None:
raise TypeError("You have to define a diff parser to be able "
"to use this option.")
try:
module_cache_item = parser_cache[self._hashed][file_io.path]
except KeyError:
pass
else:
module_node = module_cache_item.node
old_lines = module_cache_item.lines
if old_lines == lines:
return module_node
new_node = self._diff_parser(
self._pgen_grammar, self._tokenizer, module_node
).update(
old_lines=old_lines,
new_lines=lines
)
try_to_save_module(self._hashed, file_io, new_node, lines,
# Never pickle in pypy, it's slow as hell.
pickling=cache and not is_pypy,
cache_path=cache_path)
return new_node
tokens = self._tokenizer(lines, start_pos=start_pos)
p = self._parser(
self._pgen_grammar,
error_recovery=error_recovery,
start_nonterminal=start_symbol
)
root_node = p.parse(tokens=tokens)
if cache or diff_cache:
try_to_save_module(self._hashed, file_io, root_node, lines,
# Never pickle in pypy, it's slow as hell.
pickling=cache and not is_pypy,
cache_path=cache_path)
return root_node
def _get_token_namespace(self):
ns = self._token_namespace
if ns is None:
raise ValueError("The token namespace should be set.")
return ns
def iter_errors(self, node):
"""
Given a :py:class:`parso.tree.NodeOrLeaf` returns a generator of
:py:class:`parso.normalizer.Issue` objects. For Python this is
a list of syntax/indentation errors.
"""
if self._error_normalizer_config is None:
raise ValueError("No error normalizer specified for this grammar.")
return self._get_normalizer_issues(node, self._error_normalizer_config)
def refactor(self, base_node, node_to_str_map):
return RefactoringNormalizer(node_to_str_map).walk(base_node)
def _get_normalizer(self, normalizer_config):
if normalizer_config is None:
normalizer_config = self._default_normalizer_config
if normalizer_config is None:
raise ValueError("You need to specify a normalizer, because "
"there's no default normalizer for this tree.")
return normalizer_config.create_normalizer(self)
def _normalize(self, node, normalizer_config=None):
"""
TODO this is not public, yet.
The returned code will be normalized, e.g. PEP8 for Python.
"""
normalizer = self._get_normalizer(normalizer_config)
return normalizer.walk(node)
def _get_normalizer_issues(self, node, normalizer_config=None):
normalizer = self._get_normalizer(normalizer_config)
normalizer.walk(node)
return normalizer.issues
def __repr__(self):
nonterminals = self._pgen_grammar.nonterminal_to_dfas.keys()
txt = ' '.join(list(nonterminals)[:3]) + ' ...'
return '<%s:%s>' % (self.__class__.__name__, txt)
class PythonGrammar(Grammar):
_error_normalizer_config = ErrorFinderConfig()
_token_namespace = PythonTokenTypes
_start_nonterminal = 'file_input'
def __init__(self, version_info, bnf_text):
super(PythonGrammar, self).__init__(
bnf_text,
tokenizer=self._tokenize_lines,
parser=PythonParser,
diff_parser=DiffParser
)
self.version_info = version_info
def _tokenize_lines(self, lines, **kwargs):
return tokenize_lines(lines, self.version_info, **kwargs)
def _tokenize(self, code):
# Used by Jedi.
return tokenize(code, self.version_info)
def load_grammar(**kwargs):
"""
Loads a :py:class:`parso.Grammar`. The default version is the current Python
version.
:param str version: A python version string, e.g. ``version='3.8'``.
:param str path: A path to a grammar file
"""
def load_grammar(language='python', version=None, path=None):
if language == 'python':
version_info = parse_version_string(version)
file = path or os.path.join(
'python',
'grammar%s%s.txt' % (version_info.major, version_info.minor)
)
global _loaded_grammars
path = os.path.join(os.path.dirname(__file__), file)
try:
return _loaded_grammars[path]
except KeyError:
try:
with open(path) as f:
bnf_text = f.read()
grammar = PythonGrammar(version_info, bnf_text)
return _loaded_grammars.setdefault(path, grammar)
except FileNotFoundError:
message = "Python version %s.%s is currently not supported." % (version_info.major, version_info.minor)
raise NotImplementedError(message)
else:
raise NotImplementedError("No support for language %s." % language)
return load_grammar(**kwargs)

View file

@ -0,0 +1,38 @@
from typing import Any, Callable, Generic, Optional, Sequence, TypeVar, Union
from typing_extensions import Literal
from parso.utils import PythonVersionInfo
_Token = Any
_NodeT = TypeVar("_NodeT")
class Grammar(Generic[_NodeT]):
_default_normalizer_config: Optional[Any] = ...
_error_normalizer_config: Optional[Any] = None
_start_nonterminal: str = ...
_token_namespace: Optional[str] = None
def __init__(
self,
text: str,
tokenizer: Callable[[Sequence[str], int], Sequence[_Token]],
parser: Any = ...,
diff_parser: Any = ...,
) -> None: ...
def parse(
self,
code: Union[str, bytes] = ...,
error_recovery: bool = ...,
path: Optional[str] = ...,
start_symbol: Optional[str] = ...,
cache: bool = ...,
diff_cache: bool = ...,
cache_path: Optional[str] = ...,
) -> _NodeT: ...
class PythonGrammar(Grammar):
version_info: PythonVersionInfo
def __init__(self, version_info: PythonVersionInfo, bnf_text: str) -> None: ...
def load_grammar(
language: Literal["python"] = "python", version: Optional[str] = ..., path: str = ...
) -> Grammar: ...

View file

@ -0,0 +1,203 @@
from contextlib import contextmanager
from parso._compatibility import use_metaclass
class _NormalizerMeta(type):
def __new__(cls, name, bases, dct):
new_cls = type.__new__(cls, name, bases, dct)
new_cls.rule_value_classes = {}
new_cls.rule_type_classes = {}
return new_cls
class Normalizer(use_metaclass(_NormalizerMeta)):
_rule_type_instances = {}
_rule_value_instances = {}
def __init__(self, grammar, config):
self.grammar = grammar
self._config = config
self.issues = []
self._rule_type_instances = self._instantiate_rules('rule_type_classes')
self._rule_value_instances = self._instantiate_rules('rule_value_classes')
def _instantiate_rules(self, attr):
dct = {}
for base in type(self).mro():
rules_map = getattr(base, attr, {})
for type_, rule_classes in rules_map.items():
new = [rule_cls(self) for rule_cls in rule_classes]
dct.setdefault(type_, []).extend(new)
return dct
def walk(self, node):
self.initialize(node)
value = self.visit(node)
self.finalize()
return value
def visit(self, node):
try:
children = node.children
except AttributeError:
return self.visit_leaf(node)
else:
with self.visit_node(node):
return ''.join(self.visit(child) for child in children)
@contextmanager
def visit_node(self, node):
self._check_type_rules(node)
yield
def _check_type_rules(self, node):
for rule in self._rule_type_instances.get(node.type, []):
rule.feed_node(node)
def visit_leaf(self, leaf):
self._check_type_rules(leaf)
for rule in self._rule_value_instances.get(leaf.value, []):
rule.feed_node(leaf)
return leaf.prefix + leaf.value
def initialize(self, node):
pass
def finalize(self):
pass
def add_issue(self, node, code, message):
issue = Issue(node, code, message)
if issue not in self.issues:
self.issues.append(issue)
return True
@classmethod
def register_rule(cls, **kwargs):
"""
Use it as a class decorator::
normalizer = Normalizer('grammar', 'config')
@normalizer.register_rule(value='foo')
class MyRule(Rule):
error_code = 42
"""
return cls._register_rule(**kwargs)
@classmethod
def _register_rule(cls, value=None, values=(), type=None, types=()):
values = list(values)
types = list(types)
if value is not None:
values.append(value)
if type is not None:
types.append(type)
if not values and not types:
raise ValueError("You must register at least something.")
def decorator(rule_cls):
for v in values:
cls.rule_value_classes.setdefault(v, []).append(rule_cls)
for t in types:
cls.rule_type_classes.setdefault(t, []).append(rule_cls)
return rule_cls
return decorator
class NormalizerConfig(object):
normalizer_class = Normalizer
def create_normalizer(self, grammar):
if self.normalizer_class is None:
return None
return self.normalizer_class(grammar, self)
class Issue(object):
def __init__(self, node, code, message):
self.code = code
"""
An integer code that stands for the type of error.
"""
self.message = message
"""
A message (string) for the issue.
"""
self.start_pos = node.start_pos
"""
The start position position of the error as a tuple (line, column). As
always in |parso| the first line is 1 and the first column 0.
"""
self.end_pos = node.end_pos
def __eq__(self, other):
return self.start_pos == other.start_pos and self.code == other.code
def __ne__(self, other):
return not self.__eq__(other)
def __hash__(self):
return hash((self.code, self.start_pos))
def __repr__(self):
return '<%s: %s>' % (self.__class__.__name__, self.code)
class Rule(object):
code = None
message = None
def __init__(self, normalizer):
self._normalizer = normalizer
def is_issue(self, node):
raise NotImplementedError()
def get_node(self, node):
return node
def _get_message(self, message, node):
if message is None:
message = self.message
if message is None:
raise ValueError("The message on the class is not set.")
return message
def add_issue(self, node, code=None, message=None):
if code is None:
code = self.code
if code is None:
raise ValueError("The error code on the class is not set.")
message = self._get_message(message, node)
self._normalizer.add_issue(node, code, message)
def feed_node(self, node):
if self.is_issue(node):
issue_node = self.get_node(node)
self.add_issue(issue_node)
class RefactoringNormalizer(Normalizer):
def __init__(self, node_to_str_map):
self._node_to_str_map = node_to_str_map
def visit(self, node):
try:
return self._node_to_str_map[node]
except KeyError:
return super(RefactoringNormalizer, self).visit(node)
def visit_leaf(self, leaf):
try:
return self._node_to_str_map[leaf]
except KeyError:
return super(RefactoringNormalizer, self).visit_leaf(leaf)

View file

@ -0,0 +1,211 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
# 99% of the code is different from pgen2, now.
"""
The ``Parser`` tries to convert the available Python code in an easy to read
format, something like an abstract syntax tree. The classes who represent this
tree, are sitting in the :mod:`parso.tree` module.
The Python module ``tokenize`` is a very important part in the ``Parser``,
because it splits the code into different words (tokens). Sometimes it looks a
bit messy. Sorry for that! You might ask now: "Why didn't you use the ``ast``
module for this? Well, ``ast`` does a very good job understanding proper Python
code, but fails to work as soon as there's a single line of broken code.
There's one important optimization that needs to be known: Statements are not
being parsed completely. ``Statement`` is just a representation of the tokens
within the statement. This lowers memory usage and cpu time and reduces the
complexity of the ``Parser`` (there's another parser sitting inside
``Statement``, which produces ``Array`` and ``Call``).
"""
from parso import tree
from parso.pgen2.generator import ReservedString
class ParserSyntaxError(Exception):
"""
Contains error information about the parser tree.
May be raised as an exception.
"""
def __init__(self, message, error_leaf):
self.message = message
self.error_leaf = error_leaf
class InternalParseError(Exception):
"""
Exception to signal the parser is stuck and error recovery didn't help.
Basically this shouldn't happen. It's a sign that something is really
wrong.
"""
def __init__(self, msg, type_, value, start_pos):
Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
(msg, type_.name, value, start_pos))
self.msg = msg
self.type = type
self.value = value
self.start_pos = start_pos
class Stack(list):
def _allowed_transition_names_and_token_types(self):
def iterate():
# An API just for Jedi.
for stack_node in reversed(self):
for transition in stack_node.dfa.transitions:
if isinstance(transition, ReservedString):
yield transition.value
else:
yield transition # A token type
if not stack_node.dfa.is_final:
break
return list(iterate())
class StackNode(object):
def __init__(self, dfa):
self.dfa = dfa
self.nodes = []
@property
def nonterminal(self):
return self.dfa.from_rule
def __repr__(self):
return '%s(%s, %s)' % (self.__class__.__name__, self.dfa, self.nodes)
def _token_to_transition(grammar, type_, value):
# Map from token to label
if type_.contains_syntax:
# Check for reserved words (keywords)
try:
return grammar.reserved_syntax_strings[value]
except KeyError:
pass
return type_
class BaseParser(object):
"""Parser engine.
A Parser instance contains state pertaining to the current token
sequence, and should not be used concurrently by different threads
to parse separate token sequences.
See python/tokenize.py for how to get input tokens by a string.
When a syntax error occurs, error_recovery() is called.
"""
node_map = {}
default_node = tree.Node
leaf_map = {
}
default_leaf = tree.Leaf
def __init__(self, pgen_grammar, start_nonterminal='file_input', error_recovery=False):
self._pgen_grammar = pgen_grammar
self._start_nonterminal = start_nonterminal
self._error_recovery = error_recovery
def parse(self, tokens):
first_dfa = self._pgen_grammar.nonterminal_to_dfas[self._start_nonterminal][0]
self.stack = Stack([StackNode(first_dfa)])
for token in tokens:
self._add_token(token)
while True:
tos = self.stack[-1]
if not tos.dfa.is_final:
# We never broke out -- EOF is too soon -- Unfinished statement.
# However, the error recovery might have added the token again, if
# the stack is empty, we're fine.
raise InternalParseError(
"incomplete input", token.type, token.string, token.start_pos
)
if len(self.stack) > 1:
self._pop()
else:
return self.convert_node(tos.nonterminal, tos.nodes)
def error_recovery(self, token):
if self._error_recovery:
raise NotImplementedError("Error Recovery is not implemented")
else:
type_, value, start_pos, prefix = token
error_leaf = tree.ErrorLeaf(type_, value, start_pos, prefix)
raise ParserSyntaxError('SyntaxError: invalid syntax', error_leaf)
def convert_node(self, nonterminal, children):
try:
node = self.node_map[nonterminal](children)
except KeyError:
node = self.default_node(nonterminal, children)
for c in children:
c.parent = node
return node
def convert_leaf(self, type_, value, prefix, start_pos):
try:
return self.leaf_map[type_](value, start_pos, prefix)
except KeyError:
return self.default_leaf(value, start_pos, prefix)
def _add_token(self, token):
"""
This is the only core function for parsing. Here happens basically
everything. Everything is well prepared by the parser generator and we
only apply the necessary steps here.
"""
grammar = self._pgen_grammar
stack = self.stack
type_, value, start_pos, prefix = token
transition = _token_to_transition(grammar, type_, value)
while True:
try:
plan = stack[-1].dfa.transitions[transition]
break
except KeyError:
if stack[-1].dfa.is_final:
self._pop()
else:
self.error_recovery(token)
return
except IndexError:
raise InternalParseError("too much input", type_, value, start_pos)
stack[-1].dfa = plan.next_dfa
for push in plan.dfa_pushes:
stack.append(StackNode(push))
leaf = self.convert_leaf(type_, value, prefix, start_pos)
stack[-1].nodes.append(leaf)
def _pop(self):
tos = self.stack.pop()
# If there's exactly one child, return that child instead of
# creating a new node. We still create expr_stmt and
# file_input though, because a lot of Jedi depends on its
# logic.
if len(tos.nodes) == 1:
new_node = tos.nodes[0]
else:
new_node = self.convert_node(tos.dfa.from_rule, tos.nodes)
self.stack[-1].nodes.append(new_node)

View file

@ -0,0 +1,10 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright 2006 Google, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Copyright 2014 David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
from parso.pgen2.generator import generate_grammar

View file

@ -0,0 +1 @@
from parso.pgen2.generator import generate_grammar as generate_grammar

View file

@ -0,0 +1,378 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
"""
This module defines the data structures used to represent a grammar.
Specifying grammars in pgen is possible with this grammar::
grammar: (NEWLINE | rule)* ENDMARKER
rule: NAME ':' rhs NEWLINE
rhs: items ('|' items)*
items: item+
item: '[' rhs ']' | atom ['+' | '*']
atom: '(' rhs ')' | NAME | STRING
This grammar is self-referencing.
This parser generator (pgen2) was created by Guido Rossum and used for lib2to3.
Most of the code has been refactored to make it more Pythonic. Since this was a
"copy" of the CPython Parser parser "pgen", there was some work needed to make
it more readable. It should also be slightly faster than the original pgen2,
because we made some optimizations.
"""
from ast import literal_eval
from parso.pgen2.grammar_parser import GrammarParser, NFAState
class Grammar(object):
"""
Once initialized, this class supplies the grammar tables for the
parsing engine implemented by parse.py. The parsing engine
accesses the instance variables directly.
The only important part in this parsers are dfas and transitions between
dfas.
"""
def __init__(self, start_nonterminal, rule_to_dfas, reserved_syntax_strings):
self.nonterminal_to_dfas = rule_to_dfas # Dict[str, List[DFAState]]
self.reserved_syntax_strings = reserved_syntax_strings
self.start_nonterminal = start_nonterminal
class DFAPlan(object):
"""
Plans are used for the parser to create stack nodes and do the proper
DFA state transitions.
"""
def __init__(self, next_dfa, dfa_pushes=[]):
self.next_dfa = next_dfa
self.dfa_pushes = dfa_pushes
def __repr__(self):
return '%s(%s, %s)' % (self.__class__.__name__, self.next_dfa, self.dfa_pushes)
class DFAState(object):
"""
The DFAState object is the core class for pretty much anything. DFAState
are the vertices of an ordered graph while arcs and transitions are the
edges.
Arcs are the initial edges, where most DFAStates are not connected and
transitions are then calculated to connect the DFA state machines that have
different nonterminals.
"""
def __init__(self, from_rule, nfa_set, final):
assert isinstance(nfa_set, set)
assert isinstance(next(iter(nfa_set)), NFAState)
assert isinstance(final, NFAState)
self.from_rule = from_rule
self.nfa_set = nfa_set
self.arcs = {} # map from terminals/nonterminals to DFAState
# In an intermediary step we set these nonterminal arcs (which has the
# same structure as arcs). These don't contain terminals anymore.
self.nonterminal_arcs = {}
# Transitions are basically the only thing that the parser is using
# with is_final. Everyting else is purely here to create a parser.
self.transitions = {} #: Dict[Union[TokenType, ReservedString], DFAPlan]
self.is_final = final in nfa_set
def add_arc(self, next_, label):
assert isinstance(label, str)
assert label not in self.arcs
assert isinstance(next_, DFAState)
self.arcs[label] = next_
def unifystate(self, old, new):
for label, next_ in self.arcs.items():
if next_ is old:
self.arcs[label] = new
def __eq__(self, other):
# Equality test -- ignore the nfa_set instance variable
assert isinstance(other, DFAState)
if self.is_final != other.is_final:
return False
# Can't just return self.arcs == other.arcs, because that
# would invoke this method recursively, with cycles...
if len(self.arcs) != len(other.arcs):
return False
for label, next_ in self.arcs.items():
if next_ is not other.arcs.get(label):
return False
return True
__hash__ = None # For Py3 compatibility.
def __repr__(self):
return '<%s: %s is_final=%s>' % (
self.__class__.__name__, self.from_rule, self.is_final
)
class ReservedString(object):
"""
Most grammars will have certain keywords and operators that are mentioned
in the grammar as strings (e.g. "if") and not token types (e.g. NUMBER).
This class basically is the former.
"""
def __init__(self, value):
self.value = value
def __repr__(self):
return '%s(%s)' % (self.__class__.__name__, self.value)
def _simplify_dfas(dfas):
"""
This is not theoretically optimal, but works well enough.
Algorithm: repeatedly look for two states that have the same
set of arcs (same labels pointing to the same nodes) and
unify them, until things stop changing.
dfas is a list of DFAState instances
"""
changes = True
while changes:
changes = False
for i, state_i in enumerate(dfas):
for j in range(i + 1, len(dfas)):
state_j = dfas[j]
if state_i == state_j:
#print " unify", i, j
del dfas[j]
for state in dfas:
state.unifystate(state_j, state_i)
changes = True
break
def _make_dfas(start, finish):
"""
Uses the powerset construction algorithm to create DFA states from sets of
NFA states.
Also does state reduction if some states are not needed.
"""
# To turn an NFA into a DFA, we define the states of the DFA
# to correspond to *sets* of states of the NFA. Then do some
# state reduction.
assert isinstance(start, NFAState)
assert isinstance(finish, NFAState)
def addclosure(nfa_state, base_nfa_set):
assert isinstance(nfa_state, NFAState)
if nfa_state in base_nfa_set:
return
base_nfa_set.add(nfa_state)
for nfa_arc in nfa_state.arcs:
if nfa_arc.nonterminal_or_string is None:
addclosure(nfa_arc.next, base_nfa_set)
base_nfa_set = set()
addclosure(start, base_nfa_set)
states = [DFAState(start.from_rule, base_nfa_set, finish)]
for state in states: # NB states grows while we're iterating
arcs = {}
# Find state transitions and store them in arcs.
for nfa_state in state.nfa_set:
for nfa_arc in nfa_state.arcs:
if nfa_arc.nonterminal_or_string is not None:
nfa_set = arcs.setdefault(nfa_arc.nonterminal_or_string, set())
addclosure(nfa_arc.next, nfa_set)
# Now create the dfa's with no None's in arcs anymore. All Nones have
# been eliminated and state transitions (arcs) are properly defined, we
# just need to create the dfa's.
for nonterminal_or_string, nfa_set in arcs.items():
for nested_state in states:
if nested_state.nfa_set == nfa_set:
# The DFA state already exists for this rule.
break
else:
nested_state = DFAState(start.from_rule, nfa_set, finish)
states.append(nested_state)
state.add_arc(nested_state, nonterminal_or_string)
return states # List of DFAState instances; first one is start
def _dump_nfa(start, finish):
print("Dump of NFA for", start.from_rule)
todo = [start]
for i, state in enumerate(todo):
print(" State", i, state is finish and "(final)" or "")
for arc in state.arcs:
label, next_ = arc.nonterminal_or_string, arc.next
if next_ in todo:
j = todo.index(next_)
else:
j = len(todo)
todo.append(next_)
if label is None:
print(" -> %d" % j)
else:
print(" %s -> %d" % (label, j))
def _dump_dfas(dfas):
print("Dump of DFA for", dfas[0].from_rule)
for i, state in enumerate(dfas):
print(" State", i, state.is_final and "(final)" or "")
for nonterminal, next_ in state.arcs.items():
print(" %s -> %d" % (nonterminal, dfas.index(next_)))
def generate_grammar(bnf_grammar, token_namespace):
"""
``bnf_text`` is a grammar in extended BNF (using * for repetition, + for
at-least-once repetition, [] for optional parts, | for alternatives and ()
for grouping).
It's not EBNF according to ISO/IEC 14977. It's a dialect Python uses in its
own parser.
"""
rule_to_dfas = {}
start_nonterminal = None
for nfa_a, nfa_z in GrammarParser(bnf_grammar).parse():
#_dump_nfa(nfa_a, nfa_z)
dfas = _make_dfas(nfa_a, nfa_z)
#_dump_dfas(dfas)
# oldlen = len(dfas)
_simplify_dfas(dfas)
# newlen = len(dfas)
rule_to_dfas[nfa_a.from_rule] = dfas
#print(nfa_a.from_rule, oldlen, newlen)
if start_nonterminal is None:
start_nonterminal = nfa_a.from_rule
reserved_strings = {}
for nonterminal, dfas in rule_to_dfas.items():
for dfa_state in dfas:
for terminal_or_nonterminal, next_dfa in dfa_state.arcs.items():
if terminal_or_nonterminal in rule_to_dfas:
dfa_state.nonterminal_arcs[terminal_or_nonterminal] = next_dfa
else:
transition = _make_transition(
token_namespace,
reserved_strings,
terminal_or_nonterminal
)
dfa_state.transitions[transition] = DFAPlan(next_dfa)
_calculate_tree_traversal(rule_to_dfas)
return Grammar(start_nonterminal, rule_to_dfas, reserved_strings)
def _make_transition(token_namespace, reserved_syntax_strings, label):
"""
Creates a reserved string ("if", "for", "*", ...) or returns the token type
(NUMBER, STRING, ...) for a given grammar terminal.
"""
if label[0].isalpha():
# A named token (e.g. NAME, NUMBER, STRING)
return getattr(token_namespace, label)
else:
# Either a keyword or an operator
assert label[0] in ('"', "'"), label
assert not label.startswith('"""') and not label.startswith("'''")
value = literal_eval(label)
try:
return reserved_syntax_strings[value]
except KeyError:
r = reserved_syntax_strings[value] = ReservedString(value)
return r
def _calculate_tree_traversal(nonterminal_to_dfas):
"""
By this point we know how dfas can move around within a stack node, but we
don't know how we can add a new stack node (nonterminal transitions).
"""
# Map from grammar rule (nonterminal) name to a set of tokens.
first_plans = {}
nonterminals = list(nonterminal_to_dfas.keys())
nonterminals.sort()
for nonterminal in nonterminals:
if nonterminal not in first_plans:
_calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal)
# Now that we have calculated the first terminals, we are sure that
# there is no left recursion.
for dfas in nonterminal_to_dfas.values():
for dfa_state in dfas:
transitions = dfa_state.transitions
for nonterminal, next_dfa in dfa_state.nonterminal_arcs.items():
for transition, pushes in first_plans[nonterminal].items():
if transition in transitions:
prev_plan = transitions[transition]
# Make sure these are sorted so that error messages are
# at least deterministic
choices = sorted([
(
prev_plan.dfa_pushes[0].from_rule
if prev_plan.dfa_pushes
else prev_plan.next_dfa.from_rule
),
(
pushes[0].from_rule
if pushes else next_dfa.from_rule
),
])
raise ValueError(
"Rule %s is ambiguous; given a %s token, we "
"can't determine if we should evaluate %s or %s."
% (
(
dfa_state.from_rule,
transition,
) + tuple(choices)
)
)
transitions[transition] = DFAPlan(next_dfa, pushes)
def _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal):
"""
Calculates the first plan in the first_plans dictionary for every given
nonterminal. This is going to be used to know when to create stack nodes.
"""
dfas = nonterminal_to_dfas[nonterminal]
new_first_plans = {}
first_plans[nonterminal] = None # dummy to detect left recursion
# We only need to check the first dfa. All the following ones are not
# interesting to find first terminals.
state = dfas[0]
for transition, next_ in state.transitions.items():
# It's a string. We have finally found a possible first token.
new_first_plans[transition] = [next_.next_dfa]
for nonterminal2, next_ in state.nonterminal_arcs.items():
# It's a nonterminal and we have either a left recursion issue
# in the grammar or we have to recurse.
try:
first_plans2 = first_plans[nonterminal2]
except KeyError:
first_plans2 = _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal2)
else:
if first_plans2 is None:
raise ValueError("left recursion for rule %r" % nonterminal)
for t, pushes in first_plans2.items():
new_first_plans[t] = [next_] + pushes
first_plans[nonterminal] = new_first_plans
return new_first_plans

View file

@ -0,0 +1,38 @@
from typing import Any, Generic, Mapping, Sequence, Set, TypeVar, Union
from parso.pgen2.grammar_parser import NFAState
_TokenTypeT = TypeVar("_TokenTypeT")
class Grammar(Generic[_TokenTypeT]):
nonterminal_to_dfas: Mapping[str, Sequence[DFAState[_TokenTypeT]]]
reserved_syntax_strings: Mapping[str, ReservedString]
start_nonterminal: str
def __init__(
self,
start_nonterminal: str,
rule_to_dfas: Mapping[str, Sequence[DFAState]],
reserved_syntax_strings: Mapping[str, ReservedString],
) -> None: ...
class DFAPlan:
next_dfa: DFAState
dfa_pushes: Sequence[DFAState]
class DFAState(Generic[_TokenTypeT]):
from_rule: str
nfa_set: Set[NFAState]
is_final: bool
arcs: Mapping[str, DFAState] # map from all terminals/nonterminals to DFAState
nonterminal_arcs: Mapping[str, DFAState]
transitions: Mapping[Union[_TokenTypeT, ReservedString], DFAPlan]
def __init__(
self, from_rule: str, nfa_set: Set[NFAState], final: NFAState
) -> None: ...
class ReservedString:
value: str
def __init__(self, value: str) -> None: ...
def __repr__(self) -> str: ...
def generate_grammar(bnf_grammar: str, token_namespace: Any) -> Grammar[Any]: ...

View file

@ -0,0 +1,159 @@
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
from parso.python.tokenize import tokenize
from parso.utils import parse_version_string
from parso.python.token import PythonTokenTypes
class GrammarParser():
"""
The parser for Python grammar files.
"""
def __init__(self, bnf_grammar):
self._bnf_grammar = bnf_grammar
self.generator = tokenize(
bnf_grammar,
version_info=parse_version_string('3.6')
)
self._gettoken() # Initialize lookahead
def parse(self):
# grammar: (NEWLINE | rule)* ENDMARKER
while self.type != PythonTokenTypes.ENDMARKER:
while self.type == PythonTokenTypes.NEWLINE:
self._gettoken()
# rule: NAME ':' rhs NEWLINE
self._current_rule_name = self._expect(PythonTokenTypes.NAME)
self._expect(PythonTokenTypes.OP, ':')
a, z = self._parse_rhs()
self._expect(PythonTokenTypes.NEWLINE)
yield a, z
def _parse_rhs(self):
# rhs: items ('|' items)*
a, z = self._parse_items()
if self.value != "|":
return a, z
else:
aa = NFAState(self._current_rule_name)
zz = NFAState(self._current_rule_name)
while True:
# Add the possibility to go into the state of a and come back
# to finish.
aa.add_arc(a)
z.add_arc(zz)
if self.value != "|":
break
self._gettoken()
a, z = self._parse_items()
return aa, zz
def _parse_items(self):
# items: item+
a, b = self._parse_item()
while self.type in (PythonTokenTypes.NAME, PythonTokenTypes.STRING) \
or self.value in ('(', '['):
c, d = self._parse_item()
# Need to end on the next item.
b.add_arc(c)
b = d
return a, b
def _parse_item(self):
# item: '[' rhs ']' | atom ['+' | '*']
if self.value == "[":
self._gettoken()
a, z = self._parse_rhs()
self._expect(PythonTokenTypes.OP, ']')
# Make it also possible that there is no token and change the
# state.
a.add_arc(z)
return a, z
else:
a, z = self._parse_atom()
value = self.value
if value not in ("+", "*"):
return a, z
self._gettoken()
# Make it clear that we can go back to the old state and repeat.
z.add_arc(a)
if value == "+":
return a, z
else:
# The end state is the same as the beginning, nothing must
# change.
return a, a
def _parse_atom(self):
# atom: '(' rhs ')' | NAME | STRING
if self.value == "(":
self._gettoken()
a, z = self._parse_rhs()
self._expect(PythonTokenTypes.OP, ')')
return a, z
elif self.type in (PythonTokenTypes.NAME, PythonTokenTypes.STRING):
a = NFAState(self._current_rule_name)
z = NFAState(self._current_rule_name)
# Make it clear that the state transition requires that value.
a.add_arc(z, self.value)
self._gettoken()
return a, z
else:
self._raise_error("expected (...) or NAME or STRING, got %s/%s",
self.type, self.value)
def _expect(self, type_, value=None):
if self.type != type_:
self._raise_error("expected %s, got %s [%s]",
type_, self.type, self.value)
if value is not None and self.value != value:
self._raise_error("expected %s, got %s", value, self.value)
value = self.value
self._gettoken()
return value
def _gettoken(self):
tup = next(self.generator)
self.type, self.value, self.begin, prefix = tup
def _raise_error(self, msg, *args):
if args:
try:
msg = msg % args
except:
msg = " ".join([msg] + list(map(str, args)))
line = self._bnf_grammar.splitlines()[self.begin[0] - 1]
raise SyntaxError(msg, ('<grammar>', self.begin[0],
self.begin[1], line))
class NFAArc(object):
def __init__(self, next_, nonterminal_or_string):
self.next = next_
self.nonterminal_or_string = nonterminal_or_string
def __repr__(self):
return '<%s: %s>' % (self.__class__.__name__, self.nonterminal_or_string)
class NFAState(object):
def __init__(self, from_rule):
self.from_rule = from_rule
self.arcs = [] # List[nonterminal (str), NFAState]
def add_arc(self, next_, nonterminal_or_string=None):
assert nonterminal_or_string is None or isinstance(nonterminal_or_string, str)
assert isinstance(next_, NFAState)
self.arcs.append(NFAArc(next_, nonterminal_or_string))
def __repr__(self):
return '<%s: from %s>' % (self.__class__.__name__, self.from_rule)

View file

@ -0,0 +1,20 @@
from typing import Generator, List, Optional, Tuple
from parso.python.token import TokenType
class GrammarParser:
generator: Generator[TokenType, None, None]
def __init__(self, bnf_grammar: str) -> None: ...
def parse(self) -> Generator[Tuple[NFAState, NFAState], None, None]: ...
class NFAArc:
next: NFAState
nonterminal_or_string: Optional[str]
def __init__(
self, next_: NFAState, nonterminal_or_string: Optional[str]
) -> None: ...
class NFAState:
from_rule: str
arcs: List[NFAArc]
def __init__(self, from_rule: str) -> None: ...

View file

View file

@ -0,0 +1,886 @@
"""
The diff parser is trying to be a faster version of the normal parser by trying
to reuse the nodes of a previous pass over the same file. This is also called
incremental parsing in parser literature. The difference is mostly that with
incremental parsing you get a range that needs to be reparsed. Here we
calculate that range ourselves by using difflib. After that it's essentially
incremental parsing.
The biggest issue of this approach is that we reuse nodes in a mutable way. The
intial design and idea is quite problematic for this parser, but it is also
pretty fast. Measurements showed that just copying nodes in Python is simply
quite a bit slower (especially for big files >3 kLOC). Therefore we did not
want to get rid of the mutable nodes, since this is usually not an issue.
This is by far the hardest software I ever wrote, exactly because the initial
design is crappy. When you have to account for a lot of mutable state, it
creates a ton of issues that you would otherwise not have. This file took
probably 3-6 months to write, which is insane for a parser.
There is a fuzzer in that helps test this whole thing. Please use it if you
make changes here. If you run the fuzzer like::
test/fuzz_diff_parser.py random -n 100000
you can be pretty sure that everything is still fine. I sometimes run the
fuzzer up to 24h to make sure everything is still ok.
"""
import re
import difflib
from collections import namedtuple
import logging
from parso.utils import split_lines
from parso.python.parser import Parser
from parso.python.tree import EndMarker
from parso.python.tokenize import PythonToken, BOM_UTF8_STRING
from parso.python.token import PythonTokenTypes
LOG = logging.getLogger(__name__)
DEBUG_DIFF_PARSER = False
_INDENTATION_TOKENS = 'INDENT', 'ERROR_DEDENT', 'DEDENT'
NEWLINE = PythonTokenTypes.NEWLINE
DEDENT = PythonTokenTypes.DEDENT
NAME = PythonTokenTypes.NAME
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
ENDMARKER = PythonTokenTypes.ENDMARKER
def _is_indentation_error_leaf(node):
return node.type == 'error_leaf' and node.token_type in _INDENTATION_TOKENS
def _get_previous_leaf_if_indentation(leaf):
while leaf and _is_indentation_error_leaf(leaf):
leaf = leaf.get_previous_leaf()
return leaf
def _get_next_leaf_if_indentation(leaf):
while leaf and _is_indentation_error_leaf(leaf):
leaf = leaf.get_next_leaf()
return leaf
def _get_suite_indentation(tree_node):
return _get_indentation(tree_node.children[1])
def _get_indentation(tree_node):
return tree_node.start_pos[1]
def _assert_valid_graph(node):
"""
Checks if the parent/children relationship is correct.
This is a check that only runs during debugging/testing.
"""
try:
children = node.children
except AttributeError:
# Ignore INDENT is necessary, because indent/dedent tokens don't
# contain value/prefix and are just around, because of the tokenizer.
if node.type == 'error_leaf' and node.token_type in _INDENTATION_TOKENS:
assert not node.value
assert not node.prefix
return
# Calculate the content between two start positions.
previous_leaf = _get_previous_leaf_if_indentation(node.get_previous_leaf())
if previous_leaf is None:
content = node.prefix
previous_start_pos = 1, 0
else:
assert previous_leaf.end_pos <= node.start_pos, \
(previous_leaf, node)
content = previous_leaf.value + node.prefix
previous_start_pos = previous_leaf.start_pos
if '\n' in content or '\r' in content:
splitted = split_lines(content)
line = previous_start_pos[0] + len(splitted) - 1
actual = line, len(splitted[-1])
else:
actual = previous_start_pos[0], previous_start_pos[1] + len(content)
if content.startswith(BOM_UTF8_STRING) \
and node.get_start_pos_of_prefix() == (1, 0):
# Remove the byte order mark
actual = actual[0], actual[1] - 1
assert node.start_pos == actual, (node.start_pos, actual)
else:
for child in children:
assert child.parent == node, (node, child)
_assert_valid_graph(child)
def _assert_nodes_are_equal(node1, node2):
try:
children1 = node1.children
except AttributeError:
assert not hasattr(node2, 'children'), (node1, node2)
assert node1.value == node2.value, (node1, node2)
assert node1.type == node2.type, (node1, node2)
assert node1.prefix == node2.prefix, (node1, node2)
assert node1.start_pos == node2.start_pos, (node1, node2)
return
else:
try:
children2 = node2.children
except AttributeError:
assert False, (node1, node2)
for n1, n2 in zip(children1, children2):
_assert_nodes_are_equal(n1, n2)
assert len(children1) == len(children2), '\n' + repr(children1) + '\n' + repr(children2)
def _get_debug_error_message(module, old_lines, new_lines):
current_lines = split_lines(module.get_code(), keepends=True)
current_diff = difflib.unified_diff(new_lines, current_lines)
old_new_diff = difflib.unified_diff(old_lines, new_lines)
import parso
return (
"There's an issue with the diff parser. Please "
"report (parso v%s) - Old/New:\n%s\nActual Diff (May be empty):\n%s"
% (parso.__version__, ''.join(old_new_diff), ''.join(current_diff))
)
def _get_last_line(node_or_leaf):
last_leaf = node_or_leaf.get_last_leaf()
if _ends_with_newline(last_leaf):
return last_leaf.start_pos[0]
else:
n = last_leaf.get_next_leaf()
if n.type == 'endmarker' and '\n' in n.prefix:
# This is a very special case and has to do with error recovery in
# Parso. The problem is basically that there's no newline leaf at
# the end sometimes (it's required in the grammar, but not needed
# actually before endmarker, CPython just adds a newline to make
# source code pass the parser, to account for that Parso error
# recovery allows small_stmt instead of simple_stmt).
return last_leaf.end_pos[0] + 1
return last_leaf.end_pos[0]
def _skip_dedent_error_leaves(leaf):
while leaf is not None and leaf.type == 'error_leaf' and leaf.token_type == 'DEDENT':
leaf = leaf.get_previous_leaf()
return leaf
def _ends_with_newline(leaf, suffix=''):
leaf = _skip_dedent_error_leaves(leaf)
if leaf.type == 'error_leaf':
typ = leaf.token_type.lower()
else:
typ = leaf.type
return typ == 'newline' or suffix.endswith('\n') or suffix.endswith('\r')
def _flows_finished(pgen_grammar, stack):
"""
if, while, for and try might not be finished, because another part might
still be parsed.
"""
for stack_node in stack:
if stack_node.nonterminal in ('if_stmt', 'while_stmt', 'for_stmt', 'try_stmt'):
return False
return True
def _func_or_class_has_suite(node):
if node.type == 'decorated':
node = node.children[-1]
if node.type in ('async_funcdef', 'async_stmt'):
node = node.children[-1]
return node.type in ('classdef', 'funcdef') and node.children[-1].type == 'suite'
def _suite_or_file_input_is_valid(pgen_grammar, stack):
if not _flows_finished(pgen_grammar, stack):
return False
for stack_node in reversed(stack):
if stack_node.nonterminal == 'decorator':
# A decorator is only valid with the upcoming function.
return False
if stack_node.nonterminal == 'suite':
# If only newline is in the suite, the suite is not valid, yet.
return len(stack_node.nodes) > 1
# Not reaching a suite means that we're dealing with file_input levels
# where there's no need for a valid statement in it. It can also be empty.
return True
def _is_flow_node(node):
if node.type == 'async_stmt':
node = node.children[1]
try:
value = node.children[0].value
except AttributeError:
return False
return value in ('if', 'for', 'while', 'try', 'with')
class _PositionUpdatingFinished(Exception):
pass
def _update_positions(nodes, line_offset, last_leaf):
for node in nodes:
try:
children = node.children
except AttributeError:
# Is a leaf
node.line += line_offset
if node is last_leaf:
raise _PositionUpdatingFinished
else:
_update_positions(children, line_offset, last_leaf)
class DiffParser(object):
"""
An advanced form of parsing a file faster. Unfortunately comes with huge
side effects. It changes the given module.
"""
def __init__(self, pgen_grammar, tokenizer, module):
self._pgen_grammar = pgen_grammar
self._tokenizer = tokenizer
self._module = module
def _reset(self):
self._copy_count = 0
self._parser_count = 0
self._nodes_tree = _NodesTree(self._module)
def update(self, old_lines, new_lines):
'''
The algorithm works as follows:
Equal:
- Assure that the start is a newline, otherwise parse until we get
one.
- Copy from parsed_until_line + 1 to max(i2 + 1)
- Make sure that the indentation is correct (e.g. add DEDENT)
- Add old and change positions
Insert:
- Parse from parsed_until_line + 1 to min(j2 + 1), hopefully not
much more.
Returns the new module node.
'''
LOG.debug('diff parser start')
# Reset the used names cache so they get regenerated.
self._module._used_names = None
self._parser_lines_new = new_lines
self._reset()
line_length = len(new_lines)
sm = difflib.SequenceMatcher(None, old_lines, self._parser_lines_new)
opcodes = sm.get_opcodes()
LOG.debug('line_lengths old: %s; new: %s' % (len(old_lines), line_length))
for operation, i1, i2, j1, j2 in opcodes:
LOG.debug('-> code[%s] old[%s:%s] new[%s:%s]',
operation, i1 + 1, i2, j1 + 1, j2)
if j2 == line_length and new_lines[-1] == '':
# The empty part after the last newline is not relevant.
j2 -= 1
if operation == 'equal':
line_offset = j1 - i1
self._copy_from_old_parser(line_offset, i1 + 1, i2, j2)
elif operation == 'replace':
self._parse(until_line=j2)
elif operation == 'insert':
self._parse(until_line=j2)
else:
assert operation == 'delete'
# With this action all change will finally be applied and we have a
# changed module.
self._nodes_tree.close()
if DEBUG_DIFF_PARSER:
# If there is reasonable suspicion that the diff parser is not
# behaving well, this should be enabled.
try:
code = ''.join(new_lines)
assert self._module.get_code() == code
_assert_valid_graph(self._module)
without_diff_parser_module = Parser(
self._pgen_grammar,
error_recovery=True
).parse(self._tokenizer(new_lines))
_assert_nodes_are_equal(self._module, without_diff_parser_module)
except AssertionError:
print(_get_debug_error_message(self._module, old_lines, new_lines))
raise
last_pos = self._module.end_pos[0]
if last_pos != line_length:
raise Exception(
('(%s != %s) ' % (last_pos, line_length))
+ _get_debug_error_message(self._module, old_lines, new_lines)
)
LOG.debug('diff parser end')
return self._module
def _enabled_debugging(self, old_lines, lines_new):
if self._module.get_code() != ''.join(lines_new):
LOG.warning('parser issue:\n%s\n%s', ''.join(old_lines), ''.join(lines_new))
def _copy_from_old_parser(self, line_offset, start_line_old, until_line_old, until_line_new):
last_until_line = -1
while until_line_new > self._nodes_tree.parsed_until_line:
parsed_until_line_old = self._nodes_tree.parsed_until_line - line_offset
line_stmt = self._get_old_line_stmt(parsed_until_line_old + 1)
if line_stmt is None:
# Parse 1 line at least. We don't need more, because we just
# want to get into a state where the old parser has statements
# again that can be copied (e.g. not lines within parentheses).
self._parse(self._nodes_tree.parsed_until_line + 1)
else:
p_children = line_stmt.parent.children
index = p_children.index(line_stmt)
if start_line_old == 1 \
and p_children[0].get_first_leaf().prefix.startswith(BOM_UTF8_STRING):
# If there's a BOM in the beginning, just reparse. It's too
# complicated to account for it otherwise.
copied_nodes = []
else:
from_ = self._nodes_tree.parsed_until_line + 1
copied_nodes = self._nodes_tree.copy_nodes(
p_children[index:],
until_line_old,
line_offset
)
# Match all the nodes that are in the wanted range.
if copied_nodes:
self._copy_count += 1
to = self._nodes_tree.parsed_until_line
LOG.debug('copy old[%s:%s] new[%s:%s]',
copied_nodes[0].start_pos[0],
copied_nodes[-1].end_pos[0] - 1, from_, to)
else:
# We have copied as much as possible (but definitely not too
# much). Therefore we just parse a bit more.
self._parse(self._nodes_tree.parsed_until_line + 1)
# Since there are potential bugs that might loop here endlessly, we
# just stop here.
assert last_until_line != self._nodes_tree.parsed_until_line, last_until_line
last_until_line = self._nodes_tree.parsed_until_line
def _get_old_line_stmt(self, old_line):
leaf = self._module.get_leaf_for_position((old_line, 0), include_prefixes=True)
if _ends_with_newline(leaf):
leaf = leaf.get_next_leaf()
if leaf.get_start_pos_of_prefix()[0] == old_line:
node = leaf
while node.parent.type not in ('file_input', 'suite'):
node = node.parent
# Make sure that if only the `else:` line of an if statement is
# copied that not the whole thing is going to be copied.
if node.start_pos[0] >= old_line:
return node
# Must be on the same line. Otherwise we need to parse that bit.
return None
def _parse(self, until_line):
"""
Parses at least until the given line, but might just parse more until a
valid state is reached.
"""
last_until_line = 0
while until_line > self._nodes_tree.parsed_until_line:
node = self._try_parse_part(until_line)
nodes = node.children
self._nodes_tree.add_parsed_nodes(nodes, self._keyword_token_indents)
if self._replace_tos_indent is not None:
self._nodes_tree.indents[-1] = self._replace_tos_indent
LOG.debug(
'parse_part from %s to %s (to %s in part parser)',
nodes[0].get_start_pos_of_prefix()[0],
self._nodes_tree.parsed_until_line,
node.end_pos[0] - 1
)
# Since the tokenizer sometimes has bugs, we cannot be sure that
# this loop terminates. Therefore assert that there's always a
# change.
assert last_until_line != self._nodes_tree.parsed_until_line, last_until_line
last_until_line = self._nodes_tree.parsed_until_line
def _try_parse_part(self, until_line):
"""
Sets up a normal parser that uses a spezialized tokenizer to only parse
until a certain position (or a bit longer if the statement hasn't
ended.
"""
self._parser_count += 1
# TODO speed up, shouldn't copy the whole list all the time.
# memoryview?
parsed_until_line = self._nodes_tree.parsed_until_line
lines_after = self._parser_lines_new[parsed_until_line:]
tokens = self._diff_tokenize(
lines_after,
until_line,
line_offset=parsed_until_line
)
self._active_parser = Parser(
self._pgen_grammar,
error_recovery=True
)
return self._active_parser.parse(tokens=tokens)
def _diff_tokenize(self, lines, until_line, line_offset=0):
was_newline = False
indents = self._nodes_tree.indents
initial_indentation_count = len(indents)
tokens = self._tokenizer(
lines,
start_pos=(line_offset + 1, 0),
indents=indents,
is_first_token=line_offset == 0,
)
stack = self._active_parser.stack
self._replace_tos_indent = None
self._keyword_token_indents = {}
# print('start', line_offset + 1, indents)
for token in tokens:
# print(token, indents)
typ = token.type
if typ == DEDENT:
if len(indents) < initial_indentation_count:
# We are done here, only thing that can come now is an
# endmarker or another dedented code block.
while True:
typ, string, start_pos, prefix = token = next(tokens)
if typ in (DEDENT, ERROR_DEDENT):
if typ == ERROR_DEDENT:
# We want to force an error dedent in the next
# parser/pass. To make this possible we just
# increase the location by one.
self._replace_tos_indent = start_pos[1] + 1
pass
else:
break
if '\n' in prefix or '\r' in prefix:
prefix = re.sub(r'[^\n\r]+\Z', '', prefix)
else:
assert start_pos[1] >= len(prefix), repr(prefix)
if start_pos[1] - len(prefix) == 0:
prefix = ''
yield PythonToken(
ENDMARKER, '',
start_pos,
prefix
)
break
elif typ == NEWLINE and token.start_pos[0] >= until_line:
was_newline = True
elif was_newline:
was_newline = False
if len(indents) == initial_indentation_count:
# Check if the parser is actually in a valid suite state.
if _suite_or_file_input_is_valid(self._pgen_grammar, stack):
yield PythonToken(ENDMARKER, '', token.start_pos, '')
break
if typ == NAME and token.string in ('class', 'def'):
self._keyword_token_indents[token.start_pos] = list(indents)
yield token
class _NodesTreeNode(object):
_ChildrenGroup = namedtuple(
'_ChildrenGroup',
'prefix children line_offset last_line_offset_leaf')
def __init__(self, tree_node, parent=None, indentation=0):
self.tree_node = tree_node
self._children_groups = []
self.parent = parent
self._node_children = []
self.indentation = indentation
def finish(self):
children = []
for prefix, children_part, line_offset, last_line_offset_leaf in self._children_groups:
first_leaf = _get_next_leaf_if_indentation(
children_part[0].get_first_leaf()
)
first_leaf.prefix = prefix + first_leaf.prefix
if line_offset != 0:
try:
_update_positions(
children_part, line_offset, last_line_offset_leaf)
except _PositionUpdatingFinished:
pass
children += children_part
self.tree_node.children = children
# Reset the parents
for node in children:
node.parent = self.tree_node
for node_child in self._node_children:
node_child.finish()
def add_child_node(self, child_node):
self._node_children.append(child_node)
def add_tree_nodes(self, prefix, children, line_offset=0,
last_line_offset_leaf=None):
if last_line_offset_leaf is None:
last_line_offset_leaf = children[-1].get_last_leaf()
group = self._ChildrenGroup(
prefix, children, line_offset, last_line_offset_leaf
)
self._children_groups.append(group)
def get_last_line(self, suffix):
line = 0
if self._children_groups:
children_group = self._children_groups[-1]
last_leaf = _get_previous_leaf_if_indentation(
children_group.last_line_offset_leaf
)
line = last_leaf.end_pos[0] + children_group.line_offset
# Newlines end on the next line, which means that they would cover
# the next line. That line is not fully parsed at this point.
if _ends_with_newline(last_leaf, suffix):
line -= 1
line += len(split_lines(suffix)) - 1
if suffix and not suffix.endswith('\n') and not suffix.endswith('\r'):
# This is the end of a file (that doesn't end with a newline).
line += 1
if self._node_children:
return max(line, self._node_children[-1].get_last_line(suffix))
return line
def __repr__(self):
return '<%s: %s>' % (self.__class__.__name__, self.tree_node)
class _NodesTree(object):
def __init__(self, module):
self._base_node = _NodesTreeNode(module)
self._working_stack = [self._base_node]
self._module = module
self._prefix_remainder = ''
self.prefix = ''
self.indents = [0]
@property
def parsed_until_line(self):
return self._working_stack[-1].get_last_line(self.prefix)
def _update_insertion_node(self, indentation):
for node in reversed(list(self._working_stack)):
if node.indentation < indentation or node is self._working_stack[0]:
return node
self._working_stack.pop()
def add_parsed_nodes(self, tree_nodes, keyword_token_indents):
old_prefix = self.prefix
tree_nodes = self._remove_endmarker(tree_nodes)
if not tree_nodes:
self.prefix = old_prefix + self.prefix
return
assert tree_nodes[0].type != 'newline'
node = self._update_insertion_node(tree_nodes[0].start_pos[1])
assert node.tree_node.type in ('suite', 'file_input')
node.add_tree_nodes(old_prefix, tree_nodes)
# tos = Top of stack
self._update_parsed_node_tos(tree_nodes[-1], keyword_token_indents)
def _update_parsed_node_tos(self, tree_node, keyword_token_indents):
if tree_node.type == 'suite':
def_leaf = tree_node.parent.children[0]
new_tos = _NodesTreeNode(
tree_node,
indentation=keyword_token_indents[def_leaf.start_pos][-1],
)
new_tos.add_tree_nodes('', list(tree_node.children))
self._working_stack[-1].add_child_node(new_tos)
self._working_stack.append(new_tos)
self._update_parsed_node_tos(tree_node.children[-1], keyword_token_indents)
elif _func_or_class_has_suite(tree_node):
self._update_parsed_node_tos(tree_node.children[-1], keyword_token_indents)
def _remove_endmarker(self, tree_nodes):
"""
Helps cleaning up the tree nodes that get inserted.
"""
last_leaf = tree_nodes[-1].get_last_leaf()
is_endmarker = last_leaf.type == 'endmarker'
self._prefix_remainder = ''
if is_endmarker:
prefix = last_leaf.prefix
separation = max(prefix.rfind('\n'), prefix.rfind('\r'))
if separation > -1:
# Remove the whitespace part of the prefix after a newline.
# That is not relevant if parentheses were opened. Always parse
# until the end of a line.
last_leaf.prefix, self._prefix_remainder = \
last_leaf.prefix[:separation + 1], last_leaf.prefix[separation + 1:]
self.prefix = ''
if is_endmarker:
self.prefix = last_leaf.prefix
tree_nodes = tree_nodes[:-1]
return tree_nodes
def _get_matching_indent_nodes(self, tree_nodes, is_new_suite):
# There might be a random dedent where we have to stop copying.
# Invalid indents are ok, because the parser handled that
# properly before. An invalid dedent can happen, because a few
# lines above there was an invalid indent.
node_iterator = iter(tree_nodes)
if is_new_suite:
yield next(node_iterator)
first_node = next(node_iterator)
indent = _get_indentation(first_node)
if not is_new_suite and indent not in self.indents:
return
yield first_node
for n in node_iterator:
if _get_indentation(n) != indent:
return
yield n
def copy_nodes(self, tree_nodes, until_line, line_offset):
"""
Copies tree nodes from the old parser tree.
Returns the number of tree nodes that were copied.
"""
if tree_nodes[0].type in ('error_leaf', 'error_node'):
# Avoid copying errors in the beginning. Can lead to a lot of
# issues.
return []
indentation = _get_indentation(tree_nodes[0])
old_working_stack = list(self._working_stack)
old_prefix = self.prefix
old_indents = self.indents
self.indents = [i for i in self.indents if i <= indentation]
self._update_insertion_node(indentation)
new_nodes, self._working_stack, self.prefix, added_indents = self._copy_nodes(
list(self._working_stack),
tree_nodes,
until_line,
line_offset,
self.prefix,
)
if new_nodes:
self.indents += added_indents
else:
self._working_stack = old_working_stack
self.prefix = old_prefix
self.indents = old_indents
return new_nodes
def _copy_nodes(self, working_stack, nodes, until_line, line_offset,
prefix='', is_nested=False):
new_nodes = []
added_indents = []
nodes = list(self._get_matching_indent_nodes(
nodes,
is_new_suite=is_nested,
))
new_prefix = ''
for node in nodes:
if node.start_pos[0] > until_line:
break
if node.type == 'endmarker':
break
if node.type == 'error_leaf' and node.token_type in ('DEDENT', 'ERROR_DEDENT'):
break
# TODO this check might take a bit of time for large files. We
# might want to change this to do more intelligent guessing or
# binary search.
if _get_last_line(node) > until_line:
# We can split up functions and classes later.
if _func_or_class_has_suite(node):
new_nodes.append(node)
break
try:
c = node.children
except AttributeError:
pass
else:
# This case basically appears with error recovery of one line
# suites like `def foo(): bar.-`. In this case we might not
# include a newline in the statement and we need to take care
# of that.
n = node
if n.type == 'decorated':
n = n.children[-1]
if n.type in ('async_funcdef', 'async_stmt'):
n = n.children[-1]
if n.type in ('classdef', 'funcdef'):
suite_node = n.children[-1]
else:
suite_node = c[-1]
if suite_node.type in ('error_leaf', 'error_node'):
break
new_nodes.append(node)
# Pop error nodes at the end from the list
if new_nodes:
while new_nodes:
last_node = new_nodes[-1]
if (last_node.type in ('error_leaf', 'error_node')
or _is_flow_node(new_nodes[-1])):
# Error leafs/nodes don't have a defined start/end. Error
# nodes might not end with a newline (e.g. if there's an
# open `(`). Therefore ignore all of them unless they are
# succeeded with valid parser state.
# If we copy flows at the end, they might be continued
# after the copy limit (in the new parser).
# In this while loop we try to remove until we find a newline.
new_prefix = ''
new_nodes.pop()
while new_nodes:
last_node = new_nodes[-1]
if last_node.get_last_leaf().type == 'newline':
break
new_nodes.pop()
continue
if len(new_nodes) > 1 and new_nodes[-2].type == 'error_node':
# The problem here is that Parso error recovery sometimes
# influences nodes before this node.
# Since the new last node is an error node this will get
# cleaned up in the next while iteration.
new_nodes.pop()
continue
break
if not new_nodes:
return [], working_stack, prefix, added_indents
tos = working_stack[-1]
last_node = new_nodes[-1]
had_valid_suite_last = False
# Pop incomplete suites from the list
if _func_or_class_has_suite(last_node):
suite = last_node
while suite.type != 'suite':
suite = suite.children[-1]
indent = _get_suite_indentation(suite)
added_indents.append(indent)
suite_tos = _NodesTreeNode(suite, indentation=_get_indentation(last_node))
# Don't need to pass line_offset here, it's already done by the
# parent.
suite_nodes, new_working_stack, new_prefix, ai = self._copy_nodes(
working_stack + [suite_tos], suite.children, until_line, line_offset,
is_nested=True,
)
added_indents += ai
if len(suite_nodes) < 2:
# A suite only with newline is not valid.
new_nodes.pop()
new_prefix = ''
else:
assert new_nodes
tos.add_child_node(suite_tos)
working_stack = new_working_stack
had_valid_suite_last = True
if new_nodes:
if not _ends_with_newline(new_nodes[-1].get_last_leaf()) and not had_valid_suite_last:
p = new_nodes[-1].get_next_leaf().prefix
# We are not allowed to remove the newline at the end of the
# line, otherwise it's going to be missing. This happens e.g.
# if a bracket is around before that moves newlines to
# prefixes.
new_prefix = split_lines(p, keepends=True)[0]
if had_valid_suite_last:
last = new_nodes[-1]
if last.type == 'decorated':
last = last.children[-1]
if last.type in ('async_funcdef', 'async_stmt'):
last = last.children[-1]
last_line_offset_leaf = last.children[-2].get_last_leaf()
assert last_line_offset_leaf == ':'
else:
last_line_offset_leaf = new_nodes[-1].get_last_leaf()
tos.add_tree_nodes(
prefix, new_nodes, line_offset, last_line_offset_leaf,
)
prefix = new_prefix
self._prefix_remainder = ''
return new_nodes, working_stack, prefix, added_indents
def close(self):
self._base_node.finish()
# Add an endmarker.
try:
last_leaf = self._module.get_last_leaf()
except IndexError:
end_pos = [1, 0]
else:
last_leaf = _skip_dedent_error_leaves(last_leaf)
end_pos = list(last_leaf.end_pos)
lines = split_lines(self.prefix)
assert len(lines) > 0
if len(lines) == 1:
if lines[0].startswith(BOM_UTF8_STRING) and end_pos == [1, 0]:
end_pos[1] -= 1
end_pos[1] += len(lines[0])
else:
end_pos[0] += len(lines) - 1
end_pos[1] = len(lines[-1])
endmarker = EndMarker('', tuple(end_pos), self.prefix + self._prefix_remainder)
endmarker.parent = self._module
self._module.children.append(endmarker)

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,143 @@
# Grammar for Python
# Note: Changing the grammar specified in this file will most likely
# require corresponding changes in the parser module
# (../Modules/parsermodule.c). If you can't make the changes to
# that module yourself, please co-ordinate the required changes
# with someone who can; ask around on python-dev for help. Fred
# Drake <fdrake@acm.org> will probably be listening there.
# NOTE WELL: You should also follow all the steps listed in PEP 306,
# "How to Change Python's Grammar"
# Start symbols for the grammar:
# single_input is a single interactive statement;
# file_input is a module or sequence of commands read from an input file;
# eval_input is the input for the eval() and input() functions.
# NB: compound_stmt in single_input is followed by extra NEWLINE!
single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
file_input: stmt* ENDMARKER
eval_input: testlist NEWLINE* ENDMARKER
decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef)
funcdef: 'def' NAME parameters ':' suite
parameters: '(' [varargslist] ')'
varargslist: ((fpdef ['=' test] ',')*
('*' NAME [',' '**' NAME] | '**' NAME) |
fpdef ['=' test] (',' fpdef ['=' test])* [','])
fpdef: NAME | '(' fplist ')'
fplist: fpdef (',' fpdef)* [',']
stmt: simple_stmt | compound_stmt | NEWLINE
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
small_stmt: (expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt |
import_stmt | global_stmt | exec_stmt | assert_stmt)
expr_stmt: testlist (augassign (yield_expr|testlist) |
('=' (yield_expr|testlist))*)
augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' | '**=' | '//=')
# For normal assignments, additional restrictions enforced by the interpreter
print_stmt: 'print' ( [ test (',' test)* [','] ] |
'>>' test [ (',' test)+ [','] ] )
del_stmt: 'del' exprlist
pass_stmt: 'pass'
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
return_stmt: 'return' [testlist]
yield_stmt: yield_expr
raise_stmt: 'raise' [test [',' test [',' test]]]
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
import_from: ('from' ('.'* dotted_name | '.'+)
'import' ('*' | '(' import_as_names ')' | import_as_names))
import_as_name: NAME ['as' NAME]
dotted_as_name: dotted_name ['as' NAME]
import_as_names: import_as_name (',' import_as_name)* [',']
dotted_as_names: dotted_as_name (',' dotted_as_name)*
dotted_name: NAME ('.' NAME)*
global_stmt: 'global' NAME (',' NAME)*
exec_stmt: 'exec' expr ['in' test [',' test]]
assert_stmt: 'assert' test [',' test]
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated
if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite]
while_stmt: 'while' test ':' suite ['else' ':' suite]
for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
try_stmt: ('try' ':' suite
((except_clause ':' suite)+
['else' ':' suite]
['finally' ':' suite] |
'finally' ':' suite))
with_stmt: 'with' with_item (',' with_item)* ':' suite
with_item: test ['as' expr]
# NB compile.c makes sure that the default except clause is last
except_clause: 'except' [test [('as' | ',') test]]
suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
# Backward compatibility cruft to support:
# [ x for x in lambda: True, lambda: False if x() ]
# even while also allowing:
# lambda x: 5 if x else 2
# (But not a mix of the two)
testlist_safe: old_test [(',' old_test)+ [',']]
old_test: or_test | old_lambdef
old_lambdef: 'lambda' [varargslist] ':' old_test
test: or_test ['if' or_test 'else' test] | lambdef
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: expr (comp_op expr)*
comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('^' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
term: factor (('*'|'/'|'%'|'//') factor)*
factor: ('+'|'-'|'~') factor | power
power: atom trailer* ['**' factor]
atom: ('(' [yield_expr|testlist_comp] ')' |
'[' [listmaker] ']' |
'{' [dictorsetmaker] '}' |
'`' testlist1 '`' |
NAME | NUMBER | strings)
strings: STRING+
listmaker: test ( list_for | (',' test)* [','] )
testlist_comp: test ( sync_comp_for | (',' test)* [','] )
lambdef: 'lambda' [varargslist] ':' test
trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
subscriptlist: subscript (',' subscript)* [',']
subscript: '.' '.' '.' | test | [test] ':' [test] [sliceop]
sliceop: ':' [test]
exprlist: expr (',' expr)* [',']
testlist: test (',' test)* [',']
dictorsetmaker: ( (test ':' test (sync_comp_for | (',' test ':' test)* [','])) |
(test (sync_comp_for | (',' test)* [','])) )
classdef: 'class' NAME ['(' [testlist] ')'] ':' suite
arglist: (argument ',')* (argument [',']
|'*' test (',' argument)* [',' '**' test]
|'**' test)
# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity. ast.c makes sure it's a NAME.
argument: test [sync_comp_for] | test '=' test
list_iter: list_for | list_if
list_for: 'for' exprlist 'in' testlist_safe [list_iter]
list_if: 'if' old_test [list_iter]
comp_iter: sync_comp_for | comp_if
sync_comp_for: 'for' exprlist 'in' or_test [comp_iter]
comp_if: 'if' old_test [comp_iter]
testlist1: test (',' test)*
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
yield_expr: 'yield' [testlist]

View file

@ -0,0 +1,171 @@
# Grammar for Python
# NOTE WELL: You should also follow all the steps listed at
# https://devguide.python.org/grammar/
# Start symbols for the grammar:
# single_input is a single interactive statement;
# file_input is a module or sequence of commands read from an input file;
# eval_input is the input for the eval() functions.
# NB: compound_stmt in single_input is followed by extra NEWLINE!
single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
file_input: stmt* ENDMARKER
eval_input: testlist NEWLINE* ENDMARKER
decorator: '@' namedexpr_test NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef | async_funcdef)
async_funcdef: 'async' funcdef
funcdef: 'def' NAME parameters ['->' test] ':' suite
parameters: '(' [typedargslist] ')'
typedargslist: (
(tfpdef ['=' test] (',' tfpdef ['=' test])* ',' '/' [',' [ tfpdef ['=' test] (
',' tfpdef ['=' test])* ([',' [
'*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
| '**' tfpdef [',']]])
| '*' [tfpdef] (',' tfpdef ['=' test])* ([',' ['**' tfpdef [',']]])
| '**' tfpdef [',']]] )
| (tfpdef ['=' test] (',' tfpdef ['=' test])* [',' [
'*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
| '**' tfpdef [',']]]
| '*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
| '**' tfpdef [','])
)
tfpdef: NAME [':' test]
varargslist: vfpdef ['=' test ](',' vfpdef ['=' test])* ',' '/' [',' [ (vfpdef ['=' test] (',' vfpdef ['=' test])* [',' [
'*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']]]
| '*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']) ]] | (vfpdef ['=' test] (',' vfpdef ['=' test])* [',' [
'*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']]]
| '*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']
)
vfpdef: NAME
stmt: simple_stmt | compound_stmt | NEWLINE
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt |
import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) |
('=' (yield_expr|testlist_star_expr))*)
annassign: ':' test ['=' (yield_expr|testlist_star_expr)]
testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']
augassign: ('+=' | '-=' | '*=' | '@=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' | '**=' | '//=')
# For normal and annotated assignments, additional restrictions enforced by the interpreter
del_stmt: 'del' exprlist
pass_stmt: 'pass'
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
return_stmt: 'return' [testlist_star_expr]
yield_stmt: yield_expr
raise_stmt: 'raise' [test ['from' test]]
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS
import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+)
'import' ('*' | '(' import_as_names ')' | import_as_names))
import_as_name: NAME ['as' NAME]
dotted_as_name: dotted_name ['as' NAME]
import_as_names: import_as_name (',' import_as_name)* [',']
dotted_as_names: dotted_as_name (',' dotted_as_name)*
dotted_name: NAME ('.' NAME)*
global_stmt: 'global' NAME (',' NAME)*
nonlocal_stmt: 'nonlocal' NAME (',' NAME)*
assert_stmt: 'assert' test [',' test]
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt
async_stmt: 'async' (funcdef | with_stmt | for_stmt)
if_stmt: 'if' namedexpr_test ':' suite ('elif' namedexpr_test ':' suite)* ['else' ':' suite]
while_stmt: 'while' namedexpr_test ':' suite ['else' ':' suite]
for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
try_stmt: ('try' ':' suite
((except_clause ':' suite)+
['else' ':' suite]
['finally' ':' suite] |
'finally' ':' suite))
with_stmt: 'with' with_item (',' with_item)* ':' suite
with_item: test ['as' expr]
# NB compile.c makes sure that the default except clause is last
except_clause: 'except' [test ['as' NAME]]
suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
namedexpr_test: test [':=' test]
test: or_test ['if' or_test 'else' test] | lambdef
test_nocond: or_test | lambdef_nocond
lambdef: 'lambda' [varargslist] ':' test
lambdef_nocond: 'lambda' [varargslist] ':' test_nocond
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: expr (comp_op expr)*
# <> isn't actually a valid comparison operator in Python. It's here for the
# sake of a __future__ import described in PEP 401 (which really works :-)
comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
star_expr: '*' expr
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('^' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
term: factor (('*'|'@'|'/'|'%'|'//') factor)*
factor: ('+'|'-'|'~') factor | power
power: atom_expr ['**' factor]
atom_expr: ['await'] atom trailer*
atom: ('(' [yield_expr|testlist_comp] ')' |
'[' [testlist_comp] ']' |
'{' [dictorsetmaker] '}' |
NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False')
testlist_comp: (namedexpr_test|star_expr) ( comp_for | (',' (namedexpr_test|star_expr))* [','] )
trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
subscriptlist: subscript (',' subscript)* [',']
subscript: test | [test] ':' [test] [sliceop]
sliceop: ':' [test]
exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
testlist: test (',' test)* [',']
dictorsetmaker: ( ((test ':' test | '**' expr)
(comp_for | (',' (test ':' test | '**' expr))* [','])) |
((test | star_expr)
(comp_for | (',' (test | star_expr))* [','])) )
classdef: 'class' NAME ['(' [arglist] ')'] ':' suite
arglist: argument (',' argument)* [',']
# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity. ast.c makes sure it's a NAME.
# "test '=' test" is really "keyword '=' test", but we have no such token.
# These need to be in a single rule to avoid grammar that is ambiguous
# to our LL(1) parser. Even though 'test' includes '*expr' in star_expr,
# we explicitly match '*' here, too, to give it proper precedence.
# Illegal combinations and orderings are blocked in ast.c:
# multiple (test comp_for) arguments are blocked; keyword unpackings
# that precede iterable unpackings are blocked; etc.
argument: ( test [comp_for] |
test ':=' test |
test '=' test |
'**' test |
'*' test )
comp_iter: comp_for | comp_if
sync_comp_for: 'for' exprlist 'in' or_test [comp_iter]
comp_for: ['async'] sync_comp_for
comp_if: 'if' test_nocond [comp_iter]
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
yield_expr: 'yield' [yield_arg]
yield_arg: 'from' test | testlist_star_expr
strings: (STRING | fstring)+
fstring: FSTRING_START fstring_content* FSTRING_END
fstring_content: FSTRING_STRING | fstring_expr
fstring_conversion: '!' NAME
fstring_expr: '{' testlist ['='] [ fstring_conversion ] [ fstring_format_spec ] '}'
fstring_format_spec: ':' fstring_content*

View file

@ -0,0 +1,134 @@
# Grammar for Python
# Note: Changing the grammar specified in this file will most likely
# require corresponding changes in the parser module
# (../Modules/parsermodule.c). If you can't make the changes to
# that module yourself, please co-ordinate the required changes
# with someone who can; ask around on python-dev for help. Fred
# Drake <fdrake@acm.org> will probably be listening there.
# NOTE WELL: You should also follow all the steps listed in PEP 306,
# "How to Change Python's Grammar"
# Start symbols for the grammar:
# single_input is a single interactive statement;
# file_input is a module or sequence of commands read from an input file;
# eval_input is the input for the eval() functions.
# NB: compound_stmt in single_input is followed by extra NEWLINE!
single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
file_input: stmt* ENDMARKER
eval_input: testlist NEWLINE* ENDMARKER
decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef)
funcdef: 'def' NAME parameters ['->' test] ':' suite
parameters: '(' [typedargslist] ')'
typedargslist: (tfpdef ['=' test] (',' tfpdef ['=' test])* [','
['*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef]]
| '*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef)
tfpdef: NAME [':' test]
varargslist: (vfpdef ['=' test] (',' vfpdef ['=' test])* [','
['*' [vfpdef] (',' vfpdef ['=' test])* [',' '**' vfpdef] | '**' vfpdef]]
| '*' [vfpdef] (',' vfpdef ['=' test])* [',' '**' vfpdef] | '**' vfpdef)
vfpdef: NAME
stmt: simple_stmt | compound_stmt | NEWLINE
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt |
import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
expr_stmt: testlist_star_expr (augassign (yield_expr|testlist) |
('=' (yield_expr|testlist_star_expr))*)
testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']
augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' | '**=' | '//=')
# For normal assignments, additional restrictions enforced by the interpreter
del_stmt: 'del' exprlist
pass_stmt: 'pass'
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
return_stmt: 'return' [testlist]
yield_stmt: yield_expr
raise_stmt: 'raise' [test ['from' test]]
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS
import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+)
'import' ('*' | '(' import_as_names ')' | import_as_names))
import_as_name: NAME ['as' NAME]
dotted_as_name: dotted_name ['as' NAME]
import_as_names: import_as_name (',' import_as_name)* [',']
dotted_as_names: dotted_as_name (',' dotted_as_name)*
dotted_name: NAME ('.' NAME)*
global_stmt: 'global' NAME (',' NAME)*
nonlocal_stmt: 'nonlocal' NAME (',' NAME)*
assert_stmt: 'assert' test [',' test]
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated
if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite]
while_stmt: 'while' test ':' suite ['else' ':' suite]
for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
try_stmt: ('try' ':' suite
((except_clause ':' suite)+
['else' ':' suite]
['finally' ':' suite] |
'finally' ':' suite))
with_stmt: 'with' with_item (',' with_item)* ':' suite
with_item: test ['as' expr]
# NB compile.c makes sure that the default except clause is last
except_clause: 'except' [test ['as' NAME]]
suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
test: or_test ['if' or_test 'else' test] | lambdef
test_nocond: or_test | lambdef_nocond
lambdef: 'lambda' [varargslist] ':' test
lambdef_nocond: 'lambda' [varargslist] ':' test_nocond
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: expr (comp_op expr)*
# <> isn't actually a valid comparison operator in Python. It's here for the
# sake of a __future__ import described in PEP 401
comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
star_expr: '*' expr
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('^' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
term: factor (('*'|'/'|'%'|'//') factor)*
factor: ('+'|'-'|'~') factor | power
power: atom trailer* ['**' factor]
atom: ('(' [yield_expr|testlist_comp] ')' |
'[' [testlist_comp] ']' |
'{' [dictorsetmaker] '}' |
NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False')
strings: STRING+
testlist_comp: (test|star_expr) ( sync_comp_for | (',' (test|star_expr))* [','] )
trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
subscriptlist: subscript (',' subscript)* [',']
subscript: test | [test] ':' [test] [sliceop]
sliceop: ':' [test]
exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
testlist: test (',' test)* [',']
dictorsetmaker: ( (test ':' test (sync_comp_for | (',' test ':' test)* [','])) |
(test (sync_comp_for | (',' test)* [','])) )
classdef: 'class' NAME ['(' [arglist] ')'] ':' suite
arglist: (argument ',')* (argument [',']
|'*' test (',' argument)* [',' '**' test]
|'**' test)
# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity. ast.c makes sure it's a NAME.
argument: test [sync_comp_for] | test '=' test # Really [keyword '='] test
comp_iter: sync_comp_for | comp_if
sync_comp_for: 'for' exprlist 'in' or_test [comp_iter]
comp_if: 'if' test_nocond [comp_iter]
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
yield_expr: 'yield' [yield_arg]
yield_arg: 'from' test | testlist

View file

@ -0,0 +1,134 @@
# Grammar for Python
# Note: Changing the grammar specified in this file will most likely
# require corresponding changes in the parser module
# (../Modules/parsermodule.c). If you can't make the changes to
# that module yourself, please co-ordinate the required changes
# with someone who can; ask around on python-dev for help. Fred
# Drake <fdrake@acm.org> will probably be listening there.
# NOTE WELL: You should also follow all the steps listed at
# https://docs.python.org/devguide/grammar.html
# Start symbols for the grammar:
# single_input is a single interactive statement;
# file_input is a module or sequence of commands read from an input file;
# eval_input is the input for the eval() functions.
# NB: compound_stmt in single_input is followed by extra NEWLINE!
single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
file_input: stmt* ENDMARKER
eval_input: testlist NEWLINE* ENDMARKER
decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef)
funcdef: 'def' NAME parameters ['->' test] ':' suite
parameters: '(' [typedargslist] ')'
typedargslist: (tfpdef ['=' test] (',' tfpdef ['=' test])* [','
['*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef]]
| '*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef)
tfpdef: NAME [':' test]
varargslist: (vfpdef ['=' test] (',' vfpdef ['=' test])* [','
['*' [vfpdef] (',' vfpdef ['=' test])* [',' '**' vfpdef] | '**' vfpdef]]
| '*' [vfpdef] (',' vfpdef ['=' test])* [',' '**' vfpdef] | '**' vfpdef)
vfpdef: NAME
stmt: simple_stmt | compound_stmt | NEWLINE
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt |
import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
expr_stmt: testlist_star_expr (augassign (yield_expr|testlist) |
('=' (yield_expr|testlist_star_expr))*)
testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']
augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' | '**=' | '//=')
# For normal assignments, additional restrictions enforced by the interpreter
del_stmt: 'del' exprlist
pass_stmt: 'pass'
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
return_stmt: 'return' [testlist]
yield_stmt: yield_expr
raise_stmt: 'raise' [test ['from' test]]
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS
import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+)
'import' ('*' | '(' import_as_names ')' | import_as_names))
import_as_name: NAME ['as' NAME]
dotted_as_name: dotted_name ['as' NAME]
import_as_names: import_as_name (',' import_as_name)* [',']
dotted_as_names: dotted_as_name (',' dotted_as_name)*
dotted_name: NAME ('.' NAME)*
global_stmt: 'global' NAME (',' NAME)*
nonlocal_stmt: 'nonlocal' NAME (',' NAME)*
assert_stmt: 'assert' test [',' test]
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated
if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite]
while_stmt: 'while' test ':' suite ['else' ':' suite]
for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
try_stmt: ('try' ':' suite
((except_clause ':' suite)+
['else' ':' suite]
['finally' ':' suite] |
'finally' ':' suite))
with_stmt: 'with' with_item (',' with_item)* ':' suite
with_item: test ['as' expr]
# NB compile.c makes sure that the default except clause is last
except_clause: 'except' [test ['as' NAME]]
suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
test: or_test ['if' or_test 'else' test] | lambdef
test_nocond: or_test | lambdef_nocond
lambdef: 'lambda' [varargslist] ':' test
lambdef_nocond: 'lambda' [varargslist] ':' test_nocond
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: expr (comp_op expr)*
# <> isn't actually a valid comparison operator in Python. It's here for the
# sake of a __future__ import described in PEP 401
comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
star_expr: '*' expr
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('^' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
term: factor (('*'|'/'|'%'|'//') factor)*
factor: ('+'|'-'|'~') factor | power
power: atom trailer* ['**' factor]
atom: ('(' [yield_expr|testlist_comp] ')' |
'[' [testlist_comp] ']' |
'{' [dictorsetmaker] '}' |
NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False')
strings: STRING+
testlist_comp: (test|star_expr) ( sync_comp_for | (',' (test|star_expr))* [','] )
trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
subscriptlist: subscript (',' subscript)* [',']
subscript: test | [test] ':' [test] [sliceop]
sliceop: ':' [test]
exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
testlist: test (',' test)* [',']
dictorsetmaker: ( (test ':' test (sync_comp_for | (',' test ':' test)* [','])) |
(test (sync_comp_for | (',' test)* [','])) )
classdef: 'class' NAME ['(' [arglist] ')'] ':' suite
arglist: (argument ',')* (argument [',']
|'*' test (',' argument)* [',' '**' test]
|'**' test)
# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity. ast.c makes sure it's a NAME.
argument: test [sync_comp_for] | test '=' test # Really [keyword '='] test
comp_iter: sync_comp_for | comp_if
sync_comp_for: 'for' exprlist 'in' or_test [comp_iter]
comp_if: 'if' test_nocond [comp_iter]
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
yield_expr: 'yield' [yield_arg]
yield_arg: 'from' test | testlist

View file

@ -0,0 +1,153 @@
# Grammar for Python
# Note: Changing the grammar specified in this file will most likely
# require corresponding changes in the parser module
# (../Modules/parsermodule.c). If you can't make the changes to
# that module yourself, please co-ordinate the required changes
# with someone who can; ask around on python-dev for help. Fred
# Drake <fdrake@acm.org> will probably be listening there.
# NOTE WELL: You should also follow all the steps listed at
# https://docs.python.org/devguide/grammar.html
# Start symbols for the grammar:
# single_input is a single interactive statement;
# file_input is a module or sequence of commands read from an input file;
# eval_input is the input for the eval() functions.
# NB: compound_stmt in single_input is followed by extra NEWLINE!
single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
file_input: stmt* ENDMARKER
eval_input: testlist NEWLINE* ENDMARKER
decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef | async_funcdef)
# NOTE: Reinoud Elhorst, using ASYNC/AWAIT keywords instead of tokens
# skipping python3.5 compatibility, in favour of 3.7 solution
async_funcdef: 'async' funcdef
funcdef: 'def' NAME parameters ['->' test] ':' suite
parameters: '(' [typedargslist] ')'
typedargslist: (tfpdef ['=' test] (',' tfpdef ['=' test])* [','
['*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef]]
| '*' [tfpdef] (',' tfpdef ['=' test])* [',' '**' tfpdef] | '**' tfpdef)
tfpdef: NAME [':' test]
varargslist: (vfpdef ['=' test] (',' vfpdef ['=' test])* [','
['*' [vfpdef] (',' vfpdef ['=' test])* [',' '**' vfpdef] | '**' vfpdef]]
| '*' [vfpdef] (',' vfpdef ['=' test])* [',' '**' vfpdef] | '**' vfpdef)
vfpdef: NAME
stmt: simple_stmt | compound_stmt | NEWLINE
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt |
import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
expr_stmt: testlist_star_expr (augassign (yield_expr|testlist) |
('=' (yield_expr|testlist_star_expr))*)
testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']
augassign: ('+=' | '-=' | '*=' | '@=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' | '**=' | '//=')
# For normal assignments, additional restrictions enforced by the interpreter
del_stmt: 'del' exprlist
pass_stmt: 'pass'
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
return_stmt: 'return' [testlist]
yield_stmt: yield_expr
raise_stmt: 'raise' [test ['from' test]]
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS
import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+)
'import' ('*' | '(' import_as_names ')' | import_as_names))
import_as_name: NAME ['as' NAME]
dotted_as_name: dotted_name ['as' NAME]
import_as_names: import_as_name (',' import_as_name)* [',']
dotted_as_names: dotted_as_name (',' dotted_as_name)*
dotted_name: NAME ('.' NAME)*
global_stmt: 'global' NAME (',' NAME)*
nonlocal_stmt: 'nonlocal' NAME (',' NAME)*
assert_stmt: 'assert' test [',' test]
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt
async_stmt: 'async' (funcdef | with_stmt | for_stmt)
if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite]
while_stmt: 'while' test ':' suite ['else' ':' suite]
for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
try_stmt: ('try' ':' suite
((except_clause ':' suite)+
['else' ':' suite]
['finally' ':' suite] |
'finally' ':' suite))
with_stmt: 'with' with_item (',' with_item)* ':' suite
with_item: test ['as' expr]
# NB compile.c makes sure that the default except clause is last
except_clause: 'except' [test ['as' NAME]]
suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
test: or_test ['if' or_test 'else' test] | lambdef
test_nocond: or_test | lambdef_nocond
lambdef: 'lambda' [varargslist] ':' test
lambdef_nocond: 'lambda' [varargslist] ':' test_nocond
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: expr (comp_op expr)*
# <> isn't actually a valid comparison operator in Python. It's here for the
# sake of a __future__ import described in PEP 401 (which really works :-)
comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
star_expr: '*' expr
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('^' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
term: factor (('*'|'@'|'/'|'%'|'//') factor)*
factor: ('+'|'-'|'~') factor | power
power: atom_expr ['**' factor]
atom_expr: ['await'] atom trailer*
atom: ('(' [yield_expr|testlist_comp] ')' |
'[' [testlist_comp] ']' |
'{' [dictorsetmaker] '}' |
NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False')
strings: STRING+
testlist_comp: (test|star_expr) ( sync_comp_for | (',' (test|star_expr))* [','] )
trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
subscriptlist: subscript (',' subscript)* [',']
subscript: test | [test] ':' [test] [sliceop]
sliceop: ':' [test]
exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
testlist: test (',' test)* [',']
dictorsetmaker: ( ((test ':' test | '**' expr)
(sync_comp_for | (',' (test ':' test | '**' expr))* [','])) |
((test | star_expr)
(sync_comp_for | (',' (test | star_expr))* [','])) )
classdef: 'class' NAME ['(' [arglist] ')'] ':' suite
arglist: argument (',' argument)* [',']
# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity. ast.c makes sure it's a NAME.
# "test '=' test" is really "keyword '=' test", but we have no such token.
# These need to be in a single rule to avoid grammar that is ambiguous
# to our LL(1) parser. Even though 'test' includes '*expr' in star_expr,
# we explicitly match '*' here, too, to give it proper precedence.
# Illegal combinations and orderings are blocked in ast.c:
# multiple (test comp_for) arguments are blocked; keyword unpackings
# that precede iterable unpackings are blocked; etc.
argument: ( test [sync_comp_for] |
test '=' test |
'**' test |
'*' test )
comp_iter: sync_comp_for | comp_if
sync_comp_for: 'for' exprlist 'in' or_test [comp_iter]
comp_if: 'if' test_nocond [comp_iter]
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
yield_expr: 'yield' [yield_arg]
yield_arg: 'from' test | testlist

View file

@ -0,0 +1,158 @@
# Grammar for Python
# NOTE WELL: You should also follow all the steps listed at
# https://docs.python.org/devguide/grammar.html
# Start symbols for the grammar:
# single_input is a single interactive statement;
# file_input is a module or sequence of commands read from an input file;
# eval_input is the input for the eval() functions.
# NB: compound_stmt in single_input is followed by extra NEWLINE!
single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
file_input: stmt* ENDMARKER
eval_input: testlist NEWLINE* ENDMARKER
decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef | async_funcdef)
# NOTE: Francisco Souza/Reinoud Elhorst, using ASYNC/'await' keywords instead of
# skipping python3.5+ compatibility, in favour of 3.7 solution
async_funcdef: 'async' funcdef
funcdef: 'def' NAME parameters ['->' test] ':' suite
parameters: '(' [typedargslist] ')'
typedargslist: (tfpdef ['=' test] (',' tfpdef ['=' test])* [',' [
'*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
| '**' tfpdef [',']]]
| '*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
| '**' tfpdef [','])
tfpdef: NAME [':' test]
varargslist: (vfpdef ['=' test] (',' vfpdef ['=' test])* [',' [
'*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']]]
| '*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']
)
vfpdef: NAME
stmt: simple_stmt | compound_stmt | NEWLINE
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt |
import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) |
('=' (yield_expr|testlist_star_expr))*)
annassign: ':' test ['=' test]
testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']
augassign: ('+=' | '-=' | '*=' | '@=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' | '**=' | '//=')
# For normal and annotated assignments, additional restrictions enforced by the interpreter
del_stmt: 'del' exprlist
pass_stmt: 'pass'
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
return_stmt: 'return' [testlist]
yield_stmt: yield_expr
raise_stmt: 'raise' [test ['from' test]]
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS
import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+)
'import' ('*' | '(' import_as_names ')' | import_as_names))
import_as_name: NAME ['as' NAME]
dotted_as_name: dotted_name ['as' NAME]
import_as_names: import_as_name (',' import_as_name)* [',']
dotted_as_names: dotted_as_name (',' dotted_as_name)*
dotted_name: NAME ('.' NAME)*
global_stmt: 'global' NAME (',' NAME)*
nonlocal_stmt: 'nonlocal' NAME (',' NAME)*
assert_stmt: 'assert' test [',' test]
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt
async_stmt: 'async' (funcdef | with_stmt | for_stmt)
if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite]
while_stmt: 'while' test ':' suite ['else' ':' suite]
for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
try_stmt: ('try' ':' suite
((except_clause ':' suite)+
['else' ':' suite]
['finally' ':' suite] |
'finally' ':' suite))
with_stmt: 'with' with_item (',' with_item)* ':' suite
with_item: test ['as' expr]
# NB compile.c makes sure that the default except clause is last
except_clause: 'except' [test ['as' NAME]]
suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
test: or_test ['if' or_test 'else' test] | lambdef
test_nocond: or_test | lambdef_nocond
lambdef: 'lambda' [varargslist] ':' test
lambdef_nocond: 'lambda' [varargslist] ':' test_nocond
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: expr (comp_op expr)*
# <> isn't actually a valid comparison operator in Python. It's here for the
# sake of a __future__ import described in PEP 401 (which really works :-)
comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
star_expr: '*' expr
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('^' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
term: factor (('*'|'@'|'/'|'%'|'//') factor)*
factor: ('+'|'-'|'~') factor | power
power: atom_expr ['**' factor]
atom_expr: ['await'] atom trailer*
atom: ('(' [yield_expr|testlist_comp] ')' |
'[' [testlist_comp] ']' |
'{' [dictorsetmaker] '}' |
NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False')
testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
subscriptlist: subscript (',' subscript)* [',']
subscript: test | [test] ':' [test] [sliceop]
sliceop: ':' [test]
exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
testlist: test (',' test)* [',']
dictorsetmaker: ( ((test ':' test | '**' expr)
(comp_for | (',' (test ':' test | '**' expr))* [','])) |
((test | star_expr)
(comp_for | (',' (test | star_expr))* [','])) )
classdef: 'class' NAME ['(' [arglist] ')'] ':' suite
arglist: argument (',' argument)* [',']
# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity. ast.c makes sure it's a NAME.
# "test '=' test" is really "keyword '=' test", but we have no such token.
# These need to be in a single rule to avoid grammar that is ambiguous
# to our LL(1) parser. Even though 'test' includes '*expr' in star_expr,
# we explicitly match '*' here, too, to give it proper precedence.
# Illegal combinations and orderings are blocked in ast.c:
# multiple (test comp_for) arguments are blocked; keyword unpackings
# that precede iterable unpackings are blocked; etc.
argument: ( test [comp_for] |
test '=' test |
'**' test |
'*' test )
comp_iter: comp_for | comp_if
sync_comp_for: 'for' exprlist 'in' or_test [comp_iter]
comp_for: ['async'] sync_comp_for
comp_if: 'if' test_nocond [comp_iter]
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
yield_expr: 'yield' [yield_arg]
yield_arg: 'from' test | testlist
strings: (STRING | fstring)+
fstring: FSTRING_START fstring_content* FSTRING_END
fstring_content: FSTRING_STRING | fstring_expr
fstring_conversion: '!' NAME
fstring_expr: '{' testlist_comp [ fstring_conversion ] [ fstring_format_spec ] '}'
fstring_format_spec: ':' fstring_content*

View file

@ -0,0 +1,156 @@
# Grammar for Python
# NOTE WELL: You should also follow all the steps listed at
# https://docs.python.org/devguide/grammar.html
# Start symbols for the grammar:
# single_input is a single interactive statement;
# file_input is a module or sequence of commands read from an input file;
# eval_input is the input for the eval() functions.
# NB: compound_stmt in single_input is followed by extra NEWLINE!
single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
file_input: stmt* ENDMARKER
eval_input: testlist NEWLINE* ENDMARKER
decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef | async_funcdef)
async_funcdef: 'async' funcdef
funcdef: 'def' NAME parameters ['->' test] ':' suite
parameters: '(' [typedargslist] ')'
typedargslist: (tfpdef ['=' test] (',' tfpdef ['=' test])* [',' [
'*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
| '**' tfpdef [',']]]
| '*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
| '**' tfpdef [','])
tfpdef: NAME [':' test]
varargslist: (vfpdef ['=' test] (',' vfpdef ['=' test])* [',' [
'*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']]]
| '*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']
)
vfpdef: NAME
stmt: simple_stmt | compound_stmt | NEWLINE
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt |
import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) |
('=' (yield_expr|testlist_star_expr))*)
annassign: ':' test ['=' test]
testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']
augassign: ('+=' | '-=' | '*=' | '@=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' | '**=' | '//=')
# For normal and annotated assignments, additional restrictions enforced by the interpreter
del_stmt: 'del' exprlist
pass_stmt: 'pass'
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
return_stmt: 'return' [testlist]
yield_stmt: yield_expr
raise_stmt: 'raise' [test ['from' test]]
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS
import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+)
'import' ('*' | '(' import_as_names ')' | import_as_names))
import_as_name: NAME ['as' NAME]
dotted_as_name: dotted_name ['as' NAME]
import_as_names: import_as_name (',' import_as_name)* [',']
dotted_as_names: dotted_as_name (',' dotted_as_name)*
dotted_name: NAME ('.' NAME)*
global_stmt: 'global' NAME (',' NAME)*
nonlocal_stmt: 'nonlocal' NAME (',' NAME)*
assert_stmt: 'assert' test [',' test]
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt
async_stmt: 'async' (funcdef | with_stmt | for_stmt)
if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite]
while_stmt: 'while' test ':' suite ['else' ':' suite]
for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
try_stmt: ('try' ':' suite
((except_clause ':' suite)+
['else' ':' suite]
['finally' ':' suite] |
'finally' ':' suite))
with_stmt: 'with' with_item (',' with_item)* ':' suite
with_item: test ['as' expr]
# NB compile.c makes sure that the default except clause is last
except_clause: 'except' [test ['as' NAME]]
suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
test: or_test ['if' or_test 'else' test] | lambdef
test_nocond: or_test | lambdef_nocond
lambdef: 'lambda' [varargslist] ':' test
lambdef_nocond: 'lambda' [varargslist] ':' test_nocond
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: expr (comp_op expr)*
# <> isn't actually a valid comparison operator in Python. It's here for the
# sake of a __future__ import described in PEP 401 (which really works :-)
comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
star_expr: '*' expr
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('^' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
term: factor (('*'|'@'|'/'|'%'|'//') factor)*
factor: ('+'|'-'|'~') factor | power
power: atom_expr ['**' factor]
atom_expr: ['await'] atom trailer*
atom: ('(' [yield_expr|testlist_comp] ')' |
'[' [testlist_comp] ']' |
'{' [dictorsetmaker] '}' |
NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False')
testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
subscriptlist: subscript (',' subscript)* [',']
subscript: test | [test] ':' [test] [sliceop]
sliceop: ':' [test]
exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
testlist: test (',' test)* [',']
dictorsetmaker: ( ((test ':' test | '**' expr)
(comp_for | (',' (test ':' test | '**' expr))* [','])) |
((test | star_expr)
(comp_for | (',' (test | star_expr))* [','])) )
classdef: 'class' NAME ['(' [arglist] ')'] ':' suite
arglist: argument (',' argument)* [',']
# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity. ast.c makes sure it's a NAME.
# "test '=' test" is really "keyword '=' test", but we have no such token.
# These need to be in a single rule to avoid grammar that is ambiguous
# to our LL(1) parser. Even though 'test' includes '*expr' in star_expr,
# we explicitly match '*' here, too, to give it proper precedence.
# Illegal combinations and orderings are blocked in ast.c:
# multiple (test comp_for) arguments are blocked; keyword unpackings
# that precede iterable unpackings are blocked; etc.
argument: ( test [comp_for] |
test '=' test |
'**' test |
'*' test )
comp_iter: comp_for | comp_if
sync_comp_for: 'for' exprlist 'in' or_test [comp_iter]
comp_for: ['async'] sync_comp_for
comp_if: 'if' test_nocond [comp_iter]
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
yield_expr: 'yield' [yield_arg]
yield_arg: 'from' test | testlist
strings: (STRING | fstring)+
fstring: FSTRING_START fstring_content* FSTRING_END
fstring_content: FSTRING_STRING | fstring_expr
fstring_conversion: '!' NAME
fstring_expr: '{' testlist [ fstring_conversion ] [ fstring_format_spec ] '}'
fstring_format_spec: ':' fstring_content*

View file

@ -0,0 +1,171 @@
# Grammar for Python
# NOTE WELL: You should also follow all the steps listed at
# https://devguide.python.org/grammar/
# Start symbols for the grammar:
# single_input is a single interactive statement;
# file_input is a module or sequence of commands read from an input file;
# eval_input is the input for the eval() functions.
# NB: compound_stmt in single_input is followed by extra NEWLINE!
single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
file_input: stmt* ENDMARKER
eval_input: testlist NEWLINE* ENDMARKER
decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef | async_funcdef)
async_funcdef: 'async' funcdef
funcdef: 'def' NAME parameters ['->' test] ':' suite
parameters: '(' [typedargslist] ')'
typedargslist: (
(tfpdef ['=' test] (',' tfpdef ['=' test])* ',' '/' [',' [ tfpdef ['=' test] (
',' tfpdef ['=' test])* ([',' [
'*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
| '**' tfpdef [',']]])
| '*' [tfpdef] (',' tfpdef ['=' test])* ([',' ['**' tfpdef [',']]])
| '**' tfpdef [',']]] )
| (tfpdef ['=' test] (',' tfpdef ['=' test])* [',' [
'*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
| '**' tfpdef [',']]]
| '*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
| '**' tfpdef [','])
)
tfpdef: NAME [':' test]
varargslist: vfpdef ['=' test ](',' vfpdef ['=' test])* ',' '/' [',' [ (vfpdef ['=' test] (',' vfpdef ['=' test])* [',' [
'*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']]]
| '*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']) ]] | (vfpdef ['=' test] (',' vfpdef ['=' test])* [',' [
'*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']]]
| '*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']
)
vfpdef: NAME
stmt: simple_stmt | compound_stmt | NEWLINE
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt |
import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) |
('=' (yield_expr|testlist_star_expr))*)
annassign: ':' test ['=' (yield_expr|testlist_star_expr)]
testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']
augassign: ('+=' | '-=' | '*=' | '@=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' | '**=' | '//=')
# For normal and annotated assignments, additional restrictions enforced by the interpreter
del_stmt: 'del' exprlist
pass_stmt: 'pass'
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
return_stmt: 'return' [testlist_star_expr]
yield_stmt: yield_expr
raise_stmt: 'raise' [test ['from' test]]
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS
import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+)
'import' ('*' | '(' import_as_names ')' | import_as_names))
import_as_name: NAME ['as' NAME]
dotted_as_name: dotted_name ['as' NAME]
import_as_names: import_as_name (',' import_as_name)* [',']
dotted_as_names: dotted_as_name (',' dotted_as_name)*
dotted_name: NAME ('.' NAME)*
global_stmt: 'global' NAME (',' NAME)*
nonlocal_stmt: 'nonlocal' NAME (',' NAME)*
assert_stmt: 'assert' test [',' test]
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt
async_stmt: 'async' (funcdef | with_stmt | for_stmt)
if_stmt: 'if' namedexpr_test ':' suite ('elif' namedexpr_test ':' suite)* ['else' ':' suite]
while_stmt: 'while' namedexpr_test ':' suite ['else' ':' suite]
for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
try_stmt: ('try' ':' suite
((except_clause ':' suite)+
['else' ':' suite]
['finally' ':' suite] |
'finally' ':' suite))
with_stmt: 'with' with_item (',' with_item)* ':' suite
with_item: test ['as' expr]
# NB compile.c makes sure that the default except clause is last
except_clause: 'except' [test ['as' NAME]]
suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
namedexpr_test: test [':=' test]
test: or_test ['if' or_test 'else' test] | lambdef
test_nocond: or_test | lambdef_nocond
lambdef: 'lambda' [varargslist] ':' test
lambdef_nocond: 'lambda' [varargslist] ':' test_nocond
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: expr (comp_op expr)*
# <> isn't actually a valid comparison operator in Python. It's here for the
# sake of a __future__ import described in PEP 401 (which really works :-)
comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
star_expr: '*' expr
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('^' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
term: factor (('*'|'@'|'/'|'%'|'//') factor)*
factor: ('+'|'-'|'~') factor | power
power: atom_expr ['**' factor]
atom_expr: ['await'] atom trailer*
atom: ('(' [yield_expr|testlist_comp] ')' |
'[' [testlist_comp] ']' |
'{' [dictorsetmaker] '}' |
NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False')
testlist_comp: (namedexpr_test|star_expr) ( comp_for | (',' (namedexpr_test|star_expr))* [','] )
trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
subscriptlist: subscript (',' subscript)* [',']
subscript: test | [test] ':' [test] [sliceop]
sliceop: ':' [test]
exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
testlist: test (',' test)* [',']
dictorsetmaker: ( ((test ':' test | '**' expr)
(comp_for | (',' (test ':' test | '**' expr))* [','])) |
((test | star_expr)
(comp_for | (',' (test | star_expr))* [','])) )
classdef: 'class' NAME ['(' [arglist] ')'] ':' suite
arglist: argument (',' argument)* [',']
# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity. ast.c makes sure it's a NAME.
# "test '=' test" is really "keyword '=' test", but we have no such token.
# These need to be in a single rule to avoid grammar that is ambiguous
# to our LL(1) parser. Even though 'test' includes '*expr' in star_expr,
# we explicitly match '*' here, too, to give it proper precedence.
# Illegal combinations and orderings are blocked in ast.c:
# multiple (test comp_for) arguments are blocked; keyword unpackings
# that precede iterable unpackings are blocked; etc.
argument: ( test [comp_for] |
test ':=' test |
test '=' test |
'**' test |
'*' test )
comp_iter: comp_for | comp_if
sync_comp_for: 'for' exprlist 'in' or_test [comp_iter]
comp_for: ['async'] sync_comp_for
comp_if: 'if' test_nocond [comp_iter]
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
yield_expr: 'yield' [yield_arg]
yield_arg: 'from' test | testlist_star_expr
strings: (STRING | fstring)+
fstring: FSTRING_START fstring_content* FSTRING_END
fstring_content: FSTRING_STRING | fstring_expr
fstring_conversion: '!' NAME
fstring_expr: '{' testlist ['='] [ fstring_conversion ] [ fstring_format_spec ] '}'
fstring_format_spec: ':' fstring_content*

View file

@ -0,0 +1,171 @@
# Grammar for Python
# NOTE WELL: You should also follow all the steps listed at
# https://devguide.python.org/grammar/
# Start symbols for the grammar:
# single_input is a single interactive statement;
# file_input is a module or sequence of commands read from an input file;
# eval_input is the input for the eval() functions.
# NB: compound_stmt in single_input is followed by extra NEWLINE!
single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
file_input: stmt* ENDMARKER
eval_input: testlist NEWLINE* ENDMARKER
decorator: '@' namedexpr_test NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef | async_funcdef)
async_funcdef: 'async' funcdef
funcdef: 'def' NAME parameters ['->' test] ':' suite
parameters: '(' [typedargslist] ')'
typedargslist: (
(tfpdef ['=' test] (',' tfpdef ['=' test])* ',' '/' [',' [ tfpdef ['=' test] (
',' tfpdef ['=' test])* ([',' [
'*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
| '**' tfpdef [',']]])
| '*' [tfpdef] (',' tfpdef ['=' test])* ([',' ['**' tfpdef [',']]])
| '**' tfpdef [',']]] )
| (tfpdef ['=' test] (',' tfpdef ['=' test])* [',' [
'*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
| '**' tfpdef [',']]]
| '*' [tfpdef] (',' tfpdef ['=' test])* [',' ['**' tfpdef [',']]]
| '**' tfpdef [','])
)
tfpdef: NAME [':' test]
varargslist: vfpdef ['=' test ](',' vfpdef ['=' test])* ',' '/' [',' [ (vfpdef ['=' test] (',' vfpdef ['=' test])* [',' [
'*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']]]
| '*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']) ]] | (vfpdef ['=' test] (',' vfpdef ['=' test])* [',' [
'*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']]]
| '*' [vfpdef] (',' vfpdef ['=' test])* [',' ['**' vfpdef [',']]]
| '**' vfpdef [',']
)
vfpdef: NAME
stmt: simple_stmt | compound_stmt | NEWLINE
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt |
import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) |
('=' (yield_expr|testlist_star_expr))*)
annassign: ':' test ['=' (yield_expr|testlist_star_expr)]
testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']
augassign: ('+=' | '-=' | '*=' | '@=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' | '**=' | '//=')
# For normal and annotated assignments, additional restrictions enforced by the interpreter
del_stmt: 'del' exprlist
pass_stmt: 'pass'
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
return_stmt: 'return' [testlist_star_expr]
yield_stmt: yield_expr
raise_stmt: 'raise' [test ['from' test]]
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS
import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+)
'import' ('*' | '(' import_as_names ')' | import_as_names))
import_as_name: NAME ['as' NAME]
dotted_as_name: dotted_name ['as' NAME]
import_as_names: import_as_name (',' import_as_name)* [',']
dotted_as_names: dotted_as_name (',' dotted_as_name)*
dotted_name: NAME ('.' NAME)*
global_stmt: 'global' NAME (',' NAME)*
nonlocal_stmt: 'nonlocal' NAME (',' NAME)*
assert_stmt: 'assert' test [',' test]
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt
async_stmt: 'async' (funcdef | with_stmt | for_stmt)
if_stmt: 'if' namedexpr_test ':' suite ('elif' namedexpr_test ':' suite)* ['else' ':' suite]
while_stmt: 'while' namedexpr_test ':' suite ['else' ':' suite]
for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
try_stmt: ('try' ':' suite
((except_clause ':' suite)+
['else' ':' suite]
['finally' ':' suite] |
'finally' ':' suite))
with_stmt: 'with' with_item (',' with_item)* ':' suite
with_item: test ['as' expr]
# NB compile.c makes sure that the default except clause is last
except_clause: 'except' [test ['as' NAME]]
suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
namedexpr_test: test [':=' test]
test: or_test ['if' or_test 'else' test] | lambdef
test_nocond: or_test | lambdef_nocond
lambdef: 'lambda' [varargslist] ':' test
lambdef_nocond: 'lambda' [varargslist] ':' test_nocond
or_test: and_test ('or' and_test)*
and_test: not_test ('and' not_test)*
not_test: 'not' not_test | comparison
comparison: expr (comp_op expr)*
# <> isn't actually a valid comparison operator in Python. It's here for the
# sake of a __future__ import described in PEP 401 (which really works :-)
comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
star_expr: '*' expr
expr: xor_expr ('|' xor_expr)*
xor_expr: and_expr ('^' and_expr)*
and_expr: shift_expr ('&' shift_expr)*
shift_expr: arith_expr (('<<'|'>>') arith_expr)*
arith_expr: term (('+'|'-') term)*
term: factor (('*'|'@'|'/'|'%'|'//') factor)*
factor: ('+'|'-'|'~') factor | power
power: atom_expr ['**' factor]
atom_expr: ['await'] atom trailer*
atom: ('(' [yield_expr|testlist_comp] ')' |
'[' [testlist_comp] ']' |
'{' [dictorsetmaker] '}' |
NAME | NUMBER | strings | '...' | 'None' | 'True' | 'False')
testlist_comp: (namedexpr_test|star_expr) ( comp_for | (',' (namedexpr_test|star_expr))* [','] )
trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
subscriptlist: subscript (',' subscript)* [',']
subscript: test | [test] ':' [test] [sliceop]
sliceop: ':' [test]
exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
testlist: test (',' test)* [',']
dictorsetmaker: ( ((test ':' test | '**' expr)
(comp_for | (',' (test ':' test | '**' expr))* [','])) |
((test | star_expr)
(comp_for | (',' (test | star_expr))* [','])) )
classdef: 'class' NAME ['(' [arglist] ')'] ':' suite
arglist: argument (',' argument)* [',']
# The reason that keywords are test nodes instead of NAME is that using NAME
# results in an ambiguity. ast.c makes sure it's a NAME.
# "test '=' test" is really "keyword '=' test", but we have no such token.
# These need to be in a single rule to avoid grammar that is ambiguous
# to our LL(1) parser. Even though 'test' includes '*expr' in star_expr,
# we explicitly match '*' here, too, to give it proper precedence.
# Illegal combinations and orderings are blocked in ast.c:
# multiple (test comp_for) arguments are blocked; keyword unpackings
# that precede iterable unpackings are blocked; etc.
argument: ( test [comp_for] |
test ':=' test |
test '=' test |
'**' test |
'*' test )
comp_iter: comp_for | comp_if
sync_comp_for: 'for' exprlist 'in' or_test [comp_iter]
comp_for: ['async'] sync_comp_for
comp_if: 'if' test_nocond [comp_iter]
# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
yield_expr: 'yield' [yield_arg]
yield_arg: 'from' test | testlist_star_expr
strings: (STRING | fstring)+
fstring: FSTRING_START fstring_content* FSTRING_END
fstring_content: FSTRING_STRING | fstring_expr
fstring_conversion: '!' NAME
fstring_expr: '{' testlist ['='] [ fstring_conversion ] [ fstring_format_spec ] '}'
fstring_format_spec: ':' fstring_content*

View file

@ -0,0 +1,217 @@
from parso.python import tree
from parso.python.token import PythonTokenTypes
from parso.parser import BaseParser
NAME = PythonTokenTypes.NAME
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
class Parser(BaseParser):
"""
This class is used to parse a Python file, it then divides them into a
class structure of different scopes.
:param pgen_grammar: The grammar object of pgen2. Loaded by load_grammar.
"""
node_map = {
'expr_stmt': tree.ExprStmt,
'classdef': tree.Class,
'funcdef': tree.Function,
'file_input': tree.Module,
'import_name': tree.ImportName,
'import_from': tree.ImportFrom,
'break_stmt': tree.KeywordStatement,
'continue_stmt': tree.KeywordStatement,
'return_stmt': tree.ReturnStmt,
'raise_stmt': tree.KeywordStatement,
'yield_expr': tree.YieldExpr,
'del_stmt': tree.KeywordStatement,
'pass_stmt': tree.KeywordStatement,
'global_stmt': tree.GlobalStmt,
'nonlocal_stmt': tree.KeywordStatement,
'print_stmt': tree.KeywordStatement,
'assert_stmt': tree.AssertStmt,
'if_stmt': tree.IfStmt,
'with_stmt': tree.WithStmt,
'for_stmt': tree.ForStmt,
'while_stmt': tree.WhileStmt,
'try_stmt': tree.TryStmt,
'sync_comp_for': tree.SyncCompFor,
# Not sure if this is the best idea, but IMO it's the easiest way to
# avoid extreme amounts of work around the subtle difference of 2/3
# grammar in list comoprehensions.
'list_for': tree.SyncCompFor,
'decorator': tree.Decorator,
'lambdef': tree.Lambda,
'old_lambdef': tree.Lambda,
'lambdef_nocond': tree.Lambda,
}
default_node = tree.PythonNode
# Names/Keywords are handled separately
_leaf_map = {
PythonTokenTypes.STRING: tree.String,
PythonTokenTypes.NUMBER: tree.Number,
PythonTokenTypes.NEWLINE: tree.Newline,
PythonTokenTypes.ENDMARKER: tree.EndMarker,
PythonTokenTypes.FSTRING_STRING: tree.FStringString,
PythonTokenTypes.FSTRING_START: tree.FStringStart,
PythonTokenTypes.FSTRING_END: tree.FStringEnd,
}
def __init__(self, pgen_grammar, error_recovery=True, start_nonterminal='file_input'):
super(Parser, self).__init__(pgen_grammar, start_nonterminal,
error_recovery=error_recovery)
self.syntax_errors = []
self._omit_dedent_list = []
self._indent_counter = 0
def parse(self, tokens):
if self._error_recovery:
if self._start_nonterminal != 'file_input':
raise NotImplementedError
tokens = self._recovery_tokenize(tokens)
return super(Parser, self).parse(tokens)
def convert_node(self, nonterminal, children):
"""
Convert raw node information to a PythonBaseNode instance.
This is passed to the parser driver which calls it whenever a reduction of a
grammar rule produces a new complete node, so that the tree is build
strictly bottom-up.
"""
try:
node = self.node_map[nonterminal](children)
except KeyError:
if nonterminal == 'suite':
# We don't want the INDENT/DEDENT in our parser tree. Those
# leaves are just cancer. They are virtual leaves and not real
# ones and therefore have pseudo start/end positions and no
# prefixes. Just ignore them.
children = [children[0]] + children[2:-1]
elif nonterminal == 'list_if':
# Make transitioning from 2 to 3 easier.
nonterminal = 'comp_if'
elif nonterminal == 'listmaker':
# Same as list_if above.
nonterminal = 'testlist_comp'
node = self.default_node(nonterminal, children)
for c in children:
c.parent = node
return node
def convert_leaf(self, type, value, prefix, start_pos):
# print('leaf', repr(value), token.tok_name[type])
if type == NAME:
if value in self._pgen_grammar.reserved_syntax_strings:
return tree.Keyword(value, start_pos, prefix)
else:
return tree.Name(value, start_pos, prefix)
return self._leaf_map.get(type, tree.Operator)(value, start_pos, prefix)
def error_recovery(self, token):
tos_nodes = self.stack[-1].nodes
if tos_nodes:
last_leaf = tos_nodes[-1].get_last_leaf()
else:
last_leaf = None
if self._start_nonterminal == 'file_input' and \
(token.type == PythonTokenTypes.ENDMARKER
or token.type == DEDENT and not last_leaf.value.endswith('\n')
and not last_leaf.value.endswith('\r')):
# In Python statements need to end with a newline. But since it's
# possible (and valid in Python) that there's no newline at the
# end of a file, we have to recover even if the user doesn't want
# error recovery.
if self.stack[-1].dfa.from_rule == 'simple_stmt':
try:
plan = self.stack[-1].dfa.transitions[PythonTokenTypes.NEWLINE]
except KeyError:
pass
else:
if plan.next_dfa.is_final and not plan.dfa_pushes:
# We are ignoring here that the newline would be
# required for a simple_stmt.
self.stack[-1].dfa = plan.next_dfa
self._add_token(token)
return
if not self._error_recovery:
return super(Parser, self).error_recovery(token)
def current_suite(stack):
# For now just discard everything that is not a suite or
# file_input, if we detect an error.
for until_index, stack_node in reversed(list(enumerate(stack))):
# `suite` can sometimes be only simple_stmt, not stmt.
if stack_node.nonterminal == 'file_input':
break
elif stack_node.nonterminal == 'suite':
# In the case where we just have a newline we don't want to
# do error recovery here. In all other cases, we want to do
# error recovery.
if len(stack_node.nodes) != 1:
break
return until_index
until_index = current_suite(self.stack)
if self._stack_removal(until_index + 1):
self._add_token(token)
else:
typ, value, start_pos, prefix = token
if typ == INDENT:
# For every deleted INDENT we have to delete a DEDENT as well.
# Otherwise the parser will get into trouble and DEDENT too early.
self._omit_dedent_list.append(self._indent_counter)
error_leaf = tree.PythonErrorLeaf(typ.name, value, start_pos, prefix)
self.stack[-1].nodes.append(error_leaf)
tos = self.stack[-1]
if tos.nonterminal == 'suite':
# Need at least one statement in the suite. This happend with the
# error recovery above.
try:
tos.dfa = tos.dfa.arcs['stmt']
except KeyError:
# We're already in a final state.
pass
def _stack_removal(self, start_index):
all_nodes = [node for stack_node in self.stack[start_index:] for node in stack_node.nodes]
if all_nodes:
node = tree.PythonErrorNode(all_nodes)
for n in all_nodes:
n.parent = node
self.stack[start_index - 1].nodes.append(node)
self.stack[start_index:] = []
return bool(all_nodes)
def _recovery_tokenize(self, tokens):
for token in tokens:
typ = token[0]
if typ == DEDENT:
# We need to count indents, because if we just omit any DEDENT,
# we might omit them in the wrong place.
o = self._omit_dedent_list
if o and o[-1] == self._indent_counter:
o.pop()
self._indent_counter -= 1
continue
self._indent_counter -= 1
elif typ == INDENT:
self._indent_counter += 1
yield token

View file

@ -0,0 +1,727 @@
import re
from contextlib import contextmanager
from parso.python.errors import ErrorFinder, ErrorFinderConfig
from parso.normalizer import Rule
from parso.python.tree import search_ancestor, Flow, Scope
_IMPORT_TYPES = ('import_name', 'import_from')
_SUITE_INTRODUCERS = ('classdef', 'funcdef', 'if_stmt', 'while_stmt',
'for_stmt', 'try_stmt', 'with_stmt')
_NON_STAR_TYPES = ('term', 'import_from', 'power')
_OPENING_BRACKETS = '(', '[', '{'
_CLOSING_BRACKETS = ')', ']', '}'
_FACTOR = '+', '-', '~'
_ALLOW_SPACE = '*', '+', '-', '**', '/', '//', '@'
_BITWISE_OPERATOR = '<<', '>>', '|', '&', '^'
_NEEDS_SPACE = ('=', '%', '->',
'<', '>', '==', '>=', '<=', '<>', '!=',
'+=', '-=', '*=', '@=', '/=', '%=', '&=', '|=', '^=', '<<=',
'>>=', '**=', '//=')
_NEEDS_SPACE += _BITWISE_OPERATOR
_IMPLICIT_INDENTATION_TYPES = ('dictorsetmaker', 'argument')
_POSSIBLE_SLICE_PARENTS = ('subscript', 'subscriptlist', 'sliceop')
class IndentationTypes(object):
VERTICAL_BRACKET = object()
HANGING_BRACKET = object()
BACKSLASH = object()
SUITE = object()
IMPLICIT = object()
class IndentationNode(object):
type = IndentationTypes.SUITE
def __init__(self, config, indentation, parent=None):
self.bracket_indentation = self.indentation = indentation
self.parent = parent
def __repr__(self):
return '<%s>' % self.__class__.__name__
def get_latest_suite_node(self):
n = self
while n is not None:
if n.type == IndentationTypes.SUITE:
return n
n = n.parent
class BracketNode(IndentationNode):
def __init__(self, config, leaf, parent, in_suite_introducer=False):
self.leaf = leaf
# Figure out here what the indentation is. For chained brackets
# we can basically use the previous indentation.
previous_leaf = leaf
n = parent
if n.type == IndentationTypes.IMPLICIT:
n = n.parent
while True:
if hasattr(n, 'leaf') and previous_leaf.line != n.leaf.line:
break
previous_leaf = previous_leaf.get_previous_leaf()
if not isinstance(n, BracketNode) or previous_leaf != n.leaf:
break
n = n.parent
parent_indentation = n.indentation
next_leaf = leaf.get_next_leaf()
if '\n' in next_leaf.prefix:
# This implies code like:
# foobarbaz(
# a,
# b,
# )
self.bracket_indentation = parent_indentation \
+ config.closing_bracket_hanging_indentation
self.indentation = parent_indentation + config.indentation
self.type = IndentationTypes.HANGING_BRACKET
else:
# Implies code like:
# foobarbaz(
# a,
# b,
# )
expected_end_indent = leaf.end_pos[1]
if '\t' in config.indentation:
self.indentation = None
else:
self.indentation = ' ' * expected_end_indent
self.bracket_indentation = self.indentation
self.type = IndentationTypes.VERTICAL_BRACKET
if in_suite_introducer and parent.type == IndentationTypes.SUITE \
and self.indentation == parent_indentation + config.indentation:
self.indentation += config.indentation
# The closing bracket should have the same indentation.
self.bracket_indentation = self.indentation
self.parent = parent
class ImplicitNode(BracketNode):
"""
Implicit indentation after keyword arguments, default arguments,
annotations and dict values.
"""
def __init__(self, config, leaf, parent):
super(ImplicitNode, self).__init__(config, leaf, parent)
self.type = IndentationTypes.IMPLICIT
next_leaf = leaf.get_next_leaf()
if leaf == ':' and '\n' not in next_leaf.prefix:
self.indentation += ' '
class BackslashNode(IndentationNode):
type = IndentationTypes.BACKSLASH
def __init__(self, config, parent_indentation, containing_leaf, spacing, parent=None):
expr_stmt = search_ancestor(containing_leaf, 'expr_stmt')
if expr_stmt is not None:
equals = expr_stmt.children[-2]
if '\t' in config.indentation:
# TODO unite with the code of BracketNode
self.indentation = None
else:
# If the backslash follows the equals, use normal indentation
# otherwise it should align with the equals.
if equals.end_pos == spacing.start_pos:
self.indentation = parent_indentation + config.indentation
else:
# +1 because there is a space.
self.indentation = ' ' * (equals.end_pos[1] + 1)
else:
self.indentation = parent_indentation + config.indentation
self.bracket_indentation = self.indentation
self.parent = parent
def _is_magic_name(name):
return name.value.startswith('__') and name.value.endswith('__')
class PEP8Normalizer(ErrorFinder):
def __init__(self, *args, **kwargs):
super(PEP8Normalizer, self).__init__(*args, **kwargs)
self._previous_part = None
self._previous_leaf = None
self._on_newline = True
self._newline_count = 0
self._wanted_newline_count = None
self._max_new_lines_in_prefix = 0
self._new_statement = True
self._implicit_indentation_possible = False
# The top of stack of the indentation nodes.
self._indentation_tos = self._last_indentation_tos = \
IndentationNode(self._config, indentation='')
self._in_suite_introducer = False
if ' ' in self._config.indentation:
self._indentation_type = 'spaces'
self._wrong_indentation_char = '\t'
else:
self._indentation_type = 'tabs'
self._wrong_indentation_char = ' '
@contextmanager
def visit_node(self, node):
with super(PEP8Normalizer, self).visit_node(node):
with self._visit_node(node):
yield
@contextmanager
def _visit_node(self, node):
typ = node.type
if typ in 'import_name':
names = node.get_defined_names()
if len(names) > 1:
for name in names[:1]:
self.add_issue(name, 401, 'Multiple imports on one line')
elif typ == 'lambdef':
expr_stmt = node.parent
# Check if it's simply defining a single name, not something like
# foo.bar or x[1], where using a lambda could make more sense.
if expr_stmt.type == 'expr_stmt' and any(n.type == 'name' for n in expr_stmt.children[:-2:2]):
self.add_issue(node, 731, 'Do not assign a lambda expression, use a def')
elif typ == 'try_stmt':
for child in node.children:
# Here we can simply check if it's an except, because otherwise
# it would be an except_clause.
if child.type == 'keyword' and child.value == 'except':
self.add_issue(child, 722, 'Do not use bare except, specify exception instead')
elif typ == 'comparison':
for child in node.children:
if child.type not in ('atom_expr', 'power'):
continue
if len(child.children) > 2:
continue
trailer = child.children[1]
atom = child.children[0]
if trailer.type == 'trailer' and atom.type == 'name' \
and atom.value == 'type':
self.add_issue(node, 721, "Do not compare types, use 'isinstance()")
break
elif typ == 'file_input':
endmarker = node.children[-1]
prev = endmarker.get_previous_leaf()
prefix = endmarker.prefix
if (not prefix.endswith('\n') and (
prefix or prev is None or prev.value != '\n')):
self.add_issue(endmarker, 292, "No newline at end of file")
if typ in _IMPORT_TYPES:
simple_stmt = node.parent
module = simple_stmt.parent
#if module.type == 'simple_stmt':
if module.type == 'file_input':
index = module.children.index(simple_stmt)
for child in module.children[:index]:
children = [child]
if child.type == 'simple_stmt':
# Remove the newline.
children = child.children[:-1]
found_docstring = False
for c in children:
if c.type == 'string' and not found_docstring:
continue
found_docstring = True
if c.type == 'expr_stmt' and \
all(_is_magic_name(n) for n in c.get_defined_names()):
continue
if c.type in _IMPORT_TYPES or isinstance(c, Flow):
continue
self.add_issue(node, 402, 'Module level import not at top of file')
break
else:
continue
break
implicit_indentation_possible = typ in _IMPLICIT_INDENTATION_TYPES
in_introducer = typ in _SUITE_INTRODUCERS
if in_introducer:
self._in_suite_introducer = True
elif typ == 'suite':
if self._indentation_tos.type == IndentationTypes.BACKSLASH:
self._indentation_tos = self._indentation_tos.parent
self._indentation_tos = IndentationNode(
self._config,
self._indentation_tos.indentation + self._config.indentation,
parent=self._indentation_tos
)
elif implicit_indentation_possible:
self._implicit_indentation_possible = True
yield
if typ == 'suite':
assert self._indentation_tos.type == IndentationTypes.SUITE
self._indentation_tos = self._indentation_tos.parent
# If we dedent, no lines are needed anymore.
self._wanted_newline_count = None
elif implicit_indentation_possible:
self._implicit_indentation_possible = False
if self._indentation_tos.type == IndentationTypes.IMPLICIT:
self._indentation_tos = self._indentation_tos.parent
elif in_introducer:
self._in_suite_introducer = False
if typ in ('classdef', 'funcdef'):
self._wanted_newline_count = self._get_wanted_blank_lines_count()
def _check_tabs_spaces(self, spacing):
if self._wrong_indentation_char in spacing.value:
self.add_issue(spacing, 101, 'Indentation contains ' + self._indentation_type)
return True
return False
def _get_wanted_blank_lines_count(self):
suite_node = self._indentation_tos.get_latest_suite_node()
return int(suite_node.parent is None) + 1
def _reset_newlines(self, spacing, leaf, is_comment=False):
self._max_new_lines_in_prefix = \
max(self._max_new_lines_in_prefix, self._newline_count)
wanted = self._wanted_newline_count
if wanted is not None:
# Need to substract one
blank_lines = self._newline_count - 1
if wanted > blank_lines and leaf.type != 'endmarker':
# In case of a comment we don't need to add the issue, yet.
if not is_comment:
# TODO end_pos wrong.
code = 302 if wanted == 2 else 301
message = "expected %s blank line, found %s" \
% (wanted, blank_lines)
self.add_issue(spacing, code, message)
self._wanted_newline_count = None
else:
self._wanted_newline_count = None
if not is_comment:
wanted = self._get_wanted_blank_lines_count()
actual = self._max_new_lines_in_prefix - 1
val = leaf.value
needs_lines = (
val == '@' and leaf.parent.type == 'decorator'
or (
val == 'class'
or val == 'async' and leaf.get_next_leaf() == 'def'
or val == 'def' and self._previous_leaf != 'async'
) and leaf.parent.parent.type != 'decorated'
)
if needs_lines and actual < wanted:
func_or_cls = leaf.parent
suite = func_or_cls.parent
if suite.type == 'decorated':
suite = suite.parent
# The first leaf of a file or a suite should not need blank
# lines.
if suite.children[int(suite.type == 'suite')] != func_or_cls:
code = 302 if wanted == 2 else 301
message = "expected %s blank line, found %s" \
% (wanted, actual)
self.add_issue(spacing, code, message)
self._max_new_lines_in_prefix = 0
self._newline_count = 0
def visit_leaf(self, leaf):
super(PEP8Normalizer, self).visit_leaf(leaf)
for part in leaf._split_prefix():
if part.type == 'spacing':
# This part is used for the part call after for.
break
self._visit_part(part, part.create_spacing_part(), leaf)
self._analyse_non_prefix(leaf)
self._visit_part(leaf, part, leaf)
# Cleanup
self._last_indentation_tos = self._indentation_tos
self._new_statement = leaf.type == 'newline'
# TODO does this work? with brackets and stuff?
if leaf.type == 'newline' and \
self._indentation_tos.type == IndentationTypes.BACKSLASH:
self._indentation_tos = self._indentation_tos.parent
if leaf.value == ':' and leaf.parent.type in _SUITE_INTRODUCERS:
self._in_suite_introducer = False
elif leaf.value == 'elif':
self._in_suite_introducer = True
if not self._new_statement:
self._reset_newlines(part, leaf)
self._max_blank_lines = 0
self._previous_leaf = leaf
return leaf.value
def _visit_part(self, part, spacing, leaf):
value = part.value
type_ = part.type
if type_ == 'error_leaf':
return
if value == ',' and part.parent.type == 'dictorsetmaker':
self._indentation_tos = self._indentation_tos.parent
node = self._indentation_tos
if type_ == 'comment':
if value.startswith('##'):
# Whole blocks of # should not raise an error.
if value.lstrip('#'):
self.add_issue(part, 266, "Too many leading '#' for block comment.")
elif self._on_newline:
if not re.match(r'#:? ', value) and not value == '#' \
and not (value.startswith('#!') and part.start_pos == (1, 0)):
self.add_issue(part, 265, "Block comment should start with '# '")
else:
if not re.match(r'#:? [^ ]', value):
self.add_issue(part, 262, "Inline comment should start with '# '")
self._reset_newlines(spacing, leaf, is_comment=True)
elif type_ == 'newline':
if self._newline_count > self._get_wanted_blank_lines_count():
self.add_issue(part, 303, "Too many blank lines (%s)" % self._newline_count)
elif leaf in ('def', 'class') \
and leaf.parent.parent.type == 'decorated':
self.add_issue(part, 304, "Blank lines found after function decorator")
self._newline_count += 1
if type_ == 'backslash':
# TODO is this enough checking? What about ==?
if node.type != IndentationTypes.BACKSLASH:
if node.type != IndentationTypes.SUITE:
self.add_issue(part, 502, 'The backslash is redundant between brackets')
else:
indentation = node.indentation
if self._in_suite_introducer and node.type == IndentationTypes.SUITE:
indentation += self._config.indentation
self._indentation_tos = BackslashNode(
self._config,
indentation,
part,
spacing,
parent=self._indentation_tos
)
elif self._on_newline:
indentation = spacing.value
if node.type == IndentationTypes.BACKSLASH \
and self._previous_part.type == 'newline':
self._indentation_tos = self._indentation_tos.parent
if not self._check_tabs_spaces(spacing):
should_be_indentation = node.indentation
if type_ == 'comment':
# Comments can be dedented. So we have to care for that.
n = self._last_indentation_tos
while True:
if len(indentation) > len(n.indentation):
break
should_be_indentation = n.indentation
self._last_indentation_tos = n
if n == node:
break
n = n.parent
if self._new_statement:
if type_ == 'newline':
if indentation:
self.add_issue(spacing, 291, 'Trailing whitespace')
elif indentation != should_be_indentation:
s = '%s %s' % (len(self._config.indentation), self._indentation_type)
self.add_issue(part, 111, 'Indentation is not a multiple of ' + s)
else:
if value in '])}':
should_be_indentation = node.bracket_indentation
else:
should_be_indentation = node.indentation
if self._in_suite_introducer and indentation == \
node.get_latest_suite_node().indentation \
+ self._config.indentation:
self.add_issue(part, 129, "Line with same indent as next logical block")
elif indentation != should_be_indentation:
if not self._check_tabs_spaces(spacing) and part.value != '\n':
if value in '])}':
if node.type == IndentationTypes.VERTICAL_BRACKET:
self.add_issue(part, 124, "Closing bracket does not match visual indentation")
else:
self.add_issue(part, 123, "Losing bracket does not match indentation of opening bracket's line")
else:
if len(indentation) < len(should_be_indentation):
if node.type == IndentationTypes.VERTICAL_BRACKET:
self.add_issue(part, 128, 'Continuation line under-indented for visual indent')
elif node.type == IndentationTypes.BACKSLASH:
self.add_issue(part, 122, 'Continuation line missing indentation or outdented')
elif node.type == IndentationTypes.IMPLICIT:
self.add_issue(part, 135, 'xxx')
else:
self.add_issue(part, 121, 'Continuation line under-indented for hanging indent')
else:
if node.type == IndentationTypes.VERTICAL_BRACKET:
self.add_issue(part, 127, 'Continuation line over-indented for visual indent')
elif node.type == IndentationTypes.IMPLICIT:
self.add_issue(part, 136, 'xxx')
else:
self.add_issue(part, 126, 'Continuation line over-indented for hanging indent')
else:
self._check_spacing(part, spacing)
self._check_line_length(part, spacing)
# -------------------------------
# Finalizing. Updating the state.
# -------------------------------
if value and value in '()[]{}' and type_ != 'error_leaf' \
and part.parent.type != 'error_node':
if value in _OPENING_BRACKETS:
self._indentation_tos = BracketNode(
self._config, part,
parent=self._indentation_tos,
in_suite_introducer=self._in_suite_introducer
)
else:
assert node.type != IndentationTypes.IMPLICIT
self._indentation_tos = self._indentation_tos.parent
elif value in ('=', ':') and self._implicit_indentation_possible \
and part.parent.type in _IMPLICIT_INDENTATION_TYPES:
indentation = node.indentation
self._indentation_tos = ImplicitNode(
self._config, part, parent=self._indentation_tos
)
self._on_newline = type_ in ('newline', 'backslash', 'bom')
self._previous_part = part
self._previous_spacing = spacing
def _check_line_length(self, part, spacing):
if part.type == 'backslash':
last_column = part.start_pos[1] + 1
else:
last_column = part.end_pos[1]
if last_column > self._config.max_characters \
and spacing.start_pos[1] <= self._config.max_characters :
# Special case for long URLs in multi-line docstrings or comments,
# but still report the error when the 72 first chars are whitespaces.
report = True
if part.type == 'comment':
splitted = part.value[1:].split()
if len(splitted) == 1 \
and (part.end_pos[1] - len(splitted[0])) < 72:
report = False
if report:
self.add_issue(
part,
501,
'Line too long (%s > %s characters)' %
(last_column, self._config.max_characters),
)
def _check_spacing(self, part, spacing):
def add_if_spaces(*args):
if spaces:
return self.add_issue(*args)
def add_not_spaces(*args):
if not spaces:
return self.add_issue(*args)
spaces = spacing.value
prev = self._previous_part
if prev is not None and prev.type == 'error_leaf' or part.type == 'error_leaf':
return
type_ = part.type
if '\t' in spaces:
self.add_issue(spacing, 223, 'Used tab to separate tokens')
elif type_ == 'comment':
if len(spaces) < self._config.spaces_before_comment:
self.add_issue(spacing, 261, 'At least two spaces before inline comment')
elif type_ == 'newline':
add_if_spaces(spacing, 291, 'Trailing whitespace')
elif len(spaces) > 1:
self.add_issue(spacing, 221, 'Multiple spaces used')
else:
if prev in _OPENING_BRACKETS:
message = "Whitespace after '%s'" % part.value
add_if_spaces(spacing, 201, message)
elif part in _CLOSING_BRACKETS:
message = "Whitespace before '%s'" % part.value
add_if_spaces(spacing, 202, message)
elif part in (',', ';') or part == ':' \
and part.parent.type not in _POSSIBLE_SLICE_PARENTS:
message = "Whitespace before '%s'" % part.value
add_if_spaces(spacing, 203, message)
elif prev == ':' and prev.parent.type in _POSSIBLE_SLICE_PARENTS:
pass # TODO
elif prev in (',', ';', ':'):
add_not_spaces(spacing, 231, "missing whitespace after '%s'")
elif part == ':': # Is a subscript
# TODO
pass
elif part in ('*', '**') and part.parent.type not in _NON_STAR_TYPES \
or prev in ('*', '**') \
and prev.parent.type not in _NON_STAR_TYPES:
# TODO
pass
elif prev in _FACTOR and prev.parent.type == 'factor':
pass
elif prev == '@' and prev.parent.type == 'decorator':
pass # TODO should probably raise an error if there's a space here
elif part in _NEEDS_SPACE or prev in _NEEDS_SPACE:
if part == '=' and part.parent.type in ('argument', 'param') \
or prev == '=' and prev.parent.type in ('argument', 'param'):
if part == '=':
param = part.parent
else:
param = prev.parent
if param.type == 'param' and param.annotation:
add_not_spaces(spacing, 252, 'Expected spaces around annotation equals')
else:
add_if_spaces(spacing, 251, 'Unexpected spaces around keyword / parameter equals')
elif part in _BITWISE_OPERATOR or prev in _BITWISE_OPERATOR:
add_not_spaces(spacing, 227, 'Missing whitespace around bitwise or shift operator')
elif part == '%' or prev == '%':
add_not_spaces(spacing, 228, 'Missing whitespace around modulo operator')
else:
message_225 = 'Missing whitespace between tokens'
add_not_spaces(spacing, 225, message_225)
elif type_ == 'keyword' or prev.type == 'keyword':
add_not_spaces(spacing, 275, 'Missing whitespace around keyword')
else:
prev_spacing = self._previous_spacing
if prev in _ALLOW_SPACE and spaces != prev_spacing.value \
and '\n' not in self._previous_leaf.prefix:
message = "Whitespace before operator doesn't match with whitespace after"
self.add_issue(spacing, 229, message)
if spaces and part not in _ALLOW_SPACE and prev not in _ALLOW_SPACE:
message_225 = 'Missing whitespace between tokens'
#print('xy', spacing)
#self.add_issue(spacing, 225, message_225)
# TODO why only brackets?
if part in _OPENING_BRACKETS:
message = "Whitespace before '%s'" % part.value
add_if_spaces(spacing, 211, message)
def _analyse_non_prefix(self, leaf):
typ = leaf.type
if typ == 'name' and leaf.value in ('l', 'O', 'I'):
if leaf.is_definition():
message = "Do not define %s named 'l', 'O', or 'I' one line"
if leaf.parent.type == 'class' and leaf.parent.name == leaf:
self.add_issue(leaf, 742, message % 'classes')
elif leaf.parent.type == 'function' and leaf.parent.name == leaf:
self.add_issue(leaf, 743, message % 'function')
else:
self.add_issuadd_issue(741, message % 'variables', leaf)
elif leaf.value == ':':
if isinstance(leaf.parent, (Flow, Scope)) and leaf.parent.type != 'lambdef':
next_leaf = leaf.get_next_leaf()
if next_leaf.type != 'newline':
if leaf.parent.type == 'funcdef':
self.add_issue(next_leaf, 704, 'Multiple statements on one line (def)')
else:
self.add_issue(next_leaf, 701, 'Multiple statements on one line (colon)')
elif leaf.value == ';':
if leaf.get_next_leaf().type in ('newline', 'endmarker'):
self.add_issue(leaf, 703, 'Statement ends with a semicolon')
else:
self.add_issue(leaf, 702, 'Multiple statements on one line (semicolon)')
elif leaf.value in ('==', '!='):
comparison = leaf.parent
index = comparison.children.index(leaf)
left = comparison.children[index - 1]
right = comparison.children[index + 1]
for node in left, right:
if node.type == 'keyword' or node.type == 'name':
if node.value == 'None':
message = "comparison to None should be 'if cond is None:'"
self.add_issue(leaf, 711, message)
break
elif node.value in ('True', 'False'):
message = "comparison to False/True should be 'if cond is True:' or 'if cond:'"
self.add_issue(leaf, 712, message)
break
elif leaf.value in ('in', 'is'):
comparison = leaf.parent
if comparison.type == 'comparison' and comparison.parent.type == 'not_test':
if leaf.value == 'in':
self.add_issue(leaf, 713, "test for membership should be 'not in'")
else:
self.add_issue(leaf, 714, "test for object identity should be 'is not'")
elif typ == 'string':
# Checking multiline strings
for i, line in enumerate(leaf.value.splitlines()[1:]):
indentation = re.match(r'[ \t]*', line).group(0)
start_pos = leaf.line + i, len(indentation)
# TODO check multiline indentation.
elif typ == 'endmarker':
if self._newline_count >= 2:
self.add_issue(leaf, 391, 'Blank line at end of file')
def add_issue(self, node, code, message):
if self._previous_leaf is not None:
if search_ancestor(self._previous_leaf, 'error_node') is not None:
return
if self._previous_leaf.type == 'error_leaf':
return
if search_ancestor(node, 'error_node') is not None:
return
if code in (901, 903):
# 901 and 903 are raised by the ErrorFinder.
super(PEP8Normalizer, self).add_issue(node, code, message)
else:
# Skip ErrorFinder here, because it has custom behavior.
super(ErrorFinder, self).add_issue(node, code, message)
class PEP8NormalizerConfig(ErrorFinderConfig):
normalizer_class = PEP8Normalizer
"""
Normalizing to PEP8. Not really implemented, yet.
"""
def __init__(self, indentation=' ' * 4, hanging_indentation=None,
max_characters=79, spaces_before_comment=2):
self.indentation = indentation
if hanging_indentation is None:
hanging_indentation = indentation
self.hanging_indentation = hanging_indentation
self.closing_bracket_hanging_indentation = ''
self.break_after_binary = False
self.max_characters = max_characters
self.spaces_before_comment = spaces_before_comment
# TODO this is not yet ready.
#@PEP8Normalizer.register_rule(type='endmarker')
class BlankLineAtEnd(Rule):
code = 392
message = 'Blank line at end of file'
def is_issue(self, leaf):
return self._newline_count >= 2

View file

@ -0,0 +1,97 @@
import re
from codecs import BOM_UTF8
from parso.python.tokenize import group
unicode_bom = BOM_UTF8.decode('utf-8')
class PrefixPart(object):
def __init__(self, leaf, typ, value, spacing='', start_pos=None):
assert start_pos is not None
self.parent = leaf
self.type = typ
self.value = value
self.spacing = spacing
self.start_pos = start_pos
@property
def end_pos(self):
if self.value.endswith('\n'):
return self.start_pos[0] + 1, 0
if self.value == unicode_bom:
# The bom doesn't have a length at the start of a Python file.
return self.start_pos
return self.start_pos[0], self.start_pos[1] + len(self.value)
def create_spacing_part(self):
column = self.start_pos[1] - len(self.spacing)
return PrefixPart(
self.parent, 'spacing', self.spacing,
start_pos=(self.start_pos[0], column)
)
def __repr__(self):
return '%s(%s, %s, %s)' % (
self.__class__.__name__,
self.type,
repr(self.value),
self.start_pos
)
_comment = r'#[^\n\r\f]*'
_backslash = r'\\\r?\n'
_newline = r'\r?\n'
_form_feed = r'\f'
_only_spacing = '$'
_spacing = r'[ \t]*'
_bom = unicode_bom
_regex = group(
_comment, _backslash, _newline, _form_feed, _only_spacing, _bom,
capture=True
)
_regex = re.compile(group(_spacing, capture=True) + _regex)
_types = {
'#': 'comment',
'\\': 'backslash',
'\f': 'formfeed',
'\n': 'newline',
'\r': 'newline',
unicode_bom: 'bom'
}
def split_prefix(leaf, start_pos):
line, column = start_pos
start = 0
value = spacing = ''
bom = False
while start != len(leaf.prefix):
match =_regex.match(leaf.prefix, start)
spacing = match.group(1)
value = match.group(2)
if not value:
break
type_ = _types[value[0]]
yield PrefixPart(
leaf, type_, value, spacing,
start_pos=(line, column + start - int(bom) + len(spacing))
)
if type_ == 'bom':
bom = True
start = match.end(0)
if value.endswith('\n'):
line += 1
column = -start
if value:
spacing = ''
yield PrefixPart(
leaf, 'spacing', spacing,
start_pos=(line, column + start)
)

View file

@ -0,0 +1,27 @@
from __future__ import absolute_import
class TokenType(object):
def __init__(self, name, contains_syntax=False):
self.name = name
self.contains_syntax = contains_syntax
def __repr__(self):
return '%s(%s)' % (self.__class__.__name__, self.name)
class TokenTypes(object):
"""
Basically an enum, but Python 2 doesn't have enums in the standard library.
"""
def __init__(self, names, contains_syntax):
for name in names:
setattr(self, name, TokenType(name, contains_syntax=name in contains_syntax))
PythonTokenTypes = TokenTypes((
'STRING', 'NUMBER', 'NAME', 'ERRORTOKEN', 'NEWLINE', 'INDENT', 'DEDENT',
'ERROR_DEDENT', 'FSTRING_STRING', 'FSTRING_START', 'FSTRING_END', 'OP',
'ENDMARKER'),
contains_syntax=('NAME', 'OP'),
)

View file

@ -0,0 +1,30 @@
from typing import Container, Iterable
class TokenType:
name: str
contains_syntax: bool
def __init__(self, name: str, contains_syntax: bool) -> None: ...
class TokenTypes:
def __init__(
self, names: Iterable[str], contains_syntax: Container[str]
) -> None: ...
# not an actual class in the source code, but we need this class to type the fields of
# PythonTokenTypes
class _FakePythonTokenTypesClass(TokenTypes):
STRING: TokenType
NUMBER: TokenType
NAME: TokenType
ERRORTOKEN: TokenType
NEWLINE: TokenType
INDENT: TokenType
DEDENT: TokenType
ERROR_DEDENT: TokenType
FSTRING_STRING: TokenType
FSTRING_START: TokenType
FSTRING_END: TokenType
OP: TokenType
ENDMARKER: TokenType
PythonTokenTypes: _FakePythonTokenTypesClass = ...

View file

@ -0,0 +1,722 @@
# -*- coding: utf-8 -*-
"""
This tokenizer has been copied from the ``tokenize.py`` standard library
tokenizer. The reason was simple: The standard library tokenizer fails
if the indentation is not right. To make it possible to do error recovery the
tokenizer needed to be rewritten.
Basically this is a stripped down version of the standard library module, so
you can read the documentation there. Additionally we included some speed and
memory optimizations here.
"""
from __future__ import absolute_import
import sys
import re
from collections import namedtuple
import itertools as _itertools
from codecs import BOM_UTF8
from parso.python.token import PythonTokenTypes
from parso.utils import split_lines
# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
MAX_UNICODE = '\U0010ffff'
STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
OP = PythonTokenTypes.OP
NEWLINE = PythonTokenTypes.NEWLINE
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ENDMARKER = PythonTokenTypes.ENDMARKER
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END
TokenCollection = namedtuple(
'TokenCollection',
'pseudo_token single_quoted triple_quoted endpats whitespace '
'fstring_pattern_map always_break_tokens',
)
BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')
_token_collection_cache = {}
if sys.version_info.major >= 3:
# Python 3 has str.isidentifier() to check if a char is a valid identifier
is_identifier = str.isidentifier
else:
# Python 2 doesn't, but it's not that important anymore and if you tokenize
# Python 2 code with this, it's still ok. It's just that parsing Python 3
# code with this function is not 100% correct.
# This just means that Python 2 code matches a few identifiers too much,
# but that doesn't really matter.
def is_identifier(s):
return True
def group(*choices, **kwargs):
capture = kwargs.pop('capture', False) # Python 2, arrghhhhh :(
assert not kwargs
start = '('
if not capture:
start += '?:'
return start + '|'.join(choices) + ')'
def maybe(*choices):
return group(*choices) + '?'
# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes(version_info, include_fstring=False, only_fstring=False):
def different_case_versions(prefix):
for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
yield ''.join(s)
# The valid string prefixes. Only contain the lower case versions,
# and don't contain any permuations (include 'fr', but not
# 'rf'). The various permutations will be generated.
valid_string_prefixes = ['b', 'r', 'u']
if version_info.major >= 3:
valid_string_prefixes.append('br')
result = set([''])
if version_info >= (3, 6) and include_fstring:
f = ['f', 'fr']
if only_fstring:
valid_string_prefixes = f
result = set()
else:
valid_string_prefixes += f
elif only_fstring:
return set()
# if we add binary f-strings, add: ['fb', 'fbr']
for prefix in valid_string_prefixes:
for t in _itertools.permutations(prefix):
# create a list with upper and lower versions of each
# character
result.update(different_case_versions(t))
if version_info.major == 2:
# In Python 2 the order cannot just be random.
result.update(different_case_versions('ur'))
result.update(different_case_versions('br'))
return result
def _compile(expr):
return re.compile(expr, re.UNICODE)
def _get_token_collection(version_info):
try:
return _token_collection_cache[tuple(version_info)]
except KeyError:
_token_collection_cache[tuple(version_info)] = result = \
_create_token_collection(version_info)
return result
fstring_string_single_line = _compile(r'(?:\{\{|\}\}|\\(?:\r\n?|\n)|[^{}\r\n])+')
fstring_string_multi_line = _compile(r'(?:[^{}]+|\{\{|\}\})+')
fstring_format_spec_single_line = _compile(r'(?:\\(?:\r\n?|\n)|[^{}\r\n])+')
fstring_format_spec_multi_line = _compile(r'[^{}]+')
def _create_token_collection(version_info):
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
whitespace = _compile(Whitespace)
Comment = r'#[^\r\n]*'
# Python 2 is pretty much not working properly anymore, we just ignore
# parsing unicode properly, which is fine, I guess.
if version_info[0] == 2:
Name = r'([A-Za-z_0-9]+)'
elif sys.version_info[0] == 2:
# Unfortunately the regex engine cannot deal with the regex below, so
# just use this one.
Name = r'(\w+)'
else:
Name = u'([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)'
if version_info >= (3, 6):
Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
else:
Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
if version_info.major >= 3:
Octnumber = r'0[oO][0-7]+'
else:
Octnumber = '0[oO]?[0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
if version_info[0] < 3:
Intnumber += '[lL]?'
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
possible_prefixes = _all_string_prefixes(version_info)
StringPrefix = group(*possible_prefixes)
StringPrefixWithF = group(*_all_string_prefixes(version_info, include_fstring=True))
fstring_prefixes = _all_string_prefixes(version_info, include_fstring=True, only_fstring=True)
FStringStart = group(*fstring_prefixes)
# Tail end of ' string.
Single = r"(?:\\.|[^'\\])*'"
# Tail end of " string.
Double = r'(?:\\.|[^"\\])*"'
# Tail end of ''' string.
Single3 = r"(?:\\.|'(?!'')|[^'\\])*'''"
# Tail end of """ string.
Double3 = r'(?:\\.|"(?!"")|[^"\\])*"""'
Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?",
r"//=?", r"->",
r"[+\-*/%&@`|^!=<>]=?",
r"~")
Bracket = '[][(){}]'
special_args = [r'\r\n?', r'\n', r'[;.,@]']
if version_info >= (3, 0):
special_args.insert(0, r'\.\.\.')
if version_info >= (3, 8):
special_args.insert(0, ":=?")
else:
special_args.insert(0, ":")
Special = group(*special_args)
Funny = group(Operator, Bracket, Special)
# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\r\n'\\]*(?:\\.[^\r\n'\\]*)*"
+ group("'", r'\\(?:\r\n?|\n)'),
StringPrefix + r'"[^\r\n"\\]*(?:\\.[^\r\n"\\]*)*'
+ group('"', r'\\(?:\r\n?|\n)'))
pseudo_extra_pool = [Comment, Triple]
all_quotes = '"', "'", '"""', "'''"
if fstring_prefixes:
pseudo_extra_pool.append(FStringStart + group(*all_quotes))
PseudoExtras = group(r'\\(?:\r\n?|\n)|\Z', *pseudo_extra_pool)
PseudoToken = group(Whitespace, capture=True) + \
group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)
# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string. _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in possible_prefixes:
endpats[_prefix + "'"] = _compile(Single)
endpats[_prefix + '"'] = _compile(Double)
endpats[_prefix + "'''"] = _compile(Single3)
endpats[_prefix + '"""'] = _compile(Double3)
# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
fstring_pattern_map = {}
for t in possible_prefixes:
for quote in '"', "'":
single_quoted.add(t + quote)
for quote in '"""', "'''":
triple_quoted.add(t + quote)
for t in fstring_prefixes:
for quote in all_quotes:
fstring_pattern_map[t + quote] = quote
ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
'finally', 'while', 'with', 'return', 'continue',
'break', 'del', 'pass', 'global', 'assert')
if version_info >= (3, 5):
ALWAYS_BREAK_TOKENS += ('nonlocal', )
pseudo_token_compiled = _compile(PseudoToken)
return TokenCollection(
pseudo_token_compiled, single_quoted, triple_quoted, endpats,
whitespace, fstring_pattern_map, set(ALWAYS_BREAK_TOKENS)
)
class Token(namedtuple('Token', ['type', 'string', 'start_pos', 'prefix'])):
@property
def end_pos(self):
lines = split_lines(self.string)
if len(lines) > 1:
return self.start_pos[0] + len(lines) - 1, 0
else:
return self.start_pos[0], self.start_pos[1] + len(self.string)
class PythonToken(Token):
def __repr__(self):
return ('TokenInfo(type=%s, string=%r, start_pos=%r, prefix=%r)' %
self._replace(type=self.type.name))
class FStringNode(object):
def __init__(self, quote):
self.quote = quote
self.parentheses_count = 0
self.previous_lines = ''
self.last_string_start_pos = None
# In the syntax there can be multiple format_spec's nested:
# {x:{y:3}}
self.format_spec_count = 0
def open_parentheses(self, character):
self.parentheses_count += 1
def close_parentheses(self, character):
self.parentheses_count -= 1
if self.parentheses_count == 0:
# No parentheses means that the format spec is also finished.
self.format_spec_count = 0
def allow_multiline(self):
return len(self.quote) == 3
def is_in_expr(self):
return self.parentheses_count > self.format_spec_count
def is_in_format_spec(self):
return not self.is_in_expr() and self.format_spec_count
def _close_fstring_if_necessary(fstring_stack, string, line_nr, column, additional_prefix):
for fstring_stack_index, node in enumerate(fstring_stack):
lstripped_string = string.lstrip()
len_lstrip = len(string) - len(lstripped_string)
if lstripped_string.startswith(node.quote):
token = PythonToken(
FSTRING_END,
node.quote,
(line_nr, column + len_lstrip),
prefix=additional_prefix+string[:len_lstrip],
)
additional_prefix = ''
assert not node.previous_lines
del fstring_stack[fstring_stack_index:]
return token, '', len(node.quote) + len_lstrip
return None, additional_prefix, 0
def _find_fstring_string(endpats, fstring_stack, line, lnum, pos):
tos = fstring_stack[-1]
allow_multiline = tos.allow_multiline()
if tos.is_in_format_spec():
if allow_multiline:
regex = fstring_format_spec_multi_line
else:
regex = fstring_format_spec_single_line
else:
if allow_multiline:
regex = fstring_string_multi_line
else:
regex = fstring_string_single_line
match = regex.match(line, pos)
if match is None:
return tos.previous_lines, pos
if not tos.previous_lines:
tos.last_string_start_pos = (lnum, pos)
string = match.group(0)
for fstring_stack_node in fstring_stack:
end_match = endpats[fstring_stack_node.quote].match(string)
if end_match is not None:
string = end_match.group(0)[:-len(fstring_stack_node.quote)]
new_pos = pos
new_pos += len(string)
# even if allow_multiline is False, we still need to check for trailing
# newlines, because a single-line f-string can contain line continuations
if string.endswith('\n') or string.endswith('\r'):
tos.previous_lines += string
string = ''
else:
string = tos.previous_lines + string
return string, new_pos
def tokenize(code, version_info, start_pos=(1, 0)):
"""Generate tokens from a the source code (string)."""
lines = split_lines(code, keepends=True)
return tokenize_lines(lines, version_info, start_pos=start_pos)
def _print_tokens(func):
"""
A small helper function to help debug the tokenize_lines function.
"""
def wrapper(*args, **kwargs):
for token in func(*args, **kwargs):
print(token) # This print is intentional for debugging!
yield token
return wrapper
# @_print_tokens
def tokenize_lines(lines, version_info, start_pos=(1, 0), indents=None, is_first_token=True):
"""
A heavily modified Python standard library tokenizer.
Additionally to the default information, yields also the prefix of each
token. This idea comes from lib2to3. The prefix contains all information
that is irrelevant for the parser like newlines in parentheses or comments.
"""
def dedent_if_necessary(start):
while start < indents[-1]:
if start > indents[-2]:
yield PythonToken(ERROR_DEDENT, '', (lnum, start), '')
indents[-1] = start
break
indents.pop()
yield PythonToken(DEDENT, '', spos, '')
pseudo_token, single_quoted, triple_quoted, endpats, whitespace, \
fstring_pattern_map, always_break_tokens, = \
_get_token_collection(version_info)
paren_level = 0 # count parentheses
if indents is None:
indents = [0]
max_ = 0
numchars = '0123456789'
contstr = ''
contline = None
# We start with a newline. This makes indent at the first position
# possible. It's not valid Python, but still better than an INDENT in the
# second line (and not in the first). This makes quite a few things in
# Jedi's fast parser possible.
new_line = True
prefix = '' # Should never be required, but here for safety
additional_prefix = ''
lnum = start_pos[0] - 1
fstring_stack = []
for line in lines: # loop over lines in stream
lnum += 1
pos = 0
max_ = len(line)
if is_first_token:
if line.startswith(BOM_UTF8_STRING):
additional_prefix = BOM_UTF8_STRING
line = line[1:]
max_ = len(line)
# Fake that the part before was already parsed.
line = '^' * start_pos[1] + line
pos = start_pos[1]
max_ += start_pos[1]
is_first_token = False
if contstr: # continued string
endmatch = endprog.match(line)
if endmatch:
pos = endmatch.end(0)
yield PythonToken(
STRING, contstr + line[:pos],
contstr_start, prefix)
contstr = ''
contline = None
else:
contstr = contstr + line
contline = contline + line
continue
while pos < max_:
if fstring_stack:
tos = fstring_stack[-1]
if not tos.is_in_expr():
string, pos = _find_fstring_string(endpats, fstring_stack, line, lnum, pos)
if string:
yield PythonToken(
FSTRING_STRING, string,
tos.last_string_start_pos,
# Never has a prefix because it can start anywhere and
# include whitespace.
prefix=''
)
tos.previous_lines = ''
continue
if pos == max_:
break
rest = line[pos:]
fstring_end_token, additional_prefix, quote_length = _close_fstring_if_necessary(
fstring_stack,
rest,
lnum,
pos,
additional_prefix,
)
pos += quote_length
if fstring_end_token is not None:
yield fstring_end_token
continue
# in an f-string, match until the end of the string
if fstring_stack:
string_line = line
for fstring_stack_node in fstring_stack:
quote = fstring_stack_node.quote
end_match = endpats[quote].match(line, pos)
if end_match is not None:
end_match_string = end_match.group(0)
if len(end_match_string) - len(quote) + pos < len(string_line):
string_line = line[:pos] + end_match_string[:-len(quote)]
pseudomatch = pseudo_token.match(string_line, pos)
else:
pseudomatch = pseudo_token.match(line, pos)
if pseudomatch:
prefix = additional_prefix + pseudomatch.group(1)
additional_prefix = ''
start, pos = pseudomatch.span(2)
spos = (lnum, start)
token = pseudomatch.group(2)
if token == '':
assert prefix
additional_prefix = prefix
# This means that we have a line with whitespace/comments at
# the end, which just results in an endmarker.
break
initial = token[0]
else:
match = whitespace.match(line, pos)
initial = line[match.end()]
start = match.end()
spos = (lnum, start)
if new_line and initial not in '\r\n#' and (initial != '\\' or pseudomatch is None):
new_line = False
if paren_level == 0 and not fstring_stack:
indent_start = start
if indent_start > indents[-1]:
yield PythonToken(INDENT, '', spos, '')
indents.append(indent_start)
for t in dedent_if_necessary(indent_start):
yield t
if not pseudomatch: # scan for tokens
match = whitespace.match(line, pos)
if new_line and paren_level == 0 and not fstring_stack:
for t in dedent_if_necessary(match.end()):
yield t
pos = match.end()
new_line = False
yield PythonToken(
ERRORTOKEN, line[pos], (lnum, pos),
additional_prefix + match.group(0)
)
additional_prefix = ''
pos += 1
continue
if (initial in numchars # ordinary number
or (initial == '.' and token != '.' and token != '...')):
yield PythonToken(NUMBER, token, spos, prefix)
elif pseudomatch.group(3) is not None: # ordinary name
if token in always_break_tokens and (fstring_stack or paren_level):
fstring_stack[:] = []
paren_level = 0
# We only want to dedent if the token is on a new line.
m = re.match(r'[ \f\t]*$', line[:start])
if m is not None:
for t in dedent_if_necessary(m.end()):
yield t
if is_identifier(token):
yield PythonToken(NAME, token, spos, prefix)
else:
for t in _split_illegal_unicode_name(token, spos, prefix):
yield t # yield from Python 2
elif initial in '\r\n':
if any(not f.allow_multiline() for f in fstring_stack):
# Would use fstring_stack.clear, but that's not available
# in Python 2.
fstring_stack[:] = []
if not new_line and paren_level == 0 and not fstring_stack:
yield PythonToken(NEWLINE, token, spos, prefix)
else:
additional_prefix = prefix + token
new_line = True
elif initial == '#': # Comments
assert not token.endswith("\n")
if fstring_stack and fstring_stack[-1].is_in_expr():
# `#` is not allowed in f-string expressions
yield PythonToken(ERRORTOKEN, initial, spos, prefix)
pos = start + 1
else:
additional_prefix = prefix + token
elif token in triple_quoted:
endprog = endpats[token]
endmatch = endprog.match(line, pos)
if endmatch: # all on one line
pos = endmatch.end(0)
token = line[start:pos]
yield PythonToken(STRING, token, spos, prefix)
else:
contstr_start = spos # multiple lines
contstr = line[start:]
contline = line
break
# Check up to the first 3 chars of the token to see if
# they're in the single_quoted set. If so, they start
# a string.
# We're using the first 3, because we're looking for
# "rb'" (for example) at the start of the token. If
# we switch to longer prefixes, this needs to be
# adjusted.
# Note that initial == token[:1].
# Also note that single quote checking must come after
# triple quote checking (above).
elif initial in single_quoted or \
token[:2] in single_quoted or \
token[:3] in single_quoted:
if token[-1] in '\r\n': # continued string
# This means that a single quoted string ends with a
# backslash and is continued.
contstr_start = lnum, start
endprog = (endpats.get(initial) or endpats.get(token[1])
or endpats.get(token[2]))
contstr = line[start:]
contline = line
break
else: # ordinary string
yield PythonToken(STRING, token, spos, prefix)
elif token in fstring_pattern_map: # The start of an fstring.
fstring_stack.append(FStringNode(fstring_pattern_map[token]))
yield PythonToken(FSTRING_START, token, spos, prefix)
elif initial == '\\' and line[start:] in ('\\\n', '\\\r\n', '\\\r'): # continued stmt
additional_prefix += prefix + line[start:]
break
else:
if token in '([{':
if fstring_stack:
fstring_stack[-1].open_parentheses(token)
else:
paren_level += 1
elif token in ')]}':
if fstring_stack:
fstring_stack[-1].close_parentheses(token)
else:
if paren_level:
paren_level -= 1
elif token.startswith(':') and fstring_stack \
and fstring_stack[-1].parentheses_count \
- fstring_stack[-1].format_spec_count == 1:
# `:` and `:=` both count
fstring_stack[-1].format_spec_count += 1
token = ':'
pos = start + 1
yield PythonToken(OP, token, spos, prefix)
if contstr:
yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
if contstr.endswith('\n') or contstr.endswith('\r'):
new_line = True
if fstring_stack:
tos = fstring_stack[-1]
if tos.previous_lines:
yield PythonToken(
FSTRING_STRING, tos.previous_lines,
tos.last_string_start_pos,
# Never has a prefix because it can start anywhere and
# include whitespace.
prefix=''
)
end_pos = lnum, max_
# As the last position we just take the maximally possible position. We
# remove -1 for the last new line.
for indent in indents[1:]:
indents.pop()
yield PythonToken(DEDENT, '', end_pos, '')
yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)
def _split_illegal_unicode_name(token, start_pos, prefix):
def create_token():
return PythonToken(ERRORTOKEN if is_illegal else NAME, found, pos, prefix)
found = ''
is_illegal = False
pos = start_pos
for i, char in enumerate(token):
if is_illegal:
if is_identifier(char):
yield create_token()
found = char
is_illegal = False
prefix = ''
pos = start_pos[0], start_pos[1] + i
else:
found += char
else:
new_found = found + char
if is_identifier(new_found):
found = new_found
else:
if found:
yield create_token()
prefix = ''
pos = start_pos[0], start_pos[1] + i
found = char
is_illegal = True
if found:
yield create_token()
if __name__ == "__main__":
if len(sys.argv) >= 2:
path = sys.argv[1]
with open(path) as f:
code = f.read()
else:
code = sys.stdin.read()
from parso.utils import python_bytes_to_unicode, parse_version_string
if isinstance(code, bytes):
code = python_bytes_to_unicode(code)
for token in tokenize(code, parse_version_string()):
print(token)

View file

@ -0,0 +1,24 @@
from typing import Generator, Iterable, NamedTuple, Tuple
from parso.python.token import TokenType
from parso.utils import PythonVersionInfo
class Token(NamedTuple):
type: TokenType
string: str
start_pos: Tuple[int, int]
prefix: str
@property
def end_pos(self) -> Tuple[int, int]: ...
class PythonToken(Token):
def __repr__(self) -> str: ...
def tokenize(
code: str, version_info: PythonVersionInfo, start_pos: Tuple[int, int] = (1, 0)
) -> Generator[PythonToken, None, None]: ...
def tokenize_lines(
lines: Iterable[str],
version_info: PythonVersionInfo,
start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]: ...

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,381 @@
import sys
from abc import abstractmethod, abstractproperty
from parso._compatibility import utf8_repr, encoding
from parso.utils import split_lines
def search_ancestor(node, *node_types):
"""
Recursively looks at the parents of a node and returns the first found node
that matches node_types. Returns ``None`` if no matching node is found.
:param node: The ancestors of this node will be checked.
:param node_types: type names that are searched for.
:type node_types: tuple of str
"""
while True:
node = node.parent
if node is None or node.type in node_types:
return node
class NodeOrLeaf(object):
"""
The base class for nodes and leaves.
"""
__slots__ = ()
type = None
'''
The type is a string that typically matches the types of the grammar file.
'''
def get_root_node(self):
"""
Returns the root node of a parser tree. The returned node doesn't have
a parent node like all the other nodes/leaves.
"""
scope = self
while scope.parent is not None:
scope = scope.parent
return scope
def get_next_sibling(self):
"""
Returns the node immediately following this node in this parent's
children list. If this node does not have a next sibling, it is None
"""
parent = self.parent
if parent is None:
return None
# Can't use index(); we need to test by identity
for i, child in enumerate(parent.children):
if child is self:
try:
return self.parent.children[i + 1]
except IndexError:
return None
def get_previous_sibling(self):
"""
Returns the node immediately preceding this node in this parent's
children list. If this node does not have a previous sibling, it is
None.
"""
parent = self.parent
if parent is None:
return None
# Can't use index(); we need to test by identity
for i, child in enumerate(parent.children):
if child is self:
if i == 0:
return None
return self.parent.children[i - 1]
def get_previous_leaf(self):
"""
Returns the previous leaf in the parser tree.
Returns `None` if this is the first element in the parser tree.
"""
if self.parent is None:
return None
node = self
while True:
c = node.parent.children
i = c.index(node)
if i == 0:
node = node.parent
if node.parent is None:
return None
else:
node = c[i - 1]
break
while True:
try:
node = node.children[-1]
except AttributeError: # A Leaf doesn't have children.
return node
def get_next_leaf(self):
"""
Returns the next leaf in the parser tree.
Returns None if this is the last element in the parser tree.
"""
if self.parent is None:
return None
node = self
while True:
c = node.parent.children
i = c.index(node)
if i == len(c) - 1:
node = node.parent
if node.parent is None:
return None
else:
node = c[i + 1]
break
while True:
try:
node = node.children[0]
except AttributeError: # A Leaf doesn't have children.
return node
@abstractproperty
def start_pos(self):
"""
Returns the starting position of the prefix as a tuple, e.g. `(3, 4)`.
:return tuple of int: (line, column)
"""
@abstractproperty
def end_pos(self):
"""
Returns the end position of the prefix as a tuple, e.g. `(3, 4)`.
:return tuple of int: (line, column)
"""
@abstractmethod
def get_start_pos_of_prefix(self):
"""
Returns the start_pos of the prefix. This means basically it returns
the end_pos of the last prefix. The `get_start_pos_of_prefix()` of the
prefix `+` in `2 + 1` would be `(1, 1)`, while the start_pos is
`(1, 2)`.
:return tuple of int: (line, column)
"""
@abstractmethod
def get_first_leaf(self):
"""
Returns the first leaf of a node or itself if this is a leaf.
"""
@abstractmethod
def get_last_leaf(self):
"""
Returns the last leaf of a node or itself if this is a leaf.
"""
@abstractmethod
def get_code(self, include_prefix=True):
"""
Returns the code that was the input for the parser for this node.
:param include_prefix: Removes the prefix (whitespace and comments) of
e.g. a statement.
"""
class Leaf(NodeOrLeaf):
'''
Leafs are basically tokens with a better API. Leafs exactly know where they
were defined and what text preceeds them.
'''
__slots__ = ('value', 'parent', 'line', 'column', 'prefix')
def __init__(self, value, start_pos, prefix=''):
self.value = value
'''
:py:func:`str` The value of the current token.
'''
self.start_pos = start_pos
self.prefix = prefix
'''
:py:func:`str` Typically a mixture of whitespace and comments. Stuff
that is syntactically irrelevant for the syntax tree.
'''
self.parent = None
'''
The parent :class:`BaseNode` of this leaf.
'''
@property
def start_pos(self):
return self.line, self.column
@start_pos.setter
def start_pos(self, value):
self.line = value[0]
self.column = value[1]
def get_start_pos_of_prefix(self):
previous_leaf = self.get_previous_leaf()
if previous_leaf is None:
lines = split_lines(self.prefix)
# + 1 is needed because split_lines always returns at least [''].
return self.line - len(lines) + 1, 0 # It's the first leaf.
return previous_leaf.end_pos
def get_first_leaf(self):
return self
def get_last_leaf(self):
return self
def get_code(self, include_prefix=True):
if include_prefix:
return self.prefix + self.value
else:
return self.value
@property
def end_pos(self):
lines = split_lines(self.value)
end_pos_line = self.line + len(lines) - 1
# Check for multiline token
if self.line == end_pos_line:
end_pos_column = self.column + len(lines[-1])
else:
end_pos_column = len(lines[-1])
return end_pos_line, end_pos_column
@utf8_repr
def __repr__(self):
value = self.value
if not value:
value = self.type
return "<%s: %s>" % (type(self).__name__, value)
class TypedLeaf(Leaf):
__slots__ = ('type',)
def __init__(self, type, value, start_pos, prefix=''):
super(TypedLeaf, self).__init__(value, start_pos, prefix)
self.type = type
class BaseNode(NodeOrLeaf):
"""
The super class for all nodes.
A node has children, a type and possibly a parent node.
"""
__slots__ = ('children', 'parent')
type = None
def __init__(self, children):
self.children = children
"""
A list of :class:`NodeOrLeaf` child nodes.
"""
self.parent = None
'''
The parent :class:`BaseNode` of this leaf.
None if this is the root node.
'''
@property
def start_pos(self):
return self.children[0].start_pos
def get_start_pos_of_prefix(self):
return self.children[0].get_start_pos_of_prefix()
@property
def end_pos(self):
return self.children[-1].end_pos
def _get_code_for_children(self, children, include_prefix):
if include_prefix:
return "".join(c.get_code() for c in children)
else:
first = children[0].get_code(include_prefix=False)
return first + "".join(c.get_code() for c in children[1:])
def get_code(self, include_prefix=True):
return self._get_code_for_children(self.children, include_prefix)
def get_leaf_for_position(self, position, include_prefixes=False):
"""
Get the :py:class:`parso.tree.Leaf` at ``position``
:param tuple position: A position tuple, row, column. Rows start from 1
:param bool include_prefixes: If ``False``, ``None`` will be returned if ``position`` falls
on whitespace or comments before a leaf
:return: :py:class:`parso.tree.Leaf` at ``position``, or ``None``
"""
def binary_search(lower, upper):
if lower == upper:
element = self.children[lower]
if not include_prefixes and position < element.start_pos:
# We're on a prefix.
return None
# In case we have prefixes, a leaf always matches
try:
return element.get_leaf_for_position(position, include_prefixes)
except AttributeError:
return element
index = int((lower + upper) / 2)
element = self.children[index]
if position <= element.end_pos:
return binary_search(lower, index)
else:
return binary_search(index + 1, upper)
if not ((1, 0) <= position <= self.children[-1].end_pos):
raise ValueError('Please provide a position that exists within this node.')
return binary_search(0, len(self.children) - 1)
def get_first_leaf(self):
return self.children[0].get_first_leaf()
def get_last_leaf(self):
return self.children[-1].get_last_leaf()
@utf8_repr
def __repr__(self):
code = self.get_code().replace('\n', ' ').replace('\r', ' ').strip()
if not sys.version_info.major >= 3:
code = code.encode(encoding, 'replace')
return "<%s: %s@%s,%s>" % \
(type(self).__name__, code, self.start_pos[0], self.start_pos[1])
class Node(BaseNode):
"""Concrete implementation for interior nodes."""
__slots__ = ('type',)
def __init__(self, type, children):
super(Node, self).__init__(children)
self.type = type
def __repr__(self):
return "%s(%s, %r)" % (self.__class__.__name__, self.type, self.children)
class ErrorNode(BaseNode):
"""
A node that contains valid nodes/leaves that we're follow by a token that
was invalid. This basically means that the leaf after this node is where
Python would mark a syntax error.
"""
__slots__ = ()
type = 'error_node'
class ErrorLeaf(Leaf):
"""
A leaf that is either completely invalid in a language (like `$` in Python)
or is invalid at that position. Like the star in `1 +* 1`.
"""
__slots__ = ('token_type',)
type = 'error_leaf'
def __init__(self, token_type, value, start_pos, prefix=''):
super(ErrorLeaf, self).__init__(value, start_pos, prefix)
self.token_type = token_type
def __repr__(self):
return "<%s: %s:%s, %s>" % \
(type(self).__name__, self.token_type, repr(self.value), self.start_pos)

View file

@ -0,0 +1,185 @@
from collections import namedtuple
import re
import sys
from ast import literal_eval
from functools import total_ordering
from parso._compatibility import unicode
# The following is a list in Python that are line breaks in str.splitlines, but
# not in Python. In Python only \r (Carriage Return, 0xD) and \n (Line Feed,
# 0xA) are allowed to split lines.
_NON_LINE_BREAKS = (
u'\v', # Vertical Tabulation 0xB
u'\f', # Form Feed 0xC
u'\x1C', # File Separator
u'\x1D', # Group Separator
u'\x1E', # Record Separator
u'\x85', # Next Line (NEL - Equivalent to CR+LF.
# Used to mark end-of-line on some IBM mainframes.)
u'\u2028', # Line Separator
u'\u2029', # Paragraph Separator
)
Version = namedtuple('Version', 'major, minor, micro')
def split_lines(string, keepends=False):
r"""
Intended for Python code. In contrast to Python's :py:meth:`str.splitlines`,
looks at form feeds and other special characters as normal text. Just
splits ``\n`` and ``\r\n``.
Also different: Returns ``[""]`` for an empty string input.
In Python 2.7 form feeds are used as normal characters when using
str.splitlines. However in Python 3 somewhere there was a decision to split
also on form feeds.
"""
if keepends:
lst = string.splitlines(True)
# We have to merge lines that were broken by form feed characters.
merge = []
for i, line in enumerate(lst):
try:
last_chr = line[-1]
except IndexError:
pass
else:
if last_chr in _NON_LINE_BREAKS:
merge.append(i)
for index in reversed(merge):
try:
lst[index] = lst[index] + lst[index + 1]
del lst[index + 1]
except IndexError:
# index + 1 can be empty and therefore there's no need to
# merge.
pass
# The stdlib's implementation of the end is inconsistent when calling
# it with/without keepends. One time there's an empty string in the
# end, one time there's none.
if string.endswith('\n') or string.endswith('\r') or string == '':
lst.append('')
return lst
else:
return re.split(r'\n|\r\n|\r', string)
def python_bytes_to_unicode(source, encoding='utf-8', errors='strict'):
"""
Checks for unicode BOMs and PEP 263 encoding declarations. Then returns a
unicode object like in :py:meth:`bytes.decode`.
:param encoding: See :py:meth:`bytes.decode` documentation.
:param errors: See :py:meth:`bytes.decode` documentation. ``errors`` can be
``'strict'``, ``'replace'`` or ``'ignore'``.
"""
def detect_encoding():
"""
For the implementation of encoding definitions in Python, look at:
- http://www.python.org/dev/peps/pep-0263/
- http://docs.python.org/2/reference/lexical_analysis.html#encoding-declarations
"""
byte_mark = literal_eval(r"b'\xef\xbb\xbf'")
if source.startswith(byte_mark):
# UTF-8 byte-order mark
return 'utf-8'
first_two_lines = re.match(br'(?:[^\n]*\n){0,2}', source).group(0)
possible_encoding = re.search(br"coding[=:]\s*([-\w.]+)",
first_two_lines)
if possible_encoding:
return possible_encoding.group(1)
else:
# the default if nothing else has been set -> PEP 263
return encoding
if isinstance(source, unicode):
# only cast str/bytes
return source
encoding = detect_encoding()
if not isinstance(encoding, unicode):
encoding = unicode(encoding, 'utf-8', 'replace')
try:
# Cast to unicode
return unicode(source, encoding, errors)
except LookupError:
if errors == 'replace':
# This is a weird case that can happen if the given encoding is not
# a valid encoding. This usually shouldn't happen with provided
# encodings, but can happen if somebody uses encoding declarations
# like `# coding: foo-8`.
return unicode(source, 'utf-8', errors)
raise
def version_info():
"""
Returns a namedtuple of parso's version, similar to Python's
``sys.version_info``.
"""
from parso import __version__
tupl = re.findall(r'[a-z]+|\d+', __version__)
return Version(*[x if i == 3 else int(x) for i, x in enumerate(tupl)])
def _parse_version(version):
match = re.match(r'(\d+)(?:\.(\d{1,2})(?:\.\d+)?)?((a|b|rc)\d)?$', version)
if match is None:
raise ValueError('The given version is not in the right format. '
'Use something like "3.8" or "3".')
major = int(match.group(1))
minor = match.group(2)
if minor is None:
# Use the latest Python in case it's not exactly defined, because the
# grammars are typically backwards compatible?
if major == 2:
minor = "7"
elif major == 3:
minor = "6"
else:
raise NotImplementedError("Sorry, no support yet for those fancy new/old versions.")
minor = int(minor)
return PythonVersionInfo(major, minor)
@total_ordering
class PythonVersionInfo(namedtuple('Version', 'major, minor')):
def __gt__(self, other):
if isinstance(other, tuple):
if len(other) != 2:
raise ValueError("Can only compare to tuples of length 2.")
return (self.major, self.minor) > other
super(PythonVersionInfo, self).__gt__(other)
return (self.major, self.minor)
def __eq__(self, other):
if isinstance(other, tuple):
if len(other) != 2:
raise ValueError("Can only compare to tuples of length 2.")
return (self.major, self.minor) == other
super(PythonVersionInfo, self).__eq__(other)
def __ne__(self, other):
return not self.__eq__(other)
def parse_version_string(version=None):
"""
Checks for a valid version number (e.g. `3.8` or `2.7.1` or `3`) and
returns a corresponding version info that is always two characters long in
decimal.
"""
if version is None:
version = '%s.%s' % sys.version_info[:2]
if not isinstance(version, (unicode, str)):
raise TypeError('version must be a string like "3.8"')
return _parse_version(version)

View file

@ -0,0 +1,29 @@
from typing import NamedTuple, Optional, Sequence, Union
class Version(NamedTuple):
major: int
minor: int
micro: int
def split_lines(string: str, keepends: bool = ...) -> Sequence[str]: ...
def python_bytes_to_unicode(
source: Union[str, bytes], encoding: str = ..., errors: str = ...
) -> str: ...
def version_info() -> Version:
"""
Returns a namedtuple of parso's version, similar to Python's
``sys.version_info``.
"""
...
class PythonVersionInfo(NamedTuple):
major: int
minor: int
def parse_version_string(version: Optional[str]) -> PythonVersionInfo:
"""
Checks for a valid version number (e.g. `3.2` or `2.7.1` or `3`) and
returns a corresponding version info that is always two characters long in
decimal.
"""
...