217 lines
8.4 KiB
Python
217 lines
8.4 KiB
Python
from parso.python import tree
|
|
from parso.python.token import PythonTokenTypes
|
|
from parso.parser import BaseParser
|
|
|
|
|
|
# Short module-level aliases for the token types this module refers to by name.
NAME, INDENT, DEDENT = (
    PythonTokenTypes.NAME,
    PythonTokenTypes.INDENT,
    PythonTokenTypes.DEDENT,
)
|
|
|
|
|
|
class Parser(BaseParser):
    """
    This class is used to parse a Python file; it then divides it into a
    class structure of different scopes.

    :param pgen_grammar: The grammar object of pgen2. Loaded by load_grammar.
    """

    # Maps grammar nonterminals to the tree node class instantiated for them.
    # Nonterminals that are not listed fall back to ``default_node``.
    node_map = {
        'expr_stmt': tree.ExprStmt,
        'classdef': tree.Class,
        'funcdef': tree.Function,
        'file_input': tree.Module,
        'import_name': tree.ImportName,
        'import_from': tree.ImportFrom,
        'break_stmt': tree.KeywordStatement,
        'continue_stmt': tree.KeywordStatement,
        'return_stmt': tree.ReturnStmt,
        'raise_stmt': tree.KeywordStatement,
        'yield_expr': tree.YieldExpr,
        'del_stmt': tree.KeywordStatement,
        'pass_stmt': tree.KeywordStatement,
        'global_stmt': tree.GlobalStmt,
        'nonlocal_stmt': tree.KeywordStatement,
        'print_stmt': tree.KeywordStatement,
        'assert_stmt': tree.AssertStmt,
        'if_stmt': tree.IfStmt,
        'with_stmt': tree.WithStmt,
        'for_stmt': tree.ForStmt,
        'while_stmt': tree.WhileStmt,
        'try_stmt': tree.TryStmt,
        'sync_comp_for': tree.SyncCompFor,
        # Not sure if this is the best idea, but IMO it's the easiest way to
        # avoid extreme amounts of work around the subtle difference of 2/3
        # grammar in list comprehensions.
        'list_for': tree.SyncCompFor,
        'decorator': tree.Decorator,
        'lambdef': tree.Lambda,
        'old_lambdef': tree.Lambda,
        'lambdef_nocond': tree.Lambda,
    }
    default_node = tree.PythonNode

    # Maps token types to leaf classes. Names/Keywords are handled separately
    # in ``convert_leaf``; every type not listed here becomes an Operator.
    _leaf_map = {
        PythonTokenTypes.STRING: tree.String,
        PythonTokenTypes.NUMBER: tree.Number,
        PythonTokenTypes.NEWLINE: tree.Newline,
        PythonTokenTypes.ENDMARKER: tree.EndMarker,
        PythonTokenTypes.FSTRING_STRING: tree.FStringString,
        PythonTokenTypes.FSTRING_START: tree.FStringStart,
        PythonTokenTypes.FSTRING_END: tree.FStringEnd,
    }

    def __init__(self, pgen_grammar, error_recovery=True, start_nonterminal='file_input'):
        """
        :param pgen_grammar: The grammar object, forwarded to ``BaseParser``.
        :param error_recovery: Whether syntax errors should be recovered from.
            ``parse`` only supports this for the ``'file_input'`` start
            nonterminal.
        :param start_nonterminal: The grammar rule parsing starts from.
        """
        super(Parser, self).__init__(pgen_grammar, start_nonterminal,
                                     error_recovery=error_recovery)

        self.syntax_errors = []
        # Indentation levels (values of ``_indent_counter``) whose INDENT was
        # turned into an error leaf; the matching DEDENT has to be dropped.
        self._omit_dedent_list = []
        # Number of currently open INDENT tokens seen by _recovery_tokenize.
        self._indent_counter = 0

    def parse(self, tokens):
        """
        Parse ``tokens`` and return the result of ``BaseParser.parse``.

        With error recovery enabled, the token stream is first wrapped by
        ``_recovery_tokenize`` so that DEDENT tokens belonging to discarded
        INDENT tokens are filtered out.

        :raises NotImplementedError: If error recovery is enabled and the
            start nonterminal is not ``'file_input'``.
        """
        if self._error_recovery:
            if self._start_nonterminal != 'file_input':
                raise NotImplementedError

            tokens = self._recovery_tokenize(tokens)

        return super(Parser, self).parse(tokens)

    def convert_node(self, nonterminal, children):
        """
        Convert raw node information to a PythonBaseNode instance.

        This is passed to the parser driver which calls it whenever a reduction of a
        grammar rule produces a new complete node, so that the tree is built
        strictly bottom-up.
        """
        # Only the lookup is guarded: a KeyError raised inside a node
        # constructor must propagate instead of being mistaken for an
        # unmapped nonterminal.
        try:
            node_class = self.node_map[nonterminal]
        except KeyError:
            if nonterminal == 'suite':
                # We don't want the INDENT/DEDENT in our parser tree. Those
                # leaves are only noise: they are virtual leaves and not real
                # ones and therefore have pseudo start/end positions and no
                # prefixes. Just ignore them.
                children = [children[0]] + children[2:-1]
            elif nonterminal == 'list_if':
                # Make transitioning from 2 to 3 easier.
                nonterminal = 'comp_if'
            elif nonterminal == 'listmaker':
                # Same as list_if above.
                nonterminal = 'testlist_comp'
            node = self.default_node(nonterminal, children)
        else:
            node = node_class(children)

        for child in children:
            child.parent = node
        return node

    def convert_leaf(self, type, value, prefix, start_pos):
        """
        Create the tree leaf for a single token.

        NAME tokens become a ``Keyword`` if their value is one of the
        grammar's reserved syntax strings, otherwise a ``Name``. All other
        token types are looked up in ``_leaf_map`` and default to
        ``Operator``.
        """
        if type == NAME:
            if value in self._pgen_grammar.reserved_syntax_strings:
                return tree.Keyword(value, start_pos, prefix)
            else:
                return tree.Name(value, start_pos, prefix)

        return self._leaf_map.get(type, tree.Operator)(value, start_pos, prefix)

    def error_recovery(self, token):
        """
        Handle a ``token`` that the current parser state cannot accept.

        A missing trailing newline at the end of the input (or before a
        DEDENT) is always tolerated. Apart from that, if error recovery is
        disabled the call is delegated to ``BaseParser.error_recovery``.
        Otherwise the stack is unwound to the closest suite/file_input and
        the discarded nodes (or the offending token itself) are stored in
        the tree as error nodes/leaves.
        """
        tos_nodes = self.stack[-1].nodes
        if tos_nodes:
            last_leaf = tos_nodes[-1].get_last_leaf()
        else:
            last_leaf = None

        # `last_leaf` is None when the top stack node has no children yet. In
        # that case there is no statement that could be missing its newline,
        # so the DEDENT case must not be entered (and must not dereference
        # None).
        missing_newline_before_dedent = (
            token.type == DEDENT
            and last_leaf is not None
            and not last_leaf.value.endswith(('\n', '\r'))
        )
        if self._start_nonterminal == 'file_input' and \
                (token.type == PythonTokenTypes.ENDMARKER
                 or missing_newline_before_dedent):
            # In Python statements need to end with a newline. But since it's
            # possible (and valid in Python) that there's no newline at the
            # end of a file, we have to recover even if the user doesn't want
            # error recovery.
            if self.stack[-1].dfa.from_rule == 'simple_stmt':
                try:
                    plan = self.stack[-1].dfa.transitions[PythonTokenTypes.NEWLINE]
                except KeyError:
                    pass
                else:
                    if plan.next_dfa.is_final and not plan.dfa_pushes:
                        # We are ignoring here that the newline would be
                        # required for a simple_stmt.
                        self.stack[-1].dfa = plan.next_dfa
                        self._add_token(token)
                        return

        if not self._error_recovery:
            return super(Parser, self).error_recovery(token)

        def current_suite(stack):
            # For now just discard everything that is not a suite or
            # file_input, if we detect an error.
            for until_index, stack_node in reversed(list(enumerate(stack))):
                # `suite` can sometimes be only simple_stmt, not stmt.
                if stack_node.nonterminal == 'file_input':
                    break
                elif stack_node.nonterminal == 'suite':
                    # In the case where we just have a newline we don't want to
                    # do error recovery here. In all other cases, we want to do
                    # error recovery.
                    if len(stack_node.nodes) != 1:
                        break
            return until_index

        until_index = current_suite(self.stack)

        if self._stack_removal(until_index + 1):
            # Something was discarded, so the token gets another chance in
            # the unwound parser state.
            self._add_token(token)
        else:
            typ, value, start_pos, prefix = token
            if typ == INDENT:
                # For every deleted INDENT we have to delete a DEDENT as well.
                # Otherwise the parser will get into trouble and DEDENT too early.
                self._omit_dedent_list.append(self._indent_counter)

            error_leaf = tree.PythonErrorLeaf(typ.name, value, start_pos, prefix)
            self.stack[-1].nodes.append(error_leaf)

        tos = self.stack[-1]
        if tos.nonterminal == 'suite':
            # Need at least one statement in the suite. This happened with the
            # error recovery above.
            try:
                tos.dfa = tos.dfa.arcs['stmt']
            except KeyError:
                # We're already in a final state.
                pass

    def _stack_removal(self, start_index):
        """
        Drop all stack nodes from ``start_index`` onwards.

        The tree nodes collected by the dropped stack nodes are wrapped in a
        single ``PythonErrorNode`` that is appended to the stack node right
        below ``start_index``.

        :return: ``True`` if any tree nodes were moved into an error node,
            ``False`` if the dropped stack nodes were all empty.
        """
        all_nodes = [node for stack_node in self.stack[start_index:] for node in stack_node.nodes]

        if all_nodes:
            node = tree.PythonErrorNode(all_nodes)
            for n in all_nodes:
                n.parent = node
            self.stack[start_index - 1].nodes.append(node)

        self.stack[start_index:] = []
        return bool(all_nodes)

    def _recovery_tokenize(self, tokens):
        """
        Yield ``tokens`` while tracking the indentation depth.

        A DEDENT is swallowed when the current depth matches the last entry
        of ``_omit_dedent_list``, i.e. when its INDENT was previously turned
        into an error leaf by ``error_recovery``.
        """
        for token in tokens:
            typ = token[0]
            if typ == DEDENT:
                # We need to count indents, because if we just omit any DEDENT,
                # we might omit them in the wrong place.
                o = self._omit_dedent_list
                if o and o[-1] == self._indent_counter:
                    o.pop()
                    self._indent_counter -= 1
                    continue

                self._indent_counter -= 1
            elif typ == INDENT:
                self._indent_counter += 1
            yield token
|