Added delete option to database storage.

Batuhan Berk Başoğlu 2020-10-12 12:10:01 -04:00
parent 308604a33c
commit 963b5bc68b
1868 changed files with 192402 additions and 13278 deletions


@@ -0,0 +1,90 @@
#-----------------------------------------------------------------
# pycparser: __init__.py
#
# This package file exports some convenience functions for
# interacting with pycparser
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#-----------------------------------------------------------------
__all__ = ['c_lexer', 'c_parser', 'c_ast']
__version__ = '2.20'
import io
from subprocess import check_output
from .c_parser import CParser
def preprocess_file(filename, cpp_path='cpp', cpp_args=''):
""" Preprocess a file using cpp.
filename:
Name of the file you want to preprocess.
cpp_path:
cpp_args:
Refer to the documentation of parse_file for the meaning of these
arguments.
When successful, returns the preprocessed file's contents.
Errors from cpp will be printed out.
"""
path_list = [cpp_path]
if isinstance(cpp_args, list):
path_list += cpp_args
elif cpp_args != '':
path_list += [cpp_args]
path_list += [filename]
try:
# Note the use of universal_newlines to treat all newlines
# as \n for Python's purpose
text = check_output(path_list, universal_newlines=True)
except OSError as e:
raise RuntimeError("Unable to invoke 'cpp'. " +
'Make sure its path was passed correctly\n' +
('Original error: %s' % e))
return text
def parse_file(filename, use_cpp=False, cpp_path='cpp', cpp_args='',
parser=None):
""" Parse a C file using pycparser.
filename:
Name of the file you want to parse.
use_cpp:
Set to True if you want to execute the C pre-processor
on the file prior to parsing it.
cpp_path:
If use_cpp is True, this is the path to 'cpp' on your
system. If no path is provided, it attempts to just
execute 'cpp', so it must be in your PATH.
cpp_args:
If use_cpp is True, set this to the command line arguments strings
to cpp. Be careful with quotes - it's best to pass a raw string
(r'') here. For example:
r'-I../utils/fake_libc_include'
If several arguments are required, pass a list of strings.
parser:
Optional parser object to be used instead of the default CParser
When successful, an AST is returned. ParseError can be
thrown if the file doesn't parse successfully.
Errors from cpp will be printed out.
"""
if use_cpp:
text = preprocess_file(filename, cpp_path, cpp_args)
else:
with io.open(filename) as f:
text = f.read()
if parser is None:
parser = CParser()
return parser.parse(text, filename)
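For context, a minimal usage sketch of the two helpers above (not part of the committed file; the source path 'year.c' is hypothetical, and the include directory is the one suggested in the docstring):

from pycparser import parse_file
# Run cpp first so #include directives and macros are resolved before parsing.
ast = parse_file('year.c', use_cpp=True,
                 cpp_args=r'-I../utils/fake_libc_include')
ast.show()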


@@ -0,0 +1,338 @@
#-----------------------------------------------------------------
# _ast_gen.py
#
# Generates the AST Node classes from a specification given in
# a configuration file
#
# The design of this module was inspired by astgen.py from the
# Python 2.5 code-base.
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#-----------------------------------------------------------------
import pprint
from string import Template
class ASTCodeGenerator(object):
def __init__(self, cfg_filename='_c_ast.cfg'):
""" Initialize the code generator from a configuration
file.
"""
self.cfg_filename = cfg_filename
self.node_cfg = [NodeCfg(name, contents)
for (name, contents) in self.parse_cfgfile(cfg_filename)]
def generate(self, file=None):
""" Generates the code into file, an open file buffer.
"""
src = Template(_PROLOGUE_COMMENT).substitute(
cfg_filename=self.cfg_filename)
src += _PROLOGUE_CODE
for node_cfg in self.node_cfg:
src += node_cfg.generate_source() + '\n\n'
file.write(src)
def parse_cfgfile(self, filename):
""" Parse the configuration file and yield pairs of
(name, contents) for each node.
"""
with open(filename, "r") as f:
for line in f:
line = line.strip()
if not line or line.startswith('#'):
continue
colon_i = line.find(':')
lbracket_i = line.find('[')
rbracket_i = line.find(']')
if colon_i < 1 or lbracket_i <= colon_i or rbracket_i <= lbracket_i:
raise RuntimeError("Invalid line in %s:\n%s\n" % (filename, line))
name = line[:colon_i]
val = line[lbracket_i + 1:rbracket_i]
vallist = [v.strip() for v in val.split(',')] if val else []
yield name, vallist
class NodeCfg(object):
""" Node configuration.
name: node name
contents: a list of contents - attributes and child nodes
See comment at the top of the configuration file for details.
"""
def __init__(self, name, contents):
self.name = name
self.all_entries = []
self.attr = []
self.child = []
self.seq_child = []
for entry in contents:
clean_entry = entry.rstrip('*')
self.all_entries.append(clean_entry)
if entry.endswith('**'):
self.seq_child.append(clean_entry)
elif entry.endswith('*'):
self.child.append(clean_entry)
else:
self.attr.append(entry)
def generate_source(self):
src = self._gen_init()
src += '\n' + self._gen_children()
src += '\n' + self._gen_iter()
src += '\n' + self._gen_attr_names()
return src
def _gen_init(self):
src = "class %s(Node):\n" % self.name
if self.all_entries:
args = ', '.join(self.all_entries)
slots = ', '.join("'{0}'".format(e) for e in self.all_entries)
slots += ", 'coord', '__weakref__'"
arglist = '(self, %s, coord=None)' % args
else:
slots = "'coord', '__weakref__'"
arglist = '(self, coord=None)'
src += " __slots__ = (%s)\n" % slots
src += " def __init__%s:\n" % arglist
for name in self.all_entries + ['coord']:
src += " self.%s = %s\n" % (name, name)
return src
def _gen_children(self):
src = ' def children(self):\n'
if self.all_entries:
src += ' nodelist = []\n'
for child in self.child:
src += (
' if self.%(child)s is not None:' +
' nodelist.append(("%(child)s", self.%(child)s))\n') % (
dict(child=child))
for seq_child in self.seq_child:
src += (
' for i, child in enumerate(self.%(child)s or []):\n'
' nodelist.append(("%(child)s[%%d]" %% i, child))\n') % (
dict(child=seq_child))
src += ' return tuple(nodelist)\n'
else:
src += ' return ()\n'
return src
def _gen_iter(self):
src = ' def __iter__(self):\n'
if self.all_entries:
for child in self.child:
src += (
' if self.%(child)s is not None:\n' +
' yield self.%(child)s\n') % (dict(child=child))
for seq_child in self.seq_child:
src += (
' for child in (self.%(child)s or []):\n'
' yield child\n') % (dict(child=seq_child))
if not (self.child or self.seq_child):
# Empty generator
src += (
' return\n' +
' yield\n')
else:
# Empty generator
src += (
' return\n' +
' yield\n')
return src
def _gen_attr_names(self):
src = " attr_names = (" + ''.join("%r, " % nm for nm in self.attr) + ')'
return src
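To illustrate what NodeCfg.generate_source emits, a config entry such as `BinaryOp: [op, left*, right*]` (see _c_ast.cfg below) expands to roughly the following class. This is a hand-written sketch of the generated output, lightly reformatted:

class BinaryOp(Node):
    __slots__ = ('op', 'left', 'right', 'coord', '__weakref__')
    def __init__(self, op, left, right, coord=None):
        self.op = op
        self.left = left
        self.right = right
        self.coord = coord
    def children(self):
        nodelist = []
        if self.left is not None: nodelist.append(("left", self.left))
        if self.right is not None: nodelist.append(("right", self.right))
        return tuple(nodelist)
    def __iter__(self):
        if self.left is not None:
            yield self.left
        if self.right is not None:
            yield self.right
    attr_names = ('op', )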
_PROLOGUE_COMMENT = \
r'''#-----------------------------------------------------------------
# ** ATTENTION **
# This code was automatically generated from the file:
# $cfg_filename
#
# Do not modify it directly. Modify the configuration file and
# run the generator again.
# ** ** *** ** **
#
# pycparser: c_ast.py
#
# AST Node classes.
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#-----------------------------------------------------------------
'''
_PROLOGUE_CODE = r'''
import sys
def _repr(obj):
"""
Get the representation of an object, with dedicated pprint-like format for lists.
"""
if isinstance(obj, list):
return '[' + (',\n '.join((_repr(e).replace('\n', '\n ') for e in obj))) + '\n]'
else:
return repr(obj)
class Node(object):
__slots__ = ()
""" Abstract base class for AST nodes.
"""
def __repr__(self):
""" Generates a python representation of the current node
"""
result = self.__class__.__name__ + '('
indent = ''
separator = ''
for name in self.__slots__[:-2]:
result += separator
result += indent
result += name + '=' + (_repr(getattr(self, name)).replace('\n', '\n ' + (' ' * (len(name) + len(self.__class__.__name__)))))
separator = ','
indent = '\n ' + (' ' * len(self.__class__.__name__))
result += indent + ')'
return result
def children(self):
""" A sequence of all children that are Nodes
"""
pass
def show(self, buf=sys.stdout, offset=0, attrnames=False, nodenames=False, showcoord=False, _my_node_name=None):
""" Pretty print the Node and all its attributes and
children (recursively) to a buffer.
buf:
Open IO buffer into which the Node is printed.
offset:
Initial offset (amount of leading spaces)
attrnames:
True if you want to see the attribute names in
name=value pairs. False to only see the values.
nodenames:
True if you want to see the actual node names
within their parents.
showcoord:
Do you want the coordinates of each Node to be
displayed.
"""
lead = ' ' * offset
if nodenames and _my_node_name is not None:
buf.write(lead + self.__class__.__name__+ ' <' + _my_node_name + '>: ')
else:
buf.write(lead + self.__class__.__name__+ ': ')
if self.attr_names:
if attrnames:
nvlist = [(n, getattr(self,n)) for n in self.attr_names]
attrstr = ', '.join('%s=%s' % nv for nv in nvlist)
else:
vlist = [getattr(self, n) for n in self.attr_names]
attrstr = ', '.join('%s' % v for v in vlist)
buf.write(attrstr)
if showcoord:
buf.write(' (at %s)' % self.coord)
buf.write('\n')
for (child_name, child) in self.children():
child.show(
buf,
offset=offset + 2,
attrnames=attrnames,
nodenames=nodenames,
showcoord=showcoord,
_my_node_name=child_name)
class NodeVisitor(object):
""" A base NodeVisitor class for visiting c_ast nodes.
Subclass it and define your own visit_XXX methods, where
XXX is the class name you want to visit with these
methods.
For example:
class ConstantVisitor(NodeVisitor):
def __init__(self):
self.values = []
def visit_Constant(self, node):
self.values.append(node.value)
Creates a list of values of all the constant nodes
encountered below the given node. To use it:
cv = ConstantVisitor()
cv.visit(node)
Notes:
* generic_visit() will be called for AST nodes for which
no visit_XXX method was defined.
* The children of nodes for which a visit_XXX was
defined will not be visited - if you need this, call
generic_visit() on the node.
You can use:
NodeVisitor.generic_visit(self, node)
* Modeled after Python's own AST visiting facilities
(the ast module of Python 3.0)
"""
_method_cache = None
def visit(self, node):
""" Visit a node.
"""
if self._method_cache is None:
self._method_cache = {}
visitor = self._method_cache.get(node.__class__.__name__, None)
if visitor is None:
method = 'visit_' + node.__class__.__name__
visitor = getattr(self, method, self.generic_visit)
self._method_cache[node.__class__.__name__] = visitor
return visitor(node)
def generic_visit(self, node):
""" Called if no explicit visitor function exists for a
node. Implements preorder visiting of the node.
"""
for c in node:
self.visit(c)
'''
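A runnable variant of the ConstantVisitor example from the NodeVisitor docstring above, assuming the generated c_ast module and the CParser added in this commit:

from pycparser import c_parser, c_ast
class ConstantVisitor(c_ast.NodeVisitor):
    def __init__(self):
        self.values = []
    def visit_Constant(self, node):
        self.values.append(node.value)
ast = c_parser.CParser().parse('int x = 2; int y = 3;')
cv = ConstantVisitor()
cv.visit(ast)
print(cv.values)  # ['2', '3']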


@@ -0,0 +1,37 @@
#-----------------------------------------------------------------
# pycparser: _build_tables.py
#
# A dummy for generating the lexing/parsing tables and
# compiling them into .pyc for faster execution in optimized mode.
# Also generates AST code from the configuration file.
# Should be called from the pycparser directory.
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#-----------------------------------------------------------------
# Insert '.' and '..' as first entries to the search path for modules.
# Restricted environments like embeddable python do not include the
# current working directory on startup.
import sys
sys.path[0:0] = ['.', '..']
# Generate c_ast.py
from _ast_gen import ASTCodeGenerator
ast_gen = ASTCodeGenerator('_c_ast.cfg')
ast_gen.generate(open('c_ast.py', 'w'))
from pycparser import c_parser
# Generates the tables
#
c_parser.CParser(
lex_optimize=True,
yacc_debug=False,
yacc_optimize=True)
# Load to compile into .pyc
#
import lextab
import yacctab
import c_ast


@@ -0,0 +1,191 @@
#-----------------------------------------------------------------
# pycparser: _c_ast.cfg
#
# Defines the AST Node classes used in pycparser.
#
# Each entry is a Node sub-class name, listing the attributes
# and child nodes of the class:
# <name>* - a child node
# <name>** - a sequence of child nodes
# <name> - an attribute
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#-----------------------------------------------------------------
# ArrayDecl is a nested declaration of an array with the given type.
# dim: the dimension (for example, constant 42)
# dim_quals: list of dimension qualifiers, to support C99's allowing 'const'
# and 'static' within the array dimension in function declarations.
ArrayDecl: [type*, dim*, dim_quals]
ArrayRef: [name*, subscript*]
# op: =, +=, /= etc.
#
Assignment: [op, lvalue*, rvalue*]
BinaryOp: [op, left*, right*]
Break: []
Case: [expr*, stmts**]
Cast: [to_type*, expr*]
# Compound statement in C99 is a list of block items (declarations or
# statements).
#
Compound: [block_items**]
# Compound literal (anonymous aggregate) for C99.
# (type-name) {initializer_list}
# type: the typename
# init: InitList for the initializer list
#
CompoundLiteral: [type*, init*]
# type: int, char, float, etc. see CLexer for constant token types
#
Constant: [type, value]
Continue: []
# name: the variable being declared
# quals: list of qualifiers (const, volatile)
# funcspec: list of function specifiers (e.g. inline in C99)
# storage: list of storage specifiers (extern, register, etc.)
# type: declaration type (probably nested with all the modifiers)
# init: initialization value, or None
# bitsize: bit field size, or None
#
Decl: [name, quals, storage, funcspec, type*, init*, bitsize*]
DeclList: [decls**]
Default: [stmts**]
DoWhile: [cond*, stmt*]
# Represents the ellipsis (...) parameter in a function
# declaration
#
EllipsisParam: []
# An empty statement (a semicolon ';' on its own)
#
EmptyStatement: []
# Enumeration type specifier
# name: an optional ID
# values: an EnumeratorList
#
Enum: [name, values*]
# A name/value pair for enumeration values
#
Enumerator: [name, value*]
# A list of enumerators
#
EnumeratorList: [enumerators**]
# A list of expressions separated by the comma operator.
#
ExprList: [exprs**]
# This is the top of the AST, representing a single C file (a
# translation unit in K&R jargon). It contains a list of
# "external-declaration"s, which is either declarations (Decl),
# Typedef or function definitions (FuncDef).
#
FileAST: [ext**]
# for (init; cond; next) stmt
#
For: [init*, cond*, next*, stmt*]
# name: Id
# args: ExprList
#
FuncCall: [name*, args*]
# type <decl>(args)
#
FuncDecl: [args*, type*]
# Function definition: a declarator for the function name and
# a body, which is a compound statement.
# There's an optional list of parameter declarations for old
# K&R-style definitions
#
FuncDef: [decl*, param_decls**, body*]
Goto: [name]
ID: [name]
# Holder for types that are a simple identifier (e.g. the built
# ins void, char etc. and typedef-defined types)
#
IdentifierType: [names]
If: [cond*, iftrue*, iffalse*]
# An initialization list used for compound literals.
#
InitList: [exprs**]
Label: [name, stmt*]
# A named initializer for C99.
# The name of a NamedInitializer is a sequence of Nodes, because
# names can be hierarchical and contain constant expressions.
#
NamedInitializer: [name**, expr*]
# a list of comma separated function parameter declarations
#
ParamList: [params**]
PtrDecl: [quals, type*]
Return: [expr*]
# name: struct tag name
# decls: declaration of members
#
Struct: [name, decls**]
# type: . or ->
# name.field or name->field
#
StructRef: [name*, type, field*]
Switch: [cond*, stmt*]
# cond ? iftrue : iffalse
#
TernaryOp: [cond*, iftrue*, iffalse*]
# A base type declaration
#
TypeDecl: [declname, quals, type*]
# A typedef declaration.
# Very similar to Decl, but without some attributes
#
Typedef: [name, quals, storage, type*]
Typename: [name, quals, type*]
UnaryOp: [op, expr*]
# name: union tag name
# decls: declaration of members
#
Union: [name, decls**]
While: [cond*, stmt*]
Pragma: [string]
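As a sketch of how these entries map to node constructors (attributes and children become positional parameters in the listed order, followed by coord=None):

from pycparser import c_ast
# BinaryOp: [op, left*, right*]  ->  BinaryOp(op, left, right, coord=None)
expr = c_ast.BinaryOp('+', c_ast.ID('x'), c_ast.Constant('int', '1'))
# Assignment: [op, lvalue*, rvalue*]
stmt = c_ast.Assignment('=', c_ast.ID('y'), expr)
stmt.show()  # prints the node tree rooted at the Assignment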


@@ -0,0 +1,106 @@
#------------------------------------------------------------------------------
# pycparser: ast_transforms.py
#
# Some utilities used by the parser to create a friendlier AST.
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#------------------------------------------------------------------------------
from . import c_ast
def fix_switch_cases(switch_node):
""" The 'case' statements in a 'switch' come out of parsing with one
child node, so subsequent statements are just tucked to the parent
Compound. Additionally, consecutive (fall-through) case statements
come out messy. This is a peculiarity of the C grammar. The following:
            switch (myvar) {
                case 10:
                    k = 10;
                    p = k + 1;
                    return 10;
                case 20:
                case 30:
                    return 20;
                default:
                    break;
            }
        Creates this tree (pseudo-dump):
            Switch
                ID: myvar
                Compound:
                    Case 10:
                        k = 10
                    p = k + 1
                    return 10
                    Case 20:
                        Case 30:
                            return 20
                    Default:
                        break
        The goal of this transform is to fix this mess, turning it into the
        following:
            Switch
                ID: myvar
                Compound:
                    Case 10:
                        k = 10
                        p = k + 1
                        return 10
                    Case 20:
                    Case 30:
                        return 20
                    Default:
                        break
A fixed AST node is returned. The argument may be modified.
"""
assert isinstance(switch_node, c_ast.Switch)
if not isinstance(switch_node.stmt, c_ast.Compound):
return switch_node
# The new Compound child for the Switch, which will collect children in the
# correct order
new_compound = c_ast.Compound([], switch_node.stmt.coord)
# The last Case/Default node
last_case = None
# Goes over the children of the Compound below the Switch, adding them
# either directly below new_compound or below the last Case as appropriate
# (for `switch(cond) {}`, block_items would have been None)
for child in (switch_node.stmt.block_items or []):
if isinstance(child, (c_ast.Case, c_ast.Default)):
# If it's a Case/Default:
# 1. Add it to the Compound and mark as "last case"
# 2. If its immediate child is also a Case or Default, promote it
# to a sibling.
new_compound.block_items.append(child)
_extract_nested_case(child, new_compound.block_items)
last_case = new_compound.block_items[-1]
else:
# Other statements are added as children to the last case, if it
# exists.
if last_case is None:
new_compound.block_items.append(child)
else:
last_case.stmts.append(child)
switch_node.stmt = new_compound
return switch_node
def _extract_nested_case(case_node, stmts_list):
""" Recursively extract consecutive Case statements that are made nested
by the parser and add them to the stmts_list.
"""
if isinstance(case_node.stmts[0], (c_ast.Case, c_ast.Default)):
stmts_list.append(case_node.stmts.pop())
_extract_nested_case(stmts_list[-1], stmts_list)
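A small sketch of the transform's effect, assuming the parser applies fix_switch_cases when it builds Switch nodes (as pycparser's CParser does):

from pycparser import c_parser
src = 'int f(int x) { switch (x) { case 1: case 2: return 2; default: return 0; } }'
ast = c_parser.CParser().parse(src)
switch = ast.ext[0].body.block_items[0]
# The fall-through Case 1, Case 2 and the Default now sit as siblings under
# the Switch's Compound instead of being nested inside one another.
for item in switch.stmt.block_items:
    print(type(item).__name__)  # Case, Case, Default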

File diff suppressed because it is too large


@@ -0,0 +1,444 @@
#------------------------------------------------------------------------------
# pycparser: c_generator.py
#
# C code generator from pycparser AST nodes.
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#------------------------------------------------------------------------------
from . import c_ast
class CGenerator(object):
""" Uses the same visitor pattern as c_ast.NodeVisitor, but modified to
return a value from each visit method, using string accumulation in
generic_visit.
"""
def __init__(self):
# Statements start with indentation of self.indent_level spaces, using
# the _make_indent method
#
self.indent_level = 0
def _make_indent(self):
return ' ' * self.indent_level
def visit(self, node):
method = 'visit_' + node.__class__.__name__
return getattr(self, method, self.generic_visit)(node)
def generic_visit(self, node):
#~ print('generic:', type(node))
if node is None:
return ''
else:
return ''.join(self.visit(c) for c_name, c in node.children())
def visit_Constant(self, n):
return n.value
def visit_ID(self, n):
return n.name
def visit_Pragma(self, n):
ret = '#pragma'
if n.string:
ret += ' ' + n.string
return ret
def visit_ArrayRef(self, n):
arrref = self._parenthesize_unless_simple(n.name)
return arrref + '[' + self.visit(n.subscript) + ']'
def visit_StructRef(self, n):
sref = self._parenthesize_unless_simple(n.name)
return sref + n.type + self.visit(n.field)
def visit_FuncCall(self, n):
fref = self._parenthesize_unless_simple(n.name)
return fref + '(' + self.visit(n.args) + ')'
def visit_UnaryOp(self, n):
operand = self._parenthesize_unless_simple(n.expr)
if n.op == 'p++':
return '%s++' % operand
elif n.op == 'p--':
return '%s--' % operand
elif n.op == 'sizeof':
# Always parenthesize the argument of sizeof since it can be
# a name.
return 'sizeof(%s)' % self.visit(n.expr)
else:
return '%s%s' % (n.op, operand)
def visit_BinaryOp(self, n):
lval_str = self._parenthesize_if(n.left,
lambda d: not self._is_simple_node(d))
rval_str = self._parenthesize_if(n.right,
lambda d: not self._is_simple_node(d))
return '%s %s %s' % (lval_str, n.op, rval_str)
def visit_Assignment(self, n):
rval_str = self._parenthesize_if(
n.rvalue,
lambda n: isinstance(n, c_ast.Assignment))
return '%s %s %s' % (self.visit(n.lvalue), n.op, rval_str)
def visit_IdentifierType(self, n):
return ' '.join(n.names)
def _visit_expr(self, n):
if isinstance(n, c_ast.InitList):
return '{' + self.visit(n) + '}'
elif isinstance(n, c_ast.ExprList):
return '(' + self.visit(n) + ')'
else:
return self.visit(n)
def visit_Decl(self, n, no_type=False):
# no_type is used when a Decl is part of a DeclList, where the type is
# explicitly only for the first declaration in a list.
#
s = n.name if no_type else self._generate_decl(n)
if n.bitsize: s += ' : ' + self.visit(n.bitsize)
if n.init:
s += ' = ' + self._visit_expr(n.init)
return s
def visit_DeclList(self, n):
s = self.visit(n.decls[0])
if len(n.decls) > 1:
s += ', ' + ', '.join(self.visit_Decl(decl, no_type=True)
for decl in n.decls[1:])
return s
def visit_Typedef(self, n):
s = ''
if n.storage: s += ' '.join(n.storage) + ' '
s += self._generate_type(n.type)
return s
def visit_Cast(self, n):
s = '(' + self._generate_type(n.to_type, emit_declname=False) + ')'
return s + ' ' + self._parenthesize_unless_simple(n.expr)
def visit_ExprList(self, n):
visited_subexprs = []
for expr in n.exprs:
visited_subexprs.append(self._visit_expr(expr))
return ', '.join(visited_subexprs)
def visit_InitList(self, n):
visited_subexprs = []
for expr in n.exprs:
visited_subexprs.append(self._visit_expr(expr))
return ', '.join(visited_subexprs)
def visit_Enum(self, n):
return self._generate_struct_union_enum(n, name='enum')
def visit_Enumerator(self, n):
if not n.value:
return '{indent}{name},\n'.format(
indent=self._make_indent(),
name=n.name,
)
else:
return '{indent}{name} = {value},\n'.format(
indent=self._make_indent(),
name=n.name,
value=self.visit(n.value),
)
def visit_FuncDef(self, n):
decl = self.visit(n.decl)
self.indent_level = 0
body = self.visit(n.body)
if n.param_decls:
knrdecls = ';\n'.join(self.visit(p) for p in n.param_decls)
return decl + '\n' + knrdecls + ';\n' + body + '\n'
else:
return decl + '\n' + body + '\n'
def visit_FileAST(self, n):
s = ''
for ext in n.ext:
if isinstance(ext, c_ast.FuncDef):
s += self.visit(ext)
elif isinstance(ext, c_ast.Pragma):
s += self.visit(ext) + '\n'
else:
s += self.visit(ext) + ';\n'
return s
def visit_Compound(self, n):
s = self._make_indent() + '{\n'
self.indent_level += 2
if n.block_items:
s += ''.join(self._generate_stmt(stmt) for stmt in n.block_items)
self.indent_level -= 2
s += self._make_indent() + '}\n'
return s
def visit_CompoundLiteral(self, n):
return '(' + self.visit(n.type) + '){' + self.visit(n.init) + '}'
def visit_EmptyStatement(self, n):
return ';'
def visit_ParamList(self, n):
return ', '.join(self.visit(param) for param in n.params)
def visit_Return(self, n):
s = 'return'
if n.expr: s += ' ' + self.visit(n.expr)
return s + ';'
def visit_Break(self, n):
return 'break;'
def visit_Continue(self, n):
return 'continue;'
def visit_TernaryOp(self, n):
s = '(' + self._visit_expr(n.cond) + ') ? '
s += '(' + self._visit_expr(n.iftrue) + ') : '
s += '(' + self._visit_expr(n.iffalse) + ')'
return s
def visit_If(self, n):
s = 'if ('
if n.cond: s += self.visit(n.cond)
s += ')\n'
s += self._generate_stmt(n.iftrue, add_indent=True)
if n.iffalse:
s += self._make_indent() + 'else\n'
s += self._generate_stmt(n.iffalse, add_indent=True)
return s
def visit_For(self, n):
s = 'for ('
if n.init: s += self.visit(n.init)
s += ';'
if n.cond: s += ' ' + self.visit(n.cond)
s += ';'
if n.next: s += ' ' + self.visit(n.next)
s += ')\n'
s += self._generate_stmt(n.stmt, add_indent=True)
return s
def visit_While(self, n):
s = 'while ('
if n.cond: s += self.visit(n.cond)
s += ')\n'
s += self._generate_stmt(n.stmt, add_indent=True)
return s
def visit_DoWhile(self, n):
s = 'do\n'
s += self._generate_stmt(n.stmt, add_indent=True)
s += self._make_indent() + 'while ('
if n.cond: s += self.visit(n.cond)
s += ');'
return s
def visit_Switch(self, n):
s = 'switch (' + self.visit(n.cond) + ')\n'
s += self._generate_stmt(n.stmt, add_indent=True)
return s
def visit_Case(self, n):
s = 'case ' + self.visit(n.expr) + ':\n'
for stmt in n.stmts:
s += self._generate_stmt(stmt, add_indent=True)
return s
def visit_Default(self, n):
s = 'default:\n'
for stmt in n.stmts:
s += self._generate_stmt(stmt, add_indent=True)
return s
def visit_Label(self, n):
return n.name + ':\n' + self._generate_stmt(n.stmt)
def visit_Goto(self, n):
return 'goto ' + n.name + ';'
def visit_EllipsisParam(self, n):
return '...'
def visit_Struct(self, n):
return self._generate_struct_union_enum(n, 'struct')
def visit_Typename(self, n):
return self._generate_type(n.type)
def visit_Union(self, n):
return self._generate_struct_union_enum(n, 'union')
def visit_NamedInitializer(self, n):
s = ''
for name in n.name:
if isinstance(name, c_ast.ID):
s += '.' + name.name
else:
s += '[' + self.visit(name) + ']'
s += ' = ' + self._visit_expr(n.expr)
return s
def visit_FuncDecl(self, n):
return self._generate_type(n)
def visit_ArrayDecl(self, n):
return self._generate_type(n, emit_declname=False)
def visit_TypeDecl(self, n):
return self._generate_type(n, emit_declname=False)
def visit_PtrDecl(self, n):
return self._generate_type(n, emit_declname=False)
def _generate_struct_union_enum(self, n, name):
""" Generates code for structs, unions, and enums. name should be
'struct', 'union', or 'enum'.
"""
if name in ('struct', 'union'):
members = n.decls
body_function = self._generate_struct_union_body
else:
assert name == 'enum'
members = None if n.values is None else n.values.enumerators
body_function = self._generate_enum_body
s = name + ' ' + (n.name or '')
if members is not None:
# None means no members
# Empty sequence means an empty list of members
s += '\n'
s += self._make_indent()
self.indent_level += 2
s += '{\n'
s += body_function(members)
self.indent_level -= 2
s += self._make_indent() + '}'
return s
def _generate_struct_union_body(self, members):
return ''.join(self._generate_stmt(decl) for decl in members)
def _generate_enum_body(self, members):
# `[:-2] + '\n'` removes the final `,` from the enumerator list
return ''.join(self.visit(value) for value in members)[:-2] + '\n'
def _generate_stmt(self, n, add_indent=False):
""" Generation from a statement node. This method exists as a wrapper
for individual visit_* methods to handle different treatment of
some statements in this context.
"""
typ = type(n)
if add_indent: self.indent_level += 2
indent = self._make_indent()
if add_indent: self.indent_level -= 2
if typ in (
c_ast.Decl, c_ast.Assignment, c_ast.Cast, c_ast.UnaryOp,
c_ast.BinaryOp, c_ast.TernaryOp, c_ast.FuncCall, c_ast.ArrayRef,
c_ast.StructRef, c_ast.Constant, c_ast.ID, c_ast.Typedef,
c_ast.ExprList):
# These can also appear in an expression context so no semicolon
# is added to them automatically
#
return indent + self.visit(n) + ';\n'
elif typ in (c_ast.Compound,):
# No extra indentation required before the opening brace of a
# compound - because it consists of multiple lines it has to
# compute its own indentation.
#
return self.visit(n)
else:
return indent + self.visit(n) + '\n'
def _generate_decl(self, n):
""" Generation from a Decl node.
"""
s = ''
if n.funcspec: s = ' '.join(n.funcspec) + ' '
if n.storage: s += ' '.join(n.storage) + ' '
s += self._generate_type(n.type)
return s
def _generate_type(self, n, modifiers=[], emit_declname = True):
""" Recursive generation from a type node. n is the type node.
modifiers collects the PtrDecl, ArrayDecl and FuncDecl modifiers
encountered on the way down to a TypeDecl, to allow proper
generation from it.
"""
typ = type(n)
#~ print(n, modifiers)
if typ == c_ast.TypeDecl:
s = ''
if n.quals: s += ' '.join(n.quals) + ' '
s += self.visit(n.type)
nstr = n.declname if n.declname and emit_declname else ''
# Resolve modifiers.
# Wrap in parens to distinguish pointer to array and pointer to
# function syntax.
#
for i, modifier in enumerate(modifiers):
if isinstance(modifier, c_ast.ArrayDecl):
if (i != 0 and
isinstance(modifiers[i - 1], c_ast.PtrDecl)):
nstr = '(' + nstr + ')'
nstr += '['
if modifier.dim_quals:
nstr += ' '.join(modifier.dim_quals) + ' '
nstr += self.visit(modifier.dim) + ']'
elif isinstance(modifier, c_ast.FuncDecl):
if (i != 0 and
isinstance(modifiers[i - 1], c_ast.PtrDecl)):
nstr = '(' + nstr + ')'
nstr += '(' + self.visit(modifier.args) + ')'
elif isinstance(modifier, c_ast.PtrDecl):
if modifier.quals:
nstr = '* %s%s' % (' '.join(modifier.quals),
' ' + nstr if nstr else '')
else:
nstr = '*' + nstr
if nstr: s += ' ' + nstr
return s
elif typ == c_ast.Decl:
return self._generate_decl(n.type)
elif typ == c_ast.Typename:
return self._generate_type(n.type, emit_declname = emit_declname)
elif typ == c_ast.IdentifierType:
return ' '.join(n.names) + ' '
elif typ in (c_ast.ArrayDecl, c_ast.PtrDecl, c_ast.FuncDecl):
return self._generate_type(n.type, modifiers + [n],
emit_declname = emit_declname)
else:
return self.visit(n)
def _parenthesize_if(self, n, condition):
""" Visits 'n' and returns its string representation, parenthesized
if the condition function applied to the node returns True.
"""
s = self._visit_expr(n)
if condition(n):
return '(' + s + ')'
else:
return s
def _parenthesize_unless_simple(self, n):
""" Common use case for _parenthesize_if
"""
return self._parenthesize_if(n, lambda d: not self._is_simple_node(d))
def _is_simple_node(self, n):
""" Returns True for nodes that are "simple" - i.e. nodes that always
have higher precedence than operators.
"""
return isinstance(n, (c_ast.Constant, c_ast.ID, c_ast.ArrayRef,
c_ast.StructRef, c_ast.FuncCall))
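A minimal round-trip sketch using this generator together with the parser from this commit:

from pycparser import c_parser, c_generator
ast = c_parser.CParser().parse('int square(int x) { return x * x; }')
# Emits the function back as C source, reindented with two-space blocks.
print(c_generator.CGenerator().visit(ast))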


@@ -0,0 +1,514 @@
#------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#------------------------------------------------------------------------------
import re
import sys
from .ply import lex
from .ply.lex import TOKEN
class CLexer(object):
""" A lexer for the C language. After building it, set the
input text with input(), and call token() to get new
tokens.
The public attribute filename can be set to an initial
filename, but the lexer will update it upon #line
directives.
"""
def __init__(self, error_func, on_lbrace_func, on_rbrace_func,
type_lookup_func):
""" Create a new Lexer.
error_func:
An error function. Will be called with an error
message, line and column as arguments, in case of
an error during lexing.
on_lbrace_func, on_rbrace_func:
Called when an LBRACE or RBRACE is encountered
(likely to push/pop type_lookup_func's scope)
type_lookup_func:
A type lookup function. Given a string, it must
return True IFF this string is a name of a type
that was defined with a typedef earlier.
"""
self.error_func = error_func
self.on_lbrace_func = on_lbrace_func
self.on_rbrace_func = on_rbrace_func
self.type_lookup_func = type_lookup_func
self.filename = ''
# Keeps track of the last token returned from self.token()
self.last_token = None
# Allow either "# line" or "# <num>" to support GCC's
# cpp output
#
self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')
self.pragma_pattern = re.compile(r'[ \t]*pragma\W')
def build(self, **kwargs):
""" Builds the lexer from the specification. Must be
called after the lexer object is created.
This method exists separately, because the PLY
manual warns against calling lex.lex inside
__init__
"""
self.lexer = lex.lex(object=self, **kwargs)
def reset_lineno(self):
""" Resets the internal line number counter of the lexer.
"""
self.lexer.lineno = 1
def input(self, text):
self.lexer.input(text)
def token(self):
self.last_token = self.lexer.token()
return self.last_token
def find_tok_column(self, token):
""" Find the column of the token in its line.
"""
last_cr = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
return token.lexpos - last_cr
######################-- PRIVATE --######################
##
## Internal auxiliary methods
##
def _error(self, msg, token):
location = self._make_tok_location(token)
self.error_func(msg, location[0], location[1])
self.lexer.skip(1)
def _make_tok_location(self, token):
return (token.lineno, self.find_tok_column(token))
##
## Reserved keywords
##
keywords = (
'_BOOL', '_COMPLEX', 'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG',
'REGISTER', 'OFFSETOF',
'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
'VOLATILE', 'WHILE', '__INT128',
)
keyword_map = {}
for keyword in keywords:
if keyword == '_BOOL':
keyword_map['_Bool'] = keyword
elif keyword == '_COMPLEX':
keyword_map['_Complex'] = keyword
else:
keyword_map[keyword.lower()] = keyword
##
## All the tokens recognized by the lexer
##
tokens = keywords + (
# Identifiers
'ID',
# Type identifiers (identifiers previously defined as
# types with typedef)
'TYPEID',
# constants
'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN', 'INT_CONST_CHAR',
'FLOAT_CONST', 'HEX_FLOAT_CONST',
'CHAR_CONST',
'WCHAR_CONST',
# String literals
'STRING_LITERAL',
'WSTRING_LITERAL',
# Operators
'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
'LOR', 'LAND', 'LNOT',
'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',
# Assignment
'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
'PLUSEQUAL', 'MINUSEQUAL',
'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
'OREQUAL',
# Increment/decrement
'PLUSPLUS', 'MINUSMINUS',
# Structure dereference (->)
'ARROW',
# Conditional operator (?)
'CONDOP',
        # Delimiters
'LPAREN', 'RPAREN', # ( )
'LBRACKET', 'RBRACKET', # [ ]
'LBRACE', 'RBRACE', # { }
        'COMMA',     'PERIOD',         # , .
'SEMI', 'COLON', # ; :
# Ellipsis (...)
'ELLIPSIS',
# pre-processor
'PPHASH', # '#'
'PPPRAGMA', # 'pragma'
'PPPRAGMASTR',
)
##
## Regexes for use in tokens
##
##
# valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'
hex_prefix = '0[xX]'
hex_digits = '[0-9a-fA-F]+'
bin_prefix = '0[bB]'
bin_digits = '[01]+'
# integer constants (K&R2: A.2.5.1)
integer_suffix_opt = r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
octal_constant = '0[0-7]*'+integer_suffix_opt
hex_constant = hex_prefix+hex_digits+integer_suffix_opt
bin_constant = bin_prefix+bin_digits+integer_suffix_opt
bad_octal_constant = '0[0-7]*[89]'
# character constants (K&R2: A.2.5.2)
# Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
# directives with Windows paths as filenames (..\..\dir\file)
# For the same reason, decimal_escape allows all digit sequences. We want to
# parse all correct code, even if it means to sometimes parse incorrect
# code.
#
# The original regexes were taken verbatim from the C syntax definition,
# and were later modified to avoid worst-case exponential running time.
#
# simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
# decimal_escape = r"""(\d+)"""
# hex_escape = r"""(x[0-9a-fA-F]+)"""
# bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
#
# The following modifications were made to avoid the ambiguity that allowed backtracking:
# (https://github.com/eliben/pycparser/issues/61)
#
# - \x was removed from simple_escape, unless it was not followed by a hex digit, to avoid ambiguity with hex_escape.
    # - hex_escape allows one or more hex characters, but requires that the next character (if any) is not hex
    # - decimal_escape allows one or more decimal characters, but requires that the next character (if any) is not a decimal
# - bad_escape does not allow any decimals (8-9), to avoid conflicting with the permissive decimal_escape.
#
# Without this change, python's `re` module would recursively try parsing each ambiguous escape sequence in multiple ways.
# e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`.
simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
decimal_escape = r"""(\d+)(?!\d)"""
hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""
escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
# This complicated regex with lookahead might be slow for strings, so because all of the valid escapes (including \x) allowed
# 0 or more non-escaped characters after the first character, simple_escape+decimal_escape+hex_escape got simplified to
escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""
cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
char_const = "'"+cconst_char+"'"
wchar_const = 'L'+char_const
multicharacter_constant = "'"+cconst_char+"{2,4}'"
unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""
# string literals (K&R2: A.2.6)
string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')'
string_literal = '"'+string_char+'*"'
wstring_literal = 'L'+string_literal
bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'
# floating constants (K&R2: A.2.5.3)
exponent_part = r"""([eE][-+]?[0-9]+)"""
fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
hex_fractional_constant = '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
hex_floating_constant = '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+binary_exponent_part+'[FfLl]?)'
##
## Lexer states: used for preprocessor \n-terminated directives
##
states = (
# ppline: preprocessor line directives
#
('ppline', 'exclusive'),
# pppragma: pragma
#
('pppragma', 'exclusive'),
)
def t_PPHASH(self, t):
r'[ \t]*\#'
if self.line_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
t.lexer.begin('ppline')
self.pp_line = self.pp_filename = None
elif self.pragma_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
t.lexer.begin('pppragma')
else:
t.type = 'PPHASH'
return t
##
## Rules for the ppline state
##
@TOKEN(string_literal)
def t_ppline_FILENAME(self, t):
if self.pp_line is None:
self._error('filename before line number in #line', t)
else:
self.pp_filename = t.value.lstrip('"').rstrip('"')
@TOKEN(decimal_constant)
def t_ppline_LINE_NUMBER(self, t):
if self.pp_line is None:
self.pp_line = t.value
else:
# Ignore: GCC's cpp sometimes inserts a numeric flag
# after the file name
pass
def t_ppline_NEWLINE(self, t):
r'\n'
if self.pp_line is None:
self._error('line number missing in #line', t)
else:
self.lexer.lineno = int(self.pp_line)
if self.pp_filename is not None:
self.filename = self.pp_filename
t.lexer.begin('INITIAL')
def t_ppline_PPLINE(self, t):
r'line'
pass
t_ppline_ignore = ' \t'
def t_ppline_error(self, t):
self._error('invalid #line directive', t)
##
## Rules for the pppragma state
##
def t_pppragma_NEWLINE(self, t):
r'\n'
t.lexer.lineno += 1
t.lexer.begin('INITIAL')
def t_pppragma_PPPRAGMA(self, t):
r'pragma'
return t
t_pppragma_ignore = ' \t'
def t_pppragma_STR(self, t):
'.+'
t.type = 'PPPRAGMASTR'
return t
def t_pppragma_error(self, t):
self._error('invalid #pragma directive', t)
##
## Rules for the normal state
##
t_ignore = ' \t'
# Newlines
def t_NEWLINE(self, t):
r'\n+'
t.lexer.lineno += t.value.count("\n")
# Operators
t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMES = r'\*'
t_DIVIDE = r'/'
t_MOD = r'%'
t_OR = r'\|'
t_AND = r'&'
t_NOT = r'~'
t_XOR = r'\^'
t_LSHIFT = r'<<'
t_RSHIFT = r'>>'
t_LOR = r'\|\|'
t_LAND = r'&&'
t_LNOT = r'!'
t_LT = r'<'
t_GT = r'>'
t_LE = r'<='
t_GE = r'>='
t_EQ = r'=='
t_NE = r'!='
# Assignment operators
t_EQUALS = r'='
t_TIMESEQUAL = r'\*='
t_DIVEQUAL = r'/='
t_MODEQUAL = r'%='
t_PLUSEQUAL = r'\+='
t_MINUSEQUAL = r'-='
t_LSHIFTEQUAL = r'<<='
t_RSHIFTEQUAL = r'>>='
t_ANDEQUAL = r'&='
t_OREQUAL = r'\|='
t_XOREQUAL = r'\^='
# Increment/decrement
t_PLUSPLUS = r'\+\+'
t_MINUSMINUS = r'--'
# ->
t_ARROW = r'->'
# ?
t_CONDOP = r'\?'
    # Delimiters
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_LBRACKET = r'\['
t_RBRACKET = r'\]'
t_COMMA = r','
t_PERIOD = r'\.'
t_SEMI = r';'
t_COLON = r':'
t_ELLIPSIS = r'\.\.\.'
# Scope delimiters
# To see why on_lbrace_func is needed, consider:
# typedef char TT;
# void foo(int TT) { TT = 10; }
# TT x = 5;
# Outside the function, TT is a typedef, but inside (starting and ending
# with the braces) it's a parameter. The trouble begins with yacc's
# lookahead token. If we open a new scope in brace_open, then TT has
# already been read and incorrectly interpreted as TYPEID. So, we need
# to open and close scopes from within the lexer.
# Similar for the TT immediately outside the end of the function.
#
@TOKEN(r'\{')
def t_LBRACE(self, t):
self.on_lbrace_func()
return t
@TOKEN(r'\}')
def t_RBRACE(self, t):
self.on_rbrace_func()
return t
t_STRING_LITERAL = string_literal
# The following floating and integer constants are defined as
# functions to impose a strict order (otherwise, decimal
# is placed before the others because its regex is longer,
# and this is bad)
#
@TOKEN(floating_constant)
def t_FLOAT_CONST(self, t):
return t
@TOKEN(hex_floating_constant)
def t_HEX_FLOAT_CONST(self, t):
return t
@TOKEN(hex_constant)
def t_INT_CONST_HEX(self, t):
return t
@TOKEN(bin_constant)
def t_INT_CONST_BIN(self, t):
return t
@TOKEN(bad_octal_constant)
def t_BAD_CONST_OCT(self, t):
msg = "Invalid octal constant"
self._error(msg, t)
@TOKEN(octal_constant)
def t_INT_CONST_OCT(self, t):
return t
@TOKEN(decimal_constant)
def t_INT_CONST_DEC(self, t):
return t
# Must come before bad_char_const, to prevent it from
# catching valid char constants as invalid
#
@TOKEN(multicharacter_constant)
def t_INT_CONST_CHAR(self, t):
return t
@TOKEN(char_const)
def t_CHAR_CONST(self, t):
return t
@TOKEN(wchar_const)
def t_WCHAR_CONST(self, t):
return t
@TOKEN(unmatched_quote)
def t_UNMATCHED_QUOTE(self, t):
msg = "Unmatched '"
self._error(msg, t)
@TOKEN(bad_char_const)
def t_BAD_CHAR_CONST(self, t):
msg = "Invalid char constant %s" % t.value
self._error(msg, t)
@TOKEN(wstring_literal)
def t_WSTRING_LITERAL(self, t):
return t
# unmatched string literals are caught by the preprocessor
@TOKEN(bad_string_literal)
def t_BAD_STRING_LITERAL(self, t):
msg = "String contains invalid escape code"
self._error(msg, t)
@TOKEN(identifier)
def t_ID(self, t):
t.type = self.keyword_map.get(t.value, "ID")
if t.type == 'ID' and self.type_lookup_func(t.value):
t.type = "TYPEID"
return t
def t_error(self, t):
msg = 'Illegal character %s' % repr(t.value[0])
self._error(msg, t)
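A minimal sketch that drives the lexer on its own, with no-op scope callbacks and a type lookup that never reports a typedef name:

from pycparser.c_lexer import CLexer
lexer = CLexer(error_func=lambda msg, line, col: print(msg, line, col),
               on_lbrace_func=lambda: None,
               on_rbrace_func=lambda: None,
               type_lookup_func=lambda name: False)
lexer.build()
lexer.input('int x = 42;')
tok = lexer.token()
while tok is not None:
    print(tok.type, tok.value)  # INT, ID, EQUALS, INT_CONST_DEC, SEMI
    tok = lexer.token()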

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long


@@ -0,0 +1,5 @@
# PLY package
# Author: David Beazley (dave@dabeaz.com)
__version__ = '3.9'
__all__ = ['lex','yacc']


@@ -0,0 +1,905 @@
# -----------------------------------------------------------------------------
# cpp.py
#
# Author: David Beazley (http://www.dabeaz.com)
# Copyright (C) 2017
# All rights reserved
#
# This module implements an ANSI-C style lexical preprocessor for PLY.
# -----------------------------------------------------------------------------
import sys
# Some Python 3 compatibility shims
if sys.version_info.major < 3:
STRING_TYPES = (str, unicode)
else:
STRING_TYPES = str
xrange = range
# -----------------------------------------------------------------------------
# Default preprocessor lexer definitions. These tokens are enough to get
# a basic preprocessor working. Other modules may import these if they want
# -----------------------------------------------------------------------------
tokens = (
'CPP_ID','CPP_INTEGER', 'CPP_FLOAT', 'CPP_STRING', 'CPP_CHAR', 'CPP_WS', 'CPP_COMMENT1', 'CPP_COMMENT2', 'CPP_POUND','CPP_DPOUND'
)
literals = "+-*/%|&~^<>=!?()[]{}.,;:\\\'\""
# Whitespace
def t_CPP_WS(t):
r'\s+'
t.lexer.lineno += t.value.count("\n")
return t
t_CPP_POUND = r'\#'
t_CPP_DPOUND = r'\#\#'
# Identifier
t_CPP_ID = r'[A-Za-z_][\w_]*'
# Integer literal
def CPP_INTEGER(t):
r'(((((0x)|(0X))[0-9a-fA-F]+)|(\d+))([uU][lL]|[lL][uU]|[uU]|[lL])?)'
return t
t_CPP_INTEGER = CPP_INTEGER
# Floating literal
t_CPP_FLOAT = r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?'
# String literal
def t_CPP_STRING(t):
r'\"([^\\\n]|(\\(.|\n)))*?\"'
t.lexer.lineno += t.value.count("\n")
return t
# Character constant 'c' or L'c'
def t_CPP_CHAR(t):
r'(L)?\'([^\\\n]|(\\(.|\n)))*?\''
t.lexer.lineno += t.value.count("\n")
return t
# Comment
def t_CPP_COMMENT1(t):
r'(/\*(.|\n)*?\*/)'
ncr = t.value.count("\n")
t.lexer.lineno += ncr
# replace with one space or a number of '\n'
t.type = 'CPP_WS'; t.value = '\n' * ncr if ncr else ' '
return t
# Line comment
def t_CPP_COMMENT2(t):
r'(//.*?(\n|$))'
# replace with '/n'
t.type = 'CPP_WS'; t.value = '\n'
return t
def t_error(t):
t.type = t.value[0]
t.value = t.value[0]
t.lexer.skip(1)
return t
import re
import copy
import time
import os.path
# -----------------------------------------------------------------------------
# trigraph()
#
# Given an input string, this function replaces all trigraph sequences.
# The following mapping is used:
#
# ??= #
# ??/ \
# ??' ^
# ??( [
# ??) ]
# ??! |
# ??< {
# ??> }
# ??- ~
# -----------------------------------------------------------------------------
_trigraph_pat = re.compile(r'''\?\?[=/\'\(\)\!<>\-]''')
_trigraph_rep = {
'=':'#',
'/':'\\',
"'":'^',
'(':'[',
')':']',
'!':'|',
'<':'{',
'>':'}',
'-':'~'
}
def trigraph(input):
return _trigraph_pat.sub(lambda g: _trigraph_rep[g.group()[-1]],input)
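For example (a hypothetical input, using the mapping above):

print(trigraph('??=define ARR(x) x??(0??)'))  # prints: #define ARR(x) x[0]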
# ------------------------------------------------------------------
# Macro object
#
# This object holds information about preprocessor macros
#
# .name - Macro name (string)
# .value - Macro value (a list of tokens)
# .arglist - List of argument names
# .variadic - Boolean indicating whether or not variadic macro
# .vararg - Name of the variadic parameter
#
# When a macro is created, the macro replacement token sequence is
# pre-scanned and used to create patch lists that are later used
# during macro expansion
# ------------------------------------------------------------------
class Macro(object):
def __init__(self,name,value,arglist=None,variadic=False):
self.name = name
self.value = value
self.arglist = arglist
self.variadic = variadic
if variadic:
self.vararg = arglist[-1]
self.source = None
# ------------------------------------------------------------------
# Preprocessor object
#
# Object representing a preprocessor. Contains macro definitions,
# include directories, and other information
# ------------------------------------------------------------------
class Preprocessor(object):
def __init__(self,lexer=None):
if lexer is None:
lexer = lex.lexer
self.lexer = lexer
self.macros = { }
self.path = []
self.temp_path = []
# Probe the lexer for selected tokens
self.lexprobe()
tm = time.localtime()
self.define("__DATE__ \"%s\"" % time.strftime("%b %d %Y",tm))
self.define("__TIME__ \"%s\"" % time.strftime("%H:%M:%S",tm))
self.parser = None
# -----------------------------------------------------------------------------
# tokenize()
#
# Utility function. Given a string of text, tokenize into a list of tokens
# -----------------------------------------------------------------------------
def tokenize(self,text):
tokens = []
self.lexer.input(text)
while True:
tok = self.lexer.token()
if not tok: break
tokens.append(tok)
return tokens
# ---------------------------------------------------------------------
# error()
#
# Report a preprocessor error/warning of some kind
# ----------------------------------------------------------------------
def error(self,file,line,msg):
print("%s:%d %s" % (file,line,msg))
# ----------------------------------------------------------------------
# lexprobe()
#
# This method probes the preprocessor lexer object to discover
# the token types of symbols that are important to the preprocessor.
# If this works right, the preprocessor will simply "work"
# with any suitable lexer regardless of how tokens have been named.
# ----------------------------------------------------------------------
def lexprobe(self):
# Determine the token type for identifiers
self.lexer.input("identifier")
tok = self.lexer.token()
if not tok or tok.value != "identifier":
print("Couldn't determine identifier type")
else:
self.t_ID = tok.type
# Determine the token type for integers
self.lexer.input("12345")
tok = self.lexer.token()
if not tok or int(tok.value) != 12345:
print("Couldn't determine integer type")
else:
self.t_INTEGER = tok.type
self.t_INTEGER_TYPE = type(tok.value)
# Determine the token type for strings enclosed in double quotes
self.lexer.input("\"filename\"")
tok = self.lexer.token()
if not tok or tok.value != "\"filename\"":
print("Couldn't determine string type")
else:
self.t_STRING = tok.type
# Determine the token type for whitespace--if any
self.lexer.input(" ")
tok = self.lexer.token()
if not tok or tok.value != " ":
self.t_SPACE = None
else:
self.t_SPACE = tok.type
# Determine the token type for newlines
self.lexer.input("\n")
tok = self.lexer.token()
if not tok or tok.value != "\n":
self.t_NEWLINE = None
print("Couldn't determine token for newlines")
else:
self.t_NEWLINE = tok.type
self.t_WS = (self.t_SPACE, self.t_NEWLINE)
# Check for other characters used by the preprocessor
chars = [ '<','>','#','##','\\','(',')',',','.']
for c in chars:
self.lexer.input(c)
tok = self.lexer.token()
if not tok or tok.value != c:
print("Unable to lex '%s' required for preprocessor" % c)
# ----------------------------------------------------------------------
# add_path()
#
# Adds a search path to the preprocessor.
# ----------------------------------------------------------------------
def add_path(self,path):
self.path.append(path)
# ----------------------------------------------------------------------
# group_lines()
#
# Given an input string, this function splits it into lines. Trailing whitespace
# is removed. Any line ending with \ is grouped with the next line. This
    # function forms the lowest level of the preprocessor---grouping text into
# a line-by-line format.
# ----------------------------------------------------------------------
def group_lines(self,input):
lex = self.lexer.clone()
lines = [x.rstrip() for x in input.splitlines()]
for i in xrange(len(lines)):
j = i+1
while lines[i].endswith('\\') and (j < len(lines)):
lines[i] = lines[i][:-1]+lines[j]
lines[j] = ""
j += 1
input = "\n".join(lines)
lex.input(input)
lex.lineno = 1
current_line = []
while True:
tok = lex.token()
if not tok:
break
current_line.append(tok)
if tok.type in self.t_WS and '\n' in tok.value:
yield current_line
current_line = []
if current_line:
yield current_line
# ----------------------------------------------------------------------
# tokenstrip()
#
# Remove leading/trailing whitespace tokens from a token list
# ----------------------------------------------------------------------
def tokenstrip(self,tokens):
i = 0
while i < len(tokens) and tokens[i].type in self.t_WS:
i += 1
del tokens[:i]
i = len(tokens)-1
while i >= 0 and tokens[i].type in self.t_WS:
i -= 1
del tokens[i+1:]
return tokens
# ----------------------------------------------------------------------
# collect_args()
#
# Collects comma separated arguments from a list of tokens. The arguments
    # must be enclosed in parentheses. Returns a tuple (tokencount,args,positions)
# where tokencount is the number of tokens consumed, args is a list of arguments,
# and positions is a list of integers containing the starting index of each
# argument. Each argument is represented by a list of tokens.
#
# When collecting arguments, leading and trailing whitespace is removed
# from each argument.
#
    # This function properly handles nested parentheses and commas---these do not
# define new arguments.
# ----------------------------------------------------------------------
def collect_args(self,tokenlist):
args = []
positions = []
current_arg = []
nesting = 1
tokenlen = len(tokenlist)
# Search for the opening '('.
i = 0
while (i < tokenlen) and (tokenlist[i].type in self.t_WS):
i += 1
if (i < tokenlen) and (tokenlist[i].value == '('):
positions.append(i+1)
else:
self.error(self.source,tokenlist[0].lineno,"Missing '(' in macro arguments")
return 0, [], []
i += 1
while i < tokenlen:
t = tokenlist[i]
if t.value == '(':
current_arg.append(t)
nesting += 1
elif t.value == ')':
nesting -= 1
if nesting == 0:
if current_arg:
args.append(self.tokenstrip(current_arg))
positions.append(i)
return i+1,args,positions
current_arg.append(t)
elif t.value == ',' and nesting == 1:
args.append(self.tokenstrip(current_arg))
positions.append(i+1)
current_arg = []
else:
current_arg.append(t)
i += 1
# Missing end argument
self.error(self.source,tokenlist[-1].lineno,"Missing ')' in macro arguments")
return 0, [],[]
# ----------------------------------------------------------------------
# macro_prescan()
#
# Examine the macro value (token sequence) and identify patch points
# This is used to speed up macro expansion later on---we'll know
# right away where to apply patches to the value to form the expansion
# ----------------------------------------------------------------------
def macro_prescan(self,macro):
macro.patch = [] # Standard macro arguments
macro.str_patch = [] # String conversion expansion
macro.var_comma_patch = [] # Variadic macro comma patch
i = 0
while i < len(macro.value):
if macro.value[i].type == self.t_ID and macro.value[i].value in macro.arglist:
argnum = macro.arglist.index(macro.value[i].value)
# Conversion of argument to a string
if i > 0 and macro.value[i-1].value == '#':
macro.value[i] = copy.copy(macro.value[i])
macro.value[i].type = self.t_STRING
del macro.value[i-1]
macro.str_patch.append((argnum,i-1))
continue
# Concatenation
elif (i > 0 and macro.value[i-1].value == '##'):
macro.patch.append(('c',argnum,i-1))
del macro.value[i-1]
continue
elif ((i+1) < len(macro.value) and macro.value[i+1].value == '##'):
macro.patch.append(('c',argnum,i))
i += 1
continue
# Standard expansion
else:
macro.patch.append(('e',argnum,i))
elif macro.value[i].value == '##':
if macro.variadic and (i > 0) and (macro.value[i-1].value == ',') and \
((i+1) < len(macro.value)) and (macro.value[i+1].type == self.t_ID) and \
(macro.value[i+1].value == macro.vararg):
macro.var_comma_patch.append(i-1)
i += 1
macro.patch.sort(key=lambda x: x[2],reverse=True)
# ----------------------------------------------------------------------
# macro_expand_args()
#
# Given a Macro and list of arguments (each a token list), this method
# returns an expanded version of a macro. The return value is a token sequence
# representing the replacement macro tokens
# ----------------------------------------------------------------------
def macro_expand_args(self,macro,args):
# Make a copy of the macro token sequence
rep = [copy.copy(_x) for _x in macro.value]
# Make string expansion patches. These do not alter the length of the replacement sequence
str_expansion = {}
for argnum, i in macro.str_patch:
if argnum not in str_expansion:
str_expansion[argnum] = ('"%s"' % "".join([x.value for x in args[argnum]])).replace("\\","\\\\")
rep[i] = copy.copy(rep[i])
rep[i].value = str_expansion[argnum]
        # Make the variadic macro comma patch. If the variadic macro argument is empty, we get rid of the preceding comma.
comma_patch = False
if macro.variadic and not args[-1]:
for i in macro.var_comma_patch:
rep[i] = None
comma_patch = True
# Make all other patches. The order of these matters. It is assumed that the patch list
# has been sorted in reverse order of patch location since replacements will cause the
# size of the replacement sequence to expand from the patch point.
expanded = { }
for ptype, argnum, i in macro.patch:
# Concatenation. Argument is left unexpanded
if ptype == 'c':
rep[i:i+1] = args[argnum]
# Normal expansion. Argument is macro expanded first
elif ptype == 'e':
if argnum not in expanded:
expanded[argnum] = self.expand_macros(args[argnum])
rep[i:i+1] = expanded[argnum]
# Get rid of removed comma if necessary
if comma_patch:
rep = [_i for _i in rep if _i]
return rep
# ----------------------------------------------------------------------
# expand_macros()
#
# Given a list of tokens, this function performs macro expansion.
# The expanded argument is a dictionary that contains macros already
# expanded. This is used to prevent infinite recursion.
# ----------------------------------------------------------------------
def expand_macros(self,tokens,expanded=None):
if expanded is None:
expanded = {}
i = 0
while i < len(tokens):
t = tokens[i]
if t.type == self.t_ID:
if t.value in self.macros and t.value not in expanded:
# Yes, we found a macro match
expanded[t.value] = True
m = self.macros[t.value]
if not m.arglist:
# A simple macro
ex = self.expand_macros([copy.copy(_x) for _x in m.value],expanded)
for e in ex:
e.lineno = t.lineno
tokens[i:i+1] = ex
i += len(ex)
else:
# A macro with arguments
j = i + 1
while j < len(tokens) and tokens[j].type in self.t_WS:
j += 1
if tokens[j].value == '(':
tokcount,args,positions = self.collect_args(tokens[j:])
if not m.variadic and len(args) != len(m.arglist):
self.error(self.source,t.lineno,"Macro %s requires %d arguments" % (t.value,len(m.arglist)))
i = j + tokcount
elif m.variadic and len(args) < len(m.arglist)-1:
if len(m.arglist) > 2:
self.error(self.source,t.lineno,"Macro %s must have at least %d arguments" % (t.value, len(m.arglist)-1))
else:
self.error(self.source,t.lineno,"Macro %s must have at least %d argument" % (t.value, len(m.arglist)-1))
i = j + tokcount
else:
if m.variadic:
if len(args) == len(m.arglist)-1:
args.append([])
else:
args[len(m.arglist)-1] = tokens[j+positions[len(m.arglist)-1]:j+tokcount-1]
del args[len(m.arglist):]
# Get macro replacement text
rep = self.macro_expand_args(m,args)
rep = self.expand_macros(rep,expanded)
for r in rep:
r.lineno = t.lineno
tokens[i:j+tokcount] = rep
i += len(rep)
del expanded[t.value]
continue
elif t.value == '__LINE__':
t.type = self.t_INTEGER
t.value = self.t_INTEGER_TYPE(t.lineno)
i += 1
return tokens
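# Standalone sketch of the recursion guard used above: a name already present
# in 'expanded' is not rescanned, so self-referential macros terminate just as
# in a real C preprocessor.  Macros here are plain name -> word-list dicts and
# 'expand_sketch' is an illustrative helper, not part of ply.
def expand_sketch(words, macros, expanded=None):
    if expanded is None:
        expanded = set()
    out = []
    for w in words:
        if w in macros and w not in expanded:
            expanded.add(w)
            out.extend(expand_sketch(macros[w], macros, expanded))
            expanded.discard(w)
        else:
            out.append(w)
    return out

#   macros = {'A': ['B', '+', '1'], 'B': ['A', '*', '2']}      # mutually recursive
#   expand_sketch(['A'], macros) -> ['A', '*', '2', '+', '1']  # inner 'A' left alone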
# ----------------------------------------------------------------------
# evalexpr()
#
# Evaluate a sequence of expression tokens as an integral expression
# (used by the #if and #elif directives).
# ----------------------------------------------------------------------
def evalexpr(self,tokens):
# tokens = tokenize(line)
# Search for defined macros
i = 0
while i < len(tokens):
if tokens[i].type == self.t_ID and tokens[i].value == 'defined':
j = i + 1
needparen = False
result = "0L"
while j < len(tokens):
if tokens[j].type in self.t_WS:
j += 1
continue
elif tokens[j].type == self.t_ID:
if tokens[j].value in self.macros:
result = "1L"
else:
result = "0L"
if not needparen: break
elif tokens[j].value == '(':
needparen = True
elif tokens[j].value == ')':
break
else:
self.error(self.source,tokens[i].lineno,"Malformed defined()")
j += 1
tokens[i].type = self.t_INTEGER
tokens[i].value = self.t_INTEGER_TYPE(result)
del tokens[i+1:j+1]
i += 1
tokens = self.expand_macros(tokens)
for i,t in enumerate(tokens):
if t.type == self.t_ID:
tokens[i] = copy.copy(t)
tokens[i].type = self.t_INTEGER
tokens[i].value = self.t_INTEGER_TYPE("0L")
elif t.type == self.t_INTEGER:
tokens[i] = copy.copy(t)
# Strip off any trailing suffixes
tokens[i].value = str(tokens[i].value)
while tokens[i].value[-1] not in "0123456789abcdefABCDEF":
tokens[i].value = tokens[i].value[:-1]
expr = "".join([str(x.value) for x in tokens])
expr = expr.replace("&&"," and ")
expr = expr.replace("||"," or ")
expr = expr.replace("!"," not ")
try:
result = eval(expr)
except Exception:
self.error(self.source,tokens[0].lineno,"Couldn't evaluate expression")
result = 0
return result
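# Standalone sketch of the final translation step above: once defined(...) and
# any remaining identifiers have been reduced to integers (and numeric suffixes
# stripped), the C operators are rewritten into Python syntax and handed to
# eval().  'eval_sketch' is an illustrative helper, not part of ply; note that,
# as in the code above, '!' is rewritten blindly, so '!=' would also be affected.
def eval_sketch(expr):
    expr = expr.replace("&&", " and ").replace("||", " or ").replace("!", " not ")
    return eval(expr)

#   eval_sketch("(1 || 0) && !(0)") -> True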
# ----------------------------------------------------------------------
# parsegen()
#
# Parse an input string.
# ----------------------------------------------------------------------
def parsegen(self,input,source=None):
# Replace trigraph sequences
t = trigraph(input)
lines = self.group_lines(t)
if not source:
source = ""
self.define("__FILE__ \"%s\"" % source)
self.source = source
chunk = []
enable = True
iftrigger = False
ifstack = []
for x in lines:
for i,tok in enumerate(x):
if tok.type not in self.t_WS: break
if tok.value == '#':
# Preprocessor directive
# Insert necessary whitespace in place of the tokens that were consumed
for tok in x:
if tok.type in self.t_WS and '\n' in tok.value:
chunk.append(tok)
dirtokens = self.tokenstrip(x[i+1:])
if dirtokens:
name = dirtokens[0].value
args = self.tokenstrip(dirtokens[1:])
else:
name = ""
args = []
if name == 'define':
if enable:
for tok in self.expand_macros(chunk):
yield tok
chunk = []
self.define(args)
elif name == 'include':
if enable:
for tok in self.expand_macros(chunk):
yield tok
chunk = []
oldfile = self.macros['__FILE__']
for tok in self.include(args):
yield tok
self.macros['__FILE__'] = oldfile
self.source = source
elif name == 'undef':
if enable:
for tok in self.expand_macros(chunk):
yield tok
chunk = []
self.undef(args)
elif name == 'ifdef':
ifstack.append((enable,iftrigger))
if enable:
if not args[0].value in self.macros:
enable = False
iftrigger = False
else:
iftrigger = True
elif name == 'ifndef':
ifstack.append((enable,iftrigger))
if enable:
if args[0].value in self.macros:
enable = False
iftrigger = False
else:
iftrigger = True
elif name == 'if':
ifstack.append((enable,iftrigger))
if enable:
result = self.evalexpr(args)
if not result:
enable = False
iftrigger = False
else:
iftrigger = True
elif name == 'elif':
if ifstack:
if ifstack[-1][0]: # We only pay attention if outer "if" allows this
if enable: # If already true, we flip enable False
enable = False
elif not iftrigger: # If False, but not triggered yet, we'll check expression
result = self.evalexpr(args)
if result:
enable = True
iftrigger = True
else:
self.error(self.source,dirtokens[0].lineno,"Misplaced #elif")
elif name == 'else':
if ifstack:
if ifstack[-1][0]:
if enable:
enable = False
elif not iftrigger:
enable = True
iftrigger = True
else:
self.error(self.source,dirtokens[0].lineno,"Misplaced #else")
elif name == 'endif':
if ifstack:
enable,iftrigger = ifstack.pop()
else:
self.error(self.source,dirtokens[0].lineno,"Misplaced #endif")
else:
# Unknown preprocessor directive
pass
else:
# Normal text
if enable:
chunk.extend(x)
for tok in self.expand_macros(chunk):
yield tok
chunk = []
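# Standalone sketch of the conditional-compilation state machine used above:
# 'enable' says whether text is currently emitted, 'iftrigger' whether any
# branch of the current #if chain has already fired, and the stack saves the
# outer state across nesting.  'run_conditionals' and its event tuples are
# illustrative, not part of ply.
def run_conditionals(events):
    enable, iftrigger, stack = True, False, []
    emitted = []
    for kind, val in events:
        if kind == 'if':
            stack.append((enable, iftrigger))
            if enable:
                enable = iftrigger = bool(val)
        elif kind == 'elif':
            if stack[-1][0]:                    # outer context allows this chain
                if enable:
                    enable = False              # previous branch already taken
                elif not iftrigger and val:
                    enable = iftrigger = True
        elif kind == 'else':
            if stack[-1][0]:
                if enable:
                    enable = False
                elif not iftrigger:
                    enable, iftrigger = True, True
        elif kind == 'endif':
            enable, iftrigger = stack.pop()
        elif kind == 'text' and enable:
            emitted.append(val)
    return emitted

#   run_conditionals([('if', 0), ('text', 'a'), ('elif', 1), ('text', 'b'),
#                     ('else', None), ('text', 'c'), ('endif', None),
#                     ('text', 'd')]) -> ['b', 'd']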
# ----------------------------------------------------------------------
# include()
#
# Implementation of file-inclusion
# ----------------------------------------------------------------------
def include(self,tokens):
# Try to extract the filename and then process an include file
if not tokens:
return
if tokens:
if tokens[0].value != '<' and tokens[0].type != self.t_STRING:
tokens = self.expand_macros(tokens)
if tokens[0].value == '<':
# Include <...>
i = 1
while i < len(tokens):
if tokens[i].value == '>':
break
i += 1
else:
print("Malformed #include <...>")
return
filename = "".join([x.value for x in tokens[1:i]])
path = self.path + [""] + self.temp_path
elif tokens[0].type == self.t_STRING:
filename = tokens[0].value[1:-1]
path = self.temp_path + [""] + self.path
else:
print("Malformed #include statement")
return
for p in path:
iname = os.path.join(p,filename)
try:
data = open(iname,"r").read()
dname = os.path.dirname(iname)
if dname:
self.temp_path.insert(0,dname)
for tok in self.parsegen(data,filename):
yield tok
if dname:
del self.temp_path[0]
break
except IOError:
pass
else:
print("Couldn't find '%s'" % filename)
# ----------------------------------------------------------------------
# define()
#
# Define a new macro
# ----------------------------------------------------------------------
def define(self,tokens):
if isinstance(tokens,STRING_TYPES):
tokens = self.tokenize(tokens)
linetok = tokens
try:
name = linetok[0]
if len(linetok) > 1:
mtype = linetok[1]
else:
mtype = None
if not mtype:
m = Macro(name.value,[])
self.macros[name.value] = m
elif mtype.type in self.t_WS:
# A normal macro
m = Macro(name.value,self.tokenstrip(linetok[2:]))
self.macros[name.value] = m
elif mtype.value == '(':
# A macro with arguments
tokcount, args, positions = self.collect_args(linetok[1:])
variadic = False
for a in args:
if variadic:
print("No more arguments may follow a variadic argument")
break
astr = "".join([str(_i.value) for _i in a])
if astr == "...":
variadic = True
a[0].type = self.t_ID
a[0].value = '__VA_ARGS__'
variadic = True
del a[1:]
continue
elif astr[-3:] == "..." and a[0].type == self.t_ID:
variadic = True
del a[1:]
# If the ellipsis is attached to the argument name (e.g. "args..."), strip the
# trailing "..." so the bare name is used for macro expansion
if a[0].value[-3:] == '...':
a[0].value = a[0].value[:-3]
continue
if len(a) > 1 or a[0].type != self.t_ID:
print("Invalid macro argument")
break
else:
mvalue = self.tokenstrip(linetok[1+tokcount:])
i = 0
while i < len(mvalue):
if i+1 < len(mvalue):
if mvalue[i].type in self.t_WS and mvalue[i+1].value == '##':
del mvalue[i]
continue
elif mvalue[i].value == '##' and mvalue[i+1].type in self.t_WS:
del mvalue[i+1]
i += 1
m = Macro(name.value,mvalue,[x[0].value for x in args],variadic)
self.macro_prescan(m)
self.macros[name.value] = m
else:
print("Bad macro definition")
except LookupError:
print("Bad macro definition")
# ----------------------------------------------------------------------
# undef()
#
# Undefine a macro
# ----------------------------------------------------------------------
def undef(self,tokens):
id = tokens[0].value
try:
del self.macros[id]
except LookupError:
pass
# ----------------------------------------------------------------------
# parse()
#
# Parse input text.
# ----------------------------------------------------------------------
def parse(self,input,source=None,ignore={}):
self.ignore = ignore
self.parser = self.parsegen(input,source)
# ----------------------------------------------------------------------
# token()
#
# Method to return individual tokens
# ----------------------------------------------------------------------
def token(self):
try:
while True:
tok = next(self.parser)
if tok.type not in self.ignore: return tok
except StopIteration:
self.parser = None
return None
if __name__ == '__main__':
import ply.lex as lex
lexer = lex.lex()
# Run a preprocessor
import sys
f = open(sys.argv[1])
input = f.read()
p = Preprocessor(lexer)
p.parse(input,sys.argv[1])
while True:
tok = p.token()
if not tok: break
print(p.source, tok)
View file
@@ -0,0 +1,133 @@
# ----------------------------------------------------------------------
# ctokens.py
#
# Token specifications for symbols in ANSI C and C++. This file is
# meant to be used as a library in other tokenizers.
# ----------------------------------------------------------------------
# Reserved words
tokens = [
# Literals (identifier, integer constant, float constant, string constant, char const)
'ID', 'TYPEID', 'INTEGER', 'FLOAT', 'STRING', 'CHARACTER',
# Operators (+,-,*,/,%,|,&,~,^,<<,>>, ||, &&, !, <, <=, >, >=, ==, !=)
'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MODULO',
'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
'LOR', 'LAND', 'LNOT',
'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',
# Assignment (=, *=, /=, %=, +=, -=, <<=, >>=, &=, ^=, |=)
'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL', 'PLUSEQUAL', 'MINUSEQUAL',
'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL', 'OREQUAL',
# Increment/decrement (++,--)
'INCREMENT', 'DECREMENT',
# Structure dereference (->)
'ARROW',
# Ternary operator (?)
'TERNARY',
# Delimiters ( ) [ ] { } , . ; :
'LPAREN', 'RPAREN',
'LBRACKET', 'RBRACKET',
'LBRACE', 'RBRACE',
'COMMA', 'PERIOD', 'SEMI', 'COLON',
# Ellipsis (...)
'ELLIPSIS',
]
# Operators
t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMES = r'\*'
t_DIVIDE = r'/'
t_MODULO = r'%'
t_OR = r'\|'
t_AND = r'&'
t_NOT = r'~'
t_XOR = r'\^'
t_LSHIFT = r'<<'
t_RSHIFT = r'>>'
t_LOR = r'\|\|'
t_LAND = r'&&'
t_LNOT = r'!'
t_LT = r'<'
t_GT = r'>'
t_LE = r'<='
t_GE = r'>='
t_EQ = r'=='
t_NE = r'!='
# Assignment operators
t_EQUALS = r'='
t_TIMESEQUAL = r'\*='
t_DIVEQUAL = r'/='
t_MODEQUAL = r'%='
t_PLUSEQUAL = r'\+='
t_MINUSEQUAL = r'-='
t_LSHIFTEQUAL = r'<<='
t_RSHIFTEQUAL = r'>>='
t_ANDEQUAL = r'&='
t_OREQUAL = r'\|='
t_XOREQUAL = r'\^='
# Increment/decrement
t_INCREMENT = r'\+\+'
t_DECREMENT = r'--'
# ->
t_ARROW = r'->'
# ?
t_TERNARY = r'\?'
# Delimiters
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_LBRACKET = r'\['
t_RBRACKET = r'\]'
t_LBRACE = r'\{'
t_RBRACE = r'\}'
t_COMMA = r','
t_PERIOD = r'\.'
t_SEMI = r';'
t_COLON = r':'
t_ELLIPSIS = r'\.\.\.'
# Identifiers
t_ID = r'[A-Za-z_][A-Za-z0-9_]*'
# Integer literal
t_INTEGER = r'\d+([uU]|[lL]|[uU][lL]|[lL][uU])?'
# Floating literal
t_FLOAT = r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?'
# String literal
t_STRING = r'\"([^\\\n]|(\\.))*?\"'
# Character constant 'c' or L'c'
t_CHARACTER = r'(L)?\'([^\\\n]|(\\.))*?\''
# Comment (C-Style)
def t_COMMENT(t):
r'/\*(.|\n)*?\*/'
t.lexer.lineno += t.value.count('\n')
return t
# Comment (C++-Style)
def t_CPPCOMMENT(t):
r'//.*\n'
t.lexer.lineno += 1
return t
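# Usage sketch, shown as a comment because it belongs in a separate module
# ('calclex.py' is hypothetical): a tokenizer reuses these specifications by
# importing the rules it needs, declaring the matching token names, and letting
# ply.lex collect them from its namespace.
#
#     # calclex.py
#     import ply.lex as lex
#     from ply.ctokens import t_PLUS, t_ID, t_INTEGER
#
#     tokens = ['PLUS', 'ID', 'INTEGER']
#     t_ignore = ' \t'
#
#     def t_error(t):
#         t.lexer.skip(1)
#
#     lexer = lex.lex()
#     lexer.input('count + 42')
#     print([(tok.type, tok.value) for tok in lexer])
#     # [('ID', 'count'), ('PLUS', '+'), ('INTEGER', '42')]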
File diff suppressed because it is too large
File diff suppressed because it is too large
View file
@@ -0,0 +1,74 @@
# ply: ygen.py
#
# This is a support program that auto-generates different versions of the YACC parsing
# function with different features removed for performance reasons.
#
# Users should edit the method LRParser.parsedebug() in yacc.py. The source code
# for that method is then used to create the other methods. See the comments in
# yacc.py for further details.
import os.path
import shutil
def get_source_range(lines, tag):
srclines = enumerate(lines)
start_tag = '#--! %s-start' % tag
end_tag = '#--! %s-end' % tag
for start_index, line in srclines:
if line.strip().startswith(start_tag):
break
for end_index, line in srclines:
if line.strip().endswith(end_tag):
break
return (start_index + 1, end_index)
def filter_section(lines, tag):
filtered_lines = []
include = True
tag_text = '#--! %s' % tag
for line in lines:
if line.strip().startswith(tag_text):
include = not include
elif include:
filtered_lines.append(line)
return filtered_lines
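# Illustrative sketch (the sample text is hypothetical): filter_section() drops
# the lines bracketed by a pair of '#--! <tag>' marker lines, which is how the
# DEBUG and TRACKING code is stripped from the copied parser body.
def _demo_filter_section():
    sample = [
        "    #--! parsedebug-start\n",
        "    t = get_token()\n",
        "    #--! DEBUG\n",
        "    debug.info('token %s', t)\n",
        "    #--! DEBUG\n",
        "    return t\n",
        "    #--! parsedebug-end\n",
    ]
    # Returns the sample with the two '#--! DEBUG' markers and the
    # debug.info line removed; the parsedebug markers are kept.
    return filter_section(sample, 'DEBUG')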
def main():
dirname = os.path.dirname(__file__)
shutil.copy2(os.path.join(dirname, 'yacc.py'), os.path.join(dirname, 'yacc.py.bak'))
with open(os.path.join(dirname, 'yacc.py'), 'r') as f:
lines = f.readlines()
parse_start, parse_end = get_source_range(lines, 'parsedebug')
parseopt_start, parseopt_end = get_source_range(lines, 'parseopt')
parseopt_notrack_start, parseopt_notrack_end = get_source_range(lines, 'parseopt-notrack')
# Get the original source
orig_lines = lines[parse_start:parse_end]
# Filter the DEBUG sections out
parseopt_lines = filter_section(orig_lines, 'DEBUG')
# Filter the TRACKING sections out
parseopt_notrack_lines = filter_section(parseopt_lines, 'TRACKING')
# Replace the parser source sections with updated versions
lines[parseopt_notrack_start:parseopt_notrack_end] = parseopt_notrack_lines
lines[parseopt_start:parseopt_end] = parseopt_lines
lines = [line.rstrip()+'\n' for line in lines]
with open(os.path.join(dirname, 'yacc.py'), 'w') as f:
f.writelines(lines)
print('Updated yacc.py')
if __name__ == '__main__':
main()
View file
@@ -0,0 +1,133 @@
#-----------------------------------------------------------------
# plyparser.py
#
# PLYParser class and other utilities for simplifying programming
# parsers with PLY
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#-----------------------------------------------------------------
import warnings
class Coord(object):
""" Coordinates of a syntactic element. Consists of:
- File name
- Line number
- (optional) column number, for the Lexer
"""
__slots__ = ('file', 'line', 'column', '__weakref__')
def __init__(self, file, line, column=None):
self.file = file
self.line = line
self.column = column
def __str__(self):
str = "%s:%s" % (self.file, self.line)
if self.column: str += ":%s" % self.column
return str
class ParseError(Exception): pass
class PLYParser(object):
def _create_opt_rule(self, rulename):
""" Given a rule name, creates an optional ply.yacc rule
for it. The name of the optional rule is
<rulename>_opt
"""
optname = rulename + '_opt'
def optrule(self, p):
p[0] = p[1]
optrule.__doc__ = '%s : empty\n| %s' % (optname, rulename)
optrule.__name__ = 'p_%s' % optname
setattr(self.__class__, optrule.__name__, optrule)
def _coord(self, lineno, column=None):
return Coord(
file=self.clex.filename,
line=lineno,
column=column)
def _token_coord(self, p, token_idx):
""" Returns the coordinates for the YaccProduction objet 'p' indexed
with 'token_idx'. The coordinate includes the 'lineno' and
'column'. Both follow the lex semantic, starting from 1.
"""
last_cr = p.lexer.lexer.lexdata.rfind('\n', 0, p.lexpos(token_idx))
if last_cr < 0:
last_cr = -1
column = (p.lexpos(token_idx) - (last_cr))
return self._coord(p.lineno(token_idx), column)
def _parse_error(self, msg, coord):
raise ParseError("%s: %s" % (coord, msg))
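# Minimal sketch ('_DemoParser' and the rule name are hypothetical) of what
# _create_opt_rule() attaches to a parser class: a p_<rulename>_opt method
# whose docstring is the grammar production that ply.yacc will pick up.
def _demo_create_opt_rule():
    class _DemoParser(PLYParser):
        pass
    parser = _DemoParser()
    parser._create_opt_rule('declaration_list')
    # _DemoParser.p_declaration_list_opt.__doc__ is now:
    #     'declaration_list_opt : empty\n| declaration_list'
    return _DemoParser.p_declaration_list_opt.__doc__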
def parameterized(*params):
""" Decorator to create parameterized rules.
Parameterized rule methods must be named starting with 'p_' and contain
'xxx', and their docstrings may contain 'xxx' and 'yyy'. These will be
replaced by the given parameter tuples. For example, ``p_xxx_rule()`` with
docstring 'xxx_rule : yyy' when decorated with
``@parameterized(('id', 'ID'))`` produces ``p_id_rule()`` with the docstring
'id_rule : ID'. Using multiple tuples produces multiple rules.
"""
def decorate(rule_func):
rule_func._params = params
return rule_func
return decorate
def template(cls):
""" Class decorator to generate rules from parameterized rule templates.
See `parameterized` for more information on parameterized rules.
"""
issued_nodoc_warning = False
for attr_name in dir(cls):
if attr_name.startswith('p_'):
method = getattr(cls, attr_name)
if hasattr(method, '_params'):
# Remove the template method
delattr(cls, attr_name)
# Create parameterized rules from this method; only run this if
# the method has a docstring. This is to address an issue when
# pycparser is used in -OO mode, which strips
# docstrings away.
# See: https://github.com/eliben/pycparser/pull/198/ and
# https://github.com/eliben/pycparser/issues/197
# for discussion.
if method.__doc__ is not None:
_create_param_rules(cls, method)
elif not issued_nodoc_warning:
warnings.warn(
'parsing methods must have __doc__ for pycparser to work properly',
RuntimeWarning,
stacklevel=2)
issued_nodoc_warning = True
return cls
def _create_param_rules(cls, func):
""" Create ply.yacc rules based on a parameterized rule function
Generates new methods (one per pair of parameters) based on the
template rule function `func`, and attaches them to `cls`. The rule
function's parameters must be accessible via its `_params` attribute.
"""
for xxx, yyy in func._params:
# Use the template method's body for each new method
def param_rule(self, p):
func(self, p)
# Substitute in the params for the grammar rule and function name
param_rule.__doc__ = func.__doc__.replace('xxx', xxx).replace('yyy', yyy)
param_rule.__name__ = func.__name__.replace('xxx', xxx)
# Attach the new method to the class
setattr(cls, param_rule.__name__, param_rule)
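# Minimal sketch (the grammar rule here is hypothetical) of how @parameterized
# and @template combine: one template method expands into one concrete p_* rule
# per parameter tuple, and the template itself is removed from the class.
def _demo_parameterized_template():
    @template
    class _DemoParser(PLYParser):
        @parameterized(('id', 'ID'), ('num', 'INTEGER'))
        def p_xxx_atom(self, p):
            """ xxx_atom : yyy """
            p[0] = p[1]

    # _DemoParser now has p_id_atom (docstring ' id_atom : ID ') and
    # p_num_atom (docstring ' num_atom : INTEGER '); p_xxx_atom is gone.
    return [name for name in dir(_DemoParser) if name.startswith('p_')]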
File diff suppressed because one or more lines are too long