From b2f1b3bf7c63d980f69025d467064ba504f6a279 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 11 Jan 2020 16:05:29 +0200 Subject: [PATCH] Small fixes --- examples/standalone/json_parser.py | 608 ++++++++++++++++++++--------- lark/lark.py | 2 - lark/lexer.py | 2 +- lark/load_grammar.py | 35 +- lark/parsers/lalr_analysis.py | 2 +- lark/tools/standalone.py | 3 + lark/utils.py | 26 ++ 7 files changed, 451 insertions(+), 227 deletions(-) diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py index 73acf9c..f270ade 100644 --- a/examples/standalone/json_parser.py +++ b/examples/standalone/json_parser.py @@ -1,4 +1,4 @@ -# The file was automatically generated by Lark v0.7.0 +# The file was automatically generated by Lark v0.8.0rc1 # # # Lark Stand-alone Generator Tool @@ -35,6 +35,9 @@ # # +import os +from io import open + class LarkError(Exception): pass @@ -47,6 +50,14 @@ class ParseError(LarkError): class LexError(LarkError): pass +class UnexpectedEOF(ParseError): + def __init__(self, expected): + self.expected = expected + + message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected)) + super(UnexpectedEOF, self).__init__(message) + + class UnexpectedInput(LarkError): pos_in_stream = None @@ -86,7 +97,7 @@ class UnexpectedInput(LarkError): class UnexpectedCharacters(LexError, UnexpectedInput): - def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None): + def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) self.line = line @@ -99,6 +110,8 @@ class UnexpectedCharacters(LexError, UnexpectedInput): message += '\n\n' + self.get_context(seq) if allowed: message += '\nExpecting: %s\n' % allowed + if token_history: + message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history) super(UnexpectedCharacters, self).__init__(message) @@ -121,13 +134,25 @@ class UnexpectedToken(ParseError, UnexpectedInput): super(UnexpectedToken, self).__init__(message) class VisitError(LarkError): - def __init__(self, tree, orig_exc): - self.tree = tree + def __init__(self, rule, obj, orig_exc): + self.obj = obj self.orig_exc = orig_exc - message = 'Error trying to process rule "%s":\n\n%s' % (tree.data, orig_exc) + message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) +def classify(seq, key=None, value=None): + d = {} + for item in seq: + k = key(item) if (key is not None) else item + v = value(item) if (value is not None) else item + if k in d: + d[k].append(v) + else: + d[k] = [v] + return d + + def _deserialize(data, namespace, memo): if isinstance(data, dict): if '__type__' in data: # Object @@ -170,7 +195,10 @@ class Serialize(object): inst = cls.__new__(cls) for f in fields: - setattr(inst, f, _deserialize(data[f], namespace, memo)) + try: + setattr(inst, f, _deserialize(data[f], namespace, memo)) + except KeyError as e: + raise KeyError("Cannot find key for class", cls, e) postprocess = getattr(inst, '_deserialize', None) if postprocess: postprocess() @@ -224,7 +252,7 @@ def smart_decorator(f, create_decorator): elif isinstance(f, partial): # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445 - return create_decorator(f.__func__, True) + return wraps(f.func)(create_decorator(lambda *args, **kw: f(*args[1:], **kw), True)) else: return 
create_decorator(f.__func__.__call__, True) @@ -232,6 +260,15 @@ def smart_decorator(f, create_decorator): import sys, re Py36 = (sys.version_info[:2] >= (3, 6)) +import sre_parse +import sre_constants +def get_regexp_width(regexp): + try: + return [int(x) for x in sre_parse.parse(regexp).getwidth()] + except sre_constants.error: + raise ValueError(regexp) + + class Meta: def __init__(self): self.empty = True @@ -282,6 +319,36 @@ class Tree(object): def __hash__(self): return hash((self.data, tuple(self.children))) + def iter_subtrees(self): + # TODO: Re-write as a more efficient version + + visited = set() + q = [self] + + l = [] + while q: + subtree = q.pop() + l.append( subtree ) + if id(subtree) in visited: + continue # already been here from another branch + visited.add(id(subtree)) + q += [c for c in subtree.children if isinstance(c, Tree)] + + seen = set() + for x in reversed(l): + if id(x) not in seen: + yield x + seen.add(id(x)) + + def find_pred(self, pred): + "Find all nodes where pred(tree) == True" + return filter(pred, self.iter_subtrees()) + + def find_data(self, data): + "Find all nodes where tree.data == data" + return self.find_pred(lambda t: t.data == data) + + from inspect import getmembers, getmro class Discard(Exception): @@ -298,6 +365,10 @@ class Transformer: Can be used to implement map or reduce. """ + __visit_tokens__ = True # For backwards compatibility + def __init__(self, visit_tokens=True): + self.__visit_tokens__ = visit_tokens + def _call_userfunc(self, tree, new_children=None): # Assumes tree is already transformed children = new_children if new_children is not None else tree.children @@ -307,25 +378,39 @@ class Transformer: return self.__default__(tree.data, children, tree.meta) else: try: - if getattr(f, 'meta', False): - return f(children, tree.meta) - elif getattr(f, 'inline', False): - return f(*children) - elif getattr(f, 'whole_tree', False): - if new_children is not None: - raise NotImplementedError("Doesn't work with the base Transformer class") - return f(tree) + wrapper = getattr(f, 'visit_wrapper', None) + if wrapper is not None: + return f.visit_wrapper(f, tree.data, children, tree.meta) else: return f(children) except (GrammarError, Discard): raise except Exception as e: - raise VisitError(tree, e) + raise VisitError(tree.data, tree, e) + + def _call_userfunc_token(self, token): + try: + f = getattr(self, token.type) + except AttributeError: + return self.__default_token__(token) + else: + try: + return f(token) + except (GrammarError, Discard): + raise + except Exception as e: + raise VisitError(token.type, token, e) + def _transform_children(self, children): for c in children: try: - yield self._transform_tree(c) if isinstance(c, Tree) else c + if isinstance(c, Tree): + yield self._transform_tree(c) + elif self.__visit_tokens__ and isinstance(c, Token): + yield self._call_userfunc_token(c) + else: + yield c except Discard: pass @@ -343,13 +428,20 @@ class Transformer: "Default operation on tree (for override)" return Tree(data, children, meta) + def __default_token__(self, token): + "Default operation on token (for override)" + return token + + @classmethod def _apply_decorator(cls, decorator, **kwargs): mro = getmro(cls) assert mro[0] is cls libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)} for name, value in getmembers(cls): - if name.startswith('_') or name in libmembers: + + # Make sure the function isn't inherited (unless it's overwritten) + if name.startswith('_') or (name in libmembers and name not in 
cls.__dict__): continue if not callable(cls.__dict__[name]): continue @@ -432,6 +524,11 @@ class Visitor(VisitorBase): self._call_userfunc(subtree) return tree + def visit_topdown(self,tree): + for subtree in tree.iter_subtrees_topdown(): + self._call_userfunc(subtree) + return tree + class Visitor_Recursive(VisitorBase): """Bottom-up visitor, recursive @@ -444,8 +541,16 @@ class Visitor_Recursive(VisitorBase): if isinstance(child, Tree): self.visit(child) - f = getattr(self, tree.data, self.__default__) - f(tree) + self._call_userfunc(tree) + return tree + + def visit_topdown(self,tree): + self._call_userfunc(tree) + + for child in tree.children: + if isinstance(child, Tree): + self.visit_topdown(child) + return tree @@ -515,8 +620,7 @@ def inline_args(obj): # XXX Deprecated -def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False, static=False): - assert [whole_tree, meta, inline].count(True) <= 1 +def _visitor_args_func_dec(func, visit_wrapper=None, static=False): def create_decorator(_f, with_self): if with_self: def f(self, *args, **kwargs): @@ -531,17 +635,42 @@ def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False, sta else: f = smart_decorator(func, create_decorator) f.vargs_applied = True - f.inline = inline - f.meta = meta - f.whole_tree = whole_tree + f.visit_wrapper = visit_wrapper return f -def v_args(inline=False, meta=False, tree=False): + +def _vargs_inline(f, data, children, meta): + return f(*children) +def _vargs_meta_inline(f, data, children, meta): + return f(meta, *children) +def _vargs_meta(f, data, children, meta): + return f(children, meta) # TODO swap these for consistency? Backwards incompatible! +def _vargs_tree(f, data, children, meta): + return f(Tree(data, children, meta)) + +def v_args(inline=False, meta=False, tree=False, wrapper=None): "A convenience decorator factory, for modifying the behavior of user-supplied visitor methods" - if [tree, meta, inline].count(True) > 1: - raise ValueError("Visitor functions can either accept tree, or meta, or be inlined. 
These cannot be combined.") + if tree and (meta or inline): + raise ValueError("Visitor functions cannot combine 'tree' with 'meta' or 'inline'.") + + func = None + if meta: + if inline: + func = _vargs_meta_inline + else: + func = _vargs_meta + elif inline: + func = _vargs_inline + elif tree: + func = _vargs_tree + + if wrapper is not None: + if func is not None: + raise ValueError("Cannot use 'wrapper' along with 'tree', 'meta' or 'inline'.") + func = wrapper + def _visitor_args_dec(obj): - return _apply_decorator(obj, _visitor_args_func_dec, inline=inline, meta=meta, whole_tree=tree) + return _apply_decorator(obj, _visitor_args_func_dec, visit_wrapper=func) return _visitor_args_dec @@ -604,6 +733,8 @@ class Indenter: class Symbol(Serialize): + __slots__ = ('name',) + is_term = NotImplemented def __init__(self, name): @@ -680,7 +811,7 @@ class Rule(Serialize): self.expansion = expansion self.alias = alias self.order = order - self.options = options + self.options = options or RuleOptions() self._hash = hash((self.origin, tuple(self.expansion))) def _deserialize(self): @@ -705,7 +836,6 @@ class Rule(Serialize): class Pattern(Serialize): - __serialize_fields__ = 'value', 'flags' def __init__(self, value, flags=()): self.value = value @@ -738,6 +868,10 @@ class Pattern(Serialize): class PatternStr(Pattern): + __serialize_fields__ = 'value', 'flags' + + type = "str" + def to_regexp(self): return self._get_flags(re.escape(self.value)) @@ -747,15 +881,25 @@ class PatternStr(Pattern): max_width = min_width class PatternRE(Pattern): + __serialize_fields__ = 'value', 'flags', '_width' + + type = "re" + def to_regexp(self): return self._get_flags(self.value) + _width = None + def _get_width(self): + if self._width is None: + self._width = get_regexp_width(self.to_regexp()) + return self._width + @property def min_width(self): - return get_regexp_width(self.to_regexp())[0] + return self._get_width()[0] @property def max_width(self): - return get_regexp_width(self.to_regexp())[1] + return self._get_width()[1] class TerminalDef(Serialize): @@ -774,9 +918,9 @@ class TerminalDef(Serialize): class Token(Str): - __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column') + __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') - def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None): + def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None, end_pos=None): try: self = super(Token, cls).__new__(cls, value) except UnicodeDecodeError: @@ -790,11 +934,19 @@ class Token(Str): self.column = column self.end_line = end_line self.end_column = end_column + self.end_pos = end_pos return self + def update(self, type_=None, value=None): + return Token.new_borrow_pos( + type_ if type_ is not None else self.type, + value if value is not None else self.value, + self + ) + @classmethod def new_borrow_pos(cls, type_, value, borrow_t): - return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column) + return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos) def __reduce__(self): return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, )) @@ -846,38 +998,38 @@ class _Lex: newline_types = frozenset(newline_types) ignore_types = frozenset(ignore_types) line_ctr = LineCounter() 
+ last_token = None while line_ctr.char_pos < len(stream): lexer = self.lexer - for mre, type_from_index in lexer.mres: - m = mre.match(stream, line_ctr.char_pos) - if not m: - continue - - t = None - value = m.group(0) - type_ = type_from_index[m.lastindex] - if type_ not in ignore_types: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - if t.type in lexer.callback: - t = lexer.callback[t.type](t) - if not isinstance(t, Token): - raise ValueError("Callbacks must return a token (returned %r)" % t) - yield t - else: - if type_ in lexer.callback: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - lexer.callback[type_](t) + res = lexer.match(stream, line_ctr.char_pos) + if not res: + allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types + if not allowed: + allowed = {""} + raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token]) - line_ctr.feed(value, type_ in newline_types) - if t: - t.end_line = line_ctr.line - t.end_column = line_ctr.column + value, type_ = res - break + if type_ not in ignore_types: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + line_ctr.feed(value, type_ in newline_types) + t.end_line = line_ctr.line + t.end_column = line_ctr.column + t.end_pos = line_ctr.char_pos + if t.type in lexer.callback: + t = lexer.callback[t.type](t) + if not isinstance(t, Token): + raise ValueError("Callbacks must return a token (returned %r)" % t) + yield t + last_token = t else: - allowed = [v for m, tfi in lexer.mres for v in tfi.values()] - raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state) + if type_ in lexer.callback: + t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + lexer.callback[type_](t2) + line_ctr.feed(value, type_ in newline_types) + + class UnlessCallback: @@ -950,34 +1102,25 @@ def build_mres(terminals, match_whole=False): return _build_mres(terminals, len(terminals), match_whole) def _regexp_has_newline(r): - """Expressions that may indicate newlines in a regexp: + r"""Expressions that may indicate newlines in a regexp: - newlines (\n) - escaped newline (\\n) - anything but ([^...]) - any-char (.) when the flag (?s) exists + - spaces (\s) """ - return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r) + return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' 
in r) -class Lexer(Serialize): +class Lexer(object): """Lexer interface Method Signatures: lex(self, stream) -> Iterator[Token] - - set_parser_state(self, state) # Optional """ - set_parser_state = NotImplemented lex = NotImplemented class TraditionalLexer(Lexer): - __serialize_fields__ = 'terminals', 'ignore_types', 'newline_types' - __serialize_namespace__ = TerminalDef, - - def _deserialize(self): - self.mres = build_mres(self.terminals) - self.callback = {} # TODO implement - def __init__(self, terminals, ignore=(), user_callbacks={}): assert all(isinstance(t, TerminalDef) for t in terminals), terminals @@ -988,7 +1131,7 @@ class TraditionalLexer(Lexer): for t in terminals: try: re.compile(t.pattern.to_regexp()) - except: + except re.error: raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) if t.pattern.min_width == 0: @@ -1001,21 +1144,28 @@ class TraditionalLexer(Lexer): self.ignore_types = list(ignore) terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) + self.terminals = terminals + self.user_callbacks = user_callbacks + self.build() - terminals, self.callback = _create_unless(terminals) + def build(self): + terminals, self.callback = _create_unless(self.terminals) assert all(self.callback.values()) - for type_, f in user_callbacks.items(): + for type_, f in self.user_callbacks.items(): if type_ in self.callback: # Already a callback there, probably UnlessCallback self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_) else: self.callback[type_] = f - self.terminals = terminals - self.mres = build_mres(terminals) + def match(self, stream, pos): + for mre, type_from_index in self.mres: + m = mre.match(stream, pos) + if m: + return m.group(0), type_from_index[m.lastindex] def lex(self, stream): return _Lex(self).lex(stream, self.newline_types, self.ignore_types) @@ -1024,8 +1174,6 @@ class TraditionalLexer(Lexer): class ContextualLexer(Lexer): - __serialize_fields__ = 'root_lexer', 'lexers' - __serialize_namespace__ = TraditionalLexer, def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}): tokens_by_name = {} @@ -1049,17 +1197,41 @@ class ContextualLexer(Lexer): self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks) - self.set_parser_state(None) # Needs to be set on the outside + def lex(self, stream, get_parser_state): + parser_state = get_parser_state() + l = _Lex(self.lexers[parser_state], parser_state) + try: + for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): + yield x + parser_state = get_parser_state() + l.lexer = self.lexers[parser_state] + l.state = parser_state # For debug only, no need to worry about multithreading + except UnexpectedCharacters as e: + # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, + # but not in the current context. + # This tests the input against the global context, to provide a nicer error. 
+ root_match = self.root_lexer.match(stream, e.pos_in_stream) + if not root_match: + raise - def set_parser_state(self, state): - self.parser_state = state + value, type_ = root_match + t = Token(type_, value, e.pos_in_stream, e.line, e.column) + raise UnexpectedToken(t, e.allowed, state=e.state) - def lex(self, stream): - l = _Lex(self.lexers[self.parser_state], self.parser_state) - for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): - yield x - l.lexer = self.lexers[self.parser_state] - l.state = self.parser_state + + +class LexerConf(Serialize): + __serialize_fields__ = 'tokens', 'ignore' + __serialize_namespace__ = TerminalDef, + + def __init__(self, tokens, ignore=(), postlex=None, callbacks=None): + self.tokens = tokens + self.ignore = ignore + self.postlex = postlex + self.callbacks = callbacks or {} + + def _deserialize(self): + self.callbacks = {} # TODO from functools import partial, wraps @@ -1085,7 +1257,7 @@ class PropagatePositions: if isinstance(res, Tree): for c in children: - if isinstance(c, Tree) and c.children and not c.meta.empty: + if isinstance(c, Tree) and not c.meta.empty: res.meta.line = c.meta.line res.meta.column = c.meta.column res.meta.start_pos = c.meta.start_pos @@ -1099,7 +1271,7 @@ class PropagatePositions: break for c in reversed(children): - if isinstance(c, Tree) and c.children and not c.meta.empty: + if isinstance(c, Tree) and not c.meta.empty: res.meta.end_line = c.meta.end_line res.meta.end_column = c.meta.end_column res.meta.end_pos = c.meta.end_pos @@ -1108,7 +1280,7 @@ class PropagatePositions: elif isinstance(c, Token): res.meta.end_line = c.end_line res.meta.end_column = c.end_column - res.meta.end_pos = c.pos_in_stream + len(c.value) + res.meta.end_pos = c.end_pos res.meta.empty = False break @@ -1251,6 +1423,23 @@ def ptb_inline_args(func): return func(*children) return f +def inplace_transformer(func): + @wraps(func) + def f(children): + # function name in a Transformer is a rule name. 
+ tree = Tree(func.__name__, children) + return func(tree) + return f + +def apply_visit_wrapper(func, name, wrapper): + if wrapper is visitors._vargs_meta or wrapper is visitors._vargs_meta_inline: + raise NotImplementedError("Meta args not supported for internal transformer") + @wraps(func) + def f(children): + return wrapper(func, name, children, None) + return f + + class ParseTreeBuilder: def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False): self.tree_class = tree_class @@ -1264,12 +1453,12 @@ class ParseTreeBuilder: def _init_builders(self, rules): for rule in rules: options = rule.options - keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) - expand_single_child = options.expand1 if options else False + keep_all_tokens = self.always_keep_all_tokens or options.keep_all_tokens + expand_single_child = options.expand1 wrapper_chain = list(filter(None, [ (expand_single_child and not rule.alias) and ExpandSingleChild, - maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders and options else None), + maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders else None), self.propagate_positions and PropagatePositions, self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens), ])) @@ -1285,10 +1474,15 @@ class ParseTreeBuilder: user_callback_name = rule.alias or rule.origin.name try: f = getattr(transformer, user_callback_name) - assert not getattr(f, 'meta', False), "Meta args not supported for internal transformer" # XXX InlineTransformer is deprecated! 
- if getattr(f, 'inline', False) or isinstance(transformer, InlineTransformer): - f = ptb_inline_args(f) + wrapper = getattr(f, 'visit_wrapper', None) + if wrapper is not None: + f = apply_visit_wrapper(f, user_callback_name, wrapper) + else: + if isinstance(transformer, InlineTransformer): + f = ptb_inline_args(f) + elif isinstance(transformer, Transformer_InPlace): + f = inplace_transformer(f) except AttributeError: f = partial(self.tree_class, user_callback_name) @@ -1307,7 +1501,7 @@ class LALR_Parser(object): def __init__(self, parser_conf, debug=False): assert all(r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" analysis = LALR_Analyzer(parser_conf, debug=debug) - analysis.compute_lookahead() + analysis.compute_lalr() callbacks = parser_conf.callbacks self._parse_table = analysis.parse_table @@ -1317,7 +1511,8 @@ class LALR_Parser(object): @classmethod def deserialize(cls, data, memo, callbacks): inst = cls.__new__(cls) - inst.parser = _Parser(IntParseTable.deserialize(data, memo), callbacks) + inst._parse_table = IntParseTable.deserialize(data, memo) + inst.parser = _Parser(inst._parse_table, callbacks) return inst def serialize(self, memo): @@ -1330,19 +1525,22 @@ class LALR_Parser(object): class _Parser: def __init__(self, parse_table, callbacks): self.states = parse_table.states - self.start_state = parse_table.start_state - self.end_state = parse_table.end_state + self.start_states = parse_table.start_states + self.end_states = parse_table.end_states self.callbacks = callbacks - def parse(self, seq, set_state=None): + def parse(self, seq, start, set_state=None): token = None stream = iter(seq) states = self.states - state_stack = [self.start_state] + start_state = self.start_states[start] + end_state = self.end_states[start] + + state_stack = [start_state] value_stack = [] - if set_state: set_state(self.start_state) + if set_state: set_state(start_state) def get_action(token): state = state_stack[-1] @@ -1372,7 +1570,7 @@ class _Parser: for token in stream: while True: action, arg = get_action(token) - assert arg != self.end_state + assert arg != end_state if action is Shift: state_stack.append(arg) @@ -1385,12 +1583,10 @@ class _Parser: token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) while True: _action, arg = get_action(token) - if _action is Shift: - assert arg == self.end_state - val ,= value_stack - return val - else: - reduce(arg) + assert(_action is Reduce) + reduce(arg) + if state_stack[-1] == end_state: + return value_stack[-1] @@ -1405,11 +1601,12 @@ class Action: Shift = Action('Shift') Reduce = Action('Reduce') + class ParseTable: - def __init__(self, states, start_state, end_state): + def __init__(self, states, start_states, end_states): self.states = states - self.start_state = start_state - self.end_state = end_state + self.start_states = start_states + self.end_states = end_states def serialize(self, memo): tokens = Enumerator() @@ -1424,8 +1621,8 @@ class ParseTable: return { 'tokens': tokens.reversed(), 'states': states, - 'start_state': self.start_state, - 'end_state': self.end_state, + 'start_states': self.start_states, + 'end_states': self.end_states, } @classmethod @@ -1436,7 +1633,7 @@ class ParseTable: for token, (action, arg) in actions.items()} for state, actions in data['states'].items() } - return cls(states, data['start_state'], data['end_state']) + return cls(states, data['start_states'], data['end_states']) class IntParseTable(ParseTable): @@ -1453,9 +1650,9 
@@ class IntParseTable(ParseTable): int_states[ state_to_idx[s] ] = la - start_state = state_to_idx[parse_table.start_state] - end_state = state_to_idx[parse_table.end_state] - return cls(int_states, start_state, end_state) + start_states = {start:state_to_idx[s] for start, s in parse_table.start_states.items()} + end_states = {start:state_to_idx[s] for start, s in parse_table.end_states.items()} + return cls(int_states, start_states, end_states) @@ -1491,63 +1688,84 @@ def get_frontend(parser, lexer): raise ValueError('Unknown parser: %s' % parser) +class _ParserFrontend(Serialize): + def _parse(self, input, start, *args): + if start is None: + start = self.start + if len(start) > 1: + raise ValueError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start) + start ,= start + return self.parser.parse(input, start, *args) -class WithLexer(Serialize): +class WithLexer(_ParserFrontend): lexer = None parser = None lexer_conf = None + start = None - __serialize_fields__ = 'parser', 'lexer' - __serialize_namespace__ = Rule, ContextualLexer, TraditionalLexer + __serialize_fields__ = 'parser', 'lexer_conf', 'start' + __serialize_namespace__ = LexerConf, + + def __init__(self, lexer_conf, parser_conf, options=None): + self.lexer_conf = lexer_conf + self.start = parser_conf.start + self.postlex = lexer_conf.postlex @classmethod def deserialize(cls, data, memo, callbacks, postlex): inst = super(WithLexer, cls).deserialize(data, memo) inst.postlex = postlex inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) + inst.init_lexer() return inst def _serialize(self, data, memo): data['parser'] = data['parser'].serialize(memo) - def init_traditional_lexer(self, lexer_conf): - self.lexer_conf = lexer_conf - self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks) - self.postlex = lexer_conf.postlex - - def init_contextual_lexer(self, lexer_conf): - self.lexer_conf = lexer_conf - self.postlex = lexer_conf.postlex - states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} - always_accept = self.postlex.always_accept if self.postlex else () - self.lexer = ContextualLexer(lexer_conf.tokens, states, - ignore=lexer_conf.ignore, - always_accept=always_accept, - user_callbacks=lexer_conf.callbacks) - - def lex(self, text): - stream = self.lexer.lex(text) + def lex(self, *args): + stream = self.lexer.lex(*args) return self.postlex.process(stream) if self.postlex else stream - def parse(self, text): + def parse(self, text, start=None): token_stream = self.lex(text) - sps = self.lexer.set_parser_state - return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else []) + return self._parse(token_stream, start) + def init_traditional_lexer(self): + self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks) -class LALR_TraditionalLexer(WithLexer): +class LALR_WithLexer(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): debug = options.debug if options else False self.parser = LALR_Parser(parser_conf, debug=debug) - self.init_traditional_lexer(lexer_conf) + WithLexer.__init__(self, lexer_conf, parser_conf, options) -class LALR_ContextualLexer(WithLexer): - def __init__(self, lexer_conf, parser_conf, options=None): - debug = options.debug if options else False - self.parser = LALR_Parser(parser_conf, debug=debug) - self.init_contextual_lexer(lexer_conf) + self.init_lexer() + + 
def init_lexer(self): + raise NotImplementedError() + +class LALR_TraditionalLexer(LALR_WithLexer): + def init_lexer(self): + self.init_traditional_lexer() + +class LALR_ContextualLexer(LALR_WithLexer): + def init_lexer(self): + states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} + always_accept = self.postlex.always_accept if self.postlex else () + self.lexer = ContextualLexer(self.lexer_conf.tokens, states, + ignore=self.lexer_conf.ignore, + always_accept=always_accept, + user_callbacks=self.lexer_conf.callbacks) + + + def parse(self, text, start=None): + parser_state = [None] + def set_parser_state(s): + parser_state[0] = s + token_stream = self.lex(text, lambda: parser_state[0]) + return self._parse(token_stream, start, set_parser_state) class LarkOptions(Serialize): @@ -1576,8 +1794,7 @@ class LarkOptions(Serialize): keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False) cache_grammar - Cache the Lark grammar (Default: False) postlex - Lexer post-processing (Default: None) Only works with the standard and contextual lexers. - start - The start symbol (Default: start) - profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False) + start - The start symbol, either a string, or a list of strings for multiple possible starts (Default: "start") priority - How priorities should be evaluated - auto, none, normal, invert (Default: auto) propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches. lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. @@ -1596,12 +1813,12 @@ class LarkOptions(Serialize): 'lexer': 'auto', 'transformer': None, 'start': 'start', - 'profile': False, 'priority': 'auto', 'ambiguity': 'auto', - 'propagate_positions': False, + 'propagate_positions': True, 'lexer_callbacks': {}, - 'maybe_placeholders': False, + 'maybe_placeholders': True, + 'edit_terminals': None, } def __init__(self, options_dict): @@ -1618,6 +1835,9 @@ class LarkOptions(Serialize): options[name] = value + if isinstance(options['start'], STRING_TYPE): + options['start'] = [options['start']] + self.__dict__['options'] = options assert self.parser in ('earley', 'lalr', 'cyk', None) @@ -1630,7 +1850,11 @@ class LarkOptions(Serialize): raise ValueError("Unknown options: %s" % o.keys()) def __getattr__(self, name): - return self.options[name] + try: + return self.options[name] + except KeyError as e: + raise AttributeError(e) + def __setattr__(self, name, value): assert name in self.options self.options[name] = value @@ -1643,30 +1867,6 @@ class LarkOptions(Serialize): return cls(data) -class Profiler: - def __init__(self): - self.total_time = defaultdict(float) - self.cur_section = '__init__' - self.last_enter_time = time.time() - - def enter_section(self, name): - cur_time = time.time() - self.total_time[self.cur_section] += cur_time - self.last_enter_time - self.last_enter_time = cur_time - self.cur_section = name - - def make_wrapper(self, name, f): - def wrapper(*args, **kwargs): - last_section = self.cur_section - self.enter_section(name) - try: - return f(*args, **kwargs) - finally: - self.enter_section(last_section) - - return wrapper - - class Lark(Serialize): def __init__(self, grammar, **options): """ @@ -1694,9 +1894,6 @@ class Lark(Serialize): if self.options.cache_grammar: raise NotImplementedError("Not available yet") - assert not self.options.profile, "Feature temporarily disabled" - # self.profiler 
= Profiler() if self.options.profile else None - if self.options.lexer == 'auto': if self.options.parser == 'lalr': self.options.lexer = 'contextual' @@ -1733,7 +1930,13 @@ class Lark(Serialize): self.grammar = load_grammar(grammar, self.source) # Compile the EBNF grammar into BNF - self.terminals, self.rules, self.ignore_tokens = self.grammar.compile() + self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start) + + if self.options.edit_terminals: + for t in self.terminals: + self.options.edit_terminals(t) + + self._terminals_dict = {t.name:t for t in self.terminals} # If the user asked to invert the priorities, negate them all here. # This replaces the old 'resolve__antiscore_sum' option. @@ -1748,7 +1951,16 @@ class Lark(Serialize): for rule in self.rules: if rule.options.priority is not None: rule.options.priority = None - self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks) + + # TODO Deprecate lexer_callbacks? + lexer_callbacks = dict(self.options.lexer_callbacks) + if self.options.transformer: + t = self.options.transformer + for term in self.terminals: + if hasattr(t, term.name): + lexer_callbacks[term.name] = getattr(t, term.name) + + self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks) if self.options.parser: self.parser = self._build_parser() @@ -1783,6 +1995,7 @@ class Lark(Serialize): options['postlex'] = postlex inst.options = LarkOptions.deserialize(options, memo) inst.rules = [Rule.deserialize(r, memo) for r in data['rules']] + inst.source = '' inst._prepare_callbacks() inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks, inst.options.postlex) return inst @@ -1819,16 +2032,25 @@ class Lark(Serialize): return self.options.postlex.process(stream) return stream - def parse(self, text): - "Parse the given text, according to the options provided. Returns a tree, unless specified otherwise." - return self.parser.parse(text) + def get_terminal(self, name): + "Get information about a terminal" + return self._terminals_dict[name] + + def parse(self, text, start=None): + """Parse the given text, according to the options provided. + + The 'start' parameter is required if Lark was given multiple possible start symbols (using the start option). + + Returns a tree, unless specified otherwise. 
+ """ + return self.parser.parse(text, start=start) DATA = ( -{'rules': [{'@': 27}, {'@': 31}, {'@': 26}, {'@': 13}, {'@': 24}, {'@': 18}, {'@': 16}, {'@': 23}, {'@': 21}, {'@': 17}, {'@': 28}, {'@': 30}, {'@': 25}, {'@': 29}, {'@': 20}, {'@': 22}, {'@': 15}, {'@': 19}, {'@': 12}, {'@': 14}], 'parser': {'parser': {'tokens': {0: 'COMMA', 1: 'RBRACE', 2: u'pair', 3: u'ESCAPED_STRING', 4: u'string', 5: 'COLON', 6: 'RSQB', 7: '$END', 8: 'LBRACE', 9: u'FALSE', 10: u'object', 11: u'SIGNED_NUMBER', 12: u'value', 13: 'LSQB', 14: u'NULL', 15: u'TRUE', 16: u'array', 17: '__anon_star_1', 18: '__anon_star_0', 19: 'start'}, 'states': {0: {0: (0, 1), 1: (0, 32)}, 1: {2: (0, 5), 3: (0, 21), 4: (0, 3)}, 2: {0: (1, {'@': 12}), 1: (1, {'@': 12})}, 3: {5: (0, 13)}, 4: {0: (1, {'@': 13}), 1: (1, {'@': 13}), 6: (1, {'@': 13}), 7: (1, {'@': 13})}, 5: {0: (1, {'@': 14}), 1: (1, {'@': 14})}, 6: {0: (1, {'@': 15}), 6: (1, {'@': 15})}, 7: {0: (1, {'@': 16}), 1: (1, {'@': 16}), 6: (1, {'@': 16}), 7: (1, {'@': 16})}, 8: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 12), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 9: {0: (1, {'@': 17}), 1: (1, {'@': 17}), 6: (1, {'@': 17}), 7: (1, {'@': 17})}, 10: {0: (0, 22), 17: (0, 0), 1: (0, 26)}, 11: {0: (1, {'@': 18}), 1: (1, {'@': 18}), 6: (1, {'@': 18}), 7: (1, {'@': 18})}, 12: {0: (1, {'@': 19}), 6: (1, {'@': 19})}, 13: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 15), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 14: {3: (0, 21), 4: (0, 4), 6: (0, 30), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 23), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 15: {0: (1, {'@': 20}), 1: (1, {'@': 20})}, 16: {0: (1, {'@': 21}), 1: (1, {'@': 21}), 6: (1, {'@': 21}), 7: (1, {'@': 21})}, 17: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 6), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 18: {}, 19: {7: (0, 18)}, 20: {0: (0, 8), 6: (0, 16)}, 21: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 5: (1, {'@': 22}), 6: (1, {'@': 22}), 7: (1, {'@': 22})}, 22: {2: (0, 2), 3: (0, 21), 4: (0, 3)}, 23: {0: (0, 17), 18: (0, 20), 6: (0, 9)}, 24: {0: (1, {'@': 23}), 1: (1, {'@': 23}), 6: (1, {'@': 23}), 7: (1, {'@': 23})}, 25: {0: (1, {'@': 24}), 1: (1, {'@': 24}), 6: (1, {'@': 24}), 7: (1, {'@': 24})}, 26: {0: (1, {'@': 25}), 1: (1, {'@': 25}), 6: (1, {'@': 25}), 7: (1, {'@': 25})}, 27: {0: (1, {'@': 26}), 1: (1, {'@': 26}), 6: (1, {'@': 26}), 7: (1, {'@': 26})}, 28: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 29), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27), 19: (0, 19)}, 29: {7: (1, {'@': 27})}, 30: {0: (1, {'@': 28}), 1: (1, {'@': 28}), 6: (1, {'@': 28}), 7: (1, {'@': 28})}, 31: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 6: (1, {'@': 29}), 7: (1, {'@': 29})}, 32: {0: (1, {'@': 30}), 1: (1, {'@': 30}), 6: (1, {'@': 30}), 7: (1, {'@': 30})}, 33: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 6: (1, {'@': 31}), 7: (1, {'@': 31})}, 34: {1: (0, 31), 2: (0, 10), 3: (0, 21), 4: (0, 3)}}, 'end_state': 18, 'start_state': 28}, '__type__': 'LALR_TraditionalLexer', 'lexer': {'ignore_types': [u'WS'], 'terminals': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], '__type__': 'TraditionalLexer', 'newline_types': [u'WS']}}, '__type__': 'Lark', 'options': {'profile': False, 'transformer': None, 'lexer': 'standard', 'lexer_callbacks': {}, 'postlex': None, 'parser': 'lalr', 
'cache_grammar': False, 'tree_class': None, 'priority': None, 'start': 'start', 'keep_all_tokens': False, 'ambiguity': 'auto', 'debug': False, 'propagate_positions': False, 'maybe_placeholders': False}} +{'rules': [{'@': 27}, {'@': 31}, {'@': 26}, {'@': 13}, {'@': 25}, {'@': 18}, {'@': 16}, {'@': 24}, {'@': 22}, {'@': 17}, {'@': 28}, {'@': 30}, {'@': 20}, {'@': 29}, {'@': 21}, {'@': 23}, {'@': 15}, {'@': 19}, {'@': 12}, {'@': 14}], 'parser': {'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], 'ignore': [u'WS'], '__type__': 'LexerConf'}, 'parser': {'tokens': {0: 'LBRACE', 1: u'FALSE', 2: u'string', 3: u'object', 4: u'NULL', 5: u'SIGNED_NUMBER', 6: u'value', 7: 'start', 8: 'LSQB', 9: u'ESCAPED_STRING', 10: u'TRUE', 11: u'array', 12: 'COMMA', 13: 'RBRACE', 14: u'pair', 15: 'COLON', 16: 'RSQB', 17: '$END', 18: '__anon_star_1', 19: '__anon_star_0'}, 'states': {0: {0: (0, 33), 1: (0, 8), 2: (0, 5), 3: (0, 32), 4: (0, 23), 5: (0, 24), 6: (0, 28), 7: (0, 11), 8: (0, 25), 9: (0, 20), 10: (0, 13), 11: (0, 26)}, 1: {12: (0, 2), 13: (0, 31)}, 2: {9: (0, 20), 2: (0, 4), 14: (0, 6)}, 3: {12: (1, {'@': 12}), 13: (1, {'@': 12})}, 4: {15: (0, 15)}, 5: {16: (1, {'@': 13}), 17: (1, {'@': 13}), 12: (1, {'@': 13}), 13: (1, {'@': 13})}, 6: {12: (1, {'@': 14}), 13: (1, {'@': 14})}, 7: {16: (1, {'@': 15}), 12: (1, {'@': 15})}, 8: {16: (1, {'@': 16}), 17: (1, {'@': 16}), 12: (1, {'@': 16}), 13: (1, {'@': 16})}, 9: {0: (0, 33), 1: (0, 8), 2: (0, 5), 3: (0, 32), 4: (0, 23), 5: (0, 24), 6: (0, 14), 8: (0, 25), 9: (0, 20), 10: (0, 13), 11: (0, 26)}, 10: {16: (1, {'@': 17}), 17: (1, {'@': 17}), 12: (1, {'@': 17}), 13: (1, {'@': 17})}, 11: {}, 12: {18: (0, 1), 12: (0, 21), 13: (0, 16)}, 13: {16: (1, {'@': 18}), 17: (1, {'@': 18}), 12: (1, {'@': 18}), 13: (1, {'@': 18})}, 14: {16: (1, {'@': 19}), 12: (1, {'@': 19})}, 15: {0: (0, 33), 1: (0, 8), 2: (0, 5), 3: (0, 32), 4: (0, 23), 5: (0, 24), 6: (0, 17), 8: (0, 25), 9: (0, 20), 10: (0, 13), 11: (0, 26)}, 16: {16: (1, {'@': 20}), 17: (1, {'@': 20}), 12: (1, {'@': 20}), 13: (1, {'@': 20})}, 17: {12: (1, {'@': 21}), 13: (1, {'@': 21})}, 18: {16: (1, {'@': 22}), 17: (1, {'@': 22}), 12: (1, {'@': 22}), 13: (1, {'@': 22})}, 19: {16: (0, 18), 12: (0, 9)}, 20: {16: (1, {'@': 23}), 17: (1, {'@': 23}), 12: (1, {'@': 23}), 13: (1, {'@': 23}), 15: (1, {'@': 23})}, 21: {9: (0, 20), 2: (0, 4), 14: (0, 3)}, 22: {16: (0, 10), 19: (0, 19), 12: (0, 27)}, 23: {16: (1, {'@': 24}), 17: (1, {'@': 24}), 12: (1, {'@': 24}), 13: (1, {'@': 24})}, 24: {16: (1, {'@': 25}), 17: (1, {'@': 25}), 12: (1, {'@': 25}), 13: (1, {'@': 25})}, 25: {0: (0, 33), 1: (0, 8), 2: (0, 5), 3: (0, 32), 4: (0, 23), 5: (0, 24), 6: (0, 22), 8: (0, 25), 9: (0, 20), 10: (0, 13), 11: (0, 26), 16: (0, 29)}, 26: {16: (1, {'@': 26}), 17: (1, {'@': 26}), 12: (1, {'@': 26}), 13: (1, {'@': 26})}, 27: {0: (0, 33), 1: (0, 8), 2: (0, 5), 3: (0, 32), 4: (0, 23), 5: (0, 24), 6: (0, 7), 8: (0, 25), 9: (0, 20), 10: (0, 13), 11: (0, 26)}, 28: {17: (1, {'@': 27})}, 29: {16: (1, {'@': 28}), 17: (1, {'@': 28}), 12: (1, {'@': 28}), 13: (1, {'@': 28})}, 30: {16: (1, {'@': 29}), 17: (1, {'@': 29}), 12: (1, {'@': 29}), 13: (1, {'@': 29})}, 31: {16: (1, {'@': 30}), 17: (1, {'@': 30}), 12: (1, {'@': 30}), 13: (1, {'@': 30})}, 32: {16: (1, {'@': 31}), 17: (1, {'@': 31}), 12: (1, {'@': 31}), 13: (1, {'@': 31})}, 33: {9: (0, 20), 2: (0, 4), 13: (0, 30), 14: (0, 12)}}, 'end_states': {'start': 11}, 'start_states': {'start': 0}}, '__type__': 
'LALR_ContextualLexer', 'start': ['start']}, '__type__': 'Lark', 'options': {'transformer': None, 'lexer': 'contextual', 'lexer_callbacks': {}, 'debug': False, 'postlex': None, 'parser': 'lalr', 'cache_grammar': False, 'tree_class': None, 'priority': None, 'start': ['start'], 'keep_all_tokens': False, 'ambiguity': 'auto', 'edit_terminals': None, 'propagate_positions': True, 'maybe_placeholders': True}} ) MEMO = ( -{0: {'priority': 1, 'pattern': {'__type__': 'PatternRE', 'flags': [], 'value': u'(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+)'}, '__type__': 'TerminalDef', 'name': u'SIGNED_NUMBER'}, 1: {'priority': 1, 'pattern': {'__type__': 'PatternRE', 'flags': [], 'value': u'\\".*?(? 1: if dups[0].expansion: - raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)" % ''.join('\n * %s' % i for i in dups)) + raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)" + % ''.join('\n * %s' % i for i in dups)) # Empty rule; assert all other attributes are equal assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups) diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 4af2c24..7822485 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -202,7 +202,7 @@ class LALR_Analyzer(GrammarAnalyzer): continue s2 = rp2.next # if s2 is a terminal - if not s2 in self.lr0_rules_by_origin: + if s2 not in self.lr0_rules_by_origin: dr.add(s2) if s2 in self.NULLABLE: r.add((next_state, s2)) diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 07016ff..9934567 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -34,6 +34,9 @@ # See . # # + +import os +from io import open ###} import pprint diff --git a/lark/utils.py b/lark/utils.py index 9513b8b..b1354cf 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -1,4 +1,5 @@ import sys +from ast import literal_eval from collections import deque class fzset(frozenset): @@ -239,3 +240,28 @@ class Enumerator(Serialize): assert len(r) == len(self.enums) return r + +def eval_escaping(s): + w = '' + i = iter(s) + for n in i: + w += n + if n == '\\': + try: + n2 = next(i) + except StopIteration: + raise ValueError("Literal ended unexpectedly (bad escaping): `%r`" % s) + if n2 == '\\': + w += '\\\\' + elif n2 not in 'uxnftr': + w += '\\' + w += n2 + w = w.replace('\\"', '"').replace("'", "\\'") + + to_eval = "u'''%s'''" % w + try: + s = literal_eval(to_eval) + except SyntaxError as e: + raise ValueError(s, e) + + return s
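
The sketch below (not part of the diff above) exercises a few of the user-facing features this patch touches: multiple start symbols with parse(..., start=...), token visiting in Transformer, Token.update(), and the v_args(inline=True) wrapper machinery. It assumes a Lark release that already contains these changes (0.8.0rc1 or later) is installed; the grammar, class and variable names are illustrative only and are not the ones shipped with the standalone JSON example.

    from lark import Lark, Transformer, v_args

    # Hypothetical cut-down JSON subset (no empty containers, no optional
    # groups, so the example stays clear of the new maybe_placeholders default).
    GRAMMAR = r"""
        ?value: dict | list | STRING | NUMBER

        dict:  "{" pair ("," pair)* "}"
        pair:  STRING ":" value
        list:  "[" value ("," value)* "]"

        STRING: /"[^"]*"/
        NUMBER: /-?\d+(\.\d+)?/
        WS: /\s+/
        %ignore WS
    """

    class ToPython(Transformer):
        """Builds plain Python objects out of the parse tree."""

        # Methods named after terminals run on matching tokens when token
        # visiting is enabled (the visit_tokens flag added by this patch).
        def STRING(self, tok):
            return tok.update(value=tok.value[1:-1])  # strip quotes, keep position info

        def NUMBER(self, tok):
            return float(tok)

        @v_args(inline=True)  # children are passed as positional arguments
        def pair(self, key, value):
            return (str(key), value)

        # Rule names can simply be bound to builders; they receive the child list.
        dict = dict
        list = list

    # One parser instance, two possible entry points into the grammar.
    parser = Lark(GRAMMAR, parser='lalr', start=['value', 'pair'])
    transformer = ToPython(visit_tokens=True)

    tree = parser.parse('{"answer": 42}', start='value')
    assert transformer.transform(tree) == {'answer': 42.0}

    key, val = transformer.transform(parser.parse('"nums": [1, 2.5]', start='pair'))
    assert (key, val) == ('nums', [1.0, 2.5])

When more than one start symbol is configured, parse() requires an explicit start argument (the new _ParserFrontend raises a ValueError otherwise); with a single start symbol the old one-argument call still works.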