@@ -0,0 +1,37 @@
#
# This example demonstrates the power of the contextual lexer, by parsing a config file.
#
# The tokens NAME and VALUE match the same input. A regular lexer would arbitrarily
# choose one over the other, which would lead to a (confusing) parse error.
# However, due to the unambiguous structure of the grammar, the LALR(1) algorithm knows
# which one of them to expect at each point during the parse.
# The lexer then only matches the tokens that the parser expects.
# The result is a correct parse, something that is impossible with a regular lexer.
#
# Another approach is to discard a lexer altogether and use the Earley algorithm.
# It will handle more cases than the contextual lexer, but at the cost of performance.
# See examples/conf_nolex.py for an example of that approach.
#
from lark import Lark

parser = Lark(r"""
        start: _NL? section+
        section: "[" NAME "]" _NL item+
        item: NAME "=" VALUE _NL
        NAME: /[a-zA-Z_]\w*/
        VALUE: /.*/

        WS.ignore: /[\t \f]+/
        COMMENT.ignore: /\#[^\n]*/
        _NL: /(\r?\n)+/
    """, parser="lalr_contextual_lexer")

sample_conf = """
[bla]
a=Hello
this="that",4
"""
print(parser.parse(sample_conf).pretty())
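A minimal contrast sketch, not part of this diff: the same grammar fed to the plain "lalr" engine, where the standalone lexer must pick between NAME and VALUE without knowing the parse state. Assuming the grammar text above is bound to a variable named grammar, the expectation is that this fails either while the lexer is built or during the parse, whereas the contextual version above succeeds.

from lark import Lark

try:
    # Hypothetical illustration; `grammar` is assumed to hold the grammar string shown above.
    Lark(grammar, parser="lalr").parse(sample_conf)
except Exception as e:
    print("standard lexer could not handle it:", e)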
@@ -1,5 +1,5 @@
#
# This example demonstrates lex-less parsing using the earley_nolex frontend
# This example demonstrates scanless parsing using the earley_nolex frontend
#
# Using a lexer for configuration files is tricky, because values don't
# have to be surrounded by delimiters.
@@ -7,6 +7,10 @@
#
# Future versions of lark will make it easier to write these kinds of grammars.
#
# Another approach is to use the contextual lexer. It is less powerful than the scanless approach,
# but it can handle some ambiguity in lexing and it's much faster since it uses LALR(1).
# See examples/conf.py for an example of that approach.
#
from lark import Lark, Transformer
@@ -17,6 +17,8 @@ class UnexpectedToken(ParseError):
            context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]])
        except AttributeError:
            context = seq[index:index+5]
        except TypeError:
            context = "<no context>"
        message = ("Unexpected token %r at line %s, column %s.\n"
                   "Expected: %s\n"
                   "Context: %s" % (token, self.line, self.column, expected, context))
@@ -26,7 +26,6 @@ class Indenter:
            assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])

    def process(self, stream):
        for token in stream:
            if token.type == self.NL_type:
@@ -37,7 +36,7 @@ class Indenter:
            if token.type in self.OPEN_PAREN_types:
                self.paren_level += 1
            if token.type in self.CLOSE_PAREN_types:
            elif token.type in self.CLOSE_PAREN_types:
                self.paren_level -= 1
                assert self.paren_level >= 0
@@ -47,3 +46,7 @@ class Indenter:
        assert self.indent_level == [0], self.indent_level

    # XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
    @property
    def always_accept(self):
        return (self.NL_type,)
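The always_accept property is the hook the contextual lexer uses to keep postlex-critical tokens available in every parser state. A rough sketch of that contract with a hypothetical postlexer (not part of this diff): every token type it lists is added to each per-state sub-lexer, so the postlexer still receives those tokens even when the current parser state would not otherwise expect them.

# Hypothetical minimal postlexer, shown only to illustrate the always_accept contract.
class PassThroughPostLexer:
    NL_type = '_NEWLINE'

    def process(self, stream):
        # A real postlexer (like Indenter) would rewrite the token stream here.
        for token in stream:
            yield token

    @property
    def always_accept(self):
        # Token types the contextual lexer must match in every parser state.
        return (self.NL_type,)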
@@ -3,6 +3,7 @@
import re

from .utils import Str, classify
from .common import is_terminal

class LexError(Exception):
    pass
@@ -169,3 +170,64 @@ class Lexer(object):
            if lex_pos < len(stream):
                raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
            break

class ContextualLexer:
    def __init__(self, tokens, states, ignore=(), always_accept=()):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts)  # For python3
                accepts |= set(ignore)
                accepts |= set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n != '$end']
                lexer = Lexer(state_tokens, ignore=ignore)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = Lexer(tokens, ignore=ignore)

    def lex(self, stream, parser):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.root_lexer.newline_types)
        ignore_types = list(self.root_lexer.ignore_types)
        while True:
            lexer = self.lexers[parser.state]
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t

                    if type_ in newline_types:
                        newlines = value.count(lexer.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)
                    lex_pos += len(value)
                    break
            else:
                if lex_pos < len(stream):
                    print("Allowed tokens:", lexer.tokens)
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
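Two points are worth spelling out about ContextualLexer. First, the constructor builds one Lexer per distinct accept-set rather than one per parser state, so states that accept the same terminals share a compiled lexer. Second, lex() re-reads parser.state at the top of every iteration, which is what makes the matching context-sensitive. A small self-contained sketch of the dedup idea, using made-up states and terminal names (the real inputs come from GrammarAnalyzer):

# Hypothetical parser states mapped to the terminals they can accept.
states = {
    0: ['NAME', '_NL'],
    1: ['VALUE'],
    2: ['NAME', '_NL'],     # same accept-set as state 0 -> shares a sub-lexer
}

lexer_by_tokens = {}
lexers = {}
for state, accepts in states.items():
    key = frozenset(accepts)
    if key not in lexer_by_tokens:
        lexer_by_tokens[key] = object()   # stands in for Lexer(state_tokens, ...)
    lexers[state] = lexer_by_tokens[key]

assert lexers[0] is lexers[2]             # states 0 and 2 reuse the same lexer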
@@ -1,7 +1,7 @@
import re
import sre_parse

from .lexer import Lexer
from .lexer import Lexer, ContextualLexer

from .parsers.lalr_analysis import GrammarAnalyzer
from .common import is_terminal, GrammarError
@@ -31,6 +31,29 @@ class LALR(WithLexer):
        tokens = list(self.lex(text))
        return self.parser.parse(tokens)


class LALR_ContextualLexer:
    def __init__(self, lexer_conf, parser_conf):
        self.lexer_conf = lexer_conf
        self.parser_conf = parser_conf

        self.analyzer = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
        self.analyzer.analyze()

        d = {idx: t.keys() for idx, t in self.analyzer.states_idx.items()}
        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore,
                                     always_accept=lexer_conf.postlex.always_accept
                                                   if lexer_conf.postlex else ())

    def parse(self, text):
        parser = lalr_parser.Parser(self.analyzer, self.parser_conf.callback)
        tokens = self.lexer.lex(text, parser)
        if self.lexer_conf.postlex:
            tokens = self.lexer_conf.postlex.process(tokens)
        return parser.parse(tokens, True)
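Note the ordering in parse(): the Parser is created first, the lexer generator captures a reference to it, and the optional postlexer wraps that generator without consuming it. Nothing runs until parser.parse() starts pulling tokens, so each token is lexed only after the previous shift has updated the parser's state. A tiny generic sketch of such a lazy pipeline (made-up stage names, not the real classes):

# Hypothetical two-stage lazy pipeline, mirroring lexer -> postlex -> parser.
def lexer_stage(text):
    for ch in text:
        print('lexing', ch)
        yield ch

def postlex_stage(tokens):
    for tok in tokens:
        yield tok.upper()

pipeline = postlex_stage(lexer_stage('ab'))
print('nothing lexed yet')     # both stages are still idle
print(list(pipeline))          # stages now run, one token at a time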

class Earley(WithLexer):
    def __init__(self, lexer_conf, parser_conf):
        WithLexer.__init__(self, lexer_conf)
@@ -82,4 +105,4 @@ class Earley_NoLex:
        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
        return res[0]

ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex }
ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex, 'lalr_contextual_lexer': LALR_ContextualLexer }
@@ -7,13 +7,15 @@ class Parser(object):
        self.analysis = analysis
        self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
                          for rule in analysis.rules}
        self.state = self.analysis.init_state_idx

    def parse(self, seq):
    def parse(self, seq, set_state=False):
        i = 0
        stream = iter(seq)
        states_idx = self.analysis.states_idx

        state_stack = [self.analysis.init_state_idx]
        value_stack = []
        i = 0

        def get_action(key):
            state = state_stack[-1]
@@ -21,11 +23,6 @@ class Parser(object):
                return states_idx[state][key]
            except KeyError:
                expected = states_idx[state].keys()
                try:
                    token = seq[i]
                except IndexError:
                    assert key == '$end'
                    token = seq[-1]
                raise UnexpectedToken(token, expected, seq, i)
@@ -48,15 +45,22 @@ class Parser(object):
            value_stack.append(res)

        # Main LALR-parser loop
        while i < len(seq):
            action, arg = get_action(seq[i].type)
            if action == ACTION_SHIFT:
                state_stack.append(arg)
                value_stack.append(seq[i])
                i += 1
            else:
                reduce(*arg)
        try:
            token = next(stream)
            i += 1
            while True:
                action, arg = get_action(token.type)
                if action == ACTION_SHIFT:
                    state_stack.append(arg)
                    value_stack.append(token)
                    if set_state: self.state = arg

                    token = next(stream)
                    i += 1
                else:
                    reduce(*arg)
        except StopIteration:
            pass

        while True:
            _action, rule = get_action('$end')
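The switch from indexing seq[i] to pulling next(stream), together with the set_state flag, is what closes the loop with the contextual lexer: when set_state is true, every shift publishes the new state via self.state, and because the token source is a generator, it can read that state before producing the next token. A tiny generic illustration of a lazy source observing updates made by its consumer between pulls (made-up names, not the parser code):

# Hypothetical demonstration: a lazy source sees state changes made by the
# consumer between pulls, which a pre-built list of tokens never could.
state = {'mode': 'A'}

def lazy_source():
    while True:
        yield state['mode']       # reads the *current* mode on each pull

src = lazy_source()
print(next(src))                  # 'A'
state['mode'] = 'B'               # the consumer "shifts" and updates the state
print(next(src))                  # 'B' -- the source observed the update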
@@ -5,7 +5,7 @@ import logging
from .test_trees import TestTrees
# from .test_selectors import TestSelectors
from .test_parser import TestLalr, TestEarley, TestParsers
from .test_parser import TestLalr, TestEarley, TestLalr_contextual_lexer, TestParsers
# from .test_grammars import TestPythonG, TestConfigG

logging.basicConfig(level=logging.INFO)
@@ -356,11 +356,10 @@ def _make_parser_test(PARSER):
    _TestParser.__name__ = _NAME
    globals()[_NAME] = _TestParser

for PARSER in ['lalr', 'earley']:
for PARSER in ['lalr', 'earley', 'lalr_contextual_lexer']:
    _make_parser_test(PARSER)

if __name__ == '__main__':
    unittest.main()