@@ -1,13 +1,13 @@
-# Lark - a modern pure-Python parsing library
+# Lark - a modern parsing library

-Lark is a modern general-purpose Python parsing library, that focuses on simplicity and power.
+Lark is a modern general-purpose parsing library for Python.

-Lark accepts grammars as EBNF and lets you choose between two parsing algorithms:
+Lark focuses on simplicity and power. It lets you choose between two parsing algorithms:

-- Earley : Parses all context-free grammars (even ambiguous ones)!
+- Earley : Parses all context-free grammars (even ambiguous ones)! It is the default.
 - LALR(1): Only LR grammars. Outperforms PLY and most if not all other pure-python parsing libraries.

-Both algorithms are pure-python implementations and can be used interchangably (aside for algorithmic restrictions).
+Both algorithms are written in Python and can be used interchangeably with the same grammar (aside from algorithmic restrictions). See "Comparison to other parsers" for more details.
 Lark can automagically build an AST from your grammar, without any more code on your part.
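As a usage sketch of the parser choice described in the hunk above (not part of the diff): it assumes the `parser` option accepts `'earley'` (the default, per the `LarkOptions` hunk later in this diff) and `'lalr'`, and that anonymous string terminals behave as in the README example.

```python
from lark import Lark

grammar = """
start: "a" "b"
"""

# Earley is the default engine -- it accepts any context-free grammar.
earley_parser = Lark(grammar)

# 'lalr' (assumed option value) selects the LALR(1) engine: restricted to
# LR grammars, but noticeably faster.
lalr_parser = Lark(grammar, parser='lalr')

print(earley_parser.parse("ab"))
print(lalr_parser.parse("ab"))
```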
@@ -41,10 +41,12 @@ Tree(start, [Token(WORD, Hello), Token(WORD, World)])
 Notice punctuation doesn't appear in the resulting tree. It's automatically filtered away by Lark.

-To learn more about Lark:
-- Learn how to parse json at the [tutorial](/docs/json_tutorial.md)
+## Learn more about using Lark
-## Features
+- Read the [tutorial](/docs/json_tutorial.md), which shows how to write a JSON parser in Lark.
+- Browse the [examples](/examples), which include a calculator, and a Python-code parser.
+## List of Features

 - EBNF grammar with a little extra
 - Earley & LALR(1)
@@ -0,0 +1 @@
+from .lark import Lark, Transformer
@@ -39,6 +39,7 @@ class LarkOptions(object):
         self.parser = o.pop('parser', 'earley')
         self.transformer = o.pop('transformer', None)
         self.start = o.pop('start', 'start')
+        self.profile = o.pop('profile', False)  # XXX new
         assert self.parser in ENGINE_DICT
         if self.parser == 'earley' and self.transformer:
@@ -50,6 +51,30 @@ class LarkOptions(object):
         raise ValueError("Unknown options: %s" % o.keys())

+import time
+from collections import defaultdict
+
+class Profiler:
+    def __init__(self):
+        self.total_time = defaultdict(float)
+        self.cur_section = '__init__'
+        self.last_enter_time = time.time()
+
+    def enter_section(self, name):
+        cur_time = time.time()
+        self.total_time[self.cur_section] += cur_time - self.last_enter_time
+        self.last_enter_time = cur_time
+        self.cur_section = name
+
+    def make_wrapper(self, name, f):
+        def _f(*args, **kwargs):
+            last_section = self.cur_section
+            self.enter_section(name)
+            try:
+                return f(*args, **kwargs)
+            finally:
+                self.enter_section(last_section)
+        return _f
+
 class Lark:
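A small usage sketch of the `Profiler` added above (not part of the diff; it assumes the class definition from this hunk is in scope). Time between `enter_section()` calls is charged to whichever section was active when the interval started:

```python
import time

p = Profiler()                 # interval until the first enter_section() is charged to '__init__'
p.enter_section('lex')
time.sleep(0.05)               # ~0.05s will be charged to 'lex'
p.enter_section('parse')
time.sleep(0.10)               # ~0.10s will be charged to 'parse'
p.enter_section('outside_lark')

print(dict(p.total_time))      # e.g. {'__init__': ..., 'lex': ~0.05, 'parse': ~0.10}
```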
@@ -82,6 +107,8 @@ class Lark:
         if self.options.cache_grammar:
             raise NotImplementedError("Not available yet")

+        self.profiler = Profiler() if self.options.profile else None
+
         self.tokens, self.rules = load_grammar(grammar)
         self.lexer = self._build_lexer()
@@ -90,6 +117,9 @@ class Lark:
         self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
         self.parser = self._build_parser()

+        if self.profiler: self.profiler.enter_section('outside_lark')
+
     def _create_unless_callback(self, strs):
         def f(t):
             if t in strs:
@@ -105,8 +135,6 @@
             for flag in flags:
                 if flag == 'ignore':
                     ignore_tokens.append(name)
-                elif flag == 'newline':
-                    pass # TODO
                 elif isinstance(flag, tuple) and flag[0] == 'unless':
                     _, strs = flag
                     callbacks[name] = self._create_unless_callback(strs)
@@ -119,6 +147,10 @@
     def _build_parser(self):
         rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
+        if self.profiler:
+            for f in dir(callback):
+                if not f.startswith('__'):
+                    setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
         return self.parser_engine.build_parser(rules, callback, self.options.start)
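To illustrate what the loop above does for a single callback, a standalone sketch (the callback name is hypothetical; it only assumes the `Profiler` class from the earlier hunk):

```python
def rule_callback(children):
    # stands in for one of the tree-builder callbacks being wrapped above
    return children

p = Profiler()
p.enter_section('parse')

wrapped = p.make_wrapper('transformer', rule_callback)
result = wrapped([1, 2, 3])   # time spent inside the call is charged to 'transformer',
                              # then the active section reverts to 'parse'
```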
@@ -133,6 +165,16 @@
     def parse(self, text):
         assert not self.options.only_lex
-        l = list(self.lex(text))
-        return self.parser.parse(l)
+
+        if self.profiler:
+            self.profiler.enter_section('lex')
+            l = list(self.lex(text))
+            self.profiler.enter_section('parse')
+            try:
+                return self.parser.parse(l)
+            finally:
+                self.profiler.enter_section('outside_lark')
+        else:
+            l = list(self.lex(text))
+            return self.parser.parse(l)
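Putting the new option together, a hedged end-to-end sketch of how profiling would be switched on (the one-line grammar is illustrative; the `profile` keyword and section names come from the hunks above):

```python
from lark import Lark

parser = Lark('start: "a"', profile=True)   # profile defaults to False

parser.parse("a")
parser.parse("a")

# total_time maps section names to accumulated wall-clock seconds; typical keys
# are '__init__', 'lex', 'parse', 'outside_lark' (and 'transformer' when
# tree-building callbacks run).
for section, seconds in parser.profiler.total_time.items():
    print(section, seconds)
```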
@@ -1,5 +1,7 @@
 ## Lexer Implementation
+
+import re

 from .utils import Str

 class LexError(Exception):
@@ -13,13 +15,6 @@ class Token(Str):
         inst.value = value
         return inst

-# class Token(object):
-#     def __init__(self, type, value, lexpos):
-#         self.type = type
-#         self.value = value
-#         self.lexpos = lexpos
-
     def __repr__(self):
         return 'Token(%s, %s)' % (self.type, self.value)
@@ -29,12 +24,11 @@ class Regex:
         self.flags = flags

-import re
-
-LIMIT = 50 # Stupid named groups limit in python re
-
 class Lexer(object):
     def __init__(self, tokens, callbacks, ignore=()):
         self.ignore = ignore
         self.newline_char = '\n'
+        tokens = list(tokens)

         # Sanitization
         token_names = {t[0] for t in tokens}
@@ -49,42 +43,57 @@ class Lexer(object):
         self.tokens = tokens
         self.callbacks = callbacks

-        # self.tokens.sort(key=lambda x:len(x[1]), reverse=True)
+        self.token_types = list(token_names)
+        self.type_index = {name:i for i,name in enumerate(self.token_types)}
+        self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1]]
+        self.ignore_types = [self.type_index[t] for t in ignore]

-        self.mres = []
-        self.name_from_index = []
-        x = list(tokens)
-        while x:
-            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in x[:LIMIT]))
-            self.mres.append(mre)
-            self.name_from_index.append( {i:n for n,i in mre.groupindex.items()} )
-            x = x[LIMIT:]
+        self.mres = self._build_mres(tokens, len(tokens))
+
+    def _build_mres(self, tokens, max_size):
+        # Python sets an unreasonable group limit (currently 100) in its re module
+        # Worse, the only way to know we reached it is by catching an AssertionError!
+        # This function recursively tries less and less groups until it's successful.
+        mres = []
+        while tokens:
+            try:
+                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in tokens[:max_size]))
+            except AssertionError:  # Yes, this is what Python provides us.. :/
+                return self._build_mres(tokens, max_size/2)
+            mres.append((mre, {i:self.type_index[n] for n,i in mre.groupindex.items()} ))
+            tokens = tokens[max_size:]
+        return mres

     def lex(self, stream):
         lex_pos = 0
         line = 1
         col_start_pos = 0
+        newline_types = list(self.newline_types)
+        ignore_types = list(self.ignore_types)
         while True:
-            i = 0
-            for mre in self.mres:
+            for mre, type_from_index in self.mres:
                 m = mre.match(stream, lex_pos)
                 if m:
                     value = m.group(0)
-                    type_ = self.name_from_index[i][m.lastindex]
-                    if type_ not in self.ignore:
-                        t = Token(type_, value, lex_pos)
+                    type_num = type_from_index[m.lastindex]
+                    if type_num not in ignore_types:
+                        t = Token(self.token_types[type_num], value, lex_pos)
                         t.line = line
                         t.column = lex_pos - col_start_pos
                         if t.type in self.callbacks:
                             t = self.callbacks[t.type](t)
                         yield t
-                    newlines = value.count(self.newline_char)
-                    if newlines:
-                        line += newlines
-                        col_start_pos = lex_pos + value.rindex(self.newline_char)
+                    if type_num in newline_types:
+                        newlines = value.count(self.newline_char)
+                        if newlines:
+                            line += newlines
+                            col_start_pos = lex_pos + value.rindex(self.newline_char)
                     lex_pos += len(value)
                     break
-                i += 1
             else:
                 if lex_pos < len(stream):
                     context = stream[lex_pos:lex_pos+5]
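For context on why `_build_mres` halves `max_size`, a small standalone demonstration of the regex group limit it works around (behaviour varies by Python version; newer interpreters have lifted the 100-group restriction, in which case the pattern simply compiles):

```python
import re

# ~300 alternated named groups, similar to a lexer regex for 300 token types.
pattern = u'|'.join(u'(?P<T%d>x%d)' % (i, i) for i in range(300))

try:
    re.compile(pattern)
    print("compiled fine -- this interpreter has no 100-group limit")
except (AssertionError, re.error):
    # Older Pythons raised a bare AssertionError here, which is exactly the
    # case _build_mres catches before retrying with fewer groups per pattern.
    print("hit the group limit")
```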
@@ -334,6 +334,13 @@ def _make_parser_test(PARSER):
             x = g.parse('a')
             self.assertEqual(x.data, "b")

+        def test_lexer_token_limit(self):
+            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
+            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
+            g = _Lark("""start: %s
+                      %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))
+
     _NAME = "Test" + PARSER.capitalize()
     _TestParser.__name__ = _NAME
     globals()[_NAME] = _TestParser
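To make the generated test grammar concrete, an equivalent construction for a tiny range (standalone sketch, not part of the test suite; the printed ordering assumes modern Python's insertion-ordered dicts):

```python
tokens = {'A%d' % i: '"%d"' % i for i in range(3)}
grammar = "start: %s\n%s" % (' '.join(tokens), '\n'.join("%s: %s" % x for x in tokens.items()))
print(grammar)
# start: A0 A1 A2
# A0: "0"
# A1: "1"
# A2: "2"
#
# i.e. a start rule referencing every terminal plus one definition per terminal;
# with range(300) this forces the lexer past the 100-group regex limit.
```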