@@ -1,13 +1,13 @@
-# Lark - a modern pure-Python parsing library
+# Lark - a modern parsing library

-Lark is a modern general-purpose Python parsing library, that focuses on simplicity and power.
+Lark is a modern general-purpose parsing library for Python.

-Lark accepts grammars as EBNF and lets you choose between two parsing algorithms:
+Lark focuses on simplicity and power. It lets you choose between two parsing algorithms:

-- Earley : Parses all context-free grammars (even ambiguous ones)!
+- Earley : Parses all context-free grammars (even ambiguous ones)! It is the default.
 - LALR(1): Only LR grammars. Outperforms PLY and most if not all other pure-python parsing libraries.

-Both algorithms are pure-python implementations and can be used interchangably (aside for algorithmic restrictions).
+Both algorithms are written in Python and can be used interchangeably with the same grammar (aside from algorithmic restrictions). See "Comparison to other parsers" for more details.
 Lark can automagically build an AST from your grammar, without any more code on your part.
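As a usage sketch of the parser choice described in the hunk above (not part of the diff): it assumes the `parser` option accepts `'earley'` (the default, per the `LarkOptions` hunk later in this diff) and `'lalr'`, and that anonymous string terminals behave as in the README example.

```python
from lark import Lark

grammar = """
start: "a" "b"
"""

# Earley is the default engine -- it accepts any context-free grammar.
earley_parser = Lark(grammar)

# 'lalr' (assumed option value) selects the LALR(1) engine: restricted to
# LR grammars, but noticeably faster.
lalr_parser = Lark(grammar, parser='lalr')

print(earley_parser.parse("ab"))
print(lalr_parser.parse("ab"))
```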
@@ -41,10 +41,12 @@ Tree(start, [Token(WORD, Hello), Token(WORD, World)])
 Notice punctuation doesn't appear in the resulting tree. It's automatically filtered away by Lark.

-To learn more about Lark:
-- Learn how to parse json at the [tutorial](/docs/json_tutorial.md)
+## Learn more about using Lark
-## Features
+- Read the [tutorial](/docs/json_tutorial.md), which shows how to write a JSON parser in Lark.
+- Browse the [examples](/examples), which include a calculator, and a Python-code parser.
+## List of Features

 - EBNF grammar with a little extra
 - Earley & LALR(1)
@@ -0,0 +1 @@
+from .lark import Lark, Transformer
@@ -39,6 +39,7 @@ class LarkOptions(object):
         self.parser = o.pop('parser', 'earley')
         self.transformer = o.pop('transformer', None)
         self.start = o.pop('start', 'start')
+        self.profile = o.pop('profile', False)  # XXX new
         assert self.parser in ENGINE_DICT
         if self.parser == 'earley' and self.transformer:
@@ -50,6 +51,30 @@ class LarkOptions(object):
         raise ValueError("Unknown options: %s" % o.keys())

+import time
+from collections import defaultdict
+
+class Profiler:
+    def __init__(self):
+        self.total_time = defaultdict(float)
+        self.cur_section = '__init__'
+        self.last_enter_time = time.time()
+
+    def enter_section(self, name):
+        cur_time = time.time()
+        self.total_time[self.cur_section] += cur_time - self.last_enter_time
+        self.last_enter_time = cur_time
+        self.cur_section = name
+
+    def make_wrapper(self, name, f):
+        def _f(*args, **kwargs):
+            last_section = self.cur_section
+            self.enter_section(name)
+            try:
+                return f(*args, **kwargs)
+            finally:
+                self.enter_section(last_section)
+        return _f
+
 class Lark:
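A small usage sketch of the `Profiler` added above (not part of the diff; it assumes the class definition from this hunk is in scope). Time between `enter_section()` calls is charged to whichever section was active when the interval started:

```python
import time

p = Profiler()                 # interval until the first enter_section() is charged to '__init__'
p.enter_section('lex')
time.sleep(0.05)               # ~0.05s will be charged to 'lex'
p.enter_section('parse')
time.sleep(0.10)               # ~0.10s will be charged to 'parse'
p.enter_section('outside_lark')

print(dict(p.total_time))      # e.g. {'__init__': ..., 'lex': ~0.05, 'parse': ~0.10}
```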
@@ -82,6 +107,8 @@ class Lark:
         if self.options.cache_grammar:
             raise NotImplementedError("Not available yet")

+        self.profiler = Profiler() if self.options.profile else None
+
         self.tokens, self.rules = load_grammar(grammar)
         self.lexer = self._build_lexer()
@@ -90,6 +117,9 @@ class Lark:
         self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
         self.parser = self._build_parser()

+        if self.profiler: self.profiler.enter_section('outside_lark')
+
     def _create_unless_callback(self, strs):
         def f(t):
             if t in strs:
@@ -105,8 +135,6 @@
             for flag in flags:
                 if flag == 'ignore':
                     ignore_tokens.append(name)
-                elif flag == 'newline':
-                    pass # TODO
                 elif isinstance(flag, tuple) and flag[0] == 'unless':
                     _, strs = flag
                     callbacks[name] = self._create_unless_callback(strs)
@@ -119,6 +147,10 @@
     def _build_parser(self):
         rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
+        if self.profiler:
+            for f in dir(callback):
+                if not f.startswith('__'):
+                    setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
         return self.parser_engine.build_parser(rules, callback, self.options.start)
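To illustrate what the loop above does for a single callback, a standalone sketch (the callback name is hypothetical; it only assumes the `Profiler` class from the earlier hunk):

```python
def rule_callback(children):
    # stands in for one of the tree-builder callbacks being wrapped above
    return children

p = Profiler()
p.enter_section('parse')

wrapped = p.make_wrapper('transformer', rule_callback)
result = wrapped([1, 2, 3])   # time spent inside the call is charged to 'transformer',
                              # then the active section reverts to 'parse'
```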
@@ -133,6 +165,16 @@
     def parse(self, text):
         assert not self.options.only_lex
-        l = list(self.lex(text))
-        return self.parser.parse(l)
+
+        if self.profiler:
+            self.profiler.enter_section('lex')
+            l = list(self.lex(text))
+            self.profiler.enter_section('parse')
+            try:
+                return self.parser.parse(l)
+            finally:
+                self.profiler.enter_section('outside_lark')
+        else:
+            l = list(self.lex(text))
+            return self.parser.parse(l)
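Putting the new option together, a hedged end-to-end sketch of how profiling would be switched on (the one-line grammar is illustrative; the `profile` keyword and section names come from the hunks above):

```python
from lark import Lark

parser = Lark('start: "a"', profile=True)   # profile defaults to False

parser.parse("a")
parser.parse("a")

# total_time maps section names to accumulated wall-clock seconds; typical keys
# are '__init__', 'lex', 'parse', 'outside_lark' (and 'transformer' when
# tree-building callbacks run).
for section, seconds in parser.profiler.total_time.items():
    print(section, seconds)
```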
@@ -1,5 +1,7 @@
 ## Lexer Implementation
+
+import re

 from .utils import Str

 class LexError(Exception):
@@ -13,13 +15,6 @@ class Token(Str):
         inst.value = value
         return inst

-# class Token(object):
-#     def __init__(self, type, value, lexpos):
-#         self.type = type
-#         self.value = value
-#         self.lexpos = lexpos
-
     def __repr__(self):
         return 'Token(%s, %s)' % (self.type, self.value)
@@ -29,12 +24,11 @@ class Regex:
         self.flags = flags

-import re
-
-LIMIT = 50 # Stupid named groups limit in python re
-
 class Lexer(object):
     def __init__(self, tokens, callbacks, ignore=()):
         self.ignore = ignore
         self.newline_char = '\n'
+        tokens = list(tokens)

         # Sanitization
         token_names = {t[0] for t in tokens}
@@ -49,42 +43,57 @@ class Lexer(object):
         self.tokens = tokens
         self.callbacks = callbacks

-        # self.tokens.sort(key=lambda x:len(x[1]), reverse=True)
+        self.token_types = list(token_names)
+        self.type_index = {name:i for i,name in enumerate(self.token_types)}
+        self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1]]
+        self.ignore_types = [self.type_index[t] for t in ignore]

-        self.mres = []
-        self.name_from_index = []
-        x = list(tokens)
-        while x:
-            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in x[:LIMIT]))
-            self.mres.append(mre)
-            self.name_from_index.append( {i:n for n,i in mre.groupindex.items()} )
-            x = x[LIMIT:]
+        self.mres = self._build_mres(tokens, len(tokens))
+
+    def _build_mres(self, tokens, max_size):
+        # Python sets an unreasonable group limit (currently 100) in its re module
+        # Worse, the only way to know we reached it is by catching an AssertionError!
+        # This function recursively tries less and less groups until it's successful.
+        mres = []
+        while tokens:
+            try:
+                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in tokens[:max_size]))
+            except AssertionError:  # Yes, this is what Python provides us.. :/
+                return self._build_mres(tokens, max_size/2)
+            mres.append((mre, {i:self.type_index[n] for n,i in mre.groupindex.items()} ))
+            tokens = tokens[max_size:]
+        return mres

     def lex(self, stream):
         lex_pos = 0
         line = 1
         col_start_pos = 0
+        newline_types = list(self.newline_types)
+        ignore_types = list(self.ignore_types)
         while True:
-            i = 0
-            for mre in self.mres:
+            for mre, type_from_index in self.mres:
                 m = mre.match(stream, lex_pos)
                 if m:
                     value = m.group(0)
-                    type_ = self.name_from_index[i][m.lastindex]
-                    if type_ not in self.ignore:
-                        t = Token(type_, value, lex_pos)
+                    type_num = type_from_index[m.lastindex]
+                    if type_num not in ignore_types:
+                        t = Token(self.token_types[type_num], value, lex_pos)
                         t.line = line
                         t.column = lex_pos - col_start_pos
                         if t.type in self.callbacks:
                             t = self.callbacks[t.type](t)
                         yield t
-                    newlines = value.count(self.newline_char)
-                    if newlines:
-                        line += newlines
-                        col_start_pos = lex_pos + value.rindex(self.newline_char)
+                    if type_num in newline_types:
+                        newlines = value.count(self.newline_char)
+                        if newlines:
+                            line += newlines
+                            col_start_pos = lex_pos + value.rindex(self.newline_char)
                     lex_pos += len(value)
                     break
-                i += 1
             else:
                 if lex_pos < len(stream):
                     context = stream[lex_pos:lex_pos+5]
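For context on why `_build_mres` halves `max_size`, a small standalone demonstration of the regex group limit it works around (behaviour varies by Python version; newer interpreters have lifted the 100-group restriction, in which case the pattern simply compiles):

```python
import re

# ~300 alternated named groups, similar to a lexer regex for 300 token types.
pattern = u'|'.join(u'(?P<T%d>x%d)' % (i, i) for i in range(300))

try:
    re.compile(pattern)
    print("compiled fine -- this interpreter has no 100-group limit")
except (AssertionError, re.error):
    # Older Pythons raised a bare AssertionError here, which is exactly the
    # case _build_mres catches before retrying with fewer groups per pattern.
    print("hit the group limit")
```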
@@ -334,6 +334,13 @@ def _make_parser_test(PARSER):
             x = g.parse('a')
             self.assertEqual(x.data, "b")

+        def test_lexer_token_limit(self):
+            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
+            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
+            g = _Lark("""start: %s
+                      %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))
+
     _NAME = "Test" + PARSER.capitalize()
     _TestParser.__name__ = _NAME
     globals()[_NAME] = _TestParser
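To make the generated test grammar concrete, an equivalent construction for a tiny range (standalone sketch, not part of the test suite; the printed ordering assumes modern Python's insertion-ordered dicts):

```python
tokens = {'A%d' % i: '"%d"' % i for i in range(3)}
grammar = "start: %s\n%s" % (' '.join(tokens), '\n'.join("%s: %s" % x for x in tokens.items()))
print(grammar)
# start: A0 A1 A2
# A0: "0"
# A1: "1"
# A2: "2"
#
# i.e. a start rule referencing every terminal plus one definition per terminal;
# with range(300) this forces the lexer past the 100-group regex limit.
```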