diff --git a/examples/indented_tree.py b/examples/indented_tree.py
index 7d10add..c1a5bf4 100644
--- a/examples/indented_tree.py
+++ b/examples/indented_tree.py
@@ -24,7 +24,7 @@ class TreeIndenter(Indenter):
     CLOSE_PAREN_types = []
     INDENT_type = '_INDENT'
     DEDENT_type = '_DEDENT'
-    tab_len = 0
+    tab_len = 8
 
 parser = Lark(tree_grammar, parser='lalr', postlex=TreeIndenter())
 
diff --git a/lark/lark.py b/lark/lark.py
index a23cbbd..def7368 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -152,12 +152,12 @@ class Lark:
 
     def _build_lexer(self):
         ignore_tokens = []
-        tokens = {}
-        for name, (value, flags) in self.tokens.items():
+        tokens = []
+        for name, (value, flags) in self.tokens:
             if 'ignore' in flags:
                 ignore_tokens.append(name)
-            tokens[name] = value
-        return Lexer(tokens.items(), {}, ignore=ignore_tokens)
+            tokens.append((name, value))
+        return Lexer(tokens, {}, ignore=ignore_tokens)
 
     def _build_parser(self):
diff --git a/lark/lexer.py b/lark/lexer.py
index bbd3b1e..74821f1 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -48,7 +48,7 @@ class Lexer(object):
         self.tokens = tokens
         self.callbacks = callbacks
 
-        self.tokens.sort(key=lambda x:len(x[1]), reverse=True)
+        # self.tokens.sort(key=lambda x:len(x[1]), reverse=True)
 
         self.mres = []
         self.name_from_index = []
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 5b4e037..49f3c86 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -1,10 +1,14 @@
 import re
+import codecs
+
 from lexer import Lexer, Token
 
 from grammar_analysis import GrammarAnalyzer
 from parser import Parser
 from tree import Tree as T, Transformer, InlineTransformer, Visitor
 
+unicode_escape = codecs.getdecoder('unicode_escape')
+
 _TOKEN_NAMES = {
     ':' : 'COLON',
     ',' : 'COMMA',
@@ -143,26 +147,18 @@ class SaveDefinitions(object):
             raise ValueError("Token '%s' defined more than once" % name)
 
         if len(x) == 4:
-            self.tokens[name] = x[2][1], []
+            self.tokens[name] = x[2], []
         else:
-            self.tokens[name] = x[3][1], x[1].children
+            self.tokens[name] = x[3], x[1].children
 
     def tokenvalue(self, tokenvalue):
-        value = tokenvalue.value[1:-1]
-        import codecs
-        decoder = codecs.getdecoder('unicode_escape')
-        if '\u' in value:
-            # XXX for now, you can't mix unicode escaping and unicode characters at the same token
-            value = decoder(value)[0]
-
-        if tokenvalue.type == 'STRING':
-            value = re.escape(value)
-        return tokenvalue, value
-
-    def anontoken(self, (token, value)):
+        return tokenvalue
+
+    def anontoken(self, token):
         if token.type == 'STRING':
+            value = token.value[1:-1]
             try:
-                token_name = _TOKEN_NAMES[token.value[1:-1]]
+                token_name = _TOKEN_NAMES[value]
             except KeyError:
                 if value.isalnum() and value[0].isalpha():
                     token_name = value.upper()
@@ -178,7 +174,7 @@
             assert False, x
 
         if token_name not in self.tokens:
-            self.tokens[token_name] = value, []
+            self.tokens[token_name] = token, []
 
         return Token('TOKEN', token_name, -1)
 
@@ -312,6 +308,27 @@ class GrammarLoader:
         p = Parser(self.ga, c)
         p.parse( list(self.lexer.lex(grammar_text+"\n")) )
 
+        # Tokens
+        re_tokens = []
+        str_tokens = []
+        for name, (token, flags) in sd.tokens.items():
+            value = token.value[1:-1]
+            if '\u' in value:
+                # XXX for now, you can't mix unicode escaping and unicode characters at the same token
+                value = unicode_escape(value)[0]
+
+            if token.type == 'STRING':
+                value = re.escape(value)
+                str_tokens.append((name, (value, flags)))
+            else:
+                assert token.type == 'REGEXP'
+                re_tokens.append((name, (value, flags)))
+
+        str_tokens.sort(key=lambda x:len(x[1][0]), reverse=True)
+        re_tokens.sort(key=lambda x:len(x[1][0]), reverse=True)
+        tokens = str_tokens + re_tokens    # Order is important!
+
+        # Rules
         ebnf_to_bnf = EBNF_to_BNF()
         rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()}
@@ -320,7 +337,7 @@
         for r in rules.values():
             self.simplify_rule.visit(r)
 
-        return sd.tokens, rules
+        return tokens, rules
 
 
 load_grammar = GrammarLoader().load_grammar
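
For reference, a minimal standalone sketch of the ordering rule that load_grammar now applies before handing tokens to the lexer: literal string tokens are tried before regexp tokens, and within each group longer definitions come first, so that e.g. '==' is matched ahead of '='. The sample token definitions below are made up, not part of the patch, and the sketch skips the re.escape/unicode_escape normalization that the real code performs.

    # Hypothetical token definitions in the (name, (value, flags)) shape used above.
    str_tokens = [('EQ', ('==', [])), ('ASSIGN', ('=', [])), ('ARROW', ('->', []))]
    re_tokens = [('NUMBER', ('[0-9]+', [])), ('NAME', ('[a-z_][a-z0-9_]*', []))]

    # Longest first within each group, then all strings before all regexps.
    str_tokens.sort(key=lambda x: len(x[1][0]), reverse=True)
    re_tokens.sort(key=lambda x: len(x[1][0]), reverse=True)
    tokens = str_tokens + re_tokens   # Order is important!

    print([name for name, _ in tokens])
    # -> ['EQ', 'ARROW', 'ASSIGN', 'NAME', 'NUMBER']

Since Lexer no longer re-sorts its input (the sort call is commented out in lark/lexer.py above), whatever order load_grammar produces is the order the lexer uses.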