@@ -0,0 +1,46 @@
+"""This example demonstrates usage of the Indenter class.
+
+Since indentation is context-sensitive, a postlex stage is introduced to
+manufacture INDENT/DEDENT tokens.
+
+It is crucial for the indenter that the _NL token also matches the spaces
+(and tabs) that follow the newline.
+"""
+from lark.lark import Lark
+from lark.indenter import Indenter
+
+tree_grammar = """
+    ?start: _NL* tree
+
+    tree: NAME _NL [_INDENT tree+ _DEDENT]
+
+    NAME: /\w+/
+    WS.ignore: /\s+/
+
+    _NL.newline: /(\r?\n[\t ]*)+/
+"""
+
+class TreeIndenter(Indenter):
+    NL_type = '_NL'
+    OPEN_PAREN_types = []
+    CLOSE_PAREN_types = []
+    INDENT_type = '_INDENT'
+    DEDENT_type = '_DEDENT'
+    tab_len = 8
+
+parser = Lark(tree_grammar, parser='lalr', postlex=TreeIndenter())
+
+test_tree = """
+a
+    b
+    c
+        d
+        e
+    f
+        g
+"""
+
+def test():
+    print(parser.parse(test_tree).pretty())
+
+if __name__ == '__main__':
+    test()
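A quick standalone check of the docstring's point: `handle_NL` (in the indenter below) measures indentation from whatever follows the last newline inside the `_NL` token itself, so the `_NL` pattern must consume the trailing spaces and tabs. A minimal sketch using the same regex as the grammar above:

```python
import re

# The _NL pattern from tree_grammar: one or more newlines, each followed
# by the run of tabs/spaces that forms the next line's indentation.
NL = re.compile(r'(\r?\n[\t ]*)+')

m = NL.match('\n    b')              # the newline preceding "    b"
nl_token = m.group(0)                # '\n    ' -- indentation is included
indent_str = nl_token.rsplit('\n', 1)[1]
assert indent_str == '    '          # 4 spaces: this is what the indenter measures
```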
@@ -0,0 +1,47 @@
+"Provides Indentation services for languages with indentation similar to Python"
+
+from .lexer import Token
+
+class Indenter:
+    def __init__(self):
+        self.paren_level = 0
+        self.indent_level = [0]
+
+    def handle_NL(self, token):
+        if self.paren_level > 0:
+            return
+
+        indent_str = token.rsplit('\n', 1)[1]  # Tabs and spaces
+        indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len
+
+        if indent > self.indent_level[-1]:
+            self.indent_level.append(indent)
+            yield Token(self.INDENT_type, indent_str)
+        else:
+            while indent < self.indent_level[-1]:
+                self.indent_level.pop()
+                yield Token(self.DEDENT_type, indent_str)
+
+            assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])
+
+    def process(self, stream):
+        for token in stream:
+            yield token
+
+            if token.type == self.NL_type:
+                for t in self.handle_NL(token):
+                    yield t
+
+            if token.type in self.OPEN_PAREN_types:
+                self.paren_level += 1
+            if token.type in self.CLOSE_PAREN_types:
+                self.paren_level -= 1
+                assert self.paren_level >= 0
+
+        while len(self.indent_level) > 1:
+            self.indent_level.pop()
+            yield Token(self.DEDENT_type, '')
+
+        assert self.indent_level == [0], self.indent_level
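At its core, `handle_NL` keeps a stack of open indentation widths: a deeper line pushes one level and emits an INDENT, a shallower line pops and emits one DEDENT per closed level, and end of input closes whatever remains. Here is a minimal self-contained sketch of that arithmetic, using plain tuples rather than lark's `Token` class (an illustration of the logic, not the class above):

```python
def indent_tokens(text, tab_len=8):
    """Yield (type, value) pairs, mimicking the Indenter's stack arithmetic."""
    levels = [0]                          # stack of open indentation widths
    for line in text.splitlines():
        body = line.lstrip(' \t')
        if not body:
            continue                      # blank lines emit nothing
        ws = line[:len(line) - len(body)]
        indent = ws.count(' ') + ws.count('\t') * tab_len
        if indent > levels[-1]:
            levels.append(indent)
            yield ('_INDENT', ws)
        else:
            while indent < levels[-1]:    # one DEDENT per closed level
                levels.pop()
                yield ('_DEDENT', ws)
            assert indent == levels[-1], 'inconsistent dedent'
        yield ('LINE', body)
    while len(levels) > 1:                # close open blocks at end of input
        levels.pop()
        yield ('_DEDENT', '')

# list(indent_tokens("a\n  b\n    c\n  d")) yields, in order:
# LINE a, _INDENT, LINE b, _INDENT, LINE c, _DEDENT, LINE d, _DEDENT
```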
@@ -23,7 +23,7 @@ class LarkOptions(object):
         only_lex - Don't build a parser. Useful for debugging (default: False)
         keep_all_tokens - Don't automagically remove "punctuation" tokens (default: True)
         cache_grammar - Cache the Lark grammar (Default: False)
-        ignore_postproc - Don't call the post-processing function (default: False)
+        postlex - Lexer post-processing (Default: None)
     """
     __doc__ += OPTIONS_DOC

     def __init__(self, options_dict):
@@ -34,7 +34,7 @@ class LarkOptions(object):
         self.keep_all_tokens = bool(o.pop('keep_all_tokens', False))
         self.tree_class = o.pop('tree_class', Tree)
         self.cache_grammar = o.pop('cache_grammar', False)
-        self.ignore_postproc = bool(o.pop('ignore_postproc', False))
+        self.postlex = o.pop('postlex', None)
         self.parser = o.pop('parser', 'earley')
         self.transformer = o.pop('transformer', None)
@@ -206,7 +206,11 @@ class Lark:
         return f

     def lex(self, text):
-        return self.lexer.lex(text)
+        stream = self.lexer.lex(text)
+        if self.options.postlex:
+            return self.options.postlex.process(stream)
+        else:
+            return stream

     def parse(self, text):
         assert not self.options.only_lex
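The `postlex` hook is duck-typed: `Lark.lex` only ever calls `process(stream)`, so any object exposing a generator method of that shape can be plugged in; `Indenter` is one instance. A sketch of another, purely hypothetical postlexer (not part of lark) that drops one token type from the stream:

```python
# Hypothetical postlexer, shown only to illustrate the interface that
# Lark.lex pipes the token stream through: process(stream) -> iterator.
class FilterPostLex:
    def __init__(self, drop_type):
        self.drop_type = drop_type

    def process(self, stream):
        for token in stream:
            if token.type != self.drop_type:
                yield token

# usage, assuming a grammar that defines a COMMENT token:
#   parser = Lark(my_grammar, postlex=FilterPostLex('COMMENT'))
```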
@@ -33,6 +33,7 @@ LIMIT = 50 # Stupid named groups limit in python re
 class Lexer(object):
     def __init__(self, tokens, callbacks, ignore=()):
         self.ignore = ignore
+        self.newline_char = '\n'

         # Sanitization
         token_names = {t[0] for t in tokens}
@@ -60,6 +61,8 @@ class Lexer(object):
     def lex(self, stream):
         lex_pos = 0
+        line = 1
+        col_start_pos = 0
         while True:
             i = 0
             for mre in self.mres:
@@ -67,11 +70,17 @@ class Lexer(object):
                 if m:
                     value = m.group(0)
                     type_ = self.name_from_index[i][m.lastindex]
-                    t = Token(type_, value, lex_pos)
-                    if t.type in self.callbacks:
-                        self.callbacks[t.type](t)
-                    if t.type not in self.ignore:
+                    if type_ not in self.ignore:
+                        t = Token(type_, value, lex_pos)
+                        t.line = line
+                        t.column = lex_pos - col_start_pos
+                        if t.type in self.callbacks:
+                            t = self.callbacks[t.type](t)
                         yield t
+                    newlines = value.count(self.newline_char)
+                    if newlines:
+                        line += newlines
+                        col_start_pos = lex_pos + value.rindex(self.newline_char)
                     lex_pos += len(value)
                     break
                 i += 1
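The position bookkeeping added above can be checked in isolation: `line` accumulates newline counts, `col_start_pos` holds the absolute stream position of the most recent newline, and a token's column is `lex_pos - col_start_pos`. A small sketch of the same arithmetic over pre-split chunks (a hypothetical helper, not the lexer itself):

```python
def positions(chunks):
    """Yield (value, line, column) using the lexer's bookkeeping scheme."""
    lex_pos, line, col_start_pos = 0, 1, 0
    for value in chunks:
        yield (value, line, lex_pos - col_start_pos)
        newlines = value.count('\n')
        if newlines:
            line += newlines
            col_start_pos = lex_pos + value.rindex('\n')
        lex_pos += len(value)

# list(positions(['ab', '\n', 'cd'])) -> [('ab', 1, 0), ('\n', 1, 2), ('cd', 2, 1)]
# Note the quirk of measuring from the newline itself: the first token on a
# fresh line gets column 1, while the very first line starts at column 0.
```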
@@ -4,7 +4,7 @@ class ParseError(Exception):
     pass

 class Parser(object):
-    def __init__(self, ga, callback, temp=False):
+    def __init__(self, ga, callback):
         self.ga = ga
         self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
                           for rule in ga.rules}