@@ -0,0 +1,54 @@
+#
+# This example demonstrates using Lark with a custom lexer.
+#
+# You can use a custom lexer to tokenize text when the lexers offered by Lark
+# are too slow, or not flexible enough.
+#
+# You can also use it (as shown in this example) to tokenize streams of objects.
+#
+
+from lark import Lark, Transformer, v_args
+from lark.lexer import Lexer, Token
+
+
+class TypeLexer(Lexer):
+    def __init__(self, lexer_conf):
+        pass
+
+    def lex(self, data):
+        for obj in data:
+            if isinstance(obj, int):
+                yield Token('INT', obj)
+            elif isinstance(obj, (type(''), type(u''))):
+                yield Token('STR', obj)
+            else:
+                raise TypeError(obj)
+
+
+parser = Lark("""
+        start: data_item+
+        data_item: STR INT*
+
+        %declare STR INT
+        """, parser='lalr', lexer=TypeLexer)
+
+
+class ParseToDict(Transformer):
+    @v_args(inline=True)
+    def data_item(self, name, *numbers):
+        return name.value, [n.value for n in numbers]
+
+    start = dict
+
+
+def test():
+    data = ['alice', 1, 27, 3, 'bob', 4, 'carrie', 'dan', 8, 6]
+
+    tree = parser.parse(data)
+    res = ParseToDict().transform(tree)
+
+    print(res)  # prints {'alice': [1, 27, 3], 'bob': [4], 'carrie': [], 'dan': [8, 6]}
+
+
+if __name__ == '__main__':
+    test()
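
A quick illustration of what the custom lexer produces may help when reading the example. The snippet below is an illustrative sketch, not part of the patch: it assumes the TypeLexer class defined above is in scope and shows that each object in the input list becomes one Token whose value is the original Python object.

# Sketch only -- assumes TypeLexer from the example above; lexer_conf is unused, so None is fine.
tokens = list(TypeLexer(None).lex(['alice', 1, 27, 3]))
assert [t.type for t in tokens] == ['STR', 'INT', 'INT', 'INT']
assert [t.value for t in tokens] == ['alice', 1, 27, 3]   # values stay as Python objects
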

@@ -8,7 +8,7 @@
 # the spaces (and tabs) after the newline.
 #

-from lark.lark import Lark
+from lark import Lark
 from lark.indenter import Indenter

 tree_grammar = r"""

@@ -10,7 +10,7 @@ from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf
-from .lexer import Lexer
+from .lexer import Lexer, TraditionalLexer
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend

@@ -142,7 +142,7 @@ class Lark:
         else:
            assert False, self.options.parser
        lexer = self.options.lexer
-        assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete')
+        assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete') or issubclass(lexer, Lexer)

         if self.options.ambiguity == 'auto':
             if self.options.parser == 'earley':

@@ -171,7 +171,7 @@ class Lark:
     __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

     def _build_lexer(self):
-        return Lexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
+        return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)

     def _build_parser(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)

@@ -168,6 +168,17 @@ def _regexp_has_newline(r):
     return '\n' in r or '\\n' in r or ('(?s' in r and '.' in r)

 class Lexer:
+    """Lexer interface
+
+    Method Signatures:
+        lex(self, stream) -> Iterator[Token]
+
+        set_parser_state(self, state)        # Optional
+    """
+    set_parser_state = NotImplemented
+    lex = NotImplemented
+
+class TraditionalLexer(Lexer):
     def __init__(self, tokens, ignore=(), user_callbacks={}):
         assert all(isinstance(t, TokenDef) for t in tokens), tokens

@@ -206,7 +217,7 @@ class Lexer:
         return _Lex(self).lex(stream, self.newline_types, self.ignore_types)

-class ContextualLexer:
+class ContextualLexer(Lexer):
     def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
         tokens_by_name = {}
         for t in tokens:

@@ -222,12 +233,12 @@ class ContextualLexer:
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
-                lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
+                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                 lexer_by_tokens[key] = lexer

             self.lexers[state] = lexer

-        self.root_lexer = Lexer(tokens, ignore=ignore, user_callbacks=user_callbacks)
+        self.root_lexer = TraditionalLexer(tokens, ignore=ignore, user_callbacks=user_callbacks)

         self.set_parser_state(None)  # Needs to be set on the outside
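
Note how the new Lexer base class exposes class-level NotImplemented sentinels instead of abstract methods, so callers can probe whether an optional hook such as set_parser_state was actually provided. A minimal sketch of that convention, assuming the patched lark.lexer module; the CharLexer name is invented for illustration.

from lark.lexer import Lexer, Token

class CharLexer(Lexer):             # hypothetical subclass, for illustration only
    def lex(self, stream):          # required: yield Token objects
        for ch in stream:
            yield Token('CHAR', ch)

sps = CharLexer().set_parser_state  # still the inherited class-level sentinel
if sps is NotImplemented:
    pass  # same check WithLexer.parse performs further down in this diff
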

@@ -10,7 +10,7 @@ from .lexer import Token
 from .parse_tree_builder import ParseTreeBuilder
-from .parser_frontends import LALR
+from .parser_frontends import LALR_TraditionalLexer
 from .common import LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
 from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
 from .utils import classify, suppress

@@ -568,7 +568,7 @@ class GrammarLoader:
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
         parser_conf = ParserConf(rules, callback, 'start')
-        self.parser = LALR(lexer_conf, parser_conf)
+        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)

         self.canonize_tree = CanonizeTree()

@@ -1,8 +1,9 @@
 import re
-from .utils import get_regexp_width
+from functools import partial
+from .utils import get_regexp_width
 from .parsers.grammar_analysis import GrammarAnalyzer
-from .lexer import Lexer, ContextualLexer, Token
+from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
 from .exceptions import GrammarError
 from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk

@@ -11,7 +12,7 @@ from .tree import Tree
 class WithLexer:
     def init_traditional_lexer(self, lexer_conf):
         self.lexer_conf = lexer_conf
-        self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
+        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)

     def init_contextual_lexer(self, lexer_conf, parser_conf):
         self.lexer_conf = lexer_conf

@@ -29,25 +30,27 @@ class WithLexer:
         else:
             return stream

+    def parse(self, text):
+        token_stream = self.lex(text)
+        sps = self.lexer.set_parser_state
+        return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])

-class LALR(WithLexer):
+class LALR_TraditionalLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         self.parser = lalr_parser.Parser(parser_conf)
         self.init_traditional_lexer(lexer_conf)

-    def parse(self, text):
-        token_stream = self.lex(text)
-        return self.parser.parse(token_stream)

 class LALR_ContextualLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         self.parser = lalr_parser.Parser(parser_conf)
         self.init_contextual_lexer(lexer_conf, parser_conf)

-    def parse(self, text):
-        token_stream = self.lex(text)
-        return self.parser.parse(token_stream, self.lexer.set_parser_state)

+class LALR_CustomLexer(WithLexer):
+    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
+        self.parser = lalr_parser.Parser(parser_conf)
+        self.lexer_conf = lexer_conf
+        self.lexer = lexer_cls(lexer_conf)

 def get_ambiguity_resolver(options):
     if not options or options.ambiguity == 'resolve':

@@ -77,10 +80,6 @@ class Earley(WithLexer):
     def match(self, term, token):
         return term.name == token.type

-    def parse(self, text):
-        tokens = self.lex(text)
-        return self.parser.parse(tokens)

 class XEarley:
     def __init__(self, lexer_conf, parser_conf, options=None, **kw):

@@ -161,9 +160,11 @@ def get_frontend(parser, lexer):
         if lexer is None:
             raise ValueError('The LALR parser requires use of a lexer')
         elif lexer == 'standard':
-            return LALR
+            return LALR_TraditionalLexer
         elif lexer == 'contextual':
             return LALR_ContextualLexer
+        elif issubclass(lexer, Lexer):
+            return partial(LALR_CustomLexer, lexer)
         else:
             raise ValueError('Unknown lexer: %s' % lexer)
     elif parser=='earley':
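
End to end, these frontend changes let a Lexer subclass be passed straight to Lark: get_frontend returns partial(LALR_CustomLexer, lexer), which the Lark constructor then calls with the usual lexer_conf and parser_conf. The sketch below assumes the patched library; the WordLexer class and the toy grammar are invented for illustration.

from lark import Lark
from lark.lexer import Lexer, Token

class WordLexer(Lexer):                 # hypothetical custom lexer
    def __init__(self, lexer_conf):
        pass
    def lex(self, text):
        for word in text.split():
            yield Token('WORD', word)

# parser='lalr' plus a Lexer subclass now selects the LALR_CustomLexer frontend
parser = Lark("""
    start: WORD+
    %declare WORD
""", parser='lalr', lexer=WordLexer)

print(parser.parse("custom lexers are plain classes"))
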