Also fixes some issues with python_parser.
@@ -3,6 +3,7 @@
 import re

 from .utils import Str, classify
+from .common import is_terminal

 class LexError(Exception):
     pass
@@ -169,3 +170,64 @@ class Lexer(object):
                 if lex_pos < len(stream):
                     raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                 break
+
+
+class ContextualLexer:
+    def __init__(self, tokens, states, ignore=()):
+        tokens_by_name = {}
+        for t in tokens:
+            assert t.name not in tokens_by_name
+            tokens_by_name[t.name] = t
+
+        lexer_by_tokens = {}
+        self.lexers = {}
+        for state, accepts in states.items():
+            key = frozenset(accepts)
+            try:
+                lexer = lexer_by_tokens[key]
+            except KeyError:
+                accepts = list(accepts) # For python3
+                accepts += ignore
+                # if '_NEWLINE' in tokens_by_name and '_NEWLINE' not in accepts:
+                #     accepts.append('_NEWLINE')   # XXX hack for now
+                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
+                lexer = Lexer(state_tokens, ignore=ignore)
+                lexer_by_tokens[key] = lexer
+
+            self.lexers[state] = lexer
+
+        self.root_lexer = Lexer(tokens, ignore=ignore)
+
+    def lex(self, stream, parser):
+        lex_pos = 0
+        line = 1
+        col_start_pos = 0
+        newline_types = list(self.root_lexer.newline_types)
+        ignore_types = list(self.root_lexer.ignore_types)
+        while True:
+            lexer = self.lexers[parser.state]
+            for mre, type_from_index in lexer.mres:
+                m = mre.match(stream, lex_pos)
+                if m:
+                    value = m.group(0)
+                    type_ = type_from_index[m.lastindex]
+                    if type_ not in ignore_types:
+                        t = Token(type_, value, lex_pos)
+                        t.line = line
+                        t.column = lex_pos - col_start_pos
+                        if t.type in lexer.callback:
+                            t = lexer.callback[t.type](t)
+                        yield t
+                    if type_ in newline_types:
+                        newlines = value.count(lexer.newline_char)
+                        if newlines:
+                            line += newlines
+                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)
+                    lex_pos += len(value)
+                    break
+            else:
+                if lex_pos < len(stream):
+                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
+                break
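The class above is the heart of the change: instead of one lexer for the whole input, it builds a Lexer per parser state, restricted to the terminals that state can actually accept (plus the ignored ones), deduplicated by frozenset key, and lex() re-selects the right sub-lexer on every iteration by reading parser.state. A toy sketch of the principle, not lark's API; the token table and state map below are invented for illustration:

import re

# Hypothetical terminal definitions: name -> pattern.
TOKENS = {
    'IF': r'if',
    'NAME': r'[a-z]+',
}

# Hypothetical state -> accepted-terminals map, playing the role of the
# `states` argument that the LALR analyzer supplies in the diff above.
STATES = {
    0: ['IF', 'NAME'],   # statement position: try the keyword first
    1: ['NAME'],         # expression position: 'if' may lex as NAME
}

def lex_one(stream, pos, state):
    # Only the terminals legal in the current parser state are tried.
    for name in STATES[state]:
        m = re.match(TOKENS[name], stream[pos:])
        if m:
            return name, m.group(0)
    raise ValueError('no terminal matches at %d in state %d' % (pos, state))

print(lex_one('if x', 0, 0))   # -> ('IF', 'if')
print(lex_one('if x', 0, 1))   # -> ('NAME', 'if')

The same input prefix lexes differently depending on the automaton state, which is exactly the ambiguity a single context-free lexer cannot resolve.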
@@ -1,7 +1,7 @@
 import re
 import sre_parse

-from .lexer import Lexer
+from .lexer import Lexer, ContextualLexer
 from .parsers.lalr_analysis import GrammarAnalyzer

 from .common import is_terminal, GrammarError
@@ -31,6 +31,25 @@ class LALR(WithLexer):
         tokens = list(self.lex(text))
         return self.parser.parse(tokens)

+
+class LALR_ContextualLexer:
+    def __init__(self, lexer_conf, parser_conf):
+        self.lexer_conf = lexer_conf
+        self.parser_conf = parser_conf
+
+        self.analyzer = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
+        self.analyzer.analyze()
+
+        d = {idx:t.keys() for idx, t in self.analyzer.states_idx.items()}
+        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore)
+
+    def parse(self, text):
+        parser = lalr_parser.Parser(self.analyzer, self.parser_conf.callback)
+        l = self.lexer.lex(text, parser)
+        return parser.parse(l, True)
+
+
 class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf):
         WithLexer.__init__(self, lexer_conf)
@@ -82,4 +101,4 @@ class Earley_NoLex:
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
         return res[0]

-ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex }
+ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex, 'lalr_contextual_lexer': LALR_ContextualLexer }
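With the new ENGINE_DICT entry, the contextual-lexer engine becomes selectable by name, just as the test loop at the bottom of this diff selects it. A minimal usage sketch, assuming the name flows through the Lark constructor's `parser` option like the existing 'lalr' and 'earley' names; the one-rule grammar is illustrative only and may not match this revision's exact grammar syntax:

from lark import Lark

# 'lalr_contextual_lexer' is the key added to ENGINE_DICT above.
parser = Lark('start: "a"+', parser='lalr_contextual_lexer')
print(parser.parse('aaa'))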
@@ -7,13 +7,14 @@ class Parser(object):
         self.analysis = analysis
         self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
                           for rule in analysis.rules}
+        self.state = self.analysis.init_state_idx

-    def parse(self, seq):
+    def parse(self, stream, set_state=False):    # XXX no set_state
+        stream = iter(stream)
         states_idx = self.analysis.states_idx

         state_stack = [self.analysis.init_state_idx]
         value_stack = []
-        i = 0

         def get_action(key):
             state = state_stack[-1]
@@ -21,13 +22,8 @@ class Parser(object):
             try:
                 return states_idx[state][key]
             except KeyError:
                 expected = states_idx[state].keys()
-                try:
-                    token = seq[i]
-                except IndexError:
-                    assert key == '$end'
-                    token = seq[-1]
-                raise UnexpectedToken(token, expected, seq, i)
+                raise UnexpectedToken(token, expected, [], 0)

         def reduce(rule, size):
             if size:
@@ -48,15 +44,20 @@ class Parser(object):
             value_stack.append(res)

         # Main LALR-parser loop
-        while i < len(seq):
-            action, arg = get_action(seq[i].type)
-
-            if action == ACTION_SHIFT:
-                state_stack.append(arg)
-                value_stack.append(seq[i])
-                i+= 1
-            else:
-                reduce(*arg)
+        try:
+            token = next(stream)
+            while True:
+                action, arg = get_action(token.type)
+
+                if action == ACTION_SHIFT:
+                    state_stack.append(arg)
+                    value_stack.append(token)
+                    if set_state: self.state = arg
+                    token = next(stream)
+                else:
+                    reduce(*arg)
+        except StopIteration:
+            pass

         while True:
             _action, rule = get_action('$end')
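The reworked loop is what closes the feedback circle: parse() now pulls tokens lazily with next(stream) and, when set_state is true, writes every shifted state back to self.state, which is the attribute ContextualLexer.lex reads before cutting the next token. A toy illustration of that producer/consumer coupling, using made-up classes rather than lark's:

class ToyParser:
    def __init__(self):
        self.state = 0

    def parse(self, stream):
        for token in stream:        # pulls one token at a time
            print('shifting', token)
            self.state += 1         # stand-in for a SHIFT updating self.state

def toy_lexer(parser):
    # Each yield observes the state left behind by the previous shift.
    for _ in range(3):
        yield 'token-cut-in-state-%d' % parser.state

p = ToyParser()
p.parse(toy_lexer(p))
# shifting token-cut-in-state-0
# shifting token-cut-in-state-1
# shifting token-cut-in-state-2

With an eagerly built list of tokens this would be impossible: the whole input would already be lexed before the first SHIFT ever ran.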
@@ -5,7 +5,7 @@ import logging

 from .test_trees import TestTrees
 # from .test_selectors import TestSelectors
-from .test_parser import TestLalr, TestEarley, TestParsers
+from .test_parser import TestLalr, TestEarley, TestLalr_contextual_lexer, TestParsers
 # from .test_grammars import TestPythonG, TestConfigG

 logging.basicConfig(level=logging.INFO)
@@ -356,11 +356,10 @@ def _make_parser_test(PARSER):
     _TestParser.__name__ = _NAME
     globals()[_NAME] = _TestParser

-for PARSER in ['lalr', 'earley']:
+for PARSER in ['lalr', 'earley', 'lalr_contextual_lexer']:
     _make_parser_test(PARSER)

 if __name__ == '__main__':
     unittest.main()