@@ -1,4 +1,5 @@
 from __future__ import absolute_import
+from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
 import sys, os, pickle, hashlib
 from io import open
@@ -9,7 +10,7 @@ from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf
-from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken
+from .lexer import Lexer, TraditionalLexer, TerminalDef
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend, _get_lexer_callbacks
 from .grammar import Rule
@@ -462,7 +463,7 @@ class Lark(Serialize):
         try:
             return self.parser.parse(text, start=start)
-        except UnexpectedToken as e:
+        except UnexpectedInput as e:
             if on_error is None:
                 raise
@@ -472,10 +473,12 @@ class Lark(Serialize):
                 try:
                     return e.puppet.resume_parse()
                 except UnexpectedToken as e2:
-                    if e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
+                    if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet:
                         # Prevent infinite loop
                         raise e2
                     e = e2
+                except UnexpectedCharacters as e2:
+                    e = e2
 
 ###}
@@ -157,6 +157,8 @@ class Token(Str):
 
 class LineCounter:
+    __slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char'
+
     def __init__(self, newline_char):
         self.newline_char = newline_char
         self.char_pos = 0
@@ -167,7 +169,7 @@ class LineCounter:
     def feed(self, token, test_newline=True):
         """Consume a token and calculate the new line & column.
 
-        As an optional optimization, set test_newline=False is token doesn't contain a newline.
+        As an optional optimization, set test_newline=False if token doesn't contain a newline.
         """
         if test_newline:
             newlines = token.count(self.newline_char)
@@ -243,7 +245,6 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes)
         except AssertionError: # Yes, this is what Python provides us.. :/
             return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)
 
-        # terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
         terminals = terminals[max_size:]
     return mres
@@ -269,6 +270,10 @@ class Lexer(object):
     """
     lex = NotImplemented
 
+    def make_lexer_state(self, text):
+        line_ctr = LineCounter(b'\n' if isinstance(text, bytes) else '\n')
+        return LexerState(text, line_ctr)
+
 
 class TraditionalLexer(Lexer):
@@ -328,26 +333,21 @@ class TraditionalLexer(Lexer):
         if m:
             return m.group(0), type_from_index[m.lastindex]
 
-    def make_lexer_state(self, text):
-        line_ctr = LineCounter('\n' if not self.use_bytes else b'\n')
-        return LexerState(text, line_ctr)
-
-    def lex(self, text):
-        state = self.make_lexer_state(text)
+    def lex(self, state, parser_state):
         with suppress(EOFError):
             while True:
                 yield self.next_token(state)
 
     def next_token(self, lex_state):
-        text = lex_state.text
         line_ctr = lex_state.line_ctr
-        while line_ctr.char_pos < len(text):
-            res = self.match(text, line_ctr.char_pos)
+        while line_ctr.char_pos < len(lex_state.text):
+            res = self.match(lex_state.text, line_ctr.char_pos)
             if not res:
                 allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
                 if not allowed:
                     allowed = {"<END-OF-FILE>"}
-                raise UnexpectedCharacters(text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
+                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
+                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
 
             value, type_ = res
@@ -373,11 +373,15 @@ class TraditionalLexer(Lexer):
         raise EOFError(self)
 
 
 class LexerState:
+    __slots__ = 'text', 'line_ctr', 'last_token'
+
     def __init__(self, text, line_ctr, last_token=None):
         self.text = text
         self.line_ctr = line_ctr
         self.last_token = last_token
 
+    def __copy__(self):
+        return type(self)(self.text, copy(self.line_ctr), self.last_token)
+
 
 class ContextualLexer(Lexer):
@@ -410,24 +414,29 @@ class ContextualLexer(Lexer):
         assert trad_conf.tokens is terminals
         self.root_lexer = TraditionalLexer(trad_conf)
 
-    def lex(self, text, get_parser_state):
-        state = self.root_lexer.make_lexer_state(text)
+    def make_lexer_state(self, text):
+        return self.root_lexer.make_lexer_state(text)
+
+    def lex(self, lexer_state, parser_state):
         try:
             while True:
-                lexer = self.lexers[get_parser_state()]
-                yield lexer.next_token(state)
+                lexer = self.lexers[parser_state.position]
+                yield lexer.next_token(lexer_state)
         except EOFError:
             pass
         except UnexpectedCharacters as e:
-            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
-            # but not in the current context.
+            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
             # This tests the input against the global context, to provide a nicer error.
-            root_match = self.root_lexer.match(text, e.pos_in_stream)
-            if not root_match:
-                raise
-
-            value, type_ = root_match
-            t = Token(type_, value, e.pos_in_stream, e.line, e.column)
-            raise UnexpectedToken(t, e.allowed, state=get_parser_state())
+            token = self.root_lexer.next_token(lexer_state)
+            raise UnexpectedToken(token, e.allowed, state=parser_state.position)
+
+
+class LexerThread:
+    "A thread that ties a lexer instance and a lexer state, to be used by the parser"
+
+    def __init__(self, lexer, text):
+        self.lexer = lexer
+        self.state = lexer.make_lexer_state(text)
+
+    def lex(self, parser_state):
+        return self.lexer.lex(self.state, parser_state)
 
 ###}
@@ -1,6 +1,6 @@
 from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
-from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
+from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
 from .grammar import Rule
@@ -23,12 +23,18 @@ def get_frontend(parser, lexer):
         elif lexer == 'contextual':
             return LALR_ContextualLexer
         elif issubclass(lexer, Lexer):
+            class CustomLexerWrapper(Lexer):
+                def __init__(self, lexer_conf):
+                    self.lexer = lexer(lexer_conf)
+                def lex(self, lexer_state, parser_state):
+                    return self.lexer.lex(lexer_state.text)
+
             class LALR_CustomLexerWrapper(LALR_CustomLexer):
                 def __init__(self, lexer_conf, parser_conf, options=None):
                     super(LALR_CustomLexerWrapper, self).__init__(
                         lexer, lexer_conf, parser_conf, options=options)
                 def init_lexer(self):
-                    self.lexer = lexer(self.lexer_conf)
+                    self.lexer = CustomLexerWrapper(self.lexer_conf)
             return LALR_CustomLexerWrapper
         else:
@@ -54,7 +60,7 @@ def get_frontend(parser, lexer):
 
 
 class _ParserFrontend(Serialize):
-    def _parse(self, input, start, *args):
+    def _parse(self, start, input, *args):
         if start is None:
             start = self.start
             if len(start) > 1:
@@ -71,6 +77,18 @@ def _get_lexer_callbacks(transformer, terminals):
             result[terminal.name] = callback
     return result
 
+class PostLexConnector:
+    def __init__(self, lexer, postlexer):
+        self.lexer = lexer
+        self.postlexer = postlexer
+
+    def make_lexer_state(self, text):
+        return self.lexer.make_lexer_state(text)
+
+    def lex(self, lexer_state, parser_state):
+        i = self.lexer.lex(lexer_state, parser_state)
+        return self.postlexer.process(i)
+
 
 class WithLexer(_ParserFrontend):
     lexer = None
@@ -106,13 +124,14 @@ class WithLexer(_ParserFrontend):
     def _serialize(self, data, memo):
         data['parser'] = data['parser'].serialize(memo)
 
-    def lex(self, *args):
-        stream = self.lexer.lex(*args)
-        return self.postlex.process(stream) if self.postlex else stream
+    def make_lexer(self, text):
+        lexer = self.lexer
+        if self.postlex:
+            lexer = PostLexConnector(self.lexer, self.postlex)
+        return LexerThread(lexer, text)
 
     def parse(self, text, start=None):
-        token_stream = self.lex(text)
-        return self._parse(token_stream, start)
+        return self._parse(start, self.make_lexer(text))
 
     def init_traditional_lexer(self):
         self.lexer = TraditionalLexer(self.lexer_conf)
@@ -138,14 +157,6 @@ class LALR_ContextualLexer(LALR_WithLexer):
         always_accept = self.postlex.always_accept if self.postlex else ()
         self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)
 
-    def parse(self, text, start=None):
-        parser_state = [None]
-        def set_parser_state(s):
-            parser_state[0] = s
-        token_stream = self.lex(text, lambda: parser_state[0])
-        return self._parse(token_stream, start, set_parser_state)
-
 ###}
 
 class LALR_CustomLexer(LALR_WithLexer):
@@ -156,15 +167,6 @@ class LALR_CustomLexer(LALR_WithLexer):
         WithLexer.__init__(self, lexer_conf, parser_conf, options)
 
 
-def tokenize_text(text):
-    line = 1
-    col_start_pos = 0
-    for i, ch in enumerate(text):
-        if '\n' in ch:
-            line += ch.count('\n')
-            col_start_pos = i + ch.rindex('\n')
-        yield Token('CHAR', ch, line=line, column=i - col_start_pos)
-
 
 class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         WithLexer.__init__(self, lexer_conf, parser_conf, options)
@@ -175,6 +177,9 @@ class Earley(WithLexer):
         tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
         self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)
 
+    def make_lexer(self, text):
+        return WithLexer.make_lexer(self, text).lex(None)
+
     def match(self, term, token):
         return term.name == token.type
@@ -219,7 +224,7 @@ class XEarley(_ParserFrontend):
             self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)
 
     def parse(self, text, start):
-        return self._parse(text, start)
+        return self._parse(start, text)
 
 class XEarley_CompleteLex(XEarley):
     def __init__(self, *args, **kw):
@@ -239,8 +244,8 @@ class CYK(WithLexer):
         self.callbacks = parser_conf.callbacks
 
     def parse(self, text, start):
-        tokens = list(self.lex(text))
-        parse = self._parse(tokens, start)
+        tokens = list(self.make_lexer(text).lex(None))
+        parse = self._parse(start, tokens)
         parse = self._transform(parse)
         return parse
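
Note (not part of the patch): a minimal sketch of how the re-wired frontend is exercised through the public API after this change. Per the hunks above, WithLexer.parse() now builds a LexerThread via make_lexer() and passes it to _parse(start, input, ...), and the contextual lexer reads parser_state.position on every token instead of a get_parser_state callback. The grammar and variable names below are illustrative only.

    # Sketch only, assuming this patch is applied; names here are illustrative.
    from lark import Lark

    demo = Lark(r'''
        start: "[" NUMBER ("," NUMBER)* "]"
        %import common.NUMBER
        %ignore " "
    ''', parser='lalr', lexer='contextual')

    # Lark.parse() -> WithLexer.parse() -> _parse(start, LexerThread(lexer, text))
    # -> LALR_Parser.parse(lexer_thread, start), which drives the lexer with the
    #    live parser state (see lalr_parser.py below).
    print(demo.parse("[1, 2, 3]").pretty())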
@@ -2,9 +2,9 @@
 """
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com
-from ..exceptions import UnexpectedToken
+from copy import deepcopy
+from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
 from ..lexer import Token
-from ..utils import Enumerator, Serialize
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 from .lalr_puppet import ParserPuppet
@@ -35,84 +35,116 @@ class LALR_Parser(object):
         return self.parser.parse(*args)
 
 
-class _Parser:
-    def __init__(self, parse_table, callbacks, debug=False):
-        self.parse_table = parse_table
-        self.callbacks = callbacks
-        self.debug = debug
-
-    def parse(self, seq, start, set_state=None, value_stack=None, state_stack=None):
-        token = None
-        stream = iter(seq)
-        states = self.parse_table.states
-        start_state = self.parse_table.start_states[start]
-        end_state = self.parse_table.end_states[start]
-
-        state_stack = state_stack or [start_state]
-        value_stack = value_stack or []
-
-        if set_state: set_state(start_state)
-
-        def get_action(token):
+class ParserState:
+    __slots__ = 'parse_table', 'lexer', 'callbacks', 'start', 'state_stack', 'value_stack', 'start_state', 'end_state', 'states'
+
+    def __init__(self, parse_table, lexer, callbacks, start, state_stack=None, value_stack=None):
+        self.parse_table = parse_table
+
+        self.start_state = self.parse_table.start_states[start]
+        self.end_state = self.parse_table.end_states[start]
+        self.states = self.parse_table.states
+
+        self.lexer = lexer
+        self.callbacks = callbacks
+        self.start = start
+        self.state_stack = state_stack or [self.start_state]
+        self.value_stack = value_stack or []
+
+    @property
+    def position(self):
+        return self.state_stack[-1]
+
+    def __copy__(self):
+        return type(self)(
+            self.parse_table,
+            self.lexer, # XXX copy
+            self.callbacks,
+            self.start,
+            list(self.state_stack),
+            deepcopy(self.value_stack),
+        )
+
+    def feed_token(self, token, is_end=False):
+        state_stack = self.state_stack
+        value_stack = self.value_stack
+        states = self.states
+
+        while True:
             state = state_stack[-1]
             try:
-                return states[state][token.type]
+                action, arg = states[state][token.type]
             except KeyError:
                 expected = {s for s in states[state].keys() if s.isupper()}
-                try:
-                    puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
-                except NameError: # For standalone parser
-                    puppet = None
-                raise UnexpectedToken(token, expected, state=state, puppet=puppet)
-
-        def reduce(rule):
-            size = len(rule.expansion)
-            if size:
-                s = value_stack[-size:]
-                del state_stack[-size:]
-                del value_stack[-size:]
+                raise UnexpectedToken(token, expected, state=state, puppet=None)
+
+            assert arg != self.end_state
+
+            if action is Shift:
+                # shift once and return
+                assert not is_end
+                state_stack.append(arg)
+                value_stack.append(token)
+                return arg
             else:
-                s = []
-
-            value = self.callbacks[rule](s)
-
-            _action, new_state = states[state_stack[-1]][rule.origin.name]
-            assert _action is Shift
-            state_stack.append(new_state)
-            value_stack.append(value)
+                # reduce+shift as many times as necessary
+                rule = arg
+                size = len(rule.expansion)
+                if size:
+                    s = value_stack[-size:]
+                    del state_stack[-size:]
+                    del value_stack[-size:]
+                else:
+                    s = []
+
+                value = self.callbacks[rule](s)
+
+                _action, new_state = states[state_stack[-1]][rule.origin.name]
+                assert _action is Shift
+                state_stack.append(new_state)
+                value_stack.append(value)
+
+                if is_end and state_stack[-1] == self.end_state:
+                    return value_stack[-1]
+
+
+class _Parser:
+    def __init__(self, parse_table, callbacks, debug=False):
+        self.parse_table = parse_table
+        self.callbacks = callbacks
+        self.debug = debug
+
+    def parse(self, lexer, start, value_stack=None, state_stack=None):
+        parser_state = ParserState(self.parse_table, lexer, self.callbacks, start, state_stack, value_stack)
+        return self.parse_from_state(parser_state)
 
+    def parse_from_state(self, state):
         # Main LALR-parser loop
         try:
-            for token in stream:
-                while True:
-                    action, arg = get_action(token)
-                    assert arg != end_state
-                    if action is Shift:
-                        state_stack.append(arg)
-                        value_stack.append(token)
-                        if set_state: set_state(arg)
-                        break # next token
-                    else:
-                        reduce(arg)
+            token = None
+            for token in state.lexer.lex(state):
+                state.feed_token(token)
+
+            token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
+            return state.feed_token(token, True)
+        except UnexpectedInput as e:
+            try:
+                e.puppet = ParserPuppet(self, state, state.lexer)
+            except NameError:
+                pass
+            if isinstance(e, UnexpectedCharacters):
+                s = state.lexer.state
+                p = s.line_ctr.char_pos
+                s.line_ctr.feed(s.text[p:p+1])
+            raise e
         except Exception as e:
             if self.debug:
                 print("")
                 print("STATE STACK DUMP")
                 print("----------------")
-                for i, s in enumerate(state_stack):
+                for i, s in enumerate(state.state_stack):
                     print('%d)' % i , s)
                 print("")
             raise
-
-        token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
-        while True:
-            _action, arg = get_action(token)
-            assert(_action is Reduce)
-            reduce(arg)
-            if state_stack[-1] == end_state:
-                return value_stack[-1]
 ###}
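
Note (not part of the patch): a small sketch of the error-handling surface this refactor exposes. Because _Parser.parse_from_state() attaches a ParserPuppet wrapping the live ParserState to any UnexpectedInput, the caller can inspect or resume the parse. The grammar is illustrative, and the printed terminal name 'B' is an assumption about how Lark names anonymous string terminals.

    # Sketch only, assuming this patch is applied.
    from lark import Lark
    from lark.exceptions import UnexpectedToken

    p = Lark('start: "a" "b"', parser='lalr')
    try:
        p.parse("a")                # input ends before "b" -> UnexpectedToken at $END
    except UnexpectedToken as e:
        # e.puppet wraps the current ParserState; accepts() probes feed_token()
        # on copies of that state to list terminals the parser could take here.
        print(e.puppet.accepts())   # expected: {'B'}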
@@ -1,10 +1,10 @@
 # This module provide a LALR puppet, which is used to debugging and error handling
 
-from copy import deepcopy
+from copy import copy
 
 from .lalr_analysis import Shift, Reduce
 from .. import Token
-from ..exceptions import ParseError
+from ..exceptions import UnexpectedToken
 
 
 class ParserPuppet(object):
@@ -12,96 +12,44 @@ class ParserPuppet(object):
     For a simpler, more streamlined interface, see the ``on_error`` argument to ``Lark.parse()``.
     """
-    def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
+    def __init__(self, parser, parser_state, lexer_state):
         self.parser = parser
-        self._state_stack = state_stack
-        self._value_stack = value_stack
-        self._start = start
-        self._stream = stream
-        self._set_state = set_state
-
-        self.result = None
+        self.parser_state = parser_state
+        self.lexer_state = lexer_state
 
     def feed_token(self, token):
         """Feed the parser with a token, and advance it to the next state, as if it recieved it from the lexer.
 
         Note that ``token`` has to be an instance of ``Token``.
         """
-        end_state = self.parser.parse_table.end_states[self._start]
-        state_stack = self._state_stack
-        value_stack = self._value_stack
-
-        state = state_stack[-1]
-        action, arg = self.parser.parse_table.states[state][token.type]
-        if arg == end_state:
-            raise ParseError(arg)
-
-        while action is Reduce:
-            rule = arg
-            size = len(rule.expansion)
-            if size:
-                s = value_stack[-size:]
-                del state_stack[-size:]
-                del value_stack[-size:]
-            else:
-                s = []
-
-            value = self.parser.callbacks[rule](s)
-
-            _action, new_state = self.parser.parse_table.states[state_stack[-1]][rule.origin.name]
-            assert _action is Shift
-            state_stack.append(new_state)
-            value_stack.append(value)
-
-            if state_stack[-1] == end_state:
-                self.result = value_stack[-1]
-                return self.result
-
-            state = state_stack[-1]
-            try:
-                action, arg = self.parser.parse_table.states[state][token.type]
-            except KeyError as e:
-                raise ParseError(e)
-
-            assert arg != end_state
-
-        assert action is Shift
-        state_stack.append(arg)
-        value_stack.append(token)
-
-    def copy(self):
+        return self.parser_state.feed_token(token)
+
+    def __copy__(self):
| """Create a new puppet with a separate state. | """Create a new puppet with a separate state. | ||||
| Calls to feed_token() won't affect the old puppet, and vice-versa. | Calls to feed_token() won't affect the old puppet, and vice-versa. | ||||
| """ | """ | ||||
| return type(self)( | return type(self)( | ||||
| self.parser, | self.parser, | ||||
| list(self._state_stack), | |||||
| deepcopy(self._value_stack), | |||||
| self._start, | |||||
| self._stream, | |||||
| self._set_state, | |||||
| copy(self.parser_state), | |||||
| copy(self.lexer_state), | |||||
| ) | ) | ||||
| def __eq__(self, other): | def __eq__(self, other): | ||||
| if not isinstance(other, ParserPuppet): | if not isinstance(other, ParserPuppet): | ||||
| return False | return False | ||||
| return ( | |||||
| self._state_stack == other._state_stack and | |||||
| self._value_stack == other._value_stack and | |||||
| self._stream == other._stream and | |||||
| self._start == other._start | |||||
| ) | |||||
| return self.parser_state == other.parser_state and self.lexer_state == other.lexer_state | |||||
| def __hash__(self): | def __hash__(self): | ||||
| return hash((tuple(self._state_stack), self._start)) | |||||
| return hash((self.parser_state, self.lexer_state)) | |||||
| def pretty(self): | def pretty(self): | ||||
| """Print the output of ``choices()`` in a way that's easier to read.""" | """Print the output of ``choices()`` in a way that's easier to read.""" | ||||
| out = ["Puppet choices:"] | out = ["Puppet choices:"] | ||||
| for k, v in self.choices().items(): | for k, v in self.choices().items(): | ||||
| out.append('\t- %s -> %s' % (k, v)) | out.append('\t- %s -> %s' % (k, v)) | ||||
| out.append('stack size: %s' % len(self._state_stack)) | |||||
| out.append('stack size: %s' % len(self.parser_state.state_stack)) | |||||
| return '\n'.join(out) | return '\n'.join(out) | ||||
| def choices(self): | def choices(self): | ||||
@@ -111,16 +59,16 @@ class ParserPuppet(object):
         Updated by ``feed_token()``.
         """
-        return self.parser.parse_table.states[self._state_stack[-1]]
+        return self.parser_state.parse_table.states[self.parser_state.position]
 
     def accepts(self):
         accepts = set()
         for t in self.choices():
             if t.isupper(): # is terminal?
-                new_puppet = self.copy()
+                new_puppet = copy(self)
                 try:
                     new_puppet.feed_token(Token(t, ''))
-                except ParseError:
+                except UnexpectedToken:
                     pass
                 else:
                     accepts.add(t)
@@ -128,7 +76,4 @@ class ParserPuppet(object):
 
     def resume_parse(self):
         """Resume parsing from the current puppet state."""
-        return self.parser.parse(
-            self._stream, self._start, self._set_state,
-            self._value_stack, self._state_stack
-        )
+        return self.parser.parse_from_state(self.parser_state)
@@ -2217,6 +2217,42 @@ def _make_parser_test(LEXER, PARSER):
                """, regex=True)
            self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
 
+        @unittest.skipIf(PARSER!='lalr', "Puppet error handling only works with LALR for now")
+        def test_error_with_puppet(self):
+            def ignore_errors(e):
+                if isinstance(e, UnexpectedCharacters):
+                    # Skip bad character
+                    return True
+
+                # Must be UnexpectedToken
+                if e.token.type == 'COMMA':
+                    # Skip comma
+                    return True
+                elif e.token.type == 'SIGNED_NUMBER':
+                    # Try to feed a comma and retry the number
+                    e.puppet.feed_token(Token('COMMA', ','))
+                    e.puppet.feed_token(e.token)
+                    return True
+
+                # Unhandled error. Will stop parse and raise exception
+                return False
+
+            g = _Lark(r'''
+                start: "[" num ("," num)* "]"
+                ?num: SIGNED_NUMBER
+                %import common.SIGNED_NUMBER
+                %ignore " "
+            ''')
+
+            s = "[0 1, 2,, 3,,, 4, 5 6 ]"
+            tree = g.parse(s, on_error=ignore_errors)
+            res = [int(x) for x in tree.children]
+            assert res == list(range(7))
+
+            s = "[0 1, 2,@, 3,,, 4, 5 6 ]$"
+            tree = g.parse(s, on_error=ignore_errors)
+
 
     _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
 
     _TestParser.__name__ = _NAME
     _TestParser.__qualname__ = "tests.test_parser." + _NAME