From 5c8a25c7333ea685d5816dea186f0cf389d6d7f5 Mon Sep 17 00:00:00 2001 From: pwwang Date: Tue, 30 Jun 2020 18:18:49 -0500 Subject: [PATCH 01/25] Avoid using root logger --- docs/how_to_use.md | 7 ++-- lark/__init__.py | 1 + lark/common.py | 7 ++++ lark/lark.py | 8 ++-- lark/parsers/earley.py | 4 +- lark/parsers/lalr_analysis.py | 6 +-- tests/__main__.py | 7 +++- tests/test_logger.py | 65 ++++++++++++++++++++++++++++++ tests/test_nearley/test_nearley.py | 7 ++-- tests/test_parser.py | 3 +- 10 files changed, 97 insertions(+), 18 deletions(-) create mode 100644 tests/test_logger.py diff --git a/docs/how_to_use.md b/docs/how_to_use.md index 886b440..78f4df2 100644 --- a/docs/how_to_use.md +++ b/docs/how_to_use.md @@ -30,12 +30,13 @@ Use the reference pages for more in-depth explanations. (links in the [main page ## LALR usage -By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings pass `debug=True`. To get the messages printed you have to configure `logging` framework beforehand. For example: +By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings pass `debug=True`. To get the messages printed you have to configure the `LOGGER` beforehand. For example: ```python -from lark import Lark import logging -logging.basicConfig(level=logging.DEBUG) +from lark import Lark, LOGGER + +LOGGER.setLevel(logging.DEBUG) collision_grammar = ''' start: as as diff --git a/lark/__init__.py b/lark/__init__.py index 9e50691..e4c54dd 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -1,3 +1,4 @@ +from .common import LOGGER from .tree import Tree from .visitors import Transformer, Visitor, v_args, Discard from .visitors import InlineTransformer, inline_args # XXX Deprecated diff --git a/lark/common.py b/lark/common.py index c44f9ce..aac9d75 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,6 +1,13 @@ +import logging from .utils import Serialize from .lexer import TerminalDef +LOGGER = logging.getLogger("LARK") +LOGGER.addHandler(logging.StreamHandler()) +# Set to highest level, since we have some warnings amongst the code +# By default, we should not output any log messages +LOGGER.setLevel(logging.CRITICAL) + ###{standalone class LexerConf(Serialize): diff --git a/lark/lark.py b/lark/lark.py index 2b783cb..8df2b87 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -1,13 +1,13 @@ from __future__ import absolute_import -import sys, os, pickle, hashlib, logging +import sys, os, pickle, hashlib from io import open from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS from .load_grammar import load_grammar from .tree import Tree -from .common import LexerConf, ParserConf +from .common import LexerConf, ParserConf, LOGGER from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken from .parse_tree_builder import ParseTreeBuilder @@ -205,7 +205,7 @@ class Lark(Serialize): cache_fn = '.lark_cache_%s.tmp' % md5 if FS.exists(cache_fn): - logging.debug('Loading grammar from cache: %s', cache_fn) + LOGGER.debug('Loading grammar from cache: %s', cache_fn) with FS.open(cache_fn, 'rb') as f: self._load(f, self.options.transformer, self.options.postlex) return @@ -284,7 +284,7 @@ class Lark(Serialize): self.lexer = self._build_lexer() if cache_fn: - logging.debug('Saving grammar to cache: %s', cache_fn) + LOGGER.debug('Saving grammar to cache: %s', cache_fn) with FS.open(cache_fn, 'wb') as f: self.save(f) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 59e9a06..5fc7531 100644 --- a/lark/parsers/earley.py +++ 
b/lark/parsers/earley.py @@ -10,11 +10,11 @@ is better documented here: http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ """ -import logging from collections import deque from ..visitors import Transformer_InPlace, v_args from ..exceptions import UnexpectedEOF, UnexpectedToken +from ..common import LOGGER from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal from .earley_common import Item, TransitiveItem @@ -301,7 +301,7 @@ class Parser: try: debug_walker = ForestToPyDotVisitor() except ImportError: - logging.warning("Cannot find dependency 'pydot', will not generate sppf debug image") + LOGGER.warning("Cannot find dependency 'pydot', will not generate sppf debug image") else: debug_walker.visit(solutions[0], "sppf.png") diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 8890c3c..6fefa4c 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -6,11 +6,11 @@ For now, shift/reduce conflicts are automatically resolved as shifts. # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -import logging from collections import defaultdict, deque from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator from ..exceptions import GrammarError +from ..common import LOGGER from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet from ..grammar import Rule @@ -256,8 +256,8 @@ class LALR_Analyzer(GrammarAnalyzer): raise GrammarError('Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t\t- ' + str(r) for r in rules ]))) if la in actions: if self.debug: - logging.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name) - logging.warning(' * %s', list(rules)[0]) + LOGGER.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name) + LOGGER.warning(' * %s', list(rules)[0]) else: actions[la] = (Reduce, list(rules)[0]) m[state] = { k.name: v for k, v in actions.items() } diff --git a/tests/__main__.py b/tests/__main__.py index cb26eb4..1807aa8 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function import unittest import logging +from lark import LOGGER from .test_trees import TestTrees from .test_tools import TestStandalone @@ -11,11 +12,13 @@ from .test_reconstructor import TestReconstructor try: from .test_nearley.test_nearley import TestNearley except ImportError: - logging.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)") + LOGGER.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)") # from .test_selectors import TestSelectors # from .test_grammars import TestPythonG, TestConfigG +from .test_logger import TestLogger + from .test_parser import ( TestLalrStandard, TestEarleyStandard, @@ -31,7 +34,7 @@ from .test_parser import ( TestParsers, ) -logging.basicConfig(level=logging.INFO) +LOGGER.setLevel(logging.INFO) if __name__ == '__main__': unittest.main() diff --git a/tests/test_logger.py b/tests/test_logger.py new file mode 100644 index 0000000..dd6beb3 --- /dev/null +++ b/tests/test_logger.py @@ -0,0 +1,65 @@ +import logging +from contextlib import contextmanager +from lark import Lark, LOGGER +from unittest import TestCase, main + +try: + from StringIO import StringIO +except ImportError: + from io import StringIO + +@contextmanager +def capture_log(): + stream = StringIO() + orig_handler = LOGGER.handlers[0] + del LOGGER.handlers[:] + 
LOGGER.addHandler(logging.StreamHandler(stream))
+    yield stream
+    del LOGGER.handlers[:]
+    LOGGER.addHandler(orig_handler)
+
+class TestLogger(TestCase):
+
+    def test_debug(self):
+        LOGGER.setLevel(logging.DEBUG)
+        collision_grammar = '''
+        start: as as
+        as: a*
+        a: "a"
+        '''
+        with capture_log() as log:
+            Lark(collision_grammar, parser='lalr', debug=True)
+
+        log = log.getvalue()
+        self.assertIn("Shift/Reduce conflict for terminal", log)
+        self.assertIn("A: (resolving as shift)", log)
+        self.assertIn("Shift/Reduce conflict for terminal A: (resolving as shift)", log)
+
+    def test_non_debug(self):
+        LOGGER.setLevel(logging.DEBUG)
+        collision_grammar = '''
+        start: as as
+        as: a*
+        a: "a"
+        '''
+        with capture_log() as log:
+            Lark(collision_grammar, parser='lalr', debug=False)
+        log = log.getvalue()
+        # no log message
+        self.assertEqual(len(log), 0)
+
+    def test_loglevel_higher(self):
+        LOGGER.setLevel(logging.ERROR)
+        collision_grammar = '''
+        start: as as
+        as: a*
+        a: "a"
+        '''
+        with capture_log() as log:
+            Lark(collision_grammar, parser='lalr', debug=True)
+        log = log.getvalue()
+        # no log message
+        self.assertEqual(len(log), 0)
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/test_nearley/test_nearley.py b/tests/test_nearley/test_nearley.py
index 647f489..345af8a 100644
--- a/tests/test_nearley/test_nearley.py
+++ b/tests/test_nearley/test_nearley.py
@@ -6,16 +6,17 @@ import logging
 import os
 import codecs
 
-logging.basicConfig(level=logging.INFO)
-
+from lark import LOGGER
 from lark.tools.nearley import create_code_for_nearley_grammar, main as nearley_tool_main
 
+LOGGER.setLevel(logging.INFO)
+
 TEST_PATH = os.path.abspath(os.path.dirname(__file__))
 NEARLEY_PATH = os.path.join(TEST_PATH, 'nearley')
 BUILTIN_PATH = os.path.join(NEARLEY_PATH, 'builtin')
 
 if not os.path.exists(NEARLEY_PATH):
-    logging.warn("Nearley not installed. Skipping Nearley tests!")
+    LOGGER.warn("Nearley not installed. 
Skipping Nearley tests!") raise ImportError("Skipping Nearley tests!") import js2py # Ensures that js2py exists, to avoid failing tests diff --git a/tests/test_parser.py b/tests/test_parser.py index df09307..5a10b9f 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -18,13 +18,13 @@ from io import ( open, ) -logging.basicConfig(level=logging.INFO) try: import regex except ImportError: regex = None +from lark import LOGGER from lark.lark import Lark from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters from lark.tree import Tree @@ -32,6 +32,7 @@ from lark.visitors import Transformer, Transformer_InPlace, v_args from lark.grammar import Rule from lark.lexer import TerminalDef, Lexer, TraditionalLexer +LOGGER.setLevel(logging.INFO) __path__ = os.path.dirname(__file__) From a6201b41e471897ef044696925911df86b94a886 Mon Sep 17 00:00:00 2001 From: pwwang <1188067+pwwang@users.noreply.github.com> Date: Tue, 30 Jun 2020 17:35:26 -0700 Subject: [PATCH 02/25] Lowercase logger name --- lark/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/common.py b/lark/common.py index aac9d75..3bd7c98 100644 --- a/lark/common.py +++ b/lark/common.py @@ -2,7 +2,7 @@ import logging from .utils import Serialize from .lexer import TerminalDef -LOGGER = logging.getLogger("LARK") +LOGGER = logging.getLogger("lark") LOGGER.addHandler(logging.StreamHandler()) # Set to highest level, since we have some warnings amongst the code # By default, we should not output any log messages From 2a73afd3554c29f216869bc3e70f971f74b62c13 Mon Sep 17 00:00:00 2001 From: pwwang Date: Thu, 2 Jul 2020 19:28:45 -0500 Subject: [PATCH 03/25] Change LOGGER to logger --- docs/how_to_use.md | 6 +++--- lark/__init__.py | 2 +- lark/common.py | 6 +++--- lark/lark.py | 6 +++--- lark/parsers/earley.py | 4 ++-- lark/parsers/lalr_analysis.py | 6 +++--- tests/__main__.py | 8 ++++---- tests/test_logger.py | 26 +++++++++++++------------- tests/test_nearley/test_nearley.py | 6 +++--- tests/test_parser.py | 4 ++-- 10 files changed, 37 insertions(+), 37 deletions(-) diff --git a/docs/how_to_use.md b/docs/how_to_use.md index 78f4df2..303098f 100644 --- a/docs/how_to_use.md +++ b/docs/how_to_use.md @@ -30,13 +30,13 @@ Use the reference pages for more in-depth explanations. (links in the [main page ## LALR usage -By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings pass `debug=True`. To get the messages printed you have to configure the `LOGGER` beforehand. For example: +By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings pass `debug=True`. To get the messages printed you have to configure the `logger` beforehand. 
For example: ```python import logging -from lark import Lark, LOGGER +from lark import Lark, logger -LOGGER.setLevel(logging.DEBUG) +logger.setLevel(logging.DEBUG) collision_grammar = ''' start: as as diff --git a/lark/__init__.py b/lark/__init__.py index e4c54dd..e3021cf 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -1,4 +1,4 @@ -from .common import LOGGER +from .common import logger from .tree import Tree from .visitors import Transformer, Visitor, v_args, Discard from .visitors import InlineTransformer, inline_args # XXX Deprecated diff --git a/lark/common.py b/lark/common.py index 3bd7c98..745e287 100644 --- a/lark/common.py +++ b/lark/common.py @@ -2,11 +2,11 @@ import logging from .utils import Serialize from .lexer import TerminalDef -LOGGER = logging.getLogger("lark") -LOGGER.addHandler(logging.StreamHandler()) +logger = logging.getLogger("lark") +logger.addHandler(logging.StreamHandler()) # Set to highest level, since we have some warnings amongst the code # By default, we should not output any log messages -LOGGER.setLevel(logging.CRITICAL) +logger.setLevel(logging.CRITICAL) ###{standalone diff --git a/lark/lark.py b/lark/lark.py index 8df2b87..9bb60c8 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -7,7 +7,7 @@ from io import open from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS from .load_grammar import load_grammar from .tree import Tree -from .common import LexerConf, ParserConf, LOGGER +from .common import LexerConf, ParserConf, logger from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken from .parse_tree_builder import ParseTreeBuilder @@ -205,7 +205,7 @@ class Lark(Serialize): cache_fn = '.lark_cache_%s.tmp' % md5 if FS.exists(cache_fn): - LOGGER.debug('Loading grammar from cache: %s', cache_fn) + logger.debug('Loading grammar from cache: %s', cache_fn) with FS.open(cache_fn, 'rb') as f: self._load(f, self.options.transformer, self.options.postlex) return @@ -284,7 +284,7 @@ class Lark(Serialize): self.lexer = self._build_lexer() if cache_fn: - LOGGER.debug('Saving grammar to cache: %s', cache_fn) + logger.debug('Saving grammar to cache: %s', cache_fn) with FS.open(cache_fn, 'wb') as f: self.save(f) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 5fc7531..bf099e6 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -14,7 +14,7 @@ from collections import deque from ..visitors import Transformer_InPlace, v_args from ..exceptions import UnexpectedEOF, UnexpectedToken -from ..common import LOGGER +from ..common import logger from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal from .earley_common import Item, TransitiveItem @@ -301,7 +301,7 @@ class Parser: try: debug_walker = ForestToPyDotVisitor() except ImportError: - LOGGER.warning("Cannot find dependency 'pydot', will not generate sppf debug image") + logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image") else: debug_walker.visit(solutions[0], "sppf.png") diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 6fefa4c..861941f 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -10,7 +10,7 @@ from collections import defaultdict, deque from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator from ..exceptions import GrammarError -from ..common import LOGGER +from ..common import logger from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet from ..grammar import Rule @@ -256,8 +256,8 @@ class 
LALR_Analyzer(GrammarAnalyzer):
             raise GrammarError('Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t\t- ' + str(r) for r in rules ])))
         if la in actions:
             if self.debug:
-                LOGGER.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
-                LOGGER.warning(' * %s', list(rules)[0])
+                logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
+                logger.warning(' * %s', list(rules)[0])
         else:
             actions[la] = (Reduce, list(rules)[0])
         m[state] = { k.name: v for k, v in actions.items() }
diff --git a/tests/__main__.py b/tests/__main__.py
index 1807aa8..9ef9f1b 100644
--- a/tests/__main__.py
+++ b/tests/__main__.py
@@ -2,7 +2,7 @@ from __future__ import absolute_import, print_function
 
 import unittest
 import logging
-from lark import LOGGER
+from lark import logger
 
 from .test_trees import TestTrees
 from .test_tools import TestStandalone
@@ -12,12 +12,12 @@ from .test_reconstructor import TestReconstructor
 try:
     from .test_nearley.test_nearley import TestNearley
 except ImportError:
-    LOGGER.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)")
+    logger.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)")
 
 # from .test_selectors import TestSelectors
 # from .test_grammars import TestPythonG, TestConfigG
 
-from .test_logger import TestLogger
+from .test_logger import Testlogger
 
 from .test_parser import (
         TestLalrStandard,
@@ -34,7 +34,7 @@ from .test_parser import (
         TestParsers,
 )
 
-LOGGER.setLevel(logging.INFO)
+logger.setLevel(logging.INFO)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_logger.py b/tests/test_logger.py
index dd6beb3..93dc8ed 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -1,6 +1,6 @@
 import logging
 from contextlib import contextmanager
-from lark import Lark, LOGGER
+from lark import Lark, logger
 from unittest import TestCase, main
 
 try:
@@ -11,17 +11,17 @@
 @contextmanager
 def capture_log():
     stream = StringIO()
-    orig_handler = LOGGER.handlers[0]
-    del LOGGER.handlers[:]
-    LOGGER.addHandler(logging.StreamHandler(stream))
+    orig_handler = logger.handlers[0]
+    del logger.handlers[:]
+    logger.addHandler(logging.StreamHandler(stream))
     yield stream
-    del LOGGER.handlers[:]
-    LOGGER.addHandler(orig_handler)
+    del logger.handlers[:]
+    logger.addHandler(orig_handler)
 
-class TestLogger(TestCase):
+class Testlogger(TestCase):
 
     def test_debug(self):
-        LOGGER.setLevel(logging.DEBUG)
+        logger.setLevel(logging.DEBUG)
         collision_grammar = '''
         start: as as
         as: a*
         a: "a"
         '''
@@ -31,12 +31,12 @@
             Lark(collision_grammar, parser='lalr', debug=True)
 
         log = log.getvalue()
-        self.assertIn("Shift/Reduce conflict for terminal", log)
-        self.assertIn("A: (resolving as shift)", log)
-        self.assertIn("Shift/Reduce conflict for terminal A: (resolving as shift)", log)
+        # since there are conflicts about A
+        # symbol A should appear in the log message for hint
+        self.assertIn("A", log)
 
     def test_non_debug(self):
-        LOGGER.setLevel(logging.DEBUG)
+        logger.setLevel(logging.DEBUG)
         collision_grammar = '''
         start: as as
         as: a*
         a: "a"
         '''
@@ -49,7 +49,7 @@
     def test_loglevel_higher(self):
-        LOGGER.setLevel(logging.ERROR)
+        logger.setLevel(logging.ERROR)
         collision_grammar = '''
         start: as as
         as: a*
         a: "a"
         '''
diff --git a/tests/test_nearley/test_nearley.py b/tests/test_nearley/test_nearley.py
index 345af8a..1ad6449 100644
--- a/tests/test_nearley/test_nearley.py
+++ 
b/tests/test_nearley/test_nearley.py @@ -6,17 +6,17 @@ import logging import os import codecs -from lark import LOGGER +from lark import logger from lark.tools.nearley import create_code_for_nearley_grammar, main as nearley_tool_main -LOGGER.setLevel(logging.INFO) +logger.setLevel(logging.INFO) TEST_PATH = os.path.abspath(os.path.dirname(__file__)) NEARLEY_PATH = os.path.join(TEST_PATH, 'nearley') BUILTIN_PATH = os.path.join(NEARLEY_PATH, 'builtin') if not os.path.exists(NEARLEY_PATH): - LOGGER.warn("Nearley not installed. Skipping Nearley tests!") + logger.warn("Nearley not installed. Skipping Nearley tests!") raise ImportError("Skipping Nearley tests!") import js2py # Ensures that js2py exists, to avoid failing tests diff --git a/tests/test_parser.py b/tests/test_parser.py index 5a10b9f..88d175f 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -24,7 +24,7 @@ try: except ImportError: regex = None -from lark import LOGGER +from lark import logger from lark.lark import Lark from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters from lark.tree import Tree @@ -32,7 +32,7 @@ from lark.visitors import Transformer, Transformer_InPlace, v_args from lark.grammar import Rule from lark.lexer import TerminalDef, Lexer, TraditionalLexer -LOGGER.setLevel(logging.INFO) +logger.setLevel(logging.INFO) __path__ = os.path.dirname(__file__) From 438e89dea9cd886a4bc01738a224e6a0e5fbb519 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 8 Aug 2020 15:33:36 +0300 Subject: [PATCH 04/25] Fix readthedocs (Issue #640) --- mkdocs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/mkdocs.yml b/mkdocs.yml index 6c22d89..8d2a562 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -13,3 +13,4 @@ pages: - Classes Reference: classes.md - Recipes: recipes.md - Import grammars from Nearley: nearley.md + - Tutorial - JSON Parser: json_tutorial.md From 61a7c1e20a6c6cbdbd23fdd20611075fe3147176 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 8 Aug 2020 15:43:20 +0300 Subject: [PATCH 05/25] Removed code that causes failure in Python 3.4 --- lark/exceptions.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 033275c..645b09c 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -72,11 +72,7 @@ class UnexpectedInput(LarkError): class UnexpectedCharacters(LexError, UnexpectedInput): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): - - if isinstance(seq, bytes): - message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column) - else: - message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) + message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) self.line = line self.column = column From 5954fdf87aa79c7369c040ade8dbdd04dff58ef7 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 8 Aug 2020 16:16:34 +0300 Subject: [PATCH 06/25] Restore bad code (needs better fix). Updated readme & docs. 
--- README.md | 7 +++---- docs/features.md | 2 +- lark/exceptions.py | 6 +++++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 18c181f..23ec565 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h ### Install Lark - $ pip install lark-parser + $ pip install lark-parser --upgrade Lark has no dependencies. @@ -77,12 +77,11 @@ Notice punctuation doesn't appear in the resulting tree. It's automatically filt ### Fruit flies like bananas -Lark is great at handling ambiguity. Let's parse the phrase "fruit flies like bananas": +Lark is great at handling ambiguity. Here is the result of parsing the phrase "fruit flies like bananas": ![fruitflies.png](examples/fruitflies.png) -See more [examples here](https://github.com/lark-parser/lark/tree/master/examples) - +See the code and more [examples here](https://github.com/lark-parser/lark/tree/master/examples) ## List of main features diff --git a/docs/features.md b/docs/features.md index 9346989..c2f6983 100644 --- a/docs/features.md +++ b/docs/features.md @@ -19,8 +19,8 @@ [Read more about the parsers](parsers.md) # Extra features - - Import rules and tokens from other Lark grammars, for code reuse and modularity. + - Support for external regex module ([see here](/docs/classes.md#using-unicode-character-classes-with-regex)) - Import grammars from Nearley.js ([read more](/docs/nearley.md)) - CYK parser diff --git a/lark/exceptions.py b/lark/exceptions.py index 645b09c..a844dd4 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -72,7 +72,11 @@ class UnexpectedInput(LarkError): class UnexpectedCharacters(LexError, UnexpectedInput): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): - message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) + + if isinstance(seq, bytes): + message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column) + else: + message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) self.line = line self.column = column From 8dc8865072a526dbb70cd6f073668fe22c5680b8 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 8 Aug 2020 16:21:01 +0300 Subject: [PATCH 07/25] [docs] Fixed links --- docs/features.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/features.md b/docs/features.md index c2f6983..00fdf4b 100644 --- a/docs/features.md +++ b/docs/features.md @@ -20,8 +20,8 @@ # Extra features - Import rules and tokens from other Lark grammars, for code reuse and modularity. - - Support for external regex module ([see here](/docs/classes.md#using-unicode-character-classes-with-regex)) - - Import grammars from Nearley.js ([read more](/docs/nearley.md)) + - Support for external regex module ([see here](classes.md#using-unicode-character-classes-with-regex)) + - Import grammars from Nearley.js ([read more](nearley.md)) - CYK parser ### Experimental features From b7068c45a73bc70d3f9611c81198f0aa5571c4d9 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 9 Aug 2020 12:05:07 +0300 Subject: [PATCH 08/25] Tiny fixes. Don't test use_bytes on Python 3.4. 
--- docs/index.md | 2 +- lark/visitors.py | 2 ++ tests/test_parser.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index 1310be2..c72305d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -32,7 +32,7 @@ $ pip install lark-parser * [Philosophy & Design Choices](philosophy.md) -* [Full List of Features](features.md) +* [Features](features.md) * [Examples](https://github.com/lark-parser/lark/tree/master/examples) * [Online IDE](https://lark-parser.github.io/lark/ide/app.html) * Tutorials diff --git a/lark/visitors.py b/lark/visitors.py index 3f80016..6494deb 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -14,6 +14,8 @@ class Discard(Exception): # Transformers class _Decoratable: + "Provides support for decorating methods with @v_args" + @classmethod def _apply_decorator(cls, decorator, **kwargs): mro = getmro(cls) diff --git a/tests/test_parser.py b/tests/test_parser.py index f1e269f..cd3ea4d 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -721,7 +721,8 @@ def _make_parser_test(LEXER, PARSER): """) g.parse('\x01\x02\x03') - @unittest.skipIf(sys.version_info[:2]==(2, 7), "bytes parser isn't perfect in Python2.7, exceptions don't work correctly") + @unittest.skipIf(sys.version_info[0]==2 or sys.version_info[:2]==(3, 4), + "bytes parser isn't perfect in Python2, exceptions don't work correctly") def test_bytes_utf8(self): g = r""" start: BOM? char+ From 9923987e94547ded8a17d7a03840c4cebce39188 Mon Sep 17 00:00:00 2001 From: decorator-factory <42166884+decorator-factory@users.noreply.github.com> Date: Mon, 10 Aug 2020 23:07:55 +0300 Subject: [PATCH 09/25] allow multiline regexes with 'x' (verbose) flag --- lark/load_grammar.py | 13 ++++++++++--- tests/test_parser.py | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index ae7ec32..d716ec1 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -13,7 +13,7 @@ from .parser_frontends import LALR_TraditionalLexer from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify, suppress, dedup_list, Str -from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken +from .exceptions import GrammarError, LarkError, UnexpectedCharacters, UnexpectedToken from .tree import Tree, SlottedTree as ST from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive @@ -85,7 +85,7 @@ TERMINALS = { 'RULE': '!?[_?]?[a-z][_a-z0-9]*', 'TERMINAL': '_?[A-Z][_A-Z0-9]*', 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', - 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS, + 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, '_NL': r'(\r?\n)+\s*', 'WS': r'[ \t]+', 'COMMENT': r'\s*//[^\n]*', @@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace): term_name = None elif isinstance(p, PatternRE): - if p in self.term_reverse: # Kind of a wierd placement.name + if p in self.term_reverse: # Kind of a weird placement.name term_name = self.term_reverse[p].name else: assert False, p @@ -409,6 +409,13 @@ def _literal_to_pattern(literal): flags = v[flag_start:] assert all(f in _RE_FLAGS for f in flags), flags + if literal.type == 'STRING' and '\n' in v: + raise GrammarError('You cannot put newlines in string literals') + + if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags: + raise GrammarError('You can only use newlines in regular expressions ' + 'with the `x` 
(verbose) flag')
+
     v = v[:flag_start]
     assert v[0] == v[-1] and v[0] in '"/'
     x = v[1:-1]
diff --git a/tests/test_parser.py b/tests/test_parser.py
index cd3ea4d..48a4674 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1262,6 +1262,32 @@ def _make_parser_test(LEXER, PARSER):
         tree = l.parse('aA')
         self.assertEqual(tree.children, ['a', 'A'])
 
+    def test_token_flags_verbose(self):
+        g = _Lark(r"""start: NL | ABC
+                      ABC: / [a-z] /x
+                      NL: /\n/
+                      """)
+        x = g.parse('a')
+        self.assertEqual(x.children, ['a'])
+
+    def test_token_flags_verbose_multiline(self):
+        g = _Lark(r"""start: ABC
+                      ABC: / a b c
+                               d
+                               e f
+                           /x
+                      """)
+        x = g.parse('abcdef')
+        self.assertEqual(x.children, ['abcdef'])
+
+    def test_token_multiline_only_works_with_x_flag(self):
+        g = r"""start: ABC
+                ABC: / a b c
+                         d
+                         e f
+                     /i
+                """
+        self.assertRaises( GrammarError, _Lark, g)
+
     @unittest.skipIf(PARSER == 'cyk', "No empty rules")
     def test_twice_empty(self):

From 8b59a1642533f1f577b104c7be33f0511193050d Mon Sep 17 00:00:00 2001
From: decorator-factory <42166884+decorator-factory@users.noreply.github.com>
Date: Tue, 11 Aug 2020 00:44:23 +0300
Subject: [PATCH 10/25] refactor: replace dict lookup with simple conditional

---
 lark/load_grammar.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index d716ec1..1a1a396 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -424,9 +424,11 @@ def _literal_to_pattern(literal):
 
     if literal.type == 'STRING':
         s = s.replace('\\\\', '\\')
-
-    return { 'STRING': PatternStr,
-             'REGEXP': PatternRE }[literal.type](s, flags)
+        return PatternStr(s, flags)
+    elif literal.type == 'REGEXP':
+        return PatternRE(s, flags)
+    else:
+        assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]'
 
 
 @inline_args

From 2525e0ce9c594b81a79caa5ff57c66a12a79ca5a Mon Sep 17 00:00:00 2001
From: decorator-factory <42166884+decorator-factory@users.noreply.github.com>
Date: Tue, 11 Aug 2020 00:46:54 +0300
Subject: [PATCH 11/25] formatting: fix pistol operator

---
 lark/load_grammar.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 1a1a396..0ee546c 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -13,7 +13,7 @@ from .parser_frontends import LALR_TraditionalLexer
 from .common import LexerConf, ParserConf
 from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
 from .utils import classify, suppress, dedup_list, Str
-from .exceptions import GrammarError, LarkError, UnexpectedCharacters, UnexpectedToken
+from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken
 from .tree import Tree, SlottedTree as ST
 from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive
@@ -850,7 +850,7 @@ class GrammarLoader:
         if len(stmt.children) > 1:
             path_node, arg1 = stmt.children
         else:
-            path_node ,= stmt.children
+            path_node, = stmt.children
             arg1 = None
 
         if isinstance(arg1, Tree):  # Multi import

From 28e0a86f389c329a35091b7acb7b0afc5d57dc74 Mon Sep 17 00:00:00 2001
From: MegaIng1
Date: Wed, 12 Aug 2020 14:48:55 +0200
Subject: [PATCH 12/25] Small improvements for debug info

---
 lark-stubs/exceptions.pyi | 15 ++++++++++-----
 lark/exceptions.py | 15 ++++++++++++---
 lark/parsers/lalr_puppet.py | 6 +++---
 3 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/lark-stubs/exceptions.pyi b/lark-stubs/exceptions.pyi
index f09bfbd..012ac51 100644
--- a/lark-stubs/exceptions.pyi
+++ 
b/lark-stubs/exceptions.pyi @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from typing import Dict, Iterable, Callable, Union +from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple from .tree import Tree from .lexer import Token @@ -21,6 +21,9 @@ class LexError(LarkError): pass +T = TypeVar('T') + + class UnexpectedInput(LarkError): pos_in_stream: int @@ -28,10 +31,12 @@ class UnexpectedInput(LarkError): ... def match_examples( - self, - parse_fn: Callable[[str], Tree], - examples: Dict[str, Iterable[str]] - ): + self, + parse_fn: Callable[[str], Tree], + examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], + token_type_match_fallback: bool = False, + print_debug_info: bool = True + ) -> T: ... diff --git a/lark/exceptions.py b/lark/exceptions.py index 033275c..47670a6 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -37,34 +37,43 @@ class UnexpectedInput(LarkError): after = text[pos:end].split(b'\n', 1)[0] return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") - def match_examples(self, parse_fn, examples, token_type_match_fallback=False): + def match_examples(self, parse_fn, examples, token_type_match_fallback=False, print_debug_info=True): """ Given a parser instance and a dictionary mapping some label with some malformed syntax examples, it'll return the label for the example that bests matches the current error. """ assert self.state is not None, "Not supported for this exception" + + if isinstance(examples, dict): + examples = examples.items() candidate = (None, False) - for label, example in examples.items(): + for i, (label, example) in enumerate(examples): assert not isinstance(example, STRING_TYPE) - for malformed in example: + for j, malformed in enumerate(example): try: parse_fn(malformed) except UnexpectedInput as ut: if ut.state == self.state: try: if ut.token == self.token: # Try exact match first + if print_debug_info: + print("Exact Match at %d, with example %d" % (i, j), (ut.token, self.token, ut.state, self.state)) return label if token_type_match_fallback: # Fallback to token types match if (ut.token.type == self.token.type) and not candidate[-1]: + if print_debug_info: + print("Token Type Fallback at %d, with example %d" % (i, j)) candidate = label, True except AttributeError: pass if not candidate[0]: + if print_debug_info: + print("Defaulted at %d, with example %d" % (i, j)) candidate = label, False return candidate[0] diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py index 968783c..d5a4703 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -16,7 +16,7 @@ class ParserPuppet: self.result = None def feed_token(self, token): - """Advance the parser state, as if it just recieved `token` from the lexer + """Advance the parser state, as if it just received `token` from the lexer """ end_state = self.parser.parse_table.end_states[self._start] @@ -66,9 +66,9 @@ class ParserPuppet: self._set_state, ) - def pretty(): + def pretty(self): print("Puppet choices:") - for k, v in self.choices.items(): + for k, v in self.choices().items(): print('\t-', k, '->', v) print('stack size:', len(self._state_stack)) From a7bcd0bc2d3cb96030d9e77523c0007e8034ce49 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Wed, 12 Aug 2020 15:36:01 +0200 Subject: [PATCH 13/25] Added `accepts` attribute to `UnexpectedToken` and update stubs --- lark-stubs/exceptions.pyi | 15 ++++++++++----- lark/exceptions.py | 5 +++-- lark/parsers/lalr_parser.py | 13 +++++++++++-- 3 files 
changed, 24 insertions(+), 9 deletions(-) diff --git a/lark-stubs/exceptions.pyi b/lark-stubs/exceptions.pyi index 012ac51..67c39fb 100644 --- a/lark-stubs/exceptions.pyi +++ b/lark-stubs/exceptions.pyi @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple +from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set from .tree import Tree from .lexer import Token @@ -25,7 +25,10 @@ T = TypeVar('T') class UnexpectedInput(LarkError): + line: int + column: int pos_in_stream: int + state: Any def get_context(self, text: str, span: int = ...): ... @@ -41,12 +44,14 @@ class UnexpectedInput(LarkError): class UnexpectedToken(ParseError, UnexpectedInput): - pass - + expected: List[str] + considered_rules: Set[str] + puppet: Any + accepts: List[str] class UnexpectedCharacters(LexError, UnexpectedInput): - line: int - column: int + allowed: Set[str] + considered_tokens: Set[Any] class VisitError(LarkError): diff --git a/lark/exceptions.py b/lark/exceptions.py index 47670a6..022a00f 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -105,7 +105,7 @@ class UnexpectedCharacters(LexError, UnexpectedInput): class UnexpectedToken(ParseError, UnexpectedInput): - def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): + def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, accepts=None): self.token = token self.expected = expected # XXX str shouldn't necessary self.line = getattr(token, 'line', '?') @@ -114,10 +114,11 @@ class UnexpectedToken(ParseError, UnexpectedInput): self.state = state self.pos_in_stream = getattr(token, 'pos_in_stream', None) self.puppet = puppet + self.accepts = accepts message = ("Unexpected token %r at line %s, column %s.\n" "Expected one of: \n\t* %s\n" - % (token, self.line, self.column, '\n\t* '.join(self.expected))) + % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) super(UnexpectedToken, self).__init__(message) diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index f26cbc5..f61e093 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -62,9 +62,18 @@ class _Parser: expected = [s for s in states[state].keys() if s.isupper()] try: puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) + accepts = [] + for t in expected: + new_puppet = puppet.copy() + try: + new_puppet.feed_token(Token(t, '')) + except KeyError: + pass + else: + accepts.append(t) except NameError: - puppet = None - raise UnexpectedToken(token, expected, state=state, puppet=puppet) + puppet = accepts = None + raise UnexpectedToken(token, expected, state=state, puppet=puppet, accepts=accepts) def reduce(rule): size = len(rule.expansion) From d3b0449f714615b190699644650e41669a1510d4 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Wed, 12 Aug 2020 16:46:36 +0200 Subject: [PATCH 14/25] Improved `match_examples` with `UnexpectedToken.accepts` --- lark/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 022a00f..497cf96 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -55,7 +55,7 @@ class UnexpectedInput(LarkError): try: parse_fn(malformed) except UnexpectedInput as ut: - if ut.state == self.state: + if ut.state == self.state and ut.accepts == self.accepts: try: if ut.token == self.token: # Try exact match first if print_debug_info: From 2e160c046e5de3d82b664d9867c1e9386ff4efb7 Mon Sep 17 
00:00:00 2001 From: MegaIng1 Date: Wed, 12 Aug 2020 16:52:21 +0200 Subject: [PATCH 15/25] Correction for python2.7 (LalrPuppet-> new style class) --- lark/parsers/lalr_puppet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py index d5a4703..2b350bf 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -4,7 +4,7 @@ from copy import deepcopy from .lalr_analysis import Shift, Reduce -class ParserPuppet: +class ParserPuppet(object): def __init__(self, parser, state_stack, value_stack, start, stream, set_state): self.parser = parser self._state_stack = state_stack From cb2d9cded072e0f150b0d6d349fd431369b83a93 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Thu, 13 Aug 2020 03:51:01 +0200 Subject: [PATCH 16/25] Refactored ParserPuppet, added stubs --- lark-stubs/exceptions.pyi | 10 +++++----- lark-stubs/parsers/__init__.pyi | 0 lark-stubs/parsers/lalr_puppet.pyi | 21 +++++++++++++++++++++ lark/exceptions.py | 19 ++++++++++--------- lark/parsers/lalr_parser.py | 12 ++---------- lark/parsers/lalr_puppet.py | 21 ++++++++++++++++++--- 6 files changed, 56 insertions(+), 27 deletions(-) create mode 100644 lark-stubs/parsers/__init__.pyi create mode 100644 lark-stubs/parsers/lalr_puppet.pyi diff --git a/lark-stubs/exceptions.pyi b/lark-stubs/exceptions.pyi index 67c39fb..268844c 100644 --- a/lark-stubs/exceptions.pyi +++ b/lark-stubs/exceptions.pyi @@ -3,7 +3,7 @@ from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set from .tree import Tree from .lexer import Token - +from .parsers.lalr_puppet import ParserPuppet class LarkError(Exception): pass @@ -38,16 +38,16 @@ class UnexpectedInput(LarkError): parse_fn: Callable[[str], Tree], examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], token_type_match_fallback: bool = False, - print_debug_info: bool = True + use_accepts: bool = False, ) -> T: ... class UnexpectedToken(ParseError, UnexpectedInput): - expected: List[str] + expected: Set[str] considered_rules: Set[str] - puppet: Any - accepts: List[str] + puppet: ParserPuppet + accepts: Set[str] class UnexpectedCharacters(LexError, UnexpectedInput): allowed: Set[str] diff --git a/lark-stubs/parsers/__init__.pyi b/lark-stubs/parsers/__init__.pyi new file mode 100644 index 0000000..e69de29 diff --git a/lark-stubs/parsers/lalr_puppet.pyi b/lark-stubs/parsers/lalr_puppet.pyi new file mode 100644 index 0000000..c138c32 --- /dev/null +++ b/lark-stubs/parsers/lalr_puppet.pyi @@ -0,0 +1,21 @@ +from typing import Set, Dict, Any + +from lark import Token, Tree + + +class ParserPuppet(object): + """ + Represents a LalrParser that can be step through. + Shouldn't instantiated by hand, but is accessible as `UnexpectedToken.puppet` + """ + def feed_token(self, token: Token): ... + + def copy(self) -> ParserPuppet: ... + + def pretty(self) -> str: ... + + def choices(self) -> Dict[str, Any]: ... + + def accepts(self) -> Set[str]: ... + + def resume_parse(self) -> Tree: ... 
diff --git a/lark/exceptions.py b/lark/exceptions.py index 92ef64e..03f3da4 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -1,3 +1,5 @@ +import logging + from .utils import STRING_TYPE ###{standalone @@ -37,7 +39,7 @@ class UnexpectedInput(LarkError): after = text[pos:end].split(b'\n', 1)[0] return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") - def match_examples(self, parse_fn, examples, token_type_match_fallback=False, print_debug_info=True): + def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False): """ Given a parser instance and a dictionary mapping some label with some malformed syntax examples, it'll return the label for the example that bests matches the current error. @@ -55,27 +57,26 @@ class UnexpectedInput(LarkError): try: parse_fn(malformed) except UnexpectedInput as ut: - if ut.state == self.state and ut.accepts == self.accepts: + if ut.state == self.state and (not use_accepts or ut.accepts == self.accepts): try: if ut.token == self.token: # Try exact match first - if print_debug_info: - print("Exact Match at %d, with example %d" % (i, j), (ut.token, self.token, ut.state, self.state)) + logging.debug("Exact Match at example [%s][%s]" % (i, j)) return label if token_type_match_fallback: # Fallback to token types match if (ut.token.type == self.token.type) and not candidate[-1]: - if print_debug_info: - print("Token Type Fallback at %d, with example %d" % (i, j)) + logging.debug("Token Type Fallback at example [%s][%s]" % (i, j)) candidate = label, True except AttributeError: pass if not candidate[0]: - if print_debug_info: - print("Defaulted at %d, with example %d" % (i, j)) + logging.debug("Same State match at example [%s][%s]" % (i, j)) candidate = label, False - + elif ut.state == self.state: + logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % + (self.state, self.accepts, ut.accepts, i, j)) return candidate[0] diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index f61e093..ba75606 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -59,18 +59,10 @@ class _Parser: try: return states[state][token.type] except KeyError: - expected = [s for s in states[state].keys() if s.isupper()] + expected = {s for s in states[state].keys() if s.isupper()} try: puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) - accepts = [] - for t in expected: - new_puppet = puppet.copy() - try: - new_puppet.feed_token(Token(t, '')) - except KeyError: - pass - else: - accepts.append(t) + accepts = puppet.accepts() except NameError: puppet = accepts = None raise UnexpectedToken(token, expected, state=state, puppet=puppet, accepts=accepts) diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py index 2b350bf..24c77a1 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -3,6 +3,8 @@ from copy import deepcopy from .lalr_analysis import Shift, Reduce +from .. 
import Token + class ParserPuppet(object): def __init__(self, parser, state_stack, value_stack, start, stream, set_state): @@ -67,13 +69,26 @@ class ParserPuppet(object): ) def pretty(self): - print("Puppet choices:") + out = ["Puppet choices:"] for k, v in self.choices().items(): - print('\t-', k, '->', v) - print('stack size:', len(self._state_stack)) + out.append('\t- %s -> %s' % (k, v)) + out.append('stack size: %s' % len(self._state_stack)) + return '\n'.join(out) def choices(self): return self.parser.parse_table.states[self._state_stack[-1]] + def accepts(self): + accepts = set() + for t in self.choices(): + new_puppet = self.copy() + try: + new_puppet.feed_token(Token(t, '')) + except KeyError: + pass + else: + accepts.add(t) + return accepts + def resume_parse(self): return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack) From d4503374ff6171425c70a57899443cef10210553 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 13 Aug 2020 10:09:31 +0300 Subject: [PATCH 17/25] Small addition to docs --- README.md | 1 + docs/grammar.md | 2 ++ 2 files changed, 3 insertions(+) diff --git a/README.md b/README.md index 23ec565..69ccb2b 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,7 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail - [miniwdl](https://github.com/chanzuckerberg/miniwdl) - A static analysis toolkit for the Workflow Description Language - [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer - [harmalysis](https://github.com/napulen/harmalysis) - A language for harmonic analysis and music theory + - [gersemi](https://github.com/BlankSpruce/gersemi) - A CMake code formatter Using Lark? Send me a message and I'll add your project! diff --git a/docs/grammar.md b/docs/grammar.md index d4ecec5..ff6553f 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -112,6 +112,8 @@ Terminals can be assigned priority only when using a lexer (future versions may Priority can be either positive or negative. If not specified for a terminal, it defaults to 1. +Highest priority terminals are always matched first. + ### Regexp Flags You can use flags on regexps and strings. For example: From 02d57bc32a2fae1722ee3f8e003a3d6234e58190 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 13 Aug 2020 11:43:52 +0300 Subject: [PATCH 18/25] Small adjustments to PR --- lark-stubs/parsers/lalr_puppet.pyi | 5 ++-- lark/exceptions.py | 42 +++++++++++++++++------------- lark/parsers/lalr_parser.py | 7 +++-- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/lark-stubs/parsers/lalr_puppet.pyi b/lark-stubs/parsers/lalr_puppet.pyi index c138c32..f35112a 100644 --- a/lark-stubs/parsers/lalr_puppet.pyi +++ b/lark-stubs/parsers/lalr_puppet.pyi @@ -5,8 +5,9 @@ from lark import Token, Tree class ParserPuppet(object): """ - Represents a LalrParser that can be step through. - Shouldn't instantiated by hand, but is accessible as `UnexpectedToken.puppet` + Provides an interface to interactively step through the parser (LALR(1) only for now) + + Accessible via `UnexpectedToken.puppet` (raised by the parser on token error) """ def feed_token(self, token: Token): ... diff --git a/lark/exceptions.py b/lark/exceptions.py index 03f3da4..e1225a9 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -45,7 +45,7 @@ class UnexpectedInput(LarkError): example that bests matches the current error. 
""" assert self.state is not None, "Not supported for this exception" - + if isinstance(examples, dict): examples = examples.items() @@ -57,7 +57,11 @@ class UnexpectedInput(LarkError): try: parse_fn(malformed) except UnexpectedInput as ut: - if ut.state == self.state and (not use_accepts or ut.accepts == self.accepts): + if ut.state == self.state: + if use_accepts and ut.accepts != self.accepts: + logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % + (self.state, self.accepts, ut.accepts, i, j)) + continue try: if ut.token == self.token: # Try exact match first logging.debug("Exact Match at example [%s][%s]" % (i, j)) @@ -74,27 +78,25 @@ class UnexpectedInput(LarkError): if not candidate[0]: logging.debug("Same State match at example [%s][%s]" % (i, j)) candidate = label, False - elif ut.state == self.state: - logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % - (self.state, self.accepts, ut.accepts, i, j)) + return candidate[0] class UnexpectedCharacters(LexError, UnexpectedInput): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): + self.line = line + self.column = column + self.pos_in_stream = lex_pos + self.state = state + + self.allowed = allowed + self.considered_tokens = considered_tokens if isinstance(seq, bytes): message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column) else: message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) - self.line = line - self.column = column - self.allowed = allowed - self.considered_tokens = considered_tokens - self.pos_in_stream = lex_pos - self.state = state - message += '\n\n' + self.get_context(seq) if allowed: message += '\nExpecting: %s\n' % allowed @@ -106,16 +108,20 @@ class UnexpectedCharacters(LexError, UnexpectedInput): class UnexpectedToken(ParseError, UnexpectedInput): - def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, accepts=None): - self.token = token - self.expected = expected # XXX str shouldn't necessary + def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') - self.considered_rules = considered_rules - self.state = state self.pos_in_stream = getattr(token, 'pos_in_stream', None) + self.state = state + + self.token = token + self.expected = expected # XXX deprecate? 
`accepts` is better + self.considered_rules = considered_rules self.puppet = puppet - self.accepts = accepts + + # TODO Only calculate `accepts()` when we need to display it to the user + # This will improve performance when doing automatic error handling + self.accepts = puppet and puppet.accepts() message = ("Unexpected token %r at line %s, column %s.\n" "Expected one of: \n\t* %s\n" diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index ba75606..cf6a4bf 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -62,10 +62,9 @@ class _Parser: expected = {s for s in states[state].keys() if s.isupper()} try: puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) - accepts = puppet.accepts() - except NameError: - puppet = accepts = None - raise UnexpectedToken(token, expected, state=state, puppet=puppet, accepts=accepts) + except NameError: # For standalone parser + puppet = None + raise UnexpectedToken(token, expected, state=state, puppet=puppet) def reduce(rule): size = len(rule.expansion) From 00e736fda3cebfc9766f293fcbf4826e7e7c8103 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 13 Aug 2020 11:48:05 +0300 Subject: [PATCH 19/25] Use accepts in default example (even though it's not necessary) --- examples/error_reporting_lalr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/error_reporting_lalr.py b/examples/error_reporting_lalr.py index 5e7d967..f038eda 100644 --- a/examples/error_reporting_lalr.py +++ b/examples/error_reporting_lalr.py @@ -52,7 +52,7 @@ def parse(json_text): '[1,2,]', '{"foo":1,}', '{"foo":false,"bar":true,}'] - }) + }, use_accepts=True) if not exc_class: raise raise exc_class(u.get_context(json_text), u.line, u.column) From 2c7afed894b362dc9b1ea13b658a6094f3c1e281 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 13 Aug 2020 11:55:44 +0300 Subject: [PATCH 20/25] Small fixes --- lark/exceptions.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index e1225a9..7330125 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -43,6 +43,8 @@ class UnexpectedInput(LarkError): """ Given a parser instance and a dictionary mapping some label with some malformed syntax examples, it'll return the label for the example that bests matches the current error. + + It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility. 
""" assert self.state is not None, "Not supported for this exception" @@ -93,10 +95,11 @@ class UnexpectedCharacters(LexError, UnexpectedInput): self.considered_tokens = considered_tokens if isinstance(seq, bytes): - message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column) + _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") else: - message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) + _s = seq[lex_pos] + message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column) message += '\n\n' + self.get_context(seq) if allowed: message += '\nExpecting: %s\n' % allowed From 96873d64ba8ef85fcad1daa2dd2e9bf931eb06ba Mon Sep 17 00:00:00 2001 From: Blank Spruce <32396809+BlankSpruce@users.noreply.github.com> Date: Thu, 13 Aug 2020 18:09:05 +0200 Subject: [PATCH 21/25] Make transformer work with tokens in standalone parser, fixes #648 --- lark/common.py | 3 --- lark/lark.py | 9 ++++++++- lark/parser_frontends.py | 16 +++++++++++++--- tests/test_tools.py | 27 +++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 7 deletions(-) diff --git a/lark/common.py b/lark/common.py index cc8c73c..714399a 100644 --- a/lark/common.py +++ b/lark/common.py @@ -17,9 +17,6 @@ class LexerConf(Serialize): self.skip_validation = skip_validation self.use_bytes = use_bytes - def _deserialize(self): - self.callbacks = {} # TODO - ###} class ParserConf: diff --git a/lark/lark.py b/lark/lark.py index daab45b..3ed96d7 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -344,7 +344,14 @@ class Lark(Serialize): self.rules = [Rule.deserialize(r, memo) for r in data['rules']] self.source = '' self._prepare_callbacks() - self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, re_module) + self.parser = self.parser_class.deserialize( + data['parser'], + memo, + self._callbacks, + self.options.postlex, + self.options.transformer, + re_module + ) return self @classmethod diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 33ad9bc..a45bf9c 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,6 +1,6 @@ from .utils import get_regexp_width, Serialize from .parsers.grammar_analysis import GrammarAnalyzer -from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token +from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef from .parsers import earley, xearley, cyk from .parsers.lalr_parser import LALR_Parser from .grammar import Rule @@ -58,6 +58,16 @@ class _ParserFrontend(Serialize): return self.parser.parse(input, start, *args) +def _recreate_lexer_callbacks(memo, transformer): + result = {} + terminals = [item for item in memo.values() if isinstance(item, TerminalDef)] + for terminal in terminals: + callback = getattr(transformer, terminal.name, None) + if callback is not None: + result[terminal.name] = callback + return result + + class WithLexer(_ParserFrontend): lexer = None parser = None @@ -73,10 +83,11 @@ class WithLexer(_ParserFrontend): self.postlex = lexer_conf.postlex @classmethod - def deserialize(cls, data, memo, callbacks, postlex, re_module): + def deserialize(cls, data, memo, callbacks, postlex, transformer, re_module): inst = super(WithLexer, cls).deserialize(data, memo) inst.postlex = postlex inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) + inst.lexer_conf.callbacks = _recreate_lexer_callbacks(memo, transformer) 
inst.lexer_conf.re_module = re_module inst.lexer_conf.skip_validation=True inst.init_lexer() @@ -229,4 +240,3 @@ class CYK(WithLexer): def _apply_callback(self, tree): return self.callbacks[tree.rule](tree.children) - diff --git a/tests/test_tools.py b/tests/test_tools.py index 1e0d78e..e691237 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -106,6 +106,33 @@ class TestStandalone(TestCase): x = l.parse('(\n)\n') self.assertEqual(x, Tree('start', [])) + def test_transformer(self): + grammar = r""" + start: some_rule "(" SOME_TERMINAL ")" + some_rule: SOME_TERMINAL + SOME_TERMINAL: /[A-Za-z_][A-Za-z0-9_]*/ + """ + context = self._create_standalone(grammar) + _Lark = context["Lark_StandAlone"] + + _Token = context["Token"] + _Tree = context["Tree"] + + class MyTransformer(context["Transformer"]): + def SOME_TERMINAL(self, token): + return _Token("SOME_TERMINAL", "token is transformed") + + def some_rule(self, children): + return _Tree("rule_is_transformed", []) + + parser = _Lark(transformer=MyTransformer()) + self.assertEqual( + parser.parse("FOO(BAR)"), + _Tree("start", [ + _Tree("rule_is_transformed", []), + _Token("SOME_TERMINAL", "token is transformed") + ]) + ) if __name__ == '__main__': From 2f4831f9b6dd857dcb3b8d53a8839474d3c5e5f7 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 13 Aug 2020 21:13:42 +0300 Subject: [PATCH 22/25] Small refactor after PR --- lark/lark.py | 12 +++++------- lark/parser_frontends.py | 9 ++++++--- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index 3ed96d7..8371943 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -11,7 +11,7 @@ from .common import LexerConf, ParserConf from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken from .parse_tree_builder import ParseTreeBuilder -from .parser_frontends import get_frontend +from .parser_frontends import get_frontend, _get_lexer_callbacks from .grammar import Rule import re @@ -278,12 +278,10 @@ class Lark(Serialize): rule.options.priority = None # TODO Deprecate lexer_callbacks? 
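The hunk below swaps Lark.__init__'s open-coded transformer scan for the shared `_get_lexer_callbacks` helper. Two behaviors worth noting: any transformer method named after a terminal becomes a lexer callback, and after this rewrite an explicit `lexer_callbacks` entry wins over a transformer method of the same name, since the options dict is merged in last via `update`. A minimal sketch of the feature (hypothetical grammar and class names):

```python
from lark import Lark, Transformer, Token

grammar = """
start: WORD+
%import common.WORD
%import common.WS
%ignore WS
"""

class Upcase(Transformer):
    # Named after the WORD terminal, so it runs on every WORD token at lex time
    def WORD(self, tok):
        return Token('WORD', tok.upper())

parser = Lark(grammar, parser='lalr', transformer=Upcase())
print(parser.parse('hello world'))  # WORD children arrive upper-cased
```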
- lexer_callbacks = dict(self.options.lexer_callbacks) - if self.options.transformer: - t = self.options.transformer - for term in self.terminals: - if hasattr(t, term.name): - lexer_callbacks[term.name] = getattr(t, term.name) + lexer_callbacks = (_get_lexer_callbacks(self.options.transformer, self.terminals) + if self.options.transformer + else {}) + lexer_callbacks.update(self.options.lexer_callbacks) self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index a45bf9c..b993b9f 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -58,9 +58,8 @@ class _ParserFrontend(Serialize): return self.parser.parse(input, start, *args) -def _recreate_lexer_callbacks(memo, transformer): +def _get_lexer_callbacks(transformer, terminals): result = {} - terminals = [item for item in memo.values() if isinstance(item, TerminalDef)] for terminal in terminals: callback = getattr(transformer, terminal.name, None) if callback is not None: @@ -85,12 +84,16 @@ class WithLexer(_ParserFrontend): @classmethod def deserialize(cls, data, memo, callbacks, postlex, transformer, re_module): inst = super(WithLexer, cls).deserialize(data, memo) + inst.postlex = postlex inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) - inst.lexer_conf.callbacks = _recreate_lexer_callbacks(memo, transformer) + + terminals = [item for item in memo.values() if isinstance(item, TerminalDef)] + inst.lexer_conf.callbacks = _get_lexer_callbacks(transformer, terminals) inst.lexer_conf.re_module = re_module inst.lexer_conf.skip_validation=True inst.init_lexer() + return inst def _serialize(self, data, memo): From 5559b1a21167c662c385e47e52f27c0cc470c278 Mon Sep 17 00:00:00 2001 From: Blank Spruce <32396809+BlankSpruce@users.noreply.github.com> Date: Fri, 14 Aug 2020 12:08:02 +0200 Subject: [PATCH 23/25] Add missing elements in standalone parser Add: - missing imports - __version__ variable Additionally regenerated json parser example --- examples/standalone/json_parser.py | 178 ++++++++++++++++++++--------- lark/exceptions.py | 5 +- lark/tools/standalone.py | 2 + lark/tree.py | 4 +- 4 files changed, 134 insertions(+), 55 deletions(-) diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py index c9a5147..cadc51d 100644 --- a/examples/standalone/json_parser.py +++ b/examples/standalone/json_parser.py @@ -1,4 +1,6 @@ # The file was automatically generated by Lark v0.9.0 +__version__ = "0.9.0" + # # # Lark Stand-alone Generator Tool @@ -27,6 +29,9 @@ import os from io import open +import logging + + class LarkError(Exception): pass @@ -54,38 +59,55 @@ class UnexpectedInput(LarkError): pos = self.pos_in_stream start = max(pos - span, 0) end = pos + span - before = text[start:pos].rsplit('\n', 1)[-1] - after = text[pos:end].split('\n', 1)[0] - return before + after + '\n' + ' ' * len(before) + '^\n' + if not isinstance(text, bytes): + before = text[start:pos].rsplit('\n', 1)[-1] + after = text[pos:end].split('\n', 1)[0] + return before + after + '\n' + ' ' * len(before) + '^\n' + else: + before = text[start:pos].rsplit(b'\n', 1)[-1] + after = text[pos:end].split(b'\n', 1)[0] + return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") - def match_examples(self, parse_fn, examples, token_type_match_fallback=False): + def match_examples(self, parse_fn, examples, 
token_type_match_fallback=False, use_accepts=False): """ Given a parser instance and a dictionary mapping some label to some malformed syntax examples, it'll return the label for the example that best matches the current error. + + It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility. """ assert self.state is not None, "Not supported for this exception" + if isinstance(examples, dict): + examples = examples.items() + candidate = (None, False) - for label, example in examples.items(): + for i, (label, example) in enumerate(examples): assert not isinstance(example, STRING_TYPE) - for malformed in example: + for j, malformed in enumerate(example): try: parse_fn(malformed) except UnexpectedInput as ut: if ut.state == self.state: + if use_accepts and ut.accepts != self.accepts: + logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % + (self.state, self.accepts, ut.accepts, i, j)) + continue try: if ut.token == self.token: # Try exact match first + logging.debug("Exact Match at example [%s][%s]" % (i, j)) return label if token_type_match_fallback: # Fallback to token types match if (ut.token.type == self.token.type) and not candidate[-1]: + logging.debug("Token Type Fallback at example [%s][%s]" % (i, j)) candidate = label, True except AttributeError: pass if not candidate[0]: + logging.debug("Same State match at example [%s][%s]" % (i, j)) candidate = label, False return candidate[0] @@ -93,15 +115,20 @@ class UnexpectedInput(LarkError): class UnexpectedCharacters(LexError, UnexpectedInput): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): - message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) - self.line = line self.column = column - self.allowed = allowed - self.considered_tokens = considered_tokens self.pos_in_stream = lex_pos self.state = state + self.allowed = allowed + self.considered_tokens = considered_tokens + + if isinstance(seq, bytes): + _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") + else: + _s = seq[lex_pos] + + message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column) message += '\n\n' + self.get_context(seq) if allowed: message += '\nExpecting: %s\n' % allowed @@ -114,18 +141,23 @@ class UnexpectedCharacters(LexError, UnexpectedInput): class UnexpectedToken(ParseError, UnexpectedInput): def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): - self.token = token - self.expected = expected # XXX str shouldn't necessary self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') - self.considered_rules = considered_rules - self.state = state self.pos_in_stream = getattr(token, 'pos_in_stream', None) + self.state = state + + self.token = token + self.expected = expected # XXX deprecate?
`accepts` is better + self.considered_rules = considered_rules self.puppet = puppet + # TODO Only calculate `accepts()` when we need to display it to the user + # This will improve performance when doing automatic error handling + self.accepts = puppet and puppet.accepts() + message = ("Unexpected token %r at line %s, column %s.\n" "Expected one of: \n\t* %s\n" - % (token, self.line, self.column, '\n\t* '.join(self.expected))) + % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) super(UnexpectedToken, self).__init__(message) @@ -286,6 +318,9 @@ def get_regexp_width(expr): raise ValueError(expr) +from collections import OrderedDict + + class Meta: def __init__(self): self.empty = True @@ -364,6 +399,8 @@ class Discard(Exception): # Transformers class _Decoratable: + "Provides support for decorating methods with @v_args" + @classmethod def _apply_decorator(cls, decorator, **kwargs): mro = getmro(cls) @@ -978,8 +1015,7 @@ class Token(Str): try: self = super(Token, cls).__new__(cls, value) except UnicodeDecodeError: - # value = value.decode('latin1') - value = value.decode("ascii", "backslashreplace") + value = value.decode('latin1') self = super(Token, cls).__new__(cls, value) self.type = type_ @@ -1022,8 +1058,8 @@ class Token(Str): class LineCounter: - def __init__(self): - self.newline_char = '\n' + def __init__(self, newline_char): + self.newline_char = newline_char self.char_pos = 0 self.line = 1 self.column = 1 @@ -1052,7 +1088,7 @@ class _Lex: def lex(self, stream, newline_types, ignore_types): newline_types = frozenset(newline_types) ignore_types = frozenset(ignore_types) - line_ctr = LineCounter() + line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n') last_token = None while line_ctr.char_pos < len(stream): @@ -1113,7 +1149,7 @@ class CallChain: -def _create_unless(terminals, g_regex_flags, re_): +def _create_unless(terminals, g_regex_flags, re_, use_bytes): tokens_by_type = classify(terminals, lambda t: type(t.pattern)) assert len(tokens_by_type) <= 2, tokens_by_type.keys() embedded_strs = set() @@ -1130,31 +1166,34 @@ def _create_unless(terminals, g_regex_flags, re_): if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) if unless: - callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True)) + callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) terminals = [t for t in terminals if t not in embedded_strs] return terminals, callback -def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_): +def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes): # Python sets an unreasonable group limit (currently 100) in its re module # Worse, the only way to know we reached it is by catching an AssertionError! # This function recursively tries less and less groups until it's successful. postfix = '$' if match_whole else '' mres = [] while terminals: + pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) + if use_bytes: + pattern = pattern.encode('latin-1') try: - mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags) + mre = re_.compile(pattern, g_regex_flags) except AssertionError: # Yes, this is what Python provides us.. 
:/ - return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_) + return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes) # terms_from_name = {t.name: t for t in terminals[:max_size]} mres.append((mre, {i:n for n,i in mre.groupindex.items()} )) terminals = terminals[max_size:] return mres -def build_mres(terminals, g_regex_flags, re_, match_whole=False): - return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_) +def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False): + return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes) def _regexp_has_newline(r): r"""Expressions that may indicate newlines in a regexp: @@ -1204,12 +1243,13 @@ class TraditionalLexer(Lexer): self.terminals = terminals self.user_callbacks = conf.callbacks self.g_regex_flags = conf.g_regex_flags + self.use_bytes = conf.use_bytes self._mres = None # self.build(g_regex_flags) def _build(self): - terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re) + terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes) assert all(self.callback.values()) for type_, f in self.user_callbacks.items(): @@ -1219,7 +1259,7 @@ class TraditionalLexer(Lexer): else: self.callback[type_] = f - self._mres = build_mres(terminals, self.g_regex_flags, self.re) + self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes) @property def mres(self): @@ -1248,7 +1288,8 @@ class ContextualLexer(Lexer): assert t.name not in tokens_by_name, t tokens_by_name[t.name] = t - trad_conf = type(conf)(terminals, conf.re_module, conf.ignore, callbacks=conf.callbacks, g_regex_flags=conf.g_regex_flags, skip_validation=conf.skip_validation) + trad_conf = copy(conf) + trad_conf.tokens = terminals lexer_by_tokens = {} self.lexers = {} @@ -1293,10 +1334,10 @@ class ContextualLexer(Lexer): class LexerConf(Serialize): - __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags' + __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes' __serialize_namespace__ = TerminalDef, - def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False): + def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): self.tokens = tokens # TODO should be terminals self.ignore = ignore self.postlex = postlex @@ -1304,9 +1345,7 @@ class LexerConf(Serialize): self.g_regex_flags = g_regex_flags self.re_module = re_module self.skip_validation = skip_validation - - def _deserialize(self): - self.callbacks = {} # TODO + self.use_bytes = use_bytes from functools import partial, wraps @@ -1627,10 +1666,10 @@ class _Parser: try: return states[state][token.type] except KeyError: - expected = [s for s in states[state].keys() if s.isupper()] + expected = {s for s in states[state].keys() if s.isupper()} try: puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) - except NameError: + except NameError: # For standalone parser puppet = None raise UnexpectedToken(token, expected, state=state, puppet=puppet) @@ -1760,7 +1799,14 @@ def get_frontend(parser, lexer): elif lexer == 'contextual': return LALR_ContextualLexer elif issubclass(lexer, Lexer): - return partial(LALR_CustomLexer, lexer) + class LALR_CustomLexerWrapper(LALR_CustomLexer): + def __init__(self, lexer_conf, parser_conf, options=None): + 
super(LALR_CustomLexerWrapper, self).__init__( + lexer, lexer_conf, parser_conf, options=options) + def init_lexer(self): + self.lexer = lexer(self.lexer_conf) + + return LALR_CustomLexerWrapper else: raise ValueError('Unknown lexer: %s' % lexer) elif parser=='earley': @@ -1793,6 +1839,15 @@ class _ParserFrontend(Serialize): return self.parser.parse(input, start, *args) +def _get_lexer_callbacks(transformer, terminals): + result = {} + for terminal in terminals: + callback = getattr(transformer, terminal.name, None) + if callback is not None: + result[terminal.name] = callback + return result + + class WithLexer(_ParserFrontend): lexer = None parser = None @@ -1808,13 +1863,18 @@ class WithLexer(_ParserFrontend): self.postlex = lexer_conf.postlex @classmethod - def deserialize(cls, data, memo, callbacks, postlex, re_module): + def deserialize(cls, data, memo, callbacks, postlex, transformer, re_module): inst = super(WithLexer, cls).deserialize(data, memo) + inst.postlex = postlex inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) + + terminals = [item for item in memo.values() if isinstance(item, TerminalDef)] + inst.lexer_conf.callbacks = _get_lexer_callbacks(transformer, terminals) inst.lexer_conf.re_module = re_module inst.lexer_conf.skip_validation=True inst.init_lexer() + return inst def _serialize(self, data, memo): @@ -1922,6 +1982,7 @@ class LarkOptions(Serialize): invert (Default: auto) lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. + use_bytes - Accept an input of type `bytes` instead of `str` (Python 3 only). edit_terminals - A callback """ if __doc__: @@ -1945,6 +2006,7 @@ class LarkOptions(Serialize): 'maybe_placeholders': False, 'edit_terminals': None, 'g_regex_flags': 0, + 'use_bytes': False, } def __init__(self, options_dict): @@ -1954,7 +2016,7 @@ class LarkOptions(Serialize): for name, default in self._defaults.items(): if name in o: value = o.pop(name) - if isinstance(default, bool) and name != 'cache': + if isinstance(default, bool) and name not in ('cache', 'use_bytes'): value = bool(value) else: value = default @@ -2027,6 +2089,13 @@ class Lark(Serialize): grammar = read() assert isinstance(grammar, STRING_TYPE) + self.grammar_source = grammar + if self.options.use_bytes: + if not isascii(grammar): + raise ValueError("Grammar must be ascii only, when use_bytes=True") + if sys.version_info[0] == 2 and self.options.use_bytes != 'force': + raise NotImplementedError("`use_bytes=True` may have issues on python2." + "Use `use_bytes='force'` to use it at your own risk.") cache_fn = None if self.options.cache: @@ -2036,7 +2105,7 @@ class Lark(Serialize): cache_fn = self.options.cache else: if self.options.cache is not True: - raise ValueError("cache must be bool or str") + raise ValueError("cache argument must be bool or str") unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals') from . import __version__ options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable) @@ -2092,7 +2161,7 @@ class Lark(Serialize): for t in self.terminals: self.options.edit_terminals(t) - self._terminals_dict = {t.name:t for t in self.terminals} + self._terminals_dict = {t.name: t for t in self.terminals} # If the user asked to invert the priorities, negate them all here. # This replaces the old 'resolve__antiscore_sum' option. @@ -2109,14 +2178,12 @@ class Lark(Serialize): rule.options.priority = None # TODO Deprecate lexer_callbacks? 
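Alongside the callback changes, this regenerated module carries the new `use_bytes` option through `LexerConf` and the lexer builders: the joined terminal patterns are encoded as latin-1, the input is sliced as bytes, and the grammar itself must be ASCII-only. A minimal sketch of the intended usage (hypothetical grammar):

```python
from lark import Lark

# With use_bytes=True the lexer compiles its regexps over bytes, so
# parse() takes a bytes object instead of str (grammar must be ASCII-only).
parser = Lark(r'start: /[a-z]+/', parser='lalr', use_bytes=True)
tree = parser.parse(b'hello')
```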
- lexer_callbacks = dict(self.options.lexer_callbacks) - if self.options.transformer: - t = self.options.transformer - for term in self.terminals: - if hasattr(t, term.name): - lexer_callbacks[term.name] = getattr(t, term.name) + lexer_callbacks = (_get_lexer_callbacks(self.options.transformer, self.terminals) + if self.options.transformer + else {}) + lexer_callbacks.update(self.options.lexer_callbacks) - self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags) + self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes) if self.options.parser: self.parser = self._build_parser() @@ -2175,7 +2242,14 @@ class Lark(Serialize): self.rules = [Rule.deserialize(r, memo) for r in data['rules']] self.source = '' self._prepare_callbacks() - self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, re_module) + self.parser = self.parser_class.deserialize( + data['parser'], + memo, + self._callbacks, + self.options.postlex, + self.options.transformer, + re_module + ) return self @classmethod @@ -2244,10 +2318,10 @@ class Lark(Serialize): DATA = ( -{'rules': [{'@': 23}, {'@': 31}, {'@': 26}, {'@': 13}, {'@': 24}, {'@': 19}, {'@': 14}, {'@': 27}, {'@': 28}, {'@': 16}, {'@': 29}, {'@': 12}, {'@': 25}, {'@': 30}, {'@': 20}, {'@': 22}, {'@': 15}, {'@': 21}, {'@': 17}, {'@': 18}], 'parser': {'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], 'ignore': [u'WS'], 'g_regex_flags': 0, '__type__': 'LexerConf'}, 'parser': {'tokens': {0: 'COMMA', 1: 'RSQB', 2: 'RBRACE', 3: '$END', 4: 'LBRACE', 5: u'FALSE', 6: u'string', 7: u'object', 8: u'NULL', 9: u'SIGNED_NUMBER', 10: u'value', 11: u'array', 12: u'ESCAPED_STRING', 13: u'TRUE', 14: 'LSQB', 15: 'COLON', 16: u'pair', 17: u'__array_star_0', 18: u'__object_star_1', 19: 'start'}, 'states': {0: {0: (1, {'@': 12}), 1: (1, {'@': 12}), 2: (1, {'@': 12}), 3: (1, {'@': 12})}, 1: {1: (0, 29), 4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 6), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 2: {0: (0, 23), 2: (0, 0)}, 3: {15: (0, 12)}, 4: {16: (0, 13), 12: (0, 21), 6: (0, 3)}, 5: {0: (1, {'@': 13}), 1: (1, {'@': 13}), 2: (1, {'@': 13}), 3: (1, {'@': 13})}, 6: {0: (0, 7), 1: (0, 11), 17: (0, 17)}, 7: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 9), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 8: {0: (1, {'@': 14}), 1: (1, {'@': 14}), 2: (1, {'@': 14}), 3: (1, {'@': 14})}, 9: {0: (1, {'@': 15}), 1: (1, {'@': 15})}, 10: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 20), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 11: {0: (1, {'@': 16}), 1: (1, {'@': 16}), 2: (1, {'@': 16}), 3: (1, {'@': 16})}, 12: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 18), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 13: {0: (1, {'@': 17}), 2: (1, {'@': 17})}, 14: {}, 15: {0: (1, {'@': 18}), 2: (1, {'@': 18})}, 16: {0: (1, {'@': 19}), 1: (1, {'@': 19}), 2: (1, {'@': 19}), 3: (1, {'@': 19})}, 17: {0: (0, 10), 1: (0, 28)}, 18: {0: (1, {'@': 20}), 2: (1, {'@': 20})}, 19: {0: (0, 4), 18: (0, 2), 2: (0, 25)}, 20: {0: (1, {'@': 21}), 1: (1, {'@': 21})}, 21: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 2: (1, {'@': 22}), 
3: (1, {'@': 22}), 15: (1, {'@': 22})}, 22: {3: (1, {'@': 23})}, 23: {16: (0, 15), 12: (0, 21), 6: (0, 3)}, 24: {0: (1, {'@': 24}), 1: (1, {'@': 24}), 2: (1, {'@': 24}), 3: (1, {'@': 24})}, 25: {0: (1, {'@': 25}), 1: (1, {'@': 25}), 2: (1, {'@': 25}), 3: (1, {'@': 25})}, 26: {0: (1, {'@': 26}), 1: (1, {'@': 26}), 2: (1, {'@': 26}), 3: (1, {'@': 26})}, 27: {0: (1, {'@': 27}), 1: (1, {'@': 27}), 2: (1, {'@': 27}), 3: (1, {'@': 27})}, 28: {0: (1, {'@': 28}), 1: (1, {'@': 28}), 2: (1, {'@': 28}), 3: (1, {'@': 28})}, 29: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 2: (1, {'@': 29}), 3: (1, {'@': 29})}, 30: {0: (1, {'@': 30}), 1: (1, {'@': 30}), 2: (1, {'@': 30}), 3: (1, {'@': 30})}, 31: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 2: (1, {'@': 31}), 3: (1, {'@': 31})}, 32: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 22), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1), 19: (0, 14)}, 33: {16: (0, 19), 2: (0, 30), 12: (0, 21), 6: (0, 3)}}, 'end_states': {'start': 14}, 'start_states': {'start': 32}}, '__type__': 'LALR_ContextualLexer', 'start': ['start']}, '__type__': 'Lark', 'options': {'regex': False, 'transformer': None, 'lexer': 'contextual', 'lexer_callbacks': {}, 'start': ['start'], 'debug': False, 'postlex': None, 'parser': 'lalr', 'tree_class': None, 'priority': None, 'cache': False, 'g_regex_flags': 0, 'keep_all_tokens': False, 'ambiguity': 'auto', 'edit_terminals': None, 'propagate_positions': False, 'maybe_placeholders': False}} +{'parser': {'parser': {'tokens': {0: 'RSQB', 1: 'COMMA', 2: '$END', 3: 'RBRACE', 4: 'ESCAPED_STRING', 5: 'string', 6: 'pair', 7: 'LSQB', 8: 'LBRACE', 9: 'SIGNED_NUMBER', 10: 'NULL', 11: 'FALSE', 12: 'value', 13: 'array', 14: 'object', 15: 'TRUE', 16: '__array_star_0', 17: 'COLON', 18: '__object_star_1', 19: 'start'}, 'states': {0: {0: (1, {'@': 12}), 1: (1, {'@': 12}), 2: (1, {'@': 12}), 3: (1, {'@': 12})}, 1: {0: (1, {'@': 13}), 1: (1, {'@': 13}), 2: (1, {'@': 13}), 3: (1, {'@': 13})}, 2: {1: (0, 25), 0: (0, 19)}, 3: {0: (1, {'@': 14}), 1: (1, {'@': 14}), 2: (1, {'@': 14}), 3: (1, {'@': 14})}, 4: {4: (0, 31), 5: (0, 13), 6: (0, 26)}, 5: {0: (1, {'@': 15}), 1: (1, {'@': 15}), 2: (1, {'@': 15}), 3: (1, {'@': 15})}, 6: {0: (1, {'@': 16}), 1: (1, {'@': 16}), 2: (1, {'@': 16}), 3: (1, {'@': 16})}, 7: {0: (1, {'@': 17}), 1: (1, {'@': 17}), 2: (1, {'@': 17}), 3: (1, {'@': 17})}, 8: {1: (0, 14), 3: (0, 28)}, 9: {0: (0, 21), 7: (0, 9), 8: (0, 18), 9: (0, 0), 10: (0, 1), 11: (0, 29), 5: (0, 5), 12: (0, 10), 13: (0, 7), 14: (0, 33), 4: (0, 31), 15: (0, 24)}, 10: {1: (0, 20), 16: (0, 2), 0: (0, 3)}, 11: {0: (1, {'@': 18}), 1: (1, {'@': 18})}, 12: {2: (1, {'@': 19})}, 13: {17: (0, 32)}, 14: {5: (0, 13), 4: (0, 31), 6: (0, 23)}, 15: {18: (0, 8), 1: (0, 4), 3: (0, 17)}, 16: {0: (1, {'@': 20}), 1: (1, {'@': 20})}, 17: {0: (1, {'@': 21}), 1: (1, {'@': 21}), 2: (1, {'@': 21}), 3: (1, {'@': 21})}, 18: {4: (0, 31), 6: (0, 15), 5: (0, 13), 3: (0, 6)}, 19: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 2: (1, {'@': 22}), 3: (1, {'@': 22})}, 20: {7: (0, 9), 8: (0, 18), 12: (0, 11), 9: (0, 0), 14: (0, 33), 10: (0, 1), 4: (0, 31), 15: (0, 24), 5: (0, 5), 11: (0, 29), 13: (0, 7)}, 21: {0: (1, {'@': 23}), 1: (1, {'@': 23}), 2: (1, {'@': 23}), 3: (1, {'@': 23})}, 22: {1: (1, {'@': 24}), 3: (1, {'@': 24})}, 23: {1: (1, {'@': 25}), 3: (1, {'@': 25})}, 24: {0: (1, {'@': 26}), 1: (1, {'@': 26}), 2: (1, {'@': 26}), 3: (1, {'@': 26})}, 25: {7: (0, 9), 12: (0, 16), 8: (0, 18), 9: (0, 0), 14: (0, 33), 10: (0, 1), 4: (0, 31), 15: (0, 24), 5: (0, 5), 11: (0, 29), 13: (0, 7)}, 26: 
{1: (1, {'@': 27}), 3: (1, {'@': 27})}, 27: {7: (0, 9), 8: (0, 18), 12: (0, 12), 9: (0, 0), 10: (0, 1), 11: (0, 29), 5: (0, 5), 13: (0, 7), 14: (0, 33), 4: (0, 31), 15: (0, 24), 19: (0, 30)}, 28: {0: (1, {'@': 28}), 1: (1, {'@': 28}), 2: (1, {'@': 28}), 3: (1, {'@': 28})}, 29: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 2: (1, {'@': 29}), 3: (1, {'@': 29})}, 30: {}, 31: {17: (1, {'@': 30}), 0: (1, {'@': 30}), 1: (1, {'@': 30}), 2: (1, {'@': 30}), 3: (1, {'@': 30})}, 32: {7: (0, 9), 8: (0, 18), 12: (0, 22), 9: (0, 0), 14: (0, 33), 10: (0, 1), 4: (0, 31), 15: (0, 24), 5: (0, 5), 11: (0, 29), 13: (0, 7)}, 33: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 2: (1, {'@': 31}), 3: (1, {'@': 31})}}, 'start_states': {'start': 27}, 'end_states': {'start': 30}}, 'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], 'ignore': ['WS'], 'g_regex_flags': 0, 'use_bytes': False, '__type__': 'LexerConf'}, 'start': ['start'], '__type__': 'LALR_ContextualLexer'}, 'rules': [{'@': 19}, {'@': 31}, {'@': 17}, {'@': 15}, {'@': 12}, {'@': 26}, {'@': 29}, {'@': 13}, {'@': 22}, {'@': 14}, {'@': 23}, {'@': 28}, {'@': 21}, {'@': 16}, {'@': 24}, {'@': 30}, {'@': 18}, {'@': 20}, {'@': 27}, {'@': 25}], 'options': {'debug': False, 'keep_all_tokens': False, 'tree_class': None, 'cache': False, 'postlex': None, 'parser': 'lalr', 'lexer': 'contextual', 'transformer': None, 'start': ['start'], 'priority': None, 'ambiguity': 'auto', 'regex': False, 'propagate_positions': False, 'lexer_callbacks': {}, 'maybe_placeholders': False, 'edit_terminals': None, 'g_regex_flags': 0, 'use_bytes': False}, '__type__': 'Lark'} ) MEMO = ( -{0: {'priority': 1, 'pattern': {'__type__': 'PatternRE', '_width': [2, 4294967295], 'flags': [], 'value': u'\\".*?(? 
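For orientation after the generated tables: the regenerated `json_parser.py` is meant to be used without installing Lark at all. A sketch, assuming the module is on the import path; `Lark_StandAlone` (and its `transformer` keyword, working since PATCH 21) follows the conventions exercised in `tests/test_tools.py`:

```python
from json_parser import Lark_StandAlone

parser = Lark_StandAlone()
tree = parser.parse('{"key": [1, 2, null]}')
print(tree.pretty())
```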
Date: Fri, 14 Aug 2020 16:17:26 +0300 Subject: [PATCH 24/25] Adjustments to logging PR --- lark/__init__.py | 2 +- lark/common.py | 7 --- lark/exceptions.py | 11 ++-- lark/lark.py | 4 +- lark/parsers/earley.py | 2 +- lark/parsers/lalr_analysis.py | 5 +- lark/utils.py | 96 ++++++++++++++++++----------------- 7 files changed, 61 insertions(+), 66 deletions(-) diff --git a/lark/__init__.py b/lark/__init__.py index 9bd88b0..1b5e7e3 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -1,4 +1,4 @@ -from .common import logger +from .utils import logger from .tree import Tree from .visitors import Transformer, Visitor, v_args, Discard from .visitors import InlineTransformer, inline_args # XXX Deprecated diff --git a/lark/common.py b/lark/common.py index b333dcb..714399a 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,13 +1,6 @@ -import logging from .utils import Serialize from .lexer import TerminalDef -logger = logging.getLogger("lark") -logger.addHandler(logging.StreamHandler()) -# Set to highest level, since we have some warnings amongst the code -# By default, we should not output any log messages -logger.setLevel(logging.CRITICAL) - ###{standalone class LexerConf(Serialize): diff --git a/lark/exceptions.py b/lark/exceptions.py index d1b956d..9d2d8dc 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -1,7 +1,6 @@ -from .utils import STRING_TYPE +from .utils import STRING_TYPE, logger ###{standalone -import logging class LarkError(Exception): @@ -62,24 +61,24 @@ class UnexpectedInput(LarkError): except UnexpectedInput as ut: if ut.state == self.state: if use_accepts and ut.accepts != self.accepts: - logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % + logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % (self.state, self.accepts, ut.accepts, i, j)) continue try: if ut.token == self.token: # Try exact match first - logging.debug("Exact Match at example [%s][%s]" % (i, j)) + logger.debug("Exact Match at example [%s][%s]" % (i, j)) return label if token_type_match_fallback: # Fallback to token types match if (ut.token.type == self.token.type) and not candidate[-1]: - logging.debug("Token Type Fallback at example [%s][%s]" % (i, j)) + logger.debug("Token Type Fallback at example [%s][%s]" % (i, j)) candidate = label, True except AttributeError: pass if not candidate[0]: - logging.debug("Same State match at example [%s][%s]" % (i, j)) + logger.debug("Same State match at example [%s][%s]" % (i, j)) candidate = label, False return candidate[0] diff --git a/lark/lark.py b/lark/lark.py index ddea2d6..9a4e001 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -4,10 +4,10 @@ import sys, os, pickle, hashlib from io import open -from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii +from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger from .load_grammar import load_grammar from .tree import Tree -from .common import LexerConf, ParserConf, logger +from .common import LexerConf, ParserConf from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken from .parse_tree_builder import ParseTreeBuilder diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index bf099e6..098639d 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -14,7 +14,7 @@ from collections import deque from ..visitors import Transformer_InPlace, v_args from ..exceptions import UnexpectedEOF, UnexpectedToken -from ..common import logger +from ..utils import logger from 
.grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal from .earley_common import Item, TransitiveItem diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 861941f..7a94b4d 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -6,11 +6,10 @@ For now, shift/reduce conflicts are automatically resolved as shifts. # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -from collections import defaultdict, deque +from collections import defaultdict -from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator +from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger from ..exceptions import GrammarError -from ..common import logger from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet from ..grammar import Rule diff --git a/lark/utils.py b/lark/utils.py index c70b947..0c41e6b 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -4,51 +4,15 @@ from functools import reduce from ast import literal_eval from collections import deque -class fzset(frozenset): - def __repr__(self): - return '{%s}' % ', '.join(map(repr, self)) - - -def classify_bool(seq, pred): - true_elems = [] - false_elems = [] - - for elem in seq: - if pred(elem): - true_elems.append(elem) - else: - false_elems.append(elem) - - return true_elems, false_elems - - - -def bfs(initial, expand): - open_q = deque(list(initial)) - visited = set(open_q) - while open_q: - node = open_q.popleft() - yield node - for next_node in expand(node): - if next_node not in visited: - visited.add(next_node) - open_q.append(next_node) - - +###{standalone +import logging +logger = logging.getLogger("lark") +logger.addHandler(logging.StreamHandler()) +# Set to highest level, since we have some warnings amongst the code +# By default, we should not output any log messages +logger.setLevel(logging.CRITICAL) -def _serialize(value, memo): - if isinstance(value, Serialize): - return value.serialize(memo) - elif isinstance(value, list): - return [_serialize(elem, memo) for elem in value] - elif isinstance(value, frozenset): - return list(value) # TODO reversible? 
- elif isinstance(value, dict): - return {key:_serialize(elem, memo) for key, elem in value.items()} - return value - -###{standalone def classify(seq, key=None, value=None): d = {} for item in seq: @@ -302,13 +266,11 @@ def combine_alternatives(lists): return reduce(lambda a,b: [i+[j] for i in a for j in b], lists[1:], init) - class FS: open = open exists = os.path.exists - def isascii(s): """ str.isascii only exists in python3.7+ """ try: @@ -318,4 +280,46 @@ def isascii(s): s.encode('ascii') return True except (UnicodeDecodeError, UnicodeEncodeError): - return False \ No newline at end of file + return False + + +class fzset(frozenset): + def __repr__(self): + return '{%s}' % ', '.join(map(repr, self)) + + +def classify_bool(seq, pred): + true_elems = [] + false_elems = [] + + for elem in seq: + if pred(elem): + true_elems.append(elem) + else: + false_elems.append(elem) + + return true_elems, false_elems + + +def bfs(initial, expand): + open_q = deque(list(initial)) + visited = set(open_q) + while open_q: + node = open_q.popleft() + yield node + for next_node in expand(node): + if next_node not in visited: + visited.add(next_node) + open_q.append(next_node) + + +def _serialize(value, memo): + if isinstance(value, Serialize): + return value.serialize(memo) + elif isinstance(value, list): + return [_serialize(elem, memo) for elem in value] + elif isinstance(value, frozenset): + return list(value) # TODO reversible? + elif isinstance(value, dict): + return {key:_serialize(elem, memo) for key, elem in value.items()} + return value \ No newline at end of file From 39fb4c0f3e2c1c24ceeb4de29d6904a957eaaaf1 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 14 Aug 2020 16:34:51 +0300 Subject: [PATCH 25/25] Bugfix and warn on ambiguous intermediate nodes, based on PR #651 --- lark/parsers/earley_forest.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py index c8b4f25..4ed75d9 100644 --- a/lark/parsers/earley_forest.py +++ b/lark/parsers/earley_forest.py @@ -13,6 +13,7 @@ from collections import deque from operator import attrgetter from importlib import import_module +from ..utils import logger from ..tree import Tree from ..exceptions import ParseError @@ -328,10 +329,17 @@ class ForestToAmbiguousTreeVisitor(ForestToTreeVisitor): self.output_stack[-1].children.append(node) def visit_symbol_node_in(self, node): - if self.forest_sum_visitor and node.is_ambiguous and isinf(node.priority): - self.forest_sum_visitor.visit(node) - if not node.is_intermediate and node.is_ambiguous: - self.output_stack.append(Tree('_ambig', [])) + if node.is_ambiguous: + if self.forest_sum_visitor and isinf(node.priority): + self.forest_sum_visitor.visit(node) + if node.is_intermediate: + # TODO Support ambiguous intermediate nodes! + logger.warning("Ambiguous intermediate node in the SPPF: %s. " + "Lark does not currently process these ambiguities; resolving with the first derivation.", node) + return next(iter(node.children)) + else: + self.output_stack.append(Tree('_ambig', [])) + return iter(node.children) def visit_symbol_node_out(self, node):
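The new warning fires while an ambiguous SPPF is converted to a tree, which only happens under the Earley parser with `ambiguity='explicit'`; ambiguous intermediate nodes are now resolved with their first derivation instead of mis-building the tree. A sketch of the mode involved (whether a given ambiguity lands on a symbol node or an intermediate node depends on the grammar):

```python
from lark import Lark

# Deliberately ambiguous: "xxx" can be read as one+two or two+one.
parser = Lark(r"""
start: a a
a: "x"     -> one
 | "x" "x" -> two
""", ambiguity='explicit')

print(parser.parse("xxx").pretty())  # the tree contains a Tree('_ambig', ...) node
```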