From 3fbac825a84a516406f043ab79dacaf8eefdcfe4 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 24 Dec 2018 16:11:46 +0200 Subject: [PATCH 001/164] Added to tests: Make sure the standalone parser is reusable --- tests/test_tools.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_tools.py b/tests/test_tools.py index ff823ec..27927eb 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -49,6 +49,8 @@ class TestStandalone(TestCase): l = _Lark() x = l.parse('12 elephants') self.assertEqual(x.children, ['12', 'elephants']) + x = l.parse('16 candles') + self.assertEqual(x.children, ['16', 'candles']) def test_contextual(self): grammar = """ From 68cee8aa6e9a64bc77ef773910782c9a721db305 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 2 Jul 2019 18:31:22 +0300 Subject: [PATCH 002/164] Cleaned up a test --- tests/test_tools.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/tests/test_tools.py b/tests/test_tools.py index 27927eb..5316396 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -1,11 +1,9 @@ from __future__ import absolute_import import sys -import unittest -from unittest import TestCase +from unittest import TestCase, main from lark.tree import Tree - from lark.tools import standalone try: @@ -94,26 +92,19 @@ class TestStandalone(TestCase): _NEWLINE: /\n/ """ - # from lark import Lark - # l = Lark(grammar, parser='lalr', lexer='contextual', postlex=MyIndenter()) - # x = l.parse('(\n)\n') - # print('@@', x) - - context = self._create_standalone(grammar) _Lark = context['Lark_StandAlone'] - # l = _Lark(postlex=MyIndenter()) - # x = l.parse('()\n') - # print(x) + l = _Lark(postlex=MyIndenter()) + x = l.parse('()\n') + self.assertEqual(x, Tree('start', [])) l = _Lark(postlex=MyIndenter()) x = l.parse('(\n)\n') - print(x) - + self.assertEqual(x, Tree('start', [])) if __name__ == '__main__': - unittest.main() + main() From aa75d50bd5b19ab8003135848419e34063b1f2ac Mon Sep 17 00:00:00 2001 
From: Erez Shinan Date: Wed, 3 Jul 2019 15:12:33 +0300 Subject: [PATCH 003/164] End symbol working for lalr + fixed validation for undefined %ignore --- lark/grammar.py | 1 + lark/lexer.py | 2 +- lark/load_grammar.py | 13 ++++++++++--- lark/parsers/grammar_analysis.py | 4 ++-- lark/parsers/lalr_analysis.py | 2 +- lark/parsers/lalr_parser.py | 10 ++++++---- tests/test_parser.py | 12 ++++++++++++ 7 files changed, 33 insertions(+), 11 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index 14893fb..730b912 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -1,6 +1,7 @@ from .utils import Serialize ###{standalone +END = '_END$' class Symbol(Serialize): is_term = NotImplemented diff --git a/lark/lexer.py b/lark/lexer.py index 3e881f8..e12195f 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -292,7 +292,7 @@ class TraditionalLexer(Lexer): if t.pattern.min_width == 0: raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern)) - assert set(ignore) <= {t.name for t in terminals} + assert set(ignore) <= {t.name for t in terminals}, (ignore, terminals) # Init self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())] diff --git a/lark/load_grammar.py b/lark/load_grammar.py index f7b1011..8d50e0a 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -11,7 +11,7 @@ from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import LALR_TraditionalLexer from .common import LexerConf, ParserConf -from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol +from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, END from .utils import classify, suppress, dedup_list from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken @@ -91,6 +91,7 @@ TERMINALS = { '_DECLARE': r'%declare', '_IMPORT': r'%import', 'NUMBER': r'\d+', + '_END': r'\$', } RULES = { @@ -122,7 +123,8 @@ RULES = { 
'value': ['terminal', 'nonterminal', 'literal', - 'range'], + 'range', + 'end'], 'terminal': ['TERMINAL'], 'nonterminal': ['RULE'], @@ -131,6 +133,7 @@ RULES = { 'maybe': ['_LBRA expansions _RBRA'], 'range': ['STRING _DOT _DOT STRING'], + 'end': ['_END'], 'term': ['TERMINAL _COLON expansions _NL', 'TERMINAL _DOT NUMBER _COLON expansions _NL'], @@ -285,6 +288,9 @@ class CanonizeTree(Transformer_InPlace): tokenmods, value = args return tokenmods + [value] + def end(self): + return Token('TERMINAL', END) + class PrepareAnonTerminals(Transformer_InPlace): "Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them" @@ -735,6 +741,7 @@ class GrammarLoader: term_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in term_defs] term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs] + term_defs.append((END, (None, 0))) rule_defs = [options_from_rule(*x) for x in rule_defs] # Execute statements @@ -827,7 +834,7 @@ class GrammarLoader: raise GrammarError("Terminal '%s' defined more than once" % name) terminal_names.add(name) - if set(ignore_names) > terminal_names: + if set(ignore_names) - terminal_names: raise GrammarError("Terminals %s were marked to ignore but were not defined!" 
% (set(ignore_names) - terminal_names)) resolve_term_references(term_defs) diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 086349c..803b935 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -2,7 +2,7 @@ from collections import Counter from ..utils import bfs, fzset, classify from ..exceptions import GrammarError -from ..grammar import Rule, Terminal, NonTerminal +from ..grammar import Rule, Terminal, NonTerminal, END class RulePtr(object): @@ -109,7 +109,7 @@ class GrammarAnalyzer(object): def __init__(self, parser_conf, debug=False): self.debug = debug - root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')]) + root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal(END)]) for start in parser_conf.start} rules = parser_conf.rules + list(root_rules.values()) diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index eef1f9b..9c02ca2 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -13,7 +13,7 @@ from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator from ..exceptions import GrammarError from .grammar_analysis import GrammarAnalyzer, Terminal -from ..grammar import Rule +from ..grammar import Rule, END ###{standalone diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 39dd5f3..7444a74 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -5,6 +5,7 @@ from ..exceptions import UnexpectedToken from ..lexer import Token from ..utils import Enumerator, Serialize +from ..grammar import END from .lalr_analysis import LALR_Analyzer, Shift, IntParseTable @@ -94,13 +95,14 @@ class _Parser: else: reduce(arg) - token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) + token = Token.new_borrow_pos(END, None, token) if token else Token(END, None, 0, 1, 1) while True: _action, 
arg = get_action(token) if _action is Shift: - assert arg == end_state - val ,= value_stack - return val + if arg == end_state: + val ,= value_stack + return val + state_stack.append(arg) else: reduce(arg) diff --git a/tests/test_parser.py b/tests/test_parser.py index 3238ead..9a902b8 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1505,6 +1505,18 @@ def _make_parser_test(LEXER, PARSER): """ parser = _Lark(grammar) + @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only") + def test_end_symbol(self): + grammar = """ + start: a b? + a: "a" $ + b: "b" + """ + parser = _Lark(grammar) + + self.assertEqual(parser.parse('a'), Tree('start', [Tree('a', [])])) + self.assertRaises(UnexpectedInput, parser.parse, 'ab') + @unittest.skipIf(PARSER!='lalr', "Serialize currently only works for LALR parsers (though it should be easy to extend)") def test_serialize(self): From cf7479f1865e98f7595128cd992fd965dcce5638 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 23 Jan 2020 12:26:36 +0200 Subject: [PATCH 004/164] Post-merge fixed for end_symbol, + two more tests (Issue #237) --- lark/load_grammar.py | 8 -------- lark/parsers/lalr_analysis.py | 2 +- lark/parsers/lalr_parser.py | 12 ++++++++---- tests/test_parser.py | 24 ++++++++++++++++++++++++ 4 files changed, 33 insertions(+), 13 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index ad61239..0e0ce70 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -91,12 +91,8 @@ TERMINALS = { '_IGNORE': r'%ignore', '_DECLARE': r'%declare', '_IMPORT': r'%import', -<<<<<<< HEAD 'NUMBER': r'[+-]?\d+', -======= - 'NUMBER': r'\d+', '_END': r'\$', ->>>>>>> end_symbol } RULES = { @@ -137,12 +133,8 @@ RULES = { '?name': ['RULE', 'TERMINAL'], 'maybe': ['_LBRA expansions _RBRA'], -<<<<<<< HEAD 'range': ['STRING _DOTDOT STRING'], -======= - 'range': ['STRING _DOT _DOT STRING'], 'end': ['_END'], ->>>>>>> end_symbol 'term': ['TERMINAL _COLON expansions _NL', 'TERMINAL _DOT 
NUMBER _COLON expansions _NL'], diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index ade6163..82773a7 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -178,7 +178,7 @@ class LALR_Analyzer(GrammarAnalyzer): assert(len(root.kernel) == 1) for rp in root.kernel: assert(rp.index == 0) - self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ]) + self.directly_reads[(root, rp.next)] = set([ Terminal(END) ]) for state in self.lr0_states: seen = set() diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 11d3407..5c8416b 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -97,9 +97,13 @@ class _Parser: token = Token.new_borrow_pos(END, None, token) if token else Token(END, None, 0, 1, 1) while True: _action, arg = get_action(token) - assert(_action is Reduce) - reduce(arg) - if state_stack[-1] == end_state: - return value_stack[-1] + if _action is Shift: + state_stack.append(arg) + value_stack.append(token) + else: + assert(_action is Reduce) + reduce(arg) + if state_stack[-1] == end_state: + return value_stack[-1] ###} diff --git a/tests/test_parser.py b/tests/test_parser.py index 1b0a093..960eb3c 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1660,6 +1660,30 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(parser.parse('a'), Tree('start', [Tree('a', [])])) self.assertRaises(UnexpectedInput, parser.parse, 'ab') + @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only") + def test_end_symbol2(self): + grammar = """ + start: (a|b)+ + a: "a" ("x"|$) + b: "b" + """ + parser = _Lark(grammar) + + self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []),Tree('a', [])])) + self.assertRaises(UnexpectedInput, parser.parse, 'ab') + + @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only") + def test_end_symbol3(self): + grammar = """ + start: (a|b)+ + a: "a" (e|"x") + b: 
"b" + e: $ + """ + parser = _Lark(grammar) + + self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []),Tree('a', [Tree('e', [])])])) + self.assertRaises(UnexpectedInput, parser.parse, 'ab') @unittest.skipIf(PARSER!='lalr' or LEXER=='custom', "Serialize currently only works for LALR parsers without custom lexers (though it should be easy to extend)") def test_serialize(self): From 3fc97331881489d9320bd83eef25d66379d241dc Mon Sep 17 00:00:00 2001 From: julienmalard Date: Thu, 14 May 2020 14:36:55 -0400 Subject: [PATCH 005/164] Added regex module option. --- lark/lexer.py | 5 ++++- lark/parser_frontends.py | 5 ++++- setup.py | 5 ++++- tests/test_nearley/nearley | 2 +- tests/test_parser.py | 5 ++++- 5 files changed, 17 insertions(+), 5 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 32bfe78..36541d1 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -1,6 +1,9 @@ ## Lexer Implementation -import re +try: + import regex as re +except ImportError: + import re from .utils import Str, classify, get_regexp_width, Py36, Serialize from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index d68d186..9f80ed4 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,4 +1,7 @@ -import re +try: + import regex as re +except ImportError: + import re from functools import partial from .utils import get_regexp_width, Serialize diff --git a/setup.py b/setup.py index b962b7f..d31e4d2 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,7 @@ -import re +try: + import regex as re +except ImportError: + import re from setuptools import find_packages, setup __version__ ,= re.findall('__version__ = "(.*)"', open('lark/__init__.py').read()) diff --git a/tests/test_nearley/nearley b/tests/test_nearley/nearley index a46b374..cf8925f 160000 --- a/tests/test_nearley/nearley +++ b/tests/test_nearley/nearley @@ -1 +1 @@ -Subproject commit a46b37471db486db0f6e1ce6a2934fb238346b44 
+Subproject commit cf8925f729bde741a3076c5856c0c0862bc7f5de diff --git a/tests/test_parser.py b/tests/test_parser.py index fcb6d22..c6f420e 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,7 +1,10 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -import re +try: + import regex as re +except ImportError: + import re import unittest import logging import os From eeafdb954b2f4de71062bb44b06a6968e0921781 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Fri, 15 May 2020 17:11:23 -0400 Subject: [PATCH 006/164] Added preliminary tests. --- regex-requirements.txt | 1 + tests/test_regex.py | 34 ++++++++++++++++++++++++++++++++++ tox.ini | 1 + 3 files changed, 36 insertions(+) create mode 100644 regex-requirements.txt create mode 100644 tests/test_regex.py diff --git a/regex-requirements.txt b/regex-requirements.txt new file mode 100644 index 0000000..822e14a --- /dev/null +++ b/regex-requirements.txt @@ -0,0 +1 @@ +regex \ No newline at end of file diff --git a/tests/test_regex.py b/tests/test_regex.py new file mode 100644 index 0000000..db0bb85 --- /dev/null +++ b/tests/test_regex.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +import logging +import unittest + +logging.basicConfig(level=logging.INFO) + +from lark.lark import Lark + + +class TestRegex(unittest.TestCase): + def test_unicode_class(self): + "Tests that character classes from the `regex` module work correctly." + g = Lark(r""" + ?start: NAME + NAME: ID_START ID_CONTINUE* + ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ + ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/ + """) + + self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') + + def test_unicode_word(self): + "Tests that a persistent bug in the `re` module works when `regex` is enabled." 
+ g = Lark(r""" + ?start: NAME + NAME: /[\w]+/ + """) + self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') + + +if __name__ == '__main__': + unittest.main() diff --git a/tox.ini b/tox.ini index f0f311e..5427f0f 100644 --- a/tox.ini +++ b/tox.ini @@ -15,6 +15,7 @@ pypy3 = pypy3 whitelist_externals = git deps = -rnearley-requirements.txt + -rregex-requirements.txt # to always force recreation and avoid unexpected side effects recreate=True From 732a562a1cf7bba5216b0ae61ce00b1b4f46d10a Mon Sep 17 00:00:00 2001 From: Aleh Arol Date: Sun, 31 May 2020 00:01:19 +0300 Subject: [PATCH 007/164] Use token type equality as a fallback when matching error examples --- lark/exceptions.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index cf03746..4cbe4bf 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -39,7 +39,7 @@ class UnexpectedInput(LarkError): """ assert self.state is not None, "Not supported for this exception" - candidate = None + candidate = (None, False) for label, example in examples.items(): assert not isinstance(example, STRING_TYPE) @@ -51,12 +51,16 @@ class UnexpectedInput(LarkError): try: if ut.token == self.token: # Try exact match first return label + + if (ut.token.type == self.token.type) and not candidate[-1]: # Fallback to token types match + candidate = label, True + except AttributeError: pass - if not candidate: - candidate = label + if not candidate[0]: + candidate = label, False - return candidate + return candidate[0] class UnexpectedCharacters(LexError, UnexpectedInput): From 85545e394704e2004582923d82e12d88dd57e9a6 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 6 Jun 2020 17:46:46 +0300 Subject: [PATCH 008/164] Set theme jekyll-theme-cayman --- docs/_config.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/_config.yml diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 0000000..c419263 --- /dev/null +++ b/docs/_config.yml 
@@ -0,0 +1 @@ +theme: jekyll-theme-cayman \ No newline at end of file From 9e7eb48855238a52e12bbe2dd6cac7d78bd22bf6 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 6 Jun 2020 17:49:29 +0300 Subject: [PATCH 009/164] Set theme jekyll-theme-minimal --- docs/_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_config.yml b/docs/_config.yml index c419263..2f7efbe 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -1 +1 @@ -theme: jekyll-theme-cayman \ No newline at end of file +theme: jekyll-theme-minimal \ No newline at end of file From 9e6b1b95825c1d232ea760fc8c22d8c2413200b1 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 6 Jun 2020 17:53:37 +0300 Subject: [PATCH 010/164] Set theme jekyll-theme-slate --- docs/_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_config.yml b/docs/_config.yml index 2f7efbe..c741881 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -1 +1 @@ -theme: jekyll-theme-minimal \ No newline at end of file +theme: jekyll-theme-slate \ No newline at end of file From 87bc7aa914cdeb1ae181fff68e095dff10a462aa Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 6 Jun 2020 23:08:59 +0300 Subject: [PATCH 011/164] Added ide to github pages (initial) --- docs/ide/__init__.py | 8 + docs/ide/app.js | 105 ++ docs/ide/app.py | 76 + docs/ide/examples.py | 150 ++ docs/ide/files.json | 10 + docs/ide/html5/.gitignore | 4 + docs/ide/html5/CHANGELOG.md | 93 ++ docs/ide/html5/LICENSE | 165 ++ docs/ide/html5/README.md | 69 + docs/ide/html5/__init__.py | 6 + docs/ide/html5/core.py | 3152 +++++++++++++++++++++++++++++++++++ docs/ide/html5/ext.py | 475 ++++++ docs/ide/html5/ignite.py | 186 +++ docs/ide/html5/utils.py | 101 ++ docs/ide/index.html | 101 ++ docs/ide/is-loading.gif | Bin 0 -> 404 bytes docs/ide/lark-logo.png | Bin 0 -> 13770 bytes 17 files changed, 4701 insertions(+) create mode 100644 docs/ide/__init__.py create mode 100644 docs/ide/app.js create mode 100644 docs/ide/app.py 
create mode 100644 docs/ide/examples.py create mode 100644 docs/ide/files.json create mode 100644 docs/ide/html5/.gitignore create mode 100644 docs/ide/html5/CHANGELOG.md create mode 100644 docs/ide/html5/LICENSE create mode 100644 docs/ide/html5/README.md create mode 100644 docs/ide/html5/__init__.py create mode 100644 docs/ide/html5/core.py create mode 100644 docs/ide/html5/ext.py create mode 100644 docs/ide/html5/ignite.py create mode 100644 docs/ide/html5/utils.py create mode 100644 docs/ide/index.html create mode 100644 docs/ide/is-loading.gif create mode 100644 docs/ide/lark-logo.png diff --git a/docs/ide/__init__.py b/docs/ide/__init__.py new file mode 100644 index 0000000..1ef4637 --- /dev/null +++ b/docs/ide/__init__.py @@ -0,0 +1,8 @@ +from . import html5, app + + +def start(): + html5.Body().appendChild( + app.App() + ) + diff --git a/docs/ide/app.js b/docs/ide/app.js new file mode 100644 index 0000000..47c0662 --- /dev/null +++ b/docs/ide/app.js @@ -0,0 +1,105 @@ +class app { + + constructor(modules, invocation){ + languagePluginLoader.then(() => { + // If you don't require for pre-loaded Python packages, remove this promise below. + window.pyodide.runPythonAsync("import setuptools, micropip").then(()=>{ + window.pyodide.runPythonAsync("micropip.install('lark-parser')").then(()=>{ + this.fetchSources(modules).then(() => { + window.pyodide.runPythonAsync("import " + Object.keys(modules).join("\nimport ") + "\n" + invocation + "\n").then(() => this.initializingComplete()); + }); + }); + }); + }); + } + + loadSources(module, baseURL, files) { + let promises = []; + + for (let f in files) { + promises.push( + new Promise((resolve, reject) => { + let file = files[f]; + let url = (baseURL ? 
baseURL + "/" : "") + file; + + fetch(url, {}).then((response) => { + if (response.status === 200) + return response.text().then((code) => { + let path = ("/lib/python3.7/site-packages/" + module + "/" + file).split("/"); + let lookup = ""; + + for (let i in path) { + if (!path[i]) { + continue; + } + + lookup += (lookup ? "/" : "") + path[i]; + + if (parseInt(i) === path.length - 1) { + window.pyodide._module.FS.writeFile(lookup, code); + console.debug(`fetched ${lookup}`); + } else { + try { + window.pyodide._module.FS.lookupPath(lookup); + } catch { + window.pyodide._module.FS.mkdir(lookup); + console.debug(`created ${lookup}`); + } + } + } + + resolve(); + }); + else + reject(); + }); + }) + ); + } + + return Promise.all(promises); + } + + fetchSources(modules) { + let promises = []; + + for( let module of Object.keys(modules) ) + { + promises.push( + new Promise((resolve, reject) => { + fetch(`${modules[module]}/files.json`, {}).then((response) => { + if (response.status === 200) { + response.text().then((list) => { + let files = JSON.parse(list); + + this.loadSources(module, modules[module], files).then(() => { + resolve(); + }) + }) + } else { + reject(); + } + }) + })); + } + + return Promise.all(promises).then(() => { + for( let module of Object.keys(modules) ) { + window.pyodide.loadedPackages[module] = "default channel"; + } + + window.pyodide.runPython( + 'import importlib as _importlib\n' + + '_importlib.invalidate_caches()\n' + ); + }); + } + + initializingComplete() { + document.body.classList.remove("is-loading") + } +} + +(function () { + window.top.app = new app({"app": "."}, "app.start()"); +})(); diff --git a/docs/ide/app.py b/docs/ide/app.py new file mode 100644 index 0000000..7c126e8 --- /dev/null +++ b/docs/ide/app.py @@ -0,0 +1,76 @@ +from . import html5 +from .examples import examples + +from lark import Lark +from lark.tree import Tree + + +class App(html5.Div): + def __init__(self): + super().__init__(""" +

+ IDE +

+ +
+ + + + +
+
+ Grammar: + +
+
+ Input: + +
+
+
+
    +
+
+ """) + self.sinkEvent("onKeyUp", "onChange") + + self.parser = "earley" + + # Pre-load examples + for name, (grammar, input) in examples.items(): + option = html5.Option(name) + option.grammar = grammar + option.input = input + + self.examples.appendChild(option) + + def onChange(self, e): + if html5.utils.doesEventHitWidgetOrChildren(e, self.examples): + example = self.examples.children(self.examples["selectedIndex"]) + self.grammar["value"] = example.grammar.strip() + self.input["value"] = example.input.strip() + self.onKeyUp() + + elif html5.utils.doesEventHitWidgetOrChildren(e, self.parser): + self.parser = self.parser.children(self.parser["selectedIndex"])["value"] + self.onKeyUp() + + def onKeyUp(self, e=None): + l = Lark(self.grammar["value"], parser=self.parser) + + try: + ast = l.parse(self.input["value"]) + except Exception as e: + self.ast.appendChild( + html5.Li(str(e)), replace=True + ) + + print(ast) + traverse = lambda node: html5.Li([node.data, html5.Ul([traverse(c) for c in node.children])] if isinstance(node, Tree) else node) + self.ast.appendChild(traverse(ast), replace=True) diff --git a/docs/ide/examples.py b/docs/ide/examples.py new file mode 100644 index 0000000..af9c38c --- /dev/null +++ b/docs/ide/examples.py @@ -0,0 +1,150 @@ + +# Examples formattet this way: +# "name": ("grammar", "demo-input") + +examples = { + + # --- hello.lark --- + "hello.lark": (""" +start: WORD "," WORD "!" 
+ +%import common.WORD // imports from terminal library +%ignore " " // Disregard spaces in text +""", "Hello, World!"), + + # --- calc.lark --- +"calc.lark": (""" +?start: sum + | NAME "=" sum -> assign_var + +?sum: product + | sum "+" product -> add + | sum "-" product -> sub + +?product: atom + | product "*" atom -> mul + | product "/" atom -> div + +?atom: NUMBER -> number + | "-" atom -> neg + | NAME -> var + | "(" sum ")" + +%import common.CNAME -> NAME +%import common.NUMBER +%import common.WS_INLINE +%ignore WS_INLINE""", + "1 + 2 * 3 + 4"), + + # --- json.lark --- + "json.lark": (""" +?start: value +?value: object + | array + | string + | SIGNED_NUMBER -> number + | "true" -> true + | "false" -> false + | "null" -> null +array : "[" [value ("," value)*] "]" +object : "{" [pair ("," pair)*] "}" +pair : string ":" value +string : ESCAPED_STRING +%import common.ESCAPED_STRING +%import common.SIGNED_NUMBER +%import common.WS +%ignore WS""", +""" +[ + { + "_id": "5edb875cf3d764da55602437", + "index": 0, + "guid": "3dae2206-5d4d-41fe-b81d-dc8cdba7acaa", + "isActive": false, + "balance": "$2,872.54", + "picture": "http://placehold.it/32x32", + "age": 24, + "eyeColor": "blue", + "name": "Theresa Vargas", + "gender": "female", + "company": "GEEKOL", + "email": "theresavargas@geekol.com", + "phone": "+1 (930) 450-3445", + "address": "418 Herbert Street, Sexton, Florida, 1375", + "about": "Id minim deserunt laborum enim. Veniam commodo incididunt amet aute esse duis veniam occaecat nulla esse aute et deserunt eiusmod. Anim elit ullamco minim magna sint laboris. Est consequat quis deserunt excepteur in magna pariatur laborum quis eu. Ex quis tempor elit qui qui et culpa sunt sit esse mollit cupidatat. Fugiat cillum deserunt enim minim irure reprehenderit est. 
Voluptate nisi quis amet quis incididunt pariatur nostrud Lorem consectetur adipisicing voluptate.\\r\\n", + "registered": "2016-11-19T01:02:42 -01:00", + "latitude": -25.65267, + "longitude": 104.19531, + "tags": [ + "eiusmod", + "reprehenderit", + "anim", + "sunt", + "esse", + "proident", + "esse" + ], + "friends": [ + { + "id": 0, + "name": "Roth Herrera" + }, + { + "id": 1, + "name": "Callie Christian" + }, + { + "id": 2, + "name": "Gracie Whitfield" + } + ], + "greeting": "Hello, Theresa Vargas! You have 6 unread messages.", + "favoriteFruit": "banana" + }, + { + "_id": "5edb875c845eb08161a83e64", + "index": 1, + "guid": "a8ada2c1-e2c7-40d3-96b4-52c93baff7f0", + "isActive": false, + "balance": "$2,717.04", + "picture": "http://placehold.it/32x32", + "age": 23, + "eyeColor": "green", + "name": "Lily Ross", + "gender": "female", + "company": "RODEOMAD", + "email": "lilyross@rodeomad.com", + "phone": "+1 (941) 465-3561", + "address": "525 Beekman Place, Blodgett, Marshall Islands, 3173", + "about": "Aliquip duis proident excepteur eiusmod in quis officia consequat culpa eu et ut. Occaecat reprehenderit tempor mollit do eu magna qui et magna exercitation aliqua. Incididunt exercitation dolor proident eiusmod minim occaecat. Sunt et minim mollit et veniam sint ex. Duis ullamco elit aute eu excepteur reprehenderit officia.\\r\\n", + "registered": "2019-11-02T04:06:42 -01:00", + "latitude": 17.031701, + "longitude": -42.657106, + "tags": [ + "id", + "non", + "culpa", + "reprehenderit", + "esse", + "elit", + "sit" + ], + "friends": [ + { + "id": 0, + "name": "Ursula Maldonado" + }, + { + "id": 1, + "name": "Traci Huff" + }, + { + "id": 2, + "name": "Taylor Holt" + } + ], + "greeting": "Hello, Lily Ross! 
You have 3 unread messages.", + "favoriteFruit": "strawberry" + } +]""") +} \ No newline at end of file diff --git a/docs/ide/files.json b/docs/ide/files.json new file mode 100644 index 0000000..ebeb185 --- /dev/null +++ b/docs/ide/files.json @@ -0,0 +1,10 @@ +[ + "__init__.py", + "app.py", + "examples.py", + "html5/__init__.py", + "html5/core.py", + "html5/ext.py", + "html5/ignite.py", + "html5/utils.py" +] \ No newline at end of file diff --git a/docs/ide/html5/.gitignore b/docs/ide/html5/.gitignore new file mode 100644 index 0000000..b65483f --- /dev/null +++ b/docs/ide/html5/.gitignore @@ -0,0 +1,4 @@ +__target__ +__pycache__ +*.pyc +.idea diff --git a/docs/ide/html5/CHANGELOG.md b/docs/ide/html5/CHANGELOG.md new file mode 100644 index 0000000..6f24335 --- /dev/null +++ b/docs/ide/html5/CHANGELOG.md @@ -0,0 +1,93 @@ +# Changelog + +This file documents any relevant changes done to ViUR html5 since version 2. + +## 3.0.0 [develop] + +This is the current development version. + +- Feature: Ported framework to Python 3 using [Pyodide](https://github.com/iodide-project/pyodide), with a full source code and library cleanup +- Feature: `html5.Widget.__init__()` now allows parameters equal to `Widget.appendChild()` to directly stack widgets together. + Additionally, the following parameters are available: + - `appendTo`: Directly append the newly created widget to another widget. + - `style`: Provide class attributes for styling added to the new Widget, using `Widget.addClass()`. +- Feature: `html5.Widget.appendChild()` and `html5.Widget.prependChild()` can handle arbitrary input now, including HTML, lists of widgets or just text, in any order. `html5.Widget.insertChild()` runs slightly different, but shares same features. This change mostly supersedes `html5.Widget.fromHTML()`. +- Feature: New `replace`-parameter for `html5.Widget.appendChild()` and `html5.Widget.prependChild()` which clears the content. 
+- Feature: `html5.ext.InputDialog` refactored & disables OK-Button when no value is present. +- Feature: `html5.utils.doesEventHitWidgetOrChildren()` and `html5.utils.doesEventHitWidgetOrParent()` now return the Widget or None instead of a boolean, to avoid creating loops and directly work with the recognized Widget. +- Feature: New function `html5.Widget.onBind()` enables widgets to react when bound to other widgets using the HTML parser. +- Feature: Replace HTML-parsing-related `vars`-parameter generally by `**kwargs`, with backward-compatibility. +- Speed-improvement: Hold static `_WidgetClassWrapper` per `html5.Widget` instead of creating one each time on the fly. + +## [2.5.0] Vesuv + +Release date: Jul 26, 2019 + +- Bugfix: `Widget.Th()` now supporting full col-/rowspan getting and setting. +- Bugfix: HTML-parser accepts tags in upper-/camel-case order now. +- Bugfix: HTML-parser handles table tags with tbody/thead tags inside more gracefully. +- Feature: Split HTML-parser into separate stages to compile and run; This allows to pre-compile HTML into a list/dict-structure and render it later on without parsing it again. `parseHTML()` is the new function, `fromHTML()` works like before and handles pre-compiled or raw HTML as parameter. +- Feature: `fromHTML()` extended to `vars` parameter to replace key-value pairs in text-nodes and attribute values expressed as `{{key}}`. +- Feature: HTML-parser dynamically reconizes void elements +- Feature: `html5.registerTag()` can be used to define new or override existing HTML elements in the HTML parser by custom implementations based on `html5.Widget()` +- Feature: New function `Widget.isVisible()` as counterpart for `Widget.isHidden()`. + +## [2.4.0] Agung + +Release date: May 17, 2019 + +- Bugfix: Fixed bug with disabling of input widgets. +- Feature: Fully refactored the librarys source base into just two single files, to reduce number of required files to download and make the library easier to access. 
+- Feature: New function `Widget.isHidden()` to check if a widget is currently shown. +- Feature: Improved handling of key-events. +- Feature: Allow to close popups by pressing `ESC`. +- Feature: Improvements for SVG and TextNode. + +## [2.3.0] Kilauea + +Release date: Oct 2, 2018 + +- Refactored `html5.ext.SelectDialog` +- Extended html parser to apply data-attributes +- Switching event handling to newer JavaScript event listener API +- Added `onFocusIn` and `onFocusOut` events + +## [2.2.0] Etna + +Release date: Apr 23, 2018 + +- Implemented `html5.Head()` to access the document's head object within the library. +- Directly append text in construction of Li(). + +## [2.1.0] + +Release date: Nov 2, 2017 + +- Introduced a build-in HTML parser (`Widget.fromHTML()`) that is capable to compile HTML-code into DOM-objects of the html5 library, and an extra-feature to bind them to their root node for further access. This attempt makes it possible to create PyJS apps using the HTML5 library without creating every single element by hand. +- A more distinct way for `Widget.hide()` and `Widget.show()` that cannot be overridden by styling. (setting "hidden" does not work when another display value is set). +- Utility functions `Widget.enable() and `Widget.disable()`. +- Directly append text in construction of Div() and Span(). +- Allow for tuple and list processing in table cell assignments. +- Adding `utils.parseFloat()` and `utils.parseInt()` utility functions. +- Implemented `colspan` attribute for Th() +- New README.md and CHANGELOG.md. + +## 2.0 + +Release date: Dec 22, 2016 + +- v[2.0.1]: Directly append text in construction of Option(). +- v[2.0.1]: Anything added to Widget.appendChild() or Widget.prependChild() which is not a widget is handled as text (TextNode() is automatically created). 
+- New functions `Widget.prependChild()`, `Widget.insertBefore()`, `Widget.children()`, `Widget.removeAllChildren()`, + `Widget.addClass()`, `Widget.removeClass()`, `Widget.toggleClass()` +- Utility functions `utils.doesEventHitWidgetOrParents()`, `utils.doesEventHitWidgetOrChildren()` taken from vi77 +- Insert text blocks easier with `utils.textToHtml()` +- Several bugfixes + +[develop]: https://github.com/viur-framework/html5/compare/v2.5.0...develop +[2.5.0]: https://github.com/viur-framework/html5/compare/v2.4.0...v2.5.0 +[2.4.0]: https://github.com/viur-framework/html5/compare/v2.3.0...v2.4.0 +[2.3.0]: https://github.com/viur-framework/html5/compare/v2.2.0...v2.3.0 +[2.2.0]: https://github.com/viur-framework/html5/compare/v2.1.0...v2.2.0 +[2.1.0]: https://github.com/viur-framework/html5/compare/v2.0.0...v2.1.0 +[2.0.1]: https://github.com/viur-framework/html5/compare/v2.0.0...v2.0.1 diff --git a/docs/ide/html5/LICENSE b/docs/ide/html5/LICENSE new file mode 100644 index 0000000..65c5ca8 --- /dev/null +++ b/docs/ide/html5/LICENSE @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. 
+Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. 
You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. 
A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. 
+ + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/docs/ide/html5/README.md b/docs/ide/html5/README.md new file mode 100644 index 0000000..d51b38c --- /dev/null +++ b/docs/ide/html5/README.md @@ -0,0 +1,69 @@ +# ViUR html5 + +**html5** is a DOM-abstraction layer and API that is used to create client-side Web-Apps running in the browser and written in Python. + +Look [here](https://www.viur.dev/blog/html5-library) for a short introduction. + +## About + +This API and framework is used to implement HTML5 web-apps using the Python programming language. The framework is an abstraction layer for a DOM running in [Pyodide](https://github.com/iodide-project/pyodide), a Python 3 interpreter compiled to web-assembly. + +It provides + +- class abstraction for all HTML5-DOM-elements, e.g. `html5.Div()` +- a built-in HTML parser and executor to generate DOM objects from HTML-code +- helpers for adding/removing classes, arrange children, handling events etc. 
+ +The most prominent software completely established on this library is [ViUR-vi](https://github.com/viur-framework/viur-vi/), the visual administration interface for ViUR-based applications. + +[ViUR](https://www.viur.dev) is a free software development framework for the [Google App Engine](https://appengine.google.com). + +## Quick Start + +**Warning: This section is incomplete, a working example will follow soon!** + +```python +import html5 + +class Game(html5.Div): + def __init__(self): + super().__init__( + """ + + +

Hello Enter Name!

+ """) + self.sinkEvent("onChange") + + def onChange(self, event): + if html5.utils.doesEventHitWidgetOrChildren(event, self.myInput): + self.mySpan.appendChild(self.myInput["value"], replace=True) + +Game() +``` + +## Contributing + +We take a great interest in your opinion about ViUR. We appreciate your feedback and are looking forward to hear about your ideas. Share your visions or questions with us and participate in ongoing discussions. + +- [ViUR website](https://www.viur.dev) +- [#ViUR on freenode IRC](https://webchat.freenode.net/?channels=viur) +- [ViUR on GitHub](https://github.com/viur-framework) +- [ViUR on Twitter](https://twitter.com/weloveViUR) + +## Credits + +ViUR is developed and maintained by [Mausbrand Informationssysteme GmbH](https://www.mausbrand.de/en), from Dortmund in Germany. We are a software company consisting of young, enthusiastic software developers, designers and social media experts, working on exciting projects for different kinds of customers. All of our newer projects are implemented with ViUR, from tiny web-pages to huge company intranets with hundreds of users. + +Help of any kind to extend and improve or enhance this project in any kind or way is always appreciated. + +## License + +Copyright (C) 2012-2020 by Mausbrand Informationssysteme GmbH. + +Mausbrand and ViUR are registered trademarks of Mausbrand Informationssysteme GmbH. + +You may use, modify and distribute this software under the terms and conditions of the GNU Lesser General Public License (LGPL). See the file LICENSE provided within this package for more information. diff --git a/docs/ide/html5/__init__.py b/docs/ide/html5/__init__.py new file mode 100644 index 0000000..b62a821 --- /dev/null +++ b/docs/ide/html5/__init__.py @@ -0,0 +1,6 @@ +#-*- coding: utf-8 -*- + +from .core import * +from . 
import ext, utils, ignite + + diff --git a/docs/ide/html5/core.py b/docs/ide/html5/core.py new file mode 100644 index 0000000..6ebe679 --- /dev/null +++ b/docs/ide/html5/core.py @@ -0,0 +1,3152 @@ +# -*- coding: utf-8 -* + +######################################################################################################################## +# DOM-access functions and variables +######################################################################################################################## + +try: + # Pyodide + from js import window, eval as jseval + document = window.document + +except: + print("Emulation mode") + from xml.dom.minidom import parseString + + jseval = None + window = None + document = parseString("") + + +def domCreateAttribute(tag, ns=None): + """ + Creates a new HTML/SVG/... attribute + :param ns: the namespace. Default: HTML. Possible values: HTML, SVG, XBL, XUL + """ + uri = None + + if ns == "SVG": + uri = "http://www.w3.org/2000/svg" + elif ns == "XBL": + uri = "http://www.mozilla.org/xbl" + elif ns == "XUL": + uri = "http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul" + + if uri: + return document.createAttribute(uri, tag) + + return document.createAttribute(tag) + + +def domCreateElement(tag, ns=None): + """ + Creates a new HTML/SVG/... tag + :param ns: the namespace. Default: HTML. 
Possible values: HTML, SVG, XBL, XUL + """ + uri = None + + if ns == "SVG": + uri = "http://www.w3.org/2000/svg" + elif ns == "XBL": + uri = "http://www.mozilla.org/xbl" + elif ns == "XUL": + uri = "http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul" + + if uri: + return document.createElementNS(uri, tag) + + return document.createElement(tag) + + +def domCreateTextNode(txt=""): + return document.createTextNode(txt) + + +def domGetElementById(idTag): + return document.getElementById(idTag) + + +def domElementFromPoint(x, y): + return document.elementFromPoint(x, y) + + +def domGetElementsByTagName(tag): + items = document.getElementsByTagName(tag) + return [items.item(i) for i in range(0, int(items.length))] #pyodide interprets items.length as float, so convert to int + + +######################################################################################################################## +# HTML Widgets +######################################################################################################################## + +# TextNode ------------------------------------------------------------------------------------------------------------- + +class TextNode(object): + """ + Represents a piece of text inside the DOM. + This is the *only* object not deriving from "Widget", as it does + not support any of its properties. 
+ """ + + def __init__(self, txt=None, *args, **kwargs): + super().__init__() + self._parent = None + self._children = [] + self.element = domCreateTextNode(txt or "") + self._isAttached = False + + def _setText(self, txt): + self.element.data = txt + + def _getText(self): + return self.element.data + + def __str__(self): + return self.element.data + + def onAttach(self): + self._isAttached = True + + def onDetach(self): + self._isAttached = False + + def _setDisabled(self, disabled): + return + + def _getDisabled(self): + return False + + def children(self): + return [] + + +# _WidgetClassWrapper ------------------------------------------------------------------------------------------------- + +class _WidgetClassWrapper(list): + + def __init__(self, targetWidget): + super().__init__() + + self.targetWidget = targetWidget + + def _updateElem(self): + if len(self) == 0: + self.targetWidget.element.removeAttribute("class") + else: + self.targetWidget.element.setAttribute("class", " ".join(self)) + + def append(self, p_object): + list.append(self, p_object) + self._updateElem() + + def clear(self): + list.clear(self) + self._updateElem() + + def remove(self, value): + try: + list.remove(self, value) + except: + pass + self._updateElem() + + def extend(self, iterable): + list.extend(self, iterable) + self._updateElem() + + def insert(self, index, p_object): + list.insert(self, index, p_object) + self._updateElem() + + def pop(self, index=None): + list.pop(self, index) + self._updateElem() + + +# _WidgetDataWrapper --------------------------------------------------------------------------------------------------- + +class _WidgetDataWrapper(dict): + + def __init__(self, targetWidget): + super().__init__() + + self.targetWidget = targetWidget + alldata = targetWidget.element + + for data in dir(alldata.dataset): + dict.__setitem__(self, data, getattr(alldata.dataset, data)) + + def __setitem__(self, key, value): + dict.__setitem__(self, key, value) + 
self.targetWidget.element.setAttribute(str("data-" + key), value) + + def update(self, E=None, **F): + dict.update(self, E, **F) + if E is not None and "keys" in dir(E): + for key in E: + self.targetWidget.element.setAttribute(str("data-" + key), E["data-" + key]) + elif E: + for (key, val) in E: + self.targetWidget.element.setAttribute(str("data-" + key), "data-" + val) + for key in F: + self.targetWidget.element.setAttribute(str("data-" + key), F["data-" + key]) + + +# _WidgetStyleWrapper -------------------------------------------------------------------------------------------------- + +class _WidgetStyleWrapper(dict): + + def __init__(self, targetWidget): + super().__init__() + + self.targetWidget = targetWidget + style = targetWidget.element.style + + for key in dir(style): + # Convert JS-Style-Syntax to CSS Syntax (ie borderTop -> border-top) + realKey = "" + for currChar in key: + if currChar.isupper(): + realKey += "-" + realKey += currChar.lower() + val = style.getPropertyValue(realKey) + if val: + dict.__setitem__(self, realKey, val) + + def __setitem__(self, key, value): + dict.__setitem__(self, key, value) + self.targetWidget.element.style.setProperty(key, value) + + def update(self, E=None, **F): + dict.update(self, E, **F) + if E is not None and "keys" in dir(E): + for key in E: + self.targetWidget.element.style.setProperty(key, E[key]) + elif E: + for (key, val) in E: + self.targetWidget.element.style.setProperty(key, val) + for key in F: + self.targetWidget.element.style.setProperty(key, F[key]) + + +# Widget --------------------------------------------------------------------------------------------------------------- + +class Widget(object): + _tagName = None + _namespace = None + _parserTagName = None + style = [] + + def __init__(self, *args, appendTo=None, style=None, **kwargs): + if "_wrapElem" in kwargs.keys(): + self.element = kwargs["_wrapElem"] + del kwargs["_wrapElem"] + else: + assert self._tagName is not None + self.element = 
domCreateElement(self._tagName, ns=self._namespace) + + super().__init__() + self._widgetClassWrapper = _WidgetClassWrapper(self) + self.addClass(self.style) + + if style: + self.addClass(style) + + self._children = [] + self._catchedEvents = {} + self._disabledState = 0 + self._isAttached = False + self._parent = None + + self._lastDisplayState = None + + if args: + self.appendChild(*args, **kwargs) + + if appendTo: + appendTo.appendChild(self) + + def sinkEvent(self, *args): + for event_attrName in args: + event = event_attrName.lower() + + if event_attrName in self._catchedEvents or event in ["onattach", "ondetach"]: + continue + + eventFn = getattr(self, event_attrName, None) + assert eventFn and callable(eventFn), "{} must provide a {} method".format(str(self), event_attrName) + + self._catchedEvents[event_attrName] = eventFn + + if event.startswith("on"): + event = event[2:] + + self.element.addEventListener(event, eventFn) + + def unsinkEvent(self, *args): + for event_attrName in args: + event = event_attrName.lower() + + if event_attrName not in self._catchedEvents: + continue + + eventFn = self._catchedEvents[event_attrName] + del self._catchedEvents[event_attrName] + + if event.startswith("on"): + event = event[2:] + + self.element.removeEventListener(event, eventFn) + + def disable(self): + if not self["disabled"]: + self["disabled"] = True + + def enable(self): + if self["disabled"]: + self["disabled"] = False + + def _getDisabled(self): + return bool(self._disabledState) + + def _setDisabled(self, disable): + for child in self._children: + child._setDisabled(disable) + + if disable: + self._disabledState += 1 + self.addClass("is-disabled") + + if isinstance(self, _attrDisabled): + self.element.disabled = True + + elif self._disabledState: + self._disabledState -= 1 + + if not self._disabledState: + self.removeClass("is-disabled") + + if isinstance(self, _attrDisabled): + self.element.disabled = False + + def _getTargetfuncName(self, key, type): + 
assert type in ["get", "set"] + return "_{}{}{}".format(type, key[0].upper(), key[1:]) + + def __getitem__(self, key): + funcName = self._getTargetfuncName(key, "get") + + if funcName in dir(self): + return getattr(self, funcName)() + + return None + + def __setitem__(self, key, value): + funcName = self._getTargetfuncName(key, "set") + + if funcName in dir(self): + return getattr(self, funcName)(value) + + raise ValueError("{} is no valid attribute for {}".format(key, (self._tagName or str(self)))) + + def __str__(self): + return str(self.__class__.__name__) + + def __iter__(self): + return self._children.__iter__() + + def _getData(self): + """ + Custom data attributes are intended to store custom data private to the page or application, for which there are no more appropriate attributes or elements. + :param name: + :returns: + """ + return _WidgetDataWrapper(self) + + def _getTranslate(self): + """ + Specifies whether an elements attribute values and contents of its children are to be translated when the page is localized, or whether to leave them unchanged. + :returns: True | False + """ + return True if self.element.translate == "yes" else False + + def _setTranslate(self, val): + """ + Specifies whether an elements attribute values and contents of its children are to be translated when the page is localized, or whether to leave them unchanged. + :param val: True | False + """ + self.element.translate = "yes" if val == True else "no" + + def _getTitle(self): + """ + Advisory information associated with the element. + :returns: str + """ + return self.element.title + + def _setTitle(self, val): + """ + Advisory information associated with the element. 
+ :param val: str + """ + self.element.title = val + + def _getTabindex(self): + """ + Specifies whether the element represents an element that is is focusable (that is, an element which is part of the sequence of focusable elements in the document), and the relative order of the element in the sequence of focusable elements in the document. + :returns: number + """ + return self.element.getAttribute("tabindex") + + def _setTabindex(self, val): + """ + Specifies whether the element represents an element that is is focusable (that is, an element which is part of the sequence of focusable elements in the document), and the relative order of the element in the sequence of focusable elements in the document. + :param val: number + """ + self.element.setAttribute("tabindex", val) + + def _getSpellcheck(self): + """ + Specifies whether the element represents an element whose contents are subject to spell checking and grammar checking. + :returns: True | False + """ + return True if self.element.spellcheck == "true" else False + + def _setSpellcheck(self, val): + """ + Specifies whether the element represents an element whose contents are subject to spell checking and grammar checking. + :param val: True | False + """ + self.element.spellcheck = str(val).lower() + + def _getLang(self): + """ + Specifies the primary language for the contents of the element and for any of the elements attributes that contain text. + :returns: language tag e.g. de|en|fr|es|it|ru| + """ + return self.element.lang + + def _setLang(self, val): + """ + Specifies the primary language for the contents of the element and for any of the elements attributes that contain text. + :param val: language tag + """ + self.element.lang = val + + def _getHidden(self): + """ + Specifies that the element represents an element that is not yet, or is no longer, relevant. 
+ :returns: True | False + """ + return True if self.element.hasAttribute("hidden") else False + + def _setHidden(self, val): + """ + Specifies that the element represents an element that is not yet, or is no longer, relevant. + :param val: True | False + """ + if val: + self.element.setAttribute("hidden", "") + else: + self.element.removeAttribute("hidden") + + def _getDropzone(self): + """ + Specifies what types of content can be dropped on the element, and instructs the UA about which actions to take with content when it is dropped on the element. + :returns: "copy" | "move" | "link" + """ + return self.element.dropzone + + def _setDropzone(self, val): + """ + Specifies what types of content can be dropped on the element, and instructs the UA about which actions to take with content when it is dropped on the element. + :param val: "copy" | "move" | "link" + """ + self.element.dropzone = val + + def _getDraggable(self): + """ + Specifies whether the element is draggable. + :returns: True | False | "auto" + """ + return (self.element.draggable if str(self.element.draggable) == "auto" else ( + True if str(self.element.draggable).lower() == "true" else False)) + + def _setDraggable(self, val): + """ + Specifies whether the element is draggable. + :param val: True | False | "auto" + """ + self.element.draggable = str(val).lower() + + def _getDir(self): + """ + Specifies the elements text directionality. + :returns: ltr | rtl | auto + """ + return self.element.dir + + def _setDir(self, val): + """ + Specifies the elements text directionality. + :param val: ltr | rtl | auto + """ + self.element.dir = val + + def _getContextmenu(self): + """ + The value of the id attribute on the menu with which to associate the element as a context menu. + :returns: + """ + return self.element.contextmenu + + def _setContextmenu(self, val): + """ + The value of the id attribute on the menu with which to associate the element as a context menu. 
+ :param val: + """ + self.element.contextmenu = val + + def _getContenteditable(self): + """ + Specifies whether the contents of the element are editable. + :returns: True | False + """ + v = self.element.getAttribute("contenteditable") + return str(v).lower() == "true" + + def _setContenteditable(self, val): + """ + Specifies whether the contents of the element are editable. + :param val: True | False + """ + self.element.setAttribute("contenteditable", str(val).lower()) + + def _getAccesskey(self): + """ + A key label or list of key labels with which to associate the element; each key label represents a keyboard shortcut which UAs can use to activate the element or give focus to the element. + :param self: + :returns: + """ + return self.element.accesskey + + def _setAccesskey(self, val): + """ + A key label or list of key labels with which to associate the element; each key label represents a keyboard shortcut which UAs can use to activate the element or give focus to the element. + :param self: + :param val: + """ + self.element.accesskey = val + + def _getId(self): + """ + Specifies a unique id for an element + :param self: + :returns: + """ + return self.element.id + + def _setId(self, val): + """ + Specifies a unique id for an element + :param self: + :param val: + """ + self.element.id = val + + def _getClass(self): + """ + The class attribute specifies one or more classnames for an element. + :returns: + """ + return self._widgetClassWrapper + + def _setClass(self, value): + """ + The class attribute specifies one or more classnames for an element. 
+ :param self: + :param value: + @raise ValueError: + """ + + if value is None: + self.element.setAttribute("class", " ") + elif isinstance(value, str): + self.element.setAttribute("class", value) + elif isinstance(value, list): + self.element.setAttribute("class", " ".join(value)) + else: + raise ValueError("Class must be a str, a List or None") + + def _getStyle(self): + """ + The style attribute specifies an inline style for an element. + :param self: + :returns: + """ + return _WidgetStyleWrapper(self) + + def _getRole(self): + """ + Specifies a role for an element + @param self: + @return: + """ + return self.element.getAttribute("role") + + def _setRole(self, val): + """ + Specifies a role for an element + @param self: + @param val: + """ + self.element.setAttribute("role", val) + + def hide(self): + """ + Hide element, if shown. + :return: + """ + state = self["style"].get("display", "") + + if state != "none": + self._lastDisplayState = state + self["style"]["display"] = "none" + + def show(self): + """ + Show element, if hidden. + :return: + """ + if self._lastDisplayState is not None: + self["style"]["display"] = self._lastDisplayState + self._lastDisplayState = None + + def isHidden(self): + """ + Checks if a widget is hidden. + :return: True if hidden, False otherwise. + """ + return self["style"].get("display", "") == "none" + + def isVisible(self): + """ + Checks if a widget is visible. + :return: True if visible, False otherwise. + """ + return not self.isHidden() + + def onBind(self, widget, name): + """ + Event function that is called on the widget when it is bound to another widget with a name. + This is only done by the HTML parser, a manual binding by the user is not triggered. 
+ """ + return + + def onAttach(self): + self._isAttached = True + + for c in self._children: + c.onAttach() + + def onDetach(self): + self._isAttached = False + for c in self._children: + c.onDetach() + + def __collectChildren(self, *args, **kwargs): + assert not isinstance(self, _isVoid), "<%s> can't have children!" % self._tagName + + if kwargs.get("bindTo") is None: + kwargs["bindTo"] = self + + widgets = [] + for arg in args: + if isinstance(arg, (str, HtmlAst)): + widgets.extend(fromHTML(arg, **kwargs)) + + elif isinstance(arg, (list, tuple)): + for subarg in arg: + widgets.extend(self.__collectChildren(subarg, **kwargs)) + + elif not isinstance(arg, (Widget, TextNode)): + widgets.append(TextNode(str(arg))) + + else: + widgets.append(arg) + + return widgets + + def insertBefore(self, insert, child, **kwargs): + if not child: + return self.appendChild(insert) + + assert child in self._children, "{} is not a child of {}".format(child, self) + + toInsert = self.__collectChildren(insert, **kwargs) + + for insert in toInsert: + if insert._parent: + insert._parent.removeChild(insert) + + self.element.insertBefore(insert.element, child.element) + self._children.insert(self._children.index(child), insert) + + insert._parent = self + if self._isAttached: + insert.onAttach() + + return toInsert + + def prependChild(self, *args, **kwargs): + if kwargs.get("replace", False): + self.removeAllChildren() + del kwargs["replace"] + + toPrepend = self.__collectChildren(*args, **kwargs) + + for child in toPrepend: + if child._parent: + child._parent._children.remove(child) + child._parent = None + + if not self._children: + self.appendChild(child) + else: + self.insertBefore(child, self.children(0)) + + return toPrepend + + def appendChild(self, *args, **kwargs): + if kwargs.get("replace", False): + self.removeAllChildren() + del kwargs["replace"] + + toAppend = self.__collectChildren(*args, **kwargs) + + for child in toAppend: + if child._parent: + 
child._parent._children.remove(child) + + self._children.append(child) + self.element.appendChild(child.element) + child._parent = self + + if self._isAttached: + child.onAttach() + + return toAppend + + def removeChild(self, child): + assert child in self._children, "{} is not a child of {}".format(child, self) + + if child._isAttached: + child.onDetach() + + self.element.removeChild(child.element) + self._children.remove(child) + child._parent = None + + def removeAllChildren(self): + """ + Removes all child widgets of the current widget. + """ + for child in self._children[:]: + self.removeChild(child) + + def isParentOf(self, widget): + """ + Checks if an object is the parent of widget. + + :type widget: Widget + :param widget: The widget to check for. + :return: True, if widget is a child of the object, else False. + """ + + # You cannot be your own child! + if self == widget: + return False + + for child in self._children: + if child == widget: + return True + + if child.isParentOf(widget): + return True + + return False + + def isChildOf(self, widget): + """ + Checks if an object is the child of widget. + + :type widget: Widget + :param widget: The widget to check for. + :return: True, if object is a child of widget, else False. + """ + + # You cannot be your own parent! + if self == widget: + return False + + parent = self.parent() + while parent: + if parent == widget: + return True + + parent = widget.parent() + + return False + + def hasClass(self, className): + """ + Determine whether the current widget is assigned the given class + + :param className: The class name to search for. + :type className: str + """ + + if isinstance(className, str) or isinstance(className, unicode): + return className in self["class"] + else: + raise TypeError() + + def addClass(self, *args): + """ + Adds a class or a list of classes to the current widget. + If the widget already has the class, it is ignored. + + :param args: A list of class names. This can also be a list. 
+ :type args: list of str | list of list of str + """ + + for item in args: + if isinstance(item, list): + self.addClass(*item) + + elif isinstance(item, str): + for sitem in item.split(" "): + if not self.hasClass(sitem): + self["class"].append(sitem) + else: + raise TypeError() + + def removeClass(self, *args): + """ + Removes a class or a list of classes from the current widget. + + :param args: A list of class names. This can also be a list. + :type args: list of str | list of list of str + """ + + for item in args: + if isinstance(item, list): + self.removeClass(item) + + elif isinstance(item, str): + for sitem in item.split(" "): + if self.hasClass(sitem): + self["class"].remove(sitem) + else: + raise TypeError() + + def toggleClass(self, on, off=None): + """ + Toggles the class ``on``. + + If the widget contains a class ``on``, it is toggled by ``off``. + ``off`` can either be a class name that is substituted, or nothing. + + :param on: Classname to test for. If ``on`` does not exist, but ``off``, ``off`` is replaced by ``on``. + :type on: str + + :param off: Classname to replace if ``on`` existed. + :type off: str + + :return: Returns True, if ``on`` was switched, else False. 
+ :rtype: bool + """ + if self.hasClass(on): + self.removeClass(on) + + if off and not self.hasClass(off): + self.addClass(off) + + return False + + if off and self.hasClass(off): + self.removeClass(off) + + self.addClass(on) + return True + + def onBlur(self, event): + pass + + def onChange(self, event): + pass + + def onContextMenu(self, event): + pass + + def onFocus(self, event): + pass + + def onFocusIn(self, event): + pass + + def onFocusOut(self, event): + pass + + def onFormChange(self, event): + pass + + def onFormInput(self, event): + pass + + def onInput(self, event): + pass + + def onInvalid(self, event): + pass + + def onReset(self, event): + pass + + def onSelect(self, event): + pass + + def onSubmit(self, event): + pass + + def onKeyDown(self, event): + pass + + def onKeyPress(self, event): + pass + + def onKeyUp(self, event): + pass + + def onClick(self, event): + pass + + def onDblClick(self, event): + pass + + def onDrag(self, event): + pass + + def onDragEnd(self, event): + pass + + def onDragEnter(self, event): + pass + + def onDragLeave(self, event): + pass + + def onDragOver(self, event): + pass + + def onDragStart(self, event): + pass + + def onDrop(self, event): + pass + + def onMouseDown(self, event): + pass + + def onMouseMove(self, event): + pass + + def onMouseOut(self, event): + pass + + def onMouseOver(self, event): + pass + + def onMouseUp(self, event): + pass + + def onMouseWheel(self, event): + pass + + def onScroll(self, event): + pass + + def onTouchStart(self, event): + pass + + def onTouchEnd(self, event): + pass + + def onTouchMove(self, event): + pass + + def onTouchCancel(self, event): + pass + + def focus(self): + self.element.focus() + + def blur(self): + self.element.blur() + + def parent(self): + return self._parent + + def children(self, n=None): + """ + Access children of widget. + + If ``n`` is ommitted, it returns a list of all child-widgets; + Else, it returns the N'th child, or None if its out of bounds. 
+ + :param n: Optional offset of child widget to return. + :type n: int + + :return: Returns all children or only the requested one. + :rtype: list | Widget | None + """ + if n is None: + return self._children[:] + + try: + return self._children[n] + except IndexError: + return None + + def sortChildren(self, key): + """ + Sorts our direct children. They are rearranged on DOM level. + Key must be a function accepting one widget as parameter and must return + the key used to sort these widgets. + """ + self._children.sort(key=key) + tmpl = self._children[:] + tmpl.reverse() + for c in tmpl: + self.element.removeChild(c.element) + self.element.insertBefore(c.element, self.element.children.item(0)) + + def fromHTML(self, html, appendTo=None, bindTo=None, replace=False, vars=None, **kwargs): + """ + Parses html and constructs its elements as part of self. + + :param html: HTML code. + :param appendTo: The entity where the HTML code is constructed below. This defaults to self in usual case. + :param bindTo: The entity where the named objects are bound to. This defaults to self in usual case. + :param replace: Clear entire content of appendTo before appending. + :param vars: Deprecated; Same as kwargs. + :param **kwargs: Additional variables provided as a dict for {{placeholders}} inside the HTML + + :return: + """ + if appendTo is None: + appendTo = self + + if bindTo is None: + bindTo = self + + if replace: + appendTo.removeAllChildren() + + # use of vars is deprecated! 
+ if isinstance(vars, dict): + kwargs.update(vars) + + return fromHTML(html, appendTo=appendTo, bindTo=bindTo, **kwargs) + + +######################################################################################################################## +# Attribute Collectors +######################################################################################################################## + +# _attrLabel --------------------------------------------------------------------------------------------------------------- + +class _attrLabel(object): + def _getLabel(self): + return self.element.getAttribute("label") + + def _setLabel(self, val): + self.element.setAttribute("label", val) + + +# _attrCharset -------------------------------------------------------------------------------------------------------------- + +class _attrCharset(object): + def _getCharset(self): + return self.element._attrCharset + + def _setCharset(self, val): + self.element._attrCharset = val + + +# _attrCite ----------------------------------------------------------------------------------------------------------------- + +class _attrCite(object): + def _getCite(self): + return self.element._attrCite + + def _setCite(self, val): + self.element._attrCite = val + + +class _attrDatetime(object): + def _getDatetime(self): + return self.element.datetime + + def _setDatetime(self, val): + self.element.datetime = val + + +# Form ----------------------------------------------------------------------------------------------------------------- + +class _attrForm(object): + def _getForm(self): + return self.element.form + + def _setForm(self, val): + self.element.form = val + + +class _attrAlt(object): + def _getAlt(self): + return self.element.alt + + def _setAlt(self, val): + self.element.alt = val + + +class _attrAutofocus(object): + def _getAutofocus(self): + return True if self.element.hasAttribute("autofocus") else False + + def _setAutofocus(self, val): + if val: + 
self.element.setAttribute("autofocus", "") + else: + self.element.removeAttribute("autofocus") + + +class _attrDisabled(object): + pass + + +class _attrChecked(object): + def _getChecked(self): + return self.element.checked + + def _setChecked(self, val): + self.element.checked = val + + +class _attrIndeterminate(object): + def _getIndeterminate(self): + return self.element.indeterminate + + def _setIndeterminate(self, val): + self.element.indeterminate = val + + +class _attrName(object): + def _getName(self): + return self.element.getAttribute("name") + + def _setName(self, val): + self.element.setAttribute("name", val) + + +class _attrValue(object): + def _getValue(self): + return self.element.value + + def _setValue(self, val): + self.element.value = val + + +class _attrAutocomplete(object): + def _getAutocomplete(self): + return True if self.element.autocomplete == "on" else False + + def _setAutocomplete(self, val): + self.element.autocomplete = "on" if val == True else "off" + + +class _attrRequired(object): + def _getRequired(self): + return True if self.element.hasAttribute("required") else False + + def _setRequired(self, val): + if val: + self.element.setAttribute("required", "") + else: + self.element.removeAttribute("required") + + +class _attrMultiple(object): + def _getMultiple(self): + return True if self.element.hasAttribute("multiple") else False + + def _setMultiple(self, val): + if val: + self.element.setAttribute("multiple", "") + else: + self.element.removeAttribute("multiple") + + +class _attrSize(object): + def _getSize(self): + return self.element.size + + def _setSize(self, val): + self.element.size = val + + +class _attrFor(object): + def _getFor(self): + return self.element.getAttribute("for") + + def _setFor(self, val): + self.element.setAttribute("for", val) + + +class _attrInputs(_attrRequired): + def _getMaxlength(self): + return self.element.maxlength + + def _setMaxlength(self, val): + self.element.maxlength = val + + def 
_getPlaceholder(self): + return self.element.placeholder + + def _setPlaceholder(self, val): + self.element.placeholder = val + + def _getReadonly(self): + return True if self.element.hasAttribute("readonly") else False + + def _setReadonly(self, val): + if val: + self.element.setAttribute("readonly", "") + else: + self.element.removeAttribute("readonly") + + +class _attrFormhead(object): + def _getFormaction(self): + return self.element.formaction + + def _setFormaction(self, val): + self.element.formaction = val + + def _getFormenctype(self): + return self.element.formenctype + + def _setFormenctype(self, val): + self.element.formenctype = val + + def _getFormmethod(self): + return self.element.formmethod + + def _setFormmethod(self, val): + self.element.formmethod = val + + def _getFormtarget(self): + return self.element.formtarget + + def _setFormtarget(self, val): + self.element.formtarget = val + + def _getFormnovalidate(self): + return True if self.element.hasAttribute("formnovalidate") else False + + def _setFormnovalidate(self, val): + if val: + self.element.setAttribute("formnovalidate", "") + else: + self.element.removeAttribute("formnovalidate") + + +# _attrHref ----------------------------------------------------------------------------------------------------------------- + +class _attrHref(object): + def _getHref(self): + """ + Url of a Page + :param self: + """ + return self.element.href + + def _setHref(self, val): + """ + Url of a Page + :param val: URL + """ + self.element.href = val + + def _getHreflang(self): + return self.element.hreflang + + def _setHreflang(self, val): + self.element.hreflang = val + + +class _attrTarget(object): + def _getTarget(self): + return self.element.target + + def _setTarget(self, val): + self.element.target = val + + +# _attrMedia ---------------------------------------------------------------------------------------------------------------- + +class _attrType(object): + def _getType(self): + return 
self.element.type + + def _setType(self, val): + self.element.type = val + + +class _attrMedia(_attrType): + def _getMedia(self): + return self.element.media + + def _setMedia(self, val): + self.element.media = val + + +class _attrDimensions(object): + def _getWidth(self): + return self.element.width + + def _setWidth(self, val): + self.element.width = val + + def _getHeight(self): + return self.element.height + + def _setHeight(self, val): + self.element.height = val + + +class _attrUsemap(object): + def _getUsemap(self): + return self.element.usemap + + def _setUsemap(self, val): + self.element.usemap = val + + +class _attrMultimedia(object): + def _getAutoplay(self): + return True if self.element.hasAttribute("autoplay") else False + + def _setAutoplay(self, val): + if val: + self.element.setAttribute("autoplay", "") + else: + self.element.removeAttribute("autoplay") + + def _getPlaysinline(self): + return True if self.element.hasAttribute("playsinline") else False + + def _setPlaysinline(self, val): + if val: + self.element.setAttribute("playsinline", "") + else: + self.element.removeAttribute("playsinline") + + def _getControls(self): + return True if self.element.hasAttribute("controls") else False + + def _setControls(self, val): + if val: + self.element.setAttribute("controls", "") + else: + self.element.removeAttribute("controls") + + def _getLoop(self): + return True if self.element.hasAttribute("loop") else False + + def _setLoop(self, val): + if val: + self.element.setAttribute("loop", "") + else: + self.element.removeAttribute("loop") + + def _getMuted(self): + return True if self.element.hasAttribute("muted") else False + + def _setMuted(self, val): + if val: + self.element.setAttribute("muted", "") + else: + self.element.removeAttribute("muted") + + def _getPreload(self): + return self.element.preload + + def _setPreload(self, val): + self.element.preload = val + + +# _attrRel 
------------------------------------------------------------------------------------------------------------------ + +class _attrRel(object): + def _getRel(self): + return self.element.rel + + def _setRel(self, val): + self.element.rel = val + + +# _attrSrc ------------------------------------------------------------------------------------------------------------------ + +class _attrSrc(object): + def _getSrc(self): + return self.element.src + + def _setSrc(self, val): + self.element.src = val + + +# Svg ------------------------------------------------------------------------------------------------------------------ + +class _attrSvgViewBox(object): + def _getViewbox(self): + viewBox = self.element.viewBox + try: + return " ".join([str(x) for x in [viewBox.baseVal.x, viewBox.baseVal.y, viewBox.baseVal.width, viewBox.baseVal.height]]) + except: + return "" + + def _setViewbox(self, val): + self.element.setAttribute("viewBox", val) + + def _getPreserveaspectratio(self): + return self.element.preserveAspectRatio + + def _setPreserveaspectratio(self, val): + self.element.setAttribute("preserveAspectRatio", val) + + +class _attrSvgDimensions(object): + def _getWidth(self): + return self.element.width + + def _setWidth(self, val): + self.element.setAttribute("width", val) + + def _getHeight(self): + return self.element.height + + def _setHeight(self, val): + self.element.setAttribute("height", val) + + def _getX(self): + return self.element.x + + def _setX(self, val): + self.element.setAttribute("x", val) + + def _getY(self): + return self.element.y + + def _setY(self, val): + self.element.setAttribute("y", val) + + def _getR(self): + return self.element.r + + def _setR(self, val): + self.element.setAttribute("r", val) + + def _getRx(self): + return self.element.rx + + def _setRx(self, val): + self.element.setAttribute("rx", val) + + def _getRy(self): + return self.element.ry + + def _setRy(self, val): + self.element.setAttribute("ry", val) + + def _getCx(self): + 
return self.element.cx + + def _setCx(self, val): + self.element.setAttribute("cx", val) + + def _getCy(self): + return self.element.cy + + def _setCy(self, val): + self.element.setAttribute("cy", val) + + +class _attrSvgPoints(object): + def _getPoints(self): + return self.element.points + + def _setPoints(self, val): + self.element.setAttribute("points", val) + + def _getX1(self): + return self.element.x1 + + def _setX1(self, val): + self.element.setAttribute("x1", val) + + def _getY1(self): + return self.element.y1 + + def _setY1(self, val): + self.element.setAttribute("y1", val) + + def _getX2(self): + return self.element.x2 + + def _setX2(self, val): + self.element.setAttribute("x2", val) + + def _getY2(self): + return self.element.y2 + + def _setY2(self, val): + self.element.setAttribute("y2", val) + + +class _attrSvgTransform(object): + def _getTransform(self): + return self.element.transform + + def _setTransform(self, val): + self.element.setAttribute("transform", val) + + +class _attrSvgXlink(object): + def _getXlinkhref(self): + return self.element.getAttribute("xlink:href") + + def _setXlinkhref(self, val): + self.element.setAttribute("xlink:href", val) + + +class _attrSvgStyles(object): + def _getFill(self): + return self.element.fill + + def _setFill(self, val): + self.element.setAttribute("fill", val) + + def _getStroke(self): + return self.element.stroke + + def _setStroke(self, val): + self.element.setAttribute("stroke", val) + + +class _isVoid(object): + pass + + +######################################################################################################################## +# HTML Elements +######################################################################################################################## + +# A -------------------------------------------------------------------------------------------------------------------- + +class A(Widget, _attrHref, _attrTarget, _attrMedia, _attrRel, _attrName): + _tagName = "a" + + def 
_getDownload(self): + """ + The download attribute specifies the path to a download + :returns: filename + """ + return self.element.download + + def _setDownload(self, val): + """ + The download attribute specifies the path to a download + :param val: filename + """ + self.element.download = val + + +# Area ----------------------------------------------------------------------------------------------------------------- + +class Area(A, _attrAlt, _isVoid): + _tagName = "area" + + def _getCoords(self): + return self.element.coords + + def _setCoords(self, val): + self.element.coords = val + + def _getShape(self): + return self.element.shape + + def _setShape(self, val): + self.element.shape = val + + +# Audio ---------------------------------------------------------------------------------------------------------------- + +class Audio(Widget, _attrSrc, _attrMultimedia): + _tagName = "audio" + +class Bdo(Widget): + _tagName = "bdo" + + +# Blockquote ----------------------------------------------------------------------------------------------------------- + +class Blockquote(Widget): + _tagName = "blockquote" + + def _getBlockquote(self): + return self.element.blockquote + + def _setBlockquote(self, val): + self.element.blockquote = val + + +# Body ----------------------------------------------------------------------------------------------------------------- + +class BodyCls(Widget): + + def __init__(self, *args, **kwargs): + super().__init__(_wrapElem=domGetElementsByTagName("body")[0], *args, **kwargs) + self._isAttached = True + + +_body = None + + +def Body(): + global _body + + if _body is None: + _body = BodyCls() + + return _body + + +# Canvas --------------------------------------------------------------------------------------------------------------- + +class Canvas(Widget, _attrDimensions): + _tagName = "canvas" + + +# Command -------------------------------------------------------------------------------------------------------------- + +class 
Command(Widget, _attrLabel, _attrType, _attrDisabled, _attrChecked): + _tagName = "command" + + def _getIcon(self): + return self.element.icon + + def _setIcon(self, val): + self.element.icon = val + + def _getRadiogroup(self): + return self.element.radiogroup + + def _setRadiogroup(self, val): + self.element.radiogroup = val + + +# _Del ----------------------------------------------------------------------------------------------------------------- + +class _Del(Widget, _attrCite, _attrDatetime): + _tagName = "_del" + + +# Dialog -------------------------------------------------------------------------------------------------------------- + +class Dialog(Widget): + _tagName = "dialog" + + def _getOpen(self): + return True if self.element.hasAttribute("open") else False + + def _setOpen(self, val): + if val: + self.element.setAttribute("open", "") + else: + self.element.removeAttribute("open") + +# Elements ------------------------------------------------------------------------------------------------------------- + +class Abbr(Widget): + _tagName = "abbr" + + +class Address(Widget): + _tagName = "address" + + +class Article(Widget): + _tagName = "article" + + +class Aside(Widget): + _tagName = "aside" + + +class B(Widget): + _tagName = "b" + + +class Bdi(Widget): + _tagName = "bdi" + + +class Br(Widget, _isVoid): + _tagName = "br" + + +class Caption(Widget): + _tagName = "caption" + + +class Cite(Widget): + _tagName = "cite" + + +class Code(Widget): + _tagName = "code" + + +class Datalist(Widget): + _tagName = "datalist" + + +class Dfn(Widget): + _tagName = "dfn" + + +class Div(Widget): + _tagName = "div" + + +class Em(Widget): + _tagName = "em" + + +class Embed(Widget, _attrSrc, _attrType, _attrDimensions, _isVoid): + _tagName = "embed" + + +class Figcaption(Widget): + _tagName = "figcaption" + + +class Figure(Widget): + _tagName = "figure" + + +class Footer(Widget): + _tagName = "footer" + + +class Header(Widget): + _tagName = "header" + + +class H1(Widget): + 
_tagName = "h1" + + +class H2(Widget): + _tagName = "h2" + + +class H3(Widget): + _tagName = "h3" + + +class H4(Widget): + _tagName = "h4" + + +class H5(Widget): + _tagName = "h5" + + +class H6(Widget): + _tagName = "h6" + + +class Hr(Widget, _isVoid): + _tagName = "hr" + + +class I(Widget): + _tagName = "i" + + +class Kdb(Widget): + _tagName = "kdb" + + +class Legend(Widget): + _tagName = "legend" + + +class Mark(Widget): + _tagName = "mark" + + +class Noscript(Widget): + _tagName = "noscript" + + +class P(Widget): + _tagName = "p" + + +class Rq(Widget): + _tagName = "rq" + + +class Rt(Widget): + _tagName = "rt" + + +class Ruby(Widget): + _tagName = "ruby" + + +class S(Widget): + _tagName = "s" + + +class Samp(Widget): + _tagName = "samp" + + +class Section(Widget): + _tagName = "section" + + +class Small(Widget): + _tagName = "small" + + +class Strong(Widget): + _tagName = "strong" + + +class Sub(Widget): + _tagName = "sub" + + +class Summery(Widget): + _tagName = "summery" + + +class Sup(Widget): + _tagName = "sup" + + +class U(Widget): + _tagName = "u" + + +class Var(Widget): + _tagName = "var" + + +class Wbr(Widget): + _tagName = "wbr" + + +# Form ----------------------------------------------------------------------------------------------------------------- + +class Button(Widget, _attrDisabled, _attrType, _attrForm, _attrAutofocus, _attrName, _attrValue, _attrFormhead): + _tagName = "button" + + +class Fieldset(Widget, _attrDisabled, _attrForm, _attrName): + _tagName = "fieldset" + + +class Form(Widget, _attrDisabled, _attrName, _attrTarget, _attrAutocomplete): + _tagName = "form" + + def _getNovalidate(self): + return True if self.element.hasAttribute("novalidate") else False + + def _setNovalidate(self, val): + if val: + self.element.setAttribute("novalidate", "") + else: + self.element.removeAttribute("novalidate") + + def _getAction(self): + return self.element.action + + def _setAction(self, val): + self.element.action = val + + def _getMethod(self): + 
return self.element.method + + def _setMethod(self, val): + self.element.method = val + + def _getEnctype(self): + return self.element.enctype + + def _setEnctype(self, val): + self.element.enctype = val + + def _getAccept_attrCharset(self): + return getattr(self.element, "accept-charset") + + def _setAccept_attrCharset(self, val): + self.element.setAttribute("accept-charset", val) + + +class Input(Widget, _attrDisabled, _attrType, _attrForm, _attrAlt, _attrAutofocus, _attrChecked, + _attrIndeterminate, _attrName, _attrDimensions, _attrValue, _attrFormhead, + _attrAutocomplete, _attrInputs, _attrMultiple, _attrSize, _attrSrc, _isVoid): + _tagName = "input" + + def _getAccept(self): + return self.element.accept + + def _setAccept(self, val): + self.element.accept = val + + def _getList(self): + return self.element.list + + def _setList(self, val): + self.element.list = val + + def _getMax(self): + return self.element.max + + def _setMax(self, val): + self.element.max = val + + def _getMin(self): + return self.element.min + + def _setMin(self, val): + self.element.min = val + + def _getPattern(self): + return self.element.pattern + + def _setPattern(self, val): + self.element.pattern = val + + def _getStep(self): + return self.element.step + + def _setStep(self, val): + self.element.step = val + + +class Label(Widget, _attrForm, _attrFor): + _tagName = "label" + autoIdCounter = 0 + + def __init__(self, *args, forElem=None, **kwargs): + super().__init__(*args, **kwargs) + + if forElem: + if not forElem["id"]: + idx = Label.autoIdCounter + Label.autoIdCounter += 1 + forElem["id"] = "label-autoid-for-{}".format(idx) + + self["for"] = forElem["id"] + + +class Optgroup(Widget, _attrDisabled, _attrLabel): + _tagName = "optgroup" + + +class Option(Widget, _attrDisabled, _attrLabel, _attrValue): + _tagName = "option" + + def _getSelected(self): + return True if self.element.selected else False + + def _setSelected(self, val): + if val: + self.element.selected = True + else: 
+ self.element.selected = False + + +class Output(Widget, _attrForm, _attrName, _attrFor): + _tagName = "output" + + +class Select(Widget, _attrDisabled, _attrForm, _attrAutofocus, _attrName, _attrRequired, _attrMultiple, _attrSize): + _tagName = "select" + + def _getSelectedIndex(self): + return self.element.selectedIndex + + def _getOptions(self): + return self.element.options + + +class Textarea(Widget, _attrDisabled, _attrForm, _attrAutofocus, _attrName, _attrInputs, _attrValue): + _tagName = "textarea" + + def _getCols(self): + return self.element.cols + + def _setCols(self, val): + self.element.cols = val + + def _getRows(self): + return self.element.rows + + def _setRows(self, val): + self.element.rows = val + + def _getWrap(self): + return self.element.wrap + + def _setWrap(self, val): + self.element.wrap = val + + +# Head ----------------------------------------------------------------------------------------------------------------- + +class HeadCls(Widget): + + def __init__(self, *args, **kwargs): + super().__init__(_wrapElem=domGetElementsByTagName("head")[0], *args, **kwargs) + self._isAttached = True + + +_head = None + + +def Head(): + global _head + if _head is None: + _head = HeadCls() + return _head + + +# Iframe --------------------------------------------------------------------------------------------------------------- + +class Iframe(Widget, _attrSrc, _attrName, _attrDimensions): + _tagName = "iframe" + + def _getSandbox(self): + return self.element.sandbox + + def _setSandbox(self, val): + self.element.sandbox = val + + def _getSrcdoc(self): + return self.element.src + + def _setSrcdoc(self, val): + self.element.src = val + + def _getSeamless(self): + return True if self.element.hasAttribute("seamless") else False + + def _setSeamless(self, val): + if val: + self.element.setAttribute("seamless", "") + else: + self.element.removeAttribute("seamless") + + +# Img 
------------------------------------------------------------------------------------------------------------------ + +class Img(Widget, _attrSrc, _attrDimensions, _attrUsemap, _attrAlt, _isVoid): + _tagName = "img" + + def __init__(self, src=None, *args, **kwargs): + super().__init__() + if src: + self["src"] = src + + def _getCrossorigin(self): + return self.element.crossorigin + + def _setCrossorigin(self, val): + self.element.crossorigin = val + + def _getIsmap(self): + return self.element.ismap + + def _setIsmap(self, val): + self.element.ismap = val + + +# Ins ------------------------------------------------------------------------------------------------------------------ + +class Ins(Widget, _attrCite, _attrDatetime): + _tagName = "ins" + + +# Keygen --------------------------------------------------------------------------------------------------------------- + +class Keygen(Form, _attrAutofocus, _attrDisabled): + _tagName = "keygen" + + def _getChallenge(self): + return True if self.element.hasAttribute("challenge") else False + + def _setChallenge(self, val): + if val: + self.element.setAttribute("challenge", "") + else: + self.element.removeAttribute("challenge") + + def _getKeytype(self): + return self.element.keytype + + def _setKeytype(self, val): + self.element.keytype = val + + +# Link ----------------------------------------------------------------------------------------------------------------- + +class Link(Widget, _attrHref, _attrMedia, _attrRel, _isVoid): + _tagName = "link" + + def _getSizes(self): + return self.element.sizes + + def _setSizes(self, val): + self.element.sizes = val + + +# List ----------------------------------------------------------------------------------------------------------------- + +class Ul(Widget): + _tagName = "ul" + + +class Ol(Widget): + _tagName = "ol" + + +class Li(Widget): + _tagName = "li" + + +class Dl(Widget): + _tagName = "dl" + + +class Dt(Widget): + _tagName = "dt" + + +class Dd(Widget): + _tagName = 
"dd" + + +# Map ------------------------------------------------------------------------------------------------------------------ + +class Map(Label, _attrType): + _tagName = "map" + + +# Menu ----------------------------------------------------------------------------------------------------------------- + +class Menu(Widget): + _tagName = "menu" + + +# Meta ----------------------------------------------------------------------------------------------------------------- + +class Meta(Widget, _attrName, _attrCharset, _isVoid): + _tagName = "meta" + + def _getContent(self): + return self.element.content + + def _setContent(self, val): + self.element.content = val + + +# Meter ---------------------------------------------------------------------------------------------------------------- + +class Meter(Form, _attrValue): + _tagName = "meter" + + def _getHigh(self): + return self.element.high + + def _setHigh(self, val): + self.element.high = val + + def _getLow(self): + return self.element.low + + def _setLow(self, val): + self.element.low = val + + def _getMax(self): + return self.element.max + + def _setMax(self, val): + self.element.max = val + + def _getMin(self): + return self.element.min + + def _setMin(self, val): + self.element.min = val + + def _getOptimum(self): + return self.element.optimum + + def _setOptimum(self, val): + self.element.optimum = val + + +# Nav ------------------------------------------------------------------------------------------------------------------ + +class Nav(Widget): + _tagName = "nav" + + +# Object ----------------------------------------------------------------------------------------------------------------- + +class Object(Form, _attrType, _attrName, _attrDimensions, _attrUsemap): + _tagName = "object" + + +# Param ----------------------------------------------------------------------------------------------------------------- + +class Param(Widget, _attrName, _attrValue, _isVoid): + _tagName = "param" + + +# Progress 
------------------------------------------------------------------------------------------------------------- + +class Progress(Widget, _attrValue): + _tagName = "progress" + + def _getMax(self): + return self.element.max + + def _setMax(self, val): + self.element.max = val + + +# Q -------------------------------------------------------------------------------------------------------------------- + +class Q(Widget, _attrCite): + _tagName = "q" + + +# Script ---------------------------------------------------------------------------------------------------------------- + +class Script(Widget, _attrSrc, _attrCharset): + _tagName = "script" + + def _getAsync(self): + return True if self.element.hasAttribute("async") else False + + def _setAsync(self, val): + if val: + self.element.setAttribute("async", "") + else: + self.element.removeAttribute("async") + + def _getDefer(self): + return True if self.element.hasAttribute("defer") else False + + def _setDefer(self, val): + if val: + self.element.setAttribute("defer", "") + else: + self.element.removeAttribute("defer") + + +# Source --------------------------------------------------------------------------------------------------------------- + +class Source(Widget, _attrMedia, _attrSrc, _isVoid): + _tagName = "source" + + +# Span ----------------------------------------------------------------------------------------------------------------- + +class Span(Widget): + _tagName = "span" + + +# Style ---------------------------------------------------------------------------------------------------------------- + +class Style(Widget, _attrMedia): + _tagName = "style" + + def _getScoped(self): + return True if self.element.hasAttribute("scoped") else False + + def _setScoped(self, val): + if val: + self.element.setAttribute("scoped", "") + else: + self.element.removeAttribute("scoped") + + +# SVG ------------------------------------------------------------------------------------------------------------------ + +class 
Svg(Widget, _attrSvgViewBox, _attrSvgDimensions, _attrSvgTransform): + _tagName = "svg" + _namespace = "SVG" + + def _getVersion(self): + return self.element.version + + def _setVersion(self, val): + self.element.setAttribute("version", val) + + def _getXmlns(self): + return self.element.xmlns + + def _setXmlns(self, val): + self.element.setAttribute("xmlns", val) + + +class SvgCircle(Widget, _attrSvgTransform, _attrSvgDimensions): + _tagName = "circle" + _namespace = "SVG" + + +class SvgEllipse(Widget, _attrSvgTransform, _attrSvgDimensions): + _tagName = "ellipse" + _namespace = "SVG" + + +class SvgG(Widget, _attrSvgTransform, _attrSvgStyles): + _tagName = "g" + _namespace = "SVG" + + def _getSvgTransform(self): + return self.element.transform + + def _setSvgTransform(self, val): + self.element.setAttribute("transform", val) + + +class SvgImage(Widget, _attrSvgViewBox, _attrSvgDimensions, _attrSvgTransform, _attrSvgXlink): + _tagName = "image" + _namespace = "SVG" + + +class SvgLine(Widget, _attrSvgTransform, _attrSvgPoints): + _tagName = "line" + _namespace = "SVG" + + +class SvgPath(Widget, _attrSvgTransform): + _tagName = "path" + _namespace = "SVG" + + def _getD(self): + return self.element.d + + def _setD(self, val): + self.element.setAttribute("d", val) + + def _getPathLength(self): + return self.element.pathLength + + def _setPathLength(self, val): + self.element.setAttribute("pathLength", val) + + +class SvgPolygon(Widget, _attrSvgTransform, _attrSvgPoints): + _tagName = "polygon" + _namespace = "SVG" + + +class SvgPolyline(Widget, _attrSvgTransform, _attrSvgPoints): + _tagName = "polyline" + _namespace = "SVG" + + +class SvgRect(Widget, _attrSvgDimensions, _attrSvgTransform, _attrSvgStyles): + _tagName = "rect" + _namespace = "SVG" + + +class SvgText(Widget, _attrSvgDimensions, _attrSvgTransform, _attrSvgStyles): + _tagName = "text" + _namespace = "SVG" + + +# Table 
---------------------------------------------------------------------------------------------------------------- + + +class Tr(Widget): + _tagName = "tr" + + def _getRowspan(self): + span = self.element.getAttribute("rowspan") + return span if span else 1 + + def _setRowspan(self, span): + assert span >= 1, "span may not be negative" + self.element.setAttribute("rowspan", span) + return self + + +class Td(Widget): + _tagName = "td" + + def _getColspan(self): + span = self.element.getAttribute("colspan") + return span if span else 1 + + def _setColspan(self, span): + assert span >= 1, "span may not be negative" + self.element.setAttribute("colspan", span) + return self + + def _getRowspan(self): + span = self.element.getAttribute("rowspan") + return span if span else 1 + + def _setRowspan(self, span): + assert span >= 1, "span may not be negative" + self.element.setAttribute("rowspan", span) + return self + + +class Th(Td): + _tagName = "th" + + +class Thead(Widget): + _tagName = "thead" + + +class Tbody(Widget): + _tagName = "tbody" + + +class ColWrapper(object): + def __init__(self, parentElem, *args, **kwargs): + super().__init__(*args, **kwargs) + self.parentElem = parentElem + + def __getitem__(self, item): + assert isinstance(item, int), "Invalid col-number. 
Expected int, got {}".format(str(type(item))) + if item < 0 or item > len(self.parentElem._children): + return None + + return self.parentElem._children[item] + + def __setitem__(self, key, value): + col = self[key] + assert col is not None, "Cannot assign widget to invalid column" + + col.removeAllChildren() + + if isinstance(value, list) or isinstance(value, tuple): + for el in value: + if isinstance(el, Widget) or isinstance(el, TextNode): + col.appendChild(value) + + elif isinstance(value, Widget) or isinstance(value, TextNode): + col.appendChild(value) + + +class RowWrapper(object): + def __init__(self, parentElem, *args, **kwargs): + super().__init__(*args, **kwargs) + self.parentElem = parentElem + + def __getitem__(self, item): + assert isinstance(item, int), "Invalid row-number. Expected int, got {}".format(str(type(item))) + if item < 0 or item > len(self.parentElem._children): + return None + + return ColWrapper(self.parentElem._children[item]) + + +class Table(Widget): + _tagName = "table" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.head = Thead() + self.body = Tbody() + self.appendChild(self.head) + self.appendChild(self.body) + + def prepareRow(self, row): + assert row >= 0, "Cannot create rows with negative index" + + for child in self.body._children: + row -= child["rowspan"] + if row < 0: + return + + while row >= 0: + self.body.appendChild(Tr()) + row -= 1 + + def prepareCol(self, row, col): + assert col >= 0, "Cannot create cols with negative index" + self.prepareRow(row) + + for rowChild in self.body._children: + row -= rowChild["rowspan"] + + if row < 0: + for colChild in rowChild._children: + col -= colChild["colspan"] + if col < 0: + return + + while col >= 0: + rowChild.appendChild(Td()) + col -= 1 + + return + + def prepareGrid(self, rows, cols): + for row in range(self.getRowCount(), self.getRowCount() + rows): + self.prepareCol(row, cols) + + def clear(self): + for row in self.body._children[:]: + + 
for col in row._children[:]: + row.removeChild(col) + + self.body.removeChild(row) + + def _getCell(self): + return RowWrapper(self.body) + + def getRowCount(self): + cnt = 0 + + for tr in self.body._children: + cnt += tr["rowspan"] + + return cnt + + +# Time ----------------------------------------------------------------------------------------------------------------- + +class Time(Widget, _attrDatetime): + _tagName = "time" + + +# Track ---------------------------------------------------------------------------------------------------------------- + +class Track(Label, _attrSrc, _isVoid): + _tagName = "track" + + def _getKind(self): + return self.element.kind + + def _setKind(self, val): + self.element.kind = val + + def _getSrclang(self): + return self.element.srclang + + def _setSrclang(self, val): + self.element.srclang = val + + def _getDefault(self): + return True if self.element.hasAttribute("default") else False + + def _setDefault(self, val): + if val: + self.element.setAttribute("default", "") + else: + self.element.removeAttribute("default") + + +# Video ---------------------------------------------------------------------------------------------------------------- + +class Video(Widget, _attrSrc, _attrDimensions, _attrMultimedia): + _tagName = "video" + + def _getPoster(self): + return self.element.poster + + def _setPoster(self, val): + self.element.poster = val + + +######################################################################################################################## +# Utilities +######################################################################################################################## + +def unescape(val, maxLength=0): + """ + Unquotes several HTML-quoted characters in a string. + + :param val: The value to be unescaped. + :type val: str + + :param maxLength: Cut-off after maxLength characters. + A value of 0 means "unlimited". (default) + :type maxLength: int + + :returns: The unquoted string. 
+ :rtype: str + """ + val = val \ + .replace("<", "<") \ + .replace(">", ">") \ + .replace(""", "\"") \ + .replace("'", "'") + + if maxLength > 0: + return val[0:maxLength] + + return val + + +def doesEventHitWidgetOrParents(event, widget): + """ + Test if event 'event' hits widget 'widget' (or *any* of its parents) + """ + while widget: + if event.target == widget.element: + return True + + widget = widget.parent() + + return False + + +def doesEventHitWidgetOrChildren(event, widget): + """ + Test if event 'event' hits widget 'widget' (or *any* of its children) + """ + if event.target == widget.element: + return True + + for child in widget._children: + if doesEventHitWidgetOrChildren(event, child): + return True + + return False + + +def textToHtml(node, text): + """ + Generates html nodes from text by splitting text into content and into + line breaks html5.Br. + + :param node: The node where the nodes are appended to. + :param text: The text to be inserted. + """ + + for (i, part) in enumerate(text.split("\n")): + if i > 0: + node.appendChild(Br()) + + node.appendChild(TextNode(part)) + + +def parseInt(s, ret=0): + """ + Parses a value as int + """ + if not isinstance(s, str): + return int(s) + elif s: + if s[0] in "+-": + ts = s[1:] + else: + ts = s + + if ts and all([_ in "0123456789" for _ in ts]): + return int(s) + + return ret + + +def parseFloat(s, ret=0.0): + """ + Parses a value as float. 
+ """ + if not isinstance(s, str): + return float(s) + elif s: + if s[0] in "+-": + ts = s[1:] + else: + ts = s + + if ts and ts.count(".") <= 1 and all([_ in ".0123456789" for _ in ts]): + return float(s) + + return ret + + +######################################################################################################################## +# Keycodes +######################################################################################################################## + +def getKey(event): + """ + Returns the Key Identifier of the given event + + Available Codes: https://www.w3.org/TR/2006/WD-DOM-Level-3-Events-20060413/keyset.html#KeySet-Set + """ + if hasattr(event, "key"): + return event.key + + elif hasattr(event, "keyIdentifier"): + if event.keyIdentifier in ["Esc", "U+001B"]: + return "Escape" + else: + return event.keyIdentifier + + return None + + +def isArrowLeft(event): + return getKey(event) in ["ArrowLeft", "Left"] + +def isArrowUp(event): + return getKey(event) in ["ArrowUp", "Up"] + +def isArrowRight(event): + return getKey(event) in ["ArrowRight", "Right"] + +def isArrowDown(event): + return getKey(event) in ["ArrowDown", "Down"] + +def isEscape(event): + return getKey(event) == "Escape" + +def isReturn(event): + return getKey(event) == "Enter" + +def isControl(event): # The Control (Ctrl) key. + return getKey(event) == "Control" + +def isShift(event): + return getKey(event) == "Shift" + + +######################################################################################################################## +# HTML parser +######################################################################################################################## + +# Global variables required by HTML parser +__tags = None +__domParser = None + + +def registerTag(tagName, widgetClass, override=True): + assert issubclass(widgetClass, Widget), "widgetClass must be a sub-class of Widget!" 
+ global __tags + + if __tags is None: + _buildTags() + + if not override and tagName.lower() in __tags: + return + + attr = [] + + for fname in dir(widgetClass): + if fname.startswith("_set"): + attr.append(fname[4:].lower()) + + __tags[tagName.lower()] = (widgetClass, attr) + + +def tag(cls): + assert issubclass(cls, Widget) + registerTag(cls._parserTagName or cls.__name__, cls) # do NOT check for cls._tagName here!!! + return cls + + +def _buildTags(debug=False): + """ + Generates a dictionary of all to the html5-library + known tags and their associated objects and attributes. + """ + global __tags + + if __tags is not None: + return + + if __tags is None: + __tags = {} + + for cname in globals().keys(): + if cname.startswith("_"): + continue + + cls = globals()[cname] + + try: + if not issubclass(cls, Widget): + continue + except: + continue + + registerTag(cls._parserTagName or cls._tagName or cls.__name__, cls, override=False) + + if debug: + for tag in sorted(__tags.keys()): + print("{}: {}".format(tag, ", ".join(sorted(__tags[tag][1])))) + + +class HtmlAst(list): + pass + + +def parseHTML(html, debug=False): + """ + Parses the provided HTML-code according to the objects defined in the html5-library. + """ + + def convertEncodedText(txt): + """ + Convert HTML-encoded text into decoded string. + + The reason for this function is the handling of HTML entities, which is not + properly supported by native JavaScript. + + We use the browser's DOM parser to to this, according to + https://stackoverflow.com/questions/3700326/decode-amp-back-to-in-javascript + + :param txt: The encoded text. + :return: The decoded text. + """ + global __domParser + + if jseval is None: + return txt + + if __domParser is None: + __domParser = jseval("new DOMParser") + + dom = __domParser.parseFromString("" + str(txt), "text/html") + return dom.body.textContent + + def scanWhite(l): + """ + Scan and return whitespace. 
+ """ + + ret = "" + while l and l[0] in " \t\r\n": + ret += l.pop(0) + + return ret + + def scanWord(l): + """ + Scan and return a word. + """ + + ret = "" + while l and l[0] not in " \t\r\n" + "<>=\"'": + ret += l.pop(0) + + return ret + + stack = [] + + # Obtain tag descriptions, if not already done! + global __tags + + if __tags is None: + _buildTags(debug=debug) + + # Prepare stack and input + stack.append((None, None, HtmlAst())) + html = [ch for ch in html] + + # Parse + while html: + tag = None + text = "" + + # Auto-close void elements (_isVoid), e.g.
,
, etc. + while stack and stack[-1][0] and issubclass(__tags[stack[-1][0]][0], _isVoid): + stack.pop() + + if not stack: + break + + parent = stack[-1][2] + + while html: + ch = html.pop(0) + + # Comment + if html and ch == "<" and "".join(html[:3]) == "!--": + html = html[3:] + while html and "".join(html[:3]) != "-->": + html.pop(0) + + html = html[3:] + + # Opening tag + elif html and ch == "<" and html[0] != "/": + tag = scanWord(html) + if tag.lower() in __tags: + break + + text += ch + tag + + # Closing tag + elif html and stack[-1][0] and ch == "<" and html[0] == "/": + junk = ch + junk += html.pop(0) + + tag = scanWord(html) + junk += tag + + if stack[-1][0] == tag.lower(): + junk += scanWhite(html) + if html and html[0] == ">": + html.pop(0) + stack.pop() + tag = None + break + + text += junk + tag = None + + else: + text += ch + + # Append plain text (if not only whitespace) + if (text and ((len(text) == 1 and text in ["\t "]) + or not all([ch in " \t\r\n" for ch in text]))): + # print("text", text) + parent.append(convertEncodedText(text)) + + # Create tag + if tag: + tag = tag.lower() + # print("tag", tag) + + elem = (tag, {}, HtmlAst()) + + stack.append(elem) + parent.append(elem) + + while html: + scanWhite(html) + if not html: + break + + # End of tag > + if html[0] == ">": + html.pop(0) + break + + # Closing tag at end /> + elif html[0] == "/": + html.pop(0) + scanWhite(html) + + if html[0] == ">": + stack.pop() + html.pop(0) + break + + val = att = scanWord(html).lower() + + if not att: + html.pop(0) + continue + + if att in __tags[tag][1] or att in ["[name]", "style", "disabled", "hidden"] or att.startswith("data-"): + scanWhite(html) + if html[0] == "=": + html.pop(0) + scanWhite(html) + + if html[0] in "\"'": + ch = html.pop(0) + + val = "" + while html and html[0] != ch: + val += html.pop(0) + + html.pop(0) + + if att not in elem[1]: + elem[1][att] = val + else: + elem[1][att] += " " + val + + continue + + while stack and stack[-1][0]: + 
stack.pop() + + return stack[0][2] + +def fromHTML(html, appendTo=None, bindTo=None, debug=False, vars=None, **kwargs): + """ + Parses the provided HTML code according to the objects defined in the html5-library. + html can also be pre-compiled by `parseHTML()` so that it executes faster. + + Constructs all objects as DOM nodes. The first level is chained into appendTo. + If no appendTo is provided, appendTo will be set to html5.Body(). + + If bindTo is provided, objects are bound to this widget. + + ```python + from vi import html5 + + div = html5.Div() + html5.parse.fromHTML(''' + ''', div) + + div.myLink.appendChild("appended!") + ``` + """ + + # Handle defaults + if bindTo is None: + bindTo = appendTo + + if isinstance(html, str): + html = parseHTML(html, debug=debug) + + assert isinstance(html, HtmlAst) + + if isinstance(vars, dict): + kwargs.update(vars) + + def replaceVars(txt): + for var, val in kwargs.items(): + txt = txt.replace("{{%s}}" % var, str(val) if val is not None else "") + + return txt + + def interpret(parent, items): + ret = [] + + for item in items: + if isinstance(item, str): + txt = TextNode(replaceVars(item)) + + if parent: + parent.appendChild(txt) + + ret.append(txt) + continue + + tag = item[0] + atts = item[1] + children = item[2] + + # Special handling for tables: A "thead" and "tbody" are already part of table! + if tag in ["thead", "tbody"] and isinstance(parent, Table): + wdg = getattr(parent, tag[1:]) + + # Usual way: Construct new element and chain it into the parent. + else: + wdg = __tags[tag][0]() + + for att, val in atts.items(): + val = replaceVars(val) + + if att == "[name]": + # Allow disable binding! 
+ if not bindTo: + continue + + if getattr(bindTo, val, None): + print("Cannot assign name '{}' because it already exists in {}".format(val, bindTo)) + + elif not (any([val.startswith(x) for x in + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + "_"]) + and all( + [x in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + "0123456789" + "_" + for x in val[1:]])): + print("Cannot assign name '{}' because it contains invalid characters".format(val)) + + else: + setattr(bindTo, val, wdg) + wdg.onBind(bindTo, val) + + if debug: + print("name '{}' assigned to {}".format(val, bindTo)) + + elif att == "class": + # print(tag, att, val.split()) + wdg.addClass(*val.split()) + + elif att == "disabled": + # print(tag, att, val) + if val == "disabled": + wdg.disable() + + elif att == "hidden": + # print(tag, att, val) + if val == "hidden": + wdg.hide() + + elif att == "style": + for dfn in val.split(";"): + if ":" not in dfn: + continue + + att, val = dfn.split(":", 1) + + # print(tag, "style", att.strip(), val.strip()) + wdg["style"][att.strip()] = val.strip() + + elif att.startswith("data-"): + wdg["data"][att[5:]] = val + + else: + wdg[att] = parseInt(val, val) + + interpret(wdg, children) + + if parent and not wdg.parent(): + parent.appendChild(wdg) + + ret.append(wdg) + + return ret + + return interpret(appendTo, html) + + +if __name__ == '__main__': + print(globals()) diff --git a/docs/ide/html5/ext.py b/docs/ide/html5/ext.py new file mode 100644 index 0000000..330d032 --- /dev/null +++ b/docs/ide/html5/ext.py @@ -0,0 +1,475 @@ +# -*- coding: utf-8 -*- +from . import core as html5 +from . 
import utils + +class Button(html5.Button): + + def __init__(self, txt=None, callback=None, className=None, *args, **kwargs): + super().__init__(*args, **kwargs) + self["class"] = "btn" + + if className: + self.addClass(className) + + self["type"] = "button" + + if txt is not None: + self.setText(txt) + + self.callback = callback + self.sinkEvent("onClick") + + def setText(self, txt): + if txt is not None: + self.element.innerHTML = txt + self["title"] = txt + else: + self.element.innerHTML = "" + self["title"] = "" + + def onClick(self, event): + event.stopPropagation() + event.preventDefault() + if self.callback is not None: + self.callback(self) + + +class Input(html5.Input): + def __init__(self, type="text", placeholder=None, callback=None, id=None, focusCallback=None, *args, **kwargs): + """ + + :param type: Input type. Default: "text + :param placeholder: Placeholder text. Default: None + :param callback: Function to be called onChanged: callback(id, value) + :param id: Optional id of the input element. Will be passed to callback + :return: + """ + super().__init__(*args, **kwargs) + self["class"] = "input" + self["type"] = type + if placeholder is not None: + self["placeholder"] = placeholder + + self.callback = callback + if id is not None: + self["id"] = id + self.sinkEvent("onChange") + + self.focusCallback = focusCallback + if focusCallback: + self.sinkEvent("onFocus") + + def onChange(self, event): + event.stopPropagation() + event.preventDefault() + if self.callback is not None: + self.callback(self, self["id"], self["value"]) + + def onFocus(self, event): + event.stopPropagation() + event.preventDefault() + if self.focusCallback is not None: + self.focusCallback(self, self["id"], self["value"]) + + def onDetach(self): + super().onDetach() + self.callback = None + + +class Popup(html5.Div): + def __init__(self, title=None, id=None, className=None, icon=None, enableShortcuts=True, closeable=True, *args, **kwargs): + super().__init__(""" +
+
+
+
+ +
+
+
+
+
+
+
+
+
+ """) + + self.appendChild = self.popupBody.appendChild + self.fromHTML = lambda *args, **kwargs: self.popupBody.fromHTML(*args, **kwargs) if kwargs.get("bindTo") else self.popupBody.fromHTML(bindTo=self, *args, **kwargs) + + self["class"] = "popup popup--center is-active" + if className: + self.addClass(className) + + if closeable: + closeBtn = Button("×", self.close, className="item-action") + closeBtn.removeClass("btn") + self.popupHeadItem.appendChild(closeBtn) + + if title: + self.popupHeadline.appendChild(title) + + if icon: + self.popupIcon.appendChild(icon[0]) + elif title: + self.popupIcon.appendChild(title[0]) + else: + self.popupIcon.appendChild("Vi") #fixme!!! this _LIBRARY_ is not only used in the Vi... + + # id can be used to pass information to callbacks + self.id = id + + #FIXME: Implement a global overlay! One popupOverlay next to a list of popups. + self.popupOverlay = html5.Div() + self.popupOverlay["class"] = "popup-overlay is-active" + + self.enableShortcuts = enableShortcuts + self.onDocumentKeyDownMethod = None + + self.popupOverlay.appendChild(self) + html5.Body().appendChild(self.popupOverlay) + + #FIXME: Close/Cancel every popup with click on popupCloseBtn without removing the global overlay. 
+ + def onAttach(self): + super(Popup, self).onAttach() + + if self.enableShortcuts: + self.onDocumentKeyDownMethod = self.onDocumentKeyDown # safe reference to method + html5.document.addEventListener("keydown", self.onDocumentKeyDownMethod) + + def onDetach(self): + super(Popup, self).onDetach() + + if self.enableShortcuts: + html5.document.removeEventListener("keydown", self.onDocumentKeyDownMethod) + + def onDocumentKeyDown(self, event): + if html5.isEscape(event): + self.close() + + def close(self, *args, **kwargs): + html5.Body().removeChild(self.popupOverlay) + self.popupOverlay = None + + + +class InputDialog(Popup): + def __init__(self, text, value="", successHandler=None, abortHandler=None, + successLbl="OK", abortLbl="Cancel", placeholder="", *args, **kwargs): + + super().__init__(*args, **kwargs) + self.addClass("popup--inputdialog") + + self.sinkEvent("onKeyDown", "onKeyUp") + + self.successHandler = successHandler + self.abortHandler = abortHandler + + self.fromHTML( + """ +
+ + +
+ """, + vars={ + "text": text, + "value": value, + "placeholder": placeholder + } + ) + + # Cancel + self.popupFoot.appendChild(Button(abortLbl, self.onCancel, className="btn--cancel btn--danger")) + + # Okay + self.okayBtn = Button(successLbl, self.onOkay, className="btn--okay btn--primary") + if not value: + self.okayBtn.disable() + + self.popupFoot.appendChild(self.okayBtn) + + self.inputElem.focus() + + def onKeyDown(self, event): + if html5.isReturn(event) and self.inputElem["value"]: + event.stopPropagation() + event.preventDefault() + self.onOkay() + + def onKeyUp(self, event): + if self.inputElem["value"]: + self.okayBtn.enable() + else: + self.okayBtn.disable() + + def onDocumentKeyDown(self, event): + if html5.isEscape(event): + event.stopPropagation() + event.preventDefault() + self.onCancel() + + def onOkay(self, *args, **kwargs): + if self.successHandler: + self.successHandler(self, self.inputElem["value"]) + self.close() + + def onCancel(self, *args, **kwargs): + if self.abortHandler: + self.abortHandler(self, self.inputElem["value"]) + self.close() + + +class Alert(Popup): + """ + Just displaying an alerting message box with OK-button. + """ + + def __init__(self, msg, title=None, className=None, okCallback=None, okLabel="OK", icon="!", closeable=True, *args, **kwargs): + super().__init__(title, className=None, icon=icon, closeable=closeable, *args, **kwargs) + self.addClass("popup--alert") + + if className: + self.addClass(className) + + self.okCallback = okCallback + + message = html5.Span() + message.addClass("alert-msg") + self.popupBody.appendChild(message) + + if isinstance(msg, str): + msg = msg.replace("\n", "
") + + message.appendChild(msg, bindTo=False) + + self.sinkEvent("onKeyDown") + + if closeable: + okBtn = Button(okLabel, callback=self.onOkBtnClick) + okBtn.addClass("btn--okay btn--primary") + self.popupFoot.appendChild(okBtn) + + okBtn.focus() + + def drop(self): + self.okCallback = None + self.close() + + def onOkBtnClick(self, sender=None): + if self.okCallback: + self.okCallback(self) + + self.drop() + + def onKeyDown(self, event): + if html5.isReturn(event): + event.stopPropagation() + event.preventDefault() + self.onOkBtnClick() + + +class YesNoDialog(Popup): + def __init__(self, question, title=None, yesCallback=None, noCallback=None, + yesLabel="Yes", noLabel="No", icon="?", + closeable=False, *args, **kwargs): + super().__init__(title, closeable=closeable, icon=icon, *args, **kwargs) + self.addClass("popup--yesnodialog") + + self.yesCallback = yesCallback + self.noCallback = noCallback + + lbl = html5.Span() + lbl["class"].append("question") + self.popupBody.appendChild(lbl) + + if isinstance(question, html5.Widget): + lbl.appendChild(question) + else: + utils.textToHtml(lbl, question) + + if len(noLabel): + btnNo = Button(noLabel, className="btn--no", callback=self.onNoClicked) + #btnNo["class"].append("btn--no") + self.popupFoot.appendChild(btnNo) + + btnYes = Button(yesLabel, callback=self.onYesClicked) + btnYes["class"].append("btn--yes") + self.popupFoot.appendChild(btnYes) + + self.sinkEvent("onKeyDown") + btnYes.focus() + + def onKeyDown(self, event): + if html5.isReturn(event): + event.stopPropagation() + event.preventDefault() + self.onYesClicked() + + def onDocumentKeyDown(self, event): + if html5.isEscape(event): + event.stopPropagation() + event.preventDefault() + self.onNoClicked() + + def drop(self): + self.yesCallback = None + self.noCallback = None + self.close() + + def onYesClicked(self, *args, **kwargs): + if self.yesCallback: + self.yesCallback(self) + + self.drop() + + def onNoClicked(self, *args, **kwargs): + if self.noCallback: + 
self.noCallback(self) + + self.drop() + + +class SelectDialog(Popup): + + def __init__(self, prompt, items=None, title=None, okBtn="OK", cancelBtn="Cancel", forceSelect=False, + callback=None, *args, **kwargs): + super().__init__(title, *args, **kwargs) + self["class"].append("popup--selectdialog") + + self.callback = callback + self.items = items + assert isinstance(self.items, list) + + # Prompt + if prompt: + lbl = html5.Span() + lbl["class"].append("prompt") + + if isinstance(prompt, html5.Widget): + lbl.appendChild(prompt) + else: + utils.textToHtml(lbl, prompt) + + self.popupBody.appendChild(lbl) + + # Items + if not forceSelect and len(items) <= 3: + for idx, item in enumerate(items): + if isinstance(item, dict): + title = item.get("title") + cssc = item.get("class") + elif isinstance(item, tuple): + title = item[1] + cssc = None + else: + title = item + + btn = Button(title, callback=self.onAnyBtnClick) + btn.idx = idx + + if cssc: + btn.addClass(cssc) + + self.popupBody.appendChild(btn) + else: + self.select = html5.Select() + self.popupBody.appendChild(self.select) + + for idx, item in enumerate(items): + if isinstance(item, dict): + title = item.get("title") + elif isinstance(item, tuple): + title = item[1] + else: + title = item + + opt = html5.Option(title) + opt["value"] = str(idx) + + self.select.appendChild(opt) + + if okBtn: + self.popupFoot.appendChild(Button(okBtn, callback=self.onOkClick)) + + if cancelBtn: + self.popupFoot.appendChild(Button(cancelBtn, callback=self.onCancelClick)) + + def onAnyBtnClick(self, sender): + item = self.items[sender.idx] + + if isinstance(item, dict) and item.get("callback") and callable(item["callback"]): + item["callback"](item) + + if self.callback: + self.callback(item) + + self.items = None + self.close() + + def onCancelClick(self, sender=None): + self.close() + + def onOkClick(self, sender=None): + assert self.select["selectedIndex"] >= 0 + item = 
self.items[int(self.select.children(self.select["selectedIndex"])["value"])] + + if isinstance(item, dict) and item.get("callback") and callable(item["callback"]): + item["callback"](item) + + if self.callback: + self.callback(item) + + self.items = None + self.select = None + self.close() + + +class TextareaDialog(Popup): + def __init__(self, text, value="", successHandler=None, abortHandler=None, successLbl="OK", abortLbl="Cancel", + *args, **kwargs): + super().__init__(*args, **kwargs) + self["class"].append("popup--textareadialog") + + self.successHandler = successHandler + self.abortHandler = abortHandler + + span = html5.Span() + span.element.innerHTML = text + self.popupBody.appendChild(span) + + self.inputElem = html5.Textarea() + self.inputElem["value"] = value + self.popupBody.appendChild(self.inputElem) + + okayBtn = Button(successLbl, self.onOkay) + okayBtn["class"].append("btn--okay") + self.popupFoot.appendChild(okayBtn) + + cancelBtn = Button(abortLbl, self.onCancel) + cancelBtn["class"].append("btn--cancel") + self.popupFoot.appendChild(cancelBtn) + + self.sinkEvent("onKeyDown") + + self.inputElem.focus() + + def onDocumentKeyDown(self, event): + if html5.isEscape(event): + event.stopPropagation() + event.preventDefault() + self.onCancel() + + def onOkay(self, *args, **kwargs): + if self.successHandler: + self.successHandler(self, self.inputElem["value"]) + self.close() + + def onCancel(self, *args, **kwargs): + if self.abortHandler: + self.abortHandler(self, self.inputElem["value"]) + self.close() diff --git a/docs/ide/html5/ignite.py b/docs/ide/html5/ignite.py new file mode 100644 index 0000000..61c10a0 --- /dev/null +++ b/docs/ide/html5/ignite.py @@ -0,0 +1,186 @@ +# -*- coding: utf-8 -*- +from . 
import core as html5 + + +@html5.tag +class Label(html5.Label): + _parserTagName = "ignite-label" + + def __init__(self, *args, **kwargs): + super(Label, self).__init__(style="label ignt-label", *args, **kwargs) + + +@html5.tag +class Input(html5.Input): + _parserTagName = "ignite-input" + + def __init__(self, *args, **kwargs): + super(Input, self).__init__(style="input ignt-input", *args, **kwargs) + + +@html5.tag +class Switch(html5.Div): + _parserTagName = "ignite-switch" + + def __init__(self, *args, **kwargs): + super(Switch, self).__init__(style="switch ignt-switch", *args, **kwargs) + + self.input = html5.Input(style="switch-input") + self.appendChild(self.input) + self.input["type"] = "checkbox" + + switchLabel = html5.Label(forElem=self.input) + switchLabel.addClass("switch-label") + self.appendChild(switchLabel) + + def _setChecked(self, value): + self.input["checked"] = bool(value) + + def _getChecked(self): + return self.input["checked"] + + +@html5.tag +class Check(html5.Input): + _parserTagName = "ignite-check" + + def __init__(self, *args, **kwargs): + super(Check, self).__init__(style="check ignt-check", *args, **kwargs) + + checkInput = html5.Input() + checkInput.addClass("check-input") + checkInput["type"] = "checkbox" + self.appendChild(checkInput) + + checkLabel = html5.Label(forElem=checkInput) + checkLabel.addClass("check-label") + self.appendChild(checkLabel) + + +@html5.tag +class Radio(html5.Div): + _parserTagName = "ignite-radio" + + def __init__(self, *args, **kwargs): + super(Radio, self).__init__(style="radio ignt-radio", *args, **kwargs) + + radioInput = html5.Input() + radioInput.addClass("radio-input") + radioInput["type"] = "radio" + self.appendChild(radioInput) + + radioLabel = html5.Label(forElem=radioInput) + radioLabel.addClass("radio-label") + self.appendChild(radioLabel) + + +@html5.tag +class Select(html5.Select): + _parserTagName = "ignite-select" + + def __init__(self, *args, **kwargs): + super(Select, 
self).__init__(style="select ignt-select", *args, **kwargs) + + defaultOpt = html5.Option() + defaultOpt["selected"] = True + defaultOpt["disabled"] = True + defaultOpt.element.innerHTML = "" + self.appendChild(defaultOpt) + + +@html5.tag +class Textarea(html5.Textarea): + _parserTagName = "ignite-textarea" + + def __init__(self, *args, **kwargs): + super(Textarea, self).__init__(style="textarea ignt-textarea", *args, **kwargs) + + +@html5.tag +class Progress(html5.Progress): + _parserTagName = "ignite-progress" + + def __init__(self, *args, **kwargs): + super(Progress, self).__init__(style="progress ignt-progress", *args, **kwargs) + + +@html5.tag +class Item(html5.Div): + _parserTagName = "ignite-item" + + def __init__(self, title=None, descr=None, className=None, *args, **kwargs): + super(Item, self).__init__(style="item ignt-item", *args, **kwargs) + if className: + self.addClass(className) + + self.fromHTML(""" +
+
+
+
+
+
+ """) + + if title: + self.itemHeadline.appendChild(html5.TextNode(title)) + + if descr: + self.itemSubline = html5.Div() + self.addClass("item-subline ignt-item-subline") + self.itemSubline.appendChild(html5.TextNode(descr)) + self.appendChild(self.itemSubline) + + +@html5.tag +class Table(html5.Table): + _parserTagName = "ignite-table" + + def __init__(self, *args, **kwargs): + super(Table, self).__init__(*args, **kwargs) + self.head.addClass("ignt-table-head") + self.body.addClass("ignt-table-body") + + def prepareRow(self, row): + assert row >= 0, "Cannot create rows with negative index" + + for child in self.body._children: + row -= child["rowspan"] + if row < 0: + return + + while row >= 0: + tableRow = html5.Tr() + tableRow.addClass("ignt-table-body-row") + self.body.appendChild(tableRow) + row -= 1 + + def prepareCol(self, row, col): + assert col >= 0, "Cannot create cols with negative index" + self.prepareRow(row) + + for rowChild in self.body._children: + row -= rowChild["rowspan"] + + if row < 0: + for colChild in rowChild._children: + col -= colChild["colspan"] + if col < 0: + return + + while col >= 0: + tableCell = html5.Td() + tableCell.addClass("ignt-table-body-cell") + rowChild.appendChild(tableCell) + col -= 1 + + return + def fastGrid( self, rows, cols, createHidden=False ): + colsstr = "".join(['' for i in range(0, cols)]) + tblstr = '' + + for r in range(0, rows): + tblstr += '%s' %("is-hidden" if createHidden else "",colsstr) + tblstr +="" + + self.fromHTML(tblstr) diff --git a/docs/ide/html5/utils.py b/docs/ide/html5/utils.py new file mode 100644 index 0000000..d80f672 --- /dev/null +++ b/docs/ide/html5/utils.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +from . import core as html5 + +def unescape(val, maxLength = 0): + """ + Unquotes several HTML-quoted characters in a string. + + :param val: The value to be unescaped. + :type val: str + + :param maxLength: Cut-off after maxLength characters. + A value of 0 means "unlimited". 
(default) + :type maxLength: int + + :returns: The unquoted string. + :rtype: str + """ + val = val \ + .replace("<", "<") \ + .replace(">", ">") \ + .replace(""", "\"") \ + .replace("'", "'") + + if maxLength > 0: + return val[0:maxLength] + + return val + +def doesEventHitWidgetOrParents(event, widget): + """ + Test if event 'event' hits widget 'widget' (or *any* of its parents) + """ + while widget: + if event.target == widget.element: + return widget + + widget = widget.parent() + + return None + +def doesEventHitWidgetOrChildren(event, widget): + """ + Test if event 'event' hits widget 'widget' (or *any* of its children) + """ + if event.target == widget.element: + return widget + + for child in widget.children(): + if doesEventHitWidgetOrChildren(event, child): + return child + + return None + +def textToHtml(node, text): + """ + Generates html nodes from text by splitting text into content and into + line breaks html5.Br. + + :param node: The node where the nodes are appended to. + :param text: The text to be inserted. + """ + + for (i, part) in enumerate(text.split("\n")): + if i > 0: + node.appendChild(html5.Br()) + + node.appendChild(html5.TextNode(part)) + +def parseInt(s, ret = 0): + """ + Parses a value as int + """ + if not isinstance(s, str): + return int(s) + elif s: + if s[0] in "+-": + ts = s[1:] + else: + ts = s + + if ts and all([_ in "0123456789" for _ in ts]): + return int(s) + + return ret + +def parseFloat(s, ret = 0.0): + """ + Parses a value as float. 
+ """ + if not isinstance(s, str): + return float(s) + elif s: + if s[0] in "+-": + ts = s[1:] + else: + ts = s + + if ts and ts.count(".") <= 1 and all([_ in ".0123456789" for _ in ts]): + return float(s) + + return ret diff --git a/docs/ide/index.html b/docs/ide/index.html new file mode 100644 index 0000000..48a1505 --- /dev/null +++ b/docs/ide/index.html @@ -0,0 +1,101 @@ + + + + + + + + + + + + + + + + + + diff --git a/docs/ide/is-loading.gif b/docs/ide/is-loading.gif new file mode 100644 index 0000000000000000000000000000000000000000..79a8a679979fcc077a9e95f70ab292344b3c3438 GIT binary patch literal 404 zcmZ?wbhEHb)Mnsj_{hL;{QpUYM#IztwLntwKewN2NU*bGfUA+70W%{51B2o}Dd(cp z#FEq$h4Rdj428t3#EP8!#1y^EytI4<#h)ykTtHPiK+FI#kb%jlrH|wEw3|1YKOg0i zz8CiSbn64Nwus1uY3jB2X6;+fldkJ|z5SHluOpj(+U;1NVB{kTv;@-_+ao|@6!;U9 zH_mD~zNY5$jKq82ds1f1n15=am$O%a&JOnMvm4jGRN>pim?|QbAMvTS@08!=N3(R2 zf{w4pZX(EG20#;6&dQ3ulmCdHOL%+b!^)l|R~D?;#kY+6&}*M$v(D!y&%OK^G;78h z(=!G=?x$XUnh7*<#X77eIzYWTW6Cae>AH=-cBcBQFq!Wz7pPUDestZ+-+UX*W^bL_ m|98{li7C#pZ=Y#8$;#H= zlFZ%5*^{f& zlMjoOSoiA2h+o_$%~2;+kUxTPqtoHxSjM#wWzg7uXG{pH@H@qE#_sd&$IW^Ct+m9> zJMJ&i`)9VX#|zI(pL54FIjK`ZRpo~H5%wHjS)V{=(C@gQace)LT>FC6epTkCvzgGH@OQ=(i88M~Xs`rGv*yOB<&mQBPYii>LKRCes|T$YMll9=^r zqvk~-VWZTznHppj<1{k#eA-`V6yG)s%#ZoBH7cJ>+oHwjaxE814@K+cRIkiSLJA9Y zV=J||97H=4vVawOUnLHyco_BBn(eG6JK}%n+pgF({?&6TlAm@icvz`!-FU^rfaM{m z;^GZ*wiy4@sF3TiV&6DjA3V(>|`z~(w@ z=!YD7ZO#&-XCb`?yjmBxDQzttH6v~G#+`0w55uFr2jWT~$ngd=4#TqEquMTV99Xrg zT-joH_*cDN8`oFKy!7?yqY1s(W%y4Ou$4gmP!F}$J`}p)b4ZC^rLihNx_*N+f_(n; zLZHfFE`JKGAYi!u_Y?WnLub0pMM*@ftuS53ftIe6-Y%XO!**Fs%f^&9Ig!@^2kTofl7qz}Z$s4+C{yVdL)6n1yscswMo7Jn%wZ5#E@I!mT zBtgUXr-72>76N}jREpGQ#ES?!s3P{l(7KpA?&BBXP7Xmn)_KDmHSNrFZfd6kf~mH2 zvL2+$qG@lWkzKq{d^$1d-womucbq&(RT1RfIZDSf@7QT|+gz%`v~d0%JS;!1gs|Im|Co+R=)3L;tJ(Qf?MxxTSq zmc%)qY)5fIQi4nnfjT={o<*7K!_i@cS}X`DH>$WCOQ({vA$Tvcoym8Yl`IZka|d?? 
zRx&Mw?-Ugx(P0Pww)P!)i6BoMC?h=<)sD0XEH6%*G9AnBd`6U!5H_#!aES}0M8eXX ze@j}|t34n;3J|B2 zAI`-xrI~$McW%(4)i?V-DmHwE*8-bqx3w#RT@`s}+Bkb5)5}ouYt^9;s|Yq1kD5TE zaDV5}xQ&hB03NlKq6NCIw(+iztX-ArCst2AMb$-%)u7s;Lt#hW5so@1(uVlb%xu=~ zAlHuR+@ytfdmUL@D{@D_C2BHc2XuUT$WrQUvM0J=lIw_(@B9J=`pax1-LT2O z!cpq5mt`bJcRHc~$eoJv~6K7VBz|J4hL%y6m-1g9Q5APJ9pol3vh8ahvoRt z#O>m7rj49qEs#kBG}6xTj|#Dv+6jP$qsb+~ci5#`RS_P0uy;dhHB?C0ZWIYtI$@zd zPpSauGlRjwMP=NCDTNMbh)6BMBLAowc6?G=QT~>s;8SS5YRG)C*gs`ck( z%|p`s;;1C8QbpGArNJ}*#pW&;&@e4AmxfN%LdQuw0CyH8OZ#txg~73&dlX^-fZ^N3{ zRW9smq_=^&U^vd0mD1yd1NLgh*CeO_O3x)9s@e>VjpGd24fX*SyI}f>@H9ea`zv=U z9+Mk$O00Firr{M%k|C>a0?!tv9E5bT>43C5#w{!PdeS!c3mx+wOqM^EMJHpMWWQCo zMJh8DU&i&2aBp@Jwi)7AX)eYLO#D}~Q3C)6ZVbfC#WntI<{(1{ZYdF+tq&^V!3>?W zmw6zGR>@f>+B*MgB8UTX9`mdp*O2AO*VntH?)UI2kccfV^Llsi{G$c6u9{a{2Z1mA z$CA4XIX@{1jbrt*Nzzdbx2Lm^^@im|Sn08$V<+tas*OnF3~}pG{T=SEcpGt~w2iYU^n6S6tkO-IP} z%Lfcx&h^6|(6H z^oH(wtvSGiND@QP10LDc*n@+S86}gWot?+ipj!CeT=xJ5_|z7miQ#!nq2*te=t5m) z-7Ad^D#i|j?JCrsgeqDN5l@mVGq#5)DVDkF5^h)Eg_!w%he3F5v4g;^`Vu?CJN#}U ztz~sPyWr1n#y+-1@X`BLxH>~p#LwGJ@A^8vg^+M~#Z5X;Ci`e*ki5oWo^u4m>90Y# zs&iNtYHv%I%*GWjruQj!TtBoPW|xxvjl63af%@0?;)r=*at?B3f*Qn#@$-B{*>0qVKvCNsHaRpzhYBh-N9cm`P;=0(}zTd zS;E>rN1q4WL+@nxSGc=NIDqs>7Q32{AYBn+yL#9DXONHwH`dPdVJ&eO)JSi?PO=e3CKjl(Ki0i!;(iZ+NZU!obl zzeUr1E6t7ULm^t|YV#Ywj8nnpi*jOZp8V{DcF1w}S(C zgz}P}vK;sgXW79$yqJZVlW}?NqJIG$KT!KKs%c>Vl`7IE2wIRB$*2DXY4r?J%|r~P zDk{P}#pvS-*1mu^Q=; zDkV#G9n6DESFPAdFy}L|*`8uNp=aXB@5cYITT#=8{MbDu5$fKM^wBzX3rb+-0S zoaqg!#+V{QtOl9Y^-mEra>FUyXV~VtUVF3)3q2MTBum0A8xpivh~AJ(cOr3_C*)Ip z$k(oI7R``j<+-oOOs>#(_E5M>$coEQ%wEACwBp>B+Bfr9u6$eXaozM#ck7V3AS0*y zfE*zJ{Rx8$6cG_5rO&t#@cJU*VBS=K(&>giD!+ zOT2_fQeL~T-?Pi$lPKtQ%giyy_w5Y`fu#!Pq+?Z_`^BXcCF61G82l*O zYL-5gG?_70sefw5mg8OjeXI8}r;Db3PEsT;mWhTFNg71vOmJK~dKafqL<^RXE2fiw z2dQB%#xK2ZEWP_}J{wUwiA_xAVZwoLzs6-g;(&Qy>h;qFtH zs}x4v-xgtn4AD8il2rSG7H-DbgtoOg#}vHfNRVhu@@-*tTR>v$N}NCapjd~o+kp@o zJ$T)Wm~#Kgtxe}75Kk5YAVPb@K?l#|z&#=hEqd}nLlT>RU_4t5(U6SGp{F1$Z=FdQ 
zKP6wHMy7f;`sePC@a?Zp=*jcy%kW**$CF)x9D$?zm%vH+BlS`?JLX8T4gFm0?Ljo< z7`;&on@P6LMFatQp%S5CJ1hiAha$IdG4yUBpLa>4z2+^n@>!e+d&lq$g#evuoHC@5 z1T*N5bhUaLdTj=gWGco#EOEt{6&dFB|DIoC@X}&@*$Gh=X(NgJ+>(Pzo3r{HPxG_$ zYx#`m(|ybqM9V<8-r-6}!vmrS{rNldyit?hlMey~*4@bIj_v!~`kq$lAS(6SOfozGI?1$5HmUg> z5@{(+aQ)Za+IV9kw!ZoH)uSc99%Q6BeGT=;R$?*!n6VG=ae6$GqD zciWkn%N)PmE79Sh58fNy;O+^||LLvnx$>#lElfM%vmwz8aFn=KT(rwo46aVZhqK@; zqKV6*3D6S+oBp)OQmX#Cr);ezedn9&Bmcecz*U)(Uz`3q!9wt-f}+PjQ^@mB>Th>P z)4&yn)0cqIQi!3w=`}Yqy!hcf8G(aJFBDO<(PDP}6PXitz3lnrWv516KZ5sRt498N z-}Nl23wO^4%y6;JADdZ|G-VU}XSH4^Lwj6Gtz#y|hljWXc1W>9XYTypUfGZ+Nwv(1 z=kU*cSNG(Giu9_7J(Z(XL&iVk!WwR^I*I$YuZH}X1R9t3I9ID`AI_J@EZZBrn2D|i ze=gWOPLWJmH2w}Z{ny}38gMqdbWFriI%li2ev*}dGveas$H9Y(;w^zk(86awLipY? zzsxt&36%am#b(|;^J`?m9=W-RTRT%xfmI(;jG!%x69u3jhzMXbQHPXnnrW_SMLM!bJxuB6Y3Tj< z{>TVQ#FeL}Ly&U>=x_)4dfk!a|LhJd{-_#)EKO^N*oxC(I4?;B&GW-s=sq|A0NLvg zoQ0)QRsr4m%5uEi@PN$?1~aNVu_yaNDu1_JO!FQ zQOmYQ9&u)NK3w*gK#aVFxLgizv|P#E3HPiw5+hEd+gF*{&*cx(&z;a%MN1`zELnRl z(nOl$PMRUt@EF-3?U3EQQp_S1W@KlBcO_ujf(|R*n)|2OEYKl4(|#W` z<1dj4u$YfF<-||q(zsPyzT{aSYxxOR^J)e6b&YoFv!=V-Z9~^--2l*Ok8RiWvRd2bIW+66IzO#$-_fC)! 
ztPdCY8`aB`lxXk?(iF2Ch0fl~vBR=MOLk)wA?$IJfh>42@J-ERo;I`e=Q}+y4&7;0w z@Hi2L`oTvN3943Th=juTV8#ydI6xsrBN2lcS~fQdR|(Pi>8Xbk0UIl%Q#l(IMZI++ ztW~pa{tXlJ!V@=DDeoAo!oX9h6o?NG9u^c)tR`wgu96NMYiw8lh0q2S8)6x6ZO=}k zP%^LS5ZnHCOrcF;*SspYL{nR9OWY|{#`kkdqgF0Ot^_Ss!WabXu;FUC*f{Tbxw;uD zG+>59z~>H)R(%|N8Wc37`0cluw;f?p&^l!G_-E=*0sv6Xb@Gugm0-O@u&GX9VStg87o*X^VDQj3oW$7p30$4zATM z<13Ec_|Dg%2HM;)4ar*YxzX-lZmzPjf-$K`37tU~dmucZ zz{~pb8*ibr)8$1dPvB!vxjRiGxy0g;$MkDh?1EK%gbG@LTIFB&_iq%Dcmj8|moJ<3&-$!JJQWc6|oKmSOB=?U3L#a@#Zl4Y9g$~U)=GGH8;8kPc$vuXB-x65{xj*Wp&Pu3D>ZN|;x5Au|(xj_=2 zZ>D8PU&RnFn*B*R`!is_zV+h@ZuIchvr($&;Ds>e*PnWcexqG4y1G9963)@R+g=IK zAtpaTkWd}ollDJ~wvJAvK==I=LprQb(Q3hV-qWb|p@@*e9zO_Xqi)Da&u!_~n|q-=Js z!scyDnl#WI77(mNvz_Tg0+m8$Vr@+NU6SI`5VLN==U*plj}ynpynUlDOCWE@frH22 zbAK=XZ;9MDEbsSb4q*``MN@RCu)h`z&6c&3n5F08!m zKvJhptUYF7(Z!^gAqkeKRt(s)pATar7;a8&>?&p4vW~c*MN665S-}S0uWpJZjOBH# zzDBEz%}AvOaL(ZNmi{cfZ_?pmOUX>@ydFyOy7YuM6kPx0J$-D4Kr2rhcVbCRpg3jb zj+XjW=ER1H%orZZ^^Oe>U65qnx@fNj8p4l}vhycsSAa$Qh* zYm@PPl2w5{b$p@f9tR$#>oE&Y5oi7pA{gQ6z>7aO7Y@_j*W1p~(S{j|7o3>t$x`$6 zyj=T&fktxs?C;gQGjr^UqTmv)9f^ zynQR=$k;`$;8IHT-g3RX_J^4VR)vv*p(4F>|3_~Y6jFp*MQKjB3SIBv{kvIuBM^1u zmu8Rr%;Vn}SI%#+?9XRx={f@6cXcL)uIu}#@pZBl@@B-gR2Xp*pGvKG_5v|7@1Cse zVsWz9-LCtOI^2X#=kV;}Ul3=#t1kH36;Rsk9}+;eq?bWwPyTdu2AnioN&6`t)k>@S zbXX@G`gB$L8D{nRJ*3ei?$rqg&eizo0p>$zyxz9<3K?=VK)F3^DpkxK>}ZK3Yd4nK zuL?Sc{8Tk~`d#XIM5#-Ea%kLA-7LuptfenO+4mUDt*!UC=#DzIvk5-dU7un$ntA^9 zB4CZ@ZT}fqpiXbiot^5alO|K*s`UE1W|AP$2!!lcW_3x2-ob2!HC{qX_yWtme3WY? 
zK~KHN;rYI@HGZ!k6z7v-3G@Z|f+urW# zVp({Rw(7bE&;#sf$dQT7o=wOsj2r(nG^^G6HUlm@ujYfiHJ4`BeN35}rGQ8>{za-o zx9egF64JzAu5t0|Yltc=)fJdWcuJuD(>;gUwy)T^XpIt0~Doc-qxI>F7t093$Yu zuZmrYMIKt2i%3gDGkxUId|D`1g7HtESom%5j8)JOBUqXdU7cg847Itr`NEJG5(ZEX zINWGJRV5B{du$EP<0h(Ow^9``QQIjfsN@qI+Ap}eJU43A+{ck4Zu-RdJKgkw*@sI#ewZDk%ZTfVXAt3GZt%T^xZgLB?CpQfSF3^w*AMWr z2zp8%Ctr+yYqA|j@@^7Tmr55cAy2*>I^tV1yY16*5!@#_V)GleFfCS+G%ntVu@c~$ zG@eyHssgTMs;v4tvx=p}ww)D6-CTW7lEzGznKCB!3uPo|E3rg1LMYqqnNy}L*6xYE z1<|oW7Xo-!ze#NhzitRG1en6V0Y$WGSO?0<3aXyyaE@OT~_z=^Kh{&dF zFb;GlE3ymgt_#Oj05hi^Eh=6)1$&;BH8$DB=GO~?lA z{1EXJ=rmz$0zUhdf%h(8jsJsT5BJwsZkTdhDJDzAsGZq=qm#InWHKWsRzrrX{O_0< zBVKdC}B8N-rEIaqUW}@e4d2vK>T1(m)9*a;f$GDun$i}6`tGCBmj|Y z7Odc!)VeP3P?AMp;H*0H53Lsr zv{)(S#dntqS4aT90iUf2J70WJ1EzLu+~!;>DBc+@Pcl_Sy7`T`0<+(%Ot~D)tn&h8YGLp$uf;7Nk0=^i{7`T& zc3vh>-N{~x6#WTHtVHz@w8-GaoDWyiKU*T!hlvzHg@lWETf3C`HdkZJJbU8Wv**7AyqKo{TYzV!5K1 zO0^>24IW`9o|8NCcBI$-94AM4Axp5{x!7$){2=GfcGwdMGb+I!$3c|F4TH#qMkmR< zoqFep`Bkh3dbXp$`7DtFxv1+c3J>po`J&C(lcZD{ICfb+U9?hMrHwY-Y$Yw<{l>?S~?p4!-6(tg49$11h4toCp*x6=#do8eSZ zzFvIaF6P<4WiQU3-*d6Tw$hx2UzY#?oY;2bIJrbJv+ZQ+d`U3amF(Qr%kVbrXG?iK zI*{hwPcH_@o?A^+d@aiO^gaFrYlo($f6|z;D;wK3d@dhn&y@j9V0sa%Am(NO~;RCTrY8XwaOG@88TgHlhywVpFEVfr^*f#rYn7A=Z9mgwN25 zuSn}-<{wkOh;bWY1(!Zv{=Xw2W`6+wd@9v*IWw_?-(6NRWE zM4l`|M^C}eE66n&w1Nv*uuG3G7!50qc^UTDtOBW31gHE0q)u!S=XWaw>}+9>K^8{! 
zUM1gN<^`Px9tQ#SPu1Jy>K)g!>q3#h@A-Ye^s*LU&5|+kU@s(H((p$#n9$EjR(7Z^ zmqMU86d(~S5&`;0IZK!31S>bEFq~q=j#Ht5pa>h5@l{)DxBls9PRbixv7i@EIKcKe zr<4dC>G#Ib4Fv$`0t(0j2hv=dxO>Dnj{DL}aLtn8?|cQZT%Ttd2YlOH_wDOdyoGUM zU%Azt&xy~Yu;q51Pv<#m5VRo*Rp1F=JS=?eUJ9Ru+8{?{)nxB~OjvfRFrG+SHf$hG zs6dO}!48nyG(~d*01WA&Mj+zsyqpg^=IkzS;doGWSG91iwX!u{Da~;Qa0I1|pN%gD zK51j|bZ3dHkRT4sumsbKk9jb^y!;Tw0WHq;;30(oOx!(sAt3qAU=+d0c}ZC1ql zaL&lY3u@4U`<~V<&bkPOJYx)?+gwIH&~VzFAVUS2Itz~%*|$HTEkKbBpGE}(Y5H!5 zQz!z6(QMX4M2t1B&|shdHyLs1-E$q?cTJ9X9%9Qxf@xPU^5~-GLIxx$l~JB9!#08@ z7N=Q}iaGF2bZ}%+>Wa$$c}8IN42;lVfXUT6SEQYH z8ZDkf4UO%a2l4o|>4TFAfuC;zO)bqGP$7Vs3M{Zzcl==q9LmzlSL5+t61#|l{$|nBRv*^ z7w3c%4H7>wmTLrhyxQ^eQU8e7o^?}mE6P!PWC+p#FJ?vY*OtC-&=;Fshim#yAAz6e zfsdDYZ*$9uEdFozCamttP2>&c(^E6e$F{_uE1RE5h3q0FYEJbkSHI{G7$?L^)F5(= zHVKg;%Ejtu>X;GaFv+sK7$@JE zL9fK(SrX{ogUl_;Q4MT9-` zE^+|Ofu5;GrEN$}wnv6yLsWII zTZI==9(0Zd4Us}-mqQa%1y1D4%zTWEjomh*{D1}kVmz#VclkYPd`VZLA!IhtBhJ;m z;R1VUpX2HEw`vtwD)cLL$RnC2wvlgttVxK$JPl2msz8H5ls!GvKnQin3>}L%Z~&Ja z9vmy12#~)^P(0n{ZxV~DVijZ1(>W8VU>P=~3o;H!?XlylV8QUxM9D=U8h86!&WrDK zdY%)F;>Nzy(Y%f~Edh1)HeK{FE8F?v#`dh=gg#}+jd{JwP#i83HZn>Nesh&}R8)NF z|Bzfir;vW+V)GWj(-{xhLx_-efh2`rcxHs`jqevkr?`TK6ipnn32*1?kp^jr*X!Ix zifw!FwXS!qMp+*M8`q?oJ&#`Y?r|@*b*~4(30n;{ym2XE7H_n?1Fh) z(c1aG<;Fwl1vzS)W`ZpSgQt_HMN(G$Qn|)6Z;O%EviI%`?MS`W=H_Q%fya>h4{lc< zfx8*}yC`g|Ykw*nH+pA*xR}8pQe~=gFcg3LA){gz-PXF{d$&kujTqH{q@OxsxA`~F zn~gvS24K>%^2);#xZq&=b=Jq~Q*V?e16q8Lu`DM{FG|FjC^3w-nCs?; zsx3?-;bXX!5Zj(lmT6)*b!^d1Kt!X$KnM@aoIbpKy>b5UA@E?_J)5*xrbVkivI$kv z$TS19Vt2bqK84T^3M_{lbMZKA(hIso*brwo}4KQcaSk#yxbSF`hSQ- z-i=7#7&69=Btl8Bzl3jykU1yQ!Y0Cz_&uKU3E>(DWXY8|oz8$GbA$UZM!eMJw|I&z zQtHI`!DqM2E}1B^^J1iLR$SR#py?ty1dOrx-`EV2$6)4gSTej@7*@2*R#(0Hd_6a( zS-m`Ze|AJHrJA-^GO$}Ht=)RsHtYCd;n2GHhQfkPBCs`F7USpn&|NfB$!j2PVJrL( z0DwXKZ!drgPUpS#Zw>FOw&-;~&w$l6vt8%-6yT}ND>n-|iGbs;>aO#;xq@wlNp7XG zrTZ(4?Y>F=~^rE>wwRQ`ZbU85QiH7WM2xNcTzkPXHgHMEm$~BzQc0nF$ zZs!*RJ_l@GRzn13%hQ9_66<7d2&217z=Uzkxv_ef{2z7%gS~jHxPFb z3=nK}AJkvfd&Snh=*pqr*awO%Hv-p~(>E 
zB_Wisv{}Dm5~Z`ZEJt!#i}xZGg=T&0`QQPpJy~ro-(H5aSJ3}&R}r4pBd2YVJD!)D zZl3$s2UgOb=5Qp!phj+&sk|D#k(R$`HHEwpT4P|o;N$3i7jVIX@=k5-Snp!hXv=cx zf9qW!f)=j0SbQs0q&ic%P{bQK8(}s}5*L2H?~1RNo!wP*8MK!K76Tg=0_399(vXE{ zqn7Li@UCeI!*Jq#Gx*@eMj{~nd-XX%xb6rRFk{;kh)(eoFGDpq1%rbEK{jL7`dk!p zWz59Lcr!~&f(qB97Z2`Fev2Tcv6Z{0(Otobi~2{71NdhtPHIhovx6JuuQyXXiLV4i zp+t;W2x(wOQKLfw2~bHI6pgI@(BhqFNMgcOZgv6+k7I_7K%t7KZj%9mhyJ-w9d;&{ z%UiQ+7TY*DMxJI!_cc2Z7Q3Cvh8bP%MfdH7NJW~0H_@J3z)bR zsL^ltyqyQZqKJ0?kxu}(C~7~(mIoCvxK*oF*dO2!wwhluGp8mnTer8%{m9Pj9=BzZ zx^TRl03wk*QiF}giiPq-Eof&dQQCs}c8BGPgppmLc4*=bAwgv0w@0{b9;e@5{wSsb z+3>>63P%x>M#8?t4!lf_JKK+V(zbRm7K2vr&kr1`C-QNhvdx1A&I9a8mo8R3_Z=^% zX`j5;ym+skp$2f74T-?eUZ?wjiHSf9d-&T;lllm(52aKFQ^z#qCaf9@R4d3N%alsg z=t)^*Nl>wfh?&-6KhJp|g`bU2NLSNU^q&1#iTXRSZ?`*i(&{{V&}?Sk zwf~`p8Si;0D9R~t_w+x`I8kTqV|L|DhaNxtn_UBT%mRVG)uZ0Ja7ag(6!&H%a@iW$yTl_ytJp@Q{41rM;|oD#}n@TUrhz wHWRuB{)=|OgoG5MF7~}9GfrbT{GX(=P`x`%u;~ZjT~h#ANhOIIF_X~$0n3+d)&Kwi literal 0 HcmV?d00001 From 8d9cc8db7a076036a11d7eb850273f74ee187997 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 6 Jun 2020 23:13:00 +0300 Subject: [PATCH 012/164] Added ide to github pages --- docs/ide/{index.html => app.html} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/ide/{index.html => app.html} (100%) diff --git a/docs/ide/index.html b/docs/ide/app.html similarity index 100% rename from docs/ide/index.html rename to docs/ide/app.html From 42e9f1486caf4782b5398332853c03e61072a886 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 6 Jun 2020 23:34:14 +0300 Subject: [PATCH 013/164] Fix for ide in github pages (no underscores) --- docs/ide/__init__.py | 8 - docs/ide/app.js | 2 +- docs/ide/{ => app}/app.py | 11 +- docs/ide/{html5 => app}/core.py | 0 docs/ide/{ => app}/examples.py | 0 docs/ide/{html5 => app}/ext.py | 0 docs/ide/app/files.json | 9 + docs/ide/{html5/__init__.py => app/html5.py} | 0 docs/ide/{html5 => app}/ignite.py | 0 docs/ide/{html5 => app}/utils.py | 0 docs/ide/files.json | 10 -- 
docs/ide/html5/.gitignore | 4 - docs/ide/html5/CHANGELOG.md | 93 ----------- docs/ide/html5/LICENSE | 165 ------------------- docs/ide/html5/README.md | 69 -------- 15 files changed, 19 insertions(+), 352 deletions(-) delete mode 100644 docs/ide/__init__.py rename docs/ide/{ => app}/app.py (95%) rename docs/ide/{html5 => app}/core.py (100%) rename docs/ide/{ => app}/examples.py (100%) rename docs/ide/{html5 => app}/ext.py (100%) create mode 100644 docs/ide/app/files.json rename docs/ide/{html5/__init__.py => app/html5.py} (100%) rename docs/ide/{html5 => app}/ignite.py (100%) rename docs/ide/{html5 => app}/utils.py (100%) delete mode 100644 docs/ide/files.json delete mode 100644 docs/ide/html5/.gitignore delete mode 100644 docs/ide/html5/CHANGELOG.md delete mode 100644 docs/ide/html5/LICENSE delete mode 100644 docs/ide/html5/README.md diff --git a/docs/ide/__init__.py b/docs/ide/__init__.py deleted file mode 100644 index 1ef4637..0000000 --- a/docs/ide/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from . import html5, app - - -def start(): - html5.Body().appendChild( - app.App() - ) - diff --git a/docs/ide/app.js b/docs/ide/app.js index 47c0662..90e54f1 100644 --- a/docs/ide/app.js +++ b/docs/ide/app.js @@ -101,5 +101,5 @@ class app { } (function () { - window.top.app = new app({"app": "."}, "app.start()"); + window.top.app = new app({"app": "app"}, "import app.app; app.app.start()"); })(); diff --git a/docs/ide/app.py b/docs/ide/app/app.py similarity index 95% rename from docs/ide/app.py rename to docs/ide/app/app.py index 7c126e8..146aee9 100644 --- a/docs/ide/app.py +++ b/docs/ide/app/app.py @@ -25,11 +25,11 @@ class App(html5.Div):
- Grammar: +
Grammar:
- Input: +
Input:
@@ -74,3 +74,10 @@ class App(html5.Div): print(ast) traverse = lambda node: html5.Li([node.data, html5.Ul([traverse(c) for c in node.children])] if isinstance(node, Tree) else node) self.ast.appendChild(traverse(ast), replace=True) + + +def start(): + html5.Body().appendChild( + App() + ) + diff --git a/docs/ide/html5/core.py b/docs/ide/app/core.py similarity index 100% rename from docs/ide/html5/core.py rename to docs/ide/app/core.py diff --git a/docs/ide/examples.py b/docs/ide/app/examples.py similarity index 100% rename from docs/ide/examples.py rename to docs/ide/app/examples.py diff --git a/docs/ide/html5/ext.py b/docs/ide/app/ext.py similarity index 100% rename from docs/ide/html5/ext.py rename to docs/ide/app/ext.py diff --git a/docs/ide/app/files.json b/docs/ide/app/files.json new file mode 100644 index 0000000..b230899 --- /dev/null +++ b/docs/ide/app/files.json @@ -0,0 +1,9 @@ +[ + "app.py", + "examples.py", + "html5.py", + "core.py", + "ext.py", + "ignite.py", + "utils.py" +] \ No newline at end of file diff --git a/docs/ide/html5/__init__.py b/docs/ide/app/html5.py similarity index 100% rename from docs/ide/html5/__init__.py rename to docs/ide/app/html5.py diff --git a/docs/ide/html5/ignite.py b/docs/ide/app/ignite.py similarity index 100% rename from docs/ide/html5/ignite.py rename to docs/ide/app/ignite.py diff --git a/docs/ide/html5/utils.py b/docs/ide/app/utils.py similarity index 100% rename from docs/ide/html5/utils.py rename to docs/ide/app/utils.py diff --git a/docs/ide/files.json b/docs/ide/files.json deleted file mode 100644 index ebeb185..0000000 --- a/docs/ide/files.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - "__init__.py", - "app.py", - "examples.py", - "html5/__init__.py", - "html5/core.py", - "html5/ext.py", - "html5/ignite.py", - "html5/utils.py" -] \ No newline at end of file diff --git a/docs/ide/html5/.gitignore b/docs/ide/html5/.gitignore deleted file mode 100644 index b65483f..0000000 --- a/docs/ide/html5/.gitignore +++ /dev/null @@ 
-1,4 +0,0 @@ -__target__ -__pycache__ -*.pyc -.idea diff --git a/docs/ide/html5/CHANGELOG.md b/docs/ide/html5/CHANGELOG.md deleted file mode 100644 index 6f24335..0000000 --- a/docs/ide/html5/CHANGELOG.md +++ /dev/null @@ -1,93 +0,0 @@ -# Changelog - -This file documents any relevant changes done to ViUR html5 since version 2. - -## 3.0.0 [develop] - -This is the current development version. - -- Feature: Ported framework to Python 3 using [Pyodide](https://github.com/iodide-project/pyodide), with a full source code and library cleanup -- Feature: `html5.Widget.__init__()` now allows parameters equal to `Widget.appendChild()` to directly stack widgets together. - Additionally, the following parameters are available: - - `appendTo`: Directly append the newly created widget to another widget. - - `style`: Provide class attributes for styling added to the new Widget, using `Widget.addClass()`. -- Feature: `html5.Widget.appendChild()` and `html5.Widget.prependChild()` can handle arbitrary input now, including HTML, lists of widgets or just text, in any order. `html5.Widget.insertChild()` runs slightly different, but shares same features. This change mostly supersedes `html5.Widget.fromHTML()`. -- Feature: New `replace`-parameter for `html5.Widget.appendChild()` and `html5.Widget.prependChild()` which clears the content. -- Feature: `html5.ext.InputDialog` refactored & disables OK-Button when no value is present. -- Feature: `html5.utils.doesEventHitWidgetOrChildren()` and `html5.utils.doesEventHitWidgetOrParent()` now return the Widget or None instead of a boolean, to avoid creating loops and directly work with the recognized Widget. -- Feature: New function `html5.Widget.onBind()` enables widgets to react when bound to other widgets using the HTML parser. -- Feature: Replace HTML-parsing-related `vars`-parameter generally by `**kwargs`, with backward-compatibility. 
-- Speed-improvement: Hold static `_WidgetClassWrapper` per `html5.Widget` instead of creating one each time on the fly. - -## [2.5.0] Vesuv - -Release date: Jul 26, 2019 - -- Bugfix: `Widget.Th()` now supporting full col-/rowspan getting and setting. -- Bugfix: HTML-parser accepts tags in upper-/camel-case order now. -- Bugfix: HTML-parser handles table tags with tbody/thead tags inside more gracefully. -- Feature: Split HTML-parser into separate stages to compile and run; This allows to pre-compile HTML into a list/dict-structure and render it later on without parsing it again. `parseHTML()` is the new function, `fromHTML()` works like before and handles pre-compiled or raw HTML as parameter. -- Feature: `fromHTML()` extended to `vars` parameter to replace key-value pairs in text-nodes and attribute values expressed as `{{key}}`. -- Feature: HTML-parser dynamically reconizes void elements -- Feature: `html5.registerTag()` can be used to define new or override existing HTML elements in the HTML parser by custom implementations based on `html5.Widget()` -- Feature: New function `Widget.isVisible()` as counterpart for `Widget.isHidden()`. - -## [2.4.0] Agung - -Release date: May 17, 2019 - -- Bugfix: Fixed bug with disabling of input widgets. -- Feature: Fully refactored the librarys source base into just two single files, to reduce number of required files to download and make the library easier to access. -- Feature: New function `Widget.isHidden()` to check if a widget is currently shown. -- Feature: Improved handling of key-events. -- Feature: Allow to close popups by pressing `ESC`. -- Feature: Improvements for SVG and TextNode. 
- -## [2.3.0] Kilauea - -Release date: Oct 2, 2018 - -- Refactored `html5.ext.SelectDialog` -- Extended html parser to apply data-attributes -- Switching event handling to newer JavaScript event listener API -- Added `onFocusIn` and `onFocusOut` events - -## [2.2.0] Etna - -Release date: Apr 23, 2018 - -- Implemented `html5.Head()` to access the document's head object within the library. -- Directly append text in construction of Li(). - -## [2.1.0] - -Release date: Nov 2, 2017 - -- Introduced a build-in HTML parser (`Widget.fromHTML()`) that is capable to compile HTML-code into DOM-objects of the html5 library, and an extra-feature to bind them to their root node for further access. This attempt makes it possible to create PyJS apps using the HTML5 library without creating every single element by hand. -- A more distinct way for `Widget.hide()` and `Widget.show()` that cannot be overridden by styling. (setting "hidden" does not work when another display value is set). -- Utility functions `Widget.enable() and `Widget.disable()`. -- Directly append text in construction of Div() and Span(). -- Allow for tuple and list processing in table cell assignments. -- Adding `utils.parseFloat()` and `utils.parseInt()` utility functions. -- Implemented `colspan` attribute for Th() -- New README.md and CHANGELOG.md. - -## 2.0 - -Release date: Dec 22, 2016 - -- v[2.0.1]: Directly append text in construction of Option(). -- v[2.0.1]: Anything added to Widget.appendChild() or Widget.prependChild() which is not a widget is handled as text (TextNode() is automatically created). 
-- New functions `Widget.prependChild()`, `Widget.insertBefore()`, `Widget.children()`, `Widget.removeAllChildren()`, - `Widget.addClass()`, `Widget.removeClass()`, `Widget.toggleClass()` -- Utility functions `utils.doesEventHitWidgetOrParents()`, `utils.doesEventHitWidgetOrChildren()` taken from vi77 -- Insert text blocks easier with `utils.textToHtml()` -- Several bugfixes - -[develop]: https://github.com/viur-framework/html5/compare/v2.5.0...develop -[2.5.0]: https://github.com/viur-framework/html5/compare/v2.4.0...v2.5.0 -[2.4.0]: https://github.com/viur-framework/html5/compare/v2.3.0...v2.4.0 -[2.3.0]: https://github.com/viur-framework/html5/compare/v2.2.0...v2.3.0 -[2.2.0]: https://github.com/viur-framework/html5/compare/v2.1.0...v2.2.0 -[2.1.0]: https://github.com/viur-framework/html5/compare/v2.0.0...v2.1.0 -[2.0.1]: https://github.com/viur-framework/html5/compare/v2.0.0...v2.0.1 diff --git a/docs/ide/html5/LICENSE b/docs/ide/html5/LICENSE deleted file mode 100644 index 65c5ca8..0000000 --- a/docs/ide/html5/LICENSE +++ /dev/null @@ -1,165 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - - This version of the GNU Lesser General Public License incorporates -the terms and conditions of version 3 of the GNU General Public -License, supplemented by the additional permissions listed below. - - 0. Additional Definitions. - - As used herein, "this License" refers to version 3 of the GNU Lesser -General Public License, and the "GNU GPL" refers to version 3 of the GNU -General Public License. - - "The Library" refers to a covered work governed by this License, -other than an Application or a Combined Work as defined below. - - An "Application" is any work that makes use of an interface provided -by the Library, but which is not otherwise based on the Library. 
-Defining a subclass of a class defined by the Library is deemed a mode -of using an interface provided by the Library. - - A "Combined Work" is a work produced by combining or linking an -Application with the Library. The particular version of the Library -with which the Combined Work was made is also called the "Linked -Version". - - The "Minimal Corresponding Source" for a Combined Work means the -Corresponding Source for the Combined Work, excluding any source code -for portions of the Combined Work that, considered in isolation, are -based on the Application, and not on the Linked Version. - - The "Corresponding Application Code" for a Combined Work means the -object code and/or source code for the Application, including any data -and utility programs needed for reproducing the Combined Work from the -Application, but excluding the System Libraries of the Combined Work. - - 1. Exception to Section 3 of the GNU GPL. - - You may convey a covered work under sections 3 and 4 of this License -without being bound by section 3 of the GNU GPL. - - 2. Conveying Modified Versions. - - If you modify a copy of the Library, and, in your modifications, a -facility refers to a function or data to be supplied by an Application -that uses the facility (other than as an argument passed when the -facility is invoked), then you may convey a copy of the modified -version: - - a) under this License, provided that you make a good faith effort to - ensure that, in the event an Application does not supply the - function or data, the facility still operates, and performs - whatever part of its purpose remains meaningful, or - - b) under the GNU GPL, with none of the additional permissions of - this License applicable to that copy. - - 3. Object Code Incorporating Material from Library Header Files. - - The object code form of an Application may incorporate material from -a header file that is part of the Library. 
You may convey such object -code under terms of your choice, provided that, if the incorporated -material is not limited to numerical parameters, data structure -layouts and accessors, or small macros, inline functions and templates -(ten or fewer lines in length), you do both of the following: - - a) Give prominent notice with each copy of the object code that the - Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the object code with a copy of the GNU GPL and this license - document. - - 4. Combined Works. - - You may convey a Combined Work under terms of your choice that, -taken together, effectively do not restrict modification of the -portions of the Library contained in the Combined Work and reverse -engineering for debugging such modifications, if you also do each of -the following: - - a) Give prominent notice with each copy of the Combined Work that - the Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the Combined Work with a copy of the GNU GPL and this license - document. - - c) For a Combined Work that displays copyright notices during - execution, include the copyright notice for the Library among - these notices, as well as a reference directing the user to the - copies of the GNU GPL and this license document. - - d) Do one of the following: - - 0) Convey the Minimal Corresponding Source under the terms of this - License, and the Corresponding Application Code in a form - suitable for, and under terms that permit, the user to - recombine or relink the Application with a modified version of - the Linked Version to produce a modified Combined Work, in the - manner specified by section 6 of the GNU GPL for conveying - Corresponding Source. - - 1) Use a suitable shared library mechanism for linking with the - Library. 
A suitable mechanism is one that (a) uses at run time - a copy of the Library already present on the user's computer - system, and (b) will operate properly with a modified version - of the Library that is interface-compatible with the Linked - Version. - - e) Provide Installation Information, but only if you would otherwise - be required to provide such information under section 6 of the - GNU GPL, and only to the extent that such information is - necessary to install and execute a modified version of the - Combined Work produced by recombining or relinking the - Application with a modified version of the Linked Version. (If - you use option 4d0, the Installation Information must accompany - the Minimal Corresponding Source and Corresponding Application - Code. If you use option 4d1, you must provide the Installation - Information in the manner specified by section 6 of the GNU GPL - for conveying Corresponding Source.) - - 5. Combined Libraries. - - You may place library facilities that are a work based on the -Library side by side in a single library together with other library -facilities that are not Applications and are not covered by this -License, and convey such a combined library under terms of your -choice, if you do both of the following: - - a) Accompany the combined library with a copy of the same work based - on the Library, uncombined with any other library facilities, - conveyed under the terms of this License. - - b) Give prominent notice with the combined library that part of it - is a work based on the Library, and explaining where to find the - accompanying uncombined form of the same work. - - 6. Revised Versions of the GNU Lesser General Public License. - - The Free Software Foundation may publish revised and/or new versions -of the GNU Lesser General Public License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. 
- - Each version is given a distinguishing version number. If the -Library as you received it specifies that a certain numbered version -of the GNU Lesser General Public License "or any later version" -applies to it, you have the option of following the terms and -conditions either of that published version or of any later version -published by the Free Software Foundation. If the Library as you -received it does not specify a version number of the GNU Lesser -General Public License, you may choose any version of the GNU Lesser -General Public License ever published by the Free Software Foundation. - - If the Library as you received it specifies that a proxy can decide -whether future versions of the GNU Lesser General Public License shall -apply, that proxy's public statement of acceptance of any version is -permanent authorization for you to choose that version for the -Library. diff --git a/docs/ide/html5/README.md b/docs/ide/html5/README.md deleted file mode 100644 index d51b38c..0000000 --- a/docs/ide/html5/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# ViUR html5 - -**html5** is a DOM-abstraction layer and API that is used to create client-side Web-Apps running in the browser and written in Python. - -Look [here](https://www.viur.dev/blog/html5-library) for a short introduction. - -## About - -This API and framework is used to implement HTML5 web-apps using the Python programming language. The framework is an abstraction layer for a DOM running in [Pyodide](https://github.com/iodide-project/pyodide), a Python 3 interpreter compiled to web-assembly. - -It provides - -- class abstraction for all HTML5-DOM-elements, e.g. `html5.Div()` -- a built-in HTML parser and executor to generate DOM objects from HTML-code -- helpers for adding/removing classes, arrange children, handling events etc. 
- -The most prominent software completely established on this library is [ViUR-vi](https://github.com/viur-framework/viur-vi/), the visual administration interface for ViUR-based applications. - -[ViUR](https://www.viur.dev) is a free software development framework for the [Google App Engine](https://appengine.google.com). - -## Quick Start - -**Warning: This section is incomplete, a working example will follow soon!** - -```python -import html5 - -class Game(html5.Div): - def __init__(self): - super().__init__( - """ - - -

Hello Enter Name!

- """) - self.sinkEvent("onChange") - - def onChange(self, event): - if html5.utils.doesEventHitWidgetOrChildren(event, self.myInput): - self.mySpan.appendChild(self.myInput["value"], replace=True) - -Game() -``` - -## Contributing - -We take a great interest in your opinion about ViUR. We appreciate your feedback and are looking forward to hear about your ideas. Share your visions or questions with us and participate in ongoing discussions. - -- [ViUR website](https://www.viur.dev) -- [#ViUR on freenode IRC](https://webchat.freenode.net/?channels=viur) -- [ViUR on GitHub](https://github.com/viur-framework) -- [ViUR on Twitter](https://twitter.com/weloveViUR) - -## Credits - -ViUR is developed and maintained by [Mausbrand Informationssysteme GmbH](https://www.mausbrand.de/en), from Dortmund in Germany. We are a software company consisting of young, enthusiastic software developers, designers and social media experts, working on exciting projects for different kinds of customers. All of our newer projects are implemented with ViUR, from tiny web-pages to huge company intranets with hundreds of users. - -Help of any kind to extend and improve or enhance this project in any kind or way is always appreciated. - -## License - -Copyright (C) 2012-2020 by Mausbrand Informationssysteme GmbH. - -Mausbrand and ViUR are registered trademarks of Mausbrand Informationssysteme GmbH. - -You may use, modify and distribute this software under the terms and conditions of the GNU Lesser General Public License (LGPL). See the file LICENSE provided within this package for more information. 
From 88b4c6458661058a53927eac6b444ab3b4ab3ba4 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 10 Jun 2020 14:42:39 +0300 Subject: [PATCH 014/164] Added links to Lark IDE --- README.md | 1 + docs/index.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 1c7062c..464f409 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h - [Documentation @readthedocs](https://lark-parser.readthedocs.io/) - [Cheatsheet (PDF)](/docs/lark_cheatsheet.pdf) +- [Online IDE (very basic)](https://lark-parser.github.io/lark/ide/app.html) - [Tutorial](/docs/json_tutorial.md) for writing a JSON parser. - Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - [Gitter chat](https://gitter.im/lark-parser/Lobby) diff --git a/docs/index.md b/docs/index.md index 3efac24..20257b5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -34,6 +34,7 @@ $ pip install lark-parser * [Philosophy & Design Choices](philosophy.md) * [Full List of Features](features.md) * [Examples](https://github.com/lark-parser/lark/tree/master/examples) +* [Online IDE](https://lark-parser.github.io/lark/ide/app.html) * Tutorials * [How to write a DSL](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - Implements a toy LOGO-like language with an interpreter * [How to write a JSON parser](json_tutorial.md) - Teaches you how to use Lark From 4fbae1253f0d3b9cfb757f01566090a1502164a4 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 10 Jun 2020 14:56:17 +0300 Subject: [PATCH 015/164] Version bump --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index 851c35a..3b1f50b 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -6,4 +6,4 @@ from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken, from .lexer import Token from .lark import Lark -__version__ = "0.8.5" 
+__version__ = "0.8.6" From 382489e020975f2d12b5f636ab6d76cb248d0cd1 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Wed, 10 Jun 2020 09:53:24 -0400 Subject: [PATCH 016/164] All tests pass now (local testing) --- lark/utils.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/lark/utils.py b/lark/utils.py index 199071c..5ed662b 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -165,14 +165,29 @@ def smart_decorator(f, create_decorator): else: return create_decorator(f.__func__.__call__, True) +try: + import regex +except ImportError: + regex = None + import sys, re Py36 = (sys.version_info[:2] >= (3, 6)) import sre_parse import sre_constants +categ_pattern = re.compile(r'\\p{[A-Za-z_]+}') def get_regexp_width(regexp): + if regex: + # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with + # a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex + # match here below. + regexp_final = re.sub(categ_pattern, 'A', regexp) + else: + if re.search(categ_pattern, regexp): + raise ImportError('`regex` module must be installed in order to use Unicode categories.', regexp) + regexp_final = regexp try: - return [int(x) for x in sre_parse.parse(regexp).getwidth()] + return [int(x) for x in sre_parse.parse(regexp_final).getwidth()] except sre_constants.error: raise ValueError(regexp) @@ -182,7 +197,7 @@ def get_regexp_width(regexp): def dedup_list(l): """Given a list (l) will removing duplicates from the list, preserving the original order of the list. 
Assumes that - the list entrie are hashable.""" + the list entries are hashable.""" dedup = set() return [ x for x in l if not (x in dedup or dedup.add(x))] From a6444066903ed859132eceb1a9e6cea29dbaeab0 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 13 Jun 2020 10:22:56 +0300 Subject: [PATCH 017/164] Fixed setup.py (Issue #600) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b962b7f..686aae7 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ __version__ ,= re.findall('__version__ = "(.*)"', open('lark/__init__.py').read( setup( name = "lark-parser", version = __version__, - packages = ['lark', 'lark.parsers', 'lark.tools', 'lark.grammars', 'lark-stubs'], + packages = ['lark', 'lark.parsers', 'lark.tools', 'lark.grammars', 'lark.__pyinstaller', 'lark-stubs'], requires = [], install_requires = [], From 20a9e3039fea6feaa1afaef9ba237bf16c61dfe9 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 13 Jun 2020 10:23:36 +0300 Subject: [PATCH 018/164] Version bump --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index 3b1f50b..3499d0e 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -6,4 +6,4 @@ from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken, from .lexer import Token from .lark import Lark -__version__ = "0.8.6" +__version__ = "0.8.7" From 3bee21051e8440e506ea13f45b224e2f6d668662 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 13 Jun 2020 22:09:03 +0300 Subject: [PATCH 019/164] Reverted changes regarding EOF --- lark/__init__.py | 2 +- lark/grammar.py | 1 - lark/lexer.py | 2 +- lark/load_grammar.py | 13 +++--------- lark/parsers/grammar_analysis.py | 4 ++-- lark/parsers/lalr_analysis.py | 4 ++-- lark/parsers/lalr_parser.py | 15 +++++-------- setup.py | 2 +- tests/test_parser.py | 36 -------------------------------- 9 files changed, 15 insertions(+), 64 deletions(-) diff --git a/lark/__init__.py 
b/lark/__init__.py index 3499d0e..3b1f50b 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -6,4 +6,4 @@ from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken, from .lexer import Token from .lark import Lark -__version__ = "0.8.7" +__version__ = "0.8.6" diff --git a/lark/grammar.py b/lark/grammar.py index 501983a..bb84351 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -1,7 +1,6 @@ from .utils import Serialize ###{standalone -END = '_END$' class Symbol(Serialize): __slots__ = ('name',) diff --git a/lark/lexer.py b/lark/lexer.py index 9b69418..32bfe78 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -309,7 +309,7 @@ class TraditionalLexer(Lexer): if t.pattern.min_width == 0: raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern)) - assert set(ignore) <= {t.name for t in terminals}, (ignore, terminals) + assert set(ignore) <= {t.name for t in terminals} # Init self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())] diff --git a/lark/load_grammar.py b/lark/load_grammar.py index a7eb3d3..a4bef03 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -11,7 +11,7 @@ from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import LALR_TraditionalLexer from .common import LexerConf, ParserConf -from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, END +from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify, suppress, dedup_list, Str from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken @@ -94,7 +94,6 @@ TERMINALS = { '_DECLARE': r'%declare', '_IMPORT': r'%import', 'NUMBER': r'[+-]?\d+', - '_END': r'\$', } RULES = { @@ -131,8 +130,7 @@ RULES = { 'nonterminal', 'literal', 'range', - 'template_usage', - 'end'], + 'template_usage'], 'terminal': ['TERMINAL'], 'nonterminal': ['RULE'], @@ -141,7 +139,6 @@ 
RULES = { 'maybe': ['_LBRA expansions _RBRA'], 'range': ['STRING _DOTDOT STRING'], - 'end': ['_END'], 'template_usage': ['RULE _LBRACE _template_args _RBRACE'], '_template_args': ['value', @@ -302,9 +299,6 @@ class CanonizeTree(Transformer_InPlace): tokenmods, value = args return tokenmods + [value] - def end(self): - return Token('TERMINAL', END) - class PrepareAnonTerminals(Transformer_InPlace): "Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them" @@ -813,7 +807,6 @@ class GrammarLoader: term_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in term_defs] term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs] - term_defs.append((END, (None, 0))) rule_defs = [options_from_rule(*x) for x in rule_defs] # Execute statements @@ -906,7 +899,7 @@ class GrammarLoader: raise GrammarError("Terminal '%s' defined more than once" % name) terminal_names.add(name) - if set(ignore_names) - terminal_names: + if set(ignore_names) > terminal_names: raise GrammarError("Terminals %s were marked to ignore but were not defined!" 
% (set(ignore_names) - terminal_names)) resolve_term_references(term_defs) diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 20c5ba1..94c32cc 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -2,7 +2,7 @@ from collections import Counter, defaultdict from ..utils import bfs, fzset, classify from ..exceptions import GrammarError -from ..grammar import Rule, Terminal, NonTerminal, END +from ..grammar import Rule, Terminal, NonTerminal class RulePtr(object): @@ -125,7 +125,7 @@ class GrammarAnalyzer(object): def __init__(self, parser_conf, debug=False): self.debug = debug - root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal(END)]) + root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')]) for start in parser_conf.start} rules = parser_conf.rules + list(root_rules.values()) diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 6c8388a..8890c3c 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -13,7 +13,7 @@ from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator from ..exceptions import GrammarError from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet -from ..grammar import Rule, END +from ..grammar import Rule ###{standalone @@ -178,7 +178,7 @@ class LALR_Analyzer(GrammarAnalyzer): assert(len(root.kernel) == 1) for rp in root.kernel: assert(rp.index == 0) - self.directly_reads[(root, rp.next)] = set([ Terminal(END) ]) + self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ]) for state in self.lr0_states: seen = set() diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index efdd359..c820f21 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -5,7 +5,6 @@ from ..exceptions import UnexpectedToken from ..lexer import Token from ..utils import Enumerator, Serialize -from 
..grammar import END from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable @@ -106,16 +105,12 @@ class _Parser: raise - token = Token.new_borrow_pos(END, None, token) if token else Token(END, None, 0, 1, 1) + token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) while True: _action, arg = get_action(token) - if _action is Shift: - state_stack.append(arg) - value_stack.append(token) - else: - assert(_action is Reduce) - reduce(arg) - if state_stack[-1] == end_state: - return value_stack[-1] + assert(_action is Reduce) + reduce(arg) + if state_stack[-1] == end_state: + return value_stack[-1] ###} diff --git a/setup.py b/setup.py index 686aae7..b962b7f 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ __version__ ,= re.findall('__version__ = "(.*)"', open('lark/__init__.py').read( setup( name = "lark-parser", version = __version__, - packages = ['lark', 'lark.parsers', 'lark.tools', 'lark.grammars', 'lark.__pyinstaller', 'lark-stubs'], + packages = ['lark', 'lark.parsers', 'lark.tools', 'lark.grammars', 'lark-stubs'], requires = [], install_requires = [], diff --git a/tests/test_parser.py b/tests/test_parser.py index 458ddb7..fcb6d22 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1737,42 +1737,6 @@ def _make_parser_test(LEXER, PARSER): """ parser = _Lark(grammar) - @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only") - def test_end_symbol(self): - grammar = """ - start: a b? 
- a: "a" $ - b: "b" - """ - parser = _Lark(grammar) - - self.assertEqual(parser.parse('a'), Tree('start', [Tree('a', [])])) - self.assertRaises(UnexpectedInput, parser.parse, 'ab') - - @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only") - def test_end_symbol2(self): - grammar = """ - start: (a|b)+ - a: "a" ("x"|$) - b: "b" - """ - parser = _Lark(grammar) - - self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []),Tree('a', [])])) - self.assertRaises(UnexpectedInput, parser.parse, 'ab') - - @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only") - def test_end_symbol3(self): - grammar = """ - start: (a|b)+ - a: "a" (e|"x") - b: "b" - e: $ - """ - parser = _Lark(grammar) - - self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []),Tree('a', [Tree('e', [])])])) - self.assertRaises(UnexpectedInput, parser.parse, 'ab') @unittest.skipIf(PARSER!='lalr' or LEXER=='custom', "Serialize currently only works for LALR parsers without custom lexers (though it should be easy to extend)") def test_serialize(self): From a2d3e6332ff475d0d7036c2bba91c7092cae6a92 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 13 Jun 2020 22:10:14 +0300 Subject: [PATCH 020/164] Version bump --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index 3b1f50b..11a5186 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -6,4 +6,4 @@ from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken, from .lexer import Token from .lark import Lark -__version__ = "0.8.6" +__version__ = "0.8.8" From 46bb1e1d6322e3dbf20915b415c7530ce3d81a46 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 14 Jun 2020 12:37:51 +0300 Subject: [PATCH 021/164] Cache now also depends on the Lark version --- lark/lark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lark/lark.py b/lark/lark.py index 4497dd1..3855191 100644 --- 
a/lark/lark.py +++ b/lark/lark.py @@ -180,8 +180,9 @@ class Lark(Serialize): if self.options.cache is not True: raise ValueError("cache must be bool or str") unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals') + from . import __version__ options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable) - s = grammar + options_str + s = grammar + options_str + __version__ md5 = hashlib.md5(s.encode()).hexdigest() cache_fn = '.lark_cache_%s.tmp' % md5 From 323ed2a9fe94e6189ac49e09ac3c7991c9313e23 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 16 Jun 2020 14:54:28 +0300 Subject: [PATCH 022/164] Fixed setup.py.. again (Issue #600) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b962b7f..686aae7 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ __version__ ,= re.findall('__version__ = "(.*)"', open('lark/__init__.py').read( setup( name = "lark-parser", version = __version__, - packages = ['lark', 'lark.parsers', 'lark.tools', 'lark.grammars', 'lark-stubs'], + packages = ['lark', 'lark.parsers', 'lark.tools', 'lark.grammars', 'lark.__pyinstaller', 'lark-stubs'], requires = [], install_requires = [], From d2499d8a71d61ac201c9a108f22d16375536dea2 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 16 Jun 2020 15:01:55 +0300 Subject: [PATCH 023/164] Version bump --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index 11a5186..9e50691 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -6,4 +6,4 @@ from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken, from .lexer import Token from .lark import Lark -__version__ = "0.8.8" +__version__ = "0.8.9" From e12e1ccbf9accc7dcd9223ba9d010b3c5f90e023 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 20 Jun 2020 17:46:08 +0300 Subject: [PATCH 024/164] Minor refactor --- lark/parsers/lalr_parser.py | 11 ++++------- 1 file changed, 4 
insertions(+), 7 deletions(-) diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index c820f21..e15b954 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -37,19 +37,16 @@ class LALR_Parser(object): class _Parser: def __init__(self, parse_table, callbacks, debug=False): - self.states = parse_table.states - self.start_states = parse_table.start_states - self.end_states = parse_table.end_states + self.parse_table = parse_table self.callbacks = callbacks self.debug = debug def parse(self, seq, start, set_state=None): token = None stream = iter(seq) - states = self.states - - start_state = self.start_states[start] - end_state = self.end_states[start] + states = self.parse_table.states + start_state = self.parse_table.start_states[start] + end_state = self.parse_table.end_states[start] state_stack = [start_state] value_stack = [] From 4463524b3a164b9a02679ad80ff076fb3013ced3 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 21 Jun 2020 09:18:56 +0300 Subject: [PATCH 025/164] Puppet initial --- lark/exceptions.py | 3 ++- lark/parsers/lalr_parser.py | 47 ++++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index cf03746..017e504 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -81,7 +81,7 @@ class UnexpectedCharacters(LexError, UnexpectedInput): class UnexpectedToken(ParseError, UnexpectedInput): - def __init__(self, token, expected, considered_rules=None, state=None): + def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): self.token = token self.expected = expected # XXX str shouldn't necessary self.line = getattr(token, 'line', '?') @@ -89,6 +89,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): self.considered_rules = considered_rules self.state = state self.pos_in_stream = getattr(token, 'pos_in_stream', None) + self.puppet = puppet message = ("Unexpected token %r at line %s, column 
%s.\n" "Expected one of: \n\t* %s\n" diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index e15b954..991789b 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -59,7 +59,7 @@ class _Parser: return states[state][token.type] except KeyError: expected = [s for s in states[state].keys() if s.isupper()] - raise UnexpectedToken(token, expected, state=state) + raise UnexpectedToken(token, expected, state=state, puppet=_ParserPuppet(self, state_stack, value_stack, start)) def reduce(rule): size = len(rule.expansion) @@ -111,3 +111,48 @@ class _Parser: return value_stack[-1] ###} + + + + +class _ParserPuppet: + def __init__(self, parser, state_stack, value_stack, start): + self.parser = parser + self.state_stack = state_stack + self.value_stack = value_stack + self.start = start + + def feed_token(self, token): + end_state = self.parser.parse_table.end_states[self.start] + state_stack = self.state_stack + value_stack = self.value_stack + + state = state_stack[-1] + action, arg = self.parser.parse_table.states[state][token.type] + assert arg != end_state + + if action is Shift: + state_stack.append(arg) + value_stack.append(token) + else: + rule = arg + size = len(rule.expansion) + if size: + s = value_stack[-size:] + del state_stack[-size:] + del value_stack[-size:] + else: + s = [] + + value = self.parser.callbacks[rule](s) + + _action, new_state = self.parser.parse_table.states[state_stack[-1]][rule.origin.name] + assert _action is Shift + state_stack.append(new_state) + value_stack.append(value) + + if state_stack[-1] == end_state: + return value_stack[-1] + + def choices(self): + return self.parser.parse_table.states[self.state_stack[-1]] \ No newline at end of file From 66a073d0aa6733c160b83cc4fb2d6f13b7ea8dc1 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 26 Jun 2020 00:23:38 +0300 Subject: [PATCH 026/164] Added support for error handling, using a puppet parser. 
TODO: Add docs --- examples/error_puppet.py | 34 +++++++++++++++++++++++++++ lark/lark.py | 23 ++++++++++++++---- lark/parsers/lalr_parser.py | 47 +++++++++++++++++++++++-------------- 3 files changed, 82 insertions(+), 22 deletions(-) create mode 100644 examples/error_puppet.py diff --git a/examples/error_puppet.py b/examples/error_puppet.py new file mode 100644 index 0000000..a5e0857 --- /dev/null +++ b/examples/error_puppet.py @@ -0,0 +1,34 @@ +# +# This example demonstrates error handling using a parsing puppet in LALR +# +# When the parser encounters an UnexpectedToken exception, it creates a +# parsing puppet with the current parse-state, and lets you control how +# to proceed step-by-step. When you've achieved the correct parse-state, +# you can resume the run by returning True. +# + +from lark import UnexpectedToken, Token + +from .json_parser import json_parser + +def ignore_errors(e): + if e.token.type == 'COMMA': + # Skip comma + return True + elif e.token.type == 'SIGNED_NUMBER': + # Try to feed a comma and retry the number + e.puppet.feed_token(Token('COMMA', ',')) + e.puppet.feed_token(e.token) + return True + + # Unhandled error. 
Will stop parse and raise exception + return False + + +def main(): + s = "[0 1, 2,, 3,,, 4, 5 6 ]" + res = json_parser.parse(s, on_error=ignore_errors) + print(res) # prints [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] + +main() + diff --git a/lark/lark.py b/lark/lark.py index 3855191..f5d957e 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -9,7 +9,7 @@ from .load_grammar import load_grammar from .tree import Tree from .common import LexerConf, ParserConf -from .lexer import Lexer, TraditionalLexer, TerminalDef +from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import get_frontend from .grammar import Rule @@ -359,13 +359,28 @@ class Lark(Serialize): "Get information about a terminal" return self._terminals_dict[name] - def parse(self, text, start=None): + def parse(self, text, start=None, on_error=None): """Parse the given text, according to the options provided. - The 'start' parameter is required if Lark was given multiple possible start symbols (using the start option). + Parameters: + start: str - required if Lark was given multiple possible start symbols (using the start option). + on_error: function - if provided, will be called on UnexpectedToken error. Return true to resume parsing. Returns a tree, unless specified otherwise. 
""" - return self.parser.parse(text, start=start) + try: + return self.parser.parse(text, start=start) + except UnexpectedToken as e: + if on_error is None: + raise + + while True: + if not on_error(e): + raise e + try: + return e.puppet.resume_parse() + except UnexpectedToken as e2: + e = e2 + ###} diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 991789b..7d5cf3b 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -41,15 +41,15 @@ class _Parser: self.callbacks = callbacks self.debug = debug - def parse(self, seq, start, set_state=None): + def parse(self, seq, start, set_state=None, value_stack=None, state_stack=None): token = None stream = iter(seq) states = self.parse_table.states start_state = self.parse_table.start_states[start] end_state = self.parse_table.end_states[start] - state_stack = [start_state] - value_stack = [] + state_stack = state_stack or [start_state] + value_stack = value_stack or [] if set_state: set_state(start_state) @@ -59,7 +59,7 @@ class _Parser: return states[state][token.type] except KeyError: expected = [s for s in states[state].keys() if s.isupper()] - raise UnexpectedToken(token, expected, state=state, puppet=_ParserPuppet(self, state_stack, value_stack, start)) + raise UnexpectedToken(token, expected, state=state, puppet=_ParserPuppet(self, state_stack, value_stack, start, stream, set_state)) def reduce(rule): size = len(rule.expansion) @@ -116,25 +116,24 @@ class _Parser: class _ParserPuppet: - def __init__(self, parser, state_stack, value_stack, start): + def __init__(self, parser, state_stack, value_stack, start, stream, set_state): self.parser = parser - self.state_stack = state_stack - self.value_stack = value_stack - self.start = start + self._state_stack = state_stack + self._value_stack = value_stack + self._start = start + self._stream = stream + self._set_state = set_state def feed_token(self, token): - end_state = self.parser.parse_table.end_states[self.start] - 
state_stack = self.state_stack - value_stack = self.value_stack + end_state = self.parser.parse_table.end_states[self._start] + state_stack = self._state_stack + value_stack = self._value_stack state = state_stack[-1] action, arg = self.parser.parse_table.states[state][token.type] assert arg != end_state - if action is Shift: - state_stack.append(arg) - value_stack.append(token) - else: + while action is Reduce: rule = arg size = len(rule.expansion) if size: @@ -151,8 +150,20 @@ class _ParserPuppet: state_stack.append(new_state) value_stack.append(value) - if state_stack[-1] == end_state: - return value_stack[-1] + if state_stack[-1] == end_state: + return value_stack[-1] + + state = state_stack[-1] + action, arg = self.parser.parse_table.states[state][token.type] + assert arg != end_state + + assert action is Shift + state_stack.append(arg) + value_stack.append(token) + def choices(self): - return self.parser.parse_table.states[self.state_stack[-1]] \ No newline at end of file + return self.parser.parse_table.states[self._state_stack[-1]] + + def resume_parse(self): + return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack) \ No newline at end of file From 3b3a8c1c924f4db9b27f7ee740ffc16f5d0b4eb9 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 26 Jun 2020 00:37:37 +0300 Subject: [PATCH 027/164] Added docs for on_error --- docs/classes.md | 15 +++++++++++++-- docs/features.md | 1 + lark/lark.py | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/docs/classes.md b/docs/classes.md index 60d08ef..084cda6 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -25,12 +25,21 @@ Example: Lark(...) ``` -#### parse(self, text) +#### parse(self, text, start=None, on_error=None) -Return a complete parse tree for the text (of type Tree) +Parse the given text, according to the options provided. 
+ +Returns a complete parse tree for the text (of type Tree) If a transformer is supplied to `__init__`, returns whatever is the result of the transformation. +Parameters: + +* start: str - required if Lark was given multiple possible start symbols (using the start option). + +* on_error: function - if provided, will be called on UnexpectedToken error. Return true to resume parsing. LALR only. + +(See `examples/error_puppet.py` for an example of how to use `on_error`.) #### save(self, f) / load(cls, f) @@ -160,6 +169,8 @@ See the [visitors page](visitors.md) ## UnexpectedToken +TODO: Explain puppet mechanism (related to on_error) + ## UnexpectedException - `UnexpectedInput` diff --git a/docs/features.md b/docs/features.md index 5dff9f4..d8a4340 100644 --- a/docs/features.md +++ b/docs/features.md @@ -6,6 +6,7 @@ - EBNF-inspired grammar, with extra features (See: [Grammar Reference](grammar.md)) - Builds a parse-tree (AST) automagically based on the grammar - Stand-alone parser generator - create a small independent parser to embed in your project. + - Flexible error handling by using a "puppet parser" mechanism (LALR only) - Automatic line & column tracking (for both tokens and matched rules) - Automatic terminal collision resolution - Standard library of terminals (strings, numbers, names, etc.) diff --git a/lark/lark.py b/lark/lark.py index f5d957e..f7d12fc 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -364,7 +364,7 @@ class Lark(Serialize): Parameters: start: str - required if Lark was given multiple possible start symbols (using the start option). - on_error: function - if provided, will be called on UnexpectedToken error. Return true to resume parsing. + on_error: function - if provided, will be called on UnexpectedToken error. Return true to resume parsing. LALR only. Returns a tree, unless specified otherwise. 
""" From 41ade56723a123df21205f9082e34785584ef571 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 26 Jun 2020 00:40:12 +0300 Subject: [PATCH 028/164] Typo --- examples/error_puppet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/error_puppet.py b/examples/error_puppet.py index a5e0857..87d69e1 100644 --- a/examples/error_puppet.py +++ b/examples/error_puppet.py @@ -28,7 +28,7 @@ def ignore_errors(e): def main(): s = "[0 1, 2,, 3,,, 4, 5 6 ]" res = json_parser.parse(s, on_error=ignore_errors) - print(res) # prints [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] + print(res) # prints [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0] main() From cc1092bd5339a1d41842f1eddfc138f0932f529e Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 26 Jun 2020 16:42:01 +0300 Subject: [PATCH 029/164] Refactored puppet + small fixes --- lark/parsers/lalr_parser.py | 64 ++++--------------------------- lark/parsers/lalr_puppet.py | 76 +++++++++++++++++++++++++++++++++++++ lark/tree.py | 2 +- tests/test_tools.py | 4 ++ 4 files changed, 88 insertions(+), 58 deletions(-) create mode 100644 lark/parsers/lalr_puppet.py diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 7d5cf3b..f26cbc5 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -7,9 +7,10 @@ from ..lexer import Token from ..utils import Enumerator, Serialize from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable - +from .lalr_puppet import ParserPuppet ###{standalone + class LALR_Parser(object): def __init__(self, parser_conf, debug=False): assert all(r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" @@ -59,7 +60,11 @@ class _Parser: return states[state][token.type] except KeyError: expected = [s for s in states[state].keys() if s.isupper()] - raise UnexpectedToken(token, expected, state=state, puppet=_ParserPuppet(self, state_stack, value_stack, start, stream, set_state)) + try: + puppet = ParserPuppet(self, 
state_stack, value_stack, start, stream, set_state) + except NameError: + puppet = None + raise UnexpectedToken(token, expected, state=state, puppet=puppet) def reduce(rule): size = len(rule.expansion) @@ -112,58 +117,3 @@ class _Parser: ###} - - - -class _ParserPuppet: - def __init__(self, parser, state_stack, value_stack, start, stream, set_state): - self.parser = parser - self._state_stack = state_stack - self._value_stack = value_stack - self._start = start - self._stream = stream - self._set_state = set_state - - def feed_token(self, token): - end_state = self.parser.parse_table.end_states[self._start] - state_stack = self._state_stack - value_stack = self._value_stack - - state = state_stack[-1] - action, arg = self.parser.parse_table.states[state][token.type] - assert arg != end_state - - while action is Reduce: - rule = arg - size = len(rule.expansion) - if size: - s = value_stack[-size:] - del state_stack[-size:] - del value_stack[-size:] - else: - s = [] - - value = self.parser.callbacks[rule](s) - - _action, new_state = self.parser.parse_table.states[state_stack[-1]][rule.origin.name] - assert _action is Shift - state_stack.append(new_state) - value_stack.append(value) - - if state_stack[-1] == end_state: - return value_stack[-1] - - state = state_stack[-1] - action, arg = self.parser.parse_table.states[state][token.type] - assert arg != end_state - - assert action is Shift - state_stack.append(arg) - value_stack.append(token) - - - def choices(self): - return self.parser.parse_table.states[self._state_stack[-1]] - - def resume_parse(self): - return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack) \ No newline at end of file diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py new file mode 100644 index 0000000..14e21fc --- /dev/null +++ b/lark/parsers/lalr_puppet.py @@ -0,0 +1,76 @@ +# This module provide a LALR puppet, which is used to debugging and error handling + +from copy import 
deepcopy + +from .lalr_analysis import Shift, Reduce + +class ParserPuppet: + def __init__(self, parser, state_stack, value_stack, start, stream, set_state): + self.parser = parser + self._state_stack = state_stack + self._value_stack = value_stack + self._start = start + self._stream = stream + self._set_state = set_state + + self.result = None + + def feed_token(self, token): + end_state = self.parser.parse_table.end_states[self._start] + state_stack = self._state_stack + value_stack = self._value_stack + + state = state_stack[-1] + action, arg = self.parser.parse_table.states[state][token.type] + assert arg != end_state + + while action is Reduce: + rule = arg + size = len(rule.expansion) + if size: + s = value_stack[-size:] + del state_stack[-size:] + del value_stack[-size:] + else: + s = [] + + value = self.parser.callbacks[rule](s) + + _action, new_state = self.parser.parse_table.states[state_stack[-1]][rule.origin.name] + assert _action is Shift + state_stack.append(new_state) + value_stack.append(value) + + if state_stack[-1] == end_state: + self.result = value_stack[-1] + return self.result + + state = state_stack[-1] + action, arg = self.parser.parse_table.states[state][token.type] + assert arg != end_state + + assert action is Shift + state_stack.append(arg) + value_stack.append(token) + + def copy(self): + return type(self)( + self.parser, + list(self._state_stack), + deepcopy(self._value_stack), + self._start, + self._stream, + self._set_state, + ) + + def pretty(): + print("Puppet choices:") + for k, v in self.choices.items(): + print('\t-', k, '->', v) + print('stack size:', len(self._state_stack)) + + def choices(self): + return self.parser.parse_table.states[self._state_stack[-1]] + + def resume_parse(self): + return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack) diff --git a/lark/tree.py b/lark/tree.py index 84c996a..f9767e4 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -105,7 +105,7 @@ class 
Tree(object): stack.append(n) def __deepcopy__(self, memo): - return type(self)(self.data, deepcopy(self.children, memo)) + return type(self)(self.data, deepcopy(self.children, memo), meta=self._meta) def copy(self): return type(self)(self.data, self.children) diff --git a/tests/test_tools.py b/tests/test_tools.py index 5316396..1e0d78e 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -50,6 +50,10 @@ class TestStandalone(TestCase): x = l.parse('16 candles') self.assertEqual(x.children, ['16', 'candles']) + self.assertRaises(context['UnexpectedToken'], l.parse, 'twelve monkeys') + self.assertRaises(context['UnexpectedToken'], l.parse, 'twelve') + self.assertRaises(context['UnexpectedCharacters'], l.parse, '$ talks') + def test_contextual(self): grammar = """ start: a b From 86a162d6d82522ab9f008b693e5418443f428ef5 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Fri, 26 Jun 2020 10:52:42 -0400 Subject: [PATCH 030/164] Added `regex` module as optional mode. --- lark-stubs/lark.pyi | 2 ++ lark/lark.py | 25 ++++++++++++++++++++++--- lark/lexer.py | 25 ++++++++++++++----------- lark/load_grammar.py | 14 ++++++++------ lark/parser_frontends.py | 38 ++++++++++++++++++++------------------ tests/test_parser.py | 4 ++-- tests/test_regex.py | 4 ++-- 7 files changed, 70 insertions(+), 42 deletions(-) diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index 8e5e3dd..511e0ad 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -23,6 +23,7 @@ class LarkOptions: transformer: Optional[Transformer] postlex: Optional[PostLex] ambiguity: str + regex: bool debug: bool keep_all_tokens: bool propagate_positions: bool @@ -48,6 +49,7 @@ class Lark: transformer: Optional[Transformer] = None, postlex: Optional[PostLex] = None, ambiguity: Literal["explicit", "resolve"] = "resolve", + regex: bool = False, debug: bool = False, keep_all_tokens: bool = False, propagate_positions: bool = False, diff --git a/lark/lark.py b/lark/lark.py index 4497dd1..2c9dd42 100644 --- 
a/lark/lark.py +++ b/lark/lark.py @@ -14,6 +14,12 @@ from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import get_frontend from .grammar import Rule +import re +try: + import regex +except ImportError: + regex = None + ###{standalone class LarkOptions(Serialize): @@ -34,6 +40,7 @@ class LarkOptions(Serialize): When `False`, `[]` behaves like the `?` operator, and returns no value at all. (default=`False`. Recommended to set to `True`) + regex - When True, uses the `regex` module instead of the stdlib `re`. cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now. When `False`, does nothing (default) @@ -92,6 +99,7 @@ class LarkOptions(Serialize): 'start': 'start', 'priority': 'auto', 'ambiguity': 'auto', + 'regex': False, 'propagate_positions': False, 'lexer_callbacks': {}, 'maybe_placeholders': False, @@ -154,6 +162,16 @@ class Lark(Serialize): self.options = LarkOptions(options) + # Set regex or re module + use_regex = self.options.regex + if use_regex: + if regex: + self.re = regex + else: + raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.') + else: + self.re = re + # Some, but not all file-like objects have a 'name' attribute try: self.source = grammar.name @@ -224,7 +242,7 @@ class Lark(Serialize): assert self.options.ambiguity in ('resolve', 'explicit', 'auto', ) # Parse the grammar file and compose the grammars (TODO) - self.grammar = load_grammar(grammar, self.source) + self.grammar = load_grammar(grammar, self.source, self.re) # Compile the EBNF grammar into BNF self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start) @@ -285,7 +303,7 @@ class Lark(Serialize): def _build_parser(self): self._prepare_callbacks() parser_conf = ParserConf(self.rules, self._callbacks, self.options.start) - return self.parser_class(self.lexer_conf, parser_conf, options=self.options) + return self.parser_class(self.lexer_conf, parser_conf, 
self.re, options=self.options) def save(self, f): data, m = self.memo_serialize([TerminalDef, Rule]) @@ -312,10 +330,11 @@ class Lark(Serialize): if postlex is not None: options['postlex'] = postlex self.options = LarkOptions.deserialize(options, memo) + self.re = regex if self.options.regex else re self.rules = [Rule.deserialize(r, memo) for r in data['rules']] self.source = '' self._prepare_callbacks() - self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex) + self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, self.re) return self @classmethod diff --git a/lark/lexer.py b/lark/lexer.py index 36541d1..4d5c498 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -1,9 +1,10 @@ ## Lexer Implementation +import re try: - import regex as re + import regex except ImportError: - import re + regex = None from .utils import Str, classify, get_regexp_width, Py36, Serialize from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken @@ -233,7 +234,7 @@ class CallChain: -def _create_unless(terminals, g_regex_flags): +def _create_unless(terminals, g_regex_flags, re_): tokens_by_type = classify(terminals, lambda t: type(t.pattern)) assert len(tokens_by_type) <= 2, tokens_by_type.keys() embedded_strs = set() @@ -244,7 +245,7 @@ def _create_unless(terminals, g_regex_flags): if strtok.priority > retok.priority: continue s = strtok.pattern.value - m = re.match(retok.pattern.to_regexp(), s, g_regex_flags) + m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags) if m and m.group(0) == s: unless.append(strtok) if strtok.pattern.flags <= retok.pattern.flags: @@ -297,16 +298,17 @@ class Lexer(object): class TraditionalLexer(Lexer): - def __init__(self, terminals, ignore=(), user_callbacks={}, g_regex_flags=0): + def __init__(self, terminals, re_, ignore=(), user_callbacks={}, g_regex_flags=0): assert all(isinstance(t, TerminalDef) for t in terminals), terminals terminals = 
list(terminals) + self.re = re_ # Sanitization for t in terminals: try: - re.compile(t.pattern.to_regexp(), g_regex_flags) - except re.error: + self.re.compile(t.pattern.to_regexp(), g_regex_flags) + except self.re.error: raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) if t.pattern.min_width == 0: @@ -324,7 +326,7 @@ class TraditionalLexer(Lexer): self.build(g_regex_flags) def build(self, g_regex_flags=0): - terminals, self.callback = _create_unless(self.terminals, g_regex_flags) + terminals, self.callback = _create_unless(self.terminals, g_regex_flags, re_=self.re) assert all(self.callback.values()) for type_, f in self.user_callbacks.items(): @@ -350,7 +352,8 @@ class TraditionalLexer(Lexer): class ContextualLexer(Lexer): - def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0): + def __init__(self, terminals, states, re_, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0): + self.re = re_ tokens_by_name = {} for t in terminals: assert t.name not in tokens_by_name, t @@ -365,12 +368,12 @@ class ContextualLexer(Lexer): except KeyError: accepts = set(accepts) | set(ignore) | set(always_accept) state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] - lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags) + lexer = TraditionalLexer(state_tokens, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags) lexer_by_tokens[key] = lexer self.lexers[state] = lexer - self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags) + self.root_lexer = TraditionalLexer(terminals, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags) def lex(self, stream, get_parser_state): parser_state = get_parser_state() diff --git a/lark/load_grammar.py b/lark/load_grammar.py index a4bef03..407d8d1 100644 --- 
a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -616,7 +616,7 @@ class Grammar: _imported_grammars = {} -def import_grammar(grammar_path, base_paths=[]): +def import_grammar(grammar_path, re_, base_paths=[]): if grammar_path not in _imported_grammars: import_paths = base_paths + IMPORT_PATHS for import_path in import_paths: @@ -624,7 +624,7 @@ def import_grammar(grammar_path, base_paths=[]): joined_path = os.path.join(import_path, grammar_path) with open(joined_path, encoding='utf8') as f: text = f.read() - grammar = load_grammar(text, joined_path) + grammar = load_grammar(text, joined_path, re_) _imported_grammars[grammar_path] = grammar break else: @@ -755,7 +755,8 @@ def _find_used_symbols(tree): for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))} class GrammarLoader: - def __init__(self): + def __init__(self, re_): + self.re = re_ terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] rules = [options_from_rule(name, None, x) for name, x in RULES.items()] @@ -764,7 +765,7 @@ class GrammarLoader: lexer_conf = LexerConf(terminals, ['WS', 'COMMENT']) parser_conf = ParserConf(rules, callback, ['start']) - self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf) + self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_) self.canonize_tree = CanonizeTree() @@ -862,7 +863,7 @@ class GrammarLoader: # import grammars for dotted_path, (base_paths, aliases) in imports.items(): grammar_path = os.path.join(*dotted_path) + EXT - g = import_grammar(grammar_path, base_paths=base_paths) + g = import_grammar(grammar_path, self.re, base_paths=base_paths) new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases) term_defs += new_td @@ -942,4 +943,5 @@ class GrammarLoader: -load_grammar = GrammarLoader().load_grammar +def load_grammar(grammar, source, re_): + return GrammarLoader(re_).load_grammar(grammar, source) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 
9f80ed4..c453ab6 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,7 +1,3 @@ -try: - import regex as re -except ImportError: - import re from functools import partial from .utils import get_regexp_width, Serialize @@ -66,14 +62,16 @@ class WithLexer(_ParserFrontend): __serialize_fields__ = 'parser', 'lexer_conf', 'start' __serialize_namespace__ = LexerConf, - def __init__(self, lexer_conf, parser_conf, options=None): + def __init__(self, lexer_conf, parser_conf, re_, options=None): self.lexer_conf = lexer_conf self.start = parser_conf.start self.postlex = lexer_conf.postlex + self.re = re_ @classmethod - def deserialize(cls, data, memo, callbacks, postlex): + def deserialize(cls, data, memo, callbacks, postlex, re_): inst = super(WithLexer, cls).deserialize(data, memo) + inst.re = re_ inst.postlex = postlex inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) inst.init_lexer() @@ -91,13 +89,14 @@ class WithLexer(_ParserFrontend): return self._parse(token_stream, start) def init_traditional_lexer(self): - self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags) + self.lexer = TraditionalLexer(self.lexer_conf.tokens, re_=self.re, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags) class LALR_WithLexer(WithLexer): - def __init__(self, lexer_conf, parser_conf, options=None): + def __init__(self, lexer_conf, parser_conf, re_, options=None): debug = options.debug if options else False + self.re = re_ self.parser = LALR_Parser(parser_conf, debug=debug) - WithLexer.__init__(self, lexer_conf, parser_conf, options) + WithLexer.__init__(self, lexer_conf, parser_conf, re_, options) self.init_lexer() @@ -113,6 +112,7 @@ class LALR_ContextualLexer(LALR_WithLexer): states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} always_accept = 
self.postlex.always_accept if self.postlex else () self.lexer = ContextualLexer(self.lexer_conf.tokens, states, + re_=self.re, ignore=self.lexer_conf.ignore, always_accept=always_accept, user_callbacks=self.lexer_conf.callbacks, @@ -129,11 +129,11 @@ class LALR_ContextualLexer(LALR_WithLexer): ###} class LALR_CustomLexer(LALR_WithLexer): - def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None): - self.lexer = lexer_cls(lexer_conf) + def __init__(self, lexer_cls, lexer_conf, parser_conf, re_, options=None): + self.lexer = lexer_cls(lexer_conf, re_=re_) debug = options.debug if options else False self.parser = LALR_Parser(parser_conf, debug=debug) - WithLexer.__init__(self, lexer_conf, parser_conf, options) + WithLexer.__init__(self, lexer_conf, parser_conf, re_, options) def tokenize_text(text): @@ -146,8 +146,8 @@ def tokenize_text(text): yield Token('CHAR', ch, line=line, column=i - col_start_pos) class Earley(WithLexer): - def __init__(self, lexer_conf, parser_conf, options=None): - WithLexer.__init__(self, lexer_conf, parser_conf, options) + def __init__(self, lexer_conf, parser_conf, re_, options=None): + WithLexer.__init__(self, lexer_conf, parser_conf, re_, options) self.init_traditional_lexer() resolve_ambiguity = options.ambiguity == 'resolve' @@ -159,7 +159,9 @@ class Earley(WithLexer): class XEarley(_ParserFrontend): - def __init__(self, lexer_conf, parser_conf, options=None, **kw): + def __init__(self, lexer_conf, parser_conf, re_, options=None, **kw): + self.re = re_ + self.token_by_name = {t.name:t for t in lexer_conf.tokens} self.start = parser_conf.start @@ -191,7 +193,7 @@ class XEarley(_ParserFrontend): if width == 0: raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t) - self.regexps[t.name] = re.compile(regexp, lexer_conf.g_regex_flags) + self.regexps[t.name] = self.re.compile(regexp, lexer_conf.g_regex_flags) def parse(self, text, start): return self._parse(text, start) @@ -204,8 +206,8 @@ class 
XEarley_CompleteLex(XEarley): class CYK(WithLexer): - def __init__(self, lexer_conf, parser_conf, options=None): - WithLexer.__init__(self, lexer_conf, parser_conf, options) + def __init__(self, lexer_conf, parser_conf, re_, options=None): + WithLexer.__init__(self, lexer_conf, parser_conf, re_, options) self.init_traditional_lexer() self._analysis = GrammarAnalyzer(parser_conf) diff --git a/tests/test_parser.py b/tests/test_parser.py index c6f420e..f8f37df 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -551,8 +551,8 @@ class CustomLexer(Lexer): Purpose of this custom lexer is to test the integration, so it uses the traditionalparser as implementation without custom lexing behaviour. """ - def __init__(self, lexer_conf): - self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags) + def __init__(self, lexer_conf, re_): + self.lexer = TraditionalLexer(lexer_conf.tokens, re_, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags) def lex(self, *args, **kwargs): return self.lexer.lex(*args, **kwargs) diff --git a/tests/test_regex.py b/tests/test_regex.py index db0bb85..6932a6b 100644 --- a/tests/test_regex.py +++ b/tests/test_regex.py @@ -17,7 +17,7 @@ class TestRegex(unittest.TestCase): NAME: ID_START ID_CONTINUE* ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/ - """) + """, regex=True) self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') @@ -26,7 +26,7 @@ class TestRegex(unittest.TestCase): g = Lark(r""" ?start: NAME NAME: /[\w]+/ - """) + """, regex=True) self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') From 857f71e3aaade4e9fed8f87e728dada22e1ef060 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Fri, 26 Jun 2020 11:12:05 -0400 Subject: [PATCH 031/164] Added regex tests to tox. 
--- tests/__main__.py | 2 +- tests/test_regex.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/__main__.py b/tests/__main__.py index cb26eb4..6b8f513 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -7,7 +7,7 @@ from .test_trees import TestTrees from .test_tools import TestStandalone from .test_cache import TestCache from .test_reconstructor import TestReconstructor - +from .test_regex import TestRegex try: from .test_nearley.test_nearley import TestNearley except ImportError: diff --git a/tests/test_regex.py b/tests/test_regex.py index 6932a6b..19f1923 100644 --- a/tests/test_regex.py +++ b/tests/test_regex.py @@ -2,6 +2,7 @@ from __future__ import absolute_import import logging +import sys import unittest logging.basicConfig(level=logging.INFO) @@ -10,8 +11,10 @@ from lark.lark import Lark class TestRegex(unittest.TestCase): + @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') def test_unicode_class(self): "Tests that character classes from the `regex` module work correctly." + print(sys.version_info) g = Lark(r""" ?start: NAME NAME: ID_START ID_CONTINUE* @@ -21,6 +24,7 @@ class TestRegex(unittest.TestCase): self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') + @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') def test_unicode_word(self): "Tests that a persistent bug in the `re` module works when `regex` is enabled." 
g = Lark(r""" From 797195d8ad212e62a6c51fe5d767afdeeefa3ae9 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Fri, 26 Jun 2020 11:21:35 -0400 Subject: [PATCH 032/164] Removed debug print --- tests/test_regex.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_regex.py b/tests/test_regex.py index 19f1923..d20a8bf 100644 --- a/tests/test_regex.py +++ b/tests/test_regex.py @@ -14,7 +14,6 @@ class TestRegex(unittest.TestCase): @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') def test_unicode_class(self): "Tests that character classes from the `regex` module work correctly." - print(sys.version_info) g = Lark(r""" ?start: NAME NAME: ID_START ID_CONTINUE* From 1465ac73537d5f42d4d977d5a8c5c91b9b9d51bc Mon Sep 17 00:00:00 2001 From: julienmalard Date: Fri, 26 Jun 2020 11:21:51 -0400 Subject: [PATCH 033/164] Added `regex` extras dependency --- tox.ini | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tox.ini b/tox.ini index 5427f0f..ee0c5dd 100644 --- a/tox.ini +++ b/tox.ini @@ -17,6 +17,9 @@ deps = -rnearley-requirements.txt -rregex-requirements.txt +# For regex testing +extras = regex + # to always force recreation and avoid unexpected side effects recreate=True From 959d05ad36a24186daa0cde887ea4325eff72d0a Mon Sep 17 00:00:00 2001 From: julienmalard Date: Fri, 26 Jun 2020 11:27:43 -0400 Subject: [PATCH 034/164] Try with extras_require --- setup.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/setup.py b/setup.py index d31e4d2..a3d2a97 100644 --- a/setup.py +++ b/setup.py @@ -14,6 +14,10 @@ setup( requires = [], install_requires = [], + extras_require = { + "regex": ["regex"] + }, + package_data = {'': ['*.md', '*.lark'], 'lark-stubs': ['*.pyi']}, test_suite = 'tests.__main__', From a163b344b3a8868c1eb0819faa12bda6ec7eb7c2 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Fri, 26 Jun 2020 11:32:23 -0400 Subject: [PATCH 035/164] Found it! 
--- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7be3a92..f55b88c 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -23,6 +23,7 @@ jobs: run: | python -m pip install --upgrade pip pip install -r nearley-requirements.txt + pip install -r regex-requirements.txt - name: Run tests run: | python -m tests \ No newline at end of file From 5fe67b9fc4c8302534bef499a6ddc6b7c3344eac Mon Sep 17 00:00:00 2001 From: julienmalard Date: Fri, 26 Jun 2020 11:35:46 -0400 Subject: [PATCH 036/164] Merged test requirements --- .github/workflows/tests.yml | 3 +-- regex-requirements.txt | 1 - nearley-requirements.txt => test-requirements.txt | 1 + 3 files changed, 2 insertions(+), 3 deletions(-) delete mode 100644 regex-requirements.txt rename nearley-requirements.txt => test-requirements.txt (70%) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f55b88c..6d1e406 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -22,8 +22,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -r nearley-requirements.txt - pip install -r regex-requirements.txt + pip install -r test-requirements.txt - name: Run tests run: | python -m tests \ No newline at end of file diff --git a/regex-requirements.txt b/regex-requirements.txt deleted file mode 100644 index 822e14a..0000000 --- a/regex-requirements.txt +++ /dev/null @@ -1 +0,0 @@ -regex \ No newline at end of file diff --git a/nearley-requirements.txt b/test-requirements.txt similarity index 70% rename from nearley-requirements.txt rename to test-requirements.txt index 750c740..d304ee8 100644 --- a/nearley-requirements.txt +++ b/test-requirements.txt @@ -1 +1,2 @@ Js2Py==0.68 +regex \ No newline at end of file From e22536fc9b70e1ec6a875f20754331826c3197fd Mon Sep 17 00:00:00 2001 From: julienmalard Date: Fri, 26 Jun 2020 11:40:18 -0400 
Subject: [PATCH 037/164] Updated stubs --- lark-stubs/lexer.pyi | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lark-stubs/lexer.pyi b/lark-stubs/lexer.pyi index a43b754..1ae861d 100644 --- a/lark-stubs/lexer.pyi +++ b/lark-stubs/lexer.pyi @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- - +from types import ModuleType from typing import ( TypeVar, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, Pattern as REPattern, @@ -111,6 +111,7 @@ class TraditionalLexer(Lexer): def __init__( self, terminals: Collection[TerminalDef], + re_: ModuleType, ignore: Collection[str] = ..., user_callbacks: Dict[str, _Callback] = ..., g_regex_flags: int = ... @@ -135,6 +136,7 @@ class ContextualLexer(Lexer): self, terminals: Collection[TerminalDef], states: Dict[str, Collection[str]], + re_: ModuleType, ignore: Collection[str] = ..., always_accept: Collection[str] = ..., user_callbacks: Dict[str, _Callback] = ..., From c319ace48d1b0edea506a5364fd04816480e84a7 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Fri, 26 Jun 2020 11:47:00 -0400 Subject: [PATCH 038/164] Update README.md --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index 1c7062c..02b89d7 100644 --- a/README.md +++ b/README.md @@ -176,6 +176,27 @@ You can use the output as a regular python module: 0.38981434460254655 ``` +### Using Unicode character classes with `regex` +Python's builtin `re` module has a few persistent known bugs and also won't parse +advanced regex features such as character classes. +With `pip install lark-parser[regex]`, the `regex` module will be installed alongside `lark` +and can act as a drop-in replacement to `re`. + +Any instance of `Lark` instantiated with `regex=True` will now use the `regex` module +instead of `re`. For example, we can now use character classes to match PEP-3131 compliant Python identifiers. 
+```python +from lark import Lark +>>> g = Lark(r""" + ?start: NAME + NAME: ID_START ID_CONTINUE* + ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ + ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/ + """, regex=True) + +>>> g.parse('வணக்கம்') +'வணக்கம்' + +``` ## License From 985c38e0e04b6641acf3ebaa3bd700d9d66cb013 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 28 Jun 2020 12:03:35 +0300 Subject: [PATCH 039/164] Documentation fix (Removed bloat from README) --- README.md | 21 --------------------- docs/classes.md | 26 +++++++++++++++++++++++++- lark/lexer.py | 4 ---- tests/test_nearley/nearley | 2 +- 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 7f62f84..464f409 100644 --- a/README.md +++ b/README.md @@ -177,27 +177,6 @@ You can use the output as a regular python module: 0.38981434460254655 ``` -### Using Unicode character classes with `regex` -Python's builtin `re` module has a few persistent known bugs and also won't parse -advanced regex features such as character classes. -With `pip install lark-parser[regex]`, the `regex` module will be installed alongside `lark` -and can act as a drop-in replacement to `re`. - -Any instance of `Lark` instantiated with `regex=True` will now use the `regex` module -instead of `re`. For example, we can now use character classes to match PEP-3131 compliant Python identifiers. -```python -from lark import Lark ->>> g = Lark(r""" - ?start: NAME - NAME: ID_START ID_CONTINUE* - ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ - ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/ - """, regex=True) - ->>> g.parse('வணக்கம்') -'வணக்கம்' - -``` ## License diff --git a/docs/classes.md b/docs/classes.md index 084cda6..61cefb2 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -70,6 +70,8 @@ Useful for caching and multiprocessing. 
**g_regex_flags** - Flags that are applied to all terminals (both regex and strings) +**regex** - Use the `regex` library instead of the built-in `re` module (See below) + **keep_all_tokens** - Prevent the tree builder from automagically removing "punctuation" tokens (default: False) **cache** - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now. @@ -94,13 +96,35 @@ Useful for caching and multiprocessing. - "resolve": The parser will automatically choose the simplest derivation (it chooses consistently: greedy for tokens, non-greedy for rules) - "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). -#### Domain Specific +#### Misc. - **postlex** - Lexer post-processing (Default: None) Only works with the standard and contextual lexers. - **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto) - **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. - **edit_terminals** - A callback + +#### Using Unicode character classes with `regex` +Python's builtin `re` module has a few persistent known bugs and also won't parse +advanced regex features such as character classes. +With `pip install lark-parser[regex]`, the `regex` module will be installed alongside `lark` +and can act as a drop-in replacement to `re`. + +Any instance of `Lark` instantiated with `regex=True` will now use the `regex` module +instead of `re`. For example, we can now use character classes to match PEP-3131 compliant Python identifiers. 
+```python +from lark import Lark +>>> g = Lark(r""" + ?start: NAME + NAME: ID_START ID_CONTINUE* + ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ + ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/ + """, regex=True) + +>>> g.parse('வணக்கம்') +'வணக்கம்' + +``` ---- ## Tree diff --git a/lark/lexer.py b/lark/lexer.py index 4d5c498..8d0d03f 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -1,10 +1,6 @@ ## Lexer Implementation import re -try: - import regex -except ImportError: - regex = None from .utils import Str, classify, get_regexp_width, Py36, Serialize from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken diff --git a/tests/test_nearley/nearley b/tests/test_nearley/nearley index cf8925f..a46b374 160000 --- a/tests/test_nearley/nearley +++ b/tests/test_nearley/nearley @@ -1 +1 @@ -Subproject commit cf8925f729bde741a3076c5856c0c0862bc7f5de +Subproject commit a46b37471db486db0f6e1ce6a2934fb238346b44 From e6daf51f255e8bb72153cb5157f03dfe38216052 Mon Sep 17 00:00:00 2001 From: Aleh Arol Date: Sun, 28 Jun 2020 18:07:00 +0300 Subject: [PATCH 040/164] Make token type check fallback disabled by default --- lark/exceptions.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 4cbe4bf..6dd31be 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -32,7 +32,7 @@ class UnexpectedInput(LarkError): after = text[pos:end].split('\n', 1)[0] return before + after + '\n' + ' ' * len(before) + '^\n' - def match_examples(self, parse_fn, examples): + def match_examples(self, parse_fn, examples, token_type_match_fallback=False): """ Given a parser instance and a dictionary mapping some label with some malformed syntax examples, it'll return the label for the example that bests matches the current error. 
@@ -52,8 +52,10 @@ class UnexpectedInput(LarkError): if ut.token == self.token: # Try exact match first return label - if (ut.token.type == self.token.type) and not candidate[-1]: # Fallback to token types match - candidate = label, True + if token_type_match_fallback: + # Fallback to token types match + if (ut.token.type == self.token.type) and not candidate[-1]: + candidate = label, True except AttributeError: pass From 09e80a5c9ef45214340708a48bc1f0edad6efd06 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Sun, 28 Jun 2020 13:46:22 -0400 Subject: [PATCH 041/164] Fixed tests --- lark-stubs/lexer.pyi | 1 + lark/lexer.py | 14 +++++++------- tests/__main__.py | 2 +- tests/test_parser.py | 20 ++++++++++++++++++++ tests/test_regex.py | 37 ------------------------------------- 5 files changed, 29 insertions(+), 45 deletions(-) delete mode 100644 tests/test_regex.py diff --git a/lark-stubs/lexer.pyi b/lark-stubs/lexer.pyi index 1ae861d..ae7d68a 100644 --- a/lark-stubs/lexer.pyi +++ b/lark-stubs/lexer.pyi @@ -107,6 +107,7 @@ class TraditionalLexer(Lexer): user_callbacks: Dict[str, _Callback] callback: Dict[str, _Callback] mres: List[Tuple[REPattern, Dict[int, str]]] + re: ModuleType def __init__( self, diff --git a/lark/lexer.py b/lark/lexer.py index 4d5c498..9a0fc65 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -251,13 +251,13 @@ def _create_unless(terminals, g_regex_flags, re_): if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) if unless: - callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, match_whole=True)) + callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True)) terminals = [t for t in terminals if t not in embedded_strs] return terminals, callback -def _build_mres(terminals, max_size, g_regex_flags, match_whole): +def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_): # Python sets an unreasonable group limit (currently 100) in its re module # Worse, the only 
way to know we reached it is by catching an AssertionError! # This function recursively tries less and less groups until it's successful. @@ -265,17 +265,17 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole): mres = [] while terminals: try: - mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags) + mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags) except AssertionError: # Yes, this is what Python provides us.. :/ - return _build_mres(terminals, max_size//2, g_regex_flags, match_whole) + return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_) # terms_from_name = {t.name: t for t in terminals[:max_size]} mres.append((mre, {i:n for n,i in mre.groupindex.items()} )) terminals = terminals[max_size:] return mres -def build_mres(terminals, g_regex_flags, match_whole=False): - return _build_mres(terminals, len(terminals), g_regex_flags, match_whole) +def build_mres(terminals, g_regex_flags, re_, match_whole=False): + return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_) def _regexp_has_newline(r): r"""Expressions that may indicate newlines in a regexp: @@ -336,7 +336,7 @@ class TraditionalLexer(Lexer): else: self.callback[type_] = f - self.mres = build_mres(terminals, g_regex_flags) + self.mres = build_mres(terminals, g_regex_flags, self.re) def match(self, stream, pos): for mre, type_from_index in self.mres: diff --git a/tests/__main__.py b/tests/__main__.py index 6b8f513..cb26eb4 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -7,7 +7,7 @@ from .test_trees import TestTrees from .test_tools import TestStandalone from .test_cache import TestCache from .test_reconstructor import TestReconstructor -from .test_regex import TestRegex + try: from .test_nearley.test_nearley import TestNearley except ImportError: diff --git a/tests/test_parser.py 
b/tests/test_parser.py index f8f37df..ac84c61 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1787,6 +1787,26 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(a.line, 1) self.assertEqual(b.line, 2) + @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') + def test_unicode_class(self): + "Tests that character classes from the `regex` module work correctly." + g = _Lark(r""" + ?start: NAME + NAME: ID_START ID_CONTINUE* + ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ + ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/ + """, regex=True) + + self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') + + @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') + def test_unicode_word(self): + "Tests that a persistent bug in the `re` module works when `regex` is enabled." + g = _Lark(r""" + ?start: NAME + NAME: /[\w]+/ + """, regex=True) + self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() _TestParser.__name__ = _NAME diff --git a/tests/test_regex.py b/tests/test_regex.py deleted file mode 100644 index d20a8bf..0000000 --- a/tests/test_regex.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import - -import logging -import sys -import unittest - -logging.basicConfig(level=logging.INFO) - -from lark.lark import Lark - - -class TestRegex(unittest.TestCase): - @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') - def test_unicode_class(self): - "Tests that character classes from the `regex` module work correctly." 
- g = Lark(r""" - ?start: NAME - NAME: ID_START ID_CONTINUE* - ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ - ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/ - """, regex=True) - - self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') - - @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') - def test_unicode_word(self): - "Tests that a persistent bug in the `re` module works when `regex` is enabled." - g = Lark(r""" - ?start: NAME - NAME: /[\w]+/ - """, regex=True) - self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') - - -if __name__ == '__main__': - unittest.main() From b5e02c58af6dfc4837564d3142b3f722fdca96b8 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Sun, 28 Jun 2020 13:46:58 -0400 Subject: [PATCH 042/164] Fix reorganised tests --- tox.ini | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index ee0c5dd..631e115 100644 --- a/tox.ini +++ b/tox.ini @@ -14,8 +14,7 @@ pypy3 = pypy3 [testenv] whitelist_externals = git deps = - -rnearley-requirements.txt - -rregex-requirements.txt + -rtest-requirements.txt # For regex testing extras = regex From 2ed40237a535f5bedb57de4c00c3e1482561b0dd Mon Sep 17 00:00:00 2001 From: julienmalard Date: Sun, 28 Jun 2020 13:48:02 -0400 Subject: [PATCH 043/164] Removed unnecessary imports --- lark/lexer.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 9a0fc65..b37db74 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -1,11 +1,5 @@ ## Lexer Implementation -import re -try: - import regex -except ImportError: - regex = None - from .utils import Str, classify, get_regexp_width, Py36, Serialize from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken From 3f524c88c9beec43f49b86d858bbadf8376f9e99 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Sun, 28 Jun 2020 13:48:42 -0400 Subject: [PATCH 044/164] Apparently still need re --- lark/lexer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/lark/lexer.py b/lark/lexer.py index b37db74..bff5de9 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -1,5 +1,7 @@ ## Lexer Implementation +import re + from .utils import Str, classify, get_regexp_width, Py36, Serialize from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken From b90ac434d284f143d94a6e35cef0c00de3e943bf Mon Sep 17 00:00:00 2001 From: julienmalard Date: Sun, 28 Jun 2020 18:00:50 -0400 Subject: [PATCH 045/164] =?UTF-8?q?=C2=B7=20character=20was=20causing=20pr?= =?UTF-8?q?oblems=20in=20pypy3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_parser.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index ac84c61..9a73889 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1790,21 +1790,18 @@ def _make_parser_test(LEXER, PARSER): @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') def test_unicode_class(self): "Tests that character classes from the `regex` module work correctly." - g = _Lark(r""" - ?start: NAME - NAME: ID_START ID_CONTINUE* - ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ - ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/ - """, regex=True) + g = _Lark(r"""?start: NAME + NAME: ID_START ID_CONTINUE* + ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ + ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}]+/""", regex=True) self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') def test_unicode_word(self): "Tests that a persistent bug in the `re` module works when `regex` is enabled." 
- g = _Lark(r""" - ?start: NAME - NAME: /[\w]+/ + g = _Lark(r"""?start: NAME + NAME: /[\w]+/ """, regex=True) self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') From 9e5108b566e85ee66ad512ce3adbbc0e62319634 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Sun, 28 Jun 2020 18:01:01 -0400 Subject: [PATCH 046/164] Not needed anymore? --- tox.ini | 3 --- 1 file changed, 3 deletions(-) diff --git a/tox.ini b/tox.ini index 631e115..6234410 100644 --- a/tox.ini +++ b/tox.ini @@ -16,9 +16,6 @@ whitelist_externals = git deps = -rtest-requirements.txt -# For regex testing -extras = regex - # to always force recreation and avoid unexpected side effects recreate=True From 12d95c37afeb962181d84961aeb92282dd25c77f Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 30 Jun 2020 12:36:47 +0300 Subject: [PATCH 047/164] Small fixes --- lark/utils.py | 12 ++++++------ tests/test_parser.py | 14 ++++++++------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/lark/utils.py b/lark/utils.py index 5ed662b..36f50d1 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -176,20 +176,20 @@ Py36 = (sys.version_info[:2] >= (3, 6)) import sre_parse import sre_constants categ_pattern = re.compile(r'\\p{[A-Za-z_]+}') -def get_regexp_width(regexp): +def get_regexp_width(expr): if regex: # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with # a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex # match here below. 
- regexp_final = re.sub(categ_pattern, 'A', regexp) + regexp_final = re.sub(categ_pattern, 'A', expr) else: - if re.search(categ_pattern, regexp): - raise ImportError('`regex` module must be installed in order to use Unicode categories.', regexp) - regexp_final = regexp + if re.search(categ_pattern, expr): + raise ImportError('`regex` module must be installed in order to use Unicode categories.', expr) + regexp_final = expr try: return [int(x) for x in sre_parse.parse(regexp_final).getwidth()] except sre_constants.error: - raise ValueError(regexp) + raise ValueError(expr) ###} diff --git a/tests/test_parser.py b/tests/test_parser.py index 9a73889..df09307 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,10 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -try: - import regex as re -except ImportError: - import re +import re import unittest import logging import os @@ -23,6 +20,11 @@ from io import ( logging.basicConfig(level=logging.INFO) +try: + import regex +except ImportError: + regex = None + from lark.lark import Lark from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters from lark.tree import Tree @@ -1787,7 +1789,7 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(a.line, 1) self.assertEqual(b.line, 2) - @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') + @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') def test_unicode_class(self): "Tests that character classes from the `regex` module work correctly." 
g = _Lark(r"""?start: NAME @@ -1797,7 +1799,7 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') - @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') + @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') def test_unicode_word(self): "Tests that a persistent bug in the `re` module works when `regex` is enabled." g = _Lark(r"""?start: NAME From 230aad94a7c5df9b4a1bc4ef05dd873bd236f12e Mon Sep 17 00:00:00 2001 From: julienmalard Date: Tue, 30 Jun 2020 08:07:31 -0400 Subject: [PATCH 048/164] Added reconstructor tests for tokens to keep ("!") and for expanded rules. --- tests/test_reconstructor.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/test_reconstructor.py b/tests/test_reconstructor.py index ecab499..7a896a0 100644 --- a/tests/test_reconstructor.py +++ b/tests/test_reconstructor.py @@ -69,6 +69,36 @@ class TestReconstructor(TestCase): self.assert_reconstruct(g, code) + def test_keep_tokens(self): + g = """ + start: (NL | stmt)* + stmt: var op var + !op: ("+" | "-" | "*" | "/") + var: WORD + NL: /(\\r?\\n)+\s*/ + """ + common + + code = """ + a+b + """ + + self.assert_reconstruct(g, code) + + @unittest.skip('Not working yet') + def test_expand_rule(self): + g = """ + ?start: (NL | mult_stmt)* + ?mult_stmt: sum_stmt ["*" sum_stmt] + ?sum_stmt: var ["+" var] + var: WORD + NL: /(\\r?\\n)+\s*/ + """ + common + + code = ['a', 'a*b', 'a+b', 'a*b+c', 'a+b*c', 'a+b*c+d'] + + for c in code: + self.assert_reconstruct(g, c) + def test_json_example(self): test_json = ''' { From 5d01f0ae68cea8c72186bd3d8ca6b30154f099cd Mon Sep 17 00:00:00 2001 From: julienmalard Date: Tue, 30 Jun 2020 08:08:07 -0400 Subject: [PATCH 049/164] test keep tokens in reconstructor works --- lark/load_grammar.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py 
b/lark/load_grammar.py index 407d8d1..4b0deff 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -307,6 +307,7 @@ class PrepareAnonTerminals(Transformer_InPlace): self.term_set = {td.name for td in self.terminals} self.term_reverse = {td.pattern: td for td in terminals} self.i = 0 + self.rule_options = None @inline_args @@ -351,7 +352,10 @@ class PrepareAnonTerminals(Transformer_InPlace): self.term_reverse[p] = termdef self.terminals.append(termdef) - return Terminal(term_name, filter_out=isinstance(p, PatternStr)) + filter_out = False if self.rule_options and self.rule_options.keep_all_tokens else isinstance(p, PatternStr) + + return Terminal(term_name, filter_out=filter_out) + class _ReplaceSymbols(Transformer_InPlace): " Helper for ApplyTemplates " @@ -527,7 +531,8 @@ class Grammar: # ================= # 1. Pre-process terminals - transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(terminals) # Adds to terminals + anon_tokens_transf = PrepareAnonTerminals(terminals) + transformer = PrepareLiterals() * PrepareSymbols() * anon_tokens_transf # Adds to terminals # 2. 
Inline Templates @@ -542,8 +547,10 @@ class Grammar: i += 1 if len(params) != 0: # Dont transform templates continue - ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options.keep_all_tokens else None + rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None + ebnf_to_bnf.rule_options = rule_options ebnf_to_bnf.prefix = name + anon_tokens_transf.rule_options = rule_options tree = transformer.transform(rule_tree) res = ebnf_to_bnf.transform(tree) rules.append((name, res, options)) From 279c3190968ee745fc8af29b932eb5e15d589167 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Tue, 30 Jun 2020 12:25:28 -0400 Subject: [PATCH 050/164] Activate expand rules reconstructor test (fails) --- tests/test_reconstructor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_reconstructor.py b/tests/test_reconstructor.py index 7a896a0..93c64fe 100644 --- a/tests/test_reconstructor.py +++ b/tests/test_reconstructor.py @@ -84,7 +84,6 @@ class TestReconstructor(TestCase): self.assert_reconstruct(g, code) - @unittest.skip('Not working yet') def test_expand_rule(self): g = """ ?start: (NL | mult_stmt)* From bca6cfa45897d019c84396d6932c7fde643509fc Mon Sep 17 00:00:00 2001 From: julienmalard Date: Tue, 30 Jun 2020 12:25:43 -0400 Subject: [PATCH 051/164] Expand rules reconstructor test passes --- lark/reconstruct.py | 46 +++++++++++++++++++++++++++++++------- tests/test_nearley/nearley | 2 +- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/lark/reconstruct.py b/lark/reconstruct.py index 1e3adc7..d6eccf5 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -86,6 +86,14 @@ def best_from_group(seq, group_key, cmp_key): d[key] = item return list(d.values()) + +def make_recons_rule(origin, expansion, old_expansion): + return Rule(origin, expansion, alias=MakeMatchTree(origin.name, old_expansion)) + +def make_recons_rule_to_term(origin, term): + return make_recons_rule(origin, [Terminal(term.name)], 
[term]) + + class Reconstructor: def __init__(self, parser, term_subs={}): # XXX TODO calling compile twice returns different results! @@ -93,6 +101,8 @@ class Reconstructor: tokens, rules, _grammar_extra = parser.grammar.compile(parser.options.start) self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}, term_subs) + self.rules_for_root = defaultdict(list) + self.rules = list(self._build_recons_rules(rules)) self.rules.reverse() @@ -100,9 +110,8 @@ class Reconstructor: self.rules = best_from_group(self.rules, lambda r: r, lambda r: -len(r.expansion)) self.rules.sort(key=lambda r: len(r.expansion)) - callbacks = {rule: rule.alias for rule in self.rules} # TODO pass callbacks through dict, instead of alias? - self.parser = earley.Parser(ParserConf(self.rules, callbacks, parser.options.start), - self._match, resolve_ambiguity=True) + self.parser = parser + self._parser_cache = {} def _build_recons_rules(self, rules): expand1s = {r.origin for r in rules if r.options.expand1} @@ -116,22 +125,35 @@ class Reconstructor: nonterminals = {sym for sym in rule_names if sym.name.startswith('_') or sym in expand1s or sym in aliases } + seen = set() for r in rules: recons_exp = [sym if sym in nonterminals else Terminal(sym.name) for sym in r.expansion if not is_discarded_terminal(sym)] # Skip self-recursive constructs - if recons_exp == [r.origin]: + if recons_exp == [r.origin] and r.alias is None: continue sym = NonTerminal(r.alias) if r.alias else r.origin + rule = make_recons_rule(sym, recons_exp, r.expansion) + + if sym in expand1s and len(recons_exp) != 1: + self.rules_for_root[sym.name].append(rule) - yield Rule(sym, recons_exp, alias=MakeMatchTree(sym.name, r.expansion)) + if sym.name not in seen: + yield make_recons_rule_to_term(sym, sym) + seen.add(sym.name) + else: + if sym.name.startswith('_') or sym in expand1s: + yield rule + else: + self.rules_for_root[sym.name].append(rule) + # yield rule # Rule(sym, recons_exp, alias=MakeMatchTree(sym.name, 
r.expansion)) for origin, rule_aliases in aliases.items(): for alias in rule_aliases: - yield Rule(origin, [Terminal(alias)], alias=MakeMatchTree(origin.name, [NonTerminal(alias)])) - yield Rule(origin, [Terminal(origin.name)], alias=MakeMatchTree(origin.name, [origin])) + yield make_recons_rule_to_term(origin, NonTerminal(alias)) + yield make_recons_rule_to_term(origin, origin) def _match(self, term, token): if isinstance(token, Tree): @@ -142,7 +164,15 @@ class Reconstructor: def _reconstruct(self, tree): # TODO: ambiguity? - unreduced_tree = self.parser.parse(tree.children, tree.data) # find a full derivation + try: + parser = self._parser_cache[tree.data] + except KeyError: + rules = self.rules + self.rules_for_root[tree.data] + callbacks = {rule: rule.alias for rule in rules} # TODO pass callbacks through dict, instead of alias? + parser = earley.Parser(ParserConf(rules, callbacks, [tree.data]), self._match, resolve_ambiguity=True) + self._parser_cache[tree.data] = parser + + unreduced_tree = parser.parse(tree.children, tree.data) # find a full derivation assert unreduced_tree.data == tree.data res = self.write_tokens.transform(unreduced_tree) for item in res: diff --git a/tests/test_nearley/nearley b/tests/test_nearley/nearley index a46b374..cf8925f 160000 --- a/tests/test_nearley/nearley +++ b/tests/test_nearley/nearley @@ -1 +1 @@ -Subproject commit a46b37471db486db0f6e1ce6a2934fb238346b44 +Subproject commit cf8925f729bde741a3076c5856c0c0862bc7f5de From 6b9bd84091fc65e9ab806a9dfcf577d76477dea4 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Tue, 30 Jun 2020 15:05:49 -0400 Subject: [PATCH 052/164] Remove commented out line --- lark/reconstruct.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lark/reconstruct.py b/lark/reconstruct.py index d6eccf5..876c6ae 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -148,7 +148,6 @@ class Reconstructor: yield rule else: self.rules_for_root[sym.name].append(rule) - # yield rule # Rule(sym, recons_exp, 
alias=MakeMatchTree(sym.name, r.expansion)) for origin, rule_aliases in aliases.items(): for alias in rule_aliases: From 5c8a25c7333ea685d5816dea186f0cf389d6d7f5 Mon Sep 17 00:00:00 2001 From: pwwang Date: Tue, 30 Jun 2020 18:18:49 -0500 Subject: [PATCH 053/164] Avoid using root logger --- docs/how_to_use.md | 7 ++-- lark/__init__.py | 1 + lark/common.py | 7 ++++ lark/lark.py | 8 ++-- lark/parsers/earley.py | 4 +- lark/parsers/lalr_analysis.py | 6 +-- tests/__main__.py | 7 +++- tests/test_logger.py | 65 ++++++++++++++++++++++++++++++ tests/test_nearley/test_nearley.py | 7 ++-- tests/test_parser.py | 3 +- 10 files changed, 97 insertions(+), 18 deletions(-) create mode 100644 tests/test_logger.py diff --git a/docs/how_to_use.md b/docs/how_to_use.md index 886b440..78f4df2 100644 --- a/docs/how_to_use.md +++ b/docs/how_to_use.md @@ -30,12 +30,13 @@ Use the reference pages for more in-depth explanations. (links in the [main page ## LALR usage -By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings pass `debug=True`. To get the messages printed you have to configure `logging` framework beforehand. For example: +By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings pass `debug=True`. To get the messages printed you have to configure the `LOGGER` beforehand. 
For example: ```python -from lark import Lark import logging -logging.basicConfig(level=logging.DEBUG) +from lark import Lark, LOGGER + +LOGGER.setLevel(logging.DEBUG) collision_grammar = ''' start: as as diff --git a/lark/__init__.py b/lark/__init__.py index 9e50691..e4c54dd 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -1,3 +1,4 @@ +from .common import LOGGER from .tree import Tree from .visitors import Transformer, Visitor, v_args, Discard from .visitors import InlineTransformer, inline_args # XXX Deprecated diff --git a/lark/common.py b/lark/common.py index c44f9ce..aac9d75 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,6 +1,13 @@ +import logging from .utils import Serialize from .lexer import TerminalDef +LOGGER = logging.getLogger("LARK") +LOGGER.addHandler(logging.StreamHandler()) +# Set to highest level, since we have some warnings amongst the code +# By default, we should not output any log messages +LOGGER.setLevel(logging.CRITICAL) + ###{standalone class LexerConf(Serialize): diff --git a/lark/lark.py b/lark/lark.py index 2b783cb..8df2b87 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -1,13 +1,13 @@ from __future__ import absolute_import -import sys, os, pickle, hashlib, logging +import sys, os, pickle, hashlib from io import open from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS from .load_grammar import load_grammar from .tree import Tree -from .common import LexerConf, ParserConf +from .common import LexerConf, ParserConf, LOGGER from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken from .parse_tree_builder import ParseTreeBuilder @@ -205,7 +205,7 @@ class Lark(Serialize): cache_fn = '.lark_cache_%s.tmp' % md5 if FS.exists(cache_fn): - logging.debug('Loading grammar from cache: %s', cache_fn) + LOGGER.debug('Loading grammar from cache: %s', cache_fn) with FS.open(cache_fn, 'rb') as f: self._load(f, self.options.transformer, self.options.postlex) return @@ -284,7 +284,7 @@ class Lark(Serialize): 
self.lexer = self._build_lexer() if cache_fn: - logging.debug('Saving grammar to cache: %s', cache_fn) + LOGGER.debug('Saving grammar to cache: %s', cache_fn) with FS.open(cache_fn, 'wb') as f: self.save(f) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 59e9a06..5fc7531 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -10,11 +10,11 @@ is better documented here: http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ """ -import logging from collections import deque from ..visitors import Transformer_InPlace, v_args from ..exceptions import UnexpectedEOF, UnexpectedToken +from ..common import LOGGER from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal from .earley_common import Item, TransitiveItem @@ -301,7 +301,7 @@ class Parser: try: debug_walker = ForestToPyDotVisitor() except ImportError: - logging.warning("Cannot find dependency 'pydot', will not generate sppf debug image") + LOGGER.warning("Cannot find dependency 'pydot', will not generate sppf debug image") else: debug_walker.visit(solutions[0], "sppf.png") diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 8890c3c..6fefa4c 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -6,11 +6,11 @@ For now, shift/reduce conflicts are automatically resolved as shifts. 
# Author: Erez Shinan (2017) # Email : erezshin@gmail.com -import logging from collections import defaultdict, deque from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator from ..exceptions import GrammarError +from ..common import LOGGER from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet from ..grammar import Rule @@ -256,8 +256,8 @@ class LALR_Analyzer(GrammarAnalyzer): raise GrammarError('Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t\t- ' + str(r) for r in rules ]))) if la in actions: if self.debug: - logging.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name) - logging.warning(' * %s', list(rules)[0]) + LOGGER.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name) + LOGGER.warning(' * %s', list(rules)[0]) else: actions[la] = (Reduce, list(rules)[0]) m[state] = { k.name: v for k, v in actions.items() } diff --git a/tests/__main__.py b/tests/__main__.py index cb26eb4..1807aa8 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function import unittest import logging +from lark import LOGGER from .test_trees import TestTrees from .test_tools import TestStandalone @@ -11,11 +12,13 @@ from .test_reconstructor import TestReconstructor try: from .test_nearley.test_nearley import TestNearley except ImportError: - logging.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)") + LOGGER.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)") # from .test_selectors import TestSelectors # from .test_grammars import TestPythonG, TestConfigG +from .test_logger import TestLogger + from .test_parser import ( TestLalrStandard, TestEarleyStandard, @@ -31,7 +34,7 @@ from .test_parser import ( TestParsers, ) -logging.basicConfig(level=logging.INFO) +LOGGER.setLevel(logging.INFO) if __name__ == '__main__': unittest.main() diff --git 
a/tests/test_logger.py b/tests/test_logger.py new file mode 100644 index 0000000..dd6beb3 --- /dev/null +++ b/tests/test_logger.py @@ -0,0 +1,65 @@ +import logging +from contextlib import contextmanager +from lark import Lark, LOGGER +from unittest import TestCase, main + +try: + from StringIO import StringIO +except ImportError: + from io import StringIO + +@contextmanager +def capture_log(): + stream = StringIO() + orig_handler = LOGGER.handlers[0] + del LOGGER.handlers[:] + LOGGER.addHandler(logging.StreamHandler(stream)) + yield stream + del LOGGER.handlers[:] + LOGGER.addHandler(orig_handler) + +class TestLogger(TestCase): + + def test_debug(self): + LOGGER.setLevel(logging.DEBUG) + collision_grammar = ''' + start: as as + as: a* + a: "a" + ''' + with capture_log() as log: + Lark(collision_grammar, parser='lalr', debug=True) + + log = log.getvalue() + self.assertIn("Shift/Reduce conflict for terminal", log) + self.assertIn("A: (resolving as shift)", log) + self.assertIn("Shift/Reduce conflict for terminal A: (resolving as shift)", log) + + def test_non_debug(self): + LOGGER.setLevel(logging.DEBUG) + collision_grammar = ''' + start: as as + as: a* + a: "a" + ''' + with capture_log() as log: + Lark(collision_grammar, parser='lalr', debug=False) + log = log.getvalue() + # no log messge + self.assertEqual(len(log), 0) + + def test_loglevel_higher(self): + LOGGER.setLevel(logging.ERROR) + collision_grammar = ''' + start: as as + as: a* + a: "a" + ''' + with capture_log() as log: + Lark(collision_grammar, parser='lalr', debug=True) + log = log.getvalue() + # no log messge + self.assertEqual(len(log), 0) + +if __name__ == '__main__': + main() diff --git a/tests/test_nearley/test_nearley.py b/tests/test_nearley/test_nearley.py index 647f489..345af8a 100644 --- a/tests/test_nearley/test_nearley.py +++ b/tests/test_nearley/test_nearley.py @@ -6,16 +6,17 @@ import logging import os import codecs -logging.basicConfig(level=logging.INFO) - +from lark import LOGGER from 
lark.tools.nearley import create_code_for_nearley_grammar, main as nearley_tool_main +LOGGER.setLevel(logging.INFO) + TEST_PATH = os.path.abspath(os.path.dirname(__file__)) NEARLEY_PATH = os.path.join(TEST_PATH, 'nearley') BUILTIN_PATH = os.path.join(NEARLEY_PATH, 'builtin') if not os.path.exists(NEARLEY_PATH): - logging.warn("Nearley not installed. Skipping Nearley tests!") + LOGGER.warn("Nearley not installed. Skipping Nearley tests!") raise ImportError("Skipping Nearley tests!") import js2py # Ensures that js2py exists, to avoid failing tests diff --git a/tests/test_parser.py b/tests/test_parser.py index df09307..5a10b9f 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -18,13 +18,13 @@ from io import ( open, ) -logging.basicConfig(level=logging.INFO) try: import regex except ImportError: regex = None +from lark import LOGGER from lark.lark import Lark from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters from lark.tree import Tree @@ -32,6 +32,7 @@ from lark.visitors import Transformer, Transformer_InPlace, v_args from lark.grammar import Rule from lark.lexer import TerminalDef, Lexer, TraditionalLexer +LOGGER.setLevel(logging.INFO) __path__ = os.path.dirname(__file__) From a6201b41e471897ef044696925911df86b94a886 Mon Sep 17 00:00:00 2001 From: pwwang <1188067+pwwang@users.noreply.github.com> Date: Tue, 30 Jun 2020 17:35:26 -0700 Subject: [PATCH 054/164] Lowercase logger name --- lark/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/common.py b/lark/common.py index aac9d75..3bd7c98 100644 --- a/lark/common.py +++ b/lark/common.py @@ -2,7 +2,7 @@ import logging from .utils import Serialize from .lexer import TerminalDef -LOGGER = logging.getLogger("LARK") +LOGGER = logging.getLogger("lark") LOGGER.addHandler(logging.StreamHandler()) # Set to highest level, since we have some warnings amongst the code # By default, we should not output any log messages From 
601f0506453ef0b25eabf203d42f5861f914fe04 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 1 Jul 2020 16:25:45 +0300 Subject: [PATCH 055/164] Added docs for ParserPuppet --- docs/classes.md | 61 ++++++++++++++++++++++++++++++++----- lark/parsers/lalr_puppet.py | 3 ++ 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/docs/classes.md b/docs/classes.md index 61cefb2..c901a44 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -25,7 +25,11 @@ Example: Lark(...) ``` -#### parse(self, text, start=None, on_error=None) +#### Lark.parser + +```python +def parse(self, text, start=None, on_error=None): ... +``` Parse the given text, according to the options provided. @@ -41,7 +45,11 @@ Parameters: (See `examples/error_puppet.py` for an example of how to use `on_error`.) -#### save(self, f) / load(cls, f) +#### Lark.save / Lark.load +```python +def save(self, f): ... +def load(cls, f): ... +``` Useful for caching and multiprocessing. @@ -191,12 +199,6 @@ See the [visitors page](visitors.md) ## UnexpectedInput -## UnexpectedToken - -TODO: Explain puppet mechanism (related to on_error) - -## UnexpectedException - - `UnexpectedInput` - `UnexpectedToken` - The parser recieved an unexpected token - `UnexpectedCharacters` - The lexer encountered an unexpected string @@ -218,3 +220,46 @@ Accepts the parse function (usually `lark_instance.parse`) and a dictionary of ` The function will iterate the dictionary until it finds a matching error, and return the corresponding value. For an example usage, see: [examples/error_reporting_lalr.py](https://github.com/lark-parser/lark/blob/master/examples/error_reporting_lalr.py) + + +### UnexpectedToken + +When the parser throws UnexpectedToken, it instanciates a puppet with its internal state. + +Users can then interactively set the puppet to the desired puppet state, and resume regular parsing. 
+ +See [ParserPuppet](#ParserPuppet) + +### UnexpectedCharacters + +## ParserPuppet + +ParserPuppet gives you advanced control over error handling when parsing with LALR. + +For a simpler, more streamlined interface, see the `on_error` argument to `Lark.parse()`. + +#### choices(self) + +Returns a dictionary of token types, matched to their action in the parser. + +Only returns token types that are accepted by the current state. + +Updated by `feed_token()` + +#### feed_token(self, token) + +Feed the parser with a token, and advance it to the next state, as if it recieved it from the lexer. + +Note that `token` has to be an instance of `Token`. + +#### copy(self) + +Create a new puppet with a separate state. Calls to `feed_token()` won't affect the old puppet, and vice-versa. + +#### pretty(self) + +Print the output of `choices()` in a way that's easier to read. + +#### resume_parse(self) +Resume parsing from the current puppet state. + diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py index 14e21fc..968783c 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -16,6 +16,9 @@ class ParserPuppet: self.result = None def feed_token(self, token): + """Advance the parser state, as if it just recieved `token` from the lexer + + """ end_state = self.parser.parse_table.end_states[self._start] state_stack = self._state_stack value_stack = self._value_stack From 2c49cc6e4516a17f5172e1bd3de43e8c3c9e5c45 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 1 Jul 2020 16:25:57 +0300 Subject: [PATCH 056/164] Version Bump --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index 9e50691..8ddab96 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -6,4 +6,4 @@ from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken, from .lexer import Token from .lark import Lark -__version__ = "0.8.9" +__version__ = "0.9.0" From 
11ef9a18fe1ee0d02717a906b8f4bbf91f1c9640 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 1 Jul 2020 16:53:08 +0300 Subject: [PATCH 057/164] Improved documentation --- docs/classes.md | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/docs/classes.md b/docs/classes.md index c901a44..8b32801 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -8,11 +8,23 @@ This page details the important classes in Lark. The Lark class is the main interface for the library. It's mostly a thin wrapper for the many different parsers, and for the tree constructor. -#### \_\_init\_\_(self, grammar_string, **options) - +#### Lark.\_\_init\_\_ +```python +def __init__(self, grammar_string, **options): ... +``` Creates an instance of Lark with the given grammar -#### open(cls, grammar_filename, rel_to=None, **options) +Example: + +```python + >>> Lark(r'''start: "foo" ''') + Lark(...) +``` + +#### Lark.open +```python +def open(cls, grammar_filename, rel_to=None, **options): ... +``` Creates an instance of Lark with the grammar given by its filename @@ -25,7 +37,7 @@ Example: Lark(...) ``` -#### Lark.parser +#### Lark.parse ```python def parse(self, text, start=None, on_error=None): ... @@ -45,6 +57,12 @@ Parameters: (See `examples/error_puppet.py` for an example of how to use `on_error`.) +Example: +```python + >>> Lark(r'''start: "hello" " "+ /\w+/ ''').parse('hello kitty') + Tree(start, [Token(__ANON_0, 'kitty')]) +``` + #### Lark.save / Lark.load ```python def save(self, f): ... 
From 7dc00179e63efa6e98d688bfba3265d382db79c4 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 2 Jul 2020 12:44:02 +0300 Subject: [PATCH 058/164] Lark now loads faster - Refactored lexer interface into LexerConf - Lexer now compiles regexps only when used (especially useful for ContextualLexer) - Lexer now doesn't validate on deserialize (noticable speedup) --- examples/python_parser.py | 20 +++++-------- lark/common.py | 6 ++-- lark/lark.py | 14 ++++----- lark/lexer.py | 63 ++++++++++++++++++++++++--------------- lark/load_grammar.py | 10 +++---- lark/parser_frontends.py | 44 +++++++++++---------------- tests/test_parser.py | 6 ++-- 7 files changed, 84 insertions(+), 79 deletions(-) diff --git a/examples/python_parser.py b/examples/python_parser.py index 988fd97..82bfcb9 100644 --- a/examples/python_parser.py +++ b/examples/python_parser.py @@ -26,6 +26,13 @@ python_parser2 = Lark.open('python2.lark', parser='lalr', **kwargs) python_parser3 = Lark.open('python3.lark',parser='lalr', **kwargs) python_parser2_earley = Lark.open('python2.lark', parser='earley', lexer='standard', **kwargs) +try: + xrange +except NameError: + chosen_parser = python_parser3 +else: + chosen_parser = python_parser2 + def _read(fn, *args): kwargs = {'encoding': 'iso-8859-1'} @@ -42,24 +49,13 @@ def _get_lib_path(): return [x for x in sys.path if x.endswith('%s.%s' % sys.version_info[:2])][0] def test_python_lib(): - path = _get_lib_path() start = time.time() files = glob.glob(path+'/*.py') for f in files: print( f ) - try: - # print list(python_parser.lex(_read(os.path.join(path, f)) + '\n')) - try: - xrange - except NameError: - python_parser3.parse(_read(os.path.join(path, f)) + '\n') - else: - python_parser2.parse(_read(os.path.join(path, f)) + '\n') - except: - print ('At %s' % f) - raise + chosen_parser.parse(_read(os.path.join(path, f)) + '\n') end = time.time() print( "test_python_lib (%d files), time: %s secs"%(len(files), end-start) ) diff --git a/lark/common.py b/lark/common.py 
index c44f9ce..5c55b8c 100644 --- a/lark/common.py +++ b/lark/common.py @@ -7,12 +7,14 @@ class LexerConf(Serialize): __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags' __serialize_namespace__ = TerminalDef, - def __init__(self, tokens, ignore=(), postlex=None, callbacks=None, g_regex_flags=0): - self.tokens = tokens + def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False): + self.tokens = tokens # TODO should be terminals self.ignore = ignore self.postlex = postlex self.callbacks = callbacks or {} self.g_regex_flags = g_regex_flags + self.re_module = re_module + self.skip_validation = skip_validation def _deserialize(self): self.callbacks = {} # TODO diff --git a/lark/lark.py b/lark/lark.py index 2b783cb..e17da6b 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -166,11 +166,11 @@ class Lark(Serialize): use_regex = self.options.regex if use_regex: if regex: - self.re = regex + re_module = regex else: raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.') else: - self.re = re + re_module = re # Some, but not all file-like objects have a 'name' attribute try: @@ -243,7 +243,7 @@ class Lark(Serialize): assert self.options.ambiguity in ('resolve', 'explicit', 'auto', ) # Parse the grammar file and compose the grammars (TODO) - self.grammar = load_grammar(grammar, self.source, self.re) + self.grammar = load_grammar(grammar, self.source, re_module) # Compile the EBNF grammar into BNF self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start) @@ -276,7 +276,7 @@ class Lark(Serialize): if hasattr(t, term.name): lexer_callbacks[term.name] = getattr(t, term.name) - self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags) + self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags) if 
self.options.parser: self.parser = self._build_parser() @@ -304,7 +304,7 @@ class Lark(Serialize): def _build_parser(self): self._prepare_callbacks() parser_conf = ParserConf(self.rules, self._callbacks, self.options.start) - return self.parser_class(self.lexer_conf, parser_conf, self.re, options=self.options) + return self.parser_class(self.lexer_conf, parser_conf, options=self.options) def save(self, f): data, m = self.memo_serialize([TerminalDef, Rule]) @@ -331,11 +331,11 @@ class Lark(Serialize): if postlex is not None: options['postlex'] = postlex self.options = LarkOptions.deserialize(options, memo) - self.re = regex if self.options.regex else re + re_module = regex if self.options.regex else re self.rules = [Rule.deserialize(r, memo) for r in data['rules']] self.source = '' self._prepare_callbacks() - self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, self.re) + self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, re_module) return self @classmethod diff --git a/lark/lexer.py b/lark/lexer.py index bff5de9..4979500 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -6,6 +6,7 @@ from .utils import Str, classify, get_regexp_width, Py36, Serialize from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken ###{standalone +from copy import copy class Pattern(Serialize): @@ -88,7 +89,6 @@ class TerminalDef(Serialize): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) - class Token(Str): __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') @@ -294,35 +294,39 @@ class Lexer(object): class TraditionalLexer(Lexer): - def __init__(self, terminals, re_, ignore=(), user_callbacks={}, g_regex_flags=0): + def __init__(self, conf): + terminals = list(conf.tokens) assert all(isinstance(t, TerminalDef) for t in terminals), terminals - terminals = list(terminals) + self.re = conf.re_module - 
self.re = re_ - # Sanitization - for t in terminals: - try: - self.re.compile(t.pattern.to_regexp(), g_regex_flags) - except self.re.error: - raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) + if not conf.skip_validation: + # Sanitization + for t in terminals: + try: + self.re.compile(t.pattern.to_regexp(), conf.g_regex_flags) + except self.re.error: + raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) - if t.pattern.min_width == 0: - raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern)) + if t.pattern.min_width == 0: + raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern)) - assert set(ignore) <= {t.name for t in terminals} + assert set(conf.ignore) <= {t.name for t in terminals} # Init self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())] - self.ignore_types = list(ignore) + self.ignore_types = list(conf.ignore) terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) self.terminals = terminals - self.user_callbacks = user_callbacks - self.build(g_regex_flags) + self.user_callbacks = conf.callbacks + self.g_regex_flags = conf.g_regex_flags + + self._mres = None + # self.build(g_regex_flags) - def build(self, g_regex_flags=0): - terminals, self.callback = _create_unless(self.terminals, g_regex_flags, re_=self.re) + def _build(self): + terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re) assert all(self.callback.values()) for type_, f in self.user_callbacks.items(): @@ -332,7 +336,13 @@ class TraditionalLexer(Lexer): else: self.callback[type_] = f - self.mres = build_mres(terminals, g_regex_flags, self.re) + self._mres = build_mres(terminals, self.g_regex_flags, self.re) + + @property + def mres(self): + if self._mres is None: + self._build() + return self._mres def match(self, stream, pos): for mre, type_from_index in self.mres: @@ -348,13 
+358,15 @@ class TraditionalLexer(Lexer): class ContextualLexer(Lexer): - def __init__(self, terminals, states, re_, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0): - self.re = re_ + def __init__(self, conf, states, always_accept=()): + terminals = list(conf.tokens) tokens_by_name = {} for t in terminals: assert t.name not in tokens_by_name, t tokens_by_name[t.name] = t + trad_conf = type(conf)(terminals, conf.re_module, conf.ignore, callbacks=conf.callbacks, g_regex_flags=conf.g_regex_flags, skip_validation=conf.skip_validation) + lexer_by_tokens = {} self.lexers = {} for state, accepts in states.items(): @@ -362,14 +374,17 @@ class ContextualLexer(Lexer): try: lexer = lexer_by_tokens[key] except KeyError: - accepts = set(accepts) | set(ignore) | set(always_accept) + accepts = set(accepts) | set(conf.ignore) | set(always_accept) state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] - lexer = TraditionalLexer(state_tokens, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags) + lexer_conf = copy(trad_conf) + lexer_conf.tokens = state_tokens + lexer = TraditionalLexer(lexer_conf) lexer_by_tokens[key] = lexer self.lexers[state] = lexer - self.root_lexer = TraditionalLexer(terminals, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags) + assert trad_conf.tokens is terminals + self.root_lexer = TraditionalLexer(trad_conf) def lex(self, stream, get_parser_state): parser_state = get_parser_state() diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 407d8d1..ee0f1c0 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -755,19 +755,19 @@ def _find_used_symbols(tree): for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))} class GrammarLoader: - def __init__(self, re_): - self.re = re_ + def __init__(self, re_module): terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] rules = 
[options_from_rule(name, None, x) for name, x in RULES.items()] rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)] callback = ParseTreeBuilder(rules, ST).create_callback() - lexer_conf = LexerConf(terminals, ['WS', 'COMMENT']) + lexer_conf = LexerConf(terminals, re_module, ['WS', 'COMMENT']) parser_conf = ParserConf(rules, callback, ['start']) - self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_) + self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf) self.canonize_tree = CanonizeTree() + self.re_module = re_module def load_grammar(self, grammar_text, grammar_name=''): "Parse grammar_text, verify, and create Grammar object. Display nice messages on error." @@ -863,7 +863,7 @@ class GrammarLoader: # import grammars for dotted_path, (base_paths, aliases) in imports.items(): grammar_path = os.path.join(*dotted_path) + EXT - g = import_grammar(grammar_path, self.re, base_paths=base_paths) + g = import_grammar(grammar_path, self.re_module, base_paths=base_paths) new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases) term_defs += new_td diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index c453ab6..08f4756 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -62,18 +62,18 @@ class WithLexer(_ParserFrontend): __serialize_fields__ = 'parser', 'lexer_conf', 'start' __serialize_namespace__ = LexerConf, - def __init__(self, lexer_conf, parser_conf, re_, options=None): + def __init__(self, lexer_conf, parser_conf, options=None): self.lexer_conf = lexer_conf self.start = parser_conf.start self.postlex = lexer_conf.postlex - self.re = re_ @classmethod - def deserialize(cls, data, memo, callbacks, postlex, re_): + def deserialize(cls, data, memo, callbacks, postlex, re_module): inst = super(WithLexer, cls).deserialize(data, memo) - inst.re = re_ inst.postlex = postlex inst.parser = 
LALR_Parser.deserialize(inst.parser, memo, callbacks) + inst.lexer_conf.re_module = re_module + inst.lexer_conf.skip_validation=True inst.init_lexer() return inst @@ -89,18 +89,17 @@ class WithLexer(_ParserFrontend): return self._parse(token_stream, start) def init_traditional_lexer(self): - self.lexer = TraditionalLexer(self.lexer_conf.tokens, re_=self.re, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags) + self.lexer = TraditionalLexer(self.lexer_conf) class LALR_WithLexer(WithLexer): - def __init__(self, lexer_conf, parser_conf, re_, options=None): + def __init__(self, lexer_conf, parser_conf, options=None): debug = options.debug if options else False - self.re = re_ self.parser = LALR_Parser(parser_conf, debug=debug) - WithLexer.__init__(self, lexer_conf, parser_conf, re_, options) + WithLexer.__init__(self, lexer_conf, parser_conf, options) self.init_lexer() - def init_lexer(self): + def init_lexer(self, **kw): raise NotImplementedError() class LALR_TraditionalLexer(LALR_WithLexer): @@ -111,12 +110,7 @@ class LALR_ContextualLexer(LALR_WithLexer): def init_lexer(self): states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} always_accept = self.postlex.always_accept if self.postlex else () - self.lexer = ContextualLexer(self.lexer_conf.tokens, states, - re_=self.re, - ignore=self.lexer_conf.ignore, - always_accept=always_accept, - user_callbacks=self.lexer_conf.callbacks, - g_regex_flags=self.lexer_conf.g_regex_flags) + self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept) def parse(self, text, start=None): @@ -129,11 +123,11 @@ class LALR_ContextualLexer(LALR_WithLexer): ###} class LALR_CustomLexer(LALR_WithLexer): - def __init__(self, lexer_cls, lexer_conf, parser_conf, re_, options=None): - self.lexer = lexer_cls(lexer_conf, re_=re_) + def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None): + self.lexer = 
lexer_cls(lexer_conf) debug = options.debug if options else False self.parser = LALR_Parser(parser_conf, debug=debug) - WithLexer.__init__(self, lexer_conf, parser_conf, re_, options) + WithLexer.__init__(self, lexer_conf, parser_conf, options) def tokenize_text(text): @@ -146,8 +140,8 @@ def tokenize_text(text): yield Token('CHAR', ch, line=line, column=i - col_start_pos) class Earley(WithLexer): - def __init__(self, lexer_conf, parser_conf, re_, options=None): - WithLexer.__init__(self, lexer_conf, parser_conf, re_, options) + def __init__(self, lexer_conf, parser_conf, options=None): + WithLexer.__init__(self, lexer_conf, parser_conf, options) self.init_traditional_lexer() resolve_ambiguity = options.ambiguity == 'resolve' @@ -159,9 +153,7 @@ class Earley(WithLexer): class XEarley(_ParserFrontend): - def __init__(self, lexer_conf, parser_conf, re_, options=None, **kw): - self.re = re_ - + def __init__(self, lexer_conf, parser_conf, options=None, **kw): self.token_by_name = {t.name:t for t in lexer_conf.tokens} self.start = parser_conf.start @@ -193,7 +185,7 @@ class XEarley(_ParserFrontend): if width == 0: raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t) - self.regexps[t.name] = self.re.compile(regexp, lexer_conf.g_regex_flags) + self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags) def parse(self, text, start): return self._parse(text, start) @@ -206,8 +198,8 @@ class XEarley_CompleteLex(XEarley): class CYK(WithLexer): - def __init__(self, lexer_conf, parser_conf, re_, options=None): - WithLexer.__init__(self, lexer_conf, parser_conf, re_, options) + def __init__(self, lexer_conf, parser_conf, options=None): + WithLexer.__init__(self, lexer_conf, parser_conf, options) self.init_traditional_lexer() self._analysis = GrammarAnalyzer(parser_conf) diff --git a/tests/test_parser.py b/tests/test_parser.py index df09307..def4eca 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -6,7 +6,7 @@ import 
unittest import logging import os import sys -from copy import deepcopy +from copy import copy, deepcopy try: from cStringIO import StringIO as cStringIO except ImportError: @@ -553,8 +553,8 @@ class CustomLexer(Lexer): Purpose of this custom lexer is to test the integration, so it uses the traditionalparser as implementation without custom lexing behaviour. """ - def __init__(self, lexer_conf, re_): - self.lexer = TraditionalLexer(lexer_conf.tokens, re_, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags) + def __init__(self, lexer_conf): + self.lexer = TraditionalLexer(copy(lexer_conf)) def lex(self, *args, **kwargs): return self.lexer.lex(*args, **kwargs) From d559e495ee22f6bb97aa1abda3be4fa653edb9f3 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 2 Jul 2020 13:47:54 +0300 Subject: [PATCH 059/164] Added templates example --- examples/templates.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 examples/templates.py diff --git a/examples/templates.py b/examples/templates.py new file mode 100644 index 0000000..2acc6eb --- /dev/null +++ b/examples/templates.py @@ -0,0 +1,26 @@ +# +# This example shows how to use Lark's templates to achieve cleaner grammars +# + +from lark import Lark + +grammar = r""" +start: list | dict + +list: "[" _seperated{atom, ","} "]" +dict: "{" _seperated{key_value, ","} "}" +key_value: atom ":" atom + +_seperated{x, sep}: x (sep x)* // Define a sequence of 'x sep x sep x ...' 
+ +atom: NUMBER | ESCAPED_STRING + +%import common (NUMBER, ESCAPED_STRING, WS) +%ignore WS +""" + + +parser = Lark(grammar) + +print(parser.parse('[1, "a", 2]')) +print(parser.parse('{"a": 2, "b": 6}')) \ No newline at end of file From aac5fc7e87a89d1cece4a33f598f823cf7263f52 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 2 Jul 2020 13:56:07 +0300 Subject: [PATCH 060/164] Added templates documentation --- docs/grammar.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/docs/grammar.md b/docs/grammar.md index 94c7d17..b9fe7f6 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -83,6 +83,29 @@ Terminals also support grammar operators, such as `|`, `+`, `*` and `?`. Terminals are a linear construct, and therefore may not contain themselves (recursion isn't allowed). +### Templates + +Templates are expanded when preprocessing the grammar. + +Definition syntax: + +```ebnf + my_template{param1, param2, ...}: +``` + +Use syntax: + +```ebnf +some_rule: my_template{arg1, arg2, ...} +``` + +Example: +```ebnf +_seperated{x, sep}: x (sep x)* // Define a sequence of 'x sep x sep x ...' + +num_list: "[" _seperated{NUMBER, ","} "]" // Will match "[1, 2, 3]" etc. +``` + ### Priority Terminals can be assigned priority only when using a lexer (future versions may support Earley's dynamic lexing). @@ -256,3 +279,4 @@ Note that `%ignore` directives cannot be imported. Imported rules will abide by ### %declare Declare a terminal without defining it. Useful for plugins. 
+ From 2a73afd3554c29f216869bc3e70f971f74b62c13 Mon Sep 17 00:00:00 2001 From: pwwang Date: Thu, 2 Jul 2020 19:28:45 -0500 Subject: [PATCH 061/164] Change LOGGER to logger --- docs/how_to_use.md | 6 +++--- lark/__init__.py | 2 +- lark/common.py | 6 +++--- lark/lark.py | 6 +++--- lark/parsers/earley.py | 4 ++-- lark/parsers/lalr_analysis.py | 6 +++--- tests/__main__.py | 8 ++++---- tests/test_logger.py | 26 +++++++++++++------------- tests/test_nearley/test_nearley.py | 6 +++--- tests/test_parser.py | 4 ++-- 10 files changed, 37 insertions(+), 37 deletions(-) diff --git a/docs/how_to_use.md b/docs/how_to_use.md index 78f4df2..303098f 100644 --- a/docs/how_to_use.md +++ b/docs/how_to_use.md @@ -30,13 +30,13 @@ Use the reference pages for more in-depth explanations. (links in the [main page ## LALR usage -By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings pass `debug=True`. To get the messages printed you have to configure the `LOGGER` beforehand. For example: +By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings pass `debug=True`. To get the messages printed you have to configure the `logger` beforehand. 
For example: ```python import logging -from lark import Lark, LOGGER +from lark import Lark, logger -LOGGER.setLevel(logging.DEBUG) +logger.setLevel(logging.DEBUG) collision_grammar = ''' start: as as diff --git a/lark/__init__.py b/lark/__init__.py index e4c54dd..e3021cf 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -1,4 +1,4 @@ -from .common import LOGGER +from .common import logger from .tree import Tree from .visitors import Transformer, Visitor, v_args, Discard from .visitors import InlineTransformer, inline_args # XXX Deprecated diff --git a/lark/common.py b/lark/common.py index 3bd7c98..745e287 100644 --- a/lark/common.py +++ b/lark/common.py @@ -2,11 +2,11 @@ import logging from .utils import Serialize from .lexer import TerminalDef -LOGGER = logging.getLogger("lark") -LOGGER.addHandler(logging.StreamHandler()) +logger = logging.getLogger("lark") +logger.addHandler(logging.StreamHandler()) # Set to highest level, since we have some warnings amongst the code # By default, we should not output any log messages -LOGGER.setLevel(logging.CRITICAL) +logger.setLevel(logging.CRITICAL) ###{standalone diff --git a/lark/lark.py b/lark/lark.py index 8df2b87..9bb60c8 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -7,7 +7,7 @@ from io import open from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS from .load_grammar import load_grammar from .tree import Tree -from .common import LexerConf, ParserConf, LOGGER +from .common import LexerConf, ParserConf, logger from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken from .parse_tree_builder import ParseTreeBuilder @@ -205,7 +205,7 @@ class Lark(Serialize): cache_fn = '.lark_cache_%s.tmp' % md5 if FS.exists(cache_fn): - LOGGER.debug('Loading grammar from cache: %s', cache_fn) + logger.debug('Loading grammar from cache: %s', cache_fn) with FS.open(cache_fn, 'rb') as f: self._load(f, self.options.transformer, self.options.postlex) return @@ -284,7 +284,7 @@ class Lark(Serialize): 
self.lexer = self._build_lexer() if cache_fn: - LOGGER.debug('Saving grammar to cache: %s', cache_fn) + logger.debug('Saving grammar to cache: %s', cache_fn) with FS.open(cache_fn, 'wb') as f: self.save(f) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 5fc7531..bf099e6 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -14,7 +14,7 @@ from collections import deque from ..visitors import Transformer_InPlace, v_args from ..exceptions import UnexpectedEOF, UnexpectedToken -from ..common import LOGGER +from ..common import logger from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal from .earley_common import Item, TransitiveItem @@ -301,7 +301,7 @@ class Parser: try: debug_walker = ForestToPyDotVisitor() except ImportError: - LOGGER.warning("Cannot find dependency 'pydot', will not generate sppf debug image") + logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image") else: debug_walker.visit(solutions[0], "sppf.png") diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 6fefa4c..861941f 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -10,7 +10,7 @@ from collections import defaultdict, deque from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator from ..exceptions import GrammarError -from ..common import LOGGER +from ..common import logger from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet from ..grammar import Rule @@ -256,8 +256,8 @@ class LALR_Analyzer(GrammarAnalyzer): raise GrammarError('Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t\t- ' + str(r) for r in rules ]))) if la in actions: if self.debug: - LOGGER.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name) - LOGGER.warning(' * %s', list(rules)[0]) + logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name) + logger.warning(' * 
%s', list(rules)[0]) else: actions[la] = (Reduce, list(rules)[0]) m[state] = { k.name: v for k, v in actions.items() } diff --git a/tests/__main__.py b/tests/__main__.py index 1807aa8..9ef9f1b 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -2,7 +2,7 @@ from __future__ import absolute_import, print_function import unittest import logging -from lark import LOGGER +from lark import logger from .test_trees import TestTrees from .test_tools import TestStandalone @@ -12,12 +12,12 @@ from .test_reconstructor import TestReconstructor try: from .test_nearley.test_nearley import TestNearley except ImportError: - LOGGER.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)") + logger.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)") # from .test_selectors import TestSelectors # from .test_grammars import TestPythonG, TestConfigG -from .test_logger import TestLogger +from .test_logger import Testlogger from .test_parser import ( TestLalrStandard, @@ -34,7 +34,7 @@ from .test_parser import ( TestParsers, ) -LOGGER.setLevel(logging.INFO) +logger.setLevel(logging.INFO) if __name__ == '__main__': unittest.main() diff --git a/tests/test_logger.py b/tests/test_logger.py index dd6beb3..93dc8ed 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -1,6 +1,6 @@ import logging from contextlib import contextmanager -from lark import Lark, LOGGER +from lark import Lark, logger from unittest import TestCase, main try: @@ -11,17 +11,17 @@ except ImportError: @contextmanager def capture_log(): stream = StringIO() - orig_handler = LOGGER.handlers[0] - del LOGGER.handlers[:] - LOGGER.addHandler(logging.StreamHandler(stream)) + orig_handler = logger.handlers[0] + del logger.handlers[:] + logger.addHandler(logging.StreamHandler(stream)) yield stream - del LOGGER.handlers[:] - LOGGER.addHandler(orig_handler) + del logger.handlers[:] + logger.addHandler(orig_handler) -class TestLogger(TestCase): +class Testlogger(TestCase): 
def test_debug(self): - LOGGER.setLevel(logging.DEBUG) + logger.setLevel(logging.DEBUG) collision_grammar = ''' start: as as as: a* @@ -31,12 +31,12 @@ class TestLogger(TestCase): Lark(collision_grammar, parser='lalr', debug=True) log = log.getvalue() - self.assertIn("Shift/Reduce conflict for terminal", log) - self.assertIn("A: (resolving as shift)", log) - self.assertIn("Shift/Reduce conflict for terminal A: (resolving as shift)", log) + # since there are conflicts about A + # symbol A should appear in the log message for hint + self.assertIn("A", log) def test_non_debug(self): - LOGGER.setLevel(logging.DEBUG) + logger.setLevel(logging.DEBUG) collision_grammar = ''' start: as as as: a* @@ -49,7 +49,7 @@ class TestLogger(TestCase): self.assertEqual(len(log), 0) def test_loglevel_higher(self): - LOGGER.setLevel(logging.ERROR) + logger.setLevel(logging.ERROR) collision_grammar = ''' start: as as as: a* diff --git a/tests/test_nearley/test_nearley.py b/tests/test_nearley/test_nearley.py index 345af8a..1ad6449 100644 --- a/tests/test_nearley/test_nearley.py +++ b/tests/test_nearley/test_nearley.py @@ -6,17 +6,17 @@ import logging import os import codecs -from lark import LOGGER +from lark import logger from lark.tools.nearley import create_code_for_nearley_grammar, main as nearley_tool_main -LOGGER.setLevel(logging.INFO) +logger.setLevel(logging.INFO) TEST_PATH = os.path.abspath(os.path.dirname(__file__)) NEARLEY_PATH = os.path.join(TEST_PATH, 'nearley') BUILTIN_PATH = os.path.join(NEARLEY_PATH, 'builtin') if not os.path.exists(NEARLEY_PATH): - LOGGER.warn("Nearley not installed. Skipping Nearley tests!") + logger.warn("Nearley not installed. 
Skipping Nearley tests!") raise ImportError("Skipping Nearley tests!") import js2py # Ensures that js2py exists, to avoid failing tests diff --git a/tests/test_parser.py b/tests/test_parser.py index 5a10b9f..88d175f 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -24,7 +24,7 @@ try: except ImportError: regex = None -from lark import LOGGER +from lark import logger from lark.lark import Lark from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters from lark.tree import Tree @@ -32,7 +32,7 @@ from lark.visitors import Transformer, Transformer_InPlace, v_args from lark.grammar import Rule from lark.lexer import TerminalDef, Lexer, TraditionalLexer -LOGGER.setLevel(logging.INFO) +logger.setLevel(logging.INFO) __path__ = os.path.dirname(__file__) From a3368d8a72de3d10cc8a713052dfe0313d18568e Mon Sep 17 00:00:00 2001 From: julienmalard Date: Fri, 3 Jul 2020 10:32:53 -0400 Subject: [PATCH 062/164] Need to remove duplicate rules for root rools as well. --- lark/reconstruct.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lark/reconstruct.py b/lark/reconstruct.py index 876c6ae..51294a3 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -1,3 +1,4 @@ +import unicodedata from collections import defaultdict from .tree import Tree @@ -166,7 +167,12 @@ class Reconstructor: try: parser = self._parser_cache[tree.data] except KeyError: - rules = self.rules + self.rules_for_root[tree.data] + rules = self.rules + best_from_group( + self.rules_for_root[tree.data], lambda r: r, lambda r: -len(r.expansion) + ) + + rules.sort(key=lambda r: len(r.expansion)) + callbacks = {rule: rule.alias for rule in rules} # TODO pass callbacks through dict, instead of alias? 
parser = earley.Parser(ParserConf(rules, callbacks, [tree.data]), self._match, resolve_ambiguity=True) self._parser_cache[tree.data] = parser From cf943c0047d8e5c92507c0c4b9ccf7063cb1b7c4 Mon Sep 17 00:00:00 2001 From: decorator-factory <42166884+decorator-factory@users.noreply.github.com> Date: Sun, 5 Jul 2020 11:36:51 +0300 Subject: [PATCH 063/164] Fix typo in docs https://en.wiktionary.org/wiki/seperated --- docs/grammar.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/grammar.md b/docs/grammar.md index b9fe7f6..d6cc274 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -101,9 +101,9 @@ some_rule: my_template{arg1, arg2, ...} Example: ```ebnf -_seperated{x, sep}: x (sep x)* // Define a sequence of 'x sep x sep x ...' +_separated{x, sep}: x (sep x)* // Define a sequence of 'x sep x sep x ...' -num_list: "[" _seperated{NUMBER, ","} "]" // Will match "[1, 2, 3]" etc. +num_list: "[" _separated{NUMBER, ","} "]" // Will match "[1, 2, 3]" etc. ``` ### Priority From c7fa5e3aa35cefce9c03fbcbc53391bee8f5bc4a Mon Sep 17 00:00:00 2001 From: Nathaniel Hartley Date: Tue, 7 Jul 2020 19:48:40 +0100 Subject: [PATCH 064/164] Allows building TraditionalLexer with current configuration convention --- lark/lark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/lark.py b/lark/lark.py index e17da6b..232dbb7 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -294,7 +294,7 @@ class Lark(Serialize): __serialize_fields__ = 'parser', 'rules', 'options' def _build_lexer(self): - return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags) + return TraditionalLexer(self.lexer_conf) def _prepare_callbacks(self): self.parser_class = get_frontend(self.options.parser, self.options.lexer) From e6fc3c9b00306e3a8661210fcc93bf50479ee229 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Thu, 16 Jul 2020 18:04:26 +0200 Subject: [PATCH 065/164] Added 
possibility for terminals with different flags to be joined in python3.6+ --- lark/load_grammar.py | 11 +++++++---- tests/test_parser.py | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index ee0f1c0..f20feb6 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -5,7 +5,7 @@ import sys from copy import copy, deepcopy from io import open -from .utils import bfs, eval_escaping +from .utils import bfs, eval_escaping, Py36 from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder @@ -441,9 +441,12 @@ class TerminalTreeToPattern(Transformer): assert items if len(items) == 1: return items[0] - if len({i.flags for i in items}) > 1: - raise GrammarError("Lark doesn't support joining terminals with conflicting flags!") - return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags if items else ()) + if not Py36: + if len({i.flags for i in items}) > 1: + raise GrammarError("Lark doesn't support joining terminals with conflicting flags in python <3.6!") + return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags if items else ()) + else: + return PatternRE(''.join(i.to_regexp() for i in items), ()) def expansions(self, exps): if len(exps) == 1: diff --git a/tests/test_parser.py b/tests/test_parser.py index def4eca..762c979 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -7,6 +7,9 @@ import logging import os import sys from copy import copy, deepcopy + +from lark.utils import Py36 + try: from cStringIO import StringIO as cStringIO except ImportError: @@ -1062,6 +1065,19 @@ def _make_parser_test(LEXER, PARSER): g = _Lark(g) self.assertEqual( g.parse('"hello"').children, ['"hello"']) self.assertEqual( g.parse("'hello'").children, ["'hello'"]) + + @unittest.skipIf(not Py36, "Required re syntax only exists in python3.6+") + def test_join_regex_flags(self): + g = r""" + start: A + A: B C + B: /./s + C: 
/./ + """ + g = _Lark(g) + self.assertEqual(g.parse(" ").children,[" "]) + self.assertEqual(g.parse("\n ").children,["\n "]) + self.assertRaises(UnexpectedCharacters, g.parse, "\n\n") def test_lexer_token_limit(self): From 3d3bf69403fb34be115c45a545985bb50d1cc972 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Thu, 16 Jul 2020 18:18:06 +0200 Subject: [PATCH 066/164] Added support of expansions --- lark/load_grammar.py | 9 ++++++--- tests/test_parser.py | 12 ++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index f20feb6..eac1473 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -451,9 +451,12 @@ class TerminalTreeToPattern(Transformer): def expansions(self, exps): if len(exps) == 1: return exps[0] - if len({i.flags for i in exps}) > 1: - raise GrammarError("Lark doesn't support joining terminals with conflicting flags!") - return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), exps[0].flags) + if not Py36: + if len({i.flags for i in exps}) > 1: + raise GrammarError("Lark doesn't support joining terminals with conflicting flags!") + return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), exps[0].flags) + else: + return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), ()) def expr(self, args): inner, op = args[:2] diff --git a/tests/test_parser.py b/tests/test_parser.py index 762c979..1249211 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1078,6 +1078,18 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(g.parse(" ").children,[" "]) self.assertEqual(g.parse("\n ").children,["\n "]) self.assertRaises(UnexpectedCharacters, g.parse, "\n\n") + + g = r""" + start: A + A: B | C + B: "b"i + C: "c" + """ + g = _Lark(g) + self.assertEqual(g.parse("b").children,["b"]) + self.assertEqual(g.parse("B").children,["B"]) + self.assertEqual(g.parse("c").children,["c"]) + self.assertRaises(UnexpectedCharacters, g.parse, "C") def 
test_lexer_token_limit(self): From 9ab02b465c7568d25b382ab18c44ddf9414795be Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Thu, 16 Jul 2020 19:55:09 +0200 Subject: [PATCH 067/164] Added comment with explanation --- lark/load_grammar.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index eac1473..736d67c 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -441,6 +441,9 @@ class TerminalTreeToPattern(Transformer): assert items if len(items) == 1: return items[0] + # In Python 3.6, a new syntax for flags was introduced. We are already using it in `lexer.Pattern._get_flags` + # It allows us to activate flags just in a specific part, like in this case for a specific terminal. + # The `to_regexp` method already does this, so we don't have to continue to pass around the flags. if not Py36: if len({i.flags for i in items}) > 1: raise GrammarError("Lark doesn't support joining terminals with conflicting flags in python <3.6!") @@ -451,6 +454,7 @@ class TerminalTreeToPattern(Transformer): def expansions(self, exps): if len(exps) == 1: return exps[0] + # See `expansion` if not Py36: if len({i.flags for i in exps}) > 1: raise GrammarError("Lark doesn't support joining terminals with conflicting flags!") From c96adbd1e875b9b59cf9a9ae283b4dac38ecd28b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 16 Jul 2020 22:17:30 +0300 Subject: [PATCH 068/164] Small refactor --- lark/load_grammar.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 736d67c..dd3a27c 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -432,6 +432,20 @@ class PrepareLiterals(Transformer_InPlace): return ST('pattern', [PatternRE(regexp)]) +def _make_joined_pattern(regexp, flags_set): + # In Python 3.6, a new syntax for flags was introduced, that allows us to restrict the scope + # of flags to a specific regexp group. 
We are already using it in `lexer.Pattern._get_flags` + # However, for prior Python versions, we still need to use global flags, so we have to make sure + # that there are no flag collisions when we merge several terminals. + flags = () + if not Py36: + if len(flags_set) > 1: + raise GrammarError("Lark doesn't support joining terminals with conflicting flags in python <3.6!") + elif len(flags_set) == 1: + flags ,= flags_set + + return PatternRE(regexp, flags) + class TerminalTreeToPattern(Transformer): def pattern(self, ps): p ,= ps @@ -441,26 +455,16 @@ class TerminalTreeToPattern(Transformer): assert items if len(items) == 1: return items[0] - # In Python 3.6, a new syntax for flags was introduced. We are already using it in `lexer.Pattern._get_flags` - # It allows us to activate flags just in a specific part, like in this case for a specific terminal. - # The `to_regexp` method already does this, so we don't have to continue to pass around the flags. - if not Py36: - if len({i.flags for i in items}) > 1: - raise GrammarError("Lark doesn't support joining terminals with conflicting flags in python <3.6!") - return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags if items else ()) - else: - return PatternRE(''.join(i.to_regexp() for i in items), ()) + + pattern = ''.join(i.to_regexp() for i in items) + return _make_joined_pattern(pattern, {i.flags for i in items}) def expansions(self, exps): if len(exps) == 1: return exps[0] - # See `expansion` - if not Py36: - if len({i.flags for i in exps}) > 1: - raise GrammarError("Lark doesn't support joining terminals with conflicting flags!") - return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), exps[0].flags) - else: - return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), ()) + + pattern = '(?:%s)' % ('|'.join(i.to_regexp() for i in exps)) + return _make_joined_pattern(pattern, {i.flags for i in exps}) def expr(self, args): inner, op = args[:2] From 
a379f7fc22a6775d62385b05d7af9e0c19080d0b Mon Sep 17 00:00:00 2001 From: Inky <47245667+Inky-developer@users.noreply.github.com> Date: Sat, 18 Jul 2020 19:15:39 +0200 Subject: [PATCH 069/164] Added cache option for Lark stub --- lark-stubs/lark.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index 511e0ad..d601fc2 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -55,6 +55,7 @@ class Lark: propagate_positions: bool = False, maybe_placeholders: bool = False, lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None, + cache: Union[bool, str] = False, g_regex_flags: int = ... ): ... From 69adc221c527660a57637dbd6754434b0e9c33ff Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 19 Jul 2020 11:30:24 +0300 Subject: [PATCH 070/164] Fixed documentation bug: Issue #625 --- docs/visitors.md | 23 +++++++++++++++++++++++ lark/visitors.py | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/docs/visitors.md b/docs/visitors.md index 0238e05..dcdc8f8 100644 --- a/docs/visitors.md +++ b/docs/visitors.md @@ -28,6 +28,29 @@ There are two classes that implement the visitor interface: * Visitor_Recursive - Visit every node using recursion. Slightly faster. +### Interpreter + +The interpreter walks the tree starting at the root (top-down). + +For each node, it calls the method corresponding with its `data` attribute. + +Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches. +The user has to explicitly call `visit`, `visit_children`, or use the `@visit_children_decor`. +This allows the user to implement branching and loops. + +**Example:** +```python +class IncreaseSomeOfTheNumbers(Interpreter): + def number(self, tree): + tree.children[0] += 1 + + def skip(self, tree): + # skip this subtree. don't change any number node inside it. 
+ pass + +IncreaseSomeOfTheNumbers().visit(parse_tree) +``` + ### Transformers Transformers visit each node of the tree, and run the appropriate method on it according to the node's data. diff --git a/lark/visitors.py b/lark/visitors.py index c9f0e2d..3f80016 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -267,7 +267,7 @@ class Interpreter(_Decoratable): Calls its methods (provided by user via inheritance) according to tree.data Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches. - The user has to explicitly call visit_children, or use the @visit_children_decor + The user has to explicitly call visit, visit_children, or use the @visit_children_decor """ def visit(self, tree): From bf326104d5369e3ccbfb01515a510c45d911a0ab Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 22 Jul 2020 22:14:13 +0300 Subject: [PATCH 071/164] Updated standalone example --- examples/standalone/json_parser.py | 543 +++++++++++++++++++---------- 1 file changed, 362 insertions(+), 181 deletions(-) diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py index 8a92a14..c9a5147 100644 --- a/examples/standalone/json_parser.py +++ b/examples/standalone/json_parser.py @@ -1,4 +1,4 @@ -# The file was automatically generated by Lark v0.8.1 +# The file was automatically generated by Lark v0.9.0 # # # Lark Stand-alone Generator Tool @@ -58,14 +58,14 @@ class UnexpectedInput(LarkError): after = text[pos:end].split('\n', 1)[0] return before + after + '\n' + ' ' * len(before) + '^\n' - def match_examples(self, parse_fn, examples): + def match_examples(self, parse_fn, examples, token_type_match_fallback=False): """ Given a parser instance and a dictionary mapping some label with some malformed syntax examples, it'll return the label for the example that bests matches the current error. 
""" assert self.state is not None, "Not supported for this exception" - candidate = None + candidate = (None, False) for label, example in examples.items(): assert not isinstance(example, STRING_TYPE) @@ -77,12 +77,18 @@ class UnexpectedInput(LarkError): try: if ut.token == self.token: # Try exact match first return label + + if token_type_match_fallback: + # Fallback to token types match + if (ut.token.type == self.token.type) and not candidate[-1]: + candidate = label, True + except AttributeError: pass - if not candidate: - candidate = label + if not candidate[0]: + candidate = label, False - return candidate + return candidate[0] class UnexpectedCharacters(LexError, UnexpectedInput): @@ -107,7 +113,7 @@ class UnexpectedCharacters(LexError, UnexpectedInput): class UnexpectedToken(ParseError, UnexpectedInput): - def __init__(self, token, expected, considered_rules=None, state=None): + def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): self.token = token self.expected = expected # XXX str shouldn't necessary self.line = getattr(token, 'line', '?') @@ -115,6 +121,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): self.considered_rules = considered_rules self.state = state self.pos_in_stream = getattr(token, 'pos_in_stream', None) + self.puppet = puppet message = ("Unexpected token %r at line %s, column %s.\n" "Expected one of: \n\t* %s\n" @@ -123,6 +130,12 @@ class UnexpectedToken(ParseError, UnexpectedInput): super(UnexpectedToken, self).__init__(message) class VisitError(LarkError): + """VisitError is raised when visitors are interrupted by an exception + + It provides the following attributes for inspection: + - obj: the tree node or token it was processing when the exception was raised + - orig_exc: the exception that cause it to fail + """ def __init__(self, rule, obj, orig_exc): self.obj = obj self.orig_exc = orig_exc @@ -246,16 +259,31 @@ def smart_decorator(f, create_decorator): else: return 
create_decorator(f.__func__.__call__, True) +try: + import regex +except ImportError: + regex = None + import sys, re Py36 = (sys.version_info[:2] >= (3, 6)) import sre_parse import sre_constants -def get_regexp_width(regexp): +categ_pattern = re.compile(r'\\p{[A-Za-z_]+}') +def get_regexp_width(expr): + if regex: + # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with + # a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex + # match here below. + regexp_final = re.sub(categ_pattern, 'A', expr) + else: + if re.search(categ_pattern, expr): + raise ImportError('`regex` module must be installed in order to use Unicode categories.', expr) + regexp_final = expr try: - return [int(x) for x in sre_parse.parse(regexp).getwidth()] + return [int(x) for x in sre_parse.parse(regexp_final).getwidth()] except sre_constants.error: - raise ValueError(regexp) + raise ValueError(expr) class Meta: @@ -309,25 +337,15 @@ class Tree(object): return hash((self.data, tuple(self.children))) def iter_subtrees(self): - # TODO: Re-write as a more efficient version - - visited = set() - q = [self] + queue = [self] + subtrees = OrderedDict() + for subtree in queue: + subtrees[id(subtree)] = subtree + queue += [c for c in reversed(subtree.children) + if isinstance(c, Tree) and id(c) not in subtrees] - l = [] - while q: - subtree = q.pop() - l.append( subtree ) - if id(subtree) in visited: - continue # already been here from another branch - visited.add(id(subtree)) - q += [c for c in subtree.children if isinstance(c, Tree)] - - seen = set() - for x in reversed(l): - if id(x) not in seen: - yield x - seen.add(id(x)) + del queue + return reversed(list(subtrees.values())) def find_pred(self, pred): "Find all nodes where pred(tree) == True" @@ -356,11 +374,11 @@ class _Decoratable: # Make sure the function isn't inherited (unless it's overwritten) if name.startswith('_') or (name in libmembers and 
name not in cls.__dict__): continue - if not callable(cls.__dict__[name]): + if not callable(value): continue # Skip if v_args already applied (at the function level) - if hasattr(cls.__dict__[name], 'vargs_applied'): + if hasattr(cls.__dict__[name], 'vargs_applied') or hasattr(value, 'vargs_applied'): continue static = isinstance(cls.__dict__[name], (staticmethod, classmethod)) @@ -486,6 +504,38 @@ class Transformer_InPlace(Transformer): return self._transform_tree(tree) +class Transformer_NonRecursive(Transformer): + "Non-recursive. Doesn't change the original tree." + + def transform(self, tree): + # Tree to postfix + rev_postfix = [] + q = [tree] + while q: + t = q.pop() + rev_postfix.append( t ) + if isinstance(t, Tree): + q += t.children + + # Postfix to tree + stack = [] + for x in reversed(rev_postfix): + if isinstance(x, Tree): + size = len(x.children) + if size: + args = stack[-size:] + del stack[-size:] + else: + args = [] + stack.append(self._call_userfunc(x, args)) + else: + stack.append(x) + + t ,= stack # We should have only one tree remaining + return t + + + class Transformer_InPlaceRecursive(Transformer): "Recursive. Changes the tree in-place instead of returning new instances" def _transform_tree(self, tree): @@ -567,7 +617,7 @@ class Interpreter(_Decoratable): Calls its methods (provided by user via inheritance) according to tree.data Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches. 
- The user has to explicitly call visit_children, or use the @visit_children_decor + The user has to explicitly call visit, visit_children, or use the @visit_children_decor """ def visit(self, tree): @@ -781,19 +831,21 @@ class NonTerminal(Symbol): class RuleOptions(Serialize): - __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices' + __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'template_source', 'empty_indices' - def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()): + def __init__(self, keep_all_tokens=False, expand1=False, priority=None, template_source=None, empty_indices=()): self.keep_all_tokens = keep_all_tokens self.expand1 = expand1 self.priority = priority + self.template_source = template_source self.empty_indices = empty_indices def __repr__(self): - return 'RuleOptions(%r, %r, %r)' % ( + return 'RuleOptions(%r, %r, %r, %r)' % ( self.keep_all_tokens, self.expand1, self.priority, + self.template_source ) @@ -836,6 +888,7 @@ class Rule(Serialize): +from copy import copy class Pattern(Serialize): @@ -918,7 +971,6 @@ class TerminalDef(Serialize): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) - class Token(Str): __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') @@ -926,7 +978,8 @@ class Token(Str): try: self = super(Token, cls).__new__(cls, value) except UnicodeDecodeError: - value = value.decode('latin1') + # value = value.decode('latin1') + value = value.decode("ascii", "backslashreplace") self = super(Token, cls).__new__(cls, value) self.type = type_ @@ -1060,7 +1113,7 @@ class CallChain: -def _create_unless(terminals): +def _create_unless(terminals, g_regex_flags, re_): tokens_by_type = classify(terminals, lambda t: type(t.pattern)) assert len(tokens_by_type) <= 2, tokens_by_type.keys() embedded_strs = set() @@ -1071,19 +1124,19 @@ def _create_unless(terminals): if strtok.priority > retok.priority: 
continue s = strtok.pattern.value - m = re.match(retok.pattern.to_regexp(), s) + m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags) if m and m.group(0) == s: unless.append(strtok) if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) if unless: - callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True)) + callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True)) terminals = [t for t in terminals if t not in embedded_strs] return terminals, callback -def _build_mres(terminals, max_size, match_whole): +def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_): # Python sets an unreasonable group limit (currently 100) in its re module # Worse, the only way to know we reached it is by catching an AssertionError! # This function recursively tries less and less groups until it's successful. @@ -1091,17 +1144,17 @@ def _build_mres(terminals, max_size, match_whole): mres = [] while terminals: try: - mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size])) + mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags) except AssertionError: # Yes, this is what Python provides us.. 
:/ - return _build_mres(terminals, max_size//2, match_whole) + return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_) # terms_from_name = {t.name: t for t in terminals[:max_size]} mres.append((mre, {i:n for n,i in mre.groupindex.items()} )) terminals = terminals[max_size:] return mres -def build_mres(terminals, match_whole=False): - return _build_mres(terminals, len(terminals), match_whole) +def build_mres(terminals, g_regex_flags, re_, match_whole=False): + return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_) def _regexp_has_newline(r): r"""Expressions that may indicate newlines in a regexp: @@ -1124,34 +1177,39 @@ class Lexer(object): class TraditionalLexer(Lexer): - def __init__(self, terminals, ignore=(), user_callbacks={}): + def __init__(self, conf): + terminals = list(conf.tokens) assert all(isinstance(t, TerminalDef) for t in terminals), terminals - terminals = list(terminals) + self.re = conf.re_module - # Sanitization - for t in terminals: - try: - re.compile(t.pattern.to_regexp()) - except re.error: - raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) + if not conf.skip_validation: + # Sanitization + for t in terminals: + try: + self.re.compile(t.pattern.to_regexp(), conf.g_regex_flags) + except self.re.error: + raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) - if t.pattern.min_width == 0: - raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern)) + if t.pattern.min_width == 0: + raise LexError("Lexer does not allow zero-width terminals. 
(%s: %s)" % (t.name, t.pattern)) - assert set(ignore) <= {t.name for t in terminals} + assert set(conf.ignore) <= {t.name for t in terminals} # Init self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())] - self.ignore_types = list(ignore) + self.ignore_types = list(conf.ignore) terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) self.terminals = terminals - self.user_callbacks = user_callbacks - self.build() + self.user_callbacks = conf.callbacks + self.g_regex_flags = conf.g_regex_flags - def build(self): - terminals, self.callback = _create_unless(self.terminals) + self._mres = None + # self.build(g_regex_flags) + + def _build(self): + terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re) assert all(self.callback.values()) for type_, f in self.user_callbacks.items(): @@ -1161,7 +1219,13 @@ class TraditionalLexer(Lexer): else: self.callback[type_] = f - self.mres = build_mres(terminals) + self._mres = build_mres(terminals, self.g_regex_flags, self.re) + + @property + def mres(self): + if self._mres is None: + self._build() + return self._mres def match(self, stream, pos): for mre, type_from_index in self.mres: @@ -1177,12 +1241,15 @@ class TraditionalLexer(Lexer): class ContextualLexer(Lexer): - def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}): + def __init__(self, conf, states, always_accept=()): + terminals = list(conf.tokens) tokens_by_name = {} for t in terminals: assert t.name not in tokens_by_name, t tokens_by_name[t.name] = t + trad_conf = type(conf)(terminals, conf.re_module, conf.ignore, callbacks=conf.callbacks, g_regex_flags=conf.g_regex_flags, skip_validation=conf.skip_validation) + lexer_by_tokens = {} self.lexers = {} for state, accepts in states.items(): @@ -1190,14 +1257,17 @@ class ContextualLexer(Lexer): try: lexer = lexer_by_tokens[key] except KeyError: - accepts = set(accepts) | 
set(ignore) | set(always_accept) + accepts = set(accepts) | set(conf.ignore) | set(always_accept) state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] - lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks) + lexer_conf = copy(trad_conf) + lexer_conf.tokens = state_tokens + lexer = TraditionalLexer(lexer_conf) lexer_by_tokens[key] = lexer self.lexers[state] = lexer - self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks) + assert trad_conf.tokens is terminals + self.root_lexer = TraditionalLexer(trad_conf) def lex(self, stream, get_parser_state): parser_state = get_parser_state() @@ -1223,14 +1293,17 @@ class ContextualLexer(Lexer): class LexerConf(Serialize): - __serialize_fields__ = 'tokens', 'ignore' + __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags' __serialize_namespace__ = TerminalDef, - def __init__(self, tokens, ignore=(), postlex=None, callbacks=None): - self.tokens = tokens + def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False): + self.tokens = tokens # TODO should be terminals self.ignore = ignore self.postlex = postlex self.callbacks = callbacks or {} + self.g_regex_flags = g_regex_flags + self.re_module = re_module + self.skip_validation = skip_validation def _deserialize(self): self.callbacks = {} # TODO @@ -1257,33 +1330,39 @@ class PropagatePositions: def __call__(self, children): res = self.node_builder(children) + # local reference to Tree.meta reduces number of presence checks if isinstance(res, Tree): + res_meta = res.meta for c in children: - if isinstance(c, Tree) and not c.meta.empty: - res.meta.line = c.meta.line - res.meta.column = c.meta.column - res.meta.start_pos = c.meta.start_pos - res.meta.empty = False - break + if isinstance(c, Tree): + child_meta = c.meta + if not child_meta.empty: + res_meta.line = child_meta.line + res_meta.column = child_meta.column + 
res_meta.start_pos = child_meta.start_pos + res_meta.empty = False + break elif isinstance(c, Token): - res.meta.line = c.line - res.meta.column = c.column - res.meta.start_pos = c.pos_in_stream - res.meta.empty = False + res_meta.line = c.line + res_meta.column = c.column + res_meta.start_pos = c.pos_in_stream + res_meta.empty = False break for c in reversed(children): - if isinstance(c, Tree) and not c.meta.empty: - res.meta.end_line = c.meta.end_line - res.meta.end_column = c.meta.end_column - res.meta.end_pos = c.meta.end_pos - res.meta.empty = False - break + if isinstance(c, Tree): + child_meta = c.meta + if not child_meta.empty: + res_meta.end_line = child_meta.end_line + res_meta.end_column = child_meta.end_column + res_meta.end_pos = child_meta.end_pos + res_meta.empty = False + break elif isinstance(c, Token): - res.meta.end_line = c.end_line - res.meta.end_column = c.end_column - res.meta.end_pos = c.end_pos - res.meta.empty = False + res_meta.end_line = c.end_line + res_meta.end_column = c.end_column + res_meta.end_pos = c.end_pos + res_meta.empty = False break return res @@ -1473,7 +1552,7 @@ class ParseTreeBuilder: for rule, wrapper_chain in self.rule_builders: - user_callback_name = rule.alias or rule.origin.name + user_callback_name = rule.alias or rule.options.template_source or rule.origin.name try: f = getattr(transformer, user_callback_name) # XXX InlineTransformer is deprecated! 
@@ -1499,6 +1578,7 @@ class ParseTreeBuilder: return callbacks + class LALR_Parser(object): def __init__(self, parser_conf, debug=False): assert all(r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" @@ -1508,7 +1588,7 @@ class LALR_Parser(object): self._parse_table = analysis.parse_table self.parser_conf = parser_conf - self.parser = _Parser(analysis.parse_table, callbacks) + self.parser = _Parser(analysis.parse_table, callbacks, debug) @classmethod def deserialize(cls, data, memo, callbacks): @@ -1525,22 +1605,20 @@ class LALR_Parser(object): class _Parser: - def __init__(self, parse_table, callbacks): - self.states = parse_table.states - self.start_states = parse_table.start_states - self.end_states = parse_table.end_states + def __init__(self, parse_table, callbacks, debug=False): + self.parse_table = parse_table self.callbacks = callbacks + self.debug = debug - def parse(self, seq, start, set_state=None): + def parse(self, seq, start, set_state=None, value_stack=None, state_stack=None): token = None stream = iter(seq) - states = self.states + states = self.parse_table.states + start_state = self.parse_table.start_states[start] + end_state = self.parse_table.end_states[start] - start_state = self.start_states[start] - end_state = self.end_states[start] - - state_stack = [start_state] - value_stack = [] + state_stack = state_stack or [start_state] + value_stack = value_stack or [] if set_state: set_state(start_state) @@ -1550,7 +1628,11 @@ class _Parser: return states[state][token.type] except KeyError: expected = [s for s in states[state].keys() if s.isupper()] - raise UnexpectedToken(token, expected, state=state) + try: + puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) + except NameError: + puppet = None + raise UnexpectedToken(token, expected, state=state, puppet=puppet) def reduce(rule): size = len(rule.expansion) @@ -1569,18 +1651,29 @@ class _Parser: value_stack.append(value) # 
Main LALR-parser loop - for token in stream: - while True: - action, arg = get_action(token) - assert arg != end_state - - if action is Shift: - state_stack.append(arg) - value_stack.append(token) - if set_state: set_state(arg) - break # next token - else: - reduce(arg) + try: + for token in stream: + while True: + action, arg = get_action(token) + assert arg != end_state + + if action is Shift: + state_stack.append(arg) + value_stack.append(token) + if set_state: set_state(arg) + break # next token + else: + reduce(arg) + except Exception as e: + if self.debug: + print("") + print("STATE STACK DUMP") + print("----------------") + for i, s in enumerate(state_stack): + print('%d)' % i , s) + print("") + + raise token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) while True: @@ -1715,10 +1808,12 @@ class WithLexer(_ParserFrontend): self.postlex = lexer_conf.postlex @classmethod - def deserialize(cls, data, memo, callbacks, postlex): + def deserialize(cls, data, memo, callbacks, postlex, re_module): inst = super(WithLexer, cls).deserialize(data, memo) inst.postlex = postlex inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) + inst.lexer_conf.re_module = re_module + inst.lexer_conf.skip_validation=True inst.init_lexer() return inst @@ -1734,7 +1829,7 @@ class WithLexer(_ParserFrontend): return self._parse(token_stream, start) def init_traditional_lexer(self): - self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks) + self.lexer = TraditionalLexer(self.lexer_conf) class LALR_WithLexer(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): @@ -1744,7 +1839,7 @@ class LALR_WithLexer(WithLexer): self.init_lexer() - def init_lexer(self): + def init_lexer(self, **kw): raise NotImplementedError() class LALR_TraditionalLexer(LALR_WithLexer): @@ -1755,10 +1850,7 @@ class LALR_ContextualLexer(LALR_WithLexer): def init_lexer(self): states = 
{idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} always_accept = self.postlex.always_accept if self.postlex else () - self.lexer = ContextualLexer(self.lexer_conf.tokens, states, - ignore=self.lexer_conf.ignore, - always_accept=always_accept, - user_callbacks=self.lexer_conf.callbacks) + self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept) def parse(self, text, start=None): @@ -1775,32 +1867,62 @@ class LarkOptions(Serialize): """ OPTIONS_DOC = """ - parser - Decides which parser engine to use, "earley" or "lalr". (Default: "earley") - Note: "lalr" requires a lexer - - lexer - Decides whether or not to use a lexer stage - "standard": Use a standard lexer - "contextual": Stronger lexer (only works with parser="lalr") - "dynamic": Flexible and powerful (only with parser="earley") - "dynamic_complete": Same as dynamic, but tries *every* variation - of tokenizing possible. (only with parser="earley") - "auto" (default): Choose for me based on grammar and parser - - ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley" - "resolve": The parser will automatically choose the simplest derivation - (it chooses consistently: greedy for tokens, non-greedy for rules) - "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). - - transformer - Applies the transformer to every parse tree - debug - Affects verbosity (default: False) - keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False) - cache_grammar - Cache the Lark grammar (Default: False) - postlex - Lexer post-processing (Default: None) Only works with the standard and contextual lexers. 
- start - The start symbol, either a string, or a list of strings for multiple possible starts (Default: "start") - priority - How priorities should be evaluated - auto, none, normal, invert (Default: auto) - propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches. - lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. - maybe_placeholders - Experimental feature. Instead of omitting optional rules (i.e. rule?), replace them with None +# General + + start - The start symbol. Either a string, or a list of strings for + multiple possible starts (Default: "start") + debug - Display debug information, such as warnings (default: False) + transformer - Applies the transformer to every parse tree (equivlent to + applying it after the parse, but faster) + propagate_positions - Propagates (line, column, end_line, end_column) + attributes into all tree branches. + maybe_placeholders - When True, the `[]` operator returns `None` when not matched. + When `False`, `[]` behaves like the `?` operator, + and returns no value at all. + (default=`False`. Recommended to set to `True`) + regex - When True, uses the `regex` module instead of the stdlib `re`. + cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. + LALR only for now. + When `False`, does nothing (default) + When `True`, caches to a temporary file in the local directory + When given a string, caches to the path pointed by the string + + g_regex_flags - Flags that are applied to all terminals + (both regex and strings) + keep_all_tokens - Prevent the tree builder from automagically + removing "punctuation" tokens (default: False) + +# Algorithm + + parser - Decides which parser engine to use + Accepts "earley" or "lalr". 
(Default: "earley") + (there is also a "cyk" option for legacy) + + lexer - Decides whether or not to use a lexer stage + "auto" (default): Choose for me based on the parser + "standard": Use a standard lexer + "contextual": Stronger lexer (only works with parser="lalr") + "dynamic": Flexible and powerful (only with parser="earley") + "dynamic_complete": Same as dynamic, but tries *every* variation + of tokenizing possible. + + ambiguity - Decides how to handle ambiguity in the parse. + Only relevant if parser="earley" + "resolve": The parser will automatically choose the simplest + derivation (it chooses consistently: greedy for + tokens, non-greedy for rules) + "explicit": The parser will return all derivations wrapped + in "_ambig" tree nodes (i.e. a forest). + +# Domain Specific + + postlex - Lexer post-processing (Default: None) Only works with the + standard and contextual lexers. + priority - How priorities should be evaluated - auto, none, normal, + invert (Default: auto) + lexer_callbacks - Dictionary of callbacks for the lexer. May alter + tokens during lexing. Use with caution. 
+ edit_terminals - A callback """ if __doc__: __doc__ += OPTIONS_DOC @@ -1809,7 +1931,7 @@ class LarkOptions(Serialize): 'debug': False, 'keep_all_tokens': False, 'tree_class': None, - 'cache: False, + 'cache': False, 'postlex': None, 'parser': 'earley', 'lexer': 'auto', @@ -1817,10 +1939,12 @@ class LarkOptions(Serialize): 'start': 'start', 'priority': 'auto', 'ambiguity': 'auto', + 'regex': False, 'propagate_positions': False, 'lexer_callbacks': {}, 'maybe_placeholders': False, 'edit_terminals': None, + 'g_regex_flags': 0, } def __init__(self, options_dict): @@ -1830,7 +1954,7 @@ class LarkOptions(Serialize): for name, default in self._defaults.items(): if name in o: value = o.pop(name) - if isinstance(default, bool): + if isinstance(default, bool) and name != 'cache': value = bool(value) else: value = default @@ -1875,8 +1999,19 @@ class Lark(Serialize): grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax) options : a dictionary controlling various aspects of Lark. """ + self.options = LarkOptions(options) + # Set regex or re module + use_regex = self.options.regex + if use_regex: + if regex: + re_module = regex + else: + raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.') + else: + re_module = re + # Some, but not all file-like objects have a 'name' attribute try: self.source = grammar.name @@ -1893,8 +2028,27 @@ class Lark(Serialize): assert isinstance(grammar, STRING_TYPE) - if self.options.cache_grammar: - raise NotImplementedError("Not available yet") + cache_fn = None + if self.options.cache: + if self.options.parser != 'lalr': + raise NotImplementedError("cache only works with parser='lalr' for now") + if isinstance(self.options.cache, STRING_TYPE): + cache_fn = self.options.cache + else: + if self.options.cache is not True: + raise ValueError("cache must be bool or str") + unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals') + from . 
import __version__ + options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable) + s = grammar + options_str + __version__ + md5 = hashlib.md5(s.encode()).hexdigest() + cache_fn = '.lark_cache_%s.tmp' % md5 + + if FS.exists(cache_fn): + logging.debug('Loading grammar from cache: %s', cache_fn) + with FS.open(cache_fn, 'rb') as f: + self._load(f, self.options.transformer, self.options.postlex) + return if self.options.lexer == 'auto': if self.options.parser == 'lalr': @@ -1929,7 +2083,7 @@ class Lark(Serialize): assert self.options.ambiguity in ('resolve', 'explicit', 'auto', ) # Parse the grammar file and compose the grammars (TODO) - self.grammar = load_grammar(grammar, self.source) + self.grammar = load_grammar(grammar, self.source, re_module) # Compile the EBNF grammar into BNF self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start) @@ -1962,20 +2116,25 @@ class Lark(Serialize): if hasattr(t, term.name): lexer_callbacks[term.name] = getattr(t, term.name) - self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks) + self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags) if self.options.parser: self.parser = self._build_parser() elif lexer: self.lexer = self._build_lexer() + if cache_fn: + logging.debug('Saving grammar to cache: %s', cache_fn) + with FS.open(cache_fn, 'wb') as f: + self.save(f) + if __init__.__doc__: - __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC + __init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC __serialize_fields__ = 'parser', 'rules', 'options' def _build_lexer(self): - return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks) + return TraditionalLexer(self.lexer_conf) def _prepare_callbacks(self): self.parser_class = get_frontend(self.options.parser, self.options.lexer) 
@@ -1987,34 +2146,42 @@ class Lark(Serialize): parser_conf = ParserConf(self.rules, self._callbacks, self.options.start) return self.parser_class(self.lexer_conf, parser_conf, options=self.options) + def save(self, f): + data, m = self.memo_serialize([TerminalDef, Rule]) + pickle.dump({'data': data, 'memo': m}, f) + @classmethod - def deserialize(cls, data, namespace, memo, transformer=None, postlex=None): - if memo: - memo = SerializeMemoizer.deserialize(memo, namespace, {}) + def load(cls, f): inst = cls.__new__(cls) + return inst._load(f) + + def _load(self, f, transformer=None, postlex=None): + if isinstance(f, dict): + d = f + else: + d = pickle.load(f) + memo = d['memo'] + data = d['data'] + + assert memo + memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {}) options = dict(data['options']) if transformer is not None: options['transformer'] = transformer if postlex is not None: options['postlex'] = postlex - inst.options = LarkOptions.deserialize(options, memo) - inst.rules = [Rule.deserialize(r, memo) for r in data['rules']] - inst.source = '' - inst._prepare_callbacks() - inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks, inst.options.postlex) - return inst - - def save(self, f): - data, m = self.memo_serialize([TerminalDef, Rule]) - pickle.dump({'data': data, 'memo': m}, f) + self.options = LarkOptions.deserialize(options, memo) + re_module = regex if self.options.regex else re + self.rules = [Rule.deserialize(r, memo) for r in data['rules']] + self.source = '' + self._prepare_callbacks() + self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, re_module) + return self @classmethod - def load(cls, f): - d = pickle.load(f) - namespace = {'Rule': Rule, 'TerminalDef': TerminalDef} - memo = d['memo'] - return Lark.deserialize(d['data'], namespace, memo) - + def _load_from_dict(cls, data, memo, transformer=None, postlex=None): + inst = 
cls.__new__(cls) + return inst._load({'data': data, 'memo': memo}, transformer, postlex) @classmethod def open(cls, grammar_filename, rel_to=None, **options): @@ -2051,24 +2218,38 @@ class Lark(Serialize): "Get information about a terminal" return self._terminals_dict[name] - def parse(self, text, start=None): + def parse(self, text, start=None, on_error=None): """Parse the given text, according to the options provided. - The 'start' parameter is required if Lark was given multiple possible start symbols (using the start option). + Parameters: + start: str - required if Lark was given multiple possible start symbols (using the start option). + on_error: function - if provided, will be called on UnexpectedToken error. Return true to resume parsing. LALR only. Returns a tree, unless specified otherwise. """ - return self.parser.parse(text, start=start) + try: + return self.parser.parse(text, start=start) + except UnexpectedToken as e: + if on_error is None: + raise + + while True: + if not on_error(e): + raise e + try: + return e.puppet.resume_parse() + except UnexpectedToken as e2: + e = e2 + DATA = ( -{'rules': [{'@': 27}, {'@': 31}, {'@': 26}, {'@': 13}, {'@': 24}, {'@': 18}, {'@': 14}, {'@': 22}, {'@': 28}, {'@': 23}, {'@': 29}, {'@': 12}, {'@': 25}, {'@': 30}, {'@': 19}, {'@': 21}, {'@': 15}, {'@': 20}, {'@': 16}, {'@': 17}], 'parser': {'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], 'ignore': [u'WS'], '__type__': 'LexerConf'}, 'parser': {'tokens': {0: 'COMMA', 1: 'RSQB', 2: 'RBRACE', 3: '$END', 4: 'LBRACE', 5: u'FALSE', 6: u'string', 7: u'object', 8: u'NULL', 9: u'SIGNED_NUMBER', 10: u'value', 11: u'array', 12: u'ESCAPED_STRING', 13: u'TRUE', 14: 'LSQB', 15: 'COLON', 16: u'pair', 17: u'__array_star_0', 18: u'__object_star_1', 19: 'start'}, 'states': {0: {0: (1, {'@': 12}), 1: (1, {'@': 12}), 2: (1, {'@': 12}), 3: (1, {'@': 12})}, 1: {1: (0, 29), 4: (0, 33), 5: 
(0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 22), 9: (0, 24), 10: (0, 6), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 2: {0: (0, 11), 2: (0, 0)}, 3: {15: (0, 12)}, 4: {16: (0, 13), 12: (0, 21), 6: (0, 3)}, 5: {0: (1, {'@': 13}), 1: (1, {'@': 13}), 2: (1, {'@': 13}), 3: (1, {'@': 13})}, 6: {0: (0, 7), 1: (0, 23), 17: (0, 17)}, 7: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 22), 9: (0, 24), 10: (0, 9), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 8: {0: (1, {'@': 14}), 1: (1, {'@': 14}), 2: (1, {'@': 14}), 3: (1, {'@': 14})}, 9: {0: (1, {'@': 15}), 1: (1, {'@': 15})}, 10: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 22), 9: (0, 24), 10: (0, 20), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 11: {16: (0, 15), 12: (0, 21), 6: (0, 3)}, 12: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 22), 9: (0, 24), 10: (0, 18), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 13: {0: (1, {'@': 16}), 2: (1, {'@': 16})}, 14: {}, 15: {0: (1, {'@': 17}), 2: (1, {'@': 17})}, 16: {0: (1, {'@': 18}), 1: (1, {'@': 18}), 2: (1, {'@': 18}), 3: (1, {'@': 18})}, 17: {0: (0, 10), 1: (0, 28)}, 18: {0: (1, {'@': 19}), 2: (1, {'@': 19})}, 19: {0: (0, 4), 18: (0, 2), 2: (0, 25)}, 20: {0: (1, {'@': 20}), 1: (1, {'@': 20})}, 21: {0: (1, {'@': 21}), 1: (1, {'@': 21}), 2: (1, {'@': 21}), 3: (1, {'@': 21}), 15: (1, {'@': 21})}, 22: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 2: (1, {'@': 22}), 3: (1, {'@': 22})}, 23: {0: (1, {'@': 23}), 1: (1, {'@': 23}), 2: (1, {'@': 23}), 3: (1, {'@': 23})}, 24: {0: (1, {'@': 24}), 1: (1, {'@': 24}), 2: (1, {'@': 24}), 3: (1, {'@': 24})}, 25: {0: (1, {'@': 25}), 1: (1, {'@': 25}), 2: (1, {'@': 25}), 3: (1, {'@': 25})}, 26: {0: (1, {'@': 26}), 1: (1, {'@': 26}), 2: (1, {'@': 26}), 3: (1, {'@': 26})}, 27: {3: (1, {'@': 27})}, 28: {0: (1, {'@': 28}), 1: (1, {'@': 28}), 2: (1, {'@': 28}), 3: (1, {'@': 28})}, 29: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 2: (1, {'@': 29}), 3: (1, {'@': 29})}, 30: {0: (1, {'@': 30}), 1: (1, {'@': 30}), 2: 
(1, {'@': 30}), 3: (1, {'@': 30})}, 31: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 2: (1, {'@': 31}), 3: (1, {'@': 31})}, 32: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 22), 9: (0, 24), 10: (0, 27), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1), 19: (0, 14)}, 33: {16: (0, 19), 2: (0, 30), 12: (0, 21), 6: (0, 3)}}, 'end_states': {'start': 14}, 'start_states': {'start': 32}}, '__type__': 'LALR_ContextualLexer', 'start': ['start']}, '__type__': 'Lark', 'options': {'transformer': None, 'lexer': 'contextual', 'lexer_callbacks': {}, 'debug': False, 'postlex': None, 'parser': 'lalr', 'cache_grammar': False, 'tree_class': None, 'priority': None, 'start': ['start'], 'keep_all_tokens': False, 'ambiguity': 'auto', 'edit_terminals': None, 'propagate_positions': False, 'maybe_placeholders': False}} +{'rules': [{'@': 23}, {'@': 31}, {'@': 26}, {'@': 13}, {'@': 24}, {'@': 19}, {'@': 14}, {'@': 27}, {'@': 28}, {'@': 16}, {'@': 29}, {'@': 12}, {'@': 25}, {'@': 30}, {'@': 20}, {'@': 22}, {'@': 15}, {'@': 21}, {'@': 17}, {'@': 18}], 'parser': {'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], 'ignore': [u'WS'], 'g_regex_flags': 0, '__type__': 'LexerConf'}, 'parser': {'tokens': {0: 'COMMA', 1: 'RSQB', 2: 'RBRACE', 3: '$END', 4: 'LBRACE', 5: u'FALSE', 6: u'string', 7: u'object', 8: u'NULL', 9: u'SIGNED_NUMBER', 10: u'value', 11: u'array', 12: u'ESCAPED_STRING', 13: u'TRUE', 14: 'LSQB', 15: 'COLON', 16: u'pair', 17: u'__array_star_0', 18: u'__object_star_1', 19: 'start'}, 'states': {0: {0: (1, {'@': 12}), 1: (1, {'@': 12}), 2: (1, {'@': 12}), 3: (1, {'@': 12})}, 1: {1: (0, 29), 4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 6), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 2: {0: (0, 23), 2: (0, 0)}, 3: {15: (0, 12)}, 4: {16: (0, 13), 12: (0, 21), 6: (0, 3)}, 5: {0: (1, {'@': 13}), 1: (1, {'@': 13}), 2: (1, {'@': 13}), 3: (1, {'@': 13})}, 6: 
{0: (0, 7), 1: (0, 11), 17: (0, 17)}, 7: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 9), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 8: {0: (1, {'@': 14}), 1: (1, {'@': 14}), 2: (1, {'@': 14}), 3: (1, {'@': 14})}, 9: {0: (1, {'@': 15}), 1: (1, {'@': 15})}, 10: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 20), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 11: {0: (1, {'@': 16}), 1: (1, {'@': 16}), 2: (1, {'@': 16}), 3: (1, {'@': 16})}, 12: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 18), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 13: {0: (1, {'@': 17}), 2: (1, {'@': 17})}, 14: {}, 15: {0: (1, {'@': 18}), 2: (1, {'@': 18})}, 16: {0: (1, {'@': 19}), 1: (1, {'@': 19}), 2: (1, {'@': 19}), 3: (1, {'@': 19})}, 17: {0: (0, 10), 1: (0, 28)}, 18: {0: (1, {'@': 20}), 2: (1, {'@': 20})}, 19: {0: (0, 4), 18: (0, 2), 2: (0, 25)}, 20: {0: (1, {'@': 21}), 1: (1, {'@': 21})}, 21: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 2: (1, {'@': 22}), 3: (1, {'@': 22}), 15: (1, {'@': 22})}, 22: {3: (1, {'@': 23})}, 23: {16: (0, 15), 12: (0, 21), 6: (0, 3)}, 24: {0: (1, {'@': 24}), 1: (1, {'@': 24}), 2: (1, {'@': 24}), 3: (1, {'@': 24})}, 25: {0: (1, {'@': 25}), 1: (1, {'@': 25}), 2: (1, {'@': 25}), 3: (1, {'@': 25})}, 26: {0: (1, {'@': 26}), 1: (1, {'@': 26}), 2: (1, {'@': 26}), 3: (1, {'@': 26})}, 27: {0: (1, {'@': 27}), 1: (1, {'@': 27}), 2: (1, {'@': 27}), 3: (1, {'@': 27})}, 28: {0: (1, {'@': 28}), 1: (1, {'@': 28}), 2: (1, {'@': 28}), 3: (1, {'@': 28})}, 29: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 2: (1, {'@': 29}), 3: (1, {'@': 29})}, 30: {0: (1, {'@': 30}), 1: (1, {'@': 30}), 2: (1, {'@': 30}), 3: (1, {'@': 30})}, 31: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 2: (1, {'@': 31}), 3: (1, {'@': 31})}, 32: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 22), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1), 19: (0, 14)}, 33: {16: (0, 19), 2: 
(0, 30), 12: (0, 21), 6: (0, 3)}}, 'end_states': {'start': 14}, 'start_states': {'start': 32}}, '__type__': 'LALR_ContextualLexer', 'start': ['start']}, '__type__': 'Lark', 'options': {'regex': False, 'transformer': None, 'lexer': 'contextual', 'lexer_callbacks': {}, 'start': ['start'], 'debug': False, 'postlex': None, 'parser': 'lalr', 'tree_class': None, 'priority': None, 'cache': False, 'g_regex_flags': 0, 'keep_all_tokens': False, 'ambiguity': 'auto', 'edit_terminals': None, 'propagate_positions': False, 'maybe_placeholders': False}} ) MEMO = ( -{0: {'priority': 1, 'pattern': {'__type__': 'PatternRE', '_width': [2, 4294967295], 'flags': [], 'value': u'\\".*?(? Date: Sat, 25 Jul 2020 18:00:37 +0200 Subject: [PATCH 072/164] Added a bit of explanation for `term_subs` --- lark-stubs/reconstruct.pyi | 2 +- lark/reconstruct.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/lark-stubs/reconstruct.pyi b/lark-stubs/reconstruct.pyi index 8ac6c14..2220c46 100644 --- a/lark-stubs/reconstruct.pyi +++ b/lark-stubs/reconstruct.pyi @@ -30,7 +30,7 @@ class MakeMatchTree: class Reconstructor: - def __init__(self, parser: Lark): + def __init__(self, parser: Lark, term_subs: Dict[str, str] = ...): ... def reconstruct(self, tree: Tree) -> str: diff --git a/lark/reconstruct.py b/lark/reconstruct.py index 1e3adc7..baf1d2c 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -87,9 +87,16 @@ def best_from_group(seq, group_key, cmp_key): return list(d.values()) class Reconstructor: - def __init__(self, parser, term_subs={}): + """ + A Reconstructor that will, given a full parse Tree, generate source code. + Pass `term_subs`, a dictionary of [Terminal name as str] to [output text as str] + to say what discarded Terminals should be written as. + """ + def __init__(self, parser, term_subs=None): # XXX TODO calling compile twice returns different results! 
assert parser.options.maybe_placeholders == False + if term_subs is None: + term_subs = {} tokens, rules, _grammar_extra = parser.grammar.compile(parser.options.start) self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}, term_subs) From 7eee42bfae95964e9eaa98b6d1f9cd8f2f992486 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 27 Jul 2020 10:37:12 +0300 Subject: [PATCH 073/164] Docs: Fixed bug in recipes (Issue #633) --- docs/recipes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/recipes.md b/docs/recipes.md index edcef64..18fed37 100644 --- a/docs/recipes.md +++ b/docs/recipes.md @@ -139,7 +139,7 @@ If your tree nodes aren't unique (if there is a shared Tree instance), the asser ```python class Parent(Visitor): - def visit(self, tree): + def __default__(self, tree): for subtree in tree.children: if isinstance(subtree, Tree): assert not hasattr(subtree, 'parent') From fcaf10ac4d42c7f1017e2c82a51f224a810b2bcd Mon Sep 17 00:00:00 2001 From: Inky <47245667+Inky-developer@users.noreply.github.com> Date: Sat, 18 Jul 2020 19:15:39 +0200 Subject: [PATCH 074/164] Fixes caching when custom lexers are used --- lark-stubs/lark.pyi | 1 + lark/parser_frontends.py | 11 ++++++++--- tests/test_cache.py | 16 ++++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index 511e0ad..d601fc2 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -55,6 +55,7 @@ class Lark: propagate_positions: bool = False, maybe_placeholders: bool = False, lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None, + cache: Union[bool, str] = False, g_regex_flags: int = ... ): ... 
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 08f4756..c05f235 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,5 +1,3 @@ -from functools import partial - from .utils import get_regexp_width, Serialize from .parsers.grammar_analysis import GrammarAnalyzer from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token @@ -20,7 +18,14 @@ def get_frontend(parser, lexer): elif lexer == 'contextual': return LALR_ContextualLexer elif issubclass(lexer, Lexer): - return partial(LALR_CustomLexer, lexer) + class LALR_CustomLexerWrapper(LALR_CustomLexer): + def __init__(self, lexer_conf, parser_conf, options=None): + super(LALR_CustomLexerWrapper, self).__init__( + lexer, lexer_conf, parser_conf, options=options) + def init_lexer(self): + self.lexer = lexer(self.lexer_conf) + + return LALR_CustomLexerWrapper else: raise ValueError('Unknown lexer: %s' % lexer) elif parser=='earley': diff --git a/tests/test_cache.py b/tests/test_cache.py index 9436081..ca4d781 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -4,6 +4,7 @@ import sys from unittest import TestCase, main from lark import Lark, Tree +from lark.lexer import Lexer, Token import lark.lark as lark_module try: @@ -38,6 +39,15 @@ class MockFS: return name in self.files +class CustomLexer(Lexer): + def __init__(self, lexer_conf): + pass + + def lex(self, data): + for obj in data: + yield Token('A', obj) + + class TestCache(TestCase): def setUp(self): pass @@ -70,6 +80,12 @@ class TestCache(TestCase): parser = Lark(g, parser='lalr', cache=True) assert parser.parse('a') == Tree('start', []) + # Test with custom lexer + mock_fs.files = {} + parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True) + parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True) + assert len(mock_fs.files) == 1 + assert parser.parse('a') == Tree('start', []) finally: lark_module.FS = fs From 0c89189b5fed475fe265ca29b63e68857f2f4c7e Mon Sep 17 00:00:00 2001 From: MegaIng1 
Date: Thu, 23 Jul 2020 19:11:53 +0200 Subject: [PATCH 075/164] Support for bytes parser --- lark-stubs/lark.pyi | 4 +++- lark/common.py | 5 +++-- lark/exceptions.py | 17 +++++++++++++---- lark/lark.py | 5 +++-- lark/lexer.py | 25 +++++++++++++++---------- 5 files changed, 37 insertions(+), 19 deletions(-) diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index 511e0ad..4f5f57e 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -31,6 +31,7 @@ class LarkOptions: lexer_callbacks: Dict[str, Callable[[Token], Token]] cache: Union[bool, str] g_regex_flags: int + use_bytes: bool class Lark: @@ -55,7 +56,8 @@ class Lark: propagate_positions: bool = False, maybe_placeholders: bool = False, lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None, - g_regex_flags: int = ... + g_regex_flags: int = ..., + use_bytes: bool = False, ): ... diff --git a/lark/common.py b/lark/common.py index 5c55b8c..cc8c73c 100644 --- a/lark/common.py +++ b/lark/common.py @@ -4,10 +4,10 @@ from .lexer import TerminalDef ###{standalone class LexerConf(Serialize): - __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags' + __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes' __serialize_namespace__ = TerminalDef, - def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False): + def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): self.tokens = tokens # TODO should be terminals self.ignore = ignore self.postlex = postlex @@ -15,6 +15,7 @@ class LexerConf(Serialize): self.g_regex_flags = g_regex_flags self.re_module = re_module self.skip_validation = skip_validation + self.use_bytes = use_bytes def _deserialize(self): self.callbacks = {} # TODO diff --git a/lark/exceptions.py b/lark/exceptions.py index 1c5e533..033275c 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -28,9 +28,14 @@ class 
UnexpectedInput(LarkError): pos = self.pos_in_stream start = max(pos - span, 0) end = pos + span - before = text[start:pos].rsplit('\n', 1)[-1] - after = text[pos:end].split('\n', 1)[0] - return before + after + '\n' + ' ' * len(before) + '^\n' + if not isinstance(text, bytes): + before = text[start:pos].rsplit('\n', 1)[-1] + after = text[pos:end].split('\n', 1)[0] + return before + after + '\n' + ' ' * len(before) + '^\n' + else: + before = text[start:pos].rsplit(b'\n', 1)[-1] + after = text[pos:end].split(b'\n', 1)[0] + return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") def match_examples(self, parse_fn, examples, token_type_match_fallback=False): """ Given a parser instance and a dictionary mapping some label with @@ -67,7 +72,11 @@ class UnexpectedInput(LarkError): class UnexpectedCharacters(LexError, UnexpectedInput): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): - message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) + + if isinstance(seq, bytes): + message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column) + else: + message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) self.line = line self.column = column diff --git a/lark/lark.py b/lark/lark.py index 232dbb7..36e92b1 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -105,6 +105,7 @@ class LarkOptions(Serialize): 'maybe_placeholders': False, 'edit_terminals': None, 'g_regex_flags': 0, + 'use_bytes': False, } def __init__(self, options_dict): @@ -252,7 +253,7 @@ class Lark(Serialize): for t in self.terminals: self.options.edit_terminals(t) - self._terminals_dict = {t.name:t for t in self.terminals} + self._terminals_dict = {t.name: t for t in self.terminals} # If the user asked to invert the priorities, negate them all here. 
# This replaces the old 'resolve__antiscore_sum' option. @@ -276,7 +277,7 @@ class Lark(Serialize): if hasattr(t, term.name): lexer_callbacks[term.name] = getattr(t, term.name) - self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags) + self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes) if self.options.parser: self.parser = self._build_parser() diff --git a/lark/lexer.py b/lark/lexer.py index 4979500..6039c54 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -230,7 +230,7 @@ class CallChain: -def _create_unless(terminals, g_regex_flags, re_): +def _create_unless(terminals, g_regex_flags, re_, use_bytes): tokens_by_type = classify(terminals, lambda t: type(t.pattern)) assert len(tokens_by_type) <= 2, tokens_by_type.keys() embedded_strs = set() @@ -247,31 +247,34 @@ def _create_unless(terminals, g_regex_flags, re_): if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) if unless: - callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True)) + callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) terminals = [t for t in terminals if t not in embedded_strs] return terminals, callback -def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_): +def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes): # Python sets an unreasonable group limit (currently 100) in its re module # Worse, the only way to know we reached it is by catching an AssertionError! # This function recursively tries less and less groups until it's successful. 
postfix = '$' if match_whole else '' mres = [] while terminals: + pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) + if use_bytes: + pattern = pattern.encode() try: - mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags) + mre = re_.compile(pattern, g_regex_flags) except AssertionError: # Yes, this is what Python provides us.. :/ - return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_) + return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes) # terms_from_name = {t.name: t for t in terminals[:max_size]} mres.append((mre, {i:n for n,i in mre.groupindex.items()} )) terminals = terminals[max_size:] return mres -def build_mres(terminals, g_regex_flags, re_, match_whole=False): - return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_) +def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False): + return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes) def _regexp_has_newline(r): r"""Expressions that may indicate newlines in a regexp: @@ -321,12 +324,13 @@ class TraditionalLexer(Lexer): self.terminals = terminals self.user_callbacks = conf.callbacks self.g_regex_flags = conf.g_regex_flags + self.use_bytes = conf.use_bytes self._mres = None # self.build(g_regex_flags) def _build(self): - terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re) + terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes) assert all(self.callback.values()) for type_, f in self.user_callbacks.items(): @@ -336,7 +340,7 @@ class TraditionalLexer(Lexer): else: self.callback[type_] = f - self._mres = build_mres(terminals, self.g_regex_flags, self.re) + self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes) @property def 
mres(self): @@ -365,7 +369,8 @@ class ContextualLexer(Lexer): assert t.name not in tokens_by_name, t tokens_by_name[t.name] = t - trad_conf = type(conf)(terminals, conf.re_module, conf.ignore, callbacks=conf.callbacks, g_regex_flags=conf.g_regex_flags, skip_validation=conf.skip_validation) + trad_conf = copy(conf) + trad_conf.tokens = terminals lexer_by_tokens = {} self.lexers = {} From c93106f1430b05e722866aec844e91e6e5966b7b Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Sat, 25 Jul 2020 13:59:05 +0200 Subject: [PATCH 076/164] Tests for bytes parser (credit to @ctrlcctrlv) --- lark-stubs/lark.pyi | 1 + lark/lark.py | 11 ++++- lark/lexer.py | 8 ++-- lark/parser_frontends.py | 2 + lark/utils.py | 15 +++++- tests/test_parser.py | 100 +++++++++++++++++++++++++++++++++++++-- 6 files changed, 127 insertions(+), 10 deletions(-) diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index 4f5f57e..131bbe0 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -36,6 +36,7 @@ class LarkOptions: class Lark: source: str + grammar_source: str options: LarkOptions lexer: Lexer terminals: List[TerminalDef] diff --git a/lark/lark.py b/lark/lark.py index 36e92b1..a1ed414 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -4,7 +4,7 @@ import sys, os, pickle, hashlib, logging from io import open -from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS +from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii from .load_grammar import load_grammar from .tree import Tree from .common import LexerConf, ParserConf @@ -115,7 +115,7 @@ class LarkOptions(Serialize): for name, default in self._defaults.items(): if name in o: value = o.pop(name) - if isinstance(default, bool) and name != 'cache': + if isinstance(default, bool) and name not in ('cache', 'use_bytes'): value = bool(value) else: value = default @@ -188,6 +188,13 @@ class Lark(Serialize): grammar = read() assert isinstance(grammar, STRING_TYPE) + self.grammar_source = grammar + if 
self.options.use_bytes: + assert isascii(grammar), "If creating a parser for bytes, the grammar needs to be ascii only" + if sys.version_info[0] == 2 and self.options.use_bytes != 'force': + raise NotImplementedError("The `use_bytes=True` for python2.7 is not perfect. " + "It might have weird behaviour. Use `use_bytes='force'` " + "to still use it") cache_fn = None if self.options.cache: diff --git a/lark/lexer.py b/lark/lexer.py index 6039c54..fdc7429 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -139,8 +139,8 @@ class Token(Str): class LineCounter: - def __init__(self): - self.newline_char = '\n' + def __init__(self, use_bytes=False): + self.newline_char = '\n' if not use_bytes else b'\n' self.char_pos = 0 self.line = 1 self.column = 1 @@ -169,7 +169,7 @@ class _Lex: def lex(self, stream, newline_types, ignore_types): newline_types = frozenset(newline_types) ignore_types = frozenset(ignore_types) - line_ctr = LineCounter() + line_ctr = LineCounter(self.lexer.use_bytes) last_token = None while line_ctr.char_pos < len(stream): @@ -262,7 +262,7 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes) while terminals: pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) if use_bytes: - pattern = pattern.encode() + pattern = pattern.encode('utf-8') try: mre = re_.compile(pattern, g_regex_flags) except AssertionError: # Yes, this is what Python provides us.. 
:/ diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 08f4756..0b2d5f2 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -184,6 +184,8 @@ class XEarley(_ParserFrontend): else: if width == 0: raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t) + if lexer_conf.use_bytes: + regexp = regexp.encode('utf-8') self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags) diff --git a/lark/utils.py b/lark/utils.py index 36f50d1..c70b947 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -305,4 +305,17 @@ def combine_alternatives(lists): class FS: open = open - exists = os.path.exists \ No newline at end of file + exists = os.path.exists + + + +def isascii(s): + """ str.isascii only exists in python3.7+ """ + try: + return s.isascii() + except AttributeError: + try: + s.encode('ascii') + return True + except (UnicodeDecodeError, UnicodeEncodeError): + return False \ No newline at end of file diff --git a/tests/test_parser.py b/tests/test_parser.py index 1249211..4fd6cea 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -8,7 +8,9 @@ import os import sys from copy import copy, deepcopy -from lark.utils import Py36 +from lark.utils import Py36, isascii + +from lark import Token try: from cStringIO import StringIO as cStringIO @@ -561,12 +563,82 @@ class CustomLexer(Lexer): def lex(self, *args, **kwargs): return self.lexer.lex(*args, **kwargs) +def _tree_structure_check(a, b): + """ + Checks that both Tree objects have the same structure, without checking their values. + """ + assert a.data == b.data and len(a.children) == len(b.children) + for ca,cb in zip(a.children, b.children): + assert type(ca) == type(cb) + if isinstance(ca, Tree): + _tree_structure_check(ca, cb) + elif isinstance(ca, Token): + assert ca.type == cb.type + else: + assert ca == cb + +class DualLark: + """ + A helper class that wraps both a normal parser, and a parser for bytes. 
+ It automatically transforms `.parse` calls for both lexer, returning the value from the text lexer + It always checks that both produce the same output/error + """ + + def __init__(self, g, *args, **kwargs): + self.text_lexer = Lark(g, *args, use_bytes=False, **kwargs) + g = self.text_lexer.grammar_source.lower() + if '\\u' in g or not isascii(g): + # Bytes re can't deal with uniode escapes + self.bytes_lark = None + else: + # Everything here should work, so use `use_bytes='force'` + self.bytes_lark = Lark(self.text_lexer.grammar_source, *args, use_bytes='force', **kwargs) + + def parse(self, text, start=None): + # TODO: Easy workaround, more complex checks would be beneficial + if not isascii(text) or self.bytes_lark is None: + return self.text_lexer.parse(text, start) + try: + rv = self.text_lexer.parse(text, start) + except Exception as e: + try: + self.bytes_lark.parse(text.encode(), start) + except Exception as be: + assert type(e) == type(be), "Parser with and without `use_bytes` raise different exceptions" + raise e + assert False, "Parser without `use_bytes` raises exception, with doesn't" + try: + bv = self.bytes_lark.parse(text.encode(), start) + except Exception as be: + assert False, "Parser without `use_bytes` doesn't raise an exception, with does" + _tree_structure_check(rv, bv) + return rv + + @classmethod + def open(cls, grammar_filename, rel_to=None, **options): + if rel_to: + basepath = os.path.dirname(rel_to) + grammar_filename = os.path.join(basepath, grammar_filename) + with open(grammar_filename, encoding='utf8') as f: + return cls(f, **options) + + def save(self,f): + self.text_lexer.save(f) + if self.bytes_lark is not None: + self.bytes_lark.save(f) + + def load(self,f): + self.text_lexer = self.text_lexer.load(f) + if self.bytes_lark is not None: + self.bytes_lark.load(f) + def _make_parser_test(LEXER, PARSER): lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER def _Lark(grammar, **kwargs): - return Lark(grammar, 
lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs) + return DualLark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs) def _Lark_open(gfilename, **kwargs): - return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs) + return DualLark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs) + class _TestParser(unittest.TestCase): def test_basic1(self): g = _Lark("""start: a+ b a* "b" a* @@ -646,6 +718,28 @@ def _make_parser_test(LEXER, PARSER): A: "\x01".."\x03" """) g.parse('\x01\x02\x03') + + @unittest.skipIf(sys.version_info[:2]==(2, 7), "bytes parser isn't perfect in Python2.7, exceptions don't work correctly") + def test_bytes_utf8(self): + g = r""" + start: BOM? char+ + BOM: "\xef\xbb\xbf" + char: CHAR1 | CHAR2 | CHAR3 | CHAR4 + CONTINUATION_BYTE: "\x80" .. "\xbf" + CHAR1: "\x00" .. "\x7f" + CHAR2: "\xc0" .. "\xdf" CONTINUATION_BYTE + CHAR3: "\xe0" .. "\xef" CONTINUATION_BYTE CONTINUATION_BYTE + CHAR4: "\xf0" .. "\xf7" CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE + """ + g = _Lark(g) + s = u"🔣 地? gurīn".encode('utf-8') + self.assertEqual(len(g.bytes_lark.parse(s).children), 10) + + for enc, j in [("sjis", u"地球の絵はグリーンでグッド? 
Chikyuu no e wa guriin de guddo"), + ("sjis", u"売春婦"), + ("euc-jp", u"乂鵬鵠")]: + s = j.encode(enc) + self.assertRaises(UnexpectedCharacters, g.bytes_lark.parse, s) @unittest.skipIf(PARSER == 'cyk', "Takes forever") def test_stack_for_ebnf(self): From 7c6e94bf73b3efca580ca4a99ca365b34482292a Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 31 Jul 2020 10:22:35 +0300 Subject: [PATCH 077/164] Fixed issues with the use_bytes PR, and added documentation --- docs/classes.md | 1 + lark/lark.py | 11 ++++++----- lark/lexer.py | 8 ++++---- tests/test_parser.py | 22 ++++++++++++---------- 4 files changed, 23 insertions(+), 19 deletions(-) diff --git a/docs/classes.md b/docs/classes.md index 8b32801..7bd92fe 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -128,6 +128,7 @@ Useful for caching and multiprocessing. - **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto) - **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. - **edit_terminals** - A callback +- **use_bytes** - Accept and parse an input of type `bytes` instead of `str`. Grammar should still be specified as `str`, and terminal values are assumed to be `latin-1`. #### Using Unicode character classes with `regex` diff --git a/lark/lark.py b/lark/lark.py index a1ed414..daab45b 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -82,6 +82,7 @@ class LarkOptions(Serialize): invert (Default: auto) lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. + use_bytes - Accept an input of type `bytes` instead of `str` (Python 3 only). 
edit_terminals - A callback """ if __doc__: @@ -190,11 +191,11 @@ class Lark(Serialize): assert isinstance(grammar, STRING_TYPE) self.grammar_source = grammar if self.options.use_bytes: - assert isascii(grammar), "If creating a parser for bytes, the grammar needs to be ascii only" + if not isascii(grammar): + raise ValueError("Grammar must be ascii only, when use_bytes=True") if sys.version_info[0] == 2 and self.options.use_bytes != 'force': - raise NotImplementedError("The `use_bytes=True` for python2.7 is not perfect. " - "It might have weird behaviour. Use `use_bytes='force'` " - "to still use it") + raise NotImplementedError("`use_bytes=True` may have issues on python2." + "Use `use_bytes='force'` to use it at your own risk.") cache_fn = None if self.options.cache: @@ -204,7 +205,7 @@ class Lark(Serialize): cache_fn = self.options.cache else: if self.options.cache is not True: - raise ValueError("cache must be bool or str") + raise ValueError("cache argument must be bool or str") unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals') from . 
import __version__ options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable) diff --git a/lark/lexer.py b/lark/lexer.py index fdc7429..c77207b 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -139,8 +139,8 @@ class Token(Str): class LineCounter: - def __init__(self, use_bytes=False): - self.newline_char = '\n' if not use_bytes else b'\n' + def __init__(self, newline_char): + self.newline_char = newline_char self.char_pos = 0 self.line = 1 self.column = 1 @@ -169,7 +169,7 @@ class _Lex: def lex(self, stream, newline_types, ignore_types): newline_types = frozenset(newline_types) ignore_types = frozenset(ignore_types) - line_ctr = LineCounter(self.lexer.use_bytes) + line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n') last_token = None while line_ctr.char_pos < len(stream): @@ -262,7 +262,7 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes) while terminals: pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) if use_bytes: - pattern = pattern.encode('utf-8') + pattern = pattern.encode('latin-1') try: mre = re_.compile(pattern, g_regex_flags) except AssertionError: # Yes, this is what Python provides us.. :/ diff --git a/tests/test_parser.py b/tests/test_parser.py index 4fd6cea..f1e269f 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -577,11 +577,13 @@ def _tree_structure_check(a, b): else: assert ca == cb -class DualLark: +class DualBytesLark: """ A helper class that wraps both a normal parser, and a parser for bytes. It automatically transforms `.parse` calls for both lexer, returning the value from the text lexer It always checks that both produce the same output/error + + NOTE: Not currently used, but left here for future debugging. 
""" def __init__(self, g, *args, **kwargs): @@ -613,7 +615,7 @@ class DualLark: assert False, "Parser without `use_bytes` doesn't raise an exception, with does" _tree_structure_check(rv, bv) return rv - + @classmethod def open(cls, grammar_filename, rel_to=None, **options): if rel_to: @@ -635,9 +637,9 @@ class DualLark: def _make_parser_test(LEXER, PARSER): lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER def _Lark(grammar, **kwargs): - return DualLark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs) + return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs) def _Lark_open(gfilename, **kwargs): - return DualLark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs) + return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs) class _TestParser(unittest.TestCase): def test_basic1(self): @@ -718,7 +720,7 @@ def _make_parser_test(LEXER, PARSER): A: "\x01".."\x03" """) g.parse('\x01\x02\x03') - + @unittest.skipIf(sys.version_info[:2]==(2, 7), "bytes parser isn't perfect in Python2.7, exceptions don't work correctly") def test_bytes_utf8(self): g = r""" @@ -731,15 +733,15 @@ def _make_parser_test(LEXER, PARSER): CHAR3: "\xe0" .. "\xef" CONTINUATION_BYTE CONTINUATION_BYTE CHAR4: "\xf0" .. "\xf7" CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE """ - g = _Lark(g) + g = _Lark(g, use_bytes=True) s = u"🔣 地? gurīn".encode('utf-8') - self.assertEqual(len(g.bytes_lark.parse(s).children), 10) + self.assertEqual(len(g.parse(s).children), 10) for enc, j in [("sjis", u"地球の絵はグリーンでグッド? 
Chikyuu no e wa guriin de guddo"), ("sjis", u"売春婦"), ("euc-jp", u"乂鵬鵠")]: s = j.encode(enc) - self.assertRaises(UnexpectedCharacters, g.bytes_lark.parse, s) + self.assertRaises(UnexpectedCharacters, g.parse, s) @unittest.skipIf(PARSER == 'cyk', "Takes forever") def test_stack_for_ebnf(self): @@ -1159,7 +1161,7 @@ def _make_parser_test(LEXER, PARSER): g = _Lark(g) self.assertEqual( g.parse('"hello"').children, ['"hello"']) self.assertEqual( g.parse("'hello'").children, ["'hello'"]) - + @unittest.skipIf(not Py36, "Required re syntax only exists in python3.6+") def test_join_regex_flags(self): g = r""" @@ -1172,7 +1174,7 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(g.parse(" ").children,[" "]) self.assertEqual(g.parse("\n ").children,["\n "]) self.assertRaises(UnexpectedCharacters, g.parse, "\n\n") - + g = r""" start: A A: B | C From e6ad86f373a12edb9a4b941d2ea80cb81f2384e2 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Mon, 3 Aug 2020 20:30:24 +1000 Subject: [PATCH 078/164] For Nearley: add argparse, extras install, allow es6 compiling --- lark/tools/nearley.py | 30 ++++++++++++++++++------------ setup.py | 3 ++- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/lark/tools/nearley.py b/lark/tools/nearley.py index 0b04fb5..c3df234 100644 --- a/lark/tools/nearley.py +++ b/lark/tools/nearley.py @@ -3,6 +3,7 @@ import os.path import sys import codecs +import argparse from lark import Lark, InlineTransformer @@ -137,7 +138,7 @@ def _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, includes): return rule_defs -def create_code_for_nearley_grammar(g, start, builtin_path, folder_path): +def create_code_for_nearley_grammar(g, start, builtin_path, folder_path, es6=False): import js2py emit_code = [] @@ -160,7 +161,10 @@ def create_code_for_nearley_grammar(g, start, builtin_path, folder_path): for alias, code in n2l.alias_js_code.items(): js_code.append('%s = (%s);' % (alias, code)) - emit(js2py.translate_js('\n'.join(js_code))) + if 
es6: + emit(js2py.translate_js6('\n'.join(js_code))) + else: + emit(js2py.translate_js('\n'.join(js_code))) emit('class TransformNearley(Transformer):') for alias in n2l.alias_js_code: emit(" %s = var.get('%s').to_python()" % (alias, alias)) @@ -173,18 +177,20 @@ def create_code_for_nearley_grammar(g, start, builtin_path, folder_path): return ''.join(emit_code) -def main(fn, start, nearley_lib): +def main(fn, start, nearley_lib, es6=False): with codecs.open(fn, encoding='utf8') as f: grammar = f.read() - return create_code_for_nearley_grammar(grammar, start, os.path.join(nearley_lib, 'builtin'), os.path.abspath(os.path.dirname(fn))) + return create_code_for_nearley_grammar(grammar, start, os.path.join(nearley_lib, 'builtin'), os.path.abspath(os.path.dirname(fn)), es6=es6) +def get_parser(): + parser = argparse.ArgumentParser('Reads Nearley grammar (with js functions) outputs an equivalent lark parser.') + parser.add_argument('nearley_grammar', help='Path to the file containing the nearley grammar') + parser.add_argument('start_rule', help='Rule within the nearley grammar to make the base rule') + parser.add_argument('nearley_lib', help='') + parser.add_argument('--es6', help='Enable experimental ES6 support', action='store_true') + return parser if __name__ == '__main__': - if len(sys.argv) < 4: - print("Reads Nearley grammar (with js functions) outputs an equivalent lark parser.") - print("Usage: %s " % sys.argv[0]) - sys.exit(1) - - fn, start, nearley_lib = sys.argv[1:] - - print(main(fn, start, nearley_lib)) + parser = get_parser() + args = parser.parse_args() + print(main(fn=args.nearley_grammar, start=args.start_rule, nearley_lib=args.nearley_lib, es6=args.es6)) diff --git a/setup.py b/setup.py index 5e7bda3..382943e 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,8 @@ setup( install_requires = [], extras_require = { - "regex": ["regex"] + "regex": ["regex"], + "nearley": ["js2py"] }, package_data = {'': ['*.md', '*.lark'], 'lark-stubs': ['*.pyi']}, From 
193dcf032d549a48cc57a7bc39181be73cccae18 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Mon, 3 Aug 2020 20:35:45 +1000 Subject: [PATCH 079/164] Update readme --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 464f409..fb00841 100644 --- a/README.md +++ b/README.md @@ -163,6 +163,11 @@ Using Lark? Send me a message and I'll add your project! Lark comes with a tool to convert grammars from [Nearley](https://github.com/Hardmath123/nearley), a popular Earley library for Javascript. It uses [Js2Py](https://github.com/PiotrDabkowski/Js2Py) to convert and run the Javascript postprocessing code segments. +First, ensure you have Lark installed with the `nearley` component included: +```bash +pip install lark-parser[nearley] +``` + Here's an example: ```bash git clone https://github.com/Hardmath123/nearley @@ -177,6 +182,12 @@ You can use the output as a regular python module: 0.38981434460254655 ``` +The Nearley converter also supports an experimental converter for newer JavaScript (ES6+), using the `--es6` flag: + +```bash +git clone https://github.com/Hardmath123/nearley +python -m lark.tools.nearley nearley/examples/calculator/arithmetic.ne main nearley --es6 > ncalc.py +``` ## License From b0feed8c12838f6e0078b119e2eee97a1fafa66c Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 5 Aug 2020 16:05:31 +0300 Subject: [PATCH 080/164] A few adjustments to Nearley PR (mostly docs) --- README.md | 32 +---------------------------- docs/features.md | 2 +- docs/index.md | 1 + docs/nearley.md | 47 +++++++++++++++++++++++++++++++++++++++++++ lark/tools/nearley.py | 6 +++--- 5 files changed, 53 insertions(+), 35 deletions(-) create mode 100644 docs/nearley.md diff --git a/README.md b/README.md index fb00841..aa19ab0 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ See more [examples here](https://github.com/lark-parser/lark/tree/master/example - **Python 2 & 3** compatible - Automatic line & column tracking - 
Standard library of terminals (strings, numbers, names, etc.) - - Import grammars from Nearley.js + - Import grammars from Nearley.js ([read more](/docs/nearley.md)) - Extensive test suite [![codecov](https://codecov.io/gh/erezsh/lark/branch/master/graph/badge.svg)](https://codecov.io/gh/erezsh/lark) - MyPy support using type stubs - And much more! @@ -159,36 +159,6 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail Using Lark? Send me a message and I'll add your project! -### How to use Nearley grammars in Lark - -Lark comes with a tool to convert grammars from [Nearley](https://github.com/Hardmath123/nearley), a popular Earley library for Javascript. It uses [Js2Py](https://github.com/PiotrDabkowski/Js2Py) to convert and run the Javascript postprocessing code segments. - -First, ensure you have Lark installed with the `nearley` component included: -```bash -pip install lark-parser[nearley] -``` - -Here's an example: -```bash -git clone https://github.com/Hardmath123/nearley -python -m lark.tools.nearley nearley/examples/calculator/arithmetic.ne main nearley > ncalc.py -``` - -You can use the output as a regular python module: - -```python ->>> import ncalc ->>> ncalc.parse('sin(pi/4) ^ e') -0.38981434460254655 -``` - -The Nearley converter also supports an experimental converter for newer JavaScript (ES6+), using the `--es6` flag: - -```bash -git clone https://github.com/Hardmath123/nearley -python -m lark.tools.nearley nearley/examples/calculator/arithmetic.ne main nearley --es6 > ncalc.py -``` - ## License Lark uses the [MIT license](LICENSE). diff --git a/docs/features.md b/docs/features.md index d8a4340..9346989 100644 --- a/docs/features.md +++ b/docs/features.md @@ -21,7 +21,7 @@ # Extra features - Import rules and tokens from other Lark grammars, for code reuse and modularity. 
- - Import grammars from Nearley.js + - Import grammars from Nearley.js ([read more](/docs/nearley.md)) - CYK parser ### Experimental features diff --git a/docs/index.md b/docs/index.md index 20257b5..1310be2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -49,6 +49,7 @@ $ pip install lark-parser * [Visitors & Transformers](visitors.md) * [Classes](classes.md) * [Cheatsheet (PDF)](lark_cheatsheet.pdf) + * [Importing grammars from Nearley](nearley.md) * Discussion * [Gitter](https://gitter.im/lark-parser/Lobby) * [Forum (Google Groups)](https://groups.google.com/forum/#!forum/lark-parser) diff --git a/docs/nearley.md b/docs/nearley.md new file mode 100644 index 0000000..4ab8595 --- /dev/null +++ b/docs/nearley.md @@ -0,0 +1,47 @@ +# Importing grammars from Nearley + +Lark comes with a tool to convert grammars from [Nearley](https://github.com/Hardmath123/nearley), a popular Earley library for Javascript. It uses [Js2Py](https://github.com/PiotrDabkowski/Js2Py) to convert and run the Javascript postprocessing code segments. + +## Requirements + +1. Install Lark with the `nearley` component: +```bash +pip install lark-parser[nearley] +``` + +2. Acquire a copy of the nearley codebase. 
This can be done using: +```bash +git clone https://github.com/Hardmath123/nearley +``` + +## Usage + +Here's an example of how to import nearley's calculator example into Lark: + +```bash +git clone https://github.com/Hardmath123/nearley +python -m lark.tools.nearley nearley/examples/calculator/arithmetic.ne main nearley > ncalc.py +``` + +You can use the output as a regular python module: + +```python +>>> import ncalc +>>> ncalc.parse('sin(pi/4) ^ e') +0.38981434460254655 +``` + +The Nearley converter also supports an experimental converter for newer JavaScript (ES6+), using the `--es6` flag: + +```bash +git clone https://github.com/Hardmath123/nearley +python -m lark.tools.nearley nearley/examples/calculator/arithmetic.ne main nearley --es6 > ncalc.py +``` + +## Notes + +- Lark currently cannot import templates from Nearley + +- Lark currently cannot export grammars to Nearley + +These might get added in the future, if enough users ask for them. \ No newline at end of file diff --git a/lark/tools/nearley.py b/lark/tools/nearley.py index c3df234..0237fcd 100644 --- a/lark/tools/nearley.py +++ b/lark/tools/nearley.py @@ -1,4 +1,4 @@ -"Converts between Lark and Nearley grammars. Work in progress!" 
+"Converts Nearley grammars to Lark" import os.path import sys @@ -182,7 +182,7 @@ def main(fn, start, nearley_lib, es6=False): grammar = f.read() return create_code_for_nearley_grammar(grammar, start, os.path.join(nearley_lib, 'builtin'), os.path.abspath(os.path.dirname(fn)), es6=es6) -def get_parser(): +def get_arg_parser(): parser = argparse.ArgumentParser('Reads Nearley grammar (with js functions) outputs an equivalent lark parser.') parser.add_argument('nearley_grammar', help='Path to the file containing the nearley grammar') parser.add_argument('start_rule', help='Rule within the nearley grammar to make the base rule') @@ -191,6 +191,6 @@ def get_parser(): return parser if __name__ == '__main__': - parser = get_parser() + parser = get_arg_parser() args = parser.parse_args() print(main(fn=args.nearley_grammar, start=args.start_rule, nearley_lib=args.nearley_lib, es6=args.es6)) From 621d9f651fcd0f184eb960a2c468d049386953d6 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 5 Aug 2020 16:30:54 +0300 Subject: [PATCH 081/164] Tiny update (Nearley) --- lark/tools/nearley.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/tools/nearley.py b/lark/tools/nearley.py index 0237fcd..af2789e 100644 --- a/lark/tools/nearley.py +++ b/lark/tools/nearley.py @@ -186,7 +186,7 @@ def get_arg_parser(): parser = argparse.ArgumentParser('Reads Nearley grammar (with js functions) outputs an equivalent lark parser.') parser.add_argument('nearley_grammar', help='Path to the file containing the nearley grammar') parser.add_argument('start_rule', help='Rule within the nearley grammar to make the base rule') - parser.add_argument('nearley_lib', help='') + parser.add_argument('nearley_lib', help='Path to root directory of nearley codebase (used for including builtins)') parser.add_argument('--es6', help='Enable experimental ES6 support', action='store_true') return parser From 1dd863dd3c158c8692b1449fa8a1adc04cd6101b Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: 
Thu, 6 Aug 2020 14:04:51 +0300 Subject: [PATCH 082/164] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index aa19ab0..18c181f 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,18 @@ -# Lark - a modern parsing library for Python +# Lark - a parsing toolkit for Python -Lark is a parser built with a focus on ergonomics, performance and resilience. +Lark is a parsing toolkit for Python, built with a focus on ergonomics, performance and modularity. -Lark can parse all context-free languages. That means it is capable of parsing almost any programming language out there, and to some degree most natural languages too. +Lark can parse all context-free languages. To put it simply, it means that it is capable of parsing almost any programming language out there, and to some degree most natural languages too. **Who is it for?** - - **Beginners**: Lark is very friendly for experimentation. It can parse any grammar you throw at it, no matter how complicated or ambiguous, and do so efficiently. It also constructs an annotated parse-tree for you, using only the grammar, and it gives you convienient and flexible tools to process that parse-tree. + - **Beginners**: Lark is very friendly for experimentation. It can parse any grammar you throw at it, no matter how complicated or ambiguous, and do so efficiently. It also constructs an annotated parse-tree for you, using only the grammar and an input, and it gives you convienient and flexible tools to process that parse-tree. - **Experts**: Lark implements both Earley(SPPF) and LALR(1), and several different lexers, so you can trade-off power and speed, according to your requirements. It also provides a variety of sophisticated features and utilities. 
**What can it do?** - - Parse all context-free grammars, and handle any ambiguity + - Parse all context-free grammars, and handle any ambiguity gracefully - Build an annotated parse-tree automagically, no construction code required. - Provide first-rate performance in terms of both Big-O complexity and measured run-time (considering that this is Python ;) - Run on every Python interpreter (it's pure-python) From d631cad024ab8c80df1db58c0202e43eaca321aa Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 6 Aug 2020 15:17:52 +0300 Subject: [PATCH 083/164] Minor adjustments --- lark/reconstruct.py | 3 +-- tests/test_nearley/nearley | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lark/reconstruct.py b/lark/reconstruct.py index 79e294c..89967b2 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -1,4 +1,3 @@ -import unicodedata from collections import defaultdict from .tree import Tree @@ -131,7 +130,7 @@ class Reconstructor: rule_names = {r.origin for r in rules} nonterminals = {sym for sym in rule_names - if sym.name.startswith('_') or sym in expand1s or sym in aliases } + if sym.name.startswith('_') or sym in expand1s or sym in aliases } seen = set() for r in rules: diff --git a/tests/test_nearley/nearley b/tests/test_nearley/nearley index cf8925f..a46b374 160000 --- a/tests/test_nearley/nearley +++ b/tests/test_nearley/nearley @@ -1 +1 @@ -Subproject commit cf8925f729bde741a3076c5856c0c0862bc7f5de +Subproject commit a46b37471db486db0f6e1ce6a2934fb238346b44 From b174d5eae8063f2ccbe8f3aac573c0ca951ce169 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 6 Aug 2020 22:54:36 +0300 Subject: [PATCH 084/164] Documented regexp flags. 
Issue #230 --- docs/grammar.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/grammar.md b/docs/grammar.md index d6cc274..d4ecec5 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -112,6 +112,19 @@ Terminals can be assigned priority only when using a lexer (future versions may Priority can be either positive or negative. If not specified for a terminal, it defaults to 1. +### Regexp Flags + +You can use flags on regexps and strings. For example: + +```perl +SELECT: "select"i //# Will ignore case, and match SELECT or Select, etc. +MULTILINE_TEXT: /.+/s +``` + +Supported flags are one of: `imslu`. See Python's regex documentation for more details on each one. + +Regexps/strings of different flags can only be concatenated in Python 3.6+ + #### Notes for when using a lexer: When using a lexer (standard or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched according to the following precedence: From 7ca6bb6559d3c90f8d9567340ca6465f4c4db783 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 6 Aug 2020 23:18:44 +0300 Subject: [PATCH 085/164] Fixed link in docs (Issue #621) --- docs/parsers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/parsers.md b/docs/parsers.md index c487238..cff5a4b 100644 --- a/docs/parsers.md +++ b/docs/parsers.md @@ -13,7 +13,7 @@ It's possible to bypass the dynamic lexing, and use the regular Earley parser wi Lark implements the Shared Packed Parse Forest data-structure for the Earley parser, in order to reduce the space and computation required to handle ambiguous grammars. 
-You can read more about SPPF [here](http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/) +You can read more about SPPF [here](https://web.archive.org/web/20191229100607/www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest) As a result, Lark can efficiently parse and store every ambiguity in the grammar, when using Earley. From fc9afe5177fbf798192da8148aa1be7340d392f1 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 7 Aug 2020 09:59:41 +0300 Subject: [PATCH 086/164] Added nearley.md to mkdocs (so readthedocs can display it) --- mkdocs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/mkdocs.yml b/mkdocs.yml index f5b0d1d..6c22d89 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -12,3 +12,4 @@ pages: - Visitors and Transformers: visitors.md - Classes Reference: classes.md - Recipes: recipes.md + - Import grammars from Nearley: nearley.md From 438e89dea9cd886a4bc01738a224e6a0e5fbb519 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 8 Aug 2020 15:33:36 +0300 Subject: [PATCH 087/164] Fix readthedocs (Issue #640) --- mkdocs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/mkdocs.yml b/mkdocs.yml index 6c22d89..8d2a562 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -13,3 +13,4 @@ pages: - Classes Reference: classes.md - Recipes: recipes.md - Import grammars from Nearley: nearley.md + - Tutorial - JSON Parser: json_tutorial.md From 61a7c1e20a6c6cbdbd23fdd20611075fe3147176 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 8 Aug 2020 15:43:20 +0300 Subject: [PATCH 088/164] Removed code that causes failure in Python 3.4 --- lark/exceptions.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 033275c..645b09c 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -72,11 +72,7 @@ class UnexpectedInput(LarkError): class UnexpectedCharacters(LexError, UnexpectedInput): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, 
token_history=None): - - if isinstance(seq, bytes): - message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column) - else: - message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) + message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) self.line = line self.column = column From 5954fdf87aa79c7369c040ade8dbdd04dff58ef7 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 8 Aug 2020 16:16:34 +0300 Subject: [PATCH 089/164] Restore bad code (needs better fix). Updated readme & docs. --- README.md | 7 +++---- docs/features.md | 2 +- lark/exceptions.py | 6 +++++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 18c181f..23ec565 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h ### Install Lark - $ pip install lark-parser + $ pip install lark-parser --upgrade Lark has no dependencies. @@ -77,12 +77,11 @@ Notice punctuation doesn't appear in the resulting tree. It's automatically filt ### Fruit flies like bananas -Lark is great at handling ambiguity. Let's parse the phrase "fruit flies like bananas": +Lark is great at handling ambiguity. Here is the result of parsing the phrase "fruit flies like bananas": ![fruitflies.png](examples/fruitflies.png) -See more [examples here](https://github.com/lark-parser/lark/tree/master/examples) - +See the code and more [examples here](https://github.com/lark-parser/lark/tree/master/examples) ## List of main features diff --git a/docs/features.md b/docs/features.md index 9346989..c2f6983 100644 --- a/docs/features.md +++ b/docs/features.md @@ -19,8 +19,8 @@ [Read more about the parsers](parsers.md) # Extra features - - Import rules and tokens from other Lark grammars, for code reuse and modularity. 
+ - Support for external regex module ([see here](/docs/classes.md#using-unicode-character-classes-with-regex)) - Import grammars from Nearley.js ([read more](/docs/nearley.md)) - CYK parser diff --git a/lark/exceptions.py b/lark/exceptions.py index 645b09c..a844dd4 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -72,7 +72,11 @@ class UnexpectedInput(LarkError): class UnexpectedCharacters(LexError, UnexpectedInput): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): - message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) + + if isinstance(seq, bytes): + message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column) + else: + message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) self.line = line self.column = column From 8dc8865072a526dbb70cd6f073668fe22c5680b8 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 8 Aug 2020 16:21:01 +0300 Subject: [PATCH 090/164] [docs] Fixed links --- docs/features.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/features.md b/docs/features.md index c2f6983..00fdf4b 100644 --- a/docs/features.md +++ b/docs/features.md @@ -20,8 +20,8 @@ # Extra features - Import rules and tokens from other Lark grammars, for code reuse and modularity. - - Support for external regex module ([see here](/docs/classes.md#using-unicode-character-classes-with-regex)) - - Import grammars from Nearley.js ([read more](/docs/nearley.md)) + - Support for external regex module ([see here](classes.md#using-unicode-character-classes-with-regex)) + - Import grammars from Nearley.js ([read more](nearley.md)) - CYK parser ### Experimental features From b7068c45a73bc70d3f9611c81198f0aa5571c4d9 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 9 Aug 2020 12:05:07 +0300 Subject: [PATCH 091/164] Tiny fixes. 
Don't test use_bytes on Python 3.4. --- docs/index.md | 2 +- lark/visitors.py | 2 ++ tests/test_parser.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index 1310be2..c72305d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -32,7 +32,7 @@ $ pip install lark-parser * [Philosophy & Design Choices](philosophy.md) -* [Full List of Features](features.md) +* [Features](features.md) * [Examples](https://github.com/lark-parser/lark/tree/master/examples) * [Online IDE](https://lark-parser.github.io/lark/ide/app.html) * Tutorials diff --git a/lark/visitors.py b/lark/visitors.py index 3f80016..6494deb 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -14,6 +14,8 @@ class Discard(Exception): # Transformers class _Decoratable: + "Provides support for decorating methods with @v_args" + @classmethod def _apply_decorator(cls, decorator, **kwargs): mro = getmro(cls) diff --git a/tests/test_parser.py b/tests/test_parser.py index f1e269f..cd3ea4d 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -721,7 +721,8 @@ def _make_parser_test(LEXER, PARSER): """) g.parse('\x01\x02\x03') - @unittest.skipIf(sys.version_info[:2]==(2, 7), "bytes parser isn't perfect in Python2.7, exceptions don't work correctly") + @unittest.skipIf(sys.version_info[0]==2 or sys.version_info[:2]==(3, 4), + "bytes parser isn't perfect in Python2, exceptions don't work correctly") def test_bytes_utf8(self): g = r""" start: BOM? 
char+ From 9923987e94547ded8a17d7a03840c4cebce39188 Mon Sep 17 00:00:00 2001 From: decorator-factory <42166884+decorator-factory@users.noreply.github.com> Date: Mon, 10 Aug 2020 23:07:55 +0300 Subject: [PATCH 092/164] allow multiline regexes with 'x' (verbose) flag --- lark/load_grammar.py | 13 ++++++++++--- tests/test_parser.py | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index ae7ec32..d716ec1 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -13,7 +13,7 @@ from .parser_frontends import LALR_TraditionalLexer from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify, suppress, dedup_list, Str -from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken +from .exceptions import GrammarError, LarkError, UnexpectedCharacters, UnexpectedToken from .tree import Tree, SlottedTree as ST from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive @@ -85,7 +85,7 @@ TERMINALS = { 'RULE': '!?[_?]?[a-z][_a-z0-9]*', 'TERMINAL': '_?[A-Z][_A-Z0-9]*', 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', - 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS, + 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, '_NL': r'(\r?\n)+\s*', 'WS': r'[ \t]+', 'COMMENT': r'\s*//[^\n]*', @@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace): term_name = None elif isinstance(p, PatternRE): - if p in self.term_reverse: # Kind of a wierd placement.name + if p in self.term_reverse: # Kind of a weird placement.name term_name = self.term_reverse[p].name else: assert False, p @@ -409,6 +409,13 @@ def _literal_to_pattern(literal): flags = v[flag_start:] assert all(f in _RE_FLAGS for f in flags), flags + if literal.type == 'STRING' and '\n' in v: + raise GrammarError('You cannot put newlines in string literals') + + if literal.type == 'REGEXP' and '\n' 
in v and 'x' not in flags: + raise GrammarError('You can only use newlines in regular expressions ' + 'with the `x` (verbose) flag') + v = v[:flag_start] assert v[0] == v[-1] and v[0] in '"/' x = v[1:-1] diff --git a/tests/test_parser.py b/tests/test_parser.py index cd3ea4d..48a4674 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1262,6 +1262,32 @@ def _make_parser_test(LEXER, PARSER): tree = l.parse('aA') self.assertEqual(tree.children, ['a', 'A']) + def test_token_flags_verbose(self): + g = _Lark(r"""start: NL | ABC + ABC: / [a-z] /x + NL: /\n/ + """) + x = g.parse('a') + self.assertEqual(x.children, ['a']) + + def test_token_flags_verbose_multiline(self): + g = _Lark(r"""start: ABC + ABC: / a b c + d + e f + /x + """) + x = g.parse('abcdef') + self.assertEqual(x.children, ['abcdef']) + + def test_token_multiline_only_works_with_x_flag(self): + g = r"""start: ABC + ABC: / a b c + d + e f + /i + """ + self.assertRaises( GrammarError, _Lark, g) @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_twice_empty(self): From 8b59a1642533f1f577b104c7be33f0511193050d Mon Sep 17 00:00:00 2001 From: decorator-factory <42166884+decorator-factory@users.noreply.github.com> Date: Tue, 11 Aug 2020 00:44:23 +0300 Subject: [PATCH 093/164] refactor: replace dict lookup with simple conditional --- lark/load_grammar.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index d716ec1..1a1a396 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -424,9 +424,11 @@ def _literal_to_pattern(literal): if literal.type == 'STRING': s = s.replace('\\\\', '\\') - - return { 'STRING': PatternStr, - 'REGEXP': PatternRE }[literal.type](s, flags) + return PatternStr(s, flags) + elif literal.type == 'REGEXP': + return PatternRE(s, flags) + else: + assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]' @inline_args From 2525e0ce9c594b81a79caa5ff57c66a12a79ca5a Mon Sep 17 00:00:00 
2001 From: decorator-factory <42166884+decorator-factory@users.noreply.github.com> Date: Tue, 11 Aug 2020 00:46:54 +0300 Subject: [PATCH 094/164] formatting: fix pistol operator --- lark/load_grammar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 1a1a396..0ee546c 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -13,7 +13,7 @@ from .parser_frontends import LALR_TraditionalLexer from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify, suppress, dedup_list, Str -from .exceptions import GrammarError, LarkError, UnexpectedCharacters, UnexpectedToken +from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken from .tree import Tree, SlottedTree as ST from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive @@ -850,7 +850,7 @@ class GrammarLoader: if len(stmt.children) > 1: path_node, arg1 = stmt.children else: - path_node, = stmt.children + path_node ,= stmt.children arg1 = None if isinstance(arg1, Tree): # Multi import From 28e0a86f389c329a35091b7acb7b0afc5d57dc74 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Wed, 12 Aug 2020 14:48:55 +0200 Subject: [PATCH 095/164] Small improvements for debug info --- lark-stubs/exceptions.pyi | 15 ++++++++++----- lark/exceptions.py | 15 ++++++++++++--- lark/parsers/lalr_puppet.py | 6 +++--- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/lark-stubs/exceptions.pyi b/lark-stubs/exceptions.pyi index f09bfbd..012ac51 100644 --- a/lark-stubs/exceptions.pyi +++ b/lark-stubs/exceptions.pyi @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from typing import Dict, Iterable, Callable, Union +from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple from .tree import Tree from .lexer import Token @@ -21,6 +21,9 @@ class LexError(LarkError): pass +T = TypeVar('T') + + class UnexpectedInput(LarkError): 
pos_in_stream: int @@ -28,10 +31,12 @@ class UnexpectedInput(LarkError): ... def match_examples( - self, - parse_fn: Callable[[str], Tree], - examples: Dict[str, Iterable[str]] - ): + self, + parse_fn: Callable[[str], Tree], + examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], + token_type_match_fallback: bool = False, + print_debug_info: bool = True + ) -> T: ... diff --git a/lark/exceptions.py b/lark/exceptions.py index 033275c..47670a6 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -37,34 +37,43 @@ class UnexpectedInput(LarkError): after = text[pos:end].split(b'\n', 1)[0] return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") - def match_examples(self, parse_fn, examples, token_type_match_fallback=False): + def match_examples(self, parse_fn, examples, token_type_match_fallback=False, print_debug_info=True): """ Given a parser instance and a dictionary mapping some label with some malformed syntax examples, it'll return the label for the example that bests matches the current error. 
""" assert self.state is not None, "Not supported for this exception" + + if isinstance(examples, dict): + examples = examples.items() candidate = (None, False) - for label, example in examples.items(): + for i, (label, example) in enumerate(examples): assert not isinstance(example, STRING_TYPE) - for malformed in example: + for j, malformed in enumerate(example): try: parse_fn(malformed) except UnexpectedInput as ut: if ut.state == self.state: try: if ut.token == self.token: # Try exact match first + if print_debug_info: + print("Exact Match at %d, with example %d" % (i, j), (ut.token, self.token, ut.state, self.state)) return label if token_type_match_fallback: # Fallback to token types match if (ut.token.type == self.token.type) and not candidate[-1]: + if print_debug_info: + print("Token Type Fallback at %d, with example %d" % (i, j)) candidate = label, True except AttributeError: pass if not candidate[0]: + if print_debug_info: + print("Defaulted at %d, with example %d" % (i, j)) candidate = label, False return candidate[0] diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py index 968783c..d5a4703 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -16,7 +16,7 @@ class ParserPuppet: self.result = None def feed_token(self, token): - """Advance the parser state, as if it just recieved `token` from the lexer + """Advance the parser state, as if it just received `token` from the lexer """ end_state = self.parser.parse_table.end_states[self._start] @@ -66,9 +66,9 @@ class ParserPuppet: self._set_state, ) - def pretty(): + def pretty(self): print("Puppet choices:") - for k, v in self.choices.items(): + for k, v in self.choices().items(): print('\t-', k, '->', v) print('stack size:', len(self._state_stack)) From a7bcd0bc2d3cb96030d9e77523c0007e8034ce49 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Wed, 12 Aug 2020 15:36:01 +0200 Subject: [PATCH 096/164] Added `accepts` attribute to `UnexpectedToken` and update stubs --- 
lark-stubs/exceptions.pyi | 15 ++++++++++----- lark/exceptions.py | 5 +++-- lark/parsers/lalr_parser.py | 13 +++++++++++-- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/lark-stubs/exceptions.pyi b/lark-stubs/exceptions.pyi index 012ac51..67c39fb 100644 --- a/lark-stubs/exceptions.pyi +++ b/lark-stubs/exceptions.pyi @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple +from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set from .tree import Tree from .lexer import Token @@ -25,7 +25,10 @@ T = TypeVar('T') class UnexpectedInput(LarkError): + line: int + column: int pos_in_stream: int + state: Any def get_context(self, text: str, span: int = ...): ... @@ -41,12 +44,14 @@ class UnexpectedInput(LarkError): class UnexpectedToken(ParseError, UnexpectedInput): - pass - + expected: List[str] + considered_rules: Set[str] + puppet: Any + accepts: List[str] class UnexpectedCharacters(LexError, UnexpectedInput): - line: int - column: int + allowed: Set[str] + considered_tokens: Set[Any] class VisitError(LarkError): diff --git a/lark/exceptions.py b/lark/exceptions.py index 47670a6..022a00f 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -105,7 +105,7 @@ class UnexpectedCharacters(LexError, UnexpectedInput): class UnexpectedToken(ParseError, UnexpectedInput): - def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): + def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, accepts=None): self.token = token self.expected = expected # XXX str shouldn't necessary self.line = getattr(token, 'line', '?') @@ -114,10 +114,11 @@ class UnexpectedToken(ParseError, UnexpectedInput): self.state = state self.pos_in_stream = getattr(token, 'pos_in_stream', None) self.puppet = puppet + self.accepts = accepts message = ("Unexpected token %r at line %s, column %s.\n" "Expected one of: \n\t* %s\n" - % (token, self.line, 
self.column, '\n\t* '.join(self.expected))) + % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) super(UnexpectedToken, self).__init__(message) diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index f26cbc5..f61e093 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -62,9 +62,18 @@ class _Parser: expected = [s for s in states[state].keys() if s.isupper()] try: puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) + accepts = [] + for t in expected: + new_puppet = puppet.copy() + try: + new_puppet.feed_token(Token(t, '')) + except KeyError: + pass + else: + accepts.append(t) except NameError: - puppet = None - raise UnexpectedToken(token, expected, state=state, puppet=puppet) + puppet = accepts = None + raise UnexpectedToken(token, expected, state=state, puppet=puppet, accepts=accepts) def reduce(rule): size = len(rule.expansion) From d3b0449f714615b190699644650e41669a1510d4 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Wed, 12 Aug 2020 16:46:36 +0200 Subject: [PATCH 097/164] Improved `match_examples` with `UnexpectedToken.accepts` --- lark/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 022a00f..497cf96 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -55,7 +55,7 @@ class UnexpectedInput(LarkError): try: parse_fn(malformed) except UnexpectedInput as ut: - if ut.state == self.state: + if ut.state == self.state and ut.accepts == self.accepts: try: if ut.token == self.token: # Try exact match first if print_debug_info: From 2e160c046e5de3d82b664d9867c1e9386ff4efb7 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Wed, 12 Aug 2020 16:52:21 +0200 Subject: [PATCH 098/164] Correction for python2.7 (LalrPuppet-> new style class) --- lark/parsers/lalr_puppet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py 
index d5a4703..2b350bf 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -4,7 +4,7 @@ from copy import deepcopy from .lalr_analysis import Shift, Reduce -class ParserPuppet: +class ParserPuppet(object): def __init__(self, parser, state_stack, value_stack, start, stream, set_state): self.parser = parser self._state_stack = state_stack From cb2d9cded072e0f150b0d6d349fd431369b83a93 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Thu, 13 Aug 2020 03:51:01 +0200 Subject: [PATCH 099/164] Refactored ParserPuppet, added stubs --- lark-stubs/exceptions.pyi | 10 +++++----- lark-stubs/parsers/__init__.pyi | 0 lark-stubs/parsers/lalr_puppet.pyi | 21 +++++++++++++++++++++ lark/exceptions.py | 19 ++++++++++--------- lark/parsers/lalr_parser.py | 12 ++---------- lark/parsers/lalr_puppet.py | 21 ++++++++++++++++++--- 6 files changed, 56 insertions(+), 27 deletions(-) create mode 100644 lark-stubs/parsers/__init__.pyi create mode 100644 lark-stubs/parsers/lalr_puppet.pyi diff --git a/lark-stubs/exceptions.pyi b/lark-stubs/exceptions.pyi index 67c39fb..268844c 100644 --- a/lark-stubs/exceptions.pyi +++ b/lark-stubs/exceptions.pyi @@ -3,7 +3,7 @@ from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set from .tree import Tree from .lexer import Token - +from .parsers.lalr_puppet import ParserPuppet class LarkError(Exception): pass @@ -38,16 +38,16 @@ class UnexpectedInput(LarkError): parse_fn: Callable[[str], Tree], examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], token_type_match_fallback: bool = False, - print_debug_info: bool = True + use_accepts: bool = False, ) -> T: ... 
class UnexpectedToken(ParseError, UnexpectedInput): - expected: List[str] + expected: Set[str] considered_rules: Set[str] - puppet: Any - accepts: List[str] + puppet: ParserPuppet + accepts: Set[str] class UnexpectedCharacters(LexError, UnexpectedInput): allowed: Set[str] diff --git a/lark-stubs/parsers/__init__.pyi b/lark-stubs/parsers/__init__.pyi new file mode 100644 index 0000000..e69de29 diff --git a/lark-stubs/parsers/lalr_puppet.pyi b/lark-stubs/parsers/lalr_puppet.pyi new file mode 100644 index 0000000..c138c32 --- /dev/null +++ b/lark-stubs/parsers/lalr_puppet.pyi @@ -0,0 +1,21 @@ +from typing import Set, Dict, Any + +from lark import Token, Tree + + +class ParserPuppet(object): + """ + Represents a LalrParser that can be step through. + Shouldn't instantiated by hand, but is accessible as `UnexpectedToken.puppet` + """ + def feed_token(self, token: Token): ... + + def copy(self) -> ParserPuppet: ... + + def pretty(self) -> str: ... + + def choices(self) -> Dict[str, Any]: ... + + def accepts(self) -> Set[str]: ... + + def resume_parse(self) -> Tree: ... diff --git a/lark/exceptions.py b/lark/exceptions.py index 92ef64e..03f3da4 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -1,3 +1,5 @@ +import logging + from .utils import STRING_TYPE ###{standalone @@ -37,7 +39,7 @@ class UnexpectedInput(LarkError): after = text[pos:end].split(b'\n', 1)[0] return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") - def match_examples(self, parse_fn, examples, token_type_match_fallback=False, print_debug_info=True): + def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False): """ Given a parser instance and a dictionary mapping some label with some malformed syntax examples, it'll return the label for the example that bests matches the current error. 
@@ -55,27 +57,26 @@ class UnexpectedInput(LarkError): try: parse_fn(malformed) except UnexpectedInput as ut: - if ut.state == self.state and ut.accepts == self.accepts: + if ut.state == self.state and (not use_accepts or ut.accepts == self.accepts): try: if ut.token == self.token: # Try exact match first - if print_debug_info: - print("Exact Match at %d, with example %d" % (i, j), (ut.token, self.token, ut.state, self.state)) + logging.debug("Exact Match at example [%s][%s]" % (i, j)) return label if token_type_match_fallback: # Fallback to token types match if (ut.token.type == self.token.type) and not candidate[-1]: - if print_debug_info: - print("Token Type Fallback at %d, with example %d" % (i, j)) + logging.debug("Token Type Fallback at example [%s][%s]" % (i, j)) candidate = label, True except AttributeError: pass if not candidate[0]: - if print_debug_info: - print("Defaulted at %d, with example %d" % (i, j)) + logging.debug("Same State match at example [%s][%s]" % (i, j)) candidate = label, False - + elif ut.state == self.state: + logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % + (self.state, self.accepts, ut.accepts, i, j)) return candidate[0] diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index f61e093..ba75606 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -59,18 +59,10 @@ class _Parser: try: return states[state][token.type] except KeyError: - expected = [s for s in states[state].keys() if s.isupper()] + expected = {s for s in states[state].keys() if s.isupper()} try: puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) - accepts = [] - for t in expected: - new_puppet = puppet.copy() - try: - new_puppet.feed_token(Token(t, '')) - except KeyError: - pass - else: - accepts.append(t) + accepts = puppet.accepts() except NameError: puppet = accepts = None raise UnexpectedToken(token, expected, state=state, puppet=puppet, accepts=accepts) diff 
--git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py index 2b350bf..24c77a1 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -3,6 +3,8 @@ from copy import deepcopy from .lalr_analysis import Shift, Reduce +from .. import Token + class ParserPuppet(object): def __init__(self, parser, state_stack, value_stack, start, stream, set_state): @@ -67,13 +69,26 @@ class ParserPuppet(object): ) def pretty(self): - print("Puppet choices:") + out = ["Puppet choices:"] for k, v in self.choices().items(): - print('\t-', k, '->', v) - print('stack size:', len(self._state_stack)) + out.append('\t- %s -> %s' % (k, v)) + out.append('stack size: %s' % len(self._state_stack)) + return '\n'.join(out) def choices(self): return self.parser.parse_table.states[self._state_stack[-1]] + def accepts(self): + accepts = set() + for t in self.choices(): + new_puppet = self.copy() + try: + new_puppet.feed_token(Token(t, '')) + except KeyError: + pass + else: + accepts.add(t) + return accepts + def resume_parse(self): return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack) From d4503374ff6171425c70a57899443cef10210553 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 13 Aug 2020 10:09:31 +0300 Subject: [PATCH 100/164] Small addition to docs --- README.md | 1 + docs/grammar.md | 2 ++ 2 files changed, 3 insertions(+) diff --git a/README.md b/README.md index 23ec565..69ccb2b 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,7 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail - [miniwdl](https://github.com/chanzuckerberg/miniwdl) - A static analysis toolkit for the Workflow Description Language - [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer - [harmalysis](https://github.com/napulen/harmalysis) - A language for harmonic analysis and music theory + - [gersemi](https://github.com/BlankSpruce/gersemi) - A CMake code formatter 
Using Lark? Send me a message and I'll add your project! diff --git a/docs/grammar.md b/docs/grammar.md index d4ecec5..ff6553f 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -112,6 +112,8 @@ Terminals can be assigned priority only when using a lexer (future versions may Priority can be either positive or negative. If not specified for a terminal, it defaults to 1. +Highest priority terminals are always matched first. + ### Regexp Flags You can use flags on regexps and strings. For example: From 02d57bc32a2fae1722ee3f8e003a3d6234e58190 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 13 Aug 2020 11:43:52 +0300 Subject: [PATCH 101/164] Small adjustments to PR --- lark-stubs/parsers/lalr_puppet.pyi | 5 ++-- lark/exceptions.py | 42 +++++++++++++++++------------- lark/parsers/lalr_parser.py | 7 +++-- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/lark-stubs/parsers/lalr_puppet.pyi b/lark-stubs/parsers/lalr_puppet.pyi index c138c32..f35112a 100644 --- a/lark-stubs/parsers/lalr_puppet.pyi +++ b/lark-stubs/parsers/lalr_puppet.pyi @@ -5,8 +5,9 @@ from lark import Token, Tree class ParserPuppet(object): """ - Represents a LalrParser that can be step through. - Shouldn't instantiated by hand, but is accessible as `UnexpectedToken.puppet` + Provides an interface to interactively step through the parser (LALR(1) only for now) + + Accessible via `UnexpectedToken.puppet` (raised by the parser on token error) """ def feed_token(self, token: Token): ... diff --git a/lark/exceptions.py b/lark/exceptions.py index 03f3da4..e1225a9 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -45,7 +45,7 @@ class UnexpectedInput(LarkError): example that bests matches the current error. 
""" assert self.state is not None, "Not supported for this exception" - + if isinstance(examples, dict): examples = examples.items() @@ -57,7 +57,11 @@ class UnexpectedInput(LarkError): try: parse_fn(malformed) except UnexpectedInput as ut: - if ut.state == self.state and (not use_accepts or ut.accepts == self.accepts): + if ut.state == self.state: + if use_accepts and ut.accepts != self.accepts: + logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % + (self.state, self.accepts, ut.accepts, i, j)) + continue try: if ut.token == self.token: # Try exact match first logging.debug("Exact Match at example [%s][%s]" % (i, j)) @@ -74,27 +78,25 @@ class UnexpectedInput(LarkError): if not candidate[0]: logging.debug("Same State match at example [%s][%s]" % (i, j)) candidate = label, False - elif ut.state == self.state: - logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % - (self.state, self.accepts, ut.accepts, i, j)) + return candidate[0] class UnexpectedCharacters(LexError, UnexpectedInput): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): + self.line = line + self.column = column + self.pos_in_stream = lex_pos + self.state = state + + self.allowed = allowed + self.considered_tokens = considered_tokens if isinstance(seq, bytes): message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column) else: message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) - self.line = line - self.column = column - self.allowed = allowed - self.considered_tokens = considered_tokens - self.pos_in_stream = lex_pos - self.state = state - message += '\n\n' + self.get_context(seq) if allowed: message += '\nExpecting: %s\n' % allowed @@ -106,16 +108,20 @@ class UnexpectedCharacters(LexError, UnexpectedInput): class UnexpectedToken(ParseError, 
UnexpectedInput): - def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, accepts=None): - self.token = token - self.expected = expected # XXX str shouldn't necessary + def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') - self.considered_rules = considered_rules - self.state = state self.pos_in_stream = getattr(token, 'pos_in_stream', None) + self.state = state + + self.token = token + self.expected = expected # XXX deprecate? `accepts` is better + self.considered_rules = considered_rules self.puppet = puppet - self.accepts = accepts + + # TODO Only calculate `accepts()` when we need to display it to the user + # This will improve performance when doing automatic error handling + self.accepts = puppet and puppet.accepts() message = ("Unexpected token %r at line %s, column %s.\n" "Expected one of: \n\t* %s\n" diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index ba75606..cf6a4bf 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -62,10 +62,9 @@ class _Parser: expected = {s for s in states[state].keys() if s.isupper()} try: puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) - accepts = puppet.accepts() - except NameError: - puppet = accepts = None - raise UnexpectedToken(token, expected, state=state, puppet=puppet, accepts=accepts) + except NameError: # For standalone parser + puppet = None + raise UnexpectedToken(token, expected, state=state, puppet=puppet) def reduce(rule): size = len(rule.expansion) From 00e736fda3cebfc9766f293fcbf4826e7e7c8103 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 13 Aug 2020 11:48:05 +0300 Subject: [PATCH 102/164] Use accepts in default example (even though it's not necessary) --- examples/error_reporting_lalr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/error_reporting_lalr.py 
b/examples/error_reporting_lalr.py index 5e7d967..f038eda 100644 --- a/examples/error_reporting_lalr.py +++ b/examples/error_reporting_lalr.py @@ -52,7 +52,7 @@ def parse(json_text): '[1,2,]', '{"foo":1,}', '{"foo":false,"bar":true,}'] - }) + }, use_accepts=True) if not exc_class: raise raise exc_class(u.get_context(json_text), u.line, u.column) From 2c7afed894b362dc9b1ea13b658a6094f3c1e281 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 13 Aug 2020 11:55:44 +0300 Subject: [PATCH 103/164] Small fixes --- lark/exceptions.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index e1225a9..7330125 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -43,6 +43,8 @@ class UnexpectedInput(LarkError): """ Given a parser instance and a dictionary mapping some label with some malformed syntax examples, it'll return the label for the example that bests matches the current error. + + It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility. 
""" assert self.state is not None, "Not supported for this exception" @@ -93,10 +95,11 @@ class UnexpectedCharacters(LexError, UnexpectedInput): self.considered_tokens = considered_tokens if isinstance(seq, bytes): - message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column) + _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") else: - message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) + _s = seq[lex_pos] + message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column) message += '\n\n' + self.get_context(seq) if allowed: message += '\nExpecting: %s\n' % allowed From 96873d64ba8ef85fcad1daa2dd2e9bf931eb06ba Mon Sep 17 00:00:00 2001 From: Blank Spruce <32396809+BlankSpruce@users.noreply.github.com> Date: Thu, 13 Aug 2020 18:09:05 +0200 Subject: [PATCH 104/164] Make transformer work with tokens in standalone parser, fixes #648 --- lark/common.py | 3 --- lark/lark.py | 9 ++++++++- lark/parser_frontends.py | 16 +++++++++++++--- tests/test_tools.py | 27 +++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 7 deletions(-) diff --git a/lark/common.py b/lark/common.py index cc8c73c..714399a 100644 --- a/lark/common.py +++ b/lark/common.py @@ -17,9 +17,6 @@ class LexerConf(Serialize): self.skip_validation = skip_validation self.use_bytes = use_bytes - def _deserialize(self): - self.callbacks = {} # TODO - ###} class ParserConf: diff --git a/lark/lark.py b/lark/lark.py index daab45b..3ed96d7 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -344,7 +344,14 @@ class Lark(Serialize): self.rules = [Rule.deserialize(r, memo) for r in data['rules']] self.source = '' self._prepare_callbacks() - self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, re_module) + self.parser = self.parser_class.deserialize( + data['parser'], + memo, + self._callbacks, + self.options.postlex, + 
self.options.transformer, + re_module + ) return self @classmethod diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 33ad9bc..a45bf9c 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,6 +1,6 @@ from .utils import get_regexp_width, Serialize from .parsers.grammar_analysis import GrammarAnalyzer -from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token +from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef from .parsers import earley, xearley, cyk from .parsers.lalr_parser import LALR_Parser from .grammar import Rule @@ -58,6 +58,16 @@ class _ParserFrontend(Serialize): return self.parser.parse(input, start, *args) +def _recreate_lexer_callbacks(memo, transformer): + result = {} + terminals = [item for item in memo.values() if isinstance(item, TerminalDef)] + for terminal in terminals: + callback = getattr(transformer, terminal.name, None) + if callback is not None: + result[terminal.name] = callback + return result + + class WithLexer(_ParserFrontend): lexer = None parser = None @@ -73,10 +83,11 @@ class WithLexer(_ParserFrontend): self.postlex = lexer_conf.postlex @classmethod - def deserialize(cls, data, memo, callbacks, postlex, re_module): + def deserialize(cls, data, memo, callbacks, postlex, transformer, re_module): inst = super(WithLexer, cls).deserialize(data, memo) inst.postlex = postlex inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) + inst.lexer_conf.callbacks = _recreate_lexer_callbacks(memo, transformer) inst.lexer_conf.re_module = re_module inst.lexer_conf.skip_validation=True inst.init_lexer() @@ -229,4 +240,3 @@ class CYK(WithLexer): def _apply_callback(self, tree): return self.callbacks[tree.rule](tree.children) - diff --git a/tests/test_tools.py b/tests/test_tools.py index 1e0d78e..e691237 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -106,6 +106,33 @@ class TestStandalone(TestCase): x = l.parse('(\n)\n') self.assertEqual(x, 
Tree('start', [])) + def test_transformer(self): + grammar = r""" + start: some_rule "(" SOME_TERMINAL ")" + some_rule: SOME_TERMINAL + SOME_TERMINAL: /[A-Za-z_][A-Za-z0-9_]*/ + """ + context = self._create_standalone(grammar) + _Lark = context["Lark_StandAlone"] + + _Token = context["Token"] + _Tree = context["Tree"] + + class MyTransformer(context["Transformer"]): + def SOME_TERMINAL(self, token): + return _Token("SOME_TERMINAL", "token is transformed") + + def some_rule(self, children): + return _Tree("rule_is_transformed", []) + + parser = _Lark(transformer=MyTransformer()) + self.assertEqual( + parser.parse("FOO(BAR)"), + _Tree("start", [ + _Tree("rule_is_transformed", []), + _Token("SOME_TERMINAL", "token is transformed") + ]) + ) if __name__ == '__main__': From 2f4831f9b6dd857dcb3b8d53a8839474d3c5e5f7 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 13 Aug 2020 21:13:42 +0300 Subject: [PATCH 105/164] Small refactor after PR --- lark/lark.py | 12 +++++------- lark/parser_frontends.py | 9 ++++++--- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index 3ed96d7..8371943 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -11,7 +11,7 @@ from .common import LexerConf, ParserConf from .lexer import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken from .parse_tree_builder import ParseTreeBuilder -from .parser_frontends import get_frontend +from .parser_frontends import get_frontend, _get_lexer_callbacks from .grammar import Rule import re @@ -278,12 +278,10 @@ class Lark(Serialize): rule.options.priority = None # TODO Deprecate lexer_callbacks? 
- lexer_callbacks = dict(self.options.lexer_callbacks) - if self.options.transformer: - t = self.options.transformer - for term in self.terminals: - if hasattr(t, term.name): - lexer_callbacks[term.name] = getattr(t, term.name) + lexer_callbacks = (_get_lexer_callbacks(self.options.transformer, self.terminals) + if self.options.transformer + else {}) + lexer_callbacks.update(self.options.lexer_callbacks) self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index a45bf9c..b993b9f 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -58,9 +58,8 @@ class _ParserFrontend(Serialize): return self.parser.parse(input, start, *args) -def _recreate_lexer_callbacks(memo, transformer): +def _get_lexer_callbacks(transformer, terminals): result = {} - terminals = [item for item in memo.values() if isinstance(item, TerminalDef)] for terminal in terminals: callback = getattr(transformer, terminal.name, None) if callback is not None: @@ -85,12 +84,16 @@ class WithLexer(_ParserFrontend): @classmethod def deserialize(cls, data, memo, callbacks, postlex, transformer, re_module): inst = super(WithLexer, cls).deserialize(data, memo) + inst.postlex = postlex inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) - inst.lexer_conf.callbacks = _recreate_lexer_callbacks(memo, transformer) + + terminals = [item for item in memo.values() if isinstance(item, TerminalDef)] + inst.lexer_conf.callbacks = _get_lexer_callbacks(transformer, terminals) inst.lexer_conf.re_module = re_module inst.lexer_conf.skip_validation=True inst.init_lexer() + return inst def _serialize(self, data, memo): From 5559b1a21167c662c385e47e52f27c0cc470c278 Mon Sep 17 00:00:00 2001 From: Blank Spruce <32396809+BlankSpruce@users.noreply.github.com> Date: Fri, 14 Aug 2020 12:08:02 +0200 Subject: [PATCH 
106/164] Add missing elements in standalone parser Add: - missing imports - __version__ variable Additionally regenerated json parser example --- examples/standalone/json_parser.py | 178 ++++++++++++++++++++--------- lark/exceptions.py | 5 +- lark/tools/standalone.py | 2 + lark/tree.py | 4 +- 4 files changed, 134 insertions(+), 55 deletions(-) diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py index c9a5147..cadc51d 100644 --- a/examples/standalone/json_parser.py +++ b/examples/standalone/json_parser.py @@ -1,4 +1,6 @@ # The file was automatically generated by Lark v0.9.0 +__version__ = "0.9.0" + # # # Lark Stand-alone Generator Tool @@ -27,6 +29,9 @@ import os from io import open +import logging + + class LarkError(Exception): pass @@ -54,38 +59,55 @@ class UnexpectedInput(LarkError): pos = self.pos_in_stream start = max(pos - span, 0) end = pos + span - before = text[start:pos].rsplit('\n', 1)[-1] - after = text[pos:end].split('\n', 1)[0] - return before + after + '\n' + ' ' * len(before) + '^\n' + if not isinstance(text, bytes): + before = text[start:pos].rsplit('\n', 1)[-1] + after = text[pos:end].split('\n', 1)[0] + return before + after + '\n' + ' ' * len(before) + '^\n' + else: + before = text[start:pos].rsplit(b'\n', 1)[-1] + after = text[pos:end].split(b'\n', 1)[0] + return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") - def match_examples(self, parse_fn, examples, token_type_match_fallback=False): + def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False): """ Given a parser instance and a dictionary mapping some label with some malformed syntax examples, it'll return the label for the example that bests matches the current error. + + It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility. 
""" assert self.state is not None, "Not supported for this exception" + if isinstance(examples, dict): + examples = examples.items() + candidate = (None, False) - for label, example in examples.items(): + for i, (label, example) in enumerate(examples): assert not isinstance(example, STRING_TYPE) - for malformed in example: + for j, malformed in enumerate(example): try: parse_fn(malformed) except UnexpectedInput as ut: if ut.state == self.state: + if use_accepts and ut.accepts != self.accepts: + logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % + (self.state, self.accepts, ut.accepts, i, j)) + continue try: if ut.token == self.token: # Try exact match first + logging.debug("Exact Match at example [%s][%s]" % (i, j)) return label if token_type_match_fallback: # Fallback to token types match if (ut.token.type == self.token.type) and not candidate[-1]: + logging.debug("Token Type Fallback at example [%s][%s]" % (i, j)) candidate = label, True except AttributeError: pass if not candidate[0]: + logging.debug("Same State match at example [%s][%s]" % (i, j)) candidate = label, False return candidate[0] @@ -93,15 +115,20 @@ class UnexpectedInput(LarkError): class UnexpectedCharacters(LexError, UnexpectedInput): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): - message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) - self.line = line self.column = column - self.allowed = allowed - self.considered_tokens = considered_tokens self.pos_in_stream = lex_pos self.state = state + self.allowed = allowed + self.considered_tokens = considered_tokens + + if isinstance(seq, bytes): + _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") + else: + _s = seq[lex_pos] + + message = "No terminal defined for '%s' at line %d col %d" % (_s, line, column) message += '\n\n' + self.get_context(seq) if allowed: message += '\nExpecting: %s\n' % allowed 
@@ -114,18 +141,23 @@ class UnexpectedCharacters(LexError, UnexpectedInput): class UnexpectedToken(ParseError, UnexpectedInput): def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): - self.token = token - self.expected = expected # XXX str shouldn't necessary self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') - self.considered_rules = considered_rules - self.state = state self.pos_in_stream = getattr(token, 'pos_in_stream', None) + self.state = state + + self.token = token + self.expected = expected # XXX deprecate? `accepts` is better + self.considered_rules = considered_rules self.puppet = puppet + # TODO Only calculate `accepts()` when we need to display it to the user + # This will improve performance when doing automatic error handling + self.accepts = puppet and puppet.accepts() + message = ("Unexpected token %r at line %s, column %s.\n" "Expected one of: \n\t* %s\n" - % (token, self.line, self.column, '\n\t* '.join(self.expected))) + % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) super(UnexpectedToken, self).__init__(message) @@ -286,6 +318,9 @@ def get_regexp_width(expr): raise ValueError(expr) +from collections import OrderedDict + + class Meta: def __init__(self): self.empty = True @@ -364,6 +399,8 @@ class Discard(Exception): # Transformers class _Decoratable: + "Provides support for decorating methods with @v_args" + @classmethod def _apply_decorator(cls, decorator, **kwargs): mro = getmro(cls) @@ -978,8 +1015,7 @@ class Token(Str): try: self = super(Token, cls).__new__(cls, value) except UnicodeDecodeError: - # value = value.decode('latin1') - value = value.decode("ascii", "backslashreplace") + value = value.decode('latin1') self = super(Token, cls).__new__(cls, value) self.type = type_ @@ -1022,8 +1058,8 @@ class Token(Str): class LineCounter: - def __init__(self): - self.newline_char = '\n' + def __init__(self, newline_char): + self.newline_char = 
newline_char self.char_pos = 0 self.line = 1 self.column = 1 @@ -1052,7 +1088,7 @@ class _Lex: def lex(self, stream, newline_types, ignore_types): newline_types = frozenset(newline_types) ignore_types = frozenset(ignore_types) - line_ctr = LineCounter() + line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n') last_token = None while line_ctr.char_pos < len(stream): @@ -1113,7 +1149,7 @@ class CallChain: -def _create_unless(terminals, g_regex_flags, re_): +def _create_unless(terminals, g_regex_flags, re_, use_bytes): tokens_by_type = classify(terminals, lambda t: type(t.pattern)) assert len(tokens_by_type) <= 2, tokens_by_type.keys() embedded_strs = set() @@ -1130,31 +1166,34 @@ def _create_unless(terminals, g_regex_flags, re_): if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) if unless: - callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True)) + callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) terminals = [t for t in terminals if t not in embedded_strs] return terminals, callback -def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_): +def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes): # Python sets an unreasonable group limit (currently 100) in its re module # Worse, the only way to know we reached it is by catching an AssertionError! # This function recursively tries less and less groups until it's successful. postfix = '$' if match_whole else '' mres = [] while terminals: + pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) + if use_bytes: + pattern = pattern.encode('latin-1') try: - mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags) + mre = re_.compile(pattern, g_regex_flags) except AssertionError: # Yes, this is what Python provides us.. 
:/ - return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_) + return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes) # terms_from_name = {t.name: t for t in terminals[:max_size]} mres.append((mre, {i:n for n,i in mre.groupindex.items()} )) terminals = terminals[max_size:] return mres -def build_mres(terminals, g_regex_flags, re_, match_whole=False): - return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_) +def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False): + return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes) def _regexp_has_newline(r): r"""Expressions that may indicate newlines in a regexp: @@ -1204,12 +1243,13 @@ class TraditionalLexer(Lexer): self.terminals = terminals self.user_callbacks = conf.callbacks self.g_regex_flags = conf.g_regex_flags + self.use_bytes = conf.use_bytes self._mres = None # self.build(g_regex_flags) def _build(self): - terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re) + terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes) assert all(self.callback.values()) for type_, f in self.user_callbacks.items(): @@ -1219,7 +1259,7 @@ class TraditionalLexer(Lexer): else: self.callback[type_] = f - self._mres = build_mres(terminals, self.g_regex_flags, self.re) + self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes) @property def mres(self): @@ -1248,7 +1288,8 @@ class ContextualLexer(Lexer): assert t.name not in tokens_by_name, t tokens_by_name[t.name] = t - trad_conf = type(conf)(terminals, conf.re_module, conf.ignore, callbacks=conf.callbacks, g_regex_flags=conf.g_regex_flags, skip_validation=conf.skip_validation) + trad_conf = copy(conf) + trad_conf.tokens = terminals lexer_by_tokens = {} self.lexers = {} @@ -1293,10 +1334,10 @@ class ContextualLexer(Lexer): class 
LexerConf(Serialize): - __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags' + __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes' __serialize_namespace__ = TerminalDef, - def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False): + def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): self.tokens = tokens # TODO should be terminals self.ignore = ignore self.postlex = postlex @@ -1304,9 +1345,7 @@ class LexerConf(Serialize): self.g_regex_flags = g_regex_flags self.re_module = re_module self.skip_validation = skip_validation - - def _deserialize(self): - self.callbacks = {} # TODO + self.use_bytes = use_bytes from functools import partial, wraps @@ -1627,10 +1666,10 @@ class _Parser: try: return states[state][token.type] except KeyError: - expected = [s for s in states[state].keys() if s.isupper()] + expected = {s for s in states[state].keys() if s.isupper()} try: puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) - except NameError: + except NameError: # For standalone parser puppet = None raise UnexpectedToken(token, expected, state=state, puppet=puppet) @@ -1760,7 +1799,14 @@ def get_frontend(parser, lexer): elif lexer == 'contextual': return LALR_ContextualLexer elif issubclass(lexer, Lexer): - return partial(LALR_CustomLexer, lexer) + class LALR_CustomLexerWrapper(LALR_CustomLexer): + def __init__(self, lexer_conf, parser_conf, options=None): + super(LALR_CustomLexerWrapper, self).__init__( + lexer, lexer_conf, parser_conf, options=options) + def init_lexer(self): + self.lexer = lexer(self.lexer_conf) + + return LALR_CustomLexerWrapper else: raise ValueError('Unknown lexer: %s' % lexer) elif parser=='earley': @@ -1793,6 +1839,15 @@ class _ParserFrontend(Serialize): return self.parser.parse(input, start, *args) +def _get_lexer_callbacks(transformer, terminals): + 
result = {} + for terminal in terminals: + callback = getattr(transformer, terminal.name, None) + if callback is not None: + result[terminal.name] = callback + return result + + class WithLexer(_ParserFrontend): lexer = None parser = None @@ -1808,13 +1863,18 @@ class WithLexer(_ParserFrontend): self.postlex = lexer_conf.postlex @classmethod - def deserialize(cls, data, memo, callbacks, postlex, re_module): + def deserialize(cls, data, memo, callbacks, postlex, transformer, re_module): inst = super(WithLexer, cls).deserialize(data, memo) + inst.postlex = postlex inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) + + terminals = [item for item in memo.values() if isinstance(item, TerminalDef)] + inst.lexer_conf.callbacks = _get_lexer_callbacks(transformer, terminals) inst.lexer_conf.re_module = re_module inst.lexer_conf.skip_validation=True inst.init_lexer() + return inst def _serialize(self, data, memo): @@ -1922,6 +1982,7 @@ class LarkOptions(Serialize): invert (Default: auto) lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. + use_bytes - Accept an input of type `bytes` instead of `str` (Python 3 only). 
edit_terminals - A callback """ if __doc__: @@ -1945,6 +2006,7 @@ class LarkOptions(Serialize): 'maybe_placeholders': False, 'edit_terminals': None, 'g_regex_flags': 0, + 'use_bytes': False, } def __init__(self, options_dict): @@ -1954,7 +2016,7 @@ class LarkOptions(Serialize): for name, default in self._defaults.items(): if name in o: value = o.pop(name) - if isinstance(default, bool) and name != 'cache': + if isinstance(default, bool) and name not in ('cache', 'use_bytes'): value = bool(value) else: value = default @@ -2027,6 +2089,13 @@ class Lark(Serialize): grammar = read() assert isinstance(grammar, STRING_TYPE) + self.grammar_source = grammar + if self.options.use_bytes: + if not isascii(grammar): + raise ValueError("Grammar must be ascii only, when use_bytes=True") + if sys.version_info[0] == 2 and self.options.use_bytes != 'force': + raise NotImplementedError("`use_bytes=True` may have issues on python2." + "Use `use_bytes='force'` to use it at your own risk.") cache_fn = None if self.options.cache: @@ -2036,7 +2105,7 @@ class Lark(Serialize): cache_fn = self.options.cache else: if self.options.cache is not True: - raise ValueError("cache must be bool or str") + raise ValueError("cache argument must be bool or str") unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals') from . import __version__ options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable) @@ -2092,7 +2161,7 @@ class Lark(Serialize): for t in self.terminals: self.options.edit_terminals(t) - self._terminals_dict = {t.name:t for t in self.terminals} + self._terminals_dict = {t.name: t for t in self.terminals} # If the user asked to invert the priorities, negate them all here. # This replaces the old 'resolve__antiscore_sum' option. @@ -2109,14 +2178,12 @@ class Lark(Serialize): rule.options.priority = None # TODO Deprecate lexer_callbacks? 
- lexer_callbacks = dict(self.options.lexer_callbacks) - if self.options.transformer: - t = self.options.transformer - for term in self.terminals: - if hasattr(t, term.name): - lexer_callbacks[term.name] = getattr(t, term.name) + lexer_callbacks = (_get_lexer_callbacks(self.options.transformer, self.terminals) + if self.options.transformer + else {}) + lexer_callbacks.update(self.options.lexer_callbacks) - self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags) + self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes) if self.options.parser: self.parser = self._build_parser() @@ -2175,7 +2242,14 @@ class Lark(Serialize): self.rules = [Rule.deserialize(r, memo) for r in data['rules']] self.source = '' self._prepare_callbacks() - self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, re_module) + self.parser = self.parser_class.deserialize( + data['parser'], + memo, + self._callbacks, + self.options.postlex, + self.options.transformer, + re_module + ) return self @classmethod @@ -2244,10 +2318,10 @@ class Lark(Serialize): DATA = ( -{'rules': [{'@': 23}, {'@': 31}, {'@': 26}, {'@': 13}, {'@': 24}, {'@': 19}, {'@': 14}, {'@': 27}, {'@': 28}, {'@': 16}, {'@': 29}, {'@': 12}, {'@': 25}, {'@': 30}, {'@': 20}, {'@': 22}, {'@': 15}, {'@': 21}, {'@': 17}, {'@': 18}], 'parser': {'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], 'ignore': [u'WS'], 'g_regex_flags': 0, '__type__': 'LexerConf'}, 'parser': {'tokens': {0: 'COMMA', 1: 'RSQB', 2: 'RBRACE', 3: '$END', 4: 'LBRACE', 5: u'FALSE', 6: u'string', 7: u'object', 8: u'NULL', 9: u'SIGNED_NUMBER', 10: u'value', 11: u'array', 12: u'ESCAPED_STRING', 13: u'TRUE', 14: 'LSQB', 15: 
'COLON', 16: u'pair', 17: u'__array_star_0', 18: u'__object_star_1', 19: 'start'}, 'states': {0: {0: (1, {'@': 12}), 1: (1, {'@': 12}), 2: (1, {'@': 12}), 3: (1, {'@': 12})}, 1: {1: (0, 29), 4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 6), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 2: {0: (0, 23), 2: (0, 0)}, 3: {15: (0, 12)}, 4: {16: (0, 13), 12: (0, 21), 6: (0, 3)}, 5: {0: (1, {'@': 13}), 1: (1, {'@': 13}), 2: (1, {'@': 13}), 3: (1, {'@': 13})}, 6: {0: (0, 7), 1: (0, 11), 17: (0, 17)}, 7: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 9), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 8: {0: (1, {'@': 14}), 1: (1, {'@': 14}), 2: (1, {'@': 14}), 3: (1, {'@': 14})}, 9: {0: (1, {'@': 15}), 1: (1, {'@': 15})}, 10: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 20), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 11: {0: (1, {'@': 16}), 1: (1, {'@': 16}), 2: (1, {'@': 16}), 3: (1, {'@': 16})}, 12: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 18), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 13: {0: (1, {'@': 17}), 2: (1, {'@': 17})}, 14: {}, 15: {0: (1, {'@': 18}), 2: (1, {'@': 18})}, 16: {0: (1, {'@': 19}), 1: (1, {'@': 19}), 2: (1, {'@': 19}), 3: (1, {'@': 19})}, 17: {0: (0, 10), 1: (0, 28)}, 18: {0: (1, {'@': 20}), 2: (1, {'@': 20})}, 19: {0: (0, 4), 18: (0, 2), 2: (0, 25)}, 20: {0: (1, {'@': 21}), 1: (1, {'@': 21})}, 21: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 2: (1, {'@': 22}), 3: (1, {'@': 22}), 15: (1, {'@': 22})}, 22: {3: (1, {'@': 23})}, 23: {16: (0, 15), 12: (0, 21), 6: (0, 3)}, 24: {0: (1, {'@': 24}), 1: (1, {'@': 24}), 2: (1, {'@': 24}), 3: (1, {'@': 24})}, 25: {0: (1, {'@': 25}), 1: (1, {'@': 25}), 2: (1, {'@': 25}), 3: (1, {'@': 25})}, 26: {0: (1, {'@': 26}), 1: (1, {'@': 26}), 2: (1, {'@': 26}), 3: (1, {'@': 26})}, 27: {0: (1, {'@': 27}), 1: (1, {'@': 27}), 2: (1, {'@': 27}), 3: (1, {'@': 27})}, 28: 
{0: (1, {'@': 28}), 1: (1, {'@': 28}), 2: (1, {'@': 28}), 3: (1, {'@': 28})}, 29: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 2: (1, {'@': 29}), 3: (1, {'@': 29})}, 30: {0: (1, {'@': 30}), 1: (1, {'@': 30}), 2: (1, {'@': 30}), 3: (1, {'@': 30})}, 31: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 2: (1, {'@': 31}), 3: (1, {'@': 31})}, 32: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 22), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1), 19: (0, 14)}, 33: {16: (0, 19), 2: (0, 30), 12: (0, 21), 6: (0, 3)}}, 'end_states': {'start': 14}, 'start_states': {'start': 32}}, '__type__': 'LALR_ContextualLexer', 'start': ['start']}, '__type__': 'Lark', 'options': {'regex': False, 'transformer': None, 'lexer': 'contextual', 'lexer_callbacks': {}, 'start': ['start'], 'debug': False, 'postlex': None, 'parser': 'lalr', 'tree_class': None, 'priority': None, 'cache': False, 'g_regex_flags': 0, 'keep_all_tokens': False, 'ambiguity': 'auto', 'edit_terminals': None, 'propagate_positions': False, 'maybe_placeholders': False}} +{'parser': {'parser': {'tokens': {0: 'RSQB', 1: 'COMMA', 2: '$END', 3: 'RBRACE', 4: 'ESCAPED_STRING', 5: 'string', 6: 'pair', 7: 'LSQB', 8: 'LBRACE', 9: 'SIGNED_NUMBER', 10: 'NULL', 11: 'FALSE', 12: 'value', 13: 'array', 14: 'object', 15: 'TRUE', 16: '__array_star_0', 17: 'COLON', 18: '__object_star_1', 19: 'start'}, 'states': {0: {0: (1, {'@': 12}), 1: (1, {'@': 12}), 2: (1, {'@': 12}), 3: (1, {'@': 12})}, 1: {0: (1, {'@': 13}), 1: (1, {'@': 13}), 2: (1, {'@': 13}), 3: (1, {'@': 13})}, 2: {1: (0, 25), 0: (0, 19)}, 3: {0: (1, {'@': 14}), 1: (1, {'@': 14}), 2: (1, {'@': 14}), 3: (1, {'@': 14})}, 4: {4: (0, 31), 5: (0, 13), 6: (0, 26)}, 5: {0: (1, {'@': 15}), 1: (1, {'@': 15}), 2: (1, {'@': 15}), 3: (1, {'@': 15})}, 6: {0: (1, {'@': 16}), 1: (1, {'@': 16}), 2: (1, {'@': 16}), 3: (1, {'@': 16})}, 7: {0: (1, {'@': 17}), 1: (1, {'@': 17}), 2: (1, {'@': 17}), 3: (1, {'@': 17})}, 8: {1: (0, 14), 3: (0, 28)}, 9: {0: (0, 21), 7: (0, 9), 8: (0, 18), 9: 
(0, 0), 10: (0, 1), 11: (0, 29), 5: (0, 5), 12: (0, 10), 13: (0, 7), 14: (0, 33), 4: (0, 31), 15: (0, 24)}, 10: {1: (0, 20), 16: (0, 2), 0: (0, 3)}, 11: {0: (1, {'@': 18}), 1: (1, {'@': 18})}, 12: {2: (1, {'@': 19})}, 13: {17: (0, 32)}, 14: {5: (0, 13), 4: (0, 31), 6: (0, 23)}, 15: {18: (0, 8), 1: (0, 4), 3: (0, 17)}, 16: {0: (1, {'@': 20}), 1: (1, {'@': 20})}, 17: {0: (1, {'@': 21}), 1: (1, {'@': 21}), 2: (1, {'@': 21}), 3: (1, {'@': 21})}, 18: {4: (0, 31), 6: (0, 15), 5: (0, 13), 3: (0, 6)}, 19: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 2: (1, {'@': 22}), 3: (1, {'@': 22})}, 20: {7: (0, 9), 8: (0, 18), 12: (0, 11), 9: (0, 0), 14: (0, 33), 10: (0, 1), 4: (0, 31), 15: (0, 24), 5: (0, 5), 11: (0, 29), 13: (0, 7)}, 21: {0: (1, {'@': 23}), 1: (1, {'@': 23}), 2: (1, {'@': 23}), 3: (1, {'@': 23})}, 22: {1: (1, {'@': 24}), 3: (1, {'@': 24})}, 23: {1: (1, {'@': 25}), 3: (1, {'@': 25})}, 24: {0: (1, {'@': 26}), 1: (1, {'@': 26}), 2: (1, {'@': 26}), 3: (1, {'@': 26})}, 25: {7: (0, 9), 12: (0, 16), 8: (0, 18), 9: (0, 0), 14: (0, 33), 10: (0, 1), 4: (0, 31), 15: (0, 24), 5: (0, 5), 11: (0, 29), 13: (0, 7)}, 26: {1: (1, {'@': 27}), 3: (1, {'@': 27})}, 27: {7: (0, 9), 8: (0, 18), 12: (0, 12), 9: (0, 0), 10: (0, 1), 11: (0, 29), 5: (0, 5), 13: (0, 7), 14: (0, 33), 4: (0, 31), 15: (0, 24), 19: (0, 30)}, 28: {0: (1, {'@': 28}), 1: (1, {'@': 28}), 2: (1, {'@': 28}), 3: (1, {'@': 28})}, 29: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 2: (1, {'@': 29}), 3: (1, {'@': 29})}, 30: {}, 31: {17: (1, {'@': 30}), 0: (1, {'@': 30}), 1: (1, {'@': 30}), 2: (1, {'@': 30}), 3: (1, {'@': 30})}, 32: {7: (0, 9), 8: (0, 18), 12: (0, 22), 9: (0, 0), 14: (0, 33), 10: (0, 1), 4: (0, 31), 15: (0, 24), 5: (0, 5), 11: (0, 29), 13: (0, 7)}, 33: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 2: (1, {'@': 31}), 3: (1, {'@': 31})}}, 'start_states': {'start': 27}, 'end_states': {'start': 30}}, 'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 
10}, {'@': 11}], 'ignore': ['WS'], 'g_regex_flags': 0, 'use_bytes': False, '__type__': 'LexerConf'}, 'start': ['start'], '__type__': 'LALR_ContextualLexer'}, 'rules': [{'@': 19}, {'@': 31}, {'@': 17}, {'@': 15}, {'@': 12}, {'@': 26}, {'@': 29}, {'@': 13}, {'@': 22}, {'@': 14}, {'@': 23}, {'@': 28}, {'@': 21}, {'@': 16}, {'@': 24}, {'@': 30}, {'@': 18}, {'@': 20}, {'@': 27}, {'@': 25}], 'options': {'debug': False, 'keep_all_tokens': False, 'tree_class': None, 'cache': False, 'postlex': None, 'parser': 'lalr', 'lexer': 'contextual', 'transformer': None, 'start': ['start'], 'priority': None, 'ambiguity': 'auto', 'regex': False, 'propagate_positions': False, 'lexer_callbacks': {}, 'maybe_placeholders': False, 'edit_terminals': None, 'g_regex_flags': 0, 'use_bytes': False}, '__type__': 'Lark'} ) MEMO = ( -{0: {'priority': 1, 'pattern': {'__type__': 'PatternRE', '_width': [2, 4294967295], 'flags': [], 'value': u'\\".*?(? Date: Fri, 14 Aug 2020 16:17:26 +0300 Subject: [PATCH 107/164] Adjustments to logging PR --- lark/__init__.py | 2 +- lark/common.py | 7 --- lark/exceptions.py | 11 ++-- lark/lark.py | 4 +- lark/parsers/earley.py | 2 +- lark/parsers/lalr_analysis.py | 5 +- lark/utils.py | 96 ++++++++++++++++++----------------- 7 files changed, 61 insertions(+), 66 deletions(-) diff --git a/lark/__init__.py b/lark/__init__.py index 9bd88b0..1b5e7e3 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -1,4 +1,4 @@ -from .common import logger +from .utils import logger from .tree import Tree from .visitors import Transformer, Visitor, v_args, Discard from .visitors import InlineTransformer, inline_args # XXX Deprecated diff --git a/lark/common.py b/lark/common.py index b333dcb..714399a 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,13 +1,6 @@ -import logging from .utils import Serialize from .lexer import TerminalDef -logger = logging.getLogger("lark") -logger.addHandler(logging.StreamHandler()) -# Set to highest level, since we have some warnings amongst the code 
-# By default, we should not output any log messages -logger.setLevel(logging.CRITICAL) - ###{standalone class LexerConf(Serialize): diff --git a/lark/exceptions.py b/lark/exceptions.py index d1b956d..9d2d8dc 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -1,7 +1,6 @@ -from .utils import STRING_TYPE +from .utils import STRING_TYPE, logger ###{standalone -import logging class LarkError(Exception): @@ -62,24 +61,24 @@ class UnexpectedInput(LarkError): except UnexpectedInput as ut: if ut.state == self.state: if use_accepts and ut.accepts != self.accepts: - logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % + logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % (self.state, self.accepts, ut.accepts, i, j)) continue try: if ut.token == self.token: # Try exact match first - logging.debug("Exact Match at example [%s][%s]" % (i, j)) + logger.debug("Exact Match at example [%s][%s]" % (i, j)) return label if token_type_match_fallback: # Fallback to token types match if (ut.token.type == self.token.type) and not candidate[-1]: - logging.debug("Token Type Fallback at example [%s][%s]" % (i, j)) + logger.debug("Token Type Fallback at example [%s][%s]" % (i, j)) candidate = label, True except AttributeError: pass if not candidate[0]: - logging.debug("Same State match at example [%s][%s]" % (i, j)) + logger.debug("Same State match at example [%s][%s]" % (i, j)) candidate = label, False return candidate[0] diff --git a/lark/lark.py b/lark/lark.py index ddea2d6..9a4e001 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -4,10 +4,10 @@ import sys, os, pickle, hashlib from io import open -from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii +from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger from .load_grammar import load_grammar from .tree import Tree -from .common import LexerConf, ParserConf, logger +from .common import LexerConf, ParserConf from .lexer 
import Lexer, TraditionalLexer, TerminalDef, UnexpectedToken from .parse_tree_builder import ParseTreeBuilder diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index bf099e6..098639d 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -14,7 +14,7 @@ from collections import deque from ..visitors import Transformer_InPlace, v_args from ..exceptions import UnexpectedEOF, UnexpectedToken -from ..common import logger +from ..utils import logger from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal from .earley_common import Item, TransitiveItem diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 861941f..7a94b4d 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -6,11 +6,10 @@ For now, shift/reduce conflicts are automatically resolved as shifts. # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -from collections import defaultdict, deque +from collections import defaultdict -from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator +from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger from ..exceptions import GrammarError -from ..common import logger from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet from ..grammar import Rule diff --git a/lark/utils.py b/lark/utils.py index c70b947..0c41e6b 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -4,51 +4,15 @@ from functools import reduce from ast import literal_eval from collections import deque -class fzset(frozenset): - def __repr__(self): - return '{%s}' % ', '.join(map(repr, self)) - - -def classify_bool(seq, pred): - true_elems = [] - false_elems = [] - - for elem in seq: - if pred(elem): - true_elems.append(elem) - else: - false_elems.append(elem) - - return true_elems, false_elems - - - -def bfs(initial, expand): - open_q = deque(list(initial)) - visited = set(open_q) - while open_q: - node = open_q.popleft() - yield node - for next_node 
in expand(node): - if next_node not in visited: - visited.add(next_node) - open_q.append(next_node) - - +###{standalone +import logging +logger = logging.getLogger("lark") +logger.addHandler(logging.StreamHandler()) +# Set to highest level, since we have some warnings amongst the code +# By default, we should not output any log messages +logger.setLevel(logging.CRITICAL) -def _serialize(value, memo): - if isinstance(value, Serialize): - return value.serialize(memo) - elif isinstance(value, list): - return [_serialize(elem, memo) for elem in value] - elif isinstance(value, frozenset): - return list(value) # TODO reversible? - elif isinstance(value, dict): - return {key:_serialize(elem, memo) for key, elem in value.items()} - return value - -###{standalone def classify(seq, key=None, value=None): d = {} for item in seq: @@ -302,13 +266,11 @@ def combine_alternatives(lists): return reduce(lambda a,b: [i+[j] for i in a for j in b], lists[1:], init) - class FS: open = open exists = os.path.exists - def isascii(s): """ str.isascii only exists in python3.7+ """ try: @@ -318,4 +280,46 @@ def isascii(s): s.encode('ascii') return True except (UnicodeDecodeError, UnicodeEncodeError): - return False \ No newline at end of file + return False + + +class fzset(frozenset): + def __repr__(self): + return '{%s}' % ', '.join(map(repr, self)) + + +def classify_bool(seq, pred): + true_elems = [] + false_elems = [] + + for elem in seq: + if pred(elem): + true_elems.append(elem) + else: + false_elems.append(elem) + + return true_elems, false_elems + + +def bfs(initial, expand): + open_q = deque(list(initial)) + visited = set(open_q) + while open_q: + node = open_q.popleft() + yield node + for next_node in expand(node): + if next_node not in visited: + visited.add(next_node) + open_q.append(next_node) + + +def _serialize(value, memo): + if isinstance(value, Serialize): + return value.serialize(memo) + elif isinstance(value, list): + return [_serialize(elem, memo) for elem in value] + 
elif isinstance(value, frozenset): + return list(value) # TODO reversible? + elif isinstance(value, dict): + return {key:_serialize(elem, memo) for key, elem in value.items()} + return value \ No newline at end of file From 39fb4c0f3e2c1c24ceeb4de29d6904a957eaaaf1 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 14 Aug 2020 16:34:51 +0300 Subject: [PATCH 108/164] Bugfix and warn on ambiguous intermediate nodes, based on PR #651 --- lark/parsers/earley_forest.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py index c8b4f25..4ed75d9 100644 --- a/lark/parsers/earley_forest.py +++ b/lark/parsers/earley_forest.py @@ -13,6 +13,7 @@ from collections import deque from operator import attrgetter from importlib import import_module +from ..utils import logger from ..tree import Tree from ..exceptions import ParseError @@ -328,10 +329,17 @@ class ForestToAmbiguousTreeVisitor(ForestToTreeVisitor): self.output_stack[-1].children.append(node) def visit_symbol_node_in(self, node): - if self.forest_sum_visitor and node.is_ambiguous and isinf(node.priority): - self.forest_sum_visitor.visit(node) - if not node.is_intermediate and node.is_ambiguous: - self.output_stack.append(Tree('_ambig', [])) + if node.is_ambiguous: + if self.forest_sum_visitor and isinf(node.priority): + self.forest_sum_visitor.visit(node) + if node.is_intermediate: + # TODO Support ambiguous intermediate nodes! + logger.warning("Ambiguous intermediate node in the SPPF: %s. 
" + "Lark does not currently process these ambiguities; resolving with the first derivation.", node) + return next(iter(node.children)) + else: + self.output_stack.append(Tree('_ambig', [])) + return iter(node.children) def visit_symbol_node_out(self, node): From 00d953d71a20d78c35723abf58895162fff24831 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 13:51:53 +0530 Subject: [PATCH 109/164] first cut sphinx done --- docs/Makefile | 20 +++ docs/{ => _static}/lark_cheatsheet.pdf | Bin docs/classes.md | 2 + docs/conf.py | 177 +++++++++++++++++++++++++ docs/features.md | 8 +- docs/grammar.md | 12 +- docs/how_to_use.md | 6 +- docs/index.md | 55 -------- docs/index.rst | 64 +++++++++ docs/json_tutorial.md | 3 +- docs/make.bat | 36 +++++ docs/parsers.md | 8 +- docs/philosophy.md | 4 +- docs/tree_construction.md | 6 +- docs/visitors.md | 2 +- 15 files changed, 319 insertions(+), 84 deletions(-) create mode 100644 docs/Makefile rename docs/{ => _static}/lark_cheatsheet.pdf (100%) create mode 100644 docs/conf.py delete mode 100644 docs/index.md create mode 100644 docs/index.rst create mode 100644 docs/make.bat diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..58127b4 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = Lark +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/lark_cheatsheet.pdf b/docs/_static/lark_cheatsheet.pdf similarity index 100% rename from docs/lark_cheatsheet.pdf rename to docs/_static/lark_cheatsheet.pdf diff --git a/docs/classes.md b/docs/classes.md index 7bd92fe..4ec7f4c 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -2,6 +2,8 @@ This page details the important classes in Lark. +**TODO** convert to sphinx autodoc! + ---- ## lark.Lark diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..a522559 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Lark documentation build configuration file, created by +# sphinx-quickstart on Sun Aug 16 13:09:41 2020. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.coverage', + 'recommonmark' +] + +# Add any paths that contain templates here, relative to this directory. 
+templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown' +} + + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'Lark' +copyright = '2020, Erez Shinan' +author = 'Erez Shinan' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '' +# The full version, including alpha/beta/rc tags. +release = '' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. 
They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# This is required for the alabaster theme +# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars +html_sidebars = { + '**': [ + 'relations.html', # needs 'show_related': True theme option to display + 'searchbox.html', + ] +} + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Larkdoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'Lark.tex', 'Lark Documentation', + 'Erez Shinan', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'lark', 'Lark Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Lark', 'Lark Documentation', + author, 'Lark', 'One line description of project.', + 'Miscellaneous'), +] + + + diff --git a/docs/features.md b/docs/features.md index 9346989..68cde87 100644 --- a/docs/features.md +++ b/docs/features.md @@ -1,4 +1,6 @@ -# Main Features +# Features + +## Main Features - Earley parser, capable of parsing any context-free grammar - Implements SPPF, for efficient parsing and storing of ambiguous grammars. - LALR(1) parser, limited in power of expression, but very efficient in space and performance (O(n)). @@ -18,10 +20,10 @@ [Read more about the parsers](parsers.md) -# Extra features +## Extra features - Import rules and tokens from other Lark grammars, for code reuse and modularity. - - Import grammars from Nearley.js ([read more](/docs/nearley.md)) + - Import grammars from Nearley.js ([read more](nearley.md)) - CYK parser ### Experimental features diff --git a/docs/grammar.md b/docs/grammar.md index d4ecec5..7db6a3c 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -1,13 +1,5 @@ # Grammar Reference -Table of contents: - -1. [Definitions](#defs) -1. [Terminals](#terms) -1. [Rules](#rules) -1. [Directives](#dirs) - - ## Definitions A **grammar** is a list of rules and terminals, that together define a language. @@ -20,7 +12,7 @@ Each rule is a list of terminals and rules, whose location and nesting define th A **parsing algorithm** is an algorithm that takes a grammar definition and a sequence of symbols (members of the alphabet), and matches the entirety of the sequence by searching for a structure that is allowed by the grammar. -## General Syntax and notes +### General Syntax and notes Grammars in Lark are based on [EBNF](https://en.wikipedia.org/wiki/Extended_Backus–Naur_form) syntax, with several enhancements. 
@@ -58,7 +50,6 @@ Lark begins the parse with the rule 'start', unless specified otherwise in the o Names of rules are always in lowercase, while names of terminals are always in uppercase. This distinction has practical effects, for the shape of the generated parse-tree, and the automatic construction of the lexer (aka tokenizer, or scanner). - ## Terminals Terminals are used to match text into symbols. They can be defined as a combination of literals and other terminals. @@ -190,7 +181,6 @@ _ambig ``` - ## Rules **Syntax:** diff --git a/docs/how_to_use.md b/docs/how_to_use.md index 886b440..c4ba4dd 100644 --- a/docs/how_to_use.md +++ b/docs/how_to_use.md @@ -22,11 +22,11 @@ Of course, some specific use-cases may deviate from this process. Feel free to s Browse the [Examples](https://github.com/lark-parser/lark/tree/master/examples) to find a template that suits your purposes. -Read the tutorials to get a better understanding of how everything works. (links in the [main page](/)) +Read the tutorials to get a better understanding of how everything works. (links in the [main page](/index)) -Use the [Cheatsheet (PDF)](lark_cheatsheet.pdf) for quick reference. +Use the [Cheatsheet (PDF)](/_static/lark_cheatsheet.pdf) for quick reference. -Use the reference pages for more in-depth explanations. (links in the [main page](/)] +Use the reference pages for more in-depth explanations. (links in the [main page](/index)] ## LALR usage diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 1310be2..0000000 --- a/docs/index.md +++ /dev/null @@ -1,55 +0,0 @@ -# Lark - -A modern parsing library for Python - -## Overview - -Lark can parse any context-free grammar. 
- -Lark provides: - -- Advanced grammar language, based on EBNF -- Three parsing algorithms to choose from: Earley, LALR(1) and CYK -- Automatic tree construction, inferred from your grammar -- Fast unicode lexer with regexp support, and automatic line-counting - -Lark's code is hosted on Github: [https://github.com/lark-parser/lark](https://github.com/lark-parser/lark) - -### Install -```bash -$ pip install lark-parser -``` - -#### Syntax Highlighting - -- [Sublime Text & TextMate](https://github.com/lark-parser/lark_syntax) -- [Visual Studio Code](https://github.com/lark-parser/vscode-lark) (Or install through the vscode plugin system) -- [Intellij & PyCharm](https://github.com/lark-parser/intellij-syntax-highlighting) - ------ - -## Documentation Index - - -* [Philosophy & Design Choices](philosophy.md) -* [Full List of Features](features.md) -* [Examples](https://github.com/lark-parser/lark/tree/master/examples) -* [Online IDE](https://lark-parser.github.io/lark/ide/app.html) -* Tutorials - * [How to write a DSL](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - Implements a toy LOGO-like language with an interpreter - * [How to write a JSON parser](json_tutorial.md) - Teaches you how to use Lark - * Unofficial - * [Program Synthesis is Possible](https://www.cs.cornell.edu/~asampson/blog/minisynth.html) - Creates a DSL for Z3 -* Guides - * [How to use Lark](how_to_use.md) - * [How to develop Lark](how_to_develop.md) -* Reference - * [Grammar](grammar.md) - * [Tree Construction](tree_construction.md) - * [Visitors & Transformers](visitors.md) - * [Classes](classes.md) - * [Cheatsheet (PDF)](lark_cheatsheet.pdf) - * [Importing grammars from Nearley](nearley.md) -* Discussion - * [Gitter](https://gitter.im/lark-parser/Lobby) - * [Forum (Google Groups)](https://groups.google.com/forum/#!forum/lark-parser) diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..f6611ce --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,64 @@ +.. 
Lark documentation master file, created by + sphinx-quickstart on Sun Aug 16 13:09:41 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to Lark's documentation! +================================ + +.. toctree:: + :maxdepth: 2 + :hidden: + + philosophy + features + parsers + +.. toctree:: + :maxdepth: 2 + :caption: Tutorials & Guides + :hidden: + + json_tutorial + how_to_use + how_to_develop + nearley + recipes + + +.. toctree:: + :maxdepth: 2 + :caption: Reference + :hidden: + + grammar + tree_construction + visitors + classes + + +Lark is a modern parsing library for Python. Lark can parse any context-free grammar. + +Lark provides: + +- Advanced grammar language, based on EBNF +- Three parsing algorithms to choose from: Earley, LALR(1) and CYK +- Automatic tree construction, inferred from your grammar +- Fast unicode lexer with regexp support, and automatic line-counting + + +**Install Lark**: + +.. code:: bash + + $ pip install lark-parser + +**Syntax Highlighting**: + +- `Sublime Text & TextMate`_ +- `Visual Studio Code`_ (Or install through the vscode plugin system) +- `Intellij & PyCharm`_ + +.. _Sublime Text & TextMate: https://github.com/lark-parser/lark_syntax +.. _Visual Studio Code: https://github.com/lark-parser/vscode-lark +.. _Intellij & PyCharm: https://github.com/lark-parser/intellij-syntax-highlighting \ No newline at end of file diff --git a/docs/json_tutorial.md b/docs/json_tutorial.md index 9cc87e7..aa9544d 100644 --- a/docs/json_tutorial.md +++ b/docs/json_tutorial.md @@ -1,7 +1,6 @@ -# Lark Tutorial - JSON parser +# JSON parser - Tutorial Lark is a parser - a program that accepts a grammar and text, and produces a structured tree that represents that text. - In this tutorial we will write a JSON parser in Lark, and explore Lark's various features in the process. It has 5 parts. 
diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..4f2e286 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build +set SPHINXPROJ=Lark + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/parsers.md b/docs/parsers.md index cff5a4b..7a05f93 100644 --- a/docs/parsers.md +++ b/docs/parsers.md @@ -1,7 +1,7 @@ - +# Parsers Lark implements the following parsing algorithms: Earley, LALR(1), and CYK -# Earley +## Earley An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser capable of parsing any context-free grammar at O(n^3), and O(n^2) when the grammar is unambiguous. It can parse most LR grammars at O(n). Most programming languages are LR, and can be parsed at a linear time. @@ -30,7 +30,7 @@ Lark provides the following options to combat ambiguity: **TODO: Add documentation on dynamic_complete** -# LALR(1) +## LALR(1) [LALR(1)](https://www.wikiwand.com/en/LALR_parser) is a very efficient, true-and-tested parsing algorithm. It's incredibly fast and requires very little memory. It can parse most programming languages (For example: Python and Java). 
@@ -42,7 +42,7 @@ The contextual lexer communicates with the parser, and uses the parser's lookahe This is an improvement to LALR(1) that is unique to Lark. -# CYK Parser +## CYK Parser A [CYK parser](https://www.wikiwand.com/en/CYK_algorithm) can parse any context-free grammar at O(n^3*|G|). diff --git a/docs/philosophy.md b/docs/philosophy.md index a2097d0..a1d8f8c 100644 --- a/docs/philosophy.md +++ b/docs/philosophy.md @@ -4,7 +4,7 @@ Parsers are innately complicated and confusing. They're difficult to understand, Lark's mission is to make the process of writing them as simple and abstract as possible, by following these design principles: -### Design Principles +## Design Principles 1. Readability matters @@ -23,7 +23,7 @@ In accordance with these principles, I arrived at the following design choices: ----------- -# Design Choices +## Design Choices ### 1. Separation of code and grammar diff --git a/docs/tree_construction.md b/docs/tree_construction.md index a4d6088..50ce0ee 100644 --- a/docs/tree_construction.md +++ b/docs/tree_construction.md @@ -1,4 +1,4 @@ -# Automatic Tree Construction - Reference +# Tree Construction Reference Lark builds a tree automatically based on the structure of the grammar, where each rule that is matched becomes a branch (node) in the tree, and its children are its matches, in the order of matching. @@ -13,7 +13,7 @@ If `maybe_placeholders=False` (the default), then `[]` behaves like `()?`. If `maybe_placeholders=True`, then using `[item]` will return the item if it matched, or the value `None`, if it didn't. -### Terminals +## Terminals Terminals are always values in the tree, never branches. @@ -74,7 +74,7 @@ Lark will parse "((hello world))" as: The brackets do not appear in the tree by design. The words appear because they are matched by a named terminal. -# Shaping the tree +## Shaping the tree Users can alter the automatic construction of the tree using a collection of grammar features. 
diff --git a/docs/visitors.md b/docs/visitors.md index dcdc8f8..146af1c 100644 --- a/docs/visitors.md +++ b/docs/visitors.md @@ -1,4 +1,4 @@ -## Transformers & Visitors +# Transformers & Visitors Transformers & Visitors provide a convenient interface to process the parse-trees that Lark returns. From e58642e2d3d175cd1ca7498843627b0b89b4bffc Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 14:29:02 +0530 Subject: [PATCH 110/164] use sphinx instead of mkdocs --- mkdocs.yml | 16 ---------------- readthedocs.yml | 9 +++------ 2 files changed, 3 insertions(+), 22 deletions(-) delete mode 100644 mkdocs.yml diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index 8d2a562..0000000 --- a/mkdocs.yml +++ /dev/null @@ -1,16 +0,0 @@ -site_name: Lark -theme: readthedocs -pages: - - Main Page: index.md - - Philosophy: philosophy.md - - Features: features.md - - Parsers: parsers.md - - How To Use (Guide): how_to_use.md - - How To Develop (Guide): how_to_develop.md - - Grammar Reference: grammar.md - - Tree Construction Reference: tree_construction.md - - Visitors and Transformers: visitors.md - - Classes Reference: classes.md - - Recipes: recipes.md - - Import grammars from Nearley: nearley.md - - Tutorial - JSON Parser: json_tutorial.md diff --git a/readthedocs.yml b/readthedocs.yml index 080eeeb..bda2e6c 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,10 +1,7 @@ version: 2 -mkdocs: - configuration: mkdocs.yml - fail_on_warning: false - formats: all -python: - version: 3.5 +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py \ No newline at end of file From 25bb0f6280ab448eb1862ae903c17f23e3b4be57 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 14:31:27 +0530 Subject: [PATCH 111/164] remove theme --- docs/_config.yml | 1 - 1 file changed, 1 deletion(-) delete mode 100644 docs/_config.yml diff --git a/docs/_config.yml b/docs/_config.yml deleted file mode 100644 index 
c741881..0000000 --- a/docs/_config.yml +++ /dev/null @@ -1 +0,0 @@ -theme: jekyll-theme-slate \ No newline at end of file From 0c76844ae7f7c7a27737f63355d0f7b5a6d4c3a4 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 14:33:26 +0530 Subject: [PATCH 112/164] move to _static and fix links in readme --- .gitignore | 1 + README.md | 6 +++--- docs/{ => _static}/comparison_memory.png | Bin docs/{ => _static}/comparison_runtime.png | Bin 4 files changed, 4 insertions(+), 3 deletions(-) rename docs/{ => _static}/comparison_memory.png (100%) rename docs/{ => _static}/comparison_runtime.png (100%) diff --git a/.gitignore b/.gitignore index d5fc864..62b900c 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ tags .mypy_cache /dist /build +docs/_build \ No newline at end of file diff --git a/README.md b/README.md index 69ccb2b..8bc45f5 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h ### Quick links - [Documentation @readthedocs](https://lark-parser.readthedocs.io/) -- [Cheatsheet (PDF)](/docs/lark_cheatsheet.pdf) +- [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf) - [Online IDE (very basic)](https://lark-parser.github.io/lark/ide/app.html) - [Tutorial](/docs/json_tutorial.md) for writing a JSON parser. - Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) @@ -113,9 +113,9 @@ See the full list of [features here](https://lark-parser.readthedocs.io/en/lates Lark is the fastest and lightest (lower is better) -![Run-time Comparison](docs/comparison_runtime.png) +![Run-time Comparison](docs/_static/comparison_runtime.png) -![Memory Usage Comparison](docs/comparison_memory.png) +![Memory Usage Comparison](docs/_static/comparison_memory.png) Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more details on how the comparison was made. 
diff --git a/docs/comparison_memory.png b/docs/_static/comparison_memory.png similarity index 100% rename from docs/comparison_memory.png rename to docs/_static/comparison_memory.png diff --git a/docs/comparison_runtime.png b/docs/_static/comparison_runtime.png similarity index 100% rename from docs/comparison_runtime.png rename to docs/_static/comparison_runtime.png From cc24dcffa48ec3127ff97cfc741d5db2dcf61358 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 15:08:34 +0530 Subject: [PATCH 113/164] document lark.Lark --- docs/classes.md | 286 ----------------------------------------------- docs/classes.rst | 5 + docs/conf.py | 10 +- lark/lark.py | 47 ++++++-- 4 files changed, 48 insertions(+), 300 deletions(-) delete mode 100644 docs/classes.md create mode 100644 docs/classes.rst diff --git a/docs/classes.md b/docs/classes.md deleted file mode 100644 index 4ec7f4c..0000000 --- a/docs/classes.md +++ /dev/null @@ -1,286 +0,0 @@ -# Classes Reference - -This page details the important classes in Lark. - -**TODO** convert to sphinx autodoc! - ----- - -## lark.Lark - -The Lark class is the main interface for the library. It's mostly a thin wrapper for the many different parsers, and for the tree constructor. - -#### Lark.\_\_init\_\_ -```python -def __init__(self, grammar_string, **options): ... -``` -Creates an instance of Lark with the given grammar - -Example: - -```python - >>> Lark(r'''start: "foo" ''') - Lark(...) -``` - -#### Lark.open -```python -def open(cls, grammar_filename, rel_to=None, **options): ... -``` - -Creates an instance of Lark with the grammar given by its filename - -If rel_to is provided, the function will find the grammar filename in relation to it. - -Example: - -```python - >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr") - Lark(...) -``` - -#### Lark.parse - -```python -def parse(self, text, start=None, on_error=None): ... -``` - -Parse the given text, according to the options provided. 
- -Returns a complete parse tree for the text (of type Tree) - -If a transformer is supplied to `__init__`, returns whatever is the result of the transformation. - -Parameters: - -* start: str - required if Lark was given multiple possible start symbols (using the start option). - -* on_error: function - if provided, will be called on UnexpectedToken error. Return true to resume parsing. LALR only. - -(See `examples/error_puppet.py` for an example of how to use `on_error`.) - -Example: -```python - >>> Lark(r'''start: "hello" " "+ /\w+/ ''').parse('hello kitty') - Tree(start, [Token(__ANON_0, 'kitty')]) -``` - -#### Lark.save / Lark.load -```python -def save(self, f): ... -def load(cls, f): ... -``` - -Useful for caching and multiprocessing. - -`save` saves the instance into the given file object - -`load` loads an instance from the given file object - -#### - - -### Lark Options -#### General options - -**start** - The start symbol. Either a string, or a list of strings for multiple possible starts (Default: "start") - -**debug** - Display debug information, such as warnings (default: False) - -**transformer** - Applies the transformer to every parse tree (equivlent to applying it after the parse, but faster) - -**propagate_positions** - Propagates (line, column, end_line, end_column) attributes into all tree branches. - -**maybe_placeholders** - -- When True, the `[]` operator returns `None` when not matched. -- When `False`, `[]` behaves like the `?` operator, and returns no value at all. -- (default=`False`. Recommended to set to `True`) - -**g_regex_flags** - Flags that are applied to all terminals (both regex and strings) - -**regex** - Use the `regex` library instead of the built-in `re` module (See below) - -**keep_all_tokens** - Prevent the tree builder from automagically removing "punctuation" tokens (default: False) - -**cache** - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now. 
-- When `False`, does nothing (default) -- When `True`, caches to a temporary file in the local directory -- When given a string, caches to the path pointed by the string - -#### Algorithm - -**parser** - Decides which parser engine to use, "earley" or "lalr". (Default: "earley") - (there is also a "cyk" option for legacy) - -**lexer** - Decides whether or not to use a lexer stage - -- "auto" (default): Choose for me based on the parser -- "standard": Use a standard lexer -- "contextual": Stronger lexer (only works with parser="lalr") -- "dynamic": Flexible and powerful (only with parser="earley") -- "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible. (only with parser="earley") - -**ambiguity** - Decides how to handle ambiguity in the parse. Only relevant if parser="earley" -- "resolve": The parser will automatically choose the simplest derivation (it chooses consistently: greedy for tokens, non-greedy for rules) -- "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). - -#### Misc. - -- **postlex** - Lexer post-processing (Default: None) Only works with the standard and contextual lexers. -- **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto) -- **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. -- **edit_terminals** - A callback -- **use_bytes** - Accept and parse an input of type `bytes` instead of `str`. Grammar should still be specified as `str`, and terminal values are assumed to be `latin-1`. - - -#### Using Unicode character classes with `regex` -Python's builtin `re` module has a few persistent known bugs and also won't parse -advanced regex features such as character classes. -With `pip install lark-parser[regex]`, the `regex` module will be installed alongside `lark` -and can act as a drop-in replacement to `re`. 
- -Any instance of `Lark` instantiated with `regex=True` will now use the `regex` module -instead of `re`. For example, we can now use character classes to match PEP-3131 compliant Python identifiers. -```python -from lark import Lark ->>> g = Lark(r""" - ?start: NAME - NAME: ID_START ID_CONTINUE* - ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ - ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/ - """, regex=True) - ->>> g.parse('வணக்கம்') -'வணக்கம்' - -``` ----- - -## Tree - -The main tree class - -* `data` - The name of the rule or alias -* `children` - List of matched sub-rules and terminals -* `meta` - Line & Column numbers (if `propagate_positions` is enabled) - * meta attributes: `line`, `column`, `start_pos`, `end_line`, `end_column`, `end_pos` - -#### \_\_init\_\_(self, data, children) - -Creates a new tree, and stores "data" and "children" in attributes of the same name. - -#### pretty(self, indent_str=' ') - -Returns an indented string representation of the tree. Great for debugging. - -#### find_pred(self, pred) - -Returns all nodes of the tree that evaluate pred(node) as true. - -#### find_data(self, data) - -Returns all nodes of the tree whose data equals the given data. - -#### iter_subtrees(self) - -Depth-first iteration. - -Iterates over all the subtrees, never returning to the same node twice (Lark's parse-tree is actually a DAG). - -#### iter_subtrees_topdown(self) - -Breadth-first iteration. - -Iterates over all the subtrees, return nodes in order like pretty() does. - -#### \_\_eq\_\_, \_\_hash\_\_ - -Trees can be hashed and compared. - ----- - -## Token - -When using a lexer, the resulting tokens in the trees will be of the Token class, which inherits from Python's string. So, normal string comparisons and operations will work as expected. Tokens also have other useful attributes: - -* `type` - Name of the token (as specified in grammar). 
-* `pos_in_stream` - the index of the token in the text -* `line` - The line of the token in the text (starting with 1) -* `column` - The column of the token in the text (starting with 1) -* `end_line` - The line where the token ends -* `end_column` - The next column after the end of the token. For example, if the token is a single character with a `column` value of 4, `end_column` will be 5. -* `end_pos` - the index where the token ends (basically pos_in_stream + len(token)) - -## Transformer -## Visitor -## Interpreter - -See the [visitors page](visitors.md) - - -## UnexpectedInput - -- `UnexpectedInput` - - `UnexpectedToken` - The parser recieved an unexpected token - - `UnexpectedCharacters` - The lexer encountered an unexpected string - -After catching one of these exceptions, you may call the following helper methods to create a nicer error message: - -#### get_context(text, span) - -Returns a pretty string pinpointing the error in the text, with `span` amount of context characters around it. - -(The parser doesn't hold a copy of the text it has to parse, so you have to provide it again) - -#### match_examples(parse_fn, examples) - -Allows you to detect what's wrong in the input text by matching against example errors. - -Accepts the parse function (usually `lark_instance.parse`) and a dictionary of `{'example_string': value}`. - -The function will iterate the dictionary until it finds a matching error, and return the corresponding value. - -For an example usage, see: [examples/error_reporting_lalr.py](https://github.com/lark-parser/lark/blob/master/examples/error_reporting_lalr.py) - - -### UnexpectedToken - -When the parser throws UnexpectedToken, it instanciates a puppet with its internal state. - -Users can then interactively set the puppet to the desired puppet state, and resume regular parsing. 
- -See [ParserPuppet](#ParserPuppet) - -### UnexpectedCharacters - -## ParserPuppet - -ParserPuppet gives you advanced control over error handling when parsing with LALR. - -For a simpler, more streamlined interface, see the `on_error` argument to `Lark.parse()`. - -#### choices(self) - -Returns a dictionary of token types, matched to their action in the parser. - -Only returns token types that are accepted by the current state. - -Updated by `feed_token()` - -#### feed_token(self, token) - -Feed the parser with a token, and advance it to the next state, as if it recieved it from the lexer. - -Note that `token` has to be an instance of `Token`. - -#### copy(self) - -Create a new puppet with a separate state. Calls to `feed_token()` won't affect the old puppet, and vice-versa. - -#### pretty(self) - -Print the output of `choices()` in a way that's easier to read. - -#### resume_parse(self) -Resume parsing from the current puppet state. - diff --git a/docs/classes.rst b/docs/classes.rst new file mode 100644 index 0000000..ce4b078 --- /dev/null +++ b/docs/classes.rst @@ -0,0 +1,5 @@ +API Reference +============= + +.. autoclass:: lark.Lark + :members: open, parse, save, load \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index a522559..887eeb2 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -17,9 +17,10 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) +import os +import sys +sys.path.insert(0, os.path.abspath('..')) +autodoc_member_order = 'bysource' # -- General configuration ------------------------------------------------ @@ -33,8 +34,9 @@ # ones. extensions = [ 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', 'sphinx.ext.coverage', - 'recommonmark' + 'recommonmark', ] # Add any paths that contain templates here, relative to this directory. 
diff --git a/lark/lark.py b/lark/lark.py index 9a4e001..b91c983 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -156,12 +156,21 @@ class LarkOptions(Serialize): class Lark(Serialize): - def __init__(self, grammar, **options): - """ - grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax) - options : a dictionary controlling various aspects of Lark. - """ + """Main interface for the library. + + It’s mostly a thin wrapper for the many different parsers, and for + the tree constructor. + Args: + grammar: a string or file-object containing the + grammar spec (using Lark's ebnf syntax) + options : a dictionary controlling various aspects of Lark. + + Example: + >>> Lark(r'''start: "foo" ''') + Lark(...) + """ + def __init__(self, grammar, **options): self.options = LarkOptions(options) # Set regex or re module @@ -295,6 +304,7 @@ class Lark(Serialize): with FS.open(cache_fn, 'wb') as f: self.save(f) + # TODO: merge with above if __init__.__doc__: __init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC @@ -314,11 +324,19 @@ class Lark(Serialize): return self.parser_class(self.lexer_conf, parser_conf, options=self.options) def save(self, f): + """Saves the instance into the given file object + + Useful for caching and multiprocessing. + """ data, m = self.memo_serialize([TerminalDef, Rule]) pickle.dump({'data': data, 'memo': m}, f) @classmethod def load(cls, f): + """Loads an instance from the given file object + + Useful for caching and multiprocessing. + """ inst = cls.__new__(cls) return inst._load(f) @@ -361,7 +379,8 @@ class Lark(Serialize): def open(cls, grammar_filename, rel_to=None, **options): """Create an instance of Lark with the grammar given by its filename - If rel_to is provided, the function will find the grammar filename in relation to it. + If ``rel_to`` is provided, the function will find the grammar + filename in relation to it. 
Example: @@ -395,12 +414,20 @@ class Lark(Serialize): def parse(self, text, start=None, on_error=None): """Parse the given text, according to the options provided. - Parameters: - start: str - required if Lark was given multiple possible start symbols (using the start option). - on_error: function - if provided, will be called on UnexpectedToken error. Return true to resume parsing. LALR only. + If a transformer is supplied to ``__init__``, returns whatever is the + result of the transformation. + + Args: + text (str): Text to be parsed. + start (str, optional): Required if Lark was given multiple + possible start symbols (using the start option). + on_error (function, optional): if provided, will be called on + UnexpectedToken error. Return true to resume parsing. + LALR only. See examples/error_puppet.py for an example + of how to use on_error. - Returns a tree, unless specified otherwise. """ + try: return self.parser.parse(text, start=start) except UnexpectedToken as e: From 9e34dc94e9916a344192d51036dcb82ab2f7fc64 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 15:30:48 +0530 Subject: [PATCH 114/164] document LarkOptions --- docs/classes.rst | 10 +++- lark/lark.py | 118 ++++++++++++++++++++++++----------------------- 2 files changed, 70 insertions(+), 58 deletions(-) diff --git a/docs/classes.rst b/docs/classes.rst index ce4b078..df93b5e 100644 --- a/docs/classes.rst +++ b/docs/classes.rst @@ -1,5 +1,13 @@ API Reference ============= +Lark +---- + .. autoclass:: lark.Lark - :members: open, parse, save, load \ No newline at end of file + :members: open, parse, save, load + +LarkOptions +----------- + +.. autoclass:: lark.lark.LarkOptions \ No newline at end of file diff --git a/lark/lark.py b/lark/lark.py index b91c983..b1c45e1 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -27,63 +27,67 @@ class LarkOptions(Serialize): """ OPTIONS_DOC = """ -# General - - start - The start symbol. 
Either a string, or a list of strings for - multiple possible starts (Default: "start") - debug - Display debug information, such as warnings (default: False) - transformer - Applies the transformer to every parse tree (equivlent to - applying it after the parse, but faster) - propagate_positions - Propagates (line, column, end_line, end_column) - attributes into all tree branches. - maybe_placeholders - When True, the `[]` operator returns `None` when not matched. - When `False`, `[]` behaves like the `?` operator, - and returns no value at all. - (default=`False`. Recommended to set to `True`) - regex - When True, uses the `regex` module instead of the stdlib `re`. - cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. - LALR only for now. - When `False`, does nothing (default) - When `True`, caches to a temporary file in the local directory - When given a string, caches to the path pointed by the string - - g_regex_flags - Flags that are applied to all terminals - (both regex and strings) - keep_all_tokens - Prevent the tree builder from automagically - removing "punctuation" tokens (default: False) - -# Algorithm - - parser - Decides which parser engine to use - Accepts "earley" or "lalr". (Default: "earley") - (there is also a "cyk" option for legacy) - - lexer - Decides whether or not to use a lexer stage - "auto" (default): Choose for me based on the parser - "standard": Use a standard lexer - "contextual": Stronger lexer (only works with parser="lalr") - "dynamic": Flexible and powerful (only with parser="earley") - "dynamic_complete": Same as dynamic, but tries *every* variation - of tokenizing possible. - - ambiguity - Decides how to handle ambiguity in the parse. 
- Only relevant if parser="earley" - "resolve": The parser will automatically choose the simplest - derivation (it chooses consistently: greedy for - tokens, non-greedy for rules) - "explicit": The parser will return all derivations wrapped - in "_ambig" tree nodes (i.e. a forest). - -# Domain Specific - - postlex - Lexer post-processing (Default: None) Only works with the - standard and contextual lexers. - priority - How priorities should be evaluated - auto, none, normal, - invert (Default: auto) - lexer_callbacks - Dictionary of callbacks for the lexer. May alter - tokens during lexing. Use with caution. - use_bytes - Accept an input of type `bytes` instead of `str` (Python 3 only). - edit_terminals - A callback + **General** + + - **start** - The start symbol. Either a string, or a list of strings for + multiple possible starts (Default: "start") + - **debug** - Display debug information, such as warnings (default: False) + - **transformer** - Applies the transformer to every parse tree (equivlent + to applying it after the parse, but faster) + - **propagate_positions** - Propagates (line, column, end_line, end_column) + attributes into all tree branches. + - **maybe_placeholders** - When True, the ``[]`` operator returns ``None`` + when not matched. When ``False``, ``[]`` behaves like the ``?`` + operator, and returns no value at all. (default= ``False``. Recommended + to set to ``True``) + - **regex** - When True, uses the ``regex`` module instead of the + stdlib ``re``. + - **cache** - Cache the results of the Lark grammar analysis, for x2 to + x3 faster loading. LALR only for now. 
+ + - When ``False``, does nothing (default) + - When ``True``, caches to a temporary file in the local directory + - When given a string, caches to the path pointed by the string + + - **g_regex_flags** - Flags that are applied to all terminals + (both regex and strings) + - **keep_all_tokens** - Prevent the tree builder from automagically + removing "punctuation" tokens (default: False) + + **Algorithm** + + - **parser** - Decides which parser engine to use + Accepts "earley" or "lalr". (Default: "earley") + (there is also a "cyk" option for legacy) + - **lexer** - Decides whether or not to use a lexer stage + + - "auto" (default): Choose for me based on the parser + - "standard": Use a standard lexer + - "contextual": Stronger lexer (only works with parser="lalr") + - "dynamic": Flexible and powerful (only with parser="earley") + - "dynamic_complete": Same as dynamic, but tries *every* variation + of tokenizing possible. + + - **ambiguity** - Decides how to handle ambiguity in the parse. + Only relevant if parser="earley" + + - "resolve" - The parser will automatically choose the simplest + derivation (it chooses consistently: greedy for tokens, + non-greedy for rules) + - "explicit": The parser will return all derivations wrapped in + "_ambig" tree nodes (i.e. a forest). + + **Domain Specific** + + - **postlex** - Lexer post-processing (Default: None) Only works with the + standard and contextual lexers. + - **priority** - How priorities should be evaluated - auto, none, normal, + invert (Default: auto) + - **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter + tokens during lexing. Use with caution. + - **use_bytes** - Accept an input of type ``bytes`` instead of + ``str`` (Python 3 only). 
+ - **edit_terminals** - A callback """ if __doc__: __doc__ += OPTIONS_DOC From b753e24b2f2fbfa334c642b1e5c72256fdb2c8f6 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 15:37:38 +0530 Subject: [PATCH 115/164] document lark.Tree --- docs/classes.rst | 9 ++++++++- lark/tree.py | 31 +++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/docs/classes.rst b/docs/classes.rst index df93b5e..7e62093 100644 --- a/docs/classes.rst +++ b/docs/classes.rst @@ -10,4 +10,11 @@ Lark LarkOptions ----------- -.. autoclass:: lark.lark.LarkOptions \ No newline at end of file +.. autoclass:: lark.lark.LarkOptions + +Tree +---- + +.. autoclass:: lark.Tree + :members: pretty, find_pred, find_data, iter_subtrees, + iter_subtrees_topdown \ No newline at end of file diff --git a/lark/tree.py b/lark/tree.py index 5a594f1..b48450e 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -14,7 +14,20 @@ class Meta: def __init__(self): self.empty = True + class Tree(object): + """The main tree class. + + Creates a new tree, and stores "data" and "children" in attributes of + the same name. Trees can be hashed and compared. + + Args: + data: The name of the rule or alias + children: List of matched sub-rules and terminals + meta: Line & Column numbers (if ``propagate_positions`` is enabled). + meta attributes: line, column, start_pos, end_line, + end_column, end_pos + """ def __init__(self, data, children, meta=None): self.data = data self.children = children @@ -46,6 +59,10 @@ class Tree(object): return l def pretty(self, indent_str=' '): + """Returns an indented string representation of the tree. + + Great for debugging. + """ return ''.join(self._pretty(0, indent_str)) def __eq__(self, other): @@ -61,6 +78,11 @@ class Tree(object): return hash((self.data, tuple(self.children))) def iter_subtrees(self): + """Depth-first iteration. 
+ + Iterates over all the subtrees, never returning to the + same node twice (Lark's parse-tree is actually a DAG). + """ queue = [self] subtrees = OrderedDict() for subtree in queue: @@ -72,11 +94,11 @@ class Tree(object): return reversed(list(subtrees.values())) def find_pred(self, pred): - "Find all nodes where pred(tree) == True" + """Returns all nodes of the tree that evaluate pred(node) as true.""" return filter(pred, self.iter_subtrees()) def find_data(self, data): - "Find all nodes where tree.data == data" + """Returns all nodes of the tree whose data equals the given data.""" return self.find_pred(lambda t: t.data == data) ###} @@ -97,6 +119,11 @@ class Tree(object): yield c def iter_subtrees_topdown(self): + """Breadth-first iteration. + + Iterates over all the subtrees, return nodes in order like + pretty() does. + """ stack = [self] while stack: node = stack.pop() From fd08f470e2a8388c2ffc5c7309f40d109bf32726 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 15:42:55 +0530 Subject: [PATCH 116/164] document lark.Token --- docs/classes.rst | 7 ++++++- lark/lexer.py | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/docs/classes.rst b/docs/classes.rst index 7e62093..54faddc 100644 --- a/docs/classes.rst +++ b/docs/classes.rst @@ -17,4 +17,9 @@ Tree .. autoclass:: lark.Tree :members: pretty, find_pred, find_data, iter_subtrees, - iter_subtrees_topdown \ No newline at end of file + iter_subtrees_topdown + +Token +----- + +.. autoclass:: lark.Token \ No newline at end of file diff --git a/lark/lexer.py b/lark/lexer.py index c77207b..87e286e 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -90,6 +90,25 @@ class TerminalDef(Serialize): class Token(Str): + """Token of a lexer. + + When using a lexer, the resulting tokens in the trees will be of the + Token class, which inherits from Python's string. So, normal string + comparisons and operations will work as expected. 
Tokens also have other + useful attributes. + + Attributes: + type_: Name of the token (as specified in grammar) + pos_in_stream: The index of the token in the text + line: The line of the token in the text (starting with 1) + column: The column of the token in the text (starting with 1) + end_line: The line where the token ends + end_column: The next column after the end of the token. For example, + if the token is a single character with a column value of 4, + end_column will be 5. + end_pos: the index where the token ends (basically pos_in_stream + + len(token)) + """ __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None, end_pos=None): From 168e6db2d1cf96a94ad786c84b43651a96586b43 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 15:52:21 +0530 Subject: [PATCH 117/164] document lark.Visitor --- docs/classes.rst | 11 ++++++++++- lark/visitors.py | 26 ++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/docs/classes.rst b/docs/classes.rst index 54faddc..0dd5954 100644 --- a/docs/classes.rst +++ b/docs/classes.rst @@ -22,4 +22,13 @@ Tree Token ----- -.. autoclass:: lark.Token \ No newline at end of file +.. autoclass:: lark.Token + +Visitor +------- + +.. autoclass:: lark.visitors.VisitorBase + +.. autoclass:: lark.Visitor + +.. autoclass:: lark.visitors.Visitor_Recursive \ No newline at end of file diff --git a/lark/visitors.py b/lark/visitors.py index 6494deb..525645c 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -199,6 +199,27 @@ class Transformer_InPlaceRecursive(Transformer): # Visitors class VisitorBase: + """Visitors visit each node of the tree + + Run the appropriate method on it according to the node’s data. + They work bottom-up, starting with the leaves and ending at the root + of the tree. 
+ + There are two classes that implement the visitor interface: + + - ``Visitor``: Visit every node (without recursion) + - ``Visitor_Recursive``: Visit every node using recursion. Slightly faster. + + Example: + :: + + class IncreaseAllNumbers(Visitor): + def number(self, tree): + assert tree.data == "number" + tree.children[0] += 1 + + IncreaseAllNumbers().visit(parse_tree) + """ def _call_userfunc(self, tree): return getattr(self, tree.data, self.__default__)(tree) @@ -211,7 +232,7 @@ class VisitorBase: class Visitor(VisitorBase): - """Bottom-up visitor, non-recursive + """Bottom-up visitor, non-recursive. Visits the tree, starting with the leaves and finally the root (bottom-up) Calls its methods (provided by user via inheritance) according to tree.data @@ -227,8 +248,9 @@ class Visitor(VisitorBase): self._call_userfunc(subtree) return tree + class Visitor_Recursive(VisitorBase): - """Bottom-up visitor, recursive + """Bottom-up visitor, recursive. Visits the tree, starting with the leaves and finally the root (bottom-up) Calls its methods (provided by user via inheritance) according to tree.data From cad22acf0f0751fc25741a51a369fa3927e1fe22 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 16:07:34 +0530 Subject: [PATCH 118/164] document lark.Interpreter, lark.Transformer --- docs/classes.rst | 15 +++++++-- lark/visitors.py | 87 ++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 90 insertions(+), 12 deletions(-) diff --git a/docs/classes.rst b/docs/classes.rst index 0dd5954..14e9842 100644 --- a/docs/classes.rst +++ b/docs/classes.rst @@ -29,6 +29,17 @@ Visitor .. autoclass:: lark.visitors.VisitorBase -.. autoclass:: lark.Visitor +.. autoclass:: lark.visitors.Visitor -.. autoclass:: lark.visitors.Visitor_Recursive \ No newline at end of file +.. autoclass:: lark.visitors.Visitor_Recursive + +Interpreter +----------- + +.. autoclass:: lark.visitors.Interpreter + + +Transformer +----------- + +.. 
autoclass:: lark.visitors.Transformer \ No newline at end of file diff --git a/lark/visitors.py b/lark/visitors.py index 525645c..e561beb 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -42,12 +42,61 @@ class _Decoratable: class Transformer(_Decoratable): - """Visits the tree recursively, starting with the leaves and finally the root (bottom-up) + """Transformer visit each node of the tree, and run the appropriate method + on it according to the node's data. - Calls its methods (provided by user via inheritance) according to tree.data - The returned value replaces the old one in the structure. + Calls its methods (provided by user via inheritance) according to + ``tree.data``. The returned value replaces the old one in the structure. + + They work bottom-up (or depth-first), starting with the leaves and + ending at the root of the tree. Transformers can be used to + implement map & reduce patterns. Because nodes are reduced from leaf to + root, at any point the callbacks may assume the children have already been + transformed (if applicable). ``Transformer`` can do anything ``Visitor`` + can do, but because it reconstructs the tree, it is slightly less + efficient. + + All these classes implement the transformer interface: + + - ``Transformer`` - Recursively transforms the tree. This is the one you + probably want. + - ``Transformer_InPlace`` - Non-recursive. Changes the tree in-place + instead of returning new instances + - ``Transformer_InPlaceRecursive`` - Recursive. Changes the tree in-place + instead of returning new instances + + Example: + :: + + from lark import Tree, Transformer + + class EvalExpressions(Transformer): + def expr(self, args): + return eval(args[0]) + + t = Tree('a', [Tree('expr', ['1+2'])]) + print(EvalExpressions().transform( t )) + + # Prints: Tree(a, [3]) + + Args: + visit_tokens: By default, transformers only visit rules. + visit_tokens=True will tell ``Transformer`` to visit tokens + as well. 
This is a slightly slower alternative to lexer_callbacks + but it's easier to maintain and works for all algorithms + (even when there isn't a lexer). + + Example: + :: + + class T(Transformer): + INT = int + NUMBER = float + def NAME(self, name): + return lookup_dict.get(name, name) + + T(visit_tokens=True).transform(tree) - Can be used to implement map or reduce. """ __visit_tokens__ = True # For backwards compatibility @@ -235,7 +284,8 @@ class Visitor(VisitorBase): """Bottom-up visitor, non-recursive. Visits the tree, starting with the leaves and finally the root (bottom-up) - Calls its methods (provided by user via inheritance) according to tree.data + Calls its methods (provided by user via inheritance) according to + ``tree.data`` """ def visit(self, tree): @@ -253,7 +303,8 @@ class Visitor_Recursive(VisitorBase): """Bottom-up visitor, recursive. Visits the tree, starting with the leaves and finally the root (bottom-up) - Calls its methods (provided by user via inheritance) according to tree.data + Calls its methods (provided by user via inheritance) according to + ``tree.data`` """ def visit(self, tree): @@ -285,13 +336,29 @@ def visit_children_decor(func): class Interpreter(_Decoratable): - """Top-down visitor, recursive + """Interpreter walks the tree starting at the root. Visits the tree, starting with the root and finally the leaves (top-down) - Calls its methods (provided by user via inheritance) according to tree.data + Calls its methods (provided by user via inheritance) according to + ``tree.data`` + + Unlike ``Transformer`` and ``Visitor``, the Interpreter doesn't + automatically visit its sub-branches. The user has to explicitly call ``visit``, + ``visit_children``, or use the ``@visit_children_decor``. This allows the + user to implement branching and loops. + + Example: + :: + + class IncreaseSomeOfTheNumbers(Interpreter): + def number(self, tree): + tree.children[0] += 1 + + def skip(self, tree): + # skip this subtree. 
don't change any number node inside it. + pass - Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches. - The user has to explicitly call visit, visit_children, or use the @visit_children_decor + IncreaseSomeOfTheNumbers().visit(parse_tree) """ def visit(self, tree): From 1928b006826ab86a35e7bd16c81aaac3a38efa0d Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 16:18:04 +0530 Subject: [PATCH 119/164] document rest of the visitor --- docs/classes.rst | 13 ++++++++++++- lark/visitors.py | 47 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/docs/classes.rst b/docs/classes.rst index 14e9842..718ec7a 100644 --- a/docs/classes.rst +++ b/docs/classes.rst @@ -42,4 +42,15 @@ Interpreter Transformer ----------- -.. autoclass:: lark.visitors.Transformer \ No newline at end of file +.. autoclass:: lark.visitors.Transformer + :members: __default__, __default_token__ + +v_args +------ + +.. autofunction:: lark.visitors.v_args + +Discard +------- + +.. autoclass:: lark.visitors.Discard \ No newline at end of file diff --git a/lark/visitors.py b/lark/visitors.py index e561beb..cef5623 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -9,6 +9,9 @@ from .lexer import Token from inspect import getmembers, getmro class Discard(Exception): + """When raising the Discard exception in a transformer callback, + that node is discarded and won't appear in the parent. + """ pass # Transformers @@ -159,11 +162,19 @@ class Transformer(_Decoratable): return TransformerChain(self, other) def __default__(self, data, children, meta): - "Default operation on tree (for override)" + """Default operation on tree (for override) + + Function that is called on if a function with a corresponding name has + not been found. 
Defaults to reconstruct the Tree + """ return Tree(data, children, meta) def __default_token__(self, token): - "Default operation on token (for override)" + """Default operation on token (for override) + + Function that is called on if a function with a corresponding name has + not been found. Defaults to just return the argument. + """ return token @@ -441,8 +452,38 @@ def _vargs_meta(f, data, children, meta): def _vargs_tree(f, data, children, meta): return f(Tree(data, children, meta)) + def v_args(inline=False, meta=False, tree=False, wrapper=None): - "A convenience decorator factory, for modifying the behavior of user-supplied visitor methods" + """A convenience decorator factory for modifying the behavior of + user-supplied visitor methods. + + By default, callback methods of transformers/visitors accept one argument - + a list of the node's children. ``v_args`` can modify this behavior. When + used on a transformer/visitor class definition, it applies to all the + callback methods inside it. Accepts one of three following flags. + + Args: + inline: Children are provided as ``*args`` instead of a list + argument (not recommended for very long lists). + meta: Provides two arguments: ``children`` and ``meta`` (instead of + just the first) + tree: Provides the entire tree as the argument, instead of the + children. 
+ + Example: + :: + + @v_args(inline=True) + class SolveArith(Transformer): + def add(self, left, right): + return left + right + + + class ReverseNotation(Transformer_InPlace): + @v_args(tree=True) + def tree_node(self, tree): + tree.children = tree.children[::-1] + """ if tree and (meta or inline): raise ValueError("Visitor functions cannot combine 'tree' with 'meta' or 'inline'.") From 452f3fc0615eb25446b8587090904cc87ccc51d0 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 16:40:31 +0530 Subject: [PATCH 120/164] complete sphinx autodoc --- docs/classes.rst | 36 ++++----- docs/visitors.md | 148 ------------------------------------ docs/visitors.rst | 46 +++++++++++ lark/exceptions.py | 44 +++++++++-- lark/parsers/lalr_puppet.py | 27 ++++++- lark/visitors.py | 4 +- 6 files changed, 123 insertions(+), 182 deletions(-) delete mode 100644 docs/visitors.md create mode 100644 docs/visitors.rst diff --git a/docs/classes.rst b/docs/classes.rst index 718ec7a..63f9aef 100644 --- a/docs/classes.rst +++ b/docs/classes.rst @@ -24,33 +24,23 @@ Token .. autoclass:: lark.Token -Visitor -------- +Transformer, Vistor & Interpretor +--------------------------------- -.. autoclass:: lark.visitors.VisitorBase +See :doc:`visitors`. -.. autoclass:: lark.visitors.Visitor +UnexpectedInput +--------------- -.. autoclass:: lark.visitors.Visitor_Recursive +.. autoclass:: lark.exceptions.UnexpectedInput + :members: get_context, match_examples -Interpreter ------------ - -.. autoclass:: lark.visitors.Interpreter - - -Transformer ------------ - -.. autoclass:: lark.visitors.Transformer - :members: __default__, __default_token__ - -v_args ------- +.. autoclass:: lark.exceptions.UnexpectedToken -.. autofunction:: lark.visitors.v_args +.. autoclass:: lark.exceptions.UnexpectedCharacters -Discard -------- +ParserPuppet +------------ -.. autoclass:: lark.visitors.Discard \ No newline at end of file +.. 
autoclass:: lark.parsers.lalr_puppet.ParserPuppet + :members: choices, feed_token, copy, pretty, resume_parse diff --git a/docs/visitors.md b/docs/visitors.md deleted file mode 100644 index 146af1c..0000000 --- a/docs/visitors.md +++ /dev/null @@ -1,148 +0,0 @@ -# Transformers & Visitors - -Transformers & Visitors provide a convenient interface to process the parse-trees that Lark returns. - -They are used by inheriting from the correct class (visitor or transformer), and implementing methods corresponding to the rule you wish to process. Each method accepts the children as an argument. That can be modified using the `v_args` decorator, which allows to inline the arguments (akin to `*args`), or add the tree `meta` property as an argument. - -See: visitors.py - -### Visitors - -Visitors visit each node of the tree, and run the appropriate method on it according to the node's data. - -They work bottom-up, starting with the leaves and ending at the root of the tree. - -**Example:** -```python -class IncreaseAllNumbers(Visitor): - def number(self, tree): - assert tree.data == "number" - tree.children[0] += 1 - -IncreaseAllNumbers().visit(parse_tree) -``` - -There are two classes that implement the visitor interface: - -* Visitor - Visit every node (without recursion) - -* Visitor_Recursive - Visit every node using recursion. Slightly faster. - -### Interpreter - -The interpreter walks the tree starting at the root (top-down). - -For each node, it calls the method corresponding with its `data` attribute. - -Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches. -The user has to explicitly call `visit`, `visit_children`, or use the `@visit_children_decor`. -This allows the user to implement branching and loops. - -**Example:** -```python -class IncreaseSomeOfTheNumbers(Interpreter): - def number(self, tree): - tree.children[0] += 1 - - def skip(self, tree): - # skip this subtree. don't change any number node inside it. 
- pass - -IncreaseSomeOfTheNumbers().visit(parse_tree) -``` - -### Transformers - -Transformers visit each node of the tree, and run the appropriate method on it according to the node's data. - -They work bottom-up (or: depth-first), starting with the leaves and ending at the root of the tree. - -Transformers can be used to implement map & reduce patterns. - -Because nodes are reduced from leaf to root, at any point the callbacks may assume the children have already been transformed (if applicable). - -Transformers can be chained into a new transformer by using multiplication. - -`Transformer` can do anything `Visitor` can do, but because it reconstructs the tree, it is slightly less efficient. - - -**Example:** -```python -from lark import Tree, Transformer - -class EvalExpressions(Transformer): - def expr(self, args): - return eval(args[0]) - -t = Tree('a', [Tree('expr', ['1+2'])]) -print(EvalExpressions().transform( t )) - -# Prints: Tree(a, [3]) -``` - -All these classes implement the transformer interface: - -- Transformer - Recursively transforms the tree. This is the one you probably want. -- Transformer_InPlace - Non-recursive. Changes the tree in-place instead of returning new instances -- Transformer_InPlaceRecursive - Recursive. Changes the tree in-place instead of returning new instances - -### visit_tokens - -By default, transformers only visit rules. `visit_tokens=True` will tell Transformer to visit tokens as well. This is a slightly slower alternative to `lexer_callbacks`, but it's easier to maintain and works for all algorithms (even when there isn't a lexer). - -**Example:** - -```python -class T(Transformer): - INT = int - NUMBER = float - def NAME(self, name): - return lookup_dict.get(name, name) - - -T(visit_tokens=True).transform(tree) -``` - - -### v_args - -`v_args` is a decorator. - -By default, callback methods of transformers/visitors accept one argument: a list of the node's children. `v_args` can modify this behavior. 
- -When used on a transformer/visitor class definition, it applies to all the callback methods inside it. - -`v_args` accepts one of three flags: - -- `inline` - Children are provided as `*args` instead of a list argument (not recommended for very long lists). -- `meta` - Provides two arguments: `children` and `meta` (instead of just the first) -- `tree` - Provides the entire tree as the argument, instead of the children. - -**Examples:** - -```python -@v_args(inline=True) -class SolveArith(Transformer): - def add(self, left, right): - return left + right - - -class ReverseNotation(Transformer_InPlace): - @v_args(tree=True) - def tree_node(self, tree): - tree.children = tree.children[::-1] -``` - -### `__default__` and `__default_token__` -These are the functions that are called on if a function with a corresponding name has not been found. - -- The `__default__` method has the signature `(data, children, meta)`, with `data` being the data attribute of the node. It defaults to reconstruct the Tree - -- The `__default_token__` just takes the `Token` as an argument. It defaults to just return the argument. - - -### Discard - -When raising the `Discard` exception in a transformer callback, that node is discarded and won't appear in the parent. - - diff --git a/docs/visitors.rst b/docs/visitors.rst new file mode 100644 index 0000000..cb1eafd --- /dev/null +++ b/docs/visitors.rst @@ -0,0 +1,46 @@ +Transformers & Visitors +======================= + +Transformers & Visitors provide a convenient interface to process the +parse-trees that Lark returns. + +They are used by inheriting from the correct class (visitor or transformer), +and implementing methods corresponding to the rule you wish to process. Each +method accepts the children as an argument. That can be modified using the +`v_args` decorator, which allows to inline the arguments (akin to `*args`), +or add the tree `meta` property as an argument. + +See: `visitors.py`_ + +.. 
_visitors.py: https://github.com/lark-parser/lark/blob/master/lark/visitors.py + +Visitor +------- + +.. autoclass:: lark.visitors.VisitorBase + +.. autoclass:: lark.visitors.Visitor + +.. autoclass:: lark.visitors.Visitor_Recursive + + +Transformer +----------- + +.. autoclass:: lark.visitors.Transformer + :members: __default__, __default_token__ + +Interpreter +----------- + +.. autoclass:: lark.visitors.Interpreter + +v_args +------ + +.. autofunction:: lark.visitors.v_args + +Discard +------- + +.. autoclass:: lark.visitors.Discard \ No newline at end of file diff --git a/lark/exceptions.py b/lark/exceptions.py index 9d2d8dc..dcd80b5 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -24,9 +24,24 @@ class UnexpectedEOF(ParseError): class UnexpectedInput(LarkError): + """UnexpectedInput Error. + + - ``UnexpectedToken``: The parser recieved an unexpected token + - ``UnexpectedCharacters``: The lexer encountered an unexpected string + + After catching one of these exceptions, you may call the following + helper methods to create a nicer error message. + """ pos_in_stream = None def get_context(self, text, span=40): + """Returns a pretty string pinpointing the error in the text, + with span amount of context characters around it. + + Note: + The parser doesn't hold a copy of the text it has to parse, + so you have to provide it again + """ pos = self.pos_in_stream start = max(pos - span, 0) end = pos + span @@ -40,11 +55,22 @@ class UnexpectedInput(LarkError): return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False): - """ Given a parser instance and a dictionary mapping some label with - some malformed syntax examples, it'll return the label for the - example that bests matches the current error. - - It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility. 
+ """Allows you to detect what's wrong in the input text by matching + against example errors. + + Given a parser instance and a dictionary mapping some label with + some malformed syntax examples, it'll return the label for the + example that bests matches the current error. The function will + iterate the dictionary until it finds a matching error, and + return the corresponding value. + + For an example usage, see examples/error_reporting_lalr.py + + Args: + parse_fn: parse function (usually ``lark_instance.parse``) + examples: dictionary of ``{'example_string': value}``. + use_accepts: Recommended to call this with ``use_accepts=True``. + The default is ``False`` for backwards compatibility. """ assert self.state is not None, "Not supported for this exception" @@ -109,8 +135,13 @@ class UnexpectedCharacters(LexError, UnexpectedInput): super(UnexpectedCharacters, self).__init__(message) - class UnexpectedToken(ParseError, UnexpectedInput): + """When the parser throws UnexpectedToken, it instanciates a puppet + with its internal state. Users can then interactively set the puppet to + the desired puppet state, and resume regular parsing. + + see: ``ParserPuppet``. + """ def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') @@ -132,6 +163,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): super(UnexpectedToken, self).__init__(message) + class VisitError(LarkError): """VisitError is raised when visitors are interrupted by an exception diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py index 24c77a1..63642ae 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -7,6 +7,12 @@ from .. import Token class ParserPuppet(object): + """ParserPuppet gives you advanced control over error handling when + parsing with LALR. 
+ + For a simpler, more streamlined interface, see the ``on_error`` + argument to ``Lark.parse()``. + """ def __init__(self, parser, state_stack, value_stack, start, stream, set_state): self.parser = parser self._state_stack = state_stack @@ -18,8 +24,10 @@ class ParserPuppet(object): self.result = None def feed_token(self, token): - """Advance the parser state, as if it just received `token` from the lexer + """Feed the parser with a token, and advance it to the next state, + as if it recieved it from the lexer. + Note that ``token`` has to be an instance of ``Token``. """ end_state = self.parser.parse_table.end_states[self._start] state_stack = self._state_stack @@ -59,6 +67,10 @@ class ParserPuppet(object): value_stack.append(token) def copy(self): + """Create a new puppet with a separate state. + + Calls to feed_token() won't affect the old puppet, and vice-versa. + """ return type(self)( self.parser, list(self._state_stack), @@ -69,6 +81,7 @@ class ParserPuppet(object): ) def pretty(self): + """Print the output of ``choices()`` in a way that's easier to read.""" out = ["Puppet choices:"] for k, v in self.choices().items(): out.append('\t- %s -> %s' % (k, v)) @@ -76,6 +89,12 @@ class ParserPuppet(object): return '\n'.join(out) def choices(self): + """Returns a dictionary of token types, matched to their action in + the parser. Only returns token types that are accepted by the + current state. + + Updated by ``feed_token()``. 
+ """ return self.parser.parse_table.states[self._state_stack[-1]] def accepts(self): @@ -91,4 +110,8 @@ class ParserPuppet(object): return accepts def resume_parse(self): - return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack) + """Resume parsing from the current puppet state.""" + return self.parser.parse( + self._stream, self._start, self._set_state, + self._value_stack, self._state_stack + ) diff --git a/lark/visitors.py b/lark/visitors.py index cef5623..195aa20 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -263,9 +263,7 @@ class VisitorBase: Run the appropriate method on it according to the node’s data. They work bottom-up, starting with the leaves and ending at the root - of the tree. - - There are two classes that implement the visitor interface: + of the tree. There are two classes that implement the visitor interface: - ``Visitor``: Visit every node (without recursion) - ``Visitor_Recursive``: Visit every node using recursion. Slightly faster. From 276c250e15cc98822e62c7ee72b70c6b832bf0b0 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 16:41:34 +0530 Subject: [PATCH 121/164] small typo --- docs/visitors.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/visitors.rst b/docs/visitors.rst index cb1eafd..0a42c2b 100644 --- a/docs/visitors.rst +++ b/docs/visitors.rst @@ -7,8 +7,8 @@ parse-trees that Lark returns. They are used by inheriting from the correct class (visitor or transformer), and implementing methods corresponding to the rule you wish to process. Each method accepts the children as an argument. That can be modified using the -`v_args` decorator, which allows to inline the arguments (akin to `*args`), -or add the tree `meta` property as an argument. +``v_args`` decorator, which allows to inline the arguments (akin to ``*args``), +or add the tree ``meta`` property as an argument. 
See: `visitors.py`_ From b18e7e143b6b250fd660fd1c4e699804aa1bfee6 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 16:44:44 +0530 Subject: [PATCH 122/164] fix random bug --- lark/visitors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lark/visitors.py b/lark/visitors.py index 195aa20..81bb831 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -260,8 +260,8 @@ class Transformer_InPlaceRecursive(Transformer): class VisitorBase: """Visitors visit each node of the tree - - Run the appropriate method on it according to the node’s data. + + Run the appropriate method on it according to the node's data. They work bottom-up, starting with the leaves and ending at the root of the tree. There are two classes that implement the visitor interface: From 83006aa0c828433e8cf1fec85cc427e1431af1ff Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 16:50:38 +0530 Subject: [PATCH 123/164] fix unicode import bug --- lark/lark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/lark.py b/lark/lark.py index b1c45e1..23e3a7d 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -162,7 +162,7 @@ class LarkOptions(Serialize): class Lark(Serialize): """Main interface for the library. - It’s mostly a thin wrapper for the many different parsers, and for + It's mostly a thin wrapper for the many different parsers, and for the tree constructor. Args: From f107512cb3314ba20410b54af02c81ef5fcfa0d1 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Sun, 16 Aug 2020 17:34:40 +0530 Subject: [PATCH 124/164] change to definition format for larkoptins --- lark/lark.py | 114 +++++++++++++++++++++++++++------------------------ 1 file changed, 61 insertions(+), 53 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index 23e3a7d..abc87d1 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -29,65 +29,73 @@ class LarkOptions(Serialize): OPTIONS_DOC = """ **General** - - **start** - The start symbol. 
Either a string, or a list of strings for - multiple possible starts (Default: "start") - - **debug** - Display debug information, such as warnings (default: False) - - **transformer** - Applies the transformer to every parse tree (equivlent - to applying it after the parse, but faster) - - **propagate_positions** - Propagates (line, column, end_line, end_column) - attributes into all tree branches. - - **maybe_placeholders** - When True, the ``[]`` operator returns ``None`` - when not matched. When ``False``, ``[]`` behaves like the ``?`` - operator, and returns no value at all. (default= ``False``. Recommended - to set to ``True``) - - **regex** - When True, uses the ``regex`` module instead of the - stdlib ``re``. - - **cache** - Cache the results of the Lark grammar analysis, for x2 to - x3 faster loading. LALR only for now. - - - When ``False``, does nothing (default) - - When ``True``, caches to a temporary file in the local directory - - When given a string, caches to the path pointed by the string - - - **g_regex_flags** - Flags that are applied to all terminals - (both regex and strings) - - **keep_all_tokens** - Prevent the tree builder from automagically - removing "punctuation" tokens (default: False) + start + The start symbol. Either a string, or a list of strings for + multiple possible starts (Default: "start") + debug + Display debug information, such as warnings (default: False) + transformer + Applies the transformer to every parse tree (equivlent + to applying it after the parse, but faster) + propagate_positions + Propagates (line, column, end_line, end_column) attributes into all tree branches. + maybe_placeholders + When True, the ``[]`` operator returns ``None`` + when not matched. When ``False``, ``[]`` behaves like the ``?`` + operator, and returns no value at all. (default= ``False``. Recommended + to set to ``True``) + regex + When True, uses the ``regex`` module instead of the + stdlib ``re``. 
+ cache + Cache the results of the Lark grammar analysis, for x2 to + x3 faster loading. LALR only for now. + + - When ``False``, does nothing (default) + - When ``True``, caches to a temporary file in the local directory + - When given a string, caches to the path pointed by the string + + g_regex_flags + Flags that are applied to all terminals (both regex and strings) + keep_all_tokens + Prevent the tree builder from automagically removing "punctuation" tokens (default: False) **Algorithm** - - - **parser** - Decides which parser engine to use - Accepts "earley" or "lalr". (Default: "earley") - (there is also a "cyk" option for legacy) - - **lexer** - Decides whether or not to use a lexer stage - - - "auto" (default): Choose for me based on the parser - - "standard": Use a standard lexer - - "contextual": Stronger lexer (only works with parser="lalr") - - "dynamic": Flexible and powerful (only with parser="earley") - - "dynamic_complete": Same as dynamic, but tries *every* variation - of tokenizing possible. - - - **ambiguity** - Decides how to handle ambiguity in the parse. - Only relevant if parser="earley" + + parser + Decides which parser engine to use. Accepts "earley" or "lalr". + (Default: "earley"). (there is also a "cyk" option for legacy) + lexer + Decides whether or not to use a lexer stage + + - "auto" (default): Choose for me based on the parser + - "standard": Use a standard lexer + - "contextual": Stronger lexer (only works with parser="lalr") + - "dynamic": Flexible and powerful (only with parser="earley") + - "dynamic_complete": Same as dynamic, but tries *every* variation + of tokenizing possible. + ambiguity + Decides how to handle ambiguity in the parse. Only relevant if parser="earley" - - "resolve" - The parser will automatically choose the simplest - derivation (it chooses consistently: greedy for tokens, - non-greedy for rules) - - "explicit": The parser will return all derivations wrapped in - "_ambig" tree nodes (i.e. a forest). 
+ - "resolve" - The parser will automatically choose the simplest + derivation (it chooses consistently: greedy for tokens, + non-greedy for rules) + - "explicit": The parser will return all derivations wrapped in + "_ambig" tree nodes (i.e. a forest). **Domain Specific** - - **postlex** - Lexer post-processing (Default: None) Only works with the - standard and contextual lexers. - - **priority** - How priorities should be evaluated - auto, none, normal, - invert (Default: auto) - - **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter - tokens during lexing. Use with caution. - - **use_bytes** - Accept an input of type ``bytes`` instead of - ``str`` (Python 3 only). - - **edit_terminals** - A callback + postlex + Lexer post-processing (Default: None) Only works with the + standard and contextual lexers. + priority + How priorities should be evaluated - auto, none, normal, invert (Default: auto) + lexer_callbacks + Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. + use_bytes + Accept an input of type ``bytes`` instead of ``str`` (Python 3 only). 
+ edit_terminals + A callback """ if __doc__: __doc__ += OPTIONS_DOC From 0c47b981fc2fe83bc1e690f62776228537fd984f Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 16 Aug 2020 16:14:27 +0300 Subject: [PATCH 125/164] Bugfix: Infinite loop on mishandled $END token in on_error (Issue #656) --- lark/lark.py | 3 +++ lark/parsers/lalr_puppet.py | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/lark/lark.py b/lark/lark.py index 9a4e001..ad0195b 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -413,6 +413,9 @@ class Lark(Serialize): try: return e.puppet.resume_parse() except UnexpectedToken as e2: + if e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet: + # Prevent infinite loop + raise e2 e = e2 diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py index 24c77a1..4fcea72 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -68,6 +68,17 @@ class ParserPuppet(object): self._set_state, ) + def __eq__(self, other): + if not isinstance(other, ParserPuppet): + return False + + return ( + self._state_stack == other._state_stack and + self._value_stack == other._value_stack and + self._stream == other._stream and + self._start == other._start + ) + def pretty(self): out = ["Puppet choices:"] for k, v in self.choices().items(): From 7a67f0d027a820543ba5dfe4e429c1c9a3cdeeeb Mon Sep 17 00:00:00 2001 From: julienmalard Date: Sun, 16 Aug 2020 10:47:36 -0400 Subject: [PATCH 126/164] Postproc option for reconstruct, and fixed isalnum bug --- lark/reconstruct.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/lark/reconstruct.py b/lark/reconstruct.py index 89967b2..dfdaae1 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -1,3 +1,5 @@ +import unicodedata + from collections import defaultdict from .tree import Tree @@ -93,6 +95,8 @@ def make_recons_rule(origin, expansion, old_expansion): def make_recons_rule_to_term(origin, term): return make_recons_rule(origin, 
[Terminal(term.name)], [term]) +def _isalnum(x): + return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'] class Reconstructor: """ @@ -193,12 +197,15 @@ class Reconstructor: else: yield item - def reconstruct(self, tree): - x = self._reconstruct(tree) + def reconstruct(self, tree, postproc=None): + if postproc is None: + x = self._reconstruct(tree) + else: + x = postproc(self._reconstruct(tree)) y = [] prev_item = '' for item in x: - if prev_item and item and prev_item[-1].isalnum() and item[0].isalnum(): + if prev_item and item and _isalnum(prev_item[-1]) and _isalnum(item[0]): y.append(' ') y.append(item) prev_item = item From 288078a6a02ebaa8e741adf56dc46533d4677175 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 16 Aug 2020 19:53:45 +0300 Subject: [PATCH 127/164] Corrections to PR --- docs/classes.rst | 27 +++++++-- docs/index.rst | 4 +- docs/visitors.rst | 30 ++++++--- lark/exceptions.py | 9 +-- lark/lark.py | 118 ++++++++++++++++-------------------- lark/parsers/lalr_puppet.py | 15 ++--- lark/tree.py | 17 +++--- lark/visitors.py | 103 +++++++++++-------------------- 8 files changed, 154 insertions(+), 169 deletions(-) diff --git a/docs/classes.rst b/docs/classes.rst index 63f9aef..3778147 100644 --- a/docs/classes.rst +++ b/docs/classes.rst @@ -7,10 +7,29 @@ Lark .. autoclass:: lark.Lark :members: open, parse, save, load -LarkOptions ------------ +**Using Unicode character classes with regex** -.. autoclass:: lark.lark.LarkOptions +Python's builtin `re` module has a few persistent known bugs and also won't parse +advanced regex features such as character classes. +With `pip install lark-parser[regex]`, the `regex` module will be installed alongside `lark` and can act as a drop-in replacement to `re`. + +Any instance of `Lark` instantiated with `regex=True` will now use the `regex` module instead of `re`. + +For example, we can now use character classes to match PEP-3131 compliant Python identifiers. 
+ +Example: + :: + + from lark import Lark + >>> g = Lark(r""" + ?start: NAME + NAME: ID_START ID_CONTINUE* + ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ + ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/ + """, regex=True) + + >>> g.parse('வணக்கம்') + 'வணக்கம்' Tree ---- @@ -24,7 +43,7 @@ Token .. autoclass:: lark.Token -Transformer, Vistor & Interpretor +Transformer, Visitor & Interpreter --------------------------------- See :doc:`visitors`. diff --git a/docs/index.rst b/docs/index.rst index 8466875..ba2c241 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -33,10 +33,10 @@ Welcome to Lark's documentation! grammar tree_construction - visitors classes + visitors nearley - + Lark is a modern parsing library for Python. Lark can parse any context-free grammar. diff --git a/docs/visitors.rst b/docs/visitors.rst index 0a42c2b..a734e3b 100644 --- a/docs/visitors.rst +++ b/docs/visitors.rst @@ -17,12 +17,33 @@ See: `visitors.py`_ Visitor ------- -.. autoclass:: lark.visitors.VisitorBase +Visitors visit each node of the tree, and run the appropriate method on it according to the node's data. + +They work bottom-up, starting with the leaves and ending at the root of the tree. + +There are two classes that implement the visitor interface: + +- ``Visitor``: Visit every node (without recursion) +- ``Visitor_Recursive``: Visit every node using recursion. Slightly faster. + +Example: + :: + + class IncreaseAllNumbers(Visitor): + def number(self, tree): + assert tree.data == "number" + tree.children[0] += 1 + + IncreaseAllNumbers().visit(parse_tree) .. autoclass:: lark.visitors.Visitor .. autoclass:: lark.visitors.Visitor_Recursive +Interpreter +----------- + +.. autoclass:: lark.visitors.Interpreter Transformer ----------- @@ -30,11 +51,6 @@ Transformer .. autoclass:: lark.visitors.Transformer :members: __default__, __default_token__ -Interpreter ------------ - -.. 
autoclass:: lark.visitors.Interpreter - v_args ------ @@ -43,4 +59,4 @@ v_args Discard ------- -.. autoclass:: lark.visitors.Discard \ No newline at end of file +.. autoclass:: lark.visitors.Discard \ No newline at end of file diff --git a/lark/exceptions.py b/lark/exceptions.py index dcd80b5..13bf83e 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -26,11 +26,12 @@ class UnexpectedEOF(ParseError): class UnexpectedInput(LarkError): """UnexpectedInput Error. + Used as a base class for the following exceptions: + - ``UnexpectedToken``: The parser recieved an unexpected token - ``UnexpectedCharacters``: The lexer encountered an unexpected string - After catching one of these exceptions, you may call the following - helper methods to create a nicer error message. + After catching one of these exceptions, you may call the following helper methods to create a nicer error message. """ pos_in_stream = None @@ -57,7 +58,7 @@ class UnexpectedInput(LarkError): def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False): """Allows you to detect what's wrong in the input text by matching against example errors. - + Given a parser instance and a dictionary mapping some label with some malformed syntax examples, it'll return the label for the example that bests matches the current error. The function will @@ -66,7 +67,7 @@ class UnexpectedInput(LarkError): For an example usage, see examples/error_reporting_lalr.py - Args: + Parameters: parse_fn: parse function (usually ``lark_instance.parse``) examples: dictionary of ``{'example_string': value}``. use_accepts: Recommended to call this with ``use_accepts=True``. diff --git a/lark/lark.py b/lark/lark.py index abc87d1..b54e725 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -27,75 +27,67 @@ class LarkOptions(Serialize): """ OPTIONS_DOC = """ - **General** - + **=== General ===** + start - The start symbol. 
Either a string, or a list of strings for - multiple possible starts (Default: "start") + The start symbol. Either a string, or a list of strings for multiple possible starts (Default: "start") debug - Display debug information, such as warnings (default: False) + Display debug information, such as warnings (default: False) transformer - Applies the transformer to every parse tree (equivlent - to applying it after the parse, but faster) + Applies the transformer to every parse tree (equivlent to applying it after the parse, but faster) propagate_positions - Propagates (line, column, end_line, end_column) attributes into all tree branches. + Propagates (line, column, end_line, end_column) attributes into all tree branches. maybe_placeholders - When True, the ``[]`` operator returns ``None`` - when not matched. When ``False``, ``[]`` behaves like the ``?`` - operator, and returns no value at all. (default= ``False``. Recommended - to set to ``True``) + When True, the ``[]`` operator returns ``None`` when not matched. + + When ``False``, ``[]`` behaves like the ``?`` operator, and returns no value at all. + (default= ``False``. Recommended to set to ``True``) regex - When True, uses the ``regex`` module instead of the - stdlib ``re``. + When True, uses the ``regex`` module instead of the stdlib ``re``. cache - Cache the results of the Lark grammar analysis, for x2 to - x3 faster loading. LALR only for now. + Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now. 
- - When ``False``, does nothing (default) - - When ``True``, caches to a temporary file in the local directory - - When given a string, caches to the path pointed by the string + - When ``False``, does nothing (default) + - When ``True``, caches to a temporary file in the local directory + - When given a string, caches to the path pointed by the string g_regex_flags - Flags that are applied to all terminals (both regex and strings) + Flags that are applied to all terminals (both regex and strings) keep_all_tokens - Prevent the tree builder from automagically removing "punctuation" tokens (default: False) + Prevent the tree builder from automagically removing "punctuation" tokens (default: False) - **Algorithm** + **=== Algorithm ===** parser - Decides which parser engine to use. Accepts "earley" or "lalr". - (Default: "earley"). (there is also a "cyk" option for legacy) + Decides which parser engine to use. Accepts "earley" or "lalr". (Default: "earley"). + (there is also a "cyk" option for legacy) lexer - Decides whether or not to use a lexer stage - - - "auto" (default): Choose for me based on the parser - - "standard": Use a standard lexer - - "contextual": Stronger lexer (only works with parser="lalr") - - "dynamic": Flexible and powerful (only with parser="earley") - - "dynamic_complete": Same as dynamic, but tries *every* variation - of tokenizing possible. + Decides whether or not to use a lexer stage + + - "auto" (default): Choose for me based on the parser + - "standard": Use a standard lexer + - "contextual": Stronger lexer (only works with parser="lalr") + - "dynamic": Flexible and powerful (only with parser="earley") + - "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible. ambiguity - Decides how to handle ambiguity in the parse. 
Only relevant if parser="earley" - - - "resolve" - The parser will automatically choose the simplest - derivation (it chooses consistently: greedy for tokens, - non-greedy for rules) - - "explicit": The parser will return all derivations wrapped in - "_ambig" tree nodes (i.e. a forest). + Decides how to handle ambiguity in the parse. Only relevant if parser="earley" - **Domain Specific** + - "resolve" - The parser will automatically choose the simplest derivation + (it chooses consistently: greedy for tokens, non-greedy for rules) + - "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). + + **=== Misc. / Domain Specific ===** postlex - Lexer post-processing (Default: None) Only works with the - standard and contextual lexers. + Lexer post-processing (Default: None) Only works with the standard and contextual lexers. priority - How priorities should be evaluated - auto, none, normal, invert (Default: auto) + How priorities should be evaluated - auto, none, normal, invert (Default: auto) lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. + Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. use_bytes - Accept an input of type ``bytes`` instead of ``str`` (Python 3 only). + Accept an input of type ``bytes`` instead of ``str`` (Python 3 only). edit_terminals - A callback + A callback for editing the terminals before parse. """ if __doc__: __doc__ += OPTIONS_DOC @@ -170,13 +162,11 @@ class LarkOptions(Serialize): class Lark(Serialize): """Main interface for the library. - It's mostly a thin wrapper for the many different parsers, and for - the tree constructor. + It's mostly a thin wrapper for the many different parsers, and for the tree constructor. - Args: - grammar: a string or file-object containing the - grammar spec (using Lark's ebnf syntax) - options : a dictionary controlling various aspects of Lark. 
+ Parameters: + grammar: a string or file-object containing the grammar spec (using Lark's ebnf syntax) + options: a dictionary controlling various aspects of Lark. Example: >>> Lark(r'''start: "foo" ''') @@ -317,8 +307,7 @@ class Lark(Serialize): self.save(f) # TODO: merge with above - if __init__.__doc__: - __init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC + __doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC __serialize_fields__ = 'parser', 'rules', 'options' @@ -391,8 +380,7 @@ class Lark(Serialize): def open(cls, grammar_filename, rel_to=None, **options): """Create an instance of Lark with the grammar given by its filename - If ``rel_to`` is provided, the function will find the grammar - filename in relation to it. + If ``rel_to`` is provided, the function will find the grammar filename in relation to it. Example: @@ -426,17 +414,15 @@ class Lark(Serialize): def parse(self, text, start=None, on_error=None): """Parse the given text, according to the options provided. - If a transformer is supplied to ``__init__``, returns whatever is the - result of the transformation. - - Args: + Parameters: text (str): Text to be parsed. - start (str, optional): Required if Lark was given multiple - possible start symbols (using the start option). - on_error (function, optional): if provided, will be called on - UnexpectedToken error. Return true to resume parsing. - LALR only. See examples/error_puppet.py for an example - of how to use on_error. + start (str, optional): Required if Lark was given multiple possible start symbols (using the start option). + on_error (function, optional): if provided, will be called on UnexpectedToken error. Return true to resume parsing. + LALR only. See examples/error_puppet.py for an example of how to use on_error. + + Returns: + If a transformer is supplied to ``__init__``, returns whatever is the + result of the transformation. Otherwise, returns a Tree instance. 
""" diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py index 63642ae..50272af 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -7,11 +7,9 @@ from .. import Token class ParserPuppet(object): - """ParserPuppet gives you advanced control over error handling when - parsing with LALR. + """ParserPuppet gives you advanced control over error handling when parsing with LALR. - For a simpler, more streamlined interface, see the ``on_error`` - argument to ``Lark.parse()``. + For a simpler, more streamlined interface, see the ``on_error`` argument to ``Lark.parse()``. """ def __init__(self, parser, state_stack, value_stack, start, stream, set_state): self.parser = parser @@ -24,8 +22,7 @@ class ParserPuppet(object): self.result = None def feed_token(self, token): - """Feed the parser with a token, and advance it to the next state, - as if it recieved it from the lexer. + """Feed the parser with a token, and advance it to the next state, as if it recieved it from the lexer. Note that ``token`` has to be an instance of ``Token``. """ @@ -89,9 +86,9 @@ class ParserPuppet(object): return '\n'.join(out) def choices(self): - """Returns a dictionary of token types, matched to their action in - the parser. Only returns token types that are accepted by the - current state. + """Returns a dictionary of token types, matched to their action in the parser. + + Only returns token types that are accepted by the current state. Updated by ``feed_token()``. """ diff --git a/lark/tree.py b/lark/tree.py index b48450e..b9dddf4 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -18,15 +18,14 @@ class Meta: class Tree(object): """The main tree class. - Creates a new tree, and stores "data" and "children" in attributes of - the same name. Trees can be hashed and compared. + Creates a new tree, and stores "data" and "children" in attributes of the same name. + Trees can be hashed and compared. 
- Args: + Parameters: data: The name of the rule or alias children: List of matched sub-rules and terminals meta: Line & Column numbers (if ``propagate_positions`` is enabled). - meta attributes: line, column, start_pos, end_line, - end_column, end_pos + meta attributes: line, column, start_pos, end_line, end_column, end_pos """ def __init__(self, data, children, meta=None): self.data = data @@ -79,9 +78,8 @@ class Tree(object): def iter_subtrees(self): """Depth-first iteration. - - Iterates over all the subtrees, never returning to the - same node twice (Lark's parse-tree is actually a DAG). + + Iterates over all the subtrees, never returning to the same node twice (Lark's parse-tree is actually a DAG). """ queue = [self] subtrees = OrderedDict() @@ -121,8 +119,7 @@ class Tree(object): def iter_subtrees_topdown(self): """Breadth-first iteration. - Iterates over all the subtrees, return nodes in order like - pretty() does. + Iterates over all the subtrees, return nodes in order like pretty() does. """ stack = [self] while stack: diff --git a/lark/visitors.py b/lark/visitors.py index 81bb831..6ea39b0 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -45,28 +45,23 @@ class _Decoratable: class Transformer(_Decoratable): - """Transformer visit each node of the tree, and run the appropriate method - on it according to the node's data. + """Transformers visit each node of the tree, and run the appropriate method on it according to the node's data. - Calls its methods (provided by user via inheritance) according to - ``tree.data``. The returned value replaces the old one in the structure. + Calls its methods (provided by user via inheritance) according to ``tree.data``. + The returned value replaces the old one in the structure. - They work bottom-up (or depth-first), starting with the leaves and - ending at the root of the tree. Transformers can be used to - implement map & reduce patterns. 
Because nodes are reduced from leaf to - root, at any point the callbacks may assume the children have already been - transformed (if applicable). ``Transformer`` can do anything ``Visitor`` - can do, but because it reconstructs the tree, it is slightly less - efficient. + They work bottom-up (or depth-first), starting with the leaves and ending at the root of the tree. + Transformers can be used to implement map & reduce patterns. Because nodes are reduced from leaf to root, + at any point the callbacks may assume the children have already been transformed (if applicable). + + ``Transformer`` can do anything ``Visitor`` can do, but because it reconstructs the tree, + it is slightly less efficient. It can be used to implement map or reduce patterns. All these classes implement the transformer interface: - - ``Transformer`` - Recursively transforms the tree. This is the one you - probably want. - - ``Transformer_InPlace`` - Non-recursive. Changes the tree in-place - instead of returning new instances - - ``Transformer_InPlaceRecursive`` - Recursive. Changes the tree in-place - instead of returning new instances + - ``Transformer`` - Recursively transforms the tree. This is the one you probably want. + - ``Transformer_InPlace`` - Non-recursive. Changes the tree in-place instead of returning new instances + - ``Transformer_InPlaceRecursive`` - Recursive. Changes the tree in-place instead of returning new instances Example: :: @@ -82,7 +77,7 @@ class Transformer(_Decoratable): # Prints: Tree(a, [3]) - Args: + Parameters: visit_tokens: By default, transformers only visit rules. visit_tokens=True will tell ``Transformer`` to visit tokens as well. This is a slightly slower alternative to lexer_callbacks @@ -164,16 +159,16 @@ class Transformer(_Decoratable): def __default__(self, data, children, meta): """Default operation on tree (for override) - Function that is called on if a function with a corresponding name has - not been found. 
Defaults to reconstruct the Tree + Function that is called on if a function with a corresponding name has not been found. + Defaults to reconstruct the Tree. """ return Tree(data, children, meta) def __default_token__(self, token): """Default operation on token (for override) - - Function that is called on if a function with a corresponding name has - not been found. Defaults to just return the argument. + + Function that is called on if a function with a corresponding name has not been found. + Defaults to just return the argument. """ return token @@ -259,25 +254,6 @@ class Transformer_InPlaceRecursive(Transformer): # Visitors class VisitorBase: - """Visitors visit each node of the tree - - Run the appropriate method on it according to the node's data. - They work bottom-up, starting with the leaves and ending at the root - of the tree. There are two classes that implement the visitor interface: - - - ``Visitor``: Visit every node (without recursion) - - ``Visitor_Recursive``: Visit every node using recursion. Slightly faster. - - Example: - :: - - class IncreaseAllNumbers(Visitor): - def number(self, tree): - assert tree.data == "number" - tree.children[0] += 1 - - IncreaseAllNumbers().visit(parse_tree) - """ def _call_userfunc(self, tree): return getattr(self, tree.data, self.__default__)(tree) @@ -293,8 +269,7 @@ class Visitor(VisitorBase): """Bottom-up visitor, non-recursive. Visits the tree, starting with the leaves and finally the root (bottom-up) - Calls its methods (provided by user via inheritance) according to - ``tree.data`` + Calls its methods (provided by user via inheritance) according to ``tree.data`` """ def visit(self, tree): @@ -312,8 +287,7 @@ class Visitor_Recursive(VisitorBase): """Bottom-up visitor, recursive. 
Visits the tree, starting with the leaves and finally the root (bottom-up) - Calls its methods (provided by user via inheritance) according to - ``tree.data`` + Calls its methods (provided by user via inheritance) according to ``tree.data`` """ def visit(self, tree): @@ -348,13 +322,12 @@ class Interpreter(_Decoratable): """Interpreter walks the tree starting at the root. Visits the tree, starting with the root and finally the leaves (top-down) - Calls its methods (provided by user via inheritance) according to - ``tree.data`` - Unlike ``Transformer`` and ``Visitor``, the Interpreter doesn't - automatically visit its sub-branches. The user has to explicitly call ``visit``, - ``visit_children``, or use the ``@visit_children_decor``. This allows the - user to implement branching and loops. + For each tree node, it calls its methods (provided by user via inheritance) according to ``tree.data``. + + Unlike ``Transformer`` and ``Visitor``, the Interpreter doesn't automatically visit its sub-branches. + The user has to explicitly call ``visit``, ``visit_children``, or use the ``@visit_children_decor``. + This allows the user to implement branching and loops. Example: :: @@ -452,21 +425,17 @@ def _vargs_tree(f, data, children, meta): def v_args(inline=False, meta=False, tree=False, wrapper=None): - """A convenience decorator factory for modifying the behavior of - user-supplied visitor methods. - - By default, callback methods of transformers/visitors accept one argument - - a list of the node's children. ``v_args`` can modify this behavior. When - used on a transformer/visitor class definition, it applies to all the - callback methods inside it. Accepts one of three following flags. - - Args: - inline: Children are provided as ``*args`` instead of a list - argument (not recommended for very long lists). - meta: Provides two arguments: ``children`` and ``meta`` (instead of - just the first) - tree: Provides the entire tree as the argument, instead of the - children. 
+ """A convenience decorator factory for modifying the behavior of user-supplied visitor methods. + + By default, callback methods of transformers/visitors accept one argument - a list of the node's children. + + ``v_args`` can modify this behavior. When used on a transformer/visitor class definition, + it applies to all the callback methods inside it. + + Parameters: + inline: Children are provided as ``*args`` instead of a list argument (not recommended for very long lists). + meta: Provides two arguments: ``children`` and ``meta`` (instead of just the first) + tree: Provides the entire tree as the argument, instead of the children. Example: :: From c6438007a7e274d2c4496842c31ac558cc5ef3a8 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 16 Aug 2020 20:09:11 +0300 Subject: [PATCH 128/164] A tiny fix --- docs/classes.rst | 2 ++ lark/exceptions.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/classes.rst b/docs/classes.rst index 3778147..59a6f0b 100644 --- a/docs/classes.rst +++ b/docs/classes.rst @@ -58,6 +58,8 @@ UnexpectedInput .. autoclass:: lark.exceptions.UnexpectedCharacters +.. _parserpuppet: + ParserPuppet ------------ diff --git a/lark/exceptions.py b/lark/exceptions.py index 13bf83e..c538888 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -65,7 +65,7 @@ class UnexpectedInput(LarkError): iterate the dictionary until it finds a matching error, and return the corresponding value. - For an example usage, see examples/error_reporting_lalr.py + For an example usage, see `examples/error_reporting_lalr.py` Parameters: parse_fn: parse function (usually ``lark_instance.parse``) @@ -141,7 +141,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): with its internal state. Users can then interactively set the puppet to the desired puppet state, and resume regular parsing. - see: ``ParserPuppet``. + see: :ref:`ParserPuppet`. 
""" def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): self.line = getattr(token, 'line', '?') From bf2d9bf7b16cddb39f2e0ea3cefecc8de5269e2c Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 16 Aug 2020 20:43:57 +0300 Subject: [PATCH 129/164] Standalone generator now remove docstrings and comments. The result is a much smaller file. --- examples/standalone/json_parser.py | 513 +++++++++++++++-------------- lark/tools/standalone.py | 39 ++- 2 files changed, 307 insertions(+), 245 deletions(-) diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py index cadc51d..d20cb4b 100644 --- a/examples/standalone/json_parser.py +++ b/examples/standalone/json_parser.py @@ -29,7 +29,6 @@ __version__ = "0.9.0" import os from io import open -import logging class LarkError(Exception): @@ -53,9 +52,11 @@ class UnexpectedEOF(ParseError): class UnexpectedInput(LarkError): + #-- pos_in_stream = None def get_context(self, text, span=40): + #-- pos = self.pos_in_stream start = max(pos - span, 0) end = pos + span @@ -69,12 +70,7 @@ class UnexpectedInput(LarkError): return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False): - """ Given a parser instance and a dictionary mapping some label with - some malformed syntax examples, it'll return the label for the - example that bests matches the current error. - - It's recommended to call this with `use_accepts=True`. The default is False for backwards compatibility. 
- """ + #-- assert self.state is not None, "Not supported for this exception" if isinstance(examples, dict): @@ -90,24 +86,26 @@ class UnexpectedInput(LarkError): except UnexpectedInput as ut: if ut.state == self.state: if use_accepts and ut.accepts != self.accepts: - logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % + logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % (self.state, self.accepts, ut.accepts, i, j)) continue try: - if ut.token == self.token: # Try exact match first - logging.debug("Exact Match at example [%s][%s]" % (i, j)) + if ut.token == self.token: ## + + logger.debug("Exact Match at example [%s][%s]" % (i, j)) return label if token_type_match_fallback: - # Fallback to token types match + ## + if (ut.token.type == self.token.type) and not candidate[-1]: - logging.debug("Token Type Fallback at example [%s][%s]" % (i, j)) + logger.debug("Token Type Fallback at example [%s][%s]" % (i, j)) candidate = label, True except AttributeError: pass if not candidate[0]: - logging.debug("Same State match at example [%s][%s]" % (i, j)) + logger.debug("Same State match at example [%s][%s]" % (i, j)) candidate = label, False return candidate[0] @@ -138,8 +136,8 @@ class UnexpectedCharacters(LexError, UnexpectedInput): super(UnexpectedCharacters, self).__init__(message) - class UnexpectedToken(ParseError, UnexpectedInput): + #-- def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') @@ -147,12 +145,15 @@ class UnexpectedToken(ParseError, UnexpectedInput): self.state = state self.token = token - self.expected = expected # XXX deprecate? 
`accepts` is better + self.expected = expected ## + self.considered_rules = considered_rules self.puppet = puppet - # TODO Only calculate `accepts()` when we need to display it to the user - # This will improve performance when doing automatic error handling + ## + + ## + self.accepts = puppet and puppet.accepts() message = ("Unexpected token %r at line %s, column %s.\n" @@ -161,13 +162,9 @@ class UnexpectedToken(ParseError, UnexpectedInput): super(UnexpectedToken, self).__init__(message) -class VisitError(LarkError): - """VisitError is raised when visitors are interrupted by an exception - It provides the following attributes for inspection: - - obj: the tree node or token it was processing when the exception was raised - - orig_exc: the exception that cause it to fail - """ +class VisitError(LarkError): + #-- def __init__(self, rule, obj, orig_exc): self.obj = obj self.orig_exc = orig_exc @@ -175,6 +172,16 @@ class VisitError(LarkError): message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) +import logging +logger = logging.getLogger("lark") +logger.addHandler(logging.StreamHandler()) +## + +## + +logger.setLevel(logging.CRITICAL) + + def classify(seq, key=None, value=None): d = {} for item in seq: @@ -189,7 +196,8 @@ def classify(seq, key=None, value=None): def _deserialize(data, namespace, memo): if isinstance(data, dict): - if '__type__' in data: # Object + if '__type__' in data: ## + class_ = namespace[data['__type__']] return class_.deserialize(data, memo) elif '@' in data: @@ -260,7 +268,8 @@ class SerializeMemoizer(Serialize): try: STRING_TYPE = basestring -except NameError: # Python 3 +except NameError: ## + STRING_TYPE = str @@ -270,9 +279,11 @@ from contextlib import contextmanager Str = type(u'') try: - classtype = types.ClassType # Python2 + classtype = types.ClassType ## + except AttributeError: - classtype = type # Python3 + classtype = type ## + def smart_decorator(f, create_decorator): if 
isinstance(f, types.FunctionType): @@ -285,7 +296,8 @@ def smart_decorator(f, create_decorator): return wraps(f)(create_decorator(f.__func__, True)) elif isinstance(f, partial): - # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445 + ## + return wraps(f.func)(create_decorator(lambda *args, **kw: f(*args[1:], **kw), True)) else: @@ -304,9 +316,12 @@ import sre_constants categ_pattern = re.compile(r'\\p{[A-Za-z_]+}') def get_regexp_width(expr): if regex: - # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with - # a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex - # match here below. + ## + + ## + + ## + regexp_final = re.sub(categ_pattern, 'A', expr) else: if re.search(categ_pattern, expr): @@ -325,7 +340,9 @@ class Meta: def __init__(self): self.empty = True + class Tree(object): + #-- def __init__(self, data, children, meta=None): self.data = data self.children = children @@ -357,6 +374,7 @@ class Tree(object): return l def pretty(self, indent_str=' '): + #-- return ''.join(self._pretty(0, indent_str)) def __eq__(self, other): @@ -372,6 +390,7 @@ class Tree(object): return hash((self.data, tuple(self.children))) def iter_subtrees(self): + #-- queue = [self] subtrees = OrderedDict() for subtree in queue: @@ -383,23 +402,25 @@ class Tree(object): return reversed(list(subtrees.values())) def find_pred(self, pred): - "Find all nodes where pred(tree) == True" + #-- return filter(pred, self.iter_subtrees()) def find_data(self, data): - "Find all nodes where tree.data == data" + #-- return self.find_pred(lambda t: t.data == data) from inspect import getmembers, getmro class Discard(Exception): + #-- pass -# Transformers +## + class _Decoratable: - "Provides support for decorating methods with @v_args" + #-- @classmethod def _apply_decorator(cls, decorator, **kwargs): @@ -408,13 +429,15 @@ class _Decoratable: libmembers = {name for _cls in mro[1:] 
for name, _ in getmembers(_cls)} for name, value in getmembers(cls): - # Make sure the function isn't inherited (unless it's overwritten) + ## + if name.startswith('_') or (name in libmembers and name not in cls.__dict__): continue if not callable(value): continue - # Skip if v_args already applied (at the function level) + ## + if hasattr(cls.__dict__[name], 'vargs_applied') or hasattr(value, 'vargs_applied'): continue @@ -427,20 +450,16 @@ class _Decoratable: class Transformer(_Decoratable): - """Visits the tree recursively, starting with the leaves and finally the root (bottom-up) + #-- + __visit_tokens__ = True ## - Calls its methods (provided by user via inheritance) according to tree.data - The returned value replaces the old one in the structure. - - Can be used to implement map or reduce. - """ - __visit_tokens__ = True # For backwards compatibility def __init__(self, visit_tokens=True): self.__visit_tokens__ = visit_tokens def _call_userfunc(self, tree, new_children=None): - # Assumes tree is already transformed + ## + children = new_children if new_children is not None else tree.children try: f = getattr(self, tree.data) @@ -495,18 +514,20 @@ class Transformer(_Decoratable): return TransformerChain(self, other) def __default__(self, data, children, meta): - "Default operation on tree (for override)" + #-- return Tree(data, children, meta) def __default_token__(self, token): - "Default operation on token (for override)" + #-- return token -class InlineTransformer(Transformer): # XXX Deprecated +class InlineTransformer(Transformer): ## + def _call_userfunc(self, tree, new_children=None): - # Assumes tree is already transformed + ## + children = new_children if new_children is not None else tree.children try: f = getattr(self, tree.data) @@ -530,8 +551,9 @@ class TransformerChain(object): class Transformer_InPlace(Transformer): - "Non-recursive. 
Changes the tree in-place instead of returning new instances" - def _transform_tree(self, tree): # Cancel recursion + #-- + def _transform_tree(self, tree): ## + return self._call_userfunc(tree) def transform(self, tree): @@ -542,10 +564,11 @@ class Transformer_InPlace(Transformer): class Transformer_NonRecursive(Transformer): - "Non-recursive. Doesn't change the original tree." + #-- def transform(self, tree): - # Tree to postfix + ## + rev_postfix = [] q = [tree] while q: @@ -554,7 +577,8 @@ class Transformer_NonRecursive(Transformer): if isinstance(t, Tree): q += t.children - # Postfix to tree + ## + stack = [] for x in reversed(rev_postfix): if isinstance(x, Tree): @@ -568,27 +592,29 @@ class Transformer_NonRecursive(Transformer): else: stack.append(x) - t ,= stack # We should have only one tree remaining + t ,= stack ## + return t class Transformer_InPlaceRecursive(Transformer): - "Recursive. Changes the tree in-place instead of returning new instances" + #-- def _transform_tree(self, tree): tree.children = list(self._transform_children(tree.children)) return self._call_userfunc(tree) -# Visitors +## + class VisitorBase: def _call_userfunc(self, tree): return getattr(self, tree.data, self.__default__)(tree) def __default__(self, tree): - "Default operation on tree (for override)" + #-- return tree def __class_getitem__(cls, _): @@ -596,11 +622,7 @@ class VisitorBase: class Visitor(VisitorBase): - """Bottom-up visitor, non-recursive - - Visits the tree, starting with the leaves and finally the root (bottom-up) - Calls its methods (provided by user via inheritance) according to tree.data - """ + #-- def visit(self, tree): for subtree in tree.iter_subtrees(): @@ -612,12 +634,9 @@ class Visitor(VisitorBase): self._call_userfunc(subtree) return tree -class Visitor_Recursive(VisitorBase): - """Bottom-up visitor, recursive - Visits the tree, starting with the leaves and finally the root (bottom-up) - Calls its methods (provided by user via inheritance) according to 
tree.data - """ +class Visitor_Recursive(VisitorBase): + #-- def visit(self, tree): for child in tree.children: @@ -639,7 +658,7 @@ class Visitor_Recursive(VisitorBase): def visit_children_decor(func): - "See Interpreter" + #-- @wraps(func) def inner(cls, tree): values = cls.visit_children(tree) @@ -648,14 +667,7 @@ def visit_children_decor(func): class Interpreter(_Decoratable): - """Top-down visitor, recursive - - Visits the tree, starting with the root and finally the leaves (top-down) - Calls its methods (provided by user via inheritance) according to tree.data - - Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches. - The user has to explicitly call visit, visit_children, or use the @visit_children_decor - """ + #-- def visit(self, tree): f = getattr(self, tree.data) @@ -678,7 +690,8 @@ class Interpreter(_Decoratable): -# Decorators +## + def _apply_decorator(obj, decorator, **kwargs): try: @@ -704,7 +717,8 @@ def _inline_args__func(func): return smart_decorator(func, create_decorator) -def inline_args(obj): # XXX Deprecated +def inline_args(obj): ## + return _apply_decorator(obj, _inline_args__func) @@ -733,12 +747,14 @@ def _vargs_inline(f, data, children, meta): def _vargs_meta_inline(f, data, children, meta): return f(meta, *children) def _vargs_meta(f, data, children, meta): - return f(children, meta) # TODO swap these for consistency? Backwards incompatible! 
+ return f(children, meta) ## + def _vargs_tree(f, data, children, meta): return f(Tree(data, children, meta)) + def v_args(inline=False, meta=False, tree=False, wrapper=None): - "A convenience decorator factory, for modifying the behavior of user-supplied visitor methods" + #-- if tree and (meta or inline): raise ValueError("Visitor functions cannot combine 'tree' with 'meta' or 'inline'.") @@ -776,7 +792,8 @@ class Indenter: yield token - indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces + indent_str = token.rsplit('\n', 1)[1] ## + indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len if indent > self.indent_level[-1]: @@ -814,7 +831,8 @@ class Indenter: self.indent_level = [0] return self._process(stream) - # XXX Hack for ContextualLexer. Maybe there's a more elegant solution? + ## + @property def always_accept(self): return (self.NL_type,) @@ -887,11 +905,7 @@ class RuleOptions(Serialize): class Rule(Serialize): - """ - origin : a symbol - expansion : a list of symbols - order : index of this expansion amongst all rules of the same name - """ + #-- __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash') __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options' @@ -936,7 +950,8 @@ class Pattern(Serialize): def __repr__(self): return repr(self.to_regexp()) - # Pattern Hashing assumes all subclasses have a different priority! 
+ ## + def __hash__(self): return hash((type(self), self.value, self.flags)) def __eq__(self, other): @@ -946,7 +961,8 @@ class Pattern(Serialize): raise NotImplementedError() if Py36: - # Python 3.6 changed syntax for flags in regular expression + ## + def _get_flags(self, value): for f in self.flags: value = ('(?%s:%s)' % (f, value)) @@ -1009,6 +1025,7 @@ class TerminalDef(Serialize): class Token(Str): + #-- __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None, end_pos=None): @@ -1066,10 +1083,7 @@ class LineCounter: self.line_start_pos = 0 def feed(self, token, test_newline=True): - """Consume a token and calculate the new line & column. - - As an optional optimization, set test_newline=False is token doesn't contain a newline. - """ + #-- if test_newline: newlines = token.count(self.newline_char) if newlines: @@ -1080,7 +1094,7 @@ class LineCounter: self.column = self.char_pos - self.line_start_pos + 1 class _Lex: - "Built to serve both Lexer and ContextualLexer" + #-- def __init__(self, lexer, state=None): self.lexer = lexer self.state = state @@ -1155,7 +1169,8 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): embedded_strs = set() callback = {} for retok in tokens_by_type.get(PatternRE, []): - unless = [] # {} + unless = [] ## + for strtok in tokens_by_type.get(PatternStr, []): if strtok.priority > retok.priority: continue @@ -1173,9 +1188,12 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes): - # Python sets an unreasonable group limit (currently 100) in its re module - # Worse, the only way to know we reached it is by catching an AssertionError! - # This function recursively tries less and less groups until it's successful. 
+ ## + + ## + + ## + postfix = '$' if match_whole else '' mres = [] while terminals: @@ -1184,10 +1202,12 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes) pattern = pattern.encode('latin-1') try: mre = re_.compile(pattern, g_regex_flags) - except AssertionError: # Yes, this is what Python provides us.. :/ + except AssertionError: ## + return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes) - # terms_from_name = {t.name: t for t in terminals[:max_size]} + ## + mres.append((mre, {i:n for n,i in mre.groupindex.items()} )) terminals = terminals[max_size:] return mres @@ -1196,21 +1216,11 @@ def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False): return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes) def _regexp_has_newline(r): - r"""Expressions that may indicate newlines in a regexp: - - newlines (\n) - - escaped newline (\\n) - - anything but ([^...]) - - any-char (.) when the flag (?s) exists - - spaces (\s) - """ + #-- return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' 
in r) class Lexer(object): - """Lexer interface - - Method Signatures: - lex(self, stream) -> Iterator[Token] - """ + #-- lex = NotImplemented @@ -1223,7 +1233,8 @@ class TraditionalLexer(Lexer): self.re = conf.re_module if not conf.skip_validation: - # Sanitization + ## + for t in terminals: try: self.re.compile(t.pattern.to_regexp(), conf.g_regex_flags) @@ -1235,7 +1246,8 @@ class TraditionalLexer(Lexer): assert set(conf.ignore) <= {t.name for t in terminals} - # Init + ## + self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())] self.ignore_types = list(conf.ignore) @@ -1246,7 +1258,8 @@ class TraditionalLexer(Lexer): self.use_bytes = conf.use_bytes self._mres = None - # self.build(g_regex_flags) + ## + def _build(self): terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes) @@ -1254,7 +1267,8 @@ class TraditionalLexer(Lexer): for type_, f in self.user_callbacks.items(): if type_ in self.callback: - # Already a callback there, probably UnlessCallback + ## + self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_) else: self.callback[type_] = f @@ -1318,11 +1332,15 @@ class ContextualLexer(Lexer): yield x parser_state = get_parser_state() l.lexer = self.lexers[parser_state] - l.state = parser_state # For debug only, no need to worry about multithreading + l.state = parser_state ## + except UnexpectedCharacters as e: - # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, - # but not in the current context. - # This tests the input against the global context, to provide a nicer error. 
+ ## + + ## + + ## + root_match = self.root_lexer.match(stream, e.pos_in_stream) if not root_match: raise @@ -1338,7 +1356,8 @@ class LexerConf(Serialize): __serialize_namespace__ = TerminalDef, def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): - self.tokens = tokens # TODO should be terminals + self.tokens = tokens ## + self.ignore = ignore self.postlex = postlex self.callbacks = callbacks or {} @@ -1369,7 +1388,8 @@ class PropagatePositions: def __call__(self, children): res = self.node_builder(children) - # local reference to Tree.meta reduces number of presence checks + ## + if isinstance(res, Tree): res_meta = res.meta for c in children: @@ -1430,7 +1450,7 @@ class ChildFilter: return self.node_builder(filtered) class ChildFilterLALR(ChildFilter): - "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)" + #-- def __call__(self, children): filtered = [] @@ -1440,7 +1460,8 @@ class ChildFilterLALR(ChildFilter): if to_expand: if filtered: filtered += children[i].children - else: # Optimize for left-recursion + else: ## + filtered = children[i].children else: filtered.append(children[i]) @@ -1451,7 +1472,7 @@ class ChildFilterLALR(ChildFilter): return self.node_builder(filtered) class ChildFilterLALR_NoPlaceholders(ChildFilter): - "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)" + #-- def __init__(self, to_include, node_builder): self.node_builder = node_builder self.to_include = to_include @@ -1462,7 +1483,8 @@ class ChildFilterLALR_NoPlaceholders(ChildFilter): if to_expand: if filtered: filtered += children[i].children - else: # Optimize for left-recursion + else: ## + filtered = children[i].children else: filtered.append(children[i]) @@ -1472,7 +1494,8 @@ def _should_expand(sym): return not sym.is_term and sym.name.startswith('_') def maybe_create_child_filter(expansion, 
keep_all_tokens, ambiguous, _empty_indices): - # Prepare empty_indices as: How many Nones to insert at each index? + ## + if _empty_indices: assert _empty_indices.count(False) == len(expansion) s = ''.join(str(int(b)) for b in _empty_indices) @@ -1495,14 +1518,12 @@ def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indi if _empty_indices or ambiguous: return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include, nones_to_add) else: - # LALR without placeholders + ## + return partial(ChildFilterLALR_NoPlaceholders, [(i, x) for i,x,_ in to_include]) class AmbiguousExpander: - """Deal with the case where we're expanding children ('_rule') into a parent but the children - are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself - ambiguous with as many copies as their are ambiguous children, and then copy the ambiguous children - into the right parents in the right places, essentially shifting the ambiguiuty up the tree.""" + #-- def __init__(self, to_expand, tree_class, node_builder): self.node_builder = node_builder self.tree_class = tree_class @@ -1512,10 +1533,14 @@ class AmbiguousExpander: def _is_ambig_tree(child): return hasattr(child, 'data') and child.data == '_ambig' - #### When we're repeatedly expanding ambiguities we can end up with nested ambiguities. - # All children of an _ambig node should be a derivation of that ambig node, hence - # it is safe to assume that if we see an _ambig node nested within an ambig node - # it is safe to simply expand it into the parent _ambig node as an alternative derivation. + ## + + ## + + ## + + ## + ambiguous = [] for i, child in enumerate(children): if _is_ambig_tree(child): @@ -1546,7 +1571,8 @@ def ptb_inline_args(func): def inplace_transformer(func): @wraps(func) def f(children): - # function name in a Transformer is a rule name. 
+ ## + tree = Tree(func.__name__, children) return func(tree) return f @@ -1594,7 +1620,8 @@ class ParseTreeBuilder: user_callback_name = rule.alias or rule.options.template_source or rule.origin.name try: f = getattr(transformer, user_callback_name) - # XXX InlineTransformer is deprecated! + ## + wrapper = getattr(f, 'visit_wrapper', None) if wrapper is not None: f = apply_visit_wrapper(f, user_callback_name, wrapper) @@ -1669,7 +1696,8 @@ class _Parser: expected = {s for s in states[state].keys() if s.isupper()} try: puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) - except NameError: # For standalone parser + except NameError: ## + puppet = None raise UnexpectedToken(token, expected, state=state, puppet=puppet) @@ -1689,7 +1717,8 @@ class _Parser: state_stack.append(new_state) value_stack.append(value) - # Main LALR-parser loop + ## + try: for token in stream: while True: @@ -1700,7 +1729,8 @@ class _Parser: state_stack.append(arg) value_stack.append(token) if set_state: set_state(arg) - break # next token + break ## + else: reduce(arg) except Exception as e: @@ -1923,67 +1953,69 @@ class LALR_ContextualLexer(LALR_WithLexer): class LarkOptions(Serialize): - """Specifies the options for Lark - - """ + #-- OPTIONS_DOC = """ -# General - - start - The start symbol. Either a string, or a list of strings for - multiple possible starts (Default: "start") - debug - Display debug information, such as warnings (default: False) - transformer - Applies the transformer to every parse tree (equivlent to - applying it after the parse, but faster) - propagate_positions - Propagates (line, column, end_line, end_column) - attributes into all tree branches. - maybe_placeholders - When True, the `[]` operator returns `None` when not matched. - When `False`, `[]` behaves like the `?` operator, - and returns no value at all. - (default=`False`. Recommended to set to `True`) - regex - When True, uses the `regex` module instead of the stdlib `re`. 
- cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. - LALR only for now. - When `False`, does nothing (default) - When `True`, caches to a temporary file in the local directory - When given a string, caches to the path pointed by the string - - g_regex_flags - Flags that are applied to all terminals - (both regex and strings) - keep_all_tokens - Prevent the tree builder from automagically - removing "punctuation" tokens (default: False) - -# Algorithm - - parser - Decides which parser engine to use - Accepts "earley" or "lalr". (Default: "earley") - (there is also a "cyk" option for legacy) - - lexer - Decides whether or not to use a lexer stage - "auto" (default): Choose for me based on the parser - "standard": Use a standard lexer - "contextual": Stronger lexer (only works with parser="lalr") - "dynamic": Flexible and powerful (only with parser="earley") - "dynamic_complete": Same as dynamic, but tries *every* variation - of tokenizing possible. - - ambiguity - Decides how to handle ambiguity in the parse. - Only relevant if parser="earley" - "resolve": The parser will automatically choose the simplest - derivation (it chooses consistently: greedy for - tokens, non-greedy for rules) - "explicit": The parser will return all derivations wrapped - in "_ambig" tree nodes (i.e. a forest). - -# Domain Specific - - postlex - Lexer post-processing (Default: None) Only works with the - standard and contextual lexers. - priority - How priorities should be evaluated - auto, none, normal, - invert (Default: auto) - lexer_callbacks - Dictionary of callbacks for the lexer. May alter - tokens during lexing. Use with caution. - use_bytes - Accept an input of type `bytes` instead of `str` (Python 3 only). - edit_terminals - A callback + **=== General ===** + + start + The start symbol. 
Either a string, or a list of strings for multiple possible starts (Default: "start") + debug + Display debug information, such as warnings (default: False) + transformer + Applies the transformer to every parse tree (equivlent to applying it after the parse, but faster) + propagate_positions + Propagates (line, column, end_line, end_column) attributes into all tree branches. + maybe_placeholders + When True, the ``[]`` operator returns ``None`` when not matched. + + When ``False``, ``[]`` behaves like the ``?`` operator, and returns no value at all. + (default= ``False``. Recommended to set to ``True``) + regex + When True, uses the ``regex`` module instead of the stdlib ``re``. + cache + Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now. + + - When ``False``, does nothing (default) + - When ``True``, caches to a temporary file in the local directory + - When given a string, caches to the path pointed by the string + + g_regex_flags + Flags that are applied to all terminals (both regex and strings) + keep_all_tokens + Prevent the tree builder from automagically removing "punctuation" tokens (default: False) + + **=== Algorithm ===** + + parser + Decides which parser engine to use. Accepts "earley" or "lalr". (Default: "earley"). + (there is also a "cyk" option for legacy) + lexer + Decides whether or not to use a lexer stage + + - "auto" (default): Choose for me based on the parser + - "standard": Use a standard lexer + - "contextual": Stronger lexer (only works with parser="lalr") + - "dynamic": Flexible and powerful (only with parser="earley") + - "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible. + ambiguity + Decides how to handle ambiguity in the parse. 
Only relevant if parser="earley" + + - "resolve" - The parser will automatically choose the simplest derivation + (it chooses consistently: greedy for tokens, non-greedy for rules) + - "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). + + **=== Misc. / Domain Specific ===** + + postlex + Lexer post-processing (Default: None) Only works with the standard and contextual lexers. + priority + How priorities should be evaluated - auto, none, normal, invert (Default: auto) + lexer_callbacks + Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. + use_bytes + Accept an input of type ``bytes`` instead of ``str`` (Python 3 only). + edit_terminals + A callback for editing the terminals before parse. """ if __doc__: __doc__ += OPTIONS_DOC @@ -2056,15 +2088,12 @@ class LarkOptions(Serialize): class Lark(Serialize): + #-- def __init__(self, grammar, **options): - """ - grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax) - options : a dictionary controlling various aspects of Lark. 
- """ - self.options = LarkOptions(options) - # Set regex or re module + ## + use_regex = self.options.regex if use_regex: if regex: @@ -2074,13 +2103,15 @@ class Lark(Serialize): else: re_module = re - # Some, but not all file-like objects have a 'name' attribute + ## + try: self.source = grammar.name except AttributeError: self.source = '' - # Drain file-like objects to get their contents + ## + try: read = grammar.read except AttributeError: @@ -2114,7 +2145,7 @@ class Lark(Serialize): cache_fn = '.lark_cache_%s.tmp' % md5 if FS.exists(cache_fn): - logging.debug('Loading grammar from cache: %s', cache_fn) + logger.debug('Loading grammar from cache: %s', cache_fn) with FS.open(cache_fn, 'rb') as f: self._load(f, self.options.transformer, self.options.postlex) return @@ -2151,10 +2182,12 @@ class Lark(Serialize): assert self.options.ambiguity not in ('resolve__antiscore_sum', ), 'resolve__antiscore_sum has been replaced with the option priority="invert"' assert self.options.ambiguity in ('resolve', 'explicit', 'auto', ) - # Parse the grammar file and compose the grammars (TODO) + ## + self.grammar = load_grammar(grammar, self.source, re_module) - # Compile the EBNF grammar into BNF + ## + self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start) if self.options.edit_terminals: @@ -2163,21 +2196,27 @@ class Lark(Serialize): self._terminals_dict = {t.name: t for t in self.terminals} - # If the user asked to invert the priorities, negate them all here. - # This replaces the old 'resolve__antiscore_sum' option. + ## + + ## + if self.options.priority == 'invert': for rule in self.rules: if rule.options.priority is not None: rule.options.priority = -rule.options.priority - # Else, if the user asked to disable priorities, strip them from the - # rules. This allows the Earley parsers to skip an extra forest walk - # for improved performance, if you don't need them (or didn't specify any). 
+ ## + + ## + + ## + elif self.options.priority == None: for rule in self.rules: if rule.options.priority is not None: rule.options.priority = None - # TODO Deprecate lexer_callbacks? + ## + lexer_callbacks = (_get_lexer_callbacks(self.options.transformer, self.terminals) if self.options.transformer else {}) @@ -2191,12 +2230,13 @@ class Lark(Serialize): self.lexer = self._build_lexer() if cache_fn: - logging.debug('Saving grammar to cache: %s', cache_fn) + logger.debug('Saving grammar to cache: %s', cache_fn) with FS.open(cache_fn, 'wb') as f: self.save(f) - if __init__.__doc__: - __init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC + ## + + __doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC __serialize_fields__ = 'parser', 'rules', 'options' @@ -2214,11 +2254,13 @@ class Lark(Serialize): return self.parser_class(self.lexer_conf, parser_conf, options=self.options) def save(self, f): + #-- data, m = self.memo_serialize([TerminalDef, Rule]) pickle.dump({'data': data, 'memo': m}, f) @classmethod def load(cls, f): + #-- inst = cls.__new__(cls) return inst._load(f) @@ -2259,16 +2301,7 @@ class Lark(Serialize): @classmethod def open(cls, grammar_filename, rel_to=None, **options): - """Create an instance of Lark with the grammar given by its filename - - If rel_to is provided, the function will find the grammar filename in relation to it. - - Example: - - >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr") - Lark(...) - - """ + #-- if rel_to: basepath = os.path.dirname(rel_to) grammar_filename = os.path.join(basepath, grammar_filename) @@ -2280,7 +2313,7 @@ class Lark(Serialize): def lex(self, text): - "Only lex (and postlex) the text, without parsing it. 
Only relevant when lexer='standard'" + #-- if not hasattr(self, 'lexer'): self.lexer = self._build_lexer() stream = self.lexer.lex(text) @@ -2289,18 +2322,12 @@ class Lark(Serialize): return stream def get_terminal(self, name): - "Get information about a terminal" + #-- return self._terminals_dict[name] def parse(self, text, start=None, on_error=None): - """Parse the given text, according to the options provided. - - Parameters: - start: str - required if Lark was given multiple possible start symbols (using the start option). - on_error: function - if provided, will be called on UnexpectedToken error. Return true to resume parsing. LALR only. + #-- - Returns a tree, unless specified otherwise. - """ try: return self.parser.parse(text, start=start) except UnexpectedToken as e: @@ -2318,10 +2345,10 @@ class Lark(Serialize): DATA = ( -{'parser': {'parser': {'tokens': {0: 'RSQB', 1: 'COMMA', 2: '$END', 3: 'RBRACE', 4: 'ESCAPED_STRING', 5: 'string', 6: 'pair', 7: 'LSQB', 8: 'LBRACE', 9: 'SIGNED_NUMBER', 10: 'NULL', 11: 'FALSE', 12: 'value', 13: 'array', 14: 'object', 15: 'TRUE', 16: '__array_star_0', 17: 'COLON', 18: '__object_star_1', 19: 'start'}, 'states': {0: {0: (1, {'@': 12}), 1: (1, {'@': 12}), 2: (1, {'@': 12}), 3: (1, {'@': 12})}, 1: {0: (1, {'@': 13}), 1: (1, {'@': 13}), 2: (1, {'@': 13}), 3: (1, {'@': 13})}, 2: {1: (0, 25), 0: (0, 19)}, 3: {0: (1, {'@': 14}), 1: (1, {'@': 14}), 2: (1, {'@': 14}), 3: (1, {'@': 14})}, 4: {4: (0, 31), 5: (0, 13), 6: (0, 26)}, 5: {0: (1, {'@': 15}), 1: (1, {'@': 15}), 2: (1, {'@': 15}), 3: (1, {'@': 15})}, 6: {0: (1, {'@': 16}), 1: (1, {'@': 16}), 2: (1, {'@': 16}), 3: (1, {'@': 16})}, 7: {0: (1, {'@': 17}), 1: (1, {'@': 17}), 2: (1, {'@': 17}), 3: (1, {'@': 17})}, 8: {1: (0, 14), 3: (0, 28)}, 9: {0: (0, 21), 7: (0, 9), 8: (0, 18), 9: (0, 0), 10: (0, 1), 11: (0, 29), 5: (0, 5), 12: (0, 10), 13: (0, 7), 14: (0, 33), 4: (0, 31), 15: (0, 24)}, 10: {1: (0, 20), 16: (0, 2), 0: (0, 3)}, 11: {0: (1, {'@': 18}), 1: (1, {'@': 18})}, 12: 
{2: (1, {'@': 19})}, 13: {17: (0, 32)}, 14: {5: (0, 13), 4: (0, 31), 6: (0, 23)}, 15: {18: (0, 8), 1: (0, 4), 3: (0, 17)}, 16: {0: (1, {'@': 20}), 1: (1, {'@': 20})}, 17: {0: (1, {'@': 21}), 1: (1, {'@': 21}), 2: (1, {'@': 21}), 3: (1, {'@': 21})}, 18: {4: (0, 31), 6: (0, 15), 5: (0, 13), 3: (0, 6)}, 19: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 2: (1, {'@': 22}), 3: (1, {'@': 22})}, 20: {7: (0, 9), 8: (0, 18), 12: (0, 11), 9: (0, 0), 14: (0, 33), 10: (0, 1), 4: (0, 31), 15: (0, 24), 5: (0, 5), 11: (0, 29), 13: (0, 7)}, 21: {0: (1, {'@': 23}), 1: (1, {'@': 23}), 2: (1, {'@': 23}), 3: (1, {'@': 23})}, 22: {1: (1, {'@': 24}), 3: (1, {'@': 24})}, 23: {1: (1, {'@': 25}), 3: (1, {'@': 25})}, 24: {0: (1, {'@': 26}), 1: (1, {'@': 26}), 2: (1, {'@': 26}), 3: (1, {'@': 26})}, 25: {7: (0, 9), 12: (0, 16), 8: (0, 18), 9: (0, 0), 14: (0, 33), 10: (0, 1), 4: (0, 31), 15: (0, 24), 5: (0, 5), 11: (0, 29), 13: (0, 7)}, 26: {1: (1, {'@': 27}), 3: (1, {'@': 27})}, 27: {7: (0, 9), 8: (0, 18), 12: (0, 12), 9: (0, 0), 10: (0, 1), 11: (0, 29), 5: (0, 5), 13: (0, 7), 14: (0, 33), 4: (0, 31), 15: (0, 24), 19: (0, 30)}, 28: {0: (1, {'@': 28}), 1: (1, {'@': 28}), 2: (1, {'@': 28}), 3: (1, {'@': 28})}, 29: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 2: (1, {'@': 29}), 3: (1, {'@': 29})}, 30: {}, 31: {17: (1, {'@': 30}), 0: (1, {'@': 30}), 1: (1, {'@': 30}), 2: (1, {'@': 30}), 3: (1, {'@': 30})}, 32: {7: (0, 9), 8: (0, 18), 12: (0, 22), 9: (0, 0), 14: (0, 33), 10: (0, 1), 4: (0, 31), 15: (0, 24), 5: (0, 5), 11: (0, 29), 13: (0, 7)}, 33: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 2: (1, {'@': 31}), 3: (1, {'@': 31})}}, 'start_states': {'start': 27}, 'end_states': {'start': 30}}, 'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], 'ignore': ['WS'], 'g_regex_flags': 0, 'use_bytes': False, '__type__': 'LexerConf'}, 'start': ['start'], '__type__': 'LALR_ContextualLexer'}, 'rules': [{'@': 19}, {'@': 31}, {'@': 17}, 
{'@': 15}, {'@': 12}, {'@': 26}, {'@': 29}, {'@': 13}, {'@': 22}, {'@': 14}, {'@': 23}, {'@': 28}, {'@': 21}, {'@': 16}, {'@': 24}, {'@': 30}, {'@': 18}, {'@': 20}, {'@': 27}, {'@': 25}], 'options': {'debug': False, 'keep_all_tokens': False, 'tree_class': None, 'cache': False, 'postlex': None, 'parser': 'lalr', 'lexer': 'contextual', 'transformer': None, 'start': ['start'], 'priority': None, 'ambiguity': 'auto', 'regex': False, 'propagate_positions': False, 'lexer_callbacks': {}, 'maybe_placeholders': False, 'edit_terminals': None, 'g_regex_flags': 0, 'use_bytes': False}, '__type__': 'Lark'} +{'parser': {'parser': {'tokens': {0: 'RBRACE', 1: 'COMMA', 2: 'RSQB', 3: '$END', 4: '__object_star_1', 5: 'COLON', 6: 'LBRACE', 7: 'value', 8: 'string', 9: 'object', 10: 'TRUE', 11: 'SIGNED_NUMBER', 12: 'LSQB', 13: 'NULL', 14: 'FALSE', 15: 'array', 16: 'ESCAPED_STRING', 17: '__array_star_0', 18: 'pair', 19: 'start'}, 'states': {0: {0: (1, {'@': 12}), 1: (1, {'@': 12})}, 1: {1: (1, {'@': 13}), 2: (1, {'@': 13}), 0: (1, {'@': 13}), 3: (1, {'@': 13})}, 2: {1: (1, {'@': 14}), 2: (1, {'@': 14}), 0: (1, {'@': 14}), 3: (1, {'@': 14})}, 3: {0: (0, 25), 1: (0, 32)}, 4: {4: (0, 3), 1: (0, 27), 0: (0, 33)}, 5: {0: (1, {'@': 15}), 1: (1, {'@': 15})}, 6: {}, 7: {1: (0, 23), 2: (0, 2)}, 8: {1: (1, {'@': 16}), 2: (1, {'@': 16})}, 9: {1: (1, {'@': 17}), 2: (1, {'@': 17}), 5: (1, {'@': 17}), 0: (1, {'@': 17}), 3: (1, {'@': 17})}, 10: {1: (1, {'@': 18}), 2: (1, {'@': 18}), 0: (1, {'@': 18}), 3: (1, {'@': 18})}, 11: {1: (1, {'@': 19}), 2: (1, {'@': 19}), 0: (1, {'@': 19}), 3: (1, {'@': 19})}, 12: {1: (1, {'@': 20}), 2: (1, {'@': 20}), 0: (1, {'@': 20}), 3: (1, {'@': 20})}, 13: {5: (0, 22)}, 14: {6: (0, 21), 7: (0, 29), 8: (0, 12), 9: (0, 1), 10: (0, 16), 11: (0, 11), 12: (0, 26), 13: (0, 30), 14: (0, 15), 15: (0, 10), 16: (0, 9)}, 15: {1: (1, {'@': 21}), 2: (1, {'@': 21}), 0: (1, {'@': 21}), 3: (1, {'@': 21})}, 16: {1: (1, {'@': 22}), 2: (1, {'@': 22}), 0: (1, {'@': 22}), 3: (1, {'@': 22})}, 17: 
{1: (1, {'@': 23}), 2: (1, {'@': 23}), 0: (1, {'@': 23}), 3: (1, {'@': 23})}, 18: {2: (0, 24), 1: (0, 14), 17: (0, 7)}, 19: {1: (1, {'@': 24}), 2: (1, {'@': 24}), 0: (1, {'@': 24}), 3: (1, {'@': 24})}, 20: {0: (1, {'@': 25}), 1: (1, {'@': 25})}, 21: {8: (0, 13), 18: (0, 4), 16: (0, 9), 0: (0, 19)}, 22: {6: (0, 21), 8: (0, 12), 9: (0, 1), 10: (0, 16), 11: (0, 11), 12: (0, 26), 13: (0, 30), 14: (0, 15), 15: (0, 10), 7: (0, 20), 16: (0, 9)}, 23: {6: (0, 21), 7: (0, 8), 9: (0, 1), 8: (0, 12), 10: (0, 16), 11: (0, 11), 12: (0, 26), 13: (0, 30), 14: (0, 15), 15: (0, 10), 16: (0, 9)}, 24: {1: (1, {'@': 26}), 2: (1, {'@': 26}), 0: (1, {'@': 26}), 3: (1, {'@': 26})}, 25: {1: (1, {'@': 27}), 2: (1, {'@': 27}), 0: (1, {'@': 27}), 3: (1, {'@': 27})}, 26: {6: (0, 21), 10: (0, 16), 12: (0, 26), 13: (0, 30), 14: (0, 15), 7: (0, 18), 8: (0, 12), 16: (0, 9), 9: (0, 1), 11: (0, 11), 15: (0, 10), 2: (0, 17)}, 27: {8: (0, 13), 18: (0, 0), 16: (0, 9)}, 28: {6: (0, 21), 10: (0, 16), 12: (0, 26), 13: (0, 30), 8: (0, 12), 16: (0, 9), 19: (0, 6), 9: (0, 1), 11: (0, 11), 7: (0, 31), 15: (0, 10), 14: (0, 15)}, 29: {1: (1, {'@': 28}), 2: (1, {'@': 28})}, 30: {1: (1, {'@': 29}), 2: (1, {'@': 29}), 0: (1, {'@': 29}), 3: (1, {'@': 29})}, 31: {3: (1, {'@': 30})}, 32: {18: (0, 5), 8: (0, 13), 16: (0, 9)}, 33: {1: (1, {'@': 31}), 2: (1, {'@': 31}), 0: (1, {'@': 31}), 3: (1, {'@': 31})}}, 'start_states': {'start': 28}, 'end_states': {'start': 6}}, 'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], 'ignore': ['WS'], 'g_regex_flags': 0, 'use_bytes': False, '__type__': 'LexerConf'}, 'start': ['start'], '__type__': 'LALR_ContextualLexer'}, 'rules': [{'@': 30}, {'@': 13}, {'@': 18}, {'@': 20}, {'@': 19}, {'@': 22}, {'@': 21}, {'@': 29}, {'@': 14}, {'@': 26}, {'@': 23}, {'@': 27}, {'@': 31}, {'@': 24}, {'@': 25}, {'@': 17}, {'@': 28}, {'@': 16}, {'@': 12}, {'@': 15}], 'options': {'debug': False, 
'keep_all_tokens': False, 'tree_class': None, 'cache': False, 'postlex': None, 'parser': 'lalr', 'lexer': 'contextual', 'transformer': None, 'start': ['start'], 'priority': None, 'ambiguity': 'auto', 'regex': False, 'propagate_positions': False, 'lexer_callbacks': {}, 'maybe_placeholders': False, 'edit_terminals': None, 'g_regex_flags': 0, 'use_bytes': False}, '__type__': 'Lark'} ) MEMO = ( -{0: {'name': 'ESCAPED_STRING', 'pattern': {'value': '".*?(? last_lineno: + last_col = 0 + if scol > last_col: + res.append(" " * (scol - last_col)) + if toktype == token.STRING and prev_toktype == token.INDENT: + # Docstring + res.append("#--") + elif toktype == tokenize.COMMENT: + # Comment + res.append("##\n") + else: + res.append(ttext) + prev_toktype = toktype + last_col = ecol + last_lineno = elineno + + return ''.join(res) + + def main(fobj, start): lark_inst = Lark(fobj, parser="lalr", lexer="contextual", start=start) @@ -91,9 +123,12 @@ def main(fobj, start): print('__version__ = "%s"' % lark.__version__) print() - for pyfile in EXTRACT_STANDALONE_FILES: + for i, pyfile in enumerate(EXTRACT_STANDALONE_FILES): with open(os.path.join(_larkdir, pyfile)) as f: - print (extract_sections(f)['standalone']) + code = extract_sections(f)['standalone'] + if i: # if not this file + code = strip_docstrings(iter(code.splitlines(True)).__next__) + print(code) data, m = lark_inst.memo_serialize([TerminalDef, Rule]) print( 'DATA = (' ) From eae1693c768eabaca45e02e216ac8c55bcff46a9 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 16 Aug 2020 20:56:25 +0300 Subject: [PATCH 130/164] Docs: Moved some examples away from docstrings --- docs/visitors.rst | 40 ++++++++++++++++++++++++++++++++++++++++ lark/visitors.py | 38 -------------------------------------- 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/docs/visitors.rst b/docs/visitors.rst index a734e3b..aa3189e 100644 --- a/docs/visitors.rst +++ b/docs/visitors.rst @@ -45,12 +45,52 @@ Interpreter .. 
autoclass:: lark.visitors.Interpreter + +Example: + :: + + class IncreaseSomeOfTheNumbers(Interpreter): + def number(self, tree): + tree.children[0] += 1 + + def skip(self, tree): + # skip this subtree. don't change any number node inside it. + pass + + IncreaseSomeOfTheNumbers().visit(parse_tree) + Transformer ----------- .. autoclass:: lark.visitors.Transformer :members: __default__, __default_token__ +Example: + :: + + from lark import Tree, Transformer + + class EvalExpressions(Transformer): + def expr(self, args): + return eval(args[0]) + + t = Tree('a', [Tree('expr', ['1+2'])]) + print(EvalExpressions().transform( t )) + + # Prints: Tree(a, [3]) + +Example: + :: + + class T(Transformer): + INT = int + NUMBER = float + def NAME(self, name): + return lookup_dict.get(name, name) + + T(visit_tokens=True).transform(tree) + + v_args ------ diff --git a/lark/visitors.py b/lark/visitors.py index 6ea39b0..14896e5 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -63,20 +63,6 @@ class Transformer(_Decoratable): - ``Transformer_InPlace`` - Non-recursive. Changes the tree in-place instead of returning new instances - ``Transformer_InPlaceRecursive`` - Recursive. Changes the tree in-place instead of returning new instances - Example: - :: - - from lark import Tree, Transformer - - class EvalExpressions(Transformer): - def expr(self, args): - return eval(args[0]) - - t = Tree('a', [Tree('expr', ['1+2'])]) - print(EvalExpressions().transform( t )) - - # Prints: Tree(a, [3]) - Parameters: visit_tokens: By default, transformers only visit rules. visit_tokens=True will tell ``Transformer`` to visit tokens @@ -84,17 +70,6 @@ class Transformer(_Decoratable): but it's easier to maintain and works for all algorithms (even when there isn't a lexer). 
- Example: - :: - - class T(Transformer): - INT = int - NUMBER = float - def NAME(self, name): - return lookup_dict.get(name, name) - - T(visit_tokens=True).transform(tree) - """ __visit_tokens__ = True # For backwards compatibility @@ -328,19 +303,6 @@ class Interpreter(_Decoratable): Unlike ``Transformer`` and ``Visitor``, the Interpreter doesn't automatically visit its sub-branches. The user has to explicitly call ``visit``, ``visit_children``, or use the ``@visit_children_decor``. This allows the user to implement branching and loops. - - Example: - :: - - class IncreaseSomeOfTheNumbers(Interpreter): - def number(self, tree): - tree.children[0] += 1 - - def skip(self, tree): - # skip this subtree. don't change any number node inside it. - pass - - IncreaseSomeOfTheNumbers().visit(parse_tree) """ def visit(self, tree): From b601525798de97b42f89d3b979666a5b5088f33c Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 16 Aug 2020 22:35:46 +0300 Subject: [PATCH 131/164] Added logger.debug() prints for unused rules and terminals (#658) --- lark/load_grammar.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 0ee546c..7e83e59 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -5,7 +5,7 @@ import sys from copy import copy, deepcopy from io import open -from .utils import bfs, eval_escaping, Py36 +from .utils import bfs, eval_escaping, Py36, logger, classify_bool from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder @@ -631,7 +631,9 @@ class Grammar: if isinstance(s, NonTerminal) and s != r.origin} used_rules |= {NonTerminal(s) for s in start} - compiled_rules = [r for r in compiled_rules if r.origin in used_rules] + compiled_rules, unused = classify_bool(compiled_rules, lambda r: r.origin in used_rules) + for r in unused: + logger.debug("Unused rule: %s", r) if len(compiled_rules) == c: break @@ -639,7 +641,9 @@ class Grammar: 
used_terms = {t.name for r in compiled_rules for t in r.expansion if isinstance(t, Terminal)} - terminals = [t for t in terminals if t.name in used_terms or t.name in self.ignore] + terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore) + if unused: + logger.debug("Unused terminals: %s", [t.name for t in unused]) return terminals, compiled_rules, self.ignore From e6edc109b72ec19e3581548b922d3add1d2c3ea5 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Mon, 17 Aug 2020 17:13:26 +0530 Subject: [PATCH 132/164] options part of lark.Lark's docs --- docs/classes.rst | 4 ---- lark/lark.py | 10 +++++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/docs/classes.rst b/docs/classes.rst index 63f9aef..381c3b2 100644 --- a/docs/classes.rst +++ b/docs/classes.rst @@ -7,10 +7,6 @@ Lark .. autoclass:: lark.Lark :members: open, parse, save, load -LarkOptions ------------ - -.. autoclass:: lark.lark.LarkOptions Tree ---- diff --git a/lark/lark.py b/lark/lark.py index abc87d1..add0ae6 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -27,7 +27,7 @@ class LarkOptions(Serialize): """ OPTIONS_DOC = """ - **General** + **General Options** start The start symbol. Either a string, or a list of strings for @@ -60,7 +60,7 @@ class LarkOptions(Serialize): keep_all_tokens Prevent the tree builder from automagically removing "punctuation" tokens (default: False) - **Algorithm** + **Algorithm Options** parser Decides which parser engine to use. Accepts "earley" or "lalr". @@ -83,7 +83,7 @@ class LarkOptions(Serialize): - "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). 
- **Domain Specific** + **Domain Specific Options** postlex Lexer post-processing (Default: None) Only works with the @@ -317,8 +317,8 @@ class Lark(Serialize): self.save(f) # TODO: merge with above - if __init__.__doc__: - __init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC + if __doc__: + __doc__ += "\n\n" + LarkOptions.OPTIONS_DOC __serialize_fields__ = 'parser', 'rules', 'options' From a5495189d9d7db7fa3277e4e321ce14abc8fb024 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Mon, 17 Aug 2020 17:38:01 +0530 Subject: [PATCH 133/164] convert part of md to rst --- docs/classes.rst | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/docs/classes.rst b/docs/classes.rst index 59a6f0b..a880a97 100644 --- a/docs/classes.rst +++ b/docs/classes.rst @@ -7,29 +7,33 @@ Lark .. autoclass:: lark.Lark :members: open, parse, save, load -**Using Unicode character classes with regex** -Python's builtin `re` module has a few persistent known bugs and also won't parse -advanced regex features such as character classes. -With `pip install lark-parser[regex]`, the `regex` module will be installed alongside `lark` and can act as a drop-in replacement to `re`. +Using Unicode character classes with ``regex`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Any instance of `Lark` instantiated with `regex=True` will now use the `regex` module instead of `re`. +Python’s builtin ``re`` module has a few persistent known bugs and also +won’t parse advanced regex features such as character classes. With +``pip install lark-parser[regex]``, the ``regex`` module will be +installed alongside ``lark`` and can act as a drop-in replacement to +``re``. -For example, we can now use character classes to match PEP-3131 compliant Python identifiers. +Any instance of ``Lark`` instantiated with ``regex=True`` will now use +the ``regex`` module instead of ``re``. 
For example, we can now use +character classes to match PEP-3131 compliant Python identifiers. -Example: - :: +:: - from lark import Lark - >>> g = Lark(r""" - ?start: NAME - NAME: ID_START ID_CONTINUE* - ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ - ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/ - """, regex=True) + from lark import Lark + >>> g = Lark(r""" + ?start: NAME + NAME: ID_START ID_CONTINUE* + ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ + ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/ + """, regex=True) + + >>> g.parse('வணக்கம்') + 'வணக்கம்' - >>> g.parse('வணக்கம்') - 'வணக்கம்' Tree ---- @@ -44,7 +48,7 @@ Token .. autoclass:: lark.Token Transformer, Visitor & Interpreter ---------------------------------- +---------------------------------- See :doc:`visitors`. From 2af0ac2087f66b3f1ca7876f8fec286b43b458c1 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 17 Aug 2020 15:21:09 +0300 Subject: [PATCH 134/164] Docs: Adjustments --- docs/classes.rst | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/classes.rst b/docs/classes.rst index a880a97..cf72189 100644 --- a/docs/classes.rst +++ b/docs/classes.rst @@ -11,15 +11,14 @@ Lark Using Unicode character classes with ``regex`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Python’s builtin ``re`` module has a few persistent known bugs and also -won’t parse advanced regex features such as character classes. With -``pip install lark-parser[regex]``, the ``regex`` module will be -installed alongside ``lark`` and can act as a drop-in replacement to -``re``. - -Any instance of ``Lark`` instantiated with ``regex=True`` will now use -the ``regex`` module instead of ``re``. For example, we can now use -character classes to match PEP-3131 compliant Python identifiers. +Python's builtin ``re`` module has a few persistent known bugs and also won't parse +advanced regex features such as character classes. 
+With ``pip install lark-parser[regex]``, the ``regex`` module will be +installed alongside lark and can act as a drop-in replacement to ``re``. + +Any instance of Lark instantiated with ``regex=True`` will use the ``regex`` module instead of ``re``. + +For example, we can use character classes to match PEP-3131 compliant Python identifiers: :: From a768506945a56ed1503cba230fb79d7beaa4dacf Mon Sep 17 00:00:00 2001 From: julienmalard Date: Mon, 17 Aug 2020 08:23:44 -0400 Subject: [PATCH 135/164] Erez's idea --- lark/reconstruct.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lark/reconstruct.py b/lark/reconstruct.py index dfdaae1..1091681 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -198,10 +198,9 @@ class Reconstructor: yield item def reconstruct(self, tree, postproc=None): - if postproc is None: - x = self._reconstruct(tree) - else: - x = postproc(self._reconstruct(tree)) + x = self._reconstruct(tree) + if postproc: + x = postproc(x) y = [] prev_item = '' for item in x: From f86c4cf14d8ff55cbdfd43da06fc4534c4295551 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 17 Aug 2020 14:35:03 +0200 Subject: [PATCH 136/164] Fixed Failing tests --- lark/tools/standalone.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 4753894..a1d920a 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -35,6 +35,7 @@ import os from pprint import pprint from os import path from collections import defaultdict +from functools import partial import lark from lark import Lark @@ -127,7 +128,7 @@ def main(fobj, start): with open(os.path.join(_larkdir, pyfile)) as f: code = extract_sections(f)['standalone'] if i: # if not this file - code = strip_docstrings(iter(code.splitlines(True)).__next__) + code = strip_docstrings(partial(next, iter(code.splitlines(True)))) print(code) data, m = lark_inst.memo_serialize([TerminalDef, Rule]) From 
023709f7104166e790d8dd0c7e88d8070cc8e4e9 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 17 Aug 2020 16:40:12 +0300 Subject: [PATCH 137/164] Added comment --- lark/reconstruct.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lark/reconstruct.py b/lark/reconstruct.py index 1091681..35e5994 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -96,6 +96,7 @@ def make_recons_rule_to_term(origin, term): return make_recons_rule(origin, [Terminal(term.name)], [term]) def _isalnum(x): + # Categories defined here: https://www.python.org/dev/peps/pep-3131/ return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'] class Reconstructor: From b076efadffadd6beeaadaa53c26c7875a378b045 Mon Sep 17 00:00:00 2001 From: Chanic Panic Date: Sat, 15 Aug 2020 21:18:09 -0700 Subject: [PATCH 138/164] Create CompleteForestToAmbiguousTreeVisitor --- lark/parsers/earley.py | 4 +- lark/parsers/earley_forest.py | 69 +++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 098639d..bcb568f 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -18,7 +18,7 @@ from ..utils import logger from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal from .earley_common import Item, TransitiveItem -from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor +from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, CompleteForestToAmbiguousTreeVisitor class Parser: def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, debug=False): @@ -313,7 +313,7 @@ class Parser: assert False, 'Earley should not generate multiple start symbol items!' # Perform our SPPF -> AST conversion using the right ForestVisitor. 
- forest_tree_visitor_cls = ForestToTreeVisitor if self.resolve_ambiguity else ForestToAmbiguousTreeVisitor + forest_tree_visitor_cls = ForestToTreeVisitor if self.resolve_ambiguity else CompleteForestToAmbiguousTreeVisitor forest_tree_visitor = forest_tree_visitor_cls(self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor()) return forest_tree_visitor.visit(solutions[0]) diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py index 4ed75d9..b39d02e 100644 --- a/lark/parsers/earley_forest.py +++ b/lark/parsers/earley_forest.py @@ -363,6 +363,75 @@ class ForestToAmbiguousTreeVisitor(ForestToTreeVisitor): else: self.result = result +class CompleteForestToAmbiguousTreeVisitor(ForestToTreeVisitor): + """ + An augmented version of ForestToAmbiguousTreeVisitor that is designed to + handle ambiguous intermediate nodes as well as ambiguous symbol nodes. + + On the way down: + + - When an ambiguous intermediate node is encountered, an '_iambig' node + is inserted into the tree. + - Each possible derivation of an ambiguous intermediate node is represented + by an '_inter' node added as a child of the corresponding '_iambig' node. + + On the way up, these nodes are propagated up the tree and collapsed + into a single '_ambig' node for the nearest symbol node ancestor. + This is achieved by the AmbiguousIntermediateExpander contained in + the callbacks. 
+ """ + + def _collapse_ambig(self, children): + new_children = [] + for child in children: + if child.data == '_ambig': + new_children += child.children + else: + new_children.append(child) + return new_children + + def visit_token_node(self, node): + self.output_stack[-1].children.append(node) + + def visit_symbol_node_in(self, node): + if node.is_ambiguous: + if self.forest_sum_visitor and isinf(node.priority): + self.forest_sum_visitor.visit(node) + if node.is_intermediate: + self.output_stack.append(Tree('_iambig', [])) + else: + self.output_stack.append(Tree('_ambig', [])) + return iter(node.children) + + def visit_symbol_node_out(self, node): + if node.is_ambiguous: + result = self.output_stack.pop() + if not node.is_intermediate: + result = Tree('_ambig', self._collapse_ambig(result.children)) + if self.output_stack: + self.output_stack[-1].children.append(result) + else: + self.result = result + + def visit_packed_node_in(self, node): + if not node.parent.is_intermediate: + self.output_stack.append(Tree('drv', [])) + elif node.parent.is_ambiguous: + self.output_stack.append(Tree('_inter', [])) + return iter([node.left, node.right]) + + def visit_packed_node_out(self, node): + if not node.parent.is_intermediate: + result = self.callbacks[node.rule](self.output_stack.pop().children) + elif node.parent.is_ambiguous: + result = self.output_stack.pop() + else: + return + if self.output_stack: + self.output_stack[-1].children.append(result) + else: + self.result = result + class ForestToPyDotVisitor(ForestVisitor): """ A Forest visitor which writes the SPPF to a PNG. 
From e459f28743077b8a3209aad6f45a0a50f48041d6 Mon Sep 17 00:00:00 2001 From: Chanic Panic Date: Sat, 15 Aug 2020 21:34:11 -0700 Subject: [PATCH 139/164] Add AmbiguousIntermediateExpander --- lark/parse_tree_builder.py | 81 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 5a7c5d7..8b81d29 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -195,6 +195,86 @@ def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens): if to_expand: return partial(AmbiguousExpander, to_expand, tree_class) +class AmbiguousIntermediateExpander: + """ + Propagate ambiguous intermediate nodes and their derivations up to the + current rule. + + In general, converts + + rule + _iambig + _inter + someChildren1 + ... + _inter + someChildren2 + ... + someChildren3 + ... + + to + + _ambig + rule + someChildren1 + ... + someChildren3 + ... + rule + someChildren2 + ... + someChildren3 + ... + rule + childrenFromNestedIambigs + ... + someChildren3 + ... + ... + + propagating up any nested '_iambig' nodes along the way. + """ + + def __init__(self, tree_class, node_builder): + self.node_builder = node_builder + self.tree_class = tree_class + + def __call__(self, children): + def _is_iambig_tree(child): + return hasattr(child, 'data') and child.data == '_iambig' + + def _collapse_iambig(children): + """ + Recursively flatten the derivations of the parent of an '_iambig' + node. Returns a list of '_inter' nodes guaranteed not + to contain any nested '_iambig' nodes, or None if children does + not contain an '_iambig' node. 
+ """ + + # Due to the structure of the SPPF, + # an '_iambig' node can only appear as the first child + if children and _is_iambig_tree(children[0]): + iambig_node = children[0] + result = [] + for grandchild in iambig_node.children: + collapsed = _collapse_iambig(grandchild.children) + if collapsed: + for child in collapsed: + child.children += children[1:] + result += collapsed + else: + new_tree = self.tree_class('_inter', grandchild.children + children[1:]) + result.append(new_tree) + return result + + collapsed = _collapse_iambig(children) + if collapsed: + processed_nodes = [self.node_builder(c.children) for c in collapsed] + return self.tree_class('_ambig', processed_nodes) + + return self.node_builder(children) + def ptb_inline_args(func): @wraps(func) def f(children): @@ -239,6 +319,7 @@ class ParseTreeBuilder: maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders else None), self.propagate_positions and PropagatePositions, self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens), + self.ambiguous and partial(AmbiguousIntermediateExpander, self.tree_class) ])) yield rule, wrapper_chain From f89b1549dd2938266e50f34e4debc7790f4ce3f7 Mon Sep 17 00:00:00 2001 From: Chanic Panic Date: Sat, 15 Aug 2020 21:52:43 -0700 Subject: [PATCH 140/164] Write tests for ambiguous intermediate nodes in the SPPF --- tests/test_parser.py | 215 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) diff --git a/tests/test_parser.py b/tests/test_parser.py index 2f6a15e..83336c5 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -460,6 +460,221 @@ def _make_full_earley_test(LEXER): ]) self.assertEqual(res, expected) + def test_ambiguous_intermediate_node(self): + grammar = """ + start: ab bc d? + !ab: "A" "B"? + !bc: "B"? 
"C" + !d: "D" + """ + + l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER) + ambig_tree = l.parse("ABCD") + expected = { + Tree('start', [Tree('ab', ['A']), Tree('bc', ['B', 'C']), Tree('d', ['D'])]), + Tree('start', [Tree('ab', ['A', 'B']), Tree('bc', ['C']), Tree('d', ['D'])]) + } + self.assertEqual(ambig_tree.data, '_ambig') + self.assertEqual(set(ambig_tree.children), expected) + + def test_ambiguous_symbol_and_intermediate_nodes(self): + grammar = """ + start: ab bc cd + !ab: "A" "B"? + !bc: "B"? "C"? + !cd: "C"? "D" + """ + + l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER) + ambig_tree = l.parse("ABCD") + expected = { + Tree('start', [ + Tree('ab', ['A', 'B']), + Tree('bc', ['C']), + Tree('cd', ['D']) + ]), + Tree('start', [ + Tree('ab', ['A', 'B']), + Tree('bc', []), + Tree('cd', ['C', 'D']) + ]), + Tree('start', [ + Tree('ab', ['A']), + Tree('bc', ['B', 'C']), + Tree('cd', ['D']) + ]), + Tree('start', [ + Tree('ab', ['A']), + Tree('bc', ['B']), + Tree('cd', ['C', 'D']) + ]), + } + self.assertEqual(ambig_tree.data, '_ambig') + self.assertEqual(set(ambig_tree.children), expected) + + def test_nested_ambiguous_intermediate_nodes(self): + grammar = """ + start: ab bc cd e? + !ab: "A" "B"? + !bc: "B"? "C"? + !cd: "C"? 
"D" + !e: "E" + """ + + l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER) + ambig_tree = l.parse("ABCDE") + expected = { + Tree('start', [ + Tree('ab', ['A', 'B']), + Tree('bc', ['C']), + Tree('cd', ['D']), + Tree('e', ['E']) + ]), + Tree('start', [ + Tree('ab', ['A']), + Tree('bc', ['B', 'C']), + Tree('cd', ['D']), + Tree('e', ['E']) + ]), + Tree('start', [ + Tree('ab', ['A']), + Tree('bc', ['B']), + Tree('cd', ['C', 'D']), + Tree('e', ['E']) + ]), + Tree('start', [ + Tree('ab', ['A', 'B']), + Tree('bc', []), + Tree('cd', ['C', 'D']), + Tree('e', ['E']) + ]), + } + self.assertEqual(ambig_tree.data, '_ambig') + self.assertEqual(set(ambig_tree.children), expected) + + def test_nested_ambiguous_intermediate_nodes2(self): + grammar = """ + start: ab bc cd de f + !ab: "A" "B"? + !bc: "B"? "C"? + !cd: "C"? "D"? + !de: "D"? "E" + !f: "F" + """ + + l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER) + ambig_tree = l.parse("ABCDEF") + expected = { + Tree('start', [ + Tree('ab', ['A', 'B']), + Tree('bc', ['C']), + Tree('cd', ['D']), + Tree('de', ['E']), + Tree('f', ['F']), + ]), + Tree('start', [ + Tree('ab', ['A']), + Tree('bc', ['B', 'C']), + Tree('cd', ['D']), + Tree('de', ['E']), + Tree('f', ['F']), + ]), + Tree('start', [ + Tree('ab', ['A']), + Tree('bc', ['B']), + Tree('cd', ['C', 'D']), + Tree('de', ['E']), + Tree('f', ['F']), + ]), + Tree('start', [ + Tree('ab', ['A']), + Tree('bc', ['B']), + Tree('cd', ['C']), + Tree('de', ['D', 'E']), + Tree('f', ['F']), + ]), + Tree('start', [ + Tree('ab', ['A', "B"]), + Tree('bc', []), + Tree('cd', ['C']), + Tree('de', ['D', 'E']), + Tree('f', ['F']), + ]), + Tree('start', [ + Tree('ab', ['A']), + Tree('bc', ['B', 'C']), + Tree('cd', []), + Tree('de', ['D', 'E']), + Tree('f', ['F']), + ]), + Tree('start', [ + Tree('ab', ['A', 'B']), + Tree('bc', []), + Tree('cd', ['C', 'D']), + Tree('de', ['E']), + Tree('f', ['F']), + ]), + Tree('start', [ + Tree('ab', ['A', 'B']), + Tree('bc', ['C']), + 
Tree('cd', []), + Tree('de', ['D', 'E']), + Tree('f', ['F']), + ]), + } + self.assertEqual(ambig_tree.data, '_ambig') + self.assertEqual(set(ambig_tree.children), expected) + + def test_ambiguous_intermediate_node_unnamed_token(self): + grammar = """ + start: ab bc "D" + !ab: "A" "B"? + !bc: "B"? "C" + """ + + l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER) + ambig_tree = l.parse("ABCD") + expected = { + Tree('start', [Tree('ab', ['A']), Tree('bc', ['B', 'C'])]), + Tree('start', [Tree('ab', ['A', 'B']), Tree('bc', ['C'])]) + } + self.assertEqual(ambig_tree.data, '_ambig') + self.assertEqual(set(ambig_tree.children), expected) + + def test_ambiguous_intermediate_node_inlined_rule(self): + grammar = """ + start: ab _bc d? + !ab: "A" "B"? + _bc: "B"? "C" + !d: "D" + """ + + l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER) + ambig_tree = l.parse("ABCD") + expected = { + Tree('start', [Tree('ab', ['A']), Tree('d', ['D'])]), + Tree('start', [Tree('ab', ['A', 'B']), Tree('d', ['D'])]) + } + self.assertEqual(ambig_tree.data, '_ambig') + self.assertEqual(set(ambig_tree.children), expected) + + def test_ambiguous_intermediate_node_conditionally_inlined_rule(self): + grammar = """ + start: ab bc d? + !ab: "A" "B"? + !?bc: "B"? 
"C" + !d: "D" + """ + + l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER) + ambig_tree = l.parse("ABCD") + expected = { + Tree('start', [Tree('ab', ['A']), Tree('bc', ['B', 'C']), Tree('d', ['D'])]), + Tree('start', [Tree('ab', ['A', 'B']), 'C', Tree('d', ['D'])]) + } + self.assertEqual(ambig_tree.data, '_ambig') + self.assertEqual(set(ambig_tree.children), expected) + def test_fruitflies_ambig(self): grammar = """ start: noun verb noun -> simple From 298e7cfce64cbb524cd643715dc445d2d71bd711 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Tue, 18 Aug 2020 12:34:41 +0530 Subject: [PATCH 141/164] add end options in docstring --- lark/lark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lark/lark.py b/lark/lark.py index 835d28e..d82e723 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -88,6 +88,8 @@ class LarkOptions(Serialize): Accept an input of type ``bytes`` instead of ``str`` (Python 3 only). edit_terminals A callback for editing the terminals before parse. 
+ + **=== End Options ===** """ if __doc__: __doc__ += OPTIONS_DOC @@ -306,7 +308,6 @@ class Lark(Serialize): with FS.open(cache_fn, 'wb') as f: self.save(f) - # TODO: merge with above __doc__ += "\n\n" + LarkOptions.OPTIONS_DOC __serialize_fields__ = 'parser', 'rules', 'options' From cb8a5498966c12503e192d752f4e55f470adbe1b Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Wed, 19 Aug 2020 09:06:37 +0530 Subject: [PATCH 142/164] inital sphinx gallery --- docs/conf.py | 1 + docs/examples | 1 + docs/index.rst | 1 + examples/README.md | 34 ---------------- examples/README.rst | 66 ++++++++++++++++++++++++++++++++ examples/calc.py | 10 +++-- examples/conf_earley.py | 26 +++++++------ examples/conf_lalr.py | 30 ++++++++------- examples/custom_lexer.py | 17 ++++---- examples/error_puppet.py | 19 +++++---- examples/error_reporting_lalr.py | 9 +++-- examples/fruitflies.py | 11 ++++-- examples/indented_tree.py | 21 +++++----- examples/json_parser.py | 14 ++++--- examples/lark_grammar.py | 6 +++ examples/python_bytecode.py | 20 ++++++---- examples/python_parser.py | 10 +++-- examples/qscintilla_json.py | 18 +++++---- examples/reconstruct_json.py | 16 +++++--- examples/templates.py | 9 +++-- examples/turtle_dsl.py | 7 +++- 21 files changed, 218 insertions(+), 128 deletions(-) create mode 120000 docs/examples delete mode 100644 examples/README.md create mode 100644 examples/README.rst diff --git a/docs/conf.py b/docs/conf.py index 887eeb2..e41cf0b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -37,6 +37,7 @@ extensions = [ 'sphinx.ext.napoleon', 'sphinx.ext.coverage', 'recommonmark', + 'sphinx_gallery.gen_gallery' ] # Add any paths that contain templates here, relative to this directory. 
diff --git a/docs/examples b/docs/examples new file mode 120000 index 0000000..785887f --- /dev/null +++ b/docs/examples @@ -0,0 +1 @@ +../examples/ \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index ba2c241..0ddadb3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,6 +24,7 @@ Welcome to Lark's documentation! how_to_use how_to_develop recipes + auto_examples/index .. toctree:: diff --git a/examples/README.md b/examples/README.md deleted file mode 100644 index 8053ebd..0000000 --- a/examples/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# Examples for Lark - -#### How to run the examples - -After cloning the repo, open the terminal into the root directory of the project, and run the following: - -```bash -[lark]$ python -m examples. -``` - -For example, the following will parse all the Python files in the standard library of your local installation: - -```bash -[lark]$ python -m examples.python_parser -``` - -### Beginners - -- [calc.py](calc.py) - A simple example of a REPL calculator -- [json\_parser.py](json_parser.py) - A simple JSON parser (comes with a tutorial, see docs) -- [indented\_tree.py](indented\_tree.py) - A demonstration of parsing indentation ("whitespace significant" language) -- [fruitflies.py](fruitflies.py) - A demonstration of ambiguity -- [turtle\_dsl.py](turtle_dsl.py) - Implements a LOGO-like toy language for Python's turtle, with interpreter. -- [lark\_grammar.py](lark_grammar.py) + [lark.lark](lark.lark) - A reference implementation of the Lark grammar (using LALR(1) + standard lexer) - -### Advanced - -- [error\_reporting\_lalr.py](error_reporting_lalr.py) - A demonstration of example-driven error reporting with the LALR parser -- [python\_parser.py](python_parser.py) - A fully-working Python 2 & 3 parser (but not production ready yet!) 
-- [python\_bytecode.py](python_bytecode.py) - A toy example showing how to compile Python directly to bytecode -- [conf\_lalr.py](conf_lalr.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language -- [conf\_earley.py](conf_earley.py) - Demonstrates the power of Earley's dynamic lexer on a toy configuration language -- [custom\_lexer.py](custom_lexer.py) - Demonstrates using a custom lexer to parse a non-textual stream of data -- [reconstruct\_json.py](reconstruct_json.py) - Demonstrates the experimental text-reconstruction feature diff --git a/examples/README.rst b/examples/README.rst new file mode 100644 index 0000000..977c318 --- /dev/null +++ b/examples/README.rst @@ -0,0 +1,66 @@ +Examples for Lark +================= + +How to run the examples +^^^^^^^^^^^^^^^^^^^^^^^ + +After cloning the repo, open the terminal into the root directory of the +project, and run the following: + +.. code:: bash + + [lark]$ python -m examples. + +For example, the following will parse all the Python files in the +standard library of your local installation: + +.. code:: bash + + [lark]$ python -m examples.python_parser + +Beginners +~~~~~~~~~ + +- `calc.py`_ - A simple example of a REPL calculator +- `json_parser.py`_ - A simple JSON parser (comes with a tutorial, see + docs) +- `indented_tree.py`_ - A demonstration of parsing indentation + (“whitespace significant” language) +- `fruitflies.py`_ - A demonstration of ambiguity +- `turtle_dsl.py`_ - Implements a LOGO-like toy language for Python’s + turtle, with interpreter. +- `lark_grammar.py`_ + `lark.lark`_ - A reference implementation of the + Lark grammar (using LALR(1) + standard lexer) + +Advanced +~~~~~~~~ + +- `error_reporting_lalr.py`_ - A demonstration of example-driven error + reporting with the LALR parser +- `python_parser.py`_ - A fully-working Python 2 & 3 parser (but not + production ready yet!) 
+- `python_bytecode.py`_ - A toy example showing how to compile Python + directly to bytecode +- `conf_lalr.py`_ - Demonstrates the power of LALR’s contextual lexer + on a toy configuration language +- `conf_earley.py`_ - Demonstrates the power of Earley’s dynamic lexer + on a toy configuration language +- `custom_lexer.py`_ - Demonstrates using a custom lexer to parse a + non-textual stream of data +- `reconstruct_json.py`_ - Demonstrates the experimental + text-reconstruction feature + +.. _calc.py: calc.py +.. _json_parser.py: json_parser.py +.. _indented_tree.py: indented_tree.py +.. _fruitflies.py: fruitflies.py +.. _turtle_dsl.py: turtle_dsl.py +.. _lark_grammar.py: lark_grammar.py +.. _lark.lark: lark.lark +.. _error_reporting_lalr.py: error_reporting_lalr.py +.. _python_parser.py: python_parser.py +.. _python_bytecode.py: python_bytecode.py +.. _conf_lalr.py: conf_lalr.py +.. _conf_earley.py: conf_earley.py +.. _custom_lexer.py: custom_lexer.py +.. _reconstruct_json.py: reconstruct_json.py \ No newline at end of file diff --git a/examples/calc.py b/examples/calc.py index c4470ef..cccee9e 100644 --- a/examples/calc.py +++ b/examples/calc.py @@ -1,7 +1,11 @@ -# -# This example shows how to write a basic calculator with variables. -# +""" +Basic calculator +================ + +A simple example of a REPL calculator +This example shows how to write a basic calculator with variables. +""" from lark import Lark, Transformer, v_args diff --git a/examples/conf_earley.py b/examples/conf_earley.py index 13b6c8d..b21c1ac 100644 --- a/examples/conf_earley.py +++ b/examples/conf_earley.py @@ -1,17 +1,19 @@ -# -# This example demonstrates parsing using the dynamic-lexer earley frontend -# -# Using a lexer for configuration files is tricky, because values don't -# have to be surrounded by delimiters. Using a standard lexer for this just won't work. -# -# In this example we use a dynamic lexer and let the Earley parser resolve the ambiguity. 
-# -# Another approach is to use the contextual lexer with LALR. It is less powerful than Earley, -# but it can handle some ambiguity when lexing and it's much faster. -# See examples/conf_lalr.py for an example of that approach. -# +""" +Earley’s dynamic lexer +====================== + +Demonstrates the power of Earley’s dynamic lexer on a toy configuration language + +Using a lexer for configuration files is tricky, because values don't +have to be surrounded by delimiters. Using a standard lexer for this just won't work. +In this example we use a dynamic lexer and let the Earley parser resolve the ambiguity. +Another approach is to use the contextual lexer with LALR. It is less powerful than Earley, +but it can handle some ambiguity when lexing and it's much faster. +See examples/conf_lalr.py for an example of that approach. + +""" from lark import Lark parser = Lark(r""" diff --git a/examples/conf_lalr.py b/examples/conf_lalr.py index 33d1dc0..5ffd1d2 100644 --- a/examples/conf_lalr.py +++ b/examples/conf_lalr.py @@ -1,18 +1,20 @@ -# -# This example demonstrates the power of the contextual lexer, by parsing a config file. -# -# The tokens NAME and VALUE match the same input. A standard lexer would arbitrarily -# choose one over the other, which would lead to a (confusing) parse error. -# However, due to the unambiguous structure of the grammar, Lark's LALR(1) algorithm knows -# which one of them to expect at each point during the parse. -# The lexer then only matches the tokens that the parser expects. -# The result is a correct parse, something that is impossible with a regular lexer. -# -# Another approach is to discard a lexer altogether and use the Earley algorithm. -# It will handle more cases than the contextual lexer, but at the cost of performance. -# See examples/conf_earley.py for an example of that approach. -# +""" +LALR’s contextual lexer +======================= + +Demonstrates the power of LALR’s contextual lexer on a toy configuration language. 
+The tokens NAME and VALUE match the same input. A standard lexer would arbitrarily +choose one over the other, which would lead to a (confusing) parse error. +However, due to the unambiguous structure of the grammar, Lark's LALR(1) algorithm knows +which one of them to expect at each point during the parse. +The lexer then only matches the tokens that the parser expects. +The result is a correct parse, something that is impossible with a regular lexer. + +Another approach is to discard a lexer altogether and use the Earley algorithm. +It will handle more cases than the contextual lexer, but at the cost of performance. +See examples/conf_earley.py for an example of that approach. +""" from lark import Lark parser = Lark(r""" diff --git a/examples/custom_lexer.py b/examples/custom_lexer.py index 786bf4f..05a5eb5 100644 --- a/examples/custom_lexer.py +++ b/examples/custom_lexer.py @@ -1,13 +1,14 @@ -# -# This example demonstrates using Lark with a custom lexer. -# -# You can use a custom lexer to tokenize text when the lexers offered by Lark -# are too slow, or not flexible enough. -# -# You can also use it (as shown in this example) to tokenize streams of objects. -# +""" +Custom lexer +============ +Demonstrates using a custom lexer to parse a non-textual stream of data +You can use a custom lexer to tokenize text when the lexers offered by Lark +are too slow, or not flexible enough. + +You can also use it (as shown in this example) to tokenize streams of objects. 
+""" from lark import Lark, Transformer, v_args from lark.lexer import Lexer, Token diff --git a/examples/error_puppet.py b/examples/error_puppet.py index 87d69e1..d3fca9d 100644 --- a/examples/error_puppet.py +++ b/examples/error_puppet.py @@ -1,11 +1,14 @@ -# -# This example demonstrates error handling using a parsing puppet in LALR -# -# When the parser encounters an UnexpectedToken exception, it creates a -# parsing puppet with the current parse-state, and lets you control how -# to proceed step-by-step. When you've achieved the correct parse-state, -# you can resume the run by returning True. -# +""" +Error handling with parsing puppet +================================== + +This example demonstrates error handling using a parsing puppet in LALR + +When the parser encounters an UnexpectedToken exception, it creates a +parsing puppet with the current parse-state, and lets you control how +to proceed step-by-step. When you've achieved the correct parse-state, +you can resume the run by returning True. +""" from lark import UnexpectedToken, Token diff --git a/examples/error_reporting_lalr.py b/examples/error_reporting_lalr.py index f038eda..deeeb5f 100644 --- a/examples/error_reporting_lalr.py +++ b/examples/error_reporting_lalr.py @@ -1,7 +1,10 @@ -# -# This demonstrates example-driven error reporting with the LALR parser -# +""" +Example Driver Error Reporting +============================== +A demonstration of example-driven error reporting with the LALR parser + +""" from lark import Lark, UnexpectedInput from .json_parser import json_grammar # Using the grammar from the json_parser example diff --git a/examples/fruitflies.py b/examples/fruitflies.py index cb6b5cc..697b4ac 100644 --- a/examples/fruitflies.py +++ b/examples/fruitflies.py @@ -1,7 +1,12 @@ -# -# This example shows how to use get explicit ambiguity from Lark's Earley parser. 
-# +""" +Handling Ambiguity +================== +A demonstration of ambiguity + +This example shows how to get explicit ambiguity from Lark's Earley parser. + +""" import sys from lark import Lark, tree diff --git a/examples/indented_tree.py b/examples/indented_tree.py index c31bb13..6cdaf37 100644 --- a/examples/indented_tree.py +++ b/examples/indented_tree.py @@ -1,13 +1,16 @@ -# -# This example demonstrates usage of the Indenter class. -# -# Since indentation is context-sensitive, a postlex stage is introduced to -# manufacture INDENT/DEDENT tokens. -# -# It is crucial for the indenter that the NL_type matches -# the spaces (and tabs) after the newline. -# +""" +Parsing Indentation +=================== + +A demonstration of parsing indentation (“whitespace significant” language) +and the usage of the Indenter class. +Since indentation is context-sensitive, a postlex stage is introduced to +manufacture INDENT/DEDENT tokens. + +It is crucial for the indenter that the NL_type matches +the spaces (and tabs) after the newline. +""" from lark import Lark from lark.indenter import Indenter diff --git a/examples/json_parser.py b/examples/json_parser.py index 7aa7d0f..927585f 100644 --- a/examples/json_parser.py +++ b/examples/json_parser.py @@ -1,10 +1,12 @@ -# -# This example shows how to write a basic JSON parser -# -# The code is short and clear, and outperforms every other parser (that's written in Python).
+For an explanation, check out the JSON parser tutorial at /docs/json_tutorial.md +""" import sys from lark import Lark, Transformer, v_args diff --git a/examples/lark_grammar.py b/examples/lark_grammar.py index c7ace47..5dc9d11 100644 --- a/examples/lark_grammar.py +++ b/examples/lark_grammar.py @@ -1,3 +1,9 @@ +""" +Lark Grammar +============ + +A reference implementation of the Lark grammar (using LALR(1) + standard lexer) +""" from lark import Lark parser = Lark(open('examples/lark.lark'), parser="lalr") diff --git a/examples/python_bytecode.py b/examples/python_bytecode.py index cbb8ccd..6165e82 100644 --- a/examples/python_bytecode.py +++ b/examples/python_bytecode.py @@ -1,12 +1,16 @@ -# -# This is a toy example that compiles Python directly to bytecode, without generating an AST. -# It currently only works for very very simple Python code. -# -# It requires the 'bytecode' library. You can get it using -# -# $ pip install bytecode -# +""" +Compile Python to Bytecode +========================== + +A toy example that compiles Python directly to bytecode, without generating an AST. +It currently only works for very very simple Python code. +It requires the 'bytecode' library. You can get it using +:: + + $ pip install bytecode + +""" from lark import Lark, Transformer, v_args from lark.indenter import Indenter diff --git a/examples/python_parser.py b/examples/python_parser.py index 82bfcb9..5e4d664 100644 --- a/examples/python_parser.py +++ b/examples/python_parser.py @@ -1,7 +1,11 @@ -# -# This example demonstrates usage of the included Python grammars -# +""" +Real Python Parser +================== +A fully-working Python 2 & 3 parser (but not production ready yet!) 
+ +This example demonstrates usage of the included Python grammars +""" import sys import os, os.path from io import open diff --git a/examples/qscintilla_json.py b/examples/qscintilla_json.py index 287981c..b876d4c 100644 --- a/examples/qscintilla_json.py +++ b/examples/qscintilla_json.py @@ -1,10 +1,14 @@ -# -# This example shows how to write a syntax-highlighted editor with Qt and Lark -# -# Requirements: -# -# PyQt5==5.10.1 -# QScintilla==2.10.4 +""" +Syntax Highlighting +=================== + +This example shows how to write a syntax-highlighted editor with Qt and Lark + +Requirements: + + PyQt5==5.10.1 + QScintilla==2.10.4 +""" import sys import textwrap diff --git a/examples/reconstruct_json.py b/examples/reconstruct_json.py index 59c58b0..4506c3a 100644 --- a/examples/reconstruct_json.py +++ b/examples/reconstruct_json.py @@ -1,9 +1,13 @@ -# -# This example demonstrates an experimental feature: Text reconstruction -# The Reconstructor takes a parse tree (already filtered from punctuation, of course), -# and reconstructs it into correct text, that can be parsed correctly. -# It can be useful for creating "hooks" to alter data before handing it to other parsers. You can also use it to generate samples from scratch. -# +""" +Reconstruct a JSON +================== + +Demonstrates the experimental text-reconstruction feature + +The Reconstructor takes a parse tree (already filtered from punctuation, of course), +and reconstructs it into correct text, that can be parsed correctly. +It can be useful for creating "hooks" to alter data before handing it to other parsers. You can also use it to generate samples from scratch. 
+""" import json diff --git a/examples/templates.py b/examples/templates.py index 2acc6eb..2f28d90 100644 --- a/examples/templates.py +++ b/examples/templates.py @@ -1,7 +1,10 @@ -# -# This example shows how to use Lark's templates to achieve cleaner grammars -# +""" +Templates +========= + +This example shows how to use Lark's templates to achieve cleaner grammars +""" from lark import Lark grammar = r""" diff --git a/examples/turtle_dsl.py b/examples/turtle_dsl.py index 775a98e..81a9cde 100644 --- a/examples/turtle_dsl.py +++ b/examples/turtle_dsl.py @@ -1,4 +1,9 @@ -# This example implements a LOGO-like toy language for Python's turtle, with interpreter. +""" +Turtle DSL +========== + +Implements a LOGO-like toy language for Python’s turtle, with interpreter. +""" try: input = raw_input # For Python2 compatibility From 10dd3d7299b5a9a36ed801e276c53c3e7973cead Mon Sep 17 00:00:00 2001 From: starwarswii Date: Wed, 19 Aug 2020 04:23:57 -0400 Subject: [PATCH 143/164] corrected caret placement in error messages due to tabs fixes #663 --- lark/exceptions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index c538888..6288398 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -49,11 +49,11 @@ class UnexpectedInput(LarkError): if not isinstance(text, bytes): before = text[start:pos].rsplit('\n', 1)[-1] after = text[pos:end].split('\n', 1)[0] - return before + after + '\n' + ' ' * len(before) + '^\n' + return before + after + '\n' + ' ' * len(before.expandtabs()) + '^\n' else: before = text[start:pos].rsplit(b'\n', 1)[-1] after = text[pos:end].split(b'\n', 1)[0] - return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") + return (before + after + b'\n' + b' ' * len(before.expandtabs()) + b'^\n').decode("ascii", "backslashreplace") def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False): """Allows you to detect what's
wrong in the input text by matching From fdd5c83588ba38ac7e931060f3e84469608e859a Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 19 Aug 2020 13:03:19 +0300 Subject: [PATCH 144/164] Updated docstring for Token --- lark/lexer.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 87e286e..ff22cbb 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -90,15 +90,15 @@ class TerminalDef(Serialize): class Token(Str): - """Token of a lexer. + """A string with meta-information, that is produced by the lexer. - When using a lexer, the resulting tokens in the trees will be of the - Token class, which inherits from Python's string. So, normal string - comparisons and operations will work as expected. Tokens also have other - useful attributes. + When parsing text, the resulting chunks of the input that haven't been discarded, + will end up in the tree as Token instances. The Token class inherits from Python's `str`, + so normal string comparisons and operations will work as expected. Attributes: type_: Name of the token (as specified in grammar) + value: Value of the token (redundant, as `token.value == token` will always be true) pos_in_stream: The index of the token in the text line: The line of the token in the text (starting with 1) column: The column of the token in the text (starting with 1) @@ -106,8 +106,7 @@ class Token(Str): end_column: The next column after the end of the token. For example, if the token is a single character with a column value of 4, end_column will be 5. 
- end_pos: the index where the token ends (basically pos_in_stream + - len(token)) + end_pos: the index where the token ends (basically `pos_in_stream + len(token)`) """ __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') From 49914424df50f205facb5c48b27a8d154f68f030 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 19 Aug 2020 13:05:45 +0300 Subject: [PATCH 145/164] Docs: Tiny fix (rst) --- lark/lexer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index ff22cbb..4ee7b53 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -93,12 +93,12 @@ class Token(Str): """A string with meta-information, that is produced by the lexer. When parsing text, the resulting chunks of the input that haven't been discarded, - will end up in the tree as Token instances. The Token class inherits from Python's `str`, + will end up in the tree as Token instances. The Token class inherits from Python's ``str``, so normal string comparisons and operations will work as expected. Attributes: type_: Name of the token (as specified in grammar) - value: Value of the token (redundant, as `token.value == token` will always be true) + value: Value of the token (redundant, as ``token.value == token`` will always be true) pos_in_stream: The index of the token in the text line: The line of the token in the text (starting with 1) column: The column of the token in the text (starting with 1) @@ -106,7 +106,7 @@ class Token(Str): end_column: The next column after the end of the token. For example, if the token is a single character with a column value of 4, end_column will be 5. 
- end_pos: the index where the token ends (basically `pos_in_stream + len(token)`) + end_pos: the index where the token ends (basically ``pos_in_stream + len(token)``) """ __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') From 96b0cfbbc4706a0d1600de642924ef56287a478f Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 19 Aug 2020 13:08:32 +0300 Subject: [PATCH 146/164] Docs: error in docs.. --- lark/lexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/lexer.py b/lark/lexer.py index 4ee7b53..6eb3eca 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -97,7 +97,7 @@ class Token(Str): so normal string comparisons and operations will work as expected. Attributes: - type_: Name of the token (as specified in grammar) + type: Name of the token (as specified in grammar) value: Value of the token (redundant, as ``token.value == token`` will always be true) pos_in_stream: The index of the token in the text line: The line of the token in the text (starting with 1) From e910acffb15fe771ba4496e18a46a8b9b24e7b47 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Wed, 19 Aug 2020 15:54:53 +0530 Subject: [PATCH 147/164] no symlink --- .gitignore | 3 ++- docs/conf.py | 6 +++++- docs/examples | 1 - docs/index.rst | 2 +- examples/README.rst | 3 +-- 5 files changed, 9 insertions(+), 6 deletions(-) delete mode 120000 docs/examples diff --git a/.gitignore b/.gitignore index 62b900c..b30399e 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ tags .mypy_cache /dist /build -docs/_build \ No newline at end of file +docs/_build +docs/examples \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index e41cf0b..5f874e9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -176,5 +176,9 @@ texinfo_documents = [ 'Miscellaneous'), ] +# -- Sphinx gallery config ------------------------------------------- - +sphinx_gallery_conf = { + 'examples_dirs': ['../examples'], + 'gallery_dirs': ['examples'], +} 
\ No newline at end of file diff --git a/docs/examples b/docs/examples deleted file mode 120000 index 785887f..0000000 --- a/docs/examples +++ /dev/null @@ -1 +0,0 @@ -../examples/ \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 0ddadb3..c3163a1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,7 +24,7 @@ Welcome to Lark's documentation! how_to_use how_to_develop recipes - auto_examples/index + examples/index .. toctree:: diff --git a/examples/README.rst b/examples/README.rst index 977c318..e72cb5b 100644 --- a/examples/README.rst +++ b/examples/README.rst @@ -1,8 +1,7 @@ Examples for Lark ================= -How to run the examples -^^^^^^^^^^^^^^^^^^^^^^^ +**How to run the examples**: After cloning the repo, open the terminal into the root directory of the project, and run the following: From 582e9ab9356268d62e63a7d89ea0df0617c36e93 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Wed, 19 Aug 2020 16:03:02 +0530 Subject: [PATCH 148/164] move advanced examples to a different folder --- examples/README.rst | 48 +------------------ examples/advanced/README.rst | 2 + examples/{ => advanced}/conf_earley.py | 0 examples/{ => advanced}/conf_lalr.py | 0 examples/{ => advanced}/custom_lexer.py | 0 examples/{ => advanced}/error_puppet.py | 0 .../{ => advanced}/error_reporting_lalr.py | 0 examples/{ => advanced}/python2.lark | 0 examples/{ => advanced}/python3.lark | 0 examples/{ => advanced}/python_bytecode.py | 0 examples/{ => advanced}/python_parser.py | 0 examples/{ => advanced}/qscintilla_json.py | 0 examples/{ => advanced}/reconstruct_json.py | 0 examples/{ => advanced}/template_lark.lark | 0 examples/{ => advanced}/templates.py | 0 15 files changed, 4 insertions(+), 46 deletions(-) create mode 100644 examples/advanced/README.rst rename examples/{ => advanced}/conf_earley.py (100%) rename examples/{ => advanced}/conf_lalr.py (100%) rename examples/{ => advanced}/custom_lexer.py (100%) rename examples/{ => 
advanced}/error_puppet.py (100%) rename examples/{ => advanced}/error_reporting_lalr.py (100%) rename examples/{ => advanced}/python2.lark (100%) rename examples/{ => advanced}/python3.lark (100%) rename examples/{ => advanced}/python_bytecode.py (100%) rename examples/{ => advanced}/python_parser.py (100%) rename examples/{ => advanced}/qscintilla_json.py (100%) rename examples/{ => advanced}/reconstruct_json.py (100%) rename examples/{ => advanced}/template_lark.lark (100%) rename examples/{ => advanced}/templates.py (100%) diff --git a/examples/README.rst b/examples/README.rst index e72cb5b..f2b0125 100644 --- a/examples/README.rst +++ b/examples/README.rst @@ -17,49 +17,5 @@ standard library of your local installation: [lark]$ python -m examples.python_parser -Beginners -~~~~~~~~~ - -- `calc.py`_ - A simple example of a REPL calculator -- `json_parser.py`_ - A simple JSON parser (comes with a tutorial, see - docs) -- `indented_tree.py`_ - A demonstration of parsing indentation - (“whitespace significant” language) -- `fruitflies.py`_ - A demonstration of ambiguity -- `turtle_dsl.py`_ - Implements a LOGO-like toy language for Python’s - turtle, with interpreter. -- `lark_grammar.py`_ + `lark.lark`_ - A reference implementation of the - Lark grammar (using LALR(1) + standard lexer) - -Advanced -~~~~~~~~ - -- `error_reporting_lalr.py`_ - A demonstration of example-driven error - reporting with the LALR parser -- `python_parser.py`_ - A fully-working Python 2 & 3 parser (but not - production ready yet!) 
-- `python_bytecode.py`_ - A toy example showing how to compile Python - directly to bytecode -- `conf_lalr.py`_ - Demonstrates the power of LALR’s contextual lexer - on a toy configuration language -- `conf_earley.py`_ - Demonstrates the power of Earley’s dynamic lexer - on a toy configuration language -- `custom_lexer.py`_ - Demonstrates using a custom lexer to parse a - non-textual stream of data -- `reconstruct_json.py`_ - Demonstrates the experimental - text-reconstruction feature - -.. _calc.py: calc.py -.. _json_parser.py: json_parser.py -.. _indented_tree.py: indented_tree.py -.. _fruitflies.py: fruitflies.py -.. _turtle_dsl.py: turtle_dsl.py -.. _lark_grammar.py: lark_grammar.py -.. _lark.lark: lark.lark -.. _error_reporting_lalr.py: error_reporting_lalr.py -.. _python_parser.py: python_parser.py -.. _python_bytecode.py: python_bytecode.py -.. _conf_lalr.py: conf_lalr.py -.. _conf_earley.py: conf_earley.py -.. _custom_lexer.py: custom_lexer.py -.. _reconstruct_json.py: reconstruct_json.py \ No newline at end of file +Beginner Examples +~~~~~~~~~~~~~~~~~ diff --git a/examples/advanced/README.rst b/examples/advanced/README.rst new file mode 100644 index 0000000..9605486 --- /dev/null +++ b/examples/advanced/README.rst @@ -0,0 +1,2 @@ +Advanced Examples +~~~~~~~~~~~~~~~~~ diff --git a/examples/conf_earley.py b/examples/advanced/conf_earley.py similarity index 100% rename from examples/conf_earley.py rename to examples/advanced/conf_earley.py diff --git a/examples/conf_lalr.py b/examples/advanced/conf_lalr.py similarity index 100% rename from examples/conf_lalr.py rename to examples/advanced/conf_lalr.py diff --git a/examples/custom_lexer.py b/examples/advanced/custom_lexer.py similarity index 100% rename from examples/custom_lexer.py rename to examples/advanced/custom_lexer.py diff --git a/examples/error_puppet.py b/examples/advanced/error_puppet.py similarity index 100% rename from examples/error_puppet.py rename to examples/advanced/error_puppet.py diff 
--git a/examples/error_reporting_lalr.py b/examples/advanced/error_reporting_lalr.py similarity index 100% rename from examples/error_reporting_lalr.py rename to examples/advanced/error_reporting_lalr.py diff --git a/examples/python2.lark b/examples/advanced/python2.lark similarity index 100% rename from examples/python2.lark rename to examples/advanced/python2.lark diff --git a/examples/python3.lark b/examples/advanced/python3.lark similarity index 100% rename from examples/python3.lark rename to examples/advanced/python3.lark diff --git a/examples/python_bytecode.py b/examples/advanced/python_bytecode.py similarity index 100% rename from examples/python_bytecode.py rename to examples/advanced/python_bytecode.py diff --git a/examples/python_parser.py b/examples/advanced/python_parser.py similarity index 100% rename from examples/python_parser.py rename to examples/advanced/python_parser.py diff --git a/examples/qscintilla_json.py b/examples/advanced/qscintilla_json.py similarity index 100% rename from examples/qscintilla_json.py rename to examples/advanced/qscintilla_json.py diff --git a/examples/reconstruct_json.py b/examples/advanced/reconstruct_json.py similarity index 100% rename from examples/reconstruct_json.py rename to examples/advanced/reconstruct_json.py diff --git a/examples/template_lark.lark b/examples/advanced/template_lark.lark similarity index 100% rename from examples/template_lark.lark rename to examples/advanced/template_lark.lark diff --git a/examples/templates.py b/examples/advanced/templates.py similarity index 100% rename from examples/templates.py rename to examples/advanced/templates.py From 6d0bff5d407c2e422b9484972980746283958671 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Wed, 19 Aug 2020 16:36:59 +0530 Subject: [PATCH 149/164] add sphinx-gallery to doc requirements --- docs/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 docs/requirements.txt diff --git a/docs/requirements.txt 
b/docs/requirements.txt new file mode 100644 index 0000000..d75b0aa --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,2 @@ +# https://docs.readthedocs.io/en/stable/guides/specifying-dependencies.html#specifying-a-requirements-file +sphinx-gallery \ No newline at end of file From afde561ac3b0da2a18bfa8e2cbdd160e949645b5 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 19 Aug 2020 14:15:16 +0300 Subject: [PATCH 150/164] Refactored reconstructor out into tree_matcher. Functionality should stay the same. --- lark/reconstruct.py | 144 +++++------------------------------ lark/tree_matcher.py | 177 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 195 insertions(+), 126 deletions(-) create mode 100644 lark/tree_matcher.py diff --git a/lark/reconstruct.py b/lark/reconstruct.py index 35e5994..e7cff31 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -1,18 +1,13 @@ -import unicodedata +"""Reconstruct text from a tree, based on Lark grammar""" -from collections import defaultdict +import unicodedata from .tree import Tree from .visitors import Transformer_InPlace -from .common import ParserConf from .lexer import Token, PatternStr -from .parsers import earley -from .grammar import Rule, Terminal, NonTerminal - - +from .grammar import Terminal, NonTerminal -def is_discarded_terminal(t): - return t.is_term and t.filter_out +from .tree_matcher import TreeMatcher, is_discarded_terminal def is_iter_empty(i): try: @@ -61,138 +56,35 @@ class WriteTokensTransformer(Transformer_InPlace): return to_write -class MatchTree(Tree): - pass - -class MakeMatchTree: - def __init__(self, name, expansion): - self.name = name - self.expansion = expansion - - def __call__(self, args): - t = MatchTree(self.name, args) - t.meta.match_tree = True - t.meta.orig_expansion = self.expansion - return t - -def best_from_group(seq, group_key, cmp_key): - d = {} - for item in seq: - key = group_key(item) - if key in d: - v1 = cmp_key(item) - v2 = cmp_key(d[key]) - if v2 > v1: - 
d[key] = item - else: - d[key] = item - return list(d.values()) - - -def make_recons_rule(origin, expansion, old_expansion): - return Rule(origin, expansion, alias=MakeMatchTree(origin.name, old_expansion)) - -def make_recons_rule_to_term(origin, term): - return make_recons_rule(origin, [Terminal(term.name)], [term]) - def _isalnum(x): # Categories defined here: https://www.python.org/dev/peps/pep-3131/ return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'] -class Reconstructor: +class Reconstructor(TreeMatcher): """ A Reconstructor that will, given a full parse Tree, generate source code. - Pass `term_subs`, a dictionary of [Terminal name as str] to [output text as str] - to say what discarded Terminals should be written as. - """ - def __init__(self, parser, term_subs=None): - # XXX TODO calling compile twice returns different results! - assert parser.options.maybe_placeholders == False - if term_subs is None: - term_subs = {} - tokens, rules, _grammar_extra = parser.grammar.compile(parser.options.start) - - self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}, term_subs) - self.rules_for_root = defaultdict(list) - self.rules = list(self._build_recons_rules(rules)) - self.rules.reverse() + Note: + The reconstructor cannot generate values from regexps. If you need to produce discarded + regexes, such as newlines, use `term_subs` and provide default values for them. - # Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation. 
- self.rules = best_from_group(self.rules, lambda r: r, lambda r: -len(r.expansion)) - - self.rules.sort(key=lambda r: len(r.expansion)) - self.parser = parser - self._parser_cache = {} - - def _build_recons_rules(self, rules): - expand1s = {r.origin for r in rules if r.options.expand1} - - aliases = defaultdict(list) - for r in rules: - if r.alias: - aliases[r.origin].append( r.alias ) - - rule_names = {r.origin for r in rules} - nonterminals = {sym for sym in rule_names - if sym.name.startswith('_') or sym in expand1s or sym in aliases } - - seen = set() - for r in rules: - recons_exp = [sym if sym in nonterminals else Terminal(sym.name) - for sym in r.expansion if not is_discarded_terminal(sym)] - - # Skip self-recursive constructs - if recons_exp == [r.origin] and r.alias is None: - continue - - sym = NonTerminal(r.alias) if r.alias else r.origin - rule = make_recons_rule(sym, recons_exp, r.expansion) - - if sym in expand1s and len(recons_exp) != 1: - self.rules_for_root[sym.name].append(rule) - - if sym.name not in seen: - yield make_recons_rule_to_term(sym, sym) - seen.add(sym.name) - else: - if sym.name.startswith('_') or sym in expand1s: - yield rule - else: - self.rules_for_root[sym.name].append(rule) + Paramters: + parser: a Lark instance + term_subs: a dictionary of [Terminal name as str] to [output text as str] + """ - for origin, rule_aliases in aliases.items(): - for alias in rule_aliases: - yield make_recons_rule_to_term(origin, NonTerminal(alias)) - yield make_recons_rule_to_term(origin, origin) + def __init__(self, parser, term_subs=None): + TreeMatcher.__init__(self, parser) - def _match(self, term, token): - if isinstance(token, Tree): - return Terminal(token.data) == term - elif isinstance(token, Token): - return term == Terminal(token.type) - assert False + self.write_tokens = WriteTokensTransformer({t.name:t for t in self.tokens}, term_subs or {}) def _reconstruct(self, tree): - # TODO: ambiguity? 
- try: - parser = self._parser_cache[tree.data] - except KeyError: - rules = self.rules + best_from_group( - self.rules_for_root[tree.data], lambda r: r, lambda r: -len(r.expansion) - ) - - rules.sort(key=lambda r: len(r.expansion)) - - callbacks = {rule: rule.alias for rule in rules} # TODO pass callbacks through dict, instead of alias? - parser = earley.Parser(ParserConf(rules, callbacks, [tree.data]), self._match, resolve_ambiguity=True) - self._parser_cache[tree.data] = parser - - unreduced_tree = parser.parse(tree.children, tree.data) # find a full derivation - assert unreduced_tree.data == tree.data + unreduced_tree = self.match_tree(tree, tree.data) + res = self.write_tokens.transform(unreduced_tree) for item in res: if isinstance(item, Tree): + # TODO use orig_expansion.rulename to support templates for x in self._reconstruct(item): yield x else: diff --git a/lark/tree_matcher.py b/lark/tree_matcher.py new file mode 100644 index 0000000..129a15e --- /dev/null +++ b/lark/tree_matcher.py @@ -0,0 +1,177 @@ +"""Tree matcher based on Lark grammar""" + +import re +from collections import defaultdict + +from lark import Tree, Token +from lark.common import ParserConf +from lark.parsers import earley +from lark.grammar import Rule, Terminal, NonTerminal + + +def is_discarded_terminal(t): + return t.is_term and t.filter_out + + +class _MakeTreeMatch: + def __init__(self, name, expansion): + self.name = name + self.expansion = expansion + + def __call__(self, args): + t = Tree(self.name, args) + t.meta.match_tree = True + t.meta.orig_expansion = self.expansion + return t + + +def _best_from_group(seq, group_key, cmp_key): + d = {} + for item in seq: + key = group_key(item) + if key in d: + v1 = cmp_key(item) + v2 = cmp_key(d[key]) + if v2 > v1: + d[key] = item + else: + d[key] = item + return list(d.values()) + + +def _best_rules_from_group(rules): + rules = _best_from_group(rules, lambda r: r, lambda r: -len(r.expansion)) + rules.sort(key=lambda r: len(r.expansion)) 
+ return rules + + +def _match(term, token): + if isinstance(token, Tree): + name, _args = parse_rulename(term.name) + return token.data == name + elif isinstance(token, Token): + return term == Terminal(token.type) + assert False + + +def make_recons_rule(origin, expansion, old_expansion): + return Rule(origin, expansion, alias=_MakeTreeMatch(origin.name, old_expansion)) + + +def make_recons_rule_to_term(origin, term): + return make_recons_rule(origin, [Terminal(term.name)], [term]) + + +def parse_rulename(s): + "Parse rule names that may contain a template syntax (like rule{a, b, ...})" + name, args_str = re.match(r'(\w+)(?:{(.+)})?', s).groups() + args = args_str and [a.strip() for a in args_str.split(',')] + return name, args + + +class TreeMatcher: + """Match the elements of a tree node, based on an ontology + provided by a Lark grammar. + + Supports templates and inlined rules (`rule{a, b,..}` and `_rule`) + + Initiialize with an instance of Lark. + """ + + def __init__(self, parser): + # XXX TODO calling compile twice returns different results! + assert parser.options.maybe_placeholders == False + self.tokens, rules, _extra = parser.grammar.compile(parser.options.start) + + self.rules_for_root = defaultdict(list) + + self.rules = list(self._build_recons_rules(rules)) + self.rules.reverse() + + # Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation. 
+ self.rules = _best_rules_from_group(self.rules) + + self.parser = parser + self._parser_cache = {} + + def _build_recons_rules(self, rules): + "Convert tree-parsing/construction rules to tree-matching rules" + expand1s = {r.origin for r in rules if r.options.expand1} + + aliases = defaultdict(list) + for r in rules: + if r.alias: + aliases[r.origin].append(r.alias) + + rule_names = {r.origin for r in rules} + nonterminals = {sym for sym in rule_names + if sym.name.startswith('_') or sym in expand1s or sym in aliases} + + seen = set() + for r in rules: + recons_exp = [sym if sym in nonterminals else Terminal(sym.name) + for sym in r.expansion if not is_discarded_terminal(sym)] + + # Skip self-recursive constructs + if recons_exp == [r.origin] and r.alias is None: + continue + + sym = NonTerminal(r.alias) if r.alias else r.origin + rule = make_recons_rule(sym, recons_exp, r.expansion) + + if sym in expand1s and len(recons_exp) != 1: + self.rules_for_root[sym.name].append(rule) + + if sym.name not in seen: + yield make_recons_rule_to_term(sym, sym) + seen.add(sym.name) + else: + if sym.name.startswith('_') or sym in expand1s: + yield rule + else: + self.rules_for_root[sym.name].append(rule) + + for origin, rule_aliases in aliases.items(): + for alias in rule_aliases: + yield make_recons_rule_to_term(origin, NonTerminal(alias)) + yield make_recons_rule_to_term(origin, origin) + + def match_tree(self, tree, rulename): + """Match the elements of `tree` to the symbols of rule `rulename`. + + Args: + tree (Tree): the tree node to match + rulename ([type]): [description] + + Returns: + Tree: an unreduced tree that matches `rulename` + + Raises: + UnexpectedToken: If no match was found. + + Note: + It's the callers' responsibility match the tree recursively. + """ + if rulename: + # validate + name, _args = parse_rulename(rulename) + assert tree.data == name + else: + rulename = tree.data + + # TODO: ambiguity? 
+ try: + parser = self._parser_cache[rulename] + except KeyError: + rules = self.rules + _best_rules_from_group(self.rules_for_root[rulename]) + + # TODO pass callbacks through dict, instead of alias? + callbacks = {rule: rule.alias for rule in rules} + conf = ParserConf(rules, callbacks, [rulename]) + parser = earley.Parser(conf, _match, resolve_ambiguity=True) + self._parser_cache[rulename] = parser + + # find a full derivation + unreduced_tree = parser.parse(tree.children, rulename) + assert unreduced_tree.data == rulename + return unreduced_tree From afebbf7348d6c1b664d4167bc2c9a2a1b55e0544 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 19 Aug 2020 14:16:58 +0300 Subject: [PATCH 151/164] Adjustments --- examples/json_parser.py | 2 -- examples/lark_grammar.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/json_parser.py b/examples/json_parser.py index 927585f..c3573f3 100644 --- a/examples/json_parser.py +++ b/examples/json_parser.py @@ -2,8 +2,6 @@ Simple JSON Parser ================== -A simple JSON parser (comes with a tutorial, see docs) - The code is short and clear, and outperforms every other parser (that's written in Python). 
For an explanation, check out the JSON parser tutorial at /docs/json_tutorial.md """ diff --git a/examples/lark_grammar.py b/examples/lark_grammar.py index 5dc9d11..e8566fb 100644 --- a/examples/lark_grammar.py +++ b/examples/lark_grammar.py @@ -2,7 +2,7 @@ Lark Grammar ============ -A reference implementation of the Lark grammar (using LALR(1) + standard lexer) +A reference implementation of the Lark grammar (using LALR(1)) """ from lark import Lark From 6f48b23383619d23720367a5a187c4f05e5b845a Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 19 Aug 2020 14:23:38 +0300 Subject: [PATCH 152/164] Fix imports --- lark/tree_matcher.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lark/tree_matcher.py b/lark/tree_matcher.py index 129a15e..38ac87f 100644 --- a/lark/tree_matcher.py +++ b/lark/tree_matcher.py @@ -3,10 +3,10 @@ import re from collections import defaultdict -from lark import Tree, Token -from lark.common import ParserConf -from lark.parsers import earley -from lark.grammar import Rule, Terminal, NonTerminal +from . 
import Tree, Token +from .common import ParserConf +from .parsers import earley +from .grammar import Rule, Terminal, NonTerminal def is_discarded_terminal(t): From 29a877ce35d150a7d978efa2eb12647fd85e5c42 Mon Sep 17 00:00:00 2001 From: Sasank Chilamkurthy Date: Wed, 19 Aug 2020 16:54:56 +0530 Subject: [PATCH 153/164] requirements for docs --- readthedocs.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/readthedocs.yml b/readthedocs.yml index bda2e6c..4636dc7 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -2,6 +2,11 @@ version: 2 formats: all +python: + version: 3.7 + install: + - requirements: docs/requirements.txt + # Build documentation in the docs/ directory with Sphinx sphinx: - configuration: docs/conf.py \ No newline at end of file + configuration: docs/conf.py From 42ada346cdb5eb75deb325bc495cf431bea87d01 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 19 Aug 2020 15:23:20 +0300 Subject: [PATCH 154/164] Corrected docstring --- lark/tree_matcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lark/tree_matcher.py b/lark/tree_matcher.py index 38ac87f..b9306c4 100644 --- a/lark/tree_matcher.py +++ b/lark/tree_matcher.py @@ -139,9 +139,9 @@ class TreeMatcher: def match_tree(self, tree, rulename): """Match the elements of `tree` to the symbols of rule `rulename`. 
- Args: + Parameters: tree (Tree): the tree node to match - rulename ([type]): [description] + rulename (str): The expected full rule name (including template args) Returns: Tree: an unreduced tree that matches `rulename` From c9bd6d491f2840c276feea5b5ff13f567d6f700c Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 19 Aug 2020 19:39:22 +0300 Subject: [PATCH 155/164] Fixes for examples --- examples/advanced/error_puppet.py | 2 +- examples/advanced/error_reporting_lalr.py | 2 +- examples/advanced/python_parser.py | 4 ++-- examples/advanced/templates.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/advanced/error_puppet.py b/examples/advanced/error_puppet.py index d3fca9d..b749749 100644 --- a/examples/advanced/error_puppet.py +++ b/examples/advanced/error_puppet.py @@ -1,5 +1,5 @@ """ -Error handling with parsing puppet +Error handling with a puppet ================================== This example demonstrates error handling using a parsing puppet in LALR diff --git a/examples/advanced/error_reporting_lalr.py b/examples/advanced/error_reporting_lalr.py index deeeb5f..bf95bd6 100644 --- a/examples/advanced/error_reporting_lalr.py +++ b/examples/advanced/error_reporting_lalr.py @@ -1,5 +1,5 @@ """ -Example Driver Error Reporting +Example-Driven Error Reporting ============================== A demonstration of example-driven error reporting with the LALR parser diff --git a/examples/advanced/python_parser.py b/examples/advanced/python_parser.py index 5e4d664..7fbff2e 100644 --- a/examples/advanced/python_parser.py +++ b/examples/advanced/python_parser.py @@ -1,6 +1,6 @@ """ -Real Python Parser -================== +Grammar-complete Python Parser +============================== A fully-working Python 2 & 3 parser (but not production ready yet!) 
diff --git a/examples/advanced/templates.py b/examples/advanced/templates.py index 2f28d90..ac59b7a 100644 --- a/examples/advanced/templates.py +++ b/examples/advanced/templates.py @@ -4,7 +4,7 @@ Templates This example shows how to use Lark's templates to achieve cleaner grammars -"""" +""" from lark import Lark grammar = r""" From 1ad46b2a9ab56ad80acb8fd7a44fcd21dccbb867 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 20 Aug 2020 10:34:41 +0300 Subject: [PATCH 156/164] Fixed examples (Issue #669) --- examples/advanced/_json_parser.py | 64 +++++++++++++++++++++++ examples/advanced/error_puppet.py | 4 +- examples/advanced/error_reporting_lalr.py | 2 +- examples/advanced/reconstruct_json.py | 2 +- examples/lark_grammar.py | 22 ++++---- 5 files changed, 81 insertions(+), 13 deletions(-) create mode 100644 examples/advanced/_json_parser.py diff --git a/examples/advanced/_json_parser.py b/examples/advanced/_json_parser.py new file mode 100644 index 0000000..80d9101 --- /dev/null +++ b/examples/advanced/_json_parser.py @@ -0,0 +1,64 @@ +""" +Simple JSON Parser +================== + +The code is short and clear, and outperforms every other parser (that's written in Python). 
+For an explanation, check out the JSON parser tutorial at /docs/json_tutorial.md + +(this is here for use by the other examples) +""" +import sys + +from lark import Lark, Transformer, v_args + +json_grammar = r""" + ?start: value + + ?value: object + | array + | string + | SIGNED_NUMBER -> number + | "true" -> true + | "false" -> false + | "null" -> null + + array : "[" [value ("," value)*] "]" + object : "{" [pair ("," pair)*] "}" + pair : string ":" value + + string : ESCAPED_STRING + + %import common.ESCAPED_STRING + %import common.SIGNED_NUMBER + %import common.WS + + %ignore WS +""" + + +class TreeToJson(Transformer): + @v_args(inline=True) + def string(self, s): + return s[1:-1].replace('\\"', '"') + + array = list + pair = tuple + object = dict + number = v_args(inline=True)(float) + + null = lambda self, _: None + true = lambda self, _: True + false = lambda self, _: False + + +### Create the JSON parser with Lark, using the LALR algorithm +json_parser = Lark(json_grammar, parser='lalr', + # Using the standard lexer isn't required, and isn't usually recommended. + # But, it's good enough for JSON, and it's slightly faster. + lexer='standard', + # Disabling propagate_positions and placeholders slightly improves speed + propagate_positions=False, + maybe_placeholders=False, + # Using an internal transformer is faster and more memory efficient + transformer=TreeToJson()) + diff --git a/examples/advanced/error_puppet.py b/examples/advanced/error_puppet.py index b749749..36c14c4 100644 --- a/examples/advanced/error_puppet.py +++ b/examples/advanced/error_puppet.py @@ -10,9 +10,9 @@ to proceed step-by-step. When you've achieved the correct parse-state, you can resume the run by returning True. 
""" -from lark import UnexpectedToken, Token +from lark import Token -from .json_parser import json_parser +from _json_parser import json_parser def ignore_errors(e): if e.token.type == 'COMMA': diff --git a/examples/advanced/error_reporting_lalr.py b/examples/advanced/error_reporting_lalr.py index bf95bd6..102f7b1 100644 --- a/examples/advanced/error_reporting_lalr.py +++ b/examples/advanced/error_reporting_lalr.py @@ -7,7 +7,7 @@ A demonstration of example-driven error reporting with the LALR parser """ from lark import Lark, UnexpectedInput -from .json_parser import json_grammar # Using the grammar from the json_parser example +from _json_parser import json_grammar # Using the grammar from the json_parser example json_parser = Lark(json_grammar, parser='lalr') diff --git a/examples/advanced/reconstruct_json.py b/examples/advanced/reconstruct_json.py index 4506c3a..201bc32 100644 --- a/examples/advanced/reconstruct_json.py +++ b/examples/advanced/reconstruct_json.py @@ -14,7 +14,7 @@ import json from lark import Lark from lark.reconstruct import Reconstructor -from .json_parser import json_grammar +from _json_parser import json_grammar test_json = ''' { diff --git a/examples/lark_grammar.py b/examples/lark_grammar.py index e8566fb..b424e87 100644 --- a/examples/lark_grammar.py +++ b/examples/lark_grammar.py @@ -4,18 +4,22 @@ Lark Grammar A reference implementation of the Lark grammar (using LALR(1)) """ -from lark import Lark +import lark +from pathlib import Path -parser = Lark(open('examples/lark.lark'), parser="lalr") +parser = lark.Lark.open('lark.lark', rel_to=__file__, parser="lalr") + +examples_path = Path(__file__).parent +lark_path = Path(lark.__file__).parent grammar_files = [ - 'examples/python2.lark', - 'examples/python3.lark', - 'examples/lark.lark', - 'examples/relative-imports/multiples.lark', - 'examples/relative-imports/multiple2.lark', - 'examples/relative-imports/multiple3.lark', - 'lark/grammars/common.lark', + examples_path / 'lark.lark', + 
examples_path / 'advanced/python2.lark', + examples_path / 'advanced/python3.lark', + examples_path / 'relative-imports/multiples.lark', + examples_path / 'relative-imports/multiple2.lark', + examples_path / 'relative-imports/multiple3.lark', + lark_path / 'grammars/common.lark', ] def test(): From b87acedac43a89bc2883c944febda4c9ece579eb Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 23 Aug 2020 16:07:07 +0300 Subject: [PATCH 157/164] Fixed docs. Added support for hashing ParserPuppet --- docs/grammar.md | 2 +- lark/parsers/lalr_puppet.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/grammar.md b/docs/grammar.md index e532297..1cd72dd 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -204,7 +204,7 @@ Each item is one of: * `TERMINAL` * `"string literal"` or `/regexp literal/` * `(item item ..)` - Group items -* `[item item ..]` - Maybe. Same as `(item item ..)?`, but generates `None` if there is no match +* `[item item ..]` - Maybe. Same as `(item item ..)?`, but when `maybe_placeholders=True`, generates `None` if there is no match. 
* `item?` - Zero or one instances of item ("maybe") * `item*` - Zero or more instances of item * `item+` - One or more instances of item diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py index 40cc0c1..35b2250 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -88,6 +88,9 @@ class ParserPuppet(object): self._start == other._start ) + def __hash__(self): + return hash((tuple(self._state_stack), self._start)) + def pretty(self): """Print the output of ``choices()`` in a way that's easier to read.""" out = ["Puppet choices:"] From a01de190d3bff78de82fc4d86f8b29095f2ae03b Mon Sep 17 00:00:00 2001 From: Omega16 <22673084+omega16@users.noreply.github.com> Date: Mon, 24 Aug 2020 15:35:29 -0500 Subject: [PATCH 158/164] Added pydot__tree_to_dot and pydot__tree_to_graph, changed pydot__tree_to_png --- lark/tree.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/lark/tree.py b/lark/tree.py index b9dddf4..45346c6 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -160,6 +160,15 @@ class SlottedTree(Tree): def pydot__tree_to_png(tree, filename, rankdir="LR", **kwargs): + graph = pydot__tree_to_graph(tree, rankdir, **kwargs) + graph.write_png(filename) + + +def pydot__tree_to_dot(tree, filename, rankdir="LR", **kwargs): + graph = pydot__tree_to_graph(tree, rankdir, **kwargs) + graph.write(filename) + +def pydot__tree_to_graph(tree, rankdir="LR", **kwargs): """Creates a colorful image that represents the tree (data+children, without meta) Possible values for `rankdir` are "TB", "LR", "BT", "RL", corresponding to @@ -197,5 +206,5 @@ def pydot__tree_to_png(tree, filename, rankdir="LR", **kwargs): return node _to_pydot(tree) - graph.write_png(filename) - + return graph + From 3edd27f3df7790d6bf93e88a6e404c41537a743b Mon Sep 17 00:00:00 2001 From: Omega16 <22673084+omega16@users.noreply.github.com> Date: Mon, 24 Aug 2020 16:25:12 -0500 Subject: [PATCH 159/164] Added mention to lark output as dot or png --- 
README.md | 2 +- docs/features.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8bc45f5..ce266d3 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ Lark is great at handling ambiguity. Here is the result of parsing the phrase "f ![fruitflies.png](examples/fruitflies.png) -See the code and more [examples here](https://github.com/lark-parser/lark/tree/master/examples) +See the code to make [this image](https://github.com/lark-parser/lark/tree/master/examples/fruitflies.py) and more [examples here](https://github.com/lark-parser/lark/tree/master/examples) ## List of main features diff --git a/docs/features.md b/docs/features.md index ccaa4cd..fc8ccb6 100644 --- a/docs/features.md +++ b/docs/features.md @@ -26,6 +26,8 @@ - Support for external regex module ([see here](classes.md#using-unicode-character-classes-with-regex)) - Import grammars from Nearley.js ([read more](nearley.md)) - CYK parser + - Transform your parse tree to dot or png files for better visualization ([see_example](https://github.com/lark-parser/lark/blob/master/examples/fruitflies.py)) + ### Experimental features - Automatic reconstruction of input from parse-tree (see examples) From 0b99356a394b750e56e32daa0ebb3d6c6415d84a Mon Sep 17 00:00:00 2001 From: Omega16 <22673084+omega16@users.noreply.github.com> Date: Mon, 24 Aug 2020 16:29:58 -0500 Subject: [PATCH 160/164] added make_dot --- examples/fruitflies.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/fruitflies.py b/examples/fruitflies.py index 697b4ac..aca0b1b 100644 --- a/examples/fruitflies.py +++ b/examples/fruitflies.py @@ -33,9 +33,13 @@ sentence = 'fruit flies like bananas' def make_png(filename): tree.pydot__tree_to_png( parser.parse(sentence), filename) +def make_dot(filename): + tree.pydot__tree_to_dot( parser.parse(sentence), filename) + if __name__ == '__main__': print(parser.parse(sentence).pretty()) # make_png(sys.argv[1]) + # make_dot(sys.argv[1]) # Output: # 
From fe892961937e37e36589cb795b95e1794b8e1521 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 27 Aug 2020 11:02:03 +0300 Subject: [PATCH 161/164] Improved load_grammar's error messages, and added tests --- lark/load_grammar.py | 28 ++++++++++++++++------------ lark/parsers/grammar_analysis.py | 2 +- tests/__main__.py | 1 + tests/test_grammar.py | 31 +++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 13 deletions(-) create mode 100644 tests/test_grammar.py diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 7e83e59..8b2eaa8 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -789,6 +789,20 @@ def _find_used_symbols(tree): for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))} class GrammarLoader: + ERRORS = { + 'Unclosed parenthesis': ['a: (\n'], + 'Umatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'], + 'Expecting rule or terminal definition (missing colon)': ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n'], + 'Illegal name for rules or terminals': ['Aa:\n'], + 'Alias expects lowercase name': ['a: -> "a"\n'], + 'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'], + 'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'], + 'Expecting option ("|") or a new rule or terminal definition': ['a:a\n()\n'], + 'Terminal names cannot contain dots': ['A.B\n'], + '%import expects a name': ['%import "a"\n'], + '%ignore expects a value': ['%ignore %import\n'], + } + def __init__(self, re_module): terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] @@ -814,19 +828,9 @@ class GrammarLoader: (e.line, e.column, grammar_name, context)) except UnexpectedToken as e: context = e.get_context(grammar_text) - error = e.match_examples(self.parser.parse, { - 'Unclosed parenthesis': ['a: (\n'], - 'Umatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'], - 'Expecting rule or terminal definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'], - 'Alias 
expects lowercase name': ['a: -> "a"\n'], - 'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'], - 'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'], - 'Expecting option ("|") or a new rule or terminal definition': ['a:a\n()\n'], - '%import expects a name': ['%import "a"\n'], - '%ignore expects a value': ['%ignore %import\n'], - }) + error = e.match_examples(self.parser.parse, self.ERRORS, use_accepts=True) if error: - raise GrammarError("%s at line %s column %s\n\n%s" % (error, e.line, e.column, context)) + raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context)) elif 'STRING' in e.expected: raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context)) raise diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 94c32cc..737cb02 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -138,7 +138,7 @@ class GrammarAnalyzer(object): for r in rules: for sym in r.expansion: if not (sym.is_term or sym in self.rules_by_origin): - raise GrammarError("Using an undefined rule: %s" % sym) # TODO test validation + raise GrammarError("Using an undefined rule: %s" % sym) self.start_states = {start: self.expand_rule(root_rule.origin) for start, root_rule in root_rules.items()} diff --git a/tests/__main__.py b/tests/__main__.py index 9ef9f1b..5ec89e3 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -7,6 +7,7 @@ from lark import logger from .test_trees import TestTrees from .test_tools import TestStandalone from .test_cache import TestCache +from .test_grammar import TestGrammar from .test_reconstructor import TestReconstructor try: diff --git a/tests/test_grammar.py b/tests/test_grammar.py new file mode 100644 index 0000000..88c8e22 --- /dev/null +++ b/tests/test_grammar.py @@ -0,0 +1,31 @@ +from __future__ import absolute_import + +import sys +from unittest import TestCase, main + +from lark 
import Lark +from lark.load_grammar import GrammarLoader, GrammarError + + +class TestGrammar(TestCase): + def setUp(self): + pass + + def test_errors(self): + for msg, examples in GrammarLoader.ERRORS.items(): + for example in examples: + try: + p = Lark(example) + except GrammarError as e: + assert msg in str(e) + else: + assert False, "example did not raise an error" + + + + +if __name__ == '__main__': + main() + + + From 078fa150be9b53ce0d8b18a93a49e4f9c4f9adcc Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 27 Aug 2020 11:22:42 +0300 Subject: [PATCH 162/164] Adjusted features.md and README.md --- README.md | 2 +- docs/features.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ce266d3..4a67283 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ Lark is great at handling ambiguity. Here is the result of parsing the phrase "f ![fruitflies.png](examples/fruitflies.png) -See the code to make [this image](https://github.com/lark-parser/lark/tree/master/examples/fruitflies.py) and more [examples here](https://github.com/lark-parser/lark/tree/master/examples) +[Read the code here](https://github.com/lark-parser/lark/tree/master/examples/fruitflies.py), and [more examples here](https://github.com/lark-parser/lark/tree/master/examples) ## List of main features diff --git a/docs/features.md b/docs/features.md index fc8ccb6..cb711b3 100644 --- a/docs/features.md +++ b/docs/features.md @@ -26,7 +26,7 @@ - Support for external regex module ([see here](classes.md#using-unicode-character-classes-with-regex)) - Import grammars from Nearley.js ([read more](nearley.md)) - CYK parser - - Transform your parse tree to dot or png files for better visualization ([see_example](https://github.com/lark-parser/lark/blob/master/examples/fruitflies.py)) + - Visualize your parse trees as dot or png files ([see_example](https://github.com/lark-parser/lark/blob/master/examples/fruitflies.py)) ### Experimental features From 
6cd706279a21df7fd07b0c02c97421342c03933a Mon Sep 17 00:00:00 2001 From: Jonah Yolles-Murphy Date: Fri, 28 Aug 2020 14:43:44 -0400 Subject: [PATCH 163/164] make Trees and Tokens' reprs' evalable --- lark/lexer.py | 2 +- lark/tree.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 32bfe78..1905047 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -124,7 +124,7 @@ class Token(Str): return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, )) def __repr__(self): - return 'Token(%s, %r)' % (self.type, self.value) + return 'Token(%r, %r)' % (self.type, self.value) def __deepcopy__(self, memo): return Token(self.type, self.value, self.pos_in_stream, self.line, self.column) diff --git a/lark/tree.py b/lark/tree.py index 84c996a..90c7608 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -25,7 +25,7 @@ class Tree(object): return self._meta def __repr__(self): - return 'Tree(%s, %s)' % (self.data, self.children) + return 'Tree(%r, %r)' % (self.data, self.children) def _pretty_label(self): return self.data @@ -172,4 +172,3 @@ def pydot__tree_to_png(tree, filename, rankdir="LR", **kwargs): _to_pydot(tree) graph.write_png(filename) - From 968d1652d8822a8c6346c02468126c9f64fb138d Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 30 Aug 2020 13:42:15 +0300 Subject: [PATCH 164/164] Fixed issue in grammar error-reporting due to unordered dict (changed it to list) --- lark/load_grammar.py | 26 +++++++++++++------------- tests/test_grammar.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 8b2eaa8..cd36e4b 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -789,19 +789,19 @@ def _find_used_symbols(tree): for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))} class GrammarLoader: - ERRORS = { - 'Unclosed parenthesis': ['a: (\n'], - 'Umatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'], - 
'Expecting rule or terminal definition (missing colon)': ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n'], - 'Illegal name for rules or terminals': ['Aa:\n'], - 'Alias expects lowercase name': ['a: -> "a"\n'], - 'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'], - 'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'], - 'Expecting option ("|") or a new rule or terminal definition': ['a:a\n()\n'], - 'Terminal names cannot contain dots': ['A.B\n'], - '%import expects a name': ['%import "a"\n'], - '%ignore expects a value': ['%ignore %import\n'], - } + ERRORS = [ + ('Unclosed parenthesis', ['a: (\n']), + ('Umatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']), + ('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']), + ('Illegal name for rules or terminals', ['Aa:\n']), + ('Alias expects lowercase name', ['a: -> "a"\n']), + ('Unexpected colon', ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n']), + ('Misplaced operator', ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n']), + ('Expecting option ("|") or a new rule or terminal definition', ['a:a\n()\n']), + ('Terminal names cannot contain dots', ['A.B\n']), + ('%import expects a name', ['%import "a"\n']), + ('%ignore expects a value', ['%ignore %import\n']), + ] def __init__(self, re_module): terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 88c8e22..363f897 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -12,7 +12,7 @@ class TestGrammar(TestCase): pass def test_errors(self): - for msg, examples in GrammarLoader.ERRORS.items(): + for msg, examples in GrammarLoader.ERRORS: for example in examples: try: p = Lark(example)