@@ -128,6 +128,7 @@ Useful for caching and multiprocessing.
 - **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
 - **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
 - **edit_terminals** - A callback
+- **use_bytes** - Accept and parse an input of type `bytes` instead of `str`. Grammar should still be specified as `str`, and terminal values are assumed to be `latin-1`.

 #### Using Unicode character classes with `regex`
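As a quick orientation for the new option, here is a minimal usage sketch (the grammar below is illustrative and not part of this change): the grammar is still written as an ASCII-only `str`, while `parse()` receives `bytes` and matching happens on the raw bytes.

```python
from lark import Lark

# Minimal sketch of use_bytes with a made-up grammar; the grammar stays a str,
# the input to parse() is bytes.
parser = Lark(r'''
    start: WORD+
    WORD: /[a-z]+/
    %ignore " "
''', parser='lalr', use_bytes=True)

tree = parser.parse(b"hello world")
assert tree.data == 'start' and len(tree.children) == 2  # two WORD tokens
```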
@@ -31,10 +31,12 @@ class LarkOptions:
     lexer_callbacks: Dict[str, Callable[[Token], Token]]
     cache: Union[bool, str]
     g_regex_flags: int
+    use_bytes: bool


 class Lark:
     source: str
+    grammar_source: str
     options: LarkOptions
     lexer: Lexer
     terminals: List[TerminalDef]

@@ -56,7 +58,8 @@ class Lark:
         maybe_placeholders: bool = False,
         lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None,
         cache: Union[bool, str] = False,
-        g_regex_flags: int = ...
+        g_regex_flags: int = ...,
+        use_bytes: bool = False,
     ):
         ...

@@ -4,10 +4,10 @@ from .lexer import TerminalDef

 ###{standalone

 class LexerConf(Serialize):
-    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags'
+    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
     __serialize_namespace__ = TerminalDef,

-    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False):
+    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
         self.tokens = tokens  # TODO should be terminals
         self.ignore = ignore
         self.postlex = postlex

@@ -15,6 +15,7 @@ class LexerConf(Serialize):
         self.g_regex_flags = g_regex_flags
         self.re_module = re_module
         self.skip_validation = skip_validation
+        self.use_bytes = use_bytes

     def _deserialize(self):
         self.callbacks = {}  # TODO

@@ -28,9 +28,14 @@ class UnexpectedInput(LarkError):
         pos = self.pos_in_stream
         start = max(pos - span, 0)
         end = pos + span
-        before = text[start:pos].rsplit('\n', 1)[-1]
-        after = text[pos:end].split('\n', 1)[0]
-        return before + after + '\n' + ' ' * len(before) + '^\n'
+        if not isinstance(text, bytes):
+            before = text[start:pos].rsplit('\n', 1)[-1]
+            after = text[pos:end].split('\n', 1)[0]
+            return before + after + '\n' + ' ' * len(before) + '^\n'
+        else:
+            before = text[start:pos].rsplit(b'\n', 1)[-1]
+            after = text[pos:end].split(b'\n', 1)[0]
+            return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace")

     def match_examples(self, parse_fn, examples, token_type_match_fallback=False):
         """ Given a parser instance and a dictionary mapping some label with
@@ -67,7 +72,11 @@ class UnexpectedInput(LarkError):

 class UnexpectedCharacters(LexError, UnexpectedInput):
     def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
-        message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)
+        if isinstance(seq, bytes):
+            message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column)
+        else:
+            message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)

         self.line = line
         self.column = column

@@ -4,7 +4,7 @@ import sys, os, pickle, hashlib, logging
 from io import open

-from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS
+from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii
 from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf

@@ -82,6 +82,7 @@ class LarkOptions(Serialize):
                  invert (Default: auto)
     lexer_callbacks - Dictionary of callbacks for the lexer. May alter
                  tokens during lexing. Use with caution.
+    use_bytes - Accept an input of type `bytes` instead of `str` (Python 3 only).
     edit_terminals - A callback
     """
     if __doc__:

@@ -105,6 +106,7 @@ class LarkOptions(Serialize):
         'maybe_placeholders': False,
         'edit_terminals': None,
         'g_regex_flags': 0,
+        'use_bytes': False,
     }

     def __init__(self, options_dict):

@@ -114,7 +116,7 @@ class LarkOptions(Serialize):
         for name, default in self._defaults.items():
             if name in o:
                 value = o.pop(name)
-                if isinstance(default, bool) and name != 'cache':
+                if isinstance(default, bool) and name not in ('cache', 'use_bytes'):
                     value = bool(value)
             else:
                 value = default
@@ -187,6 +189,13 @@ class Lark(Serialize):
             grammar = read()

         assert isinstance(grammar, STRING_TYPE)
+        self.grammar_source = grammar
+        if self.options.use_bytes:
+            if not isascii(grammar):
+                raise ValueError("Grammar must be ascii only, when use_bytes=True")
+            if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
+                raise NotImplementedError("`use_bytes=True` may have issues on python2. "
+                                          "Use `use_bytes='force'` to use it at your own risk.")

         cache_fn = None
         if self.options.cache:
@@ -196,7 +205,7 @@ class Lark(Serialize):
                 cache_fn = self.options.cache
             else:
                 if self.options.cache is not True:
-                    raise ValueError("cache must be bool or str")
+                    raise ValueError("cache argument must be bool or str")

                 unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
                 from . import __version__
                 options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)

@@ -252,7 +261,7 @@ class Lark(Serialize):
             for t in self.terminals:
                 self.options.edit_terminals(t)

-        self._terminals_dict = {t.name:t for t in self.terminals}
+        self._terminals_dict = {t.name: t for t in self.terminals}

         # If the user asked to invert the priorities, negate them all here.
         # This replaces the old 'resolve__antiscore_sum' option.

@@ -276,7 +285,7 @@ class Lark(Serialize):
                 if hasattr(t, term.name):
                     lexer_callbacks[term.name] = getattr(t, term.name)

-        self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)
+        self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes)

         if self.options.parser:
             self.parser = self._build_parser()

@@ -139,8 +139,8 @@ class Token(Str):

 class LineCounter:
-    def __init__(self):
-        self.newline_char = '\n'
+    def __init__(self, newline_char):
+        self.newline_char = newline_char
         self.char_pos = 0
         self.line = 1
         self.column = 1

@@ -169,7 +169,7 @@ class _Lex:
     def lex(self, stream, newline_types, ignore_types):
         newline_types = frozenset(newline_types)
         ignore_types = frozenset(ignore_types)
-        line_ctr = LineCounter()
+        line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
         last_token = None

         while line_ctr.char_pos < len(stream):

@@ -230,7 +230,7 @@ class CallChain:

-def _create_unless(terminals, g_regex_flags, re_):
+def _create_unless(terminals, g_regex_flags, re_, use_bytes):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()

@@ -247,31 +247,34 @@ def _create_unless(terminals, g_regex_flags, re_):
             if strtok.pattern.flags <= retok.pattern.flags:
                 embedded_strs.add(strtok)
         if unless:
-            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True))
+            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))

     terminals = [t for t in terminals if t not in embedded_strs]
     return terminals, callback


-def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_):
+def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
     # Python sets an unreasonable group limit (currently 100) in its re module
     # Worse, the only way to know we reached it is by catching an AssertionError!
     # This function recursively tries less and less groups until it's successful.
     postfix = '$' if match_whole else ''
     mres = []
     while terminals:
+        pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
+        if use_bytes:
+            pattern = pattern.encode('latin-1')
         try:
-            mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
+            mre = re_.compile(pattern, g_regex_flags)
         except AssertionError:  # Yes, this is what Python provides us.. :/
-            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_)
+            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)

         # terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
         terminals = terminals[max_size:]
     return mres


-def build_mres(terminals, g_regex_flags, re_, match_whole=False):
-    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_)
+def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False):
+    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes)


 def _regexp_has_newline(r):
     r"""Expressions that may indicate newlines in a regexp:
@@ -321,12 +324,13 @@ class TraditionalLexer(Lexer):
         self.terminals = terminals
         self.user_callbacks = conf.callbacks
         self.g_regex_flags = conf.g_regex_flags
+        self.use_bytes = conf.use_bytes

         self._mres = None
         # self.build(g_regex_flags)

     def _build(self):
-        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re)
+        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes)
         assert all(self.callback.values())

         for type_, f in self.user_callbacks.items():

@@ -336,7 +340,7 @@ class TraditionalLexer(Lexer):
             else:
                 self.callback[type_] = f

-        self._mres = build_mres(terminals, self.g_regex_flags, self.re)
+        self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes)

     @property
     def mres(self):

@@ -365,7 +369,8 @@ class ContextualLexer(Lexer):
             assert t.name not in tokens_by_name, t
             tokens_by_name[t.name] = t

-        trad_conf = type(conf)(terminals, conf.re_module, conf.ignore, callbacks=conf.callbacks, g_regex_flags=conf.g_regex_flags, skip_validation=conf.skip_validation)
+        trad_conf = copy(conf)
+        trad_conf.tokens = terminals

         lexer_by_tokens = {}
         self.lexers = {}

@@ -189,6 +189,8 @@ class XEarley(_ParserFrontend):
                 else:
                     if width == 0:
                         raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
+                if lexer_conf.use_bytes:
+                    regexp = regexp.encode('utf-8')

                 self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)
@@ -305,4 +305,17 @@ def combine_alternatives(lists):

 class FS:
     open = open
-    exists = os.path.exists
+    exists = os.path.exists
+
+
+def isascii(s):
+    """ str.isascii only exists in python3.7+ """
+    try:
+        return s.isascii()
+    except AttributeError:
+        try:
+            s.encode('ascii')
+            return True
+        except (UnicodeDecodeError, UnicodeEncodeError):
+            return False
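For clarity, the compatibility helper above is meant to behave like `str.isascii()` everywhere: Python 3.7+ takes the built-in fast path, older versions fall back to attempting an ASCII encode. A couple of illustrative checks:

```python
from lark.utils import isascii  # available after this change

assert isascii(u"start: WORD+")   # plain ASCII grammar text
assert not isascii(u"日本語")      # non-ASCII text is rejected
```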
@@ -8,7 +8,9 @@ import os
 import sys
 from copy import copy, deepcopy

-from lark.utils import Py36
+from lark.utils import Py36, isascii
+
+from lark import Token

 try:
     from cStringIO import StringIO as cStringIO
@@ -561,12 +563,84 @@ class CustomLexer(Lexer):
     def lex(self, *args, **kwargs):
         return self.lexer.lex(*args, **kwargs)


+def _tree_structure_check(a, b):
+    """
+    Checks that both Tree objects have the same structure, without checking their values.
+    """
+    assert a.data == b.data and len(a.children) == len(b.children)
+    for ca, cb in zip(a.children, b.children):
+        assert type(ca) == type(cb)
+        if isinstance(ca, Tree):
+            _tree_structure_check(ca, cb)
+        elif isinstance(ca, Token):
+            assert ca.type == cb.type
+        else:
+            assert ca == cb
+
+
+class DualBytesLark:
+    """
+    A helper class that wraps both a normal parser and a parser for bytes.
+    It automatically forwards `.parse` calls to both lexers, returning the value from the text lexer.
+    It always checks that both produce the same output/error.
+
+    NOTE: Not currently used, but left here for future debugging.
+    """
+
+    def __init__(self, g, *args, **kwargs):
+        self.text_lexer = Lark(g, *args, use_bytes=False, **kwargs)
+        g = self.text_lexer.grammar_source.lower()
+        if '\\u' in g or not isascii(g):
+            # Bytes re can't deal with unicode escapes
+            self.bytes_lark = None
+        else:
+            # Everything here should work, so use `use_bytes='force'`
+            self.bytes_lark = Lark(self.text_lexer.grammar_source, *args, use_bytes='force', **kwargs)
+
+    def parse(self, text, start=None):
+        # TODO: Easy workaround, more complex checks would be beneficial
+        if not isascii(text) or self.bytes_lark is None:
+            return self.text_lexer.parse(text, start)
+        try:
+            rv = self.text_lexer.parse(text, start)
+        except Exception as e:
+            try:
+                self.bytes_lark.parse(text.encode(), start)
+            except Exception as be:
+                assert type(e) == type(be), "Parser with and without `use_bytes` raise different exceptions"
+                raise e
+            assert False, "Parser without `use_bytes` raises exception, with doesn't"
+        try:
+            bv = self.bytes_lark.parse(text.encode(), start)
+        except Exception as be:
+            assert False, "Parser without `use_bytes` doesn't raise an exception, with does"
+        _tree_structure_check(rv, bv)
+        return rv
+
+    @classmethod
+    def open(cls, grammar_filename, rel_to=None, **options):
+        if rel_to:
+            basepath = os.path.dirname(rel_to)
+            grammar_filename = os.path.join(basepath, grammar_filename)
+        with open(grammar_filename, encoding='utf8') as f:
+            return cls(f, **options)
+
+    def save(self, f):
+        self.text_lexer.save(f)
+        if self.bytes_lark is not None:
+            self.bytes_lark.save(f)
+
+    def load(self, f):
+        self.text_lexer = self.text_lexer.load(f)
+        if self.bytes_lark is not None:
+            self.bytes_lark.load(f)
+
+
 def _make_parser_test(LEXER, PARSER):
     lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
     def _Lark(grammar, **kwargs):
         return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
     def _Lark_open(gfilename, **kwargs):
         return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)

     class _TestParser(unittest.TestCase):
         def test_basic1(self):
             g = _Lark("""start: a+ b a* "b" a*
@@ -647,6 +721,28 @@ def _make_parser_test(LEXER, PARSER):
                     """)
             g.parse('\x01\x02\x03')

+        @unittest.skipIf(sys.version_info[:2]==(2, 7), "bytes parser isn't perfect in Python2.7, exceptions don't work correctly")
+        def test_bytes_utf8(self):
+            g = r"""
+            start: BOM? char+
+            BOM: "\xef\xbb\xbf"
+            char: CHAR1 | CHAR2 | CHAR3 | CHAR4
+            CONTINUATION_BYTE: "\x80" .. "\xbf"
+            CHAR1: "\x00" .. "\x7f"
+            CHAR2: "\xc0" .. "\xdf" CONTINUATION_BYTE
+            CHAR3: "\xe0" .. "\xef" CONTINUATION_BYTE CONTINUATION_BYTE
+            CHAR4: "\xf0" .. "\xf7" CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE
+            """
+            g = _Lark(g, use_bytes=True)
+            s = u"🔣 地? gurīn".encode('utf-8')
+            self.assertEqual(len(g.parse(s).children), 10)
+
+            for enc, j in [("sjis", u"地球の絵はグリーンでグッド? Chikyuu no e wa guriin de guddo"),
+                           ("sjis", u"売春婦"),
+                           ("euc-jp", u"乂鵬鵠")]:
+                s = j.encode(enc)
+                self.assertRaises(UnexpectedCharacters, g.parse, s)
+
         @unittest.skipIf(PARSER == 'cyk', "Takes forever")
         def test_stack_for_ebnf(self):
             """Verify that stack depth isn't an issue for EBNF grammars"""
| @unittest.skipIf(PARSER == 'cyk', "Takes forever") | @unittest.skipIf(PARSER == 'cyk', "Takes forever") | ||||
| def test_stack_for_ebnf(self): | def test_stack_for_ebnf(self): | ||||
| """Verify that stack depth isn't an issue for EBNF grammars""" | """Verify that stack depth isn't an issue for EBNF grammars""" | ||||
| @@ -1065,7 +1161,7 @@ def _make_parser_test(LEXER, PARSER): | |||||
| g = _Lark(g) | g = _Lark(g) | ||||
| self.assertEqual( g.parse('"hello"').children, ['"hello"']) | self.assertEqual( g.parse('"hello"').children, ['"hello"']) | ||||
| self.assertEqual( g.parse("'hello'").children, ["'hello'"]) | self.assertEqual( g.parse("'hello'").children, ["'hello'"]) | ||||
| @unittest.skipIf(not Py36, "Required re syntax only exists in python3.6+") | @unittest.skipIf(not Py36, "Required re syntax only exists in python3.6+") | ||||
| def test_join_regex_flags(self): | def test_join_regex_flags(self): | ||||
| g = r""" | g = r""" | ||||
| @@ -1078,7 +1174,7 @@ def _make_parser_test(LEXER, PARSER): | |||||
| self.assertEqual(g.parse(" ").children,[" "]) | self.assertEqual(g.parse(" ").children,[" "]) | ||||
| self.assertEqual(g.parse("\n ").children,["\n "]) | self.assertEqual(g.parse("\n ").children,["\n "]) | ||||
| self.assertRaises(UnexpectedCharacters, g.parse, "\n\n") | self.assertRaises(UnexpectedCharacters, g.parse, "\n\n") | ||||
| g = r""" | g = r""" | ||||
| start: A | start: A | ||||
| A: B | C | A: B | C | ||||