@@ -128,6 +128,7 @@ Useful for caching and multiprocessing.
 - **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
 - **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
 - **edit_terminals** - A callback
+- **use_bytes** - Accept and parse an input of type `bytes` instead of `str`. Grammar should still be specified as `str`, and terminal values are assumed to be `latin-1`.

 #### Using Unicode character classes with `regex`
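As a quick orientation for the new option, here is a minimal usage sketch (the grammar below is illustrative and not part of this change): the grammar is still written as an ASCII-only `str`, while `parse()` receives `bytes` and matching happens on the raw bytes.

```python
from lark import Lark

# Minimal sketch of use_bytes with a made-up grammar; the grammar stays a str,
# the input to parse() is bytes.
parser = Lark(r'''
    start: WORD+
    WORD: /[a-z]+/
    %ignore " "
''', parser='lalr', use_bytes=True)

tree = parser.parse(b"hello world")
assert tree.data == 'start' and len(tree.children) == 2  # two WORD tokens
```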
@@ -31,10 +31,12 @@ class LarkOptions:
     lexer_callbacks: Dict[str, Callable[[Token], Token]]
     cache: Union[bool, str]
     g_regex_flags: int
+    use_bytes: bool


 class Lark:
     source: str
+    grammar_source: str
     options: LarkOptions
     lexer: Lexer
     terminals: List[TerminalDef]

@@ -56,7 +58,8 @@ class Lark:
         maybe_placeholders: bool = False,
         lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None,
         cache: Union[bool, str] = False,
-        g_regex_flags: int = ...
+        g_regex_flags: int = ...,
+        use_bytes: bool = False,
     ):
         ...

@@ -4,10 +4,10 @@ from .lexer import TerminalDef

 ###{standalone

 class LexerConf(Serialize):
-    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags'
+    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
     __serialize_namespace__ = TerminalDef,

-    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False):
+    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
         self.tokens = tokens  # TODO should be terminals
         self.ignore = ignore
         self.postlex = postlex

@@ -15,6 +15,7 @@ class LexerConf(Serialize):
         self.g_regex_flags = g_regex_flags
         self.re_module = re_module
         self.skip_validation = skip_validation
+        self.use_bytes = use_bytes

     def _deserialize(self):
         self.callbacks = {}  # TODO

@@ -28,9 +28,14 @@ class UnexpectedInput(LarkError):
         pos = self.pos_in_stream
         start = max(pos - span, 0)
         end = pos + span
-        before = text[start:pos].rsplit('\n', 1)[-1]
-        after = text[pos:end].split('\n', 1)[0]
-        return before + after + '\n' + ' ' * len(before) + '^\n'
+        if not isinstance(text, bytes):
+            before = text[start:pos].rsplit('\n', 1)[-1]
+            after = text[pos:end].split('\n', 1)[0]
+            return before + after + '\n' + ' ' * len(before) + '^\n'
+        else:
+            before = text[start:pos].rsplit(b'\n', 1)[-1]
+            after = text[pos:end].split(b'\n', 1)[0]
+            return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace")

     def match_examples(self, parse_fn, examples, token_type_match_fallback=False):
         """ Given a parser instance and a dictionary mapping some label with
@@ -67,7 +72,11 @@ class UnexpectedInput(LarkError):

 class UnexpectedCharacters(LexError, UnexpectedInput):
     def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
-        message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)
+        if isinstance(seq, bytes):
+            message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column)
+        else:
+            message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)

         self.line = line
         self.column = column

@@ -4,7 +4,7 @@ import sys, os, pickle, hashlib, logging
 from io import open

-from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS
+from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii
 from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf

@@ -82,6 +82,7 @@ class LarkOptions(Serialize):
                  invert (Default: auto)
     lexer_callbacks - Dictionary of callbacks for the lexer. May alter
                  tokens during lexing. Use with caution.
+    use_bytes - Accept an input of type `bytes` instead of `str` (Python 3 only).
     edit_terminals - A callback
     """
     if __doc__:

@@ -105,6 +106,7 @@ class LarkOptions(Serialize):
         'maybe_placeholders': False,
         'edit_terminals': None,
         'g_regex_flags': 0,
+        'use_bytes': False,
     }

     def __init__(self, options_dict):

@@ -114,7 +116,7 @@ class LarkOptions(Serialize):
         for name, default in self._defaults.items():
             if name in o:
                 value = o.pop(name)
-                if isinstance(default, bool) and name != 'cache':
+                if isinstance(default, bool) and name not in ('cache', 'use_bytes'):
                     value = bool(value)
             else:
                 value = default
@@ -187,6 +189,13 @@ class Lark(Serialize):
             grammar = read()

         assert isinstance(grammar, STRING_TYPE)
+        self.grammar_source = grammar
+        if self.options.use_bytes:
+            if not isascii(grammar):
+                raise ValueError("Grammar must be ascii only, when use_bytes=True")
+            if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
+                raise NotImplementedError("`use_bytes=True` may have issues on python2. "
+                                          "Use `use_bytes='force'` to use it at your own risk.")

         cache_fn = None
         if self.options.cache:
@@ -196,7 +205,7 @@ class Lark(Serialize):
                 cache_fn = self.options.cache
             else:
                 if self.options.cache is not True:
-                    raise ValueError("cache must be bool or str")
+                    raise ValueError("cache argument must be bool or str")

                 unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
                 from . import __version__
                 options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)

@@ -252,7 +261,7 @@ class Lark(Serialize):
             for t in self.terminals:
                 self.options.edit_terminals(t)

-        self._terminals_dict = {t.name:t for t in self.terminals}
+        self._terminals_dict = {t.name: t for t in self.terminals}

         # If the user asked to invert the priorities, negate them all here.
         # This replaces the old 'resolve__antiscore_sum' option.

@@ -276,7 +285,7 @@ class Lark(Serialize):
                 if hasattr(t, term.name):
                     lexer_callbacks[term.name] = getattr(t, term.name)

-        self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)
+        self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes)

         if self.options.parser:
             self.parser = self._build_parser()

@@ -139,8 +139,8 @@ class Token(Str):

 class LineCounter:
-    def __init__(self):
-        self.newline_char = '\n'
+    def __init__(self, newline_char):
+        self.newline_char = newline_char
         self.char_pos = 0
         self.line = 1
         self.column = 1

@@ -169,7 +169,7 @@ class _Lex:
     def lex(self, stream, newline_types, ignore_types):
         newline_types = frozenset(newline_types)
         ignore_types = frozenset(ignore_types)
-        line_ctr = LineCounter()
+        line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
         last_token = None

         while line_ctr.char_pos < len(stream):

@@ -230,7 +230,7 @@ class CallChain:

-def _create_unless(terminals, g_regex_flags, re_):
+def _create_unless(terminals, g_regex_flags, re_, use_bytes):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()

@@ -247,31 +247,34 @@ def _create_unless(terminals, g_regex_flags, re_):
             if strtok.pattern.flags <= retok.pattern.flags:
                 embedded_strs.add(strtok)
         if unless:
-            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True))
+            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))

     terminals = [t for t in terminals if t not in embedded_strs]
     return terminals, callback


-def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_):
+def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
     # Python sets an unreasonable group limit (currently 100) in its re module
     # Worse, the only way to know we reached it is by catching an AssertionError!
     # This function recursively tries less and less groups until it's successful.
     postfix = '$' if match_whole else ''
     mres = []
     while terminals:
+        pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
+        if use_bytes:
+            pattern = pattern.encode('latin-1')
         try:
-            mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
+            mre = re_.compile(pattern, g_regex_flags)
         except AssertionError:  # Yes, this is what Python provides us.. :/
-            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_)
+            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)

         # terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
         terminals = terminals[max_size:]
     return mres


-def build_mres(terminals, g_regex_flags, re_, match_whole=False):
-    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_)
+def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False):
+    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes)


 def _regexp_has_newline(r):
     r"""Expressions that may indicate newlines in a regexp:
@@ -321,12 +324,13 @@ class TraditionalLexer(Lexer):
         self.terminals = terminals
         self.user_callbacks = conf.callbacks
         self.g_regex_flags = conf.g_regex_flags
+        self.use_bytes = conf.use_bytes

         self._mres = None
         # self.build(g_regex_flags)

     def _build(self):
-        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re)
+        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes)
         assert all(self.callback.values())

         for type_, f in self.user_callbacks.items():

@@ -336,7 +340,7 @@ class TraditionalLexer(Lexer):
             else:
                 self.callback[type_] = f

-        self._mres = build_mres(terminals, self.g_regex_flags, self.re)
+        self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes)

     @property
     def mres(self):

@@ -365,7 +369,8 @@ class ContextualLexer(Lexer):
             assert t.name not in tokens_by_name, t
             tokens_by_name[t.name] = t

-        trad_conf = type(conf)(terminals, conf.re_module, conf.ignore, callbacks=conf.callbacks, g_regex_flags=conf.g_regex_flags, skip_validation=conf.skip_validation)
+        trad_conf = copy(conf)
+        trad_conf.tokens = terminals

         lexer_by_tokens = {}
         self.lexers = {}

@@ -189,6 +189,8 @@ class XEarley(_ParserFrontend):
                 else:
                     if width == 0:
                         raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
+                if lexer_conf.use_bytes:
+                    regexp = regexp.encode('utf-8')

                 self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)
@@ -305,4 +305,17 @@ def combine_alternatives(lists):

 class FS:
     open = open
-    exists = os.path.exists
+    exists = os.path.exists
+
+
+def isascii(s):
+    """ str.isascii only exists in python3.7+ """
+    try:
+        return s.isascii()
+    except AttributeError:
+        try:
+            s.encode('ascii')
+            return True
+        except (UnicodeDecodeError, UnicodeEncodeError):
+            return False
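For clarity, the compatibility helper above is meant to behave like `str.isascii()` everywhere: Python 3.7+ takes the built-in fast path, older versions fall back to attempting an ASCII encode. A couple of illustrative checks:

```python
from lark.utils import isascii  # available after this change

assert isascii(u"start: WORD+")   # plain ASCII grammar text
assert not isascii(u"日本語")      # non-ASCII text is rejected
```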
@@ -8,7 +8,9 @@ import os
 import sys
 from copy import copy, deepcopy

-from lark.utils import Py36
+from lark.utils import Py36, isascii
+
+from lark import Token

 try:
     from cStringIO import StringIO as cStringIO
@@ -561,12 +563,84 @@ class CustomLexer(Lexer):
     def lex(self, *args, **kwargs):
         return self.lexer.lex(*args, **kwargs)


+def _tree_structure_check(a, b):
+    """
+    Checks that both Tree objects have the same structure, without checking their values.
+    """
+    assert a.data == b.data and len(a.children) == len(b.children)
+    for ca, cb in zip(a.children, b.children):
+        assert type(ca) == type(cb)
+        if isinstance(ca, Tree):
+            _tree_structure_check(ca, cb)
+        elif isinstance(ca, Token):
+            assert ca.type == cb.type
+        else:
+            assert ca == cb
+
+
+class DualBytesLark:
+    """
+    A helper class that wraps both a normal parser and a parser for bytes.
+    It automatically forwards `.parse` calls to both lexers, returning the value from the text lexer.
+    It always checks that both produce the same output/error.
+
+    NOTE: Not currently used, but left here for future debugging.
+    """
+
+    def __init__(self, g, *args, **kwargs):
+        self.text_lexer = Lark(g, *args, use_bytes=False, **kwargs)
+        g = self.text_lexer.grammar_source.lower()
+        if '\\u' in g or not isascii(g):
+            # Bytes re can't deal with unicode escapes
+            self.bytes_lark = None
+        else:
+            # Everything here should work, so use `use_bytes='force'`
+            self.bytes_lark = Lark(self.text_lexer.grammar_source, *args, use_bytes='force', **kwargs)
+
+    def parse(self, text, start=None):
+        # TODO: Easy workaround, more complex checks would be beneficial
+        if not isascii(text) or self.bytes_lark is None:
+            return self.text_lexer.parse(text, start)
+        try:
+            rv = self.text_lexer.parse(text, start)
+        except Exception as e:
+            try:
+                self.bytes_lark.parse(text.encode(), start)
+            except Exception as be:
+                assert type(e) == type(be), "Parser with and without `use_bytes` raise different exceptions"
+                raise e
+            assert False, "Parser without `use_bytes` raises exception, with doesn't"
+        try:
+            bv = self.bytes_lark.parse(text.encode(), start)
+        except Exception as be:
+            assert False, "Parser without `use_bytes` doesn't raise an exception, with does"
+        _tree_structure_check(rv, bv)
+        return rv
+
+    @classmethod
+    def open(cls, grammar_filename, rel_to=None, **options):
+        if rel_to:
+            basepath = os.path.dirname(rel_to)
+            grammar_filename = os.path.join(basepath, grammar_filename)
+        with open(grammar_filename, encoding='utf8') as f:
+            return cls(f, **options)
+
+    def save(self, f):
+        self.text_lexer.save(f)
+        if self.bytes_lark is not None:
+            self.bytes_lark.save(f)
+
+    def load(self, f):
+        self.text_lexer = self.text_lexer.load(f)
+        if self.bytes_lark is not None:
+            self.bytes_lark.load(f)
+
+
 def _make_parser_test(LEXER, PARSER):
     lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
     def _Lark(grammar, **kwargs):
         return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
     def _Lark_open(gfilename, **kwargs):
         return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)

     class _TestParser(unittest.TestCase):
         def test_basic1(self):
             g = _Lark("""start: a+ b a* "b" a*
@@ -647,6 +721,28 @@ def _make_parser_test(LEXER, PARSER):
                     """)
             g.parse('\x01\x02\x03')

+        @unittest.skipIf(sys.version_info[:2]==(2, 7), "bytes parser isn't perfect in Python2.7, exceptions don't work correctly")
+        def test_bytes_utf8(self):
+            g = r"""
+            start: BOM? char+
+            BOM: "\xef\xbb\xbf"
+            char: CHAR1 | CHAR2 | CHAR3 | CHAR4
+            CONTINUATION_BYTE: "\x80" .. "\xbf"
+            CHAR1: "\x00" .. "\x7f"
+            CHAR2: "\xc0" .. "\xdf" CONTINUATION_BYTE
+            CHAR3: "\xe0" .. "\xef" CONTINUATION_BYTE CONTINUATION_BYTE
+            CHAR4: "\xf0" .. "\xf7" CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE
+            """
+            g = _Lark(g, use_bytes=True)
+            s = u"🔣 地? gurīn".encode('utf-8')
+            self.assertEqual(len(g.parse(s).children), 10)
+
+            for enc, j in [("sjis", u"地球の絵はグリーンでグッド? Chikyuu no e wa guriin de guddo"),
+                           ("sjis", u"売春婦"),
+                           ("euc-jp", u"乂鵬鵠")]:
+                s = j.encode(enc)
+                self.assertRaises(UnexpectedCharacters, g.parse, s)
+
         @unittest.skipIf(PARSER == 'cyk', "Takes forever")
         def test_stack_for_ebnf(self):
             """Verify that stack depth isn't an issue for EBNF grammars"""
| @unittest.skipIf(PARSER == 'cyk', "Takes forever") | @unittest.skipIf(PARSER == 'cyk', "Takes forever") | ||||
| def test_stack_for_ebnf(self): | def test_stack_for_ebnf(self): | ||||
| """Verify that stack depth isn't an issue for EBNF grammars""" | """Verify that stack depth isn't an issue for EBNF grammars""" | ||||
| @@ -1065,7 +1161,7 @@ def _make_parser_test(LEXER, PARSER): | |||||
| g = _Lark(g) | g = _Lark(g) | ||||
| self.assertEqual( g.parse('"hello"').children, ['"hello"']) | self.assertEqual( g.parse('"hello"').children, ['"hello"']) | ||||
| self.assertEqual( g.parse("'hello'").children, ["'hello'"]) | self.assertEqual( g.parse("'hello'").children, ["'hello'"]) | ||||
| @unittest.skipIf(not Py36, "Required re syntax only exists in python3.6+") | @unittest.skipIf(not Py36, "Required re syntax only exists in python3.6+") | ||||
| def test_join_regex_flags(self): | def test_join_regex_flags(self): | ||||
| g = r""" | g = r""" | ||||
| @@ -1078,7 +1174,7 @@ def _make_parser_test(LEXER, PARSER): | |||||
| self.assertEqual(g.parse(" ").children,[" "]) | self.assertEqual(g.parse(" ").children,[" "]) | ||||
| self.assertEqual(g.parse("\n ").children,["\n "]) | self.assertEqual(g.parse("\n ").children,["\n "]) | ||||
| self.assertRaises(UnexpectedCharacters, g.parse, "\n\n") | self.assertRaises(UnexpectedCharacters, g.parse, "\n\n") | ||||
| g = r""" | g = r""" | ||||
| start: A | start: A | ||||
| A: B | C | A: B | C | ||||