| @@ -15,7 +15,7 @@ class PostLex(Protocol): | |||||
| def process(self, stream: Iterator[Token]) -> Iterator[Token]: | def process(self, stream: Iterator[Token]) -> Iterator[Token]: | ||||
| ... | ... | ||||
| always_accept: Iterable[str] | always_accept: Iterable[str] | ||||
| @@ -42,12 +42,12 @@ class LarkOptions: | |||||
| class PackageResource(object): | class PackageResource(object): | ||||
| pkg_name: str | pkg_name: str | ||||
| path: str | path: str | ||||
| def __init__(self, pkg_name: str, path: str): ... | def __init__(self, pkg_name: str, path: str): ... | ||||
| class FromPackageLoader: | class FromPackageLoader: | ||||
| def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ... | def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ... | ||||
| def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: ... | def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: ... | ||||
| @@ -88,12 +88,12 @@ class Lark: | |||||
| @classmethod | @classmethod | ||||
| def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T: | def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T: | ||||
| ... | ... | ||||
| @classmethod | @classmethod | ||||
| def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...] = ..., **options) -> _T: | def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...] = ..., **options) -> _T: | ||||
| ... | ... | ||||
| def lex(self, text: str) -> Iterator[Token]: | |||||
| def lex(self, text: str, dont_ignore: bool = False) -> Iterator[Token]: | |||||
| ... | ... | ||||
| def get_terminal(self, name: str) -> TerminalDef: | def get_terminal(self, name: str) -> TerminalDef: | ||||
| @@ -11,7 +11,7 @@ from .load_grammar import load_grammar, FromPackageLoader | |||||
| from .tree import Tree | from .tree import Tree | ||||
| from .common import LexerConf, ParserConf | from .common import LexerConf, ParserConf | ||||
| from .lexer import Lexer, TraditionalLexer, TerminalDef | |||||
| from .lexer import Lexer, TraditionalLexer, TerminalDef, LexerThread | |||||
| from .parse_tree_builder import ParseTreeBuilder | from .parse_tree_builder import ParseTreeBuilder | ||||
| from .parser_frontends import get_frontend, _get_lexer_callbacks | from .parser_frontends import get_frontend, _get_lexer_callbacks | ||||
| from .grammar import Rule | from .grammar import Rule | ||||
| @@ -355,8 +355,13 @@ class Lark(Serialize): | |||||
| __serialize_fields__ = 'parser', 'rules', 'options' | __serialize_fields__ = 'parser', 'rules', 'options' | ||||
def _build_lexer(self, dont_ignore=False):
    """Build a TraditionalLexer from this instance's lexer configuration.

    Parameters:
        dont_ignore (bool): When True, lex with a *copy* of the lexer
            configuration whose ``ignore`` list is emptied, so that tokens
            normally skipped via ``%ignore`` are emitted as well. The copy
            keeps ``self.lexer_conf`` itself unmodified.

    Returns:
        A TraditionalLexer built from the (possibly adjusted) configuration.
    """
    lexer_conf = self.lexer_conf
    if dont_ignore:
        from copy import copy
        # Shallow-copy so clearing `ignore` does not mutate the shared conf.
        lexer_conf = copy(lexer_conf)
        lexer_conf.ignore = ()
    return TraditionalLexer(lexer_conf)
| def _prepare_callbacks(self): | def _prepare_callbacks(self): | ||||
| self.parser_class = get_frontend(self.options.parser, self.options.lexer) | self.parser_class = get_frontend(self.options.parser, self.options.lexer) | ||||
| @@ -419,6 +424,7 @@ class Lark(Serialize): | |||||
| self._callbacks, | self._callbacks, | ||||
| self.options, # Not all, but multiple attributes are used | self.options, # Not all, but multiple attributes are used | ||||
| ) | ) | ||||
| self.lexer_conf = self.parser.lexer_conf | |||||
| self.terminals = self.parser.lexer_conf.terminals | self.terminals = self.parser.lexer_conf.terminals | ||||
| self._terminals_dict = {t.name: t for t in self.terminals} | self._terminals_dict = {t.name: t for t in self.terminals} | ||||
| return self | return self | ||||
| @@ -468,11 +474,17 @@ class Lark(Serialize): | |||||
| return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer) | return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer) | ||||
def lex(self, text, dont_ignore=False):
    """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'

    When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.
    """
    # A dont_ignore lexer must be built fresh each call (its conf differs
    # from the cached one); otherwise reuse the cached lexer if present.
    if not hasattr(self, 'lexer') or dont_ignore:
        lexer = self._build_lexer(dont_ignore)
    else:
        lexer = self.lexer
    lexer_thread = LexerThread(lexer, text)
    stream = lexer_thread.lex(None)
    if self.options.postlex:
        return self.options.postlex.process(stream)
    return stream
| @@ -10,6 +10,7 @@ from copy import copy | |||||
| class Pattern(Serialize): | class Pattern(Serialize): | ||||
| raw = None | |||||
| def __init__(self, value, flags=(), raw=None): | def __init__(self, value, flags=(), raw=None): | ||||
| self.value = value | self.value = value | ||||
| @@ -10,6 +10,7 @@ from .test_cache import TestCache | |||||
| from .test_grammar import TestGrammar | from .test_grammar import TestGrammar | ||||
| from .test_reconstructor import TestReconstructor | from .test_reconstructor import TestReconstructor | ||||
| from .test_tree_forest_transformer import TestTreeForestTransformer | from .test_tree_forest_transformer import TestTreeForestTransformer | ||||
| from .test_lexer import TestLexer | |||||
| try: | try: | ||||
| from .test_nearley.test_nearley import TestNearley | from .test_nearley.test_nearley import TestNearley | ||||
| @@ -0,0 +1,23 @@ | |||||
| from unittest import TestCase, main | |||||
| from lark import Lark, Tree | |||||
class TestLexer(TestCase):
    """Tests for Lark.lex(), including the dont_ignore flag."""

    def setUp(self):
        pass

    def test_basic(self):
        p = Lark("""
            start: "a" "b" "c" "d"
            %ignore " "
        """)

        # By default, %ignore'd terminals (spaces) are dropped.
        res = list(p.lex("abc cba dd"))
        assert res == list('abccbadd')

        # With dont_ignore=True, every token — including spaces — is emitted.
        res = list(p.lex("abc cba dd", dont_ignore=True))
        assert res == list('abc cba dd')


if __name__ == '__main__':
    main()