Browse Source

Fixed Lark.lex(), added dont_ignore option, added tests for it.

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.2
Erez Sh 5 years ago
parent
commit
a7c71f32cc
5 changed files with 50 additions and 13 deletions
  1. +5
    -5
      lark-stubs/lark.pyi
  2. +20
    -8
      lark/lark.py
  3. +1
    -0
      lark/lexer.py
  4. +1
    -0
      tests/__main__.py
  5. +23
    -0
      tests/test_lexer.py

+ 5
- 5
lark-stubs/lark.pyi View File

@@ -15,7 +15,7 @@ class PostLex(Protocol):


def process(self, stream: Iterator[Token]) -> Iterator[Token]: def process(self, stream: Iterator[Token]) -> Iterator[Token]:
... ...
always_accept: Iterable[str] always_accept: Iterable[str]




@@ -42,12 +42,12 @@ class LarkOptions:
class PackageResource(object): class PackageResource(object):
pkg_name: str pkg_name: str
path: str path: str
def __init__(self, pkg_name: str, path: str): ... def __init__(self, pkg_name: str, path: str): ...


class FromPackageLoader: class FromPackageLoader:
def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ... def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ...
def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: ... def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: ...




@@ -88,12 +88,12 @@ class Lark:
@classmethod @classmethod
def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T: def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T:
... ...
@classmethod @classmethod
def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...] = ..., **options) -> _T: def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...] = ..., **options) -> _T:
... ...


def lex(self, text: str) -> Iterator[Token]:
def lex(self, text: str, dont_ignore: bool = False) -> Iterator[Token]:
... ...


def get_terminal(self, name: str) -> TerminalDef: def get_terminal(self, name: str) -> TerminalDef:


+ 20
- 8
lark/lark.py View File

@@ -11,7 +11,7 @@ from .load_grammar import load_grammar, FromPackageLoader
from .tree import Tree from .tree import Tree
from .common import LexerConf, ParserConf from .common import LexerConf, ParserConf


from .lexer import Lexer, TraditionalLexer, TerminalDef
from .lexer import Lexer, TraditionalLexer, TerminalDef, LexerThread
from .parse_tree_builder import ParseTreeBuilder from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import get_frontend, _get_lexer_callbacks from .parser_frontends import get_frontend, _get_lexer_callbacks
from .grammar import Rule from .grammar import Rule
@@ -355,8 +355,13 @@ class Lark(Serialize):


__serialize_fields__ = 'parser', 'rules', 'options' __serialize_fields__ = 'parser', 'rules', 'options'


def _build_lexer(self):
return TraditionalLexer(self.lexer_conf)
def _build_lexer(self, dont_ignore=False):
    """Create a TraditionalLexer from this instance's lexer configuration.

    When ``dont_ignore`` is true, the lexer is built from a shallow copy of
    the configuration whose ``ignore`` attribute is set to an empty tuple;
    the attribute is assigned on the copy, not on ``self.lexer_conf``.
    """
    if not dont_ignore:
        return TraditionalLexer(self.lexer_conf)

    from copy import copy
    conf = copy(self.lexer_conf)
    conf.ignore = ()
    return TraditionalLexer(conf)


def _prepare_callbacks(self): def _prepare_callbacks(self):
self.parser_class = get_frontend(self.options.parser, self.options.lexer) self.parser_class = get_frontend(self.options.parser, self.options.lexer)
@@ -419,6 +424,7 @@ class Lark(Serialize):
self._callbacks, self._callbacks,
self.options, # Not all, but multiple attributes are used self.options, # Not all, but multiple attributes are used
) )
self.lexer_conf = self.parser.lexer_conf
self.terminals = self.parser.lexer_conf.terminals self.terminals = self.parser.lexer_conf.terminals
self._terminals_dict = {t.name: t for t in self.terminals} self._terminals_dict = {t.name: t for t in self.terminals}
return self return self
@@ -468,11 +474,17 @@ class Lark(Serialize):
return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer) return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)




def lex(self, text):
"Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'"
if not hasattr(self, 'lexer'):
self.lexer = self._build_lexer()
stream = self.lexer.lex(text)
def lex(self, text, dont_ignore=False):
    """Lex (and postlex) ``text`` without parsing it.

    Only relevant when lexer='standard'.

    With dont_ignore=True the returned stream also contains the tokens
    that are marked for %ignore; a dedicated lexer is built for that call
    instead of using ``self.lexer``.
    """
    reuse_existing = hasattr(self, 'lexer') and not dont_ignore
    lexer = self.lexer if reuse_existing else self._build_lexer(dont_ignore)

    stream = LexerThread(lexer, text).lex(None)

    postlex = self.options.postlex
    if not postlex:
        return stream
    return postlex.process(stream)


+ 1
- 0
lark/lexer.py View File

@@ -10,6 +10,7 @@ from copy import copy




class Pattern(Serialize): class Pattern(Serialize):
raw = None


def __init__(self, value, flags=(), raw=None): def __init__(self, value, flags=(), raw=None):
self.value = value self.value = value


+ 1
- 0
tests/__main__.py View File

@@ -10,6 +10,7 @@ from .test_cache import TestCache
from .test_grammar import TestGrammar from .test_grammar import TestGrammar
from .test_reconstructor import TestReconstructor from .test_reconstructor import TestReconstructor
from .test_tree_forest_transformer import TestTreeForestTransformer from .test_tree_forest_transformer import TestTreeForestTransformer
from .test_lexer import TestLexer


try: try:
from .test_nearley.test_nearley import TestNearley from .test_nearley.test_nearley import TestNearley


+ 23
- 0
tests/test_lexer.py View File

@@ -0,0 +1,23 @@
from unittest import TestCase, main

from lark import Lark, Tree

class TestLexer(TestCase):
    """Tests for ``Lark.lex()``, with and without the ``dont_ignore`` option."""

    def test_basic(self):
        """Lexing drops %ignore'd spaces by default and keeps them with dont_ignore=True."""
        p = Lark("""
start: "a" "b" "c" "d"
%ignore " "
""")

        # Use unittest assertions rather than bare ``assert``: they are not
        # stripped under ``python -O`` and report both values on failure.
        res = list(p.lex("abc cba dd"))
        self.assertEqual(res, list('abccbadd'))

        res = list(p.lex("abc cba dd", dont_ignore=True))
        self.assertEqual(res, list('abc cba dd'))


if __name__ == '__main__':
main()

Loading…
Cancel
Save