This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1345 lines
45 KiB

  1. # -*- coding: utf-8 -*-
  2. from __future__ import absolute_import
  3. import unittest
  4. import logging
  5. import os
  6. import sys
  7. try:
  8. from cStringIO import StringIO as cStringIO
  9. except ImportError:
  10. # Available only in Python 2.x, 3.x only has io.StringIO from below
  11. cStringIO = None
  12. from io import (
  13. StringIO as uStringIO,
  14. open,
  15. )
  16. logging.basicConfig(level=logging.INFO)
  17. from lark.lark import Lark
  18. from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput
  19. from lark.tree import Tree
  20. from lark.visitors import Transformer
  21. from lark.parsers.earley import ApplyCallbacks
  22. __path__ = os.path.dirname(__file__)
  23. def _read(n, *args):
  24. with open(os.path.join(__path__, n), *args) as f:
  25. return f.read()
  26. class TestParsers(unittest.TestCase):
  27. def test_same_ast(self):
  28. "Tests that Earley and LALR parsers produce equal trees"
  29. g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
  30. name_list: NAME | name_list "," NAME
  31. NAME: /\w+/ """, parser='lalr')
  32. l = g.parse('(a,b,c,*x)')
  33. g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
  34. name_list: NAME | name_list "," NAME
  35. NAME: /\w/+ """)
  36. l2 = g.parse('(a,b,c,*x)')
  37. assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())
  38. def test_infinite_recurse(self):
  39. g = """start: a
  40. a: a | "a"
  41. """
  42. self.assertRaises(GrammarError, Lark, g, parser='lalr')
  43. # TODO: should it? shouldn't it?
  44. # l = Lark(g, parser='earley', lexer='dynamic')
  45. # self.assertRaises(ParseError, l.parse, 'a')
  46. def test_propagate_positions(self):
  47. g = Lark("""start: a
  48. a: "a"
  49. """, propagate_positions=True)
  50. r = g.parse('a')
  51. self.assertEqual( r.children[0].meta.line, 1 )
  52. def test_expand1(self):
  53. g = Lark("""start: a
  54. ?a: b
  55. b: "x"
  56. """)
  57. r = g.parse('x')
  58. self.assertEqual( r.children[0].data, "b" )
  59. g = Lark("""start: a
  60. ?a: b -> c
  61. b: "x"
  62. """)
  63. r = g.parse('x')
  64. self.assertEqual( r.children[0].data, "c" )
  65. g = Lark("""start: a
  66. ?a: B -> c
  67. B: "x"
  68. """)
  69. self.assertEqual( r.children[0].data, "c" )
  70. g = Lark("""start: a
  71. ?a: b b -> c
  72. b: "x"
  73. """)
  74. r = g.parse('xx')
  75. self.assertEqual( r.children[0].data, "c" )
  76. def test_embedded_transformer(self):
  77. class T(Transformer):
  78. def a(self, children):
  79. return "<a>"
  80. def b(self, children):
  81. return "<b>"
  82. def c(self, children):
  83. return "<c>"
  84. # Test regular
  85. g = Lark("""start: a
  86. a : "x"
  87. """, parser='lalr')
  88. r = T().transform(g.parse("x"))
  89. self.assertEqual( r.children, ["<a>"] )
  90. g = Lark("""start: a
  91. a : "x"
  92. """, parser='lalr', transformer=T())
  93. r = g.parse("x")
  94. self.assertEqual( r.children, ["<a>"] )
  95. # Test Expand1
  96. g = Lark("""start: a
  97. ?a : b
  98. b : "x"
  99. """, parser='lalr')
  100. r = T().transform(g.parse("x"))
  101. self.assertEqual( r.children, ["<b>"] )
  102. g = Lark("""start: a
  103. ?a : b
  104. b : "x"
  105. """, parser='lalr', transformer=T())
  106. r = g.parse("x")
  107. self.assertEqual( r.children, ["<b>"] )
  108. # Test Expand1 -> Alias
  109. g = Lark("""start: a
  110. ?a : b b -> c
  111. b : "x"
  112. """, parser='lalr')
  113. r = T().transform(g.parse("xx"))
  114. self.assertEqual( r.children, ["<c>"] )
  115. g = Lark("""start: a
  116. ?a : b b -> c
  117. b : "x"
  118. """, parser='lalr', transformer=T())
  119. r = g.parse("xx")
  120. self.assertEqual( r.children, ["<c>"] )
  121. def test_alias(self):
  122. Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """)
def _make_full_earley_test(LEXER):
    """Build and register a TestCase exercising the Earley parser with the given lexer.

    The generated class is named ``TestFullEarley<Lexer>`` and injected into
    ``globals()`` so unittest discovery picks it up once per lexer configuration.
    """
    def _Lark(grammar, **kwargs):
        # Convenience constructor pinning the lexer/parser under test.
        return Lark(grammar, lexer=LEXER, parser='earley', propagate_positions=True, **kwargs)

    class _TestFullEarley(unittest.TestCase):
        def test_anon(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = Lark(r"""start: B
                         B: ("ab"|/[^b]/)+
                      """, lexer=LEXER)

            self.assertEqual( g.parse('abc').children[0], 'abc')

        def test_earley(self):
            # Terminal A ("a"+) must not greedily swallow the "a" of "abc".
            g = Lark("""start: A "b" c
                        A: "a"+
                        c: "abc"
                        """, parser="earley", lexer=LEXER)
            x = g.parse('aaaababc')

        def test_earley2(self):
            grammar = """
            start: statement+

            statement: "r"
                     | "c" /[a-z]/+

            %ignore " "
            """

            program = """c b r"""

            l = Lark(grammar, parser='earley', lexer=LEXER)
            l.parse(program)

        @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser")
        def test_earley3(self):
            """Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)

            By default, `+` should imitate regexp greedy-matching
            """
            grammar = """
            start: A A
            A: "a"+
            """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            # Greedy matching: first A takes as much as possible while still parsing.
            self.assertEqual(res.children, ['aa', 'a'])

        def test_earley4(self):
            grammar = """
            start: A A?
            A: "a"+
            """

            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse("aaa")
            # The optional second A is dropped; the first consumes everything.
            self.assertEqual(res.children, ['aaa'])

        def test_earley_repeating_empty(self):
            # This was a sneaky bug!
            grammar = """
            !start: "a" empty empty "b"
            empty: empty2
            empty2:
            """

            parser = Lark(grammar, parser='earley', lexer=LEXER)
            res = parser.parse('ab')

            empty_tree = Tree('empty', [Tree('empty2', [])])
            self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])

        @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
        def test_earley_explicit_ambiguity(self):
            # This was a sneaky bug!
            grammar = """
            start: a b | ab
            a: "a"
            b: "b"
            ab: "ab"
            """

            parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
            ambig_tree = parser.parse('ab')
            # ambiguity='explicit' surfaces both derivations under a _ambig node.
            self.assertEqual( ambig_tree.data, '_ambig')
            self.assertEqual( len(ambig_tree.children), 2)

        @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
        def test_ambiguity1(self):
            grammar = """
            start: cd+ "e"

            !cd: "c"
               | "d"
               | "cd"

            """
            l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
            ambig_tree = l.parse('cde')
            # 'cd' can be one token or two -> exactly two derivations expected.
            assert ambig_tree.data == '_ambig', ambig_tree
            assert len(ambig_tree.children) == 2

        @unittest.skipIf(LEXER=='standard', "Requires dynamic lexer")
        def test_ambiguity2(self):
            grammar = """
            ANY: /[a-zA-Z0-9 ]+/
            a.2: "A" b+
            b.2: "B"
            c: ANY

            start: (a|c)*
            """
            l = Lark(grammar, parser='earley', lexer=LEXER)
            res = l.parse('ABX')
            # Rule priorities (.2) should make 'AB' parse as a(b) rather than c.
            expected = Tree('start', [
                    Tree('a', [
                        Tree('b', [])
                    ]),
                    Tree('c', [
                        'X'
                    ])
                ])
            self.assertEqual(res, expected)

        def test_fruitflies_ambig(self):
            # Classic "fruit flies like bananas" ambiguity: both readings must appear.
            grammar = """
                start: noun verb noun        -> simple
                     | noun verb "like" noun -> comparative

                noun: adj? NOUN
                verb: VERB
                adj: ADJ

                NOUN: "flies" | "bananas" | "fruit"
                VERB: "like" | "flies"
                ADJ: "fruit"

                %import common.WS
                %ignore WS
            """
            parser = Lark(grammar, ambiguity='explicit', lexer=LEXER)
            tree = parser.parse('fruit flies like bananas')

            expected = Tree('_ambig', [
                    Tree('comparative', [
                        Tree('noun', ['fruit']),
                        Tree('verb', ['flies']),
                        Tree('noun', ['bananas'])
                    ]),
                    Tree('simple', [
                        Tree('noun', [Tree('adj', ['fruit']), 'flies']),
                        Tree('verb', ['like']),
                        Tree('noun', ['bananas'])
                    ])
                ])

            # self.assertEqual(tree, expected)
            # Child order under _ambig is not guaranteed; compare as a set.
            self.assertEqual(tree.data, expected.data)
            self.assertEqual(set(tree.children), set(expected.children))

        @unittest.skipIf(LEXER!='dynamic_complete', "Only relevant for the dynamic_complete parser")
        def test_explicit_ambiguity2(self):
            grammar = r"""
            start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """cat"""

            parser = _Lark(grammar, start='start', ambiguity='explicit')
            tree = parser.parse(text)
            self.assertEqual(tree.data, '_ambig')

            # dynamic_complete must produce every way of splitting "cat" into \w+ tokens.
            combinations = {tuple(str(s) for s in t.children) for t in tree.children}
            self.assertEqual(combinations, {
                ('cat',),
                ('ca', 't'),
                ('c', 'at'),
                ('c', 'a' ,'t')
            })

        def test_term_ambig_resolve(self):
            # NOTE(review): uses Lark(grammar) directly rather than _Lark, so this
            # runs with default options regardless of LEXER — confirm intentional.
            grammar = r"""
            !start: NAME+
            NAME: /\w+/
            %ignore " "
            """
            text = """foo bar"""

            parser = Lark(grammar)
            tree = parser.parse(text)
            self.assertEqual(tree.children, ['foo', 'bar'])

        # @unittest.skipIf(LEXER=='dynamic', "Not implemented in Dynamic Earley yet") # TODO
        # def test_not_all_derivations(self):
        #     grammar = """
        #     start: cd+ "e"

        #     !cd: "c"
        #        | "d"
        #        | "cd"

        #     """
        #     l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER, earley__all_derivations=False)
        #     x = l.parse('cde')
        #     assert x.data != '_ambig', x
        #     assert len(x.children) == 1

    # Register the generated class under a lexer-specific name for test discovery.
    _NAME = "TestFullEarley" + LEXER.capitalize()
    _TestFullEarley.__name__ = _NAME
    globals()[_NAME] = _TestFullEarley
  299. def _make_parser_test(LEXER, PARSER):
  300. def _Lark(grammar, **kwargs):
  301. return Lark(grammar, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)
  302. class _TestParser(unittest.TestCase):
  303. def test_basic1(self):
  304. g = _Lark("""start: a+ b a* "b" a*
  305. b: "b"
  306. a: "a"
  307. """)
  308. r = g.parse('aaabaab')
  309. self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
  310. r = g.parse('aaabaaba')
  311. self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )
  312. self.assertRaises(ParseError, g.parse, 'aaabaa')
  313. def test_basic2(self):
  314. # Multiple parsers and colliding tokens
  315. g = _Lark("""start: B A
  316. B: "12"
  317. A: "1" """)
  318. g2 = _Lark("""start: B A
  319. B: "12"
  320. A: "2" """)
  321. x = g.parse('121')
  322. assert x.data == 'start' and x.children == ['12', '1'], x
  323. x = g2.parse('122')
  324. assert x.data == 'start' and x.children == ['12', '2'], x
  325. @unittest.skipIf(cStringIO is None, "cStringIO not available")
  326. def test_stringio_bytes(self):
  327. """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
  328. _Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))
  329. def test_stringio_unicode(self):
  330. """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
  331. _Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))
  332. def test_unicode(self):
  333. g = _Lark(u"""start: UNIA UNIB UNIA
  334. UNIA: /\xa3/
  335. UNIB: /\u0101/
  336. """)
  337. g.parse(u'\xa3\u0101\u00a3')
  338. def test_unicode2(self):
  339. g = _Lark(r"""start: UNIA UNIB UNIA UNIC
  340. UNIA: /\xa3/
  341. UNIB: "a\u0101b\ "
  342. UNIC: /a?\u0101c\n/
  343. """)
  344. g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')
  345. def test_unicode3(self):
  346. g = _Lark(r"""start: UNIA UNIB UNIA UNIC
  347. UNIA: /\xa3/
  348. UNIB: "\u0101"
  349. UNIC: /\u0203/ /\n/
  350. """)
  351. g.parse(u'\xa3\u0101\u00a3\u0203\n')
  352. @unittest.skipIf(PARSER == 'cyk', "Takes forever")
  353. def test_stack_for_ebnf(self):
  354. """Verify that stack depth isn't an issue for EBNF grammars"""
  355. g = _Lark(r"""start: a+
  356. a : "a" """)
  357. g.parse("a" * (sys.getrecursionlimit()*2 ))
  358. def test_expand1_lists_with_one_item(self):
  359. g = _Lark(r"""start: list
  360. ?list: item+
  361. item : A
  362. A: "a"
  363. """)
  364. r = g.parse("a")
  365. # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
  366. self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))
  367. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  368. self.assertEqual(len(r.children), 1)
  369. def test_expand1_lists_with_one_item_2(self):
  370. g = _Lark(r"""start: list
  371. ?list: item+ "!"
  372. item : A
  373. A: "a"
  374. """)
  375. r = g.parse("a!")
  376. # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
  377. self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))
  378. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  379. self.assertEqual(len(r.children), 1)
  380. def test_dont_expand1_lists_with_multiple_items(self):
  381. g = _Lark(r"""start: list
  382. ?list: item+
  383. item : A
  384. A: "a"
  385. """)
  386. r = g.parse("aa")
  387. # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
  388. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  389. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  390. self.assertEqual(len(r.children), 1)
  391. # Sanity check: verify that 'list' contains the two 'item's we've given it
  392. [list] = r.children
  393. self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))
  394. def test_dont_expand1_lists_with_multiple_items_2(self):
  395. g = _Lark(r"""start: list
  396. ?list: item+ "!"
  397. item : A
  398. A: "a"
  399. """)
  400. r = g.parse("aa!")
  401. # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
  402. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  403. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  404. self.assertEqual(len(r.children), 1)
  405. # Sanity check: verify that 'list' contains the two 'item's we've given it
  406. [list] = r.children
  407. self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))
  408. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  409. def test_empty_expand1_list(self):
  410. g = _Lark(r"""start: list
  411. ?list: item*
  412. item : A
  413. A: "a"
  414. """)
  415. r = g.parse("")
  416. # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
  417. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  418. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  419. self.assertEqual(len(r.children), 1)
  420. # Sanity check: verify that 'list' contains no 'item's as we've given it none
  421. [list] = r.children
  422. self.assertSequenceEqual([item.data for item in list.children], ())
  423. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  424. def test_empty_expand1_list_2(self):
  425. g = _Lark(r"""start: list
  426. ?list: item* "!"?
  427. item : A
  428. A: "a"
  429. """)
  430. r = g.parse("")
  431. # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
  432. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  433. # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
  434. self.assertEqual(len(r.children), 1)
  435. # Sanity check: verify that 'list' contains no 'item's as we've given it none
  436. [list] = r.children
  437. self.assertSequenceEqual([item.data for item in list.children], ())
  438. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  439. def test_empty_flatten_list(self):
  440. g = _Lark(r"""start: list
  441. list: | item "," list
  442. item : A
  443. A: "a"
  444. """)
  445. r = g.parse("")
  446. # Because 'list' is a flatten rule it's top-level element should *never* be expanded
  447. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  448. # Sanity check: verify that 'list' contains no 'item's as we've given it none
  449. [list] = r.children
  450. self.assertSequenceEqual([item.data for item in list.children], ())
  451. @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
  452. def test_single_item_flatten_list(self):
  453. g = _Lark(r"""start: list
  454. list: | item "," list
  455. item : A
  456. A: "a"
  457. """)
  458. r = g.parse("a,")
  459. # Because 'list' is a flatten rule it's top-level element should *never* be expanded
  460. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  461. # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
  462. [list] = r.children
  463. self.assertSequenceEqual([item.data for item in list.children], ('item',))
  464. @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
  465. def test_multiple_item_flatten_list(self):
  466. g = _Lark(r"""start: list
  467. #list: | item "," list
  468. item : A
  469. A: "a"
  470. """)
  471. r = g.parse("a,a,")
  472. # Because 'list' is a flatten rule it's top-level element should *never* be expanded
  473. self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))
  474. # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
  475. [list] = r.children
  476. self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))
  477. @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
  478. def test_recurse_flatten(self):
  479. """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
  480. g = _Lark(r"""start: a | start a
  481. a : A
  482. A : "a" """)
  483. # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
  484. # STree data structures, which uses recursion).
  485. g.parse("a" * (sys.getrecursionlimit() // 4))
  486. def test_token_collision(self):
  487. g = _Lark(r"""start: "Hello" NAME
  488. NAME: /\w/+
  489. %ignore " "
  490. """)
  491. x = g.parse('Hello World')
  492. self.assertSequenceEqual(x.children, ['World'])
  493. x = g.parse('Hello HelloWorld')
  494. self.assertSequenceEqual(x.children, ['HelloWorld'])
  495. def test_token_collision_WS(self):
  496. g = _Lark(r"""start: "Hello" NAME
  497. NAME: /\w/+
  498. %import common.WS
  499. %ignore WS
  500. """)
  501. x = g.parse('Hello World')
  502. self.assertSequenceEqual(x.children, ['World'])
  503. x = g.parse('Hello HelloWorld')
  504. self.assertSequenceEqual(x.children, ['HelloWorld'])
  505. def test_token_collision2(self):
  506. g = _Lark("""
  507. !start: "starts"
  508. %import common.LCASE_LETTER
  509. """)
  510. x = g.parse("starts")
  511. self.assertSequenceEqual(x.children, ['starts'])
  512. # def test_string_priority(self):
  513. # g = _Lark("""start: (A | /a?bb/)+
  514. # A: "a" """)
  515. # x = g.parse('abb')
  516. # self.assertEqual(len(x.children), 2)
  517. # # This parse raises an exception because the lexer will always try to consume
  518. # # "a" first and will never match the regular expression
  519. # # This behavior is subject to change!!
  520. # # Thie won't happen with ambiguity handling.
  521. # g = _Lark("""start: (A | /a?ab/)+
  522. # A: "a" """)
  523. # self.assertRaises(LexError, g.parse, 'aab')
  524. def test_undefined_rule(self):
  525. self.assertRaises(GrammarError, _Lark, """start: a""")
  526. def test_undefined_token(self):
  527. self.assertRaises(GrammarError, _Lark, """start: A""")
  528. def test_rule_collision(self):
  529. g = _Lark("""start: "a"+ "b"
  530. | "a"+ """)
  531. x = g.parse('aaaa')
  532. x = g.parse('aaaab')
  533. def test_rule_collision2(self):
  534. g = _Lark("""start: "a"* "b"
  535. | "a"+ """)
  536. x = g.parse('aaaa')
  537. x = g.parse('aaaab')
  538. x = g.parse('b')
  539. def test_token_not_anon(self):
  540. """Tests that "a" is matched as an anonymous token, and not A.
  541. """
  542. g = _Lark("""start: "a"
  543. A: "a" """)
  544. x = g.parse('a')
  545. self.assertEqual(len(x.children), 0, '"a" should be considered anonymous')
  546. g = _Lark("""start: "a" A
  547. A: "a" """)
  548. x = g.parse('aa')
  549. self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous')
  550. self.assertEqual(x.children[0].type, "A")
  551. g = _Lark("""start: /a/
  552. A: /a/ """)
  553. x = g.parse('a')
  554. self.assertEqual(len(x.children), 1)
  555. self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/")
  556. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  557. def test_maybe(self):
  558. g = _Lark("""start: ["a"] """)
  559. x = g.parse('a')
  560. x = g.parse('')
  561. def test_start(self):
  562. g = _Lark("""a: "a" a? """, start='a')
  563. x = g.parse('a')
  564. x = g.parse('aa')
  565. x = g.parse('aaa')
  566. def test_alias(self):
  567. g = _Lark("""start: "a" -> b """)
  568. x = g.parse('a')
  569. self.assertEqual(x.data, "b")
  570. def test_token_ebnf(self):
  571. g = _Lark("""start: A
  572. A: "a"* ("b"? "c".."e")+
  573. """)
  574. x = g.parse('abcde')
  575. x = g.parse('dd')
  576. def test_backslash(self):
  577. g = _Lark(r"""start: "\\" "a"
  578. """)
  579. x = g.parse(r'\a')
  580. g = _Lark(r"""start: /\\/ /a/
  581. """)
  582. x = g.parse(r'\a')
  583. def test_special_chars(self):
  584. g = _Lark(r"""start: "\n"
  585. """)
  586. x = g.parse('\n')
  587. g = _Lark(r"""start: /\n/
  588. """)
  589. x = g.parse('\n')
  590. def test_backslash2(self):
  591. g = _Lark(r"""start: "\"" "-"
  592. """)
  593. x = g.parse('"-')
  594. g = _Lark(r"""start: /\// /-/
  595. """)
  596. x = g.parse('/-')
  597. # def test_token_recurse(self):
  598. # g = _Lark("""start: A
  599. # A: B
  600. # B: A
  601. # """)
  602. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  603. def test_empty(self):
  604. # Fails an Earley implementation without special handling for empty rules,
  605. # or re-processing of already completed rules.
  606. g = _Lark(r"""start: _empty a "B"
  607. a: _empty "A"
  608. _empty:
  609. """)
  610. x = g.parse('AB')
  611. def test_regex_quote(self):
  612. g = r"""
  613. start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
  614. SINGLE_QUOTED_STRING : /'[^']*'/
  615. DOUBLE_QUOTED_STRING : /"[^"]*"/
  616. """
  617. g = _Lark(g)
  618. self.assertEqual( g.parse('"hello"').children, ['"hello"'])
  619. self.assertEqual( g.parse("'hello'").children, ["'hello'"])
  620. def test_lexer_token_limit(self):
  621. "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
  622. tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
  623. g = _Lark("""start: %s
  624. %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))
  625. def test_float_without_lexer(self):
  626. expected_error = UnexpectedInput if LEXER == 'dynamic' else UnexpectedToken
  627. if PARSER == 'cyk':
  628. expected_error = ParseError
  629. g = _Lark("""start: ["+"|"-"] float
  630. float: digit* "." digit+ exp?
  631. | digit+ exp
  632. exp: ("e"|"E") ["+"|"-"] digit+
  633. digit: "0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"
  634. """)
  635. g.parse("1.2")
  636. g.parse("-.2e9")
  637. g.parse("+2e-9")
  638. self.assertRaises( expected_error, g.parse, "+2e-9e")
  639. def test_keep_all_tokens(self):
  640. l = _Lark("""start: "a"+ """, keep_all_tokens=True)
  641. tree = l.parse('aaa')
  642. self.assertEqual(tree.children, ['a', 'a', 'a'])
  643. def test_token_flags(self):
  644. l = _Lark("""!start: "a"i+
  645. """
  646. )
  647. tree = l.parse('aA')
  648. self.assertEqual(tree.children, ['a', 'A'])
  649. l = _Lark("""!start: /a/i+
  650. """
  651. )
  652. tree = l.parse('aA')
  653. self.assertEqual(tree.children, ['a', 'A'])
  654. # g = """!start: "a"i "a"
  655. # """
  656. # self.assertRaises(GrammarError, _Lark, g)
  657. # g = """!start: /a/i /a/
  658. # """
  659. # self.assertRaises(GrammarError, _Lark, g)
  660. g = """start: NAME "," "a"
  661. NAME: /[a-z_]/i /[a-z0-9_]/i*
  662. """
  663. l = _Lark(g)
  664. tree = l.parse('ab,a')
  665. self.assertEqual(tree.children, ['ab'])
  666. tree = l.parse('AB,a')
  667. self.assertEqual(tree.children, ['AB'])
  668. def test_token_flags3(self):
  669. l = _Lark("""!start: ABC+
  670. ABC: "abc"i
  671. """
  672. )
  673. tree = l.parse('aBcAbC')
  674. self.assertEqual(tree.children, ['aBc', 'AbC'])
  675. def test_token_flags2(self):
  676. g = """!start: ("a"i | /a/ /b/?)+
  677. """
  678. l = _Lark(g)
  679. tree = l.parse('aA')
  680. self.assertEqual(tree.children, ['a', 'A'])
  681. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  682. def test_twice_empty(self):
  683. g = """!start: [["A"]]
  684. """
  685. l = _Lark(g)
  686. tree = l.parse('A')
  687. self.assertEqual(tree.children, ['A'])
  688. tree = l.parse('')
  689. self.assertEqual(tree.children, [])
  690. def test_undefined_ignore(self):
  691. g = """!start: "A"
  692. %ignore B
  693. """
  694. self.assertRaises( GrammarError, _Lark, g)
  695. def test_alias_in_terminal(self):
  696. g = """start: TERM
  697. TERM: "a" -> alias
  698. """
  699. self.assertRaises( GrammarError, _Lark, g)
  700. def test_line_and_column(self):
  701. g = r"""!start: "A" bc "D"
  702. !bc: "B\nC"
  703. """
  704. l = _Lark(g)
  705. a, bc, d = l.parse("AB\nCD").children
  706. self.assertEqual(a.line, 1)
  707. self.assertEqual(a.column, 1)
  708. bc ,= bc.children
  709. self.assertEqual(bc.line, 1)
  710. self.assertEqual(bc.column, 2)
  711. self.assertEqual(d.line, 2)
  712. self.assertEqual(d.column, 2)
  713. if LEXER != 'dynamic':
  714. self.assertEqual(a.end_line, 1)
  715. self.assertEqual(a.end_column, 2)
  716. self.assertEqual(bc.end_line, 2)
  717. self.assertEqual(bc.end_column, 2)
  718. self.assertEqual(d.end_line, 2)
  719. self.assertEqual(d.end_column, 3)
  720. def test_reduce_cycle(self):
  721. """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
  722. It seems that the correct solution is to explicitely distinguish finalization in the reduce() function.
  723. """
  724. l = _Lark("""
  725. term: A
  726. | term term
  727. A: "a"
  728. """, start='term')
  729. tree = l.parse("aa")
  730. self.assertEqual(len(tree.children), 2)
  731. @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
  732. def test_lexer_prioritization(self):
  733. "Tests effect of priority on result"
  734. grammar = """
  735. start: A B | AB
  736. A.2: "a"
  737. B: "b"
  738. AB: "ab"
  739. """
  740. l = _Lark(grammar)
  741. res = l.parse("ab")
  742. self.assertEqual(res.children, ['a', 'b'])
  743. self.assertNotEqual(res.children, ['ab'])
  744. grammar = """
  745. start: A B | AB
  746. A: "a"
  747. B: "b"
  748. AB.3: "ab"
  749. """
  750. l = _Lark(grammar)
  751. res = l.parse("ab")
  752. self.assertNotEqual(res.children, ['a', 'b'])
  753. self.assertEqual(res.children, ['ab'])
  754. def test_import(self):
  755. grammar = """
  756. start: NUMBER WORD
  757. %import common.NUMBER
  758. %import common.WORD
  759. %import common.WS
  760. %ignore WS
  761. """
  762. l = _Lark(grammar)
  763. x = l.parse('12 elephants')
  764. self.assertEqual(x.children, ['12', 'elephants'])
  765. def test_relative_import(self):
  766. grammar = """
  767. start: NUMBER WORD
  768. %import .grammars.test.NUMBER
  769. %import common.WORD
  770. %import common.WS
  771. %ignore WS
  772. """
  773. l = _Lark(grammar)
  774. x = l.parse('12 lions')
  775. self.assertEqual(x.children, ['12', 'lions'])
  776. def test_multi_import(self):
  777. grammar = """
  778. start: NUMBER WORD
  779. %import common (NUMBER, WORD, WS)
  780. %ignore WS
  781. """
  782. l = _Lark(grammar)
  783. x = l.parse('12 toucans')
  784. self.assertEqual(x.children, ['12', 'toucans'])
  785. def test_relative_multi_import(self):
  786. grammar = """
  787. start: NUMBER WORD
  788. %import .grammars.test (NUMBER, WORD, WS)
  789. %ignore WS
  790. """
  791. l = _Lark(grammar)
  792. x = l.parse('12 capybaras')
  793. self.assertEqual(x.children, ['12', 'capybaras'])
  794. def test_import_errors(self):
  795. grammar = """
  796. start: NUMBER WORD
  797. %import .grammars.bad_test.NUMBER
  798. """
  799. self.assertRaises(IOError, _Lark, grammar)
  800. grammar = """
  801. start: NUMBER WORD
  802. %import bad_test.NUMBER
  803. """
  804. self.assertRaises(IOError, _Lark, grammar)
  805. @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
  806. def test_earley_prioritization(self):
  807. "Tests effect of priority on result"
  808. grammar = """
  809. start: a | b
  810. a.1: "a"
  811. b.2: "a"
  812. """
  813. # l = Lark(grammar, parser='earley', lexer='standard')
  814. l = _Lark(grammar)
  815. res = l.parse("a")
  816. self.assertEqual(res.children[0].data, 'b')
  817. grammar = """
  818. start: a | b
  819. a.2: "a"
  820. b.1: "a"
  821. """
  822. l = _Lark(grammar)
  823. # l = Lark(grammar, parser='earley', lexer='standard')
  824. res = l.parse("a")
  825. self.assertEqual(res.children[0].data, 'a')
  826. @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
  827. def test_earley_prioritization_sum(self):
  828. "Tests effect of priority on result"
  829. grammar = """
  830. start: ab_ b_ a_ | indirection
  831. indirection: a_ bb_ a_
  832. a_: "a"
  833. b_: "b"
  834. ab_: "ab"
  835. bb_.1: "bb"
  836. """
  837. l = Lark(grammar, priority="invert")
  838. res = l.parse('abba')
  839. self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')
  840. grammar = """
  841. start: ab_ b_ a_ | indirection
  842. indirection: a_ bb_ a_
  843. a_: "a"
  844. b_: "b"
  845. ab_.1: "ab"
  846. bb_: "bb"
  847. """
  848. l = Lark(grammar, priority="invert")
  849. res = l.parse('abba')
  850. self.assertEqual(''.join(child.data for child in res.children), 'indirection')
  851. grammar = """
  852. start: ab_ b_ a_ | indirection
  853. indirection: a_ bb_ a_
  854. a_.2: "a"
  855. b_.1: "b"
  856. ab_.3: "ab"
  857. bb_.3: "bb"
  858. """
  859. l = Lark(grammar, priority="invert")
  860. res = l.parse('abba')
  861. self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')
  862. grammar = """
  863. start: ab_ b_ a_ | indirection
  864. indirection: a_ bb_ a_
  865. a_.1: "a"
  866. b_.1: "b"
  867. ab_.4: "ab"
  868. bb_.3: "bb"
  869. """
  870. l = Lark(grammar, priority="invert")
  871. res = l.parse('abba')
  872. self.assertEqual(''.join(child.data for child in res.children), 'indirection')
  873. def test_utf8(self):
  874. g = u"""start: a
  875. a: "±a"
  876. """
  877. l = _Lark(g)
  878. self.assertEqual(l.parse(u'±a'), Tree('start', [Tree('a', [])]))
  879. g = u"""start: A
  880. A: "±a"
  881. """
  882. l = _Lark(g)
  883. self.assertEqual(l.parse(u'±a'), Tree('start', [u'\xb1a']))
  884. @unittest.skipIf(PARSER == 'cyk', "No empty rules")
  885. def test_ignore(self):
  886. grammar = r"""
  887. COMMENT: /(!|(\/\/))[^\n]*/
  888. %ignore COMMENT
  889. %import common.WS -> _WS
  890. %import common.INT
  891. start: "INT"i _WS+ INT _WS*
  892. """
  893. parser = _Lark(grammar)
  894. tree = parser.parse("int 1 ! This is a comment\n")
  895. self.assertEqual(tree.children, ['1'])
  896. tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky!
  897. self.assertEqual(tree.children, ['1'])
  898. parser = _Lark(r"""
  899. start : "a"*
  900. %ignore "b"
  901. """)
  902. tree = parser.parse("bb")
  903. self.assertEqual(tree.children, [])
  904. def test_regex_escaping(self):
  905. g = _Lark("start: /[ab]/")
  906. g.parse('a')
  907. g.parse('b')
  908. self.assertRaises( UnexpectedInput, g.parse, 'c')
  909. _Lark(r'start: /\w/').parse('a')
  910. g = _Lark(r'start: /\\w/')
  911. self.assertRaises( UnexpectedInput, g.parse, 'a')
  912. g.parse(r'\w')
  913. _Lark(r'start: /\[/').parse('[')
  914. _Lark(r'start: /\//').parse('/')
  915. _Lark(r'start: /\\/').parse('\\')
  916. _Lark(r'start: /\[ab]/').parse('[ab]')
  917. _Lark(r'start: /\\[ab]/').parse('\\a')
  918. _Lark(r'start: /\t/').parse('\t')
  919. _Lark(r'start: /\\t/').parse('\\t')
  920. _Lark(r'start: /\\\t/').parse('\\\t')
  921. _Lark(r'start: "\t"').parse('\t')
  922. _Lark(r'start: "\\t"').parse('\\t')
  923. _Lark(r'start: "\\\t"').parse('\\\t')
  924. def test_ranged_repeat_rules(self):
  925. g = u"""!start: "A"~3
  926. """
  927. l = _Lark(g)
  928. self.assertEqual(l.parse(u'AAA'), Tree('start', ["A", "A", "A"]))
  929. self.assertRaises(ParseError, l.parse, u'AA')
  930. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')
  931. g = u"""!start: "A"~0..2
  932. """
  933. if PARSER != 'cyk': # XXX CYK currently doesn't support empty grammars
  934. l = _Lark(g)
  935. self.assertEqual(l.parse(u''), Tree('start', []))
  936. self.assertEqual(l.parse(u'A'), Tree('start', ['A']))
  937. self.assertEqual(l.parse(u'AA'), Tree('start', ['A', 'A']))
  938. self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'AAA')
  939. g = u"""!start: "A"~3..2
  940. """
  941. self.assertRaises(GrammarError, _Lark, g)
  942. g = u"""!start: "A"~2..3 "B"~2
  943. """
  944. l = _Lark(g)
  945. self.assertEqual(l.parse(u'AABB'), Tree('start', ['A', 'A', 'B', 'B']))
  946. self.assertEqual(l.parse(u'AAABB'), Tree('start', ['A', 'A', 'A', 'B', 'B']))
  947. self.assertRaises(ParseError, l.parse, u'AAAB')
  948. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
  949. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
  950. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
  951. def test_ranged_repeat_terms(self):
  952. g = u"""!start: AAA
  953. AAA: "A"~3
  954. """
  955. l = _Lark(g)
  956. self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
  957. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
  958. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')
  959. g = u"""!start: AABB CC
  960. AABB: "A"~0..2 "B"~2
  961. CC: "C"~1..2
  962. """
  963. l = _Lark(g)
  964. self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
  965. self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
  966. self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
  967. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
  968. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
  969. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
  970. self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
  971. @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX
  972. def test_priority_vs_embedded(self):
  973. g = """
  974. A.2: "a"
  975. WORD: ("a".."z")+
  976. start: (A | WORD)+
  977. """
  978. l = _Lark(g)
  979. t = l.parse('abc')
  980. self.assertEqual(t.children, ['a', 'bc'])
  981. self.assertEqual(t.children[0].type, 'A')
  982. def test_line_counting(self):
  983. p = _Lark("start: /[^x]+/")
  984. text = 'hello\nworld'
  985. t = p.parse(text)
  986. tok = t.children[0]
  987. self.assertEqual(tok, text)
  988. self.assertEqual(tok.line, 1)
  989. self.assertEqual(tok.column, 1)
  990. if _LEXER != 'dynamic':
  991. self.assertEqual(tok.end_line, 2)
  992. self.assertEqual(tok.end_column, 6)
  993. @unittest.skipIf(PARSER=='cyk', "Empty rules")
  994. def test_empty_end(self):
  995. p = _Lark("""
  996. start: b c d
  997. b: "B"
  998. c: | "C"
  999. d: | "D"
  1000. """)
  1001. res = p.parse('B')
  1002. self.assertEqual(len(res.children), 3)
  1003. @unittest.skipIf(PARSER=='cyk', "Empty rules")
  1004. def test_maybe_placeholders(self):
  1005. # Anonymous tokens shouldn't count
  1006. p = _Lark("""start: "a"? "b"? "c"? """, maybe_placeholders=True)
  1007. self.assertEqual(p.parse("").children, [])
  1008. # Anonymous tokens shouldn't count, other constructs should
  1009. p = _Lark("""start: A? "b"? _c?
  1010. A: "a"
  1011. _c: "c" """, maybe_placeholders=True)
  1012. self.assertEqual(p.parse("").children, [None])
  1013. p = _Lark("""!start: "a"? "b"? "c"? """, maybe_placeholders=True)
  1014. self.assertEqual(p.parse("").children, [None, None, None])
  1015. self.assertEqual(p.parse("a").children, ['a', None, None])
  1016. self.assertEqual(p.parse("b").children, [None, 'b', None])
  1017. self.assertEqual(p.parse("c").children, [None, None, 'c'])
  1018. self.assertEqual(p.parse("ab").children, ['a', 'b', None])
  1019. self.assertEqual(p.parse("ac").children, ['a', None, 'c'])
  1020. self.assertEqual(p.parse("bc").children, [None, 'b', 'c'])
  1021. self.assertEqual(p.parse("abc").children, ['a', 'b', 'c'])
  1022. p = _Lark("""!start: ("a"? "b" "c"?)+ """, maybe_placeholders=True)
  1023. self.assertEqual(p.parse("b").children, [None, 'b', None])
  1024. self.assertEqual(p.parse("bb").children, [None, 'b', None, None, 'b', None])
  1025. self.assertEqual(p.parse("abbc").children, ['a', 'b', None, None, 'b', 'c'])
  1026. self.assertEqual(p.parse("babbcabcb").children,
  1027. [None, 'b', None,
  1028. 'a', 'b', None,
  1029. None, 'b', 'c',
  1030. 'a', 'b', 'c',
  1031. None, 'b', None])
  1032. p = _Lark("""!start: "a"? "c"? "b"+ "a"? "d"? """, maybe_placeholders=True)
  1033. self.assertEqual(p.parse("bb").children, [None, None, 'b', 'b', None, None])
  1034. self.assertEqual(p.parse("bd").children, [None, None, 'b', None, 'd'])
  1035. self.assertEqual(p.parse("abba").children, ['a', None, 'b', 'b', 'a', None])
  1036. self.assertEqual(p.parse("cbbbb").children, [None, 'c', 'b', 'b', 'b', 'b', None, None])
  1037. _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
  1038. _TestParser.__name__ = _NAME
  1039. globals()[_NAME] = _TestParser
# Note: You still have to import them in __main__ for the tests to run
# Every (lexer, parser) combination here gets its own generated TestCase.
_TO_TEST = [
    ('standard', 'earley'),
    ('standard', 'cyk'),
    ('dynamic', 'earley'),
    ('dynamic_complete', 'earley'),
    ('standard', 'lalr'),
    ('contextual', 'lalr'),
    # (None, 'earley'),
]

for _LEXER, _PARSER in _TO_TEST:
    _make_parser_test(_LEXER, _PARSER)

# The full-Earley test suite only applies to the dynamic lexers.
for _LEXER in ('dynamic', 'dynamic_complete'):
    _make_full_earley_test(_LEXER)
# Run all generated TestCase classes when executed as a script.
if __name__ == '__main__':
    unittest.main()