from __future__ import absolute_import

import unittest
import logging
import os
import sys
try:
    from cStringIO import StringIO as cStringIO
except ImportError:
    # Available only in Python 2.x; 3.x only has io.StringIO, imported below
    cStringIO = None
from io import (
    StringIO as uStringIO,
    open,
)

logging.basicConfig(level=logging.INFO)

from lark.lark import Lark
from lark.common import GrammarError, ParseError
from lark.lexer import LexError

__path__ = os.path.dirname(__file__)

def _read(n, *args):
    with open(os.path.join(__path__, n), *args) as f:
        return f.read()


class TestParsers(unittest.TestCase):
    def test_same_ast(self):
        "Tests that Earley and LALR parsers produce equal trees"
        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w+/ """, parser='lalr')
        l = g.parse('(a,b,c,*x)')

        g = Lark(r"""start: "(" name_list ("," "*" NAME)? ")"
                     name_list: NAME | name_list "," NAME
                     NAME: /\w+/ """)
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())

    def test_earley_nolex(self):
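        # lexer=None appears to select scanless parsing (the generated suites
        # below call this mode 'Scanless'): terminals are matched directly
        # against the input, so the overlapping A ("a"+), anonymous "b" and
        # rule c can be resolved without a separate tokenization pass.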
        g = Lark("""start: A "b" c
                    A: "a"+
                    c: "abc"
                    """, parser="earley", lexer=None)
        x = g.parse('aaaababc')


class TestEarley(unittest.TestCase):
    def test_anon_in_scanless(self):
        # Fails an Earley implementation without special handling for empty rules,
        # or re-processing of already completed rules.
        g = Lark(r"""start: B
                     B: ("ab"|/[^b]/)*
                  """, lexer=None)
        self.assertEqual(g.parse('abc').children[0], 'abc')
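

# The factory below builds one TestCase subclass per (lexer, parser)
# combination; the _TO_TEST loop at the bottom of the file registers each
# generated class in globals(), so unittest discovers it as a separate
# suite (e.g. TestEarleyStandard, TestLalrContextual).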
def _make_parser_test(LEXER, PARSER):
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)

    class _TestParser(unittest.TestCase):
        def test_basic1(self):
            g = _Lark("""start: a+ b a* "b" a*
                         b: "b"
                         a: "a"
                      """)
            r = g.parse('aaabaab')
            self.assertEqual(''.join(x.data for x in r.children), 'aaabaa')
            r = g.parse('aaabaaba')
            self.assertEqual(''.join(x.data for x in r.children), 'aaabaaa')

            self.assertRaises(ParseError, g.parse, 'aaabaa')

        def test_basic2(self):
            # Multiple parsers and colliding tokens
            g = _Lark("""start: B A
                         B: "12"
                         A: "1" """)
            g2 = _Lark("""start: B A
                          B: "12"
                          A: "2" """)
            x = g.parse('121')
            assert x.data == 'start' and x.children == ['12', '1'], x
            x = g2.parse('122')
            assert x.data == 'start' and x.children == ['12', '2'], x

        @unittest.skipIf(cStringIO is None, "cStringIO not available")
        def test_stringio_bytes(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_stringio_unicode(self):
            """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
            _Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

        def test_unicode(self):
            g = _Lark(u"""start: UNIA UNIB UNIA
                          UNIA: /\xa3/
                          UNIB: /\u0101/
                       """)
            g.parse(u'\xa3\u0101\u00a3')

        @unittest.skipIf(LEXER is None, "Regexps >1 not supported with scanless parsing")
        def test_unicode2(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "a\u0101b\ "
                          UNIC: /a?\u0101c\n/
                       """)
            g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')

        def test_unicode3(self):
            g = _Lark(r"""start: UNIA UNIB UNIA UNIC
                          UNIA: /\xa3/
                          UNIB: "\u0101"
                          UNIC: /\u0203/ /\n/
                       """)
            g.parse(u'\xa3\u0101\u00a3\u0203\n')

        def test_recurse_expansion(self):
            """Verify that stack depth doesn't get exceeded on recursive rules marked for expansion."""
            g = _Lark(r"""start: a | start a
                          a : "a" """)
            # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
            # STree data structures, which uses recursion).
            g.parse("a" * (sys.getrecursionlimit() // 4))

        def test_expand1_lists_with_one_item(self):
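            # The leading '?' marks 'list' as an expand-if-contains-one rule:
            # when the rule ends up with exactly one child, the rule node is
            # replaced by that child in the resulting tree.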
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("a")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_expand1_lists_with_one_item_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("a!")

            # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

        def test_dont_expand1_lists_with_multiple_items(self):
            g = _Lark(r"""start: list
                          ?list: item+
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        def test_dont_expand1_lists_with_multiple_items_2(self):
            g = _Lark(r"""start: list
                          ?list: item+ "!"
                          item : A
                          A: "a"
                       """)
            r = g.parse("aa!")

            # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        def test_empty_expand1_list(self):
            g = _Lark(r"""start: list
                          ?list: item*
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        def test_empty_expand1_list_2(self):
            g = _Lark(r"""start: list
                          ?list: item* "!"?
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
            self.assertEqual(len(r.children), 1)

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        def test_empty_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains no 'item's as we've given it none
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ())

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_single_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item',))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_multiple_item_flatten_list(self):
            g = _Lark(r"""start: list
                          list: | item "," list
                          item : A
                          A: "a"
                       """)
            r = g.parse("a,a,")

            # Because 'list' is a flatten rule its top-level element should *never* be expanded
            self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

            # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
            [list] = r.children
            self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

        @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
        def test_recurse_flatten(self):
            """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
            g = _Lark(r"""start: a | start a
                          a : A
                          A : "a" """)
            # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
            # STree data structures, which uses recursion).
            g.parse("a" * (sys.getrecursionlimit() // 4))

        def test_token_collision(self):
            g = _Lark(r"""start: "Hello" NAME
                          NAME: /\w/+
                          %ignore " "
                       """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
            x = g.parse('Hello HelloWorld')
            self.assertSequenceEqual(x.children, ['HelloWorld'])

        # def test_string_priority(self):
        #     g = _Lark("""start: (A | /a?bb/)+
        #                  A: "a" """)
        #     x = g.parse('abb')
        #     self.assertEqual(len(x.children), 2)
        #
        #     # This parse raises an exception because the lexer will always try to consume
        #     # "a" first and will never match the regular expression
        #     # This behavior is subject to change!!
        #     # This won't happen with ambiguity handling.
        #     g = _Lark("""start: (A | /a?ab/)+
        #                  A: "a" """)
        #     self.assertRaises(LexError, g.parse, 'aab')

        def test_undefined_rule(self):
            self.assertRaises(GrammarError, _Lark, """start: a""")

        def test_undefined_token(self):
            self.assertRaises(GrammarError, _Lark, """start: A""")

        def test_rule_collision(self):
            g = _Lark("""start: "a"+ "b"
                              | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')

        def test_rule_collision2(self):
            g = _Lark("""start: "a"* "b"
                              | "a"+ """)
            x = g.parse('aaaa')
            x = g.parse('aaaab')
            x = g.parse('b')

        @unittest.skipIf(LEXER is None, "Regexps >1 not supported with scanless parsing")
        def test_regex_embed(self):
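            # ${A} interpolates the pattern of a previously defined terminal
            # into a regexp, so B matches "ab" and C matches "abc"; the input
            # below is the concatenation 'a' + 'ab' + 'abc'.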
            g = _Lark("""start: A B C
                         A: /a/
                         B: /${A}b/
                         C: /${B}c/
                      """)
            x = g.parse('aababc')

        def test_token_embed(self):
            g = _Lark("""start: A B C
                         A: "a"
                         B: A "b"
                         C: B "c"
                      """)
            x = g.parse('aababc')

        def test_token_not_anon(self):
            """Tests that "a" is matched as A, rather than an anonymous token.

            That means that "a" is not filtered out, despite being an 'immediate string'.
            Whether or not this is the intuitive behavior, I'm not sure yet.
            Perhaps the right thing to do is report a collision (if such is relevant)
            -Erez
            """
            g = _Lark("""start: "a"
                         A: "a" """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 1, '"a" should not be considered anonymous')
            self.assertEqual(x.children[0].type, "A")

            g = _Lark("""start: /a/
                         A: /a/ """)
            x = g.parse('a')
            self.assertEqual(len(x.children), 1, '/a/ should not be considered anonymous')
            self.assertEqual(x.children[0].type, "A")

        def test_maybe(self):
            g = _Lark("""start: ["a"] """)
            x = g.parse('a')
            x = g.parse('')

        def test_start(self):
            g = _Lark("""a: "a" a? """, start='a')
            x = g.parse('a')
            x = g.parse('aa')
            x = g.parse('aaa')

        def test_alias(self):
            g = _Lark("""start: "a" -> b """)
            x = g.parse('a')
            self.assertEqual(x.data, "b")

        def test_token_ebnf(self):
            g = _Lark("""start: A
                         A: "a"* ("b"? "c".."e")+
                      """)
            x = g.parse('abcde')
            x = g.parse('dd')

        def test_backslash(self):
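            # "\\" in a grammar string denotes one literal backslash; the
            # regexp form below seems to need four (one escaping level for
            # the grammar, one for the regexp engine) to match the same r'\a'.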
            g = _Lark(r"""start: "\\" "a"
                       """)
            x = g.parse(r'\a')

            g = _Lark(r"""start: /\\\\/ /a/
                       """)
            x = g.parse(r'\a')

        def test_backslash2(self):
            g = _Lark(r"""start: "\"" "-"
                       """)
            x = g.parse('"-')

            g = _Lark(r"""start: /\// /-/
                       """)
            x = g.parse('/-')

        # def test_token_recurse(self):
        #     g = _Lark("""start: A
        #                  A: B
        #                  B: A
        #               """)

        def test_empty(self):
            # Fails an Earley implementation without special handling for empty rules,
            # or re-processing of already completed rules.
            g = _Lark(r"""start: _empty a "B"
                          a: _empty "A"
                          _empty:
                       """)
            x = g.parse('AB')

        def test_lexer_token_limit(self):
            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
            tokens = {'A%d' % i: '"%d"' % i for i in range(300)}
            g = _Lark("""start: %s
                      %s""" % (' '.join(tokens), '\n'.join("%s: %s" % x for x in tokens.items())))

        def test_float_without_lexer(self):
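            # The float syntax is built from rules over single-character
            # strings rather than a FLOAT terminal, so it parses without
            # any custom lexer tokens.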
            g = _Lark("""start: ["+"|"-"] float
                         float: digit* "." digit+ exp?
                              | digit+ exp
                         exp: ("e"|"E") ["+"|"-"] digit+
                         digit: "0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"
                      """)
            g.parse("1.2")
            g.parse("-.2e9")
            g.parse("+2e-9")
            self.assertRaises(ParseError, g.parse, "+2e-9e")

    _NAME = "Test" + PARSER.capitalize() + (LEXER or 'Scanless').capitalize()
    _TestParser.__name__ = _NAME
    globals()[_NAME] = _TestParser


_TO_TEST = [
    ('standard', 'earley'),
    ('standard', 'lalr'),
    ('contextual', 'lalr'),
    (None, 'earley'),
]
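
# Each (LEXER, PARSER) pair generates one test class; _NAME renders a None
# lexer as 'Scanless', so the pairs above become TestEarleyStandard,
# TestLalrStandard, TestLalrContextual and TestEarleyScanless.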
for LEXER, PARSER in _TO_TEST:
    _make_parser_test(LEXER, PARSER)


if __name__ == '__main__':
    unittest.main()