| @@ -6,7 +6,7 @@ from copy import copy, deepcopy | |||||
| from io import open | from io import open | ||||
| import pkgutil | import pkgutil | ||||
| from .utils import bfs, eval_escaping, Py36, logger, classify_bool, isalnum, isalpha | |||||
| from .utils import bfs, eval_escaping, Py36, logger, classify_bool, is_id_continue, isalpha | |||||
| from .lexer import Token, TerminalDef, PatternStr, PatternRE | from .lexer import Token, TerminalDef, PatternStr, PatternRE | ||||
| from .parse_tree_builder import ParseTreeBuilder | from .parse_tree_builder import ParseTreeBuilder | ||||
| @@ -328,10 +328,8 @@ class PrepareAnonTerminals(Transformer_InPlace): | |||||
| try: | try: | ||||
| term_name = _TERMINAL_NAMES[value] | term_name = _TERMINAL_NAMES[value] | ||||
| except KeyError: | except KeyError: | ||||
| if isalnum(value) and isalpha(value[0]) and value.upper() not in self.term_set: | |||||
| with suppress(UnicodeEncodeError): | |||||
| value.upper().encode('utf8') # Why shouldn't we have unicode in our terminal names? | |||||
| term_name = value.upper() | |||||
| if is_id_continue(value) and isalpha(value[0]) and value.upper() not in self.term_set: | |||||
| term_name = value.upper() | |||||
| if term_name in self.term_set: | if term_name in self.term_set: | ||||
| term_name = None | term_name = None | ||||
| @@ -8,7 +8,7 @@ from .lexer import Token, PatternStr | |||||
| from .grammar import Terminal, NonTerminal | from .grammar import Terminal, NonTerminal | ||||
| from .tree_matcher import TreeMatcher, is_discarded_terminal | from .tree_matcher import TreeMatcher, is_discarded_terminal | ||||
| from .utils import isalnum | |||||
| from .utils import is_id_continue | |||||
| def is_iter_empty(i): | def is_iter_empty(i): | ||||
| try: | try: | ||||
| @@ -94,7 +94,7 @@ class Reconstructor(TreeMatcher): | |||||
| y = [] | y = [] | ||||
| prev_item = '' | prev_item = '' | ||||
| for item in x: | for item in x: | ||||
| if prev_item and item and isalnum(prev_item[-1]) and isalnum(item[0]): | |||||
| if prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]): | |||||
| y.append(' ') | y.append(' ') | ||||
| y.append(item) | y.append(item) | ||||
| prev_item = item | prev_item = item | ||||
| @@ -13,9 +13,13 @@ logger.addHandler(logging.StreamHandler()) | |||||
| # By default, we should not output any log messages | # By default, we should not output any log messages | ||||
| logger.setLevel(logging.CRITICAL) | logger.setLevel(logging.CRITICAL) | ||||
| def isalnum(x): | |||||
| def is_id_continue(x): | |||||
| """ | |||||
| Checks if all characters in `x` are alphanumeric characters (Unicode standard, so diacritics, Indian vowels, non-Latin | |||||
| numbers, etc. all pass) or connector punctuation such as `_`. Corresponds to Python's `ID_CONTINUE` identifier class. | |||||
| """ | |||||
| if len(x) != 1: | if len(x) != 1: | ||||
| return all(isalnum(y) for y in x) | |||||
| return all(is_id_continue(y) for y in x) | |||||
| return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'] | return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'] | ||||