From 724fa9213900ef32bc086dd1c6c21474f743bfed Mon Sep 17 00:00:00 2001 From: ptmcg Date: Sun, 10 Mar 2024 17:35:09 -0500 Subject: [PATCH] Refine unicode char set computations using set operations instead of str addition; add more type annotations; some black reformatting --- pyparsing/core.py | 28 ++++++++++----------------- pyparsing/unicode.py | 30 +++++++++++++++-------------- tests/test_diagram.py | 4 +++- tests/test_unit.py | 44 ++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 70 insertions(+), 36 deletions(-) diff --git a/pyparsing/core.py b/pyparsing/core.py index 09d0c09e..3bbeb8ac 100644 --- a/pyparsing/core.py +++ b/pyparsing/core.py @@ -218,19 +218,11 @@ def _should_enable_warnings( # build list of single arg builtins, that can be used as parse actions +# fmt: off _single_arg_builtins = { - sum, - len, - sorted, - reversed, - list, - tuple, - set, - any, - all, - min, - max, + sum, len, sorted, reversed, list, tuple, set, any, all, min, max } +# fmt: on _generatorType = types.GeneratorType ParseImplReturnType = Tuple[int, Any] @@ -255,13 +247,13 @@ def _should_enable_warnings( DebugExceptionAction = Callable[[str, int, "ParserElement", Exception, bool], None] -alphas = string.ascii_uppercase + string.ascii_lowercase -identchars = pyparsing_unicode.Latin1.identchars -identbodychars = pyparsing_unicode.Latin1.identbodychars -nums = "0123456789" -hexnums = nums + "ABCDEFabcdef" -alphanums = alphas + nums -printables = "".join([c for c in string.printable if c not in string.whitespace]) +alphas: str = string.ascii_uppercase + string.ascii_lowercase +identchars: str = pyparsing_unicode.Latin1.identchars +identbodychars: str = pyparsing_unicode.Latin1.identbodychars +nums: str = "0123456789" +hexnums: str = nums + "ABCDEFabcdef" +alphanums: str = alphas + nums +printables: str = "".join([c for c in string.printable if c not in string.whitespace]) _trim_arity_call_line: traceback.StackSummary = None # type: ignore[assignment] diff --git a/pyparsing/unicode.py b/pyparsing/unicode.py index 426b8b23..356a02f1 100644 --- a/pyparsing/unicode.py +++ b/pyparsing/unicode.py @@ -53,51 +53,51 @@ class CJK(Chinese, Japanese, Korean): _ranges: UnicodeRangeList = [] @_lazyclassproperty - def _chars_for_ranges(cls): + def _chars_for_ranges(cls) -> List[str]: ret = [] for cc in cls.__mro__: if cc is unicode_set: break for rr in getattr(cc, "_ranges", ()): ret.extend(range(rr[0], rr[-1] + 1)) - return [chr(c) for c in sorted(set(ret))] + return sorted(chr(c) for c in set(ret)) @_lazyclassproperty - def printables(cls): + def printables(cls) -> str: """all non-whitespace characters in this range""" return "".join(filterfalse(str.isspace, cls._chars_for_ranges)) @_lazyclassproperty - def alphas(cls): + def alphas(cls) -> str: """all alphabetic characters in this range""" return "".join(filter(str.isalpha, cls._chars_for_ranges)) @_lazyclassproperty - def nums(cls): + def nums(cls) -> str: """all numeric digit characters in this range""" return "".join(filter(str.isdigit, cls._chars_for_ranges)) @_lazyclassproperty - def alphanums(cls): + def alphanums(cls) -> str: """all alphanumeric characters in this range""" return cls.alphas + cls.nums @_lazyclassproperty - def identchars(cls): + def identchars(cls) -> str: """all characters in this range that are valid identifier characters, plus underscore '_'""" return "".join( sorted( - set( - "".join(filter(str.isidentifier, cls._chars_for_ranges)) - + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº" - + "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ" - + "_" + set(filter(str.isidentifier, cls._chars_for_ranges)) + | set( + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº" + "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ" + "_" ) ) ) @_lazyclassproperty - def identbodychars(cls): + def identbodychars(cls) -> str: """ all characters in this range that are valid identifier body characters, plus the digits 0-9, and · (Unicode MIDDLE DOT) @@ -105,7 +105,9 @@ def identbodychars(cls): identifier_chars = set( c for c in cls._chars_for_ranges if ("_" + c).isidentifier() ) - return "".join(sorted(identifier_chars | set(cls.identchars + "0123456789·"))) + return "".join( + sorted(identifier_chars | set(cls.identchars) | set("0123456789·")) + ) @_lazyclassproperty def identifier(cls): diff --git a/tests/test_diagram.py b/tests/test_diagram.py index 62a7a91c..66fd275b 100644 --- a/tests/test_diagram.py +++ b/tests/test_diagram.py @@ -212,6 +212,7 @@ def test_create_diagram_embed(self): def test_kwargs_pass_thru_create_diagram(self): from io import StringIO + # Creates a simple diagram with a blue body and # various other railroad features colored with # a complete disregard for taste @@ -302,10 +303,11 @@ def test_kwargs_pass_thru_create_diagram(self): vertical=6, show_results_names=True, css=DEFAULT_STYLE, - head=expStyle + head=expStyle, ) self.assertIn(expStyle, diag_html_capture.getvalue()) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_unit.py b/tests/test_unit.py index 8e1b42c4..3f47e71d 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -6,7 +6,7 @@ # Copyright 2002-2021, Paul McGuire # # - +import collections import contextlib import datetime import random @@ -25,6 +25,9 @@ from tests.json_parser_tests import test1, test2, test3, test4, test5 import platform +python_full_version = sys.version_info +python_version = python_full_version[:2] + ppc = pp.pyparsing_common ppt = pp.pyparsing_test @@ -7676,6 +7679,37 @@ def mock_set_trace(): def testUnicodeTests(self): ppu = pp.pyparsing_unicode + # verify ranges are converted to sets properly + for unicode_property, expected_length in [ + ("alphas", 48965), + ("alphanums", 49430), + ("identchars", 49013), + ("identbodychars", 50729), + ("printables", 65484), + ]: + charset = getattr(ppu.BMP, unicode_property) + charset_len = len(charset) + + if python_version >= (3, 9): + # this test is sensitive to the Unicode version used in specific + # python versions + with self.subTest(unicode_property=unicode_property, msg="verify len"): + print(f"ppu.BMP.{unicode_property:14}: {charset_len:6d}") + self.assertEqual( + charset_len, + expected_length, + f"incorrect number of ppu.BMP.{unicode_property}," + f" found {charset_len} expected {expected_length}", + ) + + with self.subTest(unicode_property=unicode_property, msg="verify unique"): + char_counts = collections.Counter(charset) + self.assertTrue( + all(count == 1 for count in char_counts.values()), + f"duplicate items found in ppu.BMP.{unicode_property}:" + f" {[c for c, count in char_counts.items() if count > 1]}", + ) + # verify proper merging of ranges by addition kanji_printables = ppu.Japanese.Kanji.printables katakana_printables = ppu.Japanese.Katakana.printables @@ -10226,7 +10260,9 @@ def run_subtest(fn_name, expr=None, args=""): enablePackrat disableMemoization enableLeftRecursion resetCache """.split() - if not (pp.ParserElement._packratEnabled or pp.ParserElement._left_recursion_enabled): + if not ( + pp.ParserElement._packratEnabled or pp.ParserElement._left_recursion_enabled + ): for name in parser_element_staticmethod_names: run_subtest(name) pp.ParserElement.disable_memoization() @@ -10234,7 +10270,9 @@ def run_subtest(fn_name, expr=None, args=""): run_subtest("setDefaultWhitespaceChars", args="' '") run_subtest("inlineLiteralsUsing", args="pp.Suppress") - run_subtest("setDefaultKeywordChars", expr="pp.Keyword('START')", args="'abcde'") + run_subtest( + "setDefaultKeywordChars", expr="pp.Keyword('START')", args="'abcde'" + ) class Test03_EnablePackratParsing(TestCase):