Refine unicode char set computations using set operations instead of str addition; add more type annotations; some black reformatting
pyparsing · Mar 10, 2024 · 724fa92 · 724fa92
1 parent fe91e53
commit 724fa92
Show file tree

Hide file tree

Showing 4 changed files with 70 additions and 36 deletions.
diff --git a/pyparsing/core.py b/pyparsing/core.py
@@ -218,19 +218,11 @@ def _should_enable_warnings(
 
 
 # build list of single arg builtins, that can be used as parse actions
+# fmt: off
 _single_arg_builtins = {
-    sum,
-    len,
-    sorted,
-    reversed,
-    list,
-    tuple,
-    set,
-    any,
-    all,
-    min,
-    max,
+    sum, len, sorted, reversed, list, tuple, set, any, all, min, max
 }
+# fmt: on
 
 _generatorType = types.GeneratorType
 ParseImplReturnType = Tuple[int, Any]
@@ -255,13 +247,13 @@ def _should_enable_warnings(
 DebugExceptionAction = Callable[[str, int, "ParserElement", Exception, bool], None]
 
 
-alphas = string.ascii_uppercase + string.ascii_lowercase
-identchars = pyparsing_unicode.Latin1.identchars
-identbodychars = pyparsing_unicode.Latin1.identbodychars
-nums = "0123456789"
-hexnums = nums + "ABCDEFabcdef"
-alphanums = alphas + nums
-printables = "".join([c for c in string.printable if c not in string.whitespace])
+alphas: str = string.ascii_uppercase + string.ascii_lowercase
+identchars: str = pyparsing_unicode.Latin1.identchars
+identbodychars: str = pyparsing_unicode.Latin1.identbodychars
+nums: str = "0123456789"
+hexnums: str = nums + "ABCDEFabcdef"
+alphanums: str = alphas + nums
+printables: str = "".join([c for c in string.printable if c not in string.whitespace])
 
 _trim_arity_call_line: traceback.StackSummary = None  # type: ignore[assignment]
 

diff --git a/pyparsing/unicode.py b/pyparsing/unicode.py
@@ -53,59 +53,61 @@ class CJK(Chinese, Japanese, Korean):
     _ranges: UnicodeRangeList = []
 
     @_lazyclassproperty
-    def _chars_for_ranges(cls):
+    def _chars_for_ranges(cls) -> List[str]:
         ret = []
         for cc in cls.__mro__:
             if cc is unicode_set:
                 break
             for rr in getattr(cc, "_ranges", ()):
                 ret.extend(range(rr[0], rr[-1] + 1))
-        return [chr(c) for c in sorted(set(ret))]
+        return sorted(chr(c) for c in set(ret))
 
     @_lazyclassproperty
-    def printables(cls):
+    def printables(cls) -> str:
         """all non-whitespace characters in this range"""
         return "".join(filterfalse(str.isspace, cls._chars_for_ranges))
 
     @_lazyclassproperty
-    def alphas(cls):
+    def alphas(cls) -> str:
         """all alphabetic characters in this range"""
         return "".join(filter(str.isalpha, cls._chars_for_ranges))
 
     @_lazyclassproperty
-    def nums(cls):
+    def nums(cls) -> str:
         """all numeric digit characters in this range"""
         return "".join(filter(str.isdigit, cls._chars_for_ranges))
 
     @_lazyclassproperty
-    def alphanums(cls):
+    def alphanums(cls) -> str:
         """all alphanumeric characters in this range"""
         return cls.alphas + cls.nums
 
     @_lazyclassproperty
-    def identchars(cls):
+    def identchars(cls) -> str:
         """all characters in this range that are valid identifier characters, plus underscore '_'"""
         return "".join(
             sorted(
-                set(
-                    "".join(filter(str.isidentifier, cls._chars_for_ranges))
-                    + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
-                    + "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
-                    + "_"
+                set(filter(str.isidentifier, cls._chars_for_ranges))
+                | set(
+                    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
+                    "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
+                    "_"
                 )
             )
         )
 
     @_lazyclassproperty
-    def identbodychars(cls):
+    def identbodychars(cls) -> str:
         """
         all characters in this range that are valid identifier body characters,
         plus the digits 0-9, and · (Unicode MIDDLE DOT)
         """
         identifier_chars = set(
             c for c in cls._chars_for_ranges if ("_" + c).isidentifier()
         )
-        return "".join(sorted(identifier_chars | set(cls.identchars + "0123456789·")))
+        return "".join(
+            sorted(identifier_chars | set(cls.identchars) | set("0123456789·"))
+        )
 
     @_lazyclassproperty
     def identifier(cls):

diff --git a/tests/test_diagram.py b/tests/test_diagram.py
@@ -212,6 +212,7 @@ def test_create_diagram_embed(self):
 
     def test_kwargs_pass_thru_create_diagram(self):
         from io import StringIO
+
         # Creates a simple diagram with a blue body and
         # various other railroad features colored with
         # a complete disregard for taste
@@ -302,10 +303,11 @@ def test_kwargs_pass_thru_create_diagram(self):
             vertical=6,
             show_results_names=True,
             css=DEFAULT_STYLE,
-            head=expStyle
+            head=expStyle,
         )
 
         self.assertIn(expStyle, diag_html_capture.getvalue())
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_unit.py b/tests/test_unit.py
@@ -6,7 +6,7 @@
 # Copyright 2002-2021, Paul McGuire
 #
 #
-
+import collections
 import contextlib
 import datetime
 import random
@@ -25,6 +25,9 @@
 from tests.json_parser_tests import test1, test2, test3, test4, test5
 import platform
 
+python_full_version = sys.version_info
+python_version = python_full_version[:2]
+
 ppc = pp.pyparsing_common
 ppt = pp.pyparsing_test
 
@@ -7676,6 +7679,37 @@ def mock_set_trace():
     def testUnicodeTests(self):
         ppu = pp.pyparsing_unicode
 
+        # verify ranges are converted to sets properly
+        for unicode_property, expected_length in [
+            ("alphas", 48965),
+            ("alphanums", 49430),
+            ("identchars", 49013),
+            ("identbodychars", 50729),
+            ("printables", 65484),
+        ]:
+            charset = getattr(ppu.BMP, unicode_property)
+            charset_len = len(charset)
+
+            if python_version >= (3, 9):
+                # this test is sensitive to the Unicode version used in specific
+                # python versions
+                with self.subTest(unicode_property=unicode_property, msg="verify len"):
+                    print(f"ppu.BMP.{unicode_property:14}: {charset_len:6d}")
+                    self.assertEqual(
+                        charset_len,
+                        expected_length,
+                        f"incorrect number of ppu.BMP.{unicode_property},"
+                        f" found {charset_len} expected {expected_length}",
+                    )
+
+            with self.subTest(unicode_property=unicode_property, msg="verify unique"):
+                char_counts = collections.Counter(charset)
+                self.assertTrue(
+                    all(count == 1 for count in char_counts.values()),
+                    f"duplicate items found in ppu.BMP.{unicode_property}:"
+                    f" {[c for c, count in char_counts.items() if count > 1]}",
+                )
+
         # verify proper merging of ranges by addition
         kanji_printables = ppu.Japanese.Kanji.printables
         katakana_printables = ppu.Japanese.Katakana.printables
@@ -10226,15 +10260,19 @@ def run_subtest(fn_name, expr=None, args=""):
             enablePackrat disableMemoization enableLeftRecursion resetCache
         """.split()
 
-        if not (pp.ParserElement._packratEnabled or pp.ParserElement._left_recursion_enabled):
+        if not (
+            pp.ParserElement._packratEnabled or pp.ParserElement._left_recursion_enabled
+        ):
             for name in parser_element_staticmethod_names:
                 run_subtest(name)
         pp.ParserElement.disable_memoization()
 
         run_subtest("setDefaultWhitespaceChars", args="' '")
         run_subtest("inlineLiteralsUsing", args="pp.Suppress")
 
-        run_subtest("setDefaultKeywordChars", expr="pp.Keyword('START')", args="'abcde'")
+        run_subtest(
+            "setDefaultKeywordChars", expr="pp.Keyword('START')", args="'abcde'"
+        )
 
 
 class Test03_EnablePackratParsing(TestCase):