Skip to content

Commit

Permalink
Refine unicode char set computations using set operations instead of str addition; add more type annotations; some black reformatting
Browse files Browse the repository at this point in the history
  • Loading branch information
ptmcg committed Mar 10, 2024
1 parent fe91e53 commit 724fa92
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 36 deletions.
28 changes: 10 additions & 18 deletions pyparsing/core.py
Expand Up @@ -218,19 +218,11 @@ def _should_enable_warnings(


# build list of single arg builtins, that can be used as parse actions
# fmt: off
_single_arg_builtins = {
sum,
len,
sorted,
reversed,
list,
tuple,
set,
any,
all,
min,
max,
sum, len, sorted, reversed, list, tuple, set, any, all, min, max
}
# fmt: on

_generatorType = types.GeneratorType
ParseImplReturnType = Tuple[int, Any]
Expand All @@ -255,13 +247,13 @@ def _should_enable_warnings(
DebugExceptionAction = Callable[[str, int, "ParserElement", Exception, bool], None]


# Module-level character-set strings.

# ASCII letters: uppercase first, then lowercase
alphas: str = string.ascii_uppercase + string.ascii_lowercase
# identifier start / identifier body characters, taken from the Latin1 set
identchars: str = pyparsing_unicode.Latin1.identchars
identbodychars: str = pyparsing_unicode.Latin1.identbodychars
# decimal digits
nums: str = "0123456789"
# decimal digits plus the hex letters in both cases
hexnums: str = nums + "ABCDEFabcdef"
alphanums: str = alphas + nums
# every character of string.printable that is not whitespace
printables: str = "".join([c for c in string.printable if c not in string.whitespace])

_trim_arity_call_line: traceback.StackSummary = None # type: ignore[assignment]

Expand Down
30 changes: 16 additions & 14 deletions pyparsing/unicode.py
Expand Up @@ -53,59 +53,61 @@ class CJK(Chinese, Japanese, Korean):
_ranges: UnicodeRangeList = []

@_lazyclassproperty
def _chars_for_ranges(cls) -> List[str]:
    """
    Collect the characters covered by the ``_ranges`` of this class and of
    the classes that precede ``unicode_set`` in its MRO.

    Returns a sorted list of unique single-character strings.
    """
    code_points = set()
    for klass in cls.__mro__:
        # stop once the unicode_set base is reached; classes after it in
        # the MRO are not consulted
        if klass is unicode_set:
            break
        for rng in getattr(klass, "_ranges", ()):
            # first and last items of a range entry are inclusive bounds
            code_points.update(range(rng[0], rng[-1] + 1))
    # the set removes code points contributed by overlapping ranges
    return sorted(chr(c) for c in code_points)

@_lazyclassproperty
def printables(cls) -> str:
    """all non-whitespace characters in this range"""
    # keep every character for which str.isspace() is false
    return "".join(filterfalse(str.isspace, cls._chars_for_ranges))

@_lazyclassproperty
def alphas(cls) -> str:
    """all alphabetic characters in this range"""
    # keep every character for which str.isalpha() is true
    return "".join(filter(str.isalpha, cls._chars_for_ranges))

@_lazyclassproperty
def nums(cls) -> str:
    """all numeric digit characters in this range"""
    # keep every character for which str.isdigit() is true
    return "".join(filter(str.isdigit, cls._chars_for_ranges))

@_lazyclassproperty
def alphanums(cls) -> str:
    """all alphanumeric characters in this range"""
    # alphabetic characters first, followed by the digit characters
    return cls.alphas + cls.nums

@_lazyclassproperty
def identchars(cls) -> str:
    """all characters in this range that are valid identifier characters, plus underscore '_'"""
    # Characters of this range that are valid one-character identifiers.
    in_range = set(filter(str.isidentifier, cls._chars_for_ranges))
    # Letters and '_' that are always included, whatever the range.
    always_included = set(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
        "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
        "_"
    )
    # The set union removes duplicates; sorting gives a stable order.
    return "".join(sorted(in_range | always_included))

@_lazyclassproperty
def identbodychars(cls) -> str:
    """
    all characters in this range that are valid identifier body characters,
    plus the digits 0-9, and · (Unicode MIDDLE DOT)
    """
    # A character is a valid body character if it may follow a leading "_".
    identifier_chars = {
        c for c in cls._chars_for_ranges if ("_" + c).isidentifier()
    }
    # The set union removes characters shared between the three sources.
    return "".join(
        sorted(identifier_chars | set(cls.identchars) | set("0123456789·"))
    )

@_lazyclassproperty
def identifier(cls):
Expand Down
4 changes: 3 additions & 1 deletion tests/test_diagram.py
Expand Up @@ -212,6 +212,7 @@ def test_create_diagram_embed(self):

def test_kwargs_pass_thru_create_diagram(self):
from io import StringIO

# Creates a simple diagram with a blue body and
# various other railroad features colored with
# a complete disregard for taste
Expand Down Expand Up @@ -302,10 +303,11 @@ def test_kwargs_pass_thru_create_diagram(self):
vertical=6,
show_results_names=True,
css=DEFAULT_STYLE,
head=expStyle
head=expStyle,
)

self.assertIn(expStyle, diag_html_capture.getvalue())


if __name__ == "__main__":
unittest.main()
44 changes: 41 additions & 3 deletions tests/test_unit.py
Expand Up @@ -6,7 +6,7 @@
# Copyright 2002-2021, Paul McGuire
#
#

import collections
import contextlib
import datetime
import random
Expand All @@ -25,6 +25,9 @@
from tests.json_parser_tests import test1, test2, test3, test4, test5
import platform

# Interpreter version details; the (major, minor) pair is used to gate
# tests whose results depend on the running Python version.
python_full_version = sys.version_info
python_version = (python_full_version.major, python_full_version.minor)

ppc = pp.pyparsing_common
ppt = pp.pyparsing_test

Expand Down Expand Up @@ -7676,6 +7679,37 @@ def mock_set_trace():
def testUnicodeTests(self):
ppu = pp.pyparsing_unicode

# verify ranges are converted to sets properly
for unicode_property, expected_length in [
("alphas", 48965),
("alphanums", 49430),
("identchars", 49013),
("identbodychars", 50729),
("printables", 65484),
]:
charset = getattr(ppu.BMP, unicode_property)
charset_len = len(charset)

if python_version >= (3, 9):
# this test is sensitive to the Unicode version used in specific
# python versions
with self.subTest(unicode_property=unicode_property, msg="verify len"):
print(f"ppu.BMP.{unicode_property:14}: {charset_len:6d}")
self.assertEqual(
charset_len,
expected_length,
f"incorrect number of ppu.BMP.{unicode_property},"
f" found {charset_len} expected {expected_length}",
)

with self.subTest(unicode_property=unicode_property, msg="verify unique"):
char_counts = collections.Counter(charset)
self.assertTrue(
all(count == 1 for count in char_counts.values()),
f"duplicate items found in ppu.BMP.{unicode_property}:"
f" {[c for c, count in char_counts.items() if count > 1]}",
)

# verify proper merging of ranges by addition
kanji_printables = ppu.Japanese.Kanji.printables
katakana_printables = ppu.Japanese.Katakana.printables
Expand Down Expand Up @@ -10226,15 +10260,19 @@ def run_subtest(fn_name, expr=None, args=""):
enablePackrat disableMemoization enableLeftRecursion resetCache
""".split()

if not (pp.ParserElement._packratEnabled or pp.ParserElement._left_recursion_enabled):
if not (
pp.ParserElement._packratEnabled or pp.ParserElement._left_recursion_enabled
):
for name in parser_element_staticmethod_names:
run_subtest(name)
pp.ParserElement.disable_memoization()

run_subtest("setDefaultWhitespaceChars", args="' '")
run_subtest("inlineLiteralsUsing", args="pp.Suppress")

run_subtest("setDefaultKeywordChars", expr="pp.Keyword('START')", args="'abcde'")
run_subtest(
"setDefaultKeywordChars", expr="pp.Keyword('START')", args="'abcde'"
)


class Test03_EnablePackratParsing(TestCase):
Expand Down

0 comments on commit 724fa92

Please sign in to comment.