diff --git a/CHANGES b/CHANGES
index 6e6cddbd..d385bd1e 100644
--- a/CHANGES
+++ b/CHANGES
@@ -4,6 +4,11 @@ Change Log
 
 Version 3.0.9 - (in development)
 ---------------
+- Added Unicode set BMP representing the Basic Multilingual Plane
+  (Unicode characters up to code point 65535). Can be used to parse
+  most language characters, but omits emojis, wingdings, etc.
+  Raised in discussion with Dave Tapley (issue #392).
+
 - To address mypy confusion of pyparsing.Optional and typing.Optional
   resulting in `error: "_SpecialForm" not callable` message
   reported in issue #365, fixed the import in exceptions.py. Nice
diff --git a/docs/pyparsing_class_diagrm.puml b/docs/pyparsing_class_diagrm.puml
index 97520b74..61501c0a 100644
--- a/docs/pyparsing_class_diagrm.puml
+++ b/docs/pyparsing_class_diagrm.puml
@@ -318,9 +318,11 @@ class Hangul
 class Arabic
 class Devanagari
 class Hebrew
+class BMP
 unicode_set <|-- Latin1
 unicode_set <|--- LatinA
 unicode_set <|-- LatinB
+unicode_set <|-- BMP
 unicode_set <|-- Greek
 unicode_set <|--- Cyrillic
 unicode_set <|--- Chinese
diff --git a/pyparsing/unicode.py b/pyparsing/unicode.py
index 92261487..cd6e4827 100644
--- a/pyparsing/unicode.py
+++ b/pyparsing/unicode.py
@@ -120,7 +120,18 @@ class pyparsing_unicode(unicode_set):
     A namespace class for defining common language unicode_sets.
     """
 
-    _ranges: UnicodeRangeList = [(32, sys.maxunicode)]
+    # fmt: off
+
+    # define ranges in language character sets
+    _ranges: UnicodeRangeList = [
+        (0x0020, sys.maxunicode),
+    ]
+
+    class BMP(unicode_set):
+        "Unicode set for the Basic Multilingual Plane"
+        _ranges: UnicodeRangeList = [
+            (0x0020, 0xFFFF),
+        ]
 
     class Latin1(unicode_set):
         "Unicode set for Latin-1 Unicode Character Range"
@@ -278,11 +289,13 @@ class Hangul(unicode_set):
 
     class CJK(Chinese, Japanese, Hangul):
         "Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"
-        pass
 
     class Thai(unicode_set):
         "Unicode set for Thai Unicode Character Range"
-        _ranges: UnicodeRangeList = [(0x0E01, 0x0E3A), (0x0E3F, 0x0E5B)]
+        _ranges: UnicodeRangeList = [
+            (0x0E01, 0x0E3A),
+            (0x0E3F, 0x0E5B),
+        ]
 
     class Arabic(unicode_set):
         "Unicode set for Arabic Unicode Character Range"
@@ -308,7 +321,12 @@ class Hebrew(unicode_set):
 
     class Devanagari(unicode_set):
         "Unicode set for Devanagari Unicode Character Range"
-        _ranges: UnicodeRangeList = [(0x0900, 0x097F), (0xA8E0, 0xA8FF)]
+        _ranges: UnicodeRangeList = [
+            (0x0900, 0x097F),
+            (0xA8E0, 0xA8FF),
+        ]
+
+    # fmt: on
 
 
 pyparsing_unicode.Japanese._ranges = (
@@ -317,7 +335,7 @@ class Devanagari(unicode_set):
     + pyparsing_unicode.Japanese.Katakana._ranges
 )
 
-# define ranges in language character sets
+# add language identifiers using language Unicode
 pyparsing_unicode.العربية = pyparsing_unicode.Arabic
 pyparsing_unicode.中文 = pyparsing_unicode.Chinese
 pyparsing_unicode.кириллица = pyparsing_unicode.Cyrillic
diff --git a/tests/test_unit.py b/tests/test_unit.py
index e8ffc3c6..59fab7b0 100644
--- a/tests/test_unit.py
+++ b/tests/test_unit.py
@@ -9,6 +9,7 @@
 
 import contextlib
 import datetime
+import random
 import re
 import sys
 import warnings
@@ -7127,6 +7128,34 @@ class Turkish_set(ppu.Latin1, ppu.LatinA):
             msg="Failed to parse Turkish key-value pairs",
         )
 
+        # Basic Multilingual Plane only contains chars up to 65535
+        def filter_16_bit(s):
+            return "".join(c for c in s if ord(c) < 2**16)
+
+        bmp_printables = ppu.BMP.printables
+        sample = (
+            "".join(
+                random.choice(filter_16_bit(unicode_set.printables))
+                for unicode_set in (
+                    ppu.Japanese,
+                    Turkish_set,
+                    ppu.Greek,
+                    ppu.Hebrew,
+                    ppu.Devanagari,
+                    ppu.Hangul,
+                    ppu.Latin1,
+                    ppu.Chinese,
+                    ppu.Cyrillic,
+                    ppu.Arabic,
+                    ppu.Thai,
+                )
+                * 8
+            )
+            + "\N{REPLACEMENT CHARACTER}"
+        )
+        print(sample)
+        self.assertParseAndCheckList(pp.Word(bmp_printables), sample, [sample])
+
     # Make sure example in indentedBlock docstring actually works!
     def testIndentedBlockExample(self):