Added BMP unicode_set for the Unicode Basic Multilingual Plane (issue #392)
pyparsing · Apr 28, 2022 · d6f5655 · d6f5655
1 parent b1fff2e
commit d6f5655
Show file tree

Hide file tree

Showing 4 changed files with 59 additions and 5 deletions.
diff --git a/CHANGES b/CHANGES
@@ -4,6 +4,11 @@ Change Log
 
 Version 3.0.9 - (in development)
 ---------------
+- Added Unicode set BMP representing the Basic Multilingual Plane
+  (Unicode characters up to code point 65535). Can be used to parse
+  most language characters, but omits emojis, wingdings, etc.
+  Raised in discussion with Dave Tapley (issue #392).
+
 - To address mypy confusion of pyparsing.Optional and typing.Optional
   resulting in `error: "_SpecialForm" not callable` message
   reported in issue #365, fixed the import in exceptions.py. Nice

diff --git a/docs/pyparsing_class_diagrm.puml b/docs/pyparsing_class_diagrm.puml
@@ -318,9 +318,11 @@ class Hangul
 class Arabic
 class Devanagari
 class Hebrew
+class BMP
 unicode_set <|-- Latin1
 unicode_set <|--- LatinA
 unicode_set <|-- LatinB
+unicode_set <|-- BMP
 unicode_set <|-- Greek
 unicode_set <|--- Cyrillic
 unicode_set <|--- Chinese

diff --git a/pyparsing/unicode.py b/pyparsing/unicode.py
@@ -120,7 +120,18 @@ class pyparsing_unicode(unicode_set):
     A namespace class for defining common language unicode_sets.
     """
 
-    _ranges: UnicodeRangeList = [(32, sys.maxunicode)]
+    # fmt: off
+
+    # define ranges in language character sets
+    _ranges: UnicodeRangeList = [
+        (0x0020, sys.maxunicode),
+    ]
+
+    class BMP(unicode_set):
+        "Unicode set for the Basic Multilingual Plane"
+        _ranges: UnicodeRangeList = [
+            (0x0020, 0xFFFF),
+        ]
 
     class Latin1(unicode_set):
         "Unicode set for Latin-1 Unicode Character Range"
@@ -278,11 +289,13 @@ class Hangul(unicode_set):
 
     class CJK(Chinese, Japanese, Hangul):
         "Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"
-        pass
 
     class Thai(unicode_set):
         "Unicode set for Thai Unicode Character Range"
-        _ranges: UnicodeRangeList = [(0x0E01, 0x0E3A), (0x0E3F, 0x0E5B)]
+        _ranges: UnicodeRangeList = [
+            (0x0E01, 0x0E3A),
+            (0x0E3F, 0x0E5B)
+        ]
 
     class Arabic(unicode_set):
         "Unicode set for Arabic Unicode Character Range"
@@ -308,7 +321,12 @@ class Hebrew(unicode_set):
 
     class Devanagari(unicode_set):
         "Unicode set for Devanagari Unicode Character Range"
-        _ranges: UnicodeRangeList = [(0x0900, 0x097F), (0xA8E0, 0xA8FF)]
+        _ranges: UnicodeRangeList = [
+            (0x0900, 0x097F),
+            (0xA8E0, 0xA8FF)
+        ]
+
+    # fmt: on
 
 
 pyparsing_unicode.Japanese._ranges = (
@@ -317,7 +335,7 @@ class Devanagari(unicode_set):
     + pyparsing_unicode.Japanese.Katakana._ranges
 )
 
-# define ranges in language character sets
+# add language identifiers using language Unicode
 pyparsing_unicode.العربية = pyparsing_unicode.Arabic
 pyparsing_unicode.中文 = pyparsing_unicode.Chinese
 pyparsing_unicode.кириллица = pyparsing_unicode.Cyrillic

diff --git a/tests/test_unit.py b/tests/test_unit.py
@@ -9,6 +9,7 @@
 
 import contextlib
 import datetime
+import random
 import re
 import sys
 import warnings
@@ -7127,6 +7128,34 @@ class Turkish_set(ppu.Latin1, ppu.LatinA):
             msg="Failed to parse Turkish key-value pairs",
         )
 
+        # Basic Multilingual Plane only contains chars up to 65535
+        def filter_16_bit(s):
+            return "".join(c for c in s if ord(c) < 2**16)
+
+        bmp_printables = ppu.BMP.printables
+        sample = (
+            "".join(
+                random.choice(filter_16_bit(unicode_set.printables))
+                for unicode_set in (
+                    ppu.Japanese,
+                    Turkish_set,
+                    ppu.Greek,
+                    ppu.Hebrew,
+                    ppu.Devanagari,
+                    ppu.Hangul,
+                    ppu.Latin1,
+                    ppu.Chinese,
+                    ppu.Cyrillic,
+                    ppu.Arabic,
+                    ppu.Thai,
+                )
+                * 8
+            )
+            + "\N{REPLACEMENT CHARACTER}"
+        )
+        print(sample)
+        self.assertParseAndCheckList(pp.Word(bmp_printables), sample, [sample])
+
     # Make sure example in indentedBlock docstring actually works!
     def testIndentedBlockExample(self):