Skip to content

Commit

Permalink
Added BMP unicode_set for the Unicode Basic Multilingual Plane (issue #392)
Browse files Browse the repository at this point in the history
  • Loading branch information
ptmcg committed Apr 28, 2022
1 parent b1fff2e commit d6f5655
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 5 deletions.
5 changes: 5 additions & 0 deletions CHANGES
Expand Up @@ -4,6 +4,11 @@ Change Log

Version 3.0.9 - (in development)
---------------
- Added Unicode set BMP representing the Basic Multilingual Plane
(Unicode characters up to code point 65535). Can be used to parse
most language characters, but omits emojis, wingdings, etc.
Raised in discussion with Dave Tapley (issue #392).

- To address mypy confusion of pyparsing.Optional and typing.Optional
resulting in `error: "_SpecialForm" not callable` message
reported in issue #365, fixed the import in exceptions.py. Nice
Expand Down
2 changes: 2 additions & 0 deletions docs/pyparsing_class_diagrm.puml
Expand Up @@ -318,9 +318,11 @@ class Hangul
class Arabic
class Devanagari
class Hebrew
class BMP
unicode_set <|-- Latin1
unicode_set <|--- LatinA
unicode_set <|-- LatinB
unicode_set <|-- BMP
unicode_set <|-- Greek
unicode_set <|--- Cyrillic
unicode_set <|--- Chinese
Expand Down
28 changes: 23 additions & 5 deletions pyparsing/unicode.py
Expand Up @@ -120,7 +120,18 @@ class pyparsing_unicode(unicode_set):
A namespace class for defining common language unicode_sets.
"""

# fmt: off

# define ranges in language character sets
# Single range: from the space character (0x0020) up to sys.maxunicode.
# (A second, identical assignment of _ranges was redundant and is removed.)
_ranges: UnicodeRangeList = [
    (0x0020, sys.maxunicode),
]

class BMP(unicode_set):
    "Unicode set covering the Basic Multilingual Plane (code points up to 0xFFFF)"
    # one range: from the space character (0x0020) through 0xFFFF
    _ranges: UnicodeRangeList = [(0x0020, 0xFFFF)]

class Latin1(unicode_set):
"Unicode set for Latin-1 Unicode Character Range"
Expand Down Expand Up @@ -278,11 +289,13 @@ class Hangul(unicode_set):

class CJK(Chinese, Japanese, Hangul):
    "Unicode set combining the Chinese, Japanese, and Hangul (Korean) Unicode Character Ranges"
    # No body beyond the docstring: everything comes from the three base classes.

class Thai(unicode_set):
    "Unicode set for Thai Unicode Character Range"
    # Two separate spans of code points; _ranges is assigned exactly once
    # (the earlier duplicate assignment with the same values was dead code).
    _ranges: UnicodeRangeList = [
        (0x0E01, 0x0E3A),
        (0x0E3F, 0x0E5B),
    ]

class Arabic(unicode_set):
"Unicode set for Arabic Unicode Character Range"
Expand All @@ -308,7 +321,12 @@ class Hebrew(unicode_set):

class Devanagari(unicode_set):
    "Unicode set for Devanagari Unicode Character Range"
    # Two separate spans of code points; _ranges is assigned exactly once
    # (the earlier duplicate assignment with the same values was dead code).
    _ranges: UnicodeRangeList = [
        (0x0900, 0x097F),
        (0xA8E0, 0xA8FF),
    ]

# fmt: on


pyparsing_unicode.Japanese._ranges = (
Expand All @@ -317,7 +335,7 @@ class Devanagari(unicode_set):
+ pyparsing_unicode.Japanese.Katakana._ranges
)

# define ranges in language character sets
# add aliases for the language sets, each named in that language's own script
pyparsing_unicode.العربية = pyparsing_unicode.Arabic
pyparsing_unicode.中文 = pyparsing_unicode.Chinese
pyparsing_unicode.кириллица = pyparsing_unicode.Cyrillic
Expand Down
29 changes: 29 additions & 0 deletions tests/test_unit.py
Expand Up @@ -9,6 +9,7 @@

import contextlib
import datetime
import random
import re
import sys
import warnings
Expand Down Expand Up @@ -7127,6 +7128,34 @@ class Turkish_set(ppu.Latin1, ppu.LatinA):
msg="Failed to parse Turkish key-value pairs",
)

# Basic Multilingual Plane only contains chars up to 65535
def filter_16_bit(s):
    """Return the characters of *s* whose code point is below 2**16, joined into a string."""
    bmp_limit = 0x10000  # same value as 2**16
    kept = [ch for ch in s if ord(ch) < bmp_limit]
    return "".join(kept)

bmp_printables = ppu.BMP.printables
sample = (
"".join(
random.choice(filter_16_bit(unicode_set.printables))
for unicode_set in (
ppu.Japanese,
Turkish_set,
ppu.Greek,
ppu.Hebrew,
ppu.Devanagari,
ppu.Hangul,
ppu.Latin1,
ppu.Chinese,
ppu.Cyrillic,
ppu.Arabic,
ppu.Thai,
)
* 8
)
+ "\N{REPLACEMENT CHARACTER}"
)
print(sample)
self.assertParseAndCheckList(pp.Word(bmp_printables), sample, [sample])

# Make sure example in indentedBlock docstring actually works!
def testIndentedBlockExample(self):

Expand Down

0 comments on commit d6f5655

Please sign in to comment.