Skip to content

Commit

Permalink
Fixing Unicode block range in examples/booleansearchparser.py (#342)
Browse files Browse the repository at this point in the history
* Updated range for unicode blocks, updated add char logic to include last character in each block, update some function names

* Test case for CJK block and last character in block
  • Loading branch information
tc-yu committed Dec 15, 2021
1 parent 2f633f4 commit d72bd46
Showing 1 changed file with 59 additions and 25 deletions.
84 changes: 59 additions & 25 deletions examples/booleansearchparser.py
Expand Up @@ -84,34 +84,37 @@
from pyparsing import (
Word,
alphanums,
Keyword,
CaselessKeyword,
Group,
Forward,
Suppress,
OneOrMore,
oneOf,
one_of,
)
import re


# Updated on 02 Dec 2021 according to ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt
alphabet_ranges = [
##CYRILIC: https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block)
# CYRILIC: https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block)
[int("0400", 16), int("04FF", 16)],
##THAI: https://en.wikipedia.org/wiki/Thai_(Unicode_block)
[int("0E00", 16), int("0E7F", 16)],
##ARABIC: https://en.wikipedia.org/wiki/Arabic_(Unicode_block) (Arabic (0600–06FF)+ Syriac (0700–074F)+ Arabic Supplement (0750–077F) )
# ARABIC: https://en.wikipedia.org/wiki/Arabic_(Unicode_block) (Arabic (0600–06FF)+ Syriac (0700–074F)+ Arabic Supplement (0750–077F))
[int("0600", 16), int("07FF", 16)],
##CHINESE: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
[int("0400", 16), int("09FF", 16)],
# JAPANESE : https://en.wikipedia.org/wiki/Japanese_writing_system
# THAI: https://en.wikipedia.org/wiki/Thai_(Unicode_block)
[int("0E00", 16), int("0E7F", 16)],
# JAPANESE : https://en.wikipedia.org/wiki/Japanese_writing_system (Hiragana (3040–309F) + Katakana (30A0–30FF))
[int("3040", 16), int("30FF", 16)],
# Enclosed CJK Letters and Months
[int("3200", 16), int("32FF", 16)],
# CHINESE: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
[int("4E00", 16), int("9FFF", 16)],
# KOREAN : https://en.wikipedia.org/wiki/Hangul
[int("AC00", 16), int("D7AF", 16)],
[int("1100", 16), int("11FF", 16)],
[int("3130", 16), int("318F", 16)],
[int("3200", 16), int("32FF", 16)],
[int("A960", 16), int("A97F", 16)],
[int("AC00", 16), int("D7AF", 16)],
[int("D7B0", 16), int("D7FF", 16)],
# Halfwidth and Fullwidth Forms
[int("FF00", 16), int("FFEF", 16)],
]

Expand Down Expand Up @@ -152,31 +155,31 @@ def parser(self):
alphabet = alphanums

# support for non-western alphabets
for r in alphabet_ranges:
alphabet += "".join(chr(c) for c in range(*r) if not chr(c).isspace())
for lo, hi in alphabet_ranges:
alphabet += "".join(chr(c) for c in range(lo, hi + 1) if not chr(c).isspace())

operatorWord = Group(Word(alphabet + "*")).setResultsName("word*")
operatorWord = Group(Word(alphabet + "*")).set_results_name("word*")

operatorQuotesContent = Forward()
operatorQuotesContent << ((operatorWord + operatorQuotesContent) | operatorWord)

operatorQuotes = (
Group(Suppress('"') + operatorQuotesContent + Suppress('"')).setResultsName(
Group(Suppress('"') + operatorQuotesContent + Suppress('"')).set_results_name(
"quotes"
)
| operatorWord
)

operatorParenthesis = (
Group(Suppress("(") + operatorOr + Suppress(")")).setResultsName(
Group(Suppress("(") + operatorOr + Suppress(")")).set_results_name(
"parenthesis"
)
| operatorQuotes
)

operatorNot = Forward()
operatorNot << (
Group(Suppress(Keyword("not", caseless=True)) + operatorNot).setResultsName(
Group(Suppress(CaselessKeyword("not")) + operatorNot).set_results_name(
"not"
)
| operatorParenthesis
Expand All @@ -185,22 +188,22 @@ def parser(self):
operatorAnd = Forward()
operatorAnd << (
Group(
operatorNot + Suppress(Keyword("and", caseless=True)) + operatorAnd
).setResultsName("and")
operatorNot + Suppress(CaselessKeyword("and")) + operatorAnd
).set_results_name("and")
| Group(
operatorNot + OneOrMore(~oneOf("and or") + operatorAnd)
).setResultsName("and")
operatorNot + OneOrMore(~one_of("and or") + operatorAnd)
).set_results_name("and")
| operatorNot
)

operatorOr << (
Group(
operatorAnd + Suppress(Keyword("or", caseless=True)) + operatorOr
).setResultsName("or")
operatorAnd + Suppress(CaselessKeyword("or")) + operatorOr
).set_results_name("or")
| operatorAnd
)

return operatorOr.parseString
return operatorOr.parse_string

def evaluateAnd(self, argument):
return all(self.evaluate(arg) for arg in argument)
Expand All @@ -217,7 +220,7 @@ def evaluateParenthesis(self, argument):
def evaluateQuotes(self, argument):
"""Evaluate quoted strings
First is does an 'and' on the indidual search terms, then it asks the
First is does an 'and' on the individual search terms, then it asks the
function GetQuoted to only return the subset of ID's that contain the
literal string.
"""
Expand Down Expand Up @@ -461,6 +464,37 @@ def Test(self):

all_ok = all_ok and test_passed

# Tests for non western characters, should fail with
# pyparsing.exceptions.ParseException under the previous
# configuration
non_western_exprs = {
"0": "*",
"1": "ヿ", # Edge character
"2": "亀", # Character in CJK block
"3": "ヿ or 亀",
"4": "ヿ and 亀",
"5": "not ヿ"
}

non_western_texts_matcheswith = {
"안녕하세요, 당신은 어떠세요?": ["0", "5"],
"ヿ": ["0", "1", "3"],
"亀": ["0", "2", "3", "5"],
"亀 ヿ": ["0", "1", "2", "3", "4"],
}

for text, matches in non_western_texts_matcheswith.items():
_matches = []
for _id, expr in non_western_exprs.items():
if self.match(text, expr):
_matches.append(_id)

test_passed = sorted(matches) == sorted(_matches)
if not test_passed:
print("Failed", repr(text), "expected", matches, "matched", _matches)

all_ok = all_ok and test_passed

return all_ok


Expand Down

0 comments on commit d72bd46

Please sign in to comment.