From a048a4357fa96708e42d1b2dbc960a8ee443c238 Mon Sep 17 00:00:00 2001 From: Tze Ching Yu <33182836+tc-yu@users.noreply.github.com> Date: Fri, 3 Dec 2021 11:29:57 +0800 Subject: [PATCH 1/2] Updated range for unicode blocks, updated add char logic to include last character in each block, update some function names --- examples/booleansearchparser.py | 39 ++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/examples/booleansearchparser.py b/examples/booleansearchparser.py index d32ef392..1e98fbe4 100644 --- a/examples/booleansearchparser.py +++ b/examples/booleansearchparser.py @@ -84,34 +84,37 @@ from pyparsing import ( Word, alphanums, - Keyword, + CaselessKeyword, Group, Forward, Suppress, OneOrMore, - oneOf, + one_of, ) import re +# Updated on 02 Dec 2021 according to ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt alphabet_ranges = [ - ##CYRILIC: https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block) + # CYRILIC: https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block) [int("0400", 16), int("04FF", 16)], - ##THAI: https://en.wikipedia.org/wiki/Thai_(Unicode_block) - [int("0E00", 16), int("0E7F", 16)], - ##ARABIC: https://en.wikipedia.org/wiki/Arabic_(Unicode_block) (Arabic (0600–06FF)+ Syriac (0700–074F)+ Arabic Supplement (0750–077F) ) + # ARABIC: https://en.wikipedia.org/wiki/Arabic_(Unicode_block) (Arabic (0600–06FF)+ Syriac (0700–074F)+ Arabic Supplement (0750–077F) ) [int("0600", 16), int("07FF", 16)], - ##CHINESE: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - [int("0400", 16), int("09FF", 16)], - # JAPANESE : https://en.wikipedia.org/wiki/Japanese_writing_system + # THAI: https://en.wikipedia.org/wiki/Thai_(Unicode_block) + [int("0E00", 16), int("0E7F", 16)], + # JAPANESE : https://en.wikipedia.org/wiki/Japanese_writing_system (Hiragana (3040–309F) + Katakana (30A0–30FF)) [int("3040", 16), int("30FF", 16)], + # Enclosed CJK Letters and Months + [int("3200", 16), int("32FF", 16)], + # 
CHINESE: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + [int("4E00", 16), int("9FFF", 16)], # KOREAN : https://en.wikipedia.org/wiki/Hangul - [int("AC00", 16), int("D7AF", 16)], [int("1100", 16), int("11FF", 16)], [int("3130", 16), int("318F", 16)], - [int("3200", 16), int("32FF", 16)], [int("A960", 16), int("A97F", 16)], + [int("AC00", 16), int("D7AF", 16)], [int("D7B0", 16), int("D7FF", 16)], + # Halfwidth and Fullwidth Forms [int("FF00", 16), int("FFEF", 16)], ] @@ -152,8 +155,8 @@ def parser(self): alphabet = alphanums # support for non-western alphabets - for r in alphabet_ranges: - alphabet += "".join(chr(c) for c in range(*r) if not chr(c).isspace()) + for lo, hi in alphabet_ranges: + alphabet += "".join(chr(c) for c in range(lo, hi + 1) if not chr(c).isspace()) operatorWord = Group(Word(alphabet + "*")).setResultsName("word*") @@ -176,7 +179,7 @@ def parser(self): operatorNot = Forward() operatorNot << ( - Group(Suppress(Keyword("not", caseless=True)) + operatorNot).setResultsName( + Group(Suppress(CaselessKeyword("not")) + operatorNot).setResultsName( "not" ) | operatorParenthesis @@ -185,17 +188,17 @@ def parser(self): operatorAnd = Forward() operatorAnd << ( Group( - operatorNot + Suppress(Keyword("and", caseless=True)) + operatorAnd + operatorNot + Suppress(CaselessKeyword("and")) + operatorAnd ).setResultsName("and") | Group( - operatorNot + OneOrMore(~oneOf("and or") + operatorAnd) + operatorNot + OneOrMore(~one_of("and or") + operatorAnd) ).setResultsName("and") | operatorNot ) operatorOr << ( Group( - operatorAnd + Suppress(Keyword("or", caseless=True)) + operatorOr + operatorAnd + Suppress(CaselessKeyword("or")) + operatorOr ).setResultsName("or") | operatorAnd ) @@ -217,7 +220,7 @@ def evaluateParenthesis(self, argument): def evaluateQuotes(self, argument): """Evaluate quoted strings - First is does an 'and' on the indidual search terms, then it asks the + First it does an 'and' on the individual search terms, then it asks 
the function GetQuoted to only return the subset of ID's that contain the literal string. """ From 6cca6a171067ab91be914f5a998c8e9abded8164 Mon Sep 17 00:00:00 2001 From: Tze Ching Yu <33182836+tc-yu@users.noreply.github.com> Date: Mon, 6 Dec 2021 12:23:38 +0800 Subject: [PATCH 2/2] Test case for CJK block and last character in block --- examples/booleansearchparser.py | 49 +++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/examples/booleansearchparser.py b/examples/booleansearchparser.py index 1e98fbe4..c901db14 100644 --- a/examples/booleansearchparser.py +++ b/examples/booleansearchparser.py @@ -98,7 +98,7 @@ alphabet_ranges = [ # CYRILIC: https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block) [int("0400", 16), int("04FF", 16)], - # ARABIC: https://en.wikipedia.org/wiki/Arabic_(Unicode_block) (Arabic (0600–06FF)+ Syriac (0700–074F)+ Arabic Supplement (0750–077F) ) + # ARABIC: https://en.wikipedia.org/wiki/Arabic_(Unicode_block) (Arabic (0600–06FF)+ Syriac (0700–074F)+ Arabic Supplement (0750–077F)) [int("0600", 16), int("07FF", 16)], # THAI: https://en.wikipedia.org/wiki/Thai_(Unicode_block) [int("0E00", 16), int("0E7F", 16)], @@ -158,20 +158,20 @@ def parser(self): for lo, hi in alphabet_ranges: alphabet += "".join(chr(c) for c in range(lo, hi + 1) if not chr(c).isspace()) - operatorWord = Group(Word(alphabet + "*")).setResultsName("word*") + operatorWord = Group(Word(alphabet + "*")).set_results_name("word*") operatorQuotesContent = Forward() operatorQuotesContent << ((operatorWord + operatorQuotesContent) | operatorWord) operatorQuotes = ( - Group(Suppress('"') + operatorQuotesContent + Suppress('"')).setResultsName( + Group(Suppress('"') + operatorQuotesContent + Suppress('"')).set_results_name( "quotes" ) | operatorWord ) operatorParenthesis = ( - Group(Suppress("(") + operatorOr + Suppress(")")).setResultsName( + Group(Suppress("(") + operatorOr + Suppress(")")).set_results_name( "parenthesis" ) | operatorQuotes 
@@ -179,7 +179,7 @@ def parser(self): operatorNot = Forward() operatorNot << ( - Group(Suppress(CaselessKeyword("not")) + operatorNot).setResultsName( + Group(Suppress(CaselessKeyword("not")) + operatorNot).set_results_name( "not" ) | operatorParenthesis @@ -189,21 +189,21 @@ def parser(self): operatorAnd << ( Group( operatorNot + Suppress(CaselessKeyword("and")) + operatorAnd - ).setResultsName("and") + ).set_results_name("and") | Group( operatorNot + OneOrMore(~one_of("and or") + operatorAnd) - ).setResultsName("and") + ).set_results_name("and") | operatorNot ) operatorOr << ( Group( operatorAnd + Suppress(CaselessKeyword("or")) + operatorOr - ).setResultsName("or") + ).set_results_name("or") | operatorAnd ) - return operatorOr.parseString + return operatorOr.parse_string def evaluateAnd(self, argument): return all(self.evaluate(arg) for arg in argument) @@ -464,6 +464,37 @@ def Test(self): all_ok = all_ok and test_passed + # Tests for non western characters, should fail with + # pyparsing.exceptions.ParseException under the previous + # configuration + non_western_exprs = { + "0": "*", + "1": "ヿ", # Edge character + "2": "亀", # Character in CJK block + "3": "ヿ or 亀", + "4": "ヿ and 亀", + "5": "not ヿ" + } + + non_western_texts_matcheswith = { + "안녕하세요, 당신은 어떠세요?": ["0", "5"], + "ヿ": ["0", "1", "3"], + "亀": ["0", "2", "3", "5"], + "亀 ヿ": ["0", "1", "2", "3", "4"], + } + + for text, matches in non_western_texts_matcheswith.items(): + _matches = [] + for _id, expr in non_western_exprs.items(): + if self.match(text, expr): + _matches.append(_id) + + test_passed = sorted(matches) == sorted(_matches) + if not test_passed: + print("Failed", repr(text), "expected", matches, "matched", _matches) + + all_ok = all_ok and test_passed + return all_ok