Fix repeated quote char (#23)

* Fix repeated quote char
arushadev · Dec 3, 2022 · 0eea412 · 0eea412
1 parent a6d9f50
commit 0eea412
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 3 deletions.
diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@
         "Normalization",
     ],
     url="https://github.com/arushadev/piraye",
-    version="0.1.2",
+    version="0.1.3",
     package_dir={"piraye": "src"},
     packages=["piraye"],
     package_data={"piraye": ["data/*/*.json"]},

diff --git a/src/nltk_tokenizer.py b/src/nltk_tokenizer.py
@@ -97,6 +97,14 @@ def __get_original_tokens(text: str, text2: str, tokens_en: List[str]) -> List[s
                 curr_text = text[token_index:token_index + len(token_en)]
                 tokens.append(curr_text)
                 text2_counter = token_index + len(token_en)
-            except ValueError:
-                tokens.append(text[text2_counter:text2_counter + 1])
+            except ValueError as error:
+                if token_en in ('``', "''"):
+                    while True:
+                        curr_text = text[text2_counter:text2_counter + 1]
+                        text2_counter = text2_counter + 1
+                        if len(curr_text.strip()) > 0:
+                            tokens.append(curr_text)
+                            break
+                else:
+                    raise error
         return tokens
diff --git a/tests/test_normalizer.py b/tests/test_normalizer.py
@@ -38,3 +38,10 @@ def test_quotes():
     norm = NormalizerBuilder().digit_en().punctuation_en().alphabet_fa() \
         .tokenizing().remove_extra_spaces().build()
     norm.normalize(text)
+
+
+def test_quotes2():  # smoke test: passes as long as normalize() does not raise
+    text = " «««« تست "  # four consecutive opening guillemets, then a Persian word
+    norm = NormalizerBuilder().digit_en().punctuation_en().alphabet_fa() \
+        .tokenizing().remove_extra_spaces().build()
+    norm.normalize(text)  # return value is intentionally not checked