Skip to content

Commit

Permalink
Fix repeated quote char (#23)
Browse files Browse the repository at this point in the history
* Fix repeated quote char
  • Loading branch information
hosseinkhaledi committed Dec 3, 2022
1 parent a6d9f50 commit 0eea412
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 3 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"Normalization",
],
url="https://github.com/arushadev/piraye",
-version="0.1.2",
+version="0.1.3",
package_dir={"piraye": "src"},
packages=["piraye"],
package_data={"piraye": ["data/*/*.json"]},
Expand Down
12 changes: 10 additions & 2 deletions src/nltk_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,14 @@ def __get_original_tokens(text: str, text2: str, tokens_en: List[str]) -> List[s
curr_text = text[token_index:token_index + len(token_en)]
tokens.append(curr_text)
text2_counter = token_index + len(token_en)
-except ValueError:
-    tokens.append(text[text2_counter:text2_counter + 1])
+except ValueError as error:
+    if token_en in ('``', "''"):
+        while True:
+            curr_text = text[text2_counter:text2_counter + 1]
+            text2_counter = text2_counter + 1
+            if len(curr_text.strip()) > 0:
+                tokens.append(curr_text)
+                break
+    else:
+        raise error
return tokens
7 changes: 7 additions & 0 deletions tests/test_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,10 @@ def test_quotes():
norm = NormalizerBuilder().digit_en().punctuation_en().alphabet_fa() \
.tokenizing().remove_extra_spaces().build()
norm.normalize(text)


+def test_quotes2():
+    text = " «««« تست "
+    norm = NormalizerBuilder().digit_en().punctuation_en().alphabet_fa() \
+        .tokenizing().remove_extra_spaces().build()
+    norm.normalize(text)

0 comments on commit 0eea412

Please sign in to comment.