Use span tokenizer (#24)
* Use span tokenizer
hosseinkhaledi committed Dec 3, 2022
1 parent 0eea412 commit 4e9e97e
Showing 4 changed files with 30 additions and 15 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -19,7 +19,7 @@
         "Normalization",
     ],
     url="https://github.com/arushadev/piraye",
-    version="0.1.3",
+    version="0.1.4",
     package_dir={"piraye": "src"},
     packages=["piraye"],
     package_data={"piraye": ["data/*/*.json"]},
25 changes: 21 additions & 4 deletions src/nltk_tokenizer.py
@@ -1,10 +1,10 @@
"""This module includes Tokenizer class for tokenizing texts"""

from abc import ABC, abstractmethod
from typing import List
from typing import List, Tuple

import nltk
from nltk import word_tokenize, sent_tokenize
from nltk import TreebankWordTokenizer, sent_tokenize

from .mappings import MappingDict

@@ -31,6 +31,14 @@ def word_tokenize(self, text) -> List[str]:
         :return: list of words
         """

+    @abstractmethod
+    def span_tokenize(self, text) -> List[Tuple[int, int]]:
+        """
+        Return span of tokens
+        :param text: the input text
+        :return: list of spans
+        """
+
     @abstractmethod
     def sentence_tokenize(self, text):
         """
@@ -62,18 +70,27 @@ def __init__(self, ):
print("downloading tokenizer data : ")
nltk.download('punkt')
self.__en_mapping = MappingDict.load_jsons(["digit_en", "punc_en"])
self.__tokenizer = TreebankWordTokenizer()

def word_tokenize(self, text) -> List[str]:
"""
Return a tokenized text.
:param text: the input text
:return: list of words
"""
tokens_en = self.span_tokenize(text)
return [text[a:b] for (a, b) in tokens_en]

def span_tokenize(self, text) -> List[Tuple[int, int]]:
"""
Return span of tokens
:param text: the input text
:return: list of spans
"""
text2 = ''.join(
[char if not self.__en_mapping.get(char)
else self.__en_mapping.get(char).char for char in text])
tokens_en = word_tokenize(text2)
return NltkTokenizer.__get_original_tokens(text, text2, tokens_en)
return self.__tokenizer.span_tokenize(text2)

     def sentence_tokenize(self, text):
         """
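For readers unfamiliar with span tokenization, here is a minimal standalone sketch (not part of the commit) of the NLTK behavior the new code relies on: TreebankWordTokenizer.span_tokenize yields (start, end) offsets into the input string, so tokens can be recovered by plain slicing. Because the __en_mapping rewrite replaces characters one for one, the mapped text2 has the same length as text, so spans computed on text2 are also valid offsets into the original text, which is what lets the new word_tokenize slice the untouched input.

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
text = "Hello, world!"
spans = list(tokenizer.span_tokenize(text))         # [(0, 5), (5, 6), (7, 12), (12, 13)]
tokens = [text[start:end] for start, end in spans]  # recover tokens by slicing
print(tokens)                                       # ['Hello', ',', 'world', '!']

This makes the old __get_original_tokens call unnecessary, since nothing has to map tokens produced from text2 back onto positions in the original string.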
11 changes: 4 additions & 7 deletions src/normalizer.py
@@ -111,11 +111,8 @@ def __tokenize(self, text: str) -> List[bool]:
         :return: list boolean.
         """
         is_token_list = [False] * len(text)
-        tokens = self.__tokenizer.word_tokenize(text)
-        text_counter = 0
-        for token in tokens:
-            token_index = text.index(token, text_counter)
-            if len(token) == 1:
-                is_token_list[token_index] = True
-            text_counter = token_index + len(token)
+        spans = self.__tokenizer.span_tokenize(text)
+        for (start, end) in spans:
+            if start + 1 == end:
+                is_token_list[start] = True
        return is_token_list
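The effect of the rewrite: __tokenize used to re-locate every token by searching with text.index, which costs a scan per token and is more fragile than using the offsets the tokenizer already knows. A standalone sketch of the same masking logic, with a hypothetical function name:

from typing import List
from nltk.tokenize import TreebankWordTokenizer

def single_char_token_mask(text: str) -> List[bool]:
    # Mark each position where the tokenizer produced a one-character
    # token, e.g. isolated punctuation the normalizer treats specially.
    mask = [False] * len(text)
    for start, end in TreebankWordTokenizer().span_tokenize(text):
        if end - start == 1:
            mask[start] = True
    return mask

print(single_char_token_mask("ab , cd"))  # [False, False, False, True, False, False, False]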
7 changes: 4 additions & 3 deletions tests/test_normalizer.py
@@ -38,10 +38,11 @@ def test_quotes():
     norm = NormalizerBuilder().digit_en().punctuation_en().alphabet_fa() \
         .tokenizing().remove_extra_spaces().build()
     norm.normalize(text)


 def test_quotes2():
-    text = " «««« تست "
+    text = " \" تست '' تست «««« تست "
     norm = NormalizerBuilder().digit_en().punctuation_en().alphabet_fa() \
         .tokenizing().remove_extra_spaces().build()
     norm.normalize(text)
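A short illustration (standalone, not from the repo) of why quote characters make a good regression test here: the Treebank tokenizer rewrites straight double quotes into `` and '' tokens, so the rewritten token text never appears verbatim in the input, while span_tokenize still reports the correct original offsets.

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
text = 'he said "hi"'
print(tokenizer.tokenize(text))                               # ['he', 'said', '``', 'hi', "''"]
print([text[a:b] for a, b in tokenizer.span_tokenize(text)])  # ['he', 'said', '"', 'hi', '"']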
