diff --git a/setup.py b/setup.py
index e7c7bda..71fcc1c 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@
         "Normalization",
     ],
     url="https://github.com/arushadev/piraye",
-    version="0.1.3",
+    version="0.1.4",
     package_dir={"piraye": "src"},
     packages=["piraye"],
     package_data={"piraye": ["data/*/*.json"]},
diff --git a/src/nltk_tokenizer.py b/src/nltk_tokenizer.py
index 58d23a8..6f0bdbe 100644
--- a/src/nltk_tokenizer.py
+++ b/src/nltk_tokenizer.py
@@ -1,10 +1,10 @@
 """This module includes Tokenizer class for tokenizing texts"""
 from abc import ABC, abstractmethod
-from typing import List
+from typing import List, Tuple
 
 import nltk
-from nltk import word_tokenize, sent_tokenize
+from nltk import TreebankWordTokenizer, sent_tokenize
 
 from .mappings import MappingDict
 
 
@@ -31,6 +31,14 @@ def word_tokenize(self, text) -> List[str]:
         :return: list of words
         """
 
+    @abstractmethod
+    def span_tokenize(self, text) -> List[Tuple[int, int]]:
+        """
+        Return span of tokens
+        :param text: the input text
+        :return: list of spans
+        """
+
     @abstractmethod
     def sentence_tokenize(self, text):
         """
@@ -62,6 +70,7 @@ def __init__(self, ):
             print("downloading tokenizer data : ")
             nltk.download('punkt')
         self.__en_mapping = MappingDict.load_jsons(["digit_en", "punc_en"])
+        self.__tokenizer = TreebankWordTokenizer()
 
     def word_tokenize(self, text) -> List[str]:
         """
@@ -69,11 +78,19 @@ def word_tokenize(self, text) -> List[str]:
         """
         :param text: the input text
         :return: list of words
         """
+        tokens_en = self.span_tokenize(text)
+        return [text[a:b] for (a, b) in tokens_en]
+
+    def span_tokenize(self, text) -> List[Tuple[int, int]]:
+        """
+        Return span of tokens
+        :param text: the input text
+        :return: list of spans
+        """
         text2 = ''.join(
             [char if not self.__en_mapping.get(char) else self.__en_mapping.get(char).char for char in text])
-        tokens_en = word_tokenize(text2)
-        return NltkTokenizer.__get_original_tokens(text, text2, tokens_en)
+        return self.__tokenizer.span_tokenize(text2)
 
     def sentence_tokenize(self, text):
         """
diff --git a/src/normalizer.py b/src/normalizer.py
index cfb437f..9882327 100644
--- a/src/normalizer.py
+++ b/src/normalizer.py
@@ -111,11 +111,8 @@ def __tokenize(self, text: str) -> List[bool]:
         :return: list boolean.
         """
         is_token_list = [False] * len(text)
-        tokens = self.__tokenizer.word_tokenize(text)
-        text_counter = 0
-        for token in tokens:
-            token_index = text.index(token, text_counter)
-            if len(token) == 1:
-                is_token_list[token_index] = True
-            text_counter = token_index + len(token)
+        spans = self.__tokenizer.span_tokenize(text)
+        for (start, end) in spans:
+            if start + 1 == end:
+                is_token_list[start] = True
         return is_token_list
diff --git a/tests/test_normalizer.py b/tests/test_normalizer.py
index 160c9f2..5880206 100644
--- a/tests/test_normalizer.py
+++ b/tests/test_normalizer.py
@@ -38,10 +38,11 @@ def test_quotes():
     norm = NormalizerBuilder().digit_en().punctuation_en().alphabet_fa() \
         .tokenizing().remove_extra_spaces().build()
     norm.normalize(text)
-
-
-def test_quotes2():
     text = " «««« تست "
     norm = NormalizerBuilder().digit_en().punctuation_en().alphabet_fa() \
         .tokenizing().remove_extra_spaces().build()
     norm.normalize(text)
+    text = " \" تست '' تست «««« تست " 
+    norm = NormalizerBuilder().digit_en().punctuation_en().alphabet_fa() \
+        .tokenizing().remove_extra_spaces().build()
+    norm.normalize(text)
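
Reviewer note (not part of the patch): the sketch below illustrates the span-based behavior the diff switches to. nltk's TreebankWordTokenizer.span_tokenize returns character offsets into the original string, which is what the new word_tokenize slices from and what Normalizer.__tokenize uses to flag single-character tokens. The sample sentence and variable names here are illustrative only.

from nltk import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
text = "Hello, world!"  # hypothetical input, not taken from the test suite

# span_tokenize yields (start, end) character offsets into the original text,
# so tokens can be recovered by slicing the untouched input string.
spans = list(tokenizer.span_tokenize(text))
tokens = [text[start:end] for start, end in spans]
print(spans)   # e.g. [(0, 5), (5, 6), (7, 12), (12, 13)]
print(tokens)  # e.g. ['Hello', ',', 'world', '!']

# Single-character spans (end == start + 1) are what the normalizer marks
# in its is_token_list; in practice these are mostly punctuation marks.
single_char_positions = [start for start, end in spans if end == start + 1]
print(single_char_positions)  # e.g. [5, 12]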