Merge pull request #38 from arushadev/paragraph
Paragraph
hosseinkhaledi committed Apr 28, 2024
2 parents 3415b80 + 79465b6 commit c5f6372
Showing 5 changed files with 77 additions and 34 deletions.
21 changes: 5 additions & 16 deletions piraye/tasks/tokenizer/nltk_tokenizer.py
@@ -6,7 +6,6 @@
from nltk import NLTKWordTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer

from ..normalizer.mappings import MappingDict
from ...tokenizer import Tokenizer


@@ -22,35 +21,25 @@ class NltkTokenizer(Tokenizer, ABC):
return sentence tokenized text
"""

def __init__(self, ):
def __init__(self):
"""
constructor
"""
super().__init__()
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
print("downloading tokenizer data : ")
nltk.download('punkt')
self.__en_mapping = MappingDict.load_jsons(["digit_en", "punc_en"])
self.__tokenizer = NLTKWordTokenizer()
self.__sentence_tokenize = PunktSentenceTokenizer()

def word_span_tokenize(self, text) -> List[Tuple[int, int, str]]:
text2 = self.__clean_text(text)
text2 = self._clean_text(text)
spans = self.__tokenizer.span_tokenize(text2)
return [(span[0], span[1], text[span[0]:span[1]]) for span in spans]

def sentence_span_tokenize(self, text) -> List[Tuple[int, int, str]]:
text2 = self.__clean_text(text)
def sentence_span_tokenize(self, text, clean_before_tokenize=True) -> List[Tuple[int, int, str]]:
text2 = self._clean_text(text) if clean_before_tokenize else text
spans = self.__sentence_tokenize.span_tokenize(text2)
return [(span[0], span[1], text[span[0]:span[1]]) for span in spans]

def __clean_text(self, text: str) -> str:
"""
Clean the input text by replacing digits and punctuation with normalized versions.
:param text: The input text to clean.
:return: The cleaned text with normalized digits and punctuation.
"""
return ''.join(
[char if not self.__en_mapping.get(char)
else self.__en_mapping.get(char).char for char in text])
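
For orientation, a minimal usage sketch of the refactored NltkTokenizer is given below. The import path is assumed from the file layout above, and the sample string and comments are illustrative rather than captured output.

from piraye.tasks.tokenizer.nltk_tokenizer import NltkTokenizer

# Punkt data is downloaded on first construction if it is not already installed.
tokenizer = NltkTokenizer()

text = "Sentence one. Sentence two!"

# Word spans: each tuple is (start, end, substring of the original text).
for start, end, token in tokenizer.word_span_tokenize(text):
    print(start, end, repr(token))

# Sentence spans; clean_before_tokenize=False skips the base-class _clean_text
# step, which is how paragraph_span_tokenize calls it after cleaning the whole
# text once itself.
sentences = tokenizer.sentence_span_tokenize(text, clean_before_tokenize=False)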
22 changes: 6 additions & 16 deletions piraye/tasks/tokenizer/spacy_tokenizer.py
@@ -1,10 +1,10 @@
"""This module includes a Tokenizer class for tokenizing texts"""
from abc import ABC
from typing import List, Tuple

from spacy.lang.en import English
from spacy.pipeline import Sentencizer

from ..normalizer.mappings import MappingDict
from ...tokenizer import Tokenizer


@@ -20,22 +20,22 @@ class SpacyTokenizer(Tokenizer, ABC):
return sentence tokenized text
"""

def __init__(self, ):
def __init__(self):
"""
constructor
"""
super().__init__()
self.__nlp = English()
self.__en_mapping = MappingDict.load_jsons(["digit_en", "punc_en"])
self.__tokenizer = self.__nlp.tokenizer
self.__sentencizer = Sentencizer()

def word_span_tokenize(self, text) -> List[Tuple[int, int, str]]:
text2 = self.__clean_text(text)
text2 = self._clean_text(text)
spans = self.__tokenizer(text2)
return [(span.idx, span.idx + len(span.text), text[span.idx: span.idx + len(span.text)]) for span in spans]

def sentence_span_tokenize(self, text) -> List[Tuple[int, int, str]]:
text2 = self.__clean_text(text)
def sentence_span_tokenize(self, text, clean_before_tokenize=True) -> List[Tuple[int, int, str]]:
text2 = self._clean_text(text) if clean_before_tokenize else text
spans = self.__sentencizer(self.__nlp(text2))
result = []
last_index = 0
@@ -44,13 +44,3 @@ def sentence_span_tokenize(self, text) -> List[Tuple[int, int, str]]:
result.append((last_index, span.idx + len(span.text), text[last_index:span.idx + len(span.text)]))
last_index = span.idx + len(span.text)
return result

def __clean_text(self, text: str) -> str:
"""
Clean the input text by replacing digits and punctuation with normalized versions.
:param text: The input text to clean.
:return: The cleaned text with normalized digits and punctuation.
"""
return ''.join(
[char if not self.__en_mapping.get(char)
else self.__en_mapping.get(char).char for char in text])
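
A matching sketch for SpacyTokenizer; the import path is again inferred from the file layout, and the note on whitespace is a reading of the last_index bookkeeping in the loop above, not documented behaviour.

from piraye.tasks.tokenizer.spacy_tokenizer import SpacyTokenizer

tokenizer = SpacyTokenizer()
text = "First sentence. Second sentence."

# Each tuple is (start, end, substring of the original text). Because each
# span starts where the previous one ended (last_index), whitespace between
# sentences is attached to the front of the following span.
for start, end, sentence in tokenizer.sentence_span_tokenize(text):
    print(start, end, repr(sentence))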
57 changes: 56 additions & 1 deletion piraye/tokenizer.py
@@ -2,12 +2,18 @@
from abc import ABC, abstractmethod
from typing import List, Tuple

from .tasks.normalizer.mappings import MappingDict


class Tokenizer(ABC):
"""
Abstract class for tokenizing
"""

def __init__(self):
self._en_mapping = MappingDict.load_jsons(["digit_en", "punc_en"])
self._space_mapping = MappingDict.load_jsons(["space_keep"])

def word_tokenize(self, text: str) -> List[str]:
"""
Tokenize the input text into a list of words.
@@ -35,9 +41,58 @@ def sentence_tokenize(self, text: str) -> List[str]:
return [text for (_, _, text) in tokens]

@abstractmethod
def sentence_span_tokenize(self, text: str) -> List[Tuple[int, int, str]]:
def sentence_span_tokenize(self, text: str, clean_before_tokenize: bool = True) -> List[Tuple[int, int, str]]:
"""
Tokenize the input text and return spans of the tokenized sentences.
:param text: The input text to tokenize.
:param clean_before_tokenize: whether to clean the text before tokenizing it.
:return: A list of tuples containing the start index, end index, and the sentence for each sentence span.
"""

def paragraph_tokenize(self, text: str) -> List[str]:
"""
Tokenize the input text into a list of paragraphs.
:param text: The input text to tokenize.
:return: A list of paragraphs.
"""
tokens = self.paragraph_span_tokenize(text)
return [text for (_, _, text) in tokens]

def paragraph_span_tokenize(self, text: str) -> List[Tuple[int, int, str]]:
"""
Tokenize the input text and return spans of the tokenized paragraphs.
:param text: The input text to tokenize.
:return: A list of tuples containing the start index, end index, and the paragraph for each paragraph span.
"""
text2 = self._clean_text(text)
text2_len = len(text2)
sentences = self.sentence_span_tokenize(text2, False)
paragraphs: List[Tuple[int, int, str]] = []
last_index = 0
for _, sentence_end, _ in sentences:
if last_index + 1 >= text2_len:
break
pointer = sentence_end + 1
while True:
if pointer + 1 >= text2_len:
paragraphs.append((last_index, pointer, text[last_index:pointer]))
break
character = text2[pointer]
if character == "\n":
paragraphs.append((last_index, pointer, text[last_index:pointer]))
last_index = pointer + 1
break
if character not in self._space_mapping or not self._space_mapping.get(character).is_space:
break
pointer = pointer + 1
return paragraphs

def _clean_text(self, text: str) -> str:
"""
Clean the input text by replacing digits and punctuation with normalized versions.
:param text: The input text to clean.
:return: The cleaned text with normalized digits and punctuation.
"""
return ''.join(
[char if not self._en_mapping.get(char)
else self._en_mapping.get(char).char for char in text])
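
To show what the new paragraph API does, a small sketch follows. It assumes the package is importable as piraye and that a blank line after a sentence end is treated as a paragraph break by the loop above (a sentence end followed only by space characters up to a newline); the expected count is reasoned from the diff, not captured output.

from piraye.tasks.tokenizer.nltk_tokenizer import NltkTokenizer

tokenizer = NltkTokenizer()

# A newline inside a sentence does not start a new paragraph; only a newline
# that follows a sentence boundary (possibly after spaces) does.
text = "First sentence. Second sentence.\n\nThird sentence."

paragraphs = tokenizer.paragraph_tokenize(text)       # expected: 2 paragraphs
spans = tokenizer.paragraph_span_tokenize(text)       # (start, end, text[start:end]) tuples

# Paragraph boundaries are found on the cleaned copy, but the returned
# substring is sliced from the original text; this relies on _clean_text
# being a one-character-for-one-character substitution.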
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "piraye"
version = "0.5.1"
version = "0.6.0"
authors = [
{ name = "Hamed Khademi Khaledi", email = "khaledihkh@gmail.com" },
{ name = "HosseiN Khademi khaeldi", email = "hossein@arusha.dev" },
9 changes: 9 additions & 0 deletions tests/test_tokenizer.py
@@ -33,6 +33,15 @@ def test_sentence_tokenizer():
assert len(tokenizer.sentence_span_tokenize(text)) == 2


def test_paragraph_tokenizer():
text = "par1 sen1 sad. par1 \n sen2. par1 \n\n sen3. \n par2 sen1.\n\n\n\n par3 sen1.\n\n"
tokenizer = NltkTokenizer()
assert len(tokenizer.paragraph_tokenize(text)) == 3
assert len(tokenizer.paragraph_span_tokenize(text)) == 3
assert len(tokenizer.paragraph_tokenize("par1 sen1 sad.")) == 1
assert len(tokenizer.paragraph_tokenize("par1 sen1 sad. par1 \n sen2. ")) == 1


def test_sentence_tokenizer_spacy():
text = "sentence1 sad. \n asd asd \n asdasd \n sentence 2."
tokenizer = SpacyTokenizer()
