Merge pull request #38 from arushadev/paragraph
Paragraph
hosseinkhaledi committed Apr 28, 2024
2 parents 3415b80 + 79465b6 commit c5f6372
Showing 5 changed files with 77 additions and 34 deletions.
21 changes: 5 additions & 16 deletions piraye/tasks/tokenizer/nltk_tokenizer.py
@@ -6,7 +6,6 @@
from nltk import NLTKWordTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer

from ..normalizer.mappings import MappingDict
from ...tokenizer import Tokenizer


@@ -22,35 +21,25 @@ class NltkTokenizer(Tokenizer, ABC):
return sentence tokenized text
"""

def __init__(self, ):
def __init__(self):
"""
constructor
"""
super().__init__()
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
print("downloading tokenizer data : ")
nltk.download('punkt')
self.__en_mapping = MappingDict.load_jsons(["digit_en", "punc_en"])
self.__tokenizer = NLTKWordTokenizer()
self.__sentence_tokenize = PunktSentenceTokenizer()

def word_span_tokenize(self, text) -> List[Tuple[int, int, str]]:
text2 = self.__clean_text(text)
text2 = self._clean_text(text)
spans = self.__tokenizer.span_tokenize(text2)
return [(span[0], span[1], text[span[0]:span[1]]) for span in spans]

def sentence_span_tokenize(self, text) -> List[Tuple[int, int, str]]:
text2 = self.__clean_text(text)
def sentence_span_tokenize(self, text, clean_before_tokenize=True) -> List[Tuple[int, int, str]]:
text2 = self._clean_text(text) if clean_before_tokenize else text
spans = self.__sentence_tokenize.span_tokenize(text2)
return [(span[0], span[1], text[span[0]:span[1]]) for span in spans]

def __clean_text(self, text: str) -> str:
"""
Clean the input text by replacing digits and punctuation with normalized versions.
:param text: The input text to clean.
:return: The cleaned text with normalized digits and punctuation.
"""
return ''.join(
[char if not self.__en_mapping.get(char)
else self.__en_mapping.get(char).char for char in text])
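
For orientation, a minimal usage sketch of the refactored NltkTokenizer is given below. The import path is assumed from the file layout above, and the sample string and comments are illustrative rather than captured output.

from piraye.tasks.tokenizer.nltk_tokenizer import NltkTokenizer

# Punkt data is downloaded on first construction if it is not already installed.
tokenizer = NltkTokenizer()

text = "Sentence one. Sentence two!"

# Word spans: each tuple is (start, end, substring of the original text).
for start, end, token in tokenizer.word_span_tokenize(text):
    print(start, end, repr(token))

# Sentence spans; clean_before_tokenize=False skips the base-class _clean_text
# step, which is how paragraph_span_tokenize calls it after cleaning the whole
# text once itself.
sentences = tokenizer.sentence_span_tokenize(text, clean_before_tokenize=False)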
22 changes: 6 additions & 16 deletions piraye/tasks/tokenizer/spacy_tokenizer.py
@@ -1,10 +1,10 @@
"""This module includes a Tokenizer class for tokenizing texts"""
from abc import ABC
from typing import List, Tuple

from spacy.lang.en import English
from spacy.pipeline import Sentencizer

from ..normalizer.mappings import MappingDict
from ...tokenizer import Tokenizer


@@ -20,22 +20,22 @@ class SpacyTokenizer(Tokenizer, ABC):
return sentence tokenized text
"""

def __init__(self, ):
def __init__(self):
"""
constructor
"""
super().__init__()
self.__nlp = English()
self.__en_mapping = MappingDict.load_jsons(["digit_en", "punc_en"])
self.__tokenizer = self.__nlp.tokenizer
self.__sentencizer = Sentencizer()

def word_span_tokenize(self, text) -> List[Tuple[int, int, str]]:
text2 = self.__clean_text(text)
text2 = self._clean_text(text)
spans = self.__tokenizer(text2)
return [(span.idx, span.idx + len(span.text), text[span.idx: span.idx + len(span.text)]) for span in spans]

def sentence_span_tokenize(self, text) -> List[Tuple[int, int, str]]:
text2 = self.__clean_text(text)
def sentence_span_tokenize(self, text, clean_before_tokenize=True) -> List[Tuple[int, int, str]]:
text2 = self._clean_text(text) if clean_before_tokenize else text
spans = self.__sentencizer(self.__nlp(text2))
result = []
last_index = 0
@@ -44,13 +44,3 @@ def sentence_span_tokenize(self, text) -> List[Tuple[int, int, str]]:
result.append((last_index, span.idx + len(span.text), text[last_index:span.idx + len(span.text)]))
last_index = span.idx + len(span.text)
return result

def __clean_text(self, text: str) -> str:
"""
Clean the input text by replacing digits and punctuation with normalized versions.
:param text: The input text to clean.
:return: The cleaned text with normalized digits and punctuation.
"""
return ''.join(
[char if not self.__en_mapping.get(char)
else self.__en_mapping.get(char).char for char in text])
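
A matching sketch for SpacyTokenizer; the import path is again inferred from the file layout, and the note on whitespace is a reading of the last_index bookkeeping in the loop above, not documented behaviour.

from piraye.tasks.tokenizer.spacy_tokenizer import SpacyTokenizer

tokenizer = SpacyTokenizer()
text = "First sentence. Second sentence."

# Each tuple is (start, end, substring of the original text). Because each
# span starts where the previous one ended (last_index), whitespace between
# sentences is attached to the front of the following span.
for start, end, sentence in tokenizer.sentence_span_tokenize(text):
    print(start, end, repr(sentence))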
57 changes: 56 additions & 1 deletion piraye/tokenizer.py
@@ -2,12 +2,18 @@
from abc import ABC, abstractmethod
from typing import List, Tuple

from .tasks.normalizer.mappings import MappingDict


class Tokenizer(ABC):
"""
Abstract class for tokenizing
"""

def __init__(self):
self._en_mapping = MappingDict.load_jsons(["digit_en", "punc_en"])
self._space_mapping = MappingDict.load_jsons(["space_keep"])

def word_tokenize(self, text: str) -> List[str]:
"""
Tokenize the input text into a list of words.
@@ -35,9 +41,58 @@ def sentence_tokenize(self, text: str) -> List[str]:
return [text for (_, _, text) in tokens]

@abstractmethod
def sentence_span_tokenize(self, text: str) -> List[Tuple[int, int, str]]:
def sentence_span_tokenize(self, text: str, clean_before_tokenize: bool = True) -> List[Tuple[int, int, str]]:
"""
Tokenize the input text and return spans of the tokenized sentences.
:param text: The input text to tokenize.
:param clean_before_tokenize: whether to clean the text before tokenizing it.
:return: A list of tuples containing the start index, end index, and the sentence for each sentence span.
"""

def paragraph_tokenize(self, text: str) -> List[str]:
"""
Tokenize the input text into a list of paragraphs.
:param text: The input text to tokenize.
:return: A list of paragraphs.
"""
tokens = self.paragraph_span_tokenize(text)
return [text for (_, _, text) in tokens]

def paragraph_span_tokenize(self, text: str) -> List[Tuple[int, int, str]]:
"""
Tokenize the input text and return spans of the tokenized paragraphs.
:param text: The input text to tokenize.
:return: A list of tuples containing the start index, end index, and the paragraph for each paragraph span.
"""
text2 = self._clean_text(text)
text2_len = len(text2)
sentences = self.sentence_span_tokenize(text2, False)
paragraphs: List[Tuple[int, int, str]] = []
last_index = 0
for _, sentence_end, _ in sentences:
if last_index + 1 >= text2_len:
break
pointer = sentence_end + 1
while True:
if pointer + 1 >= text2_len:
paragraphs.append((last_index, pointer, text[last_index:pointer]))
break
character = text2[pointer]
if character == "\n":
paragraphs.append((last_index, pointer, text[last_index:pointer]))
last_index = pointer + 1
break
if character not in self._space_mapping or not self._space_mapping.get(character).is_space:
break
pointer = pointer + 1
return paragraphs

def _clean_text(self, text: str) -> str:
"""
Clean the input text by replacing digits and punctuation with normalized versions.
:param text: The input text to clean.
:return: The cleaned text with normalized digits and punctuation.
"""
return ''.join(
[char if not self._en_mapping.get(char)
else self._en_mapping.get(char).char for char in text])
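
To show what the new paragraph API does, a small sketch follows. It assumes the package is importable as piraye and that a blank line after a sentence end is treated as a paragraph break by the loop above (a sentence end followed only by space characters up to a newline); the expected count is reasoned from the diff, not captured output.

from piraye.tasks.tokenizer.nltk_tokenizer import NltkTokenizer

tokenizer = NltkTokenizer()

# A newline inside a sentence does not start a new paragraph; only a newline
# that follows a sentence boundary (possibly after spaces) does.
text = "First sentence. Second sentence.\n\nThird sentence."

paragraphs = tokenizer.paragraph_tokenize(text)       # expected: 2 paragraphs
spans = tokenizer.paragraph_span_tokenize(text)       # (start, end, text[start:end]) tuples

# Paragraph boundaries are found on the cleaned copy, but the returned
# substring is sliced from the original text; this relies on _clean_text
# being a one-character-for-one-character substitution.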
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "piraye"
version = "0.5.1"
version = "0.6.0"
authors = [
{ name = "Hamed Khademi Khaledi", email = "khaledihkh@gmail.com" },
{ name = "HosseiN Khademi khaeldi", email = "hossein@arusha.dev" },
9 changes: 9 additions & 0 deletions tests/test_tokenizer.py
@@ -33,6 +33,15 @@ def test_sentence_tokenizer():
assert len(tokenizer.sentence_span_tokenize(text)) == 2


def test_paragraph_tokenizer():
text = "par1 sen1 sad. par1 \n sen2. par1 \n\n sen3. \n par2 sen1.\n\n\n\n par3 sen1.\n\n"
tokenizer = NltkTokenizer()
assert len(tokenizer.paragraph_tokenize(text)) == 3
assert len(tokenizer.paragraph_span_tokenize(text)) == 3
assert len(tokenizer.paragraph_tokenize("par1 sen1 sad.")) == 1
assert len(tokenizer.paragraph_tokenize("par1 sen1 sad. par1 \n sen2. ")) == 1


def test_sentence_tokenizer_spacy():
text = "sentence1 sad. \n asd asd \n asdasd \n sentence 2."
tokenizer = SpacyTokenizer()
