
Merge pull request #36 from arushadev/dev-spacy
add spacy tokenizer
hosseinkhaledi committed Apr 3, 2024
2 parents 2210111 + 6ab981e commit 2801e16
Showing 10 changed files with 143 additions and 37 deletions.
51 changes: 27 additions & 24 deletions README.md
@@ -1,15 +1,16 @@
# Piraye: NLP Utils
# Piraye: NLP Utilities

<p align="center">
<a href="https://pypi.org/project/piraye"><img alt="PyPI Version" src="https://img.shields.io/pypi/v/piraye.svg?maxAge=86400" /></a>
<a href="https://pypi.org/project/piraye"><img alt="Python Versions" src="https://img.shields.io/pypi/pyversions/piraye.svg?maxAge=86400" /></a>
<a href="https://pypi.org/project/piraye"><img alt="License" src="https://img.shields.io/pypi/l/piraye.svg?maxAge=86400" /></a>
<a href="https://pepy.tech/project/piraye"><img alt="Downloads" src="https://static.pepy.tech/badge/piraye" /></a>
<a href="https://github.com/arushadev/piraye/actions/workflows/pylint.yml"><img alt="Pylint" src="https://github.com/arushadev/piraye/actions/workflows/pylint.yml/badge.svg" /></a>
<a href="https://github.com/arushadev/piraye/actions/workflows/unit-test.yml"><img alt="Unit Test" src="https://github.com/arushadev/piraye/actions/workflows/unit-test.yml/badge.svg" /></a>
</p>


A utility for normalizing persian, arabic and english texts
**Piraye** is a Python library designed to facilitate text normalization for Persian, Arabic, and English languages.

## Requirements

@@ -18,19 +19,18 @@ A utility for normalizing persian, arabic and english texts

## Installation

Install the latest version with pip
You can install the latest version of Piraye via pip:

`pip install piraye`

## Usage

Create an instance of Normalizer with NormalizerBuilder and then call normalize function. Also see list of all available
configs in [configs](#Configs) section.
To use Piraye, create an instance of the Normalizer class with NormalizerBuilder and then call the normalize function. The normalization process can be configured with the settings listed in the [Configs](#Configs) section. Two examples demonstrating different approaches:

* Using builder pattern:

```python
from piraye import NormalizerBuilder
from piraye.tasks.normalizer.normalizer_builder import Config

text = "این یک متن تسة اسﺘ , 24/12/1400 "
normalizer = NormalizerBuilder().alphabet_fa().digit_fa().punctuation_fa().tokenizing().remove_extra_spaces().build()
@@ -49,45 +49,48 @@ normalizer = NormalizerBuilder([Config.PUNCTUATION_FA, Config.ALPHABET_FA, Confi
normalizer.normalize(text) # "این یک متن تست است ، ۲۴/۱۲/۱۴۰۰"
```

Also see [other examples](https://github.com/arushadev/piraye/blob/readme/examples.md)
You can find more examples [here](https://github.com/arushadev/piraye/blob/readme/examples.md).

## Configs

Piraye provides various configurations for text normalization. Here's a list of available configurations:

| Config | Function | Description |
|:----------------:|:----------------:|:-----------------------------------------------------:|
| ALPHABET_AR | alphabet_ar | mapping alphabet characters to arabic |
| ALPHABET_EN | alphabet_en | mapping alphabet characters to english |
| ALPHABET_FA | alphabet_fa | mapping alphabet characters to persian |
| DIGIT_AR | digit_ar | convert digits to arabic digits |
| DIGIT_EN | digit_en | convert digits to english digits |
| DIGIT_FA | digit_fa | convert digits to persian digits |
| ALPHABET_AR | alphabet_ar | mapping alphabet characters to Arabic |
| ALPHABET_EN | alphabet_en | mapping alphabet characters to English |
| ALPHABET_FA | alphabet_fa | mapping alphabet characters to Persian |
| DIGIT_AR | digit_ar | convert digits to Arabic digits |
| DIGIT_EN | digit_en | convert digits to English digits |
| DIGIT_FA | digit_fa | convert digits to Persian digits |
| DIACRITIC_DELETE | diacritic_delete | remove all diacritics |
| SPACE_DELETE | space_delete | remove all spaces |
| SPACE_NORMAL | space_normal | normalize space characters (e.g., NO-BREAK SPACE, tab) |
| SPACE_KEEP | space_keep | map space characters without normalizing them |
| PUNCTUATION_AR | punctuation_ar | mapping punctuations to arabic punctuations |
| PUNCTUATION_Fa | punctuation_fa | mapping punctuations to persian punctuations |
| PUNCTUATION_EN | punctuation_en | mapping punctuations to english punctuations |
| PUNCTUATION_AR | punctuation_ar | mapping punctuations to Arabic punctuations |
| PUNCTUATION_FA | punctuation_fa | mapping punctuations to Persian punctuations |
| PUNCTUATION_EN | punctuation_en | mapping punctuations to English punctuations |

Other attributes:

* remove_extra_spaces : append multiple spaces together
* tokenization : replace punctuation characters that just are tokens
* remove_extra_spaces: Collapses consecutive spaces into a single space.
* tokenization: Tokenizes the text so that only punctuation characters forming standalone tokens are replaced (see the sketch below).
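
A minimal sketch of these two attributes in action (illustrative only; the exact output depends on which other configs you enable):

```python
from piraye import NormalizerBuilder

# tokenizing() separates punctuation tokens; remove_extra_spaces() then
# collapses the resulting runs of whitespace into single spaces
normalizer = NormalizerBuilder().tokenizing().remove_extra_spaces().build()
normalizer.normalize("این  یک   متن")  # expected: "این یک متن" (illustrative)
```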

## Development

* Install dependencies with `pip install -e .[dev]`
To set up a development environment, install dependencies with:

`pip install -e .[dev]`

## License

**GNU Lesser General Public License v2.1**

Primarily used for software libraries, the GNU LGPL requires that derived works be licensed under the same license, but
works that only link to it do not fall under this restriction. There are two commonly used versions of the GNU LGPL.

See [LICENSE](https://github.com/arushadev/piraye/blob/main/LICENSE)
Piraye is licensed under the GNU Lesser General Public License v2.1, which primarily applies to software libraries.
See the [LICENSE](https://github.com/arushadev/piraye/blob/main/LICENSE) file for more details.

## About

[Arusha](https://www.arusha.dev)
Piraye is maintained by [Arusha](https://www.arusha.dev).


2 changes: 2 additions & 0 deletions examples.md
@@ -28,6 +28,8 @@ print(normalizer.normalize(text)) # "این یک متن تسة اسﺘ ، ۲۴,
# without change punctuations (Config.PUNCTUATION_FA)
normalizer = NormalizerBuilder().alphabet_fa().digit_fa().remove_extra_spaces().tokenizing().build()
print(normalizer.normalize(text)) # "این یک متن تست است , ۲۴,۱۲,۱۴۰۰"

# use spacy tokenizer
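# a sketch of swapping in the spacy tokenizer; assumes NormalizerBuilder and
# the same `text` as in the earlier example, and mirrors the library's own
# tests: pass a SpacyTokenizer instance to tokenizing()
from piraye import SpacyTokenizer

tokenizer = SpacyTokenizer()
normalizer = NormalizerBuilder().alphabet_fa().digit_fa().remove_extra_spaces() \
    .tokenizing(tokenizer=tokenizer).build()
print(normalizer.normalize(text))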
```

``` python
3 changes: 2 additions & 1 deletion piraye/__init__.py
@@ -4,7 +4,8 @@
from .tasks.normalizer.multi_lingual_normalizer_builder import MultiLingualNormalizerBuilder
from .tasks.normalizer.normalizer_builder import NormalizerBuilder
from .tasks.tokenizer.nltk_tokenizer import NltkTokenizer
from .tasks.tokenizer.spacy_tokenizer import SpacyTokenizer
from .tokenizer import Tokenizer

__all__ = ["Normalizer", "Tokenizer", "NormalizerBuilder", "MultiLingualNormalizer", "MultiLingualNormalizerBuilder",
"NltkTokenizer"]
"NltkTokenizer", "SpacyTokenizer"]
6 changes: 3 additions & 3 deletions piraye/normalizer.py
@@ -7,19 +7,19 @@
class Normalizer(ABC):
"""
The Normalizer class is an abstract base class that defines the interface for text normalization. It provides two
abstract methods: normalize() and span_normalize(), which subclasses can implement to perform specific
normalization tasks.
Example Usage
# Create a subclass of Normalizer
class MyNormalizer(Normalizer):
def normalize(self, text: str) -> str:
# Implement the normalization logic here
...
def span_normalize(self, text: str) -> List[Tuple[int, int, str]]:
# Implement the normalization logic and return spans of normalized tokens
...
3 changes: 2 additions & 1 deletion piraye/tasks/normalizer/character_normalizer.py
@@ -6,7 +6,8 @@

from .char_config import CharConfig
from .mappings import MappingDict
from ..tokenizer.nltk_tokenizer import NltkTokenizer, Tokenizer
from ..tokenizer.nltk_tokenizer import NltkTokenizer
from ...tokenizer import Tokenizer
from ...normalizer import Normalizer


7 changes: 5 additions & 2 deletions piraye/tasks/normalizer/normalizer_builder.py
@@ -6,6 +6,7 @@

from .character_normalizer import CharacterNormalizer
from ...normalizer import Normalizer
from ...tokenizer import Tokenizer


class Config(enum.Enum):
@@ -57,6 +58,7 @@ def __init__(self, configs: List[Config] | None = None,
:param remove_extra_spaces: Whether to remove extra spaces during normalization
:param tokenization: Whether to tokenize the text during normalization.
"""
self.__tokenizer = None
if configs is None:
configs = []
self.__configs = configs
@@ -74,7 +76,7 @@ def build(self) -> Normalizer:
Config.SPACE_NORMAL in self.__configs):
self.__configs.append(Config.SPACE_KEEP)
return CharacterNormalizer([c.value for c in self.__configs],
self.__remove_extra_spaces, self.__tokenization)
self.__remove_extra_spaces, self.__tokenization, self.__tokenizer)

def alphabet_ar(self) -> NormalizerBuilder:
"""
@@ -196,9 +198,10 @@ def remove_extra_spaces(self, remove_extra_spaces: bool = True) -> NormalizerBui
self.__remove_extra_spaces = remove_extra_spaces
return self

def tokenizing(self, tokenization: bool = True) -> NormalizerBuilder:
def tokenizing(self, tokenization: bool = True, tokenizer: Tokenizer | None = None) -> NormalizerBuilder:
"""
Configure whether to tokenize before normalization, and optionally which tokenizer to use
"""
self.__tokenization = tokenization
self.__tokenizer = tokenizer
return self
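
A minimal sketch of the new `tokenizer` parameter (modeled on `test_quotes_spacy` below; when no tokenizer is passed, the builder presumably keeps its previous NLTK-based behavior):

```python
from piraye import NormalizerBuilder, SpacyTokenizer

# supply an explicit tokenizer to be used during normalization
normalizer = NormalizerBuilder().digit_en().punctuation_en().alphabet_fa() \
    .tokenizing(tokenizer=SpacyTokenizer()).remove_extra_spaces().build()
normalizer.normalize("«تست»")
```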
56 changes: 56 additions & 0 deletions piraye/tasks/tokenizer/spacy_tokenizer.py
@@ -0,0 +1,56 @@
"""This module includes a Tokenizer class for tokenizing texts"""
from abc import ABC
from typing import List, Tuple
from spacy.lang.en import English
from spacy.pipeline import Sentencizer

from ..normalizer.mappings import MappingDict
from ...tokenizer import Tokenizer


class SpacyTokenizer(Tokenizer, ABC):
    """
    A tokenizer implementation backed by spaCy
    ...
    Methods
    -------
    word_tokenize(text: str):
        return tokenized text
    sentence_tokenize(text: str):
        return sentence tokenized text
    """

    def __init__(self):
        """
        Constructor: set up a blank English pipeline, the character mappings
        used for cleaning, and a rule-based sentencizer.
        """
        self.__nlp = English()
        self.__en_mapping = MappingDict.load_jsons(["digit_en", "punc_en"])
        self.__tokenizer = self.__nlp.tokenizer
        self.__sentencizer = Sentencizer()

    def word_span_tokenize(self, text: str) -> List[Tuple[int, int, str]]:
        # cleaning maps characters one-to-one, so offsets computed on the
        # cleaned text are valid indices into the original text
        text2 = self.__clean_text(text)
        tokens = self.__tokenizer(text2)
        return [(token.idx, token.idx + len(token.text), text[token.idx: token.idx + len(token.text)])
                for token in tokens]

    def sentence_span_tokenize(self, text: str) -> List[Tuple[int, int, str]]:
        text2 = self.__clean_text(text)
        doc = self.__sentencizer(self.__nlp(text2))
        result = []
        last_index = 0
        for token in doc:
            if token.is_sent_end:
                end = token.idx + len(token.text)
                result.append((last_index, end, text[last_index:end]))
                last_index = end
        return result

    def __clean_text(self, text: str) -> str:
        """
        Clean the input text by replacing digits and punctuation with normalized versions.
        :param text: The input text to clean.
        :return: The cleaned text with normalized digits and punctuation.
        """
        return ''.join(
            [char if not self.__en_mapping.get(char)
             else self.__en_mapping.get(char).char for char in text])
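
A short usage sketch for the new class, using the method names exercised in the tests below (token and sentence counts are taken from those tests):

```python
from piraye import SpacyTokenizer

tokenizer = SpacyTokenizer()
# word tokens; test_sample_spacy expects 9 tokens for this input
print(len(tokenizer.word_tokenize("برای تست (شماره ۲.۱ نوشته) شده است")))
# sentence spans as (start, end, text) triples; two sentences here
print(tokenizer.sentence_span_tokenize("sentence1 sad. \n sentence 2."))
```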
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "piraye"
version = "0.4.0"
version = "0.5.0"
authors = [
{ name = "Hamed Khademi Khaledi", email = "khaledihkh@gmail.com" },
{ name = "HosseiN Khademi khaeldi", email = "hossein@arusha.dev" },
@@ -56,7 +56,7 @@ namespaces = false # to disable scanning PEP 420 namespaces (true by default)
piraye = ["**/*.json"]

[project.optional-dependencies]
dev = ["tqdm", "pytest", "pylint ==2.17.7", "flake8", "pytest"]
dev = ["tqdm", "pytest", "pylint ==2.17.7", "flake8", "pytest", "spacy"]

[project.urls]
"Homepage" = "https://github.com/arushadev/piraye"
19 changes: 17 additions & 2 deletions tests/test_normalizer.py
@@ -1,7 +1,6 @@
# tests for the normalizer
# pylint: skip-file

from ..piraye import NltkTokenizer
from ..piraye import NltkTokenizer, SpacyTokenizer
from ..piraye import NormalizerBuilder


@@ -49,6 +48,22 @@ def test_quotes():
    norm.normalize(text)


def test_quotes_spacy():
    tokenizer = SpacyTokenizer()
    text = "«"
    norm = NormalizerBuilder().digit_en().punctuation_en().alphabet_fa() \
        .tokenizing().remove_extra_spaces().tokenizing(tokenizer=tokenizer).build()
    norm.normalize(text)
    text = " «««« تست "
    norm = NormalizerBuilder().digit_en().punctuation_en().alphabet_fa() \
        .tokenizing().remove_extra_spaces().build()
    norm.normalize(text)
    text = " \" تست '' تست «««« تست "
    norm = NormalizerBuilder().digit_en().punctuation_en().alphabet_fa() \
        .tokenizing().remove_extra_spaces().build()
    norm.normalize(text)


def test_normalizer():
    tokens = NltkTokenizer().word_tokenize('\'\'Y\'"')
    print(tokens)
29 changes: 27 additions & 2 deletions tests/test_tokenizer.py
@@ -1,7 +1,6 @@
# tests for the tokenizer
# pylint: skip-file

from ..piraye import NltkTokenizer
from ..piraye import NltkTokenizer, SpacyTokenizer


def test_object():
@@ -15,6 +14,12 @@ def test_sample():
    assert len(tokenizer.word_tokenize(text)) == 7


def test_sample_spacy():
    text = "برای تست (شماره ۲.۱ نوشته) شده است"
    tokenizer = SpacyTokenizer()
    assert len(tokenizer.word_tokenize(text)) == 9


def test_double_quotes():
    text = "'\"\"تست\""
    tokenizer = NltkTokenizer()
@@ -28,7 +33,27 @@ def test_sentence_tokenizer():
    assert len(tokenizer.sentence_span_tokenize(text)) == 2


def test_sentence_tokenizer_spacy():
    text = "sentence1 sad. \n asd asd \n asdasd \n sentence 2."
    tokenizer = SpacyTokenizer()
    assert len(tokenizer.sentence_tokenize(text)) == 2
    assert len(tokenizer.sentence_span_tokenize(text)) == 2


def test_double_quotes2():
    text = "«»"
    tokenizer = NltkTokenizer()
    assert len(tokenizer.word_tokenize(text)) == 2


def test_link():
    # check that nltk tokenizes links incorrectly (spacy handles them; see below)
    text = "این یک لینک تست است https://www.google.com "
    tokenizer = NltkTokenizer()
    assert len(tokenizer.word_tokenize(text)) != 9


def test_link_spacy():
    text = "این یک لینک، (تست) است https://www.google.com "
    tokenizer = SpacyTokenizer()
    assert len(tokenizer.word_tokenize(text)) == 9

