Merge pull request #39 from arushadev/paragraph
Fix paragraph tokenizer
hosseinkhaledi committed Apr 28, 2024
2 parents c5f6372 + 9064b4e commit 0bd7374
Showing 3 changed files with 14 additions and 5 deletions.
2 changes: 1 addition & 1 deletion piraye/tokenizer.py
@@ -72,7 +72,7 @@ def paragraph_span_tokenize(self, text: str) -> List[Tuple[int, int, str]]:
         for _, sentence_end, _ in sentences:
             if last_index + 1 >= text2_len:
                 break
-            pointer = sentence_end + 1
+            pointer = sentence_end
             while True:
                 if pointer + 1 >= text2_len:
                     paragraphs.append((last_index, pointer, text[last_index:pointer]))
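Why the one-character change matters: in the surrounding context, paragraphs are collected with the half-open slice text[last_index:pointer], which suggests sentence spans are half-open too, so sentence_end already points one past the sentence's last character. Starting the scan at sentence_end + 1 therefore skipped a character, which could be the first newline of a paragraph separator. A minimal self-contained sketch of that failure mode (the half-open span convention is an inference from this diff, not a confirmed detail of piraye's internals):

# Toy reproduction of the off-by-one fixed above. The half-open span
# convention (end index is exclusive) is an assumption inferred from the
# slice text[last_index:pointer] in the context lines.
text = "par1 sen1.\n\npar2 sen1."
sentence_end = 10  # exclusive end of the first sentence: text[0:10] == "par1 sen1."

def at_paragraph_break(s: str, pointer: int) -> bool:
    """Toy check: does a blank-line paragraph separator start at this pointer?"""
    return s[pointer:pointer + 2] == "\n\n"

print(at_paragraph_break(text, sentence_end))      # True  -- pointer = sentence_end (fixed)
print(at_paragraph_break(text, sentence_end + 1))  # False -- pointer = sentence_end + 1 (buggy)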
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "piraye"
-version = "0.6.0"
+version = "0.6.1"
 authors = [
     { name = "Hamed Khademi Khaledi", email = "khaledihkh@gmail.com" },
     { name = "HosseiN Khademi khaeldi", email = "hossein@arusha.dev" },
15 changes: 12 additions & 3 deletions tests/test_tokenizer.py
@@ -34,10 +34,19 @@ def test_sentence_tokenizer():


 def test_paragraph_tokenizer():
-    text = "par1 sen1 sad. par1 \n sen2. par1 \n\n sen3. \n par2 sen1.\n\n\n\n par3 sen1.\n\n"
+    text = "par1 sen1 sad. par1 \n sen2. par1 \n\n sen3.\n par2 sen1.\n\n\n\n par3 sen1. \n par4 sen1.\n\n"
     tokenizer = NltkTokenizer()
-    assert len(tokenizer.paragraph_tokenize(text)) == 3
-    assert len(tokenizer.paragraph_span_tokenize(text)) == 3
+    assert len(tokenizer.paragraph_tokenize(text)) == 4
+    assert len(tokenizer.paragraph_span_tokenize(text)) == 4
+    assert len(tokenizer.paragraph_tokenize("par1 sen1 sad.")) == 1
+    assert len(tokenizer.paragraph_tokenize("par1 sen1 sad. par1 \n sen2. ")) == 1
 
 
+def test_paragraph_tokenizer_spacy():
+    text = "par1 sen1 sad. par1 \n sen2. par1 \n\n sen3.\n par2 sen1.\n\n\n\n par3 sen1. \n par4 sen1.\n\n"
+    tokenizer = SpacyTokenizer()
+    assert len(tokenizer.paragraph_tokenize(text)) == 4
+    assert len(tokenizer.paragraph_span_tokenize(text)) == 4
+    assert len(tokenizer.paragraph_tokenize("par1 sen1 sad.")) == 1
+    assert len(tokenizer.paragraph_tokenize("par1 sen1 sad. par1 \n sen2. ")) == 1
 
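In usage terms, the new assertions say the sample string now yields four paragraphs under both the NLTK and spaCy backends, and that a single sentence or a trailing space still yields one. A sketch of what the tests exercise, assuming the tokenizer classes are importable from the package root (the test file's import block is outside this hunk, so the exact path is an assumption):

# Usage sketch mirroring the assertions above. The import path is an
# assumption; this hunk does not show the test file's imports.
from piraye import NltkTokenizer

tokenizer = NltkTokenizer()
text = "par1 sen1 sad. par1 \n sen2. par1 \n\n sen3.\n par2 sen1.\n\n\n\n par3 sen1. \n par4 sen1.\n\n"

# paragraph_span_tokenize returns (start, end, text) triples, per the
# List[Tuple[int, int, str]] annotation in piraye/tokenizer.py above.
for start, end, paragraph in tokenizer.paragraph_span_tokenize(text):
    print(start, end, repr(paragraph))  # the updated tests expect four triples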
