
Fixed several TreebankWordTokenizer and NLTKWordTokenizer bugs #2877

Merged (7 commits) on Nov 6, 2021
33 changes: 31 additions & 2 deletions nltk/test/tokenize.doctest
@@ -3,10 +3,10 @@

>>> from nltk.tokenize import *

Regression Tests: Treebank Tokenizer
Regression Tests: NLTKWordTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Some test strings.
Tokenizing some test strings.

>>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
>>> word_tokenize(s1)
@@ -42,6 +42,35 @@ Some test strings.
>>> word_tokenize(s11)
['It', "'s", 'more', "'n", 'enough', '.']

Gathering the spans of the tokenized strings.

>>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
>>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
>>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
True

>>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
>>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
... (82, 83), (83, 84)]
>>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
True
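These span expectations come from the new span_tokenize method, which aligns the emitted tokens back against the original string. As a minimal illustration of the underlying helper, align_tokens from nltk.tokenize.util (the offsets follow directly from the string positions):

>>> from nltk.tokenize.util import align_tokens
>>> align_tokens(['Good', 'muffins', 'cost'], 'Good muffins cost')
[(0, 4), (5, 12), (13, 17)]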

Testing an improvement made to the TreebankWordTokenizer

>>> sx1 = '\xabNow that I can do.\xbb'
2 changes: 1 addition & 1 deletion nltk/tokenize/__init__.py
@@ -88,7 +88,7 @@
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize
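The widened import re-exports the detokenizer from the package root alongside the tokenizer. A quick round-trip sketch; the output shown is what the Treebank pair is expected to produce, not verbatim from this PR:

>>> from nltk.tokenize import TreebankWordDetokenizer, TreebankWordTokenizer
>>> toks = TreebankWordTokenizer().tokenize("Good muffins cost $3.88.")
>>> TreebankWordDetokenizer().detokenize(toks)
'Good muffins cost $3.88.'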


60 changes: 57 additions & 3 deletions nltk/tokenize/destructive.py
@@ -1,14 +1,16 @@
# Natural Language Toolkit: NLTK's very own tokenizer.
#
# Copyright (C) 2001-2021 NLTK Project
# Author:
# Author: Liling Tan
# Tom Aarsen <> (modifications)
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT


import re

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens


class MacIntyreContractions:
@@ -24,7 +26,7 @@ class MacIntyreContractions:
r"(?i)\b(got)(?#X)(ta)\b",
r"(?i)\b(lem)(?#X)(me)\b",
r"(?i)\b(more)(?#X)('n)\b",
r"(?i)\b(wan)(?#X)(na)\s",
r"(?i)\b(wan)(?#X)(na)(?=\s)",
]
CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
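The substantive change above is the last CONTRACTIONS2 pattern: the old \s consumed the whitespace after "wanna", so the substitution replaced a possible newline with a plain space and broke span alignment against the original text; the lookahead (?=\s) asserts the whitespace without consuming it. A minimal standalone sketch with plain re (the replacement string mirrors how the tokenizer applies these patterns):

>>> import re
>>> re.sub(r"(?i)\b(wan)(?#X)(na)\s", r" \1 \2 ", "I wanna\ngo")
'I  wan na go'
>>> re.sub(r"(?i)\b(wan)(?#X)(na)(?=\s)", r" \1 \2 ", "I wanna\ngo")
'I  wan na \ngo'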
@@ -53,8 +55,8 @@ class NLTKWordTokenizer(TokenizerI):
    # Ending quotes.
    ENDING_QUOTES = [
        (re.compile("([»”’])", re.U), r" \1 "),
        (re.compile(r"''"), " '' "),
        (re.compile(r'"'), " '' "),
        (re.compile(r"(\S)(\'\')"), r"\1 \2 "),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
    ]
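With the rule pair in place, both a straight double quote and a literal '' sequence are normalized to the Penn Treebank closing-quote token. An indicative example; the output assumes the patched tokenizer:

>>> from nltk.tokenize import NLTKWordTokenizer
>>> NLTKWordTokenizer().tokenize('He said, "no way."')
['He', 'said', ',', '``', 'no', 'way', '.', "''"]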
@@ -146,3 +148,55 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):
        # text = regexp.sub(r' \1 \2 \3 ', text)

        return text if return_str else text.split()

    def span_tokenize(self, text):
        r"""
        Uses the post-hoc nltk.tokenize.util.align_tokens to return the offset spans.

        >>> from nltk.tokenize import NLTKWordTokenizer
        >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
        >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
        ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
        ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
        ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
        >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
        True
        >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
        ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
        ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
        >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
        True

        Additional example:

        >>> from nltk.tokenize import NLTKWordTokenizer
        >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
        >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
        ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
        ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
        ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
        ... (82, 83), (83, 84)]
        >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
        True
        >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
        ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
        ... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
        >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
        True
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to the original double quotes.
        # Do this only if the original text contains double quote(s) or double
        # single-quotes (because '' might be transformed to `` if it is
        # treated as starting quotes).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes.
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quotes back with the original double quotes.
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)
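Usage note: the yielded spans always index into the original string, so slicing recovers the raw surface text, including the double-quote characters that tokenize() rewrites to ``/''. A short sketch; the output shown is indicative:

>>> from nltk.tokenize import NLTKWordTokenizer
>>> s = 'He said, "no."'
>>> [s[a:b] for a, b in NLTKWordTokenizer().span_tokenize(s)]
['He', 'said', ',', '"', 'no', '.', '"']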
3 changes: 2 additions & 1 deletion nltk/tokenize/treebank.py
@@ -3,6 +3,7 @@
# Copyright (C) 2001-2021 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
# Tom Aarsen <> (modifications)
#
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
@@ -86,8 +87,8 @@ class TreebankWordTokenizer(TokenizerI):

    # ending quotes
    ENDING_QUOTES = [
        (re.compile(r"''"), " '' "),
        (re.compile(r'"'), " '' "),
        (re.compile(r"(\S)(\'\')"), r"\1 \2 "),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
    ]
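The same rule pair is mirrored here so that TreebankWordTokenizer treats a literal '' in the input the same way as a closing double quote. An indicative example; the output assumes the patched rules:

>>> from nltk.tokenize import TreebankWordTokenizer
>>> TreebankWordTokenizer().tokenize("It's a ''quoted'' word")
['It', "'s", 'a', '``', 'quoted', "''", 'word']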