Fixed several TreebankWordTokenizer and NLTKWordTokenizer bugs (#2877)
* Fixed an issue with quote tokenization, with a small regression for detokenization

* Updated double-quote to single quote in doctest output

* Resolved an issue with 'wanna' absorbing one space too many in (de)tokenization

* Allow importing TreebankWordDetokenizer from nltk.tokenize

* Added additional test for span_tokenize

* Add span_tokenize to NLTKWordTokenizer, like in TreebankWordTokenizer

* Added credits for modifications
tomaarsen committed Nov 6, 2021
1 parent ec1d49d commit 7d3d6a4
Showing 4 changed files with 91 additions and 7 deletions.
33 changes: 31 additions & 2 deletions nltk/test/tokenize.doctest
@@ -3,10 +3,10 @@

>>> from nltk.tokenize import *

- Regression Tests: Treebank Tokenizer
+ Regression Tests: NLTKWordTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- Some test strings.
+ Tokenizing some test strings.

>>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
>>> word_tokenize(s1)
@@ -42,6 +42,35 @@ Some test strings.
>>> word_tokenize(s11)
['It', "'s", 'more', "'n", 'enough', '.']

Gathering the spans of the tokenized strings.

>>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
>>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
>>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
True

>>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
>>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
... (82, 83), (83, 84)]
>>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
True

Testing improvement made to the TreebankWordTokenizer

>>> sx1 = '\xabNow that I can do.\xbb'
2 changes: 1 addition & 1 deletion nltk/tokenize/__init__.py
@@ -88,7 +88,7 @@
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
- from nltk.tokenize.treebank import TreebankWordTokenizer
+ from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


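With this change the detokenizer is importable straight from the package root. A quick sketch of the round trip this enables, reusing the s11 tokens from the doctest above (output comments reflect expected default behavior, not part of this diff):

from nltk.tokenize import TreebankWordDetokenizer

# Previously only reachable as nltk.tokenize.treebank.TreebankWordDetokenizer
detok = TreebankWordDetokenizer()
print(detok.detokenize(['It', "'s", 'more', "'n", 'enough', '.']))
# -> It's more'n enough.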
60 changes: 57 additions & 3 deletions nltk/tokenize/destructive.py
@@ -1,14 +1,16 @@
# Natural Language Toolkit: NLTK's very own tokenizer.
#
# Copyright (C) 2001-2021 NLTK Project
- # Author:
+ # Author: Liling Tan
+ #         Tom Aarsen <> (modifications)
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT


import re

from nltk.tokenize.api import TokenizerI
+ from nltk.tokenize.util import align_tokens


class MacIntyreContractions:
@@ -24,7 +26,7 @@ class MacIntyreContractions:
r"(?i)\b(got)(?#X)(ta)\b",
r"(?i)\b(lem)(?#X)(me)\b",
r"(?i)\b(more)(?#X)('n)\b",
r"(?i)\b(wan)(?#X)(na)\s",
r"(?i)\b(wan)(?#X)(na)(?=\s)",
]
CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
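The wanna fix is subtle: the old \s consumed the trailing space as part of the match, while the new lookahead (?=\s) only asserts that a space follows. A minimal sketch of the difference at the string level (variable names are illustrative):

import re

old_wanna = re.compile(r"(?i)\b(wan)(?#X)(na)\s")      # consumes the space
new_wanna = re.compile(r"(?i)\b(wan)(?#X)(na)(?=\s)")  # leaves it in place

text = "I wanna go"
print(repr(old_wanna.sub(r" \1 \2 ", text)))  # 'I  wan na go'  -- a space is lost
print(repr(new_wanna.sub(r" \1 \2 ", text)))  # 'I  wan na  go' -- the space survives

The tokens produced by text.split() are identical either way; the lost space only shows up when aligning spans or detokenizing, which is exactly where the bug appeared.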
@@ -53,8 +55,8 @@ class NLTKWordTokenizer(TokenizerI):
    # Ending quotes.
    ENDING_QUOTES = [
        (re.compile("([»”’])", re.U), r" \1 "),
-       (re.compile(r"''"), " '' "),
        (re.compile(r'"'), " '' "),
+       (re.compile(r"(\S)(\'\')"), r"\1 \2 "),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
    ]
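The removed rule split every '' unconditionally; the replacement only breaks off closing quotes that are glued to a preceding non-space character, leaving space-preceded pairs for the starting-quote rules (which turn them into ``). A small illustration of the new pattern in isolation, outside the full tokenizer pipeline:

import re

pattern, repl = re.compile(r"(\S)(\'\')"), r"\1 \2 "
print(pattern.sub(repl, "some ''good muffins'' here"))
# -> some ''good muffins ''  here
# Only the pair attached to "muffins" is split off; the opening pair
# after the space is left alone.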
@@ -146,3 +148,55 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):
        #     text = regexp.sub(r' \1 \2 \3 ', text)

        return text if return_str else text.split()

    def span_tokenize(self, text):
        r"""
        Uses the post-hoc nltk.tokenize.util.align_tokens to return the offset spans.

            >>> from nltk.tokenize import NLTKWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
            True

        Additional example:

            >>> from nltk.tokenize import NLTKWordTokenizer
            >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
            >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
            ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
            ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
            ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
            ... (82, 83), (83, 84)]
            >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
            ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
            ... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
            >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
            True
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to original double quotes
        # Do this only if original text contains double quote(s) or double
        # single-quotes (because '' might be transformed to `` if it is
        # treated as starting quotes).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quotes back to double quotes
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)
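For reference, align_tokens (from nltk.tokenize.util) does the actual offset work: it locates each token left to right in the raw string, which is why double quotes must first be restored to their original surface form above. A minimal usage sketch:

from nltk.tokenize.util import align_tokens

# Each token is searched for in order within the raw string,
# so the resulting spans come back in document order.
print(list(align_tokens(['Good', 'muffins'], 'Good muffins')))
# -> [(0, 4), (5, 12)]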
3 changes: 2 additions & 1 deletion nltk/tokenize/treebank.py
@@ -3,6 +3,7 @@
# Copyright (C) 2001-2021 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
+ #         Tom Aarsen <> (modifications)
#
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
@@ -86,8 +87,8 @@ class TreebankWordTokenizer(TokenizerI):

    # ending quotes
    ENDING_QUOTES = [
-       (re.compile(r"''"), " '' "),
        (re.compile(r'"'), " '' "),
+       (re.compile(r"(\S)(\'\')"), r"\1 \2 "),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
    ]
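The same rule swap is mirrored here so that TreebankWordTokenizer and NLTKWordTokenizer stay in sync. As a reminder of the quote normalization both perform (expected behavior, not part of this diff):

from nltk.tokenize.treebank import TreebankWordTokenizer

# Opening double quotes become `` and closing ones become ''.
print(TreebankWordTokenizer().tokenize('"Hello," she said.'))
# -> ['``', 'Hello', ',', "''", 'she', 'said', '.']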
