Fixed several TreebankWordTokenizer and NLTKWordTokenizer bugs (#2877)
* Fixed an issue with quote tokenization, with a small regression for detokenization

* Updated double-quote to single quote in doctest output

* Resolved an issue with 'wanna' absorbing one space too many in (de)tokenization

* Allow importing TreebankWordDetokenizer from nltk.tokenize

* Added additional test for span_tokenize

* Add span_tokenize to NLTKWordTokenizer, like in TreebankWordTokenizer

* Added credits for modifications
tomaarsen committed Nov 6, 2021
1 parent ec1d49d commit 7d3d6a4
Showing 4 changed files with 91 additions and 7 deletions.
33 changes: 31 additions & 2 deletions nltk/test/tokenize.doctest
@@ -3,10 +3,10 @@

>>> from nltk.tokenize import *

- Regression Tests: Treebank Tokenizer
+ Regression Tests: NLTKWordTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- Some test strings.
+ Tokenizing some test strings.

>>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
>>> word_tokenize(s1)
@@ -42,6 +42,35 @@ Some test strings.
>>> word_tokenize(s11)
['It', "'s", 'more', "'n", 'enough', '.']

Gathering the spans of the tokenized strings.

>>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
>>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
>>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
True

>>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
>>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
... (82, 83), (83, 84)]
>>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
True

Testing improvement made to the TreebankWordTokenizer

>>> sx1 = '\xabNow that I can do.\xbb'
2 changes: 1 addition & 1 deletion nltk/tokenize/__init__.py
@@ -88,7 +88,7 @@
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
- from nltk.tokenize.treebank import TreebankWordTokenizer
+ from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


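With this change the detokenizer is importable straight from the package root. A quick sketch of the round trip this enables, reusing the s11 tokens from the doctest above (output comments reflect expected default behavior, not part of this diff):

from nltk.tokenize import TreebankWordDetokenizer

# Previously only reachable as nltk.tokenize.treebank.TreebankWordDetokenizer
detok = TreebankWordDetokenizer()
print(detok.detokenize(['It', "'s", 'more', "'n", 'enough', '.']))
# -> It's more'n enough.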
60 changes: 57 additions & 3 deletions nltk/tokenize/destructive.py
@@ -1,14 +1,16 @@
# Natural Language Toolkit: NLTK's very own tokenizer.
#
# Copyright (C) 2001-2021 NLTK Project
- # Author:
+ # Author: Liling Tan
+ #         Tom Aarsen <> (modifications)
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT


import re

from nltk.tokenize.api import TokenizerI
+ from nltk.tokenize.util import align_tokens


class MacIntyreContractions:
@@ -24,7 +26,7 @@ class MacIntyreContractions:
r"(?i)\b(got)(?#X)(ta)\b",
r"(?i)\b(lem)(?#X)(me)\b",
r"(?i)\b(more)(?#X)('n)\b",
r"(?i)\b(wan)(?#X)(na)\s",
r"(?i)\b(wan)(?#X)(na)(?=\s)",
]
CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
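The wanna fix is subtle: the old \s consumed the trailing space as part of the match, while the new lookahead (?=\s) only asserts that a space follows. A minimal sketch of the difference at the string level (variable names are illustrative):

import re

old_wanna = re.compile(r"(?i)\b(wan)(?#X)(na)\s")      # consumes the space
new_wanna = re.compile(r"(?i)\b(wan)(?#X)(na)(?=\s)")  # leaves it in place

text = "I wanna go"
print(repr(old_wanna.sub(r" \1 \2 ", text)))  # 'I  wan na go'  -- a space is lost
print(repr(new_wanna.sub(r" \1 \2 ", text)))  # 'I  wan na  go' -- the space survives

The tokens produced by text.split() are identical either way; the lost space only shows up when aligning spans or detokenizing, which is exactly where the bug appeared.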
@@ -53,8 +55,8 @@ class NLTKWordTokenizer(TokenizerI):
    # Ending quotes.
    ENDING_QUOTES = [
        (re.compile("([»”’])", re.U), r" \1 "),
-       (re.compile(r"''"), " '' "),
        (re.compile(r'"'), " '' "),
+       (re.compile(r"(\S)(\'\')"), r"\1 \2 "),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
    ]
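The removed rule split every '' unconditionally; the replacement only breaks off closing quotes that are glued to a preceding non-space character, leaving space-preceded pairs for the starting-quote rules (which turn them into ``). A small illustration of the new pattern in isolation, outside the full tokenizer pipeline:

import re

pattern, repl = re.compile(r"(\S)(\'\')"), r"\1 \2 "
print(pattern.sub(repl, "some ''good muffins'' here"))
# -> some ''good muffins ''  here
# Only the pair attached to "muffins" is split off; the opening pair
# after the space is left alone.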
@@ -146,3 +148,55 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):
        #     text = regexp.sub(r' \1 \2 \3 ', text)

        return text if return_str else text.split()

    def span_tokenize(self, text):
        r"""
        Uses the post-hoc nltk.tokenize.util.align_tokens to return the offset spans.

            >>> from nltk.tokenize import NLTKWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
            True

        Additional example:

            >>> from nltk.tokenize import NLTKWordTokenizer
            >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
            >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
            ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
            ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
            ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
            ... (82, 83), (83, 84)]
            >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
            ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
            ... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
            >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
            True
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to original double quotes
        # Do this only if original text contains double quote(s) or double
        # single-quotes (because '' might be transformed to `` if it is
        # treated as starting quotes).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quotes back to double quotes
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)
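For reference, align_tokens (from nltk.tokenize.util) does the actual offset work: it locates each token left to right in the raw string, which is why double quotes must first be restored to their original surface form above. A minimal usage sketch:

from nltk.tokenize.util import align_tokens

# Each token is searched for in order within the raw string,
# so the resulting spans come back in document order.
print(list(align_tokens(['Good', 'muffins'], 'Good muffins')))
# -> [(0, 4), (5, 12)]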
3 changes: 2 additions & 1 deletion nltk/tokenize/treebank.py
@@ -3,6 +3,7 @@
# Copyright (C) 2001-2021 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
+ #         Tom Aarsen <> (modifications)
#
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
@@ -86,8 +87,8 @@ class TreebankWordTokenizer(TokenizerI):

    # ending quotes
    ENDING_QUOTES = [
-       (re.compile(r"''"), " '' "),
        (re.compile(r'"'), " '' "),
+       (re.compile(r"(\S)(\'\')"), r"\1 \2 "),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
    ]
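The same rule swap is mirrored here so that TreebankWordTokenizer and NLTKWordTokenizer stay in sync. As a reminder of the quote normalization both perform (expected behavior, not part of this diff):

from nltk.tokenize.treebank import TreebankWordTokenizer

# Opening double quotes become `` and closing ones become ''.
print(TreebankWordTokenizer().tokenize('"Hello," she said.'))
# -> ['``', 'Hello', ',', "''", 'she', 'said', '.']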
