From 7d3d6a4f77b76a1f5eb0d0cbc9731716fcc17e68 Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Sun, 7 Nov 2021 00:34:15 +0100
Subject: [PATCH] Fixed several TreebankWordTokenizer and NLTKWordTokenizer
 bugs (#2877)

* Fixed issue with quote-tokenization, small regression for detokenization
* Updated double-quote to single quote in doctest output
* Resolved issue with 'wanna' absorbing a space too much in (de)tokenization
* Allow importing TreebankWordDetokenizer from nltk.tokenize
* Added additional test for span_tokenize
* Add span_tokenize to NLTKWordTokenizer, like in TreebankWordTokenizer
* Added credits for modifications
---
 nltk/test/tokenize.doctest   | 33 ++++++++++++++++++--
 nltk/tokenize/__init__.py    |  2 +-
 nltk/tokenize/destructive.py | 60 ++++++++++++++++++++++++++++++++++--
 nltk/tokenize/treebank.py    |  3 +-
 4 files changed, 91 insertions(+), 7 deletions(-)

diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest
index a5e86d44e8..0e3a0749e7 100644
--- a/nltk/test/tokenize.doctest
+++ b/nltk/test/tokenize.doctest
@@ -3,10 +3,10 @@

     >>> from nltk.tokenize import *

-Regression Tests: Treebank Tokenizer
+Regression Tests: NLTKWordTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Some test strings.
+Tokenizing some test strings.

     >>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
     >>> word_tokenize(s1)
@@ -42,6 +42,35 @@ Some test strings.
     >>> word_tokenize(s11)
     ['It', "'s", 'more', "'n", 'enough', '.']

+Gathering the spans of the tokenized strings.
+
+    >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
+    >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
+    ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
+    ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
+    ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
+    >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
+    True
+    >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
+    ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
+    ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
+    >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
+    True
+
+    >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
+    >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
+    ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
+    ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
+    ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
+    ... (82, 83), (83, 84)]
+    >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
+    True
+    >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
+    ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
+    ... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
+    >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
+    True
+
 Testing improvement made to the TreebankWordTokenizer

     >>> sx1 = '\xabNow that I can do.\xbb'
diff --git a/nltk/tokenize/__init__.py b/nltk/tokenize/__init__.py
index 9a82830e12..e79fc6a476 100644
--- a/nltk/tokenize/__init__.py
+++ b/nltk/tokenize/__init__.py
@@ -88,7 +88,7 @@
 from nltk.tokenize.stanford_segmenter import StanfordSegmenter
 from nltk.tokenize.texttiling import TextTilingTokenizer
 from nltk.tokenize.toktok import ToktokTokenizer
-from nltk.tokenize.treebank import TreebankWordTokenizer
+from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
 from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize
diff --git a/nltk/tokenize/destructive.py b/nltk/tokenize/destructive.py
index a4daf0941f..32eb64fd8f 100644
--- a/nltk/tokenize/destructive.py
+++ b/nltk/tokenize/destructive.py
@@ -1,7 +1,8 @@
 # Natural Language Toolkit: NLTK's very own tokenizer.
 #
 # Copyright (C) 2001-2021 NLTK Project
-# Author:
+# Author: Liling Tan
+#         Tom Aarsen <> (modifications)
 # URL:
 # For license information, see LICENSE.TXT
@@ -9,6 +10,7 @@
 import re

 from nltk.tokenize.api import TokenizerI
+from nltk.tokenize.util import align_tokens


 class MacIntyreContractions:
@@ -24,7 +26,7 @@ class MacIntyreContractions:
         r"(?i)\b(got)(?#X)(ta)\b",
         r"(?i)\b(lem)(?#X)(me)\b",
         r"(?i)\b(more)(?#X)('n)\b",
-        r"(?i)\b(wan)(?#X)(na)\s",
+        r"(?i)\b(wan)(?#X)(na)(?=\s)",
     ]
     CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
     CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
@@ -53,8 +55,8 @@ class NLTKWordTokenizer(TokenizerI):
     # Ending quotes.
     ENDING_QUOTES = [
         (re.compile("([»”’])", re.U), r" \1 "),
+        (re.compile(r"''"), " '' "),
         (re.compile(r'"'), " '' "),
-        (re.compile(r"(\S)(\'\')"), r"\1 \2 "),
         (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
         (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
     ]
@@ -146,3 +148,55 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):
         #     text = regexp.sub(r' \1 \2 \3 ', text)

         return text if return_str else text.split()
+
+    def span_tokenize(self, text):
+        r"""
+        Uses the post-hoc nltk.tokenize.util.align_tokens to return the offset spans.
+
+            >>> from nltk.tokenize import NLTKWordTokenizer
+            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
+            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
+            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
+            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
+            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
+            >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
+            True
+            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
+            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
+            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
+            >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
+            True
+
+        Additional example
+            >>> from nltk.tokenize import NLTKWordTokenizer
+            >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
+            >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
+            ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
+            ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
+            ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
+            ... (82, 83), (83, 84)]
+            >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
+            True
+            >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
+            ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
+            ... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
+            >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
+            True
+        """
+        raw_tokens = self.tokenize(text)
+
+        # Convert converted quotes back to original double quotes
+        # Do this only if original text contains double quote(s) or double
+        # single-quotes (because '' might be transformed to `` if it is
+        # treated as starting quotes).
+        if ('"' in text) or ("''" in text):
+            # Find double quotes and converted quotes
+            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]
+
+            # Replace converted quotes back to double quotes
+            tokens = [
+                matched.pop(0) if tok in ['"', "``", "''"] else tok
+                for tok in raw_tokens
+            ]
+        else:
+            tokens = raw_tokens
+
+        yield from align_tokens(tokens, text)
diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py
index f5b90f5a0a..51c2020130 100644
--- a/nltk/tokenize/treebank.py
+++ b/nltk/tokenize/treebank.py
@@ -3,6 +3,7 @@
 # Copyright (C) 2001-2021 NLTK Project
 # Author: Edward Loper
 #         Michael Heilman (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
+#         Tom Aarsen <> (modifications)
 #
 # URL:
 # For license information, see LICENSE.TXT
@@ -86,8 +87,8 @@ class TreebankWordTokenizer(TokenizerI):

     # ending quotes
     ENDING_QUOTES = [
+        (re.compile(r"''"), " '' "),
         (re.compile(r'"'), " '' "),
-        (re.compile(r"(\S)(\'\')"), r"\1 \2 "),
         (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
         (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
     ]
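
A note on how the reordered ENDING_QUOTES rules and the new span_tokenize()
interact: tokenize() still normalizes double quotes to the Treebank convention
(`` for an opening quote, '' for a closing quote), so span_tokenize() first maps
those normalized tokens back to the quote characters that actually occur in the
source string, and only then calls align_tokens. A minimal sketch of that round
trip, assuming a build with this patch applied:

    >>> from nltk.tokenize import NLTKWordTokenizer
    >>> s = 'He said, "hello".'
    >>> NLTKWordTokenizer().tokenize(s)
    ['He', 'said', ',', '``', 'hello', "''", '.']
    >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)]
    ['He', 'said', ',', '"', 'hello', '"', '.']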
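
The CONTRACTIONS2 fix works the same way in both directions: with the trailing
whitespace expressed as a zero-width lookahead (?=\s) rather than a consumed \s,
splitting "wanna" into 'wan' + 'na' no longer swallows the space that follows
it, which is what lets detokenization invert tokenization exactly. A sketch of
the intended round trip, again assuming this patch is applied (note that
TreebankWordDetokenizer is importable from nltk.tokenize as of the __init__.py
change above):

    >>> from nltk.tokenize import TreebankWordTokenizer, TreebankWordDetokenizer
    >>> tokens = TreebankWordTokenizer().tokenize("I wanna watch a movie")
    >>> tokens
    ['I', 'wan', 'na', 'watch', 'a', 'movie']
    >>> TreebankWordDetokenizer().detokenize(tokens)
    'I wanna watch a movie'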