From dfeb8efe26fe1850c52bf6dceed1f399695d6060 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 4 Nov 2021 18:58:05 +0100 Subject: [PATCH 1/7] Fixed issue with quote-tokenization, small regression for detokenization --- nltk/test/tokenize.doctest | 6 ++++++ nltk/tokenize/destructive.py | 2 +- nltk/tokenize/treebank.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest index a5e86d44e8..093f838db9 100644 --- a/nltk/test/tokenize.doctest +++ b/nltk/test/tokenize.doctest @@ -41,6 +41,12 @@ Some test strings. >>> s11 = "It's more'n enough." >>> word_tokenize(s11) ['It', "'s", 'more', "'n", 'enough', '.'] + >>> s12 = "''Hello, there!''" + >>> word_tokenize(s12) + ["''", "Hello", ",", "there", "!", "''"] + >>> s13 = "''What a wonderful quote, this is'' - Someone (12 BC)" + >>> word_tokenize(s13) + ["''", 'What', 'a', 'wonderful', 'quote', ',', 'this', 'is', "''", '-', 'Someone', '(', '12', 'BC', ')'] Testing improvement made to the TreebankWordTokenizer diff --git a/nltk/tokenize/destructive.py b/nltk/tokenize/destructive.py index a4daf0941f..7eac2cb0fe 100644 --- a/nltk/tokenize/destructive.py +++ b/nltk/tokenize/destructive.py @@ -53,8 +53,8 @@ class NLTKWordTokenizer(TokenizerI): # Ending quotes. ENDING_QUOTES = [ (re.compile("([»”’])", re.U), r" \1 "), + (re.compile(r"''"), " '' "), (re.compile(r'"'), " '' "), - (re.compile(r"(\S)(\'\')"), r"\1 \2 "), (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "), (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "), ] diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py index f5b90f5a0a..47ffa19a8d 100644 --- a/nltk/tokenize/treebank.py +++ b/nltk/tokenize/treebank.py @@ -86,8 +86,8 @@ class TreebankWordTokenizer(TokenizerI): # ending quotes ENDING_QUOTES = [ + (re.compile(r"''"), " '' "), (re.compile(r'"'), " '' "), - (re.compile(r"(\S)(\'\')"), r"\1 \2 "), (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "), (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "), ] From 9a8622f53daefd98a0182c0597f4aa3c67e01991 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 4 Nov 2021 19:35:54 +0100 Subject: [PATCH 2/7] Updated double-quote to single quote in doctest output --- nltk/test/tokenize.doctest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest index 093f838db9..8153090de0 100644 --- a/nltk/test/tokenize.doctest +++ b/nltk/test/tokenize.doctest @@ -43,7 +43,7 @@ Some test strings. 
['It', "'s", 'more', "'n", 'enough', '.'] >>> s12 = "''Hello, there!''" >>> word_tokenize(s12) - ["''", "Hello", ",", "there", "!", "''"] + ["''", 'Hello', ',', 'there', '!', "''"] >>> s13 = "''What a wonderful quote, this is'' - Someone (12 BC)" >>> word_tokenize(s13) ["''", 'What', 'a', 'wonderful', 'quote', ',', 'this', 'is', "''", '-', 'Someone', '(', '12', 'BC', ')'] From ef4739e3aa7436f20478034af5d4e8d9ecd6dee8 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 4 Nov 2021 19:36:55 +0100 Subject: [PATCH 3/7] Resolved issue with 'wanna' absorbing a space too much in (de)tokenization --- nltk/test/tokenize.doctest | 3 +++ nltk/tokenize/destructive.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest index 8153090de0..b9065f6278 100644 --- a/nltk/test/tokenize.doctest +++ b/nltk/test/tokenize.doctest @@ -73,6 +73,9 @@ Testing treebank's detokenizer >>> s = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't." >>> detokenizer.detokenize(word_tokenize(s)) 'Well, we couldn\'t have this predictable, cliche-ridden, "Touched by an Angel" (a show creator John Masius worked on) wanna-be if she didn\'t.' + >>> s = "I wanna watch something" + >>> detokenizer.detokenize(word_tokenize(s)) + 'I wanna watch something' >>> s = "I cannot cannot work under these conditions!" >>> detokenizer.detokenize(word_tokenize(s)) 'I cannot cannot work under these conditions!' diff --git a/nltk/tokenize/destructive.py b/nltk/tokenize/destructive.py index 7eac2cb0fe..4e44de34c5 100644 --- a/nltk/tokenize/destructive.py +++ b/nltk/tokenize/destructive.py @@ -24,7 +24,7 @@ class MacIntyreContractions: r"(?i)\b(got)(?#X)(ta)\b", r"(?i)\b(lem)(?#X)(me)\b", r"(?i)\b(more)(?#X)('n)\b", - r"(?i)\b(wan)(?#X)(na)\s", + r"(?i)\b(wan)(?#X)(na)(?=\s)", ] CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"] CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"] From 9766dde0698c900be380539ec4c8720536545dbd Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 4 Nov 2021 20:22:36 +0100 Subject: [PATCH 4/7] Allow importing TreebankWordDetokenizer from nltk.tokenize --- nltk/tokenize/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/tokenize/__init__.py b/nltk/tokenize/__init__.py index 9a82830e12..e79fc6a476 100644 --- a/nltk/tokenize/__init__.py +++ b/nltk/tokenize/__init__.py @@ -88,7 +88,7 @@ from nltk.tokenize.stanford_segmenter import StanfordSegmenter from nltk.tokenize.texttiling import TextTilingTokenizer from nltk.tokenize.toktok import ToktokTokenizer -from nltk.tokenize.treebank import TreebankWordTokenizer +from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize From 1b854decf1eae5a6f921ca521ca99329f7e2be12 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 4 Nov 2021 20:44:56 +0100 Subject: [PATCH 5/7] Added additional test for span_tokenize --- nltk/test/tokenize.doctest | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest index b9065f6278..f0869aea7e 100644 --- a/nltk/test/tokenize.doctest +++ b/nltk/test/tokenize.doctest @@ -58,6 +58,9 @@ Testing improvement made to the TreebankWordTokenizer >>> expected = ['The', 'unicode', '201C', 'and', '201D', '\u201c', 'LEFT', '(', 'RIGHT', ')', 'DOUBLE', 'QUOTATION', 'MARK', '\u201d', 'is', 
'also', 'OPEN_PUNCT', 'and', 'CLOSE_PUNCT', '.'] >>> word_tokenize(sx2) == expected True + >>> sx3 = "''Hello'\"" + >>> list(TreebankWordTokenizer().span_tokenize(sx3)) + [(0, 2), (2, 7), (7, 8), (8, 9)] Testing treebank's detokenizer From 5c42d8357e0bb890077c7a04b51bdbd4a654ec52 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 4 Nov 2021 21:15:43 +0100 Subject: [PATCH 6/7] Add span_tokenize to NLTKWordTokenizer, like in TreebankWordTokenizer --- nltk/test/tokenize.doctest | 45 ++++++++++++++++++++---------- nltk/tokenize/destructive.py | 53 ++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 14 deletions(-) diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest index f0869aea7e..0e3a0749e7 100644 --- a/nltk/test/tokenize.doctest +++ b/nltk/test/tokenize.doctest @@ -3,10 +3,10 @@ >>> from nltk.tokenize import * -Regression Tests: Treebank Tokenizer +Regression Tests: NLTKWordTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Some test strings. +Tokenizing some test strings. >>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88." >>> word_tokenize(s1) @@ -41,12 +41,35 @@ Some test strings. >>> s11 = "It's more'n enough." >>> word_tokenize(s11) ['It', "'s", 'more', "'n", 'enough', '.'] - >>> s12 = "''Hello, there!''" - >>> word_tokenize(s12) - ["''", 'Hello', ',', 'there', '!', "''"] - >>> s13 = "''What a wonderful quote, this is'' - Someone (12 BC)" - >>> word_tokenize(s13) - ["''", 'What', 'a', 'wonderful', 'quote', ',', 'this', 'is', "''", '-', 'Someone', '(', '12', 'BC', ')'] + +Gathering the spans of the tokenized strings. + + >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' + >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), + ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), + ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), + ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)] + >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected + True + >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in', + ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')', + ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.'] + >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected + True + + >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."''' + >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12), + ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36), + ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62), + ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82), + ... (82, 83), (83, 84)] + >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected + True + >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to', + ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost', + ... 
'$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"'] + >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected + True Testing improvement made to the TreebankWordTokenizer @@ -58,9 +81,6 @@ Testing improvement made to the TreebankWordTokenizer >>> expected = ['The', 'unicode', '201C', 'and', '201D', '\u201c', 'LEFT', '(', 'RIGHT', ')', 'DOUBLE', 'QUOTATION', 'MARK', '\u201d', 'is', 'also', 'OPEN_PUNCT', 'and', 'CLOSE_PUNCT', '.'] >>> word_tokenize(sx2) == expected True - >>> sx3 = "''Hello'\"" - >>> list(TreebankWordTokenizer().span_tokenize(sx3)) - [(0, 2), (2, 7), (7, 8), (8, 9)] Testing treebank's detokenizer @@ -76,9 +96,6 @@ Testing treebank's detokenizer >>> s = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't." >>> detokenizer.detokenize(word_tokenize(s)) 'Well, we couldn\'t have this predictable, cliche-ridden, "Touched by an Angel" (a show creator John Masius worked on) wanna-be if she didn\'t.' - >>> s = "I wanna watch something" - >>> detokenizer.detokenize(word_tokenize(s)) - 'I wanna watch something' >>> s = "I cannot cannot work under these conditions!" >>> detokenizer.detokenize(word_tokenize(s)) 'I cannot cannot work under these conditions!' diff --git a/nltk/tokenize/destructive.py b/nltk/tokenize/destructive.py index 4e44de34c5..6cd846c580 100644 --- a/nltk/tokenize/destructive.py +++ b/nltk/tokenize/destructive.py @@ -9,6 +9,7 @@ import re from nltk.tokenize.api import TokenizerI +from nltk.tokenize.util import align_tokens class MacIntyreContractions: @@ -146,3 +147,55 @@ def tokenize(self, text, convert_parentheses=False, return_str=False): # text = regexp.sub(r' \1 \2 \3 ', text) return text if return_str else text.split() + + def span_tokenize(self, text): + r""" + Uses the post-hoc nltk.tokens.align_tokens to return the offset spans. + >>> from nltk.tokenize import NLTKWordTokenizer + >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' + >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), + ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), + ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), + ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)] + >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected + True + >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in', + ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')', + ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.'] + >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected + True + Additional example + >>> from nltk.tokenize import NLTKWordTokenizer + >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."''' + >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12), + ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36), + ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62), + ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82), + ... (82, 83), (83, 84)] + >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected + True + >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to', + ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost', + ... 
'$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"'] + >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected + True + """ + raw_tokens = self.tokenize(text) + + # Convert converted quotes back to original double quotes + # Do this only if original text contains double quote(s) or double + # single-quotes (because '' might be transformed to `` if it is + # treated as starting quotes). + if ('"' in text) or ("''" in text): + # Find double quotes and converted quotes + matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)] + + # Replace converted quotes back to double quotes + tokens = [ + matched.pop(0) if tok in ['"', "``", "''"] else tok + for tok in raw_tokens + ] + else: + tokens = raw_tokens + + yield from align_tokens(tokens, text) From 0e4ad310a4460f69c7af4e75babc0aeb03887777 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 4 Nov 2021 21:40:24 +0100 Subject: [PATCH 7/7] Added credits for modifications --- nltk/tokenize/destructive.py | 3 ++- nltk/tokenize/treebank.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/nltk/tokenize/destructive.py b/nltk/tokenize/destructive.py index 6cd846c580..32eb64fd8f 100644 --- a/nltk/tokenize/destructive.py +++ b/nltk/tokenize/destructive.py @@ -1,7 +1,8 @@ # Natural Language Toolkit: NLTK's very own tokenizer. # # Copyright (C) 2001-2021 NLTK Project -# Author: +# Author: Liling Tan +# Tom Aarsen <> (modifications) # URL: # For license information, see LICENSE.TXT diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py index 47ffa19a8d..51c2020130 100644 --- a/nltk/tokenize/treebank.py +++ b/nltk/tokenize/treebank.py @@ -3,6 +3,7 @@ # Copyright (C) 2001-2021 NLTK Project # Author: Edward Loper # Michael Heilman (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed) +# Tom Aarsen <> (modifications) # # URL: # For license information, see LICENSE.TXT
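
Quick sanity check (a minimal usage sketch, assuming the seven patches above are applied to an NLTK checkout): the combined effect of the quote fix, the 'wanna' fix, the new TreebankWordDetokenizer export, and the new NLTKWordTokenizer.span_tokenize can be exercised interactively. Expected outputs below mirror the doctests added in patches 2, 3 and 6.

    >>> from nltk.tokenize import word_tokenize, NLTKWordTokenizer, TreebankWordDetokenizer

    Double single-quotes are now split off as standalone '' tokens:

    >>> word_tokenize("''Hello, there!''")
    ["''", 'Hello', ',', 'there', '!', "''"]

    'wanna' no longer absorbs the following space, so detokenization round-trips:

    >>> TreebankWordDetokenizer().detokenize(word_tokenize("I wanna watch something"))
    'I wanna watch something'

    NLTKWordTokenizer now exposes span_tokenize, mirroring TreebankWordTokenizer:

    >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
    >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)]
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')', 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']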