From dfeb8efe26fe1850c52bf6dceed1f399695d6060 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 4 Nov 2021 18:58:05 +0100 Subject: [PATCH 1/7] Fixed issue with quote-tokenization, small regression for detokenization --- nltk/test/tokenize.doctest | 6 ++++++ nltk/tokenize/destructive.py | 2 +- nltk/tokenize/treebank.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest index a5e86d44e8..093f838db9 100644 --- a/nltk/test/tokenize.doctest +++ b/nltk/test/tokenize.doctest @@ -41,6 +41,12 @@ Some test strings. >>> s11 = "It's more'n enough." >>> word_tokenize(s11) ['It', "'s", 'more', "'n", 'enough', '.'] + >>> s12 = "''Hello, there!''" + >>> word_tokenize(s12) + ["''", "Hello", ",", "there", "!", "''"] + >>> s13 = "''What a wonderful quote, this is'' - Someone (12 BC)" + >>> word_tokenize(s13) + ["''", 'What', 'a', 'wonderful', 'quote', ',', 'this', 'is', "''", '-', 'Someone', '(', '12', 'BC', ')'] Testing improvement made to the TreebankWordTokenizer diff --git a/nltk/tokenize/destructive.py b/nltk/tokenize/destructive.py index a4daf0941f..7eac2cb0fe 100644 --- a/nltk/tokenize/destructive.py +++ b/nltk/tokenize/destructive.py @@ -53,8 +53,8 @@ class NLTKWordTokenizer(TokenizerI): # Ending quotes. ENDING_QUOTES = [ (re.compile("([»”’])", re.U), r" \1 "), + (re.compile(r"''"), " '' "), (re.compile(r'"'), " '' "), - (re.compile(r"(\S)(\'\')"), r"\1 \2 "), (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "), (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "), ] diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py index f5b90f5a0a..47ffa19a8d 100644 --- a/nltk/tokenize/treebank.py +++ b/nltk/tokenize/treebank.py @@ -86,8 +86,8 @@ class TreebankWordTokenizer(TokenizerI): # ending quotes ENDING_QUOTES = [ + (re.compile(r"''"), " '' "), (re.compile(r'"'), " '' "), - (re.compile(r"(\S)(\'\')"), r"\1 \2 "), (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "), (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "), ] From 9a8622f53daefd98a0182c0597f4aa3c67e01991 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 4 Nov 2021 19:35:54 +0100 Subject: [PATCH 2/7] Updated double-quote to single quote in doctest output --- nltk/test/tokenize.doctest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest index 093f838db9..8153090de0 100644 --- a/nltk/test/tokenize.doctest +++ b/nltk/test/tokenize.doctest @@ -43,7 +43,7 @@ Some test strings. 
['It', "'s", 'more', "'n", 'enough', '.'] >>> s12 = "''Hello, there!''" >>> word_tokenize(s12) - ["''", "Hello", ",", "there", "!", "''"] + ["''", 'Hello', ',', 'there', '!', "''"] >>> s13 = "''What a wonderful quote, this is'' - Someone (12 BC)" >>> word_tokenize(s13) ["''", 'What', 'a', 'wonderful', 'quote', ',', 'this', 'is', "''", '-', 'Someone', '(', '12', 'BC', ')'] From ef4739e3aa7436f20478034af5d4e8d9ecd6dee8 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 4 Nov 2021 19:36:55 +0100 Subject: [PATCH 3/7] Resolved issue with 'wanna' absorbing a space too much in (de)tokenization --- nltk/test/tokenize.doctest | 3 +++ nltk/tokenize/destructive.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest index 8153090de0..b9065f6278 100644 --- a/nltk/test/tokenize.doctest +++ b/nltk/test/tokenize.doctest @@ -73,6 +73,9 @@ Testing treebank's detokenizer >>> s = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't." >>> detokenizer.detokenize(word_tokenize(s)) 'Well, we couldn\'t have this predictable, cliche-ridden, "Touched by an Angel" (a show creator John Masius worked on) wanna-be if she didn\'t.' + >>> s = "I wanna watch something" + >>> detokenizer.detokenize(word_tokenize(s)) + 'I wanna watch something' >>> s = "I cannot cannot work under these conditions!" >>> detokenizer.detokenize(word_tokenize(s)) 'I cannot cannot work under these conditions!' diff --git a/nltk/tokenize/destructive.py b/nltk/tokenize/destructive.py index 7eac2cb0fe..4e44de34c5 100644 --- a/nltk/tokenize/destructive.py +++ b/nltk/tokenize/destructive.py @@ -24,7 +24,7 @@ class MacIntyreContractions: r"(?i)\b(got)(?#X)(ta)\b", r"(?i)\b(lem)(?#X)(me)\b", r"(?i)\b(more)(?#X)('n)\b", - r"(?i)\b(wan)(?#X)(na)\s", + r"(?i)\b(wan)(?#X)(na)(?=\s)", ] CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"] CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"] From 9766dde0698c900be380539ec4c8720536545dbd Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 4 Nov 2021 20:22:36 +0100 Subject: [PATCH 4/7] Allow importing TreebankWordDetokenizer from nltk.tokenize --- nltk/tokenize/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/tokenize/__init__.py b/nltk/tokenize/__init__.py index 9a82830e12..e79fc6a476 100644 --- a/nltk/tokenize/__init__.py +++ b/nltk/tokenize/__init__.py @@ -88,7 +88,7 @@ from nltk.tokenize.stanford_segmenter import StanfordSegmenter from nltk.tokenize.texttiling import TextTilingTokenizer from nltk.tokenize.toktok import ToktokTokenizer -from nltk.tokenize.treebank import TreebankWordTokenizer +from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize From 1b854decf1eae5a6f921ca521ca99329f7e2be12 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 4 Nov 2021 20:44:56 +0100 Subject: [PATCH 5/7] Added additional test for span_tokenize --- nltk/test/tokenize.doctest | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest index b9065f6278..f0869aea7e 100644 --- a/nltk/test/tokenize.doctest +++ b/nltk/test/tokenize.doctest @@ -58,6 +58,9 @@ Testing improvement made to the TreebankWordTokenizer >>> expected = ['The', 'unicode', '201C', 'and', '201D', '\u201c', 'LEFT', '(', 'RIGHT', ')', 'DOUBLE', 'QUOTATION', 'MARK', '\u201d', 'is', 
'also', 'OPEN_PUNCT', 'and', 'CLOSE_PUNCT', '.'] >>> word_tokenize(sx2) == expected True + >>> sx3 = "''Hello'\"" + >>> list(TreebankWordTokenizer().span_tokenize(sx3)) + [(0, 2), (2, 7), (7, 8), (8, 9)] Testing treebank's detokenizer From 5c42d8357e0bb890077c7a04b51bdbd4a654ec52 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 4 Nov 2021 21:15:43 +0100 Subject: [PATCH 6/7] Add span_tokenize to NLTKWordTokenizer, like in TreebankWordTokenizer --- nltk/test/tokenize.doctest | 45 ++++++++++++++++++++---------- nltk/tokenize/destructive.py | 53 ++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 14 deletions(-) diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest index f0869aea7e..0e3a0749e7 100644 --- a/nltk/test/tokenize.doctest +++ b/nltk/test/tokenize.doctest @@ -3,10 +3,10 @@ >>> from nltk.tokenize import * -Regression Tests: Treebank Tokenizer +Regression Tests: NLTKWordTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Some test strings. +Tokenizing some test strings. >>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88." >>> word_tokenize(s1) @@ -41,12 +41,35 @@ Some test strings. >>> s11 = "It's more'n enough." >>> word_tokenize(s11) ['It', "'s", 'more', "'n", 'enough', '.'] - >>> s12 = "''Hello, there!''" - >>> word_tokenize(s12) - ["''", 'Hello', ',', 'there', '!', "''"] - >>> s13 = "''What a wonderful quote, this is'' - Someone (12 BC)" - >>> word_tokenize(s13) - ["''", 'What', 'a', 'wonderful', 'quote', ',', 'this', 'is', "''", '-', 'Someone', '(', '12', 'BC', ')'] + +Gathering the spans of the tokenized strings. + + >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' + >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), + ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), + ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), + ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)] + >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected + True + >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in', + ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')', + ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.'] + >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected + True + + >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."''' + >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12), + ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36), + ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62), + ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82), + ... (82, 83), (83, 84)] + >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected + True + >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to', + ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost', + ... 
'$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"'] + >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected + True Testing improvement made to the TreebankWordTokenizer @@ -58,9 +81,6 @@ Testing improvement made to the TreebankWordTokenizer >>> expected = ['The', 'unicode', '201C', 'and', '201D', '\u201c', 'LEFT', '(', 'RIGHT', ')', 'DOUBLE', 'QUOTATION', 'MARK', '\u201d', 'is', 'also', 'OPEN_PUNCT', 'and', 'CLOSE_PUNCT', '.'] >>> word_tokenize(sx2) == expected True - >>> sx3 = "''Hello'\"" - >>> list(TreebankWordTokenizer().span_tokenize(sx3)) - [(0, 2), (2, 7), (7, 8), (8, 9)] Testing treebank's detokenizer @@ -76,9 +96,6 @@ Testing treebank's detokenizer >>> s = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't." >>> detokenizer.detokenize(word_tokenize(s)) 'Well, we couldn\'t have this predictable, cliche-ridden, "Touched by an Angel" (a show creator John Masius worked on) wanna-be if she didn\'t.' - >>> s = "I wanna watch something" - >>> detokenizer.detokenize(word_tokenize(s)) - 'I wanna watch something' >>> s = "I cannot cannot work under these conditions!" >>> detokenizer.detokenize(word_tokenize(s)) 'I cannot cannot work under these conditions!' diff --git a/nltk/tokenize/destructive.py b/nltk/tokenize/destructive.py index 4e44de34c5..6cd846c580 100644 --- a/nltk/tokenize/destructive.py +++ b/nltk/tokenize/destructive.py @@ -9,6 +9,7 @@ import re from nltk.tokenize.api import TokenizerI +from nltk.tokenize.util import align_tokens class MacIntyreContractions: @@ -146,3 +147,55 @@ def tokenize(self, text, convert_parentheses=False, return_str=False): # text = regexp.sub(r' \1 \2 \3 ', text) return text if return_str else text.split() + + def span_tokenize(self, text): + r""" + Uses the post-hoc nltk.tokens.align_tokens to return the offset spans. + >>> from nltk.tokenize import NLTKWordTokenizer + >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' + >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), + ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), + ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), + ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)] + >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected + True + >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in', + ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')', + ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.'] + >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected + True + Additional example + >>> from nltk.tokenize import NLTKWordTokenizer + >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."''' + >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12), + ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36), + ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62), + ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82), + ... (82, 83), (83, 84)] + >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected + True + >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to', + ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost', + ... 
'$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"'] + >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected + True + """ + raw_tokens = self.tokenize(text) + + # Convert converted quotes back to original double quotes + # Do this only if original text contains double quote(s) or double + # single-quotes (because '' might be transformed to `` if it is + # treated as starting quotes). + if ('"' in text) or ("''" in text): + # Find double quotes and converted quotes + matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)] + + # Replace converted quotes back to double quotes + tokens = [ + matched.pop(0) if tok in ['"', "``", "''"] else tok + for tok in raw_tokens + ] + else: + tokens = raw_tokens + + yield from align_tokens(tokens, text) From 0e4ad310a4460f69c7af4e75babc0aeb03887777 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 4 Nov 2021 21:40:24 +0100 Subject: [PATCH 7/7] Added credits for modifications --- nltk/tokenize/destructive.py | 3 ++- nltk/tokenize/treebank.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/nltk/tokenize/destructive.py b/nltk/tokenize/destructive.py index 6cd846c580..32eb64fd8f 100644 --- a/nltk/tokenize/destructive.py +++ b/nltk/tokenize/destructive.py @@ -1,7 +1,8 @@ # Natural Language Toolkit: NLTK's very own tokenizer. # # Copyright (C) 2001-2021 NLTK Project -# Author: +# Author: Liling Tan +# Tom Aarsen <> (modifications) # URL: # For license information, see LICENSE.TXT diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py index 47ffa19a8d..51c2020130 100644 --- a/nltk/tokenize/treebank.py +++ b/nltk/tokenize/treebank.py @@ -3,6 +3,7 @@ # Copyright (C) 2001-2021 NLTK Project # Author: Edward Loper # Michael Heilman (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed) +# Tom Aarsen <> (modifications) # # URL: # For license information, see LICENSE.TXT
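
Quick sanity check (a minimal usage sketch, assuming the seven patches above are applied to an NLTK checkout): the combined effect of the quote fix, the 'wanna' fix, the new TreebankWordDetokenizer export, and the new NLTKWordTokenizer.span_tokenize can be exercised interactively. Expected outputs below mirror the doctests added in patches 2, 3 and 6.

    >>> from nltk.tokenize import word_tokenize, NLTKWordTokenizer, TreebankWordDetokenizer

    Double single-quotes are now split off as standalone '' tokens:

    >>> word_tokenize("''Hello, there!''")
    ["''", 'Hello', ',', 'there', '!', "''"]

    'wanna' no longer absorbs the following space, so detokenization round-trips:

    >>> TreebankWordDetokenizer().detokenize(word_tokenize("I wanna watch something"))
    'I wanna watch something'

    NLTKWordTokenizer now exposes span_tokenize, mirroring TreebankWordTokenizer:

    >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
    >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)]
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')', 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']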