From 33105b20e520bc399067ff0df8d1fc2b18d36027 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Wed, 27 Oct 2021 12:14:19 +0200
Subject: [PATCH 1/3] Resolved serious ReDOS in PunktSentenceTokenizer

---
 nltk/tokenize/punkt.py | 78 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 72 insertions(+), 6 deletions(-)

diff --git a/nltk/tokenize/punkt.py b/nltk/tokenize/punkt.py
index a08ff4c903..88ca73f99c 100644
--- a/nltk/tokenize/punkt.py
+++ b/nltk/tokenize/punkt.py
@@ -189,7 +189,7 @@ class PunktLanguageVars:
     constructors.
     """
 
-    __slots__ = ("_re_period_context", "_re_word_tokenizer")
+    __slots__ = ("_re_period_context", "_re_word_tokenizer", "_re_last_word")
 
     def __getstate__(self):
         # All modifications to the class are performed by inheritance.
@@ -266,7 +266,6 @@ def word_tokenize(self, s):
         return self._word_tokenizer_re().findall(s)
 
     _period_context_fmt = r"""
-        \S*                          # some word material
        %(SentEndChars)s             # a potential sentence ending
        (?=(?P<after_tok>
            %(NonWord)s              # either other punctuation
@@ -293,6 +292,16 @@ def period_context_re(self):
            )
            return self._re_period_context
 
+    def last_word_re(self):
+        """Compiles and returns a regular expression to return the sequence
+        of non-whitespace characters in a string starting from the right.
+        It returns an empty match if the text ends with whitespace."""
+        try:
+            return self._re_last_word
+        except:
+            self._re_last_word = re.compile(r"\S*$")
+            return self._re_last_word
+
 
 _re_non_punct = re.compile(r"[^\W\d]", re.UNICODE)
 """Matches token types that are not merely punctuation. (Types for
@@ -1284,8 +1293,7 @@ def debug_decisions(self, text):
 
         See format_debug_decision() to help make this output readable.
         """
-        for match in self._lang_vars.period_context_re().finditer(text):
-            decision_text = match.group() + match.group("after_tok")
+        for match, decision_text in self._match_potential_end_contexts(text):
             tokens = self._tokenize_words(decision_text)
             tokens = list(self._annotate_first_pass(tokens))
             while tokens and not tokens[0].tok.endswith(self._lang_vars.sent_end_chars):
@@ -1333,10 +1341,68 @@ def sentences_from_text(self, text, realign_boundaries=True):
         """
         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
 
+    def _match_potential_end_contexts(self, text):
+        """
+        Given a text, find the matches of potential sentence breaks,
+        alongside the contexts surrounding these sentence breaks.
+
+        Since the fix for the ReDOS discovered in issue #2866, we no longer match
+        the word before a potential end of sentence token. Instead, we use a separate
+        regex for this. As a consequence, `finditer`'s desire to find non-overlapping
+        matches no longer aids us in finding the single longest match.
+        Where previously, we could use::
+
+            >>> pst = PunktSentenceTokenizer()
+            >>> text = "Very bad acting!!! I promise."
+            >>> list(pst._lang_vars.period_context_re().finditer(text)) # doctest: +SKIP
+            [<re.Match object; span=(9, 18), match='acting!!!'>]
+
+        Now we have to find the word before (i.e. 'acting') separately, and `finditer`
+        returns::
+
+            >>> pst = PunktSentenceTokenizer()
+            >>> text = "Very bad acting!!! I promise."
+            >>> list(pst._lang_vars.period_context_re().finditer(text)) # doctest: +NORMALIZE_WHITESPACE
+            [<re.Match object; span=(15, 16), match='!'>,
+             <re.Match object; span=(16, 17), match='!'>,
+             <re.Match object; span=(17, 18), match='!'>]
+
+        So, we need to find the word before the match from right to left, and then manually remove
+        the overlaps. That is what this method does::
+
+            >>> pst = PunktSentenceTokenizer()
+            >>> text = "Very bad acting!!! I promise."
+            >>> pst._match_potential_end_contexts(text)
+            [(<re.Match object; span=(17, 18), match='!'>, 'acting!!! I')]
+
+        :param text: String of one or more sentences
+        :type text: str
+        :return: List of match-context tuples.
+        :rtype: List[Tuple[re.Match, str]]
+        """
+        before_words = {}
+        matches = []
+        for match in reversed(list(self._lang_vars.period_context_re().finditer(text))):
+            # Ignore matches that have already been captured by matches to the right of this match
+            if matches and match.end() > before_words[matches[-1]].start():
+                continue
+            # Find the word before the current match
+            before_words[match] = self._lang_vars.last_word_re().search(
+                text[: match.start()]
+            )
+            matches.append(match)
+
+        return [
+            (
+                match,
+                before_words[match].group() + match.group() + match.group("after_tok"),
+            )
+            for match in matches[::-1]
+        ]
+
     def _slices_from_text(self, text):
         last_break = 0
-        for match in self._lang_vars.period_context_re().finditer(text):
-            context = match.group() + match.group("after_tok")
+        for match, context in self._match_potential_end_contexts(text):
            if self.text_contains_sentbreak(context):
                yield slice(last_break, match.end())
                if match.group("next_tok"):
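
The core of the fix above is dropping the leading `\S*` from `_period_context_fmt` and recovering the word before the break in a second step. A rough, self-contained sketch of why that leading `\S*` was costly (an illustration only, with simplified stand-in patterns, not the exact input reported in issue #2866)::

    import re
    import time

    # Simplified stand-ins for the period-context pattern, before and after the patch.
    old_style = re.compile(r"\S*[.!?](?=\s+\S+)")   # word material folded into the match
    new_style = re.compile(r"[.!?](?=\s+\S+)")      # sentence-ending character only

    text = "a" * 10_000  # one long token with no sentence break at all

    for name, pattern in (("old", old_style), ("new", new_style)):
        start = time.perf_counter()
        pattern.findall(text)
        print(name, f"{time.perf_counter() - start:.3f}s")

With the old-style pattern, every position inside the long token is a candidate start for `\S*`, so scanning is quadratic in the token length; the new-style pattern only has to look for the sentence-ending character itself, and the preceding word is found separately via `last_word_re()`.
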
From d6874e719c21cb09b5d2ae9620f3622e2a5f1f80 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Wed, 27 Oct 2021 13:25:22 +0200
Subject: [PATCH 2/3] Improve performance by relying on string split instead of re.search

---
 nltk/tokenize/punkt.py | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/nltk/tokenize/punkt.py b/nltk/tokenize/punkt.py
index 88ca73f99c..58d21f0c3c 100644
--- a/nltk/tokenize/punkt.py
+++ b/nltk/tokenize/punkt.py
@@ -189,7 +189,7 @@ class PunktLanguageVars:
     constructors.
     """
 
-    __slots__ = ("_re_period_context", "_re_word_tokenizer", "_re_last_word")
+    __slots__ = ("_re_period_context", "_re_word_tokenizer")
 
     def __getstate__(self):
         # All modifications to the class are performed by inheritance.
@@ -292,16 +292,6 @@ def period_context_re(self):
            )
            return self._re_period_context
 
-    def last_word_re(self):
-        """Compiles and returns a regular expression to return the sequence
-        of non-whitespace characters in a string starting from the right.
-        It returns an empty match if the text ends with whitespace."""
-        try:
-            return self._re_last_word
-        except:
-            self._re_last_word = re.compile(r"\S*$")
-            return self._re_last_word
-
 
 _re_non_punct = re.compile(r"[^\W\d]", re.UNICODE)
 """Matches token types that are not merely punctuation. (Types for
@@ -1384,18 +1374,18 @@ def _match_potential_end_contexts(self, text):
         matches = []
         for match in reversed(list(self._lang_vars.period_context_re().finditer(text))):
             # Ignore matches that have already been captured by matches to the right of this match
-            if matches and match.end() > before_words[matches[-1]].start():
+            if matches and match.end() > before_start:
                 continue
             # Find the word before the current match
-            before_words[match] = self._lang_vars.last_word_re().search(
-                text[: match.start()]
-            )
+            all_before, before_word = text[: match.start()].rsplit(maxsplit=1)
+            before_start = len(all_before)
+            before_words[match] = before_word
             matches.append(match)
 
         return [
             (
                 match,
-                before_words[match].group() + match.group() + match.group("after_tok"),
+                before_words[match] + match.group() + match.group("after_tok"),
             )
             for match in matches[::-1]
         ]
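
This second patch swaps the `last_word_re()` regex search for `str.rsplit(maxsplit=1)`. A small standalone sketch (not part of the diff) of the correspondence the new code relies on, using the prefix that precedes the last `!` in the docstring example::

    import re

    prefix = "Very bad acting!!"  # i.e. text[: match.start()] for the final '!'

    # What the removed last_word_re() pattern found:
    regex_word = re.search(r"\S*$", prefix).group()       # 'acting!!'

    # What the new rsplit-based code computes:
    all_before, before_word = prefix.rsplit(maxsplit=1)   # rsplit gives ['Very bad', 'acting!!']
    before_start = len(all_before)                        # 8, used for the overlap check

    assert regex_word == before_word

The split avoids running a second regular expression search for every candidate sentence break, which is the performance concern named in this commit's subject.
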
From 589930aa2931331b2cfbc1c27cf218458c49912e Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Wed, 27 Oct 2021 15:14:47 +0200
Subject: [PATCH 3/3] Solved issue if sentence contains just one token

---
 nltk/tokenize/punkt.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nltk/tokenize/punkt.py b/nltk/tokenize/punkt.py
index 58d21f0c3c..54937b9ecd 100644
--- a/nltk/tokenize/punkt.py
+++ b/nltk/tokenize/punkt.py
@@ -1377,9 +1377,9 @@ def _match_potential_end_contexts(self, text):
             if matches and match.end() > before_start:
                 continue
             # Find the word before the current match
-            all_before, before_word = text[: match.start()].rsplit(maxsplit=1)
-            before_start = len(all_before)
-            before_words[match] = before_word
+            split = text[: match.start()].rsplit(maxsplit=1)
+            before_start = len(split[0]) if len(split) == 2 else 0
+            before_words[match] = split[-1]
             matches.append(match)
 
         return [
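
The final patch guards against candidate breaks that sit in the very first token of the text, where there is no whitespace to the left to split on. A standalone sketch (not part of the diff) of the failure mode and the guarded form::

    text = "Wow! That was quick."
    prefix = text[: text.index("!")]      # 'Wow' -- no whitespace to split on

    # The two-name unpacking from the previous commit raises here:
    #   all_before, before_word = prefix.rsplit(maxsplit=1)   # ValueError

    # The guarded form from this commit:
    split = prefix.rsplit(maxsplit=1)     # ['Wow'], a single element
    before_start = len(split[0]) if len(split) == 2 else 0
    before_word = split[-1]
    print(before_start, before_word)      # 0 Wow
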