From 33105b20e520bc399067ff0df8d1fc2b18d36027 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Wed, 27 Oct 2021 12:14:19 +0200
Subject: [PATCH 1/3] Resolved serious ReDOS in PunktSentenceTokenizer

---
 nltk/tokenize/punkt.py | 78 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 72 insertions(+), 6 deletions(-)

diff --git a/nltk/tokenize/punkt.py b/nltk/tokenize/punkt.py
index a08ff4c903..88ca73f99c 100644
--- a/nltk/tokenize/punkt.py
+++ b/nltk/tokenize/punkt.py
@@ -189,7 +189,7 @@ class PunktLanguageVars:
     constructors.
     """
 
-    __slots__ = ("_re_period_context", "_re_word_tokenizer")
+    __slots__ = ("_re_period_context", "_re_word_tokenizer", "_re_last_word")
 
     def __getstate__(self):
         # All modifications to the class are performed by inheritance.
@@ -266,7 +266,6 @@ def word_tokenize(self, s):
         return self._word_tokenizer_re().findall(s)
 
     _period_context_fmt = r"""
-        \S*                          # some word material
        %(SentEndChars)s             # a potential sentence ending
        (?=(?P<after_tok>
            %(NonWord)s              # either other punctuation
@@ -293,6 +292,16 @@ def period_context_re(self):
            )
            return self._re_period_context
 
+    def last_word_re(self):
+        """Compiles and returns a regular expression to return the sequence
+        of non-whitespace characters in a string starting from the right.
+        It returns an empty match if the text ends with whitespace."""
+        try:
+            return self._re_last_word
+        except:
+            self._re_last_word = re.compile(r"\S*$")
+            return self._re_last_word
+
 
 _re_non_punct = re.compile(r"[^\W\d]", re.UNICODE)
 """Matches token types that are not merely punctuation. (Types for
@@ -1284,8 +1293,7 @@ def debug_decisions(self, text):
 
         See format_debug_decision() to help make this output readable.
         """
-        for match in self._lang_vars.period_context_re().finditer(text):
-            decision_text = match.group() + match.group("after_tok")
+        for match, decision_text in self._match_potential_end_contexts(text):
             tokens = self._tokenize_words(decision_text)
             tokens = list(self._annotate_first_pass(tokens))
             while tokens and not tokens[0].tok.endswith(self._lang_vars.sent_end_chars):
@@ -1333,10 +1341,68 @@ def sentences_from_text(self, text, realign_boundaries=True):
         """
         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
 
+    def _match_potential_end_contexts(self, text):
+        """
+        Given a text, find the matches of potential sentence breaks,
+        alongside the contexts surrounding these sentence breaks.
+
+        Since the fix for the ReDOS discovered in issue #2866, we no longer match
+        the word before a potential end of sentence token. Instead, we use a separate
+        regex for this. As a consequence, `finditer`'s desire to find non-overlapping
+        matches no longer aids us in finding the single longest match.
+        Where previously, we could use::
+
+            >>> pst = PunktSentenceTokenizer()
+            >>> text = "Very bad acting!!! I promise."
+            >>> list(pst._lang_vars.period_context_re().finditer(text)) # doctest: +SKIP
+            [<re.Match object; span=(9, 18), match='acting!!!'>]
+
+        Now we have to find the word before (i.e. 'acting') separately, and `finditer`
+        returns::
+
+            >>> pst = PunktSentenceTokenizer()
+            >>> text = "Very bad acting!!! I promise."
+            >>> list(pst._lang_vars.period_context_re().finditer(text)) # doctest: +NORMALIZE_WHITESPACE
+            [<re.Match object; span=(15, 16), match='!'>,
+             <re.Match object; span=(16, 17), match='!'>,
+             <re.Match object; span=(17, 18), match='!'>]
+
+        So, we need to find the word before the match from right to left, and then manually remove
+        the overlaps. That is what this method does::
+
+            >>> pst = PunktSentenceTokenizer()
+            >>> text = "Very bad acting!!! I promise."
+            >>> pst._match_potential_end_contexts(text)
+            [(<re.Match object; span=(17, 18), match='!'>, 'acting!!! I')]
+
+        :param text: String of one or more sentences
+        :type text: str
+        :return: List of match-context tuples.
+        :rtype: List[Tuple[re.Match, str]]
+        """
+        before_words = {}
+        matches = []
+        for match in reversed(list(self._lang_vars.period_context_re().finditer(text))):
+            # Ignore matches that have already been captured by matches to the right of this match
+            if matches and match.end() > before_words[matches[-1]].start():
+                continue
+            # Find the word before the current match
+            before_words[match] = self._lang_vars.last_word_re().search(
+                text[: match.start()]
+            )
+            matches.append(match)
+
+        return [
+            (
+                match,
+                before_words[match].group() + match.group() + match.group("after_tok"),
+            )
+            for match in matches[::-1]
+        ]
+
     def _slices_from_text(self, text):
         last_break = 0
-        for match in self._lang_vars.period_context_re().finditer(text):
-            context = match.group() + match.group("after_tok")
+        for match, context in self._match_potential_end_contexts(text):
            if self.text_contains_sentbreak(context):
                yield slice(last_break, match.end())
                if match.group("next_tok"):
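
The core of the fix above is dropping the leading `\S*` from `_period_context_fmt` and recovering the word before the break in a second step. A rough, self-contained sketch of why that leading `\S*` was costly (an illustration only, with simplified stand-in patterns, not the exact input reported in issue #2866)::

    import re
    import time

    # Simplified stand-ins for the period-context pattern, before and after the patch.
    old_style = re.compile(r"\S*[.!?](?=\s+\S+)")   # word material folded into the match
    new_style = re.compile(r"[.!?](?=\s+\S+)")      # sentence-ending character only

    text = "a" * 10_000  # one long token with no sentence break at all

    for name, pattern in (("old", old_style), ("new", new_style)):
        start = time.perf_counter()
        pattern.findall(text)
        print(name, f"{time.perf_counter() - start:.3f}s")

With the old-style pattern, every position inside the long token is a candidate start for `\S*`, so scanning is quadratic in the token length; the new-style pattern only has to look for the sentence-ending character itself, and the preceding word is found separately via `last_word_re()`.
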
From d6874e719c21cb09b5d2ae9620f3622e2a5f1f80 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Wed, 27 Oct 2021 13:25:22 +0200
Subject: [PATCH 2/3] Improve performance by relying on string split instead of re.search

---
 nltk/tokenize/punkt.py | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/nltk/tokenize/punkt.py b/nltk/tokenize/punkt.py
index 88ca73f99c..58d21f0c3c 100644
--- a/nltk/tokenize/punkt.py
+++ b/nltk/tokenize/punkt.py
@@ -189,7 +189,7 @@ class PunktLanguageVars:
     constructors.
     """
 
-    __slots__ = ("_re_period_context", "_re_word_tokenizer", "_re_last_word")
+    __slots__ = ("_re_period_context", "_re_word_tokenizer")
 
     def __getstate__(self):
         # All modifications to the class are performed by inheritance.
@@ -292,16 +292,6 @@ def period_context_re(self):
            )
            return self._re_period_context
 
-    def last_word_re(self):
-        """Compiles and returns a regular expression to return the sequence
-        of non-whitespace characters in a string starting from the right.
-        It returns an empty match if the text ends with whitespace."""
-        try:
-            return self._re_last_word
-        except:
-            self._re_last_word = re.compile(r"\S*$")
-            return self._re_last_word
-
 
 _re_non_punct = re.compile(r"[^\W\d]", re.UNICODE)
 """Matches token types that are not merely punctuation. (Types for
@@ -1384,18 +1374,18 @@ def _match_potential_end_contexts(self, text):
         matches = []
         for match in reversed(list(self._lang_vars.period_context_re().finditer(text))):
             # Ignore matches that have already been captured by matches to the right of this match
-            if matches and match.end() > before_words[matches[-1]].start():
+            if matches and match.end() > before_start:
                 continue
             # Find the word before the current match
-            before_words[match] = self._lang_vars.last_word_re().search(
-                text[: match.start()]
-            )
+            all_before, before_word = text[: match.start()].rsplit(maxsplit=1)
+            before_start = len(all_before)
+            before_words[match] = before_word
             matches.append(match)
 
         return [
             (
                 match,
-                before_words[match].group() + match.group() + match.group("after_tok"),
+                before_words[match] + match.group() + match.group("after_tok"),
             )
             for match in matches[::-1]
         ]
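
This second patch swaps the `last_word_re()` regex search for `str.rsplit(maxsplit=1)`. A small standalone sketch (not part of the diff) of the correspondence the new code relies on, using the prefix that precedes the last `!` in the docstring example::

    import re

    prefix = "Very bad acting!!"  # i.e. text[: match.start()] for the final '!'

    # What the removed last_word_re() pattern found:
    regex_word = re.search(r"\S*$", prefix).group()       # 'acting!!'

    # What the new rsplit-based code computes:
    all_before, before_word = prefix.rsplit(maxsplit=1)   # rsplit gives ['Very bad', 'acting!!']
    before_start = len(all_before)                        # 8, used for the overlap check

    assert regex_word == before_word

The split avoids running a second regular expression search for every candidate sentence break, which is the performance concern named in this commit's subject.
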
From 589930aa2931331b2cfbc1c27cf218458c49912e Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Wed, 27 Oct 2021 15:14:47 +0200
Subject: [PATCH 3/3] Solved issue if sentence contains just one token

---
 nltk/tokenize/punkt.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nltk/tokenize/punkt.py b/nltk/tokenize/punkt.py
index 58d21f0c3c..54937b9ecd 100644
--- a/nltk/tokenize/punkt.py
+++ b/nltk/tokenize/punkt.py
@@ -1377,9 +1377,9 @@ def _match_potential_end_contexts(self, text):
             if matches and match.end() > before_start:
                 continue
             # Find the word before the current match
-            all_before, before_word = text[: match.start()].rsplit(maxsplit=1)
-            before_start = len(all_before)
-            before_words[match] = before_word
+            split = text[: match.start()].rsplit(maxsplit=1)
+            before_start = len(split[0]) if len(split) == 2 else 0
+            before_words[match] = split[-1]
             matches.append(match)
 
         return [
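
The final patch guards against candidate breaks that sit in the very first token of the text, where there is no whitespace to the left to split on. A standalone sketch (not part of the diff) of the failure mode and the guarded form::

    text = "Wow! That was quick."
    prefix = text[: text.index("!")]      # 'Wow' -- no whitespace to split on

    # The two-name unpacking from the previous commit raises here:
    #   all_before, before_word = prefix.rsplit(maxsplit=1)   # ValueError

    # The guarded form from this commit:
    split = prefix.rsplit(maxsplit=1)     # ['Wow'], a single element
    before_start = len(split[0]) if len(split) == 2 else 0
    before_word = split[-1]
    print(before_start, before_word)      # 0 Wow
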