Resolve IndexError in sent_tokenize (#2922)

* Prevent IndexError if the input starts with a sentence-ending character (e.g. ".")
* Add doctest for the Punkt sent_tokenize issue
nltk · Dec 21, 2021 · d4d99b4 · d4d99b4
1 parent dd1494e
commit d4d99b4
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 1 deletion.
diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest
@@ -310,6 +310,11 @@ Testing mutable default arguments for https://github.com/nltk/nltk/pull/2067
     >>> type(pst._lang_vars)
     <class 'nltk.tokenize.punkt.PunktLanguageVars'>
 
+Testing that inputs can start with dots.
+
+    >>> pst = PunktSentenceTokenizer(lang_vars=None)
+    >>> pst.tokenize(". This input starts with a dot. This used to cause issues.")
+    ['.', 'This input starts with a dot.', 'This used to cause issues.']
 
 Regression Tests: align_tokens
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/nltk/tokenize/punkt.py b/nltk/tokenize/punkt.py
@@ -1379,7 +1379,7 @@ def _match_potential_end_contexts(self, text):
             # Find the word before the current match
             split = text[: match.start()].rsplit(maxsplit=1)
             before_start = len(split[0]) if len(split) == 2 else 0
-            before_words[match] = split[-1]
+            before_words[match] = split[-1] if split else ""
             matches.append(match)
 
         return [