diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest
index ec980711b9..53e8ece914 100644
--- a/nltk/test/tokenize.doctest
+++ b/nltk/test/tokenize.doctest
@@ -310,6 +310,11 @@ Testing mutable default arguments for https://github.com/nltk/nltk/pull/2067
     >>> type(pst._lang_vars)
     <class 'nltk.tokenize.punkt.PunktLanguageVars'>
 
+Testing that inputs can start with dots.
+
+    >>> pst = PunktSentenceTokenizer(lang_vars=None)
+    >>> pst.tokenize(". This input starts with a dot. This used to cause issues.")
+    ['.', 'This input starts with a dot.', 'This used to cause issues.']
 
 Regression Tests: align_tokens
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/nltk/tokenize/punkt.py b/nltk/tokenize/punkt.py
index 54937b9ecd..19fdf31f63 100644
--- a/nltk/tokenize/punkt.py
+++ b/nltk/tokenize/punkt.py
@@ -1379,7 +1379,7 @@ def _match_potential_end_contexts(self, text):
             # Find the word before the current match
             split = text[: match.start()].rsplit(maxsplit=1)
             before_start = len(split[0]) if len(split) == 2 else 0
-            before_words[match] = split[-1]
+            before_words[match] = split[-1] if split else ""
             matches.append(match)
 
         return [
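
Context for the punkt.py change (not part of the diff): when the input starts
with a dot, the period-context match begins at index 0, so text[: match.start()]
is the empty string. Calling rsplit() on an empty string returns an empty list,
and the old split[-1] therefore raised an IndexError. Below is a minimal sketch
of the failure mode and the patched fallback, in plain Python; match_start = 0
is an assumption standing in for the real regex match object.

    # Simulate a period-context match at the very start of the input.
    text = ". This input starts with a dot."
    match_start = 0  # assumed position; the real code uses match.start()
    split = text[:match_start].rsplit(maxsplit=1)
    print(split)  # [] -- indexing split[-1] raised IndexError before the fix
    # The patched line falls back to an empty "word before the match":
    before_word = split[-1] if split else ""
    print(repr(before_word))  # ''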