From 6b78e285efbf6da4f5ecd95a44fdf7264c244fd4 Mon Sep 17 00:00:00 2001 From: adamjhawley Date: Wed, 10 Nov 2021 21:01:50 +0200 Subject: [PATCH] Deprecated 'return_str' parameter in NLTKWordTokenizer and TreebankWordTokenizer --- AUTHORS.md | 1 + nltk/tokenize/destructive.py | 12 +++++++++++- nltk/tokenize/treebank.py | 12 +++++++++++- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/AUTHORS.md b/AUTHORS.md index 5b7546d43c..04592e11cc 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -285,6 +285,7 @@ - Saibo Geng - Ahmet Yildirim - Yuta Nakamura +- Adam Hawley ## Others whose work we've taken and included in NLTK, but who didn't directly contribute it: diff --git a/nltk/tokenize/destructive.py b/nltk/tokenize/destructive.py index 32eb64fd8f..0e780ab4e6 100644 --- a/nltk/tokenize/destructive.py +++ b/nltk/tokenize/destructive.py @@ -8,6 +8,7 @@ import re +import warnings from nltk.tokenize.api import TokenizerI from nltk.tokenize.util import align_tokens @@ -113,6 +114,15 @@ class NLTKWordTokenizer(TokenizerI): CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3)) def tokenize(self, text, convert_parentheses=False, return_str=False): + + if return_str is not False: + warnings.warn( + "Parameter 'return_str' has been deprecated and should no " + "longer be used.", + category=DeprecationWarning, + stacklevel=2, + ) + for regexp, substitution in self.STARTING_QUOTES: text = regexp.sub(substitution, text) @@ -147,7 +157,7 @@ def tokenize(self, text, convert_parentheses=False, return_str=False): # for regexp in self._contractions.CONTRACTIONS4: # text = regexp.sub(r' \1 \2 \3 ', text) - return text if return_str else text.split() + return text.split() def span_tokenize(self, text): r""" diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py index 51c2020130..9cd11c749c 100644 --- a/nltk/tokenize/treebank.py +++ b/nltk/tokenize/treebank.py @@ -18,6 +18,7 @@ """ import re +import warnings from nltk.tokenize.api import TokenizerI from 
nltk.tokenize.destructive import MacIntyreContractions @@ -99,6 +100,15 @@ class TreebankWordTokenizer(TokenizerI): CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3)) def tokenize(self, text, convert_parentheses=False, return_str=False): + + if return_str is not False: + warnings.warn( + "Parameter 'return_str' has been deprecated and should no " + "longer be used.", + category=DeprecationWarning, + stacklevel=2, + ) + for regexp, substitution in self.STARTING_QUOTES: text = regexp.sub(substitution, text) @@ -133,7 +143,7 @@ def tokenize(self, text, convert_parentheses=False, return_str=False): # for regexp in self._contractions.CONTRACTIONS4: # text = regexp.sub(r' \1 \2 \3 ', text) - return text if return_str else text.split() + return text.split() def span_tokenize(self, text): r"""