diff --git a/AUTHORS.md b/AUTHORS.md
index 5b7546d43c..04592e11cc 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -285,6 +285,7 @@
 - Saibo Geng
 - Ahmet Yildirim
 - Yuta Nakamura
+- Adam Hawley
 
 ## Others whose work we've taken and included in NLTK, but who didn't directly contribute it:
 
diff --git a/nltk/tokenize/destructive.py b/nltk/tokenize/destructive.py
index 32eb64fd8f..f5d066f537 100644
--- a/nltk/tokenize/destructive.py
+++ b/nltk/tokenize/destructive.py
@@ -8,6 +8,7 @@
 
 import re
+import warnings
 
 from nltk.tokenize.api import TokenizerI
 from nltk.tokenize.util import align_tokens
 
@@ -113,6 +114,15 @@ class NLTKWordTokenizer(TokenizerI):
     CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
 
     def tokenize(self, text, convert_parentheses=False, return_str=False):
+
+        if return_str:
+            warnings.warn(
+                "Parameter 'return_str' has been deprecated and should no "
+                "longer be used.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+
         for regexp, substitution in self.STARTING_QUOTES:
             text = regexp.sub(substitution, text)
 
@@ -147,7 +157,7 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):
         #     for regexp in self._contractions.CONTRACTIONS4:
         #         text = regexp.sub(r' \1 \2 \3 ', text)
 
-        return text if return_str else text.split()
+        return text.split()
 
     def span_tokenize(self, text):
         r"""
diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py
index 51c2020130..571bea77d2 100644
--- a/nltk/tokenize/treebank.py
+++ b/nltk/tokenize/treebank.py
@@ -18,6 +18,7 @@
 """
 
 import re
+import warnings
 
 from nltk.tokenize.api import TokenizerI
 from nltk.tokenize.destructive import MacIntyreContractions
 
@@ -99,6 +100,15 @@ class TreebankWordTokenizer(TokenizerI):
     CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
 
     def tokenize(self, text, convert_parentheses=False, return_str=False):
+
+        if return_str is not False:
+            warnings.warn(
+                "Parameter 'return_str' has been deprecated and should no "
+                "longer be used.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+
         for regexp, substitution in self.STARTING_QUOTES:
             text = regexp.sub(substitution, text)
 
@@ -133,7 +143,7 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):
         #     for regexp in self._contractions.CONTRACTIONS4:
         #         text = regexp.sub(r' \1 \2 \3 ', text)
 
-        return text if return_str else text.split()
+        return text.split()
 
     def span_tokenize(self, text):
        r"""
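
Usage sketch (not part of the diff): with this change applied, `tokenize()` always returns a token list, and passing the deprecated `return_str` flag emits a `DeprecationWarning` instead of changing the return type. The snippet below assumes an NLTK install carrying this patch; the `warnings.catch_warnings` block is only there to make the warning visible.

```python
import warnings

from nltk.tokenize.destructive import NLTKWordTokenizer

tokenizer = NLTKWordTokenizer()

# Default path is unchanged: a list of tokens, no warning.
print(tokenizer.tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']

# The deprecated flag now triggers a DeprecationWarning; the return value
# is a token list either way, since the string-returning path was removed.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    tokens = tokenizer.tokenize("Hello, world.", return_str=True)

print(tokens)              # ['Hello', ',', 'world', '.']
print(caught[0].category)  # <class 'DeprecationWarning'>
```

The same behaviour applies to `TreebankWordTokenizer.tokenize` in `nltk/tokenize/treebank.py`, which receives the identical warning and unconditional `text.split()` return.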