From 7f6ebf87b31ea99a152ccae02731e06420b378b8 Mon Sep 17 00:00:00 2001
From: adamjhawley
Date: Wed, 10 Nov 2021 21:01:50 +0200
Subject: [PATCH 1/2] Deprecated 'return_str' parameter in NLTKWordTokenizer
 and TreebankWordTokenizer

---
 AUTHORS.md                   |  1 +
 nltk/tokenize/destructive.py | 12 +++++++++++-
 nltk/tokenize/treebank.py    | 12 +++++++++++-
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/AUTHORS.md b/AUTHORS.md
index 5b7546d43c..04592e11cc 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -285,6 +285,7 @@
 - Saibo Geng
 - Ahmet Yildirim
 - Yuta Nakamura
+- Adam Hawley
 
 ## Others whose work we've taken and included in NLTK, but who didn't directly contribute it:
 
diff --git a/nltk/tokenize/destructive.py b/nltk/tokenize/destructive.py
index 32eb64fd8f..80e30d3e21 100644
--- a/nltk/tokenize/destructive.py
+++ b/nltk/tokenize/destructive.py
@@ -8,6 +8,7 @@
 
 import re
+import warnings
 
 from nltk.tokenize.api import TokenizerI
 from nltk.tokenize.util import align_tokens
@@ -113,6 +114,15 @@ class NLTKWordTokenizer(TokenizerI):
     CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
 
     def tokenize(self, text, convert_parentheses=False, return_str=False):
+
+        if return_str is not False:
+            warnings.warn(
+                "Parameter 'return_str' has been deprecated and should no "
+                "longer be used.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+
         for regexp, substitution in self.STARTING_QUOTES:
             text = regexp.sub(substitution, text)
 
@@ -147,7 +157,7 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):
         # for regexp in self._contractions.CONTRACTIONS4:
         #     text = regexp.sub(r' \1 \2 \3 ', text)
 
-        return text if return_str else text.split()
+        return text.split()
 
     def span_tokenize(self, text):
         r"""
diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py
index 51c2020130..571bea77d2 100644
--- a/nltk/tokenize/treebank.py
+++ b/nltk/tokenize/treebank.py
@@ -18,6 +18,7 @@
 """
 
 import re
+import warnings
 
 from nltk.tokenize.api import TokenizerI
 from nltk.tokenize.destructive import MacIntyreContractions
@@ -99,6 +100,15 @@ class TreebankWordTokenizer(TokenizerI):
     CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
 
     def tokenize(self, text, convert_parentheses=False, return_str=False):
+
+        if return_str is not False:
+            warnings.warn(
+                "Parameter 'return_str' has been deprecated and should no "
+                "longer be used.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+
         for regexp, substitution in self.STARTING_QUOTES:
             text = regexp.sub(substitution, text)
 
@@ -133,7 +143,7 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):
         # for regexp in self._contractions.CONTRACTIONS4:
         #     text = regexp.sub(r' \1 \2 \3 ', text)
 
-        return text if return_str else text.split()
+        return text.split()
 
     def span_tokenize(self, text):
         r"""

From ac092f83eb0aafc3a5a379ed5888c57bcf9eef4b Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Mon, 15 Nov 2021 12:29:24 +0100
Subject: [PATCH 2/2] Simplified if-statement as suggested by 12mohaned

---
 nltk/tokenize/destructive.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nltk/tokenize/destructive.py b/nltk/tokenize/destructive.py
index 80e30d3e21..f5d066f537 100644
--- a/nltk/tokenize/destructive.py
+++ b/nltk/tokenize/destructive.py
@@ -115,7 +115,7 @@ class NLTKWordTokenizer(TokenizerI):
 
     def tokenize(self, text, convert_parentheses=False, return_str=False):
 
-        if return_str is not False:
+        if return_str:
             warnings.warn(
                 "Parameter 'return_str' has been deprecated and should no "
                 "longer be used.",
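
For illustration only (not part of the patch): a minimal sketch of how the deprecated
parameter behaves once these commits are applied. It assumes NLTKWordTokenizer is
imported from nltk.tokenize.destructive as in the diff; the sample sentence and the
expected tokens in the comments are illustrative, not taken from the patch.

    import warnings

    from nltk.tokenize.destructive import NLTKWordTokenizer

    tokenizer = NLTKWordTokenizer()

    # Passing return_str now only triggers a DeprecationWarning; the return
    # value is always a list of tokens, never the intermediate string.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        tokens = tokenizer.tokenize(
            "Good muffins cost $3.88 in New York.", return_str=True
        )

    print(tokens)              # e.g. ['Good', 'muffins', 'cost', '$', '3.88', ...]
    print(caught[0].category)  # <class 'DeprecationWarning'>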