Deprecate 'return_str' parameter in NLTKWordTokenizer and TreebankWordTokenizer (#2883)

* Deprecated 'return_str' parameter in NLTKWordTokenizer and TreebankWordTokenizer

* Simplified if-statement

as suggested by 12mohaned

Co-authored-by: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
adamjhawley and tomaarsen committed Nov 18, 2021
1 parent 7fb092a commit e629d7e
Showing 3 changed files with 23 additions and 2 deletions.
1 change: 1 addition & 0 deletions AUTHORS.md
@@ -285,6 +285,7 @@
 - Saibo Geng <https://github.com/Saibo-creator>
 - Ahmet Yildirim <https://github.com/RnDevelover>
 - Yuta Nakamura <https://github.com/yutanakamura-tky>
+- Adam Hawley <https://github.com/adamjhawley>

 ## Others whose work we've taken and included in NLTK, but who didn't directly contribute it:

12 changes: 11 additions & 1 deletion nltk/tokenize/destructive.py
@@ -8,6 +8,7 @@


 import re
+import warnings

 from nltk.tokenize.api import TokenizerI
 from nltk.tokenize.util import align_tokens
@@ -113,6 +114,15 @@ class NLTKWordTokenizer(TokenizerI):
     CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

     def tokenize(self, text, convert_parentheses=False, return_str=False):
+
+        if return_str:
+            warnings.warn(
+                "Parameter 'return_str' has been deprecated and should no "
+                "longer be used.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+
         for regexp, substitution in self.STARTING_QUOTES:
             text = regexp.sub(substitution, text)

@@ -147,7 +157,7 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):
         # for regexp in self._contractions.CONTRACTIONS4:
         # text = regexp.sub(r' \1 \2 \3 ', text)

-        return text if return_str else text.split()
+        return text.split()

     def span_tokenize(self, text):
         r"""
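For readers who want to see the effect of this change from the caller's side, here is a minimal sketch (not part of the commit; the sample sentence and the warnings-capturing boilerplate are illustrative) of how the deprecation surfaces when using NLTKWordTokenizer:

import warnings

from nltk.tokenize.destructive import NLTKWordTokenizer

tokenizer = NLTKWordTokenizer()

# Unchanged behaviour: a plain call still returns a list of tokens.
tokens = tokenizer.tokenize("Good muffins cost $3.88 in New York.")

# Passing the deprecated flag now emits a DeprecationWarning; stacklevel=2
# attributes the warning to this call site rather than to nltk internals.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    tokenizer.tokenize("Good muffins cost $3.88 in New York.", return_str=True)

assert caught and caught[0].category is DeprecationWarning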
12 changes: 11 additions & 1 deletion nltk/tokenize/treebank.py
@@ -18,6 +18,7 @@
 """

 import re
+import warnings

 from nltk.tokenize.api import TokenizerI
 from nltk.tokenize.destructive import MacIntyreContractions
@@ -99,6 +100,15 @@ class TreebankWordTokenizer(TokenizerI):
     CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

     def tokenize(self, text, convert_parentheses=False, return_str=False):
+
+        if return_str is not False:
+            warnings.warn(
+                "Parameter 'return_str' has been deprecated and should no "
+                "longer be used.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+
         for regexp, substitution in self.STARTING_QUOTES:
             text = regexp.sub(substitution, text)

@@ -133,7 +143,7 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):
         # for regexp in self._contractions.CONTRACTIONS4:
         # text = regexp.sub(r' \1 \2 \3 ', text)

-        return text if return_str else text.split()
+        return text.split()

     def span_tokenize(self, text):
         r"""
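As a usage note, a small sketch of what the treebank change means for callers, assuming a build that includes this commit (the sample sentence is illustrative): the deprecated flag is still accepted, but tokenize() now always returns a list, so return_str=True no longer yields the processed string.

from nltk.tokenize.treebank import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

# Before this commit, return_str=True returned the processed text as one string.
# With it, the flag only triggers a DeprecationWarning and the result is always a list.
result = tokenizer.tokenize("They'll save and invest more.", return_str=True)
assert isinstance(result, list)

# Callers who still want a single string can join the tokens themselves.
joined = " ".join(result)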
