Deprecate 'return_str' parameter in NLTKWordTokenizer and TreebankWordTokenizer (#2883)

* Deprecated 'return_str' parameter in NLTKWordTokenizer and TreebankWordTokenizer

* Simplified if-statement

as suggested by 12mohaned

Co-authored-by: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
adamjhawley and tomaarsen committed Nov 18, 2021
1 parent 7fb092a commit e629d7e
Showing 3 changed files with 23 additions and 2 deletions.
1 change: 1 addition & 0 deletions AUTHORS.md
@@ -285,6 +285,7 @@
 - Saibo Geng <https://github.com/Saibo-creator>
 - Ahmet Yildirim <https://github.com/RnDevelover>
 - Yuta Nakamura <https://github.com/yutanakamura-tky>
+- Adam Hawley <https://github.com/adamjhawley>

 ## Others whose work we've taken and included in NLTK, but who didn't directly contribute it:

12 changes: 11 additions & 1 deletion nltk/tokenize/destructive.py
@@ -8,6 +8,7 @@


 import re
+import warnings

 from nltk.tokenize.api import TokenizerI
 from nltk.tokenize.util import align_tokens
@@ -113,6 +114,15 @@ class NLTKWordTokenizer(TokenizerI):
     CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

     def tokenize(self, text, convert_parentheses=False, return_str=False):
+
+        if return_str:
+            warnings.warn(
+                "Parameter 'return_str' has been deprecated and should no "
+                "longer be used.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+
         for regexp, substitution in self.STARTING_QUOTES:
             text = regexp.sub(substitution, text)

@@ -147,7 +157,7 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):
         # for regexp in self._contractions.CONTRACTIONS4:
         # text = regexp.sub(r' \1 \2 \3 ', text)

-        return text if return_str else text.split()
+        return text.split()

     def span_tokenize(self, text):
         r"""
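For readers who want to see the effect of this change from the caller's side, here is a minimal sketch (not part of the commit; the sample sentence and the warnings-capturing boilerplate are illustrative) of how the deprecation surfaces when using NLTKWordTokenizer:

import warnings

from nltk.tokenize.destructive import NLTKWordTokenizer

tokenizer = NLTKWordTokenizer()

# Unchanged behaviour: a plain call still returns a list of tokens.
tokens = tokenizer.tokenize("Good muffins cost $3.88 in New York.")

# Passing the deprecated flag now emits a DeprecationWarning; stacklevel=2
# attributes the warning to this call site rather than to nltk internals.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    tokenizer.tokenize("Good muffins cost $3.88 in New York.", return_str=True)

assert caught and caught[0].category is DeprecationWarning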
12 changes: 11 additions & 1 deletion nltk/tokenize/treebank.py
@@ -18,6 +18,7 @@
 """

 import re
+import warnings

 from nltk.tokenize.api import TokenizerI
 from nltk.tokenize.destructive import MacIntyreContractions
@@ -99,6 +100,15 @@ class TreebankWordTokenizer(TokenizerI):
     CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

     def tokenize(self, text, convert_parentheses=False, return_str=False):
+
+        if return_str is not False:
+            warnings.warn(
+                "Parameter 'return_str' has been deprecated and should no "
+                "longer be used.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+
         for regexp, substitution in self.STARTING_QUOTES:
             text = regexp.sub(substitution, text)

@@ -133,7 +143,7 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):
         # for regexp in self._contractions.CONTRACTIONS4:
         # text = regexp.sub(r' \1 \2 \3 ', text)

-        return text if return_str else text.split()
+        return text.split()

     def span_tokenize(self, text):
         r"""
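As a usage note, a small sketch of what the treebank change means for callers, assuming a build that includes this commit (the sample sentence is illustrative): the deprecated flag is still accepted, but tokenize() now always returns a list, so return_str=True no longer yields the processed string.

from nltk.tokenize.treebank import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

# Before this commit, return_str=True returned the processed text as one string.
# With it, the flag only triggers a DeprecationWarning and the result is always a list.
result = tokenizer.tokenize("They'll save and invest more.", return_str=True)
assert isinstance(result, list)

# Callers who still want a single string can join the tokens themselves.
joined = " ".join(result)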
