Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecate 'return_str' parameter in NLTKWordTokenizer and TreebankWordTokenizer #2883

Merged
merged 2 commits on Nov 18, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS.md
Expand Up @@ -285,6 +285,7 @@
- Saibo Geng <https://github.com/Saibo-creator>
- Ahmet Yildirim <https://github.com/RnDevelover>
- Yuta Nakamura <https://github.com/yutanakamura-tky>
- Adam Hawley <https://github.com/adamjhawley>

## Others whose work we've taken and included in NLTK, but who didn't directly contribute it:

Expand Down
12 changes: 11 additions & 1 deletion nltk/tokenize/destructive.py
Expand Up @@ -8,6 +8,7 @@


import re
import warnings

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens
Expand Down Expand Up @@ -113,6 +114,15 @@ class NLTKWordTokenizer(TokenizerI):
CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

def tokenize(self, text, convert_parentheses=False, return_str=False):

if return_str is not False:
warnings.warn(
"Parameter 'return_str' has been deprecated and should no "
"longer be used.",
category=DeprecationWarning,
stacklevel=2,
)

for regexp, substitution in self.STARTING_QUOTES:
text = regexp.sub(substitution, text)

Expand Down Expand Up @@ -147,7 +157,7 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):
# for regexp in self._contractions.CONTRACTIONS4:
# text = regexp.sub(r' \1 \2 \3 ', text)

return text if return_str else text.split()
return text.split()

def span_tokenize(self, text):
r"""
Expand Down
12 changes: 11 additions & 1 deletion nltk/tokenize/treebank.py
Expand Up @@ -18,6 +18,7 @@
"""

import re
import warnings

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.destructive import MacIntyreContractions
Expand Down Expand Up @@ -99,6 +100,15 @@ class TreebankWordTokenizer(TokenizerI):
CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

def tokenize(self, text, convert_parentheses=False, return_str=False):

if return_str is not False:
warnings.warn(
"Parameter 'return_str' has been deprecated and should no "
"longer be used.",
category=DeprecationWarning,
stacklevel=2,
)

for regexp, substitution in self.STARTING_QUOTES:
text = regexp.sub(substitution, text)

Expand Down Expand Up @@ -133,7 +143,7 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):
# for regexp in self._contractions.CONTRACTIONS4:
# text = regexp.sub(r' \1 \2 \3 ', text)

return text if return_str else text.split()
return text.split()

def span_tokenize(self, text):
r"""
Expand Down