Improved Tokenize documentation + added TokenizerI as superclass for TweetTokenizer #2878

Merged
7 commits merged on Nov 21, 2021
15 changes: 14 additions & 1 deletion nltk/test/tokenize.doctest
@@ -4,7 +4,7 @@
>>> from nltk.tokenize import *

Regression Tests: NLTKWordTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Tokenizing some test strings.

@@ -259,6 +259,19 @@ It should not hang on long sequences of the same punctuation character.
>>> tknzr.tokenize(s10)
['Photo', ':', "Aujourd'hui", 'sur', 'http://t.co/0gebOFDUzn', 'Projet', '...', 'http://t.co/bKfIUbydz2', '...', 'http://fb.me/3b6uXpz0L']

Tokenizing multiple sentences at once:

>>> tknzr = TweetTokenizer()
>>> sentences = [
... "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--",
... "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P",
... "@_willy65: No place for @chuck tonight. Sorry."
... ]
>>> tknzr.tokenize_sents(sentences) # doctest: +NORMALIZE_WHITESPACE
[['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'],
['@jrmy', ':', "I'm", 'REALLY', 'HAPPYYY', 'about', 'that', '!', 'NICEEEE', ':D', ':P'],
['@_willy65', ':', 'No', 'place', 'for', '@chuck', 'tonight', '.', 'Sorry', '.']]


Regression Tests: PunktSentenceTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
19 changes: 11 additions & 8 deletions nltk/tokenize/api.py
@@ -11,6 +11,7 @@
"""

from abc import ABC, abstractmethod
from typing import Iterator, List, Tuple

from nltk.internals import overridden
from nltk.tokenize.util import string_span_tokenize
@@ -23,41 +24,43 @@ class TokenizerI(ABC):
"""

@abstractmethod
def tokenize(self, s):
def tokenize(self, s: str) -> List[str]:
"""
Return a tokenized copy of *s*.

:rtype: list of str
:rtype: List[str]
"""
if overridden(self.tokenize_sents):
return self.tokenize_sents([s])[0]

def span_tokenize(self, s):
def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
"""
Identify the tokens using integer offsets ``(start_i, end_i)``,
where ``s[start_i:end_i]`` is the corresponding token.

:rtype: iter(tuple(int, int))
:rtype: Iterator[Tuple[int, int]]
"""
raise NotImplementedError()

def tokenize_sents(self, strings):
def tokenize_sents(self, strings: List[str]) -> List[List[str]]:
"""
Apply ``self.tokenize()`` to each element of ``strings``. I.e.:

return [self.tokenize(s) for s in strings]

:rtype: list(list(str))
:rtype: List[List[str]]
"""
return [self.tokenize(s) for s in strings]

def span_tokenize_sents(self, strings):
def span_tokenize_sents(
self, strings: List[str]
) -> Iterator[List[Tuple[int, int]]]:
"""
Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.:

return [self.span_tokenize(s) for s in strings]

:rtype: iter(list(tuple(int, int)))
:yield: List[Tuple[int, int]]
"""
for s in strings:
yield list(self.span_tokenize(s))
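To make the interface above concrete, here is a minimal sketch (an editor's illustration, not part of this diff; the ``WhitespaceOnlyTokenizer`` name is hypothetical): a subclass only needs to implement ``tokenize()``, while ``tokenize_sents()`` is inherited from ``TokenizerI``.

>>> from typing import List
>>> from nltk.tokenize.api import TokenizerI
>>> class WhitespaceOnlyTokenizer(TokenizerI):
...     """Toy tokenizer used only to illustrate the TokenizerI contract."""
...     def tokenize(self, s: str) -> List[str]:
...         # The only abstract method; the batch helpers come from the interface.
...         return s.split()
>>> tok = WhitespaceOnlyTokenizer()
>>> tok.tokenize("Good muffins cost $3.88")
['Good', 'muffins', 'cost', '$3.88']
>>> tok.tokenize_sents(["Good muffins", "cost $3.88"])  # inherited from TokenizerI
[['Good', 'muffins'], ['cost', '$3.88']]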
18 changes: 10 additions & 8 deletions nltk/tokenize/casual.py
@@ -28,16 +28,16 @@

4. When instantiating Tokenizer objects, there are several options:
* preserve_case. By default, it is set to True. If it is set to
False, then the tokenizer will downcase everything except for
emoticons.
* reduce_len. By default, it is set to False. It specifies whether
to replace repeated character sequences of length 3 or greater
with sequences of length 3.
* strip_handles. By default, it is set to False. It specifies
whether to remove Twitter handles from the text passed to the
`tokenize` method.
* match_phone_numbers. By default, it is set to True. It indicates
whether the `tokenize` method should look for phone numbers
(a short usage sketch follows this docstring).
"""


@@ -48,6 +48,8 @@

import regex # https://github.com/nltk/nltk/issues/2409

from nltk.tokenize.api import TokenizerI

######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
@@ -276,7 +278,7 @@ def _convert_entity(match):
######################################################################


class TweetTokenizer:
class TweetTokenizer(TokenizerI):
r"""
Tokenizer for tweets.

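Because ``TweetTokenizer`` now subclasses ``TokenizerI``, it satisfies the common tokenizer interface and picks up the batch helpers such as ``tokenize_sents()`` from the superclass instead of reimplementing them. A brief sketch (editor's illustration, assuming this branch is installed):

>>> from nltk.tokenize import TweetTokenizer
>>> from nltk.tokenize.api import TokenizerI
>>> isinstance(TweetTokenizer(), TokenizerI)
True
>>> TweetTokenizer().tokenize_sents(["Nice day :-)"])  # inherited from TokenizerI
[['Nice', 'day', ':-)']]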
58 changes: 40 additions & 18 deletions nltk/tokenize/destructive.py
@@ -9,6 +9,7 @@

import re
import warnings
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens
@@ -37,6 +38,9 @@ class NLTKWordTokenizer(TokenizerI):
"""
The NLTK tokenizer that has improved upon the TreebankWordTokenizer.

This is the tokenizer that is invoked by ``word_tokenize()``. It assumes that the
text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

The tokenizer is "destructive" such that the regexes applied will munge the
input string to a state beyond re-construction. It is possible to apply
`TreebankWordDetokenizer.detokenize` to the tokenized outputs of
@@ -113,8 +117,35 @@ class NLTKWordTokenizer(TokenizerI):
CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

def tokenize(self, text, convert_parentheses=False, return_str=False):

def tokenize(
self, text: str, convert_parentheses: bool = False, return_str: bool = False
) -> List[str]:
r"""Return a tokenized copy of `text`.

>>> from nltk.tokenize import NLTKWordTokenizer
>>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
>>> NLTKWordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
>>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True)
['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
>>> NLTKWordTokenizer().tokenize(s, return_str=True)
' Good muffins cost $ 3.88 ( roughly 3,36 euros ) \nin New York. Please buy me\ntwo of them.\nThanks . '

:param text: A string with a sentence or sentences.
:type text: str
:param convert_parentheses: If True, convert parentheses to PTB symbols,
e.g. ``(`` to ``-LRB-``. Defaults to False.
:type convert_parentheses: bool, optional
:param return_str: If True, return tokens as a space-separated string
(note: this parameter is deprecated). Defaults to False.
:type return_str: bool, optional
:return: List of tokens from `text`.
:rtype: List[str]
"""
if return_str:
warnings.warn(
"Parameter 'return_str' has been deprecated and should no "
@@ -159,9 +190,11 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):

return text.split()

def span_tokenize(self, text):
def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
r"""
Returns the spans of the tokens in ``text``.
Uses the post-hoc ``nltk.tokenize.util.align_tokens`` to return the offset spans.

>>> from nltk.tokenize import NLTKWordTokenizer
>>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
>>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
@@ -175,21 +208,10 @@ def span_tokenize(self, text):
... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
True
Additional example
>>> from nltk.tokenize import NLTKWordTokenizer
>>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
>>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
... (82, 83), (83, 84)]
>>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
True

:param text: A string with a sentence or sentences.
:type text: str
:yield: Tuple[int, int]
"""
raw_tokens = self.tokenize(text)

85 changes: 51 additions & 34 deletions nltk/tokenize/treebank.py
@@ -19,6 +19,7 @@

import re
import warnings
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.destructive import MacIntyreContractions
@@ -28,8 +29,6 @@
class TreebankWordTokenizer(TokenizerI):
r"""
The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This is the method that is invoked by ``word_tokenize()``. It assumes that the
text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

This tokenizer performs the following steps:

@@ -99,8 +98,35 @@ class TreebankWordTokenizer(TokenizerI):
CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

def tokenize(self, text, convert_parentheses=False, return_str=False):

def tokenize(
self, text: str, convert_parentheses: bool = False, return_str: bool = False
) -> List[str]:
r"""Return a tokenized copy of `text`.

>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
>>> TreebankWordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
>>> TreebankWordTokenizer().tokenize(s, convert_parentheses=True)
['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
>>> TreebankWordTokenizer().tokenize(s, return_str=True)
' Good muffins cost $ 3.88 ( roughly 3,36 euros ) \nin New York. Please buy me\ntwo of them.\nThanks . '

:param text: A string with a sentence or sentences.
:type text: str
:param convert_parentheses: If True, convert parentheses to PTB symbols,
e.g. ``(`` to ``-LRB-``. Defaults to False.
:type convert_parentheses: bool, optional
:param return_str: If True, return tokens as a space-separated string
(note: this parameter is deprecated). Defaults to False.
:type return_str: bool, optional
:return: List of tokens from `text`.
:rtype: List[str]
"""
if return_str is not False:
warnings.warn(
"Parameter 'return_str' has been deprecated and should no "
@@ -145,8 +171,9 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):

return text.split()

def span_tokenize(self, text):
def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
r"""
Returns the spans of the tokens in ``text``.
Uses the post-hoc ``nltk.tokenize.util.align_tokens`` to return the offset spans.

>>> from nltk.tokenize import TreebankWordTokenizer
@@ -163,22 +190,9 @@ def span_tokenize(self, text):
>>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
True

Additional example
>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
>>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
... (82, 83), (83, 84)]
>>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
>>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
True

:param text: A string with a sentence or sentences.
:type text: str
:yield: Tuple[int, int]
"""
raw_tokens = self.tokenize(text)

@@ -208,13 +222,13 @@ class TreebankWordDetokenizer(TokenizerI):

Note:

- There are additional assumptions made when undoing the padding of ``[;@#$%&]``
punctuation symbols that are not presupposed in the TreebankTokenizer.
- There are additional regexes added in reversing the parentheses tokenization,
such as ``r'([\]\)\}\>])\s([:;,.])'``, which removes the additional right
padding added to the closing parentheses preceding ``[:;,.]`` (sketched just below).
- It is not possible to return the original whitespaces as they were, because
there is no explicit record of where ``'\n'``, ``'\t'`` or ``'\s'`` were removed during
the ``text.split()`` operation.
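For instance (an editor's sketch, not an NLTK doctest; the replacement string ``r'\1\2'`` is assumed here purely for illustration), the parentheses-related regex above can be applied directly with ``re``:

>>> import re
>>> re.sub(r'([\]\)\}\>])\s([:;,.])', r'\1\2', '( Thanks ) .')
'( Thanks ).'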

>>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
Expand All @@ -225,7 +239,7 @@ class TreebankWordDetokenizer(TokenizerI):
>>> d.detokenize(toks)
'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'

The MXPOST parentheses substitution can be undone using the `convert_parentheses`
The MXPOST parentheses substitution can be undone using the ``convert_parentheses``
parameter:

>>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
@@ -241,14 +255,14 @@
During tokenization it's safe to add more spaces but during detokenization,
simply undoing the padding doesn't really help.

- During tokenization, a left and right pad is added to ``[!?]``; when
detokenizing, only a left shift of the ``[!?]`` is needed.
Thus ``(re.compile(r'\s([?!])'), r'\g<1>')``.

- During tokenization, ``[:,]`` are left and right padded, but when detokenizing,
only a left shift is necessary, and the right pad after a comma/colon is kept
if the string that follows is a non-digit.
Thus ``(re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')`` (see the sketch below).
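In the same spirit (editor's sketch), applying the two substitutions above directly with ``re`` shows the padding being undone:

>>> import re
>>> re.sub(r'\s([?!])', r'\g<1>', 'Help ! !')
'Help!!'
>>> re.compile(r'\s([:,])\s([^\d])').sub(r'\1 \2', "hello , i ca n't")
"hello, i ca n't"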

>>> from nltk.tokenize.treebank import TreebankWordDetokenizer
>>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
@@ -332,13 +346,16 @@ class TreebankWordDetokenizer(TokenizerI):
(re.compile(r"``"), r'"'),
]

def tokenize(self, tokens, convert_parentheses=False):
def tokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
"""
Treebank detokenizer, created by undoing the regexes from
the TreebankWordTokenizer.tokenize.

:param tokens: A list of strings, i.e. tokenized text.
:type tokens: list(str)
:type tokens: List[str]
:param convert_parentheses: If True, replace PTB symbols with parentheses,
e.g. ``-LRB-`` to ``(``. Defaults to False.
:type convert_parentheses: bool, optional
:return: str
"""
text = " ".join(tokens)
@@ -378,6 +395,6 @@ def tokenize(self, tokens, convert_parentheses=False):

return text.strip()

def detokenize(self, tokens, convert_parentheses=False):
def detokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
"""Duck-typing the abstract *tokenize()*."""
return self.tokenize(tokens, convert_parentheses)