Improved Tokenize documentation + added TokenizerI as superclass for TweetTokenizer (#2878)

* Add span_tokenize to NLTKWordTokenizer, just like in TreebankWordTokenizer

* Added documentation for core tokenization modules

* Added tokenize_sents method to TweetTokenizer

By making it a subclass of TokenizerI

* Resolved documentation indentation issue in tokenize/casual.py

* Fixed copy-paste issue in tokenize docstring
tomaarsen committed Nov 21, 2021
1 parent 6dcfa80 commit b30b6ac
Showing 5 changed files with 126 additions and 69 deletions.
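
Taken together, the diff below enables usage along these lines (an illustrative sketch against this commit, outputs omitted; the new doctests further down give exact results):

from nltk.tokenize import NLTKWordTokenizer, TweetTokenizer

# TweetTokenizer now inherits tokenize_sents() by subclassing TokenizerI ...
TweetTokenizer().tokenize_sents(["Nice day :-)", "See you tonight."])

# ... and NLTKWordTokenizer gains span_tokenize(), mirroring TreebankWordTokenizer.
list(NLTKWordTokenizer().span_tokenize("Good muffins cost $3.88 in New York."))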
15 changes: 14 additions & 1 deletion nltk/test/tokenize.doctest
@@ -4,7 +4,7 @@
>>> from nltk.tokenize import *

Regression Tests: NLTKWordTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Tokenizing some test strings.

@@ -259,6 +259,19 @@ It should not hang on long sequences of the same punctuation character.
>>> tknzr.tokenize(s10)
['Photo', ':', "Aujourd'hui", 'sur', 'http://t.co/0gebOFDUzn', 'Projet', '...', 'http://t.co/bKfIUbydz2', '...', 'http://fb.me/3b6uXpz0L']

Tokenizing multiple sentences at once:

>>> tknzr = TweetTokenizer()
>>> sentences = [
... "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--",
... "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P",
... "@_willy65: No place for @chuck tonight. Sorry."
... ]
>>> tknzr.tokenize_sents(sentences) # doctest: +NORMALIZE_WHITESPACE
[['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'],
['@jrmy', ':', "I'm", 'REALLY', 'HAPPYYY', 'about', 'that', '!', 'NICEEEE', ':D', ':P'],
['@_willy65', ':', 'No', 'place', 'for', '@chuck', 'tonight', '.', 'Sorry', '.']]
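
Since TweetTokenizer now subclasses TokenizerI (see the casual.py change below), the batched call is simply tokenize() mapped over the list, as the TokenizerI docstring states; a small sketch outside the doctest file:

from nltk.tokenize import TweetTokenizer

tknzr = TweetTokenizer()
sentences = ["Nice day :-)", "See you tonight."]
# TokenizerI.tokenize_sents is documented as [self.tokenize(s) for s in strings].
assert tknzr.tokenize_sents(sentences) == [tknzr.tokenize(s) for s in sentences]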


Regression Tests: PunktSentenceTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
19 changes: 11 additions & 8 deletions nltk/tokenize/api.py
@@ -11,6 +11,7 @@
"""

from abc import ABC, abstractmethod
from typing import Iterator, List, Tuple

from nltk.internals import overridden
from nltk.tokenize.util import string_span_tokenize
@@ -23,41 +24,43 @@ class TokenizerI(ABC):
"""

@abstractmethod
def tokenize(self, s):
def tokenize(self, s: str) -> List[str]:
"""
Return a tokenized copy of *s*.
:rtype: list of str
:rtype: List[str]
"""
if overridden(self.tokenize_sents):
return self.tokenize_sents([s])[0]

def span_tokenize(self, s):
def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
"""
Identify the tokens using integer offsets ``(start_i, end_i)``,
where ``s[start_i:end_i]`` is the corresponding token.
:rtype: iter(tuple(int, int))
:rtype: Iterator[Tuple[int, int]]
"""
raise NotImplementedError()

def tokenize_sents(self, strings):
def tokenize_sents(self, strings: List[str]) -> List[List[str]]:
"""
Apply ``self.tokenize()`` to each element of ``strings``. I.e.:
return [self.tokenize(s) for s in strings]
:rtype: list(list(str))
:rtype: List[List[str]]
"""
return [self.tokenize(s) for s in strings]

def span_tokenize_sents(self, strings):
def span_tokenize_sents(
self, strings: List[str]
) -> Iterator[List[Tuple[int, int]]]:
"""
Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.:
return [self.span_tokenize(s) for s in strings]
:rtype: iter(list(tuple(int, int)))
:yield: List[Tuple[int, int]]
"""
for s in strings:
yield list(self.span_tokenize(s))
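
For orientation, a minimal sketch (not part of this diff) of what the TokenizerI contract above gives a subclass for free; the WhitespaceSpanTokenizer class is made up for illustration:

import re
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI


class WhitespaceSpanTokenizer(TokenizerI):
    """Hypothetical example: split on whitespace and report character spans."""

    def tokenize(self, s: str) -> List[str]:
        return s.split()

    def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
        for match in re.finditer(r"\S+", s):
            yield match.span()


tknzr = WhitespaceSpanTokenizer()
print(tknzr.tokenize_sents(["a b", "c d"]))      # [['a', 'b'], ['c', 'd']] -- inherited
print(list(tknzr.span_tokenize_sents(["a b"])))  # [[(0, 1), (2, 3)]] -- inherited

Only tokenize() (and optionally span_tokenize()) needs to be written; the *_sents variants come from the base class, which is exactly what the TweetTokenizer change below relies on.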
18 changes: 10 additions & 8 deletions nltk/tokenize/casual.py
@@ -28,16 +28,16 @@
4. When instantiating Tokenizer objects, there are several options:
    * preserve_case. By default, it is set to True. If it is set to
      False, then the tokenizer will downcase everything except for
      emoticons.
    * reduce_len. By default, it is set to False. It specifies whether
      to replace repeated character sequences of length 3 or greater
      with sequences of length 3.
    * strip_handles. By default, it is set to False. It specifies
      whether to remove Twitter handles of text used in the
      `tokenize` method.
    * match_phone_numbers. By default, it is set to True. It indicates
      whether the `tokenize` method should look for phone numbers.
"""


@@ -48,6 +48,8 @@

import regex # https://github.com/nltk/nltk/issues/2409

from nltk.tokenize.api import TokenizerI

######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
@@ -276,7 +278,7 @@ def _convert_entity(match):
######################################################################


class TweetTokenizer:
class TweetTokenizer(TokenizerI):
r"""
Tokenizer for tweets.
58 changes: 40 additions & 18 deletions nltk/tokenize/destructive.py
@@ -9,6 +9,7 @@

import re
import warnings
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens
@@ -37,6 +38,9 @@ class NLTKWordTokenizer(TokenizerI):
"""
The NLTK tokenizer that has improved upon the TreebankWordTokenizer.
This is the method that is invoked by ``word_tokenize()``. It assumes that the
text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
The tokenizer is "destructive" in that the regexes applied will munge the
input string to a state beyond reconstruction. It is possible to apply
`TreebankWordDetokenizer.detokenize` to the tokenized outputs of
@@ -113,8 +117,35 @@ class NLTKWordTokenizer(TokenizerI):
CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

def tokenize(self, text, convert_parentheses=False, return_str=False):

def tokenize(
self, text: str, convert_parentheses: bool = False, return_str: bool = False
) -> List[str]:
r"""Return a tokenized copy of `text`.
>>> from nltk.tokenize import NLTKWordTokenizer
>>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
>>> NLTKWordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
>>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True)
['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
>>> NLTKWordTokenizer().tokenize(s, return_str=True)
' Good muffins cost $ 3.88 ( roughly 3,36 euros ) \nin New York. Please buy me\ntwo of them.\nThanks . '
:param text: A string with a sentence or sentences.
:type text: str
:param convert_parentheses: if True, replace parentheses with PTB symbols,
e.g. `(` to `-LRB-`. Defaults to False.
:type convert_parentheses: bool, optional
:param return_str: If True, return tokens as space-separated string,
defaults to False.
:type return_str: bool, optional
:return: List of tokens from `text`.
:rtype: List[str]
"""
if return_str:
warnings.warn(
"Parameter 'return_str' has been deprecated and should no "
@@ -159,9 +190,11 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):

return text.split()

def span_tokenize(self, text):
def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
r"""
Returns the spans of the tokens in ``text``.
Uses the post-hoc ``nltk.tokenize.util.align_tokens`` to return the offset spans.
>>> from nltk.tokenize import NLTKWordTokenizer
>>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
>>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
@@ -175,21 +208,10 @@ def span_tokenize(self, text):
... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
True
Additional example
>>> from nltk.tokenize import NLTKWordTokenizer
>>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
>>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
... (82, 83), (83, 84)]
>>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
True
:param text: A string with a sentence or sentences.
:type text: str
:yield: Tuple[int, int]
"""
raw_tokens = self.tokenize(text)

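
For reference, the span logic above relies on align_tokens from nltk.tokenize.util (imported at the top of this file); a standalone sketch:

from nltk.tokenize.util import align_tokens

# align_tokens() recovers (start, end) offsets for tokens that occur,
# in order, in the original string.
s = "Good muffins cost $3.88."
tokens = ["Good", "muffins", "cost", "$", "3.88", "."]
print(align_tokens(tokens, s))
# [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), (23, 24)]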
85 changes: 51 additions & 34 deletions nltk/tokenize/treebank.py
@@ -19,6 +19,7 @@

import re
import warnings
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.destructive import MacIntyreContractions
@@ -28,8 +29,6 @@
class TreebankWordTokenizer(TokenizerI):
r"""
The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This is the method that is invoked by ``word_tokenize()``. It assumes that the
text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
This tokenizer performs the following steps:
@@ -99,8 +98,35 @@ class TreebankWordTokenizer(TokenizerI):
CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

def tokenize(self, text, convert_parentheses=False, return_str=False):

def tokenize(
self, text: str, convert_parentheses: bool = False, return_str: bool = False
) -> List[str]:
r"""Return a tokenized copy of `text`.
>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
>>> TreebankWordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
>>> TreebankWordTokenizer().tokenize(s, convert_parentheses=True)
['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
>>> TreebankWordTokenizer().tokenize(s, return_str=True)
' Good muffins cost $ 3.88 ( roughly 3,36 euros ) \nin New York. Please buy me\ntwo of them.\nThanks . '
:param text: A string with a sentence or sentences.
:type text: str
:param convert_parentheses: if True, replace parentheses with PTB symbols,
e.g. `(` to `-LRB-`. Defaults to False.
:type convert_parentheses: bool, optional
:param return_str: If True, return tokens as space-separated string,
defaults to False.
:type return_str: bool, optional
:return: List of tokens from `text`.
:rtype: List[str]
"""
if return_str is not False:
warnings.warn(
"Parameter 'return_str' has been deprecated and should no "
@@ -145,8 +171,9 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):

return text.split()

def span_tokenize(self, text):
def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
r"""
Returns the spans of the tokens in ``text``.
Uses the post-hoc ``nltk.tokenize.util.align_tokens`` to return the offset spans.
>>> from nltk.tokenize import TreebankWordTokenizer
@@ -163,22 +190,9 @@ def span_tokenize(self, text):
>>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
True
Additional example
>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
>>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
... (82, 83), (83, 84)]
>>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
>>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
True
:param text: A string with a sentence or sentences.
:type text: str
:yield: Tuple[int, int]
"""
raw_tokens = self.tokenize(text)

@@ -208,13 +222,13 @@ class TreebankWordDetokenizer(TokenizerI):
Note:
- There're additional assumption mades when undoing the padding of `[;@#$%&]`
- There are additional assumptions made when undoing the padding of ``[;@#$%&]``
punctuation symbols that aren't presupposed in the TreebankTokenizer.
- There're additional regexes added in reversing the parentheses tokenization,
- the `r'([\]\)\}\>])\s([:;,.])'` removes the additional right padding added
to the closing parentheses precedding `[:;,.]`.
such as the ``r'([\]\)\}\>])\s([:;,.])'``, which removes the additional right
padding added to the closing parentheses preceding ``[:;,.]``.
- It's not possible to return the original whitespaces as they were because
there wasn't explicit records of where '\n', '\t' or '\s' were removed at
there are no explicit records of where `'\n'`, `'\t'` or `'\s'` were removed at
the text.split() operation.
>>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
Expand All @@ -225,7 +239,7 @@ class TreebankWordDetokenizer(TokenizerI):
>>> d.detokenize(toks)
'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'
The MXPOST parentheses substitution can be undone using the `convert_parentheses`
The MXPOST parentheses substitution can be undone using the ``convert_parentheses``
parameter:
>>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
Expand All @@ -241,14 +255,14 @@ class TreebankWordDetokenizer(TokenizerI):
During tokenization it's safe to add more spaces but during detokenization,
simply undoing the padding doesn't really help.
- During tokenization, left and right pad is added to [!?], when
detokenizing, only left shift the [!?] is needed.
Thus (re.compile(r'\s([?!])'), r'\g<1>')
- During tokenization, left and right pad is added to ``[!?]``, when
detokenizing, only a left shift of the ``[!?]`` is needed.
Thus ``(re.compile(r'\s([?!])'), r'\g<1>')``.
- During tokenization [:,] are left and right padded but when detokenizing,
- During tokenization ``[:,]`` are left and right padded but when detokenizing,
only left shift is necessary and we keep right pad after comma/colon
if the string after is a non-digit.
Thus (re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')
Thus ``(re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')``.
>>> from nltk.tokenize.treebank import TreebankWordDetokenizer
>>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
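
The two reversal rules above can be exercised with plain re, independently of the detokenizer (a sketch, not taken from the source):

import re

# '!' and '?' are padded on both sides during tokenization; only the left
# pad has to be removed when detokenizing.
re.sub(r"\s([?!])", r"\g<1>", "Help !")               # -> 'Help!'

# ':' and ',' keep their right pad when the next character is a non-digit.
re.sub(r"\s([:,])\s([^\d])", r"\1 \2", "hello , it")  # -> 'hello, it'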
@@ -332,13 +346,16 @@ class TreebankWordDetokenizer(TokenizerI):
(re.compile(r"``"), r'"'),
]

def tokenize(self, tokens, convert_parentheses=False):
def tokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
"""
Treebank detokenizer, created by undoing the regexes from
the TreebankWordTokenizer.tokenize.
:param tokens: A list of strings, i.e. tokenized text.
:type tokens: list(str)
:type tokens: List[str]
:param convert_parentheses: if True, replace PTB symbols with parentheses,
e.g. `-LRB-` to `(`. Defaults to False.
:type convert_parentheses: bool, optional
:rtype: str
"""
text = " ".join(tokens)
@@ -378,6 +395,6 @@ def tokenize(self, tokens, convert_parentheses=False):

return text.strip()

def detokenize(self, tokens, convert_parentheses=False):
def detokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
"""Duck-typing the abstract *tokenize()*."""
return self.tokenize(tokens, convert_parentheses)
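
A round-trip sketch using the two classes in this file (sentence chosen for brevity; the expected result is hedged, not asserted):

from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer

s = "Good muffins cost $3.88 in New (York)."
toks = TreebankWordTokenizer().tokenize(s, convert_parentheses=True)
# toks now uses '-LRB-' / '-RRB-' in place of the parentheses.
restored = TreebankWordDetokenizer().detokenize(toks, convert_parentheses=True)
# restored should match s, in line with the class doctests above.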
