Improved Tokenize documentation + added TokenizerI as superclass for TweetTokenizer (#2878)

* Add span_tokenize to NLTKWordTokenizer, just like in TreebankWordTokenizer

* Added documentation for core tokenization modules

* Added tokenize_sents method to TweetTokenizer

By making it a subclass of TokenizerI

* Resolved documentation indentation issue in tokenize/casual.py

* Fixed copy-paste issue in tokenize docstring
tomaarsen committed Nov 21, 2021
1 parent 6dcfa80 commit b30b6ac
Showing 5 changed files with 126 additions and 69 deletions.
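
Taken together, the diff below enables usage along these lines (an illustrative sketch against this commit, outputs omitted; the new doctests further down give exact results):

from nltk.tokenize import NLTKWordTokenizer, TweetTokenizer

# TweetTokenizer now inherits tokenize_sents() by subclassing TokenizerI ...
TweetTokenizer().tokenize_sents(["Nice day :-)", "See you tonight."])

# ... and NLTKWordTokenizer gains span_tokenize(), mirroring TreebankWordTokenizer.
list(NLTKWordTokenizer().span_tokenize("Good muffins cost $3.88 in New York."))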
15 changes: 14 additions & 1 deletion nltk/test/tokenize.doctest
@@ -4,7 +4,7 @@
>>> from nltk.tokenize import *

Regression Tests: NLTKWordTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Tokenizing some test strings.

@@ -259,6 +259,19 @@ It should not hang on long sequences of the same punctuation character.
>>> tknzr.tokenize(s10)
['Photo', ':', "Aujourd'hui", 'sur', 'http://t.co/0gebOFDUzn', 'Projet', '...', 'http://t.co/bKfIUbydz2', '...', 'http://fb.me/3b6uXpz0L']

Tokenizing multiple sentences at once:

>>> tknzr = TweetTokenizer()
>>> sentences = [
... "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--",
... "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P",
... "@_willy65: No place for @chuck tonight. Sorry."
... ]
>>> tknzr.tokenize_sents(sentences) # doctest: +NORMALIZE_WHITESPACE
[['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'],
['@jrmy', ':', "I'm", 'REALLY', 'HAPPYYY', 'about', 'that', '!', 'NICEEEE', ':D', ':P'],
['@_willy65', ':', 'No', 'place', 'for', '@chuck', 'tonight', '.', 'Sorry', '.']]
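
Since TweetTokenizer now subclasses TokenizerI (see the casual.py change below), the batched call is simply tokenize() mapped over the list, as the TokenizerI docstring states; a small sketch outside the doctest file:

from nltk.tokenize import TweetTokenizer

tknzr = TweetTokenizer()
sentences = ["Nice day :-)", "See you tonight."]
# TokenizerI.tokenize_sents is documented as [self.tokenize(s) for s in strings].
assert tknzr.tokenize_sents(sentences) == [tknzr.tokenize(s) for s in sentences]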


Regression Tests: PunktSentenceTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
19 changes: 11 additions & 8 deletions nltk/tokenize/api.py
@@ -11,6 +11,7 @@
"""

from abc import ABC, abstractmethod
from typing import Iterator, List, Tuple

from nltk.internals import overridden
from nltk.tokenize.util import string_span_tokenize
@@ -23,41 +24,43 @@ class TokenizerI(ABC):
"""

@abstractmethod
def tokenize(self, s):
def tokenize(self, s: str) -> List[str]:
"""
Return a tokenized copy of *s*.
:rtype: list of str
:rtype: List[str]
"""
if overridden(self.tokenize_sents):
return self.tokenize_sents([s])[0]

def span_tokenize(self, s):
def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
"""
Identify the tokens using integer offsets ``(start_i, end_i)``,
where ``s[start_i:end_i]`` is the corresponding token.
:rtype: iter(tuple(int, int))
:rtype: Iterator[Tuple[int, int]]
"""
raise NotImplementedError()

def tokenize_sents(self, strings):
def tokenize_sents(self, strings: List[str]) -> List[List[str]]:
"""
Apply ``self.tokenize()`` to each element of ``strings``. I.e.:
return [self.tokenize(s) for s in strings]
:rtype: list(list(str))
:rtype: List[List[str]]
"""
return [self.tokenize(s) for s in strings]

def span_tokenize_sents(self, strings):
def span_tokenize_sents(
self, strings: List[str]
) -> Iterator[List[Tuple[int, int]]]:
"""
Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.:
return [self.span_tokenize(s) for s in strings]
:rtype: iter(list(tuple(int, int)))
:yield: List[Tuple[int, int]]
"""
for s in strings:
yield list(self.span_tokenize(s))
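
For orientation, a minimal sketch (not part of this diff) of what the TokenizerI contract above gives a subclass for free; the WhitespaceSpanTokenizer class is made up for illustration:

import re
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI


class WhitespaceSpanTokenizer(TokenizerI):
    """Hypothetical example: split on whitespace and report character spans."""

    def tokenize(self, s: str) -> List[str]:
        return s.split()

    def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
        for match in re.finditer(r"\S+", s):
            yield match.span()


tknzr = WhitespaceSpanTokenizer()
print(tknzr.tokenize_sents(["a b", "c d"]))      # [['a', 'b'], ['c', 'd']] -- inherited
print(list(tknzr.span_tokenize_sents(["a b"])))  # [[(0, 1), (2, 3)]] -- inherited

Only tokenize() (and optionally span_tokenize()) needs to be written; the *_sents variants come from the base class, which is exactly what the TweetTokenizer change below relies on.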
18 changes: 10 additions & 8 deletions nltk/tokenize/casual.py
@@ -28,16 +28,16 @@
4. When instantiating Tokenizer objects, there are several options:
    * preserve_case. By default, it is set to True. If it is set to
      False, then the tokenizer will downcase everything except for
      emoticons.
    * reduce_len. By default, it is set to False. It specifies whether
      to replace repeated character sequences of length 3 or greater
      with sequences of length 3.
    * strip_handles. By default, it is set to False. It specifies
      whether to remove Twitter handles of text used in the
      `tokenize` method.
    * match_phone_numbers. By default, it is set to True. It indicates
      whether the `tokenize` method should look for phone numbers.
"""


@@ -48,6 +48,8 @@

import regex # https://github.com/nltk/nltk/issues/2409

from nltk.tokenize.api import TokenizerI

######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
@@ -276,7 +278,7 @@ def _convert_entity(match):
######################################################################


class TweetTokenizer:
class TweetTokenizer(TokenizerI):
r"""
Tokenizer for tweets.
58 changes: 40 additions & 18 deletions nltk/tokenize/destructive.py
@@ -9,6 +9,7 @@

import re
import warnings
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens
@@ -37,6 +38,9 @@ class NLTKWordTokenizer(TokenizerI):
"""
The NLTK tokenizer that has improved upon the TreebankWordTokenizer.
This is the method that is invoked by ``word_tokenize()``. It assumes that the
text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
The tokenizer is "destructive" in that the regexes applied will munge the
input string to a state beyond reconstruction. It is possible to apply
`TreebankWordDetokenizer.detokenize` to the tokenized outputs of
@@ -113,8 +117,35 @@ class NLTKWordTokenizer(TokenizerI):
CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

def tokenize(self, text, convert_parentheses=False, return_str=False):

def tokenize(
self, text: str, convert_parentheses: bool = False, return_str: bool = False
) -> List[str]:
r"""Return a tokenized copy of `text`.
>>> from nltk.tokenize import NLTKWordTokenizer
>>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
>>> NLTKWordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
>>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True)
['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
>>> NLTKWordTokenizer().tokenize(s, return_str=True)
' Good muffins cost $ 3.88 ( roughly 3,36 euros ) \nin New York. Please buy me\ntwo of them.\nThanks . '
:param text: A string with a sentence or sentences.
:type text: str
:param convert_parentheses: if True, replace parentheses with PTB symbols,
e.g. `(` to `-LRB-`. Defaults to False.
:type convert_parentheses: bool, optional
:param return_str: If True, return tokens as space-separated string,
defaults to False.
:type return_str: bool, optional
:return: List of tokens from `text`.
:rtype: List[str]
"""
if return_str:
warnings.warn(
"Parameter 'return_str' has been deprecated and should no "
@@ -159,9 +190,11 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):

return text.split()

def span_tokenize(self, text):
def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
r"""
Returns the spans of the tokens in ``text``.
Uses the post-hoc ``nltk.tokenize.util.align_tokens`` to return the offset spans.
>>> from nltk.tokenize import NLTKWordTokenizer
>>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
>>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
@@ -175,21 +208,10 @@ def span_tokenize(self, text):
... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
True
Additional example
>>> from nltk.tokenize import NLTKWordTokenizer
>>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
>>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
... (82, 83), (83, 84)]
>>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
True
:param text: A string with a sentence or sentences.
:type text: str
:yield: Tuple[int, int]
"""
raw_tokens = self.tokenize(text)

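
For reference, the span logic above relies on align_tokens from nltk.tokenize.util (imported at the top of this file); a standalone sketch:

from nltk.tokenize.util import align_tokens

# align_tokens() recovers (start, end) offsets for tokens that occur,
# in order, in the original string.
s = "Good muffins cost $3.88."
tokens = ["Good", "muffins", "cost", "$", "3.88", "."]
print(align_tokens(tokens, s))
# [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), (23, 24)]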
85 changes: 51 additions & 34 deletions nltk/tokenize/treebank.py
@@ -19,6 +19,7 @@

import re
import warnings
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.destructive import MacIntyreContractions
@@ -28,8 +29,6 @@
class TreebankWordTokenizer(TokenizerI):
r"""
The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This is the method that is invoked by ``word_tokenize()``. It assumes that the
text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
This tokenizer performs the following steps:
@@ -99,8 +98,35 @@ class TreebankWordTokenizer(TokenizerI):
CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

def tokenize(self, text, convert_parentheses=False, return_str=False):

def tokenize(
self, text: str, convert_parentheses: bool = False, return_str: bool = False
) -> List[str]:
r"""Return a tokenized copy of `text`.
>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
>>> TreebankWordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
>>> TreebankWordTokenizer().tokenize(s, convert_parentheses=True)
['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
>>> TreebankWordTokenizer().tokenize(s, return_str=True)
' Good muffins cost $ 3.88 ( roughly 3,36 euros ) \nin New York. Please buy me\ntwo of them.\nThanks . '
:param text: A string with a sentence or sentences.
:type text: str
:param convert_parentheses: if True, replace parentheses with PTB symbols,
e.g. `(` to `-LRB-`. Defaults to False.
:type convert_parentheses: bool, optional
:param return_str: If True, return tokens as space-separated string,
defaults to False.
:type return_str: bool, optional
:return: List of tokens from `text`.
:rtype: List[str]
"""
if return_str is not False:
warnings.warn(
"Parameter 'return_str' has been deprecated and should no "
@@ -145,8 +171,9 @@ def tokenize(self, text, convert_parentheses=False, return_str=False):

return text.split()

def span_tokenize(self, text):
def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
r"""
Returns the spans of the tokens in ``text``.
Uses the post-hoc ``nltk.tokenize.util.align_tokens`` to return the offset spans.
>>> from nltk.tokenize import TreebankWordTokenizer
@@ -163,22 +190,9 @@ def span_tokenize(self, text):
>>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
True
Additional example
>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
>>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
... (82, 83), (83, 84)]
>>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
>>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
True
:param text: A string with a sentence or sentences.
:type text: str
:yield: Tuple[int, int]
"""
raw_tokens = self.tokenize(text)

@@ -208,13 +222,13 @@ class TreebankWordDetokenizer(TokenizerI):
Note:
- There're additional assumption mades when undoing the padding of `[;@#$%&]`
- There are additional assumptions made when undoing the padding of ``[;@#$%&]``
punctuation symbols that aren't presupposed in the TreebankTokenizer.
- There're additional regexes added in reversing the parentheses tokenization,
- the `r'([\]\)\}\>])\s([:;,.])'` removes the additional right padding added
to the closing parentheses precedding `[:;,.]`.
such as the ``r'([\]\)\}\>])\s([:;,.])'``, which removes the additional right
padding added to the closing parentheses preceding ``[:;,.]``.
- It's not possible to return the original whitespaces as they were because
there wasn't explicit records of where '\n', '\t' or '\s' were removed at
there are no explicit records of where `'\n'`, `'\t'` or `'\s'` were removed at
the text.split() operation.
>>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
Expand All @@ -225,7 +239,7 @@ class TreebankWordDetokenizer(TokenizerI):
>>> d.detokenize(toks)
'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'
The MXPOST parentheses substitution can be undone using the `convert_parentheses`
The MXPOST parentheses substitution can be undone using the ``convert_parentheses``
parameter:
>>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
Expand All @@ -241,14 +255,14 @@ class TreebankWordDetokenizer(TokenizerI):
During tokenization it's safe to add more spaces but during detokenization,
simply undoing the padding doesn't really help.
- During tokenization, left and right pad is added to [!?], when
detokenizing, only left shift the [!?] is needed.
Thus (re.compile(r'\s([?!])'), r'\g<1>')
- During tokenization, left and right pad is added to ``[!?]``, when
detokenizing, only a left shift of the ``[!?]`` is needed.
Thus ``(re.compile(r'\s([?!])'), r'\g<1>')``.
- During tokenization [:,] are left and right padded but when detokenizing,
- During tokenization ``[:,]`` are left and right padded but when detokenizing,
only left shift is necessary and we keep right pad after comma/colon
if the string after is a non-digit.
Thus (re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')
Thus ``(re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')``.
>>> from nltk.tokenize.treebank import TreebankWordDetokenizer
>>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
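
The two reversal rules above can be exercised with plain re, independently of the detokenizer (a sketch, not taken from the source):

import re

# '!' and '?' are padded on both sides during tokenization; only the left
# pad has to be removed when detokenizing.
re.sub(r"\s([?!])", r"\g<1>", "Help !")               # -> 'Help!'

# ':' and ',' keep their right pad when the next character is a non-digit.
re.sub(r"\s([:,])\s([^\d])", r"\1 \2", "hello , it")  # -> 'hello, it'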
@@ -332,13 +346,16 @@ class TreebankWordDetokenizer(TokenizerI):
(re.compile(r"``"), r'"'),
]

def tokenize(self, tokens, convert_parentheses=False):
def tokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
"""
Treebank detokenizer, created by undoing the regexes from
the TreebankWordTokenizer.tokenize.
:param tokens: A list of strings, i.e. tokenized text.
:type tokens: list(str)
:type tokens: List[str]
:param convert_parentheses: if True, replace PTB symbols with parentheses,
e.g. `-LRB-` to `(`. Defaults to False.
:type convert_parentheses: bool, optional
:rtype: str
"""
text = " ".join(tokens)
@@ -378,6 +395,6 @@ def tokenize(self, tokens, convert_parentheses=False):

return text.strip()

def detokenize(self, tokens, convert_parentheses=False):
def detokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
"""Duck-typing the abstract *tokenize()*."""
return self.tokenize(tokens, convert_parentheses)
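
A round-trip sketch using the two classes in this file (sentence chosen for brevity; the expected result is hedged, not asserted):

from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer

s = "Good muffins cost $3.88 in New (York)."
toks = TreebankWordTokenizer().tokenize(s, convert_parentheses=True)
# toks now uses '-LRB-' / '-RRB-' in place of the parentheses.
restored = TreebankWordDetokenizer().detokenize(toks, convert_parentheses=True)
# restored should match s, in line with the class doctests above.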
