From d97cf17f0ad9903035d0c8e08b3eb552738f28f8 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Fri, 20 Aug 2021 14:45:37 +0930 Subject: [PATCH 01/36] update for 3.6.3 --- nltk/VERSION | 2 +- web/conf.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nltk/VERSION b/nltk/VERSION index b72762837e..4a788a01da 100644 --- a/nltk/VERSION +++ b/nltk/VERSION @@ -1 +1 @@ -3.6.2 +3.6.3 diff --git a/web/conf.py b/web/conf.py index 93ea26071f..ea31ee71f0 100644 --- a/web/conf.py +++ b/web/conf.py @@ -55,9 +55,9 @@ # built documents. # # The short X.Y version. -version = "3.6.2" +version = "3.6.3" # The full version, including alpha/beta/rc tags. -release = "3.6.2" +release = "3.6.3" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From ec0c03c893f6f4f213127f969c61fc8a700d3b36 Mon Sep 17 00:00:00 2001 From: mohaned mashaly <30902228+12mohaned@users.noreply.github.com> Date: Fri, 3 Sep 2021 04:38:00 +0200 Subject: [PATCH 02/36] Fixing TweetTokenizer format (#2791) * Fixing TweetTokenizer format to follow PEP8 format * remove unnecessary else from a condition --- nltk/tokenize/casual.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/nltk/tokenize/casual.py b/nltk/tokenize/casual.py index db7d1dacd2..7d0d17fc64 100644 --- a/nltk/tokenize/casual.py +++ b/nltk/tokenize/casual.py @@ -238,8 +238,7 @@ def _convert_entity(match): else: if entity_body in keep: return match.group(0) - else: - number = html.entities.name2codepoint.get(entity_body) + number = html.entities.name2codepoint.get(entity_body) if number is not None: try: return chr(number) @@ -262,7 +261,8 @@ class TweetTokenizer: >>> tknzr = TweetTokenizer() >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--" >>> tknzr.tokenize(s0) - ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'] + ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3' + , 'and', 'some', 'arrows', '<', '>', '->', '<--'] Examples using `strip_handles` and `reduce_len parameters`: @@ -323,9 +323,11 @@ def remove_handles(text): Remove Twitter username handles from text. """ pattern = regex.compile( - r"(? Date: Sun, 5 Sep 2021 10:23:12 +0200 Subject: [PATCH 03/36] Use global regex for TweetTokenizer'ss remove_handles (#2795) --- nltk/tokenize/casual.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/nltk/tokenize/casual.py b/nltk/tokenize/casual.py index 7d0d17fc64..66f038105f 100644 --- a/nltk/tokenize/casual.py +++ b/nltk/tokenize/casual.py @@ -177,6 +177,11 @@ # These are for regularizing HTML entities to Unicode: ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);") +# For stripping away handles from a tweet: +HANDLES_RE = regex.compile( + r"(? 
Date: Sat, 11 Sep 2021 17:35:28 +0200 Subject: [PATCH 04/36] build: add matplotlib in deps section --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index 50d1693cad..b97d416505 100644 --- a/tox.ini +++ b/tox.ini @@ -29,6 +29,7 @@ deps = click joblib tqdm + matplotlib changedir = nltk/test commands = From ab1027691fcd0e6e9d206156a4fd01eb0c5db626 Mon Sep 17 00:00:00 2001 From: Abhijnan Bajpai <57059194+Abhijnan-Bajpai@users.noreply.github.com> Date: Mon, 13 Sep 2021 21:01:12 +0530 Subject: [PATCH 05/36] Improved the removal of twitter handles (#2799) * Improved the removal of twitter username handles from text * Improved the removal of handles * Changing the length of handles to 15 from 20 * Modified incorrect tests - twitter handles can only be 15 characters * Removed duplicated part of the HANDLES_RE regex Co-authored-by: Tom Aarsen --- nltk/test/unit/test_tokenize.py | 12 ++++++------ nltk/tokenize/casual.py | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/nltk/test/unit/test_tokenize.py b/nltk/test/unit/test_tokenize.py index 65cbef07a4..2efd793ef9 100644 --- a/nltk/test/unit/test_tokenize.py +++ b/nltk/test/unit/test_tokenize.py @@ -306,18 +306,18 @@ def test_remove_handle(self): result = tokenizer.tokenize(test5) assert result == expected - # Tests that handles can have a max length of 20 - test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmnopqrst1234 @abcdefghijklmnopqrst_ @abcdefghijklmnopqrstendofhandle" - expected = ["uvwxyz", "1234", "_", "endofhandle"] + # Tests that handles can have a max length of 15 + test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmno1234 @abcdefghijklmno_ @abcdefghijklmnoendofhandle" + expected = ["pqrstuvwxyz", "1234", "_", "endofhandle"] result = tokenizer.tokenize(test6) assert result == expected # Edge case where an @ comes directly after a long handle - test7 = "@abcdefghijklmnopqrstu@abcde @abcdefghijklmnopqrst@abcde @abcdefghijklmnopqrst_@abcde @abcdefghijklmnopqrst5@abcde" + test7 = "@abcdefghijklmnop@abcde @abcdefghijklmno@abcde @abcdefghijklmno_@abcde @abcdefghijklmno5@abcde" expected = [ - "u", + "p", "@abcde", - "@abcdefghijklmnopqrst", + "@abcdefghijklmno", "@abcde", "_", "@abcde", diff --git a/nltk/tokenize/casual.py b/nltk/tokenize/casual.py index 66f038105f..0fb0b39243 100644 --- a/nltk/tokenize/casual.py +++ b/nltk/tokenize/casual.py @@ -179,8 +179,8 @@ # For stripping away handles from a tweet: HANDLES_RE = regex.compile( - r"(? Date: Thu, 16 Sep 2021 17:06:52 -0400 Subject: [PATCH 06/36] Final py3.9 deprecation (#2801) * Final py39 deprecation * acommodate multiple python versions * dont need capsys import anymore --- nltk/test/unit/test_util.py | 215 ++++++++++++++++++++++-------------- nltk/util.py | 32 +++--- 2 files changed, 153 insertions(+), 94 deletions(-) diff --git a/nltk/test/unit/test_util.py b/nltk/test/unit/test_util.py index 365452574b..109a96b31b 100644 --- a/nltk/test/unit/test_util.py +++ b/nltk/test/unit/test_util.py @@ -1,81 +1,134 @@ -""" -Unit tests for nltk.util. 
-""" - -import unittest - -from nltk.util import everygrams - - -class TestEverygrams(unittest.TestCase): - def setUp(self): - """Form test data for tests.""" - self.test_data = iter("a b c".split()) - - def test_everygrams_without_padding(self): - expected_output = [ - ("a",), - ("a", "b"), - ("a", "b", "c"), - ("b",), - ("b", "c"), - ("c",), - ] - output = everygrams(self.test_data) - self.assertCountEqual(output, expected_output) - - def test_everygrams_max_len(self): - expected_output = [ - ("a",), - ("a", "b"), - ("b",), - ("b", "c"), - ("c",), - ] - output = everygrams(self.test_data, max_len=2) - self.assertCountEqual(output, expected_output) - - def test_everygrams_min_len(self): - expected_output = [ - ("a", "b"), - ("b", "c"), - ("a", "b", "c"), - ] - output = everygrams(self.test_data, min_len=2) - self.assertCountEqual(output, expected_output) - - def test_everygrams_pad_right(self): - expected_output = [ - ("a",), - ("a", "b"), - ("a", "b", "c"), - ("b",), - ("b", "c"), - ("b", "c", None), - ("c",), - ("c", None), - ("c", None, None), - (None,), - (None, None), - (None,), - ] - output = everygrams(self.test_data, max_len=3, pad_right=True) - self.assertCountEqual(output, expected_output) - - def test_everygrams_pad_left(self): - expected_output = [ - (None,), - (None, None), - (None, None, "a"), - (None,), - (None, "a"), - (None, "a", "b"), - ("a",), - ("a", "b"), - ("a", "b", "c"), - ("b",), - ("b", "c"), - ("c",), - ] - output = everygrams(self.test_data, max_len=3, pad_left=True) - self.assertCountEqual(output, expected_output) +import pytest + +from nltk.util import everygrams, usage + + +def test_usage_with_self(capsys): + class MyClass: + def kwargs(self, a=1): + ... + + def no_args(self): + ... + + def pos_args(self, a, b): + ... + + def pos_args_and_kwargs(self, a, b, c=1): + ... + + usage(MyClass) + + captured = capsys.readouterr() + assert captured.out == ( + "MyClass supports the following operations:\n" + " - self.kwargs(a=1)\n" + " - self.no_args()\n" + " - self.pos_args(a, b)\n" + " - self.pos_args_and_kwargs(a, b, c=1)\n" + ) + + +def test_usage_with_cls(capsys): + class MyClass: + @classmethod + def clsmethod(cls): + ... + + @classmethod + def clsmethod_with_args(cls, a, b, c=1): + ... 
+ + usage(MyClass) + + captured = capsys.readouterr() + assert captured.out == ( + "MyClass supports the following operations:\n" + " - cls.clsmethod()\n" + " - cls.clsmethod_with_args(a, b, c=1)\n" + ) + + +def test_usage_on_builtin(): + # just check the func passes, since + # builtins change each python version + usage(dict) + + +@pytest.fixture +def everygram_input(): + """Form test data for tests.""" + return iter(["a", "b", "c"]) + + +def test_everygrams_without_padding(everygram_input): + expected_output = [ + ("a",), + ("a", "b"), + ("a", "b", "c"), + ("b",), + ("b", "c"), + ("c",), + ] + output = list(everygrams(everygram_input)) + assert output == expected_output + + +def test_everygrams_max_len(everygram_input): + expected_output = [ + ("a",), + ("a", "b"), + ("b",), + ("b", "c"), + ("c",), + ] + output = list(everygrams(everygram_input, max_len=2)) + assert output == expected_output + + +def test_everygrams_min_len(everygram_input): + expected_output = [ + ("a", "b"), + ("a", "b", "c"), + ("b", "c"), + ] + output = list(everygrams(everygram_input, min_len=2)) + assert output == expected_output + + +def test_everygrams_pad_right(everygram_input): + expected_output = [ + ("a",), + ("a", "b"), + ("a", "b", "c"), + ("b",), + ("b", "c"), + ("b", "c", None), + ("c",), + ("c", None), + ("c", None, None), + (None,), + (None, None), + (None,), + ] + output = list(everygrams(everygram_input, max_len=3, pad_right=True)) + assert output == expected_output + + +def test_everygrams_pad_left(everygram_input): + expected_output = [ + (None,), + (None, None), + (None, None, "a"), + (None,), + (None, "a"), + (None, "a", "b"), + ("a",), + ("a", "b"), + ("a", "b", "c"), + ("b",), + ("b", "c"), + ("c",), + ] + output = list(everygrams(everygram_input, max_len=3, pad_left=True)) + assert output == expected_output diff --git a/nltk/util.py b/nltk/util.py index c25a8fb339..383342a4e5 100644 --- a/nltk/util.py +++ b/nltk/util.py @@ -37,32 +37,38 @@ ###################################################################### -def usage(obj, selfname="self"): +def usage(obj): str(obj) # In case it's lazy, this will load it. 
if not isinstance(obj, type): obj = obj.__class__ - print("%s supports the following operations:" % obj.__name__) + print(f"{obj.__name__} supports the following operations:") for (name, method) in sorted(pydoc.allmethods(obj).items()): if name.startswith("_"): continue if getattr(method, "__deprecated__", False): continue - getargspec = inspect.getfullargspec - args, varargs, varkw, defaults = getargspec(method)[:4] - if ( - args - and args[0] == "self" - and (defaults is None or len(args) > len(defaults)) - ): - args = args[1:] - name = f"{selfname}.{name}" - argspec = inspect.formatargspec(args, varargs, varkw, defaults) + try: + sig = str(inspect.signature(method)) + except ValueError as e: + # builtins sometimes don't support introspection + if "builtin" in str(e): + continue + else: + raise + + args = sig.lstrip("(").rstrip(")").split(", ") + meth = inspect.getattr_static(obj, name) + if isinstance(meth, (classmethod, staticmethod)): + name = f"cls.{name}" + elif args and args[0] == "self": + name = f"self.{name}" + args.pop(0) print( textwrap.fill( - f"{name}{argspec}", + f"{name}({', '.join(args)})", initial_indent=" - ", subsequent_indent=" " * (len(name) + 5), ) From 49e5d6ef9fffb9d256c70454503486affbaf0c88 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Fri, 17 Sep 2021 06:46:59 +0930 Subject: [PATCH 07/36] Update with recent commits --- ChangeLog | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ChangeLog b/ChangeLog index d587aa3974..81052939b2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -12,6 +12,8 @@ Version 3.6.3 2021-08-?? * Fixed AttributeError for Arabic ARLSTem2 stemmer * Many fixes and improvements to lm language model package * Fix bug in nltk.metrics.aline, C_skip = -10 +* Improvements to TweetTokenizer +* Optional show arg for FreqDist.plot, ConditionalFreqDist.plot Thanks to the following contributors to 3.6.3 Tom Aarsen, Michael Wayne Goodman, Michał Górny, Maarten ter Huurne, Manu Joseph, From 53dbaa5591003f6764a3d69834e92bc83e3a754c Mon Sep 17 00:00:00 2001 From: mohaned mashaly <30902228+12mohaned@users.noreply.github.com> Date: Thu, 16 Sep 2021 23:40:50 +0200 Subject: [PATCH 08/36] refactor: refactor sentiment analyzer by removing dead and slow perfomance code (#2804) * refactor: refactor sentiment analyzer by removing dead and slow perfomance code * refactor: refactor sentiment analyzer by removing dead and slow perfomance code * fix: add not to false boolean values * Refactor: Add sentiment keyword in all_words Co-authored-by: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> --- nltk/sentiment/sentiment_analyzer.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/nltk/sentiment/sentiment_analyzer.py b/nltk/sentiment/sentiment_analyzer.py index 35bc810d7c..1660e2f841 100644 --- a/nltk/sentiment/sentiment_analyzer.py +++ b/nltk/sentiment/sentiment_analyzer.py @@ -47,10 +47,10 @@ def all_words(self, documents, labeled=None): all_words = [] if labeled is None: labeled = documents and isinstance(documents[0], tuple) - if labeled == True: - for words, sentiment in documents: + if labeled: + for words, _sentiment in documents: all_words.extend(words) - elif labeled == False: + elif not labeled: for words in documents: all_words.extend(words) return all_words @@ -218,7 +218,7 @@ def evaluate( classifier = self.classifier print(f"Evaluating {type(classifier).__name__} results...") metrics_results = {} - if accuracy == True: + if accuracy: accuracy_score = eval_accuracy(classifier, test_set) metrics_results["Accuracy"] = 
accuracy_score @@ -232,22 +232,22 @@ def evaluate( test_results[observed].add(i) for label in labels: - if precision == True: + if precision: precision_score = eval_precision( gold_results[label], test_results[label] ) metrics_results[f"Precision [{label}]"] = precision_score - if recall == True: + if recall: recall_score = eval_recall(gold_results[label], test_results[label]) metrics_results[f"Recall [{label}]"] = recall_score - if f_measure == True: + if f_measure: f_measure_score = eval_f_measure( gold_results[label], test_results[label] ) metrics_results[f"F-measure [{label}]"] = f_measure_score # Print evaluation results (in alphabetical order) - if verbose == True: + if verbose: for result in sorted(metrics_results): print(f"{result}: {metrics_results[result]}") From 77b59458bf2f13cf08d3b265dcebaed653016874 Mon Sep 17 00:00:00 2001 From: avena554 Date: Sat, 18 Sep 2021 07:59:35 -0400 Subject: [PATCH 09/36] Edit_distance now computes the actual Damerau-Levenshtein edit-distance (#2736) * Edit_distance now computes the actual Damerau-Levenshtein edit-distance * adapted edit_distance_align to the changes in _edit_distance_step * +couple unit test for the levensthein edit distance with vs without transpositions * pre commit fails when pushing * commiting to run pre-commit hooks * fixed edit distance unit tests and edit distance with transpositions * Added and pytest-ified edit_distance tests Co-authored-by: Tom Aarsen --- nltk/metrics/distance.py | 30 ++++++-- nltk/test/unit/test_distance.py | 123 ++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 4 deletions(-) create mode 100644 nltk/test/unit/test_distance.py diff --git a/nltk/metrics/distance.py b/nltk/metrics/distance.py index 85912a3e10..c0da4a1753 100644 --- a/nltk/metrics/distance.py +++ b/nltk/metrics/distance.py @@ -34,7 +34,13 @@ def _edit_dist_init(len1, len2): return lev -def _edit_dist_step(lev, i, j, s1, s2, substitution_cost=1, transpositions=False): +def _last_left_t_init(sigma): + return {c: 0 for c in sigma} + + +def _edit_dist_step( + lev, i, j, s1, s2, last_left, last_right, substitution_cost=1, transpositions=False +): c1 = s1[i - 1] c2 = s2[j - 1] @@ -47,9 +53,8 @@ def _edit_dist_step(lev, i, j, s1, s2, substitution_cost=1, transpositions=False # transposition d = c + 1 # never picked by default - if transpositions and i > 1 and j > 1: - if s1[i - 2] == c2 and s2[j - 2] == c1: - d = lev[i - 2][j - 2] + 1 + if transpositions and last_left > 0 and last_right > 0: + d = lev[last_left - 1][last_right - 1] + i - last_left + j - last_right - 1 # pick the cheapest lev[i][j] = min(a, b, c, d) @@ -85,18 +90,33 @@ def edit_distance(s1, s2, substitution_cost=1, transpositions=False): len2 = len(s2) lev = _edit_dist_init(len1 + 1, len2 + 1) + # retrieve alphabet + sigma = set() + sigma.update(s1) + sigma.update(s2) + + # set up table to remember positions of last seen occurrence in s1 + last_left_t = _last_left_t_init(sigma) + # iterate over the array for i in range(len1): + last_right = 0 for j in range(len2): + last_left = last_left_t[s2[j]] _edit_dist_step( lev, i + 1, j + 1, s1, s2, + last_left, + last_right, substitution_cost=substitution_cost, transpositions=transpositions, ) + if s1[i] == s2[j]: + last_right = j + 1 + last_left_t[s1[i]] = i + 1 return lev[len1][len2] @@ -162,6 +182,8 @@ def edit_distance_align(s1, s2, substitution_cost=1): j + 1, s1, s2, + 0, + 0, substitution_cost=substitution_cost, transpositions=False, ) diff --git a/nltk/test/unit/test_distance.py b/nltk/test/unit/test_distance.py 
new file mode 100644 index 0000000000..bea1b542c2 --- /dev/null +++ b/nltk/test/unit/test_distance.py @@ -0,0 +1,123 @@ +from typing import Tuple + +import pytest + +from nltk.metrics.distance import edit_distance + + +class TestEditDistance: + @pytest.mark.parametrize( + "left,right,substitution_cost,expecteds", + [ + # Allowing transpositions reduces the number of edits required. + # with transpositions: + # e.g. "abc" -T-> "cba" -D-> "ca": 2 steps + # + # without transpositions: + # e.g. "abc" -D-> "ab" -D-> "a" -I-> "ca": 3 steps + ("abc", "ca", 1, (2, 3)), + ("abc", "ca", 5, (2, 3)), # Doesn't *require* substitutions + # Note, a substition_cost of higher than 2 doesn't make much + # sense, as a deletion + insertion is identical, and always + # costs 2. + # + # + # Transpositions don't always reduce the number of edits required: + # with or without transpositions: + # e.g. "wants" -D-> "wats" -D-> "was" -I-> "wasp": 3 steps + ("wants", "wasp", 1, (3, 3)), + ("wants", "wasp", 5, (3, 3)), # Doesn't *require* substitutions + # + # + # Ought to have the same results with and without transpositions + # with or without transpositions: + # e.g. "rain" -S-> "sain" -S-> "shin" -I-> "shine": 3 steps + # (but cost 5 if substitution_cost=2) + ("rain", "shine", 1, (3, 3)), + ("rain", "shine", 2, (5, 5)), # Does *require* substitutions + # + # + # Several potentially interesting typos + # with transpositions: + # e.g. "acbdef" -T-> "abcdef": 1 step + # + # without transpositions: + # e.g. "acbdef" -D-> "abdef" -I-> "abcdef": 2 steps + ("acbdef", "abcdef", 1, (1, 2)), + ("acbdef", "abcdef", 2, (1, 2)), # Doesn't *require* substitutions + # + # + # with transpositions: + # e.g. "lnaguaeg" -T-> "languaeg" -T-> "language": 2 steps + # + # without transpositions: + # e.g. "lnaguaeg" -D-> "laguaeg" -I-> "languaeg" -D-> "languag" -I-> "language": 4 steps + ("lnaguaeg", "language", 1, (2, 4)), + ("lnaguaeg", "language", 2, (2, 4)), # Doesn't *require* substitutions + # + # + # with transpositions: + # e.g. "lnaugage" -T-> "lanugage" -T-> "language": 2 steps + # + # without transpositions: + # e.g. "lnaugage" -S-> "lnangage" -D-> "langage" -I-> "language": 3 steps + # (but one substitution, so a cost of 4 if substition_cost = 2) + ("lnaugage", "language", 1, (2, 3)), + ("lnaugage", "language", 2, (2, 4)), + # Does *require* substitutions if no transpositions + # + # + # with transpositions: + # e.g. "lngauage" -T-> "lnaguage" -T-> "language": 2 steps + # without transpositions: + # e.g. "lngauage" -I-> "lanaguage" -D-> "language": 2 steps + ("lngauage", "language", 1, (2, 2)), + ("lngauage", "language", 2, (2, 2)), # Doesn't *require* substitutions + # + # + # with or without transpositions: + # e.g. "wants" -S-> "sants" -S-> "swnts" -S-> "swits" -S-> "swims" -D-> "swim": 5 steps + # + # with substitution_cost=2 and transpositions: + # e.g. "wants" -T-> "santw" -D-> "sntw" -D-> "stw" -D-> "sw" + # -I-> "swi" -I-> "swim": 6 steps + # + # with substitution_cost=2 and no transpositions: + # e.g. "wants" -I-> "swants" -D-> "swant" -D-> "swan" -D-> "swa" -D-> "sw" + # -I-> "swi" -I-> "swim": 7 steps + ("wants", "swim", 1, (5, 5)), + ("wants", "swim", 2, (6, 7)), + # + # + # with or without transpositions: + # e.g. 
"kitten" -S-> "sitten" -s-> "sittin" -I-> "sitting": 3 steps + # (but cost 5 if substitution_cost=2) + ("kitten", "sitting", 1, (3, 3)), + ("kitten", "sitting", 2, (5, 5)), + ], + ) + def test_with_transpositions( + self, left: str, right: str, substitution_cost: int, expecteds: Tuple[int, int] + ): + """Test `edit_distance` between two strings, given some `substitution_cost`, + and whether transpositions are allowed. + + Args: + left (str): First input string to `edit_distance`. + right (str): Second input string to `edit_distance`. + substitution_cost (int): The cost of a substitution action in `edit_distance`. + expecteds (Tuple[int, int]): A tuple of expected outputs, such that `expecteds[0]` is + the expected output with `transpositions=True`, and `expecteds[1]` is + the expected output with `transpositions=False`. + """ + # Test the input strings in both orderings + for s1, s2 in ((left, right), (right, left)): + # zip with [True, False] to get the transpositions value + for expected, transpositions in zip(expecteds, [True, False]): + predicted = edit_distance( + s1, + s2, + substitution_cost=substitution_cost, + transpositions=transpositions, + ) + assert predicted == expected From 656a6677e6778cc538c874dcef7cfe60203a9fcd Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Sat, 18 Sep 2021 21:34:30 +0930 Subject: [PATCH 10/36] Update ChangeLog Last minute additions... --- ChangeLog | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 81052939b2..923eadee6e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -Version 3.6.3 2021-08-?? +Version 3.6.3 2021-09-19 * Dropped support for Python 3.5 * Run CI tests on Windows, too * Moved from Travis CI to GitHub Actions @@ -14,11 +14,12 @@ Version 3.6.3 2021-08-?? 
* Fix bug in nltk.metrics.aline, C_skip = -10 * Improvements to TweetTokenizer * Optional show arg for FreqDist.plot, ConditionalFreqDist.plot +* edit_distance now computes Damerau-Levenshtein edit-distance Thanks to the following contributors to 3.6.3 -Tom Aarsen, Michael Wayne Goodman, Michał Górny, Maarten ter Huurne, Manu Joseph, -Eric Kafe, Ilia Kurenkov, Daniel Loney, Rob Malouf, purificant, Danny Sepler, -Anthony Sottile +Tom Aarsen, Abhijnan Bajpai, Michael Wayne Goodman, Michał Górny, Maarten ter Huurne, +Manu Joseph, Eric Kafe, Ilia Kurenkov, Daniel Loney, Rob Malouf, Mohaned Mashaly, +purificant, Danny Sepler, Anthony Sottile Version 3.6.2 2021-04-20 * move test code to nltk/test From c566ef29570fda1c844a7e7f56754654d84203c9 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Sat, 18 Sep 2021 22:33:11 +0930 Subject: [PATCH 11/36] Update news.rst --- web/news.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/web/news.rst b/web/news.rst index 4c778efb38..80e730c5a4 100644 --- a/web/news.rst +++ b/web/news.rst @@ -4,6 +4,15 @@ NLTK News 2021 ---- +NLTK 3.6.3 release: September 2021 + Drop support for Python 3.5, + added pre-commit hooks (isort, pyupgrade, black), + improvements to WordNet visualization, RIBES score, edit_distance, + METEOR score, Punkt, language model package, TweetTokenizer, + code and comment cleanups, + CI tests now also run on Windows, + moved from Travis CI to GitHub Actions + NLTK 3.6.2 release: April 2021 Minor enhancements From 1d87106312be12b8510890be42c329c00e1b425f Mon Sep 17 00:00:00 2001 From: purificant Date: Sat, 18 Sep 2021 17:35:23 +0100 Subject: [PATCH 12/36] trim whitespace --- web/news.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/news.rst b/web/news.rst index 80e730c5a4..00734ba0b6 100644 --- a/web/news.rst +++ b/web/news.rst @@ -5,7 +5,7 @@ NLTK News ---- NLTK 3.6.3 release: September 2021 - Drop support for Python 3.5, + Drop support for Python 3.5, added pre-commit hooks (isort, pyupgrade, black), improvements to WordNet visualization, RIBES score, edit_distance, METEOR score, Punkt, language model package, TweetTokenizer, From fa537c49b4ae92141d2dcd4aefe7085940bb6922 Mon Sep 17 00:00:00 2001 From: purificant Date: Sat, 18 Sep 2021 18:47:50 +0100 Subject: [PATCH 13/36] replace travis badge with github actions badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 55a173c2f1..914ebfdfcb 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Natural Language Toolkit (NLTK) [![PyPI](https://img.shields.io/pypi/v/nltk.svg)](https://pypi.python.org/pypi/nltk) -[![Travis](https://travis-ci.org/nltk/nltk.svg?branch=develop)](https://travis-ci.org/nltk/nltk) +![CI](https://github.com/nltk/nltk/actions/workflows/ci.yaml/badge.svg?branch=develop) NLTK -- the Natural Language Toolkit -- is a suite of open source Python modules, data sets, and tutorials supporting research and development in Natural From ef57542072975b9261c4a0428cd0c0d29d84833f Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Thu, 23 Sep 2021 13:25:06 +0200 Subject: [PATCH 14/36] More typos in comments and outputs (#2814) --- CONTRIBUTING.md | 2 +- ChangeLog | 4 ++-- jenkins.sh | 2 +- nltk/corpus/reader/wordnet.py | 2 +- nltk/featstruct.py | 2 +- nltk/parse/util.py | 2 +- nltk/test/grammartestsuites.doctest | 2 +- nltk/test/relextract.doctest | 2 +- tools/find_deprecated.py | 2 +- tools/travis/third-party.sh | 2 +- 10 
files changed, 11 insertions(+), 11 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ed2214b7ef..36091b3359 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -94,7 +94,7 @@ Summary of our git branching model: - Never use `git add .`: it can add unwanted files; - Avoid using `git commit -a` unless you know what you're doing; - Check every change with `git diff` before adding them to the index (stage - area) and with `git diff --cached` before commiting; + area) and with `git diff --cached` before committing; - Make sure you add your name to our [list of contributors](https://github.com/nltk/nltk/blob/develop/AUTHORS.md); - If you have push access to the main repository, please do not commit directly to `develop`: your access should be used only to accept pull requests; if you diff --git a/ChangeLog b/ChangeLog index 923eadee6e..c4a39bc542 100644 --- a/ChangeLog +++ b/ChangeLog @@ -755,7 +755,7 @@ NLTK: Data: * Corrected identifiers in Dependency Treebank corpus * Basque and Catalan Dependency Treebanks (CoNLL 2007) -* PE08 Parser Evalution data +* PE08 Parser Evaluation data * New models for POS tagger and named-entity tagger Book: @@ -1068,7 +1068,7 @@ Code: - changed corpus.util to use the 'rb' flag for opening files, to fix problems reading corpora under MSWindows - updated stale examples in engineering.txt -- extended feature stucture interface to permit chained features, e.g. fs['F','G'] +- extended feature structure interface to permit chained features, e.g. fs['F','G'] - further misc improvements to test code plus some bugfixes Tutorials: - rewritten opening section of tagging chapter diff --git a/jenkins.sh b/jenkins.sh index 574a9d133b..3c9bdbadc1 100755 --- a/jenkins.sh +++ b/jenkins.sh @@ -24,7 +24,7 @@ if [[ ! -d $senna_folder_name ]]; then rm ${senna_file_name} fi -# Setup the Enviroment variable +# Setup the Environment variable export SENNA=$(pwd)'/senna' popd diff --git a/nltk/corpus/reader/wordnet.py b/nltk/corpus/reader/wordnet.py index 34e2196b0b..493892f301 100644 --- a/nltk/corpus/reader/wordnet.py +++ b/nltk/corpus/reader/wordnet.py @@ -1136,7 +1136,7 @@ def __init__(self, root, omw_reader): # Map from lemma -> pos -> synset_index -> offset self._lemma_pos_offset_map = defaultdict(dict) - # A cache so we don't have to reconstuct synsets + # A cache so we don't have to reconstruct synsets # Map from pos -> offset -> synset self._synset_offset_cache = defaultdict(dict) diff --git a/nltk/featstruct.py b/nltk/featstruct.py index 080eeba6fc..a7001eb9aa 100644 --- a/nltk/featstruct.py +++ b/nltk/featstruct.py @@ -1858,7 +1858,7 @@ def _default_fs_class(obj): class SubstituteBindingsSequence(SubstituteBindingsI): """ - A mixin class for sequence clases that distributes variables() and + A mixin class for sequence classes that distributes variables() and substitute_bindings() over the object's elements. """ diff --git a/nltk/parse/util.py b/nltk/parse/util.py index 3338ccb070..b730556e84 100644 --- a/nltk/parse/util.py +++ b/nltk/parse/util.py @@ -162,7 +162,7 @@ def run(self, show_trees=False): Sentences in the test suite are divided into two classes: - grammatical (``accept``) and - ungrammatical (``reject``). - If a sentence should parse accordng to the grammar, the value of + If a sentence should parse according to the grammar, the value of ``trees`` will be a non-empty list. If a sentence should be rejected according to the grammar, then the value of ``trees`` will be None. 
""" diff --git a/nltk/test/grammartestsuites.doctest b/nltk/test/grammartestsuites.doctest index 48d06992da..1ad162c16d 100644 --- a/nltk/test/grammartestsuites.doctest +++ b/nltk/test/grammartestsuites.doctest @@ -10,7 +10,7 @@ Sentences in the test suite are divided into two classes: - grammatical (*accept*) and - ungrammatical (*reject*). -If a sentence should parse accordng to the grammar, the value of +If a sentence should parse according to the grammar, the value of ``trees`` will be a non-empty list. If a sentence should be rejected according to the grammar, then the value of ``trees`` will be ``None``. diff --git a/nltk/test/relextract.doctest b/nltk/test/relextract.doctest index 4e6a0a32cf..d13a9f045e 100644 --- a/nltk/test/relextract.doctest +++ b/nltk/test/relextract.doctest @@ -176,7 +176,7 @@ signature . [ORG: 'Open Text'] ', based in' [LOC: 'Waterloo'] ... -The next example illustrates a case where the patter is a disjunction +The next example illustrates a case where the pattern is a disjunction of roles that a PERSON can occupy in an ORGANIZATION. >>> roles = r""" diff --git a/tools/find_deprecated.py b/tools/find_deprecated.py index 822eb63d23..94f1332eab 100755 --- a/tools/find_deprecated.py +++ b/tools/find_deprecated.py @@ -232,7 +232,7 @@ def main(): print("Unable to import nltk -- check your PYTHONPATH.") sys.exit(-1) - print("Finding definitions of deprecated funtions & classes in nltk...") + print("Finding definitions of deprecated functions & classes in nltk...") find_deprecated_defs(nltk.__path__[0]) print("Looking for possible uses of deprecated funcs & classes...") diff --git a/tools/travis/third-party.sh b/tools/travis/third-party.sh index 9e09d757f2..57971b3724 100644 --- a/tools/travis/third-party.sh +++ b/tools/travis/third-party.sh @@ -51,7 +51,7 @@ if [[ ! 
-d $senna_folder_name ]]; then rm ${senna_file_name} fi -# Setup the Enviroment variable +# Setup the Environment variable export CLASSPATH=$(pwd)"/${stanford_corenlp_package_name}" export CLASSPATH=${CLASSPATH}:$(pwd)"/${stanford_parser_package_name}" export CLASSPATH=${CLASSPATH}:$(pwd)"/${stanford_tagger_package_name}" From 03e1ebd9f720e4263457d4e1d5886f4d7aba0ef5 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Thu, 23 Sep 2021 23:17:42 +0200 Subject: [PATCH 15/36] Refactored CISTEM Stemmer for German (#2815) - Removed code duplication - Modernised variable typing in method signatures - Updated the documentation --- nltk/stem/cistem.py | 109 ++++++++++++++++++++------------------------ 1 file changed, 50 insertions(+), 59 deletions(-) diff --git a/nltk/stem/cistem.py b/nltk/stem/cistem.py index 3bd13354f8..2966e8e589 100644 --- a/nltk/stem/cistem.py +++ b/nltk/stem/cistem.py @@ -1,12 +1,14 @@ # Natural Language Toolkit: CISTEM Stemmer for German # Copyright (C) 2001-2021 NLTK Project # Author: Leonie Weissweiler +# Tom Aarsen <> (modifications) # Algorithm: Leonie Weissweiler # Alexander Fraser # URL: # For license information, see LICENSE.TXT import re +from typing import Tuple from nltk.stem.api import StemmerI @@ -48,11 +50,11 @@ class Cistem(StemmerI): strip_esn = re.compile(r"[esn]$") repl_xx_back = re.compile(r"(.)\*") - def __init__(self, case_insensitive=False): + def __init__(self, case_insensitive: bool = False): self._case_insensitive = case_insensitive @staticmethod - def replace_to(word): + def replace_to(word: str) -> str: word = word.replace("sch", "$") word = word.replace("ei", "%") word = word.replace("ie", "&") @@ -61,7 +63,7 @@ def replace_to(word): return word @staticmethod - def replace_back(word): + def replace_back(word: str) -> str: word = Cistem.repl_xx_back.sub(r"\1\1", word) word = word.replace("%", "ei") word = word.replace("&", "ie") @@ -69,14 +71,13 @@ def replace_back(word): return word - def stem(self, word): - """ - This method takes the word to be stemmed and returns the stemmed word. + def stem(self, word: str) -> str: + """Stems the input word. - :param word: the word that is to be stemmed - :type word: unicode - :return word: the stemmed word - :rtype: unicode + :param word: The word that is to be stemmed. + :type word: str + :return: The stemmed word. + :rtype: str >>> from nltk.stem.cistem import Cistem >>> stemmer = Cistem() @@ -109,34 +110,10 @@ def stem(self, word): word = word.replace("ß", "ss") word = Cistem.strip_ge.sub(r"\1", word) - word = Cistem.replace_to(word) - - while len(word) > 3: - if len(word) > 5: - (word, success) = Cistem.strip_emr.subn("", word) - if success != 0: - continue - - (word, success) = Cistem.strip_nd.subn("", word) - if success != 0: - continue - - if not upper or self._case_insensitive: - (word, success) = Cistem.strip_t.subn("", word) - if success != 0: - continue - - (word, success) = Cistem.strip_esn.subn("", word) - if success != 0: - continue - else: - break - word = Cistem.replace_back(word) + return self._segment_inner(word, upper)[0] - return word - - def segment(self, word): + def segment(self, word: str) -> Tuple[str, str]: """ This method works very similarly to stem (:func:'cistem.stem'). 
The difference is that in addition to returning the stem, it also returns the rest that was removed at @@ -144,17 +121,15 @@ def segment(self, word): can be concatenated to form the original word, all subsitutions that altered the stem in any other way than by removing letters at the end were left out. - :param word: the word that is to be stemmed - :type word: unicode - :return word: the stemmed word - :rtype: unicode - :return word: the removed suffix - :rtype: unicode + :param word: The word that is to be stemmed. + :type word: str + :return: A tuple of the stemmed word and the removed suffix. + :rtype: Tuple[str, str] >>> from nltk.stem.cistem import Cistem >>> stemmer = Cistem() >>> s1 = "Speicherbehältern" - >>> print("('" + stemmer.segment(s1)[0] + "', '" + stemmer.segment(s1)[1] + "')") + >>> stemmer.segment(s1) ('speicherbehält', 'ern') >>> s2 = "Grenzpostens" >>> stemmer.segment(s2) @@ -163,56 +138,72 @@ def segment(self, word): >>> stemmer.segment(s3) ('ausgefeilt', 'ere') >>> stemmer = Cistem(True) - >>> print("('" + stemmer.segment(s1)[0] + "', '" + stemmer.segment(s1)[1] + "')") + >>> stemmer.segment(s1) ('speicherbehäl', 'tern') >>> stemmer.segment(s2) ('grenzpo', 'stens') >>> stemmer.segment(s3) ('ausgefeil', 'tere') """ - - rest_length = 0 - if len(word) == 0: return ("", "") upper = word[0].isupper() word = word.lower() - original = word[:] + return self._segment_inner(word, upper) + + def _segment_inner(self, word: str, upper: bool): + """Inner method for iteratively applying the code stemming regexes. + This method receives a pre-processed variant of the word to be stemmed, + or the word to be segmented, and returns a tuple of the word and the + removed suffix. + + :param word: A pre-processed variant of the word that is to be stemmed. + :type word: str + :param upper: Whether the original word started with a capital letter. + :type upper: bool + :return: A tuple of the stemmed word and the removed suffix. 
+ :rtype: Tuple[str, str] + """ + + rest_length = 0 + word_copy = word[:] + # Pre-processing before applying the substitution patterns word = Cistem.replace_to(word) + rest = "" + # Apply the substitution patterns while len(word) > 3: if len(word) > 5: - (word, success) = Cistem.strip_emr.subn("", word) - if success != 0: + word, n = Cistem.strip_emr.subn("", word) + if n != 0: rest_length += 2 continue - (word, success) = Cistem.strip_nd.subn("", word) - if success != 0: + word, n = Cistem.strip_nd.subn("", word) + if n != 0: rest_length += 2 continue if not upper or self._case_insensitive: - (word, success) = Cistem.strip_t.subn("", word) - if success != 0: + word, n = Cistem.strip_t.subn("", word) + if n != 0: rest_length += 1 continue - (word, success) = Cistem.strip_esn.subn("", word) - if success != 0: + word, n = Cistem.strip_esn.subn("", word) + if n != 0: rest_length += 1 continue else: break + # Post-processing after applying the substitution patterns word = Cistem.replace_back(word) if rest_length: - rest = original[-rest_length:] - else: - rest = "" + rest = word_copy[-rest_length:] return (word, rest) From 2b1528f952d5c497e3119234b0154fa25fa044e0 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Fri, 24 Sep 2021 06:52:17 +0930 Subject: [PATCH 16/36] NLTK Team as author and maintainer --- setup.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 48e2a319ab..061ae8899e 100644 --- a/setup.py +++ b/setup.py @@ -3,9 +3,7 @@ # Setup script for the Natural Language Toolkit # # Copyright (C) 2001-2021 NLTK Project -# Author: Steven Bird -# Edward Loper -# Ewan Klein +# Author: NLTK Team # URL: # For license information, see LICENSE.TXT @@ -86,10 +84,10 @@ "natural language", "text analytics", ], - maintainer="Steven Bird", - maintainer_email="stevenbird1@gmail.com", - author="Steven Bird", - author_email="stevenbird1@gmail.com", + maintainer="NLTK Team", + maintainer_email="nltk.team@gmail.com", + author="NLTK Team", + author_email="nltk.team@gmail.com", classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", From 8c38a2a789e3c26bcb1297ff98bc304812c5d4c5 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 24 Sep 2021 09:26:04 +0200 Subject: [PATCH 17/36] NLTK Team as author and maintainer - in __init__.py --- nltk/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nltk/__init__.py b/nltk/__init__.py index b42c548992..b06598edca 100644 --- a/nltk/__init__.py +++ b/nltk/__init__.py @@ -70,8 +70,8 @@ __url__ = "http://nltk.org/" # Maintainer, contributors, etc. -__maintainer__ = "Steven Bird" -__maintainer_email__ = "stevenbird1@gmail.com" +__maintainer__ = "NLTK Team" +__maintainer_email__ = "nltk.team@gmail.com" __author__ = __maintainer__ __author_email__ = __maintainer_email__ From f4ce2cef06125dd406bd57ab08b2fdb57ebe1531 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Fri, 24 Sep 2021 21:52:07 +0930 Subject: [PATCH 18/36] NLTK authorises nltk.team@gmail.com as its security contact. 
Resolves #2811 --- SECURITY.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000..36eaa01ae5 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,6 @@ +# Security Policy + +## Reporting a Vulnerability + +Please report security issues to `nltk.team@gmail.com` + From 5ecfe44571fb4dec77a20a2fc127508948723497 Mon Sep 17 00:00:00 2001 From: mohaned mashaly <30902228+12mohaned@users.noreply.github.com> Date: Fri, 24 Sep 2021 14:31:27 +0200 Subject: [PATCH 19/36] feat: enable phone number regex to recognize phone numbers (#2798) * feat: enable phone number regex to recognize phone numbers * add pre-commit support * remove duplicate regular expression * remove overriden line * Refactored TweetTokenizer, incl. caching regex compilations and adding match_phone_numbers * Removed some test input duplication * Added several TweetTokenizer tests with phone numbers * Set match_phone_numbers=True as the default for TweetTokenizer Co-authored-by: Tom Aarsen --- nltk/test/unit/test_tokenize.py | 207 ++++++++++++++++++++++++++++++++ nltk/tokenize/casual.py | 138 ++++++++++++++++----- 2 files changed, 314 insertions(+), 31 deletions(-) diff --git a/nltk/test/unit/test_tokenize.py b/nltk/test/unit/test_tokenize.py index 2efd793ef9..524aee2b5d 100644 --- a/nltk/test/unit/test_tokenize.py +++ b/nltk/test/unit/test_tokenize.py @@ -2,6 +2,8 @@ Unit tests for nltk.tokenize. See also nltk/test/tokenize.doctest """ +from typing import List, Tuple + import pytest from nltk.tokenize import ( @@ -54,6 +56,211 @@ def test_tweet_tokenizer(self): ] assert tokens == expected + @pytest.mark.parametrize( + "test_input, expecteds", + [ + ( + "My text 0106404243030 is great text", + ( + ["My", "text", "01064042430", "30", "is", "great", "text"], + ["My", "text", "0106404243030", "is", "great", "text"], + ), + ), + ( + "My ticket id is 1234543124123", + ( + ["My", "ticket", "id", "is", "12345431241", "23"], + ["My", "ticket", "id", "is", "1234543124123"], + ), + ), + ( + "@remy: This is waaaaayyyy too much for you!!!!!! 01064042430", + ( + [ + ":", + "This", + "is", + "waaayyy", + "too", + "much", + "for", + "you", + "!", + "!", + "!", + "01064042430", + ], + [ + ":", + "This", + "is", + "waaayyy", + "too", + "much", + "for", + "you", + "!", + "!", + "!", + "01064042430", + ], + ), + ), + # Further tests from https://github.com/nltk/nltk/pull/2798#issuecomment-922533085, + # showing the TweetTokenizer performance for `match_phone_numbers=True` and + # `match_phone_numbers=False`. 
+ ( + # Some phone numbers are always tokenized, even with `match_phone_numbers=`False` + "My number is 06-46124080, except it's not.", + ( + [ + "My", + "number", + "is", + "06-46124080", + ",", + "except", + "it's", + "not", + ".", + ], + [ + "My", + "number", + "is", + "06-46124080", + ",", + "except", + "it's", + "not", + ".", + ], + ), + ), + ( + # Phone number here is only tokenized correctly if `match_phone_numbers=True` + "My number is 601-984-4813, except it's not.", + ( + [ + "My", + "number", + "is", + "601-984-4813", + ",", + "except", + "it's", + "not", + ".", + ], + [ + "My", + "number", + "is", + "601-984-", + "4813", + ",", + "except", + "it's", + "not", + ".", + ], + ), + ), + ( + # Phone number here is only tokenized correctly if `match_phone_numbers=True` + "My number is (393) 928 -3010, except it's not.", + ( + [ + "My", + "number", + "is", + "(393) 928 -3010", + ",", + "except", + "it's", + "not", + ".", + ], + [ + "My", + "number", + "is", + "(", + "393", + ")", + "928", + "-", + "3010", + ",", + "except", + "it's", + "not", + ".", + ], + ), + ), + ( + # A long number is tokenized correctly only if `match_phone_numbers=False` + "The product identification number is 48103284512.", + ( + [ + "The", + "product", + "identification", + "number", + "is", + "4810328451", + "2", + ".", + ], + [ + "The", + "product", + "identification", + "number", + "is", + "48103284512", + ".", + ], + ), + ), + ( + # `match_phone_numbers=True` can have some unforeseen + "My favourite substraction is 240 - 1353.", + ( + ["My", "favourite", "substraction", "is", "240 - 1353", "."], + ["My", "favourite", "substraction", "is", "240", "-", "1353", "."], + ), + ), + ], + ) + def test_tweet_tokenizer_expanded( + self, test_input: str, expecteds: Tuple[List[str], List[str]] + ): + """ + Test `match_phone_numbers` in TweetTokenizer. + + Note that TweetTokenizer is also passed the following for these tests: + * strip_handles=True + * reduce_len=True + + :param test_input: The input string to tokenize using TweetTokenizer. + :type test_input: str + :param expecteds: A 2-tuple of tokenized sentences. The first of the two + tokenized is the expected output of tokenization with `match_phone_numbers=True`. + The second of the two tokenized lists is the expected output of tokenization + with `match_phone_numbers=False`. + :type expecteds: Tuple[List[str], List[str]] + """ + for match_phone_numbers, expected in zip([True, False], expecteds): + tokenizer = TweetTokenizer( + strip_handles=True, + reduce_len=True, + match_phone_numbers=match_phone_numbers, + ) + predicted = tokenizer.tokenize(test_input) + assert predicted == expected + def test_sonority_sequencing_syllable_tokenizer(self): """ Test SyllableTokenizer tokenizer. diff --git a/nltk/tokenize/casual.py b/nltk/tokenize/casual.py index 0fb0b39243..f7b256713e 100644 --- a/nltk/tokenize/casual.py +++ b/nltk/tokenize/casual.py @@ -5,6 +5,7 @@ # Author: Christopher Potts # Ewan Klein (modifications) # Pierpaolo Pantone <> (modifications) +# Tom Aarsen <> (modifications) # URL: # For license information, see LICENSE.TXT # @@ -14,27 +15,36 @@ Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domains and tasks. The basic logic is this: -1. The tuple regex_strings defines a list of regular expression +1. The tuple REGEXPS defines a list of regular expression strings. -2. The regex_strings strings are put, in order, into a compiled - regular expression object called word_re. +2. 
The REGEXPS strings are put, in order, into a compiled + regular expression object called WORD_RE, under the TweetTokenizer + class. -3. The tokenization is done by word_re.findall(s), where s is the +3. The tokenization is done by WORD_RE.findall(s), where s is the user-supplied string, inside the tokenize() method of the class - Tokenizer. - -4. When instantiating Tokenizer objects, there is a single option: - preserve_case. By default, it is set to True. If it is set to - False, then the tokenizer will downcase everything except for - emoticons. - + TweetTokenizer. + +4. When instantiating Tokenizer objects, there are several options: + * preserve_case. By default, it is set to True. If it is set to + False, then the tokenizer will downcase everything except for + emoticons. + * reduce_len. By default, it is set to False. It specifies whether + to replace repeated character sequences of length 3 or greater + with sequences of length 3. + * strip_handles. By default, it is set to False. It specifies + whether to remove Twitter handles of text used in the + `tokenize` method. + * match_phone_numbers. By default, it is set to True. It indicates + whether the `tokenize` method should look for phone numbers. """ ###################################################################### import html +from typing import List import regex # https://github.com/nltk/nltk/issues/2409 @@ -115,11 +125,8 @@ ) """ -# The components of the tokenizer: -REGEXPS = ( - URLS, - # Phone numbers: - r""" +# Regex for recognizing phone numbers: +PHONE_REGEX = r""" (?: (?: # (international) \+?[01] @@ -133,7 +140,11 @@ \d{3} # exchange [ *\-.\)]* \d{4} # base - )""", + )""" + +# The components of the tokenizer: +REGEXPS = ( + URLS, # ASCII Emoticons EMOTICONS, # HTML tags: @@ -160,12 +171,12 @@ """, ) -###################################################################### -# This is the core tokenizing regex: +# Take the main components and add a phone regex as the second parameter +REGEXPS_PHONE = (REGEXPS[0], PHONE_REGEX, *REGEXPS[1:]) -WORD_RE = regex.compile( - r"""(%s)""" % "|".join(REGEXPS), regex.VERBOSE | regex.I | regex.UNICODE -) +###################################################################### +# TweetTokenizer.WORD_RE and TweetTokenizer.PHONE_WORD_RE represent +# the core tokenizing regexes. They are compiled lazily. # WORD_RE performs poorly on these patterns: HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}") @@ -277,17 +288,48 @@ class TweetTokenizer: [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!'] """ - def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False): + # Values used to lazily compile WORD_RE and PHONE_WORD_RE, + # which are the core tokenizing regexes. + _WORD_RE = None + _PHONE_WORD_RE = None + + ###################################################################### + + def __init__( + self, + preserve_case=True, + reduce_len=False, + strip_handles=False, + match_phone_numbers=True, + ): + """ + Create a `TweetTokenizer` instance with settings for use in the `tokenize` method. + + :param preserve_case: Flag indicating whether to preserve the casing (capitalisation) + of text used in the `tokenize` method. Defaults to True. + :type preserve_case: bool + :param reduce_len: Flag indicating whether to replace repeated character sequences + of length 3 or greater with sequences of length 3. Defaults to False. 
+ :type reduce_len: bool + :param strip_handles: Flag indicating whether to remove Twitter handles of text used + in the `tokenize` method. Defaults to False. + :type strip_handles: bool + :param match_phone_numbers: Flag indicating whether the `tokenize` method should look + for phone numbers. Defaults to True. + :type match_phone_numbers: bool + """ self.preserve_case = preserve_case self.reduce_len = reduce_len self.strip_handles = strip_handles + self.match_phone_numbers = match_phone_numbers + + def tokenize(self, text: str) -> List[str]: + """Tokenize the input text. - def tokenize(self, text): - """ :param text: str :rtype: list(str) - :return: a tokenized list of strings; concatenating this list returns\ - the original string if `preserve_case=False` + :return: a tokenized list of strings; joining this list returns\ + the original string if `preserve_case=False`. """ # Fix HTML character entities: text = _replace_html_entities(text) @@ -299,8 +341,11 @@ def tokenize(self, text): text = reduce_lengthening(text) # Shorten problematic sequences of characters safe_text = HANG_RE.sub(r"\1\1\1", text) - # Tokenize: - words = WORD_RE.findall(safe_text) + # Recognise phone numbers during tokenization + if self.match_phone_numbers: + words = self.PHONE_WORD_RE.findall(safe_text) + else: + words = self.WORD_RE.findall(safe_text) # Possibly alter the case, but avoid changing emoticons like :D into :d: if not self.preserve_case: words = list( @@ -308,6 +353,28 @@ def tokenize(self, text): ) return words + @property + def WORD_RE(self) -> regex.Pattern: + """Core TweetTokenizer regex""" + # Compiles the regex for this and all future instantiations of TweetTokenizer. + if not type(self)._WORD_RE: + type(self)._WORD_RE = regex.compile( + f"({'|'.join(REGEXPS)})", + regex.VERBOSE | regex.I | regex.UNICODE, + ) + return type(self)._WORD_RE + + @property + def PHONE_WORD_RE(self) -> regex.Pattern: + """Secondary core TweetTokenizer regex""" + # Compiles the regex for this and all future instantiations of TweetTokenizer. + if not type(self)._PHONE_WORD_RE: + type(self)._PHONE_WORD_RE = regex.compile( + f"({'|'.join(REGEXPS_PHONE)})", + regex.VERBOSE | regex.I | regex.UNICODE, + ) + return type(self)._PHONE_WORD_RE + ###################################################################### # Normalization Functions @@ -336,12 +403,21 @@ def remove_handles(text): ###################################################################### -def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False): +def casual_tokenize( + text, + preserve_case=True, + reduce_len=False, + strip_handles=False, + match_phone_numbers=True, +): """ Convenience function for wrapping the tokenizer. 
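 
     A minimal sketch of the expected behaviour (an editorial example,
     mirroring the 601-984-4813 test case above; it assumes the new
     default ``match_phone_numbers=True``):
 
         >>> casual_tokenize("My number is 601-984-4813, except it's not.")
         ['My', 'number', 'is', '601-984-4813', ',', 'except', "it's", 'not', '.']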
""" return TweetTokenizer( - preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles + preserve_case=preserve_case, + reduce_len=reduce_len, + strip_handles=strip_handles, + match_phone_numbers=match_phone_numbers, ).tokenize(text) From 23f4b1c4b4006b0cb3ec278e801029557cec4e82 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 24 Sep 2021 15:42:03 +0200 Subject: [PATCH 20/36] Fix end of line in new SECURITY.md --- SECURITY.md | 1 - 1 file changed, 1 deletion(-) diff --git a/SECURITY.md b/SECURITY.md index 36eaa01ae5..27ff9b6aaa 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -3,4 +3,3 @@ ## Reporting a Vulnerability Please report security issues to `nltk.team@gmail.com` - From 277711ab1dec729e626b27aab6fa35ea5efbd7e6 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Sat, 25 Sep 2021 01:14:32 +0200 Subject: [PATCH 21/36] Resolved ReDoS vulnerability in Corpus Reader (#2816) * Resolved ReDoS vulnerability in the Corpus Reader for the Comparative Sentence Dataset * Solidified performance tests --- nltk/corpus/reader/comparative_sents.py | 2 +- nltk/test/corpus.doctest | 32 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/nltk/corpus/reader/comparative_sents.py b/nltk/corpus/reader/comparative_sents.py index cbfac4c13c..ed295e4e02 100644 --- a/nltk/corpus/reader/comparative_sents.py +++ b/nltk/corpus/reader/comparative_sents.py @@ -45,7 +45,7 @@ GRAD_COMPARISON = re.compile(r"") NON_GRAD_COMPARISON = re.compile(r"") ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)") -KEYWORD = re.compile(r"\((?!.*\()(.*)\)$") +KEYWORD = re.compile(r"\(([^\(]*)\)$") class Comparison: diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest index 82b17f8a5a..560e641a69 100644 --- a/nltk/test/corpus.doctest +++ b/nltk/test/corpus.doctest @@ -2162,3 +2162,35 @@ access to its tuples() method >>> from nltk.corpus import qc >>> qc.tuples('test.txt') [('NUM:dist', 'How far is it from Denver to Aspen ?'), ('LOC:city', 'What county is Modesto , California in ?'), ...] + +Ensure that KEYWORD from `comparative_sents.py` no longer contains a ReDoS vulnerability. + + >>> import re + >>> import time + >>> from nltk.corpus.reader.comparative_sents import KEYWORD + >>> sizes = { + ... "short": 4000, + ... "long": 40000 + ... } + >>> exec_times = { + ... "short": [], + ... "long": [], + ... } + >>> for size_name, size in sizes.items(): + ... for j in range(9): + ... start_t = time.perf_counter() + ... payload = "( " + "(" * size + ... output = KEYWORD.findall(payload) + ... exec_times[size_name].append(time.perf_counter() - start_t) + ... exec_times[size_name] = sorted(exec_times[size_name])[4] # Get the mean + +Ideally, the execution time of such a regular expression is linear +in the length of the input. As such, we would expect exec_times["long"] +to be roughly 10 times as big as exec_times["short"]. +With the ReDoS in place, it took roughly 80 times as long. +For now, we accept values below 30 (times as long), due to the potential +for variance. This ensures that the ReDoS has certainly been reduced, +if not removed. + + >>> exec_times["long"] / exec_times["short"] < 30 + True From 63d004b6a7b7357d5597c56ed254624e20abc281 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Sun, 26 Sep 2021 23:27:32 +0200 Subject: [PATCH 22/36] Added docstring for WordNetLemmatizer lemmatize (#2819) Includes importing wordnet as wn, which is common in practice. 
--- nltk/stem/wordnet.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/nltk/stem/wordnet.py b/nltk/stem/wordnet.py index d56537a084..3ac3424247 100644 --- a/nltk/stem/wordnet.py +++ b/nltk/stem/wordnet.py @@ -6,8 +6,7 @@ # URL: # For license information, see LICENSE.TXT -from nltk.corpus import wordnet -from nltk.corpus.reader.wordnet import NOUN +from nltk.corpus import wordnet as wn class WordNetLemmatizer: @@ -31,11 +30,19 @@ class WordNetLemmatizer: hardrock """ - def __init__(self): - pass - - def lemmatize(self, word, pos=NOUN): - lemmas = wordnet._morphy(word, pos) + def lemmatize(self, word: str, pos: str = wn.NOUN) -> str: + """Lemmatize `word` using WordNet's built-in morphy function. + Returns the input word unchanged if it cannot be found in WordNet. + + :param word: The input word to lemmatize. + :type word: str + :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns, + `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"` + for satellite adjectives. + :param pos: str + :return: The lemma of `word`, for the given `pos`. + """ + lemmas = wn._morphy(word, pos) return min(lemmas, key=len) if lemmas else word def __repr__(self): From e7420b1500fd20133c2bed0eb2cf3611f7574c25 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Mon, 27 Sep 2021 08:12:09 +0930 Subject: [PATCH 23/36] Configure sphinx in setup.cfg cf #2742 --- setup.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.cfg b/setup.cfg index 3529e62d50..7e3bc1a4be 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,3 +3,6 @@ license_files = LICENSE.txt AUTHORS.md README.md + +[build_sphinx] +source-dir = web From 1aff8c19ccee8bd56ede056d282568e0ff84b674 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Mon, 27 Sep 2021 21:18:03 +0930 Subject: [PATCH 24/36] Configure sphinx --- RELEASE-HOWTO.txt | 28 ++--- web/api/nltk.rst | 282 +++++++++++++++++++++++++++++----------------- web/conf.py | 5 + 3 files changed, 195 insertions(+), 120 deletions(-) diff --git a/RELEASE-HOWTO.txt b/RELEASE-HOWTO.txt index 2ef5f8e27c..329957f85e 100644 --- a/RELEASE-HOWTO.txt +++ b/RELEASE-HOWTO.txt @@ -2,8 +2,8 @@ Building an NLTK distribution ---------------------------------- 1. Testing - - Ensure CI server isn't reporting any test failures - https://www.travis-ci.org/nltk/nltk + - Check no errors are reported in our continuous integration service: + https://github.com/nltk/nltk/actions - Optionally test demonstration code locally make demotest - Optionally test individual modules: @@ -29,17 +29,13 @@ Building an NLTK distribution (including the range of Python versions that are supported) edit web/install.rst setup.py - Rebuild the API docs - - make sure you have the current revision of the web pages - cd nltk.github.com; git pull - - build - cd ../nltk/web - make (slow; lots of warning messages about cross references) - - publish - cd ../../nltk.github.com - git add _modules _sources _static api *.html objects.inv searchindex.js - git status (missing any important looking files?) - git commit -m "updates for version 3.X.Y" - git push origin master + python setup.py build_sphinx -b man --build-dir build/sphinx + - Publish them + cd nltk.github.com; git pull (begin with current docs repo) + + git add . + git commit -m "updates for version 3.X.Y" + git push origin master 4. 
Create a new version - (Optionally do this in a release branch, branching from develop branch @@ -65,12 +61,8 @@ Building an NLTK distribution nltk-dev (for beta releases) nltk-users (for final releases) nltk twitter account - - announce to external mailing lists, for major N.N releases only - CORPORA@uib.no, linguist@linguistlist.org, - PythonSIL@lists.sil.org, edu-sig@python.org - mailing lists for any local courses using NLTK -7. Optionally update to new version +7. Optionally update repo version - we don't want builds from the repository to have the same release number e.g. after release X.Y.4, update repository version to X.Y.5a (alpha) diff --git a/web/api/nltk.rst b/web/api/nltk.rst index c5dad19c58..3da0e9e211 100644 --- a/web/api/nltk.rst +++ b/web/api/nltk.rst @@ -1,148 +1,226 @@ -.. manually constructed -- removed several low-level packages - -nltk Package +nltk package ============ -:mod:`nltk` Package -------------------- +Subpackages +----------- -.. automodule:: nltk.__init__ - :members: - :undoc-members: - :show-inheritance: +.. toctree:: + :maxdepth: 4 + + nltk.app + nltk.ccg + nltk.chat + nltk.chunk + nltk.classify + nltk.cluster + nltk.corpus + nltk.draw + nltk.inference + nltk.lm + nltk.metrics + nltk.misc + nltk.parse + nltk.sem + nltk.sentiment + nltk.stem + nltk.tag + nltk.tbl + nltk.test + nltk.tokenize + nltk.translate + nltk.twitter + +Submodules +---------- + +nltk.book module +---------------- + +.. automodule:: nltk.book + :members: + :undoc-members: + :show-inheritance: + +nltk.cli module +--------------- + +.. automodule:: nltk.cli + :members: + :undoc-members: + :show-inheritance: + +nltk.collections module +----------------------- -:mod:`collocations` Module --------------------------- +.. automodule:: nltk.collections + :members: + :undoc-members: + :show-inheritance: + +nltk.collocations module +------------------------ .. automodule:: nltk.collocations - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`data` Module +nltk.compat module ------------------ +.. automodule:: nltk.compat + :members: + :undoc-members: + :show-inheritance: + +nltk.data module +---------------- + .. automodule:: nltk.data - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`downloader` Module ------------------------- +nltk.decorators module +---------------------- + +.. automodule:: nltk.decorators + :members: + :undoc-members: + :show-inheritance: + +nltk.downloader module +---------------------- .. automodule:: nltk.downloader - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`featstruct` Module ------------------------- +nltk.featstruct module +---------------------- .. automodule:: nltk.featstruct - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`grammar` Module ---------------------- +nltk.grammar module +------------------- .. automodule:: nltk.grammar - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`help` Module ------------------- +nltk.help module +---------------- .. automodule:: nltk.help - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: + +nltk.internals module +--------------------- + +.. 
automodule:: nltk.internals + :members: + :undoc-members: + :show-inheritance: + +nltk.jsontags module +-------------------- -:mod:`probability` Module -------------------------- +.. automodule:: nltk.jsontags + :members: + :undoc-members: + :show-inheritance: + +nltk.lazyimport module +---------------------- + +.. automodule:: nltk.lazyimport + :members: + :undoc-members: + :show-inheritance: + +nltk.probability module +----------------------- .. automodule:: nltk.probability - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`text` Module ------------------- +nltk.text module +---------------- .. automodule:: nltk.text - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`toolbox` Module ---------------------- +nltk.tgrep module +----------------- -.. automodule:: nltk.toolbox - :members: - :undoc-members: - :show-inheritance: +.. automodule:: nltk.tgrep + :members: + :undoc-members: + :show-inheritance: -:mod:`translate` Module ------------------------ +nltk.toolbox module +------------------- -.. automodule:: nltk.translate - :members: - :undoc-members: - :show-inheritance: +.. automodule:: nltk.toolbox + :members: + :undoc-members: + :show-inheritance: -:mod:`tree` Module ------------------- +nltk.tree module +---------------- .. automodule:: nltk.tree - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`treetransforms` Module ----------------------------- +nltk.treeprettyprinter module +----------------------------- + +.. automodule:: nltk.treeprettyprinter + :members: + :undoc-members: + :show-inheritance: + +nltk.treetransforms module +-------------------------- .. automodule:: nltk.treetransforms - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`util` Module ------------------- +nltk.util module +---------------- .. automodule:: nltk.util - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`wsd` Module ------------------- +nltk.wsd module +--------------- .. automodule:: nltk.wsd - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: +Module contents +--------------- -Subpackages ------------ - -.. toctree:: - - nltk.app - nltk.ccg - nltk.chat - nltk.chunk - nltk.classify - nltk.cluster - nltk.corpus - nltk.draw - nltk.inference - nltk.metrics - nltk.misc - nltk.parse - nltk.sem - nltk.stem - nltk.tag - nltk.test - nltk.tokenize +.. automodule:: nltk + :members: + :undoc-members: + :show-inheritance: diff --git a/web/conf.py b/web/conf.py index 93ea26071f..922c28a4b9 100644 --- a/web/conf.py +++ b/web/conf.py @@ -29,11 +29,16 @@ # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ "sphinx.ext.autodoc", + "sphinxcontrib.apidoc", "sphinx.ext.coverage", "sphinx.ext.imgmath", "sphinx.ext.viewcode", ] +apidoc_module_dir = '../nltk' +apidoc_output_dir = 'api' +apidoc_separate_modules = False + # Add any paths that contain templates here, relative to this directory. 
templates_path = ["_templates"] From 4937013af01da20837e17282eee13e1ba64f2f76 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 27 Sep 2021 14:53:15 +0200 Subject: [PATCH 25/36] Deprecate `nltk.usage(obj)` in favor of `help(obj)` (#2810) * Deprecated nltk.usage(obj); help(obj) is preferred * Removed deprecated nltk.usage() from tests * Use existing @deprecated decorator --- nltk/test/classify.doctest | 16 +++++------ nltk/test/unit/test_util.py | 54 +------------------------------------ nltk/util.py | 3 ++- 3 files changed, 10 insertions(+), 63 deletions(-) diff --git a/nltk/test/classify.doctest b/nltk/test/classify.doctest index bfdf17b1ef..3e1319d312 100644 --- a/nltk/test/classify.doctest +++ b/nltk/test/classify.doctest @@ -11,16 +11,13 @@ Classifiers label tokens with category labels (or *class labels*). Typically, labels are represented with strings (such as ``"health"`` or ``"sports"``. In NLTK, classifiers are defined using classes that -implement the `ClassifyI` interface: +implement the `ClassifierI` interface, which supports the following operations: - >>> import nltk - >>> nltk.usage(nltk.classify.ClassifierI) - ClassifierI supports the following operations: - - self.classify(featureset) - - self.classify_many(featuresets) - - self.labels() - - self.prob_classify(featureset) - - self.prob_classify_many(featuresets) +- self.classify(featureset) +- self.classify_many(featuresets) +- self.labels() +- self.prob_classify(featureset) +- self.prob_classify_many(featuresets) NLTK defines several classifier classes: @@ -42,6 +39,7 @@ We define a very simple training corpus with 3 binary features: ['a', that the correct answers can be calculated analytically (although we haven't done this yet for all tests). + >>> import nltk >>> train = [ ... (dict(a=1,b=1,c=1), 'y'), ... (dict(a=1,b=1,c=1), 'x'), diff --git a/nltk/test/unit/test_util.py b/nltk/test/unit/test_util.py index 109a96b31b..4709e843ca 100644 --- a/nltk/test/unit/test_util.py +++ b/nltk/test/unit/test_util.py @@ -1,58 +1,6 @@ import pytest -from nltk.util import everygrams, usage - - -def test_usage_with_self(capsys): - class MyClass: - def kwargs(self, a=1): - ... - - def no_args(self): - ... - - def pos_args(self, a, b): - ... - - def pos_args_and_kwargs(self, a, b, c=1): - ... - - usage(MyClass) - - captured = capsys.readouterr() - assert captured.out == ( - "MyClass supports the following operations:\n" - " - self.kwargs(a=1)\n" - " - self.no_args()\n" - " - self.pos_args(a, b)\n" - " - self.pos_args_and_kwargs(a, b, c=1)\n" - ) - - -def test_usage_with_cls(capsys): - class MyClass: - @classmethod - def clsmethod(cls): - ... - - @classmethod - def clsmethod_with_args(cls, a, b, c=1): - ... 
- - usage(MyClass) - - captured = capsys.readouterr() - assert captured.out == ( - "MyClass supports the following operations:\n" - " - cls.clsmethod()\n" - " - cls.clsmethod_with_args(a, b, c=1)\n" - ) - - -def test_usage_on_builtin(): - # just check the func passes, since - # builtins change each python version - usage(dict) +from nltk.util import everygrams @pytest.fixture diff --git a/nltk/util.py b/nltk/util.py index 383342a4e5..da843c3dc4 100644 --- a/nltk/util.py +++ b/nltk/util.py @@ -30,13 +30,14 @@ ) from nltk.collections import * -from nltk.internals import raise_unorderable_types, slice_bounds +from nltk.internals import deprecated, raise_unorderable_types, slice_bounds ###################################################################### # Short usage message ###################################################################### +@deprecated("Use help(obj) instead.") def usage(obj): str(obj) # In case it's lazy, this will load it. From 43f3e3096ade1a2564a20a66172d53bd140ac983 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 27 Sep 2021 15:09:28 +0200 Subject: [PATCH 26/36] Blacked conf.py --- web/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/web/conf.py b/web/conf.py index 922c28a4b9..9f8db4e690 100644 --- a/web/conf.py +++ b/web/conf.py @@ -35,8 +35,8 @@ "sphinx.ext.viewcode", ] -apidoc_module_dir = '../nltk' -apidoc_output_dir = 'api' +apidoc_module_dir = "../nltk" +apidoc_output_dir = "api" apidoc_separate_modules = False # Add any paths that contain templates here, relative to this directory. From d41e405daec3b6732c9bea0354e8f0d122b6f8b8 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 27 Sep 2021 20:18:01 +0200 Subject: [PATCH 27/36] wn.NOUN -> 'n' (#2823) --- nltk/stem/wordnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/stem/wordnet.py b/nltk/stem/wordnet.py index 3ac3424247..1ec43bcd1c 100644 --- a/nltk/stem/wordnet.py +++ b/nltk/stem/wordnet.py @@ -30,7 +30,7 @@ class WordNetLemmatizer: hardrock """ - def lemmatize(self, word: str, pos: str = wn.NOUN) -> str: + def lemmatize(self, word: str, pos: str = "n") -> str: """Lemmatize `word` using WordNet's built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet. From a31de6c687165afe64a490053c73c8e0b20ff6db Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 27 Sep 2021 23:48:41 +0200 Subject: [PATCH 28/36] Resolved undefined variable bug in raising Exception for RegexpTagger (#2821) --- nltk/tag/sequential.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/nltk/tag/sequential.py b/nltk/tag/sequential.py index 103fdc10ba..9a58c2f9d4 100644 --- a/nltk/tag/sequential.py +++ b/nltk/tag/sequential.py @@ -20,6 +20,7 @@ class for all the taggers in this module. 
Tagging of individual words import ast import re from abc import abstractmethod +from typing import List, Optional, Tuple from nltk import jsontags from nltk.classify import NaiveBayesClassifier @@ -533,21 +534,18 @@ class RegexpTagger(SequentialBackoffTagger): json_tag = "nltk.tag.sequential.RegexpTagger" - def __init__(self, regexps, backoff=None): - """ """ + def __init__( + self, regexps: List[Tuple[str, str]], backoff: Optional[TaggerI] = None + ): super().__init__(backoff) - try: - self._regexps = [ - ( - re.compile(regexp), - tag, - ) - for regexp, tag in regexps - ] - except Exception as e: - raise Exception( - "Invalid RegexpTagger regexp:", str(e), "regexp:", regexp, "tag:", tag - ) from e + self._regexps = [] + for regexp, tag in regexps: + try: + self._regexps.append((re.compile(regexp), tag)) + except Exception as e: + raise Exception( + f"Invalid RegexpTagger regexp: {e}\n- regexp: {regexp!r}\n- tag: {tag!r}" + ) from e def encode_json_obj(self): return [(regexp.pattern, tag) for regexp, tag in self._regexps], self.backoff From 3502926c7e2222632d850b70bd41e6492e5172d7 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 30 Sep 2021 14:19:50 +0200 Subject: [PATCH 29/36] Change mean -> median in ReDoS docstring comment --- nltk/test/corpus.doctest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest index 560e641a69..edb79221cf 100644 --- a/nltk/test/corpus.doctest +++ b/nltk/test/corpus.doctest @@ -2182,7 +2182,7 @@ Ensure that KEYWORD from `comparative_sents.py` no longer contains a ReDoS vulne ... payload = "( " + "(" * size ... output = KEYWORD.findall(payload) ... exec_times[size_name].append(time.perf_counter() - start_t) - ... exec_times[size_name] = sorted(exec_times[size_name])[4] # Get the mean + ... exec_times[size_name] = sorted(exec_times[size_name])[4] # Get the median Ideally, the execution time of such a regular expression is linear in the length of the input. As such, we would expect exec_times["long"] From dacda6f50057e1de560483f5e432b5ccb0dd5876 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Thu, 30 Sep 2021 14:48:26 +0200 Subject: [PATCH 30/36] Change mean -> median in ReDoS docstring comment (#2831) Thank you @PeterJCLaw --- nltk/test/corpus.doctest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest index 560e641a69..edb79221cf 100644 --- a/nltk/test/corpus.doctest +++ b/nltk/test/corpus.doctest @@ -2182,7 +2182,7 @@ Ensure that KEYWORD from `comparative_sents.py` no longer contains a ReDoS vulne ... payload = "( " + "(" * size ... output = KEYWORD.findall(payload) ... exec_times[size_name].append(time.perf_counter() - start_t) - ... exec_times[size_name] = sorted(exec_times[size_name])[4] # Get the mean + ... exec_times[size_name] = sorted(exec_times[size_name])[4] # Get the median Ideally, the execution time of such a regular expression is linear in the length of the input. 
As such, we would expect exec_times["long"] From 2d07c4fe9bc50aa0f0e63fea77cce900895c91d4 Mon Sep 17 00:00:00 2001 From: mohaned mashaly <30902228+12mohaned@users.noreply.github.com> Date: Fri, 1 Oct 2021 03:28:29 +0200 Subject: [PATCH 31/36] fix: correct minor typo in word_tokenize (#2828) --- nltk/tokenize/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/tokenize/__init__.py b/nltk/tokenize/__init__.py index 62c5886d34..c8dd45b319 100644 --- a/nltk/tokenize/__init__.py +++ b/nltk/tokenize/__init__.py @@ -123,7 +123,7 @@ def word_tokenize(text, language="english", preserve_line=False): :type text: str :param language: the model name in the Punkt corpus :type language: str - :param preserve_line: An option to keep the preserve the sentence and not sentence tokenize it. + :param preserve_line: A flag to decide whether to sentence tokenize the text or not. :type preserve_line: bool """ sentences = [text] if preserve_line else sent_tokenize(text, language) From 317c5f88ef66b039dc143a6ad78f2a61f362f03a Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Fri, 1 Oct 2021 11:22:49 +0930 Subject: [PATCH 32/36] updates for 3.6.4 --- ChangeLog | 14 ++++++++++++++ nltk/VERSION | 2 +- web/conf.py | 4 ++-- web/news.rst | 5 +++++ 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index c4a39bc542..3615a2b8fa 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,17 @@ +Version 3.6.4 2021-10-01 + +* deprecate `nltk.usage(obj)` in favor of `help(obj)` +* resolve ReDoS vulnerability in Corpus Reader +* solidify performance tests +* improve phone number recognition in tweet tokenizer +* refactored CISTEM stemmer for German +* identify NLTK Team as the author +* replace travis badge with github actions badge +* add SECURITY.md + +Thanks to the following contributors to 3.6.4 +Tom Aarsen, Mohaned Mashaly, Dimitri Papadopoulos Orfanos, purificant, Danny Sepler + Version 3.6.3 2021-09-19 * Dropped support for Python 3.5 * Run CI tests on Windows, too diff --git a/nltk/VERSION b/nltk/VERSION index 4a788a01da..0f44168a4d 100644 --- a/nltk/VERSION +++ b/nltk/VERSION @@ -1 +1 @@ -3.6.3 +3.6.4 diff --git a/web/conf.py b/web/conf.py index 609979837c..98e9d86f31 100644 --- a/web/conf.py +++ b/web/conf.py @@ -60,9 +60,9 @@ # built documents. # # The short X.Y version. -version = "3.6.3" +version = "3.6.4" # The full version, including alpha/beta/rc tags. -release = "3.6.3" +release = "3.6.4" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/web/news.rst b/web/news.rst index 00734ba0b6..f44f073177 100644 --- a/web/news.rst +++ b/web/news.rst @@ -4,6 +4,11 @@ NLTK News 2021 ---- +NLTK 3.6.4 release: October 2021 + improved phone number recognition in tweet tokenizer + resolved ReDoS vulnerability in Corpus Reader + refactored CISTEM stemmer for German + NLTK 3.6.3 release: September 2021 Drop support for Python 3.5, added pre-commit hooks (isort, pyupgrade, black), From 6428c9288a86658cb3d9a1e91816c4bcf162a6f0 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 1 Oct 2021 09:09:38 +0200 Subject: [PATCH 33/36] Avoiding the use of re.Pattern and regex.Pattern This fails for Python 3.6 and 3.7. 
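For context: a bare annotation such as `-> regex.Pattern` is evaluated at function-definition time, so merely importing the module raises `AttributeError` wherever the installed `regex` build does not expose `Pattern` (and `re.Pattern` itself only exists from Python 3.8). Quoting the annotation turns it into a string that is inert at runtime but still resolved by type checkers. A minimal sketch of the idiom this commit adopts (illustrative only, not part of the patch):

    >>> import regex
    >>> def word_re() -> "regex.Pattern":  # string annotation: never evaluated at def-time
    ...     return regex.compile(r"\w+")
    >>> bool(word_re().fullmatch("tweet"))
    True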
--- nltk/tokenize/casual.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nltk/tokenize/casual.py b/nltk/tokenize/casual.py index f7b256713e..11c0f11aa8 100644 --- a/nltk/tokenize/casual.py +++ b/nltk/tokenize/casual.py @@ -354,7 +354,7 @@ def tokenize(self, text: str) -> List[str]: return words @property - def WORD_RE(self) -> regex.Pattern: + def WORD_RE(self) -> "regex.Pattern": """Core TweetTokenizer regex""" # Compiles the regex for this and all future instantiations of TweetTokenizer. if not type(self)._WORD_RE: @@ -365,7 +365,7 @@ def WORD_RE(self) -> regex.Pattern: return type(self)._WORD_RE @property - def PHONE_WORD_RE(self) -> regex.Pattern: + def PHONE_WORD_RE(self) -> "regex.Pattern": """Secondary core TweetTokenizer regex""" # Compiles the regex for this and all future instantiations of TweetTokenizer. if not type(self)._PHONE_WORD_RE: From 7ed503d98926c00e26baec7e05895d06286b23d4 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Fri, 1 Oct 2021 11:23:15 +0200 Subject: [PATCH 34/36] Bump minimum regex version to allow usage of 'regex.Pattern' typing (#2834) --- pip-req.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pip-req.txt b/pip-req.txt index 63655dffc9..461e26d656 100644 --- a/pip-req.txt +++ b/pip-req.txt @@ -9,7 +9,7 @@ python-crfsuite>=0.8.2 gensim>=0.11.1,<4.0.0 pyparsing>=2.0.3 twython>=3.2.0 -regex>=2019.08.19 +regex>=2021.8.3 click>=7.1.2 joblib>=1.0.1 tqdm>=4.59.0 From 2a1d794999fb1d80143dd028903dca789e66f519 Mon Sep 17 00:00:00 2001 From: purificant Date: Fri, 1 Oct 2021 22:17:38 +0100 Subject: [PATCH 35/36] specify minimum regex version that supports regex.Pattern (#2835) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 061ae8899e..eb32f49f9d 100644 --- a/setup.py +++ b/setup.py @@ -115,7 +115,7 @@ install_requires=[ "click", "joblib", - "regex", + "regex>=2021.8.3", "tqdm", ], extras_require=extras_require, From 3ffed20f5afd2412c799ddd03376b071db882ed5 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 4 Oct 2021 23:55:50 +0200 Subject: [PATCH 36/36] Skip ReDoS test - performance testing isn't viable with cloud computing (#2841) --- nltk/test/corpus.doctest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest index edb79221cf..ef74077433 100644 --- a/nltk/test/corpus.doctest +++ b/nltk/test/corpus.doctest @@ -2192,5 +2192,5 @@ For now, we accept values below 30 (times as long), due to the potential for variance. This ensures that the ReDoS has certainly been reduced, if not removed. - >>> exec_times["long"] / exec_times["short"] < 30 + >>> exec_times["long"] / exec_times["short"] < 30 # doctest: +SKIP True
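Since the timing doctest above is now skipped in CI, the ReDoS fix can still be verified locally with a standalone sketch along the same lines as the doctest (the payload, run count, and linear-scaling expectation mirror the doctest; absolute timings and the exact ratio will vary by machine):

    import time

    from nltk.corpus.reader.comparative_sents import KEYWORD

    def median_runtime(size: int, runs: int = 9) -> float:
        """Median wall-clock time for KEYWORD.findall on a worst-case payload."""
        times = []
        for _ in range(runs):
            start = time.perf_counter()
            # "( " followed by many "(" was the pathological input for the old regex
            KEYWORD.findall("( " + "(" * size)
            times.append(time.perf_counter() - start)
        return sorted(times)[runs // 2]  # median of an odd number of runs

    # A linear-time regex should give a ratio near 10 for a 10x larger input;
    # the vulnerable pattern took roughly 80x on this pair of sizes.
    print(median_runtime(40_000) / median_runtime(4_000))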