From d97cf17f0ad9903035d0c8e08b3eb552738f28f8 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Fri, 20 Aug 2021 14:45:37 +0930 Subject: [PATCH 01/36] update for 3.6.3 --- nltk/VERSION | 2 +- web/conf.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nltk/VERSION b/nltk/VERSION index b72762837e..4a788a01da 100644 --- a/nltk/VERSION +++ b/nltk/VERSION @@ -1 +1 @@ -3.6.2 +3.6.3 diff --git a/web/conf.py b/web/conf.py index 93ea26071f..ea31ee71f0 100644 --- a/web/conf.py +++ b/web/conf.py @@ -55,9 +55,9 @@ # built documents. # # The short X.Y version. -version = "3.6.2" +version = "3.6.3" # The full version, including alpha/beta/rc tags. -release = "3.6.2" +release = "3.6.3" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From ec0c03c893f6f4f213127f969c61fc8a700d3b36 Mon Sep 17 00:00:00 2001 From: mohaned mashaly <30902228+12mohaned@users.noreply.github.com> Date: Fri, 3 Sep 2021 04:38:00 +0200 Subject: [PATCH 02/36] Fixing TweetTokenizer format (#2791) * Fixing TweetTokenizer format to follow PEP8 format * remove unnecessary else from a condition --- nltk/tokenize/casual.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/nltk/tokenize/casual.py b/nltk/tokenize/casual.py index db7d1dacd2..7d0d17fc64 100644 --- a/nltk/tokenize/casual.py +++ b/nltk/tokenize/casual.py @@ -238,8 +238,7 @@ def _convert_entity(match): else: if entity_body in keep: return match.group(0) - else: - number = html.entities.name2codepoint.get(entity_body) + number = html.entities.name2codepoint.get(entity_body) if number is not None: try: return chr(number) @@ -262,7 +261,8 @@ class TweetTokenizer: >>> tknzr = TweetTokenizer() >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--" >>> tknzr.tokenize(s0) - ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'] + ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3' + , 'and', 'some', 'arrows', '<', '>', '->', '<--'] Examples using `strip_handles` and `reduce_len parameters`: @@ -323,9 +323,11 @@ def remove_handles(text): Remove Twitter username handles from text. """ pattern = regex.compile( - r"(? Date: Sun, 5 Sep 2021 10:23:12 +0200 Subject: [PATCH 03/36] Use global regex for TweetTokenizer'ss remove_handles (#2795) --- nltk/tokenize/casual.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/nltk/tokenize/casual.py b/nltk/tokenize/casual.py index 7d0d17fc64..66f038105f 100644 --- a/nltk/tokenize/casual.py +++ b/nltk/tokenize/casual.py @@ -177,6 +177,11 @@ # These are for regularizing HTML entities to Unicode: ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);") +# For stripping away handles from a tweet: +HANDLES_RE = regex.compile( + r"(? 
Date: Sat, 11 Sep 2021 17:35:28 +0200 Subject: [PATCH 04/36] build: add matplotlib in deps section --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index 50d1693cad..b97d416505 100644 --- a/tox.ini +++ b/tox.ini @@ -29,6 +29,7 @@ deps = click joblib tqdm + matplotlib changedir = nltk/test commands = From ab1027691fcd0e6e9d206156a4fd01eb0c5db626 Mon Sep 17 00:00:00 2001 From: Abhijnan Bajpai <57059194+Abhijnan-Bajpai@users.noreply.github.com> Date: Mon, 13 Sep 2021 21:01:12 +0530 Subject: [PATCH 05/36] Improved the removal of twitter handles (#2799) * Improved the removal of twitter username handles from text * Improved the removal of handles * Changing the length of handles to 15 from 20 * Modified incorrect tests - twitter handles can only be 15 characters * Removed duplicated part of the HANDLES_RE regex Co-authored-by: Tom Aarsen --- nltk/test/unit/test_tokenize.py | 12 ++++++------ nltk/tokenize/casual.py | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/nltk/test/unit/test_tokenize.py b/nltk/test/unit/test_tokenize.py index 65cbef07a4..2efd793ef9 100644 --- a/nltk/test/unit/test_tokenize.py +++ b/nltk/test/unit/test_tokenize.py @@ -306,18 +306,18 @@ def test_remove_handle(self): result = tokenizer.tokenize(test5) assert result == expected - # Tests that handles can have a max length of 20 - test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmnopqrst1234 @abcdefghijklmnopqrst_ @abcdefghijklmnopqrstendofhandle" - expected = ["uvwxyz", "1234", "_", "endofhandle"] + # Tests that handles can have a max length of 15 + test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmno1234 @abcdefghijklmno_ @abcdefghijklmnoendofhandle" + expected = ["pqrstuvwxyz", "1234", "_", "endofhandle"] result = tokenizer.tokenize(test6) assert result == expected # Edge case where an @ comes directly after a long handle - test7 = "@abcdefghijklmnopqrstu@abcde @abcdefghijklmnopqrst@abcde @abcdefghijklmnopqrst_@abcde @abcdefghijklmnopqrst5@abcde" + test7 = "@abcdefghijklmnop@abcde @abcdefghijklmno@abcde @abcdefghijklmno_@abcde @abcdefghijklmno5@abcde" expected = [ - "u", + "p", "@abcde", - "@abcdefghijklmnopqrst", + "@abcdefghijklmno", "@abcde", "_", "@abcde", diff --git a/nltk/tokenize/casual.py b/nltk/tokenize/casual.py index 66f038105f..0fb0b39243 100644 --- a/nltk/tokenize/casual.py +++ b/nltk/tokenize/casual.py @@ -179,8 +179,8 @@ # For stripping away handles from a tweet: HANDLES_RE = regex.compile( - r"(? Date: Thu, 16 Sep 2021 17:06:52 -0400 Subject: [PATCH 06/36] Final py3.9 deprecation (#2801) * Final py39 deprecation * acommodate multiple python versions * dont need capsys import anymore --- nltk/test/unit/test_util.py | 215 ++++++++++++++++++++++-------------- nltk/util.py | 32 +++--- 2 files changed, 153 insertions(+), 94 deletions(-) diff --git a/nltk/test/unit/test_util.py b/nltk/test/unit/test_util.py index 365452574b..109a96b31b 100644 --- a/nltk/test/unit/test_util.py +++ b/nltk/test/unit/test_util.py @@ -1,81 +1,134 @@ -""" -Unit tests for nltk.util. 
-""" - -import unittest - -from nltk.util import everygrams - - -class TestEverygrams(unittest.TestCase): - def setUp(self): - """Form test data for tests.""" - self.test_data = iter("a b c".split()) - - def test_everygrams_without_padding(self): - expected_output = [ - ("a",), - ("a", "b"), - ("a", "b", "c"), - ("b",), - ("b", "c"), - ("c",), - ] - output = everygrams(self.test_data) - self.assertCountEqual(output, expected_output) - - def test_everygrams_max_len(self): - expected_output = [ - ("a",), - ("a", "b"), - ("b",), - ("b", "c"), - ("c",), - ] - output = everygrams(self.test_data, max_len=2) - self.assertCountEqual(output, expected_output) - - def test_everygrams_min_len(self): - expected_output = [ - ("a", "b"), - ("b", "c"), - ("a", "b", "c"), - ] - output = everygrams(self.test_data, min_len=2) - self.assertCountEqual(output, expected_output) - - def test_everygrams_pad_right(self): - expected_output = [ - ("a",), - ("a", "b"), - ("a", "b", "c"), - ("b",), - ("b", "c"), - ("b", "c", None), - ("c",), - ("c", None), - ("c", None, None), - (None,), - (None, None), - (None,), - ] - output = everygrams(self.test_data, max_len=3, pad_right=True) - self.assertCountEqual(output, expected_output) - - def test_everygrams_pad_left(self): - expected_output = [ - (None,), - (None, None), - (None, None, "a"), - (None,), - (None, "a"), - (None, "a", "b"), - ("a",), - ("a", "b"), - ("a", "b", "c"), - ("b",), - ("b", "c"), - ("c",), - ] - output = everygrams(self.test_data, max_len=3, pad_left=True) - self.assertCountEqual(output, expected_output) +import pytest + +from nltk.util import everygrams, usage + + +def test_usage_with_self(capsys): + class MyClass: + def kwargs(self, a=1): + ... + + def no_args(self): + ... + + def pos_args(self, a, b): + ... + + def pos_args_and_kwargs(self, a, b, c=1): + ... + + usage(MyClass) + + captured = capsys.readouterr() + assert captured.out == ( + "MyClass supports the following operations:\n" + " - self.kwargs(a=1)\n" + " - self.no_args()\n" + " - self.pos_args(a, b)\n" + " - self.pos_args_and_kwargs(a, b, c=1)\n" + ) + + +def test_usage_with_cls(capsys): + class MyClass: + @classmethod + def clsmethod(cls): + ... + + @classmethod + def clsmethod_with_args(cls, a, b, c=1): + ... 
+ + usage(MyClass) + + captured = capsys.readouterr() + assert captured.out == ( + "MyClass supports the following operations:\n" + " - cls.clsmethod()\n" + " - cls.clsmethod_with_args(a, b, c=1)\n" + ) + + +def test_usage_on_builtin(): + # just check the func passes, since + # builtins change each python version + usage(dict) + + +@pytest.fixture +def everygram_input(): + """Form test data for tests.""" + return iter(["a", "b", "c"]) + + +def test_everygrams_without_padding(everygram_input): + expected_output = [ + ("a",), + ("a", "b"), + ("a", "b", "c"), + ("b",), + ("b", "c"), + ("c",), + ] + output = list(everygrams(everygram_input)) + assert output == expected_output + + +def test_everygrams_max_len(everygram_input): + expected_output = [ + ("a",), + ("a", "b"), + ("b",), + ("b", "c"), + ("c",), + ] + output = list(everygrams(everygram_input, max_len=2)) + assert output == expected_output + + +def test_everygrams_min_len(everygram_input): + expected_output = [ + ("a", "b"), + ("a", "b", "c"), + ("b", "c"), + ] + output = list(everygrams(everygram_input, min_len=2)) + assert output == expected_output + + +def test_everygrams_pad_right(everygram_input): + expected_output = [ + ("a",), + ("a", "b"), + ("a", "b", "c"), + ("b",), + ("b", "c"), + ("b", "c", None), + ("c",), + ("c", None), + ("c", None, None), + (None,), + (None, None), + (None,), + ] + output = list(everygrams(everygram_input, max_len=3, pad_right=True)) + assert output == expected_output + + +def test_everygrams_pad_left(everygram_input): + expected_output = [ + (None,), + (None, None), + (None, None, "a"), + (None,), + (None, "a"), + (None, "a", "b"), + ("a",), + ("a", "b"), + ("a", "b", "c"), + ("b",), + ("b", "c"), + ("c",), + ] + output = list(everygrams(everygram_input, max_len=3, pad_left=True)) + assert output == expected_output diff --git a/nltk/util.py b/nltk/util.py index c25a8fb339..383342a4e5 100644 --- a/nltk/util.py +++ b/nltk/util.py @@ -37,32 +37,38 @@ ###################################################################### -def usage(obj, selfname="self"): +def usage(obj): str(obj) # In case it's lazy, this will load it. 
if not isinstance(obj, type): obj = obj.__class__ - print("%s supports the following operations:" % obj.__name__) + print(f"{obj.__name__} supports the following operations:") for (name, method) in sorted(pydoc.allmethods(obj).items()): if name.startswith("_"): continue if getattr(method, "__deprecated__", False): continue - getargspec = inspect.getfullargspec - args, varargs, varkw, defaults = getargspec(method)[:4] - if ( - args - and args[0] == "self" - and (defaults is None or len(args) > len(defaults)) - ): - args = args[1:] - name = f"{selfname}.{name}" - argspec = inspect.formatargspec(args, varargs, varkw, defaults) + try: + sig = str(inspect.signature(method)) + except ValueError as e: + # builtins sometimes don't support introspection + if "builtin" in str(e): + continue + else: + raise + + args = sig.lstrip("(").rstrip(")").split(", ") + meth = inspect.getattr_static(obj, name) + if isinstance(meth, (classmethod, staticmethod)): + name = f"cls.{name}" + elif args and args[0] == "self": + name = f"self.{name}" + args.pop(0) print( textwrap.fill( - f"{name}{argspec}", + f"{name}({', '.join(args)})", initial_indent=" - ", subsequent_indent=" " * (len(name) + 5), ) From 49e5d6ef9fffb9d256c70454503486affbaf0c88 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Fri, 17 Sep 2021 06:46:59 +0930 Subject: [PATCH 07/36] Update with recent commits --- ChangeLog | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ChangeLog b/ChangeLog index d587aa3974..81052939b2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -12,6 +12,8 @@ Version 3.6.3 2021-08-?? * Fixed AttributeError for Arabic ARLSTem2 stemmer * Many fixes and improvements to lm language model package * Fix bug in nltk.metrics.aline, C_skip = -10 +* Improvements to TweetTokenizer +* Optional show arg for FreqDist.plot, ConditionalFreqDist.plot Thanks to the following contributors to 3.6.3 Tom Aarsen, Michael Wayne Goodman, Michał Górny, Maarten ter Huurne, Manu Joseph, From 53dbaa5591003f6764a3d69834e92bc83e3a754c Mon Sep 17 00:00:00 2001 From: mohaned mashaly <30902228+12mohaned@users.noreply.github.com> Date: Thu, 16 Sep 2021 23:40:50 +0200 Subject: [PATCH 08/36] refactor: refactor sentiment analyzer by removing dead and slow perfomance code (#2804) * refactor: refactor sentiment analyzer by removing dead and slow perfomance code * refactor: refactor sentiment analyzer by removing dead and slow perfomance code * fix: add not to false boolean values * Refactor: Add sentiment keyword in all_words Co-authored-by: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> --- nltk/sentiment/sentiment_analyzer.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/nltk/sentiment/sentiment_analyzer.py b/nltk/sentiment/sentiment_analyzer.py index 35bc810d7c..1660e2f841 100644 --- a/nltk/sentiment/sentiment_analyzer.py +++ b/nltk/sentiment/sentiment_analyzer.py @@ -47,10 +47,10 @@ def all_words(self, documents, labeled=None): all_words = [] if labeled is None: labeled = documents and isinstance(documents[0], tuple) - if labeled == True: - for words, sentiment in documents: + if labeled: + for words, _sentiment in documents: all_words.extend(words) - elif labeled == False: + elif not labeled: for words in documents: all_words.extend(words) return all_words @@ -218,7 +218,7 @@ def evaluate( classifier = self.classifier print(f"Evaluating {type(classifier).__name__} results...") metrics_results = {} - if accuracy == True: + if accuracy: accuracy_score = eval_accuracy(classifier, test_set) metrics_results["Accuracy"] = 
accuracy_score @@ -232,22 +232,22 @@ def evaluate( test_results[observed].add(i) for label in labels: - if precision == True: + if precision: precision_score = eval_precision( gold_results[label], test_results[label] ) metrics_results[f"Precision [{label}]"] = precision_score - if recall == True: + if recall: recall_score = eval_recall(gold_results[label], test_results[label]) metrics_results[f"Recall [{label}]"] = recall_score - if f_measure == True: + if f_measure: f_measure_score = eval_f_measure( gold_results[label], test_results[label] ) metrics_results[f"F-measure [{label}]"] = f_measure_score # Print evaluation results (in alphabetical order) - if verbose == True: + if verbose: for result in sorted(metrics_results): print(f"{result}: {metrics_results[result]}") From 77b59458bf2f13cf08d3b265dcebaed653016874 Mon Sep 17 00:00:00 2001 From: avena554 Date: Sat, 18 Sep 2021 07:59:35 -0400 Subject: [PATCH 09/36] Edit_distance now computes the actual Damerau-Levenshtein edit-distance (#2736) * Edit_distance now computes the actual Damerau-Levenshtein edit-distance * adapted edit_distance_align to the changes in _edit_distance_step * +couple unit test for the levensthein edit distance with vs without transpositions * pre commit fails when pushing * commiting to run pre-commit hooks * fixed edit distance unit tests and edit distance with transpositions * Added and pytest-ified edit_distance tests Co-authored-by: Tom Aarsen --- nltk/metrics/distance.py | 30 ++++++-- nltk/test/unit/test_distance.py | 123 ++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 4 deletions(-) create mode 100644 nltk/test/unit/test_distance.py diff --git a/nltk/metrics/distance.py b/nltk/metrics/distance.py index 85912a3e10..c0da4a1753 100644 --- a/nltk/metrics/distance.py +++ b/nltk/metrics/distance.py @@ -34,7 +34,13 @@ def _edit_dist_init(len1, len2): return lev -def _edit_dist_step(lev, i, j, s1, s2, substitution_cost=1, transpositions=False): +def _last_left_t_init(sigma): + return {c: 0 for c in sigma} + + +def _edit_dist_step( + lev, i, j, s1, s2, last_left, last_right, substitution_cost=1, transpositions=False +): c1 = s1[i - 1] c2 = s2[j - 1] @@ -47,9 +53,8 @@ def _edit_dist_step(lev, i, j, s1, s2, substitution_cost=1, transpositions=False # transposition d = c + 1 # never picked by default - if transpositions and i > 1 and j > 1: - if s1[i - 2] == c2 and s2[j - 2] == c1: - d = lev[i - 2][j - 2] + 1 + if transpositions and last_left > 0 and last_right > 0: + d = lev[last_left - 1][last_right - 1] + i - last_left + j - last_right - 1 # pick the cheapest lev[i][j] = min(a, b, c, d) @@ -85,18 +90,33 @@ def edit_distance(s1, s2, substitution_cost=1, transpositions=False): len2 = len(s2) lev = _edit_dist_init(len1 + 1, len2 + 1) + # retrieve alphabet + sigma = set() + sigma.update(s1) + sigma.update(s2) + + # set up table to remember positions of last seen occurrence in s1 + last_left_t = _last_left_t_init(sigma) + # iterate over the array for i in range(len1): + last_right = 0 for j in range(len2): + last_left = last_left_t[s2[j]] _edit_dist_step( lev, i + 1, j + 1, s1, s2, + last_left, + last_right, substitution_cost=substitution_cost, transpositions=transpositions, ) + if s1[i] == s2[j]: + last_right = j + 1 + last_left_t[s1[i]] = i + 1 return lev[len1][len2] @@ -162,6 +182,8 @@ def edit_distance_align(s1, s2, substitution_cost=1): j + 1, s1, s2, + 0, + 0, substitution_cost=substitution_cost, transpositions=False, ) diff --git a/nltk/test/unit/test_distance.py b/nltk/test/unit/test_distance.py 
new file mode 100644 index 0000000000..bea1b542c2 --- /dev/null +++ b/nltk/test/unit/test_distance.py @@ -0,0 +1,123 @@ +from typing import Tuple + +import pytest + +from nltk.metrics.distance import edit_distance + + +class TestEditDistance: + @pytest.mark.parametrize( + "left,right,substitution_cost,expecteds", + [ + # Allowing transpositions reduces the number of edits required. + # with transpositions: + # e.g. "abc" -T-> "cba" -D-> "ca": 2 steps + # + # without transpositions: + # e.g. "abc" -D-> "ab" -D-> "a" -I-> "ca": 3 steps + ("abc", "ca", 1, (2, 3)), + ("abc", "ca", 5, (2, 3)), # Doesn't *require* substitutions + # Note, a substition_cost of higher than 2 doesn't make much + # sense, as a deletion + insertion is identical, and always + # costs 2. + # + # + # Transpositions don't always reduce the number of edits required: + # with or without transpositions: + # e.g. "wants" -D-> "wats" -D-> "was" -I-> "wasp": 3 steps + ("wants", "wasp", 1, (3, 3)), + ("wants", "wasp", 5, (3, 3)), # Doesn't *require* substitutions + # + # + # Ought to have the same results with and without transpositions + # with or without transpositions: + # e.g. "rain" -S-> "sain" -S-> "shin" -I-> "shine": 3 steps + # (but cost 5 if substitution_cost=2) + ("rain", "shine", 1, (3, 3)), + ("rain", "shine", 2, (5, 5)), # Does *require* substitutions + # + # + # Several potentially interesting typos + # with transpositions: + # e.g. "acbdef" -T-> "abcdef": 1 step + # + # without transpositions: + # e.g. "acbdef" -D-> "abdef" -I-> "abcdef": 2 steps + ("acbdef", "abcdef", 1, (1, 2)), + ("acbdef", "abcdef", 2, (1, 2)), # Doesn't *require* substitutions + # + # + # with transpositions: + # e.g. "lnaguaeg" -T-> "languaeg" -T-> "language": 2 steps + # + # without transpositions: + # e.g. "lnaguaeg" -D-> "laguaeg" -I-> "languaeg" -D-> "languag" -I-> "language": 4 steps + ("lnaguaeg", "language", 1, (2, 4)), + ("lnaguaeg", "language", 2, (2, 4)), # Doesn't *require* substitutions + # + # + # with transpositions: + # e.g. "lnaugage" -T-> "lanugage" -T-> "language": 2 steps + # + # without transpositions: + # e.g. "lnaugage" -S-> "lnangage" -D-> "langage" -I-> "language": 3 steps + # (but one substitution, so a cost of 4 if substition_cost = 2) + ("lnaugage", "language", 1, (2, 3)), + ("lnaugage", "language", 2, (2, 4)), + # Does *require* substitutions if no transpositions + # + # + # with transpositions: + # e.g. "lngauage" -T-> "lnaguage" -T-> "language": 2 steps + # without transpositions: + # e.g. "lngauage" -I-> "lanaguage" -D-> "language": 2 steps + ("lngauage", "language", 1, (2, 2)), + ("lngauage", "language", 2, (2, 2)), # Doesn't *require* substitutions + # + # + # with or without transpositions: + # e.g. "wants" -S-> "sants" -S-> "swnts" -S-> "swits" -S-> "swims" -D-> "swim": 5 steps + # + # with substitution_cost=2 and transpositions: + # e.g. "wants" -T-> "santw" -D-> "sntw" -D-> "stw" -D-> "sw" + # -I-> "swi" -I-> "swim": 6 steps + # + # with substitution_cost=2 and no transpositions: + # e.g. "wants" -I-> "swants" -D-> "swant" -D-> "swan" -D-> "swa" -D-> "sw" + # -I-> "swi" -I-> "swim": 7 steps + ("wants", "swim", 1, (5, 5)), + ("wants", "swim", 2, (6, 7)), + # + # + # with or without transpositions: + # e.g. 
"kitten" -S-> "sitten" -s-> "sittin" -I-> "sitting": 3 steps + # (but cost 5 if substitution_cost=2) + ("kitten", "sitting", 1, (3, 3)), + ("kitten", "sitting", 2, (5, 5)), + ], + ) + def test_with_transpositions( + self, left: str, right: str, substitution_cost: int, expecteds: Tuple[int, int] + ): + """Test `edit_distance` between two strings, given some `substitution_cost`, + and whether transpositions are allowed. + + Args: + left (str): First input string to `edit_distance`. + right (str): Second input string to `edit_distance`. + substitution_cost (int): The cost of a substitution action in `edit_distance`. + expecteds (Tuple[int, int]): A tuple of expected outputs, such that `expecteds[0]` is + the expected output with `transpositions=True`, and `expecteds[1]` is + the expected output with `transpositions=False`. + """ + # Test the input strings in both orderings + for s1, s2 in ((left, right), (right, left)): + # zip with [True, False] to get the transpositions value + for expected, transpositions in zip(expecteds, [True, False]): + predicted = edit_distance( + s1, + s2, + substitution_cost=substitution_cost, + transpositions=transpositions, + ) + assert predicted == expected From 656a6677e6778cc538c874dcef7cfe60203a9fcd Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Sat, 18 Sep 2021 21:34:30 +0930 Subject: [PATCH 10/36] Update ChangeLog Last minute additions... --- ChangeLog | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 81052939b2..923eadee6e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -Version 3.6.3 2021-08-?? +Version 3.6.3 2021-09-19 * Dropped support for Python 3.5 * Run CI tests on Windows, too * Moved from Travis CI to GitHub Actions @@ -14,11 +14,12 @@ Version 3.6.3 2021-08-?? 
* Fix bug in nltk.metrics.aline, C_skip = -10 * Improvements to TweetTokenizer * Optional show arg for FreqDist.plot, ConditionalFreqDist.plot +* edit_distance now computes Damerau-Levenshtein edit-distance Thanks to the following contributors to 3.6.3 -Tom Aarsen, Michael Wayne Goodman, Michał Górny, Maarten ter Huurne, Manu Joseph, -Eric Kafe, Ilia Kurenkov, Daniel Loney, Rob Malouf, purificant, Danny Sepler, -Anthony Sottile +Tom Aarsen, Abhijnan Bajpai, Michael Wayne Goodman, Michał Górny, Maarten ter Huurne, +Manu Joseph, Eric Kafe, Ilia Kurenkov, Daniel Loney, Rob Malouf, Mohaned Mashaly, +purificant, Danny Sepler, Anthony Sottile Version 3.6.2 2021-04-20 * move test code to nltk/test From c566ef29570fda1c844a7e7f56754654d84203c9 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Sat, 18 Sep 2021 22:33:11 +0930 Subject: [PATCH 11/36] Update news.rst --- web/news.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/web/news.rst b/web/news.rst index 4c778efb38..80e730c5a4 100644 --- a/web/news.rst +++ b/web/news.rst @@ -4,6 +4,15 @@ NLTK News 2021 ---- +NLTK 3.6.3 release: September 2021 + Drop support for Python 3.5, + added pre-commit hooks (isort, pyupgrade, black), + improvements to WordNet visualization, RIBES score, edit_distance, + METEOR score, Punkt, language model package, TweetTokenizer, + code and comment cleanups, + CI tests now also run on Windows, + moved from Travis CI to GitHub Actions + NLTK 3.6.2 release: April 2021 Minor enhancements From 1d87106312be12b8510890be42c329c00e1b425f Mon Sep 17 00:00:00 2001 From: purificant Date: Sat, 18 Sep 2021 17:35:23 +0100 Subject: [PATCH 12/36] trim whitespace --- web/news.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/news.rst b/web/news.rst index 80e730c5a4..00734ba0b6 100644 --- a/web/news.rst +++ b/web/news.rst @@ -5,7 +5,7 @@ NLTK News ---- NLTK 3.6.3 release: September 2021 - Drop support for Python 3.5, + Drop support for Python 3.5, added pre-commit hooks (isort, pyupgrade, black), improvements to WordNet visualization, RIBES score, edit_distance, METEOR score, Punkt, language model package, TweetTokenizer, From fa537c49b4ae92141d2dcd4aefe7085940bb6922 Mon Sep 17 00:00:00 2001 From: purificant Date: Sat, 18 Sep 2021 18:47:50 +0100 Subject: [PATCH 13/36] replace travis badge with github actions badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 55a173c2f1..914ebfdfcb 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Natural Language Toolkit (NLTK) [![PyPI](https://img.shields.io/pypi/v/nltk.svg)](https://pypi.python.org/pypi/nltk) -[![Travis](https://travis-ci.org/nltk/nltk.svg?branch=develop)](https://travis-ci.org/nltk/nltk) +![CI](https://github.com/nltk/nltk/actions/workflows/ci.yaml/badge.svg?branch=develop) NLTK -- the Natural Language Toolkit -- is a suite of open source Python modules, data sets, and tutorials supporting research and development in Natural From ef57542072975b9261c4a0428cd0c0d29d84833f Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Thu, 23 Sep 2021 13:25:06 +0200 Subject: [PATCH 14/36] More typos in comments and outputs (#2814) --- CONTRIBUTING.md | 2 +- ChangeLog | 4 ++-- jenkins.sh | 2 +- nltk/corpus/reader/wordnet.py | 2 +- nltk/featstruct.py | 2 +- nltk/parse/util.py | 2 +- nltk/test/grammartestsuites.doctest | 2 +- nltk/test/relextract.doctest | 2 +- tools/find_deprecated.py | 2 +- tools/travis/third-party.sh | 2 +- 10 
files changed, 11 insertions(+), 11 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ed2214b7ef..36091b3359 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -94,7 +94,7 @@ Summary of our git branching model: - Never use `git add .`: it can add unwanted files; - Avoid using `git commit -a` unless you know what you're doing; - Check every change with `git diff` before adding them to the index (stage - area) and with `git diff --cached` before commiting; + area) and with `git diff --cached` before committing; - Make sure you add your name to our [list of contributors](https://github.com/nltk/nltk/blob/develop/AUTHORS.md); - If you have push access to the main repository, please do not commit directly to `develop`: your access should be used only to accept pull requests; if you diff --git a/ChangeLog b/ChangeLog index 923eadee6e..c4a39bc542 100644 --- a/ChangeLog +++ b/ChangeLog @@ -755,7 +755,7 @@ NLTK: Data: * Corrected identifiers in Dependency Treebank corpus * Basque and Catalan Dependency Treebanks (CoNLL 2007) -* PE08 Parser Evalution data +* PE08 Parser Evaluation data * New models for POS tagger and named-entity tagger Book: @@ -1068,7 +1068,7 @@ Code: - changed corpus.util to use the 'rb' flag for opening files, to fix problems reading corpora under MSWindows - updated stale examples in engineering.txt -- extended feature stucture interface to permit chained features, e.g. fs['F','G'] +- extended feature structure interface to permit chained features, e.g. fs['F','G'] - further misc improvements to test code plus some bugfixes Tutorials: - rewritten opening section of tagging chapter diff --git a/jenkins.sh b/jenkins.sh index 574a9d133b..3c9bdbadc1 100755 --- a/jenkins.sh +++ b/jenkins.sh @@ -24,7 +24,7 @@ if [[ ! -d $senna_folder_name ]]; then rm ${senna_file_name} fi -# Setup the Enviroment variable +# Setup the Environment variable export SENNA=$(pwd)'/senna' popd diff --git a/nltk/corpus/reader/wordnet.py b/nltk/corpus/reader/wordnet.py index 34e2196b0b..493892f301 100644 --- a/nltk/corpus/reader/wordnet.py +++ b/nltk/corpus/reader/wordnet.py @@ -1136,7 +1136,7 @@ def __init__(self, root, omw_reader): # Map from lemma -> pos -> synset_index -> offset self._lemma_pos_offset_map = defaultdict(dict) - # A cache so we don't have to reconstuct synsets + # A cache so we don't have to reconstruct synsets # Map from pos -> offset -> synset self._synset_offset_cache = defaultdict(dict) diff --git a/nltk/featstruct.py b/nltk/featstruct.py index 080eeba6fc..a7001eb9aa 100644 --- a/nltk/featstruct.py +++ b/nltk/featstruct.py @@ -1858,7 +1858,7 @@ def _default_fs_class(obj): class SubstituteBindingsSequence(SubstituteBindingsI): """ - A mixin class for sequence clases that distributes variables() and + A mixin class for sequence classes that distributes variables() and substitute_bindings() over the object's elements. """ diff --git a/nltk/parse/util.py b/nltk/parse/util.py index 3338ccb070..b730556e84 100644 --- a/nltk/parse/util.py +++ b/nltk/parse/util.py @@ -162,7 +162,7 @@ def run(self, show_trees=False): Sentences in the test suite are divided into two classes: - grammatical (``accept``) and - ungrammatical (``reject``). - If a sentence should parse accordng to the grammar, the value of + If a sentence should parse according to the grammar, the value of ``trees`` will be a non-empty list. If a sentence should be rejected according to the grammar, then the value of ``trees`` will be None. 
""" diff --git a/nltk/test/grammartestsuites.doctest b/nltk/test/grammartestsuites.doctest index 48d06992da..1ad162c16d 100644 --- a/nltk/test/grammartestsuites.doctest +++ b/nltk/test/grammartestsuites.doctest @@ -10,7 +10,7 @@ Sentences in the test suite are divided into two classes: - grammatical (*accept*) and - ungrammatical (*reject*). -If a sentence should parse accordng to the grammar, the value of +If a sentence should parse according to the grammar, the value of ``trees`` will be a non-empty list. If a sentence should be rejected according to the grammar, then the value of ``trees`` will be ``None``. diff --git a/nltk/test/relextract.doctest b/nltk/test/relextract.doctest index 4e6a0a32cf..d13a9f045e 100644 --- a/nltk/test/relextract.doctest +++ b/nltk/test/relextract.doctest @@ -176,7 +176,7 @@ signature . [ORG: 'Open Text'] ', based in' [LOC: 'Waterloo'] ... -The next example illustrates a case where the patter is a disjunction +The next example illustrates a case where the pattern is a disjunction of roles that a PERSON can occupy in an ORGANIZATION. >>> roles = r""" diff --git a/tools/find_deprecated.py b/tools/find_deprecated.py index 822eb63d23..94f1332eab 100755 --- a/tools/find_deprecated.py +++ b/tools/find_deprecated.py @@ -232,7 +232,7 @@ def main(): print("Unable to import nltk -- check your PYTHONPATH.") sys.exit(-1) - print("Finding definitions of deprecated funtions & classes in nltk...") + print("Finding definitions of deprecated functions & classes in nltk...") find_deprecated_defs(nltk.__path__[0]) print("Looking for possible uses of deprecated funcs & classes...") diff --git a/tools/travis/third-party.sh b/tools/travis/third-party.sh index 9e09d757f2..57971b3724 100644 --- a/tools/travis/third-party.sh +++ b/tools/travis/third-party.sh @@ -51,7 +51,7 @@ if [[ ! 
-d $senna_folder_name ]]; then rm ${senna_file_name} fi -# Setup the Enviroment variable +# Setup the Environment variable export CLASSPATH=$(pwd)"/${stanford_corenlp_package_name}" export CLASSPATH=${CLASSPATH}:$(pwd)"/${stanford_parser_package_name}" export CLASSPATH=${CLASSPATH}:$(pwd)"/${stanford_tagger_package_name}" From 03e1ebd9f720e4263457d4e1d5886f4d7aba0ef5 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Thu, 23 Sep 2021 23:17:42 +0200 Subject: [PATCH 15/36] Refactored CISTEM Stemmer for German (#2815) - Removed code duplication - Modernised variable typing in method signatures - Updated the documentation --- nltk/stem/cistem.py | 109 ++++++++++++++++++++------------------------ 1 file changed, 50 insertions(+), 59 deletions(-) diff --git a/nltk/stem/cistem.py b/nltk/stem/cistem.py index 3bd13354f8..2966e8e589 100644 --- a/nltk/stem/cistem.py +++ b/nltk/stem/cistem.py @@ -1,12 +1,14 @@ # Natural Language Toolkit: CISTEM Stemmer for German # Copyright (C) 2001-2021 NLTK Project # Author: Leonie Weissweiler +# Tom Aarsen <> (modifications) # Algorithm: Leonie Weissweiler # Alexander Fraser # URL: # For license information, see LICENSE.TXT import re +from typing import Tuple from nltk.stem.api import StemmerI @@ -48,11 +50,11 @@ class Cistem(StemmerI): strip_esn = re.compile(r"[esn]$") repl_xx_back = re.compile(r"(.)\*") - def __init__(self, case_insensitive=False): + def __init__(self, case_insensitive: bool = False): self._case_insensitive = case_insensitive @staticmethod - def replace_to(word): + def replace_to(word: str) -> str: word = word.replace("sch", "$") word = word.replace("ei", "%") word = word.replace("ie", "&") @@ -61,7 +63,7 @@ def replace_to(word): return word @staticmethod - def replace_back(word): + def replace_back(word: str) -> str: word = Cistem.repl_xx_back.sub(r"\1\1", word) word = word.replace("%", "ei") word = word.replace("&", "ie") @@ -69,14 +71,13 @@ def replace_back(word): return word - def stem(self, word): - """ - This method takes the word to be stemmed and returns the stemmed word. + def stem(self, word: str) -> str: + """Stems the input word. - :param word: the word that is to be stemmed - :type word: unicode - :return word: the stemmed word - :rtype: unicode + :param word: The word that is to be stemmed. + :type word: str + :return: The stemmed word. + :rtype: str >>> from nltk.stem.cistem import Cistem >>> stemmer = Cistem() @@ -109,34 +110,10 @@ def stem(self, word): word = word.replace("ß", "ss") word = Cistem.strip_ge.sub(r"\1", word) - word = Cistem.replace_to(word) - - while len(word) > 3: - if len(word) > 5: - (word, success) = Cistem.strip_emr.subn("", word) - if success != 0: - continue - - (word, success) = Cistem.strip_nd.subn("", word) - if success != 0: - continue - - if not upper or self._case_insensitive: - (word, success) = Cistem.strip_t.subn("", word) - if success != 0: - continue - - (word, success) = Cistem.strip_esn.subn("", word) - if success != 0: - continue - else: - break - word = Cistem.replace_back(word) + return self._segment_inner(word, upper)[0] - return word - - def segment(self, word): + def segment(self, word: str) -> Tuple[str, str]: """ This method works very similarly to stem (:func:'cistem.stem'). 
The difference is that in addition to returning the stem, it also returns the rest that was removed at @@ -144,17 +121,15 @@ def segment(self, word): can be concatenated to form the original word, all subsitutions that altered the stem in any other way than by removing letters at the end were left out. - :param word: the word that is to be stemmed - :type word: unicode - :return word: the stemmed word - :rtype: unicode - :return word: the removed suffix - :rtype: unicode + :param word: The word that is to be stemmed. + :type word: str + :return: A tuple of the stemmed word and the removed suffix. + :rtype: Tuple[str, str] >>> from nltk.stem.cistem import Cistem >>> stemmer = Cistem() >>> s1 = "Speicherbehältern" - >>> print("('" + stemmer.segment(s1)[0] + "', '" + stemmer.segment(s1)[1] + "')") + >>> stemmer.segment(s1) ('speicherbehält', 'ern') >>> s2 = "Grenzpostens" >>> stemmer.segment(s2) @@ -163,56 +138,72 @@ def segment(self, word): >>> stemmer.segment(s3) ('ausgefeilt', 'ere') >>> stemmer = Cistem(True) - >>> print("('" + stemmer.segment(s1)[0] + "', '" + stemmer.segment(s1)[1] + "')") + >>> stemmer.segment(s1) ('speicherbehäl', 'tern') >>> stemmer.segment(s2) ('grenzpo', 'stens') >>> stemmer.segment(s3) ('ausgefeil', 'tere') """ - - rest_length = 0 - if len(word) == 0: return ("", "") upper = word[0].isupper() word = word.lower() - original = word[:] + return self._segment_inner(word, upper) + + def _segment_inner(self, word: str, upper: bool): + """Inner method for iteratively applying the code stemming regexes. + This method receives a pre-processed variant of the word to be stemmed, + or the word to be segmented, and returns a tuple of the word and the + removed suffix. + + :param word: A pre-processed variant of the word that is to be stemmed. + :type word: str + :param upper: Whether the original word started with a capital letter. + :type upper: bool + :return: A tuple of the stemmed word and the removed suffix. 
+ :rtype: Tuple[str, str] + """ + + rest_length = 0 + word_copy = word[:] + # Pre-processing before applying the substitution patterns word = Cistem.replace_to(word) + rest = "" + # Apply the substitution patterns while len(word) > 3: if len(word) > 5: - (word, success) = Cistem.strip_emr.subn("", word) - if success != 0: + word, n = Cistem.strip_emr.subn("", word) + if n != 0: rest_length += 2 continue - (word, success) = Cistem.strip_nd.subn("", word) - if success != 0: + word, n = Cistem.strip_nd.subn("", word) + if n != 0: rest_length += 2 continue if not upper or self._case_insensitive: - (word, success) = Cistem.strip_t.subn("", word) - if success != 0: + word, n = Cistem.strip_t.subn("", word) + if n != 0: rest_length += 1 continue - (word, success) = Cistem.strip_esn.subn("", word) - if success != 0: + word, n = Cistem.strip_esn.subn("", word) + if n != 0: rest_length += 1 continue else: break + # Post-processing after applying the substitution patterns word = Cistem.replace_back(word) if rest_length: - rest = original[-rest_length:] - else: - rest = "" + rest = word_copy[-rest_length:] return (word, rest) From 2b1528f952d5c497e3119234b0154fa25fa044e0 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Fri, 24 Sep 2021 06:52:17 +0930 Subject: [PATCH 16/36] NLTK Team as author and maintainer --- setup.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 48e2a319ab..061ae8899e 100644 --- a/setup.py +++ b/setup.py @@ -3,9 +3,7 @@ # Setup script for the Natural Language Toolkit # # Copyright (C) 2001-2021 NLTK Project -# Author: Steven Bird -# Edward Loper -# Ewan Klein +# Author: NLTK Team # URL: # For license information, see LICENSE.TXT @@ -86,10 +84,10 @@ "natural language", "text analytics", ], - maintainer="Steven Bird", - maintainer_email="stevenbird1@gmail.com", - author="Steven Bird", - author_email="stevenbird1@gmail.com", + maintainer="NLTK Team", + maintainer_email="nltk.team@gmail.com", + author="NLTK Team", + author_email="nltk.team@gmail.com", classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", From 8c38a2a789e3c26bcb1297ff98bc304812c5d4c5 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 24 Sep 2021 09:26:04 +0200 Subject: [PATCH 17/36] NLTK Team as author and maintainer - in __init__.py --- nltk/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nltk/__init__.py b/nltk/__init__.py index b42c548992..b06598edca 100644 --- a/nltk/__init__.py +++ b/nltk/__init__.py @@ -70,8 +70,8 @@ __url__ = "http://nltk.org/" # Maintainer, contributors, etc. -__maintainer__ = "Steven Bird" -__maintainer_email__ = "stevenbird1@gmail.com" +__maintainer__ = "NLTK Team" +__maintainer_email__ = "nltk.team@gmail.com" __author__ = __maintainer__ __author_email__ = __maintainer_email__ From f4ce2cef06125dd406bd57ab08b2fdb57ebe1531 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Fri, 24 Sep 2021 21:52:07 +0930 Subject: [PATCH 18/36] NLTK authorises nltk.team@gmail.com as its security contact. 
Resolves #2811 --- SECURITY.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000..36eaa01ae5 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,6 @@ +# Security Policy + +## Reporting a Vulnerability + +Please report security issues to `nltk.team@gmail.com` + From 5ecfe44571fb4dec77a20a2fc127508948723497 Mon Sep 17 00:00:00 2001 From: mohaned mashaly <30902228+12mohaned@users.noreply.github.com> Date: Fri, 24 Sep 2021 14:31:27 +0200 Subject: [PATCH 19/36] feat: enable phone number regex to recognize phone numbers (#2798) * feat: enable phone number regex to recognize phone numbers * add pre-commit support * remove duplicate regular expression * remove overriden line * Refactored TweetTokenizer, incl. caching regex compilations and adding match_phone_numbers * Removed some test input duplication * Added several TweetTokenizer tests with phone numbers * Set match_phone_numbers=True as the default for TweetTokenizer Co-authored-by: Tom Aarsen --- nltk/test/unit/test_tokenize.py | 207 ++++++++++++++++++++++++++++++++ nltk/tokenize/casual.py | 138 ++++++++++++++++----- 2 files changed, 314 insertions(+), 31 deletions(-) diff --git a/nltk/test/unit/test_tokenize.py b/nltk/test/unit/test_tokenize.py index 2efd793ef9..524aee2b5d 100644 --- a/nltk/test/unit/test_tokenize.py +++ b/nltk/test/unit/test_tokenize.py @@ -2,6 +2,8 @@ Unit tests for nltk.tokenize. See also nltk/test/tokenize.doctest """ +from typing import List, Tuple + import pytest from nltk.tokenize import ( @@ -54,6 +56,211 @@ def test_tweet_tokenizer(self): ] assert tokens == expected + @pytest.mark.parametrize( + "test_input, expecteds", + [ + ( + "My text 0106404243030 is great text", + ( + ["My", "text", "01064042430", "30", "is", "great", "text"], + ["My", "text", "0106404243030", "is", "great", "text"], + ), + ), + ( + "My ticket id is 1234543124123", + ( + ["My", "ticket", "id", "is", "12345431241", "23"], + ["My", "ticket", "id", "is", "1234543124123"], + ), + ), + ( + "@remy: This is waaaaayyyy too much for you!!!!!! 01064042430", + ( + [ + ":", + "This", + "is", + "waaayyy", + "too", + "much", + "for", + "you", + "!", + "!", + "!", + "01064042430", + ], + [ + ":", + "This", + "is", + "waaayyy", + "too", + "much", + "for", + "you", + "!", + "!", + "!", + "01064042430", + ], + ), + ), + # Further tests from https://github.com/nltk/nltk/pull/2798#issuecomment-922533085, + # showing the TweetTokenizer performance for `match_phone_numbers=True` and + # `match_phone_numbers=False`. 
+ ( + # Some phone numbers are always tokenized, even with `match_phone_numbers=`False` + "My number is 06-46124080, except it's not.", + ( + [ + "My", + "number", + "is", + "06-46124080", + ",", + "except", + "it's", + "not", + ".", + ], + [ + "My", + "number", + "is", + "06-46124080", + ",", + "except", + "it's", + "not", + ".", + ], + ), + ), + ( + # Phone number here is only tokenized correctly if `match_phone_numbers=True` + "My number is 601-984-4813, except it's not.", + ( + [ + "My", + "number", + "is", + "601-984-4813", + ",", + "except", + "it's", + "not", + ".", + ], + [ + "My", + "number", + "is", + "601-984-", + "4813", + ",", + "except", + "it's", + "not", + ".", + ], + ), + ), + ( + # Phone number here is only tokenized correctly if `match_phone_numbers=True` + "My number is (393) 928 -3010, except it's not.", + ( + [ + "My", + "number", + "is", + "(393) 928 -3010", + ",", + "except", + "it's", + "not", + ".", + ], + [ + "My", + "number", + "is", + "(", + "393", + ")", + "928", + "-", + "3010", + ",", + "except", + "it's", + "not", + ".", + ], + ), + ), + ( + # A long number is tokenized correctly only if `match_phone_numbers=False` + "The product identification number is 48103284512.", + ( + [ + "The", + "product", + "identification", + "number", + "is", + "4810328451", + "2", + ".", + ], + [ + "The", + "product", + "identification", + "number", + "is", + "48103284512", + ".", + ], + ), + ), + ( + # `match_phone_numbers=True` can have some unforeseen + "My favourite substraction is 240 - 1353.", + ( + ["My", "favourite", "substraction", "is", "240 - 1353", "."], + ["My", "favourite", "substraction", "is", "240", "-", "1353", "."], + ), + ), + ], + ) + def test_tweet_tokenizer_expanded( + self, test_input: str, expecteds: Tuple[List[str], List[str]] + ): + """ + Test `match_phone_numbers` in TweetTokenizer. + + Note that TweetTokenizer is also passed the following for these tests: + * strip_handles=True + * reduce_len=True + + :param test_input: The input string to tokenize using TweetTokenizer. + :type test_input: str + :param expecteds: A 2-tuple of tokenized sentences. The first of the two + tokenized is the expected output of tokenization with `match_phone_numbers=True`. + The second of the two tokenized lists is the expected output of tokenization + with `match_phone_numbers=False`. + :type expecteds: Tuple[List[str], List[str]] + """ + for match_phone_numbers, expected in zip([True, False], expecteds): + tokenizer = TweetTokenizer( + strip_handles=True, + reduce_len=True, + match_phone_numbers=match_phone_numbers, + ) + predicted = tokenizer.tokenize(test_input) + assert predicted == expected + def test_sonority_sequencing_syllable_tokenizer(self): """ Test SyllableTokenizer tokenizer. diff --git a/nltk/tokenize/casual.py b/nltk/tokenize/casual.py index 0fb0b39243..f7b256713e 100644 --- a/nltk/tokenize/casual.py +++ b/nltk/tokenize/casual.py @@ -5,6 +5,7 @@ # Author: Christopher Potts # Ewan Klein (modifications) # Pierpaolo Pantone <> (modifications) +# Tom Aarsen <> (modifications) # URL: # For license information, see LICENSE.TXT # @@ -14,27 +15,36 @@ Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domains and tasks. The basic logic is this: -1. The tuple regex_strings defines a list of regular expression +1. The tuple REGEXPS defines a list of regular expression strings. -2. The regex_strings strings are put, in order, into a compiled - regular expression object called word_re. +2. 
The REGEXPS strings are put, in order, into a compiled + regular expression object called WORD_RE, under the TweetTokenizer + class. -3. The tokenization is done by word_re.findall(s), where s is the +3. The tokenization is done by WORD_RE.findall(s), where s is the user-supplied string, inside the tokenize() method of the class - Tokenizer. - -4. When instantiating Tokenizer objects, there is a single option: - preserve_case. By default, it is set to True. If it is set to - False, then the tokenizer will downcase everything except for - emoticons. - + TweetTokenizer. + +4. When instantiating Tokenizer objects, there are several options: + * preserve_case. By default, it is set to True. If it is set to + False, then the tokenizer will downcase everything except for + emoticons. + * reduce_len. By default, it is set to False. It specifies whether + to replace repeated character sequences of length 3 or greater + with sequences of length 3. + * strip_handles. By default, it is set to False. It specifies + whether to remove Twitter handles of text used in the + `tokenize` method. + * match_phone_numbers. By default, it is set to True. It indicates + whether the `tokenize` method should look for phone numbers. """ ###################################################################### import html +from typing import List import regex # https://github.com/nltk/nltk/issues/2409 @@ -115,11 +125,8 @@ ) """ -# The components of the tokenizer: -REGEXPS = ( - URLS, - # Phone numbers: - r""" +# Regex for recognizing phone numbers: +PHONE_REGEX = r""" (?: (?: # (international) \+?[01] @@ -133,7 +140,11 @@ \d{3} # exchange [ *\-.\)]* \d{4} # base - )""", + )""" + +# The components of the tokenizer: +REGEXPS = ( + URLS, # ASCII Emoticons EMOTICONS, # HTML tags: @@ -160,12 +171,12 @@ """, ) -###################################################################### -# This is the core tokenizing regex: +# Take the main components and add a phone regex as the second parameter +REGEXPS_PHONE = (REGEXPS[0], PHONE_REGEX, *REGEXPS[1:]) -WORD_RE = regex.compile( - r"""(%s)""" % "|".join(REGEXPS), regex.VERBOSE | regex.I | regex.UNICODE -) +###################################################################### +# TweetTokenizer.WORD_RE and TweetTokenizer.PHONE_WORD_RE represent +# the core tokenizing regexes. They are compiled lazily. # WORD_RE performs poorly on these patterns: HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}") @@ -277,17 +288,48 @@ class TweetTokenizer: [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!'] """ - def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False): + # Values used to lazily compile WORD_RE and PHONE_WORD_RE, + # which are the core tokenizing regexes. + _WORD_RE = None + _PHONE_WORD_RE = None + + ###################################################################### + + def __init__( + self, + preserve_case=True, + reduce_len=False, + strip_handles=False, + match_phone_numbers=True, + ): + """ + Create a `TweetTokenizer` instance with settings for use in the `tokenize` method. + + :param preserve_case: Flag indicating whether to preserve the casing (capitalisation) + of text used in the `tokenize` method. Defaults to True. + :type preserve_case: bool + :param reduce_len: Flag indicating whether to replace repeated character sequences + of length 3 or greater with sequences of length 3. Defaults to False. 
+ :type reduce_len: bool + :param strip_handles: Flag indicating whether to remove Twitter handles of text used + in the `tokenize` method. Defaults to False. + :type strip_handles: bool + :param match_phone_numbers: Flag indicating whether the `tokenize` method should look + for phone numbers. Defaults to True. + :type match_phone_numbers: bool + """ self.preserve_case = preserve_case self.reduce_len = reduce_len self.strip_handles = strip_handles + self.match_phone_numbers = match_phone_numbers + + def tokenize(self, text: str) -> List[str]: + """Tokenize the input text. - def tokenize(self, text): - """ :param text: str :rtype: list(str) - :return: a tokenized list of strings; concatenating this list returns\ - the original string if `preserve_case=False` + :return: a tokenized list of strings; joining this list returns\ + the original string if `preserve_case=False`. """ # Fix HTML character entities: text = _replace_html_entities(text) @@ -299,8 +341,11 @@ def tokenize(self, text): text = reduce_lengthening(text) # Shorten problematic sequences of characters safe_text = HANG_RE.sub(r"\1\1\1", text) - # Tokenize: - words = WORD_RE.findall(safe_text) + # Recognise phone numbers during tokenization + if self.match_phone_numbers: + words = self.PHONE_WORD_RE.findall(safe_text) + else: + words = self.WORD_RE.findall(safe_text) # Possibly alter the case, but avoid changing emoticons like :D into :d: if not self.preserve_case: words = list( @@ -308,6 +353,28 @@ def tokenize(self, text): ) return words + @property + def WORD_RE(self) -> regex.Pattern: + """Core TweetTokenizer regex""" + # Compiles the regex for this and all future instantiations of TweetTokenizer. + if not type(self)._WORD_RE: + type(self)._WORD_RE = regex.compile( + f"({'|'.join(REGEXPS)})", + regex.VERBOSE | regex.I | regex.UNICODE, + ) + return type(self)._WORD_RE + + @property + def PHONE_WORD_RE(self) -> regex.Pattern: + """Secondary core TweetTokenizer regex""" + # Compiles the regex for this and all future instantiations of TweetTokenizer. + if not type(self)._PHONE_WORD_RE: + type(self)._PHONE_WORD_RE = regex.compile( + f"({'|'.join(REGEXPS_PHONE)})", + regex.VERBOSE | regex.I | regex.UNICODE, + ) + return type(self)._PHONE_WORD_RE + ###################################################################### # Normalization Functions @@ -336,12 +403,21 @@ def remove_handles(text): ###################################################################### -def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False): +def casual_tokenize( + text, + preserve_case=True, + reduce_len=False, + strip_handles=False, + match_phone_numbers=True, +): """ Convenience function for wrapping the tokenizer. 
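 
     A minimal sketch of the expected behaviour (an editorial example,
     mirroring the 601-984-4813 test case above; it assumes the new
     default ``match_phone_numbers=True``):
 
         >>> casual_tokenize("My number is 601-984-4813, except it's not.")
         ['My', 'number', 'is', '601-984-4813', ',', 'except', "it's", 'not', '.']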
""" return TweetTokenizer( - preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles + preserve_case=preserve_case, + reduce_len=reduce_len, + strip_handles=strip_handles, + match_phone_numbers=match_phone_numbers, ).tokenize(text) From 23f4b1c4b4006b0cb3ec278e801029557cec4e82 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 24 Sep 2021 15:42:03 +0200 Subject: [PATCH 20/36] Fix end of line in new SECURITY.md --- SECURITY.md | 1 - 1 file changed, 1 deletion(-) diff --git a/SECURITY.md b/SECURITY.md index 36eaa01ae5..27ff9b6aaa 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -3,4 +3,3 @@ ## Reporting a Vulnerability Please report security issues to `nltk.team@gmail.com` - From 277711ab1dec729e626b27aab6fa35ea5efbd7e6 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Sat, 25 Sep 2021 01:14:32 +0200 Subject: [PATCH 21/36] Resolved ReDoS vulnerability in Corpus Reader (#2816) * Resolved ReDoS vulnerability in the Corpus Reader for the Comparative Sentence Dataset * Solidified performance tests --- nltk/corpus/reader/comparative_sents.py | 2 +- nltk/test/corpus.doctest | 32 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/nltk/corpus/reader/comparative_sents.py b/nltk/corpus/reader/comparative_sents.py index cbfac4c13c..ed295e4e02 100644 --- a/nltk/corpus/reader/comparative_sents.py +++ b/nltk/corpus/reader/comparative_sents.py @@ -45,7 +45,7 @@ GRAD_COMPARISON = re.compile(r"") NON_GRAD_COMPARISON = re.compile(r"") ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)") -KEYWORD = re.compile(r"\((?!.*\()(.*)\)$") +KEYWORD = re.compile(r"\(([^\(]*)\)$") class Comparison: diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest index 82b17f8a5a..560e641a69 100644 --- a/nltk/test/corpus.doctest +++ b/nltk/test/corpus.doctest @@ -2162,3 +2162,35 @@ access to its tuples() method >>> from nltk.corpus import qc >>> qc.tuples('test.txt') [('NUM:dist', 'How far is it from Denver to Aspen ?'), ('LOC:city', 'What county is Modesto , California in ?'), ...] + +Ensure that KEYWORD from `comparative_sents.py` no longer contains a ReDoS vulnerability. + + >>> import re + >>> import time + >>> from nltk.corpus.reader.comparative_sents import KEYWORD + >>> sizes = { + ... "short": 4000, + ... "long": 40000 + ... } + >>> exec_times = { + ... "short": [], + ... "long": [], + ... } + >>> for size_name, size in sizes.items(): + ... for j in range(9): + ... start_t = time.perf_counter() + ... payload = "( " + "(" * size + ... output = KEYWORD.findall(payload) + ... exec_times[size_name].append(time.perf_counter() - start_t) + ... exec_times[size_name] = sorted(exec_times[size_name])[4] # Get the mean + +Ideally, the execution time of such a regular expression is linear +in the length of the input. As such, we would expect exec_times["long"] +to be roughly 10 times as big as exec_times["short"]. +With the ReDoS in place, it took roughly 80 times as long. +For now, we accept values below 30 (times as long), due to the potential +for variance. This ensures that the ReDoS has certainly been reduced, +if not removed. + + >>> exec_times["long"] / exec_times["short"] < 30 + True From 63d004b6a7b7357d5597c56ed254624e20abc281 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Sun, 26 Sep 2021 23:27:32 +0200 Subject: [PATCH 22/36] Added docstring for WordNetLemmatizer lemmatize (#2819) Includes importing wordnet as wn, which is common in practice. 
--- nltk/stem/wordnet.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/nltk/stem/wordnet.py b/nltk/stem/wordnet.py index d56537a084..3ac3424247 100644 --- a/nltk/stem/wordnet.py +++ b/nltk/stem/wordnet.py @@ -6,8 +6,7 @@ # URL: # For license information, see LICENSE.TXT -from nltk.corpus import wordnet -from nltk.corpus.reader.wordnet import NOUN +from nltk.corpus import wordnet as wn class WordNetLemmatizer: @@ -31,11 +30,19 @@ class WordNetLemmatizer: hardrock """ - def __init__(self): - pass - - def lemmatize(self, word, pos=NOUN): - lemmas = wordnet._morphy(word, pos) + def lemmatize(self, word: str, pos: str = wn.NOUN) -> str: + """Lemmatize `word` using WordNet's built-in morphy function. + Returns the input word unchanged if it cannot be found in WordNet. + + :param word: The input word to lemmatize. + :type word: str + :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns, + `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"` + for satellite adjectives. + :param pos: str + :return: The lemma of `word`, for the given `pos`. + """ + lemmas = wn._morphy(word, pos) return min(lemmas, key=len) if lemmas else word def __repr__(self): From e7420b1500fd20133c2bed0eb2cf3611f7574c25 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Mon, 27 Sep 2021 08:12:09 +0930 Subject: [PATCH 23/36] Configure sphinx in setup.cfg cf #2742 --- setup.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.cfg b/setup.cfg index 3529e62d50..7e3bc1a4be 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,3 +3,6 @@ license_files = LICENSE.txt AUTHORS.md README.md + +[build_sphinx] +source-dir = web From 1aff8c19ccee8bd56ede056d282568e0ff84b674 Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Mon, 27 Sep 2021 21:18:03 +0930 Subject: [PATCH 24/36] Configure sphinx --- RELEASE-HOWTO.txt | 28 ++--- web/api/nltk.rst | 282 +++++++++++++++++++++++++++++----------------- web/conf.py | 5 + 3 files changed, 195 insertions(+), 120 deletions(-) diff --git a/RELEASE-HOWTO.txt b/RELEASE-HOWTO.txt index 2ef5f8e27c..329957f85e 100644 --- a/RELEASE-HOWTO.txt +++ b/RELEASE-HOWTO.txt @@ -2,8 +2,8 @@ Building an NLTK distribution ---------------------------------- 1. Testing - - Ensure CI server isn't reporting any test failures - https://www.travis-ci.org/nltk/nltk + - Check no errors are reported in our continuous integration service: + https://github.com/nltk/nltk/actions - Optionally test demonstration code locally make demotest - Optionally test individual modules: @@ -29,17 +29,13 @@ Building an NLTK distribution (including the range of Python versions that are supported) edit web/install.rst setup.py - Rebuild the API docs - - make sure you have the current revision of the web pages - cd nltk.github.com; git pull - - build - cd ../nltk/web - make (slow; lots of warning messages about cross references) - - publish - cd ../../nltk.github.com - git add _modules _sources _static api *.html objects.inv searchindex.js - git status (missing any important looking files?) - git commit -m "updates for version 3.X.Y" - git push origin master + python setup.py build_sphinx -b man --build-dir build/sphinx + - Publish them + cd nltk.github.com; git pull (begin with current docs repo) + + git add . + git commit -m "updates for version 3.X.Y" + git push origin master 4. 
Create a new version - (Optionally do this in a release branch, branching from develop branch @@ -65,12 +61,8 @@ Building an NLTK distribution nltk-dev (for beta releases) nltk-users (for final releases) nltk twitter account - - announce to external mailing lists, for major N.N releases only - CORPORA@uib.no, linguist@linguistlist.org, - PythonSIL@lists.sil.org, edu-sig@python.org - mailing lists for any local courses using NLTK -7. Optionally update to new version +7. Optionally update repo version - we don't want builds from the repository to have the same release number e.g. after release X.Y.4, update repository version to X.Y.5a (alpha) diff --git a/web/api/nltk.rst b/web/api/nltk.rst index c5dad19c58..3da0e9e211 100644 --- a/web/api/nltk.rst +++ b/web/api/nltk.rst @@ -1,148 +1,226 @@ -.. manually constructed -- removed several low-level packages - -nltk Package +nltk package ============ -:mod:`nltk` Package -------------------- +Subpackages +----------- -.. automodule:: nltk.__init__ - :members: - :undoc-members: - :show-inheritance: +.. toctree:: + :maxdepth: 4 + + nltk.app + nltk.ccg + nltk.chat + nltk.chunk + nltk.classify + nltk.cluster + nltk.corpus + nltk.draw + nltk.inference + nltk.lm + nltk.metrics + nltk.misc + nltk.parse + nltk.sem + nltk.sentiment + nltk.stem + nltk.tag + nltk.tbl + nltk.test + nltk.tokenize + nltk.translate + nltk.twitter + +Submodules +---------- + +nltk.book module +---------------- + +.. automodule:: nltk.book + :members: + :undoc-members: + :show-inheritance: + +nltk.cli module +--------------- + +.. automodule:: nltk.cli + :members: + :undoc-members: + :show-inheritance: + +nltk.collections module +----------------------- -:mod:`collocations` Module --------------------------- +.. automodule:: nltk.collections + :members: + :undoc-members: + :show-inheritance: + +nltk.collocations module +------------------------ .. automodule:: nltk.collocations - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`data` Module +nltk.compat module ------------------ +.. automodule:: nltk.compat + :members: + :undoc-members: + :show-inheritance: + +nltk.data module +---------------- + .. automodule:: nltk.data - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`downloader` Module ------------------------- +nltk.decorators module +---------------------- + +.. automodule:: nltk.decorators + :members: + :undoc-members: + :show-inheritance: + +nltk.downloader module +---------------------- .. automodule:: nltk.downloader - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`featstruct` Module ------------------------- +nltk.featstruct module +---------------------- .. automodule:: nltk.featstruct - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`grammar` Module ---------------------- +nltk.grammar module +------------------- .. automodule:: nltk.grammar - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`help` Module ------------------- +nltk.help module +---------------- .. automodule:: nltk.help - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: + +nltk.internals module +--------------------- + +.. 
automodule:: nltk.internals + :members: + :undoc-members: + :show-inheritance: + +nltk.jsontags module +-------------------- -:mod:`probability` Module -------------------------- +.. automodule:: nltk.jsontags + :members: + :undoc-members: + :show-inheritance: + +nltk.lazyimport module +---------------------- + +.. automodule:: nltk.lazyimport + :members: + :undoc-members: + :show-inheritance: + +nltk.probability module +----------------------- .. automodule:: nltk.probability - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`text` Module ------------------- +nltk.text module +---------------- .. automodule:: nltk.text - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`toolbox` Module ---------------------- +nltk.tgrep module +----------------- -.. automodule:: nltk.toolbox - :members: - :undoc-members: - :show-inheritance: +.. automodule:: nltk.tgrep + :members: + :undoc-members: + :show-inheritance: -:mod:`translate` Module ------------------------ +nltk.toolbox module +------------------- -.. automodule:: nltk.translate - :members: - :undoc-members: - :show-inheritance: +.. automodule:: nltk.toolbox + :members: + :undoc-members: + :show-inheritance: -:mod:`tree` Module ------------------- +nltk.tree module +---------------- .. automodule:: nltk.tree - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`treetransforms` Module ----------------------------- +nltk.treeprettyprinter module +----------------------------- + +.. automodule:: nltk.treeprettyprinter + :members: + :undoc-members: + :show-inheritance: + +nltk.treetransforms module +-------------------------- .. automodule:: nltk.treetransforms - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`util` Module ------------------- +nltk.util module +---------------- .. automodule:: nltk.util - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: -:mod:`wsd` Module ------------------- +nltk.wsd module +--------------- .. automodule:: nltk.wsd - :members: - :undoc-members: - :show-inheritance: + :members: + :undoc-members: + :show-inheritance: +Module contents +--------------- -Subpackages ------------ - -.. toctree:: - - nltk.app - nltk.ccg - nltk.chat - nltk.chunk - nltk.classify - nltk.cluster - nltk.corpus - nltk.draw - nltk.inference - nltk.metrics - nltk.misc - nltk.parse - nltk.sem - nltk.stem - nltk.tag - nltk.test - nltk.tokenize +.. automodule:: nltk + :members: + :undoc-members: + :show-inheritance: diff --git a/web/conf.py b/web/conf.py index 93ea26071f..922c28a4b9 100644 --- a/web/conf.py +++ b/web/conf.py @@ -29,11 +29,16 @@ # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ "sphinx.ext.autodoc", + "sphinxcontrib.apidoc", "sphinx.ext.coverage", "sphinx.ext.imgmath", "sphinx.ext.viewcode", ] +apidoc_module_dir = '../nltk' +apidoc_output_dir = 'api' +apidoc_separate_modules = False + # Add any paths that contain templates here, relative to this directory. 
templates_path = ["_templates"] From 4937013af01da20837e17282eee13e1ba64f2f76 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 27 Sep 2021 14:53:15 +0200 Subject: [PATCH 25/36] Deprecate `nltk.usage(obj)` in favor of `help(obj)` (#2810) * Deprecated nltk.usage(obj); help(obj) is preferred * Removed deprecated nltk.usage() from tests * Use existing @deprecated decorator --- nltk/test/classify.doctest | 16 +++++------ nltk/test/unit/test_util.py | 54 +------------------------------------ nltk/util.py | 3 ++- 3 files changed, 10 insertions(+), 63 deletions(-) diff --git a/nltk/test/classify.doctest b/nltk/test/classify.doctest index bfdf17b1ef..3e1319d312 100644 --- a/nltk/test/classify.doctest +++ b/nltk/test/classify.doctest @@ -11,16 +11,13 @@ Classifiers label tokens with category labels (or *class labels*). Typically, labels are represented with strings (such as ``"health"`` or ``"sports"``. In NLTK, classifiers are defined using classes that -implement the `ClassifyI` interface: +implement the `ClassifierI` interface, which supports the following operations: - >>> import nltk - >>> nltk.usage(nltk.classify.ClassifierI) - ClassifierI supports the following operations: - - self.classify(featureset) - - self.classify_many(featuresets) - - self.labels() - - self.prob_classify(featureset) - - self.prob_classify_many(featuresets) +- self.classify(featureset) +- self.classify_many(featuresets) +- self.labels() +- self.prob_classify(featureset) +- self.prob_classify_many(featuresets) NLTK defines several classifier classes: @@ -42,6 +39,7 @@ We define a very simple training corpus with 3 binary features: ['a', that the correct answers can be calculated analytically (although we haven't done this yet for all tests). + >>> import nltk >>> train = [ ... (dict(a=1,b=1,c=1), 'y'), ... (dict(a=1,b=1,c=1), 'x'), diff --git a/nltk/test/unit/test_util.py b/nltk/test/unit/test_util.py index 109a96b31b..4709e843ca 100644 --- a/nltk/test/unit/test_util.py +++ b/nltk/test/unit/test_util.py @@ -1,58 +1,6 @@ import pytest -from nltk.util import everygrams, usage - - -def test_usage_with_self(capsys): - class MyClass: - def kwargs(self, a=1): - ... - - def no_args(self): - ... - - def pos_args(self, a, b): - ... - - def pos_args_and_kwargs(self, a, b, c=1): - ... - - usage(MyClass) - - captured = capsys.readouterr() - assert captured.out == ( - "MyClass supports the following operations:\n" - " - self.kwargs(a=1)\n" - " - self.no_args()\n" - " - self.pos_args(a, b)\n" - " - self.pos_args_and_kwargs(a, b, c=1)\n" - ) - - -def test_usage_with_cls(capsys): - class MyClass: - @classmethod - def clsmethod(cls): - ... - - @classmethod - def clsmethod_with_args(cls, a, b, c=1): - ... 
- - usage(MyClass) - - captured = capsys.readouterr() - assert captured.out == ( - "MyClass supports the following operations:\n" - " - cls.clsmethod()\n" - " - cls.clsmethod_with_args(a, b, c=1)\n" - ) - - -def test_usage_on_builtin(): - # just check the func passes, since - # builtins change each python version - usage(dict) +from nltk.util import everygrams @pytest.fixture diff --git a/nltk/util.py b/nltk/util.py index 383342a4e5..da843c3dc4 100644 --- a/nltk/util.py +++ b/nltk/util.py @@ -30,13 +30,14 @@ ) from nltk.collections import * -from nltk.internals import raise_unorderable_types, slice_bounds +from nltk.internals import deprecated, raise_unorderable_types, slice_bounds ###################################################################### # Short usage message ###################################################################### +@deprecated("Use help(obj) instead.") def usage(obj): str(obj) # In case it's lazy, this will load it. From 43f3e3096ade1a2564a20a66172d53bd140ac983 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 27 Sep 2021 15:09:28 +0200 Subject: [PATCH 26/36] Blacked conf.py --- web/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/web/conf.py b/web/conf.py index 922c28a4b9..9f8db4e690 100644 --- a/web/conf.py +++ b/web/conf.py @@ -35,8 +35,8 @@ "sphinx.ext.viewcode", ] -apidoc_module_dir = '../nltk' -apidoc_output_dir = 'api' +apidoc_module_dir = "../nltk" +apidoc_output_dir = "api" apidoc_separate_modules = False # Add any paths that contain templates here, relative to this directory. From d41e405daec3b6732c9bea0354e8f0d122b6f8b8 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 27 Sep 2021 20:18:01 +0200 Subject: [PATCH 27/36] wn.NOUN -> 'n' (#2823) --- nltk/stem/wordnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/stem/wordnet.py b/nltk/stem/wordnet.py index 3ac3424247..1ec43bcd1c 100644 --- a/nltk/stem/wordnet.py +++ b/nltk/stem/wordnet.py @@ -30,7 +30,7 @@ class WordNetLemmatizer: hardrock """ - def lemmatize(self, word: str, pos: str = wn.NOUN) -> str: + def lemmatize(self, word: str, pos: str = "n") -> str: """Lemmatize `word` using WordNet's built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet. From a31de6c687165afe64a490053c73c8e0b20ff6db Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 27 Sep 2021 23:48:41 +0200 Subject: [PATCH 28/36] Resolved undefined variable bug in raising Exception for RegexpTagger (#2821) --- nltk/tag/sequential.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/nltk/tag/sequential.py b/nltk/tag/sequential.py index 103fdc10ba..9a58c2f9d4 100644 --- a/nltk/tag/sequential.py +++ b/nltk/tag/sequential.py @@ -20,6 +20,7 @@ class for all the taggers in this module. 
Tagging of individual words import ast import re from abc import abstractmethod +from typing import List, Optional, Tuple from nltk import jsontags from nltk.classify import NaiveBayesClassifier @@ -533,21 +534,18 @@ class RegexpTagger(SequentialBackoffTagger): json_tag = "nltk.tag.sequential.RegexpTagger" - def __init__(self, regexps, backoff=None): - """ """ + def __init__( + self, regexps: List[Tuple[str, str]], backoff: Optional[TaggerI] = None + ): super().__init__(backoff) - try: - self._regexps = [ - ( - re.compile(regexp), - tag, - ) - for regexp, tag in regexps - ] - except Exception as e: - raise Exception( - "Invalid RegexpTagger regexp:", str(e), "regexp:", regexp, "tag:", tag - ) from e + self._regexps = [] + for regexp, tag in regexps: + try: + self._regexps.append((re.compile(regexp), tag)) + except Exception as e: + raise Exception( + f"Invalid RegexpTagger regexp: {e}\n- regexp: {regexp!r}\n- tag: {tag!r}" + ) from e def encode_json_obj(self): return [(regexp.pattern, tag) for regexp, tag in self._regexps], self.backoff From 3502926c7e2222632d850b70bd41e6492e5172d7 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 30 Sep 2021 14:19:50 +0200 Subject: [PATCH 29/36] Change mean -> median in ReDoS docstring comment --- nltk/test/corpus.doctest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest index 560e641a69..edb79221cf 100644 --- a/nltk/test/corpus.doctest +++ b/nltk/test/corpus.doctest @@ -2182,7 +2182,7 @@ Ensure that KEYWORD from `comparative_sents.py` no longer contains a ReDoS vulne ... payload = "( " + "(" * size ... output = KEYWORD.findall(payload) ... exec_times[size_name].append(time.perf_counter() - start_t) - ... exec_times[size_name] = sorted(exec_times[size_name])[4] # Get the mean + ... exec_times[size_name] = sorted(exec_times[size_name])[4] # Get the median Ideally, the execution time of such a regular expression is linear in the length of the input. As such, we would expect exec_times["long"] From dacda6f50057e1de560483f5e432b5ccb0dd5876 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Thu, 30 Sep 2021 14:48:26 +0200 Subject: [PATCH 30/36] Change mean -> median in ReDoS docstring comment (#2831) Thank you @PeterJCLaw --- nltk/test/corpus.doctest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest index 560e641a69..edb79221cf 100644 --- a/nltk/test/corpus.doctest +++ b/nltk/test/corpus.doctest @@ -2182,7 +2182,7 @@ Ensure that KEYWORD from `comparative_sents.py` no longer contains a ReDoS vulne ... payload = "( " + "(" * size ... output = KEYWORD.findall(payload) ... exec_times[size_name].append(time.perf_counter() - start_t) - ... exec_times[size_name] = sorted(exec_times[size_name])[4] # Get the mean + ... exec_times[size_name] = sorted(exec_times[size_name])[4] # Get the median Ideally, the execution time of such a regular expression is linear in the length of the input. 
As such, we would expect exec_times["long"] From 2d07c4fe9bc50aa0f0e63fea77cce900895c91d4 Mon Sep 17 00:00:00 2001 From: mohaned mashaly <30902228+12mohaned@users.noreply.github.com> Date: Fri, 1 Oct 2021 03:28:29 +0200 Subject: [PATCH 31/36] fix: correct minor typo in word_tokenize (#2828) --- nltk/tokenize/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/tokenize/__init__.py b/nltk/tokenize/__init__.py index 62c5886d34..c8dd45b319 100644 --- a/nltk/tokenize/__init__.py +++ b/nltk/tokenize/__init__.py @@ -123,7 +123,7 @@ def word_tokenize(text, language="english", preserve_line=False): :type text: str :param language: the model name in the Punkt corpus :type language: str - :param preserve_line: An option to keep the preserve the sentence and not sentence tokenize it. + :param preserve_line: A flag to decide whether to sentence tokenize the text or not. :type preserve_line: bool """ sentences = [text] if preserve_line else sent_tokenize(text, language) From 317c5f88ef66b039dc143a6ad78f2a61f362f03a Mon Sep 17 00:00:00 2001 From: Steven Bird Date: Fri, 1 Oct 2021 11:22:49 +0930 Subject: [PATCH 32/36] updates for 3.6.4 --- ChangeLog | 14 ++++++++++++++ nltk/VERSION | 2 +- web/conf.py | 4 ++-- web/news.rst | 5 +++++ 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index c4a39bc542..3615a2b8fa 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,17 @@ +Version 3.6.4 2021-10-01 + +* deprecate `nltk.usage(obj)` in favor of `help(obj)` +* resolve ReDoS vulnerability in Corpus Reader +* solidify performance tests +* improve phone number recognition in tweet tokenizer +* refactored CISTEM stemmer for German +* identify NLTK Team as the author +* replace travis badge with github actions badge +* add SECURITY.md + +Thanks to the following contributors to 3.6.4 +Tom Aarsen, Mohaned Mashaly, Dimitri Papadopoulos Orfanos, purificant, Danny Sepler + Version 3.6.3 2021-09-19 * Dropped support for Python 3.5 * Run CI tests on Windows, too diff --git a/nltk/VERSION b/nltk/VERSION index 4a788a01da..0f44168a4d 100644 --- a/nltk/VERSION +++ b/nltk/VERSION @@ -1 +1 @@ -3.6.3 +3.6.4 diff --git a/web/conf.py b/web/conf.py index 609979837c..98e9d86f31 100644 --- a/web/conf.py +++ b/web/conf.py @@ -60,9 +60,9 @@ # built documents. # # The short X.Y version. -version = "3.6.3" +version = "3.6.4" # The full version, including alpha/beta/rc tags. -release = "3.6.3" +release = "3.6.4" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/web/news.rst b/web/news.rst index 00734ba0b6..f44f073177 100644 --- a/web/news.rst +++ b/web/news.rst @@ -4,6 +4,11 @@ NLTK News 2021 ---- +NLTK 3.6.4 release: October 2021 + improved phone number recognition in tweet tokenizer + resolved ReDoS vulnerability in Corpus Reader + refactored CISTEM stemmer for German + NLTK 3.6.3 release: September 2021 Drop support for Python 3.5, added pre-commit hooks (isort, pyupgrade, black), From 6428c9288a86658cb3d9a1e91816c4bcf162a6f0 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 1 Oct 2021 09:09:38 +0200 Subject: [PATCH 33/36] Avoiding the use of re.Pattern and regex.Pattern This fails for Python 3.6 and 3.7. 
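For context: a bare annotation such as `-> regex.Pattern` is evaluated at function-definition time, so merely importing the module raises `AttributeError` wherever the installed `regex` build does not expose `Pattern` (and `re.Pattern` itself only exists from Python 3.8). Quoting the annotation turns it into a string that is inert at runtime but still resolved by type checkers. A minimal sketch of the idiom this commit adopts (illustrative only, not part of the patch):

    >>> import regex
    >>> def word_re() -> "regex.Pattern":  # string annotation: never evaluated at def-time
    ...     return regex.compile(r"\w+")
    >>> bool(word_re().fullmatch("tweet"))
    True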
--- nltk/tokenize/casual.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nltk/tokenize/casual.py b/nltk/tokenize/casual.py index f7b256713e..11c0f11aa8 100644 --- a/nltk/tokenize/casual.py +++ b/nltk/tokenize/casual.py @@ -354,7 +354,7 @@ def tokenize(self, text: str) -> List[str]: return words @property - def WORD_RE(self) -> regex.Pattern: + def WORD_RE(self) -> "regex.Pattern": """Core TweetTokenizer regex""" # Compiles the regex for this and all future instantiations of TweetTokenizer. if not type(self)._WORD_RE: @@ -365,7 +365,7 @@ def WORD_RE(self) -> regex.Pattern: return type(self)._WORD_RE @property - def PHONE_WORD_RE(self) -> regex.Pattern: + def PHONE_WORD_RE(self) -> "regex.Pattern": """Secondary core TweetTokenizer regex""" # Compiles the regex for this and all future instantiations of TweetTokenizer. if not type(self)._PHONE_WORD_RE: From 7ed503d98926c00e26baec7e05895d06286b23d4 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Fri, 1 Oct 2021 11:23:15 +0200 Subject: [PATCH 34/36] Bump minimum regex version to allow usage of 'regex.Pattern' typing (#2834) --- pip-req.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pip-req.txt b/pip-req.txt index 63655dffc9..461e26d656 100644 --- a/pip-req.txt +++ b/pip-req.txt @@ -9,7 +9,7 @@ python-crfsuite>=0.8.2 gensim>=0.11.1,<4.0.0 pyparsing>=2.0.3 twython>=3.2.0 -regex>=2019.08.19 +regex>=2021.8.3 click>=7.1.2 joblib>=1.0.1 tqdm>=4.59.0 From 2a1d794999fb1d80143dd028903dca789e66f519 Mon Sep 17 00:00:00 2001 From: purificant Date: Fri, 1 Oct 2021 22:17:38 +0100 Subject: [PATCH 35/36] specify minimum regex version that supports regex.Pattern (#2835) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 061ae8899e..eb32f49f9d 100644 --- a/setup.py +++ b/setup.py @@ -115,7 +115,7 @@ install_requires=[ "click", "joblib", - "regex", + "regex>=2021.8.3", "tqdm", ], extras_require=extras_require, From 3ffed20f5afd2412c799ddd03376b071db882ed5 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 4 Oct 2021 23:55:50 +0200 Subject: [PATCH 36/36] Skip ReDoS test - performance testing isn't viable with cloud computing (#2841) --- nltk/test/corpus.doctest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest index edb79221cf..ef74077433 100644 --- a/nltk/test/corpus.doctest +++ b/nltk/test/corpus.doctest @@ -2192,5 +2192,5 @@ For now, we accept values below 30 (times as long), due to the potential for variance. This ensures that the ReDoS has certainly been reduced, if not removed. - >>> exec_times["long"] / exec_times["short"] < 30 + >>> exec_times["long"] / exec_times["short"] < 30 # doctest: +SKIP True
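Since the timing doctest above is now skipped in CI, the ReDoS fix can still be verified locally with a standalone sketch along the same lines as the doctest (the payload, run count, and linear-scaling expectation mirror the doctest; absolute timings and the exact ratio will vary by machine):

    import time

    from nltk.corpus.reader.comparative_sents import KEYWORD

    def median_runtime(size: int, runs: int = 9) -> float:
        """Median wall-clock time for KEYWORD.findall on a worst-case payload."""
        times = []
        for _ in range(runs):
            start = time.perf_counter()
            # "( " followed by many "(" was the pathological input for the old regex
            KEYWORD.findall("( " + "(" * size)
            times.append(time.perf_counter() - start)
        return sorted(times)[runs // 2]  # median of an odd number of runs

    # A linear-time regex should give a ratio near 10 for a 10x larger input;
    # the vulnerable pattern took roughly 80x on this pair of sizes.
    print(median_runtime(40_000) / median_runtime(4_000))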