Skip to content

Commit

Permalink
Merge pull request #2 from ActiveState/cve-2021-3842
Browse files Browse the repository at this point in the history
Resolve ReDoS opportunity by fixing incorrectly specified regex (nltk#2906)
  • Loading branch information
MatthewZMD committed Dec 21, 2023
2 parents 817c76f + 4757a3b commit 41d217b
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 63 deletions.
56 changes: 28 additions & 28 deletions nltk/parse/malt.py
Expand Up @@ -31,34 +31,34 @@ def malt_regex_tagger():

_tagger = RegexpTagger(
[
(r'\.$', '.'),
(r'\,$', ','),
(r'\?$', '?'), # fullstop, comma, Qmark
(r'\($', '('),
(r'\)$', ')'), # round brackets
(r'\[$', '['),
(r'\]$', ']'), # square brackets
(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
(r'(The|the|A|a|An|an)$', 'DT'), # articles
(r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'), # pronouns
(r'(His|his|Her|her|Its|its)$', 'PRP$'), # possessive
(r'(my|Your|your|Yours|yours)$', 'PRP$'), # possessive
(r'(on|On|in|In|at|At|since|Since)$', 'IN'), # time prepositions
(r'(for|For|ago|Ago|before|Before)$', 'IN'), # time prepositions
(r'(till|Till|until|Until)$', 'IN'), # time prepositions
(r'(by|By|beside|Beside)$', 'IN'), # space prepositions
(r'(under|Under|below|Below)$', 'IN'), # space prepositions
(r'(over|Over|above|Above)$', 'IN'), # space prepositions
(r'(across|Across|through|Through)$', 'IN'), # space prepositions
(r'(into|Into|towards|Towards)$', 'IN'), # space prepositions
(r'(onto|Onto|from|From)$', 'IN'), # space prepositions
(r'.*able$', 'JJ'), # adjectives
(r'.*ness$', 'NN'), # nouns formed from adjectives
(r'.*ly$', 'RB'), # adverbs
(r'.*s$', 'NNS'), # plural nouns
(r'.*ing$', 'VBG'), # gerunds
(r'.*ed$', 'VBD'), # past tense verbs
(r'.*', 'NN'), # nouns (default)
(r"\.$", "."),
(r"\,$", ","),
(r"\?$", "?"), # fullstop, comma, Qmark
(r"\($", "("),
(r"\)$", ")"), # round brackets
(r"\[$", "["),
(r"\]$", "]"), # square brackets
(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
(r"(The|the|A|a|An|an)$", "DT"), # articles
(r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns
(r"(His|his|Her|her|Its|its)$", "PRP$"), # possessive
(r"(my|Your|your|Yours|yours)$", "PRP$"), # possessive
(r"(on|On|in|In|at|At|since|Since)$", "IN"), # time prepositions
(r"(for|For|ago|Ago|before|Before)$", "IN"), # time prepositions
(r"(till|Till|until|Until)$", "IN"), # time prepositions
(r"(by|By|beside|Beside)$", "IN"), # space prepositions
(r"(under|Under|below|Below)$", "IN"), # space prepositions
(r"(over|Over|above|Above)$", "IN"), # space prepositions
(r"(across|Across|through|Through)$", "IN"), # space prepositions
(r"(into|Into|towards|Towards)$", "IN"), # space prepositions
(r"(onto|Onto|from|From)$", "IN"), # space prepositions
(r".*able$", "JJ"), # adjectives
(r".*ness$", "NN"), # nouns formed from adjectives
(r".*ly$", "RB"), # adverbs
(r".*s$", "NNS"), # plural nouns
(r".*ing$", "VBG"), # gerunds
(r".*ed$", "VBD"), # past tense verbs
(r".*", "NN"), # nouns (default)
]
)
return _tagger.tag
Expand Down
18 changes: 9 additions & 9 deletions nltk/sem/glue.py
Expand Up @@ -706,15 +706,15 @@ def get_pos_tagger(self):

regexp_tagger = RegexpTagger(
[
(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
(r'(The|the|A|a|An|an)$', 'AT'), # articles
(r'.*able$', 'JJ'), # adjectives
(r'.*ness$', 'NN'), # nouns formed from adjectives
(r'.*ly$', 'RB'), # adverbs
(r'.*s$', 'NNS'), # plural nouns
(r'.*ing$', 'VBG'), # gerunds
(r'.*ed$', 'VBD'), # past tense verbs
(r'.*', 'NN'), # nouns (default)
(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
(r"(The|the|A|a|An|an)$", "AT"), # articles
(r".*able$", "JJ"), # adjectives
(r".*ness$", "NN"), # nouns formed from adjectives
(r".*ly$", "RB"), # adverbs
(r".*s$", "NNS"), # plural nouns
(r".*ing$", "VBG"), # gerunds
(r".*ed$", "VBD"), # past tense verbs
(r".*", "NN"), # nouns (default)
]
)
brown_train = brown.tagged_sents(categories='news')
Expand Down
2 changes: 1 addition & 1 deletion nltk/tag/brill.py
Expand Up @@ -332,7 +332,7 @@ def print_train_stats():
)
print(
"TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
"final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats)
"final: {finalerrors:5d} {finalacc:.4f}".format(**train_stats)
)
head = "#ID | Score (train) | #Rules | Template"
print(head, "\n", "-" * len(head), sep="")
Expand Down
22 changes: 11 additions & 11 deletions nltk/tag/brill_trainer.py
Expand Up @@ -94,7 +94,7 @@ def __init__(
# Training

def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
"""
r"""
Trains the Brill tagger on the corpus *train_sents*,
producing at most *max_rules* transformations, each of which
reduces the net number of errors in the corpus by at least
Expand All @@ -114,7 +114,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
>>> testing_data = [untag(s) for s in gold_data]
>>> backoff = RegexpTagger([
... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
... (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
... (r'(The|the|A|a|An|an)$', 'AT'), # articles
... (r'.*able$', 'JJ'), # adjectives
... (r'.*ness$', 'NN'), # nouns formed from adjectives
Expand All @@ -128,7 +128,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
>>> baseline = backoff #see NOTE1
>>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
0.2450142...
0.2433862...
#templates
>>> Template._cleartemplates() #clear any templates created in earlier tests
Expand All @@ -140,7 +140,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
>>> tagger1 = tt.train(training_data, max_rules=10)
TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
Finding initial useful rules...
Found 845 useful rules.
Found 847 useful rules.
<BLANKLINE>
B |
S F r O | Score = Fixed - Broken
Expand All @@ -153,7 +153,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0]
69 69 0 0 | NN->. if Pos:NN@[-1] & Word:.@[0]
51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0]
47 63 16 161 | NN->IN if Pos:NNS@[-1]
47 63 16 162 | NN->IN if Pos:NNS@[-1]
33 33 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0]
26 26 0 0 | IN->. if Pos:NNS@[-1] & Word:.@[0]
24 24 0 0 | IN->, if Pos:NNS@[-1] & Word:,@[0]
Expand All @@ -165,11 +165,11 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
>>> train_stats = tagger1.train_stats()
>>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
[1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]
[1776, 1270, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]
>>> tagger1.print_template_statistics(printunused=False)
TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules)
TRAIN ( 2417 tokens) initial 1775 0.2656 final: 1269 0.4750
TRAIN ( 2417 tokens) initial 1776 0.2652 final: 1270 0.4746
#ID | Score (train) | #Rules | Template
--------------------------------------------
001 | 305 0.603 | 7 0.700 | Template(Pos([-1]),Word([0]))
Expand All @@ -178,7 +178,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
<BLANKLINE>
>>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
0.43996...
0.43833...
>>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)
Expand All @@ -188,13 +188,13 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
True
>>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
[1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]
[1859, 1380, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]
# a high-accuracy tagger
>>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
Finding initial useful rules...
Found 845 useful rules.
Found 847 useful rules.
<BLANKLINE>
B |
S F r O | Score = Fixed - Broken
Expand All @@ -215,7 +215,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
18 18 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0]
>>> tagger2.evaluate(gold_data) # doctest: +ELLIPSIS
0.44159544...
0.43996743...
>>> tagger2.rules()[2:4]
(Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))
Expand Down
8 changes: 4 additions & 4 deletions nltk/tag/sequential.py
Expand Up @@ -320,7 +320,7 @@ class UnigramTagger(NgramTagger):
>>> test_sent = brown.sents(categories='news')[0]
>>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
>>> for tok, tag in unigram_tagger.tag(test_sent):
... print("(%s, %s), " % (tok, tag))
... print("({}, {}), ".format(tok, tag)) # doctest: +NORMALIZE_WHITESPACE
(The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL),
(Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT),
(investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ),
Expand Down Expand Up @@ -499,7 +499,7 @@ def context(self, tokens, index, history):
@python_2_unicode_compatible
@jsontags.register_tag
class RegexpTagger(SequentialBackoffTagger):
"""
r"""
Regular Expression Tagger
The RegexpTagger assigns tags to tokens by comparing their
Expand All @@ -511,7 +511,7 @@ class RegexpTagger(SequentialBackoffTagger):
>>> from nltk.tag import RegexpTagger
>>> test_sent = brown.sents(categories='news')[0]
>>> regexp_tagger = RegexpTagger(
... [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
... [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
... (r'(The|the|A|a|An|an)$', 'AT'), # articles
... (r'.*able$', 'JJ'), # adjectives
... (r'.*ness$', 'NN'), # nouns formed from adjectives
Expand All @@ -523,7 +523,7 @@ class RegexpTagger(SequentialBackoffTagger):
... ])
>>> regexp_tagger
<Regexp Tagger: size=9>
>>> regexp_tagger.tag(test_sent)
>>> regexp_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE
[('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'),
('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'),
("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'),
Expand Down
20 changes: 10 additions & 10 deletions nltk/tbl/demo.py
Expand Up @@ -399,19 +399,19 @@ def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None):
plt.savefig(learning_curve_output)


NN_CD_TAGGER = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')])
NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), (r".*", "NN")])

REGEXP_TAGGER = RegexpTagger(
[
(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
(r'(The|the|A|a|An|an)$', 'AT'), # articles
(r'.*able$', 'JJ'), # adjectives
(r'.*ness$', 'NN'), # nouns formed from adjectives
(r'.*ly$', 'RB'), # adverbs
(r'.*s$', 'NNS'), # plural nouns
(r'.*ing$', 'VBG'), # gerunds
(r'.*ed$', 'VBD'), # past tense verbs
(r'.*', 'NN'), # nouns (default)
(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
(r"(The|the|A|a|An|an)$", "AT"), # articles
(r".*able$", "JJ"), # adjectives
(r".*ness$", "NN"), # nouns formed from adjectives
(r".*ly$", "RB"), # adverbs
(r".*s$", "NNS"), # plural nouns
(r".*ing$", "VBG"), # gerunds
(r".*ed$", "VBD"), # past tense verbs
(r".*", "NN"), # nouns (default)
]
)

Expand Down

0 comments on commit 41d217b

Please sign in to comment.