From 4757a3b821f9c32db92f0ebc7bae5964bde3479d Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Wed, 8 Dec 2021 15:19:56 +0100 Subject: [PATCH] Resolve ReDoS opportunity by fixing incorrectly specified regex (#2906) --- nltk/parse/malt.py | 56 +++++++++++++++++++-------------------- nltk/sem/glue.py | 18 ++++++------- nltk/tag/brill.py | 2 +- nltk/tag/brill_trainer.py | 22 +++++++-------- nltk/tag/sequential.py | 8 +++--- nltk/tbl/demo.py | 20 +++++++------- 6 files changed, 63 insertions(+), 63 deletions(-) diff --git a/nltk/parse/malt.py b/nltk/parse/malt.py index c588b5cb79..42c5e2266f 100644 --- a/nltk/parse/malt.py +++ b/nltk/parse/malt.py @@ -31,34 +31,34 @@ def malt_regex_tagger(): _tagger = RegexpTagger( [ - (r'\.$', '.'), - (r'\,$', ','), - (r'\?$', '?'), # fullstop, comma, Qmark - (r'\($', '('), - (r'\)$', ')'), # round brackets - (r'\[$', '['), - (r'\]$', ']'), # square brackets - (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers - (r'(The|the|A|a|An|an)$', 'DT'), # articles - (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'), # pronouns - (r'(His|his|Her|her|Its|its)$', 'PRP$'), # possesive - (r'(my|Your|your|Yours|yours)$', 'PRP$'), # possesive - (r'(on|On|in|In|at|At|since|Since)$', 'IN'), # time prepopsitions - (r'(for|For|ago|Ago|before|Before)$', 'IN'), # time prepopsitions - (r'(till|Till|until|Until)$', 'IN'), # time prepopsitions - (r'(by|By|beside|Beside)$', 'IN'), # space prepopsitions - (r'(under|Under|below|Below)$', 'IN'), # space prepopsitions - (r'(over|Over|above|Above)$', 'IN'), # space prepopsitions - (r'(across|Across|through|Through)$', 'IN'), # space prepopsitions - (r'(into|Into|towards|Towards)$', 'IN'), # space prepopsitions - (r'(onto|Onto|from|From)$', 'IN'), # space prepopsitions - (r'.*able$', 'JJ'), # adjectives - (r'.*ness$', 'NN'), # nouns formed from adjectives - (r'.*ly$', 'RB'), # adverbs - (r'.*s$', 'NNS'), # plural nouns - (r'.*ing$', 'VBG'), # gerunds - (r'.*ed$', 
'VBD'), # past tense verbs - (r'.*', 'NN'), # nouns (default) + (r"\.$", "."), + (r"\,$", ","), + (r"\?$", "?"), # fullstop, comma, Qmark + (r"\($", "("), + (r"\)$", ")"), # round brackets + (r"\[$", "["), + (r"\]$", "]"), # square brackets + (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers + (r"(The|the|A|a|An|an)$", "DT"), # articles + (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns + (r"(His|his|Her|her|Its|its)$", "PRP$"), # possessive + (r"(my|Your|your|Yours|yours)$", "PRP$"), # possessive + (r"(on|On|in|In|at|At|since|Since)$", "IN"), # time prepositions + (r"(for|For|ago|Ago|before|Before)$", "IN"), # time prepositions + (r"(till|Till|until|Until)$", "IN"), # time prepositions + (r"(by|By|beside|Beside)$", "IN"), # space prepositions + (r"(under|Under|below|Below)$", "IN"), # space prepositions + (r"(over|Over|above|Above)$", "IN"), # space prepositions + (r"(across|Across|through|Through)$", "IN"), # space prepositions + (r"(into|Into|towards|Towards)$", "IN"), # space prepositions + (r"(onto|Onto|from|From)$", "IN"), # space prepositions + (r".*able$", "JJ"), # adjectives + (r".*ness$", "NN"), # nouns formed from adjectives + (r".*ly$", "RB"), # adverbs + (r".*s$", "NNS"), # plural nouns + (r".*ing$", "VBG"), # gerunds + (r".*ed$", "VBD"), # past tense verbs + (r".*", "NN"), # nouns (default) ] ) return _tagger.tag diff --git a/nltk/sem/glue.py b/nltk/sem/glue.py index 9fd3cab7a5..2e537c3d18 100644 --- a/nltk/sem/glue.py +++ b/nltk/sem/glue.py @@ -706,15 +706,15 @@ def get_pos_tagger(self): regexp_tagger = RegexpTagger( [ - (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers - (r'(The|the|A|a|An|an)$', 'AT'), # articles - (r'.*able$', 'JJ'), # adjectives - (r'.*ness$', 'NN'), # nouns formed from adjectives - (r'.*ly$', 'RB'), # adverbs - (r'.*s$', 'NNS'), # plural nouns - (r'.*ing$', 'VBG'), # gerunds - (r'.*ed$', 'VBD'), # past tense verbs - (r'.*', 'NN'), # nouns (default) + (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal 
numbers + (r"(The|the|A|a|An|an)$", "AT"), # articles + (r".*able$", "JJ"), # adjectives + (r".*ness$", "NN"), # nouns formed from adjectives + (r".*ly$", "RB"), # adverbs + (r".*s$", "NNS"), # plural nouns + (r".*ing$", "VBG"), # gerunds + (r".*ed$", "VBD"), # past tense verbs + (r".*", "NN"), # nouns (default) ] ) brown_train = brown.tagged_sents(categories='news') diff --git a/nltk/tag/brill.py b/nltk/tag/brill.py index b44e335df9..9b32afef6d 100644 --- a/nltk/tag/brill.py +++ b/nltk/tag/brill.py @@ -332,7 +332,7 @@ def print_train_stats(): ) print( "TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} " - "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats) + "final: {finalerrors:5d} {finalacc:.4f}".format(**train_stats) ) head = "#ID | Score (train) | #Rules | Template" print(head, "\n", "-" * len(head), sep="") diff --git a/nltk/tag/brill_trainer.py b/nltk/tag/brill_trainer.py index f518dcfd89..aee9826b03 100644 --- a/nltk/tag/brill_trainer.py +++ b/nltk/tag/brill_trainer.py @@ -94,7 +94,7 @@ def __init__( # Training def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): - """ + r""" Trains the Brill tagger on the corpus *train_sents*, producing at most *max_rules* transformations, each of which reduces the net number of errors in the corpus by at least @@ -114,7 +114,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): >>> testing_data = [untag(s) for s in gold_data] >>> backoff = RegexpTagger([ - ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers + ... (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers ... (r'(The|the|A|a|An|an)$', 'AT'), # articles ... (r'.*able$', 'JJ'), # adjectives ... (r'.*ness$', 'NN'), # nouns formed from adjectives @@ -128,7 +128,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): >>> baseline = backoff #see NOTE1 >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS - 0.2450142... + 0.2433862... 
#templates >>> Template._cleartemplates() #clear any templates created in earlier tests @@ -140,7 +140,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): >>> tagger1 = tt.train(training_data, max_rules=10) TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None) Finding initial useful rules... - Found 845 useful rules. + Found 847 useful rules. B | S F r O | Score = Fixed - Broken @@ -153,7 +153,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): 85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0] 69 69 0 0 | NN->. if Pos:NN@[-1] & Word:.@[0] 51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0] - 47 63 16 161 | NN->IN if Pos:NNS@[-1] + 47 63 16 162 | NN->IN if Pos:NNS@[-1] 33 33 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0] 26 26 0 0 | IN->. if Pos:NNS@[-1] & Word:.@[0] 24 24 0 0 | IN->, if Pos:NNS@[-1] & Word:,@[0] @@ -165,11 +165,11 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): >>> train_stats = tagger1.train_stats() >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']] - [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]] + [1776, 1270, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]] >>> tagger1.print_template_statistics(printunused=False) TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules) - TRAIN ( 2417 tokens) initial 1775 0.2656 final: 1269 0.4750 + TRAIN ( 2417 tokens) initial 1776 0.2652 final: 1270 0.4746 #ID | Score (train) | #Rules | Template -------------------------------------------- 001 | 305 0.603 | 7 0.700 | Template(Pos([-1]),Word([0])) @@ -178,7 +178,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS - 0.43996... + 0.43833... 
>>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) @@ -188,13 +188,13 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): True >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']] - [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]] + [1859, 1380, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]] # a high-accuracy tagger >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99) TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99) Finding initial useful rules... - Found 845 useful rules. + Found 847 useful rules. B | S F r O | Score = Fixed - Broken @@ -215,7 +215,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): 18 18 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0] >>> tagger2.evaluate(gold_data) # doctest: +ELLIPSIS - 0.44159544... + 0.43996743... >>> tagger2.rules()[2:4] (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')])) diff --git a/nltk/tag/sequential.py b/nltk/tag/sequential.py index 3d3a7676b6..906a8015bb 100644 --- a/nltk/tag/sequential.py +++ b/nltk/tag/sequential.py @@ -320,7 +320,7 @@ class UnigramTagger(NgramTagger): >>> test_sent = brown.sents(categories='news')[0] >>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500]) >>> for tok, tag in unigram_tagger.tag(test_sent): - ... print("(%s, %s), " % (tok, tag)) + ... 
print("({}, {}), ".format(tok, tag)) # doctest: +NORMALIZE_WHITESPACE (The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL), (Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT), (investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ), @@ -499,7 +499,7 @@ def context(self, tokens, index, history): @python_2_unicode_compatible @jsontags.register_tag class RegexpTagger(SequentialBackoffTagger): - """ + r""" Regular Expression Tagger The RegexpTagger assigns tags to tokens by comparing their @@ -511,7 +511,7 @@ class RegexpTagger(SequentialBackoffTagger): >>> from nltk.tag import RegexpTagger >>> test_sent = brown.sents(categories='news')[0] >>> regexp_tagger = RegexpTagger( - ... [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers + ... [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers ... (r'(The|the|A|a|An|an)$', 'AT'), # articles ... (r'.*able$', 'JJ'), # adjectives ... (r'.*ness$', 'NN'), # nouns formed from adjectives @@ -523,7 +523,7 @@ class RegexpTagger(SequentialBackoffTagger): ... 
]) >>> regexp_tagger <Regexp Tagger: size=9> - >>> regexp_tagger.tag(test_sent) + >>> regexp_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE [('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'), ('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'), ("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'), diff --git a/nltk/tbl/demo.py b/nltk/tbl/demo.py index 28642aea34..482d283b02 100644 --- a/nltk/tbl/demo.py +++ b/nltk/tbl/demo.py @@ -399,19 +399,19 @@ def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None): plt.savefig(learning_curve_output) -NN_CD_TAGGER = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')]) +NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), (r".*", "NN")]) REGEXP_TAGGER = RegexpTagger( [ - (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers - (r'(The|the|A|a|An|an)$', 'AT'), # articles - (r'.*able$', 'JJ'), # adjectives - (r'.*ness$', 'NN'), # nouns formed from adjectives - (r'.*ly$', 'RB'), # adverbs - (r'.*s$', 'NNS'), # plural nouns - (r'.*ing$', 'VBG'), # gerunds - (r'.*ed$', 'VBD'), # past tense verbs - (r'.*', 'NN'), # nouns (default) + (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers + (r"(The|the|A|a|An|an)$", "AT"), # articles + (r".*able$", "JJ"), # adjectives + (r".*ness$", "NN"), # nouns formed from adjectives + (r".*ly$", "RB"), # adverbs + (r".*s$", "NNS"), # plural nouns + (r".*ing$", "VBG"), # gerunds + (r".*ed$", "VBD"), # past tense verbs + (r".*", "NN"), # nouns (default) ] )