Skip to content

Commit

Permalink
Merge pull request #2 from ActiveState/cve-2021-3842
Browse files Browse the repository at this point in the history
Resolve ReDoS opportunity by fixing incorrectly specified regex (nltk#2906)
  • Loading branch information
MatthewZMD committed Dec 21, 2023
2 parents 817c76f + 4757a3b commit 41d217b
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 63 deletions.
56 changes: 28 additions & 28 deletions nltk/parse/malt.py
Expand Up @@ -31,34 +31,34 @@ def malt_regex_tagger():

_tagger = RegexpTagger(
[
(r'\.$', '.'),
(r'\,$', ','),
(r'\?$', '?'), # fullstop, comma, Qmark
(r'\($', '('),
(r'\)$', ')'), # round brackets
(r'\[$', '['),
(r'\]$', ']'), # square brackets
(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
(r'(The|the|A|a|An|an)$', 'DT'), # articles
(r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'), # pronouns
(r'(His|his|Her|her|Its|its)$', 'PRP$'), # possessive
(r'(my|Your|your|Yours|yours)$', 'PRP$'), # possessive
(r'(on|On|in|In|at|At|since|Since)$', 'IN'), # time prepositions
(r'(for|For|ago|Ago|before|Before)$', 'IN'), # time prepositions
(r'(till|Till|until|Until)$', 'IN'), # time prepositions
(r'(by|By|beside|Beside)$', 'IN'), # space prepositions
(r'(under|Under|below|Below)$', 'IN'), # space prepositions
(r'(over|Over|above|Above)$', 'IN'), # space prepositions
(r'(across|Across|through|Through)$', 'IN'), # space prepositions
(r'(into|Into|towards|Towards)$', 'IN'), # space prepositions
(r'(onto|Onto|from|From)$', 'IN'), # space prepositions
(r'.*able$', 'JJ'), # adjectives
(r'.*ness$', 'NN'), # nouns formed from adjectives
(r'.*ly$', 'RB'), # adverbs
(r'.*s$', 'NNS'), # plural nouns
(r'.*ing$', 'VBG'), # gerunds
(r'.*ed$', 'VBD'), # past tense verbs
(r'.*', 'NN'), # nouns (default)
(r"\.$", "."),
(r"\,$", ","),
(r"\?$", "?"), # fullstop, comma, Qmark
(r"\($", "("),
(r"\)$", ")"), # round brackets
(r"\[$", "["),
(r"\]$", "]"), # square brackets
(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
(r"(The|the|A|a|An|an)$", "DT"), # articles
(r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns
(r"(His|his|Her|her|Its|its)$", "PRP$"), # possessive
(r"(my|Your|your|Yours|yours)$", "PRP$"), # possessive
(r"(on|On|in|In|at|At|since|Since)$", "IN"), # time prepositions
(r"(for|For|ago|Ago|before|Before)$", "IN"), # time prepositions
(r"(till|Till|until|Until)$", "IN"), # time prepositions
(r"(by|By|beside|Beside)$", "IN"), # space prepositions
(r"(under|Under|below|Below)$", "IN"), # space prepositions
(r"(over|Over|above|Above)$", "IN"), # space prepositions
(r"(across|Across|through|Through)$", "IN"), # space prepositions
(r"(into|Into|towards|Towards)$", "IN"), # space prepositions
(r"(onto|Onto|from|From)$", "IN"), # space prepositions
(r".*able$", "JJ"), # adjectives
(r".*ness$", "NN"), # nouns formed from adjectives
(r".*ly$", "RB"), # adverbs
(r".*s$", "NNS"), # plural nouns
(r".*ing$", "VBG"), # gerunds
(r".*ed$", "VBD"), # past tense verbs
(r".*", "NN"), # nouns (default)
]
)
return _tagger.tag
Expand Down
18 changes: 9 additions & 9 deletions nltk/sem/glue.py
Expand Up @@ -706,15 +706,15 @@ def get_pos_tagger(self):

regexp_tagger = RegexpTagger(
[
(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
(r'(The|the|A|a|An|an)$', 'AT'), # articles
(r'.*able$', 'JJ'), # adjectives
(r'.*ness$', 'NN'), # nouns formed from adjectives
(r'.*ly$', 'RB'), # adverbs
(r'.*s$', 'NNS'), # plural nouns
(r'.*ing$', 'VBG'), # gerunds
(r'.*ed$', 'VBD'), # past tense verbs
(r'.*', 'NN'), # nouns (default)
(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
(r"(The|the|A|a|An|an)$", "AT"), # articles
(r".*able$", "JJ"), # adjectives
(r".*ness$", "NN"), # nouns formed from adjectives
(r".*ly$", "RB"), # adverbs
(r".*s$", "NNS"), # plural nouns
(r".*ing$", "VBG"), # gerunds
(r".*ed$", "VBD"), # past tense verbs
(r".*", "NN"), # nouns (default)
]
)
brown_train = brown.tagged_sents(categories='news')
Expand Down
2 changes: 1 addition & 1 deletion nltk/tag/brill.py
Expand Up @@ -332,7 +332,7 @@ def print_train_stats():
)
print(
"TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
"final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats)
"final: {finalerrors:5d} {finalacc:.4f}".format(**train_stats)
)
head = "#ID | Score (train) | #Rules | Template"
print(head, "\n", "-" * len(head), sep="")
Expand Down
22 changes: 11 additions & 11 deletions nltk/tag/brill_trainer.py
Expand Up @@ -94,7 +94,7 @@ def __init__(
# Training

def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
"""
r"""
Trains the Brill tagger on the corpus *train_sents*,
producing at most *max_rules* transformations, each of which
reduces the net number of errors in the corpus by at least
Expand All @@ -114,7 +114,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
>>> testing_data = [untag(s) for s in gold_data]
>>> backoff = RegexpTagger([
... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
... (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
... (r'(The|the|A|a|An|an)$', 'AT'), # articles
... (r'.*able$', 'JJ'), # adjectives
... (r'.*ness$', 'NN'), # nouns formed from adjectives
Expand All @@ -128,7 +128,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
>>> baseline = backoff #see NOTE1
>>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
0.2450142...
0.2433862...
#templates
>>> Template._cleartemplates() #clear any templates created in earlier tests
Expand All @@ -140,7 +140,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
>>> tagger1 = tt.train(training_data, max_rules=10)
TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
Finding initial useful rules...
Found 845 useful rules.
Found 847 useful rules.
<BLANKLINE>
B |
S F r O | Score = Fixed - Broken
Expand All @@ -153,7 +153,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0]
69 69 0 0 | NN->. if Pos:NN@[-1] & Word:.@[0]
51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0]
47 63 16 161 | NN->IN if Pos:NNS@[-1]
47 63 16 162 | NN->IN if Pos:NNS@[-1]
33 33 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0]
26 26 0 0 | IN->. if Pos:NNS@[-1] & Word:.@[0]
24 24 0 0 | IN->, if Pos:NNS@[-1] & Word:,@[0]
Expand All @@ -165,11 +165,11 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
>>> train_stats = tagger1.train_stats()
>>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
[1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]
[1776, 1270, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]
>>> tagger1.print_template_statistics(printunused=False)
TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules)
TRAIN ( 2417 tokens) initial 1775 0.2656 final: 1269 0.4750
TRAIN ( 2417 tokens) initial 1776 0.2652 final: 1270 0.4746
#ID | Score (train) | #Rules | Template
--------------------------------------------
001 | 305 0.603 | 7 0.700 | Template(Pos([-1]),Word([0]))
Expand All @@ -178,7 +178,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
<BLANKLINE>
>>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
0.43996...
0.43833...
>>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)
Expand All @@ -188,13 +188,13 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
True
>>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
[1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]
[1859, 1380, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]
# a high-accuracy tagger
>>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
Finding initial useful rules...
Found 845 useful rules.
Found 847 useful rules.
<BLANKLINE>
B |
S F r O | Score = Fixed - Broken
Expand All @@ -215,7 +215,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
18 18 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0]
>>> tagger2.evaluate(gold_data) # doctest: +ELLIPSIS
0.44159544...
0.43996743...
>>> tagger2.rules()[2:4]
(Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))
Expand Down
8 changes: 4 additions & 4 deletions nltk/tag/sequential.py
Expand Up @@ -320,7 +320,7 @@ class UnigramTagger(NgramTagger):
>>> test_sent = brown.sents(categories='news')[0]
>>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
>>> for tok, tag in unigram_tagger.tag(test_sent):
... print("(%s, %s), " % (tok, tag))
... print("({}, {}), ".format(tok, tag)) # doctest: +NORMALIZE_WHITESPACE
(The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL),
(Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT),
(investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ),
Expand Down Expand Up @@ -499,7 +499,7 @@ def context(self, tokens, index, history):
@python_2_unicode_compatible
@jsontags.register_tag
class RegexpTagger(SequentialBackoffTagger):
"""
r"""
Regular Expression Tagger
The RegexpTagger assigns tags to tokens by comparing their
Expand All @@ -511,7 +511,7 @@ class RegexpTagger(SequentialBackoffTagger):
>>> from nltk.tag import RegexpTagger
>>> test_sent = brown.sents(categories='news')[0]
>>> regexp_tagger = RegexpTagger(
... [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
... [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
... (r'(The|the|A|a|An|an)$', 'AT'), # articles
... (r'.*able$', 'JJ'), # adjectives
... (r'.*ness$', 'NN'), # nouns formed from adjectives
Expand All @@ -523,7 +523,7 @@ class RegexpTagger(SequentialBackoffTagger):
... ])
>>> regexp_tagger
<Regexp Tagger: size=9>
>>> regexp_tagger.tag(test_sent)
>>> regexp_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE
[('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'),
('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'),
("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'),
Expand Down
20 changes: 10 additions & 10 deletions nltk/tbl/demo.py
Expand Up @@ -399,19 +399,19 @@ def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None):
plt.savefig(learning_curve_output)


NN_CD_TAGGER = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')])
NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), (r".*", "NN")])

REGEXP_TAGGER = RegexpTagger(
[
(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
(r'(The|the|A|a|An|an)$', 'AT'), # articles
(r'.*able$', 'JJ'), # adjectives
(r'.*ness$', 'NN'), # nouns formed from adjectives
(r'.*ly$', 'RB'), # adverbs
(r'.*s$', 'NNS'), # plural nouns
(r'.*ing$', 'VBG'), # gerunds
(r'.*ed$', 'VBD'), # past tense verbs
(r'.*', 'NN'), # nouns (default)
(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
(r"(The|the|A|a|An|an)$", "AT"), # articles
(r".*able$", "JJ"), # adjectives
(r".*ness$", "NN"), # nouns formed from adjectives
(r".*ly$", "RB"), # adverbs
(r".*s$", "NNS"), # plural nouns
(r".*ing$", "VBG"), # gerunds
(r".*ed$", "VBD"), # past tense verbs
(r".*", "NN"), # nouns (default)
]
)

Expand Down

0 comments on commit 41d217b

Please sign in to comment.