Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolve ReDoS opportunity by fixing incorrectly specified regex #2906

Merged
merged 1 commit into from Dec 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion nltk/parse/malt.py
Expand Up @@ -32,7 +32,7 @@ def malt_regex_tagger():
(r"\)$", ")"), # round brackets
(r"\[$", "["),
(r"\]$", "]"), # square brackets
(r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers
(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
(r"(The|the|A|a|An|an)$", "DT"), # articles
(r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns
(r"(His|his|Her|her|Its|its)$", "PRP$"), # possessive
Expand Down
2 changes: 1 addition & 1 deletion nltk/sem/glue.py
Expand Up @@ -703,7 +703,7 @@ def get_pos_tagger(self):

regexp_tagger = RegexpTagger(
[
(r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers
(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
(r"(The|the|A|a|An|an)$", "AT"), # articles
(r".*able$", "JJ"), # adjectives
(r".*ness$", "NN"), # nouns formed from adjectives
Expand Down
2 changes: 1 addition & 1 deletion nltk/tag/brill.py
Expand Up @@ -329,7 +329,7 @@ def print_train_stats():
)
print(
"TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} "
"final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats)
"final: {finalerrors:5d} {finalacc:.4f}".format(**train_stats)
)
head = "#ID | Score (train) | #Rules | Template"
print(head, "\n", "-" * len(head), sep="")
Expand Down
22 changes: 11 additions & 11 deletions nltk/tag/brill_trainer.py
Expand Up @@ -91,7 +91,7 @@ def __init__(
# Training

def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
"""
r"""
Trains the Brill tagger on the corpus *train_sents*,
producing at most *max_rules* transformations, each of which
reduces the net number of errors in the corpus by at least
Expand All @@ -111,7 +111,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
>>> testing_data = [untag(s) for s in gold_data]

>>> backoff = RegexpTagger([
... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
... (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
... (r'(The|the|A|a|An|an)$', 'AT'), # articles
... (r'.*able$', 'JJ'), # adjectives
... (r'.*ness$', 'NN'), # nouns formed from adjectives
Expand All @@ -125,7 +125,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
>>> baseline = backoff #see NOTE1

>>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
0.2450142...
0.2433862...

>>> # Set up templates
>>> Template._cleartemplates() #clear any templates created in earlier tests
Expand All @@ -137,7 +137,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
>>> tagger1 = tt.train(training_data, max_rules=10)
TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
Finding initial useful rules...
Found 845 useful rules.
Found 847 useful rules.
<BLANKLINE>
B |
S F r O | Score = Fixed - Broken
Expand All @@ -150,7 +150,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0]
69 69 0 0 | NN->. if Pos:NN@[-1] & Word:.@[0]
51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0]
47 63 16 161 | NN->IN if Pos:NNS@[-1]
47 63 16 162 | NN->IN if Pos:NNS@[-1]
33 33 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0]
26 26 0 0 | IN->. if Pos:NNS@[-1] & Word:.@[0]
24 24 0 0 | IN->, if Pos:NNS@[-1] & Word:,@[0]
Expand All @@ -162,11 +162,11 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):

>>> train_stats = tagger1.train_stats()
>>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
[1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]
[1776, 1270, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]

>>> tagger1.print_template_statistics(printunused=False)
TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules)
TRAIN ( 2417 tokens) initial 1775 0.2656 final: 1269 0.4750
TRAIN ( 2417 tokens) initial 1776 0.2652 final: 1270 0.4746
#ID | Score (train) | #Rules | Template
--------------------------------------------
001 | 305 0.603 | 7 0.700 | Template(Pos([-1]),Word([0]))
Expand All @@ -175,7 +175,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
<BLANKLINE>

>>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
0.43996...
0.43833...

>>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)

Expand All @@ -185,13 +185,13 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
True

>>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
[1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]
[1859, 1380, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]

>>> # A high-accuracy tagger
>>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
Finding initial useful rules...
Found 845 useful rules.
Found 847 useful rules.
<BLANKLINE>
B |
S F r O | Score = Fixed - Broken
Expand All @@ -212,7 +212,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
18 18 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0]

>>> tagger2.evaluate(gold_data) # doctest: +ELLIPSIS
0.44159544...
0.43996743...
>>> tagger2.rules()[2:4]
(Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))

Expand Down
8 changes: 4 additions & 4 deletions nltk/tag/sequential.py
Expand Up @@ -337,7 +337,7 @@ class UnigramTagger(NgramTagger):
>>> test_sent = brown.sents(categories='news')[0]
>>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
>>> for tok, tag in unigram_tagger.tag(test_sent):
... print("({}, {}), ".format(tok, tag))
... print("({}, {}), ".format(tok, tag)) # doctest: +NORMALIZE_WHITESPACE
(The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL),
(Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT),
(investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ),
Expand Down Expand Up @@ -491,7 +491,7 @@ def context(self, tokens, index, history):

@jsontags.register_tag
class RegexpTagger(SequentialBackoffTagger):
"""
r"""
Regular Expression Tagger

The RegexpTagger assigns tags to tokens by comparing their
Expand All @@ -503,7 +503,7 @@ class RegexpTagger(SequentialBackoffTagger):
>>> from nltk.tag import RegexpTagger
>>> test_sent = brown.sents(categories='news')[0]
>>> regexp_tagger = RegexpTagger(
... [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
... [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
... (r'(The|the|A|a|An|an)$', 'AT'), # articles
... (r'.*able$', 'JJ'), # adjectives
... (r'.*ness$', 'NN'), # nouns formed from adjectives
Expand All @@ -515,7 +515,7 @@ class RegexpTagger(SequentialBackoffTagger):
... ])
>>> regexp_tagger
<Regexp Tagger: size=9>
>>> regexp_tagger.tag(test_sent)
>>> regexp_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE
[('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'),
('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'),
("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'),
Expand Down
4 changes: 2 additions & 2 deletions nltk/tbl/demo.py
Expand Up @@ -393,11 +393,11 @@ def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None):
plt.savefig(learning_curve_output)


NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(.[0-9]+)?$", "CD"), (r".*", "NN")])
NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), (r".*", "NN")])

REGEXP_TAGGER = RegexpTagger(
[
(r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers
(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
(r"(The|the|A|a|An|an)$", "AT"), # articles
(r".*able$", "JJ"), # adjectives
(r".*ness$", "NN"), # nouns formed from adjectives
Expand Down