Skip to content

Commit

Permalink
Added multi Bleu functionality and tests (#2793)
Browse files Browse the repository at this point in the history
* Added multi Bleu functionality and tests

* added supports for weights

* Code changes for weights and doc

Co-authored-by: Tom Aarsen <>
  • Loading branch information
BatMrE committed Nov 20, 2021
1 parent e629d7e commit 3c2a5a6
Show file tree
Hide file tree
Showing 3 changed files with 183 additions and 22 deletions.
2 changes: 1 addition & 1 deletion nltk/test/bleu.doctest
Expand Up @@ -9,7 +9,7 @@ If the candidate has no alignment to any of the references, the BLEU score is 0.
>>> bleu(
... ['The candidate has no alignment to any of the references'.split()],
... 'John loves Mary'.split(),
... [1],
... (1,),
... )
0

Expand Down
130 changes: 127 additions & 3 deletions nltk/test/unit/translate/test_bleu.py
Expand Up @@ -120,7 +120,7 @@ def test_zero_matches(self):

# Test BLEU to nth order of n-grams, where n is len(hypothesis).
for n in range(1, len(hypothesis)):
weights = [1.0 / n] * n # Uniform weights.
weights = (1.0 / n,) * n # Uniform weights.
assert sentence_bleu(references, hypothesis, weights) == 0

def test_full_matches(self):
Expand All @@ -130,7 +130,7 @@ def test_full_matches(self):

# Test BLEU to nth order of n-grams, where n is len(hypothesis).
for n in range(1, len(hypothesis)):
weights = [1.0 / n] * n # Uniform weights.
weights = (1.0 / n,) * n # Uniform weights.
assert sentence_bleu(references, hypothesis, weights) == 1.0

def test_partial_matches_hypothesis_longer_than_reference(self):
Expand All @@ -153,7 +153,7 @@ def test_case_where_n_is_bigger_than_hypothesis_length(self):
references = ["John loves Mary ?".split()]
hypothesis = "John loves Mary".split()
n = len(hypothesis) + 1 #
weights = [1.0 / n] * n # Uniform weights.
weights = (1.0 / n,) * n # Uniform weights.
# Since no n-grams matches were found the result should be zero
# exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
self.assertAlmostEqual(
Expand Down Expand Up @@ -279,3 +279,127 @@ def test_corpus_bleu_with_bad_sentence(self):
)
except AttributeError: # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.0, places=4)


class TestBLEUWithMultipleWeights(unittest.TestCase):
    def test_corpus_bleu_with_multiple_weights(self):
        """When ``weights`` is a list of weight tuples, ``corpus_bleu``
        returns one BLEU score per tuple, and each score equals the
        result of calling ``corpus_bleu`` with that tuple alone.
        """
        # Hypotheses and references from the classic Papineni et al. example,
        # tokenized via str.split for brevity.
        hyp1 = (
            "It is a guide to action which ensures that the military "
            "always obeys the commands of the party"
        ).split()
        ref1a = (
            "It is a guide to action that ensures that the military "
            "will forever heed Party commands"
        ).split()
        ref1b = (
            "It is the guiding principle which guarantees the military "
            "forces always being under the command of the Party"
        ).split()
        ref1c = (
            "It is the practical guide for the army always to heed "
            "the directions of the party"
        ).split()
        hyp2 = "he read the book because he was interested in world history".split()
        ref2a = "he was interested in world history because he read the book".split()

        corpus = [[ref1a, ref1b, ref1c], [ref2a]]
        hyps = [hyp1, hyp2]
        # Three different weightings: BLEU-1, uniform BLEU-4, and 5-gram only.
        all_weights = [
            (1, 0, 0, 0),
            (0.25, 0.25, 0.25, 0.25),
            (0, 0, 0, 0, 1),
        ]

        bleu_scores = corpus_bleu(
            list_of_references=corpus,
            hypotheses=hyps,
            weights=all_weights,
        )
        # Each entry of the batched result must match the single-weights call.
        for score, weight in zip(bleu_scores, all_weights):
            assert score == corpus_bleu(corpus, hyps, weight)
73 changes: 55 additions & 18 deletions nltk/translate/bleu_score.py
Expand Up @@ -81,18 +81,28 @@ def sentence_bleu(
>>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
0.3920...
Multiple BLEU scores can be computed at once, by supplying a list of weights.
E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use:
>>> weights = [
... (1./2., 1./2.),
... (1./3., 1./3., 1./3.),
... (1./4., 1./4., 1./4., 1./4.)
... ]
>>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
[0.7453..., 0.6240..., 0.5045...]
:param references: reference sentences
:type references: list(list(str))
:param hypothesis: a hypothesis sentence
:type hypothesis: list(str)
:param weights: weights for unigrams, bigrams, trigrams and so on
:type weights: list(float)
:param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
:type weights: tuple(float) / list(tuple(float))
:param smoothing_function:
:type smoothing_function: SmoothingFunction
:param auto_reweigh: Option to re-normalize the weights uniformly.
:type auto_reweigh: bool
:return: The sentence-level BLEU score.
:rtype: float
:return: The sentence-level BLEU score. Returns a list if multiple weights were supplied.
:rtype: float / list(float)
"""
return corpus_bleu(
[references], [hypothesis], weights, smoothing_function, auto_reweigh
Expand Down Expand Up @@ -146,12 +156,29 @@ def corpus_bleu(
>>> (score1 + score2) / 2 # doctest: +ELLIPSIS
0.6223...
Custom weights may be supplied to fine-tune the BLEU score further.
A tuple of float weights for unigrams, bigrams, trigrams and so on can be given.
>>> weights = (0.1, 0.3, 0.5, 0.1)
>>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
0.5818...
This particular weight gave extra value to trigrams.
Furthermore, multiple weights can be given, resulting in multiple BLEU scores.
>>> weights = [
... (0.5, 0.5),
... (0.333, 0.333, 0.334),
... (0.25, 0.25, 0.25, 0.25),
... (0.2, 0.2, 0.2, 0.2, 0.2)
... ]
>>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
[0.8242..., 0.7067..., 0.5920..., 0.4719...]
:param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
:type list_of_references: list(list(list(str)))
:param hypotheses: a list of hypothesis sentences
:type hypotheses: list(list(str))
:param weights: weights for unigrams, bigrams, trigrams and so on
:type weights: list(float)
:param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
:type weights: tuple(float) / list(tuple(float))
:param smoothing_function:
:type smoothing_function: SmoothingFunction
:param auto_reweigh: Option to re-normalize the weights uniformly.
Expand All @@ -169,11 +196,17 @@ def corpus_bleu(
"The number of hypotheses and their reference(s) should be the " "same "
)

try:
weights[0][0]
except TypeError:
weights = [weights]
max_weight_length = max(len(weight) for weight in weights)

# Iterate through each hypothesis and their corresponding references.
for references, hypothesis in zip(list_of_references, hypotheses):
# For each order of ngram, calculate the numerator and
# denominator for the corpus-level modified precision.
for i, _ in enumerate(weights, start=1):
for i in range(1, max_weight_length + 1):
p_i = modified_precision(references, hypothesis, i)
p_numerators[i] += p_i.numerator
p_denominators[i] += p_i.denominator
Expand All @@ -187,23 +220,17 @@ def corpus_bleu(
# Calculate corpus-level brevity penalty.
bp = brevity_penalty(ref_lengths, hyp_lengths)

# Uniformly re-weighting based on maximum hypothesis lengths if largest
# order of n-grams < 4 and weights is set at default.
if auto_reweigh:
if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
weights = (1 / hyp_lengths,) * hyp_lengths

# Collects the various precision values for the different ngram orders.
p_n = [
Fraction(p_numerators[i], p_denominators[i], _normalize=False)
for i, _ in enumerate(weights, start=1)
for i in range(1, max_weight_length + 1)
]

# Returns 0 if there's no matching n-grams
# We only need to check for p_numerators[1] == 0, since if there's
# no unigrams, there won't be any higher order ngrams.
if p_numerators[1] == 0:
return 0
return 0 if len(weights) == 1 else [0] * len(weights)

# If there's no smoothing, use method0 from the SmoothingFunction class.
if not smoothing_function:
Expand All @@ -215,9 +242,19 @@ def corpus_bleu(
p_n = smoothing_function(
p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
)
s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n) if p_i > 0)
s = bp * math.exp(math.fsum(s))
return s

bleu_scores = []
for weight in weights:
# Uniformly re-weighting based on maximum hypothesis lengths if largest
# order of n-grams < 4 and weights is set at default.
if auto_reweigh:
if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25):
weight = (1 / hyp_lengths,) * hyp_lengths

s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0)
s = bp * math.exp(math.fsum(s))
bleu_scores.append(s)
return bleu_scores[0] if len(weights) == 1 else bleu_scores


def modified_precision(references, hypothesis, n):
Expand Down

0 comments on commit 3c2a5a6

Please sign in to comment.