Skip to content

Commit

Permalink
Added multi Bleu functionality and tests (#2793)
Browse files Browse the repository at this point in the history
* Added multi Bleu functionality and tests

* added supports for weights

* Code changes for weights and doc

Co-authored-by: Tom Aarsen <>
  • Loading branch information
BatMrE committed Nov 20, 2021
1 parent e629d7e commit 3c2a5a6
Show file tree
Hide file tree
Showing 3 changed files with 183 additions and 22 deletions.
2 changes: 1 addition & 1 deletion nltk/test/bleu.doctest
Expand Up @@ -9,7 +9,7 @@ If the candidate has no alignment to any of the references, the BLEU score is 0.
>>> bleu(
... ['The candidate has no alignment to any of the references'.split()],
... 'John loves Mary'.split(),
... [1],
... (1,),
... )
0

Expand Down
130 changes: 127 additions & 3 deletions nltk/test/unit/translate/test_bleu.py
Expand Up @@ -120,7 +120,7 @@ def test_zero_matches(self):

# Test BLEU to nth order of n-grams, where n is len(hypothesis).
for n in range(1, len(hypothesis)):
weights = [1.0 / n] * n # Uniform weights.
weights = (1.0 / n,) * n # Uniform weights.
assert sentence_bleu(references, hypothesis, weights) == 0

def test_full_matches(self):
Expand All @@ -130,7 +130,7 @@ def test_full_matches(self):

# Test BLEU to nth order of n-grams, where n is len(hypothesis).
for n in range(1, len(hypothesis)):
weights = [1.0 / n] * n # Uniform weights.
weights = (1.0 / n,) * n # Uniform weights.
assert sentence_bleu(references, hypothesis, weights) == 1.0

def test_partial_matches_hypothesis_longer_than_reference(self):
Expand All @@ -153,7 +153,7 @@ def test_case_where_n_is_bigger_than_hypothesis_length(self):
references = ["John loves Mary ?".split()]
hypothesis = "John loves Mary".split()
n = len(hypothesis) + 1 #
weights = [1.0 / n] * n # Uniform weights.
weights = (1.0 / n,) * n # Uniform weights.
# Since no n-grams matches were found the result should be zero
# exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
self.assertAlmostEqual(
Expand Down Expand Up @@ -279,3 +279,127 @@ def test_corpus_bleu_with_bad_sentence(self):
)
except AttributeError: # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.0, places=4)


class TestBLEUWithMultipleWeights(unittest.TestCase):
    def test_corpus_bleu_with_multiple_weights(self):
        """When ``weights`` is a list of weight tuples, ``corpus_bleu``
        returns one BLEU score per tuple, and each score equals the
        result of calling ``corpus_bleu`` with that tuple alone.
        """
        # Hypotheses and references from the classic Papineni et al. example,
        # tokenized via str.split for brevity.
        hyp1 = (
            "It is a guide to action which ensures that the military "
            "always obeys the commands of the party"
        ).split()
        ref1a = (
            "It is a guide to action that ensures that the military "
            "will forever heed Party commands"
        ).split()
        ref1b = (
            "It is the guiding principle which guarantees the military "
            "forces always being under the command of the Party"
        ).split()
        ref1c = (
            "It is the practical guide for the army always to heed "
            "the directions of the party"
        ).split()
        hyp2 = "he read the book because he was interested in world history".split()
        ref2a = "he was interested in world history because he read the book".split()

        corpus = [[ref1a, ref1b, ref1c], [ref2a]]
        hyps = [hyp1, hyp2]
        # Three different weightings: BLEU-1, uniform BLEU-4, and 5-gram only.
        all_weights = [
            (1, 0, 0, 0),
            (0.25, 0.25, 0.25, 0.25),
            (0, 0, 0, 0, 1),
        ]

        bleu_scores = corpus_bleu(
            list_of_references=corpus,
            hypotheses=hyps,
            weights=all_weights,
        )
        # Each entry of the batched result must match the single-weights call.
        for score, weight in zip(bleu_scores, all_weights):
            assert score == corpus_bleu(corpus, hyps, weight)
73 changes: 55 additions & 18 deletions nltk/translate/bleu_score.py
Expand Up @@ -81,18 +81,28 @@ def sentence_bleu(
>>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
0.3920...
Multiple BLEU scores can be computed at once, by supplying a list of weights.
E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use:
>>> weights = [
... (1./2., 1./2.),
... (1./3., 1./3., 1./3.),
... (1./4., 1./4., 1./4., 1./4.)
... ]
>>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
[0.7453..., 0.6240..., 0.5045...]
:param references: reference sentences
:type references: list(list(str))
:param hypothesis: a hypothesis sentence
:type hypothesis: list(str)
:param weights: weights for unigrams, bigrams, trigrams and so on
:type weights: list(float)
:param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
:type weights: tuple(float) / list(tuple(float))
:param smoothing_function:
:type smoothing_function: SmoothingFunction
:param auto_reweigh: Option to re-normalize the weights uniformly.
:type auto_reweigh: bool
:return: The sentence-level BLEU score.
:rtype: float
:return: The sentence-level BLEU score. Returns a list if multiple weights were supplied.
:rtype: float / list(float)
"""
return corpus_bleu(
[references], [hypothesis], weights, smoothing_function, auto_reweigh
Expand Down Expand Up @@ -146,12 +156,29 @@ def corpus_bleu(
>>> (score1 + score2) / 2 # doctest: +ELLIPSIS
0.6223...
Custom weights may be supplied to fine-tune the BLEU score further.
A tuple of float weights for unigrams, bigrams, trigrams and so on can be given.
>>> weights = (0.1, 0.3, 0.5, 0.1)
>>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
0.5818...
This particular weight gave extra value to trigrams.
Furthermore, multiple weights can be given, resulting in multiple BLEU scores.
>>> weights = [
... (0.5, 0.5),
... (0.333, 0.333, 0.334),
... (0.25, 0.25, 0.25, 0.25),
... (0.2, 0.2, 0.2, 0.2, 0.2)
... ]
>>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS
[0.8242..., 0.7067..., 0.5920..., 0.4719...]
:param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
:type list_of_references: list(list(list(str)))
:param hypotheses: a list of hypothesis sentences
:type hypotheses: list(list(str))
:param weights: weights for unigrams, bigrams, trigrams and so on
:type weights: list(float)
:param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights)
:type weights: tuple(float) / list(tuple(float))
:param smoothing_function:
:type smoothing_function: SmoothingFunction
:param auto_reweigh: Option to re-normalize the weights uniformly.
Expand All @@ -169,11 +196,17 @@ def corpus_bleu(
"The number of hypotheses and their reference(s) should be the " "same "
)

try:
weights[0][0]
except TypeError:
weights = [weights]
max_weight_length = max(len(weight) for weight in weights)

# Iterate through each hypothesis and their corresponding references.
for references, hypothesis in zip(list_of_references, hypotheses):
# For each order of ngram, calculate the numerator and
# denominator for the corpus-level modified precision.
for i, _ in enumerate(weights, start=1):
for i in range(1, max_weight_length + 1):
p_i = modified_precision(references, hypothesis, i)
p_numerators[i] += p_i.numerator
p_denominators[i] += p_i.denominator
Expand All @@ -187,23 +220,17 @@ def corpus_bleu(
# Calculate corpus-level brevity penalty.
bp = brevity_penalty(ref_lengths, hyp_lengths)

# Uniformly re-weighting based on maximum hypothesis lengths if largest
# order of n-grams < 4 and weights is set at default.
if auto_reweigh:
if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
weights = (1 / hyp_lengths,) * hyp_lengths

# Collects the various precision values for the different ngram orders.
p_n = [
Fraction(p_numerators[i], p_denominators[i], _normalize=False)
for i, _ in enumerate(weights, start=1)
for i in range(1, max_weight_length + 1)
]

# Returns 0 if there's no matching n-grams
# We only need to check for p_numerators[1] == 0, since if there's
# no unigrams, there won't be any higher order ngrams.
if p_numerators[1] == 0:
return 0
return 0 if len(weights) == 1 else [0] * len(weights)

# If there's no smoothing, use method0 from the SmoothingFunction class.
if not smoothing_function:
Expand All @@ -215,9 +242,19 @@ def corpus_bleu(
p_n = smoothing_function(
p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
)
s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n) if p_i > 0)
s = bp * math.exp(math.fsum(s))
return s

bleu_scores = []
for weight in weights:
# Uniformly re-weighting based on maximum hypothesis lengths if largest
# order of n-grams < 4 and weights is set at default.
if auto_reweigh:
if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25):
weight = (1 / hyp_lengths,) * hyp_lengths

s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0)
s = bp * math.exp(math.fsum(s))
bleu_scores.append(s)
return bleu_scores[0] if len(weights) == 1 else bleu_scores


def modified_precision(references, hypothesis, n):
Expand Down

0 comments on commit 3c2a5a6

Please sign in to comment.