From d96754bf6fe42a853cc81854dfdf6fe62fc76502 Mon Sep 17 00:00:00 2001 From: BatMrE <48859022+BatMrE@users.noreply.github.com> Date: Tue, 31 Aug 2021 16:13:26 +0530 Subject: [PATCH 1/3] Added multi Bleu functionality and tests --- nltk/test/bleu.doctest | 2 +- nltk/test/unit/translate/test_bleu.py | 130 +++++++++++++++++++++++++- nltk/translate/bleu_score.py | 40 +++++--- 3 files changed, 155 insertions(+), 17 deletions(-) diff --git a/nltk/test/bleu.doctest b/nltk/test/bleu.doctest index a116dcb613..4ea6b8ac9c 100644 --- a/nltk/test/bleu.doctest +++ b/nltk/test/bleu.doctest @@ -9,7 +9,7 @@ If the candidate has no alignment to any of the references, the BLEU score is 0. >>> bleu( ... ['The candidate has no alignment to any of the references'.split()], ... 'John loves Mary'.split(), -... [1], +... (1,), ... ) 0 diff --git a/nltk/test/unit/translate/test_bleu.py b/nltk/test/unit/translate/test_bleu.py index 0a1811f031..ccabcc89ed 100644 --- a/nltk/test/unit/translate/test_bleu.py +++ b/nltk/test/unit/translate/test_bleu.py @@ -120,7 +120,7 @@ def test_zero_matches(self): # Test BLEU to nth order of n-grams, where n is len(hypothesis). for n in range(1, len(hypothesis)): - weights = [1.0 / n] * n # Uniform weights. + weights = (1.0 / n,) * n # Uniform weights. assert sentence_bleu(references, hypothesis, weights) == 0 def test_full_matches(self): @@ -130,7 +130,7 @@ def test_full_matches(self): # Test BLEU to nth order of n-grams, where n is len(hypothesis). for n in range(1, len(hypothesis)): - weights = [1.0 / n] * n # Uniform weights. + weights = (1.0 / n,) * n # Uniform weights. assert sentence_bleu(references, hypothesis, weights) == 1.0 def test_partial_matches_hypothesis_longer_than_reference(self): @@ -153,7 +153,7 @@ def test_case_where_n_is_bigger_than_hypothesis_length(self): references = ["John loves Mary ?".split()] hypothesis = "John loves Mary".split() n = len(hypothesis) + 1 # - weights = [1.0 / n] * n # Uniform weights. + weights = (1.0 / n,) * n # Uniform weights. # Since no n-grams matches were found the result should be zero # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0 self.assertAlmostEqual( @@ -269,3 +269,127 @@ def test_corpus_bleu_with_bad_sentence(self): ) except AttributeError: # unittest.TestCase.assertWarns is only supported in Python >= 3.2. 
self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.0, places=4) + + +class TestBLEUWithMultipleWeights(unittest.TestCase): + def test_corpus_bleu_with_multiple_weights(self): + hyp1 = [ + "It", + "is", + "a", + "guide", + "to", + "action", + "which", + "ensures", + "that", + "the", + "military", + "always", + "obeys", + "the", + "commands", + "of", + "the", + "party", + ] + ref1a = [ + "It", + "is", + "a", + "guide", + "to", + "action", + "that", + "ensures", + "that", + "the", + "military", + "will", + "forever", + "heed", + "Party", + "commands", + ] + ref1b = [ + "It", + "is", + "the", + "guiding", + "principle", + "which", + "guarantees", + "the", + "military", + "forces", + "always", + "being", + "under", + "the", + "command", + "of", + "the", + "Party", + ] + ref1c = [ + "It", + "is", + "the", + "practical", + "guide", + "for", + "the", + "army", + "always", + "to", + "heed", + "the", + "directions", + "of", + "the", + "party", + ] + hyp2 = [ + "he", + "read", + "the", + "book", + "because", + "he", + "was", + "interested", + "in", + "world", + "history", + ] + ref2a = [ + "he", + "was", + "interested", + "in", + "world", + "history", + "because", + "he", + "read", + "the", + "book", + ] + weight_1 = (1, 0, 0, 0) + weight_2 = (0.25, 0.25, 0.25, 0.25) + weight_3 = (0, 0, 0, 0, 1) + + bleu_scores = corpus_bleu( + list_of_references=[[ref1a, ref1b, ref1c], [ref2a]], + hypotheses=[hyp1, hyp2], + weights=[weight_1, weight_2, weight_3], + ) + assert bleu_scores[0] == corpus_bleu( + [[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_1 + ) + assert bleu_scores[1] == corpus_bleu( + [[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_2 + ) + assert bleu_scores[2] == corpus_bleu( + [[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_3 + ) diff --git a/nltk/translate/bleu_score.py b/nltk/translate/bleu_score.py index 654609847d..d4bc387054 100644 --- a/nltk/translate/bleu_score.py +++ b/nltk/translate/bleu_score.py @@ -150,8 +150,8 @@ def corpus_bleu( :type list_of_references: list(list(list(str))) :param hypotheses: a list of hypothesis sentences :type hypotheses: list(list(str)) - :param weights: weights for unigrams, bigrams, trigrams and so on - :type weights: list(float) + :param weights: weights for unigrams, bigrams, trigrams and so on, (one or list of weights) + :type weights: tuple(float) or list(tuple(float)) :param smoothing_function: :type smoothing_function: SmoothingFunction :param auto_reweigh: Option to re-normalize the weights uniformly. @@ -169,11 +169,15 @@ def corpus_bleu( "The number of hypotheses and their reference(s) should be the " "same " ) + if isinstance(weights, tuple): + weights = [weights] + max_weight_length = max(len(weight) for weight in weights) + # Iterate through each hypothesis and their corresponding references. for references, hypothesis in zip(list_of_references, hypotheses): # For each order of ngram, calculate the numerator and # denominator for the corpus-level modified precision. - for i, _ in enumerate(weights, start=1): + for i in range(1, max_weight_length + 1): p_i = modified_precision(references, hypothesis, i) p_numerators[i] += p_i.numerator p_denominators[i] += p_i.denominator @@ -187,23 +191,23 @@ def corpus_bleu( # Calculate corpus-level brevity penalty. bp = brevity_penalty(ref_lengths, hyp_lengths) - # Uniformly re-weighting based on maximum hypothesis lengths if largest - # order of n-grams < 4 and weights is set at default. 
- if auto_reweigh: - if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25): - weights = (1 / hyp_lengths,) * hyp_lengths + # # Uniformly re-weighting based on maximum hypothesis lengths if largest + # # order of n-grams < 4 and weights is set at default. + # if auto_reweigh: + # if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25): + # weights = (1 / hyp_lengths,) * hyp_lengths # Collects the various precision values for the different ngram orders. p_n = [ Fraction(p_numerators[i], p_denominators[i], _normalize=False) - for i, _ in enumerate(weights, start=1) + for i in range(1, max_weight_length + 1) ] # Returns 0 if there's no matching n-grams # We only need to check for p_numerators[1] == 0, since if there's # no unigrams, there won't be any higher order ngrams. if p_numerators[1] == 0: - return 0 + return 0 if len(weights) == 1 else [0] * len(weights) # If there's no smoothing, set use method0 from SmoothinFunction class. if not smoothing_function: @@ -215,9 +219,19 @@ def corpus_bleu( p_n = smoothing_function( p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths ) - s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n)) - s = bp * math.exp(math.fsum(s)) - return s + + bleu_scores = [] + for weight in weights: + # Uniformly re-weighting based on maximum hypothesis lengths if largest + # order of n-grams < 4 and weights is set at default. + if auto_reweigh: + if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25): + weight = (1 / hyp_lengths,) * hyp_lengths + + s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n)) + s = bp * math.exp(math.fsum(s)) + bleu_scores.append(s) + return bleu_scores[0] if len(weights) == 1 else bleu_scores def modified_precision(references, hypothesis, n): From dda94de4aafc43cc3f50ff8dd695a9eee444649b Mon Sep 17 00:00:00 2001 From: BatMrE <48859022+BatMrE@users.noreply.github.com> Date: Thu, 7 Oct 2021 00:10:19 +0530 Subject: [PATCH 2/3] added supports for weights --- nltk/translate/bleu_score.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nltk/translate/bleu_score.py b/nltk/translate/bleu_score.py index d4bc387054..379efee9f0 100644 --- a/nltk/translate/bleu_score.py +++ b/nltk/translate/bleu_score.py @@ -169,7 +169,9 @@ def corpus_bleu( "The number of hypotheses and their reference(s) should be the " "same " ) - if isinstance(weights, tuple): + if weights and isinstance(weights[0], float): + weights = [weights] + elif isinstance(weights, tuple): weights = [weights] max_weight_length = max(len(weight) for weight in weights) From adf9e1d03af19fcf8f0e2e2353026d6a5f69e88e Mon Sep 17 00:00:00 2001 From: BatMrE <48859022+BatMrE@users.noreply.github.com> Date: Sat, 13 Nov 2021 21:23:56 +0530 Subject: [PATCH 3/3] Code changes for weights and doc --- nltk/translate/bleu_score.py | 50 +++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/nltk/translate/bleu_score.py b/nltk/translate/bleu_score.py index 379efee9f0..afac23093e 100644 --- a/nltk/translate/bleu_score.py +++ b/nltk/translate/bleu_score.py @@ -81,18 +81,28 @@ def sentence_bleu( >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS 0.3920... + Multiple BLEU scores can be computed at once, by supplying a list of weights. + E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use: + >>> weights = [ + ... (1./2., 1./2.), + ... (1./3., 1./3., 1./3.), + ... (1./4., 1./4., 1./4., 1./4.) + ... 
] + >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS + [0.7453..., 0.6240..., 0.5045...] + :param references: reference sentences :type references: list(list(str)) :param hypothesis: a hypothesis sentence :type hypothesis: list(str) - :param weights: weights for unigrams, bigrams, trigrams and so on - :type weights: list(float) + :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) + :type weights: tuple(float) / list(tuple(float)) :param smoothing_function: :type smoothing_function: SmoothingFunction :param auto_reweigh: Option to re-normalize the weights uniformly. :type auto_reweigh: bool - :return: The sentence-level BLEU score. - :rtype: float + :return: The sentence-level BLEU score. Returns a list if multiple weights were supplied. + :rtype: float / list(float) """ return corpus_bleu( [references], [hypothesis], weights, smoothing_function, auto_reweigh @@ -146,12 +156,28 @@ def corpus_bleu( >>> (score1 + score2) / 2 # doctest: +ELLIPSIS 0.6223... + Custom weights may be supplied to fine-tune the BLEU score further. + A tuple of float weights for unigrams, bigrams, trigrams and so on can be given. + >>> weights = (0.1, 0.3, 0.5, 0.1) + >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS + 0.5818... + This particular weight gave extra value to trigrams. + Furthermore, multiple weights can be given, resulting in multiple BLEU scores. + >>> weights = [ + ... (0.5, 0.5), + ... (0.333, 0.333, 0.334), + ... (0.25, 0.25, 0.25, 0.25), + ... (0.2, 0.2, 0.2, 0.2, 0.2) + ... ] + >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS + [0.8242..., 0.7067..., 0.5920..., 0.4719...] + :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses :type list_of_references: list(list(list(str))) :param hypotheses: a list of hypothesis sentences :type hypotheses: list(list(str)) - :param weights: weights for unigrams, bigrams, trigrams and so on, (one or list of weights) - :type weights: tuple(float) or list(tuple(float)) + :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) + :type weights: tuple(float) / list(tuple(float)) :param smoothing_function: :type smoothing_function: SmoothingFunction :param auto_reweigh: Option to re-normalize the weights uniformly. @@ -169,9 +195,9 @@ def corpus_bleu( "The number of hypotheses and their reference(s) should be the " "same " ) - if weights and isinstance(weights[0], float): - weights = [weights] - elif isinstance(weights, tuple): + try: + weights[0][0] + except TypeError: weights = [weights] max_weight_length = max(len(weight) for weight in weights) @@ -193,12 +219,6 @@ def corpus_bleu( # Calculate corpus-level brevity penalty. bp = brevity_penalty(ref_lengths, hyp_lengths) - # # Uniformly re-weighting based on maximum hypothesis lengths if largest - # # order of n-grams < 4 and weights is set at default. - # if auto_reweigh: - # if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25): - # weights = (1 / hyp_lengths,) * hyp_lengths - # Collects the various precision values for the different ngram orders. p_n = [ Fraction(p_numerators[i], p_denominators[i], _normalize=False)
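
For reviewers who want to try the new behaviour locally, the sketch below exercises the multi-weight API added by this series. It reuses the first hypothesis/reference pair from TestBLEUWithMultipleWeights and assumes the patched nltk.translate.bleu_score is importable; the printed scores are illustrative only and are not asserted here.

    from nltk.translate.bleu_score import sentence_bleu

    # Sentence pair taken from the new unit test above (hyp1 / ref1a).
    hypothesis = ("It is a guide to action which ensures that the military "
                  "always obeys the commands of the party").split()
    reference = ("It is a guide to action that ensures that the military "
                 "will forever heed Party commands").split()

    # A single weight tuple still returns a single float (BLEU-4 here),
    # so existing callers are unaffected.
    bleu4 = sentence_bleu([reference], hypothesis,
                          weights=(0.25, 0.25, 0.25, 0.25))

    # A list of weight tuples returns one score per tuple, all computed
    # from a single pass over the n-gram counts.
    weights = [
        (1, 0, 0, 0),               # BLEU-1
        (0.5, 0.5),                 # BLEU-2
        (0.25, 0.25, 0.25, 0.25),   # BLEU-4
    ]
    bleu_1_2_4 = sentence_bleu([reference], hypothesis, weights=weights)

    print(bleu4)       # a single float
    print(bleu_1_2_4)  # a list of three floats, in the same order as `weights`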