/
test_bleu.py
281 lines (245 loc) · 12.4 KB
/
test_bleu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
"""
Tests for BLEU translation evaluation metric
"""
import io
import unittest
from nltk.data import find
from nltk.translate.bleu_score import (
SmoothingFunction,
brevity_penalty,
closest_ref_length,
corpus_bleu,
modified_precision,
sentence_bleu,
)
class TestBLEU(unittest.TestCase):
    """Checks for modified n-gram precision, brevity penalty and sentence BLEU."""

    def test_modified_precision(self):
        """
        Examples from the original BLEU paper
        https://www.aclweb.org/anthology/P02-1040.pdf
        """
        # Example 1: the "the*" example.
        # Reference sentences.
        ref1 = "the cat is on the mat".split()
        ref2 = "there is a cat on the mat".split()
        # Hypothesis sentence(s).
        hyp1 = "the the the the the the the".split()
        references = [ref1, ref2]

        # Testing modified unigram precision, both by rounding and with
        # assertAlmostEqual at 4 place precision.
        hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
        self.assertEqual(round(hyp1_unigram_precision, 4), 0.2857)
        self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4)

        # Testing modified bigram precision.
        self.assertEqual(float(modified_precision(references, hyp1, n=2)), 0.0)

        # Example 2: the "of the" example.
        # Reference sentences
        ref1 = (
            "It is a guide to action that ensures that the military "
            "will forever heed Party commands"
        ).split()
        ref2 = (
            "It is the guiding principle which guarantees the military "
            "forces always being under the command of the Party"
        ).split()
        ref3 = (
            "It is the practical guide for the army always to heed "
            "the directions of the party"
        ).split()
        # Hypothesis sentence(s).
        hyp1 = "of the".split()
        references = [ref1, ref2, ref3]

        # Testing modified unigram precision.
        self.assertEqual(float(modified_precision(references, hyp1, n=1)), 1.0)
        # Testing modified bigram precision.
        self.assertEqual(float(modified_precision(references, hyp1, n=2)), 1.0)

        # Example 3: Proper MT outputs (same three references as Example 2).
        hyp1 = (
            "It is a guide to action which ensures that the military "
            "always obeys the commands of the party"
        ).split()
        hyp2 = (
            "It is to insure the troops forever hearing the activity "
            "guidebook that party direct"
        ).split()
        references = [ref1, ref2, ref3]

        # Unigram precision.
        hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
        hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1))
        # Test unigram precision with assertAlmostEqual at 4 place precision.
        self.assertAlmostEqual(hyp1_unigram_precision, 0.94444444, places=4)
        self.assertAlmostEqual(hyp2_unigram_precision, 0.57142857, places=4)
        # Test unigram precision with rounding.
        self.assertEqual(round(hyp1_unigram_precision, 4), 0.9444)
        self.assertEqual(round(hyp2_unigram_precision, 4), 0.5714)

        # Bigram precision
        hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2))
        hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2))
        # Test bigram precision with assertAlmostEqual at 4 place precision.
        self.assertAlmostEqual(hyp1_bigram_precision, 0.58823529, places=4)
        self.assertAlmostEqual(hyp2_bigram_precision, 0.07692307, places=4)
        # Test bigram precision with rounding.
        self.assertEqual(round(hyp1_bigram_precision, 4), 0.5882)
        self.assertEqual(round(hyp2_bigram_precision, 4), 0.0769)

    def test_brevity_penalty(self):
        """Brevity penalty computed against the closest reference length."""
        # Test case from brevity_penalty_closest function in mteval-v13a.pl.
        # Same test cases as in the doctest in nltk.translate.bleu_score.py
        references = [["a"] * 11, ["a"] * 8]
        hypothesis = ["a"] * 7
        hyp_len = len(hypothesis)
        closest_ref_len = closest_ref_length(references, hyp_len)
        self.assertAlmostEqual(
            brevity_penalty(closest_ref_len, hyp_len), 0.8669, places=4
        )

        # One of the references now has exactly the hypothesis length, and the
        # expected penalty is exactly 1.0.
        references = [["a"] * 11, ["a"] * 8, ["a"] * 6, ["a"] * 7]
        hypothesis = ["a"] * 7
        hyp_len = len(hypothesis)
        closest_ref_len = closest_ref_length(references, hyp_len)
        self.assertEqual(brevity_penalty(closest_ref_len, hyp_len), 1.0)

    def test_zero_matches(self):
        """A hypothesis sharing no token with the reference scores 0."""
        # Test case where there's 0 matches
        references = ["The candidate has no alignment to any of the references".split()]
        hypothesis = "John loves Mary".split()

        # Test BLEU to nth order of n-grams, where n is len(hypothesis).
        # The upper bound is inclusive so the highest order is exercised too.
        for n in range(1, len(hypothesis) + 1):
            weights = [1.0 / n] * n  # Uniform weights.
            self.assertEqual(sentence_bleu(references, hypothesis, weights), 0)

    def test_full_matches(self):
        """A hypothesis identical to the reference scores exactly 1.0."""
        # Test case where there's 100% matches
        references = ["John loves Mary".split()]
        hypothesis = "John loves Mary".split()

        # Test BLEU to nth order of n-grams, where n is len(hypothesis).
        # The upper bound is inclusive so the highest order is exercised too.
        for n in range(1, len(hypothesis) + 1):
            weights = [1.0 / n] * n  # Uniform weights.
            self.assertEqual(sentence_bleu(references, hypothesis, weights), 1.0)

    def test_partial_matches_hypothesis_longer_than_reference(self):
        """Default-weight BLEU is 0 (with a warning) when no 4-gram matches."""
        references = ["John loves Mary".split()]
        hypothesis = "John loves Mary who loves Mike".split()
        # Since no 4-grams matches were found the result should be zero
        # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
        self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
        # Checks that the warning has been raised because len(reference) < 4.
        # No AttributeError guard: it would hide real AttributeErrors raised
        # by the code under test.
        with self.assertWarns(UserWarning):
            sentence_bleu(references, hypothesis)
# @unittest.skip("Skipping fringe cases for BLEU.")
class TestBLEUFringeCases(unittest.TestCase):
    """Edge cases for sentence BLEU: short, empty and degenerate inputs."""

    def test_case_where_n_is_bigger_than_hypothesis_length(self):
        """BLEU is 0 when the n-gram order exceeds the hypothesis length."""
        # Test BLEU to nth order of n-grams, where n > len(hypothesis).
        references = ["John loves Mary ?".split()]
        hypothesis = "John loves Mary".split()
        n = len(hypothesis) + 1  # One order more than the hypothesis can hold.
        weights = [1.0 / n] * n  # Uniform weights.
        # Since no n-grams matches were found the result should be zero
        # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
        self.assertAlmostEqual(
            sentence_bleu(references, hypothesis, weights), 0.0, places=4
        )
        # Checks that the warning has been raised because len(hypothesis) < 4.
        # Note that this call uses the default weights, not `weights`.
        with self.assertWarns(UserWarning):
            sentence_bleu(references, hypothesis)

        # Test case where n > len(hypothesis) but so is n > len(reference), and
        # it's a special case where reference == hypothesis.
        references = ["John loves Mary".split()]
        hypothesis = "John loves Mary".split()
        # Since no 4-grams matches were found the result should be zero
        # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
        self.assertAlmostEqual(
            sentence_bleu(references, hypothesis, weights), 0.0, places=4
        )

    def test_empty_hypothesis(self):
        """An empty hypothesis scores 0."""
        # Test case where the hypothesis is empty.
        references = ["The candidate has no alignment to any of the references".split()]
        hypothesis = []
        self.assertEqual(sentence_bleu(references, hypothesis), 0)

    def test_length_one_hypothesis(self):
        """Smoothing method 4 on a one-token hypothesis must not crash."""
        # Test case where the hypothesis is of length 1 in Smoothing method 4.
        references = ["The candidate has no alignment to any of the references".split()]
        hypothesis = ["Foo"]
        method4 = SmoothingFunction().method4
        try:
            sentence_bleu(references, hypothesis, smoothing_function=method4)
        except ValueError:
            # A ValueError is tolerated; any other exception fails the test.
            # NOTE(review): no score is asserted here -- confirm whether a
            # concrete expected value should be checked instead.
            pass

    def test_empty_references(self):
        """An empty reference scores 0."""
        # Test case where the reference is empty.
        references = [[]]
        hypothesis = "John loves Mary".split()
        self.assertEqual(sentence_bleu(references, hypothesis), 0)

    def test_empty_references_and_hypothesis(self):
        """Empty reference and empty hypothesis together score 0."""
        # Test case where both references and hypothesis are empty.
        references = [[]]
        hypothesis = []
        self.assertEqual(sentence_bleu(references, hypothesis), 0)

    def test_reference_or_hypothesis_shorter_than_fourgrams(self):
        """Inputs shorter than four tokens score 0 and raise a warning."""
        # Test case where the length of reference or hypothesis
        # is shorter than 4.
        references = ["let it go".split()]
        hypothesis = "let go it".split()
        # Checks that the value the hypothesis and reference returns is 0.0
        # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
        self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
        # Checks that the warning has been raised.
        with self.assertWarns(UserWarning):
            sentence_bleu(references, hypothesis)
class TestBLEUvsMteval13a(unittest.TestCase):
    """Compares ``corpus_bleu`` with scores stored in an mteval-13a output file."""

    def test_corpus_bleu(self):
        """Corpus BLEU for orders 1..9 stays within 0.005 of the stored scores.

        The comparison is made twice: without smoothing and with
        ``SmoothingFunction().method3``.
        """
        ref_file = find("models/wmt15_eval/ref.ru")
        hyp_file = find("models/wmt15_eval/google.ru")
        mteval_output_file = find("models/wmt15_eval/mteval-13a.output")

        # Reads the BLEU scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the ngrams.
        with open(mteval_output_file) as mteval_fin:
            # The numbers are located in the second-to-last line of the file.
            # The first and last fields of that line are dropped (presumably
            # the score label and the system name -- confirm against the file).
            score_fields = mteval_fin.readlines()[-2].split()[1:-1]
        # Materialised as a list because the scores are iterated twice below;
        # a lazy `map` object would be exhausted after the first loop and the
        # smoothed comparison would silently check nothing.
        mteval_bleu_scores = [float(field) for field in score_fields]

        with open(ref_file, encoding="utf8") as ref_fin, open(
            hyp_file, encoding="utf8"
        ) as hyp_fin:
            # Whitespace tokenize the file.
            # Note: split() automatically strip().
            hypothesis = [line.split() for line in hyp_fin]
            # Note that the corpus_bleu input is list of list of references.
            references = [[line.split()] for line in ref_fin]

        # Without smoothing.
        for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
            nltk_bleu = corpus_bleu(references, hypothesis, weights=(1.0 / i,) * i)
            # Check that the BLEU scores difference is less than 0.005 .
            # Note: This is an approximate comparison; as much as
            #       +/- 0.01 BLEU might be "statistically significant",
            #       the actual translation quality might not be.
            self.assertLess(abs(mteval_bleu - nltk_bleu), 0.005)

        # With the same smoothing method used in mteval-v13a.pl
        chencherry = SmoothingFunction()
        for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
            nltk_bleu = corpus_bleu(
                references,
                hypothesis,
                weights=(1.0 / i,) * i,
                smoothing_function=chencherry.method3,
            )
            self.assertLess(abs(mteval_bleu - nltk_bleu), 0.005)
class TestBLEUWithBadSentence(unittest.TestCase):
    """Corpus BLEU on a garbled hypothesis that barely overlaps its reference."""

    def test_corpus_bleu_with_bad_sentence(self):
        """``corpus_bleu`` warns and returns approximately 0 for a junk hypothesis."""
        hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
        # NOTE: the first two fragments join without a space, producing the
        # token ".Likewise"; the literal is kept unchanged.
        ref = (
            "Their tasks include changing a pump on the faulty stokehold ."
            "Likewise , two species that are very similar in morphology "
            "were distinguished using genetics ."
        )
        references = [[ref.split()]]
        hypotheses = [hyp.split()]

        # The only token the hypothesis shares with the reference is ",",
        # so no 2-gram matches: a UserWarning must be raised and the score
        # must be (approximately) zero.
        with self.assertWarns(UserWarning):
            self.assertAlmostEqual(
                corpus_bleu(references, hypotheses), 0.0, places=4
            )