-
Notifications
You must be signed in to change notification settings - Fork 2.8k
/
ibm2.py
319 lines (261 loc) · 11.9 KB
/
ibm2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
# Natural Language Toolkit: IBM Model 2
#
# Copyright (C) 2001-2013 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Lexical translation model that considers word order.
IBM Model 2 improves on Model 1 by accounting for word order.
An alignment probability is introduced, a(i | j,l,m), which predicts
a source word position, given its aligned target word's position.
The EM algorithm used in Model 2 is:
:E step: In the training data, collect counts, weighted by prior
probabilities.
- (a) count how many times a source language word is translated
into a target language word
- (b) count how many times a particular position in the source
sentence is aligned to a particular position in the target
sentence
:M step: Estimate new probabilities based on the counts from the E step
Notations
---------
:i: Position in the source sentence
Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
:j: Position in the target sentence
Valid values are 1, 2, ..., length of target sentence
:l: Number of words in the source sentence, excluding NULL
:m: Number of words in the target sentence
:s: A word in the source language
:t: A word in the target language
References
----------
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.
Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""
import warnings
from collections import defaultdict
from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel1
from nltk.translate.ibm_model import Counts
class IBMModel2(IBMModel):
"""
Lexical translation model that considers word order
>>> bitext = []
>>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
>>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
>>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
>>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
>>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
>>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
>>> ibm2 = IBMModel2(bitext, 5)
>>> print(round(ibm2.translation_table['buch']['book'], 3))
1.0
>>> print(round(ibm2.translation_table['das']['book'], 3))
0.0
>>> print(round(ibm2.translation_table['buch'][None], 3))
0.0
>>> print(round(ibm2.translation_table['ja'][None], 3))
0.0
>>> print(round(ibm2.alignment_table[1][1][2][2], 3))
0.939
>>> print(round(ibm2.alignment_table[1][2][2][2], 3))
0.0
>>> print(round(ibm2.alignment_table[2][2][4][5], 3))
1.0
>>> test_sentence = bitext[2]
>>> test_sentence.words
['das', 'buch', 'ist', 'ja', 'klein']
>>> test_sentence.mots
['the', 'book', 'is', 'small']
>>> test_sentence.alignment
Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
"""
def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
"""
Train on ``sentence_aligned_corpus`` and create a lexical
translation model and an alignment model.
Translation direction is from ``AlignedSent.mots`` to
``AlignedSent.words``.
:param sentence_aligned_corpus: Sentence-aligned parallel corpus
:type sentence_aligned_corpus: list(AlignedSent)
:param iterations: Number of iterations to run training algorithm
:type iterations: int
:param probability_tables: Optional. Use this to pass in custom
probability values. If not specified, probabilities will be
set to a uniform distribution, or some other sensible value.
If specified, all the following entries must be present:
``translation_table``, ``alignment_table``.
See ``IBMModel`` for the type and purpose of these tables.
:type probability_tables: dict[str]: object
"""
super().__init__(sentence_aligned_corpus)
if probability_tables is None:
# Get translation probabilities from IBM Model 1
# Run more iterations of training for Model 1, since it is
# faster than Model 2
ibm1 = IBMModel1(sentence_aligned_corpus, 2 * iterations)
self.translation_table = ibm1.translation_table
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
self.translation_table = probability_tables["translation_table"]
self.alignment_table = probability_tables["alignment_table"]
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
self.align_all(sentence_aligned_corpus)
def set_uniform_probabilities(self, sentence_aligned_corpus):
# a(i | j,l,m) = 1 / (l+1) for all i, j, l, m
l_m_combinations = set()
for aligned_sentence in sentence_aligned_corpus:
l = len(aligned_sentence.mots)
m = len(aligned_sentence.words)
if (l, m) not in l_m_combinations:
l_m_combinations.add((l, m))
initial_prob = 1 / (l + 1)
if initial_prob < IBMModel.MIN_PROB:
warnings.warn(
"A source sentence is too long ("
+ str(l)
+ " words). Results may be less accurate."
)
for i in range(0, l + 1):
for j in range(1, m + 1):
self.alignment_table[i][j][l][m] = initial_prob
def train(self, parallel_corpus):
counts = Model2Counts()
for aligned_sentence in parallel_corpus:
src_sentence = [None] + aligned_sentence.mots
trg_sentence = ["UNUSED"] + aligned_sentence.words # 1-indexed
l = len(aligned_sentence.mots)
m = len(aligned_sentence.words)
# E step (a): Compute normalization factors to weigh counts
total_count = self.prob_all_alignments(src_sentence, trg_sentence)
# E step (b): Collect counts
for j in range(1, m + 1):
t = trg_sentence[j]
for i in range(0, l + 1):
s = src_sentence[i]
count = self.prob_alignment_point(i, j, src_sentence, trg_sentence)
normalized_count = count / total_count[t]
counts.update_lexical_translation(normalized_count, s, t)
counts.update_alignment(normalized_count, i, j, l, m)
# M step: Update probabilities with maximum likelihood estimates
self.maximize_lexical_translation_probabilities(counts)
self.maximize_alignment_probabilities(counts)
def maximize_alignment_probabilities(self, counts):
MIN_PROB = IBMModel.MIN_PROB
for i, j_s in counts.alignment.items():
for j, src_sentence_lengths in j_s.items():
for l, trg_sentence_lengths in src_sentence_lengths.items():
for m in trg_sentence_lengths:
estimate = (
counts.alignment[i][j][l][m]
/ counts.alignment_for_any_i[j][l][m]
)
self.alignment_table[i][j][l][m] = max(estimate, MIN_PROB)
def prob_all_alignments(self, src_sentence, trg_sentence):
"""
Computes the probability of all possible word alignments,
expressed as a marginal distribution over target words t
Each entry in the return value represents the contribution to
the total alignment probability by the target word t.
To obtain probability(alignment | src_sentence, trg_sentence),
simply sum the entries in the return value.
:return: Probability of t for all s in ``src_sentence``
:rtype: dict(str): float
"""
alignment_prob_for_t = defaultdict(float)
for j in range(1, len(trg_sentence)):
t = trg_sentence[j]
for i in range(0, len(src_sentence)):
alignment_prob_for_t[t] += self.prob_alignment_point(
i, j, src_sentence, trg_sentence
)
return alignment_prob_for_t
def prob_alignment_point(self, i, j, src_sentence, trg_sentence):
"""
Probability that position j in ``trg_sentence`` is aligned to
position i in the ``src_sentence``
"""
l = len(src_sentence) - 1
m = len(trg_sentence) - 1
s = src_sentence[i]
t = trg_sentence[j]
return self.translation_table[t][s] * self.alignment_table[i][j][l][m]
def prob_t_a_given_s(self, alignment_info):
"""
Probability of target sentence and an alignment given the
source sentence
"""
prob = 1.0
l = len(alignment_info.src_sentence) - 1
m = len(alignment_info.trg_sentence) - 1
for j, i in enumerate(alignment_info.alignment):
if j == 0:
continue # skip the dummy zeroeth element
trg_word = alignment_info.trg_sentence[j]
src_word = alignment_info.src_sentence[i]
prob *= (
self.translation_table[trg_word][src_word]
* self.alignment_table[i][j][l][m]
)
return max(prob, IBMModel.MIN_PROB)
def align_all(self, parallel_corpus):
for sentence_pair in parallel_corpus:
self.align(sentence_pair)
def align(self, sentence_pair):
"""
Determines the best word alignment for one sentence pair from
the corpus that the model was trained on.
The best alignment will be set in ``sentence_pair`` when the
method returns. In contrast with the internal implementation of
IBM models, the word indices in the ``Alignment`` are zero-
indexed, not one-indexed.
:param sentence_pair: A sentence in the source language and its
counterpart sentence in the target language
:type sentence_pair: AlignedSent
"""
best_alignment = []
l = len(sentence_pair.mots)
m = len(sentence_pair.words)
for j, trg_word in enumerate(sentence_pair.words):
# Initialize trg_word to align with the NULL token
best_prob = (
self.translation_table[trg_word][None]
* self.alignment_table[0][j + 1][l][m]
)
best_prob = max(best_prob, IBMModel.MIN_PROB)
best_alignment_point = None
for i, src_word in enumerate(sentence_pair.mots):
align_prob = (
self.translation_table[trg_word][src_word]
* self.alignment_table[i + 1][j + 1][l][m]
)
if align_prob >= best_prob:
best_prob = align_prob
best_alignment_point = i
best_alignment.append((j, best_alignment_point))
sentence_pair.alignment = Alignment(best_alignment)
class Model2Counts(Counts):
"""
Data object to store counts of various parameters during training.
Includes counts for alignment.
"""
def __init__(self):
super().__init__()
self.alignment = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
)
self.alignment_for_any_i = defaultdict(
lambda: defaultdict(lambda: defaultdict(float))
)
def update_lexical_translation(self, count, s, t):
self.t_given_s[t][s] += count
self.any_t_given_s[s] += count
def update_alignment(self, count, i, j, l, m):
self.alignment[i][j][l][m] += count
self.alignment_for_any_i[j][l][m] += count