#!/usr/bin/env python
import optparse
import sys
from collections import defaultdict
optparser = optparse.OptionParser()
optparser.add_option("-d", "--data", dest="train", default="data/hansards", help="Data filename prefix (default=data)")
optparser.add_option("-e", "--english", dest="english", default="e", help="Suffix of English filename (default=e)")
optparser.add_option("-f", "--french", dest="french", default="f", help="Suffix of French filename (default=f)")
optparser.add_option("-t", "--threshold", dest="threshold", default=0.5, type="float", help="Threshold for aligning with Dice's coefficient (default=0.5)")
optparser.add_option("-n", "--num_sentences", dest="num_sents", default=100000000000, type="int", help="Number of sentences to use for training and alignment")
(opts, _) = optparser.parse_args()
f_data = "%s.%s" % (opts.train, opts.french)
e_data = "%s.%s" % (opts.train, opts.english)
sys.stderr.write("Training with EM...")
# Read the parallel corpus: one [foreign words, English words] pair per sentence.
bitext = [[sentence.strip().split() for sentence in pair] for pair in zip(open(f_data), open(e_data))][:opts.num_sents]
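# Quick sanity check (hypothetical toy data, assuming one sentence per line
# in each file): bitext[0] would look like
#   [["la", "maison"], ["the", "house"]]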
f_count = defaultdict(int)
e_count = defaultdict(int)
fe_count = defaultdict(int)
theta = {}
# Initial pass over every sentence pair: count each foreign word, each English
# word, and each co-occurring (foreign, English) pair. Only the size of the
# foreign vocabulary is used below (for uniform initialization); the counts
# themselves are zeroed again at the start of each EM iteration.
for (n, (f, e)) in enumerate(bitext):
    for f_i in set(f):
        f_count[f_i] += 1
        for e_j in set(e):
            fe_count[(f_i, e_j)] += 1
    for e_j in set(e):
        e_count[e_j] += 1
    if n % 500 == 0:
        sys.stderr.write(".")
# Expectation Maximization (EM) in a nutshell:
# 1. initialize model parameters (e.g. uniformly)
# 2. assign probabilities to the missing data (the alignments)
# 3. estimate model parameters from the completed data, then repeat from 2
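# A worked E/M step on a hypothetical two-pair corpus, to make the updates
# below concrete. Corpus: ("la maison", "the house") and ("la", "the");
# uniform init over the foreign vocabulary {la, maison} gives t(e|f) = 1/2.
# Pair 1: total_s(the) = t(the|la) + t(the|maison) = 1.0, so count(the|la)
#   and count(the|maison) each gain 0.5 (likewise for "house").
# Pair 2: total_s(the) = t(the|la) = 0.5, so count(the|la) gains 0.5/0.5 = 1.0.
# M-step: t(the|la) = count(the|la)/total(la) = 1.5/2.0 = 0.75, while
#   t(house|la) = 0.5/2.0 = 0.25: one iteration already pulls "la" toward "the".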
f_vocab_size = len(f_count)
sys.stderr.write("\nForeign vocabulary size: %d\n" % f_vocab_size)
for iteration in range(6):
    # set count(e|f) and total(f) to 0 for all e, f
    for key in fe_count.keys():
        fe_count[key] = 0
    for key in f_count.keys():
        f_count[key] = 0
    for (n, (f, e)) in enumerate(bitext):
        # E-step: total_s(e) = sum of t(e|f) over the foreign words in this pair
        for e_j in set(e):
            e_count[e_j] = 0
        for e_j in set(e):
            for f_i in set(f):
                # 1. initialize model parameters uniformly on first use
                if (f_i, e_j) not in theta:
                    theta[(f_i, e_j)] = 1.0 / f_vocab_size
                e_count[e_j] += theta[(f_i, e_j)]
        # 2. collect fractional counts from the current parameters
        for e_j in set(e):
            for f_i in set(f):
                fe_count[(f_i, e_j)] += theta[(f_i, e_j)] / e_count[e_j]
                f_count[f_i] += theta[(f_i, e_j)] / e_count[e_j]
    # 3. M-step: re-estimate t(e|f) = count(e|f) / total(f); iterating over the
    # observed pairs avoids the full vocabulary cross product
    for (f_i, e_j) in fe_count.keys():
        theta[(f_i, e_j)] = fe_count[(f_i, e_j)] / f_count[f_i]
# Decoding: align each foreign word to the English word with the highest
# t(e|f) in its sentence; -1 is printed if nothing scores above zero
for (f, e) in bitext:
    for (i, f_i) in enumerate(f):
        best_prob = 0.0
        best_j = -1
        for (j, e_j) in enumerate(e):
            if theta[(f_i, e_j)] > best_prob:
                best_prob = theta[(f_i, e_j)]
                best_j = j
        sys.stdout.write("%i-%i " % (i, best_j))
    sys.stdout.write("\n")
# Referenced http://mt-class.org/jhu/assets/papers/alopez-model1-tutorial.pdf
# and pseudocode from https://www.cis.uni-muenchen.de/~fraser/readinggroup/model1.html
# initialize t(e|f) uniformly
# do until convergence
# set count(e|f) to 0 for all e,f
# set total(f) to 0 for all f
# for all sentence pairs (e_s,f_s)
# set total_s(e) = 0 for all e
# for all words e in e_s
# for all words f in f_s
# total_s(e) += t(e|f)
# for all words e in e_s
# for all words f in f_s
# count(e|f) += t(e|f) / total_s(e)
# total(f) += t(e|f) / total_s(e)
# for all f
# for all e
# t(e|f) = count(e|f) / total(f)
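#
# Example invocation (a sketch, assuming the default Hansards files
# data/hansards.f and data/hansards.e exist as the option defaults imply;
# the output filename is arbitrary):
#   python align_em -n 1000 > em.a
# Each output line holds space-separated "i-j" pairs: foreign position i
# aligned to English position j for the corresponding sentence pair.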