1 """
2 This is an implementation of a state-emitting MarkovModel. I am using
3 terminology similar to Manning and Schutze.
4
5
6
7 Functions:
8 train_bw Train a markov model using the Baum-Welch algorithm.
9 train_visible Train a visible markov model using MLE.
10 find_states Find the a state sequence that explains some observations.
11
12 load Load a MarkovModel.
13 save Save a MarkovModel.
14
15 Classes:
16 MarkovModel Holds the description of a markov model
17 """
18
19 import numpy

try:
    logaddexp = numpy.logaddexp
except AttributeError:
    # Numpy versions older than 1.3 do not provide logaddexp; fall
    # back to a pure-Python implementation and warn about the speed.
    import warnings
    warnings.warn("For optimal speed, please update to Numpy version 1.3 "
                  "or later (current version is %s)" % numpy.__version__)

    def logaddexp(logx, logy):
        # Compute log(exp(logx) + exp(logy)) without underflow.  If
        # one term dominates, return the larger value directly.
        if logy - logx > 100:
            return logy
        elif logx - logy > 100:
            return logx
        minxy = min(logx, logy)
        return minxy + numpy.log(numpy.exp(logx-minxy) + numpy.exp(logy-minxy))
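
# For example, logaddexp(log(0.5), log(0.25)) equals log(0.75): sums of
# probabilities stay representable in log space even when the
# probabilities themselves would underflow to 0.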

def itemindex(values):
    # Map each item to its index in values.  This helper is
    # reconstructed from its call sites, which use it to convert
    # states and alphabet items into array indexes.
    d = {}
    for index, key in enumerate(values):
        d[key] = index
    return d

numpy.random.seed()

VERY_SMALL_NUMBER = 1E-300
LOG0 = numpy.log(VERY_SMALL_NUMBER)

class MarkovModel(object):
    def __init__(self, states, alphabet,
                 p_initial=None, p_transition=None, p_emission=None):
        self.states = states
        self.alphabet = alphabet
        self.p_initial = p_initial
        self.p_transition = p_transition
        self.p_emission = p_emission

def _readline_and_check_start(handle, start):
    # Read one line and make sure it begins with the expected tag.
    # This helper is reconstructed from its call sites in load.
    line = handle.readline()
    if not line.startswith(start):
        raise ValueError("I expected %r but got %r" % (start, line))
    return line

def load(handle):
    """load(handle) -> MarkovModel()"""
    # Load the states.
    line = _readline_and_check_start(handle, "STATES:")
    states = line.split()[1:]

    # Load the alphabet.
    line = _readline_and_check_start(handle, "ALPHABET:")
    alphabet = line.split()[1:]

    mm = MarkovModel(states, alphabet)
    N, M = len(states), len(alphabet)

    # Load the initial probabilities.
    mm.p_initial = numpy.zeros(N)
    line = _readline_and_check_start(handle, "INITIAL:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, " %s:" % states[i])
        mm.p_initial[i] = float(line.split()[-1])

    # Load the transition matrix.
    mm.p_transition = numpy.zeros((N, N))
    line = _readline_and_check_start(handle, "TRANSITION:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, " %s:" % states[i])
        mm.p_transition[i,:] = [float(v) for v in line.split()[1:]]

    # Load the emission matrix.
    mm.p_emission = numpy.zeros((N, M))
    line = _readline_and_check_start(handle, "EMISSION:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, " %s:" % states[i])
        mm.p_emission[i,:] = [float(v) for v in line.split()[1:]]

    return mm

def save(mm, handle):
    """save(mm, handle)"""
    w = handle.write
    w("STATES: %s\n" % ' '.join(mm.states))
    w("ALPHABET: %s\n" % ' '.join(mm.alphabet))
    w("INITIAL:\n")
    for i in range(len(mm.p_initial)):
        w(" %s: %g\n" % (mm.states[i], mm.p_initial[i]))
    w("TRANSITION:\n")
    for i in range(len(mm.p_transition)):
        x = [str(v) for v in mm.p_transition[i]]
        w(" %s: %s\n" % (mm.states[i], ' '.join(x)))
    w("EMISSION:\n")
    for i in range(len(mm.p_emission)):
        x = [str(v) for v in mm.p_emission[i]]
        w(" %s: %s\n" % (mm.states[i], ' '.join(x)))

def train_bw(states, alphabet, training_data,
             pseudo_initial=None, pseudo_transition=None,
             pseudo_emission=None, update_fn=None):
    """train_bw(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission][, update_fn]) -> MarkovModel

    Train a MarkovModel using the Baum-Welch algorithm.  states is a
    list of strings that describe the names of each state.  alphabet
    is a list of objects that indicate the allowed outputs.
    training_data is a list of observations.  Each observation is a
    list of objects from the alphabet.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    the different matrices.  They should be matrices of the
    appropriate size that contain numbers to add to each parameter
    matrix, before normalization.

    update_fn is an optional callback that takes parameters
    (iteration, log_likelihood).  It is called once per iteration.

    """
    N, M = len(states), len(alphabet)
    if not training_data:
        raise ValueError("No training data given.")
    if pseudo_initial is not None:
        pseudo_initial = numpy.asarray(pseudo_initial)
        if pseudo_initial.shape != (N,):
            raise ValueError("pseudo_initial not shape len(states)")
    if pseudo_transition is not None:
        pseudo_transition = numpy.asarray(pseudo_transition)
        if pseudo_transition.shape != (N,N):
            raise ValueError("pseudo_transition not shape "
                             "len(states) X len(states)")
    if pseudo_emission is not None:
        pseudo_emission = numpy.asarray(pseudo_emission)
        if pseudo_emission.shape != (N,M):
            raise ValueError("pseudo_emission not shape "
                             "len(states) X len(alphabet)")

    # Training data is given as lists of outputs from the alphabet.
    # Convert them into lists of indexes into the alphabet.
    training_outputs = []
    indexes = itemindex(alphabet)
    for outputs in training_data:
        training_outputs.append([indexes[x] for x in outputs])

    # Do some sanity checking on the outputs.
    lengths = [len(x) for x in training_outputs]
    if min(lengths) == 0:
        raise ValueError("I got training data with outputs of length 0")

    # Do the training with Baum-Welch.
    x = _baum_welch(N, M, training_outputs,
                    pseudo_initial=pseudo_initial,
                    pseudo_transition=pseudo_transition,
                    pseudo_emission=pseudo_emission,
                    update_fn=update_fn)
    p_initial, p_transition, p_emission = x
    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
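
# A minimal usage sketch for train_bw; the two-state weather model and
# its observation sequences are hypothetical:
#
#   states = ["rainy", "sunny"]
#   alphabet = ["walk", "shop"]
#   observations = [["walk", "shop", "walk"], ["shop", "shop", "walk"]]
#   mm = train_bw(states, alphabet, observations)
#
# Since _baum_welch starts from random parameters, repeated runs may
# converge to different local optima.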

MAX_ITERATIONS = 1000

def _baum_welch(N, M, training_outputs,
                p_initial=None, p_transition=None, p_emission=None,
                pseudo_initial=None, pseudo_transition=None,
                pseudo_emission=None, update_fn=None):
    # Start from random parameters unless the caller provided
    # starting matrices.
    if p_initial is None:
        p_initial = _random_norm(N)
    else:
        p_initial = _copy_and_check(p_initial, (N,))
    if p_transition is None:
        p_transition = _random_norm((N,N))
    else:
        p_transition = _copy_and_check(p_transition, (N,N))
    if p_emission is None:
        p_emission = _random_norm((N,M))
    else:
        p_emission = _copy_and_check(p_emission, (N,M))

    # Do all the calculations in log space to avoid underflows.
    lp_initial, lp_transition, lp_emission = map(
        numpy.log, (p_initial, p_transition, p_emission))
    if pseudo_initial is not None:
        lpseudo_initial = numpy.log(pseudo_initial)
    else:
        lpseudo_initial = None
    if pseudo_transition is not None:
        lpseudo_transition = numpy.log(pseudo_transition)
    else:
        lpseudo_transition = None
    if pseudo_emission is not None:
        lpseudo_emission = numpy.log(pseudo_emission)
    else:
        lpseudo_emission = None

    # Iterate through each round of the Baum-Welch algorithm:
    #   1. Re-estimate the parameters from the training outputs.
    #   2. Check for convergence of the log likelihood.
    #   3. If not converged, repeat.
    prev_llik = None
    for i in range(MAX_ITERATIONS):
        llik = LOG0
        for outputs in training_outputs:
            x = _baum_welch_one(
                N, M, outputs,
                lp_initial, lp_transition, lp_emission,
                lpseudo_initial, lpseudo_transition, lpseudo_emission)
            llik += x
        if update_fn is not None:
            update_fn(i, llik)
        if prev_llik is not None and numpy.fabs(prev_llik-llik) < 0.1:
            break
        prev_llik = llik
    else:
        raise RuntimeError("HMM did not converge in %d iterations"
                           % MAX_ITERATIONS)

    return map(numpy.exp, (lp_initial, lp_transition, lp_emission))

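
# A hypothetical update_fn callback for monitoring training progress:
#
#   def print_llik(iteration, log_likelihood):
#       print("iteration %d: log likelihood %f" % (iteration, log_likelihood))
#
#   mm = train_bw(states, alphabet, observations, update_fn=print_llik)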

def _baum_welch_one(N, M, outputs,
                    lp_initial, lp_transition, lp_emission,
                    lpseudo_initial, lpseudo_transition, lpseudo_emission):
    # Do one iteration of Baum-Welch based on one sequence of outputs.
    # This updates lp_initial, lp_transition, and lp_emission in place,
    # and returns the log likelihood of the outputs.
    T = len(outputs)
    fmat = _forward(N, T, lp_initial, lp_transition, lp_emission, outputs)
    bmat = _backward(N, T, lp_transition, lp_emission, outputs)

    # Calculate the log probability of traversing each arc (i -> j) at
    # each time t, conditioned on the observations.
    lp_arc = numpy.zeros((N, N, T))
    for t in range(T):
        k = outputs[t]
        lp_traverse = numpy.zeros((N, N))
        for i in range(N):
            for j in range(N):
                # P(getting to this arc) +
                # P(making this transition) +
                # P(emitting this character) +
                # P(going to the end), all in log space.
                lp = fmat[i][t] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k] + \
                     bmat[j][t+1]
                lp_traverse[i][j] = lp
        # Normalize the probabilities for this time step.
        lp_arc[:,:,t] = lp_traverse - _logsum(lp_traverse)

    # Sum of all the transitions out of state i at time t.
    lp_arcout_t = numpy.zeros((N, T))
    for t in range(T):
        for i in range(N):
            lp_arcout_t[i][t] = _logsum(lp_arc[i,:,t])

    # Sum of all the transitions out of state i.
    lp_arcout = numpy.zeros(N)
    for i in range(N):
        lp_arcout[i] = _logsum(lp_arcout_t[i,:])

    # UPDATE P_INITIAL.  Assign into the array in place so the caller
    # sees the new values.
    new_lp_initial = lp_arcout_t[:,0]
    if lpseudo_initial is not None:
        new_lp_initial = _logvecadd(new_lp_initial, lpseudo_initial)
    lp_initial[:] = new_lp_initial - _logsum(new_lp_initial)

    # UPDATE P_TRANSITION.  p_transition[i][j] is the sum of all the
    # transitions from i to j, normalized by the sum of the
    # transitions out of i.
    for i in range(N):
        for j in range(N):
            lp_transition[i][j] = _logsum(lp_arc[i,j,:]) - lp_arcout[i]
        if lpseudo_transition is not None:
            lp_transition[i] = _logvecadd(lp_transition[i],
                                          lpseudo_transition[i])
        lp_transition[i] = lp_transition[i] - _logsum(lp_transition[i])

    # UPDATE P_EMISSION.  lp_emission[i][k] is the sum of all the
    # transitions out of i when k is observed, normalized by the sum
    # of the transitions out of i.
    for i in range(N):
        ksum = numpy.zeros(M) + LOG0    # ksum[k] is the log sum for k
        for t in range(T):
            k = outputs[t]
            for j in range(N):
                ksum[k] = logaddexp(ksum[k], lp_arc[i,j,t])
        ksum = ksum - _logsum(ksum)
        if lpseudo_emission is not None:
            ksum = _logvecadd(ksum, lpseudo_emission[i])
            ksum = ksum - _logsum(ksum)
        lp_emission[i,:] = ksum

    # Calculate the log likelihood of the output from the forward
    # matrix.  Since the parameters of the HMM have just changed, this
    # value is one step behind the current parameters.  The log
    # likelihood is the logsum of the last column of the matrix.
    return _logsum(fmat[:,T])


def _forward(N, T, lp_initial, lp_transition, lp_emission, outputs):
    # Implement the forward algorithm.  This calculates an N x T+1
    # matrix, where the first column holds the initial probabilities
    # and column t holds the log probability of each state after the
    # first t outputs.
    matrix = numpy.zeros((N, T+1))

    # Initialize the first column to the initial probabilities.
    matrix[:,0] = lp_initial
    for t in range(1, T+1):
        k = outputs[t-1]
        for j in range(N):
            # The probability of the state is the log sum of the
            # transitions from all the states at time t-1.
            lprob = LOG0
            for i in range(N):
                lp = matrix[i][t-1] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k]
                lprob = logaddexp(lprob, lp)
            matrix[j][t] = lprob
    return matrix

def _backward(N, T, lp_transition, lp_emission, outputs):
    # Implement the backward algorithm, the mirror image of _forward.
    # The last column is initialized to 0 (log 1), and the matrix is
    # filled in from the last time step backwards.
    matrix = numpy.zeros((N, T+1))
    for t in range(T-1, -1, -1):
        k = outputs[t]
        for i in range(N):
            # The probability of the state is the log sum of the
            # transitions to all the states at time t+1.
            lprob = LOG0
            for j in range(N):
                lp = matrix[j][t+1] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k]
                lprob = logaddexp(lprob, lp)
            matrix[i][t] = lprob
    return matrix

def train_visible(states, alphabet, training_data,
                  pseudo_initial=None, pseudo_transition=None,
                  pseudo_emission=None):
    """train_visible(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission]) -> MarkovModel

    Train a visible MarkovModel using maximum likelihood estimates
    for each of the parameters.  states is a list of strings that
    describe the names of each state.  alphabet is a list of objects
    that indicate the allowed outputs.  training_data is a list of
    (outputs, observed states) where outputs is a list of emissions
    from the alphabet, and observed states is a list of states from
    states.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    the different matrices.  They should be matrices of the
    appropriate size that contain numbers to add to each parameter
    matrix, before normalization.

    """
    N, M = len(states), len(alphabet)
    if pseudo_initial is not None:
        pseudo_initial = numpy.asarray(pseudo_initial)
        if pseudo_initial.shape != (N,):
            raise ValueError("pseudo_initial not shape len(states)")
    if pseudo_transition is not None:
        pseudo_transition = numpy.asarray(pseudo_transition)
        if pseudo_transition.shape != (N,N):
            raise ValueError("pseudo_transition not shape "
                             "len(states) X len(states)")
    if pseudo_emission is not None:
        pseudo_emission = numpy.asarray(pseudo_emission)
        if pseudo_emission.shape != (N,M):
            raise ValueError("pseudo_emission not shape "
                             "len(states) X len(alphabet)")

    # Training data is given as a list of (outputs, observed states)
    # pairs.  Convert the outputs and states into lists of indexes
    # into the alphabet and the state list.
    training_states, training_outputs = [], []
    states_indexes = itemindex(states)
    outputs_indexes = itemindex(alphabet)
    for toutputs, tstates in training_data:
        if len(tstates) != len(toutputs):
            raise ValueError("states and outputs not aligned")
        training_states.append([states_indexes[x] for x in tstates])
        training_outputs.append([outputs_indexes[x] for x in toutputs])

    x = _mle(N, M, training_outputs, training_states,
             pseudo_initial, pseudo_transition, pseudo_emission)
    p_initial, p_transition, p_emission = x

    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
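
# A minimal usage sketch for train_visible; the state-labeled training
# pair shown here is hypothetical:
#
#   states = ["rainy", "sunny"]
#   alphabet = ["walk", "shop"]
#   training_data = [(["walk", "shop"], ["sunny", "rainy"])]
#   mm = train_visible(states, alphabet, training_data)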

def _mle(N, M, training_outputs, training_states, pseudo_initial,
         pseudo_transition, pseudo_emission):
    # p_initial is the probability that a sequence of states starts
    # with a particular one.
    p_initial = numpy.zeros(N)
    if pseudo_initial is not None:
        p_initial = p_initial + pseudo_initial
    for states in training_states:
        p_initial[states[0]] += 1
    p_initial = _normalize(p_initial)

    # p_transition is the probability that a state leads to the next
    # one: C(i,j)/C(i) where i and j are states.
    p_transition = numpy.zeros((N,N))
    if pseudo_transition is not None:
        p_transition = p_transition + pseudo_transition
    for states in training_states:
        for n in range(len(states)-1):
            i, j = states[n], states[n+1]
            p_transition[i, j] += 1
    for i in range(len(p_transition)):
        p_transition[i,:] = p_transition[i,:] / sum(p_transition[i,:])

    # p_emission is the probability of an output given a state:
    # C(s,o)/C(s) where o is an output and s is a state.  Start the
    # counts at 1 (add-one smoothing) so that no emission ends up with
    # probability 0, then add any pseudo-counts on top.
    p_emission = numpy.ones((N,M))
    if pseudo_emission is not None:
        p_emission = p_emission + pseudo_emission
    for outputs, states in zip(training_outputs, training_states):
        for o, s in zip(outputs, states):
            p_emission[s, o] += 1
    for i in range(len(p_emission)):
        p_emission[i,:] = p_emission[i,:] / sum(p_emission[i,:])

    return p_initial, p_transition, p_emission

def _argmaxes(vector):
    # Return a list of the indexes with the highest value in the
    # vector.  The signature and behavior are reconstructed from the
    # call sites in _viterbi; this version returns only the single
    # best index.
    return [numpy.argmax(vector)]

def find_states(markov_model, output):
    """find_states(markov_model, output) -> list of (states, score)"""
    mm = markov_model
    N = len(mm.states)

    # _viterbi does calculations in log space.  Add a tiny amount to
    # the matrices so that taking the log of a 0 probability does not
    # break.
    x = mm.p_initial + VERY_SMALL_NUMBER
    y = mm.p_transition + VERY_SMALL_NUMBER
    z = mm.p_emission + VERY_SMALL_NUMBER
    lp_initial, lp_transition, lp_emission = map(numpy.log, (x, y, z))

    # Change the output into a list of indexes into the alphabet.
    indexes = itemindex(mm.alphabet)
    output = [indexes[x] for x in output]

    # Run the viterbi algorithm.
    results = _viterbi(N, lp_initial, lp_transition, lp_emission, output)

    # Convert the state indexes back to names, and the log score back
    # to a probability.
    for i in range(len(results)):
        states, score = results[i]
        results[i] = [mm.states[x] for x in states], numpy.exp(score)
    return results
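
# A minimal usage sketch for find_states; the trained model mm and the
# observation sequence are hypothetical:
#
#   results = find_states(mm, ["walk", "shop", "walk"])
#   for states, score in results:
#       print(states, score)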

def _viterbi(N, lp_initial, lp_transition, lp_emission, output):
    # Viterbi decoding: find the highest-scoring path(s) of states for
    # an output sequence by dynamic programming.
    T = len(output)

    # backtrace[j][t] holds the indexes of the states at time t-1 that
    # achieve the best score for state j at time t.
    backtrace = []
    for i in range(N):
        backtrace.append([None] * T)

    # Fill in the score matrix, column by column.
    scores = numpy.zeros((N, T))
    scores[:,0] = lp_initial + lp_emission[:,output[0]]
    for t in range(1, T):
        k = output[t]
        for j in range(N):
            # Find the most likely state it came from.
            i_scores = scores[:,t-1] + \
                       lp_transition[:,j] + \
                       lp_emission[j,k]
            indexes = _argmaxes(i_scores)
            scores[j,t] = i_scores[indexes[0]]
            backtrace[j][t] = indexes

    # Do the backtrace.  Work backwards from the states with the
    # highest scores at the final time step, prepending predecessor
    # states as we go; ties in the backtrace can produce multiple
    # results.
    in_process = []    # list of (t, states, score)
    results = []       # list of (states, score)
    indexes = _argmaxes(scores[:,T-1])
    for i in indexes:
        in_process.append((T-1, [i], scores[i][T-1]))
    while in_process:
        t, states, score = in_process.pop()
        if t == 0:
            results.append((states, score))
        else:
            indexes = backtrace[states[0]][t]
            for i in indexes:
                in_process.append((t-1, [i]+states, score))
    return results

def _normalize(matrix):
    # Normalize the matrix so that probabilities add up to 1.0: a
    # 1-dimensional matrix is normalized as a whole, and each row of a
    # 2-dimensional matrix is normalized separately.  This helper is
    # reconstructed from its call sites in _mle and _random_norm.
    if len(matrix.shape) == 1:
        matrix = matrix / float(sum(matrix))
    elif len(matrix.shape) == 2:
        for i in range(len(matrix)):
            matrix[i,:] = matrix[i,:] / sum(matrix[i,:])
    else:
        raise ValueError("I don't handle matrices > 2 dimensions")
    return matrix

def _random_norm(shape):
    # Return random starting parameters of the given shape, normalized
    # into valid probability distributions.  Reconstructed from its
    # call sites in _baum_welch.
    matrix = numpy.random.random(shape)
    return _normalize(matrix)

def _copy_and_check(matrix, desired_shape):
    # Copy the matrix, then check its dimensions and normalization.
    matrix = numpy.array(matrix, copy=1)
    # Check the dimensions.
    if matrix.shape != desired_shape:
        raise ValueError("Incorrect dimension")
    # Make sure it is normalized.
    if len(matrix.shape) == 1:
        if numpy.fabs(sum(matrix)-1.0) > 0.01:
            raise ValueError("matrix not normalized to 1.0")
    elif len(matrix.shape) == 2:
        for i in range(len(matrix)):
            if numpy.fabs(sum(matrix[i])-1.0) > 0.01:
                raise ValueError("matrix %d not normalized to 1.0" % i)
    else:
        raise ValueError("I don't handle matrices > 2 dimensions")
    return matrix

def _logsum(matrix):
    # Log of the sum of the exponentiated entries of a matrix of log
    # probabilities, computed in log space.  Reconstructed from its
    # call sites.
    if len(matrix.shape) > 1:
        vec = numpy.reshape(matrix, (numpy.prod(matrix.shape),))
    else:
        vec = matrix
    lsum = LOG0
    for num in vec:
        lsum = logaddexp(lsum, num)
    return lsum

def _logvecadd(logvec1, logvec2):
    # Elementwise logaddexp of two vectors of log probabilities.
    assert len(logvec1) == len(logvec2), "vectors aren't the same length"
    sumvec = numpy.zeros(len(logvec1))
    for i in range(len(logvec1)):
        sumvec[i] = logaddexp(logvec1[i], logvec2[i])
    return sumvec