
Source Code for Module Bio.MarkovModel

  1  """ 
  2  This is an implementation of a state-emitting MarkovModel.  I am using 
  3  terminology similar to Manning and Schutze. 
  4   
  5   
  6   
  7  Functions: 
  8  train_bw        Train a markov model using the Baum-Welch algorithm. 
  9  train_visible   Train a visible markov model using MLE. 
  10  find_states     Find a state sequence that explains some observations. 
 11   
 12  load            Load a MarkovModel. 
 13  save            Save a MarkovModel. 
 14   
 15  Classes: 
 16  MarkovModel     Holds the description of a markov model 
 17  """ 
 18   
 19  import numpy 
 20   
 21  try: 
 22      logaddexp = numpy.logaddexp 
 23  except AttributeError: 
 24      # Numpy versions older than 1.3 do not contain logaddexp. 
 25      # Once we require Numpy version 1.3 or later, we should revisit this 
 26      # module to see if we can simplify some of the other functions in 
 27      # this module. 
 28      import warnings 
 29      warnings.warn("For optimal speed, please update to Numpy version 1.3 or later (current version is %s)" % numpy.__version__) 
  30      def logaddexp(logx, logy): 
  31          if logy - logx > 100: 
  32              return logy 
  33          elif logx - logy > 100: 
  34              return logx 
  35          minxy = min(logx, logy) 
  36          return minxy + numpy.log(numpy.exp(logx-minxy) + numpy.exp(logy-minxy)) 
  37 
  38 
  39 
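The fallback above computes the same quantity as numpy.logaddexp, namely log(exp(logx) + exp(logy)), in a numerically safer way. A small sanity check (the probabilities 0.5 and 0.25 are chosen arbitrarily for illustration):

    import numpy
    from Bio.MarkovModel import logaddexp

    # log(0.5) and log(0.25) combined in log space should give log(0.75).
    assert numpy.allclose(logaddexp(numpy.log(0.5), numpy.log(0.25)), numpy.log(0.75))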
  40  def itemindex(values): 
  41      d = {} 
  42      entries = enumerate(values[::-1]) 
  43      n = len(values)-1 
  44      for index, key in entries: d[key] = n-index 
  45      return d 
  46 
  47  numpy.random.seed() 
  48 
  49  VERY_SMALL_NUMBER = 1E-300 
  50  LOG0 = numpy.log(VERY_SMALL_NUMBER) 
  51 
  52  class MarkovModel: 
  53      def __init__(self, states, alphabet, 
  54                   p_initial=None, p_transition=None, p_emission=None): 
  55          self.states = states 
  56          self.alphabet = alphabet 
  57          self.p_initial = p_initial 
  58          self.p_transition = p_transition 
  59          self.p_emission = p_emission 
  60      def __str__(self): 
  61          import StringIO 
  62          handle = StringIO.StringIO() 
  63          save(self, handle) 
  64          handle.seek(0) 
  65          return handle.read() 
66
  67  def _readline_and_check_start(handle, start): 
  68      line = handle.readline() 
  69      if not line.startswith(start): 
  70          raise ValueError("I expected %r but got %r" % (start, line)) 
  71      return line 
72
  73  def load(handle): 
  74      """load(handle) -> MarkovModel()""" 
  75      # Load the states. 
  76      line = _readline_and_check_start(handle, "STATES:") 
  77      states = line.split()[1:] 
  78 
  79      # Load the alphabet. 
  80      line = _readline_and_check_start(handle, "ALPHABET:") 
  81      alphabet = line.split()[1:] 
  82 
  83      mm = MarkovModel(states, alphabet) 
  84      N, M = len(states), len(alphabet) 
  85 
  86      # Load the initial probabilities. 
  87      mm.p_initial = numpy.zeros(N) 
  88      line = _readline_and_check_start(handle, "INITIAL:") 
  89      for i in range(len(states)): 
  90          line = _readline_and_check_start(handle, " %s:" % states[i]) 
  91          mm.p_initial[i] = float(line.split()[-1]) 
  92 
  93      # Load the transition. 
  94      mm.p_transition = numpy.zeros((N, N)) 
  95      line = _readline_and_check_start(handle, "TRANSITION:") 
  96      for i in range(len(states)): 
  97          line = _readline_and_check_start(handle, " %s:" % states[i]) 
  98          mm.p_transition[i,:] = map(float, line.split()[1:]) 
  99 
 100      # Load the emission. 
 101      mm.p_emission = numpy.zeros((N, M)) 
 102      line = _readline_and_check_start(handle, "EMISSION:") 
 103      for i in range(len(states)): 
 104          line = _readline_and_check_start(handle, " %s:" % states[i]) 
 105          mm.p_emission[i,:] = map(float, line.split()[1:]) 
 106 
 107      return mm 
108
 109  def save(mm, handle): 
 110      """save(mm, handle)""" 
 111      # This will fail if there are spaces in the states or alphabet. 
 112      w = handle.write 
 113      w("STATES: %s\n" % ' '.join(mm.states)) 
 114      w("ALPHABET: %s\n" % ' '.join(mm.alphabet)) 
 115      w("INITIAL:\n") 
 116      for i in range(len(mm.p_initial)): 
 117          w(" %s: %g\n" % (mm.states[i], mm.p_initial[i])) 
 118      w("TRANSITION:\n") 
 119      for i in range(len(mm.p_transition)): 
 120          x = map(str, mm.p_transition[i]) 
 121          w(" %s: %s\n" % (mm.states[i], ' '.join(x))) 
 122      w("EMISSION:\n") 
 123      for i in range(len(mm.p_emission)): 
 124          x = map(str, mm.p_emission[i]) 
 125          w(" %s: %s\n" % (mm.states[i], ' '.join(x))) 
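Together, save and load give a plain-text round trip for a model. A minimal sketch (the two-state coin model and its parameter values are made up for illustration; as the comment above notes, state and alphabet names must not contain spaces):

    import numpy
    from StringIO import StringIO
    from Bio import MarkovModel

    # Hypothetical two-state model with explicit parameter matrices.
    mm = MarkovModel.MarkovModel(["fair", "loaded"], ["H", "T"],
                                 p_initial=numpy.array([0.5, 0.5]),
                                 p_transition=numpy.array([[0.9, 0.1], [0.2, 0.8]]),
                                 p_emission=numpy.array([[0.5, 0.5], [0.9, 0.1]]))
    handle = StringIO()
    MarkovModel.save(mm, handle)
    handle.seek(0)
    mm2 = MarkovModel.load(handle)   # parameters come back as numpy arrays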
 126 
 127  # XXX allow them to specify starting points 
 128  def train_bw(states, alphabet, training_data, 
 129               pseudo_initial=None, pseudo_transition=None, pseudo_emission=None, 
 130               update_fn=None, 
 131               ): 
 132      """train_bw(states, alphabet, training_data[, pseudo_initial] 
 133      [, pseudo_transition][, pseudo_emission][, update_fn]) -> MarkovModel 
 134 
 135      Train a MarkovModel using the Baum-Welch algorithm.  states is a list 
 136      of strings that describe the names of each state.  alphabet is a 
 137      list of objects that indicate the allowed outputs.  training_data 
 138      is a list of observations.  Each observation is a list of objects 
 139      from the alphabet. 
 140 
 141      pseudo_initial, pseudo_transition, and pseudo_emission are 
 142      optional parameters that you can use to assign pseudo-counts to 
 143      different matrices.  They should be matrices of the appropriate 
 144      size that contain numbers to add to each parameter matrix, before 
 145      normalization. 
 146 
 147      update_fn is an optional callback that takes parameters 
 148      (iteration, log_likelihood).  It is called once per iteration. 
 149 
 150      """ 
 151      N, M = len(states), len(alphabet) 
 152      if not training_data: 
 153          raise ValueError("No training data given.") 
 154      if pseudo_initial!=None: 
 155          pseudo_initial = numpy.asarray(pseudo_initial) 
 156          if pseudo_initial.shape != (N,): 
 157              raise ValueError("pseudo_initial not shape len(states)") 
 158      if pseudo_transition!=None: 
 159          pseudo_transition = numpy.asarray(pseudo_transition) 
 160          if pseudo_transition.shape != (N,N): 
 161              raise ValueError("pseudo_transition not shape " + \ 
 162                               "len(states) X len(states)") 
 163      if pseudo_emission!=None: 
 164          pseudo_emission = numpy.asarray(pseudo_emission) 
 165          if pseudo_emission.shape != (N,M): 
 166              raise ValueError("pseudo_emission not shape " + \ 
 167                               "len(states) X len(alphabet)") 
 168 
 169      # Training data is given as a list of members of the alphabet. 
 170      # Replace those with indexes into the alphabet list for easier 
 171      # computation. 
 172      training_outputs = [] 
 173      indexes = itemindex(alphabet) 
 174      for outputs in training_data: 
 175          training_outputs.append([indexes[x] for x in outputs]) 
 176 
 177      # Do some sanity checking on the outputs. 
 178      lengths = map(len, training_outputs) 
 179      if min(lengths) == 0: 
 180          raise ValueError("I got training data with outputs of length 0") 
 181 
 182      # Do the training with baum welch. 
 183      x = _baum_welch(N, M, training_outputs, 
 184                      pseudo_initial=pseudo_initial, 
 185                      pseudo_transition=pseudo_transition, 
 186                      pseudo_emission=pseudo_emission, 
 187                      update_fn=update_fn) 
 188      p_initial, p_transition, p_emission = x 
 189      return MarkovModel(states, alphabet, p_initial, p_transition, p_emission) 
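A minimal train_bw sketch based on the docstring above; the observation sequences are invented, and because _baum_welch (below) starts from random parameters when none are given, the fitted model will differ between runs:

    from Bio import MarkovModel

    def report(iteration, log_likelihood):
        # Optional progress callback, passed as update_fn.
        print "iteration %d: log likelihood %f" % (iteration, log_likelihood)

    # Each observation is a list of symbols from the alphabet.
    observations = [["a", "b", "b", "a"],
                    ["b", "b", "a", "a"],
                    ["a", "b", "a", "b"]]
    mm = MarkovModel.train_bw(["state0", "state1"], ["a", "b"], observations,
                              update_fn=report)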
 190 
 191  MAX_ITERATIONS = 1000 
 192  def _baum_welch(N, M, training_outputs, 
 193                  p_initial=None, p_transition=None, p_emission=None, 
 194                  pseudo_initial=None, pseudo_transition=None, 
 195                  pseudo_emission=None, update_fn=None): 
 196      # Returns (p_initial, p_transition, p_emission) 
 197      if p_initial==None: 
 198          p_initial = _random_norm(N) 
 199      else: 
 200          p_initial = _copy_and_check(p_initial, (N,)) 
 201 
 202      if p_transition==None: 
 203          p_transition = _random_norm((N,N)) 
 204      else: 
 205          p_transition = _copy_and_check(p_transition, (N,N)) 
 206      if p_emission==None: 
 207          p_emission = _random_norm((N,M)) 
 208      else: 
 209          p_emission = _copy_and_check(p_emission, (N,M)) 
 210 
 211      # Do all the calculations in log space to avoid underflows. 
 212      lp_initial, lp_transition, lp_emission = map( 
 213          numpy.log, (p_initial, p_transition, p_emission)) 
 214      if pseudo_initial!=None: 
 215          lpseudo_initial = numpy.log(pseudo_initial) 
 216      else: 
 217          lpseudo_initial = None 
 218      if pseudo_transition!=None: 
 219          lpseudo_transition = numpy.log(pseudo_transition) 
 220      else: 
 221          lpseudo_transition = None 
 222      if pseudo_emission!=None: 
 223          lpseudo_emission = numpy.log(pseudo_emission) 
 224      else: 
 225          lpseudo_emission = None 
 226 
 227      # Iterate through each sequence of output, updating the parameters 
 228      # to the HMM.  Stop when the log likelihoods of the sequences 
 229      # stop varying. 
 230      prev_llik = None 
 231      for i in range(MAX_ITERATIONS): 
 232          llik = LOG0 
 233          for outputs in training_outputs: 
 234              x = _baum_welch_one( 
 235                  N, M, outputs, 
 236                  lp_initial, lp_transition, lp_emission, 
 237                  lpseudo_initial, lpseudo_transition, lpseudo_emission,) 
 238              llik += x 
 239          if update_fn is not None: 
 240              update_fn(i, llik) 
 241          if prev_llik is not None and numpy.fabs(prev_llik-llik) < 0.1: 
 242              break 
 243          prev_llik = llik 
 244      else: 
 245          raise RuntimeError("HMM did not converge in %d iterations" \ 
 246                             % MAX_ITERATIONS) 
 247 
 248      # Return everything back in normal space. 
 249      return map(numpy.exp, (lp_initial, lp_transition, lp_emission)) 
250
 251  def _baum_welch_one(N, M, outputs, 
 252                      lp_initial, lp_transition, lp_emission, 
 253                      lpseudo_initial, lpseudo_transition, lpseudo_emission): 
 254      # Do one iteration of Baum-Welch based on a sequence of output. 
 255      # NOTE: This will change the values of lp_initial, lp_transition, 
 256      # and lp_emission in place. 
 257      T = len(outputs) 
 258      fmat = _forward(N, T, lp_initial, lp_transition, lp_emission, outputs) 
 259      bmat = _backward(N, T, lp_transition, lp_emission, outputs) 
 260 
 261      # Calculate the probability of traversing each arc for any given 
 262      # transition. 
 263      lp_arc = numpy.zeros((N, N, T)) 
 264      for t in range(T): 
 265          k = outputs[t] 
 266          lp_traverse = numpy.zeros((N, N))  # P going over one arc. 
 267          for i in range(N): 
 268              for j in range(N): 
 269                  # P(getting to this arc) 
 270                  # P(making this transition) 
 271                  # P(emitting this character) 
 272                  # P(going to the end) 
 273                  lp = fmat[i][t] + \ 
 274                       lp_transition[i][j] + \ 
 275                       lp_emission[i][k] + \ 
 276                       bmat[j][t+1] 
 277                  lp_traverse[i][j] = lp 
 278          # Normalize the probability for this time step. 
 279          lp_arc[:,:,t] = lp_traverse - _logsum(lp_traverse) 
 280 
 281 
 282      # Sum of all the transitions out of state i at time t. 
 283      lp_arcout_t = numpy.zeros((N, T)) 
 284      for t in range(T): 
 285          for i in range(N): 
 286              lp_arcout_t[i][t] = _logsum(lp_arc[i,:,t]) 
 287 
 288      # Sum of all the transitions out of state i. 
 289      lp_arcout = numpy.zeros(N) 
 290      for i in range(N): 
 291          lp_arcout[i] = _logsum(lp_arcout_t[i,:]) 
 292 
 293      # UPDATE P_INITIAL. 
 294      lp_initial = lp_arcout_t[:,0] 
 295      if lpseudo_initial!=None: 
 296          lp_initial = _logvecadd(lp_initial, lpseudo_initial) 
 297          lp_initial = lp_initial - _logsum(lp_initial) 
 298 
 299      # UPDATE P_TRANSITION.  p_transition[i][j] is the sum of all the 
 300      # transitions from i to j, normalized by the sum of the 
 301      # transitions out of i. 
 302      for i in range(N): 
 303          for j in range(N): 
 304              lp_transition[i][j] = _logsum(lp_arc[i,j,:]) - lp_arcout[i] 
 305          if lpseudo_transition!=None: 
 306              lp_transition[i] = _logvecadd(lp_transition[i], lpseudo_transition) 
 307              lp_transition[i] = lp_transition[i] - _logsum(lp_transition[i]) 
 308 
 309      # UPDATE P_EMISSION.  lp_emission[i][k] is the sum of all the 
 310      # transitions out of i when k is observed, divided by the sum of 
 311      # the transitions out of i. 
 312      for i in range(N): 
 313          ksum = numpy.zeros(M)+LOG0   # ksum[k] is the sum of all i with k. 
 314          for t in range(T): 
 315              k = outputs[t] 
 316              for j in range(N): 
 317                  ksum[k] = logaddexp(ksum[k], lp_arc[i,j,t]) 
 318          ksum = ksum - _logsum(ksum)      # Normalize 
 319          if lpseudo_emission!=None: 
 320              ksum = _logvecadd(ksum, lpseudo_emission[i]) 
 321              ksum = ksum - _logsum(ksum)  # Renormalize 
 322          lp_emission[i,:] = ksum 
 323 
 324      # Calculate the log likelihood of the output based on the forward 
 325      # matrix.  Since the parameters of the HMM have changed, the log 
 326      # likelihoods are going to be a step behind, and we might be doing 
 327      # one extra iteration of training.  The alternative is to rerun 
 328      # the _forward algorithm and calculate from the clean one, but 
 329      # that may be more expensive than overshooting the training by one 
 330      # step. 
 331      return _logsum(fmat[:,T]) 
332
 333  def _forward(N, T, lp_initial, lp_transition, lp_emission, outputs): 
 334      # Implement the forward algorithm.  This actually calculates an 
 335      # Nx(T+1) matrix, where the last column is the total probability 
 336      # of the output. 
 337 
 338      matrix = numpy.zeros((N, T+1)) 
 339 
 340      # Initialize the first column to be the initial values. 
 341      matrix[:,0] = lp_initial 
 342      for t in range(1, T+1): 
 343          k = outputs[t-1] 
 344          for j in range(N): 
 345              # The probability of the state is the sum of the 
 346              # transitions from all the states from time t-1. 
 347              lprob = LOG0 
 348              for i in range(N): 
 349                  lp = matrix[i][t-1] + \ 
 350                       lp_transition[i][j] + \ 
 351                       lp_emission[i][k] 
 352                  lprob = logaddexp(lprob, lp) 
 353              matrix[j][t] = lprob 
 354      return matrix 
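A tiny sanity check of _forward (an internal helper, so this is only an illustration): with a single state, the total probability of an output sequence reduces to the product of its emission probabilities, which is what the last column of the returned matrix should log-sum to:

    import numpy
    from Bio.MarkovModel import _forward, _logsum

    lp_initial = numpy.log(numpy.array([1.0]))             # one state
    lp_transition = numpy.log(numpy.array([[1.0]]))
    lp_emission = numpy.log(numpy.array([[0.25, 0.75]]))   # P(symbol 0), P(symbol 1)
    outputs = [0, 1, 1]
    fmat = _forward(1, len(outputs), lp_initial, lp_transition, lp_emission, outputs)
    assert numpy.allclose(_logsum(fmat[:, len(outputs)]), numpy.log(0.25 * 0.75 * 0.75))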
355
 356  def _backward(N, T, lp_transition, lp_emission, outputs): 
 357      matrix = numpy.zeros((N, T+1)) 
 358      for t in range(T-1, -1, -1): 
 359          k = outputs[t] 
 360          for i in range(N): 
 361              # The probability of the state is the sum of the 
 362              # transitions from all the states from time t+1. 
 363              lprob = LOG0 
 364              for j in range(N): 
 365                  lp = matrix[j][t+1] + \ 
 366                       lp_transition[i][j] + \ 
 367                       lp_emission[i][k] 
 368                  lprob = logaddexp(lprob, lp) 
 369              matrix[i][t] = lprob 
 370      return matrix 
371
 372  def train_visible(states, alphabet, training_data, 
 373                    pseudo_initial=None, pseudo_transition=None, 
 374                    pseudo_emission=None): 
 375      """train_visible(states, alphabet, training_data[, pseudo_initial] 
 376      [, pseudo_transition][, pseudo_emission]) -> MarkovModel 
 377 
 378      Train a visible MarkovModel using maximum likelihood estimates 
 379      for each of the parameters.  states is a list of strings that 
 380      describe the names of each state.  alphabet is a list of objects 
 381      that indicate the allowed outputs.  training_data is a list of 
 382      (outputs, observed states) where outputs is a list of the emissions 
 383      from the alphabet, and observed states is a list of states from 
 384      states. 
 385 
 386      pseudo_initial, pseudo_transition, and pseudo_emission are 
 387      optional parameters that you can use to assign pseudo-counts to 
 388      different matrices.  They should be matrices of the appropriate 
 389      size that contain numbers to add to each parameter matrix 
 390 
 391      """ 
 392      N, M = len(states), len(alphabet) 
 393      if pseudo_initial!=None: 
 394          pseudo_initial = numpy.asarray(pseudo_initial) 
 395          if pseudo_initial.shape != (N,): 
 396              raise ValueError("pseudo_initial not shape len(states)") 
 397      if pseudo_transition!=None: 
 398          pseudo_transition = numpy.asarray(pseudo_transition) 
 399          if pseudo_transition.shape != (N,N): 
 400              raise ValueError("pseudo_transition not shape " + \ 
 401                               "len(states) X len(states)") 
 402      if pseudo_emission!=None: 
 403          pseudo_emission = numpy.asarray(pseudo_emission) 
 404          if pseudo_emission.shape != (N,M): 
 405              raise ValueError("pseudo_emission not shape " + \ 
 406                               "len(states) X len(alphabet)") 
 407 
 408      # Training data is given as a list of members of the alphabet. 
 409      # Replace those with indexes into the alphabet list for easier 
 410      # computation. 
 411      training_states, training_outputs = [], [] 
 412      states_indexes = itemindex(states) 
 413      outputs_indexes = itemindex(alphabet) 
 414      for toutputs, tstates in training_data: 
 415          if len(tstates) != len(toutputs): 
 416              raise ValueError("states and outputs not aligned") 
 417          training_states.append([states_indexes[x] for x in tstates]) 
 418          training_outputs.append([outputs_indexes[x] for x in toutputs]) 
 419 
 420      x = _mle(N, M, training_outputs, training_states, 
 421               pseudo_initial, pseudo_transition, pseudo_emission) 
 422      p_initial, p_transition, p_emission = x 
 423 
 424      return MarkovModel(states, alphabet, p_initial, p_transition, p_emission) 
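A short train_visible sketch following the docstring above (the toy sequences are invented); the returned model exposes the estimated matrices directly as attributes:

    from Bio import MarkovModel

    states = ["0", "1"]
    alphabet = ["a", "b"]
    # Each training example pairs an output sequence with its observed state sequence.
    training_data = [(["a", "b", "a"], ["0", "0", "1"]),
                     (["b", "a", "b"], ["1", "1", "0"])]
    mm = MarkovModel.train_visible(states, alphabet, training_data)
    print mm.p_initial      # length-2 vector of starting-state probabilities
    print mm.p_transition   # 2x2 array; row i is P(next state | state i)
    print mm.p_emission     # 2x2 array; row i is P(output | state i)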
425
 426  def _mle(N, M, training_outputs, training_states, pseudo_initial, 
 427           pseudo_transition, pseudo_emission): 
 428      # p_initial is the probability that a sequence of states starts 
 429      # off with a particular one. 
 430      p_initial = numpy.zeros(N) 
 431      if pseudo_initial: 
 432          p_initial = p_initial + pseudo_initial 
 433      for states in training_states: 
 434          p_initial[states[0]] += 1 
 435      p_initial = _normalize(p_initial) 
 436 
 437      # p_transition is the probability that a state leads to the next 
 438      # one.  C(i,j)/C(i) where i and j are states. 
 439      p_transition = numpy.zeros((N,N)) 
 440      if pseudo_transition: 
 441          p_transition = p_transition + pseudo_transition 
 442      for states in training_states: 
 443          for n in range(len(states)-1): 
 444              i, j = states[n], states[n+1] 
 445              p_transition[i, j] += 1 
 446      for i in range(len(p_transition)): 
 447          p_transition[i,:] = p_transition[i,:] / sum(p_transition[i,:]) 
 448 
 449      # p_emission is the probability of an output given a state. 
 450      # C(s,o)|C(s) where o is an output and s is a state. 
 451      p_emission = numpy.zeros((N,M)) 
 452      if pseudo_emission: 
 453          p_emission = p_emission + pseudo_emission 
 454      p_emission = numpy.ones((N,M)) 
 455      for outputs, states in zip(training_outputs, training_states): 
 456          for o, s in zip(outputs, states): 
 457              p_emission[s, o] += 1 
 458      for i in range(len(p_emission)): 
 459          p_emission[i,:] = p_emission[i,:] / sum(p_emission[i,:]) 
 460 
 461      return p_initial, p_transition, p_emission 
462
 463  def _argmaxes(vector, allowance=None): 
 464      return [numpy.argmax(vector)] 
465
 466  def find_states(markov_model, output): 
 467      """find_states(markov_model, output) -> list of (states, score)""" 
 468      mm = markov_model 
 469      N = len(mm.states) 
 470 
 471      # _viterbi does calculations in log space.  Add a tiny bit to the 
 472      # matrices so that the logs will not break. 
 473      x = mm.p_initial + VERY_SMALL_NUMBER 
 474      y = mm.p_transition + VERY_SMALL_NUMBER 
 475      z = mm.p_emission + VERY_SMALL_NUMBER 
 476      lp_initial, lp_transition, lp_emission = map(numpy.log, (x, y, z)) 
 477      # Change output into a list of indexes into the alphabet. 
 478      indexes = itemindex(mm.alphabet) 
 479      output = [indexes[x] for x in output] 
 480 
 481      # Run the viterbi algorithm. 
 482      results = _viterbi(N, lp_initial, lp_transition, lp_emission, output) 
 483 
 484      for i in range(len(results)): 
 485          states, score = results[i] 
 486          results[i] = [mm.states[x] for x in states], numpy.exp(score) 
 487      return results 
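A small decoding sketch for find_states; the hand-built model is hypothetical and just big enough for the Viterbi pass to run:

    import numpy
    from Bio import MarkovModel

    mm = MarkovModel.MarkovModel(["hot", "cold"], ["H", "T"],
                                 p_initial=numpy.array([0.5, 0.5]),
                                 p_transition=numpy.array([[0.8, 0.2], [0.3, 0.7]]),
                                 p_emission=numpy.array([[0.9, 0.1], [0.2, 0.8]]))
    for state_names, score in MarkovModel.find_states(mm, ["H", "H", "T"]):
        print state_names, score   # list of state names and the path's score (a probability)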
488
 489  def _viterbi(N, lp_initial, lp_transition, lp_emission, output): 
 490      # The Viterbi algorithm finds the most likely set of states for a 
 491      # given output.  Returns a list of states. 
 492 
 493      T = len(output) 
 494      # Store the backtrace in a NxT matrix. 
 495      backtrace = []    # list of indexes of states in previous timestep. 
 496      for i in range(N): 
 497          backtrace.append([None] * T) 
 498 
 499      # Store the best scores. 
 500      scores = numpy.zeros((N, T)) 
 501      scores[:,0] = lp_initial + lp_emission[:,output[0]] 
 502      for t in range(1, T): 
 503          k = output[t] 
 504          for j in range(N): 
 505              # Find the most likely place it came from. 
 506              i_scores = scores[:,t-1] + \ 
 507                         lp_transition[:,j] + \ 
 508                         lp_emission[j,k] 
 509              indexes = _argmaxes(i_scores) 
 510              scores[j,t] = i_scores[indexes[0]] 
 511              backtrace[j][t] = indexes 
 512 
 513      # Do the backtrace.  First, find a good place to start.  Then, 
 514      # we'll follow the backtrace matrix to find the list of states. 
 515      # In the event of ties, there may be multiple paths back through 
 516      # the matrix, which implies a recursive solution.  We'll simulate 
 517      # it by keeping our own stack. 
 518      in_process = []    # list of (t, states, score) 
 519      results = []       # return values.  list of (states, score) 
 520      indexes = _argmaxes(scores[:,T-1])      # pick the first place 
 521      for i in indexes: 
 522          in_process.append((T-1, [i], scores[i][T-1])) 
 523      while in_process: 
 524          t, states, score = in_process.pop() 
 525          if t == 0: 
 526              results.append((states, score)) 
 527          else: 
 528              indexes = backtrace[states[0]][t] 
 529              for i in indexes: 
 530                  in_process.append((t-1, [i]+states, score)) 
 531      return results 
532
 533  def _normalize(matrix): 
 534      # Make sure numbers add up to 1.0 
 535      if len(matrix.shape) == 1: 
 536          matrix = matrix / float(sum(matrix)) 
 537      elif len(matrix.shape) == 2: 
 538          # Normalize by rows. 
 539          for i in range(len(matrix)): 
 540              matrix[i,:] = matrix[i,:] / sum(matrix[i,:]) 
 541      else: 
 542          raise ValueError("I cannot handle matrixes of that shape") 
 543      return matrix 
544
 545  def _uniform_norm(shape): 
 546      matrix = numpy.ones(shape) 
 547      return _normalize(matrix) 
548
 549  def _random_norm(shape): 
 550      matrix = numpy.random.random(shape) 
 551      return _normalize(matrix) 
552
 553  def _copy_and_check(matrix, desired_shape): 
 554      # Copy the matrix. 
 555      matrix = numpy.array(matrix, copy=1) 
 556      # Check the dimensions. 
 557      if matrix.shape != desired_shape: 
 558          raise ValueError("Incorrect dimension") 
 559      # Make sure it's normalized. 
 560      if len(matrix.shape) == 1: 
 561          if numpy.fabs(sum(matrix)-1.0) > 0.01: 
 562              raise ValueError("matrix not normalized to 1.0") 
 563      elif len(matrix.shape) == 2: 
 564          for i in range(len(matrix)): 
 565              if numpy.fabs(sum(matrix[i])-1.0) > 0.01: 
 566                  raise ValueError("matrix %d not normalized to 1.0" % i) 
 567      else: 
 568          raise ValueError("I don't handle matrices > 2 dimensions") 
 569      return matrix 
570
 571  def _logsum(matrix): 
 572      if len(matrix.shape) > 1: 
 573          vec = numpy.reshape(matrix, (numpy.product(matrix.shape),)) 
 574      else: 
 575          vec = matrix 
 576      sum = LOG0 
 577      for num in vec: 
 578          sum = logaddexp(sum, num) 
 579      return sum 
580
 581  def _logvecadd(logvec1, logvec2): 
 582      assert len(logvec1) == len(logvec2), "vectors aren't the same length" 
 583      sumvec = numpy.zeros(len(logvec1)) 
 584      for i in range(len(logvec1)): 
 585          sumvec[i] = logaddexp(logvec1[i], logvec2[i]) 
 586      return sumvec 
587
 588  def _exp_logsum(numbers): 
 589      sum = _logsum(numbers) 
 590      return numpy.exp(sum) 
591