1
2
3
4
5
6 """This package implements pairwise sequence alignment using a dynamic
7 programming algorithm.
8
9 This provides functions to get global and local alignments between two
10 sequences. A global alignment finds the best concordance between all
11 characters in two sequences. A local alignment finds just the
12 subsequences that align the best.
13
14 When doing alignments, you can specify the match score and gap
15 penalties. The match score indicates the compatibility between an
16 alignment of two characters in the sequences. Highly compatible
17 characters should be given positive scores, and incompatible ones
18 should be given negative scores or 0. The gap penalties should be
19 negative.
20
21 The names of the alignment functions in this module follow the
22 convention
23 <alignment type>XX
24 where <alignment type> is either "global" or "local" and XX is a 2
25 character code indicating the parameters it takes. The first
26 character indicates the parameters for matches (and mismatches), and
27 the second indicates the parameters for gap penalties.
28
29 The match parameters are
30 CODE DESCRIPTION
31 x No parameters. Identical characters have score of 1, otherwise 0.
32 m A match score is the score of identical chars, otherwise mismatch score.
33 d A dictionary returns the score of any pair of characters.
34 c A callback function returns scores.
35
36 The gap penalty parameters are
37 CODE DESCRIPTION
38 x No gap penalties.
39 s Same open and extend gap penalties for both sequences.
40 d The sequences have different open and extend gap penalties.
41 c A callback function returns the gap penalties.
42
43 All the different alignment functions are contained in an object
44 "align". For example:
45
46 >>> from Bio import pairwise2
47 >>> alignments = pairwise2.align.globalxx("ACCGT", "ACG")
48
49 will return a list of the alignments between the two strings. The
50 parameters of the alignment function depends on the function called.
51 Some examples:
52
53 >>> pairwise2.align.globalxx("ACCGT", "ACG")
54 # Find the best global alignment between the two sequences.
55 # Identical characters are given 1 point. No points are deducted
56 # for mismatches or gaps.
57
58 >>> pairwise2.align.localxx("ACCGT", "ACG")
59 # Same thing as before, but with a local alignment.
60
61 >>> pairwise2.align.globalmx("ACCGT", "ACG", 2, -1)
62 # Do a global alignment. Identical characters are given 2 points,
63 # 1 point is deducted for each non-identical character.
64
65 >>> pairwise2.align.globalms("ACCGT", "ACG", 2, -1, -.5, -.1)
66 # Same as above, except now 0.5 points are deducted when opening a
67 # gap, and 0.1 points are deducted when extending it.
68
69
70 To see a description of the parameters for a function, please look at
71 the docstring for the function.
72
73 >>> print newalign.align.localds.__doc__
74 localds(sequenceA, sequenceB, match_dict, open, extend) -> alignments
75
76 """
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98 MAX_ALIGNMENTS = 1000
99
101 """This class provides functions that do alignments."""
102
104 """This class is callable impersonates an alignment function.
105 The constructor takes the name of the function. This class
106 will decode the name of the function to figure out how to
107 interpret the parameters.
108
109 """
110
111 match2args = {
112 'x' : ([], ''),
113 'm' : (['match', 'mismatch'],
114 """match is the score to given to identical characters. mismatch is
115 the score given to non-identical ones."""),
116 'd' : (['match_dict'],
117 """match_dict is a dictionary where the keys are tuples of pairs of
118 characters and the values are the scores, e.g. ("A", "C") : 2.5."""),
119 'c' : (['match_fn'],
120 """match_fn is a callback function that takes two characters and
121 returns the score between them."""),
122 }
123
124 penalty2args = {
125 'x' : ([], ''),
126 's' : (['open', 'extend'],
127 """open and extend are the gap penalties when a gap is opened and
128 extended. They should be negative."""),
129 'd' : (['openA', 'extendA', 'openB', 'extendB'],
130 """openA and extendA are the gap penalties for sequenceA, and openB
131 and extendB for sequeneB. The penalties should be negative."""),
132 'c' : (['gap_A_fn', 'gap_B_fn'],
133 """gap_A_fn and gap_B_fn are callback functions that takes 1) the
134 index where the gap is opened, and 2) the length of the gap. They
135 should return a gap penalty."""),
136 }
137
139
140
141 if name.startswith("global"):
142 if len(name) != 8:
143 raise AttributeError("function should be globalXX")
144 elif name.startswith("local"):
145 if len(name) != 7:
146 raise AttributeError("function should be localXX")
147 else:
148 raise AttributeError(name)
149 align_type, match_type, penalty_type = \
150 name[:-2], name[-2], name[-1]
151 try:
152 match_args, match_doc = self.match2args[match_type]
153 except KeyError, x:
154 raise AttributeError("unknown match type %r" % match_type)
155 try:
156 penalty_args, penalty_doc = self.penalty2args[penalty_type]
157 except KeyError, x:
158 raise AttributeError("unknown penalty type %r" % penalty_type)
159
160
161 param_names = ['sequenceA', 'sequenceB']
162 param_names.extend(match_args)
163 param_names.extend(penalty_args)
164 self.function_name = name
165 self.align_type = align_type
166 self.param_names = param_names
167
168 self.__name__ = self.function_name
169
170 doc = "%s(%s) -> alignments\n" % (
171 self.__name__, ', '.join(self.param_names))
172 if match_doc:
173 doc += "\n%s\n" % match_doc
174 if penalty_doc:
175 doc += "\n%s\n" % penalty_doc
176 doc += (
177 """\nalignments is a list of tuples (seqA, seqB, score, begin, end).
178 seqA and seqB are strings showing the alignment between the
179 sequences. score is the score of the alignment. begin and end
180 are indexes into seqA and seqB that indicate the where the
181 alignment occurs.
182 """)
183 self.__doc__ = doc
184
185 - def decode(self, *args, **keywds):
186
187
188
189 keywds = keywds.copy()
190 if len(args) != len(self.param_names):
191 raise TypeError("%s takes exactly %d argument (%d given)" \
192 % (self.function_name, len(self.param_names), len(args)))
193 i = 0
194 while i < len(self.param_names):
195 if self.param_names[i] in [
196 'sequenceA', 'sequenceB',
197 'gap_A_fn', 'gap_B_fn', 'match_fn']:
198 keywds[self.param_names[i]] = args[i]
199 i += 1
200 elif self.param_names[i] == 'match':
201 assert self.param_names[i+1] == 'mismatch'
202 match, mismatch = args[i], args[i+1]
203 keywds['match_fn'] = identity_match(match, mismatch)
204 i += 2
205 elif self.param_names[i] == 'match_dict':
206 keywds['match_fn'] = dictionary_match(args[i])
207 i += 1
208 elif self.param_names[i] == 'open':
209 assert self.param_names[i+1] == 'extend'
210 open, extend = args[i], args[i+1]
211 pe = keywds.get('penalize_extend_when_opening', 0)
212 keywds['gap_A_fn'] = affine_penalty(open, extend, pe)
213 keywds['gap_B_fn'] = affine_penalty(open, extend, pe)
214 i += 2
215 elif self.param_names[i] == 'openA':
216 assert self.param_names[i+3] == 'extendB'
217 openA, extendA, openB, extendB = args[i:i+4]
218 pe = keywds.get('penalize_extend_when_opening', 0)
219 keywds['gap_A_fn'] = affine_penalty(openA, extendA, pe)
220 keywds['gap_B_fn'] = affine_penalty(openB, extendB, pe)
221 i += 4
222 else:
223 raise ValueError("unknown parameter %r" \
224 % self.param_names[i])
225
226
227
228 pe = keywds.get('penalize_extend_when_opening', 0)
229 default_params = [
230 ('match_fn', identity_match(1, 0)),
231 ('gap_A_fn', affine_penalty(0, 0, pe)),
232 ('gap_B_fn', affine_penalty(0, 0, pe)),
233 ('penalize_extend_when_opening', 0),
234 ('penalize_end_gaps', self.align_type == 'global'),
235 ('align_globally', self.align_type == 'global'),
236 ('gap_char', '-'),
237 ('force_generic', 0),
238 ('score_only', 0),
239 ('one_alignment_only', 0)
240 ]
241 for name, default in default_params:
242 keywds[name] = keywds.get(name, default)
243 return keywds
244
246 keywds = self.decode(*args, **keywds)
247 return _align(**keywds)
248
250 return self.alignment_function(attr)
251 align = align()
252
253
254 -def _align(sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn,
255 penalize_extend_when_opening, penalize_end_gaps,
256 align_globally, gap_char, force_generic, score_only,
257 one_alignment_only):
258 if not sequenceA or not sequenceB:
259 return []
260
261 if (not force_generic) and isinstance(gap_A_fn, affine_penalty) \
262 and isinstance(gap_B_fn, affine_penalty):
263 open_A, extend_A = gap_A_fn.open, gap_A_fn.extend
264 open_B, extend_B = gap_B_fn.open, gap_B_fn.extend
265 x = _make_score_matrix_fast(
266 sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B,
267 penalize_extend_when_opening, penalize_end_gaps, align_globally,
268 score_only)
269 else:
270 x = _make_score_matrix_generic(
271 sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn,
272 penalize_extend_when_opening, penalize_end_gaps, align_globally,
273 score_only)
274 score_matrix, trace_matrix = x
275
276
277
278
279
280
281 starts = _find_start(
282 score_matrix, sequenceA, sequenceB,
283 gap_A_fn, gap_B_fn, penalize_end_gaps, align_globally)
284
285 best_score = max([x[0] for x in starts])
286
287
288 if score_only:
289 return best_score
290
291 tolerance = 0
292
293
294 i = 0
295 while i < len(starts):
296 score, pos = starts[i]
297 if rint(abs(score-best_score)) > rint(tolerance):
298 del starts[i]
299 else:
300 i += 1
301
302
303 x = _recover_alignments(
304 sequenceA, sequenceB, starts, score_matrix, trace_matrix,
305 align_globally, penalize_end_gaps, gap_char, one_alignment_only)
306 return x
307
308 -def _make_score_matrix_generic(
309 sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn,
310 penalize_extend_when_opening, penalize_end_gaps, align_globally,
311 score_only):
312
313
314
315
316
317
318 lenA, lenB = len(sequenceA), len(sequenceB)
319 score_matrix, trace_matrix = [], []
320 for i in range(lenA):
321 score_matrix.append([None] * lenB)
322 trace_matrix.append([[None]] * lenB)
323
324
325
326
327 for i in range(lenA):
328
329
330
331 score = match_fn(sequenceA[i], sequenceB[0])
332 if penalize_end_gaps:
333 score += gap_B_fn(0, i)
334 score_matrix[i][0] = score
335 for i in range(1, lenB):
336 score = match_fn(sequenceA[0], sequenceB[i])
337 if penalize_end_gaps:
338 score += gap_A_fn(0, i)
339 score_matrix[0][i] = score
340
341
342
343
344
345
346
347
348 for row in range(1, lenA):
349 for col in range(1, lenB):
350
351
352 best_score = score_matrix[row-1][col-1]
353 best_score_rint = rint(best_score)
354 best_indexes = [(row-1, col-1)]
355
356
357
358
359
360
361 for i in range(0, col-1):
362 score = score_matrix[row-1][i] + gap_A_fn(i, col-1-i)
363 score_rint = rint(score)
364 if score_rint == best_score_rint:
365 best_score, best_score_rint = score, score_rint
366 best_indexes.append((row-1, i))
367 elif score_rint > best_score_rint:
368 best_score, best_score_rint = score, score_rint
369 best_indexes = [(row-1, i)]
370
371
372 for i in range(0, row-1):
373 score = score_matrix[i][col-1] + gap_B_fn(i, row-1-i)
374 score_rint = rint(score)
375 if score_rint == best_score_rint:
376 best_score, best_score_rint = score, score_rint
377 best_indexes.append((i, col-1))
378 elif score_rint > best_score_rint:
379 best_score, best_score_rint = score, score_rint
380 best_indexes = [(i, col-1)]
381
382 score_matrix[row][col] = best_score + \
383 match_fn(sequenceA[row], sequenceB[col])
384 if not align_globally and score_matrix[row][col] < 0:
385 score_matrix[row][col] = 0
386 trace_matrix[row][col] = best_indexes
387 return score_matrix, trace_matrix
388
389 -def _make_score_matrix_fast(
390 sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B,
391 penalize_extend_when_opening, penalize_end_gaps,
392 align_globally, score_only):
393 first_A_gap = calc_affine_penalty(1, open_A, extend_A,
394 penalize_extend_when_opening)
395 first_B_gap = calc_affine_penalty(1, open_B, extend_B,
396 penalize_extend_when_opening)
397
398
399
400
401 lenA, lenB = len(sequenceA), len(sequenceB)
402 score_matrix, trace_matrix = [], []
403 for i in range(lenA):
404 score_matrix.append([None] * lenB)
405 trace_matrix.append([[None]] * lenB)
406
407
408
409
410 for i in range(lenA):
411
412
413
414 score = match_fn(sequenceA[i], sequenceB[0])
415 if penalize_end_gaps:
416 score += calc_affine_penalty(
417 i, open_B, extend_B, penalize_extend_when_opening)
418 score_matrix[i][0] = score
419 for i in range(1, lenB):
420 score = match_fn(sequenceA[0], sequenceB[i])
421 if penalize_end_gaps:
422 score += calc_affine_penalty(
423 i, open_A, extend_A, penalize_extend_when_opening)
424 score_matrix[0][i] = score
425
426
427
428
429
430
431
432
433
434
435
436
437
438 row_cache_score, row_cache_index = [None]*(lenA-1), [None]*(lenA-1)
439
440 col_cache_score, col_cache_index = [None]*(lenB-1), [None]*(lenB-1)
441
442 for i in range(lenA-1):
443
444
445 row_cache_score[i] = score_matrix[i][0] + first_A_gap
446 row_cache_index[i] = [(i, 0)]
447 for i in range(lenB-1):
448 col_cache_score[i] = score_matrix[0][i] + first_B_gap
449 col_cache_index[i] = [(0, i)]
450
451
452 for row in range(1, lenA):
453 for col in range(1, lenB):
454
455
456 nogap_score = score_matrix[row-1][col-1]
457
458
459
460 if col > 1:
461 row_score = row_cache_score[row-1]
462 else:
463 row_score = nogap_score - 1
464
465
466 if row > 1:
467 col_score = col_cache_score[col-1]
468 else:
469 col_score = nogap_score - 1
470
471 best_score = max(nogap_score, row_score, col_score)
472 best_score_rint = rint(best_score)
473 best_index = []
474 if best_score_rint == rint(nogap_score):
475 best_index.append((row-1, col-1))
476 if best_score_rint == rint(row_score):
477 best_index.extend(row_cache_index[row-1])
478 if best_score_rint == rint(col_score):
479 best_index.extend(col_cache_index[col-1])
480
481
482 score = best_score + match_fn(sequenceA[row], sequenceB[col])
483 if not align_globally and score < 0:
484 score_matrix[row][col] = 0
485 else:
486 score_matrix[row][col] = score
487 trace_matrix[row][col] = best_index
488
489
490
491
492
493
494 open_score = score_matrix[row-1][col-1] + first_B_gap
495 extend_score = col_cache_score[col-1] + extend_B
496 open_score_rint, extend_score_rint = \
497 rint(open_score), rint(extend_score)
498 if open_score_rint > extend_score_rint:
499 col_cache_score[col-1] = open_score
500 col_cache_index[col-1] = [(row-1, col-1)]
501 elif extend_score_rint > open_score_rint:
502 col_cache_score[col-1] = extend_score
503 else:
504 col_cache_score[col-1] = open_score
505 if (row-1, col-1) not in col_cache_index[col-1]:
506 col_cache_index[col-1] = col_cache_index[col-1] + \
507 [(row-1, col-1)]
508
509
510 open_score = score_matrix[row-1][col-1] + first_A_gap
511 extend_score = row_cache_score[row-1] + extend_A
512 open_score_rint, extend_score_rint = \
513 rint(open_score), rint(extend_score)
514 if open_score_rint > extend_score_rint:
515 row_cache_score[row-1] = open_score
516 row_cache_index[row-1] = [(row-1, col-1)]
517 elif extend_score_rint > open_score_rint:
518 row_cache_score[row-1] = extend_score
519 else:
520 row_cache_score[row-1] = open_score
521 if (row-1, col-1) not in row_cache_index[row-1]:
522 row_cache_index[row-1] = row_cache_index[row-1] + \
523 [(row-1, col-1)]
524
525 return score_matrix, trace_matrix
526
527 -def _recover_alignments(sequenceA, sequenceB, starts,
528 score_matrix, trace_matrix, align_globally,
529 penalize_end_gaps, gap_char, one_alignment_only):
530
531
532
533 lenA, lenB = len(sequenceA), len(sequenceB)
534 tracebacks = []
535 in_process = []
536
537
538
539
540
541
542
543
544
545
546
547 for score, (row, col) in starts:
548 if align_globally:
549 begin, end = None, None
550 else:
551 begin, end = None, -max(lenA-row, lenB-col)+1
552 if not end:
553 end = None
554
555
556
557 in_process.append(
558 (sequenceA[0:0], sequenceB[0:0], score, begin, end,
559 (lenA, lenB), (row, col)))
560 if one_alignment_only:
561 break
562 while in_process and len(tracebacks) < MAX_ALIGNMENTS:
563 seqA, seqB, score, begin, end, prev_pos, next_pos = in_process.pop()
564 prevA, prevB = prev_pos
565 if next_pos is None:
566 prevlen = len(seqA)
567
568 seqA = sequenceA[:prevA] + seqA
569 seqB = sequenceB[:prevB] + seqB
570
571 seqA, seqB = _lpad_until_equal(seqA, seqB, gap_char)
572
573
574 if begin is None:
575 if align_globally:
576 begin = 0
577 else:
578 begin = len(seqA) - prevlen
579 tracebacks.append((seqA, seqB, score, begin, end))
580 else:
581 nextA, nextB = next_pos
582 nseqA, nseqB = prevA-nextA, prevB-nextB
583 maxseq = max(nseqA, nseqB)
584 ngapA, ngapB = maxseq-nseqA, maxseq-nseqB
585 seqA = sequenceA[nextA:nextA+nseqA] + gap_char*ngapA + seqA
586 seqB = sequenceB[nextB:nextB+nseqB] + gap_char*ngapB + seqB
587 prev_pos = next_pos
588
589 if not align_globally and score_matrix[nextA][nextB] <= 0:
590 begin = max(prevA, prevB)
591 in_process.append(
592 (seqA, seqB, score, begin, end, prev_pos, None))
593 else:
594 for next_pos in trace_matrix[nextA][nextB]:
595 in_process.append(
596 (seqA, seqB, score, begin, end, prev_pos, next_pos))
597 if one_alignment_only:
598 break
599
600 return _clean_alignments(tracebacks)
601
602 -def _find_start(score_matrix, sequenceA, sequenceB, gap_A_fn, gap_B_fn,
603 penalize_end_gaps, align_globally):
604
605
606 if align_globally:
607 if penalize_end_gaps:
608 starts = _find_global_start(
609 sequenceA, sequenceB, score_matrix, gap_A_fn, gap_B_fn, 1)
610 else:
611 starts = _find_global_start(
612 sequenceA, sequenceB, score_matrix, None, None, 0)
613 else:
614 starts = _find_local_start(score_matrix)
615 return starts
616
617 -def _find_global_start(sequenceA, sequenceB,
618 score_matrix, gap_A_fn, gap_B_fn, penalize_end_gaps):
637
639
640 positions = []
641 nrows, ncols = len(score_matrix), len(score_matrix[0])
642 for row in range(nrows):
643 for col in range(ncols):
644 score = score_matrix[row][col]
645 positions.append((score, (row, col)))
646 return positions
647
649
650
651
652 unique_alignments = []
653 for align in alignments :
654 if align not in unique_alignments :
655 unique_alignments.append(align)
656 i = 0
657 while i < len(unique_alignments):
658 seqA, seqB, score, begin, end = unique_alignments[i]
659
660 if end is None:
661 end = len(seqA)
662 elif end < 0:
663 end = end + len(seqA)
664
665 if begin >= end:
666 del unique_alignments[i]
667 continue
668 unique_alignments[i] = seqA, seqB, score, begin, end
669 i += 1
670 return unique_alignments
671
673
674 ls1, ls2 = len(s1), len(s2)
675 if ls1 < ls2:
676 s1 = _pad(s1, char, ls2-ls1)
677 elif ls2 < ls1:
678 s2 = _pad(s2, char, ls1-ls2)
679 return s1, s2
680
682
683
684 ls1, ls2 = len(s1), len(s2)
685 if ls1 < ls2:
686 s1 = _lpad(s1, char, ls2-ls1)
687 elif ls2 < ls1:
688 s2 = _lpad(s2, char, ls1-ls2)
689 return s1, s2
690
691 -def _pad(s, char, n):
692
693 return s + char*n
694
696
697 return char*n + s
698
699 _PRECISION = 1000
701 return int(x * precision + 0.5)
702
704 """identity_match([match][, mismatch]) -> match_fn
705
706 Create a match function for use in an alignment. match and
707 mismatch are the scores to give when two residues are equal or
708 unequal. By default, match is 1 and mismatch is 0.
709
710 """
711 - def __init__(self, match=1, mismatch=0):
712 self.match = match
713 self.mismatch = mismatch
715 if charA == charB:
716 return self.match
717 return self.mismatch
718
720 """dictionary_match(score_dict[, symmetric]) -> match_fn
721
722 Create a match function for use in an alignment. score_dict is a
723 dictionary where the keys are tuples (residue 1, residue 2) and
724 the values are the match scores between those residues. symmetric
725 is a flag that indicates whether the scores are symmetric. If
726 true, then if (res 1, res 2) doesn't exist, I will use the score
727 at (res 2, res 1).
728
729 """
730 - def __init__(self, score_dict, symmetric=1):
731 self.score_dict = score_dict
732 self.symmetric = symmetric
734 if self.symmetric and (charA, charB) not in self.score_dict:
735
736
737 charB, charA = charA, charB
738 return self.score_dict[(charA, charB)]
739
741 """affine_penalty(open, extend[, penalize_extend_when_opening]) -> gap_fn
742
743 Create a gap function for use in an alignment.
744
745 """
746 - def __init__(self, open, extend, penalize_extend_when_opening=0):
747 if open > 0 or extend > 0:
748 raise ValueError("Gap penalties should be non-positive.")
749 self.open, self.extend = open, extend
750 self.penalize_extend_when_opening = penalize_extend_when_opening
754
756 if length <= 0:
757 return 0
758 penalty = open + extend * length
759 if not penalize_extend_when_opening:
760 penalty -= extend
761 return penalty
762
764 """print_matrix(matrix)
765
766 Print out a matrix. For debugging purposes.
767
768 """
769
770 matrixT = [[] for x in range(len(matrix[0]))]
771 for i in range(len(matrix)):
772 for j in range(len(matrix[i])):
773 matrixT[j].append(len(str(matrix[i][j])))
774 ndigits = map(max, matrixT)
775 for i in range(len(matrix)):
776
777 print " ".join("%*s " % (ndigits[j], matrix[i][j]) \
778 for j in range(len(matrix[i])))
779
792
793
794
795
796 try:
797 from cpairwise2 import rint, _make_score_matrix_fast
798 except ImportError:
799 pass
800