1
2
3
4
5
6
7
8 """
9 Contains classes to deal with generic sequence alignment stuff not
10 specific to a particular program or format.
11
12 classes:
13 o Alignment
14 """
15
16
17 from Bio.Seq import Seq
18 from Bio.SeqRecord import SeqRecord
19 from Bio import Alphabet
20
22 """Represent a set of alignments (OBSOLETE?).
23
24 This is a base class to represent alignments, which can be subclassed
25 to deal with an alignment in a specific format.
26
27 With the introduction of the MultipleSeqAlignment class in Bio.Align,
28 this base class is effectively obsolete and will likely be deprecated and
29 later removed in future releases of Biopython.
30 """
32 """Initialize a new Alignment object.
33
34 Arguments:
35 o alphabet - The alphabet to use for the sequence objects that are
36 created. This alphabet must be a gapped type.
37
38 e.g.
39 >>> from Bio.Alphabet import IUPAC, Gapped
40 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
41 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
42 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
43 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
44 >>> print align
45 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
46 ACTGCTAGCTAG Alpha
47 ACT-CTAGCTAG Beta
48 ACTGCTAGATAG Gamma
49 """
50 import warnings
51 warnings.warn("With the introduction of the MultipleSeqAlignment class in Bio.Align, this base class is effectively obsolete and will likely be deprecated and later removed in future releases of Biopython.", PendingDeprecationWarning)
52 if not (isinstance(alphabet, Alphabet.Alphabet) \
53 or isinstance(alphabet, Alphabet.AlphabetEncoder)):
54 raise ValueError("Invalid alphabet argument")
55 self._alphabet = alphabet
56
57 self._records = []
58
60 """Returns a truncated string representation of a SeqRecord (PRIVATE).
61
62 This is a PRIVATE function used by the __str__ method.
63 """
64 if len(record.seq) <= 50:
65 return "%s %s" % (record.seq, record.id)
66 else:
67 return "%s...%s %s" \
68 % (record.seq[:44], record.seq[-3:], record.id)
69
71 """Returns a multi-line string summary of the alignment.
72
73 This output is intended to be readable, but large alignments are
74 shown truncated. A maximum of 20 rows (sequences) and 50 columns
75 are shown, with the record identifiers. This should fit nicely on a
76 single screen. e.g.
77
78 >>> from Bio.Alphabet import IUPAC, Gapped
79 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
80 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
81 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
82 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
83 >>> print align
84 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
85 ACTGCTAGCTAG Alpha
86 ACT-CTAGCTAG Beta
87 ACTGCTAGATAG Gamma
88
89 See also the alignment's format method.
90 """
91 rows = len(self._records)
92 lines = ["%s alignment with %i rows and %i columns" \
93 % (str(self._alphabet), rows, self.get_alignment_length())]
94 if rows <= 20:
95 lines.extend([self._str_line(rec) for rec in self._records])
96 else:
97 lines.extend([self._str_line(rec) for rec in self._records[:18]])
98 lines.append("...")
99 lines.append(self._str_line(self._records[-1]))
100 return "\n".join(lines)
101
103 """Returns a representation of the object for debugging.
104
105 The representation cannot be used with eval() to recreate the object,
106 which is usually possible with simple python ojects. For example:
107
108 <Bio.Align.Generic.Alignment instance (2 records of length 14,
109 SingleLetterAlphabet()) at a3c184c>
110
111 The hex string is the memory address of the object, see help(id).
112 This provides a simple way to visually distinguish alignments of
113 the same size.
114 """
115
116
117 return "<%s instance (%i records of length %i, %s) at %x>" % \
118 (self.__class__, len(self._records),
119 self.get_alignment_length(), repr(self._alphabet), id(self))
120
121
122
123
124
159
160
177
179 """Return all of the sequences involved in the alignment (DEPRECATED).
180
181 The return value is a list of SeqRecord objects.
182
183 This method is deprecated, as the Alignment object itself now offers
184 much of the functionality of a list of SeqRecord objects (e.g.
185 iteration or slicing to create a sub-alignment). Instead use the
186 Python builtin function list, i.e. my_list = list(my_align)
187 """
188 import warnings
189 import Bio
190 warnings.warn("This method is deprecated, since the alignment object"
191 "now acts more like a list. Instead of calling "
192 "align.get_all_seqs() you can use list(align)",
193 Bio.BiopythonDeprecationWarning)
194 return self._records
195
197 """Iterate over alignment rows as SeqRecord objects.
198
199 e.g.
200 >>> from Bio.Alphabet import IUPAC, Gapped
201 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
202 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
203 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
204 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
205 >>> for record in align:
206 ... print record.id
207 ... print record.seq
208 Alpha
209 ACTGCTAGCTAG
210 Beta
211 ACT-CTAGCTAG
212 Gamma
213 ACTGCTAGATAG
214 """
215 return iter(self._records)
216
218 """Retrieve a sequence by row number (OBSOLETE).
219
220 Returns:
221 o A Seq object for the requested sequence.
222
223 Raises:
224 o IndexError - If the specified number is out of range.
225
226 NOTE: This is a legacy method. In new code where you need to access
227 the rows of the alignment (i.e. the sequences) consider iterating
228 over them or accessing them as SeqRecord objects. e.g.
229
230 for record in alignment:
231 print record.id
232 print record.seq
233 first_record = alignment[0]
234 last_record = alignment[-1]
235 """
236 import warnings
237 warnings.warn("This is a legacy method. In new code where you need to access the rows of the alignment (i.e. the sequences) consider iterating over them or accessing them as SeqRecord objects.", PendingDeprecationWarning)
238 return self._records[number].seq
239
241 """Returns the number of sequences in the alignment.
242
243 Use len(alignment) to get the number of sequences (i.e. the number of
244 rows), and alignment.get_alignment_length() to get the length of the
245 longest sequence (i.e. the number of columns).
246
247 This is easy to remember if you think of the alignment as being like a
248 list of SeqRecord objects.
249 """
250 return len(self._records)
251
253 """Return the maximum length of the alignment.
254
255 All objects in the alignment should (hopefully) have the same
256 length. This function will go through and find this length
257 by finding the maximum length of sequences in the alignment.
258
259 >>> from Bio.Alphabet import IUPAC, Gapped
260 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
261 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
262 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
263 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
264 >>> align.get_alignment_length()
265 12
266
267 If you want to know the number of sequences in the alignment,
268 use len(align) instead:
269
270 >>> len(align)
271 3
272
273 """
274 max_length = 0
275
276 for record in self._records:
277 if len(record.seq) > max_length:
278 max_length = len(record.seq)
279
280 return max_length
281
282 - def add_sequence(self, descriptor, sequence, start = None, end = None,
283 weight = 1.0):
284 """Add a sequence to the alignment.
285
286 This doesn't do any kind of alignment, it just adds in the sequence
287 object, which is assumed to be prealigned with the existing
288 sequences.
289
290 Arguments:
291 o descriptor - The descriptive id of the sequence being added.
292 This will be used as the resulting SeqRecord's
293 .id property (and, for historical compatibility,
294 also the .description property)
295 o sequence - A string with sequence info.
296 o start - You can explicitly set the start point of the sequence.
297 This is useful (at least) for BLAST alignments, which can just
298 be partial alignments of sequences.
299 o end - Specify the end of the sequence, which is important
300 for the same reason as the start.
301 o weight - The weight to place on the sequence in the alignment.
302 By default, all sequences have the same weight. (0.0 => no weight,
303 1.0 => highest weight)
304 """
305 new_seq = Seq(sequence, self._alphabet)
306
307
308
309
310
311
312 new_record = SeqRecord(new_seq,
313 id = descriptor,
314 description = descriptor)
315
316
317
318
319
320
321
322
323 if start:
324 new_record.annotations['start'] = start
325 if end:
326 new_record.annotations['end'] = end
327
328
329 new_record.annotations['weight'] = weight
330
331 self._records.append(new_record)
332
334 """Returns a string containing a given column.
335
336 e.g.
337 >>> from Bio.Alphabet import IUPAC, Gapped
338 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
339 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
340 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
341 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
342 >>> align.get_column(0)
343 'AAA'
344 >>> align.get_column(3)
345 'G-G'
346 """
347
348 col_str = ''
349 assert col >= 0 and col <= self.get_alignment_length()
350 for rec in self._records:
351 col_str += rec.seq[col]
352 return col_str
353
355 """Access part of the alignment.
356
357 We'll use the following example alignment here for illustration:
358
359 >>> from Bio.Alphabet import IUPAC, Gapped
360 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
361 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
362 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
363 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
364 >>> align.add_sequence("Delta", "ACTGCTTGCTAG")
365 >>> align.add_sequence("Epsilon","ACTGCTTGATAG")
366
367 You can access a row of the alignment as a SeqRecord using an integer
368 index (think of the alignment as a list of SeqRecord objects here):
369
370 >>> first_record = align[0]
371 >>> print first_record.id, first_record.seq
372 Alpha ACTGCTAGCTAG
373 >>> last_record = align[-1]
374 >>> print last_record.id, last_record.seq
375 Epsilon ACTGCTTGATAG
376
377 You can also access use python's slice notation to create a sub-alignment
378 containing only some of the SeqRecord objects:
379
380 >>> sub_alignment = align[2:5]
381 >>> print sub_alignment
382 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
383 ACTGCTAGATAG Gamma
384 ACTGCTTGCTAG Delta
385 ACTGCTTGATAG Epsilon
386
387 This includes support for a step, i.e. align[start:end:step], which
388 can be used to select every second sequence:
389
390 >>> sub_alignment = align[::2]
391 >>> print sub_alignment
392 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
393 ACTGCTAGCTAG Alpha
394 ACTGCTAGATAG Gamma
395 ACTGCTTGATAG Epsilon
396
397 Or to get a copy of the alignment with the rows in reverse order:
398
399 >>> rev_alignment = align[::-1]
400 >>> print rev_alignment
401 Gapped(IUPACUnambiguousDNA(), '-') alignment with 5 rows and 12 columns
402 ACTGCTTGATAG Epsilon
403 ACTGCTTGCTAG Delta
404 ACTGCTAGATAG Gamma
405 ACT-CTAGCTAG Beta
406 ACTGCTAGCTAG Alpha
407
408 Right now, these are the ONLY indexing operations supported. The use of
409 a second column based index is under discussion for a future update.
410 """
411 if isinstance(index, int):
412
413
414 return self._records[index]
415 elif isinstance(index, slice):
416
417
418
419
420 sub_align = Alignment(self._alphabet)
421 sub_align._records = self._records[index]
422 return sub_align
423 elif len(index)==2:
424 raise TypeError("Row and Column indexing is not currently supported,"\
425 +"but may be in future.")
426 else:
427 raise TypeError("Invalid index type.")
428
430 """Run the Bio.Align.Generic module's doctests."""
431 print "Running doctests..."
432 import doctest
433 doctest.testmod()
434 print "Done"
435
436 if __name__ == "__main__":
437 _test()
438