1
2
3
4
5 """
6 Bio.AlignIO support for the "stockholm" format (used in the PFAM database).
7
8 You are expected to use this module via the Bio.AlignIO functions (or the
9 Bio.SeqIO functions if you want to work directly with the gapped sequences).
10
11 For example, consider a Stockholm alignment file containing the following::
12
13 # STOCKHOLM 1.0
14 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>..
15 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU
16 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..--
17 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU
18 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----
19
20 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>...............
21 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
22 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>---------------
23 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
24 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>---------------
25 //
26
27 This is a single multiple sequence alignment, so you would probably load this
28 using the Bio.AlignIO.read() function:
29
30 >>> from Bio import AlignIO
31 >>> handle = open("Stockholm/simple.sth", "rU")
32 >>> align = AlignIO.read(handle, "stockholm")
33 >>> handle.close()
34 >>> print align
35 SingleLetterAlphabet() alignment with 2 rows and 104 columns
36 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-G...UGU AP001509.1
37 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-C...GAU AE007476.1
38 >>> for record in align:
39 ... print record.id, len(record)
40 AP001509.1 104
41 AE007476.1 104
42
43 This example file is clearly using RNA, so you might want the alignment object
44 (and the SeqRecord objects it holds) to reflect this, rather than simple using
45 the default single letter alphabet as shown above. You can do this with an
46 optional argument to the Bio.AlignIO.read() function:
47
48 >>> from Bio import AlignIO
49 >>> from Bio.Alphabet import generic_rna
50 >>> handle = open("Stockholm/simple.sth", "rU")
51 >>> align = AlignIO.read(handle, "stockholm", alphabet=generic_rna)
52 >>> handle.close()
53 >>> print align
54 RNAAlphabet() alignment with 2 rows and 104 columns
55 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-G...UGU AP001509.1
56 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-C...GAU AE007476.1
57
58 In addition to the sequences themselves, this example alignment also includes
59 some GR lines for the secondary structure of the sequences. These are strings,
60 with one character for each letter in the associated sequence:
61
62 >>> for record in align:
63 ... print record.id
64 ... print record.seq
65 ... print record.letter_annotations['secondary_structure']
66 AP001509.1
67 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
68 -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
69 AE007476.1
70 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
71 -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
72
73 Any general annotation for each row is recorded in the SeqRecord's annotations
74 dictionary. You can output this alignment in many different file formats using
75 Bio.AlignIO.write(), or the MultipleSeqAlignment object's format method:
76
77 >>> print align.format("fasta")
78 >AP001509.1
79 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-A
80 GGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
81 >AE007476.1
82 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAA
83 GGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
84 <BLANKLINE>
85
86 Most output formats won't be able to hold the annotation possible in a Stockholm file:
87
88 >>> print align.format("stockholm")
89 # STOCKHOLM 1.0
90 #=GF SQ 2
91 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
92 #=GS AP001509.1 AC AP001509.1
93 #=GS AP001509.1 DE AP001509.1
94 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
95 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
96 #=GS AE007476.1 AC AE007476.1
97 #=GS AE007476.1 DE AE007476.1
98 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
99 //
100 <BLANKLINE>
101
102 Note that when writing Stockholm files, Biopython does not break long sequences up and
103 interleave them (as in the input file shown above). The standard allows this simpler
104 layout, and it is more likely to be understood by other tools.
105
106 Finally, as an aside, it can sometimes be useful to use Bio.SeqIO.parse() to iterate over
107 the two rows as SeqRecord objects - rather than working with Alignnment objects.
108 Again, if you want to you can specify this is RNA:
109
110 >>> from Bio import SeqIO
111 >>> from Bio.Alphabet import generic_rna
112 >>> handle = open("Stockholm/simple.sth", "rU")
113 >>> for record in SeqIO.parse(handle, "stockholm", alphabet=generic_rna):
114 ... print record.id
115 ... print record.seq
116 ... print record.letter_annotations['secondary_structure']
117 AP001509.1
118 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
119 -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
120 AE007476.1
121 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
122 -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
123 >>> handle.close()
124
125 Remember that if you slice a SeqRecord, the per-letter-annotions like the
126 secondary structure string here, are also sliced:
127
128 >>> sub_record = record[10:20]
129 >>> print sub_record.seq
130 AUCGUUUUAC
131 >>> print sub_record.letter_annotations['secondary_structure']
132 -------<<<
133 """
134 __docformat__ = "epytext en"
135 from Bio.Seq import Seq
136 from Bio.SeqRecord import SeqRecord
137 from Bio.Align import MultipleSeqAlignment
138 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
139
141 """Stockholm/PFAM alignment writer."""
142
143
144
145 pfam_gr_mapping = {"secondary_structure" : "SS",
146 "surface_accessibility" : "SA",
147 "transmembrane" : "TM",
148 "posterior_probability" : "PP",
149 "ligand_binding" : "LI",
150 "active_site" : "AS",
151 "intron" : "IN"}
152
153 pfam_gs_mapping = {"organism" : "OS",
154 "organism_classification" : "OC",
155 "look" : "LO"}
156
158 """Use this to write (another) single alignment to an open file.
159
160 Note that sequences and their annotation are recorded
161 together (rather than having a block of annotation followed
162 by a block of aligned sequences).
163 """
164 count = len(alignment)
165
166 self._length_of_sequences = alignment.get_alignment_length()
167 self._ids_written = []
168
169
170
171
172 if count == 0:
173 raise ValueError("Must have at least one sequence")
174 if self._length_of_sequences == 0:
175 raise ValueError("Non-empty sequences are required")
176
177 self.handle.write("# STOCKHOLM 1.0\n")
178 self.handle.write("#=GF SQ %i\n" % count)
179 for record in alignment:
180 self._write_record(record)
181 self.handle.write("//\n")
182
184 """Write a single SeqRecord to the file"""
185 if self._length_of_sequences != len(record.seq):
186 raise ValueError("Sequences must all be the same length")
187
188
189 seq_name = record.id
190 if record.name is not None:
191 if "accession" in record.annotations:
192 if record.id == record.annotations["accession"]:
193 seq_name = record.name
194
195
196 seq_name = seq_name.replace(" ","_")
197
198 if "start" in record.annotations \
199 and "end" in record.annotations:
200 suffix = "/%s-%s" % (str(record.annotations["start"]),
201 str(record.annotations["end"]))
202 if seq_name[-len(suffix):] != suffix:
203 seq_name = "%s/%s-%s" % (seq_name,
204 str(record.annotations["start"]),
205 str(record.annotations["end"]))
206
207 if seq_name in self._ids_written:
208 raise ValueError("Duplicate record identifier: %s" % seq_name)
209 self._ids_written.append(seq_name)
210 self.handle.write("%s %s\n" % (seq_name, record.seq.tostring()))
211
212
213
214
215
216
217
218
219
220
221
222
223
224 if "accession" in record.annotations:
225 self.handle.write("#=GS %s AC %s\n" \
226 % (seq_name, self.clean(record.annotations["accession"])))
227 elif record.id:
228 self.handle.write("#=GS %s AC %s\n" \
229 % (seq_name, self.clean(record.id)))
230
231
232 if record.description:
233 self.handle.write("#=GS %s DE %s\n" \
234 % (seq_name, self.clean(record.description)))
235
236
237 for xref in record.dbxrefs:
238 self.handle.write("#=GS %s DR %s\n" \
239 % (seq_name, self.clean(xref)))
240
241
242 for key, value in record.annotations.iteritems():
243 if key in self.pfam_gs_mapping:
244 data = self.clean(str(value))
245 if data:
246 self.handle.write("#=GS %s %s %s\n" \
247 % (seq_name,
248 self.clean(self.pfam_gs_mapping[key]),
249 data))
250 else:
251
252
253 pass
254
255
256 for key, value in record.letter_annotations.iteritems():
257 if key in self.pfam_gr_mapping and len(str(value))==len(record.seq):
258 data = self.clean(str(value))
259 if data:
260 self.handle.write("#=GR %s %s %s\n" \
261 % (seq_name,
262 self.clean(self.pfam_gr_mapping[key]),
263 data))
264 else:
265
266
267 pass
268
270 """Loads a Stockholm file from PFAM into MultipleSeqAlignment objects.
271
272 The file may contain multiple concatenated alignments, which are loaded
273 and returned incrementally.
274
275 This parser will detect if the Stockholm file follows the PFAM
276 conventions for sequence specific meta-data (lines starting #=GS
277 and #=GR) and populates the SeqRecord fields accordingly.
278
279 Any annotation which does not follow the PFAM conventions is currently
280 ignored.
281
282 If an accession is provided for an entry in the meta data, IT WILL NOT
283 be used as the record.id (it will be recorded in the record's
284 annotations). This is because some files have (sub) sequences from
285 different parts of the same accession (differentiated by different
286 start-end positions).
287
288 Wrap-around alignments are not supported - each sequences must be on
289 a single line. However, interlaced sequences should work.
290
291 For more information on the file format, please see:
292 http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format
293 http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html
294
295 For consistency with BioPerl and EMBOSS we call this the "stockholm"
296 format.
297 """
298
299
300
301 pfam_gr_mapping = {"SS" : "secondary_structure",
302 "SA" : "surface_accessibility",
303 "TM" : "transmembrane",
304 "PP" : "posterior_probability",
305 "LI" : "ligand_binding",
306 "AS" : "active_site",
307 "IN" : "intron"}
308
309 pfam_gs_mapping = {"OS" : "organism",
310 "OC" : "organism_classification",
311 "LO" : "look"}
312
314 try:
315 line = self._header
316 del self._header
317 except AttributeError:
318 line = self.handle.readline()
319 if not line:
320
321 raise StopIteration
322 if not line.strip() == '# STOCKHOLM 1.0':
323 raise ValueError("Did not find STOCKHOLM header")
324
325
326
327
328
329
330
331
332 seqs = {}
333 ids = []
334 gs = {}
335 gr = {}
336 gf = {}
337 passed_end_alignment = False
338 while 1:
339 line = self.handle.readline()
340 if not line: break
341 line = line.strip()
342 if line == '# STOCKHOLM 1.0':
343 self._header = line
344 break
345 elif line == "//":
346
347
348 passed_end_alignment = True
349 elif line == "":
350
351 pass
352 elif line[0] != "#":
353
354
355 assert not passed_end_alignment
356 parts = [x.strip() for x in line.split(" ",1)]
357 if len(parts) != 2:
358
359 raise ValueError("Could not split line into identifier " \
360 + "and sequence:\n" + line)
361 id, seq = parts
362 if id not in ids:
363 ids.append(id)
364 seqs.setdefault(id, '')
365 seqs[id] += seq.replace(".","-")
366 elif len(line) >= 5:
367
368 if line[:5] == "#=GF ":
369
370
371 feature, text = line[5:].strip().split(None,1)
372
373
374 if feature not in gf:
375 gf[feature] = [text]
376 else:
377 gf[feature].append(text)
378 elif line[:5] == '#=GC ':
379
380
381 pass
382 elif line[:5] == '#=GS ':
383
384
385 id, feature, text = line[5:].strip().split(None,2)
386
387
388 if id not in gs:
389 gs[id] = {}
390 if feature not in gs[id]:
391 gs[id][feature] = [text]
392 else:
393 gs[id][feature].append(text)
394 elif line[:5] == "#=GR ":
395
396
397 id, feature, text = line[5:].strip().split(None,2)
398
399
400 if id not in gr:
401 gr[id] = {}
402 if feature not in gr[id]:
403 gr[id][feature] = ""
404 gr[id][feature] += text.strip()
405
406
407
408
409
410
411 assert len(seqs) <= len(ids)
412
413
414
415 self.ids = ids
416 self.sequences = seqs
417 self.seq_annotation = gs
418 self.seq_col_annotation = gr
419
420 if ids and seqs:
421
422 if self.records_per_alignment is not None \
423 and self.records_per_alignment != len(ids):
424 raise ValueError("Found %i records in this alignment, told to expect %i" \
425 % (len(ids), self.records_per_alignment))
426
427 alignment_length = len(seqs.values()[0])
428 records = []
429 for id in ids:
430 seq = seqs[id]
431 if alignment_length != len(seq):
432 raise ValueError("Sequences have different lengths, or repeated identifier")
433 name, start, end = self._identifier_split(id)
434 record = SeqRecord(Seq(seq, self.alphabet),
435 id = id, name = name, description = id,
436 annotations = {"accession":name})
437
438
439 record.annotations["accession"]=name
440
441 if start is not None:
442 record.annotations["start"] = start
443 if end is not None:
444 record.annotations["end"] = end
445
446 self._populate_meta_data(id, record)
447 records.append(record)
448 alignment = MultipleSeqAlignment(records, self.alphabet)
449
450
451
452 alignment._annotations = gr
453
454 return alignment
455 else:
456 raise StopIteration
457
458
460 """Returns (name,start,end) string tuple from an identier."""
461 if identifier.find("/")!=-1:
462 start_end = identifier.split("/",1)[1]
463 if start_end.count("-")==1:
464 start, end = map(int, start_end.split("-"))
465 name = identifier.split("/",1)[0]
466 return (name, start, end)
467 return (identifier, None, None)
468
501
533
535 """Run the Bio.SeqIO module's doctests.
536
537 This will try and locate the unit tests directory, and run the doctests
538 from there in order that the relative paths used in the examples work.
539 """
540 import doctest
541 import os
542 if os.path.isdir(os.path.join("..","..","Tests")):
543 print "Runing doctests..."
544 cur_dir = os.path.abspath(os.curdir)
545 os.chdir(os.path.join("..","..","Tests"))
546 assert os.path.isfile("Stockholm/simple.sth")
547 doctest.testmod()
548 os.chdir(cur_dir)
549 del cur_dir
550 print "Done"
551
552 if __name__ == "__main__":
553 _test()
554