1
2
3
4
5 """
6 AlignIO support for the "phylip" format used in Joe Felsenstein's PHYLIP tools.
7
8 You are expected to use this module via the Bio.AlignIO functions (or the
9 Bio.SeqIO functions if you want to work directly with the gapped sequences).
10
11 Note
12 ====
13 In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003)
14 a dot/period (".") in a sequence is interpreted as meaning the same
15 character as in the first sequence. The PHYLIP 3.6 documentation says:
16
17 "a period was also previously allowed but it is no longer allowed,
18 because it sometimes is used in different senses in other programs"
19
20 At the time of writing, we do nothing special with a dot/period.
21 """
22
23 from Bio.Seq import Seq
24 from Bio.SeqRecord import SeqRecord
25 from Bio.Alphabet import single_letter_alphabet
26 from Bio.Align import MultipleSeqAlignment
27 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
28
30 """Phylip alignment writer."""
32 """Use this to write (another) single alignment to an open file.
33
34 This code will write interlaced alignments (when the sequences are
35 longer than 50 characters).
36
37 Note that record identifiers are strictly truncated at 10 characters.
38
39 For more information on the file format, please see:
40 http://evolution.genetics.washington.edu/phylip/doc/sequence.html
41 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
42 """
43 truncate=10
44 handle = self.handle
45
46 if len(alignment)==0:
47 raise ValueError("Must have at least one sequence")
48 length_of_seqs = alignment.get_alignment_length()
49 for record in alignment:
50 if length_of_seqs != len(record.seq):
51 raise ValueError("Sequences must all be the same length")
52 if length_of_seqs <= 0:
53 raise ValueError("Non-empty sequences are required")
54
55 if len(alignment) > len(set([r.id[:truncate] for r in alignment])):
56 raise ValueError("Repeated identifier, possibly due to truncation")
57
58
59
60
61
62
63
64 handle.write(" %i %s\n" % (len(alignment), length_of_seqs))
65 block=0
66 while True:
67 for record in alignment:
68 if block==0:
69
70 """
71 Quoting the PHYLIP version 3.6 documentation:
72
73 The name should be ten characters in length, filled out to
74 the full ten characters by blanks if shorter. Any printable
75 ASCII/ISO character is allowed in the name, except for
76 parentheses ("(" and ")"), square brackets ("[" and "]"),
77 colon (":"), semicolon (";") and comma (","). If you forget
78 to extend the names to ten characters in length by blanks,
79 the program [i.e. PHYLIP] will get out of synchronization
80 with the contents of the data file, and an error message will
81 result.
82
83 Note that Tab characters count as only one character in the
84 species names. Their inclusion can cause trouble.
85 """
86 name = record.id.strip()
87
88
89 for char in "[](),":
90 name = name.replace(char,"")
91 for char in ":;":
92 name = name.replace(char,"|")
93
94
95 handle.write(name[:truncate].ljust(truncate))
96 else:
97
98 handle.write(" "*truncate)
99
100 for chunk in range(0,5):
101 i = block*50 + chunk*10
102 seq_segment = record.seq.tostring()[i:i+10]
103
104
105 handle.write(" %s" % seq_segment)
106 if i+10 > length_of_seqs : break
107 handle.write("\n")
108 block=block+1
109 if block*50 > length_of_seqs : break
110 handle.write("\n")
111
113 """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator.
114
115 Record identifiers are limited to at most 10 characters.
116
117 It only copes with interlaced phylip files! Sequential files won't work
118 where the sequences are split over multiple lines.
119
120 For more information on the file format, please see:
121 http://evolution.genetics.washington.edu/phylip/doc/sequence.html
122 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
123 """
124
126 line = line.strip()
127 parts = filter(None, line.split())
128 if len(parts)!=2:
129 return False
130 try:
131 number_of_seqs = int(parts[0])
132 length_of_seqs = int(parts[1])
133 return True
134 except ValueError:
135 return False
136
138 handle = self.handle
139
140 try:
141
142
143 line = self._header
144 del self._header
145 except AttributeError:
146 line = handle.readline()
147
148 if not line:
149 raise StopIteration
150 line = line.strip()
151 parts = filter(None, line.split())
152 if len(parts)!=2:
153 raise ValueError("First line should have two integers")
154 try:
155 number_of_seqs = int(parts[0])
156 length_of_seqs = int(parts[1])
157 except ValueError:
158 raise ValueError("First line should have two integers")
159
160 assert self._is_header(line)
161
162 if self.records_per_alignment is not None \
163 and self.records_per_alignment != number_of_seqs:
164 raise ValueError("Found %i records in this alignment, told to expect %i" \
165 % (number_of_seqs, self.records_per_alignment))
166
167 ids = []
168 seqs = []
169
170
171
172 for i in range(0,number_of_seqs):
173 line = handle.readline().rstrip()
174 ids.append(line[:10].strip())
175 seqs.append([line[10:].strip().replace(" ","")])
176
177
178 line=""
179 while True:
180
181 while ""==line.strip():
182 line = handle.readline()
183 if not line : break
184 if not line : break
185
186 if self._is_header(line):
187
188 self._header = line
189 break
190
191
192 for i in range(0,number_of_seqs):
193 seqs[i].append(line.strip().replace(" ",""))
194 line = handle.readline()
195 if (not line) and i+1 < number_of_seqs:
196 raise ValueError("End of file mid-block")
197 if not line : break
198
199 records = (SeqRecord(Seq("".join(s), self.alphabet), \
200 id=i, name=i, description=i) \
201 for (i,s) in zip(ids, seqs))
202 return MultipleSeqAlignment(records, self.alphabet)
203
204 if __name__=="__main__":
205 print "Running short mini-test"
206
207 phylip_text=""" 8 286
208 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG
209 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG
210 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG
211 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG
212 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG
213 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA
214 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA
215 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG
216
217 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL
218 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE
219 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG
220 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG
221 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS
222 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA
223 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG
224 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS
225
226 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE
227 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD
228 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA
229 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE
230 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD
231 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK
232 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA
233 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE
234
235 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA
236 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA
237 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM
238 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA
239 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA
240 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA
241 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA
242 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA
243
244 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK
245 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK
246 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE
247 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA
248 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED
249 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE
250 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA
251 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE
252
253 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK----
254 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH
255 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK----
256 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ----
257 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK----
258 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K-----
259 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP---
260 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG---
261 """
262
263 from cStringIO import StringIO
264 handle = StringIO(phylip_text)
265 count=0
266 for alignment in PhylipIterator(handle):
267 for record in alignment:
268 count=count+1
269 print record.id
270
271 assert count == 8
272
273 expected="""mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg
274 fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly
275 aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys
276 dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre
277 alnkafaemradgtyeklak kyfdfdvygg""".replace(" ","").replace("\n","").upper()
278 assert record.seq.tostring().replace("-","") == expected
279
280
281
282 phylip_text2="""5 60
283 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG
284 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG
285 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG
286 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG
287 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG
288
289 GAAATGGTCAATATTACAAGGT
290 GAAATGGTCAACATTAAAAGAT
291 GAAATCGTCAATATTAAAAGGT
292 GAAATGGTCAATCTTAAAAGGT
293 GAAATGGTCAATATTAAAAGGT"""
294
295 phylip_text3="""5 60
296 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT
297 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT
298 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT
299 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT
300 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT"""
301
302 handle = StringIO(phylip_text2)
303 list2 = list(PhylipIterator(handle))
304 handle.close()
305 assert len(list2)==1
306 assert len(list2[0])==5
307
308 handle = StringIO(phylip_text3)
309 list3 = list(PhylipIterator(handle))
310 handle.close()
311 assert len(list3)==1
312 assert len(list3[0])==5
313
314 for i in range(0,5):
315 list2[0][i].id == list3[0][i].id
316 list2[0][i].seq.tostring() == list3[0][i].seq.tostring()
317
318
319
320
321 phylip_text4=""" 5 42
322 Turkey AAGCTNGGGC ATTTCAGGGT
323 Salmo gairAAGCCTTGGC AGTGCAGGGT
324 H. SapiensACCGGTTGGC CGTTCAGGGT
325 Chimp AAACCCTTGC CGTTACGCTT
326 Gorilla AAACCCTTGC CGGTACGCTT
327
328 GAGCCCGGGC AATACAGGGT AT
329 GAGCCGTGGC CGGGCACGGT AT
330 ACAGGTTGGC CGTTCAGGGT AA
331 AAACCGAGGC CGGGACACTC AT
332 AAACCATTGC CGGTACGCTT AA"""
333
334
335
336 phylip_text5=""" 5 42
337 Turkey AAGCTNGGGC ATTTCAGGGT
338 GAGCCCGGGC AATACAGGGT AT
339 Salmo gairAAGCCTTGGC AGTGCAGGGT
340 GAGCCGTGGC CGGGCACGGT AT
341 H. SapiensACCGGTTGGC CGTTCAGGGT
342 ACAGGTTGGC CGTTCAGGGT AA
343 Chimp AAACCCTTGC CGTTACGCTT
344 AAACCGAGGC CGGGACACTC AT
345 Gorilla AAACCCTTGC CGGTACGCTT
346 AAACCATTGC CGGTACGCTT AA"""
347
348 phylip_text5a=""" 5 42
349 Turkey AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT
350 Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT
351 H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA
352 Chimp AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT
353 Gorilla AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA"""
354
355 handle = StringIO(phylip_text4)
356 list4 = list(PhylipIterator(handle))
357 handle.close()
358 assert len(list4)==1
359 assert len(list4[0])==5
360
361 handle = StringIO(phylip_text5)
362 try:
363 list5 = list(PhylipIterator(handle))
364 assert len(list5)==1
365 assert len(list5[0])==5
366 print "That should have failed..."
367 except ValueError:
368 print "Evil multiline non-interlaced example failed as expected"
369 handle.close()
370
371 handle = StringIO(phylip_text5a)
372 list5 = list(PhylipIterator(handle))
373 handle.close()
374 assert len(list5)==1
375 assert len(list4[0])==5
376
377 print "Concatenation"
378 handle = StringIO(phylip_text4 + "\n" + phylip_text4)
379 assert len(list(PhylipIterator(handle))) == 2
380
381 handle = StringIO(phylip_text3 + "\n" + phylip_text4 + "\n\n\n" + phylip_text)
382 assert len(list(PhylipIterator(handle))) == 3
383
384 print "OK"
385
386 print "Checking write/read"
387 handle = StringIO()
388 PhylipWriter(handle).write_file(list5)
389 handle.seek(0)
390 list6 = list(PhylipIterator(handle))
391 assert len(list5) == len(list6)
392 for a1,a2 in zip(list5, list6):
393 assert len(a1) == len(a2)
394 for r1, r2 in zip(a1, a2):
395 assert r1.id == r2.id
396 assert r1.seq.tostring() == r2.seq.tostring()
397 print "Done"
398