Package Bio :: Package AlignIO :: Module NexusIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.NexusIO

  1  # Copyright 2008-2010 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  Bio.AlignIO support for the "nexus" file format. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11   
 12  See also the Bio.Nexus module (which this code calls internally), 
 13  as this offers more than just accessing the alignment or its 
 14  sequences as SeqRecord objects. 
 15  """ 
 16   
 17  from Bio.Seq import Seq 
 18  from Bio.SeqRecord import SeqRecord   
 19  from Bio.Nexus import Nexus 
 20  from Bio.Align import MultipleSeqAlignment 
 21  from Interfaces import AlignmentWriter 
 22  from Bio import Alphabet 
 23   
 24  #You can get a couple of example files here: 
 25  #http://www.molecularevolution.org/resources/fileformats/ 
 26       
 27  #This is a generator function! 
28 -def NexusIterator(handle, seq_count=None):
29 """Returns SeqRecord objects from a Nexus file. 30 31 Thus uses the Bio.Nexus module to do the hard work. 32 33 You are expected to call this function via Bio.SeqIO or Bio.AlignIO 34 (and not use it directly). 35 36 NOTE - We only expect ONE alignment matrix per Nexus file, 37 meaning this iterator will only yield one MultipleSeqAlignment. 38 """ 39 n = Nexus.Nexus(handle) 40 if not n.matrix: 41 #No alignment found 42 raise StopIteration 43 44 #Bio.Nexus deals with duplicated names by adding a '.copy' suffix. 45 #The original names and the modified names are kept in these two lists: 46 assert len(n.unaltered_taxlabels) == len(n.taxlabels) 47 48 if seq_count and seq_count != len(n.unaltered_taxlabels): 49 raise ValueError("Found %i sequences, but seq_count=%i" \ 50 % (len(n.unaltered_taxlabels), seq_count)) 51 52 #ToDo - Can we extract any annotation too? 53 records = (SeqRecord(n.matrix[new_name], id=new_name, \ 54 name=old_name, description="") \ 55 for old_name, new_name \ 56 in zip (n.unaltered_taxlabels, n.taxlabels)) 57 #All done 58 yield MultipleSeqAlignment(records, n.alphabet)
59
60 -class NexusWriter(AlignmentWriter):
61 """Nexus alignment writer. 62 63 Note that Nexus files are only expected to hold ONE alignment 64 matrix. 65 66 You are expected to call this class via the Bio.AlignIO.write() or 67 Bio.SeqIO.write() functions. 68 """
69 - def write_file(self, alignments):
70 """Use this to write an entire file containing the given alignments. 71 72 alignments - A list or iterator returning MultipleSeqAlignment objects. 73 This should hold ONE and only one alignment. 74 """ 75 align_iter = iter(alignments) #Could have been a list 76 try: 77 first_alignment = align_iter.next() 78 except StopIteration: 79 first_alignment = None 80 if first_alignment is None: 81 #Nothing to write! 82 return 0 83 84 #Check there is only one alignment... 85 try: 86 second_alignment = align_iter.next() 87 except StopIteration: 88 second_alignment = None 89 if second_alignment is not None: 90 raise ValueError("We can only write one Alignment to a Nexus file.") 91 92 #Good. Actually write the single alignment, 93 self.write_alignment(first_alignment) 94 return 1 #we only support writing one alignment!
95
96 - def write_alignment(self, alignment):
97 #Creates an empty Nexus object, adds the sequences, 98 #and then gets Nexus to prepare the output. 99 if len(alignment) == 0: 100 raise ValueError("Must have at least one sequence") 101 if alignment.get_alignment_length() == 0: 102 raise ValueError("Non-empty sequences are required") 103 minimal_record = "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; " \ 104 + "format datatype=%s; end;" \ 105 % self._classify_alphabet_for_nexus(alignment._alphabet) 106 n = Nexus.Nexus(minimal_record) 107 n.alphabet = alignment._alphabet 108 for record in alignment: 109 n.add_sequence(record.id, record.seq.tostring()) 110 n.write_nexus_data(self.handle)
111
112 - def _classify_alphabet_for_nexus(self, alphabet):
113 """Returns 'protein', 'dna', 'rna' based on the alphabet (PRIVATE). 114 115 Raises an exception if this is not possible.""" 116 #Get the base alphabet (underneath any Gapped or StopCodon encoding) 117 a = Alphabet._get_base_alphabet(alphabet) 118 119 if not isinstance(a, Alphabet.Alphabet): 120 raise TypeError("Invalid alphabet") 121 elif isinstance(a, Alphabet.ProteinAlphabet): 122 return "protein" 123 elif isinstance(a, Alphabet.DNAAlphabet): 124 return "dna" 125 elif isinstance(a, Alphabet.RNAAlphabet): 126 return "rna" 127 else: 128 #Must be something like NucleotideAlphabet or 129 #just the generic Alphabet (default for fasta files) 130 raise ValueError("Need a DNA, RNA or Protein alphabet")
131 132 if __name__ == "__main__": 133 from StringIO import StringIO 134 print "Quick self test" 135 print 136 print "Repeated names without a TAXA block" 137 handle = StringIO("""#NEXUS 138 [TITLE: NoName] 139 140 begin data; 141 dimensions ntax=4 nchar=50; 142 format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 143 144 matrix 145 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 146 ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 147 CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 148 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 149 ; 150 end; 151 """) 152 for a in NexusIterator(handle): 153 print a 154 for r in a: 155 print repr(r.seq), r.name, r.id 156 print "Done" 157 158 print 159 print "Repeated names with a TAXA block" 160 handle = StringIO("""#NEXUS 161 [TITLE: NoName] 162 163 begin taxa 164 CYS1_DICDI 165 ALEU_HORVU 166 CATH_HUMAN 167 CYS1_DICDI; 168 end; 169 170 begin data; 171 dimensions ntax=4 nchar=50; 172 format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 173 174 matrix 175 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 176 ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 177 CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 178 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 179 ; 180 end; 181 """) 182 for a in NexusIterator(handle): 183 print a 184 for r in a: 185 print repr(r.seq), r.name, r.id 186 print "Done" 187 print 188 print "Reading an empty file" 189 assert 0 == len(list(NexusIterator(StringIO()))) 190 print "Done" 191 print 192 print "Writing..." 193 194 handle = StringIO() 195 NexusWriter(handle).write_file([a]) 196 handle.seek(0) 197 print handle.read() 198 199 handle = StringIO() 200 try: 201 NexusWriter(handle).write_file([a,a]) 202 assert False, "Should have rejected more than one alignment!" 203 except ValueError: 204 pass 205