Package Bio :: Package SeqIO :: Module AceIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.AceIO

  1  # Copyright 2008-2010 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Bio.SeqIO support for the "ace" file format. 
  8   
  9  You are expected to use this module via the Bio.SeqIO functions. 
 10  See also the Bio.Sequencing.Ace module which offers more than just accessing 
 11  the contig consensus sequences in an ACE file as SeqRecord objects. 
 12  """ 
 13   
 14  from Bio.Seq import Seq 
 15  from Bio.SeqRecord import SeqRecord 
 16  from Bio.Alphabet import generic_nucleotide, generic_dna, generic_rna, Gapped 
 17  from Bio.Sequencing import Ace 
 18   
 19  #This is a generator function! 
20 -def AceIterator(handle):
21 """Returns SeqRecord objects from an ACE file. 22 23 This uses the Bio.Sequencing.Ace module to do the hard work. Note that 24 by iterating over the file in a single pass, we are forced to ignore any 25 WA, CT, RT or WR footer tags. 26 27 Ace files include the base quality for each position, which are taken 28 to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file 29 using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's 30 letter_annotations dictionary under the "phred_quality" key. 31 32 >>> from Bio import SeqIO 33 >>> handle = open("Ace/consed_sample.ace", "rU") 34 >>> for record in SeqIO.parse(handle, "ace"): 35 ... print record.id, record.seq[:10]+"...", len(record) 36 ... print max(record.letter_annotations["phred_quality"]) 37 Contig1 agccccgggc... 1475 38 90 39 40 However, ACE files do not include a base quality for any gaps in the 41 consensus sequence, and these are represented in Biopython with a quality 42 of zero. Using zero is perhaps misleading as there may be very strong 43 evidence to support the gap in the consensus. Previous versions of 44 Biopython therefore used None instead, but this complicated usage, and 45 prevented output of the gapped sequence as FASTQ format. 46 47 >>> from Bio import SeqIO 48 >>> handle = open("Ace/contig1.ace", "rU") 49 >>> for record in SeqIO.parse(handle, "ace"): 50 ... print record.id, "..." + record.seq[85:95]+"..." 51 ... print record.letter_annotations["phred_quality"][85:95] 52 ... print max(record.letter_annotations["phred_quality"]) 53 Contig1 ...AGAGG-ATGC... 54 [57, 57, 54, 57, 57, 0, 57, 72, 72, 72] 55 90 56 Contig2 ...GAATTACTAT... 57 [68, 68, 68, 68, 68, 68, 68, 68, 68, 68] 58 90 59 60 """ 61 for ace_contig in Ace.parse(handle): 62 #Convert the ACE contig record into a SeqRecord... 63 consensus_seq_str = ace_contig.sequence 64 #Assume its DNA unless there is a U in it, 65 if "U" in consensus_seq_str: 66 if "T" in consensus_seq_str: 67 #Very odd! Error? 68 alpha = generic_nucleotide 69 else: 70 alpha = generic_rna 71 else: 72 alpha = generic_dna 73 74 if "*" in consensus_seq_str: 75 #For consistency with most other file formats, map 76 #any * gaps into - gaps. 77 assert "-" not in consensus_seq_str 78 consensus_seq = Seq(consensus_seq_str.replace("*","-"), 79 Gapped(alpha, gap_char="-")) 80 else: 81 consensus_seq = Seq(consensus_seq_str, alpha) 82 83 #TODO? - Base segments (BS lines) which indicates which read 84 #phrap has chosen to be the consensus at a particular position. 85 #Perhaps as SeqFeature objects? 86 87 #TODO - Supporting reads (RD lines, plus perhaps QA and DS lines) 88 #Perhaps as SeqFeature objects? 89 90 seq_record = SeqRecord(consensus_seq, 91 id = ace_contig.name, 92 name = ace_contig.name) 93 94 #Consensus base quality (BQ lines). Note that any gaps (originally 95 #as * characters) in the consensus do not get a quality entry, so 96 #we assign a quality of None (zero would be missleading as there may 97 #be excelent support for having a gap here). 98 quals = [] 99 i = 0 100 for base in consensus_seq: 101 if base == "-": 102 quals.append(0) 103 else: 104 quals.append(ace_contig.quality[i]) 105 i += 1 106 assert i == len(ace_contig.quality) 107 seq_record.letter_annotations["phred_quality"] = quals 108 109 yield seq_record
110 #All done 111
112 -def _test():
113 """Run the Bio.SeqIO module's doctests. 114 115 This will try and locate the unit tests directory, and run the doctests 116 from there in order that the relative paths used in the examples work. 117 """ 118 import doctest 119 import os 120 if os.path.isdir(os.path.join("..", "..", "Tests", "Ace")): 121 print "Runing doctests..." 122 cur_dir = os.path.abspath(os.curdir) 123 os.chdir(os.path.join("..", "..", "Tests")) 124 assert os.path.isfile("Ace/consed_sample.ace") 125 doctest.testmod() 126 os.chdir(cur_dir) 127 del cur_dir 128 print "Done" 129 elif os.path.isdir(os.path.join("Tests", "Ace")): 130 print "Runing doctests..." 131 cur_dir = os.path.abspath(os.curdir) 132 os.chdir(os.path.join("Tests")) 133 doctest.testmod() 134 os.chdir(cur_dir) 135 del cur_dir 136 print "Done"
137 138 if __name__ == "__main__": 139 _test() 140