1
2
3
4
5
6
7 """Bio.SeqIO support for the "ace" file format.
8
9 You are expected to use this module via the Bio.SeqIO functions.
10 See also the Bio.Sequencing.Ace module which offers more than just accessing
11 the contig consensus sequences in an ACE file as SeqRecord objects.
12 """
13
14 from Bio.Seq import Seq
15 from Bio.SeqRecord import SeqRecord
16 from Bio.Alphabet import generic_nucleotide, generic_dna, generic_rna, Gapped
17 from Bio.Sequencing import Ace
18
19
21 """Returns SeqRecord objects from an ACE file.
22
23 This uses the Bio.Sequencing.Ace module to do the hard work. Note that
24 by iterating over the file in a single pass, we are forced to ignore any
25 WA, CT, RT or WR footer tags.
26
27 Ace files include the base quality for each position, which are taken
28 to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file
29 using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's
30 letter_annotations dictionary under the "phred_quality" key.
31
32 >>> from Bio import SeqIO
33 >>> handle = open("Ace/consed_sample.ace", "rU")
34 >>> for record in SeqIO.parse(handle, "ace"):
35 ... print record.id, record.seq[:10]+"...", len(record)
36 ... print max(record.letter_annotations["phred_quality"])
37 Contig1 agccccgggc... 1475
38 90
39
40 However, ACE files do not include a base quality for any gaps in the
41 consensus sequence, and these are represented in Biopython with a quality
42 of zero. Using zero is perhaps misleading as there may be very strong
43 evidence to support the gap in the consensus. Previous versions of
44 Biopython therefore used None instead, but this complicated usage, and
45 prevented output of the gapped sequence as FASTQ format.
46
47 >>> from Bio import SeqIO
48 >>> handle = open("Ace/contig1.ace", "rU")
49 >>> for record in SeqIO.parse(handle, "ace"):
50 ... print record.id, "..." + record.seq[85:95]+"..."
51 ... print record.letter_annotations["phred_quality"][85:95]
52 ... print max(record.letter_annotations["phred_quality"])
53 Contig1 ...AGAGG-ATGC...
54 [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
55 90
56 Contig2 ...GAATTACTAT...
57 [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
58 90
59
60 """
61 for ace_contig in Ace.parse(handle):
62
63 consensus_seq_str = ace_contig.sequence
64
65 if "U" in consensus_seq_str:
66 if "T" in consensus_seq_str:
67
68 alpha = generic_nucleotide
69 else:
70 alpha = generic_rna
71 else:
72 alpha = generic_dna
73
74 if "*" in consensus_seq_str:
75
76
77 assert "-" not in consensus_seq_str
78 consensus_seq = Seq(consensus_seq_str.replace("*","-"),
79 Gapped(alpha, gap_char="-"))
80 else:
81 consensus_seq = Seq(consensus_seq_str, alpha)
82
83
84
85
86
87
88
89
90 seq_record = SeqRecord(consensus_seq,
91 id = ace_contig.name,
92 name = ace_contig.name)
93
94
95
96
97
98 quals = []
99 i = 0
100 for base in consensus_seq:
101 if base == "-":
102 quals.append(0)
103 else:
104 quals.append(ace_contig.quality[i])
105 i += 1
106 assert i == len(ace_contig.quality)
107 seq_record.letter_annotations["phred_quality"] = quals
108
109 yield seq_record
110
111
113 """Run the Bio.SeqIO module's doctests.
114
115 This will try and locate the unit tests directory, and run the doctests
116 from there in order that the relative paths used in the examples work.
117 """
118 import doctest
119 import os
120 if os.path.isdir(os.path.join("..", "..", "Tests", "Ace")):
121 print "Runing doctests..."
122 cur_dir = os.path.abspath(os.curdir)
123 os.chdir(os.path.join("..", "..", "Tests"))
124 assert os.path.isfile("Ace/consed_sample.ace")
125 doctest.testmod()
126 os.chdir(cur_dir)
127 del cur_dir
128 print "Done"
129 elif os.path.isdir(os.path.join("Tests", "Ace")):
130 print "Runing doctests..."
131 cur_dir = os.path.abspath(os.curdir)
132 os.chdir(os.path.join("Tests"))
133 doctest.testmod()
134 os.chdir(cur_dir)
135 del cur_dir
136 print "Done"
137
138 if __name__ == "__main__":
139 _test()
140