1
2
3
4
5
"""Record classes to hold BLAST output.

Classes:
Blast              Holds all the information from a blast search.
PSIBlast           Holds all the information from a psi-blast search.

Header             Holds information from the header.
Description        Holds information about one hit description.
Alignment          Holds information about one alignment hit.
HSP                Holds information about one HSP.
MultipleAlignment  Holds information about a multiple alignment.
Round              Holds information from a PSI-BLAST round.
DatabaseReport     Holds information from the database report.
Parameters         Holds information from the parameters.

"""
21
22
23 import string
24
25 from Bio.Align import Generic
26
class Header:
    """Saves information from a blast header.

    Members:
    application         The name of the BLAST flavor that generated this data.
    version             Version of blast used.
    date                Date this data was generated.
    reference           Reference for blast.

    query               Name of query sequence.
    query_letters       Number of letters in the query sequence.  (int)

    database            Name of the database.
    database_sequences  Number of sequences in the database.  (int)
    database_letters    Number of letters in the database.  (int)

    """

    # NOTE(review): the `class` and `def` lines were missing from the numbered
    # listing this block was recovered from.  The class name comes from the
    # module docstring; `__init__(self)` is inferred from the body -- confirm.
    def __init__(self):
        """Create an empty header: text fields are '', counts are None."""
        self.application = ''
        self.version = ''
        self.date = ''
        self.reference = ''

        self.query = ''
        self.query_letters = None

        self.database = ''
        self.database_sequences = None
        self.database_letters = None
56
class Description:
    """Stores information about one hit in the descriptions section.

    Members:
    title           Title of the hit.
    score           Number of bits.  (int)
    bits            Bit score.  (float)
    e               E value.  (float)
    num_alignments  Number of alignments for the same subject.  (int)

    """

    # NOTE(review): the `class` and `def` lines were missing from the numbered
    # listing this block was recovered from.  `__init__` and `__str__` are
    # inferred from the bodies -- confirm the method names.
    def __init__(self):
        """Create an empty description: title is '', numeric fields are None."""
        self.title = ''
        self.score = None
        self.bits = None
        self.e = None
        self.num_alignments = None

    def __str__(self):
        """Return one summary line: title (left-justified), score, E value."""
        # NOTE(review): runs of spaces in this literal may have been collapsed
        # by the listing extraction -- confirm the intended column padding.
        return "%-66s %5s %s" % (self.title, self.score, self.e)
76
class Alignment:
    """Stores information about one hit in the alignments section.

    Members:
    title    Name.
    hit_id   Hit identifier.  (str)
    hit_def  Hit definition.  (str)
    length   Length.  (int)
    hsps     A list of HSP objects.

    """

    # NOTE(review): the `class` and `def` lines were missing from the numbered
    # listing this block was recovered from.  `__init__` and `__str__` are
    # inferred from the bodies -- confirm the method names.
    def __init__(self):
        """Create an empty alignment with no HSPs."""
        self.title = ''
        self.hit_id = ''
        self.hit_def = ''
        self.length = None
        self.hsps = []

    def __str__(self):
        """Return the title, one line per title line, then a Length line.

        Every title line after the first is prefixed with a continuation
        indent.  The result always ends with a newline.
        """
        # NOTE(review): runs of spaces in the two literals below may have been
        # collapsed by the listing extraction -- confirm the intended padding.
        lines = []
        # str methods replace the `string.split` / `string.join` module
        # functions, which are not available in Python 3.
        for index, title_line in enumerate(self.title.split('\n')):
            if index:
                lines.append(" ")
            lines.append("%s\n" % title_line)
        lines.append(" Length = %s\n" % self.length)
        return ''.join(lines)
103
class HSP:
    """Stores information about one hsp in an alignment hit.

    Members:
    score           BLAST score of hit.  (float)
    bits            Number of bits for that score.  (float)
    expect          Expect value.  (float)
    num_alignments  Number of alignments for same subject.  (int)
    identities      Number of identities/total aligned.  tuple of (int, int)
    positives       Number of positives/total aligned.  tuple of (int, int)
    gaps            Number of gaps/total aligned.  tuple of (int, int)
    align_length    Length of the alignment.  (int)
    strand          Tuple of (query, target) strand.
    frame           Tuple of 1 or 2 frame shifts, depending on the flavor.

    query           The query sequence.
    query_start     The start residue for the query sequence.  (1-based)
    query_end       The end residue for the query sequence.  (1-based)
    match           The match sequence.
    sbjct           The sbjct sequence.
    sbjct_start     The start residue for the sbjct sequence.  (1-based)
    sbjct_end       The end residue for the sbjct sequence.  (1-based)

    Not all flavors of BLAST return values for every attribute:
              score  expect  identities  positives  strand  frame
    BLASTP      X      X         X           X
    BLASTN      X      X         X           X        X
    BLASTX      X      X         X           X                X
    TBLASTN     X      X         X           X                X
    TBLASTX     X      X         X           X               X/X

    Note: for BLASTX, the query sequence is shown as a protein sequence,
    but the numbering is based on the nucleotides.  Thus, the numbering
    is 3x larger than the number of amino acid residues.  A similar effect
    can be seen for the sbjct sequence in TBLASTN, and for both sequences
    in TBLASTX.

    Also, for negative frames, the sequence numbering starts from
    query_start and counts down.

    """

    # NOTE(review): the `class` and `def` lines were missing from the numbered
    # listing this block was recovered from; `__init__` and `__str__` are
    # inferred from the bodies -- confirm the method names.  The flavor table
    # in the docstring had lost its column alignment: the fifth mark is placed
    # under `strand` for BLASTN and under `frame` for the translated
    # searches -- confirm against the upstream file.
    def __init__(self):
        """Create an empty HSP: numbers are None, sequences are ''."""
        self.score = None
        self.bits = None
        self.expect = None
        self.num_alignments = None
        self.identities = (None, None)
        self.positives = (None, None)
        self.gaps = (None, None)
        self.align_length = None
        self.strand = (None, None)
        self.frame = ()

        self.query = ''
        self.query_start = None
        self.query_end = None
        self.match = ''
        self.sbjct = ''
        self.sbjct_start = None
        self.sbjct_end = None

    def __str__(self):
        """Return a four-line summary: scores, then query/match/sbjct rows.

        Alignments of 50 or more columns are abbreviated to their first 45
        characters, '...', and their last 3 characters.  Raises TypeError if
        the numeric fields in the first line are still None.
        """
        lines = ["Score %i (%i bits), expectation %0.1e, alignment length %i"
                 % (self.score, self.bits, self.expect, self.align_length)]
        query = str(self.query)
        match = str(self.match)
        sbjct = str(self.sbjct)
        if self.align_length >= 50:
            # Too long for one line: keep only the head and the tail.
            query = "%s...%s" % (query[:45], query[-3:])
            match = "%s...%s" % (match[:45], match[-3:])
            sbjct = "%s...%s" % (sbjct[:45], sbjct[-3:])
        # NOTE(review): runs of spaces in the literals below may have been
        # collapsed by the listing extraction -- confirm the intended padding.
        lines.append("Query:%s %s %s" % (str(self.query_start).rjust(8),
                                         query,
                                         str(self.query_end)))
        lines.append(" %s" % match)
        lines.append("Sbjct:%s %s %s" % (str(self.sbjct_start).rjust(8),
                                         sbjct,
                                         str(self.sbjct_end)))
        return "\n".join(lines)
192
194 """Holds information about a multiple alignment.
195
196 Members:
197 alignment A list of tuples (name, start residue, sequence, end residue).
198
199 The start residue is 1-based. It may be blank, if that sequence is
200 not aligned in the multiple alignment.
201
202 """
205
207 """Retrieve generic alignment object for the given alignment.
208
209 Instead of the tuples, this returns an Alignment object from
210 Bio.Align.Generic, through which you can manipulate and query
211 the object.
212
213 alphabet is the specified alphabet for the sequences in the code (for
214 example IUPAC.IUPACProtein).
215
216 Thanks to James Casbon for the code.
217 """
# NOTE(review): the class header, the constructor and this method's `def`
# line are missing from this listing; the code lines are left untouched.
# seq_parts[i] accumulates the full sequence for the row named seq_names[i].
218 seq_parts = []
219 seq_names = []
# parse_number counts the 'QUERY' rows seen so far; n is the row position
# since the most recent 'QUERY' row.
220 parse_number = 0
221 n = 0
222 for name, start, seq, end in self.alignment:
# A row named 'QUERY' opens a new pass over the same set of rows.
223 if name == 'QUERY':
224 parse_number = parse_number + 1
225 n = 0
226
# First pass: record each row's name and its first sequence fragment.
227 if parse_number == 1:
228 seq_parts.append(seq)
229 seq_names.append(name)
# Any other pass: append the fragment to the row at the same position.
# NOTE(review): a row that precedes the first 'QUERY' row lands here with
# seq_parts still empty, so seq_parts[n] would raise IndexError -- confirm
# that the input always starts with a 'QUERY' row.
230 else:
231 seq_parts[n] = seq_parts[n] + seq
232 n = n + 1
233
# Hand the assembled (name, sequence) pairs to a new Generic.Alignment.
234 generic = Generic.Alignment(alphabet)
235 for (name,seq) in zip(seq_names,seq_parts):
236 generic.add_sequence(name, seq)
237
238 return generic
239
class Round:
    """Holds information from a PSI-BLAST round.

    Members:
    number              Round number.  (int)
    reused_seqs         Sequences in model, found again.  List of Description objects.
    new_seqs            Sequences not found, or below threshold.  List of Description.
    alignments          A list of Alignment objects.
    multiple_alignment  A MultipleAlignment object.

    """

    # NOTE(review): the `class` and `def` lines were missing from the numbered
    # listing this block was recovered from.  The class name comes from the
    # "A list of Round objects" member description elsewhere in the file;
    # `__init__(self)` is inferred from the body -- confirm.
    def __init__(self):
        """Create an empty round with fresh, unshared lists."""
        self.number = None
        self.reused_seqs = []
        self.new_seqs = []
        self.alignments = []
        self.multiple_alignment = None
257
259 """Holds information about a database report.
260
261 Members:
262 database_name List of database names. (can have multiple dbs)
263 num_letters_in_database Number of letters in the database. (int)
264 num_sequences_in_database List of number of sequences in the database.
265 posted_date List of the dates the databases were posted.
266 ka_params A tuple of (lambda, k, h) values. (floats)
267 gapped # XXX this isn't set right!
268 ka_params_gap A tuple of (lambda, k, h) values. (floats)
269
270 """
279
281 """Holds information about the parameters.
282
283 Members:
284 matrix Name of the matrix.
285 gap_penalties Tuple of (open, extend) penalties. (floats)
286 sc_match Match score for nucleotide-nucleotide comparison
287 sc_mismatch Mismatch penalty for nucleotide-nucleotide comparison
288 num_hits Number of hits to the database. (int)
289 num_sequences Number of sequences. (int)
290 num_good_extends Number of extensions. (int)
291 num_seqs_better_e Number of sequences better than e-value. (int)
292 hsps_no_gap Number of HSP's better, without gapping. (int)
293 hsps_prelim_gapped Number of HSP's gapped in prelim test. (int)
294 hsps_prelim_gapped_attemped Number of HSP's attempted in prelim. (int)
295 hsps_gapped Total number of HSP's gapped. (int)
296 query_length Length of the query. (int)
297 query_id Identifier of the query sequence. (str)
298 database_length Number of letters in the database. (int)
299 effective_hsp_length Effective HSP length. (int)
300 effective_query_length Effective length of query. (int)
301 effective_database_length Effective length of database. (int)
302 effective_search_space Effective search space. (int)
303 effective_search_space_used Effective search space used. (int)
304 frameshift Frameshift window. Tuple of (int, float)
305 threshold Threshold. (int)
306 window_size Window size. (int)
307 dropoff_1st_pass Tuple of (score, bits). (int, float)
308 gap_x_dropoff Tuple of (score, bits). (int, float)
309 gap_x_dropoff_final Tuple of (score, bits). (int, float)
310 gap_trigger Tuple of (score, bits). (int, float)
311 blast_cutoff Tuple of (score, bits). (int, float)
312 """
342
343 -class Blast(Header, DatabaseReport, Parameters):
344 """Saves the results from a blast search.
345
346 Members:
347 descriptions A list of Description objects.
348 alignments A list of Alignment objects.
349 multiple_alignment A MultipleAlignment object.
350 + members inherited from base classes
351
352 """
360
361 -class PSIBlast(Header, DatabaseReport, Parameters):
362 """Saves the results from a blastpgp search.
363
364 Members:
365 rounds A list of Round objects.
366 converged Whether the search converged.
367 + members inherited from base classes
368
369 """
376