Package Bio :: Package Blast :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6  # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 
   7   
   8  """ 
   9  This module provides code to work with the standalone version of 
  10  BLAST, either blastall or blastpgp, provided by the NCBI. 
  11  http://www.ncbi.nlm.nih.gov/BLAST/ 
  12   
  13  Classes: 
  14  LowQualityBlastError     Except that indicates low quality query sequences. 
  15  BlastParser              Parses output from blast. 
  16  BlastErrorParser         Parses output and tries to diagnose possible errors. 
  17  PSIBlastParser           Parses output from psi-blast. 
  18  Iterator                 Iterates over a file of blast results. 
  19   
  20  _Scanner                 Scans output from standalone BLAST. 
  21  _BlastConsumer           Consumes output from blast. 
  22  _PSIBlastConsumer        Consumes output from psi-blast. 
  23  _HeaderConsumer          Consumes header information. 
  24  _DescriptionConsumer     Consumes description information. 
  25  _AlignmentConsumer       Consumes alignment information. 
  26  _HSPConsumer             Consumes hsp information. 
  27  _DatabaseReportConsumer  Consumes database report information. 
  28  _ParametersConsumer      Consumes parameters information. 
  29   
  30  Functions: 
  31  blastall        Execute blastall. 
  32  blastpgp        Execute blastpgp. 
  33  rpsblast        Execute rpsblast. 
  34   
  35  """ 
  36   
  37  import os 
  38  import re 
  39   
  40  from Bio import File 
  41  from Bio.ParserSupport import * 
  42  from Bio.Blast import Record 
  43   
  44   
45 -class LowQualityBlastError(Exception):
46 """Error caused by running a low quality sequence through BLAST. 47 48 When low quality sequences (like GenBank entries containing only 49 stretches of a single nucleotide) are BLASTed, they will result in 50 BLAST generating an error and not being able to perform the BLAST. 51 search. This error should be raised for the BLAST reports produced 52 in this case. 53 """ 54 pass
55
56 -class ShortQueryBlastError(Exception):
57 """Error caused by running a short query sequence through BLAST. 58 59 If the query sequence is too short, BLAST outputs warnings and errors: 60 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 61 [blastall] ERROR: [000.000] AT1G08320: Blast: 62 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 63 done 64 65 This exception is raised when that condition is detected. 66 67 """ 68 pass
69 70
71 -class _Scanner:
72 """Scan BLAST output from blastall or blastpgp. 73 74 Tested with blastall and blastpgp v2.0.10, v2.0.11 75 76 Methods: 77 feed Feed data into the scanner. 78 79 """
80 - def feed(self, handle, consumer):
81 """S.feed(handle, consumer) 82 83 Feed in a BLAST report for scanning. handle is a file-like 84 object that contains the BLAST report. consumer is a Consumer 85 object that will receive events as the report is scanned. 86 87 """ 88 if isinstance(handle, File.UndoHandle): 89 uhandle = handle 90 else: 91 uhandle = File.UndoHandle(handle) 92 93 # Try to fast-forward to the beginning of the blast report. 94 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 95 # Now scan the BLAST report. 96 self._scan_header(uhandle, consumer) 97 self._scan_rounds(uhandle, consumer) 98 self._scan_database_report(uhandle, consumer) 99 self._scan_parameters(uhandle, consumer)
100
101 - def _scan_header(self, uhandle, consumer):
102 # BLASTP 2.0.10 [Aug-26-1999] 103 # 104 # 105 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 106 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 107 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 108 # programs", Nucleic Acids Res. 25:3389-3402. 109 # 110 # Query= test 111 # (140 letters) 112 # 113 # Database: sdqib40-1.35.seg.fa 114 # 1323 sequences; 223,339 total letters 115 # 116 # ======================================================== 117 # This next example is from the online version of Blast, 118 # note there are TWO references, an RID line, and also 119 # the database is BEFORE the query line. 120 # Note there possibleuse of non-ASCII in the author names. 121 # ======================================================== 122 # 123 # BLASTP 2.2.15 [Oct-15-2006] 124 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 125 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 126 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 127 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 128 # 129 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 130 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 131 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 132 # protein database searches with composition-based statistics 133 # and other refinements", Nucleic Acids Res. 29:2994-3005. 134 # 135 # RID: 1166022616-19998-65316425856.BLASTQ1 136 # 137 # 138 # Database: All non-redundant GenBank CDS 139 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 140 # 4,254,166 sequences; 1,462,033,012 total letters 141 # Query= gi:16127998 142 # Length=428 143 # 144 145 consumer.start_header() 146 147 read_and_call(uhandle, consumer.version, contains='BLAST') 148 read_and_call_while(uhandle, consumer.noevent, blank=1) 149 150 # There might be a <pre> line, for qblast output. 151 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 152 153 # Read the reference(s) 154 while attempt_read_and_call(uhandle, 155 consumer.reference, start='Reference') : 156 # References are normally multiline terminated by a blank line 157 # (or, based on the old code, the RID line) 158 while 1: 159 line = uhandle.readline() 160 if is_blank_line(line) : 161 consumer.noevent(line) 162 break 163 elif line.startswith("RID"): 164 break 165 else : 166 #More of the reference 167 consumer.reference(line) 168 169 #Deal with the optional RID: ... 170 read_and_call_while(uhandle, consumer.noevent, blank=1) 171 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 172 read_and_call_while(uhandle, consumer.noevent, blank=1) 173 174 # blastpgp may have a reference for compositional score matrix 175 # adjustment (see Bug 2502): 176 if attempt_read_and_call( 177 uhandle, consumer.reference, start="Reference"): 178 read_and_call_until(uhandle, consumer.reference, blank=1) 179 read_and_call_while(uhandle, consumer.noevent, blank=1) 180 181 # blastpgp has a Reference for composition-based statistics. 182 if attempt_read_and_call( 183 uhandle, consumer.reference, start="Reference"): 184 read_and_call_until(uhandle, consumer.reference, blank=1) 185 read_and_call_while(uhandle, consumer.noevent, blank=1) 186 187 line = uhandle.peekline() 188 assert line.strip() != "" 189 assert not line.startswith("RID:") 190 if line.startswith("Query=") : 191 #This is an old style query then database... 192 193 # Read the Query lines and the following blank line. 194 read_and_call(uhandle, consumer.query_info, start='Query=') 195 read_and_call_until(uhandle, consumer.query_info, blank=1) 196 read_and_call_while(uhandle, consumer.noevent, blank=1) 197 198 # Read the database lines and the following blank line. 199 read_and_call_until(uhandle, consumer.database_info, end='total letters') 200 read_and_call(uhandle, consumer.database_info, contains='sequences') 201 read_and_call_while(uhandle, consumer.noevent, blank=1) 202 elif line.startswith("Database:") : 203 #This is a new style database then query... 204 read_and_call_until(uhandle, consumer.database_info, end='total letters') 205 read_and_call(uhandle, consumer.database_info, contains='sequences') 206 read_and_call_while(uhandle, consumer.noevent, blank=1) 207 208 # Read the Query lines and the following blank line. 209 read_and_call(uhandle, consumer.query_info, start='Query=') 210 read_and_call_until(uhandle, consumer.query_info, blank=1) 211 read_and_call_while(uhandle, consumer.noevent, blank=1) 212 else : 213 raise ValueError("Invalid header?") 214 215 consumer.end_header()
216
217 - def _scan_rounds(self, uhandle, consumer):
218 # Scan a bunch of rounds. 219 # Each round begins with either a "Searching......" line 220 # or a 'Score E' line followed by descriptions and alignments. 221 # The email server doesn't give the "Searching....." line. 222 # If there is no 'Searching.....' line then you'll first see a 223 # 'Results from round' line 224 225 while 1: 226 line = safe_peekline(uhandle) 227 if (not line.startswith('Searching') and 228 not line.startswith('Results from round') and 229 re.search(r"Score +E", line) is None and 230 line.find('No hits found') == -1): 231 break 232 233 self._scan_descriptions(uhandle, consumer) 234 self._scan_alignments(uhandle, consumer)
235
236 - def _scan_descriptions(self, uhandle, consumer):
237 # Searching..................................................done 238 # Results from round 2 239 # 240 # 241 # Sc 242 # Sequences producing significant alignments: (b 243 # Sequences used in model and found again: 244 # 245 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 246 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 247 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 248 # 249 # Sequences not found previously or not previously below threshold: 250 # 251 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 252 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 253 # 254 255 # If PSI-BLAST, may also have: 256 # 257 # CONVERGED! 258 259 consumer.start_descriptions() 260 261 # Read 'Searching' 262 # This line seems to be missing in BLASTN 2.1.2 (others?) 263 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 264 265 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 266 # If this happens, the handle will yield no more information. 267 if not uhandle.peekline(): 268 raise ValueError("Unexpected end of blast report. " + \ 269 "Looks suspiciously like a PSI-BLAST crash.") 270 271 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 272 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 273 # [blastall] ERROR: [000.000] AT1G08320: Blast: 274 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 275 # done 276 # Reported by David Weisman. 277 # Check for these error lines and ignore them for now. Let 278 # the BlastErrorParser deal with them. 279 line = uhandle.peekline() 280 if line.find("ERROR:") != -1 or line.startswith("done"): 281 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 282 read_and_call(uhandle, consumer.noevent, start="done") 283 284 # Check to see if this is PSI-BLAST. 285 # If it is, the 'Searching' line will be followed by: 286 # (version 2.0.10) 287 # Searching............................. 288 # Results from round 2 289 # or (version 2.0.11) 290 # Searching............................. 291 # 292 # 293 # Results from round 2 294 295 # Skip a bunch of blank lines. 296 read_and_call_while(uhandle, consumer.noevent, blank=1) 297 # Check for the results line if it's there. 298 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 299 read_and_call_while(uhandle, consumer.noevent, blank=1) 300 301 # Three things can happen here: 302 # 1. line contains 'Score E' 303 # 2. line contains "No hits found" 304 # 3. no descriptions 305 # The first one begins a bunch of descriptions. The last two 306 # indicates that no descriptions follow, and we should go straight 307 # to the alignments. 308 if not attempt_read_and_call( 309 uhandle, consumer.description_header, 310 has_re=re.compile(r'Score +E')): 311 # Either case 2 or 3. Look for "No hits found". 312 attempt_read_and_call(uhandle, consumer.no_hits, 313 contains='No hits found') 314 read_and_call_while(uhandle, consumer.noevent, blank=1) 315 316 consumer.end_descriptions() 317 # Stop processing. 318 return 319 320 # Read the score header lines 321 read_and_call(uhandle, consumer.description_header, 322 start='Sequences producing') 323 324 # If PSI-BLAST, read the 'Sequences used in model' line. 325 attempt_read_and_call(uhandle, consumer.model_sequences, 326 start='Sequences used in model') 327 read_and_call_while(uhandle, consumer.noevent, blank=1) 328 329 # Read the descriptions and the following blank lines, making 330 # sure that there are descriptions. 331 if not uhandle.peekline().startswith('Sequences not found'): 332 read_and_call_until(uhandle, consumer.description, blank=1) 333 read_and_call_while(uhandle, consumer.noevent, blank=1) 334 335 # If PSI-BLAST, read the 'Sequences not found' line followed 336 # by more descriptions. However, I need to watch out for the 337 # case where there were no sequences not found previously, in 338 # which case there will be no more descriptions. 339 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 340 start='Sequences not found'): 341 # Read the descriptions and the following blank lines. 342 read_and_call_while(uhandle, consumer.noevent, blank=1) 343 l = safe_peekline(uhandle) 344 # Brad -- added check for QUERY. On some PSI-BLAST outputs 345 # there will be a 'Sequences not found' line followed by no 346 # descriptions. Check for this case since the first thing you'll 347 # get is a blank line and then 'QUERY' 348 if not l.startswith('CONVERGED') and l[0] != '>' \ 349 and not l.startswith('QUERY'): 350 read_and_call_until(uhandle, consumer.description, blank=1) 351 read_and_call_while(uhandle, consumer.noevent, blank=1) 352 353 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 354 read_and_call_while(uhandle, consumer.noevent, blank=1) 355 356 consumer.end_descriptions()
357
358 - def _scan_alignments(self, uhandle, consumer):
359 # qblast inserts a helpful line here. 360 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 361 362 # First, check to see if I'm at the database report. 363 line = safe_peekline(uhandle) 364 if line.startswith(' Database'): 365 return 366 elif line[0] == '>': 367 # XXX make a better check here between pairwise and masterslave 368 self._scan_pairwise_alignments(uhandle, consumer) 369 else: 370 # XXX put in a check to make sure I'm in a masterslave alignment 371 self._scan_masterslave_alignment(uhandle, consumer)
372
373 - def _scan_pairwise_alignments(self, uhandle, consumer):
374 while 1: 375 line = safe_peekline(uhandle) 376 if line[0] != '>': 377 break 378 self._scan_one_pairwise_alignment(uhandle, consumer)
379
380 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
381 consumer.start_alignment() 382 383 self._scan_alignment_header(uhandle, consumer) 384 385 # Scan a bunch of score/alignment pairs. 386 while 1: 387 line = safe_peekline(uhandle) 388 if not line.startswith(' Score'): 389 break 390 self._scan_hsp(uhandle, consumer) 391 consumer.end_alignment()
392
393 - def _scan_alignment_header(self, uhandle, consumer):
394 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 395 # stearothermophilus] 396 # Length = 81 397 # 398 # Or, more recently with different white space: 399 # 400 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 401 # gi|15829258|ref|NP_308031.1| threonine synthase 402 # ... 403 # Length=428 404 read_and_call(uhandle, consumer.title, start='>') 405 while 1: 406 line = safe_readline(uhandle) 407 if line.lstrip().startswith('Length =') \ 408 or line.lstrip().startswith('Length='): 409 consumer.length(line) 410 break 411 elif is_blank_line(line): 412 # Check to make sure I haven't missed the Length line 413 raise ValueError("I missed the Length in an alignment header") 414 consumer.title(line) 415 416 # Older versions of BLAST will have a line with some spaces. 417 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 418 if not attempt_read_and_call(uhandle, consumer.noevent, 419 start=' '): 420 read_and_call(uhandle, consumer.noevent, blank=1)
421
422 - def _scan_hsp(self, uhandle, consumer):
423 consumer.start_hsp() 424 self._scan_hsp_header(uhandle, consumer) 425 self._scan_hsp_alignment(uhandle, consumer) 426 consumer.end_hsp()
427
428 - def _scan_hsp_header(self, uhandle, consumer):
429 # Score = 22.7 bits (47), Expect = 2.5 430 # Identities = 10/36 (27%), Positives = 18/36 (49%) 431 # Strand = Plus / Plus 432 # Frame = +3 433 # 434 435 read_and_call(uhandle, consumer.score, start=' Score') 436 read_and_call(uhandle, consumer.identities, start=' Identities') 437 # BLASTN 438 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand') 439 # BLASTX, TBLASTN, TBLASTX 440 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame') 441 read_and_call(uhandle, consumer.noevent, blank=1)
442
443 - def _scan_hsp_alignment(self, uhandle, consumer):
444 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 445 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 446 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 447 # 448 # Query: 64 AEKILIKR 71 449 # I +K 450 # Sbjct: 70 PNIIQLKD 77 451 # 452 453 while 1: 454 # Blastn adds an extra line filled with spaces before Query 455 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 456 read_and_call(uhandle, consumer.query, start='Query') 457 read_and_call(uhandle, consumer.align, start=' ') 458 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 459 read_and_call_while(uhandle, consumer.noevent, blank=1) 460 line = safe_peekline(uhandle) 461 # Alignment continues if I see a 'Query' or the spaces for Blastn. 462 if not (line.startswith('Query') or line.startswith(' ')): 463 break
464
465 - def _scan_masterslave_alignment(self, uhandle, consumer):
466 consumer.start_alignment() 467 while 1: 468 line = safe_readline(uhandle) 469 # Check to see whether I'm finished reading the alignment. 470 # This is indicated by 1) database section, 2) next psi-blast 471 # round, which can also be a 'Results from round' if no 472 # searching line is present 473 # patch by chapmanb 474 if line.startswith('Searching') or \ 475 line.startswith('Results from round'): 476 uhandle.saveline(line) 477 break 478 elif line.startswith(' Database'): 479 uhandle.saveline(line) 480 break 481 elif is_blank_line(line): 482 consumer.noevent(line) 483 else: 484 consumer.multalign(line) 485 read_and_call_while(uhandle, consumer.noevent, blank=1) 486 consumer.end_alignment()
487
488 - def _scan_database_report(self, uhandle, consumer):
489 # Database: sdqib40-1.35.seg.fa 490 # Posted date: Nov 1, 1999 4:25 PM 491 # Number of letters in database: 223,339 492 # Number of sequences in database: 1323 493 # 494 # Lambda K H 495 # 0.322 0.133 0.369 496 # 497 # Gapped 498 # Lambda K H 499 # 0.270 0.0470 0.230 500 # 501 ########################################## 502 # Or, more recently Blast 2.2.15 gives less blank lines 503 ########################################## 504 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 505 # environmental samples 506 # Posted date: Dec 12, 2006 5:51 PM 507 # Number of letters in database: 667,088,753 508 # Number of sequences in database: 2,094,974 509 # Lambda K H 510 # 0.319 0.136 0.395 511 # Gapped 512 # Lambda K H 513 # 0.267 0.0410 0.140 514 515 516 consumer.start_database_report() 517 518 # Subset of the database(s) listed below 519 # Number of letters searched: 562,618,960 520 # Number of sequences searched: 228,924 521 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 522 read_and_call(uhandle, consumer.noevent, contains="letters") 523 read_and_call(uhandle, consumer.noevent, contains="sequences") 524 read_and_call(uhandle, consumer.noevent, start=" ") 525 526 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 527 # was missing the "Database" stanza completely. 528 while attempt_read_and_call(uhandle, consumer.database, 529 start=' Database'): 530 # BLAT output ends abruptly here, without any of the other 531 # information. Check to see if this is the case. If so, 532 # then end the database report here gracefully. 533 if not uhandle.peekline(): 534 consumer.end_database_report() 535 return 536 537 # Database can span multiple lines. 538 read_and_call_until(uhandle, consumer.database, start=' Posted') 539 read_and_call(uhandle, consumer.posted_date, start=' Posted') 540 read_and_call(uhandle, consumer.num_letters_in_database, 541 start=' Number of letters') 542 read_and_call(uhandle, consumer.num_sequences_in_database, 543 start=' Number of sequences') 544 #There may not be a line starting with spaces... 545 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 546 547 line = safe_readline(uhandle) 548 uhandle.saveline(line) 549 if line.find('Lambda') != -1: 550 break 551 552 read_and_call(uhandle, consumer.noevent, start='Lambda') 553 read_and_call(uhandle, consumer.ka_params) 554 555 #This blank line is optional: 556 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 557 558 # not BLASTP 559 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 560 # not TBLASTX 561 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 562 read_and_call(uhandle, consumer.ka_params_gap) 563 564 # Blast 2.2.4 can sometimes skip the whole parameter section. 565 # Thus, I need to be careful not to read past the end of the 566 # file. 567 try: 568 read_and_call_while(uhandle, consumer.noevent, blank=1) 569 except ValueError, x: 570 if str(x) != "Unexpected end of stream.": 571 raise 572 consumer.end_database_report()
573
574 - def _scan_parameters(self, uhandle, consumer):
575 # Matrix: BLOSUM62 576 # Gap Penalties: Existence: 11, Extension: 1 577 # Number of Hits to DB: 50604 578 # Number of Sequences: 1323 579 # Number of extensions: 1526 580 # Number of successful extensions: 6 581 # Number of sequences better than 10.0: 5 582 # Number of HSP's better than 10.0 without gapping: 5 583 # Number of HSP's successfully gapped in prelim test: 0 584 # Number of HSP's that attempted gapping in prelim test: 1 585 # Number of HSP's gapped (non-prelim): 5 586 # length of query: 140 587 # length of database: 223,339 588 # effective HSP length: 39 589 # effective length of query: 101 590 # effective length of database: 171,742 591 # effective search space: 17345942 592 # effective search space used: 17345942 593 # T: 11 594 # A: 40 595 # X1: 16 ( 7.4 bits) 596 # X2: 38 (14.8 bits) 597 # X3: 64 (24.9 bits) 598 # S1: 41 (21.9 bits) 599 # S2: 42 (20.8 bits) 600 ########################################## 601 # Or, more recently Blast(x) 2.2.15 gives 602 ########################################## 603 # Matrix: BLOSUM62 604 # Gap Penalties: Existence: 11, Extension: 1 605 # Number of Sequences: 4535438 606 # Number of Hits to DB: 2,588,844,100 607 # Number of extensions: 60427286 608 # Number of successful extensions: 126433 609 # Number of sequences better than 2.0: 30 610 # Number of HSP's gapped: 126387 611 # Number of HSP's successfully gapped: 35 612 # Length of query: 291 613 # Length of database: 1,573,298,872 614 # Length adjustment: 130 615 # Effective length of query: 161 616 # Effective length of database: 983,691,932 617 # Effective search space: 158374401052 618 # Effective search space used: 158374401052 619 # Neighboring words threshold: 12 620 # Window for multiple hits: 40 621 # X1: 16 ( 7.3 bits) 622 # X2: 38 (14.6 bits) 623 # X3: 64 (24.7 bits) 624 # S1: 41 (21.7 bits) 625 # S2: 32 (16.9 bits) 626 627 628 # Blast 2.2.4 can sometimes skip the whole parameter section. 629 # Thus, check to make sure that the parameter section really 630 # exists. 631 if not uhandle.peekline(): 632 return 633 634 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 635 # "Number of Sequences" lines. 636 consumer.start_parameters() 637 638 # Matrix line may be missing in BLASTN 2.2.9 639 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 640 # not TBLASTX 641 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 642 643 attempt_read_and_call(uhandle, consumer.num_sequences, 644 start='Number of Sequences') 645 read_and_call(uhandle, consumer.num_hits, 646 start='Number of Hits') 647 attempt_read_and_call(uhandle, consumer.num_sequences, 648 start='Number of Sequences') 649 read_and_call(uhandle, consumer.num_extends, 650 start='Number of extensions') 651 read_and_call(uhandle, consumer.num_good_extends, 652 start='Number of successful') 653 654 read_and_call(uhandle, consumer.num_seqs_better_e, 655 start='Number of sequences') 656 657 # not BLASTN, TBLASTX 658 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 659 start="Number of HSP's better"): 660 # BLASTN 2.2.9 661 if attempt_read_and_call(uhandle, consumer.noevent, 662 start="Number of HSP's gapped:"): 663 read_and_call(uhandle, consumer.noevent, 664 start="Number of HSP's successfully") 665 #This is ommitted in 2.2.15 666 attempt_read_and_call(uhandle, consumer.noevent, 667 start="Number of extra gapped extensions") 668 else: 669 read_and_call(uhandle, consumer.hsps_prelim_gapped, 670 start="Number of HSP's successfully") 671 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 672 start="Number of HSP's that") 673 read_and_call(uhandle, consumer.hsps_gapped, 674 start="Number of HSP's gapped") 675 #e.g. BLASTX 2.2.15 where the "better" line is missing 676 elif attempt_read_and_call(uhandle, consumer.noevent, 677 start="Number of HSP's gapped"): 678 read_and_call(uhandle, consumer.noevent, 679 start="Number of HSP's successfully") 680 681 # not in blastx 2.2.1 682 attempt_read_and_call(uhandle, consumer.query_length, 683 has_re=re.compile(r"[Ll]ength of query")) 684 read_and_call(uhandle, consumer.database_length, 685 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 686 687 # BLASTN 2.2.9 688 attempt_read_and_call(uhandle, consumer.noevent, 689 start="Length adjustment") 690 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 691 start='effective HSP') 692 # Not in blastx 2.2.1 693 attempt_read_and_call( 694 uhandle, consumer.effective_query_length, 695 has_re=re.compile(r'[Ee]ffective length of query')) 696 697 # This is not in BLASTP 2.2.15 698 attempt_read_and_call( 699 uhandle, consumer.effective_database_length, 700 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 701 # Not in blastx 2.2.1, added a ':' to distinguish between 702 # this and the 'effective search space used' line 703 attempt_read_and_call( 704 uhandle, consumer.effective_search_space, 705 has_re=re.compile(r'[Ee]ffective search space:')) 706 # Does not appear in BLASTP 2.0.5 707 attempt_read_and_call( 708 uhandle, consumer.effective_search_space_used, 709 has_re=re.compile(r'[Ee]ffective search space used')) 710 711 # BLASTX, TBLASTN, TBLASTX 712 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 713 714 # not in BLASTN 2.2.9 715 attempt_read_and_call(uhandle, consumer.threshold, start='T') 716 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 717 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold') 718 719 # not in BLASTX 2.2.15 720 attempt_read_and_call(uhandle, consumer.window_size, start='A') 721 # get this instead: "Window for multiple hits: 40" 722 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits') 723 724 read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 725 read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 726 727 # not BLASTN, TBLASTX 728 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 729 start='X3') 730 731 read_and_call(uhandle, consumer.gap_trigger, start='S1') 732 # not in blastx 2.2.1 733 # first we make sure we have additional lines to work with, if 734 # not then the file is done and we don't have a final S2 735 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 736 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 737 738 consumer.end_parameters()
739
740 -class BlastParser(AbstractParser):
741 """Parses BLAST data into a Record.Blast object. 742 743 """
744 - def __init__(self):
745 """__init__(self)""" 746 self._scanner = _Scanner() 747 self._consumer = _BlastConsumer()
748
749 - def parse(self, handle):
750 """parse(self, handle)""" 751 self._scanner.feed(handle, self._consumer) 752 return self._consumer.data
753
754 -class PSIBlastParser(AbstractParser):
755 """Parses BLAST data into a Record.PSIBlast object. 756 757 """
758 - def __init__(self):
759 """__init__(self)""" 760 self._scanner = _Scanner() 761 self._consumer = _PSIBlastConsumer()
762
763 - def parse(self, handle):
764 """parse(self, handle)""" 765 self._scanner.feed(handle, self._consumer) 766 return self._consumer.data
767
768 -class _HeaderConsumer:
769 - def start_header(self):
770 self._header = Record.Header()
771
772 - def version(self, line):
773 c = line.split() 774 self._header.application = c[0] 775 self._header.version = c[1] 776 self._header.date = c[2][1:-1]
777
778 - def reference(self, line):
779 if line.startswith('Reference: '): 780 self._header.reference = line[11:] 781 else: 782 self._header.reference = self._header.reference + line
783
784 - def query_info(self, line):
785 if line.startswith('Query= '): 786 self._header.query = line[7:] 787 elif not line.startswith(' '): # continuation of query_info 788 self._header.query = "%s%s" % (self._header.query, line) 789 else: 790 letters, = _re_search( 791 r"([0-9,]+) letters", line, 792 "I could not find the number of letters in line\n%s" % line) 793 self._header.query_letters = _safe_int(letters)
794
795 - def database_info(self, line):
796 line = line.rstrip() 797 if line.startswith('Database: '): 798 self._header.database = line[10:] 799 elif not line.endswith('total letters'): 800 self._header.database = self._header.database + line.strip() 801 else: 802 sequences, letters =_re_search( 803 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 804 "I could not find the sequences and letters in line\n%s" %line) 805 self._header.database_sequences = _safe_int(sequences) 806 self._header.database_letters = _safe_int(letters)
807
808 - def end_header(self):
809 # Get rid of the trailing newlines 810 self._header.reference = self._header.reference.rstrip() 811 self._header.query = self._header.query.rstrip()
812
813 -class _DescriptionConsumer:
814 - def start_descriptions(self):
815 self._descriptions = [] 816 self._model_sequences = [] 817 self._nonmodel_sequences = [] 818 self._converged = 0 819 self._type = None 820 self._roundnum = None 821 822 self.__has_n = 0 # Does the description line contain an N value?
823
824 - def description_header(self, line):
825 if line.startswith('Sequences producing'): 826 cols = line.split() 827 if cols[-1] == 'N': 828 self.__has_n = 1
829
830 - def description(self, line):
831 dh = self._parse(line) 832 if self._type == 'model': 833 self._model_sequences.append(dh) 834 elif self._type == 'nonmodel': 835 self._nonmodel_sequences.append(dh) 836 else: 837 self._descriptions.append(dh)
838
839 - def model_sequences(self, line):
840 self._type = 'model'
841
842 - def nonmodel_sequences(self, line):
843 self._type = 'nonmodel'
844
845 - def converged(self, line):
846 self._converged = 1
847
848 - def no_hits(self, line):
849 pass
850
851 - def round(self, line):
852 if not line.startswith('Results from round'): 853 raise ValueError("I didn't understand the round line\n%s" % line) 854 self._roundnum = _safe_int(line[18:].strip())
855
856 - def end_descriptions(self):
857 pass
858
859 - def _parse(self, description_line):
860 line = description_line # for convenience 861 dh = Record.Description() 862 863 # I need to separate the score and p-value from the title. 864 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 865 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 866 # special cases to handle: 867 # - title must be preserved exactly (including whitespaces) 868 # - score could be equal to e-value (not likely, but what if??) 869 # - sometimes there's an "N" score of '1'. 870 cols = line.split() 871 if len(cols) < 3: 872 raise ValueError( \ 873 "Line does not appear to contain description:\n%s" % line) 874 if self.__has_n: 875 i = line.rfind(cols[-1]) # find start of N 876 i = line.rfind(cols[-2], 0, i) # find start of p-value 877 i = line.rfind(cols[-3], 0, i) # find start of score 878 else: 879 i = line.rfind(cols[-1]) # find start of p-value 880 i = line.rfind(cols[-2], 0, i) # find start of score 881 if self.__has_n: 882 dh.title, dh.score, dh.e, dh.num_alignments = \ 883 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 884 else: 885 dh.title, dh.score, dh.e, dh.num_alignments = \ 886 line[:i].rstrip(), cols[-2], cols[-1], 1 887 dh.num_alignments = _safe_int(dh.num_alignments) 888 dh.score = _safe_int(dh.score) 889 dh.e = _safe_float(dh.e) 890 return dh
891
892 -class _AlignmentConsumer:
893 # This is a little bit tricky. An alignment can either be a 894 # pairwise alignment or a multiple alignment. Since it's difficult 895 # to know a-priori which one the blast record will contain, I'm going 896 # to make one class that can parse both of them.
897 - def start_alignment(self):
898 self._alignment = Record.Alignment() 899 self._multiple_alignment = Record.MultipleAlignment()
900
901 - def title(self, line):
902 self._alignment.title = "%s%s" % (self._alignment.title, 903 line.lstrip())
904
905 - def length(self, line):
906 #e.g. "Length = 81" or more recently, "Length=428" 907 parts = line.replace(" ","").split("=") 908 assert len(parts)==2, "Unrecognised format length line" 909 self._alignment.length = parts[1] 910 self._alignment.length = _safe_int(self._alignment.length)
911
912 - def multalign(self, line):
913 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 914 if line.startswith('QUERY') or line.startswith('blast_tmp'): 915 # If this is the first line of the multiple alignment, 916 # then I need to figure out how the line is formatted. 917 918 # Format of line is: 919 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 920 try: 921 name, start, seq, end = line.split() 922 except ValueError: 923 raise ValueError("I do not understand the line\n%s" % line) 924 self._start_index = line.index(start, len(name)) 925 self._seq_index = line.index(seq, 926 self._start_index+len(start)) 927 # subtract 1 for the space 928 self._name_length = self._start_index - 1 929 self._start_length = self._seq_index - self._start_index - 1 930 self._seq_length = line.rfind(end) - self._seq_index - 1 931 932 #self._seq_index = line.index(seq) 933 ## subtract 1 for the space 934 #self._seq_length = line.rfind(end) - self._seq_index - 1 935 #self._start_index = line.index(start) 936 #self._start_length = self._seq_index - self._start_index - 1 937 #self._name_length = self._start_index 938 939 # Extract the information from the line 940 name = line[:self._name_length] 941 name = name.rstrip() 942 start = line[self._start_index:self._start_index+self._start_length] 943 start = start.rstrip() 944 if start: 945 start = _safe_int(start) 946 end = line[self._seq_index+self._seq_length:].rstrip() 947 if end: 948 end = _safe_int(end) 949 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip() 950 # right pad the sequence with spaces if necessary 951 if len(seq) < self._seq_length: 952 seq = seq + ' '*(self._seq_length-len(seq)) 953 954 # I need to make sure the sequence is aligned correctly with the query. 955 # First, I will find the length of the query. Then, if necessary, 956 # I will pad my current sequence with spaces so that they will line 957 # up correctly. 958 959 # Two possible things can happen: 960 # QUERY 961 # 504 962 # 963 # QUERY 964 # 403 965 # 966 # Sequence 504 will need padding at the end. Since I won't know 967 # this until the end of the alignment, this will be handled in 968 # end_alignment. 969 # Sequence 403 will need padding before being added to the alignment. 970 971 align = self._multiple_alignment.alignment # for convenience 972 align.append((name, start, seq, end))
973 974 # This is old code that tried to line up all the sequences 975 # in a multiple alignment by using the sequence title's as 976 # identifiers. The problem with this is that BLAST assigns 977 # different HSP's from the same sequence the same id. Thus, 978 # in one alignment block, there may be multiple sequences with 979 # the same id. I'm not sure how to handle this, so I'm not 980 # going to. 981 982 # # If the sequence is the query, then just add it. 983 # if name == 'QUERY': 984 # if len(align) == 0: 985 # align.append((name, start, seq)) 986 # else: 987 # aname, astart, aseq = align[0] 988 # if name != aname: 989 # raise ValueError, "Query is not the first sequence" 990 # aseq = aseq + seq 991 # align[0] = aname, astart, aseq 992 # else: 993 # if len(align) == 0: 994 # raise ValueError, "I could not find the query sequence" 995 # qname, qstart, qseq = align[0] 996 # 997 # # Now find my sequence in the multiple alignment. 998 # for i in range(1, len(align)): 999 # aname, astart, aseq = align[i] 1000 # if name == aname: 1001 # index = i 1002 # break 1003 # else: 1004 # # If I couldn't find it, then add a new one. 1005 # align.append((None, None, None)) 1006 # index = len(align)-1 1007 # # Make sure to left-pad it. 1008 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1009 # 1010 # if len(qseq) != len(aseq) + len(seq): 1011 # # If my sequences are shorter than the query sequence, 1012 # # then I will need to pad some spaces to make them line up. 1013 # # Since I've already right padded seq, that means aseq 1014 # # must be too short. 1015 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1016 # aseq = aseq + seq 1017 # if astart is None: 1018 # astart = start 1019 # align[index] = aname, astart, aseq 1020
1021 - def end_alignment(self):
1022 # Remove trailing newlines 1023 if self._alignment: 1024 self._alignment.title = self._alignment.title.rstrip() 1025 1026 # This code is also obsolete. See note above. 1027 # If there's a multiple alignment, I will need to make sure 1028 # all the sequences are aligned. That is, I may need to 1029 # right-pad the sequences. 1030 # if self._multiple_alignment is not None: 1031 # align = self._multiple_alignment.alignment 1032 # seqlen = None 1033 # for i in range(len(align)): 1034 # name, start, seq = align[i] 1035 # if seqlen is None: 1036 # seqlen = len(seq) 1037 # else: 1038 # if len(seq) < seqlen: 1039 # seq = seq + ' '*(seqlen - len(seq)) 1040 # align[i] = name, start, seq 1041 # elif len(seq) > seqlen: 1042 # raise ValueError, \ 1043 # "Sequence %s is longer than the query" % name 1044 1045 # Clean up some variables, if they exist. 1046 try: 1047 del self._seq_index 1048 del self._seq_length 1049 del self._start_index 1050 del self._start_length 1051 del self._name_length 1052 except AttributeError: 1053 pass
1054
1055 -class _HSPConsumer:
1056 - def start_hsp(self):
1057 self._hsp = Record.HSP()
1058
1059 - def score(self, line):
1060 self._hsp.bits, self._hsp.score = _re_search( 1061 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 1062 "I could not find the score in line\n%s" % line) 1063 self._hsp.score = _safe_float(self._hsp.score) 1064 self._hsp.bits = _safe_float(self._hsp.bits) 1065 1066 x, y = _re_search( 1067 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 1068 "I could not find the expect in line\n%s" % line) 1069 if x: 1070 self._hsp.num_alignments = _safe_int(x) 1071 else: 1072 self._hsp.num_alignments = 1 1073 self._hsp.expect = _safe_float(y)
1074
1075 - def identities(self, line):
1076 x, y = _re_search( 1077 r"Identities = (\d+)\/(\d+)", line, 1078 "I could not find the identities in line\n%s" % line) 1079 self._hsp.identities = _safe_int(x), _safe_int(y) 1080 self._hsp.align_length = _safe_int(y) 1081 1082 if line.find('Positives') != -1: 1083 x, y = _re_search( 1084 r"Positives = (\d+)\/(\d+)", line, 1085 "I could not find the positives in line\n%s" % line) 1086 self._hsp.positives = _safe_int(x), _safe_int(y) 1087 assert self._hsp.align_length == _safe_int(y) 1088 1089 if line.find('Gaps') != -1: 1090 x, y = _re_search( 1091 r"Gaps = (\d+)\/(\d+)", line, 1092 "I could not find the gaps in line\n%s" % line) 1093 self._hsp.gaps = _safe_int(x), _safe_int(y) 1094 assert self._hsp.align_length == _safe_int(y)
1095 1096
1097 - def strand(self, line):
1098 self._hsp.strand = _re_search( 1099 r"Strand = (\w+) / (\w+)", line, 1100 "I could not find the strand in line\n%s" % line)
1101
1102 - def frame(self, line):
1103 # Frame can be in formats: 1104 # Frame = +1 1105 # Frame = +2 / +2 1106 if line.find('/') != -1: 1107 self._hsp.frame = _re_search( 1108 r"Frame = ([-+][123]) / ([-+][123])", line, 1109 "I could not find the frame in line\n%s" % line) 1110 else: 1111 self._hsp.frame = _re_search( 1112 r"Frame = ([-+][123])", line, 1113 "I could not find the frame in line\n%s" % line)
1114 1115 # Match a space, if one is available. Masahir Ishikawa found a 1116 # case where there's no space between the start and the sequence: 1117 # Query: 100tt 101 1118 # line below modified by Yair Benita, Sep 2004 1119 # Note that the colon is not always present. 2006 1120 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
1121 - def query(self, line):
1122 m = self._query_re.search(line) 1123 if m is None: 1124 raise ValueError("I could not find the query in line\n%s" % line) 1125 1126 # line below modified by Yair Benita, Sep 2004. 1127 # added the end attribute for the query 1128 colon, start, seq, end = m.groups() 1129 self._hsp.query = self._hsp.query + seq 1130 if self._hsp.query_start is None: 1131 self._hsp.query_start = _safe_int(start) 1132 1133 # line below added by Yair Benita, Sep 2004. 1134 # added the end attribute for the query 1135 self._hsp.query_end = _safe_int(end) 1136 1137 #Get index for sequence start (regular expression element 3) 1138 self._query_start_index = m.start(3) 1139 self._query_len = len(seq)
1140
1141 - def align(self, line):
1142 seq = line[self._query_start_index:].rstrip() 1143 if len(seq) < self._query_len: 1144 # Make sure the alignment is the same length as the query 1145 seq = seq + ' ' * (self._query_len-len(seq)) 1146 elif len(seq) < self._query_len: 1147 raise ValueError("Match is longer than the query in line\n%s" \ 1148 % line) 1149 self._hsp.match = self._hsp.match + seq
1150 1151 # To match how we do the query, cache the regular expression. 1152 # Note that the colon is not always present. 1153 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
1154 - def sbjct(self, line):
1155 m = self._sbjct_re.search(line) 1156 if m is None: 1157 raise ValueError("I could not find the sbjct in line\n%s" % line) 1158 colon, start, seq, end = m.groups() 1159 #mikep 26/9/00 1160 #On occasion, there is a blast hit with no subject match 1161 #so far, it only occurs with 1-line short "matches" 1162 #I have decided to let these pass as they appear 1163 if not seq.strip(): 1164 seq = ' ' * self._query_len 1165 self._hsp.sbjct = self._hsp.sbjct + seq 1166 if self._hsp.sbjct_start is None: 1167 self._hsp.sbjct_start = _safe_int(start) 1168 1169 self._hsp.sbjct_end = _safe_int(end) 1170 if len(seq) != self._query_len: 1171 raise ValueError( \ 1172 "QUERY and SBJCT sequence lengths don't match in line\n%s" \ 1173 % line) 1174 1175 del self._query_start_index # clean up unused variables 1176 del self._query_len
1177
1178 - def end_hsp(self):
1179 pass
1180
1181 -class _DatabaseReportConsumer:
1182
1183 - def start_database_report(self):
1184 self._dr = Record.DatabaseReport()
1185
1186 - def database(self, line):
1187 m = re.search(r"Database: (.+)$", line) 1188 if m: 1189 self._dr.database_name.append(m.group(1)) 1190 elif self._dr.database_name: 1191 # This must be a continuation of the previous name. 1192 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1193 line.strip())
1194
1195 - def posted_date(self, line):
1196 self._dr.posted_date.append(_re_search( 1197 r"Posted date:\s*(.+)$", line, 1198 "I could not find the posted date in line\n%s" % line))
1199
1200 - def num_letters_in_database(self, line):
1201 letters, = _get_cols( 1202 line, (-1,), ncols=6, expected={2:"letters", 4:"database:"}) 1203 self._dr.num_letters_in_database.append(_safe_int(letters))
1204
1205 - def num_sequences_in_database(self, line):
1206 sequences, = _get_cols( 1207 line, (-1,), ncols=6, expected={2:"sequences", 4:"database:"}) 1208 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1209
1210 - def ka_params(self, line):
1211 x = line.split() 1212 self._dr.ka_params = map(_safe_float, x)
1213
1214 - def gapped(self, line):
1215 self._dr.gapped = 1
1216
1217 - def ka_params_gap(self, line):
1218 x = line.split() 1219 self._dr.ka_params_gap = map(_safe_float, x)
1220
1221 - def end_database_report(self):
1222 pass
1223
1224 -class _ParametersConsumer:
1225 - def start_parameters(self):
1226 self._params = Record.Parameters()
1227
1228 - def matrix(self, line):
1229 self._params.matrix = line[8:].rstrip()
1230
1231 - def gap_penalties(self, line):
1232 x = _get_cols( 1233 line, (3, 5), ncols=6, expected={2:"Existence:", 4:"Extension:"}) 1234 self._params.gap_penalties = map(_safe_float, x)
1235
1236 - def num_hits(self, line):
1237 if line.find('1st pass') != -1: 1238 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"}) 1239 self._params.num_hits = _safe_int(x) 1240 else: 1241 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"}) 1242 self._params.num_hits = _safe_int(x)
1243
1244 - def num_sequences(self, line):
1245 if line.find('1st pass') != -1: 1246 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"}) 1247 self._params.num_sequences = _safe_int(x) 1248 else: 1249 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"}) 1250 self._params.num_sequences = _safe_int(x)
1251
1252 - def num_extends(self, line):
1253 if line.find('1st pass') != -1: 1254 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"}) 1255 self._params.num_extends = _safe_int(x) 1256 else: 1257 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"}) 1258 self._params.num_extends = _safe_int(x)
1259
1260 - def num_good_extends(self, line):
1261 if line.find('1st pass') != -1: 1262 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"}) 1263 self._params.num_good_extends = _safe_int(x) 1264 else: 1265 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"}) 1266 self._params.num_good_extends = _safe_int(x)
1267
1268 - def num_seqs_better_e(self, line):
1269 self._params.num_seqs_better_e, = _get_cols( 1270 line, (-1,), ncols=7, expected={2:"sequences"}) 1271 self._params.num_seqs_better_e = _safe_int( 1272 self._params.num_seqs_better_e)
1273
1274 - def hsps_no_gap(self, line):
1275 self._params.hsps_no_gap, = _get_cols( 1276 line, (-1,), ncols=9, expected={3:"better", 7:"gapping:"}) 1277 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1278
1279 - def hsps_prelim_gapped(self, line):
1280 self._params.hsps_prelim_gapped, = _get_cols( 1281 line, (-1,), ncols=9, expected={4:"gapped", 6:"prelim"}) 1282 self._params.hsps_prelim_gapped = _safe_int( 1283 self._params.hsps_prelim_gapped)
1284
1285 - def hsps_prelim_gapped_attempted(self, line):
1286 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1287 line, (-1,), ncols=10, expected={4:"attempted", 7:"prelim"}) 1288 self._params.hsps_prelim_gapped_attempted = _safe_int( 1289 self._params.hsps_prelim_gapped_attempted)
1290
1291 - def hsps_gapped(self, line):
1292 self._params.hsps_gapped, = _get_cols( 1293 line, (-1,), ncols=6, expected={3:"gapped"}) 1294 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1295
1296 - def query_length(self, line):
1297 self._params.query_length, = _get_cols( 1298 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"query:"}) 1299 self._params.query_length = _safe_int(self._params.query_length)
1300
1301 - def database_length(self, line):
1302 self._params.database_length, = _get_cols( 1303 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"database:"}) 1304 self._params.database_length = _safe_int(self._params.database_length)
1305
1306 - def effective_hsp_length(self, line):
1307 self._params.effective_hsp_length, = _get_cols( 1308 line, (-1,), ncols=4, expected={1:"HSP", 2:"length:"}) 1309 self._params.effective_hsp_length = _safe_int( 1310 self._params.effective_hsp_length)
1311
1312 - def effective_query_length(self, line):
1313 self._params.effective_query_length, = _get_cols( 1314 line, (-1,), ncols=5, expected={1:"length", 3:"query:"}) 1315 self._params.effective_query_length = _safe_int( 1316 self._params.effective_query_length)
1317
1318 - def effective_database_length(self, line):
1319 self._params.effective_database_length, = _get_cols( 1320 line.lower(), (-1,), ncols=5, expected={1:"length", 3:"database:"}) 1321 self._params.effective_database_length = _safe_int( 1322 self._params.effective_database_length)
1323
1324 - def effective_search_space(self, line):
1325 self._params.effective_search_space, = _get_cols( 1326 line, (-1,), ncols=4, expected={1:"search"}) 1327 self._params.effective_search_space = _safe_int( 1328 self._params.effective_search_space)
1329
1330 - def effective_search_space_used(self, line):
1331 self._params.effective_search_space_used, = _get_cols( 1332 line, (-1,), ncols=5, expected={1:"search", 3:"used:"}) 1333 self._params.effective_search_space_used = _safe_int( 1334 self._params.effective_search_space_used)
1335
1336 - def frameshift(self, line):
1337 self._params.frameshift = _get_cols( 1338 line, (4, 5), ncols=6, expected={0:"frameshift", 2:"decay"})
1339
1340 - def threshold(self, line):
1341 if line[:2] == "T:" : 1342 #Assume its an old stlye line like "T: 123" 1343 self._params.threshold, = _get_cols( 1344 line, (1,), ncols=2, expected={0:"T:"}) 1345 elif line[:28] == "Neighboring words threshold:" : 1346 self._params.threshold, = _get_cols( 1347 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"}) 1348 else : 1349 raise ValueError("Unrecognised threshold line:\n%s" % line) 1350 self._params.threshold = _safe_int(self._params.threshold)
1351
1352 - def window_size(self, line):
1353 if line[:2] == "A:" : 1354 self._params.window_size, = _get_cols( 1355 line, (1,), ncols=2, expected={0:"A:"}) 1356 elif line[:25] == "Window for multiple hits:" : 1357 self._params.window_size, = _get_cols( 1358 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"}) 1359 else : 1360 raise ValueError("Unrecognised window size line:\n%s" % line) 1361 self._params.window_size = _safe_int(self._params.window_size)
1362
1363 - def dropoff_1st_pass(self, line):
1364 score, bits = _re_search( 1365 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1366 "I could not find the dropoff in line\n%s" % line) 1367 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1368
1369 - def gap_x_dropoff(self, line):
1370 score, bits = _re_search( 1371 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1372 "I could not find the gap dropoff in line\n%s" % line) 1373 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1374
1375 - def gap_x_dropoff_final(self, line):
1376 score, bits = _re_search( 1377 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1378 "I could not find the gap dropoff final in line\n%s" % line) 1379 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1380
1381 - def gap_trigger(self, line):
1382 score, bits = _re_search( 1383 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1384 "I could not find the gap trigger in line\n%s" % line) 1385 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1386
1387 - def blast_cutoff(self, line):
1388 score, bits = _re_search( 1389 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1390 "I could not find the blast cutoff in line\n%s" % line) 1391 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1392
1393 - def end_parameters(self):
1394 pass
1395 1396
1397 -class _BlastConsumer(AbstractConsumer, 1398 _HeaderConsumer, 1399 _DescriptionConsumer, 1400 _AlignmentConsumer, 1401 _HSPConsumer, 1402 _DatabaseReportConsumer, 1403 _ParametersConsumer 1404 ):
1405 # This Consumer is inherits from many other consumer classes that handle 1406 # the actual dirty work. An alternate way to do it is to create objects 1407 # of those classes and then delegate the parsing tasks to them in a 1408 # decorator-type pattern. The disadvantage of that is that the method 1409 # names will need to be resolved in this classes. However, using 1410 # a decorator will retain more control in this class (which may or 1411 # may not be a bad thing). In addition, having each sub-consumer as 1412 # its own object prevents this object's dictionary from being cluttered 1413 # with members and reduces the chance of member collisions.
1414 - def __init__(self):
1415 self.data = None
1416
1417 - def round(self, line):
1418 # Make sure nobody's trying to pass me PSI-BLAST data! 1419 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1420
1421 - def start_header(self):
1422 self.data = Record.Blast() 1423 _HeaderConsumer.start_header(self)
1424
1425 - def end_header(self):
1426 _HeaderConsumer.end_header(self) 1427 self.data.__dict__.update(self._header.__dict__)
1428
1429 - def end_descriptions(self):
1430 self.data.descriptions = self._descriptions
1431
1432 - def end_alignment(self):
1433 _AlignmentConsumer.end_alignment(self) 1434 if self._alignment.hsps: 1435 self.data.alignments.append(self._alignment) 1436 if self._multiple_alignment.alignment: 1437 self.data.multiple_alignment = self._multiple_alignment
1438
1439 - def end_hsp(self):
1440 _HSPConsumer.end_hsp(self) 1441 try: 1442 self._alignment.hsps.append(self._hsp) 1443 except AttributeError: 1444 raise ValueError("Found an HSP before an alignment")
1445
1446 - def end_database_report(self):
1447 _DatabaseReportConsumer.end_database_report(self) 1448 self.data.__dict__.update(self._dr.__dict__)
1449
1450 - def end_parameters(self):
1451 _ParametersConsumer.end_parameters(self) 1452 self.data.__dict__.update(self._params.__dict__)
1453
1454 -class _PSIBlastConsumer(AbstractConsumer, 1455 _HeaderConsumer, 1456 _DescriptionConsumer, 1457 _AlignmentConsumer, 1458 _HSPConsumer, 1459 _DatabaseReportConsumer, 1460 _ParametersConsumer 1461 ):
1462 - def __init__(self):
1463 self.data = None
1464
1465 - def start_header(self):
1466 self.data = Record.PSIBlast() 1467 _HeaderConsumer.start_header(self)
1468
1469 - def end_header(self):
1470 _HeaderConsumer.end_header(self) 1471 self.data.__dict__.update(self._header.__dict__)
1472
1473 - def start_descriptions(self):
1474 self._round = Record.Round() 1475 self.data.rounds.append(self._round) 1476 _DescriptionConsumer.start_descriptions(self)
1477
1478 - def end_descriptions(self):
1479 _DescriptionConsumer.end_descriptions(self) 1480 self._round.number = self._roundnum 1481 if self._descriptions: 1482 self._round.new_seqs.extend(self._descriptions) 1483 self._round.reused_seqs.extend(self._model_sequences) 1484 self._round.new_seqs.extend(self._nonmodel_sequences) 1485 if self._converged: 1486 self.data.converged = 1
1487
1488 - def end_alignment(self):
1489 _AlignmentConsumer.end_alignment(self) 1490 if self._alignment.hsps: 1491 self._round.alignments.append(self._alignment) 1492 if self._multiple_alignment: 1493 self._round.multiple_alignment = self._multiple_alignment
1494
1495 - def end_hsp(self):
1496 _HSPConsumer.end_hsp(self) 1497 try: 1498 self._alignment.hsps.append(self._hsp) 1499 except AttributeError: 1500 raise ValueError("Found an HSP before an alignment")
1501
1502 - def end_database_report(self):
1503 _DatabaseReportConsumer.end_database_report(self) 1504 self.data.__dict__.update(self._dr.__dict__)
1505
1506 - def end_parameters(self):
1507 _ParametersConsumer.end_parameters(self) 1508 self.data.__dict__.update(self._params.__dict__)
1509
1510 -class Iterator:
1511 """Iterates over a file of multiple BLAST results. 1512 1513 Methods: 1514 next Return the next record from the stream, or None. 1515 1516 """
1517 - def __init__(self, handle, parser=None):
1518 """__init__(self, handle, parser=None) 1519 1520 Create a new iterator. handle is a file-like object. parser 1521 is an optional Parser object to change the results into another form. 1522 If set to None, then the raw contents of the file will be returned. 1523 1524 """ 1525 try: 1526 handle.readline 1527 except AttributeError: 1528 raise ValueError( 1529 "I expected a file handle or file-like object, got %s" 1530 % type(handle)) 1531 self._uhandle = File.UndoHandle(handle) 1532 self._parser = parser
1533
1534 - def next(self):
1535 """next(self) -> object 1536 1537 Return the next Blast record from the file. If no more records, 1538 return None. 1539 1540 """ 1541 lines = [] 1542 while 1: 1543 line = self._uhandle.readline() 1544 if not line: 1545 break 1546 # If I've reached the next one, then put the line back and stop. 1547 if lines and (line.startswith('BLAST') 1548 or line.startswith('BLAST', 1) 1549 or line.startswith('<?xml ')): 1550 self._uhandle.saveline(line) 1551 break 1552 lines.append(line) 1553 1554 if not lines: 1555 return None 1556 1557 data = ''.join(lines) 1558 if self._parser is not None: 1559 return self._parser.parse(File.StringHandle(data)) 1560 return data
1561
1562 - def __iter__(self):
1563 return iter(self.next, None)
1564
1565 -def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
1566 """Execute and retrieve data from standalone BLASTPALL as handles. 1567 1568 Execute and retrieve data from blastall. blastcmd is the command 1569 used to launch the 'blastall' executable. program is the blast program 1570 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database 1571 to search against. infile is the path to the file containing 1572 the sequence to search with. 1573 1574 The return values are two handles, for standard output and standard error. 1575 1576 You may pass more parameters to **keywds to change the behavior of 1577 the search. Otherwise, optional values will be chosen by blastall. 1578 The Blast output is by default in XML format. Use the align_view keyword 1579 for output in a different format. 1580 1581 Scoring 1582 matrix Matrix to use. 1583 gap_open Gap open penalty. 1584 gap_extend Gap extension penalty. 1585 nuc_match Nucleotide match reward. (BLASTN) 1586 nuc_mismatch Nucleotide mismatch penalty. (BLASTN) 1587 query_genetic_code Genetic code for Query. 1588 db_genetic_code Genetic code for database. (TBLAST[NX]) 1589 1590 Algorithm 1591 gapped Whether to do a gapped alignment. T/F (not for TBLASTX) 1592 expectation Expectation value cutoff. 1593 wordsize Word size. 1594 strands Query strands to search against database.([T]BLAST[NX]) 1595 keep_hits Number of best hits from a region to keep. 1596 xdrop Dropoff value (bits) for gapped alignments. 1597 hit_extend Threshold for extending hits. 1598 region_length Length of region used to judge hits. 1599 db_length Effective database length. 1600 search_length Effective length of search space. 1601 1602 Processing 1603 filter Filter query sequence for low complexity (with SEG)? T/F 1604 believe_query Believe the query defline. T/F 1605 restrict_gi Restrict search to these GI's. 1606 nprocessors Number of processors to use. 1607 oldengine Force use of old engine T/F 1608 1609 Formatting 1610 html Produce HTML output? T/F 1611 descriptions Number of one-line descriptions. 1612 alignments Number of alignments. 1613 align_view Alignment view. Integer 0-11, 1614 passed as a string or integer. 1615 show_gi Show GI's in deflines? T/F 1616 seqalign_file seqalign file to output. 1617 1618 """ 1619 1620 _security_check_parameters(keywds) 1621 1622 att2param = { 1623 'matrix' : '-M', 1624 'gap_open' : '-G', 1625 'gap_extend' : '-E', 1626 'nuc_match' : '-r', 1627 'nuc_mismatch' : '-q', 1628 'query_genetic_code' : '-Q', 1629 'db_genetic_code' : '-D', 1630 1631 'gapped' : '-g', 1632 'expectation' : '-e', 1633 'wordsize' : '-W', 1634 'strands' : '-S', 1635 'keep_hits' : '-K', 1636 'xdrop' : '-X', 1637 'hit_extend' : '-f', 1638 'region_length' : '-L', 1639 'db_length' : '-z', 1640 'search_length' : '-Y', 1641 1642 'program' : '-p', 1643 'database' : '-d', 1644 'infile' : '-i', 1645 'filter' : '-F', 1646 'believe_query' : '-J', 1647 'restrict_gi' : '-l', 1648 'nprocessors' : '-a', 1649 'oldengine' : '-V', 1650 1651 'html' : '-T', 1652 'descriptions' : '-v', 1653 'alignments' : '-b', 1654 'align_view' : '-m', 1655 'show_gi' : '-I', 1656 'seqalign_file' : '-O' 1657 } 1658 1659 params = [] 1660 params.extend([att2param['program'], program]) 1661 params.extend([att2param['database'], database]) 1662 params.extend([att2param['infile'], _escape_filename(infile)]) 1663 params.extend([att2param['align_view'], str(align_view)]) 1664 1665 for attr in keywds.keys(): 1666 params.extend([att2param[attr], str(keywds[attr])]) 1667 1668 return _invoke_blast(blastcmd, params)
1669
1670 -def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
1671 """Execute and retrieve data from standalone BLASTPGP as handles. 1672 1673 Execute and retrieve data from blastpgp. blastcmd is the command 1674 used to launch the 'blastpgp' executable. database is the path to the 1675 database to search against. infile is the path to the file containing 1676 the sequence to search with. 1677 1678 The return values are two handles, for standard output and standard error. 1679 1680 You may pass more parameters to **keywds to change the behavior of 1681 the search. Otherwise, optional values will be chosen by blastpgp. 1682 The Blast output is by default in XML format. Use the align_view keyword 1683 for output in a different format. 1684 1685 Scoring 1686 matrix Matrix to use. 1687 gap_open Gap open penalty. 1688 gap_extend Gap extension penalty. 1689 window_size Multiple hits window size. 1690 npasses Number of passes. 1691 passes Hits/passes. Integer 0-2. 1692 1693 Algorithm 1694 gapped Whether to do a gapped alignment. T/F 1695 expectation Expectation value cutoff. 1696 wordsize Word size. 1697 keep_hits Number of beset hits from a region to keep. 1698 xdrop Dropoff value (bits) for gapped alignments. 1699 hit_extend Threshold for extending hits. 1700 region_length Length of region used to judge hits. 1701 db_length Effective database length. 1702 search_length Effective length of search space. 1703 nbits_gapping Number of bits to trigger gapping. 1704 pseudocounts Pseudocounts constants for multiple passes. 1705 xdrop_final X dropoff for final gapped alignment. 1706 xdrop_extension Dropoff for blast extensions. 1707 model_threshold E-value threshold to include in multipass model. 1708 required_start Start of required region in query. 1709 required_end End of required region in query. 1710 1711 Processing 1712 XXX should document default values 1713 program The blast program to use. (PHI-BLAST) 1714 filter Filter query sequence for low complexity (with SEG)? T/F 1715 believe_query Believe the query defline? T/F 1716 nprocessors Number of processors to use. 1717 1718 Formatting 1719 html Produce HTML output? T/F 1720 descriptions Number of one-line descriptions. 1721 alignments Number of alignments. 1722 align_view Alignment view. Integer 0-11, 1723 passed as a string or integer. 1724 show_gi Show GI's in deflines? T/F 1725 seqalign_file seqalign file to output. 1726 align_outfile Output file for alignment. 1727 checkpoint_outfile Output file for PSI-BLAST checkpointing. 1728 restart_infile Input file for PSI-BLAST restart. 1729 hit_infile Hit file for PHI-BLAST. 1730 matrix_outfile Output file for PSI-BLAST matrix in ASCII. 1731 1732 align_infile Input alignment file for PSI-BLAST restart. 1733 1734 """ 1735 1736 _security_check_parameters(keywds) 1737 1738 att2param = { 1739 'matrix' : '-M', 1740 'gap_open' : '-G', 1741 'gap_extend' : '-E', 1742 'window_size' : '-A', 1743 'npasses' : '-j', 1744 'passes' : '-P', 1745 1746 'gapped' : '-g', 1747 'expectation' : '-e', 1748 'wordsize' : '-W', 1749 'keep_hits' : '-K', 1750 'xdrop' : '-X', 1751 'hit_extend' : '-f', 1752 'region_length' : '-L', 1753 'db_length' : '-Z', 1754 'search_length' : '-Y', 1755 'nbits_gapping' : '-N', 1756 'pseudocounts' : '-c', 1757 'xdrop_final' : '-Z', 1758 'xdrop_extension' : '-y', 1759 'model_threshold' : '-h', 1760 'required_start' : '-S', 1761 'required_end' : '-H', 1762 1763 'program' : '-p', 1764 'database' : '-d', 1765 'infile' : '-i', 1766 'filter' : '-F', 1767 'believe_query' : '-J', 1768 'nprocessors' : '-a', 1769 1770 'html' : '-T', 1771 'descriptions' : '-v', 1772 'alignments' : '-b', 1773 'align_view' : '-m', 1774 'show_gi' : '-I', 1775 'seqalign_file' : '-O', 1776 'align_outfile' : '-o', 1777 'checkpoint_outfile' : '-C', 1778 'restart_infile' : '-R', 1779 'hit_infile' : '-k', 1780 'matrix_outfile' : '-Q', 1781 'align_infile' : '-B' 1782 } 1783 1784 params = [] 1785 params.extend([att2param['database'], database]) 1786 params.extend([att2param['infile'], _escape_filename(infile)]) 1787 params.extend([att2param['align_view'], str(align_view)]) 1788 1789 for attr in keywds.keys(): 1790 params.extend([att2param[attr], str(keywds[attr])]) 1791 1792 return _invoke_blast(blastcmd, params)
1793
1794 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1795 """Execute and retrieve data from standalone RPS-BLAST as handles. 1796 1797 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the 1798 command used to launch the 'rpsblast' executable. database is the path 1799 to the database to search against. infile is the path to the file 1800 containing the sequence to search with. 1801 1802 The return values are two handles, for standard output and standard error. 1803 1804 You may pass more parameters to **keywds to change the behavior of 1805 the search. Otherwise, optional values will be chosen by rpsblast. 1806 1807 Please note that this function will give XML output by default, by 1808 setting align_view to seven (i.e. command line option -m 7). 1809 You should use the NCBIXML.parse() function to read the resulting output. 1810 This is because NCBIStandalone.BlastParser() does not understand the 1811 plain text output format from rpsblast. 1812 1813 WARNING - The following text and associated parameter handling has not 1814 received extensive testing. Please report any errors we might have made... 1815 1816 Algorithm/Scoring 1817 gapped Whether to do a gapped alignment. T/F 1818 multihit 0 for multiple hit (default), 1 for single hit 1819 expectation Expectation value cutoff. 1820 range_restriction Range restriction on query sequence (Format: start,stop) blastp only 1821 0 in 'start' refers to the beginning of the sequence 1822 0 in 'stop' refers to the end of the sequence 1823 Default = 0,0 1824 xdrop Dropoff value (bits) for gapped alignments. 1825 xdrop_final X dropoff for final gapped alignment (in bits). 1826 xdrop_extension Dropoff for blast extensions (in bits). 1827 search_length Effective length of search space. 1828 nbits_gapping Number of bits to trigger gapping. 1829 protein Query sequence is protein. T/F 1830 db_length Effective database length. 1831 1832 Processing 1833 filter Filter query sequence for low complexity? T/F 1834 case_filter Use lower case filtering of FASTA sequence T/F, default F 1835 believe_query Believe the query defline. T/F 1836 nprocessors Number of processors to use. 1837 logfile Name of log file to use, default rpsblast.log 1838 1839 Formatting 1840 html Produce HTML output? T/F 1841 descriptions Number of one-line descriptions. 1842 alignments Number of alignments. 1843 align_view Alignment view. Integer 0-11, 1844 passed as a string or integer. 1845 show_gi Show GI's in deflines? T/F 1846 seqalign_file seqalign file to output. 1847 align_outfile Output file for alignment. 1848 1849 """ 1850 1851 _security_check_parameters(keywds) 1852 1853 att2param = { 1854 'multihit' : '-P', 1855 'gapped' : '-g', 1856 'expectation' : '-e', 1857 'range_restriction' : '-L', 1858 'xdrop' : '-X', 1859 'xdrop_final' : '-Z', 1860 'xdrop_extension' : '-y', 1861 'search_length' : '-Y', 1862 'nbits_gapping' : '-N', 1863 'protein' : '-p', 1864 'db_length' : '-z', 1865 1866 'database' : '-d', 1867 'infile' : '-i', 1868 'filter' : '-F', 1869 'case_filter' : '-U', 1870 'believe_query' : '-J', 1871 'nprocessors' : '-a', 1872 'logfile' : '-l', 1873 1874 'html' : '-T', 1875 'descriptions' : '-v', 1876 'alignments' : '-b', 1877 'align_view' : '-m', 1878 'show_gi' : '-I', 1879 'seqalign_file' : '-O', 1880 'align_outfile' : '-o' 1881 } 1882 1883 params = [] 1884 1885 params.extend([att2param['database'], database]) 1886 params.extend([att2param['infile'], _escape_filename(infile)]) 1887 params.extend([att2param['align_view'], str(align_view)]) 1888 1889 for attr in keywds.keys(): 1890 params.extend([att2param[attr], str(keywds[attr])]) 1891 1892 return _invoke_blast(blastcmd, params)
1893
1894 -def _re_search(regex, line, error_msg):
1895 m = re.search(regex, line) 1896 if not m: 1897 raise ValueError(error_msg) 1898 return m.groups()
1899
1900 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
1901 cols = line.split() 1902 1903 # Check to make sure number of columns is correct 1904 if ncols is not None and len(cols) != ncols: 1905 raise ValueError("I expected %d columns (got %d) in line\n%s" \ 1906 % (ncols, len(cols), line)) 1907 1908 # Check to make sure columns contain the correct data 1909 for k in expected.keys(): 1910 if cols[k] != expected[k]: 1911 raise ValueError("I expected '%s' in column %d in line\n%s" \ 1912 % (expected[k], k, line)) 1913 1914 # Construct the answer tuple 1915 results = [] 1916 for c in cols_to_get: 1917 results.append(cols[c]) 1918 return tuple(results)
1919
1920 -def _safe_int(str):
1921 try: 1922 return int(str) 1923 except ValueError: 1924 # Something went wrong. Try to clean up the string. 1925 # Remove all commas from the string 1926 str = str.replace(',', '') 1927 try: 1928 # try again. 1929 return int(str) 1930 except ValueError: 1931 pass 1932 # If it fails again, maybe it's too long? 1933 # XXX why converting to float? 1934 return long(float(str))
1935
1936 -def _safe_float(str):
1937 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 1938 # float('e-172') does not produce an error on his platform. Thus, 1939 # we need to check the string for this condition. 1940 1941 # Sometimes BLAST leaves of the '1' in front of an exponent. 1942 if str and str[0] in ['E', 'e']: 1943 str = '1' + str 1944 try: 1945 return float(str) 1946 except ValueError: 1947 # Remove all commas from the string 1948 str = str.replace(',', '') 1949 # try again. 1950 return float(str)
1951
1952 -def _escape_filename(filename) :
1953 """Escape filenames with spaces (PRIVATE).""" 1954 if " " not in filename : 1955 return filename 1956 1957 #See Bug 2480 - is adding the following helpful? 1958 #if os.path.isfile(filename) : 1959 # #On Windows, if the file exists, we can ask for 1960 # #its alternative short name (DOS style 8.3 format) 1961 # #which has no spaces in it. 1962 # try : 1963 # import win32api 1964 # short = win32api.GetShortPathName(filename) 1965 # assert os.path.isfile(short) 1966 # return short 1967 # except ImportError : 1968 # pass 1969 1970 #We'll just quote it - works on Mac etc 1971 if filename.startswith('"') and filename.endswith('"') : 1972 #Its already quoted 1973 return filename 1974 else : 1975 return '"%s"' % filename
1976
1977 -def _invoke_blast(blast_cmd, params) :
1978 """Start BLAST and returns handles for stdout and stderr (PRIVATE). 1979 1980 Tries to deal with spaces in the BLAST executable path. 1981 """ 1982 if not os.path.exists(blast_cmd): 1983 raise ValueError("BLAST executable does not exist at %s" % blast_cmd) 1984 1985 cmd_string = " ".join([_escape_filename(blast_cmd)] + params) 1986 1987 try : 1988 import subprocess, sys 1989 blast_process = subprocess.Popen(cmd_string, 1990 stdout=subprocess.PIPE, 1991 stderr=subprocess.PIPE, 1992 shell=(sys.platform!="win32")) 1993 return blast_process.stdout, blast_process.stderr 1994 except ImportError : 1995 #subprocess isn't available on python 2.3 1996 #Note os.popen3 is deprecated in python 2.6 1997 write_handle, result_handle, error_handle \ 1998 = os.popen3(cmd_string) 1999 write_handle.close() 2000 return result_handle, error_handle
2001
2002 -def _security_check_parameters(param_dict) :
2003 """Look for any attempt to insert a command into a parameter. 2004 2005 e.g. blastall(..., matrix='IDENTITY -F 0; rm -rf /etc/passwd') 2006 2007 Looks for ";" or "&&" in the strings (Unix and Windows syntax 2008 for appending a command line), or ">", "<" or "|" (redirection) 2009 and if any are found raises an exception. 2010 """ 2011 for key, value in param_dict.iteritems() : 2012 str_value = str(value) # Could easily be an int or a float 2013 for bad_str in [";", "&&", ">", "<", "|"] : 2014 if bad_str in str_value : 2015 raise ValueError("Rejecting suspicious argument for %s" % key)
2016
2017 -class _BlastErrorConsumer(_BlastConsumer):
2018 - def __init__(self):
2020 - def noevent(self, line):
2021 if line.find("Query must be at least wordsize") != -1: 2022 raise ShortQueryBlastError("Query must be at least wordsize") 2023 # Now pass the line back up to the superclass. 2024 method = getattr(_BlastConsumer, 'noevent', 2025 _BlastConsumer.__getattr__(self, 'noevent')) 2026 method(line)
2027
2028 -class BlastErrorParser(AbstractParser):
2029 """Attempt to catch and diagnose BLAST errors while parsing. 2030 2031 This utilizes the BlastParser module but adds an additional layer 2032 of complexity on top of it by attempting to diagnose ValueErrors 2033 that may actually indicate problems during BLAST parsing. 2034 2035 Current BLAST problems this detects are: 2036 o LowQualityBlastError - When BLASTing really low quality sequences 2037 (ie. some GenBank entries which are just short streches of a single 2038 nucleotide), BLAST will report an error with the sequence and be 2039 unable to search with this. This will lead to a badly formatted 2040 BLAST report that the parsers choke on. The parser will convert the 2041 ValueError to a LowQualityBlastError and attempt to provide useful 2042 information. 2043 2044 """
2045 - def __init__(self, bad_report_handle = None):
2046 """Initialize a parser that tries to catch BlastErrors. 2047 2048 Arguments: 2049 o bad_report_handle - An optional argument specifying a handle 2050 where bad reports should be sent. This would allow you to save 2051 all of the bad reports to a file, for instance. If no handle 2052 is specified, the bad reports will not be saved. 2053 """ 2054 self._bad_report_handle = bad_report_handle 2055 2056 #self._b_parser = BlastParser() 2057 self._scanner = _Scanner() 2058 self._consumer = _BlastErrorConsumer()
2059
2060 - def parse(self, handle):
2061 """Parse a handle, attempting to diagnose errors. 2062 """ 2063 results = handle.read() 2064 2065 try: 2066 self._scanner.feed(File.StringHandle(results), self._consumer) 2067 except ValueError, msg: 2068 # if we have a bad_report_file, save the info to it first 2069 if self._bad_report_handle: 2070 # send the info to the error handle 2071 self._bad_report_handle.write(results) 2072 2073 # now we want to try and diagnose the error 2074 self._diagnose_error( 2075 File.StringHandle(results), self._consumer.data) 2076 2077 # if we got here we can't figure out the problem 2078 # so we should pass along the syntax error we got 2079 raise 2080 return self._consumer.data
2081
2082 - def _diagnose_error(self, handle, data_record):
2083 """Attempt to diagnose an error in the passed handle. 2084 2085 Arguments: 2086 o handle - The handle potentially containing the error 2087 o data_record - The data record partially created by the consumer. 2088 """ 2089 line = handle.readline() 2090 2091 while line: 2092 # 'Searchingdone' instead of 'Searching......done' seems 2093 # to indicate a failure to perform the BLAST due to 2094 # low quality sequence 2095 if line.startswith('Searchingdone'): 2096 raise LowQualityBlastError("Blast failure occured on query: ", 2097 data_record.query) 2098 line = handle.readline()
2099