Package Bio :: Package Blast :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6  # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 
   7   
   8  """Code for calling standalone BLAST and parsing plain text output (OBSOLETE). 
   9   
  10  Rather than parsing the human readable plain text BLAST output (which seems to 
  11  change with every update to BLAST), we and the NBCI recommend you parse the 
  12  XML output instead. The plain text parser in this module still works at the 
  13  time of writing, but is considered obsolete and updating it to cope with the 
  14  latest versions of BLAST is not a priority for us. 
  15   
  16  This module also provides code to work with the "legacy" standalone version of 
  17  NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of 
  18  the same name. These functions are very limited for dealing with the output as 
  19  files rather than handles, for which the wrappers in Bio.Blast.Applications are 
  20  prefered. Furthermore, the NCBI themselves regard these command line tools as 
  21  "legacy", and encourage using the new BLAST+ tools instead. Biopython has 
  22  wrappers for these under Bio.Blast.Applications (see the tutorial). 
  23   
  24  Classes: 
  25  LowQualityBlastError     Except that indicates low quality query sequences. 
  26  BlastParser              Parses output from blast. 
  27  BlastErrorParser         Parses output and tries to diagnose possible errors. 
  28  PSIBlastParser           Parses output from psi-blast. 
  29  Iterator                 Iterates over a file of blast results. 
  30   
  31  _Scanner                 Scans output from standalone BLAST. 
  32  _BlastConsumer           Consumes output from blast. 
  33  _PSIBlastConsumer        Consumes output from psi-blast. 
  34  _HeaderConsumer          Consumes header information. 
  35  _DescriptionConsumer     Consumes description information. 
  36  _AlignmentConsumer       Consumes alignment information. 
  37  _HSPConsumer             Consumes hsp information. 
  38  _DatabaseReportConsumer  Consumes database report information. 
  39  _ParametersConsumer      Consumes parameters information. 
  40   
  41  Functions: 
  42  blastall        Execute blastall (OBSOLETE). 
  43  blastpgp        Execute blastpgp (OBSOLETE). 
  44  rpsblast        Execute rpsblast (OBSOLETE). 
  45   
  46  For calling the BLAST command line tools, we encourage you to use the 
  47  command line wrappers in Bio.Blast.Applications - the three functions 
  48  blastall, blastpgp and rpsblast are considered to be obsolete now, and 
  49  are likely to be deprecated and then removed in future releases. 
  50  """ 
  51   
  52  import warnings 
  53  warnings.warn("The plain text parser in this module still works at the time of writing, but is considered obsolete and updating it to cope with the latest versions of BLAST is not a priority for us.", PendingDeprecationWarning) 
  54   
  55  import os 
  56  import re 
  57   
  58  from Bio import File 
  59  from Bio.ParserSupport import * 
  60  from Bio.Blast import Record 
  61  from Bio.Application import _escape_filename 
  62   
63 -class LowQualityBlastError(Exception):
64 """Error caused by running a low quality sequence through BLAST. 65 66 When low quality sequences (like GenBank entries containing only 67 stretches of a single nucleotide) are BLASTed, they will result in 68 BLAST generating an error and not being able to perform the BLAST. 69 search. This error should be raised for the BLAST reports produced 70 in this case. 71 """ 72 pass
73
74 -class ShortQueryBlastError(Exception):
75 """Error caused by running a short query sequence through BLAST. 76 77 If the query sequence is too short, BLAST outputs warnings and errors: 78 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 79 [blastall] ERROR: [000.000] AT1G08320: Blast: 80 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 81 done 82 83 This exception is raised when that condition is detected. 84 85 """ 86 pass
87 88
89 -class _Scanner:
90 """Scan BLAST output from blastall or blastpgp. 91 92 Tested with blastall and blastpgp v2.0.10, v2.0.11 93 94 Methods: 95 feed Feed data into the scanner. 96 97 """
98 - def feed(self, handle, consumer):
99 """S.feed(handle, consumer) 100 101 Feed in a BLAST report for scanning. handle is a file-like 102 object that contains the BLAST report. consumer is a Consumer 103 object that will receive events as the report is scanned. 104 105 """ 106 if isinstance(handle, File.UndoHandle): 107 uhandle = handle 108 else: 109 uhandle = File.UndoHandle(handle) 110 111 # Try to fast-forward to the beginning of the blast report. 112 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 113 # Now scan the BLAST report. 114 self._scan_header(uhandle, consumer) 115 self._scan_rounds(uhandle, consumer) 116 self._scan_database_report(uhandle, consumer) 117 self._scan_parameters(uhandle, consumer)
118
119 - def _scan_header(self, uhandle, consumer):
120 # BLASTP 2.0.10 [Aug-26-1999] 121 # 122 # 123 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 124 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 125 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 126 # programs", Nucleic Acids Res. 25:3389-3402. 127 # 128 # Query= test 129 # (140 letters) 130 # 131 # Database: sdqib40-1.35.seg.fa 132 # 1323 sequences; 223,339 total letters 133 # 134 # ======================================================== 135 # This next example is from the online version of Blast, 136 # note there are TWO references, an RID line, and also 137 # the database is BEFORE the query line. 138 # Note there possibleuse of non-ASCII in the author names. 139 # ======================================================== 140 # 141 # BLASTP 2.2.15 [Oct-15-2006] 142 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 143 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 144 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 145 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 146 # 147 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 148 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 149 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 150 # protein database searches with composition-based statistics 151 # and other refinements", Nucleic Acids Res. 29:2994-3005. 152 # 153 # RID: 1166022616-19998-65316425856.BLASTQ1 154 # 155 # 156 # Database: All non-redundant GenBank CDS 157 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 158 # 4,254,166 sequences; 1,462,033,012 total letters 159 # Query= gi:16127998 160 # Length=428 161 # 162 163 consumer.start_header() 164 165 read_and_call(uhandle, consumer.version, contains='BLAST') 166 read_and_call_while(uhandle, consumer.noevent, blank=1) 167 168 # There might be a <pre> line, for qblast output. 169 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 170 171 # Read the reference(s) 172 while attempt_read_and_call(uhandle, 173 consumer.reference, start='Reference'): 174 # References are normally multiline terminated by a blank line 175 # (or, based on the old code, the RID line) 176 while 1: 177 line = uhandle.readline() 178 if is_blank_line(line): 179 consumer.noevent(line) 180 break 181 elif line.startswith("RID"): 182 break 183 else: 184 #More of the reference 185 consumer.reference(line) 186 187 #Deal with the optional RID: ... 188 read_and_call_while(uhandle, consumer.noevent, blank=1) 189 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 190 read_and_call_while(uhandle, consumer.noevent, blank=1) 191 192 # blastpgp may have a reference for compositional score matrix 193 # adjustment (see Bug 2502): 194 if attempt_read_and_call( 195 uhandle, consumer.reference, start="Reference"): 196 read_and_call_until(uhandle, consumer.reference, blank=1) 197 read_and_call_while(uhandle, consumer.noevent, blank=1) 198 199 # blastpgp has a Reference for composition-based statistics. 200 if attempt_read_and_call( 201 uhandle, consumer.reference, start="Reference"): 202 read_and_call_until(uhandle, consumer.reference, blank=1) 203 read_and_call_while(uhandle, consumer.noevent, blank=1) 204 205 line = uhandle.peekline() 206 assert line.strip() != "" 207 assert not line.startswith("RID:") 208 if line.startswith("Query="): 209 #This is an old style query then database... 210 211 # Read the Query lines and the following blank line. 212 read_and_call(uhandle, consumer.query_info, start='Query=') 213 read_and_call_until(uhandle, consumer.query_info, blank=1) 214 read_and_call_while(uhandle, consumer.noevent, blank=1) 215 216 # Read the database lines and the following blank line. 217 read_and_call_until(uhandle, consumer.database_info, end='total letters') 218 read_and_call(uhandle, consumer.database_info, contains='sequences') 219 read_and_call_while(uhandle, consumer.noevent, blank=1) 220 elif line.startswith("Database:"): 221 #This is a new style database then query... 222 read_and_call_until(uhandle, consumer.database_info, end='total letters') 223 read_and_call(uhandle, consumer.database_info, contains='sequences') 224 read_and_call_while(uhandle, consumer.noevent, blank=1) 225 226 # Read the Query lines and the following blank line. 227 # Or, on BLAST 2.2.22+ there is no blank link - need to spot 228 # the "... Score E" line instead. 229 read_and_call(uhandle, consumer.query_info, start='Query=') 230 #read_and_call_until(uhandle, consumer.query_info, blank=1) 231 while True: 232 line = uhandle.peekline() 233 if not line.strip() : break 234 if "Score E" in line : break 235 #It is more of the query (and its length) 236 read_and_call(uhandle, consumer.query_info) 237 read_and_call_while(uhandle, consumer.noevent, blank=1) 238 else: 239 raise ValueError("Invalid header?") 240 241 consumer.end_header()
242
243 - def _scan_rounds(self, uhandle, consumer):
244 # Scan a bunch of rounds. 245 # Each round begins with either a "Searching......" line 246 # or a 'Score E' line followed by descriptions and alignments. 247 # The email server doesn't give the "Searching....." line. 248 # If there is no 'Searching.....' line then you'll first see a 249 # 'Results from round' line 250 251 while not self._eof(uhandle): 252 line = safe_peekline(uhandle) 253 if (not line.startswith('Searching') and 254 not line.startswith('Results from round') and 255 re.search(r"Score +E", line) is None and 256 line.find('No hits found') == -1): 257 break 258 self._scan_descriptions(uhandle, consumer) 259 self._scan_alignments(uhandle, consumer)
260
261 - def _scan_descriptions(self, uhandle, consumer):
262 # Searching..................................................done 263 # Results from round 2 264 # 265 # 266 # Sc 267 # Sequences producing significant alignments: (b 268 # Sequences used in model and found again: 269 # 270 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 271 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 272 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 273 # 274 # Sequences not found previously or not previously below threshold: 275 # 276 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 277 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 278 # 279 280 # If PSI-BLAST, may also have: 281 # 282 # CONVERGED! 283 284 consumer.start_descriptions() 285 286 # Read 'Searching' 287 # This line seems to be missing in BLASTN 2.1.2 (others?) 288 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 289 290 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 291 # If this happens, the handle will yield no more information. 292 if not uhandle.peekline(): 293 raise ValueError("Unexpected end of blast report. " + \ 294 "Looks suspiciously like a PSI-BLAST crash.") 295 296 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 297 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 298 # [blastall] ERROR: [000.000] AT1G08320: Blast: 299 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 300 # done 301 # Reported by David Weisman. 302 # Check for these error lines and ignore them for now. Let 303 # the BlastErrorParser deal with them. 304 line = uhandle.peekline() 305 if line.find("ERROR:") != -1 or line.startswith("done"): 306 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 307 read_and_call(uhandle, consumer.noevent, start="done") 308 309 # Check to see if this is PSI-BLAST. 310 # If it is, the 'Searching' line will be followed by: 311 # (version 2.0.10) 312 # Searching............................. 313 # Results from round 2 314 # or (version 2.0.11) 315 # Searching............................. 316 # 317 # 318 # Results from round 2 319 320 # Skip a bunch of blank lines. 321 read_and_call_while(uhandle, consumer.noevent, blank=1) 322 # Check for the results line if it's there. 323 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 324 read_and_call_while(uhandle, consumer.noevent, blank=1) 325 326 # Three things can happen here: 327 # 1. line contains 'Score E' 328 # 2. line contains "No hits found" 329 # 3. no descriptions 330 # The first one begins a bunch of descriptions. The last two 331 # indicates that no descriptions follow, and we should go straight 332 # to the alignments. 333 if not attempt_read_and_call( 334 uhandle, consumer.description_header, 335 has_re=re.compile(r'Score +E')): 336 # Either case 2 or 3. Look for "No hits found". 337 attempt_read_and_call(uhandle, consumer.no_hits, 338 contains='No hits found') 339 try: 340 read_and_call_while(uhandle, consumer.noevent, blank=1) 341 except ValueError, err: 342 if str(err) != "Unexpected end of stream." : raise err 343 344 consumer.end_descriptions() 345 # Stop processing. 346 return 347 348 # Read the score header lines 349 read_and_call(uhandle, consumer.description_header, 350 start='Sequences producing') 351 352 # If PSI-BLAST, read the 'Sequences used in model' line. 353 attempt_read_and_call(uhandle, consumer.model_sequences, 354 start='Sequences used in model') 355 read_and_call_while(uhandle, consumer.noevent, blank=1) 356 357 # In BLAT, rather than a "No hits found" line, we just 358 # get no descriptions (and no alignments). This can be 359 # spotted because the next line is the database block: 360 if safe_peekline(uhandle).startswith(" Database:"): 361 consumer.end_descriptions() 362 # Stop processing. 363 return 364 365 # Read the descriptions and the following blank lines, making 366 # sure that there are descriptions. 367 if not uhandle.peekline().startswith('Sequences not found'): 368 read_and_call_until(uhandle, consumer.description, blank=1) 369 read_and_call_while(uhandle, consumer.noevent, blank=1) 370 371 # If PSI-BLAST, read the 'Sequences not found' line followed 372 # by more descriptions. However, I need to watch out for the 373 # case where there were no sequences not found previously, in 374 # which case there will be no more descriptions. 375 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 376 start='Sequences not found'): 377 # Read the descriptions and the following blank lines. 378 read_and_call_while(uhandle, consumer.noevent, blank=1) 379 l = safe_peekline(uhandle) 380 # Brad -- added check for QUERY. On some PSI-BLAST outputs 381 # there will be a 'Sequences not found' line followed by no 382 # descriptions. Check for this case since the first thing you'll 383 # get is a blank line and then 'QUERY' 384 if not l.startswith('CONVERGED') and l[0] != '>' \ 385 and not l.startswith('QUERY'): 386 read_and_call_until(uhandle, consumer.description, blank=1) 387 read_and_call_while(uhandle, consumer.noevent, blank=1) 388 389 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 390 read_and_call_while(uhandle, consumer.noevent, blank=1) 391 392 consumer.end_descriptions()
393
394 - def _scan_alignments(self, uhandle, consumer):
395 if self._eof(uhandle) : return 396 397 # qblast inserts a helpful line here. 398 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 399 400 # First, check to see if I'm at the database report. 401 line = safe_peekline(uhandle) 402 if not line: 403 #EOF 404 return 405 elif line.startswith(' Database') or line.startswith("Lambda"): 406 return 407 elif line[0] == '>': 408 # XXX make a better check here between pairwise and masterslave 409 self._scan_pairwise_alignments(uhandle, consumer) 410 else: 411 # XXX put in a check to make sure I'm in a masterslave alignment 412 self._scan_masterslave_alignment(uhandle, consumer)
413
414 - def _scan_pairwise_alignments(self, uhandle, consumer):
415 while not self._eof(uhandle): 416 line = safe_peekline(uhandle) 417 if line[0] != '>': 418 break 419 self._scan_one_pairwise_alignment(uhandle, consumer)
420
421 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
422 if self._eof(uhandle) : return 423 consumer.start_alignment() 424 425 self._scan_alignment_header(uhandle, consumer) 426 427 # Scan a bunch of score/alignment pairs. 428 while 1: 429 if self._eof(uhandle): 430 #Shouldn't have issued that _scan_alignment_header event... 431 break 432 line = safe_peekline(uhandle) 433 if not line.startswith(' Score'): 434 break 435 self._scan_hsp(uhandle, consumer) 436 consumer.end_alignment()
437
438 - def _scan_alignment_header(self, uhandle, consumer):
439 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 440 # stearothermophilus] 441 # Length = 81 442 # 443 # Or, more recently with different white space: 444 # 445 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 446 # gi|15829258|ref|NP_308031.1| threonine synthase 447 # ... 448 # Length=428 449 read_and_call(uhandle, consumer.title, start='>') 450 while 1: 451 line = safe_readline(uhandle) 452 if line.lstrip().startswith('Length =') \ 453 or line.lstrip().startswith('Length='): 454 consumer.length(line) 455 break 456 elif is_blank_line(line): 457 # Check to make sure I haven't missed the Length line 458 raise ValueError("I missed the Length in an alignment header") 459 consumer.title(line) 460 461 # Older versions of BLAST will have a line with some spaces. 462 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 463 if not attempt_read_and_call(uhandle, consumer.noevent, 464 start=' '): 465 read_and_call(uhandle, consumer.noevent, blank=1)
466
467 - def _scan_hsp(self, uhandle, consumer):
468 consumer.start_hsp() 469 self._scan_hsp_header(uhandle, consumer) 470 self._scan_hsp_alignment(uhandle, consumer) 471 consumer.end_hsp()
472
473 - def _scan_hsp_header(self, uhandle, consumer):
474 # Score = 22.7 bits (47), Expect = 2.5 475 # Identities = 10/36 (27%), Positives = 18/36 (49%) 476 # Strand = Plus / Plus 477 # Frame = +3 478 # 479 480 read_and_call(uhandle, consumer.score, start=' Score') 481 read_and_call(uhandle, consumer.identities, start=' Identities') 482 # BLASTN 483 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand') 484 # BLASTX, TBLASTN, TBLASTX 485 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame') 486 read_and_call(uhandle, consumer.noevent, blank=1)
487
488 - def _scan_hsp_alignment(self, uhandle, consumer):
489 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 490 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 491 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 492 # 493 # Query: 64 AEKILIKR 71 494 # I +K 495 # Sbjct: 70 PNIIQLKD 77 496 # 497 498 while 1: 499 # Blastn adds an extra line filled with spaces before Query 500 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 501 read_and_call(uhandle, consumer.query, start='Query') 502 read_and_call(uhandle, consumer.align, start=' ') 503 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 504 try: 505 read_and_call_while(uhandle, consumer.noevent, blank=1) 506 except ValueError, err: 507 if str(err) != "Unexpected end of stream." : raise err 508 # End of File (well, it looks like it with recent versions 509 # of BLAST for multiple queries after the Iterator class 510 # has broken up the whole file into chunks). 511 break 512 line = safe_peekline(uhandle) 513 # Alignment continues if I see a 'Query' or the spaces for Blastn. 514 if not (line.startswith('Query') or line.startswith(' ')): 515 break
516
517 - def _scan_masterslave_alignment(self, uhandle, consumer):
518 consumer.start_alignment() 519 while 1: 520 line = safe_readline(uhandle) 521 # Check to see whether I'm finished reading the alignment. 522 # This is indicated by 1) database section, 2) next psi-blast 523 # round, which can also be a 'Results from round' if no 524 # searching line is present 525 # patch by chapmanb 526 if line.startswith('Searching') or \ 527 line.startswith('Results from round'): 528 uhandle.saveline(line) 529 break 530 elif line.startswith(' Database'): 531 uhandle.saveline(line) 532 break 533 elif is_blank_line(line): 534 consumer.noevent(line) 535 else: 536 consumer.multalign(line) 537 read_and_call_while(uhandle, consumer.noevent, blank=1) 538 consumer.end_alignment()
539
540 - def _eof(self, uhandle):
541 try: 542 line = safe_peekline(uhandle) 543 except ValueError, err: 544 if str(err) != "Unexpected end of stream." : raise err 545 line = "" 546 return not line
547
548 - def _scan_database_report(self, uhandle, consumer):
549 # Database: sdqib40-1.35.seg.fa 550 # Posted date: Nov 1, 1999 4:25 PM 551 # Number of letters in database: 223,339 552 # Number of sequences in database: 1323 553 # 554 # Lambda K H 555 # 0.322 0.133 0.369 556 # 557 # Gapped 558 # Lambda K H 559 # 0.270 0.0470 0.230 560 # 561 ########################################## 562 # Or, more recently Blast 2.2.15 gives less blank lines 563 ########################################## 564 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 565 # environmental samples 566 # Posted date: Dec 12, 2006 5:51 PM 567 # Number of letters in database: 667,088,753 568 # Number of sequences in database: 2,094,974 569 # Lambda K H 570 # 0.319 0.136 0.395 571 # Gapped 572 # Lambda K H 573 # 0.267 0.0410 0.140 574 575 if self._eof(uhandle) : return 576 577 consumer.start_database_report() 578 579 # Subset of the database(s) listed below 580 # Number of letters searched: 562,618,960 581 # Number of sequences searched: 228,924 582 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 583 read_and_call(uhandle, consumer.noevent, contains="letters") 584 read_and_call(uhandle, consumer.noevent, contains="sequences") 585 read_and_call(uhandle, consumer.noevent, start=" ") 586 587 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 588 # was missing the "Database" stanza completely. 589 while attempt_read_and_call(uhandle, consumer.database, 590 start=' Database'): 591 # BLAT output ends abruptly here, without any of the other 592 # information. Check to see if this is the case. If so, 593 # then end the database report here gracefully. 594 if not uhandle.peekline().strip() \ 595 or uhandle.peekline().startswith("BLAST"): 596 consumer.end_database_report() 597 return 598 599 # Database can span multiple lines. 600 read_and_call_until(uhandle, consumer.database, start=' Posted') 601 read_and_call(uhandle, consumer.posted_date, start=' Posted') 602 read_and_call(uhandle, consumer.num_letters_in_database, 603 start=' Number of letters') 604 read_and_call(uhandle, consumer.num_sequences_in_database, 605 start=' Number of sequences') 606 #There may not be a line starting with spaces... 607 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 608 609 line = safe_readline(uhandle) 610 uhandle.saveline(line) 611 if line.find('Lambda') != -1: 612 break 613 614 read_and_call(uhandle, consumer.noevent, start='Lambda') 615 read_and_call(uhandle, consumer.ka_params) 616 617 #This blank line is optional: 618 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 619 620 # not BLASTP 621 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 622 # not TBLASTX 623 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 624 read_and_call(uhandle, consumer.ka_params_gap) 625 626 # Blast 2.2.4 can sometimes skip the whole parameter section. 627 # Thus, I need to be careful not to read past the end of the 628 # file. 629 try: 630 read_and_call_while(uhandle, consumer.noevent, blank=1) 631 except ValueError, x: 632 if str(x) != "Unexpected end of stream.": 633 raise 634 consumer.end_database_report()
635
636 - def _scan_parameters(self, uhandle, consumer):
637 # Matrix: BLOSUM62 638 # Gap Penalties: Existence: 11, Extension: 1 639 # Number of Hits to DB: 50604 640 # Number of Sequences: 1323 641 # Number of extensions: 1526 642 # Number of successful extensions: 6 643 # Number of sequences better than 10.0: 5 644 # Number of HSP's better than 10.0 without gapping: 5 645 # Number of HSP's successfully gapped in prelim test: 0 646 # Number of HSP's that attempted gapping in prelim test: 1 647 # Number of HSP's gapped (non-prelim): 5 648 # length of query: 140 649 # length of database: 223,339 650 # effective HSP length: 39 651 # effective length of query: 101 652 # effective length of database: 171,742 653 # effective search space: 17345942 654 # effective search space used: 17345942 655 # T: 11 656 # A: 40 657 # X1: 16 ( 7.4 bits) 658 # X2: 38 (14.8 bits) 659 # X3: 64 (24.9 bits) 660 # S1: 41 (21.9 bits) 661 # S2: 42 (20.8 bits) 662 ########################################## 663 # Or, more recently Blast(x) 2.2.15 gives 664 ########################################## 665 # Matrix: BLOSUM62 666 # Gap Penalties: Existence: 11, Extension: 1 667 # Number of Sequences: 4535438 668 # Number of Hits to DB: 2,588,844,100 669 # Number of extensions: 60427286 670 # Number of successful extensions: 126433 671 # Number of sequences better than 2.0: 30 672 # Number of HSP's gapped: 126387 673 # Number of HSP's successfully gapped: 35 674 # Length of query: 291 675 # Length of database: 1,573,298,872 676 # Length adjustment: 130 677 # Effective length of query: 161 678 # Effective length of database: 983,691,932 679 # Effective search space: 158374401052 680 # Effective search space used: 158374401052 681 # Neighboring words threshold: 12 682 # Window for multiple hits: 40 683 # X1: 16 ( 7.3 bits) 684 # X2: 38 (14.6 bits) 685 # X3: 64 (24.7 bits) 686 # S1: 41 (21.7 bits) 687 # S2: 32 (16.9 bits) 688 689 690 # Blast 2.2.4 can sometimes skip the whole parameter section. 691 # BLAT also skips the whole parameter section. 692 # Thus, check to make sure that the parameter section really 693 # exists. 694 if not uhandle.peekline().strip(): 695 return 696 697 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 698 # "Number of Sequences" lines. 699 consumer.start_parameters() 700 701 # Matrix line may be missing in BLASTN 2.2.9 702 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 703 # not TBLASTX 704 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 705 706 attempt_read_and_call(uhandle, consumer.num_sequences, 707 start='Number of Sequences') 708 attempt_read_and_call(uhandle, consumer.num_hits, 709 start='Number of Hits') 710 attempt_read_and_call(uhandle, consumer.num_sequences, 711 start='Number of Sequences') 712 attempt_read_and_call(uhandle, consumer.num_extends, 713 start='Number of extensions') 714 attempt_read_and_call(uhandle, consumer.num_good_extends, 715 start='Number of successful') 716 717 attempt_read_and_call(uhandle, consumer.num_seqs_better_e, 718 start='Number of sequences') 719 720 # not BLASTN, TBLASTX 721 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 722 start="Number of HSP's better"): 723 # BLASTN 2.2.9 724 if attempt_read_and_call(uhandle, consumer.noevent, 725 start="Number of HSP's gapped:"): 726 read_and_call(uhandle, consumer.noevent, 727 start="Number of HSP's successfully") 728 #This is ommitted in 2.2.15 729 attempt_read_and_call(uhandle, consumer.noevent, 730 start="Number of extra gapped extensions") 731 else: 732 read_and_call(uhandle, consumer.hsps_prelim_gapped, 733 start="Number of HSP's successfully") 734 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 735 start="Number of HSP's that") 736 read_and_call(uhandle, consumer.hsps_gapped, 737 start="Number of HSP's gapped") 738 #e.g. BLASTX 2.2.15 where the "better" line is missing 739 elif attempt_read_and_call(uhandle, consumer.noevent, 740 start="Number of HSP's gapped"): 741 read_and_call(uhandle, consumer.noevent, 742 start="Number of HSP's successfully") 743 744 # not in blastx 2.2.1 745 attempt_read_and_call(uhandle, consumer.query_length, 746 has_re=re.compile(r"[Ll]ength of query")) 747 # Not in BLASTX 2.2.22+ 748 attempt_read_and_call(uhandle, consumer.database_length, 749 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 750 751 # BLASTN 2.2.9 752 attempt_read_and_call(uhandle, consumer.noevent, 753 start="Length adjustment") 754 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 755 start='effective HSP') 756 # Not in blastx 2.2.1 757 attempt_read_and_call( 758 uhandle, consumer.effective_query_length, 759 has_re=re.compile(r'[Ee]ffective length of query')) 760 761 # This is not in BLASTP 2.2.15 762 attempt_read_and_call( 763 uhandle, consumer.effective_database_length, 764 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 765 # Not in blastx 2.2.1, added a ':' to distinguish between 766 # this and the 'effective search space used' line 767 attempt_read_and_call( 768 uhandle, consumer.effective_search_space, 769 has_re=re.compile(r'[Ee]ffective search space:')) 770 # Does not appear in BLASTP 2.0.5 771 attempt_read_and_call( 772 uhandle, consumer.effective_search_space_used, 773 has_re=re.compile(r'[Ee]ffective search space used')) 774 775 # BLASTX, TBLASTN, TBLASTX 776 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 777 778 # not in BLASTN 2.2.9 779 attempt_read_and_call(uhandle, consumer.threshold, start='T') 780 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 781 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold') 782 783 # not in BLASTX 2.2.15 784 attempt_read_and_call(uhandle, consumer.window_size, start='A') 785 # get this instead: "Window for multiple hits: 40" 786 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits') 787 788 # not in BLASTX 2.2.22+ 789 attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 790 # not TBLASTN 791 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 792 793 # not BLASTN, TBLASTX 794 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 795 start='X3') 796 797 # not TBLASTN 798 attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1') 799 # not in blastx 2.2.1 800 # first we make sure we have additional lines to work with, if 801 # not then the file is done and we don't have a final S2 802 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 803 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 804 805 consumer.end_parameters()
806
807 -class BlastParser(AbstractParser):
808 """Parses BLAST data into a Record.Blast object. 809 810 """
811 - def __init__(self):
812 """__init__(self)""" 813 self._scanner = _Scanner() 814 self._consumer = _BlastConsumer()
815
816 - def parse(self, handle):
817 """parse(self, handle)""" 818 self._scanner.feed(handle, self._consumer) 819 return self._consumer.data
820
821 -class PSIBlastParser(AbstractParser):
822 """Parses BLAST data into a Record.PSIBlast object. 823 824 """
825 - def __init__(self):
826 """__init__(self)""" 827 self._scanner = _Scanner() 828 self._consumer = _PSIBlastConsumer()
829
830 - def parse(self, handle):
831 """parse(self, handle)""" 832 self._scanner.feed(handle, self._consumer) 833 return self._consumer.data
834
835 -class _HeaderConsumer:
836 - def start_header(self):
837 self._header = Record.Header()
838
839 - def version(self, line):
840 c = line.split() 841 self._header.application = c[0] 842 self._header.version = c[1] 843 if len(c) > 2: 844 #The date is missing in the new C++ output from blastx 2.2.22+ 845 #Just get "BLASTX 2.2.22+\n" and that's all. 846 self._header.date = c[2][1:-1]
847
848 - def reference(self, line):
849 if line.startswith('Reference: '): 850 self._header.reference = line[11:] 851 else: 852 self._header.reference = self._header.reference + line
853
854 - def query_info(self, line):
855 if line.startswith('Query= '): 856 self._header.query = line[7:].lstrip() 857 elif line.startswith('Length='): 858 #New style way to give the query length in BLAST 2.2.22+ (the C++ code) 859 self._header.query_letters = _safe_int(line[7:].strip()) 860 elif not line.startswith(' '): # continuation of query_info 861 self._header.query = "%s%s" % (self._header.query, line) 862 else: 863 #Hope it is the old style way to give the query length: 864 letters, = _re_search( 865 r"([0-9,]+) letters", line, 866 "I could not find the number of letters in line\n%s" % line) 867 self._header.query_letters = _safe_int(letters)
868
869 - def database_info(self, line):
870 line = line.rstrip() 871 if line.startswith('Database: '): 872 self._header.database = line[10:] 873 elif not line.endswith('total letters'): 874 if self._header.database: 875 #Need to include a space when merging multi line datase descr 876 self._header.database = self._header.database + " " + line.strip() 877 else: 878 self._header.database = line.strip() 879 else: 880 sequences, letters =_re_search( 881 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 882 "I could not find the sequences and letters in line\n%s" %line) 883 self._header.database_sequences = _safe_int(sequences) 884 self._header.database_letters = _safe_int(letters)
885
886 - def end_header(self):
887 # Get rid of the trailing newlines 888 self._header.reference = self._header.reference.rstrip() 889 self._header.query = self._header.query.rstrip()
890
891 -class _DescriptionConsumer:
892 - def start_descriptions(self):
893 self._descriptions = [] 894 self._model_sequences = [] 895 self._nonmodel_sequences = [] 896 self._converged = 0 897 self._type = None 898 self._roundnum = None 899 900 self.__has_n = 0 # Does the description line contain an N value?
901
902 - def description_header(self, line):
903 if line.startswith('Sequences producing'): 904 cols = line.split() 905 if cols[-1] == 'N': 906 self.__has_n = 1
907
908 - def description(self, line):
909 dh = self._parse(line) 910 if self._type == 'model': 911 self._model_sequences.append(dh) 912 elif self._type == 'nonmodel': 913 self._nonmodel_sequences.append(dh) 914 else: 915 self._descriptions.append(dh)
916
917 - def model_sequences(self, line):
918 self._type = 'model'
919
920 - def nonmodel_sequences(self, line):
921 self._type = 'nonmodel'
922
923 - def converged(self, line):
924 self._converged = 1
925
926 - def no_hits(self, line):
927 pass
928
929 - def round(self, line):
930 if not line.startswith('Results from round'): 931 raise ValueError("I didn't understand the round line\n%s" % line) 932 self._roundnum = _safe_int(line[18:].strip())
933
934 - def end_descriptions(self):
935 pass
936
937 - def _parse(self, description_line):
938 line = description_line # for convenience 939 dh = Record.Description() 940 941 # I need to separate the score and p-value from the title. 942 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 943 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 944 # special cases to handle: 945 # - title must be preserved exactly (including whitespaces) 946 # - score could be equal to e-value (not likely, but what if??) 947 # - sometimes there's an "N" score of '1'. 948 cols = line.split() 949 if len(cols) < 3: 950 raise ValueError( \ 951 "Line does not appear to contain description:\n%s" % line) 952 if self.__has_n: 953 i = line.rfind(cols[-1]) # find start of N 954 i = line.rfind(cols[-2], 0, i) # find start of p-value 955 i = line.rfind(cols[-3], 0, i) # find start of score 956 else: 957 i = line.rfind(cols[-1]) # find start of p-value 958 i = line.rfind(cols[-2], 0, i) # find start of score 959 if self.__has_n: 960 dh.title, dh.score, dh.e, dh.num_alignments = \ 961 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 962 else: 963 dh.title, dh.score, dh.e, dh.num_alignments = \ 964 line[:i].rstrip(), cols[-2], cols[-1], 1 965 dh.num_alignments = _safe_int(dh.num_alignments) 966 dh.score = _safe_int(dh.score) 967 dh.e = _safe_float(dh.e) 968 return dh
969
970 -class _AlignmentConsumer:
971 # This is a little bit tricky. An alignment can either be a 972 # pairwise alignment or a multiple alignment. Since it's difficult 973 # to know a-priori which one the blast record will contain, I'm going 974 # to make one class that can parse both of them.
975 - def start_alignment(self):
976 self._alignment = Record.Alignment() 977 self._multiple_alignment = Record.MultipleAlignment()
978
979 - def title(self, line):
980 if self._alignment.title: 981 self._alignment.title += " " 982 self._alignment.title += line.strip()
983
984 - def length(self, line):
985 #e.g. "Length = 81" or more recently, "Length=428" 986 parts = line.replace(" ","").split("=") 987 assert len(parts)==2, "Unrecognised format length line" 988 self._alignment.length = parts[1] 989 self._alignment.length = _safe_int(self._alignment.length)
990
991 - def multalign(self, line):
992 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 993 if line.startswith('QUERY') or line.startswith('blast_tmp'): 994 # If this is the first line of the multiple alignment, 995 # then I need to figure out how the line is formatted. 996 997 # Format of line is: 998 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 999 try: 1000 name, start, seq, end = line.split() 1001 except ValueError: 1002 raise ValueError("I do not understand the line\n%s" % line) 1003 self._start_index = line.index(start, len(name)) 1004 self._seq_index = line.index(seq, 1005 self._start_index+len(start)) 1006 # subtract 1 for the space 1007 self._name_length = self._start_index - 1 1008 self._start_length = self._seq_index - self._start_index - 1 1009 self._seq_length = line.rfind(end) - self._seq_index - 1 1010 1011 #self._seq_index = line.index(seq) 1012 ## subtract 1 for the space 1013 #self._seq_length = line.rfind(end) - self._seq_index - 1 1014 #self._start_index = line.index(start) 1015 #self._start_length = self._seq_index - self._start_index - 1 1016 #self._name_length = self._start_index 1017 1018 # Extract the information from the line 1019 name = line[:self._name_length] 1020 name = name.rstrip() 1021 start = line[self._start_index:self._start_index+self._start_length] 1022 start = start.rstrip() 1023 if start: 1024 start = _safe_int(start) 1025 end = line[self._seq_index+self._seq_length:].rstrip() 1026 if end: 1027 end = _safe_int(end) 1028 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip() 1029 # right pad the sequence with spaces if necessary 1030 if len(seq) < self._seq_length: 1031 seq = seq + ' '*(self._seq_length-len(seq)) 1032 1033 # I need to make sure the sequence is aligned correctly with the query. 1034 # First, I will find the length of the query. Then, if necessary, 1035 # I will pad my current sequence with spaces so that they will line 1036 # up correctly. 1037 1038 # Two possible things can happen: 1039 # QUERY 1040 # 504 1041 # 1042 # QUERY 1043 # 403 1044 # 1045 # Sequence 504 will need padding at the end. Since I won't know 1046 # this until the end of the alignment, this will be handled in 1047 # end_alignment. 1048 # Sequence 403 will need padding before being added to the alignment. 1049 1050 align = self._multiple_alignment.alignment # for convenience 1051 align.append((name, start, seq, end))
1052 1053 # This is old code that tried to line up all the sequences 1054 # in a multiple alignment by using the sequence title's as 1055 # identifiers. The problem with this is that BLAST assigns 1056 # different HSP's from the same sequence the same id. Thus, 1057 # in one alignment block, there may be multiple sequences with 1058 # the same id. I'm not sure how to handle this, so I'm not 1059 # going to. 1060 1061 # # If the sequence is the query, then just add it. 1062 # if name == 'QUERY': 1063 # if len(align) == 0: 1064 # align.append((name, start, seq)) 1065 # else: 1066 # aname, astart, aseq = align[0] 1067 # if name != aname: 1068 # raise ValueError, "Query is not the first sequence" 1069 # aseq = aseq + seq 1070 # align[0] = aname, astart, aseq 1071 # else: 1072 # if len(align) == 0: 1073 # raise ValueError, "I could not find the query sequence" 1074 # qname, qstart, qseq = align[0] 1075 # 1076 # # Now find my sequence in the multiple alignment. 1077 # for i in range(1, len(align)): 1078 # aname, astart, aseq = align[i] 1079 # if name == aname: 1080 # index = i 1081 # break 1082 # else: 1083 # # If I couldn't find it, then add a new one. 1084 # align.append((None, None, None)) 1085 # index = len(align)-1 1086 # # Make sure to left-pad it. 1087 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1088 # 1089 # if len(qseq) != len(aseq) + len(seq): 1090 # # If my sequences are shorter than the query sequence, 1091 # # then I will need to pad some spaces to make them line up. 1092 # # Since I've already right padded seq, that means aseq 1093 # # must be too short. 1094 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1095 # aseq = aseq + seq 1096 # if astart is None: 1097 # astart = start 1098 # align[index] = aname, astart, aseq 1099
1100 - def end_alignment(self):
1101 # Remove trailing newlines 1102 if self._alignment: 1103 self._alignment.title = self._alignment.title.rstrip() 1104 1105 # This code is also obsolete. See note above. 1106 # If there's a multiple alignment, I will need to make sure 1107 # all the sequences are aligned. That is, I may need to 1108 # right-pad the sequences. 1109 # if self._multiple_alignment is not None: 1110 # align = self._multiple_alignment.alignment 1111 # seqlen = None 1112 # for i in range(len(align)): 1113 # name, start, seq = align[i] 1114 # if seqlen is None: 1115 # seqlen = len(seq) 1116 # else: 1117 # if len(seq) < seqlen: 1118 # seq = seq + ' '*(seqlen - len(seq)) 1119 # align[i] = name, start, seq 1120 # elif len(seq) > seqlen: 1121 # raise ValueError, \ 1122 # "Sequence %s is longer than the query" % name 1123 1124 # Clean up some variables, if they exist. 1125 try: 1126 del self._seq_index 1127 del self._seq_length 1128 del self._start_index 1129 del self._start_length 1130 del self._name_length 1131 except AttributeError: 1132 pass
1133
1134 -class _HSPConsumer:
1135 - def start_hsp(self):
1136 self._hsp = Record.HSP()
1137
1138 - def score(self, line):
1139 self._hsp.bits, self._hsp.score = _re_search( 1140 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 1141 "I could not find the score in line\n%s" % line) 1142 self._hsp.score = _safe_float(self._hsp.score) 1143 self._hsp.bits = _safe_float(self._hsp.bits) 1144 1145 x, y = _re_search( 1146 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 1147 "I could not find the expect in line\n%s" % line) 1148 if x: 1149 self._hsp.num_alignments = _safe_int(x) 1150 else: 1151 self._hsp.num_alignments = 1 1152 self._hsp.expect = _safe_float(y)
1153
1154 - def identities(self, line):
1155 x, y = _re_search( 1156 r"Identities = (\d+)\/(\d+)", line, 1157 "I could not find the identities in line\n%s" % line) 1158 self._hsp.identities = _safe_int(x), _safe_int(y) 1159 self._hsp.align_length = _safe_int(y) 1160 1161 if line.find('Positives') != -1: 1162 x, y = _re_search( 1163 r"Positives = (\d+)\/(\d+)", line, 1164 "I could not find the positives in line\n%s" % line) 1165 self._hsp.positives = _safe_int(x), _safe_int(y) 1166 assert self._hsp.align_length == _safe_int(y) 1167 1168 if line.find('Gaps') != -1: 1169 x, y = _re_search( 1170 r"Gaps = (\d+)\/(\d+)", line, 1171 "I could not find the gaps in line\n%s" % line) 1172 self._hsp.gaps = _safe_int(x), _safe_int(y) 1173 assert self._hsp.align_length == _safe_int(y)
1174 1175
1176 - def strand(self, line):
1177 self._hsp.strand = _re_search( 1178 r"Strand = (\w+) / (\w+)", line, 1179 "I could not find the strand in line\n%s" % line)
1180
1181 - def frame(self, line):
1182 # Frame can be in formats: 1183 # Frame = +1 1184 # Frame = +2 / +2 1185 if line.find('/') != -1: 1186 self._hsp.frame = _re_search( 1187 r"Frame = ([-+][123]) / ([-+][123])", line, 1188 "I could not find the frame in line\n%s" % line) 1189 else: 1190 self._hsp.frame = _re_search( 1191 r"Frame = ([-+][123])", line, 1192 "I could not find the frame in line\n%s" % line)
1193 1194 # Match a space, if one is available. Masahir Ishikawa found a 1195 # case where there's no space between the start and the sequence: 1196 # Query: 100tt 101 1197 # line below modified by Yair Benita, Sep 2004 1198 # Note that the colon is not always present. 2006 1199 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
1200 - def query(self, line):
1201 m = self._query_re.search(line) 1202 if m is None: 1203 raise ValueError("I could not find the query in line\n%s" % line) 1204 1205 # line below modified by Yair Benita, Sep 2004. 1206 # added the end attribute for the query 1207 colon, start, seq, end = m.groups() 1208 self._hsp.query = self._hsp.query + seq 1209 if self._hsp.query_start is None: 1210 self._hsp.query_start = _safe_int(start) 1211 1212 # line below added by Yair Benita, Sep 2004. 1213 # added the end attribute for the query 1214 self._hsp.query_end = _safe_int(end) 1215 1216 #Get index for sequence start (regular expression element 3) 1217 self._query_start_index = m.start(3) 1218 self._query_len = len(seq)
1219
1220 - def align(self, line):
1221 seq = line[self._query_start_index:].rstrip() 1222 if len(seq) < self._query_len: 1223 # Make sure the alignment is the same length as the query 1224 seq = seq + ' ' * (self._query_len-len(seq)) 1225 elif len(seq) < self._query_len: 1226 raise ValueError("Match is longer than the query in line\n%s" \ 1227 % line) 1228 self._hsp.match = self._hsp.match + seq
1229 1230 # To match how we do the query, cache the regular expression. 1231 # Note that the colon is not always present. 1232 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
1233 - def sbjct(self, line):
1234 m = self._sbjct_re.search(line) 1235 if m is None: 1236 raise ValueError("I could not find the sbjct in line\n%s" % line) 1237 colon, start, seq, end = m.groups() 1238 #mikep 26/9/00 1239 #On occasion, there is a blast hit with no subject match 1240 #so far, it only occurs with 1-line short "matches" 1241 #I have decided to let these pass as they appear 1242 if not seq.strip(): 1243 seq = ' ' * self._query_len 1244 self._hsp.sbjct = self._hsp.sbjct + seq 1245 if self._hsp.sbjct_start is None: 1246 self._hsp.sbjct_start = _safe_int(start) 1247 1248 self._hsp.sbjct_end = _safe_int(end) 1249 if len(seq) != self._query_len: 1250 raise ValueError( \ 1251 "QUERY and SBJCT sequence lengths don't match in line\n%s" \ 1252 % line) 1253 1254 del self._query_start_index # clean up unused variables 1255 del self._query_len
1256
1257 - def end_hsp(self):
1258 pass
1259
1260 -class _DatabaseReportConsumer:
1261
1262 - def start_database_report(self):
1263 self._dr = Record.DatabaseReport()
1264
1265 - def database(self, line):
1266 m = re.search(r"Database: (.+)$", line) 1267 if m: 1268 self._dr.database_name.append(m.group(1)) 1269 elif self._dr.database_name: 1270 # This must be a continuation of the previous name. 1271 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1272 line.strip())
1273
1274 - def posted_date(self, line):
1275 self._dr.posted_date.append(_re_search( 1276 r"Posted date:\s*(.+)$", line, 1277 "I could not find the posted date in line\n%s" % line))
1278
1279 - def num_letters_in_database(self, line):
1280 letters, = _get_cols( 1281 line, (-1,), ncols=6, expected={2:"letters", 4:"database:"}) 1282 self._dr.num_letters_in_database.append(_safe_int(letters))
1283
1284 - def num_sequences_in_database(self, line):
1285 sequences, = _get_cols( 1286 line, (-1,), ncols=6, expected={2:"sequences", 4:"database:"}) 1287 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1288
1289 - def ka_params(self, line):
1290 x = line.split() 1291 self._dr.ka_params = map(_safe_float, x)
1292
1293 - def gapped(self, line):
1294 self._dr.gapped = 1
1295
1296 - def ka_params_gap(self, line):
1297 x = line.split() 1298 self._dr.ka_params_gap = map(_safe_float, x)
1299
1300 - def end_database_report(self):
1301 pass
1302
1303 -class _ParametersConsumer:
1304 - def start_parameters(self):
1305 self._params = Record.Parameters()
1306
1307 - def matrix(self, line):
1308 self._params.matrix = line[8:].rstrip()
1309
1310 - def gap_penalties(self, line):
1311 x = _get_cols( 1312 line, (3, 5), ncols=6, expected={2:"Existence:", 4:"Extension:"}) 1313 self._params.gap_penalties = map(_safe_float, x)
1314
1315 - def num_hits(self, line):
1316 if line.find('1st pass') != -1: 1317 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"}) 1318 self._params.num_hits = _safe_int(x) 1319 else: 1320 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"}) 1321 self._params.num_hits = _safe_int(x)
1322
1323 - def num_sequences(self, line):
1324 if line.find('1st pass') != -1: 1325 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"}) 1326 self._params.num_sequences = _safe_int(x) 1327 else: 1328 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"}) 1329 self._params.num_sequences = _safe_int(x)
1330
1331 - def num_extends(self, line):
1332 if line.find('1st pass') != -1: 1333 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"}) 1334 self._params.num_extends = _safe_int(x) 1335 else: 1336 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"}) 1337 self._params.num_extends = _safe_int(x)
1338
1339 - def num_good_extends(self, line):
1340 if line.find('1st pass') != -1: 1341 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"}) 1342 self._params.num_good_extends = _safe_int(x) 1343 else: 1344 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"}) 1345 self._params.num_good_extends = _safe_int(x)
1346
1347 - def num_seqs_better_e(self, line):
1348 self._params.num_seqs_better_e, = _get_cols( 1349 line, (-1,), ncols=7, expected={2:"sequences"}) 1350 self._params.num_seqs_better_e = _safe_int( 1351 self._params.num_seqs_better_e)
1352
1353 - def hsps_no_gap(self, line):
1354 self._params.hsps_no_gap, = _get_cols( 1355 line, (-1,), ncols=9, expected={3:"better", 7:"gapping:"}) 1356 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1357
1358 - def hsps_prelim_gapped(self, line):
1359 self._params.hsps_prelim_gapped, = _get_cols( 1360 line, (-1,), ncols=9, expected={4:"gapped", 6:"prelim"}) 1361 self._params.hsps_prelim_gapped = _safe_int( 1362 self._params.hsps_prelim_gapped)
1363
1364 - def hsps_prelim_gapped_attempted(self, line):
1365 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1366 line, (-1,), ncols=10, expected={4:"attempted", 7:"prelim"}) 1367 self._params.hsps_prelim_gapped_attempted = _safe_int( 1368 self._params.hsps_prelim_gapped_attempted)
1369
1370 - def hsps_gapped(self, line):
1371 self._params.hsps_gapped, = _get_cols( 1372 line, (-1,), ncols=6, expected={3:"gapped"}) 1373 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1374
1375 - def query_length(self, line):
1376 self._params.query_length, = _get_cols( 1377 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"query:"}) 1378 self._params.query_length = _safe_int(self._params.query_length)
1379
1380 - def database_length(self, line):
1381 self._params.database_length, = _get_cols( 1382 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"database:"}) 1383 self._params.database_length = _safe_int(self._params.database_length)
1384
1385 - def effective_hsp_length(self, line):
1386 self._params.effective_hsp_length, = _get_cols( 1387 line, (-1,), ncols=4, expected={1:"HSP", 2:"length:"}) 1388 self._params.effective_hsp_length = _safe_int( 1389 self._params.effective_hsp_length)
1390
1391 - def effective_query_length(self, line):
1392 self._params.effective_query_length, = _get_cols( 1393 line, (-1,), ncols=5, expected={1:"length", 3:"query:"}) 1394 self._params.effective_query_length = _safe_int( 1395 self._params.effective_query_length)
1396
1397 - def effective_database_length(self, line):
1398 self._params.effective_database_length, = _get_cols( 1399 line.lower(), (-1,), ncols=5, expected={1:"length", 3:"database:"}) 1400 self._params.effective_database_length = _safe_int( 1401 self._params.effective_database_length)
1402
1403 - def effective_search_space(self, line):
1404 self._params.effective_search_space, = _get_cols( 1405 line, (-1,), ncols=4, expected={1:"search"}) 1406 self._params.effective_search_space = _safe_int( 1407 self._params.effective_search_space)
1408
1409 - def effective_search_space_used(self, line):
1410 self._params.effective_search_space_used, = _get_cols( 1411 line, (-1,), ncols=5, expected={1:"search", 3:"used:"}) 1412 self._params.effective_search_space_used = _safe_int( 1413 self._params.effective_search_space_used)
1414
1415 - def frameshift(self, line):
1416 self._params.frameshift = _get_cols( 1417 line, (4, 5), ncols=6, expected={0:"frameshift", 2:"decay"})
1418
1419 - def threshold(self, line):
1420 if line[:2] == "T:": 1421 #Assume its an old stlye line like "T: 123" 1422 self._params.threshold, = _get_cols( 1423 line, (1,), ncols=2, expected={0:"T:"}) 1424 elif line[:28] == "Neighboring words threshold:": 1425 self._params.threshold, = _get_cols( 1426 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"}) 1427 else: 1428 raise ValueError("Unrecognised threshold line:\n%s" % line) 1429 self._params.threshold = _safe_int(self._params.threshold)
1430
1431 - def window_size(self, line):
1432 if line[:2] == "A:": 1433 self._params.window_size, = _get_cols( 1434 line, (1,), ncols=2, expected={0:"A:"}) 1435 elif line[:25] == "Window for multiple hits:": 1436 self._params.window_size, = _get_cols( 1437 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"}) 1438 else: 1439 raise ValueError("Unrecognised window size line:\n%s" % line) 1440 self._params.window_size = _safe_int(self._params.window_size)
1441
1442 - def dropoff_1st_pass(self, line):
1443 score, bits = _re_search( 1444 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1445 "I could not find the dropoff in line\n%s" % line) 1446 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1447
1448 - def gap_x_dropoff(self, line):
1449 score, bits = _re_search( 1450 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1451 "I could not find the gap dropoff in line\n%s" % line) 1452 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1453
1454 - def gap_x_dropoff_final(self, line):
1455 score, bits = _re_search( 1456 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1457 "I could not find the gap dropoff final in line\n%s" % line) 1458 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1459
1460 - def gap_trigger(self, line):
1461 score, bits = _re_search( 1462 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1463 "I could not find the gap trigger in line\n%s" % line) 1464 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1465
1466 - def blast_cutoff(self, line):
1467 score, bits = _re_search( 1468 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1469 "I could not find the blast cutoff in line\n%s" % line) 1470 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1471
1472 - def end_parameters(self):
1473 pass
1474 1475
1476 -class _BlastConsumer(AbstractConsumer, 1477 _HeaderConsumer, 1478 _DescriptionConsumer, 1479 _AlignmentConsumer, 1480 _HSPConsumer, 1481 _DatabaseReportConsumer, 1482 _ParametersConsumer 1483 ):
1484 # This Consumer is inherits from many other consumer classes that handle 1485 # the actual dirty work. An alternate way to do it is to create objects 1486 # of those classes and then delegate the parsing tasks to them in a 1487 # decorator-type pattern. The disadvantage of that is that the method 1488 # names will need to be resolved in this classes. However, using 1489 # a decorator will retain more control in this class (which may or 1490 # may not be a bad thing). In addition, having each sub-consumer as 1491 # its own object prevents this object's dictionary from being cluttered 1492 # with members and reduces the chance of member collisions.
1493 - def __init__(self):
1494 self.data = None
1495
1496 - def round(self, line):
1497 # Make sure nobody's trying to pass me PSI-BLAST data! 1498 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1499
1500 - def start_header(self):
1501 self.data = Record.Blast() 1502 _HeaderConsumer.start_header(self)
1503
1504 - def end_header(self):
1505 _HeaderConsumer.end_header(self) 1506 self.data.__dict__.update(self._header.__dict__)
1507
1508 - def end_descriptions(self):
1509 self.data.descriptions = self._descriptions
1510
1511 - def end_alignment(self):
1512 _AlignmentConsumer.end_alignment(self) 1513 if self._alignment.hsps: 1514 self.data.alignments.append(self._alignment) 1515 if self._multiple_alignment.alignment: 1516 self.data.multiple_alignment = self._multiple_alignment
1517
1518 - def end_hsp(self):
1519 _HSPConsumer.end_hsp(self) 1520 try: 1521 self._alignment.hsps.append(self._hsp) 1522 except AttributeError: 1523 raise ValueError("Found an HSP before an alignment")
1524
1525 - def end_database_report(self):
1526 _DatabaseReportConsumer.end_database_report(self) 1527 self.data.__dict__.update(self._dr.__dict__)
1528
1529 - def end_parameters(self):
1530 _ParametersConsumer.end_parameters(self) 1531 self.data.__dict__.update(self._params.__dict__)
1532
1533 -class _PSIBlastConsumer(AbstractConsumer, 1534 _HeaderConsumer, 1535 _DescriptionConsumer, 1536 _AlignmentConsumer, 1537 _HSPConsumer, 1538 _DatabaseReportConsumer, 1539 _ParametersConsumer 1540 ):
1541 - def __init__(self):
1542 self.data = None
1543
1544 - def start_header(self):
1545 self.data = Record.PSIBlast() 1546 _HeaderConsumer.start_header(self)
1547
1548 - def end_header(self):
1549 _HeaderConsumer.end_header(self) 1550 self.data.__dict__.update(self._header.__dict__)
1551
1552 - def start_descriptions(self):
1553 self._round = Record.Round() 1554 self.data.rounds.append(self._round) 1555 _DescriptionConsumer.start_descriptions(self)
1556
1557 - def end_descriptions(self):
1558 _DescriptionConsumer.end_descriptions(self) 1559 self._round.number = self._roundnum 1560 if self._descriptions: 1561 self._round.new_seqs.extend(self._descriptions) 1562 self._round.reused_seqs.extend(self._model_sequences) 1563 self._round.new_seqs.extend(self._nonmodel_sequences) 1564 if self._converged: 1565 self.data.converged = 1
1566
1567 - def end_alignment(self):
1568 _AlignmentConsumer.end_alignment(self) 1569 if self._alignment.hsps: 1570 self._round.alignments.append(self._alignment) 1571 if self._multiple_alignment: 1572 self._round.multiple_alignment = self._multiple_alignment
1573
1574 - def end_hsp(self):
1575 _HSPConsumer.end_hsp(self) 1576 try: 1577 self._alignment.hsps.append(self._hsp) 1578 except AttributeError: 1579 raise ValueError("Found an HSP before an alignment")
1580
1581 - def end_database_report(self):
1582 _DatabaseReportConsumer.end_database_report(self) 1583 self.data.__dict__.update(self._dr.__dict__)
1584
1585 - def end_parameters(self):
1586 _ParametersConsumer.end_parameters(self) 1587 self.data.__dict__.update(self._params.__dict__)
1588
1589 -class Iterator:
1590 """Iterates over a file of multiple BLAST results. 1591 1592 Methods: 1593 next Return the next record from the stream, or None. 1594 1595 """
1596 - def __init__(self, handle, parser=None):
1597 """__init__(self, handle, parser=None) 1598 1599 Create a new iterator. handle is a file-like object. parser 1600 is an optional Parser object to change the results into another form. 1601 If set to None, then the raw contents of the file will be returned. 1602 1603 """ 1604 try: 1605 handle.readline 1606 except AttributeError: 1607 raise ValueError( 1608 "I expected a file handle or file-like object, got %s" 1609 % type(handle)) 1610 self._uhandle = File.UndoHandle(handle) 1611 self._parser = parser 1612 self._header = []
1613
1614 - def next(self):
1615 """next(self) -> object 1616 1617 Return the next Blast record from the file. If no more records, 1618 return None. 1619 1620 """ 1621 lines = [] 1622 query = False 1623 while 1: 1624 line = self._uhandle.readline() 1625 if not line: 1626 break 1627 # If I've reached the next one, then put the line back and stop. 1628 if lines and (line.startswith('BLAST') 1629 or line.startswith('BLAST', 1) 1630 or line.startswith('<?xml ')): 1631 self._uhandle.saveline(line) 1632 break 1633 # New style files ommit the BLAST line to mark a new query: 1634 if line.startswith("Query="): 1635 if not query: 1636 if not self._header: 1637 self._header = lines[:] 1638 query = True 1639 else: 1640 #Start of another record 1641 self._uhandle.saveline(line) 1642 break 1643 lines.append(line) 1644 1645 if query and "BLAST" not in lines[0]: 1646 #Cheat and re-insert the header 1647 #print "-"*50 1648 #print "".join(self._header) 1649 #print "-"*50 1650 #print "".join(lines) 1651 #print "-"*50 1652 lines = self._header + lines 1653 1654 if not lines: 1655 return None 1656 1657 data = ''.join(lines) 1658 if self._parser is not None: 1659 return self._parser.parse(File.StringHandle(data)) 1660 return data
1661
1662 - def __iter__(self):
1663 return iter(self.next, None)
1664
1665 -def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
1666 """Execute and retrieve data from standalone BLASTPALL as handles (OBSOLETE). 1667 1668 NOTE - This function is obsolete, you are encouraged to the command 1669 line wrapper Bio.Blast.Applications.BlastallCommandline instead. 1670 1671 Execute and retrieve data from blastall. blastcmd is the command 1672 used to launch the 'blastall' executable. program is the blast program 1673 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database 1674 to search against. infile is the path to the file containing 1675 the sequence to search with. 1676 1677 The return values are two handles, for standard output and standard error. 1678 1679 You may pass more parameters to **keywds to change the behavior of 1680 the search. Otherwise, optional values will be chosen by blastall. 1681 The Blast output is by default in XML format. Use the align_view keyword 1682 for output in a different format. 1683 1684 Scoring 1685 matrix Matrix to use. 1686 gap_open Gap open penalty. 1687 gap_extend Gap extension penalty. 1688 nuc_match Nucleotide match reward. (BLASTN) 1689 nuc_mismatch Nucleotide mismatch penalty. (BLASTN) 1690 query_genetic_code Genetic code for Query. 1691 db_genetic_code Genetic code for database. (TBLAST[NX]) 1692 1693 Algorithm 1694 gapped Whether to do a gapped alignment. T/F (not for TBLASTX) 1695 expectation Expectation value cutoff. 1696 wordsize Word size. 1697 strands Query strands to search against database.([T]BLAST[NX]) 1698 keep_hits Number of best hits from a region to keep. 1699 xdrop Dropoff value (bits) for gapped alignments. 1700 hit_extend Threshold for extending hits. 1701 region_length Length of region used to judge hits. 1702 db_length Effective database length. 1703 search_length Effective length of search space. 1704 1705 Processing 1706 filter Filter query sequence for low complexity (with SEG)? T/F 1707 believe_query Believe the query defline. T/F 1708 restrict_gi Restrict search to these GI's. 1709 nprocessors Number of processors to use. 1710 oldengine Force use of old engine T/F 1711 1712 Formatting 1713 html Produce HTML output? T/F 1714 descriptions Number of one-line descriptions. 1715 alignments Number of alignments. 1716 align_view Alignment view. Integer 0-11, 1717 passed as a string or integer. 1718 show_gi Show GI's in deflines? T/F 1719 seqalign_file seqalign file to output. 1720 outfile Output file for report. Filename to write to, if 1721 ommitted standard output is used (which you can access 1722 from the returned handles). 1723 """ 1724 1725 _security_check_parameters(keywds) 1726 1727 att2param = { 1728 'matrix' : '-M', 1729 'gap_open' : '-G', 1730 'gap_extend' : '-E', 1731 'nuc_match' : '-r', 1732 'nuc_mismatch' : '-q', 1733 'query_genetic_code' : '-Q', 1734 'db_genetic_code' : '-D', 1735 1736 'gapped' : '-g', 1737 'expectation' : '-e', 1738 'wordsize' : '-W', 1739 'strands' : '-S', 1740 'keep_hits' : '-K', 1741 'xdrop' : '-X', 1742 'hit_extend' : '-f', 1743 'region_length' : '-L', 1744 'db_length' : '-z', 1745 'search_length' : '-Y', 1746 1747 'program' : '-p', 1748 'database' : '-d', 1749 'infile' : '-i', 1750 'filter' : '-F', 1751 'believe_query' : '-J', 1752 'restrict_gi' : '-l', 1753 'nprocessors' : '-a', 1754 'oldengine' : '-V', 1755 1756 'html' : '-T', 1757 'descriptions' : '-v', 1758 'alignments' : '-b', 1759 'align_view' : '-m', 1760 'show_gi' : '-I', 1761 'seqalign_file' : '-O', 1762 'outfile' : '-o', 1763 } 1764 import warnings 1765 warnings.warn("This function is obsolete, you are encouraged to the command line wrapper Bio.Blast.Applications.BlastallCommandline instead.", PendingDeprecationWarning) 1766 from Applications import BlastallCommandline 1767 cline = BlastallCommandline(blastcmd) 1768 cline.set_parameter(att2param['program'], program) 1769 cline.set_parameter(att2param['database'], database) 1770 cline.set_parameter(att2param['infile'], infile) 1771 cline.set_parameter(att2param['align_view'], str(align_view)) 1772 for key, value in keywds.iteritems(): 1773 cline.set_parameter(att2param[key], str(value)) 1774 return _invoke_blast(cline)
1775 1776
1777 -def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
1778 """Execute and retrieve data from standalone BLASTPGP as handles (OBSOLETE). 1779 1780 NOTE - This function is obsolete, you are encouraged to the command 1781 line wrapper Bio.Blast.Applications.BlastpgpCommandline instead. 1782 1783 Execute and retrieve data from blastpgp. blastcmd is the command 1784 used to launch the 'blastpgp' executable. database is the path to the 1785 database to search against. infile is the path to the file containing 1786 the sequence to search with. 1787 1788 The return values are two handles, for standard output and standard error. 1789 1790 You may pass more parameters to **keywds to change the behavior of 1791 the search. Otherwise, optional values will be chosen by blastpgp. 1792 The Blast output is by default in XML format. Use the align_view keyword 1793 for output in a different format. 1794 1795 Scoring 1796 matrix Matrix to use. 1797 gap_open Gap open penalty. 1798 gap_extend Gap extension penalty. 1799 window_size Multiple hits window size. 1800 npasses Number of passes. 1801 passes Hits/passes. Integer 0-2. 1802 1803 Algorithm 1804 gapped Whether to do a gapped alignment. T/F 1805 expectation Expectation value cutoff. 1806 wordsize Word size. 1807 keep_hits Number of beset hits from a region to keep. 1808 xdrop Dropoff value (bits) for gapped alignments. 1809 hit_extend Threshold for extending hits. 1810 region_length Length of region used to judge hits. 1811 db_length Effective database length. 1812 search_length Effective length of search space. 1813 nbits_gapping Number of bits to trigger gapping. 1814 pseudocounts Pseudocounts constants for multiple passes. 1815 xdrop_final X dropoff for final gapped alignment. 1816 xdrop_extension Dropoff for blast extensions. 1817 model_threshold E-value threshold to include in multipass model. 1818 required_start Start of required region in query. 1819 required_end End of required region in query. 1820 1821 Processing 1822 XXX should document default values 1823 program The blast program to use. (PHI-BLAST) 1824 filter Filter query sequence for low complexity (with SEG)? T/F 1825 believe_query Believe the query defline? T/F 1826 nprocessors Number of processors to use. 1827 1828 Formatting 1829 html Produce HTML output? T/F 1830 descriptions Number of one-line descriptions. 1831 alignments Number of alignments. 1832 align_view Alignment view. Integer 0-11, 1833 passed as a string or integer. 1834 show_gi Show GI's in deflines? T/F 1835 seqalign_file seqalign file to output. 1836 align_outfile Output file for alignment. 1837 checkpoint_outfile Output file for PSI-BLAST checkpointing. 1838 restart_infile Input file for PSI-BLAST restart. 1839 hit_infile Hit file for PHI-BLAST. 1840 matrix_outfile Output file for PSI-BLAST matrix in ASCII. 1841 align_outfile Output file for alignment. Filename to write to, if 1842 ommitted standard output is used (which you can access 1843 from the returned handles). 1844 1845 align_infile Input alignment file for PSI-BLAST restart. 1846 1847 """ 1848 1849 import warnings 1850 warnings.warn("This function is obsolete, you are encouraged to the command line wrapper Bio.Blast.Applications.BlastpgpCommandline instead.", PendingDeprecationWarning) 1851 _security_check_parameters(keywds) 1852 1853 att2param = { 1854 'matrix' : '-M', 1855 'gap_open' : '-G', 1856 'gap_extend' : '-E', 1857 'window_size' : '-A', 1858 'npasses' : '-j', 1859 'passes' : '-P', 1860 1861 'gapped' : '-g', 1862 'expectation' : '-e', 1863 'wordsize' : '-W', 1864 'keep_hits' : '-K', 1865 'xdrop' : '-X', 1866 'hit_extend' : '-f', 1867 'region_length' : '-L', 1868 'db_length' : '-Z', 1869 'search_length' : '-Y', 1870 'nbits_gapping' : '-N', 1871 'pseudocounts' : '-c', 1872 'xdrop_final' : '-Z', 1873 'xdrop_extension' : '-y', 1874 'model_threshold' : '-h', 1875 'required_start' : '-S', 1876 'required_end' : '-H', 1877 1878 'program' : '-p', 1879 'database' : '-d', 1880 'infile' : '-i', 1881 'filter' : '-F', 1882 'believe_query' : '-J', 1883 'nprocessors' : '-a', 1884 1885 'html' : '-T', 1886 'descriptions' : '-v', 1887 'alignments' : '-b', 1888 'align_view' : '-m', 1889 'show_gi' : '-I', 1890 'seqalign_file' : '-O', 1891 'align_outfile' : '-o', 1892 'checkpoint_outfile' : '-C', 1893 'restart_infile' : '-R', 1894 'hit_infile' : '-k', 1895 'matrix_outfile' : '-Q', 1896 'align_infile' : '-B', 1897 } 1898 from Applications import BlastpgpCommandline 1899 cline = BlastpgpCommandline(blastcmd) 1900 cline.set_parameter(att2param['database'], database) 1901 cline.set_parameter(att2param['infile'], infile) 1902 cline.set_parameter(att2param['align_view'], str(align_view)) 1903 for key, value in keywds.iteritems(): 1904 cline.set_parameter(att2param[key], str(value)) 1905 return _invoke_blast(cline)
1906 1907
1908 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1909 """Execute and retrieve data from standalone RPS-BLAST as handles (OBSOLETE). 1910 1911 NOTE - This function is obsolete, you are encouraged to the command 1912 line wrapper Bio.Blast.Applications.RpsBlastCommandline instead. 1913 1914 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the 1915 command used to launch the 'rpsblast' executable. database is the path 1916 to the database to search against. infile is the path to the file 1917 containing the sequence to search with. 1918 1919 The return values are two handles, for standard output and standard error. 1920 1921 You may pass more parameters to **keywds to change the behavior of 1922 the search. Otherwise, optional values will be chosen by rpsblast. 1923 1924 Please note that this function will give XML output by default, by 1925 setting align_view to seven (i.e. command line option -m 7). 1926 You should use the NCBIXML.parse() function to read the resulting output. 1927 This is because NCBIStandalone.BlastParser() does not understand the 1928 plain text output format from rpsblast. 1929 1930 WARNING - The following text and associated parameter handling has not 1931 received extensive testing. Please report any errors we might have made... 1932 1933 Algorithm/Scoring 1934 gapped Whether to do a gapped alignment. T/F 1935 multihit 0 for multiple hit (default), 1 for single hit 1936 expectation Expectation value cutoff. 1937 range_restriction Range restriction on query sequence (Format: start,stop) blastp only 1938 0 in 'start' refers to the beginning of the sequence 1939 0 in 'stop' refers to the end of the sequence 1940 Default = 0,0 1941 xdrop Dropoff value (bits) for gapped alignments. 1942 xdrop_final X dropoff for final gapped alignment (in bits). 1943 xdrop_extension Dropoff for blast extensions (in bits). 1944 search_length Effective length of search space. 1945 nbits_gapping Number of bits to trigger gapping. 1946 protein Query sequence is protein. T/F 1947 db_length Effective database length. 1948 1949 Processing 1950 filter Filter query sequence for low complexity? T/F 1951 case_filter Use lower case filtering of FASTA sequence T/F, default F 1952 believe_query Believe the query defline. T/F 1953 nprocessors Number of processors to use. 1954 logfile Name of log file to use, default rpsblast.log 1955 1956 Formatting 1957 html Produce HTML output? T/F 1958 descriptions Number of one-line descriptions. 1959 alignments Number of alignments. 1960 align_view Alignment view. Integer 0-11, 1961 passed as a string or integer. 1962 show_gi Show GI's in deflines? T/F 1963 seqalign_file seqalign file to output. 1964 align_outfile Output file for alignment. Filename to write to, if 1965 ommitted standard output is used (which you can access 1966 from the returned handles). 1967 """ 1968 1969 import warnings 1970 warnings.warn("This function is obsolete, you are encouraged to the command line wrapper Bio.Blast.Applications.BlastrpsCommandline instead.", PendingDeprecationWarning) 1971 _security_check_parameters(keywds) 1972 1973 att2param = { 1974 'multihit' : '-P', 1975 'gapped' : '-g', 1976 'expectation' : '-e', 1977 'range_restriction' : '-L', 1978 'xdrop' : '-X', 1979 'xdrop_final' : '-Z', 1980 'xdrop_extension' : '-y', 1981 'search_length' : '-Y', 1982 'nbits_gapping' : '-N', 1983 'protein' : '-p', 1984 'db_length' : '-z', 1985 1986 'database' : '-d', 1987 'infile' : '-i', 1988 'filter' : '-F', 1989 'case_filter' : '-U', 1990 'believe_query' : '-J', 1991 'nprocessors' : '-a', 1992 'logfile' : '-l', 1993 1994 'html' : '-T', 1995 'descriptions' : '-v', 1996 'alignments' : '-b', 1997 'align_view' : '-m', 1998 'show_gi' : '-I', 1999 'seqalign_file' : '-O', 2000 'align_outfile' : '-o', 2001 } 2002 2003 from Applications import RpsBlastCommandline 2004 cline = RpsBlastCommandline(blastcmd) 2005 cline.set_parameter(att2param['database'], database) 2006 cline.set_parameter(att2param['infile'], infile) 2007 cline.set_parameter(att2param['align_view'], str(align_view)) 2008 for key, value in keywds.iteritems(): 2009 cline.set_parameter(att2param[key], str(value)) 2010 return _invoke_blast(cline)
2011 2012
2013 -def _re_search(regex, line, error_msg):
2014 m = re.search(regex, line) 2015 if not m: 2016 raise ValueError(error_msg) 2017 return m.groups()
2018
2019 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
2020 cols = line.split() 2021 2022 # Check to make sure number of columns is correct 2023 if ncols is not None and len(cols) != ncols: 2024 raise ValueError("I expected %d columns (got %d) in line\n%s" \ 2025 % (ncols, len(cols), line)) 2026 2027 # Check to make sure columns contain the correct data 2028 for k in expected: 2029 if cols[k] != expected[k]: 2030 raise ValueError("I expected '%s' in column %d in line\n%s" \ 2031 % (expected[k], k, line)) 2032 2033 # Construct the answer tuple 2034 results = [] 2035 for c in cols_to_get: 2036 results.append(cols[c]) 2037 return tuple(results)
2038 2039
2040 -def _safe_int(str):
2041 try: 2042 return int(str) 2043 except ValueError: 2044 # Something went wrong. Try to clean up the string. 2045 # Remove all commas from the string 2046 str = str.replace(',', '') 2047 # try again after removing commas. 2048 # Note int() will return a long rather than overflow 2049 try: 2050 return int(str) 2051 except ValueError: 2052 pass 2053 # Call float to handle things like "54.3", note could lose precision, e.g. 2054 # >>> int("5399354557888517312") 2055 # 5399354557888517312 2056 # >>> int(float("5399354557888517312")) 2057 # 5399354557888517120 2058 return int(float(str))
2059 2060
2061 -def _safe_float(str):
2062 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 2063 # float('e-172') does not produce an error on his platform. Thus, 2064 # we need to check the string for this condition. 2065 2066 # Sometimes BLAST leaves of the '1' in front of an exponent. 2067 if str and str[0] in ['E', 'e']: 2068 str = '1' + str 2069 try: 2070 return float(str) 2071 except ValueError: 2072 # Remove all commas from the string 2073 str = str.replace(',', '') 2074 # try again. 2075 return float(str)
2076 2077
2078 -def _invoke_blast(cline):
2079 """Start BLAST and returns handles for stdout and stderr (PRIVATE). 2080 2081 Expects a command line wrapper object from Bio.Blast.Applications 2082 """ 2083 import subprocess, sys 2084 blast_cmd = cline.program_name 2085 if not os.path.exists(blast_cmd): 2086 raise ValueError("BLAST executable does not exist at %s" % blast_cmd) 2087 #We don't need to supply any piped input, but we setup the 2088 #standard input pipe anyway as a work around for a python 2089 #bug if this is called from a Windows GUI program. For 2090 #details, see http://bugs.python.org/issue1124861 2091 blast_process = subprocess.Popen(str(cline), 2092 stdin=subprocess.PIPE, 2093 stdout=subprocess.PIPE, 2094 stderr=subprocess.PIPE, 2095 universal_newlines=True, 2096 shell=(sys.platform!="win32")) 2097 blast_process.stdin.close() 2098 return blast_process.stdout, blast_process.stderr
2099 2100
2101 -def _security_check_parameters(param_dict):
2102 """Look for any attempt to insert a command into a parameter. 2103 2104 e.g. blastall(..., matrix='IDENTITY -F 0; rm -rf /etc/passwd') 2105 2106 Looks for ";" or "&&" in the strings (Unix and Windows syntax 2107 for appending a command line), or ">", "<" or "|" (redirection) 2108 and if any are found raises an exception. 2109 """ 2110 for key, value in param_dict.iteritems(): 2111 str_value = str(value) # Could easily be an int or a float 2112 for bad_str in [";", "&&", ">", "<", "|"]: 2113 if bad_str in str_value: 2114 raise ValueError("Rejecting suspicious argument for %s" % key)
2115
2116 -class _BlastErrorConsumer(_BlastConsumer):
2117 - def __init__(self):
2119 - def noevent(self, line):
2120 if line.find("Query must be at least wordsize") != -1: 2121 raise ShortQueryBlastError("Query must be at least wordsize") 2122 # Now pass the line back up to the superclass. 2123 method = getattr(_BlastConsumer, 'noevent', 2124 _BlastConsumer.__getattr__(self, 'noevent')) 2125 method(line)
2126
2127 -class BlastErrorParser(AbstractParser):
2128 """Attempt to catch and diagnose BLAST errors while parsing. 2129 2130 This utilizes the BlastParser module but adds an additional layer 2131 of complexity on top of it by attempting to diagnose ValueErrors 2132 that may actually indicate problems during BLAST parsing. 2133 2134 Current BLAST problems this detects are: 2135 o LowQualityBlastError - When BLASTing really low quality sequences 2136 (ie. some GenBank entries which are just short streches of a single 2137 nucleotide), BLAST will report an error with the sequence and be 2138 unable to search with this. This will lead to a badly formatted 2139 BLAST report that the parsers choke on. The parser will convert the 2140 ValueError to a LowQualityBlastError and attempt to provide useful 2141 information. 2142 2143 """
2144 - def __init__(self, bad_report_handle = None):
2145 """Initialize a parser that tries to catch BlastErrors. 2146 2147 Arguments: 2148 o bad_report_handle - An optional argument specifying a handle 2149 where bad reports should be sent. This would allow you to save 2150 all of the bad reports to a file, for instance. If no handle 2151 is specified, the bad reports will not be saved. 2152 """ 2153 self._bad_report_handle = bad_report_handle 2154 2155 #self._b_parser = BlastParser() 2156 self._scanner = _Scanner() 2157 self._consumer = _BlastErrorConsumer()
2158
2159 - def parse(self, handle):
2160 """Parse a handle, attempting to diagnose errors. 2161 """ 2162 results = handle.read() 2163 2164 try: 2165 self._scanner.feed(File.StringHandle(results), self._consumer) 2166 except ValueError, msg: 2167 # if we have a bad_report_file, save the info to it first 2168 if self._bad_report_handle: 2169 # send the info to the error handle 2170 self._bad_report_handle.write(results) 2171 2172 # now we want to try and diagnose the error 2173 self._diagnose_error( 2174 File.StringHandle(results), self._consumer.data) 2175 2176 # if we got here we can't figure out the problem 2177 # so we should pass along the syntax error we got 2178 raise 2179 return self._consumer.data
2180
2181 - def _diagnose_error(self, handle, data_record):
2182 """Attempt to diagnose an error in the passed handle. 2183 2184 Arguments: 2185 o handle - The handle potentially containing the error 2186 o data_record - The data record partially created by the consumer. 2187 """ 2188 line = handle.readline() 2189 2190 while line: 2191 # 'Searchingdone' instead of 'Searching......done' seems 2192 # to indicate a failure to perform the BLAST due to 2193 # low quality sequence 2194 if line.startswith('Searchingdone'): 2195 raise LowQualityBlastError("Blast failure occured on query: ", 2196 data_record.query) 2197 line = handle.readline()
2198