Package Bio :: Package Blast :: Module NCBIWWW
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIWWW

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Patched by Brad Chapman. 
  7  # Chris Wroe added modifications for work in myGrid 
  8   
  9  """ 
 10  This module provides code to work with the WWW version of BLAST 
 11  provided by the NCBI. 
 12  http://blast.ncbi.nlm.nih.gov/ 
 13   
 14  Functions: 
 15  qblast        Do a BLAST search using the QBLAST API. 
 16   
 17  Deprecated classes: 
 18  BlastParser   Parses output from WWW blast. 
 19  _Scanner      Scans output from NCBI's BLAST WWW server. 
 20   
 21   
 22  """ 
 23  import re 
 24   
 25  try: 
 26      import cStringIO as StringIO 
 27  except ImportError: 
 28      import StringIO 
 29   
 30  from Bio.ParserSupport import * 
 31   
32 -class BlastParser(AbstractParser):
33 """Parses WWW BLAST data into a Record.Blast object (DEPRECATED). 34 35 This is a parser for the NCBI's HTML (web page) BLAST output. 36 """
37 - def __init__(self):
38 """Create a BlastParser object (DEPRECATED).""" 39 import warnings 40 warnings.warn("Bio.Blast.NCBIWWW.BlastParser is deprecated." \ 41 + " We recommend you use the XML output with" \ 42 + " the parser in Bio.Blast.NCBIXML instead.", 43 DeprecationWarning) 44 45 import NCBIStandalone 46 self._scanner = _Scanner() 47 self._consumer = SGMLStrippingConsumer(NCBIStandalone._BlastConsumer())
48
49 - def parse(self, handle):
50 """parse(self, handle)""" 51 self._scanner.feed(handle, self._consumer) 52 return self._consumer.data
53
54 -class _Scanner:
55 """Scanner for the HTML BLAST parser (PRIVATE, DEPRECATED). 56 57 Scan BLAST output from NCBI's web server at: 58 http://www.ncbi.nlm.nih.gov/BLAST/ 59 60 Tested with BLAST v2.0.10 61 62 Methods: 63 feed Feed data into the scanner. 64 """
65 - def feed(self, handle, consumer):
66 """S.feed(handle, consumer) 67 68 Feed in a BLAST report for scanning. handle is a file-like 69 object that contains the BLAST report. consumer is a Consumer 70 object that will receive events as the report is scanned. 71 72 """ 73 from Bio import File 74 75 # This stuff appears in 2.0.12. 76 # <p><!-- 77 # QBlastInfoBegin 78 # Status=READY 79 # QBlastInfoEnd 80 # --><p> 81 82 # <HTML> 83 # <HEAD> 84 # <TITLE>BLAST Search Results </TITLE> 85 # </HEAD> 86 # <BODY BGCOLOR="#FFFFFF" LINK="#0000FF" VLINK="#660099" ALINK="#660099 87 # <A HREF="http://www.ncbi.nlm.nih.gov/BLAST/blast_form.map"> <IMG SRC= 88 # <BR><BR><PRE> 89 90 # BLAST Formatted information 91 92 # 93 # </BODY> 94 # </HTML> 95 # </BODY> 96 # </HTML> 97 if isinstance(handle, File.UndoHandle): 98 uhandle = handle 99 else: 100 uhandle = File.UndoHandle(handle) 101 # Read HTML formatting up to the "BLAST" version line. 102 read_and_call_until(uhandle, consumer.noevent, 103 has_re=re.compile(r'<b>.?BLAST')) 104 105 self._scan_header(uhandle, consumer) 106 self._scan_rounds(uhandle, consumer) 107 self._scan_database_report(uhandle, consumer) 108 self._scan_parameters(uhandle, consumer) 109 110 # Read HTML footer information. 111 while uhandle.peekline(): 112 read_and_call(uhandle, consumer.noevent)
113
114 - def _scan_header(self, uhandle, consumer):
115 # <b>BLASTP 2.0.10 [Aug-26-1999]</b> 116 # 117 # 118 # <b><a href="http://www.ncbi.nlm.nih.gov/htbin- 119 # post/Entrez/query?uid=9254694&form=6&db=m&Dopt=r">Reference</a>:</b> 120 # Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch&auml;ffer, 121 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 122 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 123 # programs", Nucleic Acids Res. 25:3389-3402. 124 # <p> 125 # <b>Query=</b> gi|120291|sp|P21297|FLBT_CAUCR FLBT PROTEIN. 126 # (141 letters) 127 # 128 # <b>Database:</b> Non-redundant SwissProt sequences 129 # 82,258 sequences; 29,652,561 total letters 130 # 131 # <p> <p>If you have any problems or questions with the results of this 132 133 # If there are hits, and Graphical Overview was selected: 134 # <FORM NAME="BLASTFORM"> 135 # </PRE> 136 # <CENTER> 137 # <H3><a href="/BLAST/newoptions.html#graphical-overview"> Distribution 138 # <input name=defline size=80 value="Mouse-over to show defline and sco 139 # </CENTER> 140 # <map name=img_map> 141 # <area shape=rect coords=69,101,476,106 href="#120291" ONMOUSEOVER='do 142 # <area shape=rect coords=156,108,305,113 href="#3024946" ONMOUSEOVER=' 143 # </map> 144 # <CENTER> 145 # <IMG WIDTH=529 HEIGHT=115 USEMAP=#img_map BORDER=1 SRC="nph-getgif.cg 146 # <HR> 147 # <PRE> XXX 148 consumer.start_header() 149 150 # Read the "BLAST" version line and the following blanks. 151 read_and_call(uhandle, consumer.version, contains='BLAST') 152 read_and_call_while(uhandle, consumer.noevent, blank=1) 153 154 # Read the reference lines and the '<p>' line. 155 # TBLASTN 2.2.6 has a blank line instead of a "<p>". 156 while 1: 157 line = uhandle.readline() 158 if line[:3] == '<p>' or not line.strip(): 159 consumer.noevent(line) 160 break 161 consumer.reference(line) 162 163 # Read the RID line, for version 2.0.12 (2.0.11?) and above. 164 attempt_read_and_call(uhandle, consumer.noevent, start='RID') 165 # Brad Chapman noticed a '<p>' line in BLASTN 2.1.1; this line 166 # seems to have disappeared again. 167 # attempt_read_and_call(uhandle, consumer.noevent, start='<p>') 168 attempt_read_and_call(uhandle, consumer.noevent) 169 170 # Apparently, there's some discrepancy between whether the 171 # Query or database comes first. Usually the Query does, but 172 # Brad noticed a case where the database came first. 173 if uhandle.peekline().find("Query=") >= 0: 174 self._scan_query_info(uhandle, consumer) 175 self._scan_database_info(uhandle, consumer) 176 else: 177 self._scan_database_info(uhandle, consumer) 178 self._scan_query_info(uhandle, consumer) 179 read_and_call_while(uhandle, consumer.noevent, blank=1) 180 consumer.end_header()
181
182 - def _scan_blastform(self, uhandle, consumer):
183 if attempt_read_and_call(uhandle, consumer.noevent, 184 contains="BLASTFORM"): 185 while 1: 186 line = uhandle.peekline() 187 if is_blank_line(line): 188 break 189 elif "Query=" in line: 190 break 191 consumer.noevent(uhandle.readline())
192
193 - def _scan_database_info(self, uhandle, consumer):
194 attempt_read_and_call(uhandle, consumer.noevent, start='<p>') 195 read_and_call(uhandle, consumer.database_info, contains='Database') 196 # Sagar Damle reported that databases can consist of multiple lines. 197 # But, trickily enough, sometimes the second line can also have the 198 # word sequences in it. Try to use 'sequences;' (with a semicolon) 199 read_and_call_until(uhandle, consumer.database_info, 200 contains='sequences;') 201 read_and_call(uhandle, consumer.database_info, contains='sequences;') 202 read_and_call(uhandle, consumer.noevent, blank=1) 203 attempt_read_and_call(uhandle, consumer.noevent, 204 contains='problems or questions') 205 self._scan_blastform(uhandle, consumer) 206 207 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 208 if attempt_read_and_call(uhandle, consumer.noevent, 209 start="<table border=0 width=600"): 210 read_and_call_until(uhandle, consumer.noevent, 211 contains="</table>") 212 consumer.noevent(uhandle.readline()) 213 read_and_call(uhandle, consumer.noevent, blank=1) 214 215 attempt_read_and_call(uhandle, consumer.noevent, start="<p>") 216 217 if attempt_read_and_call(uhandle, consumer.noevent, 218 contains="Taxonomy reports"): 219 read_and_call(uhandle, consumer.noevent, start="<BR>") 220 attempt_read_and_call(uhandle, consumer.noevent, start="<PRE>") 221 222 # </PRE> 223 # <!-- Progress msg from the server 500 7--> 224 # <!-- Progress msg from the server 1000 15--> 225 # <!-- Progress msg from the server 1500 21--> 226 # ... 227 # <PRE><HR><BR><b>Query=</b> test 228 # (60 letters) 229 if attempt_read_and_call(uhandle, consumer.noevent, start="</PRE>"): 230 read_and_call_until(uhandle, consumer.noevent, start="<PRE>") 231 while 1: 232 line = uhandle.peekline() 233 if not line[:5] == "<PRE>" or line.find("Query=") >= 0: 234 break 235 read_and_call(uhandle, consumer.noevent, start="<PRE>") 236 237 read_and_call_while(uhandle, consumer.noevent, blank=1)
238
239 - def _scan_query_info(self, uhandle, consumer):
240 # Read the Query lines and the following blank line. 241 read_and_call(uhandle, consumer.query_info, contains='Query=') 242 read_and_call_until(uhandle, consumer.query_info, blank=1) 243 read_and_call_while(uhandle, consumer.noevent, blank=1) 244 if attempt_read_and_call(uhandle, consumer.noevent, start="<PRE>"): 245 read_and_call_while(uhandle, consumer.noevent, blank=1) 246 self._scan_blastform(uhandle, consumer)
247
248 - def _scan_rounds(self, uhandle, consumer):
249 self._scan_descriptions(uhandle, consumer) 250 self._scan_alignments(uhandle, consumer)
251
252 - def _scan_descriptions(self, uhandle, consumer):
253 consumer.start_descriptions() 254 255 # Three things can happen here: 256 # 1. line contains 'Score E' 257 # 2. line contains "No significant similarity" 258 # 3. no descriptions 259 if not attempt_read_and_call( 260 uhandle, consumer.description_header, 261 has_re=re.compile(r"Score {4,5}E")): 262 # Either case 2 or 3. Look for "No hits found". 263 attempt_read_and_call(uhandle, consumer.no_hits, 264 contains='No significant similarity') 265 read_and_call_while(uhandle, consumer.noevent, blank=1) 266 consumer.end_descriptions() 267 # Stop processing. 268 return 269 # Sequences producing significant alignments: 270 # 271 # <a href="http://www.ncbi.nlm.nih.gov:80/entrez/query.fcgi?cmd=Ret 272 # <a href="http://www.ncbi.nlm.nih.gov:80/entrez/query.fcgi?cmd=Ret 273 # 274 # Read the score header lines and a blank line. 275 read_and_call(uhandle, consumer.description_header, 276 start='Sequences producing') 277 read_and_call(uhandle, consumer.noevent, blank=1) 278 279 # Read the descriptions 280 # The description contains at least an <a href> into the alignments. 281 # What is no alignments are chosen? 282 read_and_call_while(uhandle, consumer.description, 283 blank=0, contains='<a') 284 285 # two choices here, either blank lines or a </PRE> 286 if not attempt_read_and_call(uhandle, consumer.noevent, 287 contains='</PRE>'): 288 read_and_call_while(uhandle, consumer.noevent, blank=1) 289 290 consumer.end_descriptions()
291
292 - def _scan_alignments(self, uhandle, consumer):
293 # Check to see whether I'm at an alignment or database report. 294 # Possibilities: 295 # 1) BLASTP 2.0.14, pairwise alignment 296 # <CENTER><b><FONT color="green">Alignments</FONT></b></CENTER> 297 # <PRE> 298 # ><a name = 121837></a><a href="http://www.ncbi.nlm.nih.gov:80/entre 299 # 2) BLASTP 2.0.10, pairwise alignment 300 # <PRE> 301 # <a name = 120291> </a><a href="http://www.ncbi.nlm.nih.gov:80/entre 302 # 3) BLASTP 2.0.10, master-slave 303 # <PRE> 304 # blast_tmp 1 MFQQIGAVQAKSGTDEPAHPCEKFPPERKCEAVFWKPLPRHEAREILLAARK 305 # 4) BLASTP 2.0.10, 2.0.14, database 306 # <PRE> 307 # Database: Non-redundant SwissProt sequences 308 # 5) BLASTX 2.2.4, pairwise alignment 309 # <CENTER><b><FONT color="green">Alignments</FONT></b></CENTER> 310 # </form> 311 # <script src="blastResult.js"></script><table border="0"><tr><td><FO 312 # <PRE> 313 # 6) Qblast 2.2.10, database (no 'Database' line) 314 # <PRE> 315 # Lambda K H 316 317 # Get the first two lines and examine them. 318 line1 = safe_readline(uhandle) 319 line2 = safe_readline(uhandle) 320 uhandle.saveline(line2) 321 uhandle.saveline(line1) 322 323 is_pairwise = is_masterslave = 0 324 if 'Alignments' in line2: 325 is_pairwise = 1 326 elif line2.startswith(' Database'): 327 pass 328 elif line2.startswith('Lambda K H'): 329 pass 330 elif line2.startswith('blast_tmp'): 331 is_masterslave = 1 332 elif line1.startswith('<PRE>'): 333 is_pairwise = 1 334 else: 335 raise ValueError("Cannot resolve location at lines:\n%s\n%s" \ 336 % (line1, line2)) 337 338 if is_pairwise: 339 self._scan_pairwise_alignments(uhandle, consumer) 340 elif is_masterslave: 341 self._scan_masterslave_alignment(uhandle, consumer)
342
343 - def _scan_pairwise_alignments(self, uhandle, consumer):
344 while 1: 345 read_and_call_until(uhandle, consumer.noevent, start='<PRE>') 346 347 # The first line is <PRE>. Check the second line to see if 348 # I'm still at an alignment. 349 line1 = safe_readline(uhandle) 350 line2 = safe_readline(uhandle) 351 uhandle.saveline(line2) 352 uhandle.saveline(line1) 353 # Lambda is for Q-blast results, which do not have a Database line 354 if line1.find('Database') >= 0 or line2.find("Database") >= 0 \ 355 or line2.find('Lambda K H') >= 0: 356 break 357 358 # Occasionally, there's a bug where the alignment_header and 359 # hsp_header are skipped, leaving only the hsp_alignment. 360 # Detect this and handle it accordingly. 361 if line2[:6] == 'Query:': 362 self._scan_abbreviated_pairwise_alignment(uhandle, consumer) 363 else: 364 self._scan_one_pairwise_alignment(uhandle, consumer)
365
366 - def _scan_abbreviated_pairwise_alignment(self, uhandle, consumer):
367 # Sometimes all header information is skipped, leaving 368 # only the raw alignments. I believe this is a bug because 369 # without the header information, you lose vital information such 370 # as score, target sequence id, etc. 371 # Format: 372 # <PRE> 373 # hsp_alignment 374 375 consumer.start_alignment() 376 consumer.start_hsp() 377 read_and_call(uhandle, consumer.noevent, start='<PRE>') 378 self._scan_hsp_alignment(uhandle, consumer) 379 consumer.end_hsp() 380 consumer.end_alignment()
381
382 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
383 # Alignment format: 384 # <CENTER><b><FONT color="green">Alignments</FONT></b></CENTER> 385 # (BLAST 2.0.14) 386 # <PRE> 387 # alignment_header 388 # hsp_header 389 # hsp_alignment 390 # [...] 391 # The hsp_header and hsp_alignment blocks can be repeated. 392 393 consumer.start_alignment() 394 read_and_call(uhandle, consumer.noevent, start='<PRE>') 395 self._scan_alignment_header(uhandle, consumer) 396 397 # Scan a bunch of score/alignment's. 398 while 1: 399 # An HSP header starts with ' Score'. 400 # However, if the HSP header is not the first one in the 401 # alignment, there will be a '<PRE>' line first. Therefore, 402 # I will need to check either of the first two lines to 403 # see if I'm at an HSP header. 404 line1 = safe_readline(uhandle) 405 line2 = safe_readline(uhandle) 406 line3 = safe_readline(uhandle) 407 uhandle.saveline(line3) 408 uhandle.saveline(line2) 409 uhandle.saveline(line1) 410 # There can be <a> links in front of 'Score' 411 rea = re.compile(r"</?a[^>]*>") 412 line1 = rea.sub("", line1) 413 line2 = rea.sub("", line2) 414 line3 = rea.sub("", line3) 415 if line1[:6] != ' Score' and line2[:6] != ' Score' and \ 416 line3[:6] != ' Score': 417 break 418 self._scan_hsp(uhandle, consumer) 419 420 consumer.end_alignment()
421
422 - def _scan_alignment_header(self, uhandle, consumer):
423 # <a name = 120291> </a><a href="http://www.ncbi.nlm.nih.gov:80/entrez/ 424 # Length = 141 425 # 426 while 1: 427 line = safe_readline(uhandle) 428 if line.lstrip().startswith('Length ='): 429 consumer.length(line) 430 break 431 elif is_blank_line(line): 432 # Check to make sure I haven't missed the Length line 433 raise ValueError("I missed the Length in an alignment header") 434 consumer.title(line) 435 436 if not attempt_read_and_call(uhandle, consumer.noevent, 437 start=' '): 438 read_and_call(uhandle, consumer.noevent, blank=1)
439
440 - def _scan_hsp(self, uhandle, consumer):
441 consumer.start_hsp() 442 self._scan_hsp_header(uhandle, consumer) 443 self._scan_hsp_alignment(uhandle, consumer) 444 consumer.end_hsp()
445
446 - def _scan_hsp_header(self, uhandle, consumer):
447 # If the HSP is not the first one within an alignment, includes: 448 # <PRE> 449 450 # Score = 22.7 bits (47), Expect = 2.5 451 # Identities = 10/36 (27%), Positives = 18/36 (49%) 452 # Strand = Plus / Plus 453 # Frame = +3 454 # 455 456 attempt_read_and_call(uhandle, consumer.noevent, start='<PRE>') 457 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 458 read_and_call(uhandle, consumer.score, 459 has_re=re.compile(r'^ (<a[^>]*></a>)*Score')) 460 read_and_call(uhandle, consumer.identities, start=' Identities') 461 # BLASTN 462 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand') 463 # BLASTX, TBLASTN, TBLASTX 464 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame') 465 read_and_call(uhandle, consumer.noevent, blank=1)
466
467 - def _scan_hsp_alignment(self, uhandle, consumer):
468 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 469 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 470 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 471 # 472 # Query: 64 AEKILIKR 71 473 # I +K 474 # Sbjct: 70 PNIIQLKD 77 475 # </PRE> 476 # 477 # 478 479 while 1: 480 # Blastn adds an extra line filled with spaces before Query 481 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 482 read_and_call(uhandle, consumer.query, start='Query') 483 read_and_call(uhandle, consumer.align, start=' ') 484 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 485 if not attempt_read_and_call(uhandle, consumer.noevent, blank=1): 486 break 487 read_and_call(uhandle, consumer.noevent, start='</PRE>') 488 read_and_call_while(uhandle, consumer.noevent, blank=1)
489
490 - def _scan_masterslave_alignment(self, uhandle, consumer):
491 consumer.start_alignment() 492 read_and_call(uhandle, consumer.noevent, start='<PRE>') 493 while 1: 494 line = safe_readline(uhandle) 495 if is_blank_line(line): 496 consumer.noevent(line) 497 elif line[:6] == '</PRE>': 498 consumer.noevent(line) 499 break 500 else: 501 consumer.multalign(line) 502 read_and_call_while(uhandle, consumer.noevent, blank=1) 503 consumer.end_alignment()
504
505 - def _scan_database_report(self, uhandle, consumer):
506 # <PRE> 507 # Database: Non-redundant SwissProt sequences 508 # Posted date: Dec 18, 1999 8:26 PM 509 # Number of letters in database: 29,652,561 510 # Number of sequences in database: 82,258 511 # 512 # Lambda K H 513 # 0.317 0.133 0.395 514 # 515 # Gapped 516 # Lambda K H 517 # 0.270 0.0470 0.230 518 # 519 520 # qblast (BLASTN 2.2.10) does not give the Database: bits before the Lambda 521 # information, so that needs to be skipped 522 523 consumer.start_database_report() 524 525 # TBALSTN 2.2.6 526 # <PRE> Database: /tmp/affyA.fasta 527 line = uhandle.peekline() 528 # only look for database information if we aren't already at the 529 # Lambda bits 530 if line.find("Database") < 0: 531 read_and_call(uhandle, consumer.noevent, start='<PRE>') 532 line2 = uhandle.peekline() 533 if line2.find("Lambda K H") < 0: 534 read_and_call(uhandle, consumer.database, contains=' Database') 535 read_and_call_until(uhandle, consumer.database, contains="Posted") 536 read_and_call(uhandle, consumer.posted_date, start=' Posted') 537 read_and_call(uhandle, consumer.num_letters_in_database, 538 start=' Number of letters') 539 read_and_call(uhandle, consumer.num_sequences_in_database, 540 start=' Number of sequences') 541 read_and_call(uhandle, consumer.noevent, start=' ') 542 543 read_and_call(uhandle, consumer.noevent, start='Lambda') 544 read_and_call(uhandle, consumer.ka_params) 545 read_and_call(uhandle, consumer.noevent, blank=1) 546 547 # not BLASTP 548 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 549 # not TBLASTX 550 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 551 read_and_call(uhandle, consumer.ka_params_gap) 552 read_and_call_while(uhandle, consumer.noevent, blank=1) 553 554 consumer.end_database_report()
555
556 - def _scan_parameters(self, uhandle, consumer):
557 # Matrix: BLOSUM62 558 # Number of Hits to DB: 1st pass: 41542626, 2nd pass: 9765 559 # Number of Sequences: 1st pass: 89405, 2nd pass: 84 560 # Number of extensions: 1st pass: 500847, 2nd pass: 6747 561 # Number of successful extensions: 1st pass: 14, 2nd pass: 49 562 # Number of sequences better than 10.0: 20 563 # length of query: 205 564 # length of database: 10,955,950 565 # effective HSP length: 46 566 # effective length of query: 158 567 # effective length of database: 6,843,320 568 # effective search space: 1081244560 569 # effective search space used: 1081244560 570 # frameshift window, decay const: 50, 0.5 571 # T: 13 572 # A: 40 573 # X1: 16 ( 7.3 bits) 574 # X2: 0 ( 0.0 bits) 575 # S1: 41 (21.7 bits) 576 # S2: 52 (26.7 bits) 577 # 578 # </PRE> 579 580 # 6/3/2001, </PRE> is gone, replaced by </form> 581 582 583 consumer.start_parameters() 584 585 # qblast doesn't have Matrix line 586 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 587 # not TBLASTX 588 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 589 590 # in qblast the Number of Hits and Number of Sequences lines are 591 # reversed 592 if attempt_read_and_call(uhandle, consumer.num_hits, 593 start='Number of Hits'): 594 read_and_call(uhandle, consumer.num_sequences, 595 start='Number of Sequences') 596 else: 597 read_and_call(uhandle, consumer.num_sequences, 598 start='Number of Sequences') 599 read_and_call(uhandle, consumer.num_hits, 600 start='Number of Hits') 601 602 read_and_call(uhandle, consumer.num_extends, 603 start='Number of extensions') 604 read_and_call(uhandle, consumer.num_good_extends, 605 start='Number of successful') 606 607 read_and_call(uhandle, consumer.num_seqs_better_e, 608 start='Number of sequences') 609 610 # not BLASTN, TBLASTX 611 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 612 start="Number of HSP's better"): 613 # for qblast order of HSP info is changed 614 if attempt_read_and_call(uhandle, consumer.hsps_prelim_gapped, 615 start="Number of HSP's successfully"): 616 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 617 start="Number of HSP's that") 618 read_and_call(uhandle, consumer.hsps_gapped, 619 start="Number of HSP's gapped") 620 else: 621 read_and_call(uhandle, consumer.no_event, 622 start="Number of HSP's gapped") 623 read_and_call(uhandle, consumer.no_event, 624 start="Number of HSP's successfully") 625 read_and_call(uhandle, consumer.no_event, 626 start="Number of extra gapped") 627 628 # QBlast has different capitalization on the Length info: 629 if attempt_read_and_call(uhandle, consumer.query_length, 630 start='Length of query'): 631 read_and_call(uhandle, consumer.database_length, 632 start='Length of database') 633 read_and_call(uhandle, consumer.no_event, 634 start='Length adjustment') 635 attempt_read_and_call(uhandle, consumer.effective_query_length, 636 start='Effective length of query') 637 read_and_call(uhandle, consumer.effective_database_length, 638 start='Effective length of database') 639 attempt_read_and_call(uhandle, consumer.effective_search_space, 640 start='Effective search space:') 641 attempt_read_and_call(uhandle, consumer.effective_search_space_used, 642 start='Effective search space used') 643 644 else: 645 attempt_read_and_call(uhandle, consumer.query_length, 646 start='length of query') 647 read_and_call(uhandle, consumer.database_length, 648 start='length of database') 649 read_and_call(uhandle, consumer.effective_hsp_length, 650 start='effective HSP') 651 attempt_read_and_call(uhandle, consumer.effective_query_length, 652 start='effective length of query') 653 read_and_call(uhandle, consumer.effective_database_length, 654 start='effective length of database') 655 attempt_read_and_call(uhandle, consumer.effective_search_space, 656 start='effective search space:') 657 attempt_read_and_call(uhandle, consumer.effective_search_space_used, 658 start='effective search space used') 659 660 # BLASTX, TBLASTN, TBLASTX 661 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 662 attempt_read_and_call(uhandle, consumer.threshold, start='T') 663 read_and_call(uhandle, consumer.window_size, start='A') 664 read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 665 read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 666 # not BLASTN, TBLASTX 667 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 668 start='X3') 669 read_and_call(uhandle, consumer.gap_trigger, start='S1') 670 attempt_read_and_call(uhandle, consumer.blast_cutoff, start='S2') 671 672 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 673 attempt_read_and_call(uhandle, consumer.noevent, start="</PRE>") 674 attempt_read_and_call(uhandle, consumer.noevent, start="</form>") 675 676 consumer.end_parameters()
677
678 -def qblast(program, database, sequence, 679 auto_format=None,composition_based_statistics=None, 680 db_genetic_code=None,endpoints=None,entrez_query='(none)', 681 expect=10.0,filter=None,gapcosts=None,genetic_code=None, 682 hitlist_size=50,i_thresh=None,layout=None,lcase_mask=None, 683 matrix_name=None,nucl_penalty=None,nucl_reward=None, 684 other_advanced=None,perc_ident=None,phi_pattern=None, 685 query_file=None,query_believe_defline=None,query_from=None, 686 query_to=None,searchsp_eff=None,service=None,threshold=None, 687 ungapped_alignment=None,word_size=None, 688 alignments=500,alignment_view=None,descriptions=500, 689 entrez_links_new_window=None,expect_low=None,expect_high=None, 690 format_entrez_query=None,format_object=None,format_type='XML', 691 ncbi_gi=None,results_file=None,show_overview=None 692 ):
693 """Do a BLAST search using the QBLAST server at NCBI. 694 695 Supports all parameters of the qblast API for Put and Get. 696 Some useful parameters: 697 program blastn, blastp, blastx, tblastn, or tblastx (lower case) 698 database Which database to search against (e.g. "nr"). 699 sequence The sequence to search. 700 ncbi_gi TRUE/FALSE whether to give 'gi' identifier. 701 descriptions Number of descriptions to show. Def 500. 702 alignments Number of alignments to show. Def 500. 703 expect An expect value cutoff. Def 10.0. 704 matrix_name Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45). 705 filter "none" turns off filtering. Default no filtering 706 format_type "HTML", "Text", "ASN.1", or "XML". Def. "XML". 707 entrez_query Entrez query to limit Blast search 708 hitlist_size Number of hits to return. Default 50 709 710 This function does no checking of the validity of the parameters 711 and passes the values to the server as is. More help is available at: 712 http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html 713 714 """ 715 import urllib, urllib2 716 import time 717 718 assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx'] 719 720 # Format the "Put" command, which sends search requests to qblast. 721 # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007 722 parameters = [ 723 ('AUTO_FORMAT',auto_format), 724 ('COMPOSITION_BASED_STATISTICS',composition_based_statistics), 725 ('DATABASE',database), 726 ('DB_GENETIC_CODE',db_genetic_code), 727 ('ENDPOINTS',endpoints), 728 ('ENTREZ_QUERY',entrez_query), 729 ('EXPECT',expect), 730 ('FILTER',filter), 731 ('GAPCOSTS',gapcosts), 732 ('GENETIC_CODE',genetic_code), 733 ('HITLIST_SIZE',hitlist_size), 734 ('I_THRESH',i_thresh), 735 ('LAYOUT',layout), 736 ('LCASE_MASK',lcase_mask), 737 ('MATRIX_NAME',matrix_name), 738 ('NUCL_PENALTY',nucl_penalty), 739 ('NUCL_REWARD',nucl_reward), 740 ('OTHER_ADVANCED',other_advanced), 741 ('PERC_IDENT',perc_ident), 742 ('PHI_PATTERN',phi_pattern), 743 ('PROGRAM',program), 744 ('QUERY',sequence), 745 ('QUERY_FILE',query_file), 746 ('QUERY_BELIEVE_DEFLINE',query_believe_defline), 747 ('QUERY_FROM',query_from), 748 ('QUERY_TO',query_to), 749 ('SEARCHSP_EFF',searchsp_eff), 750 ('SERVICE',service), 751 ('THRESHOLD',threshold), 752 ('UNGAPPED_ALIGNMENT',ungapped_alignment), 753 ('WORD_SIZE',word_size), 754 ('CMD', 'Put'), 755 ] 756 query = [x for x in parameters if x[1] is not None] 757 message = urllib.urlencode(query) 758 759 # Send off the initial query to qblast. 760 # Note the NCBI do not currently impose a rate limit here, other 761 # than the request not to make say 50 queries at once using multiple 762 # threads. 763 request = urllib2.Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi", 764 message, 765 {"User-Agent":"BiopythonClient"}) 766 handle = urllib2.urlopen(request) 767 768 # Format the "Get" command, which gets the formatted results from qblast 769 # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007 770 rid, rtoe = _parse_qblast_ref_page(handle) 771 parameters = [ 772 ('ALIGNMENTS',alignments), 773 ('ALIGNMENT_VIEW',alignment_view), 774 ('DESCRIPTIONS',descriptions), 775 ('ENTREZ_LINKS_NEW_WINDOW',entrez_links_new_window), 776 ('EXPECT_LOW',expect_low), 777 ('EXPECT_HIGH',expect_high), 778 ('FORMAT_ENTREZ_QUERY',format_entrez_query), 779 ('FORMAT_OBJECT',format_object), 780 ('FORMAT_TYPE',format_type), 781 ('NCBI_GI',ncbi_gi), 782 ('RID',rid), 783 ('RESULTS_FILE',results_file), 784 ('SERVICE',service), 785 ('SHOW_OVERVIEW',show_overview), 786 ('CMD', 'Get'), 787 ] 788 query = [x for x in parameters if x[1] is not None] 789 message = urllib.urlencode(query) 790 791 # Poll NCBI until the results are ready. Use a 3 second wait 792 delay = 3.0 793 previous = time.time() 794 while True: 795 current = time.time() 796 wait = previous + delay - current 797 if wait > 0: 798 time.sleep(wait) 799 previous = current + wait 800 else: 801 previous = current 802 803 request = urllib2.Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi", 804 message, 805 {"User-Agent":"BiopythonClient"}) 806 handle = urllib2.urlopen(request) 807 results = handle.read() 808 # XML results don't have the Status tag when finished 809 if results.find("Status=") < 0: 810 break 811 i = results.index("Status=") 812 j = results.index("\n", i) 813 status = results[i+len("Status="):j].strip() 814 if status.upper() == "READY": 815 break 816 817 return StringIO.StringIO(results)
818
819 -def _parse_qblast_ref_page(handle):
820 """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE). 821 """ 822 s = handle.read() 823 i = s.find("RID =") 824 if i == 1 : 825 raise ValueError("No RID found in the 'please wait' page.") 826 j = s.find("\n", i) 827 rid = s[i+len("RID ="):j].strip() 828 829 i = s.find("RTOE =") 830 if i == 1 : 831 raise ValueError("No RTOE found in the 'please wait' page.") 832 j = s.find("\n", i) 833 rtoe = s[i+len("RTOE ="):j].strip() 834 try : 835 return rid, int(rtoe) 836 except ValueError : 837 raise ValueError("A non-integer RTOE found in " \ 838 +"the 'please wait' page, %s" % repr(rtoe))
839