Package Bio :: Package Blast :: Module NCBIXML
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIXML

  1  # Copyright 2000 by Bertrand Frottier .  All rights reserved. 
  2  # Revisions 2005-2006 copyright Michiel de Hoon 
  3  # Revisions 2006-2008 copyright Peter Cock 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """This module provides code to work with the BLAST XML output 
  8  following the DTD available on the NCBI FTP 
  9  ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd 
 10   
 11  Classes: 
 12  BlastParser         Parses XML output from BLAST. 
 13                      This (now) returns a list of Blast records. 
 14                      Historically it returned a single Blast record. 
 15   
 16  _XMLParser          Generic SAX parser. 
 17   
 18  Functions: 
 19  parse               Incremental parser, this is an iterator that returns 
 20                      Blast records. 
 21  """ 
 22  from Bio.Blast import Record 
 23  import xml.sax 
 24  from xml.sax.handler import ContentHandler 
 25   
26 -class _XMLparser(ContentHandler):
27 """Generic SAX Parser 28 29 Just a very basic SAX parser. 30 31 Redefine the methods startElement, characters and endElement. 32 """
33 - def __init__(self, debug=0):
34 """Constructor 35 36 debug - integer, amount of debug information to print 37 """ 38 self._tag = [] 39 self._value = '' 40 self._debug = debug 41 self._debug_ignore_list = []
42
43 - def _secure_name(self, name):
44 """Removes 'dangerous' from tag names 45 46 name -- name to be 'secured' 47 """ 48 # Replace '-' with '_' in XML tag names 49 return name.replace('-', '_')
50
51 - def startElement(self, name, attr):
52 """Found XML start tag 53 54 No real need of attr, BLAST DTD doesn't use them 55 56 name -- name of the tag 57 58 attr -- tag attributes 59 """ 60 self._tag.append(name) 61 62 # Try to call a method (defined in subclasses) 63 method = self._secure_name('_start_' + name) 64 65 #Note could use try / except AttributeError 66 #BUT I found often triggered by nested errors... 67 if hasattr(self, method) : 68 eval("self.%s()" % method) 69 if self._debug > 4 : 70 print "NCBIXML: Parsed: " + method 71 else : 72 # Doesn't exist (yet) 73 if method not in self._debug_ignore_list : 74 if self._debug > 3 : 75 print "NCBIXML: Ignored: " + method 76 self._debug_ignore_list.append(method)
77
78 - def characters(self, ch):
79 """Found some text 80 81 ch -- characters read 82 """ 83 self._value += ch # You don't ever get the whole string
84
85 - def endElement(self, name):
86 """Found XML end tag 87 88 name -- tag name 89 """ 90 # Strip character buffer 91 self._value = self._value.strip() 92 93 # Try to call a method (defined in subclasses) 94 method = self._secure_name('_end_' + name) 95 #Note could use try / except AttributeError 96 #BUT I found often triggered by nested errors... 97 if hasattr(self, method) : 98 eval("self.%s()" % method) 99 if self._debug > 2 : 100 print "NCBIXML: Parsed: " + method, self._value 101 else : 102 # Doesn't exist (yet) 103 if method not in self._debug_ignore_list : 104 if self._debug > 1 : 105 print "NCBIXML: Ignored: " + method, self._value 106 self._debug_ignore_list.append(method) 107 108 # Reset character buffer 109 self._value = ''
110
111 -class BlastParser(_XMLparser):
112 """Parse XML BLAST data into a Record.Blast object 113 114 All XML 'action' methods are private methods and may be: 115 _start_TAG called when the start tag is found 116 _end_TAG called when the end tag is found 117 """ 118
119 - def __init__(self, debug=0):
120 """Constructor 121 122 debug - integer, amount of debug information to print 123 """ 124 # Calling superclass method 125 _XMLparser.__init__(self, debug) 126 127 self._parser = xml.sax.make_parser() 128 self._parser.setContentHandler(self) 129 130 # To avoid ValueError: unknown url type: NCBI_BlastOutput.dtd 131 self._parser.setFeature(xml.sax.handler.feature_validation, 0) 132 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0) 133 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0) 134 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0) 135 136 self.reset()
137
138 - def reset(self) :
139 """Reset all the data allowing reuse of the BlastParser() object""" 140 self._records = [] 141 self._header = Record.Header() 142 self._parameters = Record.Parameters() 143 self._parameters.filter = None #Maybe I should update the class?
144
145 - def parse(self, handler):
146 """Parses the XML data 147 148 handler -- file handler or StringIO 149 150 This method returns a list of Blast record objects. 151 """ 152 import warnings 153 warnings.warn("Bio.Blast.NCBIXML.BlastParser.parse has been deprecated; please use Bio.Blast.NCBIXML.parse instead", DeprecationWarning) 154 self.reset() 155 self._parser.parse(handler) 156 return self._records
157
158 - def _start_Iteration(self):
159 self._blast = Record.Blast() 160 pass
161
162 - def _end_Iteration(self):
163 # We stored a lot of generic "top level" information 164 # in self._header (an object of type Record.Header) 165 self._blast.reference = self._header.reference 166 self._blast.date = self._header.date 167 self._blast.version = self._header.version 168 self._blast.database = self._header.database 169 self._blast.application = self._header.application 170 171 # These are required for "old" pre 2.2.14 files 172 # where only <BlastOutput_query-ID>, <BlastOutput_query-def> 173 # and <BlastOutput_query-len> were used. Now they 174 # are suplemented/replaced by <Iteration_query-ID>, 175 # <Iteration_query-def> and <Iteration_query-len> 176 if not hasattr(self._blast, "query") \ 177 or not self._blast.query : 178 self._blast.query = self._header.query 179 if not hasattr(self._blast, "query_id") \ 180 or not self._blast.query_id : 181 self._blast.query_id = self._header.query_id 182 if not hasattr(self._blast, "query_letters") \ 183 or not self._blast.query_letters : 184 self._blast.query_letters = self._header.query_letters 185 186 # Apply the "top level" parameter information 187 self._blast.matrix = self._parameters.matrix 188 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e 189 self._blast.gap_penalties = self._parameters.gap_penalties 190 self._blast.filter = self._parameters.filter 191 self._blast.expect = self._parameters.expect 192 self._blast.sc_match = self._parameters.sc_match 193 self._blast.sc_mismatch = self._parameters.sc_mismatch 194 195 #Add to the list 196 self._records.append(self._blast) 197 #Clear the object (a new empty one is create in _start_Iteration) 198 self._blast = None 199 200 if self._debug : "NCBIXML: Added Blast record to results"
201 202 # Header
203 - def _end_BlastOutput_program(self):
204 """BLAST program, e.g., blastp, blastn, etc. 205 206 Save this to put on each blast record object 207 """ 208 self._header.application = self._value.upper()
209
210 - def _end_BlastOutput_version(self):
211 """version number and date of the BLAST engine. 212 213 e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be 214 variants like "BLASTP 2.2.18+" without the date. 215 216 Save this to put on each blast record object 217 """ 218 parts = self._value.split() 219 #TODO - Check the first word starts with BLAST? 220 221 #The version is the second word (field one) 222 self._header.version = parts[1] 223 224 #Check there is a third word (the date) 225 if len(parts) >= 3 : 226 if parts[2][0] == "[" and parts[2][-1] == "]" : 227 self._header.date = parts[2][1:-1] 228 else : 229 #Assume this is still a date, but without the 230 #square brackets 231 self._header.date = parts[2]
232
234 """a reference to the article describing the algorithm 235 236 Save this to put on each blast record object 237 """ 238 self._header.reference = self._value
239
240 - def _end_BlastOutput_db(self):
241 """the database(s) searched 242 243 Save this to put on each blast record object 244 """ 245 self._header.database = self._value
246
248 """the identifier of the query 249 250 Important in old pre 2.2.14 BLAST, for recent versions 251 <Iteration_query-ID> is enough 252 """ 253 self._header.query_id = self._value
254
256 """the definition line of the query 257 258 Important in old pre 2.2.14 BLAST, for recent versions 259 <Iteration_query-def> is enough 260 """ 261 self._header.query = self._value
262
264 """the length of the query 265 266 Important in old pre 2.2.14 BLAST, for recent versions 267 <Iteration_query-len> is enough 268 """ 269 self._header.query_letters = int(self._value)
270
271 - def _end_Iteration_query_ID(self):
272 """the identifier of the query 273 """ 274 self._blast.query_id = self._value
275
276 - def _end_Iteration_query_def(self):
277 """the definition line of the query 278 """ 279 self._blast.query = self._value
280
281 - def _end_Iteration_query_len(self):
282 """the length of the query 283 """ 284 self._blast.query_letters = int(self._value)
285 286 ## def _end_BlastOutput_query_seq(self): 287 ## """the query sequence 288 ## """ 289 ## pass # XXX Missing in Record.Blast ? 290 291 ## def _end_BlastOutput_iter_num(self): 292 ## """the psi-blast iteration number 293 ## """ 294 ## pass # XXX TODO PSI 295
296 - def _end_BlastOutput_hits(self):
297 """hits to the database sequences, one for every sequence 298 """ 299 self._blast.num_hits = int(self._value)
300 301 ## def _end_BlastOutput_message(self): 302 ## """error messages 303 ## """ 304 ## pass # XXX What to do ? 305 306 # Parameters
307 - def _end_Parameters_matrix(self):
308 """matrix used (-M) 309 """ 310 self._parameters.matrix = self._value
311
312 - def _end_Parameters_expect(self):
313 """expect values cutoff (-e) 314 """ 315 # NOTE: In old text output there was a line: 316 # Number of sequences better than 1.0e-004: 1 317 # As far as I can see, parameters.num_seqs_better_e 318 # would take the value of 1, and the expectation 319 # value was not recorded. 320 # 321 # Anyway we should NOT record this against num_seqs_better_e 322 self._parameters.expect = self._value
323 324 ## def _end_Parameters_include(self): 325 ## """inclusion threshold for a psi-blast iteration (-h) 326 ## """ 327 ## pass # XXX TODO PSI 328
329 - def _end_Parameters_sc_match(self):
330 """match score for nucleotide-nucleotide comparaison (-r) 331 """ 332 self._parameters.sc_match = int(self._value)
333
335 """mismatch penalty for nucleotide-nucleotide comparaison (-r) 336 """ 337 self._parameters.sc_mismatch = int(self._value)
338
339 - def _end_Parameters_gap_open(self):
340 """gap existence cost (-G) 341 """ 342 self._parameters.gap_penalties = int(self._value)
343
345 """gap extension cose (-E) 346 """ 347 self._parameters.gap_penalties = (self._parameters.gap_penalties, 348 int(self._value))
349
350 - def _end_Parameters_filter(self):
351 """filtering options (-F) 352 """ 353 self._parameters.filter = self._value
354 355 ## def _end_Parameters_pattern(self): 356 ## """pattern used for phi-blast search 357 ## """ 358 ## pass # XXX TODO PSI 359 360 ## def _end_Parameters_entrez_query(self): 361 ## """entrez query used to limit search 362 ## """ 363 ## pass # XXX TODO PSI 364 365 # Hits
366 - def _start_Hit(self):
367 self._blast.alignments.append(Record.Alignment()) 368 self._blast.descriptions.append(Record.Description()) 369 self._blast.multiple_alignment = [] 370 self._hit = self._blast.alignments[-1] 371 self._descr = self._blast.descriptions[-1] 372 self._descr.num_alignments = 0
373
374 - def _end_Hit(self):
375 #Cleanup 376 self._blast.multiple_alignment = None 377 self._hit = None 378 self._descr = None
379
380 - def _end_Hit_id(self):
381 """identifier of the database sequence 382 """ 383 self._hit.hit_id = self._value 384 self._hit.title = self._value + ' '
385
386 - def _end_Hit_def(self):
387 """definition line of the database sequence 388 """ 389 self._hit.hit_def = self._value 390 self._hit.title += self._value 391 self._descr.title = self._hit.title
392
393 - def _end_Hit_accession(self):
394 """accession of the database sequence 395 """ 396 self._hit.accession = self._value 397 self._descr.accession = self._value
398
399 - def _end_Hit_len(self):
400 self._hit.length = int(self._value)
401 402 # HSPs
403 - def _start_Hsp(self):
404 #Note that self._start_Hit() should have been called 405 #to setup things like self._blast.multiple_alignment 406 self._hit.hsps.append(Record.HSP()) 407 self._hsp = self._hit.hsps[-1] 408 self._descr.num_alignments += 1 409 self._blast.multiple_alignment.append(Record.MultipleAlignment()) 410 self._mult_al = self._blast.multiple_alignment[-1]
411 412 # Hsp_num is useless
413 - def _end_Hsp_score(self):
414 """raw score of HSP 415 """ 416 self._hsp.score = float(self._value) 417 if self._descr.score == None: 418 self._descr.score = float(self._value)
419
420 - def _end_Hsp_bit_score(self):
421 """bit score of HSP 422 """ 423 self._hsp.bits = float(self._value) 424 if self._descr.bits == None: 425 self._descr.bits = float(self._value)
426
427 - def _end_Hsp_evalue(self):
428 """expect value value of the HSP 429 """ 430 self._hsp.expect = float(self._value) 431 if self._descr.e == None: 432 self._descr.e = float(self._value)
433
434 - def _end_Hsp_query_from(self):
435 """offset of query at the start of the alignment (one-offset) 436 """ 437 self._hsp.query_start = int(self._value)
438
439 - def _end_Hsp_query_to(self):
440 """offset of query at the end of the alignment (one-offset) 441 """ 442 self._hsp.query_end = int(self._value)
443
444 - def _end_Hsp_hit_from(self):
445 """offset of the database at the start of the alignment (one-offset) 446 """ 447 self._hsp.sbjct_start = int(self._value)
448
449 - def _end_Hsp_hit_to(self):
450 """offset of the database at the end of the alignment (one-offset) 451 """ 452 self._hsp.sbjct_end = int(self._value)
453 454 ## def _end_Hsp_pattern_from(self): 455 ## """start of phi-blast pattern on the query (one-offset) 456 ## """ 457 ## pass # XXX TODO PSI 458 459 ## def _end_Hsp_pattern_to(self): 460 ## """end of phi-blast pattern on the query (one-offset) 461 ## """ 462 ## pass # XXX TODO PSI 463
464 - def _end_Hsp_query_frame(self):
465 """frame of the query if applicable 466 """ 467 self._hsp.frame = (int(self._value),)
468
469 - def _end_Hsp_hit_frame(self):
470 """frame of the database sequence if applicable 471 """ 472 self._hsp.frame += (int(self._value),)
473
474 - def _end_Hsp_identity(self):
475 """number of identities in the alignment 476 """ 477 self._hsp.identities = int(self._value)
478
479 - def _end_Hsp_positive(self):
480 """number of positive (conservative) substitutions in the alignment 481 """ 482 self._hsp.positives = int(self._value)
483
484 - def _end_Hsp_gaps(self):
485 """number of gaps in the alignment 486 """ 487 self._hsp.gaps = int(self._value)
488
489 - def _end_Hsp_align_len(self):
490 """length of the alignment 491 """ 492 self._hsp.align_length = int(self._value)
493 494 ## def _en_Hsp_density(self): 495 ## """score density 496 ## """ 497 ## pass # XXX ??? 498
499 - def _end_Hsp_qseq(self):
500 """alignment string for the query 501 """ 502 self._hsp.query = self._value
503
504 - def _end_Hsp_hseq(self):
505 """alignment string for the database 506 """ 507 self._hsp.sbjct = self._value
508
509 - def _end_Hsp_midline(self):
510 """Formatting middle line as normally seen in BLAST report 511 """ 512 self._hsp.match = self._value
513 514 # Statistics
515 - def _end_Statistics_db_num(self):
516 """number of sequences in the database 517 """ 518 self._blast.num_sequences_in_database = int(self._value)
519
520 - def _end_Statistics_db_len(self):
521 """number of letters in the database 522 """ 523 self._blast.num_letters_in_database = int(self._value)
524
525 - def _end_Statistics_hsp_len(self):
526 """the effective HSP length 527 """ 528 self._blast.effective_hsp_length = int(self._value)
529
531 """the effective search space 532 """ 533 self._blast.effective_search_space = float(self._value)
534
535 - def _end_Statistics_kappa(self):
536 """Karlin-Altschul parameter K 537 """ 538 self._blast.ka_params = float(self._value)
539
540 - def _end_Statistics_lambda(self):
541 """Karlin-Altschul parameter Lambda 542 """ 543 self._blast.ka_params = (float(self._value), 544 self._blast.ka_params)
545
546 - def _end_Statistics_entropy(self):
547 """Karlin-Altschul parameter H 548 """ 549 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
550
551 -def parse(handle, debug=0):
552 """Returns an iterator a Blast record for each query. 553 554 handle - file handle to and XML file to parse 555 debug - integer, amount of debug information to print 556 557 This is a generator function that returns multiple Blast records 558 objects - one for each query sequence given to blast. The file 559 is read incrementally, returning complete records as they are read 560 in. 561 562 Should cope with new BLAST 2.2.14+ which gives a single XML file 563 for mutliple query records. 564 565 Should also cope with XML output from older versions BLAST which 566 gave multiple XML files concatenated together (giving a single file 567 which strictly speaking wasn't valid XML).""" 568 from xml.parsers import expat 569 BLOCK = 1024 570 MARGIN = 10 # must be at least length of newline + XML start 571 XML_START = "<?xml" 572 573 text = handle.read(BLOCK) 574 pending = "" 575 576 if not text : 577 #NO DATA FOUND! 578 raise ValueError("Your XML file was empty") 579 580 while text : 581 #We are now starting a new XML file 582 if not text.startswith(XML_START) : 583 raise ValueError("Your XML file did not start with %s..." \ 584 % XML_START) 585 586 expat_parser = expat.ParserCreate() 587 blast_parser = BlastParser(debug) 588 expat_parser.StartElementHandler = blast_parser.startElement 589 expat_parser.EndElementHandler = blast_parser.endElement 590 expat_parser.CharacterDataHandler = blast_parser.characters 591 592 expat_parser.Parse(text, False) 593 while blast_parser._records: 594 record = blast_parser._records[0] 595 blast_parser._records = blast_parser._records[1:] 596 yield record 597 598 while True : 599 #Read in another block of the file... 600 text, pending = pending + handle.read(BLOCK), "" 601 if not text: 602 #End of the file! 603 expat_parser.Parse("", True) # End of XML record 604 break 605 606 #Now read a little bit more so we can check for the 607 #start of another XML file... 608 pending = handle.read(MARGIN) 609 610 if (text+pending).find("\n" + XML_START) == -1 : 611 # Good - still dealing with the same XML file 612 expat_parser.Parse(text, False) 613 while blast_parser._records: 614 record = blast_parser._records[0] 615 blast_parser._records = blast_parser._records[1:] 616 yield record 617 else : 618 # This is output from pre 2.2.14 BLAST, 619 # one XML file for each query! 620 621 # Finish the old file: 622 text, pending = (text+pending).split("\n" + XML_START,1) 623 pending = XML_START + pending 624 625 expat_parser.Parse(text, True) # End of XML record 626 while blast_parser._records: 627 record = blast_parser._records[0] 628 blast_parser._records = blast_parser._records[1:] 629 yield record 630 631 #Now we are going to re-loop, reset the 632 #parsers and start reading the next XML file 633 text, pending = pending, "" 634 break 635 636 #At this point we have finished the first XML record. 637 #If the file is from an old version of blast, it may 638 #contain more XML records (check if text==""). 639 assert pending=="" 640 assert len(blast_parser._records) == 0 641 642 #We should have finished the file! 643 assert text=="" 644 assert pending=="" 645 assert len(blast_parser._records) == 0
646 647 if __name__ == '__main__': 648 import sys 649 import os 650 handle = open(sys.argv[1]) 651 r_list = parse(handle) 652 653 for r in r_list : 654 # Small test 655 print 'Blast of', r.query 656 print 'Found %s alignments with a total of %s HSPs' % (len(r.alignments), 657 reduce(lambda a,b: a+b, 658 [len(a.hsps) for a in r.alignments])) 659 660 for al in r.alignments: 661 print al.title[:50], al.length, 'bp', len(al.hsps), 'HSPs' 662 663 # Cookbook example 664 E_VALUE_THRESH = 0.04 665 for alignment in r.alignments: 666 for hsp in alignment.hsps: 667 if hsp.expect < E_VALUE_THRESH: 668 print '*****' 669 print 'sequence', alignment.title 670 print 'length', alignment.length 671 print 'e value', hsp.expect 672 print hsp.query[:75] + '...' 673 print hsp.match[:75] + '...' 674 print hsp.sbjct[:75] + '...' 675