1
2
3
4
5
6
7
8
9 """
10 This module provides code to work with the WWW version of BLAST
11 provided by the NCBI.
12 http://blast.ncbi.nlm.nih.gov/
13
14 Functions:
15 qblast Do a BLAST search using the QBLAST API.
16
17 Deprecated classes:
18 BlastParser Parses output from WWW blast.
19 _Scanner Scans output from NCBI's BLAST WWW server.
20
21
22 """
23 import re
24
25 try:
26 import cStringIO as StringIO
27 except ImportError:
28 import StringIO
29
30 from Bio.ParserSupport import *
31
33 """Parses WWW BLAST data into a Record.Blast object (DEPRECATED).
34
35 This is a parser for the NCBI's HTML (web page) BLAST output.
36 """
38 """Create a BlastParser object (DEPRECATED)."""
# Emit the deprecation notice; the message steers users towards the XML
# output and the parser in Bio.Blast.NCBIXML.
39 import warnings
40 warnings.warn("Bio.Blast.NCBIWWW.BlastParser is deprecated." \
41 + " We recommend you use the XML output with" \
42 + " the parser in Bio.Blast.NCBIXML instead.",
43 DeprecationWarning)
44
# NOTE(review): a bare `import NCBIStandalone` looks like an implicit relative
# import of a sibling module - presumably only resolvable on Python 2; confirm.
45 import NCBIStandalone
# The scanner produces events; the consumer receiving them is
# NCBIStandalone._BlastConsumer wrapped in SGMLStrippingConsumer.
46 self._scanner = _Scanner()
47 self._consumer = SGMLStrippingConsumer(NCBIStandalone._BlastConsumer())
48
50 """parse(self, handle)"""
# Run the handle through the scanner, then hand back whatever the consumer
# stored in its `data` attribute.
51 self._scanner.feed(handle, self._consumer)
52 return self._consumer.data
53
55 """Scanner for the HTML BLAST parser (PRIVATE, DEPRECATED).
56
57 Scan BLAST output from NCBI's web server at:
58 http://www.ncbi.nlm.nih.gov/BLAST/
59
60 Tested with BLAST v2.0.10
61
62 Methods:
63 feed Feed data into the scanner.
64 """
65 - def feed(self, handle, consumer):
113
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# Header section of the report, bracketed by consumer.start_header() and
# consumer.end_header().
# NOTE(review): read_and_call, read_and_call_while and attempt_read_and_call
# are not defined in this file; presumably they come from the star import of
# Bio.ParserSupport - confirm their exact matching/raising behaviour there.
148 consumer.start_header()
149
150
# The line matched with contains='BLAST' is reported through
# consumer.version; blank lines that follow are reported as non-events.
151 read_and_call(uhandle, consumer.version, contains='BLAST')
152 read_and_call_while(uhandle, consumer.noevent, blank=1)
153
154
155
# Reference lines: each line read goes to consumer.reference until a line
# starting with '<p>' or a line that strips to nothing is met; that
# terminating line is passed to consumer.noevent instead and the loop ends.
# (Loop nesting is inferred from the statements; indentation is not visible.)
156 while 1:
157 line = uhandle.readline()
158 if line[:3] == '<p>' or not line.strip():
159 consumer.noevent(line)
160 break
161 consumer.reference(line)
162
163
# Optional line starting with 'RID'; ignored as a non-event.
164 attempt_read_and_call(uhandle, consumer.noevent, start='RID')
165
166
167
# Optional further line, attempted with no match criteria at all.
168 attempt_read_and_call(uhandle, consumer.noevent)
169
170
171
172
# The query and database sections can come in either order: peek at the next
# line (without consuming it) and scan the query first only when that line
# contains "Query=".
173 if uhandle.peekline().find("Query=") >= 0:
174 self._scan_query_info(uhandle, consumer)
175 self._scan_database_info(uhandle, consumer)
176 else:
177 self._scan_database_info(uhandle, consumer)
178 self._scan_query_info(uhandle, consumer)
# Trailing blank lines are non-events, then the header is closed.
179 read_and_call_while(uhandle, consumer.noevent, blank=1)
180 consumer.end_header()
181
192
# Database section of the header: an optional line starting with '<p>', then
# the line containing 'Database' is reported via consumer.database_info.
194 attempt_read_and_call(uhandle, consumer.noevent, start='<p>')
195 read_and_call(uhandle, consumer.database_info, contains='Database')
196
197
198
# The database description may span several lines: every line up to the one
# containing 'sequences;' is sent to consumer.database_info, and then that
# 'sequences;' line itself is sent as well.
199 read_and_call_until(uhandle, consumer.database_info,
200 contains='sequences;')
201 read_and_call(uhandle, consumer.database_info, contains='sequences;')
# One blank line, an optional 'problems or questions' notice, then the
# embedded form is handled by self._scan_blastform.
202 read_and_call(uhandle, consumer.noevent, blank=1)
203 attempt_read_and_call(uhandle, consumer.noevent,
204 contains='problems or questions')
205 self._scan_blastform(uhandle, consumer)
206
# Optional blank line, then an optional '<table border=0 width=600' block.
# When the table is present, lines are discarded until one contains
# "</table>", one more line is read straight from uhandle and discarded, and a
# blank line is consumed.
# NOTE(review): presumably the raw readline() consumes the "</table>" line
# itself - confirm against read_and_call_until's stop semantics.
207 attempt_read_and_call(uhandle, consumer.noevent, blank=1)
208 if attempt_read_and_call(uhandle, consumer.noevent,
209 start="<table border=0 width=600"):
210 read_and_call_until(uhandle, consumer.noevent,
211 contains="</table>")
212 consumer.noevent(uhandle.readline())
213 read_and_call(uhandle, consumer.noevent, blank=1)
214
# Optional '<p>' line.
215 attempt_read_and_call(uhandle, consumer.noevent, start="<p>")
216
# Optional "Taxonomy reports" line; when it is present a line starting with
# "<BR>" is read after it. An optional "<PRE>" line follows.
217 if attempt_read_and_call(uhandle, consumer.noevent,
218 contains="Taxonomy reports"):
219 read_and_call(uhandle, consumer.noevent, start="<BR>")
220 attempt_read_and_call(uhandle, consumer.noevent, start="<PRE>")
221
222
223
224
225
226
227
228
# If a "</PRE>" line shows up, discard everything up to the next line that
# starts with "<PRE>", then keep consuming "<PRE>" lines while the upcoming
# line (peeked, not consumed) starts with "<PRE>" and does not contain
# "Query=". Blank lines are skipped as non-events.
# (Which statements sit inside the loop is inferred; indentation is not
# visible in this listing.)
229 if attempt_read_and_call(uhandle, consumer.noevent, start="</PRE>"):
230 read_and_call_until(uhandle, consumer.noevent, start="<PRE>")
231 while 1:
232 line = uhandle.peekline()
233 if not line[:5] == "<PRE>" or line.find("Query=") >= 0:
234 break
235 read_and_call(uhandle, consumer.noevent, start="<PRE>")
236
237 read_and_call_while(uhandle, consumer.noevent, blank=1)
238
240
# Query section of the header: the line containing 'Query=' and every line
# after it up to the next blank line are reported via consumer.query_info.
241 read_and_call(uhandle, consumer.query_info, contains='Query=')
242 read_and_call_until(uhandle, consumer.query_info, blank=1)
# Blank lines are non-events; an optional "<PRE>" line may be followed by
# more blank lines.
243 read_and_call_while(uhandle, consumer.noevent, blank=1)
244 if attempt_read_and_call(uhandle, consumer.noevent, start="<PRE>"):
245 read_and_call_while(uhandle, consumer.noevent, blank=1)
# The embedded form is handled by self._scan_blastform.
246 self._scan_blastform(uhandle, consumer)
247
251
# One-line descriptions table, bracketed by consumer.start_descriptions() and
# consumer.end_descriptions().
253 consumer.start_descriptions()
254
255
256
257
258
# The table header is recognised by the regex r"Score {4,5}E" ("Score"
# followed by four or five spaces and "E"). When it is missing, an optional
# 'No significant similarity' line is reported via consumer.no_hits, blank
# lines are skipped, the section is closed and scanning of it stops early.
259 if not attempt_read_and_call(
260 uhandle, consumer.description_header,
261 has_re=re.compile(r"Score {4,5}E")):
262
263 attempt_read_and_call(uhandle, consumer.no_hits,
264 contains='No significant similarity')
265 read_and_call_while(uhandle, consumer.noevent, blank=1)
266 consumer.end_descriptions()
267
268 return
269
270
271
272
273
274
# Second header line ('Sequences producing ...') followed by one blank line.
275 read_and_call(uhandle, consumer.description_header,
276 start='Sequences producing')
277 read_and_call(uhandle, consumer.noevent, blank=1)
278
279
280
281
# Description rows: lines selected with blank=0 and contains='<a' are each
# passed to consumer.description.
282 read_and_call_while(uhandle, consumer.description,
283 blank=0, contains='<a')
284
285
# The table ends with an optional '</PRE>' line; when that line is absent,
# blank lines are skipped instead.
286 if not attempt_read_and_call(uhandle, consumer.noevent,
287 contains='</PRE>'):
288 read_and_call_while(uhandle, consumer.noevent, blank=1)
289
290 consumer.end_descriptions()
291
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# Decide which alignment format follows by looking two lines ahead. Both
# lines are handed back to the handle with saveline(), second line first.
# NOTE(review): presumably saveline() pushes lines back so they are re-read in
# the original order - confirm against the handle implementation.
318 line1 = safe_readline(uhandle)
319 line2 = safe_readline(uhandle)
320 uhandle.saveline(line2)
321 uhandle.saveline(line1)
322
# Classify from the look-ahead:
#   - 'Alignments' in the second line, or a first line starting with '<PRE>',
#     selects pairwise alignments;
#   - a second line starting with 'blast_tmp' selects the master-slave form;
#   - a second line starting with ' Database' or 'Lambda K H' sets neither
#     flag, so no alignment scanner is invoked;
#   - anything else raises ValueError quoting both lines.
323 is_pairwise = is_masterslave = 0
324 if 'Alignments' in line2:
325 is_pairwise = 1
326 elif line2.startswith(' Database'):
327 pass
328 elif line2.startswith('Lambda K H'):
329 pass
330 elif line2.startswith('blast_tmp'):
331 is_masterslave = 1
332 elif line1.startswith('<PRE>'):
333 is_pairwise = 1
334 else:
335 raise ValueError("Cannot resolve location at lines:\n%s\n%s" \
336 % (line1, line2))
337
# Dispatch to the matching scanner method (neither runs if both flags are 0).
338 if is_pairwise:
339 self._scan_pairwise_alignments(uhandle, consumer)
340 elif is_masterslave:
341 self._scan_masterslave_alignment(uhandle, consumer)
342
365
381
421
439
445
447
448
449
450
451
452
453
454
455
# Header of a single HSP: an optional '<PRE>' line and an optional blank line
# come first.
456 attempt_read_and_call(uhandle, consumer.noevent, start='<PRE>')
457 attempt_read_and_call(uhandle, consumer.noevent, blank=1)
# Score line: the regex allows any number of empty anchor tags
# ("<a ...></a>") between the leading space and the word "Score".
458 read_and_call(uhandle, consumer.score,
459 has_re=re.compile(r'^ (<a[^>]*></a>)*Score'))
# Identities line, then the Strand and Frame lines, which are only attempted
# (so either may be absent), then one blank line.
460 read_and_call(uhandle, consumer.identities, start=' Identities')
461
462 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand')
463
464 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame')
465 read_and_call(uhandle, consumer.noevent, blank=1)
466
468
469
470
471
472
473
474
475
476
477
478
# Alignment rows of one HSP. Each pass handles one group: an optional line
# starting with a space, then the 'Query' line, the middle line (starts with
# a space, sent to consumer.align) and the 'Sbjct' line. The loop stops as
# soon as a group is not followed by a blank line.
# (Loop nesting is inferred from the statements; indentation is not visible.)
479 while 1:
480
481 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
482 read_and_call(uhandle, consumer.query, start='Query')
483 read_and_call(uhandle, consumer.align, start=' ')
484 read_and_call(uhandle, consumer.sbjct, start='Sbjct')
485 if not attempt_read_and_call(uhandle, consumer.noevent, blank=1):
486 break
# A '</PRE>' line is read next, and any blank lines after it are skipped.
487 read_and_call(uhandle, consumer.noevent, start='</PRE>')
488 read_and_call_while(uhandle, consumer.noevent, blank=1)
489
504
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
# Database statistics report, bracketed by consumer.start_database_report()
# and consumer.end_database_report().
523 consumer.start_database_report()
524
525
526
# Peek at the next line without consuming it.
527 line = uhandle.peekline()
528
529
# When that line does not mention "Database", a '<PRE>' line is read first.
530 if line.find("Database") < 0:
531 read_and_call(uhandle, consumer.noevent, start='<PRE>')
# Peek again: the database details (name lines up to "Posted", posted date,
# letter count, sequence count, and a line starting with a space) are read
# when the upcoming line is not the "Lambda K H" heading.
# NOTE(review): how many of the following reads belong to this `if` cannot be
# seen here because indentation is lost - confirm against the original file.
532 line2 = uhandle.peekline()
533 if line2.find("Lambda K H") < 0:
534 read_and_call(uhandle, consumer.database, contains=' Database')
535 read_and_call_until(uhandle, consumer.database, contains="Posted")
536 read_and_call(uhandle, consumer.posted_date, start=' Posted')
537 read_and_call(uhandle, consumer.num_letters_in_database,
538 start=' Number of letters')
539 read_and_call(uhandle, consumer.num_sequences_in_database,
540 start=' Number of sequences')
541 read_and_call(uhandle, consumer.noevent, start=' ')
542
# 'Lambda' heading line, then the next line (no match criteria) is passed to
# consumer.ka_params, then one blank line.
543 read_and_call(uhandle, consumer.noevent, start='Lambda')
544 read_and_call(uhandle, consumer.ka_params)
545 read_and_call(uhandle, consumer.noevent, blank=1)
546
547
# Optional 'Gapped' line.
548 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped')
549
# Optional second 'Lambda' heading; when present, the following line goes to
# consumer.ka_params_gap and trailing blank lines are skipped.
550 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'):
551 read_and_call(uhandle, consumer.ka_params_gap)
552 read_and_call_while(uhandle, consumer.noevent, blank=1)
553
554 consumer.end_database_report()
555
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
# Search-parameters footer, bracketed by consumer.start_parameters() and
# consumer.end_parameters(). Reads made with attempt_read_and_call are only
# attempted, so those lines may be absent from a report.
583 consumer.start_parameters()
584
585
# Optional 'Matrix' and 'Gap' (gap penalties) lines.
586 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix')
587
588 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap')
589
590
591
# 'Number of Hits' and 'Number of Sequences' are accepted in either order:
# if the hits line is not next, the sequences line is read first and the hits
# line afterwards.
592 if attempt_read_and_call(uhandle, consumer.num_hits,
593 start='Number of Hits'):
594 read_and_call(uhandle, consumer.num_sequences,
595 start='Number of Sequences')
596 else:
597 read_and_call(uhandle, consumer.num_sequences,
598 start='Number of Sequences')
599 read_and_call(uhandle, consumer.num_hits,
600 start='Number of Hits')
601
# Extension counters.
602 read_and_call(uhandle, consumer.num_extends,
603 start='Number of extensions')
604 read_and_call(uhandle, consumer.num_good_extends,
605 start='Number of successful')
606
607 read_and_call(uhandle, consumer.num_seqs_better_e,
608 start='Number of sequences')
609
610
# HSP counters. Two layouts are handled depending on whether a
# "Number of HSP's better" line is present, and then on whether a
# "Number of HSP's successfully" line follows it.
# NOTE(review): the fallback reads below use `consumer.no_event`, while every
# other non-event callback in this file is spelled `consumer.noevent`. This
# looks like a typo that would raise AttributeError when those branches are
# reached - confirm against the consumer class before changing it.
611 if attempt_read_and_call(uhandle, consumer.hsps_no_gap,
612 start="Number of HSP's better"):
613
614 if attempt_read_and_call(uhandle, consumer.hsps_prelim_gapped,
615 start="Number of HSP's successfully"):
616 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted,
617 start="Number of HSP's that")
618 read_and_call(uhandle, consumer.hsps_gapped,
619 start="Number of HSP's gapped")
620 else:
621 read_and_call(uhandle, consumer.no_event,
622 start="Number of HSP's gapped")
623 read_and_call(uhandle, consumer.no_event,
624 start="Number of HSP's successfully")
625 read_and_call(uhandle, consumer.no_event,
626 start="Number of extra gapped")
627
628
# Length statistics. The labels exist in a capitalised form ('Length of
# query', ...) and a lower-case form ('length of query', ...); the presence of
# the capitalised 'Length of query' line selects which set is read.
# NOTE(review): the 'Length adjustment' read also uses `consumer.no_event`;
# see the note above.
629 if attempt_read_and_call(uhandle, consumer.query_length,
630 start='Length of query'):
631 read_and_call(uhandle, consumer.database_length,
632 start='Length of database')
633 read_and_call(uhandle, consumer.no_event,
634 start='Length adjustment')
635 attempt_read_and_call(uhandle, consumer.effective_query_length,
636 start='Effective length of query')
637 read_and_call(uhandle, consumer.effective_database_length,
638 start='Effective length of database')
639 attempt_read_and_call(uhandle, consumer.effective_search_space,
640 start='Effective search space:')
641 attempt_read_and_call(uhandle, consumer.effective_search_space_used,
642 start='Effective search space used')
643
644 else:
645 attempt_read_and_call(uhandle, consumer.query_length,
646 start='length of query')
647 read_and_call(uhandle, consumer.database_length,
648 start='length of database')
649 read_and_call(uhandle, consumer.effective_hsp_length,
650 start='effective HSP')
651 attempt_read_and_call(uhandle, consumer.effective_query_length,
652 start='effective length of query')
653 read_and_call(uhandle, consumer.effective_database_length,
654 start='effective length of database')
655 attempt_read_and_call(uhandle, consumer.effective_search_space,
656 start='effective search space:')
657 attempt_read_and_call(uhandle, consumer.effective_search_space_used,
658 start='effective search space used')
659
660
# Remaining one-line settings, matched by their leading label
# ('frameshift', 'T', 'A', 'X1', 'X2', 'X3', 'S1', 'S2').
661 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift')
662 attempt_read_and_call(uhandle, consumer.threshold, start='T')
663 read_and_call(uhandle, consumer.window_size, start='A')
664 read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1')
665 read_and_call(uhandle, consumer.gap_x_dropoff, start='X2')
666
667 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final,
668 start='X3')
669 read_and_call(uhandle, consumer.gap_trigger, start='S1')
670 attempt_read_and_call(uhandle, consumer.blast_cutoff, start='S2')
671
# Optional trailing blank line and closing "</PRE>" / "</form>" markup.
672 attempt_read_and_call(uhandle, consumer.noevent, blank=1)
673 attempt_read_and_call(uhandle, consumer.noevent, start="</PRE>")
674 attempt_read_and_call(uhandle, consumer.noevent, start="</form>")
675
676 consumer.end_parameters()
677
def qblast(program, database, sequence,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None
           ):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.
    Some useful parameters:
    program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
    database       Which database to search against (e.g. "nr").
    sequence       The sequence to search.
    ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
    descriptions   Number of descriptions to show.  Def 500.
    alignments     Number of alignments to show.  Def 500.
    expect         An expect value cutoff.  Def 10.0.
    matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
    filter         "none" turns off filtering.  Default no filtering
    format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
    entrez_query   Entrez query to limit Blast search
    hitlist_size   Number of hits to return. Default 50

    Apart from the program name, this function does no checking of the
    validity of the parameters and passes the values to the server as is.
    Arguments left as None are omitted from the request.  More help is
    available at:
    http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html

    The search is submitted with a "Put" command, then the results page is
    polled with "Get" commands (at most one request every three seconds)
    until the page either carries no "Status=" marker or reports a status
    of READY.

    Returns a StringIO handle holding the text of the final results page.

    Raises ValueError if program is not one of the five supported names.
    """
    # Validate with an explicit exception rather than an assert (asserts are
    # stripped under "python -O"), and do it before any import or network
    # activity so a bad call fails fast.
    programs = ('blastn', 'blastp', 'blastx', 'tblastn', 'tblastx')
    if program not in programs:
        raise ValueError("Program specified is %r. Expected one of %s"
                         % (program, ", ".join(programs)))

    import time
    import urllib
    import urllib2

    url = "http://blast.ncbi.nlm.nih.gov/Blast.cgi"
    headers = {"User-Agent": "BiopythonClient"}

    # Parameters for the "Put" command, which submits the search.
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
        ]
    # Only send the options the caller actually set.
    query = [x for x in parameters if x[1] is not None]
    message = urllib.urlencode(query)

    # Submit the search.  The reply is a 'please wait' page from which the
    # request identifier (RID) is extracted.
    request = urllib2.Request(url, message, headers)
    handle = urllib2.urlopen(request)
    try:
        # rtoe is parsed (and validated) but not otherwise used here.
        rid, rtoe = _parse_qblast_ref_page(handle)
    finally:
        # Release the connection even if the page could not be parsed.
        handle.close()

    # Parameters for the "Get" command, which fetches (formatted) results.
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = urllib.urlencode(query)

    # Poll for the results, never issuing two requests less than `delay`
    # seconds apart.
    delay = 3.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current

        request = urllib2.Request(url, message, headers)
        handle = urllib2.urlopen(request)
        try:
            results = handle.read()
        finally:
            handle.close()

        # A page without a "Status=" marker is treated as the final answer.
        i = results.find("Status=")
        if i < 0:
            break
        # The status value runs to the end of its line; tolerate a status
        # line that is the last line of the page and has no newline.
        j = results.find("\n", i)
        if j < 0:
            j = len(results)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO.StringIO(results)
818
820 """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).
821 """
822 s = handle.read()
823 i = s.find("RID =")
824 if i == 1 :
825 raise ValueError("No RID found in the 'please wait' page.")
826 j = s.find("\n", i)
827 rid = s[i+len("RID ="):j].strip()
828
829 i = s.find("RTOE =")
830 if i == 1 :
831 raise ValueError("No RTOE found in the 'please wait' page.")
832 j = s.find("\n", i)
833 rtoe = s[i+len("RTOE ="):j].strip()
834 try :
835 return rid, int(rtoe)
836 except ValueError :
837 raise ValueError("A non-integer RTOE found in " \
838 +"the 'please wait' page, %s" % repr(rtoe))
839