Package Bio :: Package AlignIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.FastaIO

  1  # Copyright 2008 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11   
 12  This module contains a parser for the pairwise alignments produced by Bill 
 13  Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is 
 14  refered to as the "fasta-m10" file format (as we only support the machine 
 15  readable output format selected with the -m 10 command line option). 
 16   
 17  This module does NOT cover the generic "fasta" file format originally 
 18  developed as an input format to the FASTA tools.  The Bio.AlignIO and 
 19  Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files, 
 20  which can also be used to store a multiple sequence alignments. 
 21  """ 
 22   
 23  from Bio.Align.Generic import Alignment 
 24  from Interfaces import AlignmentIterator 
 25  from Bio.Alphabet import single_letter_alphabet, generic_dna, generic_protein 
 26  from Bio.Alphabet import Gapped 
 27   
 28   
29 -class FastaM10Iterator(AlignmentIterator) :
30 """Alignment iterator for the FASTA tool's pairwise alignment output. 31 32 This is for reading the pairwise alignments output by Bill Pearson's 33 FASTA program when called with the -m 10 command line option for machine 34 readable output. For more details about the FASTA tools, see the website 35 http://fasta.bioch.virginia.edu/ and the paper: 36 37 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 38 39 This class is intended to be used via the Bio.AlignIO.parse() function 40 by specifying the format as "fasta-m10" as shown in the following code: 41 42 from Bio import AlignIO 43 handle = ... 44 for a in AlignIO.parse(handle, "fasta-m10") : 45 assert len(a.get_all_seqs()) == 2, "Should be pairwise!" 46 print "Alignment length %i" % a.get_alignment_length() 47 for record in a : 48 print record.seq, record.name, record.id 49 50 Note that this is not a full blown parser for all the information 51 in the FASTA output - for example, most of the header and all of the 52 footer is ignored. Also, the alignments are not batched according to 53 the input queries. 54 55 Also note that there can be up to about 30 letters of flanking region 56 included in the raw FASTA output as contextual information. This is NOT 57 part of the alignment itself, and is not included in the resulting 58 Alignment objects returned. 59 """ 60
61 - def next(self) :
62 """Reads from the handle to construct and return the next alignment. 63 64 This returns the pairwise alignment of query and match/library 65 sequences as an Alignment object containing two rows.""" 66 handle = self.handle 67 68 try : 69 #Header we saved from when we were parsing 70 #the previous alignment. 71 line = self._header 72 del self._header 73 except AttributeError: 74 line = handle.readline() 75 if not line: 76 return None 77 78 if line.startswith("#") : 79 #Skip the file header before the alignments. e.g. 80 line = self._skip_file_header(line) 81 while ">>>" in line and not line.startswith(">>>") : 82 #Moved onto the next query sequence! 83 self._query_descr = "" 84 self._query_header_annotation = {} 85 #Read in the query header 86 line = self._parse_query_header(line) 87 #Now should be some alignments, but if not we move onto the next query 88 if not line : 89 #End of file 90 return None 91 if ">>><<<" in line : 92 #Reached the end of the alignments, no need to read the footer... 93 return None 94 95 96 #Should start >>... and not >>>... 97 assert line[0:2] == ">>" and not line[2] == ">", line 98 99 query_seq_parts, match_seq_parts = [], [] 100 query_annotation, match_annotation = {}, {} 101 match_descr = "" 102 alignment_annotation = {} 103 104 #This should be followed by the target match ID line, then more tags. 105 #e.g. 106 """ 107 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 108 ; fa_frame: f 109 ; fa_initn: 52 110 ; fa_init1: 52 111 ; fa_opt: 70 112 ; fa_z-score: 105.5 113 ; fa_bits: 27.5 114 ; fa_expect: 0.082 115 ; sw_score: 70 116 ; sw_ident: 0.279 117 ; sw_sim: 0.651 118 ; sw_overlap: 43 119 """ 120 if (not line[0:2] == ">>") or line[0:3] == ">>>" : 121 raise ValueError("Expected target line starting '>>'") 122 match_descr = line[2:].strip() 123 #Handle the following "alignment hit" tagged data, e.g. 124 line = handle.readline() 125 line = self._parse_tag_section(line, alignment_annotation) 126 assert not line[0:2] == "; " 127 128 #Then we have the alignment numbers and sequence for the query 129 """ 130 >gi|10955265| .. 131 ; sq_len: 346 132 ; sq_offset: 1 133 ; sq_type: p 134 ; al_start: 197 135 ; al_stop: 238 136 ; al_display_start: 167 137 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK 138 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL 139 GEYFTENKPKYIIREIHQET 140 """ 141 if not (line[0] == ">" and line.strip().endswith("..")): 142 raise ValueError("Expected line starting '>' and ending '..'") 143 assert self._query_descr.startswith(line[1:].split(None,1)[0]) 144 145 #Handle the following "query alignment" tagged data 146 line = handle.readline() 147 line = self._parse_tag_section(line, query_annotation) 148 assert not line[0:2] == "; " 149 150 #Now should have the aligned query sequence (with leading flanking region) 151 while not line[0] == ">" : 152 query_seq_parts.append(line.strip()) 153 line = handle.readline() 154 155 #Handle the following "match alignment" data 156 """ 157 >gi|152973545|ref|YP_001338596.1| .. 158 ; sq_len: 242 159 ; sq_type: p 160 ; al_start: 52 161 ; al_stop: 94 162 ; al_display_start: 22 163 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD 164 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR 165 QDFAFTRKMRREARQVEQSW 166 """ 167 #Match identifier 168 if not (line[0] == ">" and line.strip().endswith("..")): 169 raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line)) 170 assert match_descr.startswith(line[1:].split(None,1)[0]) 171 172 #Tagged data, 173 line = handle.readline() 174 line = self._parse_tag_section(line, match_annotation) 175 assert not line[0:2] == "; " 176 177 #Now should have the aligned query sequence with flanking region... 178 #but before that, since FASTA 35.4.1 there can be an consensus here, 179 """ 180 ; al_cons: 181 .::. : :. ---. :: :. . : ..-:::-: :.: ..:...: 182 etc 183 """ 184 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line): 185 match_seq_parts.append(line.strip()) 186 line = handle.readline() 187 if line[0:2] == "; " : 188 assert line.strip() == "; al_cons:" 189 align_consensus_parts = [] 190 line = handle.readline() 191 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line): 192 align_consensus_parts.append(line.strip()) 193 line = handle.readline() 194 #If we do anything with this in future, must remove any flanking region. 195 align_consensus = "".join(align_consensus_parts) 196 del align_consensus_parts 197 assert not line[0:2] == "; " 198 else : 199 align_consensus = None 200 assert (line[0] == ">" or ">>>" in line) 201 self._header = line 202 203 #We built a list of strings and then joined them because 204 #its faster than appending to a string. 205 query_seq = "".join(query_seq_parts) 206 match_seq = "".join(match_seq_parts) 207 del query_seq_parts, match_seq_parts 208 #Note, query_seq and match_seq will usually be of different lengths, apparently 209 #because in the m10 format leading gaps are added but not trailing gaps! 210 211 #Remove the flanking regions, 212 query_align_seq = self._extract_alignment_region(query_seq, query_annotation) 213 match_align_seq = self._extract_alignment_region(match_seq, match_annotation) 214 #How can we do this for the (optional) consensus? 215 216 #The "sq_offset" values can be specified with the -X command line option. 217 #They appear to just shift the origin used in the calculation of the coordinates. 218 219 if len(query_align_seq) != len(match_align_seq) : 220 raise ValueError("Problem parsing the alignment sequence coordinates") 221 if "sw_overlap" in alignment_annotation : 222 if int(alignment_annotation["sw_overlap"]) != len(query_align_seq) : 223 raise ValueError("Specified sw_overlap = %s does not match expected value %i" \ 224 % (alignment_annotation["sw_overlap"], 225 len(query_align_seq))) 226 227 #TODO - Look at the "sq_type" to assign a sensible alphabet? 228 alphabet = self.alphabet 229 alignment = Alignment(alphabet) 230 231 #TODO - Introduce an annotated alignment class? 232 #For now, store the annotation a new private property: 233 alignment._annotations = {} 234 235 #Want to record both the query header tags, and the alignment tags. 236 for key, value in self._query_header_annotation.iteritems() : 237 alignment._annotations[key] = value 238 for key, value in alignment_annotation.iteritems() : 239 alignment._annotations[key] = value 240 241 242 #TODO - Once the alignment object gets an append method, use it. 243 #(i.e. an add SeqRecord method) 244 alignment.add_sequence(self._query_descr, query_align_seq) 245 record = alignment.get_all_seqs()[-1] 246 assert record.id == self._query_descr or record.description == self._query_descr 247 #assert record.seq.tostring() == query_align_seq 248 record.id = self._query_descr.split(None,1)[0].strip(",") 249 record.name = "query" 250 record.annotations["original_length"] = int(query_annotation["sq_len"]) 251 252 #TODO - What if a specific alphabet has been requested? 253 #TODO - Use an IUPAC alphabet? 254 #TODO - Can FASTA output RNA? 255 if alphabet == single_letter_alphabet and "sq_type" in query_annotation : 256 if query_annotation["sq_type"] == "D" : 257 record.seq.alphabet = generic_dna 258 elif query_annotation["sq_type"] == "p" : 259 record.seq.alphabet = generic_protein 260 if "-" in query_align_seq : 261 if not hasattr(record.seq.alphabet,"gap_char") : 262 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 263 264 alignment.add_sequence(match_descr, match_align_seq) 265 record = alignment.get_all_seqs()[-1] 266 assert record.id == match_descr or record.description == match_descr 267 #assert record.seq.tostring() == match_align_seq 268 record.id = match_descr.split(None,1)[0].strip(",") 269 record.name = "match" 270 record.annotations["original_length"] = int(match_annotation["sq_len"]) 271 272 #This is still a very crude way of dealing with the alphabet: 273 if alphabet == single_letter_alphabet and "sq_type" in match_annotation : 274 if match_annotation["sq_type"] == "D" : 275 record.seq.alphabet = generic_dna 276 elif match_annotation["sq_type"] == "p" : 277 record.seq.alphabet = generic_protein 278 if "-" in match_align_seq : 279 if not hasattr(record.seq.alphabet,"gap_char") : 280 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 281 282 return alignment
283
284 - def _skip_file_header(self, line) :
285 """Helper function for the main parsing code. 286 287 Skips over the file header region. 288 """ 289 #e.g. This region: 290 """ 291 # /home/xxx/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X "-5 -5" NC_002127.faa NC_009649.faa 292 FASTA searches a protein or DNA sequence data bank 293 version 35.03 Feb. 18, 2008 294 Please cite: 295 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 296 297 Query: NC_002127.faa 298 """ 299 #Note that there is no point recording the command line here 300 #from the # line, as it is included again in each alignment 301 #under the "pg_argv" tag. Likewise for the program version. 302 while ">>>" not in line : 303 line = self.handle.readline() 304 return line
305
306 - def _parse_query_header(self, line) :
307 """Helper function for the main parsing code. 308 309 Skips over the free format query header, extracting the tagged parameters. 310 311 If there are no hits for the current query, it is skipped entirely.""" 312 #e.g. this region (where there is often a histogram too): 313 """ 314 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa 315 Library: NC_009649.faa 45119 residues in 180 sequences 316 317 45119 residues in 180 sequences 318 Statistics: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508 319 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32 320 Lambda= 0.191631 321 Algorithm: FASTA (3.5 Sept 2006) [optimized] 322 Parameters: BL50 matrix (15:-5) ktup: 2 323 join: 36, opt: 24, open/ext: -10/-2, width: 16 324 Scan time: 0.040 325 326 The best scores are: opt bits E(180) 327 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 23.3 0.22 328 gi|152973501|ref|YP_001338552.1| hypothetical prot ( 245) 55 22.5 0.93 329 """ 330 #Sometimes have queries with no matches, in which case we continue to the 331 #next query block: 332 """ 333 2>>>gi|152973838|ref|YP_001338875.1| hypothetical protein KPN_pKPN7p10263 [Klebsiella pneumoniae subsp. pneumonia 76 aa - 76 aa 334 vs NC_002127.faa library 335 336 579 residues in 3 sequences 337 Altschul/Gish params: n0: 76 Lambda: 0.158 K: 0.019 H: 0.100 338 339 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 340 join: 36, opt: 24, open/ext: -10/-2, width: 16 341 Scan time: 0.000 342 !! No library sequences with E() < 0.5 343 """ 344 345 self._query_header_annotation = {} 346 self._query_descr = "" 347 348 assert ">>>" in line and not line[0:3] == ">>>" 349 #There is nothing useful in this line, the query description is truncated. 350 351 line = self.handle.readline() 352 #We ignore the free form text... 353 while not line[0:3] == ">>>" : 354 #print "Ignoring %s" % line.strip() 355 line = self.handle.readline() 356 if not line : 357 raise ValueError("Premature end of file!") 358 if ">>><<<" in line : 359 #End of alignments, looks like the last query 360 #or queries had no hits. 361 return line 362 363 #Now want to parse this section: 364 """ 365 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library 366 ; pg_name: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35 367 ; pg_ver: 35.03 368 ; pg_argv: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X -5 -5 NC_002127.faa NC_009649.faa 369 ; pg_name: FASTA 370 ; pg_ver: 3.5 Sept 2006 371 ; pg_matrix: BL50 (15:-5) 372 ; pg_open-ext: -10 -2 373 ; pg_ktup: 2 374 ; pg_optcut: 24 375 ; pg_cgap: 36 376 ; mp_extrap: 60000 500 377 ; mp_stats: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32 Lambda= 0.191631 378 ; mp_KS: -0.0000 (N=1066338402) at 20 379 ; mp_Algorithm: FASTA (3.5 Sept 2006) [optimized] 380 ; mp_Parameters: BL50 matrix (15:-5) ktup: 2 join: 36, opt: 24, open/ext: -10/-2, width: 16 381 """ 382 383 assert line[0:3] == ">>>", line 384 self._query_descr = line[3:].strip() 385 386 #Handle the following "program" tagged data, 387 line = self.handle.readline() 388 line = self._parse_tag_section(line, self._query_header_annotation) 389 assert not line[0:2] == "; ", line 390 assert line[0:2] == ">>" or ">>>" in line, line 391 return line
392 393
394 - def _extract_alignment_region(self, alignment_seq_with_flanking, annotation) :
395 """Helper function for the main parsing code. 396 397 To get the actual pairwise alignment sequences, we must first 398 translate the un-gapped sequence based coordinates into positions 399 in the gapped sequence (which may have a flanking region shown 400 using leading - characters). To date, I have never seen any 401 trailing flanking region shown in the m10 file, but the 402 following code should also cope with that. 403 404 Note that this code seems to work fine even when the "sq_offset" 405 entries are prsent as a result of using the -X command line option. 406 """ 407 if int(annotation['al_start']) > int(annotation['al_stop']) : 408 raise ValueError("Unsupported inverted sequence found (from the -i option?)") 409 align_stripped = alignment_seq_with_flanking.strip("-") 410 display_start = int(annotation['al_display_start']) 411 start = int(annotation['al_start']) \ 412 - display_start 413 end = int(annotation['al_stop']) \ 414 - display_start \ 415 + align_stripped.count("-") + 1 416 return align_stripped[start:end]
417 418
419 - def _parse_tag_section(self, line, dictionary) :
420 """Helper function for the main parsing code. 421 422 line - supply line just read from the handle containing the start of 423 the tagged section. 424 dictionary - where to record the tagged values 425 426 Returns a string, the first line following the tagged section.""" 427 if not line[0:2] == "; " : 428 raise ValueError("Expected line starting '; '") 429 while line[0:2] == "; " : 430 tag, value = line[2:].strip().split(": ",1) 431 #fasta34 and early versions of fasta35 will reuse the pg_name and 432 #pg_ver tags for the program executable and name, and the program 433 #version and the algorithm version, respectively. This is fixed 434 #in FASTA 35.4.1, but we can't assume the tags are unique: 435 #if tag in dictionary : 436 # raise ValueError("Repeated tag '%s' in section" % tag) 437 dictionary[tag] = value 438 line = self.handle.readline() 439 return line
440 441 if __name__ == "__main__" : 442 print "Running a quick self-test" 443 444 #http://emboss.sourceforge.net/docs/themes/alnformats/align.simple 445 simple_example = \ 446 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 447 FASTA searches a protein or DNA sequence data bank 448 version 34.26 January 12, 2007 449 Please cite: 450 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 451 452 Query library NC_002127.faa vs NC_009649.faa library 453 searching NC_009649.faa library 454 455 1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa 456 vs NC_009649.faa library 457 458 45119 residues in 180 sequences 459 Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 460 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 461 Lambda= 0.175043 462 463 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 464 join: 36, opt: 24, open/ext: -10/-2, width: 16 465 Scan time: 0.000 466 The best scores are: opt bits E(180) 467 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931) 71 24.9 0.58 468 gi|152973588|ref|YP_001338639.1| F pilus assembly ( 459) 63 23.1 0.99 469 470 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library 471 ; pg_name: /opt/fasta/fasta34 472 ; pg_ver: 34.26 473 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 474 ; pg_name: FASTA 475 ; pg_ver: 3.5 Sept 2006 476 ; pg_matrix: BL50 (15:-5) 477 ; pg_open-ext: -10 -2 478 ; pg_ktup: 2 479 ; pg_optcut: 24 480 ; pg_cgap: 36 481 ; mp_extrap: 60000 180 482 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 Lambda= 0.175043 483 ; mp_KS: -0.0000 (N=0) at 8159228 484 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 485 ; fa_frame: f 486 ; fa_initn: 65 487 ; fa_init1: 43 488 ; fa_opt: 71 489 ; fa_z-score: 90.3 490 ; fa_bits: 24.9 491 ; fa_expect: 0.58 492 ; sw_score: 71 493 ; sw_ident: 0.250 494 ; sw_sim: 0.574 495 ; sw_overlap: 108 496 >gi|10955263| .. 497 ; sq_len: 107 498 ; sq_offset: 1 499 ; sq_type: p 500 ; al_start: 5 501 ; al_stop: 103 502 ; al_display_start: 1 503 --------------------------MTKRSGSNT-RRRAISRPVRLTAE 504 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM----- 505 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD 506 >gi|152973457|ref|YP_001338508.1| .. 507 ; sq_len: 931 508 ; sq_type: p 509 ; al_start: 96 510 ; al_stop: 195 511 ; al_display_start: 66 512 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ 513 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI 514 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY 515 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ 516 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 517 ; fa_frame: f 518 ; fa_initn: 33 519 ; fa_init1: 33 520 ; fa_opt: 63 521 ; fa_z-score: 86.1 522 ; fa_bits: 23.1 523 ; fa_expect: 0.99 524 ; sw_score: 63 525 ; sw_ident: 0.266 526 ; sw_sim: 0.656 527 ; sw_overlap: 64 528 >gi|10955263| .. 529 ; sq_len: 107 530 ; sq_offset: 1 531 ; sq_type: p 532 ; al_start: 32 533 ; al_stop: 94 534 ; al_display_start: 2 535 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK 536 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL 537 LSRLMAD 538 >gi|152973588|ref|YP_001338639.1| .. 539 ; sq_len: 459 540 ; sq_type: p 541 ; al_start: 191 542 ; al_stop: 248 543 ; al_display_start: 161 544 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK 545 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG 546 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT 547 SNKALKSQISALLSSIQNKAVADEKLTDQE 548 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa 549 vs NC_009649.faa library 550 551 45119 residues in 180 sequences 552 Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 553 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 554 Lambda= 0.179384 555 556 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 557 join: 36, opt: 24, open/ext: -10/-2, width: 16 558 Scan time: 0.000 559 The best scores are: opt bits E(180) 560 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 22.9 0.29 561 562 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library 563 ; pg_name: /opt/fasta/fasta34 564 ; pg_ver: 34.26 565 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 566 ; pg_name: FASTA 567 ; pg_ver: 3.5 Sept 2006 568 ; pg_matrix: BL50 (15:-5) 569 ; pg_open-ext: -10 -2 570 ; pg_ktup: 2 571 ; pg_optcut: 24 572 ; pg_cgap: 36 573 ; mp_extrap: 60000 180 574 ; mp_stats: Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 Lambda= 0.179384 575 ; mp_KS: -0.0000 (N=0) at 8159228 576 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 577 ; fa_frame: f 578 ; fa_initn: 50 579 ; fa_init1: 50 580 ; fa_opt: 58 581 ; fa_z-score: 95.8 582 ; fa_bits: 22.9 583 ; fa_expect: 0.29 584 ; sw_score: 58 585 ; sw_ident: 0.289 586 ; sw_sim: 0.632 587 ; sw_overlap: 38 588 >gi|10955264| .. 589 ; sq_len: 126 590 ; sq_offset: 1 591 ; sq_type: p 592 ; al_start: 1 593 ; al_stop: 38 594 ; al_display_start: 1 595 ------------------------------MKKDKKYQIEAIKNKDKTLF 596 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF 597 NGEKFSSYTLNKVTKTDEYN 598 >gi|152973462|ref|YP_001338513.1| .. 599 ; sq_len: 101 600 ; sq_type: p 601 ; al_start: 44 602 ; al_stop: 81 603 ; al_display_start: 14 604 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI 605 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA 606 3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa 607 vs NC_009649.faa library 608 609 45119 residues in 180 sequences 610 Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 611 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 612 Lambda= 0.210386 613 614 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 615 join: 37, opt: 25, open/ext: -10/-2, width: 16 616 Scan time: 0.020 617 The best scores are: opt bits E(180) 618 gi|152973545|ref|YP_001338596.1| putative plasmid ( 242) 70 27.5 0.082 619 620 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library 621 ; pg_name: /opt/fasta/fasta34 622 ; pg_ver: 34.26 623 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 624 ; pg_name: FASTA 625 ; pg_ver: 3.5 Sept 2006 626 ; pg_matrix: BL50 (15:-5) 627 ; pg_open-ext: -10 -2 628 ; pg_ktup: 2 629 ; pg_optcut: 25 630 ; pg_cgap: 37 631 ; mp_extrap: 60000 180 632 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 Lambda= 0.210386 633 ; mp_KS: -0.0000 (N=0) at 8159228 634 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 635 ; fa_frame: f 636 ; fa_initn: 52 637 ; fa_init1: 52 638 ; fa_opt: 70 639 ; fa_z-score: 105.5 640 ; fa_bits: 27.5 641 ; fa_expect: 0.082 642 ; sw_score: 70 643 ; sw_ident: 0.279 644 ; sw_sim: 0.651 645 ; sw_overlap: 43 646 >gi|10955265| .. 647 ; sq_len: 346 648 ; sq_offset: 1 649 ; sq_type: p 650 ; al_start: 197 651 ; al_stop: 238 652 ; al_display_start: 167 653 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK 654 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL 655 GEYFTENKPKYIIREIHQET 656 >gi|152973545|ref|YP_001338596.1| .. 657 ; sq_len: 242 658 ; sq_type: p 659 ; al_start: 52 660 ; al_stop: 94 661 ; al_display_start: 22 662 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD 663 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR 664 QDFAFTRKMRREARQVEQSW 665 >>><<< 666 667 668 579 residues in 3 query sequences 669 45119 residues in 180 library sequences 670 Scomplib [34.26] 671 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008 672 Total Scan time: 0.020 Total Display time: 0.010 673 674 Function used was FASTA [version 34.26 January 12, 2007] 675 676 """ 677 678 679 from StringIO import StringIO 680 681 alignments = list(FastaM10Iterator(StringIO(simple_example))) 682 assert len(alignments) == 4, len(alignments) 683 assert len(alignments[0].get_all_seqs()) == 2 684 for a in alignments : 685 print "Alignment %i sequences of length %i" \ 686 % (len(a.get_all_seqs()), a.get_alignment_length()) 687 for r in a : 688 print "%s %s %i" % (r.seq, r.id, r.annotations["original_length"]) 689 #print a.annotations 690 print "Done" 691 692 import os 693 path = "../../Tests/Fasta/" 694 files = [f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10"] 695 files.sort() 696 for filename in files : 697 if os.path.splitext(filename)[-1] == ".m10" : 698 print 699 print filename 700 print "="*len(filename) 701 for i,a in enumerate(FastaM10Iterator(open(os.path.join(path,filename)))): 702 print "#%i, %s" % (i+1,a) 703 for r in a : 704 if "-" in r.seq : 705 assert r.seq.alphabet.gap_char == "-" 706 else : 707 assert not hasattr(r.seq.alphabet, "gap_char") 708