Package Bio :: Package AlignIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.FastaIO

  1  # Copyright 2008-2010 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11   
 12  This module contains a parser for the pairwise alignments produced by Bill 
 13  Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is 
 14  refered to as the "fasta-m10" file format (as we only support the machine 
 15  readable output format selected with the -m 10 command line option). 
 16   
 17  This module does NOT cover the generic "fasta" file format originally 
 18  developed as an input format to the FASTA tools.  The Bio.AlignIO and 
 19  Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files, 
 20  which can also be used to store a multiple sequence alignments. 
 21  """ 
 22   
 23  from Bio.Seq import Seq 
 24  from Bio.SeqRecord import SeqRecord 
 25  from Bio.Align import MultipleSeqAlignment 
 26  from Interfaces import AlignmentIterator 
 27  from Bio.Alphabet import single_letter_alphabet, generic_dna, generic_protein 
 28  from Bio.Alphabet import Gapped 
 29   
 30  # TODO - Turn this into a doctest 
31 -class FastaM10Iterator(AlignmentIterator):
32 """Alignment iterator for the FASTA tool's pairwise alignment output. 33 34 This is for reading the pairwise alignments output by Bill Pearson's 35 FASTA program when called with the -m 10 command line option for machine 36 readable output. For more details about the FASTA tools, see the website 37 http://fasta.bioch.virginia.edu/ and the paper: 38 39 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 40 41 This class is intended to be used via the Bio.AlignIO.parse() function 42 by specifying the format as "fasta-m10" as shown in the following code: 43 44 from Bio import AlignIO 45 handle = ... 46 for a in AlignIO.parse(handle, "fasta-m10"): 47 assert len(a) == 2, "Should be pairwise!" 48 print "Alignment length %i" % a.get_alignment_length() 49 for record in a: 50 print record.seq, record.name, record.id 51 52 Note that this is not a full blown parser for all the information 53 in the FASTA output - for example, most of the header and all of the 54 footer is ignored. Also, the alignments are not batched according to 55 the input queries. 56 57 Also note that there can be up to about 30 letters of flanking region 58 included in the raw FASTA output as contextual information. This is NOT 59 part of the alignment itself, and is not included in the resulting 60 MultipleSeqAlignment objects returned. 61 """ 62
63 - def next(self):
64 """Reads from the handle to construct and return the next alignment. 65 66 This returns the pairwise alignment of query and match/library 67 sequences as an MultipleSeqAlignment object containing two rows. 68 """ 69 handle = self.handle 70 71 try: 72 #Header we saved from when we were parsing 73 #the previous alignment. 74 line = self._header 75 del self._header 76 except AttributeError: 77 line = handle.readline() 78 if not line: 79 raise StopIteration 80 81 if line.startswith("#"): 82 #Skip the file header before the alignments. e.g. 83 line = self._skip_file_header(line) 84 while ">>>" in line and not line.startswith(">>>"): 85 #Moved onto the next query sequence! 86 self._query_descr = "" 87 self._query_header_annotation = {} 88 #Read in the query header 89 line = self._parse_query_header(line) 90 #Now should be some alignments, but if not we move onto the next query 91 if not line: 92 #End of file 93 raise StopIteration 94 if ">>><<<" in line: 95 #Reached the end of the alignments, no need to read the footer... 96 raise StopIteration 97 98 99 #Should start >>... and not >>>... 100 assert line[0:2] == ">>" and not line[2] == ">", line 101 102 query_seq_parts, match_seq_parts = [], [] 103 query_annotation, match_annotation = {}, {} 104 match_descr = "" 105 alignment_annotation = {} 106 107 #This should be followed by the target match ID line, then more tags. 108 #e.g. 109 """ 110 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 111 ; fa_frame: f 112 ; fa_initn: 52 113 ; fa_init1: 52 114 ; fa_opt: 70 115 ; fa_z-score: 105.5 116 ; fa_bits: 27.5 117 ; fa_expect: 0.082 118 ; sw_score: 70 119 ; sw_ident: 0.279 120 ; sw_sim: 0.651 121 ; sw_overlap: 43 122 """ 123 if (not line[0:2] == ">>") or line[0:3] == ">>>": 124 raise ValueError("Expected target line starting '>>'") 125 match_descr = line[2:].strip() 126 #Handle the following "alignment hit" tagged data, e.g. 127 line = handle.readline() 128 line = self._parse_tag_section(line, alignment_annotation) 129 assert not line[0:2] == "; " 130 131 #Then we have the alignment numbers and sequence for the query 132 """ 133 >gi|10955265| .. 134 ; sq_len: 346 135 ; sq_offset: 1 136 ; sq_type: p 137 ; al_start: 197 138 ; al_stop: 238 139 ; al_display_start: 167 140 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK 141 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL 142 GEYFTENKPKYIIREIHQET 143 """ 144 if not (line[0] == ">" and line.strip().endswith("..")): 145 raise ValueError("Expected line starting '>' and ending '..'") 146 assert self._query_descr.startswith(line[1:].split(None,1)[0]) 147 148 #Handle the following "query alignment" tagged data 149 line = handle.readline() 150 line = self._parse_tag_section(line, query_annotation) 151 assert not line[0:2] == "; " 152 153 #Now should have the aligned query sequence (with leading flanking region) 154 while not line[0] == ">": 155 query_seq_parts.append(line.strip()) 156 line = handle.readline() 157 158 #Handle the following "match alignment" data 159 """ 160 >gi|152973545|ref|YP_001338596.1| .. 161 ; sq_len: 242 162 ; sq_type: p 163 ; al_start: 52 164 ; al_stop: 94 165 ; al_display_start: 22 166 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD 167 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR 168 QDFAFTRKMRREARQVEQSW 169 """ 170 #Match identifier 171 if not (line[0] == ">" and line.strip().endswith("..")): 172 raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line)) 173 assert match_descr.startswith(line[1:].split(None,1)[0]) 174 175 #Tagged data, 176 line = handle.readline() 177 line = self._parse_tag_section(line, match_annotation) 178 assert not line[0:2] == "; " 179 180 #Now should have the aligned query sequence with flanking region... 181 #but before that, since FASTA 35.4.1 there can be an consensus here, 182 """ 183 ; al_cons: 184 .::. : :. ---. :: :. . : ..-:::-: :.: ..:...: 185 etc 186 """ 187 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line): 188 match_seq_parts.append(line.strip()) 189 line = handle.readline() 190 if line[0:2] == "; ": 191 assert line.strip() == "; al_cons:" 192 align_consensus_parts = [] 193 line = handle.readline() 194 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line): 195 align_consensus_parts.append(line.strip()) 196 line = handle.readline() 197 #If we do anything with this in future, must remove any flanking region. 198 align_consensus = "".join(align_consensus_parts) 199 del align_consensus_parts 200 assert not line[0:2] == "; " 201 else: 202 align_consensus = None 203 assert (line[0] == ">" or ">>>" in line) 204 self._header = line 205 206 #We built a list of strings and then joined them because 207 #its faster than appending to a string. 208 query_seq = "".join(query_seq_parts) 209 match_seq = "".join(match_seq_parts) 210 del query_seq_parts, match_seq_parts 211 #Note, query_seq and match_seq will usually be of different lengths, apparently 212 #because in the m10 format leading gaps are added but not trailing gaps! 213 214 #Remove the flanking regions, 215 query_align_seq = self._extract_alignment_region(query_seq, query_annotation) 216 match_align_seq = self._extract_alignment_region(match_seq, match_annotation) 217 #How can we do this for the (optional) consensus? 218 219 #The "sq_offset" values can be specified with the -X command line option. 220 #They appear to just shift the origin used in the calculation of the coordinates. 221 222 if len(query_align_seq) != len(match_align_seq): 223 raise ValueError("Problem parsing the alignment sequence coordinates, " 224 "following should be the same length but are not:\n" 225 "%s - len %i\n%s - len %i" % (query_align_seq, 226 len(query_align_seq), 227 match_align_seq, 228 len(match_align_seq))) 229 if "sw_overlap" in alignment_annotation: 230 if int(alignment_annotation["sw_overlap"]) != len(query_align_seq): 231 raise ValueError("Specified sw_overlap = %s does not match expected value %i" \ 232 % (alignment_annotation["sw_overlap"], 233 len(query_align_seq))) 234 235 #TODO - Look at the "sq_type" to assign a sensible alphabet? 236 alphabet = self.alphabet 237 alignment = MultipleSeqAlignment([], alphabet) 238 239 #TODO - Introduce an annotated alignment class? 240 #For now, store the annotation a new private property: 241 alignment._annotations = {} 242 243 #Want to record both the query header tags, and the alignment tags. 244 for key, value in self._query_header_annotation.iteritems(): 245 alignment._annotations[key] = value 246 for key, value in alignment_annotation.iteritems(): 247 alignment._annotations[key] = value 248 249 #Query 250 #===== 251 record = SeqRecord(Seq(query_align_seq, alphabet), 252 id = self._query_descr.split(None,1)[0].strip(","), 253 name = "query", 254 description = self._query_descr, 255 annotations = {"original_length" : int(query_annotation["sq_len"])}) 256 #TODO - handle start/end coordinates properly. Short term hack for now: 257 record._al_start = int(query_annotation["al_start"]) 258 record._al_stop = int(query_annotation["al_stop"]) 259 alignment.append(record) 260 261 #TODO - What if a specific alphabet has been requested? 262 #TODO - Use an IUPAC alphabet? 263 #TODO - Can FASTA output RNA? 264 if alphabet == single_letter_alphabet and "sq_type" in query_annotation: 265 if query_annotation["sq_type"] == "D": 266 record.seq.alphabet = generic_dna 267 elif query_annotation["sq_type"] == "p": 268 record.seq.alphabet = generic_protein 269 if "-" in query_align_seq: 270 if not hasattr(record.seq.alphabet,"gap_char"): 271 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 272 273 #Match 274 #===== 275 record = SeqRecord(Seq(match_align_seq, alphabet), 276 id = match_descr.split(None,1)[0].strip(","), 277 name = "match", 278 description = match_descr, 279 annotations = {"original_length" : int(match_annotation["sq_len"])}) 280 #TODO - handle start/end coordinates properly. Short term hack for now: 281 record._al_start = int(match_annotation["al_start"]) 282 record._al_stop = int(match_annotation["al_stop"]) 283 alignment.append(record) 284 285 #This is still a very crude way of dealing with the alphabet: 286 if alphabet == single_letter_alphabet and "sq_type" in match_annotation: 287 if match_annotation["sq_type"] == "D": 288 record.seq.alphabet = generic_dna 289 elif match_annotation["sq_type"] == "p": 290 record.seq.alphabet = generic_protein 291 if "-" in match_align_seq: 292 if not hasattr(record.seq.alphabet,"gap_char"): 293 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 294 295 return alignment
296
297 - def _skip_file_header(self, line):
298 """Helper function for the main parsing code. 299 300 Skips over the file header region. 301 """ 302 #e.g. This region: 303 """ 304 # /home/xxx/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X "-5 -5" NC_002127.faa NC_009649.faa 305 FASTA searches a protein or DNA sequence data bank 306 version 35.03 Feb. 18, 2008 307 Please cite: 308 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 309 310 Query: NC_002127.faa 311 """ 312 #Note that there is no point recording the command line here 313 #from the # line, as it is included again in each alignment 314 #under the "pg_argv" tag. Likewise for the program version. 315 while ">>>" not in line: 316 line = self.handle.readline() 317 return line
318
319 - def _parse_query_header(self, line):
320 """Helper function for the main parsing code. 321 322 Skips over the free format query header, extracting the tagged parameters. 323 324 If there are no hits for the current query, it is skipped entirely.""" 325 #e.g. this region (where there is often a histogram too): 326 """ 327 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa 328 Library: NC_009649.faa 45119 residues in 180 sequences 329 330 45119 residues in 180 sequences 331 Statistics: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508 332 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32 333 Lambda= 0.191631 334 Algorithm: FASTA (3.5 Sept 2006) [optimized] 335 Parameters: BL50 matrix (15:-5) ktup: 2 336 join: 36, opt: 24, open/ext: -10/-2, width: 16 337 Scan time: 0.040 338 339 The best scores are: opt bits E(180) 340 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 23.3 0.22 341 gi|152973501|ref|YP_001338552.1| hypothetical prot ( 245) 55 22.5 0.93 342 """ 343 #Sometimes have queries with no matches, in which case we continue to the 344 #next query block: 345 """ 346 2>>>gi|152973838|ref|YP_001338875.1| hypothetical protein KPN_pKPN7p10263 [Klebsiella pneumoniae subsp. pneumonia 76 aa - 76 aa 347 vs NC_002127.faa library 348 349 579 residues in 3 sequences 350 Altschul/Gish params: n0: 76 Lambda: 0.158 K: 0.019 H: 0.100 351 352 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 353 join: 36, opt: 24, open/ext: -10/-2, width: 16 354 Scan time: 0.000 355 !! No library sequences with E() < 0.5 356 """ 357 358 self._query_header_annotation = {} 359 self._query_descr = "" 360 361 assert ">>>" in line and not line[0:3] == ">>>" 362 #There is nothing useful in this line, the query description is truncated. 363 364 line = self.handle.readline() 365 #We ignore the free form text... 366 while not line[0:3] == ">>>": 367 #print "Ignoring %s" % line.strip() 368 line = self.handle.readline() 369 if not line: 370 raise ValueError("Premature end of file!") 371 if ">>><<<" in line: 372 #End of alignments, looks like the last query 373 #or queries had no hits. 374 return line 375 376 #Now want to parse this section: 377 """ 378 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library 379 ; pg_name: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35 380 ; pg_ver: 35.03 381 ; pg_argv: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X -5 -5 NC_002127.faa NC_009649.faa 382 ; pg_name: FASTA 383 ; pg_ver: 3.5 Sept 2006 384 ; pg_matrix: BL50 (15:-5) 385 ; pg_open-ext: -10 -2 386 ; pg_ktup: 2 387 ; pg_optcut: 24 388 ; pg_cgap: 36 389 ; mp_extrap: 60000 500 390 ; mp_stats: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32 Lambda= 0.191631 391 ; mp_KS: -0.0000 (N=1066338402) at 20 392 ; mp_Algorithm: FASTA (3.5 Sept 2006) [optimized] 393 ; mp_Parameters: BL50 matrix (15:-5) ktup: 2 join: 36, opt: 24, open/ext: -10/-2, width: 16 394 """ 395 396 assert line[0:3] == ">>>", line 397 self._query_descr = line[3:].strip() 398 399 #Handle the following "program" tagged data, 400 line = self.handle.readline() 401 line = self._parse_tag_section(line, self._query_header_annotation) 402 assert not line[0:2] == "; ", line 403 assert line[0:2] == ">>" or ">>>" in line, line 404 return line
405 406
407 - def _extract_alignment_region(self, alignment_seq_with_flanking, annotation):
408 """Helper function for the main parsing code. 409 410 To get the actual pairwise alignment sequences, we must first 411 translate the un-gapped sequence based coordinates into positions 412 in the gapped sequence (which may have a flanking region shown 413 using leading - characters). To date, I have never seen any 414 trailing flanking region shown in the m10 file, but the 415 following code should also cope with that. 416 417 Note that this code seems to work fine even when the "sq_offset" 418 entries are prsent as a result of using the -X command line option. 419 """ 420 align_stripped = alignment_seq_with_flanking.strip("-") 421 display_start = int(annotation['al_display_start']) 422 if int(annotation['al_start']) <= int(annotation['al_stop']): 423 start = int(annotation['al_start']) \ 424 - display_start 425 end = int(annotation['al_stop']) \ 426 - display_start \ 427 + align_stripped.count("-") + 1 428 else: 429 #FASTA has flipped this sequence... 430 start = display_start \ 431 - int(annotation['al_start']) 432 end = display_start \ 433 - int(annotation['al_stop']) \ 434 + align_stripped.count("-") + 1 435 assert 0 <= start and start < end and end <= len(align_stripped), \ 436 "Problem with sequence start/stop,\n%s[%i:%i]\n%s" \ 437 % (alignment_seq_with_flanking, start, end, annotation) 438 return align_stripped[start:end]
439 440
441 - def _parse_tag_section(self, line, dictionary):
442 """Helper function for the main parsing code. 443 444 line - supply line just read from the handle containing the start of 445 the tagged section. 446 dictionary - where to record the tagged values 447 448 Returns a string, the first line following the tagged section.""" 449 if not line[0:2] == "; ": 450 raise ValueError("Expected line starting '; '") 451 while line[0:2] == "; ": 452 tag, value = line[2:].strip().split(": ",1) 453 #fasta34 and early versions of fasta35 will reuse the pg_name and 454 #pg_ver tags for the program executable and name, and the program 455 #version and the algorithm version, respectively. This is fixed 456 #in FASTA 35.4.1, but we can't assume the tags are unique: 457 #if tag in dictionary: 458 # raise ValueError("Repeated tag '%s' in section" % tag) 459 dictionary[tag] = value 460 line = self.handle.readline() 461 return line
462 463 if __name__ == "__main__": 464 print "Running a quick self-test" 465 466 #http://emboss.sourceforge.net/docs/themes/alnformats/align.simple 467 simple_example = \ 468 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 469 FASTA searches a protein or DNA sequence data bank 470 version 34.26 January 12, 2007 471 Please cite: 472 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 473 474 Query library NC_002127.faa vs NC_009649.faa library 475 searching NC_009649.faa library 476 477 1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa 478 vs NC_009649.faa library 479 480 45119 residues in 180 sequences 481 Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 482 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 483 Lambda= 0.175043 484 485 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 486 join: 36, opt: 24, open/ext: -10/-2, width: 16 487 Scan time: 0.000 488 The best scores are: opt bits E(180) 489 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931) 71 24.9 0.58 490 gi|152973588|ref|YP_001338639.1| F pilus assembly ( 459) 63 23.1 0.99 491 492 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library 493 ; pg_name: /opt/fasta/fasta34 494 ; pg_ver: 34.26 495 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 496 ; pg_name: FASTA 497 ; pg_ver: 3.5 Sept 2006 498 ; pg_matrix: BL50 (15:-5) 499 ; pg_open-ext: -10 -2 500 ; pg_ktup: 2 501 ; pg_optcut: 24 502 ; pg_cgap: 36 503 ; mp_extrap: 60000 180 504 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 Lambda= 0.175043 505 ; mp_KS: -0.0000 (N=0) at 8159228 506 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 507 ; fa_frame: f 508 ; fa_initn: 65 509 ; fa_init1: 43 510 ; fa_opt: 71 511 ; fa_z-score: 90.3 512 ; fa_bits: 24.9 513 ; fa_expect: 0.58 514 ; sw_score: 71 515 ; sw_ident: 0.250 516 ; sw_sim: 0.574 517 ; sw_overlap: 108 518 >gi|10955263| .. 519 ; sq_len: 107 520 ; sq_offset: 1 521 ; sq_type: p 522 ; al_start: 5 523 ; al_stop: 103 524 ; al_display_start: 1 525 --------------------------MTKRSGSNT-RRRAISRPVRLTAE 526 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM----- 527 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD 528 >gi|152973457|ref|YP_001338508.1| .. 529 ; sq_len: 931 530 ; sq_type: p 531 ; al_start: 96 532 ; al_stop: 195 533 ; al_display_start: 66 534 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ 535 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI 536 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY 537 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ 538 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 539 ; fa_frame: f 540 ; fa_initn: 33 541 ; fa_init1: 33 542 ; fa_opt: 63 543 ; fa_z-score: 86.1 544 ; fa_bits: 23.1 545 ; fa_expect: 0.99 546 ; sw_score: 63 547 ; sw_ident: 0.266 548 ; sw_sim: 0.656 549 ; sw_overlap: 64 550 >gi|10955263| .. 551 ; sq_len: 107 552 ; sq_offset: 1 553 ; sq_type: p 554 ; al_start: 32 555 ; al_stop: 94 556 ; al_display_start: 2 557 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK 558 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL 559 LSRLMAD 560 >gi|152973588|ref|YP_001338639.1| .. 561 ; sq_len: 459 562 ; sq_type: p 563 ; al_start: 191 564 ; al_stop: 248 565 ; al_display_start: 161 566 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK 567 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG 568 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT 569 SNKALKSQISALLSSIQNKAVADEKLTDQE 570 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa 571 vs NC_009649.faa library 572 573 45119 residues in 180 sequences 574 Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 575 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 576 Lambda= 0.179384 577 578 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 579 join: 36, opt: 24, open/ext: -10/-2, width: 16 580 Scan time: 0.000 581 The best scores are: opt bits E(180) 582 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 22.9 0.29 583 584 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library 585 ; pg_name: /opt/fasta/fasta34 586 ; pg_ver: 34.26 587 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 588 ; pg_name: FASTA 589 ; pg_ver: 3.5 Sept 2006 590 ; pg_matrix: BL50 (15:-5) 591 ; pg_open-ext: -10 -2 592 ; pg_ktup: 2 593 ; pg_optcut: 24 594 ; pg_cgap: 36 595 ; mp_extrap: 60000 180 596 ; mp_stats: Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 Lambda= 0.179384 597 ; mp_KS: -0.0000 (N=0) at 8159228 598 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 599 ; fa_frame: f 600 ; fa_initn: 50 601 ; fa_init1: 50 602 ; fa_opt: 58 603 ; fa_z-score: 95.8 604 ; fa_bits: 22.9 605 ; fa_expect: 0.29 606 ; sw_score: 58 607 ; sw_ident: 0.289 608 ; sw_sim: 0.632 609 ; sw_overlap: 38 610 >gi|10955264| .. 611 ; sq_len: 126 612 ; sq_offset: 1 613 ; sq_type: p 614 ; al_start: 1 615 ; al_stop: 38 616 ; al_display_start: 1 617 ------------------------------MKKDKKYQIEAIKNKDKTLF 618 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF 619 NGEKFSSYTLNKVTKTDEYN 620 >gi|152973462|ref|YP_001338513.1| .. 621 ; sq_len: 101 622 ; sq_type: p 623 ; al_start: 44 624 ; al_stop: 81 625 ; al_display_start: 14 626 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI 627 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA 628 3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa 629 vs NC_009649.faa library 630 631 45119 residues in 180 sequences 632 Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 633 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 634 Lambda= 0.210386 635 636 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 637 join: 37, opt: 25, open/ext: -10/-2, width: 16 638 Scan time: 0.020 639 The best scores are: opt bits E(180) 640 gi|152973545|ref|YP_001338596.1| putative plasmid ( 242) 70 27.5 0.082 641 642 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library 643 ; pg_name: /opt/fasta/fasta34 644 ; pg_ver: 34.26 645 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 646 ; pg_name: FASTA 647 ; pg_ver: 3.5 Sept 2006 648 ; pg_matrix: BL50 (15:-5) 649 ; pg_open-ext: -10 -2 650 ; pg_ktup: 2 651 ; pg_optcut: 25 652 ; pg_cgap: 37 653 ; mp_extrap: 60000 180 654 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 Lambda= 0.210386 655 ; mp_KS: -0.0000 (N=0) at 8159228 656 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 657 ; fa_frame: f 658 ; fa_initn: 52 659 ; fa_init1: 52 660 ; fa_opt: 70 661 ; fa_z-score: 105.5 662 ; fa_bits: 27.5 663 ; fa_expect: 0.082 664 ; sw_score: 70 665 ; sw_ident: 0.279 666 ; sw_sim: 0.651 667 ; sw_overlap: 43 668 >gi|10955265| .. 669 ; sq_len: 346 670 ; sq_offset: 1 671 ; sq_type: p 672 ; al_start: 197 673 ; al_stop: 238 674 ; al_display_start: 167 675 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK 676 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL 677 GEYFTENKPKYIIREIHQET 678 >gi|152973545|ref|YP_001338596.1| .. 679 ; sq_len: 242 680 ; sq_type: p 681 ; al_start: 52 682 ; al_stop: 94 683 ; al_display_start: 22 684 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD 685 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR 686 QDFAFTRKMRREARQVEQSW 687 >>><<< 688 689 690 579 residues in 3 query sequences 691 45119 residues in 180 library sequences 692 Scomplib [34.26] 693 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008 694 Total Scan time: 0.020 Total Display time: 0.010 695 696 Function used was FASTA [version 34.26 January 12, 2007] 697 698 """ 699 700 701 from StringIO import StringIO 702 703 alignments = list(FastaM10Iterator(StringIO(simple_example))) 704 assert len(alignments) == 4, len(alignments) 705 assert len(alignments[0]) == 2 706 for a in alignments: 707 print "Alignment %i sequences of length %i" \ 708 % (len(a), a.get_alignment_length()) 709 for r in a: 710 print "%s %s %i" % (r.seq, r.id, r.annotations["original_length"]) 711 #print a.annotations 712 print "Done" 713 714 import os 715 path = "../../Tests/Fasta/" 716 files = [f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10"] 717 files.sort() 718 for filename in files: 719 if os.path.splitext(filename)[-1] == ".m10": 720 print 721 print filename 722 print "="*len(filename) 723 for i,a in enumerate(FastaM10Iterator(open(os.path.join(path,filename)))): 724 print "#%i, %s" % (i+1,a) 725 for r in a: 726 if "-" in r.seq: 727 assert r.seq.alphabet.gap_char == "-" 728 else: 729 assert not hasattr(r.seq.alphabet, "gap_char") 730