Package Bio :: Package Align :: Module AlignInfo
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.AlignInfo

  1  """Extract information from alignment objects. 
  2   
  3  In order to try and avoid huge alignment objects with tons of functions, 
  4  functions which return summary type information about alignments should 
  5  be put into classes in this module. 
  6   
  7  classes: 
  8  o SummaryInfo 
  9  o PSSM 
 10  """ 
 11   
 12  # standard library 
 13  import string 
 14  import math 
 15  import sys 
 16   
 17  #TODO - Remove this work around once we drop python 2.3 support 
 18  try: 
 19     set = set 
 20  except NameError: 
 21     from sets import Set as set 
 22   
 23  # biopython modules 
 24  from Bio import Alphabet 
 25  from Bio.Alphabet import IUPAC 
 26  from Bio.Seq import Seq 
 27  from Bio.SubsMat import FreqTable 
 28   
 29  # Expected random distributions for 20-letter protein, and 
 30  # for 4-letter nucleotide alphabets 
 31  Protein20Random = 0.05 
 32  Nucleotide4Random = 0.25 
33 -class SummaryInfo:
34 """Calculate summary info about the alignment. 35 36 This class should be used to caclculate information summarizing the 37 results of an alignment. This may either be straight consensus info 38 or more complicated things. 39 """
40 - def __init__(self, alignment):
41 """Initialize with the alignment to calculate information on. 42 ic_vector attribute. A dictionary. Keys: column numbers. Values: 43 """ 44 self.alignment = alignment 45 self.ic_vector = {}
46
47 - def dumb_consensus(self, threshold = .7, ambiguous = "X", 48 consensus_alpha = None, require_multiple = 0):
49 """Output a fast consensus sequence of the alignment. 50 51 This doesn't do anything fancy at all. It will just go through the 52 sequence residue by residue and count up the number of each type 53 of residue (ie. A or G or T or C for DNA) in all sequences in the 54 alignment. If the percentage of the most common residue type is 55 greater then the passed threshold, then we will add that residue type, 56 otherwise an ambiguous character will be added. 57 58 This could be made a lot fancier (ie. to take a substitution matrix 59 into account), but it just meant for a quick and dirty consensus. 60 61 Arguments: 62 o threshold - The threshold value that is required to add a particular 63 atom. 64 o ambiguous - The ambiguous character to be added when the threshold is 65 not reached. 66 o consensus_alpha - The alphabet to return for the consensus sequence. 67 If this is None, then we will try to guess the alphabet. 68 o require_multiple - If set as 1, this will require that more than 69 1 sequence be part of an alignment to put it in the consensus (ie. 70 not just 1 sequence and gaps). 71 """ 72 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 73 consensus = '' 74 75 # find the length of the consensus we are creating 76 con_len = self.alignment.get_alignment_length() 77 78 # go through each seq item 79 for n in range(con_len): 80 # keep track of the counts of the different atoms we get 81 atom_dict = {} 82 num_atoms = 0 83 84 for record in self.alignment._records: 85 # make sure we haven't run past the end of any sequences 86 # if they are of different lengths 87 if n < len(record.seq): 88 if record.seq[n] != '-' and record.seq[n] != '.': 89 if record.seq[n] not in atom_dict.keys(): 90 atom_dict[record.seq[n]] = 1 91 else: 92 atom_dict[record.seq[n]] = \ 93 atom_dict[record.seq[n]] + 1 94 95 num_atoms = num_atoms + 1 96 97 max_atoms = [] 98 max_size = 0 99 100 for atom in atom_dict.keys(): 101 if atom_dict[atom] > max_size: 102 max_atoms = [atom] 103 max_size = atom_dict[atom] 104 elif atom_dict[atom] == max_size: 105 max_atoms.append(atom) 106 107 if require_multiple and num_atoms == 1: 108 consensus = consensus + ambiguous 109 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms)) 110 >= threshold): 111 consensus = consensus + max_atoms[0] 112 else: 113 consensus = consensus + ambiguous 114 115 # we need to guess a consensus alphabet if one isn't specified 116 if consensus_alpha is None: 117 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 118 119 return Seq(consensus, consensus_alpha)
120
121 - def gap_consensus(self, threshold = .7, ambiguous = "X", 122 consensus_alpha = None, require_multiple = 0):
123 """Same as dumb_consensus(), but allows gap on the output. 124 125 Things to do: Let the user define that with only one gap, the result 126 character in consensus is gap. Let the user select gap character, now 127 it takes the same is input. 128 """ 129 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 130 consensus = '' 131 132 # find the length of the consensus we are creating 133 con_len = self.alignment.get_alignment_length() 134 135 # go through each seq item 136 for n in range(con_len): 137 # keep track of the counts of the different atoms we get 138 atom_dict = {} 139 num_atoms = 0 140 141 for record in self.alignment._records: 142 # make sure we haven't run past the end of any sequences 143 # if they are of different lengths 144 if n < len(record.seq): 145 if record.seq[n] not in atom_dict.keys(): 146 atom_dict[record.seq[n]] = 1 147 else: 148 atom_dict[record.seq[n]] = \ 149 atom_dict[record.seq[n]] + 1 150 151 num_atoms = num_atoms + 1 152 153 max_atoms = [] 154 max_size = 0 155 156 for atom in atom_dict.keys(): 157 if atom_dict[atom] > max_size: 158 max_atoms = [atom] 159 max_size = atom_dict[atom] 160 elif atom_dict[atom] == max_size: 161 max_atoms.append(atom) 162 163 if require_multiple and num_atoms == 1: 164 consensus = consensus + ambiguous 165 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms)) 166 >= threshold): 167 consensus = consensus + max_atoms[0] 168 else: 169 consensus = consensus + ambiguous 170 171 # we need to guess a consensus alphabet if one isn't specified 172 if consensus_alpha is None: 173 #TODO - Should we make this into a Gapped alphabet? 174 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 175 176 return Seq(consensus, consensus_alpha)
177
178 - def _guess_consensus_alphabet(self, ambiguous):
179 """Pick an (ungapped) alphabet for an alignment consesus sequence. 180 181 This just looks at the sequences we have, checks their type, and 182 returns as appropriate type which seems to make sense with the 183 sequences we've got. 184 """ 185 #Start with the (un-gapped version of) the alignment alphabet 186 a = Alphabet._get_base_alphabet(self.alignment._alphabet) 187 188 #Now check its compatible with all the rest of the sequences 189 for record in self.alignment : 190 #Get the (un-gapped version of) the sequence's alphabet 191 alt = Alphabet._get_base_alphabet(record.seq.alphabet) 192 if not isinstance(alt, a.__class__) : 193 raise ValueError \ 194 ("Alignment contains a sequence with an incompatible alphabet.") 195 196 #Check the ambiguous character we are going to use in the consensus 197 #is in the alphabet's list of valid letters (if defined). 198 if hasattr(a, "letters") and a.letters is not None \ 199 and ambiguous not in a.letters : 200 #We'll need to pick a more generic alphabet... 201 if isinstance(a, IUPAC.IUPACUnambiguousDNA) : 202 if ambiguous in IUPAC.IUPACUnambiguousDNA().letters : 203 a = IUPAC.IUPACUnambiguousDNA() 204 else : 205 a = Alphabet.generic_dna 206 elif isinstance(a, IUPAC.IUPACUnambiguousRNA) : 207 if ambiguous in IUPAC.IUPACUnambiguousRNA().letters : 208 a = IUPAC.IUPACUnambiguousRNA() 209 else : 210 a = Alphabet.generic_rna 211 elif isinstance(a, IUPAC.IUPACProtein) : 212 if ambiguous in IUPAC.ExtendedIUPACProtein().letters : 213 a = IUPAC.ExtendedIUPACProtein() 214 else : 215 a = Alphabet.generic_protein 216 else : 217 a = Alphabet.single_letter_alphabet 218 return a
219
220 - def replacement_dictionary(self, skip_chars = []):
221 """Generate a replacement dictionary to plug into a substitution matrix 222 223 This should look at an alignment, and be able to generate the number 224 of substitutions of different residues for each other in the 225 aligned object. 226 227 Will then return a dictionary with this information: 228 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....} 229 230 This also treats weighted sequences. The following example shows how 231 we calculate the replacement dictionary. Given the following 232 multiple sequence alignments: 233 234 GTATC 0.5 235 AT--C 0.8 236 CTGTC 1.0 237 238 For the first column we have: 239 ('A', 'G') : 0.5 * 0.8 = 0.4 240 ('C', 'G') : 0.5 * 1.0 = 0.5 241 ('A', 'C') : 0.8 * 1.0 = 0.8 242 243 We then continue this for all of the columns in the alignment, summing 244 the information for each substitution in each column, until we end 245 up with the replacement dictionary. 246 247 Arguments: 248 o skip_chars - A list of characters to skip when creating the dictionary. 249 For instance, you might have Xs (screened stuff) or Ns, and not want 250 to include the ambiguity characters in the dictionary. 251 """ 252 # get a starting dictionary based on the alphabet of the alignment 253 rep_dict, skip_items = self._get_base_replacements(skip_chars) 254 255 # iterate through each record 256 for rec_num1 in range(len(self.alignment._records)): 257 # iterate through each record from one beyond the current record 258 # to the end of the list of records 259 for rec_num2 in range(rec_num1 + 1, len(self.alignment._records)): 260 # for each pair of records, compare the sequences and add 261 # the pertinent info to the dictionary 262 rep_dict = self._pair_replacement( 263 self.alignment._records[rec_num1].seq, 264 self.alignment._records[rec_num2].seq, 265 self.alignment._records[rec_num1].annotations.get('weight',1), 266 self.alignment._records[rec_num2].annotations.get('weight',1), 267 rep_dict, skip_items) 268 269 return rep_dict
270
271 - def _pair_replacement(self, seq1, seq2, weight1, weight2, 272 start_dict, ignore_chars):
273 """Compare two sequences and generate info on the replacements seen. 274 275 Arguments: 276 o seq1, seq2 - The two sequences to compare. 277 o weight1, weight2 - The relative weights of seq1 and seq2. 278 o start_dict - The dictionary containing the starting replacement 279 info that we will modify. 280 o ignore_chars - A list of characters to ignore when calculating 281 replacements (ie. '-'). 282 283 Returns: 284 o A replacment dictionary which is modified from initial_dict with 285 the information from the sequence comparison. 286 """ 287 # loop through each residue in the sequences 288 for residue_num in range(len(seq1)): 289 residue1 = seq1[residue_num] 290 try: 291 residue2 = seq2[residue_num] 292 # if seq2 is shorter, then we just stop looking at replacements 293 # and return the information 294 except IndexError: 295 return start_dict 296 297 # if the two residues are characters we want to count 298 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars): 299 try: 300 # add info about the replacement to the dictionary, 301 # modified by the sequence weights 302 start_dict[(residue1, residue2)] = \ 303 start_dict[(residue1, residue2)] + \ 304 weight1 * weight2 305 306 # if we get a key error, then we've got a problem with alphabets 307 except KeyError: 308 raise ValueError("Residues %s, %s not found in alphabet %s" 309 % (residue1, residue2, 310 self.alignment._alphabet)) 311 312 return start_dict
313 314
315 - def _get_all_letters(self):
316 """Returns a string containing the expected letters in the alignment.""" 317 all_letters = self.alignment._alphabet.letters 318 if all_letters is None \ 319 or (isinstance(self.alignment._alphabet, Alphabet.Gapped) \ 320 and all_letters == self.alignment._alphabet.gap_char): 321 #We are dealing with a generic alphabet class where the 322 #letters are not defined! We must build a list of the 323 #letters used... 324 set_letters = set() 325 for record in self.alignment : 326 #Note the built in set does not have a union_update 327 #which was provided by the sets module's Set 328 set_letters = set_letters.union(record.seq) 329 list_letters = list(set_letters) 330 list_letters.sort() 331 all_letters = "".join(list_letters) 332 return all_letters
333
334 - def _get_base_replacements(self, skip_items = []):
335 """Get a zeroed dictonary of all possible letter combinations. 336 337 This looks at the type of alphabet and gets the letters for it. 338 It then creates a dictionary with all possible combinations of these 339 letters as keys (ie. ('A', 'G')) and sets the values as zero. 340 341 Returns: 342 o The base dictionary created 343 o A list of alphabet items to skip when filling the dictionary.Right 344 now the only thing I can imagine in this list is gap characters, but 345 maybe X's or something else might be useful later. This will also 346 include any characters that are specified to be skipped. 347 """ 348 base_dictionary = {} 349 all_letters = self._get_all_letters() 350 351 # if we have a gapped alphabet we need to find the gap character 352 # and drop it out 353 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 354 skip_items.append(self.alignment._alphabet.gap_char) 355 all_letters = string.replace(all_letters, 356 self.alignment._alphabet.gap_char, 357 '') 358 359 # now create the dictionary 360 for first_letter in all_letters: 361 for second_letter in all_letters: 362 if (first_letter not in skip_items and 363 second_letter not in skip_items): 364 base_dictionary[(first_letter, second_letter)] = 0 365 366 return base_dictionary, skip_items
367 368
369 - def pos_specific_score_matrix(self, axis_seq = None, 370 chars_to_ignore = []):
371 """Create a position specific score matrix object for the alignment. 372 373 This creates a position specific score matrix (pssm) which is an 374 alternative method to look at a consensus sequence. 375 376 Arguments: 377 o chars_to_ignore - A listing of all characters not to include in 378 the pssm. If the alignment alphabet declares a gap character, 379 then it will be excluded automatically. 380 o axis_seq - An optional argument specifying the sequence to 381 put on the axis of the PSSM. This should be a Seq object. If nothing 382 is specified, the consensus sequence, calculated with default 383 parameters, will be used. 384 385 Returns: 386 o A PSSM (position specific score matrix) object. 387 """ 388 # determine all of the letters we have to deal with 389 all_letters = self._get_all_letters() 390 assert all_letters 391 392 if not isinstance(chars_to_ignore, list) : 393 raise TypeError("chars_to_ignore should be a list.") 394 395 # if we have a gap char, add it to stuff to ignore 396 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 397 chars_to_ignore.append(self.alignment._alphabet.gap_char) 398 399 for char in chars_to_ignore: 400 all_letters = string.replace(all_letters, char, '') 401 402 if axis_seq: 403 left_seq = axis_seq 404 assert len(axis_seq) == self.alignment.get_alignment_length() 405 else: 406 left_seq = self.dumb_consensus() 407 408 pssm_info = [] 409 # now start looping through all of the sequences and getting info 410 for residue_num in range(len(left_seq)): 411 score_dict = self._get_base_letters(all_letters) 412 for record in self.alignment._records: 413 try: 414 this_residue = record.seq[residue_num] 415 # if we hit an index error we've run out of sequence and 416 # should not add new residues 417 except IndexError: 418 this_residue = None 419 420 if this_residue and this_residue not in chars_to_ignore: 421 weight = record.annotations.get('weight', 1) 422 try: 423 score_dict[this_residue] += weight 424 # if we get a KeyError then we have an alphabet problem 425 except KeyError: 426 raise ValueError("Residue %s not found in alphabet %s" 427 % (this_residue, 428 self.alignment._alphabet)) 429 430 pssm_info.append((left_seq[residue_num], 431 score_dict)) 432 433 434 return PSSM(pssm_info)
435
436 - def _get_base_letters(self, letters):
437 """Create a zeroed dictionary with all of the specified letters. 438 """ 439 base_info = {} 440 for letter in letters: 441 base_info[letter] = 0 442 443 return base_info
444
445 - def information_content(self, start = 0, 446 end = None, 447 e_freq_table = None, log_base = 2, 448 chars_to_ignore = []):
449 """Calculate the information content for each residue along an alignment. 450 451 Arguments: 452 o start, end - The starting an ending points to calculate the 453 information content. These points should be relative to the first 454 sequence in the alignment, starting at zero (ie. even if the 'real' 455 first position in the seq is 203 in the initial sequence, for 456 the info content, we need to use zero). This defaults to the entire 457 length of the first sequence. 458 o e_freq_table - A FreqTable object specifying the expected frequencies 459 for each letter in the alphabet we are using (e.g. {'G' : 0.4, 460 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be 461 included, since these should not have expected frequencies. 462 o log_base - The base of the logathrim to use in calculating the 463 information content. This defaults to 2 so the info is in bits. 464 o chars_to_ignore - A listing of characterw which should be ignored 465 in calculating the info content. 466 467 Returns: 468 o A number representing the info content for the specified region. 469 470 Please see the Biopython manual for more information on how information 471 content is calculated. 472 """ 473 # if no end was specified, then we default to the end of the sequence 474 if end is None: 475 end = len(self.alignment._records[0].seq) 476 477 if start < 0 or end > len(self.alignment._records[0].seq): 478 raise ValueError \ 479 ("Start (%s) and end (%s) are not in the range %s to %s" 480 % (start, end, 0, len(self.alignment._records[0].seq))) 481 # determine random expected frequencies, if necessary 482 random_expected = None 483 if not e_freq_table: 484 #TODO - What about ambiguous alphabets? 485 base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet) 486 if isinstance(base_alpha, Alphabet.ProteinAlphabet) : 487 random_expected = Protein20Random 488 elif isinstance(base_alpha, Alphabet.NucleotideAlphabet) : 489 random_expected = Nucleotide4Random 490 else : 491 errstr = "Error in alphabet: not Nucleotide or Protein, " 492 errstr += "supply expected frequencies" 493 raise ValueError(errstr) 494 del base_alpha 495 elif not isinstance(e_freq_table, FreqTable.FreqTable) : 496 raise ValueError("e_freq_table should be a FreqTable object") 497 498 499 # determine all of the letters we have to deal with 500 all_letters = self._get_all_letters() 501 for char in chars_to_ignore: 502 all_letters = string.replace(all_letters, char, '') 503 504 info_content = {} 505 for residue_num in range(start, end): 506 freq_dict = self._get_letter_freqs(residue_num, 507 self.alignment._records, 508 all_letters, chars_to_ignore) 509 # print freq_dict, 510 column_score = self._get_column_info_content(freq_dict, 511 e_freq_table, 512 log_base, 513 random_expected) 514 515 info_content[residue_num] = column_score 516 # sum up the score 517 total_info = 0 518 for column_info in info_content.values(): 519 total_info = total_info + column_info 520 # fill in the ic_vector member: holds IC for each column 521 for i in info_content.keys(): 522 self.ic_vector[i] = info_content[i] 523 return total_info
524
525 - def _get_letter_freqs(self, residue_num, all_records, letters, to_ignore):
526 """Determine the frequency of specific letters in the alignment. 527 528 Arguments: 529 o residue_num - The number of the column we are getting frequencies 530 from. 531 o all_records - All of the SeqRecords in the alignment. 532 o letters - The letters we are interested in getting the frequency 533 for. 534 o to_ignore - Letters we are specifically supposed to ignore. 535 536 This will calculate the frequencies of each of the specified letters 537 in the alignment at the given frequency, and return this as a 538 dictionary where the keys are the letters and the values are the 539 frequencies. 540 """ 541 freq_info = self._get_base_letters(letters) 542 543 total_count = 0 544 # collect the count info into the dictionary for all the records 545 for record in all_records: 546 try: 547 if record.seq[residue_num] not in to_ignore: 548 weight = record.annotations.get('weight',1) 549 freq_info[record.seq[residue_num]] += weight 550 total_count += weight 551 # getting a key error means we've got a problem with the alphabet 552 except KeyError: 553 raise ValueError("Residue %s not found in alphabet %s" 554 % (record.seq[residue_num], 555 self.alignment._alphabet)) 556 557 if total_count == 0 : 558 # This column must be entirely ignored characters 559 for letter in freq_info.keys(): 560 assert freq_info[letter] == 0 561 #TODO - Map this to NA or NaN? 562 else : 563 # now convert the counts into frequencies 564 for letter in freq_info.keys(): 565 freq_info[letter] = freq_info[letter] / total_count 566 567 return freq_info
568
569 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base, 570 random_expected):
571 """Calculate the information content for a column. 572 573 Arguments: 574 o obs_freq - The frequencies observed for each letter in the column. 575 o e_freq_table - An optional argument specifying the expected 576 frequencies for each letter. This is a SubsMat.FreqTable instance. 577 o log_base - The base of the logathrim to use in calculating the 578 info content. 579 """ 580 try : 581 gap_char = self.alignment._alphabet.gap_char 582 except AttributeError : 583 #The alphabet doesn't declare a gap - there could be none 584 #in the sequence... or just a vague alphabet. 585 gap_char = "-" #Safe? 586 587 if e_freq_table: 588 if not isinstance(e_freq_table, FreqTable.FreqTable) : 589 raise ValueError("e_freq_table should be a FreqTable object") 590 # check the expected freq information to make sure it is good 591 for key in obs_freq.keys(): 592 if (key != gap_char and key not in e_freq_table): 593 raise ValueError("Expected frequency letters %s" + 594 " do not match observed %s" 595 % (e_freq_table.keys(), obs_freq.keys() - 596 [gap_char])) 597 598 total_info = 0.0 599 600 for letter in obs_freq.keys(): 601 inner_log = 0.0 602 # if we have expected frequencies, modify the log value by them 603 # gap characters do not have expected frequencies, so they 604 # should just be the observed frequency. 605 if letter != gap_char: 606 if e_freq_table: 607 inner_log = obs_freq[letter] / e_freq_table[letter] 608 else: 609 inner_log = obs_freq[letter] / random_expected 610 # if the observed frequency is zero, we don't add any info to the 611 # total information content 612 if inner_log > 0: 613 letter_info = (obs_freq[letter] * 614 math.log(inner_log) / math.log(log_base)) 615 total_info = total_info + letter_info 616 return total_info
617
618 - def get_column(self,col):
619 return self.alignment.get_column(col)
620
621 -class PSSM:
622 """Represent a position specific score matrix. 623 624 This class is meant to make it easy to access the info within a PSSM 625 and also make it easy to print out the information in a nice table. 626 627 Let's say you had an alignment like this: 628 GTATC 629 AT--C 630 CTGTC 631 632 The position specific score matrix (when printed) looks like: 633 634 G A T C 635 G 1 1 0 1 636 T 0 0 3 0 637 A 1 1 0 0 638 T 0 0 2 0 639 C 0 0 0 3 640 641 You can access a single element of the PSSM using the following: 642 643 your_pssm[sequence_number][residue_count_name] 644 645 For instance, to get the 'T' residue for the second element in the 646 above alignment you would need to do: 647 648 your_pssm[1]['T'] 649 """
650 - def __init__(self, pssm):
651 """Initialize with pssm data to represent. 652 653 The pssm passed should be a list with the following structure: 654 655 list[0] - The letter of the residue being represented (for instance, 656 from the example above, the first few list[0]s would be GTAT... 657 list[1] - A dictionary with the letter substitutions and counts. 658 """ 659 self.pssm = pssm
660
661 - def __getitem__(self, pos):
662 return self.pssm[pos][1]
663
664 - def __str__(self):
665 out = " " 666 all_residues = self.pssm[0][1].keys() 667 all_residues.sort() 668 669 # first print out the top header 670 for res in all_residues: 671 out = out + " %s" % res 672 out = out + "\n" 673 674 # for each item, write out the substitutions 675 for item in self.pssm: 676 out = out + "%s " % item[0] 677 for res in all_residues: 678 out = out + " %.1f" % item[1][res] 679 680 out = out + "\n" 681 return out
682
683 - def get_residue(self, pos):
684 """Return the residue letter at the specified position. 685 """ 686 return self.pssm[pos][0]
687 688 701 702 if __name__ == "__main__" : 703 print "Quick test" 704 from Bio import AlignIO 705 from Bio.Align.Generic import Alignment 706 707 filename = "../../Tests/GFF/multi.fna" 708 format = "fasta" 709 expected = FreqTable.FreqTable({"A":0.25,"G":0.25,"T":0.25,"C":0.25}, 710 FreqTable.FREQ, 711 IUPAC.unambiguous_dna) 712 713 alignment = AlignIO.read(open(filename), format) 714 for record in alignment : 715 print record.seq.tostring() 716 print "="*alignment.get_alignment_length() 717 718 summary = SummaryInfo(alignment) 719 consensus = summary.dumb_consensus(ambiguous="N") 720 print consensus 721 consensus = summary.gap_consensus(ambiguous="N") 722 print consensus 723 print 724 print summary.pos_specific_score_matrix(chars_to_ignore=['-'], 725 axis_seq=consensus) 726 print 727 #Have a generic alphabet, without a declared gap char, so must tell 728 #provide the frequencies and chars to ignore explicitly. 729 print summary.information_content(e_freq_table=expected, 730 chars_to_ignore=['-']) 731 print 732 print "Trying a protein sequence with gaps and stops" 733 734 alpha = Alphabet.HasStopCodon(Alphabet.Gapped(Alphabet.generic_protein, "-"), "*") 735 a = Alignment(alpha) 736 a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-") 737 a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*") 738 a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*") 739 print a 740 print "="*a.get_alignment_length() 741 742 s = SummaryInfo(a) 743 c = s.dumb_consensus(ambiguous="X") 744 print c 745 c = s.gap_consensus(ambiguous="X") 746 print c 747 print 748 print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c) 749 750 print s.information_content(chars_to_ignore=['-', '*']) 751 752 753 print "Done" 754