Package Bio :: Package NeuralNetwork :: Package Gene :: Module Schema
[hide private]
[frames] | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Schema

  1  """Deal with Motifs or Signatures allowing ambiguity in the sequences. 
  2   
  3  This class contains Schema which deal with Motifs and Signatures at 
  4  a higher level, by introducing `don't care` (ambiguity) symbols into 
  5  the sequences. For instance, you could combine the following Motifs: 
  6   
  7  'GATC', 'GATG', 'GATG', 'GATT' 
  8   
  9  as all falling under a schema like 'GAT*', where the star indicates a 
 10  character can be anything. This helps us condense a whole ton of 
 11  motifs or signatures. 
 12  """ 
 13  # standard modules 
 14  import random 
 15  import string 
 16  import re 
 17   
 18  # biopython 
 19  from Bio import Alphabet 
 20  from Bio.Seq import MutableSeq 
 21   
 22  # neural network libraries 
 23  from Pattern import PatternRepository 
 24   
 25  # genetic algorithm libraries 
 26  from Bio.GA import Organism 
 27  from Bio.GA.Evolver import GenerationEvolver 
 28  from Bio.GA.Mutation.Simple import SinglePositionMutation 
 29  from Bio.GA.Crossover.Point import SinglePointCrossover 
 30  from Bio.GA.Repair.Stabilizing import AmbiguousRepair 
 31  from Bio.GA.Selection.Tournament import TournamentSelection 
 32  from Bio.GA.Selection.Diversity import DiversitySelection 
 33   
34 -class Schema:
35 """Deal with motifs that have ambiguity characters in it. 36 37 This motif class allows specific ambiguity characters and tries to 38 speed up finding motifs using regular expressions. 39 40 This is likely to be a replacement for the Schema representation, 41 since it allows multiple ambiguity characters to be used. 42 """
43 - def __init__(self, ambiguity_info):
44 """Initialize with ambiguity information. 45 46 Arguments: 47 48 o ambiguity_info - A dictionary which maps letters in the motifs to 49 the ambiguous characters which they might represent. For example, 50 {'R' : 'AG'} specifies that Rs in the motif can match a A or a G. 51 All letters in the motif must be represented in the ambiguity_info 52 dictionary. 53 """ 54 self._ambiguity_info = ambiguity_info 55 56 # a cache of all encoded motifs 57 self._motif_cache = {}
58
59 - def encode_motif(self, motif):
60 """Encode the passed motif as a regular expression pattern object. 61 62 Arguments: 63 64 o motif - The motif we want to encode. This should be a string. 65 66 Returns: 67 A compiled regular expression pattern object that can be used 68 for searching strings. 69 """ 70 regexp_string = "" 71 72 for motif_letter in motif: 73 try: 74 letter_matches = self._ambiguity_info[motif_letter] 75 except KeyError: 76 raise KeyError("No match information for letter %s" 77 % motif_letter) 78 79 if len(letter_matches) > 1: 80 regexp_match = "[" + letter_matches + "]" 81 elif len(letter_matches) == 1: 82 regexp_match = letter_matches 83 else: 84 raise ValueError("Unexpected match information %s" 85 % letter_matches) 86 87 regexp_string += regexp_match 88 89 return re.compile(regexp_string)
90
91 - def find_ambiguous(self, motif):
92 """Return the location of ambiguous items in the motif. 93 94 This just checks through the motif and compares each letter 95 against the ambiguity information. If a letter stands for multiple 96 items, it is ambiguous. 97 """ 98 ambig_positions = [] 99 for motif_letter_pos in range(len(motif)): 100 motif_letter = motif[motif_letter_pos] 101 try: 102 letter_matches = self._ambiguity_info[motif_letter] 103 except KeyError: 104 raise KeyError("No match information for letter %s" 105 % motif_letter) 106 107 if len(letter_matches) > 1: 108 ambig_positions.append(motif_letter_pos) 109 110 return ambig_positions
111
112 - def num_ambiguous(self, motif):
113 """Return the number of ambiguous letters in a given motif. 114 """ 115 ambig_positions = self.find_ambiguous(motif) 116 return len(ambig_positions)
117
118 - def find_matches(self, motif, query):
119 """Return all non-overlapping motif matches in the query string. 120 121 This utilizes the regular expression findall function, and will 122 return a list of all non-overlapping occurances in query that 123 match the ambiguous motif. 124 """ 125 try: 126 motif_pattern = self._motif_cache[motif] 127 except KeyError: 128 motif_pattern = self.encode_motif(motif) 129 self._motif_cache[motif] = motif_pattern 130 131 return motif_pattern.findall(query)
132
133 - def num_matches(self, motif, query):
134 """Find the number of non-overlapping times motif occurs in query. 135 """ 136 all_matches = self.find_matches(motif, query) 137 return len(all_matches)
138
139 - def all_unambiguous(self):
140 """Return a listing of all unambiguous letters allowed in motifs. 141 """ 142 all_letters = self._ambiguity_info.keys() 143 all_letters.sort() 144 145 unambig_letters = [] 146 147 for letter in all_letters: 148 possible_matches = self._ambiguity_info[letter] 149 if len(possible_matches) == 1: 150 unambig_letters.append(letter) 151 152 return unambig_letters
153 154 # --- helper classes and functions for the default SchemaFinder 155 156 # -- Alphabets 157
158 -class SchemaDNAAlphabet(Alphabet.Alphabet):
159 """Alphabet of a simple Schema for DNA sequences. 160 161 This defines a simple alphabet for DNA sequences that has a single 162 character which can match any other character. 163 164 o G,A,T,C - The standard unambiguous DNA alphabet. 165 166 o * - Any letter 167 """ 168 letters = ["G", "A", "T", "C", "*"] 169 170 alphabet_matches = {"G" : "G", 171 "A" : "A", 172 "T" : "T", 173 "C" : "C", 174 "*" : "GATC"}
175 176 # -- GA schema finder 177
178 -class GeneticAlgorithmFinder:
179 """Find schemas using a genetic algorithm approach. 180 181 This approach to finding schema uses Genetic Algorithms to evolve 182 a set of schema and find the best schema for a specific set of 183 records. 184 185 The 'default' finder searches for ambiguous DNA elements. This 186 can be overridden easily by creating a GeneticAlgorithmFinder 187 with a different alphabet. 188 """
189 - def __init__(self, alphabet = SchemaDNAAlphabet()):
190 """Initialize a finder to get schemas using Genetic Algorithms. 191 192 Arguments: 193 194 o alphabet -- The alphabet which specifies the contents of the 195 schemas we'll be generating. This alphabet must contain the 196 attribute 'alphabet_matches', which is a dictionary specifying 197 the potential ambiguities of each letter in the alphabet. These 198 ambiguities will be used in building up the schema. 199 """ 200 self.alphabet = alphabet 201 202 self.initial_population = 500 203 self.min_generations = 10 204 205 self._set_up_genetic_algorithm()
206
208 """Overrideable function to set up the genetic algorithm parameters. 209 210 This functions sole job is to set up the different genetic 211 algorithm functionality. Since this can be quite complicated, this 212 allows cusotmizablity of all of the parameters. If you want to 213 customize specially, you can inherit from this class and override 214 this function. 215 """ 216 self.motif_generator = RandomMotifGenerator(self.alphabet) 217 218 self.mutator = SinglePositionMutation(mutation_rate = 0.1) 219 self.crossover = SinglePointCrossover(crossover_prob = 0.25) 220 self.repair = AmbiguousRepair(Schema(self.alphabet.alphabet_matches), 221 4) 222 self.base_selector = TournamentSelection(self.mutator, self.crossover, 223 self.repair, 2) 224 self.selector = DiversitySelection(self.base_selector, 225 self.motif_generator.random_motif)
226
227 - def find_schemas(self, fitness, num_schemas):
228 """Find the given number of unique schemas using a genetic algorithm 229 230 Arguments: 231 232 o fitness - A callable object (ie. function) which will evaluate 233 the fitness of a motif. 234 235 o num_schemas - The number of unique schemas with good fitness 236 that we want to generate. 237 """ 238 start_population = \ 239 Organism.function_population(self.motif_generator.random_motif, 240 self.initial_population, 241 fitness) 242 finisher = SimpleFinisher(num_schemas, self.min_generations) 243 244 # set up the evolver and do the evolution 245 evolver = GenerationEvolver(start_population, self.selector) 246 evolved_pop = evolver.evolve(finisher.is_finished) 247 248 # convert the evolved population into a PatternRepository 249 schema_info = {} 250 for org in evolved_pop: 251 # convert the Genome from a MutableSeq to a Seq so that 252 # the schemas are just strings (and not array("c")s) 253 seq_genome = org.genome.toseq() 254 schema_info[seq_genome.data] = org.fitness 255 256 return PatternRepository(schema_info)
257 258 # -- fitness classes 259
260 -class DifferentialSchemaFitness:
261 """Calculate fitness for schemas that differentiate between sequences. 262 """
263 - def __init__(self, positive_seqs, negative_seqs, schema_evaluator):
264 """Initialize with different sequences to evaluate 265 266 Arguments: 267 268 o positive_seq - A list of SeqRecord objects which are the 'positive' 269 sequences -- the ones we want to select for. 270 271 o negative_seq - A list of SeqRecord objects which are the 'negative' 272 sequences that we want to avoid selecting. 273 274 o schema_evaluator - An Schema class which can be used to 275 evaluate find motif matches in sequences. 276 """ 277 self._pos_seqs = positive_seqs 278 self._neg_seqs = negative_seqs 279 self._schema_eval = schema_evaluator
280
281 - def calculate_fitness(self, genome):
282 """Calculate the fitness for a given schema. 283 284 Fitness is specified by the number of occurances of the schema in 285 the positive sequences minus the number of occurances in the 286 negative examples. 287 288 This fitness is then modified by multiplying by the length of the 289 schema and then dividing by the number of ambiguous characters in 290 the schema. This helps select for schema which are longer and have 291 less redundancy. 292 """ 293 # convert the genome into a string 294 seq_motif = genome.toseq() 295 motif = seq_motif.data 296 297 # get the counts in the positive examples 298 num_pos = 0 299 for seq_record in self._pos_seqs: 300 cur_counts = self._schema_eval.num_matches(motif, 301 seq_record.seq.data) 302 num_pos += cur_counts 303 304 # get the counts in the negative examples 305 num_neg = 0 306 for seq_record in self._neg_seqs: 307 cur_counts = self._schema_eval.num_matches(motif, 308 seq_record.seq.data) 309 310 num_neg += cur_counts 311 312 num_ambiguous = self._schema_eval.num_ambiguous(motif) 313 # weight the ambiguous stuff more highly 314 num_ambiguous = pow(2.0, num_ambiguous) 315 # increment num ambiguous to prevent division by zero errors. 316 num_ambiguous += 1 317 318 motif_size = len(motif) 319 motif_size = motif_size * 4.0 320 321 discerning_power = num_pos - num_neg 322 323 diff = (discerning_power * motif_size) / float(num_ambiguous) 324 return diff
325
326 -class MostCountSchemaFitness:
327 """Calculate a fitness giving weight to schemas that match many times. 328 329 This fitness function tries to maximize schemas which are found many 330 times in a group of sequences. 331 """
332 - def __init__(self, seq_records, schema_evaluator):
333 """Initialize with sequences to evaluate. 334 335 Arguments: 336 337 o seq_records -- A set of SeqRecord objects which we use to 338 calculate the fitness. 339 340 o schema_evaluator - An Schema class which can be used to 341 evaluate find motif matches in sequences. 342 """ 343 self._records = seq_records 344 self._evaluator = schema_evaluator
345
346 - def calculate_fitness(self, genome):
347 """Calculate the fitness of a genome based on schema matches. 348 349 This bases the fitness of a genome completely on the number of times 350 it matches in the set of seq_records. Matching more times gives a 351 better fitness 352 """ 353 # convert the genome into a string 354 seq_motif = genome.toseq() 355 motif = seq_motif.data 356 357 # find the number of times the genome matches 358 num_times = 0 359 for seq_record in self._records: 360 cur_counts = self._evaluator.num_matches(motif, 361 seq_record.seq.data) 362 num_times += cur_counts 363 364 return num_times
365 366 # -- Helper classes
367 -class RandomMotifGenerator:
368 """Generate a random motif within given parameters. 369 """
370 - def __init__(self, alphabet, min_size = 12, max_size = 17):
371 """Initialize with the motif parameters. 372 373 Arguments: 374 375 o alphabet - An alphabet specifying what letters can be inserted in 376 a motif. 377 378 o min_size, max_size - Specify the range of sizes for motifs. 379 """ 380 self._alphabet = alphabet 381 self._min_size = min_size 382 self._max_size = max_size
383
384 - def random_motif(self):
385 """Create a random motif within the given parameters. 386 387 This returns a single motif string with letters from the given 388 alphabet. The size of the motif will be randomly chosen between 389 max_size and min_size. 390 """ 391 motif_size = random.randrange(self._min_size, self._max_size) 392 393 motif = "" 394 for letter_num in range(motif_size): 395 cur_letter = random.choice(self._alphabet.letters) 396 motif += cur_letter 397 398 return MutableSeq(motif, self._alphabet)
399
400 -class SimpleFinisher:
401 """Determine when we are done evolving motifs. 402 403 This takes the very simple approach of halting evolution when the 404 GA has proceeded for a specified number of generations and has 405 a given number of unique schema with positive fitness. 406 """
407 - def __init__(self, num_schemas, min_generations = 100):
408 """Initialize the finisher with its parameters. 409 410 Arguments: 411 412 o num_schemas -- the number of useful (positive fitness) schemas 413 we want to generation 414 415 o min_generations -- The minimum number of generations to allow 416 the GA to proceed. 417 """ 418 self.num_generations = 0 419 420 self.num_schemas = num_schemas 421 self.min_generations = min_generations
422
423 - def is_finished(self, organisms):
424 """Determine when we can stop evolving the population. 425 """ 426 self.num_generations += 1 427 # print "generation %s" % self.num_generations 428 429 if self.num_generations >= self.min_generations: 430 all_seqs = [] 431 for org in organisms: 432 if org.fitness > 0: 433 if org.genome not in all_seqs: 434 all_seqs.append(org.genome) 435 436 if len(all_seqs) >= self.num_schemas: 437 return 1 438 439 return 0
440 # --- 441
442 -class SchemaFinder:
443 """Find schema in a set of sequences using a genetic algorithm approach. 444 445 Finding good schemas is very difficult because it takes forever to 446 enumerate all of the potential schemas. This finder using a genetic 447 algorithm approach to evolve good schema which match many times in 448 a set of sequences. 449 450 The default implementation of the finder is ready to find schemas 451 in a set of DNA sequences, but the finder can be customized to deal 452 with any type of data. 453 """
454 - def __init__(self, num_schemas = 100, 455 schema_finder = GeneticAlgorithmFinder()):
456 self.num_schemas = num_schemas 457 self._finder = schema_finder 458 459 self.evaluator = Schema(self._finder.alphabet.alphabet_matches)
460
461 - def find(self, seq_records):
462 """Find well-represented schemas in the given set of SeqRecords. 463 """ 464 fitness_evaluator = MostCountSchemaFitness(seq_records, 465 self.evaluator) 466 467 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 468 self.num_schemas)
469
470 - def find_differences(self, first_records, second_records):
471 """Find schemas which differentiate between the two sets of SeqRecords. 472 """ 473 fitness_evaluator = DifferentialSchemaFitness(first_records, 474 second_records, 475 self.evaluator) 476 477 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 478 self.num_schemas)
479
480 -class SchemaCoder:
481 """Convert a sequence into a representation of ambiguous motifs (schemas). 482 483 This takes a sequence, and returns the number of times specified 484 motifs are found in the sequence. This lets you represent a sequence 485 as just a count of (possibly ambiguous) motifs. 486 """
487 - def __init__(self, schemas, ambiguous_converter):
488 """Initialize the coder to convert sequences 489 490 Arguments: 491 492 o schema - A list of all of the schemas we want to search for 493 in input sequences. 494 495 o ambiguous_converter - An Schema class which can be 496 used to convert motifs into regular expressions for searching. 497 """ 498 self._schemas = schemas 499 self._converter = ambiguous_converter
500
501 - def representation(self, sequence):
502 """Represent the given input sequence as a bunch of motif counts. 503 504 Arguments: 505 506 o sequence - A Bio.Seq object we are going to represent as schemas. 507 508 This takes the sequence, searches for the motifs within it, and then 509 returns counts specifying the relative number of times each motifs 510 was found. The frequencies are in the order the original motifs were 511 passed into the initializer. 512 """ 513 schema_counts = [] 514 515 for schema in self._schemas: 516 num_counts = self._converter.num_matches(schema, sequence.data) 517 schema_counts.append(num_counts) 518 519 # normalize the counts to go between zero and one 520 min_count = 0 521 max_count = max(schema_counts) 522 523 # only normalize if we've actually found something, otherwise 524 # we'll just return 0 for everything 525 if max_count > 0: 526 for count_num in range(len(schema_counts)): 527 schema_counts[count_num] = (float(schema_counts[count_num]) - 528 float(min_count)) / float(max_count) 529 530 return schema_counts
531
532 -def matches_schema(pattern, schema, ambiguity_character = '*'):
533 """Determine whether or not the given pattern matches the schema. 534 535 Arguments: 536 537 o pattern - A string representing the pattern we want to check for 538 matching. This pattern can contain ambiguity characters (which are 539 assumed to be the same as those in the schema). 540 541 o schema - A string schema with ambiguity characters. 542 543 o ambiguity_character - The character used for ambiguity in the schema. 544 """ 545 if len(pattern) != len(schema): 546 return 0 547 548 # check each position, and return a non match if the schema and pattern 549 # are non ambiguous and don't match 550 for pos in range(len(pattern)): 551 if (schema[pos] != ambiguity_character and 552 pattern[pos] != ambiguity_character and 553 pattern[pos] != schema[pos]): 554 555 return 0 556 557 return 1
558
559 -class SchemaFactory:
560 """Generate Schema from inputs of Motifs or Signatures. 561 """
562 - def __init__(self, ambiguity_symbol = '*'):
563 """Initialize the SchemaFactory 564 565 Arguments: 566 567 o ambiguity_symbol -- The symbol to use when specifying that 568 a position is arbitrary. 569 """ 570 self._ambiguity_symbol = ambiguity_symbol
571
572 - def from_motifs(self, motif_repository, motif_percent, num_ambiguous):
573 """Generate schema from a list of motifs. 574 575 Arguments: 576 577 o motif_repository - A MotifRepository class that has all of the 578 motifs we want to convert to Schema. 579 580 o motif_percent - The percentage of motifs in the motif bank which 581 should be matches. We'll try to create schema that match this 582 percentage of motifs. 583 584 o num_ambiguous - The number of ambiguous characters to include 585 in each schema. The positions of these ambiguous characters will 586 be randomly selected. 587 """ 588 # get all of the motifs we can deal with 589 all_motifs = motif_repository.get_top_percentage(motif_percent) 590 591 # start building up schemas 592 schema_info = {} 593 # continue until we've built schema matching the desired percentage 594 # of motifs 595 total_count = self._get_num_motifs(motif_repository, all_motifs) 596 matched_count = 0 597 assert total_count > 0, "Expected to have motifs to match" 598 while (float(matched_count) / float(total_count)) < motif_percent: 599 600 new_schema, matching_motifs = \ 601 self._get_unique_schema(schema_info.keys(), 602 all_motifs, num_ambiguous) 603 604 # get the number of counts for the new schema and clean up 605 # the motif list 606 schema_counts = 0 607 for motif in matching_motifs: 608 # get the counts for the motif 609 schema_counts += motif_repository.count(motif) 610 611 # remove the motif from the motif list since it is already 612 # represented by this schema 613 all_motifs.remove(motif) 614 615 616 # all the schema info 617 schema_info[new_schema] = schema_counts 618 619 matched_count += schema_counts 620 621 # print "percentage:", float(matched_count) / float(total_count) 622 623 return PatternRepository(schema_info)
624
625 - def _get_num_motifs(self, repository, motif_list):
626 """Return the number of motif counts for the list of motifs. 627 """ 628 motif_count = 0 629 for motif in motif_list: 630 motif_count += repository.count(motif) 631 632 return motif_count
633
634 - def _get_unique_schema(self, cur_schemas, motif_list, num_ambiguous):
635 """Retrieve a unique schema from a motif. 636 637 We don't want to end up with schema that match the same thing, 638 since this could lead to ambiguous results, and be messy. This 639 tries to create schema, and checks that they do not match any 640 currently existing schema. 641 """ 642 # create a schema starting with a random motif 643 # we'll keep doing this until we get a completely new schema that 644 # doesn't match any old schema 645 num_tries = 0 646 647 while 1: 648 # pick a motif to work from and make a schema from it 649 cur_motif = random.choice(motif_list) 650 651 num_tries += 1 652 653 new_schema, matching_motifs = \ 654 self._schema_from_motif(cur_motif, motif_list, 655 num_ambiguous) 656 657 has_match = 0 658 for old_schema in cur_schemas: 659 if matches_schema(new_schema, old_schema, 660 self._ambiguity_symbol): 661 has_match = 1 662 663 # if the schema doesn't match any other schema we've got 664 # a good one 665 if not(has_match): 666 break 667 668 # check for big loops in which we can't find a new schema 669 assert num_tries < 150, \ 670 "Could not generate schema in %s tries from %s with %s" \ 671 % (num_tries, motif_list, cur_schemas) 672 673 return new_schema, matching_motifs
674
675 - def _schema_from_motif(self, motif, motif_list, num_ambiguous):
676 """Create a schema from a given starting motif. 677 678 Arguments: 679 680 o motif - A motif with the pattern we will start from. 681 682 o motif_list - The total motifs we have.to match to. 683 684 o num_ambiguous - The number of ambiguous characters that should 685 be present in the schema. 686 687 Returns: 688 689 o A string representing the newly generated schema. 690 691 o A list of all of the motifs in motif_list that match the schema. 692 """ 693 assert motif in motif_list, \ 694 "Expected starting motif present in remaining motifs." 695 696 # convert random positions in the motif to ambiguous characters 697 # convert the motif into a list of characters so we can manipulate it 698 new_schema_list = list(motif) 699 for add_ambiguous in range(num_ambiguous): 700 # add an ambiguous position in a new place in the motif 701 while 1: 702 ambig_pos = random.choice(range(len(new_schema_list))) 703 704 # only add a position if it isn't already ambiguous 705 # otherwise, we'll try again 706 if new_schema_list[ambig_pos] != self._ambiguity_symbol: 707 new_schema_list[ambig_pos] = self._ambiguity_symbol 708 break 709 710 # convert the schema back to a string 711 new_schema = string.join(new_schema_list, '') 712 713 # get the motifs that the schema matches 714 matched_motifs = [] 715 for motif in motif_list: 716 if matches_schema(motif, new_schema, self._ambiguity_symbol): 717 matched_motifs.append(motif) 718 719 return new_schema, matched_motifs
720
721 - def from_signatures(self, signature_repository, num_ambiguous):
722 raise NotImplementedError("Still need to code this.")
723