1 """Deal with Motifs or Signatures allowing ambiguity in the sequences.
2
3 This class contains Schema which deal with Motifs and Signatures at
4 a higher level, by introducing `don't care` (ambiguity) symbols into
5 the sequences. For instance, you could combine the following Motifs:
6
7 'GATC', 'GATG', 'GATG', 'GATT'
8
9 as all falling under a schema like 'GAT*', where the star indicates a
10 character can be anything. This helps us condense a whole ton of
11 motifs or signatures.
12 """
13
14 import random
15 import re
16
17
18 from Bio import Alphabet
19 from Bio.Seq import MutableSeq
20
21
22 from Pattern import PatternRepository
23
24
25 from Bio.GA import Organism
26 from Bio.GA.Evolver import GenerationEvolver
27 from Bio.GA.Mutation.Simple import SinglePositionMutation
28 from Bio.GA.Crossover.Point import SinglePointCrossover
29 from Bio.GA.Repair.Stabilizing import AmbiguousRepair
30 from Bio.GA.Selection.Tournament import TournamentSelection
31 from Bio.GA.Selection.Diversity import DiversitySelection
32
34 """Deal with motifs that have ambiguity characters in it.
35
36 This motif class allows specific ambiguity characters and tries to
37 speed up finding motifs using regular expressions.
38
39 This is likely to be a replacement for the Schema representation,
40 since it allows multiple ambiguity characters to be used.
41 """
43 """Initialize with ambiguity information.
44
45 Arguments:
46
47 o ambiguity_info - A dictionary which maps letters in the motifs to
48 the ambiguous characters which they might represent. For example,
49 {'R' : 'AG'} specifies that Rs in the motif can match a A or a G.
50 All letters in the motif must be represented in the ambiguity_info
51 dictionary.
52 """
53 self._ambiguity_info = ambiguity_info
54
55
56 self._motif_cache = {}
57
59 """Encode the passed motif as a regular expression pattern object.
60
61 Arguments:
62
63 o motif - The motif we want to encode. This should be a string.
64
65 Returns:
66 A compiled regular expression pattern object that can be used
67 for searching strings.
68 """
69 regexp_string = ""
70
71 for motif_letter in motif:
72 try:
73 letter_matches = self._ambiguity_info[motif_letter]
74 except KeyError:
75 raise KeyError("No match information for letter %s"
76 % motif_letter)
77
78 if len(letter_matches) > 1:
79 regexp_match = "[" + letter_matches + "]"
80 elif len(letter_matches) == 1:
81 regexp_match = letter_matches
82 else:
83 raise ValueError("Unexpected match information %s"
84 % letter_matches)
85
86 regexp_string += regexp_match
87
88 return re.compile(regexp_string)
89
91 """Return the location of ambiguous items in the motif.
92
93 This just checks through the motif and compares each letter
94 against the ambiguity information. If a letter stands for multiple
95 items, it is ambiguous.
96 """
97 ambig_positions = []
98 for motif_letter_pos in range(len(motif)):
99 motif_letter = motif[motif_letter_pos]
100 try:
101 letter_matches = self._ambiguity_info[motif_letter]
102 except KeyError:
103 raise KeyError("No match information for letter %s"
104 % motif_letter)
105
106 if len(letter_matches) > 1:
107 ambig_positions.append(motif_letter_pos)
108
109 return ambig_positions
110
112 """Return the number of ambiguous letters in a given motif.
113 """
114 ambig_positions = self.find_ambiguous(motif)
115 return len(ambig_positions)
116
118 """Return all non-overlapping motif matches in the query string.
119
120 This utilizes the regular expression findall function, and will
121 return a list of all non-overlapping occurances in query that
122 match the ambiguous motif.
123 """
124 try:
125 motif_pattern = self._motif_cache[motif]
126 except KeyError:
127 motif_pattern = self.encode_motif(motif)
128 self._motif_cache[motif] = motif_pattern
129
130 return motif_pattern.findall(query)
131
133 """Find the number of non-overlapping times motif occurs in query.
134 """
135 all_matches = self.find_matches(motif, query)
136 return len(all_matches)
137
139 """Return a listing of all unambiguous letters allowed in motifs.
140 """
141 all_letters = sorted(self._ambiguity_info)
142 unambig_letters = []
143
144 for letter in all_letters:
145 possible_matches = self._ambiguity_info[letter]
146 if len(possible_matches) == 1:
147 unambig_letters.append(letter)
148
149 return unambig_letters
150
151
152
153
154
156 """Alphabet of a simple Schema for DNA sequences.
157
158 This defines a simple alphabet for DNA sequences that has a single
159 character which can match any other character.
160
161 o G,A,T,C - The standard unambiguous DNA alphabet.
162
163 o * - Any letter
164 """
165 letters = ["G", "A", "T", "C", "*"]
166
167 alphabet_matches = {"G" : "G",
168 "A" : "A",
169 "T" : "T",
170 "C" : "C",
171 "*" : "GATC"}
172
173
174
176 """Find schemas using a genetic algorithm approach.
177
178 This approach to finding schema uses Genetic Algorithms to evolve
179 a set of schema and find the best schema for a specific set of
180 records.
181
182 The 'default' finder searches for ambiguous DNA elements. This
183 can be overridden easily by creating a GeneticAlgorithmFinder
184 with a different alphabet.
185 """
187 """Initialize a finder to get schemas using Genetic Algorithms.
188
189 Arguments:
190
191 o alphabet -- The alphabet which specifies the contents of the
192 schemas we'll be generating. This alphabet must contain the
193 attribute 'alphabet_matches', which is a dictionary specifying
194 the potential ambiguities of each letter in the alphabet. These
195 ambiguities will be used in building up the schema.
196 """
197 self.alphabet = alphabet
198
199 self.initial_population = 500
200 self.min_generations = 10
201
202 self._set_up_genetic_algorithm()
203
223
225 """Find the given number of unique schemas using a genetic algorithm
226
227 Arguments:
228
229 o fitness - A callable object (ie. function) which will evaluate
230 the fitness of a motif.
231
232 o num_schemas - The number of unique schemas with good fitness
233 that we want to generate.
234 """
235 start_population = \
236 Organism.function_population(self.motif_generator.random_motif,
237 self.initial_population,
238 fitness)
239 finisher = SimpleFinisher(num_schemas, self.min_generations)
240
241
242 evolver = GenerationEvolver(start_population, self.selector)
243 evolved_pop = evolver.evolve(finisher.is_finished)
244
245
246 schema_info = {}
247 for org in evolved_pop:
248
249
250 seq_genome = org.genome.toseq()
251 schema_info[seq_genome.tostring()] = org.fitness
252
253 return PatternRepository(schema_info)
254
255
256
258 """Calculate fitness for schemas that differentiate between sequences.
259 """
260 - def __init__(self, positive_seqs, negative_seqs, schema_evaluator):
261 """Initialize with different sequences to evaluate
262
263 Arguments:
264
265 o positive_seq - A list of SeqRecord objects which are the 'positive'
266 sequences -- the ones we want to select for.
267
268 o negative_seq - A list of SeqRecord objects which are the 'negative'
269 sequences that we want to avoid selecting.
270
271 o schema_evaluator - An Schema class which can be used to
272 evaluate find motif matches in sequences.
273 """
274 self._pos_seqs = positive_seqs
275 self._neg_seqs = negative_seqs
276 self._schema_eval = schema_evaluator
277
279 """Calculate the fitness for a given schema.
280
281 Fitness is specified by the number of occurances of the schema in
282 the positive sequences minus the number of occurances in the
283 negative examples.
284
285 This fitness is then modified by multiplying by the length of the
286 schema and then dividing by the number of ambiguous characters in
287 the schema. This helps select for schema which are longer and have
288 less redundancy.
289 """
290
291 seq_motif = genome.toseq()
292 motif = seq_motif.tostring()
293
294
295 num_pos = 0
296 for seq_record in self._pos_seqs:
297 cur_counts = self._schema_eval.num_matches(motif,
298 seq_record.seq.tostring())
299 num_pos += cur_counts
300
301
302 num_neg = 0
303 for seq_record in self._neg_seqs:
304 cur_counts = self._schema_eval.num_matches(motif,
305 seq_record.seq.tostring())
306
307 num_neg += cur_counts
308
309 num_ambiguous = self._schema_eval.num_ambiguous(motif)
310
311 num_ambiguous = pow(2.0, num_ambiguous)
312
313 num_ambiguous += 1
314
315 motif_size = len(motif)
316 motif_size = motif_size * 4.0
317
318 discerning_power = num_pos - num_neg
319
320 diff = (discerning_power * motif_size) / float(num_ambiguous)
321 return diff
322
324 """Calculate a fitness giving weight to schemas that match many times.
325
326 This fitness function tries to maximize schemas which are found many
327 times in a group of sequences.
328 """
329 - def __init__(self, seq_records, schema_evaluator):
330 """Initialize with sequences to evaluate.
331
332 Arguments:
333
334 o seq_records -- A set of SeqRecord objects which we use to
335 calculate the fitness.
336
337 o schema_evaluator - An Schema class which can be used to
338 evaluate find motif matches in sequences.
339 """
340 self._records = seq_records
341 self._evaluator = schema_evaluator
342
344 """Calculate the fitness of a genome based on schema matches.
345
346 This bases the fitness of a genome completely on the number of times
347 it matches in the set of seq_records. Matching more times gives a
348 better fitness
349 """
350
351 seq_motif = genome.toseq()
352 motif = seq_motif.tostring()
353
354
355 num_times = 0
356 for seq_record in self._records:
357 cur_counts = self._evaluator.num_matches(motif,
358 seq_record.seq.tostring())
359 num_times += cur_counts
360
361 return num_times
362
363
365 """Generate a random motif within given parameters.
366 """
367 - def __init__(self, alphabet, min_size = 12, max_size = 17):
368 """Initialize with the motif parameters.
369
370 Arguments:
371
372 o alphabet - An alphabet specifying what letters can be inserted in
373 a motif.
374
375 o min_size, max_size - Specify the range of sizes for motifs.
376 """
377 self._alphabet = alphabet
378 self._min_size = min_size
379 self._max_size = max_size
380
382 """Create a random motif within the given parameters.
383
384 This returns a single motif string with letters from the given
385 alphabet. The size of the motif will be randomly chosen between
386 max_size and min_size.
387 """
388 motif_size = random.randrange(self._min_size, self._max_size)
389
390 motif = ""
391 for letter_num in range(motif_size):
392 cur_letter = random.choice(self._alphabet.letters)
393 motif += cur_letter
394
395 return MutableSeq(motif, self._alphabet)
396
398 """Determine when we are done evolving motifs.
399
400 This takes the very simple approach of halting evolution when the
401 GA has proceeded for a specified number of generations and has
402 a given number of unique schema with positive fitness.
403 """
404 - def __init__(self, num_schemas, min_generations = 100):
405 """Initialize the finisher with its parameters.
406
407 Arguments:
408
409 o num_schemas -- the number of useful (positive fitness) schemas
410 we want to generation
411
412 o min_generations -- The minimum number of generations to allow
413 the GA to proceed.
414 """
415 self.num_generations = 0
416
417 self.num_schemas = num_schemas
418 self.min_generations = min_generations
419
421 """Determine when we can stop evolving the population.
422 """
423 self.num_generations += 1
424
425
426 if self.num_generations >= self.min_generations:
427 all_seqs = []
428 for org in organisms:
429 if org.fitness > 0:
430 if org.genome not in all_seqs:
431 all_seqs.append(org.genome)
432
433 if len(all_seqs) >= self.num_schemas:
434 return 1
435
436 return 0
437
438
440 """Find schema in a set of sequences using a genetic algorithm approach.
441
442 Finding good schemas is very difficult because it takes forever to
443 enumerate all of the potential schemas. This finder using a genetic
444 algorithm approach to evolve good schema which match many times in
445 a set of sequences.
446
447 The default implementation of the finder is ready to find schemas
448 in a set of DNA sequences, but the finder can be customized to deal
449 with any type of data.
450 """
457
458 - def find(self, seq_records):
459 """Find well-represented schemas in the given set of SeqRecords.
460 """
461 fitness_evaluator = MostCountSchemaFitness(seq_records,
462 self.evaluator)
463
464 return self._finder.find_schemas(fitness_evaluator.calculate_fitness,
465 self.num_schemas)
466
468 """Find schemas which differentiate between the two sets of SeqRecords.
469 """
470 fitness_evaluator = DifferentialSchemaFitness(first_records,
471 second_records,
472 self.evaluator)
473
474 return self._finder.find_schemas(fitness_evaluator.calculate_fitness,
475 self.num_schemas)
476
478 """Convert a sequence into a representation of ambiguous motifs (schemas).
479
480 This takes a sequence, and returns the number of times specified
481 motifs are found in the sequence. This lets you represent a sequence
482 as just a count of (possibly ambiguous) motifs.
483 """
484 - def __init__(self, schemas, ambiguous_converter):
485 """Initialize the coder to convert sequences
486
487 Arguments:
488
489 o schema - A list of all of the schemas we want to search for
490 in input sequences.
491
492 o ambiguous_converter - An Schema class which can be
493 used to convert motifs into regular expressions for searching.
494 """
495 self._schemas = schemas
496 self._converter = ambiguous_converter
497
499 """Represent the given input sequence as a bunch of motif counts.
500
501 Arguments:
502
503 o sequence - A Bio.Seq object we are going to represent as schemas.
504
505 This takes the sequence, searches for the motifs within it, and then
506 returns counts specifying the relative number of times each motifs
507 was found. The frequencies are in the order the original motifs were
508 passed into the initializer.
509 """
510 schema_counts = []
511
512 for schema in self._schemas:
513 num_counts = self._converter.num_matches(schema, sequence.tostring())
514 schema_counts.append(num_counts)
515
516
517 min_count = 0
518 max_count = max(schema_counts)
519
520
521
522 if max_count > 0:
523 for count_num in range(len(schema_counts)):
524 schema_counts[count_num] = (float(schema_counts[count_num]) -
525 float(min_count)) / float(max_count)
526
527 return schema_counts
528
530 """Determine whether or not the given pattern matches the schema.
531
532 Arguments:
533
534 o pattern - A string representing the pattern we want to check for
535 matching. This pattern can contain ambiguity characters (which are
536 assumed to be the same as those in the schema).
537
538 o schema - A string schema with ambiguity characters.
539
540 o ambiguity_character - The character used for ambiguity in the schema.
541 """
542 if len(pattern) != len(schema):
543 return 0
544
545
546
547 for pos in range(len(pattern)):
548 if (schema[pos] != ambiguity_character and
549 pattern[pos] != ambiguity_character and
550 pattern[pos] != schema[pos]):
551
552 return 0
553
554 return 1
555
557 """Generate Schema from inputs of Motifs or Signatures.
558 """
559 - def __init__(self, ambiguity_symbol = '*'):
560 """Initialize the SchemaFactory
561
562 Arguments:
563
564 o ambiguity_symbol -- The symbol to use when specifying that
565 a position is arbitrary.
566 """
567 self._ambiguity_symbol = ambiguity_symbol
568
569 - def from_motifs(self, motif_repository, motif_percent, num_ambiguous):
570 """Generate schema from a list of motifs.
571
572 Arguments:
573
574 o motif_repository - A MotifRepository class that has all of the
575 motifs we want to convert to Schema.
576
577 o motif_percent - The percentage of motifs in the motif bank which
578 should be matches. We'll try to create schema that match this
579 percentage of motifs.
580
581 o num_ambiguous - The number of ambiguous characters to include
582 in each schema. The positions of these ambiguous characters will
583 be randomly selected.
584 """
585
586 all_motifs = motif_repository.get_top_percentage(motif_percent)
587
588
589 schema_info = {}
590
591
592 total_count = self._get_num_motifs(motif_repository, all_motifs)
593 matched_count = 0
594 assert total_count > 0, "Expected to have motifs to match"
595 while (float(matched_count) / float(total_count)) < motif_percent:
596
597 new_schema, matching_motifs = \
598 self._get_unique_schema(schema_info.keys(),
599 all_motifs, num_ambiguous)
600
601
602
603 schema_counts = 0
604 for motif in matching_motifs:
605
606 schema_counts += motif_repository.count(motif)
607
608
609
610 all_motifs.remove(motif)
611
612
613
614 schema_info[new_schema] = schema_counts
615
616 matched_count += schema_counts
617
618
619
620 return PatternRepository(schema_info)
621
623 """Return the number of motif counts for the list of motifs.
624 """
625 motif_count = 0
626 for motif in motif_list:
627 motif_count += repository.count(motif)
628
629 return motif_count
630
632 """Retrieve a unique schema from a motif.
633
634 We don't want to end up with schema that match the same thing,
635 since this could lead to ambiguous results, and be messy. This
636 tries to create schema, and checks that they do not match any
637 currently existing schema.
638 """
639
640
641
642 num_tries = 0
643
644 while 1:
645
646 cur_motif = random.choice(motif_list)
647
648 num_tries += 1
649
650 new_schema, matching_motifs = \
651 self._schema_from_motif(cur_motif, motif_list,
652 num_ambiguous)
653
654 has_match = 0
655 for old_schema in cur_schemas:
656 if matches_schema(new_schema, old_schema,
657 self._ambiguity_symbol):
658 has_match = 1
659
660
661
662 if not(has_match):
663 break
664
665
666 assert num_tries < 150, \
667 "Could not generate schema in %s tries from %s with %s" \
668 % (num_tries, motif_list, cur_schemas)
669
670 return new_schema, matching_motifs
671
673 """Create a schema from a given starting motif.
674
675 Arguments:
676
677 o motif - A motif with the pattern we will start from.
678
679 o motif_list - The total motifs we have.to match to.
680
681 o num_ambiguous - The number of ambiguous characters that should
682 be present in the schema.
683
684 Returns:
685
686 o A string representing the newly generated schema.
687
688 o A list of all of the motifs in motif_list that match the schema.
689 """
690 assert motif in motif_list, \
691 "Expected starting motif present in remaining motifs."
692
693
694
695 new_schema_list = list(motif)
696 for add_ambiguous in range(num_ambiguous):
697
698 while 1:
699 ambig_pos = random.choice(range(len(new_schema_list)))
700
701
702
703 if new_schema_list[ambig_pos] != self._ambiguity_symbol:
704 new_schema_list[ambig_pos] = self._ambiguity_symbol
705 break
706
707
708 new_schema = ''.join(new_schema_list)
709
710
711 matched_motifs = []
712 for motif in motif_list:
713 if matches_schema(motif, new_schema, self._ambiguity_symbol):
714 matched_motifs.append(motif)
715
716 return new_schema, matched_motifs
717
719 raise NotImplementedError("Still need to code this.")
720