1 """Find and deal with signatures in biological sequence data.
2
3 In addition to representing sequences according to motifs (see Motif.py
4 for more information), we can also use Signatures, which are conserved
5 regions that are not necessarily consecutive. This may be useful in
6 the case of very diverged sequences, where signatures may pick out
7 important conservation that can't be found by motifs (hopefully!).
8 """
9
10 from Bio.Alphabet import _verify_alphabet
11 from Bio.Seq import Seq
12
13
14 from Pattern import PatternRepository
15
17 """Find Signatures in a group of sequence records.
18
19 In this simple implementation, signatures are just defined as a
20 two motifs separated by a gap. We need something a lot smarter than
21 this to find more complicated signatures.
22 """
23 - def __init__(self, alphabet_strict = 1):
24 """Initialize a finder to get signatures.
25
26 Arguments:
27
28 o alphabet_strict - Specify whether signatures should be required
29 to have all letters in the signature be consistent with the
30 alphabet of the original sequence. This requires that all Seqs
31 used have a consistent alphabet. This helps protect against getting
32 useless signatures full of ambiguity signals.
33 """
34 self._alphabet_strict = alphabet_strict
35
36 - def find(self, seq_records, signature_size, max_gap):
37 """Find all signatures in a group of sequences.
38
39 Arguments:
40
41 o seq_records - A list of SeqRecord objects we'll use the sequences
42 from to find signatures.
43
44 o signature_size - The size of each half of a signature (ie. if this
45 is set at 3, then the signature could be AGC-----GAC)
46
47 o max_gap - The maximum gap size between two parts of a signature.
48 """
49 sig_info = self._get_signature_dict(seq_records, signature_size,
50 max_gap)
51
52 return PatternRepository(sig_info)
53
55 """Return a dictionary with all signatures and their counts.
56
57 This internal function does all of the hard work for the
58 find_signatures function.
59 """
60 if self._alphabet_strict:
61 alphabet = seq_records[0].seq.alphabet
62 else:
63 alphabet = None
64
65
66 all_sigs = {}
67 for seq_record in seq_records:
68
69 if alphabet is not None:
70 assert seq_record.seq.alphabet == alphabet, \
71 "Working with alphabet %s and got %s" % \
72 (alphabet, seq_record.seq.alphabet)
73
74
75 largest_sig_size = sig_size * 2 + max_gap
76 for start in range(len(seq_record.seq) - (largest_sig_size - 1)):
77
78 first_sig = seq_record.seq[start:start + sig_size].tostring()
79
80
81 for second in range(start + 1, (start + 1) + max_gap):
82 second_sig = seq_record.seq[second: second + sig_size].tostring()
83
84
85
86 if alphabet is not None:
87 first_seq = Seq(first_sig, alphabet)
88 second_seq = Seq(second_sig, alphabet)
89 if _verify_alphabet(first_seq) \
90 and _verify_alphabet(second_seq):
91 all_sigs = self._add_sig(all_sigs,
92 (first_sig, second_sig))
93
94
95 else:
96 all_sigs = self._add_sig(all_sigs,
97 (first_sig, second_sig))
98
99 return all_sigs
100
101 - def _add_sig(self, sig_dict, sig_to_add):
102 """Add a signature to the given dictionary.
103 """
104
105 if sig_to_add in sig_dict:
106 sig_dict[sig_to_add] += 1
107
108 else:
109 sig_dict[sig_to_add] = 1
110
111 return sig_dict
112
114 """Convert a Sequence into its signature representatives.
115
116 This takes a sequence and a set of signatures, and converts the
117 sequence into a list of numbers representing the relative amounts
118 each signature is seen in the sequence. This allows a sequence to
119 serve as input into a neural network.
120 """
121 - def __init__(self, signatures, max_gap):
122 """Initialize with the signatures to look for.
123
124 Arguments:
125
126 o signatures - A complete list of signatures, in order, that
127 are to be searched for in the sequences. The signatures should
128 be represented as a tuple of (first part of the signature,
129 second_part of the signature) -- ('GATC', 'GATC').
130
131 o max_gap - The maximum gap we can have between the two
132 elements of the signature.
133 """
134 self._signatures = signatures
135 self._max_gap = max_gap
136
137
138
139 if len(self._signatures) > 0:
140 first_sig_size = len(self._signatures[0][0])
141 second_sig_size = len(self._signatures[0][1])
142
143 assert first_sig_size == second_sig_size, \
144 "Ends of the signature do not match: %s" \
145 % self._signatures[0]
146
147 for sig in self._signatures:
148 assert len(sig[0]) == first_sig_size, \
149 "Got first part of signature %s, expected size %s" % \
150 (sig[0], first_sig_size)
151 assert len(sig[1]) == second_sig_size, \
152 "Got second part of signature %s, expected size %s" % \
153 (sig[1], second_sig_size)
154
156 """Convert a sequence into a representation of its signatures.
157
158 Arguments:
159
160 o sequence - A Seq object we are going to convert into a set of
161 signatures.
162
163 Returns:
164 A list of relative signature representations. Each item in the
165 list corresponds to the signature passed in to the initializer and
166 is the number of times that the signature was found, divided by the
167 total number of signatures found in the sequence.
168 """
169
170
171 if len(self._signatures) == 0:
172 return []
173
174
175 sequence_sigs = {}
176 for sig in self._signatures:
177 sequence_sigs[sig] = 0
178
179
180 all_first_sigs = []
181 for sig_start, sig_end in self._signatures:
182 all_first_sigs.append(sig_start)
183
184
185 sig_size = len(self._signatures[0][0])
186 smallest_sig_size = sig_size * 2
187
188 for start in range(len(sequence) - (smallest_sig_size - 1)):
189
190
191 first_sig = sequence[start:start + sig_size].tostring()
192 if first_sig in all_first_sigs:
193 for second in range(start + sig_size,
194 (start + sig_size + 1) + self._max_gap):
195 second_sig = sequence[second:second + sig_size].tostring()
196
197
198 if (first_sig, second_sig) in sequence_sigs:
199 sequence_sigs[(first_sig, second_sig)] += 1
200
201
202 min_count = min(sequence_sigs.values())
203 max_count = max(sequence_sigs.values())
204
205
206
207 if max_count > 0:
208 for sig in sequence_sigs:
209 sequence_sigs[sig] = (float(sequence_sigs[sig] - min_count)
210 / float(max_count))
211
212
213 sig_amounts = []
214 for sig in self._signatures:
215 sig_amounts.append(sequence_sigs[sig])
216
217 return sig_amounts
218