1 """Represent a Sequence Feature holding info about a part of a sequence.
2
3 This is heavily modeled after the Biocorba SeqFeature objects, and
4 may be pretty biased towards GenBank stuff since I'm writing it
5 for the GenBank parser output...
6
7 What's here:
8
9 Base class to hold a Feature.
10 ----------------------------
11 classes:
12 o SeqFeature
13
14 Hold information about a Reference.
15 ----------------------------------
16
17 This is an attempt to create a General class to hold Reference type
18 information.
19
20 classes:
21 o Reference
22
23 Specify locations of a feature on a Sequence.
24 ---------------------------------------------
25
26 This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in
27 much the same way as Biocorba. This has the advantages of allowing us
28 to handle fuzzy stuff in case anyone needs it, and also be compatible
29 with Biocorba.
30
31 classes:
32 o FeatureLocation - Specify the start and end location of a feature.
33
34 o ExactPosition - Specify the position as being exact.
o WithinPosition - Specify a position occurring within some range.
o BetweenPosition - Specify a position occurring between a range.
37 o BeforePosition - Specify the position as being found before some base.
38 o AfterPosition - Specify the position as being found after some base.
39 """
40
42 """Represent a Sequence Feature on an object.
43
44 Attributes:
45 o location - the location of the feature on the sequence
46 o type - the specified type of the feature (ie. CDS, exon, repeat...)
47 o location_operator - a string specifying how this SeqFeature may
48 be related to others. For example, in the example GenBank feature
49 shown below, the location_operator would be "join"
50 o strand - A value specifying on which strand (of a DNA sequence, for
51 instance) the feature deals with. 1 indicates the plus strand, -1
52 indicates the minus strand, 0 indicates both strands, and None indicates
53 that strand doesn't apply (ie. for proteins) or is not known.
54 o id - A string identifier for the feature.
55 o ref - A reference to another sequence. This could be an accession
56 number for some different sequence.
57 o ref_db - A different database for the reference accession number.
o qualifiers - A dictionary of qualifiers on the feature. These are
analogous to the qualifiers from a GenBank feature table. The keys of
60 the dictionary are qualifier names, the values are the qualifier
61 values.
62 o sub_features - Additional SeqFeatures which fall under this 'parent'
feature. For instance, if we have something like:
64
65 CDS join(1..10,30..40,50..60)
66
Then the top level feature would be a CDS from 1 to 60, and the sub
68 features would be of 'CDS_join' type and would be from 1 to 10, 30 to
69 40 and 50 to 60, respectively.
70 """
def __init__(self, location = None, type = '', location_operator = '',
             strand = None, id = "<unknown id>",
             qualifiers = None, sub_features = None,
             ref = None, ref_db = None):
    """Initialize a SeqFeature on a Sequence.

    Arguments:
    o location - the location of the feature on the sequence.
    o type - the specified type of the feature (ie. CDS, exon, repeat...).
    o location_operator - a string specifying how this feature may be
    related to others (for example "join").
    o strand - 1 for the plus strand, -1 for the minus strand, 0 for both
    strands, None when strand does not apply or is not known.
    o id - a string identifier for the feature.
    o qualifiers - a dictionary of qualifier names to qualifier values.
    When omitted (None), a new empty dictionary is created.
    o sub_features - a list of SeqFeatures falling under this feature.
    When omitted (None), a new empty list is created.
    o ref - a reference to another sequence.
    o ref_db - a different database for the reference accession number.

    The qualifiers and sub_features objects passed in are stored as given
    (not copied), so later changes made by the caller are visible here.
    """
    self.location = location

    self.type = type
    self.location_operator = location_operator
    self.strand = strand
    self.id = id

    # None is used as the "not supplied" marker so that every instance
    # gets its own container; a literal {} or [] default would be created
    # once and shared by all calls.
    if qualifiers is None:
        qualifiers = {}
    self.qualifiers = qualifiers

    if sub_features is None:
        sub_features = []
    self.sub_features = sub_features

    self.ref = ref
    self.ref_db = ref_db
91
93 """A string representation of the record for debugging."""
94 answer = "%s(%s" % (self.__class__, repr(self.location))
95 if self.type :
96 answer += ", type=%s" % repr(self.type)
97 if self.location_operator :
98 answer += ", location_operator=%s" % repr(self.location_operator)
99 if self.strand :
100 answer += ", strand=%s" % repr(self.strand)
101 if self.id and self.id != "<unknown id>" :
102 answer += ", id=%s" % repr(self.id)
103 if self.ref :
104 answer += ", ref=%s" % repr(self.ref)
105 if self.ref_db :
106 answer += ", ref_db=%s" % repr(self.ref_db)
107 answer += ")"
108 return answer
109
111 """A readable summary of the feature intended to be printed to screen.
112 """
113 out = "type: %s\n" % self.type
114 out += "location: %s\n" % self.location
115 out += "ref: %s:%s\n" % (self.ref, self.ref_db)
116 out += "strand: %s\n" % self.strand
117 out += "qualifiers: \n"
118 qualifier_keys = self.qualifiers.keys()
119 qualifier_keys.sort()
120 for qual_key in qualifier_keys:
121 out += "\tKey: %s, Value: %s\n" % (qual_key,
122 self.qualifiers[qual_key])
123 if len(self.sub_features) != 0:
124 out += "Sub-Features\n"
125 for sub_feature in self.sub_features:
126 out +="%s\n" % sub_feature
127
128 return out
129
130
131
132
134 """Represent a Generic Reference object.
135
136 Attributes:
137 o location - A list of Location objects specifying regions of
138 the sequence that the references correspond to. If no locations are
139 specified, the entire sequence is assumed.
140 o authors - A big old string, or a list split by author, of authors
141 for the reference.
142 o title - The title of the reference.
143 o journal - Journal the reference was published in.
144 o medline_id - A medline reference for the article.
145 o pubmed_id - A pubmed reference for the article.
146 o comment - A place to stick any comments about the reference.
147 """
157
159 """Output an informative string for debugging.
160 """
161 out = ""
162 for single_location in self.location:
163 out += "location: %s\n" % single_location
164 out += "authors: %s\n" % self.authors
165 if self.consrtm:
166 out += "consrtm: %s\n" % self.consrtm
167 out += "title: %s\n" % self.title
168 out += "journal: %s\n" % self.journal
169 out += "medline id: %s\n" % self.medline_id
170 out += "pubmed id: %s\n" % self.pubmed_id
171 out += "comment: %s\n" % self.comment
172
173 return out
174
175
176
178 """Specify the location of a feature along a sequence.
179
180 This attempts to deal with fuzziness of position ends, but also
181 make it easy to get the start and end in the 'normal' case (no
182 fuzziness).
183
184 You should access the start and end attributes with
185 your_location.start and your_location.end. If the start and
186 end are exact, this will return the positions, if not, we'll return
the appropriate Fuzzy class with info about the position and fuzziness.
188
189 Note that the start and end location numbering follow Python's scheme,
190 thus a GenBank entry of 123..150 (one based counting) becomes a location
191 of [122:150] (zero based counting).
192 """
194 """Specify the start and end of a sequence feature.
195
196 start and end arguments specify the values where the feature begins
and ends. These can either be any of the *Position objects that
198 inherit from AbstractPosition, or can just be integers specifying the
199 position. In the case of integers, the values are assumed to be
exact and are converted into ExactPosition objects. This is meant
201 to make it easy to deal with non-fuzzy ends.
202 """
203 if isinstance(start, AbstractPosition):
204 self._start = start
205 else:
206 self._start = ExactPosition(start)
207
208 if isinstance(end, AbstractPosition):
209 self._end = end
210 else:
211 self._end = ExactPosition(end)
212
214 """Returns a representation of the location (with python counting).
215
For the simple case this uses the python slicing syntax, [122:150]
217 (zero based counting) which GenBank would call 123..150 (one based
218 counting).
219 """
220 return "[%s:%s]" % (self._start, self._end)
221
223 """A string representation of the location for debugging."""
224 return "%s(%s,%s)" \
225 % (self.__class__, repr(self.start), repr(self.end))
226
228 """Make it easy to get non-fuzzy starts and ends.
229
230 We override get_attribute here so that in non-fuzzy cases we
231 can just return the start and end position without any hassle.
232
233 To get fuzzy start and ends, just ask for item.start and
234 item.end. To get non-fuzzy attributes (ie. the position only)
235 ask for 'item.nofuzzy_start', 'item.nofuzzy_end'. These should return
236 the largest range of the fuzzy position. So something like:
237 (10.20)..(30.40) should return 10 for start, and 40 for end.
238
The special tricky case is when we have a single between position
240 argument like 2^3 for the range. We want nofuzzy_start and nofuzzy_end
241 to give a reasonable approximation of what this really means, which
242 is an empty string -- so the same position for both. Doing a special
243 case here sucks, but there is really not a general rule you can apply
244 to this.
245 """
246 if attr == 'start':
247 return self._start
248 elif attr == 'end':
249 return self._end
250 elif attr == 'nofuzzy_start':
251 if ((self._start == self._end) and isinstance(self._start,
252 BetweenPosition)):
253 return self._start.position
254 else:
255 return min(self._start.position,
256 self._start.position + self._start.extension)
257 elif attr == 'nofuzzy_end':
258 if ((self._start == self._end) and isinstance(self._start,
259 BetweenPosition)):
260 return self._end.position
261 else:
262 return max(self._end.position,
263 self._end.position + self._end.extension)
264 else:
265 raise AttributeError("Cannot evaluate attribute %s." % attr)
266
268 """Abstract base class representing a position.
269 """
def __init__(self, position, extension):
    """Store the two values that describe a position.

    Arguments:
    o position - kept unchanged as self.position.
    o extension - kept unchanged as self.extension.
    """
    self.position, self.extension = position, extension
273
275 """String representation of the location for debugging."""
276 return "%s(%s,%s)" \
277 % (self.__class__, repr(self.position), repr(self.extension))
278
280 """A simple comparison function for positions.
281
282 This is very simple-minded and just compares the position attribute
283 of the features; extensions are not considered at all. This could
284 potentially be expanded to try to take advantage of extensions.
285 """
286 assert isinstance(other, AbstractPosition), \
287 "We can only do comparisons between Biopython Position objects."
288
289 return cmp(self.position, other.position)
290
292 """Specify the specific position of a boundary.
293
294 o position - The position of the boundary.
295 o extension - An optional argument which must be zero since we don't
296 have an extension. The argument is provided so that the same number of
297 arguments can be passed to all position types.
298
299 In this case, there is no fuzziness associated with the position.
300 """
def __init__(self, position, extension = 0):
    """Set up an exact position, which cannot carry an extension.

    Arguments:
    o position - The position of the boundary.
    o extension - Must be zero; accepted only so that every position
    type can be built with the same number of arguments.

    Raises AttributeError when extension is not zero.
    """
    has_extension = extension != 0
    if not has_extension:
        # The base initializer is always handed a literal 0 extension.
        AbstractPosition.__init__(self, position, 0)
        return

    message = "Non-zero extension %s for exact position." % extension
    raise AttributeError(message)
306
308 """String representation of the ExactPosition location for debugging."""
309 assert self.extension == 0
310 return "%s(%s)" % (self.__class__, repr(self.position))
311
313 return str(self.position)
314
316 """Specify the position of a boundary within some coordinates.
317
318 Arguments:
319 o position - The start position of the boundary
320 o extension - The range to which the boundary can extend.
321
322 This allows dealing with a position like ((1.4)..100). This
323 indicates that the start of the sequence is somewhere between 1
324 and 4. To represent that with this class we would set position as
325 1 and extension as 3.
326 """
327 - def __init__(self, position, extension = 0):
329
331 return "(%s.%s)" % (self.position, self.position + self.extension)
332
334 """Specify the position of a boundary between two coordinates.
335
336 Arguments:
337 o position - The start position of the boundary.
338 o extension - The range to the other position of a boundary.
339
340 This specifies a coordinate which is found between the two positions.
341 So this allows us to deal with a position like ((1^2)..100). To
342 represent that with this class we set position as 1 and the
343 extension as 1.
344 """
345 - def __init__(self, position, extension = 0):
347
349 return "(%s^%s)" % (self.position, self.position + self.extension)
350
352 """Specify a position where the actual location occurs before it.
353
354 Arguments:
355 o position - The upper boundary of where the location can occur.
356 o extension - An optional argument which must be zero since we don't
357 have an extension. The argument is provided so that the same number of
358 arguments can be passed to all position types.
359
360 This is used to specify positions like (<10..100) where the location
361 occurs somewhere before position 10.
362 """
def __init__(self, position, extension = 0):
    """Set up a 'before' position such as the <10 in (<10..100).

    Arguments:
    o position - The upper boundary of where the location can occur.
    o extension - Must be zero; accepted only so that every position
    type can be built with the same number of arguments.

    Raises AttributeError when extension is not zero.
    """
    if extension != 0:
        # The message names this position type rather than "exact", so
        # the error points at the class that actually rejected the value.
        raise AttributeError("Non-zero extension %s for before position."
                             % extension)
    AbstractPosition.__init__(self, position, 0)
368
370 """A string representation of the location for debugging."""
371 assert self.extension == 0
372 return "%s(%s)" % (self.__class__, repr(self.position))
373
375 return "<%s" % self.position
376
378 """Specify a position where the actual location is found after it.
379
380 Arguments:
381 o position - The lower boundary of where the location can occur.
382 o extension - An optional argument which must be zero since we don't
383 have an extension. The argument is provided so that the same number of
384 arguments can be passed to all position types.
385
386 This is used to specify positions like (>10..100) where the location
387 occurs somewhere after position 10.
388 """
def __init__(self, position, extension = 0):
    """Set up an 'after' position such as the >10 in (>10..100).

    Arguments:
    o position - The lower boundary of where the location can occur.
    o extension - Must be zero; accepted only so that every position
    type can be built with the same number of arguments.

    Raises AttributeError when extension is not zero.
    """
    if extension != 0:
        # The message names this position type rather than "exact", so
        # the error points at the class that actually rejected the value.
        raise AttributeError("Non-zero extension %s for after position."
                             % extension)
    AbstractPosition.__init__(self, position, 0)
394
396 """A string representation of the location for debugging."""
397 assert self.extension == 0
398 return "%s(%s)" % (self.__class__, repr(self.position))
399
401 return ">%s" % self.position
402
404 """Specify a position where the location can be multiple positions.
405
406 This models the GenBank 'one-of(1888,1901)' function, and tries
407 to make this fit within the Biopython Position models. In our case
408 the position of the "one-of" is set as the lowest choice, and the
409 extension is the range to the highest choice.
410 """
"""Initialize with a set of possible positions.
413
414 position_list is a list of AbstractPosition derived objects,
415 specifying possible locations.
416 """
417
418 self.position_choices = position_list
419
420 smallest = None
421 largest = None
422 for position_choice in self.position_choices:
423 assert isinstance(position_choice, AbstractPosition), \
424 "Expected position objects, got %r" % position_choice
425 if smallest is None and largest is None:
426 smallest = position_choice.position
427 largest = position_choice.position
428 elif position_choice.position > largest:
429 largest = position_choice.position
430 elif position_choice.position < smallest:
431 smallest = position_choice.position
432
433 AbstractPosition.__init__(self, smallest, largest - smallest)
434
436 """String representation of the OneOfPosition location for debugging."""
437 return "%s(%s)" % (self.__class__, repr(self.position_choices))
438
440 out = "one-of("
441 for position in self.position_choices:
442 out += "%s," % position
443
444 out = out[:-1] + ")"
445 return out
446
448 """Simple class to hold information about a gap between positions.
449 """
"""Initialize with a position object containing the gap information.
452 """
453 self.gap_size = gap_size
454
456 """A string representation of the position gap for debugging."""
457 return "%s(%s)" % (self.__class__, repr(self.gap_size))
458
460 out = "gap(%s)" % self.gap_size
461 return out
462