1
2
3
4
5
6
7
8 """
9 Access to General Feature Format databases created with Bio::DB:GFF
10
11 based on documentation for Lincoln Stein's Perl Bio::DB::GFF
12
13 >>> import os
14 >>> import Bio.GFF
15 >>> PASSWORD = os.environ['MYSQLPASS']
16 >>> DATABASE_GFF = 'wormbase_ws93'
17 >>> FASTADIR = '/local/wormbase_ws93/'
18 >>> gff = Bio.GFF.Connection(passwd=PASSWORD, db=DATABASE_GFF, fastadir=FASTADIR)
19
20 """
21
22 __version__ = "$Revision: 1.9 $"
23
24
25 import exceptions
26 import operator
27 import os.path
28 import sys
29 import types
30
31 from Bio import MissingExternalDependencyError
32
33 try:
34 import MySQLdb
35 except:
36 raise MissingExternalDependencyError("Install MySQLdb if you want to use Bio.GFF.")
37
38 from Bio.Alphabet import IUPAC
39 from Bio import DocSQL
40 from Bio.Seq import Seq
41 from Bio.SeqRecord import SeqRecord
42 from Bio import Translate
43
44 import binning
45 import easy
46 import GenericTools
47
48
49 DEFAULT_ALPHABET = IUPAC.unambiguous_dna
50 standard_translator = Translate.unambiguous_dna_by_id[1]
51
53 """
54 this will only work for the simplest of easy.Location objects
55 """
66
69
75
77 """
78 Connection to GFF database
79
80 also functions as whole segment
81 """
82
91
92 - def segment(self, *args, **keywds):
93 return Segment(self, *args, **keywds)
94
96 """
97 Singleton: contain records of loaded FASTA files
98
99 >>> RetrieveSeqname._dir = 'GFF'
100 >>> RetrieveSeqname._diagnostics = 1
101 >>> sys.stderr = sys.stdout # dump diagnostics to stdout so doctest can see them
102 >>> record1 = RetrieveSeqname('NC_001802.fna')
103 Loading GFF/NC_001802.fna... Done.
104 >>> record1.id
105 'gi|9629357|ref|NC_001802.1|'
106 >>> record2 = RetrieveSeqname('NC_001802.fna')
107 >>> record1.seq is record2.seq # should be same space in memory
108 1
109 """
110 __records = {}
111 _diagnostics = 0
112
125
127 """
128 strand may be:
129 +/0 = Watson
130 -/1 = Crick
131
132 I propose that we start calling these the Rosalind and Franklin strands
133
134 >>> RetrieveSeqname._dir = 'GFF'
135 >>> feature = Feature("NC_001802x.fna", 73, 78) # not having the x will interfere with the RetrieveSequence test
136 >>> feature.seq()
137 Seq('AATAAA', Alphabet())
138 >>> print feature.location()
139 NC_001802x.fna:73..78
140 >>> from Bio import SeqIO
141 >>> record = feature.record()
142 >>> records = [record]
143 >>> SeqIO.write(records, sys.stdout, 'fasta')
144 > NC_001802x.fna:73..78
145 AATAAA
146 >>> feature2 = Feature(location=easy.LocationFromString("NC_001802x.fna:73..78"))
147 >>> writer.write(feature2.record())
148 > NC_001802x.fna:73..78
149 AATAAA
150 >>> location3 = easy.LocationFromString("NC_001802x.fna:complement(73..78)")
151 >>> feature3 = Feature(location=location3)
152 >>> writer.write(feature3.record())
153 > NC_001802x.fna:complement(73..78)
154 TTTATT
155 >>> location4 = easy.LocationFromString("NC_001802x.fna:336..1631")
156 >>> feature4 = Feature(location=location4, frame=0)
157 >>> feature4.frame
158 0
159 >>> feature4.translate()[:7]
160 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*'))
161 >>> feature4.frame = 6 # can't happen, but a useful demonstration
162 >>> feature4.translate()[:5]
163 Seq('ARASV', HasStopCodon(IUPACProtein(), '*'))
164 >>> feature4.frame = 1
165 >>> feature4.translate()[:5]
166 Seq('WVRER', HasStopCodon(IUPACProtein(), '*'))
167 >>> location5 = easy.LocationFromString("NC_001802lc.fna:336..1631") # lowercase data
168 >>> feature5 = Feature(location=location5, frame=0)
169 >>> feature5.translate()[:7]
170 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*'))
171 >>> location6 = easy.LocationFromString("NC_001802lc.fna:335..351")
172 >>> feature6 = Feature(location=location6, frame=1)
173 >>> feature6.translate()
174 Seq('MGARA', HasStopCodon(IUPACProtein(), '*'))
175 """
176 - def __init__(self,
177 seqname=None,
178 start=None,
179 end=None,
180 strand="+",
181 frame=None,
182 location=None,
183 alphabet=DEFAULT_ALPHABET):
199
207
211
228
231
233 if self.target_start <= self.target_end:
234 return easy.LocationFromCoords(self.target_start-1, self.target_end-1, 0, seqname=self.gname)
235 else:
236
237 return easy.LocationFromCoords(self.target_end-1, self.target_start-1, 1, seqname=self.gname)
238
240 try:
241 return "%s:%s" % (self.gclass, self.gname)
242 except AttributeError:
243 return ""
244
247
250
252 return "Feature(%s)" % self.location()
253
255 """
256 row of FeatureQuery results
257 works like a Feature
258 """
268
270 """
271 SELECT fdata.fref AS seqname,
272 ftype.fsource AS source,
273 ftype.fmethod AS feature,
274 fdata.fstart AS start,
275 fdata.fstop AS end,
276 fdata.fscore AS score,
277 fdata.fstrand AS strand,
278 fdata.fphase AS frame,
279 fdata.ftarget_start AS target_start,
280 fdata.ftarget_stop AS target_end,
281 fdata.gid,
282 fgroup.gclass,
283 fgroup.gname
284 FROM fdata, ftype, fgroup
285 WHERE fdata.ftypeid = ftype.ftypeid
286 AND fdata.gid = fgroup.gid
287 """
288
289 row_class = FeatureQueryRow
290 - def __init__(self,
291 seqname=None,
292 coords=None,
293 complement=0,
294 method=None,
295 source=None,
296 object=None,
297 gid=None,
298 *args,
299 **keywds):
300
301 DocSQL.Query.__init__(self, *args, **keywds)
302
303 if seqname is not None:
304 self.statement += ' AND fref="%s"\n' % seqname
305 if coords is not None:
306 self.statement += " AND (%s)\n" % binning.query(coords[0], coords[1])
307 if coords[0] is not None:
308 self.statement += (" AND fstop >= %s\n" % coords[0])
309 if coords[1] is not None:
310 self.statement += (" AND fstart <= %s\n" % coords[1])
311 if method is not None:
312 self.statement += ' AND fmethod LIKE "%s"\n' % method
313 if source is not None:
314 self.statement += ' AND fsource LIKE "%s"\n' % source
315 if object is not None:
316 self.statement += ' AND %s\n' % object2fgroup_sql(object)
317 if gid is not None:
318 self.statement += ' AND fgroup.gid = %s\n' % gid
319
320 if complement:
321 self.statement += " ORDER BY 0-fstart, 0-fstop"
322 else:
323 self.statement += " ORDER BY fstart, fstop"
324
327
329 return 'gclass LIKE "%s" AND gname LIKE "%s"' % object_split(object)
330
332 """
333 >>> feature1_1 = Feature(location=easy.LocationFromString("NC_001802x.fna:336..1631"), frame=0) # gag-pol
334 >>> feature1_2 = Feature(location=easy.LocationFromString("NC_001802x.fna:1631..4642"), frame=0) # slippage
335 >>> aggregate = FeatureAggregate([feature1_1, feature1_2])
336 >>> print aggregate.location()
337 join(NC_001802x.fna:336..1631,NC_001802x.fna:1631..4642)
338 >>> xlate_str = aggregate.translate().tostring()
339 >>> xlate_str[:5], xlate_str[-5:]
340 ('MGARA', 'RQDED')
341
342 >>> location1 = easy.LocationFromString("NC_001802x.fna:complement(1..6)")
343 >>> location2 = easy.LocationFromString("NC_001802x.fna:complement(7..12)")
344 >>> feature2_1 = Feature(location=location1, frame=0)
345 >>> feature2_2 = Feature(location=location2, frame=0)
346 >>> aggregate2 = FeatureAggregate([feature2_1, feature2_2])
347 >>> print aggregate2.location()
348 complement(join(NC_001802x.fna:1..6,NC_001802x.fna:7..12))
349 >>> print aggregate2.translate()
350 Seq('TRET', HasStopCodon(IUPACProtein(), '*'))
351 >>> location1.reverse()
352 >>> location2.reverse()
353 >>> aggregate3 = FeatureAggregate([Feature(location=x, frame=0) for x in [location1, location2]])
354 >>> print aggregate3.location()
355 join(NC_001802x.fna:1..6,NC_001802x.fna:7..12)
356 >>> print aggregate3.translate()
357 Seq('GLSG', HasStopCodon(IUPACProtein(), '*'))
358 >>> aggregate3[0].frame = 3
359 >>> print aggregate3.translate()
360 Seq('LSG', HasStopCodon(IUPACProtein(), '*'))
361
362 >>> aggregate4 = FeatureAggregate()
363 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:1..5"), frame=0))
364 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:6..12"), frame=2))
365 >>> aggregate4.seq()
366 Seq('GGTCTCTCTGGT', Alphabet())
367 >>> aggregate4.translate()
368 Seq('GLSG', HasStopCodon(IUPACProtein(), '*'))
369 """
370 - def __init__(self, feature_query=None):
375
380
381 - def map(self, func):
386
389
401
403 """
404 >>> object_split("Sequence:F02E9.2a")
405 ('Sequence', 'F02E9.2a')
406 """
407 return tuple(object.split(":"))
408
409 -def _test(*args, **keywds):
410 import doctest, sys
411 doctest.testmod(sys.modules[__name__], *args, **keywds)
412
413 if __name__ == "__main__":
414 if __debug__:
415 _test()
416