1
2
3
4
5
6
7
8 """Access to General Feature Format databases created with BioPerl (DEPRECATED).
9
10 This is the "old" Bio.GFF module by Michael Hoffman, which offers access to
11 a MySQL database holding GFF data loaded by BioPerl. This code has now been
12 deprecated, and will be removed (or at best, relocated) in order to free the
13 Bio.GFF namespace for a new GFF parser in Biopython (including GFF3 support).
14
15 Based on documentation for Lincoln Stein's Perl Bio::DB::GFF
16
17 >>> import os
18 >>> import Bio.GFF
19 >>> PASSWORD = os.environ['MYSQLPASS']
20 >>> DATABASE_GFF = 'wormbase_ws93'
21 >>> FASTADIR = '/local/wormbase_ws93/'
22 >>> gff = Bio.GFF.Connection(passwd=PASSWORD, db=DATABASE_GFF, fastadir=FASTADIR)
23
24 """
25
26 import warnings
27 import Bio
28 warnings.warn("The old Bio.GFF module for access to a MySQL GFF database "
29 "created with BioPerl is deprecated, and will be removed (or "
30 "possibly just moved) in a future release of Biopython. If you "
31 "want to continue to use this code, please get in contact with "
32 "the developers via the mailing lists to avoid its permanent "
33 "removal from Biopython. The plan is to re-use the Bio.GFF "
34 "namespace for a new GFF parsing module.", Bio.BiopythonDeprecationWarning)
35
36 __version__ = "$Revision: 1.10 $"
37
38
39 import operator
40 import os.path
41 import sys
42 import types
43
44 from Bio import MissingPythonDependencyError
45
46 try:
47 import MySQLdb
48 except:
49 raise MissingPythonDependencyError("Install MySQLdb if you want to use "
50 "Bio.GFF.")
51
52 from Bio.Alphabet import IUPAC
53 from Bio import DocSQL
54 from Bio.Seq import Seq
55 from Bio.SeqRecord import SeqRecord
56
57 import binning
58 import easy
59 import GenericTools
60
61
62 DEFAULT_ALPHABET = IUPAC.unambiguous_dna
63
65 """
66 this will only work for the simplest of easy.Location objects
67 """
78
81
87
89 """
90 Connection to GFF database
91
92 also functions as whole segment
93 """
94
103
104 - def segment(self, *args, **keywds):
105 return Segment(self, *args, **keywds)
106
108 """
109 Singleton: contain records of loaded FASTA files
110
111 >>> RetrieveSeqname._dir = 'GFF'
112 >>> RetrieveSeqname._diagnostics = 1
113 >>> sys.stderr = sys.stdout # dump diagnostics to stdout so doctest can see them
114 >>> record1 = RetrieveSeqname('NC_001802.fna')
115 Loading GFF/NC_001802.fna... Done.
116 >>> record1.id
117 'gi|9629357|ref|NC_001802.1|'
118 >>> record2 = RetrieveSeqname('NC_001802.fna')
119 >>> record1.seq is record2.seq # should be same space in memory
120 1
121 """
122 __records = {}
123 _diagnostics = 0
124
137
139 """
140 strand may be:
141 +/0 = Watson
142 -/1 = Crick
143
144 I propose that we start calling these the Rosalind and Franklin strands
145
146 >>> RetrieveSeqname._dir = 'GFF'
147 >>> feature = Feature("NC_001802x.fna", 73, 78) # not having the x will interfere with the RetrieveSequence test
148 >>> feature.seq()
149 Seq('AATAAA', Alphabet())
150 >>> print feature.location()
151 NC_001802x.fna:73..78
152 >>> from Bio import SeqIO
153 >>> record = feature.record()
154 >>> records = [record]
155 >>> SeqIO.write(records, sys.stdout, 'fasta')
156 > NC_001802x.fna:73..78
157 AATAAA
158 >>> feature2 = Feature(location=easy.LocationFromString("NC_001802x.fna:73..78"))
159 >>> writer.write(feature2.record())
160 > NC_001802x.fna:73..78
161 AATAAA
162 >>> location3 = easy.LocationFromString("NC_001802x.fna:complement(73..78)")
163 >>> feature3 = Feature(location=location3)
164 >>> writer.write(feature3.record())
165 > NC_001802x.fna:complement(73..78)
166 TTTATT
167 >>> location4 = easy.LocationFromString("NC_001802x.fna:336..1631")
168 >>> feature4 = Feature(location=location4, frame=0)
169 >>> feature4.frame
170 0
171 >>> feature4.translate()[:7]
172 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*'))
173 >>> feature4.frame = 6 # can't happen, but a useful demonstration
174 >>> feature4.translate()[:5]
175 Seq('ARASV', HasStopCodon(IUPACProtein(), '*'))
176 >>> feature4.frame = 1
177 >>> feature4.translate()[:5]
178 Seq('WVRER', HasStopCodon(IUPACProtein(), '*'))
179 >>> location5 = easy.LocationFromString("NC_001802lc.fna:336..1631") # lowercase data
180 >>> feature5 = Feature(location=location5, frame=0)
181 >>> feature5.translate()[:7]
182 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*'))
183 >>> location6 = easy.LocationFromString("NC_001802lc.fna:335..351")
184 >>> feature6 = Feature(location=location6, frame=1)
185 >>> feature6.translate()
186 Seq('MGARA', HasStopCodon(IUPACProtein(), '*'))
187 """
188 - def __init__(self,
189 seqname=None,
190 start=None,
191 end=None,
192 strand="+",
193 frame=None,
194 location=None,
195 alphabet=DEFAULT_ALPHABET):
211
219
223
240
243
245 if self.target_start <= self.target_end:
246 return easy.LocationFromCoords(self.target_start-1, self.target_end-1, 0, seqname=self.gname)
247 else:
248
249 return easy.LocationFromCoords(self.target_end-1, self.target_start-1, 1, seqname=self.gname)
250
252 try:
253 return "%s:%s" % (self.gclass, self.gname)
254 except AttributeError:
255 return ""
256
259
262
264 return "Feature(%s)" % self.location()
265
267 """
268 row of FeatureQuery results
269 works like a Feature
270 """
280
282 """
283 SELECT fdata.fref AS seqname,
284 ftype.fsource AS source,
285 ftype.fmethod AS feature,
286 fdata.fstart AS start,
287 fdata.fstop AS end,
288 fdata.fscore AS score,
289 fdata.fstrand AS strand,
290 fdata.fphase AS frame,
291 fdata.ftarget_start AS target_start,
292 fdata.ftarget_stop AS target_end,
293 fdata.gid,
294 fgroup.gclass,
295 fgroup.gname
296 FROM fdata, ftype, fgroup
297 WHERE fdata.ftypeid = ftype.ftypeid
298 AND fdata.gid = fgroup.gid
299 """
300
301 row_class = FeatureQueryRow
302 - def __init__(self,
303 seqname=None,
304 coords=None,
305 complement=0,
306 method=None,
307 source=None,
308 object=None,
309 gid=None,
310 *args,
311 **keywds):
312
313 DocSQL.Query.__init__(self, *args, **keywds)
314
315 if seqname is not None:
316 self.statement += ' AND fref="%s"\n' % seqname
317 if coords is not None:
318 self.statement += " AND (%s)\n" % binning.query(coords[0], coords[1])
319 if coords[0] is not None:
320 self.statement += (" AND fstop >= %s\n" % coords[0])
321 if coords[1] is not None:
322 self.statement += (" AND fstart <= %s\n" % coords[1])
323 if method is not None:
324 self.statement += ' AND fmethod LIKE "%s"\n' % method
325 if source is not None:
326 self.statement += ' AND fsource LIKE "%s"\n' % source
327 if object is not None:
328 self.statement += ' AND %s\n' % object2fgroup_sql(object)
329 if gid is not None:
330 self.statement += ' AND fgroup.gid = %s\n' % gid
331
332 if complement:
333 self.statement += " ORDER BY 0-fstart, 0-fstop"
334 else:
335 self.statement += " ORDER BY fstart, fstop"
336
339
341 return 'gclass LIKE "%s" AND gname LIKE "%s"' % object_split(object)
342
344 """
345 >>> feature1_1 = Feature(location=easy.LocationFromString("NC_001802x.fna:336..1631"), frame=0) # gag-pol
346 >>> feature1_2 = Feature(location=easy.LocationFromString("NC_001802x.fna:1631..4642"), frame=0) # slippage
347 >>> aggregate = FeatureAggregate([feature1_1, feature1_2])
348 >>> print aggregate.location()
349 join(NC_001802x.fna:336..1631,NC_001802x.fna:1631..4642)
350 >>> xlate_str = aggregate.translate().tostring()
351 >>> xlate_str[:5], xlate_str[-5:]
352 ('MGARA', 'RQDED')
353
354 >>> location1 = easy.LocationFromString("NC_001802x.fna:complement(1..6)")
355 >>> location2 = easy.LocationFromString("NC_001802x.fna:complement(7..12)")
356 >>> feature2_1 = Feature(location=location1, frame=0)
357 >>> feature2_2 = Feature(location=location2, frame=0)
358 >>> aggregate2 = FeatureAggregate([feature2_1, feature2_2])
359 >>> print aggregate2.location()
360 complement(join(NC_001802x.fna:1..6,NC_001802x.fna:7..12))
361 >>> print aggregate2.translate()
362 Seq('TRET', HasStopCodon(IUPACProtein(), '*'))
363 >>> location1.reverse()
364 >>> location2.reverse()
365 >>> aggregate3 = FeatureAggregate([Feature(location=x, frame=0) for x in [location1, location2]])
366 >>> print aggregate3.location()
367 join(NC_001802x.fna:1..6,NC_001802x.fna:7..12)
368 >>> print aggregate3.translate()
369 Seq('GLSG', HasStopCodon(IUPACProtein(), '*'))
370 >>> aggregate3[0].frame = 3
371 >>> print aggregate3.translate()
372 Seq('LSG', HasStopCodon(IUPACProtein(), '*'))
373
374 >>> aggregate4 = FeatureAggregate()
375 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:1..5"), frame=0))
376 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:6..12"), frame=2))
377 >>> aggregate4.seq()
378 Seq('GGTCTCTCTGGT', Alphabet())
379 >>> aggregate4.translate()
380 Seq('GLSG', HasStopCodon(IUPACProtein(), '*'))
381 """
382 - def __init__(self, feature_query=None):
387
392
393 - def map(self, func):
398
401
403 attributes = ['alphabet', 'frame']
404 index = [0, -1][self.location().complement]
405 for attribute in attributes:
406 self.__dict__[attribute] = self[index].__dict__[attribute]
407 translation = Feature.translate(self)
408 try:
409 assert translation.tostring().index("*") == len(translation) - 1
410 return translation[:translation.tostring().index("*")]
411 except ValueError:
412 return translation
413
415 """
416 >>> object_split("Sequence:F02E9.2a")
417 ('Sequence', 'F02E9.2a')
418 """
419 return tuple(object.split(":"))
420
421 -def _test(*args, **keywds):
422 import doctest, sys
423 doctest.testmod(sys.modules[__name__], *args, **keywds)
424
425 if __name__ == "__main__":
426 if __debug__:
427 _test()
428