1
2
3
4
5
6
7
8 """
9 Bio.GFF.easy: some functions to ease the use of Biopython
10 """
11
12 __version__ = "$Revision: 1.11 $"
13
14
15 import copy
16 import re
17 import string
18 import sys
19
20 import Bio
21 from Bio import GenBank
22 from Bio.Data import IUPACData
23 from Bio.Seq import Seq
24
25 from Bio import SeqIO
26 from Bio import SeqUtils
27
28 import GenericTools
29
31 """ JH: accessing feature.qualifiers as a list is stupid. Here's a dict that does it"""
32 - def __init__(self, feature_list, default=None):
46
47 -class Location(GenericTools.VerboseList):
48 """
49 this is really best interfaced through LocationFromString
50 fuzzy: < or >
51 join: {0 = no join, 1 = join, 2 = order}
52
53 >>> location = Location([Location([339]), Location([564])]) # zero-based
54 >>> location
55 Location(Location(339), Location(564))
56 >>> print location # one-based
57 340..565
58 >>> print location.five_prime()
59 340
60 >>> location_rev = Location([Location([339]), Location([564])], 1)
61 >>> print location_rev
62 complement(340..565)
63 >>> print location_rev.five_prime()
64 565
65 """
66 - def __init__(self, the_list, complement=0, seqname=None):
72
74 if self.join == 1:
75 label = 'join'
76 elif self.join == 2:
77 label = 'order'
78 return "%s(%s)" % (label, ",".join(map(str, self)))
79
81 if self.seqname:
82 format = "%s:%%s" % self.seqname
83 else:
84 format = "%s"
85
86 if self.complement:
87 format = format % "complement(%s)"
88
89 if self.join:
90 return format % self._joinstr()
91
92 elif isinstance(self[0], list):
93 return format % "%s..%s" % (str(self[0]), str(self[1]))
94 else:
95 if self.fuzzy:
96 format = format % self.fuzzy + "%s"
97 return format % str(self[0] + 1)
98
100 return "Location(%s)" % ", ".join(map(repr, self))
101
102 direction2index = {1: 0, -1: -1}
104 """
105 1: 5'
106 -1: 3'
107
108 >>> loc1 = LocationFromString("join(1..3,complement(5..6))")
109 >>> loc1.direction_and_index(1)
110 (1, 0)
111 >>> loc1.direction_and_index(-1)
112 (-1, -1)
113 >>> loc1.reverse()
114 >>> print loc1
115 complement(join(1..3,complement(5..6)))
116 >>> loc1.direction_and_index(1)
117 (-1, -1)
118 """
119 if self.complement:
120 direction = direction * -1
121 index = self.direction2index[direction]
122 return direction, index
123
125 """
126 >>> loc = LocationFromString("complement(join(1..5,complement(6..10)))")
127 >>> loc.findside(1)
128 Location(5)
129 >>> loc.findside(-1)
130 Location(0)
131 """
132 direction, index = self.direction_and_index(direction)
133 if self.join or isinstance(self[0], list):
134 return self[index].findside(direction)
135 else:
136 return self
137
139 """
140 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))")
141 >>> loc.findseqname_3prime()
142 'MOOCOW'
143 """
144 return self.findseqname(-1)
145
147 """
148 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))")
149 >>> loc.findseqname()
150 'SEQ'
151 >>> loc.findseqname(-1)
152 'MOOCOW'
153 """
154 direction, index = self.direction_and_index(direction)
155 if self.seqname:
156 return self.seqname
157 elif self.join:
158 return self[index].findseqname(direction)
159 else:
160 raise AttributeError('no sequence name')
161
166
168 """
169 WARNING: doesn't deal with joins!!!!
170 """
171 return self.end()-self.start()
172
174 """
175 WARNING: doesn't deal with joins!!!!
176
177 >>> location1 = LocationFromString("1..50")
178 >>> location2 = LocationFromString("25..200")
179 >>> print location1.intersection(location2)
180 25..50
181 >>> print location1.intersection(location2)
182 25..50
183 """
184 if self.start() >= other.start():
185 start = self.start()
186 else:
187 start = other.start()
188 if self.end() <= other.end():
189 end = self.end()
190 else:
191 end = other.end()
192 return Location([Location([start]), Location([end])])
193
200
207
214
216 """
217 >>> fwd_location = LocationFromString('X:5830132..5831528')
218 >>> print fwd_location.sublocation(LocationFromString('1..101'))
219 X:5830132..5830232
220 >>> print fwd_location.sublocation(LocationFromString('1267..1286'))
221 X:5831398..5831417
222 >>> rev_location = LocationFromString('I:complement(8415686..8416216)')
223 >>> print rev_location.sublocation(LocationFromString('1..101'))
224 I:complement(8416116..8416216)
225 >>> print rev_location.sublocation(LocationFromString('100..200'))
226 I:complement(8416017..8416117)
227 """
228
229 absolute_location = copy.deepcopy(self)
230 for i in xrange(2):
231 absolute_location[i] = self.five_prime().add(sub_location[i], self.complement)
232 if absolute_location.complement:
233 list.reverse(absolute_location)
234 return absolute_location
235
237 return self.add(addend)
238
239 - def add(self, addend, complement=0):
240 self_copy = copy.deepcopy(self)
241 if isinstance(addend, Location):
242 addend = addend[0]
243 if complement:
244 addend *= -1
245 self_copy[0] += addend
246 return self_copy
247
249 return self + -subtrahend
250
253
255 """
256 >>> loc1 = LocationFromString("join(I:complement(1..9000),I:complement(9001..10000))")
257 >>> loc1.reorient()
258 >>> print loc1
259 complement(join(I:1..9000,I:9001..10000))
260 >>> loc2 = LocationFromString("join(I:complement(1..9000),I:9001..10000)")
261 >>> loc2.reorient()
262 >>> print loc2
263 join(I:complement(1..9000),I:9001..10000)
264 """
265 if self.join:
266 if len([x for x in self if x.complement]) == len(self):
267 self.reverse()
268 for segment in self:
269 segment.reverse()
270
272 """
273 works for single level non-complex joins
274
275 >>> LOC = LocationFromString
276 >>> l1 = LOC("join(alpha:1..30,alpha:50..70)")
277 >>> print l1.bounding()
278 join(alpha:1..70)
279 >>> l2 = LOC("join(alpha:1..30,alpha:complement(50..70))")
280 >>> print l2.bounding()
281 join(alpha:1..30,alpha:complement(50..70))
282 >>> l3 = LOC("join(alpha:1..30,alpha:complement(50..70),beta:6..20,alpha:25..45)")
283 >>> print l3.bounding()
284 join(alpha:1..45,alpha:complement(50..70),beta:6..20)
285
286 """
287 if not self.join:
288 return
289
290 seqdict = {}
291 seqkeys = []
292 for subloc in self:
293 assert subloc.seqname
294 assert not subloc.join
295 try:
296 seqdict[_hashname(subloc)].append(subloc)
297 except KeyError:
298 key = _hashname(subloc)
299 seqdict[key] = [subloc]
300 seqkeys.append(key)
301
302 res = LocationJoin()
303 for key in seqkeys:
304 locations = seqdict[key]
305 coords = []
306 for subloc in locations:
307 coords.append(subloc.start())
308 coords.append(subloc.end())
309 res.append(LocationFromCoords(min(coords), max(coords), locations[0].complement, locations[0].seqname))
310 return res
311
314
316 """
317 >>> join = LocationJoin([LocationFromCoords(339, 564, 1), LocationFromString("complement(100..339)")])
318 >>> appendloc = LocationFromString("order(complement(66..99),complement(5..55))")
319 >>> join.append(appendloc)
320 >>> print join
321 join(complement(340..565),complement(100..339),order(complement(66..99),complement(5..55)))
322 >>> join2 = LocationJoin()
323 >>> join2.append(LocationFromString("complement(66..99)"))
324 >>> join2.append(LocationFromString("complement(5..55)"))
325 >>> print join2
326 join(complement(66..99),complement(5..55))
327 """
328 - def __init__(self, the_list = [], complement=0, seqname=None):
334
336 """
337 >>> print LocationFromCoords(339, 564)
338 340..565
339 >>> print LocationFromCoords(339, 564, seqname="I")
340 I:340..565
341 >>> print LocationFromCoords(999, 3234, "-", seqname="NC_343434")
342 NC_343434:complement(1000..3235)
343 """
344 - def __init__(self, start, end, strand=0, seqname=None):
350
351
352
353 re_complement = re.compile(r"^complement\((.*)\)$")
354 re_seqname = re.compile(r"^(?!join|order|complement)([^\:]+?):(.*)$")
355 re_join = re.compile(r"^(join|order)\((.*)\)$")
356 re_dotdot = re.compile(r"^([><]*\d+)\.\.([><]*\d+)$")
357 re_fuzzy = re.compile(r"^([><])(\d+)")
359 """
360 >>> # here are some tests from http://www.ncbi.nlm.nih.gov/collab/FT/index.html#location
361 >>> print LocationFromString("467")
362 467
363 >>> print LocationFromString("340..565")
364 340..565
365 >>> print LocationFromString("<345..500")
366 <345..500
367 >>> print LocationFromString("<1..888")
368 <1..888
369 >>> # (102.110) and 123^124 syntax unimplemented
370 >>> print LocationFromString("join(12..78,134..202)")
371 join(12..78,134..202)
372 >>> print LocationFromString("complement(join(2691..4571,4918..5163))")
373 complement(join(2691..4571,4918..5163))
374 >>> print LocationFromString("join(complement(4918..5163),complement(2691..4571))")
375 join(complement(4918..5163),complement(2691..4571))
376 >>> print LocationFromString("order(complement(4918..5163),complement(2691..4571))")
377 order(complement(4918..5163),complement(2691..4571))
378 >>> print LocationFromString("NC_001802x.fna:73..78")
379 NC_001802x.fna:73..78
380 >>> print LocationFromString("J00194:100..202")
381 J00194:100..202
382
383 >>> print LocationFromString("join(117505..118584,1..609)")
384 join(117505..118584,1..609)
385 >>> print LocationFromString("join(test3:complement(4..6),test3:complement(1..3))")
386 join(test3:complement(4..6),test3:complement(1..3))
387 >>> print LocationFromString("test3:join(test1:complement(1..3),4..6)")
388 test3:join(test1:complement(1..3),4..6)
389 """
421
423 if filename:
424 return open(filename)
425 else:
426 return sys.stdin
427
429 """
430 >>> record = fasta_single(string=\"""
431 ... >gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]
432 ... MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQT
433 ... GSEELRSLYNTVATLYCVHQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNYPIVQNIQG
434 ... QMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAA
435 ... EWDRVHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKRWIILGLNKIVRMYSPT
436 ... SILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTAC
437 ... QGVGGPGHKARVLAEAMSQVTNSATIMMQRGNFRNQRKIVKCFNCGKEGHTARNCRAPRKKGCWKCGKEG
438 ... HQMKDCTERQANFLGKIWPSYKGRPGNFLQSRPEPTAPPEESFRSGVETTTPPQKQEPIDKELYPLTSLR
439 ... SLFGNDPSSQ
440 ... \""")
441 >>> record.id
442 'gi|9629360|ref|NP_057850.1|'
443 >>> record.description
444 'gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]'
445 >>> record.seq[0:5]
446 Seq('MGARA', SingleLetterAlphabet())
447 """
448
449
450 if string:
451 import cStringIO
452 handle = cStringIO.StringIO(string)
453 else :
454 handle = open_file(filename)
455 try :
456 record = SeqIO.parse(handle, format="fasta").next()
457 except StopIteration :
458 record = None
459 return record
460
473
475 """
476 >>> records = fasta_readrecords('GFF/multi.fna')
477 >>> records[0].id
478 'test1'
479 >>> records[2].seq
480 Seq('AAACACAC', SingleLetterAlphabet())
481 """
482 return list(SeqIO.parse(open_file(filename), format="fasta"))
483
488
490 """
491 >>> record = genbank_single("GFF/NC_001422.gbk")
492 >>> record.taxonomy
493 ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus']
494 >>> cds = record.features[-4]
495 >>> cds.key
496 'CDS'
497 >>> location = LocationFromString(cds.location)
498 >>> print location
499 2931..3917
500 >>> subseq = record_subseq(record, location)
501 >>> subseq[0:20]
502 Seq('ATGTTTGGTGCTATTGCTGG', Alphabet())
503 """
504 return GenBank.RecordParser().parse(open(filename))
505
507 """
508 >>> from Bio.SeqRecord import SeqRecord
509 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"),
510 ... "ref|NC_001422",
511 ... "Coliphage phiX174, complete genome",
512 ... "bases 1-11")
513 >>> record_subseq(record, LocationFromString("1..4")) # one-based
514 Seq('GAGT', Alphabet())
515 >>> record_subseq(record, LocationFromString("complement(1..4)")) # one-based
516 Seq('ACTC', Alphabet())
517 >>> record_subseq(record, LocationFromString("join(complement(1..4),1..4)")) # what an idea!
518 Seq('ACTCGAGT', Alphabet())
519 >>> loc = LocationFromString("complement(join(complement(5..7),1..4))")
520 >>> print loc
521 complement(join(complement(5..7),1..4))
522 >>> record_subseq(record, loc)
523 Seq('ACTCTTT', Alphabet())
524 >>> print loc
525 complement(join(complement(5..7),1..4))
526 >>> loc.reverse()
527 >>> record_subseq(record, loc)
528 Seq('AAAGAGT', Alphabet())
529 >>> record_subseq(record, loc, upper=1)
530 Seq('AAAGAGT', Alphabet())
531 """
532 if location.join:
533 subsequence_list = []
534 if location.complement:
535 location_copy = copy.copy(location)
536 list.reverse(location_copy)
537 else:
538 location_copy = location
539 for sublocation in location_copy:
540 if location.complement:
541 sublocation_copy = copy.copy(sublocation)
542 sublocation_copy.reverse()
543 else:
544 sublocation_copy = sublocation
545 subsequence_list.append(record_subseq(record, sublocation_copy, *args, **keywds).tostring())
546 return Seq(''.join(subsequence_list), record_sequence(record).alphabet)
547 else:
548 return record_coords(record, location.start(), location.end()+1, location.complement, *args, **keywds)
549
562
564 """
565 >>> from Bio.SeqRecord import SeqRecord
566 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"),
567 ... "ref|NC_001422",
568 ... "Coliphage phiX174, complete genome",
569 ... "bases 1-11")
570 >>> record_coords(record, 0, 4) # zero-based
571 Seq('GAGT', Alphabet())
572 >>> record_coords(record, 0, 4, "-") # zero-based
573 Seq('ACTC', Alphabet())
574 >>> record_coords(record, 0, 4, "-", upper=1) # zero-based
575 Seq('ACTC', Alphabet())
576 """
577
578 subseq = record_sequence(record)[start:end]
579 subseq_str = subseq.tostring()
580 subseq_str = subseq_str.upper()
581 subseq = Seq(subseq_str, subseq.alphabet)
582 if strand == '-' or strand == 1:
583 return subseq.reverse_complement()
584 else:
585 return subseq
586
587 -def _test(*args, **keywds):
588 import doctest, sys
589 doctest.testmod(sys.modules[__name__], *args, **keywds)
590
591 if __name__ == "__main__":
592 if __debug__:
593 _test()
594