Package Bio :: Package GFF :: Module easy
[hide private]
[frames] | no frames]

Source Code for Module Bio.GFF.easy

  1  #!/usr/bin/env python 
  2  # 
  3  # Copyright 2002 by Michael Hoffman.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """ 
  9  Bio.GFF.easy: some functions to ease the use of Biopython 
 10  """ 
 11   
 12  __version__ = "$Revision: 1.11 $" 
 13  # $Source: /home/repository/biopython/biopython/Bio/GFF/easy.py,v $ 
 14   
 15  import copy 
 16  import re 
 17  import string 
 18  import sys 
 19   
 20  import Bio 
 21  from Bio import GenBank 
 22  from Bio.Data import IUPACData 
 23  from Bio.Seq import Seq 
 24   
 25  from Bio import SeqIO 
 26  from Bio import SeqUtils 
 27   
 28  import GenericTools 
 29   
30 -class FeatureDict(dict):
31 """ JH: accessing feature.qualifiers as a list is stupid. Here's a dict that does it"""
32 - def __init__(self, feature_list, default=None):
33 dict.__init__(self) 34 self.default = default 35 key_re = re.compile(r'/(\S+)=') 36 37 for i in feature_list: 38 key = key_re.match(i.key).group(1) 39 val = string.replace(i.value,'"','') 40 self[key] = val
41 - def __getitem__(self, key):
42 try: 43 return dict.__getitem__(self, key) 44 except KeyError: 45 return self.default
46
47 -class Location(GenericTools.VerboseList):
48 """ 49 this is really best interfaced through LocationFromString 50 fuzzy: < or > 51 join: {0 = no join, 1 = join, 2 = order} 52 53 >>> location = Location([Location([339]), Location([564])]) # zero-based 54 >>> location 55 Location(Location(339), Location(564)) 56 >>> print location # one-based 57 340..565 58 >>> print location.five_prime() 59 340 60 >>> location_rev = Location([Location([339]), Location([564])], 1) 61 >>> print location_rev 62 complement(340..565) 63 >>> print location_rev.five_prime() 64 565 65 """
66 - def __init__(self, the_list, complement=0, seqname=None):
67 self.complement = complement 68 self.join = 0 69 self.fuzzy = None 70 self.seqname = seqname 71 list.__init__(self, the_list)
72
73 - def _joinstr(self):
74 if self.join == 1: 75 label = 'join' 76 elif self.join == 2: 77 label = 'order' 78 return "%s(%s)" % (label, ",".join(map(str, self)))
79
80 - def __str__(self):
81 if self.seqname: 82 format = "%s:%%s" % self.seqname 83 else: 84 format = "%s" 85 86 if self.complement: 87 format = format % "complement(%s)" 88 89 if self.join: 90 return format % self._joinstr() 91 92 elif isinstance(self[0], list): 93 return format % "%s..%s" % (str(self[0]), str(self[1])) 94 else: 95 if self.fuzzy: 96 format = format % self.fuzzy + "%s" 97 return format % str(self[0] + 1)
98
99 - def __repr__(self):
100 return "Location(%s)" % ", ".join(map(repr, self))
101 102 direction2index = {1: 0, -1: -1}
103 - def direction_and_index(self, direction):
104 """ 105 1: 5' 106 -1: 3' 107 108 >>> loc1 = LocationFromString("join(1..3,complement(5..6))") 109 >>> loc1.direction_and_index(1) 110 (1, 0) 111 >>> loc1.direction_and_index(-1) 112 (-1, -1) 113 >>> loc1.reverse() 114 >>> print loc1 115 complement(join(1..3,complement(5..6))) 116 >>> loc1.direction_and_index(1) 117 (-1, -1) 118 """ 119 if self.complement: 120 direction = direction * -1 121 index = self.direction2index[direction] 122 return direction, index
123
124 - def findside(self, direction):
125 """ 126 >>> loc = LocationFromString("complement(join(1..5,complement(6..10)))") 127 >>> loc.findside(1) 128 Location(5) 129 >>> loc.findside(-1) 130 Location(0) 131 """ 132 direction, index = self.direction_and_index(direction) 133 if self.join or isinstance(self[0], list): 134 return self[index].findside(direction) 135 else: 136 return self
137
138 - def findseqname_3prime(self):
139 """ 140 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))") 141 >>> loc.findseqname_3prime() 142 'MOOCOW' 143 """ 144 return self.findseqname(-1)
145
146 - def findseqname(self, direction=1): # find 5' seqname
147 """ 148 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))") 149 >>> loc.findseqname() 150 'SEQ' 151 >>> loc.findseqname(-1) 152 'MOOCOW' 153 """ 154 direction, index = self.direction_and_index(direction) 155 if self.seqname: 156 return self.seqname 157 elif self.join: 158 return self[index].findseqname(direction) 159 else: 160 raise AttributeError('no sequence name')
161
162 - def five_prime(self):
163 return self.findside(1)
164 - def three_prime(self):
165 return self.findside(-1)
166
167 - def length(self):
168 """ 169 WARNING: doesn't deal with joins!!!! 170 """ 171 return self.end()-self.start()
172
173 - def intersection(self, other):
174 """ 175 WARNING: doesn't deal with joins!!!! 176 177 >>> location1 = LocationFromString("1..50") 178 >>> location2 = LocationFromString("25..200") 179 >>> print location1.intersection(location2) 180 25..50 181 >>> print location1.intersection(location2) 182 25..50 183 """ 184 if self.start() >= other.start(): 185 start = self.start() 186 else: 187 start = other.start() 188 if self.end() <= other.end(): 189 end = self.end() 190 else: 191 end = other.end() 192 return Location([Location([start]), Location([end])])
193
194 - def start(self):
195 # zero-based 196 if self.complement: 197 return self.three_prime()[0] 198 else: 199 return self.five_prime()[0]
200
201 - def end(self):
202 # zero-based 203 if self.complement: 204 return self.five_prime()[0] 205 else: 206 return self.three_prime()[0]
207
208 - def three_prime_range(self, window):
209 three_prime_loc = self.three_prime() 210 if self.complement: 211 return Location([three_prime_loc-window, three_prime_loc], complement=1) 212 else: 213 return Location([three_prime_loc, three_prime_loc+window])
214
215 - def sublocation(self, sub_location):
216 """ 217 >>> fwd_location = LocationFromString('X:5830132..5831528') 218 >>> print fwd_location.sublocation(LocationFromString('1..101')) 219 X:5830132..5830232 220 >>> print fwd_location.sublocation(LocationFromString('1267..1286')) 221 X:5831398..5831417 222 >>> rev_location = LocationFromString('I:complement(8415686..8416216)') 223 >>> print rev_location.sublocation(LocationFromString('1..101')) 224 I:complement(8416116..8416216) 225 >>> print rev_location.sublocation(LocationFromString('100..200')) 226 I:complement(8416017..8416117) 227 """ 228 229 absolute_location = copy.deepcopy(self) 230 for i in xrange(2): 231 absolute_location[i] = self.five_prime().add(sub_location[i], self.complement) 232 if absolute_location.complement: 233 list.reverse(absolute_location) 234 return absolute_location
235
236 - def __add__(self, addend):
237 return self.add(addend)
238
239 - def add(self, addend, complement=0):
240 self_copy = copy.deepcopy(self) 241 if isinstance(addend, Location): 242 addend = addend[0] 243 if complement: 244 addend *= -1 245 self_copy[0] += addend 246 return self_copy
247
248 - def __sub__(self, subtrahend):
249 return self + -subtrahend
250
251 - def reverse(self):
252 self.complement = [1, 0][self.complement]
253
254 - def reorient(self):
255 """ 256 >>> loc1 = LocationFromString("join(I:complement(1..9000),I:complement(9001..10000))") 257 >>> loc1.reorient() 258 >>> print loc1 259 complement(join(I:1..9000,I:9001..10000)) 260 >>> loc2 = LocationFromString("join(I:complement(1..9000),I:9001..10000)") 261 >>> loc2.reorient() 262 >>> print loc2 263 join(I:complement(1..9000),I:9001..10000) 264 """ 265 if self.join: 266 if len([x for x in self if x.complement]) == len(self): 267 self.reverse() 268 for segment in self: 269 segment.reverse()
270
271 - def bounding(self):
272 """ 273 works for single level non-complex joins 274 275 >>> LOC = LocationFromString 276 >>> l1 = LOC("join(alpha:1..30,alpha:50..70)") 277 >>> print l1.bounding() 278 join(alpha:1..70) 279 >>> l2 = LOC("join(alpha:1..30,alpha:complement(50..70))") 280 >>> print l2.bounding() 281 join(alpha:1..30,alpha:complement(50..70)) 282 >>> l3 = LOC("join(alpha:1..30,alpha:complement(50..70),beta:6..20,alpha:25..45)") 283 >>> print l3.bounding() 284 join(alpha:1..45,alpha:complement(50..70),beta:6..20) 285 286 """ 287 if not self.join: 288 return 289 290 seqdict = {} 291 seqkeys = [] 292 for subloc in self: 293 assert subloc.seqname 294 assert not subloc.join 295 try: 296 seqdict[_hashname(subloc)].append(subloc) 297 except KeyError: 298 key = _hashname(subloc) 299 seqdict[key] = [subloc] 300 seqkeys.append(key) 301 302 res = LocationJoin() 303 for key in seqkeys: 304 locations = seqdict[key] 305 coords = [] 306 for subloc in locations: 307 coords.append(subloc.start()) 308 coords.append(subloc.end()) 309 res.append(LocationFromCoords(min(coords), max(coords), locations[0].complement, locations[0].seqname)) 310 return res
311
312 -def _hashname(location):
313 return str(location.complement) + location.seqname
314
315 -class LocationJoin(Location):
316 """ 317 >>> join = LocationJoin([LocationFromCoords(339, 564, 1), LocationFromString("complement(100..339)")]) 318 >>> appendloc = LocationFromString("order(complement(66..99),complement(5..55))") 319 >>> join.append(appendloc) 320 >>> print join 321 join(complement(340..565),complement(100..339),order(complement(66..99),complement(5..55))) 322 >>> join2 = LocationJoin() 323 >>> join2.append(LocationFromString("complement(66..99)")) 324 >>> join2.append(LocationFromString("complement(5..55)")) 325 >>> print join2 326 join(complement(66..99),complement(5..55)) 327 """
328 - def __init__(self, the_list = [], complement=0, seqname=None):
329 self.complement = complement 330 self.join = 1 331 self.fuzzy = None 332 self.seqname = seqname 333 list.__init__(self, the_list)
334
335 -class LocationFromCoords(Location):
336 """ 337 >>> print LocationFromCoords(339, 564) 338 340..565 339 >>> print LocationFromCoords(339, 564, seqname="I") 340 I:340..565 341 >>> print LocationFromCoords(999, 3234, "-", seqname="NC_343434") 342 NC_343434:complement(1000..3235) 343 """
344 - def __init__(self, start, end, strand=0, seqname=None):
345 if strand == "+": 346 strand = 0 347 elif strand == "-": 348 strand = 1 349 Location.__init__(self, [Location([start]), Location([end])], strand, seqname)
350 351 # see http://www.ncbi.nlm.nih.gov/collab/FT/index.html#backus-naur 352 # for how this should actually be implemented 353 re_complement = re.compile(r"^complement\((.*)\)$") 354 re_seqname = re.compile(r"^(?!join|order|complement)([^\:]+?):(.*)$") # not every character is allowed by spec 355 re_join = re.compile(r"^(join|order)\((.*)\)$") 356 re_dotdot = re.compile(r"^([><]*\d+)\.\.([><]*\d+)$") 357 re_fuzzy = re.compile(r"^([><])(\d+)")
358 -class LocationFromString(Location):
359 """ 360 >>> # here are some tests from http://www.ncbi.nlm.nih.gov/collab/FT/index.html#location 361 >>> print LocationFromString("467") 362 467 363 >>> print LocationFromString("340..565") 364 340..565 365 >>> print LocationFromString("<345..500") 366 <345..500 367 >>> print LocationFromString("<1..888") 368 <1..888 369 >>> # (102.110) and 123^124 syntax unimplemented 370 >>> print LocationFromString("join(12..78,134..202)") 371 join(12..78,134..202) 372 >>> print LocationFromString("complement(join(2691..4571,4918..5163))") 373 complement(join(2691..4571,4918..5163)) 374 >>> print LocationFromString("join(complement(4918..5163),complement(2691..4571))") 375 join(complement(4918..5163),complement(2691..4571)) 376 >>> print LocationFromString("order(complement(4918..5163),complement(2691..4571))") 377 order(complement(4918..5163),complement(2691..4571)) 378 >>> print LocationFromString("NC_001802x.fna:73..78") 379 NC_001802x.fna:73..78 380 >>> print LocationFromString("J00194:100..202") 381 J00194:100..202 382 383 >>> print LocationFromString("join(117505..118584,1..609)") 384 join(117505..118584,1..609) 385 >>> print LocationFromString("join(test3:complement(4..6),test3:complement(1..3))") 386 join(test3:complement(4..6),test3:complement(1..3)) 387 >>> print LocationFromString("test3:join(test1:complement(1..3),4..6)") 388 test3:join(test1:complement(1..3),4..6) 389 """
390 - def __init__(self, location_str):
391 match_seqname = re_seqname.match(location_str) 392 if match_seqname: 393 self.seqname = match_seqname.group(1) 394 location_str = match_seqname.group(2) 395 else: 396 self.seqname = None 397 match_complement = re_complement.match(location_str) 398 if match_complement: 399 self.complement = 1 400 location_str = match_complement.group(1) 401 else: 402 self.complement = 0 403 match_join = re_join.match(location_str) 404 if match_join: 405 self.join = {'join':1, 'order':2}[match_join.group(1)] 406 list.__init__(self, map(lambda x: LocationFromString(x), match_join.group(2).split(","))) 407 else: 408 self.join = 0 409 match_dotdot = re_dotdot.match(location_str) 410 if match_dotdot: 411 list.__init__(self, map(lambda x: LocationFromString(match_dotdot.group(x)), (1, 2))) 412 else: 413 match_fuzzy = re_fuzzy.match(location_str) 414 if match_fuzzy: 415 self.fuzzy = match_fuzzy.group(1) 416 location_str = match_fuzzy.group(2) 417 else: 418 self.fuzzy = None 419 420 list.__init__(self, [int(location_str)-1]) # zero based, nip it in the bud
421
422 -def open_file(filename):
423 if filename: 424 return open(filename) 425 else: 426 return sys.stdin
427
428 -def fasta_single(filename=None, string=None):
429 """ 430 >>> record = fasta_single(string=\""" 431 ... >gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1] 432 ... MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQT 433 ... GSEELRSLYNTVATLYCVHQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNYPIVQNIQG 434 ... QMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAA 435 ... EWDRVHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKRWIILGLNKIVRMYSPT 436 ... SILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTAC 437 ... QGVGGPGHKARVLAEAMSQVTNSATIMMQRGNFRNQRKIVKCFNCGKEGHTARNCRAPRKKGCWKCGKEG 438 ... HQMKDCTERQANFLGKIWPSYKGRPGNFLQSRPEPTAPPEESFRSGVETTTPPQKQEPIDKELYPLTSLR 439 ... SLFGNDPSSQ 440 ... \""") 441 >>> record.id 442 'gi|9629360|ref|NP_057850.1|' 443 >>> record.description 444 'gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]' 445 >>> record.seq[0:5] 446 Seq('MGARA', SingleLetterAlphabet()) 447 """ 448 #Returns the first record in a fasta file as a SeqRecord, 449 #or None if there are no records in the file. 450 if string: 451 import cStringIO 452 handle = cStringIO.StringIO(string) 453 else : 454 handle = open_file(filename) 455 try : 456 record = SeqIO.parse(handle, format="fasta").next() 457 except StopIteration : 458 record = None 459 return record
460
461 -def fasta_multi(filename=None):
462 #Simple way is just: 463 #return SeqIO.parse(open_file(filename), format="fasta") 464 #However, for backwards compatibility make sure we raise 465 #the StopIteration exception rather than returning None. 466 reader = SeqIO.parse(open_file(filename), format="fasta") 467 while True: 468 record = reader.next() 469 if record is None: 470 raise StopIteration 471 else: 472 yield record
473
474 -def fasta_readrecords(filename=None):
475 """ 476 >>> records = fasta_readrecords('GFF/multi.fna') 477 >>> records[0].id 478 'test1' 479 >>> records[2].seq 480 Seq('AAACACAC', SingleLetterAlphabet()) 481 """ 482 return list(SeqIO.parse(open_file(filename), format="fasta"))
483
484 -def fasta_write(filename, records):
485 handle = open(filename, "w") 486 SeqIO.write(records, handle, format="fasta") 487 handle.close()
488
489 -def genbank_single(filename):
490 """ 491 >>> record = genbank_single("GFF/NC_001422.gbk") 492 >>> record.taxonomy 493 ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus'] 494 >>> cds = record.features[-4] 495 >>> cds.key 496 'CDS' 497 >>> location = LocationFromString(cds.location) 498 >>> print location 499 2931..3917 500 >>> subseq = record_subseq(record, location) 501 >>> subseq[0:20] 502 Seq('ATGTTTGGTGCTATTGCTGG', Alphabet()) 503 """ 504 return GenBank.RecordParser().parse(open(filename))
505
506 -def record_subseq(record, location, *args, **keywds):
507 """ 508 >>> from Bio.SeqRecord import SeqRecord 509 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"), 510 ... "ref|NC_001422", 511 ... "Coliphage phiX174, complete genome", 512 ... "bases 1-11") 513 >>> record_subseq(record, LocationFromString("1..4")) # one-based 514 Seq('GAGT', Alphabet()) 515 >>> record_subseq(record, LocationFromString("complement(1..4)")) # one-based 516 Seq('ACTC', Alphabet()) 517 >>> record_subseq(record, LocationFromString("join(complement(1..4),1..4)")) # what an idea! 518 Seq('ACTCGAGT', Alphabet()) 519 >>> loc = LocationFromString("complement(join(complement(5..7),1..4))") 520 >>> print loc 521 complement(join(complement(5..7),1..4)) 522 >>> record_subseq(record, loc) 523 Seq('ACTCTTT', Alphabet()) 524 >>> print loc 525 complement(join(complement(5..7),1..4)) 526 >>> loc.reverse() 527 >>> record_subseq(record, loc) 528 Seq('AAAGAGT', Alphabet()) 529 >>> record_subseq(record, loc, upper=1) 530 Seq('AAAGAGT', Alphabet()) 531 """ 532 if location.join: 533 subsequence_list = [] 534 if location.complement: 535 location_copy = copy.copy(location) 536 list.reverse(location_copy) 537 else: 538 location_copy = location 539 for sublocation in location_copy: 540 if location.complement: 541 sublocation_copy = copy.copy(sublocation) 542 sublocation_copy.reverse() 543 else: 544 sublocation_copy = sublocation 545 subsequence_list.append(record_subseq(record, sublocation_copy, *args, **keywds).tostring()) 546 return Seq(''.join(subsequence_list), record_sequence(record).alphabet) 547 else: 548 return record_coords(record, location.start(), location.end()+1, location.complement, *args, **keywds)
549
550 -def record_sequence(record):
551 """ 552 returns the sequence of a record 553 554 can be Bio.SeqRecord.SeqRecord or Bio.GenBank.Record.Record 555 """ 556 if isinstance(record, Bio.SeqRecord.SeqRecord): 557 return record.seq 558 elif isinstance(record, Bio.GenBank.Record.Record): 559 return Seq(record.sequence) 560 else: 561 raise TypeError('not Bio.SeqRecord.SeqRecord or Bio.GenBank.Record.Record')
562
563 -def record_coords(record, start, end, strand=0, upper=0):
564 """ 565 >>> from Bio.SeqRecord import SeqRecord 566 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"), 567 ... "ref|NC_001422", 568 ... "Coliphage phiX174, complete genome", 569 ... "bases 1-11") 570 >>> record_coords(record, 0, 4) # zero-based 571 Seq('GAGT', Alphabet()) 572 >>> record_coords(record, 0, 4, "-") # zero-based 573 Seq('ACTC', Alphabet()) 574 >>> record_coords(record, 0, 4, "-", upper=1) # zero-based 575 Seq('ACTC', Alphabet()) 576 """ 577 578 subseq = record_sequence(record)[start:end] 579 subseq_str = subseq.tostring() 580 subseq_str = subseq_str.upper() 581 subseq = Seq(subseq_str, subseq.alphabet) 582 if strand == '-' or strand == 1: 583 return subseq.reverse_complement() 584 else: 585 return subseq
586
587 -def _test(*args, **keywds):
588 import doctest, sys 589 doctest.testmod(sys.modules[__name__], *args, **keywds)
590 591 if __name__ == "__main__": 592 if __debug__: 593 _test() 594