Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # All rights reserved. 
  6  # This code is part of the Biopython distribution and governed by its 
  7  # license.  Please see the LICENSE file that should have been included 
  8  # as part of this package. 
  9   
 10  """Miscellaneous functions for dealing with sequences.""" 
 11   
 12  import re, time 
 13  from Bio import SeqIO 
 14  from Bio import Translate 
 15  from Bio.Seq import Seq 
 16  from Bio import Alphabet 
 17  from Bio.Alphabet import IUPAC 
 18  from Bio.Data import IUPACData, CodonTable 
 19   
 20   
 21  ###################################### 
 22  # DNA 
 23  ###################### 
 24  # {{{  
 25   
26 -def reverse(seq):
27 """Reverse the sequence. Works on string sequences. 28 """ 29 r = map(None, seq) 30 r.reverse() 31 return ''.join(r)
32
33 -def GC(seq):
34 """Calculates G+C content, returns the percentage (float between 0 and 100). 35 36 Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C) 37 when counting the G and C content. The percentage is calculated against 38 the full length, e.g.: 39 40 >>> from Bio.SeqUtils import GC 41 >>> GC("ACTGN") 42 40.0 43 """ 44 # 19/8/03: Iddo: added provision for lowercase 45 # 19/8/03: Iddo: divide by the sequence's length rather than by the 46 # A+T+G+C number. In that way, make provision for N. 47 48 d = {} 49 for nt in ['A','T','G','C','a','t','g','c','S','s']: 50 d[nt] = seq.count(nt) 51 gc = d.get('G',0) + d.get('C',0) + d.get('g',0) + d.get('c',0) + \ 52 d.get('S',0) + d.get('s',0) 53 54 if gc == 0: return 0 55 return gc*100.0/len(seq)
56
57 -def GC123(seq):
58 """Calculates total G+C content plus first, second and third positions. 59 60 Returns a tuple of four floats (percentages between 0 and 100) for the 61 entire sequence, and the three codon positions. e.g. 62 63 >>> from Bio.SeqUtils import GC123 64 >>> GC123("ACTGTN") 65 (40.0, 50.0, 50.0, 0.0) 66 67 Copes with mixed case sequences, but does NOT deal with ambiguous 68 nucleotides. 69 """ 70 d= {} 71 for nt in ['A','T','G','C']: 72 d[nt] = [0,0,0] 73 74 for i in range(0,len(seq),3): 75 codon = seq[i:i+3] 76 if len(codon) <3: codon += ' ' 77 for pos in range(0,3): 78 for nt in ['A','T','G','C']: 79 if codon[pos] == nt or codon[pos] == nt.lower(): 80 d[nt][pos] += 1 81 gc = {} 82 gcall = 0 83 nall = 0 84 for i in range(0,3): 85 try: 86 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i] 87 gc[i] = (d['G'][i] + d['C'][i])*100.0/n 88 except: 89 gc[i] = 0 90 91 gcall = gcall + d['G'][i] + d['C'][i] 92 nall = nall + n 93 94 gcall = 100.0*gcall/nall 95 return gcall, gc[0], gc[1], gc[2]
96
97 -def GC_skew(seq, window = 100):
98 """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence. 99 100 Returns a list of ratios (floats), controlled by the length of the sequence 101 and the size of the window. 102 103 Does NOT look at any ambiguous nucleotides. 104 """ 105 # 8/19/03: Iddo: added lowercase 106 values = [] 107 for i in range(0, len(seq), window): 108 s = seq[i: i + window] 109 g = s.count('G') + s.count('g') 110 c = s.count('C') + s.count('c') 111 skew = (g-c)/float(g+c) 112 values.append(skew) 113 return values
114 115 # 8/19/03 Iddo: moved these imports from within the function as 116 # ``import * '' is only 117 # allowed within the module 118 # Brad -- added an import check so you don't have to have Tkinter 119 # installed to use other sequtils functions. A little ugly but 120 # it really should be fixed by not using 'import *' which I'm not 121 # really excited about doing right now. 122 try: 123 from Tkinter import * 124 except ImportError: 125 pass 126 from math import pi, sin, cos, log
127 -def xGC_skew(seq, window = 1000, zoom = 100, 128 r = 300, px = 100, py = 100):
129 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).""" 130 yscroll = Scrollbar(orient = VERTICAL) 131 xscroll = Scrollbar(orient = HORIZONTAL) 132 canvas = Canvas(yscrollcommand = yscroll.set, 133 xscrollcommand = xscroll.set, background = 'white') 134 win = canvas.winfo_toplevel() 135 win.geometry('700x700') 136 137 yscroll.config(command = canvas.yview) 138 xscroll.config(command = canvas.xview) 139 yscroll.pack(side = RIGHT, fill = Y) 140 xscroll.pack(side = BOTTOM, fill = X) 141 canvas.pack(fill=BOTH, side = LEFT, expand = 1) 142 canvas.update() 143 144 X0, Y0 = r + px, r + py 145 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r 146 147 ty = Y0 148 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 149 ty +=20 150 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq))) 151 ty +=20 152 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue') 153 ty +=20 154 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta') 155 ty +=20 156 canvas.create_oval(x1,y1, x2, y2) 157 158 acc = 0 159 start = 0 160 for gc in GC_skew(seq, window): 161 r1 = r 162 acc+=gc 163 # GC skew 164 alpha = pi - (2*pi*start)/len(seq) 165 r2 = r1 - gc*zoom 166 x1 = X0 + r1 * sin(alpha) 167 y1 = Y0 + r1 * cos(alpha) 168 x2 = X0 + r2 * sin(alpha) 169 y2 = Y0 + r2 * cos(alpha) 170 canvas.create_line(x1,y1,x2,y2, fill = 'blue') 171 # accumulated GC skew 172 r1 = r - 50 173 r2 = r1 - acc 174 x1 = X0 + r1 * sin(alpha) 175 y1 = Y0 + r1 * cos(alpha) 176 x2 = X0 + r2 * sin(alpha) 177 y2 = Y0 + r2 * cos(alpha) 178 canvas.create_line(x1,y1,x2,y2, fill = 'magenta') 179 180 canvas.update() 181 start += window 182 183 canvas.configure(scrollregion = canvas.bbox(ALL))
184
185 -def molecular_weight(seq):
186 """Calculate the molecular weight of a DNA sequence.""" 187 if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna) 188 weight_table = IUPACData.unambiguous_dna_weights 189 #TODO, use a generator expession once we drop Python 2.3? 190 #e.g. return sum(weight_table[x] for x in seq) 191 total = 0 192 for x in seq: 193 total += weight_table[x] 194 return total
195
196 -def nt_search(seq, subseq):
197 """Search for a DNA subseq in sequence. 198 199 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 200 searches only on forward strand 201 """ 202 pattern = '' 203 for nt in subseq: 204 value = IUPACData.ambiguous_dna_values[nt] 205 if len(value) == 1: 206 pattern += value 207 else: 208 pattern += '[%s]' % value 209 210 pos = -1 211 result = [pattern] 212 l = len(seq) 213 while True: 214 pos+=1 215 s = seq[pos:] 216 m = re.search(pattern, s) 217 if not m: break 218 pos += int(m.start(0)) 219 result.append(pos) 220 return result
221 222 # }}} 223 224 ###################################### 225 # Protein 226 ###################### 227 # {{{ 228 229 # temporary hack for exception free translation of "dirty" DNA 230 # should be moved to ??? 231
232 -class ProteinX(Alphabet.ProteinAlphabet):
233 letters = IUPACData.extended_protein_letters + "X"
234 235 proteinX = ProteinX() 236
237 -class MissingTable:
238 - def __init__(self, table):
239 self._table = table
240 - def get(self, codon, stop_symbol):
241 try: 242 return self._table.get(codon, stop_symbol) 243 except CodonTable.TranslationError: 244 return 'X'
245
246 -def makeTableX(table):
247 assert table.protein_alphabet == IUPAC.extended_protein 248 return CodonTable.CodonTable(table.nucleotide_alphabet, proteinX, 249 MissingTable(table.forward_table), 250 table.back_table, table.start_codons, 251 table.stop_codons)
252 253 # end of hacks 254
255 -def seq3(seq):
256 """Turn a one letter code protein sequence into one with three letter codes. 257 258 The single input argument 'seq' should be a protein sequence using single 259 letter codes, either as a python string or as a Seq or MutableSeq object. 260 261 This function returns the amino acid sequence as a string using the three 262 letter amino acid codes. Output follows the IUPAC standard (including 263 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U 264 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown 265 character (including possible gap characters), is changed into 'Xaa'. 266 267 e.g. 268 >>> from Bio.SeqUtils import seq3 269 >>> seq3("MAIVMGRWKGAR*") 270 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' 271 272 This function was inspired by BioPerl's seq3. 273 """ 274 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp', 275 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His', 276 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met', 277 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg', 278 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp', 279 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter', 280 'U':'Sel', 'O':'Pyl', 'J':'Xle', 281 } 282 #We use a default of 'Xaa' for undefined letters 283 #Note this will map '-' to 'Xaa' which may be undesirable! 284 return ''.join([threecode.get(aa,'Xaa') for aa in seq])
285 286 287 # }}} 288 289 ###################################### 290 # Mixed ??? 291 ###################### 292 # {{{ 293
294 -def translate(seq, frame = 1, genetic_code = 1, translator = None):
295 """Translation of DNA in one of the six different reading frames (DEPRECATED). 296 297 Use the Bio.Seq.Translate function, or the Seq object's translate method 298 instead: 299 300 >>> from Bio.Seq import Seq 301 >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG") 302 >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUA") 303 >>> for frame in [0,1,2] : 304 ... print my_seq[frame:].translate() 305 ... 306 MAIVMGR*KGAR* 307 WPL*WAAERVPDS 308 GHCNGPLKGCPIV 309 >>> for frame in [0,1,2] : 310 ... print my_seq.reverse_complement()[frame:].translate() 311 ... 312 YYRAPFQRPITMA 313 TIGHPFSGPLQWP 314 LSGTLSAAHYNGH 315 """ 316 import warnings 317 warnings.warn("Bio.SeqUtils.translate() has been deprecated, and we intend" \ 318 +" to remove it in a future release of Biopython. Please use"\ 319 +" the method or function in Bio.Seq instead, as described in"\ 320 +" the Tutorial.", DeprecationWarning) 321 322 if frame not in [1,2,3,-1,-2,-3]: 323 raise ValueError('invalid frame') 324 325 if not translator: 326 table = makeTableX(CodonTable.ambiguous_dna_by_id[genetic_code]) 327 translator = Translate.Translator(table) 328 329 #Does this frame calculation do something sensible? No RC taken! 330 return translator.translate(Seq(seq[frame-1:], IUPAC.ambiguous_dna)).data
331
332 -def GC_Frame(seq, genetic_code = 1):
333 """Just an alias for six_frame_translations (OBSOLETE). 334 335 Use six_frame_translation directly, as this function may be deprecated 336 in a future release.""" 337 return six_frame_translations(seq, genetic_code)
338
339 -def six_frame_translations(seq, genetic_code = 1):
340 """Formatted string showing the 6 frame translations and GC content. 341 342 nice looking 6 frame translation with GC content - code from xbbtools 343 similar to DNA Striders six-frame translation 344 345 e.g. 346 >>> from Bio.SeqUtils import six_frame_translations 347 >>> print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA") 348 """ 349 from Bio.Seq import reverse_complement, translate 350 anti = reverse_complement(seq) 351 comp = anti[::-1] 352 length = len(seq) 353 frames = {} 354 for i in range(0,3): 355 frames[i+1] = translate(seq[i:], genetic_code) 356 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code)) 357 358 # create header 359 if length > 20: 360 short = '%s ... %s' % (seq[:10], seq[-10:]) 361 else: 362 short = seq 363 #TODO? Remove the date as this would spoil any unit test... 364 date = time.strftime('%y %b %d, %X', time.localtime(time.time())) 365 header = 'GC_Frame: %s, ' % date 366 for nt in ['a','t','g','c']: 367 header += '%s:%d ' % (nt, seq.count(nt.upper())) 368 369 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq)) 370 res = header 371 372 for i in range(0,length,60): 373 subseq = seq[i:i+60] 374 csubseq = comp[i:i+60] 375 p = i/3 376 res = res + '%d/%d\n' % (i+1, i/3+1) 377 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n' 378 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n' 379 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n' 380 # seq 381 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq)) 382 res = res + csubseq.lower() + '\n' 383 # - frames 384 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n' 385 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n' 386 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n' 387 return res
388 389 # }}} 390 391 ###################################### 392 # FASTA file utilities 393 ###################### 394 # {{{ 395
396 -def fasta_uniqids(file):
397 """Checks and changes the name/ID's to be unique identifiers by adding numbers (OBSOLETE). 398 399 file - a FASTA format filename to read in. 400 401 No return value, the output is written to screen. 402 """ 403 dict = {} 404 txt = open(file).read() 405 entries = [] 406 for entry in txt.split('>')[1:]: 407 name, seq= entry.split('\n',1) 408 name = name.split()[0].split(',')[0] 409 410 if name in dict: 411 n = 1 412 while 1: 413 n = n + 1 414 _name = name + str(n) 415 if _name not in dict: 416 name = _name 417 break 418 419 dict[name] = seq 420 421 for name, seq in dict.items(): 422 print '>%s\n%s' % (name, seq)
423
424 -def quick_FASTA_reader(file):
425 """Simple FASTA reader, returning a list of string tuples. 426 427 The single argument 'file' should be the filename of a FASTA format file. 428 This function will open and read in the entire file, constructing a list 429 of all the records, each held as a tuple of strings (the sequence name or 430 title, and its sequence). 431 432 This function was originally intended for use on large files, where its 433 low overhead makes it very fast. However, because it returns the data as 434 a single in memory list, this can require a lot of RAM on large files. 435 436 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which 437 allows you to iterate over the records one by one (avoiding having all the 438 records in memory at once). Using Bio.SeqIO also makes it easy to switch 439 between different input file formats. However, please note that rather 440 than simple strings, Bio.SeqIO uses SeqRecord objects for each record. 441 """ 442 #Want to split on "\n>" not just ">" in case there are any extra ">" 443 #in the name/description. So, in order to make sure we also split on 444 #the first entry, prepend a "\n" to the start of the file. 445 handle = open(file) 446 txt = "\n" + handle.read() 447 handle.close() 448 entries = [] 449 for entry in txt.split('\n>')[1:]: 450 name,seq= entry.split('\n',1) 451 seq = seq.replace('\n','').replace(' ','').upper() 452 entries.append((name, seq)) 453 return entries
454
455 -def apply_on_multi_fasta(file, function, *args):
456 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE). 457 458 file - filename of a FASTA format file 459 function - the function you wish to invoke on each record 460 *args - any extra arguments you want passed to the function 461 462 This function will iterate over each record in a FASTA file as SeqRecord 463 objects, calling your function with the record (and supplied args) as 464 arguments. 465 466 This function returns a list. For those records where your function 467 returns a value, this is taken as a sequence and used to construct a 468 FASTA format string. If your function never has a return value, this 469 means apply_on_multi_fasta will return an empty list. 470 """ 471 try: 472 f = globals()[function] 473 except: 474 raise NotImplementedError("%s not implemented" % function) 475 476 handle = open(file, 'r') 477 records = SeqIO.parse(handle, "fasta") 478 results = [] 479 for record in records: 480 arguments = [record.sequence] 481 for arg in args: arguments.append(arg) 482 result = f(*arguments) 483 if result: 484 results.append('>%s\n%s' % (record.name, result)) 485 handle.close() 486 return results
487
488 -def quicker_apply_on_multi_fasta(file, function, *args):
489 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE). 490 491 file - filename of a FASTA format file 492 function - the function you wish to invoke on each record 493 *args - any extra arguments you want passed to the function 494 495 This function will use quick_FASTA_reader to load every record in the 496 FASTA file into memory as a list of tuples. For each record, it will 497 call your supplied function with the record as a tuple of the name and 498 sequence as strings (plus any supplied args). 499 500 This function returns a list. For those records where your function 501 returns a value, this is taken as a sequence and used to construct a 502 FASTA format string. If your function never has a return value, this 503 means quicker_apply_on_multi_fasta will return an empty list. 504 """ 505 try: 506 f = globals()[function] 507 except: 508 raise NotImplementedError("%s not implemented" % function) 509 510 entries = quick_FASTA_reader(file) 511 results = [] 512 for name, seq in entries: 513 arguments = [seq] 514 for arg in args: arguments.append(arg) 515 result = f(*arguments) 516 if result: 517 results.append('>%s\n%s' % (name, result)) 518 handle.close() 519 return results
520 521 # }}} 522 523 ###################################### 524 # Main 525 ##################### 526 # {{{ 527 528 if __name__ == '__main__': 529 import sys, getopt 530 # crude command line options to use most functions directly on a FASTA file 531 options = {'apply_on_multi_fasta':0, 532 'quick':0, 533 'uniq_ids':0, 534 } 535 536 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=', 537 'help', 'quick', 'uniq_ids', 'search=']) 538 for arg in optlist: 539 if arg[0] in ['-h', '--help']: 540 pass 541 elif arg[0] in ['--describe']: 542 # get all new functions from this file 543 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)] 544 mol_funcs.sort() 545 print 'available functions:' 546 for f in mol_funcs: print '\t--%s' % f 547 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas' 548 549 sys.exit(0) 550 elif arg[0] in ['--apply_on_multi_fasta']: 551 options['apply_on_multi_fasta'] = arg[1] 552 elif arg[0] in ['--search']: 553 options['search'] = arg[1] 554 else: 555 key = re.search('-*(.+)', arg[0]).group(1) 556 options[key] = 1 557 558 559 if options.get('apply_on_multi_fasta'): 560 file = args[0] 561 function = options['apply_on_multi_fasta'] 562 arguments = [] 563 if options.get('search'): 564 arguments = options['search'] 565 if function == 'xGC_skew': 566 arguments = 1000 567 if options.get('quick'): 568 results = quicker_apply_on_multi_fasta(file, function, arguments) 569 else: 570 results = apply_on_multi_fasta(file, function, arguments) 571 for result in results: print result 572 573 elif options.get('uniq_ids'): 574 file = args[0] 575 fasta_uniqids(file) 576 577 # }}} 578