1
2
3
4
5
6
7
8
9
10 """Miscellaneous functions for dealing with sequences."""
11
12 import re, time
13 from Bio import SeqIO
14 from Bio import Translate
15 from Bio.Seq import Seq
16 from Bio import Alphabet
17 from Bio.Alphabet import IUPAC
18 from Bio.Data import IUPACData, CodonTable
19
20
21
22
23
24
25
27 """Reverse the sequence. Works on string sequences.
28 """
29 r = map(None, seq)
30 r.reverse()
31 return ''.join(r)
32
34 """Calculates G+C content, returns the percentage (float between 0 and 100).
35
36 Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C)
37 when counting the G and C content. The percentage is calculated against
38 the full length, e.g.:
39
40 >>> from Bio.SeqUtils import GC
41 >>> GC("ACTGN")
42 40.0
43 """
44
45
46
47
48 d = {}
49 for nt in ['A','T','G','C','a','t','g','c','S','s']:
50 d[nt] = seq.count(nt)
51 gc = d.get('G',0) + d.get('C',0) + d.get('g',0) + d.get('c',0) + \
52 d.get('S',0) + d.get('s',0)
53
54 if gc == 0: return 0
55 return gc*100.0/len(seq)
56
58 """Calculates total G+C content plus first, second and third positions.
59
60 Returns a tuple of four floats (percentages between 0 and 100) for the
61 entire sequence, and the three codon positions. e.g.
62
63 >>> from Bio.SeqUtils import GC123
64 >>> GC123("ACTGTN")
65 (40.0, 50.0, 50.0, 0.0)
66
67 Copes with mixed case sequences, but does NOT deal with ambiguous
68 nucleotides.
69 """
70 d= {}
71 for nt in ['A','T','G','C']:
72 d[nt] = [0,0,0]
73
74 for i in range(0,len(seq),3):
75 codon = seq[i:i+3]
76 if len(codon) <3: codon += ' '
77 for pos in range(0,3):
78 for nt in ['A','T','G','C']:
79 if codon[pos] == nt or codon[pos] == nt.lower():
80 d[nt][pos] += 1
81 gc = {}
82 gcall = 0
83 nall = 0
84 for i in range(0,3):
85 try:
86 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
87 gc[i] = (d['G'][i] + d['C'][i])*100.0/n
88 except:
89 gc[i] = 0
90
91 gcall = gcall + d['G'][i] + d['C'][i]
92 nall = nall + n
93
94 gcall = 100.0*gcall/nall
95 return gcall, gc[0], gc[1], gc[2]
96
98 """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence.
99
100 Returns a list of ratios (floats), controlled by the length of the sequence
101 and the size of the window.
102
103 Does NOT look at any ambiguous nucleotides.
104 """
105
106 values = []
107 for i in range(0, len(seq), window):
108 s = seq[i: i + window]
109 g = s.count('G') + s.count('g')
110 c = s.count('C') + s.count('c')
111 skew = (g-c)/float(g+c)
112 values.append(skew)
113 return values
114
115
116
117
118
119
120
121
122 try:
123 from Tkinter import *
124 except ImportError:
125 pass
126 from math import pi, sin, cos, log
127 -def xGC_skew(seq, window = 1000, zoom = 100,
128 r = 300, px = 100, py = 100):
129 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!)."""
130 yscroll = Scrollbar(orient = VERTICAL)
131 xscroll = Scrollbar(orient = HORIZONTAL)
132 canvas = Canvas(yscrollcommand = yscroll.set,
133 xscrollcommand = xscroll.set, background = 'white')
134 win = canvas.winfo_toplevel()
135 win.geometry('700x700')
136
137 yscroll.config(command = canvas.yview)
138 xscroll.config(command = canvas.xview)
139 yscroll.pack(side = RIGHT, fill = Y)
140 xscroll.pack(side = BOTTOM, fill = X)
141 canvas.pack(fill=BOTH, side = LEFT, expand = 1)
142 canvas.update()
143
144 X0, Y0 = r + px, r + py
145 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r
146
147 ty = Y0
148 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
149 ty +=20
150 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq)))
151 ty +=20
152 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue')
153 ty +=20
154 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta')
155 ty +=20
156 canvas.create_oval(x1,y1, x2, y2)
157
158 acc = 0
159 start = 0
160 for gc in GC_skew(seq, window):
161 r1 = r
162 acc+=gc
163
164 alpha = pi - (2*pi*start)/len(seq)
165 r2 = r1 - gc*zoom
166 x1 = X0 + r1 * sin(alpha)
167 y1 = Y0 + r1 * cos(alpha)
168 x2 = X0 + r2 * sin(alpha)
169 y2 = Y0 + r2 * cos(alpha)
170 canvas.create_line(x1,y1,x2,y2, fill = 'blue')
171
172 r1 = r - 50
173 r2 = r1 - acc
174 x1 = X0 + r1 * sin(alpha)
175 y1 = Y0 + r1 * cos(alpha)
176 x2 = X0 + r2 * sin(alpha)
177 y2 = Y0 + r2 * cos(alpha)
178 canvas.create_line(x1,y1,x2,y2, fill = 'magenta')
179
180 canvas.update()
181 start += window
182
183 canvas.configure(scrollregion = canvas.bbox(ALL))
184
195
221
222
223
224
225
226
227
228
229
230
231
232 -class ProteinX(Alphabet.ProteinAlphabet):
234
235 proteinX = ProteinX()
236
240 - def get(self, codon, stop_symbol):
245
252
253
254
256 """Turn a one letter code protein sequence into one with three letter codes.
257
258 The single input argument 'seq' should be a protein sequence using single
259 letter codes, either as a python string or as a Seq or MutableSeq object.
260
261 This function returns the amino acid sequence as a string using the three
262 letter amino acid codes. Output follows the IUPAC standard (including
263 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
264 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown
265 character (including possible gap characters), is changed into 'Xaa'.
266
267 e.g.
268 >>> from Bio.SeqUtils import seq3
269 >>> seq3("MAIVMGRWKGAR*")
270 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
271
272 This function was inspired by BioPerl's seq3.
273 """
274 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
275 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
276 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
277 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
278 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
279 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter',
280 'U':'Sel', 'O':'Pyl', 'J':'Xle',
281 }
282
283
284 return ''.join([threecode.get(aa,'Xaa') for aa in seq])
285
286
287
288
289
290
291
292
293
294 -def translate(seq, frame = 1, genetic_code = 1, translator = None):
295 """Translation of DNA in one of the six different reading frames (DEPRECATED).
296
297 Use the Bio.Seq.Translate function, or the Seq object's translate method
298 instead:
299
300 >>> from Bio.Seq import Seq
301 >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
302 >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUA")
303 >>> for frame in [0,1,2] :
304 ... print my_seq[frame:].translate()
305 ...
306 MAIVMGR*KGAR*
307 WPL*WAAERVPDS
308 GHCNGPLKGCPIV
309 >>> for frame in [0,1,2] :
310 ... print my_seq.reverse_complement()[frame:].translate()
311 ...
312 YYRAPFQRPITMA
313 TIGHPFSGPLQWP
314 LSGTLSAAHYNGH
315 """
316 import warnings
317 warnings.warn("Bio.SeqUtils.translate() has been deprecated, and we intend" \
318 +" to remove it in a future release of Biopython. Please use"\
319 +" the method or function in Bio.Seq instead, as described in"\
320 +" the Tutorial.", DeprecationWarning)
321
322 if frame not in [1,2,3,-1,-2,-3]:
323 raise ValueError('invalid frame')
324
325 if not translator:
326 table = makeTableX(CodonTable.ambiguous_dna_by_id[genetic_code])
327 translator = Translate.Translator(table)
328
329
330 return translator.translate(Seq(seq[frame-1:], IUPAC.ambiguous_dna)).data
331
333 """Just an alias for six_frame_translations (OBSOLETE).
334
335 Use six_frame_translation directly, as this function may be deprecated
336 in a future release."""
337 return six_frame_translations(seq, genetic_code)
338
340 """Formatted string showing the 6 frame translations and GC content.
341
342 nice looking 6 frame translation with GC content - code from xbbtools
343 similar to DNA Striders six-frame translation
344
345 e.g.
346 >>> from Bio.SeqUtils import six_frame_translations
347 >>> print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
348 """
349 from Bio.Seq import reverse_complement, translate
350 anti = reverse_complement(seq)
351 comp = anti[::-1]
352 length = len(seq)
353 frames = {}
354 for i in range(0,3):
355 frames[i+1] = translate(seq[i:], genetic_code)
356 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code))
357
358
359 if length > 20:
360 short = '%s ... %s' % (seq[:10], seq[-10:])
361 else:
362 short = seq
363
364 date = time.strftime('%y %b %d, %X', time.localtime(time.time()))
365 header = 'GC_Frame: %s, ' % date
366 for nt in ['a','t','g','c']:
367 header += '%s:%d ' % (nt, seq.count(nt.upper()))
368
369 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq))
370 res = header
371
372 for i in range(0,length,60):
373 subseq = seq[i:i+60]
374 csubseq = comp[i:i+60]
375 p = i/3
376 res = res + '%d/%d\n' % (i+1, i/3+1)
377 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n'
378 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n'
379 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n'
380
381 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq))
382 res = res + csubseq.lower() + '\n'
383
384 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n'
385 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n'
386 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n'
387 return res
388
389
390
391
392
393
394
395
397 """Checks and changes the name/ID's to be unique identifiers by adding numbers (OBSOLETE).
398
399 file - a FASTA format filename to read in.
400
401 No return value, the output is written to screen.
402 """
403 dict = {}
404 txt = open(file).read()
405 entries = []
406 for entry in txt.split('>')[1:]:
407 name, seq= entry.split('\n',1)
408 name = name.split()[0].split(',')[0]
409
410 if name in dict:
411 n = 1
412 while 1:
413 n = n + 1
414 _name = name + str(n)
415 if _name not in dict:
416 name = _name
417 break
418
419 dict[name] = seq
420
421 for name, seq in dict.items():
422 print '>%s\n%s' % (name, seq)
423
425 """Simple FASTA reader, returning a list of string tuples.
426
427 The single argument 'file' should be the filename of a FASTA format file.
428 This function will open and read in the entire file, constructing a list
429 of all the records, each held as a tuple of strings (the sequence name or
430 title, and its sequence).
431
432 This function was originally intended for use on large files, where its
433 low overhead makes it very fast. However, because it returns the data as
434 a single in memory list, this can require a lot of RAM on large files.
435
436 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
437 allows you to iterate over the records one by one (avoiding having all the
438 records in memory at once). Using Bio.SeqIO also makes it easy to switch
439 between different input file formats. However, please note that rather
440 than simple strings, Bio.SeqIO uses SeqRecord objects for each record.
441 """
442
443
444
445 handle = open(file)
446 txt = "\n" + handle.read()
447 handle.close()
448 entries = []
449 for entry in txt.split('\n>')[1:]:
450 name,seq= entry.split('\n',1)
451 seq = seq.replace('\n','').replace(' ','').upper()
452 entries.append((name, seq))
453 return entries
454
456 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
457
458 file - filename of a FASTA format file
459 function - the function you wish to invoke on each record
460 *args - any extra arguments you want passed to the function
461
462 This function will iterate over each record in a FASTA file as SeqRecord
463 objects, calling your function with the record (and supplied args) as
464 arguments.
465
466 This function returns a list. For those records where your function
467 returns a value, this is taken as a sequence and used to construct a
468 FASTA format string. If your function never has a return value, this
469 means apply_on_multi_fasta will return an empty list.
470 """
471 try:
472 f = globals()[function]
473 except:
474 raise NotImplementedError("%s not implemented" % function)
475
476 handle = open(file, 'r')
477 records = SeqIO.parse(handle, "fasta")
478 results = []
479 for record in records:
480 arguments = [record.sequence]
481 for arg in args: arguments.append(arg)
482 result = f(*arguments)
483 if result:
484 results.append('>%s\n%s' % (record.name, result))
485 handle.close()
486 return results
487
489 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
490
491 file - filename of a FASTA format file
492 function - the function you wish to invoke on each record
493 *args - any extra arguments you want passed to the function
494
495 This function will use quick_FASTA_reader to load every record in the
496 FASTA file into memory as a list of tuples. For each record, it will
497 call your supplied function with the record as a tuple of the name and
498 sequence as strings (plus any supplied args).
499
500 This function returns a list. For those records where your function
501 returns a value, this is taken as a sequence and used to construct a
502 FASTA format string. If your function never has a return value, this
503 means quicker_apply_on_multi_fasta will return an empty list.
504 """
505 try:
506 f = globals()[function]
507 except:
508 raise NotImplementedError("%s not implemented" % function)
509
510 entries = quick_FASTA_reader(file)
511 results = []
512 for name, seq in entries:
513 arguments = [seq]
514 for arg in args: arguments.append(arg)
515 result = f(*arguments)
516 if result:
517 results.append('>%s\n%s' % (name, result))
518 handle.close()
519 return results
520
521
522
523
524
525
526
527
528 if __name__ == '__main__':
529 import sys, getopt
530
531 options = {'apply_on_multi_fasta':0,
532 'quick':0,
533 'uniq_ids':0,
534 }
535
536 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=',
537 'help', 'quick', 'uniq_ids', 'search='])
538 for arg in optlist:
539 if arg[0] in ['-h', '--help']:
540 pass
541 elif arg[0] in ['--describe']:
542
543 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)]
544 mol_funcs.sort()
545 print 'available functions:'
546 for f in mol_funcs: print '\t--%s' % f
547 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas'
548
549 sys.exit(0)
550 elif arg[0] in ['--apply_on_multi_fasta']:
551 options['apply_on_multi_fasta'] = arg[1]
552 elif arg[0] in ['--search']:
553 options['search'] = arg[1]
554 else:
555 key = re.search('-*(.+)', arg[0]).group(1)
556 options[key] = 1
557
558
559 if options.get('apply_on_multi_fasta'):
560 file = args[0]
561 function = options['apply_on_multi_fasta']
562 arguments = []
563 if options.get('search'):
564 arguments = options['search']
565 if function == 'xGC_skew':
566 arguments = 1000
567 if options.get('quick'):
568 results = quicker_apply_on_multi_fasta(file, function, arguments)
569 else:
570 results = apply_on_multi_fasta(file, function, arguments)
571 for result in results: print result
572
573 elif options.get('uniq_ids'):
574 file = args[0]
575 fasta_uniqids(file)
576
577
578