1
2
3 """
4 A set of classes to interact with the multiple alignment command
5 line program clustalw.
6
7 Clustalw is the command line version of the graphical Clustalx
8 aligment program.
9
10 This requires clustalw available from:
11
12 ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalW/.
13
14 functions:
15 o read
16 o parse_file
17 o do_alignment
18
19 classes:
20 o ClustalAlignment
21 o MultipleAlignCL"""
22
23
24 import os
25 import sys
26
27
28 from Bio import Alphabet
29 from Bio.Alphabet import IUPAC
30 from Bio.Align.Generic import Alignment
31
33 """Parse the given file into a clustal aligment object.
34
35 Arguments:
36 o file_name - The name of the file to parse.
37 o alphabet - The type of alphabet to use for the alignment sequences.
38 This should correspond to the type of information contained in the file.
39 Defaults to be unambiguous_dna sequence.
40
41 There is a deprecated optional argument debug_level which has no effect.
42
43 Since Biopython 1.46, this has called Bio.AlignIO internally.
44 """
45
46
47 handle = open(file_name, 'r')
48 from Bio import AlignIO
49 generic_alignment = AlignIO.read(handle, "clustal")
50 handle.close()
51
52
53 if isinstance(alphabet, Alphabet.Gapped) :
54 alpha = alphabet
55 else :
56 alpha = Alphabet.Gapped(alphabet)
57 clustal_alignment = ClustalAlignment(alpha)
58 clustal_alignment._records = generic_alignment._records
59 for record in clustal_alignment._records :
60 record.seq.alphabet = alpha
61
62 try :
63 clustal_alignment._version = generic_alignment._version
64 except AttributeError :
65
66 pass
67
68 try :
69 clustal_alignment._star_info = generic_alignment._star_info
70 except AttributeError :
71
72 pass
73
74 return clustal_alignment
75
77 """Perform an alignment with the given command line.
78
79 Arguments:
80 o command_line - A command line object that can give out
81 the command line we will input into clustalw.
82 o alphabet - the alphabet to use in the created alignment. If not
83 specified IUPAC.unambiguous_dna and IUPAC.protein will be used for
84 dna and protein alignment respectively.
85
86 Returns:
87 o A clustal alignment object corresponding to the created alignment.
88 If the alignment type was not a clustal object, None is returned.
89 """
90 try :
91 import subprocess
92 child_process = subprocess.Popen(str(command_line),
93 stdout=subprocess.PIPE,
94 stderr=subprocess.PIPE,
95 shell=(sys.platform!="win32")
96 )
97 status = child_process.wait()
98 except ImportError :
99
100 run_clust = os.popen(str(command_line))
101 status = run_clust.close()
102
103
104
105
106 value = 0
107 if status: value = status / 256
108
109
110
111 if value == 1:
112 raise ValueError("Bad command line option in the command: %s"
113 % str(command_line))
114
115 elif value == 2:
116 raise IOError("Cannot open sequence file %s"
117 % command_line.sequence_file)
118
119 elif value == 3:
120 raise IOError("Sequence file %s has an invalid format."
121 % command_line.sequence_file)
122
123 elif value == 4:
124 raise IOError("Sequence file %s has only one sequence present."
125 % command_line.sequence_file)
126
127
128 if command_line.output_file:
129 out_file = command_line.output_file
130 else:
131 out_file = os.path.splitext(command_line.sequence_file)[0] + '.aln'
132
133
134 if command_line.output_type and command_line.output_type != 'CLUSTAL':
135 return None
136
137 else:
138 if not alphabet:
139 alphabet = (IUPAC.unambiguous_dna, IUPAC.protein)[
140 command_line.type == 'PROTEIN']
141
142
143 if not(os.path.exists(out_file)):
144 raise IOError("Output .aln file %s not produced, commandline: %s"
145 % (out_file, command_line))
146
147 return parse_file(out_file, alphabet)
148
149
151 """Work with the clustal aligment format.
152
153 This format is the default output from clustal -- these files normally
154 have an extension of .aln.
155 """
156
157 DEFAULT_VERSION = '1.81'
158
166
168 """Print out the alignment so it looks pretty.
169
170 The output produced from this should also be formatted in valid
171 clustal format.
172 """
173
174 from Bio import AlignIO
175 from StringIO import StringIO
176 handle = StringIO()
177 AlignIO.write([self], handle, "clustal")
178 handle.seek(0)
179 return handle.read()
180
182 """Escape filenames with spaces (PRIVATE).
183
184 NOTE - This code is duplicated in Bio.Blast.NCBIStandalone,
185 scope for refactoring!
186 """
187 if " " not in filename :
188 return filename
189
190
191 if filename.startswith('"') and filename.endswith('"') :
192
193 return filename
194 else :
195 return '"%s"' % filename
196
198 """Represent a clustalw multiple alignment command line.
199
200 This is meant to make it easy to code the command line options you
201 want to submit to clustalw.
202
203 Clustalw has a ton of options and things to do but this is set up to
204 represent a clustalw mutliple alignment.
205
206 Warning: I don't use all of these options personally, so if you find
207 one to be broken for any reason, please let us know!
208 """
209
210 OUTPUT_TYPES = ['GCG', 'GDE', 'PHYLIP', 'PIR', 'NEXUS', 'FASTA']
211 OUTPUT_ORDER = ['INPUT', 'ALIGNED']
212 OUTPUT_CASE = ['LOWER', 'UPPER']
213 OUTPUT_SEQNOS = ['OFF', 'ON']
214 RESIDUE_TYPES = ['PROTEIN', 'DNA']
215 PROTEIN_MATRIX = ['BLOSUM', 'PAM', 'GONNET', 'ID']
216 DNA_MATRIX = ['IUB', 'CLUSTALW']
217
218 - def __init__(self, sequence_file, command = 'clustalw'):
219 """Initialize some general parameters that can be set as attributes.
220
221 Arguments:
222 o sequence_file - The file to read the sequences for alignment from.
223 o command - The command used to run clustalw. This defaults to
224 just 'clustalw' (ie. assumes you have it on your path somewhere).
225
226 General attributes that can be set:
227 o is_quick - if set as 1, will use a fast algorithm to create
228 the alignment guide tree.
229 o allow_negative - allow negative values in the alignment matrix.
230
231 Multiple alignment attributes that can be set as attributes:
232 o gap_open_pen - Gap opening penalty
233 o gap_ext_pen - Gap extension penalty
234 o is_no_end_pen - A flag as to whether or not there should be a gap
235 separation penalty for the ends.
236 o gap_sep_range - The gap separation penalty range.
237 o is_no_pgap - A flag to turn off residue specific gaps
238 o is_no_hgap - A flag to turn off hydrophilic gaps
239 o h_gap_residues - A list of residues to count a hydrophilic
240 o max_div - A percent identity to use for delay (? - I don't undertand
241 this!)
242 o trans_weight - The weight to use for transitions
243 """
244 self.sequence_file = sequence_file
245 self.command = command
246
247 self.is_quick = None
248 self.allow_negative = None
249
250 self.gap_open_pen = None
251 self.gap_ext_pen = None
252 self.is_no_end_pen = None
253 self.gap_sep_range = None
254 self.is_no_pgap = None
255 self.is_no_hgap = None
256 self.h_gap_residues = []
257 self.max_div = None
258 self.trans_weight = None
259
260
261
262 self.output_file = None
263 self.output_type = None
264 self.output_order = None
265 self.change_case = None
266 self.add_seqnos = None
267
268
269 self.guide_tree = None
270 self.new_tree = None
271
272
273 self.protein_matrix = None
274 self.dna_matrix = None
275
276
277 self.type = None
278
280 """Write out the command line as a string."""
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325 cline = _escape_filename(self.command)
326 cline += ' -INFILE=%s' % _escape_filename(self.sequence_file)
327
328
329 if self.type:
330 cline += " -TYPE=%s" % self.type
331 if self.is_quick == 1:
332
333
334 cline += " -quicktree"
335 if self.allow_negative == 1:
336 cline += " -NEGATIVE"
337
338
339 if self.output_file:
340 cline += " -OUTFILE=%s" % _escape_filename(self.output_file)
341 if self.output_type:
342 cline += " -OUTPUT=%s" % self.output_type
343 if self.output_order:
344 cline += " -OUTORDER=%s" % self.output_order
345 if self.change_case:
346 cline += " -CASE=%s" % self.change_case
347 if self.add_seqnos:
348 cline += " -SEQNOS=%s" % self.add_seqnos
349 if self.new_tree:
350
351 cline += " -NEWTREE=%s -align" % _escape_filename(self.new_tree)
352
353
354 if self.guide_tree:
355 cline += " -USETREE=%s" % _escape_filename(self.guide_tree)
356 if self.protein_matrix:
357 cline += " -MATRIX=%s" % self.protein_matrix
358 if self.dna_matrix:
359 cline += " -DNAMATRIX=%s" % self.dna_matrix
360 if self.gap_open_pen:
361 cline += " -GAPOPEN=%s" % self.gap_open_pen
362 if self.gap_ext_pen:
363 cline += " -GAPEXT=%s" % self.gap_ext_pen
364 if self.is_no_end_pen == 1:
365 cline += " -ENDGAPS"
366 if self.gap_sep_range:
367 cline += " -GAPDIST=%s" % self.gap_sep_range
368 if self.is_no_pgap == 1:
369 cline += " -NOPGAP"
370 if self.is_no_hgap == 1:
371 cline += " -NOHGAP"
372 if len(self.h_gap_residues) != 0:
373
374 residue_list = ''
375 for residue in self.h_gap_residues:
376 residue_list = residue_list + residue
377 cline += " -HGAPRESIDUES=%s" % residue_list
378 if self.max_div:
379 cline += " -MAXDIV=%s" % self.max_div
380 if self.trans_weight:
381 cline += " -TRANSWEIGHT=%s" % self.trans_weight
382
383 return cline
384
385 - def set_output(self, output_file, output_type = None, output_order = None,
386 change_case = None, add_seqnos = None):
387 """Set the output parameters for the command line.
388 """
389 self.output_file = output_file
390
391 if output_type:
392 output_type = output_type.upper()
393 if output_type not in self.OUTPUT_TYPES:
394 raise ValueError("Invalid output type %s. Valid choices are %s"
395 % (output_type, self.OUTPUT_TYPES))
396 else:
397 self.output_type = output_type
398
399 if output_order:
400 output_order = output_order.upper()
401 if output_order not in self.OUTPUT_ORDER:
402 raise ValueError("Invalid output order %s. Valid choices are %s"
403 % (output_order, self.OUTPUT_ORDER))
404 else:
405 self.output_order = output_order
406
407 if change_case:
408 change_case = change_case.upper()
409 if output_type != "GDE":
410 raise ValueError("Change case only valid for GDE output.")
411 elif change_case not in self.CHANGE_CASE:
412 raise ValueError("Invalid change case %s. Valid choices are %s"
413 % (change_case, self.CHANGE_CASE))
414 else:
415 self.change_case = change_case
416
417 if add_seqnos:
418 add_seqnos = add_seqnos.upper()
419 if output_type:
420 raise ValueError("Add SeqNos only valid for CLUSTAL output.")
421 elif add_seqnos not in self.OUTPUT_SEQNOS:
422 raise ValueError("Invalid seqnos option %s. Valid choices: %s"
423 % (add_seqnos, self.OUTPUT_SEQNOS))
424 else:
425 self.add_seqnos = add_seqnos
426
428 """Provide a file to use as the guide tree for alignment.
429
430 Raises:
431 o IOError - If the tree_file doesn't exist."""
432 if not(os.path.exists(tree_file)):
433 raise IOError("Could not find the guide tree file %s." %
434 tree_file)
435 else:
436 self.guide_tree = tree_file
437
439 """Set the name of the guide tree file generated in the alignment.
440 """
441 self.new_tree = tree_file
442
444 """Set the type of protein matrix to use.
445
446 Protein matrix can be either one of the defined types (blosum, pam,
447 gonnet or id) or a file with your own defined matrix.
448 """
449 if protein_matrix.upper() in self.PROTEIN_MATRIX:
450 self.protein_matrix = protein_matrix.upper()
451 elif os.path.exists(protein_matrix):
452 self.protein_matrix = protein_matrix
453 else:
454 raise ValueError("Invalid matrix %s. Options are %s or a file." %
455 (protein_matrix.upper(), self.PROTEIN_MATRIX))
456
458 """Set the type of DNA matrix to use.
459
460 The dna_matrix can either be one of the defined types (iub or clustalw)
461 or a file with the matrix to use."""
462 if dna_matrix.upper() in self.DNA_MATRIX:
463 self.dna_matrix = dna_matrix.upper()
464 elif os.path.exists(dna_matrix):
465 self.dna_matrix = dna_matrix
466 else:
467 raise ValueError("Invalid matrix %s. Options are %s or a file." %
468 (dna_matrix, self.DNA_MATRIX))
469
471 """Set the type of residues within the file.
472
473 Clustal tries to guess whether the info is protein or DNA based on
474 the number of GATCs, but this can be wrong if you have a messed up
475 protein or DNA you are working with, so this allows you to set it
476 explicitly.
477 """
478 residue_type = residue_type.upper()
479 if residue_type in self.RESIDUE_TYPES:
480 self.type = residue_type
481 else:
482 raise ValueError("Invalid residue type %s. Valid choices are %s"
483 % (residue_type, self.RESIDUE_TYPES))
484