1
2
3
4 """Code for calling ClustalW and parsing its output (DEPRECATED).
5
6 This module has been superseded by the Bio.AlignIO framework for
7 alignment parsing, and the ClustalW command line wrapper in
8 Bio.Align.Applications for calling the tool. These are both described
9 in the current version of the Biopython Tutorial and Cookbook.
10 This means Bio.Clustalw is now deprecated and likely to be
11 removed in future releases of Biopython.
12
13 A set of classes to interact with the multiple alignment command
14 line program clustalw.
15
16 Clustalw is the command line version of the graphical Clustalx
17 aligment program.
18
19 This requires clustalw available from:
20
21 ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalW/.
22
23 functions:
24 o read
25 o parse_file
26 o do_alignment
27
28 classes:
29 o ClustalAlignment
30 o MultipleAlignCL"""
31
32 import Bio
33 import warnings
34 warnings.warn("Bio.Clustalw is deprecated. Please use the Bio.AlignIO framework for alignment parsing, and the ClustalW command line wrapper in Bio.Align.Applications for calling the tool. These are both described in the current version of the Biopython Tutorial and Cookbook.", Bio.BiopythonDeprecationWarning)
35
36
37 import os
38 import sys
39 import subprocess
40
41
42 from Bio import Alphabet
43 from Bio.Alphabet import IUPAC
44 from Bio.Align.Generic import Alignment
45 from Bio.Application import _escape_filename
46
48 """Parse the given file into a clustal aligment object (OBSOLETE).
49
50 Arguments:
51 o file_name - The name of the file to parse.
52 o alphabet - The type of alphabet to use for the alignment sequences.
53 This should correspond to the type of information contained in the file.
54 Defaults to be unambiguous_dna sequence.
55
56 There is a deprecated optional argument debug_level which has no effect.
57
58 This function is obsolete, and any new code should call Bio.AlignIO
59 instead. For example using Bio.Clustalw, you might have:
60
61 >>> from Bio import Clustalw
62 >>> from Bio import Alphabet
63 >>> filename = "Clustalw/protein.aln"
64 >>> alpha = Alphabet.Gapped(Alphabet.generic_protein)
65 >>> align = Clustalw.parse_file(filename, alphabet=alpha)
66 >>> print align.get_alignment_length()
67 411
68 >>> clustalw_string = str(align)
69
70 This becomes:
71
72 >>> from Bio import AlignIO
73 >>> from Bio import Alphabet
74 >>> filename = "Clustalw/protein.aln"
75 >>> alpha = Alphabet.Gapped(Alphabet.generic_protein)
76 >>> align = AlignIO.read(open(filename), "clustal", alphabet=alpha)
77 >>> print align.get_alignment_length()
78 411
79 >>> assert clustalw_string == align.format("clustal")
80 """
81
82 import warnings
83 warnings.warn("This function is obsolete, and any new code should call Bio.AlignIO instead.", PendingDeprecationWarning)
84
85 handle = open(file_name, 'r')
86 from Bio import AlignIO
87 generic_alignment = AlignIO.read(handle, "clustal")
88 handle.close()
89
90
91 if isinstance(alphabet, Alphabet.Gapped):
92 alpha = alphabet
93 else:
94 alpha = Alphabet.Gapped(alphabet)
95 clustal_alignment = ClustalAlignment(alpha)
96 clustal_alignment._records = generic_alignment._records
97 for record in clustal_alignment._records:
98 record.seq.alphabet = alpha
99
100 try:
101 clustal_alignment._version = generic_alignment._version
102 except AttributeError:
103
104 pass
105
106 try :
107 clustal_alignment._star_info = generic_alignment._star_info
108 except AttributeError:
109
110 pass
111
112 return clustal_alignment
113
115 """Perform an alignment with the given command line (OBSOLETE).
116
117 Arguments:
118 o command_line - A command line object that can give out
119 the command line we will input into clustalw.
120 o alphabet - the alphabet to use in the created alignment. If not
121 specified IUPAC.unambiguous_dna and IUPAC.protein will be used for
122 dna and protein alignment respectively.
123
124 Returns:
125 o A clustal alignment object corresponding to the created alignment.
126 If the alignment type was not a clustal object, None is returned.
127
128 This function (and the associated command line object) are now obsolete.
129 Please use the Bio.Align.Applications.ClustalwCommandline wrapper with
130 the Python subprocess module (and Bio.AlignIO for parsing) as described
131 in the tutorial.
132 """
133 import warnings
134 warnings.warn("This function (and the associated command line object) are now obsolete. Please use the Bio.Align.Applications.ClustalwCommandline wrapper with the Python subprocess module (and Bio.AlignIO for parsing) as described in the tutorial.", PendingDeprecationWarning)
135
136
137
138
139 child_process = subprocess.Popen(str(command_line),
140 stdin=subprocess.PIPE,
141 stdout=subprocess.PIPE,
142 stderr=subprocess.PIPE,
143 universal_newlines=True,
144 shell=(sys.platform!="win32")
145 )
146
147 child_process.communicate()
148 value = child_process.returncode
149
150
151
152
153
154
155 if value == 1:
156 raise ValueError("Bad command line option in the command: %s"
157 % str(command_line))
158
159 elif value == 2:
160 raise IOError("Cannot open sequence file %s"
161 % command_line.sequence_file)
162
163 elif value == 3:
164 raise IOError("Sequence file %s has an invalid format."
165 % command_line.sequence_file)
166
167 elif value == 4:
168 raise IOError("Sequence file %s has only one sequence present."
169 % command_line.sequence_file)
170
171
172 if command_line.output_file:
173 out_file = command_line.output_file
174 else:
175 out_file = os.path.splitext(command_line.sequence_file)[0] + '.aln'
176
177
178 if command_line.output_type and command_line.output_type != 'CLUSTAL':
179 return None
180
181 else:
182 if not alphabet:
183 alphabet = (IUPAC.unambiguous_dna, IUPAC.protein)[
184 command_line.type == 'PROTEIN']
185
186
187 if not(os.path.exists(out_file)):
188 raise IOError("Output .aln file %s not produced, commandline: %s"
189 % (out_file, command_line))
190
191 return parse_file(out_file, alphabet)
192
193
195 """Work with the clustal aligment format (OBSOLETE).
196
197 This format is the default output from clustal -- these files normally
198 have an extension of .aln.
199
200 This obsolete alignment object is a subclass of the more general alignment
201 object used in Bio.AlignIO. The old practical difference is here str(align)
202 would give the alignment as a string in clustal format, whereas in general
203 you must do align.format("clustal"), which supports other formats too.
204 """
205
206 import warnings
207 warnings.warn("This class is obsolete.", PendingDeprecationWarning)
208 DEFAULT_VERSION = '1.81'
209
215
217 """Print out the alignment so it looks pretty.
218
219 The output produced from this should also be formatted in valid
220 clustal format.
221 """
222 return self.format("clustal")
223
225 """Represent a clustalw multiple alignment command line (OBSOLETE).
226
227 This command line wrapper is considerd obsolete. Please use the replacement
228 Bio.Align.Applications.ClustalwCommandline wrapper instead, which uses the
229 standardised Bio.Application style interface. This is described in the
230 tutorial, with examples using ClustalW.
231 """
232 import warnings
233 warnings.warn("This command line wrapper is considerd obsolete. Please use the replacement Bio.Align.Applications.ClustalwCommandline wrapper instead, which uses the standardised Bio.Application style interface. This is described in the tutorial, with examples using ClustalW.", PendingDeprecationWarning)
234
235 OUTPUT_TYPES = ['GCG', 'GDE', 'PHYLIP', 'PIR', 'NEXUS', 'FASTA']
236 OUTPUT_ORDER = ['INPUT', 'ALIGNED']
237 OUTPUT_CASE = ['LOWER', 'UPPER']
238 OUTPUT_SEQNOS = ['OFF', 'ON']
239 RESIDUE_TYPES = ['PROTEIN', 'DNA']
240 PROTEIN_MATRIX = ['BLOSUM', 'PAM', 'GONNET', 'ID']
241 DNA_MATRIX = ['IUB', 'CLUSTALW']
242
243 - def __init__(self, sequence_file, command = 'clustalw'):
244 """Initialize some general parameters that can be set as attributes.
245
246 Arguments:
247 o sequence_file - The file to read the sequences for alignment from.
248 o command - The command used to run clustalw. This defaults to
249 just 'clustalw' (ie. assumes you have it on your path somewhere).
250
251 General attributes that can be set:
252 o is_quick - if set as 1, will use a fast algorithm to create
253 the alignment guide tree.
254 o allow_negative - allow negative values in the alignment matrix.
255
256 Multiple alignment attributes that can be set as attributes:
257 o gap_open_pen - Gap opening penalty
258 o gap_ext_pen - Gap extension penalty
259 o is_no_end_pen - A flag as to whether or not there should be a gap
260 separation penalty for the ends.
261 o gap_sep_range - The gap separation penalty range.
262 o is_no_pgap - A flag to turn off residue specific gaps
263 o is_no_hgap - A flag to turn off hydrophilic gaps
264 o h_gap_residues - A list of residues to count a hydrophilic
265 o max_div - A percent identity to use for delay (? - I don't undertand
266 this!)
267 o trans_weight - The weight to use for transitions
268 """
269 self.sequence_file = sequence_file
270 self.command = command
271
272 self.is_quick = None
273 self.allow_negative = None
274
275 self.gap_open_pen = None
276 self.gap_ext_pen = None
277 self.is_no_end_pen = None
278 self.gap_sep_range = None
279 self.is_no_pgap = None
280 self.is_no_hgap = None
281 self.h_gap_residues = []
282 self.max_div = None
283 self.trans_weight = None
284
285
286
287 self.output_file = None
288 self.output_type = None
289 self.output_order = None
290 self.change_case = None
291 self.add_seqnos = None
292
293
294 self.guide_tree = None
295 self.new_tree = None
296
297
298 self.protein_matrix = None
299 self.dna_matrix = None
300
301
302 self.type = None
303
305 """Write out the command line as a string."""
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350 cline = _escape_filename(self.command)
351 cline += ' -INFILE=%s' % _escape_filename(self.sequence_file)
352
353
354 if self.type:
355 cline += " -TYPE=%s" % self.type
356 if self.is_quick == 1:
357
358
359 cline += " -quicktree"
360 if self.allow_negative == 1:
361 cline += " -NEGATIVE"
362
363
364 if self.output_file:
365 cline += " -OUTFILE=%s" % _escape_filename(self.output_file)
366 if self.output_type:
367 cline += " -OUTPUT=%s" % self.output_type
368 if self.output_order:
369 cline += " -OUTORDER=%s" % self.output_order
370 if self.change_case:
371 cline += " -CASE=%s" % self.change_case
372 if self.add_seqnos:
373 cline += " -SEQNOS=%s" % self.add_seqnos
374 if self.new_tree:
375
376 cline += " -NEWTREE=%s -align" % _escape_filename(self.new_tree)
377
378
379 if self.guide_tree:
380 cline += " -USETREE=%s" % _escape_filename(self.guide_tree)
381 if self.protein_matrix:
382 cline += " -MATRIX=%s" % self.protein_matrix
383 if self.dna_matrix:
384 cline += " -DNAMATRIX=%s" % self.dna_matrix
385 if self.gap_open_pen:
386 cline += " -GAPOPEN=%s" % self.gap_open_pen
387 if self.gap_ext_pen:
388 cline += " -GAPEXT=%s" % self.gap_ext_pen
389 if self.is_no_end_pen == 1:
390 cline += " -ENDGAPS"
391 if self.gap_sep_range:
392 cline += " -GAPDIST=%s" % self.gap_sep_range
393 if self.is_no_pgap == 1:
394 cline += " -NOPGAP"
395 if self.is_no_hgap == 1:
396 cline += " -NOHGAP"
397 if len(self.h_gap_residues) != 0:
398
399 residue_list = ''
400 for residue in self.h_gap_residues:
401 residue_list = residue_list + residue
402 cline += " -HGAPRESIDUES=%s" % residue_list
403 if self.max_div:
404 cline += " -MAXDIV=%s" % self.max_div
405 if self.trans_weight:
406 cline += " -TRANSWEIGHT=%s" % self.trans_weight
407
408 return cline
409
410 - def set_output(self, output_file, output_type = None, output_order = None,
411 change_case = None, add_seqnos = None):
412 """Set the output parameters for the command line.
413 """
414 self.output_file = output_file
415
416 if output_type:
417 output_type = output_type.upper()
418 if output_type not in self.OUTPUT_TYPES:
419 raise ValueError("Invalid output type %s. Valid choices are %s"
420 % (output_type, self.OUTPUT_TYPES))
421 else:
422 self.output_type = output_type
423
424 if output_order:
425 output_order = output_order.upper()
426 if output_order not in self.OUTPUT_ORDER:
427 raise ValueError("Invalid output order %s. Valid choices are %s"
428 % (output_order, self.OUTPUT_ORDER))
429 else:
430 self.output_order = output_order
431
432 if change_case:
433 change_case = change_case.upper()
434 if output_type != "GDE":
435 raise ValueError("Change case only valid for GDE output.")
436 elif change_case not in self.CHANGE_CASE:
437 raise ValueError("Invalid change case %s. Valid choices are %s"
438 % (change_case, self.CHANGE_CASE))
439 else:
440 self.change_case = change_case
441
442 if add_seqnos:
443 add_seqnos = add_seqnos.upper()
444 if output_type:
445 raise ValueError("Add SeqNos only valid for CLUSTAL output.")
446 elif add_seqnos not in self.OUTPUT_SEQNOS:
447 raise ValueError("Invalid seqnos option %s. Valid choices: %s"
448 % (add_seqnos, self.OUTPUT_SEQNOS))
449 else:
450 self.add_seqnos = add_seqnos
451
453 """Provide a file to use as the guide tree for alignment.
454
455 Raises:
456 o IOError - If the tree_file doesn't exist."""
457 if not(os.path.exists(tree_file)):
458 raise IOError("Could not find the guide tree file %s." %
459 tree_file)
460 else:
461 self.guide_tree = tree_file
462
464 """Set the name of the guide tree file generated in the alignment.
465 """
466 self.new_tree = tree_file
467
469 """Set the type of protein matrix to use.
470
471 Protein matrix can be either one of the defined types (blosum, pam,
472 gonnet or id) or a file with your own defined matrix.
473 """
474 if protein_matrix.upper() in self.PROTEIN_MATRIX:
475 self.protein_matrix = protein_matrix.upper()
476 elif os.path.exists(protein_matrix):
477 self.protein_matrix = protein_matrix
478 else:
479 raise ValueError("Invalid matrix %s. Options are %s or a file." %
480 (protein_matrix.upper(), self.PROTEIN_MATRIX))
481
483 """Set the type of DNA matrix to use.
484
485 The dna_matrix can either be one of the defined types (iub or clustalw)
486 or a file with the matrix to use."""
487 if dna_matrix.upper() in self.DNA_MATRIX:
488 self.dna_matrix = dna_matrix.upper()
489 elif os.path.exists(dna_matrix):
490 self.dna_matrix = dna_matrix
491 else:
492 raise ValueError("Invalid matrix %s. Options are %s or a file." %
493 (dna_matrix, self.DNA_MATRIX))
494
496 """Set the type of residues within the file.
497
498 Clustal tries to guess whether the info is protein or DNA based on
499 the number of GATCs, but this can be wrong if you have a messed up
500 protein or DNA you are working with, so this allows you to set it
501 explicitly.
502 """
503 residue_type = residue_type.upper()
504 if residue_type in self.RESIDUE_TYPES:
505 self.type = residue_type
506 else:
507 raise ValueError("Invalid residue type %s. Valid choices are %s"
508 % (residue_type, self.RESIDUE_TYPES))
509
511 """Run the Bio.Clustalw module's doctests (PRIVATE).
512
513 This will try and locate the unit tests directory, and run the doctests
514 from there in order that the relative paths used in the examples work.
515 """
516 import doctest
517 import os
518 if os.path.isdir(os.path.join("..","..","Tests")):
519 print "Runing doctests..."
520 cur_dir = os.path.abspath(os.curdir)
521 os.chdir(os.path.join("..","..","Tests"))
522 doctest.testmod()
523 os.chdir(cur_dir)
524 del cur_dir
525 print "Done"
526
527 if __name__ == "__main__":
528 _test()
529