Package Bio :: Package Clustalw
[hide private]
[frames] | no frames]

Source Code for Package Bio.Clustalw

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  """Code for calling ClustalW and parsing its output (DEPRECATED). 
  5   
  6  This module has been superseded by the Bio.AlignIO framework for 
  7  alignment parsing, and the ClustalW command line wrapper in 
  8  Bio.Align.Applications for calling the tool. These are both described 
  9  in the current version of the Biopython Tutorial and Cookbook. 
 10  This means Bio.Clustalw is now deprecated and likely to be 
 11  removed in future releases of Biopython. 
 12   
 13  A set of classes to interact with the multiple alignment command 
 14  line program clustalw.  
 15   
 16  Clustalw is the command line version of the graphical Clustalx  
 17  aligment program. 
 18   
 19  This requires clustalw available from: 
 20   
 21  ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalW/. 
 22   
 23  functions: 
 24  o read 
 25  o parse_file 
 26  o do_alignment 
 27   
 28  classes: 
 29  o ClustalAlignment 
 30  o MultipleAlignCL""" 
 31   
 32  import Bio 
 33  import warnings 
 34  warnings.warn("Bio.Clustalw is deprecated. Please use the Bio.AlignIO framework for alignment parsing, and the ClustalW command line wrapper in Bio.Align.Applications for calling the tool. These are both described in the current version of the Biopython Tutorial and Cookbook.", Bio.BiopythonDeprecationWarning) 
 35   
 36  # standard library 
 37  import os 
 38  import sys 
 39  import subprocess 
 40   
 41  # biopython 
 42  from Bio import Alphabet 
 43  from Bio.Alphabet import IUPAC 
 44  from Bio.Align.Generic import Alignment 
 45  from Bio.Application import _escape_filename 
 46   
47 -def parse_file(file_name, alphabet = IUPAC.unambiguous_dna, debug_level = 0):
48 """Parse the given file into a clustal aligment object (OBSOLETE). 49 50 Arguments: 51 o file_name - The name of the file to parse. 52 o alphabet - The type of alphabet to use for the alignment sequences. 53 This should correspond to the type of information contained in the file. 54 Defaults to be unambiguous_dna sequence. 55 56 There is a deprecated optional argument debug_level which has no effect. 57 58 This function is obsolete, and any new code should call Bio.AlignIO 59 instead. For example using Bio.Clustalw, you might have: 60 61 >>> from Bio import Clustalw 62 >>> from Bio import Alphabet 63 >>> filename = "Clustalw/protein.aln" 64 >>> alpha = Alphabet.Gapped(Alphabet.generic_protein) 65 >>> align = Clustalw.parse_file(filename, alphabet=alpha) 66 >>> print align.get_alignment_length() 67 411 68 >>> clustalw_string = str(align) 69 70 This becomes: 71 72 >>> from Bio import AlignIO 73 >>> from Bio import Alphabet 74 >>> filename = "Clustalw/protein.aln" 75 >>> alpha = Alphabet.Gapped(Alphabet.generic_protein) 76 >>> align = AlignIO.read(open(filename), "clustal", alphabet=alpha) 77 >>> print align.get_alignment_length() 78 411 79 >>> assert clustalw_string == align.format("clustal") 80 """ 81 82 import warnings 83 warnings.warn("This function is obsolete, and any new code should call Bio.AlignIO instead.", PendingDeprecationWarning) 84 # Avoid code duplication by calling Bio.AlignIO to do this for us. 85 handle = open(file_name, 'r') 86 from Bio import AlignIO 87 generic_alignment = AlignIO.read(handle, "clustal") 88 handle.close() 89 90 #Force this generic alignment into a ClustalAlignment... nasty hack 91 if isinstance(alphabet, Alphabet.Gapped): 92 alpha = alphabet 93 else: 94 alpha = Alphabet.Gapped(alphabet) 95 clustal_alignment = ClustalAlignment(alpha) 96 clustal_alignment._records = generic_alignment._records 97 for record in clustal_alignment._records: 98 record.seq.alphabet = alpha 99 100 try: 101 clustal_alignment._version = generic_alignment._version 102 except AttributeError: 103 #Missing the version, could be a 3rd party tool's output 104 pass 105 106 try : 107 clustal_alignment._star_info = generic_alignment._star_info 108 except AttributeError: 109 #Missing the consensus, again, this is not always present 110 pass 111 112 return clustal_alignment
113
114 -def do_alignment(command_line, alphabet=None):
115 """Perform an alignment with the given command line (OBSOLETE). 116 117 Arguments: 118 o command_line - A command line object that can give out 119 the command line we will input into clustalw. 120 o alphabet - the alphabet to use in the created alignment. If not 121 specified IUPAC.unambiguous_dna and IUPAC.protein will be used for 122 dna and protein alignment respectively. 123 124 Returns: 125 o A clustal alignment object corresponding to the created alignment. 126 If the alignment type was not a clustal object, None is returned. 127 128 This function (and the associated command line object) are now obsolete. 129 Please use the Bio.Align.Applications.ClustalwCommandline wrapper with 130 the Python subprocess module (and Bio.AlignIO for parsing) as described 131 in the tutorial. 132 """ 133 import warnings 134 warnings.warn("This function (and the associated command line object) are now obsolete. Please use the Bio.Align.Applications.ClustalwCommandline wrapper with the Python subprocess module (and Bio.AlignIO for parsing) as described in the tutorial.", PendingDeprecationWarning) 135 #We don't need to supply any piped input, but we setup the 136 #standard input pipe anyway as a work around for a python 137 #bug if this is called from a Windows GUI program. For 138 #details, see http://bugs.python.org/issue1124861 139 child_process = subprocess.Popen(str(command_line), 140 stdin=subprocess.PIPE, 141 stdout=subprocess.PIPE, 142 stderr=subprocess.PIPE, 143 universal_newlines=True, 144 shell=(sys.platform!="win32") 145 ) 146 #Use .communicate as can get deadlocks with .wait(), see Bug 2804 147 child_process.communicate() #ignore the stdout and strerr data 148 value = child_process.returncode 149 150 # check the return value for errors, as on 1.81 the return value 151 # from Clustalw is actually helpful for figuring out errors 152 # TODO - Update this for new error codes using in clustalw 2 153 154 # 1 => bad command line option 155 if value == 1: 156 raise ValueError("Bad command line option in the command: %s" 157 % str(command_line)) 158 # 2 => can't open sequence file 159 elif value == 2: 160 raise IOError("Cannot open sequence file %s" 161 % command_line.sequence_file) 162 # 3 => wrong format in sequence file 163 elif value == 3: 164 raise IOError("Sequence file %s has an invalid format." 165 % command_line.sequence_file) 166 # 4 => sequence file only has one sequence 167 elif value == 4: 168 raise IOError("Sequence file %s has only one sequence present." 169 % command_line.sequence_file) 170 171 # if an output file was specified, we need to grab it 172 if command_line.output_file: 173 out_file = command_line.output_file 174 else: 175 out_file = os.path.splitext(command_line.sequence_file)[0] + '.aln' 176 177 # if we can't deal with the format, just return None 178 if command_line.output_type and command_line.output_type != 'CLUSTAL': 179 return None 180 # otherwise parse it into a ClustalAlignment object 181 else: 182 if not alphabet: 183 alphabet = (IUPAC.unambiguous_dna, IUPAC.protein)[ 184 command_line.type == 'PROTEIN'] 185 186 # check if the outfile exists before parsing 187 if not(os.path.exists(out_file)): 188 raise IOError("Output .aln file %s not produced, commandline: %s" 189 % (out_file, command_line)) 190 191 return parse_file(out_file, alphabet)
192 193
194 -class ClustalAlignment(Alignment):
195 """Work with the clustal aligment format (OBSOLETE). 196 197 This format is the default output from clustal -- these files normally 198 have an extension of .aln. 199 200 This obsolete alignment object is a subclass of the more general alignment 201 object used in Bio.AlignIO. The old practical difference is here str(align) 202 would give the alignment as a string in clustal format, whereas in general 203 you must do align.format("clustal"), which supports other formats too. 204 """ 205 # the default version to use if one isn't set 206 import warnings 207 warnings.warn("This class is obsolete.", PendingDeprecationWarning) 208 DEFAULT_VERSION = '1.81' 209
210 - def __init__(self, alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)):
211 Alignment.__init__(self, alphabet) 212 # represent all of those stars in the aln output format 213 self._star_info = '' 214 self._version = ''
215
216 - def __str__(self):
217 """Print out the alignment so it looks pretty. 218 219 The output produced from this should also be formatted in valid 220 clustal format. 221 """ 222 return self.format("clustal")
223
224 -class MultipleAlignCL:
225 """Represent a clustalw multiple alignment command line (OBSOLETE). 226 227 This command line wrapper is considerd obsolete. Please use the replacement 228 Bio.Align.Applications.ClustalwCommandline wrapper instead, which uses the 229 standardised Bio.Application style interface. This is described in the 230 tutorial, with examples using ClustalW. 231 """ 232 import warnings 233 warnings.warn("This command line wrapper is considerd obsolete. Please use the replacement Bio.Align.Applications.ClustalwCommandline wrapper instead, which uses the standardised Bio.Application style interface. This is described in the tutorial, with examples using ClustalW.", PendingDeprecationWarning) 234 # set the valid options for different parameters 235 OUTPUT_TYPES = ['GCG', 'GDE', 'PHYLIP', 'PIR', 'NEXUS', 'FASTA'] 236 OUTPUT_ORDER = ['INPUT', 'ALIGNED'] 237 OUTPUT_CASE = ['LOWER', 'UPPER'] 238 OUTPUT_SEQNOS = ['OFF', 'ON'] 239 RESIDUE_TYPES = ['PROTEIN', 'DNA'] 240 PROTEIN_MATRIX = ['BLOSUM', 'PAM', 'GONNET', 'ID'] 241 DNA_MATRIX = ['IUB', 'CLUSTALW'] 242
243 - def __init__(self, sequence_file, command = 'clustalw'):
244 """Initialize some general parameters that can be set as attributes. 245 246 Arguments: 247 o sequence_file - The file to read the sequences for alignment from. 248 o command - The command used to run clustalw. This defaults to 249 just 'clustalw' (ie. assumes you have it on your path somewhere). 250 251 General attributes that can be set: 252 o is_quick - if set as 1, will use a fast algorithm to create 253 the alignment guide tree. 254 o allow_negative - allow negative values in the alignment matrix. 255 256 Multiple alignment attributes that can be set as attributes: 257 o gap_open_pen - Gap opening penalty 258 o gap_ext_pen - Gap extension penalty 259 o is_no_end_pen - A flag as to whether or not there should be a gap 260 separation penalty for the ends. 261 o gap_sep_range - The gap separation penalty range. 262 o is_no_pgap - A flag to turn off residue specific gaps 263 o is_no_hgap - A flag to turn off hydrophilic gaps 264 o h_gap_residues - A list of residues to count a hydrophilic 265 o max_div - A percent identity to use for delay (? - I don't undertand 266 this!) 267 o trans_weight - The weight to use for transitions 268 """ 269 self.sequence_file = sequence_file 270 self.command = command 271 272 self.is_quick = None 273 self.allow_negative = None 274 275 self.gap_open_pen = None 276 self.gap_ext_pen = None 277 self.is_no_end_pen = None 278 self.gap_sep_range = None 279 self.is_no_pgap = None 280 self.is_no_hgap = None 281 self.h_gap_residues = [] 282 self.max_div = None 283 self.trans_weight = None 284 285 # other attributes that should be set via various functions 286 # 1. output parameters 287 self.output_file = None 288 self.output_type = None 289 self.output_order = None 290 self.change_case = None 291 self.add_seqnos = None 292 293 # 2. a guide tree to use 294 self.guide_tree = None 295 self.new_tree = None 296 297 # 3. matrices 298 self.protein_matrix = None 299 self.dna_matrix = None 300 301 # 4. type of residues 302 self.type = None
303
304 - def __str__(self):
305 """Write out the command line as a string.""" 306 307 #On Linux with clustalw 1.83, you can do: 308 #clustalw input.faa 309 #clustalw /full/path/input.faa 310 #clustalw -INFILE=input.faa 311 #clustalw -INFILE=/full/path/input.faa 312 # 313 #Note these fail (using DOS style slashes): 314 # 315 #clustalw /INFILE=input.faa 316 #clustalw /INFILE=/full/path/input.faa 317 # 318 #On Windows XP with clustalw.exe 1.83, these work at 319 #the command prompt: 320 # 321 #clustalw.exe input.faa 322 #clustalw.exe /INFILE=input.faa 323 #clustalw.exe /INFILE="input.faa" 324 #clustalw.exe /INFILE="with space.faa" 325 #clustalw.exe /INFILE=C:\full\path\input.faa 326 #clustalw.exe /INFILE="C:\full path\with spaces.faa" 327 # 328 #Sadly these fail: 329 #clustalw.exe "input.faa" 330 #clustalw.exe "with space.faa" 331 #clustalw.exe C:\full\path\input.faa 332 #clustalw.exe "C:\full path\with spaces.faa" 333 # 334 #Testing today (using a different binary of clustalw.exe 1.83), 335 #using -INFILE as follows seems to work. However I had once noted: 336 #These also fail but a minus/dash does seem to 337 #work with other options (!): 338 #clustalw.exe -INFILE=input.faa 339 #clustalw.exe -INFILE=C:\full\path\input.faa 340 # 341 #Also these fail: 342 #clustalw.exe "/INFILE=input.faa" 343 #clustalw.exe "/INFILE=C:\full\path\input.faa" 344 # 345 #Thanks to Emanuel Hey for flagging this on the mailing list. 346 # 347 #In addition, both self.command and self.sequence_file 348 #may contain spaces, so should be quoted. But clustalw 349 #is fussy. 350 cline = _escape_filename(self.command) 351 cline += ' -INFILE=%s' % _escape_filename(self.sequence_file) 352 353 # general options 354 if self.type: 355 cline += " -TYPE=%s" % self.type 356 if self.is_quick == 1: 357 #Some versions of clustalw are case sensitive, 358 #and require -quicktree rather than -QUICKTREE 359 cline += " -quicktree" 360 if self.allow_negative == 1: 361 cline += " -NEGATIVE" 362 363 # output options 364 if self.output_file: 365 cline += " -OUTFILE=%s" % _escape_filename(self.output_file) 366 if self.output_type: 367 cline += " -OUTPUT=%s" % self.output_type 368 if self.output_order: 369 cline += " -OUTORDER=%s" % self.output_order 370 if self.change_case: 371 cline += " -CASE=%s" % self.change_case 372 if self.add_seqnos: 373 cline += " -SEQNOS=%s" % self.add_seqnos 374 if self.new_tree: 375 # clustal does not work if -align is written -ALIGN 376 cline += " -NEWTREE=%s -align" % _escape_filename(self.new_tree) 377 378 # multiple alignment options 379 if self.guide_tree: 380 cline += " -USETREE=%s" % _escape_filename(self.guide_tree) 381 if self.protein_matrix: 382 cline += " -MATRIX=%s" % self.protein_matrix 383 if self.dna_matrix: 384 cline += " -DNAMATRIX=%s" % self.dna_matrix 385 if self.gap_open_pen: 386 cline += " -GAPOPEN=%s" % self.gap_open_pen 387 if self.gap_ext_pen: 388 cline += " -GAPEXT=%s" % self.gap_ext_pen 389 if self.is_no_end_pen == 1: 390 cline += " -ENDGAPS" 391 if self.gap_sep_range: 392 cline += " -GAPDIST=%s" % self.gap_sep_range 393 if self.is_no_pgap == 1: 394 cline += " -NOPGAP" 395 if self.is_no_hgap == 1: 396 cline += " -NOHGAP" 397 if len(self.h_gap_residues) != 0: 398 # stick the list of residues together as one big list o' residues 399 residue_list = '' 400 for residue in self.h_gap_residues: 401 residue_list = residue_list + residue 402 cline += " -HGAPRESIDUES=%s" % residue_list 403 if self.max_div: 404 cline += " -MAXDIV=%s" % self.max_div 405 if self.trans_weight: 406 cline += " -TRANSWEIGHT=%s" % self.trans_weight 407 408 return cline
409
410 - def set_output(self, output_file, output_type = None, output_order = None, 411 change_case = None, add_seqnos = None):
412 """Set the output parameters for the command line. 413 """ 414 self.output_file = output_file 415 416 if output_type: 417 output_type = output_type.upper() 418 if output_type not in self.OUTPUT_TYPES: 419 raise ValueError("Invalid output type %s. Valid choices are %s" 420 % (output_type, self.OUTPUT_TYPES)) 421 else: 422 self.output_type = output_type 423 424 if output_order: 425 output_order = output_order.upper() 426 if output_order not in self.OUTPUT_ORDER: 427 raise ValueError("Invalid output order %s. Valid choices are %s" 428 % (output_order, self.OUTPUT_ORDER)) 429 else: 430 self.output_order = output_order 431 432 if change_case: 433 change_case = change_case.upper() 434 if output_type != "GDE": 435 raise ValueError("Change case only valid for GDE output.") 436 elif change_case not in self.CHANGE_CASE: 437 raise ValueError("Invalid change case %s. Valid choices are %s" 438 % (change_case, self.CHANGE_CASE)) 439 else: 440 self.change_case = change_case 441 442 if add_seqnos: 443 add_seqnos = add_seqnos.upper() 444 if output_type: 445 raise ValueError("Add SeqNos only valid for CLUSTAL output.") 446 elif add_seqnos not in self.OUTPUT_SEQNOS: 447 raise ValueError("Invalid seqnos option %s. Valid choices: %s" 448 % (add_seqnos, self.OUTPUT_SEQNOS)) 449 else: 450 self.add_seqnos = add_seqnos
451
452 - def set_guide_tree(self, tree_file):
453 """Provide a file to use as the guide tree for alignment. 454 455 Raises: 456 o IOError - If the tree_file doesn't exist.""" 457 if not(os.path.exists(tree_file)): 458 raise IOError("Could not find the guide tree file %s." % 459 tree_file) 460 else: 461 self.guide_tree = tree_file
462
463 - def set_new_guide_tree(self, tree_file):
464 """Set the name of the guide tree file generated in the alignment. 465 """ 466 self.new_tree = tree_file
467
468 - def set_protein_matrix(self, protein_matrix):
469 """Set the type of protein matrix to use. 470 471 Protein matrix can be either one of the defined types (blosum, pam, 472 gonnet or id) or a file with your own defined matrix. 473 """ 474 if protein_matrix.upper() in self.PROTEIN_MATRIX: 475 self.protein_matrix = protein_matrix.upper() 476 elif os.path.exists(protein_matrix): 477 self.protein_matrix = protein_matrix 478 else: 479 raise ValueError("Invalid matrix %s. Options are %s or a file." % 480 (protein_matrix.upper(), self.PROTEIN_MATRIX))
481
482 - def set_dna_matrix(self, dna_matrix):
483 """Set the type of DNA matrix to use. 484 485 The dna_matrix can either be one of the defined types (iub or clustalw) 486 or a file with the matrix to use.""" 487 if dna_matrix.upper() in self.DNA_MATRIX: 488 self.dna_matrix = dna_matrix.upper() 489 elif os.path.exists(dna_matrix): 490 self.dna_matrix = dna_matrix 491 else: 492 raise ValueError("Invalid matrix %s. Options are %s or a file." % 493 (dna_matrix, self.DNA_MATRIX))
494
495 - def set_type(self, residue_type):
496 """Set the type of residues within the file. 497 498 Clustal tries to guess whether the info is protein or DNA based on 499 the number of GATCs, but this can be wrong if you have a messed up 500 protein or DNA you are working with, so this allows you to set it 501 explicitly. 502 """ 503 residue_type = residue_type.upper() 504 if residue_type in self.RESIDUE_TYPES: 505 self.type = residue_type 506 else: 507 raise ValueError("Invalid residue type %s. Valid choices are %s" 508 % (residue_type, self.RESIDUE_TYPES))
509
510 -def _test():
511 """Run the Bio.Clustalw module's doctests (PRIVATE). 512 513 This will try and locate the unit tests directory, and run the doctests 514 from there in order that the relative paths used in the examples work. 515 """ 516 import doctest 517 import os 518 if os.path.isdir(os.path.join("..","..","Tests")): 519 print "Runing doctests..." 520 cur_dir = os.path.abspath(os.curdir) 521 os.chdir(os.path.join("..","..","Tests")) 522 doctest.testmod() 523 os.chdir(cur_dir) 524 del cur_dir 525 print "Done"
526 527 if __name__ == "__main__": 528 _test() 529