1
2
3
4
5 """Command line wrapper for the multiple alignment program MUSCLE.
6
7 http://www.drive5.com/muscle/
8
9 Citations:
10
11 Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high accuracy
12 and high throughput, Nucleic Acids Research 32(5), 1792-97.
13
14 Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with reduced
15 time and space complexity. BMC Bioinformatics 5(1): 113.
16
17 Last checked against version: 3.7, briefly against 3.8
18 """
19
20 from Bio.Application import _Option, _Switch, AbstractCommandline
21
23 """Command line wrapper for the multiple alignment program MUSCLE."""
def __init__(self, cmd="muscle", **kwargs):
    """Declare the MUSCLE command line parameters and initialise the wrapper.

    Builds ``self.parameters`` (a list of ``_Option`` and ``_Switch``
    objects, in the order given below) and then delegates to
    ``AbstractCommandline.__init__`` with ``cmd`` and any keyword arguments.

    Arguments:
     - cmd - name or path of the MUSCLE executable (default "muscle").
     - kwargs - passed unchanged to ``AbstractCommandline.__init__``.
    """
    # Allowed values for the enumerated options; the checker lambdas below
    # close over these local lists.
    CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"]
    DISTANCE_MEASURES_ITER1 = ["kmer6_6", "kmer20_3", "kmer20_4", "kbit20_3",
                               "kmer4_6"]
    DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + \
        ["pctid_kimura", "pctid_log"]
    OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"]
    TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"]
    SEQUENCE_TYPES = ["protein", "nucleo", "auto"]
    WEIGHTING_SCHEMES = ["none", "clustalw", "henikoff", "henikoffpb",
                         "gsc", "threeway"]
    # NOTE(review): the positional arguments to _Option are presumably
    # (names, type tags, checker function, is_required, description, equate)
    # - confirm against Bio.Application.
    # NOTE: the isinstance(x, float) checkers return False for plain ints,
    # and the isinstance(x, int) checkers return False for floats.
    self.parameters = \
        [
        # Input / output files and alignment mode
        _Option(["-in", "in", "input"], ["input", "file"],
                None, 0, "Input filename",
                0),
        _Option(["-out", "out"], ["output", "file"],
                None, 0, "Output filename",
                0),
        _Switch(["-diags", "diags"], ["input"],
                "Find diagonals (faster for similar sequences)"),
        _Switch(["-profile", "profile"], ["input"],
                "Perform a profile alignment"),
        _Option(["-in1", "in1"], ["input", "file"],
                None, 0,
                "First input filename for profile alignment",
                0),
        _Option(["-in2", "in2"], ["input", "file"],
                None, 0,
                "Second input filename for a profile alignment",
                0),
        # Value options
        _Option(["-anchorspacing", "anchorspacing"], ["input"],
                lambda x: isinstance(x, int),
                0,
                "Minimum spacing between anchor columns",
                0),
        _Option(["-center", "center"], ["input"],
                lambda x: isinstance(x, float),
                0,
                "Center parameter - should be negative",
                0),
        _Option(["-cluster1", "cluster1"], ["input"],
                lambda x: x in CLUSTERING_ALGORITHMS, 0,
                "Clustering method used in iteration 1",
                0),
        _Option(["-cluster2", "cluster2"], ["input"],
                lambda x: x in CLUSTERING_ALGORITHMS, 0,
                "Clustering method used in iteration 2",
                0),
        _Option(["-diaglength", "diaglength"], ["input"],
                lambda x: isinstance(x, int),
                0,
                "Minimum length of diagonal",
                0),
        _Option(["-diagmargin", "diagmargin"], ["input"],
                lambda x: isinstance(x, int),
                0,
                "Discard this many positions at ends of diagonal",
                0),
        _Option(["-distance1", "distance1"], ["input"],
                lambda x: x in DISTANCE_MEASURES_ITER1,
                0,
                "Distance measure for iteration 1",
                0),
        _Option(["-distance2", "distance2"], ["input"],
                lambda x: x in DISTANCE_MEASURES_ITER2,
                0,
                "Distance measure for iteration 2",
                0),
        _Option(["-gapopen", "gapopen"], ["input"],
                lambda x: isinstance(x, float),
                0,
                "Gap open score - negative number",
                0),
        _Option(["-hydro", "hydro"], ["input"],
                lambda x: isinstance(x, int),
                0,
                "Window size for hydrophobic region",
                0),
        _Option(["-hydrofactor", "hydrofactor"], ["input"],
                lambda x: isinstance(x, float),
                0,
                "Multiplier for gap penalties in hydrophobic regions",
                0),
        _Option(["-log", "log"], ["output", "file"],
                None, 0,
                "Log file name",
                0),
        _Option(["-loga", "loga"], ["output", "file"],
                None, 0,
                "Log file name (append to existing file)",
                0),
        _Option(["-maxdiagbreak", "maxdiagbreak"], ["input"],
                lambda x: isinstance(x, int),
                0,
                "Maximum distance between two diagonals that allows "
                "them to merge into one diagonal",
                0),
        _Option(["-maxhours", "maxhours"], ["input"],
                lambda x: isinstance(x, float),
                0,
                "Maximum time to run in hours",
                0),
        _Option(["-maxiters", "maxiters"], ["input"],
                lambda x: isinstance(x, int),
                0,
                "Maximum number of iterations",
                0),
        _Option(["-maxtrees", "maxtrees"], ["input"],
                lambda x: isinstance(x, int),
                0,
                "Maximum number of trees to build in iteration 2",
                0),
        _Option(["-minbestcolscore", "minbestcolscore"], ["input"],
                lambda x: isinstance(x, float),
                0,
                "Minimum score a column must have to be an anchor",
                0),
        _Option(["-minsmoothscore", "minsmoothscore"], ["input"],
                lambda x: isinstance(x, float),
                0,
                "Minimum smoothed score a column must have to "
                "be an anchor",
                0),
        _Option(["-objscore", "objscore"], ["input"],
                lambda x: x in OBJECTIVE_SCORES,
                0,
                "Objective score used by tree dependent refinement",
                0),
        _Option(["-root1", "root1"], ["input"],
                lambda x: x in TREE_ROOT_METHODS,
                0,
                "Method used to root tree in iteration 1",
                0),
        _Option(["-root2", "root2"], ["input"],
                lambda x: x in TREE_ROOT_METHODS,
                0,
                "Method used to root tree in iteration 2",
                0),
        _Option(["-seqtype", "seqtype"], ["input"],
                lambda x: x in SEQUENCE_TYPES,
                0,
                "Sequence type",
                0),
        _Option(["-smoothscoreceil", "smoothscoreceil"], ["input"],
                lambda x: isinstance(x, float),
                0,
                "Maximum value of column score for smoothing",
                0),
        _Option(["-smoothwindow", "smoothwindow"], ["input"],
                lambda x: isinstance(x, int),
                0,
                "Window used for anchor column smoothing",
                0),
        _Option(["-sueff", "sueff"], ["input"],
                lambda x: isinstance(x, float),
                0,
                "Constant used in UPGMB clustering",
                0),
        # NOTE(review): tree1/tree2 look like output filenames but are
        # tagged ["input"]; left unchanged here - confirm intended tags.
        _Option(["-tree1", "tree1"], ["input"],
                None, 0,
                "Save Newick tree from iteration 1",
                0),
        _Option(["-tree2", "tree2"], ["input"],
                None, 0,
                "Save Newick tree from iteration 2",
                0),
        _Option(["-weight1", "weight1"], ["input"],
                lambda x: x in WEIGHTING_SCHEMES,
                0,
                "Weighting scheme used in iteration 1",
                0),
        _Option(["-weight2", "weight2"], ["input"],
                lambda x: x in WEIGHTING_SCHEMES,
                0,
                "Weighting scheme used in iteration 2",
                0),
        # Output format switches
        _Switch(["-clw", "clw"], ["input"],
                "Write output in CLUSTALW format (with a MUSCLE header)"),
        _Switch(["-clwstrict", "clwstrict"], ["input"],
                "Write output in CLUSTALW format with version 1.81 header"),
        _Switch(["-fasta", "fasta"], ["input"],
                "Write output in FASTA format"),
        _Switch(["-html", "html"], ["input"],
                "Write output in HTML format"),
        _Switch(["-msf", "msf"], ["input"],
                "Write output in MSF format"),
        _Switch(["-phyi", "phyi"], ["input"],
                "Write output in PHYLIP interleaved format"),
        _Switch(["-phys", "phys"], ["input"],
                "Write output in PHYLIP sequential format"),
        # Per-format output filenames
        _Option(["-phyiout", "phyiout"], ["output", "file"],
                None, 0,
                "Write PHYLIP interleaved output to specified filename",
                0),
        _Option(["-physout", "physout"], ["output", "file"],
                None, 0,
                "Write PHYLIP sequential format to specified filename",
                0),
        _Option(["-htmlout", "htmlout"], ["output", "file"],
                None, 0,
                "Write HTML output to specified filename",
                0),
        _Option(["-clwout", "clwout"], ["output", "file"],
                None, 0,
                "Write CLUSTALW output (with MUSCLE header) to specified "
                "filename",
                0),
        _Option(["-clwstrictout", "clwstrictout"], ["output", "file"],
                None, 0,
                "Write CLUSTALW output (with version 1.81 header) to "
                "specified filename",
                0),
        _Option(["-msfout", "msfout"], ["output", "file"],
                None, 0,
                "Write MSF format output to specified filename",
                0),
        _Option(["-fastaout", "fastaout"], ["output", "file"],
                None, 0,
                "Write FASTA format output to specified filename",
                0),
        # Behaviour switches
        _Switch(["-anchors", "anchors"], ["input"],
                "Use anchor optimisation in tree dependent "
                "refinement iterations"),
        _Switch(["-noanchors", "noanchors"], ["input"],
                "Do not use anchor optimisation in tree dependent "
                "refinement iterations"),
        _Switch(["-group", "group"], ["input"],
                "Group similar sequences in output"),
        _Switch(["-stable", "stable"], ["input"],
                "Do not group similar sequences in output (not supported in v3.8)"),
        # Profile scoring switches
        _Switch(["-le", "le"], ["input"],
                "Use log-expectation profile score (VTML240)"),
        _Switch(["-sv", "sv"], ["input"],
                "Use sum-of-pairs profile score (VTML240)"),
        _Switch(["-sp", "sp"], ["input"],
                "Use sum-of-pairs protein profile score (PAM200)"),
        _Switch(["-spn", "spn"], ["input"],
                "Use sum-of-pairs protein nucleotide profile score"),
        # The description here used to be a copy of the -spn text.
        _Switch(["-quiet", "quiet"], ["input"],
                "Do not display progress messages"),
        _Switch(["-refine", "refine"], ["input"],
                "Only do tree dependent refinement"),
        _Switch(["-core", "core"], ["input"],
                "Catch exceptions"),
        _Switch(["-nocore", "nocore"], ["input"],
                "Do not catch exceptions"),
        _Switch(["-verbose", "verbose"], ["input"],
                "Write parameter settings and progress"),
        _Switch(["-version", "version"], ["input"],
                "Write version string to stdout and exit"),
        ]
    AbstractCommandline.__init__(self, cmd, **kwargs)
475