1
2
3
4
5
6
7 """Nexus class. Parse the contents of a NEXUS file.
8
9 Based upon 'NEXUS: An extensible file format for systematic information'
10 Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621
11 """
12
13 import os,sys, math, random, copy
14
15 from Bio.Alphabet import IUPAC
16 from Bio.Data import IUPACData
17 from Bio.Seq import Seq
18
19 from Trees import Tree,NodeData
20
21 try:
22 import cnexus
23 except ImportError:
24 C=False
25 else:
26 C=True
27
28 INTERLEAVE=70
29 SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\
30 'matrix','tree', 'utree','translate','codonposset','title']
31 KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets','codons']
32 PUNCTUATION='()[]{}/\,;:=*\'"`+-<>'
33 MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_'
34 WHITESPACE=' \t\n'
35
36 SPECIALCOMMENTS=['&']
37 CHARSET='chars'
38 TAXSET='taxa'
39 CODONPOSITIONS='codonpositions'
40 DEFAULTNEXUS='#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; '
42
44 """Helps reading NEXUS-words and characters from a buffer."""
46 if string:
47 self.buffer=list(string)
48 else:
49 self.buffer=[]
50
52 if self.buffer:
53 return self.buffer[0]
54 else:
55 return None
56
58 b=''.join(self.buffer).strip()
59 if b:
60 return b[0]
61 else:
62 return None
63
65 if self.buffer:
66 return self.buffer.pop(0)
67 else:
68 return None
69
71 while True:
72 p=self.next()
73 if p is None:
74 break
75 if p not in WHITESPACE:
76 return p
77 return None
78
80 while self.buffer[0] in WHITESPACE:
81 self.buffer=self.buffer[1:]
82
84 for t in target:
85 try:
86 pos=self.buffer.index(t)
87 except ValueError:
88 pass
89 else:
90 found=''.join(self.buffer[:pos])
91 self.buffer=self.buffer[pos:]
92 return found
93 else:
94 return None
95
97 return ''.join(self.buffer[:len(word)])==word
98
100 """Return the next NEXUS word from a string.
101
102 This deals with single and double quotes, whitespace and punctuation.
103 """
104
105 word=[]
106 quoted=False
107 first=self.next_nonwhitespace()
108 if not first:
109 return None
110 word.append(first)
111 if first=="'":
112 quoted="'"
113 elif first=='"':
114 quoted='"'
115 elif first in PUNCTUATION:
116 return first
117 while True:
118 c=self.peek()
119 if c==quoted:
120 word.append(self.next())
121 if self.peek()==quoted:
122 skip=self.next()
123 elif quoted:
124 break
125 elif quoted:
126 word.append(self.next())
127 elif not c or c in PUNCTUATION or c in WHITESPACE:
128 break
129 else:
130 word.append(self.next())
131 return ''.join(word)
132
134 """Return the rest of the string without parsing."""
135 return ''.join(self.buffer)
136
138 """Calculate a stepmatrix for weighted parsimony.
139
140 See Wheeler (1990), Cladistics 6:269-275.
141 """
142
144 self.data={}
145 self.symbols=[s for s in symbols]
146 self.symbols.sort()
147 if gap:
148 self.symbols.append(gap)
149 for x in self.symbols:
150 for y in [s for s in self.symbols if s!=x]:
151 self.set(x,y,0)
152
153 - def set(self,x,y,value):
157
158 - def add(self,x,y,value):
162
165
172
174 for k in self.data:
175 if self.data[k]!=0:
176 self.data[k]=-math.log(self.data[k])
177 return self
178
179 - def smprint(self,name='your_name_here'):
180 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols))
181 matrix+=' %s\n' % ' '.join(self.symbols)
182 for x in self.symbols:
183 matrix+='[%s]'.ljust(8) % x
184 for y in self.symbols:
185 if x==y:
186 matrix+=' . '
187 else:
188 if x>y:
189 x1,y1=y,x
190 else:
191 x1,y1=x,y
192 if self.data[x1+y1]==0:
193 matrix+='inf. '
194 else:
195 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1])
196 matrix+='\n'
197 matrix+=';\n'
198 return matrix
199
201 """Return a taxon identifier according to NEXUS standard.
202
203 Wrap quotes around names with punctuation or whitespace, and double
204 single quotes.
205
206 mrbayes=True: write names without quotes, whitespace or punctuation
207 for the mrbayes software package.
208 """
209 if mrbayes:
210 safe=name.replace(' ','_')
211 safe=''.join([c for c in safe if c in MRBAYESSAFE])
212 else:
213 safe=name.replace("'","''")
214 if set(safe).intersection(set(WHITESPACE+PUNCTUATION)):
215 safe="'"+safe+"'"
216 return safe
217
219 """Remove quotes and/or double quotes around identifiers."""
220 if not word:
221 return None
222 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')):
223 word=word[1:-1]
224 return word
225
244
246 """Returns a sorted list of keys of p sorted by values of p."""
247 startpos=[(p[pn],pn) for pn in p if p[pn]]
248 startpos.sort()
249
250 return (zip(*startpos))[1]
251
253 """Check that all values in list are unique and return a pruned and sorted list."""
254 l=list(set(l))
255 l.sort()
256 return l
257
259 """Returns a unique name if label is already in previous_labels."""
260 while label in previous_labels:
261 if label.split('.')[-1].startswith('copy'):
262 label='.'.join(label.split('.')[:-1])+'.copy'+str(eval('0'+label.split('.')[-1][4:])+1)
263 else:
264 label+='.copy'
265 return label
266
268 """Converts a Seq-object matrix to a plain sequence-string matrix."""
269 return dict([(t,matrix[t].tostring()) for t in matrix])
270
272 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class)
273 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.).
274 """
275
276 if not orig_list:
277 return ''
278 orig_list=list(set(orig_list))
279 orig_list.sort()
280 shortlist=[]
281 clist=orig_list[:]
282 clist.append(clist[-1]+.5)
283 while len(clist)>1:
284 step=1
285 for i,x in enumerate(clist):
286 if x==clist[0]+i*step:
287 continue
288 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]:
289
290
291 step=x-clist[0]
292 else:
293 sub=clist[:i]
294 if len(sub)==1:
295 shortlist.append(str(sub[0]+1))
296 else:
297 if step==1:
298 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1))
299 else:
300 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step))
301 clist=clist[i:]
302 break
303 return ' '.join(shortlist)
304
306 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance.
307
308 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...]
309 Character sets, character partitions and taxon sets are prefixed, readjusted
310 and present in the combined matrix.
311 """
312
313 if not matrices:
314 return None
315 name=matrices[0][0]
316 combined=copy.deepcopy(matrices[0][1])
317 mixed_datatypes=(len(set([n[1].datatype for n in matrices]))>1)
318 if mixed_datatypes:
319 combined.datatype='None'
320
321 combined.charlabels=None
322 combined.statelabels=None
323 combined.interleave=False
324 combined.translate=None
325
326
327 for cn,cs in combined.charsets.iteritems():
328 combined.charsets['%s.%s' % (name,cn)]=cs
329 del combined.charsets[cn]
330 for tn,ts in combined.taxsets.iteritems():
331 combined.taxsets['%s.%s' % (name,tn)]=ts
332 del combined.taxsets[tn]
333
334
335 combined.charpartitions={'combined':{name:range(combined.nchar)}}
336 for n,m in matrices[1:]:
337 both=[t for t in combined.taxlabels if t in m.taxlabels]
338 combined_only=[t for t in combined.taxlabels if t not in both]
339 m_only=[t for t in m.taxlabels if t not in both]
340 for t in both:
341
342 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
343
344 for t in combined_only:
345 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet)
346 for t in m_only:
347 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\
348 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
349 combined.taxlabels.extend(m_only)
350 for cn,cs in m.charsets.iteritems():
351 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs]
352 if m.taxsets:
353 if not combined.taxsets:
354 combined.taxsets={}
355
356 combined.taxsets.update(dict(('%s.%s' % (n,tn),ts) \
357 for tn,ts in m.taxsets.iteritems()))
358
359 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar)
360
361 if m.charlabels:
362 if not combined.charlabels:
363 combined.charlabels={}
364 combined.charlabels.update(dict((combined.nchar+i,label) \
365 for (i,label) in m.charlabels.iteritems()))
366 combined.nchar+=m.nchar
367 combined.ntax+=len(m_only)
368
369
370
371 for c in combined.charpartitions['combined']:
372 combined.charsets[c]=combined.charpartitions['combined'][c]
373
374 return combined
375
436
437
439 """Adjust linebreaks to match ';', strip leading/trailing whitespace.
440
441 list_of_commandlines=_adjust_lines(input_text)
442 Lines are adjusted so that no linebreaks occur within a commandline
443 (except matrix command line)
444 """
445 formatted_lines=[]
446 for l in lines:
447
448 l=l.replace('\r\n','\n').replace('\r','\n').strip()
449 if l.lower().startswith('matrix'):
450 formatted_lines.append(l)
451 else:
452 l=l.replace('\n',' ')
453 if l:
454 formatted_lines.append(l)
455 return formatted_lines
456
458 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code."""
459
460 opening=seq.find('(')
461 while opening>-1:
462 closing=seq.find(')')
463 if closing<0:
464 raise NexusError('Missing closing parenthesis in: '+seq)
465 elif closing<opening:
466 raise NexusError('Missing opening parenthesis in: '+seq)
467 ambig=[x for x in seq[opening+1:closing]]
468 ambig.sort()
469 ambig=''.join(ambig)
470 ambig_code=rev_ambig_values[ambig.upper()]
471 if ambig!=ambig.upper():
472 ambig_code=ambig_code.lower()
473 seq=seq[:opening]+ambig_code+seq[closing+1:]
474 opening=seq.find('(')
475 return seq
476
478 """Represent a commandline as command and options."""
479
481 self.options={}
482 options=[]
483 self.command=None
484 try:
485
486 self.command, options = line.strip().split('\n', 1)
487 except ValueError:
488
489 self.command=line.split()[0]
490 options=' '.join(line.split()[1:])
491 self.command = self.command.strip().lower()
492 if self.command in SPECIAL_COMMANDS:
493 self.options=options.strip()
494 else:
495 if len(options) > 0:
496 try:
497 options = options.replace('=', ' = ').split()
498 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))]
499 indices = []
500 for sl in valued_indices:
501 indices.extend(sl)
502 token_indices = [n for n in range(len(options)) if n not in indices]
503 for opt in valued_indices:
504
505 self.options[options[opt[0]].lower()] = options[opt[2]]
506 for token in token_indices:
507 self.options[options[token].lower()] = None
508 except ValueError:
509 raise NexusError('Incorrect formatting in line: %s' % line)
510
512 """Represent a NEXUS block with block name and list of commandlines."""
516
518
519 __slots__=['original_taxon_order','__dict__']
520
522 self.ntax=0
523 self.nchar=0
524 self.unaltered_taxlabels=[]
525 self.taxlabels=[]
526 self.charlabels=None
527 self.statelabels=None
528 self.datatype='dna'
529 self.respectcase=False
530 self.missing='?'
531 self.gap='-'
532 self.symbols=None
533 self.equate=None
534 self.matchchar=None
535 self.labels=None
536 self.transpose=False
537 self.interleave=False
538 self.tokens=False
539 self.eliminate=None
540 self.matrix=None
541 self.unknown_blocks=[]
542 self.taxsets={}
543 self.charsets={}
544 self.charpartitions={}
545 self.taxpartitions={}
546 self.trees=[]
547 self.translate=None
548 self.structured=[]
549 self.set={}
550 self.options={}
551 self.codonposset=None
552
553
554 self.options['gapmode']='missing'
555
556 if input:
557 self.read(input)
558 else:
559 self.read(DEFAULTNEXUS)
560
562 """Included for backwards compatibility (DEPRECATED)."""
563 return self.taxlabels
565 """Included for backwards compatibility (DEPRECATED)."""
566 self.taxlabels=value
567 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order)
568
569 - def read(self,input):
570 """Read and parse NEXUS imput (a filename, file-handle, or string)."""
571
572
573
574 try:
575 file_contents = open(os.path.expanduser(input),'rU').read()
576 self.filename=input
577 except (TypeError,IOError,AttributeError):
578
579 if isinstance(input, str):
580 file_contents = input
581 self.filename='input_string'
582
583 elif hasattr(input,'read'):
584 file_contents=input.read()
585 if hasattr(input,"name") and input.name:
586 self.filename=input.name
587 else:
588 self.filename='Unknown_nexus_file'
589 else:
590 print input.strip()[:50]
591 raise NexusError('Unrecognized input: %s ...' % input[:100])
592 file_contents=file_contents.strip()
593 if file_contents.startswith('#NEXUS'):
594 file_contents=file_contents[6:]
595 if C:
596 decommented=cnexus.scanfile(file_contents)
597
598 if decommented=='[' or decommented==']':
599 raise NexusError('Unmatched %s' % decommented)
600
601
602 commandlines=_adjust_lines(decommented.split(chr(7)))
603 else:
604 commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents))
605
606 for i,cl in enumerate(commandlines):
607 try:
608 if cl[:6].upper()=='#NEXUS':
609 commandlines[i]=cl[6:].strip()
610 except:
611 pass
612
613 nexus_block_gen = self._get_nexus_block(commandlines)
614 while 1:
615 try:
616 title, contents = nexus_block_gen.next()
617 except StopIteration:
618 break
619 if title in KNOWN_NEXUS_BLOCKS:
620 self._parse_nexus_block(title, contents)
621 else:
622 self._unknown_nexus_block(title, contents)
623
625 """Generator for looping through Nexus blocks."""
626 inblock=False
627 blocklines=[]
628 while file_contents:
629 cl=file_contents.pop(0)
630 if cl.lower().startswith('begin'):
631 if not inblock:
632 inblock=True
633 title=cl.split()[1].lower()
634 else:
635 raise NexusError('Illegal block nesting in block %s' % title)
636 elif cl.lower().startswith('end'):
637 if inblock:
638 inblock=False
639 yield title,blocklines
640 blocklines=[]
641 else:
642 raise NexusError('Unmatched \'end\'.')
643 elif inblock:
644 blocklines.append(cl)
645
651
653 """Parse a known Nexus Block (PRIVATE)."""
654
655 self._apply_block_structure(title, contents)
656
657
658 block=self.structured[-1]
659 for line in block.commandlines:
660 try:
661 getattr(self,'_'+line.command)(line.options)
662 except AttributeError:
663 raise
664 raise NexusError('Unknown command: %s ' % line.command)
665
668
670 if 'ntax' in options:
671 self.ntax=eval(options['ntax'])
672 if 'nchar' in options:
673 self.nchar=eval(options['nchar'])
674
754
755
756 - def _set(self,options):
758
760 self.options=options;
761
763 self.eliminate=options
764
766 """Get taxon labels (PRIVATE).
767
768 As the taxon names are already in the matrix, this is superfluous
769 except for transpose matrices, which are currently unsupported anyway.
770 Thus, we ignore the taxlabels command to make handling of duplicate
771 taxon names easier.
772 """
773 pass
774
775
776
777
778
779
780
781
783 """Check for presence of taxon in self.taxlabels."""
784
785
786 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels])
787 nexid=taxon.replace(' ','_')
788 return nextaxa.get(nexid)
789
812
816
821
823 if not self.ntax or not self.nchar:
824 raise NexusError('Dimensions must be specified before matrix!')
825 self.matrix={}
826 taxcount=0
827 first_matrix_block=True
828
829
830 lines=[l.strip() for l in options.split('\n') if l.strip()!='']
831 lineiter=iter(lines)
832 while 1:
833 try:
834 l=lineiter.next()
835 except StopIteration:
836 if taxcount<self.ntax:
837 raise NexusError('Not enough taxa in matrix.')
838 elif taxcount>self.ntax:
839 raise NexusError('Too many taxa in matrix.')
840 else:
841 break
842
843 taxcount+=1
844
845 if taxcount>self.ntax:
846 if not self.interleave:
847 raise NexusError('Too many taxa in matrix - should matrix be interleaved?')
848 else:
849 taxcount=1
850 first_matrix_block=False
851
852 linechars=CharBuffer(l)
853 id=quotestrip(linechars.next_word())
854 l=linechars.rest().strip()
855 chars=''
856 if self.interleave:
857
858
859 if l:
860 chars=''.join(l.split())
861 else:
862 chars=''.join(lineiter.next().split())
863 else:
864
865 chars=''.join(l.split())
866 while len(chars)<self.nchar:
867 l=lineiter.next()
868 chars+=''.join(l.split())
869 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet)
870
871 if taxcount==1:
872 refseq=iupac_seq
873 else:
874 if self.matchchar:
875 while 1:
876 p=iupac_seq.tostring().find(self.matchchar)
877 if p==-1:
878 break
879 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet)
880
881 for i,c in enumerate(iupac_seq.tostring()):
882 if c not in self.valid_characters and c!=self.gap and c!=self.missing:
883 raise NexusError( \
884 ('Taxon %s: Illegal character %s in sequence %s ' + \
885 '(check dimensions/interleaving)') % (id,c, iupac_seq))
886
887 if first_matrix_block:
888 self.unaltered_taxlabels.append(id)
889 id=_unique_label(self.matrix.keys(),id)
890 self.matrix[id]=iupac_seq
891 self.taxlabels.append(id)
892 else:
893
894 id=_unique_label(self.taxlabels[:taxcount-1],id)
895 taxon_present=self._check_taxlabels(id)
896 if taxon_present:
897 self.matrix[taxon_present]+=iupac_seq
898 else:
899 raise NexusError('Taxon %s not in first block of interleaved matrix. Check matrix dimensions and interleave.' % id)
900
901 for taxon in self.matrix:
902 if len(self.matrix[taxon])!=self.nchar:
903 raise NexusError('Matrx Nchar %d does not match data length (%d) for taxon %s' \
904 % (self.nchar, len(self.matrix[taxon]),taxon))
905
906 matrixkeys=sorted(self.matrix)
907 taxlabelssort=self.taxlabels[:]
908 taxlabelssort.sort()
909 assert matrixkeys==taxlabelssort,"ERROR: TAXLABELS must be identical with MATRIX. Please Report this as a bug, and send in data file."
910
930
932 """Some software (clustalx) uses 'utree' to denote an unrooted tree."""
933 self._tree(options)
934
935 - def _tree(self,options):
970
977
981
985
987 taxpartition={}
988 quotelevel=False
989 opts=CharBuffer(options)
990 name=self._name_n_vector(opts)
991 if not name:
992 raise NexusError('Formatting error in taxpartition: %s ' % options)
993
994
995
996 sub=''
997 while True:
998 w=opts.next()
999 if w is None or (w==',' and not quotelevel):
1000 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':')
1001 taxpartition[subname]=_make_unique(subindices)
1002 sub=''
1003 if w is None:
1004 break
1005 else:
1006 if w=="'":
1007 quotelevel=not quotelevel
1008 sub+=w
1009 self.taxpartitions[name]=taxpartition
1010
1012 """Read codon positions from a codons block as written from McClade.
1013
1014 Here codonposset is just a fancy name for a character partition with
1015 the name CodonPositions and the partitions N,1,2,3
1016 """
1017
1018 prev_partitions=self.charpartitions
1019 self._charpartition(options)
1020
1021 codonname=[n for n in self.charpartitions if n not in prev_partitions]
1022 if codonname==[] or len(codonname)>1:
1023 raise NexusError('Formatting Error in codonposset: %s ' % options)
1024 else:
1025 self.codonposset=codonname[0]
1026
1029
1031 charpartition={}
1032 quotelevel=False
1033 opts=CharBuffer(options)
1034 name=self._name_n_vector(opts)
1035 if not name:
1036 raise NexusError('Formatting error in charpartition: %s ' % options)
1037
1038
1039 sub=''
1040 while True:
1041 w=opts.next()
1042 if w is None or (w==',' and not quotelevel):
1043 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':')
1044 charpartition[subname]=_make_unique(subindices)
1045 sub=''
1046 if w is None:
1047 break
1048 else:
1049 if w=="'":
1050 quotelevel=not quotelevel
1051 sub+=w
1052 self.charpartitions[name]=charpartition
1053
1055 """Parse the taxset/charset specification (PRIVATE).
1056
1057 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3'
1058 --> [0,1,2,3,4,'dog','cat',9,12,15,18]
1059 """
1060 opts=CharBuffer(options)
1061 name=self._name_n_vector(opts,separator=separator)
1062 indices=self._parse_list(opts,set_type=set_type)
1063 if indices is None:
1064 raise NexusError('Formatting error in line: %s ' % options)
1065 return name,indices
1066
1090
1092 """Parse a NEXUS list (PRIVATE).
1093
1094 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21],
1095 (assuming dog is taxon no. 17 and cat is taxon no. 21).
1096 """
1097 plain_list=[]
1098 if options_buffer.peek_nonwhitespace():
1099 try:
1100 while True:
1101 identifier=options_buffer.next_word()
1102 if not identifier:
1103 break
1104 start=self._resolve(identifier,set_type=set_type)
1105 if options_buffer.peek_nonwhitespace()=='-':
1106 end=start
1107 step=1
1108
1109 hyphen=options_buffer.next_nonwhitespace()
1110 end=self._resolve(options_buffer.next_word(),set_type=set_type)
1111 if set_type==CHARSET:
1112 if options_buffer.peek_nonwhitespace()=='\\':
1113 backslash=options_buffer.next_nonwhitespace()
1114 step=int(options_buffer.next_word())
1115 plain_list.extend(range(start,end+1,step))
1116 else:
1117 if type(start)==list or type(end)==list:
1118 raise NexusError('Name if character sets not allowed in range definition: %s'\
1119 % identifier)
1120 start=self.taxlabels.index(start)
1121 end=self.taxlabels.index(end)
1122 taxrange=self.taxlabels[start:end+1]
1123 plain_list.extend(taxrange)
1124 else:
1125 if type(start)==list:
1126 plain_list.extend(start)
1127 else:
1128 plain_list.append(start)
1129 except NexusError:
1130 raise
1131 except:
1132 return None
1133 return plain_list
1134
1135 - def _resolve(self,identifier,set_type=None):
1136 """Translate identifier in list into character/taxon index.
1137
1138 Characters (which are referred to by their index in Nexus.py):
1139 Plain numbers are returned minus 1 (Nexus indices to python indices)
1140 Text identifiers are translaterd into their indices (if plain character indentifiers),
1141 the first hit in charlabels is returned (charlabels don't need to be unique)
1142 or the range of indices is returned (if names of character sets).
1143 Taxa (which are referred to by their unique name in Nexus.py):
1144 Plain numbers are translated in their taxon name, underscores and spaces are considered equal.
1145 Names are returned unchanged (if plain taxon identifiers), or the names in
1146 the corresponding taxon set is returned
1147 """
1148 identifier=quotestrip(identifier)
1149 if not set_type:
1150 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.')
1151 if set_type==CHARSET:
1152 try:
1153 n=int(identifier)
1154 except ValueError:
1155 if self.charlabels and identifier in self.charlabels.itervalues():
1156 for k in self.charlabels:
1157 if self.charlabels[k]==identifier:
1158 return k
1159 elif self.charsets and identifier in self.charsets:
1160 return self.charsets[identifier]
1161 else:
1162 raise NexusError('Unknown character identifier: %s' \
1163 % identifier)
1164 else:
1165 if n<=self.nchar:
1166 return n-1
1167 else:
1168 raise NexusError('Illegal character identifier: %d>nchar (=%d).' \
1169 % (identifier,self.nchar))
1170 elif set_type==TAXSET:
1171 try:
1172 n=int(identifier)
1173 except ValueError:
1174 taxlabels_id=self._check_taxlabels(identifier)
1175 if taxlabels_id:
1176 return taxlabels_id
1177 elif self.taxsets and identifier in self.taxsets:
1178 return self.taxsets[identifier]
1179 else:
1180 raise NexusError('Unknown taxon identifier: %s' \
1181 % identifier)
1182 else:
1183 if n>0 and n<=self.ntax:
1184 return self.taxlabels[n-1]
1185 else:
1186 raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' \
1187 % (identifier,self.ntax))
1188 else:
1189 raise NexusError('Unknown set specification: %s.'% set_type)
1190
1194
1198
1202
1206
1207 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False,
1208 exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
1209 """Writes a nexus file for each partition in charpartition.
1210
1211 Only non-excluded characters and non-deleted taxa are included,
1212 just the data block is written.
1213 """
1214
1215 if not matrix:
1216 matrix=self.matrix
1217 if not matrix:
1218 return
1219 if not filename:
1220 filename=self.filename
1221 if charpartition:
1222 pfilenames={}
1223 for p in charpartition:
1224 total_exclude=[]+exclude
1225 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]])
1226 total_exclude=_make_unique(total_exclude)
1227 pcomment=comment+'\nPartition: '+p+'\n'
1228 dot=filename.rfind('.')
1229 if dot>0:
1230 pfilename=filename[:dot]+'_'+p+'.data'
1231 else:
1232 pfilename=filename+'_'+p
1233 pfilenames[p]=pfilename
1234 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize,
1235 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False,
1236 mrbayes=mrbayes)
1237 return pfilenames
1238 else:
1239 fn=self.filename+'.data'
1240 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave,
1241 exclude=exclude,delete=delete,comment=comment,append_sets=False,
1242 mrbayes=mrbayes)
1243 return fn
1244
1245 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\
1246 blocksize=None, interleave=False, interleave_by_partition=False,\
1247 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False,\
1248 codons_block=True):
1249 """Writes a nexus file with data and sets block to a file or handle.
1250
1251 Character sets and partitions are appended by default, and are
1252 adjusted according to excluded characters (i.e. character sets
1253 still point to the same sites (not necessarily same positions),
1254 without including the deleted characters.
1255
1256 filename - Either a filename as a string (which will be opened,
1257 written to and closed), or a handle object (which will
1258 be written to but NOT closed).
1259 omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the
1260 start of the file is ommited.
1261
1262 Returns the filename/handle used to write the data.
1263 """
1264 if not matrix:
1265 matrix=self.matrix
1266 if not matrix:
1267 return
1268 if not filename:
1269 filename=self.filename
1270 if [t for t in delete if not self._check_taxlabels(t)]:
1271 raise NexusError('Unknown taxa: %s' \
1272 % ', '.join(set(delete).difference(set(self.taxlabels))))
1273 if interleave_by_partition:
1274 if not interleave_by_partition in self.charpartitions:
1275 raise NexusError('Unknown partition: '+interleave_by_partition)
1276 else:
1277 partition=self.charpartitions[interleave_by_partition]
1278
1279 names=_sort_keys_by_values(partition)
1280 newpartition={}
1281 for p in partition:
1282 newpartition[p]=[c for c in partition[p] if c not in exclude]
1283
1284 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete]
1285 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete))
1286 ntax_adjusted=len(undelete)
1287 nchar_adjusted=len(cropped_matrix[undelete[0]])
1288 if not undelete or (undelete and undelete[0]==''):
1289 return
1290 if isinstance(filename,basestring):
1291 try:
1292 fh=open(filename,'w',encoding="utf-8")
1293 except IOError:
1294 raise NexusError('Could not open %s for writing.' % filename)
1295 elif hasattr(filename, 'write'):
1296
1297 fh=filename
1298 else:
1299 raise ValueError("Neither a filename nor a handle was supplied")
1300 if not omit_NEXUS:
1301 fh.write('#NEXUS\n')
1302 if comment:
1303 fh.write('['+comment+']\n')
1304 fh.write('begin data;\n')
1305 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted))
1306 fh.write('\tformat datatype='+self.datatype)
1307 if self.respectcase:
1308 fh.write(' respectcase')
1309 if self.missing:
1310 fh.write(' missing='+self.missing)
1311 if self.gap:
1312 fh.write(' gap='+self.gap)
1313 if self.matchchar:
1314 fh.write(' matchchar='+self.matchchar)
1315 if self.labels:
1316 fh.write(' labels='+self.labels)
1317 if self.equate:
1318 fh.write(' equate='+self.equate)
1319 if interleave or interleave_by_partition:
1320 fh.write(' interleave')
1321 fh.write(';\n')
1322
1323
1324 if self.charlabels:
1325 newcharlabels=self._adjust_charlabels(exclude=exclude)
1326 clkeys=sorted(newcharlabels)
1327 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n')
1328 fh.write('matrix\n')
1329 if not blocksize:
1330 if interleave:
1331 blocksize=70
1332 else:
1333 blocksize=self.nchar
1334
1335 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete])
1336 if interleave_by_partition:
1337
1338 seek=0
1339 for p in names:
1340 fh.write('[%s: %s]\n' % (interleave_by_partition,p))
1341 if len(newpartition[p])>0:
1342 for taxon in undelete:
1343 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1344 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n')
1345 fh.write('\n')
1346 else:
1347 fh.write('[empty]\n\n')
1348 seek+=len(newpartition[p])
1349 elif interleave:
1350 for seek in range(0,nchar_adjusted,blocksize):
1351 for taxon in undelete:
1352 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1353 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1354 fh.write('\n')
1355 else:
1356 for taxon in undelete:
1357 if blocksize<nchar_adjusted:
1358 fh.write(safename(taxon,mrbayes=mrbayes)+'\n')
1359 else:
1360 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1361 for seek in range(0,nchar_adjusted,blocksize):
1362 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1363 fh.write(';\nend;\n')
1364 if append_sets:
1365 if codons_block:
1366 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,include_codons=False))
1367 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,codons_only=True))
1368 else:
1369 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes))
1370
1371 if fh == filename:
1372
1373 return filename
1374 else:
1375
1376 fh.close()
1377 return filename
1378
1379 - def append_sets(self,exclude=[],delete=[],mrbayes=False,include_codons=True,codons_only=False):
1380 """Returns a sets block."""
1381 if not self.charsets and not self.taxsets and not self.charpartitions:
1382 return ''
1383 if codons_only:
1384 setsb=['\nbegin codons']
1385 else:
1386 setsb=['\nbegin sets']
1387
1388
1389
1390
1391 offset=0
1392 offlist=[]
1393 for c in range(self.nchar):
1394 if c in exclude:
1395 offset+=1
1396 offlist.append(-1)
1397 else:
1398 offlist.append(c-offset)
1399
1400 if not codons_only:
1401 for n,ns in self.charsets.iteritems():
1402 cset=[offlist[c] for c in ns if c not in exclude]
1403 if cset:
1404 setsb.append('charset %s = %s' % (safename(n),_compact4nexus(cset)))
1405 for n,s in self.taxsets.iteritems():
1406 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete]
1407 if tset:
1408 setsb.append('taxset %s = %s' % (safename(n),' '.join(tset)))
1409 for n,p in self.charpartitions.iteritems():
1410 if not include_codons and n==CODONPOSITIONS:
1411 continue
1412 elif codons_only and n!=CODONPOSITIONS:
1413 continue
1414
1415
1416
1417 names=_sort_keys_by_values(p)
1418 newpartition={}
1419 for sn in names:
1420 nsp=[offlist[c] for c in p[sn] if c not in exclude]
1421 if nsp:
1422 newpartition[sn]=nsp
1423 if newpartition:
1424 if include_codons and n==CODONPOSITIONS:
1425 command='codonposset'
1426 else:
1427 command='charpartition'
1428 setsb.append('%s %s = %s' % (command,safename(n),\
1429 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition])))
1430
1431 for n,p in self.taxpartitions.iteritems():
1432 names=_sort_keys_by_values(p)
1433 newpartition={}
1434 for sn in names:
1435 nsp=[t for t in p[sn] if t not in delete]
1436 if nsp:
1437 newpartition[sn]=nsp
1438 if newpartition:
1439 setsb.append('taxpartition %s = %s' % (safename(n),\
1440 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition])))
1441
1442 setsb.append('end;\n')
1443 if len(setsb)==2:
1444 return ''
1445 else:
1446 return ';\n'.join(setsb)
1447
1449 """Writes matrix into a fasta file: (self, filename=None, width=70)."""
1450 if not filename:
1451 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
1452 filename='.'.join(self.filename.split('.')[:-1])+'.fas'
1453 else:
1454 filename=self.filename+'.fas'
1455 fh=open(filename,'w')
1456 for taxon in self.taxlabels:
1457 fh.write('>'+safename(taxon)+'\n')
1458 for i in range(0, len(self.matrix[taxon].tostring()), width):
1459 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n')
1460 fh.close()
1461 return filename
1462
1464 """Writes matrix into a PHYLIP file: (self, filename=None).
1465
1466 Note that this writes a relaxed PHYLIP format file, where the names
1467 are not truncated, nor checked for invalid characters."""
1468 if not filename:
1469 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
1470 filename='.'.join(self.filename.split('.')[:-1])+'.phy'
1471 else:
1472 filename=self.filename+'.phy'
1473 fh=open(filename,'w')
1474 fh.write('%d %d\n' % (self.ntax,self.nchar))
1475 for taxon in self.taxlabels:
1476 fh.write('%s %s\n' % (safename(taxon),self.matrix[taxon].tostring()))
1477 fh.close()
1478 return filename
1479
1480 - def constant(self,matrix=None,delete=[],exclude=[]):
1481 """Return a list with all constant characters."""
1482 if not matrix:
1483 matrix=self.matrix
1484 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1485 if not undelete:
1486 return None
1487 elif len(undelete)==1:
1488 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude]
1489
1490 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for
1491 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude]
1492 for taxon in undelete[1:]:
1493 newconstant=[]
1494 for site in constant:
1495
1496 seqsite=matrix[taxon][site[0]].upper()
1497
1498 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]:
1499
1500 newconstant.append(site)
1501 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap):
1502
1503 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite)))
1504 elif seqsite in self.ambiguous_values:
1505 intersect = set(self.ambiguous_values[seqsite]).intersection(set(site[1]))
1506 if intersect:
1507 newconstant.append((site[0],''.join(intersect)))
1508
1509
1510
1511
1512
1513 constant=newconstant
1514 cpos=[s[0] for s in constant]
1515 return cpos
1516
1517 - def cstatus(self,site,delete=[],narrow=True):
1518 """Summarize character.
1519
1520 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?)
1521 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -)
1522 """
1523 undelete=[t for t in self.taxlabels if t not in delete]
1524 if not undelete:
1525 return None
1526 cstatus=[]
1527 for t in undelete:
1528 c=self.matrix[t][site].upper()
1529 if self.options.get('gapmode')=='missing' and c==self.gap:
1530 c=self.missing
1531 if narrow and c==self.missing:
1532 if c not in cstatus:
1533 cstatus.append(c)
1534 else:
1535 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus])
1536 if self.missing in cstatus and narrow and len(cstatus)>1:
1537 cstatus=[c for c in cstatus if c!=self.missing]
1538 cstatus.sort()
1539 return cstatus
1540
1554
1555 - def crop_matrix(self,matrix=None, delete=[], exclude=[]):
1556 """Return a matrix without deleted taxa and excluded characters."""
1557 if not matrix:
1558 matrix=self.matrix
1559 if [t for t in delete if not self._check_taxlabels(t)]:
1560 raise NexusError('Unknown taxa: %s' \
1561 % ', '.join(set(delete).difference(self.taxlabels)))
1562 if exclude!=[]:
1563 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1564 if not undelete:
1565 return {}
1566 m=[matrix[k].tostring() for k in undelete]
1567 zipped_m=zip(*m)
1568 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude]
1569 if sitesm==[]:
1570 return dict([(t,Seq('',self.alphabet)) for t in undelete])
1571 else:
1572 zipped_sitesm=zip(*sitesm)
1573 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)]
1574 return dict(zip(undelete,m))
1575 else:
1576 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
1577
1578 - def bootstrap(self,matrix=None,delete=[],exclude=[]):
1579 """Return a bootstrapped matrix."""
1580 if not matrix:
1581 matrix=self.matrix
1582 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq)
1583 cm=self.crop_matrix(delete=delete,exclude=exclude)
1584 if not cm:
1585 return {}
1586 elif len(cm[cm.keys()[0]])==0:
1587 return cm
1588 undelete=[t for t in self.taxlabels if t in cm]
1589 if seqobjects:
1590 sitesm=zip(*[cm[t].tostring() for t in undelete])
1591 alphabet=matrix[matrix.keys()[0]].alphabet
1592 else:
1593 sitesm=zip(*[cm[t] for t in undelete])
1594 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))]
1595 bootstrapseqs=map(''.join,zip(*bootstrapsitesm))
1596 if seqobjects:
1597 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs]
1598 return dict(zip(undelete,bootstrapseqs))
1599
1601 """Adds a sequence (string) to the matrix."""
1602
1603 if not name:
1604 raise NexusError('New sequence must have a name')
1605
1606 diff=self.nchar-len(sequence)
1607 if diff<0:
1608 self.insert_gap(self.nchar,-diff)
1609 elif diff>0:
1610 sequence+=self.missing*diff
1611
1612 if name in self.taxlabels:
1613 unique_name=_unique_label(self.taxlabels,name)
1614
1615 else:
1616 unique_name=name
1617
1618 assert unique_name not in self.matrix, "ERROR. There is a discrepancy between taxlabels and matrix keys. Report this as a bug."
1619
1620 self.matrix[unique_name]=Seq(sequence,self.alphabet)
1621 self.ntax+=1
1622 self.taxlabels.append(unique_name)
1623 self.unaltered_taxlabels.append(name)
1624
1626 """Add a gap into the matrix and adjust charsets and partitions.
1627
1628 pos=0: first position
1629 pos=nchar: last position
1630 """
1631
1632 def _adjust(set,x,d,leftgreedy=False):
1633 """Adjusts chartacter sets if gaps are inserted, taking care of
1634 new gaps within a coherent character set."""
1635
1636
1637
1638 set.sort()
1639 addpos=0
1640 for i,c in enumerate(set):
1641 if c>=x:
1642 set[i]=c+d
1643
1644 if c==x:
1645 if leftgreedy or (i>0 and set[i-1]==c-1):
1646 addpos=i
1647 if addpos>0:
1648 set[addpos:addpos]=range(x,x+d)
1649 return set
1650
1651 if pos<0 or pos>self.nchar:
1652 raise NexusError('Illegal gap position: %d' % pos)
1653 if n==0:
1654 return
1655 if self.taxlabels:
1656
1657 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1658 else:
1659 sitesm=[]
1660 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n
1661
1662
1663 zipped=zip(*sitesm)
1664 mapped=map(''.join,zipped)
1665 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)]
1666 self.matrix=dict(listed)
1667 self.nchar+=n
1668
1669 for i,s in self.charsets.iteritems():
1670 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1671 for p in self.charpartitions:
1672 for sp,s in self.charpartitions[p].iteritems():
1673 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1674
1675 self.charlabels=self._adjust_charlabels(insert=[pos]*n)
1676 return self.charlabels
1677
1679 """Return adjusted indices of self.charlabels if characters are excluded or inserted."""
1680 if exclude and insert:
1681 raise NexusError('Can\'t exclude and insert at the same time')
1682 if not self.charlabels:
1683 return None
1684 labels=sorted(self.charlabels)
1685 newcharlabels={}
1686 if exclude:
1687 exclude.sort()
1688 exclude.append(sys.maxint)
1689 excount=0
1690 for c in labels:
1691 if not c in exclude:
1692 while c>exclude[excount]:
1693 excount+=1
1694 newcharlabels[c-excount]=self.charlabels[c]
1695 elif insert:
1696 insert.sort()
1697 insert.append(sys.maxint)
1698 icount=0
1699 for c in labels:
1700 while c>=insert[icount]:
1701 icount+=1
1702 newcharlabels[c+icount]=self.charlabels[c]
1703 else:
1704 return self.charlabels
1705 return newcharlabels
1706
1708 """Returns all character indices that are not in charlist."""
1709 return [c for c in range(self.nchar) if c not in charlist]
1710
1711 - def gaponly(self,include_missing=False):
1712 """Return gap-only sites."""
1713 gap=set(self.gap)
1714 if include_missing:
1715 gap.add(self.missing)
1716 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1717 gaponly=[i for i,site in enumerate(sitesm) if set(site).issubset(gap)]
1718 return gaponly
1719
1741