Package Bio :: Package Nexus :: Module Nexus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Nexus.Nexus

   1  # Nexus.py - a NEXUS parser 
   2  # 
   3  # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved. 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license. Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7  #  
   8  # Bug reports welcome: fkauff@biologie.uni-kl.de 
   9  # 
  10   
  11  """Nexus class. Parse the contents of a nexus file. 
  12  Based upon 'NEXUS: An extensible file format for systematic information' 
  13  Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 
  14  """ 
  15   
  16  import os,sys, math, random, copy 
  17   
  18  from Bio.Alphabet import IUPAC 
  19  from Bio.Data import IUPACData 
  20  from Bio.Seq import Seq 
  21   
  22  from Trees import Tree,NodeData 
  23   
  24  try: 
  25      import cnexus 
  26  except ImportError: 
  27      C=False 
  28  else: 
  29      C=True 
  30   
  31  try: 
  32      #Check the built in set function is present (python 2.4+) 
  33      set = set 
  34  except NameError: 
  35      #For python 2.3 fall back on the sets module (deprecated in python 2.6) 
  36      from sets import Set as set 
  37   
  38  INTERLEAVE=70 
  39  SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\ 
  40          'matrix','tree', 'utree','translate','codonposset','title'] 
  41  KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets','codons'] 
  42  PUNCTUATION='()[]{}/\,;:=*\'"`+-<>' 
  43  MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_' 
  44  WHITESPACE=' \t\n' 
  45  #SPECIALCOMMENTS=['!','&','%','/','\\','@'] #original list of special comments 
  46  SPECIALCOMMENTS=['&'] # supported special comment ('tree' command), all others are ignored 
  47  CHARSET='chars' 
  48  TAXSET='taxa' 
  49  CODONPOSITIONS='codonpositions' 
  50  DEFAULTNEXUS='#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; ' 
51 -class NexusError(Exception): pass
52
53 -class CharBuffer:
54 """Helps reading NEXUS-words and characters from a buffer."""
55 - def __init__(self,string):
56 if string: 57 self.buffer=list(string) 58 else: 59 self.buffer=[]
60
61 - def peek(self):
62 if self.buffer: 63 return self.buffer[0] 64 else: 65 return None
66
67 - def peek_nonwhitespace(self):
68 b=''.join(self.buffer).strip() 69 if b: 70 return b[0] 71 else: 72 return None
73
74 - def next(self):
75 if self.buffer: 76 return self.buffer.pop(0) 77 else: 78 return None
79
80 - def next_nonwhitespace(self):
81 while True: 82 p=self.next() 83 if p is None: 84 break 85 if p not in WHITESPACE: 86 return p 87 return None
88
89 - def skip_whitespace(self):
90 while self.buffer[0] in WHITESPACE: 91 self.buffer=self.buffer[1:]
92
93 - def next_until(self,target):
94 for t in target: 95 try: 96 pos=self.buffer.index(t) 97 except ValueError: 98 pass 99 else: 100 found=''.join(self.buffer[:pos]) 101 self.buffer=self.buffer[pos:] 102 return found 103 else: 104 return None
105
106 - def peek_word(self,word):
107 return ''.join(self.buffer[:len(word)])==word
108
109 - def next_word(self):
110 """Return the next NEXUS word from a string. 111 112 This deals with single and double quotes, whitespace and punctuation. 113 """ 114 115 word=[] 116 quoted=False 117 first=self.next_nonwhitespace() # get first character 118 if not first: # return empty if only whitespace left 119 return None 120 word.append(first) 121 if first=="'": # word starts with a quote 122 quoted=True 123 elif first in PUNCTUATION: # if it's punctuation, return immediately 124 return first 125 while True: 126 c=self.peek() 127 if c=="'": # a quote? 128 word.append(self.next()) # store quote 129 if self.peek()=="'": # double quote 130 skip=self.next() # skip second quote 131 elif quoted: # second single quote ends word 132 break 133 elif quoted: 134 word.append(self.next()) # if quoted, then add anything 135 elif not c or c in PUNCTUATION or c in WHITESPACE: # if not quoted and special character, stop 136 break 137 else: 138 word.append(self.next()) # standard character 139 return ''.join(word)
140
141 - def rest(self):
142 """Return the rest of the string without parsing.""" 143 return ''.join(self.buffer)
144
145 -class StepMatrix:
146 """Calculate a stepmatrix for weighted parsimony. 147 148 See Wheeler (1990), Cladistics 6:269-275. 149 """ 150
151 - def __init__(self,symbols,gap):
152 self.data={} 153 self.symbols=[s for s in symbols] 154 self.symbols.sort() 155 if gap: 156 self.symbols.append(gap) 157 for x in self.symbols: 158 for y in [s for s in self.symbols if s!=x]: 159 self.set(x,y,0)
160
161 - def set(self,x,y,value):
162 if x>y: 163 x,y=y,x 164 self.data[x+y]=value
165
166 - def add(self,x,y,value):
167 if x>y: 168 x,y=y,x 169 self.data[x+y]+=value
170
171 - def sum(self):
172 return reduce(lambda x,y:x+y,self.data.values())
173
174 - def transformation(self):
175 total=self.sum() 176 if total!=0: 177 for k in self.data: 178 self.data[k]=self.data[k]/float(total) 179 return self
180
181 - def weighting(self):
182 for k in self.data: 183 if self.data[k]!=0: 184 self.data[k]=-math.log(self.data[k]) 185 return self
186
187 - def smprint(self,name='your_name_here'):
188 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols)) 189 matrix+=' %s\n' % ' '.join(self.symbols) 190 for x in self.symbols: 191 matrix+='[%s]'.ljust(8) % x 192 for y in self.symbols: 193 if x==y: 194 matrix+=' . ' 195 else: 196 if x>y: 197 x1,y1=y,x 198 else: 199 x1,y1=x,y 200 if self.data[x1+y1]==0: 201 matrix+='inf. ' 202 else: 203 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1]) 204 matrix+='\n' 205 matrix+=';\n' 206 return matrix
207
208 -def safename(name,mrbayes=False):
209 """Return a taxon identifier according to NEXUS standard. 210 211 Wrap quotes around names with punctuation or whitespace, and double 212 single quotes. 213 214 mrbayes=True: write names without quotes, whitespace or punctuation 215 for the mrbayes software package. 216 """ 217 if mrbayes: 218 safe=name.replace(' ','_') 219 safe=''.join([c for c in safe if c in MRBAYESSAFE]) 220 else: 221 safe=name.replace("'","''") 222 if set(safe).intersection(set(WHITESPACE+PUNCTUATION)): 223 safe="'"+safe+"'" 224 return safe
225
226 -def quotestrip(word):
227 """Remove quotes and/or double quotes around identifiers.""" 228 if not word: 229 return None 230 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')): 231 word=word[1:-1] 232 return word
233
234 -def get_start_end(sequence, skiplist=['-','?']):
235 """Return position of first and last character which is not in skiplist. 236 237 Skiplist defaults to ['-','?']).""" 238 239 length=len(sequence) 240 if length==0: 241 return None,None 242 end=length-1 243 while end>=0 and (sequence[end] in skiplist): 244 end-=1 245 start=0 246 while start<length and (sequence[start] in skiplist): 247 start+=1 248 if start==length and end==-1: # empty sequence 249 return -1,-1 250 else: 251 return start,end
252
253 -def _sort_keys_by_values(p):
254 """Returns a sorted list of keys of p sorted by values of p.""" 255 startpos=[(p[pn],pn) for pn in p if p[pn]] 256 startpos.sort() 257 return zip(*startpos)[1]
258
259 -def _make_unique(l):
260 """Check that all values in list are unique and return a pruned and sorted list.""" 261 l=list(set(l)) 262 l.sort() 263 return l
264
265 -def _unique_label(previous_labels,label):
266 """Returns a unique name if label is already in previous_labels.""" 267 while label in previous_labels: 268 if label.split('.')[-1].startswith('copy'): 269 label='.'.join(label.split('.')[:-1])+'.copy'+str(eval('0'+label.split('.')[-1][4:])+1) 270 else: 271 label+='.copy' 272 return label
273
274 -def _seqmatrix2strmatrix(matrix):
275 """Converts a Seq-object matrix to a plain sequence-string matrix.""" 276 return dict([(t,matrix[t].tostring()) for t in matrix])
277
278 -def _compact4nexus(orig_list):
279 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) 280 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). 281 """ 282 283 if not orig_list: 284 return '' 285 orig_list=list(set(orig_list)) 286 orig_list.sort() 287 shortlist=[] 288 clist=orig_list[:] 289 clist.append(clist[-1]+.5) # dummy value makes it easier 290 while len(clist)>1: 291 step=1 292 for i,x in enumerate(clist): 293 if x==clist[0]+i*step: # are we still in the right step? 294 continue 295 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]: 296 # second element, and possibly at least 3 elements to link, 297 # and the next one is in the right step 298 step=x-clist[0] 299 else: # pattern broke, add all values before current position to new list 300 sub=clist[:i] 301 if len(sub)==1: 302 shortlist.append(str(sub[0]+1)) 303 else: 304 if step==1: 305 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1)) 306 else: 307 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step)) 308 clist=clist[i:] 309 break 310 return ' '.join(shortlist)
311
312 -def combine(matrices):
313 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance. 314 315 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...] 316 Character sets, character partitions and taxon sets are prefixed, readjusted 317 and present in the combined matrix. 318 """ 319 320 if not matrices: 321 return None 322 name=matrices[0][0] 323 combined=copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix 324 mixed_datatypes=(len(set([n[1].datatype for n in matrices]))>1) 325 if mixed_datatypes: 326 combined.datatype='None' # dealing with mixed matrices is application specific. You take care of that yourself! 327 # raise NexusError('Matrices must be of same datatype') 328 combined.charlabels=None 329 combined.statelabels=None 330 combined.interleave=False 331 combined.translate=None 332 333 # rename taxon sets and character sets and name them with prefix 334 for cn,cs in combined.charsets.items(): 335 combined.charsets['%s.%s' % (name,cn)]=cs 336 del combined.charsets[cn] 337 for tn,ts in combined.taxsets.items(): 338 combined.taxsets['%s.%s' % (name,tn)]=ts 339 del combined.taxsets[tn] 340 # previous partitions usually don't make much sense in combined matrix 341 # just initiate one new partition parted by single matrices 342 combined.charpartitions={'combined':{name:range(combined.nchar)}} 343 for n,m in matrices[1:]: # add all other matrices 344 both=[t for t in combined.taxlabels if t in m.taxlabels] 345 combined_only=[t for t in combined.taxlabels if t not in both] 346 m_only=[t for t in m.taxlabels if t not in both] 347 for t in both: 348 # concatenate sequences and unify gap and missing character symbols 349 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) 350 # replace date of missing taxa with symbol for missing data 351 for t in combined_only: 352 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet) 353 for t in m_only: 354 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\ 355 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) 356 combined.taxlabels.extend(m_only) # new taxon list 357 for cn,cs in m.charsets.items(): # adjust character sets for new matrix 358 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs] 359 if m.taxsets: 360 if not combined.taxsets: 361 combined.taxsets={} 362 combined.taxsets.update(dict([('%s.%s' % (n,tn),ts) for tn,ts in m.taxsets.items()])) # update taxon sets 363 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar) # update new charpartition 364 # update charlabels 365 if m.charlabels: 366 if not combined.charlabels: 367 combined.charlabels={} 368 combined.charlabels.update(dict([(combined.nchar+i,label) for (i,label) in m.charlabels.items()])) 369 combined.nchar+=m.nchar # update nchar and ntax 370 combined.ntax+=len(m_only) 371 372 # some prefer partitions, some charsets: 373 # make separate charset for ecah initial dataset 374 for c in combined.charpartitions['combined']: 375 combined.charsets[c]=combined.charpartitions['combined'][c] 376 377 return combined
378
379 -def _kill_comments_and_break_lines(text):
380 """Delete []-delimited comments out of a file and break into lines separated by ';'. 381 382 stripped_text=_kill_comments_and_break_lines(text): 383 Nested and multiline comments are allowed. [ and ] symbols within single 384 or double quotes are ignored, newline ends a quote, all symbols with quotes are 385 treated the same (thus not quoting inside comments like [this character ']' ends a comment]) 386 Special [&...] and [\...] comments remain untouched, if not inside standard comment. 387 Quotes inside special [& and [\ are treated as normal characters, 388 but no nesting inside these special comments allowed (like [& [\ ]]). 389 ';' ist deleted from end of line. 390 391 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus 392 """ 393 contents=CharBuffer(text) 394 newtext=[] 395 newline=[] 396 quotelevel='' 397 speciallevel=False 398 commlevel=0 399 while True: 400 #plain=contents.next_until(["'",'"','[',']','\n',';']) # search for next special character 401 #if not plain: 402 # newline.append(contents.rest) # not found, just add the rest 403 # break 404 #newline.append(plain) # add intermediate text 405 t=contents.next() # and get special character 406 if t is None: 407 break 408 if t==quotelevel and not (commlevel or speciallevel): # matching quote ends quotation 409 quotelevel='' 410 elif not quotelevel and not (commlevel or speciallevel) and (t=='"' or t=="'"): # single or double quote starts quotation 411 quotelevel=t 412 elif not quotelevel and t=='[': # opening bracket outside a quote 413 if contents.peek() in SPECIALCOMMENTS and commlevel==0 and not speciallevel: 414 speciallevel=True 415 else: 416 commlevel+=1 417 elif not quotelevel and t==']': # closing bracket ioutside a quote 418 if speciallevel: 419 speciallevel=False 420 else: 421 commlevel-=1 422 if commlevel<0: 423 raise NexusError('Nexus formatting error: unmatched ]') 424 continue 425 if commlevel==0: # copy if we're not in comment 426 if t==';' and not quotelevel: 427 newtext.append(''.join(newline)) 428 newline=[] 429 else: 430 newline.append(t) 431 #level of comments should be 0 at the end of the file 432 if newline: 433 newtext.append('\n'.join(newline)) 434 if commlevel>0: 435 raise NexusError('Nexus formatting error: unmatched [') 436 return newtext
437 438
439 -def _adjust_lines(lines):
440 """Adjust linebreaks to match ';', strip leading/trailing whitespace. 441 442 list_of_commandlines=_adjust_lines(input_text) 443 Lines are adjusted so that no linebreaks occur within a commandline 444 (except matrix command line) 445 """ 446 formatted_lines=[] 447 for l in lines: 448 #Convert line endings 449 l=l.replace('\r\n','\n').replace('\r','\n').strip() 450 if l.lower().startswith('matrix'): 451 formatted_lines.append(l) 452 else: 453 l=l.replace('\n',' ') 454 if l: 455 formatted_lines.append(l) 456 return formatted_lines
457
458 -def _replace_parenthesized_ambigs(seq,rev_ambig_values):
459 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code.""" 460 461 opening=seq.find('(') 462 while opening>-1: 463 closing=seq.find(')') 464 if closing<0: 465 raise NexusError('Missing closing parenthesis in: '+seq) 466 elif closing<opening: 467 raise NexusError('Missing opening parenthesis in: '+seq) 468 ambig=[x for x in seq[opening+1:closing]] 469 ambig.sort() 470 ambig=''.join(ambig) 471 ambig_code=rev_ambig_values[ambig.upper()] 472 if ambig!=ambig.upper(): 473 ambig_code=ambig_code.lower() 474 seq=seq[:opening]+ambig_code+seq[closing+1:] 475 opening=seq.find('(') 476 return seq
477
478 -class Commandline:
479 """Represent a commandline as command and options.""" 480
481 - def __init__(self, line, title):
482 self.options={} 483 options=[] 484 self.command=None 485 try: 486 #Assume matrix (all other command lines have been stripped of \n) 487 self.command, options = line.strip().split('\n', 1) 488 except ValueError: #Not matrix 489 #self.command,options=line.split(' ',1) #no: could be tab or spaces (translate...) 490 self.command=line.split()[0] 491 options=' '.join(line.split()[1:]) 492 self.command = self.command.strip().lower() 493 if self.command in SPECIAL_COMMANDS: # special command that need newlines and order of options preserved 494 self.options=options.strip() 495 else: 496 if len(options) > 0: 497 try: 498 options = options.replace('=', ' = ').split() 499 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))] 500 indices = [] 501 for sl in valued_indices: 502 indices.extend(sl) 503 token_indices = [n for n in range(len(options)) if n not in indices] 504 for opt in valued_indices: 505 #self.options[options[opt[0]].lower()] = options[opt[2]].lower() 506 self.options[options[opt[0]].lower()] = options[opt[2]] 507 for token in token_indices: 508 self.options[options[token].lower()] = None 509 except ValueError: 510 raise NexusError('Incorrect formatting in line: %s' % line)
511
512 -class Block:
513 """Represent a NEXUS block with block name and list of commandlines."""
514 - def __init__(self,title=None):
515 self.title=title 516 self.commandlines=[]
517
518 -class Nexus(object):
519 520 __slots__=['original_taxon_order','__dict__'] 521
522 - def __init__(self, input=None):
523 self.ntax=0 # number of taxa 524 self.nchar=0 # number of characters 525 self.unaltered_taxlabels=[] # taxlabels as the appear in the input file (incl. duplicates, etc.) 526 self.taxlabels=[] # labels for taxa, ordered by their id 527 self.charlabels=None # ... and for characters 528 self.statelabels=None # ... and for states 529 self.datatype='dna' # (standard), dna, rna, nucleotide, protein 530 self.respectcase=False # case sensitivity 531 self.missing='?' # symbol for missing characters 532 self.gap='-' # symbol for gap 533 self.symbols=None # set of symbols 534 self.equate=None # set of symbol synonyms 535 self.matchchar=None # matching char for matrix representation 536 self.labels=None # left, right, no 537 self.transpose=False # whether matrix is transposed 538 self.interleave=False # whether matrix is interleaved 539 self.tokens=False # unsupported 540 self.eliminate=None # unsupported 541 self.matrix=None # ... 542 self.unknown_blocks=[] # blocks we don't care about 543 self.taxsets={} 544 self.charsets={} 545 self.charpartitions={} 546 self.taxpartitions={} 547 self.trees=[] # list of Trees (instances of Tree class) 548 self.translate=None # Dict to translate taxon <-> taxon numbers 549 self.structured=[] # structured input representation 550 self.set={} # dict of the set command to set various options 551 self.options={} # dict of the options command in the data block 552 self.codonposset=None # name of the charpartition that defines codon positions 553 554 # some defaults 555 self.options['gapmode']='missing' 556 557 if input: 558 self.read(input) 559 else: 560 self.read(DEFAULTNEXUS)
561
562 - def get_original_taxon_order(self):
563 """Included for backwards compatibility (DEPRECATED).""" 564 return self.taxlabels
565 - def set_original_taxon_order(self,value):
566 """Included for backwards compatibility (DEPRECATED).""" 567 self.taxlabels=value
568 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order) 569
570 - def read(self,input):
571 """Read and parse NEXUS imput (a filename, file-handle, or string).""" 572 573 # 1. Assume we have the name of a file in the execution dir 574 # Note we need to add parsing of the path to dir/filename 575 try: 576 file_contents = open(os.path.expanduser(input),'rU').read() 577 self.filename=input 578 except (TypeError,IOError,AttributeError): 579 #2 Assume we have a string from a fh.read() 580 if isinstance(input, str): 581 file_contents = input 582 self.filename='input_string' 583 #3 Assume we have a file object 584 elif hasattr(input,'read'): # file objects or StringIO objects 585 file_contents=input.read() 586 if hasattr(input,"name") and input.name: 587 self.filename=input.name 588 else: 589 self.filename='Unknown_nexus_file' 590 else: 591 print input.strip()[:50] 592 raise NexusError('Unrecognized input: %s ...' % input[:100]) 593 file_contents=file_contents.strip() 594 if file_contents.startswith('#NEXUS'): 595 file_contents=file_contents[6:] 596 if C: 597 decommented=cnexus.scanfile(file_contents) 598 #check for unmatched parentheses 599 if decommented=='[' or decommented==']': 600 raise NexusError('Unmatched %s' % decommented) 601 # cnexus can't return lists, so in analogy we separate commandlines with chr(7) 602 # (a character that shoudn't be part of a nexus file under normal circumstances) 603 commandlines=_adjust_lines(decommented.split(chr(7))) 604 else: 605 commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents)) 606 # get rid of stupid 'NEXUS token - in merged treefiles, this might appear multiple times' 607 for i,cl in enumerate(commandlines): 608 try: 609 if cl[:6].upper()=='#NEXUS': 610 commandlines[i]=cl[6:].strip() 611 except: 612 pass 613 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands 614 nexus_block_gen = self._get_nexus_block(commandlines) 615 while 1: 616 try: 617 title, contents = nexus_block_gen.next() 618 except StopIteration: 619 break 620 if title in KNOWN_NEXUS_BLOCKS: 621 self._parse_nexus_block(title, contents) 622 else: 623 self._unknown_nexus_block(title, contents)
624
625 - def _get_nexus_block(self,file_contents):
626 """Generator for looping through Nexus blocks.""" 627 inblock=False 628 blocklines=[] 629 while file_contents: 630 cl=file_contents.pop(0) 631 if cl.lower().startswith('begin'): 632 if not inblock: 633 inblock=True 634 title=cl.split()[1].lower() 635 else: 636 raise NexusError('Illegal block nesting in block %s' % title) 637 elif cl.lower().startswith('end'): 638 if inblock: 639 inblock=False 640 yield title,blocklines 641 blocklines=[] 642 else: 643 raise NexusError('Unmatched \'end\'.') 644 elif inblock: 645 blocklines.append(cl)
646
647 - def _unknown_nexus_block(self,title, contents):
648 block = Block() 649 block.commandlines.append(contents) 650 block.title = title 651 self.unknown_blocks.append(block)
652
653 - def _parse_nexus_block(self,title, contents):
654 """Parse a known Nexus Block (PRIVATE).""" 655 # attached the structered block representation 656 self._apply_block_structure(title, contents) 657 #now check for taxa,characters,data blocks. If this stuff is defined more than once 658 #the later occurences will override the previous ones. 659 block=self.structured[-1] 660 for line in block.commandlines: 661 try: 662 getattr(self,'_'+line.command)(line.options) 663 except AttributeError: 664 raise 665 raise NexusError('Unknown command: %s ' % line.command)
666
667 - def _title(self,options):
668 pass
669
670 - def _dimensions(self,options):
671 if 'ntax' in options: 672 self.ntax=eval(options['ntax']) 673 if 'nchar' in options: 674 self.nchar=eval(options['nchar'])
675
676 - def _format(self,options):
677 # print options 678 # we first need to test respectcase, then symbols (which depends on respectcase) 679 # then datatype (which, if standard, depends on symbols and respectcase in order to generate 680 # dicts for ambiguous values and alphabet 681 if 'respectcase' in options: 682 self.respectcase=True 683 # adjust symbols to for respectcase 684 if 'symbols' in options: 685 self.symbols=options['symbols'] 686 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\ 687 (self.symbold.startswith("'") and self.symbols.endswith("'")): 688 self.symbols=self.symbols[1:-1].replace(' ','') 689 if not self.respectcase: 690 self.symbols=self.symbols.lower()+self.symbols.upper() 691 self.symbols=list(set(self.symbols)) 692 if 'datatype' in options: 693 self.datatype=options['datatype'].lower() 694 if self.datatype=='dna' or self.datatype=='nucleotide': 695 self.alphabet=copy.deepcopy(IUPAC.ambiguous_dna) 696 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_dna_values) 697 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_dna_letters) 698 elif self.datatype=='rna': 699 self.alphabet=copy.deepcopy(IUPAC.ambiguous_rna) 700 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_rna_values) 701 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_rna_letters) 702 elif self.datatype=='protein': 703 self.alphabet=copy.deepcopy(IUPAC.protein) 704 self.ambiguous_values={'B':'DN','Z':'EQ','X':copy.deepcopy(IUPACData.protein_letters)} # that's how PAUP handles it 705 self.unambiguous_letters=copy.deepcopy(IUPACData.protein_letters)+'*' # stop-codon 706 elif self.datatype=='standard': 707 raise NexusError('Datatype standard is not yet supported.') 708 #self.alphabet=None 709 #self.ambiguous_values={} 710 #if not self.symbols: 711 # self.symbols='01' # if nothing else defined, then 0 and 1 are the default states 712 #self.unambiguous_letters=self.symbols 713 else: 714 raise NexusError('Unsupported datatype: '+self.datatype) 715 self.valid_characters=''.join(self.ambiguous_values.keys())+self.unambiguous_letters 716 if not self.respectcase: 717 self.valid_characters=self.valid_characters.lower()+self.valid_characters.upper() 718 #we have to sort the reverse ambig coding dict key characters: 719 #to be sure that it's 'ACGT':'N' and not 'GTCA':'N' 720 rev=dict([(i[1],i[0]) for i in self.ambiguous_values.items() if i[0]!='X']) 721 self.rev_ambiguous_values={} 722 for (k,v) in rev.items(): 723 key=[c for c in k] 724 key.sort() 725 self.rev_ambiguous_values[''.join(key)]=v 726 #overwrite symbols for datype rna,dna,nucleotide 727 if self.datatype in ['dna','rna','nucleotide']: 728 self.symbols=self.alphabet.letters 729 if self.missing not in self.ambiguous_values: 730 self.ambiguous_values[self.missing]=self.unambiguous_letters+self.gap 731 self.ambiguous_values[self.gap]=self.gap 732 elif self.datatype=='standard': 733 if not self.symbols: 734 self.symbols=['1','0'] 735 if 'missing' in options: 736 self.missing=options['missing'][0] 737 if 'gap' in options: 738 self.gap=options['gap'][0] 739 if 'equate' in options: 740 self.equate=options['equate'] 741 if 'matchchar' in options: 742 self.matchchar=options['matchchar'][0] 743 if 'labels' in options: 744 self.labels=options['labels'] 745 if 'transpose' in options: 746 raise NexusError('TRANSPOSE is not supported!') 747 self.transpose=True 748 if 'interleave' in options: 749 if options['interleave']==None or options['interleave'].lower()=='yes': 750 self.interleave=True 751 if 'tokens' in options: 752 self.tokens=True 753 if 'notokens' in options: 754 self.tokens=False
755 756
757 - def _set(self,options):
758 self.set=options;
759
760 - def _options(self,options):
761 self.options=options;
762
763 - def _eliminate(self,options):
764 self.eliminate=options
765
766 - def _taxlabels(self,options):
767 """Get taxon labels (PRIVATE). 768 769 As the taxon names are already in the matrix, this is superfluous 770 except for transpose matrices, which are currently unsupported anyway. 771 Thus, we ignore the taxlabels command to make handling of duplicate 772 taxon names easier. 773 """ 774 pass
775 #self.taxlabels=[] 776 #opts=CharBuffer(options) 777 #while True: 778 # taxon=quotestrip(opts.next_word()) 779 # if not taxon: 780 # break 781 # self.taxlabels.append(taxon) 782
783 - def _check_taxlabels(self,taxon):
784 """Check for presence of taxon in self.taxlabels.""" 785 # According to NEXUS standard, underscores shall be treated as spaces..., 786 # so checking for identity is more difficult 787 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels]) 788 nexid=taxon.replace(' ','_') 789 return nextaxa.get(nexid)
790
791 - def _charlabels(self,options):
792 self.charlabels={} 793 opts=CharBuffer(options) 794 while True: 795 try: 796 # get id and state 797 w=opts.next_word() 798 if w is None: # McClade saves and reads charlabel-lists with terminal comma?! 799 break 800 identifier=self._resolve(w,set_type=CHARSET) 801 state=quotestrip(opts.next_word()) 802 self.charlabels[identifier]=state 803 # check for comma or end of command 804 c=opts.next_nonwhitespace() 805 if c is None: 806 break 807 elif c!=',': 808 raise NexusError('Missing \',\' in line %s.' % options) 809 except NexusError: 810 raise 811 except: 812 raise NexusError('Format error in line %s.' % options)
813
814 - def _charstatelabels(self,options):
815 # warning: charstatelabels supports only charlabels-syntax! 816 self._charlabels(options)
817
818 - def _statelabels(self,options):
819 #self.charlabels=options 820 #print 'Command statelabels is not supported and will be ignored.' 821 pass
822
823 - def _matrix(self,options):
824 if not self.ntax or not self.nchar: 825 raise NexusError('Dimensions must be specified before matrix!') 826 self.matrix={} 827 taxcount=0 828 first_matrix_block=True 829 830 #eliminate empty lines and leading/trailing whitespace 831 lines=[l.strip() for l in options.split('\n') if l.strip()!=''] 832 lineiter=iter(lines) 833 while 1: 834 try: 835 l=lineiter.next() 836 except StopIteration: 837 if taxcount<self.ntax: 838 raise NexusError('Not enough taxa in matrix.') 839 elif taxcount>self.ntax: 840 raise NexusError('Too many taxa in matrix.') 841 else: 842 break 843 # count the taxa and check for interleaved matrix 844 taxcount+=1 845 ##print taxcount 846 if taxcount>self.ntax: 847 if not self.interleave: 848 raise NexusError('Too many taxa in matrix - should matrix be interleaved?') 849 else: 850 taxcount=1 851 first_matrix_block=False 852 #get taxon name and sequence 853 linechars=CharBuffer(l) 854 id=quotestrip(linechars.next_word()) 855 l=linechars.rest().strip() 856 chars='' 857 if self.interleave: 858 #interleaved matrix 859 #print 'In interleave' 860 if l: 861 chars=''.join(l.split()) 862 else: 863 chars=''.join(lineiter.next().split()) 864 else: 865 #non-interleaved matrix 866 chars=''.join(l.split()) 867 while len(chars)<self.nchar: 868 l=lineiter.next() 869 chars+=''.join(l.split()) 870 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet) 871 #first taxon has the reference sequence if matchhar is used 872 if taxcount==1: 873 refseq=iupac_seq 874 else: 875 if self.matchchar: 876 while 1: 877 p=iupac_seq.tostring().find(self.matchchar) 878 if p==-1: 879 break 880 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet) 881 #check for invalid characters 882 for i,c in enumerate(iupac_seq.tostring()): 883 if c not in self.valid_characters and c!=self.gap and c!=self.missing: 884 raise NexusError('Taxon %s: Illegal character %s in line: %s (check dimensions / interleaving)'\ 885 % (id,c,l[i-10:i+10])) 886 #add sequence to matrix 887 if first_matrix_block: 888 self.unaltered_taxlabels.append(id) 889 id=_unique_label(self.matrix.keys(),id) 890 self.matrix[id]=iupac_seq 891 self.taxlabels.append(id) 892 else: 893 # taxon names need to be in the same order in each interleaved block 894 id=_unique_label(self.taxlabels[:taxcount-1],id) 895 taxon_present=self._check_taxlabels(id) 896 if taxon_present: 897 self.matrix[taxon_present]+=iupac_seq 898 else: 899 raise NexusError('Taxon %s not in first block of interleaved matrix. Check matrix dimensions and interleave.' % id) 900 #check all sequences for length according to nchar 901 for taxon in self.matrix: 902 if len(self.matrix[taxon])!=self.nchar: 903 raise NexusError('Matrx Nchar %d does not match data length (%d) for taxon %s' \ 904 % (self.nchar, len(self.matrix[taxon]),taxon)) 905 #check that taxlabels is identical with matrix.keys. If not, it's a problem 906 matrixkeys=self.matrix.keys() 907 matrixkeys.sort() 908 taxlabelssort=self.taxlabels[:] 909 taxlabelssort.sort() 910 assert matrixkeys==taxlabelssort,"ERROR: TAXLABELS must be identical with MATRIX. Please Report this as a bug, and send in data file."
911
912 - def _translate(self,options):
913 self.translate={} 914 opts=CharBuffer(options) 915 while True: 916 try: 917 # get id and state 918 identifier=int(opts.next_word()) 919 label=quotestrip(opts.next_word()) 920 self.translate[identifier]=label 921 # check for comma or end of command 922 c=opts.next_nonwhitespace() 923 if c is None: 924 break 925 elif c!=',': 926 raise NexusError('Missing \',\' in line %s.' % options) 927 except NexusError: 928 raise 929 except: 930 raise NexusError('Format error in line %s.' % options)
931
932 - def _utree(self,options):
933 """Some software (clustalx) uses 'utree' to denote an unrooted tree.""" 934 self._tree(options)
935
936 - def _tree(self,options):
937 opts=CharBuffer(options) 938 name=opts.next_word() 939 if opts.next_nonwhitespace()!='=': 940 raise NexusError('Syntax error in tree description: %s' \ 941 % options[:50]) 942 rooted=False 943 weight=1.0 944 while opts.peek_nonwhitespace()=='[': 945 open=opts.next_nonwhitespace() 946 symbol=opts.next() 947 if symbol!='&': 948 raise NexusError('Illegal special comment [%s...] in tree description: %s' \ 949 % (symbol, options[:50])) 950 special=opts.next() 951 value=opts.next_until(']') 952 closing=opts.next() 953 if special=='R': 954 rooted=True 955 elif special=='U': 956 rooted=False 957 elif special=='W': 958 weight=float(value) 959 tree=Tree(name=name,weight=weight,rooted=rooted,tree=opts.rest().strip()) 960 # if there's an active translation table, translate 961 if self.translate: 962 for n in tree.get_terminals(): 963 try: 964 tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)]) 965 except (ValueError,KeyError): 966 raise NexusError('Unable to substitue %s using \'translate\' data.' \ 967 % tree.node(n).data.taxon) 968 self.trees.append(tree)
969
970 - def _apply_block_structure(self,title,lines):
971 block=Block('') 972 block.title = title 973 for line in lines: 974 block.commandlines.append(Commandline(line, title)) 975 self.structured.append(block)
976
977 - def _taxset(self, options):
978 name,taxa=self._get_indices(options,set_type=TAXSET) 979 self.taxsets[name]=_make_unique(taxa)
980
981 - def _charset(self, options):
982 name,sites=self._get_indices(options,set_type=CHARSET) 983 self.charsets[name]=_make_unique(sites)
984
985 - def _taxpartition(self, options):
986 taxpartition={} 987 quotelevel=False 988 opts=CharBuffer(options) 989 name=self._name_n_vector(opts) 990 if not name: 991 raise NexusError('Formatting error in taxpartition: %s ' % options) 992 # now collect thesubbpartitions and parse them 993 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 994 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words 995 sub='' 996 while True: 997 w=opts.next() 998 if w is None or (w==',' and not quotelevel): 999 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':') 1000 taxpartition[subname]=_make_unique(subindices) 1001 sub='' 1002 if w is None: 1003 break 1004 else: 1005 if w=="'": 1006 quotelevel=not quotelevel 1007 sub+=w 1008 self.taxpartitions[name]=taxpartition
1009
1010 - def _codonposset(self,options):
1011 """Read codon positions from a codons block as written from McClade. 1012 1013 Here codonposset is just a fancy name for a character partition with 1014 the name CodonPositions and the partitions N,1,2,3 1015 """ 1016 1017 prev_partitions=self.charpartitions.keys() 1018 self._charpartition(options) 1019 # mcclade calls it CodonPositions, but you never know... 1020 codonname=[n for n in self.charpartitions.keys() if n not in prev_partitions] 1021 if codonname==[] or len(codonname)>1: 1022 raise NexusError('Formatting Error in codonposset: %s ' % options) 1023 else: 1024 self.codonposset=codonname[0]
1025
1026 - def _codeset(self,options):
1027 pass
1028
1029 - def _charpartition(self, options):
1030 charpartition={} 1031 quotelevel=False 1032 opts=CharBuffer(options) 1033 name=self._name_n_vector(opts) 1034 if not name: 1035 raise NexusError('Formatting error in charpartition: %s ' % options) 1036 # now collect thesubbpartitions and parse them 1037 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 1038 sub='' 1039 while True: 1040 w=opts.next() 1041 if w is None or (w==',' and not quotelevel): 1042 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':') 1043 charpartition[subname]=_make_unique(subindices) 1044 sub='' 1045 if w is None: 1046 break 1047 else: 1048 if w=="'": 1049 quotelevel=not quotelevel 1050 sub+=w 1051 self.charpartitions[name]=charpartition
1052
1053 - def _get_indices(self,options,set_type=CHARSET,separator='='):
1054 """Parse the taxset/charset specification (PRIVATE). 1055 1056 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3' 1057 --> [0,1,2,3,4,'dog','cat',9,12,15,18] 1058 """ 1059 opts=CharBuffer(options) 1060 name=self._name_n_vector(opts,separator=separator) 1061 indices=self._parse_list(opts,set_type=set_type) 1062 if indices is None: 1063 raise NexusError('Formatting error in line: %s ' % options) 1064 return name,indices
1065
1066 - def _name_n_vector(self,opts,separator='='):
1067 """Extract name and check that it's not in vector format.""" 1068 rest=opts.rest() 1069 name=opts.next_word() 1070 # we ignore * before names 1071 if name=='*': 1072 name=opts.next_word() 1073 if not name: 1074 raise NexusError('Formatting error in line: %s ' % rest) 1075 name=quotestrip(name) 1076 if opts.peek_nonwhitespace=='(': 1077 open=opts.next_nonwhitespace() 1078 qualifier=open.next_word() 1079 close=opts.next_nonwhitespace() 1080 if qualifier.lower()=='vector': 1081 raise NexusError('Unsupported VECTOR format in line %s' \ 1082 % (options)) 1083 elif qualifier.lower()!='standard': 1084 raise NexusError('Unknown qualifier %s in line %s' \ 1085 % (qualifier,options)) 1086 if opts.next_nonwhitespace()!=separator: 1087 raise NexusError('Formatting error in line: %s ' % rest) 1088 return name
1089
1090 - def _parse_list(self,options_buffer,set_type):
1091 """Parse a NEXUS list (PRIVATE). 1092 1093 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21], 1094 (assuming dog is taxon no. 17 and cat is taxon no. 21). 1095 """ 1096 plain_list=[] 1097 if options_buffer.peek_nonwhitespace(): 1098 try: # capture all possible exceptions and treat them as formatting erros, if they are not NexusError 1099 while True: 1100 identifier=options_buffer.next_word() # next list element 1101 if not identifier: # end of list? 1102 break 1103 start=self._resolve(identifier,set_type=set_type) 1104 if options_buffer.peek_nonwhitespace()=='-': # followd by - 1105 end=start 1106 step=1 1107 # get hyphen and end of range 1108 hyphen=options_buffer.next_nonwhitespace() 1109 end=self._resolve(options_buffer.next_word(),set_type=set_type) 1110 if set_type==CHARSET: 1111 if options_buffer.peek_nonwhitespace()=='\\': # followd by \ 1112 backslash=options_buffer.next_nonwhitespace() 1113 step=int(options_buffer.next_word()) # get backslash and step 1114 plain_list.extend(range(start,end+1,step)) 1115 else: 1116 if type(start)==list or type(end)==list: 1117 raise NexusError('Name if character sets not allowed in range definition: %s'\ 1118 % identifier) 1119 start=self.taxlabels.index(start) 1120 end=self.taxlabels.index(end) 1121 taxrange=self.taxlabels[start:end+1] 1122 plain_list.extend(taxrange) 1123 else: 1124 if type(start)==list: # start was the name of charset or taxset 1125 plain_list.extend(start) 1126 else: # start was an ordinary identifier 1127 plain_list.append(start) 1128 except NexusError: 1129 raise 1130 except: 1131 return None 1132 return plain_list
1133
1134 - def _resolve(self,identifier,set_type=None):
1135 """Translate identifier in list into character/taxon index. 1136 1137 Characters (which are referred to by their index in Nexus.py): 1138 Plain numbers are returned minus 1 (Nexus indices to python indices) 1139 Text identifiers are translaterd into their indices (if plain character indentifiers), 1140 the first hit in charlabels is returned (charlabels don't need to be unique) 1141 or the range of indices is returned (if names of character sets). 1142 Taxa (which are referred to by their unique name in Nexus.py): 1143 Plain numbers are translated in their taxon name, underscores and spaces are considered equal. 1144 Names are returned unchanged (if plain taxon identifiers), or the names in 1145 the corresponding taxon set is returned 1146 """ 1147 identifier=quotestrip(identifier) 1148 if not set_type: 1149 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.') 1150 if set_type==CHARSET: 1151 try: 1152 n=int(identifier) 1153 except ValueError: 1154 if self.charlabels and identifier in self.charlabels.values(): 1155 for k in self.charlabels: 1156 if self.charlabels[k]==identifier: 1157 return k 1158 elif self.charsets and identifier in self.charsets: 1159 return self.charsets[identifier] 1160 else: 1161 raise NexusError('Unknown character identifier: %s' \ 1162 % identifier) 1163 else: 1164 if n<=self.nchar: 1165 return n-1 1166 else: 1167 raise NexusError('Illegal character identifier: %d>nchar (=%d).' \ 1168 % (identifier,self.nchar)) 1169 elif set_type==TAXSET: 1170 try: 1171 n=int(identifier) 1172 except ValueError: 1173 taxlabels_id=self._check_taxlabels(identifier) 1174 if taxlabels_id: 1175 return taxlabels_id 1176 elif self.taxsets and identifier in self.taxsets: 1177 return self.taxsets[identifier] 1178 else: 1179 raise NexusError('Unknown taxon identifier: %s' \ 1180 % identifier) 1181 else: 1182 if n>0 and n<=self.ntax: 1183 return self.taxlabels[n-1] 1184 else: 1185 raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' \ 1186 % (identifier,self.ntax)) 1187 else: 1188 raise NexusError('Unknown set specification: %s.'% set_type)
1189
1190 - def _stateset(self, options):
1191 #Not implemented 1192 pass
1193
1194 - def _changeset(self, options):
1195 #Not implemented 1196 pass
1197
1198 - def _treeset(self, options):
1199 #Not implemented 1200 pass
1201
1202 - def _treepartition(self, options):
1203 #Not implemented 1204 pass
1205
1206 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False, 1207 exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
1208 """Writes a nexus file for each partition in charpartition. 1209 1210 Only non-excluded characters and non-deleted taxa are included, 1211 just the data block is written. 1212 """ 1213 1214 if not matrix: 1215 matrix=self.matrix 1216 if not matrix: 1217 return 1218 if not filename: 1219 filename=self.filename 1220 if charpartition: 1221 pfilenames={} 1222 for p in charpartition: 1223 total_exclude=[]+exclude 1224 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]]) 1225 total_exclude=_make_unique(total_exclude) 1226 pcomment=comment+'\nPartition: '+p+'\n' 1227 dot=filename.rfind('.') 1228 if dot>0: 1229 pfilename=filename[:dot]+'_'+p+'.data' 1230 else: 1231 pfilename=filename+'_'+p 1232 pfilenames[p]=pfilename 1233 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize, 1234 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False, 1235 mrbayes=mrbayes) 1236 return pfilenames 1237 else: 1238 fn=self.filename+'.data' 1239 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave, 1240 exclude=exclude,delete=delete,comment=comment,append_sets=False, 1241 mrbayes=mrbayes) 1242 return fn
1243
1244 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\ 1245 blocksize=None, interleave=False, interleave_by_partition=False,\ 1246 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False,\ 1247 codons_block=True):
1248 """Writes a nexus file with data and sets block to a file or handle. 1249 1250 Character sets and partitions are appended by default, and are 1251 adjusted according to excluded characters (i.e. character sets 1252 still point to the same sites (not necessarily same positions), 1253 without including the deleted characters. 1254 1255 filename - Either a filename as a string (which will be opened, 1256 written to and closed), or a handle object (which will 1257 be written to but NOT closed). 1258 omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the 1259 start of the file is ommited. 1260 1261 Returns the filename/handle used to write the data. 1262 """ 1263 if not matrix: 1264 matrix=self.matrix 1265 if not matrix: 1266 return 1267 if not filename: 1268 filename=self.filename 1269 if [t for t in delete if not self._check_taxlabels(t)]: 1270 raise NexusError('Unknown taxa: %s' \ 1271 % ', '.join(set(delete).difference(set(self.taxlabels)))) 1272 if interleave_by_partition: 1273 if not interleave_by_partition in self.charpartitions: 1274 raise NexusError('Unknown partition: '+interleave_by_partition) 1275 else: 1276 partition=self.charpartitions[interleave_by_partition] 1277 # we need to sort the partition names by starting position before we exclude characters 1278 names=_sort_keys_by_values(partition) 1279 newpartition={} 1280 for p in partition: 1281 newpartition[p]=[c for c in partition[p] if c not in exclude] 1282 # how many taxa and how many characters are left? 1283 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete] 1284 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete)) 1285 ntax_adjusted=len(undelete) 1286 nchar_adjusted=len(cropped_matrix[undelete[0]]) 1287 if not undelete or (undelete and undelete[0]==''): 1288 return 1289 if isinstance(filename,str): 1290 try: 1291 fh=open(filename,'w') 1292 except IOError: 1293 raise NexusError('Could not open %s for writing.' % filename) 1294 elif hasattr(file, "write"): 1295 fh=filename 1296 else : 1297 raise ValueError("Neither a filename nor a handle was supplied") 1298 if not omit_NEXUS: 1299 fh.write('#NEXUS\n') 1300 if comment: 1301 fh.write('['+comment+']\n') 1302 fh.write('begin data;\n') 1303 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted)) 1304 fh.write('\tformat datatype='+self.datatype) 1305 if self.respectcase: 1306 fh.write(' respectcase') 1307 if self.missing: 1308 fh.write(' missing='+self.missing) 1309 if self.gap: 1310 fh.write(' gap='+self.gap) 1311 if self.matchchar: 1312 fh.write(' matchchar='+self.matchchar) 1313 if self.labels: 1314 fh.write(' labels='+self.labels) 1315 if self.equate: 1316 fh.write(' equate='+self.equate) 1317 if interleave or interleave_by_partition: 1318 fh.write(' interleave') 1319 fh.write(';\n') 1320 #if self.taxlabels: 1321 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n') 1322 if self.charlabels: 1323 newcharlabels=self._adjust_charlabels(exclude=exclude) 1324 clkeys=newcharlabels.keys() 1325 clkeys.sort() 1326 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n') 1327 fh.write('matrix\n') 1328 if not blocksize: 1329 if interleave: 1330 blocksize=70 1331 else: 1332 blocksize=self.nchar 1333 # delete deleted taxa and ecxclude excluded characters... 1334 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete]) 1335 if interleave_by_partition: 1336 # interleave by partitions, but adjust partitions with regard to excluded characters 1337 seek=0 1338 for p in names: 1339 fh.write('[%s: %s]\n' % (interleave_by_partition,p)) 1340 if len(newpartition[p])>0: 1341 for taxon in undelete: 1342 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1343 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n') 1344 fh.write('\n') 1345 else: 1346 fh.write('[empty]\n\n') 1347 seek+=len(newpartition[p]) 1348 elif interleave: 1349 for seek in range(0,nchar_adjusted,blocksize): 1350 for taxon in undelete: 1351 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1352 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') 1353 fh.write('\n') 1354 else: 1355 for taxon in undelete: 1356 if blocksize<nchar_adjusted: 1357 fh.write(safename(taxon,mrbayes=mrbayes)+'\n') 1358 else: 1359 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1360 for seek in range(0,nchar_adjusted,blocksize): 1361 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') 1362 fh.write(';\nend;\n') 1363 if append_sets: 1364 if codons_block: 1365 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,include_codons=False)) 1366 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,codons_only=True)) 1367 else: 1368 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes)) 1369 1370 if fh == filename : 1371 #We were given the handle, don't close it. 1372 return filename 1373 else : 1374 #We opened the handle, so we should close it. 1375 fh.close() 1376 return filename
1377
1378 - def append_sets(self,exclude=[],delete=[],mrbayes=False,include_codons=True,codons_only=False):
1379 """Returns a sets block.""" 1380 if not self.charsets and not self.taxsets and not self.charpartitions: 1381 return '' 1382 if codons_only: 1383 setsb=['\nbegin codons'] 1384 else: 1385 setsb=['\nbegin sets'] 1386 # - now if characters have been excluded, the character sets need to be adjusted, 1387 # so that they still point to the right character positions 1388 # calculate a list of offsets: for each deleted character, the following character position 1389 # in the new file will have an additional offset of -1 1390 offset=0 1391 offlist=[] 1392 for c in range(self.nchar): 1393 if c in exclude: 1394 offset+=1 1395 offlist.append(-1) # dummy value as these character positions are excluded 1396 else: 1397 offlist.append(c-offset) 1398 # now adjust each of the character sets 1399 if not codons_only: 1400 for n,ns in self.charsets.items(): 1401 cset=[offlist[c] for c in ns if c not in exclude] 1402 if cset: 1403 setsb.append('charset %s = %s' % (safename(n),_compact4nexus(cset))) 1404 for n,s in self.taxsets.items(): 1405 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete] 1406 if tset: 1407 setsb.append('taxset %s = %s' % (safename(n),' '.join(tset))) 1408 for n,p in self.charpartitions.items(): 1409 if not include_codons and n==CODONPOSITIONS: 1410 continue 1411 elif codons_only and n!=CODONPOSITIONS: 1412 continue 1413 # as characters have been excluded, the partitions must be adjusted 1414 # if a partition is empty, it will be omitted from the charpartition command 1415 # (although paup allows charpartition part=t1:,t2:,t3:1-100) 1416 names=_sort_keys_by_values(p) 1417 newpartition={} 1418 for sn in names: 1419 nsp=[offlist[c] for c in p[sn] if c not in exclude] 1420 if nsp: 1421 newpartition[sn]=nsp 1422 if newpartition: 1423 if include_codons and n==CODONPOSITIONS: 1424 command='codonposset' 1425 else: 1426 command='charpartition' 1427 setsb.append('%s %s = %s' % (command,safename(n),\ 1428 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition]))) 1429 # now write charpartititions, much easier than charpartitions 1430 for n,p in self.taxpartitions.items(): 1431 names=_sort_keys_by_values(p) 1432 newpartition={} 1433 for sn in names: 1434 nsp=[t for t in p[sn] if t not in delete] 1435 if nsp: 1436 newpartition[sn]=nsp 1437 if newpartition: 1438 setsb.append('taxpartition %s = %s' % (safename(n),\ 1439 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition]))) 1440 # add 'end' and return everything 1441 setsb.append('end;\n') 1442 if len(setsb)==2: # begin and end only 1443 return '' 1444 else: 1445 return ';\n'.join(setsb)
1446
1447 - def export_fasta(self, filename=None, width=70):
1448 """Writes matrix into a fasta file: (self, filename=None, width=70).""" 1449 if not filename: 1450 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']: 1451 filename='.'.join(self.filename.split('.')[:-1])+'.fas' 1452 else: 1453 filename=self.filename+'.fas' 1454 fh=open(filename,'w') 1455 for taxon in self.taxlabels: 1456 fh.write('>'+safename(taxon)+'\n') 1457 for i in range(0, len(self.matrix[taxon].tostring()), width): 1458 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n') 1459 fh.close() 1460 return filename
1461
1462 - def export_phylip(self, filename=None):
1463 """Writes matrix into a PHYLIP file: (self, filename=None). 1464 1465 Note that this writes a relaxed PHYLIP format file, where the names 1466 are not truncated, nor checked for invalid characters.""" 1467 if not filename: 1468 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']: 1469 filename='.'.join(self.filename.split('.')[:-1])+'.phy' 1470 else: 1471 filename=self.filename+'.phy' 1472 fh=open(filename,'w') 1473 fh.write('%d %d\n' % (self.ntax,self.nchar)) 1474 for taxon in self.taxlabels: 1475 fh.write('%s %s\n' % (safename(taxon),self.matrix[taxon].tostring())) 1476 fh.close() 1477 return filename
1478
1479 - def constant(self,matrix=None,delete=[],exclude=[]):
1480 """Return a list with all constant characters.""" 1481 if not matrix: 1482 matrix=self.matrix 1483 undelete=[t for t in self.taxlabels if t in matrix and t not in delete] 1484 if not undelete: 1485 return None 1486 elif len(undelete)==1: 1487 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] 1488 # get the first sequence and expand all ambiguous values 1489 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for 1490 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude] 1491 for taxon in undelete[1:]: 1492 newconstant=[] 1493 for site in constant: 1494 #print '%d (paup=%d)' % (site[0],site[0]+1), 1495 seqsite=matrix[taxon][site[0]].upper() 1496 #print seqsite,'checked against',site[1],'\t', 1497 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]: 1498 # missing or same as before -> ok 1499 newconstant.append(site) 1500 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap): 1501 # subset of an ambig or only missing in previous -> take subset 1502 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite))) 1503 elif seqsite in self.ambiguous_values: # is it an ambig: check the intersection with prev. values 1504 intersect=sets.set(self.ambiguous_values[seqsite]).intersection(sets.set(site[1])) 1505 if intersect: 1506 newconstant.append((site[0],''.join(intersect))) 1507 # print 'ok' 1508 #else: 1509 # print 'failed' 1510 #else: 1511 # print 'failed' 1512 constant=newconstant 1513 cpos=[s[0] for s in constant] 1514 return constant
1515 # return [x[0] for x in constant] 1516
1517 - def cstatus(self,site,delete=[],narrow=True):
1518 """Summarize character. 1519 1520 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?) 1521 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -) 1522 """ 1523 undelete=[t for t in self.taxlabels if t not in delete] 1524 if not undelete: 1525 return None 1526 cstatus=[] 1527 for t in undelete: 1528 c=self.matrix[t][site].upper() 1529 if self.options.get('gapmode')=='missing' and c==self.gap: 1530 c=self.missing 1531 if narrow and c==self.missing: 1532 if c not in cstatus: 1533 cstatus.append(c) 1534 else: 1535 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus]) 1536 if self.missing in cstatus and narrow and len(cstatus)>1: 1537 cstatus=[c for c in cstatus if c!=self.missing] 1538 cstatus.sort() 1539 return cstatus
1540
1541 - def weighted_stepmatrix(self,name='your_name_here',exclude=[],delete=[]):
1542 """Calculates a stepmatrix for weighted parsimony. 1543 1544 See Wheeler (1990), Cladistics 6:269-275 and 1545 Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196 1546 """ 1547 m=StepMatrix(self.unambiguous_letters,self.gap) 1548 for site in [s for s in range(self.nchar) if s not in exclude]: 1549 cstatus=self.cstatus(site,delete) 1550 for i,b1 in enumerate(cstatus[:-1]): 1551 for b2 in cstatus[i+1:]: 1552 m.add(b1.upper(),b2.upper(),1) 1553 return m.transformation().weighting().smprint(name=name)
1554
1555 - def crop_matrix(self,matrix=None, delete=[], exclude=[]):
1556 """Return a matrix without deleted taxa and excluded characters.""" 1557 if not matrix: 1558 matrix=self.matrix 1559 if [t for t in delete if not self._check_taxlabels(t)]: 1560 raise NexusError('Unknown taxa: %s' \ 1561 % ', '.join(sets.set(delete).difference(self.taxlabels))) 1562 if exclude!=[]: 1563 undelete=[t for t in self.taxlabels if t in matrix and t not in delete] 1564 if not undelete: 1565 return {} 1566 m=[matrix[k].tostring() for k in undelete] 1567 zipped_m=zip(*m) 1568 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude] 1569 if sitesm==[]: 1570 return dict([(t,Seq('',self.alphabet)) for t in undelete]) 1571 else: 1572 zipped_sitesm=zip(*sitesm) 1573 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)] 1574 return dict(zip(undelete,m)) 1575 else: 1576 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
1577
1578 - def bootstrap(self,matrix=None,delete=[],exclude=[]):
1579 """Return a bootstrapped matrix.""" 1580 if not matrix: 1581 matrix=self.matrix 1582 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq) # remember if Seq objects 1583 cm=self.crop_matrix(delete=delete,exclude=exclude) # crop data out 1584 if not cm: # everything deleted? 1585 return {} 1586 elif len(cm[cm.keys()[0]])==0: # everything excluded? 1587 return cm 1588 undelete=[t for t in self.taxlabels if t in cm] 1589 if seqobjects: 1590 sitesm=zip(*[cm[t].tostring() for t in undelete]) 1591 alphabet=matrix[matrix.keys()[0]].alphabet 1592 else: 1593 sitesm=zip(*[cm[t] for t in undelete]) 1594 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))] 1595 bootstrapseqs=map(''.join,zip(*bootstrapsitesm)) 1596 if seqobjects: 1597 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs] 1598 return dict(zip(undelete,bootstrapseqs))
1599
1600 - def add_sequence(self,name,sequence):
1601 """Adds a sequence (string) to the matrix.""" 1602 1603 if not name: 1604 raise NexusError('New sequence must have a name') 1605 1606 diff=self.nchar-len(sequence) 1607 if diff<0: 1608 self.insert_gap(self.nchar,-diff) 1609 elif diff>0: 1610 sequence+=self.missing*diff 1611 1612 if name in self.taxlabels: 1613 unique_name=_unique_label(self.taxlabels,name) 1614 #print "WARNING: Sequence name %s is already present. Sequence was added as %s." % (name,unique_name) 1615 else: 1616 unique_name=name 1617 1618 assert unique_name not in self.matrix.keys(), "ERROR. There is a discrepancy between taxlabels and matrix keys. Report this as a bug." 1619 1620 self.matrix[unique_name]=Seq(sequence,self.alphabet) 1621 self.ntax+=1 1622 self.taxlabels.append(unique_name) 1623 self.unaltered_taxlabels.append(name)
1624
1625 - def insert_gap(self,pos,n=1,leftgreedy=False):
1626 """Add a gap into the matrix and adjust charsets and partitions. 1627 1628 pos=0: first position 1629 pos=nchar: last position 1630 """ 1631 1632 def _adjust(set,x,d,leftgreedy=False): 1633 """Adjusts chartacter sets if gaps are inserted, taking care of 1634 new gaps within a coherent character set.""" 1635 # if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3 8 9 10 11 13 14 15 1636 # then the adjusted set will be 1 2 3 8 9 10 11 12 13 14 15 16 17 18 1637 # but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18 1638 set.sort() 1639 addpos=0 1640 for i,c in enumerate(set): 1641 if c>=x: 1642 set[i]=c+d 1643 # if we add gaps within a group of characters, we want the gap position included in this group 1644 if c==x: 1645 if leftgreedy or (i>0 and set[i-1]==c-1): 1646 addpos=i 1647 if addpos>0: 1648 set[addpos:addpos]=range(x,x+d) 1649 return set
1650 1651 if pos<0 or pos>self.nchar: 1652 raise NexusError('Illegal gap position: %d' % pos) 1653 if n==0: 1654 return 1655 if self.taxlabels : 1656 #python 2.3 does not support zip(*[]) 1657 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) 1658 else : 1659 sitesm=[] 1660 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n 1661 # #self.matrix=dict([(taxon,Seq(map(''.join,zip(*sitesm))[i],self.alphabet)) for\ 1662 # i,taxon in enumerate(self.taxlabels)]) 1663 zipped=zip(*sitesm) 1664 mapped=map(''.join,zipped) 1665 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)] 1666 self.matrix=dict(listed) 1667 self.nchar+=n 1668 # now adjust character sets 1669 for i,s in self.charsets.items(): 1670 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy) 1671 for p in self.charpartitions: 1672 for sp,s in self.charpartitions[p].items(): 1673 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy) 1674 # now adjust character state labels 1675 self.charlabels=self._adjust_charlabels(insert=[pos]*n) 1676 return self.charlabels 1677
1678 - def _adjust_charlabels(self,exclude=None,insert=None):
1679 """Return adjusted indices of self.charlabels if characters are excluded or inserted.""" 1680 if exclude and insert: 1681 raise NexusError('Can\'t exclude and insert at the same time') 1682 if not self.charlabels: 1683 return None 1684 labels=self.charlabels.keys() 1685 labels.sort() 1686 newcharlabels={} 1687 if exclude: 1688 exclude.sort() 1689 exclude.append(sys.maxint) 1690 excount=0 1691 for c in labels: 1692 if not c in exclude: 1693 while c>exclude[excount]: 1694 excount+=1 1695 newcharlabels[c-excount]=self.charlabels[c] 1696 elif insert: 1697 insert.sort() 1698 insert.append(sys.maxint) 1699 icount=0 1700 for c in labels: 1701 while c>=insert[icount]: 1702 icount+=1 1703 newcharlabels[c+icount]=self.charlabels[c] 1704 else: 1705 return self.charlabels 1706 return newcharlabels
1707
1708 - def invert(self,charlist):
1709 """Returns all character indices that are not in charlist.""" 1710 return [c for c in range(self.nchar) if c not in charlist]
1711
1712 - def gaponly(self,include_missing=False):
1713 """Return gap-only sites.""" 1714 gap=set(self.gap) 1715 if include_missing: 1716 gap.add(self.missing) 1717 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) 1718 gaponly=[i for i,site in enumerate(sitesm) if set(site).issubset(gap)] 1719 return gaponly
1720
1721 - def terminal_gap_to_missing(self,missing=None,skip_n=True):
1722 """Replaces all terminal gaps with missing character. 1723 1724 Mixtures like ???------??------- are properly resolved.""" 1725 1726 if not missing: 1727 missing=self.missing 1728 replace=[self.missing,self.gap] 1729 if not skip_n: 1730 replace.extend(['n','N']) 1731 for taxon in self.taxlabels: 1732 sequence=self.matrix[taxon].tostring() 1733 length=len(sequence) 1734 start,end=get_start_end(sequence,skiplist=replace) 1735 if start==-1 and end==-1: 1736 sequence=missing*length 1737 else: 1738 sequence=sequence[:end+1]+missing*(length-end-1) 1739 sequence=start*missing+sequence[start:] 1740 assert length==len(sequence), 'Illegal sequence manipulation in Nexus.termial_gap_to_missing in taxon %s' % taxon 1741 self.matrix[taxon]=Seq(sequence,self.alphabet)
1742