Package Bio :: Package Nexus :: Module Nexus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Nexus.Nexus

   1  # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # 
   6  # Bug reports welcome: fkauff@biologie.uni-kl.de or on Biopython's bugzilla. 
   7  """Nexus class. Parse the contents of a NEXUS file. 
   8   
   9  Based upon 'NEXUS: An extensible file format for systematic information' 
  10  Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 
  11  """ 
  12   
  13  import os,sys, math, random, copy 
  14   
  15  from Bio.Alphabet import IUPAC 
  16  from Bio.Data import IUPACData 
  17  from Bio.Seq import Seq 
  18   
  19  from Trees import Tree,NodeData 
  20   
  21  try: 
  22      import cnexus 
  23  except ImportError: 
  24      C=False 
  25  else: 
  26      C=True 
  27   
  28  INTERLEAVE=70 
  29  SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\ 
  30          'matrix','tree', 'utree','translate','codonposset','title'] 
  31  KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets','codons'] 
  32  PUNCTUATION='()[]{}/\,;:=*\'"`+-<>' 
  33  MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_' 
  34  WHITESPACE=' \t\n' 
  35  #SPECIALCOMMENTS=['!','&','%','/','\\','@'] #original list of special comments 
  36  SPECIALCOMMENTS=['&'] # supported special comment ('tree' command), all others are ignored 
  37  CHARSET='chars' 
  38  TAXSET='taxa' 
  39  CODONPOSITIONS='codonpositions' 
  40  DEFAULTNEXUS='#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; ' 
41 -class NexusError(Exception): pass
42
43 -class CharBuffer:
44 """Helps reading NEXUS-words and characters from a buffer."""
45 - def __init__(self,string):
46 if string: 47 self.buffer=list(string) 48 else: 49 self.buffer=[]
50
51 - def peek(self):
52 if self.buffer: 53 return self.buffer[0] 54 else: 55 return None
56
57 - def peek_nonwhitespace(self):
58 b=''.join(self.buffer).strip() 59 if b: 60 return b[0] 61 else: 62 return None
63
64 - def next(self):
65 if self.buffer: 66 return self.buffer.pop(0) 67 else: 68 return None
69
70 - def next_nonwhitespace(self):
71 while True: 72 p=self.next() 73 if p is None: 74 break 75 if p not in WHITESPACE: 76 return p 77 return None
78
79 - def skip_whitespace(self):
80 while self.buffer[0] in WHITESPACE: 81 self.buffer=self.buffer[1:]
82
83 - def next_until(self,target):
84 for t in target: 85 try: 86 pos=self.buffer.index(t) 87 except ValueError: 88 pass 89 else: 90 found=''.join(self.buffer[:pos]) 91 self.buffer=self.buffer[pos:] 92 return found 93 else: 94 return None
95
96 - def peek_word(self,word):
97 return ''.join(self.buffer[:len(word)])==word
98
99 - def next_word(self):
100 """Return the next NEXUS word from a string. 101 102 This deals with single and double quotes, whitespace and punctuation. 103 """ 104 105 word=[] 106 quoted=False 107 first=self.next_nonwhitespace() # get first character 108 if not first: # return empty if only whitespace left 109 return None 110 word.append(first) 111 if first=="'": # word starts with a quote 112 quoted=True 113 elif first in PUNCTUATION: # if it's punctuation, return immediately 114 return first 115 while True: 116 c=self.peek() 117 if c=="'": # a quote? 118 word.append(self.next()) # store quote 119 if self.peek()=="'": # double quote 120 skip=self.next() # skip second quote 121 elif quoted: # second single quote ends word 122 break 123 elif quoted: 124 word.append(self.next()) # if quoted, then add anything 125 elif not c or c in PUNCTUATION or c in WHITESPACE: # if not quoted and special character, stop 126 break 127 else: 128 word.append(self.next()) # standard character 129 return ''.join(word)
130
131 - def rest(self):
132 """Return the rest of the string without parsing.""" 133 return ''.join(self.buffer)
134
135 -class StepMatrix:
136 """Calculate a stepmatrix for weighted parsimony. 137 138 See Wheeler (1990), Cladistics 6:269-275. 139 """ 140
141 - def __init__(self,symbols,gap):
142 self.data={} 143 self.symbols=[s for s in symbols] 144 self.symbols.sort() 145 if gap: 146 self.symbols.append(gap) 147 for x in self.symbols: 148 for y in [s for s in self.symbols if s!=x]: 149 self.set(x,y,0)
150
151 - def set(self,x,y,value):
152 if x>y: 153 x,y=y,x 154 self.data[x+y]=value
155
156 - def add(self,x,y,value):
157 if x>y: 158 x,y=y,x 159 self.data[x+y]+=value
160
161 - def sum(self):
162 return reduce(lambda x,y:x+y,self.data.values())
163
164 - def transformation(self):
165 total=self.sum() 166 if total!=0: 167 for k in self.data: 168 self.data[k]=self.data[k]/float(total) 169 return self
170
171 - def weighting(self):
172 for k in self.data: 173 if self.data[k]!=0: 174 self.data[k]=-math.log(self.data[k]) 175 return self
176
177 - def smprint(self,name='your_name_here'):
178 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols)) 179 matrix+=' %s\n' % ' '.join(self.symbols) 180 for x in self.symbols: 181 matrix+='[%s]'.ljust(8) % x 182 for y in self.symbols: 183 if x==y: 184 matrix+=' . ' 185 else: 186 if x>y: 187 x1,y1=y,x 188 else: 189 x1,y1=x,y 190 if self.data[x1+y1]==0: 191 matrix+='inf. ' 192 else: 193 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1]) 194 matrix+='\n' 195 matrix+=';\n' 196 return matrix
197
198 -def safename(name,mrbayes=False):
199 """Return a taxon identifier according to NEXUS standard. 200 201 Wrap quotes around names with punctuation or whitespace, and double 202 single quotes. 203 204 mrbayes=True: write names without quotes, whitespace or punctuation 205 for the mrbayes software package. 206 """ 207 if mrbayes: 208 safe=name.replace(' ','_') 209 safe=''.join([c for c in safe if c in MRBAYESSAFE]) 210 else: 211 safe=name.replace("'","''") 212 if set(safe).intersection(set(WHITESPACE+PUNCTUATION)): 213 safe="'"+safe+"'" 214 return safe
215
216 -def quotestrip(word):
217 """Remove quotes and/or double quotes around identifiers.""" 218 if not word: 219 return None 220 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')): 221 word=word[1:-1] 222 return word
223
224 -def get_start_end(sequence, skiplist=['-','?']):
225 """Return position of first and last character which is not in skiplist. 226 227 Skiplist defaults to ['-','?']).""" 228 229 length=len(sequence) 230 if length==0: 231 return None,None 232 end=length-1 233 while end>=0 and (sequence[end] in skiplist): 234 end-=1 235 start=0 236 while start<length and (sequence[start] in skiplist): 237 start+=1 238 if start==length and end==-1: # empty sequence 239 return -1,-1 240 else: 241 return start,end
242
243 -def _sort_keys_by_values(p):
244 """Returns a sorted list of keys of p sorted by values of p.""" 245 startpos=[(p[pn],pn) for pn in p if p[pn]] 246 startpos.sort() 247 return zip(*startpos)[1]
248
249 -def _make_unique(l):
250 """Check that all values in list are unique and return a pruned and sorted list.""" 251 l=list(set(l)) 252 l.sort() 253 return l
254
255 -def _unique_label(previous_labels,label):
256 """Returns a unique name if label is already in previous_labels.""" 257 while label in previous_labels: 258 if label.split('.')[-1].startswith('copy'): 259 label='.'.join(label.split('.')[:-1])+'.copy'+str(eval('0'+label.split('.')[-1][4:])+1) 260 else: 261 label+='.copy' 262 return label
263
264 -def _seqmatrix2strmatrix(matrix):
265 """Converts a Seq-object matrix to a plain sequence-string matrix.""" 266 return dict([(t,matrix[t].tostring()) for t in matrix])
267
268 -def _compact4nexus(orig_list):
269 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) 270 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). 271 """ 272 273 if not orig_list: 274 return '' 275 orig_list=list(set(orig_list)) 276 orig_list.sort() 277 shortlist=[] 278 clist=orig_list[:] 279 clist.append(clist[-1]+.5) # dummy value makes it easier 280 while len(clist)>1: 281 step=1 282 for i,x in enumerate(clist): 283 if x==clist[0]+i*step: # are we still in the right step? 284 continue 285 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]: 286 # second element, and possibly at least 3 elements to link, 287 # and the next one is in the right step 288 step=x-clist[0] 289 else: # pattern broke, add all values before current position to new list 290 sub=clist[:i] 291 if len(sub)==1: 292 shortlist.append(str(sub[0]+1)) 293 else: 294 if step==1: 295 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1)) 296 else: 297 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step)) 298 clist=clist[i:] 299 break 300 return ' '.join(shortlist)
301
302 -def combine(matrices):
303 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance. 304 305 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...] 306 Character sets, character partitions and taxon sets are prefixed, readjusted 307 and present in the combined matrix. 308 """ 309 310 if not matrices: 311 return None 312 name=matrices[0][0] 313 combined=copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix 314 mixed_datatypes=(len(set([n[1].datatype for n in matrices]))>1) 315 if mixed_datatypes: 316 combined.datatype='None' # dealing with mixed matrices is application specific. You take care of that yourself! 317 # raise NexusError('Matrices must be of same datatype') 318 combined.charlabels=None 319 combined.statelabels=None 320 combined.interleave=False 321 combined.translate=None 322 323 # rename taxon sets and character sets and name them with prefix 324 for cn,cs in combined.charsets.items(): 325 combined.charsets['%s.%s' % (name,cn)]=cs 326 del combined.charsets[cn] 327 for tn,ts in combined.taxsets.items(): 328 combined.taxsets['%s.%s' % (name,tn)]=ts 329 del combined.taxsets[tn] 330 # previous partitions usually don't make much sense in combined matrix 331 # just initiate one new partition parted by single matrices 332 combined.charpartitions={'combined':{name:range(combined.nchar)}} 333 for n,m in matrices[1:]: # add all other matrices 334 both=[t for t in combined.taxlabels if t in m.taxlabels] 335 combined_only=[t for t in combined.taxlabels if t not in both] 336 m_only=[t for t in m.taxlabels if t not in both] 337 for t in both: 338 # concatenate sequences and unify gap and missing character symbols 339 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) 340 # replace date of missing taxa with symbol for missing data 341 for t in combined_only: 342 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet) 343 for t in m_only: 344 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\ 345 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet) 346 combined.taxlabels.extend(m_only) # new taxon list 347 for cn,cs in m.charsets.items(): # adjust character sets for new matrix 348 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs] 349 if m.taxsets: 350 if not combined.taxsets: 351 combined.taxsets={} 352 combined.taxsets.update(dict([('%s.%s' % (n,tn),ts) for tn,ts in m.taxsets.items()])) # update taxon sets 353 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar) # update new charpartition 354 # update charlabels 355 if m.charlabels: 356 if not combined.charlabels: 357 combined.charlabels={} 358 combined.charlabels.update(dict([(combined.nchar+i,label) for (i,label) in m.charlabels.items()])) 359 combined.nchar+=m.nchar # update nchar and ntax 360 combined.ntax+=len(m_only) 361 362 # some prefer partitions, some charsets: 363 # make separate charset for ecah initial dataset 364 for c in combined.charpartitions['combined']: 365 combined.charsets[c]=combined.charpartitions['combined'][c] 366 367 return combined
368
369 -def _kill_comments_and_break_lines(text):
370 """Delete []-delimited comments out of a file and break into lines separated by ';'. 371 372 stripped_text=_kill_comments_and_break_lines(text): 373 Nested and multiline comments are allowed. [ and ] symbols within single 374 or double quotes are ignored, newline ends a quote, all symbols with quotes are 375 treated the same (thus not quoting inside comments like [this character ']' ends a comment]) 376 Special [&...] and [\...] comments remain untouched, if not inside standard comment. 377 Quotes inside special [& and [\ are treated as normal characters, 378 but no nesting inside these special comments allowed (like [& [\ ]]). 379 ';' ist deleted from end of line. 380 381 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus 382 """ 383 contents=CharBuffer(text) 384 newtext=[] 385 newline=[] 386 quotelevel='' 387 speciallevel=False 388 commlevel=0 389 while True: 390 #plain=contents.next_until(["'",'"','[',']','\n',';']) # search for next special character 391 #if not plain: 392 # newline.append(contents.rest) # not found, just add the rest 393 # break 394 #newline.append(plain) # add intermediate text 395 t=contents.next() # and get special character 396 if t is None: 397 break 398 if t==quotelevel and not (commlevel or speciallevel): # matching quote ends quotation 399 quotelevel='' 400 elif not quotelevel and not (commlevel or speciallevel) and (t=='"' or t=="'"): # single or double quote starts quotation 401 quotelevel=t 402 elif not quotelevel and t=='[': # opening bracket outside a quote 403 if contents.peek() in SPECIALCOMMENTS and commlevel==0 and not speciallevel: 404 speciallevel=True 405 else: 406 commlevel+=1 407 elif not quotelevel and t==']': # closing bracket ioutside a quote 408 if speciallevel: 409 speciallevel=False 410 else: 411 commlevel-=1 412 if commlevel<0: 413 raise NexusError('Nexus formatting error: unmatched ]') 414 continue 415 if commlevel==0: # copy if we're not in comment 416 if t==';' and not quotelevel: 417 newtext.append(''.join(newline)) 418 newline=[] 419 else: 420 newline.append(t) 421 #level of comments should be 0 at the end of the file 422 if newline: 423 newtext.append('\n'.join(newline)) 424 if commlevel>0: 425 raise NexusError('Nexus formatting error: unmatched [') 426 return newtext
427 428
429 -def _adjust_lines(lines):
430 """Adjust linebreaks to match ';', strip leading/trailing whitespace. 431 432 list_of_commandlines=_adjust_lines(input_text) 433 Lines are adjusted so that no linebreaks occur within a commandline 434 (except matrix command line) 435 """ 436 formatted_lines=[] 437 for l in lines: 438 #Convert line endings 439 l=l.replace('\r\n','\n').replace('\r','\n').strip() 440 if l.lower().startswith('matrix'): 441 formatted_lines.append(l) 442 else: 443 l=l.replace('\n',' ') 444 if l: 445 formatted_lines.append(l) 446 return formatted_lines
447
448 -def _replace_parenthesized_ambigs(seq,rev_ambig_values):
449 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code.""" 450 451 opening=seq.find('(') 452 while opening>-1: 453 closing=seq.find(')') 454 if closing<0: 455 raise NexusError('Missing closing parenthesis in: '+seq) 456 elif closing<opening: 457 raise NexusError('Missing opening parenthesis in: '+seq) 458 ambig=[x for x in seq[opening+1:closing]] 459 ambig.sort() 460 ambig=''.join(ambig) 461 ambig_code=rev_ambig_values[ambig.upper()] 462 if ambig!=ambig.upper(): 463 ambig_code=ambig_code.lower() 464 seq=seq[:opening]+ambig_code+seq[closing+1:] 465 opening=seq.find('(') 466 return seq
467
468 -class Commandline:
469 """Represent a commandline as command and options.""" 470
471 - def __init__(self, line, title):
472 self.options={} 473 options=[] 474 self.command=None 475 try: 476 #Assume matrix (all other command lines have been stripped of \n) 477 self.command, options = line.strip().split('\n', 1) 478 except ValueError: #Not matrix 479 #self.command,options=line.split(' ',1) #no: could be tab or spaces (translate...) 480 self.command=line.split()[0] 481 options=' '.join(line.split()[1:]) 482 self.command = self.command.strip().lower() 483 if self.command in SPECIAL_COMMANDS: # special command that need newlines and order of options preserved 484 self.options=options.strip() 485 else: 486 if len(options) > 0: 487 try: 488 options = options.replace('=', ' = ').split() 489 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))] 490 indices = [] 491 for sl in valued_indices: 492 indices.extend(sl) 493 token_indices = [n for n in range(len(options)) if n not in indices] 494 for opt in valued_indices: 495 #self.options[options[opt[0]].lower()] = options[opt[2]].lower() 496 self.options[options[opt[0]].lower()] = options[opt[2]] 497 for token in token_indices: 498 self.options[options[token].lower()] = None 499 except ValueError: 500 raise NexusError('Incorrect formatting in line: %s' % line)
501
502 -class Block:
503 """Represent a NEXUS block with block name and list of commandlines."""
504 - def __init__(self,title=None):
505 self.title=title 506 self.commandlines=[]
507
508 -class Nexus(object):
509 510 __slots__=['original_taxon_order','__dict__'] 511
512 - def __init__(self, input=None):
513 self.ntax=0 # number of taxa 514 self.nchar=0 # number of characters 515 self.unaltered_taxlabels=[] # taxlabels as the appear in the input file (incl. duplicates, etc.) 516 self.taxlabels=[] # labels for taxa, ordered by their id 517 self.charlabels=None # ... and for characters 518 self.statelabels=None # ... and for states 519 self.datatype='dna' # (standard), dna, rna, nucleotide, protein 520 self.respectcase=False # case sensitivity 521 self.missing='?' # symbol for missing characters 522 self.gap='-' # symbol for gap 523 self.symbols=None # set of symbols 524 self.equate=None # set of symbol synonyms 525 self.matchchar=None # matching char for matrix representation 526 self.labels=None # left, right, no 527 self.transpose=False # whether matrix is transposed 528 self.interleave=False # whether matrix is interleaved 529 self.tokens=False # unsupported 530 self.eliminate=None # unsupported 531 self.matrix=None # ... 532 self.unknown_blocks=[] # blocks we don't care about 533 self.taxsets={} 534 self.charsets={} 535 self.charpartitions={} 536 self.taxpartitions={} 537 self.trees=[] # list of Trees (instances of Tree class) 538 self.translate=None # Dict to translate taxon <-> taxon numbers 539 self.structured=[] # structured input representation 540 self.set={} # dict of the set command to set various options 541 self.options={} # dict of the options command in the data block 542 self.codonposset=None # name of the charpartition that defines codon positions 543 544 # some defaults 545 self.options['gapmode']='missing' 546 547 if input: 548 self.read(input) 549 else: 550 self.read(DEFAULTNEXUS)
551
552 - def get_original_taxon_order(self):
553 """Included for backwards compatibility (DEPRECATED).""" 554 return self.taxlabels
555 - def set_original_taxon_order(self,value):
556 """Included for backwards compatibility (DEPRECATED).""" 557 self.taxlabels=value
558 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order) 559
560 - def read(self,input):
561 """Read and parse NEXUS imput (a filename, file-handle, or string).""" 562 563 # 1. Assume we have the name of a file in the execution dir 564 # Note we need to add parsing of the path to dir/filename 565 try: 566 file_contents = open(os.path.expanduser(input),'rU').read() 567 self.filename=input 568 except (TypeError,IOError,AttributeError): 569 #2 Assume we have a string from a fh.read() 570 if isinstance(input, str): 571 file_contents = input 572 self.filename='input_string' 573 #3 Assume we have a file object 574 elif hasattr(input,'read'): # file objects or StringIO objects 575 file_contents=input.read() 576 if hasattr(input,"name") and input.name: 577 self.filename=input.name 578 else: 579 self.filename='Unknown_nexus_file' 580 else: 581 print input.strip()[:50] 582 raise NexusError('Unrecognized input: %s ...' % input[:100]) 583 file_contents=file_contents.strip() 584 if file_contents.startswith('#NEXUS'): 585 file_contents=file_contents[6:] 586 if C: 587 decommented=cnexus.scanfile(file_contents) 588 #check for unmatched parentheses 589 if decommented=='[' or decommented==']': 590 raise NexusError('Unmatched %s' % decommented) 591 # cnexus can't return lists, so in analogy we separate commandlines with chr(7) 592 # (a character that shoudn't be part of a nexus file under normal circumstances) 593 commandlines=_adjust_lines(decommented.split(chr(7))) 594 else: 595 commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents)) 596 # get rid of stupid 'NEXUS token - in merged treefiles, this might appear multiple times' 597 for i,cl in enumerate(commandlines): 598 try: 599 if cl[:6].upper()=='#NEXUS': 600 commandlines[i]=cl[6:].strip() 601 except: 602 pass 603 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands 604 nexus_block_gen = self._get_nexus_block(commandlines) 605 while 1: 606 try: 607 title, contents = nexus_block_gen.next() 608 except StopIteration: 609 break 610 if title in KNOWN_NEXUS_BLOCKS: 611 self._parse_nexus_block(title, contents) 612 else: 613 self._unknown_nexus_block(title, contents)
614
615 - def _get_nexus_block(self,file_contents):
616 """Generator for looping through Nexus blocks.""" 617 inblock=False 618 blocklines=[] 619 while file_contents: 620 cl=file_contents.pop(0) 621 if cl.lower().startswith('begin'): 622 if not inblock: 623 inblock=True 624 title=cl.split()[1].lower() 625 else: 626 raise NexusError('Illegal block nesting in block %s' % title) 627 elif cl.lower().startswith('end'): 628 if inblock: 629 inblock=False 630 yield title,blocklines 631 blocklines=[] 632 else: 633 raise NexusError('Unmatched \'end\'.') 634 elif inblock: 635 blocklines.append(cl)
636
637 - def _unknown_nexus_block(self,title, contents):
638 block = Block() 639 block.commandlines.append(contents) 640 block.title = title 641 self.unknown_blocks.append(block)
642
643 - def _parse_nexus_block(self,title, contents):
644 """Parse a known Nexus Block (PRIVATE).""" 645 # attached the structered block representation 646 self._apply_block_structure(title, contents) 647 #now check for taxa,characters,data blocks. If this stuff is defined more than once 648 #the later occurences will override the previous ones. 649 block=self.structured[-1] 650 for line in block.commandlines: 651 try: 652 getattr(self,'_'+line.command)(line.options) 653 except AttributeError: 654 raise 655 raise NexusError('Unknown command: %s ' % line.command)
656
657 - def _title(self,options):
658 pass
659
660 - def _dimensions(self,options):
661 if 'ntax' in options: 662 self.ntax=eval(options['ntax']) 663 if 'nchar' in options: 664 self.nchar=eval(options['nchar'])
665
666 - def _format(self,options):
667 # print options 668 # we first need to test respectcase, then symbols (which depends on respectcase) 669 # then datatype (which, if standard, depends on symbols and respectcase in order to generate 670 # dicts for ambiguous values and alphabet 671 if 'respectcase' in options: 672 self.respectcase=True 673 # adjust symbols to for respectcase 674 if 'symbols' in options: 675 self.symbols=options['symbols'] 676 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\ 677 (self.symbold.startswith("'") and self.symbols.endswith("'")): 678 self.symbols=self.symbols[1:-1].replace(' ','') 679 if not self.respectcase: 680 self.symbols=self.symbols.lower()+self.symbols.upper() 681 self.symbols=list(set(self.symbols)) 682 if 'datatype' in options: 683 self.datatype=options['datatype'].lower() 684 if self.datatype=='dna' or self.datatype=='nucleotide': 685 self.alphabet=copy.deepcopy(IUPAC.ambiguous_dna) 686 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_dna_values) 687 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_dna_letters) 688 elif self.datatype=='rna': 689 self.alphabet=copy.deepcopy(IUPAC.ambiguous_rna) 690 self.ambiguous_values=copy.deepcopy(IUPACData.ambiguous_rna_values) 691 self.unambiguous_letters=copy.deepcopy(IUPACData.unambiguous_rna_letters) 692 elif self.datatype=='protein': 693 self.alphabet=copy.deepcopy(IUPAC.protein) 694 self.ambiguous_values={'B':'DN','Z':'EQ','X':copy.deepcopy(IUPACData.protein_letters)} # that's how PAUP handles it 695 self.unambiguous_letters=copy.deepcopy(IUPACData.protein_letters)+'*' # stop-codon 696 elif self.datatype=='standard': 697 raise NexusError('Datatype standard is not yet supported.') 698 #self.alphabet=None 699 #self.ambiguous_values={} 700 #if not self.symbols: 701 # self.symbols='01' # if nothing else defined, then 0 and 1 are the default states 702 #self.unambiguous_letters=self.symbols 703 else: 704 raise NexusError('Unsupported datatype: '+self.datatype) 705 self.valid_characters=''.join(self.ambiguous_values.keys())+self.unambiguous_letters 706 if not self.respectcase: 707 self.valid_characters=self.valid_characters.lower()+self.valid_characters.upper() 708 #we have to sort the reverse ambig coding dict key characters: 709 #to be sure that it's 'ACGT':'N' and not 'GTCA':'N' 710 rev=dict([(i[1],i[0]) for i in self.ambiguous_values.items() if i[0]!='X']) 711 self.rev_ambiguous_values={} 712 for (k,v) in rev.items(): 713 key=[c for c in k] 714 key.sort() 715 self.rev_ambiguous_values[''.join(key)]=v 716 #overwrite symbols for datype rna,dna,nucleotide 717 if self.datatype in ['dna','rna','nucleotide']: 718 self.symbols=self.alphabet.letters 719 if self.missing not in self.ambiguous_values: 720 self.ambiguous_values[self.missing]=self.unambiguous_letters+self.gap 721 self.ambiguous_values[self.gap]=self.gap 722 elif self.datatype=='standard': 723 if not self.symbols: 724 self.symbols=['1','0'] 725 if 'missing' in options: 726 self.missing=options['missing'][0] 727 if 'gap' in options: 728 self.gap=options['gap'][0] 729 if 'equate' in options: 730 self.equate=options['equate'] 731 if 'matchchar' in options: 732 self.matchchar=options['matchchar'][0] 733 if 'labels' in options: 734 self.labels=options['labels'] 735 if 'transpose' in options: 736 raise NexusError('TRANSPOSE is not supported!') 737 self.transpose=True 738 if 'interleave' in options: 739 if options['interleave']==None or options['interleave'].lower()=='yes': 740 self.interleave=True 741 if 'tokens' in options: 742 self.tokens=True 743 if 'notokens' in options: 744 self.tokens=False
745 746
747 - def _set(self,options):
748 self.set=options;
749
750 - def _options(self,options):
751 self.options=options;
752
753 - def _eliminate(self,options):
754 self.eliminate=options
755
756 - def _taxlabels(self,options):
757 """Get taxon labels (PRIVATE). 758 759 As the taxon names are already in the matrix, this is superfluous 760 except for transpose matrices, which are currently unsupported anyway. 761 Thus, we ignore the taxlabels command to make handling of duplicate 762 taxon names easier. 763 """ 764 pass
765 #self.taxlabels=[] 766 #opts=CharBuffer(options) 767 #while True: 768 # taxon=quotestrip(opts.next_word()) 769 # if not taxon: 770 # break 771 # self.taxlabels.append(taxon) 772
773 - def _check_taxlabels(self,taxon):
774 """Check for presence of taxon in self.taxlabels.""" 775 # According to NEXUS standard, underscores shall be treated as spaces..., 776 # so checking for identity is more difficult 777 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels]) 778 nexid=taxon.replace(' ','_') 779 return nextaxa.get(nexid)
780
781 - def _charlabels(self,options):
782 self.charlabels={} 783 opts=CharBuffer(options) 784 while True: 785 try: 786 # get id and state 787 w=opts.next_word() 788 if w is None: # McClade saves and reads charlabel-lists with terminal comma?! 789 break 790 identifier=self._resolve(w,set_type=CHARSET) 791 state=quotestrip(opts.next_word()) 792 self.charlabels[identifier]=state 793 # check for comma or end of command 794 c=opts.next_nonwhitespace() 795 if c is None: 796 break 797 elif c!=',': 798 raise NexusError('Missing \',\' in line %s.' % options) 799 except NexusError: 800 raise 801 except: 802 raise NexusError('Format error in line %s.' % options)
803
804 - def _charstatelabels(self,options):
805 # warning: charstatelabels supports only charlabels-syntax! 806 self._charlabels(options)
807
808 - def _statelabels(self,options):
809 #self.charlabels=options 810 #print 'Command statelabels is not supported and will be ignored.' 811 pass
812
813 - def _matrix(self,options):
814 if not self.ntax or not self.nchar: 815 raise NexusError('Dimensions must be specified before matrix!') 816 self.matrix={} 817 taxcount=0 818 first_matrix_block=True 819 820 #eliminate empty lines and leading/trailing whitespace 821 lines=[l.strip() for l in options.split('\n') if l.strip()!=''] 822 lineiter=iter(lines) 823 while 1: 824 try: 825 l=lineiter.next() 826 except StopIteration: 827 if taxcount<self.ntax: 828 raise NexusError('Not enough taxa in matrix.') 829 elif taxcount>self.ntax: 830 raise NexusError('Too many taxa in matrix.') 831 else: 832 break 833 # count the taxa and check for interleaved matrix 834 taxcount+=1 835 ##print taxcount 836 if taxcount>self.ntax: 837 if not self.interleave: 838 raise NexusError('Too many taxa in matrix - should matrix be interleaved?') 839 else: 840 taxcount=1 841 first_matrix_block=False 842 #get taxon name and sequence 843 linechars=CharBuffer(l) 844 id=quotestrip(linechars.next_word()) 845 l=linechars.rest().strip() 846 chars='' 847 if self.interleave: 848 #interleaved matrix 849 #print 'In interleave' 850 if l: 851 chars=''.join(l.split()) 852 else: 853 chars=''.join(lineiter.next().split()) 854 else: 855 #non-interleaved matrix 856 chars=''.join(l.split()) 857 while len(chars)<self.nchar: 858 l=lineiter.next() 859 chars+=''.join(l.split()) 860 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet) 861 #first taxon has the reference sequence if matchhar is used 862 if taxcount==1: 863 refseq=iupac_seq 864 else: 865 if self.matchchar: 866 while 1: 867 p=iupac_seq.tostring().find(self.matchchar) 868 if p==-1: 869 break 870 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet) 871 #check for invalid characters 872 for i,c in enumerate(iupac_seq.tostring()): 873 if c not in self.valid_characters and c!=self.gap and c!=self.missing: 874 raise NexusError('Taxon %s: Illegal character %s in line: %s (check dimensions / interleaving)'\ 875 % (id,c,l[i-10:i+10])) 876 #add sequence to matrix 877 if first_matrix_block: 878 self.unaltered_taxlabels.append(id) 879 id=_unique_label(self.matrix.keys(),id) 880 self.matrix[id]=iupac_seq 881 self.taxlabels.append(id) 882 else: 883 # taxon names need to be in the same order in each interleaved block 884 id=_unique_label(self.taxlabels[:taxcount-1],id) 885 taxon_present=self._check_taxlabels(id) 886 if taxon_present: 887 self.matrix[taxon_present]+=iupac_seq 888 else: 889 raise NexusError('Taxon %s not in first block of interleaved matrix. Check matrix dimensions and interleave.' % id) 890 #check all sequences for length according to nchar 891 for taxon in self.matrix: 892 if len(self.matrix[taxon])!=self.nchar: 893 raise NexusError('Matrx Nchar %d does not match data length (%d) for taxon %s' \ 894 % (self.nchar, len(self.matrix[taxon]),taxon)) 895 #check that taxlabels is identical with matrix.keys. If not, it's a problem 896 matrixkeys=self.matrix.keys() 897 matrixkeys.sort() 898 taxlabelssort=self.taxlabels[:] 899 taxlabelssort.sort() 900 assert matrixkeys==taxlabelssort,"ERROR: TAXLABELS must be identical with MATRIX. Please Report this as a bug, and send in data file."
901
902 - def _translate(self,options):
903 self.translate={} 904 opts=CharBuffer(options) 905 while True: 906 try: 907 # get id and state 908 identifier=int(opts.next_word()) 909 label=quotestrip(opts.next_word()) 910 self.translate[identifier]=label 911 # check for comma or end of command 912 c=opts.next_nonwhitespace() 913 if c is None: 914 break 915 elif c!=',': 916 raise NexusError('Missing \',\' in line %s.' % options) 917 except NexusError: 918 raise 919 except: 920 raise NexusError('Format error in line %s.' % options)
921
922 - def _utree(self,options):
923 """Some software (clustalx) uses 'utree' to denote an unrooted tree.""" 924 self._tree(options)
925
926 - def _tree(self,options):
927 opts=CharBuffer(options) 928 name=opts.next_word() 929 if opts.next_nonwhitespace()!='=': 930 raise NexusError('Syntax error in tree description: %s' \ 931 % options[:50]) 932 rooted=False 933 weight=1.0 934 while opts.peek_nonwhitespace()=='[': 935 open=opts.next_nonwhitespace() 936 symbol=opts.next() 937 if symbol!='&': 938 raise NexusError('Illegal special comment [%s...] in tree description: %s' \ 939 % (symbol, options[:50])) 940 special=opts.next() 941 value=opts.next_until(']') 942 closing=opts.next() 943 if special=='R': 944 rooted=True 945 elif special=='U': 946 rooted=False 947 elif special=='W': 948 weight=float(value) 949 tree=Tree(name=name,weight=weight,rooted=rooted,tree=opts.rest().strip()) 950 # if there's an active translation table, translate 951 if self.translate: 952 for n in tree.get_terminals(): 953 try: 954 tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)]) 955 except (ValueError,KeyError): 956 raise NexusError('Unable to substitue %s using \'translate\' data.' \ 957 % tree.node(n).data.taxon) 958 self.trees.append(tree)
959
960 - def _apply_block_structure(self,title,lines):
961 block=Block('') 962 block.title = title 963 for line in lines: 964 block.commandlines.append(Commandline(line, title)) 965 self.structured.append(block)
966
967 - def _taxset(self, options):
968 name,taxa=self._get_indices(options,set_type=TAXSET) 969 self.taxsets[name]=_make_unique(taxa)
970
971 - def _charset(self, options):
972 name,sites=self._get_indices(options,set_type=CHARSET) 973 self.charsets[name]=_make_unique(sites)
974
975 - def _taxpartition(self, options):
976 taxpartition={} 977 quotelevel=False 978 opts=CharBuffer(options) 979 name=self._name_n_vector(opts) 980 if not name: 981 raise NexusError('Formatting error in taxpartition: %s ' % options) 982 # now collect thesubbpartitions and parse them 983 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 984 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words 985 sub='' 986 while True: 987 w=opts.next() 988 if w is None or (w==',' and not quotelevel): 989 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':') 990 taxpartition[subname]=_make_unique(subindices) 991 sub='' 992 if w is None: 993 break 994 else: 995 if w=="'": 996 quotelevel=not quotelevel 997 sub+=w 998 self.taxpartitions[name]=taxpartition
999
1000 - def _codonposset(self,options):
1001 """Read codon positions from a codons block as written from McClade. 1002 1003 Here codonposset is just a fancy name for a character partition with 1004 the name CodonPositions and the partitions N,1,2,3 1005 """ 1006 1007 prev_partitions=self.charpartitions.keys() 1008 self._charpartition(options) 1009 # mcclade calls it CodonPositions, but you never know... 1010 codonname=[n for n in self.charpartitions.keys() if n not in prev_partitions] 1011 if codonname==[] or len(codonname)>1: 1012 raise NexusError('Formatting Error in codonposset: %s ' % options) 1013 else: 1014 self.codonposset=codonname[0]
1015
1016 - def _codeset(self,options):
1017 pass
1018
1019 - def _charpartition(self, options):
1020 charpartition={} 1021 quotelevel=False 1022 opts=CharBuffer(options) 1023 name=self._name_n_vector(opts) 1024 if not name: 1025 raise NexusError('Formatting error in charpartition: %s ' % options) 1026 # now collect thesubbpartitions and parse them 1027 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 1028 sub='' 1029 while True: 1030 w=opts.next() 1031 if w is None or (w==',' and not quotelevel): 1032 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':') 1033 charpartition[subname]=_make_unique(subindices) 1034 sub='' 1035 if w is None: 1036 break 1037 else: 1038 if w=="'": 1039 quotelevel=not quotelevel 1040 sub+=w 1041 self.charpartitions[name]=charpartition
1042
1043 - def _get_indices(self,options,set_type=CHARSET,separator='='):
1044 """Parse the taxset/charset specification (PRIVATE). 1045 1046 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3' 1047 --> [0,1,2,3,4,'dog','cat',9,12,15,18] 1048 """ 1049 opts=CharBuffer(options) 1050 name=self._name_n_vector(opts,separator=separator) 1051 indices=self._parse_list(opts,set_type=set_type) 1052 if indices is None: 1053 raise NexusError('Formatting error in line: %s ' % options) 1054 return name,indices
1055
1056 - def _name_n_vector(self,opts,separator='='):
1057 """Extract name and check that it's not in vector format.""" 1058 rest=opts.rest() 1059 name=opts.next_word() 1060 # we ignore * before names 1061 if name=='*': 1062 name=opts.next_word() 1063 if not name: 1064 raise NexusError('Formatting error in line: %s ' % rest) 1065 name=quotestrip(name) 1066 if opts.peek_nonwhitespace=='(': 1067 open=opts.next_nonwhitespace() 1068 qualifier=open.next_word() 1069 close=opts.next_nonwhitespace() 1070 if qualifier.lower()=='vector': 1071 raise NexusError('Unsupported VECTOR format in line %s' \ 1072 % (options)) 1073 elif qualifier.lower()!='standard': 1074 raise NexusError('Unknown qualifier %s in line %s' \ 1075 % (qualifier,options)) 1076 if opts.next_nonwhitespace()!=separator: 1077 raise NexusError('Formatting error in line: %s ' % rest) 1078 return name
1079
1080 - def _parse_list(self,options_buffer,set_type):
1081 """Parse a NEXUS list (PRIVATE). 1082 1083 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21], 1084 (assuming dog is taxon no. 17 and cat is taxon no. 21). 1085 """ 1086 plain_list=[] 1087 if options_buffer.peek_nonwhitespace(): 1088 try: # capture all possible exceptions and treat them as formatting erros, if they are not NexusError 1089 while True: 1090 identifier=options_buffer.next_word() # next list element 1091 if not identifier: # end of list? 1092 break 1093 start=self._resolve(identifier,set_type=set_type) 1094 if options_buffer.peek_nonwhitespace()=='-': # followd by - 1095 end=start 1096 step=1 1097 # get hyphen and end of range 1098 hyphen=options_buffer.next_nonwhitespace() 1099 end=self._resolve(options_buffer.next_word(),set_type=set_type) 1100 if set_type==CHARSET: 1101 if options_buffer.peek_nonwhitespace()=='\\': # followd by \ 1102 backslash=options_buffer.next_nonwhitespace() 1103 step=int(options_buffer.next_word()) # get backslash and step 1104 plain_list.extend(range(start,end+1,step)) 1105 else: 1106 if type(start)==list or type(end)==list: 1107 raise NexusError('Name if character sets not allowed in range definition: %s'\ 1108 % identifier) 1109 start=self.taxlabels.index(start) 1110 end=self.taxlabels.index(end) 1111 taxrange=self.taxlabels[start:end+1] 1112 plain_list.extend(taxrange) 1113 else: 1114 if type(start)==list: # start was the name of charset or taxset 1115 plain_list.extend(start) 1116 else: # start was an ordinary identifier 1117 plain_list.append(start) 1118 except NexusError: 1119 raise 1120 except: 1121 return None 1122 return plain_list
1123
1124 - def _resolve(self,identifier,set_type=None):
1125 """Translate identifier in list into character/taxon index. 1126 1127 Characters (which are referred to by their index in Nexus.py): 1128 Plain numbers are returned minus 1 (Nexus indices to python indices) 1129 Text identifiers are translaterd into their indices (if plain character indentifiers), 1130 the first hit in charlabels is returned (charlabels don't need to be unique) 1131 or the range of indices is returned (if names of character sets). 1132 Taxa (which are referred to by their unique name in Nexus.py): 1133 Plain numbers are translated in their taxon name, underscores and spaces are considered equal. 1134 Names are returned unchanged (if plain taxon identifiers), or the names in 1135 the corresponding taxon set is returned 1136 """ 1137 identifier=quotestrip(identifier) 1138 if not set_type: 1139 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.') 1140 if set_type==CHARSET: 1141 try: 1142 n=int(identifier) 1143 except ValueError: 1144 if self.charlabels and identifier in self.charlabels.values(): 1145 for k in self.charlabels: 1146 if self.charlabels[k]==identifier: 1147 return k 1148 elif self.charsets and identifier in self.charsets: 1149 return self.charsets[identifier] 1150 else: 1151 raise NexusError('Unknown character identifier: %s' \ 1152 % identifier) 1153 else: 1154 if n<=self.nchar: 1155 return n-1 1156 else: 1157 raise NexusError('Illegal character identifier: %d>nchar (=%d).' \ 1158 % (identifier,self.nchar)) 1159 elif set_type==TAXSET: 1160 try: 1161 n=int(identifier) 1162 except ValueError: 1163 taxlabels_id=self._check_taxlabels(identifier) 1164 if taxlabels_id: 1165 return taxlabels_id 1166 elif self.taxsets and identifier in self.taxsets: 1167 return self.taxsets[identifier] 1168 else: 1169 raise NexusError('Unknown taxon identifier: %s' \ 1170 % identifier) 1171 else: 1172 if n>0 and n<=self.ntax: 1173 return self.taxlabels[n-1] 1174 else: 1175 raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' \ 1176 % (identifier,self.ntax)) 1177 else: 1178 raise NexusError('Unknown set specification: %s.'% set_type)
1179
1180 - def _stateset(self, options):
1181 #Not implemented 1182 pass
1183
1184 - def _changeset(self, options):
1185 #Not implemented 1186 pass
1187
1188 - def _treeset(self, options):
1189 #Not implemented 1190 pass
1191
1192 - def _treepartition(self, options):
1193 #Not implemented 1194 pass
1195
1196 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False, 1197 exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
1198 """Writes a nexus file for each partition in charpartition. 1199 1200 Only non-excluded characters and non-deleted taxa are included, 1201 just the data block is written. 1202 """ 1203 1204 if not matrix: 1205 matrix=self.matrix 1206 if not matrix: 1207 return 1208 if not filename: 1209 filename=self.filename 1210 if charpartition: 1211 pfilenames={} 1212 for p in charpartition: 1213 total_exclude=[]+exclude 1214 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]]) 1215 total_exclude=_make_unique(total_exclude) 1216 pcomment=comment+'\nPartition: '+p+'\n' 1217 dot=filename.rfind('.') 1218 if dot>0: 1219 pfilename=filename[:dot]+'_'+p+'.data' 1220 else: 1221 pfilename=filename+'_'+p 1222 pfilenames[p]=pfilename 1223 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize, 1224 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False, 1225 mrbayes=mrbayes) 1226 return pfilenames 1227 else: 1228 fn=self.filename+'.data' 1229 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave, 1230 exclude=exclude,delete=delete,comment=comment,append_sets=False, 1231 mrbayes=mrbayes) 1232 return fn
1233
1234 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\ 1235 blocksize=None, interleave=False, interleave_by_partition=False,\ 1236 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False,\ 1237 codons_block=True):
1238 """Writes a nexus file with data and sets block to a file or handle. 1239 1240 Character sets and partitions are appended by default, and are 1241 adjusted according to excluded characters (i.e. character sets 1242 still point to the same sites (not necessarily same positions), 1243 without including the deleted characters. 1244 1245 filename - Either a filename as a string (which will be opened, 1246 written to and closed), or a handle object (which will 1247 be written to but NOT closed). 1248 omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the 1249 start of the file is ommited. 1250 1251 Returns the filename/handle used to write the data. 1252 """ 1253 if not matrix: 1254 matrix=self.matrix 1255 if not matrix: 1256 return 1257 if not filename: 1258 filename=self.filename 1259 if [t for t in delete if not self._check_taxlabels(t)]: 1260 raise NexusError('Unknown taxa: %s' \ 1261 % ', '.join(set(delete).difference(set(self.taxlabels)))) 1262 if interleave_by_partition: 1263 if not interleave_by_partition in self.charpartitions: 1264 raise NexusError('Unknown partition: '+interleave_by_partition) 1265 else: 1266 partition=self.charpartitions[interleave_by_partition] 1267 # we need to sort the partition names by starting position before we exclude characters 1268 names=_sort_keys_by_values(partition) 1269 newpartition={} 1270 for p in partition: 1271 newpartition[p]=[c for c in partition[p] if c not in exclude] 1272 # how many taxa and how many characters are left? 1273 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete] 1274 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete)) 1275 ntax_adjusted=len(undelete) 1276 nchar_adjusted=len(cropped_matrix[undelete[0]]) 1277 if not undelete or (undelete and undelete[0]==''): 1278 return 1279 if isinstance(filename,str): 1280 try: 1281 fh=open(filename,'w') 1282 except IOError: 1283 raise NexusError('Could not open %s for writing.' % filename) 1284 elif hasattr(file, "write"): 1285 fh=filename 1286 else: 1287 raise ValueError("Neither a filename nor a handle was supplied") 1288 if not omit_NEXUS: 1289 fh.write('#NEXUS\n') 1290 if comment: 1291 fh.write('['+comment+']\n') 1292 fh.write('begin data;\n') 1293 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted)) 1294 fh.write('\tformat datatype='+self.datatype) 1295 if self.respectcase: 1296 fh.write(' respectcase') 1297 if self.missing: 1298 fh.write(' missing='+self.missing) 1299 if self.gap: 1300 fh.write(' gap='+self.gap) 1301 if self.matchchar: 1302 fh.write(' matchchar='+self.matchchar) 1303 if self.labels: 1304 fh.write(' labels='+self.labels) 1305 if self.equate: 1306 fh.write(' equate='+self.equate) 1307 if interleave or interleave_by_partition: 1308 fh.write(' interleave') 1309 fh.write(';\n') 1310 #if self.taxlabels: 1311 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n') 1312 if self.charlabels: 1313 newcharlabels=self._adjust_charlabels(exclude=exclude) 1314 clkeys=newcharlabels.keys() 1315 clkeys.sort() 1316 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n') 1317 fh.write('matrix\n') 1318 if not blocksize: 1319 if interleave: 1320 blocksize=70 1321 else: 1322 blocksize=self.nchar 1323 # delete deleted taxa and ecxclude excluded characters... 1324 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete]) 1325 if interleave_by_partition: 1326 # interleave by partitions, but adjust partitions with regard to excluded characters 1327 seek=0 1328 for p in names: 1329 fh.write('[%s: %s]\n' % (interleave_by_partition,p)) 1330 if len(newpartition[p])>0: 1331 for taxon in undelete: 1332 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1333 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n') 1334 fh.write('\n') 1335 else: 1336 fh.write('[empty]\n\n') 1337 seek+=len(newpartition[p]) 1338 elif interleave: 1339 for seek in range(0,nchar_adjusted,blocksize): 1340 for taxon in undelete: 1341 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1342 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') 1343 fh.write('\n') 1344 else: 1345 for taxon in undelete: 1346 if blocksize<nchar_adjusted: 1347 fh.write(safename(taxon,mrbayes=mrbayes)+'\n') 1348 else: 1349 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1)) 1350 for seek in range(0,nchar_adjusted,blocksize): 1351 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') 1352 fh.write(';\nend;\n') 1353 if append_sets: 1354 if codons_block: 1355 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,include_codons=False)) 1356 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,codons_only=True)) 1357 else: 1358 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes)) 1359 1360 if fh == filename: 1361 #We were given the handle, don't close it. 1362 return filename 1363 else: 1364 #We opened the handle, so we should close it. 1365 fh.close() 1366 return filename
1367
1368 - def append_sets(self,exclude=[],delete=[],mrbayes=False,include_codons=True,codons_only=False):
1369 """Returns a sets block.""" 1370 if not self.charsets and not self.taxsets and not self.charpartitions: 1371 return '' 1372 if codons_only: 1373 setsb=['\nbegin codons'] 1374 else: 1375 setsb=['\nbegin sets'] 1376 # - now if characters have been excluded, the character sets need to be adjusted, 1377 # so that they still point to the right character positions 1378 # calculate a list of offsets: for each deleted character, the following character position 1379 # in the new file will have an additional offset of -1 1380 offset=0 1381 offlist=[] 1382 for c in range(self.nchar): 1383 if c in exclude: 1384 offset+=1 1385 offlist.append(-1) # dummy value as these character positions are excluded 1386 else: 1387 offlist.append(c-offset) 1388 # now adjust each of the character sets 1389 if not codons_only: 1390 for n,ns in self.charsets.items(): 1391 cset=[offlist[c] for c in ns if c not in exclude] 1392 if cset: 1393 setsb.append('charset %s = %s' % (safename(n),_compact4nexus(cset))) 1394 for n,s in self.taxsets.items(): 1395 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete] 1396 if tset: 1397 setsb.append('taxset %s = %s' % (safename(n),' '.join(tset))) 1398 for n,p in self.charpartitions.items(): 1399 if not include_codons and n==CODONPOSITIONS: 1400 continue 1401 elif codons_only and n!=CODONPOSITIONS: 1402 continue 1403 # as characters have been excluded, the partitions must be adjusted 1404 # if a partition is empty, it will be omitted from the charpartition command 1405 # (although paup allows charpartition part=t1:,t2:,t3:1-100) 1406 names=_sort_keys_by_values(p) 1407 newpartition={} 1408 for sn in names: 1409 nsp=[offlist[c] for c in p[sn] if c not in exclude] 1410 if nsp: 1411 newpartition[sn]=nsp 1412 if newpartition: 1413 if include_codons and n==CODONPOSITIONS: 1414 command='codonposset' 1415 else: 1416 command='charpartition' 1417 setsb.append('%s %s = %s' % (command,safename(n),\ 1418 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition]))) 1419 # now write charpartititions, much easier than charpartitions 1420 for n,p in self.taxpartitions.items(): 1421 names=_sort_keys_by_values(p) 1422 newpartition={} 1423 for sn in names: 1424 nsp=[t for t in p[sn] if t not in delete] 1425 if nsp: 1426 newpartition[sn]=nsp 1427 if newpartition: 1428 setsb.append('taxpartition %s = %s' % (safename(n),\ 1429 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition]))) 1430 # add 'end' and return everything 1431 setsb.append('end;\n') 1432 if len(setsb)==2: # begin and end only 1433 return '' 1434 else: 1435 return ';\n'.join(setsb)
1436
1437 - def export_fasta(self, filename=None, width=70):
1438 """Writes matrix into a fasta file: (self, filename=None, width=70).""" 1439 if not filename: 1440 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']: 1441 filename='.'.join(self.filename.split('.')[:-1])+'.fas' 1442 else: 1443 filename=self.filename+'.fas' 1444 fh=open(filename,'w') 1445 for taxon in self.taxlabels: 1446 fh.write('>'+safename(taxon)+'\n') 1447 for i in range(0, len(self.matrix[taxon].tostring()), width): 1448 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n') 1449 fh.close() 1450 return filename
1451
1452 - def export_phylip(self, filename=None):
1453 """Writes matrix into a PHYLIP file: (self, filename=None). 1454 1455 Note that this writes a relaxed PHYLIP format file, where the names 1456 are not truncated, nor checked for invalid characters.""" 1457 if not filename: 1458 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']: 1459 filename='.'.join(self.filename.split('.')[:-1])+'.phy' 1460 else: 1461 filename=self.filename+'.phy' 1462 fh=open(filename,'w') 1463 fh.write('%d %d\n' % (self.ntax,self.nchar)) 1464 for taxon in self.taxlabels: 1465 fh.write('%s %s\n' % (safename(taxon),self.matrix[taxon].tostring())) 1466 fh.close() 1467 return filename
1468
1469 - def constant(self,matrix=None,delete=[],exclude=[]):
1470 """Return a list with all constant characters.""" 1471 if not matrix: 1472 matrix=self.matrix 1473 undelete=[t for t in self.taxlabels if t in matrix and t not in delete] 1474 if not undelete: 1475 return None 1476 elif len(undelete)==1: 1477 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] 1478 # get the first sequence and expand all ambiguous values 1479 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for 1480 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude] 1481 for taxon in undelete[1:]: 1482 newconstant=[] 1483 for site in constant: 1484 #print '%d (paup=%d)' % (site[0],site[0]+1), 1485 seqsite=matrix[taxon][site[0]].upper() 1486 #print seqsite,'checked against',site[1],'\t', 1487 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]: 1488 # missing or same as before -> ok 1489 newconstant.append(site) 1490 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap): 1491 # subset of an ambig or only missing in previous -> take subset 1492 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite))) 1493 elif seqsite in self.ambiguous_values: # is it an ambig: check the intersection with prev. values 1494 intersect=sets.set(self.ambiguous_values[seqsite]).intersection(sets.set(site[1])) 1495 if intersect: 1496 newconstant.append((site[0],''.join(intersect))) 1497 # print 'ok' 1498 #else: 1499 # print 'failed' 1500 #else: 1501 # print 'failed' 1502 constant=newconstant 1503 cpos=[s[0] for s in constant] 1504 return constant
1505 # return [x[0] for x in constant] 1506
1507 - def cstatus(self,site,delete=[],narrow=True):
1508 """Summarize character. 1509 1510 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?) 1511 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -) 1512 """ 1513 undelete=[t for t in self.taxlabels if t not in delete] 1514 if not undelete: 1515 return None 1516 cstatus=[] 1517 for t in undelete: 1518 c=self.matrix[t][site].upper() 1519 if self.options.get('gapmode')=='missing' and c==self.gap: 1520 c=self.missing 1521 if narrow and c==self.missing: 1522 if c not in cstatus: 1523 cstatus.append(c) 1524 else: 1525 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus]) 1526 if self.missing in cstatus and narrow and len(cstatus)>1: 1527 cstatus=[c for c in cstatus if c!=self.missing] 1528 cstatus.sort() 1529 return cstatus
1530
1531 - def weighted_stepmatrix(self,name='your_name_here',exclude=[],delete=[]):
1532 """Calculates a stepmatrix for weighted parsimony. 1533 1534 See Wheeler (1990), Cladistics 6:269-275 and 1535 Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196 1536 """ 1537 m=StepMatrix(self.unambiguous_letters,self.gap) 1538 for site in [s for s in range(self.nchar) if s not in exclude]: 1539 cstatus=self.cstatus(site,delete) 1540 for i,b1 in enumerate(cstatus[:-1]): 1541 for b2 in cstatus[i+1:]: 1542 m.add(b1.upper(),b2.upper(),1) 1543 return m.transformation().weighting().smprint(name=name)
1544
1545 - def crop_matrix(self,matrix=None, delete=[], exclude=[]):
1546 """Return a matrix without deleted taxa and excluded characters.""" 1547 if not matrix: 1548 matrix=self.matrix 1549 if [t for t in delete if not self._check_taxlabels(t)]: 1550 raise NexusError('Unknown taxa: %s' \ 1551 % ', '.join(sets.set(delete).difference(self.taxlabels))) 1552 if exclude!=[]: 1553 undelete=[t for t in self.taxlabels if t in matrix and t not in delete] 1554 if not undelete: 1555 return {} 1556 m=[matrix[k].tostring() for k in undelete] 1557 zipped_m=zip(*m) 1558 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude] 1559 if sitesm==[]: 1560 return dict([(t,Seq('',self.alphabet)) for t in undelete]) 1561 else: 1562 zipped_sitesm=zip(*sitesm) 1563 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)] 1564 return dict(zip(undelete,m)) 1565 else: 1566 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
1567
1568 - def bootstrap(self,matrix=None,delete=[],exclude=[]):
1569 """Return a bootstrapped matrix.""" 1570 if not matrix: 1571 matrix=self.matrix 1572 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq) # remember if Seq objects 1573 cm=self.crop_matrix(delete=delete,exclude=exclude) # crop data out 1574 if not cm: # everything deleted? 1575 return {} 1576 elif len(cm[cm.keys()[0]])==0: # everything excluded? 1577 return cm 1578 undelete=[t for t in self.taxlabels if t in cm] 1579 if seqobjects: 1580 sitesm=zip(*[cm[t].tostring() for t in undelete]) 1581 alphabet=matrix[matrix.keys()[0]].alphabet 1582 else: 1583 sitesm=zip(*[cm[t] for t in undelete]) 1584 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))] 1585 bootstrapseqs=map(''.join,zip(*bootstrapsitesm)) 1586 if seqobjects: 1587 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs] 1588 return dict(zip(undelete,bootstrapseqs))
1589
1590 - def add_sequence(self,name,sequence):
1591 """Adds a sequence (string) to the matrix.""" 1592 1593 if not name: 1594 raise NexusError('New sequence must have a name') 1595 1596 diff=self.nchar-len(sequence) 1597 if diff<0: 1598 self.insert_gap(self.nchar,-diff) 1599 elif diff>0: 1600 sequence+=self.missing*diff 1601 1602 if name in self.taxlabels: 1603 unique_name=_unique_label(self.taxlabels,name) 1604 #print "WARNING: Sequence name %s is already present. Sequence was added as %s." % (name,unique_name) 1605 else: 1606 unique_name=name 1607 1608 assert unique_name not in self.matrix.keys(), "ERROR. There is a discrepancy between taxlabels and matrix keys. Report this as a bug." 1609 1610 self.matrix[unique_name]=Seq(sequence,self.alphabet) 1611 self.ntax+=1 1612 self.taxlabels.append(unique_name) 1613 self.unaltered_taxlabels.append(name)
1614
1615 - def insert_gap(self,pos,n=1,leftgreedy=False):
1616 """Add a gap into the matrix and adjust charsets and partitions. 1617 1618 pos=0: first position 1619 pos=nchar: last position 1620 """ 1621 1622 def _adjust(set,x,d,leftgreedy=False): 1623 """Adjusts chartacter sets if gaps are inserted, taking care of 1624 new gaps within a coherent character set.""" 1625 # if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3 8 9 10 11 13 14 15 1626 # then the adjusted set will be 1 2 3 8 9 10 11 12 13 14 15 16 17 18 1627 # but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18 1628 set.sort() 1629 addpos=0 1630 for i,c in enumerate(set): 1631 if c>=x: 1632 set[i]=c+d 1633 # if we add gaps within a group of characters, we want the gap position included in this group 1634 if c==x: 1635 if leftgreedy or (i>0 and set[i-1]==c-1): 1636 addpos=i 1637 if addpos>0: 1638 set[addpos:addpos]=range(x,x+d) 1639 return set
1640 1641 if pos<0 or pos>self.nchar: 1642 raise NexusError('Illegal gap position: %d' % pos) 1643 if n==0: 1644 return 1645 if self.taxlabels: 1646 #python 2.3 does not support zip(*[]) 1647 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) 1648 else: 1649 sitesm=[] 1650 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n 1651 # #self.matrix=dict([(taxon,Seq(map(''.join,zip(*sitesm))[i],self.alphabet)) for\ 1652 # i,taxon in enumerate(self.taxlabels)]) 1653 zipped=zip(*sitesm) 1654 mapped=map(''.join,zipped) 1655 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)] 1656 self.matrix=dict(listed) 1657 self.nchar+=n 1658 # now adjust character sets 1659 for i,s in self.charsets.items(): 1660 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy) 1661 for p in self.charpartitions: 1662 for sp,s in self.charpartitions[p].items(): 1663 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy) 1664 # now adjust character state labels 1665 self.charlabels=self._adjust_charlabels(insert=[pos]*n) 1666 return self.charlabels 1667
1668 - def _adjust_charlabels(self,exclude=None,insert=None):
1669 """Return adjusted indices of self.charlabels if characters are excluded or inserted.""" 1670 if exclude and insert: 1671 raise NexusError('Can\'t exclude and insert at the same time') 1672 if not self.charlabels: 1673 return None 1674 labels=self.charlabels.keys() 1675 labels.sort() 1676 newcharlabels={} 1677 if exclude: 1678 exclude.sort() 1679 exclude.append(sys.maxint) 1680 excount=0 1681 for c in labels: 1682 if not c in exclude: 1683 while c>exclude[excount]: 1684 excount+=1 1685 newcharlabels[c-excount]=self.charlabels[c] 1686 elif insert: 1687 insert.sort() 1688 insert.append(sys.maxint) 1689 icount=0 1690 for c in labels: 1691 while c>=insert[icount]: 1692 icount+=1 1693 newcharlabels[c+icount]=self.charlabels[c] 1694 else: 1695 return self.charlabels 1696 return newcharlabels
1697
1698 - def invert(self,charlist):
1699 """Returns all character indices that are not in charlist.""" 1700 return [c for c in range(self.nchar) if c not in charlist]
1701
1702 - def gaponly(self,include_missing=False):
1703 """Return gap-only sites.""" 1704 gap=set(self.gap) 1705 if include_missing: 1706 gap.add(self.missing) 1707 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels]) 1708 gaponly=[i for i,site in enumerate(sitesm) if set(site).issubset(gap)] 1709 return gaponly
1710
1711 - def terminal_gap_to_missing(self,missing=None,skip_n=True):
1712 """Replaces all terminal gaps with missing character. 1713 1714 Mixtures like ???------??------- are properly resolved.""" 1715 1716 if not missing: 1717 missing=self.missing 1718 replace=[self.missing,self.gap] 1719 if not skip_n: 1720 replace.extend(['n','N']) 1721 for taxon in self.taxlabels: 1722 sequence=self.matrix[taxon].tostring() 1723 length=len(sequence) 1724 start,end=get_start_end(sequence,skiplist=replace) 1725 if start==-1 and end==-1: 1726 sequence=missing*length 1727 else: 1728 sequence=sequence[:end+1]+missing*(length-end-1) 1729 sequence=start*missing+sequence[start:] 1730 assert length==len(sequence), 'Illegal sequence manipulation in Nexus.termial_gap_to_missing in taxon %s' % taxon 1731 self.matrix[taxon]=Seq(sequence,self.alphabet)
1732