1
2
3
4
5
6
7 """Tree class to handle phylogenetic trees.
8
9 Provides a set of methods to read and write newick-format tree descriptions,
10 get information about trees (monphyly of taxon sets, congruence between trees,
11 common ancestors,...) and to manipulate trees (reroot trees, split terminal
12 nodes).
13 """
14
15 import sys, random, copy
16 import Nodes
17
18 PRECISION_BRANCHLENGTH=6
19 PRECISION_SUPPORT=6
20 NODECOMMENT_START='[&'
21 NODECOMMENT_END=']'
22
24
26 """Stores tree-relevant data associated with nodes (e.g. branches or otus)."""
27 - def __init__(self,taxon=None,branchlength=0.0,support=None,comment=None):
28 self.taxon=taxon
29 self.branchlength=branchlength
30 self.support=support
31 self.comment=comment
32
33 -class Tree(Nodes.Chain):
34 """Represents a tree using a chain of nodes with on predecessor (=ancestor)
35 and multiple successors (=subclades).
36 """
37
38
39
40
41
42
43
44
45
46 - def __init__(self,tree=None,weight=1.0,rooted=False,name='',data=NodeData,values_are_support=False,max_support=1.0):
47 """Ntree(self,tree)."""
48 Nodes.Chain.__init__(self)
49 self.dataclass=data
50 self.__values_are_support=values_are_support
51 self.max_support=max_support
52 self.weight=weight
53 self.rooted=rooted
54 self.name=name
55 root=Nodes.Node(data())
56 self.root = self.add(root)
57 if tree:
58
59 tree=tree.strip().replace('\n','').replace('\r','')
60
61 tree=tree.rstrip(';')
62 subtree_info, base_info = self._parse(tree)
63 root.data = self._add_nodedata(root.data, [[], base_info])
64 self._add_subtree(parent_id=root.id,tree=subtree_info)
65
67 """Parses (a,b,c...)[[[xx]:]yy] into subcomponents and travels down recursively."""
68
69
70 tree = tree.strip()
71 if tree.count('(')!=tree.count(')'):
72 raise TreeError('Parentheses do not match in (sub)tree: '+tree)
73 if tree.count('(')==0:
74
75 nodecomment=tree.find(NODECOMMENT_START)
76 colon=tree.find(':')
77 if colon==-1 and nodecomment==-1:
78 return [tree,[None]]
79 elif colon==-1 and nodecomment>-1:
80 return [tree[:nodecomment],self._get_values(tree[nodecomment:])]
81 elif colon>-1 and nodecomment==-1:
82 return [tree[:colon],self._get_values(tree[colon+1:])]
83 elif colon < nodecomment:
84 return [tree[:colon],self._get_values(tree[colon+1:])]
85 else:
86 return [tree[:nodecomment],self._get_values(tree[nodecomment:])]
87 else:
88 closing=tree.rfind(')')
89 val=self._get_values(tree[closing+1:])
90 if not val:
91 val=[None]
92 subtrees=[]
93 plevel=0
94 prev=1
95 for p in range(1,closing):
96 if tree[p]=='(':
97 plevel+=1
98 elif tree[p]==')':
99 plevel-=1
100 elif tree[p]==',' and plevel==0:
101 subtrees.append(tree[prev:p])
102 prev=p+1
103 subtrees.append(tree[prev:closing])
104 subclades=[self._parse(subtree) for subtree in subtrees]
105 return [subclades,val]
106
108 """Adds leaf or tree (in newick format) to a parent_id. (self,parent_id,tree)."""
109 if parent_id is None:
110 raise TreeError('Need node_id to connect to.')
111 for st in tree:
112 nd=self.dataclass()
113 nd = self._add_nodedata(nd, st)
114 if type(st[0])==list:
115 sn=Nodes.Node(nd)
116 self.add(sn,parent_id)
117 self._add_subtree(sn.id,st[0])
118 else:
119 nd.taxon=st[0]
120 leaf=Nodes.Node(nd)
121 self.add(leaf,parent_id)
122
124 """Add data to the node parsed from the comments, taxon and support.
125 """
126 if isinstance(st[1][-1],str) and st[1][-1].startswith(NODECOMMENT_START):
127 nd.comment=st[1].pop(-1)
128
129 elif isinstance(st[1][0], str):
130 nd.taxon = st[1][0]
131 st[1] = st[1][1:]
132 if len(st)>1:
133 if len(st[1])>=2:
134 nd.support=st[1][0]
135 if st[1][1] is not None:
136 nd.branchlength=st[1][1]
137 elif len(st[1])==1:
138 if not self.__values_are_support:
139 if st[1][0] is not None:
140 nd.branchlength=st[1][0]
141 else:
142 nd.support=st[1][0]
143 return nd
144
175
176 - def _walk(self,node=None):
177 """Return all node_ids downwards from a node."""
178
179 if node is None:
180 node=self.root
181 for n in self.node(node).succ:
182 yield n
183 for sn in self._walk(n):
184 yield sn
185
186 - def node(self,node_id):
187 """Return the instance of node_id.
188
189 node = node(self,node_id)
190 """
191 if node_id not in self.chain:
192 raise TreeError('Unknown node_id: %d' % node_id)
193 return self.chain[node_id]
194
195 - def split(self,parent_id=None,n=2,branchlength=1.0):
196 """Speciation: generates n (default two) descendants of a node.
197
198 [new ids] = split(self,parent_id=None,n=2,branchlength=1.0):
199 """
200 if parent_id is None:
201 raise TreeError('Missing node_id.')
202 ids=[]
203 parent_data=self.chain[parent_id].data
204 for i in range(n):
205 node=Nodes.Node()
206 if parent_data:
207 node.data=self.dataclass()
208
209 if parent_data.taxon:
210 node.data.taxon=parent_data.taxon+str(i)
211 node.data.branchlength=branchlength
212 ids.append(self.add(node,parent_id))
213 return ids
214
216 """Returns the first matching taxon in self.data.taxon. Not restricted to terminal nodes.
217
218 node_id = search_taxon(self,taxon)
219 """
220 for id,node in self.chain.items():
221 if node.data.taxon==taxon:
222 return id
223 return None
224
226 """Prunes a terminal taxon from the tree.
227
228 id_of_previous_node = prune(self,taxon)
229 If taxon is from a bifurcation, the connectiong node will be collapsed
230 and its branchlength added to remaining terminal node. This might be no
231 longer a meaningful value'
232 """
233
234 id=self.search_taxon(taxon)
235 if id is None:
236 raise TreeError('Taxon not found: %s' % taxon)
237 elif id not in self.get_terminals():
238 raise TreeError('Not a terminal taxon: %s' % taxon)
239 else:
240 prev=self.unlink(id)
241 self.kill(id)
242 if len(self.node(prev).succ)==1:
243 if prev==self.root:
244 self.root=self.node(self.root).succ[0]
245 self.node(self.root).branchlength=0.0
246 self.kill(prev)
247 else:
248 succ=self.node(prev).succ[0]
249 new_bl=self.node(prev).data.branchlength+self.node(succ).data.branchlength
250 self.collapse(prev)
251 self.node(succ).data.branchlength=new_bl
252 return prev
253
255 """Return a list of all otus downwards from a node (self, node_id).
256
257 nodes = get_taxa(self,node_id=None)
258 """
259
260 if node_id is None:
261 node_id=self.root
262 if node_id not in self.chain:
263 raise TreeError('Unknown node_id: %d.' % node_id)
264 if self.chain[node_id].succ==[]:
265 if self.chain[node_id].data:
266 return [self.chain[node_id].data.taxon]
267 else:
268 return None
269 else:
270 list=[]
271 for succ in self.chain[node_id].succ:
272 list.extend(self.get_taxa(succ))
273 return list
274
276 """Return a list of all terminal nodes."""
277 return [i for i in self.all_ids() if self.node(i).succ==[]]
278
280 """Returns True if node is a terminal node."""
281 return self.node(node).succ==[]
282
284 """Returns True if node is an internal node."""
285 return len(self.node(node).succ)>0
286
288 """Returns True if all successors of a node are terminal ones."""
289 if self.is_terminal(node):
290 return False not in [self.is_terminal(n) for n in self.node(node).succ]
291 else:
292 return False
294 """Counts the number of terminal nodes that are attached to a node."""
295 if node is None:
296 node=self.root
297 return len([n for n in self._walk(node) if self.is_terminal(n)])
298
300 """Collapses all subtrees which belong to the same genus (i.e share the same first word in their taxon name."""
301
302 while True:
303 for n in self._walk():
304 if self.is_terminal(n):
305 continue
306 taxa=self.get_taxa(n)
307 genera=[]
308 for t in taxa:
309 if space_equals_underscore:
310 t=t.replace(' ','_')
311 try:
312 genus=t.split('_',1)[0]
313 except:
314 genus='None'
315 if genus not in genera:
316 genera.append(genus)
317 if len(genera)==1:
318 self.node(n).data.taxon=genera[0]+' <collapsed>'
319
320 nodes2kill=[kn for kn in self._walk(node=n)]
321 for kn in nodes2kill:
322 self.kill(kn)
323 self.node(n).succ=[]
324 break
325 else:
326 break
327
328
330 """Adds up the branchlengths from root (default self.root) to node.
331
332 sum = sum_branchlength(self,root=None,node=None)
333 """
334
335 if root is None:
336 root=self.root
337 if node is None:
338 raise TreeError('Missing node id.')
339 blen=0.0
340 while node is not None and node is not root:
341 blen+=self.node(node).data.branchlength
342 node=self.node(node).prev
343 return blen
344
346 """Return subtree as a set of nested sets.
347
348 sets = set_subtree(self,node)
349 """
350
351 if self.node(node).succ==[]:
352 return self.node(node).data.taxon
353 else:
354 try:
355 return frozenset([self.set_subtree(n) for n in self.node(node).succ])
356 except:
357 print node
358 print self.node(node).succ
359 for n in self.node(node).succ:
360 print n, self.set_subtree(n)
361 print [self.set_subtree(n) for n in self.node(node).succ]
362 raise
363
365 """Compare tree and tree2 for identity.
366
367 result = is_identical(self,tree2)
368 """
369 return self.set_subtree(self.root)==tree2.set_subtree(tree2.root)
370
372 """Compares branches with support>threshold for compatibility.
373
374 result = is_compatible(self,tree2,threshold)
375 """
376
377
378 missing2=set(self.get_taxa())-set(tree2.get_taxa())
379 missing1=set(tree2.get_taxa())-set(self.get_taxa())
380 if strict and (missing1 or missing2):
381 if missing1:
382 print 'Taxon/taxa %s is/are missing in tree %s' % (','.join(missing1) , self.name)
383 if missing2:
384 print 'Taxon/taxa %s is/are missing in tree %s' % (','.join(missing2) , tree2.name)
385 raise TreeError('Can\'t compare trees with different taxon compositions.')
386 t1=[(set(self.get_taxa(n)),self.node(n).data.support) for n in self.all_ids() if \
387 self.node(n).succ and\
388 (self.node(n).data and self.node(n).data.support and self.node(n).data.support>=threshold)]
389 t2=[(set(tree2.get_taxa(n)),tree2.node(n).data.support) for n in tree2.all_ids() if \
390 tree2.node(n).succ and\
391 (tree2.node(n).data and tree2.node(n).data.support and tree2.node(n).data.support>=threshold)]
392 conflict=[]
393 for (st1,sup1) in t1:
394 for (st2,sup2) in t2:
395 if not st1.issubset(st2) and not st2.issubset(st1):
396 intersect,notin1,notin2=st1 & st2, st2-st1, st1-st2
397
398 if intersect and not (notin1.issubset(missing1) or notin2.issubset(missing2)):
399 conflict.append((st1,sup1,st2,sup2,intersect,notin1,notin2))
400 return conflict
401
403 """Return the common ancestor that connects two nodes.
404
405 node_id = common_ancestor(self,node1,node2)
406 """
407
408 l1=[self.root]+self.trace(self.root,node1)
409 l2=[self.root]+self.trace(self.root,node2)
410 return [n for n in l1 if n in l2][-1]
411
412
414 """Add and return the sum of the branchlengths between two nodes.
415 dist = distance(self,node1,node2)
416 """
417
418 ca=self.common_ancestor(node1,node2)
419 return self.sum_branchlength(ca,node1)+self.sum_branchlength(ca,node2)
420
422 """Return node_id of common ancestor if taxon_list is monophyletic, -1 otherwise.
423
424 result = is_monophyletic(self,taxon_list)
425 """
426 if isinstance(taxon_list,str):
427 taxon_set=set([taxon_list])
428 else:
429 taxon_set=set(taxon_list)
430 node_id=self.root
431 while 1:
432 subclade_taxa=set(self.get_taxa(node_id))
433 if subclade_taxa==taxon_set:
434 return node_id
435 else:
436 for subnode in self.chain[node_id].succ:
437 if set(self.get_taxa(subnode)).issuperset(taxon_set):
438 node_id=subnode
439 break
440 else:
441 return -1
442
457
458
459
461 """Move values stored in data.branchlength to data.support, and set branchlength to 0.0
462
463 This is necessary when support has been stored as branchlength (e.g. paup), and has thus
464 been read in as branchlength.
465 """
466
467 for n in self.chain.keys():
468 self.node(n).data.support=self.node(n).data.branchlength
469 self.node(n).data.branchlength=0.0
470
472 """Convert absolute support (clade-count) to rel. frequencies.
473
474 Some software (e.g. PHYLIP consense) just calculate how often clades appear, instead of
475 calculating relative frequencies."""
476
477 for n in self._walk():
478 if self.node(n).data.support:
479 self.node(n).data.support/=float(nrep)
480
482 """Returns True if any of the nodes has data.support != None."""
483 for n in self._walk(node):
484 if self.node(n).data.support:
485 return True
486 else:
487 return False
488
489 - def randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True):
490 """Generates a random tree with ntax taxa and/or taxa from taxlabels.
491
492 new_tree = randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True)
493 Trees are bifurcating by default. (Polytomies not yet supported).
494 """
495
496 if not ntax and taxon_list:
497 ntax=len(taxon_list)
498 elif not taxon_list and ntax:
499 taxon_list=['taxon'+str(i+1) for i in range(ntax)]
500 elif not ntax and not taxon_list:
501 raise TreeError('Either numer of taxa or list of taxa must be specified.')
502 elif ntax != len(taxon_list):
503 raise TreeError('Length of taxon list must correspond to ntax.')
504
505 self.__init__()
506 terminals=self.get_terminals()
507
508 while len(terminals)<ntax:
509 newsplit=random.choice(terminals)
510 new_terminals=self.split(parent_id=newsplit,branchlength=branchlength)
511
512 if branchlength_sd:
513 for nt in new_terminals:
514 bl=random.gauss(branchlength,branchlength_sd)
515 if bl<0:
516 bl=0
517 self.node(nt).data.branchlength=bl
518 terminals.extend(new_terminals)
519 terminals.remove(newsplit)
520
521 random.shuffle(taxon_list)
522 for (node,name) in zip(terminals,taxon_list):
523 self.node(node).data.taxon=name
524
526 """Quick and dirty lists of all nodes."""
527 table=[('#','taxon','prev','succ','brlen','blen (sum)','support','comment')]
528
529 for i in sorted(self.all_ids()):
530 n=self.node(i)
531 if not n.data:
532 table.append((str(i),'-',str(n.prev),str(n.succ),'-','-','-','-'))
533 else:
534 tx=n.data.taxon
535 if not tx:
536 tx='-'
537 blength=n.data.branchlength
538 if blength is None:
539 blength='-'
540 sum_blength='-'
541 else:
542 sum_blength=self.sum_branchlength(node=i)
543 support=n.data.support
544 if support is None:
545 support='-'
546 comment=n.data.comment
547 if comment is None:
548 comment='-'
549 table.append((str(i),tx,str(n.prev),str(n.succ),blength,sum_blength,support,comment))
550 print '\n'.join(['%3s %32s %15s %15s %8s %10s %8s %20s' % l for l in table])
551 print '\nRoot: ',self.root
552
553 - def to_string(self,support_as_branchlengths=False,branchlengths_only=False,plain=True,plain_newick=False,ladderize=None):
554 """Return a paup compatible tree line.
555
556 to_string(self,support_as_branchlengths=False,branchlengths_only=False,plain=True)
557 """
558
559 if support_as_branchlengths or branchlengths_only:
560 plain=False
561 self.support_as_branchlengths=support_as_branchlengths
562 self.branchlengths_only=branchlengths_only
563 self.plain=plain
564
565 def make_info_string(data,terminal=False):
566 """Creates nicely formatted support/branchlengths."""
567
568 if self.plain:
569 return ''
570 elif self.support_as_branchlengths:
571 if terminal:
572 return ':%1.2f' % self.max_support
573 else:
574 return ':%1.2f' % (data.support)
575 elif self.branchlengths_only:
576 return ':%1.5f' % (data.branchlength)
577 else:
578 if terminal:
579 return ':%1.5f' % (data.branchlength)
580 else:
581 if data.branchlength is not None and data.support is not None:
582 return '%1.2f:%1.5f' % (data.support,data.branchlength)
583 elif data.branchlength is not None:
584 return '0.00000:%1.5f' % (data.branchlength)
585 elif data.support is not None:
586 return '%1.2f:0.00000' % (data.support)
587 else:
588 return '0.00:0.00000'
589 def ladderize_nodes(nodes,ladderize=None):
590 """Sorts node numbers according to the number of terminal nodes."""
591 if ladderize in ['left','LEFT','right','RIGHT']:
592 succnode_terminals=[(self.count_terminals(node=n),n) for n in nodes]
593 succnode_terminals.sort()
594 if (ladderize=='right' or ladderize=='RIGHT'):
595 succnode_terminals.reverse()
596 if succnode_terminals:
597 succnodes=zip(*succnode_terminals)[1]
598 else:
599 succnodes=[]
600 else:
601 succnodes=nodes
602 return succnodes
603
604 def newickize(node,ladderize=None):
605 """Convert a node tree to a newick tree recursively."""
606
607 if not self.node(node).succ:
608 return self.node(node).data.taxon+make_info_string(self.node(node).data,terminal=True)
609 else:
610 succnodes=ladderize_nodes(self.node(node).succ,ladderize=ladderize)
611 subtrees=[newickize(sn,ladderize=ladderize) for sn in succnodes]
612 return '(%s)%s' % (','.join(subtrees),make_info_string(self.node(node).data))
613
614 treeline=['tree']
615 if self.name:
616 treeline.append(self.name)
617 else:
618 treeline.append('a_tree')
619 treeline.append('=')
620 if self.weight != 1:
621 treeline.append('[&W%s]' % str(round(float(self.weight),3)))
622 if self.rooted:
623 treeline.append('[&R]')
624 succnodes=ladderize_nodes(self.node(self.root).succ)
625 subtrees=[newickize(sn,ladderize=ladderize) for sn in succnodes]
626 treeline.append('(%s)' % ','.join(subtrees))
627 if plain_newick:
628 return treeline[-1]
629 else:
630 return ' '.join(treeline)+';'
631
633 """Short version of to_string(), gives plain tree"""
634 return self.to_string(plain=True)
635
637 """Defines a unrooted Tree structure, using data of a rooted Tree."""
638
639
640
641 def _get_branches(node):
642 branches=[]
643 for b in self.node(node).succ:
644 branches.append([node,b,self.node(b).data.branchlength,self.node(b).data.support])
645 branches.extend(_get_branches(b))
646 return branches
647
648 self.unrooted=_get_branches(self.root)
649
650 if len(self.node(self.root).succ)==2:
651
652 rootbranches=[b for b in self.unrooted if self.root in b[:2]]
653 b1=self.unrooted.pop(self.unrooted.index(rootbranches[0]))
654 b2=self.unrooted.pop(self.unrooted.index(rootbranches[1]))
655
656
657 newbranch=[b1[1],b2[1],b1[2]+b2[2]]
658 if b1[3] is None:
659 newbranch.append(b2[3])
660 elif b2[3] is None:
661 newbranch.append(b1[3])
662 elif b1[3]==b2[3]:
663 newbranch.append(b1[3])
664 elif b1[3]==0 or b2[3]==0:
665 newbranch.append(b1[3]+b2[3])
666 else:
667 raise TreeError('Support mismatch in bifurcating root: %f, %f' \
668 % (float(b1[3]),float(b2[3])))
669 self.unrooted.append(newbranch)
670
672
673 def _connect_subtree(parent,child):
674 """Hook subtree starting with node child to parent."""
675 for i,branch in enumerate(self.unrooted):
676 if parent in branch[:2] and child in branch[:2]:
677 branch=self.unrooted.pop(i)
678 break
679 else:
680 raise TreeError('Unable to connect nodes for rooting: nodes %d and %d are not connected' \
681 % (parent,child))
682 self.link(parent,child)
683 self.node(child).data.branchlength=branch[2]
684 self.node(child).data.support=branch[3]
685
686 child_branches=[b for b in self.unrooted if child in b[:2]]
687 for b in child_branches:
688 if child==b[0]:
689 succ=b[1]
690 else:
691 succ=b[0]
692 _connect_subtree(child,succ)
693
694
695 if outgroup is None:
696 return self.root
697 outgroup_node=self.is_monophyletic(outgroup)
698 if outgroup_node==-1:
699 return -1
700
701
702 if (len(self.node(self.root).succ)==2 and outgroup_node in self.node(self.root).succ) or outgroup_node==self.root:
703 return self.root
704
705 self.unroot()
706
707
708 for i,b in enumerate(self.unrooted):
709 if outgroup_node in b[:2] and self.node(outgroup_node).prev in b[:2]:
710 root_branch=self.unrooted.pop(i)
711 break
712 else:
713 raise TreeError('Unrooted and rooted Tree do not match')
714 if outgroup_node==root_branch[1]:
715 ingroup_node=root_branch[0]
716 else:
717 ingroup_node=root_branch[1]
718
719 for n in self.all_ids():
720 self.node(n).prev=None
721 self.node(n).succ=[]
722
723 root=Nodes.Node(data=NodeData())
724 self.add(root)
725 self.root=root.id
726 self.unrooted.append([root.id,ingroup_node,root_branch[2],root_branch[3]])
727 self.unrooted.append([root.id,outgroup_node,0.0,0.0])
728 _connect_subtree(root.id,ingroup_node)
729 _connect_subtree(root.id,outgroup_node)
730
731 oldroot=[i for i in self.all_ids() if self.node(i).prev is None and i!=self.root]
732 if len(oldroot)>1:
733 raise TreeError('Isolated nodes in tree description: %s' \
734 % ','.join(oldroot))
735 elif len(oldroot)==1:
736 self.kill(oldroot[0])
737 return self.root
738
740 """Merges clade support (from consensus or list of bootstrap-trees) with phylogeny.
741
742 tree=merge_bootstrap(phylo,bs_tree=<list_of_trees>)
743 or
744 tree=merge_bootstrap(phylo,consree=consensus_tree with clade support)
745 """
746
747 if bstrees and constree:
748 raise TreeError('Specify either list of boostrap trees or consensus tree, not both')
749 if not (bstrees or constree):
750 raise TreeError('Specify either list of boostrap trees or consensus tree.')
751
752 if outgroup is None:
753 try:
754 succnodes=self.node(self.root).succ
755 smallest=min([(len(self.get_taxa(n)),n) for n in succnodes])
756 outgroup=self.get_taxa(smallest[1])
757 except:
758 raise TreeError("Error determining outgroup.")
759 else:
760 self.root_with_outgroup(outgroup)
761
762 if bstrees:
763 constree=consensus(bstrees,threshold=threshold,outgroup=outgroup)
764 else:
765 if not constree.has_support():
766 constree.branchlength2support()
767 constree.root_with_outgroup(outgroup)
768
769 for pnode in self._walk():
770 cnode=constree.is_monophyletic(self.get_taxa(pnode))
771 if cnode>-1:
772 self.node(pnode).data.support=constree.node(cnode).data.support
773
774
775 -def consensus(trees, threshold=0.5,outgroup=None):
849