1
2
3
4
5
6
7 """Nexus class. Parse the contents of a NEXUS file.
8
9 Based upon 'NEXUS: An extensible file format for systematic information'
10 Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621
11 """
12
13 import os,sys, math, random, copy
14
15 from Bio.Alphabet import IUPAC
16 from Bio.Data import IUPACData
17 from Bio.Seq import Seq
18
19 from Trees import Tree,NodeData
20
21 try:
22 import cnexus
23 except ImportError:
24 C=False
25 else:
26 C=True
27
28 INTERLEAVE=70
29 SPECIAL_COMMANDS=['charstatelabels','charlabels','taxlabels', 'taxset', 'charset','charpartition','taxpartition',\
30 'matrix','tree', 'utree','translate','codonposset','title']
31 KNOWN_NEXUS_BLOCKS = ['trees','data', 'characters', 'taxa', 'sets','codons']
32 PUNCTUATION='()[]{}/\,;:=*\'"`+-<>'
33 MRBAYESSAFE='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_'
34 WHITESPACE=' \t\n'
35
36 SPECIALCOMMENTS=['&']
37 CHARSET='chars'
38 TAXSET='taxa'
39 CODONPOSITIONS='codonpositions'
40 DEFAULTNEXUS='#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; '
42
44 """Helps reading NEXUS-words and characters from a buffer."""
46 if string:
47 self.buffer=list(string)
48 else:
49 self.buffer=[]
50
52 if self.buffer:
53 return self.buffer[0]
54 else:
55 return None
56
58 b=''.join(self.buffer).strip()
59 if b:
60 return b[0]
61 else:
62 return None
63
65 if self.buffer:
66 return self.buffer.pop(0)
67 else:
68 return None
69
71 while True:
72 p=self.next()
73 if p is None:
74 break
75 if p not in WHITESPACE:
76 return p
77 return None
78
80 while self.buffer[0] in WHITESPACE:
81 self.buffer=self.buffer[1:]
82
84 for t in target:
85 try:
86 pos=self.buffer.index(t)
87 except ValueError:
88 pass
89 else:
90 found=''.join(self.buffer[:pos])
91 self.buffer=self.buffer[pos:]
92 return found
93 else:
94 return None
95
97 return ''.join(self.buffer[:len(word)])==word
98
100 """Return the next NEXUS word from a string.
101
102 This deals with single and double quotes, whitespace and punctuation.
103 """
104
105 word=[]
106 quoted=False
107 first=self.next_nonwhitespace()
108 if not first:
109 return None
110 word.append(first)
111 if first=="'":
112 quoted=True
113 elif first in PUNCTUATION:
114 return first
115 while True:
116 c=self.peek()
117 if c=="'":
118 word.append(self.next())
119 if self.peek()=="'":
120 skip=self.next()
121 elif quoted:
122 break
123 elif quoted:
124 word.append(self.next())
125 elif not c or c in PUNCTUATION or c in WHITESPACE:
126 break
127 else:
128 word.append(self.next())
129 return ''.join(word)
130
132 """Return the rest of the string without parsing."""
133 return ''.join(self.buffer)
134
136 """Calculate a stepmatrix for weighted parsimony.
137
138 See Wheeler (1990), Cladistics 6:269-275.
139 """
140
142 self.data={}
143 self.symbols=[s for s in symbols]
144 self.symbols.sort()
145 if gap:
146 self.symbols.append(gap)
147 for x in self.symbols:
148 for y in [s for s in self.symbols if s!=x]:
149 self.set(x,y,0)
150
151 - def set(self,x,y,value):
155
156 - def add(self,x,y,value):
160
163
170
172 for k in self.data:
173 if self.data[k]!=0:
174 self.data[k]=-math.log(self.data[k])
175 return self
176
177 - def smprint(self,name='your_name_here'):
178 matrix='usertype %s stepmatrix=%d\n' % (name,len(self.symbols))
179 matrix+=' %s\n' % ' '.join(self.symbols)
180 for x in self.symbols:
181 matrix+='[%s]'.ljust(8) % x
182 for y in self.symbols:
183 if x==y:
184 matrix+=' . '
185 else:
186 if x>y:
187 x1,y1=y,x
188 else:
189 x1,y1=x,y
190 if self.data[x1+y1]==0:
191 matrix+='inf. '
192 else:
193 matrix+='%2.2f'.ljust(10) % (self.data[x1+y1])
194 matrix+='\n'
195 matrix+=';\n'
196 return matrix
197
199 """Return a taxon identifier according to NEXUS standard.
200
201 Wrap quotes around names with punctuation or whitespace, and double
202 single quotes.
203
204 mrbayes=True: write names without quotes, whitespace or punctuation
205 for the mrbayes software package.
206 """
207 if mrbayes:
208 safe=name.replace(' ','_')
209 safe=''.join([c for c in safe if c in MRBAYESSAFE])
210 else:
211 safe=name.replace("'","''")
212 if set(safe).intersection(set(WHITESPACE+PUNCTUATION)):
213 safe="'"+safe+"'"
214 return safe
215
217 """Remove quotes and/or double quotes around identifiers."""
218 if not word:
219 return None
220 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')):
221 word=word[1:-1]
222 return word
223
242
244 """Returns a sorted list of keys of p sorted by values of p."""
245 startpos=[(p[pn],pn) for pn in p if p[pn]]
246 startpos.sort()
247 return zip(*startpos)[1]
248
250 """Check that all values in list are unique and return a pruned and sorted list."""
251 l=list(set(l))
252 l.sort()
253 return l
254
256 """Returns a unique name if label is already in previous_labels."""
257 while label in previous_labels:
258 if label.split('.')[-1].startswith('copy'):
259 label='.'.join(label.split('.')[:-1])+'.copy'+str(eval('0'+label.split('.')[-1][4:])+1)
260 else:
261 label+='.copy'
262 return label
263
265 """Converts a Seq-object matrix to a plain sequence-string matrix."""
266 return dict([(t,matrix[t].tostring()) for t in matrix])
267
269 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class)
270 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.).
271 """
272
273 if not orig_list:
274 return ''
275 orig_list=list(set(orig_list))
276 orig_list.sort()
277 shortlist=[]
278 clist=orig_list[:]
279 clist.append(clist[-1]+.5)
280 while len(clist)>1:
281 step=1
282 for i,x in enumerate(clist):
283 if x==clist[0]+i*step:
284 continue
285 elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]:
286
287
288 step=x-clist[0]
289 else:
290 sub=clist[:i]
291 if len(sub)==1:
292 shortlist.append(str(sub[0]+1))
293 else:
294 if step==1:
295 shortlist.append('%d-%d' % (sub[0]+1,sub[-1]+1))
296 else:
297 shortlist.append('%d-%d\\%d' % (sub[0]+1,sub[-1]+1,step))
298 clist=clist[i:]
299 break
300 return ' '.join(shortlist)
301
303 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance.
304
305 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...]
306 Character sets, character partitions and taxon sets are prefixed, readjusted
307 and present in the combined matrix.
308 """
309
310 if not matrices:
311 return None
312 name=matrices[0][0]
313 combined=copy.deepcopy(matrices[0][1])
314 mixed_datatypes=(len(set([n[1].datatype for n in matrices]))>1)
315 if mixed_datatypes:
316 combined.datatype='None'
317
318 combined.charlabels=None
319 combined.statelabels=None
320 combined.interleave=False
321 combined.translate=None
322
323
324 for cn,cs in combined.charsets.items():
325 combined.charsets['%s.%s' % (name,cn)]=cs
326 del combined.charsets[cn]
327 for tn,ts in combined.taxsets.items():
328 combined.taxsets['%s.%s' % (name,tn)]=ts
329 del combined.taxsets[tn]
330
331
332 combined.charpartitions={'combined':{name:range(combined.nchar)}}
333 for n,m in matrices[1:]:
334 both=[t for t in combined.taxlabels if t in m.taxlabels]
335 combined_only=[t for t in combined.taxlabels if t not in both]
336 m_only=[t for t in m.taxlabels if t not in both]
337 for t in both:
338
339 combined.matrix[t]+=Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
340
341 for t in combined_only:
342 combined.matrix[t]+=Seq(combined.missing*m.nchar,combined.alphabet)
343 for t in m_only:
344 combined.matrix[t]=Seq(combined.missing*combined.nchar,combined.alphabet)+\
345 Seq(m.matrix[t].tostring().replace(m.gap,combined.gap).replace(m.missing,combined.missing),combined.alphabet)
346 combined.taxlabels.extend(m_only)
347 for cn,cs in m.charsets.items():
348 combined.charsets['%s.%s' % (n,cn)]=[x+combined.nchar for x in cs]
349 if m.taxsets:
350 if not combined.taxsets:
351 combined.taxsets={}
352 combined.taxsets.update(dict([('%s.%s' % (n,tn),ts) for tn,ts in m.taxsets.items()]))
353 combined.charpartitions['combined'][n]=range(combined.nchar,combined.nchar+m.nchar)
354
355 if m.charlabels:
356 if not combined.charlabels:
357 combined.charlabels={}
358 combined.charlabels.update(dict([(combined.nchar+i,label) for (i,label) in m.charlabels.items()]))
359 combined.nchar+=m.nchar
360 combined.ntax+=len(m_only)
361
362
363
364 for c in combined.charpartitions['combined']:
365 combined.charsets[c]=combined.charpartitions['combined'][c]
366
367 return combined
368
427
428
430 """Adjust linebreaks to match ';', strip leading/trailing whitespace.
431
432 list_of_commandlines=_adjust_lines(input_text)
433 Lines are adjusted so that no linebreaks occur within a commandline
434 (except matrix command line)
435 """
436 formatted_lines=[]
437 for l in lines:
438
439 l=l.replace('\r\n','\n').replace('\r','\n').strip()
440 if l.lower().startswith('matrix'):
441 formatted_lines.append(l)
442 else:
443 l=l.replace('\n',' ')
444 if l:
445 formatted_lines.append(l)
446 return formatted_lines
447
449 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code."""
450
451 opening=seq.find('(')
452 while opening>-1:
453 closing=seq.find(')')
454 if closing<0:
455 raise NexusError('Missing closing parenthesis in: '+seq)
456 elif closing<opening:
457 raise NexusError('Missing opening parenthesis in: '+seq)
458 ambig=[x for x in seq[opening+1:closing]]
459 ambig.sort()
460 ambig=''.join(ambig)
461 ambig_code=rev_ambig_values[ambig.upper()]
462 if ambig!=ambig.upper():
463 ambig_code=ambig_code.lower()
464 seq=seq[:opening]+ambig_code+seq[closing+1:]
465 opening=seq.find('(')
466 return seq
467
469 """Represent a commandline as command and options."""
470
472 self.options={}
473 options=[]
474 self.command=None
475 try:
476
477 self.command, options = line.strip().split('\n', 1)
478 except ValueError:
479
480 self.command=line.split()[0]
481 options=' '.join(line.split()[1:])
482 self.command = self.command.strip().lower()
483 if self.command in SPECIAL_COMMANDS:
484 self.options=options.strip()
485 else:
486 if len(options) > 0:
487 try:
488 options = options.replace('=', ' = ').split()
489 valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))]
490 indices = []
491 for sl in valued_indices:
492 indices.extend(sl)
493 token_indices = [n for n in range(len(options)) if n not in indices]
494 for opt in valued_indices:
495
496 self.options[options[opt[0]].lower()] = options[opt[2]]
497 for token in token_indices:
498 self.options[options[token].lower()] = None
499 except ValueError:
500 raise NexusError('Incorrect formatting in line: %s' % line)
501
503 """Represent a NEXUS block with block name and list of commandlines."""
507
509
510 __slots__=['original_taxon_order','__dict__']
511
513 self.ntax=0
514 self.nchar=0
515 self.unaltered_taxlabels=[]
516 self.taxlabels=[]
517 self.charlabels=None
518 self.statelabels=None
519 self.datatype='dna'
520 self.respectcase=False
521 self.missing='?'
522 self.gap='-'
523 self.symbols=None
524 self.equate=None
525 self.matchchar=None
526 self.labels=None
527 self.transpose=False
528 self.interleave=False
529 self.tokens=False
530 self.eliminate=None
531 self.matrix=None
532 self.unknown_blocks=[]
533 self.taxsets={}
534 self.charsets={}
535 self.charpartitions={}
536 self.taxpartitions={}
537 self.trees=[]
538 self.translate=None
539 self.structured=[]
540 self.set={}
541 self.options={}
542 self.codonposset=None
543
544
545 self.options['gapmode']='missing'
546
547 if input:
548 self.read(input)
549 else:
550 self.read(DEFAULTNEXUS)
551
553 """Included for backwards compatibility (DEPRECATED)."""
554 return self.taxlabels
556 """Included for backwards compatibility (DEPRECATED)."""
557 self.taxlabels=value
558 original_taxon_order=property(get_original_taxon_order,set_original_taxon_order)
559
560 - def read(self,input):
561 """Read and parse NEXUS imput (a filename, file-handle, or string)."""
562
563
564
565 try:
566 file_contents = open(os.path.expanduser(input),'rU').read()
567 self.filename=input
568 except (TypeError,IOError,AttributeError):
569
570 if isinstance(input, str):
571 file_contents = input
572 self.filename='input_string'
573
574 elif hasattr(input,'read'):
575 file_contents=input.read()
576 if hasattr(input,"name") and input.name:
577 self.filename=input.name
578 else:
579 self.filename='Unknown_nexus_file'
580 else:
581 print input.strip()[:50]
582 raise NexusError('Unrecognized input: %s ...' % input[:100])
583 file_contents=file_contents.strip()
584 if file_contents.startswith('#NEXUS'):
585 file_contents=file_contents[6:]
586 if C:
587 decommented=cnexus.scanfile(file_contents)
588
589 if decommented=='[' or decommented==']':
590 raise NexusError('Unmatched %s' % decommented)
591
592
593 commandlines=_adjust_lines(decommented.split(chr(7)))
594 else:
595 commandlines=_adjust_lines(_kill_comments_and_break_lines(file_contents))
596
597 for i,cl in enumerate(commandlines):
598 try:
599 if cl[:6].upper()=='#NEXUS':
600 commandlines[i]=cl[6:].strip()
601 except:
602 pass
603
604 nexus_block_gen = self._get_nexus_block(commandlines)
605 while 1:
606 try:
607 title, contents = nexus_block_gen.next()
608 except StopIteration:
609 break
610 if title in KNOWN_NEXUS_BLOCKS:
611 self._parse_nexus_block(title, contents)
612 else:
613 self._unknown_nexus_block(title, contents)
614
616 """Generator for looping through Nexus blocks."""
617 inblock=False
618 blocklines=[]
619 while file_contents:
620 cl=file_contents.pop(0)
621 if cl.lower().startswith('begin'):
622 if not inblock:
623 inblock=True
624 title=cl.split()[1].lower()
625 else:
626 raise NexusError('Illegal block nesting in block %s' % title)
627 elif cl.lower().startswith('end'):
628 if inblock:
629 inblock=False
630 yield title,blocklines
631 blocklines=[]
632 else:
633 raise NexusError('Unmatched \'end\'.')
634 elif inblock:
635 blocklines.append(cl)
636
642
644 """Parse a known Nexus Block (PRIVATE)."""
645
646 self._apply_block_structure(title, contents)
647
648
649 block=self.structured[-1]
650 for line in block.commandlines:
651 try:
652 getattr(self,'_'+line.command)(line.options)
653 except AttributeError:
654 raise
655 raise NexusError('Unknown command: %s ' % line.command)
656
659
661 if 'ntax' in options:
662 self.ntax=eval(options['ntax'])
663 if 'nchar' in options:
664 self.nchar=eval(options['nchar'])
665
745
746
747 - def _set(self,options):
749
751 self.options=options;
752
754 self.eliminate=options
755
757 """Get taxon labels (PRIVATE).
758
759 As the taxon names are already in the matrix, this is superfluous
760 except for transpose matrices, which are currently unsupported anyway.
761 Thus, we ignore the taxlabels command to make handling of duplicate
762 taxon names easier.
763 """
764 pass
765
766
767
768
769
770
771
772
774 """Check for presence of taxon in self.taxlabels."""
775
776
777 nextaxa=dict([(t.replace(' ','_'),t) for t in self.taxlabels])
778 nexid=taxon.replace(' ','_')
779 return nextaxa.get(nexid)
780
803
807
812
814 if not self.ntax or not self.nchar:
815 raise NexusError('Dimensions must be specified before matrix!')
816 self.matrix={}
817 taxcount=0
818 first_matrix_block=True
819
820
821 lines=[l.strip() for l in options.split('\n') if l.strip()!='']
822 lineiter=iter(lines)
823 while 1:
824 try:
825 l=lineiter.next()
826 except StopIteration:
827 if taxcount<self.ntax:
828 raise NexusError('Not enough taxa in matrix.')
829 elif taxcount>self.ntax:
830 raise NexusError('Too many taxa in matrix.')
831 else:
832 break
833
834 taxcount+=1
835
836 if taxcount>self.ntax:
837 if not self.interleave:
838 raise NexusError('Too many taxa in matrix - should matrix be interleaved?')
839 else:
840 taxcount=1
841 first_matrix_block=False
842
843 linechars=CharBuffer(l)
844 id=quotestrip(linechars.next_word())
845 l=linechars.rest().strip()
846 chars=''
847 if self.interleave:
848
849
850 if l:
851 chars=''.join(l.split())
852 else:
853 chars=''.join(lineiter.next().split())
854 else:
855
856 chars=''.join(l.split())
857 while len(chars)<self.nchar:
858 l=lineiter.next()
859 chars+=''.join(l.split())
860 iupac_seq=Seq(_replace_parenthesized_ambigs(chars,self.rev_ambiguous_values),self.alphabet)
861
862 if taxcount==1:
863 refseq=iupac_seq
864 else:
865 if self.matchchar:
866 while 1:
867 p=iupac_seq.tostring().find(self.matchchar)
868 if p==-1:
869 break
870 iupac_seq=Seq(iupac_seq.tostring()[:p]+refseq[p]+iupac_seq.tostring()[p+1:],self.alphabet)
871
872 for i,c in enumerate(iupac_seq.tostring()):
873 if c not in self.valid_characters and c!=self.gap and c!=self.missing:
874 raise NexusError('Taxon %s: Illegal character %s in line: %s (check dimensions / interleaving)'\
875 % (id,c,l[i-10:i+10]))
876
877 if first_matrix_block:
878 self.unaltered_taxlabels.append(id)
879 id=_unique_label(self.matrix.keys(),id)
880 self.matrix[id]=iupac_seq
881 self.taxlabels.append(id)
882 else:
883
884 id=_unique_label(self.taxlabels[:taxcount-1],id)
885 taxon_present=self._check_taxlabels(id)
886 if taxon_present:
887 self.matrix[taxon_present]+=iupac_seq
888 else:
889 raise NexusError('Taxon %s not in first block of interleaved matrix. Check matrix dimensions and interleave.' % id)
890
891 for taxon in self.matrix:
892 if len(self.matrix[taxon])!=self.nchar:
893 raise NexusError('Matrx Nchar %d does not match data length (%d) for taxon %s' \
894 % (self.nchar, len(self.matrix[taxon]),taxon))
895
896 matrixkeys=self.matrix.keys()
897 matrixkeys.sort()
898 taxlabelssort=self.taxlabels[:]
899 taxlabelssort.sort()
900 assert matrixkeys==taxlabelssort,"ERROR: TAXLABELS must be identical with MATRIX. Please Report this as a bug, and send in data file."
901
921
923 """Some software (clustalx) uses 'utree' to denote an unrooted tree."""
924 self._tree(options)
925
926 - def _tree(self,options):
959
966
970
974
976 taxpartition={}
977 quotelevel=False
978 opts=CharBuffer(options)
979 name=self._name_n_vector(opts)
980 if not name:
981 raise NexusError('Formatting error in taxpartition: %s ' % options)
982
983
984
985 sub=''
986 while True:
987 w=opts.next()
988 if w is None or (w==',' and not quotelevel):
989 subname,subindices=self._get_indices(sub,set_type=TAXSET,separator=':')
990 taxpartition[subname]=_make_unique(subindices)
991 sub=''
992 if w is None:
993 break
994 else:
995 if w=="'":
996 quotelevel=not quotelevel
997 sub+=w
998 self.taxpartitions[name]=taxpartition
999
1001 """Read codon positions from a codons block as written from McClade.
1002
1003 Here codonposset is just a fancy name for a character partition with
1004 the name CodonPositions and the partitions N,1,2,3
1005 """
1006
1007 prev_partitions=self.charpartitions.keys()
1008 self._charpartition(options)
1009
1010 codonname=[n for n in self.charpartitions.keys() if n not in prev_partitions]
1011 if codonname==[] or len(codonname)>1:
1012 raise NexusError('Formatting Error in codonposset: %s ' % options)
1013 else:
1014 self.codonposset=codonname[0]
1015
1018
1020 charpartition={}
1021 quotelevel=False
1022 opts=CharBuffer(options)
1023 name=self._name_n_vector(opts)
1024 if not name:
1025 raise NexusError('Formatting error in charpartition: %s ' % options)
1026
1027
1028 sub=''
1029 while True:
1030 w=opts.next()
1031 if w is None or (w==',' and not quotelevel):
1032 subname,subindices=self._get_indices(sub,set_type=CHARSET,separator=':')
1033 charpartition[subname]=_make_unique(subindices)
1034 sub=''
1035 if w is None:
1036 break
1037 else:
1038 if w=="'":
1039 quotelevel=not quotelevel
1040 sub+=w
1041 self.charpartitions[name]=charpartition
1042
1044 """Parse the taxset/charset specification (PRIVATE).
1045
1046 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3'
1047 --> [0,1,2,3,4,'dog','cat',9,12,15,18]
1048 """
1049 opts=CharBuffer(options)
1050 name=self._name_n_vector(opts,separator=separator)
1051 indices=self._parse_list(opts,set_type=set_type)
1052 if indices is None:
1053 raise NexusError('Formatting error in line: %s ' % options)
1054 return name,indices
1055
1079
1081 """Parse a NEXUS list (PRIVATE).
1082
1083 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21],
1084 (assuming dog is taxon no. 17 and cat is taxon no. 21).
1085 """
1086 plain_list=[]
1087 if options_buffer.peek_nonwhitespace():
1088 try:
1089 while True:
1090 identifier=options_buffer.next_word()
1091 if not identifier:
1092 break
1093 start=self._resolve(identifier,set_type=set_type)
1094 if options_buffer.peek_nonwhitespace()=='-':
1095 end=start
1096 step=1
1097
1098 hyphen=options_buffer.next_nonwhitespace()
1099 end=self._resolve(options_buffer.next_word(),set_type=set_type)
1100 if set_type==CHARSET:
1101 if options_buffer.peek_nonwhitespace()=='\\':
1102 backslash=options_buffer.next_nonwhitespace()
1103 step=int(options_buffer.next_word())
1104 plain_list.extend(range(start,end+1,step))
1105 else:
1106 if type(start)==list or type(end)==list:
1107 raise NexusError('Name if character sets not allowed in range definition: %s'\
1108 % identifier)
1109 start=self.taxlabels.index(start)
1110 end=self.taxlabels.index(end)
1111 taxrange=self.taxlabels[start:end+1]
1112 plain_list.extend(taxrange)
1113 else:
1114 if type(start)==list:
1115 plain_list.extend(start)
1116 else:
1117 plain_list.append(start)
1118 except NexusError:
1119 raise
1120 except:
1121 return None
1122 return plain_list
1123
1124 - def _resolve(self,identifier,set_type=None):
1125 """Translate identifier in list into character/taxon index.
1126
1127 Characters (which are referred to by their index in Nexus.py):
1128 Plain numbers are returned minus 1 (Nexus indices to python indices)
1129 Text identifiers are translaterd into their indices (if plain character indentifiers),
1130 the first hit in charlabels is returned (charlabels don't need to be unique)
1131 or the range of indices is returned (if names of character sets).
1132 Taxa (which are referred to by their unique name in Nexus.py):
1133 Plain numbers are translated in their taxon name, underscores and spaces are considered equal.
1134 Names are returned unchanged (if plain taxon identifiers), or the names in
1135 the corresponding taxon set is returned
1136 """
1137 identifier=quotestrip(identifier)
1138 if not set_type:
1139 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.')
1140 if set_type==CHARSET:
1141 try:
1142 n=int(identifier)
1143 except ValueError:
1144 if self.charlabels and identifier in self.charlabels.values():
1145 for k in self.charlabels:
1146 if self.charlabels[k]==identifier:
1147 return k
1148 elif self.charsets and identifier in self.charsets:
1149 return self.charsets[identifier]
1150 else:
1151 raise NexusError('Unknown character identifier: %s' \
1152 % identifier)
1153 else:
1154 if n<=self.nchar:
1155 return n-1
1156 else:
1157 raise NexusError('Illegal character identifier: %d>nchar (=%d).' \
1158 % (identifier,self.nchar))
1159 elif set_type==TAXSET:
1160 try:
1161 n=int(identifier)
1162 except ValueError:
1163 taxlabels_id=self._check_taxlabels(identifier)
1164 if taxlabels_id:
1165 return taxlabels_id
1166 elif self.taxsets and identifier in self.taxsets:
1167 return self.taxsets[identifier]
1168 else:
1169 raise NexusError('Unknown taxon identifier: %s' \
1170 % identifier)
1171 else:
1172 if n>0 and n<=self.ntax:
1173 return self.taxlabels[n-1]
1174 else:
1175 raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' \
1176 % (identifier,self.ntax))
1177 else:
1178 raise NexusError('Unknown set specification: %s.'% set_type)
1179
1183
1187
1191
1195
1196 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, interleave=False,
1197 exclude=[], delete=[], charpartition=None, comment='',mrbayes=False):
1198 """Writes a nexus file for each partition in charpartition.
1199
1200 Only non-excluded characters and non-deleted taxa are included,
1201 just the data block is written.
1202 """
1203
1204 if not matrix:
1205 matrix=self.matrix
1206 if not matrix:
1207 return
1208 if not filename:
1209 filename=self.filename
1210 if charpartition:
1211 pfilenames={}
1212 for p in charpartition:
1213 total_exclude=[]+exclude
1214 total_exclude.extend([c for c in range(self.nchar) if c not in charpartition[p]])
1215 total_exclude=_make_unique(total_exclude)
1216 pcomment=comment+'\nPartition: '+p+'\n'
1217 dot=filename.rfind('.')
1218 if dot>0:
1219 pfilename=filename[:dot]+'_'+p+'.data'
1220 else:
1221 pfilename=filename+'_'+p
1222 pfilenames[p]=pfilename
1223 self.write_nexus_data(filename=pfilename,matrix=matrix,blocksize=blocksize,
1224 interleave=interleave,exclude=total_exclude,delete=delete,comment=pcomment,append_sets=False,
1225 mrbayes=mrbayes)
1226 return pfilenames
1227 else:
1228 fn=self.filename+'.data'
1229 self.write_nexus_data(filename=fn,matrix=matrix,blocksize=blocksize,interleave=interleave,
1230 exclude=exclude,delete=delete,comment=comment,append_sets=False,
1231 mrbayes=mrbayes)
1232 return fn
1233
1234 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\
1235 blocksize=None, interleave=False, interleave_by_partition=False,\
1236 comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False,\
1237 codons_block=True):
1238 """Writes a nexus file with data and sets block to a file or handle.
1239
1240 Character sets and partitions are appended by default, and are
1241 adjusted according to excluded characters (i.e. character sets
1242 still point to the same sites (not necessarily same positions),
1243 without including the deleted characters.
1244
1245 filename - Either a filename as a string (which will be opened,
1246 written to and closed), or a handle object (which will
1247 be written to but NOT closed).
1248 omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the
1249 start of the file is ommited.
1250
1251 Returns the filename/handle used to write the data.
1252 """
1253 if not matrix:
1254 matrix=self.matrix
1255 if not matrix:
1256 return
1257 if not filename:
1258 filename=self.filename
1259 if [t for t in delete if not self._check_taxlabels(t)]:
1260 raise NexusError('Unknown taxa: %s' \
1261 % ', '.join(set(delete).difference(set(self.taxlabels))))
1262 if interleave_by_partition:
1263 if not interleave_by_partition in self.charpartitions:
1264 raise NexusError('Unknown partition: '+interleave_by_partition)
1265 else:
1266 partition=self.charpartitions[interleave_by_partition]
1267
1268 names=_sort_keys_by_values(partition)
1269 newpartition={}
1270 for p in partition:
1271 newpartition[p]=[c for c in partition[p] if c not in exclude]
1272
1273 undelete=[taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete]
1274 cropped_matrix=_seqmatrix2strmatrix(self.crop_matrix(matrix,exclude=exclude,delete=delete))
1275 ntax_adjusted=len(undelete)
1276 nchar_adjusted=len(cropped_matrix[undelete[0]])
1277 if not undelete or (undelete and undelete[0]==''):
1278 return
1279 if isinstance(filename,str):
1280 try:
1281 fh=open(filename,'w')
1282 except IOError:
1283 raise NexusError('Could not open %s for writing.' % filename)
1284 elif hasattr(file, "write"):
1285 fh=filename
1286 else:
1287 raise ValueError("Neither a filename nor a handle was supplied")
1288 if not omit_NEXUS:
1289 fh.write('#NEXUS\n')
1290 if comment:
1291 fh.write('['+comment+']\n')
1292 fh.write('begin data;\n')
1293 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted))
1294 fh.write('\tformat datatype='+self.datatype)
1295 if self.respectcase:
1296 fh.write(' respectcase')
1297 if self.missing:
1298 fh.write(' missing='+self.missing)
1299 if self.gap:
1300 fh.write(' gap='+self.gap)
1301 if self.matchchar:
1302 fh.write(' matchchar='+self.matchchar)
1303 if self.labels:
1304 fh.write(' labels='+self.labels)
1305 if self.equate:
1306 fh.write(' equate='+self.equate)
1307 if interleave or interleave_by_partition:
1308 fh.write(' interleave')
1309 fh.write(';\n')
1310
1311
1312 if self.charlabels:
1313 newcharlabels=self._adjust_charlabels(exclude=exclude)
1314 clkeys=newcharlabels.keys()
1315 clkeys.sort()
1316 fh.write('charlabels '+', '.join(["%s %s" % (k+1,safename(newcharlabels[k])) for k in clkeys])+';\n')
1317 fh.write('matrix\n')
1318 if not blocksize:
1319 if interleave:
1320 blocksize=70
1321 else:
1322 blocksize=self.nchar
1323
1324 namelength=max([len(safename(t,mrbayes=mrbayes)) for t in undelete])
1325 if interleave_by_partition:
1326
1327 seek=0
1328 for p in names:
1329 fh.write('[%s: %s]\n' % (interleave_by_partition,p))
1330 if len(newpartition[p])>0:
1331 for taxon in undelete:
1332 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1333 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n')
1334 fh.write('\n')
1335 else:
1336 fh.write('[empty]\n\n')
1337 seek+=len(newpartition[p])
1338 elif interleave:
1339 for seek in range(0,nchar_adjusted,blocksize):
1340 for taxon in undelete:
1341 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1342 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1343 fh.write('\n')
1344 else:
1345 for taxon in undelete:
1346 if blocksize<nchar_adjusted:
1347 fh.write(safename(taxon,mrbayes=mrbayes)+'\n')
1348 else:
1349 fh.write(safename(taxon,mrbayes=mrbayes).ljust(namelength+1))
1350 for seek in range(0,nchar_adjusted,blocksize):
1351 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n')
1352 fh.write(';\nend;\n')
1353 if append_sets:
1354 if codons_block:
1355 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,include_codons=False))
1356 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes,codons_only=True))
1357 else:
1358 fh.write(self.append_sets(exclude=exclude,delete=delete,mrbayes=mrbayes))
1359
1360 if fh == filename:
1361
1362 return filename
1363 else:
1364
1365 fh.close()
1366 return filename
1367
1368 - def append_sets(self,exclude=[],delete=[],mrbayes=False,include_codons=True,codons_only=False):
1369 """Returns a sets block."""
1370 if not self.charsets and not self.taxsets and not self.charpartitions:
1371 return ''
1372 if codons_only:
1373 setsb=['\nbegin codons']
1374 else:
1375 setsb=['\nbegin sets']
1376
1377
1378
1379
1380 offset=0
1381 offlist=[]
1382 for c in range(self.nchar):
1383 if c in exclude:
1384 offset+=1
1385 offlist.append(-1)
1386 else:
1387 offlist.append(c-offset)
1388
1389 if not codons_only:
1390 for n,ns in self.charsets.items():
1391 cset=[offlist[c] for c in ns if c not in exclude]
1392 if cset:
1393 setsb.append('charset %s = %s' % (safename(n),_compact4nexus(cset)))
1394 for n,s in self.taxsets.items():
1395 tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete]
1396 if tset:
1397 setsb.append('taxset %s = %s' % (safename(n),' '.join(tset)))
1398 for n,p in self.charpartitions.items():
1399 if not include_codons and n==CODONPOSITIONS:
1400 continue
1401 elif codons_only and n!=CODONPOSITIONS:
1402 continue
1403
1404
1405
1406 names=_sort_keys_by_values(p)
1407 newpartition={}
1408 for sn in names:
1409 nsp=[offlist[c] for c in p[sn] if c not in exclude]
1410 if nsp:
1411 newpartition[sn]=nsp
1412 if newpartition:
1413 if include_codons and n==CODONPOSITIONS:
1414 command='codonposset'
1415 else:
1416 command='charpartition'
1417 setsb.append('%s %s = %s' % (command,safename(n),\
1418 ', '.join(['%s: %s' % (sn,_compact4nexus(newpartition[sn])) for sn in names if sn in newpartition])))
1419
1420 for n,p in self.taxpartitions.items():
1421 names=_sort_keys_by_values(p)
1422 newpartition={}
1423 for sn in names:
1424 nsp=[t for t in p[sn] if t not in delete]
1425 if nsp:
1426 newpartition[sn]=nsp
1427 if newpartition:
1428 setsb.append('taxpartition %s = %s' % (safename(n),\
1429 ', '.join(['%s: %s' % (safename(sn),' '.join(map(safename,newpartition[sn]))) for sn in names if sn in newpartition])))
1430
1431 setsb.append('end;\n')
1432 if len(setsb)==2:
1433 return ''
1434 else:
1435 return ';\n'.join(setsb)
1436
1438 """Writes matrix into a fasta file: (self, filename=None, width=70)."""
1439 if not filename:
1440 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
1441 filename='.'.join(self.filename.split('.')[:-1])+'.fas'
1442 else:
1443 filename=self.filename+'.fas'
1444 fh=open(filename,'w')
1445 for taxon in self.taxlabels:
1446 fh.write('>'+safename(taxon)+'\n')
1447 for i in range(0, len(self.matrix[taxon].tostring()), width):
1448 fh.write(self.matrix[taxon].tostring()[i:i+width] + '\n')
1449 fh.close()
1450 return filename
1451
1453 """Writes matrix into a PHYLIP file: (self, filename=None).
1454
1455 Note that this writes a relaxed PHYLIP format file, where the names
1456 are not truncated, nor checked for invalid characters."""
1457 if not filename:
1458 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
1459 filename='.'.join(self.filename.split('.')[:-1])+'.phy'
1460 else:
1461 filename=self.filename+'.phy'
1462 fh=open(filename,'w')
1463 fh.write('%d %d\n' % (self.ntax,self.nchar))
1464 for taxon in self.taxlabels:
1465 fh.write('%s %s\n' % (safename(taxon),self.matrix[taxon].tostring()))
1466 fh.close()
1467 return filename
1468
1469 - def constant(self,matrix=None,delete=[],exclude=[]):
1470 """Return a list with all constant characters."""
1471 if not matrix:
1472 matrix=self.matrix
1473 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1474 if not undelete:
1475 return None
1476 elif len(undelete)==1:
1477 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude]
1478
1479 constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for
1480 x,n in enumerate(matrix[undelete[0]].tostring()) if x not in exclude]
1481 for taxon in undelete[1:]:
1482 newconstant=[]
1483 for site in constant:
1484
1485 seqsite=matrix[taxon][site[0]].upper()
1486
1487 if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]:
1488
1489 newconstant.append(site)
1490 elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap):
1491
1492 newconstant.append((site[0],self.ambiguous_values.get(seqsite,seqsite)))
1493 elif seqsite in self.ambiguous_values:
1494 intersect=sets.set(self.ambiguous_values[seqsite]).intersection(sets.set(site[1]))
1495 if intersect:
1496 newconstant.append((site[0],''.join(intersect)))
1497
1498
1499
1500
1501
1502 constant=newconstant
1503 cpos=[s[0] for s in constant]
1504 return constant
1505
1506
1507 - def cstatus(self,site,delete=[],narrow=True):
1508 """Summarize character.
1509
1510 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?)
1511 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -)
1512 """
1513 undelete=[t for t in self.taxlabels if t not in delete]
1514 if not undelete:
1515 return None
1516 cstatus=[]
1517 for t in undelete:
1518 c=self.matrix[t][site].upper()
1519 if self.options.get('gapmode')=='missing' and c==self.gap:
1520 c=self.missing
1521 if narrow and c==self.missing:
1522 if c not in cstatus:
1523 cstatus.append(c)
1524 else:
1525 cstatus.extend([b for b in self.ambiguous_values[c] if b not in cstatus])
1526 if self.missing in cstatus and narrow and len(cstatus)>1:
1527 cstatus=[c for c in cstatus if c!=self.missing]
1528 cstatus.sort()
1529 return cstatus
1530
1544
1545 - def crop_matrix(self,matrix=None, delete=[], exclude=[]):
1546 """Return a matrix without deleted taxa and excluded characters."""
1547 if not matrix:
1548 matrix=self.matrix
1549 if [t for t in delete if not self._check_taxlabels(t)]:
1550 raise NexusError('Unknown taxa: %s' \
1551 % ', '.join(sets.set(delete).difference(self.taxlabels)))
1552 if exclude!=[]:
1553 undelete=[t for t in self.taxlabels if t in matrix and t not in delete]
1554 if not undelete:
1555 return {}
1556 m=[matrix[k].tostring() for k in undelete]
1557 zipped_m=zip(*m)
1558 sitesm=[s for i,s in enumerate(zipped_m) if i not in exclude]
1559 if sitesm==[]:
1560 return dict([(t,Seq('',self.alphabet)) for t in undelete])
1561 else:
1562 zipped_sitesm=zip(*sitesm)
1563 m=[Seq(s,self.alphabet) for s in map(''.join,zipped_sitesm)]
1564 return dict(zip(undelete,m))
1565 else:
1566 return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
1567
1568 - def bootstrap(self,matrix=None,delete=[],exclude=[]):
1569 """Return a bootstrapped matrix."""
1570 if not matrix:
1571 matrix=self.matrix
1572 seqobjects=isinstance(matrix[matrix.keys()[0]],Seq)
1573 cm=self.crop_matrix(delete=delete,exclude=exclude)
1574 if not cm:
1575 return {}
1576 elif len(cm[cm.keys()[0]])==0:
1577 return cm
1578 undelete=[t for t in self.taxlabels if t in cm]
1579 if seqobjects:
1580 sitesm=zip(*[cm[t].tostring() for t in undelete])
1581 alphabet=matrix[matrix.keys()[0]].alphabet
1582 else:
1583 sitesm=zip(*[cm[t] for t in undelete])
1584 bootstrapsitesm=[sitesm[random.randint(0,len(sitesm)-1)] for i in range(len(sitesm))]
1585 bootstrapseqs=map(''.join,zip(*bootstrapsitesm))
1586 if seqobjects:
1587 bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs]
1588 return dict(zip(undelete,bootstrapseqs))
1589
1591 """Adds a sequence (string) to the matrix."""
1592
1593 if not name:
1594 raise NexusError('New sequence must have a name')
1595
1596 diff=self.nchar-len(sequence)
1597 if diff<0:
1598 self.insert_gap(self.nchar,-diff)
1599 elif diff>0:
1600 sequence+=self.missing*diff
1601
1602 if name in self.taxlabels:
1603 unique_name=_unique_label(self.taxlabels,name)
1604
1605 else:
1606 unique_name=name
1607
1608 assert unique_name not in self.matrix.keys(), "ERROR. There is a discrepancy between taxlabels and matrix keys. Report this as a bug."
1609
1610 self.matrix[unique_name]=Seq(sequence,self.alphabet)
1611 self.ntax+=1
1612 self.taxlabels.append(unique_name)
1613 self.unaltered_taxlabels.append(name)
1614
1616 """Add a gap into the matrix and adjust charsets and partitions.
1617
1618 pos=0: first position
1619 pos=nchar: last position
1620 """
1621
1622 def _adjust(set,x,d,leftgreedy=False):
1623 """Adjusts chartacter sets if gaps are inserted, taking care of
1624 new gaps within a coherent character set."""
1625
1626
1627
1628 set.sort()
1629 addpos=0
1630 for i,c in enumerate(set):
1631 if c>=x:
1632 set[i]=c+d
1633
1634 if c==x:
1635 if leftgreedy or (i>0 and set[i-1]==c-1):
1636 addpos=i
1637 if addpos>0:
1638 set[addpos:addpos]=range(x,x+d)
1639 return set
1640
1641 if pos<0 or pos>self.nchar:
1642 raise NexusError('Illegal gap position: %d' % pos)
1643 if n==0:
1644 return
1645 if self.taxlabels:
1646
1647 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1648 else:
1649 sitesm=[]
1650 sitesm[pos:pos]=[['-']*len(self.taxlabels)]*n
1651
1652
1653 zipped=zip(*sitesm)
1654 mapped=map(''.join,zipped)
1655 listed=[(taxon,Seq(mapped[i],self.alphabet)) for i,taxon in enumerate(self.taxlabels)]
1656 self.matrix=dict(listed)
1657 self.nchar+=n
1658
1659 for i,s in self.charsets.items():
1660 self.charsets[i]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1661 for p in self.charpartitions:
1662 for sp,s in self.charpartitions[p].items():
1663 self.charpartitions[p][sp]=_adjust(s,pos,n,leftgreedy=leftgreedy)
1664
1665 self.charlabels=self._adjust_charlabels(insert=[pos]*n)
1666 return self.charlabels
1667
1669 """Return adjusted indices of self.charlabels if characters are excluded or inserted."""
1670 if exclude and insert:
1671 raise NexusError('Can\'t exclude and insert at the same time')
1672 if not self.charlabels:
1673 return None
1674 labels=self.charlabels.keys()
1675 labels.sort()
1676 newcharlabels={}
1677 if exclude:
1678 exclude.sort()
1679 exclude.append(sys.maxint)
1680 excount=0
1681 for c in labels:
1682 if not c in exclude:
1683 while c>exclude[excount]:
1684 excount+=1
1685 newcharlabels[c-excount]=self.charlabels[c]
1686 elif insert:
1687 insert.sort()
1688 insert.append(sys.maxint)
1689 icount=0
1690 for c in labels:
1691 while c>=insert[icount]:
1692 icount+=1
1693 newcharlabels[c+icount]=self.charlabels[c]
1694 else:
1695 return self.charlabels
1696 return newcharlabels
1697
1699 """Returns all character indices that are not in charlist."""
1700 return [c for c in range(self.nchar) if c not in charlist]
1701
1702 - def gaponly(self,include_missing=False):
1703 """Return gap-only sites."""
1704 gap=set(self.gap)
1705 if include_missing:
1706 gap.add(self.missing)
1707 sitesm=zip(*[self.matrix[t].tostring() for t in self.taxlabels])
1708 gaponly=[i for i,site in enumerate(sitesm) if set(site).issubset(gap)]
1709 return gaponly
1710
1732