Package Bio :: Package AlignIO :: Module NexusIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.NexusIO

  1  # Copyright 2008-2009 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  Bio.AlignIO support for the "nexus" file format. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11   
 12  See also the Bio.Nexus module (which this code calls internally), 
 13  as this offers more than just accessing the alignment or its 
 14  sequences as SeqRecord objects. 
 15  """ 
 16   
 17  from Bio.Nexus import Nexus 
 18  from Bio.Align.Generic import Alignment 
 19  from Bio.SeqRecord import SeqRecord 
 20  from Interfaces import AlignmentWriter 
 21  from Bio import Alphabet 
 22   
 23  #You can get a couple of example files here: 
 24  #http://www.molecularevolution.org/resources/fileformats/ 
 25       
 26  #This is a generator function! 
27 -def NexusIterator(handle, seq_count=None):
28 """Returns SeqRecord objects from a Nexus file. 29 30 Thus uses the Bio.Nexus module to do the hard work. 31 32 You are expected to call this function via Bio.SeqIO or Bio.AlignIO 33 (and not use it directly). 34 35 NOTE - We only expect ONE alignment matrix per Nexus file, 36 meaning this iterator will only yield one Alignment.""" 37 n = Nexus.Nexus(handle) 38 if not n.matrix: 39 #No alignment found 40 raise StopIteration 41 alignment = Alignment(n.alphabet) 42 43 #Bio.Nexus deals with duplicated names by adding a '.copy' suffix. 44 #The original names and the modified names are kept in these two lists: 45 assert len(n.unaltered_taxlabels) == len(n.taxlabels) 46 47 if seq_count and seq_count != len(n.unaltered_taxlabels): 48 raise ValueError("Found %i sequences, but seq_count=%i" \ 49 % (len(n.unaltered_taxlabels), seq_count)) 50 51 for old_name, new_name in zip (n.unaltered_taxlabels, n.taxlabels): 52 assert new_name.startswith(old_name) 53 seq = n.matrix[new_name] #already a Seq object with the alphabet set 54 #ToDo - Can we extract any annotation too? 55 #ToDo - Avoid abusing the private _records list 56 alignment._records.append(SeqRecord(seq, 57 id=new_name, 58 name=old_name, 59 description="")) 60 #All done 61 yield alignment
62
63 -class NexusWriter(AlignmentWriter):
64 """Nexus alignment writer. 65 66 Note that Nexus files are only expected to hold ONE alignment 67 matrix. 68 69 You are expected to call this class via the Bio.AlignIO.write() or 70 Bio.SeqIO.write() functions. 71 """
72 - def write_file(self, alignments):
73 """Use this to write an entire file containing the given alignments. 74 75 alignments - A list or iterator returning Alignment objects. 76 This should hold ONE and only one Alignment. 77 """ 78 align_iter = iter(alignments) #Could have been a list 79 try: 80 first_alignment = align_iter.next() 81 except StopIteration: 82 first_alignment = None 83 if first_alignment is None: 84 #Nothing to write! 85 return 0 86 87 #Check there is only one alignment... 88 try: 89 second_alignment = align_iter.next() 90 except StopIteration: 91 second_alignment = None 92 if second_alignment is not None: 93 raise ValueError("We can only write one Alignment to a Nexus file.") 94 95 #Good. Actually write the single alignment, 96 self.write_alignment(first_alignment) 97 return 1 #we only support writing one alignment!
98
99 - def write_alignment(self, alignment):
100 #Creates an empty Nexus object, adds the sequences, 101 #and then gets Nexus to prepare the output. 102 if len(alignment.get_all_seqs()) == 0: 103 raise ValueError("Must have at least one sequence") 104 if alignment.get_alignment_length() == 0: 105 raise ValueError("Non-empty sequences are required") 106 minimal_record = "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; " \ 107 + "format datatype=%s; end;" \ 108 % self._classify_alphabet_for_nexus(alignment._alphabet) 109 n = Nexus.Nexus(minimal_record) 110 n.alphabet = alignment._alphabet 111 for record in alignment: 112 n.add_sequence(record.id, record.seq.tostring()) 113 n.write_nexus_data(self.handle)
114
115 - def _classify_alphabet_for_nexus(self, alphabet):
116 """Returns 'protein', 'dna', 'rna' based on the alphabet (PRIVATE). 117 118 Raises an exception if this is not possible.""" 119 #Get the base alphabet (underneath any Gapped or StopCodon encoding) 120 a = Alphabet._get_base_alphabet(alphabet) 121 122 if not isinstance(a, Alphabet.Alphabet): 123 raise TypeError("Invalid alphabet") 124 elif isinstance(a, Alphabet.ProteinAlphabet): 125 return "protein" 126 elif isinstance(a, Alphabet.DNAAlphabet): 127 return "dna" 128 elif isinstance(a, Alphabet.RNAAlphabet): 129 return "rna" 130 else: 131 #Must be something like NucleotideAlphabet or 132 #just the generic Alphabet (default for fasta files) 133 raise ValueError("Need a DNA, RNA or Protein alphabet")
134 135 if __name__ == "__main__": 136 from StringIO import StringIO 137 print "Quick self test" 138 print 139 print "Repeated names without a TAXA block" 140 handle = StringIO("""#NEXUS 141 [TITLE: NoName] 142 143 begin data; 144 dimensions ntax=4 nchar=50; 145 format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 146 147 matrix 148 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 149 ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 150 CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 151 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 152 ; 153 end; 154 """) 155 for a in NexusIterator(handle): 156 print a 157 for r in a: 158 print repr(r.seq), r.name, r.id 159 print "Done" 160 161 print 162 print "Repeated names with a TAXA block" 163 handle = StringIO("""#NEXUS 164 [TITLE: NoName] 165 166 begin taxa 167 CYS1_DICDI 168 ALEU_HORVU 169 CATH_HUMAN 170 CYS1_DICDI; 171 end; 172 173 begin data; 174 dimensions ntax=4 nchar=50; 175 format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 176 177 matrix 178 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 179 ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 180 CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 181 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 182 ; 183 end; 184 """) 185 for a in NexusIterator(handle): 186 print a 187 for r in a: 188 print repr(r.seq), r.name, r.id 189 print "Done" 190 print 191 print "Reading an empty file" 192 assert 0 == len(list(NexusIterator(StringIO()))) 193 print "Done" 194 print 195 print "Writing..." 196 197 handle = StringIO() 198 NexusWriter(handle).write_file([a]) 199 handle.seek(0) 200 print handle.read() 201 202 handle = StringIO() 203 try: 204 NexusWriter(handle).write_file([a,a]) 205 assert False, "Should have rejected more than one alignment!" 206 except ValueError: 207 pass 208