Package Bio :: Package AlignIO :: Module ClustalIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.ClustalIO

  1  # Copyright 2006-2008 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  Bio.AlignIO support for the "clustal" output from CLUSTAL W and other tools. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11  """ 
 12   
 13  from Bio.Align.Generic import Alignment 
 14  from Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 15   
16 -class ClustalWriter(SequentialAlignmentWriter):
17 """Clustalw alignment writer."""
18 - def write_alignment(self, alignment):
19 """Use this to write (another) single alignment to an open file.""" 20 21 if len(alignment.get_all_seqs()) == 0: 22 raise ValueError("Must have at least one sequence") 23 24 #Old versions of the parser in Bio.Clustalw used a ._version property, 25 try: 26 version = str(alignment._version) 27 except AttributeError: 28 version = "" 29 if not version: 30 version = '1.81' 31 if version.startswith("2."): 32 #e.g. 2.0.x 33 output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version 34 else: 35 #e.g. 1.81 or 1.83 36 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version 37 38 cur_char = 0 39 max_length = len(alignment._records[0].seq) 40 41 if max_length <= 0: 42 raise ValueError("Non-empty sequences are required") 43 44 # keep displaying sequences until we reach the end 45 while cur_char != max_length: 46 # calculate the number of sequences to show, which will 47 # be less if we are at the end of the sequence 48 if (cur_char + 50) > max_length: 49 show_num = max_length - cur_char 50 else: 51 show_num = 50 52 53 # go through all of the records and print out the sequences 54 # when we output, we do a nice 80 column output, although this 55 # may result in truncation of the ids. 56 for record in alignment._records: 57 #Make sure we don't get any spaces in the record 58 #identifier when output in the file by replacing 59 #them with underscores: 60 line = record.id[0:30].replace(" ","_").ljust(36) 61 line += record.seq.data[cur_char:(cur_char + show_num)] 62 output += line + "\n" 63 64 # now we need to print out the star info, if we've got it 65 # This was stored by Bio.Clustalw using a ._star_info property. 66 if hasattr(alignment, "_star_info") and alignment._star_info != '': 67 output += (" " * 36) + \ 68 alignment._star_info[cur_char:(cur_char + show_num)] + "\n" 69 70 output += "\n" 71 cur_char += show_num 72 73 # Want a trailing blank new line in case the output is concatenated 74 self.handle.write(output + "\n")
75
76 -class ClustalIterator(AlignmentIterator):
77 """Clustalw alignment iterator.""" 78
79 - def next(self):
80 handle = self.handle 81 try: 82 #Header we saved from when we were parsing 83 #the previous alignment. 84 line = self._header 85 del self._header 86 except AttributeError: 87 line = handle.readline() 88 if not line: 89 return None 90 91 #Whitelisted headers we know about 92 known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE'] 93 if line.strip().split()[0] not in known_headers: 94 raise ValueError("%s is not a known CLUSTAL header: %s" % \ 95 (line.strip().split()[0], 96 ", ".join(known_headers))) 97 98 # find the clustal version in the header line 99 version = None 100 for word in line.split(): 101 if word[0]=='(' and word[-1]==')': 102 word = word[1:-1] 103 if word[0] in '0123456789': 104 version = word 105 break 106 107 #There should be two blank lines after the header line 108 line = handle.readline() 109 while line.strip() == "": 110 line = handle.readline() 111 112 #If the alignment contains entries with the same sequence 113 #identifier (not a good idea - but seems possible), then this 114 #dictionary based parser will merge their sequences. Fix this? 115 ids = [] 116 seqs = [] 117 consensus = "" 118 seq_cols = None #: Used to extract the consensus 119 120 #Use the first block to get the sequence identifiers 121 while True: 122 if line[0] != " " and line.strip() != "": 123 #Sequences identifier... 124 fields = line.rstrip().split() 125 126 #We expect there to be two fields, there can be an optional 127 #"sequence number" field containing the letter count. 128 if len(fields) < 2 or len(fields) > 3: 129 raise ValueError("Could not parse line:\n%s" % line) 130 131 ids.append(fields[0]) 132 seqs.append(fields[1]) 133 134 #Record the sequence position to get the consensus 135 if seq_cols is None: 136 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 137 end = start + len(fields[1]) 138 seq_cols = slice(start, end) 139 del start, end 140 assert fields[1] == line[seq_cols] 141 142 if len(fields) == 3: 143 #This MAY be an old style file with a letter count... 144 try: 145 letters = int(fields[2]) 146 except ValueError: 147 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 148 if len(fields[1].replace("-","")) != letters: 149 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 150 elif line[0] == " ": 151 #Sequence consensus line... 152 assert len(ids) == len(seqs) 153 assert len(ids) > 0 154 assert seq_cols is not None 155 consensus = line[seq_cols] 156 assert not line[:seq_cols.start].strip() 157 assert not line[seq_cols.stop:].strip() 158 #Check for blank line (or end of file) 159 line = handle.readline() 160 assert line.strip() == "" 161 break 162 else: 163 #No consensus 164 break 165 line = handle.readline() 166 if not line : break #end of file 167 168 assert line.strip() == "" 169 assert seq_cols is not None 170 171 #Confirm all same length 172 for s in seqs: 173 assert len(s) == len(seqs[0]) 174 if consensus: 175 assert len(consensus) == len(seqs[0]) 176 177 #Loop over any remaining blocks... 178 done = False 179 while not done: 180 #There should be a blank line between each block. 181 #Also want to ignore any consensus line from the 182 #previous block. 183 while (not line) or line.strip() == "": 184 line = handle.readline() 185 if not line : break # end of file 186 if not line : break # end of file 187 188 if line.split(None,1)[0] in known_headers: 189 #Found concatenated alignment. 190 done = True 191 self._header = line 192 break 193 194 for i in range(len(ids)): 195 assert line[0] != " ", "Unexpected line:\n%s" % repr(line) 196 fields = line.rstrip().split() 197 198 #We expect there to be two fields, there can be an optional 199 #"sequence number" field containing the letter count. 200 if len(fields) < 2 or len(fields) > 3: 201 raise ValueError("Could not parse line:\n%s" % repr(line)) 202 203 if fields[0] != ids[i]: 204 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" \ 205 % (fields[0], ids[i])) 206 207 if fields[1] != line[seq_cols]: 208 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 209 assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start) 210 end = start + len(fields[1]) 211 seq_cols = slice(start, end) 212 del start, end 213 214 #Append the sequence 215 seqs[i] += fields[1] 216 assert len(seqs[i]) == len(seqs[0]) 217 218 if len(fields) == 3: 219 #This MAY be an old style file with a letter count... 220 try: 221 letters = int(fields[2]) 222 except ValueError: 223 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 224 if len(seqs[i].replace("-","")) != letters: 225 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 226 227 #Read in the next line 228 line = handle.readline() 229 #There should now be a consensus line 230 if consensus: 231 assert line[0] == " " 232 assert seq_cols is not None 233 consensus += line[seq_cols] 234 assert len(consensus) == len(seqs[0]) 235 assert not line[:seq_cols.start].strip() 236 assert not line[seq_cols.stop:].strip() 237 #Read in the next line 238 line = handle.readline() 239 240 241 assert len(ids) == len(seqs) 242 if len(seqs) == 0 or len(seqs[0]) == 0: 243 return None 244 245 if self.records_per_alignment is not None \ 246 and self.records_per_alignment != len(ids): 247 raise ValueError("Found %i records in this alignment, told to expect %i" \ 248 % (len(ids), self.records_per_alignment)) 249 250 alignment = Alignment(self.alphabet) 251 alignment_length = len(seqs[0]) 252 for i in range(len(ids)): 253 if len(seqs[i]) != alignment_length: 254 raise ValueError("Error parsing alignment - sequences of different length?") 255 alignment.add_sequence(ids[i], seqs[i]) 256 #TODO - Handle alignment annotation better, for now 257 #mimic the old parser in Bio.Clustalw 258 if version: 259 alignment._version = version 260 if consensus: 261 assert len(consensus) == alignment_length, \ 262 "Alignment length is %i, consensus length is %i, '%s'" \ 263 % (alignment_length, len(consensus), consensus) 264 alignment._star_info = consensus 265 return alignment
266 267 if __name__ == "__main__": 268 print "Running a quick self-test" 269 270 #This is a truncated version of the example in Tests/cw02.aln 271 #Notice the inclusion of sequence numbers (right hand side) 272 aln_example1 = \ 273 """CLUSTAL W (1.81) multiple sequence alignment 274 275 276 gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50 277 gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41 278 * *: :: :. :* : :. : . :* :: . 279 280 gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100 281 gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65 282 : ** **:... *.*** .. 283 284 gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150 285 gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92 286 .:* * *: .* :* : :* .* 287 288 gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200 289 gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141 290 *::. . .:: :*..* :* .* .. . : . : 291 292 gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210 293 gi|671626|emb|CAA85685.1| VAYVKTFQGP 151 294 *. .:: : . 295 296 """ 297 298 #This example is a truncated version of the dataset used here: 299 #http://virgil.ruc.dk/kurser/Sekvens/Treedraw.htm 300 #with the last record repeated twice (deliberate toture test) 301 aln_example2 = \ 302 """CLUSTAL X (1.83) multiple sequence alignment 303 304 305 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG 306 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG 307 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG 308 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG 309 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG 310 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA 311 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA 312 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 313 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 314 : . : :. 315 316 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL 317 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE 318 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG 319 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG 320 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS 321 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA 322 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG 323 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 324 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 325 ** .: *::::. : :. . ..: 326 327 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI 328 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI 329 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV 330 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI 331 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL 332 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV 333 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII 334 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 335 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 336 *.: . * . * *: : 337 338 """ 339 340 from StringIO import StringIO 341 342 alignments = list(ClustalIterator(StringIO(aln_example1))) 343 assert 1 == len(alignments) 344 assert alignments[0]._version == "1.81" 345 records = alignments[0].get_all_seqs() 346 assert 2 == len(records) 347 assert records[0].id == "gi|4959044|gb|AAD34209.1|AF069" 348 assert records[1].id == "gi|671626|emb|CAA85685.1|" 349 assert records[0].seq.tostring() == \ 350 "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \ 351 "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \ 352 "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \ 353 "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \ 354 "VPTTRAQRRA" 355 356 alignments = list(ClustalIterator(StringIO(aln_example2))) 357 assert 1 == len(alignments) 358 assert alignments[0]._version == "1.83" 359 records = alignments[0].get_all_seqs() 360 assert 9 == len(records) 361 assert records[-1].id == "HISJ_E_COLI" 362 assert records[-1].seq.tostring() == \ 363 "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \ 364 "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \ 365 "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV" 366 367 for alignment in ClustalIterator(StringIO(aln_example2 + aln_example1)): 368 print "Alignment with %i records of length %i" \ 369 % (len(alignment.get_all_seqs()), 370 alignment.get_alignment_length()) 371 372 print "Checking empty file..." 373 assert 0 == len(list(ClustalIterator(StringIO("")))) 374 375 print "Checking write/read..." 376 alignments = list(ClustalIterator(StringIO(aln_example1))) \ 377 + list(ClustalIterator(StringIO(aln_example2)))*2 378 handle = StringIO() 379 ClustalWriter(handle).write_file(alignments) 380 handle.seek(0) 381 for i,a in enumerate(ClustalIterator(handle)): 382 assert a.get_alignment_length() == alignments[i].get_alignment_length() 383 handle.seek(0) 384 385 print "Testing write/read when there is only one sequence..." 386 alignment._records = alignment._records[0:1] 387 handle = StringIO() 388 ClustalWriter(handle).write_file([alignment]) 389 handle.seek(0) 390 for i,a in enumerate(ClustalIterator(handle)): 391 assert a.get_alignment_length() == alignment.get_alignment_length() 392 assert len(a.get_all_seqs()) == 1 393 394 aln_example3 = \ 395 """CLUSTAL 2.0.9 multiple sequence alignment 396 397 398 Test1seq ------------------------------------------------------------ 399 AT3G20900.1-SEQ ATGAACAAAGTAGCGAGGAAGAACAAAACATCAGGTGAACAAAAAAAAAACTCAATCCAC 400 AT3G20900.1-CDS ------------------------------------------------------------ 401 402 403 Test1seq -----AGTTACAATAACTGACGAAGCTAAGTAGGCTACTAATTAACGTCATCAACCTAAT 404 AT3G20900.1-SEQ ATCAAAGTTACAATAACTGACGAAGCTAAGTAGGCTAGAAATTAAAGTCATCAACCTAAT 405 AT3G20900.1-CDS ------------------------------------------------------------ 406 407 408 Test1seq ACATAGCACTTAGAAAAAAGTGAAGTAAGAAAATATAAAATAATAAAAGGGTGGGTTATC 409 AT3G20900.1-SEQ ACATAGCACTTAGAAAAAAGTGAAGCAAGAAAATATAAAATAATAAAAGGGTGGGTTATC 410 AT3G20900.1-CDS ------------------------------------------------------------ 411 412 413 Test1seq AATTGATAGTGTAAATCATCGTATTCCGGTGATATACCCTACCACAAAAACTCAAACCGA 414 AT3G20900.1-SEQ AATTGATAGTGTAAATCATAGTTGATTTTTGATATACCCTACCACAAAAACTCAAACCGA 415 AT3G20900.1-CDS ------------------------------------------------------------ 416 417 418 Test1seq CTTGATTCAAATCATCTCAATAAATTAGCGCCAAAATAATGAAAAAAATAATAACAAACA 419 AT3G20900.1-SEQ CTTGATTCAAATCATCTCAAAAAACAAGCGCCAAAATAATGAAAAAAATAATAACAAAAA 420 AT3G20900.1-CDS ------------------------------------------------------------ 421 422 423 Test1seq AAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT 424 AT3G20900.1-SEQ CAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT 425 AT3G20900.1-CDS ------------------------------------------------------------ 426 427 428 Test1seq GTATTAACAAATCAAAGAGCTGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT 429 AT3G20900.1-SEQ GTATTAACAAATCAAAGAGATGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT 430 AT3G20900.1-CDS ------------------------------------------------------------ 431 432 433 Test1seq CCTATATCAACGTAAACAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT 434 AT3G20900.1-SEQ CCTATATCAAAAAAAAAAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT 435 AT3G20900.1-CDS ------------------------------------------------------ATGAAC 436 * 437 438 Test1seq TCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT 439 AT3G20900.1-SEQ GCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT 440 AT3G20900.1-CDS AAAGTAGCGAGGAAGAACAAAACATC------AGCAAAGAAAACGATCTGTCTCCGTCGT 441 * *** ***** * * ** **************************** 442 443 Test1seq AACACACGGTCGCTAGAGAAACTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC 444 AT3G20900.1-SEQ AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC 445 AT3G20900.1-CDS AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC 446 ******* ** * **** *************************************** 447 448 Test1seq GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCGTGGTGACGTCAGCACCGCT 449 AT3G20900.1-SEQ GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT 450 AT3G20900.1-CDS GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT 451 **************************************** ******************* 452 453 Test1seq GCTGGGGATGGAGAGGGAACAGAGTT- 454 AT3G20900.1-SEQ GCTGGGGATGGAGAGGGAACAGAGTAG 455 AT3G20900.1-CDS GCTGGGGATGGAGAGGGAACAGAGTAG 456 ************************* 457 """ 458 alignments = list(ClustalIterator(StringIO(aln_example3))) 459 assert 1 == len(alignments) 460 assert alignments[0]._version == "2.0.9" 461 462 print "The End" 463