1
2
3
4
5
6 """
7 Bio.AlignIO support for the "clustal" output from CLUSTAL W and other tools.
8
9 You are expected to use this module via the Bio.AlignIO functions (or the
10 Bio.SeqIO functions if you want to work directly with the gapped sequences).
11 """
12
13 from Bio.Align.Generic import Alignment
14 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
15
17 """Clustalw alignment writer."""
19 """Use this to write (another) single alignment to an open file."""
20
21 if len(alignment.get_all_seqs()) == 0:
22 raise ValueError("Must have at least one sequence")
23
24
25 try:
26 version = str(alignment._version)
27 except AttributeError:
28 version = ""
29 if not version:
30 version = '1.81'
31 if version.startswith("2."):
32
33 output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version
34 else:
35
36 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version
37
38 cur_char = 0
39 max_length = len(alignment._records[0].seq)
40
41 if max_length <= 0:
42 raise ValueError("Non-empty sequences are required")
43
44
45 while cur_char != max_length:
46
47
48 if (cur_char + 50) > max_length:
49 show_num = max_length - cur_char
50 else:
51 show_num = 50
52
53
54
55
56 for record in alignment._records:
57
58
59
60 line = record.id[0:30].replace(" ","_").ljust(36)
61 line += record.seq.data[cur_char:(cur_char + show_num)]
62 output += line + "\n"
63
64
65
66 if hasattr(alignment, "_star_info") and alignment._star_info != '':
67 output += (" " * 36) + \
68 alignment._star_info[cur_char:(cur_char + show_num)] + "\n"
69
70 output += "\n"
71 cur_char += show_num
72
73
74 self.handle.write(output + "\n")
75
77 """Clustalw alignment iterator."""
78
80 handle = self.handle
81 try:
82
83
84 line = self._header
85 del self._header
86 except AttributeError:
87 line = handle.readline()
88 if not line:
89 return None
90
91
92 known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE']
93 if line.strip().split()[0] not in known_headers:
94 raise ValueError("%s is not a known CLUSTAL header: %s" % \
95 (line.strip().split()[0],
96 ", ".join(known_headers)))
97
98
99 version = None
100 for word in line.split():
101 if word[0]=='(' and word[-1]==')':
102 word = word[1:-1]
103 if word[0] in '0123456789':
104 version = word
105 break
106
107
108 line = handle.readline()
109 while line.strip() == "":
110 line = handle.readline()
111
112
113
114
115 ids = []
116 seqs = []
117 consensus = ""
118 seq_cols = None
119
120
121 while True:
122 if line[0] != " " and line.strip() != "":
123
124 fields = line.rstrip().split()
125
126
127
128 if len(fields) < 2 or len(fields) > 3:
129 raise ValueError("Could not parse line:\n%s" % line)
130
131 ids.append(fields[0])
132 seqs.append(fields[1])
133
134
135 if seq_cols is None:
136 start = len(fields[0]) + line[len(fields[0]):].find(fields[1])
137 end = start + len(fields[1])
138 seq_cols = slice(start, end)
139 del start, end
140 assert fields[1] == line[seq_cols]
141
142 if len(fields) == 3:
143
144 try:
145 letters = int(fields[2])
146 except ValueError:
147 raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
148 if len(fields[1].replace("-","")) != letters:
149 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
150 elif line[0] == " ":
151
152 assert len(ids) == len(seqs)
153 assert len(ids) > 0
154 assert seq_cols is not None
155 consensus = line[seq_cols]
156 assert not line[:seq_cols.start].strip()
157 assert not line[seq_cols.stop:].strip()
158
159 line = handle.readline()
160 assert line.strip() == ""
161 break
162 else:
163
164 break
165 line = handle.readline()
166 if not line : break
167
168 assert line.strip() == ""
169 assert seq_cols is not None
170
171
172 for s in seqs:
173 assert len(s) == len(seqs[0])
174 if consensus:
175 assert len(consensus) == len(seqs[0])
176
177
178 done = False
179 while not done:
180
181
182
183 while (not line) or line.strip() == "":
184 line = handle.readline()
185 if not line : break
186 if not line : break
187
188 if line.split(None,1)[0] in known_headers:
189
190 done = True
191 self._header = line
192 break
193
194 for i in range(len(ids)):
195 assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
196 fields = line.rstrip().split()
197
198
199
200 if len(fields) < 2 or len(fields) > 3:
201 raise ValueError("Could not parse line:\n%s" % repr(line))
202
203 if fields[0] != ids[i]:
204 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" \
205 % (fields[0], ids[i]))
206
207 if fields[1] != line[seq_cols]:
208 start = len(fields[0]) + line[len(fields[0]):].find(fields[1])
209 assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start)
210 end = start + len(fields[1])
211 seq_cols = slice(start, end)
212 del start, end
213
214
215 seqs[i] += fields[1]
216 assert len(seqs[i]) == len(seqs[0])
217
218 if len(fields) == 3:
219
220 try:
221 letters = int(fields[2])
222 except ValueError:
223 raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
224 if len(seqs[i].replace("-","")) != letters:
225 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
226
227
228 line = handle.readline()
229
230 if consensus:
231 assert line[0] == " "
232 assert seq_cols is not None
233 consensus += line[seq_cols]
234 assert len(consensus) == len(seqs[0])
235 assert not line[:seq_cols.start].strip()
236 assert not line[seq_cols.stop:].strip()
237
238 line = handle.readline()
239
240
241 assert len(ids) == len(seqs)
242 if len(seqs) == 0 or len(seqs[0]) == 0:
243 return None
244
245 if self.records_per_alignment is not None \
246 and self.records_per_alignment != len(ids):
247 raise ValueError("Found %i records in this alignment, told to expect %i" \
248 % (len(ids), self.records_per_alignment))
249
250 alignment = Alignment(self.alphabet)
251 alignment_length = len(seqs[0])
252 for i in range(len(ids)):
253 if len(seqs[i]) != alignment_length:
254 raise ValueError("Error parsing alignment - sequences of different length?")
255 alignment.add_sequence(ids[i], seqs[i])
256
257
258 if version:
259 alignment._version = version
260 if consensus:
261 assert len(consensus) == alignment_length, \
262 "Alignment length is %i, consensus length is %i, '%s'" \
263 % (alignment_length, len(consensus), consensus)
264 alignment._star_info = consensus
265 return alignment
266
267 if __name__ == "__main__":
268 print "Running a quick self-test"
269
270
271
272 aln_example1 = \
273 """CLUSTAL W (1.81) multiple sequence alignment
274
275
276 gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50
277 gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41
278 * *: :: :. :* : :. : . :* :: .
279
280 gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100
281 gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65
282 : ** **:... *.*** ..
283
284 gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150
285 gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92
286 .:* * *: .* :* : :* .*
287
288 gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200
289 gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141
290 *::. . .:: :*..* :* .* .. . : . :
291
292 gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210
293 gi|671626|emb|CAA85685.1| VAYVKTFQGP 151
294 *. .:: : .
295
296 """
297
298
299
300
301 aln_example2 = \
302 """CLUSTAL X (1.83) multiple sequence alignment
303
304
305 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG
306 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG
307 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG
308 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG
309 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG
310 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA
311 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA
312 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
313 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
314 : . : :.
315
316 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL
317 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE
318 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG
319 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG
320 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS
321 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA
322 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG
323 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
324 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
325 ** .: *::::. : :. . ..:
326
327 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI
328 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI
329 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV
330 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI
331 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL
332 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV
333 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII
334 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
335 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
336 *.: . * . * *: :
337
338 """
339
340 from StringIO import StringIO
341
342 alignments = list(ClustalIterator(StringIO(aln_example1)))
343 assert 1 == len(alignments)
344 assert alignments[0]._version == "1.81"
345 records = alignments[0].get_all_seqs()
346 assert 2 == len(records)
347 assert records[0].id == "gi|4959044|gb|AAD34209.1|AF069"
348 assert records[1].id == "gi|671626|emb|CAA85685.1|"
349 assert records[0].seq.tostring() == \
350 "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \
351 "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \
352 "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \
353 "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \
354 "VPTTRAQRRA"
355
356 alignments = list(ClustalIterator(StringIO(aln_example2)))
357 assert 1 == len(alignments)
358 assert alignments[0]._version == "1.83"
359 records = alignments[0].get_all_seqs()
360 assert 9 == len(records)
361 assert records[-1].id == "HISJ_E_COLI"
362 assert records[-1].seq.tostring() == \
363 "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \
364 "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \
365 "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV"
366
367 for alignment in ClustalIterator(StringIO(aln_example2 + aln_example1)):
368 print "Alignment with %i records of length %i" \
369 % (len(alignment.get_all_seqs()),
370 alignment.get_alignment_length())
371
372 print "Checking empty file..."
373 assert 0 == len(list(ClustalIterator(StringIO(""))))
374
375 print "Checking write/read..."
376 alignments = list(ClustalIterator(StringIO(aln_example1))) \
377 + list(ClustalIterator(StringIO(aln_example2)))*2
378 handle = StringIO()
379 ClustalWriter(handle).write_file(alignments)
380 handle.seek(0)
381 for i,a in enumerate(ClustalIterator(handle)):
382 assert a.get_alignment_length() == alignments[i].get_alignment_length()
383 handle.seek(0)
384
385 print "Testing write/read when there is only one sequence..."
386 alignment._records = alignment._records[0:1]
387 handle = StringIO()
388 ClustalWriter(handle).write_file([alignment])
389 handle.seek(0)
390 for i,a in enumerate(ClustalIterator(handle)):
391 assert a.get_alignment_length() == alignment.get_alignment_length()
392 assert len(a.get_all_seqs()) == 1
393
394 aln_example3 = \
395 """CLUSTAL 2.0.9 multiple sequence alignment
396
397
398 Test1seq ------------------------------------------------------------
399 AT3G20900.1-SEQ ATGAACAAAGTAGCGAGGAAGAACAAAACATCAGGTGAACAAAAAAAAAACTCAATCCAC
400 AT3G20900.1-CDS ------------------------------------------------------------
401
402
403 Test1seq -----AGTTACAATAACTGACGAAGCTAAGTAGGCTACTAATTAACGTCATCAACCTAAT
404 AT3G20900.1-SEQ ATCAAAGTTACAATAACTGACGAAGCTAAGTAGGCTAGAAATTAAAGTCATCAACCTAAT
405 AT3G20900.1-CDS ------------------------------------------------------------
406
407
408 Test1seq ACATAGCACTTAGAAAAAAGTGAAGTAAGAAAATATAAAATAATAAAAGGGTGGGTTATC
409 AT3G20900.1-SEQ ACATAGCACTTAGAAAAAAGTGAAGCAAGAAAATATAAAATAATAAAAGGGTGGGTTATC
410 AT3G20900.1-CDS ------------------------------------------------------------
411
412
413 Test1seq AATTGATAGTGTAAATCATCGTATTCCGGTGATATACCCTACCACAAAAACTCAAACCGA
414 AT3G20900.1-SEQ AATTGATAGTGTAAATCATAGTTGATTTTTGATATACCCTACCACAAAAACTCAAACCGA
415 AT3G20900.1-CDS ------------------------------------------------------------
416
417
418 Test1seq CTTGATTCAAATCATCTCAATAAATTAGCGCCAAAATAATGAAAAAAATAATAACAAACA
419 AT3G20900.1-SEQ CTTGATTCAAATCATCTCAAAAAACAAGCGCCAAAATAATGAAAAAAATAATAACAAAAA
420 AT3G20900.1-CDS ------------------------------------------------------------
421
422
423 Test1seq AAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT
424 AT3G20900.1-SEQ CAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT
425 AT3G20900.1-CDS ------------------------------------------------------------
426
427
428 Test1seq GTATTAACAAATCAAAGAGCTGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT
429 AT3G20900.1-SEQ GTATTAACAAATCAAAGAGATGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT
430 AT3G20900.1-CDS ------------------------------------------------------------
431
432
433 Test1seq CCTATATCAACGTAAACAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT
434 AT3G20900.1-SEQ CCTATATCAAAAAAAAAAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT
435 AT3G20900.1-CDS ------------------------------------------------------ATGAAC
436 *
437
438 Test1seq TCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT
439 AT3G20900.1-SEQ GCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT
440 AT3G20900.1-CDS AAAGTAGCGAGGAAGAACAAAACATC------AGCAAAGAAAACGATCTGTCTCCGTCGT
441 * *** ***** * * ** ****************************
442
443 Test1seq AACACACGGTCGCTAGAGAAACTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
444 AT3G20900.1-SEQ AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
445 AT3G20900.1-CDS AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
446 ******* ** * **** ***************************************
447
448 Test1seq GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCGTGGTGACGTCAGCACCGCT
449 AT3G20900.1-SEQ GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT
450 AT3G20900.1-CDS GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT
451 **************************************** *******************
452
453 Test1seq GCTGGGGATGGAGAGGGAACAGAGTT-
454 AT3G20900.1-SEQ GCTGGGGATGGAGAGGGAACAGAGTAG
455 AT3G20900.1-CDS GCTGGGGATGGAGAGGGAACAGAGTAG
456 *************************
457 """
458 alignments = list(ClustalIterator(StringIO(aln_example3)))
459 assert 1 == len(alignments)
460 assert alignments[0]._version == "2.0.9"
461
462 print "The End"
463