1
2
3
4
5
6
7
8 """
9 Contains classes to deal with generic sequence alignment stuff not
10 specific to a particular program or format.
11
12 classes:
13 o Alignment
14 """
15
16
17 from Bio.Seq import Seq
18 from Bio.SeqRecord import SeqRecord
19 from Bio import Alphabet
20
22 """Represent a set of alignments.
23
24 This is a base class to represent alignments, which can be subclassed
25 to deal with an alignment in a specific format.
26 """
28 """Initialize a new Alignment object.
29
30 Arguments:
31 o alphabet - The alphabet to use for the sequence objects that are
32 created. This alphabet must be a gapped type.
33
34 e.g.
35 >>> from Bio.Alphabet import IUPAC, Gapped
36 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
37 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
38 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
39 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
40 >>> print align
41 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
42 ACTGCTAGCTAG Alpha
43 ACT-CTAGCTAG Beta
44 ACTGCTAGATAG Gamma
45 """
46 if not (isinstance(alphabet, Alphabet.Alphabet) \
47 or isinstance(alphabet, Alphabet.AlphabetEncoder)):
48 raise ValueError("Invalid alphabet argument")
49 self._alphabet = alphabet
50
51 self._records = []
52
54 """Returns a truncated string representation of a SeqRecord (PRIVATE).
55
56 This is a PRIVATE function used by the __str__ method.
57 """
58 if len(record.seq) <= 50:
59 return "%s %s" % (record.seq, record.id)
60 else:
61 return "%s...%s %s" \
62 % (record.seq[:44], record.seq[-3:], record.id)
63
65 """Returns a multi-line string summary of the alignment.
66
67 This output is intended to be readable, but large alignments are
68 shown truncated. A maximum of 20 rows (sequences) and 50 columns
69 are shown, with the record identifiers. This should fit nicely on a
70 single screen. e.g.
71
72 >>> from Bio.Alphabet import IUPAC, Gapped
73 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
74 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
75 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
76 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
77 >>> print align
78 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
79 ACTGCTAGCTAG Alpha
80 ACT-CTAGCTAG Beta
81 ACTGCTAGATAG Gamma
82
83 See also the alignment's format method.
84 """
85 rows = len(self._records)
86 lines = ["%s alignment with %i rows and %i columns" \
87 % (str(self._alphabet), rows, self.get_alignment_length())]
88 if rows <= 20:
89 lines.extend([self._str_line(rec) for rec in self._records])
90 else:
91 lines.extend([self._str_line(rec) for rec in self._records[:18]])
92 lines.append("...")
93 lines.append(self._str_line(self._records[-1]))
94 return "\n".join(lines)
95
97 """Returns a representation of the object for debugging.
98
99 The representation cannot be used with eval() to recreate the object,
100 which is usually possible with simple python ojects. For example:
101
102 <Bio.Align.Generic.Alignment instance (2 records of length 14,
103 SingleLetterAlphabet()) at a3c184c>
104
105 The hex string is the memory address of the object, see help(id).
106 This provides a simple way to visually distinguish alignments of
107 the same size.
108 """
109
110
111 return "<%s instance (%i records of length %i, %s) at %x>" % \
112 (self.__class__, len(self._records),
113 self.get_alignment_length(), repr(self._alphabet), id(self))
114
115
116
117
118
153
154
171
173 """Return all of the sequences involved in the alignment.
174
175 The return value is a list of SeqRecord objects.
176
177 This method is semi-obsolete, as the Alignment object itself offers
178 much of the functionality of a list of SeqRecord objects (e.g.
179 iteration or slicing to create a sub-alignment).
180 """
181 return self._records
182
184 """Iterate over alignment rows as SeqRecord objects.
185
186 e.g.
187 >>> from Bio.Alphabet import IUPAC, Gapped
188 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
189 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
190 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
191 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
192 >>> for record in align:
193 ... print record.id
194 ... print record.seq
195 Alpha
196 ACTGCTAGCTAG
197 Beta
198 ACT-CTAGCTAG
199 Gamma
200 ACTGCTAGATAG
201 """
202 return iter(self._records)
203
205 """Retrieve a sequence by row number (OBSOLETE).
206
207 Returns:
208 o A Seq object for the requested sequence.
209
210 Raises:
211 o IndexError - If the specified number is out of range.
212
213 NOTE: This is a legacy method. In new code where you need to access
214 the rows of the alignment (i.e. the sequences) consider iterating
215 over them or accessing them as SeqRecord objects. e.g.
216
217 for record in alignment:
218 print record.id
219 print record.seq
220 first_record = alignment[0]
221 last_record = alignment[-1]
222 """
223 return self._records[number].seq
224
226 """Returns the number of sequences in the alignment.
227
228 Use len(alignment) to get the number of sequences (i.e. the number of
229 rows), and alignment.get_alignment_length() to get the length of the
230 longest sequence (i.e. the number of columns).
231
232 This is easy to remember if you think of the alignment as being like a
233 list of SeqRecord objects.
234 """
235 return len(self._records)
236
238 """Return the maximum length of the alignment.
239
240 All objects in the alignment should (hopefully) have the same
241 length. This function will go through and find this length
242 by finding the maximum length of sequences in the alignment.
243
244 >>> from Bio.Alphabet import IUPAC, Gapped
245 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
246 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
247 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
248 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
249 >>> align.get_alignment_length()
250 12
251
252 If you want to know the number of sequences in the alignment,
253 use len(align) instead:
254
255 >>> len(align)
256 3
257
258 """
259 max_length = 0
260
261 for record in self._records:
262 if len(record.seq) > max_length:
263 max_length = len(record.seq)
264
265 return max_length
266
267 - def add_sequence(self, descriptor, sequence, start = None, end = None,
268 weight = 1.0):
269 """Add a sequence to the alignment.
270
271 This doesn't do any kind of alignment, it just adds in the sequence
272 object, which is assumed to be prealigned with the existing
273 sequences.
274
275 Arguments:
276 o descriptor - The descriptive id of the sequence being added.
277 This will be used as the resulting SeqRecord's
278 .id property (and, for historical compatibility,
279 also the .description property)
280 o sequence - A string with sequence info.
281 o start - You can explicitly set the start point of the sequence.
282 This is useful (at least) for BLAST alignments, which can just
283 be partial alignments of sequences.
284 o end - Specify the end of the sequence, which is important
285 for the same reason as the start.
286 o weight - The weight to place on the sequence in the alignment.
287 By default, all sequences have the same weight. (0.0 => no weight,
288 1.0 => highest weight)
289 """
290 new_seq = Seq(sequence, self._alphabet)
291
292
293
294
295
296
297 new_record = SeqRecord(new_seq,
298 id = descriptor,
299 description = descriptor)
300
301
302
303
304
305
306
307
308 if start:
309 new_record.annotations['start'] = start
310 if end:
311 new_record.annotations['end'] = end
312
313
314 new_record.annotations['weight'] = weight
315
316 self._records.append(new_record)
317
319 """Returns a string containing a given column.
320
321 e.g.
322 >>> from Bio.Alphabet import IUPAC, Gapped
323 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
324 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
325 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
326 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
327 >>> align.get_column(0)
328 'AAA'
329 >>> align.get_column(3)
330 'G-G'
331 """
332
333 col_str = ''
334 assert col >= 0 and col <= self.get_alignment_length()
335 for rec in self._records:
336 col_str += rec.seq[col]
337 return col_str
338
340 """Access part of the alignment.
341
342 We'll use the following example alignment here for illustration:
343
344 >>> from Bio.Alphabet import IUPAC, Gapped
345 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
346 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
347 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
348 >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
349 >>> align.add_sequence("Delta", "ACTGCTTGCTAG")
350 >>> align.add_sequence("Epsilon","ACTGCTTGATAG")
351
352 You can access a row of the alignment as a SeqRecord using an integer
353 index (think of the alignment as a list of SeqRecord objects here):
354
355 >>> first_record = align[0]
356 >>> print first_record.id, first_record.seq
357 Alpha ACTGCTAGCTAG
358 >>> last_record = align[-1]
359 >>> print last_record.id, last_record.seq
360 Epsilon ACTGCTTGATAG
361
362 You can also access use python's slice notation to create a sub-alignment
363 containing only some of the SeqRecord objects:
364
365 >>> sub_alignment = align[2:5]
366 >>> print sub_alignment
367 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
368 ACTGCTAGATAG Gamma
369 ACTGCTTGCTAG Delta
370 ACTGCTTGATAG Epsilon
371
372 This includes support for a step, i.e. align[start:end:step], which
373 can be used to select every second sequence:
374
375 >>> sub_alignment = align[::2]
376 >>> print sub_alignment
377 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
378 ACTGCTAGCTAG Alpha
379 ACTGCTAGATAG Gamma
380 ACTGCTTGATAG Epsilon
381
382 Or to get a copy of the alignment with the rows in reverse order:
383
384 >>> rev_alignment = align[::-1]
385 >>> print rev_alignment
386 Gapped(IUPACUnambiguousDNA(), '-') alignment with 5 rows and 12 columns
387 ACTGCTTGATAG Epsilon
388 ACTGCTTGCTAG Delta
389 ACTGCTAGATAG Gamma
390 ACT-CTAGCTAG Beta
391 ACTGCTAGCTAG Alpha
392
393 Right now, these are the ONLY indexing operations supported. The use of
394 a second column based index is under discussion for a future update.
395 """
396 if isinstance(index, int):
397
398
399 return self._records[index]
400 elif isinstance(index, slice):
401
402
403
404
405 sub_align = Alignment(self._alphabet)
406 sub_align._records = self._records[index]
407 return sub_align
408 elif len(index)==2:
409 raise TypeError("Row and Column indexing is not currently supported,"\
410 +"but may be in future.")
411 else:
412 raise TypeError("Invalid index type.")
413
415 """Run the Bio.Seq module's doctests."""
416 print "Running doctests..."
417 import doctest
418 doctest.testmod()
419 print "Done"
420
421 if __name__ == "__main__":
422 _test()
423