Package Bio :: Package Align :: Module Generic
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Generic

  1  # Copyright 2000-2004 Brad Chapman. 
  2  # Copyright 2001 Iddo Friedberg. 
  3  # Copyright 2007-2009 by Peter Cock. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """ 
  9  Contains classes to deal with generic sequence alignment stuff not 
 10  specific to a particular program or format. 
 11   
 12  classes: 
 13  o Alignment 
 14  """ 
 15   
 16  # biopython 
 17  from Bio.Seq import Seq 
 18  from Bio.SeqRecord import SeqRecord 
 19  from Bio import Alphabet 
 20   
21 -class Alignment:
22 """Represent a set of alignments. 23 24 This is a base class to represent alignments, which can be subclassed 25 to deal with an alignment in a specific format. 26 """
27 - def __init__(self, alphabet):
28 """Initialize a new Alignment object. 29 30 Arguments: 31 o alphabet - The alphabet to use for the sequence objects that are 32 created. This alphabet must be a gapped type. 33 34 e.g. 35 >>> from Bio.Alphabet import IUPAC, Gapped 36 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 37 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 38 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 39 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 40 >>> print align 41 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 42 ACTGCTAGCTAG Alpha 43 ACT-CTAGCTAG Beta 44 ACTGCTAGATAG Gamma 45 """ 46 if not (isinstance(alphabet, Alphabet.Alphabet) \ 47 or isinstance(alphabet, Alphabet.AlphabetEncoder)): 48 raise ValueError("Invalid alphabet argument") 49 self._alphabet = alphabet 50 # hold everything at a list of SeqRecord objects 51 self._records = []
52
53 - def _str_line(self, record):
54 """Returns a truncated string representation of a SeqRecord (PRIVATE). 55 56 This is a PRIVATE function used by the __str__ method. 57 """ 58 if len(record.seq) <= 50: 59 return "%s %s" % (record.seq, record.id) 60 else: 61 return "%s...%s %s" \ 62 % (record.seq[:44], record.seq[-3:], record.id)
63
64 - def __str__(self):
65 """Returns a multi-line string summary of the alignment. 66 67 This output is intended to be readable, but large alignments are 68 shown truncated. A maximum of 20 rows (sequences) and 50 columns 69 are shown, with the record identifiers. This should fit nicely on a 70 single screen. e.g. 71 72 >>> from Bio.Alphabet import IUPAC, Gapped 73 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 74 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 75 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 76 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 77 >>> print align 78 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 79 ACTGCTAGCTAG Alpha 80 ACT-CTAGCTAG Beta 81 ACTGCTAGATAG Gamma 82 83 See also the alignment's format method. 84 """ 85 rows = len(self._records) 86 lines = ["%s alignment with %i rows and %i columns" \ 87 % (str(self._alphabet), rows, self.get_alignment_length())] 88 if rows <= 20: 89 lines.extend([self._str_line(rec) for rec in self._records]) 90 else: 91 lines.extend([self._str_line(rec) for rec in self._records[:18]]) 92 lines.append("...") 93 lines.append(self._str_line(self._records[-1])) 94 return "\n".join(lines)
95
96 - def __repr__(self):
97 """Returns a representation of the object for debugging. 98 99 The representation cannot be used with eval() to recreate the object, 100 which is usually possible with simple python ojects. For example: 101 102 <Bio.Align.Generic.Alignment instance (2 records of length 14, 103 SingleLetterAlphabet()) at a3c184c> 104 105 The hex string is the memory address of the object, see help(id). 106 This provides a simple way to visually distinguish alignments of 107 the same size. 108 """ 109 #A doctest for __repr__ would be nice, but __class__ comes out differently 110 #if run via the __main__ trick. 111 return "<%s instance (%i records of length %i, %s) at %x>" % \ 112 (self.__class__, len(self._records), 113 self.get_alignment_length(), repr(self._alphabet), id(self))
114 #This version is useful for doing eval(repr(alignment)), 115 #but it can be VERY long: 116 #return "%s(%s, %s)" \ 117 # % (self.__class__, repr(self._records), repr(self._alphabet)) 118
119 - def format(self, format):
120 """Returns the alignment as a string in the specified file format. 121 122 The format should be a lower case string supported as an output 123 format by Bio.AlignIO (such as "fasta", "clustal", "phylip", 124 "stockholm", etc), which is used to turn the alignment into a 125 string. 126 127 e.g. 128 >>> from Bio.Alphabet import IUPAC, Gapped 129 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 130 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 131 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 132 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 133 >>> print align.format("fasta") 134 >Alpha 135 ACTGCTAGCTAG 136 >Beta 137 ACT-CTAGCTAG 138 >Gamma 139 ACTGCTAGATAG 140 <BLANKLINE> 141 >>> print align.format("phylip") 142 3 12 143 Alpha ACTGCTAGCT AG 144 Beta ACT-CTAGCT AG 145 Gamma ACTGCTAGAT AG 146 <BLANKLINE> 147 148 For Python 2.6, 3.0 or later see also the built in format() function. 149 """ 150 #See also the __format__ added for Python 2.6 / 3.0, PEP 3101 151 #See also the SeqRecord class and its format() method using Bio.SeqIO 152 return self.__format__(format)
153 154
155 - def __format__(self, format_spec):
156 """Returns the alignment as a string in the specified file format. 157 158 This method supports the python format() function added in 159 Python 2.6/3.0. The format_spec should be a lower case 160 string supported by Bio.AlignIO as an output file format. 161 See also the alignment's format() method.""" 162 if format_spec: 163 from StringIO import StringIO 164 from Bio import AlignIO 165 handle = StringIO() 166 AlignIO.write([self], handle, format_spec) 167 return handle.getvalue() 168 else: 169 #Follow python convention and default to using __str__ 170 return str(self)
171
172 - def get_all_seqs(self):
173 """Return all of the sequences involved in the alignment. 174 175 The return value is a list of SeqRecord objects. 176 177 This method is semi-obsolete, as the Alignment object itself offers 178 much of the functionality of a list of SeqRecord objects (e.g. 179 iteration or slicing to create a sub-alignment). 180 """ 181 return self._records
182
183 - def __iter__(self):
184 """Iterate over alignment rows as SeqRecord objects. 185 186 e.g. 187 >>> from Bio.Alphabet import IUPAC, Gapped 188 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 189 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 190 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 191 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 192 >>> for record in align: 193 ... print record.id 194 ... print record.seq 195 Alpha 196 ACTGCTAGCTAG 197 Beta 198 ACT-CTAGCTAG 199 Gamma 200 ACTGCTAGATAG 201 """ 202 return iter(self._records)
203
204 - def get_seq_by_num(self, number):
205 """Retrieve a sequence by row number (OBSOLETE). 206 207 Returns: 208 o A Seq object for the requested sequence. 209 210 Raises: 211 o IndexError - If the specified number is out of range. 212 213 NOTE: This is a legacy method. In new code where you need to access 214 the rows of the alignment (i.e. the sequences) consider iterating 215 over them or accessing them as SeqRecord objects. e.g. 216 217 for record in alignment: 218 print record.id 219 print record.seq 220 first_record = alignment[0] 221 last_record = alignment[-1] 222 """ 223 return self._records[number].seq
224
225 - def __len__(self):
226 """Returns the number of sequences in the alignment. 227 228 Use len(alignment) to get the number of sequences (i.e. the number of 229 rows), and alignment.get_alignment_length() to get the length of the 230 longest sequence (i.e. the number of columns). 231 232 This is easy to remember if you think of the alignment as being like a 233 list of SeqRecord objects. 234 """ 235 return len(self._records)
236
237 - def get_alignment_length(self):
238 """Return the maximum length of the alignment. 239 240 All objects in the alignment should (hopefully) have the same 241 length. This function will go through and find this length 242 by finding the maximum length of sequences in the alignment. 243 244 >>> from Bio.Alphabet import IUPAC, Gapped 245 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 246 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 247 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 248 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 249 >>> align.get_alignment_length() 250 12 251 252 If you want to know the number of sequences in the alignment, 253 use len(align) instead: 254 255 >>> len(align) 256 3 257 258 """ 259 max_length = 0 260 261 for record in self._records: 262 if len(record.seq) > max_length: 263 max_length = len(record.seq) 264 265 return max_length
266
267 - def add_sequence(self, descriptor, sequence, start = None, end = None, 268 weight = 1.0):
269 """Add a sequence to the alignment. 270 271 This doesn't do any kind of alignment, it just adds in the sequence 272 object, which is assumed to be prealigned with the existing 273 sequences. 274 275 Arguments: 276 o descriptor - The descriptive id of the sequence being added. 277 This will be used as the resulting SeqRecord's 278 .id property (and, for historical compatibility, 279 also the .description property) 280 o sequence - A string with sequence info. 281 o start - You can explicitly set the start point of the sequence. 282 This is useful (at least) for BLAST alignments, which can just 283 be partial alignments of sequences. 284 o end - Specify the end of the sequence, which is important 285 for the same reason as the start. 286 o weight - The weight to place on the sequence in the alignment. 287 By default, all sequences have the same weight. (0.0 => no weight, 288 1.0 => highest weight) 289 """ 290 new_seq = Seq(sequence, self._alphabet) 291 292 #We are now effectively using the SeqRecord's .id as 293 #the primary identifier (e.g. in Bio.SeqIO) so we should 294 #populate it with the descriptor. 295 #For backwards compatibility, also store this in the 296 #SeqRecord's description property. 297 new_record = SeqRecord(new_seq, 298 id = descriptor, 299 description = descriptor) 300 301 # hack! We really need to work out how to deal with annotations 302 # and features in biopython. Right now, I'll just use the 303 # generic annotations dictionary we've got to store the start 304 # and end, but we should think up something better. I don't know 305 # if I'm really a big fan of the LocatableSeq thing they've got 306 # in BioPerl, but I'm not positive what the best thing to do on 307 # this is... 308 if start: 309 new_record.annotations['start'] = start 310 if end: 311 new_record.annotations['end'] = end 312 313 # another hack to add weight information to the sequence 314 new_record.annotations['weight'] = weight 315 316 self._records.append(new_record)
317
318 - def get_column(self,col):
319 """Returns a string containing a given column. 320 321 e.g. 322 >>> from Bio.Alphabet import IUPAC, Gapped 323 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 324 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 325 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 326 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 327 >>> align.get_column(0) 328 'AAA' 329 >>> align.get_column(3) 330 'G-G' 331 """ 332 #TODO - Support negative indices? 333 col_str = '' 334 assert col >= 0 and col <= self.get_alignment_length() 335 for rec in self._records: 336 col_str += rec.seq[col] 337 return col_str
338
339 - def __getitem__(self, index):
340 """Access part of the alignment. 341 342 We'll use the following example alignment here for illustration: 343 344 >>> from Bio.Alphabet import IUPAC, Gapped 345 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 346 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 347 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 348 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 349 >>> align.add_sequence("Delta", "ACTGCTTGCTAG") 350 >>> align.add_sequence("Epsilon","ACTGCTTGATAG") 351 352 You can access a row of the alignment as a SeqRecord using an integer 353 index (think of the alignment as a list of SeqRecord objects here): 354 355 >>> first_record = align[0] 356 >>> print first_record.id, first_record.seq 357 Alpha ACTGCTAGCTAG 358 >>> last_record = align[-1] 359 >>> print last_record.id, last_record.seq 360 Epsilon ACTGCTTGATAG 361 362 You can also access use python's slice notation to create a sub-alignment 363 containing only some of the SeqRecord objects: 364 365 >>> sub_alignment = align[2:5] 366 >>> print sub_alignment 367 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 368 ACTGCTAGATAG Gamma 369 ACTGCTTGCTAG Delta 370 ACTGCTTGATAG Epsilon 371 372 This includes support for a step, i.e. align[start:end:step], which 373 can be used to select every second sequence: 374 375 >>> sub_alignment = align[::2] 376 >>> print sub_alignment 377 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 378 ACTGCTAGCTAG Alpha 379 ACTGCTAGATAG Gamma 380 ACTGCTTGATAG Epsilon 381 382 Or to get a copy of the alignment with the rows in reverse order: 383 384 >>> rev_alignment = align[::-1] 385 >>> print rev_alignment 386 Gapped(IUPACUnambiguousDNA(), '-') alignment with 5 rows and 12 columns 387 ACTGCTTGATAG Epsilon 388 ACTGCTTGCTAG Delta 389 ACTGCTAGATAG Gamma 390 ACT-CTAGCTAG Beta 391 ACTGCTAGCTAG Alpha 392 393 Right now, these are the ONLY indexing operations supported. The use of 394 a second column based index is under discussion for a future update. 395 """ 396 if isinstance(index, int): 397 #e.g. result = align[x] 398 #Return a SeqRecord 399 return self._records[index] 400 elif isinstance(index, slice): 401 #e.g. sub_aling = align[i:j:k] 402 #Return a new Alignment using only the specified records. 403 #TODO - See Bug 2554 for changing the __init__ method 404 #to allow us to do this more cleanly. 405 sub_align = Alignment(self._alphabet) 406 sub_align._records = self._records[index] 407 return sub_align 408 elif len(index)==2: 409 raise TypeError("Row and Column indexing is not currently supported,"\ 410 +"but may be in future.") 411 else: 412 raise TypeError("Invalid index type.")
413
414 -def _test():
415 """Run the Bio.Seq module's doctests.""" 416 print "Running doctests..." 417 import doctest 418 doctest.testmod() 419 print "Done"
420 421 if __name__ == "__main__": 422 _test() 423