Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

   1  # Copyright 2000-2002 Brad Chapman. 
   2  # Copyright 2004-2005 by M de Hoon. 
   3  # Copyright 2007-2009 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Provides objects to represent biological sequences with alphabets. 
   9   
  10  See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial: 
  11   - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html} 
  12   - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf} 
  13  """ 
  14  __docformat__ ="epytext en" #Don't just use plain text in epydoc API pages! 
  15   
  16  import string #for maketrans only 
  17  import array 
  18  import sys 
  19   
  20  import Alphabet 
  21  from Alphabet import IUPAC 
  22  from Bio.SeqRecord import SeqRecord 
  23  from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement 
  24  from Bio.Data import CodonTable 
25 26 -def _maketrans(complement_mapping):
27 """Makes a python string translation table (PRIVATE). 28 29 Arguments: 30 - complement_mapping - a dictionary such as ambiguous_dna_complement 31 and ambiguous_rna_complement from Data.IUPACData. 32 33 Returns a translation table (a string of length 256) for use with the 34 python string's translate method to use in a (reverse) complement. 35 36 Compatible with lower case and upper case sequences. 37 38 For internal use only. 39 """ 40 before = ''.join(complement_mapping.keys()) 41 after = ''.join(complement_mapping.values()) 42 before = before + before.lower() 43 after = after + after.lower() 44 return string.maketrans(before, after)
45 46 _dna_complement_table = _maketrans(ambiguous_dna_complement) 47 _rna_complement_table = _maketrans(ambiguous_rna_complement)
48 49 -class Seq(object):
50 """A read-only sequence object (essentially a string with an alphabet). 51 52 Like normal python strings, our basic sequence object is immutable. 53 This prevents you from doing my_seq[5] = "A" for example, but does allow 54 Seq objects to be used as dictionary keys. 55 56 The Seq object provides a number of string like methods (such as count, 57 find, split and strip), which are alphabet aware where appropriate. 58 59 The Seq object also provides some biological methods, such as complement, 60 reverse_complement, transcribe, back_transcribe and translate (which are 61 not applicable to sequences with a protein alphabet). 62 """
63 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
64 """Create a Seq object. 65 66 Arguments: 67 - seq - Sequence, required (string) 68 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet 69 70 You will typically use Bio.SeqIO to read in sequences from files as 71 SeqRecord objects, whose sequence will be exposed as a Seq object via 72 the seq property. 73 74 However, will often want to create your own Seq objects directly: 75 76 >>> from Bio.Seq import Seq 77 >>> from Bio.Alphabet import IUPAC 78 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 79 ... IUPAC.protein) 80 >>> my_seq 81 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 82 >>> print my_seq 83 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 84 """ 85 # Enforce string storage 86 assert (type(data) == type("") or # must use a string 87 type(data) == type(u"")) # but can be a unicode string 88 self._data = data 89 self.alphabet = alphabet # Seq API requirement
90 91 # A data property is/was a Seq API requirement 92 # Note this is read only since the Seq object is meant to be imutable 93 @property
94 - def data(self) :
95 """Sequence as a string (OBSOLETE/DEPRECATED). 96 97 This is a read only property provided for backwards compatility with 98 older versions of Biopython (as is the tostring() method). We now 99 encourage you to use str(my_seq) instead of my_seq.data or the method 100 my_seq.tostring(). 101 102 In recent releases of Biopython it was possible to change a Seq object 103 by updating its data property, but this triggered a deprecation warning. 104 Now the data property is read only, since Seq objects are meant to be 105 immutable: 106 107 >>> from Bio.Seq import Seq 108 >>> from Bio.Alphabet import generic_dna 109 >>> my_seq = Seq("ACGT", generic_dna) 110 >>> str(my_seq) == my_seq.tostring() == my_seq.data == "ACGT" 111 True 112 >>> my_seq.data = "AAAA" 113 Traceback (most recent call last): 114 ... 115 AttributeError: can't set attribute 116 """ 117 return str(self)
118
119 - def __repr__(self):
120 """Returns a (truncated) representation of the sequence for debugging.""" 121 if len(self) > 60: 122 #Shows the last three letters as it is often useful to see if there 123 #is a stop codon at the end of a sequence. 124 #Note total length is 54+3+3=60 125 return "%s('%s...%s', %s)" % (self.__class__.__name__, 126 str(self)[:54], str(self)[-3:], 127 repr(self.alphabet)) 128 else: 129 return "%s(%s, %s)" % (self.__class__.__name__, 130 repr(self.data), 131 repr(self.alphabet))
132 - def __str__(self):
133 """Returns the full sequence as a python string, use str(my_seq). 134 135 Note that Biopython 1.44 and earlier would give a truncated 136 version of repr(my_seq) for str(my_seq). If you are writing code 137 which need to be backwards compatible with old Biopython, you 138 should continue to use my_seq.tostring() rather than str(my_seq). 139 """ 140 return self._data
141 142 # TODO - Alphabet aware __eq__ etc would be nice, but has implications for 143 # __hash__ and therefore use as dictionary keys. See also: 144 # http://mail.python.org/pipermail/python-dev/2002-December/031455.html 145
146 - def __len__(self):
147 """Returns the length of the sequence, use len(my_seq).""" 148 return len(self._data) # Seq API requirement
149
150 - def __getitem__(self, index) : # Seq API requirement
151 """Returns a subsequence of single letter, use my_seq[index].""" 152 #Note since Python 2.0, __getslice__ is deprecated 153 #and __getitem__ is used instead. 154 #See http://docs.python.org/ref/sequence-methods.html 155 if isinstance(index, int): 156 #Return a single letter as a string 157 return self._data[index] 158 else: 159 #Return the (sub)sequence as another Seq object 160 return Seq(self._data[index], self.alphabet)
161
162 - def __add__(self, other):
163 """Add another sequence or string to this sequence. 164 165 If adding a string to a Seq, the alphabet is preserved: 166 167 >>> from Bio.Seq import Seq 168 >>> from Bio.Alphabet import generic_protein 169 >>> Seq("MELKI", generic_protein) + "LV" 170 Seq('MELKILV', ProteinAlphabet()) 171 172 When adding two Seq (like) objects, the alphabets are important. 173 Consider this example: 174 175 >>> from Bio.Seq import Seq 176 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna 177 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna) 178 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna) 179 >>> unamb_dna_seq 180 Seq('ACGT', IUPACUnambiguousDNA()) 181 >>> ambig_dna_seq 182 Seq('ACRGT', IUPACAmbiguousDNA()) 183 184 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get 185 the more general ambiguous IUPAC DNA alphabet: 186 187 >>> unamb_dna_seq + ambig_dna_seq 188 Seq('ACGTACRGT', IUPACAmbiguousDNA()) 189 190 However, if the default generic alphabet is included, the result is 191 a generic alphabet: 192 193 >>> Seq("") + ambig_dna_seq 194 Seq('ACRGT', Alphabet()) 195 196 You can't add RNA and DNA sequences: 197 198 >>> from Bio.Alphabet import generic_dna, generic_rna 199 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna) 200 Traceback (most recent call last): 201 ... 202 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet() 203 204 You can't add nucleotide and protein sequences: 205 206 >>> from Bio.Alphabet import generic_dna, generic_protein 207 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein) 208 Traceback (most recent call last): 209 ... 210 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet() 211 """ 212 if hasattr(other, "alphabet"): 213 #other should be a Seq or a MutableSeq 214 if not Alphabet._check_type_compatible([self.alphabet, 215 other.alphabet]): 216 raise TypeError("Incompatable alphabets %s and %s" \ 217 % (repr(self.alphabet), repr(other.alphabet))) 218 #They should be the same sequence type (or one of them is generic) 219 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 220 return self.__class__(str(self) + str(other), a) 221 elif isinstance(other, basestring): 222 #other is a plain string - use the current alphabet 223 return self.__class__(str(self) + other, self.alphabet) 224 elif isinstance(other, SeqRecord): 225 #Get the SeqRecord's __radd__ to handle this 226 return NotImplemented 227 else : 228 raise TypeError
229
230 - def __radd__(self, other):
231 """Adding a sequence on the left. 232 233 If adding a string to a Seq, the alphabet is preserved: 234 235 >>> from Bio.Seq import Seq 236 >>> from Bio.Alphabet import generic_protein 237 >>> "LV" + Seq("MELKI", generic_protein) 238 Seq('LVMELKI', ProteinAlphabet()) 239 240 Adding two Seq (like) objects is handled via the __add__ method. 241 """ 242 if hasattr(other, "alphabet"): 243 #other should be a Seq or a MutableSeq 244 if not Alphabet._check_type_compatible([self.alphabet, 245 other.alphabet]): 246 raise TypeError("Incompatable alphabets %s and %s" \ 247 % (repr(self.alphabet), repr(other.alphabet))) 248 #They should be the same sequence type (or one of them is generic) 249 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 250 return self.__class__(str(other) + str(self), a) 251 elif isinstance(other, basestring): 252 #other is a plain string - use the current alphabet 253 return self.__class__(other + str(self), self.alphabet) 254 else: 255 raise TypeError
256
257 - def tostring(self): # Seq API requirement
258 """Returns the full sequence as a python string (OBSOLETE). 259 260 Although not formally deprecated, you are now encouraged to use 261 str(my_seq) instead of my_seq.tostring().""" 262 return str(self) 263
264 - def tomutable(self): # Needed? Or use a function?
265 """Returns the full sequence as a MutableSeq object. 266 267 >>> from Bio.Seq import Seq 268 >>> from Bio.Alphabet import IUPAC 269 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", 270 ... IUPAC.protein) 271 >>> my_seq 272 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 273 >>> my_seq.tomutable() 274 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 275 276 Note that the alphabet is preserved. 277 """ 278 return MutableSeq(str(self), self.alphabet) 279
280 - def _get_seq_str_and_check_alphabet(self, other_sequence):
281 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE). 282 283 For a string argument, returns the string. 284 285 For a Seq or MutableSeq, it checks the alphabet is compatible 286 (raising an exception if it isn't), and then returns a string. 287 """ 288 try: 289 other_alpha = other_sequence.alphabet 290 except AttributeError: 291 #Assume other_sequence is a string 292 return other_sequence 293 294 #Other should be a Seq or a MutableSeq 295 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]): 296 raise TypeError("Incompatable alphabets %s and %s" \ 297 % (repr(self.alphabet), repr(other_alpha))) 298 #Return as a string 299 return str(other_sequence)
300
301 - def count(self, sub, start=0, end=sys.maxint):
302 """Non-overlapping count method, like that of a python string. 303 304 This behaves like the python string method of the same name, 305 which does a non-overlapping count! 306 307 Returns an integer, the number of occurrences of substring 308 argument sub in the (sub)sequence given by [start:end]. 309 Optional arguments start and end are interpreted as in slice 310 notation. 311 312 Arguments: 313 - sub - a string or another Seq object to look for 314 - start - optional integer, slice start 315 - end - optional integer, slice end 316 317 e.g. 318 319 >>> from Bio.Seq import Seq 320 >>> my_seq = Seq("AAAATGA") 321 >>> print my_seq.count("A") 322 5 323 >>> print my_seq.count("ATG") 324 1 325 >>> print my_seq.count(Seq("AT")) 326 1 327 >>> print my_seq.count("AT", 2, -1) 328 1 329 330 HOWEVER, please note because python strings and Seq objects (and 331 MutableSeq objects) do a non-overlapping search, this may not give 332 the answer you expect: 333 334 >>> "AAAA".count("AA") 335 2 336 >>> print Seq("AAAA").count("AA") 337 2 338 339 A non-overlapping search would give the answer as three! 340 """ 341 #If it has one, check the alphabet: 342 sub_str = self._get_seq_str_and_check_alphabet(sub) 343 return str(self).count(sub_str, start, end)
344
345 - def __contains__(self, char):
346 """Implements the 'in' keyword, like a python string. 347 348 e.g. 349 350 >>> from Bio.Seq import Seq 351 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 352 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna) 353 >>> "AAA" in my_dna 354 True 355 >>> Seq("AAA") in my_dna 356 True 357 >>> Seq("AAA", generic_dna) in my_dna 358 True 359 360 Like other Seq methods, this will raise a type error if another Seq 361 (or Seq like) object with an incompatible alphabet is used: 362 363 >>> Seq("AAA", generic_rna) in my_dna 364 Traceback (most recent call last): 365 ... 366 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet() 367 >>> Seq("AAA", generic_protein) in my_dna 368 Traceback (most recent call last): 369 ... 370 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet() 371 """ 372 #If it has one, check the alphabet: 373 sub_str = self._get_seq_str_and_check_alphabet(char) 374 return sub_str in str(self)
375
376 - def find(self, sub, start=0, end=sys.maxint):
377 """Find method, like that of a python string. 378 379 This behaves like the python string method of the same name. 380 381 Returns an integer, the index of the first occurrence of substring 382 argument sub in the (sub)sequence given by [start:end]. 383 384 Arguments: 385 - sub - a string or another Seq object to look for 386 - start - optional integer, slice start 387 - end - optional integer, slice end 388 389 Returns -1 if the subsequence is NOT found. 390 391 e.g. Locating the first typical start codon, AUG, in an RNA sequence: 392 393 >>> from Bio.Seq import Seq 394 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 395 >>> my_rna.find("AUG") 396 3 397 """ 398 #If it has one, check the alphabet: 399 sub_str = self._get_seq_str_and_check_alphabet(sub) 400 return str(self).find(sub_str, start, end)
401
402 - def rfind(self, sub, start=0, end=sys.maxint):
403 """Find from right method, like that of a python string. 404 405 This behaves like the python string method of the same name. 406 407 Returns an integer, the index of the last (right most) occurrence of 408 substring argument sub in the (sub)sequence given by [start:end]. 409 410 Arguments: 411 - sub - a string or another Seq object to look for 412 - start - optional integer, slice start 413 - end - optional integer, slice end 414 415 Returns -1 if the subsequence is NOT found. 416 417 e.g. Locating the last typical start codon, AUG, in an RNA sequence: 418 419 >>> from Bio.Seq import Seq 420 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 421 >>> my_rna.rfind("AUG") 422 15 423 """ 424 #If it has one, check the alphabet: 425 sub_str = self._get_seq_str_and_check_alphabet(sub) 426 return str(self).rfind(sub_str, start, end)
427
428 - def startswith(self, prefix, start=0, end=sys.maxint):
429 """Does the Seq start with the given prefix? Returns True/False. 430 431 This behaves like the python string method of the same name. 432 433 Return True if the sequence starts with the specified prefix 434 (a string or another Seq object), False otherwise. 435 With optional start, test sequence beginning at that position. 436 With optional end, stop comparing sequence at that position. 437 prefix can also be a tuple of strings to try. e.g. 438 439 >>> from Bio.Seq import Seq 440 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 441 >>> my_rna.startswith("GUC") 442 True 443 >>> my_rna.startswith("AUG") 444 False 445 >>> my_rna.startswith("AUG", 3) 446 True 447 >>> my_rna.startswith(("UCC","UCA","UCG"),1) 448 True 449 """ 450 #If it has one, check the alphabet: 451 if isinstance(prefix, tuple): 452 #TODO - Once we drop support for Python 2.4, instead of this 453 #loop offload to the string method (requires Python 2.5+). 454 #Check all the alphabets first... 455 prefix_strings = [self._get_seq_str_and_check_alphabet(p) \ 456 for p in prefix] 457 for prefix_str in prefix_strings: 458 if str(self).startswith(prefix_str, start, end): 459 return True 460 return False 461 else: 462 prefix_str = self._get_seq_str_and_check_alphabet(prefix) 463 return str(self).startswith(prefix_str, start, end)
464
465 - def endswith(self, suffix, start=0, end=sys.maxint):
466 """Does the Seq end with the given suffix? Returns True/False. 467 468 This behaves like the python string method of the same name. 469 470 Return True if the sequence ends with the specified suffix 471 (a string or another Seq object), False otherwise. 472 With optional start, test sequence beginning at that position. 473 With optional end, stop comparing sequence at that position. 474 suffix can also be a tuple of strings to try. e.g. 475 476 >>> from Bio.Seq import Seq 477 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 478 >>> my_rna.endswith("UUG") 479 True 480 >>> my_rna.endswith("AUG") 481 False 482 >>> my_rna.endswith("AUG", 0, 18) 483 True 484 >>> my_rna.endswith(("UCC","UCA","UUG")) 485 True 486 """ 487 #If it has one, check the alphabet: 488 if isinstance(suffix, tuple): 489 #TODO - Once we drop support for Python 2.4, instead of this 490 #loop offload to the string method (requires Python 2.5+). 491 #Check all the alphabets first... 492 suffix_strings = [self._get_seq_str_and_check_alphabet(p) \ 493 for p in suffix] 494 for suffix_str in suffix_strings: 495 if str(self).endswith(suffix_str, start, end): 496 return True 497 return False 498 else: 499 suffix_str = self._get_seq_str_and_check_alphabet(suffix) 500 return str(self).endswith(suffix_str, start, end)
501 502
503 - def split(self, sep=None, maxsplit=-1):
504 """Split method, like that of a python string. 505 506 This behaves like the python string method of the same name. 507 508 Return a list of the 'words' in the string (as Seq objects), 509 using sep as the delimiter string. If maxsplit is given, at 510 most maxsplit splits are done. If maxsplit is ommited, all 511 splits are made. 512 513 Following the python string method, sep will by default be any 514 white space (tabs, spaces, newlines) but this is unlikely to 515 apply to biological sequences. 516 517 e.g. 518 519 >>> from Bio.Seq import Seq 520 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 521 >>> my_aa = my_rna.translate() 522 >>> my_aa 523 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 524 >>> my_aa.split("*") 525 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 526 >>> my_aa.split("*",1) 527 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 528 529 See also the rsplit method: 530 531 >>> my_aa.rsplit("*",1) 532 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 533 """ 534 #If it has one, check the alphabet: 535 sep_str = self._get_seq_str_and_check_alphabet(sep) 536 #TODO - If the sep is the defined stop symbol, or gap char, 537 #should we adjust the alphabet? 538 return [Seq(part, self.alphabet) \ 539 for part in str(self).split(sep_str, maxsplit)]
540
541 - def rsplit(self, sep=None, maxsplit=-1):
542 """Right split method, like that of a python string. 543 544 This behaves like the python string method of the same name. 545 546 Return a list of the 'words' in the string (as Seq objects), 547 using sep as the delimiter string. If maxsplit is given, at 548 most maxsplit splits are done COUNTING FROM THE RIGHT. 549 If maxsplit is ommited, all splits are made. 550 551 Following the python string method, sep will by default be any 552 white space (tabs, spaces, newlines) but this is unlikely to 553 apply to biological sequences. 554 555 e.g. print my_seq.rsplit("*",1) 556 557 See also the split method. 558 """ 559 #If it has one, check the alphabet: 560 sep_str = self._get_seq_str_and_check_alphabet(sep) 561 return [Seq(part, self.alphabet) \ 562 for part in str(self).rsplit(sep_str, maxsplit)]
563
564 - def strip(self, chars=None):
565 """Returns a new Seq object with leading and trailing ends stripped. 566 567 This behaves like the python string method of the same name. 568 569 Optional argument chars defines which characters to remove. If 570 ommitted or None (default) then as for the python string method, 571 this defaults to removing any white space. 572 573 e.g. print my_seq.strip("-") 574 575 See also the lstrip and rstrip methods. 576 """ 577 #If it has one, check the alphabet: 578 strip_str = self._get_seq_str_and_check_alphabet(chars) 579 return Seq(str(self).strip(strip_str), self.alphabet)
580
581 - def lstrip(self, chars=None):
582 """Returns a new Seq object with leading (left) end stripped. 583 584 This behaves like the python string method of the same name. 585 586 Optional argument chars defines which characters to remove. If 587 ommitted or None (default) then as for the python string method, 588 this defaults to removing any white space. 589 590 e.g. print my_seq.lstrip("-") 591 592 See also the strip and rstrip methods. 593 """ 594 #If it has one, check the alphabet: 595 strip_str = self._get_seq_str_and_check_alphabet(chars) 596 return Seq(str(self).lstrip(strip_str), self.alphabet)
597
598 - def rstrip(self, chars=None):
599 """Returns a new Seq object with trailing (right) end stripped. 600 601 This behaves like the python string method of the same name. 602 603 Optional argument chars defines which characters to remove. If 604 ommitted or None (default) then as for the python string method, 605 this defaults to removing any white space. 606 607 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail): 608 609 >>> from Bio.Alphabet import IUPAC 610 >>> from Bio.Seq import Seq 611 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna) 612 >>> my_seq 613 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA()) 614 >>> my_seq.rstrip("A") 615 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA()) 616 617 See also the strip and lstrip methods. 618 """ 619 #If it has one, check the alphabet: 620 strip_str = self._get_seq_str_and_check_alphabet(chars) 621 return Seq(str(self).rstrip(strip_str), self.alphabet)
622
623 - def upper(self):
624 """Returns an upper case copy of the sequence. 625 626 >>> from Bio.Alphabet import HasStopCodon, generic_protein 627 >>> from Bio.Seq import Seq 628 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein)) 629 >>> my_seq 630 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*')) 631 >>> my_seq.lower() 632 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*')) 633 >>> my_seq.upper() 634 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*')) 635 636 This will adjust the alphabet if required. See also the lower method. 637 """ 638 return Seq(str(self).upper(), self.alphabet._upper())
639
640 - def lower(self):
641 """Returns a lower case copy of the sequence. 642 643 This will adjust the alphabet if required. Note that the IUPAC alphabets 644 are upper case only, and thus a generic alphabet must be substituted. 645 646 >>> from Bio.Alphabet import Gapped, generic_dna 647 >>> from Bio.Alphabet import IUPAC 648 >>> from Bio.Seq import Seq 649 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", Gapped(IUPAC.unambiguous_dna, "*")) 650 >>> my_seq 651 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*')) 652 >>> my_seq.lower() 653 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*')) 654 655 See also the upper method. 656 """ 657 return Seq(str(self).lower(), self.alphabet._lower())
658
659 - def complement(self):
660 """Returns the complement sequence. New Seq object. 661 662 >>> from Bio.Seq import Seq 663 >>> from Bio.Alphabet import IUPAC 664 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) 665 >>> my_dna 666 Seq('CCCCCGATAG', IUPACUnambiguousDNA()) 667 >>> my_dna.complement() 668 Seq('GGGGGCTATC', IUPACUnambiguousDNA()) 669 670 You can of course used mixed case sequences, 671 672 >>> from Bio.Seq import Seq 673 >>> from Bio.Alphabet import generic_dna 674 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna) 675 >>> my_dna 676 Seq('CCCCCgatA-GD', DNAAlphabet()) 677 >>> my_dna.complement() 678 Seq('GGGGGctaT-CH', DNAAlphabet()) 679 680 Note in the above example, ambiguous character D denotes 681 G, A or T so its complement is H (for C, T or A). 682 683 Trying to complement a protein sequence raises an exception. 684 685 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 686 >>> my_protein.complement() 687 Traceback (most recent call last): 688 ... 689 ValueError: Proteins do not have complements! 690 """ 691 base = Alphabet._get_base_alphabet(self.alphabet) 692 if isinstance(base, Alphabet.ProteinAlphabet): 693 raise ValueError("Proteins do not have complements!") 694 if isinstance(base, Alphabet.DNAAlphabet): 695 ttable = _dna_complement_table 696 elif isinstance(base, Alphabet.RNAAlphabet): 697 ttable = _rna_complement_table 698 elif ('U' in self._data or 'u' in self._data) \ 699 and ('T' in self._data or 't' in self._data): 700 #TODO - Handle this cleanly? 701 raise ValueError("Mixed RNA/DNA found") 702 elif 'U' in self._data or 'u' in self._data: 703 ttable = _rna_complement_table 704 else: 705 ttable = _dna_complement_table 706 #Much faster on really long sequences than the previous loop based one. 707 #thx to Michael Palmer, University of Waterloo 708 return Seq(str(self).translate(ttable), self.alphabet)
709
710 - def reverse_complement(self):
711 """Returns the reverse complement sequence. New Seq object. 712 713 >>> from Bio.Seq import Seq 714 >>> from Bio.Alphabet import IUPAC 715 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna) 716 >>> my_dna 717 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA()) 718 >>> my_dna.reverse_complement() 719 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA()) 720 721 Note in the above example, since R = G or A, its complement 722 is Y (which denotes C or T). 723 724 You can of course used mixed case sequences, 725 726 >>> from Bio.Seq import Seq 727 >>> from Bio.Alphabet import generic_dna 728 >>> my_dna = Seq("CCCCCgatA-G", generic_dna) 729 >>> my_dna 730 Seq('CCCCCgatA-G', DNAAlphabet()) 731 >>> my_dna.reverse_complement() 732 Seq('C-TatcGGGGG', DNAAlphabet()) 733 734 Trying to complement a protein sequence raises an exception: 735 736 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 737 >>> my_protein.reverse_complement() 738 Traceback (most recent call last): 739 ... 740 ValueError: Proteins do not have complements! 741 """ 742 #Use -1 stride/step to reverse the complement 743 return self.complement()[::-1]
744
745 - def transcribe(self):
746 """Returns the RNA sequence from a DNA sequence. New Seq object. 747 748 >>> from Bio.Seq import Seq 749 >>> from Bio.Alphabet import IUPAC 750 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", 751 ... IUPAC.unambiguous_dna) 752 >>> coding_dna 753 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 754 >>> coding_dna.transcribe() 755 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 756 757 Trying to transcribe a protein or RNA sequence raises an exception: 758 759 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 760 >>> my_protein.transcribe() 761 Traceback (most recent call last): 762 ... 763 ValueError: Proteins cannot be transcribed! 764 """ 765 base = Alphabet._get_base_alphabet(self.alphabet) 766 if isinstance(base, Alphabet.ProteinAlphabet): 767 raise ValueError("Proteins cannot be transcribed!") 768 if isinstance(base, Alphabet.RNAAlphabet): 769 raise ValueError("RNA cannot be transcribed!") 770 771 if self.alphabet==IUPAC.unambiguous_dna: 772 alphabet = IUPAC.unambiguous_rna 773 elif self.alphabet==IUPAC.ambiguous_dna: 774 alphabet = IUPAC.ambiguous_rna 775 else: 776 alphabet = Alphabet.generic_rna 777 return Seq(str(self).replace('T','U').replace('t','u'), alphabet)
778
779 - def back_transcribe(self):
780 """Returns the DNA sequence from an RNA sequence. New Seq object. 781 782 >>> from Bio.Seq import Seq 783 >>> from Bio.Alphabet import IUPAC 784 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", 785 ... IUPAC.unambiguous_rna) 786 >>> messenger_rna 787 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 788 >>> messenger_rna.back_transcribe() 789 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 790 791 Trying to back-transcribe a protein or DNA sequence raises an 792 exception: 793 794 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 795 >>> my_protein.back_transcribe() 796 Traceback (most recent call last): 797 ... 798 ValueError: Proteins cannot be back transcribed! 799 """ 800 base = Alphabet._get_base_alphabet(self.alphabet) 801 if isinstance(base, Alphabet.ProteinAlphabet): 802 raise ValueError("Proteins cannot be back transcribed!") 803 if isinstance(base, Alphabet.DNAAlphabet): 804 raise ValueError("DNA cannot be back transcribed!") 805 806 if self.alphabet==IUPAC.unambiguous_rna: 807 alphabet = IUPAC.unambiguous_dna 808 elif self.alphabet==IUPAC.ambiguous_rna: 809 alphabet = IUPAC.ambiguous_dna 810 else: 811 alphabet = Alphabet.generic_dna 812 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
813
814 - def translate(self, table="Standard", stop_symbol="*", to_stop=False, 815 cds=False):
816 """Turns a nucleotide sequence into a protein sequence. New Seq object. 817 818 This method will translate DNA or RNA sequences, and those with a 819 nucleotide or generic alphabet. Trying to translate a protein 820 sequence raises an exception. 821 822 Arguments: 823 - table - Which codon table to use? This can be either a name 824 (string) or an NCBI identifier (integer). This defaults 825 to the "Standard" table. 826 - stop_symbol - Single character string, what to use for terminators. 827 This defaults to the asterisk, "*". 828 - to_stop - Boolean, defaults to False meaning do a full translation 829 continuing on past any stop codons (translated as the 830 specified stop_symbol). If True, translation is 831 terminated at the first in frame stop codon (and the 832 stop_symbol is not appended to the returned protein 833 sequence). 834 - cds - Boolean, indicates this is a complete CDS. If True, 835 this checks the sequence starts with a valid alternative start 836 codon (which will be translated as methionine, M), that the 837 sequence length is a multiple of three, and that there is a 838 single in frame stop codon at the end (this will be excluded 839 from the protein sequence, regardless of the to_stop option). 840 If these tests fail, an exception is raised. 841 842 e.g. Using the standard table: 843 844 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") 845 >>> coding_dna.translate() 846 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 847 >>> coding_dna.translate(stop_symbol="@") 848 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@')) 849 >>> coding_dna.translate(to_stop=True) 850 Seq('VAIVMGR', ExtendedIUPACProtein()) 851 852 Now using NCBI table 2, where TGA is not a stop codon: 853 854 >>> coding_dna.translate(table=2) 855 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 856 >>> coding_dna.translate(table=2, to_stop=True) 857 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein()) 858 859 In fact, GTG is an alternative start codon under NCBI table 2, meaning 860 this sequence could be a complete CDS: 861 862 >>> coding_dna.translate(table=2, cds=True) 863 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein()) 864 865 It isn't a valid CDS under NCBI table 1, due to both the start codon and 866 also the in frame stop codons: 867 868 >>> coding_dna.translate(table=1, cds=True) 869 Traceback (most recent call last): 870 ... 871 TranslationError: First codon 'GTG' is not a start codon 872 873 If the sequence has no in-frame stop codon, then the to_stop argument 874 has no effect: 875 876 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") 877 >>> coding_dna2.translate() 878 Seq('LAIVMGR', ExtendedIUPACProtein()) 879 >>> coding_dna2.translate(to_stop=True) 880 Seq('LAIVMGR', ExtendedIUPACProtein()) 881 882 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 883 or a stop codon. These are translated as "X". Any invalid codon 884 (e.g. "TA?" or "T-A") will throw a TranslationError. 885 886 NOTE - Does NOT support gapped sequences. 887 888 NOTE - This does NOT behave like the python string's translate 889 method. For that use str(my_seq).translate(...) instead. 890 """ 891 try: 892 table_id = int(table) 893 except ValueError: 894 table_id = None 895 if isinstance(table, str) and len(table)==256: 896 raise ValueError("The Seq object translate method DOES NOT take " \ 897 + "a 256 character string mapping table like " \ 898 + "the python string object's translate method. " \ 899 + "Use str(my_seq).translate(...) instead.") 900 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 901 Alphabet.ProteinAlphabet): 902 raise ValueError("Proteins cannot be translated!") 903 if self.alphabet==IUPAC.unambiguous_dna: 904 #Will use standard IUPAC protein alphabet, no need for X 905 if table_id is None: 906 codon_table = CodonTable.unambiguous_dna_by_name[table] 907 else: 908 codon_table = CodonTable.unambiguous_dna_by_id[table_id] 909 elif self.alphabet==IUPAC.unambiguous_rna: 910 #Will use standard IUPAC protein alphabet, no need for X 911 if table_id is None: 912 codon_table = CodonTable.unambiguous_rna_by_name[table] 913 else: 914 codon_table = CodonTable.unambiguous_rna_by_id[table_id] 915 else: 916 #This will use the extend IUPAC protein alphabet with X etc. 917 #The same table can be used for RNA or DNA (we use this for 918 #translating strings). 919 if table_id is None: 920 codon_table = CodonTable.ambiguous_generic_by_name[table] 921 else: 922 codon_table = CodonTable.ambiguous_generic_by_id[table_id] 923 protein = _translate_str(str(self), codon_table, \ 924 stop_symbol, to_stop, cds) 925 if stop_symbol in protein: 926 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet, 927 stop_symbol = stop_symbol) 928 else: 929 alphabet = codon_table.protein_alphabet 930 return Seq(protein, alphabet)
931
932 - def ungap(self, gap=None):
933 """Return a copy of the sequence without the gap character(s). 934 935 The gap character can be specified in two ways - either as an explicit 936 argument, or via the sequence's alphabet. For example: 937 938 >>> from Bio.Seq import Seq 939 >>> from Bio.Alphabet import generic_dna 940 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna) 941 >>> my_dna 942 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 943 >>> my_dna.ungap("-") 944 Seq('ATATGAAATTTGAAAA', DNAAlphabet()) 945 946 If the gap character is not given as an argument, it will be taken from 947 the sequence's alphabet (if defined). Notice that the returned sequence's 948 alphabet is adjusted since it no longer requires a gapped alphabet: 949 950 >>> from Bio.Seq import Seq 951 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon 952 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "="))) 953 >>> my_pro 954 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*')) 955 >>> my_pro.ungap() 956 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*')) 957 958 Or, with a simpler gapped DNA example: 959 960 >>> from Bio.Seq import Seq 961 >>> from Bio.Alphabet import IUPAC, Gapped 962 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "=")) 963 >>> my_seq 964 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 965 >>> my_seq.ungap() 966 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 967 968 As long as it is consistent with the alphabet, although it is redundant, 969 you can still supply the gap character as an argument to this method: 970 971 >>> my_seq 972 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 973 >>> my_seq.ungap("=") 974 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 975 976 However, if the gap character given as the argument disagrees with that 977 declared in the alphabet, an exception is raised: 978 979 >>> my_seq 980 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 981 >>> my_seq.ungap("-") 982 Traceback (most recent call last): 983 ... 984 ValueError: Gap '-' does not match '=' from alphabet 985 986 Finally, if a gap character is not supplied, and the alphabet does not 987 define one, an exception is raised: 988 989 >>> from Bio.Seq import Seq 990 >>> from Bio.Alphabet import generic_dna 991 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna) 992 >>> my_dna 993 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 994 >>> my_dna.ungap() 995 Traceback (most recent call last): 996 ... 997 ValueError: Gap character not given and not defined in alphabet 998 999 """ 1000 if hasattr(self.alphabet, "gap_char"): 1001 if not gap: 1002 gap = self.alphabet.gap_char 1003 elif gap != self.alphabet.gap_char: 1004 raise ValueError("Gap %s does not match %s from alphabet" \ 1005 % (repr(gap), repr(self.alphabet.gap_char))) 1006 alpha = Alphabet._ungap(self.alphabet) 1007 elif not gap: 1008 raise ValueError("Gap character not given and not defined in alphabet") 1009 else: 1010 alpha = self.alphabet #modify! 1011 if len(gap)!=1 or not isinstance(gap, str): 1012 raise ValueError("Unexpected gap character, %s" % repr(gap)) 1013 return Seq(str(self).replace(gap, ""), alpha)
1014
1015 -class UnknownSeq(Seq):
1016 """A read-only sequence object of known length but unknown contents. 1017 1018 If you have an unknown sequence, you can represent this with a normal 1019 Seq object, for example: 1020 1021 >>> my_seq = Seq("N"*5) 1022 >>> my_seq 1023 Seq('NNNNN', Alphabet()) 1024 >>> len(my_seq) 1025 5 1026 >>> print my_seq 1027 NNNNN 1028 1029 However, this is rather wasteful of memory (especially for large 1030 sequences), which is where this class is most usefull: 1031 1032 >>> unk_five = UnknownSeq(5) 1033 >>> unk_five 1034 UnknownSeq(5, alphabet = Alphabet(), character = '?') 1035 >>> len(unk_five) 1036 5 1037 >>> print(unk_five) 1038 ????? 1039 1040 You can add unknown sequence together, provided their alphabets and 1041 characters are compatible, and get another memory saving UnknownSeq: 1042 1043 >>> unk_four = UnknownSeq(4) 1044 >>> unk_four 1045 UnknownSeq(4, alphabet = Alphabet(), character = '?') 1046 >>> unk_four + unk_five 1047 UnknownSeq(9, alphabet = Alphabet(), character = '?') 1048 1049 If the alphabet or characters don't match up, the addition gives an 1050 ordinary Seq object: 1051 1052 >>> unk_nnnn = UnknownSeq(4, character = "N") 1053 >>> unk_nnnn 1054 UnknownSeq(4, alphabet = Alphabet(), character = 'N') 1055 >>> unk_nnnn + unk_four 1056 Seq('NNNN????', Alphabet()) 1057 1058 Combining with a real Seq gives a new Seq object: 1059 1060 >>> known_seq = Seq("ACGT") 1061 >>> unk_four + known_seq 1062 Seq('????ACGT', Alphabet()) 1063 >>> known_seq + unk_four 1064 Seq('ACGT????', Alphabet()) 1065 """
1066 - def __init__(self, length, alphabet = Alphabet.generic_alphabet, character = None):
1067 """Create a new UnknownSeq object. 1068 1069 If character is ommited, it is determed from the alphabet, "N" for 1070 nucleotides, "X" for proteins, and "?" otherwise. 1071 """ 1072 self._length = int(length) 1073 if self._length < 0: 1074 #TODO - Block zero length UnknownSeq? You can just use a Seq! 1075 raise ValueError("Length must not be negative.") 1076 self.alphabet = alphabet 1077 if character: 1078 if len(character) != 1: 1079 raise ValueError("character argument should be a single letter string.") 1080 self._character = character 1081 else: 1082 base = Alphabet._get_base_alphabet(alphabet) 1083 #TODO? Check the case of the letters in the alphabet? 1084 #We may have to use "n" instead of "N" etc. 1085 if isinstance(base, Alphabet.NucleotideAlphabet): 1086 self._character = "N" 1087 elif isinstance(base, Alphabet.ProteinAlphabet): 1088 self._character = "X" 1089 else: 1090 self._character = "?"
1091
1092 - def __len__(self):
1093 """Returns the stated length of the unknown sequence.""" 1094 return self._length
1095
1096 - def __str__(self):
1097 """Returns the unknown sequence as full string of the given length.""" 1098 return self._character * self._length
1099
1100 - def __repr__(self):
1101 return "UnknownSeq(%i, alphabet = %s, character = %s)" \ 1102 % (self._length, repr(self.alphabet), repr(self._character))
1103
1104 - def __add__(self, other):
1105 """Add another sequence or string to this sequence. 1106 1107 Adding two UnknownSeq objects returns another UnknownSeq object 1108 provided the character is the same and the alphabets are compatible. 1109 1110 >>> from Bio.Seq import UnknownSeq 1111 >>> from Bio.Alphabet import generic_protein 1112 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein) 1113 UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X') 1114 1115 If the characters differ, an UnknownSeq object cannot be used, so a 1116 Seq object is returned: 1117 1118 >>> from Bio.Seq import UnknownSeq 1119 >>> from Bio.Alphabet import generic_protein 1120 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein, 1121 ... character="x") 1122 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet()) 1123 1124 If adding a string to an UnknownSeq, a new Seq is returned with the 1125 same alphabet: 1126 1127 >>> from Bio.Seq import UnknownSeq 1128 >>> from Bio.Alphabet import generic_protein 1129 >>> UnknownSeq(5, generic_protein) + "LV" 1130 Seq('XXXXXLV', ProteinAlphabet()) 1131 """ 1132 if isinstance(other, UnknownSeq) \ 1133 and other._character == self._character: 1134 #TODO - Check the alphabets match 1135 return UnknownSeq(len(self)+len(other), 1136 self.alphabet, self._character) 1137 #Offload to the base class... 1138 return Seq(str(self), self.alphabet) + other
1139
1140 - def __radd__(self, other):
1141 #If other is an UnknownSeq, then __add__ would be called. 1142 #Offload to the base class... 1143 return other + Seq(str(self), self.alphabet)
1144
1145 - def __getitem__(self, index):
1146 if isinstance(index, int): 1147 #TODO - Check the bounds without wasting memory 1148 return str(self)[index] 1149 else: 1150 #TODO - Work out the length without wasting memory 1151 return UnknownSeq(len(("#"*self._length)[index]), 1152 self.alphabet, self._character)
1153
1154 - def count(self, sub, start=0, end=sys.maxint):
1155 """Non-overlapping count method, like that of a python string. 1156 1157 This behaves like the python string (and Seq object) method of the 1158 same name, which does a non-overlapping count! 1159 1160 Returns an integer, the number of occurrences of substring 1161 argument sub in the (sub)sequence given by [start:end]. 1162 Optional arguments start and end are interpreted as in slice 1163 notation. 1164 1165 Arguments: 1166 - sub - a string or another Seq object to look for 1167 - start - optional integer, slice start 1168 - end - optional integer, slice end 1169 1170 >>> "NNNN".count("N") 1171 4 1172 >>> Seq("NNNN").count("N") 1173 4 1174 >>> UnknownSeq(4, character="N").count("N") 1175 4 1176 >>> UnknownSeq(4, character="N").count("A") 1177 0 1178 >>> UnknownSeq(4, character="N").count("AA") 1179 0 1180 1181 HOWEVER, please note because that python strings and Seq objects (and 1182 MutableSeq objects) do a non-overlapping search, this may not give 1183 the answer you expect: 1184 1185 >>> UnknownSeq(4, character="N").count("NN") 1186 2 1187 >>> UnknownSeq(4, character="N").count("NNN") 1188 1 1189 """ 1190 sub_str = self._get_seq_str_and_check_alphabet(sub) 1191 if len(sub_str) == 1: 1192 if str(sub_str) == self._character: 1193 if start==0 and end >= self._length: 1194 return self._length 1195 else: 1196 #This could be done more cleverly... 1197 return str(self).count(sub_str, start, end) 1198 else: 1199 return 0 1200 else: 1201 if set(sub_str) == set(self._character): 1202 if start==0 and end >= self._length: 1203 return self._length // len(sub_str) 1204 else: 1205 #This could be done more cleverly... 1206 return str(self).count(sub_str, start, end) 1207 else: 1208 return 0
1209
1210 - def complement(self):
1211 """The complement of an unknown nucleotide equals itself. 1212 1213 >>> my_nuc = UnknownSeq(8) 1214 >>> my_nuc 1215 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1216 >>> print my_nuc 1217 ???????? 1218 >>> my_nuc.complement() 1219 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1220 >>> print my_nuc.complement() 1221 ???????? 1222 """ 1223 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1224 Alphabet.ProteinAlphabet): 1225 raise ValueError("Proteins do not have complements!") 1226 return self
1227
1228 - def reverse_complement(self):
1229 """The reverse complement of an unknown nucleotide equals itself. 1230 1231 >>> my_nuc = UnknownSeq(10) 1232 >>> my_nuc 1233 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1234 >>> print my_nuc 1235 ?????????? 1236 >>> my_nuc.reverse_complement() 1237 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1238 >>> print my_nuc.reverse_complement() 1239 ?????????? 1240 """ 1241 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1242 Alphabet.ProteinAlphabet): 1243 raise ValueError("Proteins do not have complements!") 1244 return self
1245
1246 - def transcribe(self):
1247 """Returns unknown RNA sequence from an unknown DNA sequence. 1248 1249 >>> my_dna = UnknownSeq(10, character="N") 1250 >>> my_dna 1251 UnknownSeq(10, alphabet = Alphabet(), character = 'N') 1252 >>> print my_dna 1253 NNNNNNNNNN 1254 >>> my_rna = my_dna.transcribe() 1255 >>> my_rna 1256 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N') 1257 >>> print my_rna 1258 NNNNNNNNNN 1259 """ 1260 #Offload the alphabet stuff 1261 s = Seq(self._character, self.alphabet).transcribe() 1262 return UnknownSeq(self._length, s.alphabet, self._character)
1263
1264 - def back_transcribe(self):
1265 """Returns unknown DNA sequence from an unknown RNA sequence. 1266 1267 >>> my_rna = UnknownSeq(20, character="N") 1268 >>> my_rna 1269 UnknownSeq(20, alphabet = Alphabet(), character = 'N') 1270 >>> print my_rna 1271 NNNNNNNNNNNNNNNNNNNN 1272 >>> my_dna = my_rna.back_transcribe() 1273 >>> my_dna 1274 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1275 >>> print my_dna 1276 NNNNNNNNNNNNNNNNNNNN 1277 """ 1278 #Offload the alphabet stuff 1279 s = Seq(self._character, self.alphabet).back_transcribe() 1280 return UnknownSeq(self._length, s.alphabet, self._character)
1281
1282 - def upper(self):
1283 """Returns an upper case copy of the sequence. 1284 1285 >>> from Bio.Alphabet import generic_dna 1286 >>> from Bio.Seq import UnknownSeq 1287 >>> my_seq = UnknownSeq(20, generic_dna, character="n") 1288 >>> my_seq 1289 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n') 1290 >>> print my_seq 1291 nnnnnnnnnnnnnnnnnnnn 1292 >>> my_seq.upper() 1293 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1294 >>> print my_seq.upper() 1295 NNNNNNNNNNNNNNNNNNNN 1296 1297 This will adjust the alphabet if required. See also the lower method. 1298 """ 1299 return UnknownSeq(self._length, self.alphabet._upper(), self._character.upper())
1300
1301 - def lower(self):
1302 """Returns a lower case copy of the sequence. 1303 1304 This will adjust the alphabet if required: 1305 1306 >>> from Bio.Alphabet import IUPAC 1307 >>> from Bio.Seq import UnknownSeq 1308 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein) 1309 >>> my_seq 1310 UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X') 1311 >>> print my_seq 1312 XXXXXXXXXXXXXXXXXXXX 1313 >>> my_seq.lower() 1314 UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x') 1315 >>> print my_seq.lower() 1316 xxxxxxxxxxxxxxxxxxxx 1317 1318 See also the upper method. 1319 """ 1320 return UnknownSeq(self._length, self.alphabet._lower(), self._character.lower())
1321
1322 - def translate(self, **kwargs):
1323 """Translate an unknown nucleotide sequence into an unknown protein. 1324 1325 e.g. 1326 1327 >>> my_seq = UnknownSeq(11, character="N") 1328 >>> print my_seq 1329 NNNNNNNNNNN 1330 >>> my_protein = my_seq.translate() 1331 >>> my_protein 1332 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X') 1333 >>> print my_protein 1334 XXX 1335 1336 In comparison, using a normal Seq object: 1337 1338 >>> my_seq = Seq("NNNNNNNNNNN") 1339 >>> print my_seq 1340 NNNNNNNNNNN 1341 >>> my_protein = my_seq.translate() 1342 >>> my_protein 1343 Seq('XXX', ExtendedIUPACProtein()) 1344 >>> print my_protein 1345 XXX 1346 1347 """ 1348 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1349 Alphabet.ProteinAlphabet): 1350 raise ValueError("Proteins cannot be translated!") 1351 return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1352
1353 - def ungap(self, gap=None):
1354 """Return a copy of the sequence without the gap character(s). 1355 1356 The gap character can be specified in two ways - either as an explicit 1357 argument, or via the sequence's alphabet. For example: 1358 1359 >>> from Bio.Seq import UnknownSeq 1360 >>> from Bio.Alphabet import Gapped, generic_dna 1361 >>> my_dna = UnknownSeq(20, Gapped(generic_dna,"-")) 1362 >>> my_dna 1363 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N') 1364 >>> my_dna.ungap() 1365 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1366 >>> my_dna.ungap("-") 1367 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1368 1369 If the UnknownSeq is using the gap character, then an empty Seq is 1370 returned: 1371 1372 >>> my_gap = UnknownSeq(20, Gapped(generic_dna,"-"), character="-") 1373 >>> my_gap 1374 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-') 1375 >>> my_gap.ungap() 1376 Seq('', DNAAlphabet()) 1377 >>> my_gap.ungap("-") 1378 Seq('', DNAAlphabet()) 1379 1380 Notice that the returned sequence's alphabet is adjusted to remove any 1381 explicit gap character declaration. 1382 """ 1383 #Offload the alphabet stuff 1384 s = Seq(self._character, self.alphabet).ungap() 1385 if s : 1386 return UnknownSeq(self._length, s.alphabet, self._character) 1387 else : 1388 return Seq("", s.alphabet)
1389
1390 -class MutableSeq(object):
1391 """An editable sequence object (with an alphabet). 1392 1393 Unlike normal python strings and our basic sequence object (the Seq class) 1394 which are immuatable, the MutableSeq lets you edit the sequence in place. 1395 However, this means you cannot use a MutableSeq object as a dictionary key. 1396 1397 >>> from Bio.Seq import MutableSeq 1398 >>> from Bio.Alphabet import generic_dna 1399 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna) 1400 >>> my_seq 1401 MutableSeq('ACTCGTCGTCG', DNAAlphabet()) 1402 >>> my_seq[5] 1403 'T' 1404 >>> my_seq[5] = "A" 1405 >>> my_seq 1406 MutableSeq('ACTCGACGTCG', DNAAlphabet()) 1407 >>> my_seq[5] 1408 'A' 1409 >>> my_seq[5:8] = "NNN" 1410 >>> my_seq 1411 MutableSeq('ACTCGNNNTCG', DNAAlphabet()) 1412 >>> len(my_seq) 1413 11 1414 1415 Note that the MutableSeq object does not support as many string-like 1416 or biological methods as the Seq object. 1417 """
1418 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
1419 if type(data) == type(""): 1420 self.data = array.array("c", data) 1421 else: 1422 self.data = data # assumes the input is an array 1423 self.alphabet = alphabet
1424
1425 - def __repr__(self):
1426 """Returns a (truncated) representation of the sequence for debugging.""" 1427 if len(self) > 60: 1428 #Shows the last three letters as it is often useful to see if there 1429 #is a stop codon at the end of a sequence. 1430 #Note total length is 54+3+3=60 1431 return "%s('%s...%s', %s)" % (self.__class__.__name__, 1432 str(self[:54]), str(self[-3:]), 1433 repr(self.alphabet)) 1434 else: 1435 return "%s('%s', %s)" % (self.__class__.__name__, 1436 str(self), 1437 repr(self.alphabet))
1438
1439 - def __str__(self):
1440 """Returns the full sequence as a python string. 1441 1442 Note that Biopython 1.44 and earlier would give a truncated 1443 version of repr(my_seq) for str(my_seq). If you are writing code 1444 which needs to be backwards compatible with old Biopython, you 1445 should continue to use my_seq.tostring() rather than str(my_seq). 1446 """ 1447 #See test_GAQueens.py for an historic usage of a non-string alphabet! 1448 return "".join(self.data)
1449
1450 - def __cmp__(self, other):
1451 """Compare the sequence for to another sequence or a string. 1452 1453 If compared to another sequence the alphabets must be compatible. 1454 Comparing DNA to RNA, or Nucleotide to Protein will raise an 1455 exception. 1456 1457 Otherwise only the sequence itself is compared, not the precise 1458 alphabet. 1459 1460 This method indirectly supports ==, < , etc.""" 1461 if hasattr(other, "alphabet"): 1462 #other should be a Seq or a MutableSeq 1463 if not Alphabet._check_type_compatible([self.alphabet, 1464 other.alphabet]): 1465 raise TypeError("Incompatable alphabets %s and %s" \ 1466 % (repr(self.alphabet), repr(other.alphabet))) 1467 #They should be the same sequence type (or one of them is generic) 1468 if isinstance(other, MutableSeq): 1469 #See test_GAQueens.py for an historic usage of a non-string 1470 #alphabet! Comparing the arrays supports this. 1471 return cmp(self.data, other.data) 1472 else: 1473 return cmp(str(self), str(other)) 1474 elif isinstance(other, basestring): 1475 return cmp(str(self), other) 1476 else: 1477 raise TypeError
1478
1479 - def __len__(self): return len(self.data)
1480
1481 - def __getitem__(self, index):
1482 #Note since Python 2.0, __getslice__ is deprecated 1483 #and __getitem__ is used instead. 1484 #See http://docs.python.org/ref/sequence-methods.html 1485 if isinstance(index, int): 1486 #Return a single letter as a string 1487 return self.data[index] 1488 else: 1489 #Return the (sub)sequence as another Seq object 1490 return MutableSeq(self.data[index], self.alphabet)
1491
1492 - def __setitem__(self, index, value):
1493 #Note since Python 2.0, __setslice__ is deprecated 1494 #and __setitem__ is used instead. 1495 #See http://docs.python.org/ref/sequence-methods.html 1496 if isinstance(index, int): 1497 #Replacing a single letter with a new string 1498 self.data[index] = value 1499 else: 1500 #Replacing a sub-sequence 1501 if isinstance(value, MutableSeq): 1502 self.data[index] = value.data 1503 elif isinstance(value, type(self.data)): 1504 self.data[index] = value 1505 else: 1506 self.data[index] = array.array("c", str(value))
1507
1508 - def __delitem__(self, index):
1509 #Note since Python 2.0, __delslice__ is deprecated 1510 #and __delitem__ is used instead. 1511 #See http://docs.python.org/ref/sequence-methods.html 1512 1513 #Could be deleting a single letter, or a slice 1514 del self.data[index]
1515
1516 - def __add__(self, other):
1517 """Add another sequence or string to this sequence. 1518 1519 Returns a new MutableSeq object.""" 1520 if hasattr(other, "alphabet"): 1521 #other should be a Seq or a MutableSeq 1522 if not Alphabet._check_type_compatible([self.alphabet, 1523 other.alphabet]): 1524 raise TypeError("Incompatable alphabets %s and %s" \ 1525 % (repr(self.alphabet), repr(other.alphabet))) 1526 #They should be the same sequence type (or one of them is generic) 1527 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1528 if isinstance(other, MutableSeq): 1529 #See test_GAQueens.py for an historic usage of a non-string 1530 #alphabet! Adding the arrays should support this. 1531 return self.__class__(self.data + other.data, a) 1532 else: 1533 return self.__class__(str(self) + str(other), a) 1534 elif isinstance(other, basestring): 1535 #other is a plain string - use the current alphabet 1536 return self.__class__(str(self) + str(other), self.alphabet) 1537 else: 1538 raise TypeError
1539
1540 - def __radd__(self, other):
1541 if hasattr(other, "alphabet"): 1542 #other should be a Seq or a MutableSeq 1543 if not Alphabet._check_type_compatible([self.alphabet, 1544 other.alphabet]): 1545 raise TypeError("Incompatable alphabets %s and %s" \ 1546 % (repr(self.alphabet), repr(other.alphabet))) 1547 #They should be the same sequence type (or one of them is generic) 1548 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1549 if isinstance(other, MutableSeq): 1550 #See test_GAQueens.py for an historic usage of a non-string 1551 #alphabet! Adding the arrays should support this. 1552 return self.__class__(other.data + self.data, a) 1553 else: 1554 return self.__class__(str(other) + str(self), a) 1555 elif isinstance(other, basestring): 1556 #other is a plain string - use the current alphabet 1557 return self.__class__(str(other) + str(self), self.alphabet) 1558 else: 1559 raise TypeError
1560
1561 - def append(self, c):
1562 self.data.append(c)
1563
1564 - def insert(self, i, c):
1565 self.data.insert(i, c)
1566
1567 - def pop(self, i = (-1)):
1568 c = self.data[i] 1569 del self.data[i] 1570 return c
1571
1572 - def remove(self, item):
1573 for i in range(len(self.data)): 1574 if self.data[i] == item: 1575 del self.data[i] 1576 return 1577 raise ValueError("MutableSeq.remove(x): x not in list")
1578
1579 - def count(self, sub, start=0, end=sys.maxint):
1580 """Non-overlapping count method, like that of a python string. 1581 1582 This behaves like the python string method of the same name, 1583 which does a non-overlapping count! 1584 1585 Returns an integer, the number of occurrences of substring 1586 argument sub in the (sub)sequence given by [start:end]. 1587 Optional arguments start and end are interpreted as in slice 1588 notation. 1589 1590 Arguments: 1591 - sub - a string or another Seq object to look for 1592 - start - optional integer, slice start 1593 - end - optional integer, slice end 1594 1595 e.g. 1596 1597 >>> from Bio.Seq import MutableSeq 1598 >>> my_mseq = MutableSeq("AAAATGA") 1599 >>> print my_mseq.count("A") 1600 5 1601 >>> print my_mseq.count("ATG") 1602 1 1603 >>> print my_mseq.count(Seq("AT")) 1604 1 1605 >>> print my_mseq.count("AT", 2, -1) 1606 1 1607 1608 HOWEVER, please note because that python strings, Seq objects and 1609 MutableSeq objects do a non-overlapping search, this may not give 1610 the answer you expect: 1611 1612 >>> "AAAA".count("AA") 1613 2 1614 >>> print MutableSeq("AAAA").count("AA") 1615 2 1616 1617 A non-overlapping search would give the answer as three! 1618 """ 1619 try: 1620 #TODO - Should we check the alphabet? 1621 search = sub.tostring() 1622 except AttributeError: 1623 search = sub 1624 1625 if not isinstance(search, basestring): 1626 raise TypeError("expected a string, Seq or MutableSeq") 1627 1628 if len(search) == 1: 1629 #Try and be efficient and work directly from the array. 1630 count = 0 1631 for c in self.data[start:end]: 1632 if c == search: count += 1 1633 return count 1634 else: 1635 #TODO - Can we do this more efficiently? 1636 return self.tostring().count(search, start, end)
1637
1638 - def index(self, item):
1639 for i in range(len(self.data)): 1640 if self.data[i] == item: 1641 return i 1642 raise ValueError("MutableSeq.index(x): x not in list")
1643
1644 - def reverse(self):
1645 """Modify the mutable sequence to reverse itself. 1646 1647 No return value. 1648 """ 1649 self.data.reverse()
1650
1651 - def complement(self):
1652 """Modify the mutable sequence to take on its complement. 1653 1654 Trying to complement a protein sequence raises an exception. 1655 1656 No return value. 1657 """ 1658 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1659 Alphabet.ProteinAlphabet): 1660 raise ValueError("Proteins do not have complements!") 1661 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 1662 d = ambiguous_dna_complement 1663 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 1664 d = ambiguous_rna_complement 1665 elif 'U' in self.data and 'T' in self.data: 1666 #TODO - Handle this cleanly? 1667 raise ValueError("Mixed RNA/DNA found") 1668 elif 'U' in self.data: 1669 d = ambiguous_rna_complement 1670 else: 1671 d = ambiguous_dna_complement 1672 c = dict([(x.lower(), y.lower()) for x,y in d.iteritems()]) 1673 d.update(c) 1674 self.data = map(lambda c: d[c], self.data) 1675 self.data = array.array('c', self.data)
1676
1677 - def reverse_complement(self):
1678 """Modify the mutable sequence to take on its reverse complement. 1679 1680 Trying to reverse complement a protein sequence raises an exception. 1681 1682 No return value. 1683 """ 1684 self.complement() 1685 self.data.reverse()
1686 1687 ## Sorting a sequence makes no sense. 1688 # def sort(self, *args): self.data.sort(*args) 1689
1690 - def extend(self, other):
1691 if isinstance(other, MutableSeq): 1692 for c in other.data: 1693 self.data.append(c) 1694 else: 1695 for c in other: 1696 self.data.append(c)
1697
1698 - def tostring(self):
1699 """Returns the full sequence as a python string. 1700 1701 Although not formally deprecated, you are now encouraged to use 1702 str(my_seq) instead of my_seq.tostring(). 1703 1704 Because str(my_seq) will give you the full sequence as a python string, 1705 there is often no need to make an explicit conversion. For example, 1706 1707 print "ID={%s}, sequence={%s}" % (my_name, my_seq) 1708 1709 On Biopython 1.44 or older you would have to have done this: 1710 1711 print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring()) 1712 """ 1713 return "".join(self.data)
1714
1715 - def toseq(self):
1716 """Returns the full sequence as a new immutable Seq object. 1717 1718 >>> from Bio.Seq import Seq 1719 >>> from Bio.Alphabet import IUPAC 1720 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", \ 1721 IUPAC.protein) 1722 >>> my_mseq 1723 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1724 >>> my_mseq.toseq() 1725 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1726 1727 Note that the alphabet is preserved. 1728 """ 1729 return Seq("".join(self.data), self.alphabet)
1730
1731 # The transcribe, backward_transcribe, and translate functions are 1732 # user-friendly versions of the corresponding functions in Bio.Transcribe 1733 # and Bio.Translate. The functions work both on Seq objects, and on strings. 1734 1735 -def transcribe(dna):
1736 """Transcribes a DNA sequence into RNA. 1737 1738 If given a string, returns a new string object. 1739 1740 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1741 1742 Trying to transcribe a protein or RNA sequence raises an exception. 1743 1744 e.g. 1745 1746 >>> transcribe("ACTGN") 1747 'ACUGN' 1748 """ 1749 if isinstance(dna, Seq): 1750 return dna.transcribe() 1751 elif isinstance(dna, MutableSeq): 1752 return dna.toseq().transcribe() 1753 else: 1754 return dna.replace('T','U').replace('t','u')
1755
1756 -def back_transcribe(rna):
1757 """Back-transcribes an RNA sequence into DNA. 1758 1759 If given a string, returns a new string object. 1760 1761 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1762 1763 Trying to transcribe a protein or DNA sequence raises an exception. 1764 1765 e.g. 1766 1767 >>> back_transcribe("ACUGN") 1768 'ACTGN' 1769 """ 1770 if isinstance(rna, Seq): 1771 return rna.back_transcribe() 1772 elif isinstance(rna, MutableSeq): 1773 return rna.toseq().back_transcribe() 1774 else: 1775 return rna.replace('U','T').replace('u','t')
1776
1777 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False, 1778 cds=False, pos_stop="X"):
1779 """Helper function to translate a nucleotide string (PRIVATE). 1780 1781 Arguments: 1782 - sequence - a string 1783 - table - a CodonTable object (NOT a table name or id number) 1784 - stop_symbol - a single character string, what to use for terminators. 1785 - to_stop - boolean, should translation terminate at the first 1786 in frame stop codon? If there is no in-frame stop codon 1787 then translation continues to the end. 1788 - pos_stop - a single character string for a possible stop codon 1789 (e.g. TAN or NNN) 1790 - cds - Boolean, indicates this is a complete CDS. If True, this 1791 checks the sequence starts with a valid alternative start 1792 codon (which will be translated as methionine, M), that the 1793 sequence length is a multiple of three, and that there is a 1794 single in frame stop codon at the end (this will be excluded 1795 from the protein sequence, regardless of the to_stop option). 1796 If these tests fail, an exception is raised. 1797 1798 Returns a string. 1799 1800 e.g. 1801 1802 >>> from Bio.Data import CodonTable 1803 >>> table = CodonTable.ambiguous_dna_by_id[1] 1804 >>> _translate_str("AAA", table) 1805 'K' 1806 >>> _translate_str("TAR", table) 1807 '*' 1808 >>> _translate_str("TAN", table) 1809 'X' 1810 >>> _translate_str("TAN", table, pos_stop="@") 1811 '@' 1812 >>> _translate_str("TA?", table) 1813 Traceback (most recent call last): 1814 ... 1815 TranslationError: Codon 'TA?' is invalid 1816 >>> _translate_str("ATGCCCTAG", table, cds=True) 1817 'MP' 1818 >>> _translate_str("AAACCCTAG", table, cds=True) 1819 Traceback (most recent call last): 1820 ... 1821 TranslationError: First codon 'AAA' is not a start codon 1822 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True) 1823 Traceback (most recent call last): 1824 ... 1825 TranslationError: Extra in frame stop codon found. 1826 """ 1827 sequence = sequence.upper() 1828 amino_acids = [] 1829 forward_table = table.forward_table 1830 stop_codons = table.stop_codons 1831 if table.nucleotide_alphabet.letters is not None: 1832 valid_letters = set(table.nucleotide_alphabet.letters.upper()) 1833 else: 1834 #Assume the worst case, ambiguous DNA or RNA: 1835 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \ 1836 IUPAC.ambiguous_rna.letters.upper()) 1837 if cds: 1838 if str(sequence[:3]).upper() not in table.start_codons: 1839 raise CodonTable.TranslationError(\ 1840 "First codon '%s' is not a start codon" % sequence[:3]) 1841 if len(sequence) % 3 != 0: 1842 raise CodonTable.TranslationError(\ 1843 "Sequence length %i is not a multiple of three" % len(sequence)) 1844 if str(sequence[-3:]).upper() not in stop_codons: 1845 raise CodonTable.TranslationError(\ 1846 "Final codon '%s' is not a stop codon" % sequence[-3:]) 1847 #Don't translate the stop symbol, and manually translate the M 1848 sequence = sequence[3:-3] 1849 amino_acids = ["M"] 1850 n = len(sequence) 1851 for i in xrange(0,n-n%3,3): 1852 codon = sequence[i:i+3] 1853 try: 1854 amino_acids.append(forward_table[codon]) 1855 except (KeyError, CodonTable.TranslationError): 1856 #Todo? Treat "---" as a special case (gapped translation) 1857 if codon in table.stop_codons: 1858 if cds: 1859 raise CodonTable.TranslationError(\ 1860 "Extra in frame stop codon found.") 1861 if to_stop : break 1862 amino_acids.append(stop_symbol) 1863 elif valid_letters.issuperset(set(codon)): 1864 #Possible stop codon (e.g. NNN or TAN) 1865 amino_acids.append(pos_stop) 1866 else: 1867 raise CodonTable.TranslationError(\ 1868 "Codon '%s' is invalid" % codon) 1869 return "".join(amino_acids)
1870
1871 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False, 1872 cds=False):
1873 """Translate a nucleotide sequence into amino acids. 1874 1875 If given a string, returns a new string object. Given a Seq or 1876 MutableSeq, returns a Seq object with a protein alphabet. 1877 1878 Arguments: 1879 - table - Which codon table to use? This can be either a name 1880 (string) or an NCBI identifier (integer). Defaults 1881 to the "Standard" table. 1882 - stop_symbol - Single character string, what to use for any 1883 terminators, defaults to the asterisk, "*". 1884 - to_stop - Boolean, defaults to False meaning do a full 1885 translation continuing on past any stop codons 1886 (translated as the specified stop_symbol). If 1887 True, translation is terminated at the first in 1888 frame stop codon (and the stop_symbol is not 1889 appended to the returned protein sequence). 1890 - cds - Boolean, indicates this is a complete CDS. If True, this 1891 checks the sequence starts with a valid alternative start 1892 codon (which will be translated as methionine, M), that the 1893 sequence length is a multiple of three, and that there is a 1894 single in frame stop codon at the end (this will be excluded 1895 from the protein sequence, regardless of the to_stop option). 1896 If these tests fail, an exception is raised. 1897 1898 A simple string example using the default (standard) genetic code: 1899 1900 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" 1901 >>> translate(coding_dna) 1902 'VAIVMGR*KGAR*' 1903 >>> translate(coding_dna, stop_symbol="@") 1904 'VAIVMGR@KGAR@' 1905 >>> translate(coding_dna, to_stop=True) 1906 'VAIVMGR' 1907 1908 Now using NCBI table 2, where TGA is not a stop codon: 1909 1910 >>> translate(coding_dna, table=2) 1911 'VAIVMGRWKGAR*' 1912 >>> translate(coding_dna, table=2, to_stop=True) 1913 'VAIVMGRWKGAR' 1914 1915 In fact this example uses an alternative start codon valid under NCBI table 2, 1916 GTG, which means this example is a complete valid CDS which when translated 1917 should really start with methionine (not valine): 1918 1919 >>> translate(coding_dna, table=2, cds=True) 1920 'MAIVMGRWKGAR' 1921 1922 Note that if the sequence has no in-frame stop codon, then the to_stop 1923 argument has no effect: 1924 1925 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" 1926 >>> translate(coding_dna2) 1927 'VAIVMGR' 1928 >>> translate(coding_dna2, to_stop=True) 1929 'VAIVMGR' 1930 1931 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 1932 or a stop codon. These are translated as "X". Any invalid codon 1933 (e.g. "TA?" or "T-A") will throw a TranslationError. 1934 1935 NOTE - Does NOT support gapped sequences. 1936 1937 It will however translate either DNA or RNA. 1938 """ 1939 if isinstance(sequence, Seq): 1940 return sequence.translate(table, stop_symbol, to_stop, cds) 1941 elif isinstance(sequence, MutableSeq): 1942 #Return a Seq object 1943 return sequence.toseq().translate(table, stop_symbol, to_stop, cds) 1944 else: 1945 #Assume its a string, return a string 1946 try: 1947 codon_table = CodonTable.ambiguous_generic_by_id[int(table)] 1948 except ValueError: 1949 codon_table = CodonTable.ambiguous_generic_by_name[table] 1950 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds)
1951
1952 -def reverse_complement(sequence):
1953 """Returns the reverse complement sequence of a nucleotide string. 1954 1955 If given a string, returns a new string object. 1956 Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet. 1957 1958 Supports unambiguous and ambiguous nucleotide sequences. 1959 1960 e.g. 1961 1962 >>> reverse_complement("ACTG-NH") 1963 'DN-CAGT' 1964 """ 1965 if isinstance(sequence, Seq): 1966 #Return a Seq 1967 return sequence.reverse_complement() 1968 elif isinstance(sequence, MutableSeq): 1969 #Return a Seq 1970 #Don't use the MutableSeq reverse_complement method as it is 'in place'. 1971 return sequence.toseq().reverse_complement() 1972 1973 #Assume its a string. 1974 #In order to avoid some code duplication, the old code would turn the string 1975 #into a Seq, use the reverse_complement method, and convert back to a string. 1976 #This worked, but is over five times slower on short sequences! 1977 if ('U' in sequence or 'u' in sequence) \ 1978 and ('T' in sequence or 't' in sequence): 1979 raise ValueError("Mixed RNA/DNA found") 1980 elif 'U' in sequence or 'u' in sequence: 1981 ttable = _rna_complement_table 1982 else: 1983 ttable = _dna_complement_table 1984 return sequence.translate(ttable)[::-1]
1985
1986 -def _test():
1987 """Run the Bio.Seq module's doctests.""" 1988 print "Runing doctests..." 1989 import doctest 1990 doctest.testmod() 1991 print "Done"
1992 1993 if __name__ == "__main__": 1994 _test() 1995