1
2
3
4
5
6
7
8 """Provides objects to represent biological sequences with alphabets.
9
10 See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial:
11 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
12 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
13 """
14 __docformat__ ="epytext en"
15
16 import string
17 import array
18 import sys
19
20 import Alphabet
21 from Alphabet import IUPAC
22 from Bio.SeqRecord import SeqRecord
23 from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement
24 from Bio.Data import CodonTable
27 """Makes a python string translation table (PRIVATE).
28
29 Arguments:
30 - complement_mapping - a dictionary such as ambiguous_dna_complement
31 and ambiguous_rna_complement from Data.IUPACData.
32
33 Returns a translation table (a string of length 256) for use with the
34 python string's translate method to use in a (reverse) complement.
35
36 Compatible with lower case and upper case sequences.
37
38 For internal use only.
39 """
40 before = ''.join(complement_mapping.keys())
41 after = ''.join(complement_mapping.values())
42 before = before + before.lower()
43 after = after + after.lower()
44 return string.maketrans(before, after)
45
46 _dna_complement_table = _maketrans(ambiguous_dna_complement)
47 _rna_complement_table = _maketrans(ambiguous_rna_complement)
48
49 -class Seq(object):
50 """A read-only sequence object (essentially a string with an alphabet).
51
52 Like normal python strings, our basic sequence object is immutable.
53 This prevents you from doing my_seq[5] = "A" for example, but does allow
54 Seq objects to be used as dictionary keys.
55
56 The Seq object provides a number of string like methods (such as count,
57 find, split and strip), which are alphabet aware where appropriate.
58
59 The Seq object also provides some biological methods, such as complement,
60 reverse_complement, transcribe, back_transcribe and translate (which are
61 not applicable to sequences with a protein alphabet).
62 """
64 """Create a Seq object.
65
66 Arguments:
67 - seq - Sequence, required (string)
68 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet
69
70 You will typically use Bio.SeqIO to read in sequences from files as
71 SeqRecord objects, whose sequence will be exposed as a Seq object via
72 the seq property.
73
74 However, will often want to create your own Seq objects directly:
75
76 >>> from Bio.Seq import Seq
77 >>> from Bio.Alphabet import IUPAC
78 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
79 ... IUPAC.protein)
80 >>> my_seq
81 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
82 >>> print my_seq
83 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
84 """
85
86 assert (type(data) == type("") or
87 type(data) == type(u""))
88 self._data = data
89 self.alphabet = alphabet
90
91
92
93 @property
95 """Sequence as a string (OBSOLETE/DEPRECATED).
96
97 This is a read only property provided for backwards compatility with
98 older versions of Biopython (as is the tostring() method). We now
99 encourage you to use str(my_seq) instead of my_seq.data or the method
100 my_seq.tostring().
101
102 In recent releases of Biopython it was possible to change a Seq object
103 by updating its data property, but this triggered a deprecation warning.
104 Now the data property is read only, since Seq objects are meant to be
105 immutable:
106
107 >>> from Bio.Seq import Seq
108 >>> from Bio.Alphabet import generic_dna
109 >>> my_seq = Seq("ACGT", generic_dna)
110 >>> str(my_seq) == my_seq.tostring() == my_seq.data == "ACGT"
111 True
112 >>> my_seq.data = "AAAA"
113 Traceback (most recent call last):
114 ...
115 AttributeError: can't set attribute
116 """
117 return str(self)
118
120 """Returns a (truncated) representation of the sequence for debugging."""
121 if len(self) > 60:
122
123
124
125 return "%s('%s...%s', %s)" % (self.__class__.__name__,
126 str(self)[:54], str(self)[-3:],
127 repr(self.alphabet))
128 else:
129 return "%s(%s, %s)" % (self.__class__.__name__,
130 repr(self.data),
131 repr(self.alphabet))
133 """Returns the full sequence as a python string, use str(my_seq).
134
135 Note that Biopython 1.44 and earlier would give a truncated
136 version of repr(my_seq) for str(my_seq). If you are writing code
137 which need to be backwards compatible with old Biopython, you
138 should continue to use my_seq.tostring() rather than str(my_seq).
139 """
140 return self._data
141
142
143
144
145
147 """Returns the length of the sequence, use len(my_seq)."""
148 return len(self._data)
149
151 """Returns a subsequence of single letter, use my_seq[index]."""
152
153
154
155 if isinstance(index, int):
156
157 return self._data[index]
158 else:
159
160 return Seq(self._data[index], self.alphabet)
161
163 """Add another sequence or string to this sequence.
164
165 If adding a string to a Seq, the alphabet is preserved:
166
167 >>> from Bio.Seq import Seq
168 >>> from Bio.Alphabet import generic_protein
169 >>> Seq("MELKI", generic_protein) + "LV"
170 Seq('MELKILV', ProteinAlphabet())
171
172 When adding two Seq (like) objects, the alphabets are important.
173 Consider this example:
174
175 >>> from Bio.Seq import Seq
176 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna
177 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna)
178 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna)
179 >>> unamb_dna_seq
180 Seq('ACGT', IUPACUnambiguousDNA())
181 >>> ambig_dna_seq
182 Seq('ACRGT', IUPACAmbiguousDNA())
183
184 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get
185 the more general ambiguous IUPAC DNA alphabet:
186
187 >>> unamb_dna_seq + ambig_dna_seq
188 Seq('ACGTACRGT', IUPACAmbiguousDNA())
189
190 However, if the default generic alphabet is included, the result is
191 a generic alphabet:
192
193 >>> Seq("") + ambig_dna_seq
194 Seq('ACRGT', Alphabet())
195
196 You can't add RNA and DNA sequences:
197
198 >>> from Bio.Alphabet import generic_dna, generic_rna
199 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna)
200 Traceback (most recent call last):
201 ...
202 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet()
203
204 You can't add nucleotide and protein sequences:
205
206 >>> from Bio.Alphabet import generic_dna, generic_protein
207 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein)
208 Traceback (most recent call last):
209 ...
210 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet()
211 """
212 if hasattr(other, "alphabet"):
213
214 if not Alphabet._check_type_compatible([self.alphabet,
215 other.alphabet]):
216 raise TypeError("Incompatable alphabets %s and %s" \
217 % (repr(self.alphabet), repr(other.alphabet)))
218
219 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
220 return self.__class__(str(self) + str(other), a)
221 elif isinstance(other, basestring):
222
223 return self.__class__(str(self) + other, self.alphabet)
224 elif isinstance(other, SeqRecord):
225
226 return NotImplemented
227 else :
228 raise TypeError
229
231 """Adding a sequence on the left.
232
233 If adding a string to a Seq, the alphabet is preserved:
234
235 >>> from Bio.Seq import Seq
236 >>> from Bio.Alphabet import generic_protein
237 >>> "LV" + Seq("MELKI", generic_protein)
238 Seq('LVMELKI', ProteinAlphabet())
239
240 Adding two Seq (like) objects is handled via the __add__ method.
241 """
242 if hasattr(other, "alphabet"):
243
244 if not Alphabet._check_type_compatible([self.alphabet,
245 other.alphabet]):
246 raise TypeError("Incompatable alphabets %s and %s" \
247 % (repr(self.alphabet), repr(other.alphabet)))
248
249 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
250 return self.__class__(str(other) + str(self), a)
251 elif isinstance(other, basestring):
252
253 return self.__class__(other + str(self), self.alphabet)
254 else:
255 raise TypeError
256
258 """Returns the full sequence as a python string (OBSOLETE).
259
260 Although not formally deprecated, you are now encouraged to use
261 str(my_seq) instead of my_seq.tostring()."""
262 return str(self)
263
265 """Returns the full sequence as a MutableSeq object.
266
267 >>> from Bio.Seq import Seq
268 >>> from Bio.Alphabet import IUPAC
269 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL",
270 ... IUPAC.protein)
271 >>> my_seq
272 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
273 >>> my_seq.tomutable()
274 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
275
276 Note that the alphabet is preserved.
277 """
278 return MutableSeq(str(self), self.alphabet)
279
281 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE).
282
283 For a string argument, returns the string.
284
285 For a Seq or MutableSeq, it checks the alphabet is compatible
286 (raising an exception if it isn't), and then returns a string.
287 """
288 try:
289 other_alpha = other_sequence.alphabet
290 except AttributeError:
291
292 return other_sequence
293
294
295 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]):
296 raise TypeError("Incompatable alphabets %s and %s" \
297 % (repr(self.alphabet), repr(other_alpha)))
298
299 return str(other_sequence)
300
301 - def count(self, sub, start=0, end=sys.maxint):
302 """Non-overlapping count method, like that of a python string.
303
304 This behaves like the python string method of the same name,
305 which does a non-overlapping count!
306
307 Returns an integer, the number of occurrences of substring
308 argument sub in the (sub)sequence given by [start:end].
309 Optional arguments start and end are interpreted as in slice
310 notation.
311
312 Arguments:
313 - sub - a string or another Seq object to look for
314 - start - optional integer, slice start
315 - end - optional integer, slice end
316
317 e.g.
318
319 >>> from Bio.Seq import Seq
320 >>> my_seq = Seq("AAAATGA")
321 >>> print my_seq.count("A")
322 5
323 >>> print my_seq.count("ATG")
324 1
325 >>> print my_seq.count(Seq("AT"))
326 1
327 >>> print my_seq.count("AT", 2, -1)
328 1
329
330 HOWEVER, please note because python strings and Seq objects (and
331 MutableSeq objects) do a non-overlapping search, this may not give
332 the answer you expect:
333
334 >>> "AAAA".count("AA")
335 2
336 >>> print Seq("AAAA").count("AA")
337 2
338
339 A non-overlapping search would give the answer as three!
340 """
341
342 sub_str = self._get_seq_str_and_check_alphabet(sub)
343 return str(self).count(sub_str, start, end)
344
346 """Implements the 'in' keyword, like a python string.
347
348 e.g.
349
350 >>> from Bio.Seq import Seq
351 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein
352 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna)
353 >>> "AAA" in my_dna
354 True
355 >>> Seq("AAA") in my_dna
356 True
357 >>> Seq("AAA", generic_dna) in my_dna
358 True
359
360 Like other Seq methods, this will raise a type error if another Seq
361 (or Seq like) object with an incompatible alphabet is used:
362
363 >>> Seq("AAA", generic_rna) in my_dna
364 Traceback (most recent call last):
365 ...
366 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet()
367 >>> Seq("AAA", generic_protein) in my_dna
368 Traceback (most recent call last):
369 ...
370 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet()
371 """
372
373 sub_str = self._get_seq_str_and_check_alphabet(char)
374 return sub_str in str(self)
375
376 - def find(self, sub, start=0, end=sys.maxint):
377 """Find method, like that of a python string.
378
379 This behaves like the python string method of the same name.
380
381 Returns an integer, the index of the first occurrence of substring
382 argument sub in the (sub)sequence given by [start:end].
383
384 Arguments:
385 - sub - a string or another Seq object to look for
386 - start - optional integer, slice start
387 - end - optional integer, slice end
388
389 Returns -1 if the subsequence is NOT found.
390
391 e.g. Locating the first typical start codon, AUG, in an RNA sequence:
392
393 >>> from Bio.Seq import Seq
394 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
395 >>> my_rna.find("AUG")
396 3
397 """
398
399 sub_str = self._get_seq_str_and_check_alphabet(sub)
400 return str(self).find(sub_str, start, end)
401
402 - def rfind(self, sub, start=0, end=sys.maxint):
403 """Find from right method, like that of a python string.
404
405 This behaves like the python string method of the same name.
406
407 Returns an integer, the index of the last (right most) occurrence of
408 substring argument sub in the (sub)sequence given by [start:end].
409
410 Arguments:
411 - sub - a string or another Seq object to look for
412 - start - optional integer, slice start
413 - end - optional integer, slice end
414
415 Returns -1 if the subsequence is NOT found.
416
417 e.g. Locating the last typical start codon, AUG, in an RNA sequence:
418
419 >>> from Bio.Seq import Seq
420 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
421 >>> my_rna.rfind("AUG")
422 15
423 """
424
425 sub_str = self._get_seq_str_and_check_alphabet(sub)
426 return str(self).rfind(sub_str, start, end)
427
428 - def startswith(self, prefix, start=0, end=sys.maxint):
429 """Does the Seq start with the given prefix? Returns True/False.
430
431 This behaves like the python string method of the same name.
432
433 Return True if the sequence starts with the specified prefix
434 (a string or another Seq object), False otherwise.
435 With optional start, test sequence beginning at that position.
436 With optional end, stop comparing sequence at that position.
437 prefix can also be a tuple of strings to try. e.g.
438
439 >>> from Bio.Seq import Seq
440 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
441 >>> my_rna.startswith("GUC")
442 True
443 >>> my_rna.startswith("AUG")
444 False
445 >>> my_rna.startswith("AUG", 3)
446 True
447 >>> my_rna.startswith(("UCC","UCA","UCG"),1)
448 True
449 """
450
451 if isinstance(prefix, tuple):
452
453
454
455 prefix_strings = [self._get_seq_str_and_check_alphabet(p) \
456 for p in prefix]
457 for prefix_str in prefix_strings:
458 if str(self).startswith(prefix_str, start, end):
459 return True
460 return False
461 else:
462 prefix_str = self._get_seq_str_and_check_alphabet(prefix)
463 return str(self).startswith(prefix_str, start, end)
464
465 - def endswith(self, suffix, start=0, end=sys.maxint):
466 """Does the Seq end with the given suffix? Returns True/False.
467
468 This behaves like the python string method of the same name.
469
470 Return True if the sequence ends with the specified suffix
471 (a string or another Seq object), False otherwise.
472 With optional start, test sequence beginning at that position.
473 With optional end, stop comparing sequence at that position.
474 suffix can also be a tuple of strings to try. e.g.
475
476 >>> from Bio.Seq import Seq
477 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
478 >>> my_rna.endswith("UUG")
479 True
480 >>> my_rna.endswith("AUG")
481 False
482 >>> my_rna.endswith("AUG", 0, 18)
483 True
484 >>> my_rna.endswith(("UCC","UCA","UUG"))
485 True
486 """
487
488 if isinstance(suffix, tuple):
489
490
491
492 suffix_strings = [self._get_seq_str_and_check_alphabet(p) \
493 for p in suffix]
494 for suffix_str in suffix_strings:
495 if str(self).endswith(suffix_str, start, end):
496 return True
497 return False
498 else:
499 suffix_str = self._get_seq_str_and_check_alphabet(suffix)
500 return str(self).endswith(suffix_str, start, end)
501
502
503 - def split(self, sep=None, maxsplit=-1):
504 """Split method, like that of a python string.
505
506 This behaves like the python string method of the same name.
507
508 Return a list of the 'words' in the string (as Seq objects),
509 using sep as the delimiter string. If maxsplit is given, at
510 most maxsplit splits are done. If maxsplit is ommited, all
511 splits are made.
512
513 Following the python string method, sep will by default be any
514 white space (tabs, spaces, newlines) but this is unlikely to
515 apply to biological sequences.
516
517 e.g.
518
519 >>> from Bio.Seq import Seq
520 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
521 >>> my_aa = my_rna.translate()
522 >>> my_aa
523 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))
524 >>> my_aa.split("*")
525 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
526 >>> my_aa.split("*",1)
527 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
528
529 See also the rsplit method:
530
531 >>> my_aa.rsplit("*",1)
532 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
533 """
534
535 sep_str = self._get_seq_str_and_check_alphabet(sep)
536
537
538 return [Seq(part, self.alphabet) \
539 for part in str(self).split(sep_str, maxsplit)]
540
541 - def rsplit(self, sep=None, maxsplit=-1):
542 """Right split method, like that of a python string.
543
544 This behaves like the python string method of the same name.
545
546 Return a list of the 'words' in the string (as Seq objects),
547 using sep as the delimiter string. If maxsplit is given, at
548 most maxsplit splits are done COUNTING FROM THE RIGHT.
549 If maxsplit is ommited, all splits are made.
550
551 Following the python string method, sep will by default be any
552 white space (tabs, spaces, newlines) but this is unlikely to
553 apply to biological sequences.
554
555 e.g. print my_seq.rsplit("*",1)
556
557 See also the split method.
558 """
559
560 sep_str = self._get_seq_str_and_check_alphabet(sep)
561 return [Seq(part, self.alphabet) \
562 for part in str(self).rsplit(sep_str, maxsplit)]
563
564 - def strip(self, chars=None):
565 """Returns a new Seq object with leading and trailing ends stripped.
566
567 This behaves like the python string method of the same name.
568
569 Optional argument chars defines which characters to remove. If
570 ommitted or None (default) then as for the python string method,
571 this defaults to removing any white space.
572
573 e.g. print my_seq.strip("-")
574
575 See also the lstrip and rstrip methods.
576 """
577
578 strip_str = self._get_seq_str_and_check_alphabet(chars)
579 return Seq(str(self).strip(strip_str), self.alphabet)
580
581 - def lstrip(self, chars=None):
582 """Returns a new Seq object with leading (left) end stripped.
583
584 This behaves like the python string method of the same name.
585
586 Optional argument chars defines which characters to remove. If
587 ommitted or None (default) then as for the python string method,
588 this defaults to removing any white space.
589
590 e.g. print my_seq.lstrip("-")
591
592 See also the strip and rstrip methods.
593 """
594
595 strip_str = self._get_seq_str_and_check_alphabet(chars)
596 return Seq(str(self).lstrip(strip_str), self.alphabet)
597
598 - def rstrip(self, chars=None):
599 """Returns a new Seq object with trailing (right) end stripped.
600
601 This behaves like the python string method of the same name.
602
603 Optional argument chars defines which characters to remove. If
604 ommitted or None (default) then as for the python string method,
605 this defaults to removing any white space.
606
607 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail):
608
609 >>> from Bio.Alphabet import IUPAC
610 >>> from Bio.Seq import Seq
611 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna)
612 >>> my_seq
613 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA())
614 >>> my_seq.rstrip("A")
615 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA())
616
617 See also the strip and lstrip methods.
618 """
619
620 strip_str = self._get_seq_str_and_check_alphabet(chars)
621 return Seq(str(self).rstrip(strip_str), self.alphabet)
622
624 """Returns an upper case copy of the sequence.
625
626 >>> from Bio.Alphabet import HasStopCodon, generic_protein
627 >>> from Bio.Seq import Seq
628 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein))
629 >>> my_seq
630 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*'))
631 >>> my_seq.lower()
632 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*'))
633 >>> my_seq.upper()
634 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*'))
635
636 This will adjust the alphabet if required. See also the lower method.
637 """
638 return Seq(str(self).upper(), self.alphabet._upper())
639
641 """Returns a lower case copy of the sequence.
642
643 This will adjust the alphabet if required. Note that the IUPAC alphabets
644 are upper case only, and thus a generic alphabet must be substituted.
645
646 >>> from Bio.Alphabet import Gapped, generic_dna
647 >>> from Bio.Alphabet import IUPAC
648 >>> from Bio.Seq import Seq
649 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", Gapped(IUPAC.unambiguous_dna, "*"))
650 >>> my_seq
651 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*'))
652 >>> my_seq.lower()
653 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*'))
654
655 See also the upper method.
656 """
657 return Seq(str(self).lower(), self.alphabet._lower())
658
660 """Returns the complement sequence. New Seq object.
661
662 >>> from Bio.Seq import Seq
663 >>> from Bio.Alphabet import IUPAC
664 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna)
665 >>> my_dna
666 Seq('CCCCCGATAG', IUPACUnambiguousDNA())
667 >>> my_dna.complement()
668 Seq('GGGGGCTATC', IUPACUnambiguousDNA())
669
670 You can of course used mixed case sequences,
671
672 >>> from Bio.Seq import Seq
673 >>> from Bio.Alphabet import generic_dna
674 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna)
675 >>> my_dna
676 Seq('CCCCCgatA-GD', DNAAlphabet())
677 >>> my_dna.complement()
678 Seq('GGGGGctaT-CH', DNAAlphabet())
679
680 Note in the above example, ambiguous character D denotes
681 G, A or T so its complement is H (for C, T or A).
682
683 Trying to complement a protein sequence raises an exception.
684
685 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
686 >>> my_protein.complement()
687 Traceback (most recent call last):
688 ...
689 ValueError: Proteins do not have complements!
690 """
691 base = Alphabet._get_base_alphabet(self.alphabet)
692 if isinstance(base, Alphabet.ProteinAlphabet):
693 raise ValueError("Proteins do not have complements!")
694 if isinstance(base, Alphabet.DNAAlphabet):
695 ttable = _dna_complement_table
696 elif isinstance(base, Alphabet.RNAAlphabet):
697 ttable = _rna_complement_table
698 elif ('U' in self._data or 'u' in self._data) \
699 and ('T' in self._data or 't' in self._data):
700
701 raise ValueError("Mixed RNA/DNA found")
702 elif 'U' in self._data or 'u' in self._data:
703 ttable = _rna_complement_table
704 else:
705 ttable = _dna_complement_table
706
707
708 return Seq(str(self).translate(ttable), self.alphabet)
709
711 """Returns the reverse complement sequence. New Seq object.
712
713 >>> from Bio.Seq import Seq
714 >>> from Bio.Alphabet import IUPAC
715 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna)
716 >>> my_dna
717 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA())
718 >>> my_dna.reverse_complement()
719 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA())
720
721 Note in the above example, since R = G or A, its complement
722 is Y (which denotes C or T).
723
724 You can of course used mixed case sequences,
725
726 >>> from Bio.Seq import Seq
727 >>> from Bio.Alphabet import generic_dna
728 >>> my_dna = Seq("CCCCCgatA-G", generic_dna)
729 >>> my_dna
730 Seq('CCCCCgatA-G', DNAAlphabet())
731 >>> my_dna.reverse_complement()
732 Seq('C-TatcGGGGG', DNAAlphabet())
733
734 Trying to complement a protein sequence raises an exception:
735
736 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
737 >>> my_protein.reverse_complement()
738 Traceback (most recent call last):
739 ...
740 ValueError: Proteins do not have complements!
741 """
742
743 return self.complement()[::-1]
744
746 """Returns the RNA sequence from a DNA sequence. New Seq object.
747
748 >>> from Bio.Seq import Seq
749 >>> from Bio.Alphabet import IUPAC
750 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",
751 ... IUPAC.unambiguous_dna)
752 >>> coding_dna
753 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
754 >>> coding_dna.transcribe()
755 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
756
757 Trying to transcribe a protein or RNA sequence raises an exception:
758
759 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
760 >>> my_protein.transcribe()
761 Traceback (most recent call last):
762 ...
763 ValueError: Proteins cannot be transcribed!
764 """
765 base = Alphabet._get_base_alphabet(self.alphabet)
766 if isinstance(base, Alphabet.ProteinAlphabet):
767 raise ValueError("Proteins cannot be transcribed!")
768 if isinstance(base, Alphabet.RNAAlphabet):
769 raise ValueError("RNA cannot be transcribed!")
770
771 if self.alphabet==IUPAC.unambiguous_dna:
772 alphabet = IUPAC.unambiguous_rna
773 elif self.alphabet==IUPAC.ambiguous_dna:
774 alphabet = IUPAC.ambiguous_rna
775 else:
776 alphabet = Alphabet.generic_rna
777 return Seq(str(self).replace('T','U').replace('t','u'), alphabet)
778
780 """Returns the DNA sequence from an RNA sequence. New Seq object.
781
782 >>> from Bio.Seq import Seq
783 >>> from Bio.Alphabet import IUPAC
784 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG",
785 ... IUPAC.unambiguous_rna)
786 >>> messenger_rna
787 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
788 >>> messenger_rna.back_transcribe()
789 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
790
791 Trying to back-transcribe a protein or DNA sequence raises an
792 exception:
793
794 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
795 >>> my_protein.back_transcribe()
796 Traceback (most recent call last):
797 ...
798 ValueError: Proteins cannot be back transcribed!
799 """
800 base = Alphabet._get_base_alphabet(self.alphabet)
801 if isinstance(base, Alphabet.ProteinAlphabet):
802 raise ValueError("Proteins cannot be back transcribed!")
803 if isinstance(base, Alphabet.DNAAlphabet):
804 raise ValueError("DNA cannot be back transcribed!")
805
806 if self.alphabet==IUPAC.unambiguous_rna:
807 alphabet = IUPAC.unambiguous_dna
808 elif self.alphabet==IUPAC.ambiguous_rna:
809 alphabet = IUPAC.ambiguous_dna
810 else:
811 alphabet = Alphabet.generic_dna
812 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
813
814 - def translate(self, table="Standard", stop_symbol="*", to_stop=False,
815 cds=False):
816 """Turns a nucleotide sequence into a protein sequence. New Seq object.
817
818 This method will translate DNA or RNA sequences, and those with a
819 nucleotide or generic alphabet. Trying to translate a protein
820 sequence raises an exception.
821
822 Arguments:
823 - table - Which codon table to use? This can be either a name
824 (string) or an NCBI identifier (integer). This defaults
825 to the "Standard" table.
826 - stop_symbol - Single character string, what to use for terminators.
827 This defaults to the asterisk, "*".
828 - to_stop - Boolean, defaults to False meaning do a full translation
829 continuing on past any stop codons (translated as the
830 specified stop_symbol). If True, translation is
831 terminated at the first in frame stop codon (and the
832 stop_symbol is not appended to the returned protein
833 sequence).
834 - cds - Boolean, indicates this is a complete CDS. If True,
835 this checks the sequence starts with a valid alternative start
836 codon (which will be translated as methionine, M), that the
837 sequence length is a multiple of three, and that there is a
838 single in frame stop codon at the end (this will be excluded
839 from the protein sequence, regardless of the to_stop option).
840 If these tests fail, an exception is raised.
841
842 e.g. Using the standard table:
843
844 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
845 >>> coding_dna.translate()
846 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
847 >>> coding_dna.translate(stop_symbol="@")
848 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@'))
849 >>> coding_dna.translate(to_stop=True)
850 Seq('VAIVMGR', ExtendedIUPACProtein())
851
852 Now using NCBI table 2, where TGA is not a stop codon:
853
854 >>> coding_dna.translate(table=2)
855 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
856 >>> coding_dna.translate(table=2, to_stop=True)
857 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein())
858
859 In fact, GTG is an alternative start codon under NCBI table 2, meaning
860 this sequence could be a complete CDS:
861
862 >>> coding_dna.translate(table=2, cds=True)
863 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein())
864
865 It isn't a valid CDS under NCBI table 1, due to both the start codon and
866 also the in frame stop codons:
867
868 >>> coding_dna.translate(table=1, cds=True)
869 Traceback (most recent call last):
870 ...
871 TranslationError: First codon 'GTG' is not a start codon
872
873 If the sequence has no in-frame stop codon, then the to_stop argument
874 has no effect:
875
876 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
877 >>> coding_dna2.translate()
878 Seq('LAIVMGR', ExtendedIUPACProtein())
879 >>> coding_dna2.translate(to_stop=True)
880 Seq('LAIVMGR', ExtendedIUPACProtein())
881
882 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
883 or a stop codon. These are translated as "X". Any invalid codon
884 (e.g. "TA?" or "T-A") will throw a TranslationError.
885
886 NOTE - Does NOT support gapped sequences.
887
888 NOTE - This does NOT behave like the python string's translate
889 method. For that use str(my_seq).translate(...) instead.
890 """
891 try:
892 table_id = int(table)
893 except ValueError:
894 table_id = None
895 if isinstance(table, str) and len(table)==256:
896 raise ValueError("The Seq object translate method DOES NOT take " \
897 + "a 256 character string mapping table like " \
898 + "the python string object's translate method. " \
899 + "Use str(my_seq).translate(...) instead.")
900 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
901 Alphabet.ProteinAlphabet):
902 raise ValueError("Proteins cannot be translated!")
903 if self.alphabet==IUPAC.unambiguous_dna:
904
905 if table_id is None:
906 codon_table = CodonTable.unambiguous_dna_by_name[table]
907 else:
908 codon_table = CodonTable.unambiguous_dna_by_id[table_id]
909 elif self.alphabet==IUPAC.unambiguous_rna:
910
911 if table_id is None:
912 codon_table = CodonTable.unambiguous_rna_by_name[table]
913 else:
914 codon_table = CodonTable.unambiguous_rna_by_id[table_id]
915 else:
916
917
918
919 if table_id is None:
920 codon_table = CodonTable.ambiguous_generic_by_name[table]
921 else:
922 codon_table = CodonTable.ambiguous_generic_by_id[table_id]
923 protein = _translate_str(str(self), codon_table, \
924 stop_symbol, to_stop, cds)
925 if stop_symbol in protein:
926 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet,
927 stop_symbol = stop_symbol)
928 else:
929 alphabet = codon_table.protein_alphabet
930 return Seq(protein, alphabet)
931
932 - def ungap(self, gap=None):
933 """Return a copy of the sequence without the gap character(s).
934
935 The gap character can be specified in two ways - either as an explicit
936 argument, or via the sequence's alphabet. For example:
937
938 >>> from Bio.Seq import Seq
939 >>> from Bio.Alphabet import generic_dna
940 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna)
941 >>> my_dna
942 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet())
943 >>> my_dna.ungap("-")
944 Seq('ATATGAAATTTGAAAA', DNAAlphabet())
945
946 If the gap character is not given as an argument, it will be taken from
947 the sequence's alphabet (if defined). Notice that the returned sequence's
948 alphabet is adjusted since it no longer requires a gapped alphabet:
949
950 >>> from Bio.Seq import Seq
951 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon
952 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "=")))
953 >>> my_pro
954 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*'))
955 >>> my_pro.ungap()
956 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*'))
957
958 Or, with a simpler gapped DNA example:
959
960 >>> from Bio.Seq import Seq
961 >>> from Bio.Alphabet import IUPAC, Gapped
962 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "="))
963 >>> my_seq
964 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '='))
965 >>> my_seq.ungap()
966 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA())
967
968 As long as it is consistent with the alphabet, although it is redundant,
969 you can still supply the gap character as an argument to this method:
970
971 >>> my_seq
972 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '='))
973 >>> my_seq.ungap("=")
974 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA())
975
976 However, if the gap character given as the argument disagrees with that
977 declared in the alphabet, an exception is raised:
978
979 >>> my_seq
980 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '='))
981 >>> my_seq.ungap("-")
982 Traceback (most recent call last):
983 ...
984 ValueError: Gap '-' does not match '=' from alphabet
985
986 Finally, if a gap character is not supplied, and the alphabet does not
987 define one, an exception is raised:
988
989 >>> from Bio.Seq import Seq
990 >>> from Bio.Alphabet import generic_dna
991 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna)
992 >>> my_dna
993 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet())
994 >>> my_dna.ungap()
995 Traceback (most recent call last):
996 ...
997 ValueError: Gap character not given and not defined in alphabet
998
999 """
1000 if hasattr(self.alphabet, "gap_char"):
1001 if not gap:
1002 gap = self.alphabet.gap_char
1003 elif gap != self.alphabet.gap_char:
1004 raise ValueError("Gap %s does not match %s from alphabet" \
1005 % (repr(gap), repr(self.alphabet.gap_char)))
1006 alpha = Alphabet._ungap(self.alphabet)
1007 elif not gap:
1008 raise ValueError("Gap character not given and not defined in alphabet")
1009 else:
1010 alpha = self.alphabet
1011 if len(gap)!=1 or not isinstance(gap, str):
1012 raise ValueError("Unexpected gap character, %s" % repr(gap))
1013 return Seq(str(self).replace(gap, ""), alpha)
1014
1016 """A read-only sequence object of known length but unknown contents.
1017
1018 If you have an unknown sequence, you can represent this with a normal
1019 Seq object, for example:
1020
1021 >>> my_seq = Seq("N"*5)
1022 >>> my_seq
1023 Seq('NNNNN', Alphabet())
1024 >>> len(my_seq)
1025 5
1026 >>> print my_seq
1027 NNNNN
1028
1029 However, this is rather wasteful of memory (especially for large
1030 sequences), which is where this class is most usefull:
1031
1032 >>> unk_five = UnknownSeq(5)
1033 >>> unk_five
1034 UnknownSeq(5, alphabet = Alphabet(), character = '?')
1035 >>> len(unk_five)
1036 5
1037 >>> print(unk_five)
1038 ?????
1039
1040 You can add unknown sequence together, provided their alphabets and
1041 characters are compatible, and get another memory saving UnknownSeq:
1042
1043 >>> unk_four = UnknownSeq(4)
1044 >>> unk_four
1045 UnknownSeq(4, alphabet = Alphabet(), character = '?')
1046 >>> unk_four + unk_five
1047 UnknownSeq(9, alphabet = Alphabet(), character = '?')
1048
1049 If the alphabet or characters don't match up, the addition gives an
1050 ordinary Seq object:
1051
1052 >>> unk_nnnn = UnknownSeq(4, character = "N")
1053 >>> unk_nnnn
1054 UnknownSeq(4, alphabet = Alphabet(), character = 'N')
1055 >>> unk_nnnn + unk_four
1056 Seq('NNNN????', Alphabet())
1057
1058 Combining with a real Seq gives a new Seq object:
1059
1060 >>> known_seq = Seq("ACGT")
1061 >>> unk_four + known_seq
1062 Seq('????ACGT', Alphabet())
1063 >>> known_seq + unk_four
1064 Seq('ACGT????', Alphabet())
1065 """
1067 """Create a new UnknownSeq object.
1068
1069 If character is ommited, it is determed from the alphabet, "N" for
1070 nucleotides, "X" for proteins, and "?" otherwise.
1071 """
1072 self._length = int(length)
1073 if self._length < 0:
1074
1075 raise ValueError("Length must not be negative.")
1076 self.alphabet = alphabet
1077 if character:
1078 if len(character) != 1:
1079 raise ValueError("character argument should be a single letter string.")
1080 self._character = character
1081 else:
1082 base = Alphabet._get_base_alphabet(alphabet)
1083
1084
1085 if isinstance(base, Alphabet.NucleotideAlphabet):
1086 self._character = "N"
1087 elif isinstance(base, Alphabet.ProteinAlphabet):
1088 self._character = "X"
1089 else:
1090 self._character = "?"
1091
1093 """Returns the stated length of the unknown sequence."""
1094 return self._length
1095
1097 """Returns the unknown sequence as full string of the given length."""
1098 return self._character * self._length
1099
1101 return "UnknownSeq(%i, alphabet = %s, character = %s)" \
1102 % (self._length, repr(self.alphabet), repr(self._character))
1103
1105 """Add another sequence or string to this sequence.
1106
1107 Adding two UnknownSeq objects returns another UnknownSeq object
1108 provided the character is the same and the alphabets are compatible.
1109
1110 >>> from Bio.Seq import UnknownSeq
1111 >>> from Bio.Alphabet import generic_protein
1112 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein)
1113 UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X')
1114
1115 If the characters differ, an UnknownSeq object cannot be used, so a
1116 Seq object is returned:
1117
1118 >>> from Bio.Seq import UnknownSeq
1119 >>> from Bio.Alphabet import generic_protein
1120 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein,
1121 ... character="x")
1122 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet())
1123
1124 If adding a string to an UnknownSeq, a new Seq is returned with the
1125 same alphabet:
1126
1127 >>> from Bio.Seq import UnknownSeq
1128 >>> from Bio.Alphabet import generic_protein
1129 >>> UnknownSeq(5, generic_protein) + "LV"
1130 Seq('XXXXXLV', ProteinAlphabet())
1131 """
1132 if isinstance(other, UnknownSeq) \
1133 and other._character == self._character:
1134
1135 return UnknownSeq(len(self)+len(other),
1136 self.alphabet, self._character)
1137
1138 return Seq(str(self), self.alphabet) + other
1139
1141
1142
1143 return other + Seq(str(self), self.alphabet)
1144
1153
1154 - def count(self, sub, start=0, end=sys.maxint):
1155 """Non-overlapping count method, like that of a python string.
1156
1157 This behaves like the python string (and Seq object) method of the
1158 same name, which does a non-overlapping count!
1159
1160 Returns an integer, the number of occurrences of substring
1161 argument sub in the (sub)sequence given by [start:end].
1162 Optional arguments start and end are interpreted as in slice
1163 notation.
1164
1165 Arguments:
1166 - sub - a string or another Seq object to look for
1167 - start - optional integer, slice start
1168 - end - optional integer, slice end
1169
1170 >>> "NNNN".count("N")
1171 4
1172 >>> Seq("NNNN").count("N")
1173 4
1174 >>> UnknownSeq(4, character="N").count("N")
1175 4
1176 >>> UnknownSeq(4, character="N").count("A")
1177 0
1178 >>> UnknownSeq(4, character="N").count("AA")
1179 0
1180
1181 HOWEVER, please note because that python strings and Seq objects (and
1182 MutableSeq objects) do a non-overlapping search, this may not give
1183 the answer you expect:
1184
1185 >>> UnknownSeq(4, character="N").count("NN")
1186 2
1187 >>> UnknownSeq(4, character="N").count("NNN")
1188 1
1189 """
1190 sub_str = self._get_seq_str_and_check_alphabet(sub)
1191 if len(sub_str) == 1:
1192 if str(sub_str) == self._character:
1193 if start==0 and end >= self._length:
1194 return self._length
1195 else:
1196
1197 return str(self).count(sub_str, start, end)
1198 else:
1199 return 0
1200 else:
1201 if set(sub_str) == set(self._character):
1202 if start==0 and end >= self._length:
1203 return self._length // len(sub_str)
1204 else:
1205
1206 return str(self).count(sub_str, start, end)
1207 else:
1208 return 0
1209
1211 """The complement of an unknown nucleotide equals itself.
1212
1213 >>> my_nuc = UnknownSeq(8)
1214 >>> my_nuc
1215 UnknownSeq(8, alphabet = Alphabet(), character = '?')
1216 >>> print my_nuc
1217 ????????
1218 >>> my_nuc.complement()
1219 UnknownSeq(8, alphabet = Alphabet(), character = '?')
1220 >>> print my_nuc.complement()
1221 ????????
1222 """
1223 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1224 Alphabet.ProteinAlphabet):
1225 raise ValueError("Proteins do not have complements!")
1226 return self
1227
1229 """The reverse complement of an unknown nucleotide equals itself.
1230
1231 >>> my_nuc = UnknownSeq(10)
1232 >>> my_nuc
1233 UnknownSeq(10, alphabet = Alphabet(), character = '?')
1234 >>> print my_nuc
1235 ??????????
1236 >>> my_nuc.reverse_complement()
1237 UnknownSeq(10, alphabet = Alphabet(), character = '?')
1238 >>> print my_nuc.reverse_complement()
1239 ??????????
1240 """
1241 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1242 Alphabet.ProteinAlphabet):
1243 raise ValueError("Proteins do not have complements!")
1244 return self
1245
1247 """Returns unknown RNA sequence from an unknown DNA sequence.
1248
1249 >>> my_dna = UnknownSeq(10, character="N")
1250 >>> my_dna
1251 UnknownSeq(10, alphabet = Alphabet(), character = 'N')
1252 >>> print my_dna
1253 NNNNNNNNNN
1254 >>> my_rna = my_dna.transcribe()
1255 >>> my_rna
1256 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N')
1257 >>> print my_rna
1258 NNNNNNNNNN
1259 """
1260
1261 s = Seq(self._character, self.alphabet).transcribe()
1262 return UnknownSeq(self._length, s.alphabet, self._character)
1263
1265 """Returns unknown DNA sequence from an unknown RNA sequence.
1266
1267 >>> my_rna = UnknownSeq(20, character="N")
1268 >>> my_rna
1269 UnknownSeq(20, alphabet = Alphabet(), character = 'N')
1270 >>> print my_rna
1271 NNNNNNNNNNNNNNNNNNNN
1272 >>> my_dna = my_rna.back_transcribe()
1273 >>> my_dna
1274 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1275 >>> print my_dna
1276 NNNNNNNNNNNNNNNNNNNN
1277 """
1278
1279 s = Seq(self._character, self.alphabet).back_transcribe()
1280 return UnknownSeq(self._length, s.alphabet, self._character)
1281
1283 """Returns an upper case copy of the sequence.
1284
1285 >>> from Bio.Alphabet import generic_dna
1286 >>> from Bio.Seq import UnknownSeq
1287 >>> my_seq = UnknownSeq(20, generic_dna, character="n")
1288 >>> my_seq
1289 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n')
1290 >>> print my_seq
1291 nnnnnnnnnnnnnnnnnnnn
1292 >>> my_seq.upper()
1293 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1294 >>> print my_seq.upper()
1295 NNNNNNNNNNNNNNNNNNNN
1296
1297 This will adjust the alphabet if required. See also the lower method.
1298 """
1299 return UnknownSeq(self._length, self.alphabet._upper(), self._character.upper())
1300
1302 """Returns a lower case copy of the sequence.
1303
1304 This will adjust the alphabet if required:
1305
1306 >>> from Bio.Alphabet import IUPAC
1307 >>> from Bio.Seq import UnknownSeq
1308 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein)
1309 >>> my_seq
1310 UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X')
1311 >>> print my_seq
1312 XXXXXXXXXXXXXXXXXXXX
1313 >>> my_seq.lower()
1314 UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x')
1315 >>> print my_seq.lower()
1316 xxxxxxxxxxxxxxxxxxxx
1317
1318 See also the upper method.
1319 """
1320 return UnknownSeq(self._length, self.alphabet._lower(), self._character.lower())
1321
1323 """Translate an unknown nucleotide sequence into an unknown protein.
1324
1325 e.g.
1326
1327 >>> my_seq = UnknownSeq(11, character="N")
1328 >>> print my_seq
1329 NNNNNNNNNNN
1330 >>> my_protein = my_seq.translate()
1331 >>> my_protein
1332 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X')
1333 >>> print my_protein
1334 XXX
1335
1336 In comparison, using a normal Seq object:
1337
1338 >>> my_seq = Seq("NNNNNNNNNNN")
1339 >>> print my_seq
1340 NNNNNNNNNNN
1341 >>> my_protein = my_seq.translate()
1342 >>> my_protein
1343 Seq('XXX', ExtendedIUPACProtein())
1344 >>> print my_protein
1345 XXX
1346
1347 """
1348 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1349 Alphabet.ProteinAlphabet):
1350 raise ValueError("Proteins cannot be translated!")
1351 return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1352
1353 - def ungap(self, gap=None):
1354 """Return a copy of the sequence without the gap character(s).
1355
1356 The gap character can be specified in two ways - either as an explicit
1357 argument, or via the sequence's alphabet. For example:
1358
1359 >>> from Bio.Seq import UnknownSeq
1360 >>> from Bio.Alphabet import Gapped, generic_dna
1361 >>> my_dna = UnknownSeq(20, Gapped(generic_dna,"-"))
1362 >>> my_dna
1363 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N')
1364 >>> my_dna.ungap()
1365 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1366 >>> my_dna.ungap("-")
1367 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1368
1369 If the UnknownSeq is using the gap character, then an empty Seq is
1370 returned:
1371
1372 >>> my_gap = UnknownSeq(20, Gapped(generic_dna,"-"), character="-")
1373 >>> my_gap
1374 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-')
1375 >>> my_gap.ungap()
1376 Seq('', DNAAlphabet())
1377 >>> my_gap.ungap("-")
1378 Seq('', DNAAlphabet())
1379
1380 Notice that the returned sequence's alphabet is adjusted to remove any
1381 explicit gap character declaration.
1382 """
1383
1384 s = Seq(self._character, self.alphabet).ungap()
1385 if s :
1386 return UnknownSeq(self._length, s.alphabet, self._character)
1387 else :
1388 return Seq("", s.alphabet)
1389
1391 """An editable sequence object (with an alphabet).
1392
1393 Unlike normal python strings and our basic sequence object (the Seq class)
1394 which are immuatable, the MutableSeq lets you edit the sequence in place.
1395 However, this means you cannot use a MutableSeq object as a dictionary key.
1396
1397 >>> from Bio.Seq import MutableSeq
1398 >>> from Bio.Alphabet import generic_dna
1399 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna)
1400 >>> my_seq
1401 MutableSeq('ACTCGTCGTCG', DNAAlphabet())
1402 >>> my_seq[5]
1403 'T'
1404 >>> my_seq[5] = "A"
1405 >>> my_seq
1406 MutableSeq('ACTCGACGTCG', DNAAlphabet())
1407 >>> my_seq[5]
1408 'A'
1409 >>> my_seq[5:8] = "NNN"
1410 >>> my_seq
1411 MutableSeq('ACTCGNNNTCG', DNAAlphabet())
1412 >>> len(my_seq)
1413 11
1414
1415 Note that the MutableSeq object does not support as many string-like
1416 or biological methods as the Seq object.
1417 """
1424
1426 """Returns a (truncated) representation of the sequence for debugging."""
1427 if len(self) > 60:
1428
1429
1430
1431 return "%s('%s...%s', %s)" % (self.__class__.__name__,
1432 str(self[:54]), str(self[-3:]),
1433 repr(self.alphabet))
1434 else:
1435 return "%s('%s', %s)" % (self.__class__.__name__,
1436 str(self),
1437 repr(self.alphabet))
1438
1440 """Returns the full sequence as a python string.
1441
1442 Note that Biopython 1.44 and earlier would give a truncated
1443 version of repr(my_seq) for str(my_seq). If you are writing code
1444 which needs to be backwards compatible with old Biopython, you
1445 should continue to use my_seq.tostring() rather than str(my_seq).
1446 """
1447
1448 return "".join(self.data)
1449
1451 """Compare the sequence for to another sequence or a string.
1452
1453 If compared to another sequence the alphabets must be compatible.
1454 Comparing DNA to RNA, or Nucleotide to Protein will raise an
1455 exception.
1456
1457 Otherwise only the sequence itself is compared, not the precise
1458 alphabet.
1459
1460 This method indirectly supports ==, < , etc."""
1461 if hasattr(other, "alphabet"):
1462
1463 if not Alphabet._check_type_compatible([self.alphabet,
1464 other.alphabet]):
1465 raise TypeError("Incompatable alphabets %s and %s" \
1466 % (repr(self.alphabet), repr(other.alphabet)))
1467
1468 if isinstance(other, MutableSeq):
1469
1470
1471 return cmp(self.data, other.data)
1472 else:
1473 return cmp(str(self), str(other))
1474 elif isinstance(other, basestring):
1475 return cmp(str(self), other)
1476 else:
1477 raise TypeError
1478
1480
1491
1507
1509
1510
1511
1512
1513
1514 del self.data[index]
1515
1517 """Add another sequence or string to this sequence.
1518
1519 Returns a new MutableSeq object."""
1520 if hasattr(other, "alphabet"):
1521
1522 if not Alphabet._check_type_compatible([self.alphabet,
1523 other.alphabet]):
1524 raise TypeError("Incompatable alphabets %s and %s" \
1525 % (repr(self.alphabet), repr(other.alphabet)))
1526
1527 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
1528 if isinstance(other, MutableSeq):
1529
1530
1531 return self.__class__(self.data + other.data, a)
1532 else:
1533 return self.__class__(str(self) + str(other), a)
1534 elif isinstance(other, basestring):
1535
1536 return self.__class__(str(self) + str(other), self.alphabet)
1537 else:
1538 raise TypeError
1539
1560
1563
1566
1567 - def pop(self, i = (-1)):
1568 c = self.data[i]
1569 del self.data[i]
1570 return c
1571
1573 for i in range(len(self.data)):
1574 if self.data[i] == item:
1575 del self.data[i]
1576 return
1577 raise ValueError("MutableSeq.remove(x): x not in list")
1578
1579 - def count(self, sub, start=0, end=sys.maxint):
1580 """Non-overlapping count method, like that of a python string.
1581
1582 This behaves like the python string method of the same name,
1583 which does a non-overlapping count!
1584
1585 Returns an integer, the number of occurrences of substring
1586 argument sub in the (sub)sequence given by [start:end].
1587 Optional arguments start and end are interpreted as in slice
1588 notation.
1589
1590 Arguments:
1591 - sub - a string or another Seq object to look for
1592 - start - optional integer, slice start
1593 - end - optional integer, slice end
1594
1595 e.g.
1596
1597 >>> from Bio.Seq import MutableSeq
1598 >>> my_mseq = MutableSeq("AAAATGA")
1599 >>> print my_mseq.count("A")
1600 5
1601 >>> print my_mseq.count("ATG")
1602 1
1603 >>> print my_mseq.count(Seq("AT"))
1604 1
1605 >>> print my_mseq.count("AT", 2, -1)
1606 1
1607
1608 HOWEVER, please note because that python strings, Seq objects and
1609 MutableSeq objects do a non-overlapping search, this may not give
1610 the answer you expect:
1611
1612 >>> "AAAA".count("AA")
1613 2
1614 >>> print MutableSeq("AAAA").count("AA")
1615 2
1616
1617 A non-overlapping search would give the answer as three!
1618 """
1619 try:
1620
1621 search = sub.tostring()
1622 except AttributeError:
1623 search = sub
1624
1625 if not isinstance(search, basestring):
1626 raise TypeError("expected a string, Seq or MutableSeq")
1627
1628 if len(search) == 1:
1629
1630 count = 0
1631 for c in self.data[start:end]:
1632 if c == search: count += 1
1633 return count
1634 else:
1635
1636 return self.tostring().count(search, start, end)
1637
1639 for i in range(len(self.data)):
1640 if self.data[i] == item:
1641 return i
1642 raise ValueError("MutableSeq.index(x): x not in list")
1643
1645 """Modify the mutable sequence to reverse itself.
1646
1647 No return value.
1648 """
1649 self.data.reverse()
1650
1676
1678 """Modify the mutable sequence to take on its reverse complement.
1679
1680 Trying to reverse complement a protein sequence raises an exception.
1681
1682 No return value.
1683 """
1684 self.complement()
1685 self.data.reverse()
1686
1687
1688
1689
1697
1699 """Returns the full sequence as a python string.
1700
1701 Although not formally deprecated, you are now encouraged to use
1702 str(my_seq) instead of my_seq.tostring().
1703
1704 Because str(my_seq) will give you the full sequence as a python string,
1705 there is often no need to make an explicit conversion. For example,
1706
1707 print "ID={%s}, sequence={%s}" % (my_name, my_seq)
1708
1709 On Biopython 1.44 or older you would have to have done this:
1710
1711 print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())
1712 """
1713 return "".join(self.data)
1714
1716 """Returns the full sequence as a new immutable Seq object.
1717
1718 >>> from Bio.Seq import Seq
1719 >>> from Bio.Alphabet import IUPAC
1720 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", \
1721 IUPAC.protein)
1722 >>> my_mseq
1723 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1724 >>> my_mseq.toseq()
1725 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1726
1727 Note that the alphabet is preserved.
1728 """
1729 return Seq("".join(self.data), self.alphabet)
1730
1736 """Transcribes a DNA sequence into RNA.
1737
1738 If given a string, returns a new string object.
1739
1740 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1741
1742 Trying to transcribe a protein or RNA sequence raises an exception.
1743
1744 e.g.
1745
1746 >>> transcribe("ACTGN")
1747 'ACUGN'
1748 """
1749 if isinstance(dna, Seq):
1750 return dna.transcribe()
1751 elif isinstance(dna, MutableSeq):
1752 return dna.toseq().transcribe()
1753 else:
1754 return dna.replace('T','U').replace('t','u')
1755
1757 """Back-transcribes an RNA sequence into DNA.
1758
1759 If given a string, returns a new string object.
1760
1761 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1762
1763 Trying to transcribe a protein or DNA sequence raises an exception.
1764
1765 e.g.
1766
1767 >>> back_transcribe("ACUGN")
1768 'ACTGN'
1769 """
1770 if isinstance(rna, Seq):
1771 return rna.back_transcribe()
1772 elif isinstance(rna, MutableSeq):
1773 return rna.toseq().back_transcribe()
1774 else:
1775 return rna.replace('U','T').replace('u','t')
1776
1777 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False,
1778 cds=False, pos_stop="X"):
1779 """Helper function to translate a nucleotide string (PRIVATE).
1780
1781 Arguments:
1782 - sequence - a string
1783 - table - a CodonTable object (NOT a table name or id number)
1784 - stop_symbol - a single character string, what to use for terminators.
1785 - to_stop - boolean, should translation terminate at the first
1786 in frame stop codon? If there is no in-frame stop codon
1787 then translation continues to the end.
1788 - pos_stop - a single character string for a possible stop codon
1789 (e.g. TAN or NNN)
1790 - cds - Boolean, indicates this is a complete CDS. If True, this
1791 checks the sequence starts with a valid alternative start
1792 codon (which will be translated as methionine, M), that the
1793 sequence length is a multiple of three, and that there is a
1794 single in frame stop codon at the end (this will be excluded
1795 from the protein sequence, regardless of the to_stop option).
1796 If these tests fail, an exception is raised.
1797
1798 Returns a string.
1799
1800 e.g.
1801
1802 >>> from Bio.Data import CodonTable
1803 >>> table = CodonTable.ambiguous_dna_by_id[1]
1804 >>> _translate_str("AAA", table)
1805 'K'
1806 >>> _translate_str("TAR", table)
1807 '*'
1808 >>> _translate_str("TAN", table)
1809 'X'
1810 >>> _translate_str("TAN", table, pos_stop="@")
1811 '@'
1812 >>> _translate_str("TA?", table)
1813 Traceback (most recent call last):
1814 ...
1815 TranslationError: Codon 'TA?' is invalid
1816 >>> _translate_str("ATGCCCTAG", table, cds=True)
1817 'MP'
1818 >>> _translate_str("AAACCCTAG", table, cds=True)
1819 Traceback (most recent call last):
1820 ...
1821 TranslationError: First codon 'AAA' is not a start codon
1822 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
1823 Traceback (most recent call last):
1824 ...
1825 TranslationError: Extra in frame stop codon found.
1826 """
1827 sequence = sequence.upper()
1828 amino_acids = []
1829 forward_table = table.forward_table
1830 stop_codons = table.stop_codons
1831 if table.nucleotide_alphabet.letters is not None:
1832 valid_letters = set(table.nucleotide_alphabet.letters.upper())
1833 else:
1834
1835 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \
1836 IUPAC.ambiguous_rna.letters.upper())
1837 if cds:
1838 if str(sequence[:3]).upper() not in table.start_codons:
1839 raise CodonTable.TranslationError(\
1840 "First codon '%s' is not a start codon" % sequence[:3])
1841 if len(sequence) % 3 != 0:
1842 raise CodonTable.TranslationError(\
1843 "Sequence length %i is not a multiple of three" % len(sequence))
1844 if str(sequence[-3:]).upper() not in stop_codons:
1845 raise CodonTable.TranslationError(\
1846 "Final codon '%s' is not a stop codon" % sequence[-3:])
1847
1848 sequence = sequence[3:-3]
1849 amino_acids = ["M"]
1850 n = len(sequence)
1851 for i in xrange(0,n-n%3,3):
1852 codon = sequence[i:i+3]
1853 try:
1854 amino_acids.append(forward_table[codon])
1855 except (KeyError, CodonTable.TranslationError):
1856
1857 if codon in table.stop_codons:
1858 if cds:
1859 raise CodonTable.TranslationError(\
1860 "Extra in frame stop codon found.")
1861 if to_stop : break
1862 amino_acids.append(stop_symbol)
1863 elif valid_letters.issuperset(set(codon)):
1864
1865 amino_acids.append(pos_stop)
1866 else:
1867 raise CodonTable.TranslationError(\
1868 "Codon '%s' is invalid" % codon)
1869 return "".join(amino_acids)
1870
1871 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False,
1872 cds=False):
1873 """Translate a nucleotide sequence into amino acids.
1874
1875 If given a string, returns a new string object. Given a Seq or
1876 MutableSeq, returns a Seq object with a protein alphabet.
1877
1878 Arguments:
1879 - table - Which codon table to use? This can be either a name
1880 (string) or an NCBI identifier (integer). Defaults
1881 to the "Standard" table.
1882 - stop_symbol - Single character string, what to use for any
1883 terminators, defaults to the asterisk, "*".
1884 - to_stop - Boolean, defaults to False meaning do a full
1885 translation continuing on past any stop codons
1886 (translated as the specified stop_symbol). If
1887 True, translation is terminated at the first in
1888 frame stop codon (and the stop_symbol is not
1889 appended to the returned protein sequence).
1890 - cds - Boolean, indicates this is a complete CDS. If True, this
1891 checks the sequence starts with a valid alternative start
1892 codon (which will be translated as methionine, M), that the
1893 sequence length is a multiple of three, and that there is a
1894 single in frame stop codon at the end (this will be excluded
1895 from the protein sequence, regardless of the to_stop option).
1896 If these tests fail, an exception is raised.
1897
1898 A simple string example using the default (standard) genetic code:
1899
1900 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
1901 >>> translate(coding_dna)
1902 'VAIVMGR*KGAR*'
1903 >>> translate(coding_dna, stop_symbol="@")
1904 'VAIVMGR@KGAR@'
1905 >>> translate(coding_dna, to_stop=True)
1906 'VAIVMGR'
1907
1908 Now using NCBI table 2, where TGA is not a stop codon:
1909
1910 >>> translate(coding_dna, table=2)
1911 'VAIVMGRWKGAR*'
1912 >>> translate(coding_dna, table=2, to_stop=True)
1913 'VAIVMGRWKGAR'
1914
1915 In fact this example uses an alternative start codon valid under NCBI table 2,
1916 GTG, which means this example is a complete valid CDS which when translated
1917 should really start with methionine (not valine):
1918
1919 >>> translate(coding_dna, table=2, cds=True)
1920 'MAIVMGRWKGAR'
1921
1922 Note that if the sequence has no in-frame stop codon, then the to_stop
1923 argument has no effect:
1924
1925 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
1926 >>> translate(coding_dna2)
1927 'VAIVMGR'
1928 >>> translate(coding_dna2, to_stop=True)
1929 'VAIVMGR'
1930
1931 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
1932 or a stop codon. These are translated as "X". Any invalid codon
1933 (e.g. "TA?" or "T-A") will throw a TranslationError.
1934
1935 NOTE - Does NOT support gapped sequences.
1936
1937 It will however translate either DNA or RNA.
1938 """
1939 if isinstance(sequence, Seq):
1940 return sequence.translate(table, stop_symbol, to_stop, cds)
1941 elif isinstance(sequence, MutableSeq):
1942
1943 return sequence.toseq().translate(table, stop_symbol, to_stop, cds)
1944 else:
1945
1946 try:
1947 codon_table = CodonTable.ambiguous_generic_by_id[int(table)]
1948 except ValueError:
1949 codon_table = CodonTable.ambiguous_generic_by_name[table]
1950 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds)
1951
1985
1987 """Run the Bio.Seq module's doctests."""
1988 print "Runing doctests..."
1989 import doctest
1990 doctest.testmod()
1991 print "Done"
1992
1993 if __name__ == "__main__":
1994 _test()
1995