Package Bio :: Package Alphabet
[hide private]
[frames | no frames]

Source Code for Package Bio.Alphabet

  1  # Copyright 2000-2002 by Andrew Dalke. 
  2  # Revisions copyright 2007-2008 by Peter Cock. 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Alphabets used in Seq objects etc to declare sequence type and letters. 
  9   
 10  This is used by sequences which contain a finite number of similar words. 
 11  """ 
 12   
class Alphabet:
    """Base class for every alphabet in this module.

    Class attributes:
     - size    - length of each word in the alphabet, or None when words
                 have no fixed size.
     - letters - the permitted letters, or None when the alphabet is not
                 restricted to a fixed set.
    """

    size = None     # no fixed size for words
    letters = None  # no fixed alphabet; implement as a list-like interface

    def __repr__(self):
        """Return the class name followed by "()", e.g. "Alphabet()"."""
        return self.__class__.__name__ + "()"

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  This relies on the Alphabet subclassing
        hierarchy only, and does not check the letters property.
        This isn't ideal, and doesn't seem to work as intended
        with the AlphabetEncoder classes."""
        return isinstance(other, self.__class__)

    def _case_less(self):
        """Return a case-less variant of the current alphabet (PRIVATE).

        The result is the module-level generic instance for the most
        specific known family this alphabet belongs to: protein, DNA, RNA,
        nucleotide, single letter, or the fully generic alphabet.
        """
        #TODO - remove this method by dealing with things in subclasses?
        if isinstance(self, ProteinAlphabet):
            return generic_protein
        elif isinstance(self, DNAAlphabet):
            return generic_dna
        elif isinstance(self, RNAAlphabet):
            # This test must be for RNAAlphabet, not NucleotideAlphabet:
            # otherwise every nucleotide alphabet is reported as RNA and
            # the generic_nucleotide branch below can never be reached.
            return generic_rna
        elif isinstance(self, NucleotideAlphabet):
            return generic_nucleotide
        elif isinstance(self, SingleLetterAlphabet):
            return single_letter_alphabet
        else:
            return generic_alphabet

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE).

        Returns self when there are no letters or they are already upper
        case; otherwise falls back to the generic case-less alphabet.
        """
        if not self.letters or self.letters == self.letters.upper():
            #Easy case, no letters or already upper case!
            return self
        else:
            #TODO - Raise NotImplementedError and handle via subclass?
            return self._case_less()

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE).

        Returns self when there are no letters or they are already lower
        case; otherwise falls back to the generic case-less alphabet.
        """
        if not self.letters or self.letters == self.letters.lower():
            #Easy case, no letters or already lower case!
            return self
        else:
            #TODO - Raise NotImplementedError and handle via subclass?
            return self._case_less()


generic_alphabet = Alphabet()


class SingleLetterAlphabet(Alphabet):
    """Alphabet whose words are exactly one letter long."""

    size = 1
    letters = None  # string of all letters in the alphabet


single_letter_alphabet = SingleLetterAlphabet()

########### Protein


class ProteinAlphabet(SingleLetterAlphabet):
    """Single letter alphabet for protein sequences."""
    pass


generic_protein = ProteinAlphabet()

########### DNA


class NucleotideAlphabet(SingleLetterAlphabet):
    """Single letter alphabet for nucleotide sequences (DNA or RNA)."""
    pass


generic_nucleotide = NucleotideAlphabet()


class DNAAlphabet(NucleotideAlphabet):
    """Nucleotide alphabet for DNA sequences."""
    pass


generic_dna = DNAAlphabet()


########### RNA


class RNAAlphabet(NucleotideAlphabet):
    """Nucleotide alphabet for RNA sequences."""
    pass


generic_rna = RNAAlphabet()



########### Other per-sequence encodings

class SecondaryStructure(SingleLetterAlphabet):
    """Single letter alphabet restricted to the four letters H, S, T and C."""

    letters = "HSTC"
103
class ThreeLetterProtein(Alphabet):
    """Alphabet of three-letter amino acid codes (each word is 3 characters)."""

    size = 3
    # Stored as a list rather than a string because each "letter" is a
    # three-character code.
    letters = [
        "Ala", "Asx", "Cys", "Asp",
        "Glu", "Phe", "Gly", "His",
        "Ile", "Lys", "Leu", "Met",
        "Asn", "Pro", "Gln", "Arg",
        "Ser", "Thr", "Sec", "Val",
        "Trp", "Xaa", "Tyr", "Glx",
    ]
111 112 ###### Non per-sequence modifications 113 114 # (These are Decorator classes) 115
class AlphabetEncoder:
    """Decorator that wraps an alphabet and adds extra letters to it.

    Instance attributes:
     - alphabet    - the wrapped alphabet object.
     - new_letters - the letters added by this encoder.
     - letters     - wrapped letters followed by new_letters, or None when
                     the wrapped alphabet has no fixed letters.

    Any other non-special attribute is looked up on the wrapped alphabet.
    """

    def __init__(self, alphabet, new_letters):
        """Wrap alphabet, extending its letters (if any) with new_letters."""
        self.alphabet = alphabet
        self.new_letters = new_letters
        base_letters = alphabet.letters
        if base_letters is None:
            self.letters = None
        else:
            self.letters = base_letters + new_letters

    def __getattr__(self, key):
        """Forward unknown attribute lookups to the wrapped alphabet.

        Names with leading and trailing double underscores are never
        forwarded; they raise AttributeError instead.
        """
        is_special = key.startswith("__") and key.endswith("__")
        if is_special:
            raise AttributeError(key)
        return getattr(self.alphabet, key)

    def __repr__(self):
        """Return "ClassName(<wrapped alphabet repr>, <new letters repr>)"."""
        details = (self.__class__.__name__, self.alphabet, self.new_letters)
        return "%s(%r, %r)" % details

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        This isn't implemented for the base AlphabetEncoder,
        which will always return 0 (False)."""
        return 0

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        upper_base = self.alphabet._upper()
        upper_extra = self.new_letters.upper()
        return AlphabetEncoder(upper_base, upper_extra)

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        lower_base = self.alphabet._lower()
        lower_extra = self.new_letters.lower()
        return AlphabetEncoder(lower_base, lower_extra)
147 148
class Gapped(AlphabetEncoder):
    """Encoder that adds a gap character to the wrapped alphabet.

    The gap character is stored as gap_char and is also the encoder's
    new_letters.
    """

    def __init__(self, alphabet, gap_char = "-"):
        """Wrap alphabet, adding gap_char (default "-") as its extra letter."""
        AlphabetEncoder.__init__(self, alphabet, gap_char)
        self.gap_char = gap_char

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  The gap characters must match, and the wrapped
        alphabet must contain the other's wrapped alphabet (judged by the
        Alphabet subclassing hierarchy).  This fails if the other alphabet
        does not have a gap character!
        """
        same_gap = other.gap_char == self.gap_char
        if not same_gap:
            return False
        return self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        upper_base = self.alphabet._upper()
        return Gapped(upper_base, self.gap_char.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        lower_base = self.alphabet._lower()
        return Gapped(lower_base, self.gap_char.lower())
171 172
class HasStopCodon(AlphabetEncoder):
    """Encoder that adds a stop symbol to the wrapped alphabet.

    The stop symbol is stored as stop_symbol and is also the encoder's
    new_letters.
    """

    def __init__(self, alphabet, stop_symbol = "*"):
        """Wrap alphabet, adding stop_symbol (default "*") as its extra letter."""
        AlphabetEncoder.__init__(self, alphabet, stop_symbol)
        self.stop_symbol = stop_symbol

    def __cmp__(self, other):
        """Order by wrapped alphabet first, then by stop symbol."""
        by_alphabet = cmp(self.alphabet, other.alphabet)
        if by_alphabet != 0:
            return by_alphabet
        # Wrapped alphabets compare equal, so the stop symbol decides.
        return cmp(self.stop_symbol, other.stop_symbol)

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  The stop symbols must match, and the wrapped
        alphabet must contain the other's wrapped alphabet (judged by the
        Alphabet subclassing hierarchy).  This fails if the other alphabet
        does not have a stop symbol!
        """
        same_stop = other.stop_symbol == self.stop_symbol
        if not same_stop:
            return False
        return self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        upper_base = self.alphabet._upper()
        return HasStopCodon(upper_base, self.stop_symbol.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        lower_base = self.alphabet._lower()
        return HasStopCodon(lower_base, self.stop_symbol.lower())
201 202
def _get_base_alphabet(alphabet):
    """Returns the non-gapped non-stop-codon Alphabet object (PRIVATE).

    Strips every AlphabetEncoder layer and returns the alphabet found
    underneath.  Asserts that the result is an Alphabet instance.
    """
    if isinstance(alphabet, AlphabetEncoder):
        # Still wrapped: peel off one encoder layer and look again.
        return _get_base_alphabet(alphabet.alphabet)
    assert isinstance(alphabet, Alphabet), \
           "Invalid alphabet found, %s" % repr(alphabet)
    return alphabet
211
def _ungap(alphabet):
    """Returns the alphabet without any gap encoder (PRIVATE).

    An alphabet with no gap_char attribute is returned unchanged.  A Gapped
    encoder is replaced by the alphabet it wraps.  Any other encoder is
    rebuilt around the ungapped version of its wrapped alphabet, keeping
    its own stop symbol or added letters.

    Raises NotImplementedError for an object that exposes gap_char but is
    not one of the known encoder classes.
    """
    #TODO - Handle via method of the objects?
    # Encoders forward unknown attribute lookups to the alphabet they wrap,
    # so this is also true when a nested layer supplies gap_char.
    if not hasattr(alphabet, "gap_char"):
        return alphabet
    elif isinstance(alphabet, Gapped):
        return alphabet.alphabet
    elif isinstance(alphabet, HasStopCodon):
        return HasStopCodon(_ungap(alphabet.alphabet),
                            stop_symbol=alphabet.stop_symbol)
    elif isinstance(alphabet, AlphabetEncoder):
        # The constructor's parameter is new_letters (there is no "letters"
        # keyword).  Only the letters this layer added are passed on:
        # alphabet.letters would also carry the base letters and the gap.
        return AlphabetEncoder(_ungap(alphabet.alphabet),
                               new_letters=alphabet.new_letters)
    else:
        raise NotImplementedError
225
def _consensus_base_alphabet(alphabets):
    """Returns a common but often generic base alphabet object (PRIVATE).

    This throws away any AlphabetEncoder information, e.g. Gapped alphabets.

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter.  These DO NOT raise an exception!"""
    consensus = None
    for candidate in alphabets:
        base = _get_base_alphabet(candidate)
        if consensus is None:
            # First alphabet seen simply becomes the running consensus.
            consensus = base
            continue
        if consensus == base or isinstance(base, consensus.__class__):
            # Same alphabet, or the consensus class already covers this one.
            continue
        if isinstance(consensus, base.__class__):
            # This alphabet's class covers the consensus, so switch to it.
            consensus = base
            continue
        if isinstance(base, NucleotideAlphabet) \
        and isinstance(consensus, NucleotideAlphabet):
            #e.g. Give a mix of RNA and DNA alphabets
            consensus = generic_nucleotide
        elif isinstance(base, SingleLetterAlphabet) \
        and isinstance(consensus, SingleLetterAlphabet):
            #This is a pretty big mis-match!
            consensus = single_letter_alphabet
        else:
            #We have a major mis-match... take the easy way out!
            return generic_alphabet
    if consensus is None:
        #Given NO alphabets!
        return generic_alphabet
    return consensus
259
def _consensus_alphabet(alphabets):
    """Returns a common but often generic alphabet object (PRIVATE).

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter.  These DO NOT raise an exception!

    This is aware of Gapped and HasStopCodon and new letters added by
    other AlphabetEncoders.  This WILL raise an exception (ValueError) if
    more than one gap character or stop symbol is present.

    The argument may be any iterable of alphabets, including a one-shot
    iterator or generator."""
    # The alphabets are walked twice (once for the base alphabet, once for
    # the encoder details), so take a copy first; otherwise a one-shot
    # iterable would be exhausted by the first pass and every gap, stop and
    # new letter would be silently dropped.
    alphabets = list(alphabets)
    base = _consensus_base_alphabet(alphabets)
    gap = None
    stop = None
    new_letters = ""
    for alpha in alphabets:
        #Gaps...
        if not hasattr(alpha, "gap_char"):
            pass
        elif gap is None:
            gap = alpha.gap_char
        elif gap == alpha.gap_char:
            pass
        else:
            raise ValueError("More than one gap character present")
        #Stops...
        if not hasattr(alpha, "stop_symbol"):
            pass
        elif stop is None:
            stop = alpha.stop_symbol
        elif stop == alpha.stop_symbol:
            pass
        else:
            raise ValueError("More than one stop symbol present")
        #New letters...
        if hasattr(alpha, "new_letters"):
            for letter in alpha.new_letters:
                # Skip duplicates and the gap/stop symbols seen so far;
                # those are re-added by their own encoders below.
                if letter not in new_letters \
                and letter != gap and letter != stop:
                    new_letters += letter

    # Rebuild the encoder stack: extra letters innermost, then the gap,
    # then the stop symbol outermost.
    alpha = base
    if new_letters:
        alpha = AlphabetEncoder(alpha, new_letters)
    if gap:
        alpha = Gapped(alpha, gap_char=gap)
    if stop:
        alpha = HasStopCodon(alpha, stop_symbol=stop)
    return alpha
307
def _check_type_compatible(alphabets):
    """Returns True except for DNA+RNA or Nucleotide+Protein (PRIVATE).

    This relies on the Alphabet subclassing hierarchy.  It does not
    check things like gap characters or stop symbols."""
    seen_dna = seen_rna = seen_nucleotide = seen_protein = False
    for alphabet in alphabets:
        base = _get_base_alphabet(alphabet)
        if isinstance(base, NucleotideAlphabet):
            # DNA and RNA alphabets are nucleotide alphabets too, so the
            # protein clash is checked once here for all three.
            if seen_protein:
                return False
            seen_nucleotide = True
            if isinstance(base, DNAAlphabet):
                if seen_rna:
                    return False
                seen_dna = True
            elif isinstance(base, RNAAlphabet):
                if seen_dna:
                    return False
                seen_rna = True
        elif isinstance(base, ProteinAlphabet):
            if seen_nucleotide:
                return False
            seen_protein = True
    return True
331