Package Bio :: Package SeqUtils :: Module ProtParam
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqUtils.ProtParam

  1  # Copyright Yair Benita Y.Benita@pharm.uu.nl 
  2  # Biopython (http://biopython.org) license applies 
  3   
  4  import sys 
  5  import ProtParamData, IsoelectricPoint 
  6  from ProtParamData import kd  # Added by Iddo to enable the gravy method 
  7  from Bio.Seq import Seq 
  8  from Bio.Alphabet import IUPAC 
  9  from Bio.Data import IUPACData 
 10  #from BioModule import  
 11   
12 -class ProteinAnalysis:
13 """Class containing methods for protein analysis. 14 15 The class init method takes only one argument, the protein sequence as a 16 string and builds a sequence object using the Bio.Seq module. This is done 17 just to make sure the sequence is a protein sequence and not anything else. 18 19 methods: 20 21 count_amino_acids: 22 23 Simply counts the number times an amino acid is repeated in the protein 24 sequence. Returns a dictionary {AminoAcid:Number} and also stores the 25 dictionary in self.amino_acids_content. 26 27 get_amino_acids_percent: 28 29 The same as count_amino_acids only returns the Number in percentage of entire 30 sequence. Returns a dictionary and stores the dictionary in 31 self.amino_acids_content_percent. 32 33 molecular_weight: 34 Calculates the molecular weight of a protein. 35 36 aromaticity: 37 38 Calculates the aromaticity value of a protein according to Lobry, 1994. It is 39 simply the relative frequency of Phe+Trp+Tyr. 40 41 42 instability_index: 43 44 Implementation of the method of Guruprasad et al. (Protein Engineering 45 4:155-161,1990). This method tests a protein for stability. Any value above 40 46 means the protein is unstable (=has a short half life). 47 48 flexibility: 49 Implementation of the flexibility method of Vihinen et al. (Proteins. 1994 Jun;19(2):141-9). 50 51 isoelectric_point: 52 This method uses the module IsoelectricPoint to calculate the pI of a protein. 53 54 secondary_structure_fraction: 55 This methods returns a list of the fraction of amino acids which tend to be in Helix, Turn or Sheet. 56 Amino acids in helix: V, I, Y, F, W, L. 57 Amino acids in Turn: N, P, G, S. 58 Amino acids in sheet: E, M, A, L. 59 The list contains 3 values: [Helix, Turn, Sheet]. 60 61 62 protein_scale(Scale, WindwonSize, Edge): 63 64 An amino acid scale is defined by a numerical value assigned to each type of 65 amino acid. The most frequently used scales are the hydrophobicity or 66 hydrophilicity scales and the secondary structure conformational parameters 67 scales, but many other scales exist which are based on different chemical and 68 physical properties of the amino acids. You can set several parameters that 69 control the computation of a scale profile, such as the window size and the 70 window edge relative weight value. WindowSize: The window size is the length 71 of the interval to use for the profile computation. For a window size n, we 72 use the i- ( n-1)/2 neighboring residues on each side of residue it compute 73 the score for residue i. The score for residue is the sum of the scale values 74 for these amino acids, optionally weighted according to their position in the 75 window. Edge: The central amino acid of the window always has a weight of 1. 76 By default, the amino acids at the remaining window positions have the same 77 weight, but you can make the residue at the center of the window have a 78 larger weight than the others by setting the edge value for the residues at 79 the beginning and end of the interval to a value between 0 and 1. For 80 instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7, 81 1.0, 0.7, 0.4. The method returns a list of values which can be plotted to 82 view the change along a protein sequence. Many scales exist. Just add your 83 favorites to the ProtParamData modules. 84 """
85 - def __init__(self, ProtSequence):
86 if ProtSequence.islower(): 87 self.sequence = Seq(ProtSequence.upper(), IUPAC.protein) 88 else: 89 self.sequence = Seq(ProtSequence, IUPAC.protein) 90 self.amino_acids_content = None 91 self.amino_acids_percent = None 92 self.length = len(self.sequence)
93
94 - def count_amino_acids(self):
95 ProtDic = dict([ (k, 0) for k in IUPACData.protein_letters]) 96 for i in ProtDic.keys(): 97 ProtDic[i]=self.sequence.count(i) 98 self.amino_acids_content = ProtDic 99 return ProtDic
100 101 """Calculate the amino acid content in percents. 102 input is the dictionary from CountAA. 103 output is a dictionary with AA as keys."""
104 - def get_amino_acids_percent(self):
105 if not self.amino_acids_content: 106 self.count_amino_acids() 107 108 PercentAA = {} 109 for i in self.amino_acids_content.keys(): 110 if self.amino_acids_content[i] > 0: 111 PercentAA[i]=self.amino_acids_content[i]/float(self.length) 112 else: 113 PercentAA[i] = 0 114 self.amino_acids_percent = PercentAA 115 return PercentAA
116 117 # Calculate MW from Protein sequence 118 # Calculate MW from Protein sequence
119 - def molecular_weight (self):
120 # make local dictionary for speed 121 MwDict = {} 122 # remove a molecule of water from the amino acid weight. 123 for i in IUPACData.protein_weights.keys(): 124 MwDict[i] = IUPACData.protein_weights[i] - 18.02 125 MW = 18.02 # add just one water molecule for the whole sequence. 126 for i in self.sequence: 127 MW += MwDict[i] 128 return MW
129 130 # calculate the aromaticity according to Lobry, 1994. 131 # Arom=sum of relative frequency of Phe+Trp+Tyr
132 - def aromaticity(self):
133 if not self.amino_acids_percent: 134 self.get_amino_acids_percent() 135 136 Arom= self.amino_acids_percent['Y']+self.amino_acids_percent['W']+self.amino_acids_percent['F'] 137 return Arom
138 139 # a function to calculate the instability index according to: 140 # Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990).
141 - def instability_index(self):
142 #make the dictionary local for speed. 143 DIWV=ProtParamData.DIWV.copy() 144 score=0.0 145 for i in range(self.length - 1): 146 DiPeptide=DIWV[self.sequence[i]][self.sequence[i+1]] 147 score += DiPeptide 148 return (10.0/self.length) * score
149 150 # Calculate the flexibility according to Vihinen, 1994. 151 # No argument to change window size because parameters are specific for a window=9. 152 # the parameters used are optimized for determining the flexibility.
153 - def flexibility(self):
154 Flex = ProtParamData.Flex.copy() 155 Window=9 156 Weights=[0.25,0.4375,0.625,0.8125,1] 157 List=[] 158 for i in range(self.length - Window): 159 SubSeq=self.sequence[i:i+Window] 160 score = 0.0 161 for j in range(Window/2): 162 score += (Flex[SubSeq[j]]+Flex[SubSeq[Window-j-1]]) * Weights[j] 163 score += Flex[SubSeq[Window/2+1]] 164 List.append(score/5.25) 165 return List
166 167 # calculate the gravy according to kyte and doolittle.
168 - def gravy(self):
169 ProtGravy=0.0 170 for i in self.sequence: 171 ProtGravy += kd[i] 172 173 return ProtGravy/self.length
174 175 # this method is used to make a list of relative weight of the 176 # window edges compared to the window center. The weights are linear. 177 # it actually generates half a list. For a window of size 9 and edge 0.4 178 # you get a list of [0.4, 0.55, 0.7, 0.85].
179 - def _weight_list(self, window, edge):
180 unit = ((1.0-edge)/(window-1))*2 181 list = [0.0]*(window/2) 182 for i in range(window/2): 183 list[i] = edge + unit * i 184 return list
185 186 # this method allows you to compute and represent the profile produced 187 # by any amino acid scale on a selected protein. 188 # Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl 189 # The weight list returns only one tail. If the list should be [0.4,0.7,1.0,0.7,0.4] 190 # what you actually get from _weights_list is [0.4,0.7]. The correct calculation is done 191 # in the loop.
192 - def protein_scale(self, ParamDict, Window, Edge=1.0):
193 # generate the weights 194 weight = self._weight_list(Window,Edge) 195 list = [] 196 # the score in each Window is divided by the sum of weights 197 sum_of_weights = 0.0 198 for i in weight: sum_of_weights += i 199 # since the weight list is one sided: 200 sum_of_weights = sum_of_weights*2+1 201 202 for i in range(self.length-Window+1): 203 subsequence = self.sequence[i:i+Window] 204 score = 0.0 205 for j in range(Window/2): 206 # walk from the outside of the Window towards the middle. 207 # Iddo: try/except clauses added to avoid raising an exception on a non-standad amino acid 208 try: 209 score += weight[j] * ParamDict[subsequence[j]] + weight[j] * ParamDict[subsequence[Window-j-1]] 210 except KeyError: 211 sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' % 212 (subsequence[j],subsequence[Window-j-1])) 213 214 # Now add the middle value, which always has a weight of 1. 215 if subsequence[Window/2] in ParamDict: 216 score += ParamDict[subsequence[Window/2]] 217 else: 218 sys.stderr.write('warning: %s is not a standard amino acid.\n' % (subsequence[Window/2])) 219 220 list.append(score/sum_of_weights) 221 return list
222 223 # calculate the isoelectric point.
224 - def isoelectric_point(self):
225 if not self.amino_acids_content: 226 self.count_amino_acids() 227 X = IsoelectricPoint.IsoelectricPoint(self.sequence, self.amino_acids_content) 228 return X.pi()
229 230 # calculate fraction of helix, turn and sheet
232 if not self.amino_acids_percent: 233 self.get_amino_acids_percent() 234 Helix = self.amino_acids_percent['V'] + self.amino_acids_percent['I'] + self.amino_acids_percent['Y'] + self.amino_acids_percent['F'] + self.amino_acids_percent['W'] + self.amino_acids_percent['L'] 235 Turn = self.amino_acids_percent['N'] + self.amino_acids_percent['P'] + self.amino_acids_percent['G'] + self.amino_acids_percent['S'] 236 Sheet = self.amino_acids_percent['E'] + self.amino_acids_percent['M'] + self.amino_acids_percent['A'] + self.amino_acids_percent['L'] 237 return Helix, Turn, Sheet
238 239 #---------------------------------------------------------# 240 """ 241 X = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV") 242 print X.count_amino_acids() 243 print X.get_amino_acids_percent() 244 print X.molecular_weight() 245 print X.aromaticity() 246 print X.instability_index() 247 print X.flexibility() 248 print X.pi() 249 print X.secondary_structure_fraction() 250 print X.protein_scale(ProtParamData.kd, 9, 0.4) 251 """ 252