Package Bio :: Package SCOP :: Module Raf
[hide private]
[frames] | no frames]

Source Code for Module Bio.SCOP.Raf

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Gavin E. Crooks 2001-10-10 
  7   
  8  """ASTRAL RAF (Rapid Access Format) Sequence Maps. 
  9   
 10  The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES 
 11  records (representing the sequence of the molecule used in an experiment) to  
 12  the ATOM records (representing the atoms experimentally observed).  
 13   
 14  This data is derived from the Protein Data Bank CIF files. Known errors in the 
 15  CIF files are corrected manually, with the original PDB file serving as the 
 16  final arbiter in case of discrepancies.  
 17   
 18  Residues are referenced by residue ID. This consists of a the PDB residue 
 19  sequence number (upto 4 digits) and an optional PDB  insertion code (an 
 20  ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1" 
 21   
 22  See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html 
 23   
 24  to_one_letter_code -- A mapping from the 3-letter amino acid codes found 
 25                          in PDB files to 1-letter codes.  The 3-letter codes 
 26                          include chemically modified residues. 
 27  """ 
 28   
 29  from copy import copy  
 30  from types import * 
 31   
 32  from Residues import Residues 
 33   
 34  # This table is taken from the RAF release notes, and includes the 
 35  # undocumented mapping "UNK" -> "X" 
 36  to_one_letter_code= { 
 37      'ALA':'A', 'VAL':'V', 'PHE':'F', 'PRO':'P', 'MET':'M', 
 38      'ILE':'I', 'LEU':'L', 'ASP':'D', 'GLU':'E', 'LYS':'K', 
 39      'ARG':'R', 'SER':'S', 'THR':'T', 'TYR':'Y', 'HIS':'H', 
 40      'CYS':'C', 'ASN':'N', 'GLN':'Q', 'TRP':'W', 'GLY':'G', 
 41      '2AS':'D', '3AH':'H', '5HP':'E', 'ACL':'R', 'AIB':'A', 
 42      'ALM':'A', 'ALO':'T', 'ALY':'K', 'ARM':'R', 'ASA':'D', 
 43      'ASB':'D', 'ASK':'D', 'ASL':'D', 'ASQ':'D', 'AYA':'A', 
 44      'BCS':'C', 'BHD':'D', 'BMT':'T', 'BNN':'A', 'BUC':'C', 
 45      'BUG':'L', 'C5C':'C', 'C6C':'C', 'CCS':'C', 'CEA':'C', 
 46      'CHG':'A', 'CLE':'L', 'CME':'C', 'CSD':'A', 'CSO':'C', 
 47      'CSP':'C', 'CSS':'C', 'CSW':'C', 'CXM':'M', 'CY1':'C', 
 48      'CY3':'C', 'CYG':'C', 'CYM':'C', 'CYQ':'C', 'DAH':'F', 
 49      'DAL':'A', 'DAR':'R', 'DAS':'D', 'DCY':'C', 'DGL':'E', 
 50      'DGN':'Q', 'DHA':'A', 'DHI':'H', 'DIL':'I', 'DIV':'V', 
 51      'DLE':'L', 'DLY':'K', 'DNP':'A', 'DPN':'F', 'DPR':'P', 
 52      'DSN':'S', 'DSP':'D', 'DTH':'T', 'DTR':'W', 'DTY':'Y', 
 53      'DVA':'V', 'EFC':'C', 'FLA':'A', 'FME':'M', 'GGL':'E', 
 54      'GLZ':'G', 'GMA':'E', 'GSC':'G', 'HAC':'A', 'HAR':'R', 
 55      'HIC':'H', 'HIP':'H', 'HMR':'R', 'HPQ':'F', 'HTR':'W', 
 56      'HYP':'P', 'IIL':'I', 'IYR':'Y', 'KCX':'K', 'LLP':'K', 
 57      'LLY':'K', 'LTR':'W', 'LYM':'K', 'LYZ':'K', 'MAA':'A', 
 58      'MEN':'N', 'MHS':'H', 'MIS':'S', 'MLE':'L', 'MPQ':'G', 
 59      'MSA':'G', 'MSE':'M', 'MVA':'V', 'NEM':'H', 'NEP':'H', 
 60      'NLE':'L', 'NLN':'L', 'NLP':'L', 'NMC':'G', 'OAS':'S', 
 61      'OCS':'C', 'OMT':'M', 'PAQ':'Y', 'PCA':'E', 'PEC':'C', 
 62      'PHI':'F', 'PHL':'F', 'PR3':'C', 'PRR':'A', 'PTR':'Y', 
 63      'SAC':'S', 'SAR':'G', 'SCH':'C', 'SCS':'C', 'SCY':'C', 
 64      'SEL':'S', 'SEP':'S', 'SET':'S', 'SHC':'C', 'SHR':'K', 
 65      'SOC':'C', 'STY':'Y', 'SVA':'S', 'TIH':'A', 'TPL':'W', 
 66      'TPO':'T', 'TPQ':'A', 'TRG':'K', 'TRO':'W', 'TYB':'Y', 
 67      'TYQ':'Y', 'TYS':'Y', 'TYY':'Y', 'AGM':'R', 'GL3':'G', 
 68      'SMC':'C', 'ASX':'B', 'CGU':'E', 'CSX':'C', 'GLX':'Z', 
 69      'PYX':'C', 
 70      'UNK':'X' 
 71      } 
 72   
 73   
74 -def normalize_letters(one_letter_code):
75 """Convert RAF one-letter amino acid codes into IUPAC standard codes. 76 77 Letters are uppercased, and "." ("Unknown") is converted to "X". 78 """ 79 if one_letter_code == '.': 80 return 'X' 81 else: 82 return one_letter_code.upper()
83
84 -class SeqMapIndex(dict):
85 """An RAF file index. 86 87 The RAF file itself is about 50 MB. This index provides rapid, random 88 access of RAF records without having to load the entire file into memory. 89 90 The index key is a concatenation of the PDB ID and chain ID. e.g 91 "2drcA", "155c_". RAF uses an underscore to indicate blank 92 chain IDs. 93 """ 94
95 - def __init__(self, filename):
96 """ 97 Arguments: 98 99 filename -- The file to index 100 """ 101 dict.__init__(self) 102 self.filename = filename 103 104 f = open(self.filename) 105 try: 106 position = 0 107 while True: 108 line = f.readline() 109 if not line: break 110 key = line[0:5] 111 if key != None: 112 self[key]=position 113 position = f.tell() 114 finally: 115 f.close()
116
117 - def __getitem__(self, key):
118 """ Return an item from the indexed file. """ 119 position = dict.__getitem__(self,key) 120 121 f = open(self.filename) 122 try: 123 f.seek(position) 124 line = f.readline() 125 record = SeqMap(line) 126 finally: 127 f.close() 128 return record
129 130
131 - def getSeqMap(self, residues):
132 """Get the sequence map for a collection of residues. 133 134 residues -- A Residues instance, or a string that can be converted into 135 a Residues instance. 136 """ 137 if type(residues) == StringType: 138 residues = Residues(residues) 139 140 pdbid = residues.pdbid 141 frags = residues.fragments 142 if not frags: frags =(('_','',''),) # All residues of unnamed chain 143 144 seqMap = None 145 for frag in frags: 146 chainid = frag[0] 147 if chainid=='' or chainid=='-' or chainid==' ' or chainid=='_': 148 chainid = '_' 149 id = pdbid + chainid 150 151 152 sm = self[id] 153 154 #Cut out fragment of interest 155 start = 0 156 end = len(sm.res) 157 if frag[1] : start = int(sm.index(frag[1], chainid)) 158 if frag[2] : end = int(sm.index(frag[2], chainid)+1) 159 160 sm = sm[start:end] 161 162 if seqMap == None: 163 seqMap = sm 164 else: 165 seqMap += sm 166 167 return seqMap
168 169 170
171 -class SeqMap:
172 """An ASTRAL RAF (Rapid Access Format) Sequence Map. 173 174 This is a list like object; You can find the location of particular residues 175 with index(), slice this SeqMap into fragments, and glue fragments back 176 together with extend(). 177 178 pdbid -- The PDB 4 character ID 179 180 pdb_datestamp -- From the PDB file 181 182 version -- The RAF format version. e.g. 0.01 183 184 flags -- RAF flags. (See release notes for more information.) 185 186 res -- A list of Res objects, one for each residue in this sequence map 187 """ 188
189 - def __init__(self, line=None):
190 self.pdbid = '' 191 self.pdb_datestamp = '' 192 self.version = '' 193 self.flags = '' 194 self.res = [] 195 if line: 196 self._process(line)
197 198
199 - def _process(self, line):
200 """Parses a RAF record into a SeqMap object. 201 """ 202 header_len = 38 203 204 line = line.rstrip() # no trailing whitespace 205 206 if len(line)<header_len: 207 raise ValueError("Incomplete header: "+line) 208 209 self.pdbid = line[0:4] 210 chainid = line[4:5] 211 212 self.version = line[6:10] 213 214 #Raf format versions 0.01 and 0.02 are identical for practical purposes 215 if(self.version != "0.01" and self.version !="0.02"): 216 raise ValueError("Incompatible RAF version: "+self.version) 217 218 self.pdb_datestamp = line[14:20] 219 self.flags = line[21:27] 220 221 for i in range(header_len, len(line), 7): 222 f = line[i : i+7] 223 if len(f)!=7: 224 raise ValueError("Corrupt Field: ("+f+")") 225 r = Res() 226 r.chainid = chainid 227 r.resid = f[0:5].strip() 228 r.atom = normalize_letters(f[5:6]) 229 r.seqres = normalize_letters(f[6:7]) 230 231 self.res.append(r)
232 233
234 - def index(self, resid, chainid="_"):
235 for i in range(0, len(self.res)): 236 if self.res[i].resid == resid and self.res[i].chainid == chainid: 237 return i 238 raise KeyError("No such residue "+chainid+resid)
239
240 - def __getslice__(self, i, j):
241 s = copy(self) 242 s.res = s.res[i:j] 243 return s
244
245 - def append(self, res):
246 """Append another Res object onto the list of residue mappings.""" 247 self.res.append(res)
248
249 - def extend(self, other):
250 """Append another SeqMap onto the end of self. 251 252 Both SeqMaps must have the same PDB ID, PDB datestamp and 253 RAF version. The RAF flags are erased if they are inconsistent. This 254 may happen when fragments are taken from different chains. 255 """ 256 if not isinstance(other, SeqMap): 257 raise TypeError("Can only extend a SeqMap with a SeqMap.") 258 if self.pdbid != other.pdbid: 259 raise TypeError("Cannot add fragments from different proteins") 260 if self.version != other.version: 261 raise TypeError("Incompatible rafs") 262 if self.pdb_datestamp != other.pdb_datestamp: 263 raise TypeError("Different pdb dates!") 264 if self.flags != other.flags: 265 self.flags = '' 266 self.res += other.res
267
268 - def __iadd__(self, other):
269 self.extend(other) 270 return self
271
272 - def __add__(self, other):
273 s = copy(self) 274 s.extend(other) 275 return s
276
277 - def getAtoms(self, pdb_handle, out_handle):
278 """Extract all relevant ATOM and HETATOM records from a PDB file. 279 280 The PDB file is scanned for ATOM and HETATOM records. If the 281 chain ID, residue ID (seqNum and iCode), and residue type match 282 a residue in this sequence map, then the record is echoed to the 283 output handle. 284 285 This is typically used to find the coordinates of a domain, or other 286 residue subset. 287 288 pdb_handle -- A handle to the relevant PDB file. 289 290 out_handle -- All output is written to this file like object. 291 """ 292 #This code should be refactored when (if?) biopython gets a PDB parser 293 294 #The set of residues that I have to find records for. 295 resSet = {} 296 for r in self.res: 297 if r.atom=='X' : #Unknown residue type 298 continue 299 chainid = r.chainid 300 if chainid == '_': 301 chainid = ' ' 302 resid = r.resid 303 resSet[(chainid,resid)] = r 304 305 resFound = {} 306 for line in pdb_handle.xreadlines(): 307 if line.startswith("ATOM ") or line.startswith("HETATM"): 308 chainid = line[21:22] 309 resid = line[22:27].strip() 310 key = (chainid, resid) 311 if key in resSet: 312 res = resSet[key] 313 atom_aa = res.atom 314 resName = line[17:20] 315 if resName in to_one_letter_code: 316 if to_one_letter_code[resName] == atom_aa: 317 out_handle.write(line) 318 resFound[key] = res 319 320 if len(resSet) != len(resFound): 321 #for k in resFound.keys(): 322 # del resSet[k] 323 #print resSet 324 325 raise RuntimeError('I could not find at least one ATOM or HETATM' \ 326 +' record for each and every residue in this sequence map.')
327 328 329
330 -class Res:
331 """ A single residue mapping from a RAF record. 332 333 chainid -- A single character chain ID. 334 335 resid -- The residue ID. 336 337 atom -- amino acid one-letter code from ATOM records. 338 339 seqres -- amino acid one-letter code from SEQRES records. 340 """
341 - def __init__(self):
342 self.chainid = '' 343 self.resid = '' 344 self.atom = '' 345 self.seqres = ''
346 347
348 -def parse(handle):
349 """Iterates over a RAF file, returning a SeqMap object for each line 350 in the file. 351 352 Arguments: 353 354 handle -- file-like object. 355 """ 356 for line in handle: 357 yield SeqMap(line)
358