Package Bio :: Package SwissProt
[hide private]
[frames] | no frames]

Source Code for Package Bio.SwissProt

  1  # Copyright 2007 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  This module provides code to work with the sprotXX.dat file from 
  7  SwissProt. 
  8  http://www.expasy.ch/sprot/sprot-top.html 
  9   
 10  Tested with: 
 11  Release 56.9, 03-March-2009. 
 12   
 13   
 14  Classes: 
 15  Record             Holds SwissProt data. 
 16  Reference          Holds reference data from a SwissProt record. 
 17   
 18  Functions: 
 19  read               Read one SwissProt record 
 20  parse              Read multiple SwissProt records 
 21   
 22  """ 
 23   
 24   
25 -class Record:
26 """Holds information from a SwissProt record. 27 28 Members: 29 entry_name Name of this entry, e.g. RL1_ECOLI. 30 data_class Either 'STANDARD' or 'PRELIMINARY'. 31 molecule_type Type of molecule, 'PRT', 32 sequence_length Number of residues. 33 34 accessions List of the accession numbers, e.g. ['P00321'] 35 created A tuple of (date, release). 36 sequence_update A tuple of (date, release). 37 annotation_update A tuple of (date, release). 38 39 description Free-format description. 40 gene_name Gene name. See userman.txt for description. 41 organism The source of the sequence. 42 organelle The origin of the sequence. 43 organism_classification The taxonomy classification. List of strings. 44 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 45 taxonomy_id A list of NCBI taxonomy id's. 46 host_organism A list of NCBI taxonomy id's of the hosts of a virus, 47 if any. 48 references List of Reference objects. 49 comments List of strings. 50 cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 51 keywords List of the keywords. 52 features List of tuples (key name, from, to, description). 53 from and to can be either integers for the residue 54 numbers, '<', '>', or '?' 55 56 seqinfo tuple of (length, molecular weight, CRC32 value) 57 sequence The sequence. 58 59 """
60 - def __init__(self):
61 self.entry_name = None 62 self.data_class = None 63 self.molecule_type = None 64 self.sequence_length = None 65 66 self.accessions = [] 67 self.created = None 68 self.sequence_update = None 69 self.annotation_update = None 70 71 self.description = [] 72 self.gene_name = '' 73 self.organism = [] 74 self.organelle = '' 75 self.organism_classification = [] 76 self.taxonomy_id = [] 77 self.host_organism = [] 78 self.references = [] 79 self.comments = [] 80 self.cross_references = [] 81 self.keywords = [] 82 self.features = [] 83 84 self.seqinfo = None 85 self.sequence = ''
86 87
88 -class Reference:
89 """Holds information from one reference in a SwissProt entry. 90 91 Members: 92 number Number of reference in an entry. 93 positions Describes extent of work. list of strings. 94 comments Comments. List of (token, text). 95 references References. List of (dbname, identifier) 96 authors The authors of the work. 97 title Title of the work. 98 location A citation for the work. 99 100 """
101 - def __init__(self):
102 self.number = None 103 self.positions = [] 104 self.comments = [] 105 self.references = [] 106 self.authors = [] 107 self.title = [] 108 self.location = []
109 110
111 -def parse(handle):
112 while True: 113 record = _read(handle) 114 if not record: 115 return 116 yield record
117 118
119 -def read(handle):
120 record = _read(handle) 121 if not record: 122 raise ValueError("No SwissProt record found") 123 # We should have reached the end of the record by now 124 remainder = handle.read() 125 if remainder: 126 raise ValueError("More than one SwissProt record found") 127 return record
128 129 130 # Everything below is considered private 131 132
133 -def _read(handle):
134 record = None 135 unread = "" 136 for line in handle: 137 key, value = line[:2], line[5:].rstrip() 138 if unread: 139 value = unread + " " + value 140 unread = "" 141 if key=='**': 142 #See Bug 2353, some files from the EBI have extra lines 143 #starting "**" (two asterisks/stars). They appear 144 #to be unofficial automated annotations. e.g. 145 #** 146 #** ################# INTERNAL SECTION ################## 147 #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 148 pass 149 elif key=='ID': 150 record = Record() 151 _read_id(record, line) 152 _sequence_lines = [] 153 elif key=='AC': 154 accessions = [word for word in value.rstrip(";").split("; ")] 155 record.accessions.extend(accessions) 156 elif key=='DT': 157 _read_dt(record, line) 158 elif key=='DE': 159 record.description.append(value.strip()) 160 elif key=='GN': 161 if record.gene_name: 162 record.gene_name += " " 163 record.gene_name += value 164 elif key=='OS': 165 record.organism.append(value) 166 elif key=='OG': 167 record.organelle += line[5:] 168 elif key=='OC': 169 cols = [col for col in value.rstrip(";.").split("; ")] 170 record.organism_classification.extend(cols) 171 elif key=='OX': 172 _read_ox(record, line) 173 elif key=='OH': 174 _read_oh(record, line) 175 elif key=='RN': 176 reference = Reference() 177 _read_rn(reference, value) 178 record.references.append(reference) 179 elif key=='RP': 180 assert record.references, "RP: missing RN" 181 record.references[-1].positions.append(value) 182 elif key=='RC': 183 assert record.references, "RC: missing RN" 184 reference = record.references[-1] 185 unread = _read_rc(reference, value) 186 elif key=='RX': 187 assert record.references, "RX: missing RN" 188 reference = record.references[-1] 189 _read_rx(reference, value) 190 elif key=='RL': 191 assert record.references, "RL: missing RN" 192 reference = record.references[-1] 193 reference.location.append(value) 194 # In UniProt release 1.12 of 6/21/04, there is a new RG 195 # (Reference Group) line, which references a group instead of 196 # an author. Each block must have at least 1 RA or RG line. 197 elif key=='RA': 198 assert record.references, "RA: missing RN" 199 reference = record.references[-1] 200 reference.authors.append(value) 201 elif key=='RG': 202 assert record.references, "RG: missing RN" 203 reference = record.references[-1] 204 reference.authors.append(value) 205 elif key=="RT": 206 assert record.references, "RT: missing RN" 207 reference = record.references[-1] 208 reference.title.append(value) 209 elif key=='CC': 210 _read_cc(record, line) 211 elif key=='DR': 212 _read_dr(record, value) 213 elif key=='PE': 214 #TODO - Record this information? 215 pass 216 elif key=='KW': 217 cols = value.rstrip(";.").split('; ') 218 record.keywords.extend(cols) 219 elif key=='FT': 220 _read_ft(record, line) 221 elif key=='SQ': 222 cols = value.split() 223 assert len(cols) == 7, "I don't understand SQ line %s" % line 224 # Do more checking here? 225 record.seqinfo = int(cols[1]), int(cols[3]), cols[5] 226 elif key==' ': 227 _sequence_lines.append(value.replace(" ", "").rstrip()) 228 elif key=='//': 229 # Join multiline data into one string 230 record.description = " ".join(record.description) 231 record.organism = " ".join(record.organism) 232 record.organelle = record.organelle.rstrip() 233 for reference in record.references: 234 reference.authors = " ".join(reference.authors) 235 reference.title = " ".join(reference.title) 236 reference.location = " ".join(reference.location) 237 record.sequence = "".join(_sequence_lines) 238 return record 239 else: 240 raise ValueError("Unknown keyword '%s' found" % key) 241 if record: 242 raise ValueError("Unexpected end of stream.")
243 244
245 -def _read_id(record, line):
246 cols = line[5:].split() 247 #Prior to release 51, included with MoleculeType: 248 #ID EntryName DataClass; MoleculeType; SequenceLength AA. 249 # 250 #Newer files lack the MoleculeType: 251 #ID EntryName DataClass; SequenceLength AA. 252 if len(cols) == 5: 253 record.entry_name = cols[0] 254 record.data_class = cols[1].rstrip(";") 255 record.molecule_type = cols[2].rstrip(";") 256 record.sequence_length = int(cols[3]) 257 elif len(cols) == 4: 258 record.entry_name = cols[0] 259 record.data_class = cols[1].rstrip(";") 260 record.molecule_type = None 261 record.sequence_length = int(cols[2]) 262 else: 263 raise ValueError("ID line has unrecognised format:\n"+line) 264 # check if the data class is one of the allowed values 265 allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed') 266 if record.data_class not in allowed: 267 raise ValueError("Unrecognized data class %s in line\n%s" % \ 268 (record.data_class, line)) 269 # molecule_type should be 'PRT' for PRoTein 270 # Note that has been removed in recent releases (set to None) 271 if record.molecule_type not in (None, 'PRT'): 272 raise ValueError("Unrecognized molecule type %s in line\n%s" % \ 273 (record.molecule_type, line))
274 275
276 -def _read_dt(record, line):
277 value = line[5:] 278 uprline = value.upper() 279 cols = value.rstrip().split() 280 if 'CREATED' in uprline \ 281 or 'LAST SEQUENCE UPDATE' in uprline \ 282 or 'LAST ANNOTATION UPDATE' in uprline: 283 # Old style DT line 284 # ================= 285 # e.g. 286 # DT 01-FEB-1995 (Rel. 31, Created) 287 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 288 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 289 # 290 # or: 291 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 292 # ... 293 294 # find where the version information will be located 295 # This is needed for when you have cases like IPI where 296 # the release verison is in a different spot: 297 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 298 uprcols = uprline.split() 299 rel_index = -1 300 for index in range(len(uprcols)): 301 if uprcols[index].find("REL.") >= 0: 302 rel_index = index 303 assert rel_index >= 0, \ 304 "Could not find Rel. in DT line: %s" % line 305 version_index = rel_index + 1 306 # get the version information 307 str_version = cols[version_index].rstrip(",") 308 # no version number 309 if str_version == '': 310 version = 0 311 # dot versioned 312 elif str_version.find(".") >= 0: 313 version = str_version 314 # integer versioned 315 else: 316 version = int(str_version) 317 date = cols[0] 318 319 if 'CREATED' in uprline: 320 record.created = date, version 321 elif 'LAST SEQUENCE UPDATE' in uprline: 322 record.sequence_update = date, version 323 elif 'LAST ANNOTATION UPDATE' in uprline: 324 record.annotation_update = date, version 325 else: 326 assert False, "Shouldn't reach this line!" 327 elif 'INTEGRATED INTO' in uprline \ 328 or 'SEQUENCE VERSION' in uprline \ 329 or 'ENTRY VERSION' in uprline: 330 # New style DT line 331 # ================= 332 # As of UniProt Knowledgebase release 7.0 (including 333 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 334 # format of the DT lines and the version information 335 # in them was changed - the release number was dropped. 336 # 337 # For more information see bug 1948 and 338 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 339 # 340 # e.g. 341 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 342 # DT 15-OCT-2001, sequence version 3. 343 # DT 01-APR-2004, entry version 14. 344 # 345 #This is a new style DT line... 346 347 # The date should be in string cols[1] 348 # Get the version number if there is one. 349 # For the three DT lines above: 0, 3, 14 350 try: 351 version = int(cols[-1]) 352 except ValueError: 353 version = 0 354 date = cols[0].rstrip(",") 355 356 # Re-use the historical property names, even though 357 # the meaning has changed slighty: 358 if "INTEGRATED" in uprline: 359 record.created = date, version 360 elif 'SEQUENCE VERSION' in uprline: 361 record.sequence_update = date, version 362 elif 'ENTRY VERSION' in uprline: 363 record.annotation_update = date, version 364 else: 365 assert False, "Shouldn't reach this line!" 366 else: 367 raise ValueError("I don't understand the date line %s" % line)
368 369
370 -def _read_ox(record, line):
371 # The OX line is in the format: 372 # OX DESCRIPTION=ID[, ID]...; 373 # If there are too many id's to fit onto a line, then the ID's 374 # continue directly onto the next line, e.g. 375 # OX DESCRIPTION=ID[, ID]... 376 # OX ID[, ID]...; 377 # Currently, the description is always "NCBI_TaxID". 378 # To parse this, I need to check to see whether I'm at the 379 # first line. If I am, grab the description and make sure 380 # it's an NCBI ID. Then, grab all the id's. 381 if record.taxonomy_id: 382 ids = line[5:].rstrip().rstrip(";") 383 else: 384 descr, ids = line[5:].rstrip().rstrip(";").split("=") 385 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 386 record.taxonomy_id.extend(ids.split(', '))
387 388
389 -def _read_oh(record, line):
390 # Line type OH (Organism Host) for viral hosts 391 # same code as in taxonomy_id() 392 line = line[5:].rstrip().rstrip(";") 393 index = line.find('=') 394 if index >= 0: 395 descr = line[:index] 396 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 397 ids = line[index+1:].split(',') 398 else: 399 ids = line.split(',') 400 record.host_organism.extend([id.strip() for id in ids])
401 402
403 -def _read_rn(reference, rn):
404 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn 405 reference.number = int(rn[1:-1])
406 407
408 -def _read_rc(reference, value):
409 cols = value.split(';') 410 if value[-1]==';': 411 unread = "" 412 else: 413 cols, unread = cols[:-1], cols[-1] 414 for col in cols: 415 if not col: # last column will be the empty string 416 return 417 # The token is everything before the first '=' character. 418 i = col.find("=") 419 if i>=0: 420 token, text = col[:i], col[i+1:] 421 comment = token.lstrip(), text 422 reference.comments.append(comment) 423 else: 424 comment = reference.comments[-1] 425 comment = "%s %s" % (comment, col) 426 reference.comments[-1] = comment 427 return unread
428 429
430 -def _read_rx(reference, value):
431 # The basic (older?) RX line is of the form: 432 # RX MEDLINE; 85132727. 433 # but there are variants of this that need to be dealt with (see below) 434 435 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 436 # have extraneous information in the RX line. Check for 437 # this and chop it out of the line. 438 # (noticed by katel@worldpath.net) 439 value = value.replace(' [NCBI, ExPASy, Israel, Japan]','') 440 441 # RX lines can also be used of the form 442 # RX PubMed=9603189; 443 # reported by edvard@farmasi.uit.no 444 # and these can be more complicated like: 445 # RX MEDLINE=95385798; PubMed=7656980; 446 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 447 # We look for these cases first and deal with them 448 if "=" in value: 449 cols = value.split("; ") 450 cols = [x.strip() for x in cols] 451 cols = [x for x in cols if x] 452 for col in cols: 453 x = col.split("=") 454 assert len(x) == 2, "I don't understand RX line %s" % line 455 key, value = x[0], x[1].rstrip(";") 456 reference.references.append((key, value)) 457 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 458 else: 459 cols = value.split("; ") 460 # normally we split into the three parts 461 assert len(cols) == 2, "I don't understand RX line %s" % line 462 reference.references.append((cols[0].rstrip(";"), cols[1].rstrip(".")))
463 464
465 -def _read_cc(record, line):
466 key, value = line[5:8], line[9:].rstrip() 467 if key=='-!-': # Make a new comment 468 record.comments.append(value) 469 elif key==' ': # add to the previous comment 470 if not record.comments: 471 # TCMO_STRGA in Release 37 has comment with no topic 472 record.comments.append(value) 473 else: 474 record.comments[-1] += " " + value
475 476
477 -def _read_dr(record, value):
478 # Remove the comments at the end of the line 479 i = value.find(' [') 480 if i >= 0: 481 value = value[:i] 482 cols = value.rstrip(".").split('; ') 483 record.cross_references.append(tuple(cols))
484 485
486 -def _read_ft(record, line):
487 line = line[5:] # get rid of junk in front 488 name = line[0:8].rstrip() 489 try: 490 from_res = int(line[9:15]) 491 except ValueError: 492 from_res = line[9:15].lstrip() 493 try: 494 to_res = int(line[16:22]) 495 except ValueError: 496 to_res = line[16:22].lstrip() 497 description = line[29:70].rstrip() 498 #if there is a feature_id (FTId), store it away 499 if line[29:35]==r"/FTId=": 500 ft_id = line[35:70].rstrip()[:-1] 501 else: 502 ft_id ="" 503 if not name: # is continuation of last one 504 assert not from_res and not to_res 505 name, from_res, to_res, old_description,old_ft_id = record.features[-1] 506 del record.features[-1] 507 description = "%s %s" % (old_description, description) 508 509 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 510 if name == "VARSPLIC": 511 # Remove unwanted spaces in sequences. 512 # During line carryover, the sequences in VARSPLIC can get mangled 513 # with unwanted spaces like: 514 # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 515 # We want to check for this case and correct it as it happens. 516 descr_cols = description.split(" -> ") 517 if len(descr_cols) == 2: 518 first_seq, second_seq = descr_cols 519 extra_info = '' 520 # we might have more information at the end of the 521 # second sequence, which should be in parenthesis 522 extra_info_pos = second_seq.find(" (") 523 if extra_info_pos != -1: 524 extra_info = second_seq[extra_info_pos:] 525 second_seq = second_seq[:extra_info_pos] 526 # now clean spaces out of the first and second string 527 first_seq = first_seq.replace(" ", "") 528 second_seq = second_seq.replace(" ", "") 529 # reassemble the description 530 description = first_seq + " -> " + second_seq + extra_info 531 record.features.append((name, from_res, to_res, description,ft_id))
532 533 534 if __name__ == "__main__": 535 print "Quick self test..." 536 537 example_filename = "../../Tests/SwissProt/sp008" 538 539 import os 540 if not os.path.isfile(example_filename): 541 print "Missing test file %s" % example_filename 542 else: 543 #Try parsing it! 544 545 handle = open(example_filename) 546 records = parse(handle) 547 for record in records: 548 print record.entry_name 549 print ",".join(record.accessions) 550 print record.keywords 551 print repr(record.organism) 552 print record.sequence[:20] + "..." 553 handle.close() 554