1
2
3
4
5 """
6 This module provides code to work with the sprotXX.dat file from
7 SwissProt.
8 http://www.expasy.ch/sprot/sprot-top.html
9
10 Tested with:
11 Release 56.9, 03-March-2009.
12
13
14 Classes:
15 Record Holds SwissProt data.
16 Reference Holds reference data from a SwissProt record.
17
18 Functions:
19 read Read one SwissProt record
20 parse Read multiple SwissProt records
21
22 """
23
24
26 """Holds information from a SwissProt record.
27
28 Members:
29 entry_name Name of this entry, e.g. RL1_ECOLI.
30 data_class Either 'STANDARD' or 'PRELIMINARY'.
31 molecule_type Type of molecule, 'PRT',
32 sequence_length Number of residues.
33
34 accessions List of the accession numbers, e.g. ['P00321']
35 created A tuple of (date, release).
36 sequence_update A tuple of (date, release).
37 annotation_update A tuple of (date, release).
38
39 description Free-format description.
40 gene_name Gene name. See userman.txt for description.
41 organism The source of the sequence.
42 organelle The origin of the sequence.
43 organism_classification The taxonomy classification. List of strings.
44 (http://www.ncbi.nlm.nih.gov/Taxonomy/)
45 taxonomy_id A list of NCBI taxonomy id's.
46 host_organism A list of NCBI taxonomy id's of the hosts of a virus,
47 if any.
48 references List of Reference objects.
49 comments List of strings.
50 cross_references List of tuples (db, id1[, id2][, id3]). See the docs.
51 keywords List of the keywords.
52 features List of tuples (key name, from, to, description).
53 from and to can be either integers for the residue
54 numbers, '<', '>', or '?'
55
56 seqinfo tuple of (length, molecular weight, CRC32 value)
57 sequence The sequence.
58
59 """
61 self.entry_name = None
62 self.data_class = None
63 self.molecule_type = None
64 self.sequence_length = None
65
66 self.accessions = []
67 self.created = None
68 self.sequence_update = None
69 self.annotation_update = None
70
71 self.description = []
72 self.gene_name = ''
73 self.organism = []
74 self.organelle = ''
75 self.organism_classification = []
76 self.taxonomy_id = []
77 self.host_organism = []
78 self.references = []
79 self.comments = []
80 self.cross_references = []
81 self.keywords = []
82 self.features = []
83
84 self.seqinfo = None
85 self.sequence = ''
86
87
89 """Holds information from one reference in a SwissProt entry.
90
91 Members:
92 number Number of reference in an entry.
93 positions Describes extent of work. list of strings.
94 comments Comments. List of (token, text).
95 references References. List of (dbname, identifier)
96 authors The authors of the work.
97 title Title of the work.
98 location A citation for the work.
99
100 """
102 self.number = None
103 self.positions = []
104 self.comments = []
105 self.references = []
106 self.authors = []
107 self.title = []
108 self.location = []
109
110
117
118
128
129
130
131
132
134 record = None
135 unread = ""
136 for line in handle:
137 key, value = line[:2], line[5:].rstrip()
138 if unread:
139 value = unread + " " + value
140 unread = ""
141 if key=='**':
142
143
144
145
146
147
148 pass
149 elif key=='ID':
150 record = Record()
151 _read_id(record, line)
152 _sequence_lines = []
153 elif key=='AC':
154 accessions = [word for word in value.rstrip(";").split("; ")]
155 record.accessions.extend(accessions)
156 elif key=='DT':
157 _read_dt(record, line)
158 elif key=='DE':
159 record.description.append(value.strip())
160 elif key=='GN':
161 if record.gene_name:
162 record.gene_name += " "
163 record.gene_name += value
164 elif key=='OS':
165 record.organism.append(value)
166 elif key=='OG':
167 record.organelle += line[5:]
168 elif key=='OC':
169 cols = [col for col in value.rstrip(";.").split("; ")]
170 record.organism_classification.extend(cols)
171 elif key=='OX':
172 _read_ox(record, line)
173 elif key=='OH':
174 _read_oh(record, line)
175 elif key=='RN':
176 reference = Reference()
177 _read_rn(reference, value)
178 record.references.append(reference)
179 elif key=='RP':
180 assert record.references, "RP: missing RN"
181 record.references[-1].positions.append(value)
182 elif key=='RC':
183 assert record.references, "RC: missing RN"
184 reference = record.references[-1]
185 unread = _read_rc(reference, value)
186 elif key=='RX':
187 assert record.references, "RX: missing RN"
188 reference = record.references[-1]
189 _read_rx(reference, value)
190 elif key=='RL':
191 assert record.references, "RL: missing RN"
192 reference = record.references[-1]
193 reference.location.append(value)
194
195
196
197 elif key=='RA':
198 assert record.references, "RA: missing RN"
199 reference = record.references[-1]
200 reference.authors.append(value)
201 elif key=='RG':
202 assert record.references, "RG: missing RN"
203 reference = record.references[-1]
204 reference.authors.append(value)
205 elif key=="RT":
206 assert record.references, "RT: missing RN"
207 reference = record.references[-1]
208 reference.title.append(value)
209 elif key=='CC':
210 _read_cc(record, line)
211 elif key=='DR':
212 _read_dr(record, value)
213 elif key=='PE':
214
215 pass
216 elif key=='KW':
217 cols = value.rstrip(";.").split('; ')
218 record.keywords.extend(cols)
219 elif key=='FT':
220 _read_ft(record, line)
221 elif key=='SQ':
222 cols = value.split()
223 assert len(cols) == 7, "I don't understand SQ line %s" % line
224
225 record.seqinfo = int(cols[1]), int(cols[3]), cols[5]
226 elif key==' ':
227 _sequence_lines.append(value.replace(" ", "").rstrip())
228 elif key=='//':
229
230 record.description = " ".join(record.description)
231 record.organism = " ".join(record.organism)
232 record.organelle = record.organelle.rstrip()
233 for reference in record.references:
234 reference.authors = " ".join(reference.authors)
235 reference.title = " ".join(reference.title)
236 reference.location = " ".join(reference.location)
237 record.sequence = "".join(_sequence_lines)
238 return record
239 else:
240 raise ValueError("Unknown keyword '%s' found" % key)
241 if record:
242 raise ValueError("Unexpected end of stream.")
243
244
246 cols = line[5:].split()
247
248
249
250
251
252 if len(cols) == 5:
253 record.entry_name = cols[0]
254 record.data_class = cols[1].rstrip(";")
255 record.molecule_type = cols[2].rstrip(";")
256 record.sequence_length = int(cols[3])
257 elif len(cols) == 4:
258 record.entry_name = cols[0]
259 record.data_class = cols[1].rstrip(";")
260 record.molecule_type = None
261 record.sequence_length = int(cols[2])
262 else:
263 raise ValueError("ID line has unrecognised format:\n"+line)
264
265 allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed')
266 if record.data_class not in allowed:
267 raise ValueError("Unrecognized data class %s in line\n%s" % \
268 (record.data_class, line))
269
270
271 if record.molecule_type not in (None, 'PRT'):
272 raise ValueError("Unrecognized molecule type %s in line\n%s" % \
273 (record.molecule_type, line))
274
275
277 value = line[5:]
278 uprline = value.upper()
279 cols = value.rstrip().split()
280 if 'CREATED' in uprline \
281 or 'LAST SEQUENCE UPDATE' in uprline \
282 or 'LAST ANNOTATION UPDATE' in uprline:
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298 uprcols = uprline.split()
299 rel_index = -1
300 for index in range(len(uprcols)):
301 if uprcols[index].find("REL.") >= 0:
302 rel_index = index
303 assert rel_index >= 0, \
304 "Could not find Rel. in DT line: %s" % line
305 version_index = rel_index + 1
306
307 str_version = cols[version_index].rstrip(",")
308
309 if str_version == '':
310 version = 0
311
312 elif str_version.find(".") >= 0:
313 version = str_version
314
315 else:
316 version = int(str_version)
317 date = cols[0]
318
319 if 'CREATED' in uprline:
320 record.created = date, version
321 elif 'LAST SEQUENCE UPDATE' in uprline:
322 record.sequence_update = date, version
323 elif 'LAST ANNOTATION UPDATE' in uprline:
324 record.annotation_update = date, version
325 else:
326 assert False, "Shouldn't reach this line!"
327 elif 'INTEGRATED INTO' in uprline \
328 or 'SEQUENCE VERSION' in uprline \
329 or 'ENTRY VERSION' in uprline:
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350 try:
351 version = int(cols[-1])
352 except ValueError:
353 version = 0
354 date = cols[0].rstrip(",")
355
356
357
358 if "INTEGRATED" in uprline:
359 record.created = date, version
360 elif 'SEQUENCE VERSION' in uprline:
361 record.sequence_update = date, version
362 elif 'ENTRY VERSION' in uprline:
363 record.annotation_update = date, version
364 else:
365 assert False, "Shouldn't reach this line!"
366 else:
367 raise ValueError("I don't understand the date line %s" % line)
368
369
387
388
401
402
404 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn
405 reference.number = int(rn[1:-1])
406
407
428
429
431
432
433
434
435
436
437
438
439 value = value.replace(' [NCBI, ExPASy, Israel, Japan]','')
440
441
442
443
444
445
446
447
448 if "=" in value:
449 cols = value.split("; ")
450 cols = [x.strip() for x in cols]
451 cols = [x for x in cols if x]
452 for col in cols:
453 x = col.split("=")
454 assert len(x) == 2, "I don't understand RX line %s" % line
455 key, value = x[0], x[1].rstrip(";")
456 reference.references.append((key, value))
457
458 else:
459 cols = value.split("; ")
460
461 assert len(cols) == 2, "I don't understand RX line %s" % line
462 reference.references.append((cols[0].rstrip(";"), cols[1].rstrip(".")))
463
464
475
476
484
485
487 line = line[5:]
488 name = line[0:8].rstrip()
489 try:
490 from_res = int(line[9:15])
491 except ValueError:
492 from_res = line[9:15].lstrip()
493 try:
494 to_res = int(line[16:22])
495 except ValueError:
496 to_res = line[16:22].lstrip()
497 description = line[29:70].rstrip()
498
499 if line[29:35]==r"/FTId=":
500 ft_id = line[35:70].rstrip()[:-1]
501 else:
502 ft_id =""
503 if not name:
504 assert not from_res and not to_res
505 name, from_res, to_res, old_description,old_ft_id = record.features[-1]
506 del record.features[-1]
507 description = "%s %s" % (old_description, description)
508
509
510 if name == "VARSPLIC":
511
512
513
514
515
516 descr_cols = description.split(" -> ")
517 if len(descr_cols) == 2:
518 first_seq, second_seq = descr_cols
519 extra_info = ''
520
521
522 extra_info_pos = second_seq.find(" (")
523 if extra_info_pos != -1:
524 extra_info = second_seq[extra_info_pos:]
525 second_seq = second_seq[:extra_info_pos]
526
527 first_seq = first_seq.replace(" ", "")
528 second_seq = second_seq.replace(" ", "")
529
530 description = first_seq + " -> " + second_seq + extra_info
531 record.features.append((name, from_res, to_res, description,ft_id))
532
533
534 if __name__ == "__main__":
535 print "Quick self test..."
536
537 example_filename = "../../Tests/SwissProt/sp008"
538
539 import os
540 if not os.path.isfile(example_filename):
541 print "Missing test file %s" % example_filename
542 else:
543
544
545 handle = open(example_filename)
546 records = parse(handle)
547 for record in records:
548 print record.entry_name
549 print ",".join(record.accessions)
550 print record.keywords
551 print repr(record.organism)
552 print record.sequence[:20] + "..."
553 handle.close()
554