1
2
3
4
5
6
7
8
9
10
11
12
13 """Parse Unigene flat file format files such as the Hs.data file.
14
15 Here is an overview of the flat file format that this parser deals with:
16 Line types/qualifiers:
17
18 ID UniGene cluster ID
19 TITLE Title for the cluster
20 GENE Gene symbol
21 CYTOBAND Cytological band
22 EXPRESS Tissues of origin for ESTs in cluster
23 RESTR_EXPR Single tissue or development stage contributes
24 more than half the total EST frequency for this gene.
25 GNM_TERMINUS genomic confirmation of presence of a 3' terminus;
26 T if a non-templated polyA tail is found among
27 a cluster's sequences; else
28 I if templated As are found in genomic sequence or
29 S if a canonical polyA signal is found on
30 the genomic sequence
31 GENE_ID Entrez gene identifier associated with at least one
32 sequence in this cluster;
33 to be used instead of LocusLink.
34 LOCUSLINK LocusLink identifier associated with at least one
35 sequence in this cluster;
36 deprecated in favor of GENE_ID
37 HOMOL Homology;
38 CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping
39 on the arabidopsis genome.
40 STS STS
41 ACC= GenBank/EMBL/DDBJ accession number of STS
42 [optional field]
43 UNISTS= identifier in NCBI's UNISTS database
44 TXMAP Transcript map interval
45 MARKER= Marker found on at least one sequence in this
46 cluster
47 RHPANEL= Radiation Hybrid panel used to place marker
48 PROTSIM Protein Similarity data for the sequence with
49 highest-scoring protein similarity in this cluster
50 ORG= Organism
51 PROTGI= Sequence GI of protein
52 PROTID= Sequence ID of protein
53 PCT= Percent alignment
54 ALN= length of aligned region (aa)
55 SCOUNT Number of sequences in the cluster
56 SEQUENCE Sequence
57 ACC= GenBank/EMBL/DDBJ accession number of sequence
58 NID= Unique nucleotide sequence identifier (gi)
59 PID= Unique protein sequence identifier (used for
60 non-ESTs)
61 CLONE= Clone identifier (used for ESTs only)
62 END= End (5'/3') of clone insert read (used for
63 ESTs only)
64 LID= Library ID; see Hs.lib.info for library name
65 and tissue
66 MGC= 5' CDS-completeness indicator; if present, the
67 clone associated with this sequence is believed
68 CDS-complete. A value greater than 511 is the gi
69 of the CDS-complete mRNA matched by the EST,
70 otherwise the value is an indicator of the
71 reliability of the test indicating CDS
72 completeness; higher values indicate more
73 reliable CDS-completeness predictions.
74 SEQTYPE= Description of the nucleotide sequence.
75 Possible values are mRNA, EST and HTC.
76 TRACE= The Trace ID of the EST sequence, as provided by
77 NCBI Trace Archive
78 """
79
80
82 """Store the information for one SEQUENCE line from a Unigene file
83
84 Initialize with the text part of the SEQUENCE line, or nothing.
85
86 Attributes and descriptions (access as LOWER CASE)
87 ACC= GenBank/EMBL/DDBJ accession number of sequence
88 NID= Unique nucleotide sequence identifier (gi)
89 PID= Unique protein sequence identifier (used for non-ESTs)
90 CLONE= Clone identifier (used for ESTs only)
91 END= End (5'/3') of clone insert read (used for ESTs only)
92 LID= Library ID; see Hs.lib.info for library name and tissue
93 MGC= 5' CDS-completeness indicator; if present,
94 the clone associated with this sequence
95 is believed CDS-complete. A value greater than 511
96 is the gi of the CDS-complete mRNA matched by the EST,
97 otherwise the value is an indicator of the reliability
98 of the test indicating CDS completeness;
99 higher values indicate more reliable CDS-completeness
100 predictions.
101 SEQTYPE= Description of the nucleotide sequence. Possible values
102 are mRNA, EST and HTC.
103 TRACE= The Trace ID of the EST sequence, as provided by NCBI
104 Trace Archive
105 """
106
108 self.acc = ''
109 self.nid = ''
110 self.lid = ''
111 self.pid = ''
112 self.clone = ''
113 self.image = ''
114 self.is_image = False
115 self.end = ''
116 self.mgc = ''
117 self.seqtype = ''
118 self.trace = ''
119 if not text==None:
120 self.text=text
121 self._init_from_text(text)
122
123 - def _init_from_text(self,text):
124 parts = text.split('; ');
125 for part in parts:
126 key, val = part.split("=")
127 if key=='CLONE':
128 if val[:5]=='IMAGE':
129 self.is_image=True
130 self.image = val[6:]
131 setattr(self,key.lower(),val)
132
135
136
138 """Store the information for one PROTSIM line from a Unigene file
139
140 Initialize with the text part of the PROTSIM line, or nothing.
141
142 Attributes and descriptions (access as LOWER CASE)
143 ORG= Organism
144 PROTGI= Sequence GI of protein
145 PROTID= Sequence ID of protein
146 PCT= Percent alignment
147 ALN= length of aligned region (aa)
148 """
149
151 self.org = ''
152 self.protgi = ''
153 self.protid = ''
154 self.pct = ''
155 self.aln = ''
156 if not text==None:
157 self.text=text
158 self._init_from_text(text)
159
160 - def _init_from_text(self,text):
161 parts = text.split('; ');
162
163 for part in parts:
164 key, val = part.split("=")
165 setattr(self,key.lower(),val)
166
169
170
172 """Store the information for one STS line from a Unigene file
173
174 Initialize with the text part of the STS line, or nothing.
175
176 Attributes and descriptions (access as LOWER CASE)
177
178 ACC= GenBank/EMBL/DDBJ accession number of STS [optional field]
179 UNISTS= identifier in NCBI's UNISTS database
180 """
181
188
189 - def _init_from_text(self,text):
190 parts = text.split(' ');
191
192 for part in parts:
193 key, val = part.split("=")
194 setattr(self,key.lower(),val)
195
198
199
201 """Store a Unigene record
202
203 Here is what is stored:
204
205 self.ID = '' # ID line
206 self.species = '' # Hs, Bt, etc.
207 self.title = '' # TITLE line
208 self.symbol = '' # GENE line
209 self.cytoband = '' # CYTOBAND line
210 self.express = [] # EXPRESS line, parsed on ';'
211 # Will be an array of strings
212 self.restr_expr = '' # RESTR_EXPR line
213 self.gnm_terminus = '' # GNM_TERMINUS line
214 self.gene_id = '' # GENE_ID line
215 self.locuslink = '' # LOCUSLINK line
216 self.homol = '' # HOMOL line
217 self.chromosome = '' # CHROMOSOME line
218 self.protsim = [] # PROTSIM entries, array of Protsims
219 # Type ProtsimLine
220 self.sequence = [] # SEQUENCE entries, array of Sequence entries
221 # Type SequenceLine
222 self.sts = [] # STS entries, array of STS entries
223 # Type STSLine
224 self.txmap = [] # TXMAP entries, array of TXMap entries
225 """
226
228 self.ID = ''
229 self.species = ''
230 self.title = ''
231 self.symbol = ''
232 self.cytoband = ''
233 self.express = []
234 self.restr_expr = ''
235 self.gnm_terminus = ''
236 self.gene_id = ''
237 self.locuslink = ''
238 self.homol = ''
239 self.chromosome = ''
240 self.protsim = []
241 self.sequence = []
242 self.sts = []
243 self.txmap = []
244
246 return "<%s> %s %s\n%s" % (self.__class__.__name__,
247 self.ID, self.symbol, self.title)
248
249
256
257
267
268
269
270
271
324
325
326
327
328
329 from Bio.ParserSupport import *
330 import re
331
332
333
334
335 UG_INDENT=12
336
338 """Store the information for one SEQUENCE line from a Unigene file
339
340 Initialize with the text part of the SEQUENCE line, or nothing.
341
342 Attributes and descriptions (access as LOWER CASE)
343 ACC= GenBank/EMBL/DDBJ accession number of sequence
344 NID= Unique nucleotide sequence identifier (gi)
345 PID= Unique protein sequence identifier (used for non-ESTs)
346 CLONE= Clone identifier (used for ESTs only)
347 END= End (5'/3') of clone insert read (used for ESTs only)
348 LID= Library ID; see Hs.lib.info for library name and tissue
349 MGC= 5' CDS-completeness indicator; if present,
350 the clone associated with this sequence
351 is believed CDS-complete. A value greater than 511
352 is the gi of the CDS-complete mRNA matched by the EST,
353 otherwise the value is an indicator of the reliability
354 of the test indicating CDS completeness;
355 higher values indicate more reliable CDS-completeness predictions.
356 SEQTYPE= Description of the nucleotide sequence. Possible values are
357 mRNA, EST and HTC.
358 TRACE= The Trace ID of the EST sequence, as provided by NCBI Trace Archive
359 PERIPHERAL= Indicator that the sequence is a suboptimal
360 representative of the gene represented by this cluster.
361 Peripheral sequences are those that are in a cluster
362 which represents a spliced gene without sharing a
363 splice junction with any other sequence. In many
364 cases, they are unspliced transcripts originating
365 from the gene.
366 """
367
369 self.acc = ''
370 self.nid = ''
371 self.lid = ''
372 self.pid = ''
373 self.clone = ''
374 self.image = ''
375 self.is_image = False
376 self.end = ''
377 self.mgc = ''
378 self.seqtype = ''
379 self.Trace = ''
380 self.peripheral = ''
381 if not text==None:
382 self.text=text
383 return self._init_from_text(text)
384
385 - def _init_from_text(self,text):
386 parts = text.split('; ');
387 for part in parts:
388 key,val = re.match('(\w+)=(\S+)',part).groups()
389 if key=='CLONE':
390 if val[:5]=='IMAGE':
391 self.is_image=True
392 self.image = val[6:]
393 setattr(self,key.lower(),val)
394
397
398
400 """Store the information for one PROTSIM line from a Unigene file
401
402 Initialize with the text part of the PROTSIM line, or nothing.
403
404 Attributes and descriptions (access as LOWER CASE)
405 ORG= Organism
406 PROTGI= Sequence GI of protein
407 PROTID= Sequence ID of protein
408 PCT= Percent alignment
409 ALN= length of aligned region (aa)
410 """
411
413 self.org = ''
414 self.protgi = ''
415 self.protid = ''
416 self.pct = ''
417 self.aln = ''
418 if not text==None:
419 self.text=text
420 return self._init_from_text(text)
421
422 - def _init_from_text(self,text):
423 parts = text.split('; ');
424
425 for part in parts:
426 key,val = re.match('(\w+)=(\S+)',part).groups()
427 setattr(self,key.lower(),val)
428
431
432
434 """Store the information for one STS line from a Unigene file
435
436 Initialize with the text part of the STS line, or nothing.
437
438 Attributes and descriptions (access as LOWER CASE)
439
440 NAME= Name of STS
441 ACC= GenBank/EMBL/DDBJ accession number of STS [optional field]
442 DSEG= GDB Dsegment number [optional field]
443 UNISTS= identifier in NCBI's UNISTS database
444 """
445
454
455 - def _init_from_text(self,text):
456 parts = text.split(' ');
457
458 for part in parts:
459 key,val = re.match('(\w+)=(\S+)',part).groups()
460 setattr(self,key.lower(),val)
461
464
465
467 """Store a Unigene record
468
469 Here is what is stored:
470
471 self.ID = '' # ID line
472 self.species = '' # Hs, Bt, etc.
473 self.title = '' # TITLE line
474 self.symbol = '' # GENE line
475 self.cytoband = '' # CYTOBAND line
476 self.express = [] # EXPRESS line, parsed on ';'
477 # Will be an array of strings
478 self.restr_expr = '' # RESTR_EXPR line
479 self.gnm_terminus = '' # GNM_TERMINUS line
480 self.gene_id = '' # GENE_ID line
481 self.chromosome = '' # CHROMOSOME
482 self.protsim = [] # PROTSIM entries, array of Protsims
483 # Type UnigeneProtsimRecord
484 self.sequence = [] # SEQUENCE entries, array of Sequence entries
485 # Type UnigeneSequenceRecord
486 self.sts = [] # STS entries, array of STS entries
487 # Type UnigeneSTSRecord
488 self.txmap = [] # TXMAP entries, array of TXMap entries
489 """
490
492 self.ID = ''
493 self.species = ''
494 self.title = ''
495 self.symbol = ''
496 self.cytoband = ''
497 self.express = []
498 self.restr_expr = ''
499 self.gnm_terminus = ''
500 self.gene_id = ''
501 self.chromosome = ''
502 self.protsim = []
503 self.sequence = []
504 self.sts = []
505 self.txmap = []
506
508 return "<%s> %s %s\n%s" % (self.__class__.__name__,
509 self.ID, self.symbol, self.title)
510
511
513
521 - def GENE(self,line):
539 - def STS(self,line):
542
543
def _get_single_entry(self, line):
    """Return the value portion of a single-value line.

    The value is everything in line from column UG_INDENT onward,
    returned unchanged.
    """
    value = line[UG_INDENT:]
    return value
548
def _get_array_entry(self, line, split_on):
    """Return the values of a multi-value line as a list.

    The text of line from column UG_INDENT onward is split on the
    separator split_on and the resulting list of strings is returned.
    """
    payload = line[UG_INDENT:]
    return payload.split(split_on)
553
554
556 """Scans a Unigene Flat File Format file
557 """
558
def feed(self, handle, consumer):
    """feed(self, handle, consumer)

    Feed events from parsing a Unigene file to a consumer.
    handle is a file-like object (iterated line by line), and consumer
    is a consumer object that will receive events as the file is
    scanned.

    consumer.start_record() is called once before scanning.  Each
    non-blank line is right-stripped and passed to the consumer
    attribute named after the line's tag (the text before the first
    space), provided that attribute is callable.  A '//' line calls
    consumer.end_record() and stops the scan; if the handle is
    exhausted first, end_record() is not called.  A tag with no
    matching consumer attribute is reported on standard output and the
    line is skipped.
    """
    consumer.start_record()
    for line in handle:
        # Strip first: taking the tag from the raw line leaves the
        # newline attached to tags of lines that contain no space, so
        # they could never match a consumer method.
        line = line.rstrip()
        if not line:
            # Blank lines carry no tag; nothing to dispatch.
            continue
        if line == '//':
            consumer.end_record()
            break
        tag = line.split(' ')[0]
        try:
            f = getattr(consumer, tag)
        except AttributeError:
            # Single-argument form prints identically on Python 2 and 3.
            print('no method called %s' % tag)
        else:
            if callable(f):
                f(line)
581
582
587
588 - def parse(self, handle):
595
597 - def __init__(self, handle, parser=None):
599
601 self._parser = RecordParser()
602 lines = []
603 while True:
604 line = self._uhandle.readline()
605 if not line: break
606 if line[:2] == '//':
607 break
608 lines.append(line)
609 if not lines:
610 return None
611 lines.append('//')
612 data = ''.join(lines)
613 if self._parser is not None:
614 return self._parser.parse(File.StringHandle(data))
615 return data
616
618 return iter(self.next, None)
619