Package Bio :: Package UniGene :: Module UniGene
[hide private]
[frames | no frames]

Source Code for Module Bio.UniGene.UniGene

  1   
  2  # Permission to use, copy, modify, and distribute this software and 
  3  # its documentation with or without modifications and for any purpose 
  4  # and without fee is hereby granted, provided that any copyright 
  5  # notices appear in all copies and that both those copyright notices 
  6  # and this permission notice appear in supporting documentation, and 
  7  # that the names of the contributors or copyright holders not be used 
  8  # in advertising or publicity pertaining to distribution of the software 
  9  # without specific prior permission. 
 10  # 
 11  # THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL 
 12  # WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED 
 13  # WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE 
 14  # CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT 
 15  # OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 
 16  # LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, 
 17  # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION 
 18  # WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
 19   
 20  import string 
 21  import operator 
 22  import urllib 
 23  import sgmllib 
 24  import UserDict 
 25  import Bio.File 
 26   
 27   
 28   
class UniGeneParser(sgmllib.SGMLParser):
    """Extract the contents of a UniGene cluster HTML page into a mapping.

    The parser is a small state machine driven by the ``<a>``, ``<b>``,
    ``<table>``, ``<tr>`` and ``<td>`` tags.  Its state lives in these
    attributes, all (re)initialised by ``reset``:

    text            character data collected since it was last cleared
    queue           the result: a UserDict mapping each section heading to
                    a UserDict of ``key -> value`` (a string, or a list of
                    strings when a key occurs more than once), plus a
                    'UniGene Cluster' entry holding a plain string
    open_tag        marker for the innermost open tag ('open_html' when
                    none is open)
    open_tag_stack  the markers of the enclosing tags
    key_waiting     key under which the text of the current row is stored
    master_key      heading of the section currently being filled
    context         'general_info', 'seq_info' or 'legend'
    """

    def reset(self):
        """Reset the SGML parser and all extraction state."""
        sgmllib.SGMLParser.reset(self)
        self.text = ''
        self.queue = UserDict.UserDict()
        self.open_tag_stack = []
        self.open_tag = 'open_html'
        self.key_waiting = ''
        self.master_key = ''
        self.context = 'general_info'

    def parse(self, handle):
        """Parse one record from ``handle`` and return the result mapping.

        After feeding the record, sections that stayed empty are removed
        from the result.  If the heading of such an empty section starts
        with 'UniGene Cluster', the remainder of the heading is kept as
        the 'UniGene Cluster' entry.
        """
        self.reset()
        self.feed(handle)
        # Iterate over a snapshot: entries are added and deleted in the loop.
        for key in list(self.queue.keys()):
            if self.queue[key] == {}:
                if key.startswith('UniGene Cluster'):
                    self.queue['UniGene Cluster'] = key[16:]
                del self.queue[key]
        return self.queue

    def feed(self, handle):
        """Read one record from ``handle`` and run it through the parser.

        Lines are read until the first line that is empty after stripping
        (which includes end of file), so records are assumed to be
        separated by an empty line.  Each line is stripped and prefixed
        with a single space before the record is handed to SGMLParser as
        one string.

        ``handle`` may be a Bio.File.UndoHandle or any object with a
        ``readline`` method, which is then wrapped in an UndoHandle.
        """
        if isinstance(handle, Bio.File.UndoHandle):
            uhandle = handle
        else:
            uhandle = Bio.File.UndoHandle(handle)
        lines = []
        while True:
            line = uhandle.readline().strip()
            if not line:
                break
            lines.append(line)
        # One join instead of repeated string concatenation.
        text = ''.join([' ' + line for line in lines])
        sgmllib.SGMLParser.feed(self, text)

    def handle_data(self, newtext):
        """Append the stripped character data to the collected text."""
        self.text = self.text + newtext.strip()

    def _pop_open_tag(self):
        """Make the enclosing tag the current open tag again."""
        try:
            self.open_tag = self.open_tag_stack.pop()
        except IndexError:
            # More closing than opening tags: fall back to document level.
            self.open_tag = 'open_html'

    def start_a(self, attrs):
        """Start collecting link text afresh in the sequence section."""
        if self.context == 'seq_info' and self.open_tag != 'open_b':
            self.text = ''

    def end_a(self):
        """Use the link text as the row key in the sequence section.

        Only applies outside ``<b>`` and when no key is waiting yet.
        """
        if (self.context == 'seq_info' and self.open_tag != 'open_b'
                and self.key_waiting == ''):
            self.key_waiting = self.text
            self.text = ''

    def start_b(self, attrs):
        """Enter a bold element; its text is a candidate key or heading."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_b'
        if self.key_waiting == '':
            self.text = ''

    def end_b(self):
        """Record the cluster name, or turn the bold text into a key."""
        if self.text.startswith('UniGene Cluster'):
            self.queue['UniGene Cluster'] = self.text[16:]
            self.text = ''
        elif self.key_waiting == '':
            # extract_key also restores the enclosing open tag.
            self.extract_key()

    def extract_key(self):
        """Interpret the collected bold text as a row key or a heading.

        The key is the first two whitespace-separated words of the text.
        Bold text directly inside a table cell becomes the waiting row key
        (general section) or, when it reads 'Key to Symbols', switches the
        sequence section over to the legend.  Bold text anywhere else
        opens a new section in ``queue``; a heading containing 'SEQUENCE'
        switches the general section over to the sequence section.
        """
        text = self.text.strip()
        key = ' '.join(text.split()[:2])
        self.text = ''

        self._pop_open_tag()
        if self.open_tag == 'open_table_data':
            if self.context == 'general_info':
                if self.key_waiting == '':
                    self.key_waiting = key
            elif self.context == 'seq_info':
                if text == 'Key to Symbols':
                    self.context = 'legend'
                    self.master_key = key
        elif self.context == 'general_info':
            self.master_key = key
            if 'SEQUENCE' in key:
                self.context = 'seq_info'
            self.queue[key] = UserDict.UserDict()
        elif self.context == 'seq_info':
            self.queue[key] = UserDict.UserDict()
            self.master_key = key

    def start_table(self, attrs):
        """Enter a table."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_table'

    def end_table(self):
        """Leave a table and forget any key that is still waiting."""
        self._pop_open_tag()
        self.key_waiting = ''

    def start_tr(self, attrs):
        """Enter a table row and start collecting its text afresh."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_table_row'
        self.text = ''

    def end_tr(self):
        """Leave a table row and store its text under the waiting key.

        A leading ':' is dropped and runs of whitespace are collapsed.  In
        the general and sequence sections the text is stored in the
        current section; a second value for the same key turns the entry
        into a list.  Raises KeyError when ``master_key`` does not name a
        section in ``queue``.
        """
        self._pop_open_tag()
        text = self.text
        if not text:
            return
        self.text = ''
        if text[0] == ':':
            text = text[1:]
        text = ' '.join(text.split())
        if self.context not in ('general_info', 'seq_info'):
            return
        section = self.queue[self.master_key]
        try:
            contents = section[self.key_waiting]
        except KeyError:
            # First value seen for this key.
            section[self.key_waiting] = text
        else:
            if isinstance(contents, list):
                contents.append(text)
            else:
                section[self.key_waiting] = [contents, text]
        # NOTE(review): the key is cleared only after a row has been stored;
        # the listing this was recovered from had lost its indentation, so
        # confirm this nesting against the upstream file.
        self.key_waiting = ''

    def start_td(self, attrs):
        """Enter a table cell."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_table_data'

    def end_td(self):
        """Leave a table cell; cells are space-separated in sequence rows."""
        self._pop_open_tag()
        if self.context == 'seq_info':
            self.text = self.text + ' '

    def print_item(self, item, level=1):
        """Print ``item`` indented by ``level``, recursing into containers.

        Strings are printed unless empty, lists and UserDicts are printed
        element by element one level deeper, anything else is printed
        as is without indentation.
        """
        indent = ' ' + ' ' * level
        if isinstance(item, str):
            if item != '':
                print('%s%s' % (indent, item))
        elif isinstance(item, list):
            for subitem in item:
                self.print_item(subitem, level + 1)
        elif isinstance(item, UserDict.UserDict):
            for subitem in item.keys():
                print('%skey is %s' % (indent, subitem))
                self.print_item(item[subitem], level + 1)
        else:
            print(item)

    def print_tags(self):
        """Print every entry of the result mapping."""
        for key in self.queue.keys():
            print('key %s' % key)
            self.print_item(self.queue[key])
if __name__ == '__main__':
    # Demo: parse a saved UniGene page and print everything extracted.
    handle = open('Hs13225.htm')
    try:
        unigene_parser = UniGeneParser()
        # parse() accepts a plain file handle, so no explicit UndoHandle
        # wrapper is needed here.
        unigene_parser.parse(handle)
        unigene_parser.print_tags()
    finally:
        # Close the file even if parsing fails.
        handle.close()