Package Bio :: Package GenBank :: Module LocationParser
[hide private]
[frames | no frames]

Source Code for Module Bio.GenBank.LocationParser

  1  # First pass at a parser for the location fields of a feature table. 
  2  # Everything likely to change. 
  3  # 
  4  # This does NOT cope with the Gap(), Gap(X), or Gap(unkXXX) tokens used 
  5  # in CONTIG lines, which are otherwise similar to feature locations. 
  6  # 
  7  # Based on the DDBJ/EMBL/GenBank Feature Table Definition Version 2.2 
  8  # Dec 15 1999 available from EBI, but the documentation is not 
  9  # completely internally consistent, much less in agreement with real-life 
 10  # examples.  Conflicts resolved to agree with real examples. 
 11  # 
 12  # This does NOT cope with the Gap(), Gap(X), or Gap(unkXXX) tokens used 
 13  # in CONTIG lines, which are otherwise similar to feature locations. 
 14  # 
 15  # Uses John Aycock's SPARK for parsing 
 16  from Bio.Parsers.spark import GenericScanner, GenericParser 
 17   
class Token:
    """Generic token identified only by its ``type`` string.

    A token compares equal to its type string (and to another token of the
    same type), e.g. ``Token("dot") == "dot"``.
    """

    # Instances define equality, so they are deliberately unhashable.
    __hash__ = None

    def __init__(self, type):
        self.type = type

    def __eq__(self, other):
        # Rich comparison works on both Python 2 and Python 3; __cmp__ alone
        # is ignored by Python 3.
        return self.type == other

    def __ne__(self, other):
        return self.type != other

    def __cmp__(self, other):
        # Only consulted by Python 2 (e.g. via cmp()).  Written without the
        # cmp() builtin so the module does not reference a name that is
        # missing on Python 3.
        return (self.type > other) - (self.type < other)

    def __repr__(self):
        return "Token(%r)" % (self.type,)
# "38"
class Integer:
    """Integer token; ``val`` holds the number, ``type`` names the token kind.

    Comparison is by token type, not by value: ``Integer(38) == "integer"``.
    """

    type = "integer"

    # Instances define equality, so they are deliberately unhashable.
    __hash__ = None

    def __init__(self, val):
        self.val = val

    def __eq__(self, other):
        # Rich comparison works on both Python 2 and Python 3; __cmp__ alone
        # is ignored by Python 3.
        return self.type == other

    def __ne__(self, other):
        return self.type != other

    def __cmp__(self, other):
        # Only consulted by Python 2; avoids the Python 2-only cmp() builtin.
        return (self.type > other) - (self.type < other)

    def __str__(self):
        return str(self.val)

    def __repr__(self):
        # One-element tuple keeps %-formatting safe whatever ``val`` is.
        return "Integer(%s)" % (self.val,)
# From the BNF definition, this isn't needed.  Does that mean
# that bases can be referred to with negative numbers?
class UnsignedInteger(Integer):
    """Integer token whose token kind is ``"unsigned_integer"``."""

    type = "unsigned_integer"

    def __repr__(self):
        return "UnsignedInteger(%s)" % self.val
44
class Symbol:
    """Symbol token; ``name`` holds the text, ``type`` names the token kind.

    Comparison is by token type, not by name: ``Symbol("join") == "symbol"``.
    """

    type = "symbol"

    # Instances define equality, so they are deliberately unhashable.
    __hash__ = None

    def __init__(self, name):
        self.name = name

    def __eq__(self, other):
        # Rich comparison works on both Python 2 and Python 3; __cmp__ alone
        # is ignored by Python 3.
        return self.type == other

    def __ne__(self, other):
        return self.type != other

    def __cmp__(self, other):
        # Only consulted by Python 2; avoids the Python 2-only cmp() builtin.
        return (self.type > other) - (self.type < other)

    def __str__(self):
        return str(self.name)

    def __repr__(self):
        return "Symbol(%s)" % repr(self.name)
# ">38" -- The BNF says ">" is for the lower bound.. seems wrong to me
class LowBound:
    """Wrapper for the base position that follows a ``>`` marker."""

    def __init__(self, base):
        self.base = base

    def __repr__(self):
        # One-element tuple: a bare operand would be unpacked by % if
        # ``base`` happened to be a tuple.
        return "LowBound(%r)" % (self.base,)
# "<38"
class HighBound:
    """Wrapper for the base position that follows a ``<`` marker."""

    def __init__(self, base):
        self.base = base

    def __repr__(self):
        # One-element tuple: a bare operand would be unpacked by % if
        # ``base`` happened to be a tuple.
        return "HighBound(%r)" % (self.base,)
# 12.34
class TwoBound:
    """Holds the two operands of a ``low.high`` position such as ``12.34``."""

    def __init__(self, low, high):
        self.low, self.high = low, high

    def __repr__(self):
        operands = (self.low, self.high)
        return "TwoBound(%r, %r)" % operands
# 12^34
class Between:
    """Holds the two operands of a ``low^high`` position such as ``12^34``."""

    def __init__(self, low, high):
        self.low, self.high = low, high

    def __repr__(self):
        operands = (self.low, self.high)
        return "Between(%r, %r)" % operands
# 12..34
class Range:
    """Holds the two end points of a ``low..high`` range such as ``12..34``."""

    def __init__(self, low, high):
        self.low, self.high = low, high

    def __repr__(self):
        end_points = (self.low, self.high)
        return "Range(%r, %r)" % end_points
93
class Function:
    """Holds a function-style location: an operator ``name`` and its ``args``."""

    def __init__(self, name, args):
        self.name, self.args = name, args

    def __repr__(self):
        parts = (self.name, self.args)
        return "Function(%r, %r)" % parts
100
class AbsoluteLocation:
    """Pairs a ``local_location`` with the ``path`` it belongs to.

    ``path`` may be ``None`` when the location carries no path.
    """

    def __init__(self, path, local_location):
        self.path, self.local_location = path, local_location

    def __repr__(self):
        parts = (self.path, self.local_location)
        return "AbsoluteLocation(%r, %r)" % parts
107
class Path:
    """Holds a ``database`` name (possibly ``None``) and an ``accession``."""

    def __init__(self, database, accession):
        self.database, self.accession = database, accession

    def __repr__(self):
        parts = (self.database, self.accession)
        return "Path(%r, %r)" % parts
114
class FeatureName:
    """Holds a feature ``label`` together with an optional ``path``."""

    def __init__(self, path, label):
        self.path, self.label = path, label

    def __repr__(self):
        parts = (self.path, self.label)
        return "FeatureName(%r, %r)" % parts
121
class LocationScanner(GenericScanner):
    """Scanner that turns a feature-location string into a list of tokens.

    The raw-string "docstring" of every ``t_*`` method is the token's regular
    expression, not documentation (presumably read by SPARK's GenericScanner
    -- confirm against spark.py).  Do not reword those strings; explanatory
    notes belong in ``#`` comments.
    """

    def __init__(self):
        # Run the base-class setup explicitly, mirroring how LocationParser
        # initialises GenericParser.
        GenericScanner.__init__(self)

    def tokenize(self, input):
        # Each t_* callback appends to self.rv; start from a fresh list so
        # one scanner instance can be reused for many inputs.
        self.rv = []
        GenericScanner.tokenize(self, input)
        return self.rv

    def t_double_colon(self, input):
        r" :: "
        self.rv.append(Token("double_colon"))

    def t_double_dot(self, input):
        r" \.\. "
        self.rv.append(Token("double_dot"))

    # A single dot only: the lookahead refuses the first dot of "..".
    def t_dot(self, input):
        r" \.(?!\.) "
        self.rv.append(Token("dot"))

    def t_caret(self, input):
        r" \^ "
        self.rv.append(Token("caret"))

    def t_comma(self, input):
        r" \, "
        self.rv.append(Token("comma"))

    def t_integer(self, input):
        r" -?[0-9]+ "
        self.rv.append(Integer(int(input)))

    def t_unsigned_integer(self, input):
        r" [0-9]+ "
        self.rv.append(UnsignedInteger(int(input)))

    # A single colon only: the lookahead refuses the first colon of "::".
    def t_colon(self, input):
        r" :(?!:) "
        self.rv.append(Token("colon"))

    def t_open_paren(self, input):
        r" \( "
        self.rv.append(Token("open_paren"))

    def t_close_paren(self, input):
        r" \) "
        self.rv.append(Token("close_paren"))

    def t_symbol(self, input):
        r" [A-Za-z0-9_'*-][A-Za-z0-9_'*.-]* "
        # Needed an extra '.'
        self.rv.append(Symbol(input))

    def t_less_than(self, input):
        r" < "
        self.rv.append(Token("less_than"))

    def t_greater_than(self, input):
        r" > "
        self.rv.append(Token("greater_than"))

    # punctuation .. hmm, isn't needed for location
    #    r''' [ !#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~] '''
class LocationParser(GenericParser):
    """Grammar that builds location objects from a scanned token list.

    The docstring of every ``p_*`` method holds grammar productions
    (``lhs ::= rhs ...``), not documentation (presumably read by SPARK's
    GenericParser -- confirm against spark.py).  Do not reword those strings;
    explanatory notes belong in ``#`` comments.  Each method receives the
    matched right-hand-side values in ``args`` and returns the value for the
    left-hand side.
    """

    def __init__(self, start='location'):
        GenericParser.__init__(self, start)
        # NOTE(review): not read anywhere in this module; kept in case other
        # code relies on the attribute.
        self.begin_pos = 0

    def p_location(self, args):
        """
        location ::= absolute_location
        location ::= feature_name
        location ::= function
        """
        return args[0]

    # args: operator symbol, "(", list of locations, ")".
    def p_function(self, args):
        """
        function ::= functional_operator open_paren location_list close_paren
        """
        return Function(args[0].name, args[2])

    def p_absolute_location(self, args):
        """
        absolute_location ::= local_location
        absolute_location ::= path colon local_location
        """
        if len(args) == 1:
            # No "path:" prefix was present.
            return AbsoluteLocation(None, args[-1])
        return AbsoluteLocation(args[0], args[-1])

    def p_path(self, args):
        """
        path ::= database double_colon primary_accession
        path ::= primary_accession
        """
        if len(args) == 3:
            return Path(args[0], args[2])
        # Accession only, no "database::" prefix.
        return Path(None, args[0])

    def p_feature_name(self, args):
        """
        feature_name ::= path colon feature_label
        feature_name ::= feature_label
        """
        if len(args) == 3:
            return FeatureName(args[0], args[2])
        return FeatureName(None, args[0])

    # The left-hand side has to be "feature_label", the name used by the
    # feature_name productions; under any other name that nonterminal would
    # have no production and feature_name could never be derived.
    def p_feature_label(self, args):
        """
        feature_label ::= symbol
        """
        return args[0].name

    def p_local_location(self, args):
        """
        local_location ::= base_position
        local_location ::= between_position
        local_location ::= base_range
        """
        return args[0]

    def p_location_list(self, args):
        """
        location_list ::= location
        location_list ::= location_list comma location
        """
        if len(args) == 1:
            # args is already the one-element list [location].
            return args
        # Extend the list built so far with the location after the comma.
        return args[0] + [args[2]]

    def p_functional_operator(self, args):
        """
        functional_operator ::= symbol
        """
        return args[0]

    def p_base_position(self, args):
        """
        base_position ::= integer
        base_position ::= low_base_bound
        base_position ::= high_base_bound
        base_position ::= two_base_bound
        """
        return args[0]

    def p_low_base_bound(self, args):
        """
        low_base_bound ::= greater_than integer
        """
        return LowBound(args[1])

    def p_high_base_bound(self, args):
        """
        high_base_bound ::= less_than integer
        """
        return HighBound(args[1])

    def p_two_base_bound_1(self, args):
        """
        two_base_bound ::= open_paren base_position dot base_position close_paren
        """
        # main example doesn't have parens but others do.. (?)
        return TwoBound(args[1], args[3])

    def p_two_base_bound_2(self, args):
        """
        two_base_bound ::= base_position dot base_position
        """
        # two_base_bound with no parentheses like 1.6
        return TwoBound(args[0], args[2])

    def p_between_position(self, args):
        """
        between_position ::= base_position caret base_position
        """
        return Between(args[0], args[2])

    def p_base_range(self, args):
        """
        base_range ::= base_position double_dot base_position
        base_range ::= function double_dot base_position
        base_range ::= base_position double_dot function
        base_range ::= function double_dot function
        """
        return Range(args[0], args[2])

    def p_database(self, args):
        """
        database ::= symbol
        """
        return args[0].name

    def p_primary_accession(self, args):
        """
        primary_accession ::= symbol
        """
        return args[0].name
# One scanner instance is created at import time and shared by all calls.
_cached_scanner = LocationScanner()


def scan(input):
    """Tokenize a location string with the shared module-level scanner.

    Returns whatever ``LocationScanner.tokenize`` returns for ``input``.
    """
    tokens = _cached_scanner.tokenize(input)
    return tokens
# One parser instance is created at import time and shared by all calls.
_cached_parser = LocationParser()


def parse(tokens):
    """Turn a token sequence into its object representation.

    Delegates to the shared module-level ``LocationParser`` and returns
    whatever its ``parse`` method returns for ``tokens``.
    """
    tree = _cached_parser.parse(tokens)
    return tree
326