Package Bio :: Package MEME :: Module Parser
[hide private]
[frames] | no frames]

Source Code for Module Bio.MEME.Parser

  1  # Copyright 2004 by Jason A. Hackney.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  from Bio.Alphabet import IUPAC 
  7  from Bio import File 
  8  from Bio.ParserSupport import * 
  9  from Bio import Seq 
 10  from Bio.MEME import Motif 
 11  import re 
 12   
13 -class MEMERecord:
14 """A class for holding the results of a MEME run. 15 16 A MEMERecord is an object that holds the results from running 17 MEME. It implements no methods of its own. 18 19 """
20 - def __init__ (self):
21 """__init__ (self)""" 22 self.motifs = [] 23 self.version = "" 24 self.datafile = "" 25 self.command = "" 26 self.alphabet = None 27 self.sequence_names = []
28
29 - def get_motif_by_name (self, name):
30 for m in self.motifs: 31 if m.name == name: 32 return m
33
34 -class MEMEParser (AbstractParser):
35 """A parser for the text output of the MEME program. 36 Parses the output into an object of the MEMERecord class. 37 38 Methods: 39 parse (handle): parses the contents of the file handle passed to it. 40 41 Example: 42 43 f = open("meme.output.txt") 44 parser = MEMEParser() 45 meme_record = parser.parse(f) 46 for motif in meme_record.motifs: 47 for instance in motif.instances: 48 print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue 49 50 """
51 - def __init__ (self):
52 """__init__ (self)""" 53 self._scanner = _MEMEScanner() 54 self._consumer = _MEMEConsumer()
55
56 - def parse (self, handle):
57 """parse (self, handle)""" 58 self._scanner.feed(handle, self._consumer) 59 return self._consumer.data
60 61 62
63 -class _MEMEScanner:
64 """Scanner for MEME output. 65 66 Methods: 67 feed 68 69 """ 70
71 - def feed (self, handle, consumer):
72 """ 73 Feeds in MEME output for scanning. handle should 74 implement the readline method. consumer is 75 a Consumer object that can receive the salient events. 76 """ 77 if isinstance(handle, File.UndoHandle): 78 uhandle = handle 79 else: 80 uhandle = File.UndoHandle(handle) 81 82 self._scan_header(uhandle, consumer) 83 self._scan_motifs (uhandle, consumer)
84
85 - def _scan_header(self, uhandle, consumer):
86 try: 87 read_and_call_until(uhandle, consumer.noevent, contains = 'MEME version') 88 except ValueError: 89 raise ValueError("Improper input file. File should contain a line starting MEME version.") 90 read_and_call(uhandle, consumer._version, start = 'MEME version') 91 read_and_call_until(uhandle, consumer.noevent, start = 'TRAINING SET') 92 read_and_call(uhandle, consumer.noevent, start = 'TRAINING SET') 93 read_and_call(uhandle, consumer.noevent, start = '****') 94 read_and_call(uhandle, consumer._datafile, start = 'DATAFILE') 95 read_and_call(uhandle, consumer._alphabet, start = 'ALPHABET') 96 read_and_call(uhandle, consumer.noevent, start = 'Sequence name') 97 read_and_call(uhandle, consumer.noevent, start = '----') 98 read_and_call_until(uhandle, consumer._sequence_name, start = '***') 99 read_and_call_until(uhandle, consumer.noevent, start = 'command:') 100 read_and_call(uhandle, consumer._commandline, start = 'command:') 101 read_and_call_until(uhandle, consumer.noevent, start = 'MOTIF 1')
102
103 - def _scan_motifs(self, uhandle, consumer):
104 while 1: 105 read_and_call(uhandle, consumer._add_motif_with_info, start = 'MOTIF') 106 read_and_call_until(uhandle, consumer.noevent, contains = 'sorted by position p-value') 107 read_and_call(uhandle, consumer.motif_name, contains = 'sorted by position p-value') 108 read_and_call(uhandle, consumer.noevent, start = '---') 109 read_and_call(uhandle, consumer.noevent, start = 'Sequence name') 110 read_and_call(uhandle, consumer.noevent, start = '---') 111 read_and_call_until(uhandle, consumer.add_instance, start = '---') 112 read_and_call_until(uhandle, consumer.noevent, start = 'log-odds matrix') 113 read_and_call(uhandle, consumer.noevent) 114 read_and_call_until(uhandle, consumer.add_to_logodds, start = '---') 115 read_and_call_until(uhandle, consumer.noevent, start = 'letter-probability matrix') 116 read_and_call(uhandle, consumer.noevent, start = 'letter-probability matrix') 117 read_and_call_until(uhandle, consumer.add_to_pssm, start = '---') 118 read_and_call_until(uhandle, consumer.noevent, start = 'Time') 119 read_and_call(uhandle, consumer.noevent, start = 'Time') 120 read_and_call(uhandle, consumer.noevent, blank = 1) 121 read_and_call(uhandle, consumer.noevent, start = '***') 122 read_and_call_while(uhandle, consumer.noevent, blank = 1) 123 read_and_call(uhandle, consumer.noevent, start = '***') 124 line = safe_peekline(uhandle) 125 if line.startswith("SUMMARY OF MOTIFS"): 126 break
127 128 129
130 -class _MEMEConsumer:
131 """ 132 Consumer that can receive events from MEME Scanner. 133 134 This is the Consumer object that should be passed to the 135 MEME Scanner. 136 """ 137
138 - def __init__ (self):
139 self.current_motif = None 140 self.sequence_names = [] 141 self.data = MEMERecord()
142
143 - def _version (self, line):
144 line = line.strip() 145 ls = line.split() 146 self.data.version = ls[2]
147
148 - def _datafile (self, line):
149 line = line.strip() 150 line = line.replace('DATAFILE= ','') 151 self.data.datafile = line
152
153 - def _alphabet (self, line):
154 line = line.strip() 155 line = line.replace('ALPHABET= ','') 156 if line == 'ACGT': 157 al = IUPAC.unambiguous_dna 158 else: 159 al = IUPAC.protein 160 self.data.alphabet = al
161
162 - def _sequence_name (self, line):
163 line = line.strip() 164 ls = line.split() 165 self.data.sequence_names.append(ls[0]) 166 if len(ls) == 6: 167 self.data.sequence_names.append(ls[3])
168
169 - def _commandline (self, line):
170 line = line.strip() 171 line = line.replace('command: ','') 172 self.data.command = line
173
174 - def _add_motif_with_info (self, line):
175 line = line.strip() 176 ls = line.split() 177 motif = Motif.MEMEMotif() 178 motif._length(ls[4]) 179 motif._numoccurrences(ls[7]) 180 motif._evalue(ls[13]) 181 motif._alphabet(self.data.alphabet) 182 self.data.motifs.append(motif) 183 self.current_motif = motif
184
185 - def motif_name (self, line):
186 line = line.strip() 187 ls = line.split() 188 name = ' '.join(ls[0:2]) 189 self.current_motif._name(name)
190
191 - def add_instance (self, line):
192 line = line.strip() 193 ls = line.split() 194 if self.data.command.find('revcomp') != -1: 195 seq = Seq.Seq(ls[5], self.data.alphabet) 196 self.current_motif.add_instance_from_values(name = ls[0], sequence = seq, start = ls[2], pvalue = ls[3], strand = ls[1]) 197 else: 198 seq = Seq.Seq(ls[4], self.data.alphabet) 199 self.current_motif.add_instance_from_values(name = ls[0], sequence = seq, start = ls[1], pvalue = ls[2])
200
201 - def add_to_pssm (self, line):
202 line = line.strip() 203 sl = line.split() 204 thisposition = tuple([float(i) for i in sl]) 205 self.current_motif.add_to_pssm(thisposition)
206
207 - def add_to_logodds (self, line):
208 line = line.strip() 209 sl = line.split() 210 thisposition = tuple([float(i) for i in sl]) 211 self.current_motif.add_to_logodds(thisposition)
212
213 - def noevent (self,line):
214 pass
215 216 217
218 -class _MASTConsumer:
219 """ 220 Consumer that can receive events from _MASTScanner. 221 222 A _MASTConsumer parses lines from a mast text output file. 223 The motif match diagrams are parsed using line buffering. 224 Each of the buffering functions have a dummy variable that is 225 required for testing using the Bio.ParserSupport.TaggingConsumer. 226 If this variable isn't there, the TaggingConsumer barfs. In 227 the _MASTScanner, None is passed in the place of this variable. 228 """
229 - def __init__ (self):
230 self.data = MASTRecord() 231 self._current_seq = "" 232 self._line_buffer = [] 233 self._buffer_size = 0 234 self._buffered_seq_start = 0
235
236 - def _version (self, line):
237 line = line.strip() 238 ls = line.split() 239 self.data._version(ls[2])
240
241 - def _database (self, line):
242 line = line.strip() 243 ls = line.split() 244 self.data._database(ls[1]) 245 al = "" 246 if ls[2] == '(nucleotide)': 247 al = IUPAC.unambiguous_dna 248 self.data._alphabet(al) 249 else: 250 al = IUPAC.protein 251 self.data._alphabet(al)
252
253 - def _add_motif (self, line):
254 line = line.strip() 255 ls = line.split() 256 m = Motif.MEMEMotif() 257 m._alphabet(self.data.alphabet) 258 m._length(ls[1]) 259 name = ls[0] 260 m._name(name) 261 m._consensus(ls[2]) 262 self.data._add_motif(m)
263
264 - def _add_match_diagram (self, line):
265 line = line.strip() 266 ls = line.split() 267 self.data._add_diagram_for_sequence(ls[1], self._current_seq) 268 ds = ls[1].split('_') 269 i = 0 270 start = 0 271 for i in range(0,len(ds)): 272 if ds[i].find('[') != -1 or ds[i].find('<') != -1: 273 inst = Motif.Instance() 274 inst._seqname (self._current_seq) 275 inst._start (start) 276 r = re.compile('\d+') 277 mn = r.findall(ds[i])[0] 278 if ds[i].find('-') != -1: 279 inst.strand = '-' 280 else: 281 inst.strand = '+' 282 motif = self.data.get_motif_by_name(mn) 283 motif.add_instance(inst) 284 start += motif.length 285 else: 286 start += int(ds[i])
287
288 - def _add_sequence_match_with_diagram (self, line):
289 line = line.strip() 290 ls = line.split() 291 self.data._add_sequence(ls[0]) 292 self.data._add_diagram_for_sequence(ls[2],ls[0]) 293 ds = ls[2].split('_') 294 i = 0 295 start = 0 296 for i in range(0,len(ds)): 297 if ds[i].find('+') != -1 or ds[i].find('-') != -1: 298 inst = Motif.Instance() 299 inst._seqname (ls[0]) 300 inst._start (start) 301 r = re.compile('\d+') 302 mn = r.findall(ds[i])[0] 303 if ds[i].find('-') != -1: 304 inst.strand = '-' 305 else: 306 inst.strand = '+' 307 motif = self.data.get_motif_by_name(mn) 308 motif.add_instance(inst) 309 start += motif.length 310 else: 311 start += int(ds[i])
312
313 - def _add_diagram_from_buffer (self, dummy):
314 line = "" 315 for l in self._line_buffer: 316 line += l.strip() 317 ls = line.split() 318 self.data._add_diagram_for_sequence(ls[1], self._current_seq) 319 ds = ls[1].split('_') 320 i = 0 321 start = 0 322 for i in range(0,len(ds)): 323 if ds[i].find('[') != -1 or ds[i].find('<') != -1: 324 inst = Motif.Instance() 325 inst._seqname (self._current_seq) 326 inst._start (start) 327 r = re.compile('\d+') 328 mn = r.findall(ds[i])[0] 329 if ds[i].find('-') != -1: 330 inst.strand = '-' 331 else: 332 inst.strand = '+' 333 motif = self.data.get_motif_by_name(mn) 334 motif.add_instance(inst) 335 start += motif.length 336 else: 337 start += int(ds[i])
338
339 - def _set_current_seq (self, line):
340 line = line.strip() 341 self._current_seq = line 342 if not self.data.sequences.count(line): 343 self.data.sequences.append(line)
344
345 - def _add_line_to_buffer (self, line):
346 line = line.strip() 347 if not line.startswith('*****'): 348 self._line_buffer.append(line) 349 else: 350 return -1
351
352 - def _parse_buffer (self, dummy):
353 """Parses the line buffer to get e-values for each instance of a motif. 354 This buffer parser is the most likely point of failure for the 355 MASTParser. 356 """ 357 insts = self.data.get_motif_matches_for_sequence(self._current_seq) 358 if len(insts) > 0: 359 360 fullSeq = self._line_buffer[self._buffer_size-1] 361 pvals = self._line_buffer[1].split() 362 p = 0 363 lpval = len(pvals) 364 while p < lpval: 365 if pvals[p].count('e') > 1: 366 #Break blocks up by e and parse into valid floats. This only 367 #works if there are no e-values greater than 1e-5. 368 pvs = [] 369 spe = pvals[p].split('e') 370 spe.reverse() 371 dotind = spe[1].find('.') 372 if dotind == -1: 373 thispval = spe[1][-1] + 'e' + spe[0] 374 else: 375 thispval = spe[1][dotind-1:] + 'e' + spe[0] 376 pvs.append(thispval) 377 for spi in range(2,len(spe)): 378 dotind = spe[spi].find('.') 379 prevdotind = spe[spi-1].find('.') 380 if dotind != -1: 381 if prevdotind == -1: 382 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][:-1] 383 else: 384 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][0:prevdotind-1] 385 else: 386 if prevdotind == -1: 387 thispval = spe[spi][-1] + 'e' + spe[spi-1][:-1] 388 else: 389 thispval = spe[spi][-1] + 'e' + spe[spi-1][0:prevdotind-1] 390 pvs.append(thispval) 391 pvs.reverse() 392 if p > 0: 393 pvals = pvals[0:p] + pvs + pvals[p+1:] 394 else: 395 pvals = pvs + pvals[p+1:] 396 lpval = len(pvals) 397 p += 1 398 i = 0 399 if len(pvals) != len(insts): 400 sys.stderr.write("Failure to parse p-values for " + self._current_seq + ": " + self._line_buffer[1] + " to: " + str(pvals) + "\n") 401 pvals = [] 402 # else: 403 # sys.stderr.write('These are just fine' + self._current_seq + ': ' + self._line_buffer[1] + " to: " + str(pvals) + "\n") 404 for i in range(0,len(insts)): 405 inst = insts[i] 406 start = inst.start - self._buffered_seq_start + 1 407 thisSeq = fullSeq[start:start+inst.length] 408 thisSeq = Seq.Seq(thisSeq, self.data.alphabet) 409 inst._sequence(thisSeq) 410 if pvals: 411 inst._pvalue(float(pvals[i]))
412
413 - def _blank_buffer (self, dummy):
414 self._line_buffer = [] 415 self._buffer_size = 0
416
417 - def _collapse_buffer(self, dummy):
418 if self._buffer_size == 0: 419 if len(self._line_buffer) > 0: 420 self._buffer_size = len(self._line_buffer) 421 ll = self._line_buffer[self._buffer_size-1].split() 422 self._line_buffer[self._buffer_size-1] = ll[1] 423 self._buffered_seq_start = int(ll[0]) 424 else: 425 i = 0 426 for i in range(self._buffer_size, len(self._line_buffer)-1): 427 self._line_buffer[i-self._buffer_size] = self._line_buffer[i-self._buffer_size] + self._line_buffer[i].strip() 428 ll = self._line_buffer[len(self._line_buffer)-1].split() 429 if int(ll[0]) == self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]): 430 self._line_buffer[self._buffer_size-1] += ll[1] 431 else: 432 differ = int(ll[0]) - (self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1])) 433 self._line_buffer[self._buffer_size-1] += "N"*differ 434 self._line_buffer[self._buffer_size-1] += ll[1] 435 self._line_buffer = self._line_buffer[0:self._buffer_size]
436
437 - def _add_motif_match (self, line):
438 line = line.strip() 439 if line.find('[') != -1 or line.find('<') != -1: 440 pass 441 elif line.find('e') != -1: 442 pass 443 elif line.find('+') != -1: 444 pass
445
446 - def noevent (self, line):
447 pass
448 449 450
451 -class MASTParser(AbstractParser):
452 """ 453 Parser for MAST text output. HTML output cannot be parsed, yet. Returns a MASTRecord 454 455 A MASTParser takes a file handle for a MAST text output file and 456 returns a MASTRecord, containing the hits between motifs and 457 sequences. The parser does some unusual line buffering to parse out 458 match diagrams. Really complex diagrams often lead to an error message 459 and p-values not being parsed for a given line. 460 461 Methods: 462 parse (handle): parses the data from the file handle passed to it. 463 464 Example: 465 466 f = open("mast_file.txt") 467 parser = MASTParser() 468 mast_record = parser.parse(f) 469 for motif in mast_record.motifs: 470 for instance in motif.instances: 471 print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue 472 """
473 - def __init__ (self):
474 self._consumer = _MASTConsumer() 475 self._scanner = _MASTScanner()
476
477 - def parse (self, handle):
478 self._scanner.feed(handle, self._consumer) 479 return self._consumer.data
480 481 482
483 -class _MASTScanner:
484 """ 485 Scanner for MAST text output. 486 487 """
488 - def feed (self, handle, consumer):
489 if isinstance(handle, File.UndoHandle): 490 uhandle = handle 491 else: 492 uhandle = File.UndoHandle(handle) 493 494 self._scan_header(uhandle, consumer) 495 self._scan_matches(uhandle, consumer) 496 self._scan_annotated_matches(uhandle, consumer)
497
498 - def _scan_header (self, uhandle, consumer):
499 try: 500 read_and_call_until(uhandle, consumer.noevent, contains = "MAST version") 501 except ValueError: 502 raise ValueError("Improper input file. Does not begin with a line with 'MAST version'") 503 read_and_call(uhandle, consumer._version, contains = 'MAST version') 504 read_and_call_until(uhandle, consumer.noevent, start = 'DATABASE AND MOTIFS') 505 read_and_call(uhandle, consumer.noevent, start = 'DATABASE') 506 read_and_call(uhandle, consumer.noevent, start = '****') 507 read_and_call(uhandle, consumer._database, contains = 'DATABASE') 508 read_and_call_until(uhandle, consumer.noevent, contains = 'MOTIF WIDTH') 509 read_and_call(uhandle, consumer.noevent, contains = 'MOTIF') 510 read_and_call(uhandle, consumer.noevent, contains = '----') 511 read_and_call_until(uhandle, consumer._add_motif, blank = 1) 512 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION II:')
513
514 - def _scan_matches (self, uhandle, consumer):
515 read_and_call_until(uhandle, consumer.noevent, start = 'SEQUENCE NAME') 516 read_and_call(uhandle, consumer.noevent, start = 'SEQUENCE NAME') 517 read_and_call(uhandle, consumer.noevent, start = '---') 518 # read_and_call_until(uhandle, consumer._add_sequence_match_with_diagram, blank = 1) 519 read_and_call_until(uhandle, consumer.noevent, blank = 1) 520 read_and_call(uhandle, consumer.noevent, blank = 1)
521
522 - def _scan_annotated_matches (self, uhandle, consumer):
523 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION III:') 524 read_and_call(uhandle, consumer.noevent, start = 'SECTION III:') 525 read_and_call_until(uhandle, consumer.noevent, start = '****') 526 read_and_call(uhandle, consumer.noevent, start = '****') 527 read_and_call_until(uhandle, consumer.noevent, start = '*****') 528 read_and_call(uhandle, consumer.noevent) 529 read_and_call_while(uhandle, consumer.noevent, blank = 1) 530 readMatches = 1 531 while readMatches == 1: 532 if consumer._current_seq: 533 if consumer._buffer_size != 0: 534 consumer._parse_buffer(None) 535 consumer._blank_buffer(None) 536 read_and_call(uhandle, consumer._set_current_seq) 537 read_and_call_until(uhandle, consumer.noevent, start = ' DIAGRAM') 538 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1) 539 consumer._add_diagram_from_buffer(None) 540 consumer._blank_buffer(None) 541 read_and_call(uhandle, consumer.noevent, blank = 1) 542 while 1: 543 line = safe_peekline(uhandle) 544 if line.startswith('****'): 545 consumer._parse_buffer(None) 546 readMatches = 0 547 break 548 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1) 549 read_and_call(uhandle, consumer.noevent, blank = 1) 550 consumer._collapse_buffer(None) 551 if attempt_read_and_call(uhandle, consumer.noevent, blank = 1): 552 break 553 elif attempt_read_and_call(uhandle, consumer.noevent, start = '*****'): 554 consumer._parse_buffer(None) 555 consumer._blank_buffer(None) 556 readMatches = 0 557 break
558 559 560
561 -class MASTRecord:
562 """The class for holding the results from a MAST run. 563 564 A MASTRecord holds data about matches between motifs and sequences. 565 The motifs held by the MASTRecord are objects of the class MEMEMotif. 566 567 Methods: 568 get_motif_matches_for_sequence(sequence_name): returns all of the 569 motif matches within a given sequence. The matches are objects of 570 the class MEME.Motif.Instance 571 get_motif_matches (motif_name): returns all of the matches for a motif 572 in the sequences searched. The matches returned are of class 573 MEME.Motif.Instance 574 get_motif_by_name (motif_name): returns a MEMEMotif with the given 575 name. 576 """
577 - def __init__ (self):
578 self.sequences = [] 579 self.version = "" 580 self.matches = [] 581 self.database = "" 582 self.diagrams = {} 583 self.alphabet = None 584 self.motifs = []
585
586 - def _version (self, version):
587 self.version = version
588
589 - def _alphabet (self, alphabet):
590 if alphabet == IUPAC.protein or alphabet == IUPAC.ambiguous_dna or alphabet == IUPAC.unambiguous_dna: 591 self.alphabet = alphabet 592 else: 593 return -1
594
595 - def _database(self, database):
596 self.database = database
597
598 - def get_motif_matches_for_sequence (self, seq):
599 insts = [] 600 for m in self.motifs: 601 for i in m.instances: 602 if i.sequence_name == seq: 603 insts.append(i) 604 insts.sort(lambda x,y: cmp(x.start, y.start)) 605 return insts
606
607 - def get_motif_matches (self, motif):
608 m = self.get_motif_by_name (motif.name) 609 return m.instances
610
611 - def _add_diagram_for_sequence (self, diagram, seq):
612 self.diagrams[seq] = diagram
613
614 - def _add_match (self, match):
615 self.matches.append(match)
616
617 - def _add_sequence (self, sequence):
619
620 - def _add_motif (self, motif):
621 self.motifs.append(motif)
622
623 - def get_motif_by_name (self, name):
624 for m in self.motifs: 625 if m.name == name: 626 return m
627