Package Bio :: Package Motif :: Package Parsers :: Module MEME
[hide private]
[frames] | no frames]

Source Code for Module Bio.Motif.Parsers.MEME

  1  # Copyright 2008 by Bartek Wilczynski 
  2  # Adapted from  Bio.MEME.Parser by Jason A. Hackney.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  from Bio.Alphabet import IUPAC 
  8  from Bio import Seq 
  9  import re 
 10  from math import sqrt 
 11  import sys 
 12  from Bio.Motif import Motif 
 13   
 14   
 15   
16 -def read(handle):
17 """Parses the text output of the MEME program into MEME.Record object. 18 19 Example: 20 21 >>> f = open("meme.output.txt") 22 >>> from Bio.Motif.Parsers import MEME 23 >>> record = MEME.read(f) 24 >>> for motif in record.motifs: 25 ... for instance in motif.instances: 26 ... print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue 27 28 """ 29 record = MEMERecord() 30 __read_version(record, handle) 31 __read_datafile(record, handle) 32 __read_alphabet(record, handle) 33 __read_sequence_names(record, handle) 34 __read_command(record, handle) 35 for line in handle: 36 if line.startswith('MOTIF 1'): 37 break 38 else: 39 raise ValueError('Unexpected end of stream') 40 while True: 41 motif = __create_motif(line) 42 motif.alphabet = record.alphabet 43 record.motifs.append(motif) 44 __read_motif_name(motif, handle) 45 __read_motif_sequences(motif, handle, 'revcomp' in record.command) 46 __skip_unused_lines(handle) 47 try: 48 line = handle.next() 49 except StopIteration: 50 raise ValueError('Unexpected end of stream: Expected to find new motif, or the summary of motifs') 51 if line.startswith("SUMMARY OF MOTIFS"): 52 break 53 if not line.startswith('MOTIF'): 54 raise ValueError("Line does not start with 'MOTIF':\n%s" % line) 55 return record
56 57
58 -class MEMEMotif (Motif):
59 """A subclass of Motif used in parsing MEME (and MAST) output. 60 61 This sublcass defines functions and data specific to MEME motifs. 62 This includes the evalue for a motif and the PSSM of the motif. 63 64 Methods: 65 add_instance_from_values (name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = +): create a new instance of the motif with the specified values. 66 add_to_pssm (position): add a new position to the pssm. The position should be a list of nucleotide/amino acid frequencies 67 add_to_logodds (position): add a new position to the log odds matrix. The position should be a tuple of log odds values for the nucleotide/amino acid at that position. 68 compare_motifs (other_motif): returns the maximum correlation between this motif and other_motif 69 """
70 - def __init__ (self):
71 Motif.__init__(self) 72 self.evalue = 0.0
73
74 - def _numoccurrences (self, number):
75 if type(number) == int: 76 self.num_occurrences = number 77 else: 78 number = int(number) 79 self.num_occurrences = number
80
81 - def get_instance_by_name (self,name):
82 for i in self.instances: 83 if i.sequence_name == name: 84 return i 85 return None
86
87 - def add_instance_from_values (self, name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = '+'):
88 inst = MEMEInstance(sequence,self.alphabet) 89 inst._pvalue(pvalue) 90 inst._seqname(name) 91 inst._start(start) 92 inst._strand(strand) 93 if self.length: 94 inst._length(self.length) 95 if self.name: 96 inst._motifname(self.name) 97 self.add_instance(inst)
98
99 - def _evalue (self, evalue):
100 if type(evalue) == float: 101 self.evalue = evalue 102 else: 103 evalue = float(evalue) 104 self.evalue = evalue
105 106
107 -class MEMEInstance(Seq.Seq):
108 """A class describing the instances of a MEME motif, and the data thereof. 109 """
110 - def __init__ (self,*args,**kwds):
111 Seq.Seq.__init__(self,*args,**kwds) 112 self.sequence_name = "" 113 self.start = 0 114 self.pvalue = 1.0 115 self.strand = 0 116 self.length = 0 117 self.motif_name = ""
118 119
120 - def _seqname (self, name):
121 self.sequence_name = name
122
123 - def _motifname (self, name):
124 self.motif_name = name
125
126 - def _start (self,start):
127 start = int(start) 128 self.start = start
129
130 - def _pvalue (self,pval):
131 pval = float(pval) 132 self.pvalue = pval
133
134 - def _score (self, score):
135 score = float(score) 136 self.score = score
137
138 - def _strand (self, strand):
139 self.strand = strand
140
141 - def _length (self, length):
142 self.length = length
143 144
145 -class MEMERecord:
146 """A class for holding the results of a MEME run. 147 148 A MEMERecord is an object that holds the results from running 149 MEME. It implements no methods of its own. 150 151 """
152 - def __init__ (self):
153 """__init__ (self)""" 154 self.motifs = [] 155 self.version = "" 156 self.datafile = "" 157 self.command = "" 158 self.alphabet = None 159 self.sequence_names = []
160
161 - def get_motif_by_name (self, name):
162 for m in self.motifs: 163 if m.name == name: 164 return m
165 166 167 # Everything below is private 168 169
170 -def __read_version(record, handle):
171 for line in handle: 172 if line.startswith('MEME version'): 173 break 174 else: 175 raise ValueError("Improper input file. File should contain a line starting MEME version.") 176 line = line.strip() 177 ls = line.split() 178 record.version = ls[2]
179 180
181 -def __read_datafile(record, handle):
182 for line in handle: 183 if line.startswith('TRAINING SET'): 184 break 185 else: 186 raise ValueError("Unexpected end of stream: 'TRAINING SET' not found.") 187 try: 188 line = handle.next() 189 except StopIteration: 190 raise ValueError("Unexpected end of stream: Expected to find line starting with '****'") 191 if not line.startswith('****'): 192 raise ValueError("Line does not start with '****':\n%s" % line) 193 try: 194 line = handle.next() 195 except StopIteration: 196 raise ValueError("Unexpected end of stream: Expected to find line starting with 'DATAFILE'") 197 if not line.startswith('DATAFILE'): 198 raise ValueError("Line does not start with 'DATAFILE':\n%s" % line) 199 line = line.strip() 200 line = line.replace('DATAFILE= ','') 201 record.datafile = line
202 203
204 -def __read_alphabet(record, handle):
205 try: 206 line = handle.next() 207 except StopIteration: 208 raise ValueError("Unexpected end of stream: Expected to find line starting with 'ALPHABET'") 209 if not line.startswith('ALPHABET'): 210 raise ValueError("Line does not start with 'ALPHABET':\n%s" % line) 211 line = line.strip() 212 line = line.replace('ALPHABET= ','') 213 if line == 'ACGT': 214 al = IUPAC.unambiguous_dna 215 else: 216 al = IUPAC.protein 217 record.alphabet = al
218 219
220 -def __read_sequence_names(record, handle):
221 try: 222 line = handle.next() 223 except StopIteration: 224 raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'") 225 if not line.startswith('Sequence name'): 226 raise ValueError("Line does not start with 'Sequence name':\n%s" % line) 227 try: 228 line = handle.next() 229 except StopIteration: 230 raise ValueError("Unexpected end of stream: Expected to find line starting with '----'") 231 if not line.startswith('----'): 232 raise ValueError("Line does not start with '----':\n%s" % line) 233 for line in handle: 234 if line.startswith('***'): 235 break 236 line = line.strip() 237 ls = line.split() 238 record.sequence_names.append(ls[0]) 239 if len(ls) == 6: 240 record.sequence_names.append(ls[3]) 241 else: 242 raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
243 244
245 -def __read_command(record, handle):
246 for line in handle: 247 if line.startswith('command:'): 248 break 249 else: 250 raise ValueError("Unexpected end of stream: Expected to find line starting with 'command'") 251 line = line.strip() 252 line = line.replace('command: ','') 253 record.command = line
254 255
256 -def __create_motif(line):
257 line = line.strip() 258 ls = line.split() 259 motif = MEMEMotif() 260 motif.length = int(ls[4]) 261 motif._numoccurrences(ls[7]) 262 motif._evalue(ls[13]) 263 return motif
264 265
266 -def __read_motif_name(motif, handle):
267 for line in handle: 268 if 'sorted by position p-value' in line: 269 break 270 else: 271 raise ValueError('Unexpected end of stream: Failed to find motif name') 272 line = line.strip() 273 ls = line.split() 274 name = " ".join(ls[0:2]) 275 motif.name=name
276 277
278 -def __read_motif_sequences(motif, handle, rv):
279 try: 280 line = handle.next() 281 except StopIteration: 282 raise ValueError('Unexpected end of stream: Failed to find motif sequences') 283 if not line.startswith('---'): 284 raise ValueError("Line does not start with '---':\n%s" % line) 285 try: 286 line = handle.next() 287 except StopIteration: 288 raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'") 289 if not line.startswith('Sequence name'): 290 raise ValueError("Line does not start with 'Sequence name':\n%s" % line) 291 try: 292 line = handle.next() 293 except StopIteration: 294 raise ValueError('Unexpected end of stream: Failed to find motif sequences') 295 if not line.startswith('---'): 296 raise ValueError("Line does not start with '---':\n%s" % line) 297 for line in handle: 298 if line.startswith('---'): 299 break 300 line = line.strip() 301 ls = line.split() 302 if rv: 303 #seq = Seq.Seq(ls[5], record.alphabet) 304 motif.add_instance_from_values(name = ls[0], sequence = ls[5], start = ls[2], pvalue = ls[3], strand = ls[1]) 305 else: 306 #seq = Seq.Seq(ls[4], record.alphabet) 307 motif.add_instance_from_values(name = ls[0], sequence = ls[4], start = ls[1], pvalue = ls[2]) 308 else: 309 raise ValueError('Unexpected end of stream')
310 311
312 -def __skip_unused_lines(handle):
313 for line in handle: 314 if line.startswith('log-odds matrix'): 315 break 316 else: 317 raise ValueError("Unexpected end of stream: Expected to find line starting with 'log-odds matrix'") 318 for line in handle: 319 if line.startswith('---'): 320 break 321 else: 322 raise ValueError("Unexpected end of stream: Expected to find line starting with '---'") 323 for line in handle: 324 if line.startswith('letter-probability matrix'): 325 break 326 else: 327 raise ValueError("Unexpected end of stream: Expected to find line starting with 'letter-probability matrix'") 328 for line in handle: 329 if line.startswith('---'): 330 break 331 else: 332 raise ValueError("Unexpected end of stream: Expected to find line starting with '---'") 333 for line in handle: 334 if line.startswith('Time'): 335 break 336 else: 337 raise ValueError("Unexpected end of stream: Expected to find line starting with 'Time'") 338 try: 339 line = handle.next() 340 except StopIteration: 341 raise ValueError('Unexpected end of stream: Expected to find blank line') 342 if line.strip(): 343 raise ValueError("Expected blank line, but got:\n%s" % line) 344 try: 345 line = handle.next() 346 except StopIteration: 347 raise ValueError("Unexpected end of stream: Expected to find line starting with '***'") 348 if not line.startswith('***'): 349 raise ValueError("Line does not start with '***':\n%s" % line) 350 for line in handle: 351 if line.strip(): 352 break 353 else: 354 raise ValueError("Unexpected end of stream: Expected to find line starting with '***'") 355 if not line.startswith('***'): 356 raise ValueError("Line does not start with '***':\n%s" % line)
357 358 359 # Everything below is obsolete. 360 361 362 from Bio import File 363 from Bio.ParserSupport import * 364 365
366 -class MEMEParser (AbstractParser):
367 """A parser for the text output of the MEME program. 368 Parses the output into an object of the MEMERecord class. 369 370 Methods: 371 parse (handle): parses the contents of the file handle passed to it. 372 373 Example: 374 375 >>>f = open("meme.output.txt") 376 >>>parser = MEMEParser() 377 >>>meme_record = parser.parse(f) 378 >>>for motif in meme_record.motifs: 379 ... for instance in motif.instances: 380 ... print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue 381 382 """
383 - def __init__ (self):
384 """__init__ (self)""" 385 self._scanner = _MEMEScanner() 386 self._consumer = _MEMEConsumer()
387
388 - def parse (self, handle):
389 """parse (self, handle)""" 390 self._scanner.feed(handle, self._consumer) 391 return self._consumer.data
392 393 394
395 -class _MEMEScanner:
396 """Scanner for MEME output. 397 398 Methods: 399 feed 400 401 """ 402
403 - def feed (self, handle, consumer):
404 """ 405 Feeds in MEME output for scanning. handle should 406 implement the readline method. consumer is 407 a Consumer object that can receive the salient events. 408 """ 409 if isinstance(handle, File.UndoHandle): 410 uhandle = handle 411 else: 412 uhandle = File.UndoHandle(handle) 413 414 self._scan_header(uhandle, consumer) 415 self._scan_motifs (uhandle, consumer)
416
417 - def _scan_header(self, uhandle, consumer):
418 try: 419 read_and_call_until(uhandle, consumer.noevent, contains = 'MEME version') 420 except ValueError: 421 raise ValueError("Improper input file. File should contain a line starting MEME version.") 422 read_and_call(uhandle, consumer._version, start = 'MEME version') 423 read_and_call_until(uhandle, consumer.noevent, start = 'TRAINING SET') 424 read_and_call(uhandle, consumer.noevent, start = 'TRAINING SET') 425 read_and_call(uhandle, consumer.noevent, start = '****') 426 read_and_call(uhandle, consumer._datafile, start = 'DATAFILE') 427 read_and_call(uhandle, consumer._alphabet, start = 'ALPHABET') 428 read_and_call(uhandle, consumer.noevent, start = 'Sequence name') 429 read_and_call(uhandle, consumer.noevent, start = '----') 430 read_and_call_until(uhandle, consumer._sequence_name, start = '***') 431 read_and_call_until(uhandle, consumer.noevent, start = 'command:') 432 read_and_call(uhandle, consumer._commandline, start = 'command:') 433 read_and_call_until(uhandle, consumer.noevent, start = 'MOTIF 1')
434
435 - def _scan_motifs(self, uhandle, consumer):
436 while 1: 437 read_and_call(uhandle, consumer._add_motif_with_info, start = 'MOTIF') 438 read_and_call_until(uhandle, consumer.noevent, contains = 'sorted by position p-value') 439 read_and_call(uhandle, consumer.motif_name, contains = 'sorted by position p-value') 440 read_and_call(uhandle, consumer.noevent, start = '---') 441 read_and_call(uhandle, consumer.noevent, start = 'Sequence name') 442 read_and_call(uhandle, consumer.noevent, start = '---') 443 read_and_call_until(uhandle, consumer.add_instance, start = '---') 444 read_and_call_until(uhandle, consumer.noevent, start = 'log-odds matrix') 445 read_and_call(uhandle, consumer.noevent) 446 read_and_call_until(uhandle, consumer.add_to_logodds, start = '---') 447 read_and_call_until(uhandle, consumer.noevent, start = 'letter-probability matrix') 448 read_and_call(uhandle, consumer.noevent, start = 'letter-probability matrix') 449 read_and_call_until(uhandle, consumer.add_to_pssm, start = '---') 450 read_and_call_until(uhandle, consumer.noevent, start = 'Time') 451 read_and_call(uhandle, consumer.noevent, start = 'Time') 452 read_and_call(uhandle, consumer.noevent, blank = 1) 453 read_and_call(uhandle, consumer.noevent, start = '***') 454 read_and_call_while(uhandle, consumer.noevent, blank = 1) 455 read_and_call(uhandle, consumer.noevent, start = '***') 456 line = safe_peekline(uhandle) 457 if line.startswith("SUMMARY OF MOTIFS"): 458 break
459 460 461
462 -class _MEMEConsumer:
463 """ 464 Consumer that can receive events from MEME Scanner. 465 466 This is the Consumer object that should be passed to the 467 MEME Scanner. 468 """ 469
470 - def __init__ (self):
471 self.current_motif = None 472 self.sequence_names = [] 473 self.data = MEMERecord()
474
475 - def _version (self, line):
476 line = line.strip() 477 ls = line.split() 478 self.data.version = ls[2]
479
480 - def _datafile (self, line):
481 line = line.strip() 482 line = line.replace('DATAFILE= ','') 483 self.data.datafile = line
484
485 - def _alphabet (self, line):
486 line = line.strip() 487 line = line.replace('ALPHABET= ','') 488 if line == 'ACGT': 489 al = IUPAC.unambiguous_dna 490 else: 491 al = IUPAC.protein 492 self.data.alphabet = al
493
494 - def _sequence_name (self, line):
495 line = line.strip() 496 ls = line.split() 497 self.data.sequence_names.append(ls[0]) 498 if len(ls) == 6: 499 self.data.sequence_names.append(ls[3])
500
501 - def _commandline (self, line):
502 line = line.strip() 503 line = line.replace('command: ','') 504 self.data.command = line
505
506 - def _add_motif_with_info (self, line):
507 line = line.strip() 508 ls = line.split() 509 motif = MEMEMotif() 510 #motif.length=ls[4] 511 motif._numoccurrences(ls[7]) 512 motif._evalue(ls[13]) 513 motif.alphabet=self.data.alphabet 514 self.data.motifs.append(motif) 515 self.current_motif = motif
516
517 - def motif_name (self, line):
518 line = line.strip() 519 ls = line.split() 520 name = " ".join(ls[0:2]) 521 self.current_motif.name=name
522
523 - def add_instance (self, line):
524 line = line.strip() 525 ls = line.split() 526 if self.data.command.find('revcomp') != -1: 527 #seq = Seq.Seq(ls[5], self.data.alphabet) 528 self.current_motif.add_instance_from_values(name = ls[0], sequence = ls[5], start = ls[2], pvalue = ls[3], strand = ls[1]) 529 else: 530 #seq = Seq.Seq(ls[4], self.data.alphabet) 531 self.current_motif.add_instance_from_values(name = ls[0], sequence = ls[4], start = ls[1], pvalue = ls[2])
532
533 - def add_to_pssm (self, line):
534 pass
535
536 - def add_to_logodds (self, line):
537 pass
538
539 - def noevent (self,line):
540 pass
541 542 543
544 -class _MASTConsumer:
545 """ 546 Consumer that can receive events from _MASTScanner. 547 548 A _MASTConsumer parses lines from a mast text output file. 549 The motif match diagrams are parsed using line buffering. 550 Each of the buffering functions have a dummy variable that is 551 required for testing using the Bio.ParserSupport.TaggingConsumer. 552 If this variable isn't there, the TaggingConsumer barfs. In 553 the _MASTScanner, None is passed in the place of this variable. 554 """
555 - def __init__ (self):
556 self.data = MASTRecord() 557 self._current_seq = "" 558 self._line_buffer = [] 559 self._buffer_size = 0 560 self._buffered_seq_start = 0
561
562 - def _version (self, line):
563 line = line.strip() 564 ls = line.split() 565 self.data._version(ls[2])
566
567 - def _database (self, line):
568 line = line.strip() 569 ls = line.split() 570 self.data._database(ls[1]) 571 al = "" 572 if ls[2] == '(nucleotide)': 573 al = IUPAC.unambiguous_dna 574 self.data._alphabet(al) 575 else: 576 al = IUPAC.protein 577 self.data._alphabet(al)
578
579 - def _add_motif (self, line):
580 line = line.strip() 581 ls = line.split() 582 m = MEMEMotif() 583 m.alphabet=self.data.alphabet 584 m.length=ls[1] 585 name = ls[0] 586 m.name=name 587 m.add_instance(ls[2]) 588 self.data._add_motif(m)
589
590 - def _add_match_diagram (self, line):
591 line = line.strip() 592 ls = line.split() 593 self.data._add_diagram_for_sequence(ls[1], self._current_seq) 594 ds = ls[1].split('_') 595 i = 0 596 start = 0 597 for i in range(0,len(ds)): 598 if ds[i].find('[') != -1 or ds[i].find('<') != -1: 599 inst = MEMEInstance() 600 inst._seqname (self._current_seq) 601 inst._start (start) 602 r = re.compile('\d+') 603 mn = r.findall(ds[i])[0] 604 if ds[i].find('-') != -1: 605 inst.strand = '-' 606 else: 607 inst.strand = '+' 608 motif = self.data.get_motif_by_name(mn) 609 motif.add_instance(inst) 610 start += motif.length 611 else: 612 start += int(ds[i])
613
614 - def _add_sequence_match_with_diagram (self, line):
615 line = line.strip() 616 ls = line.split() 617 self.data._add_sequence(ls[0]) 618 self.data._add_diagram_for_sequence(ls[2],ls[0]) 619 ds = ls[2].split('_') 620 i = 0 621 start = 0 622 for i in range(0,len(ds)): 623 if ds[i].find('+') != -1 or ds[i].find('-') != -1: 624 inst = MEMEInstance() 625 inst._seqname (ls[0]) 626 inst._start (start) 627 r = re.compile('\d+') 628 mn = r.findall(ds[i])[0] 629 if ds[i].find('-') != -1: 630 inst.strand = '-' 631 else: 632 inst.strand = '+' 633 motif = self.data.get_motif_by_name(mn) 634 motif.add_instance(inst) 635 start += motif.length 636 else: 637 start += int(ds[i])
638
639 - def _add_diagram_from_buffer (self, dummy):
640 line = "" 641 for l in self._line_buffer: 642 line += l.strip() 643 ls = line.split() 644 self.data._add_diagram_for_sequence(ls[1], self._current_seq) 645 ds = ls[1].split('_') 646 i = 0 647 start = 0 648 for i in range(0,len(ds)): 649 if ds[i].find('[') != -1 or ds[i].find('<') != -1: 650 inst = MEMEInstance() 651 inst._seqname (self._current_seq) 652 inst._start (start) 653 r = re.compile('\d+') 654 mn = r.findall(ds[i])[0] 655 if ds[i].find('-') != -1: 656 inst.strand = '-' 657 else: 658 inst.strand = '+' 659 motif = self.data.get_motif_by_name(mn) 660 motif.add_instance(inst) 661 start += motif.length 662 else: 663 start += int(ds[i])
664
665 - def _set_current_seq (self, line):
666 line = line.strip() 667 self._current_seq = line 668 if not self.data.sequences.count(line): 669 self.data.sequences.append(line)
670
671 - def _add_line_to_buffer (self, line):
672 line = line.strip() 673 if not line.startswith('*****'): 674 self._line_buffer.append(line) 675 else: 676 return -1
677
678 - def _parse_buffer (self, dummy):
679 """Parses the line buffer to get e-values for each instance of a motif. 680 This buffer parser is the most likely point of failure for the 681 MASTParser. 682 """ 683 insts = self.data.get_motif_matches_for_sequence(self._current_seq) 684 if len(insts) > 0: 685 686 fullSeq = self._line_buffer[self._buffer_size-1] 687 pvals = self._line_buffer[1].split() 688 p = 0 689 lpval = len(pvals) 690 while p < lpval: 691 if pvals[p].count('e') > 1: 692 #Break blocks up by e and parse into valid floats. This only 693 #works if there are no e-values greater than 1e-5. 694 pvs = [] 695 spe = pvals[p].split('e') 696 spe.reverse() 697 dotind = spe[1].find('.') 698 if dotind == -1: 699 thispval = spe[1][-1] + 'e' + spe[0] 700 else: 701 thispval = spe[1][dotind-1:] + 'e' + spe[0] 702 pvs.append(thispval) 703 for spi in range(2,len(spe)): 704 dotind = spe[spi].find('.') 705 prevdotind = spe[spi-1].find('.') 706 if dotind != -1: 707 if prevdotind == -1: 708 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][:-1] 709 else: 710 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][0:prevdotind-1] 711 else: 712 if prevdotind == -1: 713 thispval = spe[spi][-1] + 'e' + spe[spi-1][:-1] 714 else: 715 thispval = spe[spi][-1] + 'e' + spe[spi-1][0:prevdotind-1] 716 pvs.append(thispval) 717 pvs.reverse() 718 if p > 0: 719 pvals = pvals[0:p] + pvs + pvals[p+1:] 720 else: 721 pvals = pvs + pvals[p+1:] 722 lpval = len(pvals) 723 p += 1 724 i = 0 725 if len(pvals) != len(insts): 726 sys.stderr.write("Failure to parse p-values for " + self._current_seq + ": " + self._line_buffer[1] + " to: " + str(pvals) + "\n") 727 pvals = [] 728 # else: 729 # sys.stderr.write('These are just fine' + self._current_seq + ': ' + self._line_buffer[1] + " to: " + str(pvals) + "\n") 730 for i in range(0,len(insts)): 731 inst = insts[i] 732 start = inst.start - self._buffered_seq_start + 1 733 thisSeq = fullSeq[start:start+inst.length] 734 thisSeq = Seq.Seq(thisSeq, self.data.alphabet) 735 inst._sequence(thisSeq) 736 if pvals: 737 inst._pvalue(float(pvals[i]))
738
739 - def _blank_buffer (self, dummy):
740 self._line_buffer = [] 741 self._buffer_size = 0
742
743 - def _collapse_buffer(self, dummy):
744 if self._buffer_size == 0: 745 if len(self._line_buffer) > 0: 746 self._buffer_size = len(self._line_buffer) 747 ll = self._line_buffer[self._buffer_size-1].split() 748 self._line_buffer[self._buffer_size-1] = ll[1] 749 self._buffered_seq_start = int(ll[0]) 750 else: 751 i = 0 752 for i in range(self._buffer_size, len(self._line_buffer)-1): 753 self._line_buffer[i-self._buffer_size] = self._line_buffer[i-self._buffer_size] + self._line_buffer[i].strip() 754 ll = self._line_buffer[len(self._line_buffer)-1].split() 755 if int(ll[0]) == self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]): 756 self._line_buffer[self._buffer_size-1] += ll[1] 757 else: 758 differ = int(ll[0]) - (self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1])) 759 self._line_buffer[self._buffer_size-1] += "N"*differ 760 self._line_buffer[self._buffer_size-1] += ll[1] 761 self._line_buffer = self._line_buffer[0:self._buffer_size]
762
763 - def _add_motif_match (self, line):
764 line = line.strip() 765 if line.find('[') != -1 or line.find('<') != -1: 766 pass 767 elif line.find('e') != -1: 768 pass 769 elif line.find('+') != -1: 770 pass
771
772 - def noevent (self, line):
773 pass
774 775 776
777 -class MASTParser(AbstractParser):
778 """ 779 Parser for MAST text output. HTML output cannot be parsed, yet. Returns a MASTRecord 780 781 A MASTParser takes a file handle for a MAST text output file and 782 returns a MASTRecord, containing the hits between motifs and 783 sequences. The parser does some unusual line buffering to parse out 784 match diagrams. Really complex diagrams often lead to an error message 785 and p-values not being parsed for a given line. 786 787 Methods: 788 parse (handle): parses the data from the file handle passed to it. 789 790 Example: 791 792 >>>f = open("mast_file.txt") 793 >>>parser = MASTParser() 794 >>>mast_record = parser.parse(f) 795 >>>for motif in mast_record.motifs: 796 >>> for instance in motif.instances: 797 >>> print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue 798 """
799 - def __init__ (self):
800 self._consumer = _MASTConsumer() 801 self._scanner = _MASTScanner()
802
803 - def parse (self, handle):
804 self._scanner.feed(handle, self._consumer) 805 return self._consumer.data
806 807 808
809 -class _MASTScanner:
810 """ 811 Scanner for MAST text output. 812 813 """
814 - def feed (self, handle, consumer):
815 if isinstance(handle, File.UndoHandle): 816 uhandle = handle 817 else: 818 uhandle = File.UndoHandle(handle) 819 820 self._scan_header(uhandle, consumer) 821 self._scan_matches(uhandle, consumer) 822 self._scan_annotated_matches(uhandle, consumer)
823
824 - def _scan_header (self, uhandle, consumer):
825 try: 826 read_and_call_until(uhandle, consumer.noevent, contains = "MAST version") 827 except ValueError: 828 raise ValueError("Improper input file. Does not begin with a line with 'MAST version'") 829 read_and_call(uhandle, consumer._version, contains = 'MAST version') 830 read_and_call_until(uhandle, consumer.noevent, start = 'DATABASE AND MOTIFS') 831 read_and_call(uhandle, consumer.noevent, start = 'DATABASE') 832 read_and_call(uhandle, consumer.noevent, start = '****') 833 read_and_call(uhandle, consumer._database, contains = 'DATABASE') 834 read_and_call_until(uhandle, consumer.noevent, contains = 'MOTIF WIDTH') 835 read_and_call(uhandle, consumer.noevent, contains = 'MOTIF') 836 read_and_call(uhandle, consumer.noevent, contains = '----') 837 read_and_call_until(uhandle, consumer._add_motif, blank = 1) 838 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION II:')
839
840 - def _scan_matches (self, uhandle, consumer):
841 read_and_call_until(uhandle, consumer.noevent, start = 'SEQUENCE NAME') 842 read_and_call(uhandle, consumer.noevent, start = 'SEQUENCE NAME') 843 read_and_call(uhandle, consumer.noevent, start = '---') 844 # read_and_call_until(uhandle, consumer._add_sequence_match_with_diagram, blank = 1) 845 read_and_call_until(uhandle, consumer.noevent, blank = 1) 846 read_and_call(uhandle, consumer.noevent, blank = 1)
847
848 - def _scan_annotated_matches (self, uhandle, consumer):
849 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION III:') 850 read_and_call(uhandle, consumer.noevent, start = 'SECTION III:') 851 read_and_call_until(uhandle, consumer.noevent, start = '****') 852 read_and_call(uhandle, consumer.noevent, start = '****') 853 read_and_call_until(uhandle, consumer.noevent, start = '*****') 854 read_and_call(uhandle, consumer.noevent) 855 read_and_call_while(uhandle, consumer.noevent, blank = 1) 856 readMatches = 1 857 while readMatches == 1: 858 if consumer._current_seq: 859 if consumer._buffer_size != 0: 860 consumer._parse_buffer(None) 861 consumer._blank_buffer(None) 862 read_and_call(uhandle, consumer._set_current_seq) 863 read_and_call_until(uhandle, consumer.noevent, start = ' DIAGRAM') 864 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1) 865 consumer._add_diagram_from_buffer(None) 866 consumer._blank_buffer(None) 867 read_and_call(uhandle, consumer.noevent, blank = 1) 868 while 1: 869 line = safe_peekline(uhandle) 870 if line.startswith('****'): 871 consumer._parse_buffer(None) 872 readMatches = 0 873 break 874 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1) 875 read_and_call(uhandle, consumer.noevent, blank = 1) 876 consumer._collapse_buffer(None) 877 if attempt_read_and_call(uhandle, consumer.noevent, blank = 1): 878 break 879 elif attempt_read_and_call(uhandle, consumer.noevent, start = '*****'): 880 consumer._parse_buffer(None) 881 consumer._blank_buffer(None) 882 readMatches = 0 883 break
884 885 886
887 -class MASTRecord:
888 """The class for holding the results from a MAST run. 889 890 A MASTRecord holds data about matches between motifs and sequences. 891 The motifs held by the MASTRecord are objects of the class MEMEMotif. 892 893 Methods: 894 get_motif_matches_for_sequence(sequence_name): returns all of the 895 motif matches within a given sequence. The matches are objects of 896 the class MEMEInstance 897 get_motif_matches (motif_name): returns all of the matches for a motif 898 in the sequences searched. The matches returned are of class 899 MEMEInstance 900 get_motif_by_name (motif_name): returns a MEMEMotif with the given 901 name. 902 """
903 - def __init__ (self):
904 self.sequences = [] 905 self.version = "" 906 self.matches = [] 907 self.database = "" 908 self.diagrams = {} 909 self.alphabet = None 910 self.motifs = []
911
912 - def _version (self, version):
913 self.version = version
914
915 - def _alphabet (self, alphabet):
916 if alphabet == IUPAC.protein or alphabet == IUPAC.ambiguous_dna or alphabet == IUPAC.unambiguous_dna: 917 self.alphabet = alphabet 918 else: 919 return -1
920
921 - def _database(self, database):
922 self.database = database
923
924 - def get_motif_matches_for_sequence (self, seq):
925 insts = [] 926 for m in self.motifs: 927 for i in m.instances: 928 if i.sequence_name == seq: 929 insts.append(i) 930 insts.sort(lambda x,y: cmp(x.start, y.start)) 931 return insts
932
933 - def get_motif_matches (self, motif):
934 m = self.get_motif_by_name (motif.name) 935 return m.instances
936
937 - def _add_diagram_for_sequence (self, diagram, seq):
938 self.diagrams[seq] = diagram
939
940 - def _add_match (self, match):
941 self.matches.append(match)
942
943 - def _add_sequence (self, sequence):
945
946 - def _add_motif (self, motif):
947 self.motifs.append(motif)
948
949 - def get_motif_by_name (self, name):
950 for m in self.motifs: 951 if m.name == name: 952 return m
953