1
2
3
4
5
6
7 from Bio.Alphabet import IUPAC
8 from Bio import Seq
9 import re
10 from math import sqrt
11 import sys
12 from Bio.Motif import Motif
13
14
15
17 """Parses the text output of the MEME program into MEME.Record object.
18
19 Example:
20
21 >>> f = open("meme.output.txt")
22 >>> from Bio.Motif.Parsers import MEME
23 >>> record = MEME.read(f)
24 >>> for motif in record.motifs:
25 ... for instance in motif.instances:
26 ... print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
27
28 """
29 record = MEMERecord()
30 __read_version(record, handle)
31 __read_datafile(record, handle)
32 __read_alphabet(record, handle)
33 __read_sequence_names(record, handle)
34 __read_command(record, handle)
35 for line in handle:
36 if line.startswith('MOTIF 1'):
37 break
38 else:
39 raise ValueError('Unexpected end of stream')
40 while True:
41 motif = __create_motif(line)
42 motif.alphabet = record.alphabet
43 record.motifs.append(motif)
44 __read_motif_name(motif, handle)
45 __read_motif_sequences(motif, handle, 'revcomp' in record.command)
46 __skip_unused_lines(handle)
47 try:
48 line = handle.next()
49 except StopIteration:
50 raise ValueError('Unexpected end of stream: Expected to find new motif, or the summary of motifs')
51 if line.startswith("SUMMARY OF MOTIFS"):
52 break
53 if not line.startswith('MOTIF'):
54 raise ValueError("Line does not start with 'MOTIF':\n%s" % line)
55 return record
56
57
59 """A subclass of Motif used in parsing MEME (and MAST) output.
60
61 This sublcass defines functions and data specific to MEME motifs.
62 This includes the evalue for a motif and the PSSM of the motif.
63
64 Methods:
65 add_instance_from_values (name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = +): create a new instance of the motif with the specified values.
66 add_to_pssm (position): add a new position to the pssm. The position should be a list of nucleotide/amino acid frequencies
67 add_to_logodds (position): add a new position to the log odds matrix. The position should be a tuple of log odds values for the nucleotide/amino acid at that position.
68 compare_motifs (other_motif): returns the maximum correlation between this motif and other_motif
69 """
73
75 if type(number) == int:
76 self.num_occurrences = number
77 else:
78 number = int(number)
79 self.num_occurrences = number
80
82 for i in self.instances:
83 if i.sequence_name == name:
84 return i
85 return None
86
98
100 if type(evalue) == float:
101 self.evalue = evalue
102 else:
103 evalue = float(evalue)
104 self.evalue = evalue
105
106
108 """A class describing the instances of a MEME motif, and the data thereof.
109 """
118
119
121 self.sequence_name = name
122
125
129
131 pval = float(pval)
132 self.pvalue = pval
133
137
140
143
144
146 """A class for holding the results of a MEME run.
147
148 A MEMERecord is an object that holds the results from running
149 MEME. It implements no methods of its own.
150
151 """
153 """__init__ (self)"""
154 self.motifs = []
155 self.version = ""
156 self.datafile = ""
157 self.command = ""
158 self.alphabet = None
159 self.sequence_names = []
160
162 for m in self.motifs:
163 if m.name == name:
164 return m
165
166
167
168
169
171 for line in handle:
172 if line.startswith('MEME version'):
173 break
174 else:
175 raise ValueError("Improper input file. File should contain a line starting MEME version.")
176 line = line.strip()
177 ls = line.split()
178 record.version = ls[2]
179
180
182 for line in handle:
183 if line.startswith('TRAINING SET'):
184 break
185 else:
186 raise ValueError("Unexpected end of stream: 'TRAINING SET' not found.")
187 try:
188 line = handle.next()
189 except StopIteration:
190 raise ValueError("Unexpected end of stream: Expected to find line starting with '****'")
191 if not line.startswith('****'):
192 raise ValueError("Line does not start with '****':\n%s" % line)
193 try:
194 line = handle.next()
195 except StopIteration:
196 raise ValueError("Unexpected end of stream: Expected to find line starting with 'DATAFILE'")
197 if not line.startswith('DATAFILE'):
198 raise ValueError("Line does not start with 'DATAFILE':\n%s" % line)
199 line = line.strip()
200 line = line.replace('DATAFILE= ','')
201 record.datafile = line
202
203
205 try:
206 line = handle.next()
207 except StopIteration:
208 raise ValueError("Unexpected end of stream: Expected to find line starting with 'ALPHABET'")
209 if not line.startswith('ALPHABET'):
210 raise ValueError("Line does not start with 'ALPHABET':\n%s" % line)
211 line = line.strip()
212 line = line.replace('ALPHABET= ','')
213 if line == 'ACGT':
214 al = IUPAC.unambiguous_dna
215 else:
216 al = IUPAC.protein
217 record.alphabet = al
218
219
221 try:
222 line = handle.next()
223 except StopIteration:
224 raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'")
225 if not line.startswith('Sequence name'):
226 raise ValueError("Line does not start with 'Sequence name':\n%s" % line)
227 try:
228 line = handle.next()
229 except StopIteration:
230 raise ValueError("Unexpected end of stream: Expected to find line starting with '----'")
231 if not line.startswith('----'):
232 raise ValueError("Line does not start with '----':\n%s" % line)
233 for line in handle:
234 if line.startswith('***'):
235 break
236 line = line.strip()
237 ls = line.split()
238 record.sequence_names.append(ls[0])
239 if len(ls) == 6:
240 record.sequence_names.append(ls[3])
241 else:
242 raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
243
244
246 for line in handle:
247 if line.startswith('command:'):
248 break
249 else:
250 raise ValueError("Unexpected end of stream: Expected to find line starting with 'command'")
251 line = line.strip()
252 line = line.replace('command: ','')
253 record.command = line
254
255
264
265
267 for line in handle:
268 if 'sorted by position p-value' in line:
269 break
270 else:
271 raise ValueError('Unexpected end of stream: Failed to find motif name')
272 line = line.strip()
273 ls = line.split()
274 name = " ".join(ls[0:2])
275 motif.name=name
276
277
279 try:
280 line = handle.next()
281 except StopIteration:
282 raise ValueError('Unexpected end of stream: Failed to find motif sequences')
283 if not line.startswith('---'):
284 raise ValueError("Line does not start with '---':\n%s" % line)
285 try:
286 line = handle.next()
287 except StopIteration:
288 raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'")
289 if not line.startswith('Sequence name'):
290 raise ValueError("Line does not start with 'Sequence name':\n%s" % line)
291 try:
292 line = handle.next()
293 except StopIteration:
294 raise ValueError('Unexpected end of stream: Failed to find motif sequences')
295 if not line.startswith('---'):
296 raise ValueError("Line does not start with '---':\n%s" % line)
297 for line in handle:
298 if line.startswith('---'):
299 break
300 line = line.strip()
301 ls = line.split()
302 if rv:
303
304 motif.add_instance_from_values(name = ls[0], sequence = ls[5], start = ls[2], pvalue = ls[3], strand = ls[1])
305 else:
306
307 motif.add_instance_from_values(name = ls[0], sequence = ls[4], start = ls[1], pvalue = ls[2])
308 else:
309 raise ValueError('Unexpected end of stream')
310
311
313 for line in handle:
314 if line.startswith('log-odds matrix'):
315 break
316 else:
317 raise ValueError("Unexpected end of stream: Expected to find line starting with 'log-odds matrix'")
318 for line in handle:
319 if line.startswith('---'):
320 break
321 else:
322 raise ValueError("Unexpected end of stream: Expected to find line starting with '---'")
323 for line in handle:
324 if line.startswith('letter-probability matrix'):
325 break
326 else:
327 raise ValueError("Unexpected end of stream: Expected to find line starting with 'letter-probability matrix'")
328 for line in handle:
329 if line.startswith('---'):
330 break
331 else:
332 raise ValueError("Unexpected end of stream: Expected to find line starting with '---'")
333 for line in handle:
334 if line.startswith('Time'):
335 break
336 else:
337 raise ValueError("Unexpected end of stream: Expected to find line starting with 'Time'")
338 try:
339 line = handle.next()
340 except StopIteration:
341 raise ValueError('Unexpected end of stream: Expected to find blank line')
342 if line.strip():
343 raise ValueError("Expected blank line, but got:\n%s" % line)
344 try:
345 line = handle.next()
346 except StopIteration:
347 raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
348 if not line.startswith('***'):
349 raise ValueError("Line does not start with '***':\n%s" % line)
350 for line in handle:
351 if line.strip():
352 break
353 else:
354 raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
355 if not line.startswith('***'):
356 raise ValueError("Line does not start with '***':\n%s" % line)
357
358
359
360
361
362 from Bio import File
363 from Bio.ParserSupport import *
364
365
367 """A parser for the text output of the MEME program.
368 Parses the output into an object of the MEMERecord class.
369
370 Methods:
371 parse (handle): parses the contents of the file handle passed to it.
372
373 Example:
374
375 >>>f = open("meme.output.txt")
376 >>>parser = MEMEParser()
377 >>>meme_record = parser.parse(f)
378 >>>for motif in meme_record.motifs:
379 ... for instance in motif.instances:
380 ... print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
381
382 """
387
388 - def parse (self, handle):
389 """parse (self, handle)"""
390 self._scanner.feed(handle, self._consumer)
391 return self._consumer.data
392
393
394
396 """Scanner for MEME output.
397
398 Methods:
399 feed
400
401 """
402
403 - def feed (self, handle, consumer):
404 """
405 Feeds in MEME output for scanning. handle should
406 implement the readline method. consumer is
407 a Consumer object that can receive the salient events.
408 """
409 if isinstance(handle, File.UndoHandle):
410 uhandle = handle
411 else:
412 uhandle = File.UndoHandle(handle)
413
414 self._scan_header(uhandle, consumer)
415 self._scan_motifs (uhandle, consumer)
416
418 try:
419 read_and_call_until(uhandle, consumer.noevent, contains = 'MEME version')
420 except ValueError:
421 raise ValueError("Improper input file. File should contain a line starting MEME version.")
422 read_and_call(uhandle, consumer._version, start = 'MEME version')
423 read_and_call_until(uhandle, consumer.noevent, start = 'TRAINING SET')
424 read_and_call(uhandle, consumer.noevent, start = 'TRAINING SET')
425 read_and_call(uhandle, consumer.noevent, start = '****')
426 read_and_call(uhandle, consumer._datafile, start = 'DATAFILE')
427 read_and_call(uhandle, consumer._alphabet, start = 'ALPHABET')
428 read_and_call(uhandle, consumer.noevent, start = 'Sequence name')
429 read_and_call(uhandle, consumer.noevent, start = '----')
430 read_and_call_until(uhandle, consumer._sequence_name, start = '***')
431 read_and_call_until(uhandle, consumer.noevent, start = 'command:')
432 read_and_call(uhandle, consumer._commandline, start = 'command:')
433 read_and_call_until(uhandle, consumer.noevent, start = 'MOTIF 1')
434
436 while 1:
437 read_and_call(uhandle, consumer._add_motif_with_info, start = 'MOTIF')
438 read_and_call_until(uhandle, consumer.noevent, contains = 'sorted by position p-value')
439 read_and_call(uhandle, consumer.motif_name, contains = 'sorted by position p-value')
440 read_and_call(uhandle, consumer.noevent, start = '---')
441 read_and_call(uhandle, consumer.noevent, start = 'Sequence name')
442 read_and_call(uhandle, consumer.noevent, start = '---')
443 read_and_call_until(uhandle, consumer.add_instance, start = '---')
444 read_and_call_until(uhandle, consumer.noevent, start = 'log-odds matrix')
445 read_and_call(uhandle, consumer.noevent)
446 read_and_call_until(uhandle, consumer.add_to_logodds, start = '---')
447 read_and_call_until(uhandle, consumer.noevent, start = 'letter-probability matrix')
448 read_and_call(uhandle, consumer.noevent, start = 'letter-probability matrix')
449 read_and_call_until(uhandle, consumer.add_to_pssm, start = '---')
450 read_and_call_until(uhandle, consumer.noevent, start = 'Time')
451 read_and_call(uhandle, consumer.noevent, start = 'Time')
452 read_and_call(uhandle, consumer.noevent, blank = 1)
453 read_and_call(uhandle, consumer.noevent, start = '***')
454 read_and_call_while(uhandle, consumer.noevent, blank = 1)
455 read_and_call(uhandle, consumer.noevent, start = '***')
456 line = safe_peekline(uhandle)
457 if line.startswith("SUMMARY OF MOTIFS"):
458 break
459
460
461
463 """
464 Consumer that can receive events from MEME Scanner.
465
466 This is the Consumer object that should be passed to the
467 MEME Scanner.
468 """
469
471 self.current_motif = None
472 self.sequence_names = []
473 self.data = MEMERecord()
474
479
481 line = line.strip()
482 line = line.replace('DATAFILE= ','')
483 self.data.datafile = line
484
493
495 line = line.strip()
496 ls = line.split()
497 self.data.sequence_names.append(ls[0])
498 if len(ls) == 6:
499 self.data.sequence_names.append(ls[3])
500
502 line = line.strip()
503 line = line.replace('command: ','')
504 self.data.command = line
505
516
522
532
535
538
541
542
543
545 """
546 Consumer that can receive events from _MASTScanner.
547
548 A _MASTConsumer parses lines from a mast text output file.
549 The motif match diagrams are parsed using line buffering.
550 Each of the buffering functions have a dummy variable that is
551 required for testing using the Bio.ParserSupport.TaggingConsumer.
552 If this variable isn't there, the TaggingConsumer barfs. In
553 the _MASTScanner, None is passed in the place of this variable.
554 """
556 self.data = MASTRecord()
557 self._current_seq = ""
558 self._line_buffer = []
559 self._buffer_size = 0
560 self._buffered_seq_start = 0
561
566
578
589
613
638
664
670
672 line = line.strip()
673 if not line.startswith('*****'):
674 self._line_buffer.append(line)
675 else:
676 return -1
677
679 """Parses the line buffer to get e-values for each instance of a motif.
680 This buffer parser is the most likely point of failure for the
681 MASTParser.
682 """
683 insts = self.data.get_motif_matches_for_sequence(self._current_seq)
684 if len(insts) > 0:
685
686 fullSeq = self._line_buffer[self._buffer_size-1]
687 pvals = self._line_buffer[1].split()
688 p = 0
689 lpval = len(pvals)
690 while p < lpval:
691 if pvals[p].count('e') > 1:
692
693
694 pvs = []
695 spe = pvals[p].split('e')
696 spe.reverse()
697 dotind = spe[1].find('.')
698 if dotind == -1:
699 thispval = spe[1][-1] + 'e' + spe[0]
700 else:
701 thispval = spe[1][dotind-1:] + 'e' + spe[0]
702 pvs.append(thispval)
703 for spi in range(2,len(spe)):
704 dotind = spe[spi].find('.')
705 prevdotind = spe[spi-1].find('.')
706 if dotind != -1:
707 if prevdotind == -1:
708 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][:-1]
709 else:
710 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][0:prevdotind-1]
711 else:
712 if prevdotind == -1:
713 thispval = spe[spi][-1] + 'e' + spe[spi-1][:-1]
714 else:
715 thispval = spe[spi][-1] + 'e' + spe[spi-1][0:prevdotind-1]
716 pvs.append(thispval)
717 pvs.reverse()
718 if p > 0:
719 pvals = pvals[0:p] + pvs + pvals[p+1:]
720 else:
721 pvals = pvs + pvals[p+1:]
722 lpval = len(pvals)
723 p += 1
724 i = 0
725 if len(pvals) != len(insts):
726 sys.stderr.write("Failure to parse p-values for " + self._current_seq + ": " + self._line_buffer[1] + " to: " + str(pvals) + "\n")
727 pvals = []
728
729
730 for i in range(0,len(insts)):
731 inst = insts[i]
732 start = inst.start - self._buffered_seq_start + 1
733 thisSeq = fullSeq[start:start+inst.length]
734 thisSeq = Seq.Seq(thisSeq, self.data.alphabet)
735 inst._sequence(thisSeq)
736 if pvals:
737 inst._pvalue(float(pvals[i]))
738
740 self._line_buffer = []
741 self._buffer_size = 0
742
744 if self._buffer_size == 0:
745 if len(self._line_buffer) > 0:
746 self._buffer_size = len(self._line_buffer)
747 ll = self._line_buffer[self._buffer_size-1].split()
748 self._line_buffer[self._buffer_size-1] = ll[1]
749 self._buffered_seq_start = int(ll[0])
750 else:
751 i = 0
752 for i in range(self._buffer_size, len(self._line_buffer)-1):
753 self._line_buffer[i-self._buffer_size] = self._line_buffer[i-self._buffer_size] + self._line_buffer[i].strip()
754 ll = self._line_buffer[len(self._line_buffer)-1].split()
755 if int(ll[0]) == self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]):
756 self._line_buffer[self._buffer_size-1] += ll[1]
757 else:
758 differ = int(ll[0]) - (self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]))
759 self._line_buffer[self._buffer_size-1] += "N"*differ
760 self._line_buffer[self._buffer_size-1] += ll[1]
761 self._line_buffer = self._line_buffer[0:self._buffer_size]
762
764 line = line.strip()
765 if line.find('[') != -1 or line.find('<') != -1:
766 pass
767 elif line.find('e') != -1:
768 pass
769 elif line.find('+') != -1:
770 pass
771
774
775
776
778 """
779 Parser for MAST text output. HTML output cannot be parsed, yet. Returns a MASTRecord
780
781 A MASTParser takes a file handle for a MAST text output file and
782 returns a MASTRecord, containing the hits between motifs and
783 sequences. The parser does some unusual line buffering to parse out
784 match diagrams. Really complex diagrams often lead to an error message
785 and p-values not being parsed for a given line.
786
787 Methods:
788 parse (handle): parses the data from the file handle passed to it.
789
790 Example:
791
792 >>>f = open("mast_file.txt")
793 >>>parser = MASTParser()
794 >>>mast_record = parser.parse(f)
795 >>>for motif in mast_record.motifs:
796 >>> for instance in motif.instances:
797 >>> print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
798 """
802
803 - def parse (self, handle):
804 self._scanner.feed(handle, self._consumer)
805 return self._consumer.data
806
807
808
810 """
811 Scanner for MAST text output.
812
813 """
814 - def feed (self, handle, consumer):
823
825 try:
826 read_and_call_until(uhandle, consumer.noevent, contains = "MAST version")
827 except ValueError:
828 raise ValueError("Improper input file. Does not begin with a line with 'MAST version'")
829 read_and_call(uhandle, consumer._version, contains = 'MAST version')
830 read_and_call_until(uhandle, consumer.noevent, start = 'DATABASE AND MOTIFS')
831 read_and_call(uhandle, consumer.noevent, start = 'DATABASE')
832 read_and_call(uhandle, consumer.noevent, start = '****')
833 read_and_call(uhandle, consumer._database, contains = 'DATABASE')
834 read_and_call_until(uhandle, consumer.noevent, contains = 'MOTIF WIDTH')
835 read_and_call(uhandle, consumer.noevent, contains = 'MOTIF')
836 read_and_call(uhandle, consumer.noevent, contains = '----')
837 read_and_call_until(uhandle, consumer._add_motif, blank = 1)
838 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION II:')
839
841 read_and_call_until(uhandle, consumer.noevent, start = 'SEQUENCE NAME')
842 read_and_call(uhandle, consumer.noevent, start = 'SEQUENCE NAME')
843 read_and_call(uhandle, consumer.noevent, start = '---')
844
845 read_and_call_until(uhandle, consumer.noevent, blank = 1)
846 read_and_call(uhandle, consumer.noevent, blank = 1)
847
849 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION III:')
850 read_and_call(uhandle, consumer.noevent, start = 'SECTION III:')
851 read_and_call_until(uhandle, consumer.noevent, start = '****')
852 read_and_call(uhandle, consumer.noevent, start = '****')
853 read_and_call_until(uhandle, consumer.noevent, start = '*****')
854 read_and_call(uhandle, consumer.noevent)
855 read_and_call_while(uhandle, consumer.noevent, blank = 1)
856 readMatches = 1
857 while readMatches == 1:
858 if consumer._current_seq:
859 if consumer._buffer_size != 0:
860 consumer._parse_buffer(None)
861 consumer._blank_buffer(None)
862 read_and_call(uhandle, consumer._set_current_seq)
863 read_and_call_until(uhandle, consumer.noevent, start = ' DIAGRAM')
864 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
865 consumer._add_diagram_from_buffer(None)
866 consumer._blank_buffer(None)
867 read_and_call(uhandle, consumer.noevent, blank = 1)
868 while 1:
869 line = safe_peekline(uhandle)
870 if line.startswith('****'):
871 consumer._parse_buffer(None)
872 readMatches = 0
873 break
874 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
875 read_and_call(uhandle, consumer.noevent, blank = 1)
876 consumer._collapse_buffer(None)
877 if attempt_read_and_call(uhandle, consumer.noevent, blank = 1):
878 break
879 elif attempt_read_and_call(uhandle, consumer.noevent, start = '*****'):
880 consumer._parse_buffer(None)
881 consumer._blank_buffer(None)
882 readMatches = 0
883 break
884
885
886
888 """The class for holding the results from a MAST run.
889
890 A MASTRecord holds data about matches between motifs and sequences.
891 The motifs held by the MASTRecord are objects of the class MEMEMotif.
892
893 Methods:
894 get_motif_matches_for_sequence(sequence_name): returns all of the
895 motif matches within a given sequence. The matches are objects of
896 the class MEMEInstance
897 get_motif_matches (motif_name): returns all of the matches for a motif
898 in the sequences searched. The matches returned are of class
899 MEMEInstance
900 get_motif_by_name (motif_name): returns a MEMEMotif with the given
901 name.
902 """
911
914
920
923
925 insts = []
926 for m in self.motifs:
927 for i in m.instances:
928 if i.sequence_name == seq:
929 insts.append(i)
930 insts.sort(lambda x,y: cmp(x.start, y.start))
931 return insts
932
936
938 self.diagrams[seq] = diagram
939
942
945
948
950 for m in self.motifs:
951 if m.name == name:
952 return m
953