1
2
3
4
5
6 """
7 Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools.
8
9 You are expected to use this module via the Bio.AlignIO functions (or the
10 Bio.SeqIO functions if you want to work directly with the gapped sequences).
11
12 This module contains a parser for the pairwise alignments produced by Bill
13 Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is
14 refered to as the "fasta-m10" file format (as we only support the machine
15 readable output format selected with the -m 10 command line option).
16
17 This module does NOT cover the generic "fasta" file format originally
18 developed as an input format to the FASTA tools. The Bio.AlignIO and
19 Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files,
20 which can also be used to store a multiple sequence alignments.
21 """
22
23 from Bio.Align.Generic import Alignment
24 from Interfaces import AlignmentIterator
25 from Bio.Alphabet import single_letter_alphabet, generic_dna, generic_protein
26 from Bio.Alphabet import Gapped
27
28
30 """Alignment iterator for the FASTA tool's pairwise alignment output.
31
32 This is for reading the pairwise alignments output by Bill Pearson's
33 FASTA program when called with the -m 10 command line option for machine
34 readable output. For more details about the FASTA tools, see the website
35 http://fasta.bioch.virginia.edu/ and the paper:
36
37 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
38
39 This class is intended to be used via the Bio.AlignIO.parse() function
40 by specifying the format as "fasta-m10" as shown in the following code:
41
42 from Bio import AlignIO
43 handle = ...
44 for a in AlignIO.parse(handle, "fasta-m10"):
45 assert len(a.get_all_seqs()) == 2, "Should be pairwise!"
46 print "Alignment length %i" % a.get_alignment_length()
47 for record in a:
48 print record.seq, record.name, record.id
49
50 Note that this is not a full blown parser for all the information
51 in the FASTA output - for example, most of the header and all of the
52 footer is ignored. Also, the alignments are not batched according to
53 the input queries.
54
55 Also note that there can be up to about 30 letters of flanking region
56 included in the raw FASTA output as contextual information. This is NOT
57 part of the alignment itself, and is not included in the resulting
58 Alignment objects returned.
59 """
60
62 """Reads from the handle to construct and return the next alignment.
63
64 This returns the pairwise alignment of query and match/library
65 sequences as an Alignment object containing two rows."""
66 handle = self.handle
67
68 try:
69
70
71 line = self._header
72 del self._header
73 except AttributeError:
74 line = handle.readline()
75 if not line:
76 return None
77
78 if line.startswith("#"):
79
80 line = self._skip_file_header(line)
81 while ">>>" in line and not line.startswith(">>>"):
82
83 self._query_descr = ""
84 self._query_header_annotation = {}
85
86 line = self._parse_query_header(line)
87
88 if not line:
89
90 return None
91 if ">>><<<" in line:
92
93 return None
94
95
96
97 assert line[0:2] == ">>" and not line[2] == ">", line
98
99 query_seq_parts, match_seq_parts = [], []
100 query_annotation, match_annotation = {}, {}
101 match_descr = ""
102 alignment_annotation = {}
103
104
105
106 """
107 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
108 ; fa_frame: f
109 ; fa_initn: 52
110 ; fa_init1: 52
111 ; fa_opt: 70
112 ; fa_z-score: 105.5
113 ; fa_bits: 27.5
114 ; fa_expect: 0.082
115 ; sw_score: 70
116 ; sw_ident: 0.279
117 ; sw_sim: 0.651
118 ; sw_overlap: 43
119 """
120 if (not line[0:2] == ">>") or line[0:3] == ">>>":
121 raise ValueError("Expected target line starting '>>'")
122 match_descr = line[2:].strip()
123
124 line = handle.readline()
125 line = self._parse_tag_section(line, alignment_annotation)
126 assert not line[0:2] == "; "
127
128
129 """
130 >gi|10955265| ..
131 ; sq_len: 346
132 ; sq_offset: 1
133 ; sq_type: p
134 ; al_start: 197
135 ; al_stop: 238
136 ; al_display_start: 167
137 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
138 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
139 GEYFTENKPKYIIREIHQET
140 """
141 if not (line[0] == ">" and line.strip().endswith("..")):
142 raise ValueError("Expected line starting '>' and ending '..'")
143 assert self._query_descr.startswith(line[1:].split(None,1)[0])
144
145
146 line = handle.readline()
147 line = self._parse_tag_section(line, query_annotation)
148 assert not line[0:2] == "; "
149
150
151 while not line[0] == ">":
152 query_seq_parts.append(line.strip())
153 line = handle.readline()
154
155
156 """
157 >gi|152973545|ref|YP_001338596.1| ..
158 ; sq_len: 242
159 ; sq_type: p
160 ; al_start: 52
161 ; al_stop: 94
162 ; al_display_start: 22
163 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
164 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
165 QDFAFTRKMRREARQVEQSW
166 """
167
168 if not (line[0] == ">" and line.strip().endswith("..")):
169 raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
170 assert match_descr.startswith(line[1:].split(None,1)[0])
171
172
173 line = handle.readline()
174 line = self._parse_tag_section(line, match_annotation)
175 assert not line[0:2] == "; "
176
177
178
179 """
180 ; al_cons:
181 .::. : :. ---. :: :. . : ..-:::-: :.: ..:...:
182 etc
183 """
184 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
185 match_seq_parts.append(line.strip())
186 line = handle.readline()
187 if line[0:2] == "; ":
188 assert line.strip() == "; al_cons:"
189 align_consensus_parts = []
190 line = handle.readline()
191 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
192 align_consensus_parts.append(line.strip())
193 line = handle.readline()
194
195 align_consensus = "".join(align_consensus_parts)
196 del align_consensus_parts
197 assert not line[0:2] == "; "
198 else:
199 align_consensus = None
200 assert (line[0] == ">" or ">>>" in line)
201 self._header = line
202
203
204
205 query_seq = "".join(query_seq_parts)
206 match_seq = "".join(match_seq_parts)
207 del query_seq_parts, match_seq_parts
208
209
210
211
212 query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
213 match_align_seq = self._extract_alignment_region(match_seq, match_annotation)
214
215
216
217
218
219 if len(query_align_seq) != len(match_align_seq):
220 raise ValueError("Problem parsing the alignment sequence coordinates, "
221 "following should be the same length but are not:\n"
222 "%s - len %i\n%s - len %i" % (query_align_seq,
223 len(query_align_seq),
224 match_align_seq,
225 len(match_align_seq)))
226 if "sw_overlap" in alignment_annotation:
227 if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
228 raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
229 % (alignment_annotation["sw_overlap"],
230 len(query_align_seq)))
231
232
233 alphabet = self.alphabet
234 alignment = Alignment(alphabet)
235
236
237
238 alignment._annotations = {}
239
240
241 for key, value in self._query_header_annotation.iteritems():
242 alignment._annotations[key] = value
243 for key, value in alignment_annotation.iteritems():
244 alignment._annotations[key] = value
245
246
247
248
249 alignment.add_sequence(self._query_descr, query_align_seq)
250 record = alignment.get_all_seqs()[-1]
251 assert record.id == self._query_descr or record.description == self._query_descr
252
253 record.id = self._query_descr.split(None,1)[0].strip(",")
254 record.name = "query"
255 record.annotations["original_length"] = int(query_annotation["sq_len"])
256
257 record._al_start = int(query_annotation["al_start"])
258 record._al_stop = int(query_annotation["al_stop"])
259
260
261
262
263 if alphabet == single_letter_alphabet and "sq_type" in query_annotation:
264 if query_annotation["sq_type"] == "D":
265 record.seq.alphabet = generic_dna
266 elif query_annotation["sq_type"] == "p":
267 record.seq.alphabet = generic_protein
268 if "-" in query_align_seq:
269 if not hasattr(record.seq.alphabet,"gap_char"):
270 record.seq.alphabet = Gapped(record.seq.alphabet, "-")
271
272 alignment.add_sequence(match_descr, match_align_seq)
273 record = alignment.get_all_seqs()[-1]
274 assert record.id == match_descr or record.description == match_descr
275
276 record.id = match_descr.split(None,1)[0].strip(",")
277 record.name = "match"
278 record.annotations["original_length"] = int(match_annotation["sq_len"])
279
280 record._al_start = int(match_annotation["al_start"])
281 record._al_stop = int(match_annotation["al_stop"])
282
283
284 if alphabet == single_letter_alphabet and "sq_type" in match_annotation:
285 if match_annotation["sq_type"] == "D":
286 record.seq.alphabet = generic_dna
287 elif match_annotation["sq_type"] == "p":
288 record.seq.alphabet = generic_protein
289 if "-" in match_align_seq:
290 if not hasattr(record.seq.alphabet,"gap_char"):
291 record.seq.alphabet = Gapped(record.seq.alphabet, "-")
292
293 return alignment
294
296 """Helper function for the main parsing code.
297
298 Skips over the file header region.
299 """
300
301 """
302 # /home/xxx/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X "-5 -5" NC_002127.faa NC_009649.faa
303 FASTA searches a protein or DNA sequence data bank
304 version 35.03 Feb. 18, 2008
305 Please cite:
306 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
307
308 Query: NC_002127.faa
309 """
310
311
312
313 while ">>>" not in line:
314 line = self.handle.readline()
315 return line
316
318 """Helper function for the main parsing code.
319
320 Skips over the free format query header, extracting the tagged parameters.
321
322 If there are no hits for the current query, it is skipped entirely."""
323
324 """
325 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa
326 Library: NC_009649.faa 45119 residues in 180 sequences
327
328 45119 residues in 180 sequences
329 Statistics: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508
330 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32
331 Lambda= 0.191631
332 Algorithm: FASTA (3.5 Sept 2006) [optimized]
333 Parameters: BL50 matrix (15:-5) ktup: 2
334 join: 36, opt: 24, open/ext: -10/-2, width: 16
335 Scan time: 0.040
336
337 The best scores are: opt bits E(180)
338 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 23.3 0.22
339 gi|152973501|ref|YP_001338552.1| hypothetical prot ( 245) 55 22.5 0.93
340 """
341
342
343 """
344 2>>>gi|152973838|ref|YP_001338875.1| hypothetical protein KPN_pKPN7p10263 [Klebsiella pneumoniae subsp. pneumonia 76 aa - 76 aa
345 vs NC_002127.faa library
346
347 579 residues in 3 sequences
348 Altschul/Gish params: n0: 76 Lambda: 0.158 K: 0.019 H: 0.100
349
350 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
351 join: 36, opt: 24, open/ext: -10/-2, width: 16
352 Scan time: 0.000
353 !! No library sequences with E() < 0.5
354 """
355
356 self._query_header_annotation = {}
357 self._query_descr = ""
358
359 assert ">>>" in line and not line[0:3] == ">>>"
360
361
362 line = self.handle.readline()
363
364 while not line[0:3] == ">>>":
365
366 line = self.handle.readline()
367 if not line:
368 raise ValueError("Premature end of file!")
369 if ">>><<<" in line:
370
371
372 return line
373
374
375 """
376 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library
377 ; pg_name: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35
378 ; pg_ver: 35.03
379 ; pg_argv: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X -5 -5 NC_002127.faa NC_009649.faa
380 ; pg_name: FASTA
381 ; pg_ver: 3.5 Sept 2006
382 ; pg_matrix: BL50 (15:-5)
383 ; pg_open-ext: -10 -2
384 ; pg_ktup: 2
385 ; pg_optcut: 24
386 ; pg_cgap: 36
387 ; mp_extrap: 60000 500
388 ; mp_stats: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32 Lambda= 0.191631
389 ; mp_KS: -0.0000 (N=1066338402) at 20
390 ; mp_Algorithm: FASTA (3.5 Sept 2006) [optimized]
391 ; mp_Parameters: BL50 matrix (15:-5) ktup: 2 join: 36, opt: 24, open/ext: -10/-2, width: 16
392 """
393
394 assert line[0:3] == ">>>", line
395 self._query_descr = line[3:].strip()
396
397
398 line = self.handle.readline()
399 line = self._parse_tag_section(line, self._query_header_annotation)
400 assert not line[0:2] == "; ", line
401 assert line[0:2] == ">>" or ">>>" in line, line
402 return line
403
404
406 """Helper function for the main parsing code.
407
408 To get the actual pairwise alignment sequences, we must first
409 translate the un-gapped sequence based coordinates into positions
410 in the gapped sequence (which may have a flanking region shown
411 using leading - characters). To date, I have never seen any
412 trailing flanking region shown in the m10 file, but the
413 following code should also cope with that.
414
415 Note that this code seems to work fine even when the "sq_offset"
416 entries are prsent as a result of using the -X command line option.
417 """
418 align_stripped = alignment_seq_with_flanking.strip("-")
419 display_start = int(annotation['al_display_start'])
420 if int(annotation['al_start']) <= int(annotation['al_stop']):
421 start = int(annotation['al_start']) \
422 - display_start
423 end = int(annotation['al_stop']) \
424 - display_start \
425 + align_stripped.count("-") + 1
426 else:
427
428 start = display_start \
429 - int(annotation['al_start'])
430 end = display_start \
431 - int(annotation['al_stop']) \
432 + align_stripped.count("-") + 1
433 assert 0 <= start and start < end and end <= len(align_stripped), \
434 "Problem with sequence start/stop,\n%s[%i:%i]\n%s" \
435 % (alignment_seq_with_flanking, start, end, annotation)
436 return align_stripped[start:end]
437
438
440 """Helper function for the main parsing code.
441
442 line - supply line just read from the handle containing the start of
443 the tagged section.
444 dictionary - where to record the tagged values
445
446 Returns a string, the first line following the tagged section."""
447 if not line[0:2] == "; ":
448 raise ValueError("Expected line starting '; '")
449 while line[0:2] == "; ":
450 tag, value = line[2:].strip().split(": ",1)
451
452
453
454
455
456
457 dictionary[tag] = value
458 line = self.handle.readline()
459 return line
460
461 if __name__ == "__main__":
462 print "Running a quick self-test"
463
464
465 simple_example = \
466 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
467 FASTA searches a protein or DNA sequence data bank
468 version 34.26 January 12, 2007
469 Please cite:
470 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
471
472 Query library NC_002127.faa vs NC_009649.faa library
473 searching NC_009649.faa library
474
475 1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa
476 vs NC_009649.faa library
477
478 45119 residues in 180 sequences
479 Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273
480 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25
481 Lambda= 0.175043
482
483 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
484 join: 36, opt: 24, open/ext: -10/-2, width: 16
485 Scan time: 0.000
486 The best scores are: opt bits E(180)
487 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931) 71 24.9 0.58
488 gi|152973588|ref|YP_001338639.1| F pilus assembly ( 459) 63 23.1 0.99
489
490 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library
491 ; pg_name: /opt/fasta/fasta34
492 ; pg_ver: 34.26
493 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
494 ; pg_name: FASTA
495 ; pg_ver: 3.5 Sept 2006
496 ; pg_matrix: BL50 (15:-5)
497 ; pg_open-ext: -10 -2
498 ; pg_ktup: 2
499 ; pg_optcut: 24
500 ; pg_cgap: 36
501 ; mp_extrap: 60000 180
502 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 Lambda= 0.175043
503 ; mp_KS: -0.0000 (N=0) at 8159228
504 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
505 ; fa_frame: f
506 ; fa_initn: 65
507 ; fa_init1: 43
508 ; fa_opt: 71
509 ; fa_z-score: 90.3
510 ; fa_bits: 24.9
511 ; fa_expect: 0.58
512 ; sw_score: 71
513 ; sw_ident: 0.250
514 ; sw_sim: 0.574
515 ; sw_overlap: 108
516 >gi|10955263| ..
517 ; sq_len: 107
518 ; sq_offset: 1
519 ; sq_type: p
520 ; al_start: 5
521 ; al_stop: 103
522 ; al_display_start: 1
523 --------------------------MTKRSGSNT-RRRAISRPVRLTAE
524 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM-----
525 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD
526 >gi|152973457|ref|YP_001338508.1| ..
527 ; sq_len: 931
528 ; sq_type: p
529 ; al_start: 96
530 ; al_stop: 195
531 ; al_display_start: 66
532 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ
533 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI
534 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY
535 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ
536 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
537 ; fa_frame: f
538 ; fa_initn: 33
539 ; fa_init1: 33
540 ; fa_opt: 63
541 ; fa_z-score: 86.1
542 ; fa_bits: 23.1
543 ; fa_expect: 0.99
544 ; sw_score: 63
545 ; sw_ident: 0.266
546 ; sw_sim: 0.656
547 ; sw_overlap: 64
548 >gi|10955263| ..
549 ; sq_len: 107
550 ; sq_offset: 1
551 ; sq_type: p
552 ; al_start: 32
553 ; al_stop: 94
554 ; al_display_start: 2
555 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK
556 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL
557 LSRLMAD
558 >gi|152973588|ref|YP_001338639.1| ..
559 ; sq_len: 459
560 ; sq_type: p
561 ; al_start: 191
562 ; al_stop: 248
563 ; al_display_start: 161
564 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK
565 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG
566 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT
567 SNKALKSQISALLSSIQNKAVADEKLTDQE
568 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa
569 vs NC_009649.faa library
570
571 45119 residues in 180 sequences
572 Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313
573 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25
574 Lambda= 0.179384
575
576 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
577 join: 36, opt: 24, open/ext: -10/-2, width: 16
578 Scan time: 0.000
579 The best scores are: opt bits E(180)
580 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 22.9 0.29
581
582 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library
583 ; pg_name: /opt/fasta/fasta34
584 ; pg_ver: 34.26
585 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
586 ; pg_name: FASTA
587 ; pg_ver: 3.5 Sept 2006
588 ; pg_matrix: BL50 (15:-5)
589 ; pg_open-ext: -10 -2
590 ; pg_ktup: 2
591 ; pg_optcut: 24
592 ; pg_cgap: 36
593 ; mp_extrap: 60000 180
594 ; mp_stats: Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 Lambda= 0.179384
595 ; mp_KS: -0.0000 (N=0) at 8159228
596 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
597 ; fa_frame: f
598 ; fa_initn: 50
599 ; fa_init1: 50
600 ; fa_opt: 58
601 ; fa_z-score: 95.8
602 ; fa_bits: 22.9
603 ; fa_expect: 0.29
604 ; sw_score: 58
605 ; sw_ident: 0.289
606 ; sw_sim: 0.632
607 ; sw_overlap: 38
608 >gi|10955264| ..
609 ; sq_len: 126
610 ; sq_offset: 1
611 ; sq_type: p
612 ; al_start: 1
613 ; al_stop: 38
614 ; al_display_start: 1
615 ------------------------------MKKDKKYQIEAIKNKDKTLF
616 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF
617 NGEKFSSYTLNKVTKTDEYN
618 >gi|152973462|ref|YP_001338513.1| ..
619 ; sq_len: 101
620 ; sq_type: p
621 ; al_start: 44
622 ; al_stop: 81
623 ; al_display_start: 14
624 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI
625 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA
626 3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa
627 vs NC_009649.faa library
628
629 45119 residues in 180 sequences
630 Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461
631 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25
632 Lambda= 0.210386
633
634 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
635 join: 37, opt: 25, open/ext: -10/-2, width: 16
636 Scan time: 0.020
637 The best scores are: opt bits E(180)
638 gi|152973545|ref|YP_001338596.1| putative plasmid ( 242) 70 27.5 0.082
639
640 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library
641 ; pg_name: /opt/fasta/fasta34
642 ; pg_ver: 34.26
643 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
644 ; pg_name: FASTA
645 ; pg_ver: 3.5 Sept 2006
646 ; pg_matrix: BL50 (15:-5)
647 ; pg_open-ext: -10 -2
648 ; pg_ktup: 2
649 ; pg_optcut: 25
650 ; pg_cgap: 37
651 ; mp_extrap: 60000 180
652 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 Lambda= 0.210386
653 ; mp_KS: -0.0000 (N=0) at 8159228
654 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
655 ; fa_frame: f
656 ; fa_initn: 52
657 ; fa_init1: 52
658 ; fa_opt: 70
659 ; fa_z-score: 105.5
660 ; fa_bits: 27.5
661 ; fa_expect: 0.082
662 ; sw_score: 70
663 ; sw_ident: 0.279
664 ; sw_sim: 0.651
665 ; sw_overlap: 43
666 >gi|10955265| ..
667 ; sq_len: 346
668 ; sq_offset: 1
669 ; sq_type: p
670 ; al_start: 197
671 ; al_stop: 238
672 ; al_display_start: 167
673 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
674 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
675 GEYFTENKPKYIIREIHQET
676 >gi|152973545|ref|YP_001338596.1| ..
677 ; sq_len: 242
678 ; sq_type: p
679 ; al_start: 52
680 ; al_stop: 94
681 ; al_display_start: 22
682 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
683 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
684 QDFAFTRKMRREARQVEQSW
685 >>><<<
686
687
688 579 residues in 3 query sequences
689 45119 residues in 180 library sequences
690 Scomplib [34.26]
691 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008
692 Total Scan time: 0.020 Total Display time: 0.010
693
694 Function used was FASTA [version 34.26 January 12, 2007]
695
696 """
697
698
699 from StringIO import StringIO
700
701 alignments = list(FastaM10Iterator(StringIO(simple_example)))
702 assert len(alignments) == 4, len(alignments)
703 assert len(alignments[0].get_all_seqs()) == 2
704 for a in alignments:
705 print "Alignment %i sequences of length %i" \
706 % (len(a.get_all_seqs()), a.get_alignment_length())
707 for r in a:
708 print "%s %s %i" % (r.seq, r.id, r.annotations["original_length"])
709
710 print "Done"
711
712 import os
713 path = "../../Tests/Fasta/"
714 files = [f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10"]
715 files.sort()
716 for filename in files:
717 if os.path.splitext(filename)[-1] == ".m10":
718 print
719 print filename
720 print "="*len(filename)
721 for i,a in enumerate(FastaM10Iterator(open(os.path.join(path,filename)))):
722 print "#%i, %s" % (i+1,a)
723 for r in a:
724 if "-" in r.seq:
725 assert r.seq.alphabet.gap_char == "-"
726 else:
727 assert not hasattr(r.seq.alphabet, "gap_char")
728