Package Bio :: Package AlignIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.FastaIO

  1  # Copyright 2008-2009 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11   
 12  This module contains a parser for the pairwise alignments produced by Bill 
 13  Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is 
 14  refered to as the "fasta-m10" file format (as we only support the machine 
 15  readable output format selected with the -m 10 command line option). 
 16   
 17  This module does NOT cover the generic "fasta" file format originally 
 18  developed as an input format to the FASTA tools.  The Bio.AlignIO and 
 19  Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files, 
 20  which can also be used to store a multiple sequence alignments. 
 21  """ 
 22   
 23  from Bio.Align.Generic import Alignment 
 24  from Interfaces import AlignmentIterator 
 25  from Bio.Alphabet import single_letter_alphabet, generic_dna, generic_protein 
 26  from Bio.Alphabet import Gapped 
 27   
 28   
29 -class FastaM10Iterator(AlignmentIterator):
30 """Alignment iterator for the FASTA tool's pairwise alignment output. 31 32 This is for reading the pairwise alignments output by Bill Pearson's 33 FASTA program when called with the -m 10 command line option for machine 34 readable output. For more details about the FASTA tools, see the website 35 http://fasta.bioch.virginia.edu/ and the paper: 36 37 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 38 39 This class is intended to be used via the Bio.AlignIO.parse() function 40 by specifying the format as "fasta-m10" as shown in the following code: 41 42 from Bio import AlignIO 43 handle = ... 44 for a in AlignIO.parse(handle, "fasta-m10"): 45 assert len(a.get_all_seqs()) == 2, "Should be pairwise!" 46 print "Alignment length %i" % a.get_alignment_length() 47 for record in a: 48 print record.seq, record.name, record.id 49 50 Note that this is not a full blown parser for all the information 51 in the FASTA output - for example, most of the header and all of the 52 footer is ignored. Also, the alignments are not batched according to 53 the input queries. 54 55 Also note that there can be up to about 30 letters of flanking region 56 included in the raw FASTA output as contextual information. This is NOT 57 part of the alignment itself, and is not included in the resulting 58 Alignment objects returned. 59 """ 60
61 - def next(self):
62 """Reads from the handle to construct and return the next alignment. 63 64 This returns the pairwise alignment of query and match/library 65 sequences as an Alignment object containing two rows.""" 66 handle = self.handle 67 68 try: 69 #Header we saved from when we were parsing 70 #the previous alignment. 71 line = self._header 72 del self._header 73 except AttributeError: 74 line = handle.readline() 75 if not line: 76 return None 77 78 if line.startswith("#"): 79 #Skip the file header before the alignments. e.g. 80 line = self._skip_file_header(line) 81 while ">>>" in line and not line.startswith(">>>"): 82 #Moved onto the next query sequence! 83 self._query_descr = "" 84 self._query_header_annotation = {} 85 #Read in the query header 86 line = self._parse_query_header(line) 87 #Now should be some alignments, but if not we move onto the next query 88 if not line: 89 #End of file 90 return None 91 if ">>><<<" in line: 92 #Reached the end of the alignments, no need to read the footer... 93 return None 94 95 96 #Should start >>... and not >>>... 97 assert line[0:2] == ">>" and not line[2] == ">", line 98 99 query_seq_parts, match_seq_parts = [], [] 100 query_annotation, match_annotation = {}, {} 101 match_descr = "" 102 alignment_annotation = {} 103 104 #This should be followed by the target match ID line, then more tags. 105 #e.g. 106 """ 107 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 108 ; fa_frame: f 109 ; fa_initn: 52 110 ; fa_init1: 52 111 ; fa_opt: 70 112 ; fa_z-score: 105.5 113 ; fa_bits: 27.5 114 ; fa_expect: 0.082 115 ; sw_score: 70 116 ; sw_ident: 0.279 117 ; sw_sim: 0.651 118 ; sw_overlap: 43 119 """ 120 if (not line[0:2] == ">>") or line[0:3] == ">>>": 121 raise ValueError("Expected target line starting '>>'") 122 match_descr = line[2:].strip() 123 #Handle the following "alignment hit" tagged data, e.g. 124 line = handle.readline() 125 line = self._parse_tag_section(line, alignment_annotation) 126 assert not line[0:2] == "; " 127 128 #Then we have the alignment numbers and sequence for the query 129 """ 130 >gi|10955265| .. 131 ; sq_len: 346 132 ; sq_offset: 1 133 ; sq_type: p 134 ; al_start: 197 135 ; al_stop: 238 136 ; al_display_start: 167 137 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK 138 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL 139 GEYFTENKPKYIIREIHQET 140 """ 141 if not (line[0] == ">" and line.strip().endswith("..")): 142 raise ValueError("Expected line starting '>' and ending '..'") 143 assert self._query_descr.startswith(line[1:].split(None,1)[0]) 144 145 #Handle the following "query alignment" tagged data 146 line = handle.readline() 147 line = self._parse_tag_section(line, query_annotation) 148 assert not line[0:2] == "; " 149 150 #Now should have the aligned query sequence (with leading flanking region) 151 while not line[0] == ">": 152 query_seq_parts.append(line.strip()) 153 line = handle.readline() 154 155 #Handle the following "match alignment" data 156 """ 157 >gi|152973545|ref|YP_001338596.1| .. 158 ; sq_len: 242 159 ; sq_type: p 160 ; al_start: 52 161 ; al_stop: 94 162 ; al_display_start: 22 163 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD 164 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR 165 QDFAFTRKMRREARQVEQSW 166 """ 167 #Match identifier 168 if not (line[0] == ">" and line.strip().endswith("..")): 169 raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line)) 170 assert match_descr.startswith(line[1:].split(None,1)[0]) 171 172 #Tagged data, 173 line = handle.readline() 174 line = self._parse_tag_section(line, match_annotation) 175 assert not line[0:2] == "; " 176 177 #Now should have the aligned query sequence with flanking region... 178 #but before that, since FASTA 35.4.1 there can be an consensus here, 179 """ 180 ; al_cons: 181 .::. : :. ---. :: :. . : ..-:::-: :.: ..:...: 182 etc 183 """ 184 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line): 185 match_seq_parts.append(line.strip()) 186 line = handle.readline() 187 if line[0:2] == "; ": 188 assert line.strip() == "; al_cons:" 189 align_consensus_parts = [] 190 line = handle.readline() 191 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line): 192 align_consensus_parts.append(line.strip()) 193 line = handle.readline() 194 #If we do anything with this in future, must remove any flanking region. 195 align_consensus = "".join(align_consensus_parts) 196 del align_consensus_parts 197 assert not line[0:2] == "; " 198 else: 199 align_consensus = None 200 assert (line[0] == ">" or ">>>" in line) 201 self._header = line 202 203 #We built a list of strings and then joined them because 204 #its faster than appending to a string. 205 query_seq = "".join(query_seq_parts) 206 match_seq = "".join(match_seq_parts) 207 del query_seq_parts, match_seq_parts 208 #Note, query_seq and match_seq will usually be of different lengths, apparently 209 #because in the m10 format leading gaps are added but not trailing gaps! 210 211 #Remove the flanking regions, 212 query_align_seq = self._extract_alignment_region(query_seq, query_annotation) 213 match_align_seq = self._extract_alignment_region(match_seq, match_annotation) 214 #How can we do this for the (optional) consensus? 215 216 #The "sq_offset" values can be specified with the -X command line option. 217 #They appear to just shift the origin used in the calculation of the coordinates. 218 219 if len(query_align_seq) != len(match_align_seq): 220 raise ValueError("Problem parsing the alignment sequence coordinates, " 221 "following should be the same length but are not:\n" 222 "%s - len %i\n%s - len %i" % (query_align_seq, 223 len(query_align_seq), 224 match_align_seq, 225 len(match_align_seq))) 226 if "sw_overlap" in alignment_annotation: 227 if int(alignment_annotation["sw_overlap"]) != len(query_align_seq): 228 raise ValueError("Specified sw_overlap = %s does not match expected value %i" \ 229 % (alignment_annotation["sw_overlap"], 230 len(query_align_seq))) 231 232 #TODO - Look at the "sq_type" to assign a sensible alphabet? 233 alphabet = self.alphabet 234 alignment = Alignment(alphabet) 235 236 #TODO - Introduce an annotated alignment class? 237 #For now, store the annotation a new private property: 238 alignment._annotations = {} 239 240 #Want to record both the query header tags, and the alignment tags. 241 for key, value in self._query_header_annotation.iteritems(): 242 alignment._annotations[key] = value 243 for key, value in alignment_annotation.iteritems(): 244 alignment._annotations[key] = value 245 246 247 #TODO - Once the alignment object gets an append method, use it. 248 #(i.e. an add SeqRecord method) 249 alignment.add_sequence(self._query_descr, query_align_seq) 250 record = alignment.get_all_seqs()[-1] 251 assert record.id == self._query_descr or record.description == self._query_descr 252 #assert record.seq.tostring() == query_align_seq 253 record.id = self._query_descr.split(None,1)[0].strip(",") 254 record.name = "query" 255 record.annotations["original_length"] = int(query_annotation["sq_len"]) 256 #TODO - handle start/end coordinates properly. Short term hack for now: 257 record._al_start = int(query_annotation["al_start"]) 258 record._al_stop = int(query_annotation["al_stop"]) 259 260 #TODO - What if a specific alphabet has been requested? 261 #TODO - Use an IUPAC alphabet? 262 #TODO - Can FASTA output RNA? 263 if alphabet == single_letter_alphabet and "sq_type" in query_annotation: 264 if query_annotation["sq_type"] == "D": 265 record.seq.alphabet = generic_dna 266 elif query_annotation["sq_type"] == "p": 267 record.seq.alphabet = generic_protein 268 if "-" in query_align_seq: 269 if not hasattr(record.seq.alphabet,"gap_char"): 270 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 271 272 alignment.add_sequence(match_descr, match_align_seq) 273 record = alignment.get_all_seqs()[-1] 274 assert record.id == match_descr or record.description == match_descr 275 #assert record.seq.tostring() == match_align_seq 276 record.id = match_descr.split(None,1)[0].strip(",") 277 record.name = "match" 278 record.annotations["original_length"] = int(match_annotation["sq_len"]) 279 #TODO - handle start/end coordinates properly. Short term hack for now: 280 record._al_start = int(match_annotation["al_start"]) 281 record._al_stop = int(match_annotation["al_stop"]) 282 283 #This is still a very crude way of dealing with the alphabet: 284 if alphabet == single_letter_alphabet and "sq_type" in match_annotation: 285 if match_annotation["sq_type"] == "D": 286 record.seq.alphabet = generic_dna 287 elif match_annotation["sq_type"] == "p": 288 record.seq.alphabet = generic_protein 289 if "-" in match_align_seq: 290 if not hasattr(record.seq.alphabet,"gap_char"): 291 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 292 293 return alignment
294
295 - def _skip_file_header(self, line):
296 """Helper function for the main parsing code. 297 298 Skips over the file header region. 299 """ 300 #e.g. This region: 301 """ 302 # /home/xxx/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X "-5 -5" NC_002127.faa NC_009649.faa 303 FASTA searches a protein or DNA sequence data bank 304 version 35.03 Feb. 18, 2008 305 Please cite: 306 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 307 308 Query: NC_002127.faa 309 """ 310 #Note that there is no point recording the command line here 311 #from the # line, as it is included again in each alignment 312 #under the "pg_argv" tag. Likewise for the program version. 313 while ">>>" not in line: 314 line = self.handle.readline() 315 return line
316
317 - def _parse_query_header(self, line):
318 """Helper function for the main parsing code. 319 320 Skips over the free format query header, extracting the tagged parameters. 321 322 If there are no hits for the current query, it is skipped entirely.""" 323 #e.g. this region (where there is often a histogram too): 324 """ 325 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa 326 Library: NC_009649.faa 45119 residues in 180 sequences 327 328 45119 residues in 180 sequences 329 Statistics: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508 330 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32 331 Lambda= 0.191631 332 Algorithm: FASTA (3.5 Sept 2006) [optimized] 333 Parameters: BL50 matrix (15:-5) ktup: 2 334 join: 36, opt: 24, open/ext: -10/-2, width: 16 335 Scan time: 0.040 336 337 The best scores are: opt bits E(180) 338 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 23.3 0.22 339 gi|152973501|ref|YP_001338552.1| hypothetical prot ( 245) 55 22.5 0.93 340 """ 341 #Sometimes have queries with no matches, in which case we continue to the 342 #next query block: 343 """ 344 2>>>gi|152973838|ref|YP_001338875.1| hypothetical protein KPN_pKPN7p10263 [Klebsiella pneumoniae subsp. pneumonia 76 aa - 76 aa 345 vs NC_002127.faa library 346 347 579 residues in 3 sequences 348 Altschul/Gish params: n0: 76 Lambda: 0.158 K: 0.019 H: 0.100 349 350 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 351 join: 36, opt: 24, open/ext: -10/-2, width: 16 352 Scan time: 0.000 353 !! No library sequences with E() < 0.5 354 """ 355 356 self._query_header_annotation = {} 357 self._query_descr = "" 358 359 assert ">>>" in line and not line[0:3] == ">>>" 360 #There is nothing useful in this line, the query description is truncated. 361 362 line = self.handle.readline() 363 #We ignore the free form text... 364 while not line[0:3] == ">>>": 365 #print "Ignoring %s" % line.strip() 366 line = self.handle.readline() 367 if not line: 368 raise ValueError("Premature end of file!") 369 if ">>><<<" in line: 370 #End of alignments, looks like the last query 371 #or queries had no hits. 372 return line 373 374 #Now want to parse this section: 375 """ 376 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library 377 ; pg_name: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35 378 ; pg_ver: 35.03 379 ; pg_argv: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X -5 -5 NC_002127.faa NC_009649.faa 380 ; pg_name: FASTA 381 ; pg_ver: 3.5 Sept 2006 382 ; pg_matrix: BL50 (15:-5) 383 ; pg_open-ext: -10 -2 384 ; pg_ktup: 2 385 ; pg_optcut: 24 386 ; pg_cgap: 36 387 ; mp_extrap: 60000 500 388 ; mp_stats: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32 Lambda= 0.191631 389 ; mp_KS: -0.0000 (N=1066338402) at 20 390 ; mp_Algorithm: FASTA (3.5 Sept 2006) [optimized] 391 ; mp_Parameters: BL50 matrix (15:-5) ktup: 2 join: 36, opt: 24, open/ext: -10/-2, width: 16 392 """ 393 394 assert line[0:3] == ">>>", line 395 self._query_descr = line[3:].strip() 396 397 #Handle the following "program" tagged data, 398 line = self.handle.readline() 399 line = self._parse_tag_section(line, self._query_header_annotation) 400 assert not line[0:2] == "; ", line 401 assert line[0:2] == ">>" or ">>>" in line, line 402 return line
403 404
405 - def _extract_alignment_region(self, alignment_seq_with_flanking, annotation):
406 """Helper function for the main parsing code. 407 408 To get the actual pairwise alignment sequences, we must first 409 translate the un-gapped sequence based coordinates into positions 410 in the gapped sequence (which may have a flanking region shown 411 using leading - characters). To date, I have never seen any 412 trailing flanking region shown in the m10 file, but the 413 following code should also cope with that. 414 415 Note that this code seems to work fine even when the "sq_offset" 416 entries are prsent as a result of using the -X command line option. 417 """ 418 align_stripped = alignment_seq_with_flanking.strip("-") 419 display_start = int(annotation['al_display_start']) 420 if int(annotation['al_start']) <= int(annotation['al_stop']): 421 start = int(annotation['al_start']) \ 422 - display_start 423 end = int(annotation['al_stop']) \ 424 - display_start \ 425 + align_stripped.count("-") + 1 426 else: 427 #FASTA has flipped this sequence... 428 start = display_start \ 429 - int(annotation['al_start']) 430 end = display_start \ 431 - int(annotation['al_stop']) \ 432 + align_stripped.count("-") + 1 433 assert 0 <= start and start < end and end <= len(align_stripped), \ 434 "Problem with sequence start/stop,\n%s[%i:%i]\n%s" \ 435 % (alignment_seq_with_flanking, start, end, annotation) 436 return align_stripped[start:end]
437 438
439 - def _parse_tag_section(self, line, dictionary):
440 """Helper function for the main parsing code. 441 442 line - supply line just read from the handle containing the start of 443 the tagged section. 444 dictionary - where to record the tagged values 445 446 Returns a string, the first line following the tagged section.""" 447 if not line[0:2] == "; ": 448 raise ValueError("Expected line starting '; '") 449 while line[0:2] == "; ": 450 tag, value = line[2:].strip().split(": ",1) 451 #fasta34 and early versions of fasta35 will reuse the pg_name and 452 #pg_ver tags for the program executable and name, and the program 453 #version and the algorithm version, respectively. This is fixed 454 #in FASTA 35.4.1, but we can't assume the tags are unique: 455 #if tag in dictionary: 456 # raise ValueError("Repeated tag '%s' in section" % tag) 457 dictionary[tag] = value 458 line = self.handle.readline() 459 return line
460 461 if __name__ == "__main__": 462 print "Running a quick self-test" 463 464 #http://emboss.sourceforge.net/docs/themes/alnformats/align.simple 465 simple_example = \ 466 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 467 FASTA searches a protein or DNA sequence data bank 468 version 34.26 January 12, 2007 469 Please cite: 470 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 471 472 Query library NC_002127.faa vs NC_009649.faa library 473 searching NC_009649.faa library 474 475 1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa 476 vs NC_009649.faa library 477 478 45119 residues in 180 sequences 479 Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 480 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 481 Lambda= 0.175043 482 483 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 484 join: 36, opt: 24, open/ext: -10/-2, width: 16 485 Scan time: 0.000 486 The best scores are: opt bits E(180) 487 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931) 71 24.9 0.58 488 gi|152973588|ref|YP_001338639.1| F pilus assembly ( 459) 63 23.1 0.99 489 490 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library 491 ; pg_name: /opt/fasta/fasta34 492 ; pg_ver: 34.26 493 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 494 ; pg_name: FASTA 495 ; pg_ver: 3.5 Sept 2006 496 ; pg_matrix: BL50 (15:-5) 497 ; pg_open-ext: -10 -2 498 ; pg_ktup: 2 499 ; pg_optcut: 24 500 ; pg_cgap: 36 501 ; mp_extrap: 60000 180 502 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 Lambda= 0.175043 503 ; mp_KS: -0.0000 (N=0) at 8159228 504 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 505 ; fa_frame: f 506 ; fa_initn: 65 507 ; fa_init1: 43 508 ; fa_opt: 71 509 ; fa_z-score: 90.3 510 ; fa_bits: 24.9 511 ; fa_expect: 0.58 512 ; sw_score: 71 513 ; sw_ident: 0.250 514 ; sw_sim: 0.574 515 ; sw_overlap: 108 516 >gi|10955263| .. 517 ; sq_len: 107 518 ; sq_offset: 1 519 ; sq_type: p 520 ; al_start: 5 521 ; al_stop: 103 522 ; al_display_start: 1 523 --------------------------MTKRSGSNT-RRRAISRPVRLTAE 524 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM----- 525 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD 526 >gi|152973457|ref|YP_001338508.1| .. 527 ; sq_len: 931 528 ; sq_type: p 529 ; al_start: 96 530 ; al_stop: 195 531 ; al_display_start: 66 532 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ 533 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI 534 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY 535 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ 536 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 537 ; fa_frame: f 538 ; fa_initn: 33 539 ; fa_init1: 33 540 ; fa_opt: 63 541 ; fa_z-score: 86.1 542 ; fa_bits: 23.1 543 ; fa_expect: 0.99 544 ; sw_score: 63 545 ; sw_ident: 0.266 546 ; sw_sim: 0.656 547 ; sw_overlap: 64 548 >gi|10955263| .. 549 ; sq_len: 107 550 ; sq_offset: 1 551 ; sq_type: p 552 ; al_start: 32 553 ; al_stop: 94 554 ; al_display_start: 2 555 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK 556 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL 557 LSRLMAD 558 >gi|152973588|ref|YP_001338639.1| .. 559 ; sq_len: 459 560 ; sq_type: p 561 ; al_start: 191 562 ; al_stop: 248 563 ; al_display_start: 161 564 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK 565 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG 566 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT 567 SNKALKSQISALLSSIQNKAVADEKLTDQE 568 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa 569 vs NC_009649.faa library 570 571 45119 residues in 180 sequences 572 Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 573 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 574 Lambda= 0.179384 575 576 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 577 join: 36, opt: 24, open/ext: -10/-2, width: 16 578 Scan time: 0.000 579 The best scores are: opt bits E(180) 580 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 22.9 0.29 581 582 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library 583 ; pg_name: /opt/fasta/fasta34 584 ; pg_ver: 34.26 585 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 586 ; pg_name: FASTA 587 ; pg_ver: 3.5 Sept 2006 588 ; pg_matrix: BL50 (15:-5) 589 ; pg_open-ext: -10 -2 590 ; pg_ktup: 2 591 ; pg_optcut: 24 592 ; pg_cgap: 36 593 ; mp_extrap: 60000 180 594 ; mp_stats: Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 Lambda= 0.179384 595 ; mp_KS: -0.0000 (N=0) at 8159228 596 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 597 ; fa_frame: f 598 ; fa_initn: 50 599 ; fa_init1: 50 600 ; fa_opt: 58 601 ; fa_z-score: 95.8 602 ; fa_bits: 22.9 603 ; fa_expect: 0.29 604 ; sw_score: 58 605 ; sw_ident: 0.289 606 ; sw_sim: 0.632 607 ; sw_overlap: 38 608 >gi|10955264| .. 609 ; sq_len: 126 610 ; sq_offset: 1 611 ; sq_type: p 612 ; al_start: 1 613 ; al_stop: 38 614 ; al_display_start: 1 615 ------------------------------MKKDKKYQIEAIKNKDKTLF 616 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF 617 NGEKFSSYTLNKVTKTDEYN 618 >gi|152973462|ref|YP_001338513.1| .. 619 ; sq_len: 101 620 ; sq_type: p 621 ; al_start: 44 622 ; al_stop: 81 623 ; al_display_start: 14 624 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI 625 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA 626 3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa 627 vs NC_009649.faa library 628 629 45119 residues in 180 sequences 630 Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 631 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 632 Lambda= 0.210386 633 634 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 635 join: 37, opt: 25, open/ext: -10/-2, width: 16 636 Scan time: 0.020 637 The best scores are: opt bits E(180) 638 gi|152973545|ref|YP_001338596.1| putative plasmid ( 242) 70 27.5 0.082 639 640 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library 641 ; pg_name: /opt/fasta/fasta34 642 ; pg_ver: 34.26 643 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 644 ; pg_name: FASTA 645 ; pg_ver: 3.5 Sept 2006 646 ; pg_matrix: BL50 (15:-5) 647 ; pg_open-ext: -10 -2 648 ; pg_ktup: 2 649 ; pg_optcut: 25 650 ; pg_cgap: 37 651 ; mp_extrap: 60000 180 652 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 Lambda= 0.210386 653 ; mp_KS: -0.0000 (N=0) at 8159228 654 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 655 ; fa_frame: f 656 ; fa_initn: 52 657 ; fa_init1: 52 658 ; fa_opt: 70 659 ; fa_z-score: 105.5 660 ; fa_bits: 27.5 661 ; fa_expect: 0.082 662 ; sw_score: 70 663 ; sw_ident: 0.279 664 ; sw_sim: 0.651 665 ; sw_overlap: 43 666 >gi|10955265| .. 667 ; sq_len: 346 668 ; sq_offset: 1 669 ; sq_type: p 670 ; al_start: 197 671 ; al_stop: 238 672 ; al_display_start: 167 673 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK 674 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL 675 GEYFTENKPKYIIREIHQET 676 >gi|152973545|ref|YP_001338596.1| .. 677 ; sq_len: 242 678 ; sq_type: p 679 ; al_start: 52 680 ; al_stop: 94 681 ; al_display_start: 22 682 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD 683 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR 684 QDFAFTRKMRREARQVEQSW 685 >>><<< 686 687 688 579 residues in 3 query sequences 689 45119 residues in 180 library sequences 690 Scomplib [34.26] 691 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008 692 Total Scan time: 0.020 Total Display time: 0.010 693 694 Function used was FASTA [version 34.26 January 12, 2007] 695 696 """ 697 698 699 from StringIO import StringIO 700 701 alignments = list(FastaM10Iterator(StringIO(simple_example))) 702 assert len(alignments) == 4, len(alignments) 703 assert len(alignments[0].get_all_seqs()) == 2 704 for a in alignments: 705 print "Alignment %i sequences of length %i" \ 706 % (len(a.get_all_seqs()), a.get_alignment_length()) 707 for r in a: 708 print "%s %s %i" % (r.seq, r.id, r.annotations["original_length"]) 709 #print a.annotations 710 print "Done" 711 712 import os 713 path = "../../Tests/Fasta/" 714 files = [f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10"] 715 files.sort() 716 for filename in files: 717 if os.path.splitext(filename)[-1] == ".m10": 718 print 719 print filename 720 print "="*len(filename) 721 for i,a in enumerate(FastaM10Iterator(open(os.path.join(path,filename)))): 722 print "#%i, %s" % (i+1,a) 723 for r in a: 724 if "-" in r.seq: 725 assert r.seq.alphabet.gap_char == "-" 726 else: 727 assert not hasattr(r.seq.alphabet, "gap_char") 728