Package Bio :: Package Blast :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6  # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 
   7   
   8  """ 
   9  This module provides code to work with the standalone version of 
  10  BLAST, either blastall, rpsblast or blastpgp, provided by the NCBI. 
  11  http://www.ncbi.nlm.nih.gov/BLAST/ 
  12   
  13  Classes: 
  14  LowQualityBlastError     Except that indicates low quality query sequences. 
  15  BlastParser              Parses output from blast. 
  16  BlastErrorParser         Parses output and tries to diagnose possible errors. 
  17  PSIBlastParser           Parses output from psi-blast. 
  18  Iterator                 Iterates over a file of blast results. 
  19   
  20  _Scanner                 Scans output from standalone BLAST. 
  21  _BlastConsumer           Consumes output from blast. 
  22  _PSIBlastConsumer        Consumes output from psi-blast. 
  23  _HeaderConsumer          Consumes header information. 
  24  _DescriptionConsumer     Consumes description information. 
  25  _AlignmentConsumer       Consumes alignment information. 
  26  _HSPConsumer             Consumes hsp information. 
  27  _DatabaseReportConsumer  Consumes database report information. 
  28  _ParametersConsumer      Consumes parameters information. 
  29   
  30  Functions: 
  31  blastall        Execute blastall (OBSOLETE). 
  32  blastpgp        Execute blastpgp (OBSOLETE). 
  33  rpsblast        Execute rpsblast (OBSOLETE). 
  34   
  35  For calling the BLAST command line tools, we encourage you to use the 
  36  command line wrappers in Bio.Blast.Applications - the three functions 
  37  blastall, blastpgp and rpsblast are considered to be obsolete now, and 
  38  are likely to be deprecated and then removed in future releases. 
  39  """ 
  40   
  41  import os 
  42  import re 
  43   
  44  from Bio import File 
  45  from Bio.ParserSupport import * 
  46  from Bio.Blast import Record 
  47  from Bio.Application import _escape_filename 
  48   
49 -class LowQualityBlastError(Exception):
50 """Error caused by running a low quality sequence through BLAST. 51 52 When low quality sequences (like GenBank entries containing only 53 stretches of a single nucleotide) are BLASTed, they will result in 54 BLAST generating an error and not being able to perform the BLAST. 55 search. This error should be raised for the BLAST reports produced 56 in this case. 57 """ 58 pass
59
60 -class ShortQueryBlastError(Exception):
61 """Error caused by running a short query sequence through BLAST. 62 63 If the query sequence is too short, BLAST outputs warnings and errors: 64 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 65 [blastall] ERROR: [000.000] AT1G08320: Blast: 66 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 67 done 68 69 This exception is raised when that condition is detected. 70 71 """ 72 pass
73 74
75 -class _Scanner:
76 """Scan BLAST output from blastall or blastpgp. 77 78 Tested with blastall and blastpgp v2.0.10, v2.0.11 79 80 Methods: 81 feed Feed data into the scanner. 82 83 """
84 - def feed(self, handle, consumer):
85 """S.feed(handle, consumer) 86 87 Feed in a BLAST report for scanning. handle is a file-like 88 object that contains the BLAST report. consumer is a Consumer 89 object that will receive events as the report is scanned. 90 91 """ 92 if isinstance(handle, File.UndoHandle): 93 uhandle = handle 94 else: 95 uhandle = File.UndoHandle(handle) 96 97 # Try to fast-forward to the beginning of the blast report. 98 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 99 # Now scan the BLAST report. 100 self._scan_header(uhandle, consumer) 101 self._scan_rounds(uhandle, consumer) 102 self._scan_database_report(uhandle, consumer) 103 self._scan_parameters(uhandle, consumer)
104
105 - def _scan_header(self, uhandle, consumer):
106 # BLASTP 2.0.10 [Aug-26-1999] 107 # 108 # 109 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 110 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 111 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 112 # programs", Nucleic Acids Res. 25:3389-3402. 113 # 114 # Query= test 115 # (140 letters) 116 # 117 # Database: sdqib40-1.35.seg.fa 118 # 1323 sequences; 223,339 total letters 119 # 120 # ======================================================== 121 # This next example is from the online version of Blast, 122 # note there are TWO references, an RID line, and also 123 # the database is BEFORE the query line. 124 # Note there possibleuse of non-ASCII in the author names. 125 # ======================================================== 126 # 127 # BLASTP 2.2.15 [Oct-15-2006] 128 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 129 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 130 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 131 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 132 # 133 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 134 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 135 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 136 # protein database searches with composition-based statistics 137 # and other refinements", Nucleic Acids Res. 29:2994-3005. 138 # 139 # RID: 1166022616-19998-65316425856.BLASTQ1 140 # 141 # 142 # Database: All non-redundant GenBank CDS 143 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 144 # 4,254,166 sequences; 1,462,033,012 total letters 145 # Query= gi:16127998 146 # Length=428 147 # 148 149 consumer.start_header() 150 151 read_and_call(uhandle, consumer.version, contains='BLAST') 152 read_and_call_while(uhandle, consumer.noevent, blank=1) 153 154 # There might be a <pre> line, for qblast output. 155 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 156 157 # Read the reference(s) 158 while attempt_read_and_call(uhandle, 159 consumer.reference, start='Reference'): 160 # References are normally multiline terminated by a blank line 161 # (or, based on the old code, the RID line) 162 while 1: 163 line = uhandle.readline() 164 if is_blank_line(line): 165 consumer.noevent(line) 166 break 167 elif line.startswith("RID"): 168 break 169 else: 170 #More of the reference 171 consumer.reference(line) 172 173 #Deal with the optional RID: ... 174 read_and_call_while(uhandle, consumer.noevent, blank=1) 175 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 176 read_and_call_while(uhandle, consumer.noevent, blank=1) 177 178 # blastpgp may have a reference for compositional score matrix 179 # adjustment (see Bug 2502): 180 if attempt_read_and_call( 181 uhandle, consumer.reference, start="Reference"): 182 read_and_call_until(uhandle, consumer.reference, blank=1) 183 read_and_call_while(uhandle, consumer.noevent, blank=1) 184 185 # blastpgp has a Reference for composition-based statistics. 186 if attempt_read_and_call( 187 uhandle, consumer.reference, start="Reference"): 188 read_and_call_until(uhandle, consumer.reference, blank=1) 189 read_and_call_while(uhandle, consumer.noevent, blank=1) 190 191 line = uhandle.peekline() 192 assert line.strip() != "" 193 assert not line.startswith("RID:") 194 if line.startswith("Query="): 195 #This is an old style query then database... 196 197 # Read the Query lines and the following blank line. 198 read_and_call(uhandle, consumer.query_info, start='Query=') 199 read_and_call_until(uhandle, consumer.query_info, blank=1) 200 read_and_call_while(uhandle, consumer.noevent, blank=1) 201 202 # Read the database lines and the following blank line. 203 read_and_call_until(uhandle, consumer.database_info, end='total letters') 204 read_and_call(uhandle, consumer.database_info, contains='sequences') 205 read_and_call_while(uhandle, consumer.noevent, blank=1) 206 elif line.startswith("Database:"): 207 #This is a new style database then query... 208 read_and_call_until(uhandle, consumer.database_info, end='total letters') 209 read_and_call(uhandle, consumer.database_info, contains='sequences') 210 read_and_call_while(uhandle, consumer.noevent, blank=1) 211 212 # Read the Query lines and the following blank line. 213 # Or, on BLAST 2.2.22+ there is no blank link - need to spot 214 # the "... Score E" line instead. 215 read_and_call(uhandle, consumer.query_info, start='Query=') 216 #read_and_call_until(uhandle, consumer.query_info, blank=1) 217 while True: 218 line = uhandle.peekline() 219 if not line.strip() : break 220 if "Score E" in line : break 221 #It is more of the query (and its length) 222 read_and_call(uhandle, consumer.query_info) 223 read_and_call_while(uhandle, consumer.noevent, blank=1) 224 else: 225 raise ValueError("Invalid header?") 226 227 consumer.end_header()
228
229 - def _scan_rounds(self, uhandle, consumer):
230 # Scan a bunch of rounds. 231 # Each round begins with either a "Searching......" line 232 # or a 'Score E' line followed by descriptions and alignments. 233 # The email server doesn't give the "Searching....." line. 234 # If there is no 'Searching.....' line then you'll first see a 235 # 'Results from round' line 236 237 while not self._eof(uhandle): 238 line = safe_peekline(uhandle) 239 if (not line.startswith('Searching') and 240 not line.startswith('Results from round') and 241 re.search(r"Score +E", line) is None and 242 line.find('No hits found') == -1): 243 break 244 self._scan_descriptions(uhandle, consumer) 245 self._scan_alignments(uhandle, consumer)
246
247 - def _scan_descriptions(self, uhandle, consumer):
248 # Searching..................................................done 249 # Results from round 2 250 # 251 # 252 # Sc 253 # Sequences producing significant alignments: (b 254 # Sequences used in model and found again: 255 # 256 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 257 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 258 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 259 # 260 # Sequences not found previously or not previously below threshold: 261 # 262 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 263 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 264 # 265 266 # If PSI-BLAST, may also have: 267 # 268 # CONVERGED! 269 270 consumer.start_descriptions() 271 272 # Read 'Searching' 273 # This line seems to be missing in BLASTN 2.1.2 (others?) 274 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 275 276 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 277 # If this happens, the handle will yield no more information. 278 if not uhandle.peekline(): 279 raise ValueError("Unexpected end of blast report. " + \ 280 "Looks suspiciously like a PSI-BLAST crash.") 281 282 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 283 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 284 # [blastall] ERROR: [000.000] AT1G08320: Blast: 285 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 286 # done 287 # Reported by David Weisman. 288 # Check for these error lines and ignore them for now. Let 289 # the BlastErrorParser deal with them. 290 line = uhandle.peekline() 291 if line.find("ERROR:") != -1 or line.startswith("done"): 292 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 293 read_and_call(uhandle, consumer.noevent, start="done") 294 295 # Check to see if this is PSI-BLAST. 296 # If it is, the 'Searching' line will be followed by: 297 # (version 2.0.10) 298 # Searching............................. 299 # Results from round 2 300 # or (version 2.0.11) 301 # Searching............................. 302 # 303 # 304 # Results from round 2 305 306 # Skip a bunch of blank lines. 307 read_and_call_while(uhandle, consumer.noevent, blank=1) 308 # Check for the results line if it's there. 309 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 310 read_and_call_while(uhandle, consumer.noevent, blank=1) 311 312 # Three things can happen here: 313 # 1. line contains 'Score E' 314 # 2. line contains "No hits found" 315 # 3. no descriptions 316 # The first one begins a bunch of descriptions. The last two 317 # indicates that no descriptions follow, and we should go straight 318 # to the alignments. 319 if not attempt_read_and_call( 320 uhandle, consumer.description_header, 321 has_re=re.compile(r'Score +E')): 322 # Either case 2 or 3. Look for "No hits found". 323 attempt_read_and_call(uhandle, consumer.no_hits, 324 contains='No hits found') 325 try: 326 read_and_call_while(uhandle, consumer.noevent, blank=1) 327 except ValueError, err: 328 if str(err) != "Unexpected end of stream." : raise err 329 330 consumer.end_descriptions() 331 # Stop processing. 332 return 333 334 # Read the score header lines 335 read_and_call(uhandle, consumer.description_header, 336 start='Sequences producing') 337 338 # If PSI-BLAST, read the 'Sequences used in model' line. 339 attempt_read_and_call(uhandle, consumer.model_sequences, 340 start='Sequences used in model') 341 read_and_call_while(uhandle, consumer.noevent, blank=1) 342 343 # In BLAT, rather than a "No hits found" line, we just 344 # get no descriptions (and no alignments). This can be 345 # spotted because the next line is the database block: 346 if safe_peekline(uhandle).startswith(" Database:"): 347 consumer.end_descriptions() 348 # Stop processing. 349 return 350 351 # Read the descriptions and the following blank lines, making 352 # sure that there are descriptions. 353 if not uhandle.peekline().startswith('Sequences not found'): 354 read_and_call_until(uhandle, consumer.description, blank=1) 355 read_and_call_while(uhandle, consumer.noevent, blank=1) 356 357 # If PSI-BLAST, read the 'Sequences not found' line followed 358 # by more descriptions. However, I need to watch out for the 359 # case where there were no sequences not found previously, in 360 # which case there will be no more descriptions. 361 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 362 start='Sequences not found'): 363 # Read the descriptions and the following blank lines. 364 read_and_call_while(uhandle, consumer.noevent, blank=1) 365 l = safe_peekline(uhandle) 366 # Brad -- added check for QUERY. On some PSI-BLAST outputs 367 # there will be a 'Sequences not found' line followed by no 368 # descriptions. Check for this case since the first thing you'll 369 # get is a blank line and then 'QUERY' 370 if not l.startswith('CONVERGED') and l[0] != '>' \ 371 and not l.startswith('QUERY'): 372 read_and_call_until(uhandle, consumer.description, blank=1) 373 read_and_call_while(uhandle, consumer.noevent, blank=1) 374 375 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 376 read_and_call_while(uhandle, consumer.noevent, blank=1) 377 378 consumer.end_descriptions()
379
380 - def _scan_alignments(self, uhandle, consumer):
381 if self._eof(uhandle) : return 382 383 # qblast inserts a helpful line here. 384 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 385 386 # First, check to see if I'm at the database report. 387 line = safe_peekline(uhandle) 388 if not line: 389 #EOF 390 return 391 elif line.startswith(' Database') or line.startswith("Lambda"): 392 return 393 elif line[0] == '>': 394 # XXX make a better check here between pairwise and masterslave 395 self._scan_pairwise_alignments(uhandle, consumer) 396 else: 397 # XXX put in a check to make sure I'm in a masterslave alignment 398 self._scan_masterslave_alignment(uhandle, consumer)
399
400 - def _scan_pairwise_alignments(self, uhandle, consumer):
401 while not self._eof(uhandle): 402 line = safe_peekline(uhandle) 403 if line[0] != '>': 404 break 405 self._scan_one_pairwise_alignment(uhandle, consumer)
406
407 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
408 if self._eof(uhandle) : return 409 consumer.start_alignment() 410 411 self._scan_alignment_header(uhandle, consumer) 412 413 # Scan a bunch of score/alignment pairs. 414 while 1: 415 if self._eof(uhandle): 416 #Shouldn't have issued that _scan_alignment_header event... 417 break 418 line = safe_peekline(uhandle) 419 if not line.startswith(' Score'): 420 break 421 self._scan_hsp(uhandle, consumer) 422 consumer.end_alignment()
423
424 - def _scan_alignment_header(self, uhandle, consumer):
425 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 426 # stearothermophilus] 427 # Length = 81 428 # 429 # Or, more recently with different white space: 430 # 431 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 432 # gi|15829258|ref|NP_308031.1| threonine synthase 433 # ... 434 # Length=428 435 read_and_call(uhandle, consumer.title, start='>') 436 while 1: 437 line = safe_readline(uhandle) 438 if line.lstrip().startswith('Length =') \ 439 or line.lstrip().startswith('Length='): 440 consumer.length(line) 441 break 442 elif is_blank_line(line): 443 # Check to make sure I haven't missed the Length line 444 raise ValueError("I missed the Length in an alignment header") 445 consumer.title(line) 446 447 # Older versions of BLAST will have a line with some spaces. 448 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 449 if not attempt_read_and_call(uhandle, consumer.noevent, 450 start=' '): 451 read_and_call(uhandle, consumer.noevent, blank=1)
452
453 - def _scan_hsp(self, uhandle, consumer):
454 consumer.start_hsp() 455 self._scan_hsp_header(uhandle, consumer) 456 self._scan_hsp_alignment(uhandle, consumer) 457 consumer.end_hsp()
458
459 - def _scan_hsp_header(self, uhandle, consumer):
460 # Score = 22.7 bits (47), Expect = 2.5 461 # Identities = 10/36 (27%), Positives = 18/36 (49%) 462 # Strand = Plus / Plus 463 # Frame = +3 464 # 465 466 read_and_call(uhandle, consumer.score, start=' Score') 467 read_and_call(uhandle, consumer.identities, start=' Identities') 468 # BLASTN 469 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand') 470 # BLASTX, TBLASTN, TBLASTX 471 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame') 472 read_and_call(uhandle, consumer.noevent, blank=1)
473
474 - def _scan_hsp_alignment(self, uhandle, consumer):
475 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 476 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 477 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 478 # 479 # Query: 64 AEKILIKR 71 480 # I +K 481 # Sbjct: 70 PNIIQLKD 77 482 # 483 484 while 1: 485 # Blastn adds an extra line filled with spaces before Query 486 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 487 read_and_call(uhandle, consumer.query, start='Query') 488 read_and_call(uhandle, consumer.align, start=' ') 489 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 490 try: 491 read_and_call_while(uhandle, consumer.noevent, blank=1) 492 except ValueError, err: 493 if str(err) != "Unexpected end of stream." : raise err 494 # End of File (well, it looks like it with recent versions 495 # of BLAST for multiple queries after the Iterator class 496 # has broken up the whole file into chunks). 497 break 498 line = safe_peekline(uhandle) 499 # Alignment continues if I see a 'Query' or the spaces for Blastn. 500 if not (line.startswith('Query') or line.startswith(' ')): 501 break
502
503 - def _scan_masterslave_alignment(self, uhandle, consumer):
504 consumer.start_alignment() 505 while 1: 506 line = safe_readline(uhandle) 507 # Check to see whether I'm finished reading the alignment. 508 # This is indicated by 1) database section, 2) next psi-blast 509 # round, which can also be a 'Results from round' if no 510 # searching line is present 511 # patch by chapmanb 512 if line.startswith('Searching') or \ 513 line.startswith('Results from round'): 514 uhandle.saveline(line) 515 break 516 elif line.startswith(' Database'): 517 uhandle.saveline(line) 518 break 519 elif is_blank_line(line): 520 consumer.noevent(line) 521 else: 522 consumer.multalign(line) 523 read_and_call_while(uhandle, consumer.noevent, blank=1) 524 consumer.end_alignment()
525
526 - def _eof(self, uhandle):
527 try: 528 line = safe_peekline(uhandle) 529 except ValueError, err: 530 if str(err) != "Unexpected end of stream." : raise err 531 line = "" 532 return not line
533
534 - def _scan_database_report(self, uhandle, consumer):
535 # Database: sdqib40-1.35.seg.fa 536 # Posted date: Nov 1, 1999 4:25 PM 537 # Number of letters in database: 223,339 538 # Number of sequences in database: 1323 539 # 540 # Lambda K H 541 # 0.322 0.133 0.369 542 # 543 # Gapped 544 # Lambda K H 545 # 0.270 0.0470 0.230 546 # 547 ########################################## 548 # Or, more recently Blast 2.2.15 gives less blank lines 549 ########################################## 550 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 551 # environmental samples 552 # Posted date: Dec 12, 2006 5:51 PM 553 # Number of letters in database: 667,088,753 554 # Number of sequences in database: 2,094,974 555 # Lambda K H 556 # 0.319 0.136 0.395 557 # Gapped 558 # Lambda K H 559 # 0.267 0.0410 0.140 560 561 if self._eof(uhandle) : return 562 563 consumer.start_database_report() 564 565 # Subset of the database(s) listed below 566 # Number of letters searched: 562,618,960 567 # Number of sequences searched: 228,924 568 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 569 read_and_call(uhandle, consumer.noevent, contains="letters") 570 read_and_call(uhandle, consumer.noevent, contains="sequences") 571 read_and_call(uhandle, consumer.noevent, start=" ") 572 573 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 574 # was missing the "Database" stanza completely. 575 while attempt_read_and_call(uhandle, consumer.database, 576 start=' Database'): 577 # BLAT output ends abruptly here, without any of the other 578 # information. Check to see if this is the case. If so, 579 # then end the database report here gracefully. 580 if not uhandle.peekline().strip() \ 581 or uhandle.peekline().startswith("BLAST"): 582 consumer.end_database_report() 583 return 584 585 # Database can span multiple lines. 586 read_and_call_until(uhandle, consumer.database, start=' Posted') 587 read_and_call(uhandle, consumer.posted_date, start=' Posted') 588 read_and_call(uhandle, consumer.num_letters_in_database, 589 start=' Number of letters') 590 read_and_call(uhandle, consumer.num_sequences_in_database, 591 start=' Number of sequences') 592 #There may not be a line starting with spaces... 593 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 594 595 line = safe_readline(uhandle) 596 uhandle.saveline(line) 597 if line.find('Lambda') != -1: 598 break 599 600 read_and_call(uhandle, consumer.noevent, start='Lambda') 601 read_and_call(uhandle, consumer.ka_params) 602 603 #This blank line is optional: 604 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 605 606 # not BLASTP 607 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 608 # not TBLASTX 609 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 610 read_and_call(uhandle, consumer.ka_params_gap) 611 612 # Blast 2.2.4 can sometimes skip the whole parameter section. 613 # Thus, I need to be careful not to read past the end of the 614 # file. 615 try: 616 read_and_call_while(uhandle, consumer.noevent, blank=1) 617 except ValueError, x: 618 if str(x) != "Unexpected end of stream.": 619 raise 620 consumer.end_database_report()
621
622 - def _scan_parameters(self, uhandle, consumer):
623 # Matrix: BLOSUM62 624 # Gap Penalties: Existence: 11, Extension: 1 625 # Number of Hits to DB: 50604 626 # Number of Sequences: 1323 627 # Number of extensions: 1526 628 # Number of successful extensions: 6 629 # Number of sequences better than 10.0: 5 630 # Number of HSP's better than 10.0 without gapping: 5 631 # Number of HSP's successfully gapped in prelim test: 0 632 # Number of HSP's that attempted gapping in prelim test: 1 633 # Number of HSP's gapped (non-prelim): 5 634 # length of query: 140 635 # length of database: 223,339 636 # effective HSP length: 39 637 # effective length of query: 101 638 # effective length of database: 171,742 639 # effective search space: 17345942 640 # effective search space used: 17345942 641 # T: 11 642 # A: 40 643 # X1: 16 ( 7.4 bits) 644 # X2: 38 (14.8 bits) 645 # X3: 64 (24.9 bits) 646 # S1: 41 (21.9 bits) 647 # S2: 42 (20.8 bits) 648 ########################################## 649 # Or, more recently Blast(x) 2.2.15 gives 650 ########################################## 651 # Matrix: BLOSUM62 652 # Gap Penalties: Existence: 11, Extension: 1 653 # Number of Sequences: 4535438 654 # Number of Hits to DB: 2,588,844,100 655 # Number of extensions: 60427286 656 # Number of successful extensions: 126433 657 # Number of sequences better than 2.0: 30 658 # Number of HSP's gapped: 126387 659 # Number of HSP's successfully gapped: 35 660 # Length of query: 291 661 # Length of database: 1,573,298,872 662 # Length adjustment: 130 663 # Effective length of query: 161 664 # Effective length of database: 983,691,932 665 # Effective search space: 158374401052 666 # Effective search space used: 158374401052 667 # Neighboring words threshold: 12 668 # Window for multiple hits: 40 669 # X1: 16 ( 7.3 bits) 670 # X2: 38 (14.6 bits) 671 # X3: 64 (24.7 bits) 672 # S1: 41 (21.7 bits) 673 # S2: 32 (16.9 bits) 674 675 676 # Blast 2.2.4 can sometimes skip the whole parameter section. 677 # BLAT also skips the whole parameter section. 678 # Thus, check to make sure that the parameter section really 679 # exists. 680 if not uhandle.peekline().strip(): 681 return 682 683 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 684 # "Number of Sequences" lines. 685 consumer.start_parameters() 686 687 # Matrix line may be missing in BLASTN 2.2.9 688 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 689 # not TBLASTX 690 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 691 692 attempt_read_and_call(uhandle, consumer.num_sequences, 693 start='Number of Sequences') 694 attempt_read_and_call(uhandle, consumer.num_hits, 695 start='Number of Hits') 696 attempt_read_and_call(uhandle, consumer.num_sequences, 697 start='Number of Sequences') 698 attempt_read_and_call(uhandle, consumer.num_extends, 699 start='Number of extensions') 700 attempt_read_and_call(uhandle, consumer.num_good_extends, 701 start='Number of successful') 702 703 attempt_read_and_call(uhandle, consumer.num_seqs_better_e, 704 start='Number of sequences') 705 706 # not BLASTN, TBLASTX 707 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 708 start="Number of HSP's better"): 709 # BLASTN 2.2.9 710 if attempt_read_and_call(uhandle, consumer.noevent, 711 start="Number of HSP's gapped:"): 712 read_and_call(uhandle, consumer.noevent, 713 start="Number of HSP's successfully") 714 #This is ommitted in 2.2.15 715 attempt_read_and_call(uhandle, consumer.noevent, 716 start="Number of extra gapped extensions") 717 else: 718 read_and_call(uhandle, consumer.hsps_prelim_gapped, 719 start="Number of HSP's successfully") 720 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 721 start="Number of HSP's that") 722 read_and_call(uhandle, consumer.hsps_gapped, 723 start="Number of HSP's gapped") 724 #e.g. BLASTX 2.2.15 where the "better" line is missing 725 elif attempt_read_and_call(uhandle, consumer.noevent, 726 start="Number of HSP's gapped"): 727 read_and_call(uhandle, consumer.noevent, 728 start="Number of HSP's successfully") 729 730 # not in blastx 2.2.1 731 attempt_read_and_call(uhandle, consumer.query_length, 732 has_re=re.compile(r"[Ll]ength of query")) 733 # Not in BLASTX 2.2.22+ 734 attempt_read_and_call(uhandle, consumer.database_length, 735 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 736 737 # BLASTN 2.2.9 738 attempt_read_and_call(uhandle, consumer.noevent, 739 start="Length adjustment") 740 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 741 start='effective HSP') 742 # Not in blastx 2.2.1 743 attempt_read_and_call( 744 uhandle, consumer.effective_query_length, 745 has_re=re.compile(r'[Ee]ffective length of query')) 746 747 # This is not in BLASTP 2.2.15 748 attempt_read_and_call( 749 uhandle, consumer.effective_database_length, 750 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 751 # Not in blastx 2.2.1, added a ':' to distinguish between 752 # this and the 'effective search space used' line 753 attempt_read_and_call( 754 uhandle, consumer.effective_search_space, 755 has_re=re.compile(r'[Ee]ffective search space:')) 756 # Does not appear in BLASTP 2.0.5 757 attempt_read_and_call( 758 uhandle, consumer.effective_search_space_used, 759 has_re=re.compile(r'[Ee]ffective search space used')) 760 761 # BLASTX, TBLASTN, TBLASTX 762 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 763 764 # not in BLASTN 2.2.9 765 attempt_read_and_call(uhandle, consumer.threshold, start='T') 766 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 767 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold') 768 769 # not in BLASTX 2.2.15 770 attempt_read_and_call(uhandle, consumer.window_size, start='A') 771 # get this instead: "Window for multiple hits: 40" 772 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits') 773 774 # not in BLASTX 2.2.22+ 775 attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 776 # not TBLASTN 777 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 778 779 # not BLASTN, TBLASTX 780 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 781 start='X3') 782 783 # not TBLASTN 784 attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1') 785 # not in blastx 2.2.1 786 # first we make sure we have additional lines to work with, if 787 # not then the file is done and we don't have a final S2 788 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 789 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 790 791 consumer.end_parameters()
792
793 -class BlastParser(AbstractParser):
794 """Parses BLAST data into a Record.Blast object. 795 796 """
797 - def __init__(self):
798 """__init__(self)""" 799 self._scanner = _Scanner() 800 self._consumer = _BlastConsumer()
801
802 - def parse(self, handle):
803 """parse(self, handle)""" 804 self._scanner.feed(handle, self._consumer) 805 return self._consumer.data
806
807 -class PSIBlastParser(AbstractParser):
808 """Parses BLAST data into a Record.PSIBlast object. 809 810 """
811 - def __init__(self):
812 """__init__(self)""" 813 self._scanner = _Scanner() 814 self._consumer = _PSIBlastConsumer()
815
816 - def parse(self, handle):
817 """parse(self, handle)""" 818 self._scanner.feed(handle, self._consumer) 819 return self._consumer.data
820
821 -class _HeaderConsumer:
822 - def start_header(self):
823 self._header = Record.Header()
824
825 - def version(self, line):
826 c = line.split() 827 self._header.application = c[0] 828 self._header.version = c[1] 829 if len(c) > 2: 830 #The date is missing in the new C++ output from blastx 2.2.22+ 831 #Just get "BLASTX 2.2.22+\n" and that's all. 832 self._header.date = c[2][1:-1]
833
834 - def reference(self, line):
835 if line.startswith('Reference: '): 836 self._header.reference = line[11:] 837 else: 838 self._header.reference = self._header.reference + line
839
840 - def query_info(self, line):
841 if line.startswith('Query= '): 842 self._header.query = line[7:].lstrip() 843 elif line.startswith('Length='): 844 #New style way to give the query length in BLAST 2.2.22+ (the C++ code) 845 self._header.query_letters = _safe_int(line[7:].strip()) 846 elif not line.startswith(' '): # continuation of query_info 847 self._header.query = "%s%s" % (self._header.query, line) 848 else: 849 #Hope it is the old style way to give the query length: 850 letters, = _re_search( 851 r"([0-9,]+) letters", line, 852 "I could not find the number of letters in line\n%s" % line) 853 self._header.query_letters = _safe_int(letters)
854
855 - def database_info(self, line):
856 line = line.rstrip() 857 if line.startswith('Database: '): 858 self._header.database = line[10:] 859 elif not line.endswith('total letters'): 860 if self._header.database: 861 #Need to include a space when merging multi line datase descr 862 self._header.database = self._header.database + " " + line.strip() 863 else: 864 self._header.database = line.strip() 865 else: 866 sequences, letters =_re_search( 867 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 868 "I could not find the sequences and letters in line\n%s" %line) 869 self._header.database_sequences = _safe_int(sequences) 870 self._header.database_letters = _safe_int(letters)
871
872 - def end_header(self):
873 # Get rid of the trailing newlines 874 self._header.reference = self._header.reference.rstrip() 875 self._header.query = self._header.query.rstrip()
876
877 -class _DescriptionConsumer:
878 - def start_descriptions(self):
879 self._descriptions = [] 880 self._model_sequences = [] 881 self._nonmodel_sequences = [] 882 self._converged = 0 883 self._type = None 884 self._roundnum = None 885 886 self.__has_n = 0 # Does the description line contain an N value?
887
888 - def description_header(self, line):
889 if line.startswith('Sequences producing'): 890 cols = line.split() 891 if cols[-1] == 'N': 892 self.__has_n = 1
893
894 - def description(self, line):
895 dh = self._parse(line) 896 if self._type == 'model': 897 self._model_sequences.append(dh) 898 elif self._type == 'nonmodel': 899 self._nonmodel_sequences.append(dh) 900 else: 901 self._descriptions.append(dh)
902
903 - def model_sequences(self, line):
904 self._type = 'model'
905
906 - def nonmodel_sequences(self, line):
907 self._type = 'nonmodel'
908
909 - def converged(self, line):
910 self._converged = 1
911
912 - def no_hits(self, line):
913 pass
914
915 - def round(self, line):
916 if not line.startswith('Results from round'): 917 raise ValueError("I didn't understand the round line\n%s" % line) 918 self._roundnum = _safe_int(line[18:].strip())
919
920 - def end_descriptions(self):
921 pass
922
923 - def _parse(self, description_line):
924 line = description_line # for convenience 925 dh = Record.Description() 926 927 # I need to separate the score and p-value from the title. 928 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 929 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 930 # special cases to handle: 931 # - title must be preserved exactly (including whitespaces) 932 # - score could be equal to e-value (not likely, but what if??) 933 # - sometimes there's an "N" score of '1'. 934 cols = line.split() 935 if len(cols) < 3: 936 raise ValueError( \ 937 "Line does not appear to contain description:\n%s" % line) 938 if self.__has_n: 939 i = line.rfind(cols[-1]) # find start of N 940 i = line.rfind(cols[-2], 0, i) # find start of p-value 941 i = line.rfind(cols[-3], 0, i) # find start of score 942 else: 943 i = line.rfind(cols[-1]) # find start of p-value 944 i = line.rfind(cols[-2], 0, i) # find start of score 945 if self.__has_n: 946 dh.title, dh.score, dh.e, dh.num_alignments = \ 947 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 948 else: 949 dh.title, dh.score, dh.e, dh.num_alignments = \ 950 line[:i].rstrip(), cols[-2], cols[-1], 1 951 dh.num_alignments = _safe_int(dh.num_alignments) 952 dh.score = _safe_int(dh.score) 953 dh.e = _safe_float(dh.e) 954 return dh
955
956 -class _AlignmentConsumer:
957 # This is a little bit tricky. An alignment can either be a 958 # pairwise alignment or a multiple alignment. Since it's difficult 959 # to know a-priori which one the blast record will contain, I'm going 960 # to make one class that can parse both of them.
961 - def start_alignment(self):
962 self._alignment = Record.Alignment() 963 self._multiple_alignment = Record.MultipleAlignment()
964
965 - def title(self, line):
966 if self._alignment.title: 967 self._alignment.title += " " 968 self._alignment.title += line.strip()
969
970 - def length(self, line):
971 #e.g. "Length = 81" or more recently, "Length=428" 972 parts = line.replace(" ","").split("=") 973 assert len(parts)==2, "Unrecognised format length line" 974 self._alignment.length = parts[1] 975 self._alignment.length = _safe_int(self._alignment.length)
976
977 - def multalign(self, line):
978 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 979 if line.startswith('QUERY') or line.startswith('blast_tmp'): 980 # If this is the first line of the multiple alignment, 981 # then I need to figure out how the line is formatted. 982 983 # Format of line is: 984 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 985 try: 986 name, start, seq, end = line.split() 987 except ValueError: 988 raise ValueError("I do not understand the line\n%s" % line) 989 self._start_index = line.index(start, len(name)) 990 self._seq_index = line.index(seq, 991 self._start_index+len(start)) 992 # subtract 1 for the space 993 self._name_length = self._start_index - 1 994 self._start_length = self._seq_index - self._start_index - 1 995 self._seq_length = line.rfind(end) - self._seq_index - 1 996 997 #self._seq_index = line.index(seq) 998 ## subtract 1 for the space 999 #self._seq_length = line.rfind(end) - self._seq_index - 1 1000 #self._start_index = line.index(start) 1001 #self._start_length = self._seq_index - self._start_index - 1 1002 #self._name_length = self._start_index 1003 1004 # Extract the information from the line 1005 name = line[:self._name_length] 1006 name = name.rstrip() 1007 start = line[self._start_index:self._start_index+self._start_length] 1008 start = start.rstrip() 1009 if start: 1010 start = _safe_int(start) 1011 end = line[self._seq_index+self._seq_length:].rstrip() 1012 if end: 1013 end = _safe_int(end) 1014 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip() 1015 # right pad the sequence with spaces if necessary 1016 if len(seq) < self._seq_length: 1017 seq = seq + ' '*(self._seq_length-len(seq)) 1018 1019 # I need to make sure the sequence is aligned correctly with the query. 1020 # First, I will find the length of the query. Then, if necessary, 1021 # I will pad my current sequence with spaces so that they will line 1022 # up correctly. 1023 1024 # Two possible things can happen: 1025 # QUERY 1026 # 504 1027 # 1028 # QUERY 1029 # 403 1030 # 1031 # Sequence 504 will need padding at the end. Since I won't know 1032 # this until the end of the alignment, this will be handled in 1033 # end_alignment. 1034 # Sequence 403 will need padding before being added to the alignment. 1035 1036 align = self._multiple_alignment.alignment # for convenience 1037 align.append((name, start, seq, end))
1038 1039 # This is old code that tried to line up all the sequences 1040 # in a multiple alignment by using the sequence title's as 1041 # identifiers. The problem with this is that BLAST assigns 1042 # different HSP's from the same sequence the same id. Thus, 1043 # in one alignment block, there may be multiple sequences with 1044 # the same id. I'm not sure how to handle this, so I'm not 1045 # going to. 1046 1047 # # If the sequence is the query, then just add it. 1048 # if name == 'QUERY': 1049 # if len(align) == 0: 1050 # align.append((name, start, seq)) 1051 # else: 1052 # aname, astart, aseq = align[0] 1053 # if name != aname: 1054 # raise ValueError, "Query is not the first sequence" 1055 # aseq = aseq + seq 1056 # align[0] = aname, astart, aseq 1057 # else: 1058 # if len(align) == 0: 1059 # raise ValueError, "I could not find the query sequence" 1060 # qname, qstart, qseq = align[0] 1061 # 1062 # # Now find my sequence in the multiple alignment. 1063 # for i in range(1, len(align)): 1064 # aname, astart, aseq = align[i] 1065 # if name == aname: 1066 # index = i 1067 # break 1068 # else: 1069 # # If I couldn't find it, then add a new one. 1070 # align.append((None, None, None)) 1071 # index = len(align)-1 1072 # # Make sure to left-pad it. 1073 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1074 # 1075 # if len(qseq) != len(aseq) + len(seq): 1076 # # If my sequences are shorter than the query sequence, 1077 # # then I will need to pad some spaces to make them line up. 1078 # # Since I've already right padded seq, that means aseq 1079 # # must be too short. 1080 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1081 # aseq = aseq + seq 1082 # if astart is None: 1083 # astart = start 1084 # align[index] = aname, astart, aseq 1085
1086 - def end_alignment(self):
1087 # Remove trailing newlines 1088 if self._alignment: 1089 self._alignment.title = self._alignment.title.rstrip() 1090 1091 # This code is also obsolete. See note above. 1092 # If there's a multiple alignment, I will need to make sure 1093 # all the sequences are aligned. That is, I may need to 1094 # right-pad the sequences. 1095 # if self._multiple_alignment is not None: 1096 # align = self._multiple_alignment.alignment 1097 # seqlen = None 1098 # for i in range(len(align)): 1099 # name, start, seq = align[i] 1100 # if seqlen is None: 1101 # seqlen = len(seq) 1102 # else: 1103 # if len(seq) < seqlen: 1104 # seq = seq + ' '*(seqlen - len(seq)) 1105 # align[i] = name, start, seq 1106 # elif len(seq) > seqlen: 1107 # raise ValueError, \ 1108 # "Sequence %s is longer than the query" % name 1109 1110 # Clean up some variables, if they exist. 1111 try: 1112 del self._seq_index 1113 del self._seq_length 1114 del self._start_index 1115 del self._start_length 1116 del self._name_length 1117 except AttributeError: 1118 pass
1119
1120 -class _HSPConsumer:
1121 - def start_hsp(self):
1122 self._hsp = Record.HSP()
1123
1124 - def score(self, line):
1125 self._hsp.bits, self._hsp.score = _re_search( 1126 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 1127 "I could not find the score in line\n%s" % line) 1128 self._hsp.score = _safe_float(self._hsp.score) 1129 self._hsp.bits = _safe_float(self._hsp.bits) 1130 1131 x, y = _re_search( 1132 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 1133 "I could not find the expect in line\n%s" % line) 1134 if x: 1135 self._hsp.num_alignments = _safe_int(x) 1136 else: 1137 self._hsp.num_alignments = 1 1138 self._hsp.expect = _safe_float(y)
1139
1140 - def identities(self, line):
1141 x, y = _re_search( 1142 r"Identities = (\d+)\/(\d+)", line, 1143 "I could not find the identities in line\n%s" % line) 1144 self._hsp.identities = _safe_int(x), _safe_int(y) 1145 self._hsp.align_length = _safe_int(y) 1146 1147 if line.find('Positives') != -1: 1148 x, y = _re_search( 1149 r"Positives = (\d+)\/(\d+)", line, 1150 "I could not find the positives in line\n%s" % line) 1151 self._hsp.positives = _safe_int(x), _safe_int(y) 1152 assert self._hsp.align_length == _safe_int(y) 1153 1154 if line.find('Gaps') != -1: 1155 x, y = _re_search( 1156 r"Gaps = (\d+)\/(\d+)", line, 1157 "I could not find the gaps in line\n%s" % line) 1158 self._hsp.gaps = _safe_int(x), _safe_int(y) 1159 assert self._hsp.align_length == _safe_int(y)
1160 1161
1162 - def strand(self, line):
1163 self._hsp.strand = _re_search( 1164 r"Strand = (\w+) / (\w+)", line, 1165 "I could not find the strand in line\n%s" % line)
1166
1167 - def frame(self, line):
1168 # Frame can be in formats: 1169 # Frame = +1 1170 # Frame = +2 / +2 1171 if line.find('/') != -1: 1172 self._hsp.frame = _re_search( 1173 r"Frame = ([-+][123]) / ([-+][123])", line, 1174 "I could not find the frame in line\n%s" % line) 1175 else: 1176 self._hsp.frame = _re_search( 1177 r"Frame = ([-+][123])", line, 1178 "I could not find the frame in line\n%s" % line)
1179 1180 # Match a space, if one is available. Masahir Ishikawa found a 1181 # case where there's no space between the start and the sequence: 1182 # Query: 100tt 101 1183 # line below modified by Yair Benita, Sep 2004 1184 # Note that the colon is not always present. 2006 1185 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
1186 - def query(self, line):
1187 m = self._query_re.search(line) 1188 if m is None: 1189 raise ValueError("I could not find the query in line\n%s" % line) 1190 1191 # line below modified by Yair Benita, Sep 2004. 1192 # added the end attribute for the query 1193 colon, start, seq, end = m.groups() 1194 self._hsp.query = self._hsp.query + seq 1195 if self._hsp.query_start is None: 1196 self._hsp.query_start = _safe_int(start) 1197 1198 # line below added by Yair Benita, Sep 2004. 1199 # added the end attribute for the query 1200 self._hsp.query_end = _safe_int(end) 1201 1202 #Get index for sequence start (regular expression element 3) 1203 self._query_start_index = m.start(3) 1204 self._query_len = len(seq)
1205
1206 - def align(self, line):
1207 seq = line[self._query_start_index:].rstrip() 1208 if len(seq) < self._query_len: 1209 # Make sure the alignment is the same length as the query 1210 seq = seq + ' ' * (self._query_len-len(seq)) 1211 elif len(seq) < self._query_len: 1212 raise ValueError("Match is longer than the query in line\n%s" \ 1213 % line) 1214 self._hsp.match = self._hsp.match + seq
1215 1216 # To match how we do the query, cache the regular expression. 1217 # Note that the colon is not always present. 1218 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
1219 - def sbjct(self, line):
1220 m = self._sbjct_re.search(line) 1221 if m is None: 1222 raise ValueError("I could not find the sbjct in line\n%s" % line) 1223 colon, start, seq, end = m.groups() 1224 #mikep 26/9/00 1225 #On occasion, there is a blast hit with no subject match 1226 #so far, it only occurs with 1-line short "matches" 1227 #I have decided to let these pass as they appear 1228 if not seq.strip(): 1229 seq = ' ' * self._query_len 1230 self._hsp.sbjct = self._hsp.sbjct + seq 1231 if self._hsp.sbjct_start is None: 1232 self._hsp.sbjct_start = _safe_int(start) 1233 1234 self._hsp.sbjct_end = _safe_int(end) 1235 if len(seq) != self._query_len: 1236 raise ValueError( \ 1237 "QUERY and SBJCT sequence lengths don't match in line\n%s" \ 1238 % line) 1239 1240 del self._query_start_index # clean up unused variables 1241 del self._query_len
1242
1243 - def end_hsp(self):
1244 pass
1245
1246 -class _DatabaseReportConsumer:
1247
1248 - def start_database_report(self):
1249 self._dr = Record.DatabaseReport()
1250
1251 - def database(self, line):
1252 m = re.search(r"Database: (.+)$", line) 1253 if m: 1254 self._dr.database_name.append(m.group(1)) 1255 elif self._dr.database_name: 1256 # This must be a continuation of the previous name. 1257 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1258 line.strip())
1259
1260 - def posted_date(self, line):
1261 self._dr.posted_date.append(_re_search( 1262 r"Posted date:\s*(.+)$", line, 1263 "I could not find the posted date in line\n%s" % line))
1264
1265 - def num_letters_in_database(self, line):
1266 letters, = _get_cols( 1267 line, (-1,), ncols=6, expected={2:"letters", 4:"database:"}) 1268 self._dr.num_letters_in_database.append(_safe_int(letters))
1269
1270 - def num_sequences_in_database(self, line):
1271 sequences, = _get_cols( 1272 line, (-1,), ncols=6, expected={2:"sequences", 4:"database:"}) 1273 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1274
1275 - def ka_params(self, line):
1276 x = line.split() 1277 self._dr.ka_params = map(_safe_float, x)
1278
1279 - def gapped(self, line):
1280 self._dr.gapped = 1
1281
1282 - def ka_params_gap(self, line):
1283 x = line.split() 1284 self._dr.ka_params_gap = map(_safe_float, x)
1285
1286 - def end_database_report(self):
1287 pass
1288
1289 -class _ParametersConsumer:
1290 - def start_parameters(self):
1291 self._params = Record.Parameters()
1292
1293 - def matrix(self, line):
1294 self._params.matrix = line[8:].rstrip()
1295
1296 - def gap_penalties(self, line):
1297 x = _get_cols( 1298 line, (3, 5), ncols=6, expected={2:"Existence:", 4:"Extension:"}) 1299 self._params.gap_penalties = map(_safe_float, x)
1300
1301 - def num_hits(self, line):
1302 if line.find('1st pass') != -1: 1303 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"}) 1304 self._params.num_hits = _safe_int(x) 1305 else: 1306 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"}) 1307 self._params.num_hits = _safe_int(x)
1308
1309 - def num_sequences(self, line):
1310 if line.find('1st pass') != -1: 1311 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"}) 1312 self._params.num_sequences = _safe_int(x) 1313 else: 1314 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"}) 1315 self._params.num_sequences = _safe_int(x)
1316
1317 - def num_extends(self, line):
1318 if line.find('1st pass') != -1: 1319 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"}) 1320 self._params.num_extends = _safe_int(x) 1321 else: 1322 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"}) 1323 self._params.num_extends = _safe_int(x)
1324
1325 - def num_good_extends(self, line):
1326 if line.find('1st pass') != -1: 1327 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"}) 1328 self._params.num_good_extends = _safe_int(x) 1329 else: 1330 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"}) 1331 self._params.num_good_extends = _safe_int(x)
1332
1333 - def num_seqs_better_e(self, line):
1334 self._params.num_seqs_better_e, = _get_cols( 1335 line, (-1,), ncols=7, expected={2:"sequences"}) 1336 self._params.num_seqs_better_e = _safe_int( 1337 self._params.num_seqs_better_e)
1338
1339 - def hsps_no_gap(self, line):
1340 self._params.hsps_no_gap, = _get_cols( 1341 line, (-1,), ncols=9, expected={3:"better", 7:"gapping:"}) 1342 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1343
1344 - def hsps_prelim_gapped(self, line):
1345 self._params.hsps_prelim_gapped, = _get_cols( 1346 line, (-1,), ncols=9, expected={4:"gapped", 6:"prelim"}) 1347 self._params.hsps_prelim_gapped = _safe_int( 1348 self._params.hsps_prelim_gapped)
1349
1350 - def hsps_prelim_gapped_attempted(self, line):
1351 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1352 line, (-1,), ncols=10, expected={4:"attempted", 7:"prelim"}) 1353 self._params.hsps_prelim_gapped_attempted = _safe_int( 1354 self._params.hsps_prelim_gapped_attempted)
1355
1356 - def hsps_gapped(self, line):
1357 self._params.hsps_gapped, = _get_cols( 1358 line, (-1,), ncols=6, expected={3:"gapped"}) 1359 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1360
1361 - def query_length(self, line):
1362 self._params.query_length, = _get_cols( 1363 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"query:"}) 1364 self._params.query_length = _safe_int(self._params.query_length)
1365
1366 - def database_length(self, line):
1367 self._params.database_length, = _get_cols( 1368 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"database:"}) 1369 self._params.database_length = _safe_int(self._params.database_length)
1370
1371 - def effective_hsp_length(self, line):
1372 self._params.effective_hsp_length, = _get_cols( 1373 line, (-1,), ncols=4, expected={1:"HSP", 2:"length:"}) 1374 self._params.effective_hsp_length = _safe_int( 1375 self._params.effective_hsp_length)
1376
1377 - def effective_query_length(self, line):
1378 self._params.effective_query_length, = _get_cols( 1379 line, (-1,), ncols=5, expected={1:"length", 3:"query:"}) 1380 self._params.effective_query_length = _safe_int( 1381 self._params.effective_query_length)
1382
1383 - def effective_database_length(self, line):
1384 self._params.effective_database_length, = _get_cols( 1385 line.lower(), (-1,), ncols=5, expected={1:"length", 3:"database:"}) 1386 self._params.effective_database_length = _safe_int( 1387 self._params.effective_database_length)
1388
1389 - def effective_search_space(self, line):
1390 self._params.effective_search_space, = _get_cols( 1391 line, (-1,), ncols=4, expected={1:"search"}) 1392 self._params.effective_search_space = _safe_int( 1393 self._params.effective_search_space)
1394
1395 - def effective_search_space_used(self, line):
1396 self._params.effective_search_space_used, = _get_cols( 1397 line, (-1,), ncols=5, expected={1:"search", 3:"used:"}) 1398 self._params.effective_search_space_used = _safe_int( 1399 self._params.effective_search_space_used)
1400
1401 - def frameshift(self, line):
1402 self._params.frameshift = _get_cols( 1403 line, (4, 5), ncols=6, expected={0:"frameshift", 2:"decay"})
1404
1405 - def threshold(self, line):
1406 if line[:2] == "T:": 1407 #Assume its an old stlye line like "T: 123" 1408 self._params.threshold, = _get_cols( 1409 line, (1,), ncols=2, expected={0:"T:"}) 1410 elif line[:28] == "Neighboring words threshold:": 1411 self._params.threshold, = _get_cols( 1412 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"}) 1413 else: 1414 raise ValueError("Unrecognised threshold line:\n%s" % line) 1415 self._params.threshold = _safe_int(self._params.threshold)
1416
1417 - def window_size(self, line):
1418 if line[:2] == "A:": 1419 self._params.window_size, = _get_cols( 1420 line, (1,), ncols=2, expected={0:"A:"}) 1421 elif line[:25] == "Window for multiple hits:": 1422 self._params.window_size, = _get_cols( 1423 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"}) 1424 else: 1425 raise ValueError("Unrecognised window size line:\n%s" % line) 1426 self._params.window_size = _safe_int(self._params.window_size)
1427
1428 - def dropoff_1st_pass(self, line):
1429 score, bits = _re_search( 1430 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1431 "I could not find the dropoff in line\n%s" % line) 1432 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1433
1434 - def gap_x_dropoff(self, line):
1435 score, bits = _re_search( 1436 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1437 "I could not find the gap dropoff in line\n%s" % line) 1438 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1439
1440 - def gap_x_dropoff_final(self, line):
1441 score, bits = _re_search( 1442 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1443 "I could not find the gap dropoff final in line\n%s" % line) 1444 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1445
1446 - def gap_trigger(self, line):
1447 score, bits = _re_search( 1448 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1449 "I could not find the gap trigger in line\n%s" % line) 1450 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1451
1452 - def blast_cutoff(self, line):
1453 score, bits = _re_search( 1454 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1455 "I could not find the blast cutoff in line\n%s" % line) 1456 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1457
1458 - def end_parameters(self):
1459 pass
1460 1461
1462 -class _BlastConsumer(AbstractConsumer, 1463 _HeaderConsumer, 1464 _DescriptionConsumer, 1465 _AlignmentConsumer, 1466 _HSPConsumer, 1467 _DatabaseReportConsumer, 1468 _ParametersConsumer 1469 ):
1470 # This Consumer is inherits from many other consumer classes that handle 1471 # the actual dirty work. An alternate way to do it is to create objects 1472 # of those classes and then delegate the parsing tasks to them in a 1473 # decorator-type pattern. The disadvantage of that is that the method 1474 # names will need to be resolved in this classes. However, using 1475 # a decorator will retain more control in this class (which may or 1476 # may not be a bad thing). In addition, having each sub-consumer as 1477 # its own object prevents this object's dictionary from being cluttered 1478 # with members and reduces the chance of member collisions.
1479 - def __init__(self):
1480 self.data = None
1481
1482 - def round(self, line):
1483 # Make sure nobody's trying to pass me PSI-BLAST data! 1484 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1485
1486 - def start_header(self):
1487 self.data = Record.Blast() 1488 _HeaderConsumer.start_header(self)
1489
1490 - def end_header(self):
1491 _HeaderConsumer.end_header(self) 1492 self.data.__dict__.update(self._header.__dict__)
1493
1494 - def end_descriptions(self):
1495 self.data.descriptions = self._descriptions
1496
1497 - def end_alignment(self):
1498 _AlignmentConsumer.end_alignment(self) 1499 if self._alignment.hsps: 1500 self.data.alignments.append(self._alignment) 1501 if self._multiple_alignment.alignment: 1502 self.data.multiple_alignment = self._multiple_alignment
1503
1504 - def end_hsp(self):
1505 _HSPConsumer.end_hsp(self) 1506 try: 1507 self._alignment.hsps.append(self._hsp) 1508 except AttributeError: 1509 raise ValueError("Found an HSP before an alignment")
1510
1511 - def end_database_report(self):
1512 _DatabaseReportConsumer.end_database_report(self) 1513 self.data.__dict__.update(self._dr.__dict__)
1514
1515 - def end_parameters(self):
1516 _ParametersConsumer.end_parameters(self) 1517 self.data.__dict__.update(self._params.__dict__)
1518
1519 -class _PSIBlastConsumer(AbstractConsumer, 1520 _HeaderConsumer, 1521 _DescriptionConsumer, 1522 _AlignmentConsumer, 1523 _HSPConsumer, 1524 _DatabaseReportConsumer, 1525 _ParametersConsumer 1526 ):
1527 - def __init__(self):
1528 self.data = None
1529
1530 - def start_header(self):
1531 self.data = Record.PSIBlast() 1532 _HeaderConsumer.start_header(self)
1533
1534 - def end_header(self):
1535 _HeaderConsumer.end_header(self) 1536 self.data.__dict__.update(self._header.__dict__)
1537
1538 - def start_descriptions(self):
1539 self._round = Record.Round() 1540 self.data.rounds.append(self._round) 1541 _DescriptionConsumer.start_descriptions(self)
1542
1543 - def end_descriptions(self):
1544 _DescriptionConsumer.end_descriptions(self) 1545 self._round.number = self._roundnum 1546 if self._descriptions: 1547 self._round.new_seqs.extend(self._descriptions) 1548 self._round.reused_seqs.extend(self._model_sequences) 1549 self._round.new_seqs.extend(self._nonmodel_sequences) 1550 if self._converged: 1551 self.data.converged = 1
1552
1553 - def end_alignment(self):
1554 _AlignmentConsumer.end_alignment(self) 1555 if self._alignment.hsps: 1556 self._round.alignments.append(self._alignment) 1557 if self._multiple_alignment: 1558 self._round.multiple_alignment = self._multiple_alignment
1559
1560 - def end_hsp(self):
1561 _HSPConsumer.end_hsp(self) 1562 try: 1563 self._alignment.hsps.append(self._hsp) 1564 except AttributeError: 1565 raise ValueError("Found an HSP before an alignment")
1566
1567 - def end_database_report(self):
1568 _DatabaseReportConsumer.end_database_report(self) 1569 self.data.__dict__.update(self._dr.__dict__)
1570
1571 - def end_parameters(self):
1572 _ParametersConsumer.end_parameters(self) 1573 self.data.__dict__.update(self._params.__dict__)
1574
1575 -class Iterator:
1576 """Iterates over a file of multiple BLAST results. 1577 1578 Methods: 1579 next Return the next record from the stream, or None. 1580 1581 """
1582 - def __init__(self, handle, parser=None):
1583 """__init__(self, handle, parser=None) 1584 1585 Create a new iterator. handle is a file-like object. parser 1586 is an optional Parser object to change the results into another form. 1587 If set to None, then the raw contents of the file will be returned. 1588 1589 """ 1590 try: 1591 handle.readline 1592 except AttributeError: 1593 raise ValueError( 1594 "I expected a file handle or file-like object, got %s" 1595 % type(handle)) 1596 self._uhandle = File.UndoHandle(handle) 1597 self._parser = parser 1598 self._header = []
1599
1600 - def next(self):
1601 """next(self) -> object 1602 1603 Return the next Blast record from the file. If no more records, 1604 return None. 1605 1606 """ 1607 lines = [] 1608 query = False 1609 while 1: 1610 line = self._uhandle.readline() 1611 if not line: 1612 break 1613 # If I've reached the next one, then put the line back and stop. 1614 if lines and (line.startswith('BLAST') 1615 or line.startswith('BLAST', 1) 1616 or line.startswith('<?xml ')): 1617 self._uhandle.saveline(line) 1618 break 1619 # New style files ommit the BLAST line to mark a new query: 1620 if line.startswith("Query="): 1621 if not query: 1622 if not self._header: 1623 self._header = lines[:] 1624 query = True 1625 else: 1626 #Start of another record 1627 self._uhandle.saveline(line) 1628 break 1629 lines.append(line) 1630 1631 if query and "BLAST" not in lines[0]: 1632 #Cheat and re-insert the header 1633 #print "-"*50 1634 #print "".join(self._header) 1635 #print "-"*50 1636 #print "".join(lines) 1637 #print "-"*50 1638 lines = self._header + lines 1639 1640 if not lines: 1641 return None 1642 1643 data = ''.join(lines) 1644 if self._parser is not None: 1645 return self._parser.parse(File.StringHandle(data)) 1646 return data
1647
1648 - def __iter__(self):
1649 return iter(self.next, None)
1650
1651 -def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
1652 """Execute and retrieve data from standalone BLASTPALL as handles (OBSOLETE). 1653 1654 NOTE - This function is obsolete, you are encouraged to the command 1655 line wrapper Bio.Blast.Applications.BlastallCommandline instead. 1656 1657 Execute and retrieve data from blastall. blastcmd is the command 1658 used to launch the 'blastall' executable. program is the blast program 1659 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database 1660 to search against. infile is the path to the file containing 1661 the sequence to search with. 1662 1663 The return values are two handles, for standard output and standard error. 1664 1665 You may pass more parameters to **keywds to change the behavior of 1666 the search. Otherwise, optional values will be chosen by blastall. 1667 The Blast output is by default in XML format. Use the align_view keyword 1668 for output in a different format. 1669 1670 Scoring 1671 matrix Matrix to use. 1672 gap_open Gap open penalty. 1673 gap_extend Gap extension penalty. 1674 nuc_match Nucleotide match reward. (BLASTN) 1675 nuc_mismatch Nucleotide mismatch penalty. (BLASTN) 1676 query_genetic_code Genetic code for Query. 1677 db_genetic_code Genetic code for database. (TBLAST[NX]) 1678 1679 Algorithm 1680 gapped Whether to do a gapped alignment. T/F (not for TBLASTX) 1681 expectation Expectation value cutoff. 1682 wordsize Word size. 1683 strands Query strands to search against database.([T]BLAST[NX]) 1684 keep_hits Number of best hits from a region to keep. 1685 xdrop Dropoff value (bits) for gapped alignments. 1686 hit_extend Threshold for extending hits. 1687 region_length Length of region used to judge hits. 1688 db_length Effective database length. 1689 search_length Effective length of search space. 1690 1691 Processing 1692 filter Filter query sequence for low complexity (with SEG)? T/F 1693 believe_query Believe the query defline. T/F 1694 restrict_gi Restrict search to these GI's. 1695 nprocessors Number of processors to use. 1696 oldengine Force use of old engine T/F 1697 1698 Formatting 1699 html Produce HTML output? T/F 1700 descriptions Number of one-line descriptions. 1701 alignments Number of alignments. 1702 align_view Alignment view. Integer 0-11, 1703 passed as a string or integer. 1704 show_gi Show GI's in deflines? T/F 1705 seqalign_file seqalign file to output. 1706 outfile Output file for report. Filename to write to, if 1707 ommitted standard output is used (which you can access 1708 from the returned handles). 1709 """ 1710 1711 _security_check_parameters(keywds) 1712 1713 att2param = { 1714 'matrix' : '-M', 1715 'gap_open' : '-G', 1716 'gap_extend' : '-E', 1717 'nuc_match' : '-r', 1718 'nuc_mismatch' : '-q', 1719 'query_genetic_code' : '-Q', 1720 'db_genetic_code' : '-D', 1721 1722 'gapped' : '-g', 1723 'expectation' : '-e', 1724 'wordsize' : '-W', 1725 'strands' : '-S', 1726 'keep_hits' : '-K', 1727 'xdrop' : '-X', 1728 'hit_extend' : '-f', 1729 'region_length' : '-L', 1730 'db_length' : '-z', 1731 'search_length' : '-Y', 1732 1733 'program' : '-p', 1734 'database' : '-d', 1735 'infile' : '-i', 1736 'filter' : '-F', 1737 'believe_query' : '-J', 1738 'restrict_gi' : '-l', 1739 'nprocessors' : '-a', 1740 'oldengine' : '-V', 1741 1742 'html' : '-T', 1743 'descriptions' : '-v', 1744 'alignments' : '-b', 1745 'align_view' : '-m', 1746 'show_gi' : '-I', 1747 'seqalign_file' : '-O', 1748 'outfile' : '-o', 1749 } 1750 from Applications import BlastallCommandline 1751 cline = BlastallCommandline(blastcmd) 1752 cline.set_parameter(att2param['program'], program) 1753 cline.set_parameter(att2param['database'], database) 1754 cline.set_parameter(att2param['infile'], infile) 1755 cline.set_parameter(att2param['align_view'], str(align_view)) 1756 for key, value in keywds.iteritems(): 1757 cline.set_parameter(att2param[key], str(value)) 1758 return _invoke_blast(cline)
1759 1760
1761 -def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
1762 """Execute and retrieve data from standalone BLASTPGP as handles (OBSOLETE). 1763 1764 NOTE - This function is obsolete, you are encouraged to the command 1765 line wrapper Bio.Blast.Applications.BlastpgpCommandline instead. 1766 1767 Execute and retrieve data from blastpgp. blastcmd is the command 1768 used to launch the 'blastpgp' executable. database is the path to the 1769 database to search against. infile is the path to the file containing 1770 the sequence to search with. 1771 1772 The return values are two handles, for standard output and standard error. 1773 1774 You may pass more parameters to **keywds to change the behavior of 1775 the search. Otherwise, optional values will be chosen by blastpgp. 1776 The Blast output is by default in XML format. Use the align_view keyword 1777 for output in a different format. 1778 1779 Scoring 1780 matrix Matrix to use. 1781 gap_open Gap open penalty. 1782 gap_extend Gap extension penalty. 1783 window_size Multiple hits window size. 1784 npasses Number of passes. 1785 passes Hits/passes. Integer 0-2. 1786 1787 Algorithm 1788 gapped Whether to do a gapped alignment. T/F 1789 expectation Expectation value cutoff. 1790 wordsize Word size. 1791 keep_hits Number of beset hits from a region to keep. 1792 xdrop Dropoff value (bits) for gapped alignments. 1793 hit_extend Threshold for extending hits. 1794 region_length Length of region used to judge hits. 1795 db_length Effective database length. 1796 search_length Effective length of search space. 1797 nbits_gapping Number of bits to trigger gapping. 1798 pseudocounts Pseudocounts constants for multiple passes. 1799 xdrop_final X dropoff for final gapped alignment. 1800 xdrop_extension Dropoff for blast extensions. 1801 model_threshold E-value threshold to include in multipass model. 1802 required_start Start of required region in query. 1803 required_end End of required region in query. 1804 1805 Processing 1806 XXX should document default values 1807 program The blast program to use. (PHI-BLAST) 1808 filter Filter query sequence for low complexity (with SEG)? T/F 1809 believe_query Believe the query defline? T/F 1810 nprocessors Number of processors to use. 1811 1812 Formatting 1813 html Produce HTML output? T/F 1814 descriptions Number of one-line descriptions. 1815 alignments Number of alignments. 1816 align_view Alignment view. Integer 0-11, 1817 passed as a string or integer. 1818 show_gi Show GI's in deflines? T/F 1819 seqalign_file seqalign file to output. 1820 align_outfile Output file for alignment. 1821 checkpoint_outfile Output file for PSI-BLAST checkpointing. 1822 restart_infile Input file for PSI-BLAST restart. 1823 hit_infile Hit file for PHI-BLAST. 1824 matrix_outfile Output file for PSI-BLAST matrix in ASCII. 1825 align_outfile Output file for alignment. Filename to write to, if 1826 ommitted standard output is used (which you can access 1827 from the returned handles). 1828 1829 align_infile Input alignment file for PSI-BLAST restart. 1830 1831 """ 1832 1833 _security_check_parameters(keywds) 1834 1835 att2param = { 1836 'matrix' : '-M', 1837 'gap_open' : '-G', 1838 'gap_extend' : '-E', 1839 'window_size' : '-A', 1840 'npasses' : '-j', 1841 'passes' : '-P', 1842 1843 'gapped' : '-g', 1844 'expectation' : '-e', 1845 'wordsize' : '-W', 1846 'keep_hits' : '-K', 1847 'xdrop' : '-X', 1848 'hit_extend' : '-f', 1849 'region_length' : '-L', 1850 'db_length' : '-Z', 1851 'search_length' : '-Y', 1852 'nbits_gapping' : '-N', 1853 'pseudocounts' : '-c', 1854 'xdrop_final' : '-Z', 1855 'xdrop_extension' : '-y', 1856 'model_threshold' : '-h', 1857 'required_start' : '-S', 1858 'required_end' : '-H', 1859 1860 'program' : '-p', 1861 'database' : '-d', 1862 'infile' : '-i', 1863 'filter' : '-F', 1864 'believe_query' : '-J', 1865 'nprocessors' : '-a', 1866 1867 'html' : '-T', 1868 'descriptions' : '-v', 1869 'alignments' : '-b', 1870 'align_view' : '-m', 1871 'show_gi' : '-I', 1872 'seqalign_file' : '-O', 1873 'align_outfile' : '-o', 1874 'checkpoint_outfile' : '-C', 1875 'restart_infile' : '-R', 1876 'hit_infile' : '-k', 1877 'matrix_outfile' : '-Q', 1878 'align_infile' : '-B', 1879 } 1880 from Applications import BlastpgpCommandline 1881 cline = BlastpgpCommandline(blastcmd) 1882 cline.set_parameter(att2param['database'], database) 1883 cline.set_parameter(att2param['infile'], infile) 1884 cline.set_parameter(att2param['align_view'], str(align_view)) 1885 for key, value in keywds.iteritems(): 1886 cline.set_parameter(att2param[key], str(value)) 1887 return _invoke_blast(cline)
1888 1889
1890 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1891 """Execute and retrieve data from standalone RPS-BLAST as handles (OBSOLETE). 1892 1893 NOTE - This function is obsolete, you are encouraged to the command 1894 line wrapper Bio.Blast.Applications.RpsBlastCommandline instead. 1895 1896 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the 1897 command used to launch the 'rpsblast' executable. database is the path 1898 to the database to search against. infile is the path to the file 1899 containing the sequence to search with. 1900 1901 The return values are two handles, for standard output and standard error. 1902 1903 You may pass more parameters to **keywds to change the behavior of 1904 the search. Otherwise, optional values will be chosen by rpsblast. 1905 1906 Please note that this function will give XML output by default, by 1907 setting align_view to seven (i.e. command line option -m 7). 1908 You should use the NCBIXML.parse() function to read the resulting output. 1909 This is because NCBIStandalone.BlastParser() does not understand the 1910 plain text output format from rpsblast. 1911 1912 WARNING - The following text and associated parameter handling has not 1913 received extensive testing. Please report any errors we might have made... 1914 1915 Algorithm/Scoring 1916 gapped Whether to do a gapped alignment. T/F 1917 multihit 0 for multiple hit (default), 1 for single hit 1918 expectation Expectation value cutoff. 1919 range_restriction Range restriction on query sequence (Format: start,stop) blastp only 1920 0 in 'start' refers to the beginning of the sequence 1921 0 in 'stop' refers to the end of the sequence 1922 Default = 0,0 1923 xdrop Dropoff value (bits) for gapped alignments. 1924 xdrop_final X dropoff for final gapped alignment (in bits). 1925 xdrop_extension Dropoff for blast extensions (in bits). 1926 search_length Effective length of search space. 1927 nbits_gapping Number of bits to trigger gapping. 1928 protein Query sequence is protein. T/F 1929 db_length Effective database length. 1930 1931 Processing 1932 filter Filter query sequence for low complexity? T/F 1933 case_filter Use lower case filtering of FASTA sequence T/F, default F 1934 believe_query Believe the query defline. T/F 1935 nprocessors Number of processors to use. 1936 logfile Name of log file to use, default rpsblast.log 1937 1938 Formatting 1939 html Produce HTML output? T/F 1940 descriptions Number of one-line descriptions. 1941 alignments Number of alignments. 1942 align_view Alignment view. Integer 0-11, 1943 passed as a string or integer. 1944 show_gi Show GI's in deflines? T/F 1945 seqalign_file seqalign file to output. 1946 align_outfile Output file for alignment. Filename to write to, if 1947 ommitted standard output is used (which you can access 1948 from the returned handles). 1949 """ 1950 1951 _security_check_parameters(keywds) 1952 1953 att2param = { 1954 'multihit' : '-P', 1955 'gapped' : '-g', 1956 'expectation' : '-e', 1957 'range_restriction' : '-L', 1958 'xdrop' : '-X', 1959 'xdrop_final' : '-Z', 1960 'xdrop_extension' : '-y', 1961 'search_length' : '-Y', 1962 'nbits_gapping' : '-N', 1963 'protein' : '-p', 1964 'db_length' : '-z', 1965 1966 'database' : '-d', 1967 'infile' : '-i', 1968 'filter' : '-F', 1969 'case_filter' : '-U', 1970 'believe_query' : '-J', 1971 'nprocessors' : '-a', 1972 'logfile' : '-l', 1973 1974 'html' : '-T', 1975 'descriptions' : '-v', 1976 'alignments' : '-b', 1977 'align_view' : '-m', 1978 'show_gi' : '-I', 1979 'seqalign_file' : '-O', 1980 'align_outfile' : '-o', 1981 } 1982 1983 from Applications import RpsBlastCommandline 1984 cline = RpsBlastCommandline(blastcmd) 1985 cline.set_parameter(att2param['database'], database) 1986 cline.set_parameter(att2param['infile'], infile) 1987 cline.set_parameter(att2param['align_view'], str(align_view)) 1988 for key, value in keywds.iteritems(): 1989 cline.set_parameter(att2param[key], str(value)) 1990 return _invoke_blast(cline)
1991 1992
1993 -def _re_search(regex, line, error_msg):
1994 m = re.search(regex, line) 1995 if not m: 1996 raise ValueError(error_msg) 1997 return m.groups()
1998
1999 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
2000 cols = line.split() 2001 2002 # Check to make sure number of columns is correct 2003 if ncols is not None and len(cols) != ncols: 2004 raise ValueError("I expected %d columns (got %d) in line\n%s" \ 2005 % (ncols, len(cols), line)) 2006 2007 # Check to make sure columns contain the correct data 2008 for k in expected.keys(): 2009 if cols[k] != expected[k]: 2010 raise ValueError("I expected '%s' in column %d in line\n%s" \ 2011 % (expected[k], k, line)) 2012 2013 # Construct the answer tuple 2014 results = [] 2015 for c in cols_to_get: 2016 results.append(cols[c]) 2017 return tuple(results)
2018
2019 -def _safe_int(str):
2020 try: 2021 return int(str) 2022 except ValueError: 2023 # Something went wrong. Try to clean up the string. 2024 # Remove all commas from the string 2025 str = str.replace(',', '') 2026 try: 2027 # try again. 2028 return int(str) 2029 except ValueError: 2030 pass 2031 # If it fails again, maybe it's too long? 2032 # XXX why converting to float? 2033 return long(float(str))
2034
2035 -def _safe_float(str):
2036 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 2037 # float('e-172') does not produce an error on his platform. Thus, 2038 # we need to check the string for this condition. 2039 2040 # Sometimes BLAST leaves of the '1' in front of an exponent. 2041 if str and str[0] in ['E', 'e']: 2042 str = '1' + str 2043 try: 2044 return float(str) 2045 except ValueError: 2046 # Remove all commas from the string 2047 str = str.replace(',', '') 2048 # try again. 2049 return float(str)
2050 2051
2052 -def _invoke_blast(cline):
2053 """Start BLAST and returns handles for stdout and stderr (PRIVATE). 2054 2055 Expects a command line wrapper object from Bio.Blast.Applications 2056 """ 2057 import subprocess, sys 2058 blast_cmd = cline.program_name 2059 if not os.path.exists(blast_cmd): 2060 raise ValueError("BLAST executable does not exist at %s" % blast_cmd) 2061 #We don't need to supply any piped input, but we setup the 2062 #standard input pipe anyway as a work around for a python 2063 #bug if this is called from a Windows GUI program. For 2064 #details, see http://bugs.python.org/issue1124861 2065 blast_process = subprocess.Popen(str(cline), 2066 stdin=subprocess.PIPE, 2067 stdout=subprocess.PIPE, 2068 stderr=subprocess.PIPE, 2069 shell=(sys.platform!="win32")) 2070 blast_process.stdin.close() 2071 return blast_process.stdout, blast_process.stderr
2072 2073
2074 -def _security_check_parameters(param_dict):
2075 """Look for any attempt to insert a command into a parameter. 2076 2077 e.g. blastall(..., matrix='IDENTITY -F 0; rm -rf /etc/passwd') 2078 2079 Looks for ";" or "&&" in the strings (Unix and Windows syntax 2080 for appending a command line), or ">", "<" or "|" (redirection) 2081 and if any are found raises an exception. 2082 """ 2083 for key, value in param_dict.iteritems(): 2084 str_value = str(value) # Could easily be an int or a float 2085 for bad_str in [";", "&&", ">", "<", "|"]: 2086 if bad_str in str_value: 2087 raise ValueError("Rejecting suspicious argument for %s" % key)
2088
2089 -class _BlastErrorConsumer(_BlastConsumer):
2090 - def __init__(self):
2092 - def noevent(self, line):
2093 if line.find("Query must be at least wordsize") != -1: 2094 raise ShortQueryBlastError("Query must be at least wordsize") 2095 # Now pass the line back up to the superclass. 2096 method = getattr(_BlastConsumer, 'noevent', 2097 _BlastConsumer.__getattr__(self, 'noevent')) 2098 method(line)
2099
2100 -class BlastErrorParser(AbstractParser):
2101 """Attempt to catch and diagnose BLAST errors while parsing. 2102 2103 This utilizes the BlastParser module but adds an additional layer 2104 of complexity on top of it by attempting to diagnose ValueErrors 2105 that may actually indicate problems during BLAST parsing. 2106 2107 Current BLAST problems this detects are: 2108 o LowQualityBlastError - When BLASTing really low quality sequences 2109 (ie. some GenBank entries which are just short streches of a single 2110 nucleotide), BLAST will report an error with the sequence and be 2111 unable to search with this. This will lead to a badly formatted 2112 BLAST report that the parsers choke on. The parser will convert the 2113 ValueError to a LowQualityBlastError and attempt to provide useful 2114 information. 2115 2116 """
2117 - def __init__(self, bad_report_handle = None):
2118 """Initialize a parser that tries to catch BlastErrors. 2119 2120 Arguments: 2121 o bad_report_handle - An optional argument specifying a handle 2122 where bad reports should be sent. This would allow you to save 2123 all of the bad reports to a file, for instance. If no handle 2124 is specified, the bad reports will not be saved. 2125 """ 2126 self._bad_report_handle = bad_report_handle 2127 2128 #self._b_parser = BlastParser() 2129 self._scanner = _Scanner() 2130 self._consumer = _BlastErrorConsumer()
2131
2132 - def parse(self, handle):
2133 """Parse a handle, attempting to diagnose errors. 2134 """ 2135 results = handle.read() 2136 2137 try: 2138 self._scanner.feed(File.StringHandle(results), self._consumer) 2139 except ValueError, msg: 2140 # if we have a bad_report_file, save the info to it first 2141 if self._bad_report_handle: 2142 # send the info to the error handle 2143 self._bad_report_handle.write(results) 2144 2145 # now we want to try and diagnose the error 2146 self._diagnose_error( 2147 File.StringHandle(results), self._consumer.data) 2148 2149 # if we got here we can't figure out the problem 2150 # so we should pass along the syntax error we got 2151 raise 2152 return self._consumer.data
2153
2154 - def _diagnose_error(self, handle, data_record):
2155 """Attempt to diagnose an error in the passed handle. 2156 2157 Arguments: 2158 o handle - The handle potentially containing the error 2159 o data_record - The data record partially created by the consumer. 2160 """ 2161 line = handle.readline() 2162 2163 while line: 2164 # 'Searchingdone' instead of 'Searching......done' seems 2165 # to indicate a failure to perform the BLAST due to 2166 # low quality sequence 2167 if line.startswith('Searchingdone'): 2168 raise LowQualityBlastError("Blast failure occured on query: ", 2169 data_record.query) 2170 line = handle.readline()
2171