1
2
3
4
5
6
7
8
9 """
10 This module provides code to work with the WWW version of BLAST
11 provided by the NCBI.
12 http://blast.ncbi.nlm.nih.gov/
13
14 Functions:
15 qblast Do a BLAST search using the QBLAST API.
16 """
17
18 try:
19 import cStringIO as StringIO
20 except ImportError:
21 import StringIO
22
23
24
25 -def qblast(program, database, sequence,
26 auto_format=None,composition_based_statistics=None,
27 db_genetic_code=None,endpoints=None,entrez_query='(none)',
28 expect=10.0,filter=None,gapcosts=None,genetic_code=None,
29 hitlist_size=50,i_thresh=None,layout=None,lcase_mask=None,
30 matrix_name=None,nucl_penalty=None,nucl_reward=None,
31 other_advanced=None,perc_ident=None,phi_pattern=None,
32 query_file=None,query_believe_defline=None,query_from=None,
33 query_to=None,searchsp_eff=None,service=None,threshold=None,
34 ungapped_alignment=None,word_size=None,
35 alignments=500,alignment_view=None,descriptions=500,
36 entrez_links_new_window=None,expect_low=None,expect_high=None,
37 format_entrez_query=None,format_object=None,format_type='XML',
38 ncbi_gi=None,results_file=None,show_overview=None
39 ):
40 """Do a BLAST search using the QBLAST server at NCBI.
41
42 Supports all parameters of the qblast API for Put and Get.
43 Some useful parameters:
44 program blastn, blastp, blastx, tblastn, or tblastx (lower case)
45 database Which database to search against (e.g. "nr").
46 sequence The sequence to search.
47 ncbi_gi TRUE/FALSE whether to give 'gi' identifier.
48 descriptions Number of descriptions to show. Def 500.
49 alignments Number of alignments to show. Def 500.
50 expect An expect value cutoff. Def 10.0.
51 matrix_name Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
52 filter "none" turns off filtering. Default no filtering
53 format_type "HTML", "Text", "ASN.1", or "XML". Def. "XML".
54 entrez_query Entrez query to limit Blast search
55 hitlist_size Number of hits to return. Default 50
56
57 This function does no checking of the validity of the parameters
58 and passes the values to the server as is. More help is available at:
59 http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html
60
61 """
62 import urllib, urllib2
63 import time
64
65 assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
66
67
68
69 parameters = [
70 ('AUTO_FORMAT',auto_format),
71 ('COMPOSITION_BASED_STATISTICS',composition_based_statistics),
72 ('DATABASE',database),
73 ('DB_GENETIC_CODE',db_genetic_code),
74 ('ENDPOINTS',endpoints),
75 ('ENTREZ_QUERY',entrez_query),
76 ('EXPECT',expect),
77 ('FILTER',filter),
78 ('GAPCOSTS',gapcosts),
79 ('GENETIC_CODE',genetic_code),
80 ('HITLIST_SIZE',hitlist_size),
81 ('I_THRESH',i_thresh),
82 ('LAYOUT',layout),
83 ('LCASE_MASK',lcase_mask),
84 ('MATRIX_NAME',matrix_name),
85 ('NUCL_PENALTY',nucl_penalty),
86 ('NUCL_REWARD',nucl_reward),
87 ('OTHER_ADVANCED',other_advanced),
88 ('PERC_IDENT',perc_ident),
89 ('PHI_PATTERN',phi_pattern),
90 ('PROGRAM',program),
91 ('QUERY',sequence),
92 ('QUERY_FILE',query_file),
93 ('QUERY_BELIEVE_DEFLINE',query_believe_defline),
94 ('QUERY_FROM',query_from),
95 ('QUERY_TO',query_to),
96 ('SEARCHSP_EFF',searchsp_eff),
97 ('SERVICE',service),
98 ('THRESHOLD',threshold),
99 ('UNGAPPED_ALIGNMENT',ungapped_alignment),
100 ('WORD_SIZE',word_size),
101 ('CMD', 'Put'),
102 ]
103 query = [x for x in parameters if x[1] is not None]
104 message = urllib.urlencode(query)
105
106
107
108
109
110 request = urllib2.Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
111 message,
112 {"User-Agent":"BiopythonClient"})
113 handle = urllib2.urlopen(request)
114
115
116
117 rid, rtoe = _parse_qblast_ref_page(handle)
118 parameters = [
119 ('ALIGNMENTS',alignments),
120 ('ALIGNMENT_VIEW',alignment_view),
121 ('DESCRIPTIONS',descriptions),
122 ('ENTREZ_LINKS_NEW_WINDOW',entrez_links_new_window),
123 ('EXPECT_LOW',expect_low),
124 ('EXPECT_HIGH',expect_high),
125 ('FORMAT_ENTREZ_QUERY',format_entrez_query),
126 ('FORMAT_OBJECT',format_object),
127 ('FORMAT_TYPE',format_type),
128 ('NCBI_GI',ncbi_gi),
129 ('RID',rid),
130 ('RESULTS_FILE',results_file),
131 ('SERVICE',service),
132 ('SHOW_OVERVIEW',show_overview),
133 ('CMD', 'Get'),
134 ]
135 query = [x for x in parameters if x[1] is not None]
136 message = urllib.urlencode(query)
137
138
139 delay = 3.0
140 previous = time.time()
141 while True:
142 current = time.time()
143 wait = previous + delay - current
144 if wait > 0:
145 time.sleep(wait)
146 previous = current + wait
147 else:
148 previous = current
149
150 request = urllib2.Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
151 message,
152 {"User-Agent":"BiopythonClient"})
153 handle = urllib2.urlopen(request)
154 results = handle.read()
155
156
157 if results=="\n\n":
158 continue
159
160 if results.find("Status=") < 0:
161 break
162 i = results.index("Status=")
163 j = results.index("\n", i)
164 status = results[i+len("Status="):j].strip()
165 if status.upper() == "READY":
166 break
167
168 return StringIO.StringIO(results)
169
171 """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).
172
173 The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is proably
174 'Request Time of Execution' and RID would be 'Request Identifier'.
175 """
176 s = handle.read()
177 i = s.find("RID =")
178 if i == -1:
179 rid = None
180 else:
181 j = s.find("\n", i)
182 rid = s[i+len("RID ="):j].strip()
183
184 i = s.find("RTOE =")
185 if i == -1:
186 rtoe = None
187 else:
188 j = s.find("\n", i)
189 rtoe = s[i+len("RTOE ="):j].strip()
190
191 if not rid and not rtoe:
192
193
194
195
196 i = s.find('<div class="error msInf">')
197 if i != -1:
198 msg = s[i+len('<div class="error msInf">'):].strip()
199 msg = msg.split("</div>",1)[0].split("\n",1)[0].strip()
200 if msg:
201 raise ValueError("Error message from NCBI: %s" % msg)
202
203 raise ValueError("No RID and no RTOE found in the 'please wait' page."
204 " (there was probably a problem with your request)")
205 elif not rid:
206
207 raise ValueError("No RID found in the 'please wait' page."
208 " (although RTOE = %s)" % repr(rtoe))
209 elif not rtoe:
210
211 raise ValueError("No RTOE found in the 'please wait' page."
212 " (although RID = %s)" % repr(rid))
213
214 try:
215 return rid, int(rtoe)
216 except ValueError:
217 raise ValueError("A non-integer RTOE found in " \
218 +"the 'please wait' page, %s" % repr(rtoe))
219