1
2
3
4
5
6
7 """Provides code to access NCBI over the WWW.
8
9 The main Entrez web page is available at:
10 http://www.ncbi.nlm.nih.gov/Entrez/
11
12 A list of the Entrez utilities is available at:
13 http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html
14
15
16 Functions:
17 efetch Retrieves records in the requested format from a list of one or
18 more primary IDs or from the user's environment
19 epost Posts a file containing a list of primary IDs for future use in
20 the user's environment to use with subsequent search strategies
21 esearch Searches and retrieves primary IDs (for use in EFetch, ELink,
22 and ESummary) and term translations and optionally retains
23 results for future use in the user's environment.
24 elink Checks for the existence of an external or Related Articles link
25 from a list of one or more primary IDs. Retrieves primary IDs
26 and relevancy scores for links to Entrez databases or Related
27 Articles; creates a hyperlink to the primary LinkOut provider
28 for a specific ID and database, or lists LinkOut URLs
29 and Attributes for multiple IDs.
30 einfo Provides field index term counts, last update, and available
31 links for each database.
32 esummary Retrieves document summaries from a list of primary IDs or from
33 the user's environment.
34 egquery Provides Entrez database counts in XML for a single search
35 using Global Query.
36 espell Retrieves spelling suggestions.
37
38 read Parses the XML results returned by any of the above functions.
39 Typical usage is:
40 >>> handle = Entrez.einfo() # or esearch, efetch, ...
41 >>> record = Entrez.read(handle)
42 where record is now a Python dictionary or list.
43
44 _open Internally used function.
45
46 """
47 import urllib, time, warnings
48 import os.path
49 from Bio import File
50
51
52 email = None
53
54
55
def epost(db, **keywds):
    """Post a list of identifiers to Entrez and return a results handle.

    EPost uploads a file containing a list of UIs so that they are kept
    in the user's environment and can be used with subsequent search
    strategies.

    db names the Entrez database; every additional keyword argument is
    passed on unchanged as an EPost parameter.  The parameters are
    explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    Returns a handle to the results.

    An IOError exception is raised if there's a network error.
    """
    # Start from the mandatory database name, then layer the caller's
    # optional parameters on top of it.
    query = {'db': db}
    for name, value in keywds.items():
        query[name] = value
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
    # The request is sent with post=True rather than as a query string.
    return _open(url, query, post=True)
73
75 """Fetches Entrez results which are returned as a handle.
76
77 EFetch retrieves records in the requested format from a list of one or
78 more UIs or from user's environment.
79
80 See the online documentation for an explanation of the parameters:
81 http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html
82
83 Return a handle to the results.
84
85 Raises an IOError exception if there's a network error.
86
87 Short example:
88
89 from Bio import Entrez
90 handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb")
91 print handle.read()
92 """
93 for key in keywds:
94 if key.lower()=="rettype" and keywds[key].lower()=="genbank":
95 warnings.warn('As of Easter 2009, Entrez EFetch no longer '
96 'supports the unofficial return type "genbank", '
97 'use "gb" or "gp" instead.', DeprecationWarning)
98 if db.lower()=="protein":
99 keywds[key] = "gp"
100 else:
101 keywds[key] = "gb"
102 cgi='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
103 variables = {'db' : db}
104 variables.update(keywds)
105 return _open(cgi, variables)
106
108 """ESearch runs an Entrez search and returns a handle to the results.
109
110 ESearch searches and retrieves primary IDs (for use in EFetch, ELink
111 and ESummary) and term translations, and optionally retains results
112 for future use in the user's environment.
113
114 See the online documentation for an explanation of the parameters:
115 http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html
116
117 Return a handle to the results which are always in XML format.
118
119 Raises an IOError exception if there's a network error.
120
121 Short example:
122
123 from Bio import Entrez
124 handle = Entrez.esearch(db="nucleotide", retmax=10, term="Opuntia")
125 record = Entrez.read(handle)
126 print record["Count"]
127 print record["IdList"]
128 """
129 cgi='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
130 variables = {'db' : db,
131 'term' : term}
132 variables.update(keywds)
133 return _open(cgi, variables)
134
136 """ELink checks for linked external articles and returns a handle.
137
138 ELink checks for the existence of an external or Related Articles link
139 from a list of one or more primary IDs; retrieves IDs and relevancy
140 scores for links to Entrez databases or Related Articles; creates a
141 hyperlink to the primary LinkOut provider for a specific ID and
142 database, or lists LinkOut URLs and attributes for multiple IDs.
143
144 See the online documentation for an explanation of the parameters:
145 http://www.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html
146
147 Return a handle to the results, by default in XML format.
148
149 Raises an IOError exception if there's a network error.
150 """
151 cgi='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
152 variables = {}
153 variables.update(keywds)
154 return _open(cgi, variables)
155
157 """EInfo returns a summary of the Entrez databases as a results handle.
158
159 EInfo provides field names, index term counts, last update, and
160 available links for each Entrez database.
161
162 See the online documentation for an explanation of the parameters:
163 http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html
164
165 Return a handle to the results, by default in XML format.
166
167 Raises an IOError exception if there's a network error.
168
169 Short example:
170
171 from Bio import Entrez
172 record = Entrez.read(Entrez.einfo())
173 print record['DbList']
174 """
175 cgi='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
176 variables = {}
177 variables.update(keywds)
178 return _open(cgi, variables)
179
181 """ESummary retrieves document summaries as a results handle.
182
183 ESummary retrieves document summaries from a list of primary IDs or
184 from the user's environment.
185
186 See the online documentation for an explanation of the parameters:
187 http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html
188
189 Return a handle to the results, by default in XML format.
190
191 Raises an IOError exception if there's a network error.
192 """
193 cgi='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
194 variables = {}
195 variables.update(keywds)
196 return _open(cgi, variables)
197
199 """EGQuery provides Entrez database counts for a global search.
200
201 EGQuery provides Entrez database counts in XML for a single search
202 using Global Query.
203
204 See the online documentation for an explanation of the parameters:
205 http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html
206
207 Return a handle to the results in XML format.
208
209 Raises an IOError exception if there's a network error.
210 """
211 cgi='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
212 variables = {}
213 variables.update(keywds)
214 return _open(cgi, variables)
215
217 """ESpell retrieves spelling suggestions, returned in a results handle.
218
219 ESpell retrieves spelling suggestions, if available.
220
221 See the online documentation for an explanation of the parameters:
222 http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html
223
224 Return a handle to the results, by default in XML format.
225
226 Raises an IOError exception if there's a network error.
227
228 Short example:
229
230 from Bio import Entrez
231 record = Entrez.read(Entrez.espell(term="biopythooon"))
232 print record["Query"]
233 print record["CorrectedQuery"]
234 """
235 cgi='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
236 variables = {}
237 variables.update(keywds)
238 return _open(cgi, variables)
239
241 """Parses an XML file from the NCBI Entrez Utilities into python objects.
242
243 This function parses an XML file created by NCBI's Entrez Utilities,
244 returning a multilevel data structure of Python lists and dictionaries.
245 Most XML files returned by NCBI's Entrez Utilities can be parsed by
246 this function, provided its DTD is available. Biopython includes the
247 DTDs for most commonly used Entrez Utilities.
248
249 Whereas the data structure seems to consist of generic Python lists,
250 dictionaries, strings, and so on, each of these is actually a class
251 derived from the base type. This allows us to store the attributes
252 (if any) of each element in a dictionary my_element.attributes, and
253 the tag name in my_element.tag.
254 """
255 from Parser import DataHandler
256 DTDs = os.path.join(__path__[0], "DTDs")
257 handler = DataHandler(DTDs)
258 record = handler.read(handle)
259 return record
260
267
def _error_message(data):
    """Return the error text for a known NCBI error page, else None (PRIVATE).

    data is the text of the first few lines of a response.  The checks
    are tried in a fixed order and the first one that matches wins.
    """
    if "500 Proxy Error" in data:
        return "500 Proxy Error (NCBI busy?)"
    if "502 Proxy Error" in data:
        return "502 Proxy Error (NCBI busy?)"
    if "WWW Error 500 Diagnostic" in data:
        return "WWW Error 500 Diagnostic (NCBI busy?)"
    if "<title>Service unavailable!</title>" in data:
        return "Service unavailable!"
    if "<title>Bad Gateway!</title>" in data:
        return "Bad Gateway!"
    if "<title>414 Request-URI Too Large</title>" in data \
    or "<h1>Request-URI Too Large</h1>" in data:
        return "Requested URL too long (try using EPost?)"
    if data.startswith("Error:"):
        # Plain-text error: report the text that was read, minus the
        # surrounding whitespace.
        return data.strip()
    if data.startswith("The resource is temporarily unavailable"):
        return "The resource is temporarily unavailable"
    if data.startswith("download dataset is empty"):
        return "download dataset is empty"
    if data[:5] == "ERROR":
        return "ERROR, possibly because id not available?"
    return None


def _open(cgi, params=None, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it; entries whose
    value is None are left out of the request.  The dictionary passed in
    is never modified.  If post is true the options are sent as POST
    data, otherwise they are appended to the URL as a query string.

    A "tool" option of "biopython" is added unless the caller supplied
    one.  An "email" option is taken from the module level variable
    email unless the caller supplied one; if neither is available a
    UserWarning is issued.

    Returns an UndoHandle wrapping the response.  Does some simple error
    checking on the first lines of the response, and will raise an
    IOError if it recognises an error page.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # Rate limiting: leave at least a third of a second between the
    # start of two consecutive requests.
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # Work on a filtered copy: this drops the unset (None) options and
    # guarantees that neither the caller's dictionary nor a shared
    # default is modified by the additions below.
    if params is None:
        params = {}
    params = dict((key, value) for key, value in params.items()
                  if value is not None)

    # Identify the tool to NCBI unless the caller already did so.
    if "tool" not in params:
        params["tool"] = "biopython"

    # Tell NCBI who is calling, or remind the user to configure it.
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI strongly recommends you to specify
your email address with each request. From June 1, 2010, this will be
mandatory. As an example, if your email address is A.N.Other@example.com, you
can specify it as follows:
from Bio import Entrez
Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)

    # doseq=True expands sequence values into repeated key=value pairs.
    options = urllib.urlencode(params, doseq=True)
    if post:
        # Supplying data makes urllib issue a POST request.
        handle = urllib.urlopen(cgi, data=options)
    else:
        # Otherwise the options travel in the URL (a GET request).
        cgi += "?" + options
        handle = urllib.urlopen(cgi)

    # Wrap the handle so the lines inspected below can be handed back.
    uhandle = File.UndoHandle(handle)

    # Peek at the first seven lines, then save them again in reverse
    # order (presumably saveline() pushes a line back onto the front of
    # the stream, so this restores the original order for the caller).
    lines = [uhandle.readline() for dummy in range(7)]
    for line in reversed(lines):
        uhandle.saveline(line)
    data = ''.join(lines)

    message = _error_message(data)
    if message is not None:
        # Nobody will ever read from this response, so release the
        # connection before reporting the problem.
        handle.close()
        raise IOError(message)

    return uhandle

# Time at which the previous request was started; zero means that the
# first request is never delayed.
_open.previous = 0
371