Package Bio :: Package Entrez
[hide private]
[frames] | no frames]

Source Code for Package Bio.Entrez

  1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Provides code to access NCBI over the WWW. 
  8   
  9  The main Entrez web page is available at: 
 10  http://www.ncbi.nlm.nih.gov/Entrez/ 
 11   
 12  A list of the Entrez utilities is available at: 
 13  http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html 
 14   
 15   
 16  Functions: 
 17  efetch       Retrieves records in the requested format from a list of one or 
 18               more primary IDs or from the user's environment 
 19  epost        Posts a file containing a list of primary IDs for future use in 
 20               the user's environment to use with subsequent search strategies 
 21  esearch      Searches and retrieves primary IDs (for use in EFetch, ELink, 
 22               and ESummary) and term translations and optionally retains 
 23               results for future use in the user's environment. 
 24  elink        Checks for the existence of an external or Related Articles link 
 25               from a list of one or more primary IDs.  Retrieves primary IDs 
 26               and relevancy scores for links to Entrez databases or Related 
 27               Articles;  creates a hyperlink to the primary LinkOut provider 
 28               for a specific ID and database, or lists LinkOut URLs 
 29               and Attributes for multiple IDs. 
 30  einfo        Provides field index term counts, last update, and available 
 31               links for each database. 
 32  esummary     Retrieves document summaries from a list of primary IDs or from 
 33               the user's environment. 
 34  egquery      Provides Entrez database counts in XML for a single search 
 35               using Global Query. 
 36  espell       Retrieves spelling suggestions. 
 37   
 38  read         Parses the XML results returned by any of the above functions. 
 39               Typical usage is: 
 40               >>> handle = Entrez.einfo() # or esearch, efetch, ... 
 41               >>> record = Entrez.read(handle) 
 42               where record is now a Python dictionary or list. 
 43   
 44  _open        Internally used function. 
 45   
 46  """ 
 47  import urllib, time, warnings 
 48  import os.path 
 49  from Bio import File 
 50   
 51   
 52  email = None 
 53   
 54   
 55  # XXX retmode? 
def epost(db, **keywds):
    """Post a list of identifiers to the Entrez history server.

    EPost uploads a list of UIs so that they can be used in the user's
    environment with subsequent search strategies.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    db is the Entrez database name; any further keyword arguments are
    passed on to the EPost CGI script unchanged.

    Return a handle to the results.

    Raises an IOError exception if there's a network error.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
    # EPost is the only utility here that is submitted via HTTP POST.
    return _open(url, dict(db=db, **keywds), post=True)
73
def efetch(db, **keywds):
    """Fetches Entrez results which are returned as a handle.

    EFetch retrieves records in the requested format from a list of one or
    more UIs or from user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    db is the Entrez database name; any further keyword arguments are
    passed on to the EFetch CGI script.  The unofficial return type
    "genbank" is replaced by "gp" (protein database) or "gb" (any other
    database) and a DeprecationWarning is issued.  A return type of None
    is left untouched here and dropped later by _open, like any other
    None-valued parameter.

    Return a handle to the results.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb")
    print handle.read()
    """
    # Iterate over a snapshot because matching entries are rewritten in place.
    for key, value in list(keywds.items()):
        # The parameter name is matched case-insensitively.  None means
        # "not specified" and must not be sent through .lower().
        if key.lower() != "rettype" or value is None:
            continue
        if value.lower() == "genbank":
            warnings.warn('As of Easter 2009, Entrez EFetch no longer '
                          'supports the unofficial return type "genbank", '
                          'use "gb" or "gp" instead.', DeprecationWarning)
            if db.lower() == "protein":
                keywds[key] = "gp"  # GenPept
            else:
                keywds[key] = "gb"  # GenBank
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    variables = {'db': db}
    variables.update(keywds)
    return _open(cgi, variables)
106
def esearch(db, term, **keywds):
    """Run an Entrez search and return a handle to the results.

    ESearch searches and retrieves primary IDs (for use in EFetch, ELink
    and ESummary) and term translations, and optionally retains results
    for future use in the user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    db is the Entrez database name and term the search expression; any
    further keyword arguments are passed on to the ESearch CGI script.

    Return a handle to the results which are always in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    handle = Entrez.esearch(db="nucleotide", retmax=10, term="Opuntia")
    record = Entrez.read(handle)
    print record["Count"]
    print record["IdList"]
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    return _open(url, dict(db=db, term=term, **keywds))
134 155
def einfo(**keywds):
    """Return a summary of the Entrez databases as a results handle.

    EInfo provides field names, index term counts, last update, and
    available links for each Entrez database.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    All keyword arguments are passed on to the EInfo CGI script.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    record = Entrez.read(Entrez.einfo())
    print record['DbList']
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
    return _open(url, dict(keywds))
179
def esummary(**keywds):
    """Retrieve document summaries as a results handle.

    ESummary retrieves document summaries from a list of primary IDs or
    from the user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    All keyword arguments are passed on to the ESummary CGI script.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    return _open(url, dict(keywds))
197
def egquery(**keywds):
    """Provide Entrez database counts for a global search.

    EGQuery provides Entrez database counts in XML for a single search
    using Global Query.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    All keyword arguments are passed on to the EGQuery CGI script.

    Return a handle to the results in XML format.

    Raises an IOError exception if there's a network error.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
    return _open(url, dict(keywds))
215
def espell(**keywds):
    """Retrieve spelling suggestions, returned in a results handle.

    ESpell retrieves spelling suggestions, if available.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    All keyword arguments are passed on to the ESpell CGI script.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    record = Entrez.read(Entrez.espell(term="biopythooon"))
    print record["Query"]
    print record["CorrectedQuery"]
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
    return _open(url, dict(keywds))
239
def read(handle):
    """Parse an XML file from the NCBI Entrez Utilities into Python objects.

    This function parses an XML file created by NCBI's Entrez Utilities,
    returning a multilevel data structure of Python lists and dictionaries.
    Most XML files returned by NCBI's Entrez Utilities can be parsed by
    this function, provided its DTD is available. Biopython includes the
    DTDs for most commonly used Entrez Utilities.

    Whereas the data structure seems to consist of generic Python lists,
    dictionaries, strings, and so on, each of these is actually a class
    derived from the base type. This allows us to store the attributes
    (if any) of each element in a dictionary my_element.attributes, and
    the tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from Parser import DataHandler
    # The DTDs are looked up in the "DTDs" directory inside this package.
    dtd_directory = os.path.join(__path__[0], "DTDs")
    return DataHandler(dtd_directory).read(handle)
260
def parse(handle):
    """Parse an Entrez XML handle using DataHandler.parse.

    Works like read, but hands the handle to the parse method of the
    DataHandler (instead of its read method) and returns whatever that
    method returns.
    """
    # Imported here rather than at module level, as in the original code.
    from Parser import DataHandler
    # The DTDs are looked up in the "DTDs" directory inside this package.
    dtd_directory = os.path.join(__path__[0], "DTDs")
    return DataHandler(dtd_directory).parse(handle)
267
def _open(cgi, params=None, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it; entries whose
    value is None are left out of the request.  The dictionary passed in
    is not modified: the query is built from a private copy.  If post is
    true the options are sent with HTTP POST, otherwise they are appended
    to the URL and sent with HTTP GET.

    A "tool" parameter ("biopython") is added unless the caller supplied
    one.  An "email" parameter is added from the module level variable
    email unless the caller supplied one; if neither is available a
    UserWarning is issued.

    Returns the response wrapped in a File.UndoHandle, positioned at the
    start of the data.

    Does some simple error checking on the first seven lines of the
    response, and will raise an IOError if it encounters a known error
    message there.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Build a private copy of the parameters without the None values, so
    # that neither the caller's dictionary nor a shared default is changed.
    if params is None:
        params = {}
    params = dict((key, value) for key, value in params.items()
                  if value is not None)
    # Tell Entrez that we are using Biopython
    if "tool" not in params:
        params["tool"] = "biopython"
    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI strongly recommends you to specify
your email address with each request. From June 1, 2010, this will be
mandatory. As an example, if your email address is A.N.Other@example.com, you
can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)
    # Open a handle to Entrez.
    options = urllib.urlencode(params, doseq=True)
    if post:
        # HTTP POST
        handle = urllib.urlopen(cgi, data=options)
    else:
        # HTTP GET
        handle = urllib.urlopen(cgi + "?" + options)

    # Wrap the handle inside an UndoHandle, so that the lines inspected
    # below can be pushed back for the caller to read again.
    uhandle = File.UndoHandle(handle)

    # Check for errors in the first 7 lines.
    # This is kind of ugly.
    lines = [uhandle.readline() for _ in range(7)]
    # Push the lines back last-to-first, as the original code did.
    for line in reversed(lines):
        uhandle.saveline(line)
    data = ''.join(lines)

    if "500 Proxy Error" in data:
        # Sometimes Entrez returns a Proxy Error instead of results
        raise IOError("500 Proxy Error (NCBI busy?)")
    elif "502 Proxy Error" in data:
        raise IOError("502 Proxy Error (NCBI busy?)")
    elif "WWW Error 500 Diagnostic" in data:
        raise IOError("WWW Error 500 Diagnostic (NCBI busy?)")
    elif "<title>Service unavailable!</title>" in data:
        # Probably later in the file it will say "Error 503"
        raise IOError("Service unavailable!")
    elif "<title>Bad Gateway!</title>" in data:
        # Probably later in the file it will say:
        #   "The proxy server received an invalid
        #    response from an upstream server."
        raise IOError("Bad Gateway!")
    elif "<title>414 Request-URI Too Large</title>" in data \
            or "<h1>Request-URI Too Large</h1>" in data:
        raise IOError("Requested URL too long (try using EPost?)")
    elif data.startswith("Error:"):
        # e.g. 'Error: Your session has expired. Please repeat your search.\n'
        raise IOError(data.strip())
    elif data.startswith("The resource is temporarily unavailable"):
        # This can occur with an invalid query_key
        # Perhaps this should be a ValueError?
        raise IOError("The resource is temporarily unavailable")
    elif data.startswith("download dataset is empty"):
        # This can occur when omit the identifier, or the WebEnv and query_key
        # Perhaps this should be a ValueError?
        raise IOError("download dataset is empty")
    elif data.startswith("ERROR"):
        # XXX Possible bug here, because I don't know whether this really
        # occurs on the first line.  I need to check this!
        raise IOError("ERROR, possibly because id not available?")
    # Should I check for 404?  timeout?  etc?
    return uhandle


# Time stamp (as returned by time.time()) of the most recent query; used by
# _open to space out consecutive requests.  Zero means no query was made yet.
_open.previous = 0