Package Bio :: Package Entrez
[hide private]
[frames] | no frames]

Source Code for Package Bio.Entrez

  1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Provides code to access NCBI over the WWW. 
  8   
  9  The main Entrez web page is available at: 
 10  http://www.ncbi.nlm.nih.gov/Entrez/ 
 11   
 12  A list of the Entrez utilities is available at: 
 13  http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html 
 14   
 15   
 16  Functions: 
 17  efetch       Retrieves records in the requested format from a list of one or 
 18               more primary IDs or from the user's environment 
 19  epost        Posts a file containing a list of primary IDs for future use in 
 20               the user's environment to use with subsequent search strategies 
 21  esearch      Searches and retrieves primary IDs (for use in EFetch, ELink, 
 22               and ESummary) and term translations and optionally retains 
 23               results for future use in the user's environment. 
 24  elink        Checks for the existence of an external or Related Articles link 
 25               from a list of one or more primary IDs.  Retrieves primary IDs 
 26               and relevancy scores for links to Entrez databases or Related 
 27               Articles;  creates a hyperlink to the primary LinkOut provider 
 28               for a specific ID and database, or lists LinkOut URLs 
 29               and Attributes for multiple IDs. 
 30  einfo        Provides field index term counts, last update, and available 
 31               links for each database. 
 32  esummary     Retrieves document summaries from a list of primary IDs or from 
 33               the user's environment. 
 34  egquery      Provides Entrez database counts in XML for a single search 
 35               using Global Query. 
 36  espell       Retrieves spelling suggestions. 
 37   
 38  read         Parses the XML results returned by any of the above functions. 
 39               Typical usage is: 
 40               >>> handle = Entrez.einfo() # or esearch, efetch, ... 
 41               >>> record = Entrez.read(handle) 
 42               where record is now a Python dictionary or list. 
 43   
 44  _open        Internally used function. 
 45   
 46  """ 
 47  import urllib, time, warnings 
 48  import os.path 
 49  from Bio import File 
 50   
 51   
 52  email = None 
 53   
def query(cmd, db, cgi='http://www.ncbi.nlm.nih.gov/sites/entrez',
          **keywds):
    """Emit a deprecation warning for the old Entrez query (DEPRECATED).

    This function was documented as querying Entrez and returning a handle
    to the HTML results.  Doing so breaks NCBI's rule to only use the
    E-Utilities URL, so the function is deprecated: it issues a
    DeprecationWarning, sends no request, and returns None.

    NOTE(review): no request code is present in this function; confirm it
    was removed on purpose rather than lost.

    The parameters were described in the online documentation at:
    http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helplinks.chapter.linkshelp

    Arguments:
    cmd, db, cgi, keywds -- accepted so that existing calls keep working;
                            none of them is used.

    Returns None.
    """
    # stacklevel=2 attributes the warning to the code that called query(),
    # which is the code that needs to be changed.
    warnings.warn("Bio.Entrez.query is deprecated, since it breaks NCBI's "
                  "rule to only use the E-Utilities URL.",
                  DeprecationWarning, stacklevel=2)
67 68 # XXX retmode?
def epost(db, cgi=None, **keywds):
    """Post a file of identifiers for future use.

    Posts a file containing a list of UIs for future use in the user's
    environment to use with subsequent search strategies.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    Arguments:
    db     -- sent to EPost as the 'db' parameter.
    cgi    -- deprecated.  A true value triggers a DeprecationWarning and is
              otherwise ignored; the NCBI EPost URL is always used.
    keywds -- any further EPost parameters, forwarded unchanged.

    Return a handle to the results (whatever _open returns).

    Raises an IOError exception if there's a network error.
    """
    if cgi:
        warnings.warn("Using a URL other than NCBI's main url for the "
                      "E-Utilities is deprecated.", DeprecationWarning)
    # The caller-supplied URL is never used: always talk to NCBI's EPost.
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
    params = dict({'db': db}, **keywds)
    return _open(url, params)
89
def efetch(db, cgi=None, **keywds):
    """Fetches Entrez results which are returned as a handle.

    EFetch retrieves records in the requested format from a list of one or
    more UIs or from user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    Arguments:
    db     -- sent to EFetch as the 'db' parameter.
    cgi    -- deprecated.  A true value triggers a DeprecationWarning and is
              otherwise ignored; the NCBI EFetch URL is always used.
    keywds -- any further EFetch parameters, forwarded unchanged.

    Return a handle to the results (whatever _open returns).

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="genbank")
    print handle.read()
    """
    if cgi:
        warnings.warn("Using a URL other than NCBI's main url for the "
                      "E-Utilities is deprecated.", DeprecationWarning)
    # The caller-supplied URL is never used: always talk to NCBI's EFetch.
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    params = dict({'db': db}, **keywds)
    return _open(url, params)
116
def esearch(db, term, cgi=None, **keywds):
    """ESearch runs an Entrez search and returns a handle to the results.

    ESearch searches and retrieves primary IDs (for use in EFetch, ELink
    and ESummary) and term translations, and optionally retains results
    for future use in the user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    Arguments:
    db     -- sent to ESearch as the 'db' parameter.
    term   -- sent to ESearch as the 'term' parameter.
    cgi    -- deprecated.  A true value triggers a DeprecationWarning and is
              otherwise ignored; the NCBI ESearch URL is always used.
    keywds -- any further ESearch parameters, forwarded unchanged.

    Return a handle to the results which are always in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    handle = Entrez.esearch(db="nucleotide", retmax=10, term="Opuntia")
    record = Entrez.read(handle)
    print record["Count"]
    print record["IdList"]
    """
    if cgi:
        warnings.warn("Using a URL other than NCBI's main url for the "
                      "E-Utilities is deprecated.", DeprecationWarning)
    # The caller-supplied URL is never used: always talk to NCBI's ESearch.
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    params = dict({'db': db, 'term': term}, **keywds)
    return _open(url, params)
147 171
def einfo(cgi=None, **keywds):
    """EInfo returns a summary of the Entrez databases as a results handle.

    EInfo provides field names, index term counts, last update, and
    available links for each Entrez database.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    Arguments:
    cgi    -- deprecated.  A true value triggers a DeprecationWarning and is
              otherwise ignored; the NCBI EInfo URL is always used.
    keywds -- the EInfo parameters, forwarded unchanged.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    record = Entrez.read(Entrez.einfo())
    print record['DbList']
    """
    if cgi:
        warnings.warn("Using a URL other than NCBI's main url for the "
                      "E-Utilities is deprecated.", DeprecationWarning)
    # The caller-supplied URL is never used: always talk to NCBI's EInfo.
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
    # Hand _open its own dictionary rather than the keyword mapping itself.
    params = dict(keywds)
    return _open(url, params)
198
def esummary(cgi=None, **keywds):
    """ESummary retrieves document summaries as a results handle.

    ESummary retrieves document summaries from a list of primary IDs or
    from the user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    Arguments:
    cgi    -- deprecated.  A true value triggers a DeprecationWarning and is
              otherwise ignored; the NCBI ESummary URL is always used.
    keywds -- the ESummary parameters, forwarded unchanged.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.
    """
    if cgi:
        warnings.warn("Using a URL other than NCBI's main url for the "
                      "E-Utilities is deprecated.", DeprecationWarning)
    # The caller-supplied URL is never used: always talk to NCBI's ESummary.
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    # Hand _open its own dictionary rather than the keyword mapping itself.
    params = dict(keywds)
    return _open(url, params)
219
def egquery(cgi=None, **keywds):
    """EGQuery provides Entrez database counts for a global search.

    EGQuery provides Entrez database counts in XML for a single search
    using Global Query.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    Arguments:
    cgi    -- deprecated.  A true value triggers a DeprecationWarning and is
              otherwise ignored; the NCBI EGQuery URL is always used.
    keywds -- the EGQuery parameters, forwarded unchanged.

    Return a handle to the results in XML format.

    Raises an IOError exception if there's a network error.
    """
    if cgi:
        warnings.warn("Using a URL other than NCBI's main url for the "
                      "E-Utilities is deprecated.", DeprecationWarning)
    # The caller-supplied URL is never used: always talk to NCBI's EGQuery.
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
    # Hand _open its own dictionary rather than the keyword mapping itself.
    params = dict(keywds)
    return _open(url, params)
240
def espell(cgi=None, **keywds):
    """ESpell retrieves spelling suggestions, returned in a results handle.

    ESpell retrieves spelling suggestions, if available.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    Arguments:
    cgi    -- deprecated.  A true value triggers a DeprecationWarning and is
              otherwise ignored; the NCBI ESpell URL is always used.
    keywds -- the ESpell parameters, forwarded unchanged.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    record = Entrez.read(Entrez.espell(term="biopythooon"))
    print record["Query"]
    print record["CorrectedQuery"]
    """
    if cgi:
        warnings.warn("Using a URL other than NCBI's main url for the "
                      "E-Utilities is deprecated.", DeprecationWarning)
    # The caller-supplied URL is never used: always talk to NCBI's ESpell.
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
    # Hand _open its own dictionary rather than the keyword mapping itself.
    params = dict(keywds)
    return _open(url, params)
267
def read(handle):
    """Parses an XML file from the NCBI Entrez Utilities into python objects.

    This function parses an XML file created by NCBI's Entrez Utilities,
    returning a multilevel data structure of Python lists and dictionaries.
    Most XML files returned by NCBI's Entrez Utilities can be parsed by
    this function, provided its DTD is available. Biopython includes the
    DTDs for most commonly used Entrez Utilities.

    Whereas the data structure seems to consist of generic Python lists,
    dictionaries, strings, and so on, each of these is actually a class
    derived from the base type. This allows us to store the attributes
    (if any) of each element in a dictionary my_element.attributes, and
    the tag name in my_element.tag.

    Arguments:
    handle -- the handle to parse; it is passed straight to
              DataHandler.run.

    Returns whatever DataHandler.run returns for the handle.
    """
    # Imported here rather than at module level, as in the original code.
    from Parser import DataHandler
    # The DTDs live in the "DTDs" directory inside this package.
    dtd_directory = os.path.join(__path__[0], "DTDs")
    return DataHandler(dtd_directory).run(handle)
288
def _open(cgi, params=None):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  Does some
    simple error checking, and will raise an IOError if it encounters one.

    This function also enforces the "three second rule" to avoid abusing
    the NCBI servers.

    Arguments:
    cgi    -- URL of the E-Utilities script; the encoded options are
              appended to it after a "?".
    params -- dictionary of options (default: none).  Entries whose value
              is None are left out of the request.  A "tool" entry of
              "biopython" is added when absent, and an "email" entry is
              added from the module-level email variable when that is set
              and no "email" was supplied.  The dictionary passed in is
              never modified.

    Returns the response wrapped in a File.UndoHandle, with the lines that
    were inspected for errors pushed back so the caller sees them again.

    Raises IOError if the first five lines of the response contain one of
    the known NCBI error messages.
    """
    # NCBI requirement: At least three seconds between queries
    delay = 3.0
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # Build a private copy of the options, skipping None values.  Working
    # on a copy keeps the caller's dictionary untouched and stops "tool"
    # and "email" from leaking between calls through a shared default.
    query = {}
    if params is not None:
        for key, value in params.items():
            if value is not None:
                query[key] = value
    # Tell Entrez that we are using Biopython
    if "tool" not in query:
        query["tool"] = "biopython"
    # Tell Entrez who we are
    if "email" not in query and email is not None:
        query["email"] = email

    # Open a handle to Entrez.
    url = cgi + "?" + urllib.urlencode(query, doseq=True)
    handle = urllib.urlopen(url)

    # Wrap the handle inside an UndoHandle.
    uhandle = File.UndoHandle(handle)

    # Check for errors in the first 5 lines.
    # This is kind of ugly.
    lines = []
    for i in range(5):
        lines.append(uhandle.readline())
    # Push the lines back last-to-first, as the original code did.
    for line in lines[::-1]:
        uhandle.saveline(line)
    data = ''.join(lines)

    if "500 Proxy Error" in data:
        # Sometimes Entrez returns a Proxy Error instead of results
        raise IOError("500 Proxy Error (NCBI busy?)")
    elif "502 Proxy Error" in data:
        raise IOError("502 Proxy Error (NCBI busy?)")
    elif "WWW Error 500 Diagnostic" in data:
        raise IOError("WWW Error 500 Diagnostic (NCBI busy?)")
    elif data.startswith("Error:"):
        # e.g. 'Error: Your session has expired. Please repeat your search.\n'
        raise IOError(data.strip())
    elif data.startswith("The resource is temporarily unavailable"):
        # This can occur with an invalid query_key
        # Perhaps this should be a ValueError?
        raise IOError("The resource is temporarily unavailable")
    elif data.startswith("download dataset is empty"):
        # This can occur when omit the identifier, or the WebEnv and query_key
        # Perhaps this should be a ValueError?
        raise IOError("download dataset is empty")
    elif data[:5] == "ERROR":
        # XXX Possible bug here, because I don't know whether this really
        # occurs on the first line.  I need to check this!
        raise IOError("ERROR, possibly because id not available?")
    # Should I check for 404?  timeout?  etc?
    return uhandle


# Time of the most recent request, used by _open for the three second rule.
# Starting at 0 means the first request is never delayed.
_open.previous = 0