Package Bio :: Module PubMed
[hide private]
[frames] | [no frames]

Source Code for Module Bio.PubMed

  1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with PubMed from the NCBI (DEPRECATED). 
  8   
  9  This module has been deprecated and is likely to be removed in a future 
 10  release of Biopython.  Please use Bio.Entrez instead, which is described 
 11  in the Biopython Tutorial. 
 12   
 13  See also: 
 14  http://www.ncbi.nlm.nih.gov/PubMed/ 
 15   
 16  Online documentation for linking to PubMed is available at: 
 17  http://www.ncbi.nlm.nih.gov/PubMed/linking.html 
 18   
 19   
 20  Classes: 
 21  Dictionary     Access PubMed articles using a dictionary interface. 
 22   
 23  Functions: 
 24  search_for     Search PubMed. 
 25  find_related   Find related articles in PubMed. 
 26  download_many  Download many articles from PubMed in batch mode. 
 27   
 28  """ 
 29   
 30  import warnings 
 31  warnings.warn("Bio.PubMed has been deprecated, and we intend to remove it in" \ 
 32                +" a future release of Biopython.  Please use Bio.Entrez"\ 
 33                +" instead as described in the Tutorial.  If you need help" \ 
 34                +" with this transition, or wish to continue to use this code,"\ 
 35                +" please get in contact via the mailing lists.", \ 
 36                DeprecationWarning) 
 37   
 38  import re 
 39  import sgmllib 
 40   
 41  from Bio import File 
 42  from Bio import Entrez 
 43  from Bio import Medline 
 44   
45 -class Dictionary:
46 """Access PubMed using a read-only dictionary interface (DEPRECATED). 47 48 Please use Bio.Entrez instead as described in the Biopython Tutorial. 49 """
50 - def __init__(self, parser=None):
51 """Dictionary(parser=None) 52 53 Create a new Dictionary to access PubMed. parser is an optional 54 parser (e.g. Medline.RecordParser) object to change the results 55 into another form. If set to None, then the raw contents of the 56 file will be returned. 57 58 """ 59 self.parser = parser
60
61 - def __len__(self):
62 raise NotImplementedError("PubMed contains lots of entries")
63 - def clear(self):
64 raise NotImplementedError("This is a read-only dictionary")
65 - def __setitem__(self, key, item):
66 raise NotImplementedError("This is a read-only dictionary")
67 - def update(self):
68 raise NotImplementedError("This is a read-only dictionary")
69 - def copy(self):
70 raise NotImplementedError("You don't need to do this...")
71 - def keys(self):
72 raise NotImplementedError("You don't really want to do this...")
73 - def items(self):
74 raise NotImplementedError("You don't really want to do this...")
75 - def values(self):
76 raise NotImplementedError("You don't really want to do this...")
77
78 - def has_key(self, id):
79 """S.has_key(id) -> bool""" 80 try: 81 self[id] 82 except KeyError: 83 return 0 84 return 1
85
86 - def get(self, id, failobj=None):
87 try: 88 return self[id] 89 except KeyError: 90 return failobj
91
92 - def __getitem__(self, id):
93 """S.__getitem__(id) -> object 94 95 Return the Medline entry. id is either the Medline Unique ID 96 or the Pubmed ID of the article. Raises a KeyError if there's an 97 error. 98 99 """ 100 try: 101 handle = Entrez.efetch( 102 db="pubmed", id=id, retmode='text', rettype='medlars') 103 except IOError, x: 104 # raise a KeyError instead of an IOError 105 # XXX I really should distinguish between a real IOError and 106 # if the id is not in the database. 107 raise KeyError(x) 108 if self.parser is not None: 109 return self.parser.parse(handle) 110 return handle.read()
111
def search_for(search, reldate=None, mindate=None, maxdate=None,
               batchsize=100, callback_fn=None, start_id=0, max_ids=None):
    """Search PubMed, returns a list of IDs (DEPRECATED).

    Please use Bio.Entrez instead as described in the Biopython Tutorial.

    Search PubMed and return a list of the PMID's that match the
    criteria.  search is the search string used to search the
    database.  reldate is the number of dates prior to the current
    date to restrict the search.  mindate and maxdate are the dates to
    restrict the search, e.g. 2002/01/01.  batchsize specifies the
    number of ids to return at one time (default 100, maximum 100000).
    callback_fn is an optional callback function that is called with
    each PMID as results are retrieved.  start_id specifies the index
    of the first id to retrieve and max_ids specifies the maximum
    number of id's to retrieve.

    XXX The date parameters don't seem to be working with NCBI's
    script.  Please let me know if you can get it to work.

    """
    params = {
        'db' : 'pubmed',
        'term' : search,
        'reldate' : reldate,
        'mindate' : mindate,
        'maxdate' : maxdate
        }
    #Note that Bio.Entrez can now cope with None arguments (it ignores them)

    ids = []
    while max_ids is None or len(ids) < max_ids:
        # Ask for at most batchsize ids per request, trimmed so we never
        # exceed the caller's max_ids overall.
        start = start_id + len(ids)
        retmax = batchsize
        if max_ids is not None and retmax > max_ids - len(ids):
            retmax = max_ids - len(ids)

        params['retstart'] = start
        params['retmax'] = retmax
        h = Entrez.esearch(**params)
        record = Entrez.read(h)
        idlist = record["IdList"]
        ids.extend(idlist)
        if callback_fn is not None:
            # Call the callback function with each of the new ID's.
            for id in idlist:
                callback_fn(id)
        if len(idlist) < retmax:    # no more id's to read
            break
    return ids
162 186 def start_id(self, attributes): 187 self.in_id = 1 188 def end_id(self): 189 self.in_id = 0 190 def start_link(self, attributes): 191 self.in_link = 1 192 def end_link(self): 193 self.in_link = 0 194 _not_pmid_re = re.compile(r'\D') 195 def handle_data(self, data): 196 if not self.in_link or not self.in_id: 197 return 198 # Everything here should be a PMID. Check and make sure 199 # data really is one. A PMID should be a string consisting 200 # of only integers. Should I check to make sure it 201 # meets a certain minimum length? 202 if self._not_pmid_re.search(data): 203 raise ValueError(\ 204 "I expected an ID, but '%s' doesn't look like one." % \ 205 repr(data)) 206 self.ids.append(data) 207 208 parser = ResultParser() 209 if type(pmid) is type([]): 210 pmid = ','.join(pmid) 211 h = Entrez.elink(dbfrom='pubmed', id=pmid) 212 parser.feed(h.read()) 213 return parser.ids 214
def download_many(ids, callback_fn, broken_fn=None,
                  batchsize=500, parser=None):
    """Download multiple PubMed records, no return value (DEPRECATED).

    Please use Bio.Entrez instead as described in the Biopython Tutorial.

    Download many records from PubMed.  ids is a list of either the
    Medline Unique ID or the PubMed ID's of the articles.  Each time a
    record is downloaded, callback_fn is called with two arguments: the
    record's id and the downloaded record (raw text, or the parsed form
    if a parser was supplied).  broken_fn is an optional function that
    is called with the id of records that were not able to be
    downloaded.  batchsize is the number of records to request each
    time (between 1 and 500).

    Raises ValueError if batchsize is out of range.

    """
    # parser is an undocumented parameter that allows people to
    # specify an optional parser to handle each record.  This is
    # dangerous because the results may be malformed, and exceptions
    # in the parser may disrupt the whole download process.
    if batchsize > 500 or batchsize < 1:
        raise ValueError("batchsize must be between 1 and 500")
    current_batchsize = batchsize

    # Loop until all the ids are processed.  We want to process as
    # many as possible with each request.  Unfortunately, errors can
    # occur.  Some id may be incorrect, or the server may be
    # unresponsive.  In addition, one broken id out of a list of id's
    # can cause a non-specific error.  Thus, the strategy I'm going to
    # take, is to start by downloading as many as I can.  If the
    # request fails, I'm going to half the number of records I try to
    # get.  If there's only one more record, then I'll report it as
    # broken and move on.  If the request succeeds, I'll double the
    # number of records until I get back up to the batchsize.
    nsuccesses = 0
    while ids:
        # Never request more ids than remain to be downloaded.
        if current_batchsize > len(ids):
            current_batchsize = len(ids)

        id_str = ','.join(ids[:current_batchsize])

        try:
            # Query PubMed.  If one or more of the id's are broken,
            # this will raise an IOError.
            handle = Entrez.efetch(
                db="pubmed", id=id_str, retmode='text', rettype='medlars')

            # I'm going to check to make sure PubMed returned the same
            # number of id's as I requested.  If it didn't then I'm going
            # to raise an exception.  This could take a lot of memory if
            # the batchsize is large.
            results = handle.read()
            num_ids = 0
            for x in Medline.Iterator(File.StringHandle(results)):
                num_ids = num_ids + 1
            if num_ids != current_batchsize:
                raise IOError
            # Re-wrap the already-read text so it can be iterated again
            # below for the callback pass.
            handle = File.StringHandle(results)
        except IOError:   # Query did not work.
            if current_batchsize == 1:
                # There was only 1 id in the query.  Report it as
                # broken and move on.
                id = ids.pop(0)
                if broken_fn is not None:
                    broken_fn(id)
            else:
                # I don't know which one is broken.  Try again with
                # fewer id's.
                # NOTE: relies on Python 2 integer division, so the
                # batch size stays an int and eventually reaches 1.
                current_batchsize = current_batchsize / 2
                # Reset the success streak that drives re-growth below.
                nsuccesses = 0
            continue
        nsuccesses = nsuccesses + 1

        # Iterate through the results and pass the records to the
        # callback.  Records come back in request order, so ids[idnum]
        # is the id that produced rec.
        idnum = 0
        for rec in Medline.Iterator(handle, parser):
            callback_fn(ids[idnum], rec)
            idnum = idnum + 1

        ids = ids[current_batchsize:]

        # If I'm not downloading the maximum number of articles,
        # double the number for next time.
        if nsuccesses >= 2 and current_batchsize < batchsize:
            current_batchsize = current_batchsize * 2
            if current_batchsize > batchsize:
                current_batchsize = batchsize
301