1
2
3
4
5
6 """
7 This module provides code to work with PubMed from the NCBI (DEPRECATED).
8
9 This module has been deprecated and is likely to be removed in a future
10 release of Biopython. Please use Bio.Entrez instead, which is described
11 in the Biopython Tutorial.
12
13 See also:
14 http://www.ncbi.nlm.nih.gov/PubMed/
15
16 Online documentation for linking to PubMed is available at:
17 http://www.ncbi.nlm.nih.gov/PubMed/linking.html
18
19
20 Classes:
21 Dictionary Access PubMed articles using a dictionary interface.
22
23 Functions:
24 search_for Search PubMed.
25 find_related Find related articles in PubMed.
26 download_many Download many articles from PubMed in batch mode.
27
28 """
29
30 import warnings
# Emitted once at import time so that every user of this module is told to
# migrate to Bio.Entrez before the module is removed.
warnings.warn(
    "Bio.PubMed has been deprecated, and we intend to remove it in a future "
    "release of Biopython. Please use Bio.Entrez instead as described in the "
    "Tutorial. If you need help with this transition, or wish to continue to "
    "use this code, please get in contact via the mailing lists.",
    DeprecationWarning)
37
38 import re
39 import sgmllib
40
41 from Bio import File
42 from Bio import Entrez
43 from Bio import Medline
44
46 """Access PubMed using a read-only dictionary interface (DEPRECATED).
47
48 Please use Bio.Entrez instead as described in the Biopython Tutorial.
49 """
51 """Dictionary(parser=None)
52
53 Create a new Dictionary to access PubMed. parser is an optional
54 parser (e.g. Medline.RecordParser) object to change the results
55 into another form. If set to None, then the raw contents of the
56 file will be returned.
57
58 """
59 self.parser = parser
60
62 raise NotImplementedError("PubMed contains lots of entries")
64 raise NotImplementedError("This is a read-only dictionary")
66 raise NotImplementedError("This is a read-only dictionary")
68 raise NotImplementedError("This is a read-only dictionary")
70 raise NotImplementedError("You don't need to do this...")
72 raise NotImplementedError("You don't really want to do this...")
74 raise NotImplementedError("You don't really want to do this...")
76 raise NotImplementedError("You don't really want to do this...")
77
79 """S.has_key(id) -> bool"""
80 try:
81 self[id]
82 except KeyError:
83 return 0
84 return 1
85
def get(self, id, failobj=None):
    """S.get(id[, failobj]) -> object

    Look up id via self[id] and return the result.  If the lookup raises
    KeyError, return failobj (None by default) instead.

    """
    try:
        entry = self[id]
    except KeyError:
        entry = failobj
    return entry
91
93 """S.__getitem__(id) -> object
94
95 Return the Medline entry. id is either the Medline Unique ID
96 or the Pubmed ID of the article. Raises a KeyError if there's an
97 error.
98
99 """
100 try:
101 handle = Entrez.efetch(
102 db="pubmed", id=id, retmode='text', rettype='medlars')
103 except IOError, x:
104
105
106
107 raise KeyError(x)
108 if self.parser is not None:
109 return self.parser.parse(handle)
110 return handle.read()
111
def search_for(search, reldate=None, mindate=None, maxdate=None,
               batchsize=100, callback_fn=None, start_id=0, max_ids=None):
    """Search PubMed, returns a list of IDs (DEPRECATED).

    Please use Bio.Entrez instead as described in the Biopython Tutorial.

    Search PubMed and return a list of the PMIDs that match the
    criteria.  search is the search string used to search the
    database.  reldate is the number of days prior to the current
    date to restrict the search.  mindate and maxdate are the dates to
    restrict the search, e.g. 2002/01/01.  batchsize specifies the
    number of ids to request from the server at one time (default 100);
    it must be at least 1, otherwise a ValueError is raised.
    callback_fn is an optional callback function that will be called
    with each PMID as results are retrieved.  start_id specifies the
    index of the first id to retrieve and max_ids specifies the maximum
    number of ids to retrieve (None means no limit).

    XXX The date parameters don't seem to be working with NCBI's
    script.  Please let me know if you can get it to work.

    """
    # A batch size below 1 would request nothing from the server and the
    # "short batch" termination test below could never succeed, so the
    # loop would spin forever.  Reject it up front, as download_many does.
    if batchsize < 1:
        raise ValueError("batchsize must be at least 1")

    params = {
        'db': 'pubmed',
        'term': search,
        'reldate': reldate,
        'mindate': mindate,
        'maxdate': maxdate,
        }

    ids = []
    while max_ids is None or len(ids) < max_ids:
        # Never ask for more ids than the caller still wants.
        wanted = batchsize
        if max_ids is not None:
            wanted = min(wanted, max_ids - len(ids))

        params['retstart'] = start_id + len(ids)
        params['retmax'] = wanted
        handle = Entrez.esearch(**params)
        try:
            record = Entrez.read(handle)
        finally:
            # Release the network connection even if parsing fails.
            handle.close()
        idlist = record["IdList"]
        ids.extend(idlist)
        if callback_fn is not None:
            for pmid in idlist:
                callback_fn(pmid)
        # A short batch means the server has no further results.
        if len(idlist) < wanted:
            break
    return ids
162
def start_id(self, attributes):
    """Set the in_id flag; handle_data only collects text while it is set.

    NOTE(review): presumably invoked by the SGML parser base class when an
    opening "id" tag is seen -- the class header is not visible here, confirm.
    The attributes argument is unused.
    """
    self.in_id = 1
def end_id(self):
    """Clear the in_id flag so handle_data stops collecting text.

    NOTE(review): presumably invoked by the SGML parser base class when a
    closing "id" tag is seen -- the class header is not visible here, confirm.
    """
    self.in_id = 0
def start_link(self, attributes):
    """Set the in_link flag; handle_data only collects text while it is set.

    NOTE(review): presumably invoked by the SGML parser base class when an
    opening "link" tag is seen -- the class header is not visible here,
    confirm.  The attributes argument is unused.
    """
    self.in_link = 1
def end_link(self):
    """Clear the in_link flag so handle_data stops collecting text.

    NOTE(review): presumably invoked by the SGML parser base class when a
    closing "link" tag is seen -- the class header is not visible here,
    confirm.
    """
    self.in_link = 0
194 _not_pmid_re = re.compile(r'\D')
def handle_data(self, data):
    """Collect data as an id when both the in_link and in_id flags are set.

    Text seen while either flag is unset is ignored.  Otherwise the text
    must consist solely of digits (checked with self._not_pmid_re); if it
    does not, ValueError is raised, else it is appended to self.ids.

    """
    collecting = self.in_link and self.in_id
    if collecting:
        if self._not_pmid_re.search(data) is not None:
            message = "I expected an ID, but '%s' doesn't look like one." % \
                      repr(data)
            raise ValueError(message)
        self.ids.append(data)
207
208 parser = ResultParser()
209 if type(pmid) is type([]):
210 pmid = ','.join(pmid)
211 h = Entrez.elink(dbfrom='pubmed', id=pmid)
212 parser.feed(h.read())
213 return parser.ids
214
def download_many(ids, callback_fn, broken_fn=None,
                  batchsize=500, parser=None):
    """Download multiple PubMed records, no return value (DEPRECATED).

    Please use Bio.Entrez instead as described in the Biopython Tutorial.

    Download many records from PubMed.  ids is a sequence of either the
    Medline Unique IDs or the PubMed IDs of the articles (as strings);
    it is copied, so the caller's list is never modified.  Each time a
    record is downloaded, callback_fn is called as callback_fn(id, record).
    broken_fn is an optional function that is called with the id of each
    record that could not be downloaded.  batchsize is the number of
    records to request each time and must be between 1 and 500,
    otherwise a ValueError is raised.  parser is passed on to
    Medline.Iterator when producing the records given to callback_fn.

    """
    if batchsize > 500 or batchsize < 1:
        raise ValueError("batchsize must be between 1 and 500")

    # Work on a private copy: broken ids are popped off the front below,
    # which must not alter the list object owned by the caller.
    ids = list(ids)
    current_batchsize = batchsize

    # Adaptive batching: when a request fails, halve the batch size and
    # retry; once the size is down to 1 the offending id is reported as
    # broken.  After two consecutive successes the size is doubled again,
    # up to the requested batchsize.
    nsuccesses = 0
    while ids:
        if current_batchsize > len(ids):
            current_batchsize = len(ids)
        batch = ids[:current_batchsize]

        try:
            handle = Entrez.efetch(
                db="pubmed", id=','.join(batch),
                retmode='text', rettype='medlars')
            try:
                results = handle.read()
            finally:
                # Release the network connection as soon as the text is read.
                handle.close()
            # Check that every requested record came back; a short reply
            # is treated exactly like a failed request.
            num_records = 0
            for _ in Medline.Iterator(File.StringHandle(results)):
                num_records = num_records + 1
            if num_records != len(batch):
                raise IOError
        except IOError:
            if current_batchsize == 1:
                # A single id that cannot be fetched is broken; drop it.
                broken_id = ids.pop(0)
                if broken_fn is not None:
                    broken_fn(broken_id)
            else:
                # Floor division keeps the batch size an integer under
                # true division as well (it is used as a slice index).
                current_batchsize = current_batchsize // 2
            nsuccesses = 0
            continue
        nsuccesses = nsuccesses + 1

        # Re-iterate over the downloaded text, this time with the caller's
        # parser, pairing each record with the id it was requested under.
        records = Medline.Iterator(File.StringHandle(results), parser)
        for pmid, rec in zip(batch, records):
            callback_fn(pmid, rec)

        ids = ids[current_batchsize:]

        # Things are going well again; grow the batch back towards the
        # size originally requested.
        if nsuccesses >= 2 and current_batchsize < batchsize:
            current_batchsize = min(current_batchsize * 2, batchsize)
301