1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """
22 Programmatic access to the PubMed database
23 """
24
25
26
27
28
29 import logging, urllib
30 import datetime
31
32 from gettext import gettext as _
33
34 from Pyblio import Compat
35
36 from twisted.web import client
37 from twisted.internet import defer, reactor
38
39 from Pyblio.Exceptions import QueryError
40 from Pyblio.External.HTTP import HTTPRetrieve
41 from Pyblio.Parsers.Semantic.PubMed import Reader
42
43
def _xml(data):
    """Parse a raw server response and fail fast on reported errors.

    The payload is parsed as XML. When the resulting tree holds an
    ``ERROR`` element directly below its root, the text of that element
    is raised as a QueryError; otherwise the parsed tree is returned.
    """
    document = Compat.ElementTree.XML(data)

    error = document.find('./ERROR')
    if error is None:
        return document

    raise QueryError(error.text)
54
56
# Lookup tables for the advanced query form. In each table the first
# element (dict key, or first item of a pair) is the value handed to
# makeQuery(); the second one is its translated, user-visible label.

# Search fields: the key is the tag put between brackets after the keyword.
query_fields = {
    'ALL': _('All Fields'),
    'AD': _('Affiliation'),
    'AU': _('Author Name'),
    'RN': _('EC/RN Number'),
    'EDAT': _('Entrez Date'),
    'FILTER': _('Filter'),
    'IP': _('Issue'),
    'TA': _('Journal Title'),
    'LA': _('Language'),
    'MHDA': _('MeSH Date'),
    'MAJR': _('MeSH Major Topic'),
    'SH': _('MeSH Subheading'),
    'MH': _('MeSH Terms'),
    'PG': _('Pagination'),
    'DP': _('Publication Date'),
    'PT': _('Publication Type'),
    'SI': _('Secondary Source ID'),
    'NM': _('Substance Name'),
    'TW': _('Text Word'),
    'TI': _('Title'),
    'TIAB': _('Title/Abstract'),
    'PMID': _('UID'),
    'VI': _('Volume'),
}

# Publication types (sent with the [pt] tag).
publication_types = {
    'addresses': _('Addresses'),
    'bibliography': _('Bibliography'),
    'biography': _('Biography'),
    'classical article': _('Classical Article'),
    'clinical conference': _('Clinical Conference'),
    'clinical trial': _('Clinical Trial'),
    'clinical trial, phase I': _('Clinical Trial, Phase I'),
    'clinical trial, phase II': _('Clinical Trial, Phase II'),
    'clinical trial, phase III': _('Clinical Trial, Phase III'),
    'clinical trial, phase IV': _('Clinical Trial, Phase IV'),
    'comment': _('Comment'),
    'congresses': _('Congresses'),
    'consensus development conference': _('Consensus Development Conference'),
    'consensus development conference, NIH': _('Consensus Development Conference, NIH'),
    'controlled clinical trial': _('Controlled Clinical Trial'),
    'corrected and republished article': _('Corrected and Republished Article'),
    'dictionary': _('Dictionary'),
    'directory': _('Directory'),
    'duplicate publication': _('Duplicate Publication'),
    'editorial': _('Editorial'),
    'evaluation studies': _('Evaluation Studies'),
    'festschrift': _('Festschrift'),
    'government publications': _('Government Publications'),
    'guideline': _('Guideline'),
    'historical article': _('Historical Article'),
    'interview': _('Interview'),
    'journal article': _('Journal Article'),
    'lectures': _('Lectures'),
    'legal cases': _('Legal Cases'),
    'legislation': _('Legislation'),
    'letter': _('Letter'),
    'meta-analysis': _('Meta-Analysis'),
    'multicenter study': _('Multicenter Study'),
    'news': _('News'),
    'newspaper article': _('Newspaper Article'),
    'overall': _('Overall'),
    'periodical index': _('Periodical Index'),
    'practice guideline': _('Practice Guideline'),
    'randomized controlled trial': _('Randomized Controlled Trial'),
    'retraction of publication': _('Retraction of Publication'),
    'retracted publication': _('Retracted Publication'),
    'review': _('Review'),
    'review, academic': _('Review, Academic'),
    'review, literature': _('Review Literature'),
    'review, multicase': _('Review, Multicase'),
    'review of reported cases': _('Review of Reported Cases'),
    'review, tutorial': _('Review, Tutorial'),
    'scientific integrity review': _('Scientific Integrity Review'),
    'technical report': _('Technical Report'),
    'twin study': _('Twin Study'),
    'validation studies': _('Validation Studies'),
}

# Publication languages (sent with the [la] tag).
language = {
    'english': _('English'),
    'french': _('French'),
    'german': _('German'),
    'italian': _('Italian'),
    'japanese': _('Japanese'),
    'russian': _('Russian'),
    'spanish': _('Spanish'),
}

# Age ranges (sent with the [mh] tag). This is a list of pairs rather
# than a dict because the same query value ('infant', 'child', 'adult')
# is offered under more than one label.
age_range = [
    ('infant', _('All Infant (birth-23 month)')),
    ('child', _('All Child (0-18 years)')),
    ('adult', _('All Adult (19+ years)')),
    ('infant, newborn', _('Newborn (birth-1 month)')),
    ('infant', _('Infant (1-23 months)')),
    ('child, preschool', _('Preschool Child (2-5 years)')),
    ('child', _('Child (6-12 years)')),
    ('adolescence', _('Adolescent (13-18 years)')),
    ('adult', _('Adult (19-44 years)')),
    ('middle age', _('Middle Aged (45-64 years)')),
    ('aged', _('Aged (65+ years)')),
    ('aged, 80 and over', _('80 and over')),
]

# Study subjects (sent with the [mh] tag).
human_animal = {
    'human': _('Human'),
    'animal': _('Animal'),
}

# Gender of the subjects (sent with the [mh] tag).
gender = {
    'female': _('Female'),
    'male': _('Male'),
}

# Subsets: the key is inserted verbatim in the query, so it already
# carries its own tag when one is needed.
subset = {
    'bioethics[ab]': _('Bioethics'),

    # Journal groups. The communication disorders entry used to be
    # spelled 'jusbsetc', which does not follow the 'jsubset<x>' form of
    # all its siblings.
    'jsubsetaim': _('Core clinical journals'),
    'jsubsetb': _('Biotechnology journals'),
    'jsubsetc': _('Communication disorders journals'),
    'jsubsetd': _('Dental journals'),
    'jsubsete': _('Bioethics journals'),
    'jsubseth': _('Health administration journals'),
    'jsubsetim': _('Index Medicus journals'),
    'jsubsetk': _('Consumer health journals'),
    'jsubsetn': _('Nursing journals'),
    'jsubsetq': _('History of Medicine journals'),
    'jsubsetr': _('Reproduction journals'),
    'jsubsets': _('NASA journals'),
    'jsubsett': _('Health tech assesment journals'),
    'jsubsetx': _('AIDS/HIV journals'),

    # Topical and status subsets.
    'aids[sb]': _('AIDS'),
    'cam[sb]': _('Complementary and Alternative Medicine'),
    'history[sb]': _('History of Medicine'),
    'in process[sb]': _('In process'),
    'medline[sb]': _('MEDLINE'),
    'medline pmc[sb]': _('PubMed Central'),
    'space[sb]': _('Space Life Sciences'),
    'publisher[sb]': _('Supplied by Publisher'),
    'tox[sb]': _('Toxicology'),
}
199
def makeQuery(self, field='ALL', keyword=None, abstract=False,
              epubahead=False, publication_type=None,
              language=None, subset=None, age_range=None,
              human_animal=None, gender=None,
              use_publication_date=False, from_date=None,
              to_date=None):
    """Compose an advanced query.

    Every criterion that is set contributes one term, and the terms
    are joined with ' AND '. When no criterion is set, the result is
    an empty string.

    'field' is a single value from self.query_fields.
    'publication_type' is a single value from self.publication_types, or None.
    'language' is from self.language or None
    'subset' is from self.subset or None
    'age_range' is from self.age_range or None
    'human_animal' is from self.human_animal or None
    'gender' is from self.gender or None

    If use_publication_date is True, select publications whose
    publication date is between from_date and to_date, otherwise
    use the entrez date. A date term is only produced when from_date
    is given; a missing to_date then defaults to today, and a to_date
    without from_date is ignored.

    Args:
      field: string
      keyword: string or None
      abstract: bool, only keep records that have an abstract
      epubahead: bool, only keep records published ahead of print
      publication_type: string or None
      language: string or None
      subset: string or None
      age_range: string or None
      human_animal: string or None
      gender: string or None
      use_publication_date: bool
      from_date: datetime.date() or None
      to_date: datetime.date() or None

    Returns:
      The query, as a single string.
    """
    parts = []

    if keyword is not None:
        parts.append(keyword + '[%s]' % field)
    if abstract:
        parts.append('hasabstract')
    if epubahead:
        parts.append('pubstatusaheadofprint')
    if publication_type:
        # This branch used to read an undefined name ('pubtype') and
        # failed with a NameError as soon as a type was requested.
        parts.append(publication_type + '[pt]')
    if language:
        parts.append(language + '[la]')
    if subset:
        # Subset values already carry their own tag when they need one.
        parts.append(subset)

    # These three criteria are all expressed with the [mh] tag.
    for mesh_term in (age_range, human_animal, gender):
        if mesh_term:
            parts.append(mesh_term + '[mh]')

    if from_date:
        if not to_date:
            # Open-ended range: stop at the current day.
            to_date = datetime.date.today()

        date = ':'.join([from_date.strftime('%Y/%m/%d'),
                         to_date.strftime('%Y/%m/%d')])
        if use_publication_date:
            date += '[dp]'
        else:
            date += '[edat]'
        parts.append(date)

    return ' AND '.join(parts)
272
"""A connection to the PubMed database."""

# Logger shared by every request sent through this connection.
log = logging.getLogger('pyblio.external.pubmed')

# Schema identifier attached to this source.
# NOTE(review): presumably the schema expected from the target
# database; it is not read by the code in this part of the file.
schema = 'org.pybliographer/pubmed/0.1'

# Root of the remote services, followed by the path of each service
# below that root.
# NOTE(review): NCBI serves the E-utilities over HTTPS nowadays; check
# that HTTPRetrieve copes with an https:// URL before switching.
baseURL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils'
SRV_SEARCH = '/esearch.fcgi'
SRV_FETCH = '/efetch.fcgi'

# Largest number of records requested in a single fetch.
BATCH_SIZE = 500

# Sent with every request, as its 'tool' and 'email' parameters.
toolName = 'pybliographer'
adminEmail = 'webmaster@pybliographer.org'
289
291
def __init__(self, db):
    """Attach the connection to the database that receives the results.

    Args:
      db: the database in which result sets are created and into
        which the parsed records are stored.
    """
    # No request is in flight yet.
    self._pending = None
    # Parser turning the server's answers into records.
    self.reader = Reader()
    self.db = db
297
def _query(self, service, args, **kargs):
    """Send a request to one of the remote services.

    The request parameters are the defaults (admin email, tool name
    and XML return mode), overridden by 'args' and then by the keyword
    arguments. The retrieval is remembered in self._pending for as
    long as it is running.

    Args:
      service: path of the service, appended to self.baseURL
      args: dict of request parameters
      **kargs: additional request parameters

    Returns:
      The deferred of the retrieval; its outcome (success or failure)
      is passed through unchanged.
    """
    params = {'email': self.adminEmail,
              'tool': self.toolName,
              'retmode': 'xml'}
    params.update(args)
    params.update(kargs)

    # Unicode values are sent UTF-8 encoded.
    for name in list(params.keys()):
        value = params[name]
        if isinstance(value, unicode):
            params[name] = value.encode('utf-8')

    query_string = urllib.urlencode(params)
    url = self.baseURL + service + '?' + query_string

    self.log.debug('sending query %r' % url)

    retrieval = HTTPRetrieve(url)
    self._pending = retrieval

    def release(outcome):
        # Runs for results and failures alike: the connection is free
        # again, and the outcome continues down the callback chain.
        self._pending = None
        return outcome

    return retrieval.deferred.addBoth(release)
324
325
def count(self, query, db='PubMed'):
    """Ask the server how many records match a query.

    Only one request may run at a time on a connection.

    Args:
      query: the search term
      db: name of the remote database to search

    Returns:
      A deferred that fires with the number of matches, as an int.
    """
    assert self._pending is None, 'no more than one search at a time per connection'

    def extract(tree):
        # The answer carries the number of matches in its Count element.
        return int(tree.find('./Count').text)

    request = self._query(self.SRV_SEARCH,
                          {'db': db, 'term': query},
                          rettype='count')

    parsed = request.addCallback(_xml)
    return parsed.addCallback(extract)
339
340
def search(self, query, maxhits=500, db='PubMed'):
    """Run a query and import the matching records into the database.

    The search is first submitted to the server, then the records are
    fetched in batches of at most self.BATCH_SIZE and parsed into a
    new result set as they arrive. Only one request may run at a time
    on a connection.

    Args:
      query: the search term; surrounding whitespace is ignored
      maxhits: maximal number of records to retrieve
      db: name of the remote database to search

    Returns:
      A (deferred, result set) pair. The deferred fires with the total
      number of matches reported by the server (which can be larger
      than maxhits), or with 0 when the query is empty; it fails when
      a request or the handling of an answer fails. The result set is
      filled progressively while the batches come in.
    """
    assert self._pending is None, 'no more than one search at a time per connection'

    query = query.strip()

    # The deferred for the global result
    results = defer.Deferred()

    # The result set that will contain the data
    rs = self.db.rs.new()
    rs.name = _('Imported from PubMed')

    if not query:
        # Nothing to search for: report zero results from the next
        # reactor iteration. This check now comes before any request
        # is issued; the request used to be sent anyway, its outcome
        # was never handled, and it kept self._pending busy.
        reactor.callLater(0, results.callback, 0)
        return results, rs

    req = self._query(self.SRV_SEARCH,
                      {'db': db, 'term': query},
                      usehistory='y')

    # Mutable cell shared by the nested callbacks: 'missing' is the
    # number of records still to be retrieved.
    stats = {}

    def failed(reason):
        results.errback(reason)

    def got_summary(data):
        # Total number of matches, as seen by the server
        all_results = int(data.find('./Count').text)

        # The search is kept on the server side; these parameters
        # identify it in the subsequent fetch requests.
        fetchdata = {
            'db': db,
            'WebEnv': data.find('./WebEnv').text,
            'query_key': data.find('./QueryKey').text,
        }

        stats['missing'] = min(all_results, maxhits)

        self.log.info('%d results, retrieving %d' % (
            all_results, stats['missing']))

        def fetch(data):
            # data is None for the initial call below, which only
            # serves to request the first batch.
            if data is not None:
                previously = len(rs)
                self.reader.parse(data, self.db, rs)
                freshly_parsed = len(rs) - previously

                if freshly_parsed <= 0:
                    self.log.warning("what happend? I increased the result set by %d" % freshly_parsed)
                    # Count at least one record per batch, so that
                    # the loop always progresses toward its end even
                    # when an answer adds nothing.
                    freshly_parsed = 1

                stats['missing'] -= freshly_parsed

            if stats['missing'] <= 0:
                self.log.info('finished')
                results.callback(all_results)
                return

            # Ask for the next batch, starting after what we have.
            batch = min(self.BATCH_SIZE, stats['missing'])
            self.log.info('retrieving next %d' % batch)

            d = self._query(self.SRV_FETCH, fetchdata,
                            retstart=len(rs), retmax=batch)

            d.addCallback(_xml).\
                addCallback(fetch).\
                addErrback(failed)
            return

        # Bootstrap the fetching loop
        fetch(None)

    req.addCallback(_xml).\
        addCallback(got_summary).\
        addErrback(failed)

    return results, rs
431
432
def cancel(self):
    """Abort the request currently in flight, if there is one.

    Whatever has already been stored in the database stays there: the
    database is not brought back to its initial state.
    """
    pending = self._pending
    if pending:
        pending.cancel()
441