1 """Search and retreive information using the EUtils history.
2
3 EUtils has two major modes. One uses history while the other uses
4 database identifiers. This is a high-level interface for working with
5 the history. You should use this module if you expect to work with
6 large or an unknown number of identifiers.
7
8 See DBIdsClient if you want to get information about a set of known
9 database identifiers.

>>> from Bio.EUtils import HistoryClient
>>> client = HistoryClient.HistoryClient()
>>> cancer = client.search("cancer")
>>> print len(cancer)
1458353
>>>

That's quite a few hits. Most people would like to see the first few
records then try to refine the search.

>>> print cancer[:5].efetch(retmode = "text", rettype = "docsum").read()

1: Seow-Choen F.
Author's reply: Adjuvant therapy for rectal cancer cannot be based on the
results of other surgeons (Br J Surg 2002; 89: 946-947).
Br J Surg. 2003 Jan;90(1):121-122.
PMID: 12520589 [PubMed - as supplied by publisher]

2: Mortensen N, Lindsey I.
Adjuvant therapy for rectal cancer cannot be based on the results of other
surgeons (Br J Surg 2002; 89: 946-947).
Br J Surg. 2003 Jan;90(1):121.
PMID: 12520588 [PubMed - as supplied by publisher]

3: Osugi H, Takemura M, Higashino M, Takada N, Lee S, Kinoshita H.
A comparison of video-assisted thoracoscopic oesophagectomy and radical lymph
node dissection for squamous cell cancer of the oesophagus with open operation.
Br J Surg. 2003 Jan;90(1):108-13.
PMID: 12520585 [PubMed - in process]

4: Tanaka M, Kitajima Y, Sato S, Miyazaki K.
Combined evaluation of mucin antigen and E-cadherin expression may help select
patients with gastric cancer suitable for minimally invasive therapy.
Br J Surg. 2003 Jan;90(1):95-101.
PMID: 12520583 [PubMed - in process]

5: Diaz De Liano A, Oteiza Martinez F, Ciga MA, Aizcorbe M, Cobo F, Trujillo R.
Impact of surgical procedure for gastric cancer on quality of life.
Br J Surg. 2003 Jan;90(1):91-4.
PMID: 12520582 [PubMed - in process]

>>>

Now refine the query to publications from the last day.

>>> from Bio import EUtils
>>> recent_cancer = client.search("#%s" % (cancer.query_key,),
...                               daterange = EUtils.WithinNDays(1))
>>> len(recent_cancer)
106
>>>

Still quite a few. What's the last one about?
>>> for k, v in recent_cancer[-1].summary().dataitems.allitems():
...     print k, "=", v
...

PubDate = 2002/12/01
Source = Nippon Shokakibyo Gakkai Zasshi
Authors = Kuroki T
Title = [Strategy against cancer in 21 century, with emphasis of cancer prevention and refractory cancer]
Volume = 99
Pages = 1423-7
EntrezDate = 2003/01/10
PubMedId = 12518389
MedlineId = 22406828
Lang = Japanese
PubType =
RecordStatus = PubMed - in process
Issue = 12
SO = 2002 Dec;99(12):1423-7
DOI =
JTA = KJY
ISSN = 0446-6586
PubId =
PubStatus = 4
Status = 6
HasAbstract = 0
ArticleIds = {'MedlineUID': u'22406828', 'PubMedId': u'12518389'}
>>>

Here's an interesting one. Which articles are related to this one but
are not about cancer? First, get the related articles.

>>> neighbors = recent_cancer[-1].neighbor_links()
>>> dbids = neighbors.linksetdbs["pubmed_pubmed"].dbids
>>> len(dbids)
10296
>>>

Upload those identifiers back to the server.

>>> related_result = client.post(dbids)
>>>
>>> non_cancer = client.search("#%s NOT #%s" % (related_result.query_key,
...                                             cancer.query_key))
>>> len(non_cancer)
4000
>>>

The HistoryClient instance has an attribute named 'query_history' which
stores the searches done so far, keyed by the query_key value assigned
by the server. The history on the server can expire. If that is
detected during a search, the previous results are invalidated and
removed from the query_history, and later requests made through those
invalidated results raise an error.

If a request is made through a search which has not been invalidated
but whose server-side history has expired, then requests such as
'summary' will raise an error, while others (such as 'dbids') may
appear to succeed but return undefined information.
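
For example, an earlier result can be fetched back out of that table.
As an illustrative sketch (not output captured from a live session):

>>> client.query_history[cancer.query_key] is cancer
True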

"""

import types
import ThinClient, parse, Datatypes, Mixins, Config
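# Sibling modules in Bio.EUtils: ThinClient provides the low-level EUtils
# calls, parse converts the server responses into Datatypes objects, and
# Mixins supplies shared fetch behaviour for sequence and publication records.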

class HistoryCookie(object):
    """Data needed to get back to the history"""
    def __init__(self, db, webenv_ref, query_key):
        self.db = db
        self.webenv_ref = webenv_ref
        self.query_key = query_key

class HistoryLookup(object):
    """Look up information about a search in history

    To get the list of dbids by fetching the server's "uilist",
    use the "dbids" attribute.
    """
    def __init__(self, eutils, cookie, retstart, retmax):
        self.eutils = eutils
        self.cookie = cookie
        self.retstart = retstart
        self.retmax = retmax
        self.db = cookie.db
        self.query_key = cookie.query_key

    def _check_invalid(self):
        # A cookie without a query_key means the search matched nothing;
        # a cleared query_key means the server-side history was reset and
        # this result is no longer available.
        if self.cookie.query_key is None:
            raise NotImplementedError("empty data set")
        if self.query_key is None:
            raise Datatypes.EUtilsError(
                "query history no longer available on server")

    def esummary(self, retmode = 'xml', rettype = None):
        """Request the eSummary for this history; returns the socket handle"""
        self._check_invalid()
        infile = self.eutils.esummary_using_history(
            webenv = self.cookie.webenv_ref[0],
            db = self.cookie.db,
            query_key = self.cookie.query_key,
            retstart = self.retstart,
            retmax = self.retmax)
        return infile

    def summary(self):
        """the Datatypes.Summary for this history"""
        return parse.parse_summary_xml(self.esummary("xml"))

    def elink(self,
              db = "pubmed",
              cmd = "neighbor",
              term = None,
              field = None,
              daterange = None):
        """Request an eLink for this history; returns the socket handle"""
        self._check_invalid()
        return self.eutils.elink_using_history(
            webenv = self.cookie.webenv_ref[0],
            query_key = self.cookie.query_key,
            db = db,
            dbfrom = self.cookie.db,
            cmd = cmd,
            retstart = self.retstart,
            retmax = self.retmax,
            daterange = daterange,
            term = term,
            field = field,
            )

    def _get_dbids(self):
        infile = self.efetch(retmode = "text", rettype = "uilist")
        ids = parse.parse_fetch_identifiers(infile)
        return Datatypes.DBIds(self.cookie.db, ids)
    dbids = property(_get_dbids, None, None,
                     "The DBIds for this results set, fetched from the server's 'uilist'")

class HistoryRecord(HistoryLookup):
    """Get information about a single record in a history"""
    def __init__(self, eutils, cookie, offset):
        HistoryLookup.__init__(self, eutils, cookie, offset, 1)
    def summary(self):
        """the Datatypes.Summary for this history record"""
        return HistoryLookup.summary(self)[0]

# efetch mixin with the sequence-specific options (seq_start, seq_stop,
# strand, complexity)
class SequenceHistoryFetchMixin:
    def efetch(self, retmode = 'xml', rettype = None,
               seq_start = None, seq_stop = None, strand = None,
               complexity = None):
        self._check_invalid()
        if strand not in (None, 1, 2):
            raise TypeError("Strand can only be 1 (plus, default) or 2 (minus)")
        return self.eutils.efetch_using_history(
            webenv = self.cookie.webenv_ref[0],
            db = self.cookie.db,
            query_key = self.cookie.query_key,
            retstart = self.retstart,
            retmax = self.retmax,
            retmode = retmode,
            rettype = rettype,
            seq_start = seq_start,
            seq_stop = seq_stop,
            strand = strand,
            complexity = complexity)

class SequenceHistoryRecord(Mixins.SequenceFetchMixin,
                            SequenceHistoryFetchMixin,
                            HistoryRecord):
    pass

# efetch mixin for publication databases (no sequence-specific options)
class PublicationHistoryFetchMixin:
    def efetch(self, retmode = "xml", rettype = None):
        self._check_invalid()
        return self.eutils.efetch_using_history(
            webenv = self.cookie.webenv_ref[0],
            db = self.cookie.db,
            query_key = self.cookie.query_key,
            retstart = self.retstart,
            retmax = self.retmax,
            retmode = retmode,
            rettype = rettype)

class PublicationHistoryRecord(Mixins.PublicationFetchMixin,
                               PublicationHistoryFetchMixin,
                               HistoryRecord):
    pass

class BaseHistoryRecordSet(HistoryLookup):
    def __init__(self, eutils, cookie, retstart, retmax, metadata = None):
        HistoryLookup.__init__(self, eutils, cookie, retstart, retmax)
        self.metadata = metadata

    def __len__(self):
        return self.retmax

    def __getitem__(self, i):
        if isinstance(i, types.SliceType):
            if i.step is not None:
                raise TypeError("cannot set step size in slice")

            start = i.start
            if start is None: start = 0
            stop = i.stop
            if stop is None: stop = self.retmax

            # map the requested slice onto the absolute retstart/retmax window
            x = range(self.retstart, self.retstart + self.retmax)[start:stop]
            if x:
                retstart = x[0]
                retmax = x[-1] - x[0] + 1
            else:
                retstart = 0
                retmax = 0
            return self.__class__(self.eutils, self.cookie, retstart,
                                  retmax)
        if 0 <= i < self.retmax:
            pos = self.retstart + i
        elif 1 <= -i <= self.retmax:
            pos = self.retstart + i + self.retmax
        else:
            raise IndexError(i)
        return self._record_class(self.eutils, self.cookie, pos)

class SequenceHistoryRecordSet(Mixins.SequenceFetchMixin,
                               SequenceHistoryFetchMixin,
                               BaseHistoryRecordSet):
    _record_class = SequenceHistoryRecord

class PublicationHistoryRecordSet(Mixins.PublicationFetchMixin,
                                  PublicationHistoryFetchMixin,
                                  BaseHistoryRecordSet):
    _record_class = PublicationHistoryRecord

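
# The record-set class depends on the kind of database being searched.  The
# original helper that makes this choice is not included in this listing; the
# function below is a minimal stand-in built only from its call sites in
# HistoryClient.search() and HistoryClient.post().  The dbtype values
# ("sequence" / "publication") and the _SEQUENCE_DBS table are assumptions,
# not the original implementation.
_SEQUENCE_DBS = ("nucleotide", "protein", "genome", "popset")

def _get_recordset_constructor(db, dbtype):
    if dbtype is None:
        if db in _SEQUENCE_DBS:
            dbtype = "sequence"
        else:
            dbtype = "publication"
    if dbtype == "sequence":
        return SequenceHistoryRecordSet
    elif dbtype == "publication":
        return PublicationHistoryRecordSet
    raise TypeError("Unknown dbtype: %r" % (dbtype,))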

class HistoryClient:
    """High-level EUtils interface that works through the server-side history"""
    def __init__(self, eutils = None):
        if eutils is None:
            eutils = ThinClient.ThinClient()
        self.eutils = eutils
        # One-element list holding the WebEnv cookie.  It is shared with
        # every HistoryCookie so that a new WebEnv returned by the server
        # is seen by all existing results.
        self.webenv_ref = [None]
        self.query_history = {}

    def _check_for_cache_reset(self, query_key):
        # The server numbers searches within a WebEnv, so a query_key that
        # is already in the cache means the server-side history was reset
        # and numbering started over from the beginning.
        if query_key not in self.query_history:
            # A new key -- the cached results are still valid.
            return
        # Invalidate every cached result (their methods will now raise an
        # error) and forget them.
        for v in self.query_history.values():
            v.query_key = None
        self.query_history.clear()

    def search(self,
               term,
               db = "pubmed",
               field = None,
               daterange = None,
               dbtype = None
               ):
        set_klass = _get_recordset_constructor(db, dbtype)

        infile = self.eutils.esearch(
            term = term,
            db = db,
            field = field,
            # only the count is needed now; records are fetched on demand
            retstart = 0,
            retmax = 0,

            daterange = daterange,

            # keep the results in the server-side history
            usehistory = 1,
            webenv = self.webenv_ref[0],
            )
        searchinfo = parse.parse_search(infile, self.webenv_ref)

        if searchinfo.query_key is not None:
            cookie = HistoryCookie(db, self.webenv_ref, searchinfo.query_key)
        else:
            # no matches -- the server does not assign a query_key
            assert searchinfo.count == 0
            cookie = HistoryCookie(db, None, None)

        recordset = set_klass(self.eutils, cookie, 0, searchinfo.count,
                              searchinfo)

        if searchinfo.query_key is not None:
            self._check_for_cache_reset(searchinfo.query_key)
            self.query_history[searchinfo.query_key] = recordset

        return recordset

    def post(self, dbids, dbtype = None):
        set_klass = _get_recordset_constructor(dbids.db, dbtype)

        infile = self.eutils.epost(dbids,
                                   webenv = self.webenv_ref[0])

        postinfo = parse.parse_post(infile, self.webenv_ref)

        # identifiers the server rejected are not part of the result set
        n = len(dbids) - len(postinfo.invalid_ids)

        cookie = HistoryCookie(dbids.db, self.webenv_ref,
                               postinfo.query_key)
        recordset = set_klass(self.eutils, cookie, 0, n, postinfo)
        self._check_for_cache_reset(postinfo.query_key)
        self.query_history[postinfo.query_key] = recordset
        return recordset

    from_dbids = post