Package Pyblio :: Package External :: Module WOK
[hide private]
[frames | no frames]

Source Code for Module Pyblio.External.WOK

  1  """ 
  2  Interface to ISI Web of Knowledge. 
  3   
  4  """ 
  5   
  6  from twisted.web import client 
  7  from twisted.internet import defer 
  8  from twisted.python import failure 
  9   
 10  from Pyblio import Compat 
 11   
 12  import urllib, sys, logging 
 13   
 14  from gettext import gettext as _ 
 15   
 16  from Pyblio.Exceptions import QueryError 
 17  from Pyblio.Parsers.Semantic.WOK import Reader 
 18  from Pyblio.External.HTTP import HTTPRetrieve 
 19  from Pyblio.External import IExternal 
 20   
 21   
def _xml(data):
    """Parse the raw server response and fail early on a reported error.

    @arg data: the XML document returned by the server
    @return: the root element of the parsed document
    @raise QueryError: when the document contains an './error'
        element; the exception carries that element's text
    """
    root = Compat.ElementTree.XML(data)

    error_node = root.find('./error')
    if error_node is None:
        return root

    raise QueryError(error_node.text)

def _r_info(tree):
    """Extract the search statistics and the session id from a response.

    @arg tree: the parsed server response
    @return: a tuple C{([found, searched], session_id)}, where the
        first item is a list holding the number of hits and the number
        of searched records (both as integers), and the second item is
        the text of the './sessionID' element
    """
    found = int(tree.findtext('./searchResults/recordsFound'))
    searched = int(tree.findtext('./searchResults/recordsSearched'))
    session_id = tree.findtext('./sessionID')

    return [found, searched], session_id

class WOK(IExternal):
    """I represent a query session on the Web of Knowledge.

    The session is connected to a database whose schema is
    'org.pybliographer/wok/...'.

    Only one HTTP request may be in flight at a time: L{_query}
    asserts that no request is pending, and every request chain built
    in this class clears the pending marker through L{_done}.
    """

    schema = 'org.pybliographer/wok/0.1'

    # This base URL is for IP-based authentication. Don't know how
    # other systems work.
    baseURL = "http://estipub.isiknowledge.com/esti/cgi"

    # Maximal number of results one can ask in a single result set.
    MAX_PER_BATCH = 100

    log = logging.getLogger('pyblio.external.wok')

    def __init__(self, db):
        """Create a session bound to a database.

        @arg db: the database that L{search} fills with the records
            it retrieves
        """
        self.reader = Reader()
        self.db = db

        # The request currently in flight, or None.
        self._pending = None
        # When true, count() dumps the raw server answer on stderr.
        self._debug = False

    def _query(self, **args):
        """Send a single request to the WOK server.

        @arg args: request parameters, overriding the defaults set
            below; 'query' is mandatory
        @return: the deferred of the HTTP request. The request object
            itself is kept in self._pending so that L{cancel} can
            reach it.
        """
        assert not self._pending
        assert 'query' in args

        # NOTE: nothing in this class reads _running back; it is only
        # kept so that the instance state is the same as before.
        self._running = True

        data = {
            'databaseID': 'WOS',
            'rspType': 'xml',
            'method': 'searchRetrieve',
            'firstRec': '1',
            'numRecs': self.MAX_PER_BATCH,
            'depth': '',
            'editions': '',
            'fields': '',
        }
        data.update(args)

        self.log.debug('sending query %r', data)

        # ensure all arguments are utf8 encoded (iterate over a copy
        # of the items, as the values are replaced along the way)
        for key, value in list(data.items()):
            if isinstance(value, unicode):
                data[key] = value.encode('utf-8')

        url = self.baseURL + '?' + urllib.urlencode(data)

        self._pending = HTTPRetrieve(url, method='GET')
        return self._pending.deferred

    def _done(self, data):
        """Mark the end of a pending request to the WOK server.

        Meant to be attached with addBoth, so that it runs on success
        and on failure alike. The incoming result (or failure) is
        passed through unchanged.
        """
        self._pending = None
        return data

    def count(self, query):
        """Ask WOK for the number of results of a given query.

        A single record is requested and the session is logged out
        right away, as only the statistics are of interest.

        @arg query: the query, in Web of Science format
        @return: a deferred that fires with the number of hits
        """
        d = self._query(query=query, numRecs=1, Logout='yes')

        if self._debug:
            def show(data):
                sys.stderr.write(data)
                return data

            d = d.addCallback(show)

        def process(tree):
            stats, _session = _r_info(tree)
            return stats[0]

        d = d.addBoth(self._done)
        d = d.addCallback(_xml)
        return d.addCallback(process)

    def search(self, query, maxhits=500):
        """Start a query on the WOK, and fill in the database with
        the matches.

        Records are fetched in batches of at most MAX_PER_BATCH, until
        min(number of hits, maxhits) records are in the result set.

        @arg query: the query, in Web of Science format
        @type query: unicode string

        @arg maxhits: upper bound on the number of records to fetch;
            must be positive
        @type maxhits: int

        @return: a tuple C{(deferred, result set)}. The deferred fires
            with the number of hits reported by the server once the
            fetching is over, or with the failure that interrupted
            it. The result set is the one the records are added to
            while the batches arrive.
        """
        assert not self._pending
        assert maxhits > 0

        self._first = 1
        self._to_fetch = None

        # Limit our initial query to the max per batch amount.
        data = {'query': query,
                'firstRec': self._first,
                'numRecs': min(self.MAX_PER_BATCH, maxhits)}

        # We know we won't have to continue this session: everything
        # we may ask for fits in this first batch (this includes the
        # case where maxhits is exactly MAX_PER_BATCH).
        if maxhits <= self.MAX_PER_BATCH:
            data['Logout'] = 'yes'

        results = defer.Deferred()

        rs = self.db.rs.new()
        rs.name = _('Imported from Web of Knowledge')

        # The argument is not named 'failure', as this would shadow
        # the twisted.python.failure module imported by this file.
        def failed(reason):
            results.errback(reason)

        def fetch():
            # Send the request described by 'data' and route its
            # answer to received(), or to failed() on any error.
            d = self._query(**data).addBoth(self._done)
            d.addCallback(_xml).\
                addCallback(received).\
                addErrback(failed)

        # We retrieve a first result containing the total, which might
        # lead to more hits afterward.
        def received(tree):
            stats, sessionID = _r_info(tree)
            found, total = stats

            if self._to_fetch is None:
                # Now, we know how many records we are supposed to fetch
                self._to_fetch = min(found, maxhits)

            self.log.debug('session %r: received batch (%d pending)',
                           sessionID, self._to_fetch)

            before = len(rs)
            self.reader.parse(tree.find('./records'), self.db, rs)

            parsed = len(rs)
            missing = self._to_fetch - parsed

            # Are we supposed to continue the current query?
            if missing <= 0:
                # If not, the main deferred fires with the number of
                # hits; the records themselves are in the result set,
                # as the DB has been modified in the meantime.
                results.callback(found)
                return

            if parsed == before:
                # This batch did not add anything to the result set,
                # so the next request would be identical to this one.
                # Stop here instead of asking the server forever.
                self.log.warning(
                    'session %r: batch added no record, giving up '
                    'with %d records missing', sessionID, missing)
                results.callback(found)
                return

            # We can adjust the query more tightly
            data['firstRec'] = 1 + parsed
            data['numRecs'] = min(self.MAX_PER_BATCH, missing)
            data['SID'] = sessionID

            # The next batch is the last one when it covers everything
            # that is still missing, including exactly MAX_PER_BATCH.
            if missing <= self.MAX_PER_BATCH:
                data['Logout'] = 'yes'

            fetch()

        # start the query process
        fetch()

        return results, rs

    def cancel(self):
        """Cancel a running query. The database is not reverted to its
        original state."""
        if not self._pending:
            return

        self._pending.cancel()
        self._pending = None
