Package Pyblio :: Package External :: Module CrossRef
[hide private]
[frames] | no frames]

Source Code for Module Pyblio.External.CrossRef

  1  """ 
  2  An asynchronous query module to get DOI numbers given publication information. 
  3   
  4  This module connects to http://crossref.org/ and tries to resolve DOI 
  5  numbers given fuzzy publication information such as journal title, 
  6  volume, year and start page. 
  7  """ 
  8   
  9  from twisted.web import client 
 10  from twisted.internet import defer 
 11   
 12  import random 
 13  import urllib 
 14   
 15  import logging 
 16   
 17  from Pyblio import Store, Attribute 
 18   
 19   
class DOIQuery(object):
    """ Query DOI numbers.

    Convenience module that properly groups queries to CrossRef in
    order to increase throughput.

      >>> cnx = DOIQuery(db, user=..., pwd=...)
      >>> for record in to_resolve:
      ...    cnx.search(record).addCallback(got_results)
      >>> cnx.finished()

    The 'db' parameter is a database from which the queries and
    results will be composed. It must conform to the
    I{org.pybliographer/crossref/0.1} schema.

    The actual queries take place when enough searches have been
    requested, or when the .finished() method is called.

    For each query, a list of possible DOIs is returned. It can
    possibly be empty if the citation could not be resolved.

    In case of a failure in the query protocol itself, the registered
    errback handlers are called for each query.
    """

    # Maximal number of queries to send in a single batch
    BATCH = 30

    baseURL = 'http://doi.crossref.org/servlet/query'

    log = logging.getLogger('pyblio.external.crossref')

    # Leading fields of a query line, in order, for each supported
    # doctype.  A query line is these fields followed by the resource
    # type, the query key and an empty DOI slot (see _build_query); a
    # result line is parsed with the same layout, the last slot holding
    # the DOI.
    _ARTICLE_FIELDS = ('issn', 'title', 'author', 'volume', 'issue',
                       'startpage', 'year')
    _BOOK_FIELDS = ('isbn', 'serial', 'title', 'author', 'volume',
                    'edition', 'startpage', 'year', 'part')

    # Number of trailing slots after the fields: type, key, DOI.
    _TRAILING = 3

    def __init__(self, db, user, pwd):
        """ Create a query session.

        @param db: database whose schema is used to compose queries
            and build result records
        @param user: account name sent in the 'usr' form field
        @param pwd: password sent in the 'pwd' form field
        """
        self.db = db
        self.user = user
        self.pwd = pwd

        # uid -> Deferred of every query not yet answered
        self._pending = {}
        # uid that will be given to the next query
        self._uid = 0
        # (uid, query line) pairs not yet grouped into a batch
        self._queue = []

        # This holds the pending batches to submit to the remote system
        self._batch = []

        # True while a batch is being processed by _send()
        self._running = False

        # Deferred returned by finished(), None until it is called
        self._finished = None
        # [number of answered queries, number of failed queries]
        self._stats = [0, 0]

    def _make_batch(self):
        """ Turn the queued queries into a batch and start sending.

        Nothing happens when no query is queued, so that no empty
        request is ever submitted to the server.
        """
        if not self._queue:
            return

        enqueued = self._queue
        self._queue = []

        self._batch.append(enqueued)

        if not self._running:
            self._running = True
            self._send()

    def _send(self):
        """ Submit the oldest waiting batch, or go idle if none is left.

        On success every query of the batch has its Deferred called
        with the (possibly empty) list of matching records. On
        failure, including a malformed answer, every query of the
        batch that is still unanswered has its errback called.
        """
        if not self._batch:
            self._running = False
            return

        # batches are submitted in the order they were created
        enqueued = self._batch.pop(0)

        qdata = '\n'.join([q for _, q in enqueued]).encode('utf-8')

        self.log.debug('sending a batch to the server')
        self.log.debug(repr(qdata))

        data = {
            'usr': self.user,
            'pwd': self.pwd,
            'qdata': qdata,
            }

        req = client.getPage(
            self.baseURL, method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded'},
            postdata=urllib.urlencode(data))

        def received(data):
            self.log.debug('received a batch from the server')
            self.log.debug(repr(data))

            # Parse everything before triggering anything: a parse
            # error must fail the whole batch, via failed() below.
            results = self._parse_response(data)

            # trigger the deferred of _all_ the clients of this batch
            for uid, _ in enqueued:
                self._pending.pop(uid).callback(results.get(uid, []))

            self._stats[0] += len(enqueued)
            self._batch_done()

        def failed(reason):
            self.log.debug('too bad, the batch failed: %s', reason)

            failures = 0
            for uid, _ in enqueued:
                # Skip the queries that have already been answered, so
                # that a late error cannot raise KeyError here.
                pending = self._pending.pop(uid, None)
                if pending is None:
                    continue
                pending.errback(reason)
                failures += 1

            self._stats[1] += failures
            self._batch_done()

        req.addCallback(received).addErrback(failed)

    @staticmethod
    def _parse_line(line):
        """ Extract the query key and the DOI from a result line.

        @return: (key, doi, parts) where parts is the line split on
            '|', or None if the line does not end with an integer key
            followed by a DOI slot.
        """
        parts = line.split('|')
        try:
            key, doi = parts[-2:]
            key = int(key)
        except ValueError:
            # fewer than two slots, or a non-numeric key
            return None
        return key, doi.strip(), parts

    @classmethod
    def _split_fields(cls, parts):
        """ Name the leading fields of a split result line.

        @return: (doctype, [(field, value), ...]) with doctype being
            'article' or 'book' depending on the number of slots.
        @raise ValueError: if the line has an unexpected number of slots
        """
        count = len(parts)
        if count == len(cls._ARTICLE_FIELDS) + cls._TRAILING:
            return 'article', list(zip(cls._ARTICLE_FIELDS, parts))
        if count == len(cls._BOOK_FIELDS) + cls._TRAILING:
            return 'book', list(zip(cls._BOOK_FIELDS, parts))
        raise ValueError('result %s has not the expected syntax'
                         % repr('|'.join(parts)))

    def _make_record(self, doi, doctype, fields):
        """ Recreate a proper record given the fields of a result line.

        Empty fields are left out of the record.

        @raise ValueError: if the year is not empty and not an integer
        """
        def person(val):
            return Attribute.Person(last=val)

        def year(val):
            return Attribute.Date(year=int(val))

        converters = {'author': person, 'year': year}

        rec = Store.Record()
        rec.add('doi', doi, Attribute.ID)
        rec.add('doctype', self.db.schema.txo['doctype'].byname(doctype),
                Attribute.Txo)

        for field, value in fields:
            if value:
                rec.add(field, value, converters.get(field, Attribute.Text))
        return rec

    def _parse_response(self, data):
        """ Parse a server answer into a {key: [records]} mapping.

        Lines without a parseable key are skipped, and so are lines
        with an empty DOI (unresolved queries).

        @raise ValueError: if a key does not belong to a pending query
            or a line has an unexpected number of slots
        """
        results = {}

        for line in data.decode('latin-1').split('\n'):
            line = line.strip()
            if not line:
                continue

            parsed = self._parse_line(line)
            if parsed is None:
                continue
            key, doi, parts = parsed

            if key not in self._pending:
                raise ValueError('key %s received while not expected'
                                 % repr(key))

            doctype, fields = self._split_fields(parts)

            if not doi:
                self.log.debug('no DOI for key %s (%s)',
                               repr(key), repr(line))
                continue

            record = self._make_record(doi, doctype, fields)
            results.setdefault(key, []).append(record)

        return results

    def _batch_done(self):
        """ Report completion if everything is answered, then go on. """
        done = self._finished
        # The .called check ensures the deferred is fired only once.
        if done is not None and not self._pending and not done.called:
            done.callback(self._stats)

        self._send()

    def _prepare(self, q):
        """ Queue the query line q and return the Deferred of its result.

        A batch is created as soon as BATCH queries are queued.
        """
        d = defer.Deferred()

        self._pending[self._uid] = d
        self._queue.append((self._uid, q))
        self._uid += 1

        if len(self._queue) >= self.BATCH:
            self._make_batch()

        return d

    def finished(self):
        """ Flush the queued queries and signal that no more will come.

        @return: a Deferred called with the [answered, failed] counters
            once every query has been processed. It fires immediately
            when nothing is queued or in flight.
        @raise AssertionError: if called more than once
        """
        # explicit raise, so that the check survives python -O
        if self._finished is not None:
            raise AssertionError('finished() called twice')

        self._finished = defer.Deferred()
        self._make_batch()

        if not self._running:
            # Nothing is in flight, so no batch completion would ever
            # fire the deferred: do it now.
            self._finished.callback(self._stats)

        return self._finished

    @staticmethod
    def _query_value(record, field):
        """ Return the text to send for field, '' if record lacks it. """
        try:
            value = record[field][0]
        except (KeyError, IndexError):
            return ''

        if field == 'year':
            return str(value.year)
        if field == 'author':
            return value.last
        return value

    def _build_query(self, record):
        """ Compose the query line of record, using the next query uid.

        @raise ValueError: if the doctype is neither article nor book
        """
        t = record['doctype'][0]
        t = self.db.schema.txo[t.group][t.id].names['C']

        if t == 'article':
            fields = self._ARTICLE_FIELDS
        elif t == 'book':
            fields = self._BOOK_FIELDS
        else:
            raise ValueError('cannot search for doctype %s' % repr(t))

        values = [self._query_value(record, field) for field in fields]

        # the trailing '' is the DOI slot
        return '|'.join(values + ['full_text', str(self._uid), ''])

    def search(self, record):
        """ Request the DOIs matching an article or book record.

        @return: a Deferred called with the list of matching records
        @raise AssertionError: if finished() has already been called
        @raise ValueError: if the doctype of record is not supported
        """
        # explicit raise, so that the check survives python -O
        if self._finished is not None:
            raise AssertionError('finished() already called')

        return self._prepare(self._build_query(record))