
Source Code for Module Pyblio.External.Citeseer

# This file is part of pybliographer
#
# Copyright (C) 1998-2006 Frederic GOBRY
# Email : gobry@pybliographer.org
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#

"""
Citeseer (http://citeseer.ist.psu.edu/) queries
"""

# Citeseer provides two ways to search for documents: its own search
# engine, and Google. This code uses the first solution (as Google's
# search API is not what it used to be...): first, all the links to
# all the detailed citation pages are collected, then each page is
# parsed, and the bibtex and abstract are extracted.

import urllib
import logging
import BeautifulSoup
import re
import StringIO

from gettext import gettext as _
from twisted.internet import defer, reactor

from Pyblio import Attribute
from Pyblio.External import IExternal
from Pyblio.External.HTTP import HTTPRetrieve
from Pyblio.Exceptions import QueryError, ParserError
from Pyblio.Parsers.Semantic import BibTeX

log = logging.getLogger('pyblio.external.citeseer')

whitespace = re.compile(r'[\s\n]+', re.M)

class ResultScraper(object):
    """Parse a Citeseer result page containing links to the actual
    detailed citations."""

    results = re.compile(r'(\d+|No)\s+documents?\s+found')

    def __init__(self, page):
        self.soup = BeautifulSoup.BeautifulSoup(page)
        self.rls = self.soup.findAll(
            text=lambda text: isinstance(text, BeautifulSoup.Comment) and \
            text == 'RLS')[0]
        self.ris = self.soup.findAll(
            text=lambda text: isinstance(text, BeautifulSoup.Comment) and \
            text == 'RIS')

    def count(self):
        """Return the overall result count."""
        # the result count is immediately before the list of results,
        # unless we see no RIS comments, in which case there is no
        # result at all.
        if not self.ris:
            return 0
        current = self.rls.previous
        while current is not None:
            if current.string is not None:
                m = self.results.search(current.string)
                if m:
                    return int(m.group(1))
            current = current.previous
        raise QueryError(_("cannot parse result page"))

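# Illustrative sketch, not part of the original module: exercising
# ResultScraper on a result page saved to disk.  The file name is a
# hypothetical placeholder; count() relies on the "N documents found"
# text that precedes the 'RLS' marker comment in Citeseer result pages.
# (search() below additionally calls a links() accessor on the scraper
# to collect the URLs of the detailed citation pages.)
def _example_result_scraper(path='citeseer-results.html'):
    scraper = ResultScraper(open(path).read())
    return scraper.count()
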
class RelaxedBibTeX(BibTeX.Reader):
    def do_default(self, field, value):
        log.warn('dropping field %r' % field)

    def to_text(self, stream):
        text = stream.execute(self.env).flat().strip()
        return Attribute.Text(whitespace.sub(' ', text))

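# Illustrative sketch, not part of the original module: how the relaxed
# reader is used further down in Citeseer.search().  Fields that the
# BibTeX schema does not know are logged and dropped by do_default()
# instead of aborting the parse.  The snippet and the open Pyblio
# database 'db' are hypothetical placeholders.
def _example_relaxed_reader(db):
    reader = RelaxedBibTeX('utf-8')
    fd = StringIO.StringIO('@Article{smith99,\n'
                           '  author = {J. Smith},\n'
                           '  title  = {An example entry},\n'
                           '  year   = 1999}\n')
    # returns the keys of the records added to the database
    return reader.parse(fd, db)
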
class CitationScraper(object):
    """Parse a detailed citation page, containing an abstract and a
    BibTeX snippet."""

    def __init__(self, page):
        self.soup = BeautifulSoup.BeautifulSoup(page)

    def citation(self):
        content = {'bibtex': self.soup.pre.string}
        abstract = self.soup.findAll(text='Abstract:')
        if abstract:
            abstract = abstract[0].parent.nextSibling.strip()
            content['abstract'] = whitespace.sub(' ', abstract)
        return content

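# Illustrative sketch, not part of the original module: what
# CitationScraper.citation() yields for a detailed citation page saved
# to disk.  The file name is a hypothetical placeholder; 'bibtex' holds
# the text of the page's <pre> block, and 'abstract' is only present
# when an "Abstract:" label was found on the page.
def _example_citation_scraper(path='citeseer-citation.html'):
    content = CitationScraper(open(path).read()).citation()
    print content['bibtex']
    print content.get('abstract', '(no abstract)')
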
class Citeseer(IExternal):
    """A connection to Citeseer."""

    schema = 'org.pybliographer/bibtex/0.1'

    BATCH_SIZE = 50
    FETCHER_POOL = 2  # how many detailed pages to fetch at a time

    MIRRORS = ['http://citeseer.ist.psu.edu/cis',
               'http://citeseer.ittc.ku.edu/cs']

    baseURL = MIRRORS[1]

    def __init__(self, db):
        self.db = db
        self._pending = None
        self._reader = RelaxedBibTeX('utf-8')

    def _query(self, query, start=0):
        assert self._pending is None, \
            'no more than one search at a time per connection'

        qb = {'dbnum': 1,
              'start': start,
              'am': self.BATCH_SIZE,
              'ao': 'Citations',
              'af': 'Any',
              'qtype': 'document:'}
        all = {'q': query,
               'qb': ','.join('%s=%s' % v for v in qb.iteritems())}

        for k, v in all.items():
            if isinstance(v, unicode):
                all[k] = v.encode('utf-8')
        url = self.baseURL + '?' + urllib.urlencode(all)

        log.info('sending query %r' % url)
        self._pending = HTTPRetrieve(url)

        def done(data):
            self._pending = None
            return data

        def parse(data):
            return ResultScraper(data)

        return self._pending.deferred.\
               addBoth(done).\
               addCallback(parse)

    def count(self, query):
        req = self._query(query)
        results = defer.Deferred()

        def failed(reason):
            results.errback(reason)

        def got_summary(data):
            results.callback(data.count())

        req.addCallback(got_summary).addErrback(failed)
        return results

    def search(self, query, maxhits=100):
        rs = self.db.rs.new()
        rs.name = _('Imported from Citeseer')

        req = self._query(query)
        results = defer.Deferred()

        self._abort = False

        def failed(reason):
            results.errback(reason)

        def got_page(data, link):
            """Handle a detailed citation page."""
            if data:
                log.info('obtained page %r' % link)
                citation = data.citation()
                if not citation['bibtex']:
                    log.warn('page has no bibtex field?')
                else:
                    fd = StringIO.StringIO(citation['bibtex'].encode('utf-8'))
                    try:
                        obtained = self._reader.parse(fd, self.db)
                    except ParserError, msg:
                        log.error('unable to parse %r: %s' % (
                            citation['bibtex'], msg))
                        obtained = []
                    for key in obtained:
                        # we can enrich the result with an abstract
                        if 'abstract' in citation:
                            record = self.db[key]
                            record.add('abstract',
                                       citation['abstract'],
                                       Attribute.Text)
                            self.db[key] = record
                        rs.add(key)
            if self._links and not self._abort:
                # there are more links to process, launch a new
                # HTTPRetrieve().
                link = self._links.pop()
                fetcher = HTTPRetrieve(link)
                log.info('fetching detailed page %r' % link)
                self._running.append(link)

                def done(data):
                    self._running.remove(link)
                    return data

                def parse_citation(data):
                    return CitationScraper(data)

                def inner_failure(data):
                    if not self._running:
                        results.errback(data)
                    self._abort = data

                fetcher.deferred.\
                    addBoth(done).\
                    addCallback(parse_citation).\
                    addCallback(got_page, link).\
                    addErrback(inner_failure)
            elif not self._running:
                # we are done once there is no pending link to fetch
                # and all the running fetchers have returned.
                if not self._abort or self._abort is True:
                    results.callback(self._total)
                else:
                    results.errback(self._abort)

        def got_summary(data):
            """Handle a result page."""
            # initial pass, collect all the results, up to maxhits
            self._total = data.count()
            self._target = min(maxhits, self._total)
            self._current = 0
            log.info('%d results for the query' % self._total)
            self._links = set()

            def got_links(data):
                current = data.links()
                previous = len(self._links)
                self._links.update(current)
                obtained = len(self._links) - previous
                if obtained == 0:
                    log.warn('this batch did not provide new links, stopping')
                self._current += self.BATCH_SIZE
                log.info('%d links in this batch (%s/%d)' % (
                    len(current), len(self._links), self._total))
                missing = self._target - len(self._links)
                if missing > 0 and obtained > 0:
                    log.info('getting batch at %d, %d missing' % (
                        self._current, missing))
                    next = self._query(query, self._current)
                    next.addCallback(got_links).addErrback(failed)
                else:
                    # start getting the detailed citation pages
                    self._running = []
                    for i in xrange(self.FETCHER_POOL):
                        got_page(None, None)

            got_links(data)

        req.addCallback(got_summary).addErrback(failed)
        return results, rs

    def cancel(self):
        self._abort = True
        if self._pending:
            self._pending.cancel()

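# Illustrative sketch, not part of the original module: driving a query
# from the Twisted reactor.  It assumes an already opened Pyblio
# database 'db'; search() returns a deferred that fires with the total
# number of matches, together with the result set being filled.
def _example_search(db, query='information retrieval'):
    source = Citeseer(db)
    deferred, rs = source.search(query, maxhits=20)

    def done(total):
        print 'query matched %d documents on Citeseer' % total
        reactor.stop()

    def failed(reason):
        print 'query failed: %s' % reason
        reactor.stop()

    deferred.addCallbacks(done, failed)
    reactor.run()
    return rs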