Package Pyblio :: Package Parsers :: Package Syntax :: Package BibTeX
[hide private]
[frames] | [no frames]

Source Code for Package Pyblio.Parsers.Syntax.BibTeX

  1  # -*- coding: utf-8 -*- 
  2  # This file is part of pybliographer 
  3  #  
  4  # Copyright (C) 1998-2006 Frederic GOBRY 
  5  # Email : gobry@pybliographer.org 
  6  #           
  7  # This program is free software; you can redistribute it and/or 
  8  # modify it under the terms of the GNU General Public License 
  9  # as published by the Free Software Foundation; either version 2  
 10  # of the License, or (at your option) any later version. 
 11  #    
 12  # This program is distributed in the hope that it will be useful, 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 15  # GNU General Public License for more details.  
 16  #  
 17  # You should have received a copy of the GNU General Public License 
 18  # along with this program; if not, write to the Free Software 
 19  # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. 
 20  #  
 21  #  
 22   
 23  """ Extension module for BibTeX files """ 
 24   
 25   
 26  import re, os, string, logging 
 27   
 28  from Pyblio.Parsers.Syntax.BibTeX import Parser, Coding 
 29  from Pyblio.Parsers.Syntax.BibTeX import Environ as BaseEnviron 
 30   
 31  from Pyblio import Attribute, Store, Exceptions, Tools 
 32   
 33  from gettext import gettext as _ 
 34   
 35  # ================================================== 
 36  # Base Classes 
 37  # ================================================== 
 38   
class Environ(BaseEnviron.Environ):
    """Execution environment predefining the standard BibTeX @string
    macros for the twelve month abbreviations (jan ... dec)."""

    def __init__(self):
        month_names = ('January', 'February', 'March', 'April',
                       'May', 'June', 'July', 'August',
                       'September', 'October', 'November', 'December')
        # Each month is registered under its three-letter lowercase key
        # ('jan' -> Text('January'), ...).
        self.strings = {}
        for name in month_names:
            self.strings[name[:3].lower()] = Parser.Text(name)
# ==================================================
# BibTeX interface
# ==================================================

# Patterns matched against the "type tag" string built by person_add's
# _typetag helper (N = plain name, I = initial, L = lowercase word):

# last-then-first layout: one or more names followed by initials,
# e.g. "Knuth D. E." -> 'NII'.
_lf_re = re.compile ('^N+I+$')
# first-then-last layout: initials (possibly mixed with names) followed
# by one or more trailing names, e.g. "D. E. Knuth" -> 'IIN'.
_fl_re = re.compile ('^[IN]*?I+N+$')

# Separators inside a name: a comma, a dot, a run of whitespace, or '~'.
_split_re = re.compile (r'[,.]|\s+|\~')

# A dot followed by whitespace and a dash, e.g. "J. -P" (see _nodotdash).
_dotdash_re = re.compile(r'\.\s+-')
def _nodotdash(txt):
    """Collapse 'dot, whitespace, dash' sequences into '.-'
    (e.g. 'J. -P' becomes 'J.-P')."""
    cleaned = _dotdash_re.sub('.-', txt)
    return cleaned
class Reader(object):
    """Reads a BibTeX file and fills Pyblio store records from it."""

    # The official channel in which messages must be sent
    log = logging.getLogger('pyblio.import.bibtex')

    def __init__(self, charset='ISO8859-1'):
        """Create a reader decoding its input with *charset*."""
        self.charset = charset
        # Maps an attribute type to the method that parses it.
        self._mapping = {
            Attribute.Text:   self.text_add,
            Attribute.Person: self.person_add,
            Attribute.URL:    self.url_add,
            Attribute.Date:   self.date_add,
            Attribute.ID:     self.id_add,
        }
        self.env = Environ()
86 - def id_add(self, field, stream):
87 self.record[field] = [Attribute.ID(stream.flat())] 88 return
89
90 - def url_add(self, field, stream):
91 url = stream.flat().replace(u'\xa0', u'~') 92 self.record.add(field, url, Attribute.URL) 93 return
94
    def date_add(self, field, stream):
        """Store an empty date value for *field*.

        NOTE(review): the content of *stream* is discarded and an empty
        Attribute.Date is stored -- confirm whether actual date parsing
        is intentionally handled elsewhere (e.g. by a do_year handler).
        """
        self.record [field] = [Attribute.Date()]
98 - def to_text(self, stream):
99 return Attribute.Text (stream.execute (self.env).flat ())
100
101 - def text_add(self, field, stream):
102 self.record [field] = [ self.to_text (stream) ] 103 return
104
    def person_add(self, field, stream):
        ''' Parse a stream of tokens as a series of person names '''

        # The first level of the parsing is of interest, as non-person
        # names can be written for instance:
        # author = "{Name of a Company} and {Another One}"

        # Join joins, ie strings written as {toto} # {tutu}
        stream = stream.join ()

        # ...and expand the low-level text in fragment split on "," "." and space
        stream, os = [], stream
        for v in os:
            if not isinstance (v, Parser.Text):
                stream.append (v)
                continue

            i = 0
            for m in _split_re.finditer (v):
                s, e = m.start (0), m.end (0)
                # Text between two separators is kept verbatim.
                if i != s: stream.append (Parser.Text (v [i:s]))

                sep = Parser.Text (v [s:e])
                # Whitespace-like separators (including '~') become a
                # single plain space; ',' and '.' keep their identity.
                if sep [0] in ' \n\t~': sep = Parser.Text (' ')
                stream.append (sep)

                i = e

            if i < len (v): stream.append (Parser.Text (v [i:]))

        # These high-level groups are separated by 'and' keywords
        avail = []

        while 1:
            try:
                i = stream.index ('and')
            except ValueError:
                break

            avail.append (stream [0:i])
            stream = stream [i+1:]

        if stream:
            avail.append (stream)

        def _wordify (stream):
            # Expand @string macros, then substitute special characters.
            stream = stream.execute(self.env)
            stream = stream.subst()

            # Ensure the stream is a sequence of complete words (ie,
            # concatenate successive text parts and space parts). The
            # comma must remain on its own, as it serves as a separator.
            # The dot is always appended to the previous word.

            in_space = True
            os, stream = stream, []

            while os:
                s = os.pop (0)

                if s == '.':
                    stream [-1] += '.'
                    in_space = True
                    continue

                is_space = s in (' ', '\n')

                if in_space:
                    if not is_space:
                        stream.append (s)
                        in_space = False
                    continue

                else:
                    if is_space:
                        in_space = True
                    else:
                        if s == ',':
                            stream.append (s)
                            in_space = True
                        else:
                            stream [-1] += s

            return stream

        def _typetag (stream):
            """ For each element of the string, return a list that
            indicates if the corresponding element is :
            - I : an initial
            - L : a lower case word
            - N : a name
            """

            tags = []

            for s in stream:
                if '.' in s:
                    tags.append ('I')

                elif s.lower () == s:
                    tags.append ('L')

                # NOTE(review): this branch only triggers for
                # *capitalized* particles ('Van', 'Von', 'De'); a fully
                # lowercase 'von' is already tagged 'L' by the previous
                # test.
                elif s.lower () in ('van', 'von', 'de'):
                    tags.append ('L')

                else:
                    tags.append ('N')

            return tags

        def _person_decode (stream):
            # A single braced block is taken verbatim as a (corporate)
            # last name, e.g. "{Name of a Company}".
            if len(stream) == 1 and isinstance(stream[0], Parser.Block):
                return Attribute.Person(last=stream [0].flat())

            stream = _wordify (Parser.Block ('', stream))

            # Check for ',' syntax for names
            comma = stream.count (',')

            if comma == 0:
                # Use the number of segments in the name
                ls = len (stream)
                if ls == 1:
                    return Attribute.Person (last = stream [0])
                elif ls == 0:
                    return None
                else:
                    tt = ''.join(_typetag(stream))

                    # "Last I. I." : names followed by initials.
                    if _lf_re.match(tt):
                        idx = tt.index('I')
                        return Attribute.Person(first=_nodotdash(' '.join(stream[idx:])),
                                                last=' '.join(stream[:idx]))
                    # "First Last" : exactly two plain names.
                    if tt == 'NN':
                        return Attribute.Person(first=_nodotdash(stream[0]),
                                                last=stream[1])
                    # "I. I. Last" : initials (possibly mixed with
                    # names) followed by trailing names.
                    if _fl_re.match (tt):
                        idx = tt.rindex ('I') + 1
                        return Attribute.Person (first=_nodotdash(' '.join (stream [:idx])),
                                                 last = ' '.join (stream [idx:]))

                    # A lowercase particle starts the last name
                    # ("Ludwig van Beethoven").
                    try:
                        von = tt.index ('L')

                        return Attribute.Person (first =_nodotdash(' '.join (stream [0:von])),
                                                 last = ' '.join (stream [von:]))

                    except ValueError:
                        pass

                    # As a fallback, consider that the last name is the last component
                    if tt == 'NNN':
                        return Attribute.Person (first = ' '.join (stream [:-1]),
                                                 last = stream [-1])

                    elif tt == 'II':
                        # Handle the case of a final . after the author's name
                        first, last = stream

                        if last[-1] == '.' and len(last) > 2:
                            last = last[:-1]
                            return Attribute.Person (first=_nodotdash(first), last=last)

                    raise Exceptions.ParserError ("%s: unable to parse name properly: %s (typed as %s)" % (
                        unicode(self.key), repr(stream), repr(tt)))

            elif comma == 1:
                # "Last, First" syntax.
                i = stream.index (',')

                return Attribute.Person \
                       (last = ' '.join (stream [:i]),
                        first = _nodotdash(' '.join (stream [i+1:])))

            # More than one comma: give up on this name.
            raise Exceptions.ParserError ("%s: unable to parse name %s properly: %d commas" % (
                unicode(self.key), repr(stream), comma))

        # Decode every 'and'-separated group; drop empty results.
        self.record [field] = filter(None, map(_person_decode, avail))
        return
    def comment_add (self, stream):
        """Hook receiving BibTeX comments; subclasses may override."""
        # by default, we drop comments
        return

    def string_add (self, stream):
        """Hook receiving @string definitions; subclasses may override."""
        # by default, we drop strings
        return

    def preamble_add (self, stream):
        """Hook receiving the @preamble; subclasses may override."""
        # by default, we drop the preamble
        return

    def type_add (self, data):
        """Hook receiving the entry type; subclasses may override."""
        # by default, we drop the document type information
        return
    def record_begin (self):
        """Hook called before the fields of a record are parsed."""
        pass

    def record_end (self):
        """Hook called once all fields of a record have been parsed."""
        pass
311 - def do_default(self, k, v):
312 raise Exceptions.SchemaError( 313 _("no attribute '%s' in document '%s'") % ( 314 k, self.tp))
315
316 - def record_dispatch(self, k, v):
317 # Dispatch by name, on do_<fieldname> methods 318 try: 319 m = getattr(self, 'do_' + k.lower()) 320 return m(v) 321 except AttributeError: 322 pass 323 324 # Dispatch by type, calling <type>_add methods 325 try: 326 attp = self.db.schema [k] 327 except KeyError: 328 return self.do_default(k, v) 329 return self._mapping[attp.type](k, v)
330
331 - def record_parse(self, record):
332 333 tp = record.type.lower () 334 335 if tp == 'string': 336 return self.string_add (record) 337 338 elif tp == 'preamble': 339 return self.preamble_add (record) 340 341 self.tp, self.key, val = record.type, record.key, record.fields 342 343 self.record = Store.Record () 344 345 self.record_begin () 346 347 for k, v in val: 348 self.record_dispatch (k.lower (), v) 349 350 # Add the document type at the end, as it might have been 351 # modified during parsing. 352 self.type_add (self.tp) 353 354 self.record_end () 355 return
356 357
358 - def parse(self, fd, db):
359 360 self.db = db 361 362 self.doctype = {} 363 364 rs = db.rs.new() 365 rs.name = _('Imported from BibTeX') 366 367 for v in db.schema.txo['doctype'].values (): 368 self.doctype [v.names ['C'].lower ()] = v 369 370 for data in Parser.read (fd, self.charset): 371 372 if isinstance (data, Parser.Comment): 373 self.comment_add (data) 374 continue 375 376 self.record = None 377 378 self.record_parse (data) 379 380 if self.record: 381 k = self.db.add (self.record) 382 rs.add(k) 383 384 return rs
385 386 387 # -------------------------------------------------- 388 389
class Writer(object):
    """Serializes Pyblio records as BibTeX entries."""

    # The official channel in which messages must be sent
    log = logging.getLogger('pyblio.export.bibtex')

    # Runs of whitespace (newlines included) collapse to one space.
    _collapse = re.compile (r'[\s\n]+', re.MULTILINE)

    def __init__ (self):
        """Prepare the per-attribute-type dispatch table."""
        # Maps an attribute type to the method that serializes it.
        self._mapping = {
            Attribute.Text:   self.text_add,
            Attribute.Person: self.person_add,
            Attribute.URL:    self.url_add,
            Attribute.Date:   self.date_add,
            Attribute.ID:     self.id_add,
            Attribute.Txo:    self.txo_add,
        }
409 - def _escape (self, text):
410 if not text: 411 return '' 412 return Coding.encode(text)
413
414 - def txo_add (self, field, data):
415 416 r = [] 417 for d in data: 418 v = self.db.schema.txo[d.group][d.id] 419 420 # Use the 'C' name by default, as it is easier to parse 421 # back. 422 try: n = v.names.get ('C', None) 423 except KeyError: n = v.name 424 425 if n: r.append (n) 426 427 data = self._escape ('; '.join (r)) 428 429 self.field [field] = '{%s}' % data 430 return
431
432 - def text_add (self, field, data):
433 434 data = self._escape (' '.join (data)) 435 436 # by default, new lines and multiple spaces are not significant in bibtex fields 437 data = self._collapse.sub (' ', data) 438 439 self.field [field] = '{%s}' % data 440 return
441
    def capitalized_text_add (self, field, data):
        """Serialize a text field, brace-protecting capitalization.

        Capitals that are not at the beginning of a sentence are
        wrapped in {...}; a lowercase letter at the start of a sentence
        is protected the same way.
        """

        # by default, new lines and multiple spaces are not significant in bibtex fields
        data = self._collapse.sub (' ', ' '.join (data))

        # If the text contains capitals that are not at the beginning
        # of a sentence, protect these capitals. Similarly for
        # lowercase letters at the beginning.

        res = Parser.Block ('{', [])

        beginning = True    # at the start of a sentence
        in_upper = False    # currently accumulating an uppercase run
        block = []          # characters not yet flushed to 'res'
        braced = False      # inside a '"'-delimited region

        def _close_upper ():
            # Flush the pending run as a brace-protected block.
            res.append (Parser.Block ('{', (Parser.Text (''.join (block)),)))
            del block[:]

        while data:
            c, data = data [0], data [1:]

            # Sentence terminators: the next letter starts a sentence.
            if c in '.!?':
                if in_upper:
                    _close_upper ()
                    in_upper = False

                beginning = True
                block.append (c)
                continue

            # Non-letters pass through; '"' toggles the braced state.
            if not c.isalpha ():
                if in_upper:
                    _close_upper ()
                    in_upper = False

                block.append (c)

                if c == '"': braced = not braced
                continue

            if not braced:
                # Lowercase letter at a sentence start: protect it.
                if beginning and c.lower () == c:
                    res.append (Parser.Text (''.join (block)))
                    res.append (Parser.Block ('{', (Parser.Text (c),)))

                    block = []
                    beginning = False
                    continue

                # Uppercase letter inside a sentence, or a leading
                # letter whose follower is also uppercase: start or
                # extend a protected uppercase run.
                if (not beginning and c.lower () != c) \
                   or (beginning and data and data [0].lower () != data [0]):
                    if in_upper:
                        block.append (c)
                    else:
                        in_upper = True
                        res.append (Parser.Text (''.join (block)))

                        block = [c]
                    beginning = False
                    continue

            # Ordinary letter: close any pending run and copy it.
            if in_upper:
                _close_upper ()
                in_upper = False

            block.append (c)
            beginning = False

        # Flush whatever is pending at the end of the text.
        if in_upper: _close_upper ()
        if block: res.append (Parser.Text (''.join (block)))

        self.field [field] = res.tobib ()
        return
519 - def id_add (self, field, data):
520 521 data = self._escape ('; '.join (data)) 522 523 self.field [field] = '{%s}' % data 524 return
525
526 - def _single_person (self, person):
527 528 if person.first: 529 return self._escape('%s, %s' % (person.last, person.first)) 530 else: 531 return '{' + self._escape(person.last) + '}'
532
533 - def person_add (self, field, data):
534 535 v = ' and '.join (map (self._single_person, data)) 536 537 self.field [field] = '{%s}' % v 538 return
539
540 - def url_add (self, field, data):
541 542 v = ', '.join (data) 543 544 self.field [field] = '{%s}' % v 545 return
546
547 - def date_add (self, field, data):
548 549 v = str (data [0].year) 550 551 self.field [field] = v 552 return
553
554 - def record_begin (self):
555 if 'id' in self.record: 556 self.key = str(self.record['id'][0]) 557 558 tp = self.record ['doctype'] [0] 559 self.type = self.db.schema.txo[tp.group][tp.id].names ['C']
560
    def record_end (self):
        """Hook called after all attributes of a record are serialized."""
        return
564 - def record_parse (self, key, value):
565 566 if key in ('id', 'doctype'): return 567 568 key = Coding.encode(key) 569 570 self._mapping[self.db.schema[key].type](key, self.record [key]) 571 return
572
573 - def write (self, fd, rs, db):
574 575 """ Write a result set to a given file descriptor """ 576 577 self.db = db 578 self.rs = rs 579 580 self.doctype = {} 581 582 for v in db.schema.txo['doctype'].values (): 583 self.doctype [v.names ['C'].lower ()] = v 584 585 for e in rs.itervalues (): 586 587 self.record = e 588 589 self.field = {} 590 self.type = None 591 self.key = None 592 593 self.to_delete = False 594 self.record_begin () 595 596 for k, v in e.items (): 597 self.record_parse (k, v) 598 599 self.record_end () 600 601 if self.to_delete: 602 continue 603 604 # Fully support the (bad) case where there is no key in 605 # the record, in order to support bad behaved applications 606 # that use it. 607 if self.key is None: 608 key = '' 609 else: 610 key = self.key + ',' 611 ret = '@%s{%s\n' % (self.type, key) 612 613 attrs = [] 614 keys = self.field.keys () 615 keys.sort () 616 617 maxlen = 0 618 for k in keys: 619 l = len (k) 620 if l > maxlen: maxlen = l 621 622 for k in keys: 623 v = self.field [k] 624 625 left = ' %s%s = ' % (k, ' ' * (maxlen - len (k))) 626 627 attrs.append (left + Tools.format (v, 75, 0, len (left))) 628 629 fd.write (ret + ',\n'.join (attrs) + '\n}\n') 630 631 return
632