Package Pyblio :: Package Parsers :: Package Syntax :: Package BibTeX :: Module Parser
[hide private]
[frames | no frames]

Source Code for Module Pyblio.Parsers.Syntax.BibTeX.Parser

  1  # -*- coding: utf-8 -*- 
  2  # This file is part of pybliographer 
  3  #  
  4  # Copyright (C) 1998-2006 Frederic GOBRY 
  5  # Email : gobry@pybliographer.org 
  6  #           
  7  # This program is free software; you can redistribute it and/or 
  8  # modify it under the terms of the GNU General Public License 
  9  # as published by the Free Software Foundation; either version 2  
 10  # of the License, or (at your option) any later version. 
 11  #    
 12  # This program is distributed in the hope that it will be useful, 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 15  # GNU General Public License for more details.  
 16  #  
 17  # You should have received a copy of the GNU General Public License 
 18  # along with this program; if not, write to the Free Software 
 19  # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. 
 20  #  
 21   
 22  """ Stream oriented reading of a BibTeX file, with no actual semantic 
 23  operation on the content. Tries to return _everything_ from the file, 
 24  including comments, strings,...""" 
 25   
 26  import re 
 27  from gettext import gettext as _ 
 28   
 29  from Pyblio.Exceptions import ParserError 
 30  from Pyblio.Parsers.Syntax.BibTeX import Coding 
 31   
class IBibTeX:
    """ Description of the methods offered by the values of a BibTeX field.

    Every method below is a placeholder which simply returns None. """

    def flat (self):
        """ Return a textual version of the field, with no visible
        BibTeX / LaTeX markup """
        return None

    def subst (self):
        """ Return a flattened list of the balanced expressions
        composing the field """
        return None

    def execute (self, environ):
        """ Execute the known LaTeX commands forming the field,
        substitute the known symbols, and return the resulting string """
        return None

    def tobib (self):
        """ Return the BibTeX version of the field """
        return None

class Comment (unicode):
    """ Free text found in a BibTeX file, outside of any record """

    def __repr__ (self):
        # Wrap the plain unicode representation of the text.
        return 'Comment (' + unicode.__repr__ (self) + ')'

class ATComment (Comment):
    """ A comment introduced by an explicit @comment marker """

    def __repr__ (self):
        # Uses the raw unicode representation, not Comment.__repr__.
        return '@Comment (' + unicode.__repr__ (self) + ')'

class Record (object):
    """ A record read from the file: a type, a key and a list of fields.

    The parser stores the fields as a list of (name, value) pairs. """

    def __init__ (self, type, key, fields):
        """ Store the three components of the record unchanged. """
        self.type = type
        self.key = key
        self.fields = fields

    def __cmp__ (self, other):
        """ Return 0 when other is a Record with the same type, key and
        fields, and a non-zero value otherwise.

        This is only meaningful as an equality test: any mismatch on the
        type or the key yields 1, whatever the order of the operands. """

        # Like Cmd and Block, never match (and never fail on) an object
        # of another kind, e.g. when a record is compared with None.
        if not isinstance (other, Record): return 1

        if self.type != other.type: return 1
        if self.key != other.key: return 1

        return cmp (self.fields, other.fields)

    def __repr__ (self):
        return 'Record (%s, %s, %s)' % (
            repr (self.type),
            repr (self.key),
            repr (self.fields))

class Join (list):
    """ A value, expressed as the concatenation of several blocks """

    def __repr__ (self):
        return 'Join (' + list.__repr__ (self) + ')'

    def subst (self):
        """ Return the subst () lists of all the members, chained in order """
        return sum ((member.subst () for member in self), [])

    def join (self):
        """ Return a single list holding the join () items of every member """
        merged = []
        for member in self:
            merged.extend (member.join ())
        return merged

    def execute (self, env):
        """ Return a new Join holding the executed version of each member.

        A bare Text member is first looked up in env.strings (the @string
        environment); when it is not a known string, it is executed like
        any other member. """
        results = Join ()

        for fragment in self:
            if isinstance (fragment, Text):
                try:
                    value = env.strings [fragment]
                except KeyError:
                    value = fragment.execute (env)
            else:
                value = fragment.execute (env)

            results.append (value)

        return results

    def flat (self):
        """ Return the concatenation of the flat () text of every member """
        try:
            pieces = [ member.flat () for member in self ]
        except AttributeError:
            # Show the offending value before letting the error propagate.
            print (repr (self))
            raise

        return ''.join (pieces)

    def tobib (self):
        """ Return the BibTeX form of the members, separated by ' # ' """
        return ' # '.join ([ member.tobib () for member in self ])

class Text(unicode):
    """ A piece of plain text found in a record """

    def __repr__ (self):
        return 'Text(' + unicode.__repr__(self) + ')'

    def flat(self):
        """ Return the text with every '~' turned into a no-break space """
        no_break_space = u'\xa0'
        return self.replace ('~', no_break_space)

    def subst (self):
        """ Return a list holding only this text """
        return [self]

    def execute (self, env):
        """ Return the text itself; env is not used """
        return self

    def tobib(self):
        """ Return the text as encoded by Coding.encode """
        return Coding.encode(self)

class Cmd (object):
    """ A LaTeX command, kept without its leading backslash """

    def __init__ (self, cmd):
        self._cmd = cmd

    def __repr__ (self):
        return 'Cmd (%s)' % repr (self._cmd)

    def __cmp__ (self, other):
        """ Compare the command names; any object which is not a Cmd
        is reported as different (1) """
        if isinstance (other, Cmd):
            return cmp (self._cmd, other._cmd)

        return 1

    def flat (self):
        """ Return the bare name of the command """
        return self._cmd

    def subst (self):
        """ Return a list holding only this command """
        return [self]

    def tobib (self):
        """ Return the command name preceded by a backslash """
        return '\\%s' % self._cmd

class Block (object):
    """ A textual block, as a sequence of text and commands """

    # Closing delimiter matching each possible opening delimiter.
    closer = {
        '"': '"',
        '{': '}',
        '(': ')',
        }

    def __init__ (self, opening, data = None):
        """ opening is the delimiter that opened the block, data the
        sequence of items it contains (empty when omitted). """
        self._o = opening

        if data is None: self._d = ()
        else:            self._d = data

    def flat (self):
        """ Return the concatenation of the flat () text of every item """
        return ''.join ([ item.flat () for item in self._d ])

    def append (self, v):
        """ Add v at the end of the block. """
        # A block created without data holds an (immutable) empty tuple:
        # switch to a list the first time something is appended.
        if isinstance (self._d, tuple):
            self._d = list (self._d)

        return self._d.append (v)

    def execute (self, env):
        """ Return a new Block holding the executed content of this one.

        A Cmd is run through env.run (), which also receives the list of
        the items that follow it; any other item is asked to execute
        itself. """
        final = []
        stack = list (self._d)

        while stack:
            d = stack.pop (0)

            if isinstance (d, Cmd):
                r = env.run (d._cmd, stack)
            else:
                r = d.execute (env)

            final.append (r)

        return Block (self._o, final)

    def join (self):
        """ Return the content of the block as a new list """
        return list (self._d)

    def __repr__ (self):
        return 'Block (%s, %s)' % (repr (self._o), repr (self._d))

    def subst (self):
        """ Return the subst () lists of all the items, chained in order.

        An item without a subst () method is printed and skipped. """
        r = []
        for d in self._d:
            try:
                r = r + d.subst ()
            except AttributeError:
                print (repr (d))
        return r

    def __cmp__ (self, other):
        """ Return 0 when other is a Block with the same opening
        delimiter and the same content, a non-zero value otherwise. """
        if not isinstance (other, Block): return 1

        if self._o != other._o: return 1
        return cmp (self._d, other._d)

    def tobib (self):
        """ Return the BibTeX form of the block, delimiters included """
        return '%s%s%s' % (
            self._o,
            ''.join ([ x.tobib () for x in self._d ]),
            self.closer [self._o])

class EndOfFile(ParserError):
    """ Raised by Cache.readline () when the file has no more lines """

    def __init__(self):
        message = _('end of file reached')
        ParserError.__init__(self, message)

class Cache(object):
    """ Line reader which decodes its input and lets lines be pushed back """

    def __init__(self, fd, charset):
        self.fd = fd          # underlying file object
        self.cs = charset     # charset used to decode the lines read from fd
        self.ln = 0           # number of the last line handed out
        self._buf = []        # lines pushed back, most recent last

    def readline(self):
        """ Return the next line, as decoded text.

        Lines pushed back with unreadline () are served first, the most
        recently pushed one coming first. Raises EndOfFile when the file is
        exhausted, and ParserError when a line cannot be decoded. """
        self.ln += 1

        if self._buf:
            return self._buf.pop()

        raw = self.fd.readline()
        if not raw:
            raise EndOfFile()

        try:
            return raw.decode(self.cs)
        except UnicodeDecodeError as err:
            raise ParserError(str(err), self.ln)

    def unreadline(self, line):
        """ Push line back, so that the next readline () returns it """
        self._buf.append (line)
        self.ln -= 1

class Context (object):
    """ State shared by the parsing functions while a file is read """

    def __init__ (self):
        # Type of the record being parsed; None outside of a record.
        self.rectype = None


# States of the parser: outside of a record, at the opening of a
# record, and end of the input.
ST_OUT, ST_OPEN, ST_DONE = 0, 1, 2

# Start of a record: an '@', the record type, then the rest of the line.
_record_start = re.compile (r'\s*@\s*(\w+)(.*)')

def _on_out (fd, ctx):
    """ Called when the parser is not in a record.

    Collects lines until a record starts or the file ends, and returns a
    (next state, data) pair. The data is the collected text as a Comment
    (None when there is none); for an @comment line, it is a list made of
    that optional Comment followed by an ATComment. """

    assert ctx.rectype is None

    chunks = []

    def pending ():
        # The text collected so far, or None when nothing was collected.
        text = ''.join (chunks)
        if text:
            return Comment (text)
        return None

    while True:
        try:
            line = fd.readline ()
        except EndOfFile:
            return ST_DONE, pending ()

        found = _record_start.match (line)
        if found is None:
            chunks.append (line)
            continue

        name, rest = found.group (1), found.group (2)

        # Handle the case of a @comment comment.
        if name.lower () == 'comment':
            result = []

            before = pending ()
            if before is not None:
                result.append (before)

            result.append (ATComment (rest))
            return ST_OUT, result

        # A real record: remember its type and hand back the rest of the
        # line, so that it gets parsed as the beginning of the record.
        ctx.rectype = name
        fd.unreadline (rest.lstrip ())

        return ST_OPEN, pending ()


# Characters that need special care while reading a record: the
# delimiters, the double quote and the backslash.
_brace_re = re.compile (r'[()"{}\\]')
# What follows a backslash: the name of the command (a word, or a single
# non-blank character, or a space), then the rest of the line.
_cmd_re = re.compile (r'(\w+|\S| )(.*)')
# Separators which become stand-alone tokens in the first level text.
_inline_re = re.compile (r'([,#=])')

def _read_balanced (fd):
    """ Read lines from fd until the delimiter opening the record is closed.

    Returns a tuple (items, start, rest): the first level content of the
    record as a list of Text, Cmd and Block objects, the line number at
    which the record starts, and the text left on the last line after
    the closing delimiter. """

    # We eat up input as long as we don't have a balanced expression
    stack = []
    curr = []

    container = None
    data = ''

    l = fd.readline ()
    start = fd.ln

    while 1:
        m = _brace_re.search (l)
        if not m:
            data += l
            l = fd.readline ()
            continue

        idx = m.start (0)
        before, brace, l = l [:idx], l [idx], l [idx+1:]

        data += before

        if brace == '\\':
            m = _cmd_re.match (l)

            if not m:
                raise ParserError ('backslash at the end of a line', fd.ln)

            if data: curr.append (Text (data))
            curr.append (Cmd (m.group (1)))

            l = m.group (2)
            data = ''
            continue

        if not container:
            if data:
                raise ParserError (
                    'unexpected data before '
                    'the opening of the record: %s' % repr (data),
                    fd.ln)

            if brace in ')}':
                raise ParserError ('unexpected closing symbol %s' % repr (brace),
                                   fd.ln)

            container = brace

        else:
            if brace in '})':
                # Discard bad matching of braces
                if (brace == '}' and container != '{'):
                    raise ParserError ('mismatched "%s"' % brace, fd.ln)

                if brace == ')' and container != '(':
                    data += ')'
                    continue

                if data: curr.append (Text (data))
                data = ''

                # The outermost delimiter has just been closed
                if not stack: break

                v = Block (container, curr)

                curr, container = stack.pop ()
                curr.append (v)
                continue

            elif brace == '(':
                # Except during the opening, the parenthesis is a normal token
                data += '('
                continue

            elif brace == '"':

                if container == '"':
                    # closing the brace
                    if data: curr.append (Text (data))
                    data = ''

                    if not stack: break

                    v = Block ('"', curr)
                    curr, container = stack.pop ()
                    curr.append (v)
                    continue

                else:
                    # opening the brace only occurs on the second level

                    if len (stack) == 0:
                        # create a new context
                        if data: curr.append (Text (data))
                        stack.append ((curr, container))

                        curr = []
                        data = ''
                        container = '"'

                    else:
                        data += '"'

            elif brace == '{':
                if data: curr.append (Text (data))
                stack.append ((curr, container))

                curr = []
                data = ''
                container = '{'

    return curr, start, l


def _split_tokens (items):
    """ Turn the first level items of a record into a list of tokens.

    Each Text is split on blanks, and every ',', '#' and '=' it contains
    becomes a token of its own; the other items are kept as they are. """

    stream = []

    for item in items:
        if not isinstance (item, Text):
            stream.append (item)
            continue

        i = 0
        for m in _inline_re.finditer (item):
            s, e = m.start (1), m.end (1)

            stream += [ Text (x) for x in item [i:s].split () ]
            stream.append (Text (item [s]))
            i = e

        if i < len (item): stream += [ Text (x) for x in item [i:].split () ]

    return stream


def _build_fields (stream, start):
    """ Extract the key and the fields from the tokens of a record.

    Returns a tuple (key, fields), where fields is a list of (name, Join)
    pairs. The ParserError raised on invalid syntax refers to the line
    number start. The list stream is consumed. """

    key = None
    field = []

    while stream:

        k = stream.pop (0)

        # A token which is not followed by '=' can only be the key
        if not stream or stream [0] == ',':
            if key: raise ParserError (
                "key is defined twice", start)

            if field: raise ParserError (
                "key is defined in the middle of the record", start)

            key = k
            if stream: stream.pop (0)
            continue

        v = stream.pop (0)
        if v != '=':
            raise ParserError (
                "invalid syntax after field %s" % repr (k), start)

        vs = Join ()

        while stream:
            v = stream.pop (0)
            if v == ',': break

            if vs:
                if v == '#':
                    if not stream:
                        raise ParserError (
                            "field %s: unexpected #" % k, start)
                    vs.append (stream.pop (0))

                else:
                    if isinstance (v, Text):
                        # Give a chance, in case a comma was missing
                        stream.insert (0, v)
                        break

                    raise ParserError (
                        "field %s: missing #" % k, start)
            else:
                vs.append (v)

        field.append ((k, vs))

    return key, field


def _on_open (fd, ctx):
    """ Called at the opening of a record.

    Reads the whole record from fd and returns the pair (ST_OUT, Record).
    ctx.rectype provides the type of the record, and is reset to None
    once the record is complete. Raises ParserError on invalid syntax;
    the EndOfFile raised by fd when the input stops in the middle of the
    record is not caught here. """

    assert ctx.rectype is not None

    items, start, rest = _read_balanced (fd)

    # Do not lose what follows the record on its last line (another
    # record, a comment...): hand it back so that it is read again once
    # the parser is out of the record. A blank remainder is dropped.
    if rest.strip ():
        fd.unreadline (rest)

    key, fields = _build_fields (_split_tokens (items), start)

    rec = Record (ctx.rectype, key, fields)

    ctx.rectype = None

    return ST_OUT, rec


# Function in charge of each state of the parser. Each one is called with
# the line reader and the context, and returns the next state together
# with the data it has extracted.
_fstm = {
    ST_OUT: _on_out,
    ST_OPEN: _on_open,
    }

def read (fd, charset = 'utf-8'):
    """ Iterate over the content of the BibTeX file fd.

    The lines read from fd are decoded with charset. The items yielded
    are the values returned by the state functions, in file order; when
    a state function returns a list, its elements are yielded one by
    one. """

    ctx = Context ()
    source = Cache (fd, charset)

    state = ST_OUT

    while state != ST_DONE:
        handler = _fstm [state]
        state, data = handler (source, ctx)

        if data is None:
            continue

        if type (data) is list:
            for item in data:
                yield item
        else:
            yield data