1
2
3
4 """Implement Martel parsers.
5
6 The classes in this module are used by other Martel modules and not
7 typically by external users.
8
9 There are two major parsers, 'Parser' and 'RecordParser'. The first
10 is the standard one, which parses the file as one string in memory
11 then generates the SAX events. The other reads a record at a time
12 using a RecordReader and generates events after each read. The
13 generated event callbacks are identical.
14
15 At some level, both parsers use "_do_callback" to convert mxTextTools
16 tags into SAX events.
17
18 XXX finish this documentation
19
20 XXX need a better way to get closer to the likely error position when
21 parsing.
22
23 XXX need to implement Locator
24
25 """
26
27 import urllib, pprint, traceback, sys, string
28 from xml.sax import xmlreader, _exceptions, handler, saxutils
29
30 try:
31 from mx import TextTools
32 except ImportError:
33 import TextTools
34
35 try:
36 from cStringIO import StringIO
37 except ImportError:
38 from StringIO import StringIO
39
40 import Dispatch
41
42
44 """used when a parse cannot be done"""
46 self._msg += "; in %s" % repr(text)
47
55 self.pos += offset
56 self._msg = "error parsing at or beyond character %d" % self.pos
57 return self
58
66
68 """used by the RecordParser when it can't read a record"""
69 pass
70
71
72
73
74
75
76
77
78
79
80 _match_group = {}
81
82
83
84
85
86
99 if type(key) == type(0):
100 raise IndexError, key
101 else:
102 raise KeyError, key
111 - def get(self, key, alternative):
117
118
119 _attribute_list = MartelAttributeList([])
120
121
def _do_callback(s, begin, end, taglist, cont_handler, attrlookup):
    """internal function to convert the tagtable into ContentHandler events

    's' is the input text
    'begin' is the current position in the text
    'end' is 1 past the last position of the text allowed to be parsed
    'taglist' is the tag list from mxTextTools.parse; each entry is a
        (tag, left, right, subtags) tuple
    'cont_handler' is the SAX ContentHandler
    'attrlookup' is a dict mapping the encoded tag name to the element info,
        stored as a (real tag name, attributes) pair

    Text in s[begin:end] that is not covered by any tag is sent through
    characters().  Three kinds of tag names are handled:
      - names not starting with ">" produce startElement/endElement using
        the tag name itself and the module-level '_attribute_list';
      - names starting with ">G" are translated through 'attrlookup' and
        produce startElement/endElement with the real name and attributes;
      - ">ignore" produces no element events, but its text (or its
        subtags) is still reported.
    Nothing is returned; all output goes to 'cont_handler'.
    """
    # Bind the handler methods once; they are called for every tag.
    characters = cont_handler.characters
    startElement = cont_handler.startElement
    endElement = cont_handler.endElement

    for tag, left, right, subtags in taglist:
        # Anything between the current position and the start of this tag
        # is plain character data.
        assert begin <= left, "begin = %d and l = %d" % (begin, left)
        if begin < left:
            characters(s[begin:left])

        is_special = tag.startswith(">")
        if not is_special:
            # Ordinary tag: the tag name is the element name.
            startElement(tag, _attribute_list)
        elif not tag == ">ignore":
            assert tag.startswith(">G"), "Unknown special tag %s" % repr(tag)
            # Encoded tag: fetch the real element name and its attributes.
            realtag, attrs = attrlookup[tag]
            startElement(realtag, attrs)

        # Descend into the children if there are any; otherwise the whole
        # matched span is character data.
        if subtags:
            _do_callback(s, left, right, subtags, cont_handler, attrlookup)
        else:
            characters(s[left:right])
        begin = right

        if not is_special:
            endElement(tag)
        elif tag.startswith(">G"):
            realtag, attrs = attrlookup[tag]
            endElement(realtag)

    # Whatever follows the last tag, up to the end of the allowed range,
    # is character data as well.
    if begin < end:
        characters(s[begin:end])
177
def _do_dispatch_callback(s, begin, end, taglist,
                          start_table_get, cont_handler, save_stack,
                          end_table_get,
                          attrlookup):
    """internal function to convert the tagtable into ContentHandler events

    THIS IS A SPECIAL CASE FOR Dispatch.Dispatcher objects

    's' is the input text
    'begin' is the current position in the text
    'end' is 1 past the last position of the text allowed to be parsed
    'taglist' is the tag list from mxTextTools.parse; each entry is a
        (tag, left, right, subtags) tuple
    'start_table_get' is the 'get' method of the Dispatcher's start table;
        given an element name it returns the start function or None
    'cont_handler' is the Dispatcher; matched text is appended to its
        '_save_text' attribute
    'save_stack' is the Dispatcher's save stack; text is only collected
        while it is true
    'end_table_get' is the 'get' method of the Dispatcher's end table;
        given an element name it returns the end function or None
    'attrlookup' is a dict mapping the encoded tag name to the element info,
        stored as a (real tag name, attributes) pair

    Start functions are called as f(name, attrs) and end functions as
    f(name).  Tags with no registered function are passed over silently.
    Nothing is returned.
    """
    for tag, left, right, subtags in taglist:
        # Text between the current position and this tag.  'save_stack' is
        # tested afresh each time, never cached: it is the handler's own
        # object and the functions called below may change it.
        assert begin <= left, "begin = %d and l = %d" % (begin, left)
        if begin < left and save_stack:
            cont_handler._save_text += s[begin:left]

        # Try the tag name directly first.  Encoded names (the keys of
        # 'attrlookup') are not expected in the start table, so they fall
        # through to the lookup below.
        func = start_table_get(tag)
        if func is not None:
            func(tag, _attribute_list)
        else:
            info = attrlookup.get(tag)
            if info is not None:
                realtag, attrs = info
                # Is there a start function for the real element name?
                func = start_table_get(realtag)
                if func is not None:
                    func(realtag, attrs)

        # Descend into the children if there are any; otherwise collect
        # the matched text when saving is active.
        if subtags:
            _do_dispatch_callback(s, left, right, subtags,
                                  start_table_get,
                                  cont_handler, save_stack,
                                  end_table_get,
                                  attrlookup)
        elif save_stack:
            cont_handler._save_text += s[left:right]
        begin = right

        # Same two-step search for the end function.
        func = end_table_get(tag)
        if func is not None:
            func(tag)
        else:
            info = attrlookup.get(tag)
            if info is not None:
                realtag, attrs = info
                func = end_table_get(realtag)
                if func is not None:
                    func(realtag)

    # Text after the last tag, up to the end of the allowed range.
    if begin < end and save_stack:
        cont_handler._save_text += s[begin:end]
255
257 """parse the string with the tagtable and send the ContentHandler events
258
259 Specifically, it sends the startElement, endElement and characters
260 events but not startDocument and endDocument.
261 """
262 if debug_level:
263 import Generate
264 Generate._position = 0
265
266 result, taglist, pos = TextTools.tag(s, tagtable, 0, len(s))
267
268
269
270 if isinstance(cont_handler, Dispatch.Dispatcher):
271 _do_dispatch_callback(s, 0, pos, taglist,
272 cont_handler._start_table.get,
273 cont_handler, cont_handler._save_stack,
274 cont_handler._end_table.get,
275 attrlookup)
276 elif cont_handler.__class__ != handler.ContentHandler:
277
278 _do_callback(s, 0, pos, taglist, cont_handler, attrlookup)
279
280 if not result:
281 if debug_level:
282 return ParserPositionException(Generate._position)
283 else:
284 return ParserPositionException(pos)
285 elif pos != len(s):
286 return pos
287 else:
288 return None
289
290
291 -class Parser(xmlreader.XMLReader):
292 """Parse the input data all in memory"""
293
294 - def __init__(self, tagtable, (want_groupref_names, debug_level, attrlookup) = (0, 1, {})):
295 xmlreader.XMLReader.__init__(self)
296
297 assert type(tagtable) == type( () ), "mxTextTools only allows a tuple tagtable"
298 self.tagtable = tagtable
299
300
301
302
303 self.want_groupref_names = want_groupref_names
304
305 self.debug_level = debug_level
306 self.attrlookup = attrlookup
307
309 parser = Parser(self.tagtable, (self.want_groupref_names,
310 self.debug_level, self.attrlookup))
311 parser.setContentHandler(self.getContentHandler())
312 parser.setErrorHandler(self.getErrorHandler())
313 parser.setDTDHandler(self.getDTDHandler())
314 return parser
315
317 x = StringIO()
318 pprint.pprint(self.tagtable, x)
319 return x.getvalue()
320
322 """parse using the input file object
323
324 XXX will be removed with the switch to Python 2.0, where parse()
325 takes an 'InputSource'
326 """
327
328 self.parseString(fileobj.read())
329
330 - def parse(self, source):
334
336 """parse using the given string
337
338 XXX will be removed with the switch to Python 2.0, where parse()
339 takes an 'InputSource'
340 """
341 self._cont_handler.startDocument()
342
343 if self.want_groupref_names:
344 _match_group.clear()
345
346
347 result = _parse_elements(s, self.tagtable, self._cont_handler,
348 self.debug_level, self.attrlookup)
349
350 if result is None:
351
352 pass
353
354 elif isinstance(result, _exceptions.SAXException):
355
356 self._err_handler.fatalError(result)
357
358 else:
359
360 pos = result
361 self._err_handler.fatalError(ParserIncompleteException(pos))
362
363
364 self._cont_handler.endDocument()
365
368
370 """Parse the input data a record at a time"""
371 - def __init__(self, format_name, attrs, record_tagtable,
372 (want_groupref_names, debug_level, attrlookup),
373 make_reader, reader_args = ()):
374 """parse the input data a record at a time
375
376 format_name - XML tag name for the whole data file
377 record_tagtable - mxTexTools tag table for each record
378 want_groupref_names - flag to say if the match_group table needs to
379 be reset (will disappear with better support from mxTextTools)
380
381 make_reader - callable object which creates a RecordReader; first
382 parameter will be an input file object
383 reader_args - optional arguments to pass to make_reader after the
384 input file object
385 """
386 xmlreader.XMLReader.__init__(self)
387
388 self.format_name = format_name
389 self.attrs = attrs
390 assert type(record_tagtable) == type( () ), \
391 "mxTextTools only allows a tuple tagtable"
392 self.tagtable = record_tagtable
393 self.want_groupref_names = want_groupref_names
394 self.debug_level = debug_level
395 self.attrlookup = attrlookup
396 self.make_reader = make_reader
397 self.reader_args = reader_args
398
400 parser = RecordParser(self.format_name, self.attrs, self.tagtable,
401 (self.want_groupref_names, self.debug_level,
402 self.attrlookup),
403 self.make_reader, self.reader_args)
404 parser.setContentHandler(self.getContentHandler())
405 parser.setErrorHandler(self.getErrorHandler())
406 parser.setDTDHandler(self.getDTDHandler())
407 return parser
408
409
411 x = StringIO()
412 pprint.pprint(self.tagtable, x)
413 return "parse records: " + x.getvalue()
414
416 """parse using the input file object
417
418 XXX will be removed with the switch to Python 2.0, where parse()
419 takes an 'InputSource'
420 """
421 self._cont_handler.startDocument()
422
423 try:
424 reader = self.make_reader( *(fileobj,) + self.reader_args)
425 except (KeyboardInterrupt, SystemExit):
426 raise
427 except:
428
429
430 outfile = StringIO()
431 traceback.print_exc(file=outfile)
432 self._err_handler.fatalError(ParserRecordException(
433 outfile.getvalue(), sys.exc_info()[1]))
434 self._cont_handler.endDocument()
435 return
436
437 if self.want_groupref_names:
438 _match_group.clear()
439
440 self._cont_handler.startElement(self.format_name, self.attrs)
441 filepos = 0
442 while 1:
443 try:
444 record = reader.next()
445 except (KeyboardInterrupt, SystemExit):
446 raise
447 except:
448
449
450 outfile = StringIO()
451 traceback.print_exc(file=outfile)
452 self._err_handler.fatalError(ParserRecordException(
453 outfile.getvalue(), sys.exc_info()[1]))
454 self._cont_handler.endDocument()
455 return
456
457 if record is None:
458 break
459 result = _parse_elements(record, self.tagtable, self._cont_handler,
460 self.debug_level, self.attrlookup)
461
462 if result is None:
463
464 pass
465 elif isinstance(result, _exceptions.SAXException):
466
467 result += filepos
468 self._err_handler.error(result)
469 else:
470
471 pos = filepos + result
472 self._err_handler.error(ParserPositionException(pos))
473
474 filepos = filepos + len(record)
475
476 self._cont_handler.endElement(self.format_name)
477 self._cont_handler.endDocument()
478
479 - def parse(self, source):
483
485 """parse using the given string
486
487 XXX will be removed with the switch to Python 2.0, where parse()
488 takes an 'InputSource'
489 """
490
491 strfile = StringIO(s)
492 self.parseFile(strfile)
493
496
830