1
2
3
4 """Iterate over records of an XML parse tree.
5
6 The standard parser is callback based over all the elements of a file.
7 If the file contains records, many people would like to be able to
8 iterate over each record and only use the callback parser to analyze
9 the record.
10
11 If the expression is a 'ParseRecords', then the code to do this is
12 easy; use its make_reader to grab records and its record_expression to
13 parse them. However, this isn't general enough. The use of a
14 ParseRecords in the format definition should be strictly an
15 implementation decision for better memory use. So there needs to be
16 an API which allows both full and record oriented parsers.
17
18 Here's an example use of the API:
19 >>> import sys
20 >>> import swissprot38 # one is in Martel/test/testformats
21 >>> from xml.dom import pulldom
22 >>> iterator = swissprot38.format.make_iterator("swissprot38_record")
23 >>> text = open("sample.swissprot").read()
24 >>> for record in iterator.iterateString(text, pulldom.SAX2DOM()):
25 ... print "Read a record with the following AC numbers:"
26 ... for acc in record.document.getElementsByTagName("ac_number"):
27 ... acc.writexml(sys.stdout)
28 ... sys.stdout.write("\n")
29 ...
30
31
32 There are several parts to this API. First is the 'Iterator
33
34 There are two parts to the API. One is the EventStream. This
35 contains a single method called "next()" which returns a list of SAX
36 events as 2-tuples of (event_name, args). It is called multiple times
37 to return successive event lists and returns None if no events are
38 available.
39
40 The other is the Iterator, whose iterate methods yield one parsed record at a time.
41
42 Sean McGrath has a RAX parser (Record API for XML) which uses a
43 concept similar to this.
44 """
45
46
47 import sys, urllib, traceback
48 from xml.sax import saxutils
49 import Parser
50 try:
51 from cStringIO import StringIO
52 except ImportError:
53 from StringIO import StringIO
54
61
66
68 self.events.append( ("startElement", args) )
69
70
71
73 self.events.append( ("endElement", args) )
74
76 self.has_error = 1
77 self.events.append( ("error", args) )
79 self.has_error = 1
80 self.events.append( ("fatalError", args) )
81
84 self.events = event_list
86 if self.events:
87 x = self.events
88 self.events = None
89 return x
90 return None
91
94 self.parser = parser
95 self.tag = tag
96
98 """create an iterator over a string"""
99 events = StoreEvents()
100 self.parser.setContentHandler(events)
101 self.parser.setErrorHandler(events)
102 self.parser.parseString(s)
103 return Iterate(self, EventStream(events.events), self.tag,
104 cont_handler)
105
108
109 - def iterate(self, source, cont_handler = None):
114
117 self.reader = reader
118 self.parser = parser
120 text = self.reader.next()
121 if text is None:
122 return None
123 events = StoreEvents()
124 self.parser.setContentHandler(events)
125 self.parser.setErrorHandler(events)
126 self.parser.parseString(text)
127 return events.events
128
def __init__(self, record_parser, make_reader, reader_args, marker_tag):
    """Remember how to cut the input into records and how to parse each one.

    record_parser -- parser given to the per-record event stream
    make_reader   -- callable invoked as make_reader(fileobj, *reader_args)
    reader_args   -- tuple of extra positional arguments for make_reader
    marker_tag    -- element name passed on to Iterate as the record tag
    """
    # How to build the reader that splits the input into records.
    self.make_reader = make_reader
    self.reader_args = reader_args
    # How to parse one record and recognise it in the event stream.
    self.record_parser = record_parser
    self.marker_tag = marker_tag
135
138
141
143 record_reader = self.make_reader(
144 *(fileobj,) + self.reader_args)
145 return Iterate(self,
146 RecordEventStream(record_reader, self.record_parser),
147 self.marker_tag, cont_handler)
148
149 - def iterate(self, source, cont_handler = None):
154
def _get_next_text(reader):
    """Fetch the next record from *reader* without letting it raise.

    Returns a pair (text, events):
      - (reader.next(), None) when the reader succeeds;
      - (None, [("fatalError", (exc,))]) when the reader raises, where exc
        is a Parser.ParserRecordException built from the formatted
        traceback and the original exception object.

    KeyboardInterrupt and SystemExit are never converted; they propagate.
    """
    try:
        text = reader.next()
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        # Capture the traceback text so it can travel inside the event.
        buf = StringIO()
        traceback.print_exc(file=buf)
        failure = Parser.ParserRecordException(buf.getvalue(),
                                               sys.exc_info()[1])
        return None, [("fatalError", (failure,))]
    return text, None
169
170
289
290
291
317
318
def __init__(self, parent, event_stream, tag, cont_handler=None):
    """Set up forward-only iteration over the records in *event_stream*.

    parent       -- object that receives start_position / end_position
    event_stream -- object whose next() returns a list of events or None
    tag          -- element name that delimits one record
    cont_handler -- handler the record events are replayed into; when
                    omitted a new LAX.LAX() instance is used
    """
    self.parent = parent
    if cont_handler is None:
        # LAX is only imported when the caller supplies no handler.
        import LAX
        cont_handler = LAX.LAX()
    self.cont_handler = cont_handler
    self.tag = tag

    # Event source and the list of events not yet consumed.
    self.event_stream = event_stream
    self.events = None

    # Nothing has been read yet: zero records, zero characters.
    self._n = 0
    self.current_position = 0
    parent.start_position = 0
    parent.end_position = 0
333
def next(self):
    """Return the content handler loaded with the next record, or None.

    Buffered events (refilled from self.event_stream.next() whenever the
    buffer is exhausted) are scanned for a "startElement" whose name is
    self.tag.  Every event from there through the matching "endElement"
    is replayed into self.cont_handler, wrapped in startDocument() /
    endDocument(), and the handler is returned.  Consumed events are
    removed from the buffer so the following call resumes after the
    record.  None is returned once the event stream has no more events.

    self.current_position grows by the length of each "characters"
    payload; self.parent.start_position and self.parent.end_position are
    set to its value at the start and end of the returned record.

    Raises the exception object carried by an "error" or "fatalError"
    event.  If it is a Parser.ParserPositionException its position is
    reset and then offset before raising.  An "error" inside a record
    first skips to the end of that record so iteration can continue;
    errors outside a record and fatal errors discard the buffer.
    Raises AssertionError if a record is opened but neither its
    "endElement" nor an error is found in the buffered events.
    """
    events = self.events
    if not events:
        events = self.event_stream.next()
        if events is None:
            return None
        self.events = events

    i = 0
    n = len(events)

    while 1:
        if i == n:
            # Buffer exhausted between records: pull the next batch.
            new_events = self.event_stream.next()
            if new_events is None:
                break
            events.extend(new_events)
            n = len(events)

        name, args = events[i]
        if name == "error" or name == "fatalError":
            # An error outside of any record: drop everything buffered.
            self.events = None
            if isinstance(args[0], Parser.ParserPositionException):
                exc = args[0]
                exc.pos = 0
                exc += self.current_position
            raise args[0]

        if name == "startElement" and args[0] == self.tag:
            self.parent.start_position = self.current_position
            cont_handler = self.cont_handler
            cont_handler.startDocument()
            while i < n:
                name, args = events[i]
                if name == "characters":
                    # "characters" events carry the text itself rather
                    # than an argument tuple.
                    cont_handler.characters(args)
                    self.current_position += len(args)
                    i += 1
                elif name == "error":
                    # Skip ahead to the end of this record so that the
                    # next call can carry on with the following one.
                    exc = args[0]
                    while i < n:
                        name, args = events[i]
                        if name == "endElement" and args[0] == self.tag:
                            del self.events[:i+1]
                            if isinstance(exc,
                                          Parser.ParserPositionException):
                                exc.pos = 0
                                exc += self.current_position
                            raise exc
                        elif name == "characters":
                            self.current_position += len(args)
                        i += 1

                    # The record never closed: nothing left to resume.
                    self.events = None
                    if isinstance(exc, Parser.ParserPositionException):
                        exc.pos = 0
                        exc += self.parent.start_position
                    raise exc
                elif name == "fatalError":
                    # Fatal errors are never skipped over.
                    self.events = None
                    if isinstance(args[0], Parser.ParserPositionException):
                        exc = args[0]
                        # Reset the position on the exception itself, as
                        # the other error branches do.  Assigning 0 to
                        # the local name instead would leave the raised
                        # exception's position unadjusted.
                        exc.pos = 0
                        exc += self.parent.start_position
                    raise args[0]
                else:
                    getattr(cont_handler, name)(*args)
                    if name == "endElement" and args[0] == self.tag:
                        self.parent.end_position = self.current_position
                        del self.events[:i+1]
                        cont_handler.endDocument()
                        self._n = self._n + 1
                        return cont_handler
                    i += 1

            # Call form of raise works on both Python 2 and Python 3.
            raise AssertionError("no endElement(%s) and no errors?" %
                                 repr(self.tag))
        else:
            if name == "characters":
                self.current_position += len(args)
            i += 1

    # The stream ran dry without starting another record.
    self.events = None
    return None
425
427 assert n == self._n, "forward iteration only"
428 x = self.next()
429 if x is None:
430 raise IndexError, n
431 return x
432
434 return iter(self.next, None)
435