1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 from gettext import gettext as _
22
23 from Pyblio import Callback, Store, Attribute
24
25
27
""" Generic Parser for 'tagged' records, to be derived by actual
parsers. A subclass needs to override at least the
self.line_handler () method, which generates events by calling
self.push (). The parser itself takes care of the general state
bookkeeping. """
33
34
# Event codes understood by next (). Line handlers queue them with
# push (); EV_DONE is queued by next () itself once EV_FILE_END has
# been processed.
(EV_RECORD_START,
 EV_RECORD_END,
 EV_FIELD_START,
 EV_FIELD_DATA,
 EV_FIELD_END,
 EV_FILE_END,
 EV_DONE,
 EV_METADATA) = range(8)

# Values taken by self.state while events are being consumed.
(ST_IN_RECORD,
 ST_IN_FIELD,
 ST_OUTSIDE) = range(3)
42
43
def __init__(self, fd, charset='UTF-8'):
    """Create a new parser for a file containing 'tagged' records.

    fd is the object lines are read from (through fd.readline ()), and
    charset is the encoding self.field_handler () uses to decode field
    values.
    """
    # Input source, and the number of the last line read from it.
    self._fd, self._ln = fd, 0
    self._charset = charset

    # Lines put back by unread (), and events queued by push ().
    self._stack, self._evstack = [], []
    self._started = False

    # No record has been opened yet.
    self.state = self.ST_OUTSIDE

    # Give the subclass its start-of-file hook once everything is set up.
    self.file_start()
60
61
def file_start(self):
    """Hook called before the first record is parsed.

    It is invoked at the end of __init__ (); the default implementation
    does nothing. Override me.
    """
    # NOTE(review): the def line is not visible in this view; the
    # signature is inferred from the call self.file_start () -- confirm.
67
68
70
def file_stop(self):
    """Hook called after the last record has been parsed.

    next () invokes it while handling EV_DONE, right before returning
    None; the default implementation does nothing. Override me.
    """
    # NOTE(review): the def line is not visible in this view; the
    # signature is inferred from the call self.file_stop () -- confirm.
74
76
def file_stopping(self):
    """Hook called just at the end of file.

    next () invokes it while handling EV_FILE_END, before queueing
    EV_DONE; the default implementation does nothing. Override me.
    """
    # NOTE(review): the def line is not visible in this view; the
    # signature is inferred from the call self.file_stopping () -- confirm.
80
82
def line_handler(self, line, count):
    """Handle one line of input by generating self.push () events.

    line is the text that was read and count its line number. The
    method is called with line == '' when the end of file is reached.
    The default implementation generates no event. Override me.
    """
    # NOTE(review): the def line is not visible in this view; the
    # signature is inferred from self.line_handler (line, count) -- confirm.
    return None
88
89
91
def field_handler(self, tag, value):
    """Transform a single field of a record.

    Returns the pair (tag, value), where an encoded value has been
    decoded with self._charset. A value that is already text is
    returned unchanged: decoding it again would fail (or go through an
    implicit ASCII round trip on Python 2).
    """
    # NOTE(review): the def line is not visible in this view; the
    # signature is inferred from self.field_handler (tag, data) -- confirm.

    # type(u'') is the text type on both Python 2 (unicode) and
    # Python 3 (str).
    if isinstance(value, type(u'')):
        return tag, value

    return tag, value.decode(self._charset)
95
96
def push(self, *ev):
    """Queue a new event for next ().

    The event code comes first, followed by its parameters when it
    takes any:

      - self.EV_RECORD_START
      - self.EV_RECORD_END
      - self.EV_FIELD_START, tag, line
      - self.EV_FIELD_DATA, data
      - self.EV_FIELD_END
      - self.EV_FILE_END
      - self.EV_METADATA, followed by the values next () hands back
        as ('M', args)

    The whole argument tuple is stored as a single entry at the end of
    the event queue.
    """
    event = ev
    self._evstack.append(event)
113
118
122
126
130
134
138
139
def unread(self, line, count):
    """Put a line back, together with its line number.

    The pair is stored on self._stack, from which self._pop () takes
    its next result before reading anything new from the file.
    """
    pushed_back = (line, count)
    self._stack.append(pushed_back)
147
148
150
def next(self):
    """Return the next item parsed from the file, or None at the end.

    Events are taken one at a time from self._ev_pop () and checked
    against self.state until a complete item is available:

      ('D', [(line, tag, value), ...])
          a complete record; every field is made of the line given
          with EV_FIELD_START followed by the result of
          self.field_handler (tag, data)

      ('M', args)
          the tuple of values that were pushed along with EV_METADATA

    None is returned once EV_DONE has been processed, after
    self.file_stop () has been called.

    Raises SyntaxError when the events do not form a proper sequence
    of records and fields (nested or dangling record/field, metadata
    inside a record, end of file inside a record).
    """
    # NOTE(review): the def line is not visible in this view; the name
    # is inferred from the call self.parser.next () -- confirm.

    record = []

    # (tag, line, chunks) of the field being read in *this* call, or
    # None. Kept next to self.state so that a stray EV_FIELD_DATA or
    # EV_FIELD_END is reported as a SyntaxError instead of hitting an
    # unbound local.
    field = None

    while True:
        event = self._ev_pop()
        kind, args = event[0], event[1:]

        if kind == self.EV_FIELD_DATA:
            if self.state != self.ST_IN_FIELD or field is None:
                raise SyntaxError(_('line %d: unexpected field content') % self._ln)

            # Joined once at EV_FIELD_END rather than concatenated here.
            field[2].append(args[0])

        elif kind == self.EV_FIELD_START:
            if self.state == self.ST_IN_FIELD:
                raise SyntaxError(_('line %d: nested field') % self._ln)

            if self.state == self.ST_OUTSIDE:
                raise SyntaxError(_('line %d: field is not in a record') % self._ln)

            self.state = self.ST_IN_FIELD

            tag, start = args
            field = (tag, start, [])

        elif kind == self.EV_FIELD_END:
            if self.state != self.ST_IN_FIELD or field is None:
                raise SyntaxError(_('line %d: unexpected end of field') % self._ln)

            tag, start, chunks = field
            record.append((start,) + self.field_handler(tag, ''.join(chunks)))

            field = None
            self.state = self.ST_IN_RECORD

        elif kind == self.EV_RECORD_START:
            if self.state == self.ST_IN_RECORD:
                raise SyntaxError(_('line %d: nested record') % self._ln)

            self.state = self.ST_IN_RECORD
            record = []

        elif kind == self.EV_RECORD_END:
            if self.state != self.ST_IN_RECORD:
                raise SyntaxError(_('line %d: unexpected end of record') % self._ln)

            self.state = self.ST_OUTSIDE
            return ('D', record)

        elif kind == self.EV_FILE_END:
            # Let the subclass flush what it still holds (it may push
            # more events), then schedule the final EV_DONE behind them.
            self.file_stopping()
            self.push(self.EV_DONE)

        elif kind == self.EV_DONE:
            if self.state != self.ST_OUTSIDE:
                raise SyntaxError(_('line %d: unexpected end of file') % self._ln)

            self.file_stop()
            return None

        elif kind == self.EV_METADATA:
            if self.state != self.ST_OUTSIDE:
                raise SyntaxError(_('line %d: metadata in the middle of a record') % self._ln)

            return ('M', args)

        # Any other event code is ignored.
224
226
def _ev_pop(self):
    """Parse enough lines to get the next event, and return it.

    Queued events are handed out oldest first. While the queue is
    empty, one more line is taken from self._pop () and given to
    self.line_handler (); when that line is '' (end of file), an
    EV_FILE_END event is queued after whatever the handler pushed.
    """
    # NOTE(review): the def line is not visible in this view; the
    # signature is inferred from the call self._ev_pop () -- confirm.
    while not self._evstack:
        text, lineno = self._pop()
        self.line_handler(text, lineno)

        if text == '':
            self.push(self.EV_FILE_END)

    return self._evstack.pop(0)
243
244
246
def _pop(self):
    """Return a line from the file with its line number.

    Lines put back with unread () are served first, most recent one
    first, along with the number they were put back with. Otherwise
    the line counter is advanced and a fresh line is read from the
    file.
    """
    # NOTE(review): the def line is not visible in this view; the
    # signature is inferred from the call self._pop () -- confirm.
    if self._stack:
        return self._stack.pop()

    self._ln += 1
    text = self._fd.readline()
    return text, self._ln
259
260
261
262 -class Reader(Callback.Publisher):
263
264 Parser = None
265
def parse(self, fd, db, charset='UTF-8'):
    """Parse everything fd contains, on behalf of db.

    A self.Parser (fd, charset) instance is created and kept in
    self.parser, db is kept in self.db, and the parser is drained:
    'D' items go to self.record_parse (), 'M' items go to
    self.metadata_parse (), anything else is ignored. 'file-start' is
    emitted before the first item is requested and 'file-stop' after
    the parser has returned None.
    """
    self.parser = self.Parser(fd, charset)
    self.db = db

    self.emit('file-start')

    item = self.parser.next()
    while item is not None:
        kind, payload = item

        if kind == 'D':
            self.record_parse(payload)
        elif kind == 'M':
            self.metadata_parse(payload)

        item = self.parser.next()

    self.emit('file-stop')
284
288
289
293
297
299
def record_parse(self, record):
    """Build a Store.Record from one parsed record and add it to self.db.

    record is a sequence of (line, tag, data) fields. Each field is
    given to the method named do_<tag> (with every '-' of the tag
    turned into '_'), called as handler (line, tag, data). When there
    is no such method, do_default is used instead; when that is
    missing too, a 'warning' is emitted and the field is skipped.

    self.record_begin () is called before the first field and
    self.record_end () after the last one. If self.record is not None
    at that point, it is added to self.db and 'record-added' is emitted
    with the key returned by self.db.add (). self.record is None when
    the method returns.
    """
    # NOTE(review): the def line is not visible in this view; the
    # signature is inferred from self.record_parse (record) -- confirm.

    self.record = Store.Record()

    self.record_begin()

    for line, tag, data in record:
        handler = getattr(self, 'do_%s' % tag.replace('-', '_'), None)

        if handler is None:
            handler = getattr(self, 'do_default', None)

        if handler is None:
            # Translate the template first and fill it in afterwards:
            # an already formatted message is never found in the
            # translation catalog.
            self.emit('warning',
                      _('line %d: unhandled tag %s') % (line, repr(tag)))
            continue

        handler(line, tag, data)

    self.record_end()

    if self.record is not None:
        key = self.db.add(self.record)
        self.emit('record-added', key)

    self.record = None
334
def text_add(self, field, value):
    """Add value to field of the record being built, as Attribute.Text.

    The call is forwarded to self.record.add () with Attribute.Text as
    its third argument.
    """
    current = self.record
    current.add(field, value, Attribute.Text)
338
339 - def id_add (self, field, value):
342
346