1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 import string, re, logging
22
23 from xml.sax.saxutils import escape, quoteattr
24
25 from Pyblio import Attribute, Store, Exceptions, Tools, Compat
26
27 from gettext import gettext as _
28
29
31
32
33 log = logging.getLogger('pyblio.import.xmlmarc')
34
35
39
43
47
51
def parse(self, fd, db):
    """Read MARC XML records from *fd* and store them in *db*.

    The stream is parsed incrementally.  Each completed ``record``
    element, either bare or in the MARC21 "slim" namespace, becomes a new
    ``Store.Record`` kept in ``self.record`` while it is being filled:

    * ``self.record_begin()`` is called first;
    * every control field is passed to ``self.do_control(tag, text)``;
    * every data field is passed, with its two indicators and the list
      of its ``(code, text)`` subfields, to ``self.do_NNN`` (NNN being
      the zero-padded tag) when such a method exists, and to
      ``self.do_default`` otherwise;
    * ``self.record_end()`` is called last.

    If ``self.record`` is not ``None`` at that point, it is added to *db*.

    Returns the result set, created with ``db.rs.new()``, that holds the
    keys of the added records.
    """
    self.db = db

    imported = db.rs.new()
    imported.name = _('Imported from XML MARC')

    # Names of the (control field, data field, subfield) elements, indexed
    # by the name of the record element that contains them.
    children = {
        'record': ('controlfield', 'datafield', 'subfield'),

        '{http://www.loc.gov/MARC21/slim}record': (
            '{http://www.loc.gov/MARC21/slim}controlfield',
            '{http://www.loc.gov/MARC21/slim}datafield',
            '{http://www.loc.gov/MARC21/slim}subfield',
        ),
    }

    for _event, node in Compat.ElementTree.iterparse(fd, events=('end',)):
        names = children.get(node.tag)
        if names is None:
            # Not a record element: nothing to do at this level.
            continue
        controlfield, datafield, subfield = names

        self.record = Store.Record()
        self.record_begin()

        for control in node.findall(controlfield):
            self.do_control(int(control.attrib['tag']), control.text)

        for field in node.findall(datafield):
            attributes = field.attrib
            tag = int(attributes['tag'])
            ind1 = attributes['ind1']
            ind2 = attributes['ind2']

            # An empty subfield has no text node: hand over '' instead.
            values = []
            for sub in field.findall(subfield):
                values.append((sub.attrib['code'], sub.text or ''))

            handler = getattr(self, 'do_%03d' % tag, self.do_default)
            handler(tag, ind1, ind2, values)

        self.record_end()

        if self.record is not None:
            imported.add(self.db.add(self.record))

        # The record is processed: drop the content of its element.
        node.clear()

    return imported
100
101
103
104 _date_re = re.compile (r'(.*)(\d{4,})')
105
118
def parse(self, fd, db):
    """Build the field dispatch table, then run the generic parser.

    ``self._mapping`` is rebuilt from scratch: each key of
    ``self._logical`` is associated with a ``(field, handler)`` pair.
    The handler is ``self.skip`` when the logical field is ``None``;
    otherwise it is the entry of ``self._physical`` for the type of the
    schema attribute ``db.schema[field]``.

    Returns whatever ``Reader.parse`` returns for *fd* and *db*.
    """
    table = {}
    self._mapping = table

    for marc_key, field in self._logical.items():
        if field is None:
            handler = self.skip
        else:
            handler = self._physical[db.schema[field].type]

        table[marc_key] = (field, handler)

    return Reader.parse(self, fd, db)
135
136 - def skip (self, field, value):
139
141
142 f = self.record.get (field, [])
143
144
145 d = self._date_re.match (value)
146
147 if d is None:
148 raise Exceptions.ParserError ('unknown date %s' % `value`)
149
150 year = int (d.group (2))
151
152 f.append (Attribute.Date (year = year))
153
154 self.record [field] = f
155 return
156
def id_add(self, field, value):
    """Append *value*, wrapped in an ``Attribute.ID``, to *field*.

    The list stored under *field* in the current record is extended, or
    created when the record has no such field yet.
    """
    entries = self.record.get(field, [])
    entries.append(Attribute.ID(value))

    # Store the list back: it is a new one when the field was missing.
    self.record[field] = entries
164
def text_add(self, field, value):
    """Append *value*, wrapped in an ``Attribute.Text``, to *field*.

    The list stored under *field* in the current record is extended, or
    created when the record has no such field yet.
    """
    entries = self.record.get(field, [])
    entries.append(Attribute.Text(value))

    # Store the list back: it is a new one when the field was missing.
    self.record[field] = entries
172
def url_add(self, field, value, q=None):
    """Append *value*, wrapped in an ``Attribute.URL``, to *field*.

    field -- name of the field of the current record to extend; the
             field is created when the record does not have it yet.
    value -- passed as is to ``Attribute.URL``.
    q     -- optional mapping merged into the ``q`` attribute of the new
             URL object.  Defaults to no extra entry.
    """
    # The default is None rather than {}: a dictionary default would be
    # a single object shared by every call of the method.
    entries = self.record.get(field, [])
    url = Attribute.URL(value)

    if q is not None:
        url.q.update(q)

    entries.append(url)

    # Store the list back: it is a new one when the field was missing.
    self.record[field] = entries
183
185 f = self.record.get (field, [])
186
187 parts = map (string.strip, value.split (','))
188 if len (parts) == 1:
189 f.append (Attribute.Person (last = parts [0]))
190 elif len (parts) == 2:
191 f.append (Attribute.Person (last = parts [0],
192 first = parts [1]))
193 else:
194 raise Exceptions.ParserError (_('unsupported author syntax: %s') %
195 `value`)
196
197 self.record [field] = f
198 pass
199
200
201 - def do_unknown (self, tag, ind1, ind2, key, value):
205
206
208
def do_default(self, tag, ind1, ind2, values):
    """Dispatch the subfields of a data field through ``self._mapping``.

    tag, ind1, ind2 -- tag and indicators of the data field.
    values          -- sequence of ``(code, text)`` subfield pairs.

    Each subfield is looked up under ``(tag, ind1, ind2, code)``.  When
    the key is known, the mapped handler is called with the mapped field
    name and the subfield text; otherwise the subfield is handed to
    ``self.do_unknown``.
    """
    for key, value in values:
        # Only the lookup is guarded, so that a KeyError raised inside
        # the handler is not mistaken for an unknown field.
        try:
            field, fn = self._mapping[(tag, ind1, ind2, key)]
        except KeyError:
            self.do_unknown(tag, ind1, ind2, key, value)
        else:
            fn(field, value)
218
220
def do_control(self, field, value):
    """Dispatch a control field through ``self._mapping``.

    field -- key of the control field in ``self._mapping``.
    value -- content of the control field.

    When the key is known, the mapped handler is called with the mapped
    field name and *value*.  Control fields without a mapping are
    ignored.
    """
    # Only the lookup is guarded, so that a KeyError raised inside the
    # handler is reported instead of being silently dropped.
    try:
        field, fn = self._mapping[field]
    except KeyError:
        return

    fn(field, value)
229
231
232
233 log = logging.getLogger('pyblio.export.xmlmarc')
234
235
236 _re_marc = re.compile ('(\d{3,})(\w)(\w)')
237
239
240 self.fd.write (' <record>\n')
241 self._fields = {}
242 self._control = {}
243 return
244
246
247 ks = self._control.keys ()
248 ks.sort ()
249
250 for k in ks:
251 data = self._control [k]
252 if not data: continue
253
254 self.fd.write (' <controlfield tag="%s">%s</controlfield>\n' % (
255 k, data.encode ('utf-8')))
256
257 ks = self._fields.keys ()
258 ks.sort ()
259
260 for k in ks:
261 data = self._fields [k]
262
263 r = self._re_marc.match (k)
264
265 if r is None:
266 raise SyntaxError ('invalid MARC code: %s' % `k`)
267
268 tag, ind1, ind2 = r.groups ((1, 2, 3, 4))
269
270 if ind1 == '_': ind1 = ''
271 if ind2 == '_': ind2 = ''
272
273 for kval in data:
274 self.fd.write (' <datafield tag="%s" ind1="%s" ind2="%s">\n' % (
275 tag, ind1, ind2))
276
277 for sub, values in kval.items ():
278
279 for value in values:
280 if not value: continue
281
282 self.fd.write (' <subfield code="%s">%s</subfield>\n' % (
283 sub, escape (value.encode ('utf-8'))))
284
285 self.fd.write (' </datafield>\n')
286
287 self.fd.write (' </record>\n')
288 return
289
def single(self, rec, field):
    """Return the first value of *field* in *rec*, or ``None``.

    rec   -- mapping from field names to lists of values.
    field -- name of the field to read.

    ``None`` is returned both when the field is missing and when it is
    present but holds no value, instead of failing on the empty list.
    """
    values = rec.get(field)

    if not values:
        return None

    return values[0]
293
def add(self, code, **kval):
    """Queue one occurrence of the data field *code* for the record.

    code -- key under which the occurrence is stored in ``self._fields``.
    kval -- one keyword per subfield, giving either a single value or a
            list/tuple of values.  One leading underscore is removed
            from the keyword (presumably so that codes which are not
            valid identifiers, such as digits, can be passed as
            keywords -- confirm with the callers).

    Empty values are discarded, and so are subfields left without any
    value.  Nothing is stored when no subfield remains.
    """
    # Build a new dictionary instead of editing kval in place: changing
    # the size of a dictionary while iterating over it is an error when
    # items() is a view.
    subfields = {}

    for name, values in kval.items():
        if not isinstance(values, (list, tuple)):
            values = [values]

        # Keep the non-empty values only.
        values = [v for v in values if v]
        if not values:
            continue

        if name.startswith('_'):
            name = name[1:]

        subfields[name] = values

    if not subfields:
        return

    self._fields.setdefault(code, []).append(subfields)
321
323 self._control ["%03d" % int (code)] = val
324 return
325
329
330
def write(self, fd, rs, db):
    """Write the records of *rs* to *fd* as an XML ``collection``.

    fd -- output file object; kept in ``self.fd``.
    rs -- set of records; its ``itervalues()`` provides the records.
    db -- database the records belong to; kept in ``self.db``.

    The XML declaration and the opening tag are written first, then
    ``self.record_parse`` is called on each record, and the closing tag
    is written last.
    """
    self.fd = fd
    self.db = db

    fd.write('<?xml version="1.0" encoding="UTF-8"?>\n'
             '<collection>\n')

    for record in rs.itervalues():
        self.record_parse(record)

    fd.write('</collection>\n')
348