Package Pyblio :: Package Parsers :: Package Syntax :: Module XMLMARC
[hide private]
[frames | no frames]

Source Code for Module Pyblio.Parsers.Syntax.XMLMARC

  1  # This file is part of pybliographer 
  2  #  
  3  # Copyright (C) 1998-2006 Frederic GOBRY 
  4  # Email : gobry@pybliographer.org 
  5  #           
  6  # This program is free software; you can redistribute it and/or 
  7  # modify it under the terms of the GNU General Public License 
  8  # as published by the Free Software Foundation; either version 2  
  9  # of the License, or (at your option) any later version. 
 10  #    
 11  # This program is distributed in the hope that it will be useful, 
 12  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 14  # GNU General Public License for more details.  
 15  #  
 16  # You should have received a copy of the GNU General Public License 
 17  # along with this program; if not, write to the Free Software 
 18  # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. 
 19  #  
 20   
 21  import string, re, logging 
 22   
 23  from xml.sax.saxutils import escape, quoteattr 
 24   
 25  from Pyblio import Attribute, Store, Exceptions, Tools, Compat 
 26   
 27  from gettext import gettext as _ 
 28   
 29   
class Reader(object):
    """Event-driven base reader for MARC XML files.

    ``parse`` walks the ``record`` elements of the input and, for each of
    them, calls the hooks below in this order: ``record_begin``,
    ``do_control`` for every control field, a ``do_NNN`` method (or
    ``do_default`` when no such method exists) for every data field, and
    finally ``record_end``.  All hooks are no-ops in this class; subclasses
    override the ones they need and fill ``self.record``.
    """

    # The official channel in which messages must be sent
    log = logging.getLogger('pyblio.import.xmlmarc')

    def record_begin(self):
        """Hook called right after ``self.record`` has been created."""
        pass

    def record_end(self):
        """Hook called once every field of the current record was handled.

        ``parse`` skips the record when this hook leaves ``self.record``
        set to None.
        """
        pass

    def do_default(self, tag, ind1, ind2, values):
        """Fallback handler for data fields without a ``do_NNN`` method.

        ``tag`` is the integer tag, ``ind1``/``ind2`` the indicator
        attributes and ``values`` a list of ``(code, text)`` pairs, one per
        subfield.
        """
        pass

    def do_control(self, field, value):
        """Handler for control fields: ``field`` is the integer tag."""
        pass

    def parse(self, fd, db):
        """Import every MARC record found in ``fd`` into ``db``.

        Returns the result set created through ``db.rs.new()``, holding the
        key of each record that was added to the database.
        """
        self.db = db

        rs = db.rs.new()
        rs.name = _('Imported from XML MARC')

        # We support both the NS-aware and non-NS aware versions of the MARC file
        subs = {
            'record': ('controlfield', 'datafield', 'subfield'),

            '{http://www.loc.gov/MARC21/slim}record': (
                '{http://www.loc.gov/MARC21/slim}controlfield',
                '{http://www.loc.gov/MARC21/slim}datafield',
                '{http://www.loc.gov/MARC21/slim}subfield'
            )
        }

        for event, elem in Compat.ElementTree.iterparse(fd, events=('end',)):
            # Only completed <record> elements are of interest.
            try:
                controlfield, datafield, subfield = subs[elem.tag]
            except KeyError:
                continue

            self.record = Store.Record()
            self.record_begin()

            # get all the control fields first, then the datafields
            # (as the controlfields can have an impact on the
            # datafields)
            for ctr in elem.findall(controlfield):
                # An empty element has its text set to None: normalise it
                # to '' exactly as is done for the subfields below, so
                # that handlers always receive a string.
                self.do_control(int(ctr.attrib['tag']), ctr.text or '')

            for data in elem.findall(datafield):
                attrs = data.attrib
                tag = int(attrs['tag'])
                ind1 = attrs['ind1']
                ind2 = attrs['ind2']

                values = [(x.attrib['code'], x.text or '')
                          for x in data.findall(subfield)]

                # Prefer a tag-specific handler (do_245, ...) when the
                # subclass provides one.
                fn = getattr(self, 'do_%03d' % tag, self.do_default)
                fn(tag, ind1, ind2, values)

            self.record_end()

            # A hook may have reset self.record to None to drop the record.
            if self.record is not None:
                k = self.db.add(self.record)
                rs.add(k)

            # The record has been handled: drop its parsed content.
            elem.clear()

        return rs
100 101
class SimpleReader(Reader):
    """Reader driven by a mapping from MARC locations to database fields.

    ``mapping`` associates either an integer control field tag or a
    ``(tag, ind1, ind2, code)`` tuple with the name of a field of the
    database schema, or with None to silently skip that location.  The
    function used to store a value is selected from the type of the target
    attribute in the schema.
    """

    # Heuristic used by date_add.  As the first group is greedy, the year
    # is taken from the last window of four consecutive digits in the value.
    _date_re = re.compile(r'(.*)(\d{4,})')

    def __init__(self, mapping):
        self._logical = mapping

        # Attribute type -> method storing a value of that type
        self._physical = {
            Attribute.Text: self.text_add,
            Attribute.URL: self.url_add,
            Attribute.Person: self.person_add,
            Attribute.ID: self.id_add,
            Attribute.Date: self.date_add,
        }

    def parse(self, fd, db):
        """Resolve the mapping against ``db.schema``, then parse ``fd``.

        Raises KeyError when a mapped field is not in the schema or when its
        attribute type has no storage method in this class.
        """
        self._mapping = {}

        for k, v in self._logical.items():
            if v is None:
                self._mapping[k] = (v, self.skip)
                continue

            attribute = db.schema[v]
            self._mapping[k] = (v, self._physical[attribute.type])

        return Reader.parse(self, fd, db)

    def skip(self, field, value):
        """Storage method for locations mapped to None: ignore the value."""
        pass

    def date_add(self, field, value):
        """Append the year found in ``value`` to ``field`` as a Date.

        Raises Exceptions.ParserError when no year can be found.
        """
        f = self.record.get(field, [])

        # heuristic to match a date; a missing value simply does not match
        d = self._date_re.match(value or '')

        if d is None:
            raise Exceptions.ParserError('unknown date %s' % repr(value))

        year = int(d.group(2))

        f.append(Attribute.Date(year=year))

        self.record[field] = f

    def id_add(self, field, value):
        """Append ``value`` to ``field`` as an ID attribute."""
        f = self.record.get(field, [])
        f.append(Attribute.ID(value))

        self.record[field] = f

    def text_add(self, field, value):
        """Append ``value`` to ``field`` as a Text attribute."""
        f = self.record.get(field, [])
        f.append(Attribute.Text(value))

        self.record[field] = f

    def url_add(self, field, value, q=None):
        """Append ``value`` to ``field`` as a URL attribute.

        ``q`` is an optional dictionary of qualifiers merged into the
        attribute's ``q`` mapping.
        """
        f = self.record.get(field, [])
        attrib = Attribute.URL(value)
        # TODO: all types can have qualifiers....
        if q is not None:
            attrib.q.update(q)

        f.append(attrib)

        self.record[field] = f

    def person_add(self, field, value):
        """Append the person named in ``value`` to ``field``.

        Accepts ``Last`` or ``Last, First``; any other number of
        comma-separated parts raises Exceptions.ParserError.
        """
        f = self.record.get(field, [])

        parts = [part.strip() for part in value.split(',')]
        if len(parts) == 1:
            f.append(Attribute.Person(last=parts[0]))
        elif len(parts) == 2:
            f.append(Attribute.Person(last=parts[0],
                                      first=parts[1]))
        else:
            raise Exceptions.ParserError(_('unsupported author syntax: %s') %
                                         repr(value))

        self.record[field] = f

    def do_unknown(self, tag, ind1, ind2, key, value):
        """Called for a subfield that has no entry in the mapping.

        Raises Exceptions.ParserError; override it to be more lenient.
        """
        raise Exceptions.ParserError(_('unknown field %s%s%s $%s') % (
            tag, ind1, ind2, key))

    def do_default(self, tag, ind1, ind2, values):
        """Store every subfield of a data field according to the mapping."""
        for key, value in values:
            # Only the lookup is guarded, so that a KeyError raised while
            # storing the value is not mistaken for an unknown field.
            try:
                field, fn = self._mapping[(tag, ind1, ind2, key)]
            except KeyError:
                self.do_unknown(tag, ind1, ind2, key, value)
                continue

            fn(field, value)

    def do_control(self, field, value):
        """Store a control field when its tag is mapped, ignore it otherwise."""
        # Only the lookup is guarded: errors raised while storing the value
        # must not be silently swallowed.
        try:
            field, fn = self._mapping[field]
        except KeyError:
            return

        fn(field, value)
229
class Writer(object):
    """Base class producing a MARC XML collection.

    ``write`` emits the ``<collection>`` envelope and hands each record to
    ``record_parse``, which does nothing in this class.  A subclass
    presumably overrides ``record_parse`` and, for every record, calls
    ``begin``, then ``control_add`` / ``add``, then ``end``.
    """

    # The official channel in which messages must be sent
    log = logging.getLogger('pyblio.export.xmlmarc')

    # Field code as passed to add(): the tag followed by the two indicators
    _re_marc = re.compile(r'(\d{3,})(\w)(\w)')

    @staticmethod
    def _utf8(text):
        """Return ``text`` as a native string.

        Python 2 unicode objects are encoded in UTF-8; values that already
        are native strings are returned untouched.
        """
        if isinstance(text, str):
            return text
        return text.encode('utf-8')

    def begin(self):
        """Open a new record and reset the pending fields."""
        self.fd.write(' <record>\n')
        self._fields = {}
        self._control = {}

    def end(self):
        """Write the pending control and data fields and close the record.

        Fields are written sorted by code.  Raises SyntaxError when a data
        field code does not look like a tag followed by two indicators.
        """
        write = self.fd.write

        for code in sorted(self._control):
            data = self._control[code]
            if not data:
                continue

            # The value is character data: it must be escaped just like
            # the subfield values below.
            write(' <controlfield tag="%s">%s</controlfield>\n' % (
                code, escape(self._utf8(data))))

        for code in sorted(self._fields):
            match = self._re_marc.match(code)

            if match is None:
                raise SyntaxError('invalid MARC code: %s' % repr(code))

            tag, ind1, ind2 = match.groups()

            # '_' stands for an empty indicator
            if ind1 == '_':
                ind1 = ''
            if ind2 == '_':
                ind2 = ''

            for kval in self._fields[code]:
                write(' <datafield tag="%s" ind1="%s" ind2="%s">\n' % (
                    tag, ind1, ind2))

                for sub, values in kval.items():
                    for value in values:
                        if not value:
                            continue

                        write(' <subfield code="%s">%s</subfield>\n' % (
                            sub, escape(self._utf8(value))))

                write(' </datafield>\n')

        write(' </record>\n')

    def single(self, rec, field):
        """Return the first value of ``field`` in ``rec``, or None.

        None is returned when the field is missing or holds no value.
        """
        values = rec.get(field)
        if not values:
            return None
        return values[0]

    def add(self, code, **kval):
        """Queue one occurrence of the data field ``code``.

        Each keyword is a subfield code and its value is either a single
        value or a list/tuple of values.  Empty values are dropped, and the
        whole occurrence is dropped when nothing is left.  A leading
        underscore in a keyword is removed (presumably to allow subfield
        codes that are not valid Python identifiers, such as digits).
        """
        # Build a fresh dictionary instead of altering kval while it is
        # being iterated over.
        subfields = {}

        for key, value in kval.items():
            if not isinstance(value, (list, tuple)):
                value = [value]

            # Cleanup empty data
            value = [x for x in value if x]
            if not value:
                continue

            if key.startswith('_'):
                key = key[1:]

            subfields[key] = value

        if not subfields:
            return

        self._fields.setdefault(code, []).append(subfields)

    def control_add(self, code, val):
        """Set control field ``code`` (stored as a 3-digit tag) to ``val``."""
        self._control["%03d" % int(code)] = val

    def record_parse(self, record):
        """Hook called by ``write`` for every record; does nothing here."""
        pass

    def write(self, fd, rs, db):
        """Write the records of the result set ``rs`` to ``fd``.

        ``fd`` and ``db`` are kept in ``self.fd`` and ``self.db`` for use by
        the other methods.
        """
        self.fd = fd
        self.db = db

        fd.write('<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n')

        for r in rs.itervalues():
            self.record_parse(r)

        fd.write('</collection>\n')
348