Package nltk_lite :: Package contrib :: Package toolbox :: Module settings
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.toolbox.settings

  1  #!/usr/bin/env python 
  2  # -*- coding: utf8 -*- 
  3   
  4  # Natural Language Toolkit: Toolbox Settings Parser 
  5  # 
  6  # Copyright (C) 2001-2006 University of Pennsylvania 
  7  # Author: Greg Aumann <greg_aumann@sil.org>/Stuart Robinson <stuart@zapata.org> 
  8  # URL: <http://nltk.sf.net> 
  9  # For license information, see LICENSE.TXT 
 10   
 11  """ 
 12  This module provides functionality for reading settings files for Toolbox.  
 13  Settings files provide information (metadata) concerning lexicons and texts,  
 14  such as which fields are found within them and what kind of values those  
 15  fields can have. 
 16  """ 
 17   
 18  from nltk_lite.etree.ElementTree import TreeBuilder 
 19  from nltk_lite.corpora.toolbox import StandardFormat 
 20  #from nltk_lite.parse.tree import Tree 
 21   
class ToolboxSettings(StandardFormat):
    """Base class for readers of Toolbox settings files."""

    def __init__(self):
        super(ToolboxSettings, self).__init__()

    def parse(self, encoding=None, errors='strict', **kwargs):
        """Build a nested ElementTree structure from a settings file.

        Every field yielded by C{self.fields()} is inspected: a marker that
        starts with C{+} opens a block, a marker that starts with C{-}
        closes one, and any other marker becomes a leaf element holding the
        field value.

        @param encoding: encoding used by settings file
        @type encoding: string
        @param errors: Error handling scheme for codec. Same as C{.decode} inbuilt method.
        @type errors: string
        @param kwargs: Keyword arguments passed to L{StandardFormat.fields()}
        @type kwargs: keyword arguments dictionary
        @rtype: ElementTree._ElementInterface
        @return: contents of toolbox settings file with a nested structure
        """
        tree_builder = TreeBuilder()
        for marker, text in self.fields(encoding=encoding, errors=errors, **kwargs):
            prefix = marker[0]
            if prefix == '-':
                # Block end: only the marker name matters, the value is unused.
                tree_builder.end(marker[1:])
                continue

            opens_block = (prefix == "+")
            if opens_block:
                tag = marker[1:]
            else:
                tag = marker

            tree_builder.start(tag, {})
            tree_builder.data(text)
            if not opens_block:
                # Ordinary field: a leaf element, closed straight away.
                tree_builder.end(tag)
        return tree_builder.close()
60
def to_settings_string(tree, encoding=None, errors='strict', unicode_fields=None):
    """Serialise a settings tree back into Toolbox standard-format text.

    @param tree: settings tree; its C{getroot()} result is serialised
    @param encoding: forwarded to L{_to_settings_string}
    @param errors: forwarded to L{_to_settings_string}
    @param unicode_fields: forwarded to L{_to_settings_string}
    @rtype: string
    @return: the concatenated standard-format lines for the whole tree
    """
    pieces = []
    options = dict(encoding=encoding, errors=errors, unicode_fields=unicode_fields)
    _to_settings_string(tree.getroot(), pieces, **options)
    return ''.join(pieces)
66
def _to_settings_string(node, l, **kwargs):
    """Append the standard-format lines for C{node} and its subtree to C{l}.

    An element without children is written as a single C{\\marker value}
    line.  An element with children is written as a blank line, a
    C{\\+marker value} line, the lines of each child in order, and a
    closing C{\\-marker} line.  The value is omitted when the element
    text is empty or C{None}.

    @param node: element to serialise (needs C{tag}, C{text}, C{len()} and iteration)
    @param l: list that receives the generated strings (modified in place)
    @param kwargs: passed unchanged to the recursive calls; not otherwise used
    """
    marker = node.tag
    value = node.text
    is_block = len(node) != 0

    if is_block:
        # Blocks are set off from the preceding field by an empty line.
        l.append('\n')
        prefix = '\\+'
    else:
        prefix = '\\'

    if value:
        l.append('%s%s %s\n' % (prefix, marker, value))
    else:
        l.append('%s%s\n' % (prefix, marker))

    if is_block:
        for child in node:
            _to_settings_string(child, l, **kwargs)
        l.append('\\-%s\n' % marker)
86
class MarkerSet:
    """This class is a container for FieldMetadata objects. A marker set
    contains a list of the fields in a database together with information
    about those fields.

    The raw SFB looks like this::

        \\+mkrset
        \\lngDefault Default
        \\mkrRecord lx

        \\+mkr dt
        \\nam Date Last Edited
        \\lng Default
        \\mkrOverThis lx
        \\-mkr

        \\+mkr lx
        \\nam Rotokas Word
        \\lng Rotokas
        \\-mkr
        \\-mkrset
    """

    def __init__(self):
        # Maps field marker (string) -> FieldMetadata
        self._dict = {}

    def get_markers(self):
        """Obtain a list of all of the field markers for the marker set.

        A new list is returned on every call, so callers may sort or
        otherwise modify it freely.

        @returns: list of field markers
        @rtype: list of strings"""
        # list(...) guarantees a real list even where dict.keys() is a view.
        return list(self._dict.keys())

    def add_field_metadata(self, fmeta):
        """Add FieldMetadata object to dictionary of marker sets, keyed by field marker.

        An existing entry with the same marker is replaced.

        @param fmeta: field metadata to be added to collection for marker set
        @type fmeta: FieldMetadata"""
        self._dict[fmeta.get_marker()] = fmeta

    def get_metadata_by_marker(self, mkr):
        """Obtain a FieldMetadata object for the field marker provided.

        @param mkr: field to obtain metadata for
        @type mkr: string
        @returns: metadata for field type associated with marker
        @rtype: FieldMetadata
        @raise KeyError: if no metadata has been added for C{mkr}"""
        return self._dict[mkr]

    def get_field_marker_hierarchy(self):
        """Build an element tree describing how the field markers nest.

        The root is the marker whose metadata has no parent marker.  If
        several markers have no parent, the last one encountered while
        iterating over L{get_markers()} is used; if there is none, the
        root tag is C{None}.

        @returns: root element of the marker hierarchy
        """
        # Find root field marker
        root = None
        for fm in self.get_markers():
            fmmd = self.get_metadata_by_marker(fm)
            if not fmmd.get_parent_marker():
                root = fm

        # Build tree for field markers
        builder = TreeBuilder()
        builder.start(root, {})
        self.build_tree(root, builder)
        builder.end(root)
        return builder.close()

    def build_tree(self, mkr, builder):
        """Recursively add the descendants of C{mkr} to C{builder}.

        Children are visited in sorted marker order.  For each child that
        has a range set, a C{rangeset} element (one C{value} element per
        entry) is emitted immediately before the child's own element.

        @param mkr: marker whose children should be added
        @type mkr: string
        @param builder: tree builder positioned inside the element for C{mkr}
        @returns: the same C{builder} that was passed in
        """
        # sorted() instead of keys().sort(): works on any iterable of markers.
        for tmpmkr in sorted(self.get_markers()):
            fmmd = self.get_metadata_by_marker(tmpmkr)
            # Only fields that are children of the current field are handled.
            if fmmd.get_parent_marker() != mkr:
                continue

            # Handle rangeset
            rangeset = fmmd.get_rangeset()
            if rangeset:
                builder.start("rangeset", {})
                for rsi in rangeset:
                    builder.start("value", {})
                    builder.data(rsi)
                    builder.end("value")
                builder.end("rangeset")

            # Handle name and description (missing values become "")
            d = {"name": fmmd.get_name() or "",
                 "desc": fmmd.get_description() or ""}
            builder.start(tmpmkr, d)
            self.build_tree(tmpmkr, builder)
            builder.end(tmpmkr)
        return builder
182 183
class FieldMetadata:
    """Container for what is known about one field: its marker, name,
    description, language, range set (valid values), parent marker, and
    the multiword / required flags.

    The raw field metadata looks like this::

        \\+mkr dx
        \\nam Dialect
        \\desc dialects in which lexeme is found
        \\lng Default
        \\rngset Aita Atsilima Central Pipipaia
        \\mkrOverThis lx
        \\-mkr
    """

    def __init__(self, marker=None, name=None, desc=None, lang=None,
                 rangeset=None, multiword=None, required=None,
                 parent_mkr=None):
        # Identity and position in the marker hierarchy
        self._marker = marker
        self._parent_mkr = parent_mkr
        # Human-readable information
        self._name = name
        self._desc = desc
        self._lang = lang
        # Constraints on the field value
        self._rangeset = rangeset
        self._multiword = multiword
        self._required = required

    def get_marker(self):
        """Return the field marker (e.g., 'dx').
        @returns: marker for field
        @rtype: string
        """
        return self._marker

    def get_name(self):
        """Return the field name (e.g., 'Dialect').
        @returns: name of field
        @rtype: string
        """
        return self._name

    def get_description(self):
        """Return the field description (e.g., 'dialects in which lexeme is found').
        @returns: description of field
        @rtype: string
        """
        return self._desc

    def get_language(self):
        """Return the language in which the field is encoded (e.g., 'Default').
        @returns: name of language used for field
        @rtype: string
        """
        return self._lang

    def get_rangeset(self):
        """Return the range set of the field (e.g., ['Aita', 'Atsilima', 'Central', 'Pipipaia']).
        @returns: list of possible values for field
        @rtype: list of strings
        """
        return self._rangeset

    def set_rangeset(self, rangeset):
        """Replace the list of valid values for the field.
        @param rangeset: list of valid values for the field
        @type rangeset: list
        """
        self._rangeset = rangeset

    def get_parent_marker(self):
        """Return the marker of this field's parent (e.g., 'lx').
        @returns: marker for parent field
        @rtype: string
        """
        return self._parent_mkr

    def is_multiword(self):
        """Tell whether the field value may consist of multiple words.
        @returns: whether field values can be multiword
        @rtype: boolean
        """
        return self._multiword

    def requires_value(self):
        """Tell whether the field must have a value.
        @returns: whether field requires a value
        @rtype: boolean
        """
        return self._required
279 280
class LexiconSettings(ToolboxSettings):
    """This class is used to parse and manipulate settings file for
    lexicons."""

    def __init__(self, file):
        """Remember the settings file to read; nothing is parsed yet.

        @param file: settings file, passed to C{open()} by L{parse()}
        """
        self._file = file
        self._markerset = MarkerSet()
        self._tree = None

    def parse(self, encoding=None):
        """Parse a settings file with lexicon metadata.

        Reads the file into C{self._tree} and fills the marker set from
        the C{mkrset/mkr} blocks and the top-level C{rngset} blocks.

        @param encoding: encoding used by the settings file
        @type encoding: string
        @raise KeyError: if a top-level range set names a marker that is
            not part of the marker set
        """
        # The generic reader in this module is ToolboxSettings; no class
        # called Settings is defined or imported here.
        s = ToolboxSettings()
        s.open(self._file)
        try:
            self._tree = s.parse(encoding=encoding)
        finally:
            # Release the file even when parsing fails.
            s.close()

        # Handle metadata for field markers (aka, marker set)
        for mkr in self._tree.findall('mkrset/mkr'):
            rangeset = None
            raw_rangeset = self.__parse_value(mkr, "rngset")
            if raw_rangeset:
                rangeset = raw_rangeset.split()
            fm = FieldMetadata(marker=mkr.text,
                               name=self.__parse_value(mkr, "nam"),
                               desc=self.__parse_value(mkr, "desc"),
                               lang=self.__parse_value(mkr, "lng"),
                               rangeset=rangeset,
                               multiword=self.__parse_boolean(mkr, "MultipleWordItems"),
                               required=self.__parse_boolean(mkr, "MustHaveData"),
                               parent_mkr=self.__parse_value(mkr, "mkrOverThis"))
            self._markerset.add_field_metadata(fm)

        # Handle range sets defined outside of marker set
        # WARNING: Range sets outside the marker set override those inside the
        #          marker set
        for rs in self._tree.findall("rngset"):
            mkr = rs.findtext("mkr")
            fm = self._markerset.get_metadata_by_marker(mkr)
            fm.set_rangeset([d.text for d in rs.findall("dat")])
            self._markerset.add_field_metadata(fm)

    def get_record_marker(self):
        """Return the text of the C{mkrset/mkrRecord} field of the parsed tree.

        L{parse()} must have been called first.
        """
        return self._tree.find('mkrset/mkrRecord').text

    def get_marker_set(self):
        """Return the L{MarkerSet} filled in by L{parse()}."""
        return self._markerset

    def __parse_boolean(self, mkr, name):
        """Return True if C{mkr} has a descendant matching C{name}, else False."""
        # Identity test: an Element without children is falsy, and `== None`
        # is not the idiomatic comparison.
        return mkr.find(name) is not None

    def __parse_value(self, mkr, name):
        """Return the text of the first match for C{name} under C{mkr}, or None if there is none."""
        node = mkr.find(name)
        if node is None:
            return None
        return node.text
338
class InterlinearProcess:
    """A single process used for text interlinearization.

    The object only stores the values handed to the constructor and
    exposes them through accessor methods.
    """

    def __init__(self, from_mkr=None, to_mkr=None, out_mkr=None,
                 gloss_sep=None, fail_mark=None, parse_proc=None,
                 show_fail_mark=None, show_root_guess=None):
        # Markers involved in the process
        self.__from_mkr = from_mkr
        self.__to_mkr = to_mkr
        self.__out_mkr = out_mkr
        # Formatting of results and failures
        self.__gloss_sep = gloss_sep
        self.__fail_mark = fail_mark
        # Flags
        self.__parse_proc = parse_proc
        self.__show_fail_mark = show_fail_mark
        self.__show_root_guess = show_root_guess

    def get_output_marker(self):
        """Return the output marker given to the constructor."""
        return self.__out_mkr

    def get_from_marker(self):
        """Return the marker searched for in the lookup process."""
        return self.__from_mkr

    def get_to_marker(self):
        """Return the marker found in the lookup process."""
        return self.__to_mkr

    def get_gloss_separator(self):
        """Return the gloss separator given to the constructor."""
        return self.__gloss_sep

    def get_failure_marker(self):
        """Return the string used in the case of lookup failure."""
        return self.__fail_mark

    def is_parse_process(self):
        """Tell whether this is a parse process (as opposed to a lookup process)."""
        return self.__parse_proc

    def show_failure_marker(self):
        """Return the show-failure-mark flag given to the constructor."""
        return self.__show_fail_mark

    def show_root_guess(self):
        """Return the show-root-guess flag given to the constructor."""
        return self.__show_root_guess
390 391
class LookupProcess(InterlinearProcess):
    """Interlinear process of the lookup kind; adds nothing to L{InterlinearProcess}."""
394 395
class ParseProcess(InterlinearProcess):
    """Interlinear process of the parse kind; adds nothing to L{InterlinearProcess}."""
398 399
class TextSettings(ToolboxSettings):
    """This class is used to parse and manipulate settings file for
    texts (interlinear process list plus marker set)."""

    def __init__(self, file):
        """Remember the settings file to read; nothing is parsed yet.

        @param file: settings file, passed to C{open()} by L{parse()}
        """
        self._file = file
        self._markerset = MarkerSet()
        self._tree = None

    def parse(self, encoding=None):
        """Parse a settings file with text metadata.

        Reads the file into C{self._tree}, prints a report for every
        interlinear process found under C{intprclst/intprc}, and fills the
        marker set from the C{mkrset/mkr} and top-level C{rngset} blocks.

        @param encoding: encoding used by the settings file
        @type encoding: string
        @raise KeyError: if a top-level range set names a marker that is
            not part of the marker set
        """
        # The generic reader in this module is ToolboxSettings; no class
        # called Settings is defined or imported here.
        s = ToolboxSettings()
        s.open(self._file)
        try:
            self._tree = s.parse(encoding=encoding)
        finally:
            # Release the file even when parsing fails.
            s.close()

        # Handle interlinear process list
        for proc in self._tree.findall("intprclst/intprc"):
            self.__report_process(proc)

        self.__load_marker_set()

    def __report_process(self, proc):
        """Print a report describing one C{intprc} element."""
        parse_process = self.__parse_boolean(proc, "bParseProc")
        to_mkr = self.__parse_value(proc, "mkrTo")
        if to_mkr is not None:
            # Guarded: a missing mkrTo field used to crash on None.strip().
            to_mkr = to_mkr.strip()

        # Pick the subclass that matches the bParseProc flag.
        if parse_process:
            process_class = ParseProcess
        else:
            process_class = LookupProcess
        ip = process_class(from_mkr=self.__parse_value(proc, "mkrFrom"),
                           to_mkr=to_mkr,
                           gloss_sep=self.__parse_value(proc, "GlossSeparator"),
                           fail_mark=self.__parse_value(proc, "FailMark"),
                           parse_proc=parse_process,
                           show_fail_mark=self.__parse_boolean(proc, "bShowFailMark"),
                           show_root_guess=self.__parse_boolean(proc, "bShowRootGuess"),
                           out_mkr=self.__parse_value(proc, "mkrOut"))

        print("----- Interlinear Process -----")
        print(" FROM: [%s]" % ip.get_from_marker())
        print(" TO: [%s]" % ip.get_to_marker())
        print(" GLOSS SEP: [%s]" % ip.get_gloss_separator())
        print(" FAIL MARK: [%s]" % ip.get_failure_marker())
        print(" SHOW FAIL MARK: [%s]" % ip.show_failure_marker())
        print(" SHOW ROOT GUESS: [%s]" % ip.show_root_guess())
        print(" PARSE PROCESS: [%s]" % ip.is_parse_process())

        self.__report_lookup(proc.find("triLook"), "trilook", False)
        self.__report_lookup(proc.find("triPref"), "tripref", True)
        self.__report_lookup(proc.find("triRoot"), "triroot", True)

        print("")

    def __report_lookup(self, node, label, with_lists):
        """Print the report section for one triLook/triPref/triRoot element.

        @param node: the element, or None when the process has no such child
        @param label: name shown in the section header
        @param with_lists: also list the C{drflst/drf} files and C{mrflst} markers
        """
        # Identity test on purpose: an Element that has no children is
        # falsy, so a plain truth test would skip present-but-empty nodes.
        if node is None:
            return
        print(" -- %s --" % label)
        print(" DB TYPE: [%s]" % self.__parse_value(node, "dbtyp"))
        print(" MKR OUTPUT: [%s]" % self.__parse_value(node, "mkrOut"))
        if not with_lists:
            return
        for d in node.findall("drflst/drf"):
            print(" DB: [%s]" % self.__parse_value(d, "File"))
        mrflst = node.find("mrflst")
        if mrflst is not None:
            for d in mrflst:
                print(" MKR: [%s]" % d.text)

    def __load_marker_set(self):
        """Fill C{self._markerset} from the parsed tree."""
        # Handle metadata for field markers (aka, marker set)
        for mkr in self._tree.findall('mkrset/mkr'):
            rangeset = None
            raw_rangeset = self.__parse_value(mkr, "rngset")
            if raw_rangeset:
                rangeset = raw_rangeset.split()
            fm = FieldMetadata(marker=mkr.text,
                               name=self.__parse_value(mkr, "nam"),
                               desc=self.__parse_value(mkr, "desc"),
                               lang=self.__parse_value(mkr, "lng"),
                               rangeset=rangeset,
                               multiword=self.__parse_boolean(mkr, "MultipleWordItems"),
                               required=self.__parse_boolean(mkr, "MustHaveData"),
                               parent_mkr=self.__parse_value(mkr, "mkrOverThis"))
            self._markerset.add_field_metadata(fm)

        # Handle range sets defined outside of marker set
        # WARNING: Range sets outside the marker set override those inside the
        #          marker set
        for rs in self._tree.findall("rngset"):
            mkr = rs.findtext("mkr")
            fm = self._markerset.get_metadata_by_marker(mkr)
            fm.set_rangeset([d.text for d in rs.findall("dat")])
            self._markerset.add_field_metadata(fm)

    def get_record_marker(self):
        """Return the text of the C{mkrset/mkrRecord} field of the parsed tree.

        L{parse()} must have been called first.
        """
        return self._tree.find('mkrset/mkrRecord').text

    def get_version(self):
        """Return the text of the top-level C{ver} field of the parsed tree."""
        return self._tree.find('ver').text

    def get_description(self):
        """Return the text of the top-level C{desc} field of the parsed tree."""
        return self._tree.find('desc').text

    def get_marker_set(self):
        """Return the L{MarkerSet} filled in by L{parse()}."""
        return self._markerset

    def __parse_boolean(self, mkr, name):
        """Return True if C{mkr} has a descendant matching C{name}, else False."""
        return mkr.find(name) is not None

    def __parse_value(self, mkr, name):
        """Return the text of the first match for C{name} under C{mkr}, or None if there is none."""
        node = mkr.find(name)
        if node is None:
            return None
        return node.text
535
def demo():
    """Parse a sample settings file, print one value from it, write the
    tree to C{test.xml} and print it back in standard format.

    Reads C{demos/MDF_AltH.typ} (decoded as gbk) relative to the current
    directory.
    """
    from nltk_lite.etree.ElementTree import ElementTree

    settings = ToolboxSettings()
    settings.open('demos/MDF_AltH.typ')
    try:
        tree = settings.parse(unwrap=False, encoding='gbk')
    finally:
        # Close the settings file even if parsing fails.
        settings.close()
    print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
    settings_tree = ElementTree(tree)
    settings_tree.write('test.xml')
    print(to_settings_string(settings_tree).encode('gbk'))
# Run the demonstration only when this module is executed as a script.
if __name__ == '__main__':
    demo()