Package nltk_lite :: Package contrib :: Package toolbox :: Module data
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.toolbox.data

  1  #!/usr/bin/env python 
  2  # -*- coding: utf8 -*- 
  3   
  4  # Natural Language Toolkit: Toolbox data file parser 
  5  # 
  6  # Copyright (C) 2001-2006 University of Pennsylvania 
  7  # Author: Greg Aumann <greg_aumann@sil.org> 
  8  # URL: <http://nltk.sf.net> 
  9  # For license information, see LICENSE.TXT 
 10   
 11  """module for reading Toolbox data files 
 12  """ 
 13   
 14  from nltk_lite.etree.ElementTree import Element, SubElement, TreeBuilder 
 15  from nltk_lite.corpora import toolbox 
 16  import re 
 17   
class ToolboxData(toolbox.ToolboxData):
    """
    Toolbox data file reader that can build structured element trees.

    Adds two ways of grouping the fields of a toolbox file into nested
    elements: L{chunk_parse}, driven by a chunk grammar, and
    L{grammar_parse}, driven by a dictionary of rewrite rules.
    """

    def __init__(self):
        # Name this class (not the base class) in super() so that the base
        # class's own __init__ is the one that runs.
        super(ToolboxData, self).__init__()

    def _tree2etree(self, parent, no_blanks):
        """
        Convert a parse tree into an element tree, recursively.

        @type parent: Tree
        @param parent: tree whose leaves are C{(text, tag)} pairs
        @type no_blanks: boolean
        @param no_blanks: if true, leaves whose text is empty are dropped
        @rtype: ElementTree._ElementInterface
        @return: element named after C{parent.node} holding the converted
            children
        """
        from nltk_lite.parse import Tree

        root = Element(parent.node)
        for child in parent:
            if isinstance(child, Tree):
                root.append(self._tree2etree(child, no_blanks))
                continue
            text, tag = child
            # Same blank-field test as grammar_parse.
            if not no_blanks or text:
                SubElement(root, tag).text = text
        return root

    def chunk_parse(self, grammar, no_blanks=True, incomplete='record', **kwargs):
        """
        Returns an element tree structure corresponding to a toolbox data file
        parsed according to the chunk grammar.

        @type grammar: string
        @param grammar: Contains the chunking rules used to parse the
            database.  See L{chunk.RegExp} for documentation.
        @type no_blanks: boolean
        @param no_blanks: blank fields that are not important to the structure
            are deleted
        @type incomplete: string
        @param incomplete: name of element used if parse doesn't result in one
            toplevel element
        @type kwargs: keyword arguments dictionary
        @param kwargs: Keyword arguments passed to
            L{toolbox.StandardFormat.fields()}
        @rtype: ElementTree._ElementInterface
        @return: Contents of toolbox data parsed according to the rules in
            grammar
        """
        from nltk_lite import chunk
        from nltk_lite.parse import Tree

        cp = chunk.Regexp(grammar)
        db = self.parse(**kwargs)
        tb_etree = Element('toolbox_data')
        header = db.find('header')
        # find() yields None when there is no header element; do not append
        # None to the result tree.
        if header is not None:
            tb_etree.append(header)
        for record in db.findall('record'):
            parsed = cp.parse([(elem.text, elem.tag) for elem in record])
            # Check the length before indexing so that an empty parse is
            # reported as incomplete instead of raising IndexError.
            if len(parsed) == 1 and isinstance(parsed[0], Tree):
                top = parsed[0]
            else:
                # didn't get a full parse
                parsed.node = incomplete
                top = parsed
            tb_etree.append(self._tree2etree(top, no_blanks))
        return tb_etree

    def _make_parse_table(self, grammar):
        """
        Return parsing state information used by grammar_parse.

        @type grammar: dictionary of tuple of tuples
        @param grammar: rewrite rules as described in L{grammar_parse}
        @rtype: tuple
        @return: C{(parse_table, first)}.  C{parse_table[state][marker]} is
            the symbol to move to when C{marker} is seen in C{state}: the
            marker itself for a terminal, or the nonterminal that the marker
            starts.  C{first[sym]} is the first part of the rule for C{sym}.
        """
        first = dict()
        for sym, value in grammar.items():
            first[sym] = value[0]

        parse_table = dict()
        for state, value in grammar.items():
            transitions = dict()
            for to_sym in value[0] + value[1]:
                if to_sym in grammar:
                    # is a nonterminal
                    # assume all firsts are terminals
                    for start_mkr in first[to_sym]:
                        transitions[start_mkr] = to_sym
                else:
                    transitions[to_sym] = to_sym
            parse_table[state] = transitions
        return (parse_table, first)

    def grammar_parse(self, startsym, grammar, no_blanks=True, **kwargs):
        """
        Returns an element tree structure corresponding to a toolbox data file
        parsed according to the grammar.

        @type startsym: string
        @param startsym: Start symbol used for the grammar
        @type grammar: dictionary of tuple of tuples
        @param grammar: Contains the set of rewrite rules used to parse the
            database.  See the description below.
        @type no_blanks: boolean
        @param no_blanks: blank fields that are not important to the structure
            are deleted
        @type kwargs: keyword arguments dictionary
        @param kwargs: Keyword arguments passed to
            L{toolbox.StandardFormat.fields()}
        @rtype: ElementTree._ElementInterface
        @return: Contents of toolbox data parsed according to rules in grammar
        @raise ValueError: if a marker cannot be accepted by the start symbol
            or by any nonterminal still open when it is read

        The rewrite rules in the grammar look similar to those usually used in
        computer languages. The difference is that the ordering constraints
        that are usually present are relaxed in this parser. The reason is that
        toolbox databases seldom have consistent ordering of fields. Hence the
        right side of each rule consists of a tuple with two parts. The
        fields in the first part mark the start of nonterminal.
        Each of them can occur only once and all those must
        occur before any of the fields in the second part of that nonterminal.
        Otherwise they are interpreted as marking the start
        of another one of the same nonterminal. If there is more than one
        in the first part of the tuple they do not need to all appear in a parse.
        The fields in the second part of the tuple can occur in any order.

        Sample grammar::

            grammar = {
                'toolbox':  (('_sh',), ('_DateStampHasFourDigitYear', 'entry')),
                'entry':    (('lx',), ('hm', 'sense', 'dt')),
                'sense':    (('sn', 'ps'), ('pn', 'gv', 'dv',
                                            'gn', 'gp', 'dn', 'rn',
                                            'ge', 'de', 're',
                                            'example', 'lexfunc')),
                'example':  (('rf', 'xv',), ('xn', 'xe')),
                'lexfunc':  (('lf',), ('lexvalue',)),
                'lexvalue': (('lv',), ('ln', 'le')),
            }
        """
        parse_table, first = self._make_parse_table(grammar)
        builder = TreeBuilder()
        # Stack of the nonterminals currently open.  Each entry pairs the
        # symbol with the list of its first-part markers seen so far in the
        # current instance of that nonterminal.
        pstack = [(startsym, list())]
        builder.start(startsym, {})
        for mkr, value in self.fields(**kwargs):
            # The same field is examined against one state after another
            # until some state accepts it (break) or no state is left (raise).
            while True:
                state, first_elems = pstack[-1]
                transitions = parse_table[state]
                if mkr in transitions:
                    next_state = transitions[mkr]
                    if next_state != mkr:
                        # a non terminal, first instance: open it and look at
                        # this field again from inside it
                        builder.start(next_state, dict())
                        pstack.append((next_state, list()))
                        continue
                    is_start_mkr = mkr in first[state]
                    if not is_start_mkr or mkr not in first_elems:
                        # an ordinary field, or a start marker not yet seen
                        # in this instance: it belongs to the current state
                        if is_start_mkr:
                            first_elems.append(mkr)
                        if not no_blanks or value:
                            builder.start(mkr, dict())
                            builder.data(value)
                            builder.end(mkr)
                        break
                    # Otherwise a start marker has been repeated: it begins a
                    # second or subsequent instance, so close this one below.
                if len(pstack) == 1:
                    raise ValueError(
                        'Line %d: syntax error, unexpected marker %s.'
                        % (self.line_num, mkr))
                builder.end(state)
                pstack.pop()
        # Close whatever is still open, innermost first.
        for state, first_elems in reversed(pstack):
            builder.end(state)
        return builder.close()
198
def indent(elem, level=0):
    """
    Recursive function to indent an ElementTree._ElementInterface
    used for pretty printing. Based on code from
    U{http://www.effbot.org/zone/element-lib.htm}. To use run indent
    on elem and then output in the normal way.

    Only C{text} and C{tail} values that are empty or whitespace-only are
    overwritten. The element is modified in place and nothing is returned.

    @param elem: element to be indented. will be modified.
    @type elem: ElementTree._ElementInterface
    @param level: level of indentation for this element
    @type level: nonnegative integer
    @rtype: None
    """
    i = "\n" + level*" "
    if len(elem):
        if not elem.text or not elem.text.strip():
            # Put the first child on its own line, one level deeper.
            elem.text = i + " "
        # A nested element that has children needs its own tail set as well,
        # otherwise the sibling that follows it starts on the same line as
        # its closing tag.  The top-level element's tail is left alone, as in
        # the childless branch below.
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = i
        for child in elem:
            indent(child, level+1)
        # The last child is followed by this element's closing tag, which
        # sits at this element's own level.
        if not child.tail or not child.tail.strip():
            child.tail = i
    else:
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = i
224
def to_sfm_string(tree, encoding=None, errors='strict', unicode_fields=None):
    """Return a string with a standard format representation of the toolbox
    data in tree (tree can be a toolbox database or a single record). Should
    work for trees parsed by grammar_parse too.

    @param tree: flat representation of toolbox data (whole database or
        single record)
    @type tree: ElementTree._ElementInterface
    @param encoding: Name of an encoding to use. If it is not None the
        result is encoded with it before being returned.
    @type encoding: string
    @param errors: Error handling scheme for codec. Same as the C{encode}
        inbuilt string method.
    @type errors: string
    @param unicode_fields: does nothing as yet
    @type unicode_fields: string
    @rtype: string
    @return: string using standard format markup
    """
    parts = list()
    _to_sfm_string(tree, parts, encoding=encoding, errors=errors,
                   unicode_fields=unicode_fields)
    s = ''.join(parts)
    if encoding is not None:
        s = s.encode(encoding, errors)
    return s


# Matches when a field value contains at least one non-whitespace character.
_is_value = re.compile(r"\S")


def _to_sfm_string(node, l, **kwargs):
    """
    Append the standard format lines for node to the list l.

    An element without children becomes one line: a backslash, the tag and
    the text, with a separating space only when the text has some
    non-whitespace content. An element with children contributes only the
    lines of its descendants; its own tag is not written.

    @param node: element to convert
    @type node: ElementTree._ElementInterface
    @param l: list that the generated lines are appended to
    @type l: list of strings
    @param kwargs: accepted and passed down the recursion; not otherwise used
    """
    if len(node) == 0:
        text = node.text
        if text is None:
            # An element that never had text assigned is an empty field.
            text = ''
        if _is_value.search(text):
            l.append('\\%s %s\n' % (node.tag, text))
        else:
            l.append('\\%s%s\n' % (node.tag, text))
    else:
        for child in node:
            _to_sfm_string(child, l, **kwargs)
267
def demo_flat():
    """
    Parse the sample corpus C{iu_mien_samp.db} with C{lx} as the key field
    and utf8 encoding, and write the resulting element tree to standard
    output.
    """
    import sys
    from nltk_lite.etree.ElementTree import ElementTree

    parsed = toolbox.parse_corpus('iu_mien_samp.db', key='lx', encoding='utf8')
    ElementTree(parsed).write(sys.stdout)


# Run the demonstration only when this module is executed as a script.
if __name__ == '__main__':
    demo_flat()