Package nltk_lite :: Package contrib :: Module paradigmquery
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.paradigmquery

# Natural Language Toolkit: Paradigm Visualisation
#
# Copyright (C) 2005 University of Melbourne
# Author: Will Hardy
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT

# Parses a paradigm query and produces an XML representation of
# that query. This is part of a Python implementation of David
# Penton's paradigm visualisation model.

# This is the query XML version of "table(person, number, content)"
#
#<?xml version="1.0"?>
#<document>
#  <parse-tree>
#    <operator opcode="table" instruction="1">
#      <operand type="domain"
#        arg="horizontal">person</operand>
#      <operand type="domain"
#        arg="vertical">number</operand>
#      <operand type="domain"
#        arg="cell">content</operand>
#    </operator>
#  </parse-tree>
#</document>

 28  from nltk_lite import tokenize 
 29  from nltk_lite import parse 
 30  from nltk_lite.parse import cfg 
 31  from re import * 
 32   
class ParadigmQuery(object):
    """
    Class to read and parse a paradigm visualisation query

    After a successful parse the instance holds:
      string    -- the query text that was parsed
      parseList -- the first parse returned by the recursive descent parser
      nltktree  -- the tree built from the cleaned-up parse string
      parseTree -- a QuerySentence wrapping nltktree
      xml       -- the XML fragment produced by parseTree.toXML()
    """

    def __init__(self, p_string=None):
        """
        Construct a query.
        Setup various attributes and parse given string

        p_string -- optional query text; when given it is parsed immediately
        """
        self.string = p_string
        self.parseList = None
        self.nltktree = None
        # The original constructor also initialised this differently-cased
        # attribute; it is never assigned again but is kept for any external
        # code that reads it.
        self.nltkTree = None
        self.parseTree = None
        self.xml = None

        # If p_string was given, parse it
        if p_string is not None:
            self.parse(p_string)

    def parse(self, p_string):
        """
        Parses a string and stores the resulting hierarchy of "domains"
        "hierarchies" and "tables"

        For the sake of NLP I've parsed the string using the nltk_lite
        context free grammar library.

        A query is a "sentence" and can either be a domain, hierarchy or a table.
        A domain is simply a word.
        A hierarchy is expressed as "domain/domain"
        A table is expressed as "table(sentence, sentence, sentence)"

        Internally the query is represented as a nltk_lite.parse.tree

        Process:
          1. string is tokenized
          2. develop a context free grammar
          3. parse
          4. convert to a tree representation

        If the query cannot be parsed, "Could not parse query." is printed and
        the result attributes are left as None.
        """
        # Forget the results of any earlier parse, so that a failure below
        # cannot leave the previous query's tree or XML behind.
        self.parseList = None
        self.nltktree = None
        self.parseTree = None
        self.xml = None

        # Store the query string
        self.string = p_string

        # Tokenize the query string, allowing only strings, parentheses,
        # forward slashes and commas.
        # The tokens are materialised into a list straight away: they are
        # needed twice (grammar construction and parsing), and the original
        # code had to tokenize a second time because the first result was
        # used up after one pass.
        re_all = r'table[(]|\,|[)]|[/]|\w+'
        toklist = list(tokenize.regexp(self.string, re_all))

        # Develop a context free grammar
        # O = operator (sentence), T = table, H = hierarchy, D = domain
        O, T, H, D = cfg.nonterminals('O, T, H, D')

        # Specify the grammar
        productions = [
            # A sentence can be either a table, hierarchy or domain
            cfg.Production(O, [D]),
            cfg.Production(O, [H]),
            cfg.Production(O, [T]),

            # A table must be the following sequence:
            # "table(", sentence, comma, sentence, comma, sentence, ")"
            cfg.Production(T, ['table(', O, ',', O, ',', O, ')']),

            # A hierarchy must be the following sequence:
            # domain, forward slash, domain
            cfg.Production(H, [D, '/', D]),
            # domain, forward slash, another operator
            cfg.Production(H, [D, '/', O]),
        ]

        # Add domains to the cfg productions
        # A domain is a token that is entirely word chars
        re_domain = compile(r'^\w+$')
        # Each distinct word token gets exactly one production; a token that
        # occurs several times in the query must not add duplicates.
        known_domains = set()
        for tok in toklist:
            if tok in known_domains or not re_domain.match(tok):
                continue
            known_domains.add(tok)
            productions.append(cfg.Production(D, [tok]))

        # Make a grammar out of our productions
        grammar = cfg.Grammar(O, tuple(productions))
        rd_parser = parse.RecursiveDescent(grammar)

        # Store the parsing.
        # Only the first one is used.
        try:
            self.parseList = rd_parser.get_parse_list(toklist)[0]
        except IndexError:
            print("Could not parse query.")
            return

        # Strip the punctuation tokens and quoting from the printed parse so
        # that what remains can be read back in as a bracketed tree.
        # The order of the replacements matters.
        cleaned = str(self.parseList)
        for unwanted in (":", "')'", "table(", "','", "'", "/"):
            cleaned = cleaned.replace(unwanted, "")
        self.nltktree = parse.tree.bracket_parse(cleaned)

        # Store the resulting nltk_lite.parse.tree tree
        self.parseTree = QuerySentence(self.nltktree)
        self.xml = self.parseTree.toXML()

    def getTree(self):
        """
        Returns the results from the CFG parsing

        Returns None (after printing a hint) if no string has been parsed,
        and None if the last parse failed.
        """
        if self.string is None:
            print("No string has been parsed. Please use parse(string).")
            return None
        return self.nltktree

    def getXML(self):
        """
        Returns the query as a complete XML document string

        Returns None (after printing a message) if no string has been parsed
        or if the last parse failed.
        """
        if self.string is None:
            print("No string has been parsed. Please use parse(string).")
            return None
        # A string was supplied but parsing did not produce any XML.
        if self.xml is None:
            print("Could not parse query.")
            return None
        return '<?xml version="1.0"?>\n<document><parse-tree>' + self.xml \
               + "</parse-tree></document>"


# Additional classes for handling the various types of recursive operations

class QuerySentence(object):
    """
    Handles the XML export of sentences

    Wraps one node of the query tree, works out what kind of node it holds
    and delegates to QueryDomain, QueryHierarchy, QueryTable or another
    QuerySentence. The attributes of the chosen child (content, root/leaf,
    horizontal/vertical) are copied onto this object, together with its type.
    """

    # Text types, spelled so the tuple is valid on both Python 2 and Python 3.
    _TEXT_TYPES = (type(""), type(u""))

    def __init__(self, tree):
        """
        tree -- a query tree node; tree[0] is either a subtree or, for a
                domain node, the domain name itself
        """
        self.tree = tree
        first = tree[0]

        if isinstance(first, self._TEXT_TYPES):
            # The first child is a plain word, so `tree` is itself a domain
            # node and must be handed to QueryDomain whole. Passing the word
            # instead would make QueryDomain keep only its first character,
            # and looking at the word's second character to guess a node
            # kind would misclassify names such as "NOUN".
            self.child = QueryDomain(tree)
            self.content = self.child.content
        else:
            # The kind is read from the second character of the subtree's
            # string form, which this code relies on being the first letter
            # of the node label.
            label = str(first)[1:2]
            # Move on, nothing to see here
            if label == "O":
                self.child = QuerySentence(first)
                self.content = self.child.content
            # Get the child and replicate the data
            elif label == "D":
                self.child = QueryDomain(first)
                self.content = self.child.content
            elif label == "H":
                self.child = QueryHierarchy(first)
                self.root = self.child.root
                self.leaf = self.child.leaf
            elif label == "T":
                self.child = QueryTable(first)
                self.horizontal = self.child.horizontal
                self.vertical = self.child.vertical
                self.content = self.child.content
            # Otherwise, must simply be a domain...
            else:
                self.child = QueryDomain(first)
                self.content = self.child.content
        self.type = self.child.type

    def __str__(self):
        """
        Returns the string form of the first child of the wrapped node
        """
        return str(self.tree[0])

    def toXML(self):
        """
        Export this class to an xml string
        """
        return self.child.toXML()
class QueryDomain(object):
    """
    Handles the XML export of the domain operation
    """

    def __init__(self, tree):
        """
        tree -- a domain node; its first element is stored as the content
        """
        self.type = 'domain'
        self.content = tree[0]

    def __str__(self):
        """
        Returns the domain content as a string

        The original returned `tree[0]`, but `tree` is not defined inside
        this method, so calling str() on a domain raised NameError.
        """
        return str(self.content)

    def toXML(self):
        """
        Export this class to an xml string
        """
        return self.content
class QueryHierarchy(object):
    """
    Handles the XML export of the hierarchy operation
    """

    def __init__(self, tree):
        """
        tree -- a hierarchy node whose first child is the root domain and
                whose second child is the leaf
        """
        self.type = 'hierarchy'
        # Kept so that __str__ has something to print.
        self.tree = tree
        # First argument must be a Domain
        self.root = QueryDomain(tree[0])
        # Second argument can conceivably be anything
        self.leaf = QuerySentence(tree[1])

    def __str__(self):
        """
        Returns the string form of the first child of the hierarchy node

        The original returned `tree[0]`, but `tree` is not defined inside
        this method, so calling str() on a hierarchy raised NameError. The
        node is now stored by the constructor and the same element is
        returned, converted to a string.
        """
        return str(self.tree[0])

    def toXML(self):
        """
        Export this class to an xml string
        """
        pieces = [
            '<operator opcode="hierarchy">',
            '<operand type="', self.root.type, '" arg="root">',
            self.root.toXML(), "</operand>",
            '<operand type="', self.leaf.type, '" arg="leaf">',
            self.leaf.toXML(), "</operand>",
            '</operator>',
        ]
        return "".join(pieces)
class QueryTable(object):
    """
    Handles the XML export of the table operation
    """

    def __init__(self, tree):
        """
        Simply stores attributes, passing off handling of attributes to the
        QuerySentence class

        tree -- a table node whose three children are, in order, the
                horizontal, vertical and cell sentences
        """
        self.type = 'table'
        # Kept so that __str__ has something to print.
        self.tree = tree
        self.horizontal = QuerySentence(tree[0])
        self.vertical = QuerySentence(tree[1])
        self.content = QuerySentence(tree[2])

    def __str__(self):
        """
        Returns the string form of the first child of the table node

        The original returned `tree[0]`, but `tree` is not defined inside
        this method, so calling str() on a table raised NameError. The node
        is now stored by the constructor and the same element is returned,
        converted to a string.
        """
        return str(self.tree[0])

    def toXML(self):
        """
        Export this class to an xml string
        """
        pieces = [
            '<operator opcode="table">',
            '<operand type="', self.horizontal.type, '" arg="horizontal">',
            self.horizontal.toXML(), "</operand>",
            '<operand type="', self.vertical.type, '" arg="vertical">',
            self.vertical.toXML(), "</operand>",
            '<operand type="', self.content.type, '" arg="cell">',
            self.content.toXML(), "</operand>",
            '</operator>',
        ]
        return "".join(pieces)
def demo():
    """
    A demonstration of the use of this class

    Parses a sample query and prints the query, its tree and its XML
    representation. Every print uses the single-argument call form, which
    produces the same output on Python 2 and also runs on Python 3.
    """
    query = r'table(one/two/three, four, five)'

    # Print the query
    print("""
================================================================================
Query: ParadigmQuery(query)
================================================================================
""")
    a = ParadigmQuery(query)
    print(query)

    # Print the Tree representation
    print("""
================================================================================
Tree: getTree()
O is an operator
T is a table
H is a hierarchy
D is a domain
================================================================================
""")
    print(a.getTree())

    # Print the XML representation
    print("""
================================================================================
XML: getXML()
================================================================================
""")
    print(a.getXML())

    # Some space
    print("")


# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()