1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import re
from re import *

from nltk_lite import parse
from nltk_lite import tokenize
from nltk_lite.parse import cfg
32
class ParadigmQuery(object):
    """
    Class to read and parse a paradigm visualisation query
    """

    def __init__(self, p_string=None):
        """
        Construct a query.

        Sets every result attribute to None and, when a query string is
        supplied, parses it immediately.

        p_string -- the query text, or None to create an empty query that
                    is filled in later through parse().
        """
        self.nltktree = None
        self.string = p_string
        self.parseList = None
        # Second spelling of the tree attribute; it is initialised here only
        # and kept so that code reading ``nltkTree`` keeps working.
        self.nltkTree = None
        self.parseTree = None
        self.xml = None

        # Only parse when a string was actually given
        if p_string is not None:
            self.parse(p_string)

    def parse(self, p_string):
        """
        Parses a string and stores the resulting hierarchy of "domains"
        "hierarchies" and "tables"

        For the sake of NLP the string is parsed using the nltk_lite
        context free grammar library.

        A query is a "sentence" and can either be a domain, hierarchy or a table.
        A domain is simply a word.
        A hierarchy is expressed as "domain/domain"
        A table is expressed as "table(sentence, sentence, sentence)"

        Internally the query is represented as a nltk_lite.parse.tree

        Process:
          1. string is tokenized
          2. develop a context free grammar
          3. parse
          4. convert to a tree representation

        On success the results are stored on the instance (parseList,
        nltktree, parseTree, xml).  When the string cannot be parsed a
        message is printed, those attributes stay None and the method
        returns None.
        """
        self.string = p_string

        # Drop everything derived from an earlier parse so that a failed
        # parse cannot leave results of a previous query behind.
        self.nltktree = None
        self.parseList = None
        self.parseTree = None
        self.xml = None

        # 1. Tokenize.  The tokens are materialised once because they are
        #    needed twice: to build the grammar and as the parser input.
        re_all = r'table[(]|\,|[)]|[/]|\w+'
        toklist = list(tokenize.regexp(self.string, re_all))

        # 2. Build the grammar: O(perator), T(able), H(ierarchy), D(omain)
        O, T, H, D = cfg.nonterminals('O, T, H, D')

        productions = [
            # A sentence is a domain, a hierarchy or a table
            cfg.Production(O, [D]), cfg.Production(O, [H]), cfg.Production(O, [T]),
            # A table takes three sentences
            cfg.Production(T, ['table(', O, ',', O, ',', O, ')']),
            # A hierarchy is a domain followed by a domain ...
            cfg.Production(H, [D, '/', D]),
            # ... or by any other sentence
            cfg.Production(H, [D, '/', O]),
        ]

        # Every plain word in the query becomes a terminal of D
        re_domain = re.compile(r'^\w+$')
        productions.extend(
            cfg.Production(D, [tok]) for tok in toklist if re_domain.match(tok)
        )

        grammar = cfg.Grammar(O, tuple(productions))
        rd_parser = parse.RecursiveDescent(grammar)

        # 3. Parse; an empty parse list means the query is not in the grammar
        try:
            self.parseList = rd_parser.get_parse_list(toklist)[0]
        except IndexError:
            print("Could not parse query.")
            return

        # 4. Convert to a plain bracketed tree: strip the punctuation
        #    terminals from the printed parse and re-read the result.
        #    The replacements are order dependent; keep the sequence.
        string = str(self.parseList)
        string2 = (string.replace(":", "")
                         .replace("')'", "")
                         .replace("table(", "")
                         .replace("','", "")
                         .replace("'", "")
                         .replace("/", ""))
        self.nltktree = parse.tree.bracket_parse(string2)

        # Wrap the tree in the export classes and render the XML once
        self.parseTree = QuerySentence(self.nltktree)
        self.xml = self.parseTree.toXML()

    def getTree(self):
        """
        Returns the results from the CFG parsing

        Returns None (after printing a hint) when no string has been
        given to the query yet.
        """
        if self.string is None:
            print("No string has been parsed. Please use parse(string).")
            return None
        return self.nltktree

    def getXML(self):
        """
        Returns the parsed query as an XML document string

        Returns None (after printing a message) when no string has been
        given yet or when the last parse did not produce any XML.
        """
        if self.string is None:
            print("No string has been parsed. Please use parse(string).")
            return None
        if self.xml is None:
            # The last parse failed; concatenating None would raise TypeError
            print("No XML is available because the query could not be parsed.")
            return None
        return '<?xml version="1.0"?>\n<document><parse-tree>' + self.xml \
               + "</parse-tree></document>"
156
157
158
159
160
162 """
163 Handles the XML export of sentences
164 """
189
190
193
def toXML(self):
    """
    Export this class to an xml string

    The export is delegated to the wrapped child object.
    """
    return self.child.toXML()
199
200
class QueryDomain(object):
    """
    Handles the XML export of the domain operation
    """

    def __init__(self, tree):
        """
        Store the domain taken from the first element of tree
        """
        self.type = 'domain'
        self.content = tree[0]

    def toXML(self):
        """
        Export this class to an xml string
        """
        return self.content
217
218
220 """
221 Handles the XML export of the hierarchy operation
222 """
229
232
def toXML(self):
    """
    Export this class to an xml string

    Emits a "hierarchy" operator with two operands, root and leaf; each
    operand carries the type of the wrapped object and its own XML.
    """
    return ''.join([
        '<operator opcode="hierarchy">',
        '<operand type="', self.root.type, '" arg="root">',
        self.root.toXML(), "</operand>",
        '<operand type="', self.leaf.type, '" arg="leaf">',
        self.leaf.toXML(), "</operand>",
        '</operator>',
    ])
243
244
246 """
247 Handles the XML export of the table operation
248 """
258
261
def toXML(self):
    """
    Export this class to an xml string

    Emits a "table" operator with three operands: horizontal, vertical
    and cell.  Each operand carries the type of the wrapped object and
    its own XML.
    """
    return ''.join([
        '<operator opcode="table">',
        '<operand type="', self.horizontal.type, '" arg="horizontal">',
        self.horizontal.toXML(), "</operand>",
        '<operand type="', self.vertical.type, '" arg="vertical">',
        self.vertical.toXML(), "</operand>",
        '<operand type="', self.content.type, '" arg="cell">',
        self.content.toXML(), "</operand>",
        '</operator>',
    ])
274
275
def demo():
    """
    A demonstration of the use of this class

    Parses a sample table query and prints the query, the parse tree and
    the XML export.
    """
    query = r'table(one/two/three, four, five)'

    # Print the query
    print("""
================================================================================
Query: ParadigmQuery(query)
================================================================================
""")
    a = ParadigmQuery(query)
    print(query)

    # Print the tree representation
    print("""
================================================================================
Tree: getTree()
O is an operator
T is a table
H is a hierarchy
D is a domain
================================================================================
""")
    print(a.getTree())

    # Print the XML representation
    print("""
================================================================================
XML: getXML()
================================================================================
""")
    print(a.getXML())

    # Trailing blank line, as the bare ``print`` statement produced
    print("")
313
314
# Run the demonstration when the module is executed as a script
if __name__ == '__main__':
    demo()
317