1
2
3
4
5
6
7
8
9 """
10 Classes and interfaces for associating probabilities with tree
11 structures that represent the internal organization of a text. The
12 probabilistic parser module defines C{BottomUpChartParse}.
13
14 C{BottomUpChartParse} is an abstract class that implements a
15 bottom-up chart parser for C{PCFG}s. It maintains a queue of edges,
16 and adds them to the chart one at a time. The ordering of this queue
17 is based on the probabilities associated with the edges, allowing the
18 parser to expand more likely edges before less likely ones. Each
19 subclass implements a different queue ordering, producing different
20 search strategies. Currently the following subclasses are defined:
21
22 - C{InsideParse} searches edges in decreasing order of
23 their trees' inside probabilities.
24 - C{RandomParse} searches edges in random order.
25 - C{LongestParse} searches edges in decreasing order of their
26 location's length.
27
28 The C{BottomUpChartParse} constructor has an optional argument beam_size.
29 If non-zero, this controls the size of the beam (aka the edge queue).
30 This option is most useful with InsideParse.
31 """
32
33
34
35
36
37
38
39
40 from nltk_lite.parse import *
41
42
44 - def prob(self): return 1.0
45
58
59
67
77
102
104 NUM_EDGES=1
105
106 _fundamental_rule = ProbabilisticFundamentalRule()
107
109 fr = self._fundamental_rule
110 if edge1.is_incomplete():
111
112 for edge2 in chart.select(start=edge1.end(), is_complete=True,
113 lhs=edge1.next()):
114 for new_edge in fr.apply_iter(chart, grammar, edge1, edge2):
115 yield new_edge
116 else:
117
118 for edge2 in chart.select(end=edge1.start(), is_complete=False,
119 next=edge1.lhs()):
120 for new_edge in fr.apply_iter(chart, grammar, edge2, edge1):
121 yield new_edge
122
123 - def __str__(self): return 'Fundamental Rule'
124
126 """
127 An abstract bottom-up parser for C{PCFG}s that uses a C{Chart} to
128 record partial results. C{BottomUpChartParse} maintains a
129 queue of edges that can be added to the chart. This queue is
130 initialized with edges for each token in the text that is being
131 parsed. C{BottomUpChartParse} inserts these edges into the
132 chart one at a time, starting with the most likely edges, and
133 proceeding to less likely edges. For each edge that is added to
134 the chart, it may become possible to insert additional edges into
135 the chart; these are added to the queue. This process continues
136 until enough complete parses have been generated, or until the
137 queue is empty.
138
139 The sorting order for the queue is not specified by
140 C{BottomUpChartParse}. Different sorting orders will result
141 in different search strategies. The sorting order for the queue
142 is defined by the method C{sort_queue}; subclasses are required
143 to provide a definition for this method.
144
145 @type _grammar: C{PCFG}
146 @ivar _grammar: The grammar used to parse sentences.
147 @type _trace: C{int}
148 @ivar _trace: The level of tracing output that should be generated
149 when parsing a text.
150 """
151 - def __init__(self, grammar, beam_size=0, trace=0):
152 """
153 Create a new C{BottomUpChartParse}, that uses C{grammar}
154 to parse texts.
155
156 @type grammar: C{PCFG}
157 @param grammar: The grammar used to parse texts.
158 @type beam_size: C{int}
159 @param beam_size: The maximum length for the parser's edge queue.
160 @type trace: C{int}
161 @param trace: The level of tracing that should be used when
162 parsing a text. C{0} will generate no tracing output;
163 and higher numbers will produce more verbose tracing
164 output.
165 """
166 self._grammar = grammar
167 self.beam_size = beam_size
168 self._trace = trace
169 AbstractParse.__init__(self)
170
171 - def trace(self, trace=2):
172 """
173 Set the level of tracing output that should be generated when
174 parsing a text.
175
176 @type trace: C{int}
177 @param trace: The trace level. A trace level of C{0} will
178 generate no tracing output; and higher trace levels will
179 produce more verbose tracing output.
180 @rtype: C{None}
181 """
182 self._trace = trace
183
185 chart = Chart(list(tokens))
186 grammar = self._grammar
187
188
189 bu_init = BottomUpInitRule()
190 bu = BottomUpPredictRule()
191 fr = SingleEdgeProbabilisticFundamentalRule()
192
193
194 queue = []
195
196
197 for e in bu_init.apply_iter(chart, grammar):
198 if self._trace>1: chart.pp_edge(e,width=2)
199 queue.append(e)
200
201 while len(queue) > 0:
202
203 self.sort_queue(queue, chart)
204
205
206 if self.beam_size:
207 self._prune(queue, chart)
208
209
210 edge = queue.pop()
211 if self._trace>0:
212 print ' %-50s [%s]' % (chart.pp_edge(edge,width=2),
213 edge.prob())
214
215
216 queue.extend(bu.apply(chart, grammar, edge))
217 queue.extend(fr.apply(chart, grammar, edge))
218
219
220 parses = chart.parses(grammar.start(), ProbabilisticTree)
221
222
223 prod_probs = {}
224 for prod in grammar.productions():
225 prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
226 for parse in parses:
227 self._setprob(parse, prod_probs)
228
229
230 parses.sort(lambda a,b: cmp(b.prob(), a.prob()))
231
232 return parses
233
254
256 """
257 Sort the given queue of C{Edge}s, placing the edge that should
258 be tried first at the beginning of the queue. This method
259 will be called after each C{Edge} is added to the queue.
260
261 @param queue: The queue of C{Edge}s to sort. Each edge in
262 this queue is an edge that could be added to the chart by
263 the fundamental rule; but that has not yet been added.
264 @type queue: C{list} of C{Edge}
265 @param chart: The chart being used to parse the text. This
266 chart can be used to provide extra information for sorting
267 the queue.
268 @type chart: C{Chart}
269 @rtype: C{None}
270 """
271 raise AssertionError, "BottomUpChartParse is an abstract class"
272
273 - def _prune(self, queue, chart):
274 """ Discard items in the queue if the queue is longer than the beam."""
275 if len(queue) > self.beam_size:
276 split = len(queue)-self.beam_size
277 if self._trace > 2:
278 for edge in queue[:split]:
279 print ' %-50s [DISCARDED]' % chart.pp_edge(edge,2)
280 del queue[:split]
281
283 """
284 A bottom-up parser for C{PCFG}s that tries edges in descending
285 order of the inside probabilities of their trees. The X{inside
286 probability} of a tree is simply the
287 probability of the entire tree, ignoring its context. In
288 particular, the inside probability of a tree generated by
289 production M{p} with children M{c[1]}, M{c[2]}, ..., M{c[n]} is
290 P(M{p})*P(M{c[1]})*P(M{c[2]})*M{...}*P(M{c[n]}); and the inside
291 probability of a token is 1 if it is present in the text, and 0 if
292 it is absent.
293
294 This sorting order results in a type of lowest-cost-first search
295 strategy.
296 """
297
299 """
300 Sort the given queue of edges, in descending order of the
301 inside probabilities of the edges' trees.
302
303 @param queue: The queue of C{Edge}s to sort. Each edge in
304 this queue is an edge that could be added to the chart by
305 the fundamental rule; but that has not yet been added.
306 @type queue: C{list} of C{Edge}
307 @param chart: The chart being used to parse the text. This
308 chart can be used to provide extra information for sorting
309 the queue.
310 @type chart: C{Chart}
311 @rtype: C{None}
312 """
313 queue.sort(lambda e1,e2:cmp(e1.prob(), e2.prob()))
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343 import random
345 """
346 A bottom-up parser for C{PCFG}s that tries edges in random order.
347 This sorting order results in a random search strategy.
348 """
349
351 i = random.randint(0, len(queue)-1)
352 (queue[-1], queue[i]) = (queue[i], queue[-1])
353
355 """
356 A bottom-up parser for C{PCFG}s that tries edges in whatever order.
357 """
358
360
362 """
363 A bottom-up parser for C{PCFG}s that tries longer edges before
364 shorter ones. This sorting order results in a type of best-first
365 search strategy.
366 """
367
370
371
372
373
374
376 """
377 A demonstration of the probabilistic parsers. The user is
378 prompted to select which demo to run, and how many parses should
379 be found; and then each parser is run on the same demo, and a
380 summary of the results are displayed.
381 """
382 import sys, time
383 from nltk_lite import tokenize
384 from nltk_lite.parse import cfg, pcfg, pchart
385
386
387 demos = [('I saw John with my cookie', pcfg.toy1),
388 ('the boy saw Jack with Bob under the table with a telescope',
389 pcfg.toy2)]
390
391
392 print
393 for i in range(len(demos)):
394 print '%3s: %s' % (i+1, demos[i][0])
395 print ' %r' % demos[i][1]
396 print
397 print 'Which demo (%d-%d)? ' % (1, len(demos)),
398 try:
399 snum = int(sys.stdin.readline().strip())-1
400 sent, grammar = demos[snum]
401 except:
402 print 'Bad sentence number'
403 return
404
405
406 tokens = list(tokenize.whitespace(sent))
407
408
409 parsers = [
410 pchart.InsideParse(grammar),
411 pchart.RandomParse(grammar),
412 pchart.UnsortedParse(grammar),
413 pchart.LongestParse(grammar),
414 pchart.InsideParse(grammar, beam_size = len(tokens)+1)
415 ]
416
417
418 times = []
419 average_p = []
420 num_parses = []
421 all_parses = {}
422 for parser in parsers:
423 print '\ns: %s\nparser: %s\ngrammar: %s' % (sent,parser,pcfg)
424 parser.trace(3)
425 t = time.time()
426 parses = parser.get_parse_list(tokens)
427 times.append(time.time()-t)
428 if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
429 else: p = 0
430 average_p.append(p)
431 num_parses.append(len(parses))
432 for p in parses: all_parses[p.freeze()] = 1
433
434
435 print
436 print ' Parser Beam | Time (secs) # Parses Average P(parse)'
437 print '------------------------+------------------------------------------'
438 for i in range(len(parsers)):
439 print '%18s %4d |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
440 parsers[i].beam_size,
441 times[i],num_parses[i],average_p[i])
442 parses = all_parses.keys()
443 if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
444 else: p = 0
445 print '------------------------+------------------------------------------'
446 print '%18s |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p)
447
448
449 print
450 print 'Draw parses (y/n)? ',
451 if sys.stdin.readline().strip().lower().startswith('y'):
452 from nltk_lite.draw.tree import draw_trees
453 print ' please wait...'
454 draw_trees(*parses)
455
456
457 print
458 print 'Print parses (y/n)? ',
459 if sys.stdin.readline().strip().lower().startswith('y'):
460 for parse in parses:
461 print parse
462
463 if __name__ == '__main__':
464 demo()
465