1
2
3
4
5
6
7
8
9 import types
10 from nltk_lite.parse import *
11
12
13
14
15
17 """
18 A bottom-up C{PCFG} parser that uses dynamic programming to find
19 the single most likely parse for a text. The C{ViterbiParse} parser
20 parses texts by filling in a X{most likely constituent table}.
21 This table records the most probable tree representation for any
22 given span and node value. In particular, it has an entry for
23 every start index, end index, and node value, recording the most
24 likely subtree that spans from the start index to the end index,
25 and has the given node value.
26
27 The C{ViterbiParse} parser fills in this table incrementally. It starts
28 by filling in all entries for constituents that span one element
29 of text (i.e., entries where the end index is one greater than the
30 start index). After it has filled in all table entries for
31 constituents that span one element of text, it fills in the
    entries for constituents that span two elements of text. It
33 continues filling in the entries for constituents spanning larger
34 and larger portions of the text, until the entire table has been
35 filled. Finally, it returns the table entry for a constituent
36 spanning the entire text, whose node value is the grammar's start
37 symbol.
38
39 In order to find the most likely constituent with a given span and
40 node value, the C{ViterbiParse} parser considers all productions that
41 could produce that node value. For each production, it finds all
42 children that collectively cover the span and have the node values
43 specified by the production's right hand side. If the probability
44 of the tree formed by applying the production to the children is
45 greater than the probability of the current entry in the table,
46 then the table is updated with this new tree.
47
48 A pseudo-code description of the algorithm used by
49 C{ViterbiParse} is:
50
51 - Create an empty most likely constituent table, M{MLC}.
52 - For M{width} in 1...len(M{text}):
53 - For M{start} in 1...len(M{text})-M{width}:
54 - For M{prod} in grammar.productions:
55 - For each sequence of subtrees [M{t[1]}, M{t[2]}, ...,
56 M{t[n]}] in M{MLC}, where M{t[i]}.node==M{prod}.rhs[i],
57 and the sequence covers [M{start}:M{start}+M{width}]:
58 - M{old_p} = M{MLC}[M{start}, M{start+width}, M{prod}.lhs]
                - M{new_p} = P(M{t[1]})*P(M{t[2]})*...*P(M{t[n]})*P(M{prod})
60 - if M{new_p} > M{old_p}:
61 - M{new_tree} = Tree(M{prod}.lhs, M{t[1]}, M{t[2]},
62 ..., M{t[n]})
63 - M{MLC}[M{start}, M{start+width}, M{prod}.lhs]
64 = M{new_tree}
65 - Return M{MLC}[0, len(M{text}), M{start_symbol}]
66
67 @type _grammar: C{pcfg.Grammar}
68 @ivar _grammar: The grammar used to parse sentences.
69 @type _trace: C{int}
70 @ivar _trace: The level of tracing output that should be generated
71 when parsing a text.
72 """
        """
        Create a new C{ViterbiParse} parser, that uses C{grammar} to
        parse texts.

        @type grammar: C{pcfg.Grammar}
        @param grammar: The grammar used to parse texts.
        @type trace: C{int}
        @param trace: The level of tracing that should be used when
            parsing a text. C{0} will generate no tracing output;
            and higher numbers will produce more verbose tracing
            output.
        """
        # Grammar consulted when building constituents.
        self._grammar = grammar
        # Tracing verbosity: 0 is silent; larger values print more.
        self._trace = trace
        AbstractParse.__init__(self)
89
90 - def trace(self, trace=2):
91 """
92 Set the level of tracing output that should be generated when
93 parsing a text.
94
95 @type trace: C{int}
96 @param trace: The trace level. A trace level of C{0} will
97 generate no tracing output; and higher trace levels will
98 produce more verbose tracing output.
99 @rtype: C{None}
100 """
101 self._trace = trace
102

        # The most likely constituent table.  Maps triples
        # (start, end, node_value) to the most probable tree (or token)
        # found so far that covers tokens[start:end] with that node
        # value.  Filled in incrementally, shortest spans first.
        constituents = {}

        # Initialize the table with the individual tokens: each token
        # trivially covers its own one-element span.
        if self._trace: print ('Inserting tokens into the most likely'+
                               ' constituents table...')
        for index in range(len(tokens)):
            token = tokens[index]
            constituents[index,index+1,token] = token
            if self._trace > 1:
                self._trace_lexical_insertion(token, index, len(tokens))

        # Consider each span of length 1, 2, ..., n; and add any trees
        # that might cover that span to the constituents table.
        for length in range(1, len(tokens)+1):
            if self._trace:
                print ('Finding the most likely constituents'+
                       ' spanning %d text elements...' % length)

            for start in range(len(tokens)-length+1):
                span = (start, start+length)
                self._add_constituents_spanning(span, constituents,
                                                tokens)

        # Look up the tree that spans the entire text and whose node
        # value is the grammar's start symbol ([] if none was found).
        trees = [constituents.get((0, len(tokens),
                                   self._grammar.start()), [])]

        # Sort by decreasing probability (Python 2 cmp-style sort).
        trees.sort(lambda t1,t2: cmp(t2.prob(), t1.prob()))
        return trees
144
        """
        Find any constituents that might cover C{span}, and add them
        to the most likely constituents table.

        @rtype: C{None}
        @type span: C{(int, int)}
        @param span: The section of the text for which we are
            trying to find possible constituents. The span is
            specified as a pair of integers, where the first integer
            is the index of the first token that should be included in
            the constituent; and the second integer is the index of
            the first token that should not be included in the
            constituent. I.e., the constituent should cover
            C{M{text}[span[0]:span[1]]}, where C{M{text}} is the text
            that we are parsing.

        @type constituents: C{dictionary} from
            C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or
            C{ProbabilisticTree}).
        @param constituents: The most likely constituents table. This
            table records the most probable tree representation for
            any given span and node value. In particular,
            C{constituents(M{s},M{e},M{nv})} is the most likely
            C{ProbabilisticTree} that covers C{M{text}[M{s}:M{e}]}
            and has a node value C{M{nv}.symbol()}, where C{M{text}}
            is the text that we are parsing. When
            C{_add_constituents_spanning} is called, C{constituents}
            should contain all possible constituents that are shorter
            than C{span}.

        @type tokens: C{list} of tokens
        @param tokens: The text we are parsing. This is only used for
            trace output.
        """
        # Since some productions may be unary, keep re-trying all of
        # the productions until a full pass adds no new constituents.
        changed = 1
        while changed:
            changed = 0

            # Find all ways that any production can be instantiated to
            # cover this span with already-found child constituents.
            instantiations = self._find_instantiations(span, constituents)

            # For each production instantiation, build a new
            # ProbabilisticTree whose probability is the product of the
            # children's probabilities and the production's own
            # probability.
            for (production, children) in instantiations:
                subtrees = [c for c in children if isinstance(c, Tree)]
                p = reduce(lambda pr,t:pr*t.prob(),
                           subtrees, production.prob())
                node = production.lhs().symbol()
                tree = ProbabilisticTree(node, children, prob=p)

                # If the new tree is more probable than the current
                # table entry (or there is no entry), record it.
                c = constituents.get((span[0], span[1], production.lhs()),
                                     None)
                if self._trace > 1:
                    if c is None or c != tree:
                        if c is None or c.prob() < tree.prob():
                            print ' Insert:',
                        else:
                            print ' Discard:',
                        self._trace_production(production, p, span, len(tokens))
                if c is None or c.prob() < tree.prob():
                    constituents[span[0], span[1], production.lhs()] = tree
                    changed = 1
216
        """
        @return: a list of the production instantiations that cover a
            given span of the text. A X{production instantiation} is
            a tuple containing a production and a list of children,
            where the production's right hand side matches the list of
            children; and the children cover C{span}.
        @rtype: C{list} of C{pair} of C{Production}, (C{list} of
            (C{ProbabilisticTree} or token)).

        @type span: C{(int, int)}
        @param span: The section of the text for which we are
            trying to find production instantiations. The span is
            specified as a pair of integers, where the first integer
            is the index of the first token that should be covered by
            the production instantiation; and the second integer is
            the index of the first token that should not be covered by
            the production instantiation.
        @type constituents: C{dictionary} from
            C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or
            C{ProbabilisticTree}).
        @param constituents: The most likely constituents table. This
            table records the most probable tree representation for
            any given span and node value. See the module
            documentation for more information.
        """
        rv = []
        for production in self._grammar.productions():
            # All the ways this production's RHS can be matched by
            # constituents already recorded for this span.
            childlists = self._match_rhs(production.rhs(), span, constituents)

            for childlist in childlists:
                rv.append( (production, childlist) )
        return rv
250
        """
        @return: a set of all the lists of children that cover C{span}
            and that match C{rhs}.
        @rtype: C{list} of (C{list} of C{ProbabilisticTree} or
            C{Token})

        @type rhs: C{list} of C{Nonterminal} or (any)
        @param rhs: The list specifying what kinds of children need to
            cover C{span}. Each nonterminal in C{rhs} specifies
            that the corresponding child should be a tree whose node
            value is that nonterminal's symbol. Each terminal in C{rhs}
            specifies that the corresponding child should be a token
            whose type is that terminal.
        @type span: C{(int, int)}
        @param span: The section of the text for which we are
            trying to find child lists. The span is specified as a
            pair of integers, where the first integer is the index of
            the first token that should be covered by the child list;
            and the second integer is the index of the first token
            that should not be covered by the child list.
        @type constituents: C{dictionary} from
            C{(int,int,Nonterminal)} to (C{ProbabilisticToken} or
            C{ProbabilisticTree}).
        @param constituents: The most likely constituents table. This
            table records the most probable tree representation for
            any given span and node value. See the module
            documentation for more information.
        """
        (start, end) = span

        # Base cases: an empty rhs matches an empty span with the empty
        # child list; if only one of rhs/span is exhausted, no match.
        if start >= end and rhs == (): return [[]]
        if start >= end or rhs == (): return []

        # Match the first rhs symbol against every constituent starting
        # at `start`, then recursively match the rest of the rhs
        # against the remainder of the span.
        childlists = []
        for split in range(start, end+1):
            l=constituents.get((start,split,rhs[0]))
            if l is not None:
                rights = self._match_rhs(rhs[1:], (split,end), constituents)
                childlists += [[l]+r for r in rights]

        return childlists
295
        """
        Print trace output indicating that a given production has been
        applied at a given location.

        @param production: The production that has been applied
        @type production: C{Production}
        @param p: The probability of the tree produced by the production.
        @type p: C{float}
        @param span: The span of the production
        @type span: C{tuple}
        @param width: The total display width, in tokens; used to pad
            the span picture on the right.
        @type width: C{int}
        @rtype: C{None}
        """
        # Draw the span as a picture: '.' outside the span, '='
        # inside, bracketed by '|', followed by the production itself.
        # NOTE(review): the local name 'str' shadows the builtin.
        str = '|' + '.' * span[0]
        str += '=' * (span[1] - span[0])
        str += '.' * (width - span[1]) + '| '
        str += '%s' % production
        # At trace level > 2, also show the probability.
        if self._trace > 2: str = '%-40s %12.10f ' % (str, p)

        print str
317
322
        # Developer-oriented representation, showing the grammar in
        # use.  NOTE(review): the text says 'ViterbiParser' although
        # the surrounding code refers to ViterbiParse — confirm which
        # name is intended.
        return '<ViterbiParser for %r>' % self._grammar
325
326
327
328
329
330
    """
    A demonstration of the probabilistic parsers. The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from nltk_lite import tokenize
    from nltk_lite.parse import cfg, pcfg, ViterbiParse

    # Define two demos: each is a (sentence, grammar) pair.
    demos = [('I saw John with my cookie', pcfg.toy1),
             ('the boy saw Jack with Bob under the table with a telescope',
              pcfg.toy2)]

    # Ask the user which demo they want to run.
    print
    for i in range(len(demos)):
        print '%3s: %s' % (i+1, demos[i][0])
        print ' %r' % demos[i][1]
        print
    print 'Which demo (%d-%d)? ' % (1, len(demos)),
    try:
        snum = int(sys.stdin.readline().strip())-1
        sent, grammar = demos[snum]
    except:
        # NOTE(review): bare except maps any failure here (EOF,
        # non-integer input, out-of-range index) to this one message.
        print 'Bad sentence number'
        return

    # Tokenize the sentence on whitespace.
    tokens = list(tokenize.whitespace(sent))

    parser = ViterbiParse(grammar)
    all_parses = {}

    # Run the parser with full tracing, and time it.
    print '\nsent: %s\nparser: %s\ngrammar: %s' % (sent,parser,grammar)
    parser.trace(3)
    t = time.time()
    parses = parser.get_parse_list(tokens)
    # NOTE(review): this rebinds the local name 'time' from the module
    # to a float, so time.time() cannot be called again below.
    time = time.time()-t
    if parses:
        average = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
    else:
        average = 0
    num_parses = len(parses)
    for p in parses:
        # Frozen (immutable) copies are used as dict keys to
        # deduplicate the parses.
        all_parses[p.freeze()] = 1

    # Print summary statistics for this run.
    print
    print 'Time (secs) # Parses Average P(parse)'
    print '-----------------------------------------'
    print '%11.4f%11d%19.14f' % (time, num_parses, average)
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
    else: p = 0
    print '------------------------------------------'
    print '%11s%11d%19.14f' % ('n/a', len(parses), p)

    # Optionally draw the parses in a graphical window.
    print
    print 'Draw parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk_lite.draw.tree import draw_trees
        print ' please wait...'
        draw_trees(*parses)

    # Optionally print the parses as text.
    print
    print 'Print parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print parse
406
# Run the interactive demonstration when executed as a script.
if __name__ == '__main__':
    demo()
409