1
2
3
4
5
6
7
8
9
10 """
11 Extension of chart parsing implementation to handle grammars with
12 feature structures as nodes.
13 """
14
15 import yaml
16 from chart import *
17 from category import *
18 import cfg
19
20 from featurelite import *
21
def load_earley(filename, trace=1):
    """
    Load a grammar from a file, and build an Earley feature parser based on
    that grammar.

    You can optionally specify a tracing level, for how much output you
    want to see:

    0: No output.
    1: Show edges from scanner and completer rules (not predictor).
    2 (default): Show all edges as they are added to the chart.
    3: Show all edges, plus the results of successful unifications.
    4: Show all edges, plus the results of all attempted unifications.
    5: Show all edges, plus the results of all attempted unifications,
    including those with cached results.

    @param filename: the file to read the grammar from.
    @param trace: the tracing level (see above).
    @return: an Earley parser built from the grammar in C{filename}.
    """
    # NOTE(review): the original `def` line was lost in extraction; the
    # signature is reconstructed from the body and docstring -- confirm
    # against upstream nltk_lite.parse.featurechart.
    grammar = GrammarFile.read_file(filename)
    return grammar.earley_parser(trace)
41
class FeatureTreeEdge(TreeEdge):
    """
    FIXME: out of date documentation

    A modification of L{TreeEdge} to handle nonterminals with features
    (known as L{Categories<Category>}).

    In addition to the span, left-hand side, right-hand side, and dot position
    (described at L{TreeEdge}), a C{FeatureTreeEdge} includes X{vars}, a
    set of L{FeatureBindings} saying which L{FeatureVariable}s are set to which
    values.

    These values are applied when examining the C{lhs} or C{rhs} of a
    C{FeatureTreeEdge}.

    For more information about edges, see the L{EdgeI} interface.
    """
    def __init__(self, span, lhs, rhs, dot=0, vars=None):
        """
        Construct a new C{FeatureTreeEdge}.

        @type span: C{(int, int)}
        @param span: A tuple C{(s,e)}, where C{subtokens[s:e]} is the
            portion of the sentence that is consistent with the new
            edge's structure.
        @type lhs: L{Category}
        @param lhs: The new edge's left-hand side, specifying the
            hypothesized tree's node value.
        @type rhs: C{list} of (L{Category} and C{string})
        @param rhs: The new edge's right-hand side, specifying the
            hypothesized tree's children.
        @type dot: C{int}
        @param dot: The position of the new edge's dot.  This position
            specifies what prefix of the production's right hand side
            is consistent with the text.  In particular, if
            C{sentence} is the list of subtokens in the sentence, then
            C{subtokens[span[0]:span[1]]} can be spanned by the
            children specified by C{rhs[:dot]}.
        @type vars: L{FeatureBindings}
        @param vars: The bindings specifying what values certain variables in
            this edge must have.
        """
        TreeEdge.__init__(self, span, lhs, rhs, dot)
        # Allocate a fresh dict per edge rather than using a mutable
        # default argument, which would be shared across all edges.
        if vars is None: vars = {}
        self._vars = vars

    @staticmethod
    def from_production(production, index, bindings=None):
        """
        @return: A new C{FeatureTreeEdge} formed from the given production.
            The new edge's left-hand side and right-hand side will
            be taken from C{production}; its span will be C{(index,
            index)}; its dot position will be C{0}, and it may have specified
            variables set.
        @rtype: L{FeatureTreeEdge}
        """
        # NOTE(review): the original `def` line was lost in extraction;
        # the name and parameters are reconstructed from the docstring and
        # the body's keyword arguments -- confirm against upstream.
        return FeatureTreeEdge(span=(index, index), lhs=production.lhs(),
                               rhs=production.rhs(), dot=0, vars=bindings)

    def vars(self):
        """
        @return: the L{VariableBindings} mapping L{FeatureVariable}s to values.
        @rtype: L{VariableBindings}
        """
        return self._vars

    def lhs(self):
        """
        @return: the value of the left-hand side with variables set.
        @rtype: C{Category}
        """
        return substitute_bindings(TreeEdge.lhs(self), self._vars)

    def orig_lhs(self):
        """
        @return: the value of the left-hand side with no variables set.
        @rtype: C{Category}
        """
        return TreeEdge.lhs(self)

    def rhs(self):
        """
        @return: the value of the right-hand side with variables set.
        @rtype: C{Category}
        """
        # NOTE(review): `apply` here presumably comes from featurelite and
        # substitutes bindings into each rhs element -- confirm it is not
        # the Python 2 builtin of the same name.
        return tuple(apply(x, self._vars) for x in TreeEdge.rhs(self))

    def orig_rhs(self):
        """
        @return: the value of the right-hand side with no variables set.
        @rtype: C{Category}
        """
        return TreeEdge.rhs(self)

    def __str__(self):
        # Render as "LHS -> rhs..." with a '*' marking the dot position,
        # followed by the variable bindings.  Local renamed from `str`,
        # which shadowed the builtin.
        s = '%s ->' % self.lhs()
        for i in range(len(self._rhs)):
            if i == self._dot: s += ' *'
            s += ' %s' % (self.rhs()[i],)
        if len(self._rhs) == self._dot: s += ' *'
        return '%s %s' % (s, self._vars)
146
    def apply_iter(self, chart, grammar, left_edge, right_edge):
        """
        Apply the fundamental rule to C{left_edge} and C{right_edge}:
        unify the left edge's next-expected category with the right
        edge's (complete) left-hand side, and yield the combined edge
        if inserting it changed the chart.
        """
        # The fundamental rule only applies to an adjacent pair of
        # feature edges where the left edge still expects input and the
        # right edge is complete.
        if not (left_edge.end() == right_edge.start() and
                left_edge.is_incomplete() and right_edge.is_complete() and
                isinstance(left_edge, FeatureTreeEdge) and
                isinstance(right_edge, FeatureTreeEdge)
               ):
            return
        # Copy the bindings so a failed (or successful) unification can
        # never corrupt the input edges' own bindings.
        left_bindings = left_edge.vars().copy()
        right_bindings = right_edge.vars().copy()
        try:
            unified = unify(left_edge.next(), right_edge.lhs(), left_bindings,
                right_bindings, memo=self.unify_memo, trace=self.trace-2)
            if isinstance(unified, Category): unified.freeze()
        except UnificationFailure: return

        # Construct the combined edge: same lhs/rhs as the left edge, the
        # dot advanced past the matched category, span extended to cover
        # the right edge, and the (possibly extended) left bindings.
        new_edge = FeatureTreeEdge(span=(left_edge.start(), right_edge.end()),
                                   lhs=left_edge.lhs(), rhs=left_edge.rhs(),
                                   dot=left_edge.dot()+1, vars=left_bindings)

        # Insert it into the chart once per child-pointer list of the left
        # edge, appending the right edge as the newest child.
        changed_chart = False
        for cpl1 in chart.child_pointer_lists(left_edge):
            if chart.insert(new_edge, cpl1+(right_edge,)):
                changed_chart = True
            # Sanity check: the edge must now be reachable in the chart.
            assert chart.child_pointer_lists(new_edge), new_edge

        # Only generate the edge if it actually added something new.
        if changed_chart:
            yield new_edge
183
201
202
    """
    The C{TopDownExpandRule} specialised for feature-based grammars.
    """
224
226 """
227 A chart parser implementing the Earley parsing algorithm, allowing
228 nonterminals that have features (known as L{Categories<Category>}).
229
230 - For each index I{end} in [0, 1, ..., N]:
231 - For each I{edge} s.t. I{edge}.end = I{end}:
232 - If I{edge} is incomplete, and I{edge}.next is not a part
233 of speech:
234 - Apply PredictorRule to I{edge}
235 - If I{edge} is incomplete, and I{edge}.next is a part of
236 speech:
237 - Apply ScannerRule to I{edge}
238 - If I{edge} is complete:
239 - Apply CompleterRule to I{edge}
240 - Return any complete parses in the chart
241
242 C{FeatureEarleyChartParse} uses a X{lexicon} to decide whether a leaf
243 has a given part of speech. This lexicon is encoded as a
244 dictionary that maps each word to a list of parts of speech that
245 word can have. Unlike in the L{EarleyChartParse}, this lexicon is
246 case-insensitive.
247 """
248 - def __init__(self, grammar, lexicon, trace=0):
253
255 chart = Chart(tokens)
256 grammar = self._grammar
257
258
259
260 w = 2
261 if self._trace > 0: print ' '*9, chart.pp_leaves(w)
262
263
264 root = GrammarCategory(pos='[INIT]')
265 edge = FeatureTreeEdge((0,0), root, (grammar.start(),), 0,
266 {})
267 chart.insert(edge, ())
268
269
270 predictor = FeatureTopDownExpandRule(self._trace)
271 completer = SingleEdgeFeatureFundamentalRule(self._trace)
272
273
274 for end in range(chart.num_leaves()+1):
275 if self._trace > 1: print 'Processing queue %d' % end
276
277
278
279 if end > 0 and end-1 < chart.num_leaves():
280 leaf = chart.leaf(end-1)
281 for pos in self._lexicon(leaf):
282 new_leaf_edge = LeafEdge(leaf, end-1)
283 chart.insert(new_leaf_edge, ())
284 new_pos_edge = FeatureTreeEdge((end-1, end), pos, [leaf], 1,
285 {})
286 chart.insert(new_pos_edge, (new_leaf_edge,))
287 if self._trace > 0:
288 print 'Scanner ', chart.pp_edge(new_pos_edge,w)
289
290
291 for edge in chart.select(end=end):
292 if edge.is_incomplete():
293 for e in predictor.apply(chart, grammar, edge):
294 if self._trace > 1:
295 print 'Predictor', chart.pp_edge(e,w)
296
297
298
299
300 if edge.is_complete():
301 for e in completer.apply(chart, grammar, edge):
302 if self._trace > 0:
303 print 'Completer', chart.pp_edge(e,w)
304
305
306 return chart.parses(root)
307
309 import sys, time
310
311 S = GrammarCategory.parse('S')
312 VP = GrammarCategory.parse('VP')
313 NP = GrammarCategory.parse('NP')
314 PP = GrammarCategory.parse('PP')
315 V = GrammarCategory.parse('V')
316 N = GrammarCategory.parse('N')
317 P = GrammarCategory.parse('P')
318 Name = GrammarCategory.parse('Name')
319 Det = GrammarCategory.parse('Det')
320 DetSg = GrammarCategory.parse('Det[-pl]')
321 DetPl = GrammarCategory.parse('Det[+pl]')
322 NSg = GrammarCategory.parse('N[-pl]')
323 NPl = GrammarCategory.parse('N[+pl]')
324
325
326 grammatical_productions = [
327 cfg.Production(S, (NP, VP)), cfg.Production(PP, (P, NP)),
328 cfg.Production(NP, (NP, PP)),
329 cfg.Production(VP, (VP, PP)), cfg.Production(VP, (V, NP)),
330 cfg.Production(VP, (V,)), cfg.Production(NP, (DetPl, NPl)),
331 cfg.Production(NP, (DetSg, NSg))]
332
333
334 lexical_productions = [
335 cfg.Production(NP, ('John',)), cfg.Production(NP, ('I',)),
336 cfg.Production(Det, ('the',)), cfg.Production(Det, ('my',)),
337 cfg.Production(Det, ('a',)),
338 cfg.Production(NSg, ('dog',)), cfg.Production(NSg, ('cookie',)),
339 cfg.Production(V, ('ate',)), cfg.Production(V, ('saw',)),
340 cfg.Production(P, ('with',)), cfg.Production(P, ('under',)),
341 ]
342
343 earley_grammar = cfg.Grammar(S, grammatical_productions)
344 earley_lexicon = {}
345 for prod in lexical_productions:
346 earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())
347 def lexicon(word):
348 return earley_lexicon.get(word.upper(), [])
349
350 sent = 'I saw John with a dog with my cookie'
351 print "Sentence:\n", sent
352 from nltk_lite import tokenize
353 tokens = list(tokenize.whitespace(sent))
354 t = time.time()
355 cp = FeatureEarleyChartParse(earley_grammar, lexicon, trace=1)
356 trees = cp.get_parse_list(tokens)
357 print "Time: %s" % (time.time() - t)
358 for tree in trees: print tree
359
def profile_demo():
    """
    Run L{demo} under the profiler and print the top entries by both
    internal and cumulative time.

    NOTE(review): the original `def` line was lost in extraction; the
    function name is reconstructed -- confirm against upstream.
    """
    import profile
    profile.run('for i in range(1): demo()', '/tmp/profile.out')
    import pstats
    p = pstats.Stats('/tmp/profile.out')
    p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
    p.strip_dirs().sort_stats('cum', 'time').print_stats(60)
367
# Run the demonstration when executed as a script.
if __name__ == '__main__':
    demo()