10 """
11 Extension of chart parsing implementation to handle grammars with
12 feature structures as nodes.
13 """
14
15 import yaml
16 from chart import *
17 from category import *
18 import cfg
19
20 from featurelite import *
21
def load_earley(filename, trace=2):
    """
    Load a grammar from a file, and build an Earley feature parser based on
    that grammar.

    You can optionally specify a tracing level, for how much output you
    want to see:

    0: No output.
    1: Show edges from scanner and completer rules (not predictor).
    2 (default): Show all edges as they are added to the chart.
    3: Show all edges, plus the results of successful unifications.
    4: Show all edges, plus the results of all attempted unifications.
    5: Show all edges, plus the results of all attempted unifications,
       including those with cached results.

    @param filename: path of the grammar file to read.
    @param trace: tracing level as described above.
    @return: an Earley parser built from the grammar in C{filename}.
    """
    # NOTE(review): the original `def` line was lost in extraction; the
    # signature is reconstructed from the docstring (which names 2 as the
    # default trace level) and the body -- confirm against version control.
    grammar = GrammarFile.read_file(filename)
    return grammar.earley_parser(trace)
41
class FeatureTreeEdge(TreeEdge):
    """
    FIXME: out of date documentation

    A modification of L{TreeEdge} to handle nonterminals with features
    (known as L{Categories<Category>}).

    In addition to the span, left-hand side, right-hand side, and dot position
    (described at L{TreeEdge}), a C{FeatureTreeEdge} includes X{vars}, a
    set of L{FeatureBindings} saying which L{FeatureVariable}s are set to which
    values.

    These values are applied when examining the C{lhs} or C{rhs} of a
    C{FeatureTreeEdge}.

    For more information about edges, see the L{EdgeI} interface.

    NOTE(review): the C{class} line and most C{def} lines of this class were
    lost in extraction and have been reconstructed from the docstrings and
    method bodies -- confirm the names against version control.
    """
    def __init__(self, span, lhs, rhs, dot=0, vars=None):
        """
        Construct a new C{FeatureTreeEdge}.

        @type span: C{(int, int)}
        @param span: A tuple C{(s,e)}, where C{subtokens[s:e]} is the
            portion of the sentence that is consistent with the new
            edge's structure.
        @type lhs: L{Category}
        @param lhs: The new edge's left-hand side, specifying the
            hypothesized tree's node value.
        @type rhs: C{list} of (L{Category} and C{string})
        @param rhs: The new edge's right-hand side, specifying the
            hypothesized tree's children.
        @type dot: C{int}
        @param dot: The position of the new edge's dot.  In particular, if
            C{sentence} is the list of subtokens in the sentence, then
            C{subtokens[span[0]:span[1]]} can be spanned by the
            children specified by C{rhs[:dot]}.
        @type vars: L{FeatureBindings}
        @param vars: The bindings specifying what values certain variables in
            this edge must have.
        """
        TreeEdge.__init__(self, span, lhs, rhs, dot)
        # Store an empty binding dict rather than None so that lhs()/rhs()
        # can apply the bindings unconditionally.
        if vars is None: vars = {}
        self._vars = vars

    @staticmethod
    def from_production(production, index, bindings=None):
        """
        @return: A new C{FeatureTreeEdge} formed from the given production.
            The new edge's left-hand side and right-hand side will
            be taken from C{production}; its span will be C{(index,
            index)}; its dot position will be C{0}, and it may have specified
            variables set.
        @rtype: L{FeatureTreeEdge}
        """
        return FeatureTreeEdge(span=(index, index), lhs=production.lhs(),
                               rhs=production.rhs(), dot=0, vars=bindings)

    def vars(self):
        """
        @return: the L{VariableBindings} mapping L{FeatureVariable}s to values.
        @rtype: L{VariableBindings}
        """
        return self._vars

    def lhs(self):
        """
        @return: the value of the left-hand side with variables set.
        @rtype: C{Category}
        """
        # `apply` here is the binding-substitution function from featurelite
        # (imported via `from featurelite import *`), not the builtin.
        return apply(TreeEdge.lhs(self), self._vars)

    def orig_lhs(self):
        """
        @return: the value of the left-hand side with no variables set.
        @rtype: C{Category}
        """
        return TreeEdge.lhs(self)

    def rhs(self):
        """
        @return: the value of the right-hand side with variables set.
        @rtype: C{Category}
        """
        return tuple(apply(x, self._vars) for x in TreeEdge.rhs(self))

    def orig_rhs(self):
        """
        @return: the value of the right-hand side with no variables set.
        @rtype: C{Category}
        """
        return TreeEdge.rhs(self)

    def __str__(self):
        # Renamed the local from `str` to `s` -- the original shadowed the
        # builtin; output is unchanged.
        s = '%s ->' % self.lhs()
        for i in range(len(self._rhs)):
            if i == self._dot: s += ' *'
            s += ' %s' % (self.rhs()[i],)
        if len(self._rhs) == self._dot: s += ' *'
        return '%s %s' % (s, self._vars)
146
class FeatureFundamentalRule(FundamentalRule):
    """
    The fundamental rule specialised for feature-based grammars: combine an
    incomplete edge with an adjacent complete edge by unifying the
    incomplete edge's next expected category with the complete edge's
    left-hand side.

    NOTE(review): the C{class} line was lost in extraction and has been
    reconstructed; C{self.unify_memo} and C{self.trace} are read below but
    set by a constructor not visible here -- confirm against version control.
    """
    def apply_iter(self, chart, grammar, left_edge, right_edge):
        # The rule only fires for an adjacent (incomplete, complete) pair
        # of feature edges.
        if not (left_edge.end() == right_edge.start() and
                left_edge.is_incomplete() and right_edge.is_complete() and
                isinstance(left_edge, FeatureTreeEdge) and
                isinstance(right_edge, FeatureTreeEdge)
            ):
            return
        # Copy the bindings: unification mutates them, and the edges already
        # in the chart must not be modified.
        left_bindings = left_edge.vars().copy()
        right_bindings = right_edge.vars().copy()
        try:
            unified = unify(left_edge.next(), right_edge.lhs(), left_bindings,
                            right_bindings, memo=self.unify_memo, trace=self.trace-2)
            # Freezing makes the unified category immutable/hashable.
            if isinstance(unified, Category): unified.freeze()
        except UnificationFailure: return

        # Advance the dot over the completed constituent, carrying along the
        # bindings produced by unification.
        new_edge = FeatureTreeEdge(span=(left_edge.start(), right_edge.end()),
                                   lhs=left_edge.lhs(), rhs=left_edge.rhs(),
                                   dot=left_edge.dot()+1, vars=left_bindings)

        # Insert once per child-pointer list of the left edge; only yield the
        # new edge if the chart actually changed.
        changed_chart = False
        for cpl1 in chart.child_pointer_lists(left_edge):
            if chart.insert(new_edge, cpl1+(right_edge,)):
                changed_chart = True

        if changed_chart: yield new_edge
181
199
201 """
The C{TopDownExpandRule} specialised for feature-based grammars.
203 """
221
223 """
224 A chart parser implementing the Earley parsing algorithm, allowing
225 nonterminals that have features (known as L{Categories<Category>}).
226
227 - For each index I{end} in [0, 1, ..., N]:
228 - For each I{edge} s.t. I{edge}.end = I{end}:
229 - If I{edge} is incomplete, and I{edge}.next is not a part
230 of speech:
231 - Apply PredictorRule to I{edge}
232 - If I{edge} is incomplete, and I{edge}.next is a part of
233 speech:
234 - Apply ScannerRule to I{edge}
235 - If I{edge} is complete:
236 - Apply CompleterRule to I{edge}
237 - Return any complete parses in the chart
238
239 C{FeatureEarleyChartParse} uses a X{lexicon} to decide whether a leaf
240 has a given part of speech. This lexicon is encoded as a
241 dictionary that maps each word to a list of parts of speech that
242 word can have. Unlike in the L{EarleyChartParse}, this lexicon is
243 case-insensitive.
244 """
245 - def __init__(self, grammar, lexicon, trace=0):
250
252 chart = Chart(tokens)
253 grammar = self._grammar
254
255
256
257 w = 2
258 if self._trace > 0: print ' '*9, chart.pp_leaves(w)
259
260
261 root = GrammarCategory(pos='[INIT]')
262 edge = FeatureTreeEdge((0,0), root, (grammar.start(),), 0,
263 {})
264 chart.insert(edge, ())
265
266
267 predictor = FeatureTopDownExpandRule(self._trace)
268 completer = SingleEdgeFeatureFundamentalRule(self._trace)
269
270
271 for end in range(chart.num_leaves()+1):
272 if self._trace > 1: print 'Processing queue %d' % end
273
274
275
276 if end > 0 and end-1 < chart.num_leaves():
277 leaf = chart.leaf(end-1)
278 for pos in self._lexicon(leaf):
279 new_leaf_edge = LeafEdge(leaf, end-1)
280 chart.insert(new_leaf_edge, ())
281 new_pos_edge = FeatureTreeEdge((end-1, end), pos, [leaf], 1,
282 {})
283 chart.insert(new_pos_edge, (new_leaf_edge,))
284 if self._trace > 0:
285 print 'Scanner ', chart.pp_edge(new_pos_edge,w)
286
287
288 for edge in chart.select(end=end):
289 if edge.is_incomplete():
290 for e in predictor.apply(chart, grammar, edge):
291 if self._trace > 1:
292 print 'Predictor', chart.pp_edge(e,w)
293
294
295
296
297 if edge.is_complete():
298 for e in completer.apply(chart, grammar, edge):
299 if self._trace > 0:
300 print 'Completer', chart.pp_edge(e,w)
301
302
303 return chart.parses(root)
304
306 import sys, time
307
308 S = GrammarCategory.parse('S')
309 VP = GrammarCategory.parse('VP')
310 NP = GrammarCategory.parse('NP')
311 PP = GrammarCategory.parse('PP')
312 V = GrammarCategory.parse('V')
313 N = GrammarCategory.parse('N')
314 P = GrammarCategory.parse('P')
315 Name = GrammarCategory.parse('Name')
316 Det = GrammarCategory.parse('Det')
317 DetSg = GrammarCategory.parse('Det[-pl]')
318 DetPl = GrammarCategory.parse('Det[+pl]')
319 NSg = GrammarCategory.parse('N[-pl]')
320 NPl = GrammarCategory.parse('N[+pl]')
321
322
323 grammatical_productions = [
324 cfg.Production(S, (NP, VP)), cfg.Production(PP, (P, NP)),
325 cfg.Production(NP, (NP, PP)),
326 cfg.Production(VP, (VP, PP)), cfg.Production(VP, (V, NP)),
327 cfg.Production(VP, (V,)), cfg.Production(NP, (DetPl, NPl)),
328 cfg.Production(NP, (DetSg, NSg))]
329
330
331 lexical_productions = [
332 cfg.Production(NP, ('John',)), cfg.Production(NP, ('I',)),
333 cfg.Production(Det, ('the',)), cfg.Production(Det, ('my',)),
334 cfg.Production(Det, ('a',)),
335 cfg.Production(NSg, ('dog',)), cfg.Production(NSg, ('cookie',)),
336 cfg.Production(V, ('ate',)), cfg.Production(V, ('saw',)),
337 cfg.Production(P, ('with',)), cfg.Production(P, ('under',)),
338 ]
339
340 earley_grammar = cfg.Grammar(S, grammatical_productions)
341 earley_lexicon = {}
342 for prod in lexical_productions:
343 earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())
344 def lexicon(word):
345 return earley_lexicon.get(word.upper(), [])
346
347 sent = 'I saw John with a dog with my cookie'
348 print "Sentence:\n", sent
349 from nltk_lite import tokenize
350 tokens = list(tokenize.whitespace(sent))
351 t = time.time()
352 cp = FeatureEarleyChartParse(earley_grammar, lexicon, trace=1)
353 trees = cp.get_parse_list(tokens)
354 print "Time: %s" % (time.time() - t)
355 for tree in trees: print tree
356
def profile_demo():
    """
    Profile one run of demo() and print the hottest 60 entries sorted both
    by self time and by cumulative time.

    NOTE(review): the `def` line was lost in extraction; the name
    `profile_demo` is reconstructed -- confirm against version control.
    """
    import profile
    profile.run('for i in range(1): demo()', '/tmp/profile.out')
    import pstats
    p = pstats.Stats('/tmp/profile.out')
    p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
    p.strip_dirs().sort_stats('cum', 'time').print_stats(60)
364
# Standard script entry point: run the demonstration parser.
if __name__ == '__main__':
    demo()
367