
Source Code for Module nltk_lite.contrib.mit.six863.parse.featurechart

# Natural Language Toolkit: Chart Parser for Feature-Based Grammars
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Rob Speer <rspeer@mit.edu>
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT
#
# $Id: featurechart.py 4107 2007-02-01 00:07:42Z stevenbird $

"""
Extension of chart parsing implementation to handle grammars with
feature structures as nodes.
"""

import yaml
from chart import *
from category import *
import cfg

from featurelite import *

def load_earley(filename, trace=1):
    """
    Load a grammar from a file, and build an Earley feature parser based on
    that grammar.

    You can optionally specify a tracing level, for how much output you
    want to see:

    0: No output.
    1 (default): Show edges from scanner and completer rules (not predictor).
    2: Show all edges as they are added to the chart.
    3: Show all edges, plus the results of successful unifications.
    4: Show all edges, plus the results of all attempted unifications.
    5: Show all edges, plus the results of all attempted unifications,
       including those with cached results.
    """

    grammar = GrammarFile.read_file(filename)
    return grammar.earley_parser(trace)

class FeatureTreeEdge(TreeEdge):
    """
    FIXME: out of date documentation

    A modification of L{TreeEdge} to handle nonterminals with features
    (known as L{Categories<Category>}).

    In addition to the span, left-hand side, right-hand side, and dot position
    (described at L{TreeEdge}), a C{FeatureTreeEdge} includes X{vars}, a
    set of L{FeatureBindings} saying which L{FeatureVariable}s are set to which
    values.

    These values are applied when examining the C{lhs} or C{rhs} of a
    C{FeatureTreeEdge}.

    For more information about edges, see the L{EdgeI} interface.
    """
    def __init__(self, span, lhs, rhs, dot=0, vars=None):
        """
        Construct a new C{FeatureTreeEdge}.

        @type span: C{(int, int)}
        @param span: A tuple C{(s,e)}, where C{subtokens[s:e]} is the
            portion of the sentence that is consistent with the new
            edge's structure.
        @type lhs: L{Category}
        @param lhs: The new edge's left-hand side, specifying the
            hypothesized tree's node value.
        @type rhs: C{list} of (L{Category} and C{string})
        @param rhs: The new edge's right-hand side, specifying the
            hypothesized tree's children.
        @type dot: C{int}
        @param dot: The position of the new edge's dot.  This position
            specifies what prefix of the production's right hand side
            is consistent with the text.  In particular, if
            C{sentence} is the list of subtokens in the sentence, then
            C{subtokens[span[0]:span[1]]} can be spanned by the
            children specified by C{rhs[:dot]}.
        @type vars: L{FeatureBindings}
        @param vars: The bindings specifying what values certain variables in
            this edge must have.
        """
        TreeEdge.__init__(self, span, lhs, rhs, dot)
        if vars is None: vars = {}
        self._vars = vars

    @staticmethod
    def from_production(production, index, bindings=None):
        """
        @return: A new C{FeatureTreeEdge} formed from the given production.
            The new edge's left-hand side and right-hand side will
            be taken from C{production}; its span will be C{(index,
            index)}; its dot position will be C{0}, and it may have specified
            variables set.
        @rtype: L{FeatureTreeEdge}
        """
        return FeatureTreeEdge(span=(index, index), lhs=production.lhs(),
                               rhs=production.rhs(), dot=0, vars=bindings)

    # Accessors
    def vars(self):
        """
        @return: the L{VariableBindings} mapping L{FeatureVariable}s to values.
        @rtype: L{VariableBindings}
        """
        return self._vars

    def lhs(self):
        """
        @return: the value of the left-hand side with variables set.
        @rtype: C{Category}
        """
        return apply(TreeEdge.lhs(self), self._vars)

    def orig_lhs(self):
        """
        @return: the value of the left-hand side with no variables set.
        @rtype: C{Category}
        """
        return TreeEdge.lhs(self)

    def rhs(self):
        """
        @return: the value of the right-hand side with variables set.
        @rtype: C{tuple} of C{Category}
        """
        return tuple(apply(x, self._vars) for x in TreeEdge.rhs(self))

    def orig_rhs(self):
        """
        @return: the value of the right-hand side with no variables set.
        @rtype: C{tuple} of C{Category}
        """
        return TreeEdge.rhs(self)

    # String representation
    def __str__(self):
        str = '%s ->' % self.lhs()

        for i in range(len(self._rhs)):
            if i == self._dot: str += ' *'
            str += ' %s' % (self.rhs()[i],)
        if len(self._rhs) == self._dot: str += ' *'
        return '%s %s' % (str, self._vars)
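
# Illustrative sketch (hypothetical categories, mirroring demo below): an edge
# built with from_production spans no input and has its dot at 0; lhs() and
# rhs() report categories with the current bindings applied, while orig_lhs()
# and orig_rhs() return them untouched.
#
#     prod = cfg.Production(GrammarCategory.parse('S'),
#                           (GrammarCategory.parse('NP'), GrammarCategory.parse('VP')))
#     edge = FeatureTreeEdge.from_production(prod, 0)
#     edge.start(), edge.end(), edge.lhs(), edge.rhs()   # 0, 0, S, (NP, VP)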

class FeatureFundamentalRule(FundamentalRule):
    def __init__(self, trace=0):
        FundamentalRule.__init__(self)
        self.trace = trace
        self.unify_memo = {}

    def apply_iter(self, chart, grammar, left_edge, right_edge):
        # Make sure the rule is applicable.
        if not (left_edge.end() == right_edge.start() and
                left_edge.is_incomplete() and right_edge.is_complete() and
                isinstance(left_edge, FeatureTreeEdge) and
                isinstance(right_edge, FeatureTreeEdge)
               ):
            return
        left_bindings = left_edge.vars().copy()
        right_bindings = right_edge.vars().copy()
        try:
            unified = unify(left_edge.next(), right_edge.lhs(), left_bindings,
                            right_bindings, memo=self.unify_memo,
                            trace=self.trace-2)
            if isinstance(unified, Category): unified.freeze()
        except UnificationFailure: return

        # Construct the new edge.
        new_edge = FeatureTreeEdge(span=(left_edge.start(), right_edge.end()),
                                   lhs=left_edge.lhs(), rhs=left_edge.rhs(),
                                   dot=left_edge.dot()+1, vars=left_bindings)

        # Add it to the chart, with appropriate child pointers.
        changed_chart = False
        for cpl1 in chart.child_pointer_lists(left_edge):
            if chart.insert(new_edge, cpl1+(right_edge,)):
                changed_chart = True

        # If we changed the chart, then generate the edge.
        if changed_chart: yield new_edge
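
# Sketch of the unification step above (hypothetical categories; assumes the
# memo and trace keyword arguments of unify are optional): unify is given a
# value from each edge plus that edge's bindings dict and raises
# UnificationFailure on a clash; on success the updated left-edge bindings are
# reused as the new edge's vars.
#
#     bindings = {}
#     unified = unify(GrammarCategory.parse('NP[+pl]'),
#                     GrammarCategory.parse('NP'), bindings, {})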

class SingleEdgeFeatureFundamentalRule(SingleEdgeFundamentalRule):
    def __init__(self, trace=0):
        SingleEdgeFundamentalRule.__init__(self)
        # Assumed reconstruction: apply_iter below expects a feature-aware
        # fundamental rule on self._fundamental_rule.
        self._fundamental_rule = FeatureFundamentalRule(trace)

    def apply_iter(self, chart, grammar, edge1):
        fr = self._fundamental_rule
        if edge1.is_incomplete():
            # edge1 = left_edge; edge2 = right_edge
            for edge2 in chart.select(start=edge1.end(), is_complete=True):
                for new_edge in fr.apply_iter(chart, grammar, edge1, edge2):
                    yield new_edge
        else:
            # edge2 = left_edge; edge1 = right_edge
            for edge2 in chart.select(end=edge1.start(), is_complete=False):
                for new_edge in fr.apply_iter(chart, grammar, edge2, edge1):
                    yield new_edge

class FeatureTopDownExpandRule(TopDownExpandRule):
    """
    The C{TopDownExpandRule} specialised for feature-based grammars.
    """
    def __init__(self, trace=0):
        TopDownExpandRule.__init__(self)
        self.unify_memo = {}
        self.trace = trace

    def apply_iter(self, chart, grammar, edge):
        if edge.is_complete(): return
        for prod in grammar.productions():
            bindings = edge.vars().copy()
            try:
                unified = unify(edge.next(), prod.lhs(), bindings, {},
                                memo=self.unify_memo, trace=self.trace-2)
                if isinstance(unified, Category): unified.freeze()
            except UnificationFailure:
                continue
            new_edge = FeatureTreeEdge.from_production(prod, edge.end())
            if chart.insert(new_edge, ()):
                yield new_edge

class FeatureEarleyChartParse(EarleyChartParse):
    """
    A chart parser implementing the Earley parsing algorithm, allowing
    nonterminals that have features (known as L{Categories<Category>}).

      - For each index I{end} in [0, 1, ..., N]:
        - For each I{edge} s.t. I{edge}.end = I{end}:
          - If I{edge} is incomplete, and I{edge}.next is not a part
            of speech:
            - Apply PredictorRule to I{edge}
          - If I{edge} is incomplete, and I{edge}.next is a part of
            speech:
            - Apply ScannerRule to I{edge}
          - If I{edge} is complete:
            - Apply CompleterRule to I{edge}
      - Return any complete parses in the chart

    C{FeatureEarleyChartParse} uses a X{lexicon} to decide whether a leaf
    has a given part of speech.  This lexicon is a callable that maps each
    word to a list of parts of speech that word can have.  Unlike in
    L{EarleyChartParse}, this lexicon is case-insensitive.
    """
    def __init__(self, grammar, lexicon, trace=0):
        # Build a case-insensitive lexicon.
        #ci_lexicon = dict((k.upper(), v) for k, v in lexicon.iteritems())
        # Call the super constructor.
        EarleyChartParse.__init__(self, grammar, lexicon, trace)

    def get_parse_list(self, tokens):
        chart = Chart(tokens)
        grammar = self._grammar

        # Width, for printing trace edges.
        #w = 40/(chart.num_leaves()+1)
        w = 2
        if self._trace > 0: print ' '*9, chart.pp_leaves(w)

        # Initialize the chart with a special "starter" edge.
        root = GrammarCategory(pos='[INIT]')
        edge = FeatureTreeEdge((0,0), root, (grammar.start(),), 0,
                               {})
        chart.insert(edge, ())

        # Create the 3 rules:
        predictor = FeatureTopDownExpandRule(self._trace)
        completer = SingleEdgeFeatureFundamentalRule(self._trace)
        #scanner = FeatureScannerRule(self._lexicon)

        for end in range(chart.num_leaves()+1):
            if self._trace > 1: print 'Processing queue %d' % end

            # Scanner rule substitute, i.e. this is being used in place
            # of a proper FeatureScannerRule at the moment.
            if end > 0 and end-1 < chart.num_leaves():
                leaf = chart.leaf(end-1)
                for pos in self._lexicon(leaf):
                    new_leaf_edge = LeafEdge(leaf, end-1)
                    chart.insert(new_leaf_edge, ())
                    new_pos_edge = FeatureTreeEdge((end-1, end), pos, [leaf], 1,
                                                   {})
                    chart.insert(new_pos_edge, (new_leaf_edge,))
                    if self._trace > 0:
                        print 'Scanner ', chart.pp_edge(new_pos_edge,w)

            for edge in chart.select(end=end):
                if edge.is_incomplete():
                    for e in predictor.apply(chart, grammar, edge):
                        if self._trace > 1:
                            print 'Predictor', chart.pp_edge(e,w)
                #if edge.is_incomplete():
                #    for e in scanner.apply(chart, grammar, edge):
                #        if self._trace > 0:
                #            print 'Scanner ', chart.pp_edge(e,w)
                if edge.is_complete():
                    for e in completer.apply(chart, grammar, edge):
                        if self._trace > 0:
                            print 'Completer', chart.pp_edge(e,w)

        # Output a list of complete parses.
        return chart.parses(root)

def demo():
    import sys, time

    S = GrammarCategory.parse('S')
    VP = GrammarCategory.parse('VP')
    NP = GrammarCategory.parse('NP')
    PP = GrammarCategory.parse('PP')
    V = GrammarCategory.parse('V')
    N = GrammarCategory.parse('N')
    P = GrammarCategory.parse('P')
    Name = GrammarCategory.parse('Name')
    Det = GrammarCategory.parse('Det')
    DetSg = GrammarCategory.parse('Det[-pl]')
    DetPl = GrammarCategory.parse('Det[+pl]')
    NSg = GrammarCategory.parse('N[-pl]')
    NPl = GrammarCategory.parse('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)), cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)), cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V,)), cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg))]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John',)), cfg.Production(NP, ('I',)),
        cfg.Production(Det, ('the',)), cfg.Production(Det, ('my',)),
        cfg.Production(Det, ('a',)),
        cfg.Production(NSg, ('dog',)), cfg.Production(NSg, ('cookie',)),
        cfg.Production(V, ('ate',)), cfg.Production(V, ('saw',)),
        cfg.Production(P, ('with',)), cfg.Production(P, ('under',)),
        ]

    earley_grammar = cfg.Grammar(S, grammatical_productions)
    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())
    def lexicon(word):
        return earley_lexicon.get(word.upper(), [])

    sent = 'I saw John with a dog with my cookie'
    print "Sentence:\n", sent
    from nltk_lite import tokenize
    tokens = list(tokenize.whitespace(sent))
    t = time.time()
    cp = FeatureEarleyChartParse(earley_grammar, lexicon, trace=1)
    trees = cp.get_parse_list(tokens)
    print "Time: %s" % (time.time() - t)
    for tree in trees: print tree

def run_profile():
    import profile
    profile.run('for i in range(1): demo()', '/tmp/profile.out')
    import pstats
    p = pstats.Stats('/tmp/profile.out')
    p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
    p.strip_dirs().sort_stats('cum', 'time').print_stats(60)

if __name__ == '__main__':
    demo()