Package nltk_lite :: Package parse :: Module sr
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.parse.sr

  1  # Natural Language Toolkit: Shift-Reduce Parser 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  #         Steven Bird <sb@csse.unimelb.edu.au> 
  6  # URL: <http://nltk.sf.net> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  from types import * 
 10  from nltk_lite import tokenize 
 11  from nltk_lite.parse import * 
 12  import string 
 13   
 14  ##////////////////////////////////////////////////////// 
 15  ##  Shift/Reduce Parser 
 16  ##////////////////////////////////////////////////////// 
17 -class ShiftReduce(AbstractParse):
18 """ 19 A simple bottom-up CFG parser that uses two operations, "shift" 20 and "reduce", to find a single parse for a text. 21 22 C{ShiftReduce} maintains a stack, which records the 23 structure of a portion of the text. This stack is a list of 24 C{String}s and C{Tree}s that collectively cover a portion of 25 the text. For example, while parsing the sentence "the dog saw 26 the man" with a typical grammar, C{ShiftReduce} will produce 27 the following stack, which covers "the dog saw":: 28 29 [(NP: (Det: 'the') (N: 'dog')), (V: 'saw')] 30 31 C{ShiftReduce} attempts to extend the stack to cover the 32 entire text, and to combine the stack elements into a single tree, 33 producing a complete parse for the sentence. 34 35 Initially, the stack is empty. It is extended to cover the text, 36 from left to right, by repeatedly applying two operations: 37 38 - X{shift} moves a token from the beginning of the text to the 39 end of the stack. 40 - X{reduce} uses a CFG production to combine the rightmost stack 41 elements into a single C{Tree}. 42 43 Often, more than one operation can be performed on a given stack. 44 In this case, C{ShiftReduce} uses the following heuristics 45 to decide which operation to perform: 46 47 - Only shift if no reductions are available. 48 - If multiple reductions are available, then apply the reduction 49 whose CFG production is listed earliest in the grammar. 50 51 Note that these heuristics are not guaranteed to choose an 52 operation that leads to a parse of the text. Also, if multiple 53 parses exists, C{ShiftReduce} will return at most one of 54 them. 55 56 @see: C{nltk.cfg} 57 """
58 - def __init__(self, grammar, trace=0):
59 """ 60 Create a new C{ShiftReduce}, that uses C{grammar} to 61 parse texts. 62 63 @type grammar: C{Grammar} 64 @param grammar: The grammar used to parse texts. 65 @type trace: C{int} 66 @param trace: The level of tracing that should be used when 67 parsing a text. C{0} will generate no tracing output; 68 and higher numbers will produce more verbose tracing 69 output. 70 """ 71 self._grammar = grammar 72 self._trace = trace 73 AbstractParse.__init__(self) 74 self._check_grammar()
75
76 - def get_parse(self, tokens):
77 78 # initialize the stack. 79 stack = [] 80 tokens = list(tokens) 81 remaining_text = tokens 82 83 # Trace output. 84 if self._trace: 85 print 'Parsing %r' % string.join(tokens) 86 self._trace_stack(stack, remaining_text) 87 88 # iterate through the text, pushing the token onto 89 # the stack, then reducing the stack. 90 while len(remaining_text) > 0: 91 self._shift(stack, remaining_text) 92 while self._reduce(stack, remaining_text): pass 93 94 # Did we reduce everything? 95 if len(stack) != 1: return None 96 97 # Did we end up with the right category? 98 if stack[0].node != self._grammar.start().symbol(): 99 return None 100 101 # We parsed successfully! 102 return stack[0]
103
104 - def _shift(self, stack, remaining_text):
105 """ 106 Move a token from the beginning of C{remaining_text} to the 107 end of C{stack}. 108 109 @type stack: C{list} of C{String} and C{Tree} 110 @param stack: A list of C{String}s and C{Tree}s, encoding 111 the structure of the text that has been parsed so far. 112 @type remaining_text: C{list} of C{String} 113 @param remaining_text: The portion of the text that is not yet 114 covered by C{stack}. 115 @rtype: C{None} 116 """ 117 stack.append(remaining_text[0]) 118 remaining_text.remove(remaining_text[0]) 119 if self._trace: self._trace_shift(stack, remaining_text)
120
121 - def _match_rhs(self, rhs, rightmost_stack):
122 """ 123 @rtype: C{boolean} 124 @return: true if the right hand side of a CFG production 125 matches the rightmost elements of the stack. C{rhs} 126 matches C{rightmost_stack} if they are the same length, 127 and each element of C{rhs} matches the corresponding 128 element of C{rightmost_stack}. A nonterminal element of 129 C{rhs} matches any C{Tree} whose node value is equal 130 to the nonterminal's symbol. A terminal element of C{rhs} 131 matches any C{String} whose type is equal to the terminal. 132 @type rhs: C{list} of (terminal and C{Nonterminal}) 133 @param rhs: The right hand side of a CFG production. 134 @type rightmost_stack: C{list} of (C{String} and C{Tree}) 135 @param rightmost_stack: The rightmost elements of the parser's 136 stack. 137 """ 138 139 if len(rightmost_stack) != len(rhs): return 0 140 for i in range(len(rightmost_stack)): 141 if isinstance(rightmost_stack[i], Tree): 142 if not isinstance(rhs[i], cfg.Nonterminal): return 0 143 if rightmost_stack[i].node != rhs[i].symbol(): return 0 144 else: 145 if isinstance(rhs[i], cfg.Nonterminal): return 0 146 if rightmost_stack[i] != rhs[i]: return 0 147 return 1
148
149 - def _reduce(self, stack, remaining_text, production=None):
150 """ 151 Find a CFG production whose right hand side matches the 152 rightmost stack elements; and combine those stack elements 153 into a single C{Tree}, with the node specified by the 154 production's left-hand side. If more than one CFG production 155 matches the stack, then use the production that is listed 156 earliest in the grammar. The new C{Tree} replaces the 157 elements in the stack. 158 159 @rtype: C{Production} or C{None} 160 @return: If a reduction is performed, then return the CFG 161 production that the reduction is based on; otherwise, 162 return false. 163 @type stack: C{list} of C{String} and C{Tree} 164 @param stack: A list of C{String}s and C{Tree}s, encoding 165 the structure of the text that has been parsed so far. 166 @type remaining_text: C{list} of C{String} 167 @param remaining_text: The portion of the text that is not yet 168 covered by C{stack}. 169 """ 170 if production is None: productions = self._grammar.productions() 171 else: productions = [production] 172 173 # Try each production, in order. 174 for production in productions: 175 rhslen = len(production.rhs()) 176 177 # check if the RHS of a production matches the top of the stack 178 if self._match_rhs(production.rhs(), stack[-rhslen:]): 179 180 # combine the tree to reflect the reduction 181 tree = Tree(production.lhs().symbol(), stack[-rhslen:]) 182 stack[-rhslen:] = [tree] 183 184 # We reduced something 185 if self._trace: 186 self._trace_reduce(stack, production, remaining_text) 187 return production 188 189 # We didn't reduce anything 190 return None
191
192 - def trace(self, trace=2):
193 """ 194 Set the level of tracing output that should be generated when 195 parsing a text. 196 197 @type trace: C{int} 198 @param trace: The trace level. A trace level of C{0} will 199 generate no tracing output; and higher trace levels will 200 produce more verbose tracing output. 201 @rtype: C{None} 202 """ 203 # 1: just show shifts. 204 # 2: show shifts & reduces 205 # 3: display which tokens & productions are shifed/reduced 206 self._trace = trace
207
208 - def _trace_stack(self, stack, remaining_text, marker=' '):
209 """ 210 Print trace output displaying the given stack and text. 211 212 @rtype: C{None} 213 @param marker: A character that is printed to the left of the 214 stack. This is used with trace level 2 to print 'S' 215 before shifted stacks and 'R' before reduced stacks. 216 """ 217 str = ' '+marker+' [ ' 218 for elt in stack: 219 if isinstance(elt, Tree): 220 str += `cfg.Nonterminal(elt.node)` + ' ' 221 else: 222 str += `elt` + ' ' 223 str += '* ' + string.join(remaining_text) + ']' 224 print str
225
226 - def _trace_shift(self, stack, remaining_text):
227 """ 228 Print trace output displaying that a token has been shifted. 229 230 @rtype: C{None} 231 """ 232 if self._trace > 2: print 'Shift %r:' % stack[-1] 233 if self._trace == 2: self._trace_stack(stack, remaining_text, 'S') 234 elif self._trace > 0: self._trace_stack(stack, remaining_text)
235
236 - def _trace_reduce(self, stack, production, remaining_text):
237 """ 238 Print trace output displaying that C{production} was used to 239 reduce C{stack}. 240 241 @rtype: C{None} 242 """ 243 if self._trace > 2: 244 rhs = string.join(production.rhs()) 245 print 'Reduce %r <- %s' % (production.lhs(), rhs) 246 if self._trace == 2: self._trace_stack(stack, remaining_text, 'R') 247 elif self._trace > 1: self._trace_stack(stack, remaining_text)
248
249 - def _check_grammar(self):
250 """ 251 Check to make sure that all of the CFG productions are 252 potentially useful. If any productions can never be used, 253 then print a warning. 254 255 @rtype: C{None} 256 """ 257 productions = self._grammar.productions() 258 259 # Any production whose RHS is an extension of another production's RHS 260 # will never be used. 261 for i in range(len(productions)): 262 for j in range(i+1, len(productions)): 263 rhs1 = productions[i].rhs() 264 rhs2 = productions[j].rhs() 265 if rhs1[:len(rhs2)] == rhs2: 266 print 'Warning: %r will never be used' % productions[i]
267 268 ##////////////////////////////////////////////////////// 269 ## Stepping Shift/Reduce Parser 270 ##//////////////////////////////////////////////////////
271 -class SteppingShiftReduce(ShiftReduce):
272 """ 273 A C{ShiftReduce} that allows you to setp through the parsing 274 process, performing a single operation at a time. It also allows 275 you to change the parser's grammar midway through parsing a text. 276 277 The C{initialize} method is used to start parsing a text. 278 C{shift} performs a single shift operation, and C{reduce} performs 279 a single reduce operation. C{step} will perform a single reduce 280 operation if possible; otherwise, it will perform a single shift 281 operation. C{parses} returns the set of parses that have been 282 found by the parser. 283 284 @ivar _history: A list of C{(stack, remaining_text)} pairs, 285 containing all of the previous states of the parser. This 286 history is used to implement the C{undo} operation. 287 @see: C{nltk.cfg} 288 """
289 - def __init__(self, grammar, trace=0):
290 self._grammar = grammar 291 self._trace = trace 292 self._stack = None 293 self._remaining_text = None 294 self._history = [] 295 AbstractParse.__init__(self)
296
297 - def get_parse_list(self, tokens):
298 tokens = list(tokens) 299 self.initialize(tokens) 300 while self.step(): pass 301 302 return self.parses()
303
304 - def stack(self):
305 """ 306 @return: The parser's stack. 307 @rtype: C{list} of C{String} and C{Tree} 308 """ 309 return self._stack
310
311 - def remaining_text(self):
312 """ 313 @return: The portion of the text that is not yet covered by the 314 stack. 315 @rtype: C{list} of C{String} 316 """ 317 return self._remaining_text
318
319 - def initialize(self, tokens):
320 """ 321 Start parsing a given text. This sets the parser's stack to 322 C{[]} and sets its remaining text to C{tokens}. 323 """ 324 self._stack = [] 325 self._remaining_text = tokens 326 self._history = []
327
328 - def step(self):
329 """ 330 Perform a single parsing operation. If a reduction is 331 possible, then perform that reduction, and return the 332 production that it is based on. Otherwise, if a shift is 333 possible, then perform it, and return 1. Otherwise, 334 return 0. 335 336 @return: 0 if no operation was performed; 1 if a shift was 337 performed; and the CFG production used to reduce if a 338 reduction was performed. 339 @rtype: C{Production} or C{boolean} 340 """ 341 return self.reduce() or self.shift()
342
343 - def shift(self):
344 """ 345 Move a token from the beginning of the remaining text to the 346 end of the stack. If there are no more tokens in the 347 remaining text, then do nothing. 348 349 @return: True if the shift operation was successful. 350 @rtype: C{boolean} 351 """ 352 if len(self._remaining_text) == 0: return 0 353 self._history.append( (self._stack[:], self._remaining_text[:]) ) 354 self._shift(self._stack, self._remaining_text) 355 return 1
356
357 - def reduce(self, production=None):
358 """ 359 Use C{production} to combine the rightmost stack elements into 360 a single C{Tree}. If C{production} does not match the 361 rightmost stack elements, then do nothing. 362 363 @return: The production used to reduce the stack, if a 364 reduction was performed. If no reduction was performed, 365 return C{None}. 366 367 @rtype: C{Production} or C{None} 368 """ 369 self._history.append( (self._stack[:], self._remaining_text[:]) ) 370 return_val = self._reduce(self._stack, self._remaining_text, 371 production) 372 373 if not return_val: self._history.pop() 374 return return_val
375
376 - def undo(self):
377 """ 378 Return the parser to its state before the most recent 379 shift or reduce operation. Calling C{undo} repeatedly return 380 the parser to successively earlier states. If no shift or 381 reduce operations have been performed, C{undo} will make no 382 changes. 383 384 @return: true if an operation was successfully undone. 385 @rtype: C{boolean} 386 """ 387 if len(self._history) == 0: return 0 388 (self._stack, self._remaining_text) = self._history.pop() 389 return 1
390
391 - def reducible_productions(self):
392 """ 393 @return: A list of the productions for which reductions are 394 available for the current parser state. 395 @rtype: C{list} of C{Production} 396 """ 397 productions = [] 398 for production in self._grammar.productions(): 399 rhslen = len(production.rhs()) 400 if self._match_rhs(production.rhs(), self._stack[-rhslen:]): 401 productions.append(production) 402 return productions
403
404 - def parses(self):
405 """ 406 @return: A list of the parses that have been found by this 407 parser so far. 408 @rtype: C{list} of C{Tree} 409 """ 410 if len(self._remaining_text) != 0: return [] 411 if len(self._stack) != 1: return [] 412 if self._stack[0].node != self._grammar.start().symbol(): 413 return [] 414 return self._stack
415 416 # copied from nltk.parser 417
418 - def set_grammar(self, grammar):
419 """ 420 Change the grammar used to parse texts. 421 422 @param grammar: The new grammar. 423 @type grammar: C{CFG} 424 """ 425 self._grammar = grammar
426 427 ##////////////////////////////////////////////////////// 428 ## Demonstration Code 429 ##////////////////////////////////////////////////////// 430
431 -def demo():
432 """ 433 A demonstration of the shift-reduce parser. 434 """ 435 436 from nltk_lite import parse 437 438 grammar = parse.cfg.parse_cfg(""" 439 S -> NP 'saw' NP | NP VP 440 NP -> Det N | Det N PP 441 VP -> V NP PP 442 PP -> P NP 443 NP -> 'I' 444 N -> 'man' | 'park' | 'telescope' | 'dog' 445 Det -> 'the' | 'a' 446 P -> 'in' | 'with' 447 V -> 'saw' 448 """) 449 450 sent = tokenize.whitespace('I saw a man in the park') 451 452 parser = parse.ShiftReduce(grammar) 453 parser.trace() 454 for p in parser.get_parse_list(sent): 455 print p
456 457 if __name__ == '__main__': demo() 458