Package nltk_lite :: Package parse :: Module rd
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.parse.rd

  1  # Natural Language Toolkit: Recursive Descent Parser 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  #         Steven Bird <sb@csse.unimelb.edu.au> 
  6  # URL: <http://nltk.sf.net> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  from types import * 
 10  from nltk_lite import tokenize 
 11  from nltk_lite.parse import * 
 12  import string 
 13   
 14  ##////////////////////////////////////////////////////// 
 15  ##  Recursive Descent Parser 
 16  ##////////////////////////////////////////////////////// 
17 -class RecursiveDescent(AbstractParse):
18 """ 19 A simple top-down CFG parser that parses texts by recursively 20 expanding the fringe of a C{Tree}, and matching it against a 21 text. 22 23 C{RecursiveDescent} uses a list of tree locations called a 24 X{frontier} to remember which subtrees have not yet been expanded 25 and which leaves have not yet been matched against the text. Each 26 tree location consists of a list of child indices specifying the 27 path from the root of the tree to a subtree or a leaf; see the 28 reference documentation for C{Tree} for more information 29 about tree locations. 30 31 When the parser begins parsing a text, it constructs a tree 32 containing only the start symbol, and a frontier containing the 33 location of the tree's root node. It then extends the tree to 34 cover the text, using the following recursive procedure: 35 36 - If the frontier is empty, and the text is covered by the tree, 37 then return the tree as a possible parse. 38 - If the frontier is empty, and the text is not covered by the 39 tree, then return no parses. 40 - If the first element of the frontier is a subtree, then 41 use CFG productions to X{expand} it. For each applicable 42 production, add the expanded subtree's children to the 43 frontier, and recursively find all parses that can be 44 generated by the new tree and frontier. 45 - If the first element of the frontier is a token, then X{match} 46 it against the next token from the text. Remove the token 47 from the frontier, and recursively find all parses that can be 48 generated by the new tree and frontier. 49 50 @see: C{nltk.cfg} 51 """
52 - def __init__(self, grammar, trace=0):
53 """ 54 Create a new C{RecursiveDescent}, that uses C{grammar} 55 to parse texts. 56 57 @type grammar: C{Grammar} 58 @param grammar: The grammar used to parse texts. 59 @type trace: C{int} 60 @param trace: The level of tracing that should be used when 61 parsing a text. C{0} will generate no tracing output; 62 and higher numbers will produce more verbose tracing 63 output. 64 """ 65 self._grammar = grammar 66 self._trace = trace 67 AbstractParse.__init__(self)
68
69 - def get_parse_list(self, tokens):
70 # Inherit docs from ParseI 71 72 # Start a recursive descent parse, with an initial tree 73 # containing just the start symbol. 74 tokens = list(tokens) 75 start = self._grammar.start().symbol() 76 initial_tree = Tree(start, []) 77 frontier = [()] 78 if self._trace: 79 self._trace_start(initial_tree, frontier, tokens) 80 parses = self._parse(tokens, initial_tree, frontier) 81 82 # Return the parses. 83 return parses
84
85 - def _parse(self, remaining_text, tree, frontier):
86 """ 87 Recursively expand and match each elements of C{tree} 88 specified by C{frontier}, to cover C{remaining_text}. Return 89 a list of all parses found. 90 91 @return: A list of all parses that can be generated by 92 matching and expanding the elements of C{tree} 93 specified by C{frontier}. 94 @rtype: C{list} of C{Tree} 95 @type tree: C{Tree} 96 @param tree: A partial structure for the text that is 97 currently being parsed. The elements of C{tree} 98 that are specified by C{frontier} have not yet been 99 expanded or matched. 100 @type remaining_text: C{list} of C{String}s 101 @param remaining_text: The portion of the text that is not yet 102 covered by C{tree}. 103 @type frontier: C{list} of C{tuple} of C{int} 104 @param frontier: A list of the locations within C{tree} of 105 all subtrees that have not yet been expanded, and all 106 leaves that have not yet been matched. This list sorted 107 in left-to-right order of location within the tree. 108 """ 109 110 # If the tree covers the text, and there's nothing left to 111 # expand, then we've found a complete parse; return it. 112 if len(remaining_text) == 0 and len(frontier) == 0: 113 if self._trace: 114 self._trace_succeed(tree, frontier) 115 return [tree] 116 117 # If there's still text, but nothing left to expand, we failed. 118 elif len(frontier) == 0: 119 if self._trace: 120 self._trace_backtrack(tree, frontier) 121 return [] 122 123 # If the next element on the frontier is a tree, expand it. 124 elif isinstance(tree[frontier[0]], Tree): 125 return self._expand(remaining_text, tree, frontier) 126 127 # If the next element on the frontier is a token, match it. 128 else: 129 return self._match(remaining_text, tree, frontier)
130
131 - def _match(self, rtext, tree, frontier):
132 """ 133 @rtype: C{list} of C{Tree} 134 @return: a list of all parses that can be generated by 135 matching the first element of C{frontier} against the 136 first token in C{rtext}. In particular, if the first 137 element of C{frontier} has the same type as the first 138 token in C{rtext}, then substitute the token into 139 C{tree}; and return all parses that can be generated by 140 matching and expanding the remaining elements of 141 C{frontier}. If the first element of C{frontier} does not 142 have the same type as the first token in C{rtext}, then 143 return empty list. 144 145 @type tree: C{Tree} 146 @param tree: A partial structure for the text that is 147 currently being parsed. The elements of C{tree} 148 that are specified by C{frontier} have not yet been 149 expanded or matched. 150 @type rtext: C{list} of C{String}s 151 @param rtext: The portion of the text that is not yet 152 covered by C{tree}. 153 @type frontier: C{list} of C{tuple} of C{int} 154 @param frontier: A list of the locations within C{tree} of 155 all subtrees that have not yet been expanded, and all 156 leaves that have not yet been matched. 157 """ 158 159 tree_leaf = tree[frontier[0]] 160 if (len(rtext) > 0 and tree_leaf == rtext[0]): 161 # If it's a terminal that matches rtext[0], then substitute 162 # in the token, and continue parsing. 163 newtree = tree.copy(deep=True) 164 newtree[frontier[0]] = rtext[0] 165 if self._trace: 166 self._trace_match(newtree, frontier[1:], rtext[0]) 167 return self._parse(rtext[1:], newtree, frontier[1:]) 168 else: 169 # If it's a non-matching terminal, fail. 170 if self._trace: 171 self._trace_backtrack(tree, frontier, rtext[:1]) 172 return []
173
174 - def _expand(self, remaining_text, tree, frontier, production=None):
175 """ 176 @rtype: C{list} of C{Tree} 177 @return: A list of all parses that can be generated by 178 expanding the first element of C{frontier} with 179 C{production}. In particular, if the first element of 180 C{frontier} is a subtree whose node type is equal to 181 C{production}'s left hand side, then add a child to that 182 subtree for each element of C{production}'s right hand 183 side; and return all parses that can be generated by 184 matching and expanding the remaining elements of 185 C{frontier}. If the first element of C{frontier} is not a 186 subtree whose node type is equal to C{production}'s left 187 hand side, then return an empty list. If C{production} is 188 not specified, then return a list of all parses that can 189 be generated by expanding the first element of C{frontier} 190 with I{any} CFG production. 191 192 @type tree: C{Tree} 193 @param tree: A partial structure for the text that is 194 currently being parsed. The elements of C{tree} 195 that are specified by C{frontier} have not yet been 196 expanded or matched. 197 @type remaining_text: C{list} of C{String}s 198 @param remaining_text: The portion of the text that is not yet 199 covered by C{tree}. 200 @type frontier: C{list} of C{tuple} of C{int} 201 @param frontier: A list of the locations within C{tree} of 202 all subtrees that have not yet been expanded, and all 203 leaves that have not yet been matched. 204 """ 205 206 if production is None: productions = self._grammar.productions() 207 else: productions = [production] 208 209 parses = [] 210 for production in productions: 211 lhs = production.lhs().symbol() 212 if lhs == tree[frontier[0]].node: 213 subtree = self._production_to_tree(production) 214 if frontier[0] == (): 215 newtree = subtree 216 else: 217 newtree = tree.copy(deep=True) 218 newtree[frontier[0]] = subtree 219 new_frontier = [frontier[0]+(i,) for i in 220 range(len(production.rhs()))] 221 if self._trace: 222 self._trace_expand(newtree, new_frontier, production) 223 parses += self._parse(remaining_text, newtree, 224 new_frontier + frontier[1:]) 225 return parses
226
227 - def _production_to_tree(self, production):
228 """ 229 @rtype: C{Tree} 230 @return: The C{Tree} that is licensed by C{production}. 231 In particular, given the production:: 232 233 C{[M{lhs} -> M{elt[1]} ... M{elt[n]}]} 234 235 Return a tree token that has a node C{M{lhs}.symbol}, and 236 C{M{n}} children. For each nonterminal element 237 C{M{elt[i]}} in the production, the tree token has a 238 childless subtree with node value C{M{elt[i]}.symbol}; and 239 for each terminal element C{M{elt[j]}}, the tree token has 240 a leaf token with type C{M{elt[j]}}. 241 242 @param production: The CFG production that licenses the tree 243 token that should be returned. 244 @type production: C{Production} 245 """ 246 children = [] 247 for elt in production.rhs(): 248 if isinstance(elt, cfg.Nonterminal): 249 children.append(Tree(elt.symbol(), [])) 250 else: 251 # This will be matched. 252 children.append(elt) 253 return Tree(production.lhs().symbol(), children)
254
255 - def trace(self, trace=2):
256 """ 257 Set the level of tracing output that should be generated when 258 parsing a text. 259 260 @type trace: C{int} 261 @param trace: The trace level. A trace level of C{0} will 262 generate no tracing output; and higher trace levels will 263 produce more verbose tracing output. 264 @rtype: C{None} 265 """ 266 self._trace = trace
267
268 - def _trace_fringe(self, tree, treeloc=None):
269 """ 270 Print trace output displaying the fringe of C{tree}. The 271 fringe of C{tree} consists of all of its leaves and all of 272 its childless subtrees. 273 274 @rtype: C{None} 275 """ 276 277 if treeloc == (): print "*", 278 if isinstance(tree, Tree): 279 if len(tree) == 0: print `cfg.Nonterminal(tree.node)`, 280 for i in range(len(tree)): 281 if treeloc is not None and i == treeloc[0]: 282 self._trace_fringe(tree[i], treeloc[1:]) 283 else: 284 self._trace_fringe(tree[i]) 285 else: 286 print `tree`,
287
288 - def _trace_tree(self, tree, frontier, operation):
289 """ 290 Print trace output displaying the parser's current state. 291 292 @param operation: A character identifying the operation that 293 generated the current state. 294 @rtype: C{None} 295 """ 296 if self._trace == 2: print ' %c [' % operation, 297 else: print ' [', 298 if len(frontier) > 0: self._trace_fringe(tree, frontier[0]) 299 else: self._trace_fringe(tree) 300 print ']'
301
302 - def _trace_start(self, tree, frontier, text):
303 print 'Parsing %r' % string.join(text) 304 if self._trace > 2: print 'Start:' 305 if self._trace > 1: self._trace_tree(tree, frontier, ' ')
306
307 - def _trace_expand(self, tree, frontier, production):
308 if self._trace > 2: print 'Expand: %s' % production 309 if self._trace > 1: self._trace_tree(tree, frontier, 'E')
310
311 - def _trace_match(self, tree, frontier, tok):
312 if self._trace > 2: print 'Match: %r' % tok 313 if self._trace > 1: self._trace_tree(tree, frontier, 'M')
314
315 - def _trace_succeed(self, tree, frontier):
316 if self._trace > 2: print 'GOOD PARSE:' 317 if self._trace == 1: print 'Found a parse:\n%s' % tree 318 if self._trace > 1: self._trace_tree(tree, frontier, '+')
319
320 - def _trace_backtrack(self, tree, frontier, toks=None):
321 if self._trace > 2: 322 if toks: print 'Backtrack: %r match failed' % toks[0] 323 else: print 'Backtrack'
324 325 ##////////////////////////////////////////////////////// 326 ## Stepping Recursive Descent Parser 327 ##//////////////////////////////////////////////////////
328 -class SteppingRecursiveDescent(RecursiveDescent):
329 """ 330 A C{RecursiveDescent} that allows you to step through the 331 parsing process, performing a single operation at a time. 332 333 The C{initialize} method is used to start parsing a text. 334 C{expand} expands the first element on the frontier using a single 335 CFG production, and C{match} matches the first element on the 336 frontier against the next text token. C{backtrack} undoes the most 337 recent expand or match operation. C{step} performs a single 338 expand, match, or backtrack operation. C{parses} returns the set 339 of parses that have been found by the parser. 340 341 @ivar _history: A list of C{(rtext, tree, frontier)} tripples, 342 containing the previous states of the parser. This history is 343 used to implement the C{backtrack} operation. 344 @ivar _tried_e: A record of all productions that have been tried 345 for a given tree. This record is used by C{expand} to perform 346 the next untried production. 347 @ivar _tried_m: A record of what tokens have been matched for a 348 given tree. This record is used by C{step} to decide whether 349 or not to match a token. 350 @see: C{nltk.cfg} 351 """
352 - def __init__(self, grammar, trace=0):
353 self._grammar = grammar 354 self._trace = trace 355 self._rtext = None 356 self._tree = None 357 self._frontier = [()] 358 self._tried_e = {} 359 self._tried_m = {} 360 self._history = [] 361 self._parses = [] 362 AbstractParse.__init__(self)
363 364 # [XX] TEMPORARY HACK WARNING! This should be replaced with 365 # something nicer when we get the chance.
366 - def _freeze(self, tree):
367 c = tree.copy() 368 # for pos in c.treepositions('leaves'): 369 # c[pos] = c[pos].freeze() 370 return ImmutableTree.convert(c)
371
372 - def get_parse_list(self, tokens):
373 tokens = list(tokens) 374 self.initialize(tokens) 375 while self.step() is not None: pass 376 377 return self.parses()
378
379 - def initialize(self, tokens):
380 """ 381 Start parsing a given text. This sets the parser's tree to 382 the start symbol, its frontier to the root node, and its 383 remaining text to C{token['SUBTOKENS']}. 384 """ 385 386 self._rtext = tokens 387 start = self._grammar.start().symbol() 388 self._tree = Tree(start, []) 389 self._frontier = [()] 390 self._tried_e = {} 391 self._tried_m = {} 392 self._history = [] 393 self._parses = [] 394 if self._trace: 395 self._trace_start(self._tree, self._frontier, self._rtext)
396
397 - def remaining_text(self):
398 """ 399 @return: The portion of the text that is not yet covered by the 400 tree. 401 @rtype: C{list} of C{String} 402 """ 403 return self._rtext
404
405 - def frontier(self):
406 """ 407 @return: A list of the tree locations of all subtrees that 408 have not yet been expanded, and all leaves that have not 409 yet been matched. 410 @rtype: C{list} of C{tuple} of C{int} 411 """ 412 return self._frontier
413
414 - def tree(self):
415 """ 416 @return: A partial structure for the text that is 417 currently being parsed. The elements specified by the 418 frontier have not yet been expanded or matched. 419 @rtype: C{Tree} 420 """ 421 return self._tree
422
423 - def step(self):
424 """ 425 Perform a single parsing operation. If an untried match is 426 possible, then perform the match, and return the matched 427 token. If an untried expansion is possible, then perform the 428 expansion, and return the production that it is based on. If 429 backtracking is possible, then backtrack, and return 1. 430 Otherwise, return 0. 431 432 @return: 0 if no operation was performed; a token if a match 433 was performed; a production if an expansion was performed; 434 and 1 if a backtrack operation was performed. 435 @rtype: C{Production} or C{String} or C{boolean} 436 """ 437 # Try matching (if we haven't already) 438 if self.untried_match(): 439 token = self.match() 440 if token is not None: return token 441 442 # Try expanding. 443 production = self.expand() 444 if production is not None: return production 445 446 # Try backtracking 447 if self.backtrack(): 448 self._trace_backtrack(self._tree, self._frontier) 449 return 1 450 451 # Nothing left to do. 452 return None
453
454 - def expand(self, production=None):
455 """ 456 Expand the first element of the frontier. In particular, if 457 the first element of the frontier is a subtree whose node type 458 is equal to C{production}'s left hand side, then add a child 459 to that subtree for each element of C{production}'s right hand 460 side. If C{production} is not specified, then use the first 461 untried expandable production. If all expandable productions 462 have been tried, do nothing. 463 464 @return: The production used to expand the frontier, if an 465 expansion was performed. If no expansion was performed, 466 return C{None}. 467 @rtype: C{Production} or C{None} 468 """ 469 470 # Make sure we *can* expand. 471 if len(self._frontier) == 0: 472 return None 473 if not isinstance(self._tree[self._frontier[0]], Tree): 474 return None 475 476 # If they didn't specify a production, check all untried ones. 477 if production is None: 478 productions = self.untried_expandable_productions() 479 else: productions = [production] 480 481 parses = [] 482 for prod in productions: 483 # Record that we've tried this production now. 484 self._tried_e.setdefault(self._freeze(self._tree), []).append(prod) 485 486 # Try expanding. 487 if self._expand(self._rtext, self._tree, self._frontier, prod): 488 return prod 489 490 # We didn't expand anything. 491 return None
492
493 - def match(self):
494 """ 495 Match the first element of the frontier. In particular, if 496 the first element of the frontier has the same type as the 497 next text token, then substitute the text token into the tree. 498 499 @return: The token matched, if a match operation was 500 performed. If no match was performed, return C{None} 501 @rtype: C{String} or C{None} 502 """ 503 504 # Record that we've tried matching this token. 505 tok = self._rtext[0] 506 self._tried_m.setdefault(self._freeze(self._tree), []).append(tok) 507 508 # Make sure we *can* match. 509 if len(self._frontier) == 0: 510 return None 511 if isinstance(self._tree[self._frontier[0]], Tree): 512 return None 513 514 if self._match(self._rtext, self._tree, self._frontier): 515 # Return the token we just matched. 516 return self._history[-1][0][0] 517 else: 518 return None
519
520 - def backtrack(self):
521 """ 522 Return the parser to its state before the most recent 523 match or expand operation. Calling C{undo} repeatedly return 524 the parser to successively earlier states. If no match or 525 expand operations have been performed, C{undo} will make no 526 changes. 527 528 @return: true if an operation was successfully undone. 529 @rtype: C{boolean} 530 """ 531 if len(self._history) == 0: return 0 532 (self._rtext, self._tree, self._frontier) = self._history.pop() 533 return 1
534
535 - def expandable_productions(self):
536 """ 537 @return: A list of all the productions for which expansions 538 are available for the current parser state. 539 @rtype: C{list} of C{Production} 540 """ 541 # Make sure we *can* expand. 542 if len(self._frontier) == 0: return [] 543 frontier_child = self._tree[self._frontier[0]] 544 if (len(self._frontier) == 0 or 545 not isinstance(frontier_child, Tree)): 546 return [] 547 548 return [p for p in self._grammar.productions() 549 if p.lhs().symbol() == frontier_child.node]
550
552 """ 553 @return: A list of all the untried productions for which 554 expansions are available for the current parser state. 555 @rtype: C{list} of C{Production} 556 """ 557 558 tried_expansions = self._tried_e.get(self._freeze(self._tree), []) 559 return [p for p in self.expandable_productions() 560 if p not in tried_expansions]
561
562 - def untried_match(self):
563 """ 564 @return: Whether the first element of the frontier is a token 565 that has not yet been matched. 566 @rtype: C{boolean} 567 """ 568 569 if len(self._rtext) == 0: return 0 570 tried_matches = self._tried_m.get(self._freeze(self._tree), []) 571 return (self._rtext[0] not in tried_matches)
572
573 - def currently_complete(self):
574 """ 575 @return: Whether the parser's current state represents a 576 complete parse. 577 @rtype: C{boolean} 578 """ 579 return (len(self._frontier) == 0 and len(self._rtext) == 0)
580
581 - def _parse(self, remaining_text, tree, frontier):
582 """ 583 A stub version of C{_parse} that sets the parsers current 584 state to the given arguments. In C{RecursiveDescent}, 585 the C{_parse} method is used to recursively continue parsing a 586 text. C{SteppingRecursiveDescent} overrides it to 587 capture these recursive calls. It records the parser's old 588 state in the history (to allow for backtracking), and updates 589 the parser's new state using the given arguments. Finally, it 590 returns C{[1]}, which is used by C{match} and C{expand} to 591 detect whether their operations were successful. 592 593 @return: C{[1]} 594 @rtype: C{list} of C{int} 595 """ 596 self._history.append( (self._rtext, self._tree, self._frontier) ) 597 self._rtext = remaining_text 598 self._tree = tree 599 self._frontier = frontier 600 601 # Is it a good parse? If so, record it. 602 if (len(frontier) == 0 and len(remaining_text) == 0): 603 self._parses.append(tree) 604 self._trace_succeed(self._tree, self._frontier) 605 606 return [1]
607
608 - def parses(self):
609 """ 610 @return: A list of the parses that have been found by this 611 parser so far. 612 @rtype: C{list} of C{Tree} 613 """ 614 return self._parses
615 616 617 # copied from nltk.parser 618
619 - def set_grammar(self, grammar):
620 """ 621 Change the grammar used to parse texts. 622 623 @param grammar: The new grammar. 624 @type grammar: C{CFG} 625 """ 626 self._grammar = grammar
627 628 ##////////////////////////////////////////////////////// 629 ## Demonstration Code 630 ##////////////////////////////////////////////////////// 631
632 -def demo():
633 """ 634 A demonstration of the recursive descent parser. 635 """ 636 637 from nltk_lite import parse 638 639 grammar = parse.cfg.parse_cfg(""" 640 S -> NP 'saw' NP | NP VP 641 NP -> Det N | Det N PP 642 VP -> V NP PP 643 PP -> P NP 644 NP -> 'I' 645 N -> 'man' | 'park' | 'telescope' | 'dog' 646 Det -> 'the' | 'a' 647 P -> 'in' | 'with' 648 V -> 'saw' 649 """) 650 651 for prod in grammar.productions(): 652 print prod 653 654 sent = tokenize.whitespace('I saw a man in the park') 655 parser = parse.RecursiveDescent(grammar) 656 parser.trace() 657 for p in parser.get_parse_list(sent): 658 print p
659 660 if __name__ == '__main__': demo() 661