Package nltk_lite :: Package contrib :: Package mit :: Package six863 :: Package parse :: Module category
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.contrib.mit.six863.parse.category

  1  # Natural Language Toolkit: Categories 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Contributed by Rob Speer (NLTK version) 
  5  #         Steven Bird <sb@csse.unimelb.edu.au> (NLTK-Lite Port) 
  6  #         Ewan Klein <ewan@inf.ed.ac.uk> (Hooks for semantics) 
  7  #         Peter Wang <wangp@csse.unimelb.edu.au> (Overhaul) 
  8  # URL: <http://nltk.sourceforge.net> 
  9  # For license information, see LICENSE.TXT 
 10  # 
 11  # $Id: category.py 4162 2007-03-01 00:46:05Z stevenbird $ 
 12   
 13  from nltk_lite.semantics import logic 
 14  from cfg import * 
 15  from kimmo import kimmo 
 16   
 17  from featurelite import * 
 18  from copy import deepcopy 
 19  import yaml 
 20  # import nltk_lite.yamltags 
 21   
22 -def makevar(varname):
23 """ 24 Given a variable representation such as C{?x}, construct a corresponding 25 Variable object. 26 """ 27 return Variable(varname[1:])
28
29 -class Category(Nonterminal, FeatureI):
30 """ 31 A C{Category} is a wrapper for feature dictionaries, intended for use in 32 parsing. It can act as a C{Nonterminal}. 33 34 A C{Category} acts like a dictionary, except in the following ways: 35 36 - Categories can be "frozen" so that they can act as hash keys; 37 before they are frozen, they are mutable. 38 39 - In addition to being hashable, Categories have consistent str() 40 representations. 41 42 - Categories have one feature marked as the 'head', which prints 43 differently than other features if it has a value. For example, 44 in the C{repr()} representation of a Category, the head goes to the 45 left, on the outside of the brackets. Subclasses of C{Category} 46 may change the feature name that is designated as the head, which is 47 _head by default. 48 49 Categories can contain any kind of object as their values, and can be 50 recursive and even re-entrant. Categories are not necessarily "categories 51 all the way down"; they can contain plain dictionaries as their values, and 52 converting inner dictionaries to categories would probably lead to messier 53 code for no gain. 54 55 Because Categories can contain any kind of object, they do not try to 56 keep control over what their inner objects do. If you freeze a Category 57 but mutate its inner objects, undefined behavior will occur. 58 """ 59 60 headname = 'head' 61
62 - def __init__(self, features=None, **morefeatures):
63 if features is None: features = {} 64 self._features = unify(features, morefeatures) 65 self._hash = None 66 self._frozen = False 67 self._memostr = None
68
69 - def __cmp__(self, other):
70 return cmp(repr(self), repr(other))
71
72 - def __div__(self, other):
73 """ 74 @return: A new Category based on this one, with its C{/} feature set to 75 C{other}. 76 """ 77 return unify(self, {'/': other})
78
79 - def __eq__(self, other):
80 """ 81 Compare Categories for equality. This relies on Python's built-in 82 __eq__ for dictionaries, which is fairly thorough in checking for 83 recursion and reentrance. 84 85 @return: True if C{self} and C{other} assign the same value to 86 every feature. In particular, return true if 87 C{self[M{p}]==other[M{p}]} for every feature path M{p} such 88 that C{self[M{p}]} or C{other[M{p}]} is a base value (i.e., 89 not a nested Category). 90 @rtype: C{bool} 91 """ 92 if not other.__class__ == self.__class__: return False 93 return self._features == other._features
94
95 - def __ne__(self, other):
96 return not (self == other)
97
98 - def __hash__(self):
99 if self._hash is not None: return self._hash 100 return hash(str(self))
101
102 - def freeze(self):
103 """ 104 Freezing a Category memoizes its hash value, to make comparisons on it 105 faster. After freezing, the Category and all its values are immutable. 106 107 @return: self 108 """ 109 self._memostr = str(self) 110 self._hash = hash(self) 111 self._frozen = True 112 return self
113
114 - def frozen(self):
115 """ 116 Returns whether this Category is frozen (immutable). 117 118 @rtype: C{bool} 119 """ 120 return self._frozen
121
122 - def get(self, key):
123 return self._features.get(key)
124
125 - def __getitem__(self, key):
126 return self._features.get(key)
127
128 - def __setitem__(self, key, value):
129 if self._frozen: raise "Cannot modify a frozen Category" 130 self._features[key] = value
131
132 - def items(self):
133 return self._features.items()
134
135 - def keys(self):
136 return self._features.keys()
137
138 - def values(self):
139 return self._features.values()
140
141 - def has_key(self, key):
142 return self._features.has_key(key)
143
144 - def symbol(self):
145 """ 146 @return: The node value corresponding to this C{Category}. 147 @rtype: C{Category} 148 """ 149 return self
150
151 - def head(self):
152 """ 153 @return: The head of this category (the value shown outside the 154 brackets in its string representation). If there is no head, returns 155 None. 156 @rtype: C{str} or C{None} 157 """ 158 return self.get(self.__class__.headname)
159
160 - def copy(self):
161 """ 162 @return: A deep copy of C{self}. 163 """ 164 # Create a reentrant deep copy by round-tripping it through YAML. 165 return deepcopy(self)
166
167 - def feature_names(self):
168 """ 169 @return: a list of all features that have values. 170 """ 171 return self._features.keys()
172 173 has_feature = has_key 174 175 ################################################################# 176 ## Variables 177 ################################################################# 178
179 - def remove_unbound_vars(self):
180 selfcopy = self.copy() 181 Category._remove_unbound_vars(self) 182 return selfcopy
183 184 @staticmethod
185 - def _remove_unbound_vars(obj):
186 for (key, value) in obj.items(): 187 if isinstance(value, Variable): 188 del obj[key] 189 elif isinstance(value, (Category, dict)): 190 Category._remove_unbound_vars(value)
191 192 ################################################################# 193 ## String Representations 194 ################################################################# 195
196 - def __repr__(self):
197 """ 198 @return: A string representation of this feature structure. 199 """ 200 return str(self)
201
202 - def __str__(self):
203 """ 204 @return: A string representation of this feature structure. 205 """ 206 if self._memostr is not None: return self._memostr 207 return self.__class__._str(self, {}, {})
208 209 @classmethod
210 - def _str(cls, obj, reentrances, reentrance_ids):
211 segments = [] 212 213 keys = obj.keys() 214 keys.sort() 215 for fname in keys: 216 if fname == cls.headname: continue 217 fval = obj[fname] 218 if isinstance(fval, bool): 219 if fval: segments.append('+%s' % fname) 220 else: segments.append('-%s' % fname) 221 elif not isinstance(fval, dict): 222 segments.append('%s=%r' % (fname, fval)) 223 else: 224 fval_repr = cls._str(fval, reentrances, reentrance_ids) 225 segments.append('%s=%s' % (fname, fval_repr)) 226 227 head = obj.get(cls.headname) 228 if head is None: head = '' 229 if head and not len(segments): return str(head) 230 return '%s[%s]' % (head, ', '.join(segments))
231 232 yaml_tag = '!parse.Category' 233 234 @classmethod
235 - def to_yaml(cls, dumper, data):
236 node = dumper.represent_mapping(cls.yaml_tag, data._features) 237 return node
238 239 @classmethod
240 - def from_yaml(cls, loader, node):
241 features = loader.construct_mapping(node, deep=True) 242 return cls(features)
243 244 ################################################################# 245 ## Parsing 246 ################################################################# 247 248 # Regular expressions for parsing. 249 _PARSE_RE = {'name': re.compile(r'\s*([^\s\(\)"\'=,\[\]/\?]+)\s*'), 250 'ident': re.compile(r'\s*\((\d+)\)\s*'), 251 'reentrance': re.compile(r'\s*->\s*'), 252 'assign': re.compile(r'\s*=?\s*'), 253 'bracket': re.compile(r'\s*]\s*'), 254 'comma': re.compile(r'\s*,\s*'), 255 'none': re.compile(r'None(?=\s|\]|,)'), 256 'int': re.compile(r'-?\d+(?=\s|\]|,)'), 257 'var': re.compile(r'\?[a-zA-Z_][a-zA-Z0-9_]*'+'|'+ 258 r'\?<[a-zA-Z_][a-zA-Z0-9_]*'+ 259 r'(=[a-zA-Z_][a-zA-Z0-9_]*)*>'), 260 'symbol': re.compile(r'\w+'), 261 'stringmarker': re.compile("['\"\\\\]"), 262 263 'categorystart':re.compile(r'\s*([^\s\(\)"\'\-=,\[\]/\?]*)\s*\['), 264 'bool': re.compile(r'\s*([-\+])'), 265 'arrow': re.compile(r'\s*->\s*'), 266 'disjunct': re.compile(r'\s*\|\s*'), 267 'whitespace': re.compile(r'\s*'), 268 'semantics': re.compile(r'<([^>]+)>'), 269 'application': re.compile(r'<(app)\((\?[a-z][a-z]*)\s*,\s*(\?[a-z][a-z]*)\)>'), 270 'slash': re.compile(r'\s*/\s*'), 271 } 272 273 @classmethod
274 - def parse(cls, s):
275 parsed, position = cls._parse(s, 0) 276 if position != len(s): 277 raise ValueError('end of string', position) 278 return cls(parsed)
279 280 @classmethod
281 - def inner_parse(cls, s, position, reentrances={}):
282 if reentrances is None: reentrances = {} 283 parsed, position = cls._parse(s, position) 284 return cls(parsed), position
285 286 @classmethod
287 - def _parse(cls, s, position=0, reentrances=None):
288 """ 289 Helper function that parses a Category. 290 @param s: The string to parse. 291 @param position: The position in the string to start parsing. 292 @param reentrances: A dictionary from reentrance ids to values. 293 @return: A tuple (val, pos) of the feature structure created 294 by parsing and the position where the parsed feature 295 structure ends. 296 """ 297 # A set of useful regular expressions (precompiled) 298 _PARSE_RE = cls._PARSE_RE 299 300 features = {} 301 302 # Find the head, if there is one. 303 match = _PARSE_RE['name'].match(s, position) 304 if match is not None: 305 features[cls.headname] = match.group(1) 306 position = match.end() 307 else: 308 match = _PARSE_RE['var'].match(s, position) 309 if match is not None: 310 features[cls.headname] = makevar(match.group(0)) 311 position = match.end() 312 313 314 # If the name is followed by an open bracket, start looking for 315 # features. 316 if position < len(s) and s[position] == '[': 317 position += 1 318 319 # Build a list of the features defined by the structure. 320 # Each feature has one of the three following forms: 321 # name = value 322 # +name 323 # -name 324 while True: 325 if not position < len(s): 326 raise ValueError('close bracket', position) 327 328 # Use these variables to hold info about the feature: 329 name = target = val = None 330 331 # Check for a close bracket at the beginning 332 match = _PARSE_RE['bracket'].match(s, position) 333 if match is not None: 334 position = match.end() 335 break 336 337 # Is this a shorthand boolean value? 338 match = _PARSE_RE['bool'].match(s, position) 339 if match is not None: 340 if match.group(1) == '+': val = True 341 else: val = False 342 position = match.end() 343 344 # Find the next feature's name. 345 match = _PARSE_RE['name'].match(s, position) 346 if match is None: raise ValueError('feature name', position) 347 name = match.group(1) 348 position = match.end() 349 350 # If it's not a shorthand boolean, it must be an assignment. 351 if val is None: 352 match = _PARSE_RE['assign'].match(s, position) 353 if match is None: raise ValueError('equals sign', position) 354 position = match.end() 355 356 val, position = cls._parseval(s, position, reentrances) 357 features[name] = val 358 359 # Check for a close bracket 360 match = _PARSE_RE['bracket'].match(s, position) 361 if match is not None: 362 position = match.end() 363 break 364 365 # Otherwise, there should be a comma 366 match = _PARSE_RE['comma'].match(s, position) 367 if match is None: raise ValueError('comma', position) 368 position = match.end() 369 370 return features, position
371 372 @classmethod
373 - def _parseval(cls, s, position, reentrances):
374 """ 375 Helper function that parses a feature value. Currently 376 supports: None, bools, integers, variables, strings, nested feature 377 structures. 378 @param s: The string to parse. 379 @param position: The position in the string to start parsing. 380 @param reentrances: A dictionary from reentrance ids to values. 381 @return: A tuple (val, pos) of the value created by parsing 382 and the position where the parsed value ends. 383 """ 384 385 # A set of useful regular expressions (precompiled) 386 _PARSE_RE = cls._PARSE_RE 387 388 # End of string (error) 389 if position == len(s): raise ValueError('value', position) 390 391 # Semantic value of the form <app(?x, ?y) >'; return an ApplicationExpression 392 match = _PARSE_RE['application'].match(s, position) 393 if match is not None: 394 fun = ParserSubstitute(match.group(2)).next() 395 arg = ParserSubstitute(match.group(3)).next() 396 return ApplicationExpressionSubst(fun, arg), match.end() 397 398 # other semantic value enclosed by '< >'; return value given by the lambda expr parser 399 match = _PARSE_RE['semantics'].match(s, position) 400 if match is not None: 401 return ParserSubstitute(match.group(1)).next(), match.end() 402 403 # String value 404 if s[position] in "'\"": 405 start = position 406 quotemark = s[position:position+1] 407 position += 1 408 while 1: 409 match = cls._PARSE_RE['stringmarker'].search(s, position) 410 if not match: raise ValueError('close quote', position) 411 position = match.end() 412 if match.group() == '\\': position += 1 413 elif match.group() == quotemark: 414 return s[start+1:position-1], position 415 416 # Nested category 417 if _PARSE_RE['categorystart'].match(s, position) is not None: 418 return cls._parse(s, position, reentrances) 419 420 # Variable 421 match = _PARSE_RE['var'].match(s, position) 422 if match is not None: 423 return makevar(match.group()), match.end() 424 425 # None 426 match = _PARSE_RE['none'].match(s, position) 427 if match is not None: 428 return None, match.end() 429 430 # Integer value 431 match = _PARSE_RE['int'].match(s, position) 432 if match is not None: 433 return int(match.group()), match.end() 434 435 # Alphanumeric symbol (must be checked after integer) 436 match = _PARSE_RE['symbol'].match(s, position) 437 if match is not None: 438 return match.group(), match.end() 439 440 # We don't know how to parse this value. 441 raise ValueError('value', position)
442 443 @classmethod
444 - def parse_rules(cls, s):
445 """ 446 Parse a L{CFG} line involving C{Categories}. A line has this form: 447 448 C{lhs -> rhs | rhs | ...} 449 450 where C{lhs} is a Category, and each C{rhs} is a sequence of 451 Categories. 452 453 @returns: a list of C{Productions}, one for each C{rhs}. 454 """ 455 _PARSE_RE = cls._PARSE_RE 456 position = 0 457 try: 458 lhs, position = cls.inner_parse(s, position) 459 lhs = cls(lhs) 460 except ValueError, e: 461 estr = ('Error parsing field structure\n\n\t' + 462 s + '\n\t' + ' '*e.args[1] + '^ ' + 463 'Expected %s\n' % e.args[0]) 464 raise ValueError, estr 465 lhs.freeze() 466 467 match = _PARSE_RE['arrow'].match(s, position) 468 if match is None: 469 raise ValueError('expected arrow', s, s[position:]) 470 else: position = match.end() 471 rules = [] 472 while position < len(s): 473 rhs = [] 474 while position < len(s) and _PARSE_RE['disjunct'].match(s, position) is None: 475 try: 476 val, position = cls.inner_parse(s, position, {}) 477 if isinstance(val, dict): val = cls(val) 478 except ValueError, e: 479 estr = ('Error parsing field structure\n\n\t' + 480 s + '\n\t' + ' '*e.args[1] + '^ ' + 481 'Expected %s\n' % e.args[0]) 482 raise ValueError, estr 483 if isinstance(val, Category): val.freeze() 484 rhs.append(val) 485 position = _PARSE_RE['whitespace'].match(s, position).end() 486 rules.append(Production(lhs, rhs)) 487 488 if position < len(s): 489 match = _PARSE_RE['disjunct'].match(s, position) 490 position = match.end() 491 492 # Special case: if there's nothing after the arrow, it is one rule with 493 # an empty RHS, instead of no rules. 494 if len(rules) == 0: rules = [Production(lhs, ())] 495 return rules
496
497 -class GrammarCategory(Category):
498 """ 499 A class of C{Category} for use in parsing. 500 501 The name of the head feature in a C{GrammarCategory} is C{pos} (for "part 502 of speech"). 503 504 In addition, GrammarCategories are displayed and parse differently, to be 505 consistent with NLP teaching materials: the value of the C{/} feature can 506 be written with a slash after the right bracket, so that the string 507 representation looks like: C{head[...]/value}. 508 509 Every GrammarCategory has a / feature implicitly present; if it is not 510 explicitly written, it has the value False. This is so that "slashed" 511 features cannot unify with "unslashed" ones. 512 513 An example of a C{GrammarCategory} is C{VP[+fin]/NP}, for a verb phrase 514 that is finite and has an omitted noun phrase inside it. 515 """ 516 517 headname = 'pos' 518 yaml_tag = '!parse.GrammarCategory' 519 520 @classmethod
521 - def _str(cls, obj, reentrances, reentrance_ids):
522 segments = [] 523 524 keys = obj.keys() 525 keys.sort() 526 for fname in keys: 527 if fname == cls.headname: continue 528 if isinstance(obj, GrammarCategory) and fname == '/': continue 529 fval = obj[fname] 530 if isinstance(fval, bool): 531 if fval: segments.append('+%s' % fname) 532 else: segments.append('-%s' % fname) 533 elif not isinstance(fval, dict): 534 segments.append('%s=%r' % (fname, fval)) 535 else: 536 fval_repr = cls._str(fval, reentrances, reentrance_ids) 537 segments.append('%s=%s' % (fname, fval_repr)) 538 539 if segments: features = '[%s]' % ', '.join(segments) 540 else: features = '' 541 head = obj.get(cls.headname) 542 if head is None: head = '' 543 slash = None 544 if isinstance(obj, GrammarCategory): slash = obj.get('/') 545 if not slash: slash = '' 546 else: 547 if isinstance(slash, dict): 548 slash = '/%s' % cls._str(slash, reentrances, reentrance_ids) 549 else: 550 slash = '/%r' % slash 551 552 553 return '%s%s%s' % (head, features, slash)
554 555 @staticmethod
556 - def parse(s, position=0):
557 return GrammarCategory.inner_parse(s, position)[0]
558 559 @classmethod
560 - def inner_parse(cls, s, position, reentrances=None):
561 if reentrances is None: reentrances = {} 562 if s[position] in "'\"": 563 start = position 564 quotemark = s[position:position+1] 565 position += 1 566 while 1: 567 match = cls._PARSE_RE['stringmarker'].search(s, position) 568 if not match: raise ValueError('close quote', position) 569 position = match.end() 570 if match.group() == '\\': position += 1 571 elif match.group() == quotemark: 572 return s[start+1:position-1], position 573 574 body, position = GrammarCategory._parse(s, position, reentrances) 575 slash_match = Category._PARSE_RE['slash'].match(s, position) 576 if slash_match is not None: 577 position = slash_match.end() 578 slash, position = GrammarCategory._parseval(s, position, reentrances) 579 if isinstance(slash, basestring): slash = {'pos': slash} 580 body['/'] = unify(body.get('/'), slash) 581 elif not body.has_key('/'): 582 body['/'] = False 583 return cls(body), position
584
585 -class SubstituteBindingsI:
586 """ 587 An interface for classes that can perform substitutions for feature 588 variables. 589 """
590 - def substitute_bindings(self, bindings):
591 """ 592 @return: The object that is obtained by replacing 593 each variable bound by C{bindings} with its values. 594 @rtype: (any) 595 """ 596 raise NotImplementedError
597
598 -class ParserSubstitute(logic.Parser):
599 """ 600 A lambda calculus expression parser, extended to create application 601 expressions which support the SubstituteBindingsI interface. 602 """
603 - def make_ApplicationExpression(self, first, second):
604 return ApplicationExpressionSubst(first, second)
605
606 -class ApplicationExpressionSubst(logic.ApplicationExpression, SubstituteBindingsI):
607 """ 608 A lambda application expression, extended to implement the 609 SubstituteBindingsI interface. 610 """
611 - def substitute_bindings(self, bindings):
612 newval = self 613 for semvar in self.variables(): 614 varstr = str(semvar) 615 # discard Variables which are not FeatureVariables 616 if varstr.startswith('?'): 617 var = makevar(varstr) 618 if bindings.is_bound(var): 619 newval = newval.replace(semvar, bindings.lookup(var)) 620 return newval
621 622 ############################################################################ 623 # Read a grammar from a file 624 ############################################################################ 625
626 -class GrammarFile(object):
627 - def __init__(self):
628 self.grammatical_productions = [] 629 self.lexical_productions = [] 630 self.start = GrammarCategory(pos='Start') 631 self.kimmo = None
632
633 - def grammar(self):
634 return Grammar(self.start, self.grammatical_productions +\ 635 self.lexical_productions)
636
637 - def earley_grammar(self):
638 return Grammar(self.start, self.grammatical_productions)
639
640 - def earley_lexicon(self):
641 lexicon = {} 642 for prod in self.lexical_productions: 643 lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs()) 644 def lookup(word): 645 return lexicon.get(word.upper(), [])
646 return lookup
647
648 - def kimmo_lexicon(self):
649 def lookup(word): 650 kimmo_results = self.kimmo.recognize(word.lower()) 651 return [GrammarCategory(k[1]) for k in kimmo_results]
652 return lookup 653
654 - def earley_parser(self, trace=1):
655 from featurechart import FeatureEarleyChartParse 656 if self.kimmo is None: lexicon = self.earley_lexicon() 657 else: lexicon = self.kimmo_lexicon() 658 659 return FeatureEarleyChartParse(self.earley_grammar(), 660 lexicon, trace=trace)
661
662 - def apply_lines(self, lines):
663 for line in lines: 664 line = line.strip() 665 if not len(line): continue 666 if line[0] == '#': continue 667 if line[0] == '%': 668 parts = line[1:].split() 669 directive = parts[0] 670 args = " ".join(parts[1:]) 671 if directive == 'start': 672 self.start = GrammarCategory.parse(args).freeze() 673 elif directive == 'include': 674 filename = args.strip('"') 675 self.apply_file(filename) 676 elif directive == 'tagger_file': 677 import yaml, nltk_lite.yamltags 678 filename = args.strip('"') 679 tagger = yaml.load(filename) 680 self.tagproc = chart_tagger(tagger) 681 elif directive == 'kimmo': 682 filename = args.strip('"') 683 kimmorules = kimmo.load(filename) 684 self.kimmo = kimmorules 685 else: 686 rules = GrammarCategory.parse_rules(line) 687 for rule in rules: 688 if len(rule.rhs()) == 1 and isinstance(rule.rhs()[0], str): 689 self.lexical_productions.append(rule) 690 else: 691 self.grammatical_productions.append(rule)
692
693 - def apply_file(self, filename):
694 f = open(filename) 695 lines = f.readlines() 696 self.apply_lines(lines) 697 f.close()
698 699 @staticmethod
700 - def read_file(filename):
701 result = GrammarFile() 702 result.apply_file(filename) 703 return result
704 705 yaml.add_representer(Category, Category.to_yaml) 706 yaml.add_representer(GrammarCategory, GrammarCategory.to_yaml) 707
708 -def demo():
709 print "Category(pos='n', agr=dict(number='pl', gender='f')):" 710 print 711 print Category(pos='n', agr=dict(number='pl', gender='f')) 712 print repr(Category(pos='n', agr=dict(number='pl', gender='f'))) 713 print 714 print "GrammarCategory.parse('NP/NP'):" 715 print 716 print GrammarCategory.parse('NP/NP') 717 print repr(GrammarCategory.parse('NP/NP')) 718 print 719 print "GrammarCategory.parse('?x/?x'):" 720 print 721 print GrammarCategory.parse('?x/?x') 722 print repr(GrammarCategory.parse('?x/?x')) 723 print 724 print "GrammarCategory.parse('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]'):" 725 print 726 print GrammarCategory.parse('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]') 727 print repr(GrammarCategory.parse('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]')) 728 print 729 g = GrammarFile.read_file("speer.cfg") 730 print g.grammar()
731 732 if __name__ == '__main__': 733 demo() 734