Package nltk_lite :: Package contrib :: Module marshalbrill
[hide private]
[frames] | [no frames]

Source Code for Module nltk_lite.contrib.marshalbrill

   1  # Natural Language Toolkit: Brill Tagger 
   2  # 
   3  # Copyright (C) 2001-2005 University of Pennsylvania 
   4  # Authors: Christopher Maloof <cjmaloof@gradient.cis.upenn.edu> 
   5  #          Edward Loper <edloper@gradient.cis.upenn.edu> 
   6  #          Steven Bird <sb@ldc.upenn.edu> 
   7  # URL: <http://nltk.sf.net> 
   8  # For license information, see LICENSE.TXT 
   9   
  10  """ 
  11  Brill's transformational rule-based tagger. 
  12  """ 
  13   
  14  from nltk_lite.tag import TagI 
  15   
  16  import bisect        # for binary search through a subset of indices 
  17  import os            # for finding WSJ files 
  18  import random        # for shuffling WSJ files 
  19  import sys           # for getting command-line arguments 
  20  import re            # for performing regular expression matching 
  21   
  22  ###################################################################### 
  23  ## The Brill Tagger 
  24  ###################################################################### 
  25   
class Brill(TagI):
    """
    Brill's transformational rule-based tagger.  Brill taggers use an
    X{initial tagger} (such as L{tag.Default}) to assign an intial
    tag sequence to a text; and then apply an ordered list of
    transformational rules to correct the tags of individual tokens.
    These transformation rules are specified by the L{BrillRuleI}
    interface.

    Brill taggers can be created directly, from an initial tagger and
    a list of transformational rules; but more often, Brill taggers
    are created by learning rules from a training corpus, using either
    L{BrillTrainer} or L{FastBrillTrainer}.
    """

    # TODO: move into __init__() when all marshalling classes will be moved into
    # standard tree
    _classname = "BrillTagger"

    def __init__(self, initial_tagger, rules):
        """
        @param initial_tagger: The initial tagger
        @type initial_tagger: L{TagI}
        @param rules: An ordered list of transformation rules that
            should be used to correct the initial tagging.
        @type rules: C{list} of L{BrillRuleI}
        """
        self._initial_tagger = initial_tagger
        self._rules = rules

    def rules(self):
        """
        @return: A copy of this tagger's ordered list of rules.
        @rtype: C{list} of L{BrillRuleI}
        """
        return self._rules[:]

    def tag(self, tokens):
        # Inherit documentation from TagI

        # Run the initial tagger.
        tagged_tokens = list(self._initial_tagger.tag(tokens))

        # Create a dictionary that maps each tag to a list of the
        # indices of tokens that have that tag.
        tag_to_positions = {}
        for i, (token, tag) in enumerate(tagged_tokens):
            if tag not in tag_to_positions:
                tag_to_positions[tag] = set([i])
            else:
                tag_to_positions[tag].add(i)

        # Apply each rule, in order.  Only try to apply rules at
        # positions that have the desired original tag.
        for rule in self._rules:
            # Find the positions where it might apply
            positions = tag_to_positions.get(rule.original_tag(), [])
            # Apply the rule at those positions.
            changed = rule.apply_at(tagged_tokens, positions)
            # Update tag_to_positions with the positions of tags that
            # were modified.
            for i in changed:
                tag_to_positions[rule.original_tag()].remove(i)
                if rule.replacement_tag() not in tag_to_positions:
                    tag_to_positions[rule.replacement_tag()] = set([i])
                else:
                    tag_to_positions[rule.replacement_tag()].add(i)
        for t in tagged_tokens:
            yield t

    # marshal() and unmarshal() methods by Tiago Tresoldi <tresoldi@users.sf.net>
    def marshal(self, filename):
        """
        Marshals (saves to a plain text file) the tagger model.

        @param filename: Name of the file to which save the model (will
            be overwritten if it already exists).
        @type filename: C{string}
        """
        # open() rather than the deprecated file() builtin.
        handler = open(filename, "w")

        for rule in self.rules():
            handler.write("%s\n" % rule)

        handler.close()

    def unmarshal(self, filename):
        """
        Unmarshals (loads from a plain text file) the tagger model. This
        operation will override any previously stored rules.

        @param filename: Name of the file from which the model will
            be read.
        @type filename: C{string}
        """
        # The literal dots of the "i-1...i+2" form are escaped: an
        # unescaped "." would match any character, not just a dot.
        rule_a = re.compile(r"^(.+) -> (.+) if the (.+) of words i([+-]\d+)\.\.\.i([+-]\d+) is '(.+)'$", re.UNICODE)
        rule_b = re.compile(r"^(.+) -> (.+) if the (.+) of the (.+) word is '(.+)'$", re.UNICODE)

        # erase any previous rules
        self._rules = []

        # load from file
        handler = open(filename, "r")
        lines = handler.readlines()
        handler.close()

        # Strip the trailing newline if present.  A blind line[:-1]
        # would chop a real character off a final line that is not
        # newline-terminated.
        lines = [line.rstrip("\n") for line in lines]
        # remove empty lines
        lines = [line for line in lines if len(line) > 0]

        # parse rules
        for rule in lines:
            match = rule_b.match(rule)
            if match:
                groups = list(match.groups())
                # Rewrite "preceding"/"following" into explicit
                # (start, end) offsets so both rule forms share the
                # same group layout below.
                if groups[3] == "preceding":
                    groups.pop(3)
                    groups.insert(3, "-1")
                    groups.insert(4, "-1")
                else:
                    groups.pop(3)
                    groups.insert(3, "1")
                    groups.insert(4, "1")
            else:
                match = rule_a.match(rule)
                groups = list(match.groups())

            conditions = (int(groups[3]), int(groups[4]), groups[5])
            if groups[2] == "tag":
                r = ProximateTagsRule(groups[0], groups[1], conditions)
            else:
                r = ProximateWordsRule(groups[0], groups[1], conditions)

            self._rules.append(r)
158 159 160 ###################################################################### 161 ## Brill Rules 162 ###################################################################### 163
class BrillRuleI(object):
    """
    An interface for tag transformations on a tagged corpus, as
    performed by brill taggers.  Each transformation finds all tokens
    in the corpus that are tagged with a specific X{original tag} and
    satisfy a specific X{condition}, and replaces their tags with a
    X{replacement tag}.  For any given transformation, the original
    tag, replacement tag, and condition are fixed.  Conditions may
    depend on the token under consideration, as well as any other
    tokens in the corpus.

    Brill rules must be comparable and hashable.
    """
    def apply_to(self, tokens):
        """
        Apply this rule everywhere it applies in the corpus.  I.e.,
        for each token in the corpus that is tagged with this rule's
        original tag, and that satisfies this rule's condition, set
        its tag to be this rule's replacement tag.

        @param tokens: The tagged corpus
        @type tokens: C{list} of C{tuple}
        @return: The indices of tokens whose tags were changed by this
            rule.
        @rtype: C{list} of C{int}
        """
        return self.apply_at(tokens, range(len(tokens)))

    def apply_at(self, tokens, positions):
        """
        Apply this rule at every position in C{positions} where it
        applies to the corpus.  I.e., for each position M{p} in
        C{positions}, if C{tokens[M{p}]} is tagged with this rule's
        original tag, and satisfies this rule's condition, then set
        its tag to be this rule's replacement tag.

        @param tokens: The tagged corpus
        @type tokens: list of Token
        @type positions: C{list} of C{int}
        @param positions: The positions where the transformation is to
            be tried.
        @return: The indices of tokens whose tags were changed by this
            rule.
        @rtype: C{int}
        """
        assert False, "BrillRuleI is an abstract interface"

    def applies(self, tokens, index):
        """
        @return: True if the rule would change the tag of
            C{tokens[index]}, False otherwise
        @rtype: Boolean

        @param tokens: A tagged corpus
        @type tokens: list of Token
        @param index: The index to check
        @type index: int
        """
        assert False, "BrillRuleI is an abstract interface"

    def original_tag(self):
        """
        @return: The tag which this C{BrillRuleI} may cause to be
            replaced.
        @rtype: any
        """
        assert False, "BrillRuleI is an abstract interface"

    def replacement_tag(self):
        """
        @return: the tag with which this C{BrillRuleI} may replace
            another tag.
        @rtype: any
        """
        assert False, "BrillRuleI is an abstract interface"

    # Rules must be comparable and hashable for the algorithm to work.
    # Note: __eq__ takes the standard (self, other) signature; the old
    # one-argument form made any == comparison raise TypeError instead
    # of the intended assertion message.
    def __eq__(self, other):
        assert False, "Brill rules must be comparable"
    def __hash__(self):
        assert False, "Brill rules must be hashable"
245
class ProximateTokensRule(BrillRuleI):
    """
    An abstract base class for brill rules whose condition checks for
    the presence of tokens with given properties at given ranges of
    positions, relative to the token.

    Each subclass of proximate tokens brill rule defines a method
    M{extract_property}, which extracts a specific property from the
    the token, such as its text or tag.  Each instance is
    parameterized by a set of tuples, specifying ranges of positions
    and property values to check for in those ranges:

      - (M{start}, M{end}, M{value})

    The brill rule is then applicable to the M{n}th token iff:

      - The M{n}th token is tagged with the rule's original tag; and
      - For each (M{start}, M{end}, M{value}) triple:
        - The property value of at least one token between
          M{n+start} and M{n+end} (inclusive) is M{value}.

    For example, a proximate token brill template with M{start=end=-1}
    generates rules that check just the property of the preceding
    token.  Note that multiple properties may be included in a single
    rule; the rule applies if they all hold.
    """

    def __init__(self, original_tag, replacement_tag, *conditions):
        """
        Construct a new brill rule that changes a token's tag from
        C{original_tag} to C{replacement_tag} if all of the properties
        specified in C{conditions} hold.

        @type conditions: C{tuple} of C{(int, int, *)}
        @param conditions: A list of 3-tuples C{(start, end, value)},
            each of which specifies that the property of at least one
            token between M{n}+C{start} and M{n}+C{end} (inclusive) is
            C{value}.
        @raise ValueError: If C{start}>C{end} for any condition.
        """
        assert self.__class__ != ProximateTokensRule, \
               "ProximateTokensRule is an abstract base class"

        self._original = original_tag
        self._replacement = replacement_tag
        self._conditions = conditions
        for (s,e,v) in conditions:
            if s>e:
                raise ValueError('Condition %s has an invalid range' %
                                 ((s,e,v),))

    def extract_property(token): # [staticmethod]
        """
        Returns some property characterizing this token, such as its
        base lexical item or its tag.

        Each implentation of this method should correspond to an
        implementation of the method with the same name in a subclass
        of L{ProximateTokensTemplate}.

        @param token: The token
        @type token: Token
        @return: The property
        @rtype: any
        """
        assert False, "ProximateTokensRule is an abstract interface"
    extract_property = staticmethod(extract_property)

    def apply_at(self, tokens, positions):
        # Inherit docs from BrillRuleI

        # Find all locations where the rule is applicable
        change = []
        for i in positions:
            if self.applies(tokens, i):
                change.append(i)

        # Make the changes.  Note: this must be done in a separate
        # step from finding applicable locations, since we don't want
        # the rule to interact with itself.
        for i in change:
            (token, tag) = tokens[i]
            tokens[i] = (token, self._replacement)

        return change

    def applies(self, tokens, index):
        # Inherit docs from BrillRuleI

        # Does the given token have this rule's "original tag"?
        if tokens[index][1] != self._original:
            return False

        # Check to make sure that every condition holds.
        for (start, end, val) in self._conditions:
            # Find the (absolute) start and end indices.
            s = max(0, index+start)
            e = min(index+end+1, len(tokens))

            # Look for *any* token that satisfies the condition.
            for i in range(s, e):
                if self.extract_property(tokens[i]) == val:
                    break
            else:
                # No token satisfied the condition; return false.
                return False

        # Every condition checked out, so the rule is applicable.
        return True

    def original_tag(self):
        # Inherit docs from BrillRuleI
        return self._original

    def replacement_tag(self):
        # Inherit docs from BrillRuleI
        return self._replacement

    def __eq__(self, other):
        # "is not None" rather than "!= None": an identity test cannot
        # be intercepted by the other object's comparison methods.
        return (other is not None and
                other.__class__ == self.__class__ and
                self._original == other._original and
                self._replacement == other._replacement and
                self._conditions == other._conditions)

    def __hash__(self):
        # Include the class itself to distinguish subclasses.  This
        # replaces hashing extract_property.func_code, which relied on
        # a CPython-2-only attribute; hashing the class achieves the
        # same subclass discrimination portably.
        return hash( (self.__class__, self._original, self._replacement,
                      self._conditions) )

    def __repr__(self):
        conditions = ' and '.join(['%s in %d...%d' % (v,s,e)
                                   for (s,e,v) in self._conditions])
        return '<%s: %s->%s if %s>' % (self.__class__.__name__,
                                       self._original, self._replacement,
                                       conditions)

    def __str__(self):
        replacement = '%s -> %s' % (self._original,
                                    self._replacement)
        if len(self._conditions) == 0:
            conditions = ''
        else:
            conditions = ' if '+ ', and '.join([self._condition_to_str(c)
                                                for c in self._conditions])
        return replacement+conditions

    def _condition_to_str(self, condition):
        """
        Return a string representation of the given condition.
        This helper method is used by L{__str__}.
        """
        (start, end, value) = condition
        return ('the %s of %s is %r' %
                (self.PROPERTY_NAME, self._range_to_str(start, end), value))

    def _range_to_str(self, start, end):
        """
        Return a string representation for the given range.  This
        helper method is used by L{__str__}.
        """
        if start == end == 0:
            return 'this word'
        if start == end == -1:
            return 'the preceding word'
        elif start == end == 1:
            return 'the following word'
        elif start == end and start < 0:
            return 'word i-%d' % -start
        elif start == end and start > 0:
            return 'word i+%d' % start
        else:
            if start >= 0: start = '+%d' % start
            if end >= 0: end = '+%d' % end
            return 'words i%s...i%s' % (start, end)
423
class ProximateTagsRule(ProximateTokensRule):
    """
    A proximate-tokens rule whose conditions examine the I{tags} of
    nearby tokens.

    @see: superclass L{ProximateTokensRule} for details.
    @see: L{ProximateTagsTemplate}, which generates these rules.
    """
    # Property name used when rendering this rule as a string.
    PROPERTY_NAME = 'tag'

    @staticmethod
    def extract_property(token):
        """@return: The given token's tag."""
        return token[1]
class ProximateWordsRule(ProximateTokensRule):
    """
    A proximate-tokens rule whose conditions examine the I{text} (base
    type) of nearby tokens.

    @see: L{ProximateTokensRule} for details.
    @see: L{ProximateWordsTemplate}, which generates these rules.
    """
    # Property name used when rendering this rule as a string.
    PROPERTY_NAME = 'text'

    @staticmethod
    def extract_property(token):
        """@return: The given token's text."""
        return token[0]
class BrillTemplateI(object):
    """
    An interface for generating lists of transformational rules that
    apply at given corpus positions.  C{BrillTemplateI} is used by
    C{Brill} training algorithms to generate candidate rules.
    """
    # The exceptions below use the call form raise E(msg); the old
    # "raise E, msg" statement syntax is Python-2-only.
    def __init__(self):
        raise AssertionError("BrillTemplateI is an abstract interface")

    def applicable_rules(self, tokens, i, correctTag):
        """
        Return a list of the transformational rules that would correct
        the C{i}th subtoken's tag in the given token.  In particular,
        return a list of zero or more rules that would change
        C{tagged_tokens[i][1]} to C{correctTag}, if applied
        to C{token}.

        If the C{i}th subtoken already has the correct tag (i.e., if
        C{tagged_tokens[i][1]} == C{correctTag}), then
        C{applicable_rules} should return the empty list.

        @param tokens: The tagged tokens being tagged.
        @type tokens: C{list} of C{tuple}
        @param i: The index of the token whose tag should be corrected.
        @type i: C{int}
        @param correctTag: The correct tag for the C{i}th token.
        @type correctTag: (any)
        @rtype: C{list} of L{BrillRuleI}
        """
        raise AssertionError("BrillTemplateI is an abstract interface")

    def get_neighborhood(self, token, index):
        """
        Returns the set of indices C{i} such that
        C{applicable_rules(token, index, ...)} depends on the value of
        the C{i}th subtoken of C{token}.

        This method is used by the \"fast\" Brill tagger trainer.

        @param token: The tokens being tagged.
        @type token: C{list} of C{tuple}
        @param index: The index whose neighborhood should be returned.
        @type index: C{int}
        @rtype: C{Set}
        """
        raise AssertionError("BrillTemplateI is an abstract interface")
498
class ProximateTokensTemplate(BrillTemplateI):
    """
    A brill template that generates a list of
    L{ProximateTokensRule}s that apply at a given corpus
    position.  In particular, each C{ProximateTokensTemplate} is
    parameterized by a proximate token brill rule class and a list of
    boundaries, and generates all rules that:

      - use the given brill rule class
      - use the given list of boundaries as the C{start} and C{end}
        points for their conditions
      - are applicable to the given token.
    """
    def __init__(self, rule_class, *boundaries):
        """
        Construct a template for generating proximate token brill
        rules.

        @type rule_class: C{class}
        @param rule_class: The proximate token brill rule class that
            should be used to generate new rules.  This class must be a
            subclass of L{ProximateTokensRule}.
        @type boundaries: C{tuple} of C{(int, int)}
        @param boundaries: A list of tuples C{(start, end)}, each of
            which specifies a range for which a condition should be
            created by each rule.
        @raise ValueError: If C{start}>C{end} for any boundary.
        """
        self._rule_class = rule_class
        self._boundaries = boundaries
        # Reject any interval whose start lies past its end.
        for (lo, hi) in boundaries:
            if lo > hi:
                raise ValueError('Boundary %s has an invalid range' %
                                 ((lo, hi),))

    def applicable_rules(self, tokens, index, correct_tag):
        # A correctly tagged token needs no correcting rules.
        if tokens[index][1] == correct_tag:
            return []

        # For every boundary, gather the conditions that actually hold
        # in the corresponding window around this token.
        per_boundary = [self._applicable_conditions(tokens, index, lo, hi)
                        for (lo, hi) in self._boundaries]

        # Cross product: one condition chosen per boundary.  E.g., from
        # [[A,B], [C,D]] this builds [[A,C], [A,D], [B,C], [B,D]].
        combos = [[]]
        for options in per_boundary:
            combos = [partial + [choice]
                      for partial in combos
                      for choice in options]

        # Wrap each combination in a rule of the configured class.
        current_tag = tokens[index][1]
        return [self._rule_class(current_tag, correct_tag, *combo)
                for combo in combos]

    def _applicable_conditions(self, tokens, index, start, end):
        """
        @return: A set of all conditions for proximate token rules
        that are applicable to C{tokens[index]}, given boundaries of
        C{(start, end)}.  I.e., return a list of all tuples C{(start,
        end, M{value})}, such the property value of at least one token
        between M{index+start} and M{index+end} (inclusive) is
        M{value}.
        """
        lo = max(0, index + start)
        hi = min(index + end + 1, len(tokens))
        extract = self._rule_class.extract_property
        return set([(start, end, extract(tokens[i]))
                    for i in range(lo, hi)])

    def get_neighborhood(self, tokens, index):
        # inherit docs from BrillTemplateI
        neighborhood = set([index])
        for (start, end) in self._boundaries:
            lo = max(0, index + start)
            hi = min(index + end + 1, len(tokens))
            neighborhood.update(range(lo, hi))
        return neighborhood
584
class SymmetricProximateTokensTemplate(BrillTemplateI):
    """
    Simulates two L{ProximateTokensTemplate}s which are symmetric
    across the location of the token.  For rules of the form \"If the
    M{n}th token is tagged C{A}, and any tag preceding B{or} following
    the M{n}th token by a distance between M{x} and M{y} is C{B}, and
    ... , then change the tag of the nth token from C{A} to C{C}.\"

    One C{ProximateTokensTemplate} is formed by passing in the
    same arguments given to this class's constructor: tuples
    representing intervals in which a tag may be found.  The other
    C{ProximateTokensTemplate} is constructed with the negative
    of all the arguments in reversed order.  For example, a
    C{SymmetricProximateTokensTemplate} using the pair (-2,-1) and the
    constructor C{ProximateTagsTemplate} generates the same rules as a
    C{ProximateTagsTemplate} using (-2,-1) plus a second
    C{ProximateTagsTemplate} using (1,2).

    This is useful because we typically don't want templates to
    specify only \"following\" or only \"preceding\"; we'd like our
    rules to be able to look in either direction.
    """
    def __init__(self, rule_class, *boundaries):
        """
        Construct a template for generating proximate token brill
        rules.

        @type rule_class: C{class}
        @param rule_class: The proximate token brill rule class that
            should be used to generate new rules.  This class must be a
            subclass of L{ProximateTokensRule}.
        @type boundaries: C{tuple} of C{(int, int)}
        @param boundaries: A list of tuples C{(start, end)}, each of
            which specifies a range for which a condition should be
            created by each rule.
        @raise ValueError: If C{start}>C{end} for any boundary.
        """
        self._ptt1 = ProximateTokensTemplate(rule_class, *boundaries)
        # Mirror each (start, end) interval to the opposite side of the
        # token.  Named "mirrored" rather than "reversed" so as not to
        # shadow the builtin reversed().
        mirrored = [(-e, -s) for (s, e) in boundaries]
        self._ptt2 = ProximateTokensTemplate(rule_class, *mirrored)

    # Generates lists of a subtype of ProximateTokensRule.
    def applicable_rules(self, tokens, index, correctTag):
        """
        See L{BrillTemplateI} for full specifications.

        @rtype: list of ProximateTokensRule
        """
        return (self._ptt1.applicable_rules(tokens, index, correctTag) +
                self._ptt2.applicable_rules(tokens, index, correctTag))

    def get_neighborhood(self, tokens, index):
        # inherit docs from BrillTemplateI
        n1 = self._ptt1.get_neighborhood(tokens, index)
        n2 = self._ptt2.get_neighborhood(tokens, index)
        return n1.union(n2)
641 642 ###################################################################### 643 ## Brill Tagger Trainer 644 ###################################################################### 645
class BrillTrainer(object):
    """
    A trainer for brill taggers.

    Repeatedly finds the single rule (generated from the supplied
    templates) that best corrects the current tagging of a training
    corpus, until C{max_rules} rules have been found or no rule scores
    at least C{min_score}.
    """
    def __init__(self, initial_tagger, templates, trace=0):
        # initial_tagger: used by train() to produce the starting tagging.
        # templates: BrillTemplateI objects; queried for candidate rules.
        # trace: verbosity level (0 = silent; higher values print more).
        self._initial_tagger = initial_tagger
        self._templates = templates
        self._trace = trace

    #////////////////////////////////////////////////////////////
    # Training
    #////////////////////////////////////////////////////////////

    def train(self, train_tokens, max_rules=200, min_score=2):
        """
        Trains the Brill tagger on the corpus C{train_token},
        producing at most C{max_rules} transformations, each of which
        reduces the net number of errors in the corpus by at least
        C{min_score}.

        @type train_tokens: C{list} of L{tuple}
        @param train_tokens: The corpus of tagged tokens
        @type max_rules: C{int}
        @param max_rules: The maximum number of transformations to be created
        @type min_score: C{int}
        @param min_score: The minimum acceptable net error reduction
            that each transformation must produce in the corpus.
        """
        if self._trace > 0: print ("Training Brill tagger on %d tokens..." %
                                   len(train_tokens))

        # Create a new copy of the training token, and run the initial
        # tagger on this.  We will progressively update this test
        # token to look more like the training token.

        test_tokens = list(self._initial_tagger.tag(t[0] for t in train_tokens))

        if self._trace > 2: self._trace_header()

        # Look for useful rules.
        rules = []
        try:
            while len(rules) < max_rules:
                # NOTE(review): old_tags is never read below; retained as-is.
                old_tags = [t[1] for t in test_tokens]
                (rule, score, fixscore) = self._best_rule(test_tokens,
                                                          train_tokens)
                if rule is None or score < min_score:
                    if self._trace > 1:
                        print 'Insufficient improvement; stopping'
                    break
                else:
                    # Add the rule to our list of rules.
                    rules.append(rule)
                    # Use the rules to update the test token.
                    k = rule.apply_to(test_tokens)
                    # Display trace output.
                    if self._trace > 1:
                        self._trace_rule(rule, score, fixscore, len(k))
        # The user can also cancel training manually:
        except KeyboardInterrupt: pass

        # Create and return a tagger from the rules we found.
        return Brill(self._initial_tagger, rules)

    #////////////////////////////////////////////////////////////
    # Finding the best rule
    #////////////////////////////////////////////////////////////

    # Finds the rule that makes the biggest net improvement in the corpus.
    # Returns a (rule, score) pair.
    def _best_rule(self, test_tokens, train_tokens):
        """
        Find the rule that makes the biggest net improvement to
        C{test_tokens} with respect to C{train_tokens}.

        @return: A C{(rule, score, fixscore)} triple, where C{score}
            is the net number of tags the rule corrects and
            C{fixscore} counts only the corrections (ignoring newly
            broken tags).  C{rule} is C{None} if no rule has a
            positive net score.
        """
        # Create a dictionary mapping from each tag to a list of the
        # indices that have that tag in both test_tokens and
        # train_tokens (i.e., where it is correctly tagged).
        correct_indices = {}
        for i in range(len(test_tokens)):
            if test_tokens[i][1] == train_tokens[i][1]:
                tag = test_tokens[i][1]
                correct_indices.setdefault(tag, []).append(i)

        # Find all the rules that correct at least one token's tag,
        # and the number of tags that each rule corrects (in
        # descending order of number of tags corrected).
        rules = self._find_rules(test_tokens, train_tokens)

        # Keep track of the current best rule, and its score.
        best_rule, best_score, best_fixscore = None, 0, 0

        # Consider each rule, in descending order of fixscore (the
        # number of tags that the rule corrects, not including the
        # number that it breaks).
        for (rule, fixscore) in rules:
            # The actual score must be <= fixscore; so if best_score
            # is bigger than fixscore, then we already have the best
            # rule.
            if best_score >= fixscore:
                return best_rule, best_score, best_fixscore

            # Calculate the actual score, by decrementing fixscore
            # once for each tag that the rule changes to an incorrect
            # value.
            score = fixscore
            if correct_indices.has_key(rule.original_tag()):
                for i in correct_indices[rule.original_tag()]:
                    if rule.applies(test_tokens, i):
                        score -= 1
                        # If the score goes below best_score, then we know
                        # that this isn't the best rule; so move on:
                        if score <= best_score: break

            #print '%5d %5d %s' % (fixscore, score, rule)

            # If the actual score is better than the best score, then
            # update best_score and best_rule.
            if score > best_score:
                best_rule, best_score, best_fixscore = rule, score, fixscore

        # Return the best rule, and its score.
        return best_rule, best_score, best_fixscore

    def _find_rules(self, test_tokens, train_tokens):
        """
        Find all rules that correct at least one token's tag in
        C{test_tokens}.

        @return: A list of tuples C{(rule, fixscore)}, where C{rule}
            is a brill rule and C{fixscore} is the number of tokens
            whose tag the rule corrects.  Note that C{fixscore} does
            I{not} include the number of tokens whose tags are changed
            to incorrect values.
        """

        # Create a list of all indices that are incorrectly tagged.
        # NOTE(review): error_indices is not read below; retained as-is.
        error_indices = [i for i in range(len(test_tokens))
                         if (test_tokens[i][1] !=
                             train_tokens[i][1])]

        # Create a dictionary mapping from rules to their positive-only
        # scores.
        rule_score_dict = {}
        for i in range(len(test_tokens)):
            rules = self._find_rules_at(test_tokens, train_tokens, i)
            for rule in rules:
                rule_score_dict[rule] = rule_score_dict.get(rule,0) + 1

        # Convert the dictionary into a list of (rule, score) tuples,
        # sorted in descending order of score.
        rule_score_items = rule_score_dict.items()
        temp = [(-score, rule) for (rule, score) in rule_score_items]
        temp.sort()
        return [(rule, -negscore) for (negscore, rule) in temp]

    def _find_rules_at(self, test_tokens, train_tokens, i):
        """
        @rtype: C{Set}
        @return: the set of all rules (based on the templates) that
            correct token C{i}'s tag in C{test_tokens}.
        """

        applicable_rules = set()
        if test_tokens[i][1] != train_tokens[i][1]:
            correct_tag = train_tokens[i][1]
            for template in self._templates:
                new_rules = template.applicable_rules(test_tokens, i,
                                                      correct_tag)
                applicable_rules.update(new_rules)

        return applicable_rules

    #////////////////////////////////////////////////////////////
    # Tracing
    #////////////////////////////////////////////////////////////

    def _trace_header(self):
        """Print the column headers for per-rule trace output."""
        print """
           B      |
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
        """.rstrip()

    def _trace_rule(self, rule, score, fixscore, numchanges):
        """Print one trace line (score columns, then the rule itself)."""
        if self._trace > 2:
            print ('%4d%4d%4d%4d ' % (score, fixscore, fixscore-score,
                                      numchanges-fixscore*2+score)), '|',
        print rule
836 837 ###################################################################### 838 ## Fast Brill Tagger Trainer 839 ###################################################################### 840
841 -class FastBrillTrainer(object):
842 """ 843 A faster trainer for brill taggers. 844 """
    def __init__(self, initial_tagger, templates, trace=0):
        # Same configuration as BrillTrainer: the tagger producing the
        # initial tagging, the templates generating candidate rules,
        # and a verbosity level used by train().
        self._initial_tagger = initial_tagger
        self._templates = templates
        self._trace = trace
849 850 #//////////////////////////////////////////////////////////// 851 # Training 852 #//////////////////////////////////////////////////////////// 853
    def train(self, train_tokens, max_rules=200, min_score=2):
        """
        Learn an ordered list of transformational rules from C{train_tokens}
        and return a L{Brill} tagger built from them.

        @param train_tokens: the correctly-tagged training corpus, as a
            flat list of (word, tag) pairs
        @param max_rules: stop after this many rules have been selected
        @param min_score: stop when the best remaining rule's net score
            falls below this value
        @return: a C{Brill} tagger wrapping the initial tagger and the
            learned rules
        """

        # If TESTING is true, extra computation is done to determine whether
        # each "best" rule actually reduces net error by the score it received.
        TESTING = False

        # Basic idea: Keep track of the rules that apply at each position.
        # And keep track of the positions to which each rule applies.

        # The set of somewhere-useful rules that apply at each position
        rulesByPosition = []
        for i in range(len(train_tokens)):
            rulesByPosition.append(set())

        # Mapping somewhere-useful rules to the positions where they apply.
        # Then maps each position to the score change the rule generates there.
        # (always -1, 0, or 1)
        positionsByRule = {}

        # Map scores to sets of rules known to achieve *at most* that score.
        # (Dicts with value None are used as ordered sets here.)
        rulesByScore = {0:{}}
        # Conversely, map somewhere-useful rules to their minimal scores.
        ruleScores = {}

        tagIndices = {}   # Lists of indices, mapped to by their tags

        # Maps rules to the first index in the corpus where it may not be known
        # whether the rule applies.  (Rules can't be chosen for inclusion
        # unless this value = len(corpus).  But most rules are bad, and
        # we won't need to check the whole corpus to know that.)
        # Some indices past this may actually have been checked; it just isn't
        # guaranteed.
        firstUnknownIndex = {}

        # Make entries in the rule-mapping dictionaries.
        # Should be called before _updateRuleApplies.
        def _initRule (rule):
            positionsByRule[rule] = {}
            rulesByScore[0][rule] = None
            ruleScores[rule] = 0
            firstUnknownIndex[rule] = 0

        # Takes a somewhere-useful rule which applies at index i;
        # Updates all rule data to reflect that the rule so applies.
        def _updateRuleApplies (rule, i):

            # If the rule is already known to apply here, ignore.
            # (This only happens if the position's tag hasn't changed.)
            if positionsByRule[rule].has_key(i):
                return

            # Score delta at i: +1 fixes an error, -1 introduces one,
            # 0 leaves a wrong tag wrong.
            if rule.replacement_tag() == train_tokens[i][1]:
                positionsByRule[rule][i] = 1
            elif rule.original_tag() == train_tokens[i][1]:
                positionsByRule[rule][i] = -1
            else: # was wrong, remains wrong
                positionsByRule[rule][i] = 0

            # Update rules in the other dictionaries
            del rulesByScore[ruleScores[rule]][rule]
            ruleScores[rule] += positionsByRule[rule][i]
            if not rulesByScore.has_key(ruleScores[rule]):
                rulesByScore[ruleScores[rule]] = {}
            rulesByScore[ruleScores[rule]][rule] = None
            rulesByPosition[i].add(rule)

        # Takes a rule which no longer applies at index i;
        # Updates all rule data to reflect that the rule doesn't apply.
        def _updateRuleNotApplies (rule, i):
            del rulesByScore[ruleScores[rule]][rule]
            ruleScores[rule] -= positionsByRule[rule][i]
            if not rulesByScore.has_key(ruleScores[rule]):
                rulesByScore[ruleScores[rule]] = {}
            rulesByScore[ruleScores[rule]][rule] = None

            del positionsByRule[rule][i]
            rulesByPosition[i].remove(rule)
            # Optional addition: if the rule now applies nowhere, delete
            # all its dictionary entries.

        # Working copy of the corpus, tagged by the initial tagger; the
        # learned rules are applied to it as they are selected.
        tagged_tokens = list(self._initial_tagger.tag(t[0] for t in train_tokens))

        # First sort the corpus by tag, and also note where the errors are.
        errorIndices = []  # only used in initialization
        for i in range(len(tagged_tokens)):
            tag = tagged_tokens[i][1]
            if tag != train_tokens[i][1]:
                errorIndices.append(i)
            if not tagIndices.has_key(tag):
                tagIndices[tag] = []
            tagIndices[tag].append(i)

        print "Finding useful rules..."
        # Collect all rules that fix any errors, with their positive scores.
        for i in errorIndices:
            for template in self._templates:
                # Find the templated rules that could fix the error.
                for rule in template.applicable_rules(tagged_tokens, i,
                                                      train_tokens[i][1]):
                    if not positionsByRule.has_key(rule):
                        _initRule(rule)
                    _updateRuleApplies(rule, i)

        print "Done initializing %i useful rules." %len(positionsByRule)

        if TESTING:
            after = -1 # bug-check only

        # Each iteration through the loop tries a new maxScore.
        maxScore = max(rulesByScore.keys())
        rules = []
        while len(rules) < max_rules and maxScore >= min_score:

            # Find the next best rule.  This is done by repeatedly taking a
            # rule with the highest score and stepping through the corpus to
            # see where it applies.  When it makes an error (decreasing its
            # score) it's bumped down, and we try a new rule with the highest
            # score.
            # When we find a rule which has the highest score AND which has
            # been tested against the entire corpus, we can conclude that
            # it's the next best rule.

            bestRule = None
            bestRules = rulesByScore[maxScore].keys()

            for rule in bestRules:
                # Find the first relevant index at or following the first
                # unknown index.  (Only check indices with the right tag.)
                ti = bisect.bisect_left(tagIndices[rule.original_tag()],
                                        firstUnknownIndex[rule])
                for nextIndex in tagIndices[rule.original_tag()][ti:]:
                    if rule.applies(tagged_tokens, nextIndex):
                        _updateRuleApplies(rule, nextIndex)
                        if ruleScores[rule] < maxScore:
                            firstUnknownIndex[rule] = nextIndex+1
                            break # the _update demoted the rule

                # If we checked all remaining indices and found no more errors:
                if ruleScores[rule] == maxScore:
                    firstUnknownIndex[rule] = len(tagged_tokens) # i.e., we checked them all
                    print "%i) %s (score: %i)" %(len(rules)+1, rule, maxScore)
                    bestRule = rule
                    break

            if bestRule == None: # all rules dropped below maxScore
                del rulesByScore[maxScore]
                maxScore = max(rulesByScore.keys())
                continue # with next-best rules

            # bug-check only
            if TESTING:
                before = len(_errorPositions(tagged_tokens, train_tokens))
                print "There are %i errors before applying this rule." %before
                assert after == -1 or before == after, \
                        "after=%i but before=%i" %(after,before)

            print "Applying best rule at %i locations..." \
                    %len(positionsByRule[bestRule].keys())

            # If we reach this point, we've found a new best rule.
            # Apply the rule at the relevant sites.
            # (apply_at is a little inefficient here, since we know the rule
            #  applies and don't actually need to test it again.)
            rules.append(bestRule)
            bestRule.apply_at(tagged_tokens, positionsByRule[bestRule].keys())

            # Update the tag index accordingly.
            for i in positionsByRule[bestRule].keys(): # where it applied
                # Update positions of tags
                # First, find and delete the index for i from the old tag.
                oldIndex = bisect.bisect_left(tagIndices[bestRule.original_tag()], i)
                del tagIndices[bestRule.original_tag()][oldIndex]

                # Then, insert i into the index list of the new tag.
                if not tagIndices.has_key(bestRule.replacement_tag()):
                    tagIndices[bestRule.replacement_tag()] = []
                newIndex = bisect.bisect_left(tagIndices[bestRule.replacement_tag()], i)
                tagIndices[bestRule.replacement_tag()].insert(newIndex, i)

            # This part is tricky.
            # We need to know which sites might now require new rules -- that
            # is, which sites are close enough to the changed site so that
            # a template might now generate different rules for it.
            # Only the templates can know this.
            #
            # If a template now generates a different set of rules, we have
            # to update our indices to reflect that.
            print "Updating neighborhoods of changed sites.\n"

            # First, collect all the indices that might get new rules.
            neighbors = set()
            for i in positionsByRule[bestRule].keys(): # sites changed
                for template in self._templates:
                    neighbors.update(template.get_neighborhood(tagged_tokens, i))

            # Then collect the new set of rules for each such index.
            c = d = e = 0
            for i in neighbors:
                siteRules = set()
                for template in self._templates:
                    # Get a set of the rules that the template now generates
                    siteRules.update(set(template.applicable_rules(
                        tagged_tokens, i, train_tokens[i][1])))

                # Update rules no longer generated here by any template
                for obsolete in rulesByPosition[i] - siteRules:
                    c += 1
                    _updateRuleNotApplies(obsolete, i)

                # Update rules only now generated by this template
                for newRule in siteRules - rulesByPosition[i]:
                    d += 1
                    if not positionsByRule.has_key(newRule):
                        e += 1
                        _initRule(newRule) # make a new rule w/score=0
                    _updateRuleApplies(newRule, i) # increment score, etc.

            if TESTING:
                after = before - maxScore
            print "%i obsolete rule applications, %i new ones, " %(c,d)+ \
                    "using %i previously-unseen rules." %e

            maxScore = max(rulesByScore.keys()) # may have gone up


        # NOTE(review): this trace message appears AFTER training has already
        # completed, and rules_by_position below is never read again --
        # both look like leftovers copied from BrillTrainer; confirm before
        # removing.
        if self._trace > 0: print ("Training Brill tagger on %d tokens..." %
                                   len(train_tokens))

        # Maintain a list of the rules that apply at each position.
        rules_by_position = [{} for tok in train_tokens]

        # Create and return a tagger from the rules we found.
        return Brill(self._initial_tagger, rules)

######################################################################
## Testing
######################################################################

def _errorPositions (train_tokens, tokens):
    # Return the indices at which the proposed tagging (tokens) disagrees
    # with the reference tagging (train_tokens).
    mismatches = []
    for index in range(len(tokens)):
        if tokens[index][1] != train_tokens[index][1]:
            mismatches.append(index)
    return mismatches
1095 1096 # returns a list of errors in string format
def errorList (train_tokens, tokens, radius=2):
    """
    Build human-readable strings describing each tagging error in C{tokens}.

    Each string shows the wrong tag, the correct tag, and a window of
    context around the mistagged token, with the error highlighted.

    @param train_tokens: The correct tagging of the corpus
    @type train_tokens: C{list} of C{tuple}
    @param tokens: The tagged corpus
    @type tokens: C{list} of C{tuple}
    @param radius: How many tokens on either side of a wrongly-tagged token
    to include in the error string.  For example, if C{radius}=2, each error
    string will show the incorrect token plus two tokens on either side.
    @type radius: int
    """
    descriptions = []
    total = len(tokens)
    for idx in _errorPositions(train_tokens, tokens):
        line = tokens[idx][1].rjust(3) + " -> " \
               + train_tokens[idx][1].rjust(3) + ": "
        window_start = max(idx-radius, 0)
        window_end = min(idx+radius+1, total)
        for j in range(window_start, window_end):
            word = tokens[j][0]
            tag = tokens[j][1]
            # Punctuation is tagged as itself; print it bare.
            if word == tag:
                shown = word
            else:
                shown = word + "/" + tag

            if j == idx:
                line += "**"+shown+"** "
            else:
                line += shown + " "
        descriptions.append(line)

    return descriptions
1130 1131 ##################################################################################### 1132 # Demonstration 1133 ##################################################################################### 1134
1135 -def demo(num_sents=100, max_rules=200, min_score=2, error_output = "errors.out", 1136 rule_output="rules.out", randomize=False, train=.8, trace=3):
1137 """ 1138 Brill Tagger Demonstration 1139 1140 @param num_sents: how many sentences of training and testing data to use 1141 @type num_sents: L{int} 1142 @param max_rules: maximum number of rule instances to create 1143 @type max_rules: L{int} 1144 @param min_score: the minimum score for a rule in order for it to be considered 1145 @type min_score: L{int} 1146 @param error_output: the file where errors will be saved 1147 @type error_output: L{string} 1148 @param rule_output: the file where rules will be saved 1149 @type rule_output: L{string} 1150 @param randomize: whether the training data should be a random subset of the corpus 1151 @type randomize: L{boolean} 1152 @param train: the fraction of the the corpus to be used for training (1=all) 1153 @type train: L{float} 1154 @param trace: the level of diagnostic tracing output to produce (0-3) 1155 @type trace: L{int} 1156 """ 1157 1158 from nltk_lite.corpora import treebank 1159 from nltk_lite import tag 1160 from nltk_lite.tag import brill 1161 1162 NN_CD_tagger = tag.Regexp([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')]) 1163 1164 # train is the proportion of data used in training; the rest is reserved 1165 # for testing. 1166 1167 print "Loading tagged data..." 1168 sents = list(treebank.tagged()) 1169 if randomize: 1170 random.seed(len(sents)) 1171 random.shuffle(sents) 1172 1173 tagged_data = [t for s in sents[:num_sents] for t in s] 1174 cutoff = int(len(tagged_data)*train) 1175 1176 training_data = tagged_data[:cutoff] 1177 gold_data = tagged_data[cutoff:] 1178 1179 testing_data = [t[0] for t in gold_data] 1180 1181 # Unigram tagger 1182 1183 print "Training unigram tagger:", 1184 u = tag.Unigram(backoff=NN_CD_tagger) 1185 1186 # NB training and testing are required to use a list-of-lists structure, 1187 # so we wrap the flattened corpus data with the extra list structure. 
1188 u.train([training_data]) 1189 print("[accuracy: %f]" % tag.accuracy(u, [gold_data])) 1190 1191 # Brill tagger 1192 1193 templates = [ 1194 brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,1)), 1195 brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2,2)), 1196 brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,2)), 1197 brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,3)), 1198 brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,1)), 1199 brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2,2)), 1200 brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,2)), 1201 brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,3)), 1202 brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1,1)), 1203 brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1,1)), 1204 ] 1205 1206 #trainer = brill.FastBrillTrainer(u, templates, trace) 1207 trainer = brill.BrillTrainer(u, templates, trace) 1208 b = trainer.train(training_data, max_rules, min_score) 1209 1210 print 1211 print("Brill accuracy: %f" % tag.accuracy(b, [gold_data])) 1212 1213 print("\nRules: ") 1214 printRules = file(rule_output, 'w') 1215 for rule in b.rules(): 1216 print(str(rule)) 1217 printRules.write(str(rule)+"\n\n") 1218 1219 testing_data = list(b.tag(testing_data)) 1220 el = errorList(gold_data, testing_data) 1221 errorFile = file(error_output, 'w') 1222 1223 for e in el: 1224 errorFile.write(e+"\n\n") 1225 errorFile.close() 1226 print "Done; rules and errors saved to %s and %s." % (rule_output, error_output)
1227 1228 if __name__ == '__main__': 1229 demo() 1230