Package nltk_lite :: Package tag :: Module brill
[hide private]
[frames] | [no frames]

Source Code for Module nltk_lite.tag.brill

   1  # Natural Language Toolkit: Brill Tagger 
   2  # 
   3  # Copyright (C) 2001-2007 University of Pennsylvania 
   4  # Authors: Christopher Maloof <cjmaloof@gradient.cis.upenn.edu> 
   5  #          Edward Loper <edloper@gradient.cis.upenn.edu> 
   6  #          Steven Bird <sb@csse.unimelb.edu.au> 
   7  # URL: <http://nltk.sf.net> 
   8  # For license information, see LICENSE.TXT 
   9   
  10  """ 
  11  Brill's transformational rule-based tagger. 
  12  """ 
  13   
  14  from nltk_lite.tag import TagI 
  15   
  16  import bisect        # for binary search through a subset of indices 
  17  import random        # for shuffling WSJ files 
  18  import yaml          # to save and load taggers in files 
  19  from nltk_lite import yamltags 
  20   
  21  ###################################################################### 
  22  ## The Brill Tagger 
  23  ###################################################################### 
  24   
class Brill(yaml.YAMLObject):
    """
    Brill's transformational rule-based tagger.  Brill taggers use an
    X{initial tagger} (such as L{tag.Default}) to assign an initial
    tag sequence to a text; an ordered list of transformational rules
    is then applied to correct the tags of individual tokens.  The
    transformation rules themselves implement the L{BrillRuleI}
    interface.

    Brill taggers can be created directly, from an initial tagger and
    a list of transformational rules; but more often, Brill taggers
    are created by learning rules from a training corpus, using either
    L{BrillTrainer} or L{FastBrillTrainer}.
    """

    yaml_tag = '!tag.Brill'

    def __init__(self, initial_tagger, rules):
        """
        @param initial_tagger: The initial tagger
        @type initial_tagger: L{TagI}
        @param rules: An ordered list of transformation rules that
            should be used to correct the initial tagging.
        @type rules: C{list} of L{BrillRuleI}
        """
        self._initial_tagger = initial_tagger
        self._rules = rules

    def rules(self):
        """@return: a copy of this tagger's ordered rule list."""
        return list(self._rules)

    def tag(self, tokens):
        # Inherit documentation from TagI.

        # Assign an initial tagging with the wrapped tagger.
        tagged = list(self._initial_tagger.tag(tokens))

        # Index: tag -> set of positions currently carrying that tag,
        # so each rule is only tried where its original tag occurs.
        positions_of = {}
        for index, (token, tag) in enumerate(tagged):
            positions_of.setdefault(tag, set()).add(index)

        # Apply each rule, in order, only at candidate positions.
        for rule in self._rules:
            candidates = positions_of.get(rule.original_tag(), [])
            modified = rule.apply_at(tagged, candidates)
            # Move each changed position from the old tag's bucket
            # into the replacement tag's bucket.
            for index in modified:
                positions_of[rule.original_tag()].remove(index)
                positions_of.setdefault(rule.replacement_tag(),
                                        set()).add(index)

        for pair in tagged:
            yield pair
87 88 ###################################################################### 89 ## Brill Rules 90 ###################################################################### 91
class BrillRuleI(yaml.YAMLObject):
    """
    An interface for tag transformations on a tagged corpus, as
    performed by brill taggers.  Each transformation finds all tokens
    in the corpus that are tagged with a specific X{original tag} and
    satisfy a specific X{condition}, and replaces their tags with a
    X{replacement tag}.  For any given transformation, the original
    tag, replacement tag, and condition are fixed.  Conditions may
    depend on the token under consideration, as well as any other
    tokens in the corpus.

    Brill rules must be comparable and hashable.
    """
    def apply_to(self, tokens):
        """
        Apply this rule everywhere it applies in the corpus.  I.e.,
        for each token in the corpus that is tagged with this rule's
        original tag, and that satisfies this rule's condition, set
        its tag to be this rule's replacement tag.

        @param tokens: The tagged corpus
        @type tokens: C{list} of C{tuple}
        @return: The indices of tokens whose tags were changed by this
            rule.
        @rtype: C{list} of C{int}
        """
        return self.apply_at(tokens, range(len(tokens)))

    def apply_at(self, tokens, positions):
        """
        Apply this rule at every position in C{positions} where it
        applies to the corpus.  I.e., for each position M{p} in
        C{positions}, if C{tokens[M{p}]} is tagged with this rule's
        original tag, and satisfies this rule's condition, then set
        its tag to be this rule's replacement tag.

        @param tokens: The tagged corpus
        @type tokens: list of Token
        @type positions: C{list} of C{int}
        @param positions: The positions where the transformation is to
            be tried.
        @return: The indices of tokens whose tags were changed by this
            rule.
        @rtype: C{list} of C{int}
        """
        assert False, "BrillRuleI is an abstract interface"

    def applies(self, tokens, index):
        """
        @return: True if the rule would change the tag of
            C{tokens[index]}, False otherwise
        @rtype: Boolean

        @param tokens: A tagged corpus
        @type tokens: list of Token
        @param index: The index to check
        @type index: int
        """
        assert False, "BrillRuleI is an abstract interface"

    def original_tag(self):
        """
        @return: The tag which this C{BrillRuleI} may cause to be
            replaced.
        @rtype: any
        """
        assert False, "BrillRuleI is an abstract interface"

    def replacement_tag(self):
        """
        @return: the tag with which this C{BrillRuleI} may replace
            another tag.
        @rtype: any
        """
        assert False, "BrillRuleI is an abstract interface"

    # Rules must be comparable and hashable for the algorithm to work.
    def __eq__(self, other):
        # Bug fix: the original declared __eq__(self) with no 'other'
        # parameter, so any equality comparison raised TypeError
        # instead of the intended AssertionError.
        assert False, "Brill rules must be comparable"

    def __hash__(self):
        assert False, "Brill rules must be hashable"
173 174
class ProximateTokensRule(BrillRuleI):
    """
    An abstract base class for brill rules whose condition checks for
    the presence of tokens with given properties at given ranges of
    positions, relative to the token.

    Each subclass of proximate tokens brill rule defines a method
    M{extract_property}, which extracts a specific property from the
    token, such as its text or tag.  Each instance is parameterized by
    a set of tuples, specifying ranges of positions and property
    values to check for in those ranges:

      - (M{start}, M{end}, M{value})

    The brill rule is then applicable to the M{n}th token iff:

      - The M{n}th token is tagged with the rule's original tag; and
      - For each (M{start}, M{end}, M{value}) triple:
        - The property value of at least one token between
          M{n+start} and M{n+end} (inclusive) is M{value}.

    For example, a proximate token brill template with M{start=end=-1}
    generates rules that check just the property of the preceding
    token.  Note that multiple properties may be included in a single
    rule; the rule applies if they all hold.
    """

    def __init__(self, original_tag, replacement_tag, *conditions):
        """
        Construct a new brill rule that changes a token's tag from
        C{original_tag} to C{replacement_tag} if all of the properties
        specified in C{conditions} hold.

        @type conditions: C{tuple} of C{(int, int, *)}
        @param conditions: A list of 3-tuples C{(start, end, value)},
            each of which specifies that the property of at least one
            token between M{n}+C{start} and M{n}+C{end} (inclusive) is
            C{value}.
        @raise ValueError: If C{start}>C{end} for any condition.
        """
        assert self.__class__ != ProximateTokensRule, \
               "ProximateTokensRule is an abstract base class"

        self._original = original_tag
        self._replacement = replacement_tag
        self._conditions = conditions
        for (s, e, v) in conditions:
            if s > e:
                raise ValueError('Condition %s has an invalid range' %
                                 ((s, e, v),))

    # Make Brill rules look nice in YAML.
    @classmethod
    def to_yaml(cls, dumper, data):
        node = dumper.represent_mapping(cls.yaml_tag, dict(
            description=str(data),
            conditions=list(list(x) for x in data._conditions),
            original=data._original,
            replacement=data._replacement))
        return node

    @classmethod
    def from_yaml(cls, loader, node):
        # Local renamed from 'map', which shadowed the builtin.
        mapping = loader.construct_mapping(node, deep=True)
        return cls(mapping['original'], mapping['replacement'],
                   *(tuple(x) for x in mapping['conditions']))

    @staticmethod
    def extract_property(token):
        """
        Returns some property characterizing this token, such as its
        base lexical item or its tag.

        Each implementation of this method should correspond to an
        implementation of the method with the same name in a subclass
        of L{ProximateTokensTemplate}.

        @param token: The token
        @type token: Token
        @return: The property
        @rtype: any
        """
        assert False, "ProximateTokensRule is an abstract interface"

    def apply_at(self, tokens, positions):
        # Inherit docs from BrillRuleI

        # Find all locations where the rule is applicable.
        change = [i for i in positions if self.applies(tokens, i)]

        # Make the changes.  Note: this must be done in a separate
        # step from finding applicable locations, since we don't want
        # the rule to interact with itself.
        for i in change:
            (token, tag) = tokens[i]
            tokens[i] = (token, self._replacement)

        return change

    def applies(self, tokens, index):
        # Inherit docs from BrillRuleI

        # Does the given token have this rule's "original tag"?
        if tokens[index][1] != self._original:
            return False

        # Check to make sure that every condition holds.
        for (start, end, val) in self._conditions:
            # Find the (absolute) start and end indices.
            s = max(0, index + start)
            e = min(index + end + 1, len(tokens))

            # Look for *any* token that satisfies the condition.
            for i in range(s, e):
                if self.extract_property(tokens[i]) == val:
                    break
            else:
                # No token satisfied the condition; return false.
                return False

        # Every condition checked out, so the rule is applicable.
        return True

    def original_tag(self):
        # Inherit docs from BrillRuleI
        return self._original

    def replacement_tag(self):
        # Inherit docs from BrillRuleI
        return self._replacement

    def __eq__(self, other):
        # Identity test against None (was 'other != None').
        return (other is not None and
                other.__class__ == self.__class__ and
                self._original == other._original and
                self._replacement == other._replacement and
                self._conditions == other._conditions)

    def __hash__(self):
        return hash((self._original, self._replacement, self._conditions,
                     self.__class__.__name__))

    def __repr__(self):
        conditions = ' and '.join(['%s in %d...%d' % (v, s, e)
                                   for (s, e, v) in self._conditions])
        return '<%s: %s->%s if %s>' % (self.__class__.__name__,
                                       self._original, self._replacement,
                                       conditions)

    def __str__(self):
        replacement = '%s -> %s' % (self._original, self._replacement)
        if len(self._conditions) == 0:
            conditions = ''
        else:
            conditions = ' if ' + ', and '.join([self._condition_to_str(c)
                                                 for c in self._conditions])
        return replacement + conditions

    def _condition_to_str(self, condition):
        """
        Return a string representation of the given condition.
        This helper method is used by L{__str__}.
        """
        (start, end, value) = condition
        return ('the %s of %s is %r' %
                (self.PROPERTY_NAME, self._range_to_str(start, end), value))

    def _range_to_str(self, start, end):
        """
        Return a string representation for the given range.  This
        helper method is used by L{__str__}.
        """
        if start == end == 0:
            return 'this word'
        if start == end == -1:
            return 'the preceding word'
        elif start == end == 1:
            return 'the following word'
        elif start == end and start < 0:
            return 'word i-%d' % -start
        elif start == end and start > 0:
            return 'word i+%d' % start
        else:
            if start >= 0: start = '+%d' % start
            if end >= 0: end = '+%d' % end
            return 'words i%s...i%s' % (start, end)
365
class ProximateTagsRule(ProximateTokensRule):
    """
    A rule whose condition inspects the tags of nearby tokens.
    @see: superclass L{ProximateTokensRule} for details.
    @see: L{ProximateTagsTemplate}, which generates these rules.
    """
    PROPERTY_NAME = 'tag'  # used when rendering rules as text
    yaml_tag = '!ProximateTagsRule'

    @staticmethod
    def extract_property(token):
        """@return: The given token's tag."""
        return token[1]
class ProximateWordsRule(ProximateTokensRule):
    """
    A rule whose condition inspects the base types of nearby tokens.
    @see: L{ProximateTokensRule} for details.
    @see: L{ProximateWordsTemplate}, which generates these rules.
    """
    PROPERTY_NAME = 'text'  # used when rendering rules as text
    yaml_tag = '!ProximateWordsRule'

    @staticmethod
    def extract_property(token):
        """@return: The given token's text."""
        return token[0]

######################################################################
## Brill Templates
######################################################################
class BrillTemplateI(object):
    """
    An interface for generating lists of transformational rules that
    apply at given corpus positions.  C{BrillTemplateI} is used by
    C{Brill} training algorithms to generate candidate rules.
    """
    def __init__(self):
        # Call-form raise; the old 'raise AssertionError, "..."'
        # statement is a syntax error under Python 3.
        raise AssertionError("BrillTemplateI is an abstract interface")

    def applicable_rules(self, tokens, i, correctTag):
        """
        Return a list of the transformational rules that would correct
        the C{i}th subtoken's tag in the given token.  In particular,
        return a list of zero or more rules that would change
        C{tagged_tokens[i][1]} to C{correctTag}, if applied
        to C{token}.

        If the C{i}th subtoken already has the correct tag (i.e., if
        C{tagged_tokens[i][1]} == C{correctTag}), then
        C{applicable_rules} should return the empty list.

        @param tokens: The tagged tokens being tagged.
        @type tokens: C{list} of C{tuple}
        @param i: The index of the token whose tag should be corrected.
        @type i: C{int}
        @param correctTag: The correct tag for the C{i}th token.
        @type correctTag: (any)
        @rtype: C{list} of L{BrillRuleI}
        """
        raise AssertionError("BrillTemplateI is an abstract interface")

    def get_neighborhood(self, token, index):
        """
        Returns the set of indices C{i} such that
        C{applicable_rules(token, index, ...)} depends on the value of
        the C{i}th subtoken of C{token}.

        This method is used by the "fast" Brill tagger trainer.

        @param token: The tokens being tagged.
        @type token: C{list} of C{tuple}
        @param index: The index whose neighborhood should be returned.
        @type index: C{int}
        @rtype: C{Set}
        """
        raise AssertionError("BrillTemplateI is an abstract interface")
442
class ProximateTokensTemplate(BrillTemplateI):
    """
    A brill template that generates a list of
    L{ProximateTokensRule}s applying at a given corpus position.
    In particular, each C{ProximateTokensTemplate} is parameterized
    by a proximate token brill rule class and a list of boundaries,
    and generates all rules that:

      - use the given brill rule class
      - use the given list of boundaries as the C{start} and C{end}
        points for their conditions
      - are applicable to the given token.
    """
    def __init__(self, rule_class, *boundaries):
        """
        Construct a template for generating proximate token brill
        rules.

        @type rule_class: C{class}
        @param rule_class: The proximate token brill rule class that
            should be used to generate new rules.  This class must be
            a subclass of L{ProximateTokensRule}.
        @type boundaries: C{tuple} of C{(int, int)}
        @param boundaries: A list of tuples C{(start, end)}, each of
            which specifies a range for which a condition should be
            created by each rule.
        @raise ValueError: If C{start}>C{end} for any boundary.
        """
        self._rule_class = rule_class
        self._boundaries = boundaries
        for (lo, hi) in boundaries:
            if lo > hi:
                raise ValueError('Boundary %s has an invalid range' %
                                 ((lo, hi),))

    def applicable_rules(self, tokens, index, correct_tag):
        # Nothing to correct if the tag is already right.
        if tokens[index][1] == correct_tag:
            return []

        # For each boundary, gather the conditions that actually hold
        # around the given position.
        per_boundary = [self._applicable_conditions(tokens, index, lo, hi)
                        for (lo, hi) in self._boundaries]

        # Cross-product of the per-boundary condition sets: e.g. for
        # [[A,B], [C,D]] produce [[A,C], [A,D], [B,C], [B,D]].
        combos = [[]]
        for conditions in per_boundary:
            combos = [prefix + [cond]
                      for prefix in combos
                      for cond in conditions]

        # Turn each condition set into a concrete rule.
        return [self._rule_class(tokens[index][1], correct_tag, *combo)
                for combo in combos]

    def _applicable_conditions(self, tokens, index, start, end):
        """
        @return: A set of all conditions for proximate token rules
        that are applicable to C{tokens[index]}, given boundaries of
        C{(start, end)}.  I.e., return a list of all tuples C{(start,
        end, M{value})}, such that the property value of at least one
        token between M{index+start} and M{index+end} (inclusive) is
        M{value}.
        """
        lo = max(0, index + start)
        hi = min(index + end + 1, len(tokens))
        return set((start, end, self._rule_class.extract_property(tokens[i]))
                   for i in range(lo, hi))

    def get_neighborhood(self, tokens, index):
        # inherit docs from BrillTemplateI
        sites = set([index])
        for (start, end) in self._boundaries:
            lo = max(0, index + start)
            hi = min(index + end + 1, len(tokens))
            sites.update(range(lo, hi))
        return sites
528
class SymmetricProximateTokensTemplate(BrillTemplateI):
    """
    Simulates two L{ProximateTokensTemplate}s which are symmetric
    across the location of the token.  For rules of the form \"If the
    M{n}th token is tagged C{A}, and any tag preceding B{or} following
    the M{n}th token by a distance between M{x} and M{y} is C{B}, and
    ... , then change the tag of the nth token from C{A} to C{C}.\"

    One C{ProximateTokensTemplate} is formed by passing in the
    same arguments given to this class's constructor: tuples
    representing intervals in which a tag may be found.  The other
    C{ProximateTokensTemplate} is constructed with the negative
    of all the arguments in reversed order.  For example, a
    C{SymmetricProximateTokensTemplate} using the pair (-2,-1) and the
    constructor C{ProximateTagsTemplate} generates the same rules as a
    C{ProximateTagsTemplate} using (-2,-1) plus a second
    C{ProximateTagsTemplate} using (1,2).

    This is useful because we typically don't want templates to
    specify only \"following\" or only \"preceding\"; we'd like our
    rules to be able to look in either direction.
    """
    def __init__(self, rule_class, *boundaries):
        """
        Construct a template for generating proximate token brill
        rules.

        @type rule_class: C{class}
        @param rule_class: The proximate token brill rule class that
            should be used to generate new rules.  This class must be
            a subclass of L{ProximateTokensRule}.
        @type boundaries: C{tuple} of C{(int, int)}
        @param boundaries: A list of tuples C{(start, end)}, each of
            which specifies a range for which a condition should be
            created by each rule.
        @raise ValueError: If C{start}>C{end} for any boundary.
        """
        self._ptt1 = ProximateTokensTemplate(rule_class, *boundaries)
        # Mirror each interval to the opposite side of the token.
        # (Local renamed from 'reversed', which shadowed the builtin.)
        mirrored = [(-e, -s) for (s, e) in boundaries]
        self._ptt2 = ProximateTokensTemplate(rule_class, *mirrored)

    # Generates lists of a subtype of ProximateTokensRule.
    def applicable_rules(self, tokens, index, correctTag):
        """
        See L{BrillTemplateI} for full specifications.

        @rtype: list of ProximateTokensRule
        """
        return (self._ptt1.applicable_rules(tokens, index, correctTag) +
                self._ptt2.applicable_rules(tokens, index, correctTag))

    def get_neighborhood(self, tokens, index):
        # inherit docs from BrillTemplateI
        n1 = self._ptt1.get_neighborhood(tokens, index)
        n2 = self._ptt2.get_neighborhood(tokens, index)
        return n1.union(n2)
585 586 ###################################################################### 587 ## Brill Tagger Trainer 588 ###################################################################### 589
class BrillTrainer(object):
    """
    A trainer for brill taggers.
    """
    def __init__(self, initial_tagger, templates, trace=0):
        self._initial_tagger = initial_tagger
        self._templates = templates
        self._trace = trace

    #////////////////////////////////////////////////////////////
    # Training
    #////////////////////////////////////////////////////////////

    def train(self, train_tokens, max_rules=200, min_score=2):
        """
        Trains the Brill tagger on the corpus C{train_tokens},
        producing at most C{max_rules} transformations, each of which
        reduces the net number of errors in the corpus by at least
        C{min_score}.

        @type train_tokens: C{list} of L{tuple}
        @param train_tokens: The corpus of tagged tokens
        @type max_rules: C{int}
        @param max_rules: The maximum number of transformations to be created
        @type min_score: C{int}
        @param min_score: The minimum acceptable net error reduction
            that each transformation must produce in the corpus.
        """
        # print() calls are used throughout for Python 3 compatibility
        # (the original used Python 2 print statements).
        if self._trace > 0:
            print("Training Brill tagger on %d tokens..." %
                  len(train_tokens))

        # Create a new copy of the training tokens, and run the
        # initial tagger on it.  We will progressively update this
        # test tagging to look more like the training tagging.
        test_tokens = list(self._initial_tagger.tag(t[0]
                                                    for t in train_tokens))

        if self._trace > 2: self._trace_header()

        # Look for useful rules.
        rules = []
        try:
            while len(rules) < max_rules:
                (rule, score, fixscore) = self._best_rule(test_tokens,
                                                          train_tokens)
                if rule is None or score < min_score:
                    if self._trace > 1:
                        print('Insufficient improvement; stopping')
                    break
                else:
                    # Add the rule to our list of rules.
                    rules.append(rule)
                    # Use the rule to update the test tokens.
                    changed = rule.apply_to(test_tokens)
                    # Display trace output.
                    if self._trace > 1:
                        self._trace_rule(rule, score, fixscore,
                                         len(changed))
        # The user can also cancel training manually:
        except KeyboardInterrupt:
            pass

        # Create and return a tagger from the rules we found.
        return Brill(self._initial_tagger, rules)

    #////////////////////////////////////////////////////////////
    # Finding the best rule
    #////////////////////////////////////////////////////////////

    # Finds the rule that makes the biggest net improvement in the
    # corpus.  Returns a (rule, score, fixscore) triple.
    def _best_rule(self, test_tokens, train_tokens):

        # Create a dictionary mapping from each tag to a list of the
        # indices that have that tag in both test_tokens and
        # train_tokens (i.e., where it is correctly tagged).
        correct_indices = {}
        for i in range(len(test_tokens)):
            if test_tokens[i][1] == train_tokens[i][1]:
                tag = test_tokens[i][1]
                correct_indices.setdefault(tag, []).append(i)

        # Find all the rules that correct at least one token's tag,
        # and the number of tags that each rule corrects (in
        # descending order of number of tags corrected).
        rules = self._find_rules(test_tokens, train_tokens)

        # Keep track of the current best rule, and its score.
        best_rule, best_score, best_fixscore = None, 0, 0

        # Consider each rule, in descending order of fixscore (the
        # number of tags that the rule corrects, not including the
        # number that it breaks).
        for (rule, fixscore) in rules:
            # The actual score must be <= fixscore; so if best_score
            # is bigger than fixscore, then we already have the best
            # rule.
            if best_score >= fixscore:
                return best_rule, best_score, best_fixscore

            # Calculate the actual score, by decrementing fixscore
            # once for each tag that the rule changes to an incorrect
            # value.
            score = fixscore
            if rule.original_tag() in correct_indices:  # was has_key()
                for i in correct_indices[rule.original_tag()]:
                    if rule.applies(test_tokens, i):
                        score -= 1
                        # If the score goes below best_score, then we
                        # know this isn't the best rule; so move on:
                        if score <= best_score: break

            # If the actual score is better than the best score, then
            # update best_score and best_rule.
            if score > best_score:
                best_rule, best_score, best_fixscore = rule, score, fixscore

        # Return the best rule, and its score.
        return best_rule, best_score, best_fixscore

    def _find_rules(self, test_tokens, train_tokens):
        """
        Find all rules that correct at least one token's tag in
        C{test_tokens}.

        @return: A list of tuples C{(rule, fixscore)}, where C{rule}
            is a brill rule and C{fixscore} is the number of tokens
            whose tag the rule corrects.  Note that C{fixscore} does
            I{not} include the number of tokens whose tags are changed
            to incorrect values.
        """
        # Create a dictionary mapping from rules to their
        # positive-only scores.
        rule_score_dict = {}
        for i in range(len(test_tokens)):
            for rule in self._find_rules_at(test_tokens, train_tokens, i):
                rule_score_dict[rule] = rule_score_dict.get(rule, 0) + 1

        # Convert the dictionary into a list of (rule, score) tuples,
        # sorted in descending order of score.  Sorting on the score
        # alone (via key=) avoids comparing rule objects directly,
        # which has no defined ordering.  (Order among equal-score
        # rules was already arbitrary in the original.)
        return sorted(rule_score_dict.items(),
                      key=lambda rule_score: -rule_score[1])

    def _find_rules_at(self, test_tokens, train_tokens, i):
        """
        @rtype: C{Set}
        @return: the set of all rules (based on the templates) that
            correct token C{i}'s tag in C{test_tokens}.
        """
        applicable_rules = set()
        if test_tokens[i][1] != train_tokens[i][1]:
            correct_tag = train_tokens[i][1]
            for template in self._templates:
                new_rules = template.applicable_rules(test_tokens, i,
                                                      correct_tag)
                applicable_rules.update(new_rules)

        return applicable_rules

    #////////////////////////////////////////////////////////////
    # Tracing
    #////////////////////////////////////////////////////////////

    def _trace_header(self):
        # NOTE(review): exact column alignment of this banner was lost
        # in transit; it is cosmetic trace output only.
        print("""
           B      |
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
        """.rstrip())

    def _trace_rule(self, rule, score, fixscore, numchanges):
        if self._trace > 2:
            # end=' ' keeps the rule on the same line (the Python 2
            # original used a trailing comma here).
            print('%4d%4d%4d%4d ' % (score, fixscore, fixscore - score,
                                     numchanges - fixscore * 2 + score),
                  '|', end=' ')
        print(rule)
780 781 ###################################################################### 782 ## Fast Brill Tagger Trainer 783 ###################################################################### 784
785 -class FastBrillTrainer(object):
786 """ 787 A faster trainer for brill taggers. 788 """
789 - def __init__(self, initial_tagger, templates, trace=0):
790 self._initial_tagger = initial_tagger 791 self._templates = templates 792 self._trace = trace
793 794 #//////////////////////////////////////////////////////////// 795 # Training 796 #//////////////////////////////////////////////////////////// 797
798 - def train(self, train_tokens, max_rules=200, min_score=2):
799 800 # If TESTING is true, extra computation is done to determine whether 801 # each "best" rule actually reduces net error by the score it received. 802 TESTING = False 803 804 # Basic idea: Keep track of the rules that apply at each position. 805 # And keep track of the positions to which each rule applies. 806 807 # The set of somewhere-useful rules that apply at each position 808 rulesByPosition = [] 809 for i in range(len(train_tokens)): 810 rulesByPosition.append(set()) 811 812 # Mapping somewhere-useful rules to the positions where they apply. 813 # Then maps each position to the score change the rule generates there. 814 # (always -1, 0, or 1) 815 positionsByRule = {} 816 817 # Map scores to sets of rules known to achieve *at most* that score. 818 rulesByScore = {0:{}} 819 # Conversely, map somewhere-useful rules to their minimal scores. 820 ruleScores = {} 821 822 tagIndices = {} # Lists of indices, mapped to by their tags 823 824 # Maps rules to the first index in the corpus where it may not be known 825 # whether the rule applies. (Rules can't be chosen for inclusion 826 # unless this value = len(corpus). But most rules are bad, and 827 # we won't need to check the whole corpus to know that.) 828 # Some indices past this may actually have been checked; it just isn't 829 # guaranteed. 830 firstUnknownIndex = {} 831 832 # Make entries in the rule-mapping dictionaries. 833 # Should be called before _updateRuleApplies. 834 def _initRule (rule): 835 positionsByRule[rule] = {} 836 rulesByScore[0][rule] = None 837 ruleScores[rule] = 0 838 firstUnknownIndex[rule] = 0
839 840 # Takes a somewhere-useful rule which applies at index i; 841 # Updates all rule data to reflect that the rule so applies. 842 def _updateRuleApplies (rule, i): 843 844 # If the rule is already known to apply here, ignore. 845 # (This only happens if the position's tag hasn't changed.) 846 if positionsByRule[rule].has_key(i): 847 return 848 849 if rule.replacement_tag() == train_tokens[i][1]: 850 positionsByRule[rule][i] = 1 851 elif rule.original_tag() == train_tokens[i][1]: 852 positionsByRule[rule][i] = -1 853 else: # was wrong, remains wrong 854 positionsByRule[rule][i] = 0 855 856 # Update rules in the other dictionaries 857 del rulesByScore[ruleScores[rule]][rule] 858 ruleScores[rule] += positionsByRule[rule][i] 859 if not rulesByScore.has_key(ruleScores[rule]): 860 rulesByScore[ruleScores[rule]] = {} 861 rulesByScore[ruleScores[rule]][rule] = None 862 rulesByPosition[i].add(rule)
# Takes a rule which no longer applies at index i; updates all rule
# data (score buckets, score table, per-position records) to reflect
# that the rule doesn't apply there.
def _updateRuleNotApplies (rule, i):
    # Pull the rule out of its current score bucket *before* the score
    # changes, then re-file it under the corrected score.
    del rulesByScore[ruleScores[rule]][rule]
    # Subtract this position's former contribution (+1, -1 or 0).
    ruleScores[rule] -= positionsByRule[rule][i]
    if not rulesByScore.has_key(ruleScores[rule]):
        rulesByScore[ruleScores[rule]] = {}
    rulesByScore[ruleScores[rule]][rule] = None

    # Forget the (rule, i) association on both sides.
    del positionsByRule[rule][i]
    rulesByPosition[i].remove(rule)
    # Optional addition: if the rule now applies nowhere, delete
    # all its dictionary entries.

# Run the initial tagger over the bare words of the training corpus.
tagged_tokens = list(self._initial_tagger.tag(t[0] for t in train_tokens))

# First sort the corpus by tag, and also note where the errors are.
# tagIndices maps each tag to a sorted list of the corpus indices
# currently carrying that tag (kept sorted so bisect works below).
errorIndices = []  # only used in initialization
for i in range(len(tagged_tokens)):
    tag = tagged_tokens[i][1]
    if tag != train_tokens[i][1]:
        errorIndices.append(i)
    if not tagIndices.has_key(tag):
        tagIndices[tag] = []
    tagIndices[tag].append(i)

print "Finding useful rules..."
# Collect all rules that fix any errors, with their positive scores.
for i in errorIndices:
    for template in self._templates:
        # Find the templated rules that could fix the error.
        for rule in template.applicable_rules(tagged_tokens, i,
                                              train_tokens[i][1]):
            if not positionsByRule.has_key(rule):
                _initRule(rule)
            _updateRuleApplies(rule, i)

print "Done initializing %i useful rules." %len(positionsByRule)

if TESTING:
    after = -1 # bug-check only

# Each iteration through the loop tries a new maxScore.
maxScore = max(rulesByScore.keys())
rules = []
while len(rules) < max_rules and maxScore >= min_score:

    # Find the next best rule.  This is done by repeatedly taking a
    # rule with the highest score and stepping through the corpus to
    # see where it applies.  When it makes an error (decreasing its
    # score) it's bumped down, and we try a new rule with the highest
    # score.  When we find a rule which has the highest score AND
    # which has been tested against the entire corpus, we can
    # conclude that it's the next best rule.

    bestRule = None
    bestRules = rulesByScore[maxScore].keys()

    for rule in bestRules:
        # Find the first relevant index at or following the first
        # unknown index.  (Only check indices with the right tag.)
        ti = bisect.bisect_left(tagIndices[rule.original_tag()],
                                firstUnknownIndex[rule])
        for nextIndex in tagIndices[rule.original_tag()][ti:]:
            if rule.applies(tagged_tokens, nextIndex):
                _updateRuleApplies(rule, nextIndex)
                if ruleScores[rule] < maxScore:
                    firstUnknownIndex[rule] = nextIndex+1
                    break # the _update demoted the rule

        # If we checked all remaining indices and found no more errors:
        if ruleScores[rule] == maxScore:
            firstUnknownIndex[rule] = len(tagged_tokens) # i.e., we checked them all
            print "%i) %s (score: %i)" %(len(rules)+1, rule, maxScore)
            bestRule = rule
            break

    if bestRule == None: # all rules dropped below maxScore
        del rulesByScore[maxScore]
        maxScore = max(rulesByScore.keys())
        continue # with next-best rules

    # bug-check only
    if TESTING:
        before = len(_errorPositions(tagged_tokens, train_tokens))
        print "There are %i errors before applying this rule." %before
        assert after == -1 or before == after, \
                "after=%i but before=%i" %(after,before)

    print "Applying best rule at %i locations..." \
                %len(positionsByRule[bestRule].keys())

    # If we reach this point, we've found a new best rule.
    # Apply the rule at the relevant sites.
    # (apply_at is a little inefficient here, since we know the rule applies
    # and don't actually need to test it again.)
    rules.append(bestRule)
    bestRule.apply_at(tagged_tokens, positionsByRule[bestRule].keys())

    # Update the tag index accordingly.
    for i in positionsByRule[bestRule].keys(): # where it applied
        # Update positions of tags.
        # First, find and delete the index for i from the old tag.
        oldIndex = bisect.bisect_left(tagIndices[bestRule.original_tag()], i)
        del tagIndices[bestRule.original_tag()][oldIndex]

        # Then, insert i into the (sorted) index list of the new tag.
        if not tagIndices.has_key(bestRule.replacement_tag()):
            tagIndices[bestRule.replacement_tag()] = []
        newIndex = bisect.bisect_left(tagIndices[bestRule.replacement_tag()], i)
        tagIndices[bestRule.replacement_tag()].insert(newIndex, i)

    # This part is tricky.
    # We need to know which sites might now require new rules -- that
    # is, which sites are close enough to the changed site so that
    # a template might now generate different rules for it.
    # Only the templates can know this.
    #
    # If a template now generates a different set of rules, we have
    # to update our indices to reflect that.
    print "Updating neighborhoods of changed sites.\n"

    # First, collect all the indices that might get new rules.
    neighbors = set()
    for i in positionsByRule[bestRule].keys(): # sites changed
        for template in self._templates:
            neighbors.update(template.get_neighborhood(tagged_tokens, i))

    # Then collect the new set of rules for each such index.
    # c, d, e count obsolete applications, new applications, and
    # never-before-seen rules, for the progress report below.
    c = d = e = 0
    for i in neighbors:
        siteRules = set()
        for template in self._templates:
            # Get a set of the rules that the template now generates
            siteRules.update(set(template.applicable_rules(
                tagged_tokens, i, train_tokens[i][1])))

        # Update rules no longer generated here by any template
        for obsolete in rulesByPosition[i] - siteRules:
            c += 1
            _updateRuleNotApplies(obsolete, i)

        # Update rules only now generated by this template
        for newRule in siteRules - rulesByPosition[i]:
            d += 1
            if not positionsByRule.has_key(newRule):
                e += 1
                _initRule(newRule) # make a new rule w/score=0
            _updateRuleApplies(newRule, i) # increment score, etc.

    if TESTING:
        after = before - maxScore
    print "%i obsolete rule applications, %i new ones, " %(c,d)+ \
          "using %i previously-unseen rules." %e

    maxScore = max(rulesByScore.keys()) # may have gone up

# NOTE(review): the next two statements look vestigial -- the trace
# message announces training *after* the loop has completed, and
# rules_by_position is built but never used afterwards.  Confirm
# against version history before removing.
if self._trace > 0: print ("Training Brill tagger on %d tokens..." %
                           len(train_tokens))

# Maintain a list of the rules that apply at each position.
rules_by_position = [{} for tok in train_tokens]

# Create and return a tagger from the rules we found.
return Brill(self._initial_tagger, rules)
1035 -def _errorPositions (train_tokens, tokens):
1036 return [i for i in range(len(tokens)) 1037 if tokens[i][1] != 1038 train_tokens[i][1] ]
1039 1040 # returns a list of errors in string format
def errorList (train_tokens, tokens, radius=2):
    """
    Returns a list of human-readable strings indicating the errors in the
    given tagging of the corpus.

    @param train_tokens: The correct tagging of the corpus
    @type train_tokens: C{list} of C{tuple}
    @param tokens: The tagged corpus
    @type tokens: C{list} of C{tuple}
    @param radius: How many tokens on either side of a wrongly-tagged token
        to include in the error string.  For example, if C{radius}=2, each
        error string will show the incorrect token plus two tokens on either
        side.
    @type radius: int
    """
    descriptions = []
    total = len(tokens)
    for i in range(total):
        if tokens[i][1] == train_tokens[i][1]:
            continue  # correctly tagged -- nothing to report
        # Header: wrong tag -> correct tag.
        pieces = [tokens[i][1].rjust(3) + " -> "
                  + train_tokens[i][1].rjust(3) + ":"]
        # Context window, clipped to the corpus boundaries.
        lo = max(i - radius, 0)
        hi = min(i + radius + 1, total)
        for j in range(lo, hi):
            word = tokens[j][0]
            tag = tokens[j][1]
            if word == tag:
                shown = word  # don't print punctuation tags
            else:
                shown = word + "/" + tag
            if j == i:
                pieces.append("**" + shown + "**")
            else:
                pieces.append(shown)
        descriptions.append(" ".join(pieces) + " ")
    return descriptions
1074 1075 ##################################################################################### 1076 # Demonstration 1077 ##################################################################################### 1078
1079 -def demo(num_sents=100, max_rules=200, min_score=3, error_output = "errors.out", 1080 rule_output="rules.yaml", randomize=False, train=.8, trace=3):
1081 """ 1082 Brill Tagger Demonstration 1083 1084 @param num_sents: how many sentences of training and testing data to use 1085 @type num_sents: L{int} 1086 @param max_rules: maximum number of rule instances to create 1087 @type max_rules: L{int} 1088 @param min_score: the minimum score for a rule in order for it to be considered 1089 @type min_score: L{int} 1090 @param error_output: the file where errors will be saved 1091 @type error_output: L{string} 1092 @param rule_output: the file where rules will be saved 1093 @type rule_output: L{string} 1094 @param randomize: whether the training data should be a random subset of the corpus 1095 @type randomize: L{boolean} 1096 @param train: the fraction of the the corpus to be used for training (1=all) 1097 @type train: L{float} 1098 @param trace: the level of diagnostic tracing output to produce (0-3) 1099 @type trace: L{int} 1100 """ 1101 1102 from nltk_lite.corpora import treebank 1103 from nltk_lite import tag 1104 from nltk_lite.tag import brill 1105 1106 NN_CD_tagger = tag.Regexp([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')]) 1107 1108 # train is the proportion of data used in training; the rest is reserved 1109 # for testing. 1110 1111 print "Loading tagged data..." 1112 sents = list(treebank.tagged()) 1113 if randomize: 1114 random.seed(len(sents)) 1115 random.shuffle(sents) 1116 1117 tagged_data = [t for s in sents[:num_sents] for t in s] 1118 cutoff = int(len(tagged_data)*train) 1119 1120 training_data = tagged_data[:cutoff] 1121 gold_data = tagged_data[cutoff:] 1122 1123 testing_data = [t[0] for t in gold_data] 1124 1125 # Unigram tagger 1126 1127 print "Training unigram tagger:", 1128 u = tag.Unigram(backoff=NN_CD_tagger) 1129 1130 # NB training and testing are required to use a list-of-lists structure, 1131 # so we wrap the flattened corpus data with the extra list structure. 
1132 u.train([training_data]) 1133 print("[accuracy: %f]" % tag.accuracy(u, [gold_data])) 1134 1135 # Brill tagger 1136 1137 templates = [ 1138 brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,1)), 1139 brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2,2)), 1140 brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,2)), 1141 brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,3)), 1142 brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,1)), 1143 brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2,2)), 1144 brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,2)), 1145 brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,3)), 1146 brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1,1)), 1147 brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1,1)), 1148 ] 1149 1150 #trainer = brill.FastBrillTrainer(u, templates, trace) 1151 trainer = brill.BrillTrainer(u, templates, trace) 1152 b = trainer.train(training_data, max_rules, min_score) 1153 1154 print 1155 print("Brill accuracy: %f" % tag.accuracy(b, [gold_data])) 1156 1157 print("\nRules: ") 1158 for rule in b.rules(): 1159 print(str(rule)) 1160 1161 printRules = file(rule_output, 'w') 1162 yaml.dump(b, printRules) 1163 printRules.close() 1164 1165 testing_data = list(b.tag(testing_data)) 1166 el = errorList(gold_data, testing_data) 1167 errorFile = file(error_output, 'w') 1168 1169 for e in el: 1170 errorFile.write(e+"\n\n") 1171 errorFile.close() 1172 print "Done; rules and errors saved to %s and %s." % (rule_output, error_output)
# When executed as a script, run the end-to-end demonstration with
# its default settings.
if __name__ == '__main__':
    demo()