
Source Code for Module nltk_lite.tag.ngram

# Natural Language Toolkit: N-Gram Taggers
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au> (minor additions)
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT

"""
Classes and interfaces for tagging each token of a document with
supplementary information, such as its part of speech or its WordNet
synset tag.  This task, which is known as X{tagging}, is defined by
the L{TagI} interface.
"""

import types, re
from nltk_lite.probability import FreqDist, ConditionalFreqDist
from nltk_lite.tag import *

##############################################################
# N-GRAM TAGGERS: these make use of history
##############################################################
class Ngram(SequentialBackoff):
    """
    An I{n}-gram stochastic tagger.  Before a C{tagger.Ngram}
    can be used, it should be trained on a tagged corpus.  Using this
    training data, it will construct a frequency distribution
    describing the frequencies with which each word is tagged in
    different contexts.  The context considered consists of the word
    to be tagged and the I{n-1} previous words' tags.  Once the
    tagger has been trained, it uses this frequency distribution to
    tag words by assigning each word the tag with the maximum
    frequency given its context.  If the C{tagger.Ngram} encounters a
    word in a context for which it has no data, it will assign it the
    tag C{None}.
    """
    def __init__(self, n, cutoff=1, backoff=None):
        """
        Construct an I{n}-gram stochastic tagger.  The tagger must be
        trained using the L{train()} method before being used to tag
        data.

        @param n: The order of the new C{tagger.Ngram}.
        @type n: C{int}
        @param cutoff: A count-cutoff for the tagger's frequency
            distribution.  If the tagger saw fewer than C{cutoff}
            examples of a given context in training, then it will
            return a tag of C{None} for that context.
        @type cutoff: C{int}
        @param backoff: The tagger to consult for any context that
            this tagger has no data for.
        """
        if n < 2: raise ValueError('n must be greater than 1')
        self._model = {}
        self._n = n
        self._cutoff = cutoff
        self._history = [None] * (n-1)
        self._backoff = backoff

    def train(self, tagged_corpus, verbose=False):
        """
        Train this C{tagger.Ngram} using the given training data.

        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of tagged tokens, where each token consists of
            C{text} and a C{tag}.
        @type tagged_corpus: C{list} or C{iter(list)}
        """

        if self.size() != 0:
            raise ValueError('Tagger is already trained')
        token_count = hit_count = 0
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            self._history = [None] * (self._n-1)
            for (token, tag) in sentence:
                token_count += 1
                fd[(tuple(self._history), token)].inc(tag)
                del self._history[0]
                self._history.append(tag)
        for context in fd.conditions():
            best_tag = fd[context].max()
            backoff_tag = self._backoff_tag_one(token, tuple(self._history))
            hits = fd[context].count(best_tag)

            # is the tag we would assign different from the backoff tagger
            # and do we have sufficient evidence?
            if best_tag != backoff_tag and hits > self._cutoff:
                self._model[context] = best_tag
                hit_count += hits

        # generate stats: 'backoff' is the percentage of training tokens
        # not accounted for by the model's entries; 'pruning' is the
        # percentage of observed contexts that were not stored
        if verbose:
            size = len(self._model)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained %d-gram tagger:" % self._n,
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)

    def tag_one(self, token, history=None):
        if self.size() == 0:
            raise ValueError('Tagger is not trained')
        if history:
            if len(history) >= (self._n-1):  # restrict to required width
                self._history = list(history[-(self._n-1):])
            else:                            # pad to required width
                self._history = [None] * (self._n - 1 - len(history)) + list(history)

        history = tuple(self._history)
        context = (history, token)

        if context in self._model:
            return self._model[context]
        if self._backoff:
            return self._backoff.tag_one(token, history)
        return None

    def size(self):
        return len(self._model)

    def __repr__(self):
        return '<%d-gram Tagger: size=%d, cutoff=%d>' % (
            self._n, self.size(), self._cutoff)
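
# Illustration of the learned model: each entry maps a context -- the n-1
# preceding tags plus the current word -- to the most frequent tag observed
# for that context during training.  The entry below is hypothetical, not
# drawn from a real corpus; for a trigram tagger the model would contain
# entries of the form:
#
#     {(('at', 'jj'), 'dog'): 'nn', ...}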

class Bigram(Ngram):
    def __init__(self, cutoff=1, backoff=None):
        Ngram.__init__(self, 2, cutoff, backoff)

class Trigram(Ngram):
    def __init__(self, cutoff=1, backoff=None):
        Ngram.__init__(self, 3, cutoff, backoff)
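
# A minimal usage sketch (the toy sentences, tags, and the probe word below
# are illustrative only, not from a real corpus).  cutoff=0 is used so that
# contexts seen only once are still stored in the model:
#
#     train_sents = [
#         [('the', 'at'), ('dog', 'nn'), ('barks', 'vbz')],
#         [('the', 'at'), ('cat', 'nn'), ('sleeps', 'vbz')],
#     ]
#     tagger = Bigram(cutoff=0)
#     tagger.train(train_sents)
#     print tagger.tag_one('dog', ('at',))   # tag given the preceding tag 'at'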

###
#
#    def print_usage_stats(self):
#        total = self._total_count
#        print ' %20s | %s' % ('Tagger', 'Words Tagged')
#        print ' '+'-'*21+'|'+'-'*17
#        for tagger in self._taggers:
#            count = self._tagger_count[tagger]
#            print ' %20s | %4.1f%%' % (tagger, 100.0*count/total)
#
#    def __repr__(self):
#        return '<BackoffTagger: %s>' % self._taggers
###

##//////////////////////////////////////////////////////
##  Demonstration
##//////////////////////////////////////////////////////
def _demo_tagger(tagger, gold):
    from nltk_lite.tag import accuracy
    acc = accuracy(tagger, gold)
    print 'Accuracy = %4.1f%%' % (100.0 * acc)

def demo():
    """
    A simple demonstration function for the C{Tagger} classes.  It
    constructs a backoff tagger using a trigram tagger, a bigram
    tagger, a unigram tagger and a default tagger.  It trains and
    tests the tagger using the Brown corpus.
    """
    from nltk_lite.corpora import brown
    import sys

    print 'Training taggers.'

    # Create a default tagger
    t0 = Default('nn')

    # t1a = Affix(length=-3, minlength=5, backoff=t0)
    # t1b = Unigram(cutoff=2, backoff=t1a)
    t1 = Unigram(cutoff=1, backoff=t0)
    t2 = Bigram(cutoff=1, backoff=t1)
    t3 = Trigram(backoff=t2)

    t1.train(brown.tagged('a'), verbose=True)
    t2.train(brown.tagged('a'), verbose=True)
    t3.train(brown.tagged('a'), verbose=True)

    # Tokenize the testing files
    test_tokens = []
    num_words = 0

    # Run the taggers.  For t1, t2, and t3, back off to the default
    # tagger.  This is especially important for t2 and t3, which count
    # on having known tags as contexts; if they get a context containing
    # None, then they will generate an output of None, and so all words
    # will get a tag of None.

    print '='*75
    print 'Running the taggers on test data...'
    print '  Default (nn) tagger: ',
    sys.stdout.flush()
    _demo_tagger(t0, brown.tagged('b'))

    print '  Unigram tagger:      ',
    sys.stdout.flush()
    _demo_tagger(t1, list(brown.tagged('b'))[:1000])

    print '  Bigram tagger:       ',
    sys.stdout.flush()
    _demo_tagger(t2, list(brown.tagged('b'))[:1000])

    print '  Trigram tagger:      ',
    sys.stdout.flush()
    _demo_tagger(t3, list(brown.tagged('b'))[:1000])
#    print '\nUsage statistics for the trigram tagger:\n'
#    trigram.print_usage_stats()
#    print '='*75

if __name__ == '__main__':
    demo()
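
# A further sketch of the backoff behaviour exercised in demo() (the probe
# word 'xyzzy' and the preceding-tag history are illustrative; 'a' is a
# Brown corpus section, as used above):
#
#     from nltk_lite.corpora import brown
#     t0 = Default('nn')
#     t2 = Bigram(cutoff=1, backoff=t0)
#     t2.train(brown.tagged('a'))
#     # An unseen context falls through to the backoff tagger, which
#     # here always answers 'nn':
#     print t2.tag_one('xyzzy', ('at',))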