9 """
10 Classes and interfaces for tagging each token of a document with
11 supplementary information, such as its part of speech or its WordNet
12 synset tag. This task, which is known as X{tagging}, is defined by
13 the L{TagI} interface.
14 """

import types, re
from nltk_lite.probability import FreqDist, ConditionalFreqDist
from nltk_lite.tag import *


class Ngram(SequentialBackoff):
    """
    An I{n}-gram stochastic tagger.  Before a C{tagger.Ngram} can be
    used, it should be trained on a tagged corpus.  Using this
    training data, it constructs a frequency distribution describing
    the frequencies with which each word is tagged in different
    contexts.  The context considered consists of the word to be
    tagged and the I{n-1} previous words' tags.  Once the tagger has
    been trained, it tags words by assigning each word the tag with
    the maximum frequency given its context.  If the C{tagger.Ngram}
    encounters a word in a context for which it has no data, it
    consults its backoff tagger, if one was given; otherwise it
    assigns the tag C{None}.
    """
    def __init__(self, n, cutoff=1, backoff=None):
        """
        Construct an I{n}-gram stochastic tagger.  The tagger must be
        trained using the L{train()} method before being used to tag
        data.

        @param n: The order of the new C{tagger.Ngram}.
        @type n: C{int}
        @param cutoff: A count-cutoff for the tagger's frequency
            distribution.  If the tagger saw C{cutoff} or fewer
            examples of a given context in training, it will consult
            its backoff tagger (or return C{None}) for that context.
        @type cutoff: C{int}
        @param backoff: The tagger to consult for any context that
            this tagger's model does not cover.
        @type backoff: L{SequentialBackoff}
        """
        if n < 2: raise ValueError('n must be greater than 1')
        self._model = {}
        self._n = n
        self._cutoff = cutoff
        self._history = [None] * (n-1)
        self._backoff = backoff

    def train(self, tagged_corpus, verbose=False):
        """
        Train this C{tagger.Ngram} using the given training data.

        @param tagged_corpus: A tagged corpus.  Each item should be
            a C{list} of tagged tokens, where each tagged token is a
            C{(text, tag)} pair.
        @type tagged_corpus: C{list} or C{iter(list)}
        """
        if self.size() != 0:
            raise ValueError('Tagger is already trained')
        token_count = hit_count = 0
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            self._history = [None] * (self._n-1)
            for (token, tag) in sentence:
                token_count += 1
                fd[(tuple(self._history), token)].inc(tag)
                # Slide the tag history window forward by one token.
                del self._history[0]
                self._history.append(tag)
        for context in fd.conditions():
            (history, token) = context
            best_tag = fd[context].max()
            backoff_tag = self._backoff_tag_one(token, history)
            hits = fd[context].count(best_tag)
            # Only record this context if it disagrees with the backoff
            # tagger and was seen more than cutoff times in training.
            if best_tag != backoff_tag and hits > self._cutoff:
                self._model[context] = best_tag
                hit_count += hits

        if verbose:
            size = len(self._model)
            # backoff: percentage of training tokens left to the backoff
            # tagger; pruning: percentage of observed contexts discarded.
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained %d-gram tagger:" % self._n,
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)

    def tag_one(self, token, history=None):
        if self.size() == 0:
            raise ValueError('Tagger is not trained')
        if history:
            if len(history) >= (self._n-1):
                self._history = list(history[-(self._n-1):])
            else:
                # Pad short histories with None so the context always
                # has length n-1.
                self._history = [None] * (self._n - 1 - len(history)) + list(history)

        history = tuple(self._history)
        context = (history, token)

        if context in self._model:
            return self._model[context]
        if self._backoff:
            return self._backoff.tag_one(token, history)
        return None

    def size(self):
        return len(self._model)

    def __repr__(self):
        return '<%d-gram Tagger: size=%d, cutoff=%d>' % (
            self._n, self.size(), self._cutoff)
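
# A minimal usage sketch (illustrative only: the sentence, tags and
# expected results below are assumptions, not taken from this module):
#
#     train_sents = [[('the', 'at'), ('dog', 'nn'), ('barked', 'vbd')]]
#     tagger = Ngram(2, cutoff=0)
#     tagger.train(train_sents)
#     tagger.tag_one('dog', ('at',))   # -> 'nn'
#     tagger.tag_one('cat', ('at',))   # -> None (unseen context, no backoff)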

class Bigram(Ngram):
    """A C{tagger.Ngram} of order 2."""
    def __init__(self, cutoff=1, backoff=None):
        Ngram.__init__(self, 2, cutoff, backoff)

class Trigram(Ngram):
    """A C{tagger.Ngram} of order 3."""
    def __init__(self, cutoff=1, backoff=None):
        Ngram.__init__(self, 3, cutoff, backoff)
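
# These thin subclasses let standard backoff chains be assembled
# without passing the order explicitly; for example (illustrative,
# assuming Unigram from nltk_lite.tag accepts the same keywords):
#
#     t = Trigram(backoff=Bigram(backoff=Unigram()))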
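

# A minimal sketch of the _demo_tagger helper used by demo() below,
# assuming nltk_lite.tag provides an accuracy(tagger, gold) function
# (available via the wildcard import above):

def _demo_tagger(tagger, gold):
    # Print the tagger's accuracy on the gold-standard test data.
    acc = accuracy(tagger, gold)
    print 'Accuracy = %4.1f%%' % (100.0 * acc)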

def demo():
    """
    A simple demonstration function for the C{Tagger} classes.  It
    constructs a backoff tagger using a trigram tagger, a bigram
    tagger, a unigram tagger and a default tagger.  It trains and
    tests the taggers using the Brown corpus.
    """
    from nltk_lite.corpora import brown
    import sys

    print 'Training taggers.'

    t0 = Default('nn')

    t1 = Unigram(cutoff=1, backoff=t0)
    t2 = Bigram(cutoff=1, backoff=t1)
    t3 = Trigram(backoff=t2)

    t1.train(brown.tagged('a'), verbose=True)
    t2.train(brown.tagged('a'), verbose=True)
    t3.train(brown.tagged('a'), verbose=True)

    print '='*75
    print 'Running the taggers on test data...'

    print '  Default (nn) tagger: ',
    sys.stdout.flush()
    _demo_tagger(t0, brown.tagged('b'))

    print '  Unigram tagger:      ',
    sys.stdout.flush()
    _demo_tagger(t1, list(brown.tagged('b'))[:1000])

    print '  Bigram tagger:       ',
    sys.stdout.flush()
    _demo_tagger(t2, list(brown.tagged('b'))[:1000])

    print '  Trigram tagger:      ',
    sys.stdout.flush()
    _demo_tagger(t3, list(brown.tagged('b'))[:1000])

if __name__ == '__main__':
    demo()