
Source Code for Module nltk_lite.evaluate

# Natural Language Toolkit: Evaluation
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT


"""
Utility functions for evaluating processing modules.
"""

import sets, math

def accuracy(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    values, return the fraction of corresponding values that are
    equal.  In particular, return the fraction of indices
    C{0<=i<len(test)} such that C{test[i] == reference[i]}.

    @type reference: C{list}
    @param reference: An ordered list of reference values.
    @type test: C{list}
    @param test: A list of values to compare against the corresponding
        reference values.
    @raise ValueError: If C{reference} and C{test} do not have the
        same length.
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    num_correct = sum(1 for x, y in zip(reference, test) if x == y)
    return float(num_correct) / len(reference)

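# A minimal usage sketch (illustrative values, not part of the original
# module): accuracy() compares the two lists position by position.
#
#     >>> ref = 'DET NN NN VB'.split()
#     >>> test = 'DET NN VB VB'.split()
#     >>> accuracy(ref, test)            # 3 of the 4 positions agree
#     0.75
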
def precision(reference, test):
    """
    Given a set of reference values and a set of test values, return
    the fraction of test values that appear in the reference set.
    In particular, return |C{reference}S{cap}C{test}|/|C{test}|.
    If C{test} is empty, then return C{None}.

    @type reference: C{Set}
    @param reference: A set of reference values.
    @type test: C{Set}
    @param test: A set of values to compare against the reference set.
    @rtype: C{float} or C{None}
    """
    if len(test) == 0:
        return None
    else:
        return float(len(reference.intersection(test))) / len(test)

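# Usage sketch (assumed values): precision measures how much of the
# test set is correct, ignoring anything the test set failed to cover.
#
#     >>> ref_set = sets.Set(['DET', 'NN', 'VB', 'JJ'])
#     >>> test_set = sets.Set(['DET', 'NN'])
#     >>> precision(ref_set, test_set)   # both test values are in ref_set
#     1.0
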
def recall(reference, test):
    """
    Given a set of reference values and a set of test values, return
    the fraction of reference values that appear in the test set.
    In particular, return |C{reference}S{cap}C{test}|/|C{reference}|.
    If C{reference} is empty, then return C{None}.

    @type reference: C{Set}
    @param reference: A set of reference values.
    @type test: C{Set}
    @param test: A set of values to compare against the reference set.
    @rtype: C{float} or C{None}
    """
    if len(reference) == 0:
        return None
    else:
        return float(len(reference.intersection(test))) / len(reference)

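# Usage sketch (same assumed sets as above): recall measures how much
# of the reference set the test set covers.
#
#     >>> recall(ref_set, test_set)      # 2 of the 4 reference values
#     0.5
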
def f_measure(reference, test, alpha=0.5):
    """
    Given a set of reference values and a set of test values, return
    the f-measure of the test values, when compared against the
    reference values.  The f-measure is the harmonic mean of the
    L{precision} and L{recall}, weighted by C{alpha}.  In particular,
    given the precision M{p} and recall M{r} defined by:
        - M{p} = |C{reference}S{cap}C{test}|/|C{test}|
        - M{r} = |C{reference}S{cap}C{test}|/|C{reference}|
    The f-measure is:
        - 1/(C{alpha}/M{p} + (1-C{alpha})/M{r})

    If either C{reference} or C{test} is empty, then C{f_measure}
    returns C{None}.

    @type reference: C{Set}
    @param reference: A set of reference values.
    @type test: C{Set}
    @param test: A set of values to compare against the reference set.
    @rtype: C{float} or C{None}
    """
    p = precision(reference, test)
    r = recall(reference, test)
    if p is None or r is None:
        return None
    if p == 0 or r == 0:
        return 0
    return 1.0 / (alpha / p + (1 - alpha) / r)

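# Usage sketch (assumed values): with the default alpha=0.5 the result
# is the plain harmonic mean of precision and recall.  Here two of the
# four values on each side overlap, so p = r = 0.5 and the f-measure
# is 1/(0.5/0.5 + 0.5/0.5) = 0.5.
#
#     >>> f_measure(sets.Set([1, 2, 3, 4]), sets.Set([3, 4, 5, 6]))
#     0.5
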
def log_likelihood(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    probability distributions, return the average log likelihood of
    the reference values, given the probability distributions.

    @param reference: A list of reference values
    @type reference: C{list}
    @param test: A list of probability distributions over values to
        compare against the corresponding reference values.
    @type test: C{list} of L{ProbDist}
    @raise ValueError: If C{reference} and C{test} do not have the
        same length.
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")

    # Return the average value of dist.logprob(val).
    total_likelihood = sum(dist.logprob(val)
                           for (val, dist) in zip(reference, test))
    return total_likelihood / len(reference)

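# Usage sketch: any object exposing a logprob(value) method will do;
# in NLTK this is a ProbDist.  ToyDist below is a hypothetical
# stand-in invented for illustration, returning base-2 log
# probabilities (NLTK's usual convention).
#
#     >>> class ToyDist:
#     ...     def __init__(self, probs): self._probs = probs
#     ...     def logprob(self, val): return math.log(self._probs[val], 2)
#     >>> dists = [ToyDist({'H': 0.5, 'T': 0.5})] * 2
#     >>> log_likelihood(['H', 'T'], dists)    # each logprob is -1.0
#     -1.0
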
class ConfusionMatrix(object):
    """
    The confusion matrix between a list of reference values and a
    corresponding list of test values.  Entry [M{r},M{t}] of this
    matrix is a count of the number of times that the reference value
    M{r} corresponds to the test value M{t}.  E.g.:

        >>> ref = 'DET NN VB DET JJ NN NN IN DET NN'.split()
        >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
        >>> cm = ConfusionMatrix(ref, test)
        >>> print cm['NN', 'NN']
        3

    Note that the diagonal entries (M{r}=M{t}) of this matrix
    correspond to correct values, and the off-diagonal entries
    correspond to incorrect values.
    """
    def __init__(self, reference, test):
        """
        Construct a new confusion matrix from a list of reference
        values and a corresponding list of test values.

        @type reference: C{list}
        @param reference: An ordered list of reference values.
        @type test: C{list}
        @param test: A list of values to compare against the
            corresponding reference values.
        @raise ValueError: If C{reference} and C{test} do not have
            the same length.
        """
        if len(reference) != len(test):
            raise ValueError('Lists must have the same length.')

        # Get a list of all values.
        values = sorted(set(reference+test))

        # Construct a value->index dictionary
        indices = dict((val,i) for (i,val) in enumerate(values))

        # Make a confusion matrix table.
        confusion = [[0 for val in values] for val in values]
        max_conf = 0 # Maximum confusion
        for w,g in zip(reference, test):
            confusion[indices[w]][indices[g]] += 1
            max_conf = max(max_conf, confusion[indices[w]][indices[g]])

        #: A list of all values in C{reference} or C{test}.
        self._values = values
        #: A dictionary mapping values in L{self._values} to their indices.
        self._indices = indices
        #: The confusion matrix itself (as a list of lists of counts).
        self._confusion = confusion
        #: The greatest count in L{self._confusion} (used for printing).
        self._max_conf = max_conf
        #: The total number of values in the confusion matrix.
        self._total = len(reference)
        #: The number of correct (on-diagonal) values in the matrix.
        self._correct = sum(confusion[i][i] for i in range(len(values)))

    def __getitem__(self, (li,lj)):
        """
        @return: The number of times that value C{li} was expected and
            value C{lj} was given.
        @rtype: C{int}
        """
        i = self._indices[li]
        j = self._indices[lj]
        return self._confusion[i][j]

    def __repr__(self):
        return '<ConfusionMatrix: %s/%s correct>' % (self._correct,
                                                     self._total)

    def __str__(self):
        return self.pp()

    def pp(self, show_percents=False, values_in_chart=True):
        """
        @return: A multi-line string representation of this confusion
            matrix.
        @todo: add marginals?
        """
        confusion = self._confusion

        if values_in_chart:
            values = self._values
        else:
            values = range(len(self._values))

        # Construct a format string for row values
        valuelen = max(len(str(val)) for val in values)
        value_format = '%' + str(valuelen) + 's |'
        # Construct a format string for matrix entries
        if show_percents:
            entrylen = 6
            entry_format = '%5.1f%%'
        else:
            entrylen = len(str(self._max_conf))
            entry_format = '%' + str(entrylen) + 'd'

        # Write the column values.
        value_strings = [str(val) for val in values]
        s = ''
        for i in range(valuelen):
            s += (' '*valuelen)+' |'
            for val in value_strings:
                if i >= valuelen-len(val):
                    s += val[i-valuelen+len(val)].rjust(entrylen+1)
                else:
                    s += ' '*(entrylen+1)
            s += ' |\n'

        # Write a dividing line
        s += '%s-+-%s+\n' % ('-'*valuelen, '-'*((entrylen+1)*len(values)))

        # Write the entries.
        for i in range(len(values)):
            s += value_format % values[i]
            for j in range(len(values)):
                s += ' '
                if show_percents:
                    s += entry_format % (100.0*confusion[i][j]/self._total)
                else:
                    s += entry_format % confusion[i][j]
            s += ' |\n'

        # Write a dividing line
        s += '%s-+-%s+\n' % ('-'*valuelen, '-'*((entrylen+1)*len(values)))

        # Write a key
        s += '(row = reference; col = test)\n'
        if not values_in_chart:
            s += 'Value key:\n'
            for i, value in enumerate(self._values):
                s += '%6d: %s\n' % (i, value)

        return s

    def key(self):
        """
        @return: A multi-line string listing each value alongside the
            index used to identify it in the chart.
        @rtype: C{string}
        """
        values = self._values
        s = 'Value key:\n'
        indexlen = len(str(len(values)-1))
        key_format = ' %'+str(indexlen)+'d: %s\n'
        for i in range(len(values)):
            s += key_format % (i, values[i])

        return s

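# Usage sketch (tags borrowed from the class docstring above): pp()
# draws the full matrix; show_percents=True reports each cell as a
# percentage of the total, and values_in_chart=False labels rows and
# columns with the indices listed by key().
#
#     >>> ref = 'DET NN VB DET JJ NN NN IN DET NN'.split()
#     >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
#     >>> cm = ConfusionMatrix(ref, test)
#     >>> print cm.pp(show_percents=True)
#     >>> print cm.key()
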
def demo():
    print '-'*75
    reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
    test = 'DET VB VB DET NN NN NN IN DET NN'.split()
    print 'Reference =', reference
    print 'Test =', test
    print 'Confusion matrix:'
    print ConfusionMatrix(reference, test)
    print 'Accuracy:', accuracy(reference, test)

    print '-'*75
    reference_set = sets.Set(reference)
    test_set = sets.Set(test)
    print 'Reference =', reference_set
    print 'Test = ', test_set
    print 'Precision:', precision(reference_set, test_set)
    print '   Recall:', recall(reference_set, test_set)
    print 'F-Measure:', f_measure(reference_set, test_set)
    print '-'*75

if __name__ == '__main__':
    demo()