Source Code for Module nltk_lite.contrib.classify.spearman

# Natural Language Toolkit: Spearman Rank Correlation classifier
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Sam Huston <shuston@csse.unimelb.edu.au>
#         Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT
#

"""
Spearman Classifier -- Beta version
"""

from math import pow
from nltk_lite.probability import *
from nltk_lite.contrib.classify import *
class Spearman(AbstractClassify):
    """
    The Spearman-rho classifier uses Spearman's rank correlation coefficient,
    a non-parametric measure of correlation between two sets of ranked values.
    Spearman-rho classification is supervised: the classifier must be trained
    with representative examples of each class, and from these examples it
    calculates the most probable classification of a sample.

                6 * sum((Ai - Bi)^2)
        p = 1 - --------------------
                     (n^3) - n

    where A, B are the vectors of ranked objects
          n is the number of ranked objects being compared

    Internal data structures:
    _feature_detector:
        holds a feature detector function
    _classes:
        holds a list of classes supplied during training
    _cls_rank:
        holds a dictionary of ordered lists;
        the order of each list is determined as follows:
        the first-ranked object is ordered first, and
        duplicate values are ordered alphabetically
    """
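    # A worked instance of the formula above, using illustrative ranks that do
    # not come from the module: for A = [1, 2, 3] and B = [2, 1, 3], n = 3, so
    #     p = 1 - 6 * ((1-2)^2 + (2-1)^2 + (3-3)^2) / (3^3 - 3)
    #       = 1 - 12 / 24
    #       = 0.5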
    def __init__(self, feature_detector, crop_data=100):
        """
        @param feature_detector: feature detector function
        @param crop_data: rank beyond which features are ignored;
            this caps the number of ranked features for large data sets,
            where low-ranked values would otherwise skew the results
        """
        self._feature_detector = feature_detector
        self._crop_data = crop_data
    def train(self, gold):
        """
        Trains the classifier.
        @param gold: dictionary mapping class names to representative examples

        From the representative examples the classifier builds ordered lists
        of ranked features, indexed by class name and feature type.
        """

        self._classes = []
        self._cls_rank = {}

        for cls in gold:
            self._classes.append(cls)
            for (fname, fvals) in self._feature_detector(gold[cls]):
                cls_freq_dist = FreqDist()
                for fval in fvals:
                    cls_freq_dist.inc(fval)
                self._cls_rank[cls, fname] = self._get_ranks(cls_freq_dist)
    def get_class_dict(self, sample):
        """
        @param sample: sample to be classified
        @ret: dictionary mapping class names to correlation scores
        """
        return self._spearman_ranked(sample)
    def _spearman_ranked(self, sample):
        """
        @param sample: sample to be classified
        @ret: dictionary mapping class names to correlation scores

        The sample is converted into ordered lists of ranked features, and
        the Spearman-rho formula is applied to produce a correlation with
        each class.

        A union operation is used to create two lists of the same length for
        the formula; missing values are appended to each list in alphabetical
        order.
        """

        rank_diff = {}
        score = {}
        sample_rank = {}
        totalfvals = {}

        for (fname, fvals) in self._feature_detector(sample):
            sample_dist = FreqDist()
            for fval in fvals:
                sample_dist.inc(fval)
            sample_rank[fname] = self._get_ranks(sample_dist)

        for cls in self._classes:
            rank_diff[cls] = 0
            totalfvals[cls] = 0

        for fname in sample_rank:
            for cls in self._classes:
                # work on copies so that the stored class and sample rankings
                # are not mutated by the union step below
                tmp_clslist = list(self._cls_rank[cls, fname])
                tmp_smplist = list(sample_rank[fname])

                # take the union of the class and sample lists, appending the
                # missing values to each list in alphabetical order
                tmp_clslist.extend(sorted(set(tmp_smplist).difference(set(tmp_clslist))))
                tmp_smplist.extend(sorted(set(tmp_clslist).difference(set(tmp_smplist))))

                totalfvals[cls] += len(tmp_clslist)

                for fval in tmp_smplist:
                    rank_diff[cls] += pow(tmp_smplist.index(fval) - tmp_clslist.index(fval), 2)

        for cls in self._classes:
            score[cls] = 1 - (float(6 * rank_diff[cls]) / (pow(totalfvals[cls], 3) - totalfvals[cls]))

        return score
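    # Sketch of the union step above, with illustrative rank lists that do
    # not come from the module: given a class ranking ['aa', 'ab'] and a
    # sample ranking ['ab', 'ba'], the class list is padded to
    # ['aa', 'ab', 'ba'] and the sample list to ['ab', 'ba', 'aa'].  Both now
    # have length 3, the squared index differences are 1 + 1 + 4 = 6, and the
    # resulting score is 1 - (6 * 6) / (3^3 - 3) = -0.5.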
    def _get_ranks(self, sample_dist):
        """
        @param sample_dist: sample frequency distribution
        @ret: ordered list of features
        """

        samples = sample_dist.samples()
        # order by descending frequency, breaking ties alphabetically
        ordered_list = sorted(samples, key=lambda item: (-sample_dist.count(item), item))
        return ordered_list[:self._crop_data]
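    # For example, with illustrative counts {'a': 5, 'b': 2, 'c': 2} the
    # ranking produced above is ['a', 'b', 'c']: 'a' is ranked first on
    # frequency, and the tie between 'b' and 'c' is broken alphabetically.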
    def __repr__(self):
        return '<SpearmanClassifier: classes=%d>' % len(self._classes)


###########################################################
def demo():
    from nltk_lite.contrib import classify
    from nltk_lite import detect

    fd = detect.feature({"1-tup": lambda t: [t[n] for n in range(len(t))]})

    classifier = classify.spearman.Spearman(fd)
    training_data = {"class a": "aaaaaab",
                     "class b": "bbbbbba"}
    classifier.train(training_data)

    result = classifier.get_class_dict("a")

    for cls in result:
        print cls, ':', result[cls]
    """
    expected values:
    class a: 'a' = 1
             'b' = 2
          b: 'a' = 2
             'b' = 1
    sample:  'a' = 1

    score a: 6*(0^2) / (8-2) = 0
    score b: 6*(1^2) / (8-2) = 1
    """
def demo2():
    from nltk_lite.contrib import classify
    from nltk_lite import detect

    fd = detect.feature({"2-tup": lambda t: [t[n:n+2] for n in range(len(t)-1)]})

    classifier = classify.spearman.Spearman(fd)
    training_data = {"class a": "aaaaaab",
                     "class b": "bbbbbba"}
    classifier.train(training_data)

    result = classifier.get_class_dict("aaababb")

    for cls in result:
        print cls, ':', result[cls]
    """
    expected values:
    class a: 'aa' = 1
             'ab' = 2
          b: 'bb' = 1
             'ba' = 2
    sample:  'aa' = 1
             'ab' = 1
             'ba' = 2
             'bb' = 2

    score a: 1/ 1+0+1+5+5 = 0.5
    score b: 1/ 1+1+0+6+6 = 0.5
    """
def demo3():
    from nltk_lite.contrib import classify
    from nltk_lite import detect

    fd = detect.feature({"1-tup": lambda t: [t[n] for n in range(len(t))],
                         "2-tup": lambda t: [t[n:n+2] for n in range(len(t)-1)]})

    classifier = classify.spearman.Spearman(fd)
    training_data = {"class a": "aaaaaab",
                     "class b": "bbbbbba"}
    classifier.train(training_data)

    result = classifier.get_class_dict("aaababb")

    for cls in result:
        print cls, ':', result[cls]

    """
    expected values:
    class a: 'a' = 1
             'b' = 3
             'aa' = 2
             'ab' = 3
          b: 'a' = 3
             'b' = 1
             'bb' = 2
             'ba' = 3
    sample:  'a' = 1
             'b' = 2
             'aa' = 3
             'ab' = 3
             'ba' = 4
             'bb' = 4

    score a: 1/ 1+0+1+1+0+3+3 = 1/ 9
    score b: 1/ 1+2+1+2+1+4+4 = 1/ 15
    """
def demo4():
    from nltk_lite.contrib import classify
    from nltk_lite import detect

    from nltk_lite.corpora import genesis
    from itertools import islice

    fd = detect.feature({"2-tup": lambda t: [' '.join(t)[n:n+2] for n in range(len(' '.join(t))-1)],
                         "words": lambda t: t})

    classifier = classify.spearman.Spearman(fd)
    training_data = {}
    training_data["english-kjv"] = list(islice(genesis.raw("english-kjv"), 0, 400))
    training_data["french"] = list(islice(genesis.raw("french"), 0, 400))
    training_data["finnish"] = list(islice(genesis.raw("finnish"), 0, 400))

    classifier.train(training_data)

    result = classifier.get_class_probs(list(islice(genesis.raw("english-kjv"), 150, 200)))

    print 'english-kjv :', result.prob('english-kjv')
    print 'french :', result.prob('french')
    print 'finnish :', result.prob('finnish')


if __name__ == '__main__':
    demo2()
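The scores produced by the classifier can be sanity-checked against a minimal standalone sketch of Spearman's rho, shown below. The function name spearman_rho and the example ranks are illustrative and are not part of this module; the sketch assumes two equal-length vectors of ranks.

def spearman_rho(ranks_a, ranks_b):
    # rho = 1 - 6 * sum((Ai - Bi)^2) / (n^3 - n), for n ranked objects
    n = len(ranks_a)
    d_squared = sum((a - b) ** 2 for a, b in zip(ranks_a, ranks_b))
    return 1.0 - (6.0 * d_squared) / (n ** 3 - n)

# matches the worked example given after the class docstring:
# spearman_rho([1, 2, 3], [2, 1, 3]) == 0.5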