Package nltk_lite :: Package contrib :: Package classify :: Module cosine
[hide private]
[frames] | [no frames]

Source Code for Module nltk_lite.contrib.classify.cosine

  1  # Natural Language Toolkit: Cosine Classifier 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Sam Huston <shuston@csse.unimelb.edu.au> 
  5  #         Steven Bird <sb@csse.unimelb.edu.au> 
  6  # URL: <http://nltk.sf.net> 
  7  # For license information, see LICENSE.TXT 
  8  # 
  9   
 10  """ 
 11  Cosine Classifier -- Beta version 
 12  """ 
 13   
 14  from math import sqrt, pow 
 15  from nltk_lite.probability import * 
 16  from nltk_lite.contrib.classify import * 
 17   
class Cosine(AbstractClassify):
    """
    The Cosine Classifier uses the cosine distance algorithm to compute
    the distance between the sample document and each of the specified classes.
    A cosine classifier needs to be trained with representative examples
    of each class.  From these examples the classifier
    calculates the most probable classification of the sample.

                     C . S
       D(C|S) = -------------------------
                sqroot(C^2) * sqroot(S^2)

    Internal data structures:
    _feature_detector:
        holds a feature detector function
    _classes:
        holds a list of classes supplied during training
    _cls_freq_dist:
        holds a dictionary of frequency distributions (FreqDist,
        defined in probability.py in nltk_lite), indexed by
        (class name, feature type); each frequency distribution is
        indexed by feature values
    """

    def __init__(self, feature_detector):
        """
        @param feature_detector: feature detector produced function, which takes
        a sample of object to be classified (eg: string or list of words) and returns
        a list of tuples (feature_type_name, list of values of this feature type)
        """
        self._feature_detector = feature_detector

    def train(self, gold):
        """
        Train classifier using representative examples of classes;
        creates frequency distributions of these classes.

        @param gold: dictionary mapping class names to representative examples
        """
        self._classes = []
        self._cls_freq_dist = {}
        for cls in gold:
            self._classes.append(cls)
            # One frequency distribution per (class, feature type) pair.
            for (fname, fvals) in self._feature_detector(gold[cls]):
                self._cls_freq_dist[cls, fname] = FreqDist()
                for fval in fvals:
                    self._cls_freq_dist[cls, fname].inc(fval)

    def get_class_dict(self, sample):
        """
        @type sample: (any)
        @param sample: sample to be classified
        @return: Dictionary (class to probability)
        """
        return self._cosine(sample)

    def _cosine(self, sample):
        """
        @param sample: sample to be classified
        @return: Dictionary mapping each class name to its cosine score

        This function uses the sample to create a frequency distribution;
        the cosine distance is then computed between each of the class
        distributions and the sample's distribution.
        """
        sample_vector_len = 0
        dot_prod = {}
        score = {}

        # Build a frequency distribution for each feature type of the sample.
        sample_dist = {}
        for (fname, fvals) in self._feature_detector(sample):
            sample_dist[fname] = FreqDist()
            for fval in fvals:
                sample_dist[fname].inc(fval)

        for cls in self._classes:
            dot_prod[cls] = 0

        for fname in sample_dist:
            for fval in sample_dist[fname].samples():
                # Accumulate the squared length of the sample vector.
                sample_vector_len += sample_dist[fname].count(fval) ** 2

                for cls in self._classes:
                    # NOTE(review): assumes every feature type seen in the
                    # sample was also seen in training -- confirm detectors
                    # are consistent between train() and classification.
                    if fval in self._cls_freq_dist[cls, fname].samples():
                        # Accumulate the dot product of the sample with this class.
                        dot_prod[cls] += sample_dist[fname].count(fval) \
                                         * self._cls_freq_dist[cls, fname].count(fval)

        for cls in self._classes:
            # Squared length of the class vector, restricted to the feature
            # types that occur in the sample (matches the dot product above).
            cls_vector_len = 0
            for fname in sample_dist:
                for fval in self._cls_freq_dist[cls, fname].samples():
                    cls_vector_len += self._cls_freq_dist[cls, fname].count(fval) ** 2

            # Final score for this class; a zero-length vector on either
            # side yields a score of 0 rather than a division by zero.
            if sample_vector_len == 0 or cls_vector_len == 0:
                score[cls] = 0
            else:
                score[cls] = float(dot_prod[cls]) / (sqrt(sample_vector_len) * sqrt(cls_vector_len))

        return score

    def __repr__(self):
        return '<CosineClassifier: classes=%d>' % len(self._classes)
128 129 ##////////////////////////////////////////////////////// 130 ## Demonstration code 131 ##////////////////////////////////////////////////////// 132
133 -def demo():
134 from nltk_lite.contrib import classify 135 from nltk_lite import detect 136 137 fd = detect.feature({"1-tup": lambda t: [t[n] for n in range(len(t))]}) 138 139 classifier = classify.cosine.Cosine(fd) 140 training_data = {"class a": "aaaaaab", 141 "class b": "bbbbbba"} 142 classifier.train(training_data) 143 144 result = classifier.get_class_dict("a") 145 146 for cls in result: 147 print cls, ':', result[cls] 148 149 """ 150 expected values: 151 class a: 'a' = 6 152 'b' = 1 153 vector = 6^2 + 1^2 = 37 154 b: 'a' = 1 155 'b' = 6 156 vector = 1^2 + 6^2 = 37 157 sample: 'a' = 1 158 vector = 1^2 = 1 159 160 dot_prod a: 6*1 161 b: 1*1 162 163 score a: 6 / (sqrt(37) * sqrt(1)) = 0.98~ 164 score b: 1 / (sqrt(37) * sqrt(1)) = 0.16~ 165 """
166 167 168 169
170 -def demo2():
171 from nltk_lite.contrib import classify 172 from nltk_lite import detect 173 174 fd = detect.feature({"2-tup": lambda t: [t[n:n+2] for n in range(len(t)-1)]}) 175 176 classifier = classify.Cosine(fd) 177 training_data = {"class a": "aaaaaab", 178 "class b": "bbbbbba"} 179 classifier.train(training_data) 180 181 result = classifier.get_class_dict("aaababb") 182 183 for cls in result: 184 print cls, ':', result[cls] 185 """ 186 expected values: 187 class a: 'aa' = 5 188 'ab' = 1 189 vector = 5^2 + 1^2 = 26 190 b: 'bb' = 5 191 'ba' = 1 192 vector = 5^2 + 1^2 = 26 193 sample: 'aa' = 2 194 'ab' = 2 195 'ba' = 1 196 'bb' = 1 197 vector = 2^2 + 2^2 + 1^2 + 1^2 = 10 198 199 dot_prod a: 5*2 + 1*2 200 b: 5*1 + 1*1 201 202 score a: 12 / (sqrt(26) * sqrt(10)) = 0.74~ 203 score b: 6 / (sqrt(26) * sqrt(10)) = 0.37~ 204 """
205 206 207
208 -def demo3():
209 from nltk_lite.contrib import classify 210 from nltk_lite import detect 211 212 fd = detect.feature({"1-tup": lambda t: [t[n] for n in range(len(t))], 213 "2-tup": lambda t: [t[n:n+2] for n in range(len(t)-1)]}) 214 215 classifier = classify.Cosine(fd) 216 training_data = {"class a": "aaaaaab", 217 "class b": "bbbbbba"} 218 classifier.train(training_data) 219 220 result = classifier.get_class_dict("aaababb") 221 222 for cls in result: 223 print cls, ':', result[cls] 224 225 """ 226 expected values: 227 class a: 'a' = 6 228 'b' = 1 229 'aa' = 5 230 'ab' = 1 231 vector = 6^2 + 5^2 + 1 + 1 = 63 232 b: 'a' = 1 233 'b' = 6 234 'bb' = 5 235 'ba' = 1 236 vector = 6^2 + 5^2 + 1 + 1 = 63 237 sample: 'a' = 4 238 'b' = 3 239 'aa' = 2 240 'ab' = 2 241 'ba' = 1 242 'bb' = 1 243 vector = 4^2 + 3^2 + 2^2 + 2^2 + 1 + 1 = 35 244 245 dot_prod a: 4*6 + 3*1 + 5*2 + 2*1 = 39 246 b: 4*1 + 3*6 + 5*1 + 1*1 = 28 247 248 score a: 39 / (sqrt(63) * sqrt(35)) = 0.83~ 249 score b: 28 / (sqrt(63) * sqrt(35)) = 0.59~ 250 """
251 252
253 -def demo4():
254 from nltk_lite.contrib import classify 255 from nltk_lite import detect 256 257 from nltk_lite.corpora import genesis 258 from itertools import islice 259 260 fd = detect.feature({"2-tup": lambda t: [' '.join(t)[n:n+2] for n in range(len(' '.join(t))-1)], 261 "words": lambda t: t}) 262 263 classifier = classify.Cosine(fd) 264 training_data = {} 265 training_data["english-kjv"] = list(islice(genesis.raw("english-kjv"), 0, 400)) 266 training_data["french"] = list(islice(genesis.raw("french"), 0, 400)) 267 training_data["finnish"] = list(islice(genesis.raw("finnish"), 0, 400)) 268 269 classifier.train(training_data) 270 271 result = classifier.get_class_probs(list(islice(genesis.raw("english-kjv"), 150, 200))) 272 273 print 'english-kjv :', result.prob('english-kjv') 274 print 'french :', result.prob('french') 275 print 'finnish :', result.prob('finnish')
# Run the character-bigram demo when executed as a script.
if __name__ == '__main__':
    demo2()