Package nltk_lite :: Package contrib :: Package classify :: Module naivebayes
[hide private]
[frames] | [no frames]

Source Code for Module nltk_lite.contrib.classify.naivebayes

  1  # Natural Language Toolkit: Naive Bayes Classifier 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  #         Sam Huston <shuston@csse.unimelb.edu.au> 
  6  #         Steven Bird <sb@csse.unimelb.edu.au> 
  7  # URL: <http://nltk.sf.net> 
  8  # For license information, see LICENSE.TXT 
  9  # 
 10   
 11  """ 
 12  Naive Bayes Classifier -- Beta version 
 13  """ 
 14   
 15  from operator import itemgetter 
 16  from nltk_lite.probability import * 
 17  from nltk_lite.contrib.classify import * 
 18   
class NaiveBayes(AbstractClassify):
    """
    The Naive Bayes Classifier is a supervised classifier.
    It needs to be trained with representative examples of
    each class.  From these examples the classifier
    calculates the most probable classification of the sample:

                              P(class) * P(features|class)
        P(class|features) =   ----------------------------
                                      P(features)

    Internal data structures:
    _feature_detector:
        holds a feature detector function
    _classes:
        holds a list of classes supplied during training
    _cls_prob_dist:
        holds a probability distribution, namely GoodTuringProbDist
        (defined in probability.py in nltk_lite),
        indexed by class name
    _feat_prob_dist:
        holds a conditional probability distribution whose conditions
        are (class name, feature type name) pairs; each inner
        distribution is indexed by feature value
        (defined in probability.py in nltk_lite)
    """

    def __init__(self, feature_detector):
        """
        @param feature_detector: feature detector produced function, which takes
        a sample of object to be classified (eg: string or list of words) and returns
        a list of tuples (feature_type_name, list of values of this feature type)
        """
        self._feature_detector = feature_detector

    def train(self, gold):
        """
        @param gold: dictionary of class names to representative examples

        Takes representative examples of classes, builds frequency
        distributions from them, and converts those into the smoothed
        probability distributions used at classification time.
        """
        cls_freq_dist = FreqDist()
        feat_freq_dist = ConditionalFreqDist()
        self._classes = []
        feature_values = {}

        for cls in gold:
            self._classes.append(cls)
            for fname, fvals in self._feature_detector(gold[cls]):
                for fval in fvals:
                    # increment number of tokens found in a particular class
                    cls_freq_dist.inc(cls)

                    # increment number of features found in (class, feature type)
                    feat_freq_dist[cls, fname].inc(fval)

                    # record every value observed for this feature type, so the
                    # smoother below knows the number of distinct bins
                    feature_values.setdefault(fname, set()).add(fval)

        # convert the class frequency distribution to a probability distribution
        self._cls_prob_dist = GoodTuringProbDist(cls_freq_dist, cls_freq_dist.B())

        # per-(class, feature type) distributions over feature values;
        # tuple parameters are unpacked in the body (the original used
        # Python-2-only "def f(a, (b, c))" syntax, removed by PEP 3113)
        def make_probdist(freqdist, condition):
            cls, fname = condition
            return GoodTuringProbDist(freqdist, len(feature_values[fname]))
        self._feat_prob_dist = ConditionalProbDist(feat_freq_dist, make_probdist, True)

    def get_class_dict(self, sample):
        """
        @param sample: sample to be classified
        @ret: Dictionary (class to probability)
        """
        return self._naivebayes(sample)

    def _naivebayes(self, sample):
        """
        @param sample: sample to be tested
        @ret: Dictionary (class to probability)

        naivebayes classifier:
        sums, for each class, the log probability of the class prior and
        the log probabilities of every known feature value detected in
        the sample, then normalizes the scores into probabilities.
        """
        sample_feats = self._feature_detector(sample)

        logprob_dict = {}
        for cls in self._classes:
            # start with the log prior of each class
            # (bug fix: the original seeded the log-space sum with the raw
            # probability, prob(), instead of logprob())
            logprob_dict[cls] = self._cls_prob_dist.logprob(cls)

        for fname, fvals in sample_feats:
            for cls in self._classes:
                probdist = self._feat_prob_dist[cls, fname]
                for fval in fvals:
                    # unseen values are skipped rather than zeroing the class
                    if fval in probdist.samples():
                        logprob_dict[cls] += probdist.logprob(fval)

        # normalize the accumulated log scores into probabilities
        dicttmp = DictionaryProbDist(logprob_dict, normalize=True, log=True)
        score = {}
        for cls in dicttmp.samples():
            score[cls] = dicttmp.prob(cls)

        return score

    def __repr__(self):
        return '<NaiveBayesClassifier: classes=%d>' % len(self._classes)
131 132 133 ##////////////////////////////////////////////////////// 134 ## Demonstration code 135 ##////////////////////////////////////////////////////// 136 137
138 -def demo():
139 from nltk_lite.contrib import classify 140 from nltk_lite import detect 141 142 fd = detect.feature({"1-tup": lambda t: list(t)}) 143 144 classifier = classify.NaiveBayes(fd) 145 training_data = {"class a": "aaaaaab", 146 "class b": "bbbbbba"} 147 classifier.train(training_data) 148 149 result = classifier.get_class_dict("a") 150 151 for cls in result: 152 print cls, ':', result[cls] 153 154 """ 155 expected values: 156 class_probs a = 0.5 157 b = 0.5 158 class a: 'a' = 6/7 159 'b' = 1/7 160 b: 'a' = 1/7 161 'b' = 6/7 162 sample: 'a' = 1 163 164 score a: 0.5 * 6/7 = 0.42~ 165 score b: 0.5 * 1/7 = 0.07~ 166 """
167 168
169 -def demo2():
170 from nltk_lite.contrib import classify 171 from nltk_lite import detect 172 173 fd = detect.feature({"2-tup": lambda t: [t[n:n+2] for n in range(len(t))]}) 174 175 classifier = classify.NaiveBayes(fd) 176 training_data = {"class a": "aaaaaab", 177 "class b": "bbbbbba"} 178 classifier.train(training_data) 179 180 result = classifier.get_class_dict("aababb") 181 182 for cls in result: 183 print cls, ':', result[cls] 184 """ 185 expected values: 186 class_probs a = 0.5 187 b = 0.5 188 class a: 'aa' = 5/6 189 'ab' = 1/6 190 b: 'bb' = 5/6 191 'ba' = 1/6 192 sample: 'aa' = 2 193 'ab' = 2 194 'ba' = 1 195 'bb' = 1 196 197 score a: 0.5 * 5/6 * 5/6 * 1/6 * 1/6 = 0.09~ 198 score b: 0.5 * 5/6 * 1/6 = 0.06~ 199 """
200 201 202
203 -def demo3():
204 from nltk_lite.contrib import classify 205 from nltk_lite import detect 206 207 fd = detect.feature({"1-tup": lambda t: [t[n] for n in range(len(t))], 208 "2-tup": lambda t: [t[n:n+2] for n in range(len(t))]}) 209 210 classifier = classify.NaiveBayes(fd) 211 training_data = {"class a": "aaaaaab", 212 "class b": "bbbbbba"} 213 classifier.train(training_data) 214 215 result = classifier.get_class_dict("aaababb") 216 217 for cls in result: 218 print cls, ':', result[cls] 219 220 """ 221 expected values: 222 class_probs a = 0.5 223 b = 0.5 224 class a: 'a' = 6/7 225 'b' = 1/7 226 'aa' = 5/6 227 'ab' = 1/6 228 b: 'a' = 1/7 229 'b' = 6/7 230 'bb' = 5/6 231 'ba' = 1/6 232 sample: 'a' = 4 233 'b' = 3 234 'aa' = 2 235 'ab' = 2 236 'ba' = 1 237 'bb' = 1 238 239 score a: 0.5 * 6/7^4 * 1/7^3 * 5/6^2 * 1/6^2 = 1.5 e-5 240 score b: 0.5 * 1/7^4 * 6/7^3 * 5/6 * 1/6 = 0.0014~ 241 """
242
243 -def demo4():
244 from nltk_lite.contrib import classify 245 from nltk_lite import detect 246 247 from nltk_lite.corpora import genesis 248 from itertools import islice 249 250 fd = detect.feature({"2-tup": lambda t: [' '.join(t)[n:n+2] for n in range(len(' '.join(t))-1)], 251 "words": lambda t: t}) 252 253 classifier = classify.NaiveBayes(fd) 254 training_data = {} 255 training_data["english-kjv"] = list(islice(genesis.raw("english-kjv"), 0, 400)) 256 training_data["french"] = list(islice(genesis.raw("french"), 0, 400)) 257 training_data["finnish"] = list(islice(genesis.raw("finnish"), 0, 400)) 258 259 classifier.train(training_data) 260 261 result = classifier.get_class_probs(list(islice(genesis.raw("english-kjv"), 150, 200))) 262 263 print 'english-kjv :', result.prob('english-kjv') 264 print 'french :', result.prob('french') 265 print 'finnish :', result.prob('finnish')
266 267 if __name__ == '__main__': 268 demo2() 269