Package nltk_lite :: Package contrib :: Package classifier :: Module decisionstump
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.classifier.decisionstump

  1  # Natural Language Toolkit - Decision Stump 
  2  #  Understands the procedure of creating a decision stump and  
  3  #     calculating the number of errors 
  4  #  Is generally created at the attribute level 
  5  #   ie. each attribute will have a decision stump of its own 
  6  # 
  7  # Author: Sumukh Ghodke <sumukh dot ghodke at gmail dot com> 
  8  # 
  9  # URL: <http://nltk.sf.net> 
 10  # This software is distributed under GPL, for license information see LICENSE.TXT 
 11   
 12  from math import log 
 13  from nltk_lite.probability import FreqDist 
 14   
class DecisionStump:
    """
    A one-level decision tree built on a single attribute.

    For every value of the attribute the stump keeps a table of class
    frequencies. Those tables drive the error rate, the majority class
    prediction and the entropy based measures defined below.
    """

    def __init__(self, attribute, klass):
        """
        attribute: object exposing 'values' (an iterable of the possible
                   attribute values); 'name' and 'split_info()' are used by
                   __str__ and gain_ratio respectively
        klass:     iterable of the possible class values
        """
        self.attribute = attribute
        # counts is a dictionary in which each key is an attribute value and
        # each value is a dictionary of class frequencies for that attribute
        # value
        self.counts, self.children = {}, {}  # it has children only in decision trees
        # root holds the class frequencies over every instance counted so far
        self.root = dictionary_of_values(klass)
        for value in attribute.values:
            self.counts[value] = dictionary_of_values(klass)

    def update_count(self, instance):
        """
        Records one instance: increments the frequency of the instance's class
        both under its value for this attribute and in the overall (root)
        table. Raises KeyError if either the attribute value or the class value
        was not declared when the stump was created.
        """
        attr_value = instance.value(self.attribute)
        self.counts[attr_value][instance.klass_value] += 1
        self.root[instance.klass_value] += 1

    def error(self):
        """
        Returns the fraction of counted instances that are misclassified when
        every attribute value predicts its most frequent class.
        Returns 0.0 when no instance has been counted yet.
        """
        total, errors = 0, 0
        for class_count in self.counts.values():
            frequencies = list(class_count.values())
            if not frequencies:
                # no class values declared, nothing to count for this value
                continue
            subtotal = sum(frequencies)
            # everything outside the most frequent class is an error
            errors += subtotal - max(frequencies)
            total += subtotal
        if total == 0:
            return 0.0
        return float(errors) / total

    def klass(self, instance):
        """
        Returns the predicted class value for the instance. If a child exists
        for the instance's attribute value the decision is delegated to it,
        otherwise the majority class for that attribute value is returned.
        """
        attr_value = instance.value(self.attribute)
        if attr_value in self.children:
            return self.children[attr_value].klass(instance)
        return self.majority_klass(attr_value)

    def majority_klass(self, attr_value):
        """
        Returns the class value with the highest frequency for attr_value.
        On a tie the class encountered first while iterating wins; None is
        returned when every frequency is zero.
        """
        _max, klass_value = 0, None
        for klass, count in self.counts[attr_value].items():
            if count > _max:
                _max, klass_value = count, klass
        return klass_value

    def entropy(self, attr_value):
        """
        Returns the entropy of class distribution for a particular attribute value
        """
        # imported at call time, as in the original code
        from nltk_lite.contrib.classifier import entropy_of_key_counts
        return entropy_of_key_counts(self.counts[attr_value])

    def mean_information(self):
        """
        Returns the average of the per attribute value entropies, weighted by
        the number of instances counted for each attribute value. Attribute
        values without instances are skipped.
        Returns 0.0 when no instance has been counted yet.
        """
        total, total_num_of_instances = 0, 0
        for attr_value in self.attribute.values:
            instance_count = total_counts(self.counts[attr_value])
            if instance_count == 0:
                continue
            total += instance_count * self.entropy(attr_value)
            total_num_of_instances += instance_count
        if total_num_of_instances == 0:
            return 0.0
        return float(total) / total_num_of_instances

    def information_gain(self):
        """
        Returns the information gained by splitting on this attribute.

        NOTE(review): the body of this method was collapsed in the listing
        this file was recovered from. It is reconstructed here as the entropy
        of the overall (root) class distribution minus mean_information() --
        confirm against the upstream source.
        """
        from nltk_lite.contrib.classifier import entropy_of_key_counts
        return entropy_of_key_counts(self.root) - self.mean_information()

    def gain_ratio(self):
        """
        Returns the information gain divided by the attribute's split info.
        A ZeroDivisionError propagates if attribute.split_info() returns 0.
        """
        return float(self.information_gain()) / self.attribute.split_info()

    def __str__(self):
        """
        Returns a description of the stump: the attribute name, one line of
        class frequencies per attribute value, followed by the description of
        every child.
        """
        parts = ['Decision stump for attribute ' + self.attribute.name]
        for key, value in self.counts.items():
            # %s formatting also copes with attribute values that are not strings
            parts.append('\nAttr value: %s; counts: %s' % (key, value))
        # children maps attribute values to stumps; describe the stumps, not the keys
        for child in self.children.values():
            parts.append(str(child))
        return ''.join(parts)
89
def total_counts(dictionary_of_klass_freq):
    """
    Returns the sum of all the frequencies in a dictionary that maps class
    values to their frequencies; 0 for an empty dictionary.
    """
    return sum(dictionary_of_klass_freq.values())
95
def dictionary_of_values(klass):
    """
    Returns a new dictionary with one key per element of klass, every key
    mapped to a frequency of 0.
    """
    return dict.fromkeys(klass, 0)
101