Package nltk_lite :: Package contrib :: Package classifier_tests :: Module decisionstumptests
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.classifier_tests.decisionstumptests

  1  # Natural Language Toolkit 
  2  # 
  3  # Author: Sumukh Ghodke <sumukh dot ghodke at gmail dot com> 
  4  # 
  5  # URL: <http://nltk.sf.net> 
  6  # This software is distributed under GPL, for license information see LICENSE.TXT 
  7   
  8  from nltk_lite.contrib.classifier import decisionstump as ds, instances as ins, instance, format 
  9  from nltk_lite.contrib.classifier_tests import * 
 10  import math 
 11   
class DecisionStumpTestCase(unittest.TestCase):
    """Tests for ``decisionstump.DecisionStump`` on the minigolf weather data.

    ``setUp`` builds a stump for the first attribute of the dataset (the
    tests assert that its name is 'outlook').  The class distribution the
    expected values are based on is:

        root - yes 5
          |    no  4
          |
          |------sunny----- yes 1
          |                 no  3
          |
          |------rainy------yes 2
          |                 no  1
          |
          |------overcast---yes 2
                            no  0
    """

    def setUp(self):
        # All three loaders are given the same dataset path.
        path = datasetsDir(self) + 'minigolf' + SEP + 'weather'
        self.attributes = format.C45_FORMAT.get_attributes(path)
        self.outlook_attr = self.attributes[0]
        self.klass = format.C45_FORMAT.get_klass(path)
        self.outlook_stump = ds.DecisionStump(self.outlook_attr, self.klass)
        self.instances = format.C45_FORMAT.get_training_instances(path)

    def __update_stump(self):
        # Feed every training instance to the outlook stump.
        # (The loop variable is not called 'instance' so that it does not
        # shadow the imported 'instance' module.)
        for training_instance in self.instances:
            self.outlook_stump.update_count(training_instance)

    def __root_entropy(self):
        # entropy(root) = -(5/9 * log2(5/9)) + -(4/9 * log2(4/9))
        #               = 0.99107605983822222
        return -(5.0/9 * math.log(5.0/9, 2)) + -(4.0/9 * math.log(4.0/9, 2))

    def __mean_information(self):
        # mean info = 4/9 * entropy(sunny) + 3/9 * entropy(rainy)
        #           = 0.666666666
        # The overcast branch (yes 2, no 0) has zero entropy and so
        # contributes nothing to the sum.
        sunny = -(1.0/4 * math.log(1.0/4, 2)) + -(3.0/4 * math.log(3.0/4, 2))
        rainy = -(2.0/3 * math.log(2.0/3, 2)) + -(1.0/3 * math.log(1.0/3, 2))
        return 4.0/9 * sunny + 3.0/9 * rainy

    def test_creates_count_map(self):
        # A fresh stump has one zeroed class-count entry per attribute value.
        self.assertEqual(3, len(self.outlook_stump.counts))
        for attr_value in self.outlook_attr.values:
            for class_value in self.klass:
                self.assertEqual(0, self.outlook_stump.counts[attr_value][class_value])

    def test_updates_count_with_instance_values(self):
        # After the first instance only the ('sunny', 'no') cell is incremented.
        self.outlook_stump.update_count(self.instances[0])
        for attr_value in self.outlook_attr.values:
            for class_value in self.klass:
                if attr_value == 'sunny' and class_value == 'no':
                    continue
                self.assertEqual(0, self.outlook_stump.counts[attr_value][class_value])
        self.assertEqual(1, self.outlook_stump.counts['sunny']['no'])

    def test_error_count(self):
        self.__update_stump()
        self.assertAlmostEqual(0.2222222, self.outlook_stump.error())
        self.assertEqual('outlook', self.outlook_stump.attribute.name)

    def test_majority_class_for_attr_value(self):
        self.__update_stump()
        self.assertEqual('no', self.outlook_stump.majority_klass('sunny'))
        self.assertEqual('yes', self.outlook_stump.majority_klass('overcast'))
        self.assertEqual('yes', self.outlook_stump.majority_klass('rainy'))

    def test_classifies_instance_correctly(self):
        # The predicted class depends only on the outlook value, for gold
        # instances (whatever their gold label) and test instances alike.
        self.__update_stump()
        self.assertEqual('no', self.outlook_stump.klass(instance.GoldInstance(['sunny','mild','normal','true'],'yes')))
        self.assertEqual('yes', self.outlook_stump.klass(instance.GoldInstance(['overcast','mild','normal','true'],'yes')))
        self.assertEqual('yes', self.outlook_stump.klass(instance.GoldInstance(['rainy','mild','normal','true'],'yes')))
        self.assertEqual('no', self.outlook_stump.klass(instance.TestInstance(['sunny','mild','normal','true'])))
        self.assertEqual('yes', self.outlook_stump.klass(instance.TestInstance(['overcast','mild','normal','true'])))
        self.assertEqual('yes', self.outlook_stump.klass(instance.TestInstance(['rainy','mild','normal','true'])))

    def test_total_counts(self):
        dictionary_of_klass_counts = {}
        dictionary_of_klass_counts['yes'] = 2
        dictionary_of_klass_counts['no'] = 0
        self.assertEqual(2, ds.total_counts(dictionary_of_klass_counts))

        dictionary_of_klass_counts['yes'] = 9
        dictionary_of_klass_counts['no'] = 5
        self.assertEqual(14, ds.total_counts(dictionary_of_klass_counts))

    def test_mean_information(self):
        self.__update_stump()
        expected = self.__mean_information()
        self.assertAlmostEqual(expected, self.outlook_stump.mean_information(), 6)

    def test_information_gain(self):
        # info_gain = entropy(root) - mean_information()
        self.__update_stump()
        expected = self.__root_entropy() - self.__mean_information()
        self.assertAlmostEqual(expected, self.outlook_stump.information_gain(), 6)

    def test_returns_entropy_for_each_attribute_value(self):
        self.__update_stump()

        # there are 4 'sunny' training instances in all out of which
        # 3 training instances have their class assigned as no and
        # 1 training instance has its class assigned as yes
        expected = -(1.0/4 * math.log(1.0/4, 2)) + -(3.0/4 * math.log(3.0/4, 2))
        self.assertAlmostEqual(expected, self.outlook_stump.entropy('sunny'), 6)

        # both 'overcast' instances are yes, so the entropy is zero
        expected = -(2.0/2 * math.log(2.0/2, 2)) + 0
        self.assertAlmostEqual(expected, self.outlook_stump.entropy('overcast'))

        expected = -(2.0/3 * math.log(2.0/3, 2)) + -(1.0/3 * math.log(1.0/3, 2))
        self.assertAlmostEqual(expected, self.outlook_stump.entropy('rainy'))

    def test_dictionary_of_all_values_with_count_0(self):
        phoney = format.C45_FORMAT.get_klass(datasetsDir(self) + 'test_phones' + SEP + 'phoney')
        values = ds.dictionary_of_values(phoney)
        self.assertEqual(3, len(values))
        for i in ['a', 'b', 'c']:
            # 'in' instead of has_key(): same membership check, and not
            # limited to Python 2 dictionaries.
            self.assertTrue(i in values)
            self.assertEqual(0, values[i])

    def test_gain_ratio(self):
        self.__update_stump()

        info_gain = self.__root_entropy() - self.__mean_information()
        split_info = -(1.0/3 * math.log(1.0/3, 2)) * 3  # outlook attribute has 3 values
        expected = float(info_gain) / split_info

        self.assertAlmostEqual(expected, self.outlook_stump.gain_ratio(), 6)

    def test_sorting_of_decision_stumps(self):
        stumps = []
        for attribute in self.attributes:
            stumps.append(ds.DecisionStump(attribute, self.klass))
        for training_instance in self.instances:
            for stump in stumps:
                stump.update_count(training_instance)

        self.assertAlmostEqual(0.324409, stumps[0].information_gain(), 6)
        self.assertAlmostEqual(0.102187, stumps[1].information_gain(), 6)
        self.assertAlmostEqual(0.091091, stumps[2].information_gain(), 6)
        self.assertAlmostEqual(0.072780, stumps[3].information_gain(), 6)

        # Sort on the computed gain (the method must be *called*; comparing
        # the bound methods themselves says nothing about the gain values),
        # highest gain first, which is the order asserted below.
        stumps.sort(key=lambda stump: stump.information_gain(), reverse=True)

        self.assertAlmostEqual(0.324409, stumps[0].information_gain(), 6)
        self.assertAlmostEqual(0.102187, stumps[1].information_gain(), 6)
        self.assertAlmostEqual(0.091091, stumps[2].information_gain(), 6)
        self.assertAlmostEqual(0.072780, stumps[3].information_gain(), 6)
149