1
2
3
4
5
6
7
8 from nltk_lite.contrib.classifier import split_ignore_space
9 from nltk_lite.contrib.classifier import format, cfile, commandline as cl
10 from nltk_lite.contrib.classifier.exceptions import invaliddataerror as inv
11
12 import sys
13
# Help texts handed to the command line interface, one per option.
a_help = (
    "Selects the feature selection algorithm "
    "Options: RNK for Ranking(Filter based Feature Selection)"
    "Default: RNK. "
)

f_help = (
    "Base name of attribute, klass, training, test and gold "
    " files. "
)

t_help = "Base name of training file for feature selection. "

T_help = "Base name of test file for feature selection. "

g_help = "Base name of gold file for feature selection. "

o_help = (
    "Algorithm specific options "
    "For rank based feature selection the options should "
    "include the method to calculate the rank: "
    " IG: for Information gain "
    " GR: for Gain ratio "
    "followed by a number which indicates the number of "
    "attributes which should be chosen. "
)

# Rank method codes accepted in the options, mapped to a method name.
# NOTE(review): presumably the name is looked up on decision stumps when
# ranking -- confirm against the ranking code.
OPTION_MAPPINGS = {'IG': 'information_gain', 'GR': 'gain_ratio'}

# Code of the rank based algorithm; passed to the command line interface
# together with the list of known algorithm codes.
RANK = 'RNK'

# Algorithm code -> name of the FeatureSelection method that is invoked
# for it when the command is executed.
ALGORITHM_MAPPINGS = {RANK: 'by_rank'}
40
43 cl.CommandLineInterface.__init__(self, ALGORITHM_MAPPINGS.keys(), RANK, a_help, f_help, t_help, T_help, g_help)
44 self.add_option("-o", "--options", dest="options", type="string", help=o_help)
45
56
58 ignore_missing = False
59
60 if self.files is not None:
61 self.training_path, self.test_path, self.gold_path = [self.files] * 3
62 ignore_missing = True
63 training, attributes, klass, test, gold = self.get_instances(self.training_path, self.test_path, self.gold_path, ignore_missing)
64
65 feature_sel = FeatureSelection(training, attributes, klass, test, gold, self.options)
66 getattr(feature_sel, ALGORITHM_MAPPINGS[self.algorithm])()
67
68 files_written = self.write_to_file(self.get_suffix(), training, attributes, klass, test, gold)
69 print 'The following files were created after feature selection...'
70 for file_name in files_written:
71 print file_name
72
74 if self.options is None: return '-' + self.algorithm
75 suf = '-' + self.algorithm
76 for option in self.options:
77 suf += '_' + option
78 return suf
79
def __init__(self, training, attributes, klass, test, gold, options):
    """Keep the data sets and the options on the instance.

    Every argument is stored unchanged under an attribute of the same
    name, for use by the selection methods of this class.
    """
    self.training = training
    self.attributes = attributes
    self.klass = klass
    self.test = test
    self.gold = gold
    self.options = options
84
92
94 decision_stumps = self.attributes.empty_decision_stumps([], self.klass)
95 for decision_stump in decision_stumps:
96 for instance in self.training:
97 decision_stump.update_count(instance)
98 decision_stumps.sort(lambda x, y: cmp(getattr(x, method)(), getattr(y, method)()))
99
100 if number > len(decision_stumps): number = len(decision_stumps)
101 to_remove, attributes_to_remove = decision_stumps[:number * -1], []
102 for stump in to_remove:
103 attributes_to_remove.append(stump.attribute)
104 return attributes_to_remove
105
106 - def remove(self, attributes):
111
if __name__ == "__main__":
    # Script entry point: build the command line interface first, then
    # run it with every argument that follows the script name.
    selector = FeatureSelect()
    selector.run(sys.argv[1:])
114