1
2
3
4
5
6
7
8
9
10 """
11 Spearman Classifier -- Beta version
12 """
13
14 from math import pow
15 from nltk_lite.probability import *
16 from nltk_lite.contrib.classify import *
17
19 """
20 The Spearman-rho Classifier is a non-parametric measure of correlation between two
21 sets of ranked values
22 Spearman-rho classification is a supervised classifier. It needs to be trained
23 with representative examples of each class. From these examples the classifier
24 calculates the most probable classification of the sample.
25
26 6 * sum((Ai - Bi)^2)
27 p = 1 - -------------------------
28 (n^3) - n
29
30 where A, B are the vectors of ranked objects
31 n is the number of ranked objects being compared
32
33 Internal data structures:
34 _feature_dectector:
35 holds a feature detector function
36 _classes:
37 holds a list of classes supplied during trainning
38 _cls_rank:
39 holds a dictionary of ordered lists,
40 the order of the list is deturnmined by:
41 first ranked object is ordered first
42 duplicate values are ordered in alphabetical order
43 """
44
45 - def __init__(self, feature_detector, crop_data=100):
46 """
47 @param feature_detector: feature detector produced function
48 @param crop_data: ranking beyond which features are ignored
49 this produces a maximum rank for large data sets
50 where the lower order values would offset the results
51 """
52 self._feature_detector = feature_detector
53 self._crop_data = crop_data
54
55
57 """
58 trains the classifier
59 @param classes: dictionary of class names to representative examples
60
61 function takes representative examples of classes
62 then creates ordered lists of ranked features
63 indexed by class name and feature type
64 """
65
66 self._classes = []
67 self._cls_rank = {}
68
69 for cls in gold:
70 self._classes.append(cls)
71 for (fname, fvals) in self._feature_detector(gold[cls]):
72 cls_freq_dist = FreqDist()
73 for fval in fvals:
74 cls_freq_dist.inc(fval)
75 self._cls_rank[cls, fname] = self._get_ranks(cls_freq_dist)
76
77
79 """
80 @param text: sample to be classified
81 @ret: Dictionary (class to probability)
82 """
83 return self._spearman_ranked(sample)
84
85
87 """
88 @param text: sample to be classified
89 @ret: Dictionary class to probability
90
91 function uses sample to create an ordered list of ranked features
92 the spearman-rho formula is then applied to produce a correlation
93
94 a union operation is used to create two lists of the same length for the formula
95 missing values are appended to each list in alphabetical order
96 """
97
98 rank_diff = {}
99 score = {}
100 sample_rank = {}
101 totalfvals = {}
102
103 for (fname, fvals) in self._feature_detector(sample):
104 sample_dist = FreqDist()
105 for fval in fvals:
106 sample_dist.inc(fval)
107 sample_rank[fname] = self._get_ranks(sample_dist)
108
109
110 for cls in self._classes:
111 rank_diff[cls] = 0
112 totalfvals[cls] = 0
113
114 for fname in sample_rank:
115 for cls in self._classes:
116 tmp_clslist = self._cls_rank[cls, fname]
117 tmp_smplist = sample_rank[fname]
118
119
120
121 tmp_clslist.extend(list(set(tmp_smplist).difference(set(tmp_clslist))))
122 tmp_smplist.extend(list(set(tmp_clslist).difference(set(tmp_smplist))))
123
124 totalfvals[cls] += len(tmp_clslist)
125
126 for fval in tmp_smplist:
127 rank_diff[cls] += pow(tmp_smplist.index(fval) - tmp_clslist.index(fval), 2)
128
129 for cls in self._classes:
130 score[cls] = 1 - (float(6 * rank_diff[cls]) / (pow(totalfvals[cls], 3) - totalfvals[cls]))
131
132 return score
133
134
135
137 """
138 @param sample_dist: sample frequency distribution
139 @ret: ordered list of features.
140 """
141
142 samples = sample_dist.samples()
143 ordered_list = [item for (freq, item) in sorted([(sample_dist.count(item),item) for item in samples], reverse=True)]
144 return ordered_list[:self._crop_data]
145
147 return '<SpearmanClassifier: classes=%d>' % len(self._classes)
148
149
150
177
178
180 from nltk_lite.contrib import classify
181 from nltk_lite import detect
182
183 fd = detect.feature({"2-tup": lambda t: [t[n:n+2] for n in range(len(t)-1)]})
184
185 classifier = classify.spearman.Spearman(fd)
186 trainning_data = {"class a": "aaaaaab",
187 "class b": "bbbbbba"}
188 classifier.train(trainning_data)
189
190 result = classifier.get_class_dict("aaababb")
191
192 for cls in result:
193 print cls, ':', result[cls]
194 """
195 expected values:
196 class a: 'aa' = 1
197 'ab' = 2
198 b: 'bb' = 1
199 'ba' = 2
200 sample: 'aa' = 1
201 'ab' = 1
202 'ba' = 2
203 'bb' = 2
204
205
206 score a: 1/ 1+0+1+5+5 = 0.5
207 score b: 1/ 1+1+0+6+6 = 0.5
208 """
209
210
211
213 from nltk_lite.contrib import classify
214 from nltk_lite import detect
215
216 fd = detect.feature({"1-tup": lambda t: [t[n] for n in range(len(t))],
217 "2-tup": lambda t: [t[n:n+2] for n in range(len(t)-1)]})
218
219 classifier = classify.spearman.Spearman(fd)
220 trainning_data = {"class a": "aaaaaab",
221 "class b": "bbbbbba"}
222 classifier.train(trainning_data)
223
224 result = classifier.get_class_dict("aaababb")
225
226 for cls in result:
227 print cls, ':', result[cls]
228
229 """
230 expected values:
231 class a: 'a' = 1
232 'b' = 3
233 'aa' = 2
234 'ab' = 3
235 b: 'a' = 3
236 'b' = 1
237 'bb' = 2
238 'ba' = 3
239 sample: 'a' = 1
240 'b' = 2
241 'aa' = 3
242 'ab' = 3
243 'ba' = 4
244 'bb' = 4
245
246 score a: 1/ 1+0+1+1+0+3+3 = 1/ 9
247 score b: 1/ 1+2+1+2+1+4+4 = 1/ 15
248 """
249
250
252 from nltk_lite.contrib import classify
253 from nltk_lite import detect
254
255 from nltk_lite.corpora import genesis
256 from itertools import islice
257
258 fd = detect.feature({"2-tup": lambda t: [' '.join(t)[n:n+2] for n in range(len(' '.join(t))-1)],
259 "words": lambda t: t})
260
261 classifier = classify.spearman.Spearman(fd)
262 training_data = {}
263 training_data["english-kjv"] = list(islice(genesis.raw("english-kjv"), 0, 400))
264 training_data["french"] = list(islice(genesis.raw("french"), 0, 400))
265 training_data["finnish"] = list(islice(genesis.raw("finnish"), 0, 400))
266
267 classifier.train(training_data)
268
269 result = classifier.get_class_probs(list(islice(genesis.raw("english-kjv"), 150, 200)))
270
271 print 'english-kjv :', result.prob('english-kjv')
272 print 'french :', result.prob('french')
273 print 'finnish :', result.prob('finnish')
274
275
276 if __name__ == '__main__':
277 demo2()
278