
Source Code for Module nltk_lite.cluster.kmeans

# Natural Language Toolkit: K-Means Clusterer
#
# Copyright (C) 2004-2007 University of Melbourne
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Porting: Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT

from nltk_lite.cluster import *
class KMeans(VectorSpace):
    """
    The K-means clusterer starts with k arbitrarily chosen means, then
    allocates each vector to the cluster with the closest mean. It then
    recalculates the means of each cluster as the centroid of the vectors
    in the cluster. This process repeats until the cluster memberships
    stabilise. This is a hill-climbing algorithm which may converge to a
    local maximum. Hence the clustering is often repeated with random
    initial means and the most commonly occurring output means are chosen.
    """
    def __init__(self, num_means, distance, repeats=1,
                 conv_test=1e-6, initial_means=None,
                 normalise=False, svd_dimensions=None,
                 rng=None):
        """
        @param num_means: the number of means to use (may use fewer)
        @type num_means: int
        @param distance: measure of distance between two vectors
        @type distance: function taking two vectors and returning a float
        @param repeats: number of randomised clustering trials to use
        @type repeats: int
        @param conv_test: maximum variation in mean differences before
                          deemed convergent
        @type conv_test: number
        @param initial_means: set of k initial means
        @type initial_means: sequence of vectors
        @param normalise: should vectors be normalised to length 1
        @type normalise: boolean
        @param svd_dimensions: number of dimensions to use in reducing vector
                               dimensionality with SVD
        @type svd_dimensions: int
        @param rng: random number generator (or None)
        @type rng: Random
        """
        VectorSpace.__init__(self, normalise, svd_dimensions)
        self._num_means = num_means
        self._distance = distance
        self._max_difference = conv_test
        assert not initial_means or len(initial_means) == num_means
        self._means = initial_means
        assert repeats >= 1
        assert not (initial_means and repeats > 1)
        self._repeats = repeats
        if rng: self._rng = rng
        else:   self._rng = random.Random()
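    # Example construction (a sketch; euclidean_distance is the distance
    # function used in demo() below, exported by nltk_lite.cluster):
    #
    #   clusterer = KMeans(2, euclidean_distance, repeats=10)
    #
    # Note that initial_means and repeats > 1 are asserted to be mutually
    # exclusive: repeated trials are only useful with fresh random starts.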
    def cluster_vectorspace(self, vectors, trace=False):
        if self._means and self._repeats > 1:
            print 'Warning: means will be discarded for subsequent trials'

        meanss = []
        for trial in range(self._repeats):
            if trace: print 'k-means trial', trial
            # choose fresh random means for each new trial
            if not self._means or trial > 0:
                self._means = self._rng.sample(vectors, self._num_means)
            self._cluster_vectorspace(vectors, trace)
            meanss.append(self._means)

        if len(meanss) > 1:
            # sort the means first (so that different cluster numbering won't
            # affect the distance comparison)
            for means in meanss:
                means.sort(cmp=_vector_compare)

            # find the set of means that's minimally different from the others
            min_difference = min_means = None
            for i in range(len(meanss)):
                d = 0
                for j in range(len(meanss)):
                    if i != j:
                        d += self._sum_distances(meanss[i], meanss[j])
                if min_difference is None or d < min_difference:
                    min_difference, min_means = d, meanss[i]

            # use the best means
            self._means = min_means
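    # Illustration of the selection above (values invented): if three trials
    # give summed distances to the other trials' means of d(0)=4.1, d(1)=0.9
    # and d(2)=3.8, the means from trial 1 are kept as the most "central",
    # i.e. most commonly reached, solution.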
    def _cluster_vectorspace(self, vectors, trace=False):
        if self._num_means < len(vectors):
            # perform k-means clustering
            converged = False
            while not converged:
                # assign the tokens to clusters based on minimum distance to
                # the cluster means
                clusters = [[] for m in range(self._num_means)]
                for vector in vectors:
                    index = self.classify_vectorspace(vector)
                    clusters[index].append(vector)

                if trace: print 'iteration'
                #for i in range(self._num_means):
                    #print '  mean', i, 'allocated', len(clusters[i]), 'vectors'

                # recalculate cluster means by computing the centroid of each cluster
                new_means = map(self._centroid, clusters)

                # measure the degree of change from the previous step for convergence
                difference = self._sum_distances(self._means, new_means)
                if difference < self._max_difference:
                    converged = True

                # remember the new means
                self._means = new_means
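    # Convergence illustration (values invented): with the default
    # conv_test=1e-6, the loop stops once the summed distance between
    # successive means, e.g. _sum_distances(old_means, new_means) == 3e-7,
    # drops below that threshold.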
    def classify_vectorspace(self, vector):
        # finds the closest cluster centroid
        # returns that cluster's index
        best_distance = best_index = None
        for index in range(len(self._means)):
            mean = self._means[index]
            dist = self._distance(vector, mean)
            if best_distance is None or dist < best_distance:
                best_index, best_distance = index, dist
        return best_index

    def num_clusters(self):
        if self._means:
            return len(self._means)
        else:
            return self._num_means

    def means(self):
        """
        The means used for clustering.
        """
        return self._means

    def _sum_distances(self, vectors1, vectors2):
        difference = 0.0
        for u, v in zip(vectors1, vectors2):
            difference += self._distance(u, v)
        return difference

    def _centroid(self, cluster):
        assert len(cluster) > 0
        centroid = copy.copy(cluster[0])
        for vector in cluster[1:]:
            centroid += vector
        return centroid / float(len(cluster))
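    # Worked example (values invented): for a cluster of two vectors,
    #
    #   _centroid([array([2, 1]), array([4, 3])])  =>  array([3., 2.])
    #
    # i.e. the element-wise mean, (2+4)/2 = 3 and (1+3)/2 = 2.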
    def __repr__(self):
        return '<KMeans Clusterer means=%s repeats=%d>' % \
               (self._means, self._repeats)

def _vector_compare(x, y):
    xs, ys = sum(x), sum(y)
    if xs < ys:   return -1
    elif xs > ys: return 1
    else:         return 0
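# For example, sorting [array([5, 5]), array([4, 3])] with this comparator
# yields [array([4, 3]), array([5, 5])] (component sums 7 and 10). The
# ordering itself is arbitrary; it only needs to be consistent, so that
# means from different k-means trials line up for comparison in
# cluster_vectorspace.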
#################################################################################
def demo():
    # example from figure 14.9, page 517, Manning and Schutze

    from nltk_lite import cluster

    vectors = [array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = cluster.KMeans(2, euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print 'Clustered:', vectors
    print 'As:', clusters
    print 'Means:', clusterer.means()
    print

    vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds

    clusterer = cluster.KMeans(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print 'Clustered:', vectors
    print 'As:', clusters
    print 'Means:', clusterer.means()
    print

    # classify a new vector
    vector = array([3, 3])
    print 'classify(%s):' % vector,
    print clusterer.classify(vector)
    print

if __name__ == '__main__':
    demo()
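# A further usage sketch (not part of the original demo): the same data can
# be clustered with vectors normalised to unit length first, via the
# normalise flag documented in KMeans.__init__ above.
#
#   from nltk_lite import cluster
#   clusterer = cluster.KMeans(2, euclidean_distance, repeats=5,
#                              normalise=True)
#   clusters = clusterer.cluster(vectors, True)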