
Source Code for Module nltk_lite.cluster.em

# Natural Language Toolkit: Expectation Maximization Clusterer
#
# Copyright (C) 2004-2007 University of Melbourne
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Porting: Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT

from nltk_lite.cluster import *

class EM(VectorSpace):
    """
    The Gaussian EM clusterer models the vectors as being produced by
    a mixture of k Gaussian sources. The parameters of these sources
    (prior probability, mean and covariance matrix) are then found to
    maximise the likelihood of the given data. This is done with the
    expectation maximisation algorithm. It starts with k arbitrarily
    chosen means, priors and covariance matrices. It then calculates
    the membership probabilities for each vector in each of the
    clusters; this is the 'E' step. The cluster parameters are then
    updated in the 'M' step using the maximum likelihood estimate from
    the cluster membership probabilities. This process continues until
    the likelihood of the data does not significantly increase.
    """

    def __init__(self, initial_means, priors=None, covariance_matrices=None,
                 conv_threshold=1e-6, bias=0.1, normalise=False,
                 svd_dimensions=None):
        """
        Creates an EM clusterer with the given starting parameters,
        convergence threshold and vector mangling parameters.

        @param initial_means: the means of the gaussian cluster centers
        @type initial_means: [seq of] numpy array or seq of SparseArray
        @param priors: the prior probability for each cluster
        @type priors: numpy array or seq of float
        @param covariance_matrices: the covariance matrix for each cluster
        @type covariance_matrices: [seq of] numpy array
        @param conv_threshold: maximum change in likelihood before deemed
            convergent
        @type conv_threshold: int or float
        @param bias: variance bias used to ensure non-singular covariance
            matrices
        @type bias: float
        @param normalise: should vectors be normalised to length 1
        @type normalise: boolean
        @param svd_dimensions: number of dimensions to use in reducing vector
            dimensionality with SVD
        @type svd_dimensions: int
        """
        VectorSpace.__init__(self, normalise, svd_dimensions)
        self._means = array(initial_means, numpy.float64)
        self._num_clusters = len(initial_means)
        self._conv_threshold = conv_threshold
        self._covariance_matrices = covariance_matrices
        self._priors = priors
        self._bias = bias

    def num_clusters(self):
        return self._num_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        assert len(vectors) > 0

        # set the parameters to initial values
        dimensions = len(vectors[0])
        means = self._means
        priors = self._priors
        if priors is None:
            priors = self._priors = numpy.ones(self._num_clusters,
                                        numpy.float64) / self._num_clusters
        covariances = self._covariance_matrices
        if covariances is None:
            covariances = self._covariance_matrices = \
                [numpy.identity(dimensions, numpy.float64)
                 for i in range(self._num_clusters)]

        # do the E and M steps until the likelihood plateaus
        lastl = self._loglikelihood(vectors, priors, means, covariances)
        converged = False

        while not converged:
            if trace: print 'iteration; loglikelihood', lastl
            # E-step, calculate hidden variables, h[i,j]
            h = numpy.zeros((len(vectors), self._num_clusters),
                            numpy.float64)
            for i in range(len(vectors)):
                for j in range(self._num_clusters):
                    h[i,j] = priors[j] * self._gaussian(means[j],
                                             covariances[j], vectors[i])
                h[i,:] /= sum(h[i,:])

            # M-step, update parameters - cvm, p, mean
            for j in range(self._num_clusters):
                covariance_before = covariances[j]
                new_covariance = numpy.zeros((dimensions, dimensions),
                                             numpy.float64)
                new_mean = numpy.zeros(dimensions, numpy.float64)
                sum_hj = 0.0
                for i in range(len(vectors)):
                    delta = vectors[i] - means[j]
                    new_covariance += h[i,j] * \
                        numpy.multiply.outer(delta, delta)
                    sum_hj += h[i,j]
                    new_mean += h[i,j] * vectors[i]
                covariances[j] = new_covariance / sum_hj
                means[j] = new_mean / sum_hj
                priors[j] = sum_hj / len(vectors)

                # bias term to stop covariance matrix being singular
                covariances[j] += self._bias * \
                    numpy.identity(dimensions, numpy.float64)

            # calculate likelihood - FIXME: may be broken
            l = self._loglikelihood(vectors, priors, means, covariances)

            # check for convergence
            if abs(lastl - l) < self._conv_threshold:
                converged = True
            lastl = l

    def classify_vectorspace(self, vector):
        best = None
        for j in range(self._num_clusters):
            p = self._priors[j] * self._gaussian(self._means[j],
                                      self._covariance_matrices[j], vector)
            if not best or p > best[0]:
                best = (p, j)
        return best[1]

    def likelihood_vectorspace(self, vector, cluster):
        cid = self.cluster_names().index(cluster)
        return self._priors[cid] * self._gaussian(self._means[cid],
                   self._covariance_matrices[cid], vector)

    def _gaussian(self, mean, cvm, x):
        m = len(mean)
        assert cvm.shape == (m, m), \
            'bad sized covariance matrix, %s' % str(cvm.shape)
        try:
            det = linalg.det(cvm)
            inv = linalg.inv(cvm)
            a = det ** -0.5 * (2 * numpy.pi) ** (-m / 2.0)
            dx = x - mean
            b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
            return a * numpy.exp(b)
        except OverflowError:
            # happens when the exponent is negative infinity - i.e. b = 0
            # i.e. the inverse of cvm is huge (cvm is almost zero)
            return 0
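
    # For reference, _gaussian() evaluates the multivariate normal density
    #
    #   N(x; mean, cvm) = (2*pi)^(-m/2) * det(cvm)^(-1/2)
    #                     * exp(-0.5 * (x - mean)^T * cvm^(-1) * (x - mean))
    #
    # which is the a * exp(b) computed in the try block above.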

    def _loglikelihood(self, vectors, priors, means, covariances):
        llh = 0.0
        for vector in vectors:
            p = 0
            for j in range(len(priors)):
                p += priors[j] * \
                    self._gaussian(means[j], covariances[j], vector)
            llh += numpy.log(p)
        return llh
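
    # That is, the data log likelihood under the current mixture,
    #
    #   log L = sum_i log( sum_j P(j) * N(x_i; mean_j, cvm_j) )
    #
    # which the E and M steps above aim to increase.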

    def __repr__(self):
        return '<EM Clusterer means=%s>' % list(self._means)

def euclidean_distance(u, v):
    """
    Returns the euclidean distance between vectors u and v. This is equivalent
    to the length of the vector (u - v).
    """
    diff = u - v
    return math.sqrt(numpy.dot(diff, diff))

def cosine_distance(u, v):
    """
    Returns the cosine of the angle between vectors v and u. This is equal to
    u.v / |u||v|.
    """
    return numpy.dot(u, v) / (math.sqrt(numpy.dot(u, u)) * math.sqrt(numpy.dot(v, v)))

def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """

    from nltk_lite import cluster

    # example from figure 14.10, page 519, Manning and Schutze

    vectors = [array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
    means = [[4, 2], [4, 2.01]]

    clusterer = cluster.EM(means, bias=0.1)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print 'Clustered:', vectors
    print 'As:       ', clusters
    print

    for c in range(2):
        print 'Cluster:', c
        print 'Prior:  ', clusterer._priors[c]
        print 'Mean:   ', clusterer._means[c]
        print 'Covar:  ', clusterer._covariance_matrices[c]
        print

    # classify a new vector
    vector = array([2, 2])
    print 'classify(%s):' % vector,
    print clusterer.classify(vector)

    # show the classification probabilities
    vector = array([2, 2])
    print 'classification_probdist(%s):' % vector
    pdist = clusterer.classification_probdist(vector)
    for sample in pdist.samples():
        print '%s => %.0f%%' % (sample, pdist.prob(sample) * 100)

#
# The following demo code is broken.
#
# # use a set of tokens with 2D indices
# vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

# # test the EM clusterer with means given by k-means (2) and
# # dimensionality reduction
# clusterer = cluster.KMeans(2, euclidean_distance, svd_dimensions=1)
# print 'Clusterer:', clusterer
# clusters = clusterer.cluster(vectors)
# means = clusterer.means()
# print 'Means:', clusterer.means()
# print

# clusterer = cluster.EM(means, svd_dimensions=1)
# clusters = clusterer.cluster(vectors, True)
# print 'Clusterer:', clusterer
# print 'Clustered:', str(vectors)[:60], '...'
# print 'As:', str(clusters)[:60], '...'
# print

# # classify a new vector
# vector = array([3, 3])
# print 'classify(%s):' % vector,
# print clusterer.classify(vector)
# print

# # show the classification probabilities
# vector = array([2.2, 2])
# print 'classification_probdist(%s)' % vector
# pdist = clusterer.classification_probdist(vector)
# for sample in pdist.samples():
#     print '%s => %.0f%%' % (sample, pdist.prob(sample) * 100)

if __name__ == '__main__':
    demo()