SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2008 Gunnar Raetsch 00008 * Written (W) 2007-2009 Soeren Sonnenburg 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <shogun/clustering/KMeans.h> 00013 #include <shogun/distance/Distance.h> 00014 #include <shogun/labels/Labels.h> 00015 #include <shogun/features/DenseFeatures.h> 00016 #include <shogun/mathematics/Math.h> 00017 #include <shogun/base/Parallel.h> 00018 00019 #ifdef HAVE_PTHREAD 00020 #include <pthread.h> 00021 #endif 00022 00023 #define MUSRECALC 00024 00025 #define PAR_THRESH 10 00026 00027 using namespace shogun; 00028 00029 CKMeans::CKMeans() 00030 : CDistanceMachine() 00031 { 00032 init(); 00033 } 00034 00035 CKMeans::CKMeans(int32_t k_, CDistance* d) 00036 : CDistanceMachine() 00037 { 00038 init(); 00039 k=k_; 00040 set_distance(d); 00041 } 00042 00043 CKMeans::~CKMeans() 00044 { 00045 } 00046 00047 bool CKMeans::train_machine(CFeatures* data) 00048 { 00049 ASSERT(distance); 00050 00051 if (data) 00052 distance->init(data, data); 00053 00054 ASSERT(distance->get_feature_type()==F_DREAL); 00055 00056 CDenseFeatures<float64_t>* lhs= 00057 (CDenseFeatures<float64_t>*)distance->get_lhs(); 00058 ASSERT(lhs); 00059 int32_t num=lhs->get_num_vectors(); 00060 SG_UNREF(lhs); 00061 00062 Weights=SGVector<float64_t>(num); 00063 for (int32_t i=0; i<num; i++) 00064 Weights.vector[i]=1.0; 00065 00066 clustknb(false, NULL); 00067 00068 return true; 00069 } 00070 00071 bool CKMeans::load(FILE* srcfile) 00072 { 00073 SG_SET_LOCALE_C; 00074 SG_RESET_LOCALE; 00075 return false; 00076 } 00077 00078 bool CKMeans::save(FILE* dstfile) 00079 { 00080 SG_SET_LOCALE_C; 00081 SG_RESET_LOCALE; 00082 return false; 00083 } 00084 00085 00086 void CKMeans::set_k(int32_t p_k) 00087 { 00088 ASSERT(p_k>0); 00089 this->k=p_k; 00090 } 00091 00092 int32_t CKMeans::get_k() 00093 { 00094 return k; 00095 } 00096 00097 void CKMeans::set_max_iter(int32_t iter) 00098 { 00099 ASSERT(iter>0); 00100 max_iter=iter; 00101 } 00102 00103 float64_t CKMeans::get_max_iter() 00104 { 00105 return max_iter; 00106 } 00107 00108 SGVector<float64_t> CKMeans::get_radiuses() 00109 { 00110 return R; 00111 } 00112 00113 SGMatrix<float64_t> CKMeans::get_cluster_centers() 00114 { 00115 if (!R.vector) 00116 return SGMatrix<float64_t>(); 00117 00118 CDenseFeatures<float64_t>* lhs= 00119 (CDenseFeatures<float64_t>*)distance->get_lhs(); 00120 SGMatrix<float64_t> centers=lhs->get_feature_matrix(); 00121 SG_UNREF(lhs); 00122 return centers; 00123 } 00124 00125 int32_t CKMeans::get_dimensions() 00126 { 00127 return dimensions; 00128 } 00129 00130 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00131 struct thread_data 00132 { 00133 float64_t* x; 00134 CDenseFeatures<float64_t>* y; 00135 float64_t* z; 00136 int32_t n1, n2, m; 00137 int32_t js, je; /* defines the matrix stripe */ 00138 int32_t offs; 00139 }; 00140 #endif // DOXYGEN_SHOULD_SKIP_THIS 00141 00142 namespace shogun 00143 { 00144 void *sqdist_thread_func(void * P) 00145 { 00146 struct thread_data *TD=(struct thread_data*) P; 00147 float64_t* x=TD->x; 00148 CDenseFeatures<float64_t>* y=TD->y; 00149 float64_t* z=TD->z; 00150 int32_t n1=TD->n1, 00151 m=TD->m, 00152 js=TD->js, 00153 je=TD->je, 00154 offs=TD->offs, 00155 j,i,k; 00156 00157 for (j=js; j<je; j++) 00158 { 00159 int32_t vlen=0; 00160 bool vfree=false; 00161 float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree); 00162 00163 for (i=0; i<n1; i++) 00164 { 00165 float64_t sum=0; 00166 for (k=0; k<m; k++) 00167 sum = sum + CMath::sq(x[i*m + k] - vec[k]); 00168 z[j*n1 + i] = sum; 00169 } 00170 00171 y->free_feature_vector(vec, j, vfree); 00172 } 00173 return NULL; 00174 } 00175 } 00176 00177 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start) 00178 { 00179 ASSERT(distance && distance->get_feature_type()==F_DREAL); 00180 CDenseFeatures<float64_t>* lhs = (CDenseFeatures<float64_t>*) distance->get_lhs(); 00181 ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0); 00182 00183 int32_t XSize=lhs->get_num_vectors(); 00184 dimensions=lhs->get_num_features(); 00185 int32_t i, changed=1; 00186 const int32_t XDimk=dimensions*k; 00187 int32_t iter=0; 00188 00189 R=SGVector<float64_t>(k); 00190 00191 mus=SGMatrix<float64_t>(dimensions, k); 00192 00193 int32_t *ClList=SG_CALLOC(int32_t, XSize); 00194 float64_t *weights_set=SG_CALLOC(float64_t, k); 00195 float64_t *dists=SG_CALLOC(float64_t, k*XSize); 00196 00198 CDenseFeatures<float64_t>* rhs_mus = new CDenseFeatures<float64_t>(0); 00199 CFeatures* rhs_cache = distance->replace_rhs(rhs_mus); 00200 00201 int32_t vlen=0; 00202 bool vfree=false; 00203 float64_t* vec=NULL; 00204 00205 /* ClList=zeros(XSize,1) ; */ 00206 memset(ClList, 0, sizeof(int32_t)*XSize); 00207 /* weights_set=zeros(k,1) ; */ 00208 memset(weights_set, 0, sizeof(float64_t)*k); 00209 00210 /* cluster_centers=zeros(dimensions, k) ; */ 00211 memset(mus.matrix, 0, sizeof(float64_t)*XDimk); 00212 00213 if (!use_old_mus) 00214 { 00215 for (i=0; i<XSize; i++) 00216 { 00217 const int32_t Cl=CMath::random(0, k-1); 00218 int32_t j; 00219 float64_t weight=Weights.vector[i]; 00220 00221 weights_set[Cl]+=weight; 00222 ClList[i]=Cl; 00223 00224 vec=lhs->get_feature_vector(i, vlen, vfree); 00225 00226 for (j=0; j<dimensions; j++) 00227 mus.matrix[Cl*dimensions+j] += weight*vec[j]; 00228 00229 lhs->free_feature_vector(vec, i, vfree); 00230 } 00231 for (i=0; i<k; i++) 00232 { 00233 int32_t j; 00234 00235 if (weights_set[i]!=0.0) 00236 for (j=0; j<dimensions; j++) 00237 mus.matrix[i*dimensions+j] /= weights_set[i]; 00238 } 00239 } 00240 else 00241 { 00242 ASSERT(mus_start); 00243 00245 rhs_mus->copy_feature_matrix(SGMatrix<float64_t>(mus_start,dimensions,k)); 00246 float64_t* p_dists=dists; 00247 00248 for(int32_t idx=0;idx<XSize;idx++,p_dists+=k) 00249 distances_rhs(p_dists,0,k,idx); 00250 p_dists=NULL; 00251 00252 for (i=0; i<XSize; i++) 00253 { 00254 float64_t mini=dists[i*k]; 00255 int32_t Cl = 0, j; 00256 00257 for (j=1; j<k; j++) 00258 { 00259 if (dists[i*k+j]<mini) 00260 { 00261 Cl=j; 00262 mini=dists[i*k+j]; 00263 } 00264 } 00265 ClList[i]=Cl; 00266 } 00267 00268 /* Compute the sum of all points belonging to a cluster 00269 * and count the points */ 00270 for (i=0; i<XSize; i++) 00271 { 00272 const int32_t Cl = ClList[i]; 00273 float64_t weight=Weights.vector[i]; 00274 weights_set[Cl]+=weight; 00275 #ifndef MUSRECALC 00276 vec=lhs->get_feature_vector(i, vlen, vfree); 00277 00278 for (j=0; j<dimensions; j++) 00279 mus.matrix[Cl*dimensions+j] += weight*vec[j]; 00280 00281 lhs->free_feature_vector(vec, i, vfree); 00282 #endif 00283 } 00284 #ifndef MUSRECALC 00285 /* normalization to get the mean */ 00286 for (i=0; i<k; i++) 00287 { 00288 if (weights_set[i]!=0.0) 00289 { 00290 int32_t j; 00291 for (j=0; j<dimensions; j++) 00292 mus.matrix[i*dimensions+j] /= weights_set[i]; 00293 } 00294 } 00295 #endif 00296 } 00297 00298 00299 00300 while (changed && (iter<max_iter)) 00301 { 00302 iter++; 00303 if (iter==max_iter-1) 00304 SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1); 00305 00306 if (iter%1000 == 0) 00307 SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed); 00308 changed=0; 00309 00310 #ifdef MUSRECALC 00311 /* mus=zeros(dimensions, k) ; */ 00312 memset(mus.matrix, 0, sizeof(float64_t)*XDimk); 00313 00314 for (i=0; i<XSize; i++) 00315 { 00316 int32_t j; 00317 int32_t Cl=ClList[i]; 00318 float64_t weight=Weights.vector[i]; 00319 00320 vec=lhs->get_feature_vector(i, vlen, vfree); 00321 00322 for (j=0; j<dimensions; j++) 00323 mus.matrix[Cl*dimensions+j] += weight*vec[j]; 00324 00325 lhs->free_feature_vector(vec, i, vfree); 00326 } 00327 for (i=0; i<k; i++) 00328 { 00329 int32_t j; 00330 00331 if (weights_set[i]!=0.0) 00332 for (j=0; j<dimensions; j++) 00333 mus.matrix[i*dimensions+j] /= weights_set[i]; 00334 } 00335 #endif 00336 00337 rhs_mus->copy_feature_matrix(mus); 00338 00339 for (i=0; i<XSize; i++) 00340 { 00341 /* ks=ceil(rand(1,XSize)*XSize) ; */ 00342 const int32_t Pat= CMath::random(0, XSize-1); 00343 const int32_t ClList_Pat=ClList[Pat]; 00344 int32_t imini, j; 00345 float64_t mini, weight; 00346 00347 weight=Weights.vector[Pat]; 00348 00349 /* compute the distance of this point to all centers */ 00350 for(int32_t idx_k=0;idx_k<k;idx_k++) 00351 dists[idx_k]=distance->distance(Pat,idx_k); 00352 00353 /* [mini,imini]=min(dists(:,i)) ; */ 00354 imini=0 ; mini=dists[0]; 00355 for (j=1; j<k; j++) 00356 if (dists[j]<mini) 00357 { 00358 mini=dists[j]; 00359 imini=j; 00360 } 00361 00362 if (imini!=ClList_Pat) 00363 { 00364 changed= changed + 1; 00365 00366 /* weights_set(imini) = weights_set(imini) + weight ; */ 00367 weights_set[imini]+= weight; 00368 /* weights_set(j) = weights_set(j) - weight ; */ 00369 weights_set[ClList_Pat]-= weight; 00370 00371 vec=lhs->get_feature_vector(Pat, vlen, vfree); 00372 00373 for (j=0; j<dimensions; j++) 00374 { 00375 mus.matrix[imini*dimensions+j]-=(vec[j] 00376 -mus.matrix[imini*dimensions+j]) 00377 *(weight/weights_set[imini]); 00378 } 00379 00380 lhs->free_feature_vector(vec, Pat, vfree); 00381 00382 /* mu_new = mu_old - (x - mu_old)/(n-1) */ 00383 /* if weights_set(j)~=0 */ 00384 if (weights_set[ClList_Pat]!=0.0) 00385 { 00386 vec=lhs->get_feature_vector(Pat, vlen, vfree); 00387 00388 for (j=0; j<dimensions; j++) 00389 { 00390 mus.matrix[ClList_Pat*dimensions+j]-= 00391 (vec[j] 00392 -mus.matrix[ClList_Pat 00393 *dimensions+j]) 00394 *(weight/weights_set[ClList_Pat]); 00395 } 00396 lhs->free_feature_vector(vec, Pat, vfree); 00397 } 00398 else 00399 /* mus(:,j)=zeros(dimensions,1) ; */ 00400 for (j=0; j<dimensions; j++) 00401 mus.matrix[ClList_Pat*dimensions+j]=0; 00402 00403 /* ClList(i)= imini ; */ 00404 ClList[Pat] = imini; 00405 } 00406 } 00407 } 00408 00409 /* compute the ,,variances'' of the clusters */ 00410 for (i=0; i<k; i++) 00411 { 00412 float64_t rmin1=0; 00413 float64_t rmin2=0; 00414 00415 bool first_round=true; 00416 00417 for (int32_t j=0; j<k; j++) 00418 { 00419 if (j!=i) 00420 { 00421 int32_t l; 00422 float64_t dist = 0; 00423 00424 for (l=0; l<dimensions; l++) 00425 { 00426 dist+=CMath::sq( 00427 mus.matrix[i*dimensions+l] 00428 -mus.matrix[j*dimensions+l]); 00429 } 00430 00431 if (first_round) 00432 { 00433 rmin1=dist; 00434 rmin2=dist; 00435 first_round=false; 00436 } 00437 else 00438 { 00439 if ((dist<rmin2) && (dist>=rmin1)) 00440 rmin2=dist; 00441 00442 if (dist<rmin1) 00443 { 00444 rmin2=rmin1; 00445 rmin1=dist; 00446 } 00447 } 00448 } 00449 } 00450 00451 R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2)); 00452 } 00453 00454 distance->replace_rhs(rhs_cache); 00455 delete rhs_mus; 00456 SG_FREE(ClList); 00457 SG_FREE(weights_set); 00458 SG_FREE(dists); 00459 SG_UNREF(lhs); 00460 } 00461 00462 void CKMeans::store_model_features() 00463 { 00464 /* set lhs of underlying distance to cluster centers */ 00465 CDenseFeatures<float64_t>* cluster_centers=new CDenseFeatures<float64_t>( 00466 mus); 00467 00468 /* store cluster centers in lhs of distance variable */ 00469 CFeatures* rhs=distance->get_rhs(); 00470 distance->init(cluster_centers, rhs); 00471 SG_UNREF(rhs); 00472 } 00473 00474 void CKMeans::init() 00475 { 00476 max_iter=10000; 00477 k=3; 00478 dimensions=0; 00479 00480 SG_ADD(&max_iter, "max_iter", "Maximum number of iterations", MS_AVAILABLE); 00481 SG_ADD(&k, "k", "k, the number of clusters", MS_AVAILABLE); 00482 SG_ADD(&dimensions, "dimensions", "Dimensions of data", MS_NOT_AVAILABLE); 00483 SG_ADD(&R, "R", "Cluster radiuses", MS_NOT_AVAILABLE); 00484 } 00485