SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2011 Sergey Lisitsyn 00008 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society 00009 */ 00010 00011 #include <shogun/converter/MultidimensionalScaling.h> 00012 #ifdef HAVE_LAPACK 00013 #include <shogun/converter/EmbeddingConverter.h> 00014 #include <shogun/mathematics/lapack.h> 00015 #include <shogun/mathematics/arpack.h> 00016 #include <shogun/distance/CustomDistance.h> 00017 #include <shogun/lib/common.h> 00018 #include <shogun/mathematics/Math.h> 00019 #include <shogun/mathematics/Statistics.h> 00020 #include <shogun/io/SGIO.h> 00021 #include <shogun/distance/EuclideanDistance.h> 00022 00023 #ifdef HAVE_PTHREAD 00024 #include <pthread.h> 00025 #endif 00026 00027 using namespace shogun; 00028 00029 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00030 struct TRIANGULATION_THREAD_PARAM 00031 { 00033 int32_t idx_start; 00035 int32_t idx_stop; 00037 int32_t idx_step; 00039 int32_t lmk_N; 00041 int32_t total_N; 00043 int32_t m_target_dim; 00045 float64_t* current_dist_to_lmks; 00047 float64_t* lmk_feature_matrix; 00049 float64_t* new_feature_matrix; 00051 const float64_t* distance_matrix; 00053 const float64_t* mean_sq_dist_vector; 00055 const int32_t* lmk_idxs; 00057 const bool* to_process; 00058 }; 00059 #endif 00060 00061 CMultidimensionalScaling::CMultidimensionalScaling() : CEmbeddingConverter() 00062 { 00063 m_eigenvalues = SGVector<float64_t>(); 00064 m_landmark_number = 3; 00065 m_landmark = false; 00066 00067 init(); 00068 } 00069 00070 void CMultidimensionalScaling::init() 00071 { 00072 SG_ADD(&m_eigenvalues, "eigenvalues", "eigenvalues of last embedding", 00073 MS_NOT_AVAILABLE); 00074 SG_ADD(&m_landmark, "landmark", 00075 "indicates if landmark approximation should be used", MS_NOT_AVAILABLE); 00076 SG_ADD(&m_landmark_number, "landmark number", 00077 "the number of landmarks for approximation", MS_AVAILABLE); 00078 } 00079 00080 CMultidimensionalScaling::~CMultidimensionalScaling() 00081 { 00082 } 00083 00084 SGVector<float64_t> CMultidimensionalScaling::get_eigenvalues() const 00085 { 00086 return m_eigenvalues; 00087 } 00088 00089 void CMultidimensionalScaling::set_landmark_number(int32_t num) 00090 { 00091 if (num<3) 00092 SG_ERROR("Number of landmarks should be greater than 3 to make triangulation possible while %d given.", 00093 num); 00094 m_landmark_number = num; 00095 } 00096 00097 int32_t CMultidimensionalScaling::get_landmark_number() const 00098 { 00099 return m_landmark_number; 00100 } 00101 00102 void CMultidimensionalScaling::set_landmark(bool landmark) 00103 { 00104 m_landmark = landmark; 00105 } 00106 00107 bool CMultidimensionalScaling::get_landmark() const 00108 { 00109 return m_landmark; 00110 } 00111 00112 const char* CMultidimensionalScaling::get_name() const 00113 { 00114 return "MultidimensionalScaling"; 00115 }; 00116 00117 CDenseFeatures<float64_t>* CMultidimensionalScaling::embed_distance(CDistance* distance) 00118 { 00119 ASSERT(distance); 00120 00121 // compute feature_matrix by landmark or classic embedding of distance matrix 00122 SGMatrix<float64_t> distance_matrix = process_distance_matrix(distance->get_distance_matrix()); 00123 SGMatrix<float64_t> feature_matrix; 00124 if (m_landmark) 00125 feature_matrix = landmark_embedding(distance_matrix); 00126 else 00127 feature_matrix = classic_embedding(distance_matrix); 00128 00129 return new CDenseFeatures<float64_t>(feature_matrix); 00130 } 00131 00132 SGMatrix<float64_t> CMultidimensionalScaling::process_distance_matrix(SGMatrix<float64_t> distance_matrix) 00133 { 00134 return distance_matrix; 00135 } 00136 00137 CFeatures* CMultidimensionalScaling::apply(CFeatures* features) 00138 { 00139 SG_REF(features); 00140 ASSERT(m_distance); 00141 00142 m_distance->init(features,features); 00143 CDenseFeatures<float64_t>* embedding = embed_distance(m_distance); 00144 m_distance->remove_lhs_and_rhs(); 00145 00146 SG_UNREF(features); 00147 return (CFeatures*)embedding; 00148 } 00149 00150 SGMatrix<float64_t> CMultidimensionalScaling::classic_embedding(SGMatrix<float64_t> distance_matrix) 00151 { 00152 ASSERT(m_target_dim>0); 00153 ASSERT(distance_matrix.num_cols==distance_matrix.num_rows); 00154 int32_t N = distance_matrix.num_cols; 00155 00156 // loop variables 00157 int32_t i,j; 00158 00159 // double center distance_matrix 00160 for (i=0; i<N; i++) 00161 { 00162 for (j=0; j<N; j++) 00163 distance_matrix(i,j) = CMath::sq(distance_matrix(i,j)); 00164 } 00165 SGMatrix<float64_t>::center_matrix(distance_matrix.matrix,N,N); 00166 for (i=0; i<N; i++) 00167 { 00168 for (j=0; j<N; j++) 00169 distance_matrix(i,j) = distance_matrix(i,j)*(-0.5); 00170 } 00171 00172 // feature matrix representing given distance 00173 float64_t* replace_feature_matrix = SG_MALLOC(float64_t, N*m_target_dim); 00174 00175 // status of eigenproblem to be solved 00176 int eigenproblem_status = 0; 00177 #ifdef HAVE_ARPACK 00178 // using ARPACK 00179 float64_t* eigenvalues_vector = SG_MALLOC(float64_t, m_target_dim); 00180 // solve eigenproblem with ARPACK (faster) 00181 arpack_dsxupd(distance_matrix.matrix,NULL,false,N,m_target_dim,"LM",false,1,false,false,0.0,0.0, 00182 eigenvalues_vector,replace_feature_matrix,eigenproblem_status); 00183 // check for failure 00184 ASSERT(eigenproblem_status == 0); 00185 // reverse eigenvectors order 00186 float64_t tmp; 00187 for (j=0; j<N; j++) 00188 { 00189 for (i=0; i<m_target_dim/2; i++) 00190 { 00191 tmp = replace_feature_matrix[j*m_target_dim+i]; 00192 replace_feature_matrix[j*m_target_dim+i] = 00193 replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)]; 00194 replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)] = tmp; 00195 } 00196 } 00197 // reverse eigenvalues order 00198 for (i=0; i<m_target_dim/2; i++) 00199 { 00200 tmp = eigenvalues_vector[i]; 00201 eigenvalues_vector[i] = eigenvalues_vector[m_target_dim-i-1]; 00202 eigenvalues_vector[m_target_dim-i-1] = tmp; 00203 } 00204 00205 // finally construct embedding 00206 for (i=0; i<m_target_dim; i++) 00207 { 00208 for (j=0; j<N; j++) 00209 replace_feature_matrix[j*m_target_dim+i] *= 00210 CMath::sqrt(eigenvalues_vector[i]); 00211 } 00212 00213 // set eigenvalues vector 00214 m_eigenvalues = SGVector<float64_t>(eigenvalues_vector,m_target_dim); 00215 #else /* not HAVE_ARPACK */ 00216 // using LAPACK 00217 float64_t* eigenvalues_vector = SG_MALLOC(float64_t, N); 00218 float64_t* eigenvectors = SG_MALLOC(float64_t, m_target_dim*N); 00219 // solve eigenproblem with LAPACK 00220 wrap_dsyevr('V','U',N,distance_matrix.matrix,N,N-m_target_dim+1,N,eigenvalues_vector,eigenvectors,&eigenproblem_status); 00221 // check for failure 00222 ASSERT(eigenproblem_status==0); 00223 00224 // set eigenvalues vector 00225 m_eigenvalues = SGVector<float64_t>(m_target_dim); 00226 00227 // fill eigenvalues vector in backwards order 00228 for (i=0; i<m_target_dim; i++) 00229 m_eigenvalues.vector[i] = eigenvalues_vector[m_target_dim-i-1]; 00230 00231 // construct embedding 00232 for (i=0; i<m_target_dim; i++) 00233 { 00234 for (j=0; j<N; j++) 00235 { 00236 replace_feature_matrix[j*m_target_dim+i] = 00237 eigenvectors[(m_target_dim-i-1)*N+j] * CMath::sqrt(m_eigenvalues.vector[i]); 00238 } 00239 } 00240 SG_FREE(eigenvectors); 00241 SG_FREE(eigenvalues_vector); 00242 #endif /* HAVE_ARPACK else */ 00243 00244 // warn user if there are negative or zero eigenvalues 00245 for (i=0; i<m_eigenvalues.vlen; i++) 00246 { 00247 if (m_eigenvalues.vector[i]<=0.0) 00248 { 00249 SG_WARNING("Embedding is not consistent (got neg eigenvalues): features %d-%d are wrong", 00250 i, m_eigenvalues.vlen-1); 00251 break; 00252 } 00253 } 00254 00255 return SGMatrix<float64_t>(replace_feature_matrix,m_target_dim,N); 00256 } 00257 00258 SGMatrix<float64_t> CMultidimensionalScaling::landmark_embedding(SGMatrix<float64_t> distance_matrix) 00259 { 00260 ASSERT(m_target_dim>0); 00261 ASSERT(distance_matrix.num_cols==distance_matrix.num_rows); 00262 int32_t lmk_N = m_landmark_number; 00263 int32_t i,j; 00264 int32_t total_N = distance_matrix.num_cols; 00265 if (lmk_N<3) 00266 { 00267 SG_ERROR("Number of landmarks (%d) should be greater than 3 for proper triangulation.\n", 00268 lmk_N); 00269 } 00270 if (lmk_N>total_N) 00271 { 00272 SG_ERROR("Number of landmarks (%d) should be less than total number of vectors (%d).\n", 00273 lmk_N, total_N); 00274 } 00275 00276 // get landmark indexes with random permutation 00277 SGVector<int32_t> lmk_idxs = CStatistics::sample_indices(lmk_N,total_N); 00278 // compute distances between landmarks 00279 float64_t* lmk_dist_matrix = SG_MALLOC(float64_t, lmk_N*lmk_N); 00280 for (i=0; i<lmk_N; i++) 00281 { 00282 for (j=0; j<lmk_N; j++) 00283 lmk_dist_matrix[i*lmk_N+j] = 00284 distance_matrix[lmk_idxs.vector[i]*total_N+lmk_idxs.vector[j]]; 00285 } 00286 00287 // get landmarks embedding 00288 SGMatrix<float64_t> lmk_dist_sgmatrix(lmk_dist_matrix,lmk_N,lmk_N); 00289 // compute mean vector of squared distances 00290 float64_t* mean_sq_dist_vector = SG_CALLOC(float64_t, lmk_N); 00291 for (i=0; i<lmk_N; i++) 00292 { 00293 for (j=0; j<lmk_N; j++) 00294 mean_sq_dist_vector[i] += CMath::sq(lmk_dist_matrix[i*lmk_N+j]); 00295 00296 mean_sq_dist_vector[i] /= lmk_N; 00297 } 00298 SGMatrix<float64_t> lmk_feature_matrix = classic_embedding(lmk_dist_sgmatrix); 00299 00300 // construct new feature matrix 00301 float64_t* new_feature_matrix = SG_CALLOC(float64_t, m_target_dim*total_N); 00302 00303 // fill new feature matrix with embedded landmarks 00304 for (i=0; i<lmk_N; i++) 00305 { 00306 for (j=0; j<m_target_dim; j++) 00307 new_feature_matrix[lmk_idxs.vector[i]*m_target_dim+j] = 00308 lmk_feature_matrix[i*m_target_dim+j]; 00309 } 00310 00311 // get exactly defined pseudoinverse of landmarks feature matrix 00312 ASSERT(m_eigenvalues.vector && m_eigenvalues.vlen == m_target_dim); 00313 for (i=0; i<lmk_N; i++) 00314 { 00315 for (j=0; j<m_target_dim; j++) 00316 lmk_feature_matrix[i*m_target_dim+j] /= m_eigenvalues.vector[j]; 00317 } 00318 00319 00320 // set to_process els true if should be processed 00321 bool* to_process = SG_MALLOC(bool, total_N); 00322 for (j=0; j<total_N; j++) 00323 to_process[j] = true; 00324 for (j=0; j<lmk_N; j++) 00325 to_process[lmk_idxs.vector[j]] = false; 00326 00327 // get embedding for non-landmark vectors 00328 #ifdef HAVE_PTHREAD 00329 int32_t t; 00330 int32_t num_threads = parallel->get_num_threads(); 00331 ASSERT(num_threads>0); 00332 // allocate threads and it's parameters 00333 pthread_t* threads = SG_MALLOC(pthread_t, num_threads); 00334 TRIANGULATION_THREAD_PARAM* parameters = SG_MALLOC(TRIANGULATION_THREAD_PARAM, num_threads); 00335 pthread_attr_t attr; 00336 pthread_attr_init(&attr); 00337 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); 00338 float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N*num_threads); 00339 // run threads 00340 for (t=0; t<num_threads; t++) 00341 { 00342 parameters[t].idx_start = t; 00343 parameters[t].idx_stop = total_N; 00344 parameters[t].idx_step = num_threads; 00345 parameters[t].lmk_N = lmk_N; 00346 parameters[t].total_N = total_N; 00347 parameters[t].m_target_dim = m_target_dim; 00348 parameters[t].current_dist_to_lmks = current_dist_to_lmks+t*lmk_N; 00349 parameters[t].distance_matrix = distance_matrix.matrix; 00350 parameters[t].mean_sq_dist_vector = mean_sq_dist_vector; 00351 parameters[t].lmk_idxs = lmk_idxs.vector; 00352 parameters[t].lmk_feature_matrix = lmk_feature_matrix.matrix; 00353 parameters[t].new_feature_matrix = new_feature_matrix; 00354 parameters[t].to_process = to_process; 00355 pthread_create(&threads[t], &attr, run_triangulation_thread, (void*)¶meters[t]); 00356 } 00357 // join threads 00358 for (t=0; t<num_threads; t++) 00359 pthread_join(threads[t], NULL); 00360 pthread_attr_destroy(&attr); 00361 SG_FREE(parameters); 00362 SG_FREE(threads); 00363 #else 00364 // run single 'thread' 00365 float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N); 00366 TRIANGULATION_THREAD_PARAM single_thread_param; 00367 single_thread_param.idx_start = 0; 00368 single_thread_param.idx_stop = total_N; 00369 single_thread_param.idx_step = 1; 00370 single_thread_param.lmk_N = lmk_N; 00371 single_thread_param.total_N = total_N; 00372 single_thread_param.m_target_dim = m_target_dim; 00373 single_thread_param.current_dist_to_lmks = current_dist_to_lmks; 00374 single_thread_param.distance_matrix = distance_matrix.matrix; 00375 single_thread_param.mean_sq_dist_vector = mean_sq_dist_vector; 00376 single_thread_param.lmk_idxs = lmk_idxs.vector; 00377 single_thread_param.lmk_feature_matrix = lmk_feature_matrix.matrix; 00378 single_thread_param.new_feature_matrix = new_feature_matrix; 00379 single_thread_param.to_process = to_process; 00380 run_triangulation_thread((void*)&single_thread_param); 00381 #endif 00382 // cleanup 00383 SG_FREE(current_dist_to_lmks); 00384 SG_FREE(mean_sq_dist_vector); 00385 SG_FREE(to_process); 00386 00387 return SGMatrix<float64_t>(new_feature_matrix,m_target_dim,total_N); 00388 } 00389 00390 void* CMultidimensionalScaling::run_triangulation_thread(void* p) 00391 { 00392 TRIANGULATION_THREAD_PARAM* parameters = (TRIANGULATION_THREAD_PARAM*)p; 00393 int32_t idx_start = parameters->idx_start; 00394 int32_t idx_step = parameters->idx_step; 00395 int32_t idx_stop = parameters->idx_stop; 00396 const int32_t* lmk_idxs = parameters->lmk_idxs; 00397 const float64_t* distance_matrix = parameters->distance_matrix; 00398 const float64_t* mean_sq_dist_vector = parameters->mean_sq_dist_vector; 00399 float64_t* current_dist_to_lmks = parameters->current_dist_to_lmks; 00400 int32_t m_target_dim = parameters->m_target_dim; 00401 int32_t lmk_N = parameters->lmk_N; 00402 int32_t total_N = parameters->total_N; 00403 const bool* to_process = parameters->to_process; 00404 float64_t* lmk_feature_matrix = parameters->lmk_feature_matrix; 00405 float64_t* new_feature_matrix = parameters->new_feature_matrix; 00406 00407 int32_t i,k; 00408 for (i=idx_start; i<idx_stop; i+=idx_step) 00409 { 00410 // skip if landmark 00411 if (!to_process[i]) 00412 continue; 00413 00414 // compute difference from mean landmark distance vector 00415 for (k=0; k<lmk_N; k++) 00416 { 00417 current_dist_to_lmks[k] = 00418 CMath::sq(distance_matrix[i*total_N+lmk_idxs[k]]) - 00419 mean_sq_dist_vector[k]; 00420 } 00421 // compute embedding 00422 cblas_dgemv(CblasColMajor,CblasNoTrans, 00423 m_target_dim,lmk_N, 00424 -0.5,lmk_feature_matrix,m_target_dim, 00425 current_dist_to_lmks,1, 00426 0.0,(new_feature_matrix+i*m_target_dim),1); 00427 } 00428 return NULL; 00429 } 00430 00431 #endif /* HAVE_LAPACK */