SHOGUN
v2.0.0
|
00001 /* 00002 * this program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2006-2009 Christian Gehl 00008 * Written (W) 2006-2009 Soeren Sonnenburg 00009 * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <shogun/lib/config.h> 00013 #include <shogun/lib/common.h> 00014 #include <shogun/io/SGIO.h> 00015 #include <shogun/io/File.h> 00016 #include <shogun/lib/Time.h> 00017 #include <shogun/base/Parallel.h> 00018 #include <shogun/base/Parameter.h> 00019 00020 #include <shogun/distance/Distance.h> 00021 #include <shogun/features/Features.h> 00022 00023 #include <string.h> 00024 #include <unistd.h> 00025 00026 #ifdef HAVE_PTHREAD 00027 #include <pthread.h> 00028 #endif 00029 00030 using namespace shogun; 00031 00032 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00033 struct DISTANCE_THREAD_PARAM 00034 { 00035 // CDistance instance used by thread to compute distance 00036 CDistance* distance; 00037 // distance matrix to store computed distances 00038 float64_t* distance_matrix; 00039 // starting index of the main loop 00040 int32_t idx_start; 00041 // end index of the main loop 00042 int32_t idx_stop; 00043 // step of the main loop 00044 int32_t idx_step; 00045 // number of lhs vectors 00046 int32_t lhs_vectors_number; 00047 // number of rhs vectors 00048 int32_t rhs_vectors_number; 00049 // whether matrix distance is symmetric 00050 bool symmetric; 00051 // chunking method 00052 bool chunk_by_lhs; 00053 }; 00054 #endif /* DOXYGEN_SHOULD_SKIP_THIS */ 00055 00056 CDistance::CDistance() : CSGObject() 00057 { 00058 init(); 00059 } 00060 00061 00062 CDistance::CDistance(CFeatures* p_lhs, CFeatures* p_rhs) : CSGObject() 00063 { 00064 init(); 00065 init(p_lhs, p_rhs); 00066 } 00067 00068 CDistance::~CDistance() 00069 { 00070 SG_FREE(precomputed_matrix); 00071 precomputed_matrix=NULL; 00072 00073 remove_lhs_and_rhs(); 00074 } 00075 00076 bool CDistance::init(CFeatures* l, CFeatures* r) 00077 { 00078 //make sure features were indeed supplied 00079 ASSERT(l); 00080 ASSERT(r); 00081 00082 //make sure features are compatible 00083 ASSERT(l->get_feature_class()==r->get_feature_class()); 00084 ASSERT(l->get_feature_type()==r->get_feature_type()); 00085 00086 //remove references to previous features 00087 remove_lhs_and_rhs(); 00088 00089 //increase reference counts 00090 SG_REF(l); 00091 SG_REF(r); 00092 00093 lhs=l; 00094 rhs=r; 00095 00096 num_lhs=l->get_num_vectors(); 00097 num_rhs=r->get_num_vectors(); 00098 00099 SG_FREE(precomputed_matrix); 00100 precomputed_matrix=NULL ; 00101 00102 return true; 00103 } 00104 00105 void CDistance::load(CFile* loader) 00106 { 00107 SG_SET_LOCALE_C; 00108 SG_RESET_LOCALE; 00109 } 00110 00111 void CDistance::save(CFile* writer) 00112 { 00113 SG_SET_LOCALE_C; 00114 SG_RESET_LOCALE; 00115 } 00116 00117 void CDistance::remove_lhs_and_rhs() 00118 { 00119 SG_UNREF(rhs); 00120 rhs = NULL; 00121 num_rhs=0; 00122 00123 SG_UNREF(lhs); 00124 lhs = NULL; 00125 num_lhs=0; 00126 } 00127 00128 void CDistance::remove_lhs() 00129 { 00130 SG_UNREF(lhs); 00131 lhs = NULL; 00132 num_lhs=0; 00133 } 00134 00136 void CDistance::remove_rhs() 00137 { 00138 SG_UNREF(rhs); 00139 rhs = NULL; 00140 num_rhs=0; 00141 } 00142 00143 CFeatures* CDistance::replace_rhs(CFeatures* r) 00144 { 00145 //make sure features were indeed supplied 00146 ASSERT(r); 00147 00148 //make sure features are compatible 00149 ASSERT(lhs->get_feature_class()==r->get_feature_class()); 00150 ASSERT(lhs->get_feature_type()==r->get_feature_type()); 00151 00152 //remove references to previous rhs features 00153 CFeatures* tmp=rhs; 00154 00155 rhs=r; 00156 num_rhs=r->get_num_vectors(); 00157 00158 SG_FREE(precomputed_matrix); 00159 precomputed_matrix=NULL ; 00160 00161 // return old features including reference count 00162 return tmp; 00163 } 00164 00165 CFeatures* CDistance::replace_lhs(CFeatures* l) 00166 { 00167 //make sure features were indeed supplied 00168 ASSERT(l); 00169 00170 //make sure features are compatible 00171 ASSERT(rhs->get_feature_class()==l->get_feature_class()); 00172 ASSERT(rhs->get_feature_type()==l->get_feature_type()); 00173 00174 //remove references to previous rhs features 00175 CFeatures* tmp=lhs; 00176 00177 lhs=l; 00178 num_lhs=l->get_num_vectors(); 00179 00180 SG_FREE(precomputed_matrix); 00181 precomputed_matrix=NULL ; 00182 00183 // return old features including reference count 00184 return tmp; 00185 } 00186 00187 void CDistance::do_precompute_matrix() 00188 { 00189 int32_t num_left=lhs->get_num_vectors(); 00190 int32_t num_right=rhs->get_num_vectors(); 00191 SG_INFO( "precomputing distance matrix (%ix%i)\n", num_left, num_right) ; 00192 00193 ASSERT(num_left==num_right); 00194 ASSERT(lhs==rhs); 00195 int32_t num=num_left; 00196 00197 SG_FREE(precomputed_matrix); 00198 precomputed_matrix=SG_MALLOC(float32_t, num*(num+1)/2); 00199 00200 for (int32_t i=0; i<num; i++) 00201 { 00202 SG_PROGRESS(i*i,0,num*num); 00203 for (int32_t j=0; j<=i; j++) 00204 precomputed_matrix[i*(i+1)/2+j] = compute(i,j) ; 00205 } 00206 00207 SG_PROGRESS(num*num,0,num*num); 00208 SG_DONE(); 00209 } 00210 00211 SGMatrix<float64_t> CDistance::get_distance_matrix() 00212 { 00213 int32_t m,n; 00214 float64_t* data=get_distance_matrix_real(m,n,NULL); 00215 return SGMatrix<float64_t>(data, m,n); 00216 } 00217 00218 float32_t* CDistance::get_distance_matrix_shortreal( 00219 int32_t &num_vec1, int32_t &num_vec2, float32_t* target) 00220 { 00221 float32_t* result = NULL; 00222 CFeatures* f1 = lhs; 00223 CFeatures* f2 = rhs; 00224 00225 if (has_features()) 00226 { 00227 if (target && (num_vec1!=get_num_vec_lhs() || num_vec2!=get_num_vec_rhs())) 00228 SG_ERROR("distance matrix does not fit into target\n"); 00229 00230 num_vec1=get_num_vec_lhs(); 00231 num_vec2=get_num_vec_rhs(); 00232 int64_t total_num=num_vec1*num_vec2; 00233 int32_t num_done=0; 00234 00235 SG_DEBUG("returning distance matrix of size %dx%d\n", num_vec1, num_vec2); 00236 00237 if (target) 00238 result=target; 00239 else 00240 result=SG_MALLOC(float32_t, total_num); 00241 00242 if ( (f1 == f2) && (num_vec1 == num_vec2) && (f1!=NULL && f2!=NULL) ) 00243 { 00244 for (int32_t i=0; i<num_vec1; i++) 00245 { 00246 for (int32_t j=i; j<num_vec1; j++) 00247 { 00248 float64_t v=distance(i,j); 00249 00250 result[i+j*num_vec1]=v; 00251 result[j+i*num_vec1]=v; 00252 00253 if (num_done%100000) 00254 SG_PROGRESS(num_done, 0, total_num-1); 00255 00256 if (i!=j) 00257 num_done+=2; 00258 else 00259 num_done+=1; 00260 } 00261 } 00262 } 00263 else 00264 { 00265 for (int32_t i=0; i<num_vec1; i++) 00266 { 00267 for (int32_t j=0; j<num_vec2; j++) 00268 { 00269 result[i+j*num_vec1]=distance(i,j) ; 00270 00271 if (num_done%100000) 00272 SG_PROGRESS(num_done, 0, total_num-1); 00273 00274 num_done++; 00275 } 00276 } 00277 } 00278 00279 SG_DONE(); 00280 } 00281 else 00282 SG_ERROR("no features assigned to distance\n"); 00283 00284 return result; 00285 } 00286 00287 float64_t* CDistance::get_distance_matrix_real( 00288 int32_t &lhs_vectors_number, int32_t &rhs_vectors_number, float64_t* target) 00289 { 00290 float64_t* distance_matrix = NULL; 00291 CFeatures* lhs_features = lhs; 00292 CFeatures* rhs_features = rhs; 00293 00294 // check for errors 00295 if (!has_features()) 00296 SG_ERROR("No features assigned to the distance.\n"); 00297 00298 if (target && 00299 (lhs_vectors_number!=get_num_vec_lhs() || 00300 rhs_vectors_number!=get_num_vec_rhs())) 00301 SG_ERROR("Distance matrix does not fit into the given target.\n"); 00302 00303 // init numbers of vectors and total number of distances 00304 lhs_vectors_number = get_num_vec_lhs(); 00305 rhs_vectors_number = get_num_vec_rhs(); 00306 int64_t total_distances_number = lhs_vectors_number*rhs_vectors_number; 00307 00308 SG_DEBUG("Calculating distance matrix of size %dx%d.\n", lhs_vectors_number, rhs_vectors_number); 00309 00310 // redirect to target or allocate memory 00311 if (target) 00312 distance_matrix = target; 00313 else 00314 distance_matrix = SG_MALLOC(float64_t, total_distances_number); 00315 00316 // check if we're computing symmetric distance_matrix 00317 bool symmetric = (lhs_features==rhs_features) || (lhs_vectors_number==rhs_vectors_number); 00318 // select chunking method according to greatest dimension 00319 bool chunk_by_lhs = (lhs_vectors_number >= rhs_vectors_number); 00320 00321 #ifdef HAVE_PTHREAD 00322 // init parallel to work 00323 int32_t num_threads = parallel->get_num_threads(); 00324 ASSERT(num_threads>0); 00325 pthread_t* threads = SG_MALLOC(pthread_t, num_threads); 00326 DISTANCE_THREAD_PARAM* parameters = SG_MALLOC(DISTANCE_THREAD_PARAM,num_threads); 00327 pthread_attr_t attr; 00328 pthread_attr_init(&attr); 00329 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); 00330 // run threads 00331 for (int32_t t=0; t<num_threads; t++) 00332 { 00333 parameters[t].idx_start = t; 00334 parameters[t].idx_stop = chunk_by_lhs ? lhs_vectors_number : rhs_vectors_number; 00335 parameters[t].idx_step = num_threads; 00336 parameters[t].distance_matrix = distance_matrix; 00337 parameters[t].symmetric = symmetric; 00338 parameters[t].lhs_vectors_number = lhs_vectors_number; 00339 parameters[t].rhs_vectors_number = rhs_vectors_number; 00340 parameters[t].chunk_by_lhs = chunk_by_lhs; 00341 parameters[t].distance = this; 00342 pthread_create(&threads[t], &attr, run_distance_thread, (void*)¶meters[t]); 00343 } 00344 // join, i.e. wait threads for finish 00345 for (int32_t t=0; t<num_threads; t++) 00346 { 00347 pthread_join(threads[t], NULL); 00348 } 00349 // cleanup 00350 pthread_attr_destroy(&attr); 00351 SG_FREE(parameters); 00352 SG_FREE(threads); 00353 #else 00354 // init one-threaded parameters 00355 DISTANCE_THREAD_PARAM single_thread_param; 00356 single_thread_param.idx_start = 0; 00357 single_thread_param.idx_stop = chunk_by_lhs ? lhs_vectors_number : rhs_vectors_number; 00358 single_thread_param.idx_step = 1; 00359 single_thread_param.distance_matrix = distance_matrix; 00360 single_thread_param.symmetric = symmetric; 00361 single_thread_param.lhs_vectors_number = lhs_vectors_number; 00362 single_thread_param.rhs_vectors_number = rhs_vectors_number; 00363 single_thread_param.chunk_by_lhs = chunk_by_lhs; 00364 single_thread_param.distance = this; 00365 // run thread 00366 run_distance_thread((void*)&single_thread_param); 00367 #endif 00368 00369 return distance_matrix; 00370 } 00371 00372 void CDistance::init() 00373 { 00374 precomputed_matrix = NULL; 00375 precompute_matrix = false; 00376 lhs = NULL; 00377 rhs = NULL; 00378 num_lhs=0; 00379 num_rhs=0; 00380 00381 m_parameters->add((CSGObject**) &lhs, "lhs", 00382 "Feature vectors to occur on left hand side."); 00383 m_parameters->add((CSGObject**) &rhs, "rhs", 00384 "Feature vectors to occur on right hand side."); 00385 } 00386 00387 void* CDistance::run_distance_thread(void* p) 00388 { 00389 DISTANCE_THREAD_PARAM* parameters = (DISTANCE_THREAD_PARAM*)p; 00390 float64_t* distance_matrix = parameters->distance_matrix; 00391 CDistance* distance = parameters->distance; 00392 int32_t idx_start = parameters->idx_start; 00393 int32_t idx_stop = parameters->idx_stop; 00394 int32_t idx_step = parameters->idx_step; 00395 int32_t lhs_vectors_number = parameters->lhs_vectors_number; 00396 int32_t rhs_vectors_number = parameters->rhs_vectors_number; 00397 bool symmetric = parameters->symmetric; 00398 bool chunk_by_lhs = parameters->chunk_by_lhs; 00399 00400 if (symmetric) 00401 { 00402 for (int32_t i=idx_start; i<idx_stop; i+=idx_step) 00403 { 00404 for (int32_t j=i; j<rhs_vectors_number; j++) 00405 { 00406 float64_t ij_distance = distance->compute(i,j); 00407 distance_matrix[i*rhs_vectors_number+j] = ij_distance; 00408 distance_matrix[j*rhs_vectors_number+i] = ij_distance; 00409 } 00410 } 00411 } 00412 else 00413 { 00414 if (chunk_by_lhs) 00415 { 00416 for (int32_t i=idx_start; i<idx_stop; i+=idx_step) 00417 { 00418 for (int32_t j=0; j<rhs_vectors_number; j++) 00419 { 00420 distance_matrix[j*lhs_vectors_number+i] = distance->compute(i,j); 00421 } 00422 } 00423 } 00424 else 00425 { 00426 for (int32_t j=idx_start; j<idx_stop; j+=idx_step) 00427 { 00428 for (int32_t i=0; i<lhs_vectors_number; i++) 00429 { 00430 distance_matrix[j*lhs_vectors_number+i] = distance->compute(i,j); 00431 } 00432 } 00433 } 00434 } 00435 00436 return NULL; 00437 }