SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2012 Fernando José Iglesias García 00008 * Copyright (C) 2012 Fernando José Iglesias García 00009 */ 00010 00011 #include <shogun/converter/StochasticProximityEmbedding.h> 00012 #include <shogun/lib/config.h> 00013 #ifdef HAVE_LAPACK 00014 #include <shogun/converter/EmbeddingConverter.h> 00015 #include <shogun/io/SGIO.h> 00016 #include <shogun/lib/CoverTree.h> 00017 #include <shogun/mathematics/Math.h> 00018 #include <shogun/distance/Distance.h> 00019 00020 using namespace shogun; 00021 00022 class SPE_COVERTREE_POINT 00023 { 00024 public: 00025 00026 SPE_COVERTREE_POINT(int32_t index, CDistance* specp_distance) 00027 { 00028 m_point_index = index; 00029 m_distance = specp_distance; 00030 } 00031 00032 inline double distance(const SPE_COVERTREE_POINT& p) const 00033 { 00034 return m_distance->distance(m_point_index, p.m_point_index); 00035 } 00036 00037 inline bool operator==(const SPE_COVERTREE_POINT& p) const 00038 { 00039 return (p.m_point_index == m_point_index); 00040 } 00041 00042 int32_t m_point_index; 00043 CDistance* m_distance; 00044 }; 00045 00046 CStochasticProximityEmbedding::CStochasticProximityEmbedding() : 00047 CEmbeddingConverter() 00048 { 00049 // Initialize to default values 00050 m_k = 12; 00051 m_nupdates = 100; 00052 m_strategy = SPE_GLOBAL; 00053 m_tolerance = 1e-5; 00054 00055 init(); 00056 } 00057 00058 void CStochasticProximityEmbedding::init() 00059 { 00060 SG_ADD(&m_k, "m_k", "Number of neighbors", MS_NOT_AVAILABLE); 00061 SG_ADD((machine_int_t*) &m_strategy, "m_strategy", "SPE strategy", 00062 MS_NOT_AVAILABLE); 00063 SG_ADD(&m_tolerance, "m_tolerance", "Regularization parameter", 00064 MS_NOT_AVAILABLE); 00065 } 00066 00067 CStochasticProximityEmbedding::~CStochasticProximityEmbedding() 00068 { 00069 } 00070 00071 void CStochasticProximityEmbedding::set_k(int32_t k) 00072 { 00073 if ( k <= 0 ) 00074 SG_ERROR("Number of neighbors k must be greater than 0"); 00075 00076 m_k = k; 00077 } 00078 00079 int32_t CStochasticProximityEmbedding::get_k() const 00080 { 00081 return m_k; 00082 } 00083 00084 void CStochasticProximityEmbedding::set_strategy(ESPEStrategy strategy) 00085 { 00086 m_strategy = strategy; 00087 } 00088 00089 ESPEStrategy CStochasticProximityEmbedding::get_strategy() const 00090 { 00091 return m_strategy; 00092 } 00093 00094 void CStochasticProximityEmbedding::set_tolerance(float32_t tolerance) 00095 { 00096 if ( tolerance <= 0 ) 00097 SG_ERROR("Tolerance regularization parameter must be greater " 00098 "than 0"); 00099 00100 m_tolerance = tolerance; 00101 } 00102 00103 int32_t CStochasticProximityEmbedding::get_tolerance() const 00104 { 00105 return m_tolerance; 00106 } 00107 00108 void CStochasticProximityEmbedding::set_nupdates(int32_t nupdates) 00109 { 00110 if ( nupdates <= 0 ) 00111 SG_ERROR("The number of updates must be greater than 0"); 00112 00113 m_nupdates = nupdates; 00114 } 00115 00116 int32_t CStochasticProximityEmbedding::get_nupdates() const 00117 { 00118 return m_nupdates; 00119 } 00120 00121 const char * CStochasticProximityEmbedding::get_name() const 00122 { 00123 return "StochasticProximityEmbedding"; 00124 } 00125 00126 CFeatures* CStochasticProximityEmbedding::apply(CFeatures* features) 00127 { 00128 if ( !features ) 00129 SG_ERROR("Features are required to apply SPE\n"); 00130 00131 // Shorthand for the DenseFeatures 00132 CDenseFeatures< float64_t >* simple_features = 00133 (CDenseFeatures< float64_t >*) features; 00134 SG_REF(features); 00135 00136 // Get and check the number of vectors 00137 int32_t N = simple_features->get_num_vectors(); 00138 if ( m_strategy == SPE_LOCAL && m_k >= N ) 00139 SG_ERROR("The number of neighbors (%d) must be less than " 00140 "the number of vectors (%d)\n", m_k, N); 00141 00142 if ( 2*m_nupdates > N ) 00143 SG_ERROR("The number of vectors (%d) must be at least two times " 00144 "the number of updates (%d)\n", N, m_nupdates); 00145 00146 m_distance->init(simple_features, simple_features); 00147 CDenseFeatures< float64_t >* embedding = embed_distance(m_distance); 00148 m_distance->remove_lhs_and_rhs(); 00149 00150 SG_UNREF(features); 00151 return (CFeatures*)embedding; 00152 } 00153 00154 SGMatrix<int32_t> CStochasticProximityEmbedding::get_neighborhood_matrix(CDistance* distance, int32_t k, int32_t N, float64_t max_dist) 00155 { 00156 int32_t i; 00157 int32_t* neighborhood_matrix = SG_MALLOC(int32_t, N*k); 00158 00159 CoverTree<SPE_COVERTREE_POINT>* coverTree = new CoverTree<SPE_COVERTREE_POINT>(max_dist); 00160 00161 for (i=0; i<N; i++) 00162 coverTree->insert(SPE_COVERTREE_POINT(i, distance)); 00163 00164 for (i=0; i<N; i++) 00165 { 00166 std::vector<SPE_COVERTREE_POINT> neighbors = 00167 coverTree->kNearestNeighbors(SPE_COVERTREE_POINT(i, distance), k+1); 00168 for (std::size_t m=1; m<unsigned(k+1); m++) 00169 neighborhood_matrix[i*k+m-1] = neighbors[m].m_point_index; 00170 } 00171 00172 delete coverTree; 00173 00174 return SGMatrix<int32_t>(neighborhood_matrix,k,N); 00175 } 00176 00177 CDenseFeatures< float64_t >* CStochasticProximityEmbedding::embed_distance(CDistance* distance) 00178 { 00179 if ( !distance ) 00180 SG_ERROR("Embed distance received no instance of CDistance\n"); 00181 00182 // Compute distance matrix 00183 SG_DEBUG("Computing distance matrix\n"); 00184 00185 if ( distance->get_distance_type() != D_EUCLIDEAN ) 00186 SG_ERROR("SPE only supports Euclidean distance, %s given\n", 00187 distance->get_name()); 00188 00189 // Get the number of features, assume that distance(features, features) 00190 int32_t N = distance->get_num_vec_rhs(); 00191 00192 // Look for the maximum distance (make the same assumption) 00193 int32_t i, j, k; 00194 float64_t max = 0.0, tmp = 0.0; 00195 for ( i = 0 ; i < N ; ++i ) 00196 for ( j = i+1 ; j < N ; ++j ) 00197 if ( ( tmp = distance->distance(i, j) ) > max ) 00198 max = tmp; 00199 00200 // Compute a normalizer to be used for the distances if global strategy selected 00201 float64_t alpha = 0.0; 00202 if ( m_strategy == SPE_GLOBAL ) 00203 alpha = 1.0 / max * CMath::sqrt(2.0); 00204 00205 // Compute neighborhood matrix if local strategy used 00206 SGMatrix< int32_t > neighbors_mat; 00207 if ( m_strategy == SPE_LOCAL ) 00208 { 00209 SG_DEBUG("Computing neighborhood matrix\n"); 00210 neighbors_mat = get_neighborhood_matrix(distance, m_k, N, max); 00211 } 00212 00213 // Initialize vectors in the embedded space randomly, Y is the short for 00214 // new_feature_matrix 00215 SGMatrix< float64_t > Y(m_target_dim, N); 00216 SGVector<float64_t>::random_vector(Y.matrix, m_target_dim*N, 0.0, 1.0); 00217 00218 // SPE's loop 00219 00220 // Initialize the maximum number of iterations 00221 int32_t max_iter = 2000 + CMath::round(0.04 * N*N); 00222 if ( m_strategy == SPE_LOCAL ) 00223 max_iter *= 3; 00224 00225 // Initialize the learning parameter 00226 float32_t lambda = 1.0; 00227 00228 // Initialize variables to use in the main loop 00229 float64_t sum = 0.0; 00230 index_t idx1 = 0, idx2 = 0, idx = 0; 00231 00232 SGVector< float64_t > scale(m_nupdates); 00233 SGVector< float64_t > D(m_nupdates); 00234 SGMatrix< float64_t > Yd(m_nupdates, m_target_dim); 00235 SGVector< float64_t > Rt(m_nupdates); 00236 int32_t* ind2 = NULL; 00237 00238 // Variables required just if local strategy used 00239 SGMatrix< int32_t > ind1Neighbors; 00240 SGVector< int32_t > J2; 00241 if ( m_strategy == SPE_LOCAL ) 00242 { 00243 ind2 = SG_MALLOC(int32_t, m_nupdates); 00244 00245 ind1Neighbors = SGMatrix<int32_t>(m_k,m_nupdates); 00246 00247 J2 = SGVector<int32_t>(m_nupdates); 00248 } 00249 00250 for ( i = 0 ; i < max_iter ; ++i ) 00251 { 00252 if ( !(i % 1000) ) 00253 SG_DEBUG("SPE's loop, iteration %d of %d\n", i, max_iter); 00254 00255 // Select the vectors to be updated in this iteration 00256 int32_t* J = CMath::randperm(N); 00257 00258 // Pointer to the first set of vector indices to update 00259 int32_t* ind1 = J; 00260 00261 // Pointer ind2 to the second set of vector indices to update 00262 if ( m_strategy == SPE_GLOBAL ) 00263 ind2 = J + m_nupdates; 00264 else 00265 { 00266 // Select the second set of indices to update among neighbors 00267 // of the first set 00268 00269 // Get the neighbors of interest 00270 for ( j = 0 ; j < m_nupdates ; ++j ) 00271 { 00272 for ( k = 0 ; k < m_k ; ++k ) 00273 ind1Neighbors[k + j*m_k] = 00274 neighbors_mat.matrix[k + ind1[j]*m_k]; 00275 } 00276 00277 // Generate pseudo-random indices 00278 for ( j = 0 ; j < m_nupdates ; ++j ) 00279 { 00280 J2[j] = CMath::round( CMath::random(0.0, 1.0)*(m_k-1) ) 00281 + m_k*j; 00282 } 00283 00284 // Select final indices 00285 for ( j = 0 ; j < m_nupdates ; ++j ) 00286 ind2[j] = ind1Neighbors.matrix[ J2[j] ]; 00287 } 00288 00289 // Compute distances betweeen the selected points in embedded space 00290 00291 for ( j = 0 ; j < m_nupdates ; ++j ) 00292 { 00293 sum = 0.0; 00294 00295 for ( k = 0 ; k < m_target_dim ; ++k ) 00296 { 00297 idx1 = k + ind1[j]*m_target_dim; 00298 idx2 = k + ind2[j]*m_target_dim; 00299 sum += CMath::sq(Y.matrix[idx1] - Y.matrix[idx2]); 00300 } 00301 00302 D[j] = CMath::sqrt(sum); 00303 } 00304 00305 // Get the corresponding distances in the original space 00306 00307 if ( m_strategy == SPE_LOCAL ) 00308 Rt.set_const(1); 00309 else // SPE_GLOBAL strategy used 00310 Rt.set_const(alpha); 00311 00312 for ( j = 0 ; j < m_nupdates ; ++j ) 00313 Rt[j] *= distance->distance( ind1[j], ind2[j] ); 00314 00315 // Compute some terms for update 00316 00317 // Scale factor: (Rt - D) ./ (D + m_tolerance) 00318 for ( j = 0 ; j < m_nupdates ; ++j ) 00319 scale[j] = ( Rt[j] - D[j] ) / ( D[j] + m_tolerance ); 00320 00321 // Difference matrix: Y(ind1) - Y(ind2) 00322 for ( j = 0 ; j < m_nupdates ; ++j ) 00323 for ( k = 0 ; k < m_target_dim ; ++k ) 00324 { 00325 idx1 = k + ind1[j]*m_target_dim; 00326 idx2 = k + ind2[j]*m_target_dim; 00327 idx = k + j *m_target_dim; 00328 00329 Yd[idx] = Y[idx1] - Y[idx2]; 00330 } 00331 00332 // Update the location of the vectors in the embedded space 00333 for ( j = 0 ; j < m_nupdates ; ++j ) 00334 for ( k = 0 ; k < m_target_dim ; ++k ) 00335 { 00336 idx1 = k + ind1[j]*m_target_dim; 00337 idx2 = k + ind2[j]*m_target_dim; 00338 idx = k + j *m_target_dim; 00339 00340 Y[idx1] += lambda / 2 * scale[j] * Yd[idx]; 00341 Y[idx2] -= lambda / 2 * scale[j] * Yd[idx]; 00342 } 00343 00344 // Update the learning parameter 00345 lambda = lambda - ( lambda / max_iter ); 00346 00347 // Free memory 00348 SG_FREE(J); 00349 } 00350 00351 // Free memory 00352 if ( m_strategy == SPE_LOCAL ) 00353 SG_FREE(ind2); 00354 00355 return new CDenseFeatures< float64_t >(Y); 00356 } 00357 00358 #endif /* HAVE_LAPACK */