/* SHOGUN v2.0.0 */
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2008 Gunnar Raetsch 00008 * Written (W) 1999-2008,2011 Soeren Sonnenburg 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 * Copyright (C) 2011 Berlin Institute of Technology 00011 */ 00012 #include <shogun/preprocessor/PCA.h> 00013 #ifdef HAVE_LAPACK 00014 #include <shogun/mathematics/lapack.h> 00015 #include <shogun/lib/config.h> 00016 #include <shogun/mathematics/Math.h> 00017 #include <string.h> 00018 #include <stdlib.h> 00019 #include <shogun/lib/common.h> 00020 #include <shogun/preprocessor/DensePreprocessor.h> 00021 #include <shogun/features/Features.h> 00022 #include <shogun/io/SGIO.h> 00023 00024 using namespace shogun; 00025 00026 CPCA::CPCA(bool do_whitening_, EPCAMode mode_, float64_t thresh_) 00027 : CDimensionReductionPreprocessor(), num_dim(0), m_initialized(false), 00028 m_whitening(do_whitening_), m_mode(mode_), thresh(thresh_) 00029 { 00030 init(); 00031 } 00032 00033 void CPCA::init() 00034 { 00035 m_transformation_matrix = SGMatrix<float64_t>(); 00036 m_mean_vector = SGVector<float64_t>(); 00037 m_eigenvalues_vector = SGVector<float64_t>(); 00038 00039 SG_ADD(&m_transformation_matrix, "transformation matrix", 00040 "Transformation matrix (Eigenvectors of covariance matrix).", 00041 MS_NOT_AVAILABLE); 00042 SG_ADD(&m_mean_vector, "mean vector", "Mean Vector.", MS_NOT_AVAILABLE); 00043 SG_ADD(&m_eigenvalues_vector, "eigenvalues vector", 00044 "Vector with Eigenvalues.", MS_NOT_AVAILABLE); 00045 SG_ADD(&m_initialized, "initalized", "True when initialized.", 00046 MS_NOT_AVAILABLE); 00047 SG_ADD(&m_whitening, "whitening", "Whether data shall be whitened.", 00048 MS_AVAILABLE); 00049 
SG_ADD((machine_int_t*) &m_mode, "mode", "PCA Mode.", MS_AVAILABLE); 00050 SG_ADD(&thresh, "thresh", "Cutoff threshold.", MS_AVAILABLE); 00051 } 00052 00053 CPCA::~CPCA() 00054 { 00055 } 00056 00057 bool CPCA::init(CFeatures* features) 00058 { 00059 if (!m_initialized) 00060 { 00061 // loop varibles 00062 int32_t i,j,k; 00063 00064 ASSERT(features->get_feature_class()==C_DENSE); 00065 ASSERT(features->get_feature_type()==F_DREAL); 00066 00067 int32_t num_vectors=((CDenseFeatures<float64_t>*)features)->get_num_vectors(); 00068 int32_t num_features=((CDenseFeatures<float64_t>*)features)->get_num_features(); 00069 SG_INFO("num_examples: %ld num_features: %ld \n", num_vectors, num_features); 00070 00071 m_mean_vector.vlen = num_features; 00072 m_mean_vector.vector = SG_CALLOC(float64_t, num_features); 00073 00074 // sum 00075 SGMatrix<float64_t> feature_matrix = ((CDenseFeatures<float64_t>*)features)->get_feature_matrix(); 00076 for (i=0; i<num_vectors; i++) 00077 { 00078 for (j=0; j<num_features; j++) 00079 m_mean_vector.vector[j] += feature_matrix.matrix[i*num_features+j]; 00080 } 00081 00082 //divide 00083 for (i=0; i<num_features; i++) 00084 m_mean_vector.vector[i] /= num_vectors; 00085 00086 float64_t* cov = SG_CALLOC(float64_t, num_features*num_features); 00087 00088 float64_t* sub_mean = SG_MALLOC(float64_t, num_features); 00089 00090 for (i=0; i<num_vectors; i++) 00091 { 00092 for (k=0; k<num_features; k++) 00093 sub_mean[k]=feature_matrix.matrix[i*num_features+k]-m_mean_vector.vector[k]; 00094 00095 cblas_dger(CblasColMajor, 00096 num_features,num_features, 00097 1.0,sub_mean,1, 00098 sub_mean,1, 00099 cov, num_features); 00100 } 00101 00102 SG_FREE(sub_mean); 00103 00104 for (i=0; i<num_features; i++) 00105 { 00106 for (j=0; j<num_features; j++) 00107 cov[i*num_features+j]/=(num_vectors-1); 00108 } 00109 00110 SG_INFO("Computing Eigenvalues ... 
") ; 00111 00112 m_eigenvalues_vector.vector = SGMatrix<float64_t>::compute_eigenvectors(cov,num_features,num_features); 00113 m_eigenvalues_vector.vlen = num_features; 00114 num_dim=0; 00115 00116 if (m_mode == FIXED_NUMBER) 00117 { 00118 ASSERT(m_target_dim <= num_features); 00119 num_dim = m_target_dim; 00120 } 00121 if (m_mode == VARIANCE_EXPLAINED) 00122 { 00123 float64_t eig_sum = 0; 00124 for (i=0; i<num_features; i++) 00125 eig_sum += m_eigenvalues_vector.vector[i]; 00126 00127 float64_t com_sum = 0; 00128 for (i=num_features-1; i>-1; i--) 00129 { 00130 num_dim++; 00131 com_sum += m_eigenvalues_vector.vector[i]; 00132 if (com_sum/eig_sum>=thresh) 00133 break; 00134 } 00135 } 00136 if (m_mode == THRESHOLD) 00137 { 00138 for (i=num_features-1; i>-1; i--) 00139 { 00140 if (m_eigenvalues_vector.vector[i]>thresh) 00141 num_dim++; 00142 else 00143 break; 00144 } 00145 } 00146 00147 SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim) ; 00148 00149 m_transformation_matrix = SGMatrix<float64_t>(num_features,num_dim); 00150 num_old_dim = num_features; 00151 00152 int32_t offs=0; 00153 for (i=num_features-num_dim; i<num_features; i++) 00154 { 00155 for (k=0; k<num_features; k++) 00156 if (m_whitening) 00157 m_transformation_matrix.matrix[offs+k*num_dim] = 00158 cov[num_features*i+k]/sqrt(m_eigenvalues_vector.vector[i]); 00159 else 00160 m_transformation_matrix.matrix[offs+k*num_dim] = 00161 cov[num_features*i+k]; 00162 offs++; 00163 } 00164 00165 SG_FREE(cov); 00166 m_initialized = true; 00167 return true; 00168 } 00169 00170 return false; 00171 } 00172 00173 void CPCA::cleanup() 00174 { 00175 m_transformation_matrix=SGMatrix<float64_t>(); 00176 } 00177 00178 SGMatrix<float64_t> CPCA::apply_to_feature_matrix(CFeatures* features) 00179 { 00180 ASSERT(m_initialized); 00181 SGMatrix<float64_t> m = ((CDenseFeatures<float64_t>*) features)->get_feature_matrix(); 00182 int32_t num_vectors = m.num_cols; 00183 int32_t num_features = m.num_rows; 00184 
SG_INFO("get Feature matrix: %ix%i\n", num_vectors, num_features); 00185 00186 if (m.matrix) 00187 { 00188 SG_INFO("Preprocessing feature matrix\n"); 00189 float64_t* res = SG_MALLOC(float64_t, num_dim); 00190 float64_t* sub_mean = SG_MALLOC(float64_t, num_features); 00191 00192 for (int32_t vec=0; vec<num_vectors; vec++) 00193 { 00194 int32_t i; 00195 00196 for (i=0; i<num_features; i++) 00197 sub_mean[i] = m.matrix[num_features*vec+i] - m_mean_vector.vector[i]; 00198 00199 cblas_dgemv(CblasColMajor,CblasNoTrans, 00200 num_dim,num_features, 00201 1.0,m_transformation_matrix.matrix,num_dim, 00202 sub_mean,1, 00203 0.0,res,1); 00204 00205 float64_t* m_transformed = &m.matrix[num_dim*vec]; 00206 00207 for (i=0; i<num_dim; i++) 00208 m_transformed[i] = res[i]; 00209 } 00210 SG_FREE(res); 00211 SG_FREE(sub_mean); 00212 00213 ((CDenseFeatures<float64_t>*) features)->set_num_features(num_dim); 00214 ((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors); 00215 SG_INFO("new Feature matrix: %ix%i\n", num_vectors, num_features); 00216 } 00217 00218 return m; 00219 } 00220 00221 SGVector<float64_t> CPCA::apply_to_feature_vector(SGVector<float64_t> vector) 00222 { 00223 float64_t* result = SG_MALLOC(float64_t, num_dim); 00224 float64_t* sub_mean = SG_MALLOC(float64_t, vector.vlen); 00225 00226 for (int32_t i=0; i<vector.vlen; i++) 00227 sub_mean[i]=vector.vector[i]-m_mean_vector.vector[i]; 00228 00229 cblas_dgemv(CblasColMajor,CblasNoTrans, 00230 num_dim,vector.vlen, 00231 1.0,m_transformation_matrix.matrix,m_transformation_matrix.num_cols, 00232 sub_mean,1, 00233 0.0,result,1); 00234 00235 SG_FREE(sub_mean); 00236 return SGVector<float64_t>(result,num_dim); 00237 } 00238 00239 SGMatrix<float64_t> CPCA::get_transformation_matrix() 00240 { 00241 return m_transformation_matrix; 00242 } 00243 00244 SGVector<float64_t> CPCA::get_eigenvalues() 00245 { 00246 return m_eigenvalues_vector; 00247 } 00248 00249 SGVector<float64_t> CPCA::get_mean() 00250 { 
00251 return m_mean_vector; 00252 } 00253 00254 #endif /* HAVE_LAPACK */