SHOGUN v2.0.0
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 1999-2008 Gunnar Raetsch
 * Written (W) 1999-2009 Soeren Sonnenburg
 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
 */

#include <shogun/preprocessor/PruneVarSubMean.h>
#include <shogun/preprocessor/DensePreprocessor.h>
#include <shogun/features/Features.h>
#include <shogun/io/SGIO.h>
#include <shogun/mathematics/Math.h>

using namespace shogun;

CPruneVarSubMean::CPruneVarSubMean(bool divide)
: CDensePreprocessor<float64_t>(), idx(NULL), mean(NULL),
    std(NULL), num_idx(0), divide_by_std(divide), initialized(false)
{
}

CPruneVarSubMean::~CPruneVarSubMean()
{
    cleanup();
}

/** Learn the preprocessor from dense real-valued features: compute the
 * per-feature mean and variance, keep only features whose variance is
 * non-negligible, and store the mean and standard deviation of the kept
 * dimensions. */
bool CPruneVarSubMean::init(CFeatures* features)
{
    if (initialized)
        return false;

    ASSERT(features->get_feature_class()==C_DENSE);
    ASSERT(features->get_feature_type()==F_DREAL);

    CDenseFeatures<float64_t>* simple_features=(CDenseFeatures<float64_t>*) features;
    int32_t num_examples=simple_features->get_num_vectors();
    int32_t num_features=simple_features->get_num_features();

    SG_FREE(mean);
    SG_FREE(idx);
    SG_FREE(std);
    mean=NULL;
    idx=NULL;
    std=NULL;

    mean=SG_MALLOC(float64_t, num_features);
    float64_t* var=SG_MALLOC(float64_t, num_features);
    int32_t i, j;

    for (i=0; i<num_features; i++)
    {
        mean[i]=0;
        var[i]=0;
    }

    SGMatrix<float64_t> feature_matrix=simple_features->get_feature_matrix();

    // compute per-feature mean (matrix is stored one example per column)
    for (i=0; i<num_examples; i++)
    {
        for (j=0; j<num_features; j++)
            mean[j]+=feature_matrix.matrix[i*num_features+j];
    }

    for (j=0; j<num_features; j++)
        mean[j]/=num_examples;

    // accumulate squared deviations from the mean
    for (i=0; i<num_examples; i++)
    {
        for (j=0; j<num_features; j++)
            var[j]+=CMath::sq(mean[j]-feature_matrix.matrix[i*num_features+j]);
    }

    // keep only features with non-negligible variance
    int32_t num_ok=0;
    int32_t* idx_ok=SG_MALLOC(int32_t, num_features);

    for (j=0; j<num_features; j++)
    {
        var[j]/=num_examples;

        if (var[j]>=1e-14)
        {
            idx_ok[num_ok]=j;
            num_ok++;
        }
    }

    SG_INFO("Reducing number of features from %i to %i\n", num_features, num_ok);

    idx=SG_MALLOC(int32_t, num_ok);
    float64_t* new_mean=SG_MALLOC(float64_t, num_ok);
    std=SG_MALLOC(float64_t, num_ok);

    for (j=0; j<num_ok; j++)
    {
        idx[j]=idx_ok[j];
        new_mean[j]=mean[idx_ok[j]];
        std[j]=sqrt(var[idx_ok[j]]);
    }
    num_idx=num_ok;
    SG_FREE(idx_ok);
    SG_FREE(mean);
    SG_FREE(var);
    mean=new_mean;

    initialized=true;
    return true;
}

/** Free the learned index, mean, and std buffers. */
void CPruneVarSubMean::cleanup()
{
    SG_FREE(idx);
    idx=NULL;
    SG_FREE(mean);
    mean=NULL;
    SG_FREE(std);
    std=NULL;
}

/** Apply the learned transform in place to a dense feature matrix: for each
 * example, keep only the selected features, subtract the stored mean and
 * (optionally) divide by the stored standard deviation. The in-place
 * compaction is safe because the destination stride (num_idx) never exceeds
 * the source stride (num_features). Returns the pruned feature matrix. */
SGMatrix<float64_t> CPruneVarSubMean::apply_to_feature_matrix(CFeatures* features)
{
    ASSERT(initialized);

    int32_t num_vectors=0;
    int32_t num_features=0;
    float64_t* m=((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);

    SG_INFO("Got feature matrix: %ix%i\n", num_vectors, num_features);
    SG_INFO("Preprocessing feature matrix\n");
    for (int32_t vec=0; vec<num_vectors; vec++)
    {
        float64_t* v_src=&m[num_features*vec];
        float64_t* v_dst=&m[num_idx*vec];

        if (divide_by_std)
        {
            for (int32_t feat=0; feat<num_idx; feat++)
                v_dst[feat]=(v_src[idx[feat]]-mean[feat])/std[feat];
        }
        else
        {
            for (int32_t feat=0; feat<num_idx; feat++)
                v_dst[feat]=v_src[idx[feat]]-mean[feat];
        }
    }

    // shrink the feature dimension to the number of kept features
    ((CDenseFeatures<float64_t>*) features)->set_num_features(num_idx);
    ((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
    SG_INFO("New feature matrix: %ix%i\n", num_vectors, num_features);

    return ((CDenseFeatures<float64_t>*) features)->get_feature_matrix();
}

/** Apply the learned transform to a single feature vector. If the
 * preprocessor has not been initialized, an unmodified copy of the input
 * vector is returned. */
SGVector<float64_t> CPruneVarSubMean::apply_to_feature_vector(SGVector<float64_t> vector)
{
    float64_t* ret=NULL;
    int32_t len=0;

    if (initialized)
    {
        len=num_idx;
        ret=SG_MALLOC(float64_t, num_idx);

        if (divide_by_std)
        {
            for (int32_t i=0; i<num_idx; i++)
                ret[i]=(vector.vector[idx[i]]-mean[i])/std[i];
        }
        else
        {
            for (int32_t i=0; i<num_idx; i++)
                ret[i]=vector.vector[idx[i]]-mean[i];
        }
    }
    else
    {
        // not initialized: return a plain copy of the input
        len=vector.vlen;
        ret=SG_MALLOC(float64_t, vector.vlen);
        for (int32_t i=0; i<vector.vlen; i++)
            ret[i]=vector.vector[i];
    }

    return SGVector<float64_t>(ret, len);
}
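
For context, a minimal usage sketch (not part of the original file) showing how this preprocessor is typically driven: build dense real-valued features, let init() estimate the statistics, then prune and standardize the matrix in place. The initialization helpers (init_shogun_with_defaults / exit_shogun) and the toy data are assumptions based on the libshogun 2.x API, not taken from this file.

#include <shogun/base/init.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/preprocessor/PruneVarSubMean.h>

using namespace shogun;

int main()
{
    init_shogun_with_defaults();

    // toy data: 3 features x 4 examples, stored one example per column;
    // feature 1 is constant and should therefore be pruned
    SGMatrix<float64_t> data(3, 4);
    float64_t values[]={1,5,2,  2,5,4,  3,5,6,  4,5,8};
    for (int32_t i=0; i<12; i++)
        data.matrix[i]=values[i];

    CDenseFeatures<float64_t>* feats=new CDenseFeatures<float64_t>(data);
    SG_REF(feats);

    CPruneVarSubMean* preproc=new CPruneVarSubMean(true); // also divide by std
    SG_REF(preproc);
    preproc->init(feats);                    // learn mean/std, select features
    preproc->apply_to_feature_matrix(feats); // prune and standardize in place

    SG_UNREF(preproc);
    SG_UNREF(feats);
    exit_shogun();
    return 0;
}

In practice the preprocessor is more commonly attached to the features object via feats->add_preprocessor(preproc) followed by feats->apply_preprocessor(), which typically ends up invoking the same init() and apply_to_feature_matrix() methods shown above.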