SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2010 Soeren Sonnenburg 00008 * Copyright (C) 2010 Berlin Institute of Technology 00009 */ 00010 #include <shogun/features/SparsePolyFeatures.h> 00011 #include <shogun/lib/Hash.h> 00012 00013 using namespace shogun; 00014 00015 CSparsePolyFeatures::CSparsePolyFeatures() 00016 { 00017 SG_UNSTABLE("CSparsePolyFeatures::CSparsePolyFeatures()", 00018 "\n"); 00019 00020 m_feat = NULL; 00021 m_degree = 0; 00022 m_normalize = false; 00023 m_input_dimensions = 0; 00024 m_output_dimensions = 0; 00025 m_normalization_values = NULL; 00026 mask = 0; 00027 m_hash_bits = 0; 00028 } 00029 00030 CSparsePolyFeatures::CSparsePolyFeatures(CSparseFeatures<float64_t>* feat, int32_t degree, bool normalize, int32_t hash_bits) 00031 : CDotFeatures(), m_normalization_values(NULL) 00032 { 00033 ASSERT(feat); 00034 00035 m_feat = feat; 00036 SG_REF(m_feat); 00037 m_degree=degree; 00038 m_normalize=normalize; 00039 m_hash_bits=hash_bits; 00040 mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1; 00041 m_output_dimensions=1<<m_hash_bits; 00042 m_input_dimensions=feat->get_num_features(); 00043 00044 if (m_normalize) 00045 store_normalization_values(); 00046 } 00047 00048 CSparsePolyFeatures::~CSparsePolyFeatures() 00049 { 00050 SG_FREE(m_normalization_values); 00051 SG_UNREF(m_feat); 00052 } 00053 00054 CSparsePolyFeatures::CSparsePolyFeatures(const CSparsePolyFeatures & orig) 00055 { 00056 SG_PRINT("CSparsePolyFeatures:\n"); 00057 SG_NOTIMPLEMENTED; 00058 } 00059 00060 int32_t CSparsePolyFeatures::get_dim_feature_space() const 00061 { 00062 return m_output_dimensions; 00063 } 00064 00065 int32_t CSparsePolyFeatures::get_nnz_features_for_vector(int32_t num) 00066 { 00067 int32_t vlen; 00068 SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(num); 00069 vlen=vec.num_feat_entries; 00070 m_feat->free_feature_vector(num); 00071 return vlen*(vlen+1)/2; 00072 } 00073 00074 EFeatureType CSparsePolyFeatures::get_feature_type() const 00075 { 00076 return F_UNKNOWN; 00077 } 00078 00079 EFeatureClass CSparsePolyFeatures::get_feature_class() const 00080 { 00081 return C_POLY; 00082 } 00083 00084 int32_t CSparsePolyFeatures::get_num_vectors() const 00085 { 00086 if (m_feat) 00087 return m_feat->get_num_vectors(); 00088 else 00089 return 0; 00090 00091 } 00092 00093 int32_t CSparsePolyFeatures::get_size() const 00094 { 00095 return sizeof(float64_t); 00096 } 00097 00098 void* CSparsePolyFeatures::get_feature_iterator(int32_t vector_index) 00099 { 00100 SG_NOTIMPLEMENTED; 00101 return NULL; 00102 } 00103 00104 bool CSparsePolyFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator) 00105 { 00106 SG_NOTIMPLEMENTED; 00107 return NULL; 00108 } 00109 00110 void CSparsePolyFeatures::free_feature_iterator(void* iterator) 00111 { 00112 SG_NOTIMPLEMENTED; 00113 } 00114 00115 float64_t CSparsePolyFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2) 00116 { 00117 ASSERT(df); 00118 ASSERT(df->get_feature_type() == get_feature_type()); 00119 ASSERT(df->get_feature_class() == get_feature_class()); 00120 00121 CSparsePolyFeatures* pf=(CSparsePolyFeatures*) df; 00122 00123 SGSparseVector<float64_t> vec1=m_feat->get_sparse_feature_vector(vec_idx1); 00124 SGSparseVector<float64_t> vec2=pf->m_feat->get_sparse_feature_vector( 00125 vec_idx2); 00126 00127 float64_t result=CSparseFeatures<float64_t>::sparse_dot(1, vec1.features, 00128 vec1.num_feat_entries, vec2.features, vec2.num_feat_entries); 00129 result=CMath::pow(result, m_degree); 00130 00131 m_feat->free_feature_vector(vec_idx1); 00132 pf->m_feat->free_feature_vector(vec_idx2); 00133 00134 return result; 00135 } 00136 00137 float64_t CSparsePolyFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len) 00138 { 00139 if (vec2_len != m_output_dimensions) 00140 SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions); 00141 00142 SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(vec_idx1); 00143 00144 float64_t result=0; 00145 00146 if (vec.features) 00147 { 00148 if (m_degree==2) 00149 { 00150 /* (a+b)^2 = a^2 + 2ab +b^2 */ 00151 for (int32_t i=0; i<vec.num_feat_entries; i++) 00152 { 00153 float64_t v1=vec.features[i].entry; 00154 uint32_t seed=CHash::MurmurHash3( 00155 (uint8_t*)&(vec.features[i].feat_index), 00156 sizeof(int32_t), 0xDEADBEAF); 00157 00158 for (int32_t j=i; j<vec.num_feat_entries; j++) 00159 { 00160 float64_t v2=vec.features[j].entry; 00161 uint32_t h=CHash::MurmurHash3( 00162 (uint8_t*)&(vec.features[j].feat_index), 00163 sizeof(int32_t), seed)&mask; 00164 float64_t v; 00165 00166 if (i==j) 00167 v=v1*v1; 00168 else 00169 v=CMath::sqrt(2.0)*v1*v2; 00170 00171 result+=v*vec2[h]; 00172 } 00173 } 00174 } 00175 else if (m_degree==3) 00176 SG_NOTIMPLEMENTED; 00177 } 00178 00179 if (m_normalize) 00180 result/=m_normalization_values[vec_idx1]; 00181 00182 m_feat->free_feature_vector(vec_idx1); 00183 return result; 00184 } 00185 00186 void CSparsePolyFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00187 { 00188 if (vec2_len!=m_output_dimensions) 00189 SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions); 00190 00191 SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(vec_idx1); 00192 00193 float64_t norm_val=1.0; 00194 if (m_normalize) 00195 norm_val = m_normalization_values[vec_idx1]; 00196 alpha/=norm_val; 00197 00198 if (m_degree==2) 00199 { 00200 /* (a+b)^2 = a^2 + 2ab +b^2 */ 00201 for (int32_t i=0; i<vec.num_feat_entries; i++) 00202 { 00203 float64_t v1=vec.features[i].entry; 00204 uint32_t seed=CHash::MurmurHash3( 00205 (uint8_t*)&(vec.features[i].feat_index), sizeof(int32_t), 00206 0xDEADBEAF); 00207 00208 for (int32_t j=i; j<vec.num_feat_entries; j++) 00209 { 00210 float64_t v2=vec.features[j].entry; 00211 uint32_t h=CHash::MurmurHash3( 00212 (uint8_t*)&(vec.features[j].feat_index), 00213 sizeof(int32_t), seed)&mask; 00214 float64_t v; 00215 00216 if (i==j) 00217 v=alpha*v1*v1; 00218 else 00219 v=alpha*CMath::sqrt(2.0)*v1*v2; 00220 00221 if (abs_val) 00222 vec2[h]+=CMath::abs(v); 00223 else 00224 vec2[h]+=v; 00225 } 00226 } 00227 } 00228 else if (m_degree==3) 00229 SG_NOTIMPLEMENTED; 00230 00231 m_feat->free_feature_vector(vec_idx1); 00232 } 00233 00234 void CSparsePolyFeatures::store_normalization_values() 00235 { 00236 SG_FREE(m_normalization_values); 00237 00238 m_normalization_values_len = this->get_num_vectors(); 00239 00240 m_normalization_values=SG_MALLOC(float64_t, m_normalization_values_len); 00241 for (int i=0; i<m_normalization_values_len; i++) 00242 { 00243 float64_t val = CMath::sqrt(dot(i, this,i)); 00244 if (val==0) 00245 // trap division by zero 00246 m_normalization_values[i]=1.0; 00247 else 00248 m_normalization_values[i]=val; 00249 } 00250 00251 } 00252 00253 CFeatures* CSparsePolyFeatures::duplicate() const 00254 { 00255 return new CSparsePolyFeatures(*this); 00256 } 00257 00258 void CSparsePolyFeatures::init() 00259 { 00260 m_parameters->add((CSGObject**) &m_feat, "features", 00261 "Features in original space."); 00262 m_parameters->add(&m_degree, "degree", "Degree of the polynomial kernel."); 00263 m_parameters->add(&m_normalize, "normalize", "Normalize"); 00264 m_parameters->add(&m_input_dimensions, "input_dimensions", 00265 "Dimensions of the input space."); 00266 m_parameters->add(&m_output_dimensions, "output_dimensions", 00267 "Dimensions of the feature space of the polynomial kernel."); 00268 m_normalization_values_len = get_num_vectors(); 00269 m_parameters->add_vector(&m_normalization_values, &m_normalization_values_len, 00270 "m_normalization_values", "Norm of each training example"); 00271 m_parameters->add(&mask, "mask", "Mask."); 00272 m_parameters->add(&m_hash_bits, "m_hash_bits", "Number of bits in hash"); 00273 }