SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2011 Shashwat Lal Das 00008 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society 00009 */ 00010 #include <shogun/features/streaming/StreamingSparseFeatures.h> 00011 namespace shogun 00012 { 00013 00014 template <class T> 00015 CStreamingSparseFeatures<T>::CStreamingSparseFeatures() : CStreamingDotFeatures() 00016 { 00017 set_read_functions(); 00018 init(); 00019 } 00020 00021 template <class T> 00022 CStreamingSparseFeatures<T>::CStreamingSparseFeatures(CStreamingFile* file, 00023 bool is_labelled, 00024 int32_t size) 00025 : CStreamingDotFeatures() 00026 { 00027 set_read_functions(); 00028 init(file, is_labelled, size); 00029 } 00030 00031 template <class T> 00032 CStreamingSparseFeatures<T>::~CStreamingSparseFeatures() 00033 { 00034 parser.end_parser(); 00035 } 00036 00037 template <class T> 00038 T CStreamingSparseFeatures<T>::get_feature(int32_t index) 00039 { 00040 ASSERT(index>=0 && index<current_num_features); 00041 00042 T ret=0; 00043 00044 if (current_vector) 00045 { 00046 for (int32_t i=0; i<current_length; i++) 00047 if (current_vector[i].feat_index==index) 00048 ret += current_vector[i].entry; 00049 } 00050 00051 return ret; 00052 } 00053 00054 template <class T> 00055 void CStreamingSparseFeatures<T>::reset_stream() 00056 { 00057 } 00058 00059 template <class T> 00060 int32_t CStreamingSparseFeatures<T>::set_num_features(int32_t num) 00061 { 00062 int32_t n=current_num_features; 00063 ASSERT(n<=num); 00064 current_num_features=num; 00065 return n; 00066 } 00067 00068 template <class T> 00069 void CStreamingSparseFeatures<T>::expand_if_required(float32_t*& vec, int32_t &len) 00070 { 00071 int32_t dim = get_dim_feature_space(); 00072 if (dim > len) 00073 { 00074 vec = SG_REALLOC(float32_t, vec, dim); 00075 memset(&vec[len], 0, (dim-len) * sizeof(float32_t)); 00076 len = dim; 00077 } 00078 } 00079 00080 template <class T> 00081 void CStreamingSparseFeatures<T>::expand_if_required(float64_t*& vec, int32_t &len) 00082 { 00083 int32_t dim = get_dim_feature_space(); 00084 if (dim > len) 00085 { 00086 vec = SG_REALLOC(float64_t, vec, dim); 00087 memset(&vec[len], 0, (dim-len) * sizeof(float64_t)); 00088 len = dim; 00089 } 00090 } 00091 00092 template <class T> 00093 T CStreamingSparseFeatures<T>::sparse_dot(T alpha, SGSparseVectorEntry<T>* avec, int32_t alen, SGSparseVectorEntry<T>* bvec, int32_t blen) 00094 { 00095 T result=0; 00096 00097 //result remains zero when one of the vectors is non existent 00098 if (avec && bvec) 00099 { 00100 if (alen<=blen) 00101 { 00102 int32_t j=0; 00103 for (int32_t i=0; i<alen; i++) 00104 { 00105 int32_t a_feat_idx=avec[i].feat_index; 00106 00107 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) ) 00108 j++; 00109 00110 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) ) 00111 { 00112 result+= avec[i].entry * bvec[j].entry; 00113 j++; 00114 } 00115 } 00116 } 00117 else 00118 { 00119 int32_t j=0; 00120 for (int32_t i=0; i<blen; i++) 00121 { 00122 int32_t b_feat_idx=bvec[i].feat_index; 00123 00124 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) ) 00125 j++; 00126 00127 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) ) 00128 { 00129 result+= bvec[i].entry * avec[j].entry; 00130 j++; 00131 } 00132 } 00133 } 00134 00135 result*=alpha; 00136 } 00137 00138 return result; 00139 } 00140 00141 template <class T> 00142 T CStreamingSparseFeatures<T>::dense_dot(T alpha, T* vec, int32_t dim, T b) 00143 { 00144 ASSERT(vec); 00145 ASSERT(dim>=current_num_features); 00146 T result=b; 00147 00148 int32_t num_feat=current_length; 00149 SGSparseVectorEntry<T>* sv=current_vector; 00150 00151 if (sv) 00152 { 00153 for (int32_t i=0; i<num_feat; i++) 00154 result+=alpha*vec[sv[i].feat_index]*sv[i].entry; 00155 } 00156 00157 return result; 00158 } 00159 00160 template <class T> 00161 float64_t CStreamingSparseFeatures<T>::dense_dot(const float64_t* vec2, int32_t vec2_len) 00162 { 00163 ASSERT(vec2); 00164 if (vec2_len < current_num_features) 00165 { 00166 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n", 00167 vec2_len, current_num_features); 00168 } 00169 00170 float64_t result=0; 00171 if (current_vector) 00172 { 00173 for (int32_t i=0; i<current_length; i++) 00174 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry; 00175 } 00176 00177 return result; 00178 } 00179 00180 template <class T> 00181 float32_t CStreamingSparseFeatures<T>::dense_dot(const float32_t* vec2, int32_t vec2_len) 00182 { 00183 ASSERT(vec2); 00184 if (vec2_len < current_num_features) 00185 { 00186 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n", 00187 vec2_len, current_num_features); 00188 } 00189 00190 float32_t result=0; 00191 if (current_vector) 00192 { 00193 for (int32_t i=0; i<current_length; i++) 00194 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry; 00195 } 00196 00197 return result; 00198 } 00199 00200 template <class T> 00201 void CStreamingSparseFeatures<T>::add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val) 00202 { 00203 ASSERT(vec2); 00204 if (vec2_len < current_num_features) 00205 { 00206 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n", 00207 vec2_len, current_num_features); 00208 } 00209 00210 SGSparseVectorEntry<T>* sv=current_vector; 00211 int32_t num_feat=current_length; 00212 00213 if (sv) 00214 { 00215 if (abs_val) 00216 { 00217 for (int32_t i=0; i<num_feat; i++) 00218 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry); 00219 } 00220 else 00221 { 00222 for (int32_t i=0; i<num_feat; i++) 00223 vec2[sv[i].feat_index]+= alpha*sv[i].entry; 00224 } 00225 } 00226 } 00227 00228 template <class T> 00229 void CStreamingSparseFeatures<T>::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val) 00230 { 00231 ASSERT(vec2); 00232 if (vec2_len < current_num_features) 00233 { 00234 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n", 00235 vec2_len, current_num_features); 00236 } 00237 00238 SGSparseVectorEntry<T>* sv=current_vector; 00239 int32_t num_feat=current_length; 00240 00241 if (sv) 00242 { 00243 if (abs_val) 00244 { 00245 for (int32_t i=0; i<num_feat; i++) 00246 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry); 00247 } 00248 else 00249 { 00250 for (int32_t i=0; i<num_feat; i++) 00251 vec2[sv[i].feat_index]+= alpha*sv[i].entry; 00252 } 00253 } 00254 } 00255 00256 template <class T> 00257 int64_t CStreamingSparseFeatures<T>::get_num_nonzero_entries() 00258 { 00259 return current_length; 00260 } 00261 00262 template <class T> 00263 float32_t CStreamingSparseFeatures<T>::compute_squared() 00264 { 00265 ASSERT(current_vector); 00266 00267 float32_t sq=0; 00268 00269 for (int32_t i=0; i<current_length; i++) 00270 sq += current_vector[i].entry * current_vector[i].entry; 00271 00272 return sq; 00273 } 00274 00275 template <class T> 00276 void CStreamingSparseFeatures<T>::sort_features() 00277 { 00278 ASSERT(current_vector); 00279 00280 SGSparseVectorEntry<T>* sf_orig=current_vector; 00281 int32_t len=current_length; 00282 00283 int32_t* feat_idx=SG_MALLOC(int32_t, len); 00284 int32_t* orig_idx=SG_MALLOC(int32_t, len); 00285 00286 for (int32_t i=0; i<len; i++) 00287 { 00288 feat_idx[i]=sf_orig[i].feat_index; 00289 orig_idx[i]=i; 00290 } 00291 00292 CMath::qsort_index(feat_idx, orig_idx, len); 00293 00294 SGSparseVectorEntry<T>* sf_new=SG_MALLOC(SGSparseVectorEntry<T>, len); 00295 00296 for (int32_t i=0; i<len; i++) 00297 sf_new[i]=sf_orig[orig_idx[i]]; 00298 00299 // sanity check 00300 for (int32_t i=0; i<len-1; i++) 00301 ASSERT(sf_new[i].feat_index<sf_new[i+1].feat_index); 00302 00303 // Copy new vector back to original 00304 for (int32_t i=0; i<len; i++) 00305 sf_orig[i]=sf_new[i]; 00306 00307 SG_FREE(orig_idx); 00308 SG_FREE(feat_idx); 00309 SG_FREE(sf_new); 00310 } 00311 00312 template <class T> 00313 CFeatures* CStreamingSparseFeatures<T>::duplicate() const 00314 { 00315 return new CStreamingSparseFeatures<T>(*this); 00316 } 00317 00318 template <class T> 00319 int32_t CStreamingSparseFeatures<T>::get_num_vectors() const 00320 { 00321 if (current_vector) 00322 return 1; 00323 return 0; 00324 } 00325 00326 template <class T> 00327 int32_t CStreamingSparseFeatures<T>::get_size() const 00328 { 00329 return sizeof(T); 00330 } 00331 00332 template <class T> void CStreamingSparseFeatures<T>::set_vector_reader() 00333 { 00334 parser.set_read_vector(&CStreamingFile::get_sparse_vector); 00335 } 00336 00337 template <class T> void CStreamingSparseFeatures<T>::set_vector_and_label_reader() 00338 { 00339 parser.set_read_vector_and_label 00340 (&CStreamingFile::get_sparse_vector_and_label); 00341 } 00342 00343 #define GET_FEATURE_TYPE(f_type, sg_type) \ 00344 template<> EFeatureType CStreamingSparseFeatures<sg_type>::get_feature_type() const \ 00345 { \ 00346 return f_type; \ 00347 } 00348 00349 GET_FEATURE_TYPE(F_BOOL, bool) 00350 GET_FEATURE_TYPE(F_CHAR, char) 00351 GET_FEATURE_TYPE(F_BYTE, uint8_t) 00352 GET_FEATURE_TYPE(F_BYTE, int8_t) 00353 GET_FEATURE_TYPE(F_SHORT, int16_t) 00354 GET_FEATURE_TYPE(F_WORD, uint16_t) 00355 GET_FEATURE_TYPE(F_INT, int32_t) 00356 GET_FEATURE_TYPE(F_UINT, uint32_t) 00357 GET_FEATURE_TYPE(F_LONG, int64_t) 00358 GET_FEATURE_TYPE(F_ULONG, uint64_t) 00359 GET_FEATURE_TYPE(F_SHORTREAL, float32_t) 00360 GET_FEATURE_TYPE(F_DREAL, float64_t) 00361 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t) 00362 #undef GET_FEATURE_TYPE 00363 00364 00365 template <class T> 00366 void CStreamingSparseFeatures<T>::init() 00367 { 00368 working_file=NULL; 00369 current_vector=NULL; 00370 current_length=-1; 00371 current_vec_index=0; 00372 current_num_features=-1; 00373 } 00374 00375 template <class T> 00376 void CStreamingSparseFeatures<T>::init(CStreamingFile* file, 00377 bool is_labelled, 00378 int32_t size) 00379 { 00380 init(); 00381 has_labels = is_labelled; 00382 working_file = file; 00383 parser.init(file, is_labelled, size); 00384 } 00385 00386 template <class T> 00387 void CStreamingSparseFeatures<T>::start_parser() 00388 { 00389 if (!parser.is_running()) 00390 parser.start_parser(); 00391 } 00392 00393 template <class T> 00394 void CStreamingSparseFeatures<T>::end_parser() 00395 { 00396 parser.end_parser(); 00397 } 00398 00399 template <class T> 00400 bool CStreamingSparseFeatures<T>::get_next_example() 00401 { 00402 bool ret_value; 00403 ret_value = (bool) parser.get_next_example(current_vector, 00404 current_length, 00405 current_label); 00406 00407 if (!ret_value) 00408 return false; 00409 00410 // Update number of features based on highest index 00411 for (int32_t i=0; i<current_length; i++) 00412 { 00413 if (current_vector[i].feat_index > current_num_features) 00414 current_num_features = current_vector[i].feat_index+1; 00415 } 00416 current_vec_index++; 00417 00418 return true; 00419 } 00420 00421 template <class T> 00422 SGSparseVector<T> CStreamingSparseFeatures<T>::get_vector() 00423 { 00424 current_sgvector.features=current_vector; 00425 current_sgvector.num_feat_entries=current_length; 00426 00427 return current_sgvector; 00428 } 00429 00430 template <class T> 00431 float64_t CStreamingSparseFeatures<T>::get_label() 00432 { 00433 ASSERT(has_labels); 00434 00435 return current_label; 00436 } 00437 00438 template <class T> 00439 void CStreamingSparseFeatures<T>::release_example() 00440 { 00441 parser.finalize_example(); 00442 } 00443 00444 template <class T> 00445 int32_t CStreamingSparseFeatures<T>::get_dim_feature_space() const 00446 { 00447 return current_num_features; 00448 } 00449 00450 template <class T> 00451 float32_t CStreamingSparseFeatures<T>::dot(CStreamingDotFeatures* df) 00452 { 00453 SG_NOTIMPLEMENTED; 00454 return -1; 00455 } 00456 00457 template <class T> 00458 int32_t CStreamingSparseFeatures<T>::get_num_features() 00459 { 00460 return current_num_features; 00461 } 00462 00463 template <class T> 00464 int32_t CStreamingSparseFeatures<T>::get_nnz_features_for_vector() 00465 { 00466 return current_length; 00467 } 00468 00469 template <class T> 00470 EFeatureClass CStreamingSparseFeatures<T>::get_feature_class() const 00471 { 00472 return C_STREAMING_SPARSE; 00473 } 00474 00475 template class CStreamingSparseFeatures<bool>; 00476 template class CStreamingSparseFeatures<char>; 00477 template class CStreamingSparseFeatures<int8_t>; 00478 template class CStreamingSparseFeatures<uint8_t>; 00479 template class CStreamingSparseFeatures<int16_t>; 00480 template class CStreamingSparseFeatures<uint16_t>; 00481 template class CStreamingSparseFeatures<int32_t>; 00482 template class CStreamingSparseFeatures<uint32_t>; 00483 template class CStreamingSparseFeatures<int64_t>; 00484 template class CStreamingSparseFeatures<uint64_t>; 00485 template class CStreamingSparseFeatures<float32_t>; 00486 template class CStreamingSparseFeatures<float64_t>; 00487 template class CStreamingSparseFeatures<floatmax_t>; 00488 }