// SHOGUN v2.0.0
00001 #include <shogun/features/SparseFeatures.h> 00002 #include <shogun/preprocessor/SparsePreprocessor.h> 00003 #include <shogun/mathematics/Math.h> 00004 #include <shogun/lib/DataType.h> 00005 #include <shogun/labels/RegressionLabels.h> 00006 #include <shogun/io/SGIO.h> 00007 00008 #include <string.h> 00009 #include <stdlib.h> 00010 00011 namespace shogun 00012 { 00013 00014 template<class ST> CSparseFeatures<ST>::CSparseFeatures(int32_t size) 00015 : CDotFeatures(size), num_vectors(0), num_features(0), 00016 sparse_feature_matrix(NULL), feature_cache(NULL) 00017 { 00018 init(); 00019 } 00020 00021 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGSparseVector<ST>* src, 00022 int32_t num_feat, int32_t num_vec, bool copy) 00023 : CDotFeatures(0), num_vectors(num_vec), num_features(num_feat), 00024 sparse_feature_matrix(NULL), feature_cache(NULL) 00025 { 00026 init(); 00027 00028 sparse_feature_matrix = src; 00029 //SG_MALLOC(SGSparseVector<ST>, num_vec); 00030 //for (int32_t i=0; i< num_vec; i++) 00031 //{ 00032 // new (&sparse_feature_matrix[i]) SGSparseVector<ST>(); 00033 // sparse_feature_matrix[i] = src[i]; 00034 //} 00035 } 00036 00037 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGSparseMatrix<ST> sparse) 00038 : CDotFeatures(0), num_vectors(0), num_features(0), 00039 sparse_feature_matrix(NULL), feature_cache(NULL) 00040 { 00041 init(); 00042 00043 set_sparse_feature_matrix(sparse); 00044 } 00045 00046 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGMatrix<ST> dense) 00047 : CDotFeatures(0), num_vectors(0), num_features(0), 00048 sparse_feature_matrix(NULL), feature_cache(NULL) 00049 { 00050 init(); 00051 00052 set_full_feature_matrix(dense); 00053 } 00054 00055 template<class ST> CSparseFeatures<ST>::CSparseFeatures(const CSparseFeatures & orig) 00056 : CDotFeatures(orig), num_vectors(orig.num_vectors), 00057 num_features(orig.num_features), 00058 sparse_feature_matrix(orig.sparse_feature_matrix), 00059 
feature_cache(orig.feature_cache) 00060 { 00061 init(); 00062 00063 if (orig.sparse_feature_matrix) 00064 { 00065 free_sparse_feature_matrix(); 00066 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors); 00067 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(SGSparseVector<ST>)*num_vectors); 00068 for (int32_t i=0; i< num_vectors; i++) 00069 { 00070 sparse_feature_matrix[i].features=SG_MALLOC(SGSparseVectorEntry<ST>, sparse_feature_matrix[i].num_feat_entries); 00071 memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(SGSparseVectorEntry<ST>)*sparse_feature_matrix[i].num_feat_entries); 00072 00073 } 00074 } 00075 00076 m_subset_stack=orig.m_subset_stack; 00077 SG_REF(m_subset_stack); 00078 } 00079 template<class ST> CSparseFeatures<ST>::CSparseFeatures(CFile* loader) 00080 : CDotFeatures(loader), num_vectors(0), num_features(0), 00081 sparse_feature_matrix(NULL), feature_cache(NULL) 00082 { 00083 init(); 00084 00085 load(loader); 00086 } 00087 00088 template<class ST> CSparseFeatures<ST>::~CSparseFeatures() 00089 { 00090 free_sparse_features(); 00091 } 00092 template<class ST> void CSparseFeatures<ST>::free_sparse_feature_matrix() 00093 { 00094 for (int32_t i=0; i<num_vectors; i++) 00095 (&sparse_feature_matrix[i])->~SGSparseVector(); 00096 00097 SG_FREE(sparse_feature_matrix); 00098 num_vectors=0; 00099 num_features=0; 00100 remove_all_subsets(); 00101 } 00102 template<class ST> void CSparseFeatures<ST>::free_sparse_features() 00103 { 00104 free_sparse_feature_matrix(); 00105 delete feature_cache; 00106 feature_cache = NULL; 00107 } 00108 template<class ST> CFeatures* CSparseFeatures<ST>::duplicate() const 00109 { 00110 return new CSparseFeatures<ST>(*this); 00111 } 00112 00113 template<class ST> ST CSparseFeatures<ST>::get_feature(int32_t num, int32_t index) 00114 { 00115 ASSERT(index>=0 && index<num_features); 00116 ASSERT(num>=0 && num<get_num_vectors()); 00117 00118 int32_t i; 00119 
SGSparseVector<ST> sv=get_sparse_feature_vector(num); 00120 ST ret = 0 ; 00121 00122 if (sv.features) 00123 { 00124 for (i=0; i<sv.num_feat_entries; i++) 00125 if (sv.features[i].feat_index==index) 00126 ret+=sv.features[i].entry ; 00127 } 00128 00129 free_sparse_feature_vector(num); 00130 00131 return ret ; 00132 } 00133 00134 template<class ST> ST* CSparseFeatures<ST>::get_full_feature_vector(int32_t num, int32_t& len) 00135 { 00136 int32_t i; 00137 len=0; 00138 SGSparseVector<ST> sv=get_sparse_feature_vector(num); 00139 ST* fv=NULL; 00140 00141 if (sv.features) 00142 { 00143 len=num_features; 00144 fv=SG_MALLOC(ST, num_features); 00145 00146 for (i=0; i<num_features; i++) 00147 fv[i]=0; 00148 00149 for (i=0; i<sv.num_feat_entries; i++) 00150 fv[sv.features[i].feat_index]= sv.features[i].entry; 00151 } 00152 00153 free_sparse_feature_vector(num); 00154 00155 return fv; 00156 } 00157 00158 template<class ST> SGVector<ST> CSparseFeatures<ST>::get_full_feature_vector(int32_t num) 00159 { 00160 if (num>=get_num_vectors()) 00161 { 00162 SG_ERROR("Index out of bounds (number of vectors %d, you " 00163 "requested %d)\n", get_num_vectors(), num); 00164 } 00165 00166 SGSparseVector<ST> sv=get_sparse_feature_vector(num); 00167 00168 SGVector<ST> dense; 00169 00170 if (sv.features) 00171 { 00172 dense=SGVector<ST>(num_features); 00173 dense.zero(); 00174 00175 for (int32_t i=0; i<sv.num_feat_entries; i++) 00176 dense.vector[sv.features[i].feat_index]= sv.features[i].entry; 00177 } 00178 00179 free_sparse_feature_vector(num); 00180 00181 return dense; 00182 } 00183 00184 template<class ST> int32_t CSparseFeatures<ST>::get_nnz_features_for_vector(int32_t num) 00185 { 00186 SGSparseVector<ST> sv = get_sparse_feature_vector(num); 00187 int32_t len=sv.num_feat_entries; 00188 free_sparse_feature_vector(num); 00189 return len; 00190 } 00191 00192 template<class ST> SGSparseVector<ST> CSparseFeatures<ST>::get_sparse_feature_vector(int32_t num) 00193 { 00194 
ASSERT(num<get_num_vectors()); 00195 00196 index_t real_num=m_subset_stack->subset_idx_conversion(num); 00197 00198 SGSparseVector<ST> result; 00199 00200 if (sparse_feature_matrix) 00201 { 00202 return sparse_feature_matrix[real_num]; 00203 } 00204 else 00205 { 00206 if (feature_cache) 00207 { 00208 result.features=feature_cache->lock_entry(num); 00209 00210 if (result.features) 00211 return result; 00212 else 00213 { 00214 result.features=feature_cache->set_entry(num); 00215 } 00216 } 00217 00218 //if (!result.features) 00219 // result.do_free=true; 00220 00221 result.features=compute_sparse_feature_vector(num, 00222 result.num_feat_entries, result.features); 00223 00224 00225 if (get_num_preprocessors()) 00226 { 00227 int32_t tmp_len=result.num_feat_entries; 00228 SGSparseVectorEntry<ST>* tmp_feat_before=result.features; 00229 SGSparseVectorEntry<ST>* tmp_feat_after = NULL; 00230 00231 for (int32_t i=0; i<get_num_preprocessors(); i++) 00232 { 00233 //tmp_feat_after=((CSparsePreprocessor<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len); 00234 00235 if (i!=0) // delete feature vector, except for the the first one, i.e., feat 00236 SG_FREE(tmp_feat_before); 00237 tmp_feat_before=tmp_feat_after; 00238 } 00239 00240 memcpy(result.features, tmp_feat_after, 00241 sizeof(SGSparseVectorEntry<ST>)*tmp_len); 00242 00243 SG_FREE(tmp_feat_after); 00244 result.num_feat_entries=tmp_len ; 00245 SG_DEBUG( "len: %d len2: %d\n", result.num_feat_entries, num_features); 00246 } 00247 return result ; 00248 } 00249 } 00250 00251 template<class ST> ST CSparseFeatures<ST>::sparse_dot(ST alpha, SGSparseVectorEntry<ST>* avec, int32_t alen, SGSparseVectorEntry<ST>* bvec, int32_t blen) 00252 { 00253 ST result=0; 00254 00255 //result remains zero when one of the vectors is non existent 00256 if (avec && bvec) 00257 { 00258 if (alen<=blen) 00259 { 00260 int32_t j=0; 00261 for (int32_t i=0; i<alen; i++) 00262 { 00263 int32_t a_feat_idx=avec[i].feat_index; 00264 00265 
while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) ) 00266 j++; 00267 00268 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) ) 00269 { 00270 result+= avec[i].entry * bvec[j].entry; 00271 j++; 00272 } 00273 } 00274 } 00275 else 00276 { 00277 int32_t j=0; 00278 for (int32_t i=0; i<blen; i++) 00279 { 00280 int32_t b_feat_idx=bvec[i].feat_index; 00281 00282 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) ) 00283 j++; 00284 00285 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) ) 00286 { 00287 result+= bvec[i].entry * avec[j].entry; 00288 j++; 00289 } 00290 } 00291 } 00292 00293 result*=alpha; 00294 } 00295 00296 return result; 00297 } 00298 00299 template<class ST> ST CSparseFeatures<ST>::dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b) 00300 { 00301 ASSERT(vec); 00302 ASSERT(dim==num_features); 00303 ST result=b; 00304 00305 SGSparseVector<ST> sv=get_sparse_feature_vector(num); 00306 00307 if (sv.features) 00308 { 00309 for (int32_t i=0; i<sv.num_feat_entries; i++) 00310 { 00311 result+=alpha*vec[sv.features[i].feat_index] 00312 *sv.features[i].entry; 00313 } 00314 } 00315 00316 free_sparse_feature_vector(num); 00317 return result; 00318 } 00319 00320 template<class ST> void CSparseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val) 00321 { 00322 ASSERT(vec); 00323 if (dim!=num_features) 00324 { 00325 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n", 00326 dim, num_features); 00327 } 00328 00329 SGSparseVector<ST> sv=get_sparse_feature_vector(num); 00330 00331 if (sv.features) 00332 { 00333 if (abs_val) 00334 { 00335 for (int32_t i=0; i<sv.num_feat_entries; i++) 00336 { 00337 vec[sv.features[i].feat_index]+=alpha 00338 *CMath::abs(sv.features[i].entry); 00339 } 00340 } 00341 else 00342 { 00343 for (int32_t i=0; i<sv.num_feat_entries; i++) 00344 { 00345 vec[sv.features[i].feat_index]+=alpha 00346 *sv.features[i].entry; 00347 } 00348 } 00349 } 00350 00351 
free_sparse_feature_vector(num); 00352 } 00353 00354 template<class ST> void CSparseFeatures<ST>::free_sparse_feature_vector(int32_t num) 00355 { 00356 if (feature_cache) 00357 feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num)); 00358 00359 //vec.free_vector(); 00360 } 00361 00362 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec) 00363 { 00364 if (m_subset_stack->has_subsets()) 00365 SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n"); 00366 00367 num_feat=num_features; 00368 num_vec=num_vectors; 00369 00370 return sparse_feature_matrix; 00371 } 00372 00373 template<class ST> SGSparseMatrix<ST> CSparseFeatures<ST>::get_sparse_feature_matrix() 00374 { 00375 if (m_subset_stack->has_subsets()) 00376 SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n"); 00377 00378 SGSparseMatrix<ST> sm=SGSparseMatrix<ST>(NULL, 0, 0, false); 00379 sm.sparse_matrix=get_sparse_feature_matrix(sm.num_features, sm.num_vectors); 00380 return sm; 00381 } 00382 00383 template<class ST> CSparseFeatures<ST>* CSparseFeatures<ST>::get_transposed() 00384 { 00385 int32_t num_feat; 00386 int32_t num_vec; 00387 SGSparseVector<ST>* s=get_transposed(num_feat, num_vec); 00388 //SG_PRINT("num_feat = %d , num_vec = %d \n", num_feat, num_vec); 00389 return new CSparseFeatures<ST>(s, num_feat, num_vec); 00390 } 00391 00392 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec) 00393 { 00394 num_feat=get_num_vectors(); 00395 num_vec=num_features; 00396 //SG_PRINT("get transposed num_feat = %d , num_vec = %d \n", num_feat, num_vec); 00397 00398 int32_t* hist=SG_MALLOC(int32_t, num_features); 00399 memset(hist, 0, sizeof(int32_t)*num_features); 00400 00401 // count how lengths of future feature vectors 00402 for (int32_t v=0; v<num_feat; v++) 00403 { 00404 SGSparseVector<ST> sv=get_sparse_feature_vector(v); 00405 00406 for (int32_t i=0; 
i<sv.num_feat_entries; i++) 00407 hist[sv.features[i].feat_index]++; 00408 } 00409 00410 // allocate room for future feature vectors 00411 SGSparseVector<ST>* sfm=SG_MALLOC(SGSparseVector<ST>, num_vec); 00412 for (int32_t v=0; v<num_vec; v++) 00413 new (&sfm[v]) SGSparseVector<ST>(hist[v]); 00414 00415 // fill future feature vectors with content 00416 memset(hist,0,sizeof(int32_t)*num_features); 00417 for (int32_t v=0; v<num_feat; v++) 00418 { 00419 SGSparseVector<ST> sv=get_sparse_feature_vector(v); 00420 00421 for (int32_t i=0; i<sv.num_feat_entries; i++) 00422 { 00423 int32_t vidx=sv.features[i].feat_index; 00424 int32_t fidx=v; 00425 sfm[vidx].features[hist[vidx]].feat_index=fidx; 00426 sfm[vidx].features[hist[vidx]].entry=sv.features[i].entry; 00427 hist[vidx]++; 00428 } 00429 00430 } 00431 00432 SG_FREE(hist); 00433 return sfm; 00434 } 00435 00436 template<class ST> void CSparseFeatures<ST>::set_sparse_feature_matrix(SGSparseMatrix<ST> sm) 00437 { 00438 if (m_subset_stack->has_subsets()) 00439 SG_ERROR("set_sparse_feature_matrix() not allowed with subset\n"); 00440 00441 SGSparseVector<ST>* sparse_matrix = SG_MALLOC(SGSparseVector<ST>, sm.num_vectors); 00442 for (int32_t i=0; i<sm.num_vectors; i++) 00443 { 00444 new (&sparse_matrix[i]) SGSparseVector<ST>(); 00445 sparse_matrix[i] = sm[i]; 00446 } 00447 00448 sparse_feature_matrix=sparse_matrix; 00449 num_features=sm.num_features; 00450 num_vectors=sm.num_vectors; 00451 } 00452 00453 template<class ST> SGMatrix<ST> CSparseFeatures<ST>::get_full_feature_matrix() 00454 { 00455 SGMatrix<ST> full(num_features, get_num_vectors()); 00456 full.zero(); 00457 00458 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features); 00459 00460 for (int32_t v=0; v<full.num_cols; v++) 00461 { 00462 int32_t idx=m_subset_stack->subset_idx_conversion(v); 00463 SGSparseVector<ST> current=sparse_feature_matrix[idx]; 00464 00465 for (int32_t f=0; f<current.num_feat_entries; f++) 
00466 { 00467 int64_t offs=(idx*num_features) 00468 +current.features[f].feat_index; 00469 00470 full.matrix[offs]=current.features[f].entry; 00471 } 00472 } 00473 00474 return full; 00475 } 00476 00477 template<class ST> bool CSparseFeatures<ST>::set_full_feature_matrix(SGMatrix<ST> full) 00478 { 00479 remove_all_subsets(); 00480 00481 ST* src=full.matrix; 00482 int32_t num_feat=full.num_rows; 00483 int32_t num_vec=full.num_cols; 00484 00485 free_sparse_feature_matrix(); 00486 bool result=true; 00487 num_features=num_feat; 00488 num_vectors=num_vec; 00489 00490 SG_INFO("converting dense feature matrix to sparse one\n"); 00491 int32_t* num_feat_entries=SG_MALLOC(int, num_vectors); 00492 00493 if (num_feat_entries) 00494 { 00495 int64_t num_total_entries=0; 00496 00497 // count nr of non sparse features 00498 for (int32_t i=0; i< num_vec; i++) 00499 { 00500 num_feat_entries[i]=0; 00501 for (int32_t j=0; j< num_feat; j++) 00502 { 00503 if (src[i*((int64_t) num_feat) + j] != 0) 00504 num_feat_entries[i]++; 00505 } 00506 } 00507 00508 if (num_vec>0) 00509 { 00510 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vec); 00511 00512 if (sparse_feature_matrix) 00513 { 00514 for (int32_t i=0; i< num_vec; i++) 00515 { 00516 new(&sparse_feature_matrix[i]) SGSparseVector<ST>(); 00517 sparse_feature_matrix[i] = SGSparseVector<ST>(num_feat_entries[i]); 00518 int32_t sparse_feat_idx=0; 00519 00520 for (int32_t j=0; j< num_feat; j++) 00521 { 00522 int64_t pos= i*num_feat + j; 00523 00524 if (src[pos] != 0) 00525 { 00526 sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos]; 00527 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j; 00528 sparse_feat_idx++; 00529 num_total_entries++; 00530 } 00531 } 00532 } 00533 } 00534 else 00535 { 00536 SG_ERROR( "allocation of sparse feature matrix failed\n"); 00537 result=false; 00538 } 00539 00540 SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n", 00541 num_total_entries, 
int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec)); 00542 } 00543 else 00544 { 00545 SG_ERROR( "huh ? zero size matrix given ?\n"); 00546 result=false; 00547 } 00548 } 00549 SG_FREE(num_feat_entries); 00550 return result; 00551 } 00552 00553 template<class ST> bool CSparseFeatures<ST>::apply_preprocessor(bool force_preprocessing) 00554 { 00555 SG_INFO( "force: %d\n", force_preprocessing); 00556 00557 if ( sparse_feature_matrix && get_num_preprocessors() ) 00558 { 00559 for (int32_t i=0; i<get_num_preprocessors(); i++) 00560 { 00561 if ( (!is_preprocessed(i) || force_preprocessing) ) 00562 { 00563 set_preprocessed(i); 00564 SG_INFO( "preprocessing using preproc %s\n", get_preprocessor(i)->get_name()); 00565 if (((CSparsePreprocessor<ST>*) get_preprocessor(i))->apply_to_sparse_feature_matrix(this) == NULL) 00566 return false; 00567 } 00568 return true; 00569 } 00570 return true; 00571 } 00572 else 00573 { 00574 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n"); 00575 return false; 00576 } 00577 } 00578 00579 template<class ST> int32_t CSparseFeatures<ST>::get_size() const 00580 { 00581 return sizeof(ST); 00582 } 00583 00584 template<class ST> bool CSparseFeatures<ST>::obtain_from_simple(CDenseFeatures<ST>* sf) 00585 { 00586 SGMatrix<ST> fm=sf->get_feature_matrix(); 00587 ASSERT(fm.matrix && fm.num_cols>0 && fm.num_rows>0); 00588 00589 return set_full_feature_matrix(fm); 00590 } 00591 00592 template<class ST> int32_t CSparseFeatures<ST>::get_num_vectors() const 00593 { 00594 return m_subset_stack->has_subsets() ? 
m_subset_stack->get_size() : num_vectors; 00595 } 00596 00597 template<class ST> int32_t CSparseFeatures<ST>::get_num_features() 00598 { 00599 return num_features; 00600 } 00601 00602 template<class ST> int32_t CSparseFeatures<ST>::set_num_features(int32_t num) 00603 { 00604 int32_t n=num_features; 00605 ASSERT(n<=num); 00606 num_features=num; 00607 return num_features; 00608 } 00609 00610 template<class ST> EFeatureClass CSparseFeatures<ST>::get_feature_class() const 00611 { 00612 return C_SPARSE; 00613 } 00614 00615 template<class ST> void CSparseFeatures<ST>::free_feature_vector(int32_t num) 00616 { 00617 if (feature_cache) 00618 feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num)); 00619 00620 //vec.free_vector(); 00621 } 00622 00623 template<class ST> int64_t CSparseFeatures<ST>::get_num_nonzero_entries() 00624 { 00625 int64_t num=0; 00626 index_t num_vec=get_num_vectors(); 00627 for (int32_t i=0; i<num_vec; i++) 00628 num+=sparse_feature_matrix[m_subset_stack->subset_idx_conversion(i)].num_feat_entries; 00629 00630 return num; 00631 } 00632 00633 template<class ST> float64_t* CSparseFeatures<ST>::compute_squared(float64_t* sq) 00634 { 00635 ASSERT(sq); 00636 00637 index_t num_vec=get_num_vectors(); 00638 for (int32_t i=0; i<num_vec; i++) 00639 { 00640 sq[i]=0; 00641 SGSparseVector<ST> vec=get_sparse_feature_vector(i); 00642 00643 for (int32_t j=0; j<vec.num_feat_entries; j++) 00644 sq[i]+=vec.features[j].entry*vec.features[j].entry; 00645 00646 free_feature_vector(i); 00647 } 00648 00649 return sq; 00650 } 00651 00652 template<class ST> float64_t CSparseFeatures<ST>::compute_squared_norm( 00653 CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, 00654 CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b) 00655 { 00656 int32_t i,j; 00657 ASSERT(lhs); 00658 ASSERT(rhs); 00659 00660 SGSparseVector<float64_t> avec=lhs->get_sparse_feature_vector(idx_a); 00661 SGSparseVector<float64_t> 
bvec=rhs->get_sparse_feature_vector(idx_b); 00662 ASSERT(avec.features); 00663 ASSERT(bvec.features); 00664 00665 float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b]; 00666 00667 if (avec.num_feat_entries<=bvec.num_feat_entries) 00668 { 00669 j=0; 00670 for (i=0; i<avec.num_feat_entries; i++) 00671 { 00672 int32_t a_feat_idx=avec.features[i].feat_index; 00673 00674 while ((j<bvec.num_feat_entries) 00675 &&(bvec.features[j].feat_index<a_feat_idx)) 00676 j++; 00677 00678 if ((j<bvec.num_feat_entries) 00679 &&(bvec.features[j].feat_index==a_feat_idx)) 00680 { 00681 result-=2*(avec.features[i].entry*bvec.features[j].entry); 00682 j++; 00683 } 00684 } 00685 } 00686 else 00687 { 00688 j=0; 00689 for (i=0; i<bvec.num_feat_entries; i++) 00690 { 00691 int32_t b_feat_idx=bvec.features[i].feat_index; 00692 00693 while ((j<avec.num_feat_entries) 00694 &&(avec.features[j].feat_index<b_feat_idx)) 00695 j++; 00696 00697 if ((j<avec.num_feat_entries) 00698 &&(avec.features[j].feat_index==b_feat_idx)) 00699 { 00700 result-=2*(bvec.features[i].entry*avec.features[j].entry); 00701 j++; 00702 } 00703 } 00704 } 00705 00706 ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(idx_a); 00707 ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(idx_b); 00708 00709 return CMath::abs(result); 00710 } 00711 00712 template<class ST> CRegressionLabels* CSparseFeatures<ST>::load_svmlight_file(char* fname, 00713 bool do_sort_features) 00714 { 00715 remove_all_subsets(); 00716 00717 CRegressionLabels* lab=NULL; 00718 00719 size_t blocksize=1024*1024; 00720 size_t required_blocksize=blocksize; 00721 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize); 00722 FILE* f=fopen(fname, "ro"); 00723 00724 if (f) 00725 { 00726 free_sparse_feature_matrix(); 00727 num_vectors=0; 00728 num_features=0; 00729 00730 SG_INFO("counting line numbers in file %s\n", fname); 00731 size_t sz=blocksize; 00732 size_t block_offs=0; 00733 size_t old_block_offs=0; 00734 fseek(f, 0, SEEK_END); 00735 size_t fsize=ftell(f); 00736 
rewind(f); 00737 00738 while (sz == blocksize) 00739 { 00740 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 00741 for (size_t i=0; i<sz; i++) 00742 { 00743 block_offs++; 00744 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00745 { 00746 num_vectors++; 00747 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1); 00748 old_block_offs=block_offs; 00749 } 00750 } 00751 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); 00752 } 00753 00754 SG_INFO("found %d feature vectors\n", num_vectors); 00755 SG_FREE(dummy); 00756 blocksize=required_blocksize; 00757 dummy = SG_MALLOC(uint8_t, blocksize+1); //allow setting of '\0' at EOL 00758 00759 lab=new CRegressionLabels(num_vectors); 00760 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors); 00761 for (int32_t i=0; i<num_vectors; i++) 00762 new (&sparse_feature_matrix[i]) SGSparseVector<ST>(); 00763 rewind(f); 00764 sz=blocksize; 00765 int32_t lines=0; 00766 while (sz == blocksize) 00767 { 00768 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 00769 00770 size_t old_sz=0; 00771 for (size_t i=0; i<sz; i++) 00772 { 00773 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize) 00774 { 00775 size_t len=i-old_sz+1; 00776 uint8_t* data=&dummy[old_sz]; 00777 00778 for (size_t j=0; j<len; j++) 00779 dummy[j]=data[j]; 00780 00781 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f); 00782 i=0; 00783 old_sz=0; 00784 sz+=len; 00785 } 00786 00787 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00788 { 00789 00790 size_t len=i-old_sz; 00791 uint8_t* data=&dummy[old_sz]; 00792 00793 int32_t dims=0; 00794 for (size_t j=0; j<len; j++) 00795 { 00796 if (data[j]==':') 00797 dims++; 00798 } 00799 00800 if (dims<=0) 00801 { 00802 SG_ERROR("Error in line %d - number of" 00803 " dimensions is %d line is %d characters" 00804 " long\n line_content:'%.*s'\n", lines, 00805 dims, len, len, (const char*) data); 00806 } 00807 00808 SGSparseVectorEntry<ST>* feat=SG_MALLOC(SGSparseVectorEntry<ST>, dims); 00809 size_t j=0; 
00810 for (; j<len; j++) 00811 { 00812 if (data[j]==' ') 00813 { 00814 data[j]='\0'; 00815 00816 lab->set_label(lines, atof((const char*) data)); 00817 break; 00818 } 00819 } 00820 00821 int32_t d=0; 00822 j++; 00823 uint8_t* start=&data[j]; 00824 for (; j<len; j++) 00825 { 00826 if (data[j]==':') 00827 { 00828 data[j]='\0'; 00829 00830 feat[d].feat_index=(int32_t) atoi((const char*) start)-1; 00831 num_features=CMath::max(num_features, feat[d].feat_index+1); 00832 00833 j++; 00834 start=&data[j]; 00835 for (; j<len; j++) 00836 { 00837 if (data[j]==' ' || data[j]=='\n') 00838 { 00839 data[j]='\0'; 00840 feat[d].entry=(ST) atof((const char*) start); 00841 d++; 00842 break; 00843 } 00844 } 00845 00846 if (j==len) 00847 { 00848 data[j]='\0'; 00849 feat[dims-1].entry=(ST) atof((const char*) start); 00850 } 00851 00852 j++; 00853 start=&data[j]; 00854 } 00855 } 00856 00857 sparse_feature_matrix[lines].num_feat_entries=dims; 00858 sparse_feature_matrix[lines].features=feat; 00859 00860 old_sz=i+1; 00861 lines++; 00862 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t"); 00863 } 00864 } 00865 } 00866 SG_INFO("file successfully read\n"); 00867 fclose(f); 00868 } 00869 00870 SG_FREE(dummy); 00871 00872 if (do_sort_features) 00873 sort_features(); 00874 00875 return lab; 00876 } 00877 00878 template<class ST> void CSparseFeatures<ST>::sort_features() 00879 { 00880 if (m_subset_stack->has_subsets()) 00881 SG_ERROR("sort_features() not allowed with subset\n"); 00882 00883 ASSERT(get_num_preprocessors()==0); 00884 00885 if (!sparse_feature_matrix) 00886 SG_ERROR("Requires sparse feature matrix to be available in-memory\n"); 00887 00888 for (int32_t i=0; i<num_vectors; i++) 00889 { 00890 int32_t len=sparse_feature_matrix[i].num_feat_entries; 00891 00892 if (!len) 00893 continue; 00894 00895 SGSparseVectorEntry<ST>* sf_orig=sparse_feature_matrix[i].features; 00896 int32_t* feat_idx=SG_MALLOC(int32_t, len); 00897 int32_t* orig_idx=SG_MALLOC(int32_t, len); 00898 00899 for (int j=0; 
j<len; j++) 00900 { 00901 feat_idx[j]=sf_orig[j].feat_index; 00902 orig_idx[j]=j; 00903 } 00904 00905 CMath::qsort_index(feat_idx, orig_idx, len); 00906 00907 SGSparseVectorEntry<ST>* sf_new= SG_MALLOC(SGSparseVectorEntry<ST>, len); 00908 for (int j=0; j<len; j++) 00909 sf_new[j]=sf_orig[orig_idx[j]]; 00910 00911 sparse_feature_matrix[i].features=sf_new; 00912 00913 // sanity check 00914 for (int j=0; j<len-1; j++) 00915 ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index); 00916 00917 SG_FREE(orig_idx); 00918 SG_FREE(feat_idx); 00919 SG_FREE(sf_orig); 00920 } 00921 } 00922 00923 template<class ST> bool CSparseFeatures<ST>::write_svmlight_file(char* fname, 00924 CRegressionLabels* label) 00925 { 00926 if (m_subset_stack->has_subsets()) 00927 SG_ERROR("write_svmlight_file() not allowed with subset\n"); 00928 00929 ASSERT(label); 00930 int32_t num=label->get_num_labels(); 00931 ASSERT(num>0); 00932 ASSERT(num==num_vectors); 00933 00934 FILE* f=fopen(fname, "wb"); 00935 00936 if (f) 00937 { 00938 for (int32_t i=0; i<num; i++) 00939 { 00940 fprintf(f, "%d ", (int32_t) label->get_int_label(i)); 00941 00942 SGSparseVectorEntry<ST>* vec = sparse_feature_matrix[i].features; 00943 int32_t num_feat = sparse_feature_matrix[i].num_feat_entries; 00944 00945 for (int32_t j=0; j<num_feat; j++) 00946 { 00947 if (j<num_feat-1) 00948 fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry); 00949 else 00950 fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry); 00951 } 00952 } 00953 00954 fclose(f); 00955 return true; 00956 } 00957 return false; 00958 } 00959 00960 template<class ST> int32_t CSparseFeatures<ST>::get_dim_feature_space() const 00961 { 00962 return num_features; 00963 } 00964 00965 template<class ST> float64_t CSparseFeatures<ST>::dot(int32_t vec_idx1, 00966 CDotFeatures* df, int32_t vec_idx2) 00967 { 00968 ASSERT(df); 00969 ASSERT(df->get_feature_type() == get_feature_type()); 00970 ASSERT(df->get_feature_class() == 
get_feature_class()); 00971 CSparseFeatures<ST>* sf = (CSparseFeatures<ST>*) df; 00972 00973 SGSparseVector<ST> avec=get_sparse_feature_vector(vec_idx1); 00974 SGSparseVector<ST> bvec=sf->get_sparse_feature_vector(vec_idx2); 00975 00976 float64_t result=sparse_dot(1, avec.features, avec.num_feat_entries, 00977 bvec.features, bvec.num_feat_entries); 00978 00979 free_sparse_feature_vector(vec_idx1); 00980 sf->free_sparse_feature_vector(vec_idx2); 00981 00982 return result; 00983 } 00984 template<class ST> float64_t CSparseFeatures<ST>::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len) 00985 { 00986 ASSERT(vec2); 00987 if (vec2_len!=num_features) 00988 { 00989 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n", 00990 vec2_len, num_features); 00991 } 00992 float64_t result=0; 00993 00994 SGSparseVector<ST> sv=get_sparse_feature_vector(vec_idx1); 00995 00996 if (sv.features) 00997 { 00998 for (int32_t i=0; i<sv.num_feat_entries; i++) 00999 result+=vec2[sv.features[i].feat_index]*sv.features[i].entry; 01000 } 01001 01002 free_sparse_feature_vector(vec_idx1); 01003 01004 return result; 01005 } 01006 01007 template<class ST> void* CSparseFeatures<ST>::get_feature_iterator(int32_t vector_index) 01008 { 01009 if (vector_index>=get_num_vectors()) 01010 { 01011 SG_ERROR("Index out of bounds (number of vectors %d, you " 01012 "requested %d)\n", get_num_vectors(), vector_index); 01013 } 01014 01015 if (!sparse_feature_matrix) 01016 SG_ERROR("Requires a in-memory feature matrix\n"); 01017 01018 sparse_feature_iterator* it=SG_MALLOC(sparse_feature_iterator, 1); 01019 it->sv=get_sparse_feature_vector(vector_index); 01020 it->index=0; 01021 it->vector_index=vector_index; 01022 01023 return it; 01024 } 01025 01026 template<class ST> bool CSparseFeatures<ST>::get_next_feature(int32_t& index, float64_t& value, void* iterator) 01027 { 01028 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator; 01029 if (!it || 
it->index>=it->sv.num_feat_entries) 01030 return false; 01031 01032 int32_t i=it->index++; 01033 01034 index=it->sv.features[i].feat_index; 01035 value=(float64_t) it->sv.features[i].entry; 01036 01037 return true; 01038 } 01039 01040 template<class ST> void CSparseFeatures<ST>::free_feature_iterator(void* iterator) 01041 { 01042 if (!iterator) 01043 return; 01044 01045 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator; 01046 free_sparse_feature_vector(it->vector_index); 01047 SG_FREE(it); 01048 } 01049 01050 template<class ST> CFeatures* CSparseFeatures<ST>::copy_subset(SGVector<index_t> indices) 01051 { 01052 SGSparseMatrix<ST> matrix_copy=SGSparseMatrix<ST>(indices.vlen, 01053 get_dim_feature_space()); 01054 01055 for (index_t i=0; i<indices.vlen; ++i) 01056 { 01057 /* index to copy */ 01058 index_t index=indices.vector[i]; 01059 index_t real_index=m_subset_stack->subset_idx_conversion(index); 01060 01061 /* copy sparse vector */ 01062 SGSparseVector<ST> current=get_sparse_feature_vector(real_index); 01063 matrix_copy.sparse_matrix[i]=current; 01064 01065 free_sparse_feature_vector(index); 01066 } 01067 01068 CFeatures* result=new CSparseFeatures<ST>(matrix_copy); 01069 SG_REF(result); 01070 return result; 01071 } 01072 01073 template<class ST> SGSparseVectorEntry<ST>* CSparseFeatures<ST>::compute_sparse_feature_vector(int32_t num, 01074 int32_t& len, SGSparseVectorEntry<ST>* target) 01075 { 01076 SG_NOTIMPLEMENTED; 01077 01078 len=0; 01079 return NULL; 01080 } 01081 01082 template<class ST> void CSparseFeatures<ST>::init() 01083 { 01084 set_generic<ST>(); 01085 01086 m_parameters->add_vector(&sparse_feature_matrix, &num_vectors, 01087 "sparse_feature_matrix", 01088 "Array of sparse vectors."); 01089 m_parameters->add(&num_features, "num_features", 01090 "Total number of features."); 01091 } 01092 01093 #define GET_FEATURE_TYPE(sg_type, f_type) \ 01094 template<> EFeatureType CSparseFeatures<sg_type>::get_feature_type() const \ 01095 { \ 01096 
return f_type; \ 01097 } 01098 GET_FEATURE_TYPE(bool, F_BOOL) 01099 GET_FEATURE_TYPE(char, F_CHAR) 01100 GET_FEATURE_TYPE(uint8_t, F_BYTE) 01101 GET_FEATURE_TYPE(int8_t, F_BYTE) 01102 GET_FEATURE_TYPE(int16_t, F_SHORT) 01103 GET_FEATURE_TYPE(uint16_t, F_WORD) 01104 GET_FEATURE_TYPE(int32_t, F_INT) 01105 GET_FEATURE_TYPE(uint32_t, F_UINT) 01106 GET_FEATURE_TYPE(int64_t, F_LONG) 01107 GET_FEATURE_TYPE(uint64_t, F_ULONG) 01108 GET_FEATURE_TYPE(float32_t, F_SHORTREAL) 01109 GET_FEATURE_TYPE(float64_t, F_DREAL) 01110 GET_FEATURE_TYPE(floatmax_t, F_LONGREAL) 01111 #undef GET_FEATURE_TYPE 01112 01113 #define LOAD(fname, sg_type) \ 01114 template<> void CSparseFeatures<sg_type>::load(CFile* loader) \ 01115 { \ 01116 remove_all_subsets(); \ 01117 SG_SET_LOCALE_C; \ 01118 ASSERT(loader); \ 01119 SGSparseVector<sg_type>* matrix=NULL; \ 01120 int32_t num_feat=0; \ 01121 int32_t num_vec=0; \ 01122 loader->fname(matrix, num_feat, num_vec); \ 01123 set_sparse_feature_matrix(SGSparseMatrix<sg_type>(matrix, num_feat, num_vec)); \ 01124 SG_RESET_LOCALE; \ 01125 } 01126 LOAD(get_sparse_matrix, bool) 01127 LOAD(get_sparse_matrix, char) 01128 LOAD(get_sparse_matrix, uint8_t) 01129 LOAD(get_int8_sparsematrix, int8_t) 01130 LOAD(get_sparse_matrix, int16_t) 01131 LOAD(get_sparse_matrix, uint16_t) 01132 LOAD(get_sparse_matrix, int32_t) 01133 LOAD(get_uint_sparsematrix, uint32_t) 01134 LOAD(get_long_sparsematrix, int64_t) 01135 LOAD(get_ulong_sparsematrix, uint64_t) 01136 LOAD(get_sparse_matrix, float32_t) 01137 LOAD(get_sparse_matrix, float64_t) 01138 LOAD(get_longreal_sparsematrix, floatmax_t) 01139 #undef LOAD 01140 01141 #define WRITE(fname, sg_type) \ 01142 template<> void CSparseFeatures<sg_type>::save(CFile* writer) \ 01143 { \ 01144 if (m_subset_stack->has_subsets()) \ 01145 SG_ERROR("save() not allowed with subset\n"); \ 01146 SG_SET_LOCALE_C; \ 01147 ASSERT(writer); \ 01148 writer->fname(sparse_feature_matrix, num_features, num_vectors); \ 01149 SG_RESET_LOCALE; \ 01150 } 01151 
// instantiate the save() specialization for every supported element type
WRITE(set_sparse_matrix, bool)
WRITE(set_sparse_matrix, char)
WRITE(set_sparse_matrix, uint8_t)
WRITE(set_int8_sparsematrix, int8_t)
WRITE(set_sparse_matrix, int16_t)
WRITE(set_sparse_matrix, uint16_t)
WRITE(set_sparse_matrix, int32_t)
WRITE(set_uint_sparsematrix, uint32_t)
WRITE(set_long_sparsematrix, int64_t)
WRITE(set_ulong_sparsematrix, uint64_t)
WRITE(set_sparse_matrix, float32_t)
WRITE(set_sparse_matrix, float64_t)
WRITE(set_longreal_sparsematrix, floatmax_t)
#undef WRITE

// explicit template instantiations for all supported element types
template class CSparseFeatures<bool>;
template class CSparseFeatures<char>;
template class CSparseFeatures<int8_t>;
template class CSparseFeatures<uint8_t>;
template class CSparseFeatures<int16_t>;
template class CSparseFeatures<uint16_t>;
template class CSparseFeatures<int32_t>;
template class CSparseFeatures<uint32_t>;
template class CSparseFeatures<int64_t>;
template class CSparseFeatures<uint64_t>;
template class CSparseFeatures<float32_t>;
template class CSparseFeatures<float64_t>;
template class CSparseFeatures<floatmax_t>;
}