SHOGUN  v2.0.0
SparseFeatures.cpp
00001 #include <shogun/features/SparseFeatures.h>
00002 #include <shogun/preprocessor/SparsePreprocessor.h>
00003 #include <shogun/mathematics/Math.h>
00004 #include <shogun/lib/DataType.h>
00005 #include <shogun/labels/RegressionLabels.h>
00006 #include <shogun/io/SGIO.h>
00007 
00008 #include <string.h>
00009 #include <stdlib.h>
00010 
00011 namespace shogun
00012 {
00013 
00014 template<class ST> CSparseFeatures<ST>::CSparseFeatures(int32_t size)
00015 : CDotFeatures(size), num_vectors(0), num_features(0),
00016     sparse_feature_matrix(NULL), feature_cache(NULL)
00017 {
00018     init();
00019 }
00020 
00021 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGSparseVector<ST>* src,
00022         int32_t num_feat, int32_t num_vec, bool copy)
00023 : CDotFeatures(0), num_vectors(num_vec), num_features(num_feat),
00024     sparse_feature_matrix(NULL), feature_cache(NULL)
00025 {
00026     init();
00027 
00028     sparse_feature_matrix = src;
00029     //SG_MALLOC(SGSparseVector<ST>, num_vec);
00030     //for (int32_t i=0; i< num_vec; i++)
00031     //{
00032     //  new (&sparse_feature_matrix[i]) SGSparseVector<ST>();
00033     //  sparse_feature_matrix[i] = src[i];
00034     //}
00035 }
00036 
00037 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGSparseMatrix<ST> sparse)
00038 : CDotFeatures(0), num_vectors(0), num_features(0),
00039     sparse_feature_matrix(NULL), feature_cache(NULL)
00040 {
00041     init();
00042 
00043     set_sparse_feature_matrix(sparse);
00044 }
00045 
00046 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGMatrix<ST> dense)
00047 : CDotFeatures(0), num_vectors(0), num_features(0),
00048     sparse_feature_matrix(NULL), feature_cache(NULL)
00049 {
00050     init();
00051 
00052     set_full_feature_matrix(dense);
00053 }
00054 
00055 template<class ST> CSparseFeatures<ST>::CSparseFeatures(const CSparseFeatures & orig)
00056 : CDotFeatures(orig), num_vectors(orig.num_vectors),
00057     num_features(orig.num_features),
00058     sparse_feature_matrix(NULL),
00059     feature_cache(orig.feature_cache)
00060 {
00061     init();
00062 
00063     if (orig.sparse_feature_matrix)
00064     {
00065         // deep-copy orig's matrix into freshly allocated storage
00066         sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
00067         memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(SGSparseVector<ST>)*num_vectors);
00068         for (int32_t i=0; i< num_vectors; i++)
00069         {
00070             sparse_feature_matrix[i].features=SG_MALLOC(SGSparseVectorEntry<ST>, sparse_feature_matrix[i].num_feat_entries);
00071             memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(SGSparseVectorEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00072 
00073         }
00074     }
00075 
00076     m_subset_stack=orig.m_subset_stack;
00077     SG_REF(m_subset_stack);
00078 }
00079 template<class ST> CSparseFeatures<ST>::CSparseFeatures(CFile* loader)
00080 : CDotFeatures(loader), num_vectors(0), num_features(0),
00081     sparse_feature_matrix(NULL), feature_cache(NULL)
00082 {
00083     init();
00084 
00085     load(loader);
00086 }
00087 
00088 template<class ST> CSparseFeatures<ST>::~CSparseFeatures()
00089 {
00090     free_sparse_features();
00091 }
00092 template<class ST> void CSparseFeatures<ST>::free_sparse_feature_matrix()
00093 {
00094     for (int32_t i=0; i<num_vectors; i++)
00095         (&sparse_feature_matrix[i])->~SGSparseVector();
00096 
00097     SG_FREE(sparse_feature_matrix);
00098     num_vectors=0;
00099     num_features=0;
00100     remove_all_subsets();
00101 }
00102 template<class ST> void CSparseFeatures<ST>::free_sparse_features()
00103 {
00104     free_sparse_feature_matrix();
00105     delete feature_cache;
00106     feature_cache = NULL;
00107 }
00108 template<class ST> CFeatures* CSparseFeatures<ST>::duplicate() const
00109 {
00110     return new CSparseFeatures<ST>(*this);
00111 }
00112 
00113 template<class ST> ST CSparseFeatures<ST>::get_feature(int32_t num, int32_t index)
00114 {
00115     ASSERT(index>=0 && index<num_features);
00116     ASSERT(num>=0 && num<get_num_vectors());
00117 
00118     int32_t i;
00119     SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00120     ST ret = 0 ;
00121 
00122     if (sv.features)
00123     {
00124         for (i=0; i<sv.num_feat_entries; i++)
00125             if (sv.features[i].feat_index==index)
00126                 ret+=sv.features[i].entry ;
00127     }
00128 
00129     free_sparse_feature_vector(num);
00130 
00131     return ret ;
00132 }
00133 
00134 template<class ST> ST* CSparseFeatures<ST>::get_full_feature_vector(int32_t num, int32_t& len)
00135 {
00136     int32_t i;
00137     len=0;
00138     SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00139     ST* fv=NULL;
00140 
00141     if (sv.features)
00142     {
00143         len=num_features;
00144         fv=SG_MALLOC(ST, num_features);
00145 
00146         for (i=0; i<num_features; i++)
00147             fv[i]=0;
00148 
00149         for (i=0; i<sv.num_feat_entries; i++)
00150             fv[sv.features[i].feat_index]= sv.features[i].entry;
00151     }
00152 
00153     free_sparse_feature_vector(num);
00154 
00155     return fv;
00156 }
00157 
00158 template<class ST> SGVector<ST> CSparseFeatures<ST>::get_full_feature_vector(int32_t num)
00159 {
00160     if (num>=get_num_vectors())
00161     {
00162         SG_ERROR("Index out of bounds (number of vectors %d, you "
00163                 "requested %d)\n", get_num_vectors(), num);
00164     }
00165 
00166     SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00167 
00168     SGVector<ST> dense;
00169 
00170     if (sv.features)
00171     {
00172         dense=SGVector<ST>(num_features);
00173         dense.zero();
00174 
00175         for (int32_t i=0; i<sv.num_feat_entries; i++)
00176             dense.vector[sv.features[i].feat_index]= sv.features[i].entry;
00177     }
00178 
00179     free_sparse_feature_vector(num);
00180 
00181     return dense;
00182 }
00183 
00184 template<class ST> int32_t CSparseFeatures<ST>::get_nnz_features_for_vector(int32_t num)
00185 {
00186     SGSparseVector<ST> sv = get_sparse_feature_vector(num);
00187     int32_t len=sv.num_feat_entries;
00188     free_sparse_feature_vector(num);
00189     return len;
00190 }
00191 
00192 template<class ST> SGSparseVector<ST> CSparseFeatures<ST>::get_sparse_feature_vector(int32_t num)
00193 {
00194     ASSERT(num<get_num_vectors());
00195 
00196     index_t real_num=m_subset_stack->subset_idx_conversion(num);
00197 
00198     SGSparseVector<ST> result;
00199 
00200     if (sparse_feature_matrix)
00201     {
00202         return sparse_feature_matrix[real_num];
00203     }
00204     else
00205     {
00206         if (feature_cache)
00207         {
00208             result.features=feature_cache->lock_entry(num);
00209 
00210             if (result.features)
00211                 return result;
00212             else
00213             {
00214                 result.features=feature_cache->set_entry(num);
00215             }
00216         }
00217 
00218         //if (!result.features)
00219         //  result.do_free=true;
00220 
00221         result.features=compute_sparse_feature_vector(num,
00222             result.num_feat_entries, result.features);
00223 
00224 
00225         if (get_num_preprocessors())
00226         {
00227             int32_t tmp_len=result.num_feat_entries;
00228             SGSparseVectorEntry<ST>* tmp_feat_before=result.features;
00229             SGSparseVectorEntry<ST>* tmp_feat_after = NULL;
00230 
00231             for (int32_t i=0; i<get_num_preprocessors(); i++)
00232             {
00233                 //tmp_feat_after=((CSparsePreprocessor<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len);
00234 
00235                 if (i!=0)   // delete feature vector, except for the first one, i.e., feat
00236                     SG_FREE(tmp_feat_before);
00237                 tmp_feat_before=tmp_feat_after;
00238             }
00239 
00240             memcpy(result.features, tmp_feat_after,
00241                     sizeof(SGSparseVectorEntry<ST>)*tmp_len);
00242 
00243             SG_FREE(tmp_feat_after);
00244             result.num_feat_entries=tmp_len ;
00245             SG_DEBUG( "len: %d len2: %d\n", result.num_feat_entries, num_features);
00246         }
00247         return result ;
00248     }
00249 }
00250 
00251 template<class ST> ST CSparseFeatures<ST>::sparse_dot(ST alpha, SGSparseVectorEntry<ST>* avec, int32_t alen, SGSparseVectorEntry<ST>* bvec, int32_t blen)
00252 {
00253     ST result=0;
00254 
00255     // result remains zero when one of the vectors is nonexistent
00256     if (avec && bvec)
00257     {
00258         if (alen<=blen)
00259         {
00260             int32_t j=0;
00261             for (int32_t i=0; i<alen; i++)
00262             {
00263                 int32_t a_feat_idx=avec[i].feat_index;
00264 
00265                 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00266                     j++;
00267 
00268                 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00269                 {
00270                     result+= avec[i].entry * bvec[j].entry;
00271                     j++;
00272                 }
00273             }
00274         }
00275         else
00276         {
00277             int32_t j=0;
00278             for (int32_t i=0; i<blen; i++)
00279             {
00280                 int32_t b_feat_idx=bvec[i].feat_index;
00281 
00282                 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00283                     j++;
00284 
00285                 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00286                 {
00287                     result+= bvec[i].entry * avec[j].entry;
00288                     j++;
00289                 }
00290             }
00291         }
00292 
00293         result*=alpha;
00294     }
00295 
00296     return result;
00297 }
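// Example (illustrative): sparse_dot() assumes both entry arrays are sorted by
// feat_index (see sort_features()) and merges them in O(alen+blen) time. For
// avec={(1,2),(4,3)}, bvec={(4,5),(7,1)} and alpha=2 the only matching index is 4,
// so the result is 2*(3*5)=30.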
00298 
00299 template<class ST> ST CSparseFeatures<ST>::dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
00300 {
00301     ASSERT(vec);
00302     ASSERT(dim==num_features);
00303     ST result=b;
00304 
00305     SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00306 
00307     if (sv.features)
00308     {
00309         for (int32_t i=0; i<sv.num_feat_entries; i++)
00310         {
00311             result+=alpha*vec[sv.features[i].feat_index]
00312                 *sv.features[i].entry;
00313         }
00314     }
00315 
00316     free_sparse_feature_vector(num);
00317     return result;
00318 }
00319 
00320 template<class ST> void CSparseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val)
00321 {
00322     ASSERT(vec);
00323     if (dim!=num_features)
00324     {
00325         SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00326                 dim, num_features);
00327     }
00328 
00329     SGSparseVector<ST> sv=get_sparse_feature_vector(num);
00330 
00331     if (sv.features)
00332     {
00333         if (abs_val)
00334         {
00335             for (int32_t i=0; i<sv.num_feat_entries; i++)
00336             {
00337                 vec[sv.features[i].feat_index]+=alpha
00338                     *CMath::abs(sv.features[i].entry);
00339             }
00340         }
00341         else
00342         {
00343             for (int32_t i=0; i<sv.num_feat_entries; i++)
00344             {
00345                 vec[sv.features[i].feat_index]+=alpha
00346                         *sv.features[i].entry;
00347             }
00348         }
00349     }
00350 
00351     free_sparse_feature_vector(num);
00352 }
00353 
00354 template<class ST> void CSparseFeatures<ST>::free_sparse_feature_vector(int32_t num)
00355 {
00356     if (feature_cache)
00357         feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num));
00358 
00359     //vec.free_vector();
00360 }
00361 
00362 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00363 {
00364     if (m_subset_stack->has_subsets())
00365         SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");
00366 
00367     num_feat=num_features;
00368     num_vec=num_vectors;
00369 
00370     return sparse_feature_matrix;
00371 }
00372 
00373 template<class ST> SGSparseMatrix<ST> CSparseFeatures<ST>::get_sparse_feature_matrix()
00374 {
00375     if (m_subset_stack->has_subsets())
00376         SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");
00377 
00378     SGSparseMatrix<ST> sm=SGSparseMatrix<ST>(NULL, 0, 0, false);
00379     sm.sparse_matrix=get_sparse_feature_matrix(sm.num_features, sm.num_vectors);
00380     return sm;
00381 }
00382 
00383 template<class ST> CSparseFeatures<ST>* CSparseFeatures<ST>::get_transposed()
00384 {
00385     int32_t num_feat;
00386     int32_t num_vec;
00387     SGSparseVector<ST>* s=get_transposed(num_feat, num_vec);
00388     //SG_PRINT("num_feat = %d , num_vec = %d \n", num_feat, num_vec);
00389     return new CSparseFeatures<ST>(s, num_feat, num_vec);
00390 }
00391 
00392 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
00393 {
00394     num_feat=get_num_vectors();
00395     num_vec=num_features;
00396     //SG_PRINT("get transposed num_feat = %d , num_vec = %d \n", num_feat, num_vec);
00397 
00398     int32_t* hist=SG_MALLOC(int32_t, num_features);
00399     memset(hist, 0, sizeof(int32_t)*num_features);
00400 
00401     // count the number of entries each transposed feature vector will have
00402     for (int32_t v=0; v<num_feat; v++)
00403     {
00404         SGSparseVector<ST> sv=get_sparse_feature_vector(v);
00405 
00406         for (int32_t i=0; i<sv.num_feat_entries; i++)
00407             hist[sv.features[i].feat_index]++;
00408     }
00409 
00410     // allocate room for future feature vectors
00411     SGSparseVector<ST>* sfm=SG_MALLOC(SGSparseVector<ST>, num_vec);
00412     for (int32_t v=0; v<num_vec; v++)
00413         new (&sfm[v]) SGSparseVector<ST>(hist[v]);
00414 
00415     // fill future feature vectors with content
00416     memset(hist,0,sizeof(int32_t)*num_features);
00417     for (int32_t v=0; v<num_feat; v++)
00418     {
00419         SGSparseVector<ST> sv=get_sparse_feature_vector(v);
00420 
00421         for (int32_t i=0; i<sv.num_feat_entries; i++)
00422         {
00423             int32_t vidx=sv.features[i].feat_index;
00424             int32_t fidx=v;
00425             sfm[vidx].features[hist[vidx]].feat_index=fidx;
00426             sfm[vidx].features[hist[vidx]].entry=sv.features[i].entry;
00427             hist[vidx]++;
00428         }
00429 
00430     }
00431 
00432     SG_FREE(hist);
00433     return sfm;
00434 }
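// Note: the transpose above is built in two passes over the data. The first pass
// histograms how many entries fall on each feature index (the length of each
// transposed vector); the second pass scatters every (vector, feature, value)
// triple into its transposed position, reusing hist[] as a write cursor.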
00435 
00436 template<class ST> void CSparseFeatures<ST>::set_sparse_feature_matrix(SGSparseMatrix<ST> sm)
00437 {
00438     if (m_subset_stack->has_subsets())
00439         SG_ERROR("set_sparse_feature_matrix() not allowed with subset\n");
00440 
00441     SGSparseVector<ST>* sparse_matrix = SG_MALLOC(SGSparseVector<ST>, sm.num_vectors);
00442     for (int32_t i=0; i<sm.num_vectors; i++)
00443     {
00444         new (&sparse_matrix[i]) SGSparseVector<ST>();
00445         sparse_matrix[i] = sm[i];
00446     }
00447 
00448     sparse_feature_matrix=sparse_matrix;
00449     num_features=sm.num_features;
00450     num_vectors=sm.num_vectors;
00451 }
00452 
00453 template<class ST> SGMatrix<ST> CSparseFeatures<ST>::get_full_feature_matrix()
00454 {
00455     SGMatrix<ST> full(num_features, get_num_vectors());
00456     full.zero();
00457 
00458     SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00459 
00460     for (int32_t v=0; v<full.num_cols; v++)
00461     {
00462         int32_t idx=m_subset_stack->subset_idx_conversion(v);
00463         SGSparseVector<ST> current=sparse_feature_matrix[idx];
00464 
00465         for (int32_t f=0; f<current.num_feat_entries; f++)
00466         {
00467             int64_t offs=int64_t(v)*num_features
00468                     +current.features[f].feat_index;
00469 
00470             full.matrix[offs]=current.features[f].entry;
00471         }
00472     }
00473 
00474     return full;
00475 }
00476 
00477 template<class ST> bool CSparseFeatures<ST>::set_full_feature_matrix(SGMatrix<ST> full)
00478 {
00479     remove_all_subsets();
00480 
00481     ST* src=full.matrix;
00482     int32_t num_feat=full.num_rows;
00483     int32_t num_vec=full.num_cols;
00484 
00485     free_sparse_feature_matrix();
00486     bool result=true;
00487     num_features=num_feat;
00488     num_vectors=num_vec;
00489 
00490     SG_INFO("converting dense feature matrix to sparse one\n");
00491     int32_t* num_feat_entries=SG_MALLOC(int, num_vectors);
00492 
00493     if (num_feat_entries)
00494     {
00495         int64_t num_total_entries=0;
00496 
00497         // count nr of non sparse features
00498         for (int32_t i=0; i< num_vec; i++)
00499         {
00500             num_feat_entries[i]=0;
00501             for (int32_t j=0; j< num_feat; j++)
00502             {
00503                 if (src[i*((int64_t) num_feat) + j] != 0)
00504                     num_feat_entries[i]++;
00505             }
00506         }
00507 
00508         if (num_vec>0)
00509         {
00510             sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vec);
00511 
00512             if (sparse_feature_matrix)
00513             {
00514                 for (int32_t i=0; i< num_vec; i++)
00515                 {
00516                     new(&sparse_feature_matrix[i]) SGSparseVector<ST>();
00517                     sparse_feature_matrix[i] = SGSparseVector<ST>(num_feat_entries[i]);
00518                     int32_t sparse_feat_idx=0;
00519 
00520                     for (int32_t j=0; j< num_feat; j++)
00521                     {
00522                         int64_t pos= i*((int64_t) num_feat) + j;
00523 
00524                         if (src[pos] != 0)
00525                         {
00526                             sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
00527                             sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00528                             sparse_feat_idx++;
00529                             num_total_entries++;
00530                         }
00531                     }
00532                 }
00533             }
00534             else
00535             {
00536                 SG_ERROR( "allocation of sparse feature matrix failed\n");
00537                 result=false;
00538             }
00539 
00540             SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00541                     num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec));
00542         }
00543         else
00544         {
00545             SG_ERROR( "huh ? zero size matrix given ?\n");
00546             result=false;
00547         }
00548     }
00549     SG_FREE(num_feat_entries);
00550     return result;
00551 }
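// Usage sketch (illustrative, using only the SGMatrix/CSparseFeatures API above):
//   SGMatrix<float64_t> m(2, 2);       // 2 features x 2 vectors, column-major
//   m.matrix[0]=1; m.matrix[1]=0;      // vector 0: entries (1, 0)
//   m.matrix[2]=0; m.matrix[3]=3;      // vector 1: entries (0, 3)
//   CSparseFeatures<float64_t>* f=new CSparseFeatures<float64_t>(m);
//   // f now holds 2 sparse vectors with a single non-zero entry each
//   SG_UNREF(f);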
00552 
00553 template<class ST> bool CSparseFeatures<ST>::apply_preprocessor(bool force_preprocessing)
00554 {
00555     SG_INFO( "force: %d\n", force_preprocessing);
00556 
00557     if ( sparse_feature_matrix && get_num_preprocessors() )
00558     {
00559         for (int32_t i=0; i<get_num_preprocessors(); i++)
00560         {
00561             if ( (!is_preprocessed(i) || force_preprocessing) )
00562             {
00563                 set_preprocessed(i);
00564                 SG_INFO( "preprocessing using preproc %s\n", get_preprocessor(i)->get_name());
00565                 if (((CSparsePreprocessor<ST>*) get_preprocessor(i))->apply_to_sparse_feature_matrix(this) == NULL)
00566                     return false;
00567             }
00568         }
00569 
00570         return true;
00571     }
00572     else
00573     {
00574         SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00575         return false;
00576     }
00577 }
00578 
00579 template<class ST> int32_t CSparseFeatures<ST>::get_size() const
00580 {
00581     return sizeof(ST);
00582 }
00583 
00584 template<class ST> bool CSparseFeatures<ST>::obtain_from_simple(CDenseFeatures<ST>* sf)
00585 {
00586     SGMatrix<ST> fm=sf->get_feature_matrix();
00587     ASSERT(fm.matrix && fm.num_cols>0 && fm.num_rows>0);
00588 
00589     return set_full_feature_matrix(fm);
00590 }
00591 
00592 template<class ST> int32_t  CSparseFeatures<ST>::get_num_vectors() const
00593 {
00594     return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
00595 }
00596 
00597 template<class ST> int32_t  CSparseFeatures<ST>::get_num_features()
00598 {
00599     return num_features;
00600 }
00601 
00602 template<class ST> int32_t CSparseFeatures<ST>::set_num_features(int32_t num)
00603 {
00604     int32_t n=num_features;
00605     ASSERT(n<=num);
00606     num_features=num;
00607     return num_features;
00608 }
00609 
00610 template<class ST> EFeatureClass CSparseFeatures<ST>::get_feature_class() const
00611 {
00612     return C_SPARSE;
00613 }
00614 
00615 template<class ST> void CSparseFeatures<ST>::free_feature_vector(int32_t num)
00616 {
00617     if (feature_cache)
00618         feature_cache->unlock_entry(m_subset_stack->subset_idx_conversion(num));
00619 
00620     //vec.free_vector();
00621 }
00622 
00623 template<class ST> int64_t CSparseFeatures<ST>::get_num_nonzero_entries()
00624 {
00625     int64_t num=0;
00626     index_t num_vec=get_num_vectors();
00627     for (int32_t i=0; i<num_vec; i++)
00628         num+=sparse_feature_matrix[m_subset_stack->subset_idx_conversion(i)].num_feat_entries;
00629 
00630     return num;
00631 }
00632 
00633 template<class ST> float64_t* CSparseFeatures<ST>::compute_squared(float64_t* sq)
00634 {
00635     ASSERT(sq);
00636 
00637     index_t num_vec=get_num_vectors();
00638     for (int32_t i=0; i<num_vec; i++)
00639     {
00640         sq[i]=0;
00641         SGSparseVector<ST> vec=get_sparse_feature_vector(i);
00642 
00643         for (int32_t j=0; j<vec.num_feat_entries; j++)
00644             sq[i]+=vec.features[j].entry*vec.features[j].entry;
00645 
00646         free_feature_vector(i);
00647     }
00648 
00649     return sq;
00650 }
00651 
00652 template<class ST> float64_t CSparseFeatures<ST>::compute_squared_norm(
00653         CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a,
00654         CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
00655 {
00656     int32_t i,j;
00657     ASSERT(lhs);
00658     ASSERT(rhs);
00659 
00660     SGSparseVector<float64_t> avec=lhs->get_sparse_feature_vector(idx_a);
00661     SGSparseVector<float64_t> bvec=rhs->get_sparse_feature_vector(idx_b);
00662     ASSERT(avec.features);
00663     ASSERT(bvec.features);
00664 
00665     float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];
00666 
00667     if (avec.num_feat_entries<=bvec.num_feat_entries)
00668     {
00669         j=0;
00670         for (i=0; i<avec.num_feat_entries; i++)
00671         {
00672             int32_t a_feat_idx=avec.features[i].feat_index;
00673 
00674             while ((j<bvec.num_feat_entries)
00675                     &&(bvec.features[j].feat_index<a_feat_idx))
00676                 j++;
00677 
00678             if ((j<bvec.num_feat_entries)
00679                     &&(bvec.features[j].feat_index==a_feat_idx))
00680             {
00681                 result-=2*(avec.features[i].entry*bvec.features[j].entry);
00682                 j++;
00683             }
00684         }
00685     }
00686     else
00687     {
00688         j=0;
00689         for (i=0; i<bvec.num_feat_entries; i++)
00690         {
00691             int32_t b_feat_idx=bvec.features[i].feat_index;
00692 
00693             while ((j<avec.num_feat_entries)
00694                     &&(avec.features[j].feat_index<b_feat_idx))
00695                 j++;
00696 
00697             if ((j<avec.num_feat_entries)
00698                     &&(avec.features[j].feat_index==b_feat_idx))
00699             {
00700                 result-=2*(bvec.features[i].entry*avec.features[j].entry);
00701                 j++;
00702             }
00703         }
00704     }
00705 
00706     ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(idx_a);
00707     ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(idx_b);
00708 
00709     return CMath::abs(result);
00710 }
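// Note: compute_squared_norm() evaluates ||a-b||^2 = sq_lhs[idx_a] + sq_rhs[idx_b]
// - 2*<a,b>, with the squared norms sq_lhs/sq_rhs precomputed via compute_squared()
// and the cross term accumulated by the same sorted-merge scheme as sparse_dot().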
00711 
00712 template<class ST> CRegressionLabels* CSparseFeatures<ST>::load_svmlight_file(char* fname,
00713         bool do_sort_features)
00714 {
00715     remove_all_subsets();
00716 
00717     CRegressionLabels* lab=NULL;
00718 
00719     size_t blocksize=1024*1024;
00720     size_t required_blocksize=blocksize;
00721     uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00722     FILE* f=fopen(fname, "r");
00723 
00724     if (f)
00725     {
00726         free_sparse_feature_matrix();
00727         num_vectors=0;
00728         num_features=0;
00729 
00730         SG_INFO("counting line numbers in file %s\n", fname);
00731         size_t sz=blocksize;
00732         size_t block_offs=0;
00733         size_t old_block_offs=0;
00734         fseek(f, 0, SEEK_END);
00735         size_t fsize=ftell(f);
00736         rewind(f);
00737 
00738         while (sz == blocksize)
00739         {
00740             sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00741             for (size_t i=0; i<sz; i++)
00742             {
00743                 block_offs++;
00744                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00745                 {
00746                     num_vectors++;
00747                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
00748                     old_block_offs=block_offs;
00749                 }
00750             }
00751             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00752         }
00753 
00754         SG_INFO("found %d feature vectors\n", num_vectors);
00755         SG_FREE(dummy);
00756         blocksize=required_blocksize;
00757         dummy = SG_MALLOC(uint8_t, blocksize+1); //allow setting of '\0' at EOL
00758 
00759         lab=new CRegressionLabels(num_vectors);
00760         sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
00761         for (int32_t i=0; i<num_vectors; i++)
00762             new (&sparse_feature_matrix[i]) SGSparseVector<ST>();
00763         rewind(f);
00764         sz=blocksize;
00765         int32_t lines=0;
00766         while (sz == blocksize)
00767         {
00768             sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00769 
00770             size_t old_sz=0;
00771             for (size_t i=0; i<sz; i++)
00772             {
00773                 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
00774                 {
00775                     size_t len=i-old_sz+1;
00776                     uint8_t* data=&dummy[old_sz];
00777 
00778                     for (size_t j=0; j<len; j++)
00779                         dummy[j]=data[j];
00780 
00781                     sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
00782                     i=0;
00783                     old_sz=0;
00784                     sz+=len;
00785                 }
00786 
00787                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00788                 {
00789 
00790                     size_t len=i-old_sz;
00791                     uint8_t* data=&dummy[old_sz];
00792 
00793                     int32_t dims=0;
00794                     for (size_t j=0; j<len; j++)
00795                     {
00796                         if (data[j]==':')
00797                             dims++;
00798                     }
00799 
00800                     if (dims<=0)
00801                     {
00802                         SG_ERROR("Error in line %d - number of"
00803                                 " dimensions is %d line is %d characters"
00804                                 " long\n line_content:'%.*s'\n", lines,
00805                                 dims, len, len, (const char*) data);
00806                     }
00807 
00808                     SGSparseVectorEntry<ST>* feat=SG_MALLOC(SGSparseVectorEntry<ST>, dims);
00809                     size_t j=0;
00810                     for (; j<len; j++)
00811                     {
00812                         if (data[j]==' ')
00813                         {
00814                             data[j]='\0';
00815 
00816                             lab->set_label(lines, atof((const char*) data));
00817                             break;
00818                         }
00819                     }
00820 
00821                     int32_t d=0;
00822                     j++;
00823                     uint8_t* start=&data[j];
00824                     for (; j<len; j++)
00825                     {
00826                         if (data[j]==':')
00827                         {
00828                             data[j]='\0';
00829 
00830                             feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
00831                             num_features=CMath::max(num_features, feat[d].feat_index+1);
00832 
00833                             j++;
00834                             start=&data[j];
00835                             for (; j<len; j++)
00836                             {
00837                                 if (data[j]==' ' || data[j]=='\n')
00838                                 {
00839                                     data[j]='\0';
00840                                     feat[d].entry=(ST) atof((const char*) start);
00841                                     d++;
00842                                     break;
00843                                 }
00844                             }
00845 
00846                             if (j==len)
00847                             {
00848                                 data[j]='\0';
00849                                 feat[dims-1].entry=(ST) atof((const char*) start);
00850                             }
00851 
00852                             j++;
00853                             start=&data[j];
00854                         }
00855                     }
00856 
00857                     sparse_feature_matrix[lines].num_feat_entries=dims;
00858                     sparse_feature_matrix[lines].features=feat;
00859 
00860                     old_sz=i+1;
00861                     lines++;
00862                     SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00863                 }
00864             }
00865         }
00866         SG_INFO("file successfully read\n");
00867         fclose(f);
00868     }
00869 
00870     SG_FREE(dummy);
00871 
00872     if (do_sort_features)
00873         sort_features();
00874 
00875     return lab;
00876 }
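// Input format (svmlight), as parsed above: one vector per line,
//   <label> <index>:<value> [<index>:<value> ...]
// with 1-based feature indices in the file. For example the line
//   -1 1:0.43 12:0.12
// yields label -1 and non-zero entries at feature indices 0 and 11.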
00877 
00878 template<class ST> void CSparseFeatures<ST>::sort_features()
00879 {
00880     if (m_subset_stack->has_subsets())
00881         SG_ERROR("sort_features() not allowed with subset\n");
00882 
00883     ASSERT(get_num_preprocessors()==0);
00884 
00885     if (!sparse_feature_matrix)
00886         SG_ERROR("Requires sparse feature matrix to be available in-memory\n");
00887 
00888     for (int32_t i=0; i<num_vectors; i++)
00889     {
00890         int32_t len=sparse_feature_matrix[i].num_feat_entries;
00891 
00892         if (!len)
00893             continue;
00894 
00895         SGSparseVectorEntry<ST>* sf_orig=sparse_feature_matrix[i].features;
00896         int32_t* feat_idx=SG_MALLOC(int32_t, len);
00897         int32_t* orig_idx=SG_MALLOC(int32_t, len);
00898 
00899         for (int j=0; j<len; j++)
00900         {
00901             feat_idx[j]=sf_orig[j].feat_index;
00902             orig_idx[j]=j;
00903         }
00904 
00905         CMath::qsort_index(feat_idx, orig_idx, len);
00906 
00907         SGSparseVectorEntry<ST>* sf_new= SG_MALLOC(SGSparseVectorEntry<ST>, len);
00908         for (int j=0; j<len; j++)
00909             sf_new[j]=sf_orig[orig_idx[j]];
00910 
00911         sparse_feature_matrix[i].features=sf_new;
00912 
00913         // sanity check
00914         for (int j=0; j<len-1; j++)
00915             ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);
00916 
00917         SG_FREE(orig_idx);
00918         SG_FREE(feat_idx);
00919         SG_FREE(sf_orig);
00920     }
00921 }
00922 
00923 template<class ST> bool CSparseFeatures<ST>::write_svmlight_file(char* fname,
00924         CRegressionLabels* label)
00925 {
00926     if (m_subset_stack->has_subsets())
00927         SG_ERROR("write_svmlight_file() not allowed with subset\n");
00928 
00929     ASSERT(label);
00930     int32_t num=label->get_num_labels();
00931     ASSERT(num>0);
00932     ASSERT(num==num_vectors);
00933 
00934     FILE* f=fopen(fname, "wb");
00935 
00936     if (f)
00937     {
00938         for (int32_t i=0; i<num; i++)
00939         {
00940             fprintf(f, "%d ", (int32_t) label->get_int_label(i));
00941 
00942             SGSparseVectorEntry<ST>* vec = sparse_feature_matrix[i].features;
00943             int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
00944 
00945             for (int32_t j=0; j<num_feat; j++)
00946             {
00947                 if (j<num_feat-1)
00948                     fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00949                 else
00950                     fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00951             }
00952         }
00953 
00954         fclose(f);
00955         return true;
00956     }
00957     return false;
00958 }
00959 
00960 template<class ST> int32_t CSparseFeatures<ST>::get_dim_feature_space() const
00961 {
00962     return num_features;
00963 }
00964 
00965 template<class ST> float64_t CSparseFeatures<ST>::dot(int32_t vec_idx1,
00966         CDotFeatures* df, int32_t vec_idx2)
00967 {
00968     ASSERT(df);
00969     ASSERT(df->get_feature_type() == get_feature_type());
00970     ASSERT(df->get_feature_class() == get_feature_class());
00971     CSparseFeatures<ST>* sf = (CSparseFeatures<ST>*) df;
00972 
00973     SGSparseVector<ST> avec=get_sparse_feature_vector(vec_idx1);
00974     SGSparseVector<ST> bvec=sf->get_sparse_feature_vector(vec_idx2);
00975 
00976     float64_t result=sparse_dot(1, avec.features, avec.num_feat_entries,
00977         bvec.features, bvec.num_feat_entries);
00978 
00979     free_sparse_feature_vector(vec_idx1);
00980     sf->free_sparse_feature_vector(vec_idx2);
00981 
00982     return result;
00983 }
00984 template<class ST> float64_t CSparseFeatures<ST>::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len)
00985 {
00986     ASSERT(vec2);
00987     if (vec2_len!=num_features)
00988     {
00989         SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
00990                 vec2_len, num_features);
00991     }
00992     float64_t result=0;
00993 
00994     SGSparseVector<ST> sv=get_sparse_feature_vector(vec_idx1);
00995 
00996     if (sv.features)
00997     {
00998         for (int32_t i=0; i<sv.num_feat_entries; i++)
00999             result+=vec2[sv.features[i].feat_index]*sv.features[i].entry;
01000     }
01001 
01002     free_sparse_feature_vector(vec_idx1);
01003 
01004     return result;
01005 }
01006 
01007 template<class ST> void* CSparseFeatures<ST>::get_feature_iterator(int32_t vector_index)
01008 {
01009     if (vector_index>=get_num_vectors())
01010     {
01011         SG_ERROR("Index out of bounds (number of vectors %d, you "
01012                 "requested %d)\n", get_num_vectors(), vector_index);
01013     }
01014 
01015     if (!sparse_feature_matrix)
01016         SG_ERROR("Requires a in-memory feature matrix\n");
01017 
01018     sparse_feature_iterator* it=SG_MALLOC(sparse_feature_iterator, 1);
01019     it->sv=get_sparse_feature_vector(vector_index);
01020     it->index=0;
01021     it->vector_index=vector_index;
01022 
01023     return it;
01024 }
01025 
01026 template<class ST> bool CSparseFeatures<ST>::get_next_feature(int32_t& index, float64_t& value, void* iterator)
01027 {
01028     sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
01029     if (!it || it->index>=it->sv.num_feat_entries)
01030         return false;
01031 
01032     int32_t i=it->index++;
01033 
01034     index=it->sv.features[i].feat_index;
01035     value=(float64_t) it->sv.features[i].entry;
01036 
01037     return true;
01038 }
01039 
01040 template<class ST> void CSparseFeatures<ST>::free_feature_iterator(void* iterator)
01041 {
01042     if (!iterator)
01043         return;
01044 
01045     sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
01046     free_sparse_feature_vector(it->vector_index);
01047     SG_FREE(it);
01048 }
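// Usage sketch (illustrative) for the iterator interface above:
//   void* it=feats->get_feature_iterator(vec_idx);
//   int32_t idx; float64_t val;
//   while (feats->get_next_feature(idx, val, it))
//       SG_SPRINT("feature %d = %f\n", idx, val);
//   feats->free_feature_iterator(it);
// where feats is assumed to be a CSparseFeatures<ST>* with an in-memory matrix
// and vec_idx a valid vector index.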
01049 
01050 template<class ST> CFeatures* CSparseFeatures<ST>::copy_subset(SGVector<index_t> indices)
01051 {
01052     SGSparseMatrix<ST> matrix_copy=SGSparseMatrix<ST>(indices.vlen,
01053         get_dim_feature_space());
01054 
01055     for (index_t i=0; i<indices.vlen; ++i)
01056     {
01057         /* index to copy */
01058         index_t index=indices.vector[i];
01059         index_t real_index=m_subset_stack->subset_idx_conversion(index);
01060 
01061         /* copy sparse vector */
01062         SGSparseVector<ST> current=get_sparse_feature_vector(real_index);
01063         matrix_copy.sparse_matrix[i]=current;
01064 
01065         free_sparse_feature_vector(index);
01066     }
01067 
01068     CFeatures* result=new CSparseFeatures<ST>(matrix_copy);
01069     SG_REF(result);
01070     return result;
01071 }
01072 
01073 template<class ST> SGSparseVectorEntry<ST>* CSparseFeatures<ST>::compute_sparse_feature_vector(int32_t num,
01074     int32_t& len, SGSparseVectorEntry<ST>* target)
01075 {
01076     SG_NOTIMPLEMENTED;
01077 
01078     len=0;
01079     return NULL;
01080 }
01081 
01082 template<class ST> void CSparseFeatures<ST>::init()
01083 {
01084     set_generic<ST>();
01085 
01086     m_parameters->add_vector(&sparse_feature_matrix, &num_vectors,
01087             "sparse_feature_matrix",
01088             "Array of sparse vectors.");
01089     m_parameters->add(&num_features, "num_features",
01090             "Total number of features.");
01091 }
01092 
01093 #define GET_FEATURE_TYPE(sg_type, f_type)                                   \
01094 template<> EFeatureType CSparseFeatures<sg_type>::get_feature_type() const  \
01095 {                                                                           \
01096     return f_type;                                                          \
01097 }
01098 GET_FEATURE_TYPE(bool, F_BOOL)
01099 GET_FEATURE_TYPE(char, F_CHAR)
01100 GET_FEATURE_TYPE(uint8_t, F_BYTE)
01101 GET_FEATURE_TYPE(int8_t, F_BYTE)
01102 GET_FEATURE_TYPE(int16_t, F_SHORT)
01103 GET_FEATURE_TYPE(uint16_t, F_WORD)
01104 GET_FEATURE_TYPE(int32_t, F_INT)
01105 GET_FEATURE_TYPE(uint32_t, F_UINT)
01106 GET_FEATURE_TYPE(int64_t, F_LONG)
01107 GET_FEATURE_TYPE(uint64_t, F_ULONG)
01108 GET_FEATURE_TYPE(float32_t, F_SHORTREAL)
01109 GET_FEATURE_TYPE(float64_t, F_DREAL)
01110 GET_FEATURE_TYPE(floatmax_t, F_LONGREAL)
01111 #undef GET_FEATURE_TYPE
01112 
01113 #define LOAD(fname, sg_type)                                            \
01114 template<> void CSparseFeatures<sg_type>::load(CFile* loader)   \
01115 {                                                                       \
01116     remove_all_subsets();                                                   \
01117     SG_SET_LOCALE_C;                                                    \
01118     ASSERT(loader);                                                     \
01119     SGSparseVector<sg_type>* matrix=NULL;                                       \
01120     int32_t num_feat=0;                                                 \
01121     int32_t num_vec=0;                                                  \
01122     loader->fname(matrix, num_feat, num_vec);                           \
01123     set_sparse_feature_matrix(SGSparseMatrix<sg_type>(matrix, num_feat, num_vec));              \
01124     SG_RESET_LOCALE;                                                    \
01125 }
01126 LOAD(get_sparse_matrix, bool)
01127 LOAD(get_sparse_matrix, char)
01128 LOAD(get_sparse_matrix, uint8_t)
01129 LOAD(get_int8_sparsematrix, int8_t)
01130 LOAD(get_sparse_matrix, int16_t)
01131 LOAD(get_sparse_matrix, uint16_t)
01132 LOAD(get_sparse_matrix, int32_t)
01133 LOAD(get_uint_sparsematrix, uint32_t)
01134 LOAD(get_long_sparsematrix, int64_t)
01135 LOAD(get_ulong_sparsematrix, uint64_t)
01136 LOAD(get_sparse_matrix, float32_t)
01137 LOAD(get_sparse_matrix, float64_t)
01138 LOAD(get_longreal_sparsematrix, floatmax_t)
01139 #undef LOAD
01140 
01141 #define WRITE(fname, sg_type)                                           \
01142 template<> void CSparseFeatures<sg_type>::save(CFile* writer)   \
01143 {                                                                       \
01144     if (m_subset_stack->has_subsets())                                                      \
01145         SG_ERROR("save() not allowed with subset\n");                   \
01146     SG_SET_LOCALE_C;                                                    \
01147     ASSERT(writer);                                                     \
01148     writer->fname(sparse_feature_matrix, num_features, num_vectors);    \
01149     SG_RESET_LOCALE;                                                    \
01150 }
01151 WRITE(set_sparse_matrix, bool)
01152 WRITE(set_sparse_matrix, char)
01153 WRITE(set_sparse_matrix, uint8_t)
01154 WRITE(set_int8_sparsematrix, int8_t)
01155 WRITE(set_sparse_matrix, int16_t)
01156 WRITE(set_sparse_matrix, uint16_t)
01157 WRITE(set_sparse_matrix, int32_t)
01158 WRITE(set_uint_sparsematrix, uint32_t)
01159 WRITE(set_long_sparsematrix, int64_t)
01160 WRITE(set_ulong_sparsematrix, uint64_t)
01161 WRITE(set_sparse_matrix, float32_t)
01162 WRITE(set_sparse_matrix, float64_t)
01163 WRITE(set_longreal_sparsematrix, floatmax_t)
01164 #undef WRITE
01165 
01166 template class CSparseFeatures<bool>;
01167 template class CSparseFeatures<char>;
01168 template class CSparseFeatures<int8_t>;
01169 template class CSparseFeatures<uint8_t>;
01170 template class CSparseFeatures<int16_t>;
01171 template class CSparseFeatures<uint16_t>;
01172 template class CSparseFeatures<int32_t>;
01173 template class CSparseFeatures<uint32_t>;
01174 template class CSparseFeatures<int64_t>;
01175 template class CSparseFeatures<uint64_t>;
01176 template class CSparseFeatures<float32_t>;
01177 template class CSparseFeatures<float64_t>;
01178 template class CSparseFeatures<floatmax_t>;
01179 }

SHOGUN Machine Learning Toolbox - Documentation