SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2010 Soeren Sonnenburg 00008 * Copyright (C) 2010 Berlin Institute of Technology 00009 */ 00010 00011 #include <shogun/features/HashedWDFeaturesTransposed.h> 00012 #include <shogun/io/SGIO.h> 00013 #include <shogun/lib/Signal.h> 00014 #include <shogun/base/Parallel.h> 00015 00016 #ifdef HAVE_PTHREAD 00017 #include <pthread.h> 00018 #endif 00019 00020 using namespace shogun; 00021 00022 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00023 struct HASHEDWD_THREAD_PARAM 00024 { 00025 CHashedWDFeaturesTransposed* hf; 00026 int32_t* sub_index; 00027 float64_t* output; 00028 int32_t start; 00029 int32_t stop; 00030 float64_t* alphas; 00031 float64_t* vec; 00032 float64_t bias; 00033 bool progress; 00034 uint32_t* index; 00035 }; 00036 #endif // DOXYGEN_SHOULD_SKIP_THIS 00037 00038 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed() 00039 :CDotFeatures() 00040 { 00041 SG_UNSTABLE( 00042 "CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed()", 00043 "\n"); 00044 00045 strings = NULL; 00046 transposed_strings = NULL; 00047 00048 degree = 0; 00049 start_degree = 0; 00050 from_degree = 0; 00051 string_length = 0; 00052 num_strings = 0; 00053 alphabet_size = 0; 00054 w_dim = 0; 00055 partial_w_dim = 0; 00056 wd_weights = NULL; 00057 mask = 0; 00058 m_hash_bits = 0; 00059 00060 normalization_const = 0.0; 00061 } 00062 00063 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(CStringFeatures<uint8_t>* str, 00064 int32_t start_order, int32_t order, int32_t from_order, 00065 int32_t hash_bits) : CDotFeatures() 00066 { 00067 ASSERT(start_order>=0); 00068 ASSERT(start_order<order); 00069 ASSERT(order<=from_order); 00070 ASSERT(hash_bits>0); 00071 ASSERT(str); 00072 ASSERT(str->have_same_length()); 00073 SG_REF(str); 00074 00075 strings=str; 00076 int32_t transposed_num_feat=0; 00077 int32_t transposed_num_vec=0; 00078 transposed_strings=str->get_transposed(transposed_num_feat, transposed_num_vec); 00079 00080 string_length=str->get_max_vector_length(); 00081 num_strings=str->get_num_vectors(); 00082 ASSERT(transposed_num_feat==num_strings); 00083 ASSERT(transposed_num_vec==string_length); 00084 00085 CAlphabet* alpha=str->get_alphabet(); 00086 alphabet_size=alpha->get_num_symbols(); 00087 SG_UNREF(alpha); 00088 00089 degree=order; 00090 start_degree=start_order; 00091 if (start_degree!=0) 00092 SG_NOTIMPLEMENTED; 00093 from_degree=from_order; 00094 m_hash_bits=hash_bits; 00095 set_wd_weights(); 00096 set_normalization_const(); 00097 } 00098 00099 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(const CHashedWDFeaturesTransposed& orig) 00100 : CDotFeatures(orig), strings(orig.strings), transposed_strings(orig.transposed_strings), 00101 degree(orig.degree), start_degree(orig.start_degree), 00102 from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits), 00103 normalization_const(orig.normalization_const) 00104 { 00105 SG_REF(strings); 00106 string_length=strings->get_max_vector_length(); 00107 num_strings=strings->get_num_vectors(); 00108 CAlphabet* alpha=strings->get_alphabet(); 00109 alphabet_size=alpha->get_num_symbols(); 00110 SG_UNREF(alpha); 00111 00112 set_wd_weights(); 00113 } 00114 00115 CHashedWDFeaturesTransposed::~CHashedWDFeaturesTransposed() 00116 { 00117 for (int32_t i=0; i<string_length; i++) 00118 SG_FREE(transposed_strings[i].string); 00119 SG_FREE(transposed_strings); 00120 00121 SG_UNREF(strings); 00122 SG_FREE(wd_weights); 00123 } 00124 00125 float64_t CHashedWDFeaturesTransposed::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2) 00126 { 00127 ASSERT(df); 00128 ASSERT(df->get_feature_type() == get_feature_type()); 00129 ASSERT(df->get_feature_class() == get_feature_class()); 00130 CHashedWDFeaturesTransposed* wdf = (CHashedWDFeaturesTransposed*) df; 00131 00132 int32_t len1, len2; 00133 bool free_vec1, free_vec2; 00134 00135 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1); 00136 uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2); 00137 00138 ASSERT(len1==len2); 00139 00140 float64_t sum=0.0; 00141 00142 for (int32_t i=0; i<len1; i++) 00143 { 00144 for (int32_t j=0; (i+j<len1) && (j<degree); j++) 00145 { 00146 if (vec1[i+j]!=vec2[i+j]) 00147 break; 00148 if (j>=start_degree) 00149 sum += wd_weights[j]*wd_weights[j]; 00150 } 00151 } 00152 strings->free_feature_vector(vec1, vec_idx1, free_vec1); 00153 wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2); 00154 return sum/CMath::sq(normalization_const); 00155 } 00156 00157 float64_t CHashedWDFeaturesTransposed::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len) 00158 { 00159 if (vec2_len != w_dim) 00160 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00161 00162 float64_t sum=0; 00163 int32_t len; 00164 bool free_vec1; 00165 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00166 uint32_t* val=SG_MALLOC(uint32_t, len); 00167 00168 uint32_t offs=0; 00169 00170 SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF); 00171 00172 for (int32_t i=0; i < len; i++) 00173 { 00174 uint32_t o=offs; 00175 uint32_t carry = 0; 00176 uint32_t chunk = 0; 00177 00178 for (int32_t k=0; k<degree && i+k<len; k++) 00179 { 00180 const float64_t wd = wd_weights[k]; 00181 chunk++; 00182 CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1); 00183 uint32_t h = 00184 CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk); 00185 #ifdef DEBUG_HASHEDWD 00186 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h); 00187 #endif 00188 sum+=vec2[o+(h & mask)]*wd; 00189 o+=partial_w_dim; 00190 } 00191 val[i] = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk); 00192 offs+=partial_w_dim*degree; 00193 } 00194 SG_FREE(val); 00195 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00196 00197 return sum/normalization_const; 00198 } 00199 00200 void CHashedWDFeaturesTransposed::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b) 00201 { 00202 ASSERT(output); 00203 // write access is internally between output[start..stop] so the following 00204 // line is necessary to write to output[0...(stop-start-1)] 00205 output-=start; 00206 ASSERT(start>=0); 00207 ASSERT(start<stop); 00208 ASSERT(stop<=get_num_vectors()); 00209 uint32_t* index=SG_MALLOC(uint32_t, stop); 00210 00211 int32_t num_vectors=stop-start; 00212 ASSERT(num_vectors>0); 00213 00214 int32_t num_threads=parallel->get_num_threads(); 00215 ASSERT(num_threads>0); 00216 00217 CSignal::clear_cancel(); 00218 00219 if (dim != w_dim) 00220 SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim); 00221 00222 #ifdef HAVE_PTHREAD 00223 if (num_threads < 2) 00224 { 00225 #endif 00226 HASHEDWD_THREAD_PARAM params; 00227 params.hf=this; 00228 params.sub_index=NULL; 00229 params.output=output; 00230 params.start=start; 00231 params.stop=stop; 00232 params.alphas=alphas; 00233 params.vec=vec; 00234 params.bias=b; 00235 params.progress=false; //true; 00236 params.index=index; 00237 dense_dot_range_helper((void*) ¶ms); 00238 #ifdef HAVE_PTHREAD 00239 } 00240 else 00241 { 00242 pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1); 00243 HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads); 00244 int32_t step= num_vectors/num_threads; 00245 00246 int32_t t; 00247 00248 for (t=0; t<num_threads-1; t++) 00249 { 00250 params[t].hf = this; 00251 params[t].sub_index=NULL; 00252 params[t].output = output; 00253 params[t].start = start+t*step; 00254 params[t].stop = start+(t+1)*step; 00255 params[t].alphas=alphas; 00256 params[t].vec=vec; 00257 params[t].bias=b; 00258 params[t].progress = false; 00259 params[t].index=index; 00260 pthread_create(&threads[t], NULL, 00261 CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)¶ms[t]); 00262 } 00263 00264 params[t].hf = this; 00265 params[t].sub_index=NULL; 00266 params[t].output = output; 00267 params[t].start = start+t*step; 00268 params[t].stop = stop; 00269 params[t].alphas=alphas; 00270 params[t].vec=vec; 00271 params[t].bias=b; 00272 params[t].progress = false; //true; 00273 params[t].index=index; 00274 CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) ¶ms[t]); 00275 00276 for (t=0; t<num_threads-1; t++) 00277 pthread_join(threads[t], NULL); 00278 00279 SG_FREE(params); 00280 SG_FREE(threads); 00281 } 00282 #endif 00283 SG_FREE(index); 00284 00285 #ifndef WIN32 00286 if ( CSignal::cancel_computations() ) 00287 SG_INFO( "prematurely stopped. \n"); 00288 #endif 00289 } 00290 00291 void CHashedWDFeaturesTransposed::dense_dot_range_subset(int32_t* sub_index, int num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b) 00292 { 00293 ASSERT(sub_index); 00294 ASSERT(output); 00295 00296 uint32_t* index=SG_MALLOC(uint32_t, num); 00297 00298 int32_t num_threads=parallel->get_num_threads(); 00299 ASSERT(num_threads>0); 00300 00301 CSignal::clear_cancel(); 00302 00303 if (dim != w_dim) 00304 SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim); 00305 00306 #ifdef HAVE_PTHREAD 00307 if (num_threads < 2) 00308 { 00309 #endif 00310 HASHEDWD_THREAD_PARAM params; 00311 params.hf=this; 00312 params.sub_index=sub_index; 00313 params.output=output; 00314 params.start=0; 00315 params.stop=num; 00316 params.alphas=alphas; 00317 params.vec=vec; 00318 params.bias=b; 00319 params.progress=false; //true; 00320 params.index=index; 00321 dense_dot_range_helper((void*) ¶ms); 00322 #ifdef HAVE_PTHREAD 00323 } 00324 else 00325 { 00326 pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1); 00327 HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads); 00328 int32_t step= num/num_threads; 00329 00330 int32_t t; 00331 00332 for (t=0; t<num_threads-1; t++) 00333 { 00334 params[t].hf = this; 00335 params[t].sub_index=sub_index; 00336 params[t].output = output; 00337 params[t].start = t*step; 00338 params[t].stop = (t+1)*step; 00339 params[t].alphas=alphas; 00340 params[t].vec=vec; 00341 params[t].bias=b; 00342 params[t].progress = false; 00343 params[t].index=index; 00344 pthread_create(&threads[t], NULL, 00345 CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)¶ms[t]); 00346 } 00347 00348 params[t].hf = this; 00349 params[t].sub_index=sub_index; 00350 params[t].output = output; 00351 params[t].start = t*step; 00352 params[t].stop = num; 00353 params[t].alphas=alphas; 00354 params[t].vec=vec; 00355 params[t].bias=b; 00356 params[t].progress = false; //true; 00357 params[t].index=index; 00358 CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) ¶ms[t]); 00359 00360 for (t=0; t<num_threads-1; t++) 00361 pthread_join(threads[t], NULL); 00362 00363 SG_FREE(params); 00364 SG_FREE(threads); 00365 SG_FREE(index); 00366 } 00367 #endif 00368 00369 #ifndef WIN32 00370 if ( CSignal::cancel_computations() ) 00371 SG_INFO( "prematurely stopped. \n"); 00372 #endif 00373 } 00374 00375 void* CHashedWDFeaturesTransposed::dense_dot_range_helper(void* p) 00376 { 00377 HASHEDWD_THREAD_PARAM* par=(HASHEDWD_THREAD_PARAM*) p; 00378 CHashedWDFeaturesTransposed* hf=par->hf; 00379 int32_t* sub_index=par->sub_index; 00380 float64_t* output=par->output; 00381 int32_t start=par->start; 00382 int32_t stop=par->stop; 00383 float64_t* alphas=par->alphas; 00384 float64_t* vec=par->vec; 00385 float64_t bias=par->bias; 00386 bool progress=par->progress; 00387 uint32_t* index=par->index; 00388 int32_t string_length=hf->string_length; 00389 int32_t degree=hf->degree; 00390 float64_t* wd_weights=hf->wd_weights; 00391 SGString<uint8_t>* transposed_strings=hf->transposed_strings; 00392 uint32_t mask=hf->mask; 00393 int32_t partial_w_dim=hf->partial_w_dim; 00394 float64_t normalization_const=hf->normalization_const; 00395 00396 if (sub_index) 00397 { 00398 for (int32_t j=start; j<stop; j++) 00399 output[j]=0.0; 00400 00401 uint32_t offs=0; 00402 for (int32_t i=0; i<string_length; i++) 00403 { 00404 uint32_t o=offs; 00405 for (int32_t k=0; k<degree && i+k<string_length; k++) 00406 { 00407 const float64_t wd = wd_weights[k]; 00408 uint8_t* dim=transposed_strings[i+k].string; 00409 uint32_t carry = 0; 00410 uint32_t chunk = 0; 00411 for (int32_t j=start; j<stop; j++) 00412 { 00413 uint8_t bval=dim[sub_index[j]]; 00414 if (k==0) 00415 index[j] = 0xDEADBEAF; 00416 00417 CHash::IncrementalMurmurHash3(&index[j], &carry, &bval, 1); 00418 00419 chunk++; 00420 uint32_t h = 00421 CHash::FinalizeIncrementalMurmurHash3( 00422 index[j], carry, chunk); 00423 00424 output[j]+=vec[o + (h & mask)]*wd; 00425 index[j] = h; 00426 } 00427 00428 index[stop-1] = 00429 CHash::FinalizeIncrementalMurmurHash3( 00430 index[stop-1], carry, chunk); 00431 00432 o+=partial_w_dim; 00433 } 00434 offs+=partial_w_dim*degree; 00435 00436 if (progress) 00437 hf->io->progress(i, 0,string_length, i); 00438 } 00439 00440 for (int32_t j=start; j<stop; j++) 00441 { 00442 if (alphas) 00443 output[j]=output[j]*alphas[sub_index[j]]/normalization_const+bias; 00444 else 00445 output[j]=output[j]/normalization_const+bias; 00446 } 00447 } 00448 else 00449 { 00450 SGVector<float64_t>::fill_vector(&output[start], stop-start, 0.0); 00451 00452 uint32_t offs=0; 00453 for (int32_t i=0; i<string_length; i++) 00454 { 00455 uint32_t o=offs; 00456 for (int32_t k=0; k<degree && i+k<string_length; k++) 00457 { 00458 const float64_t wd = wd_weights[k]; 00459 uint8_t* dim=transposed_strings[i+k].string; 00460 uint32_t carry = 0; 00461 uint32_t chunk = 0; 00462 00463 for (int32_t j=start; j<stop; j++) 00464 { 00465 uint8_t bval=dim[sub_index[j]]; 00466 if (k==0) 00467 index[j] = 0xDEADBEAF; 00468 00469 CHash::IncrementalMurmurHash3(&index[j], &carry, &bval, 1); 00470 00471 chunk++; 00472 uint32_t h = 00473 CHash::FinalizeIncrementalMurmurHash3( 00474 index[j], carry, chunk); 00475 00476 index[j] = h; 00477 output[j]+=vec[o + (h & mask)]*wd; 00478 } 00479 00480 index[stop-1] = CHash::FinalizeIncrementalMurmurHash3( 00481 index[stop-1], carry, chunk); 00482 00483 o+=partial_w_dim; 00484 } 00485 offs+=partial_w_dim*degree; 00486 00487 if (progress) 00488 hf->io->progress(i, 0,string_length, i); 00489 } 00490 00491 for (int32_t j=start; j<stop; j++) 00492 { 00493 if (alphas) 00494 output[j]=output[j]*alphas[j]/normalization_const+bias; 00495 else 00496 output[j]=output[j]/normalization_const+bias; 00497 } 00498 } 00499 00500 return NULL; 00501 } 00502 00503 void CHashedWDFeaturesTransposed::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00504 { 00505 if (vec2_len != w_dim) 00506 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00507 00508 int32_t len; 00509 bool free_vec1; 00510 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00511 uint32_t* val=SG_MALLOC(uint32_t, len); 00512 00513 uint32_t offs=0; 00514 float64_t factor=alpha/normalization_const; 00515 if (abs_val) 00516 factor=CMath::abs(factor); 00517 00518 SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF); 00519 00520 for (int32_t i=0; i<len; i++) 00521 { 00522 uint32_t o=offs; 00523 uint32_t carry = 0; 00524 uint32_t chunk = 0; 00525 00526 for (int32_t k=0; k<degree && i+k<len; k++) 00527 { 00528 float64_t wd = wd_weights[k]*factor; 00529 chunk++; 00530 CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1); 00531 uint32_t h = 00532 CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk); 00533 #ifdef DEBUG_HASHEDWD 00534 SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h); 00535 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k,offs, o); 00536 #endif 00537 vec2[o+(h & mask)]+=wd; 00538 val[i] = h; 00539 o+=partial_w_dim; 00540 } 00541 00542 val[i] = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk); 00543 offs+=partial_w_dim*degree; 00544 } 00545 00546 SG_FREE(val); 00547 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00548 } 00549 00550 void CHashedWDFeaturesTransposed::set_wd_weights() 00551 { 00552 ASSERT(degree>0); 00553 00554 mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1; 00555 partial_w_dim=1<<m_hash_bits; 00556 w_dim=partial_w_dim*string_length*(degree-start_degree); 00557 00558 wd_weights=SG_MALLOC(float64_t, degree); 00559 00560 for (int32_t i=0; i<degree; i++) 00561 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1))); 00562 00563 SG_DEBUG("created HashedWDFeaturesTransposed with d=%d (%d), alphabetsize=%d, " 00564 "dim=%d partial_dim=%d num=%d, len=%d\n", 00565 degree, from_degree, alphabet_size, 00566 w_dim, partial_w_dim, num_strings, string_length); 00567 } 00568 00569 00570 void CHashedWDFeaturesTransposed::set_normalization_const(float64_t n) 00571 { 00572 if (n==0) 00573 { 00574 normalization_const=0; 00575 for (int32_t i=0; i<degree; i++) 00576 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i]; 00577 00578 normalization_const=CMath::sqrt(normalization_const); 00579 } 00580 else 00581 normalization_const=n; 00582 00583 SG_DEBUG("normalization_const:%f\n", normalization_const); 00584 } 00585 00586 CFeatures* CHashedWDFeaturesTransposed::duplicate() const 00587 { 00588 return new CHashedWDFeaturesTransposed(*this); 00589 } 00590 00591 void* CHashedWDFeaturesTransposed::get_feature_iterator(int32_t vector_index) 00592 { 00593 SG_NOTIMPLEMENTED; 00594 return NULL; 00595 } 00596 00597 bool CHashedWDFeaturesTransposed::get_next_feature(int32_t& index, float64_t& value, void* iterator) 00598 { 00599 SG_NOTIMPLEMENTED; 00600 return NULL; 00601 } 00602 00603 void CHashedWDFeaturesTransposed::free_feature_iterator(void* iterator) 00604 { 00605 SG_NOTIMPLEMENTED; 00606 }