SHOGUN
v2.0.0
|
00001 #include <shogun/features/StringFeatures.h> 00002 #include <shogun/preprocessor/Preprocessor.h> 00003 #include <shogun/preprocessor/StringPreprocessor.h> 00004 #include <shogun/io/MemoryMappedFile.h> 00005 #include <shogun/io/SGIO.h> 00006 #include <shogun/mathematics/Math.h> 00007 #include <shogun/base/Parameter.h> 00008 00009 #include <sys/types.h> 00010 #include <sys/stat.h> 00011 #include <dirent.h> 00012 #include <stdio.h> 00013 #include <stdlib.h> 00014 #include <unistd.h> 00015 00016 00017 namespace shogun 00018 { 00019 00020 template<class ST> CStringFeatures<ST>::CStringFeatures() : CFeatures(0) 00021 { 00022 init(); 00023 alphabet=new CAlphabet(); 00024 } 00025 00026 template<class ST> CStringFeatures<ST>::CStringFeatures(EAlphabet alpha) : CFeatures(0) 00027 { 00028 init(); 00029 00030 alphabet=new CAlphabet(alpha); 00031 SG_REF(alphabet); 00032 num_symbols=alphabet->get_num_symbols(); 00033 original_num_symbols=num_symbols; 00034 } 00035 00036 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha) 00037 : CFeatures(0) 00038 { 00039 init(); 00040 00041 alphabet=new CAlphabet(alpha); 00042 SG_REF(alphabet); 00043 num_symbols=alphabet->get_num_symbols(); 00044 original_num_symbols=num_symbols; 00045 set_features(string_list.strings, string_list.num_strings, string_list.max_string_length); 00046 } 00047 00048 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha) 00049 : CFeatures(0) 00050 { 00051 init(); 00052 00053 alphabet=new CAlphabet(alpha); 00054 SG_REF(alphabet); 00055 num_symbols=alphabet->get_num_symbols(); 00056 original_num_symbols=num_symbols; 00057 set_features(string_list.strings, string_list.num_strings, string_list.max_string_length); 00058 } 00059 00060 template<class ST> CStringFeatures<ST>::CStringFeatures(CAlphabet* alpha) 00061 : CFeatures(0) 00062 { 00063 init(); 00064 00065 ASSERT(alpha); 00066 SG_REF(alpha); 00067 alphabet=alpha; 00068 num_symbols=alphabet->get_num_symbols(); 00069 original_num_symbols=num_symbols; 00070 } 00071 00072 template<class ST> CStringFeatures<ST>::CStringFeatures(const CStringFeatures & orig) 00073 : CFeatures(orig), num_vectors(orig.num_vectors), 00074 single_string(orig.single_string), 00075 length_of_single_string(orig.length_of_single_string), 00076 max_string_length(orig.max_string_length), 00077 num_symbols(orig.num_symbols), 00078 original_num_symbols(orig.original_num_symbols), 00079 order(orig.order), preprocess_on_get(false), 00080 feature_cache(NULL) 00081 { 00082 init(); 00083 00084 ASSERT(orig.single_string == NULL); //not implemented 00085 00086 alphabet=orig.alphabet; 00087 SG_REF(alphabet); 00088 00089 if (orig.features) 00090 { 00091 features=SG_MALLOC(SGString<ST>, orig.num_vectors); 00092 00093 for (int32_t i=0; i<num_vectors; i++) 00094 { 00095 features[i].string=SG_MALLOC(ST, orig.features[i].slen); 00096 features[i].slen=orig.features[i].slen; 00097 memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen); 00098 } 00099 } 00100 00101 if (orig.symbol_mask_table) 00102 { 00103 symbol_mask_table=SG_MALLOC(ST, 256); 00104 for (int32_t i=0; i<256; i++) 00105 symbol_mask_table[i]=orig.symbol_mask_table[i]; 00106 } 00107 00108 m_subset_stack=orig.m_subset_stack; 00109 SG_REF(m_subset_stack); 00110 } 00111 00112 template<class ST> CStringFeatures<ST>::CStringFeatures(CFile* loader, EAlphabet alpha) 00113 : CFeatures(loader), num_vectors(0), 00114 features(NULL), single_string(NULL), length_of_single_string(0), 00115 max_string_length(0), order(0), 00116 symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL) 00117 { 00118 init(); 00119 00120 alphabet=new CAlphabet(alpha); 00121 SG_REF(alphabet); 00122 num_symbols=alphabet->get_num_symbols(); 00123 original_num_symbols=num_symbols; 00124 load(loader); 00125 } 00126 00127 template<class ST> CStringFeatures<ST>::~CStringFeatures() 00128 { 00129 cleanup(); 00130 00131 SG_UNREF(alphabet); 00132 } 00133 00134 template<class ST> void CStringFeatures<ST>::cleanup() 00135 { 00136 remove_all_subsets(); 00137 00138 if (single_string) 00139 { 00140 SG_FREE(single_string); 00141 single_string=NULL; 00142 } 00143 else 00144 cleanup_feature_vectors(0, num_vectors-1); 00145 00146 num_vectors=0; 00147 SG_FREE(features); 00148 SG_FREE(symbol_mask_table); 00149 features=NULL; 00150 symbol_mask_table=NULL; 00151 00152 /* start with a fresh alphabet, but instead of emptying the histogram 00153 * create a new object (to leave the alphabet object alone if it is used 00154 * by others) 00155 */ 00156 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet()); 00157 SG_UNREF(alphabet); 00158 alphabet=alpha; 00159 SG_REF(alphabet); 00160 } 00161 00162 template<class ST> void CStringFeatures<ST>::cleanup_feature_vector(int32_t num) 00163 { 00164 ASSERT(num<get_num_vectors()); 00165 00166 if (features) 00167 { 00168 int32_t real_num=m_subset_stack->subset_idx_conversion(num); 00169 SG_FREE(features[real_num].string); 00170 features[real_num].string=NULL; 00171 features[real_num].slen=0; 00172 00173 determine_maximum_string_length(); 00174 } 00175 } 00176 00177 template<class ST> void CStringFeatures<ST>::cleanup_feature_vectors(int32_t start, int32_t stop) 00178 { 00179 if (features && get_num_vectors()) 00180 { 00181 ASSERT(start<get_num_vectors()); 00182 ASSERT(stop<get_num_vectors()); 00183 00184 for (int32_t i=start; i<=stop; i++) 00185 { 00186 int32_t real_num=m_subset_stack->subset_idx_conversion(i); 00187 SG_FREE(features[real_num].string); 00188 features[real_num].string=NULL; 00189 features[real_num].slen=0; 00190 } 00191 determine_maximum_string_length(); 00192 } 00193 } 00194 00195 template<class ST> EFeatureClass CStringFeatures<ST>::get_feature_class() const { return C_STRING; } 00196 00197 template<class ST> EFeatureType CStringFeatures<ST>::get_feature_type() const { return F_UNKNOWN; } 00198 00199 template<class ST> CAlphabet* CStringFeatures<ST>::get_alphabet() 00200 { 00201 SG_REF(alphabet); 00202 return alphabet; 00203 } 00204 00205 template<class ST> CFeatures* CStringFeatures<ST>::duplicate() const 00206 { 00207 return new CStringFeatures<ST>(*this); 00208 } 00209 00210 template<class ST> SGVector<ST> CStringFeatures<ST>::get_feature_vector(int32_t num) 00211 { 00212 ASSERT(features); 00213 if (num>=get_num_vectors()) 00214 { 00215 SG_ERROR("Index out of bounds (number of strings %d, you " 00216 "requested %d)\n", get_num_vectors(), num); 00217 } 00218 00219 int32_t l; 00220 bool free_vec; 00221 ST* vec=get_feature_vector(num, l, free_vec); 00222 ST* dst=SG_MALLOC(ST, l); 00223 memcpy(dst, vec, l*sizeof(ST)); 00224 free_feature_vector(vec, num, free_vec); 00225 return SGVector<ST>(dst, l, true); 00226 } 00227 00228 template<class ST> void CStringFeatures<ST>::set_feature_vector(SGVector<ST> vector, int32_t num) 00229 { 00230 ASSERT(features); 00231 00232 if (m_subset_stack->has_subsets()) 00233 SG_ERROR("A subset is set, cannot set feature vector\n"); 00234 00235 if (num>=num_vectors) 00236 { 00237 SG_ERROR("Index out of bounds (number of strings %d, you " 00238 "requested %d)\n", num_vectors, num); 00239 } 00240 00241 if (vector.vlen<=0) 00242 SG_ERROR("String has zero or negative length\n"); 00243 00244 cleanup_feature_vector(num); 00245 features[num].slen=vector.vlen; 00246 features[num].string=SG_MALLOC(ST, vector.vlen); 00247 memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST)); 00248 00249 determine_maximum_string_length(); 00250 } 00251 00252 template<class ST> void CStringFeatures<ST>::enable_on_the_fly_preprocessing() 00253 { 00254 preprocess_on_get=true; 00255 } 00256 00257 template<class ST> void CStringFeatures<ST>::disable_on_the_fly_preprocessing() 00258 { 00259 preprocess_on_get=false; 00260 } 00261 00262 template<class ST> ST* CStringFeatures<ST>::get_feature_vector(int32_t num, int32_t& len, bool& dofree) 00263 { 00264 ASSERT(features); 00265 if (num>=get_num_vectors()) 00266 SG_ERROR("Requested feature vector with index %d while total num is", num, get_num_vectors()); 00267 00268 int32_t real_num=m_subset_stack->subset_idx_conversion(num); 00269 00270 if (!preprocess_on_get) 00271 { 00272 dofree=false; 00273 len=features[real_num].slen; 00274 return features[real_num].string; 00275 } 00276 else 00277 { 00278 SG_DEBUG( "computing feature vector!\n") ; 00279 ST* feat=compute_feature_vector(num, len); 00280 dofree=true; 00281 00282 if (get_num_preprocessors()) 00283 { 00284 ST* tmp_feat_before=feat; 00285 00286 for (int32_t i=0; i<get_num_preprocessors(); i++) 00287 { 00288 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i); 00289 feat=p->apply_to_string(tmp_feat_before, len); 00290 SG_UNREF(p); 00291 SG_FREE(tmp_feat_before); 00292 tmp_feat_before=feat; 00293 } 00294 } 00295 // TODO: implement caching 00296 return feat; 00297 } 00298 } 00299 00300 template<class ST> CStringFeatures<ST>* CStringFeatures<ST>::get_transposed() 00301 { 00302 int32_t num_feat; 00303 int32_t num_vec; 00304 SGString<ST>* s=get_transposed(num_feat, num_vec); 00305 SGStringList<ST> string_list; 00306 string_list.strings = s; 00307 string_list.num_strings = num_vec; 00308 string_list.max_string_length = num_feat; 00309 00310 return new CStringFeatures<ST>(string_list, alphabet); 00311 } 00312 00313 template<class ST> SGString<ST>* CStringFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec) 00314 { 00315 num_feat=get_num_vectors(); 00316 num_vec=get_max_vector_length(); 00317 ASSERT(have_same_length()); 00318 00319 SG_DEBUG("Allocating memory for transposed string features of size %ld\n", 00320 int64_t(num_feat)*num_vec); 00321 00322 SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec); 00323 00324 for (int32_t i=0; i<num_vec; i++) 00325 { 00326 sf[i].string=SG_MALLOC(ST, num_feat); 00327 sf[i].slen=num_feat; 00328 } 00329 00330 for (int32_t i=0; i<num_feat; i++) 00331 { 00332 int32_t len=0; 00333 bool free_vec=false; 00334 ST* vec=get_feature_vector(i, len, free_vec); 00335 00336 for (int32_t j=0; j<num_vec; j++) 00337 sf[j].string[i]=vec[j]; 00338 00339 free_feature_vector(vec, i, free_vec); 00340 } 00341 return sf; 00342 } 00343 00344 template<class ST> void CStringFeatures<ST>::free_feature_vector(ST* feat_vec, int32_t num, bool dofree) 00345 { 00346 if (num>=get_num_vectors()) 00347 { 00348 SG_ERROR( 00349 "Trying to access string[%d] but num_str=%d\n", num, 00350 get_num_vectors()); 00351 } 00352 00353 int32_t real_num=m_subset_stack->subset_idx_conversion(num); 00354 00355 if (feature_cache) 00356 feature_cache->unlock_entry(real_num); 00357 00358 if (dofree) 00359 SG_FREE(feat_vec); 00360 } 00361 00362 template<class ST> void CStringFeatures<ST>::free_feature_vector(SGVector<ST> feat_vec, int32_t num) 00363 { 00364 if (num>=get_num_vectors()) 00365 { 00366 SG_ERROR( 00367 "Trying to access string[%d] but num_str=%d\n", num, 00368 get_num_vectors()); 00369 } 00370 00371 int32_t real_num=m_subset_stack->subset_idx_conversion(num); 00372 00373 if (feature_cache) 00374 feature_cache->unlock_entry(real_num); 00375 } 00376 00377 template<class ST> ST CStringFeatures<ST>::get_feature(int32_t vec_num, int32_t feat_num) 00378 { 00379 ASSERT(vec_num<get_num_vectors()); 00380 00381 int32_t len; 00382 bool free_vec; 00383 ST* vec=get_feature_vector(vec_num, len, free_vec); 00384 ASSERT(feat_num<len); 00385 ST result=vec[feat_num]; 00386 free_feature_vector(vec, vec_num, free_vec); 00387 00388 return result; 00389 } 00390 00391 template<class ST> int32_t CStringFeatures<ST>::get_vector_length(int32_t vec_num) 00392 { 00393 ASSERT(vec_num<get_num_vectors()); 00394 00395 int32_t len; 00396 bool free_vec; 00397 ST* vec=get_feature_vector(vec_num, len, free_vec); 00398 free_feature_vector(vec, vec_num, free_vec); 00399 return len; 00400 } 00401 00402 template<class ST> int32_t CStringFeatures<ST>::get_max_vector_length() 00403 { 00404 return max_string_length; 00405 } 00406 00407 template<class ST> int32_t CStringFeatures<ST>::get_num_vectors() const 00408 { 00409 return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors; 00410 } 00411 00412 template<class ST> floatmax_t CStringFeatures<ST>::get_num_symbols() { return num_symbols; } 00413 00414 template<class ST> floatmax_t CStringFeatures<ST>::get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); } 00415 00416 template<class ST> floatmax_t CStringFeatures<ST>::get_original_num_symbols() { return original_num_symbols; } 00417 00418 template<class ST> int32_t CStringFeatures<ST>::get_order() { return order; } 00419 00420 template<class ST> ST CStringFeatures<ST>::get_masked_symbols(ST symbol, uint8_t mask) 00421 { 00422 ASSERT(symbol_mask_table); 00423 return symbol_mask_table[mask] & symbol; 00424 } 00425 00426 template<class ST> ST CStringFeatures<ST>::shift_offset(ST offset, int32_t amount) 00427 { 00428 ASSERT(alphabet); 00429 return (offset << (amount*alphabet->get_num_bits())); 00430 } 00431 00432 template<class ST> ST CStringFeatures<ST>::shift_symbol(ST symbol, int32_t amount) 00433 { 00434 ASSERT(alphabet); 00435 return (symbol >> (amount*alphabet->get_num_bits())); 00436 } 00437 00438 template<class ST> void CStringFeatures<ST>::load_ascii_file(char* fname, bool remap_to_bin, 00439 EAlphabet ascii_alphabet, EAlphabet binary_alphabet) 00440 { 00441 remove_all_subsets(); 00442 00443 size_t blocksize=1024*1024; 00444 size_t required_blocksize=0; 00445 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize); 00446 uint8_t* overflow=NULL; 00447 int32_t overflow_len=0; 00448 00449 cleanup(); 00450 00451 CAlphabet* alpha=new CAlphabet(ascii_alphabet); 00452 CAlphabet* alpha_bin=new CAlphabet(binary_alphabet); 00453 00454 FILE* f=fopen(fname, "ro"); 00455 00456 if (f) 00457 { 00458 num_vectors=0; 00459 max_string_length=0; 00460 00461 SG_INFO("counting line numbers in file %s\n", fname); 00462 size_t block_offs=0; 00463 size_t old_block_offs=0; 00464 fseek(f, 0, SEEK_END); 00465 size_t fsize=ftell(f); 00466 rewind(f); 00467 00468 if (blocksize>fsize) 00469 blocksize=fsize; 00470 00471 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize); 00472 00473 size_t sz=blocksize; 00474 while (sz == blocksize) 00475 { 00476 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 00477 for (size_t i=0; i<sz; i++) 00478 { 00479 block_offs++; 00480 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00481 { 00482 num_vectors++; 00483 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs); 00484 old_block_offs=block_offs; 00485 } 00486 } 00487 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); 00488 } 00489 00490 SG_INFO("found %d strings\n", num_vectors); 00491 SG_FREE(dummy); 00492 blocksize=required_blocksize; 00493 dummy=SG_MALLOC(uint8_t, blocksize); 00494 overflow=SG_MALLOC(uint8_t, blocksize); 00495 features=SG_MALLOC(SGString<ST>, num_vectors); 00496 00497 rewind(f); 00498 sz=blocksize; 00499 int32_t lines=0; 00500 while (sz == blocksize) 00501 { 00502 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 00503 00504 size_t old_sz=0; 00505 for (size_t i=0; i<sz; i++) 00506 { 00507 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00508 { 00509 int32_t len=i-old_sz; 00510 //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz); 00511 max_string_length=CMath::max(max_string_length, len+overflow_len); 00512 00513 features[lines].slen=len; 00514 features[lines].string=SG_MALLOC(ST, len); 00515 00516 if (remap_to_bin) 00517 { 00518 for (int32_t j=0; j<overflow_len; j++) 00519 features[lines].string[j]=alpha->remap_to_bin(overflow[j]); 00520 for (int32_t j=0; j<len; j++) 00521 features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]); 00522 alpha->add_string_to_histogram(&dummy[old_sz], len); 00523 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen); 00524 } 00525 else 00526 { 00527 for (int32_t j=0; j<overflow_len; j++) 00528 features[lines].string[j]=overflow[j]; 00529 for (int32_t j=0; j<len; j++) 00530 features[lines].string[j+overflow_len]=dummy[old_sz+j]; 00531 alpha->add_string_to_histogram(&dummy[old_sz], len); 00532 alpha->add_string_to_histogram(features[lines].string, features[lines].slen); 00533 } 00534 00535 // clear overflow 00536 overflow_len=0; 00537 00538 //CMath::display_vector(features[lines].string, len); 00539 old_sz=i+1; 00540 lines++; 00541 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t"); 00542 } 00543 } 00544 for (size_t i=old_sz; i<sz; i++) 00545 overflow[i-old_sz]=dummy[i]; 00546 00547 overflow_len=sz-old_sz; 00548 } 00549 00550 if (alpha->check_alphabet_size() && alpha->check_alphabet()) 00551 { 00552 SG_INFO("file successfully read\n"); 00553 SG_INFO("max_string_length=%d\n", max_string_length); 00554 SG_INFO("num_strings=%d\n", num_vectors); 00555 } 00556 fclose(f); 00557 } 00558 00559 SG_FREE(dummy); 00560 00561 SG_UNREF(alphabet); 00562 00563 if (remap_to_bin) 00564 alphabet=alpha_bin; 00565 else 00566 alphabet=alpha; 00567 SG_REF(alphabet); 00568 num_symbols=alphabet->get_num_symbols(); 00569 } 00570 00571 template<class ST> bool CStringFeatures<ST>::load_fasta_file(const char* fname, bool ignore_invalid) 00572 { 00573 remove_all_subsets(); 00574 00575 int32_t i=0; 00576 uint64_t len=0; 00577 uint64_t offs=0; 00578 int32_t num=0; 00579 int32_t max_len=0; 00580 00581 CMemoryMappedFile<char> f(fname); 00582 00583 while (true) 00584 { 00585 char* s=f.get_line(len, offs); 00586 if (!s) 00587 break; 00588 00589 if (len>0 && s[0]=='>') 00590 num++; 00591 } 00592 00593 if (num==0) 00594 SG_ERROR("No fasta hunks (lines starting with '>') found\n"); 00595 00596 cleanup(); 00597 SG_UNREF(alphabet); 00598 alphabet=new CAlphabet(DNA); 00599 num_symbols=alphabet->get_num_symbols(); 00600 00601 SGString<ST>* strings=SG_MALLOC(SGString<ST>, num); 00602 offs=0; 00603 00604 for (i=0;i<num; i++) 00605 { 00606 uint64_t id_len=0; 00607 char* id=f.get_line(id_len, offs); 00608 00609 char* fasta=f.get_line(len, offs); 00610 char* s=fasta; 00611 int32_t fasta_len=0; 00612 int32_t spanned_lines=0; 00613 00614 while (true) 00615 { 00616 if (!s || len==0) 00617 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len); 00618 00619 if (s[0]=='>' || offs==f.get_size()) 00620 { 00621 offs-=len+1; // seek to beginning 00622 if (offs==f.get_size()) 00623 { 00624 SG_DEBUG("at EOF\n"); 00625 fasta_len+=len; 00626 } 00627 00628 len=fasta_len-spanned_lines; 00629 strings[i].string=SG_MALLOC(ST, len); 00630 strings[i].slen=len; 00631 00632 ST* str=strings[i].string; 00633 int32_t idx=0; 00634 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines); 00635 00636 for (int32_t j=0; j<fasta_len; j++) 00637 { 00638 if (fasta[j]=='\n') 00639 continue; 00640 00641 ST c=(ST) fasta[j]; 00642 00643 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j])) 00644 c=(ST) 'A'; 00645 00646 if (uint64_t(idx)>=len) 00647 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str); 00648 str[idx++]=c; 00649 } 00650 max_len=CMath::max(max_len, strings[i].slen); 00651 00652 00653 break; 00654 } 00655 00656 spanned_lines++; 00657 fasta_len+=len+1; // including '\n' 00658 s=f.get_line(len, offs); 00659 } 00660 } 00661 return set_features(strings, num, max_len); 00662 } 00663 00664 template<class ST> bool CStringFeatures<ST>::load_fastq_file(const char* fname, 00665 bool ignore_invalid, bool bitremap_in_single_string) 00666 { 00667 remove_all_subsets(); 00668 00669 CMemoryMappedFile<char> f(fname); 00670 00671 int32_t i=0; 00672 uint64_t len=0; 00673 uint64_t offs=0; 00674 00675 int32_t num=f.get_num_lines(); 00676 int32_t max_len=0; 00677 00678 if (num%4) 00679 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n"); 00680 num/=4; 00681 00682 cleanup(); 00683 SG_UNREF(alphabet); 00684 alphabet=new CAlphabet(DNA); 00685 00686 SGString<ST>* strings; 00687 00688 ST* str=NULL; 00689 if (bitremap_in_single_string) 00690 { 00691 strings=SG_MALLOC(SGString<ST>, 1); 00692 strings[0].string=SG_MALLOC(ST, num); 00693 strings[0].slen=num; 00694 f.get_line(len, offs); 00695 f.get_line(len, offs); 00696 order=len; 00697 max_len=num; 00698 offs=0; 00699 original_num_symbols=alphabet->get_num_symbols(); 00700 str=SG_MALLOC(ST, len); 00701 } 00702 else 00703 strings=SG_MALLOC(SGString<ST>, num); 00704 00705 for (i=0;i<num; i++) 00706 { 00707 if (!f.get_line(len, offs)) 00708 SG_ERROR("Error reading 'read' identifier in line %d", 4*i); 00709 00710 char* s=f.get_line(len, offs); 00711 if (!s || len==0) 00712 SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len); 00713 00714 if (bitremap_in_single_string) 00715 { 00716 if (len!=(uint64_t) order) 00717 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len); 00718 for (int32_t j=0; j<order; j++) 00719 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]); 00720 00721 strings[0].string[i]=embed_word(str, order); 00722 } 00723 else 00724 { 00725 strings[i].string=SG_MALLOC(ST, len); 00726 strings[i].slen=len; 00727 str=strings[i].string; 00728 00729 if (ignore_invalid) 00730 { 00731 for (uint64_t j=0; j<len; j++) 00732 { 00733 if (alphabet->is_valid((uint8_t) s[j])) 00734 str[j]= (ST) s[j]; 00735 else 00736 str[j]= (ST) 'A'; 00737 } 00738 } 00739 else 00740 { 00741 for (uint64_t j=0; j<len; j++) 00742 str[j]= (ST) s[j]; 00743 } 00744 max_len=CMath::max(max_len, (int32_t) len); 00745 } 00746 00747 00748 if (!f.get_line(len, offs)) 00749 SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2); 00750 00751 if (!f.get_line(len, offs)) 00752 SG_ERROR("Error reading 'read' quality in line %d", 4*i+3); 00753 } 00754 00755 if (bitremap_in_single_string) 00756 num=1; 00757 00758 num_vectors=num; 00759 max_string_length=max_len; 00760 features=strings; 00761 00762 return true; 00763 } 00764 00765 template<class ST> bool CStringFeatures<ST>::load_from_directory(char* dirname) 00766 { 00767 remove_all_subsets(); 00768 00769 struct dirent **namelist; 00770 int32_t n; 00771 00772 SGIO::set_dirname(dirname); 00773 00774 SG_DEBUG("dirname '%s'\n", dirname); 00775 00776 n=scandir(dirname, &namelist, &SGIO::filter, alphasort); 00777 if (n <= 0) 00778 { 00779 SG_ERROR("error calling scandir - no files found\n"); 00780 return false; 00781 } 00782 else 00783 { 00784 SGString<ST>* strings=NULL; 00785 00786 int32_t num=0; 00787 int32_t max_len=-1; 00788 00789 //usually n==num_vec, but it might not in race conditions 00790 //(file perms modified, file erased) 00791 strings=SG_MALLOC(SGString<ST>, n); 00792 00793 for (int32_t i=0; i<n; i++) 00794 { 00795 char* fname=SGIO::concat_filename(namelist[i]->d_name); 00796 00797 struct stat s; 00798 off_t filesize=0; 00799 00800 if (!stat(fname, &s) && s.st_size>0) 00801 { 00802 filesize=s.st_size/sizeof(ST); 00803 00804 FILE* f=fopen(fname, "ro"); 00805 if (f) 00806 { 00807 ST* str=SG_MALLOC(ST, filesize); 00808 SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize); 00809 if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize) 00810 SG_ERROR("failed to read file\n"); 00811 strings[num].string=str; 00812 strings[num].slen=filesize; 00813 max_len=CMath::max(max_len, strings[num].slen); 00814 00815 num++; 00816 fclose(f); 00817 } 00818 } 00819 else 00820 SG_ERROR("empty or non readable file \'%s\'\n", fname); 00821 00822 SG_FREE(namelist[i]); 00823 } 00824 SG_FREE(namelist); 00825 00826 if (num>0 && strings) 00827 { 00828 set_features(strings, num, max_len); 00829 return true; 00830 } 00831 } 00832 return false; 00833 } 00834 00835 template<class ST> void CStringFeatures<ST>::set_features(SGStringList<ST> feats) 00836 { 00837 set_features(feats.strings, feats.num_strings, feats.max_string_length); 00838 } 00839 00840 template<class ST> bool CStringFeatures<ST>::set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length) 00841 { 00842 if (m_subset_stack->has_subsets()) 00843 SG_ERROR("Cannot call set_features() with subset.\n"); 00844 00845 if (p_features) 00846 { 00847 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet()); 00848 00849 //compute histogram for char/byte 00850 for (int32_t i=0; i<p_num_vectors; i++) 00851 alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen); 00852 00853 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram()); 00854 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram()); 00855 00856 if (alpha->check_alphabet_size() && alpha->check_alphabet()) 00857 { 00858 cleanup(); 00859 SG_UNREF(alphabet); 00860 00861 alphabet=alpha; 00862 SG_REF(alphabet); 00863 00864 features=p_features; 00865 num_vectors=p_num_vectors; 00866 max_string_length=p_max_string_length; 00867 00868 return true; 00869 } 00870 else 00871 SG_UNREF(alpha); 00872 } 00873 00874 return false; 00875 } 00876 00877 template<class ST> bool CStringFeatures<ST>::append_features(CStringFeatures<ST>* sf) 00878 { 00879 ASSERT(sf); 00880 00881 if (m_subset_stack->has_subsets()) 00882 SG_ERROR("Cannot call set_features() with subset.\n"); 00883 00884 SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors()); 00885 00886 index_t sf_num_str=sf->get_num_vectors(); 00887 for (int32_t i=0; i<sf_num_str; i++) 00888 { 00889 int32_t real_i = sf->m_subset_stack->subset_idx_conversion(i); 00890 int32_t length=sf->features[real_i].slen; 00891 new_features[i].string=SG_MALLOC(ST, length); 00892 memcpy(new_features[i].string, sf->features[real_i].string, length); 00893 new_features[i].slen=length; 00894 } 00895 return append_features(new_features, sf_num_str, 00896 sf->max_string_length); 00897 } 00898 00899 template<class ST> bool CStringFeatures<ST>::append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length) 00900 { 00901 if (m_subset_stack->has_subsets()) 00902 SG_ERROR("Cannot call set_features() with subset.\n"); 00903 00904 if (!features) 00905 return set_features(p_features, p_num_vectors, p_max_string_length); 00906 00907 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet()); 00908 00909 //compute histogram for char/byte 00910 for (int32_t i=0; i<p_num_vectors; i++) 00911 alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen); 00912 00913 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram()); 00914 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram()); 00915 00916 if (alpha->check_alphabet_size() && alpha->check_alphabet()) 00917 { 00918 SG_UNREF(alpha); 00919 for (int32_t i=0; i<p_num_vectors; i++) 00920 alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen); 00921 00922 int32_t old_num_vectors=num_vectors; 00923 num_vectors=old_num_vectors+p_num_vectors; 00924 SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors); 00925 00926 for (int32_t i=0; i<num_vectors; i++) 00927 { 00928 if (i<old_num_vectors) 00929 { 00930 new_features[i].string=features[i].string; 00931 new_features[i].slen=features[i].slen; 00932 } 00933 else 00934 { 00935 new_features[i].string=p_features[i-old_num_vectors].string; 00936 new_features[i].slen=p_features[i-old_num_vectors].slen; 00937 } 00938 } 00939 SG_FREE(features); 00940 SG_FREE(p_features); // free now obsolete features 00941 00942 this->features=new_features; 00943 max_string_length=CMath::max(max_string_length, p_max_string_length); 00944 00945 return true; 00946 } 00947 SG_UNREF(alpha); 00948 00949 return false; 00950 } 00951 00952 template<class ST> SGStringList<ST> CStringFeatures<ST>::get_features() 00953 { 00954 SGStringList<ST> sl; 00955 00956 sl.strings=get_features(sl.num_strings, sl.max_string_length); 00957 return sl; 00958 } 00959 00960 template<class ST> SGString<ST>* CStringFeatures<ST>::get_features(int32_t& num_str, int32_t& max_str_len) 00961 { 00962 if (m_subset_stack->has_subsets()) 00963 SG_ERROR("get features() is not possible on subset"); 00964 00965 num_str=num_vectors; 00966 max_str_len=max_string_length; 00967 return features; 00968 } 00969 00970 template<class ST> SGString<ST>* CStringFeatures<ST>::copy_features(int32_t& num_str, int32_t& max_str_len) 00971 { 00972 ASSERT(num_vectors>0); 00973 00974 num_str=get_num_vectors(); 00975 max_str_len=max_string_length; 00976 SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str); 00977 00978 for (int32_t i=0; i<num_str; i++) 00979 { 00980 int32_t len; 00981 bool free_vec; 00982 ST* vec=get_feature_vector(i, len, free_vec); 00983 new_feat[i].string=SG_MALLOC(ST, len); 00984 new_feat[i].slen=len; 00985 memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST)); 00986 free_feature_vector(vec, i, free_vec); 00987 } 00988 00989 return new_feat; 00990 } 00991 00992 template<class ST> void CStringFeatures<ST>::get_features(SGString<ST>** dst, int32_t* num_str) 00993 { 00994 int32_t num_vec; 00995 int32_t max_str_len; 00996 *dst=copy_features(num_vec, max_str_len); 00997 *num_str=num_vec; 00998 } 00999 01000 template<class ST> bool CStringFeatures<ST>::load_compressed(char* src, bool decompress) 01001 { 01002 remove_all_subsets(); 01003 01004 FILE* file=NULL; 01005 01006 if (!(file=fopen(src, "r"))) 01007 return false; 01008 cleanup(); 01009 01010 // header shogun v0 01011 char id[4]; 01012 if (fread(&id[0], sizeof(char), 1, file)!=1) 01013 SG_ERROR("failed to read header"); 01014 ASSERT(id[0]=='S'); 01015 if (fread(&id[1], sizeof(char), 1, file)!=1) 01016 SG_ERROR("failed to read header"); 01017 ASSERT(id[1]=='G'); 01018 if (fread(&id[2], sizeof(char), 1, file)!=1) 01019 SG_ERROR("failed to read header"); 01020 ASSERT(id[2]=='V'); 01021 if (fread(&id[3], sizeof(char), 1, file)!=1) 01022 SG_ERROR("failed to read header"); 01023 ASSERT(id[3]=='0'); 01024 01025 //compression type 01026 uint8_t c; 01027 if (fread(&c, sizeof(uint8_t), 1, file)!=1) 01028 SG_ERROR("failed to read compression type"); 01029 CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c); 01030 //alphabet 01031 uint8_t a; 01032 delete alphabet; 01033 if (fread(&a, sizeof(uint8_t), 1, file)!=1) 01034 SG_ERROR("failed to read compression alphabet"); 01035 alphabet=new CAlphabet((EAlphabet) a); 01036 // number of vectors 01037 if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1) 01038 SG_ERROR("failed to read compression number of vectors"); 01039 ASSERT(num_vectors>0); 01040 // maximum string length 01041 if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1) 01042 SG_ERROR("failed to read maximum string length"); 01043 ASSERT(max_string_length>0); 01044 01045 features=SG_MALLOC(SGString<ST>, num_vectors); 01046 01047 // vectors 01048 for (int32_t i=0; i<num_vectors; i++) 01049 { 01050 // vector len compressed 01051 int32_t len_compressed; 01052 if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1) 01053 SG_ERROR("failed to read vector length compressed"); 01054 // vector len uncompressed 01055 int32_t len_uncompressed; 01056 if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1) 01057 SG_ERROR("failed to read vector length uncompressed"); 01058 01059 // vector raw data 01060 if (decompress) 01061 { 01062 features[i].string=SG_MALLOC(ST, len_uncompressed); 01063 features[i].slen=len_uncompressed; 01064 uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed); 01065 if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed) 01066 SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed); 01067 uint64_t uncompressed_size=len_uncompressed; 01068 uncompressed_size*=sizeof(ST); 01069 compressor->decompress(compressed, len_compressed, 01070 (uint8_t*) features[i].string, uncompressed_size); 01071 SG_FREE(compressed); 01072 ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST)); 01073 } 01074 else 01075 { 01076 int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST)); 01077 features[i].string=SG_MALLOC(ST, len_compressed+offs); 01078 features[i].slen=len_compressed+offs; 01079 int32_t* feat32ptr=((int32_t*) (features[i].string)); 01080 memset(features[i].string, 0, offs*sizeof(ST)); 01081 feat32ptr[0]=(int32_t) len_compressed; 01082 feat32ptr[1]=(int32_t) len_uncompressed; 01083 uint8_t* compressed=(uint8_t*) (&features[i].string[offs]); 01084 if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed) 01085 SG_ERROR("failed to read uncompressed data"); 01086 } 01087 } 01088 01089 delete compressor; 01090 fclose(file); 01091 01092 return false; 01093 } 01094 01095 template<class ST> bool CStringFeatures<ST>::save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level) 01096 { 01097 if (m_subset_stack->has_subsets()) 01098 SG_ERROR("save_compressed() is not possible on subset"); 01099 01100 FILE* file=NULL; 01101 01102 if (!(file=fopen(dest, "wb"))) 01103 return false; 01104 01105 CCompressor* compressor= new CCompressor(compression); 01106 01107 // header shogun v0 01108 const char* id="SGV0"; 01109 fwrite(&id[0], sizeof(char), 1, file); 01110 fwrite(&id[1], sizeof(char), 1, file); 01111 fwrite(&id[2], sizeof(char), 1, file); 01112 fwrite(&id[3], sizeof(char), 1, file); 01113 01114 //compression type 01115 uint8_t c=(uint8_t) compression; 01116 fwrite(&c, sizeof(uint8_t), 1, file); 01117 //alphabet 01118 uint8_t a=(uint8_t) alphabet->get_alphabet(); 01119 fwrite(&a, sizeof(uint8_t), 1, file); 01120 // number of vectors 01121 fwrite(&num_vectors, sizeof(int32_t), 1, file); 01122 // maximum string length 01123 fwrite(&max_string_length, sizeof(int32_t), 1, file); 01124 01125 // vectors 01126 for (int32_t i=0; i<num_vectors; i++) 01127 { 01128 int32_t len=-1; 01129 bool vfree; 01130 ST* vec=get_feature_vector(i, len, vfree); 01131 01132 uint8_t* compressed=NULL; 01133 uint64_t compressed_size=0; 01134 01135 compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST), 01136 compressed, compressed_size, level); 01137 01138 int32_t len_compressed=(int32_t) compressed_size; 01139 // vector len compressed in bytes 01140 fwrite(&len_compressed, sizeof(int32_t), 1, file); 01141 // vector len uncompressed in number of elements of type ST 01142 fwrite(&len, sizeof(int32_t), 1, file); 01143 // vector raw data 01144 fwrite(compressed, compressed_size, 1, file); 01145 SG_FREE(compressed); 01146 01147 free_feature_vector(vec, i, vfree); 01148 } 01149 01150 delete compressor; 01151 fclose(file); 01152 return true; 01153 } 01154 01155 template<class ST> int32_t CStringFeatures<ST>::get_size() const { return sizeof(ST); } 01156 01157 template<class ST> bool CStringFeatures<ST>::apply_preprocessor(bool force_preprocessing) 01158 { 01159 SG_DEBUG( "force: %d\n", force_preprocessing); 01160 01161 for (int32_t i=0; i<get_num_preprocessors(); i++) 01162 { 01163 if ( (!is_preprocessed(i) || force_preprocessing) ) 01164 { 01165 set_preprocessed(i); 01166 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i); 01167 SG_INFO( "preprocessing using preproc %s\n", p->get_name()); 01168 01169 if (!p->apply_to_string_features(this)) 01170 { 01171 SG_UNREF(p); 01172 return false; 01173 } 01174 else 01175 SG_UNREF(p); 01176 } 01177 } 01178 return true; 01179 } 01180 01181 template<class ST> int32_t CStringFeatures<ST>::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip) 01182 { 01183 if (m_subset_stack->has_subsets()) 01184 SG_NOTIMPLEMENTED; 01185 01186 ASSERT(step_size>0); 01187 ASSERT(window_size>0); 01188 ASSERT(num_vectors==1 || single_string); 01189 ASSERT(max_string_length>=window_size || 01190 (single_string && length_of_single_string>=window_size)); 01191 01192 //in case we are dealing with a single remapped string 01193 //allow remapping 01194 if (single_string) 01195 num_vectors= (length_of_single_string-window_size)/step_size + 1; 01196 else if (num_vectors==1) 01197 { 01198 num_vectors= (max_string_length-window_size)/step_size + 1; 01199 length_of_single_string=max_string_length; 01200 } 01201 01202 SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors); 01203 int32_t offs=0; 01204 for (int32_t i=0; i<num_vectors; i++) 01205 { 01206 f[i].string=&features[0].string[offs+skip]; 01207 f[i].slen=window_size-skip; 01208 offs+=step_size; 01209 } 01210 single_string=features[0].string; 01211 SG_FREE(features); 01212 features=f; 01213 max_string_length=window_size-skip; 01214 01215 return num_vectors; 01216 } 01217 01218 template<class ST> int32_t CStringFeatures<ST>::obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, 01219 int32_t skip) 01220 { 01221 if (m_subset_stack->has_subsets()) 01222 SG_NOTIMPLEMENTED; 01223 01224 ASSERT(positions); 01225 ASSERT(window_size>0); 01226 ASSERT(num_vectors==1 || single_string); 01227 ASSERT(max_string_length>=window_size || 01228 (single_string && length_of_single_string>=window_size)); 01229 01230 num_vectors= positions->get_num_elements(); 01231 ASSERT(num_vectors>0); 01232 01233 int32_t len; 01234 01235 //in case we are dealing with a single remapped string 01236 //allow remapping 01237 if (single_string) 01238 len=length_of_single_string; 01239 else 01240 { 01241 single_string=features[0].string; 01242 len=max_string_length; 01243 length_of_single_string=max_string_length; 01244 } 01245 01246 SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors); 01247 for (int32_t i=0; i<num_vectors; i++) 01248 { 01249 int32_t p=positions->get_element(i); 01250 01251 if (p>=0 && p<=len-window_size) 01252 { 01253 f[i].string=&features[0].string[p+skip]; 01254 f[i].slen=window_size-skip; 01255 } 01256 else 01257 { 01258 num_vectors=1; 01259 max_string_length=len; 01260 features[0].slen=len; 01261 single_string=NULL; 01262 SG_FREE(f); 01263 SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n", 01264 window_size, i, p, len); 01265 return -1; 01266 } 01267 } 01268 01269 SG_FREE(features); 01270 features=f; 01271 max_string_length=window_size-skip; 01272 01273 return num_vectors; 01274 } 01275 01276 template<class ST> bool CStringFeatures<ST>::obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01277 { 01278 return obtain_from_char_features(sf, start, p_order, gap, rev); 01279 } 01280 01281 template<class ST> bool CStringFeatures<ST>::have_same_length(int32_t len) 01282 { 01283 if (len!=-1) 01284 { 01285 if (len!=max_string_length) 01286 return false; 01287 } 01288 len=max_string_length; 01289 01290 index_t num_str=get_num_vectors(); 01291 for (int32_t i=0; i<num_str; i++) 01292 { 01293 if (get_vector_length(i)!=len) 01294 return false; 01295 } 01296 01297 return true; 01298 } 01299 01300 template<class ST> void CStringFeatures<ST>::embed_features(int32_t p_order) 01301 { 01302 if (m_subset_stack->has_subsets()) 01303 SG_NOTIMPLEMENTED; 01304 01305 ASSERT(alphabet->get_num_symbols_in_histogram() > 0); 01306 01307 order=p_order; 01308 original_num_symbols=alphabet->get_num_symbols(); 01309 int32_t max_val=alphabet->get_num_bits(); 01310 01311 if (p_order>1) 01312 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order); 01313 else 01314 num_symbols=original_num_symbols; 01315 01316 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols); 01317 01318 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) ) 01319 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val); 01320 01321 ST mask=0; 01322 for (int32_t i=0; i<p_order*max_val; i++) 01323 mask= (mask<<1) | ((ST) 1); 01324 01325 for (int32_t i=0; i<num_vectors; i++) 01326 { 01327 int32_t len=features[i].slen; 01328 01329 if (len < p_order) 01330 SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order); 01331 01332 ST* str=features[i].string; 01333 01334 // convert first word 01335 for (int32_t j=0; j<p_order; j++) 01336 str[j]=(ST) alphabet->remap_to_bin(str[j]); 01337 str[0]=embed_word(&str[0], p_order); 01338 01339 // convert the rest 01340 int32_t idx=0; 01341 for (int32_t j=p_order; j<len; j++) 01342 { 01343 str[j]=(ST) alphabet->remap_to_bin(str[j]); 01344 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask; 01345 idx++; 01346 } 01347 01348 features[i].slen=len-p_order+1; 01349 } 01350 01351 compute_symbol_mask_table(max_val); 01352 } 01353 01354 template<class ST> void CStringFeatures<ST>::compute_symbol_mask_table(int64_t max_val) 01355 { 01356 if (m_subset_stack->has_subsets()) 01357 SG_NOTIMPLEMENTED; 01358 01359 SG_FREE(symbol_mask_table); 01360 symbol_mask_table=SG_MALLOC(ST, 256); 01361 01362 uint64_t mask=0; 01363 for (int32_t i=0; i< (int64_t) max_val; i++) 01364 mask=(mask<<1) | 1; 01365 01366 for (int32_t i=0; i<256; i++) 01367 { 01368 uint8_t bits=(uint8_t) i; 01369 symbol_mask_table[i]=0; 01370 01371 for (int32_t j=0; j<8; j++) 01372 { 01373 if (bits & 1) 01374 symbol_mask_table[i]|=mask<<(max_val*j); 01375 01376 bits>>=1; 01377 } 01378 } 01379 } 01380 01381 template<class ST> void CStringFeatures<ST>::unembed_word(ST word, uint8_t* seq, int32_t len) 01382 { 01383 uint32_t nbits= (uint32_t) alphabet->get_num_bits(); 01384 01385 ST mask=0; 01386 for (uint32_t i=0; i<nbits; i++) 01387 mask=(mask<<1) | (ST) 1; 01388 01389 for (int32_t i=0; i<len; i++) 01390 { 01391 ST w=(word & mask); 01392 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w); 01393 word>>=nbits; 01394 } 01395 } 01396 01397 template<class ST> ST CStringFeatures<ST>::embed_word(ST* seq, int32_t len) 01398 { 01399 ST value=(ST) 0; 01400 uint32_t nbits= (uint32_t) alphabet->get_num_bits(); 01401 for (int32_t i=0; i<len; i++) 01402 { 01403 value<<=nbits; 01404 value|=seq[i]; 01405 } 01406 01407 return value; 01408 } 01409 01410 template<class ST> void CStringFeatures<ST>::determine_maximum_string_length() 01411 { 01412 max_string_length=0; 01413 index_t num_str=get_num_vectors(); 01414 01415 for (int32_t i=0; i<num_str; i++) 01416 { 01417 max_string_length=CMath::max(max_string_length, 01418 features[m_subset_stack->subset_idx_conversion(i)].slen); 01419 } 01420 } 01421 01422 template<class ST> ST* CStringFeatures<ST>::get_zero_terminated_string_copy(SGString<ST> str) 01423 { 01424 int32_t l=str.slen; 01425 ST* s=SG_MALLOC(ST, l+1); 01426 memcpy(s, str.string, sizeof(ST)*l); 01427 s[l]='\0'; 01428 return s; 01429 } 01430 01431 template<class ST> void CStringFeatures<ST>::set_feature_vector(int32_t num, ST* string, int32_t len) 01432 { 01433 ASSERT(features); 01434 ASSERT(num<get_num_vectors()); 01435 01436 int32_t real_num=m_subset_stack->subset_idx_conversion(num); 01437 01438 01439 features[real_num].slen=len ; 01440 features[real_num].string=string ; 01441 01442 max_string_length=CMath::max(len, max_string_length); 01443 } 01444 01445 template<class ST> void CStringFeatures<ST>::get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize) 01446 { 01447 int32_t nsym=get_num_symbols(); 01448 int32_t slen=get_max_vector_length(); 01449 int64_t sz=int64_t(nsym)*slen*sizeof(float64_t); 01450 float64_t* h= SG_MALLOC(float64_t, sz); 01451 memset(h, 0, sz); 01452 01453 float64_t* h_normalizer=SG_MALLOC(float64_t, slen); 01454 memset(h_normalizer, 0, slen*sizeof(float64_t)); 01455 int32_t num_str=get_num_vectors(); 01456 for (int32_t i=0; i<num_str; i++) 01457 { 01458 int32_t len; 01459 bool free_vec; 01460 ST* vec=get_feature_vector(i, len, free_vec); 01461 for (int32_t j=0; j<len; j++) 01462 { 01463 h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++; 01464 h_normalizer[j]++; 01465 } 01466 free_feature_vector(vec, i, free_vec); 01467 } 01468 01469 if (normalize) 01470 { 01471 for (int32_t i=0; i<slen; i++) 01472 { 01473 for (int32_t j=0; j<nsym; j++) 01474 { 01475 if (h_normalizer && h_normalizer[i]) 01476 h[int64_t(i)*nsym+j]/=h_normalizer[i]; 01477 } 01478 } 01479 } 01480 SG_FREE(h_normalizer); 01481 01482 *hist=h; 01483 *rows=nsym; 01484 *cols=slen; 01485 } 01486 01487 template<class ST> void CStringFeatures<ST>::create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec) 01488 { 01489 ASSERT(rows == get_num_symbols()); 01490 cleanup(); 01491 float64_t* randoms=SG_MALLOC(float64_t, cols); 01492 SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec); 01493 01494 for (int32_t i=0; i<num_vec; i++) 01495 { 01496 sf[i].string=SG_MALLOC(ST, cols); 01497 sf[i].slen=cols; 01498 01499 SGVector<float64_t>::random_vector(randoms, cols, 0.0, 1.0); 01500 01501 for (int32_t j=0; j<cols; j++) 01502 { 01503 float64_t lik=hist[int64_t(j)*rows+0]; 01504 01505 int32_t c; 01506 for (c=0; c<rows-1; c++) 01507 { 01508 if (randoms[j]<=lik) 01509 break; 01510 lik+=hist[int64_t(j)*rows+c+1]; 01511 } 01512 sf[i].string[j]=alphabet->remap_to_char(c); 01513 } 01514 } 01515 SG_FREE(randoms); 01516 set_features(sf, num_vec, cols); 01517 } 01518 01519 /* 01520 CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2) 01521 { 01522 int *s; 01523 int32_t nStr=get_num_vectors(); 01524 01525 int32_t nfeat=0; 01526 for (int32_t i=0; i < nStr; ++i) 01527 nfeat += get_vector_length[i] - d1 -d2; 01528 SGString<SSKFeature>* F= SG_MALLOC(SGString<SSKFeature>, nfeat); 01529 int32_t c=0; 01530 for (int32_t i=0; i < nStr; ++i) 01531 { 01532 int32_t len; 01533 bool free_vec; 01534 ST* S=get_feature_vector(vec_num, len, free_vec); 01535 free_feature_vector(vec, vec_num, free_vec); 01536 int32_t n=len - d1 - d2; 01537 s=S[i]; 01538 for (int32_t j=0; j < n; ++j) 01539 { 01540 F[c].feature1=s[j]; 01541 F[c].feature2=s[j+d1]; 01542 F[c].feature3=s[j+d1+d2]; 01543 F[c].group=i; 01544 c++; 01545 } 01546 } 01547 ASSERT(nfeat==c); 01548 return F; 01549 } 01550 01551 CStringFeatures<SSKFeature>* obtain_sssk_double_from_char(int **S, int *len, int nStr, int d1) 01552 { 01553 int i, j; 01554 int n, nfeat; 01555 int *group; 01556 int *features; 01557 int *s; 01558 int c; 01559 SSKFeatures *F; 01560 01561 nfeat=0; 01562 for (i=0; i < nStr; ++i) 01563 nfeat += len[i] - d1; 01564 group=(int *)SG_MALLOC(nfeat*sizeof(int)); 01565 features=(int *)SG_MALLOC(nfeat*2*sizeof(int *)); 01566 c=0; 01567 for (i=0; i < nStr; ++i) 01568 { 01569 n=len[i] - d1; 01570 s=S[i]; 01571 for (j=0; j < n; ++j) 01572 { 01573 features[c]=s[j]; 01574 features[c+nfeat]=s[j+d1]; 01575 group[c]=i; 01576 c++; 01577 } 01578 } 01579 if (nfeat!=c) 01580 printf("Something is wrong...\n"); 01581 F=(SSKFeatures *)SG_MALLOC(sizeof(SSKFeatures)); 01582 (*F).features=features; 01583 (*F).group=group; 01584 (*F).n=nfeat; 01585 return F; 01586 } 01587 */ 01588 01589 template<class ST> CFeatures* CStringFeatures<ST>::copy_subset(SGVector<index_t> indices) 01590 { 01591 /* string list to create new CStringFeatures from */ 01592 SGStringList<ST> list_copy(indices.vlen, max_string_length); 01593 01594 /* copy all features */ 01595 for (index_t i=0; i<indices.vlen; ++i) 01596 { 01597 /* index with respect to possible subset */ 01598 index_t real_idx=m_subset_stack->subset_idx_conversion(indices.vector[i]); 01599 01600 /* copy string */ 01601 SGString<ST> current_string=features[real_idx]; 01602 SGString<ST> string_copy(current_string.slen); 01603 memcpy(string_copy.string, current_string.string, 01604 current_string.slen*sizeof(ST)); 01605 list_copy.strings[i]=string_copy; 01606 } 01607 01608 /* create copy instance */ 01609 CStringFeatures* result=new CStringFeatures(list_copy, alphabet); 01610 01611 /* max string length may have changed */ 01612 result->determine_maximum_string_length(); 01613 01614 SG_REF(result); 01615 01616 return result; 01617 } 01618 01619 template<class ST> void CStringFeatures<ST>::subset_changed_post() 01620 { 01621 /* max string length has to be updated */ 01622 determine_maximum_string_length(); 01623 } 01624 01625 template<class ST> ST* CStringFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len) 01626 { 01627 ASSERT(features && num<get_num_vectors()); 01628 01629 int32_t real_num=m_subset_stack->subset_idx_conversion(num); 01630 01631 len=features[real_num].slen; 01632 if (len<=0) 01633 return NULL; 01634 01635 ST* target=SG_MALLOC(ST, len); 01636 memcpy(target, features[real_num].string, len*sizeof(ST)); 01637 return target; 01638 } 01639 01640 template<class ST> void CStringFeatures<ST>::init() 01641 { 01642 set_generic<ST>(); 01643 01644 alphabet=NULL; 01645 num_vectors=0; 01646 features=NULL; 01647 single_string=NULL; 01648 length_of_single_string=0; 01649 max_string_length=0; 01650 order=0; 01651 symbol_mask_table=0; 01652 preprocess_on_get=false; 01653 feature_cache=NULL; 01654 01655 m_parameters->add((CSGObject**) &alphabet, "alphabet"); 01656 m_parameters->add_vector(&features, &num_vectors, "features", 01657 "This contains the array of features."); 01658 m_parameters->add_vector(&single_string, 01659 &length_of_single_string, 01660 "single_string", 01661 "Created by sliding window."); 01662 m_parameters->add(&max_string_length, "max_string_length", 01663 "Length of longest string."); 01664 m_parameters->add(&num_symbols, "num_symbols", 01665 "Number of used symbols."); 01666 m_parameters->add(&original_num_symbols, "original_num_symbols", 01667 "Original number of used symbols."); 01668 m_parameters->add(&order, "order", 01669 "Order used in higher order mapping."); 01670 m_parameters->add(&preprocess_on_get, "preprocess_on_get", 01671 "Preprocess on-the-fly?"); 01672 01673 /* TODO M_PARAMETERS->ADD? 01674 * /// order used in higher order mapping 01675 * ST* symbol_mask_table; 01676 */ 01677 } 01678 01683 template<> EFeatureType CStringFeatures<bool>::get_feature_type() const 01684 { 01685 return F_BOOL; 01686 } 01687 01692 template<> EFeatureType CStringFeatures<char>::get_feature_type() const 01693 { 01694 return F_CHAR; 01695 } 01696 01701 template<> EFeatureType CStringFeatures<uint8_t>::get_feature_type() const 01702 { 01703 return F_BYTE; 01704 } 01705 01710 template<> EFeatureType CStringFeatures<int16_t>::get_feature_type() const 01711 { 01712 return F_SHORT; 01713 } 01714 01719 template<> EFeatureType CStringFeatures<uint16_t>::get_feature_type() const 01720 { 01721 return F_WORD; 01722 } 01723 01728 template<> EFeatureType CStringFeatures<int32_t>::get_feature_type() const 01729 { 01730 return F_INT; 01731 } 01732 01737 template<> EFeatureType CStringFeatures<uint32_t>::get_feature_type() const 01738 { 01739 return F_UINT; 01740 } 01741 01746 template<> EFeatureType CStringFeatures<int64_t>::get_feature_type() const 01747 { 01748 return F_LONG; 01749 } 01750 01755 template<> EFeatureType CStringFeatures<uint64_t>::get_feature_type() const 01756 { 01757 return F_ULONG; 01758 } 01759 01764 template<> EFeatureType CStringFeatures<float32_t>::get_feature_type() const 01765 { 01766 return F_SHORTREAL; 01767 } 01768 01773 template<> EFeatureType CStringFeatures<float64_t>::get_feature_type() const 01774 { 01775 return F_DREAL; 01776 } 01777 01782 template<> EFeatureType CStringFeatures<floatmax_t>::get_feature_type() const 01783 { 01784 return F_LONGREAL; 01785 } 01786 01787 template<> bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask) 01788 { 01789 return symbol; 01790 } 01791 template<> float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask) 01792 { 01793 return symbol; 01794 } 01795 template<> float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask) 01796 { 01797 return symbol; 01798 } 01799 template<> floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask) 01800 { 01801 return symbol; 01802 } 01803 01804 template<> bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount) 01805 { 01806 return false; 01807 } 01808 template<> float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount) 01809 { 01810 return 0; 01811 } 01812 template<> float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount) 01813 { 01814 return 0; 01815 } 01816 template<> floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount) 01817 { 01818 return 0; 01819 } 01820 01821 template<> bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount) 01822 { 01823 return symbol; 01824 } 01825 template<> float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount) 01826 { 01827 return symbol; 01828 } 01829 template<> float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount) 01830 { 01831 return symbol; 01832 } 01833 template<> floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount) 01834 { 01835 return symbol; 01836 } 01837 01838 #ifndef SUNOS 01839 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01840 { 01841 return false; 01842 } 01843 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01844 { 01845 return false; 01846 } 01847 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01848 { 01849 return false; 01850 } 01851 #endif 01852 01853 template<> void CStringFeatures<float32_t>::embed_features(int32_t p_order) 01854 { 01855 } 01856 template<> void CStringFeatures<float64_t>::embed_features(int32_t p_order) 01857 { 01858 } 01859 template<> void CStringFeatures<floatmax_t>::embed_features(int32_t p_order) 01860 { 01861 } 01862 01863 template<> void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val) 01864 { 01865 } 01866 template<> void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val) 01867 { 01868 } 01869 template<> void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val) 01870 { 01871 } 01872 01873 template<> float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len) 01874 { 01875 return 0; 01876 } 01877 template<> float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len) 01878 { 01879 return 0; 01880 } 01881 template<> floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len) 01882 { 01883 return 0; 01884 } 01885 01886 template<> void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len) 01887 { 01888 } 01889 template<> void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len) 01890 { 01891 } 01892 template<> void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len) 01893 { 01894 } 01895 #define LOAD(f_load, sg_type) \ 01896 template<> void CStringFeatures<sg_type>::load(CFile* loader) \ 01897 { \ 01898 SG_INFO( "loading...\n"); \ 01899 \ 01900 SG_SET_LOCALE_C; \ 01901 SGString<sg_type>* strs; \ 01902 int32_t num_str; \ 01903 int32_t max_len; \ 01904 loader->f_load(strs, num_str, max_len); \ 01905 set_features(strs, num_str, max_len); \ 01906 SG_RESET_LOCALE; \ 01907 } 01908 01909 LOAD(get_string_list, bool) 01910 LOAD(get_string_list, char) 01911 LOAD(get_int8_string_list, int8_t) 01912 LOAD(get_string_list, uint8_t) 01913 LOAD(get_string_list, int16_t) 01914 LOAD(get_string_list, uint16_t) 01915 LOAD(get_string_list, int32_t) 01916 LOAD(get_uint_string_list, uint32_t) 01917 LOAD(get_long_string_list, int64_t) 01918 LOAD(get_ulong_string_list, uint64_t) 01919 LOAD(get_string_list, float32_t) 01920 LOAD(get_string_list, float64_t) 01921 LOAD(get_longreal_string_list, floatmax_t) 01922 #undef LOAD 01923 01924 #define SAVE(f_write, sg_type) \ 01925 template<> void CStringFeatures<sg_type>::save(CFile* writer) \ 01926 { \ 01927 if (m_subset_stack->has_subsets()) \ 01928 SG_ERROR("save() is not possible on subset"); \ 01929 SG_SET_LOCALE_C; \ 01930 ASSERT(writer); \ 01931 writer->f_write(features, num_vectors); \ 01932 SG_RESET_LOCALE; \ 01933 } 01934 01935 SAVE(set_string_list, bool) 01936 SAVE(set_string_list, char) 01937 SAVE(set_int8_string_list, int8_t) 01938 SAVE(set_string_list, uint8_t) 01939 SAVE(set_string_list, int16_t) 01940 SAVE(set_string_list, uint16_t) 01941 SAVE(set_string_list, int32_t) 01942 SAVE(set_uint_string_list, uint32_t) 01943 SAVE(set_long_string_list, int64_t) 01944 SAVE(set_ulong_string_list, uint64_t) 01945 SAVE(set_string_list, float32_t) 01946 SAVE(set_string_list, float64_t) 01947 SAVE(set_longreal_string_list, floatmax_t) 01948 #undef SAVE 01949 01950 template <class ST> template <class CT> 01951 bool CStringFeatures<ST>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, 01952 int32_t p_order, int32_t gap, bool rev) 01953 { 01954 remove_all_subsets(); 01955 ASSERT(sf); 01956 01957 CAlphabet* alpha=sf->get_alphabet(); 01958 ASSERT(alpha->get_num_symbols_in_histogram() > 0); 01959 01960 this->order=p_order; 01961 cleanup(); 01962 01963 num_vectors=sf->get_num_vectors(); 01964 ASSERT(num_vectors>0); 01965 max_string_length=sf->get_max_vector_length()-start; 01966 features=SG_MALLOC(SGString<ST>, num_vectors); 01967 01968 SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(), 01969 alpha->get_num_symbols_in_histogram()); 01970 01971 for (int32_t i=0; i<num_vectors; i++) 01972 { 01973 int32_t len=-1; 01974 bool vfree; 01975 CT* c=sf->get_feature_vector(i, len, vfree); 01976 ASSERT(!vfree); // won't work when preprocessors are attached 01977 01978 features[i].string=SG_MALLOC(ST, len); 01979 features[i].slen=len; 01980 01981 ST* str=features[i].string; 01982 for (int32_t j=0; j<len; j++) 01983 str[j]=(ST) alpha->remap_to_bin(c[j]); 01984 } 01985 01986 original_num_symbols=alpha->get_num_symbols(); 01987 int32_t max_val=alpha->get_num_bits(); 01988 01989 SG_UNREF(alpha); 01990 01991 if (p_order>1) 01992 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order); 01993 else 01994 num_symbols=original_num_symbols; 01995 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols); 01996 01997 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) ) 01998 { 01999 SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val); 02000 return false; 02001 } 02002 02003 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ; 02004 for (int32_t line=0; line<num_vectors; line++) 02005 { 02006 int32_t len=0; 02007 bool vfree; 02008 ST* fv=get_feature_vector(line, len, vfree); 02009 ASSERT(!vfree); // won't work when preprocessors are attached 02010 02011 if (rev) 02012 CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap); 02013 else 02014 CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap); 02015 02016 /* fix the length of the string -- hacky */ 02017 features[line].slen-=start+gap ; 02018 if (features[line].slen<0) 02019 features[line].slen=0 ; 02020 } 02021 02022 compute_symbol_mask_table(max_val); 02023 02024 return true; 02025 } 02026 02027 template class CStringFeatures<bool>; 02028 template class CStringFeatures<char>; 02029 template class CStringFeatures<int8_t>; 02030 template class CStringFeatures<uint8_t>; 02031 template class CStringFeatures<int16_t>; 02032 template class CStringFeatures<uint16_t>; 02033 template class CStringFeatures<int32_t>; 02034 template class CStringFeatures<uint32_t>; 02035 template class CStringFeatures<int64_t>; 02036 template class CStringFeatures<uint64_t>; 02037 template class CStringFeatures<float32_t>; 02038 template class CStringFeatures<float64_t>; 02039 template class CStringFeatures<floatmax_t>; 02040 02041 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02042 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02043 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02044 02045 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02046 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02047 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02048 }