SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
StringFeatures.cpp
Go to the documentation of this file.
00001 #include <shogun/features/StringFeatures.h>
00002 #include <shogun/preprocessor/Preprocessor.h>
00003 #include <shogun/preprocessor/StringPreprocessor.h>
00004 #include <shogun/io/MemoryMappedFile.h>
00005 #include <shogun/io/SGIO.h>
00006 #include <shogun/mathematics/Math.h>
00007 #include <shogun/base/Parameter.h>
00008 
00009 #include <sys/types.h>
00010 #include <sys/stat.h>
00011 #include <dirent.h>
00012 #include <stdio.h>
00013 #include <stdlib.h>
00014 #include <unistd.h>
00015 
00016 
00017 namespace shogun
00018 {
00019 
00020 template<class ST> CStringFeatures<ST>::CStringFeatures() : CFeatures(0)
00021 {
00022     init();
00023     alphabet=new CAlphabet();
00024 }
00025 
00026 template<class ST> CStringFeatures<ST>::CStringFeatures(EAlphabet alpha) : CFeatures(0)
00027 {
00028     init();
00029 
00030     alphabet=new CAlphabet(alpha);
00031     SG_REF(alphabet);
00032     num_symbols=alphabet->get_num_symbols();
00033     original_num_symbols=num_symbols;
00034 }
00035 
00036 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha)
00037 : CFeatures(0)
00038 {
00039     init();
00040 
00041     alphabet=new CAlphabet(alpha);
00042     SG_REF(alphabet);
00043     num_symbols=alphabet->get_num_symbols();
00044     original_num_symbols=num_symbols;
00045     set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00046 }
00047 
00048 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha)
00049 : CFeatures(0)
00050 {
00051     init();
00052 
00053     alphabet=new CAlphabet(alpha);
00054     SG_REF(alphabet);
00055     num_symbols=alphabet->get_num_symbols();
00056     original_num_symbols=num_symbols;
00057     set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00058 }
00059 
00060 template<class ST> CStringFeatures<ST>::CStringFeatures(CAlphabet* alpha)
00061 : CFeatures(0)
00062 {
00063     init();
00064 
00065     ASSERT(alpha);
00066     SG_REF(alpha);
00067     alphabet=alpha;
00068     num_symbols=alphabet->get_num_symbols();
00069     original_num_symbols=num_symbols;
00070 }
00071 
00072 template<class ST> CStringFeatures<ST>::CStringFeatures(const CStringFeatures & orig)
00073 : CFeatures(orig), num_vectors(orig.num_vectors),
00074     single_string(orig.single_string),
00075     length_of_single_string(orig.length_of_single_string),
00076     max_string_length(orig.max_string_length),
00077     num_symbols(orig.num_symbols),
00078     original_num_symbols(orig.original_num_symbols),
00079     order(orig.order), preprocess_on_get(false),
00080     feature_cache(NULL)
00081 {
00082     init();
00083 
00084     ASSERT(orig.single_string == NULL); //not implemented
00085 
00086     alphabet=orig.alphabet;
00087     SG_REF(alphabet);
00088 
00089     if (orig.features)
00090     {
00091         features=SG_MALLOC(SGString<ST>, orig.num_vectors);
00092 
00093         for (int32_t i=0; i<num_vectors; i++)
00094         {
00095             features[i].string=SG_MALLOC(ST, orig.features[i].slen);
00096             features[i].slen=orig.features[i].slen;
00097             memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen);
00098         }
00099     }
00100 
00101     if (orig.symbol_mask_table)
00102     {
00103         symbol_mask_table=SG_MALLOC(ST, 256);
00104         for (int32_t i=0; i<256; i++)
00105             symbol_mask_table[i]=orig.symbol_mask_table[i];
00106     }
00107 
00108     m_subset_stack=orig.m_subset_stack;
00109     SG_REF(m_subset_stack);
00110 }
00111 
00112 template<class ST> CStringFeatures<ST>::CStringFeatures(CFile* loader, EAlphabet alpha)
00113 : CFeatures(loader), num_vectors(0),
00114   features(NULL), single_string(NULL), length_of_single_string(0),
00115   max_string_length(0), order(0),
00116   symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
00117 {
00118     init();
00119 
00120     alphabet=new CAlphabet(alpha);
00121     SG_REF(alphabet);
00122     num_symbols=alphabet->get_num_symbols();
00123     original_num_symbols=num_symbols;
00124     load(loader);
00125 }
00126 
00127 template<class ST> CStringFeatures<ST>::~CStringFeatures()
00128 {
00129     cleanup();
00130 
00131     SG_UNREF(alphabet);
00132 }
00133 
00134 template<class ST> void CStringFeatures<ST>::cleanup()
00135 {
00136     remove_all_subsets();
00137 
00138     if (single_string)
00139     {
00140         SG_FREE(single_string);
00141         single_string=NULL;
00142     }
00143     else
00144         cleanup_feature_vectors(0, num_vectors-1);
00145 
00146     num_vectors=0;
00147     SG_FREE(features);
00148     SG_FREE(symbol_mask_table);
00149     features=NULL;
00150     symbol_mask_table=NULL;
00151 
00152     /* start with a fresh alphabet, but instead of emptying the histogram
00153      * create a new object (to leave the alphabet object alone if it is used
00154      * by others)
00155      */
00156     CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00157     SG_UNREF(alphabet);
00158     alphabet=alpha;
00159     SG_REF(alphabet);
00160 }
00161 
00162 template<class ST> void CStringFeatures<ST>::cleanup_feature_vector(int32_t num)
00163 {
00164     ASSERT(num<get_num_vectors());
00165 
00166     if (features)
00167     {
00168         int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00169         SG_FREE(features[real_num].string);
00170         features[real_num].string=NULL;
00171         features[real_num].slen=0;
00172 
00173         determine_maximum_string_length();
00174     }
00175 }
00176 
00177 template<class ST> void CStringFeatures<ST>::cleanup_feature_vectors(int32_t start, int32_t stop)
00178 {
00179     if (features && get_num_vectors())
00180     {
00181         ASSERT(start<get_num_vectors());
00182         ASSERT(stop<get_num_vectors());
00183 
00184         for (int32_t i=start; i<=stop; i++)
00185         {
00186             int32_t real_num=m_subset_stack->subset_idx_conversion(i);
00187             SG_FREE(features[real_num].string);
00188             features[real_num].string=NULL;
00189             features[real_num].slen=0;
00190         }
00191         determine_maximum_string_length();
00192     }
00193 }
00194 
00195 template<class ST> EFeatureClass CStringFeatures<ST>::get_feature_class() const { return C_STRING; }
00196 
00197 template<class ST> EFeatureType CStringFeatures<ST>::get_feature_type() const { return F_UNKNOWN; }
00198 
00199 template<class ST> CAlphabet* CStringFeatures<ST>::get_alphabet()
00200 {
00201     SG_REF(alphabet);
00202     return alphabet;
00203 }
00204 
00205 template<class ST> CFeatures* CStringFeatures<ST>::duplicate() const
00206 {
00207     return new CStringFeatures<ST>(*this);
00208 }
00209 
00210 template<class ST> SGVector<ST> CStringFeatures<ST>::get_feature_vector(int32_t num)
00211 {
00212     ASSERT(features);
00213     if (num>=get_num_vectors())
00214     {
00215         SG_ERROR("Index out of bounds (number of strings %d, you "
00216                 "requested %d)\n", get_num_vectors(), num);
00217     }
00218 
00219     int32_t l;
00220     bool free_vec;
00221     ST* vec=get_feature_vector(num, l, free_vec);
00222     ST* dst=SG_MALLOC(ST, l);
00223     memcpy(dst, vec, l*sizeof(ST));
00224     free_feature_vector(vec, num, free_vec);
00225     return SGVector<ST>(dst, l, true);
00226 }
00227 
00228 template<class ST> void CStringFeatures<ST>::set_feature_vector(SGVector<ST> vector, int32_t num)
00229 {
00230     ASSERT(features);
00231 
00232     if (m_subset_stack->has_subsets())
00233         SG_ERROR("A subset is set, cannot set feature vector\n");
00234 
00235     if (num>=num_vectors)
00236     {
00237         SG_ERROR("Index out of bounds (number of strings %d, you "
00238                 "requested %d)\n", num_vectors, num);
00239     }
00240 
00241     if (vector.vlen<=0)
00242         SG_ERROR("String has zero or negative length\n");
00243 
00244     cleanup_feature_vector(num);
00245     features[num].slen=vector.vlen;
00246     features[num].string=SG_MALLOC(ST, vector.vlen);
00247     memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST));
00248 
00249     determine_maximum_string_length();
00250 }
00251 
00252 template<class ST> void CStringFeatures<ST>::enable_on_the_fly_preprocessing()
00253 {
00254     preprocess_on_get=true;
00255 }
00256 
00257 template<class ST> void CStringFeatures<ST>::disable_on_the_fly_preprocessing()
00258 {
00259     preprocess_on_get=false;
00260 }
00261 
00262 template<class ST> ST* CStringFeatures<ST>::get_feature_vector(int32_t num, int32_t& len, bool& dofree)
00263 {
00264     ASSERT(features);
00265     if (num>=get_num_vectors())
00266         SG_ERROR("Requested feature vector with index %d while total num is", num, get_num_vectors());
00267 
00268     int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00269 
00270     if (!preprocess_on_get)
00271     {
00272         dofree=false;
00273         len=features[real_num].slen;
00274         return features[real_num].string;
00275     }
00276     else
00277     {
00278         SG_DEBUG( "computing feature vector!\n") ;
00279         ST* feat=compute_feature_vector(num, len);
00280         dofree=true;
00281 
00282         if (get_num_preprocessors())
00283         {
00284             ST* tmp_feat_before=feat;
00285 
00286             for (int32_t i=0; i<get_num_preprocessors(); i++)
00287             {
00288                 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
00289                 feat=p->apply_to_string(tmp_feat_before, len);
00290                 SG_UNREF(p);
00291                 SG_FREE(tmp_feat_before);
00292                 tmp_feat_before=feat;
00293             }
00294         }
00295         // TODO: implement caching
00296         return feat;
00297     }
00298 }
00299 
00300 template<class ST> CStringFeatures<ST>* CStringFeatures<ST>::get_transposed()
00301 {
00302     int32_t num_feat;
00303     int32_t num_vec;
00304     SGString<ST>* s=get_transposed(num_feat, num_vec);
00305     SGStringList<ST> string_list;
00306     string_list.strings = s;
00307     string_list.num_strings = num_vec;
00308     string_list.max_string_length = num_feat;
00309 
00310     return new CStringFeatures<ST>(string_list, alphabet);
00311 }
00312 
00313 template<class ST> SGString<ST>* CStringFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
00314 {
00315     num_feat=get_num_vectors();
00316     num_vec=get_max_vector_length();
00317     ASSERT(have_same_length());
00318 
00319     SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
00320             int64_t(num_feat)*num_vec);
00321 
00322     SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
00323 
00324     for (int32_t i=0; i<num_vec; i++)
00325     {
00326         sf[i].string=SG_MALLOC(ST, num_feat);
00327         sf[i].slen=num_feat;
00328     }
00329 
00330     for (int32_t i=0; i<num_feat; i++)
00331     {
00332         int32_t len=0;
00333         bool free_vec=false;
00334         ST* vec=get_feature_vector(i, len, free_vec);
00335 
00336         for (int32_t j=0; j<num_vec; j++)
00337             sf[j].string[i]=vec[j];
00338 
00339         free_feature_vector(vec, i, free_vec);
00340     }
00341     return sf;
00342 }
00343 
00344 template<class ST> void CStringFeatures<ST>::free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
00345 {
00346     if (num>=get_num_vectors())
00347     {
00348         SG_ERROR(
00349             "Trying to access string[%d] but num_str=%d\n", num,
00350             get_num_vectors());
00351     }
00352 
00353     int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00354 
00355     if (feature_cache)
00356         feature_cache->unlock_entry(real_num);
00357 
00358     if (dofree)
00359         SG_FREE(feat_vec);
00360 }
00361 
00362 template<class ST> void CStringFeatures<ST>::free_feature_vector(SGVector<ST> feat_vec, int32_t num)
00363 {
00364     if (num>=get_num_vectors())
00365     {
00366         SG_ERROR(
00367             "Trying to access string[%d] but num_str=%d\n", num,
00368             get_num_vectors());
00369     }
00370 
00371     int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00372 
00373     if (feature_cache)
00374         feature_cache->unlock_entry(real_num);
00375 }
00376 
00377 template<class ST> ST CStringFeatures<ST>::get_feature(int32_t vec_num, int32_t feat_num)
00378 {
00379     ASSERT(vec_num<get_num_vectors());
00380 
00381     int32_t len;
00382     bool free_vec;
00383     ST* vec=get_feature_vector(vec_num, len, free_vec);
00384     ASSERT(feat_num<len);
00385     ST result=vec[feat_num];
00386     free_feature_vector(vec, vec_num, free_vec);
00387 
00388     return result;
00389 }
00390 
00391 template<class ST> int32_t CStringFeatures<ST>::get_vector_length(int32_t vec_num)
00392 {
00393     ASSERT(vec_num<get_num_vectors());
00394 
00395     int32_t len;
00396     bool free_vec;
00397     ST* vec=get_feature_vector(vec_num, len, free_vec);
00398     free_feature_vector(vec, vec_num, free_vec);
00399     return len;
00400 }
00401 
00402 template<class ST> int32_t CStringFeatures<ST>::get_max_vector_length()
00403 {
00404     return max_string_length;
00405 }
00406 
00407 template<class ST> int32_t CStringFeatures<ST>::get_num_vectors() const
00408 {
00409     return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
00410 }
00411 
00412 template<class ST> floatmax_t CStringFeatures<ST>::get_num_symbols() { return num_symbols; }
00413 
00414 template<class ST> floatmax_t CStringFeatures<ST>::get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00415 
00416 template<class ST> floatmax_t CStringFeatures<ST>::get_original_num_symbols() { return original_num_symbols; }
00417 
00418 template<class ST> int32_t CStringFeatures<ST>::get_order() { return order; }
00419 
00420 template<class ST> ST CStringFeatures<ST>::get_masked_symbols(ST symbol, uint8_t mask)
00421 {
00422     ASSERT(symbol_mask_table);
00423     return symbol_mask_table[mask] & symbol;
00424 }
00425 
00426 template<class ST> ST CStringFeatures<ST>::shift_offset(ST offset, int32_t amount)
00427 {
00428     ASSERT(alphabet);
00429     return (offset << (amount*alphabet->get_num_bits()));
00430 }
00431 
00432 template<class ST> ST CStringFeatures<ST>::shift_symbol(ST symbol, int32_t amount)
00433 {
00434     ASSERT(alphabet);
00435     return (symbol >> (amount*alphabet->get_num_bits()));
00436 }
00437 
00438 template<class ST> void CStringFeatures<ST>::load_ascii_file(char* fname, bool remap_to_bin,
00439         EAlphabet ascii_alphabet, EAlphabet binary_alphabet)
00440 {
00441     remove_all_subsets();
00442 
00443     size_t blocksize=1024*1024;
00444     size_t required_blocksize=0;
00445     uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00446     uint8_t* overflow=NULL;
00447     int32_t overflow_len=0;
00448 
00449     cleanup();
00450 
00451     CAlphabet* alpha=new CAlphabet(ascii_alphabet);
00452     CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
00453 
00454     FILE* f=fopen(fname, "ro");
00455 
00456     if (f)
00457     {
00458         num_vectors=0;
00459         max_string_length=0;
00460 
00461         SG_INFO("counting line numbers in file %s\n", fname);
00462         size_t block_offs=0;
00463         size_t old_block_offs=0;
00464         fseek(f, 0, SEEK_END);
00465         size_t fsize=ftell(f);
00466         rewind(f);
00467 
00468         if (blocksize>fsize)
00469             blocksize=fsize;
00470 
00471         SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
00472 
00473         size_t sz=blocksize;
00474         while (sz == blocksize)
00475         {
00476             sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00477             for (size_t i=0; i<sz; i++)
00478             {
00479                 block_offs++;
00480                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00481                 {
00482                     num_vectors++;
00483                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00484                     old_block_offs=block_offs;
00485                 }
00486             }
00487             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00488         }
00489 
00490         SG_INFO("found %d strings\n", num_vectors);
00491         SG_FREE(dummy);
00492         blocksize=required_blocksize;
00493         dummy=SG_MALLOC(uint8_t, blocksize);
00494         overflow=SG_MALLOC(uint8_t, blocksize);
00495         features=SG_MALLOC(SGString<ST>, num_vectors);
00496 
00497         rewind(f);
00498         sz=blocksize;
00499         int32_t lines=0;
00500         while (sz == blocksize)
00501         {
00502             sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00503 
00504             size_t old_sz=0;
00505             for (size_t i=0; i<sz; i++)
00506             {
00507                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00508                 {
00509                     int32_t len=i-old_sz;
00510                     //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz);
00511                     max_string_length=CMath::max(max_string_length, len+overflow_len);
00512 
00513                     features[lines].slen=len;
00514                     features[lines].string=SG_MALLOC(ST, len);
00515 
00516                     if (remap_to_bin)
00517                     {
00518                         for (int32_t j=0; j<overflow_len; j++)
00519                             features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00520                         for (int32_t j=0; j<len; j++)
00521                             features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00522                         alpha->add_string_to_histogram(&dummy[old_sz], len);
00523                         alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen);
00524                     }
00525                     else
00526                     {
00527                         for (int32_t j=0; j<overflow_len; j++)
00528                             features[lines].string[j]=overflow[j];
00529                         for (int32_t j=0; j<len; j++)
00530                             features[lines].string[j+overflow_len]=dummy[old_sz+j];
00531                         alpha->add_string_to_histogram(&dummy[old_sz], len);
00532                         alpha->add_string_to_histogram(features[lines].string, features[lines].slen);
00533                     }
00534 
00535                     // clear overflow
00536                     overflow_len=0;
00537 
00538                     //CMath::display_vector(features[lines].string, len);
00539                     old_sz=i+1;
00540                     lines++;
00541                     SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00542                 }
00543             }
00544             for (size_t i=old_sz; i<sz; i++)
00545                 overflow[i-old_sz]=dummy[i];
00546 
00547             overflow_len=sz-old_sz;
00548         }
00549 
00550         if (alpha->check_alphabet_size() && alpha->check_alphabet())
00551         {
00552             SG_INFO("file successfully read\n");
00553             SG_INFO("max_string_length=%d\n", max_string_length);
00554             SG_INFO("num_strings=%d\n", num_vectors);
00555         }
00556         fclose(f);
00557     }
00558 
00559     SG_FREE(dummy);
00560 
00561     SG_UNREF(alphabet);
00562 
00563     if (remap_to_bin)
00564         alphabet=alpha_bin;
00565     else
00566         alphabet=alpha;
00567     SG_REF(alphabet);
00568     num_symbols=alphabet->get_num_symbols();
00569 }
00570 
00571 template<class ST> bool CStringFeatures<ST>::load_fasta_file(const char* fname, bool ignore_invalid)
00572 {
00573     remove_all_subsets();
00574 
00575     int32_t i=0;
00576     uint64_t len=0;
00577     uint64_t offs=0;
00578     int32_t num=0;
00579     int32_t max_len=0;
00580 
00581     CMemoryMappedFile<char> f(fname);
00582 
00583     while (true)
00584     {
00585         char* s=f.get_line(len, offs);
00586         if (!s)
00587             break;
00588 
00589         if (len>0 && s[0]=='>')
00590             num++;
00591     }
00592 
00593     if (num==0)
00594         SG_ERROR("No fasta hunks (lines starting with '>') found\n");
00595 
00596     cleanup();
00597     SG_UNREF(alphabet);
00598     alphabet=new CAlphabet(DNA);
00599     num_symbols=alphabet->get_num_symbols();
00600 
00601     SGString<ST>* strings=SG_MALLOC(SGString<ST>, num);
00602     offs=0;
00603 
00604     for (i=0;i<num; i++)
00605     {
00606         uint64_t id_len=0;
00607         char* id=f.get_line(id_len, offs);
00608 
00609         char* fasta=f.get_line(len, offs);
00610         char* s=fasta;
00611         int32_t fasta_len=0;
00612         int32_t spanned_lines=0;
00613 
00614         while (true)
00615         {
00616             if (!s || len==0)
00617                 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
00618 
00619             if (s[0]=='>' || offs==f.get_size())
00620             {
00621                 offs-=len+1; // seek to beginning
00622                 if (offs==f.get_size())
00623                 {
00624                     SG_DEBUG("at EOF\n");
00625                     fasta_len+=len;
00626                 }
00627 
00628                 len=fasta_len-spanned_lines;
00629                 strings[i].string=SG_MALLOC(ST, len);
00630                 strings[i].slen=len;
00631 
00632                 ST* str=strings[i].string;
00633                 int32_t idx=0;
00634                 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
00635 
00636                 for (int32_t j=0; j<fasta_len; j++)
00637                 {
00638                     if (fasta[j]=='\n')
00639                         continue;
00640 
00641                     ST c=(ST) fasta[j];
00642 
00643                     if (ignore_invalid  && !alphabet->is_valid((uint8_t) fasta[j]))
00644                         c=(ST) 'A';
00645 
00646                     if (uint64_t(idx)>=len)
00647                         SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
00648                     str[idx++]=c;
00649                 }
00650                 max_len=CMath::max(max_len, strings[i].slen);
00651 
00652 
00653                 break;
00654             }
00655 
00656             spanned_lines++;
00657             fasta_len+=len+1; // including '\n'
00658             s=f.get_line(len, offs);
00659         }
00660     }
00661     return set_features(strings, num, max_len);
00662 }
00663 
00664 template<class ST> bool CStringFeatures<ST>::load_fastq_file(const char* fname,
00665         bool ignore_invalid, bool bitremap_in_single_string)
00666 {
00667     remove_all_subsets();
00668 
00669     CMemoryMappedFile<char> f(fname);
00670 
00671     int32_t i=0;
00672     uint64_t len=0;
00673     uint64_t offs=0;
00674 
00675     int32_t num=f.get_num_lines();
00676     int32_t max_len=0;
00677 
00678     if (num%4)
00679         SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
00680     num/=4;
00681 
00682     cleanup();
00683     SG_UNREF(alphabet);
00684     alphabet=new CAlphabet(DNA);
00685 
00686     SGString<ST>* strings;
00687 
00688     ST* str=NULL;
00689     if (bitremap_in_single_string)
00690     {
00691         strings=SG_MALLOC(SGString<ST>, 1);
00692         strings[0].string=SG_MALLOC(ST, num);
00693         strings[0].slen=num;
00694         f.get_line(len, offs);
00695         f.get_line(len, offs);
00696         order=len;
00697         max_len=num;
00698         offs=0;
00699         original_num_symbols=alphabet->get_num_symbols();
00700         str=SG_MALLOC(ST, len);
00701     }
00702     else
00703         strings=SG_MALLOC(SGString<ST>, num);
00704 
00705     for (i=0;i<num; i++)
00706     {
00707         if (!f.get_line(len, offs))
00708             SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
00709 
00710         char* s=f.get_line(len, offs);
00711         if (!s || len==0)
00712             SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
00713 
00714         if (bitremap_in_single_string)
00715         {
00716             if (len!=(uint64_t) order)
00717                 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
00718             for (int32_t j=0; j<order; j++)
00719                 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00720 
00721             strings[0].string[i]=embed_word(str, order);
00722         }
00723         else
00724         {
00725             strings[i].string=SG_MALLOC(ST, len);
00726             strings[i].slen=len;
00727             str=strings[i].string;
00728 
00729             if (ignore_invalid)
00730             {
00731                 for (uint64_t j=0; j<len; j++)
00732                 {
00733                     if (alphabet->is_valid((uint8_t) s[j]))
00734                         str[j]= (ST) s[j];
00735                     else
00736                         str[j]= (ST) 'A';
00737                 }
00738             }
00739             else
00740             {
00741                 for (uint64_t j=0; j<len; j++)
00742                     str[j]= (ST) s[j];
00743             }
00744             max_len=CMath::max(max_len, (int32_t) len);
00745         }
00746 
00747 
00748         if (!f.get_line(len, offs))
00749             SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
00750 
00751         if (!f.get_line(len, offs))
00752             SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
00753     }
00754 
00755     if (bitremap_in_single_string)
00756         num=1;
00757 
00758     num_vectors=num;
00759     max_string_length=max_len;
00760     features=strings;
00761 
00762     return true;
00763 }
00764 
00765 template<class ST> bool CStringFeatures<ST>::load_from_directory(char* dirname)
00766 {
00767     remove_all_subsets();
00768 
00769     struct dirent **namelist;
00770     int32_t n;
00771 
00772     SGIO::set_dirname(dirname);
00773 
00774     SG_DEBUG("dirname '%s'\n", dirname);
00775 
00776     n=scandir(dirname, &namelist, &SGIO::filter, alphasort);
00777     if (n <= 0)
00778     {
00779         SG_ERROR("error calling scandir - no files found\n");
00780         return false;
00781     }
00782     else
00783     {
00784         SGString<ST>* strings=NULL;
00785 
00786         int32_t num=0;
00787         int32_t max_len=-1;
00788 
00789         //usually n==num_vec, but it might not in race conditions
00790         //(file perms modified, file erased)
00791         strings=SG_MALLOC(SGString<ST>, n);
00792 
00793         for (int32_t i=0; i<n; i++)
00794         {
00795             char* fname=SGIO::concat_filename(namelist[i]->d_name);
00796 
00797             struct stat s;
00798             off_t filesize=0;
00799 
00800             if (!stat(fname, &s) && s.st_size>0)
00801             {
00802                 filesize=s.st_size/sizeof(ST);
00803 
00804                 FILE* f=fopen(fname, "ro");
00805                 if (f)
00806                 {
00807                     ST* str=SG_MALLOC(ST, filesize);
00808                     SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
00809                     if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize)
00810                         SG_ERROR("failed to read file\n");
00811                     strings[num].string=str;
00812                     strings[num].slen=filesize;
00813                     max_len=CMath::max(max_len, strings[num].slen);
00814 
00815                     num++;
00816                     fclose(f);
00817                 }
00818             }
00819             else
00820                 SG_ERROR("empty or non readable file \'%s\'\n", fname);
00821 
00822             SG_FREE(namelist[i]);
00823         }
00824         SG_FREE(namelist);
00825 
00826         if (num>0 && strings)
00827         {
00828             set_features(strings, num, max_len);
00829             return true;
00830         }
00831     }
00832     return false;
00833 }
00834 
00835 template<class ST> void CStringFeatures<ST>::set_features(SGStringList<ST> feats)
00836 {
00837     set_features(feats.strings, feats.num_strings, feats.max_string_length);
00838 }
00839 
00840 template<class ST> bool CStringFeatures<ST>::set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00841 {
00842     if (m_subset_stack->has_subsets())
00843         SG_ERROR("Cannot call set_features() with subset.\n");
00844 
00845     if (p_features)
00846     {
00847         CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00848 
00849         //compute histogram for char/byte
00850         for (int32_t i=0; i<p_num_vectors; i++)
00851             alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00852 
00853         SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00854         SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00855 
00856         if (alpha->check_alphabet_size() && alpha->check_alphabet())
00857         {
00858             cleanup();
00859             SG_UNREF(alphabet);
00860 
00861             alphabet=alpha;
00862             SG_REF(alphabet);
00863 
00864             features=p_features;
00865             num_vectors=p_num_vectors;
00866             max_string_length=p_max_string_length;
00867 
00868             return true;
00869         }
00870         else
00871             SG_UNREF(alpha);
00872     }
00873 
00874     return false;
00875 }
00876 
00877 template<class ST> bool CStringFeatures<ST>::append_features(CStringFeatures<ST>* sf)
00878 {
00879     ASSERT(sf);
00880 
00881     if (m_subset_stack->has_subsets())
00882         SG_ERROR("Cannot call set_features() with subset.\n");
00883 
00884     SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors());
00885 
00886     index_t sf_num_str=sf->get_num_vectors();
00887     for (int32_t i=0; i<sf_num_str; i++)
00888     {
00889         int32_t real_i = sf->m_subset_stack->subset_idx_conversion(i);
00890         int32_t length=sf->features[real_i].slen;
00891         new_features[i].string=SG_MALLOC(ST, length);
00892         memcpy(new_features[i].string, sf->features[real_i].string, length);
00893         new_features[i].slen=length;
00894     }
00895     return append_features(new_features, sf_num_str,
00896             sf->max_string_length);
00897 }
00898 
00899 template<class ST> bool CStringFeatures<ST>::append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00900 {
00901     if (m_subset_stack->has_subsets())
00902         SG_ERROR("Cannot call set_features() with subset.\n");
00903 
00904     if (!features)
00905         return set_features(p_features, p_num_vectors, p_max_string_length);
00906 
00907     CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00908 
00909     //compute histogram for char/byte
00910     for (int32_t i=0; i<p_num_vectors; i++)
00911         alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00912 
00913     SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00914     SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00915 
00916     if (alpha->check_alphabet_size() && alpha->check_alphabet())
00917     {
00918         SG_UNREF(alpha);
00919         for (int32_t i=0; i<p_num_vectors; i++)
00920             alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00921 
00922         int32_t old_num_vectors=num_vectors;
00923         num_vectors=old_num_vectors+p_num_vectors;
00924         SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors);
00925 
00926         for (int32_t i=0; i<num_vectors; i++)
00927         {
00928             if (i<old_num_vectors)
00929             {
00930                 new_features[i].string=features[i].string;
00931                 new_features[i].slen=features[i].slen;
00932             }
00933             else
00934             {
00935                 new_features[i].string=p_features[i-old_num_vectors].string;
00936                 new_features[i].slen=p_features[i-old_num_vectors].slen;
00937             }
00938         }
00939         SG_FREE(features);
00940         SG_FREE(p_features); // free now obsolete features
00941 
00942         this->features=new_features;
00943         max_string_length=CMath::max(max_string_length, p_max_string_length);
00944 
00945         return true;
00946     }
00947     SG_UNREF(alpha);
00948 
00949     return false;
00950 }
00951 
00952 template<class ST> SGStringList<ST> CStringFeatures<ST>::get_features()
00953 {
00954     SGStringList<ST> sl;
00955 
00956     sl.strings=get_features(sl.num_strings, sl.max_string_length);
00957     return sl;
00958 }
00959 
00960 template<class ST> SGString<ST>* CStringFeatures<ST>::get_features(int32_t& num_str, int32_t& max_str_len)
00961 {
00962     if (m_subset_stack->has_subsets())
00963         SG_ERROR("get features() is not possible on subset");
00964 
00965     num_str=num_vectors;
00966     max_str_len=max_string_length;
00967     return features;
00968 }
00969 
00970 template<class ST> SGString<ST>* CStringFeatures<ST>::copy_features(int32_t& num_str, int32_t& max_str_len)
00971 {
00972     ASSERT(num_vectors>0);
00973 
00974     num_str=get_num_vectors();
00975     max_str_len=max_string_length;
00976     SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str);
00977 
00978     for (int32_t i=0; i<num_str; i++)
00979     {
00980         int32_t len;
00981         bool free_vec;
00982         ST* vec=get_feature_vector(i, len, free_vec);
00983         new_feat[i].string=SG_MALLOC(ST, len);
00984         new_feat[i].slen=len;
00985         memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
00986         free_feature_vector(vec, i, free_vec);
00987     }
00988 
00989     return new_feat;
00990 }
00991 
00992 template<class ST> void CStringFeatures<ST>::get_features(SGString<ST>** dst, int32_t* num_str)
00993 {
00994     int32_t num_vec;
00995     int32_t max_str_len;
00996     *dst=copy_features(num_vec, max_str_len);
00997     *num_str=num_vec;
00998 }
00999 
01000 template<class ST> bool CStringFeatures<ST>::load_compressed(char* src, bool decompress)
01001 {
01002     remove_all_subsets();
01003 
01004     FILE* file=NULL;
01005 
01006     if (!(file=fopen(src, "r")))
01007         return false;
01008     cleanup();
01009 
01010     // header shogun v0
01011     char id[4];
01012     if (fread(&id[0], sizeof(char), 1, file)!=1)
01013         SG_ERROR("failed to read header");
01014     ASSERT(id[0]=='S');
01015     if (fread(&id[1], sizeof(char), 1, file)!=1)
01016         SG_ERROR("failed to read header");
01017     ASSERT(id[1]=='G');
01018     if (fread(&id[2], sizeof(char), 1, file)!=1)
01019         SG_ERROR("failed to read header");
01020     ASSERT(id[2]=='V');
01021     if (fread(&id[3], sizeof(char), 1, file)!=1)
01022         SG_ERROR("failed to read header");
01023     ASSERT(id[3]=='0');
01024 
01025     //compression type
01026     uint8_t c;
01027     if (fread(&c, sizeof(uint8_t), 1, file)!=1)
01028         SG_ERROR("failed to read compression type");
01029     CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
01030     //alphabet
01031     uint8_t a;
01032     delete alphabet;
01033     if (fread(&a, sizeof(uint8_t), 1, file)!=1)
01034         SG_ERROR("failed to read compression alphabet");
01035     alphabet=new CAlphabet((EAlphabet) a);
01036     // number of vectors
01037     if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1)
01038         SG_ERROR("failed to read compression number of vectors");
01039     ASSERT(num_vectors>0);
01040     // maximum string length
01041     if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1)
01042         SG_ERROR("failed to read maximum string length");
01043     ASSERT(max_string_length>0);
01044 
01045     features=SG_MALLOC(SGString<ST>, num_vectors);
01046 
01047     // vectors
01048     for (int32_t i=0; i<num_vectors; i++)
01049     {
01050         // vector len compressed
01051         int32_t len_compressed;
01052         if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1)
01053             SG_ERROR("failed to read vector length compressed");
01054         // vector len uncompressed
01055         int32_t len_uncompressed;
01056         if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1)
01057             SG_ERROR("failed to read vector length uncompressed");
01058 
01059         // vector raw data
01060         if (decompress)
01061         {
01062             features[i].string=SG_MALLOC(ST, len_uncompressed);
01063             features[i].slen=len_uncompressed;
01064             uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
01065             if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed)
01066                 SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed);
01067             uint64_t uncompressed_size=len_uncompressed;
01068             uncompressed_size*=sizeof(ST);
01069             compressor->decompress(compressed, len_compressed,
01070                     (uint8_t*) features[i].string, uncompressed_size);
01071             SG_FREE(compressed);
01072             ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST));
01073         }
01074         else
01075         {
01076             int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
01077             features[i].string=SG_MALLOC(ST, len_compressed+offs);
01078             features[i].slen=len_compressed+offs;
01079             int32_t* feat32ptr=((int32_t*) (features[i].string));
01080             memset(features[i].string, 0, offs*sizeof(ST));
01081             feat32ptr[0]=(int32_t) len_compressed;
01082             feat32ptr[1]=(int32_t) len_uncompressed;
01083             uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
01084             if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
01085                 SG_ERROR("failed to read uncompressed data");
01086         }
01087     }
01088 
01089     delete compressor;
01090     fclose(file);
01091 
01092     return false;
01093 }
01094 
01095 template<class ST> bool CStringFeatures<ST>::save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
01096 {
01097     if (m_subset_stack->has_subsets())
01098         SG_ERROR("save_compressed() is not possible on subset");
01099 
01100     FILE* file=NULL;
01101 
01102     if (!(file=fopen(dest, "wb")))
01103         return false;
01104 
01105     CCompressor* compressor= new CCompressor(compression);
01106 
01107     // header shogun v0
01108     const char* id="SGV0";
01109     fwrite(&id[0], sizeof(char), 1, file);
01110     fwrite(&id[1], sizeof(char), 1, file);
01111     fwrite(&id[2], sizeof(char), 1, file);
01112     fwrite(&id[3], sizeof(char), 1, file);
01113 
01114     //compression type
01115     uint8_t c=(uint8_t) compression;
01116     fwrite(&c, sizeof(uint8_t), 1, file);
01117     //alphabet
01118     uint8_t a=(uint8_t) alphabet->get_alphabet();
01119     fwrite(&a, sizeof(uint8_t), 1, file);
01120     // number of vectors
01121     fwrite(&num_vectors, sizeof(int32_t), 1, file);
01122     // maximum string length
01123     fwrite(&max_string_length, sizeof(int32_t), 1, file);
01124 
01125     // vectors
01126     for (int32_t i=0; i<num_vectors; i++)
01127     {
01128         int32_t len=-1;
01129         bool vfree;
01130         ST* vec=get_feature_vector(i, len, vfree);
01131 
01132         uint8_t* compressed=NULL;
01133         uint64_t compressed_size=0;
01134 
01135         compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
01136                 compressed, compressed_size, level);
01137 
01138         int32_t len_compressed=(int32_t) compressed_size;
01139         // vector len compressed in bytes
01140         fwrite(&len_compressed, sizeof(int32_t), 1, file);
01141         // vector len uncompressed in number of elements of type ST
01142         fwrite(&len, sizeof(int32_t), 1, file);
01143         // vector raw data
01144         fwrite(compressed, compressed_size, 1, file);
01145         SG_FREE(compressed);
01146 
01147         free_feature_vector(vec, i, vfree);
01148     }
01149 
01150     delete compressor;
01151     fclose(file);
01152     return true;
01153 }
01154 
01155 template<class ST> int32_t CStringFeatures<ST>::get_size() const { return sizeof(ST); }
01156 
01157 template<class ST> bool CStringFeatures<ST>::apply_preprocessor(bool force_preprocessing)
01158 {
01159     SG_DEBUG( "force: %d\n", force_preprocessing);
01160 
01161     for (int32_t i=0; i<get_num_preprocessors(); i++)
01162     {
01163         if ( (!is_preprocessed(i) || force_preprocessing) )
01164         {
01165             set_preprocessed(i);
01166             CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
01167             SG_INFO( "preprocessing using preproc %s\n", p->get_name());
01168 
01169             if (!p->apply_to_string_features(this))
01170             {
01171                 SG_UNREF(p);
01172                 return false;
01173             }
01174             else
01175                 SG_UNREF(p);
01176         }
01177     }
01178     return true;
01179 }
01180 
01181 template<class ST> int32_t CStringFeatures<ST>::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip)
01182 {
01183     if (m_subset_stack->has_subsets())
01184         SG_NOTIMPLEMENTED;
01185 
01186     ASSERT(step_size>0);
01187     ASSERT(window_size>0);
01188     ASSERT(num_vectors==1 || single_string);
01189     ASSERT(max_string_length>=window_size ||
01190             (single_string && length_of_single_string>=window_size));
01191 
01192     //in case we are dealing with a single remapped string
01193     //allow remapping
01194     if (single_string)
01195         num_vectors= (length_of_single_string-window_size)/step_size + 1;
01196     else if (num_vectors==1)
01197     {
01198         num_vectors= (max_string_length-window_size)/step_size + 1;
01199         length_of_single_string=max_string_length;
01200     }
01201 
01202     SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01203     int32_t offs=0;
01204     for (int32_t i=0; i<num_vectors; i++)
01205     {
01206         f[i].string=&features[0].string[offs+skip];
01207         f[i].slen=window_size-skip;
01208         offs+=step_size;
01209     }
01210     single_string=features[0].string;
01211     SG_FREE(features);
01212     features=f;
01213     max_string_length=window_size-skip;
01214 
01215     return num_vectors;
01216 }
01217 
01218 template<class ST> int32_t CStringFeatures<ST>::obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
01219         int32_t skip)
01220 {
01221     if (m_subset_stack->has_subsets())
01222         SG_NOTIMPLEMENTED;
01223 
01224     ASSERT(positions);
01225     ASSERT(window_size>0);
01226     ASSERT(num_vectors==1 || single_string);
01227     ASSERT(max_string_length>=window_size ||
01228             (single_string && length_of_single_string>=window_size));
01229 
01230     num_vectors= positions->get_num_elements();
01231     ASSERT(num_vectors>0);
01232 
01233     int32_t len;
01234 
01235     //in case we are dealing with a single remapped string
01236     //allow remapping
01237     if (single_string)
01238         len=length_of_single_string;
01239     else
01240     {
01241         single_string=features[0].string;
01242         len=max_string_length;
01243         length_of_single_string=max_string_length;
01244     }
01245 
01246     SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01247     for (int32_t i=0; i<num_vectors; i++)
01248     {
01249         int32_t p=positions->get_element(i);
01250 
01251         if (p>=0 && p<=len-window_size)
01252         {
01253             f[i].string=&features[0].string[p+skip];
01254             f[i].slen=window_size-skip;
01255         }
01256         else
01257         {
01258             num_vectors=1;
01259             max_string_length=len;
01260             features[0].slen=len;
01261             single_string=NULL;
01262             SG_FREE(f);
01263             SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01264                     window_size, i, p, len);
01265             return -1;
01266         }
01267     }
01268 
01269     SG_FREE(features);
01270     features=f;
01271     max_string_length=window_size-skip;
01272 
01273     return num_vectors;
01274 }
01275 
01276 template<class ST> bool CStringFeatures<ST>::obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01277 {
01278     return obtain_from_char_features(sf, start, p_order, gap, rev);
01279 }
01280 
01281 template<class ST> bool CStringFeatures<ST>::have_same_length(int32_t len)
01282 {
01283     if (len!=-1)
01284     {
01285         if (len!=max_string_length)
01286             return false;
01287     }
01288     len=max_string_length;
01289 
01290     index_t num_str=get_num_vectors();
01291     for (int32_t i=0; i<num_str; i++)
01292     {
01293         if (get_vector_length(i)!=len)
01294             return false;
01295     }
01296 
01297     return true;
01298 }
01299 
01300 template<class ST> void CStringFeatures<ST>::embed_features(int32_t p_order)
01301 {
01302     if (m_subset_stack->has_subsets())
01303         SG_NOTIMPLEMENTED;
01304 
01305     ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
01306 
01307     order=p_order;
01308     original_num_symbols=alphabet->get_num_symbols();
01309     int32_t max_val=alphabet->get_num_bits();
01310 
01311     if (p_order>1)
01312         num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01313     else
01314         num_symbols=original_num_symbols;
01315 
01316     SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01317 
01318     if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01319         SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01320 
01321     ST mask=0;
01322     for (int32_t i=0; i<p_order*max_val; i++)
01323         mask= (mask<<1) | ((ST) 1);
01324 
01325     for (int32_t i=0; i<num_vectors; i++)
01326     {
01327         int32_t len=features[i].slen;
01328 
01329         if (len < p_order)
01330             SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
01331 
01332         ST* str=features[i].string;
01333 
01334         // convert first word
01335         for (int32_t j=0; j<p_order; j++)
01336             str[j]=(ST) alphabet->remap_to_bin(str[j]);
01337         str[0]=embed_word(&str[0], p_order);
01338 
01339         // convert the rest
01340         int32_t idx=0;
01341         for (int32_t j=p_order; j<len; j++)
01342         {
01343             str[j]=(ST) alphabet->remap_to_bin(str[j]);
01344             str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
01345             idx++;
01346         }
01347 
01348         features[i].slen=len-p_order+1;
01349     }
01350 
01351     compute_symbol_mask_table(max_val);
01352 }
01353 
01354 template<class ST> void CStringFeatures<ST>::compute_symbol_mask_table(int64_t max_val)
01355 {
01356     if (m_subset_stack->has_subsets())
01357         SG_NOTIMPLEMENTED;
01358 
01359     SG_FREE(symbol_mask_table);
01360     symbol_mask_table=SG_MALLOC(ST, 256);
01361 
01362     uint64_t mask=0;
01363     for (int32_t i=0; i< (int64_t) max_val; i++)
01364         mask=(mask<<1) | 1;
01365 
01366     for (int32_t i=0; i<256; i++)
01367     {
01368         uint8_t bits=(uint8_t) i;
01369         symbol_mask_table[i]=0;
01370 
01371         for (int32_t j=0; j<8; j++)
01372         {
01373             if (bits & 1)
01374                 symbol_mask_table[i]|=mask<<(max_val*j);
01375 
01376             bits>>=1;
01377         }
01378     }
01379 }
01380 
01381 template<class ST> void CStringFeatures<ST>::unembed_word(ST word, uint8_t* seq, int32_t len)
01382 {
01383     uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01384 
01385     ST mask=0;
01386     for (uint32_t i=0; i<nbits; i++)
01387         mask=(mask<<1) | (ST) 1;
01388 
01389     for (int32_t i=0; i<len; i++)
01390     {
01391         ST w=(word & mask);
01392         seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
01393         word>>=nbits;
01394     }
01395 }
01396 
01397 template<class ST> ST CStringFeatures<ST>::embed_word(ST* seq, int32_t len)
01398 {
01399     ST value=(ST) 0;
01400     uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01401     for (int32_t i=0; i<len; i++)
01402     {
01403         value<<=nbits;
01404         value|=seq[i];
01405     }
01406 
01407     return value;
01408 }
01409 
01410 template<class ST> void CStringFeatures<ST>::determine_maximum_string_length()
01411 {
01412     max_string_length=0;
01413     index_t num_str=get_num_vectors();
01414 
01415     for (int32_t i=0; i<num_str; i++)
01416     {
01417         max_string_length=CMath::max(max_string_length,
01418             features[m_subset_stack->subset_idx_conversion(i)].slen);
01419     }
01420 }
01421 
01422 template<class ST> ST* CStringFeatures<ST>::get_zero_terminated_string_copy(SGString<ST> str)
01423 {
01424     int32_t l=str.slen;
01425     ST* s=SG_MALLOC(ST, l+1);
01426     memcpy(s, str.string, sizeof(ST)*l);
01427     s[l]='\0';
01428     return s;
01429 }
01430 
01431 template<class ST> void CStringFeatures<ST>::set_feature_vector(int32_t num, ST* string, int32_t len)
01432 {
01433     ASSERT(features);
01434     ASSERT(num<get_num_vectors());
01435 
01436     int32_t real_num=m_subset_stack->subset_idx_conversion(num);
01437 
01438 
01439     features[real_num].slen=len ;
01440     features[real_num].string=string ;
01441 
01442     max_string_length=CMath::max(len, max_string_length);
01443 }
01444 
01445 template<class ST> void CStringFeatures<ST>::get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize)
01446 {
01447     int32_t nsym=get_num_symbols();
01448     int32_t slen=get_max_vector_length();
01449     int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
01450     float64_t* h= SG_MALLOC(float64_t, sz);
01451     memset(h, 0, sz);
01452 
01453     float64_t* h_normalizer=SG_MALLOC(float64_t, slen);
01454     memset(h_normalizer, 0, slen*sizeof(float64_t));
01455     int32_t num_str=get_num_vectors();
01456     for (int32_t i=0; i<num_str; i++)
01457     {
01458         int32_t len;
01459         bool free_vec;
01460         ST* vec=get_feature_vector(i, len, free_vec);
01461         for (int32_t j=0; j<len; j++)
01462         {
01463             h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
01464             h_normalizer[j]++;
01465         }
01466         free_feature_vector(vec, i, free_vec);
01467     }
01468 
01469     if (normalize)
01470     {
01471         for (int32_t i=0; i<slen; i++)
01472         {
01473             for (int32_t j=0; j<nsym; j++)
01474             {
01475                 if (h_normalizer && h_normalizer[i])
01476                     h[int64_t(i)*nsym+j]/=h_normalizer[i];
01477             }
01478         }
01479     }
01480     SG_FREE(h_normalizer);
01481 
01482     *hist=h;
01483     *rows=nsym;
01484     *cols=slen;
01485 }
01486 
01487 template<class ST> void CStringFeatures<ST>::create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
01488 {
01489     ASSERT(rows == get_num_symbols());
01490     cleanup();
01491     float64_t* randoms=SG_MALLOC(float64_t, cols);
01492     SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
01493 
01494     for (int32_t i=0; i<num_vec; i++)
01495     {
01496         sf[i].string=SG_MALLOC(ST, cols);
01497         sf[i].slen=cols;
01498 
01499         SGVector<float64_t>::random_vector(randoms, cols, 0.0, 1.0);
01500 
01501         for (int32_t j=0; j<cols; j++)
01502         {
01503             float64_t lik=hist[int64_t(j)*rows+0];
01504 
01505             int32_t c;
01506             for (c=0; c<rows-1; c++)
01507             {
01508                 if (randoms[j]<=lik)
01509                     break;
01510                 lik+=hist[int64_t(j)*rows+c+1];
01511             }
01512             sf[i].string[j]=alphabet->remap_to_char(c);
01513         }
01514     }
01515     SG_FREE(randoms);
01516     set_features(sf, num_vec, cols);
01517 }
01518 
01519 /*
01520 CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2)
01521 {
01522     int *s;
01523     int32_t nStr=get_num_vectors();
01524 
01525     int32_t nfeat=0;
01526     for (int32_t i=0; i < nStr; ++i)
01527         nfeat += get_vector_length[i] - d1 -d2;
01528     SGString<SSKFeature>* F= SG_MALLOC(SGString<SSKFeature>, nfeat);
01529     int32_t c=0;
01530     for (int32_t i=0; i < nStr; ++i)
01531     {
01532     int32_t len;
01533     bool free_vec;
01534     ST* S=get_feature_vector(vec_num, len, free_vec);
01535     free_feature_vector(vec, vec_num, free_vec);
01536         int32_t n=len - d1 - d2;
01537         s=S[i];
01538         for (int32_t j=0; j < n; ++j)
01539         {
01540             F[c].feature1=s[j];
01541             F[c].feature2=s[j+d1];
01542             F[c].feature3=s[j+d1+d2];
01543             F[c].group=i;
01544             c++;
01545         }
01546     }
01547     ASSERT(nfeat==c);
01548     return F;
01549 }
01550 
01551 CStringFeatures<SSKFeature>* obtain_sssk_double_from_char(int **S, int *len, int nStr, int d1)
01552 {
01553     int i, j;
01554     int n, nfeat;
01555     int *group;
01556     int *features;
01557     int *s;
01558     int c;
01559     SSKFeatures *F;
01560 
01561     nfeat=0;
01562     for (i=0; i < nStr; ++i)
01563         nfeat += len[i] - d1;
01564     group=(int *)SG_MALLOC(nfeat*sizeof(int));
01565     features=(int *)SG_MALLOC(nfeat*2*sizeof(int *));
01566     c=0;
01567     for (i=0; i < nStr; ++i)
01568     {
01569         n=len[i] - d1;
01570         s=S[i];
01571         for (j=0; j < n; ++j)
01572         {
01573             features[c]=s[j];
01574             features[c+nfeat]=s[j+d1];
01575             group[c]=i;
01576             c++;
01577         }
01578     }
01579     if (nfeat!=c)
01580         printf("Something is wrong...\n");
01581     F=(SSKFeatures *)SG_MALLOC(sizeof(SSKFeatures));
01582     (*F).features=features;
01583     (*F).group=group;
01584     (*F).n=nfeat;
01585     return F;
01586 }
01587 */
01588 
01589 template<class ST> CFeatures* CStringFeatures<ST>::copy_subset(SGVector<index_t> indices)
01590 {
01591     /* string list to create new CStringFeatures from */
01592     SGStringList<ST> list_copy(indices.vlen, max_string_length);
01593 
01594     /* copy all features */
01595     for (index_t i=0; i<indices.vlen; ++i)
01596     {
01597         /* index with respect to possible subset */
01598         index_t real_idx=m_subset_stack->subset_idx_conversion(indices.vector[i]);
01599 
01600         /* copy string */
01601         SGString<ST> current_string=features[real_idx];
01602         SGString<ST> string_copy(current_string.slen);
01603         memcpy(string_copy.string, current_string.string,
01604             current_string.slen*sizeof(ST));
01605         list_copy.strings[i]=string_copy;
01606     }
01607 
01608     /* create copy instance */
01609     CStringFeatures* result=new CStringFeatures(list_copy, alphabet);
01610 
01611     /* max string length may have changed */
01612     result->determine_maximum_string_length();
01613 
01614     SG_REF(result);
01615 
01616     return result;
01617 }
01618 
01619 template<class ST> void CStringFeatures<ST>::subset_changed_post()
01620 {
01621     /* max string length has to be updated */
01622     determine_maximum_string_length();
01623 }
01624 
01625 template<class ST> ST* CStringFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len)
01626 {
01627     ASSERT(features && num<get_num_vectors());
01628 
01629     int32_t real_num=m_subset_stack->subset_idx_conversion(num);
01630 
01631     len=features[real_num].slen;
01632     if (len<=0)
01633         return NULL;
01634 
01635     ST* target=SG_MALLOC(ST, len);
01636     memcpy(target, features[real_num].string, len*sizeof(ST));
01637     return target;
01638 }
01639 
01640 template<class ST> void CStringFeatures<ST>::init()
01641 {
01642     set_generic<ST>();
01643 
01644     alphabet=NULL;
01645     num_vectors=0;
01646     features=NULL;
01647     single_string=NULL;
01648     length_of_single_string=0;
01649     max_string_length=0;
01650     order=0;
01651     symbol_mask_table=0;
01652     preprocess_on_get=false;
01653     feature_cache=NULL;
01654 
01655     m_parameters->add((CSGObject**) &alphabet, "alphabet");
01656     m_parameters->add_vector(&features, &num_vectors, "features",
01657             "This contains the array of features.");
01658     m_parameters->add_vector(&single_string,
01659             &length_of_single_string,
01660             "single_string",
01661             "Created by sliding window.");
01662     m_parameters->add(&max_string_length, "max_string_length",
01663             "Length of longest string.");
01664     m_parameters->add(&num_symbols, "num_symbols",
01665             "Number of used symbols.");
01666     m_parameters->add(&original_num_symbols, "original_num_symbols",
01667             "Original number of used symbols.");
01668     m_parameters->add(&order, "order",
01669             "Order used in higher order mapping.");
01670     m_parameters->add(&preprocess_on_get, "preprocess_on_get",
01671             "Preprocess on-the-fly?");
01672 
01673     /* TODO M_PARAMETERS->ADD?
01674      * /// order used in higher order mapping
01675      * ST* symbol_mask_table;
01676      */
01677 }
01678 
01683 template<> EFeatureType CStringFeatures<bool>::get_feature_type() const
01684 {
01685     return F_BOOL;
01686 }
01687 
01692 template<> EFeatureType CStringFeatures<char>::get_feature_type() const
01693 {
01694     return F_CHAR;
01695 }
01696 
01701 template<> EFeatureType CStringFeatures<uint8_t>::get_feature_type() const
01702 {
01703     return F_BYTE;
01704 }
01705 
01710 template<> EFeatureType CStringFeatures<int16_t>::get_feature_type() const
01711 {
01712     return F_SHORT;
01713 }
01714 
01719 template<> EFeatureType CStringFeatures<uint16_t>::get_feature_type() const
01720 {
01721     return F_WORD;
01722 }
01723 
01728 template<> EFeatureType CStringFeatures<int32_t>::get_feature_type() const
01729 {
01730     return F_INT;
01731 }
01732 
01737 template<> EFeatureType CStringFeatures<uint32_t>::get_feature_type() const
01738 {
01739     return F_UINT;
01740 }
01741 
01746 template<> EFeatureType CStringFeatures<int64_t>::get_feature_type() const
01747 {
01748     return F_LONG;
01749 }
01750 
01755 template<> EFeatureType CStringFeatures<uint64_t>::get_feature_type() const
01756 {
01757     return F_ULONG;
01758 }
01759 
01764 template<> EFeatureType CStringFeatures<float32_t>::get_feature_type() const
01765 {
01766     return F_SHORTREAL;
01767 }
01768 
01773 template<> EFeatureType CStringFeatures<float64_t>::get_feature_type() const
01774 {
01775     return F_DREAL;
01776 }
01777 
01782 template<> EFeatureType CStringFeatures<floatmax_t>::get_feature_type() const
01783 {
01784     return F_LONGREAL;
01785 }
01786 
01787 template<> bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
01788 {
01789     return symbol;
01790 }
01791 template<> float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
01792 {
01793     return symbol;
01794 }
01795 template<> float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
01796 {
01797     return symbol;
01798 }
01799 template<> floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
01800 {
01801     return symbol;
01802 }
01803 
01804 template<> bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
01805 {
01806     return false;
01807 }
01808 template<> float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
01809 {
01810     return 0;
01811 }
01812 template<> float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
01813 {
01814     return 0;
01815 }
01816 template<> floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
01817 {
01818     return 0;
01819 }
01820 
01821 template<> bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
01822 {
01823     return symbol;
01824 }
01825 template<> float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
01826 {
01827     return symbol;
01828 }
01829 template<> float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
01830 {
01831     return symbol;
01832 }
01833 template<> floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
01834 {
01835     return symbol;
01836 }
01837 
01838 #ifndef SUNOS
01839 template<>  template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01840 {
01841     return false;
01842 }
01843 template<>  template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01844 {
01845     return false;
01846 }
01847 template<>  template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01848 {
01849     return false;
01850 }
01851 #endif
01852 
01853 template<>  void CStringFeatures<float32_t>::embed_features(int32_t p_order)
01854 {
01855 }
01856 template<>  void CStringFeatures<float64_t>::embed_features(int32_t p_order)
01857 {
01858 }
01859 template<>  void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
01860 {
01861 }
01862 
01863 template<>  void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
01864 {
01865 }
01866 template<>  void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
01867 {
01868 }
01869 template<>  void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
01870 {
01871 }
01872 
01873 template<>  float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
01874 {
01875     return 0;
01876 }
01877 template<>  float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
01878 {
01879     return 0;
01880 }
01881 template<>  floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
01882 {
01883     return 0;
01884 }
01885 
01886 template<>  void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
01887 {
01888 }
01889 template<>  void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
01890 {
01891 }
01892 template<>  void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
01893 {
01894 }
01895 #define LOAD(f_load, sg_type)                                               \
01896 template<> void CStringFeatures<sg_type>::load(CFile* loader)       \
01897 {                                                                           \
01898     SG_INFO( "loading...\n");                                               \
01899                                                                             \
01900     SG_SET_LOCALE_C;                                                    \
01901     SGString<sg_type>* strs;                                                \
01902     int32_t num_str;                                                        \
01903     int32_t max_len;                                                        \
01904     loader->f_load(strs, num_str, max_len);                                 \
01905     set_features(strs, num_str, max_len);                                   \
01906     SG_RESET_LOCALE;                                                    \
01907 }
01908 
01909 LOAD(get_string_list, bool)
01910 LOAD(get_string_list, char)
01911 LOAD(get_int8_string_list, int8_t)
01912 LOAD(get_string_list, uint8_t)
01913 LOAD(get_string_list, int16_t)
01914 LOAD(get_string_list, uint16_t)
01915 LOAD(get_string_list, int32_t)
01916 LOAD(get_uint_string_list, uint32_t)
01917 LOAD(get_long_string_list, int64_t)
01918 LOAD(get_ulong_string_list, uint64_t)
01919 LOAD(get_string_list, float32_t)
01920 LOAD(get_string_list, float64_t)
01921 LOAD(get_longreal_string_list, floatmax_t)
01922 #undef LOAD
01923 
01924 #define SAVE(f_write, sg_type)                                              \
01925 template<> void CStringFeatures<sg_type>::save(CFile* writer)       \
01926 {                                                                           \
01927     if (m_subset_stack->has_subsets())                                                          \
01928         SG_ERROR("save() is not possible on subset");                       \
01929     SG_SET_LOCALE_C;                                                    \
01930     ASSERT(writer);                                                         \
01931     writer->f_write(features, num_vectors);                                 \
01932     SG_RESET_LOCALE;                                                    \
01933 }
01934 
01935 SAVE(set_string_list, bool)
01936 SAVE(set_string_list, char)
01937 SAVE(set_int8_string_list, int8_t)
01938 SAVE(set_string_list, uint8_t)
01939 SAVE(set_string_list, int16_t)
01940 SAVE(set_string_list, uint16_t)
01941 SAVE(set_string_list, int32_t)
01942 SAVE(set_uint_string_list, uint32_t)
01943 SAVE(set_long_string_list, int64_t)
01944 SAVE(set_ulong_string_list, uint64_t)
01945 SAVE(set_string_list, float32_t)
01946 SAVE(set_string_list, float64_t)
01947 SAVE(set_longreal_string_list, floatmax_t)
01948 #undef SAVE
01949 
01950 template <class ST> template <class CT>
01951 bool CStringFeatures<ST>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
01952         int32_t p_order, int32_t gap, bool rev)
01953 {
01954     remove_all_subsets();
01955     ASSERT(sf);
01956 
01957     CAlphabet* alpha=sf->get_alphabet();
01958     ASSERT(alpha->get_num_symbols_in_histogram() > 0);
01959 
01960     this->order=p_order;
01961     cleanup();
01962 
01963     num_vectors=sf->get_num_vectors();
01964     ASSERT(num_vectors>0);
01965     max_string_length=sf->get_max_vector_length()-start;
01966     features=SG_MALLOC(SGString<ST>, num_vectors);
01967 
01968     SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
01969             alpha->get_num_symbols_in_histogram());
01970 
01971     for (int32_t i=0; i<num_vectors; i++)
01972     {
01973         int32_t len=-1;
01974         bool vfree;
01975         CT* c=sf->get_feature_vector(i, len, vfree);
01976         ASSERT(!vfree); // won't work when preprocessors are attached
01977 
01978         features[i].string=SG_MALLOC(ST, len);
01979         features[i].slen=len;
01980 
01981         ST* str=features[i].string;
01982         for (int32_t j=0; j<len; j++)
01983             str[j]=(ST) alpha->remap_to_bin(c[j]);
01984     }
01985 
01986     original_num_symbols=alpha->get_num_symbols();
01987     int32_t max_val=alpha->get_num_bits();
01988 
01989     SG_UNREF(alpha);
01990 
01991     if (p_order>1)
01992         num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01993     else
01994         num_symbols=original_num_symbols;
01995     SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01996 
01997     if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01998     {
01999         SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
02000         return false;
02001     }
02002 
02003     SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
02004     for (int32_t line=0; line<num_vectors; line++)
02005     {
02006         int32_t len=0;
02007         bool vfree;
02008         ST* fv=get_feature_vector(line, len, vfree);
02009         ASSERT(!vfree); // won't work when preprocessors are attached
02010 
02011         if (rev)
02012             CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
02013         else
02014             CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
02015 
02016         /* fix the length of the string -- hacky */
02017         features[line].slen-=start+gap ;
02018         if (features[line].slen<0)
02019             features[line].slen=0 ;
02020     }
02021 
02022     compute_symbol_mask_table(max_val);
02023 
02024     return true;
02025 }
02026 
02027 template class CStringFeatures<bool>;
02028 template class CStringFeatures<char>;
02029 template class CStringFeatures<int8_t>;
02030 template class CStringFeatures<uint8_t>;
02031 template class CStringFeatures<int16_t>;
02032 template class CStringFeatures<uint16_t>;
02033 template class CStringFeatures<int32_t>;
02034 template class CStringFeatures<uint32_t>;
02035 template class CStringFeatures<int64_t>;
02036 template class CStringFeatures<uint64_t>;
02037 template class CStringFeatures<float32_t>;
02038 template class CStringFeatures<float64_t>;
02039 template class CStringFeatures<floatmax_t>;
02040 
02041 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02042 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02043 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02044 
02045 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02046 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02047 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02048 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation