// SHOGUN v2.0.0
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Written (W) 2011-2012 Heiko Strathmann 00010 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00011 */ 00012 00013 #ifndef _CSTRINGFEATURES__H__ 00014 #define _CSTRINGFEATURES__H__ 00015 00016 #include <shogun/lib/common.h> 00017 #include <shogun/lib/Cache.h> 00018 #include <shogun/lib/DynamicArray.h> 00019 #include <shogun/lib/Compressor.h> 00020 #include <shogun/io/File.h> 00021 00022 #include <shogun/features/Features.h> 00023 #include <shogun/features/Alphabet.h> 00024 00025 namespace shogun 00026 { 00027 class CAlphabet; 00028 template <class T> class CDynamicArray; 00029 class CFile; 00030 template <class T> class SGString; 00031 00032 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00033 struct SSKDoubleFeature 00034 { 00035 int feature1; 00036 int feature2; 00037 int group; 00038 }; 00039 00040 struct SSKTripleFeature 00041 { 00042 int feature1; 00043 int feature2; 00044 int feature3; 00045 int group; 00046 }; 00047 #endif 00048 00072 template <class ST> class CStringFeatures : public CFeatures 00073 { 00074 public: 00078 CStringFeatures(); 00079 00084 CStringFeatures(EAlphabet alpha); 00085 00090 CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha); 00091 00096 CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha); 00097 00102 CStringFeatures(CAlphabet* alpha); 00103 00105 CStringFeatures(const CStringFeatures & orig); 00106 00112 CStringFeatures(CFile* loader, EAlphabet alpha=DNA); 00113 00114 virtual ~CStringFeatures(); 00115 00121 virtual void cleanup(); 00122 00129 virtual void cleanup_feature_vector(int32_t num); 00130 00138 virtual void 
cleanup_feature_vectors(int32_t start, int32_t stop); 00139 00144 virtual EFeatureClass get_feature_class() const; 00145 00150 virtual EFeatureType get_feature_type() const; 00151 00156 CAlphabet* get_alphabet(); 00157 00162 virtual CFeatures* duplicate() const; 00163 00170 SGVector<ST> get_feature_vector(int32_t num); 00171 00179 void set_feature_vector(SGVector<ST> vector, int32_t num); 00180 00183 void enable_on_the_fly_preprocessing(); 00184 00188 void disable_on_the_fly_preprocessing(); 00189 00200 ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree); 00201 00208 CStringFeatures<ST>* get_transposed(); 00209 00223 SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec); 00224 00233 void free_feature_vector(ST* feat_vec, int32_t num, bool dofree); 00234 00242 void free_feature_vector(SGVector<ST> feat_vec, int32_t num); 00243 00252 virtual ST get_feature(int32_t vec_num, int32_t feat_num); 00253 00261 virtual int32_t get_vector_length(int32_t vec_num); 00262 00269 virtual int32_t get_max_vector_length(); 00270 00272 virtual int32_t get_num_vectors() const; 00273 00280 floatmax_t get_num_symbols(); 00281 00289 floatmax_t get_max_num_symbols(); 00290 00291 // these functions are necessary to find out about a former conversion process 00292 00297 floatmax_t get_original_num_symbols(); 00298 00303 int32_t get_order(); 00304 00312 ST get_masked_symbols(ST symbol, uint8_t mask); 00313 00320 ST shift_offset(ST offset, int32_t amount); 00321 00328 ST shift_symbol(ST symbol, int32_t amount); 00329 00334 virtual void load(CFile* loader); 00335 00346 void load_ascii_file(char* fname, bool remap_to_bin=true, 00347 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA); 00348 00357 bool load_fasta_file(const char* fname, bool ignore_invalid=false); 00358 00368 bool load_fastq_file(const char* fname, 00369 bool ignore_invalid=false, bool bitremap_in_single_string=false); 00370 00378 bool load_from_directory(char* dirname); 00379 00385 void 
set_features(SGStringList<ST> feats); 00386 00396 bool set_features(SGString<ST>* p_features, int32_t p_num_vectors, 00397 int32_t p_max_string_length); 00398 00407 bool append_features(CStringFeatures<ST>* sf); 00408 00421 bool append_features(SGString<ST>* p_features, int32_t p_num_vectors, 00422 int32_t p_max_string_length); 00423 00427 SGStringList<ST> get_features(); 00428 00437 virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len); 00438 00447 virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len); 00448 00456 virtual void get_features(SGString<ST>** dst, int32_t* num_str); 00457 00464 virtual void save(CFile* writer); 00465 00474 virtual bool load_compressed(char* src, bool decompress); 00475 00485 virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level); 00486 00491 virtual int32_t get_size() const; 00492 00498 virtual bool apply_preprocessor(bool force_preprocessing=false); 00499 00512 int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0); 00513 00524 int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, 00525 int32_t skip=0); 00526 00540 bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, 00541 int32_t p_order, int32_t gap, bool rev); 00542 00554 template <class CT> 00555 bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, 00556 int32_t p_order, int32_t gap, bool rev); 00557 00567 bool have_same_length(int32_t len=-1); 00568 00574 void embed_features(int32_t p_order); 00575 00582 void compute_symbol_mask_table(int64_t max_val); 00583 00590 void unembed_word(ST word, uint8_t* seq, int32_t len); 00591 00597 ST embed_word(ST* seq, int32_t len); 00598 00603 void determine_maximum_string_length(); 00604 00612 static ST* get_zero_terminated_string_copy(SGString<ST> str); 00613 00622 virtual void set_feature_vector(int32_t num, ST* string, int32_t len); 00623 00628 virtual void 
get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, 00629 bool normalize=true); 00630 00635 virtual void create_random(float64_t* hist, int32_t rows, int32_t cols, 00636 int32_t num_vec); 00637 00646 virtual CFeatures* copy_subset(SGVector<index_t> indices); 00647 00649 inline virtual const char* get_name() const { return "StringFeatures"; } 00650 00652 virtual void subset_changed_post(); 00653 00654 protected: 00665 virtual ST* compute_feature_vector(int32_t num, int32_t& len); 00666 00667 private: 00668 void init(); 00669 00670 protected: 00671 00673 CAlphabet* alphabet; 00674 00676 int32_t num_vectors; 00677 00679 SGString<ST>* features; 00680 00682 ST* single_string; 00683 00685 int32_t length_of_single_string; 00686 00688 int32_t max_string_length; 00689 00691 floatmax_t num_symbols; 00692 00694 floatmax_t original_num_symbols; 00695 00697 int32_t order; 00698 00700 ST* symbol_mask_table; 00701 00703 bool preprocess_on_get; 00704 00706 CCache<ST>* feature_cache; 00707 }; 00708 } 00709 #endif // _CSTRINGFEATURES__H__