SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2008 Christian Igel, Tobias Glasmachers 00008 * Copyright (C) 2008 Christian Igel, Tobias Glasmachers 00009 * 00010 * Shogun adjustments (W) 2008-2009 Soeren Sonnenburg 00011 * Copyright (C) 2008-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00012 * 00013 */ 00014 #include <shogun/kernel/string/OligoStringKernel.h> 00015 #include <shogun/kernel/normalizer/SqrtDiagKernelNormalizer.h> 00016 #include <shogun/features/StringFeatures.h> 00017 00018 #include <map> 00019 #include <vector> 00020 #include <algorithm> 00021 00022 using namespace shogun; 00023 00024 COligoStringKernel::COligoStringKernel() 00025 : CStringKernel<char>() 00026 { 00027 SG_UNSTABLE("OligoStringKernel"); 00028 init(); 00029 } 00030 00031 COligoStringKernel::COligoStringKernel(int32_t cache_sz, int32_t kmer_len, float64_t w) 00032 : CStringKernel<char>(cache_sz) 00033 { 00034 SG_UNSTABLE("OligoStringKernel"); 00035 init(); 00036 00037 k=kmer_len; 00038 width=w; 00039 } 00040 00041 COligoStringKernel::~COligoStringKernel() 00042 { 00043 cleanup(); 00044 } 00045 00046 void COligoStringKernel::cleanup() 00047 { 00048 SG_FREE(gauss_table); 00049 gauss_table=NULL; 00050 gauss_table_len=0; 00051 00052 CKernel::cleanup(); 00053 } 00054 00055 bool COligoStringKernel::init(CFeatures* l, CFeatures* r) 00056 { 00057 cleanup(); 00058 00059 CStringKernel<char>::init(l,r); 00060 int32_t max_len=CMath::max( 00061 ((CStringFeatures<char>*) l)->get_max_vector_length(), 00062 ((CStringFeatures<char>*) r)->get_max_vector_length() 00063 ); 00064 00065 getExpFunctionCache(max_len); 00066 return init_normalizer(); 00067 } 00068 00069 void COligoStringKernel::encodeOligo( 00070 const std::string& sequence, uint32_t k_mer_length, 00071 const std::string& allowed_characters, 00072 std::vector< std::pair<int32_t, float64_t> >& values) 00073 { 00074 float64_t oligo_value = 0.; 00075 float64_t factor = 1.; 00076 std::map<std::string::value_type, uint32_t> residue_values; 00077 uint32_t counter = 0; 00078 uint32_t number_of_residues = allowed_characters.size(); 00079 uint32_t sequence_length = sequence.size(); 00080 bool sequence_ok = true; 00081 00082 // checking if sequence contains illegal characters 00083 for (uint32_t i = 0; i < sequence.size(); ++i) 00084 { 00085 if (allowed_characters.find(sequence.at(i)) == std::string::npos) 00086 sequence_ok = false; 00087 } 00088 00089 if (sequence_ok && k_mer_length <= sequence_length) 00090 { 00091 values.resize(sequence_length - k_mer_length + 1, 00092 std::pair<int32_t, float64_t>()); 00093 for (uint32_t i = 0; i < number_of_residues; ++i) 00094 { 00095 residue_values.insert(std::make_pair(allowed_characters[i], counter)); 00096 ++counter; 00097 } 00098 for (int32_t k = k_mer_length - 1; k >= 0; k--) 00099 { 00100 oligo_value += factor * residue_values[sequence[k]]; 00101 factor *= number_of_residues; 00102 } 00103 factor /= number_of_residues; 00104 counter = 0; 00105 values[counter].first = 1; 00106 values[counter].second = oligo_value; 00107 ++counter; 00108 00109 for (uint32_t j = 1; j < sequence_length - k_mer_length + 1; j++) 00110 { 00111 oligo_value -= factor * residue_values[sequence[j - 1]]; 00112 oligo_value = oligo_value * number_of_residues + 00113 residue_values[sequence[j + k_mer_length - 1]]; 00114 00115 values[counter].first = j + 1; 00116 values[counter].second = oligo_value ; 00117 ++counter; 00118 } 00119 stable_sort(values.begin(), values.end(), cmpOligos_); 00120 } 00121 else 00122 { 00123 values.clear(); 00124 } 00125 } 00126 00127 void COligoStringKernel::getSequences( 00128 const std::vector<std::string>& sequences, uint32_t k_mer_length, 00129 const std::string& allowed_characters, 00130 std::vector< std::vector< std::pair<int32_t, float64_t> > >& encoded_sequences) 00131 { 00132 std::vector< std::pair<int32_t, float64_t> > temp_vector; 00133 encoded_sequences.resize(sequences.size(), 00134 std::vector< std::pair<int32_t, float64_t> >()); 00135 00136 for (uint32_t i = 0; i < sequences.size(); ++i) 00137 { 00138 encodeOligo(sequences[i], k_mer_length, allowed_characters, temp_vector); 00139 encoded_sequences[i] = temp_vector; 00140 } 00141 } 00142 00143 void COligoStringKernel::getExpFunctionCache(uint32_t sequence_length) 00144 { 00145 SG_FREE(gauss_table); 00146 gauss_table=SG_MALLOC(float64_t, sequence_length); 00147 00148 gauss_table[0] = 1; 00149 for (uint32_t i = 1; i < sequence_length; i++) 00150 gauss_table[i] = exp((-1.0 / (CMath::sq(width))) * CMath::sq((float64_t) i)); 00151 00152 gauss_table_len=sequence_length; 00153 } 00154 00155 float64_t COligoStringKernel::kernelOligoFast( 00156 const std::vector< std::pair<int32_t, float64_t> >& x, 00157 const std::vector< std::pair<int32_t, float64_t> >& y, 00158 int32_t max_distance) 00159 { 00160 float64_t result = 0; 00161 int32_t i1 = 0; 00162 int32_t i2 = 0; 00163 int32_t c1 = 0; 00164 uint32_t x_size = x.size(); 00165 uint32_t y_size = y.size(); 00166 00167 while ((uint32_t) i1 + 1 < x_size && (uint32_t) i2 + 1 < y_size) 00168 { 00169 if (x[i1].second == y[i2].second) 00170 { 00171 if (max_distance < 0 00172 || (abs(x[i1].first - y[i2].first)) <= max_distance) 00173 { 00174 result += gauss_table[abs((x[i1].first - y[i2].first))]; 00175 if (x[i1].second == x[i1 + 1].second) 00176 { 00177 i1++; 00178 c1++; 00179 } 00180 else if (y[i2].second == y[i2 + 1].second) 00181 { 00182 i2++; 00183 i1 -= c1; 00184 c1 = 0; 00185 } 00186 else 00187 { 00188 i1++; 00189 i2++; 00190 } 00191 } 00192 else 00193 { 00194 if (x[i1].first < y[i2].first) 00195 { 00196 if (x[i1].second == x[i1 + 1].second) 00197 { 00198 i1++; 00199 } 00200 else if (y[i2].second == y[i2 + 1].second) 00201 { 00202 while (y[i2].second == y[i2+1].second) 00203 i2++; 00204 ++i1; 00205 c1 = 0; 00206 } 00207 else 00208 { 00209 i1++; 00210 i2++; 00211 c1 = 0; 00212 } 00213 } 00214 else 00215 { 00216 i2++; 00217 i1 -= c1; 00218 c1 = 0; 00219 } 00220 } 00221 } 00222 else 00223 { 00224 if (x[i1].second < y[i2].second) 00225 i1++; 00226 else 00227 i2++; 00228 c1 = 0; 00229 } 00230 } 00231 return result; 00232 } 00233 00234 00235 float64_t COligoStringKernel::compute(int32_t idx_a, int32_t idx_b) 00236 { 00237 int32_t alen, blen; 00238 bool free_a, free_b; 00239 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_a); 00240 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_b); 00241 std::vector< std::pair<int32_t, float64_t> > aenc; 00242 std::vector< std::pair<int32_t, float64_t> > benc; 00243 encodeOligo(std::string(avec, alen), k, "ACGT", aenc); 00244 encodeOligo(std::string(bvec, alen), k, "ACGT", benc); 00245 float64_t result=kernelOligoFast(aenc, benc); 00246 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_a); 00247 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_b); 00248 return result; 00249 } 00250 00251 void COligoStringKernel::init() 00252 { 00253 k=0; 00254 width=0.0; 00255 gauss_table=NULL; 00256 gauss_table_len=0; 00257 00258 set_normalizer(new CSqrtDiagKernelNormalizer()); 00259 00260 SG_ADD(&k, "k", "K-mer length.", MS_AVAILABLE); 00261 SG_ADD(&width, "width", "Width of Gaussian.", MS_AVAILABLE); 00262 m_parameters->add_vector(&gauss_table, &gauss_table_len, "gauss_table", 00263 "Gauss Cache Table."); 00264 }