SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <vector> 00013 00014 #include <shogun/lib/common.h> 00015 #include <shogun/io/SGIO.h> 00016 #include <shogun/lib/Signal.h> 00017 #include <shogun/lib/Trie.h> 00018 #include <shogun/base/Parallel.h> 00019 00020 #include <shogun/kernel/string/SpectrumRBFKernel.h> 00021 #include <shogun/features/Features.h> 00022 #include <shogun/features/StringFeatures.h> 00023 #include <math.h> 00024 00025 #include <vector> 00026 #include <string> 00027 #include <fstream> 00028 #include <cmath> 00029 00030 #include <assert.h> 00031 00032 #ifdef HAVE_PTHREAD 00033 #include <pthread.h> 00034 #endif 00035 00036 00037 using namespace shogun; 00038 00039 CSpectrumRBFKernel::CSpectrumRBFKernel() 00040 : CStringKernel<char>(0) 00041 { 00042 init(); 00043 register_param(); 00044 } 00045 00046 CSpectrumRBFKernel::CSpectrumRBFKernel (int32_t size, float64_t *AA_matrix_, int32_t degree_, float64_t width_) 00047 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0) 00048 { 00049 lhs=NULL; 00050 rhs=NULL; 00051 00052 target_letter_0=-1 ; 00053 00054 AA_matrix=SG_MALLOC(float64_t, 128*128); 00055 00056 00057 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ; 00058 00059 read_profiles_and_sequences(); 00060 SGStringList<char> string_list; 00061 string_list.strings = sequences; 00062 string_list.num_strings = nof_sequences; 00063 string_list.max_string_length = max_sequence_length; 00064 00065 //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN); 00066 string_features = new CStringFeatures<char>(string_list, IUPAC_AMINO_ACID); 00067 init(string_features, string_features); 00068 register_param(); 00069 } 00070 00071 CSpectrumRBFKernel::CSpectrumRBFKernel( 00072 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t size, float64_t* AA_matrix_, int32_t degree_, float64_t width_) 00073 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0) 00074 { 00075 target_letter_0=-1 ; 00076 00077 AA_matrix=SG_MALLOC(float64_t, 128*128); 00078 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ; 00079 00080 init(l, r); 00081 register_param(); 00082 } 00083 00084 CSpectrumRBFKernel::~CSpectrumRBFKernel() 00085 { 00086 cleanup(); 00087 SG_FREE(AA_matrix); 00088 } 00089 00090 00091 void CSpectrumRBFKernel::remove_lhs() 00092 { 00093 00094 CKernel::remove_lhs(); 00095 } 00096 00097 void CSpectrumRBFKernel::read_profiles_and_sequences() 00098 { 00099 00100 int32_t aa_to_index[128];//profile 00101 aa_to_index[(uint8_t) 'A'] = 0; 00102 aa_to_index[(uint8_t) 'R'] = 1; 00103 aa_to_index[(uint8_t) 'N'] = 2; 00104 aa_to_index[(uint8_t) 'D'] = 3; 00105 aa_to_index[(uint8_t) 'C'] = 4; 00106 aa_to_index[(uint8_t) 'Q'] = 5; 00107 aa_to_index[(uint8_t) 'E'] = 6; 00108 aa_to_index[(uint8_t) 'G'] = 7; 00109 aa_to_index[(uint8_t) 'H'] = 8; 00110 aa_to_index[(uint8_t) 'I'] = 9; 00111 aa_to_index[(uint8_t) 'L'] = 10; 00112 aa_to_index[(uint8_t) 'K'] = 11; 00113 aa_to_index[(uint8_t) 'M'] = 12; 00114 aa_to_index[(uint8_t) 'F'] = 13; 00115 aa_to_index[(uint8_t) 'P'] = 14; 00116 aa_to_index[(uint8_t) 'S'] = 15; 00117 aa_to_index[(uint8_t) 'T'] = 16; 00118 aa_to_index[(uint8_t) 'W'] = 17; 00119 aa_to_index[(uint8_t) 'Y'] = 18; 00120 aa_to_index[(uint8_t) 'V'] = 19; 00121 SG_DEBUG("initializing background\n"); 00122 double background[20]; // profile 00123 background[0]=0.0799912015849807; //A 00124 background[1]=0.0484482507611578;//R 00125 background[2]=0.044293531582512;//N 00126 background[3]=0.0578891399707563;//D 00127 background[4]=0.0171846021407367;//C 00128 background[5]=0.0380578923048682;//Q 00129 background[6]=0.0638169929675978;//E 00130 background[7]=0.0760659374742852;//G 00131 background[8]=0.0223465499452473;//H 00132 background[9]=0.0550905793661343;//I 00133 background[10]=0.0866897071203864;//L 00134 background[11]=0.060458245507428;//K 00135 background[12]=0.0215379186368154;//M 00136 background[13]=0.0396348024787477;//F 00137 background[14]=0.0465746314476874;//P 00138 background[15]=0.0630028230885602;//S 00139 background[16]=0.0580394726014824;//T 00140 background[17]=0.0144991866213453;//W 00141 background[18]=0.03635438623143;//Y 00142 background[19]=0.0700241481678408;//V 00143 00144 00145 std::vector<std::string> seqs; 00146 //int32_t nof_sequences = 7329; 00147 00148 double C = 0.8; 00149 const char *filename="/fml/ag-raetsch/home/toussaint/scp/aawd_compbio_workshop/code_nora/data/profile/profiles"; 00150 std::ifstream fin(filename); 00151 00152 SG_DEBUG("Reading profiles from %s\n", filename); 00153 std::string line; 00154 while (!fin.eof()) 00155 { 00156 std::getline(fin, line); 00157 00158 if (line[0] == '>') // new sequence 00159 { 00160 int idx = line.find_first_of(' '); 00161 sequence_labels.push_back(line.substr(1,idx-1)); 00162 std::getline(fin, line); 00163 std::string orig_sequence = line; 00164 std::string sequence=""; 00165 00166 int len_line = line.length(); 00167 00168 // skip 3 lines 00169 00170 std::getline(fin, line); 00171 std::getline(fin, line); 00172 std::getline(fin, line); 00173 00174 profiles.push_back(std::vector<double>()); 00175 00176 std::vector<double>& curr_profile = profiles.back(); 00177 for (int i=0; i < len_line; ++i) 00178 { 00179 std::getline(fin, line); 00180 int a = line.find_first_not_of(' '); // index position 00181 int b = line.find_first_of(' ', a); // index position 00182 a = line.find_first_not_of(' ', b); // aa position 00183 b = line.find_first_of(' ', a); // aa position 00184 std::string aa=line.substr(a,b-a); 00185 if (0) //(aa =="B" || aa == "X" || aa == "Z") 00186 { 00187 int pos = seqs.size()+1; 00188 SG_DEBUG("Skipping aa in sequence %d\n", pos); 00189 continue; 00190 } 00191 else 00192 { 00193 sequence += aa; 00194 00195 a = line.find_first_not_of(' ', b); // beginning of block to ignore 00196 b = line.find_first_of(' ', a); // aa position 00197 00198 for (int j=0; j < 19; ++j) 00199 { 00200 a = line.find_first_not_of(' ', b); 00201 b = line.find_first_of(' ', a); 00202 } 00203 00204 int all_zeros = 1; 00205 // interesting block 00206 for (int j=0; j < 20; ++j) 00207 { 00208 a = line.find_first_not_of(' ', b); 00209 b = line.find_first_of(' ', a); 00210 double p = atof(line.substr(a, b-a).c_str()); 00211 if (p > 0) 00212 { 00213 all_zeros = 0; 00214 } 00215 double value = -1* std::log(C*(p/100)+(1-C)*background[j]); // taken from Leslie's example, C actually corresponds to 1/(1+C) 00216 curr_profile.push_back(value); 00217 //SG_DEBUG("seq %d aa %d value %f p %f bg %f\n", i, j, value,p, background[j]); 00218 } 00219 00220 if (all_zeros) 00221 { 00222 SG_DEBUG(">>>>>>>>>>>>>>> all zeros"); 00223 if (aa !="B" && aa != "X" && aa != "Z") 00224 { 00225 //profile[i][temp_profile_index]=-log(C+(1-C)*background[re_candidate[temp_profile_index]]); 00226 int32_t aa_index = aa_to_index[(int)aa.c_str()[0]]; 00227 double value = -1* std::log(C+(1-C)*background[aa_index]); // taken from Leslie's example, C actually corresponds to 1/(1+C) 00228 SG_DEBUG("before %f\n", profiles.back()[(i-1) * 20 + aa_index]); 00229 curr_profile[(i*20) + aa_index] = value; 00230 SG_DEBUG(">>> aa %c \t %d \t %f\n", aa.c_str()[0], aa_index, value); 00231 00232 /* 00233 for (int z=0; z <20; ++z) 00234 { 00235 SG_DEBUG(" %d \t %f\t", z, curr_profile[z]); 00236 } 00237 SG_DEBUG("\n"); 00238 */ 00239 } 00240 } 00241 } 00242 } 00243 00244 if (curr_profile.size() != 20 * sequence.length()) 00245 { 00246 SG_ERROR("Something's wrong with the profile.\n"); 00247 break; 00248 } 00249 00250 seqs.push_back(sequence); 00251 00252 00253 /* 00254 // 6 irrelevant lines 00255 for (int i=0; i < 6; ++i) 00256 { 00257 std::getline(fin, line); 00258 } 00259 // 00260 */ 00261 } 00262 } 00263 00264 fin.close(); 00265 00266 nof_sequences = seqs.size(); 00267 sequences = SG_MALLOC(SGString<char>, nof_sequences); 00268 00269 int max_len = 0; 00270 for (int i=0; i < nof_sequences; ++i) 00271 { 00272 int len = seqs[i].length(); 00273 sequences[i].string = SG_MALLOC(char, len+1); 00274 sequences[i].slen = len; 00275 strcpy(sequences[i].string, seqs[i].c_str()); 00276 00277 if (len > max_len) max_len = len; 00278 } 00279 00280 max_sequence_length = max_len; 00281 //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN); 00282 00283 } 00284 00285 bool CSpectrumRBFKernel::init(CFeatures* l, CFeatures* r) 00286 { 00287 // >> profile 00288 /* 00289 read_profiles_and_sequences(); 00290 l = string_features; 00291 r = string_features; 00292 */ 00293 // << profile 00294 00295 int32_t lhs_changed=(lhs!=l); 00296 int32_t rhs_changed=(rhs!=r); 00297 00298 CStringKernel<char>::init(l,r); 00299 00300 SG_DEBUG("lhs_changed: %i\n", lhs_changed); 00301 SG_DEBUG("rhs_changed: %i\n", rhs_changed); 00302 00303 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l; 00304 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r; 00305 00306 SG_UNREF(alphabet); 00307 alphabet=sf_l->get_alphabet(); 00308 CAlphabet* ralphabet=sf_r->get_alphabet(); 00309 00310 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA))) 00311 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION); 00312 00313 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet()); 00314 SG_UNREF(ralphabet); 00315 00316 00317 return init_normalizer(); 00318 } 00319 00320 void CSpectrumRBFKernel::cleanup() 00321 { 00322 00323 SG_UNREF(alphabet); 00324 alphabet=NULL; 00325 00326 CKernel::cleanup(); 00327 } 00328 00329 inline bool isaa(char c) 00330 { 00331 if (c<65 || c>89 || c=='B' || c=='J' || c=='O' || c=='U' || c=='X' || c=='Z') 00332 return false ; 00333 return true ; 00334 } 00335 00336 float64_t CSpectrumRBFKernel::AA_helper(const char* path, const int seq_degree, const char* joint_seq, unsigned int index) 00337 { 00338 //const char* AA = "ARNDCQEGHILKMFPSTWYV"; 00339 float64_t diff=0.0 ; 00340 00341 for (int i=0; i<seq_degree; i++) 00342 { 00343 if (!isaa(path[i])||!isaa(joint_seq[index+i])) 00344 diff+=1.4 ; 00345 else 00346 { 00347 diff += AA_matrix[ (path[i]-1)*128 + path[i] - 1] ; 00348 diff -= 2*AA_matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ; 00349 diff += AA_matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ; 00350 if (CMath::is_nan(diff)) 00351 fprintf(stderr, "nan occurred: '%c' '%c'\n", path[i], joint_seq[index+i]) ; 00352 } 00353 } 00354 00355 return exp( - diff/width) ; 00356 } 00357 00358 float64_t CSpectrumRBFKernel::compute(int32_t idx_a, int32_t idx_b) 00359 { 00360 int32_t alen, blen; 00361 bool afree, bfree; 00362 00363 char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, afree); 00364 char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, bfree); 00365 00366 float64_t result=0; 00367 for (int32_t i=0; i<alen; i++) 00368 { 00369 for (int32_t j=0; j<blen; j++) 00370 { 00371 if ((i+degree<=alen) && (j+degree<=blen)) 00372 result += AA_helper(&(avec[i]), degree, bvec, j) ; 00373 } 00374 } 00375 00376 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, afree); 00377 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, bfree); 00378 return result; 00379 } 00380 00381 bool CSpectrumRBFKernel::set_AA_matrix( 00382 float64_t* AA_matrix_) 00383 { 00384 00385 if (AA_matrix_) 00386 { 00387 SG_DEBUG("Setting AA_matrix\n") ; 00388 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ; 00389 return true ; 00390 } 00391 00392 return false; 00393 } 00394 00395 void CSpectrumRBFKernel::register_param() 00396 { 00397 SG_ADD(°ree, "degree", "degree of the kernel", MS_AVAILABLE); 00398 SG_ADD(&AA_matrix_length, "AA_matrix_length", "the length of AA matrix", 00399 MS_NOT_AVAILABLE); 00400 m_parameters->add_vector(&AA_matrix, &AA_matrix_length, "AA_matrix", "128*128 scalar product matrix"); 00401 SG_ADD(&width, "width", "width of Gaussian", MS_AVAILABLE); 00402 SG_ADD(&nof_sequences, "nof_sequences", "length of the sequence", 00403 MS_NOT_AVAILABLE); 00404 m_parameters->add_vector(&sequences, &nof_sequences, "the sequences as a part of profile"); 00405 SG_ADD(&max_sequence_length, 00406 "max_sequence_length", "max length of the sequence", MS_NOT_AVAILABLE); 00407 } 00408 00409 void CSpectrumRBFKernel::register_alphabet() 00410 { 00411 SG_ADD((CSGObject**)&alphabet, "alphabet", "the alphabet used by kernel", 00412 MS_NOT_AVAILABLE); 00413 } 00414 00415 void CSpectrumRBFKernel::init() 00416 { 00417 alphabet = NULL; 00418 degree = 0; 00419 AA_matrix = NULL; 00420 AA_matrix_length = 128*128; 00421 width = 0.0; 00422 sequences = NULL; 00423 string_features = NULL; 00424 nof_sequences = 0; 00425 max_sequence_length = 0; 00426 00427 initialized = false; 00428 00429 max_mismatch = 0; 00430 target_letter_0 = 0; 00431 }