SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include <shogun/features/SNPFeatures.h> 00012 #include <shogun/io/SGIO.h> 00013 #include <shogun/features/Alphabet.h> 00014 00015 using namespace shogun; 00016 00017 CSNPFeatures::CSNPFeatures() 00018 { 00019 SG_UNSTABLE("CSNPFeatures::CSNPFeatures()", "\n"); 00020 00021 strings = NULL; 00022 00023 string_length = 0; 00024 num_strings = 0; 00025 w_dim = 0; 00026 00027 normalization_const = 0.0; 00028 00029 m_str_min = NULL; 00030 m_str_maj = NULL; 00031 } 00032 00033 CSNPFeatures::CSNPFeatures(CStringFeatures<uint8_t>* str) : CDotFeatures(), 00034 m_str_min(NULL), m_str_maj(NULL) 00035 { 00036 ASSERT(str); 00037 ASSERT(str->have_same_length()); 00038 SG_REF(str); 00039 00040 strings=str; 00041 string_length=str->get_max_vector_length(); 00042 ASSERT((string_length & 1) == 0); // length divisible by 2 00043 w_dim=3*string_length/2; 00044 num_strings=str->get_num_vectors(); 00045 CAlphabet* alpha=str->get_alphabet(); 00046 ASSERT(alpha->get_alphabet()==SNP); 00047 SG_UNREF(alpha); 00048 00049 obtain_base_strings(); 00050 set_normalization_const(); 00051 00052 } 00053 00054 CSNPFeatures::CSNPFeatures(const CSNPFeatures& orig) 00055 : CDotFeatures(orig), strings(orig.strings), 00056 normalization_const(orig.normalization_const), 00057 m_str_min(NULL), m_str_maj(NULL) 00058 { 00059 SG_REF(strings); 00060 string_length=strings->get_max_vector_length(); 00061 ASSERT((string_length & 1) == 0); // length divisible by 2 00062 w_dim=3*string_length; 00063 num_strings=strings->get_num_vectors(); 00064 CAlphabet* alpha=strings->get_alphabet(); 00065 SG_UNREF(alpha); 00066 00067 obtain_base_strings(); 00068 } 00069 00070 CSNPFeatures::~CSNPFeatures() 00071 { 00072 SG_UNREF(strings); 00073 } 00074 00075 int32_t CSNPFeatures::get_dim_feature_space() const 00076 { 00077 return w_dim; 00078 } 00079 00080 int32_t CSNPFeatures::get_nnz_features_for_vector(int32_t num) 00081 { 00082 return w_dim/3; 00083 } 00084 00085 EFeatureType CSNPFeatures::get_feature_type() const 00086 { 00087 return F_UNKNOWN; 00088 } 00089 00090 EFeatureClass CSNPFeatures::get_feature_class() const 00091 { 00092 return C_WD; 00093 } 00094 00095 int32_t CSNPFeatures::get_num_vectors() const 00096 { 00097 return num_strings; 00098 } 00099 00100 int32_t CSNPFeatures::get_size() const 00101 { 00102 return sizeof(float64_t); 00103 } 00104 00105 float64_t CSNPFeatures::get_normalization_const() 00106 { 00107 return normalization_const; 00108 } 00109 00110 void CSNPFeatures::set_minor_base_string(const char* str) 00111 { 00112 m_str_min=(uint8_t*) strdup(str); 00113 } 00114 00115 void CSNPFeatures::set_major_base_string(const char* str) 00116 { 00117 m_str_maj=(uint8_t*) strdup(str); 00118 } 00119 00120 char* CSNPFeatures::get_minor_base_string() 00121 { 00122 return (char*) m_str_min; 00123 } 00124 00125 char* CSNPFeatures::get_major_base_string() 00126 { 00127 return (char*) m_str_maj; 00128 } 00129 00130 float64_t CSNPFeatures::dot(int32_t idx_a, CDotFeatures* df, int32_t idx_b) 00131 { 00132 ASSERT(df); 00133 ASSERT(df->get_feature_type() == get_feature_type()); 00134 ASSERT(df->get_feature_class() == get_feature_class()); 00135 CSNPFeatures* sf=(CSNPFeatures*) df; 00136 00137 int32_t alen, blen; 00138 bool free_avec, free_bvec; 00139 00140 uint8_t* avec = strings->get_feature_vector(idx_a, alen, free_avec); 00141 uint8_t* bvec = sf->strings->get_feature_vector(idx_b, blen, free_bvec); 00142 00143 ASSERT(alen==blen); 00144 if (alen!=string_length) 00145 SG_ERROR("alen (%d) !=string_length (%d)\n", alen, string_length); 00146 ASSERT(m_str_min); 00147 ASSERT(m_str_maj); 00148 00149 float64_t total=0; 00150 00151 for (int32_t i = 0; i<alen-1; i+=2) 00152 { 00153 int32_t sumaa=0; 00154 int32_t sumbb=0; 00155 int32_t sumab=0; 00156 00157 uint8_t a1=avec[i]; 00158 uint8_t a2=avec[i+1]; 00159 uint8_t b1=bvec[i]; 00160 uint8_t b2=bvec[i+1]; 00161 00162 if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0')) 00163 sumab++; 00164 else if (a1==a2 && b1==b2) 00165 { 00166 if (a1!=b1) 00167 continue; 00168 00169 if (a1==m_str_min[i]) 00170 sumaa++; 00171 else if (a1==m_str_maj[i]) 00172 sumbb++; 00173 else 00174 { 00175 SG_ERROR("The impossible happened i=%d a1=%c " 00176 "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, a1,a2, b1,b2, m_str_min[i], m_str_maj[i]); 00177 } 00178 00179 } 00180 total+=sumaa+sumbb+sumab; 00181 } 00182 00183 strings->free_feature_vector(avec, idx_a, free_avec); 00184 sf->strings->free_feature_vector(bvec, idx_b, free_bvec); 00185 return total; 00186 } 00187 00188 float64_t CSNPFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len) 00189 { 00190 if (vec2_len != w_dim) 00191 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00192 00193 float64_t sum=0; 00194 int32_t len; 00195 bool free_vec1; 00196 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00197 int32_t offs=0; 00198 00199 for (int32_t i=0; i<len; i+=2) 00200 { 00201 int32_t dim=0; 00202 00203 char a1=vec[i]; 00204 char a2=vec[i+1]; 00205 00206 if (a1==a2 && a1!='0' && a2!='0') 00207 { 00208 if (a1==m_str_min[i]) 00209 dim=1; 00210 else if (a1==m_str_maj[i]) 00211 dim=2; 00212 else 00213 { 00214 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n", 00215 i, a1,a2, m_str_min[i], m_str_maj[i]); 00216 } 00217 } 00218 00219 sum+=vec2[offs+dim]; 00220 offs+=3; 00221 } 00222 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00223 00224 return sum/normalization_const; 00225 } 00226 00227 void CSNPFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00228 { 00229 if (vec2_len != w_dim) 00230 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00231 00232 int32_t len; 00233 bool free_vec1; 00234 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00235 int32_t offs=0; 00236 00237 if (abs_val) 00238 alpha=CMath::abs(alpha); 00239 00240 for (int32_t i=0; i<len; i+=2) 00241 { 00242 int32_t dim=0; 00243 00244 char a1=vec[i]; 00245 char a2=vec[i+1]; 00246 00247 if (a1==a2 && a1!='0' && a2!='0') 00248 { 00249 if (a1==m_str_min[i]) 00250 dim=1; 00251 else if (a1==m_str_maj[i]) 00252 dim=2; 00253 else 00254 { 00255 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n", 00256 i, a1,a2, m_str_min[i], m_str_maj[i]); 00257 } 00258 } 00259 00260 vec2[offs+dim]+=alpha; 00261 offs+=3; 00262 } 00263 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00264 } 00265 00266 void CSNPFeatures::find_minor_major_strings(uint8_t* minor, uint8_t* major) 00267 { 00268 for (int32_t i=0; i<num_strings; i++) 00269 { 00270 int32_t len; 00271 bool free_vec; 00272 uint8_t* vec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(i, len, free_vec); 00273 ASSERT(string_length==len); 00274 00275 for (int32_t j=0; j<len; j++) 00276 { 00277 // skip sequencing errors 00278 if (vec[j]=='0') 00279 continue; 00280 00281 if (minor[j]==0) 00282 minor[j]=vec[j]; 00283 else if (major[j]==0 && vec[j]!=minor[j]) 00284 major[j]=vec[j]; 00285 } 00286 00287 ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(vec, i, free_vec); 00288 } 00289 } 00290 00291 void CSNPFeatures::obtain_base_strings(CSNPFeatures* snp) 00292 { 00293 SG_FREE(m_str_min); 00294 SG_FREE(m_str_maj); 00295 size_t tlen=(string_length+1)*sizeof(uint8_t); 00296 00297 m_str_min=SG_CALLOC(uint8_t, tlen); 00298 m_str_maj=SG_CALLOC(uint8_t, tlen); 00299 00300 find_minor_major_strings(m_str_min, m_str_maj); 00301 00302 if (snp) 00303 snp->find_minor_major_strings(m_str_min, m_str_maj); 00304 00305 for (int32_t j=0; j<string_length; j++) 00306 { 00307 // if only one symbol occurs use 0 00308 if (m_str_min[j]==0) 00309 m_str_min[j]='0'; 00310 if (m_str_maj[j]==0) 00311 m_str_maj[j]='0'; 00312 00313 if (m_str_min[j]>m_str_maj[j]) 00314 CMath::swap(m_str_min[j], m_str_maj[j]); 00315 } 00316 } 00317 00318 void CSNPFeatures::set_normalization_const(float64_t n) 00319 { 00320 if (n==0) 00321 { 00322 normalization_const=string_length; 00323 normalization_const=CMath::sqrt(normalization_const); 00324 } 00325 else 00326 normalization_const=n; 00327 00328 SG_DEBUG("normalization_const:%f\n", normalization_const); 00329 } 00330 00331 void* CSNPFeatures::get_feature_iterator(int32_t vector_index) 00332 { 00333 return NULL; 00334 } 00335 00336 bool CSNPFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator) 00337 { 00338 return false; 00339 } 00340 00341 void CSNPFeatures::free_feature_iterator(void* iterator) 00342 { 00343 } 00344 00345 CFeatures* CSNPFeatures::duplicate() const 00346 { 00347 return new CSNPFeatures(*this); 00348 } 00349 00350 SGMatrix<float64_t> CSNPFeatures::get_histogram(bool normalize) 00351 { 00352 int32_t nsym=3; 00353 float64_t* h= SG_CALLOC(float64_t, size_t(nsym)*string_length/2); 00354 00355 float64_t* h_normalizer=SG_MALLOC(float64_t, string_length/2); 00356 memset(h_normalizer, 0, string_length/2*sizeof(float64_t)); 00357 int32_t num_str=get_num_vectors(); 00358 for (int32_t i=0; i<num_str; i++) 00359 { 00360 int32_t len; 00361 bool free_vec; 00362 uint8_t* vec = strings->get_feature_vector(i, len, free_vec); 00363 00364 for (int32_t j=0; j<len; j+=2) 00365 { 00366 int32_t dim=0; 00367 00368 char a1=vec[j]; 00369 char a2=vec[j+1]; 00370 00371 if (a1==a2 && a1!='0' && a2!='0') 00372 { 00373 if (a1==m_str_min[j]) 00374 dim=1; 00375 else if (a1==m_str_maj[j]) 00376 dim=2; 00377 else 00378 { 00379 SG_ERROR("The impossible happened j=%d a1=%c a2=%c min=%c maj=%c\n", 00380 j, a1,a2, m_str_min[j], m_str_maj[j]); 00381 } 00382 } 00383 00384 h[int64_t(j/2)*nsym+dim]++; 00385 h_normalizer[j/2]++; 00386 } 00387 00388 strings->free_feature_vector(vec, i, free_vec); 00389 } 00390 00391 if (normalize) 00392 { 00393 for (int32_t i=0; i<string_length/2; i++) 00394 { 00395 for (int32_t j=0; j<nsym; j++) 00396 { 00397 if (h_normalizer && h_normalizer[i]) 00398 h[int64_t(i)*nsym+j]/=h_normalizer[i]; 00399 } 00400 } 00401 } 00402 SG_FREE(h_normalizer); 00403 00404 return SGMatrix<float64_t>(h, nsym, string_length/2); 00405 } 00406 00407 SGMatrix<float64_t> CSNPFeatures::get_2x3_table(CSNPFeatures* pos, CSNPFeatures* neg) 00408 { 00409 00410 ASSERT(pos->strings->get_max_vector_length() == neg->strings->get_max_vector_length()); 00411 int32_t len=pos->strings->get_max_vector_length(); 00412 00413 float64_t* table=SG_MALLOC(float64_t, 3*2*len/2); 00414 00415 SGMatrix<float64_t> p_hist=pos->get_histogram(false); 00416 SGMatrix<float64_t> n_hist=neg->get_histogram(false); 00417 00418 00419 for (int32_t i=0; i<3*len/2; i++) 00420 { 00421 table[2*i]=p_hist.matrix[i]; 00422 table[2*i+1]=n_hist.matrix[i]; 00423 } 00424 return SGMatrix<float64_t>(table, 2,3*len/2); 00425 }