SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Berlin Institute of Technology 00009 */ 00010 00011 #include <shogun/lib/common.h> 00012 #include <shogun/io/SGIO.h> 00013 #include <shogun/kernel/string/SNPStringKernel.h> 00014 #include <shogun/kernel/normalizer/SqrtDiagKernelNormalizer.h> 00015 #include <shogun/features/Features.h> 00016 #include <shogun/features/StringFeatures.h> 00017 00018 using namespace shogun; 00019 00020 CSNPStringKernel::CSNPStringKernel() 00021 : CStringKernel<char>(0), 00022 m_degree(0), m_win_len(0), m_inhomogene(false) 00023 { 00024 init(); 00025 set_normalizer(new CSqrtDiagKernelNormalizer()); 00026 register_params(); 00027 } 00028 00029 CSNPStringKernel::CSNPStringKernel(int32_t size, 00030 int32_t degree, int32_t win_len, bool inhomogene) 00031 : CStringKernel<char>(size), 00032 m_degree(degree), m_win_len(2*win_len), m_inhomogene(inhomogene) 00033 { 00034 init(); 00035 set_normalizer(new CSqrtDiagKernelNormalizer()); 00036 register_params(); 00037 } 00038 00039 CSNPStringKernel::CSNPStringKernel( 00040 CStringFeatures<char>* l, CStringFeatures<char>* r, 00041 int32_t degree, int32_t win_len, bool inhomogene) 00042 : CStringKernel<char>(10), m_degree(degree), m_win_len(2*win_len), 00043 m_inhomogene(inhomogene) 00044 { 00045 init(); 00046 set_normalizer(new CSqrtDiagKernelNormalizer()); 00047 if (l==r) 00048 obtain_base_strings(); 00049 init(l, r); 00050 register_params(); 00051 } 00052 00053 CSNPStringKernel::~CSNPStringKernel() 00054 { 00055 cleanup(); 00056 } 00057 00058 bool CSNPStringKernel::init(CFeatures* l, CFeatures* r) 00059 { 00060 CStringKernel<char>::init(l, r); 00061 return init_normalizer(); 00062 } 00063 00064 void CSNPStringKernel::cleanup() 00065 { 00066 CKernel::cleanup(); 00067 SG_FREE(m_str_min); 00068 SG_FREE(m_str_maj); 00069 } 00070 00071 void CSNPStringKernel::obtain_base_strings() 00072 { 00073 //should only be called on training data 00074 ASSERT(lhs==rhs); 00075 00076 m_str_len=0; 00077 00078 for (int32_t i=0; i<num_lhs; i++) 00079 { 00080 int32_t len; 00081 bool free_vec; 00082 char* vec = ((CStringFeatures<char>*) lhs)->get_feature_vector(i, len, free_vec); 00083 00084 if (m_str_len==0) 00085 { 00086 m_str_len=len; 00087 m_str_min=SG_CALLOC(char, len+1); 00088 m_str_maj=SG_CALLOC(char, len+1); 00089 } 00090 else 00091 { 00092 ASSERT(m_str_len==len); 00093 } 00094 00095 for (int32_t j=0; j<len; j++) 00096 { 00097 // skip sequencing errors 00098 if (vec[j]=='0') 00099 continue; 00100 00101 if (m_str_min[j]==0) 00102 m_str_min[j]=vec[j]; 00103 else if (m_str_maj[j]==0 && vec[j]!=m_str_min[j]) 00104 m_str_maj[j]=vec[j]; 00105 } 00106 00107 ((CStringFeatures<char>*) lhs)->free_feature_vector(vec, i, free_vec); 00108 } 00109 00110 for (int32_t j=0; j<m_str_len; j++) 00111 { 00112 // if only one one symbol occurs use 0 00113 if (m_str_min[j]==0) 00114 m_str_min[j]='0'; 00115 if (m_str_maj[j]==0) 00116 m_str_maj[j]='0'; 00117 00118 if (m_str_min[j]>m_str_maj[j]) 00119 CMath::swap(m_str_min[j], m_str_maj[j]); 00120 } 00121 } 00122 00123 float64_t CSNPStringKernel::compute(int32_t idx_a, int32_t idx_b) 00124 { 00125 int32_t alen, blen; 00126 bool free_avec, free_bvec; 00127 00128 char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec); 00129 char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec); 00130 00131 ASSERT(alen==blen); 00132 if (alen!=m_str_len) 00133 SG_ERROR("alen (%d) !=m_str_len (%d)\n", alen, m_str_len); 00134 ASSERT(m_str_min); 00135 ASSERT(m_str_maj); 00136 00137 float64_t total=0; 00138 int32_t inhomogene= (m_inhomogene) ? 1 : 0; 00139 00140 for (int32_t i = 0; i<alen-1; i+=2) 00141 { 00142 int32_t sumaa=0; 00143 int32_t sumbb=0; 00144 int32_t sumab=0; 00145 00146 for (int32_t l=0; l<m_win_len && i+l<alen-1; l+=2) 00147 { 00148 char a1=avec[i+l]; 00149 char a2=avec[i+l+1]; 00150 char b1=bvec[i+l]; 00151 char b2=bvec[i+l+1]; 00152 00153 if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0')) 00154 sumab++; 00155 else if (a1==a2 && b1==b2) 00156 { 00157 if (a1!=b1) 00158 continue; 00159 00160 if (a1==m_str_min[i+l]) 00161 sumaa++; 00162 else if (a1==m_str_maj[i+l]) 00163 sumbb++; 00164 else 00165 { 00166 SG_ERROR("The impossible happened i=%d l=%d a1=%c " 00167 "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, l, a1,a2, b1,b2, m_str_min[i+l], m_str_maj[i+l]); 00168 } 00169 } 00170 00171 } 00172 total+=CMath::pow(float64_t(sumaa+sumbb+sumab+inhomogene), 00173 (int32_t) m_degree); 00174 } 00175 00176 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec); 00177 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec); 00178 return total; 00179 } 00180 00181 void CSNPStringKernel::register_params() 00182 { 00183 SG_ADD(&m_degree, "m_degree", "the order of the kernel", MS_AVAILABLE); 00184 SG_ADD(&m_win_len, "m_win_len", "the window length", MS_AVAILABLE); 00185 SG_ADD(&m_inhomogene, "m_inhomogene", 00186 "the mark of whether it's an inhomogeneous poly kernel", MS_NOT_AVAILABLE); 00187 m_parameters->add_vector(&m_str_min, &m_str_len, "m_str_min", "allele A"); 00188 m_parameters->add_vector(&m_str_maj, &m_str_len, "m_str_maj", "allele B"); 00189 } 00190 00191 void CSNPStringKernel::init() 00192 { 00193 m_str_min=NULL; 00194 m_str_maj=NULL; 00195 }