SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Sebastian J. Schultheiss and Soeren Sonnenburg 00008 * Copyright (C) 2009 Max-Planck-Society 00009 */ 00010 00011 #include <shogun/lib/common.h> 00012 #include <shogun/kernel/string/RegulatoryModulesStringKernel.h> 00013 #include <shogun/features/Features.h> 00014 #include <shogun/io/SGIO.h> 00015 00016 using namespace shogun; 00017 00018 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel() 00019 : CStringKernel<char>(0), width(0.0), degree(0), shift(0), window(0), 00020 motif_positions_lhs(NULL), motif_positions_rhs(NULL), 00021 position_weights(NULL), weights(NULL) 00022 { 00023 register_params(); 00024 } 00025 00026 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel( 00027 int32_t size, float64_t w, int32_t d, int32_t s, int32_t wl) 00028 : CStringKernel<char>(size), width(w), degree(d), shift(s), window(wl), 00029 motif_positions_lhs(NULL), motif_positions_rhs(NULL), position_weights(NULL), weights(NULL) 00030 { 00031 register_params(); 00032 } 00033 00034 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel(CStringFeatures<char>* lstr, CStringFeatures<char>* rstr, 00035 CDenseFeatures<uint16_t>* lpos, CDenseFeatures<uint16_t>* rpos, 00036 float64_t w, int32_t d, int32_t s, int32_t wl, int32_t size) 00037 : CStringKernel<char>(size), width(w), degree(d), shift(s), window(wl), 00038 motif_positions_lhs(NULL), motif_positions_rhs(NULL), position_weights(NULL), weights(NULL) 00039 { 00040 set_motif_positions(lpos, rpos); 00041 init(lstr,rstr); 00042 register_params(); 00043 } 00044 00045 CRegulatoryModulesStringKernel::~CRegulatoryModulesStringKernel() 00046 { 00047 SG_UNREF(motif_positions_lhs); 00048 SG_UNREF(motif_positions_rhs); 00049 } 00050 00051 bool CRegulatoryModulesStringKernel::init(CFeatures* l, CFeatures* r) 00052 { 00053 ASSERT(motif_positions_lhs); 00054 ASSERT(motif_positions_rhs); 00055 00056 if (l->get_num_vectors() != motif_positions_lhs->get_num_vectors()) 00057 SG_ERROR("Number of vectors does not agree (LHS: %d, Motif LHS: %d).\n", 00058 l->get_num_vectors(), motif_positions_lhs->get_num_vectors()); 00059 if (r->get_num_vectors() != motif_positions_rhs->get_num_vectors()) 00060 SG_ERROR("Number of vectors does not agree (RHS: %d, Motif RHS: %d).\n", 00061 r->get_num_vectors(), motif_positions_rhs->get_num_vectors()); 00062 00063 set_wd_weights(); 00064 CStringKernel<char>::init(l, r); 00065 return init_normalizer(); 00066 } 00067 00068 void CRegulatoryModulesStringKernel::set_motif_positions( 00069 CDenseFeatures<uint16_t>* positions_lhs, CDenseFeatures<uint16_t>* positions_rhs) 00070 { 00071 ASSERT(positions_lhs); 00072 ASSERT(positions_rhs); 00073 SG_UNREF(motif_positions_lhs); 00074 SG_UNREF(motif_positions_rhs); 00075 if (positions_lhs->get_num_features() != positions_rhs->get_num_features()) 00076 SG_ERROR("Number of dimensions does not agree.\n"); 00077 00078 motif_positions_lhs=positions_lhs; 00079 motif_positions_rhs=positions_rhs; 00080 SG_REF(positions_lhs); 00081 SG_REF(positions_rhs); 00082 } 00083 00084 float64_t CRegulatoryModulesStringKernel::compute(int32_t idx_a, int32_t idx_b) 00085 { 00086 ASSERT(motif_positions_lhs); 00087 ASSERT(motif_positions_rhs); 00088 00089 bool free_avec, free_bvec; 00090 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec); 00091 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec); 00092 00093 int32_t alen_pos, blen_pos; 00094 bool afree_pos, bfree_pos; 00095 uint16_t* positions_a = motif_positions_lhs->get_feature_vector(idx_a, alen_pos, afree_pos); 00096 uint16_t* positions_b = motif_positions_rhs->get_feature_vector(idx_b, blen_pos, bfree_pos); 00097 ASSERT(alen_pos==blen_pos); 00098 int32_t num_pos=alen_pos; 00099 00100 00101 float64_t result_rbf=0; 00102 float64_t result_wds=0; 00103 00104 for (int32_t p=0; p<num_pos; p++) 00105 { 00106 result_rbf+=CMath::sq(positions_a[p]-positions_b[p]); 00107 00108 for (int32_t p2=0; p2<num_pos; p2++) //p+1 and below * 2 00109 result_rbf+=CMath::sq( (positions_a[p]-positions_a[p2]) - (positions_b[p]-positions_b[p2]) ); 00110 00111 int32_t limit = window; 00112 if (window + positions_a[p] > alen) 00113 limit = alen - positions_a[p]; 00114 00115 if (window + positions_b[p] > blen) 00116 limit = CMath::min(limit, blen - positions_b[p]); 00117 00118 result_wds+=compute_wds(&avec[positions_a[p]], &bvec[positions_b[p]], 00119 limit); 00120 } 00121 00122 float64_t result=exp(-result_rbf/width)+result_wds; 00123 00124 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec); 00125 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec); 00126 ((CDenseFeatures<uint16_t>*) lhs)->free_feature_vector(positions_a, idx_a, afree_pos); 00127 ((CDenseFeatures<uint16_t>*) rhs)->free_feature_vector(positions_b, idx_b, bfree_pos); 00128 00129 return result; 00130 } 00131 00132 float64_t CRegulatoryModulesStringKernel::compute_wds( 00133 char* avec, char* bvec, int32_t len) 00134 { 00135 float64_t* max_shift_vec = SG_MALLOC(float64_t, shift); 00136 float64_t sum0=0 ; 00137 for (int32_t i=0; i<shift; i++) 00138 max_shift_vec[i]=0 ; 00139 00140 // no shift 00141 for (int32_t i=0; i<len; i++) 00142 { 00143 if ((position_weights!=NULL) && (position_weights[i]==0.0)) 00144 continue ; 00145 00146 float64_t sumi = 0.0 ; 00147 for (int32_t j=0; (j<degree) && (i+j<len); j++) 00148 { 00149 if (avec[i+j]!=bvec[i+j]) 00150 break ; 00151 sumi += weights[j]; 00152 } 00153 if (position_weights!=NULL) 00154 sum0 += position_weights[i]*sumi ; 00155 else 00156 sum0 += sumi ; 00157 } ; 00158 00159 for (int32_t i=0; i<len; i++) 00160 { 00161 for (int32_t k=1; (k<=shift) && (i+k<len); k++) 00162 { 00163 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0)) 00164 continue ; 00165 00166 float64_t sumi1 = 0.0 ; 00167 // shift in sequence a 00168 for (int32_t j=0; (j<degree) && (i+j+k<len); j++) 00169 { 00170 if (avec[i+j+k]!=bvec[i+j]) 00171 break ; 00172 sumi1 += weights[j]; 00173 } 00174 float64_t sumi2 = 0.0 ; 00175 // shift in sequence b 00176 for (int32_t j=0; (j<degree) && (i+j+k<len); j++) 00177 { 00178 if (avec[i+j]!=bvec[i+j+k]) 00179 break ; 00180 sumi2 += weights[j]; 00181 } 00182 if (position_weights!=NULL) 00183 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ; 00184 else 00185 max_shift_vec[k-1] += sumi1 + sumi2 ; 00186 } ; 00187 } 00188 00189 float64_t result = sum0 ; 00190 for (int32_t i=0; i<shift; i++) 00191 result += max_shift_vec[i]/(2*(i+1)) ; 00192 00193 SG_FREE(max_shift_vec); 00194 return result ; 00195 } 00196 00197 void CRegulatoryModulesStringKernel::set_wd_weights() 00198 { 00199 ASSERT(degree>0); 00200 00201 SG_FREE(weights); 00202 weights=SG_MALLOC(float64_t, degree); 00203 00204 int32_t i; 00205 float64_t sum=0; 00206 for (i=0; i<degree; i++) 00207 { 00208 weights[i]=degree-i; 00209 sum+=weights[i]; 00210 } 00211 00212 for (i=0; i<degree; i++) 00213 weights[i]/=sum; 00214 } 00215 00216 void CRegulatoryModulesStringKernel::register_params() 00217 { 00218 SG_ADD(&width, "width", "the width of Gaussian kernel part", MS_AVAILABLE); 00219 SG_ADD(°ree, "degree", "the degree of weighted degree kernel part", 00220 MS_AVAILABLE); 00221 SG_ADD(&shift, "shift", 00222 "the shift of weighted degree with shifts kernel part", MS_AVAILABLE); 00223 SG_ADD(&window, "window", "the size of window around motifs", MS_AVAILABLE); 00224 m_parameters->add_vector((CSGObject***)&motif_positions_lhs, &alen, "motif_positions_lhs", "the matrix of motif positions from sequences left-hand side"); 00225 m_parameters->add_vector((CSGObject***)&motif_positions_rhs, &blen, "motif_positions_rhs", "the matrix of motif positions from sequences right-hand side"); 00226 m_parameters->add_vector(&position_weights, °ree, "position_weights", "scaling weights in window"); 00227 m_parameters->add_vector(&weights, °ree, "weights", "weights of WD kernel"); 00228 }