SHOGUN
v2.0.0
|
00001 /* 00002 * Compute the local alignment kernel 00003 * 00004 * Largely based on LAkernel.c (version 0.3) 00005 * 00006 * Copyright 2003 Jean-Philippe Vert 00007 * Copyright 2005 Jean-Philippe Vert, Hiroto Saigo 00008 * 00009 * Shogun specific adjustments Written (W) 2007-2008,2010 Soeren Sonnenburg 00010 * 00011 * Reference: 00012 * H. Saigo, J.-P. Vert, T. Akutsu and N. Ueda, "Protein homology 00013 * detection using string alignment kernels", Bioinformatics, 00014 * vol.20, p.1682-1689, 2004. 00015 * 00016 * This program is free software; you can redistribute it and/or modify 00017 * it under the terms of the GNU General Public License as published by 00018 * the Free Software Foundation; either version 3 of the License, or 00019 * (at your option) any later version. 00020 */ 00021 00022 #include <stdlib.h> 00023 #include <stdio.h> 00024 #include <math.h> 00025 #include <ctype.h> 00026 #include <string.h> 00027 #include <shogun/kernel/string/LocalAlignmentStringKernel.h> 00028 00029 using namespace shogun; 00030 00031 /****************/ 00032 /* The alphabet */ 00033 /****************/ 00034 00035 const int32_t NAA=20; /* Number of amino-acids */ 00036 const int32_t NLET=26; /* Number of letters in the alphabet */ 00037 const char* CLocalAlignmentStringKernel::aaList="ARNDCQEGHILKMFPSTWYV"; /* The list of amino acids */ 00038 00039 /*****************/ 00040 /* SW parameters */ 00041 /*****************/ 00042 00043 /* mutation matrix */ 00044 const int32_t CLocalAlignmentStringKernel::blosum[] = { 00045 6, 00046 -2, 8, 00047 -2, -1, 9, 00048 -3, -2, 2, 9, 00049 -1, -5, -4, -5, 13, 00050 -1, 1, 0, 0, -4, 8, 00051 -1, 0, 0, 2, -5, 3, 7, 00052 0, -3, -1, -2, -4, -3, -3, 8, 00053 -2, 0, 1, -2, -4, 1, 0, -3, 11, 00054 -2, -5, -5, -5, -2, -4, -5, -6, -5, 6, 00055 -2, -3, -5, -5, -2, -3, -4, -5, -4, 2, 6, 00056 -1, 3, 0, -1, -5, 2, 1, -2, -1, -4, -4, 7, 00057 -1, -2, -3, -5, -2, -1, -3, -4, -2, 2, 3, -2, 8, 00058 -3, -4, -5, -5, -4, -5, -5, -5, -2, 0, 1, -5, 0, 9, 00059 -1, -3, -3, -2, -4, -2, -2, -3, -3, -4, -4, -2, -4, -5, 11, 00060 2, -1, 1, 0, -1, 0, 0, 0, -1, -4, -4, 0, -2, -4, -1, 6, 00061 0, -2, 0, -2, -1, -1, -1, -2, -3, -1, -2, -1, -1, -3, -2, 2, 7, 00062 -4, -4, -6, -6, -3, -3, -4, -4, -4, -4, -2, -4, -2, 1, -6, -4, -4, 16, 00063 -3, -3, -3, -5, -4, -2, -3, -5, 3, -2, -2, -3, -1, 4, -4, -3, -2, 3, 10, 00064 0, -4, -4, -5, -1, -3, -4, -5, -5, 4, 1, -3, 1, -1, -4, -2, 0, -4, -2, 6}; 00065 00066 /* Index corresponding to the (i,j) entry (i,j=0..19) in the blosum matrix */ 00067 static int32_t BINDEX(int32_t i, int32_t j) 00068 { 00069 return (((i)>(j))?(j)+(((i)*(i+1))/2):(i)+(((j)*(j+1))/2)); 00070 } 00071 00072 /********************* 00073 * Kernel parameters * 00074 *********************/ 00075 00076 const float64_t SCALING=0.1; /* Factor to scale all SW parameters */ 00077 00078 /* If you want to compute the sum over all local alignments (to get a valid kernel), uncomment the following line : */ 00079 /* If x=log(a) and y=log(b), compute log(a+b) : */ 00080 /* 00081 #define LOGP(x,y) (((x)>(y))?(x)+log1p(exp((y)-(x))):(y)+log1p(exp((x)-(y)))) 00082 */ 00083 00084 #define LOGP(x,y) LogSum(x,y) 00085 00086 /* OR if you want to compute the score of the best local alignment (to get the SW score by Viterbi), uncomment the following line : */ 00087 /* 00088 #define LOGP(x,y) (((x)>(y))?(x):(y)) 00089 */ 00090 00091 /* Usefule constants */ 00092 const float64_t LOG0=-10000; /* log(0) */ 00093 const float64_t INTSCALE=1000.0; /* critical for speed and precise computation*/ 00094 00095 int32_t CLocalAlignmentStringKernel::logsum_lookup[LOGSUM_TBL]; 00096 00097 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(int32_t size) 00098 : CStringKernel<char>(size) 00099 { 00100 SG_UNSTABLE("LocalAlignmentStringKernel"); 00101 init(); 00102 init_static_variables(); 00103 } 00104 00105 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel( 00106 CStringFeatures<char>* l, CStringFeatures<char>* r, 00107 float64_t opening, float64_t extension) 00108 : CStringKernel<char>() 00109 { 00110 SG_UNSTABLE("LocalAlignmentStringKernel"); 00111 init(); 00112 m_opening=opening; 00113 m_extension=extension; 00114 init_static_variables(); 00115 init(l, r); 00116 } 00117 00118 CLocalAlignmentStringKernel::~CLocalAlignmentStringKernel() 00119 { 00120 cleanup(); 00121 } 00122 00123 bool CLocalAlignmentStringKernel::init(CFeatures* l, CFeatures* r) 00124 { 00125 CStringKernel<char>::init(l, r); 00126 initialized=true; 00127 return init_normalizer(); 00128 } 00129 00130 void CLocalAlignmentStringKernel::cleanup() 00131 { 00132 SG_FREE(scaled_blosum); 00133 scaled_blosum=NULL; 00134 00135 SG_FREE(isAA); 00136 isAA=NULL; 00137 SG_FREE(aaIndex); 00138 aaIndex=NULL; 00139 00140 CKernel::cleanup(); 00141 } 00142 00143 /* LogSum - default log funciotion. fast, but not exact */ 00144 /* LogSum2 - precise, but slow. Note that these two functions need different figure types */ 00145 00146 void CLocalAlignmentStringKernel::init_logsum(){ 00147 int32_t i; 00148 for (i=0; i<LOGSUM_TBL; i++) 00149 logsum_lookup[i]=int32_t(INTSCALE* 00150 (log(1.+exp((float32_t)-i/INTSCALE)))); 00151 } 00152 00153 int32_t CLocalAlignmentStringKernel::LogSum(int32_t p1, int32_t p2) 00154 { 00155 int32_t diff; 00156 static int32_t firsttime=1; 00157 00158 if (firsttime) 00159 { 00160 init_logsum(); 00161 firsttime =0; 00162 } 00163 00164 diff=p1-p2; 00165 if (diff>=LOGSUM_TBL) 00166 return p1; 00167 else if (diff<=-LOGSUM_TBL) 00168 return p2; 00169 else if (diff>0) 00170 return p1+logsum_lookup[diff]; 00171 else 00172 return p2+logsum_lookup[-diff]; 00173 } 00174 00175 00176 float32_t CLocalAlignmentStringKernel::LogSum2(float32_t p1, float32_t p2) 00177 { 00178 if (p1>p2) 00179 return (p1-p2>50.) ? p1 : p1+log(1.+exp(p2-p1)); 00180 else 00181 return (p2-p1>50.) ? p2 : p2+log(1.+exp(p1-p2)); 00182 } 00183 00184 00185 void CLocalAlignmentStringKernel::init_static_variables() 00186 /* Initialize all static variables. This function should be called once before computing the first pair HMM score */ 00187 { 00188 register int32_t i; 00189 00190 /* Initialization of the array which gives the position of each amino-acid in the set of amino-acid */ 00191 if ((aaIndex=(int32_t *)calloc(NLET,sizeof(int32_t)))==NULL) 00192 SG_ERROR("run out o memory"); 00193 for (i=0;i<NAA;i++) 00194 aaIndex[aaList[i]-'A']=i; 00195 00196 /* Initialization of the array which indicates whether a char is an amino-acid */ 00197 if ((isAA=(int32_t *)calloc(256,sizeof(int32_t)))==NULL) 00198 SG_ERROR("run out of memory"); 00199 for (i=0;i<NAA;i++) 00200 isAA[(int32_t)aaList[i]]=1; 00201 00202 /* Scale the blossum matrix */ 00203 for (i=0 ; i<NAA*(NAA+1)/2; i++) 00204 scaled_blosum[i]=(int32_t)floor(blosum[i]*SCALING*INTSCALE); 00205 00206 00207 /* Scale of gap penalties */ 00208 m_opening=(int32_t)floor(m_opening*SCALING*INTSCALE); 00209 m_extension=(int32_t)floor(m_extension*SCALING*INTSCALE); 00210 } 00211 00212 00213 00214 /* Implementation of the 00215 * convolution kernel which generalizes the Smith-Waterman algorithm 00216 */ 00217 float64_t CLocalAlignmentStringKernel::LAkernelcompute( 00218 int32_t* aaX, int32_t* aaY, /* the two amino-acid sequences (as sequences of indexes in [0..NAA-1] indicating the position of the amino-acid in the variable 'aaList') */ 00219 int32_t nX, int32_t nY /* the lengths of both sequences */) 00220 { 00221 register int32_t 00222 i,j, /* loop indexes */ 00223 cur, old, /* to indicate the array to use (0 or 1) */ 00224 curpos, frompos; /* position in an array */ 00225 00226 int32_t 00227 *logX, /* arrays to store the log-values of each state */ 00228 *logY, 00229 *logM, 00230 *logX2, 00231 *logY2; 00232 00233 int32_t aux, aux2;/* , aux3 , aux4 , aux5;*/ 00234 int32_t cl;/* length of a column for the dynamic programming */ 00235 00236 /* 00237 printf("now computing pairHMM between %d and %d:\n",nX,nY); 00238 for (i=0;i<nX;printf("%d ",aaX[i++])); 00239 printf("\n and \n"); 00240 for (i=0;i<nY;printf("%d ",aaY[i++])); 00241 printf("\n"); 00242 */ 00243 00244 /* Initialization of the arrays */ 00245 /* Each array stores two successive columns of the (nX+1)x(nY+1) table used in dynamic programming */ 00246 cl=nY+1; /* each column stores the positions in the aaY sequence, plus a position at zero */ 00247 00248 logM=SG_CALLOC(int32_t, 2*cl); 00249 logX=SG_CALLOC(int32_t, 2*cl); 00250 logY=SG_CALLOC(int32_t, 2*cl); 00251 logX2=SG_CALLOC(int32_t, 2*cl); 00252 logY2=SG_CALLOC(int32_t, 2*cl); 00253 00254 /************************************************/ 00255 /* First iteration : initialization of column 0 */ 00256 /************************************************/ 00257 /* The log=proabilities of each state are initialized for the first column (x=0,y=0..nY) */ 00258 00259 for (j=0; j<cl; j++) { 00260 logM[j]=LOG0; 00261 logX[j]=LOG0; 00262 logY[j]=LOG0; 00263 logX2[j]=LOG0; 00264 logY2[j]=LOG0; 00265 } 00266 00267 /* Update column order */ 00268 cur=1; /* Indexes [0..cl-1] are used to process the next column */ 00269 old=0; /* Indexes [cl..2*cl-1] were used for column 0 */ 00270 00271 00272 /************************************************/ 00273 /* Next iterations : processing columns 1 .. nX */ 00274 /************************************************/ 00275 00276 /* Main loop to vary the position in aaX : i=1..nX */ 00277 for (i=1; i<=nX; i++) { 00278 00279 /* Special update for positions (i=1..nX,j=0) */ 00280 curpos=cur*cl; /* index of the state (i,0) */ 00281 logM[curpos]=LOG0; 00282 logX[curpos]=LOG0; 00283 logY[curpos]=LOG0; 00284 logX2[curpos]=LOG0; 00285 logY2[curpos]=LOG0; 00286 00287 /* Secondary loop to vary the position in aaY : j=1..nY */ 00288 for (j=1; j<=nY; j++) { 00289 00290 curpos=cur*cl+j; /* index of the state (i,j) */ 00291 00292 /* Update for states which emit X only */ 00293 /***************************************/ 00294 00295 frompos=old*cl+j; /* index of the state (i-1,j) */ 00296 00297 /* State RX */ 00298 logX[curpos]=LOGP(-m_opening+logM[frompos], -m_extension+logX[frompos]); 00299 /* printf("%.5f\n",logX[curpos]);*/ 00300 /* printf("%.5f\n",logX_B[curpos]);*/ 00301 /* State RX2 */ 00302 logX2[curpos]=LOGP(logM[frompos], logX2[frompos]); 00303 00304 /* Update for states which emit Y only */ 00305 /***************************************/ 00306 00307 frompos=cur*cl+j-1; /* index of the state (i,j-1) */ 00308 00309 /* State RY */ 00310 aux=LOGP(-m_opening+logM[frompos], -m_extension+logY[frompos]); 00311 logY[curpos] = LOGP(aux, -m_opening+logX[frompos]); 00312 00313 /* State RY2 */ 00314 aux=LOGP(logM[frompos], logY2[frompos]); 00315 logY2[curpos]=LOGP(aux, logX2[frompos]); 00316 00317 /* Update for states which emit X and Y */ 00318 /****************************************/ 00319 00320 frompos=old*cl+j-1; /* index of the state (i-1,j-1) */ 00321 00322 aux=LOGP(logX[frompos], logY[frompos]); 00323 aux2=LOGP(0, logM[frompos]); 00324 logM[curpos]=LOGP(aux, aux2)+scaled_blosum[BINDEX(aaX[i-1], aaY[j-1])]; 00325 00326 /* 00327 printf("i=%d , j=%d\nM=%.5f\nX=%.5f\nY=%.5f\nX2=%.5f\nY2=%.5f\n",i,j,logM[curpos],logX[curpos],logY[curpos],logX2[curpos],logY2[curpos]); 00328 */ 00329 00330 } /* end of j=1:nY loop */ 00331 00332 00333 /* Update the culumn order */ 00334 cur=1-cur; 00335 old=1-old; 00336 00337 } /* end of j=1:nX loop */ 00338 00339 00340 /* Termination */ 00341 /***************/ 00342 00343 curpos=old*cl+nY; /* index of the state (nX,nY) */ 00344 aux=LOGP(logX2[curpos], logY2[curpos]); 00345 aux2=LOGP(0, logM[curpos]); 00346 /* kernel_value = LOGP( aux , aux2 );*/ 00347 00348 /* Memory release */ 00349 SG_FREE(logM); 00350 SG_FREE(logX); 00351 SG_FREE(logY); 00352 SG_FREE(logX2); 00353 SG_FREE(logY2); 00354 00355 /* Return the logarithm of the kernel */ 00356 return (float32_t)LOGP(aux,aux2)/INTSCALE; 00357 } 00358 00359 /********************/ 00360 /* Public functions */ 00361 /********************/ 00362 00363 00364 /* Return the log-probability of two sequences x and y under a pair HMM model */ 00365 /* x and y are strings of aminoacid letters, e.g., "AABRS" */ 00366 float64_t CLocalAlignmentStringKernel::compute(int32_t idx_x, int32_t idx_y) 00367 { 00368 int32_t *aax, *aay; /* to convert x and y into sequences of amino-acid indexes */ 00369 int32_t lx=0, ly=0; /* lengths of x and y */ 00370 int32_t i, j; 00371 00372 bool free_x, free_y; 00373 char* x=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_x, lx, free_x); 00374 char* y=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_y, ly, free_y); 00375 ASSERT(x && y); 00376 00377 if ( (lx<1) || (ly<1) ) 00378 SG_ERROR("empty chain"); 00379 00380 /* Create aax and aay */ 00381 00382 if ((aax=(int32_t *)calloc(lx,sizeof(int32_t)))==NULL) 00383 SG_ERROR("run out of memory"); 00384 if ((aay=(int32_t *)calloc(ly,sizeof(int32_t)))==NULL) 00385 SG_ERROR("run out of memory"); 00386 00387 /* Extract the characters corresponding to aminoacids and keep their indexes */ 00388 00389 j=0; 00390 for (i=0; i<lx; i++) 00391 if (isAA[toupper(x[i])]) 00392 aax[j++]=aaIndex[toupper(x[i])-'A']; 00393 lx=j; 00394 j=0; 00395 for (i=0; i<ly; i++) 00396 if (isAA[toupper(y[i])]) 00397 aay[j++]=aaIndex[toupper(y[i])-'A']; 00398 ly=j; 00399 00400 00401 /* Compute the pair HMM score */ 00402 float64_t result=LAkernelcompute(aax, aay, lx, ly); 00403 00404 /* Release memory */ 00405 SG_FREE(aax); 00406 SG_FREE(aay); 00407 00408 ((CStringFeatures<char>*)lhs)->free_feature_vector(x, idx_x, free_x); 00409 ((CStringFeatures<char>*)rhs)->free_feature_vector(y, idx_y, free_y); 00410 00411 return result; 00412 } 00413 00414 void CLocalAlignmentStringKernel::init() 00415 { 00416 initialized=false; 00417 isAA=NULL; 00418 aaIndex=NULL; 00419 00420 m_opening=10; 00421 m_extension=2; 00422 00423 scaled_blosum=SG_CALLOC(int32_t, sizeof(blosum)); 00424 init_logsum(); 00425 00426 SG_ADD(&initialized, "initialized", "If kernel is initalized.", 00427 MS_NOT_AVAILABLE); 00428 SG_ADD(&m_opening, "opening", "Opening gap opening penalty.", MS_AVAILABLE); 00429 SG_ADD(&m_extension, "extension", "Extension gap extension penalty.", 00430 MS_AVAILABLE); 00431 }