SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2010 Soeren Sonnenburg 00008 * Copyright (C) 2010 Berlin Institute of Technology 00009 * 00010 * Based on code from Pavel Kuksa <pkuksa@cs.rutgers.edu> and 00011 * Vladimir Pavlovic <vladimir@cs.rutgers.edu> originally 00012 * released under the new BSD License. 00013 */ 00014 00015 #include <shogun/lib/common.h> 00016 #include <shogun/io/SGIO.h> 00017 #include <shogun/mathematics/Math.h> 00018 #include <shogun/kernel/string/SparseSpatialSampleStringKernel.h> 00019 #include <shogun/features/StringFeatures.h> 00020 00021 using namespace shogun; 00022 00023 CSparseSpatialSampleStringKernel::CSparseSpatialSampleStringKernel() 00024 : CStringKernel<char>(0), t(2), d(5) 00025 { 00026 } 00027 00028 CSparseSpatialSampleStringKernel::CSparseSpatialSampleStringKernel(CStringFeatures<char>* l, 00029 CStringFeatures<char>* r) : CStringKernel<char>(0), t(2), d(5) 00030 { 00031 init(l, r); 00032 } 00033 00034 bool CSparseSpatialSampleStringKernel::init(CFeatures* l, CFeatures* r) 00035 { 00036 CStringKernel<char>::init(l, r); 00037 return init_normalizer(); 00038 } 00039 00040 void CSparseSpatialSampleStringKernel::cleanup() 00041 { 00042 CKernel::cleanup(); 00043 } 00044 00045 CSparseSpatialSampleStringKernel::~CSparseSpatialSampleStringKernel() 00046 { 00047 } 00048 00049 SSKFeatures *CSparseSpatialSampleStringKernel::extractTriple(int **S, int *len, int nStr, int d1, int d2) 00050 { 00051 int i, j; 00052 int n, nfeat; 00053 int *group; 00054 int *features; 00055 int *s; 00056 int c; 00057 SSKFeatures *F; 00058 00059 nfeat = 0; 00060 for (i = 0; i < nStr; ++i) 00061 nfeat += len[i] - d1 -d2; 00062 group = SG_MALLOC(int, nfeat); 00063 features = SG_MALLOC(int, nfeat*3); 00064 c = 0; 00065 for (i = 0; i < nStr; ++i) 00066 { 00067 n = len[i] - d1 - d2; 00068 s = S[i]; 00069 for (j = 0; j < n; ++j) 00070 { 00071 features[c] = s[j]; 00072 features[c+nfeat] = s[j+d1]; 00073 features[c+2*nfeat] = s[j+d1+d2]; 00074 group[c] = i; 00075 c++; 00076 } 00077 } 00078 ASSERT(nfeat==c); 00079 F = SG_MALLOC(SSKFeatures, 1); 00080 (*F).features = features; 00081 (*F).group = group; 00082 (*F).n = nfeat; 00083 return F; 00084 } 00085 00086 00087 SSKFeatures *CSparseSpatialSampleStringKernel::extractDouble(int **S, int *len, int nStr, int d1) 00088 { 00089 int i, j; 00090 int n, nfeat; 00091 int *group; 00092 int *features; 00093 int *s; 00094 int c; 00095 SSKFeatures *F; 00096 00097 nfeat = 0; 00098 for (i = 0; i < nStr; ++i) 00099 nfeat += len[i] - d1; 00100 group = SG_MALLOC(int, nfeat); 00101 features = SG_MALLOC(int, nfeat*2); 00102 c = 0; 00103 for (i = 0; i < nStr; ++i) 00104 { 00105 n = len[i] - d1; 00106 s = S[i]; 00107 for (j = 0; j < n; ++j) 00108 { 00109 features[c] = s[j]; 00110 features[c+nfeat] = s[j+d1]; 00111 group[c] = i; 00112 c++; 00113 } 00114 } 00115 if (nfeat!=c) 00116 printf("Something is wrong...\n"); 00117 F = SG_MALLOC(SSKFeatures, 1); 00118 (*F).features = features; 00119 (*F).group = group; 00120 (*F).n = nfeat; 00121 return F; 00122 } 00123 00124 00125 void CSparseSpatialSampleStringKernel::compute_double(int32_t idx_a, int32_t idx_b) 00126 { 00127 int d1; 00128 SSKFeatures *features; 00129 int *sortIdx; 00130 int *features_srt; 00131 int *group_srt; 00132 int maxIdx; 00133 int **S=NULL; 00134 int *len=NULL; 00135 int nStr=0, nfeat; 00136 int i; 00137 int na=0; 00138 int *K=NULL; 00139 00140 for (d1 = 1; d1 <= d; ++d1) 00141 { 00142 if ( isVerbose ) printf("Extracting features..."), fflush( stdout ); 00143 features = extractDouble(S,len,nStr,d1); 00144 nfeat = (*features).n; 00145 printf("d=%d: %d features\n", d1, nfeat); 00146 maxIdx = 0; 00147 for (i = 0; i < nfeat*2; ++i) 00148 maxIdx = maxIdx > (*features).features[i] ? maxIdx : (*features).features[i]; 00149 if (na < maxIdx+1) 00150 { 00151 printf("WARNING: Sequence elements are outside the specified range [0,%d]\n",na); 00152 printf("\tUsing [0,%d] instead\n", maxIdx); 00153 na = maxIdx+1; 00154 } 00155 if (isVerbose) 00156 { 00157 printf("done.\n"); 00158 printf("Sorting..."); 00159 fflush( stdout ); 00160 } 00161 sortIdx = cntsrtna((*features).features,2,(*features).n,na); 00162 if (isVerbose) printf("done.\n"); 00163 features_srt = SG_MALLOC(int, nfeat*2); 00164 group_srt = SG_MALLOC(int, nfeat); 00165 for (i = 0; i < nfeat; ++i) 00166 { 00167 features_srt[i]=(*features).features[sortIdx[i]]; 00168 features_srt[i+nfeat]=(*features).features[sortIdx[i]+nfeat]; 00169 group_srt[i] = (*features).group[sortIdx[i]]; 00170 } 00171 SG_FREE(sortIdx); 00172 SG_FREE((*features).features); 00173 SG_FREE((*features).group); 00174 SG_FREE(features); 00175 if (isVerbose) 00176 { 00177 printf("Counting..."); 00178 fflush( stdout ); 00179 } 00180 countAndUpdate(K,features_srt,group_srt,2,nfeat,nStr); 00181 SG_FREE(features_srt); 00182 SG_FREE(group_srt); 00183 if (isVerbose) 00184 { 00185 printf("done."); 00186 } 00187 } 00188 } 00189 00190 void CSparseSpatialSampleStringKernel::compute_triple(int32_t idx_a, int32_t idx_b) 00191 { 00192 int d1, d2; 00193 SSKFeatures *features; 00194 int *sortIdx; 00195 int *features_srt; 00196 int *group_srt; 00197 int maxIdx; 00198 int **S=NULL; 00199 int *len=NULL; 00200 int nStr=0, nfeat; 00201 int i; 00202 int na=0; 00203 int *K=NULL; 00204 00205 for (d1 = 1; d1 <= d; ++d1) 00206 { 00207 for (d2 = 1; d2 <= d; ++d2) 00208 { 00209 if (isVerbose) 00210 { 00211 printf("Extracting features..."); 00212 fflush( stdout ); 00213 } 00214 features = extractTriple(S,len,nStr,d1,d2); 00215 nfeat = (*features).n; 00216 printf("(%d,%d): %d features\n", d1, d2, nfeat); 00217 maxIdx = 0; 00218 for (i = 0; i < nfeat*3; ++i) 00219 maxIdx = maxIdx > (*features).features[i] ? maxIdx : (*features).features[i]; 00220 if (na < maxIdx+1) 00221 { 00222 printf("WARNING: Sequence elements are outside the specified range [0,%d]\n",na); 00223 printf("\tUsing [0,%d] instead\n", maxIdx); 00224 na = maxIdx+1; 00225 } 00226 if (isVerbose) 00227 { 00228 printf("done.\n"); 00229 printf("Sorting..."); 00230 fflush( stdout ); 00231 } 00232 sortIdx = cntsrtna((*features).features,3,(*features).n,na); 00233 if (isVerbose) 00234 { 00235 printf("done.\n"); 00236 } 00237 features_srt = SG_MALLOC(int, nfeat*3); 00238 group_srt = SG_MALLOC(int, nfeat); 00239 for (i = 0; i < nfeat; ++i) 00240 { 00241 features_srt[i]=(*features).features[sortIdx[i]]; 00242 features_srt[i+nfeat]=(*features).features[sortIdx[i]+nfeat]; 00243 features_srt[i+2*nfeat]=(*features).features[sortIdx[i]+2*nfeat]; 00244 group_srt[i] = (*features).group[sortIdx[i]]; 00245 } 00246 SG_FREE(sortIdx); 00247 SG_FREE((*features).features); 00248 SG_FREE((*features).group); 00249 SG_FREE(features); 00250 if (isVerbose) 00251 { 00252 printf("Counting..."); 00253 fflush( stdout ); 00254 } 00255 countAndUpdate(K,features_srt,group_srt,3,nfeat,nStr); 00256 SG_FREE(features_srt); 00257 SG_FREE(group_srt); 00258 if (isVerbose) 00259 { 00260 printf("done.\n"); 00261 } 00262 } 00263 } 00264 } 00265 00266 void CSparseSpatialSampleStringKernel::countAndUpdate(int *outK, int *sx, int *g, int k, int r, int nStr) 00267 { 00268 char same; 00269 int i, j; 00270 int cu; 00271 long int ucnt; 00272 long int startInd, endInd, j1; 00273 int *curfeat, *ucnts, *updind; 00274 00275 curfeat = SG_MALLOC(int, k); 00276 ucnts = SG_MALLOC(int, nStr); 00277 updind = SG_MALLOC(int, nStr); 00278 i = 0; 00279 ucnt = 0; 00280 while (i<r) 00281 { 00282 for (j = 0; j < k; ++j) 00283 curfeat[j]=sx[i+j*r]; 00284 same=1; 00285 for (j = 0;j < k; ++j) 00286 if (curfeat[j]!=sx[i+j*r]) 00287 { 00288 same=0; 00289 break; 00290 } 00291 startInd=i; 00292 while (same && i<r) 00293 { 00294 i++; 00295 if (i >= r) break; 00296 same = 1; 00297 for (j = 0; j < k; ++j) 00298 if (curfeat[j]!=sx[i+j*r]) 00299 { 00300 same=0; 00301 break; 00302 } 00303 } 00304 endInd= (i<r) ? (i-1):(r-1); 00305 ucnt++; 00306 if ((long int)endInd-startInd+1>2*nStr) 00307 { 00308 for (j = 0; j < nStr; ++j) ucnts[j]=0; 00309 for (j = startInd;j <= endInd; ++j) ucnts[g[j]]++; 00310 cu=0; 00311 for (j=0;j<nStr;j++) 00312 { 00313 if (ucnts[j]>0) 00314 { 00315 updind[cu] = j; 00316 cu++; 00317 } 00318 } 00319 for (j=0;j<cu;j++) 00320 for (j1=0;j1<cu;j1++) 00321 outK[updind[j]+updind[j1]*nStr]+=ucnts[updind[j]]*ucnts[updind[j1]]; 00322 } 00323 else 00324 { 00325 for (j = startInd;j <= endInd; ++j) 00326 for (j1 = startInd;j1 <= endInd; ++j1) 00327 outK[ g[j]+nStr*g[j1] ]++; 00328 } 00329 } 00330 SG_FREE(updind); 00331 SG_FREE(ucnts); 00332 SG_FREE(curfeat); 00333 } 00334 00335 int *CSparseSpatialSampleStringKernel::cntsrtna(int *sx, int k, int r, int na) 00336 { 00337 int *sxc, *bc, *sxl, *cc, *regroup; 00338 int i, j; 00339 00340 sxc = SG_MALLOC(int, na); 00341 bc = SG_MALLOC(int, na); 00342 sxl = SG_MALLOC(int, r); 00343 cc = SG_MALLOC(int, r); 00344 regroup = SG_MALLOC(int, r); 00345 00346 for (i = 0; i < r; ++i) 00347 regroup[i]=i; 00348 for (j = k-1; j >= 0; --j) 00349 { 00350 for(i = 0; i < na; ++i) 00351 sxc[i]=0; 00352 for (i = 0; i < r; ++i) 00353 { 00354 cc[i]=sx[regroup[i]+j*r]; 00355 sxc[ cc[i] ]++; 00356 } 00357 bc[0]=0; 00358 for (i = 1;i < na; ++i) 00359 bc[i]=bc[i-1]+sxc[i-1]; 00360 for (i = 0; i < r; ++i) 00361 sxl[bc[ cc[i] ]++] = regroup[i]; 00362 for (i = 0; i < r; ++i) 00363 regroup[i] = sxl[i]; 00364 } 00365 SG_FREE(sxl); SG_FREE(bc); SG_FREE(sxc); SG_FREE(cc); 00366 00367 return regroup; 00368 } 00369 00370 float64_t CSparseSpatialSampleStringKernel::compute(int32_t idx_a, int32_t idx_b) 00371 { 00372 if (t==2) 00373 compute_double(idx_a, idx_b); 00374 if (t==3) 00375 compute_triple(idx_a, idx_b); 00376 00377 SG_ERROR("t out of range - shouldn't happen\n"); 00378 return -1; 00379 }