SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2012 Heiko Strathmann 00008 */ 00009 00010 #include <shogun/statistics/QuadraticTimeMMD.h> 00011 #include <shogun/features/Features.h> 00012 #include <shogun/mathematics/Statistics.h> 00013 #include <shogun/kernel/Kernel.h> 00014 00015 using namespace shogun; 00016 00017 CQuadraticTimeMMD::CQuadraticTimeMMD() : CKernelTwoSampleTestStatistic() 00018 { 00019 init(); 00020 } 00021 00022 CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p_and_q, 00023 index_t q_start) : 00024 CKernelTwoSampleTestStatistic(kernel, p_and_q, q_start) 00025 { 00026 init(); 00027 00028 if (p_and_q && q_start!=p_and_q->get_num_vectors()/2) 00029 { 00030 SG_ERROR("CQuadraticTimeMMD: Only features with equal number of vectors " 00031 "are currently possible\n"); 00032 } 00033 } 00034 00035 CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p, 00036 CFeatures* q) : CKernelTwoSampleTestStatistic(kernel, p, q) 00037 { 00038 init(); 00039 00040 if (p && q && p->get_num_vectors()!=q->get_num_vectors()) 00041 { 00042 SG_ERROR("CQuadraticTimeMMD: Only features with equal number of vectors " 00043 "are currently possible\n"); 00044 } 00045 } 00046 00047 CQuadraticTimeMMD::~CQuadraticTimeMMD() 00048 { 00049 00050 } 00051 00052 void CQuadraticTimeMMD::init() 00053 { 00054 SG_ADD(&m_num_samples_spectrum, "num_samples_spectrum", "Number of samples" 00055 " for spectrum method null-distribution approximation", 00056 MS_NOT_AVAILABLE); 00057 SG_ADD(&m_num_eigenvalues_spectrum, "num_eigenvalues_spectrum", "Number of " 00058 " Eigenvalues for spectrum method null-distribution approximation", 00059 MS_NOT_AVAILABLE); 00060 SG_ADD((machine_int_t*)&m_statistic_type, "statistic_type", 00061 "Biased or unbiased MMD", MS_NOT_AVAILABLE); 00062 00063 m_num_samples_spectrum=0; 00064 m_num_eigenvalues_spectrum=0; 00065 m_statistic_type=UNBIASED; 00066 } 00067 00068 float64_t CQuadraticTimeMMD::compute_unbiased_statistic() 00069 { 00070 /* split computations into three terms from JLMR paper (see documentation )*/ 00071 index_t m=m_q_start; 00072 00073 /* init kernel with features */ 00074 m_kernel->init(m_p_and_q, m_p_and_q); 00075 00076 /* first term */ 00077 float64_t first=0; 00078 for (index_t i=0; i<m; ++i) 00079 { 00080 /* ensure i!=j while adding up */ 00081 for (index_t j=0; j<m; ++j) 00082 first+=i==j ? 0 : m_kernel->kernel(i,j); 00083 } 00084 first/=(m-1); 00085 00086 /* second term */ 00087 float64_t second=0; 00088 for (index_t i=m_q_start; i<m_q_start+m; ++i) 00089 { 00090 /* ensure i!=j while adding up */ 00091 for (index_t j=m_q_start; j<m_q_start+m; ++j) 00092 second+=i==j ? 0 : m_kernel->kernel(i,j); 00093 } 00094 second/=(m-1); 00095 00096 /* third term */ 00097 float64_t third=0; 00098 for (index_t i=0; i<m; ++i) 00099 { 00100 for (index_t j=m_q_start; j<m_q_start+m; ++j) 00101 third+=m_kernel->kernel(i,j); 00102 } 00103 third*=2.0/m; 00104 00105 return first+second-third; 00106 } 00107 00108 float64_t CQuadraticTimeMMD::compute_biased_statistic() 00109 { 00110 /* split computations into three terms from JLMR paper (see documentation )*/ 00111 index_t m=m_q_start; 00112 00113 /* init kernel with features */ 00114 m_kernel->init(m_p_and_q, m_p_and_q); 00115 00116 /* first term */ 00117 float64_t first=0; 00118 for (index_t i=0; i<m; ++i) 00119 { 00120 for (index_t j=0; j<m; ++j) 00121 first+=m_kernel->kernel(i,j); 00122 } 00123 first/=m; 00124 00125 /* second term */ 00126 float64_t second=0; 00127 for (index_t i=m_q_start; i<m_q_start+m; ++i) 00128 { 00129 for (index_t j=m_q_start; j<m_q_start+m; ++j) 00130 second+=m_kernel->kernel(i,j); 00131 } 00132 second/=m; 00133 00134 /* third term */ 00135 float64_t third=0; 00136 for (index_t i=0; i<m; ++i) 00137 { 00138 for (index_t j=m_q_start; j<m_q_start+m; ++j) 00139 third+=m_kernel->kernel(i,j); 00140 } 00141 third*=2.0/m; 00142 00143 return first+second-third; 00144 } 00145 00146 float64_t CQuadraticTimeMMD::compute_statistic() 00147 { 00148 if (!m_kernel) 00149 SG_ERROR("%s::compute_statistic(): No kernel specified!\n", get_name()); 00150 00151 float64_t result=0; 00152 switch (m_statistic_type) 00153 { 00154 case UNBIASED: 00155 result=compute_unbiased_statistic(); 00156 break; 00157 case BIASED: 00158 result=compute_biased_statistic(); 00159 break; 00160 default: 00161 SG_ERROR("CQuadraticTimeMMD::compute_statistic(): Unknown statistic " 00162 "type!\n"); 00163 break; 00164 } 00165 00166 return result; 00167 } 00168 00169 float64_t CQuadraticTimeMMD::compute_p_value(float64_t statistic) 00170 { 00171 float64_t result=0; 00172 00173 switch (m_null_approximation_method) 00174 { 00175 case MMD2_SPECTRUM: 00176 { 00177 #ifdef HAVE_LAPACK 00178 /* get samples from null-distribution and compute p-value of statistic */ 00179 SGVector<float64_t> null_samples=sample_null_spectrum( 00180 m_num_samples_spectrum, m_num_eigenvalues_spectrum); 00181 CMath::qsort(null_samples); 00182 index_t pos=CMath::find_position_to_insert(null_samples, statistic); 00183 result=1.0-((float64_t)pos)/null_samples.vlen; 00184 #else // HAVE_LAPACK 00185 SG_ERROR("CQuadraticTimeMMD::compute_p_value(): Only possible if " 00186 "shogun is compiled with LAPACK enabled\n"); 00187 #endif // HAVE_LAPACK 00188 break; 00189 } 00190 00191 case MMD2_GAMMA: 00192 { 00193 /* fit gamma and return cdf at statistic */ 00194 SGVector<float64_t> params=fit_null_gamma(); 00195 result=CStatistics::gamma_cdf(statistic, params[0], params[1]); 00196 break; 00197 } 00198 00199 default: 00200 result=CKernelTwoSampleTestStatistic::compute_p_value(statistic); 00201 break; 00202 } 00203 00204 return result; 00205 } 00206 00207 float64_t CQuadraticTimeMMD::compute_threshold(float64_t alpha) 00208 { 00209 float64_t result=0; 00210 00211 switch (m_null_approximation_method) 00212 { 00213 case MMD2_SPECTRUM: 00214 { 00215 #ifdef HAVE_LAPACK 00216 /* get samples from null-distribution and compute threshold */ 00217 SGVector<float64_t> null_samples=sample_null_spectrum( 00218 m_num_samples_spectrum, m_num_eigenvalues_spectrum); 00219 CMath::qsort(null_samples); 00220 result=null_samples[CMath::floor(null_samples.vlen*(1-alpha))]; 00221 #else // HAVE_LAPACK 00222 SG_ERROR("CQuadraticTimeMMD::compute_threshold(): Only possible if " 00223 "shogun is compiled with LAPACK enabled\n"); 00224 #endif // HAVE_LAPACK 00225 break; 00226 } 00227 00228 case MMD2_GAMMA: 00229 { 00230 /* fit gamma and return inverse cdf at alpha */ 00231 SGVector<float64_t> params=fit_null_gamma(); 00232 result=CStatistics::inverse_gamma_cdf(alpha, params[0], params[1]); 00233 break; 00234 } 00235 00236 default: 00237 /* bootstrapping is handled here */ 00238 result=CKernelTwoSampleTestStatistic::compute_threshold(alpha); 00239 break; 00240 } 00241 00242 return result; 00243 } 00244 00245 00246 #ifdef HAVE_LAPACK 00247 SGVector<float64_t> CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples, 00248 index_t num_eigenvalues) 00249 { 00250 if (m_q_start!=m_p_and_q->get_num_vectors()/2) 00251 { 00252 SG_ERROR("%s::sample_null_spectrum(): Currently, only equal " 00253 "sample sizes are supported\n", get_name()); 00254 } 00255 00256 if (num_samples<=2) 00257 { 00258 SG_ERROR("%s::sample_null_spectrum(): Number of samples has to be at" 00259 " least 2, better in the hundrets", get_name()); 00260 } 00261 00262 if (num_eigenvalues>2*m_q_start-1) 00263 { 00264 SG_ERROR("%s::sample_null_spectrum(): Number of Eigenvalues too large\n", 00265 get_name()); 00266 } 00267 00268 if (num_eigenvalues<1) 00269 { 00270 SG_ERROR("%s::sample_null_spectrum(): Number of Eigenvalues too small\n", 00271 get_name()); 00272 } 00273 00274 /* evtl. warn user not to use wrong statistic type */ 00275 if (m_statistic_type!=BIASED) 00276 { 00277 SG_WARNING("%s::sample_null_spectrum(): Note: provided statistic has " 00278 "to be BIASED. Please ensure that! To get rid of warning," 00279 "call %s::set_statistic_type(BIASED)\n", get_name(), 00280 get_name()); 00281 } 00282 00283 /* imaginary matrix K=[K KL; KL' L] (MATLAB notation) 00284 * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX 00285 * works since X and Y are concatenated here */ 00286 m_kernel->init(m_p_and_q, m_p_and_q); 00287 SGMatrix<float64_t> K=m_kernel->get_kernel_matrix(); 00288 00289 /* center matrix K=H*K*H */ 00290 K.center(); 00291 00292 /* compute eigenvalues and select num_eigenvalues largest ones */ 00293 SGVector<float64_t> eigenvalues= 00294 SGMatrix<float64_t>::compute_eigenvectors(K); 00295 SGVector<float64_t> largest_ev(num_eigenvalues); 00296 00297 /* take largest EV, scale by 1/2/m on the fly and take abs value*/ 00298 for (index_t i=0; i<num_eigenvalues; ++i) 00299 largest_ev[i]=CMath::abs( 00300 1.0/2/m_q_start*eigenvalues[eigenvalues.vlen-1-i]); 00301 00302 /* finally, sample from null distribution */ 00303 SGVector<float64_t> null_samples(num_samples); 00304 for (index_t i=0; i<num_samples; ++i) 00305 { 00306 /* 2*sum(kEigs.*(randn(length(kEigs),1)).^2); */ 00307 null_samples[i]=0; 00308 for (index_t j=0; j<largest_ev.vlen; ++j) 00309 null_samples[i]+=largest_ev[j]*CMath::pow(CMath::randn_double(), 2); 00310 00311 null_samples[i]*=2; 00312 } 00313 00314 return null_samples; 00315 } 00316 #endif // HAVE_LAPACK 00317 00318 SGVector<float64_t> CQuadraticTimeMMD::fit_null_gamma() 00319 { 00320 if (m_q_start!=m_p_and_q->get_num_vectors()/2) 00321 { 00322 SG_ERROR("%s::compute_p_value_gamma(): Currently, only equal " 00323 "sample sizes are supported\n", get_name()); 00324 } 00325 00326 /* evtl. warn user not to use wrong statistic type */ 00327 if (m_statistic_type!=BIASED) 00328 { 00329 SG_WARNING("%s::compute_p_value(): Note: provided statistic has " 00330 "to be BIASED. Please ensure that! To get rid of warning," 00331 "call %s::set_statistic_type(BIASED)\n", get_name(), 00332 get_name()); 00333 } 00334 00335 /* imaginary matrix K=[K KL; KL' L] (MATLAB notation) 00336 * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX 00337 * works since X and Y are concatenated here */ 00338 m_kernel->init(m_p_and_q, m_p_and_q); 00339 00340 /* compute mean under H0 of MMD, which is 00341 * meanMMD = 2/m * ( 1 - 1/m*sum(diag(KL)) ); 00342 * in MATLAB. 00343 * Remove diagonals on the fly */ 00344 float64_t mean_mmd=0; 00345 for (index_t i=0; i<m_q_start; ++i) 00346 { 00347 /* virtual KL matrix is in upper right corner of SHOGUN K matrix 00348 * so this sums the diagonal of the matrix between X and Y*/ 00349 mean_mmd+=m_kernel->kernel(i, m_q_start+i); 00350 } 00351 mean_mmd=2.0/m_q_start*(1.0-1.0/m_q_start*mean_mmd); 00352 00353 /* compute variance under H0 of MMD, which is 00354 * varMMD = 2/m/(m-1) * 1/m/(m-1) * sum(sum( (K + L - KL - KL').^2 )); 00355 * in MATLAB, so sum up all elements */ 00356 float64_t var_mmd=0; 00357 for (index_t i=0; i<m_q_start; ++i) 00358 { 00359 for (index_t j=0; j<m_q_start; ++j) 00360 { 00361 /* dont add diagonal of all pairs of imaginary kernel matrices */ 00362 if (i==j || m_q_start+i==j || m_q_start+j==i) 00363 continue; 00364 00365 float64_t to_add=m_kernel->kernel(i, j); 00366 to_add+=m_kernel->kernel(m_q_start+i, m_q_start+j); 00367 to_add-=m_kernel->kernel(i, m_q_start+j); 00368 to_add-=m_kernel->kernel(m_q_start+i, j); 00369 var_mmd+=CMath::pow(to_add, 2); 00370 } 00371 } 00372 var_mmd*=2.0/m_q_start/(m_q_start-1)*1.0/m_q_start/(m_q_start-1); 00373 00374 /* parameters for gamma distribution */ 00375 float64_t a=CMath::pow(mean_mmd, 2)/var_mmd; 00376 float64_t b=var_mmd*m_q_start / mean_mmd; 00377 00378 SGVector<float64_t> result(2); 00379 result[0]=a; 00380 result[1]=b; 00381 00382 return result; 00383 } 00384 00385 void CQuadraticTimeMMD::set_num_samples_sepctrum(index_t 00386 num_samples_spectrum) 00387 { 00388 m_num_samples_spectrum=num_samples_spectrum; 00389 } 00390 00391 void CQuadraticTimeMMD::set_num_eigenvalues_spectrum( 00392 index_t num_eigenvalues_spectrum) 00393 { 00394 m_num_eigenvalues_spectrum=num_eigenvalues_spectrum; 00395 } 00396 00397 void CQuadraticTimeMMD::set_statistic_type(EQuadraticMMDType 00398 statistic_type) 00399 { 00400 m_statistic_type=statistic_type; 00401 }