SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
LinearTimeMMD.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2012 Heiko Strathmann
00008  */
00009 
00010 #include <shogun/statistics/LinearTimeMMD.h>
00011 #include <shogun/features/Features.h>
00012 #include <shogun/mathematics/Statistics.h>
00013 #include <shogun/features/CombinedFeatures.h>
00014 #include <shogun/kernel/CombinedKernel.h>
00015 
00016 #include <shogun/lib/external/libqp.h>
00017 
00018 using namespace shogun;
00019 
00020 CLinearTimeMMD::CLinearTimeMMD() :
00021         CKernelTwoSampleTestStatistic()
00022 {
00023     init();
00024 }
00025 
00026 CLinearTimeMMD::CLinearTimeMMD(CKernel* kernel, CFeatures* p_and_q,
00027         index_t q_start) :
00028         CKernelTwoSampleTestStatistic(kernel, p_and_q, q_start)
00029 {
00030     init();
00031 
00032     if (p_and_q && q_start!=p_and_q->get_num_vectors()/2)
00033     {
00034         SG_ERROR("CLinearTimeMMD: Only features with equal number of vectors "
00035                 "are currently possible\n");
00036     }
00037 }
00038 
00039 CLinearTimeMMD::CLinearTimeMMD(CKernel* kernel, CFeatures* p, CFeatures* q) :
00040         CKernelTwoSampleTestStatistic(kernel, p, q)
00041 {
00042     init();
00043 
00044     if (p->get_num_vectors()!=q->get_num_vectors())
00045     {
00046         SG_ERROR("CLinearTimeMMD: Only features with equal number of vectors "
00047                 "are currently possible\n");
00048     }
00049 }
00050 
00051 CLinearTimeMMD::~CLinearTimeMMD()
00052 {
00053 
00054 }
00055 
00056 void CLinearTimeMMD::init()
00057 {
00058     SG_ADD(&m_opt_max_iterations, "opt_max_iterations", "Maximum number of "
00059             "iterations for qp solver", MS_NOT_AVAILABLE);
00060     SG_ADD(&m_opt_epsilon, "opt_epsilon", "Stopping criterion for qp solver",
00061             MS_NOT_AVAILABLE);
00062     SG_ADD(&m_opt_low_cut, "opt_low_cut", "Low cut value for optimization "
00063             "kernel weights", MS_NOT_AVAILABLE);
00064     SG_ADD(&m_opt_regularization_eps, "opt_regularization_eps", "Regularization"
00065             " value that is added to diagonal of Q matrix", MS_NOT_AVAILABLE);
00066 
00067     m_opt_max_iterations=10000;
00068     m_opt_epsilon=10E-15;
00069     m_opt_low_cut=10E-7;
00070     m_opt_regularization_eps=0;
00071 }
00072 
00073 float64_t CLinearTimeMMD::compute_statistic()
00074 {
00075     SG_DEBUG("entering CLinearTimeMMD::compute_statistic()\n");
00076 
00077     REQUIRE(m_p_and_q, "%s::compute_statistic: features needed!\n", get_name());
00078 
00079     REQUIRE(m_kernel, "%s::compute_statistic: kernel needed!\n", get_name());
00080 
00081     /* m is number of samples from each distribution, m_2 is half of it
00082      * using names from JLMR paper (see class documentation) */
00083     index_t m=m_q_start;
00084     index_t m_2=m/2;
00085 
00086     SG_DEBUG("m_q_start=%d\n", m_q_start);
00087 
00088     /* compute traces of kernel matrices for linear MMD */
00089     m_kernel->init(m_p_and_q, m_p_and_q);
00090 
00091     float64_t pp=0;
00092     float64_t qq=0;
00093     float64_t pq=0;
00094     float64_t qp=0;
00095 
00096     /* compute traces */
00097     for (index_t i=0; i<m_2; ++i)
00098     {
00099         pp+=m_kernel->kernel(i, m_2+i);
00100         qq+=m_kernel->kernel(m+i, m+m_2+i);
00101         pq+=m_kernel->kernel(i, m+m_2+i);
00102         qp+=m_kernel->kernel(m_2+i, m+i);
00103     }
00104 
00105     SG_DEBUG("returning: 1/%d*(%f+%f-%f-%f)\n", m_2, pp, qq, pq, qp);
00106 
00107     /* mean of sum all traces is linear time mmd */
00108     SG_DEBUG("leaving CLinearTimeMMD::compute_statistic()\n");
00109     return 1.0/m_2*(pp+qq-pq-qp);
00110 }
00111 
00112 float64_t CLinearTimeMMD::compute_p_value(float64_t statistic)
00113 {
00114     float64_t result=0;
00115 
00116     switch (m_null_approximation_method)
00117     {
00118     case MMD1_GAUSSIAN:
00119         {
00120             /* compute variance and use to estimate Gaussian distribution */
00121             float64_t std_dev=CMath::sqrt(compute_variance_estimate());
00122             result=1.0-CStatistics::normal_cdf(statistic, std_dev);
00123         }
00124         break;
00125 
00126     default:
00127         /* bootstrapping is handled here */
00128         result=CKernelTwoSampleTestStatistic::compute_p_value(statistic);
00129         break;
00130     }
00131 
00132     return result;
00133 }
00134 
00135 float64_t CLinearTimeMMD::compute_threshold(float64_t alpha)
00136 {
00137     float64_t result=0;
00138 
00139     switch (m_null_approximation_method)
00140     {
00141     case MMD1_GAUSSIAN:
00142         {
00143             /* compute variance and use to estimate Gaussian distribution */
00144             float64_t std_dev=CMath::sqrt(compute_variance_estimate());
00145             result=1.0-CStatistics::inverse_normal_cdf(1-alpha, 0, std_dev);
00146         }
00147         break;
00148 
00149     default:
00150         /* bootstrapping is handled here */
00151         result=CKernelTwoSampleTestStatistic::compute_threshold(alpha);
00152         break;
00153     }
00154 
00155     return result;
00156 }
00157 
00158 float64_t CLinearTimeMMD::compute_variance_estimate()
00159 {
00160     REQUIRE(m_p_and_q, "%s::compute_variance_estimate: features needed!\n",
00161             get_name());
00162 
00163     REQUIRE(m_kernel, "%s::compute_variance_estimate: kernel needed!\n",
00164             get_name());
00165 
00166     if (m_p_and_q->get_num_vectors()<1000)
00167     {
00168         SG_WARNING("%s::compute_variance_estimate: The number of samples"
00169                 " should be very large (at least 1000)  in order to get a"
00170                 " good Gaussian approximation using MMD1_GAUSSIAN.\n",
00171                 get_name());
00172     }
00173 
00174     /* this corresponds to computing the statistic itself, however, the
00175      * difference is that all terms (of the traces) have to be stored */
00176     index_t m=m_q_start;
00177     index_t m_2=m/2;
00178 
00179     m_kernel->init(m_p_and_q, m_p_and_q);
00180 
00181     /* allocate memory for traces */
00182     SGVector<float64_t> traces(m_2);
00183 
00184     /* sum up diagonals of all kernel matrices */
00185     for (index_t i=0; i<m_2; ++i)
00186     {
00187         /* init for code beauty :) */
00188         traces[i]=0;
00189 
00190         /* p and p */
00191         traces[i]+=m_kernel->kernel(i, m_2+i);
00192 
00193         /* q and q */
00194         traces[i]+=m_kernel->kernel(m+i, m+m_2+i);
00195 
00196         /* p and q */
00197         traces[i]-=m_kernel->kernel(i, m+m_2+i);
00198 
00199         /* q and p */
00200         traces[i]-=m_kernel->kernel(m_2+i, m+i);
00201     }
00202 
00203     /* return linear time variance estimate */
00204     return CStatistics::variance(traces)/m_2;
00205 }
00206 
00207 #ifdef HAVE_LAPACK
00208 void CLinearTimeMMD::optimize_kernel_weights()
00209 {
00212     if (m_kernel->get_kernel_type()!=K_COMBINED)
00213     {
00214         SG_ERROR("CLinearTimeMMD::optimize_kernel_weights(): Only possible "
00215                 "with a combined kernel!\n");
00216     }
00217 
00218     if (m_p_and_q->get_feature_class()!=C_COMBINED)
00219     {
00220         SG_ERROR("CLinearTimeMMD::optimize_kernel_weights(): Only possible "
00221                 "with combined features!\n");
00222     }
00223 
00224     /* think about casting and types of different kernels/features here */
00225     CCombinedFeatures* combined_p_and_q=
00226             dynamic_cast<CCombinedFeatures*>(m_p_and_q);
00227     CCombinedKernel* combined_kernel=dynamic_cast<CCombinedKernel*>(m_kernel);
00228     ASSERT(combined_p_and_q);
00229     ASSERT(combined_kernel);
00230 
00231     if (combined_kernel->get_num_subkernels()!=
00232             combined_p_and_q->get_num_feature_obj())
00233     {
00234         SG_ERROR("CLinearTimeMMD::optimize_kernel_weights(): Only possible "
00235                 "when number of sub-kernels (%d) equal number of sub-features "
00236                 "(%d)\n", combined_kernel->get_num_subkernels(),
00237                 combined_p_and_q->get_num_feature_obj());
00238     }
00239 
00240     /* init kernel with features */
00241     m_kernel->init(m_p_and_q, m_p_and_q);
00242 
00243     /* number of kernels and data */
00244     index_t num_kernels=combined_kernel->get_num_subkernels();
00245     index_t m2=m_q_start/2;
00246 
00247     /* matrix with all h entries for all kernels and data */
00248     SGMatrix<float64_t> hs(m2, num_kernels);
00249 
00250     /* mmds are needed and are means of columns of hs */
00251     SGVector<float64_t> mmds(num_kernels);
00252 
00253     float64_t pp;
00254     float64_t qq;
00255     float64_t pq;
00256     float64_t qp;
00257     /* compute all h entries */
00258     for (index_t i=0; i<num_kernels; ++i)
00259     {
00260         CKernel* current=combined_kernel->get_kernel(i);
00261         mmds[i]=0;
00262         for (index_t j=0; j<m2; ++j)
00263         {
00264             pp=current->kernel(j, m2+j);
00265             qq=current->kernel(m_q_start+j, m_q_start+m2+j);
00266             pq=current->kernel(j, m_q_start+m2+j);
00267             qp=current->kernel(m2+j, m_q_start+j);
00268             hs(j, i)=pp+qq-pq-qp;
00269             mmds[i]+=hs(j, i);
00270         }
00271 
00272         /* mmd is simply mean. This is the unbiased linear time estimate */
00273         mmds[i]/=m2;
00274 
00275         SG_UNREF(current);
00276     }
00277 
00278     /* compute covariance matrix of h vector, in place is safe now since h
00279      * is not needed anymore */
00280     m_Q=CStatistics::covariance_matrix(hs, true);
00281 
00282     /* evtl regularize to avoid numerical problems (ratio of MMD and std-dev
00283      * blows up when variance is small */
00284     if (m_opt_regularization_eps)
00285     {
00286         SG_DEBUG("regularizing matrix Q by adding %f to diagonal\n",
00287                 m_opt_regularization_eps);
00288         for (index_t i=0; i<num_kernels; ++i)
00289             m_Q(i,i)+=m_opt_regularization_eps;
00290     }
00291 
00292     if (sg_io->get_loglevel()==MSG_DEBUG)
00293     {
00294         m_Q.display_matrix("(evtl. regularized) Q");
00295         mmds.display_vector("mmds");
00296     }
00297 
00298     /* compute sum of mmds to generate feasible point for convex program */
00299     float64_t sum_mmds=0;
00300     for (index_t i=0; i<mmds.vlen; ++i)
00301         sum_mmds+=mmds[i];
00302 
00303     /* QP: 0.5*x'*Q*x + f'*x
00304      * subject to
00305      * mmds'*x = b
00306      * LB[i] <= x[i] <= UB[i]   for all i=1..n */
00307     SGVector<float64_t> Q_diag(num_kernels);
00308     SGVector<float64_t> f(num_kernels);
00309     SGVector<float64_t> lb(num_kernels);
00310     SGVector<float64_t> ub(num_kernels);
00311     SGVector<float64_t> x(num_kernels);
00312 
00313     /* init everything, there are two cases possible: i) at least one mmd is
00314      * is positive, ii) all mmds are negative */
00315     bool one_pos;
00316     for (index_t i=0; i<mmds.vlen; ++i)
00317     {
00318         if (mmds[i]>0)
00319         {
00320             SG_DEBUG("found at least one positive MMD\n");
00321             one_pos=true;
00322             break;
00323         }
00324         one_pos=false;
00325     }
00326 
00327     if (!one_pos)
00328     {
00329         SG_WARNING("All mmd estimates are negative. This is techical possible,"
00330                 " although extremely rare. Current problem might bad\n");
00331 
00332         /* if no element is positive, Q has to be replaced by -Q */
00333         for (index_t i=0; i<num_kernels*num_kernels; ++i)
00334             m_Q.matrix[i]=-m_Q.matrix[i];
00335     }
00336 
00337     /* init vectors */
00338     for (index_t i=0; i<num_kernels; ++i)
00339     {
00340         Q_diag[i]=m_Q(i,i);
00341         f[i]=0;
00342         lb[i]=0;
00343         ub[i]=CMath::INFTY;
00344 
00345         /* initial point has to be feasible, i.e. mmds'*x = b */
00346         x[i]=1.0/sum_mmds;
00347     }
00348 
00349     /* start libqp solver with desired parameters */
00350     SG_DEBUG("starting libqp optimization\n");
00351     libqp_state_T qp_exitflag=libqp_gsmo_solver(&get_Q_col, Q_diag.vector,
00352             f.vector, mmds.vector,
00353             one_pos ? 1 : -1,
00354             lb.vector, ub.vector,
00355             x.vector, num_kernels, m_opt_max_iterations,
00356             m_opt_regularization_eps, &print_state);
00357 
00358     SG_DEBUG("libqp returns: nIts=%d, exit_flag: %d\n", qp_exitflag.nIter,
00359             qp_exitflag.exitflag);
00360 
00361     /* set really small entries to zero and sum up for normalization */
00362     float64_t sum_weights=0;
00363     for (index_t i=0; i<x.vlen; ++i)
00364     {
00365         if (x[i]<m_opt_low_cut)
00366         {
00367             SG_DEBUG("lowcut: weight[%i]=%f<%f; setting to zero\n", i, x[i],
00368                     m_opt_low_cut);
00369             x[i]=0;
00370         }
00371 
00372         sum_weights+=x[i];
00373     }
00374 
00375     /* normalize (allowed since problem is scale invariant) */
00376     for (index_t i=0; i<x.vlen; ++i)
00377         x[i]/=sum_weights;
00378 
00379     /* set weights to kernel */
00380     m_kernel->set_subkernel_weights(x);
00381 }
00382 
00383 SGMatrix<float64_t> CLinearTimeMMD::m_Q=SGMatrix<float64_t>();
00384 
00385 const float64_t* CLinearTimeMMD::get_Q_col(uint32_t i)
00386 {
00387     return &m_Q[m_Q.num_rows*i];
00388 }
00389 
00390 void CLinearTimeMMD::print_state(libqp_state_T state)
00391 {
00392     SG_SDEBUG("libqp state: primal=%f\n", state.QP);
00393 }
00394 
00395 #endif //HAVE_LAPACK
00396 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation