SHOGUN  v2.0.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
TwoDistributionsTestStatistic.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2012 Heiko Strathmann
00008  */
00009 
00010 #include <shogun/statistics/TwoDistributionsTestStatistic.h>
00011 #include <shogun/features/Features.h>
00012 
00013 using namespace shogun;
00014 
00015 CTwoDistributionsTestStatistic::CTwoDistributionsTestStatistic() :
00016         CTestStatistic()
00017 {
00018     init();
00019 }
00020 
00021 CTwoDistributionsTestStatistic::CTwoDistributionsTestStatistic(
00022         CFeatures* p_and_q,
00023         index_t q_start) : CTestStatistic()
00024 {
00025     init();
00026 
00027     m_p_and_q=p_and_q;
00028     SG_REF(m_p_and_q);
00029 
00030     m_q_start=q_start;
00031 }
00032 
00033 CTwoDistributionsTestStatistic::CTwoDistributionsTestStatistic(
00034         CFeatures* p, CFeatures* q) :
00035         CTestStatistic()
00036 {
00037     init();
00038 
00039     m_p_and_q=p->create_merged_copy(q);
00040     SG_REF(m_p_and_q);
00041 
00042     m_q_start=p->get_num_vectors();
00043 }
00044 
00045 CTwoDistributionsTestStatistic::~CTwoDistributionsTestStatistic()
00046 {
00047     SG_UNREF(m_p_and_q);
00048 }
00049 
00050 void CTwoDistributionsTestStatistic::init()
00051 {
00052     SG_ADD((CSGObject**)&m_p_and_q, "p_and_q", "Concatenated samples p and q",
00053             MS_NOT_AVAILABLE);
00054     SG_ADD(&m_q_start, "q_start", "Index of first sample of q",
00055             MS_NOT_AVAILABLE);
00056 
00057     m_p_and_q=NULL;
00058     m_q_start=0;
00059 }
00060 
00061 SGVector<float64_t> CTwoDistributionsTestStatistic::bootstrap_null()
00062 {
00063     SG_DEBUG("entering CTwoDistributionsTestStatistic::bootstrap_null()\n");
00064 
00065     /* compute bootstrap statistics for null distribution */
00066     SGVector<float64_t> results(m_bootstrap_iterations);
00067 
00068     /* memory for index permutations, (would slow down loop) */
00069     SGVector<index_t> ind_permutation(m_p_and_q->get_num_vectors());
00070     ind_permutation.range_fill();
00071     m_p_and_q->add_subset(ind_permutation);
00072 
00073     for (index_t i=0; i<m_bootstrap_iterations; ++i)
00074     {
00075         /* idea: merge features of p and q, shuffle, and compute statistic.
00076          * This is done using subsets here */
00077 
00078         /* create index permutation and add as subset. This will mix samples
00079          * from p and q */
00080         SGVector<int32_t>::permute_vector(ind_permutation);
00081 
00082         /* compute statistic for this permutation of mixed samples */
00083         results[i]=compute_statistic();
00084     }
00085 
00086     /* clean up */
00087     m_p_and_q->remove_subset();
00088 
00089     SG_DEBUG("leaving CTwoDistributionsTestStatistic::bootstrap_null()\n");
00090     return results;
00091 }
00092 
00093 float64_t CTwoDistributionsTestStatistic::compute_p_value(
00094         float64_t statistic)
00095 {
00096     float64_t result=0;
00097 
00098     if (m_null_approximation_method==BOOTSTRAP)
00099     {
00100         /* bootstrap a bunch of MMD values from null distribution */
00101         SGVector<float64_t> values=bootstrap_null();
00102 
00103         /* find out percentile of parameter "statistic" in null distribution */
00104         CMath::qsort(values);
00105         float64_t i=CMath::find_position_to_insert(values, statistic);
00106 
00107         /* return corresponding p-value */
00108         result=1.0-i/values.vlen;
00109     }
00110     else
00111     {
00112         SG_ERROR("CTwoDistributionsTestStatistics::compute_p_value(): Unknown"
00113                 "method to approximate null distribution!\n");
00114     }
00115 
00116     return result;
00117 }
00118 
00119 float64_t CTwoDistributionsTestStatistic::compute_threshold(
00120         float64_t alpha)
00121 {
00122     float64_t result=0;
00123 
00124     if (m_null_approximation_method==BOOTSTRAP)
00125     {
00126         /* bootstrap a bunch of MMD values from null distribution */
00127         SGVector<float64_t> values=bootstrap_null();
00128 
00129         /* return value of (1-alpha) quantile */
00130         result=values[CMath::floor(values.vlen*(1-alpha))];
00131     }
00132     else
00133     {
00134         SG_ERROR("CTwoDistributionsTestStatistics::compute_threshold():"
00135                 "Unknown method to approximate null distribution!\n");
00136     }
00137 
00138     return result;
00139 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation