SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2012 Heiko Strathmann 00008 */ 00009 00010 #include <shogun/statistics/TwoDistributionsTestStatistic.h> 00011 #include <shogun/features/Features.h> 00012 00013 using namespace shogun; 00014 00015 CTwoDistributionsTestStatistic::CTwoDistributionsTestStatistic() : 00016 CTestStatistic() 00017 { 00018 init(); 00019 } 00020 00021 CTwoDistributionsTestStatistic::CTwoDistributionsTestStatistic( 00022 CFeatures* p_and_q, 00023 index_t q_start) : CTestStatistic() 00024 { 00025 init(); 00026 00027 m_p_and_q=p_and_q; 00028 SG_REF(m_p_and_q); 00029 00030 m_q_start=q_start; 00031 } 00032 00033 CTwoDistributionsTestStatistic::CTwoDistributionsTestStatistic( 00034 CFeatures* p, CFeatures* q) : 00035 CTestStatistic() 00036 { 00037 init(); 00038 00039 m_p_and_q=p->create_merged_copy(q); 00040 SG_REF(m_p_and_q); 00041 00042 m_q_start=p->get_num_vectors(); 00043 } 00044 00045 CTwoDistributionsTestStatistic::~CTwoDistributionsTestStatistic() 00046 { 00047 SG_UNREF(m_p_and_q); 00048 } 00049 00050 void CTwoDistributionsTestStatistic::init() 00051 { 00052 SG_ADD((CSGObject**)&m_p_and_q, "p_and_q", "Concatenated samples p and q", 00053 MS_NOT_AVAILABLE); 00054 SG_ADD(&m_q_start, "q_start", "Index of first sample of q", 00055 MS_NOT_AVAILABLE); 00056 00057 m_p_and_q=NULL; 00058 m_q_start=0; 00059 } 00060 00061 SGVector<float64_t> CTwoDistributionsTestStatistic::bootstrap_null() 00062 { 00063 SG_DEBUG("entering CTwoDistributionsTestStatistic::bootstrap_null()\n"); 00064 00065 /* compute bootstrap statistics for null distribution */ 00066 SGVector<float64_t> results(m_bootstrap_iterations); 00067 00068 /* memory for index permutations, (would slow down loop) */ 00069 SGVector<index_t> ind_permutation(m_p_and_q->get_num_vectors()); 00070 ind_permutation.range_fill(); 00071 m_p_and_q->add_subset(ind_permutation); 00072 00073 for (index_t i=0; i<m_bootstrap_iterations; ++i) 00074 { 00075 /* idea: merge features of p and q, shuffle, and compute statistic. 00076 * This is done using subsets here */ 00077 00078 /* create index permutation and add as subset. This will mix samples 00079 * from p and q */ 00080 SGVector<int32_t>::permute_vector(ind_permutation); 00081 00082 /* compute statistic for this permutation of mixed samples */ 00083 results[i]=compute_statistic(); 00084 } 00085 00086 /* clean up */ 00087 m_p_and_q->remove_subset(); 00088 00089 SG_DEBUG("leaving CTwoDistributionsTestStatistic::bootstrap_null()\n"); 00090 return results; 00091 } 00092 00093 float64_t CTwoDistributionsTestStatistic::compute_p_value( 00094 float64_t statistic) 00095 { 00096 float64_t result=0; 00097 00098 if (m_null_approximation_method==BOOTSTRAP) 00099 { 00100 /* bootstrap a bunch of MMD values from null distribution */ 00101 SGVector<float64_t> values=bootstrap_null(); 00102 00103 /* find out percentile of parameter "statistic" in null distribution */ 00104 CMath::qsort(values); 00105 float64_t i=CMath::find_position_to_insert(values, statistic); 00106 00107 /* return corresponding p-value */ 00108 result=1.0-i/values.vlen; 00109 } 00110 else 00111 { 00112 SG_ERROR("CTwoDistributionsTestStatistics::compute_p_value(): Unknown" 00113 "method to approximate null distribution!\n"); 00114 } 00115 00116 return result; 00117 } 00118 00119 float64_t CTwoDistributionsTestStatistic::compute_threshold( 00120 float64_t alpha) 00121 { 00122 float64_t result=0; 00123 00124 if (m_null_approximation_method==BOOTSTRAP) 00125 { 00126 /* bootstrap a bunch of MMD values from null distribution */ 00127 SGVector<float64_t> values=bootstrap_null(); 00128 00129 /* return value of (1-alpha) quantile */ 00130 result=values[CMath::floor(values.vlen*(1-alpha))]; 00131 } 00132 else 00133 { 00134 SG_ERROR("CTwoDistributionsTestStatistics::compute_threshold():" 00135 "Unknown method to approximate null distribution!\n"); 00136 } 00137 00138 return result; 00139 }