SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <shogun/lib/common.h> 00013 #include <shogun/io/SGIO.h> 00014 00015 #include <shogun/base/Parameter.h> 00016 00017 #include <shogun/distributions/LinearHMM.h> 00018 #include <shogun/features/StringFeatures.h> 00019 00020 using namespace shogun; 00021 00022 CLinearHMM::CLinearHMM() : CDistribution() 00023 { 00024 init(); 00025 } 00026 00027 CLinearHMM::CLinearHMM(CStringFeatures<uint16_t>* f) 00028 : CDistribution() 00029 { 00030 init(); 00031 00032 set_features(f); 00033 sequence_length = f->get_vector_length(0); 00034 num_symbols = (int32_t) f->get_num_symbols(); 00035 num_params = sequence_length*num_symbols; 00036 } 00037 00038 CLinearHMM::CLinearHMM(int32_t p_num_features, int32_t p_num_symbols) 00039 : CDistribution() 00040 { 00041 init(); 00042 00043 sequence_length = p_num_features; 00044 num_symbols = p_num_symbols; 00045 num_params = sequence_length*num_symbols; 00046 } 00047 00048 CLinearHMM::~CLinearHMM() 00049 { 00050 SG_FREE(transition_probs); 00051 SG_FREE(log_transition_probs); 00052 } 00053 00054 bool CLinearHMM::train(CFeatures* data) 00055 { 00056 if (data) 00057 { 00058 if (data->get_feature_class() != C_STRING || 00059 data->get_feature_type() != F_WORD) 00060 { 00061 SG_ERROR("Expected features of class string type word!\n"); 00062 } 00063 set_features(data); 00064 } 00065 SG_FREE(transition_probs); 00066 SG_FREE(log_transition_probs); 00067 int32_t* int_transition_probs=SG_MALLOC(int32_t, num_params); 00068 00069 int32_t vec; 00070 int32_t i; 00071 00072 for (i=0; i< num_params; i++) 00073 int_transition_probs[i]=0; 00074 00075 for (vec=0; vec<features->get_num_vectors(); vec++) 00076 { 00077 int32_t len; 00078 bool free_vec; 00079 00080 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00081 get_feature_vector(vec, len, free_vec); 00082 00083 //just count the symbols per position -> transition_probsogram 00084 for (int32_t feat=0; feat<len ; feat++) 00085 int_transition_probs[feat*num_symbols+vector[feat]]++; 00086 00087 ((CStringFeatures<uint16_t>*) features)-> 00088 free_feature_vector(vector, vec, free_vec); 00089 } 00090 00091 //trade memory for speed 00092 transition_probs=SG_MALLOC(float64_t, num_params); 00093 log_transition_probs=SG_MALLOC(float64_t, num_params); 00094 00095 for (i=0;i<sequence_length;i++) 00096 { 00097 for (int32_t j=0; j<num_symbols; j++) 00098 { 00099 float64_t sum=0; 00100 int32_t offs=i*num_symbols+ 00101 ((CStringFeatures<uint16_t> *) features)-> 00102 get_masked_symbols((uint16_t)j,(uint8_t) 254); 00103 int32_t original_num_symbols=(int32_t) 00104 ((CStringFeatures<uint16_t> *) features)-> 00105 get_original_num_symbols(); 00106 00107 for (int32_t k=0; k<original_num_symbols; k++) 00108 sum+=int_transition_probs[offs+k]; 00109 00110 transition_probs[i*num_symbols+j]= 00111 (int_transition_probs[i*num_symbols+j]+pseudo_count)/ 00112 (sum+((CStringFeatures<uint16_t> *) features)-> 00113 get_original_num_symbols()*pseudo_count); 00114 log_transition_probs[i*num_symbols+j]= 00115 log(transition_probs[i*num_symbols+j]); 00116 } 00117 } 00118 00119 SG_FREE(int_transition_probs); 00120 return true; 00121 } 00122 00123 bool CLinearHMM::train( 00124 const int32_t* indizes, int32_t num_indizes, float64_t pseudo) 00125 { 00126 SG_FREE(transition_probs); 00127 SG_FREE(log_transition_probs); 00128 int32_t* int_transition_probs=SG_MALLOC(int32_t, num_params); 00129 int32_t vec; 00130 int32_t i; 00131 00132 for (i=0; i< num_params; i++) 00133 int_transition_probs[i]=0; 00134 00135 for (vec=0; vec<num_indizes; vec++) 00136 { 00137 int32_t len; 00138 bool free_vec; 00139 00140 ASSERT(indizes[vec]>=0 && 00141 indizes[vec]<((CStringFeatures<uint16_t>*) features)-> 00142 get_num_vectors()); 00143 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00144 get_feature_vector(indizes[vec], len, free_vec); 00145 ((CStringFeatures<uint16_t>*) features)-> 00146 free_feature_vector(vector, indizes[vec], free_vec); 00147 00148 //just count the symbols per position -> transition_probsogram 00149 // 00150 for (int32_t feat=0; feat<len ; feat++) 00151 int_transition_probs[feat*num_symbols+vector[feat]]++; 00152 } 00153 00154 //trade memory for speed 00155 transition_probs=SG_MALLOC(float64_t, num_params); 00156 log_transition_probs=SG_MALLOC(float64_t, num_params); 00157 00158 for (i=0;i<sequence_length;i++) 00159 { 00160 for (int32_t j=0; j<num_symbols; j++) 00161 { 00162 float64_t sum=0; 00163 int32_t original_num_symbols=(int32_t) 00164 ((CStringFeatures<uint16_t> *) features)-> 00165 get_original_num_symbols(); 00166 for (int32_t k=0; k<original_num_symbols; k++) 00167 { 00168 sum+=int_transition_probs[i*num_symbols+ 00169 ((CStringFeatures<uint16_t>*) features)-> 00170 get_masked_symbols((uint16_t)j,(uint8_t) 254)+k]; 00171 } 00172 00173 transition_probs[i*num_symbols+j]= 00174 (int_transition_probs[i*num_symbols+j]+pseudo)/ 00175 (sum+((CStringFeatures<uint16_t>*) features)-> 00176 get_original_num_symbols()*pseudo); 00177 log_transition_probs[i*num_symbols+j]= 00178 log(transition_probs[i*num_symbols+j]); 00179 } 00180 } 00181 00182 SG_FREE(int_transition_probs); 00183 return true; 00184 } 00185 00186 float64_t CLinearHMM::get_log_likelihood_example(uint16_t* vector, int32_t len) 00187 { 00188 float64_t result=log_transition_probs[vector[0]]; 00189 00190 for (int32_t i=1; i<len; i++) 00191 result+=log_transition_probs[i*num_symbols+vector[i]]; 00192 00193 return result; 00194 } 00195 00196 float64_t CLinearHMM::get_log_likelihood_example(int32_t num_example) 00197 { 00198 int32_t len; 00199 bool free_vec; 00200 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00201 get_feature_vector(num_example, len, free_vec); 00202 float64_t result=log_transition_probs[vector[0]]; 00203 00204 for (int32_t i=1; i<len; i++) 00205 result+=log_transition_probs[i*num_symbols+vector[i]]; 00206 00207 ((CStringFeatures<uint16_t>*) features)-> 00208 free_feature_vector(vector, num_example, free_vec); 00209 00210 return result; 00211 } 00212 00213 float64_t CLinearHMM::get_likelihood_example(uint16_t* vector, int32_t len) 00214 { 00215 float64_t result=transition_probs[vector[0]]; 00216 00217 for (int32_t i=1; i<len; i++) 00218 result*=transition_probs[i*num_symbols+vector[i]]; 00219 00220 return result; 00221 } 00222 00223 float64_t CLinearHMM::get_log_derivative(int32_t num_param, int32_t num_example) 00224 { 00225 int32_t len; 00226 bool free_vec; 00227 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00228 get_feature_vector(num_example, len, free_vec); 00229 float64_t result=0; 00230 int32_t position=num_param/num_symbols; 00231 ASSERT(position>=0 && position<len); 00232 uint16_t sym=(uint16_t) (num_param-position*num_symbols); 00233 00234 if (vector[position]==sym && transition_probs[num_param]!=0) 00235 result=1.0/transition_probs[num_param]; 00236 ((CStringFeatures<uint16_t>*) features)-> 00237 free_feature_vector(vector, num_example, free_vec); 00238 00239 return result; 00240 } 00241 00242 SGVector<float64_t> CLinearHMM::get_transition_probs() 00243 { 00244 return SGVector<float64_t>(transition_probs, num_params, false); 00245 } 00246 00247 bool CLinearHMM::set_transition_probs(const SGVector<float64_t> probs) 00248 { 00249 ASSERT(probs.vlen == num_params); 00250 00251 if (!log_transition_probs) 00252 log_transition_probs=SG_MALLOC(float64_t, num_params); 00253 00254 if (!transition_probs) 00255 transition_probs=SG_MALLOC(float64_t, num_params); 00256 00257 for (int32_t i=0; i<num_params; i++) 00258 { 00259 transition_probs[i]=probs.vector[i]; 00260 log_transition_probs[i]=log(transition_probs[i]); 00261 } 00262 00263 return true; 00264 } 00265 00266 SGVector<float64_t> CLinearHMM::get_log_transition_probs() 00267 { 00268 return SGVector<float64_t>(log_transition_probs, num_params, false); 00269 } 00270 00271 bool CLinearHMM::set_log_transition_probs(const SGVector<float64_t> probs) 00272 { 00273 ASSERT(probs.vlen == num_params); 00274 00275 if (!log_transition_probs) 00276 log_transition_probs=SG_MALLOC(float64_t, num_params); 00277 00278 if (!transition_probs) 00279 transition_probs=SG_MALLOC(float64_t, num_params); 00280 00281 for (int32_t i=0; i<num_params; i++) 00282 { 00283 log_transition_probs[i]=probs.vector[i]; 00284 transition_probs[i]=exp(log_transition_probs[i]); 00285 } 00286 00287 return true; 00288 } 00289 00290 void CLinearHMM::load_serializable_post() throw (ShogunException) 00291 { 00292 CSGObject::load_serializable_post(); 00293 00294 num_params = sequence_length*num_symbols; 00295 } 00296 00297 void CLinearHMM::init() 00298 { 00299 sequence_length = 0; 00300 num_symbols = 0; 00301 num_params = 0; 00302 transition_probs = NULL; 00303 log_transition_probs = NULL; 00304 00305 m_parameters->add_matrix(&transition_probs, &num_symbols, &sequence_length, 00306 "transition_probs", "Transition probabilities."); 00307 m_parameters->add_matrix(&log_transition_probs, &num_symbols, &sequence_length, 00308 "log_transition_probs", "Transition probabilities (logspace)."); 00309 }