SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2012 Fernando José Iglesias García 00008 * Copyright (C) 2012 Fernando José Iglesias García 00009 */ 00010 00011 #include <shogun/structure/TwoStateModel.h> 00012 #include <shogun/mathematics/Math.h> 00013 #include <shogun/features/MatrixFeatures.h> 00014 00015 using namespace shogun; 00016 00017 CTwoStateModel::CTwoStateModel() : CStateModel() 00018 { 00019 // The number of states in this state model is equal to four. 00020 // Although parameters are learnt only for two of them, other 00021 // two states (start and stop) are used 00022 m_num_states = 4; 00023 m_num_transmission_params = 4; 00024 00025 m_state_loss_mat = SGMatrix< float64_t >(m_num_states, m_num_states); 00026 m_state_loss_mat.zero(); 00027 for ( int32_t i = 0 ; i < m_num_states-1 ; ++i ) 00028 { 00029 m_state_loss_mat(m_num_states-1, i) = 1; 00030 m_state_loss_mat(i, m_num_states-1) = 1; 00031 } 00032 00033 // Initialize the start and stop states 00034 m_p = SGVector< float64_t >(m_num_states); 00035 m_q = SGVector< float64_t >(m_num_states); 00036 m_p.set_const(-CMath::INFTY); 00037 m_q.set_const(-CMath::INFTY); 00038 m_p[0] = 0; // start state 00039 m_q[1] = 0; // stop state 00040 } 00041 00042 CTwoStateModel::~CTwoStateModel() 00043 { 00044 } 00045 00046 SGMatrix< float64_t > CTwoStateModel::loss_matrix(CSequence* label_seq) 00047 { 00048 SGVector< int32_t > state_seq = labels_to_states(label_seq); 00049 SGMatrix< float64_t > loss_mat(m_num_states, state_seq.vlen); 00050 00051 for ( int32_t i = 0 ; i < loss_mat.num_cols ; ++i ) 00052 { 00053 for ( int32_t s = 0 ; s < loss_mat.num_rows ; ++s ) 00054 loss_mat(s,i) = m_state_loss_mat(s, state_seq[i]); 00055 } 00056 00057 return loss_mat; 00058 } 00059 00060 float64_t CTwoStateModel::loss(CSequence* label_seq_lhs, CSequence* label_seq_rhs) 00061 { 00062 SGVector< int32_t > state_seq_lhs = labels_to_states(label_seq_lhs); 00063 SGVector< int32_t > state_seq_rhs = labels_to_states(label_seq_rhs); 00064 00065 ASSERT(state_seq_lhs.vlen == state_seq_rhs.vlen); 00066 00067 float64_t ret = 0.0; 00068 for ( int32_t i = 0 ; i < state_seq_lhs.vlen ; ++i ) 00069 ret += m_state_loss_mat(state_seq_lhs[i], state_seq_rhs[i]); 00070 00071 return ret; 00072 } 00073 00074 SGVector< int32_t > CTwoStateModel::labels_to_states(CSequence* label_seq) const 00075 { 00076 // 0 -> start state 00077 // 1 -> stop state 00078 // 2 -> negative state (label == 0) 00079 // 3 -> positive state (label == 1) 00080 00081 SGVector< int32_t > seq_data = label_seq->get_data(); 00082 SGVector< int32_t > state_seq(seq_data.size()); 00083 for ( int32_t i = 1 ; i < state_seq.vlen-1 ; ++i ) 00084 { 00085 //FIXME make independent of values 0-1 in labels 00086 state_seq[i] = seq_data[i] + 2; 00087 } 00088 00089 // The first element is always start state 00090 state_seq[0] = 0; 00091 // The last element is always stop state 00092 state_seq[state_seq.vlen-1] = 1; 00093 00094 return state_seq; 00095 } 00096 00097 CSequence* CTwoStateModel::states_to_labels(SGVector< int32_t > state_seq) const 00098 { 00099 SGVector< int32_t > label_seq(state_seq.vlen); 00100 00101 //FIXME make independent of values 0-1 in labels 00102 // Legend for state indices: 00103 // 0 -> start state => label 0 00104 // 1 -> stop state => label 0 00105 // 2 -> negative state (label == 0) => label 0 00106 // 3 -> positive state (label == 1) => label 1 00107 label_seq.zero(); 00108 for ( int32_t i = 0 ; i < state_seq.vlen ; ++i ) 00109 { 00110 if ( state_seq[i] == 3 ) 00111 label_seq[i] = 1; 00112 } 00113 00114 CSequence* ret = new CSequence(label_seq); 00115 SG_REF(ret); 00116 return ret; 00117 } 00118 00119 void CTwoStateModel::reshape_emission_params(SGVector< float64_t >& emission_weights, 00120 SGVector< float64_t > w, int32_t num_feats, int32_t num_obs) 00121 { 00122 emission_weights.zero(); 00123 00124 // Legend for state indices: 00125 // 0 -> start state 00126 // 1 -> stop state 00127 // 2 -> negative state (label == 0) 00128 // 3 -> positive state (label == 1) 00129 // 00130 // start and stop states have no emission scores 00131 00132 index_t em_idx, w_idx = m_num_transmission_params; 00133 for ( int32_t s = 2 ; s < m_num_states ; ++s ) 00134 { 00135 for ( int32_t f = 0 ; f < num_feats ; ++f ) 00136 { 00137 for ( int32_t o = 0 ; o < num_obs ; ++o ) 00138 { 00139 em_idx = s*num_feats*num_obs + f*num_obs + o; 00140 emission_weights[em_idx] = w[w_idx++]; 00141 } 00142 } 00143 } 00144 } 00145 00146 void CTwoStateModel::reshape_transmission_params( 00147 SGMatrix< float64_t >& transmission_weights, SGVector< float64_t > w) 00148 { 00149 transmission_weights.set_const(-CMath::INFTY); 00150 00151 // Legend for state indices: 00152 // 0 -> start state 00153 // 1 -> stop state 00154 // 2 -> negative state (label == 0) 00155 // 3 -> positive state (label == 1) 00156 00157 // From start 00158 transmission_weights(0,2) = 0; // to negative 00159 transmission_weights(0,3) = 0; // to positive 00160 // From negative 00161 transmission_weights(2,1) = 0; // to stop 00162 transmission_weights(2,2) = w[0]; // to negative 00163 transmission_weights(2,3) = w[1]; // to positive 00164 // From positive 00165 transmission_weights(3,1) = 0; // to stop 00166 transmission_weights(3,2) = w[3]; // to positive 00167 transmission_weights(3,3) = w[2]; // to negative 00168 } 00169 00170 void CTwoStateModel::weights_to_vector(SGVector< float64_t >& psi, 00171 SGMatrix< float64_t > transmission_weights, 00172 SGVector< float64_t > emission_weights, 00173 int32_t num_feats, int32_t num_obs) const 00174 { 00175 // Legend for state indices: 00176 // 0 -> start state 00177 // 1 -> stop state 00178 // 2 -> negative state 00179 // 3 -> positive state 00180 psi[0] = transmission_weights(2,2); 00181 psi[1] = transmission_weights(2,3); 00182 psi[2] = transmission_weights(3,3); 00183 psi[3] = transmission_weights(3,2); 00184 00185 // start and stop states have no emission scores 00186 index_t obs_idx, psi_idx = m_num_transmission_params; 00187 for ( int32_t s = 2 ; s < m_num_states ; ++s ) 00188 { 00189 for ( int32_t f = 0 ; f < num_feats ; ++f ) 00190 { 00191 for ( int32_t o = 0 ; o < num_obs ; ++o ) 00192 { 00193 obs_idx = s*num_feats*num_obs + f*num_obs + o; 00194 psi[psi_idx++] = emission_weights[obs_idx]; 00195 } 00196 } 00197 } 00198 00199 } 00200 00201 SGVector< int32_t > CTwoStateModel::get_monotonicity(int32_t num_free_states, 00202 int32_t num_feats) const 00203 { 00204 REQUIRE(num_free_states == 2, "Using the TwoStateModel only two states are free\n"); 00205 00206 SGVector< int32_t > monotonicity(num_feats*num_free_states); 00207 00208 for ( int32_t i = 0 ; i < num_feats ; ++i ) 00209 monotonicity[i] = -1; 00210 for ( int32_t i = num_feats ; i < 2*num_feats ; ++i ) 00211 monotonicity[i] = +1; 00212 00213 return monotonicity; 00214 } 00215 00216 CHMSVMModel* CTwoStateModel::simulate_two_state_data() 00217 { 00218 // Number of examples 00219 int32_t num_exm = 1000; 00220 // Length of each example sequence 00221 int32_t exm_len = 250; 00222 // Number of different states 00223 int32_t num_states = 2; 00224 // Total number of features 00225 int32_t num_features = 10; 00226 // Number of features to be pure noise 00227 int32_t num_noise_features = 2; 00228 // Min and max length of positive block 00229 int32_t block_len[] = {10, 100}; 00230 // Min and max number of positive blocks per example 00231 int32_t num_blocks[] = {0, 3}; 00232 00233 // Proportion of wrong labels 00234 float64_t prop_distort = 0.2; 00235 // Standard deviation of Gaussian noise 00236 float64_t noise_std = 4; 00237 00238 // Generate label sequence randomly containing from num_blocks[0] to 00239 // num_blocks[1] blocks of positive labels each of length between 00240 // block_len[0] and block_len[1] 00241 00242 CHMSVMLabels* labels = new CHMSVMLabels(num_exm, num_states); 00243 SGVector< int32_t > ll(num_exm*exm_len); 00244 ll.zero(); 00245 int32_t rnb, rl, rp; 00246 00247 for ( int32_t i = 0 ; i < num_exm ; ++i) 00248 { 00249 SGVector< int32_t > lab(exm_len); 00250 lab.zero(); 00251 rnb = num_blocks[0] + CMath::ceil((num_blocks[1]-num_blocks[0])* 00252 CMath::random(0.0, 1.0)) - 1; 00253 00254 for ( int32_t j = 0 ; j < rnb ; ++j ) 00255 { 00256 rl = block_len[0] + CMath::ceil((block_len[1]-block_len[0])* 00257 CMath::random(0.0, 1.0)) - 1; 00258 rp = CMath::ceil((exm_len-rl)*CMath::random(0.0, 1.0)); 00259 00260 for ( int32_t idx = rp-1 ; idx < rp+rl ; ++idx ) 00261 { 00262 lab[idx] = 1; 00263 ll[i*exm_len + idx] = 1; 00264 } 00265 } 00266 00267 labels->add_label(lab); 00268 } 00269 00270 // Generate features by 00271 // i) introducing label noise, i.e. flipping a propotion prop_distort 00272 // of labels and 00273 // ii) adding Gaussian noise to the (distorted) label sequence 00274 00275 SGVector< int32_t > distort(num_exm*exm_len); 00276 SGVector< int32_t > d1(CMath::round(distort.vlen*prop_distort)); 00277 SGVector< int32_t > d2(d1.vlen); 00278 SGVector< int32_t > lf; 00279 SGMatrix< float64_t > signal(num_features, distort.vlen); 00280 00281 for ( int32_t i = 0 ; i < num_features ; ++i ) 00282 { 00283 lf = ll; 00284 distort.randperm(); 00285 00286 for ( int32_t j = 0 ; j < d1.vlen ; ++j ) 00287 d1[j] = distort[j]; 00288 00289 for ( int32_t j = 0 ; j < d2.vlen ; ++j ) 00290 d2[j] = distort[ distort.vlen-d2.vlen+j ]; 00291 00292 for ( int32_t j = 0 ; j < d1.vlen ; ++j ) 00293 lf[ d1[j] ] = lf[ d2[j] ]; 00294 00295 int32_t idx = i*signal.num_cols; 00296 for ( int32_t j = 0 ; j < signal.num_cols ; ++j ) 00297 signal[idx++] = lf[j] + noise_std*CMath::normal_random((float64_t)0.0, 1.0); 00298 } 00299 00300 // Substitute some features by pure noise 00301 SGVector< int32_t > ridx(num_features); 00302 ridx.randperm(); 00303 for ( int32_t i = 0 ; i < num_noise_features ; ++i ) 00304 { 00305 int32_t idx = i*signal.num_cols; 00306 for ( int32_t j = 0 ; j < signal.num_cols ; ++j ) 00307 signal[idx++] = noise_std*CMath::normal_random((float64_t)0.0, 1.0); 00308 } 00309 00310 CMatrixFeatures< float64_t >* features = 00311 new CMatrixFeatures< float64_t >(signal, exm_len, num_exm); 00312 00313 return new CHMSVMModel(features, labels, SMT_TWO_STATE, 3); 00314 }