/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2007-2010 Soeren Sonnenburg
 * Copyright (c) 2007-2009 The LIBLINEAR Project.
 * Copyright (C) 2007-2010 Fraunhofer Institute FIRST and Max-Planck-Society
 */
#include <shogun/lib/config.h>

#ifdef HAVE_LAPACK
#include <shogun/io/SGIO.h>
#include <shogun/lib/Signal.h>
#include <shogun/lib/Time.h>
#include <shogun/base/Parameter.h>
#include <shogun/classifier/svm/LibLinear.h>
#include <shogun/optimization/liblinear/tron.h>
#include <shogun/features/DotFeatures.h>
#include <shogun/labels/BinaryLabels.h>

using namespace shogun;

CLibLinear::CLibLinear()
: CLinearMachine()
{
    init();
}

CLibLinear::CLibLinear(LIBLINEAR_SOLVER_TYPE l)
: CLinearMachine()
{
    init();
    liblinear_solver_type=l;
}

CLibLinear::CLibLinear(
    float64_t C, CDotFeatures* traindat, CLabels* trainlab)
: CLinearMachine()
{
    init();
    C1=C;
    C2=C;
    use_bias=true;

    set_features(traindat);
    set_labels(trainlab);
    init_linear_term();
}

void CLibLinear::init()
{
    liblinear_solver_type=L2R_L1LOSS_SVC_DUAL;
    use_bias=false;
    C1=1;
    C2=1;
    set_max_iterations();
    epsilon=1e-5;

    SG_ADD(&C1, "C1", "C Cost constant 1.", MS_AVAILABLE);
    SG_ADD(&C2, "C2", "C Cost constant 2.", MS_AVAILABLE);
    SG_ADD(&use_bias, "use_bias", "Indicates if bias is used.",
            MS_NOT_AVAILABLE);
    SG_ADD(&epsilon, "epsilon", "Convergence precision.", MS_NOT_AVAILABLE);
    SG_ADD(&max_iterations, "max_iterations", "Max number of iterations.",
            MS_NOT_AVAILABLE);
    SG_ADD(&m_linear_term, "linear_term", "Linear Term", MS_NOT_AVAILABLE);
    SG_ADD((machine_int_t*) &liblinear_solver_type, "liblinear_solver_type",
            "Type of LibLinear solver.", MS_NOT_AVAILABLE);
}

CLibLinear::~CLibLinear()
{
}
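
/*
 * Illustrative usage sketch (a minimal sketch, not part of the library
 * sources). It assumes the standard Shogun 2.x entry points
 * (init_shogun_with_defaults(), CMachine::train()/apply()) and uses
 * placeholder variables `matrix` and `lab_vector`; setter names that are
 * not visible in this file (e.g. set_epsilon) are assumptions.
 *
 *   init_shogun_with_defaults();
 *
 *   CSparseFeatures<float64_t>* feats = new CSparseFeatures<float64_t>(matrix);
 *   CBinaryLabels* labels = new CBinaryLabels(lab_vector);   // entries in {-1,+1}
 *
 *   CLibLinear* svm = new CLibLinear(1.0, feats, labels);    // C1=C2=1.0, bias enabled
 *   svm->set_epsilon(1e-3);
 *   svm->train();
 *   CLabels* predictions = svm->apply(feats);
 *
 *   SG_UNREF(predictions);
 *   SG_UNREF(svm);
 *   exit_shogun();
 */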

bool CLibLinear::train_machine(CFeatures* data)
{
    CSignal::clear_cancel();
    ASSERT(m_labels);
    ASSERT(m_labels->get_label_type() == LT_BINARY);

    if (data)
    {
        if (!data->has_property(FP_DOT))
            SG_ERROR("Specified features are not of type CDotFeatures\n");

        set_features((CDotFeatures*) data);
    }
    ASSERT(features);

    int32_t num_train_labels=m_labels->get_num_labels();
    int32_t num_feat=features->get_dim_feature_space();
    int32_t num_vec=features->get_num_vectors();

    if (liblinear_solver_type == L1R_L2LOSS_SVC ||
            (liblinear_solver_type == L1R_LR) )
    {
        if (num_feat!=num_train_labels)
        {
            SG_ERROR("L1 methods require the data to be transposed: "
                    "number of features %d does not match number of "
                    "training labels %d\n",
                    num_feat, num_train_labels);
        }
        CMath::swap(num_feat, num_vec);
    }
    else
    {
        if (num_vec!=num_train_labels)
        {
            SG_ERROR("number of vectors %d does not match "
                    "number of training labels %d\n",
                    num_vec, num_train_labels);
        }
    }
    if (use_bias)
        w=SGVector<float64_t>(SG_MALLOC(float64_t, num_feat+1), num_feat);
    else
        w=SGVector<float64_t>(num_feat);

    problem prob;
    if (use_bias)
    {
        prob.n=w.vlen+1;
        memset(w.vector, 0, sizeof(float64_t)*(w.vlen+1));
    }
    else
    {
        prob.n=w.vlen;
        memset(w.vector, 0, sizeof(float64_t)*(w.vlen+0));
    }
    prob.l=num_vec;
    prob.x=features;
    prob.y=SG_MALLOC(double, prob.l);
    float64_t* Cs=SG_MALLOC(double, prob.l);
    prob.use_bias=use_bias;
    double Cp=C1;
    double Cn=C2;

    for (int32_t i=0; i<prob.l; i++)
    {
        prob.y[i]=((CBinaryLabels*) m_labels)->get_int_label(i);
        if (prob.y[i] == +1)
            Cs[i]=C1;
        else if (prob.y[i] == -1)
            Cs[i]=C2;
        else
            SG_ERROR("labels should be +1/-1 only\n");
    }

    int pos = 0;
    int neg = 0;
    for(int i=0;i<prob.l;i++)
    {
        if(prob.y[i]==+1)
            pos++;
    }
    neg = prob.l - pos;

    SG_INFO("%d training points %d dims\n", prob.l, prob.n);

    function *fun_obj=NULL;
    switch (liblinear_solver_type)
    {
        case L2R_LR:
        {
            fun_obj=new l2r_lr_fun(&prob, Cs);
            CTron tron_obj(fun_obj, epsilon*CMath::min(pos,neg)/prob.l, max_iterations);
            SG_DEBUG("starting L2R_LR training via tron\n");
            tron_obj.tron(w.vector, m_max_train_time);
            SG_DEBUG("done with tron\n");
            delete fun_obj;
            break;
        }
        case L2R_L2LOSS_SVC:
        {
            fun_obj=new l2r_l2_svc_fun(&prob, Cs);
            CTron tron_obj(fun_obj, epsilon*CMath::min(pos,neg)/prob.l, max_iterations);
            tron_obj.tron(w.vector, m_max_train_time);
            delete fun_obj;
            break;
        }
        case L2R_L2LOSS_SVC_DUAL:
            solve_l2r_l1l2_svc(&prob, epsilon, Cp, Cn, L2R_L2LOSS_SVC_DUAL);
            break;
        case L2R_L1LOSS_SVC_DUAL:
            solve_l2r_l1l2_svc(&prob, epsilon, Cp, Cn, L2R_L1LOSS_SVC_DUAL);
            break;
        case L1R_L2LOSS_SVC:
        {
            //ASSUME FEATURES ARE TRANSPOSED ALREADY
            solve_l1r_l2_svc(&prob, epsilon*CMath::min(pos,neg)/prob.l, Cp, Cn);
            break;
        }
        case L1R_LR:
        {
            //ASSUME FEATURES ARE TRANSPOSED ALREADY
            solve_l1r_lr(&prob, epsilon*CMath::min(pos,neg)/prob.l, Cp, Cn);
            break;
        }
        case L2R_LR_DUAL:
        {
            solve_l2r_lr_dual(&prob, epsilon, Cp, Cn);
            break;
        }
        default:
            SG_ERROR("Error: unknown solver_type\n");
            break;
    }

    if (use_bias)
        set_bias(w[w.vlen]);
    else
        set_bias(0);

    SG_FREE(prob.y);
    SG_FREE(Cs);

    return true;
}
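
/*
 * Note on the L1 solvers dispatched above: L1R_L2LOSS_SVC and L1R_LR work on a
 * column-wise (feature-major) view of the data, so train_machine() expects the
 * features to be passed in already transposed (see the dimension check above).
 * A hedged sketch, assuming CSparseFeatures provides get_transposed() as in
 * other Shogun 2.x code; `feats` and `labels` are placeholders:
 *
 *   CSparseFeatures<float64_t>* feats_t = feats->get_transposed();
 *   CLibLinear* svm = new CLibLinear(L1R_L2LOSS_SVC);
 *   svm->set_labels(labels);
 *   svm->train(feats_t);
 */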

// A coordinate descent algorithm for
// L1-loss and L2-loss SVM dual problems
//
//  min_\alpha  0.5(\alpha^T (Q + D)\alpha) - e^T \alpha,
//    s.t.      0 <= alpha_i <= upper_bound_i,
//
//  where Qij = yi yj xi^T xj and
//  D is a diagonal matrix
//
// In L1-SVM case:
//      upper_bound_i = Cp if y_i = 1
//      upper_bound_i = Cn if y_i = -1
//      D_ii = 0
// In L2-SVM case:
//      upper_bound_i = INF
//      D_ii = 1/(2*Cp) if y_i = 1
//      D_ii = 1/(2*Cn) if y_i = -1
//
// Given:
// x, y, Cp, Cn
// eps is the stopping tolerance
//
// solution will be put in w

#undef GETI
#define GETI(i) (y[i]+1)
// To support weights for instances, use GETI(i) (i)
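
// Implementation notes:
// - GETI(i) maps a label y[i] in {-1,+1} to the index y[i]+1 in {0,2}, so the
//   3-element arrays diag[], upper_bound[] and C[] used below hold the value
//   for the negative class at position 0 and for the positive class at
//   position 2 (position 1 is unused).
// - In solve_l2r_l1l2_svc() below, the partial derivative of the dual
//   objective with respect to alpha_i is
//       G = y_i * w^T x_i - 1 + D_ii * alpha_i,     where w = sum_j y_j alpha_j x_j,
//   with the -1 replaced by m_linear_term[i] when a custom linear term is set,
//   and each coordinate is updated by the clipped Newton step
//       alpha_i <- min(max(alpha_i - G / (x_i^T x_i + D_ii), 0), upper_bound_i).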

void CLibLinear::solve_l2r_l1l2_svc(
    const problem *prob, double eps, double Cp, double Cn, LIBLINEAR_SOLVER_TYPE st)
{
    int l = prob->l;
    int w_size = prob->n;
    int i, s, iter = 0;
    double C, d, G;
    double *QD = SG_MALLOC(double, l);
    int *index = SG_MALLOC(int, l);
    double *alpha = SG_MALLOC(double, l);
    int32_t *y = SG_MALLOC(int32_t, l);
    int active_size = l;

    // PG: projected gradient, for shrinking and stopping
    double PG;
    double PGmax_old = CMath::INFTY;
    double PGmin_old = -CMath::INFTY;
    double PGmax_new, PGmin_new;

    // default solver_type: L2R_L2LOSS_SVC_DUAL
    double diag[3] = {0.5/Cn, 0, 0.5/Cp};
    double upper_bound[3] = {CMath::INFTY, 0, CMath::INFTY};
    if(st == L2R_L1LOSS_SVC_DUAL)
    {
        diag[0] = 0;
        diag[2] = 0;
        upper_bound[0] = Cn;
        upper_bound[2] = Cp;
    }

    int n = prob->n;

    if (prob->use_bias)
        n--;

    for(i=0; i<w_size; i++)
        w[i] = 0;

    for(i=0; i<l; i++)
    {
        alpha[i] = 0;
        if(prob->y[i] > 0)
        {
            y[i] = +1;
        }
        else
        {
            y[i] = -1;
        }
        QD[i] = diag[GETI(i)];

        QD[i] += prob->x->dot(i, prob->x,i);
        index[i] = i;
    }

    CTime start_time;
    while (iter < max_iterations && !CSignal::cancel_computations())
    {
        if (m_max_train_time > 0 && start_time.cur_time_diff() > m_max_train_time)
            break;

        PGmax_new = -CMath::INFTY;
        PGmin_new = CMath::INFTY;

        for (i=0; i<active_size; i++)
        {
            int j = i+rand()%(active_size-i);
            CMath::swap(index[i], index[j]);
        }

        for (s=0;s<active_size;s++)
        {
            i = index[s];
            int32_t yi = y[i];

            G = prob->x->dense_dot(i, w.vector, n);
            if (prob->use_bias)
                G+=w.vector[n];

            if (m_linear_term.vector)
                G = G*yi + m_linear_term.vector[i];
            else
                G = G*yi-1;

            C = upper_bound[GETI(i)];
            G += alpha[i]*diag[GETI(i)];

            PG = 0;
            if (alpha[i] == 0)
            {
                if (G > PGmax_old)
                {
                    active_size--;
                    CMath::swap(index[s], index[active_size]);
                    s--;
                    continue;
                }
                else if (G < 0)
                    PG = G;
            }
            else if (alpha[i] == C)
            {
                if (G < PGmin_old)
                {
                    active_size--;
                    CMath::swap(index[s], index[active_size]);
                    s--;
                    continue;
                }
                else if (G > 0)
                    PG = G;
            }
            else
                PG = G;

            PGmax_new = CMath::max(PGmax_new, PG);
            PGmin_new = CMath::min(PGmin_new, PG);

            if(fabs(PG) > 1.0e-12)
            {
                double alpha_old = alpha[i];
                alpha[i] = CMath::min(CMath::max(alpha[i] - G/QD[i], 0.0), C);
                d = (alpha[i] - alpha_old)*yi;

                prob->x->add_to_dense_vec(d, i, w.vector, n);

                if (prob->use_bias)
                    w.vector[n]+=d;
            }
        }

        iter++;
        float64_t gap=PGmax_new - PGmin_new;
        SG_SABS_PROGRESS(gap, -CMath::log10(gap), -CMath::log10(1), -CMath::log10(eps), 6);

        if(gap <= eps)
        {
            if(active_size == l)
                break;
            else
            {
                active_size = l;
                PGmax_old = CMath::INFTY;
                PGmin_old = -CMath::INFTY;
                continue;
            }
        }
        PGmax_old = PGmax_new;
        PGmin_old = PGmin_new;
        if (PGmax_old <= 0)
            PGmax_old = CMath::INFTY;
        if (PGmin_old >= 0)
            PGmin_old = -CMath::INFTY;
    }

    SG_DONE();
    SG_INFO("optimization finished, #iter = %d\n",iter);
    if (iter >= max_iterations)
    {
        SG_WARNING("reaching max number of iterations\nUsing -s 2 may be faster"
                "(also see liblinear FAQ)\n\n");
    }

    // calculate objective value

    double v = 0;
    int nSV = 0;
    for(i=0; i<w_size; i++)
        v += w.vector[i]*w.vector[i];
    for(i=0; i<l; i++)
    {
        v += alpha[i]*(alpha[i]*diag[GETI(i)] - 2);
        if(alpha[i] > 0)
            ++nSV;
    }
    SG_INFO("Objective value = %lf\n",v/2);
    SG_INFO("nSV = %d\n",nSV);

    SG_FREE(QD);
    SG_FREE(alpha);
    SG_FREE(y);
    SG_FREE(index);
}
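
// Shrinking: during the sweep above, variables whose projected gradient
// indicates they will stay at a bound (alpha_i == 0 with G > PGmax_old, or
// alpha_i == C with G < PGmin_old) are swapped out of the active set. When the
// projected-gradient gap PGmax_new - PGmin_new drops below eps on the shrunken
// set, one more pass over all l variables is made before terminating. The L1
// solvers below use the same idea on the weight coordinates, comparing each
// violation against Gmax_old/l.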

// A coordinate descent algorithm for
// L1-regularized L2-loss support vector classification
//
//  min_w \sum |wj| + C \sum max(0, 1-yi w^T xi)^2,
//
// Given:
// x, y, Cp, Cn
// eps is the stopping tolerance
//
// solution will be put in w

#undef GETI
#define GETI(i) (y[i]+1)
// To support weights for instances, use GETI(i) (i)
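
// In the two L1-regularized solvers below, each coordinate update minimizes
// the second-order model |w_j + d| - |w_j| + G*d + 0.5*H*d^2, whose closed-form
// minimizer is the soft-threshold-like step
//     d = -(G+1)/H   if G+1 <= H*w_j
//     d = -(G-1)/H   if G-1 >= H*w_j
//     d = -w_j       otherwise (the coordinate is driven exactly to zero),
// followed by a backtracking line search (step halving, Armijo parameter
// sigma = 0.01) on the true objective.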

void CLibLinear::solve_l1r_l2_svc(
    problem *prob_col, double eps, double Cp, double Cn)
{
    int l = prob_col->l;
    int w_size = prob_col->n;
    int j, s, iter = 0;
    int active_size = w_size;
    int max_num_linesearch = 20;

    double sigma = 0.01;
    double d, G_loss, G, H;
    double Gmax_old = CMath::INFTY;
    double Gmax_new;
    double Gmax_init=0;
    double d_old, d_diff;
    double loss_old=0, loss_new;
    double appxcond, cond;

    int *index = SG_MALLOC(int, w_size);
    int32_t *y = SG_MALLOC(int32_t, l);
    double *b = SG_MALLOC(double, l); // b = 1-ywTx
    double *xj_sq = SG_MALLOC(double, w_size);

    CDotFeatures* x = (CDotFeatures*) prob_col->x;
    void* iterator;
    int32_t ind;
    float64_t val;

    double C[3] = {Cn,0,Cp};

    int n = prob_col->n;
    if (prob_col->use_bias)
        n--;

    for(j=0; j<l; j++)
    {
        b[j] = 1;
        if(prob_col->y[j] > 0)
            y[j] = 1;
        else
            y[j] = -1;
    }

    for(j=0; j<w_size; j++)
    {
        w.vector[j] = 0;
        index[j] = j;
        xj_sq[j] = 0;

        if (use_bias && j==n)
        {
            for (ind=0; ind<l; ind++)
                xj_sq[n] += C[GETI(ind)];
        }
        else
        {
            iterator=x->get_feature_iterator(j);
            while (x->get_next_feature(ind, val, iterator))
                xj_sq[j] += C[GETI(ind)]*val*val;
            x->free_feature_iterator(iterator);
        }
    }

    CTime start_time;
    while (iter < max_iterations && !CSignal::cancel_computations())
    {
        if (m_max_train_time > 0 && start_time.cur_time_diff() > m_max_train_time)
            break;

        Gmax_new = 0;

        for(j=0; j<active_size; j++)
        {
            int i = j+rand()%(active_size-j);
            CMath::swap(index[i], index[j]);
        }

        for(s=0; s<active_size; s++)
        {
            j = index[s];
            G_loss = 0;
            H = 0;

            if (use_bias && j==n)
            {
                for (ind=0; ind<l; ind++)
                {
                    if(b[ind] > 0)
                    {
                        double tmp = C[GETI(ind)]*y[ind];
                        G_loss -= tmp*b[ind];
                        H += tmp*y[ind];
                    }
                }
            }
            else
            {
                iterator=x->get_feature_iterator(j);

                while (x->get_next_feature(ind, val, iterator))
                {
                    if(b[ind] > 0)
                    {
                        double tmp = C[GETI(ind)]*val*y[ind];
                        G_loss -= tmp*b[ind];
                        H += tmp*val*y[ind];
                    }
                }
                x->free_feature_iterator(iterator);
            }

            G_loss *= 2;

            G = G_loss;
            H *= 2;
            H = CMath::max(H, 1e-12);

            double Gp = G+1;
            double Gn = G-1;
            double violation = 0;
            if(w.vector[j] == 0)
            {
                if(Gp < 0)
                    violation = -Gp;
                else if(Gn > 0)
                    violation = Gn;
                else if(Gp>Gmax_old/l && Gn<-Gmax_old/l)
                {
                    active_size--;
                    CMath::swap(index[s], index[active_size]);
                    s--;
                    continue;
                }
            }
            else if(w.vector[j] > 0)
                violation = fabs(Gp);
            else
                violation = fabs(Gn);

            Gmax_new = CMath::max(Gmax_new, violation);

            // obtain Newton direction d
            if(Gp <= H*w.vector[j])
                d = -Gp/H;
            else if(Gn >= H*w.vector[j])
                d = -Gn/H;
            else
                d = -w.vector[j];

            if(fabs(d) < 1.0e-12)
                continue;

            double delta = fabs(w.vector[j]+d)-fabs(w.vector[j]) + G*d;
            d_old = 0;
            int num_linesearch;
            for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
            {
                d_diff = d_old - d;
                cond = fabs(w.vector[j]+d)-fabs(w.vector[j]) - sigma*delta;

                appxcond = xj_sq[j]*d*d + G_loss*d + cond;
                if(appxcond <= 0)
                {
                    if (use_bias && j==n)
                    {
                        for (ind=0; ind<l; ind++)
                            b[ind] += d_diff*y[ind];
                        break;
                    }
                    else
                    {
                        iterator=x->get_feature_iterator(j);
                        while (x->get_next_feature(ind, val, iterator))
                            b[ind] += d_diff*val*y[ind];

                        x->free_feature_iterator(iterator);
                        break;
                    }
                }

                if(num_linesearch == 0)
                {
                    loss_old = 0;
                    loss_new = 0;

                    if (use_bias && j==n)
                    {
                        for (ind=0; ind<l; ind++)
                        {
                            if(b[ind] > 0)
                                loss_old += C[GETI(ind)]*b[ind]*b[ind];
                            double b_new = b[ind] + d_diff*y[ind];
                            b[ind] = b_new;
                            if(b_new > 0)
                                loss_new += C[GETI(ind)]*b_new*b_new;
                        }
                    }
                    else
                    {
                        iterator=x->get_feature_iterator(j);
                        while (x->get_next_feature(ind, val, iterator))
                        {
                            if(b[ind] > 0)
                                loss_old += C[GETI(ind)]*b[ind]*b[ind];
                            double b_new = b[ind] + d_diff*val*y[ind];
                            b[ind] = b_new;
                            if(b_new > 0)
                                loss_new += C[GETI(ind)]*b_new*b_new;
                        }
                        x->free_feature_iterator(iterator);
                    }
                }
                else
                {
                    loss_new = 0;
                    if (use_bias && j==n)
                    {
                        for (ind=0; ind<l; ind++)
                        {
                            double b_new = b[ind] + d_diff*y[ind];
                            b[ind] = b_new;
                            if(b_new > 0)
                                loss_new += C[GETI(ind)]*b_new*b_new;
                        }
                    }
                    else
                    {
                        iterator=x->get_feature_iterator(j);
                        while (x->get_next_feature(ind, val, iterator))
                        {
                            double b_new = b[ind] + d_diff*val*y[ind];
                            b[ind] = b_new;
                            if(b_new > 0)
                                loss_new += C[GETI(ind)]*b_new*b_new;
                        }
                        x->free_feature_iterator(iterator);
                    }
                }

                cond = cond + loss_new - loss_old;
                if(cond <= 0)
                    break;
                else
                {
                    d_old = d;
                    d *= 0.5;
                    delta *= 0.5;
                }
            }

            w.vector[j] += d;

            // recompute b[] if line search takes too many steps
            if(num_linesearch >= max_num_linesearch)
            {
                SG_INFO("#");
                for(int i=0; i<l; i++)
                    b[i] = 1;

                for(int i=0; i<n; i++)
                {
                    if(w.vector[i]==0)
                        continue;

                    iterator=x->get_feature_iterator(i);
                    while (x->get_next_feature(ind, val, iterator))
                        b[ind] -= w.vector[i]*val*y[ind];
                    x->free_feature_iterator(iterator);
                }

                if (use_bias && w.vector[n])
                {
                    for (ind=0; ind<l; ind++)
                        b[ind] -= w.vector[n]*y[ind];
                }
            }
        }

        if(iter == 0)
            Gmax_init = Gmax_new;
        iter++;

        SG_SABS_PROGRESS(Gmax_new, -CMath::log10(Gmax_new),
                -CMath::log10(Gmax_init), -CMath::log10(eps*Gmax_init), 6);

        if(Gmax_new <= eps*Gmax_init)
        {
            if(active_size == w_size)
                break;
            else
            {
                active_size = w_size;
                Gmax_old = CMath::INFTY;
                continue;
            }
        }

        Gmax_old = Gmax_new;
    }

    SG_DONE();
    SG_INFO("optimization finished, #iter = %d\n", iter);
    if(iter >= max_iterations)
        SG_WARNING("\nWARNING: reaching max number of iterations\n");

    // calculate objective value

    double v = 0;
    int nnz = 0;
    for(j=0; j<w_size; j++)
    {
        if(w.vector[j] != 0)
        {
            v += fabs(w.vector[j]);
            nnz++;
        }
    }
    for(j=0; j<l; j++)
        if(b[j] > 0)
            v += C[GETI(j)]*b[j]*b[j];

    SG_INFO("Objective value = %lf\n", v);
    SG_INFO("#nonzeros/#features = %d/%d\n", nnz, w_size);

    SG_FREE(index);
    SG_FREE(y);
    SG_FREE(b);
    SG_FREE(xj_sq);
}
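
// solve_l1r_l2_svc() above keeps b[i] = 1 - y_i * w^T x_i up to date
// incrementally: after a trial step d on coordinate j, b[i] changes by
// -d * x_ij * y_i (the line search applies d_old - d so that b[] always
// reflects the current trial point). If the line search hits
// max_num_linesearch, b[] is recomputed from scratch from w to avoid
// accumulated round-off.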

// A coordinate descent algorithm for
// L1-regularized logistic regression problems
//
//  min_w \sum |wj| + C \sum log(1+exp(-yi w^T xi)),
//
// Given:
// x, y, Cp, Cn
// eps is the stopping tolerance
//
// solution will be put in w

#undef GETI
#define GETI(i) (y[i]+1)
// To support weights for instances, use GETI(i) (i)
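
// solve_l1r_lr() below caches exp_wTx[i] = exp(w^T x_i) (initialized to 1 for
// w = 0) and updates it multiplicatively by exp(d * x_ij) whenever coordinate j
// moves by d; the gradient and Hessian sums (sum1, sum2, H) and the logistic
// losses in the line search are all evaluated from this cache, which is
// rebuilt from w when the line search exceeds max_num_linesearch.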

void CLibLinear::solve_l1r_lr(
    const problem *prob_col, double eps,
    double Cp, double Cn)
{
    int l = prob_col->l;
    int w_size = prob_col->n;
    int j, s, iter = 0;
    int active_size = w_size;
    int max_num_linesearch = 20;

    double x_min = 0;
    double sigma = 0.01;
    double d, G, H;
    double Gmax_old = CMath::INFTY;
    double Gmax_new;
    double Gmax_init=0;
    double sum1, appxcond1;
    double sum2, appxcond2;
    double cond;

    int *index = SG_MALLOC(int, w_size);
    int32_t *y = SG_MALLOC(int32_t, l);
    double *exp_wTx = SG_MALLOC(double, l);
    double *exp_wTx_new = SG_MALLOC(double, l);
    double *xj_max = SG_MALLOC(double, w_size);
    double *C_sum = SG_MALLOC(double, w_size);
    double *xjneg_sum = SG_MALLOC(double, w_size);
    double *xjpos_sum = SG_MALLOC(double, w_size);

    CDotFeatures* x = prob_col->x;
    void* iterator;
    int ind;
    double val;

    double C[3] = {Cn,0,Cp};

    int n = prob_col->n;
    if (prob_col->use_bias)
        n--;

    for(j=0; j<l; j++)
    {
        exp_wTx[j] = 1;
        if(prob_col->y[j] > 0)
            y[j] = 1;
        else
            y[j] = -1;
    }
    for(j=0; j<w_size; j++)
    {
        w.vector[j] = 0;
        index[j] = j;
        xj_max[j] = 0;
        C_sum[j] = 0;
        xjneg_sum[j] = 0;
        xjpos_sum[j] = 0;

        if (use_bias && j==n)
        {
            for (ind=0; ind<l; ind++)
            {
                x_min = CMath::min(x_min, 1.0);
                xj_max[j] = CMath::max(xj_max[j], 1.0);
                C_sum[j] += C[GETI(ind)];
                if(y[ind] == -1)
                    xjneg_sum[j] += C[GETI(ind)];
                else
                    xjpos_sum[j] += C[GETI(ind)];
            }
        }
        else
        {
            iterator=x->get_feature_iterator(j);
            while (x->get_next_feature(ind, val, iterator))
            {
                x_min = CMath::min(x_min, val);
                xj_max[j] = CMath::max(xj_max[j], val);
                C_sum[j] += C[GETI(ind)];
                if(y[ind] == -1)
                    xjneg_sum[j] += C[GETI(ind)]*val;
                else
                    xjpos_sum[j] += C[GETI(ind)]*val;
            }
            x->free_feature_iterator(iterator);
        }
    }

    CTime start_time;
    while (iter < max_iterations && !CSignal::cancel_computations())
    {
        if (m_max_train_time > 0 && start_time.cur_time_diff() > m_max_train_time)
            break;

        Gmax_new = 0;

        for(j=0; j<active_size; j++)
        {
            int i = j+rand()%(active_size-j);
            CMath::swap(index[i], index[j]);
        }

        for(s=0; s<active_size; s++)
        {
            j = index[s];
            sum1 = 0;
            sum2 = 0;
            H = 0;

            if (use_bias && j==n)
            {
                for (ind=0; ind<l; ind++)
                {
                    double exp_wTxind = exp_wTx[ind];
                    double tmp1 = 1.0/(1+exp_wTxind);
                    double tmp2 = C[GETI(ind)]*tmp1;
                    double tmp3 = tmp2*exp_wTxind;
                    sum2 += tmp2;
                    sum1 += tmp3;
                    H += tmp1*tmp3;
                }
            }
            else
            {
                iterator=x->get_feature_iterator(j);
                while (x->get_next_feature(ind, val, iterator))
                {
                    double exp_wTxind = exp_wTx[ind];
                    double tmp1 = val/(1+exp_wTxind);
                    double tmp2 = C[GETI(ind)]*tmp1;
                    double tmp3 = tmp2*exp_wTxind;
                    sum2 += tmp2;
                    sum1 += tmp3;
                    H += tmp1*tmp3;
                }
                x->free_feature_iterator(iterator);
            }

            G = -sum2 + xjneg_sum[j];

            double Gp = G+1;
            double Gn = G-1;
            double violation = 0;
            if(w.vector[j] == 0)
            {
                if(Gp < 0)
                    violation = -Gp;
                else if(Gn > 0)
                    violation = Gn;
                else if(Gp>Gmax_old/l && Gn<-Gmax_old/l)
                {
                    active_size--;
                    CMath::swap(index[s], index[active_size]);
                    s--;
                    continue;
                }
            }
            else if(w.vector[j] > 0)
                violation = fabs(Gp);
            else
                violation = fabs(Gn);

            Gmax_new = CMath::max(Gmax_new, violation);

            // obtain Newton direction d
            if(Gp <= H*w.vector[j])
                d = -Gp/H;
            else if(Gn >= H*w.vector[j])
                d = -Gn/H;
            else
                d = -w.vector[j];

            if(fabs(d) < 1.0e-12)
                continue;

            d = CMath::min(CMath::max(d,-10.0),10.0);

            double delta = fabs(w.vector[j]+d)-fabs(w.vector[j]) + G*d;
            int num_linesearch;
            for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
            {
                cond = fabs(w.vector[j]+d)-fabs(w.vector[j]) - sigma*delta;

                if(x_min >= 0)
                {
                    double tmp = exp(d*xj_max[j]);
                    appxcond1 = log(1+sum1*(tmp-1)/xj_max[j]/C_sum[j])*C_sum[j] + cond - d*xjpos_sum[j];
                    appxcond2 = log(1+sum2*(1/tmp-1)/xj_max[j]/C_sum[j])*C_sum[j] + cond + d*xjneg_sum[j];
                    if(CMath::min(appxcond1,appxcond2) <= 0)
                    {
                        if (use_bias && j==n)
                        {
                            for (ind=0; ind<l; ind++)
                                exp_wTx[ind] *= exp(d);
                        }
                        else
                        {
                            iterator=x->get_feature_iterator(j);
                            while (x->get_next_feature(ind, val, iterator))
                                exp_wTx[ind] *= exp(d*val);
                            x->free_feature_iterator(iterator);
                        }
                        break;
                    }
                }

                cond += d*xjneg_sum[j];

                int i = 0;

                if (use_bias && j==n)
                {
                    for (ind=0; ind<l; ind++)
                    {
                        double exp_dx = exp(d);
                        exp_wTx_new[i] = exp_wTx[ind]*exp_dx;
                        cond += C[GETI(ind)]*log((1+exp_wTx_new[i])/(exp_dx+exp_wTx_new[i]));
                        i++;
                    }
                }
                else
                {
                    iterator=x->get_feature_iterator(j);
                    while (x->get_next_feature(ind, val, iterator))
                    {
                        double exp_dx = exp(d*val);
                        exp_wTx_new[i] = exp_wTx[ind]*exp_dx;
                        cond += C[GETI(ind)]*log((1+exp_wTx_new[i])/(exp_dx+exp_wTx_new[i]));
                        i++;
                    }
                    x->free_feature_iterator(iterator);
                }

                if(cond <= 0)
                {
                    i = 0;
                    if (use_bias && j==n)
                    {
                        for (ind=0; ind<l; ind++)
                        {
                            exp_wTx[ind] = exp_wTx_new[i];
                            i++;
                        }
                    }
                    else
                    {
                        iterator=x->get_feature_iterator(j);
                        while (x->get_next_feature(ind, val, iterator))
                        {
                            exp_wTx[ind] = exp_wTx_new[i];
                            i++;
                        }
                        x->free_feature_iterator(iterator);
                    }
                    break;
                }
                else
                {
                    d *= 0.5;
                    delta *= 0.5;
                }
            }

            w.vector[j] += d;

            // recompute exp_wTx[] if line search takes too many steps
            if(num_linesearch >= max_num_linesearch)
            {
                SG_INFO("#");
                for(int i=0; i<l; i++)
                    exp_wTx[i] = 0;

                for(int i=0; i<w_size; i++)
                {
                    if(w.vector[i]==0) continue;

                    if (use_bias && i==n)
                    {
                        for (ind=0; ind<l; ind++)
                            exp_wTx[ind] += w.vector[i];
                    }
                    else
                    {
                        iterator=x->get_feature_iterator(i);
                        while (x->get_next_feature(ind, val, iterator))
                            exp_wTx[ind] += w.vector[i]*val;
                        x->free_feature_iterator(iterator);
                    }
                }

                for(int i=0; i<l; i++)
                    exp_wTx[i] = exp(exp_wTx[i]);
            }
        }

        if(iter == 0)
            Gmax_init = Gmax_new;
        iter++;
        SG_SABS_PROGRESS(Gmax_new, -CMath::log10(Gmax_new), -CMath::log10(Gmax_init), -CMath::log10(eps*Gmax_init), 6);

        if(Gmax_new <= eps*Gmax_init)
        {
            if(active_size == w_size)
                break;
            else
            {
                active_size = w_size;
                Gmax_old = CMath::INFTY;
                continue;
            }
        }

        Gmax_old = Gmax_new;
    }

    SG_DONE();
    SG_INFO("optimization finished, #iter = %d\n", iter);
    if(iter >= max_iterations)
        SG_WARNING("\nWARNING: reaching max number of iterations\n");

    // calculate objective value

    double v = 0;
    int nnz = 0;
    for(j=0; j<w_size; j++)
        if(w.vector[j] != 0)
        {
            v += fabs(w.vector[j]);
            nnz++;
        }
    for(j=0; j<l; j++)
        if(y[j] == 1)
            v += C[GETI(j)]*log(1+1/exp_wTx[j]);
        else
            v += C[GETI(j)]*log(1+exp_wTx[j]);

    SG_INFO("Objective value = %lf\n", v);
    SG_INFO("#nonzeros/#features = %d/%d\n", nnz, w_size);

    SG_FREE(index);
    SG_FREE(y);
    SG_FREE(exp_wTx);
    SG_FREE(exp_wTx_new);
    SG_FREE(xj_max);
    SG_FREE(C_sum);
    SG_FREE(xjneg_sum);
    SG_FREE(xjpos_sum);
}

// A coordinate descent algorithm for
// the dual of L2-regularized logistic regression problems
//
//  min_\alpha  0.5(\alpha^T Q \alpha) + \sum \alpha_i log (\alpha_i)
//              + (upper_bound_i - \alpha_i) log (upper_bound_i - \alpha_i),
//    s.t.      0 <= \alpha_i <= upper_bound_i,
//
//  where Qij = yi yj xi^T xj and
//  upper_bound_i = Cp if y_i = 1
//  upper_bound_i = Cn if y_i = -1
//
// Given:
// x, y, Cp, Cn
// eps is the stopping tolerance
//
// solution will be put in w
//
// See Algorithm 5 of Yu et al., MLJ 2010

#undef GETI
#define GETI(i) (y[i]+1)
// To support weights for instances, use GETI(i) (i)
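
// For solve_l2r_lr_dual() below, each pair (alpha_i, upper_bound_i - alpha_i)
// is updated by solving the one-variable sub-problem
//     min_{0 < z < C}  z*log(z) + (C-z)*log(C-z)
//                      + 0.5*a*(z - alpha_old)^2 + sign*b*(z - alpha_old)
// with a = x_i^T x_i and b = y_i * w^T x_i. Its derivatives,
//     g'(z)  = a*(z - alpha_old) + sign*b + log(z / (C-z))
//     g''(z) = a + C / (z * (C-z)),
// are exactly the gp/gpp quantities used in the inner Newton iteration, which
// stops once |g'(z)| falls below the adaptive tolerance innereps.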

void CLibLinear::solve_l2r_lr_dual(const problem *prob, double eps, double Cp, double Cn)
{
    int l = prob->l;
    int w_size = prob->n;
    int i, s, iter = 0;
    double *xTx = new double[l];
    int max_iter = 1000;
    int *index = new int[l];
    double *alpha = new double[2*l]; // store alpha and C - alpha
    int32_t *y = SG_MALLOC(int32_t, l);
    int max_inner_iter = 100; // for inner Newton
    double innereps = 1e-2;
    double innereps_min = CMath::min(1e-8, eps);
    double upper_bound[3] = {Cn, 0, Cp};
    double Gmax_init = 0;

    for(i=0; i<l; i++)
    {
        if(prob->y[i] > 0)
        {
            y[i] = +1;
        }
        else
        {
            y[i] = -1;
        }
    }

    // Initial alpha can be set here. Note that
    // 0 < alpha[i] < upper_bound[GETI(i)]
    // alpha[2*i] + alpha[2*i+1] = upper_bound[GETI(i)]
    for(i=0; i<l; i++)
    {
        alpha[2*i] = CMath::min(0.001*upper_bound[GETI(i)], 1e-8);
        alpha[2*i+1] = upper_bound[GETI(i)] - alpha[2*i];
    }

    for(i=0; i<w_size; i++)
        w[i] = 0;
    for(i=0; i<l; i++)
    {
        xTx[i] = prob->x->dot(i, prob->x,i);
        prob->x->add_to_dense_vec(y[i]*alpha[2*i], i, w.vector, w_size);

        if (prob->use_bias)
        {
            w.vector[w_size]+=y[i]*alpha[2*i];
            xTx[i]+=1;
        }
        index[i] = i;
    }

    while (iter < max_iter)
    {
        for (i=0; i<l; i++)
        {
            int j = i+rand()%(l-i);
            CMath::swap(index[i], index[j]);
        }
        int newton_iter = 0;
        double Gmax = 0;
        for (s=0; s<l; s++)
        {
            i = index[s];
            int32_t yi = y[i];
            double C = upper_bound[GETI(i)];
            double ywTx = 0, xisq = xTx[i];

            ywTx = prob->x->dense_dot(i, w.vector, w_size);
            if (prob->use_bias)
                ywTx+=w.vector[w_size];

            ywTx *= y[i];
            double a = xisq, b = ywTx;

            // Decide to minimize g_1(z) or g_2(z)
            int ind1 = 2*i, ind2 = 2*i+1, sign = 1;
            if(0.5*a*(alpha[ind2]-alpha[ind1])+b < 0)
            {
                ind1 = 2*i+1;
                ind2 = 2*i;
                sign = -1;
            }

            // g_t(z) = z*log(z) + (C-z)*log(C-z) + 0.5a(z-alpha_old)^2 + sign*b(z-alpha_old)
            double alpha_old = alpha[ind1];
            double z = alpha_old;
            if(C - z < 0.5 * C)
                z = 0.1*z;
            double gp = a*(z-alpha_old)+sign*b+CMath::log(z/(C-z));
            Gmax = CMath::max(Gmax, CMath::abs(gp));

            // Newton method on the sub-problem
            const double eta = 0.1; // xi in the paper
            int inner_iter = 0;
            while (inner_iter <= max_inner_iter)
            {
                if(fabs(gp) < innereps)
                    break;
                double gpp = a + C/(C-z)/z;
                double tmpz = z - gp/gpp;
                if(tmpz <= 0)
                    z *= eta;
                else // tmpz in (0, C)
                    z = tmpz;
                gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
                newton_iter++;
                inner_iter++;
            }

            if(inner_iter > 0) // update w
            {
                alpha[ind1] = z;
                alpha[ind2] = C-z;

                prob->x->add_to_dense_vec(sign*(z-alpha_old)*yi, i, w.vector, w_size);

                if (prob->use_bias)
                    w.vector[w_size]+=sign*(z-alpha_old)*yi;
            }
        }

        // record the first-iteration gradient maximum before incrementing iter,
        // as in the other solvers (otherwise Gmax_init would never be set)
        if(iter == 0)
            Gmax_init = Gmax;
        iter++;

        SG_SABS_PROGRESS(Gmax, -CMath::log10(Gmax), -CMath::log10(Gmax_init), -CMath::log10(eps*Gmax_init), 6);

        if(Gmax < eps)
            break;

        if(newton_iter <= l/10)
            innereps = CMath::max(innereps_min, 0.1*innereps);
    }

    SG_DONE();
    SG_INFO("optimization finished, #iter = %d\n",iter);
    if (iter >= max_iter)
        SG_WARNING("reaching max number of iterations\nUsing -s 0 may be faster (also see FAQ)\n\n");

    // calculate objective value

    double v = 0;
    for(i=0; i<w_size; i++)
        v += w[i] * w[i];
    v *= 0.5;
    for(i=0; i<l; i++)
        v += alpha[2*i] * log(alpha[2*i]) + alpha[2*i+1] * log(alpha[2*i+1])
            - upper_bound[GETI(i)] * log(upper_bound[GETI(i)]);
    SG_INFO("Objective value = %lf\n", v);

    delete [] xTx;
    delete [] alpha;
    SG_FREE(y); // y was allocated with SG_MALLOC, so release it with SG_FREE rather than delete[]
    delete [] index;
}

void CLibLinear::set_linear_term(const SGVector<float64_t> linear_term)
{
    if (!m_labels)
        SG_ERROR("Please assign labels first!\n");

    int32_t num_labels=m_labels->get_num_labels();

    if (num_labels!=linear_term.vlen)
    {
        SG_ERROR("Number of labels (%d) does not match number"
                " of entries (%d) in linear term \n", num_labels,
                linear_term.vlen);
    }

    m_linear_term=linear_term;
}

SGVector<float64_t> CLibLinear::get_linear_term()
{
    if (!m_linear_term.vlen || !m_linear_term.vector)
        SG_ERROR("Please assign linear term first!\n");

    return m_linear_term;
}

void CLibLinear::init_linear_term()
{
    if (!m_labels)
        SG_ERROR("Please assign labels first!\n");

    m_linear_term=SGVector<float64_t>(m_labels->get_num_labels());
    SGVector<float64_t>::fill_vector(m_linear_term.vector, m_linear_term.vlen, -1.0);
}

#endif //HAVE_LAPACK
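
/*
 * Illustrative sketch for the linear term above (variable names `labels` and
 * `svm` are placeholders): init_linear_term() fills the term with -1, which
 * reproduces the standard SVM dual, while a user-supplied vector replaces the
 * -1 per training example in solve_l2r_l1l2_svc().
 *
 *   SGVector<float64_t> lin(labels->get_num_labels());
 *   SGVector<float64_t>::fill_vector(lin.vector, lin.vlen, -1.0);
 *   lin[0] = -2.0;                  // change the linear coefficient of the first example
 *   svm->set_linear_term(lin);
 *   svm->train();
 */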