SHOGUN
v2.0.0
|
00001 /* 00002 * Limited memory BFGS (L-BFGS). 00003 * 00004 * Copyright (c) 1990, Jorge Nocedal 00005 * Copyright (c) 2007-2010 Naoaki Okazaki 00006 * All rights reserved. 00007 * 00008 * Permission is hereby granted, free of charge, to any person obtaining a copy 00009 * of this software and associated documentation files (the "Software"), to deal 00010 * in the Software without restriction, including without limitation the rights 00011 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 00012 * copies of the Software, and to permit persons to whom the Software is 00013 * furnished to do so, subject to the following conditions: 00014 * 00015 * The above copyright notice and this permission notice shall be included in 00016 * all copies or substantial portions of the Software. 00017 * 00018 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 00019 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 00020 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 00021 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 00022 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 00023 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 00024 * THE SOFTWARE. 00025 */ 00026 00027 /* $Id$ */ 00028 00029 /* 00030 This library is a C port of the FORTRAN implementation of Limited-memory 00031 Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal. 00032 The original FORTRAN source code is available at: 00033 http://www.ece.northwestern.edu/~nocedal/lbfgs.html 00034 00035 The L-BFGS algorithm is described in: 00036 - Jorge Nocedal. 00037 Updating Quasi-Newton Matrices with Limited Storage. 00038 <i>Mathematics of Computation</i>, Vol. 35, No. 151, pp. 773--782, 1980. 00039 - Dong C. Liu and Jorge Nocedal. 00040 On the limited memory BFGS method for large scale optimization. 00041 <i>Mathematical Programming</i> B, Vol. 45, No. 3, pp. 503-528, 1989. 00042 00043 The line search algorithms used in this implementation are described in: 00044 - John E. Dennis and Robert B. Schnabel. 00045 <i>Numerical Methods for Unconstrained Optimization and Nonlinear 00046 Equations</i>, Englewood Cliffs, 1983. 00047 - Jorge J. More and David J. Thuente. 00048 Line search algorithm with guaranteed sufficient decrease. 00049 <i>ACM Transactions on Mathematical Software (TOMS)</i>, Vol. 20, No. 3, 00050 pp. 286-307, 1994. 00051 00052 This library also implements Orthant-Wise Limited-memory Quasi-Newton (OWL-QN) 00053 method presented in: 00054 - Galen Andrew and Jianfeng Gao. 00055 Scalable training of L1-regularized log-linear models. 00056 In <i>Proceedings of the 24th International Conference on Machine 00057 Learning (ICML 2007)</i>, pp. 33-40, 2007. 00058 00059 I would like to thank the original author, Jorge Nocedal, who has been 00060 distributing the effieicnt and explanatory implementation in an open source 00061 licence. 00062 */ 00063 00064 #include <algorithm> 00065 #include <cstdio> 00066 #include <cstdlib> 00067 #include <cmath> 00068 00069 #include <shogun/optimization/lbfgs/lbfgs.h> 00070 #include <shogun/lib/SGVector.h> 00071 00072 namespace shogun 00073 { 00074 00075 #define min2(a, b) ((a) <= (b) ? (a) : (b)) 00076 #define max2(a, b) ((a) >= (b) ? (a) : (b)) 00077 #define max3(a, b, c) max2(max2((a), (b)), (c)); 00078 00079 struct tag_callback_data { 00080 int32_t n; 00081 void *instance; 00082 lbfgs_evaluate_t proc_evaluate; 00083 lbfgs_progress_t proc_progress; 00084 }; 00085 typedef struct tag_callback_data callback_data_t; 00086 00087 struct tag_iteration_data { 00088 float64_t alpha; 00089 float64_t *s; /* [n] */ 00090 float64_t *y; /* [n] */ 00091 float64_t ys; /* vecdot(y, s) */ 00092 }; 00093 typedef struct tag_iteration_data iteration_data_t; 00094 00095 static const lbfgs_parameter_t _defparam = { 00096 6, 1e-5, 0, 1e-5, 00097 0, LBFGS_LINESEARCH_DEFAULT, 40, 00098 1e-20, 1e20, 1e-4, 0.9, 0.9, 1.0e-16, 00099 0.0, 0, -1, 00100 }; 00101 00102 /* Forward function declarations. */ 00103 00104 typedef int32_t (*line_search_proc)( 00105 int32_t n, 00106 float64_t *x, 00107 float64_t *f, 00108 float64_t *g, 00109 float64_t *s, 00110 float64_t *stp, 00111 const float64_t* xp, 00112 const float64_t* gp, 00113 float64_t *wa, 00114 callback_data_t *cd, 00115 const lbfgs_parameter_t *param 00116 ); 00117 00118 static int32_t line_search_backtracking( 00119 int32_t n, 00120 float64_t *x, 00121 float64_t *f, 00122 float64_t *g, 00123 float64_t *s, 00124 float64_t *stp, 00125 const float64_t* xp, 00126 const float64_t* gp, 00127 float64_t *wa, 00128 callback_data_t *cd, 00129 const lbfgs_parameter_t *param 00130 ); 00131 00132 static int32_t line_search_backtracking_owlqn( 00133 int32_t n, 00134 float64_t *x, 00135 float64_t *f, 00136 float64_t *g, 00137 float64_t *s, 00138 float64_t *stp, 00139 const float64_t* xp, 00140 const float64_t* gp, 00141 float64_t *wp, 00142 callback_data_t *cd, 00143 const lbfgs_parameter_t *param 00144 ); 00145 00146 static int32_t line_search_morethuente( 00147 int32_t n, 00148 float64_t *x, 00149 float64_t *f, 00150 float64_t *g, 00151 float64_t *s, 00152 float64_t *stp, 00153 const float64_t* xp, 00154 const float64_t* gp, 00155 float64_t *wa, 00156 callback_data_t *cd, 00157 const lbfgs_parameter_t *param 00158 ); 00159 00160 static int32_t update_trial_interval( 00161 float64_t *x, 00162 float64_t *fx, 00163 float64_t *dx, 00164 float64_t *y, 00165 float64_t *fy, 00166 float64_t *dy, 00167 float64_t *t, 00168 float64_t *ft, 00169 float64_t *dt, 00170 const float64_t tmin, 00171 const float64_t tmax, 00172 int32_t *brackt 00173 ); 00174 00175 static float64_t owlqn_x1norm( 00176 const float64_t* x, 00177 const int32_t start, 00178 const int32_t n 00179 ); 00180 00181 static void owlqn_pseudo_gradient( 00182 float64_t* pg, 00183 const float64_t* x, 00184 const float64_t* g, 00185 const int32_t n, 00186 const float64_t c, 00187 const int32_t start, 00188 const int32_t end 00189 ); 00190 00191 static void owlqn_project( 00192 float64_t* d, 00193 const float64_t* sign, 00194 const int32_t start, 00195 const int32_t end 00196 ); 00197 00198 00199 void lbfgs_parameter_init(lbfgs_parameter_t *param) 00200 { 00201 memcpy(param, &_defparam, sizeof(*param)); 00202 } 00203 00204 int32_t lbfgs( 00205 int32_t n, 00206 float64_t *x, 00207 float64_t *ptr_fx, 00208 lbfgs_evaluate_t proc_evaluate, 00209 lbfgs_progress_t proc_progress, 00210 void *instance, 00211 lbfgs_parameter_t *_param 00212 ) 00213 { 00214 int32_t ret; 00215 int32_t i, j, k, ls, end, bound; 00216 float64_t step; 00217 00218 /* Constant parameters and their default values. */ 00219 lbfgs_parameter_t param = (_param != NULL) ? (*_param) : _defparam; 00220 const int32_t m = param.m; 00221 00222 float64_t *xp = NULL; 00223 float64_t *g = NULL, *gp = NULL, *pg = NULL; 00224 float64_t *d = NULL, *w = NULL, *pf = NULL; 00225 iteration_data_t *lm = NULL, *it = NULL; 00226 float64_t ys, yy; 00227 float64_t xnorm, gnorm, beta; 00228 float64_t fx = 0.; 00229 float64_t rate = 0.; 00230 line_search_proc linesearch = line_search_morethuente; 00231 00232 /* Construct a callback data. */ 00233 callback_data_t cd; 00234 cd.n = n; 00235 cd.instance = instance; 00236 cd.proc_evaluate = proc_evaluate; 00237 cd.proc_progress = proc_progress; 00238 00239 /* Check the input parameters for errors. */ 00240 if (n <= 0) { 00241 return LBFGSERR_INVALID_N; 00242 } 00243 if (param.epsilon < 0.) { 00244 return LBFGSERR_INVALID_EPSILON; 00245 } 00246 if (param.past < 0) { 00247 return LBFGSERR_INVALID_TESTPERIOD; 00248 } 00249 if (param.delta < 0.) { 00250 return LBFGSERR_INVALID_DELTA; 00251 } 00252 if (param.min_step < 0.) { 00253 return LBFGSERR_INVALID_MINSTEP; 00254 } 00255 if (param.max_step < param.min_step) { 00256 return LBFGSERR_INVALID_MAXSTEP; 00257 } 00258 if (param.ftol < 0.) { 00259 return LBFGSERR_INVALID_FTOL; 00260 } 00261 if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE || 00262 param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { 00263 if (param.wolfe <= param.ftol || 1. <= param.wolfe) { 00264 return LBFGSERR_INVALID_WOLFE; 00265 } 00266 } 00267 if (param.gtol < 0.) { 00268 return LBFGSERR_INVALID_GTOL; 00269 } 00270 if (param.xtol < 0.) { 00271 return LBFGSERR_INVALID_XTOL; 00272 } 00273 if (param.max_linesearch <= 0) { 00274 return LBFGSERR_INVALID_MAXLINESEARCH; 00275 } 00276 if (param.orthantwise_c < 0.) { 00277 return LBFGSERR_INVALID_ORTHANTWISE; 00278 } 00279 if (param.orthantwise_start < 0 || n < param.orthantwise_start) { 00280 return LBFGSERR_INVALID_ORTHANTWISE_START; 00281 } 00282 if (param.orthantwise_end < 0) { 00283 param.orthantwise_end = n; 00284 } 00285 if (n < param.orthantwise_end) { 00286 return LBFGSERR_INVALID_ORTHANTWISE_END; 00287 } 00288 if (param.orthantwise_c != 0.) { 00289 switch (param.linesearch) { 00290 case LBFGS_LINESEARCH_BACKTRACKING: 00291 linesearch = line_search_backtracking_owlqn; 00292 break; 00293 default: 00294 /* Only the backtracking method is available. */ 00295 return LBFGSERR_INVALID_LINESEARCH; 00296 } 00297 } else { 00298 switch (param.linesearch) { 00299 case LBFGS_LINESEARCH_MORETHUENTE: 00300 linesearch = line_search_morethuente; 00301 break; 00302 case LBFGS_LINESEARCH_BACKTRACKING_ARMIJO: 00303 case LBFGS_LINESEARCH_BACKTRACKING_WOLFE: 00304 case LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE: 00305 linesearch = line_search_backtracking; 00306 break; 00307 default: 00308 return LBFGSERR_INVALID_LINESEARCH; 00309 } 00310 } 00311 00312 /* Allocate working space. */ 00313 xp = SG_CALLOC(float64_t, n); 00314 g = SG_CALLOC(float64_t, n); 00315 gp = SG_CALLOC(float64_t, n); 00316 d = SG_CALLOC(float64_t, n); 00317 w = SG_CALLOC(float64_t, n); 00318 if (xp == NULL || g == NULL || gp == NULL || d == NULL || w == NULL) { 00319 ret = LBFGSERR_OUTOFMEMORY; 00320 goto lbfgs_exit; 00321 } 00322 00323 if (param.orthantwise_c != 0.) { 00324 /* Allocate working space for OW-LQN. */ 00325 pg = SG_CALLOC(float64_t, n); 00326 if (pg == NULL) { 00327 ret = LBFGSERR_OUTOFMEMORY; 00328 goto lbfgs_exit; 00329 } 00330 } 00331 00332 /* Allocate limited memory storage. */ 00333 lm = SG_CALLOC(iteration_data_t, m); 00334 if (lm == NULL) { 00335 ret = LBFGSERR_OUTOFMEMORY; 00336 goto lbfgs_exit; 00337 } 00338 00339 /* Initialize the limited memory. */ 00340 for (i = 0;i < m;++i) { 00341 it = &lm[i]; 00342 it->alpha = 0; 00343 it->ys = 0; 00344 it->s = SG_CALLOC(float64_t, n); 00345 it->y = SG_CALLOC(float64_t, n); 00346 if (it->s == NULL || it->y == NULL) { 00347 ret = LBFGSERR_OUTOFMEMORY; 00348 goto lbfgs_exit; 00349 } 00350 } 00351 00352 /* Allocate an array for storing previous values of the objective function. */ 00353 if (0 < param.past) { 00354 pf = SG_CALLOC(float64_t, param.past); 00355 } 00356 00357 /* Evaluate the function value and its gradient. */ 00358 fx = cd.proc_evaluate(cd.instance, x, g, cd.n, 0); 00359 if (0. != param.orthantwise_c) { 00360 /* Compute the L1 norm of the variable and add it to the object value. */ 00361 xnorm = owlqn_x1norm(x, param.orthantwise_start, param.orthantwise_end); 00362 fx += xnorm * param.orthantwise_c; 00363 owlqn_pseudo_gradient( 00364 pg, x, g, n, 00365 param.orthantwise_c, param.orthantwise_start, param.orthantwise_end 00366 ); 00367 } 00368 00369 /* Store the initial value of the objective function. */ 00370 if (pf != NULL) { 00371 pf[0] = fx; 00372 } 00373 00374 /* 00375 Compute the direction; 00376 we assume the initial hessian matrix H_0 as the identity matrix. 00377 */ 00378 if (param.orthantwise_c == 0.) { 00379 std::copy(g,g+n,d); 00380 SGVector<float64_t>::scale_vector(-1, d, n); 00381 } else { 00382 std::copy(pg,pg+n,d); 00383 SGVector<float64_t>::scale_vector(-1, d, n); 00384 } 00385 00386 /* 00387 Make sure that the initial variables are not a minimizer. 00388 */ 00389 xnorm = SGVector<float64_t>::twonorm(x, n); 00390 if (param.orthantwise_c == 0.) { 00391 gnorm = SGVector<float64_t>::twonorm(g, n); 00392 } else { 00393 gnorm = SGVector<float64_t>::twonorm(pg, n); 00394 } 00395 if (xnorm < 1.0) xnorm = 1.0; 00396 if (gnorm / xnorm <= param.epsilon) { 00397 ret = LBFGS_ALREADY_MINIMIZED; 00398 goto lbfgs_exit; 00399 } 00400 00401 /* Compute the initial step: 00402 step = 1.0 / sqrt(vecdot(d, d, n)) 00403 */ 00404 step = 1.0 / SGVector<float64_t>::twonorm(d, n); 00405 00406 k = 1; 00407 end = 0; 00408 for (;;) { 00409 /* Store the current position and gradient vectors. */ 00410 std::copy(x,x+n,xp); 00411 std::copy(g,g+n,gp); 00412 00413 /* Search for an optimal step. */ 00414 if (param.orthantwise_c == 0.) { 00415 ls = linesearch(n, x, &fx, g, d, &step, xp, gp, w, &cd, ¶m); 00416 } else { 00417 ls = linesearch(n, x, &fx, g, d, &step, xp, pg, w, &cd, ¶m); 00418 owlqn_pseudo_gradient( 00419 pg, x, g, n, 00420 param.orthantwise_c, param.orthantwise_start, param.orthantwise_end 00421 ); 00422 } 00423 if (ls < 0) { 00424 /* Revert to the previous point. */ 00425 std::copy(xp,xp+n,x); 00426 std::copy(gp,gp+n,g); 00427 ret = ls; 00428 goto lbfgs_exit; 00429 } 00430 00431 /* Compute x and g norms. */ 00432 xnorm = SGVector<float64_t>::twonorm(x, n); 00433 if (param.orthantwise_c == 0.) { 00434 gnorm = SGVector<float64_t>::twonorm(g, n); 00435 } else { 00436 gnorm = SGVector<float64_t>::twonorm(pg, n); 00437 } 00438 00439 /* Report the progress. */ 00440 if (cd.proc_progress) { 00441 if ((ret = cd.proc_progress(cd.instance, x, g, fx, xnorm, gnorm, step, cd.n, k, ls))) { 00442 goto lbfgs_exit; 00443 } 00444 } 00445 00446 /* 00447 Convergence test. 00448 The criterion is given by the following formula: 00449 |g(x)| / \max2(1, |x|) < \epsilon 00450 */ 00451 if (xnorm < 1.0) xnorm = 1.0; 00452 if (gnorm / xnorm <= param.epsilon) { 00453 /* Convergence. */ 00454 ret = LBFGS_SUCCESS; 00455 break; 00456 } 00457 00458 /* 00459 Test for stopping criterion. 00460 The criterion is given by the following formula: 00461 (f(past_x) - f(x)) / f(x) < \delta 00462 */ 00463 if (pf != NULL) { 00464 /* We don't test the stopping criterion while k < past. */ 00465 if (param.past <= k) { 00466 /* Compute the relative improvement from the past. */ 00467 rate = (pf[k % param.past] - fx) / fx; 00468 00469 /* The stopping criterion. */ 00470 if (rate < param.delta) { 00471 ret = LBFGS_STOP; 00472 break; 00473 } 00474 } 00475 00476 /* Store the current value of the objective function. */ 00477 pf[k % param.past] = fx; 00478 } 00479 00480 if (param.max_iterations != 0 && param.max_iterations < k+1) { 00481 /* Maximum number of iterations. */ 00482 ret = LBFGSERR_MAXIMUMITERATION; 00483 break; 00484 } 00485 00486 /* 00487 Update vectors s and y: 00488 s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}. 00489 y_{k+1} = g_{k+1} - g_{k}. 00490 */ 00491 it = &lm[end]; 00492 SGVector<float64_t>::add(it->s, 1, x, -1, xp, n); 00493 SGVector<float64_t>::add(it->y, 1, g, -1, gp, n); 00494 00495 /* 00496 Compute scalars ys and yy: 00497 ys = y^t \cdot s = 1 / \rho. 00498 yy = y^t \cdot y. 00499 Notice that yy is used for scaling the hessian matrix H_0 (Cholesky factor). 00500 */ 00501 ys = SGVector<float64_t>::dot(it->y, it->s, n); 00502 yy = SGVector<float64_t>::dot(it->y, it->y, n); 00503 it->ys = ys; 00504 00505 /* 00506 Recursive formula to compute dir = -(H \cdot g). 00507 This is described in page 779 of: 00508 Jorge Nocedal. 00509 Updating Quasi-Newton Matrices with Limited Storage. 00510 Mathematics of Computation, Vol. 35, No. 151, 00511 pp. 773--782, 1980. 00512 */ 00513 bound = (m <= k) ? m : k; 00514 ++k; 00515 end = (end + 1) % m; 00516 00517 /* Compute the steepest direction. */ 00518 if (param.orthantwise_c == 0.) { 00519 /* Compute the negative of gradients. */ 00520 std::copy(g, g+n, d); 00521 SGVector<float64_t>::scale_vector(-1, d, n); 00522 } else { 00523 std::copy(pg, pg+n, d); 00524 SGVector<float64_t>::scale_vector(-1, d, n); 00525 } 00526 00527 j = end; 00528 for (i = 0;i < bound;++i) { 00529 j = (j + m - 1) % m; /* if (--j == -1) j = m-1; */ 00530 it = &lm[j]; 00531 /* \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}. */ 00532 it->alpha = SGVector<float64_t>::dot(it->s, d, n); 00533 it->alpha /= it->ys; 00534 /* q_{i} = q_{i+1} - \alpha_{i} y_{i}. */ 00535 SGVector<float64_t>::add(d, 1, d, -it->alpha, it->y, n); 00536 } 00537 00538 SGVector<float64_t>::scale_vector(ys / yy, d, n); 00539 00540 for (i = 0;i < bound;++i) { 00541 it = &lm[j]; 00542 /* \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}. */ 00543 beta = SGVector<float64_t>::dot(it->y, d, n); 00544 beta /= it->ys; 00545 /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}. */ 00546 SGVector<float64_t>::add(d, 1, d, it->alpha-beta, it->s, n); 00547 j = (j + 1) % m; /* if (++j == m) j = 0; */ 00548 } 00549 00550 /* 00551 Constrain the search direction for orthant-wise updates. 00552 */ 00553 if (param.orthantwise_c != 0.) { 00554 for (i = param.orthantwise_start;i < param.orthantwise_end;++i) { 00555 if (d[i] * pg[i] >= 0) { 00556 d[i] = 0; 00557 } 00558 } 00559 } 00560 00561 /* 00562 Now the search direction d is ready. We try step = 1 first. 00563 */ 00564 step = 1.0; 00565 } 00566 00567 lbfgs_exit: 00568 /* Return the final value of the objective function. */ 00569 if (ptr_fx != NULL) { 00570 *ptr_fx = fx; 00571 } 00572 00573 SG_FREE(pf); 00574 00575 /* Free memory blocks used by this function. */ 00576 if (lm != NULL) { 00577 for (i = 0;i < m;++i) { 00578 SG_FREE(lm[i].s); 00579 SG_FREE(lm[i].y); 00580 } 00581 SG_FREE(lm); 00582 } 00583 SG_FREE(pg); 00584 SG_FREE(w); 00585 SG_FREE(d); 00586 SG_FREE(gp); 00587 SG_FREE(g); 00588 SG_FREE(xp); 00589 00590 return ret; 00591 } 00592 00593 00594 00595 static int32_t line_search_backtracking( 00596 int32_t n, 00597 float64_t *x, 00598 float64_t *f, 00599 float64_t *g, 00600 float64_t *s, 00601 float64_t *stp, 00602 const float64_t* xp, 00603 const float64_t* gp, 00604 float64_t *wp, 00605 callback_data_t *cd, 00606 const lbfgs_parameter_t *param 00607 ) 00608 { 00609 int32_t count = 0; 00610 float64_t width, dg; 00611 float64_t finit, dginit = 0., dgtest; 00612 const float64_t dec = 0.5, inc = 2.1; 00613 00614 /* Check the input parameters for errors. */ 00615 if (*stp <= 0.) { 00616 return LBFGSERR_INVALIDPARAMETERS; 00617 } 00618 00619 /* Compute the initial gradient in the search direction. */ 00620 dginit = SGVector<float64_t>::dot(g, s, n); 00621 00622 /* Make sure that s points to a descent direction. */ 00623 if (0 < dginit) { 00624 return LBFGSERR_INCREASEGRADIENT; 00625 } 00626 00627 /* The initial value of the objective function. */ 00628 finit = *f; 00629 dgtest = param->ftol * dginit; 00630 00631 for (;;) { 00632 std::copy(xp,xp+n,x); 00633 SGVector<float64_t>::add(x, 1, x, *stp, s, n); 00634 00635 /* Evaluate the function and gradient values. */ 00636 *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp); 00637 00638 ++count; 00639 00640 if (*f > finit + *stp * dgtest) { 00641 width = dec; 00642 } else { 00643 /* The sufficient decrease condition (Armijo condition). */ 00644 if (param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) { 00645 /* Exit with the Armijo condition. */ 00646 return count; 00647 } 00648 00649 /* Check the Wolfe condition. */ 00650 dg = SGVector<float64_t>::dot(g, s, n); 00651 if (dg < param->wolfe * dginit) { 00652 width = inc; 00653 } else { 00654 if(param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE) { 00655 /* Exit with the regular Wolfe condition. */ 00656 return count; 00657 } 00658 00659 /* Check the strong Wolfe condition. */ 00660 if(dg > -param->wolfe * dginit) { 00661 width = dec; 00662 } else { 00663 /* Exit with the strong Wolfe condition. */ 00664 return count; 00665 } 00666 } 00667 } 00668 00669 if (*stp < param->min_step) { 00670 /* The step is the minimum value. */ 00671 return LBFGSERR_MINIMUMSTEP; 00672 } 00673 if (*stp > param->max_step) { 00674 /* The step is the maximum value. */ 00675 return LBFGSERR_MAXIMUMSTEP; 00676 } 00677 if (param->max_linesearch <= count) { 00678 /* Maximum number of iteration. */ 00679 return LBFGSERR_MAXIMUMLINESEARCH; 00680 } 00681 00682 (*stp) *= width; 00683 } 00684 } 00685 00686 00687 00688 static int32_t line_search_backtracking_owlqn( 00689 int32_t n, 00690 float64_t *x, 00691 float64_t *f, 00692 float64_t *g, 00693 float64_t *s, 00694 float64_t *stp, 00695 const float64_t* xp, 00696 const float64_t* gp, 00697 float64_t *wp, 00698 callback_data_t *cd, 00699 const lbfgs_parameter_t *param 00700 ) 00701 { 00702 int32_t i, count = 0; 00703 float64_t width = 0.5, norm = 0.; 00704 float64_t finit = *f, dgtest; 00705 00706 /* Check the input parameters for errors. */ 00707 if (*stp <= 0.) { 00708 return LBFGSERR_INVALIDPARAMETERS; 00709 } 00710 00711 /* Choose the orthant for the new point. */ 00712 for (i = 0;i < n;++i) { 00713 wp[i] = (xp[i] == 0.) ? -gp[i] : xp[i]; 00714 } 00715 00716 for (;;) { 00717 /* Update the current point. */ 00718 std::copy(xp,xp+n,x); 00719 SGVector<float64_t>::add(x, 1, x, *stp, s, n); 00720 00721 /* The current point is projected onto the orthant. */ 00722 owlqn_project(x, wp, param->orthantwise_start, param->orthantwise_end); 00723 00724 /* Evaluate the function and gradient values. */ 00725 *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp); 00726 00727 /* Compute the L1 norm of the variables and add it to the object value. */ 00728 norm = owlqn_x1norm(x, param->orthantwise_start, param->orthantwise_end); 00729 *f += norm * param->orthantwise_c; 00730 00731 ++count; 00732 00733 dgtest = 0.; 00734 for (i = 0;i < n;++i) { 00735 dgtest += (x[i] - xp[i]) * gp[i]; 00736 } 00737 00738 if (*f <= finit + param->ftol * dgtest) { 00739 /* The sufficient decrease condition. */ 00740 return count; 00741 } 00742 00743 if (*stp < param->min_step) { 00744 /* The step is the minimum value. */ 00745 return LBFGSERR_MINIMUMSTEP; 00746 } 00747 if (*stp > param->max_step) { 00748 /* The step is the maximum value. */ 00749 return LBFGSERR_MAXIMUMSTEP; 00750 } 00751 if (param->max_linesearch <= count) { 00752 /* Maximum number of iteration. */ 00753 return LBFGSERR_MAXIMUMLINESEARCH; 00754 } 00755 00756 (*stp) *= width; 00757 } 00758 } 00759 00760 00761 00762 static int32_t line_search_morethuente( 00763 int32_t n, 00764 float64_t *x, 00765 float64_t *f, 00766 float64_t *g, 00767 float64_t *s, 00768 float64_t *stp, 00769 const float64_t* xp, 00770 const float64_t* gp, 00771 float64_t *wa, 00772 callback_data_t *cd, 00773 const lbfgs_parameter_t *param 00774 ) 00775 { 00776 int32_t count = 0; 00777 int32_t brackt, stage1, uinfo = 0; 00778 float64_t dg; 00779 float64_t stx, fx, dgx; 00780 float64_t sty, fy, dgy; 00781 float64_t fxm, dgxm, fym, dgym, fm, dgm; 00782 float64_t finit, ftest1, dginit, dgtest; 00783 float64_t width, prev_width; 00784 float64_t stmin, stmax; 00785 00786 /* Check the input parameters for errors. */ 00787 if (*stp <= 0.) { 00788 return LBFGSERR_INVALIDPARAMETERS; 00789 } 00790 00791 /* Compute the initial gradient in the search direction. */ 00792 dginit = SGVector<float64_t>::dot(g, s, n); 00793 00794 /* Make sure that s points to a descent direction. */ 00795 if (0 < dginit) { 00796 return LBFGSERR_INCREASEGRADIENT; 00797 } 00798 00799 /* Initialize local variables. */ 00800 brackt = 0; 00801 stage1 = 1; 00802 finit = *f; 00803 dgtest = param->ftol * dginit; 00804 width = param->max_step - param->min_step; 00805 prev_width = 2.0 * width; 00806 00807 /* 00808 The variables stx, fx, dgx contain the values of the step, 00809 function, and directional derivative at the best step. 00810 The variables sty, fy, dgy contain the value of the step, 00811 function, and derivative at the other endpoint of 00812 the interval of uncertainty. 00813 The variables stp, f, dg contain the values of the step, 00814 function, and derivative at the current step. 00815 */ 00816 stx = sty = 0.; 00817 fx = fy = finit; 00818 dgx = dgy = dginit; 00819 00820 for (;;) { 00821 /* 00822 Set the minimum and maximum steps to correspond to the 00823 present interval of uncertainty. 00824 */ 00825 if (brackt) { 00826 stmin = min2(stx, sty); 00827 stmax = max2(stx, sty); 00828 } else { 00829 stmin = stx; 00830 stmax = *stp + 4.0 * (*stp - stx); 00831 } 00832 00833 /* Clip the step in the range of [stpmin, stpmax]. */ 00834 if (*stp < param->min_step) *stp = param->min_step; 00835 if (param->max_step < *stp) *stp = param->max_step; 00836 00837 /* 00838 If an unusual termination is to occur then let 00839 stp be the lowest point obtained so far. 00840 */ 00841 if ((brackt && ((*stp <= stmin || stmax <= *stp) || param->max_linesearch <= count + 1 || uinfo != 0)) || (brackt && (stmax - stmin <= param->xtol * stmax))) { 00842 *stp = stx; 00843 } 00844 00845 /* 00846 Compute the current value of x: 00847 x <- x + (*stp) * s. 00848 */ 00849 std::copy(xp,xp+n,x); 00850 SGVector<float64_t>::add(x, 1, x, *stp, s, n); 00851 00852 /* Evaluate the function and gradient values. */ 00853 *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp); 00854 dg = SGVector<float64_t>::dot(g, s, n); 00855 00856 ftest1 = finit + *stp * dgtest; 00857 ++count; 00858 00859 /* Test for errors and convergence. */ 00860 if (brackt && ((*stp <= stmin || stmax <= *stp) || uinfo != 0)) { 00861 /* Rounding errors prevent further progress. */ 00862 return LBFGSERR_ROUNDING_ERROR; 00863 } 00864 if (*stp == param->max_step && *f <= ftest1 && dg <= dgtest) { 00865 /* The step is the maximum value. */ 00866 return LBFGSERR_MAXIMUMSTEP; 00867 } 00868 if (*stp == param->min_step && (ftest1 < *f || dgtest <= dg)) { 00869 /* The step is the minimum value. */ 00870 return LBFGSERR_MINIMUMSTEP; 00871 } 00872 if (brackt && (stmax - stmin) <= param->xtol * stmax) { 00873 /* Relative width of the interval of uncertainty is at most xtol. */ 00874 return LBFGSERR_WIDTHTOOSMALL; 00875 } 00876 if (param->max_linesearch <= count) { 00877 /* Maximum number of iteration. */ 00878 return LBFGSERR_MAXIMUMLINESEARCH; 00879 } 00880 if (*f <= ftest1 && fabs(dg) <= param->gtol * (-dginit)) { 00881 /* The sufficient decrease condition and the directional derivative condition hold. */ 00882 return count; 00883 } 00884 00885 /* 00886 In the first stage we seek a step for which the modified 00887 function has a nonpositive value and nonnegative derivative. 00888 */ 00889 if (stage1 && *f <= ftest1 && min2(param->ftol, param->gtol) * dginit <= dg) { 00890 stage1 = 0; 00891 } 00892 00893 /* 00894 A modified function is used to predict the step only if 00895 we have not obtained a step for which the modified 00896 function has a nonpositive function value and nonnegative 00897 derivative, and if a lower function value has been 00898 obtained but the decrease is not sufficient. 00899 */ 00900 if (stage1 && ftest1 < *f && *f <= fx) { 00901 /* Define the modified function and derivative values. */ 00902 fm = *f - *stp * dgtest; 00903 fxm = fx - stx * dgtest; 00904 fym = fy - sty * dgtest; 00905 dgm = dg - dgtest; 00906 dgxm = dgx - dgtest; 00907 dgym = dgy - dgtest; 00908 00909 /* 00910 Call update_trial_interval() to update the interval of 00911 uncertainty and to compute the new step. 00912 */ 00913 uinfo = update_trial_interval( 00914 &stx, &fxm, &dgxm, 00915 &sty, &fym, &dgym, 00916 stp, &fm, &dgm, 00917 stmin, stmax, &brackt 00918 ); 00919 00920 /* Reset the function and gradient values for f. */ 00921 fx = fxm + stx * dgtest; 00922 fy = fym + sty * dgtest; 00923 dgx = dgxm + dgtest; 00924 dgy = dgym + dgtest; 00925 } else { 00926 /* 00927 Call update_trial_interval() to update the interval of 00928 uncertainty and to compute the new step. 00929 */ 00930 uinfo = update_trial_interval( 00931 &stx, &fx, &dgx, 00932 &sty, &fy, &dgy, 00933 stp, f, &dg, 00934 stmin, stmax, &brackt 00935 ); 00936 } 00937 00938 /* 00939 Force a sufficient decrease in the interval of uncertainty. 00940 */ 00941 if (brackt) { 00942 if (0.66 * prev_width <= fabs(sty - stx)) { 00943 *stp = stx + 0.5 * (sty - stx); 00944 } 00945 prev_width = width; 00946 width = fabs(sty - stx); 00947 } 00948 } 00949 00950 return LBFGSERR_LOGICERROR; 00951 } 00952 00953 00954 00958 #define USES_MINIMIZER \ 00959 float64_t a, d, gamma, theta, p, q, r, s; 00960 00971 #define CUBIC_MINIMIZER(cm, u, fu, du, v, fv, dv) \ 00972 d = (v) - (u); \ 00973 theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \ 00974 p = fabs(theta); \ 00975 q = fabs(du); \ 00976 r = fabs(dv); \ 00977 s = max3(p, q, r); \ 00978 /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \ 00979 a = theta / s; \ 00980 gamma = s * sqrt(a * a - ((du) / s) * ((dv) / s)); \ 00981 if ((v) < (u)) gamma = -gamma; \ 00982 p = gamma - (du) + theta; \ 00983 q = gamma - (du) + gamma + (dv); \ 00984 r = p / q; \ 00985 (cm) = (u) + r * d; 00986 00999 #define CUBIC_MINIMIZER2(cm, u, fu, du, v, fv, dv, xmin, xmax) \ 01000 d = (v) - (u); \ 01001 theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \ 01002 p = fabs(theta); \ 01003 q = fabs(du); \ 01004 r = fabs(dv); \ 01005 s = max3(p, q, r); \ 01006 /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \ 01007 a = theta / s; \ 01008 gamma = s * sqrt(max2(0, a * a - ((du) / s) * ((dv) / s))); \ 01009 if ((u) < (v)) gamma = -gamma; \ 01010 p = gamma - (dv) + theta; \ 01011 q = gamma - (dv) + gamma + (du); \ 01012 r = p / q; \ 01013 if (r < 0. && gamma != 0.) { \ 01014 (cm) = (v) - r * d; \ 01015 } else if (a < 0) { \ 01016 (cm) = (xmax); \ 01017 } else { \ 01018 (cm) = (xmin); \ 01019 } 01020 01030 #define QUARD_MINIMIZER(qm, u, fu, du, v, fv) \ 01031 a = (v) - (u); \ 01032 (qm) = (u) + (du) / (((fu) - (fv)) / a + (du)) / 2 * a; 01033 01042 #define QUARD_MINIMIZER2(qm, u, du, v, dv) \ 01043 a = (u) - (v); \ 01044 (qm) = (v) + (dv) / ((dv) - (du)) * a; 01045 01046 #define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.) 01047 01077 static int32_t update_trial_interval( 01078 float64_t *x, 01079 float64_t *fx, 01080 float64_t *dx, 01081 float64_t *y, 01082 float64_t *fy, 01083 float64_t *dy, 01084 float64_t *t, 01085 float64_t *ft, 01086 float64_t *dt, 01087 const float64_t tmin, 01088 const float64_t tmax, 01089 int32_t *brackt 01090 ) 01091 { 01092 int32_t bound; 01093 int32_t dsign = fsigndiff(dt, dx); 01094 float64_t mc; /* minimizer of an interpolated cubic. */ 01095 float64_t mq; /* minimizer of an interpolated quadratic. */ 01096 float64_t newt; /* new trial value. */ 01097 USES_MINIMIZER; /* for CUBIC_MINIMIZER and QUARD_MINIMIZER. */ 01098 01099 /* Check the input parameters for errors. */ 01100 if (*brackt) { 01101 if (*t <= min2(*x, *y) || max2(*x, *y) <= *t) { 01102 /* The trival value t is out of the interval. */ 01103 return LBFGSERR_OUTOFINTERVAL; 01104 } 01105 if (0. <= *dx * (*t - *x)) { 01106 /* The function must decrease from x. */ 01107 return LBFGSERR_INCREASEGRADIENT; 01108 } 01109 if (tmax < tmin) { 01110 /* Incorrect tmin and tmax specified. */ 01111 return LBFGSERR_INCORRECT_TMINMAX; 01112 } 01113 } 01114 01115 /* 01116 Trial value selection. 01117 */ 01118 if (*fx < *ft) { 01119 /* 01120 Case 1: a higher function value. 01121 The minimum is brackt. If the cubic minimizer is closer 01122 to x than the quadratic one, the cubic one is taken, else 01123 the average of the minimizers is taken. 01124 */ 01125 *brackt = 1; 01126 bound = 1; 01127 CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt); 01128 QUARD_MINIMIZER(mq, *x, *fx, *dx, *t, *ft); 01129 if (fabs(mc - *x) < fabs(mq - *x)) { 01130 newt = mc; 01131 } else { 01132 newt = mc + 0.5 * (mq - mc); 01133 } 01134 } else if (dsign) { 01135 /* 01136 Case 2: a lower function value and derivatives of 01137 opposite sign. The minimum is brackt. If the cubic 01138 minimizer is closer to x than the quadratic (secant) one, 01139 the cubic one is taken, else the quadratic one is taken. 01140 */ 01141 *brackt = 1; 01142 bound = 0; 01143 CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt); 01144 QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt); 01145 if (fabs(mc - *t) > fabs(mq - *t)) { 01146 newt = mc; 01147 } else { 01148 newt = mq; 01149 } 01150 } else if (fabs(*dt) < fabs(*dx)) { 01151 /* 01152 Case 3: a lower function value, derivatives of the 01153 same sign, and the magnitude of the derivative decreases. 01154 The cubic minimizer is only used if the cubic tends to 01155 infinity in the direction of the minimizer or if the minimum 01156 of the cubic is beyond t. Otherwise the cubic minimizer is 01157 defined to be either tmin or tmax. The quadratic (secant) 01158 minimizer is also computed and if the minimum is brackt 01159 then the the minimizer closest to x is taken, else the one 01160 farthest away is taken. 01161 */ 01162 bound = 1; 01163 CUBIC_MINIMIZER2(mc, *x, *fx, *dx, *t, *ft, *dt, tmin, tmax); 01164 QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt); 01165 if (*brackt) { 01166 if (fabs(*t - mc) < fabs(*t - mq)) { 01167 newt = mc; 01168 } else { 01169 newt = mq; 01170 } 01171 } else { 01172 if (fabs(*t - mc) > fabs(*t - mq)) { 01173 newt = mc; 01174 } else { 01175 newt = mq; 01176 } 01177 } 01178 } else { 01179 /* 01180 Case 4: a lower function value, derivatives of the 01181 same sign, and the magnitude of the derivative does 01182 not decrease. If the minimum is not brackt, the step 01183 is either tmin or tmax, else the cubic minimizer is taken. 01184 */ 01185 bound = 0; 01186 if (*brackt) { 01187 CUBIC_MINIMIZER(newt, *t, *ft, *dt, *y, *fy, *dy); 01188 } else if (*x < *t) { 01189 newt = tmax; 01190 } else { 01191 newt = tmin; 01192 } 01193 } 01194 01195 /* 01196 Update the interval of uncertainty. This update does not 01197 depend on the new step or the case analysis above. 01198 01199 - Case a: if f(x) < f(t), 01200 x <- x, y <- t. 01201 - Case b: if f(t) <= f(x) && f'(t)*f'(x) > 0, 01202 x <- t, y <- y. 01203 - Case c: if f(t) <= f(x) && f'(t)*f'(x) < 0, 01204 x <- t, y <- x. 01205 */ 01206 if (*fx < *ft) { 01207 /* Case a */ 01208 *y = *t; 01209 *fy = *ft; 01210 *dy = *dt; 01211 } else { 01212 /* Case c */ 01213 if (dsign) { 01214 *y = *x; 01215 *fy = *fx; 01216 *dy = *dx; 01217 } 01218 /* Cases b and c */ 01219 *x = *t; 01220 *fx = *ft; 01221 *dx = *dt; 01222 } 01223 01224 /* Clip the new trial value in [tmin, tmax]. */ 01225 if (tmax < newt) newt = tmax; 01226 if (newt < tmin) newt = tmin; 01227 01228 /* 01229 Redefine the new trial value if it is close to the upper bound 01230 of the interval. 01231 */ 01232 if (*brackt && bound) { 01233 mq = *x + 0.66 * (*y - *x); 01234 if (*x < *y) { 01235 if (mq < newt) newt = mq; 01236 } else { 01237 if (newt < mq) newt = mq; 01238 } 01239 } 01240 01241 /* Return the new trial value. */ 01242 *t = newt; 01243 return 0; 01244 } 01245 01246 01247 01248 01249 01250 static float64_t owlqn_x1norm( 01251 const float64_t* x, 01252 const int32_t start, 01253 const int32_t n 01254 ) 01255 { 01256 int32_t i; 01257 float64_t norm = 0.; 01258 01259 for (i = start;i < n;++i) { 01260 norm += fabs(x[i]); 01261 } 01262 01263 return norm; 01264 } 01265 01266 static void owlqn_pseudo_gradient( 01267 float64_t* pg, 01268 const float64_t* x, 01269 const float64_t* g, 01270 const int32_t n, 01271 const float64_t c, 01272 const int32_t start, 01273 const int32_t end 01274 ) 01275 { 01276 int32_t i; 01277 01278 /* Compute the negative of gradients. */ 01279 for (i = 0;i < start;++i) { 01280 pg[i] = g[i]; 01281 } 01282 01283 /* Compute the psuedo-gradients. */ 01284 for (i = start;i < end;++i) { 01285 if (x[i] < 0.) { 01286 /* Differentiable. */ 01287 pg[i] = g[i] - c; 01288 } else if (0. < x[i]) { 01289 /* Differentiable. */ 01290 pg[i] = g[i] + c; 01291 } else { 01292 if (g[i] < -c) { 01293 /* Take the right partial derivative. */ 01294 pg[i] = g[i] + c; 01295 } else if (c < g[i]) { 01296 /* Take the left partial derivative. */ 01297 pg[i] = g[i] - c; 01298 } else { 01299 pg[i] = 0.; 01300 } 01301 } 01302 } 01303 01304 for (i = end;i < n;++i) { 01305 pg[i] = g[i]; 01306 } 01307 } 01308 01309 static void owlqn_project( 01310 float64_t* d, 01311 const float64_t* sign, 01312 const int32_t start, 01313 const int32_t end 01314 ) 01315 { 01316 int32_t i; 01317 01318 for (i = start;i < end;++i) { 01319 if (d[i] * sign[i] <= 0) { 01320 d[i] = 0; 01321 } 01322 } 01323 } 01324 01325 }