int line_search_backtracking_owlqn( int n, T *x, T *f, T *g, T *s, T *stp, const T* xp, const T* gp, T *wp, callback_data_t<T> *cd, const lbfgs_parameter_t *param ) { int i, count = 0; T width = 0.5, norm = 0.; T finit = *f, dgtest; /* Check the input parameters for errors. */ if (*stp <= 0.) { return LBFGSERR_INVALIDPARAMETERS; } /* Choose the orthant for the new point. */ for (i = 0;i < n;++i) { wp[i] = (xp[i] == 0.) ? -gp[i] : xp[i]; } for (;;) { /* Update the current point. */ veccpy(x, xp, n); vecadd(x, s, *stp, n); /* The current point is projected onto the orthant. */ owlqn_project(x, wp, param->orthantwise_start, param->orthantwise_end); /* Evaluate the function and gradient values. */ *f = cd->proc_evaluate(cd->instance, x, g, cd->n, *stp); /* Compute the L1 norm of the variables and add it to the object value. */ norm = owlqn_x1norm(x, param->orthantwise_start, param->orthantwise_end); *f += norm * param->orthantwise_c; ++count; dgtest = 0.; for (i = 0;i < n;++i) { dgtest += (x[i] - xp[i]) * gp[i]; } if (*f <= finit + param->ftol * dgtest) { /* The sufficient decrease condition. */ return count; } if (*stp < param->min_step) { /* The step is the minimum value. */ return LBFGSERR_MINIMUMSTEP; } if (*stp > param->max_step) { /* The step is the maximum value. */ return LBFGSERR_MAXIMUMSTEP; } if (param->max_linesearch <= count) { /* Maximum number of iteration. */ return LBFGSERR_MAXIMUMLINESEARCH; } (*stp) *= width; } }
/*
 * Backtracking line search for the orthant-wise (L1-regularised) mode.
 *
 * Tries x = xp + (*step) * s, projected onto the orthant stored in wp, and
 * halves *step until
 *     f(x) <= f_start + ftol * (x - xp)^T gp
 * where f_start is the value of *f on entry.
 *
 *   n       number of variables
 *   x       out: last trial point
 *   f       in: reference objective value; out: objective (including the
 *           L1 term) at the last trial point
 *   g       out: gradient written by the evaluation callback
 *   s       search direction
 *   step    in: initial step (must be > 0); out: last step tried
 *   xp, gp  point and gradient the search starts from
 *   wp      work array, overwritten with the orthant selector
 *   cd      callback data (evaluation callback, user instance, n)
 *   param   optimizer parameters
 *
 * Returns the number of evaluations performed when the test is satisfied,
 * otherwise one of the LBFGSERR_* codes.
 */
static int line_search_backtracking_owlqn(
    int n,
    double* x,
    double* f,
    double* g,
    double* s,
    double* step,
    const double* xp,
    const double* gp,
    double* wp,
    callback_data_t* cd,
    const lbfgs_parameter_t* param
    )
{
    int k;
    int n_eval = 0;
    double shrink = 0.5;
    double l1_norm = 0.0;
    double f_start = *f;
    double slope;

    /* The initial step must be strictly positive. */
    if (*step <= 0.0) {
        return LBFGSERR_INVALIDPARAMETERS;
    }

    /* Orthant selector: xp itself, or -gp for coordinates that are zero. */
    for (k = 0; k < n; k++) {
        if (xp[k] == 0.0) {
            wp[k] = -gp[k];
        } else {
            wp[k] = xp[k];
        }
    }

    while (1) {
        /* Build the trial point and project it onto the chosen orthant. */
        veccpy(x, xp, n);
        vecadd(x, s, *step, n);
        owlqn_project(x, wp, param->orthantwise_start, param->orthantwise_end);

        *f = cd->evaluate(cd->instance, cd->n, x, g, *step);
        n_eval++;

        /* Add the L1 penalty to the objective value. */
        l1_norm = owlqn_x1norm(x, param->orthantwise_start, param->orthantwise_end);
        *f += l1_norm * param->orthantwise_c;

        /* (x - xp)^T gp, using the projected displacement. */
        slope = 0.0;
        for (k = 0; k < n; k++) {
            slope += (x[k] - xp[k]) * gp[k];
        }

        if (*f <= f_start + param->ftol * slope) {
            /* Sufficient decrease reached. */
            return n_eval;
        }

        if (*step < param->min_step) {
            /* The step fell below the allowed minimum. */
            return LBFGSERR_MINIMUMSTEP;
        } else if (*step > param->max_step) {
            /* The step exceeds the allowed maximum. */
            return LBFGSERR_MAXIMUMSTEP;
        } else if (n_eval >= param->max_linesearch) {
            /* Evaluation budget exhausted. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }

        *step *= shrink;
    }
}
int lbfgs( int n, T *x, T *ptr_fx, typename FuncWrapper<T>::lbfgs_evaluate_t proc_evaluate, typename FuncWrapper<T>::lbfgs_progress_t proc_progress, void *instance, lbfgs_parameter_t *_param ) { int ret; int i, j, k, ls, end, bound; T step; /* Constant parameters and their default values. */ lbfgs_parameter_t param = (_param != NULL) ? (*_param) : _defparam; const int m = param.m; T *xp = NULL; T *g = NULL, *gp = NULL, *pg = NULL; T *d = NULL, *w = NULL, *pf = NULL; iteration_data_t<T> *lm = NULL; iteration_data_t<T>*it = NULL; T ys, yy; T xnorm, gnorm, beta; T fx = 0.; T rate = 0.; typename LineSearchWrapper<T>::line_search_proc linesearch = line_search_morethuente; /* Construct a callback data. */ callback_data_t<T> cd; cd.n = n; cd.instance = instance; cd.proc_evaluate = proc_evaluate; cd.proc_progress = proc_progress; #if defined(USE_SSE) && (defined(__SSE__) || defined(__SSE2__)) /* Round out the number of variables. */ n = round_out_variables(n); #endif/*defined(USE_SSE)*/ /* Check the input parameters for errors. */ if (n <= 0) { return LBFGSERR_INVALID_N; } #if defined(USE_SSE) && (defined(__SSE__) || defined(__SSE2__)) if (n % 8 != 0) { return LBFGSERR_INVALID_N_SSE; } if ((uintptr_t)(const void*)x % 16 != 0) { return LBFGSERR_INVALID_X_SSE; } #endif/*defined(USE_SSE)*/ if (param.epsilon < 0.) { return LBFGSERR_INVALID_EPSILON; } if (param.past < 0) { return LBFGSERR_INVALID_TESTPERIOD; } if (param.delta < 0.) { return LBFGSERR_INVALID_DELTA; } if (param.min_step < 0.) { return LBFGSERR_INVALID_MINSTEP; } if (param.max_step < param.min_step) { return LBFGSERR_INVALID_MAXSTEP; } if (param.ftol < 0.) { return LBFGSERR_INVALID_FTOL; } if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE || param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { if (param.wolfe <= param.ftol || 1. <= param.wolfe) { return LBFGSERR_INVALID_WOLFE; } } if (param.gtol < 0.) { return LBFGSERR_INVALID_GTOL; } if (param.xtol < 0.) 
{ return LBFGSERR_INVALID_XTOL; } if (param.max_linesearch <= 0) { return LBFGSERR_INVALID_MAXLINESEARCH; } if (param.orthantwise_c < 0.) { return LBFGSERR_INVALID_ORTHANTWISE; } if (param.orthantwise_start < 0 || n < param.orthantwise_start) { return LBFGSERR_INVALID_ORTHANTWISE_START; } if (param.orthantwise_end < 0) { param.orthantwise_end = n; } if (n < param.orthantwise_end) { return LBFGSERR_INVALID_ORTHANTWISE_END; } if (param.orthantwise_c != 0.) { switch (param.linesearch) { case LBFGS_LINESEARCH_BACKTRACKING: linesearch = line_search_backtracking_owlqn; break; default: /* Only the backtracking method is available. */ return LBFGSERR_INVALID_LINESEARCH; } } else { switch (param.linesearch) { case LBFGS_LINESEARCH_MORETHUENTE: linesearch = line_search_morethuente; break; case LBFGS_LINESEARCH_BACKTRACKING_ARMIJO: case LBFGS_LINESEARCH_BACKTRACKING_WOLFE: case LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE: linesearch = line_search_backtracking; break; default: return LBFGSERR_INVALID_LINESEARCH; } } /* Allocate working space. */ xp = (T*)vecalloc(n * sizeof(T)); g = (T*)vecalloc(n * sizeof(T)); gp = (T*)vecalloc(n * sizeof(T)); d = (T*)vecalloc(n * sizeof(T)); w = (T*)vecalloc(n * sizeof(T)); if (xp == NULL || g == NULL || gp == NULL || d == NULL || w == NULL) { ret = LBFGSERR_OUTOFMEMORY; goto lbfgs_exit; } if (param.orthantwise_c != 0.) { /* Allocate working space for OW-LQN. */ pg = (T*)vecalloc(n * sizeof(T)); if (pg == NULL) { ret = LBFGSERR_OUTOFMEMORY; goto lbfgs_exit; } } /* Allocate limited memory storage. */ lm = (iteration_data_t<T>*)vecalloc(m * sizeof(iteration_data_t<T>)); if (lm == NULL) { ret = LBFGSERR_OUTOFMEMORY; goto lbfgs_exit; } /* Initialize the limited memory. 
*/ for (i = 0;i < m;++i) { it = &lm[i]; it->alpha = 0; it->ys = 0; it->s = (T*)vecalloc(n * sizeof(T)); it->y = (T*)vecalloc(n * sizeof(T)); if (it->s == NULL || it->y == NULL) { ret = LBFGSERR_OUTOFMEMORY; goto lbfgs_exit; } } /* Allocate an array for storing previous values of the objective function. */ if (0 < param.past) { pf = (T*)vecalloc(param.past * sizeof(T)); } /* Evaluate the function value and its gradient. */ fx = cd.proc_evaluate(cd.instance, x, g, cd.n, 0); if (0. != param.orthantwise_c) { /* Compute the L1 norm of the variable and add it to the object value. */ xnorm = owlqn_x1norm(x, param.orthantwise_start, param.orthantwise_end); fx += xnorm * param.orthantwise_c; owlqn_pseudo_gradient( pg, x, g, n, T(param.orthantwise_c), param.orthantwise_start, param.orthantwise_end ); } /* Store the initial value of the objective function. */ if (pf != NULL) { pf[0] = fx; } /* Compute the direction; we assume the initial hessian matrix H_0 as the identity matrix. */ if (param.orthantwise_c == 0.) { vecncpy(d, g, n); } else { vecncpy(d, pg, n); } /* Make sure that the initial variables are not a minimizer. */ vec2norm(&xnorm, x, n); if (param.orthantwise_c == 0.) { vec2norm(&gnorm, g, n); } else { vec2norm(&gnorm, pg, n); } if (xnorm < 1.0) xnorm = 1.0; if (gnorm / xnorm <= param.epsilon) { ret = LBFGS_ALREADY_MINIMIZED; goto lbfgs_exit; } /* Compute the initial step: step = 1.0 / sqrt(vecdot(d, d, n)) */ vec2norminv(&step, d, n); k = 1; end = 0; for (;;) { /* Store the current position and gradient vectors. */ veccpy(xp, x, n); veccpy(gp, g, n); /* Search for an optimal step. */ if (param.orthantwise_c == 0.) { ls = linesearch(n, x, &fx, g, d, &step, xp, gp, w, &cd, ¶m); } else { ls = linesearch(n, x, &fx, g, d, &step, xp, pg, w, &cd, ¶m); owlqn_pseudo_gradient( pg, x, g, n, T(param.orthantwise_c), param.orthantwise_start, param.orthantwise_end ); } if (ls < 0) { /* Revert to the previous point. 
*/ veccpy(x, xp, n); veccpy(g, gp, n); ret = ls; goto lbfgs_exit; } /* Compute x and g norms. */ vec2norm(&xnorm, x, n); if (param.orthantwise_c == 0.) { vec2norm(&gnorm, g, n); } else { vec2norm(&gnorm, pg, n); } /* Report the progress. */ if (cd.proc_progress) { if ((ret = cd.proc_progress(cd.instance, x, g, fx, xnorm, gnorm, step, cd.n, k, ls))) { goto lbfgs_exit; } } /* Convergence test. The criterion is given by the following formula: |g(x)| / \max(1, |x|) < \epsilon */ if (xnorm < 1.0) xnorm = 1.0; if (gnorm / xnorm <= param.epsilon) { /* Convergence. */ ret = LBFGS_SUCCESS; break; } /* Test for stopping criterion. The criterion is given by the following formula: (f(past_x) - f(x)) / f(x) < \delta */ if (pf != NULL) { /* We don't test the stopping criterion while k < past. */ if (param.past <= k) { /* Compute the relative improvement from the past. */ rate = (pf[k % param.past] - fx) / fx; /* The stopping criterion. */ if (rate < param.delta) { ret = LBFGS_STOP; break; } } /* Store the current value of the objective function. */ pf[k % param.past] = fx; } if (param.max_iterations != 0 && param.max_iterations < k+1) { /* Maximum number of iterations. */ ret = LBFGSERR_MAXIMUMITERATION; break; } /* Update vectors s and y: s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}. y_{k+1} = g_{k+1} - g_{k}. */ it = &lm[end]; vecdiff(it->s, x, xp, n); vecdiff(it->y, g, gp, n); /* Compute scalars ys and yy: ys = y^t \cdot s = 1 / \rho. yy = y^t \cdot y. Notice that yy is used for scaling the hessian matrix H_0 (Cholesky factor). */ vecdot(&ys, it->y, it->s, n); vecdot(&yy, it->y, it->y, n); it->ys = ys; /* Recursive formula to compute dir = -(H \cdot g). This is described in page 779 of: Jorge Nocedal. Updating Quasi-Newton Matrices with Limited Storage. Mathematics of Computation, Vol. 35, No. 151, pp. 773--782, 1980. */ bound = (m <= k) ? m : k; ++k; end = (end + 1) % m; /* Compute the steepest direction. */ if (param.orthantwise_c == 0.) { /* Compute the negative of gradients. 
*/ vecncpy(d, g, n); } else { vecncpy(d, pg, n); } j = end; for (i = 0;i < bound;++i) { j = (j + m - 1) % m; /* if (--j == -1) j = m-1; */ it = &lm[j]; /* \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}. */ vecdot(&it->alpha, it->s, d, n); it->alpha /= it->ys; /* q_{i} = q_{i+1} - \alpha_{i} y_{i}. */ vecadd(d, it->y, -it->alpha, n); } vecscale(d, ys / yy, n); for (i = 0;i < bound;++i) { it = &lm[j]; /* \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}. */ vecdot(&beta, it->y, d, n); beta /= it->ys; /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}. */ vecadd(d, it->s, it->alpha - beta, n); j = (j + 1) % m; /* if (++j == m) j = 0; */ } /* Constrain the search direction for orthant-wise updates. */ if (param.orthantwise_c != 0.) { for (i = param.orthantwise_start;i < param.orthantwise_end;++i) { if (d[i] * pg[i] >= 0) { d[i] = 0; } } } /* Now the search direction d is ready. We try step = 1 first. */ step = 1.0; } lbfgs_exit: /* Return the final value of the objective function. */ if (ptr_fx != NULL) { *ptr_fx = fx; } vecfree(pf); /* Free memory blocks used by this function. */ if (lm != NULL) { for (i = 0;i < m;++i) { vecfree(lm[i].s); vecfree(lm[i].y); } vecfree(lm); } vecfree(pg); vecfree(w); vecfree(d); vecfree(gp); vecfree(g); vecfree(xp); return ret; }
/*
 * Minimise an objective of n variables with limited-memory BFGS, or with the
 * orthant-wise variant when param.orthantwise_c is non-zero (the objective
 * then has orthantwise_c * L1-norm of x[orthantwise_start..orthantwise_end)
 * added to it).
 *
 *   n         number of variables
 *   x         in: starting point; out: last accepted point
 *   pfx       if non-NULL, receives the objective value at the returned x on
 *             every path that reaches the cleanup label; the early
 *             parameter-validation returns do not write it
 *   evaluate  callback returning the objective and filling the gradient
 *   progress  per-iteration callback; default_lbfgs_progress is used when
 *             NULL. A non-zero return value cancels the optimisation.
 *   instance  opaque pointer forwarded to both callbacks
 *   _param    parameters, or NULL to use default_param
 *
 * Returns LBFGS_CONVERGENCE, LBFGS_CONVERGENCE_DELTA,
 * LBFGS_ALREADY_MINIMIZED, LBFGSERR_CANCELED, or another LBFGSERR_* code
 * (invalid parameter, line-search failure, iteration limit).
 */
int lbfgs(
    int n,
    double* x,
    double* pfx,
    lbfgs_evaluate_t evaluate,
    lbfgs_progress_t progress,
    void* instance,
    const lbfgs_parameter_t* _param
    )
{
    int ret;
    int i, j, k, ls, end, bound, n_evaluate = 0;
    int enable_owlqn;
    double step;
    lbfgs_parameter_t param = (_param) ? (*_param) : default_param;
    const int m = param.m;
    double* xp;
    double* g, *gp, *pg = 0;
    double* d, *w, *pf = 0;
    iteration_data_t* lm = 0, *it = 0;
    double ys, yy;
    double xnorm, gnorm, rate, beta;
    double fx;
    double fxp;     /* Objective value at xp, restored if the line search fails. */
    line_search_proc_t linesearch = line_search_morethuente;

    callback_data_t cd;
    cd.n = n;
    cd.instance = instance;
    cd.evaluate = evaluate;
    cd.progress = (progress) ? progress : default_lbfgs_progress;

    /* Check the input parameters for errors. */
    if (n <= 0) {
        return LBFGSERR_INVALID_N;
    }
    if (m <= 0) {
        /* The history length is used as a modulus below; m == 0 would be
           a division by zero and a negative m makes no sense. */
        return LBFGSERR_INVALIDPARAMETERS;
    }
    if (param.epsilon < 0.0) {
        return LBFGSERR_INVALID_EPSILON;
    }
    if (param.past < 0) {
        return LBFGSERR_INVALID_TESTPERIOD;
    }
    if (param.delta < 0.0) {
        return LBFGSERR_INVALID_DELTA;
    }
    if (param.min_step < 0.0) {
        return LBFGSERR_INVALID_MINSTEP;
    }
    if (param.max_step < param.min_step) {
        return LBFGSERR_INVALID_MAXSTEP;
    }
    if (param.ftol < 0.0) {
        return LBFGSERR_INVALID_FTOL;
    }
    if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE ||
        param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
        if (param.wolfe <= param.ftol || 1. <= param.wolfe) {
            return LBFGSERR_INVALID_WOLFE;
        }
    }
    if (param.gtol < 0.0) {
        return LBFGSERR_INVALID_GTOL;
    }
    if (param.xtol < 0.0) {
        return LBFGSERR_INVALID_XTOL;
    }
    if (param.max_linesearch <= 0) {
        return LBFGSERR_INVALID_MAXLINESEARCH;
    }
    if (param.orthantwise_c < 0.0) {
        return LBFGSERR_INVALID_ORTHANTWISE;
    }
    if (param.orthantwise_start < 0 || param.orthantwise_start > n) {
        return LBFGSERR_INVALID_ORTHANTWISE_START;
    }
    if (param.orthantwise_end < 0) {
        /* A negative end means "up to the last variable". */
        param.orthantwise_end = n;
    }
    if (param.orthantwise_end > n) {
        return LBFGSERR_INVALID_ORTHANTWISE_END;
    }

    enable_owlqn = (param.orthantwise_c != 0.0);
    if (enable_owlqn) {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_BACKTRACKING_WOLFE:
            linesearch = line_search_backtracking_owlqn;
            break;
        default:
            /* Only the backtracking method is available. */
            return LBFGSERR_INVALID_LINESEARCH;
        }
    } else {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_MORETHUENTE:
            linesearch = line_search_morethuente;
            break;
        case LBFGS_LINESEARCH_BACKTRACKING_ARMIJO:
        case LBFGS_LINESEARCH_BACKTRACKING_WOLFE:
        case LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE:
            linesearch = line_search_backtracking;
            break;
        default:
            return LBFGSERR_INVALID_LINESEARCH;
        }
    }

    /* Allocate working space. */
    xp = vecalloc(n);
    g = vecalloc(n);
    gp = vecalloc(n);
    d = vecalloc(n);
    w = vecalloc(n);

    /* Allocate pseudo gradient. */
    if (enable_owlqn) {
        pg = vecalloc(n);
    }

    /* Allocate and initialize the limited memory storage. */
    lm = (iteration_data_t*)xalloc(m * sizeof(iteration_data_t));
    for (i = 0; i < m; i++) {
        it = &lm[i];
        it->alpha = 0.0;
        it->s = vecalloc(n);
        it->y = vecalloc(n);
        it->ys = 0.0;
    }

    /* Allocate an array for storing previous values of the objective function. */
    if (param.past > 0) {
        pf = vecalloc((size_t)param.past);
    }

    /* Evaluate the objective and its gradient at the starting point. */
    fx = cd.evaluate(cd.instance, cd.n, x, g, 0);
    n_evaluate++;

    if (enable_owlqn) {
        /* Add the L1 penalty and derive the pseudo-gradient. */
        xnorm = owlqn_x1norm(x, param.orthantwise_start, param.orthantwise_end);
        fx += xnorm * param.orthantwise_c;
        owlqn_pseudo_gradient(
            pg, x, g, n,
            param.orthantwise_c, param.orthantwise_start, param.orthantwise_end);
    }

    /* Store the initial value of the objective function. */
    if (pf) {
        pf[0] = fx;
    }

    /**
     * Compute the direction.
     * we assume the initial hessian matrix H_0 as the identity matrix.
     */
    if (!enable_owlqn) {
        vecncpy(d, g, n);
    } else {
        vecncpy(d, pg, n);
    }

    /**
     * Make sure that the initial variables are not a minimizer.
     */
    vec2norm(&xnorm, x, n);
    if (!enable_owlqn) {
        vec2norm(&gnorm, g, n);
    } else {
        vec2norm(&gnorm, pg, n);
    }
    if (xnorm < 1.0) {
        xnorm = 1.0;
    }
    if (gnorm / xnorm <= param.epsilon) {
        ret = LBFGS_ALREADY_MINIMIZED;
        goto lbfgs_exit;
    }

    /**
     * Compute the initial step:
     * step = 1.0 / ||d||
     */
    vec2norminv(&step, d, n);

    k = 1;
    end = 0;
    for (;;) {
        /* Store the current position, gradient vectors and objective value. */
        veccpy(xp, x, n);
        veccpy(gp, g, n);
        fxp = fx;

        /* Search for an optimal step. */
        if (!enable_owlqn) {
            ls = linesearch(n, x, &fx, g, d, &step, xp, gp, w, &cd, &param);
        } else {
            ls = linesearch(n, x, &fx, g, d, &step, xp, pg, w, &cd, &param);
            owlqn_pseudo_gradient(
                pg, x, g, n,
                param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
                );
        }

        if (ls < 0) {
            /* Revert to the previous point, including its objective value so
               that *pfx matches the x handed back to the caller. */
            veccpy(x, xp, n);
            veccpy(g, gp, n);
            fx = fxp;
            ret = ls;
            break;
        }

        /* A non-negative result is the number of evaluations performed. */
        n_evaluate += ls;

        /* Compute x and g norms. */
        vec2norm(&xnorm, x, n);
        if (!enable_owlqn) {
            vec2norm(&gnorm, g, n);
        } else {
            vec2norm(&gnorm, pg, n);
        }

        /* Report the progress. */
        if ((ret = cd.progress(cd.instance, cd.n, x, g, fx, xnorm, gnorm, step, k, n_evaluate)) != 0) {
            ret = LBFGSERR_CANCELED;
            break;
        }

        /* Convergence test. */
        if (xnorm < 1.0) {
            xnorm = 1.0;
        }
        if (gnorm / xnorm <= param.epsilon) {
            ret = LBFGS_CONVERGENCE;
            break;
        }

        /* Stopping criterion test. */
        if (pf) {
            /* We don't test the stopping criterion while k < past. */
            if (param.past <= k) {
                /* Compute the relative improvement from the past. */
                rate = (pf[k % param.past] - fx) / fx;

                /* The stopping criterion. */
                if (rate < param.delta) {
                    ret = LBFGS_CONVERGENCE_DELTA;
                    break;
                }
            }

            /* Store the current value of the objective function. */
            pf[k % param.past] = fx;
        }

        if (param.max_iterations != 0 && param.max_iterations < k + 1) {
            ret = LBFGSERR_MAXIMUMITERATION;
            break;
        }

        /**
         * Update s and y:
         * s_{k+1} = x_{k+1} - x_{k} = step * d_{k}
         * y_{k+1} = g_{k+1} - g_{k}
         */
        it = &lm[end];
        vecdiff(it->s, x, xp, n);
        vecdiff(it->y, g, gp, n);

        /**
         * Compute scalars ys and yy:
         * ys = y^t s = 1 / \rho
         * yy = y^t y
         * Notice that yy is used for scaling the hessian matrix H_0 (Cholesky factor).
         */
        vecdot(&ys, it->y, it->s, n);
        vecdot(&yy, it->y, it->y, n);
        it->ys = ys;

        /**
         * Recursive formula to compute d = -(H g).
         * This is described in page 779 of:
         * Jorge Nocedal.
         * Updating Quasi-Newton Matrices with Limited Storage.
         * Mathematics of Computation, Vol. 35, No. 151,
         * pp. 773--782, 1980.
         */
        bound = (m <= k) ? m : k;
        k++;
        end = (end + 1) % m;

        /* Compute the steepest direction. */
        /* Compute the negative of (pseudo) gradient. */
        if (!enable_owlqn) {
            vecncpy(d, g, n);
        } else {
            vecncpy(d, pg, n);
        }

        /* First loop: walk the history from newest to oldest. */
        j = end;
        for (i = 0; i < bound; i++) {
            j = (j + m - 1) % m; /* if (--j == -1) j = m-1; */
            it = &lm[j];
            /* \alpha_{j} = \rho_{j} s^{t}_{j} q_{k+1} */
            vecdot(&it->alpha, it->s, d, n);
            it->alpha /= it->ys;
            /* q_{i} = q_{i+1} - \alpha_{i} y_{i} */
            vecadd(d, it->y, -it->alpha, n);
        }

        vecscale(d, ys / yy, n);

        /* Second loop: walk back from oldest to newest. */
        for (i = 0; i < bound; i++) {
            it = &lm[j];
            /* \beta_{j} = \rho_{j} y^t_{j} \gamma_{i} */
            vecdot(&beta, it->y, d, n);
            beta /= it->ys;
            /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} */
            vecadd(d, it->s, it->alpha - beta, n);
            j = (j + 1) % m; /* if (++j == m) j = 0; */
        }

        /* Constrain the search direction for orthant-wise updates. */
        if (enable_owlqn) {
            owlqn_contrain_line_search(d, pg, param.orthantwise_start, param.orthantwise_end);
        }

        /* Now the search direction d is ready. We try step = 1 first. */
        step = 1.0;
    }

lbfgs_exit:
    /* Return the final value of the objective function. */
    if (pfx) {
        *pfx = fx;
    }

    vecfree(pf);
    if (lm != 0) {
        for (i = 0; i < m; i++) {
            vecfree(lm[i].s);
            vecfree(lm[i].y);
        }
        xfree(lm);
    }
    vecfree(pg);
    vecfree(w);
    vecfree(d);
    vecfree(gp);
    vecfree(g);
    vecfree(xp);

    return ret;
}