コード例 #1
0
ファイル: lbfgs.cpp プロジェクト: huqinghao/T-LBFGS
int lbfgs(
    int n,
    T *x,
    T *ptr_fx,
typename   FuncWrapper<T>::lbfgs_evaluate_t proc_evaluate,
    typename  FuncWrapper<T>::lbfgs_progress_t proc_progress,
    void *instance,
    lbfgs_parameter_t *_param
    )
{
    int ret;
    int i, j, k, ls, end, bound;
    T step;

    /* Constant parameters and their default values. */
    lbfgs_parameter_t param = (_param != NULL) ? (*_param) : _defparam;
    const int m = param.m;

    T *xp = NULL;
    T *g = NULL, *gp = NULL, *pg = NULL;
    T *d = NULL, *w = NULL, *pf = NULL;
    iteration_data_t<T> *lm = NULL;
    iteration_data_t<T>*it = NULL;
    T ys, yy;
    T xnorm, gnorm, beta;
    T fx = 0.;
    T rate = 0.;
typename    LineSearchWrapper<T>::line_search_proc linesearch = line_search_morethuente;

    /* Construct a callback data. */
    callback_data_t<T> cd;
    cd.n = n;
    cd.instance = instance;
    cd.proc_evaluate = proc_evaluate;
    cd.proc_progress = proc_progress;

#if     defined(USE_SSE) && (defined(__SSE__) || defined(__SSE2__))
    /* Round out the number of variables. */
    n = round_out_variables(n);
#endif/*defined(USE_SSE)*/

    /* Check the input parameters for errors. */
    if (n <= 0) {
        return LBFGSERR_INVALID_N;
    }
#if     defined(USE_SSE) && (defined(__SSE__) || defined(__SSE2__))
    if (n % 8 != 0) {
        return LBFGSERR_INVALID_N_SSE;
    }
    if ((uintptr_t)(const void*)x % 16 != 0) {
        return LBFGSERR_INVALID_X_SSE;
    }
#endif/*defined(USE_SSE)*/
    if (param.epsilon < 0.) {
        return LBFGSERR_INVALID_EPSILON;
    }
    if (param.past < 0) {
        return LBFGSERR_INVALID_TESTPERIOD;
    }
    if (param.delta < 0.) {
        return LBFGSERR_INVALID_DELTA;
    }
    if (param.min_step < 0.) {
        return LBFGSERR_INVALID_MINSTEP;
    }
    if (param.max_step < param.min_step) {
        return LBFGSERR_INVALID_MAXSTEP;
    }
    if (param.ftol < 0.) {
        return LBFGSERR_INVALID_FTOL;
    }
    if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE ||
        param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
        if (param.wolfe <= param.ftol || 1. <= param.wolfe) {
            return LBFGSERR_INVALID_WOLFE;
        }
    }
    if (param.gtol < 0.) {
        return LBFGSERR_INVALID_GTOL;
    }
    if (param.xtol < 0.) {
        return LBFGSERR_INVALID_XTOL;
    }
    if (param.max_linesearch <= 0) {
        return LBFGSERR_INVALID_MAXLINESEARCH;
    }
    if (param.orthantwise_c < 0.) {
        return LBFGSERR_INVALID_ORTHANTWISE;
    }
    if (param.orthantwise_start < 0 || n < param.orthantwise_start) {
        return LBFGSERR_INVALID_ORTHANTWISE_START;
    }
    if (param.orthantwise_end < 0) {
        param.orthantwise_end = n;
    }
    if (n < param.orthantwise_end) {
        return LBFGSERR_INVALID_ORTHANTWISE_END;
    }
    if (param.orthantwise_c != 0.) {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_BACKTRACKING:
            linesearch = line_search_backtracking_owlqn;
            break;
        default:
            /* Only the backtracking method is available. */
            return LBFGSERR_INVALID_LINESEARCH;
        }
    } else {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_MORETHUENTE:
            linesearch = line_search_morethuente;
            break;
        case LBFGS_LINESEARCH_BACKTRACKING_ARMIJO:
        case LBFGS_LINESEARCH_BACKTRACKING_WOLFE:
        case LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE:
            linesearch = line_search_backtracking;
            break;
        default:
            return LBFGSERR_INVALID_LINESEARCH;
        }
    }

    /* Allocate working space. */
    xp = (T*)vecalloc(n * sizeof(T));
    g = (T*)vecalloc(n * sizeof(T));
    gp = (T*)vecalloc(n * sizeof(T));
    d = (T*)vecalloc(n * sizeof(T));
    w = (T*)vecalloc(n * sizeof(T));
    if (xp == NULL || g == NULL || gp == NULL || d == NULL || w == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    if (param.orthantwise_c != 0.) {
        /* Allocate working space for OW-LQN. */
        pg = (T*)vecalloc(n * sizeof(T));
        if (pg == NULL) {
            ret = LBFGSERR_OUTOFMEMORY;
            goto lbfgs_exit;
        }
    }

    /* Allocate limited memory storage. */
    lm = (iteration_data_t<T>*)vecalloc(m * sizeof(iteration_data_t<T>));
    if (lm == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    /* Initialize the limited memory. */
    for (i = 0;i < m;++i) {
        it = &lm[i];
        it->alpha = 0;
        it->ys = 0;
        it->s = (T*)vecalloc(n * sizeof(T));
        it->y = (T*)vecalloc(n * sizeof(T));
        if (it->s == NULL || it->y == NULL) {
            ret = LBFGSERR_OUTOFMEMORY;
            goto lbfgs_exit;
        }
    }

    /* Allocate an array for storing previous values of the objective function. */
    if (0 < param.past) {
        pf = (T*)vecalloc(param.past * sizeof(T));
    }

    /* Evaluate the function value and its gradient. */
    fx = cd.proc_evaluate(cd.instance, x, g, cd.n, 0);
    if (0. != param.orthantwise_c) {
        /* Compute the L1 norm of the variable and add it to the object value. */
        xnorm = owlqn_x1norm(x, param.orthantwise_start, param.orthantwise_end);
        fx += xnorm * param.orthantwise_c;
        owlqn_pseudo_gradient(
            pg, x, g, n,
            T(param.orthantwise_c), param.orthantwise_start, param.orthantwise_end
            );
    }

    /* Store the initial value of the objective function. */
    if (pf != NULL) {
        pf[0] = fx;
    }

    /*
        Compute the direction;
        we assume the initial hessian matrix H_0 as the identity matrix.
     */
    if (param.orthantwise_c == 0.) {
        vecncpy(d, g, n);
    } else {
        vecncpy(d, pg, n);
    }

    /*
       Make sure that the initial variables are not a minimizer.
     */
    vec2norm(&xnorm, x, n);
    if (param.orthantwise_c == 0.) {
        vec2norm(&gnorm, g, n);
    } else {
        vec2norm(&gnorm, pg, n);
    }
    if (xnorm < 1.0) xnorm = 1.0;
    if (gnorm / xnorm <= param.epsilon) {
        ret = LBFGS_ALREADY_MINIMIZED;
        goto lbfgs_exit;
    }

    /* Compute the initial step:
        step = 1.0 / sqrt(vecdot(d, d, n))
     */
    vec2norminv(&step, d, n);

    k = 1;
    end = 0;
    for (;;) {
        /* Store the current position and gradient vectors. */
        veccpy(xp, x, n);
        veccpy(gp, g, n);

        /* Search for an optimal step. */
        if (param.orthantwise_c == 0.) {
            ls = linesearch(n, x, &fx, g, d, &step, xp, gp, w, &cd, &param);
        } else {
            ls = linesearch(n, x, &fx, g, d, &step, xp, pg, w, &cd, &param);
            owlqn_pseudo_gradient(
                pg, x, g, n,
                T(param.orthantwise_c), param.orthantwise_start, param.orthantwise_end
                );
        }
        if (ls < 0) {
            /* Revert to the previous point. */
            veccpy(x, xp, n);
            veccpy(g, gp, n);
            ret = ls;
            goto lbfgs_exit;
        }

        /* Compute x and g norms. */
        vec2norm(&xnorm, x, n);
        if (param.orthantwise_c == 0.) {
            vec2norm(&gnorm, g, n);
        } else {
            vec2norm(&gnorm, pg, n);
        }

        /* Report the progress. */
        if (cd.proc_progress) {
            if ((ret = cd.proc_progress(cd.instance, x, g, fx, xnorm, gnorm, step, cd.n, k, ls))) {
                goto lbfgs_exit;
            }
        }

        /*
            Convergence test.
            The criterion is given by the following formula:
                |g(x)| / \max(1, |x|) < \epsilon
         */
        if (xnorm < 1.0) xnorm = 1.0;
        if (gnorm / xnorm <= param.epsilon) {
            /* Convergence. */
            ret = LBFGS_SUCCESS;
            break;
        }

        /*
            Test for stopping criterion.
            The criterion is given by the following formula:
                (f(past_x) - f(x)) / f(x) < \delta
         */
        if (pf != NULL) {
            /* We don't test the stopping criterion while k < past. */
            if (param.past <= k) {
                /* Compute the relative improvement from the past. */
                rate = (pf[k % param.past] - fx) / fx;

                /* The stopping criterion. */
                if (rate < param.delta) {
                    ret = LBFGS_STOP;
                    break;
                }
            }

            /* Store the current value of the objective function. */
            pf[k % param.past] = fx;
        }

        if (param.max_iterations != 0 && param.max_iterations < k+1) {
            /* Maximum number of iterations. */
            ret = LBFGSERR_MAXIMUMITERATION;
            break;
        }

        /*
            Update vectors s and y:
                s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}.
                y_{k+1} = g_{k+1} - g_{k}.
         */
        it = &lm[end];
        vecdiff(it->s, x, xp, n);
        vecdiff(it->y, g, gp, n);

        /*
            Compute scalars ys and yy:
                ys = y^t \cdot s = 1 / \rho.
                yy = y^t \cdot y.
            Notice that yy is used for scaling the hessian matrix H_0 (Cholesky factor).
         */
        vecdot(&ys, it->y, it->s, n);
        vecdot(&yy, it->y, it->y, n);
        it->ys = ys;

        /*
            Recursive formula to compute dir = -(H \cdot g).
                This is described in page 779 of:
                Jorge Nocedal.
                Updating Quasi-Newton Matrices with Limited Storage.
                Mathematics of Computation, Vol. 35, No. 151,
                pp. 773--782, 1980.
         */
        bound = (m <= k) ? m : k;
        ++k;
        end = (end + 1) % m;

        /* Compute the steepest direction. */
        if (param.orthantwise_c == 0.) {
            /* Compute the negative of gradients. */
            vecncpy(d, g, n);
        } else {
            vecncpy(d, pg, n);
        }

        j = end;
        for (i = 0;i < bound;++i) {
            j = (j + m - 1) % m;    /* if (--j == -1) j = m-1; */
            it = &lm[j];
            /* \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}. */
            vecdot(&it->alpha, it->s, d, n);
            it->alpha /= it->ys;
            /* q_{i} = q_{i+1} - \alpha_{i} y_{i}. */
            vecadd(d, it->y, -it->alpha, n);
        }

        vecscale(d, ys / yy, n);

        for (i = 0;i < bound;++i) {
            it = &lm[j];
            /* \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}. */
            vecdot(&beta, it->y, d, n);
            beta /= it->ys;
            /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}. */
            vecadd(d, it->s, it->alpha - beta, n);
            j = (j + 1) % m;        /* if (++j == m) j = 0; */
        }

        /*
            Constrain the search direction for orthant-wise updates.
         */
        if (param.orthantwise_c != 0.) {
            for (i = param.orthantwise_start;i < param.orthantwise_end;++i) {
                if (d[i] * pg[i] >= 0) {
                    d[i] = 0;
                }
            }
        }

        /*
            Now the search direction d is ready. We try step = 1 first.
         */
        step = 1.0;
    }

lbfgs_exit:
    /* Return the final value of the objective function. */
    if (ptr_fx != NULL) {
        *ptr_fx = fx;
    }

    vecfree(pf);

    /* Free memory blocks used by this function. */
    if (lm != NULL) {
        for (i = 0;i < m;++i) {
            vecfree(lm[i].s);
            vecfree(lm[i].y);
        }
        vecfree(lm);
    }
    vecfree(pg);
    vecfree(w);
    vecfree(d);
    vecfree(gp);
    vecfree(g);
    vecfree(xp);

    return ret;
}
コード例 #2
0
ファイル: lbfgs.c プロジェクト: yesyestian/BNB-solver
/*
    Minimize an objective function with the limited-memory BFGS method.
    When param->orthantwise_c is nonzero, an L1 term over x[orthantwise_start .. n)
    is added to the initial objective value and the search direction is built
    from the pseudo-gradient.

    n              number of variables; must be positive.
    x              in: starting point; it is handed to the line search together
                   with the step and direction on every iteration.
    ptr_fx         if not NULL, receives the last objective value (only on
                   paths that reach lbfgs_exit).
    proc_evaluate  callback returning the objective value and filling g.
    proc_progress  optional callback; a nonzero return value stops the run and
                   is returned to the caller.
    instance       opaque pointer forwarded to both callbacks.
    _param         parameters, or NULL to use _defparam.

    Returns LBFGS_SUCCESS, LBFGS_ALREADY_MINIMIZED, the nonzero progress
    callback result, a negative line-search status, or an LBFGSERR_* code.
 */
int lbfgs(
    int n,
    lbfgsfloatval_t *x,
    lbfgsfloatval_t *ptr_fx,
    lbfgs_evaluate_t proc_evaluate,
    lbfgs_progress_t proc_progress,
    void *instance,
    lbfgs_parameter_t *_param
    )
{
    int ret;
    int i, j, k, ls, end, bound;
    lbfgsfloatval_t step;

    /* Constant parameters and their default values (borrowed, never copied or modified). */
    const lbfgs_parameter_t* param = (_param != NULL) ? _param : &_defparam;
    const int m = param->m;

    lbfgsfloatval_t *xp = NULL, *g = NULL, *gp = NULL, *d = NULL, *w = NULL;
    iteration_data_t *lm = NULL, *it = NULL;
    lbfgsfloatval_t ys, yy;
    lbfgsfloatval_t norm, xnorm, gnorm, beta;
    lbfgsfloatval_t fx = 0.;
    line_search_proc linesearch = line_search_morethuente;

    /* Construct a callback data. cd.n keeps the caller's n, captured before any SSE rounding. */
    callback_data_t cd;
    cd.n = n;
    cd.instance = instance;
    cd.proc_evaluate = proc_evaluate;
    cd.proc_progress = proc_progress;

#if     defined(USE_SSE) && (defined(__SSE__) || defined(__SSE2__))
    /* Round out the number of variables. */
    n = round_out_variables(n);
#endif/*defined(USE_SSE)*/

    /* Check the input parameters for errors. Nothing is allocated yet, so plain returns are safe. */
    if (n <= 0) {
        return LBFGSERR_INVALID_N;
    }
#if     defined(USE_SSE) && (defined(__SSE__) || defined(__SSE2__))
    if (n % 8 != 0) {
        return LBFGSERR_INVALID_N_SSE;
    }
    /*
        Tests the low 4 bits of the address of x.
        NOTE(review): the pointer is converted to unsigned short before
        masking; this narrowing pointer cast may be warned about or rejected
        by some compilers - confirm and consider an integer type wide enough
        for a pointer.
     */
    if (((unsigned short)x & 0x000F) != 0) {
        return LBFGSERR_INVALID_X_SSE;
    }
#endif/*defined(USE_SSE)*/
    if (param->min_step < 0.) {
        return LBFGSERR_INVALID_MINSTEP;
    }
    if (param->max_step < param->min_step) {
        return LBFGSERR_INVALID_MAXSTEP;
    }
    if (param->ftol < 0.) {
        return LBFGSERR_INVALID_FTOL;
    }
    if (param->gtol < 0.) {
        return LBFGSERR_INVALID_GTOL;
    }
    if (param->xtol < 0.) {
        return LBFGSERR_INVALID_XTOL;
    }
    if (param->max_linesearch <= 0) {
        return LBFGSERR_INVALID_MAXLINESEARCH;
    }
    if (param->orthantwise_c < 0.) {
        return LBFGSERR_INVALID_ORTHANTWISE;
    }
    if (param->orthantwise_start < 0 || n < param->orthantwise_start) {
        return LBFGSERR_INVALID_ORTHANTWISE_START;
    }
    /* Select the line-search routine. */
    switch (param->linesearch) {
    case LBFGS_LINESEARCH_MORETHUENTE:
        linesearch = line_search_morethuente;
        break;
    case LBFGS_LINESEARCH_BACKTRACKING:
        linesearch = line_search_backtracking;
        break;
    default:
        return LBFGSERR_INVALID_LINESEARCH;
    }

    /*
        Allocate working space. From here on every failure leaves through
        lbfgs_exit so that whatever has been allocated is released.
     */
    xp = (lbfgsfloatval_t*)vecalloc(n * sizeof(lbfgsfloatval_t));
    g = (lbfgsfloatval_t*)vecalloc(n * sizeof(lbfgsfloatval_t));
    gp = (lbfgsfloatval_t*)vecalloc(n * sizeof(lbfgsfloatval_t));
    d = (lbfgsfloatval_t*)vecalloc(n * sizeof(lbfgsfloatval_t));
    w = (lbfgsfloatval_t*)vecalloc(n * sizeof(lbfgsfloatval_t));
    if (xp == NULL || g == NULL || gp == NULL || d == NULL || w == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    /* Allocate limited memory storage. */
    lm = (iteration_data_t*)vecalloc(m * sizeof(iteration_data_t));
    if (lm == NULL) {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    /* Initialize the limited memory. */
    for (i = 0;i < m;++i) {
        it = &lm[i];
        it->alpha = 0;
        it->ys = 0;
        it->s = (lbfgsfloatval_t*)vecalloc(n * sizeof(lbfgsfloatval_t));
        it->y = (lbfgsfloatval_t*)vecalloc(n * sizeof(lbfgsfloatval_t));
        if (it->s == NULL || it->y == NULL) {
            ret = LBFGSERR_OUTOFMEMORY;
            goto lbfgs_exit;
        }
    }

    /* Evaluate the function value and its gradient. */
    fx = cd.proc_evaluate(cd.instance, x, g, cd.n, 0);
    if (0. < param->orthantwise_c) {
        /* Compute L1-regularization factor and add it to the object value. */
        norm = 0.;
        for (i = param->orthantwise_start;i < n;++i) {
            norm += fabs(x[i]);
        }
        fx += norm * param->orthantwise_c;
    }

    /* We assume the initial hessian matrix H_0 as the identity matrix. */
    if (param->orthantwise_c == 0.) {
        vecncpy(d, g, n);
    } else {
        /* Compute the negative of gradients (unregularized leading variables). */
        for (i = 0;i < param->orthantwise_start;++i) {
            d[i] = -g[i];
        }

        /* Compute the negative of pseudo-gradients. */
        for (i = param->orthantwise_start;i < n;++i) {
            if (x[i] < 0.) {
                /* Differentiable. */
                d[i] = -g[i] + param->orthantwise_c;
            } else if (0. < x[i]) {
                /* Differentiable. */
                d[i] = -g[i] - param->orthantwise_c;
            } else {
                /* x[i] == 0: pick a one-sided derivative, or stay put. */
                if (g[i] < -param->orthantwise_c) {
                    /* Take the right partial derivative. */
                    d[i] = -g[i] - param->orthantwise_c;
                } else if (param->orthantwise_c < g[i]) {
                    /* Take the left partial derivative. */
                    d[i] = -g[i] + param->orthantwise_c;
                } else {
                    d[i] = 0.;
                }
            }
        }
    }

    /*
       Make sure that the initial variables are not a minimizer.
       Note that the norm of the plain gradient g is used here even when
       orthantwise_c is nonzero.
     */
    vecnorm(&gnorm, g, n);
    vecnorm(&xnorm, x, n);
    if (xnorm < 1.0) xnorm = 1.0;
    if (gnorm / xnorm <= param->epsilon) {
        ret = LBFGS_ALREADY_MINIMIZED;
        goto lbfgs_exit;
    }

    /* Compute the initial step:
        step = 1.0 / sqrt(vecdot(d, d, n))
     */
    vecrnorm(&step, d, n);

    k = 1;      /* iteration counter, 1-based */
    end = 0;    /* slot of lm[] that receives the next (s, y) pair */
    for (;;) {
        /* Store the current position and gradient vectors. */
        veccpy(xp, x, n);
        veccpy(gp, g, n);

        /* Search for an optimal step. */
        ls = linesearch(n, x, &fx, g, d, &step, w, &cd, param);
        if (ls < 0) {
            /* The negative status is returned as is; x and g are not restored from xp/gp here. */
            ret = ls;
            goto lbfgs_exit;
        }

        /* Compute x and g norms. */
        vecnorm(&gnorm, g, n);
        vecnorm(&xnorm, x, n);

        /* Report the progress. */
        if (cd.proc_progress) {
            /* The assignment is intentional: a nonzero callback result becomes the return value. */
            if (ret = cd.proc_progress(cd.instance, x, g, fx, xnorm, gnorm, step, cd.n, k, ls)) {
                goto lbfgs_exit;
            }
        }

        /*
            Convergence test.
            The criterion is given by the following formula:
                |g(x)| / \max(1, |x|) < \epsilon
         */
        if (xnorm < 1.0) xnorm = 1.0;
        if (gnorm / xnorm <= param->epsilon) {
            /* Convergence. */
            ret = LBFGS_SUCCESS;
            break;
        }

        if (param->max_iterations != 0 && param->max_iterations < k+1) {
            /* Maximum number of iterations. */
            ret = LBFGSERR_MAXIMUMITERATION;
            break;
        }

        /*
            Update vectors s and y:
                s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}.
                y_{k+1} = g_{k+1} - g_{k}.
         */
        it = &lm[end];
        vecdiff(it->s, x, xp, n);
        vecdiff(it->y, g, gp, n);

        /*
            Compute scalars ys and yy:
                ys = y^t \cdot s = 1 / \rho.
                yy = y^t \cdot y.
            Notice that yy is used for scaling the hessian matrix H_0 (Cholesky factor).
         */
        vecdot(&ys, it->y, it->s, n);
        vecdot(&yy, it->y, it->y, n);
        it->ys = ys;

        /*
            Recursive formula to compute dir = -(H \cdot g).
                This is described in page 779 of:
                Jorge Nocedal.
                Updating Quasi-Newton Matrices with Limited Storage.
                Mathematics of Computation, Vol. 35, No. 151,
                pp. 773--782, 1980.
         */
        /* bound = min(m, k): number of stored pairs used in the two loops below. */
        bound = (m <= k) ? m : k;
        ++k;
        end = (end + 1) % m;

        if (param->orthantwise_c == 0.) {
            /* Compute the negative of gradients. */
            vecncpy(d, g, n);
        } else {
            /* Compute the negative of gradients. */
            for (i = 0;i < param->orthantwise_start;++i) {
                d[i] = -g[i];
            }

            /* Compute the negative of pseudo-gradients (same rule as before the loop). */
            for (i = param->orthantwise_start;i < n;++i) {
                if (x[i] < 0.) {
                    /* Differentiable. */
                    d[i] = -g[i] + param->orthantwise_c;
                } else if (0. < x[i]) {
                    /* Differentiable. */
                    d[i] = -g[i] - param->orthantwise_c;
                } else {
                    if (g[i] < -param->orthantwise_c) {
                        /* Take the right partial derivative. */
                        d[i] = -g[i] - param->orthantwise_c;
                    } else if (param->orthantwise_c < g[i]) {
                        /* Take the left partial derivative. */
                        d[i] = -g[i] + param->orthantwise_c;
                    } else {
                        d[i] = 0.;
                    }
                }
            }
            /*
                Store the steepest direction. w is also the buffer handed to
                the line search at the top of the loop; the copy made here is
                consumed below, before the next line search runs.
             */
            veccpy(w, d, n);
        }

        /* First loop: walk the stored pairs from newest to oldest. */
        j = end;
        for (i = 0;i < bound;++i) {
            j = (j + m - 1) % m;    /* if (--j == -1) j = m-1; */
            it = &lm[j];
            /* \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}. */
            vecdot(&it->alpha, it->s, d, n);
            it->alpha /= it->ys;
            /* q_{i} = q_{i+1} - \alpha_{i} y_{i}. */
            vecadd(d, it->y, -it->alpha, n);
        }

        vecscale(d, ys / yy, n);

        /* Second loop: walk back from oldest to newest (j starts where the first loop stopped). */
        for (i = 0;i < bound;++i) {
            it = &lm[j];
            /* \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}. */
            vecdot(&beta, it->y, d, n);
            beta /= it->ys;
            /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}. */
            vecadd(d, it->s, it->alpha - beta, n);
            j = (j + 1) % m;        /* if (++j == m) j = 0; */
        }

        /*
            Constrain the search direction for orthant-wise updates:
            zero every component of d whose product with the stored steepest
            direction w is not strictly positive.
         */
        if (param->orthantwise_c != 0.) {
            for (i = param->orthantwise_start;i < n;++i) {
                if (d[i] * w[i] <= 0) {
                    d[i] = 0;
                }
            }
        }

        /*
            Now the search direction d is ready. We try step = 1 first.
         */
        step = 1.0;
    }

lbfgs_exit:
    /* Return the final value of the objective function. */
    if (ptr_fx != NULL) {
        *ptr_fx = fx;
    }

    /* Free memory blocks used by this function. */
    if (lm != NULL) {
        for (i = 0;i < m;++i) {
            vecfree(lm[i].s);
            vecfree(lm[i].y);
        }
        vecfree(lm);
    }
    vecfree(w);
    vecfree(d);
    vecfree(gp);
    vecfree(g);
    vecfree(xp);

    return ret;
}
コード例 #3
0
ファイル: lbfgs.c プロジェクト: zhuoxiongzhao/ml-pack
/**
* Gradient descent with a backtracking line search.
*
* Every iteration moves along the negative gradient using
* line_search_backtracking(), reports to the progress callback and then
* checks, in this order: the scaled gradient norm against param.epsilon,
* the relative decrease over the last param.past iterations against
* param.delta, and the iteration limit.
*
* n         number of variables; must be positive.
* x         in: starting point; updated in place, and restored to the
*           previous point when the line search fails.
* pfx       if non-null, receives the last objective value.
* evaluate  objective/gradient callback.
* progress  progress callback; default_lbfgs_progress is used when null.
*           Any nonzero return stops the run with LBFGSERR_CANCELED.
* instance  opaque pointer handed to both callbacks.
* _param    optional parameters; default_param is copied when null.
*
* Returns LBFGS_ALREADY_MINIMIZED, LBFGS_CONVERGENCE,
* LBFGS_CONVERGENCE_DELTA, a negative line-search status, or an
* LBFGSERR_* code.
*/
int gd(
  int n,
  double* x,
  double* pfx,
  lbfgs_evaluate_t evaluate,
  lbfgs_progress_t progress,
  void* instance,
  const lbfgs_parameter_t* _param
) {
  lbfgs_parameter_t param = (_param) ? (*_param) : default_param;
  callback_data_t cd;
  double* grad, *dir, *x_prev, *grad_prev;
  double* past_fx = 0;
  double fx, xnorm, gnorm, step;
  int status, iter, ls_ret;
  int eval_count = 0;

  if (!progress) {
    progress = default_lbfgs_progress;
  }

  cd.n = n;
  cd.instance = instance;
  cd.evaluate = evaluate;
  cd.progress = progress;

  /* Parameter validation; nothing has been allocated yet. */
  if (n <= 0) {
    return LBFGSERR_INVALID_N;
  }
  if (param.epsilon < 0.0) {
    return LBFGSERR_INVALID_EPSILON;
  }
  if (param.past < 0) {
    return LBFGSERR_INVALID_TESTPERIOD;
  }
  if (param.delta < 0.0) {
    return LBFGSERR_INVALID_DELTA;
  }
  if (param.min_step < 0.0) {
    return LBFGSERR_INVALID_MINSTEP;
  }
  if (param.max_step < param.min_step) {
    return LBFGSERR_INVALID_MAXSTEP;
  }
  if (param.ftol < 0.0) {
    return LBFGSERR_INVALID_FTOL;
  }
  /* The Wolfe coefficient only matters for the two Wolfe variants. */
  if ((param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE ||
       param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) &&
      (param.wolfe <= param.ftol || 1. <= param.wolfe)) {
    return LBFGSERR_INVALID_WOLFE;
  }
  if (param.max_linesearch <= 0) {
    return LBFGSERR_INVALID_MAXLINESEARCH;
  }

  /* Working vectors: gradient, direction, and the saved previous x / gradient. */
  grad = vecalloc(n);
  dir = vecalloc(n);
  x_prev = vecalloc(n);
  grad_prev = vecalloc(n);

  /* Ring buffer of past objective values, only when the delta test is on. */
  if (param.past > 0) {
    past_fx = vecalloc((size_t)param.past);
  }

  /* Objective and gradient at the starting point. */
  fx = evaluate(instance, n, x, grad, 0);
  eval_count++;

  /* First direction: the negated gradient. */
  vecncpy(dir, grad, n);

  if (past_fx) {
    past_fx[0] = fx;
  }

  /* Leave right away if the starting point already satisfies the gradient test. */
  vec2norm(&xnorm, x, n);
  vec2norm(&gnorm, grad, n);
  xnorm = (xnorm < 1.0) ? 1.0 : xnorm;
  if (gnorm / xnorm <= param.epsilon) {
    status = LBFGS_ALREADY_MINIMIZED;
    goto gd_exit;
  }

  /* initial guess of step length */
  step = 0.01;

  for (iter = 1; ; iter++) {
    /* Remember where we are so a failed search can be undone. */
    veccpy(x_prev, x, n);
    veccpy(grad_prev, grad, n);

    ls_ret = line_search_backtracking(n, x, &fx, grad, dir, &step, x_prev, grad_prev, 0, &cd, &param);
    if (ls_ret < 0) {
      /* Undo the failed search and hand its status to the caller. */
      veccpy(x, x_prev, n);
      veccpy(grad, grad_prev, n);
      status = ls_ret;
      break;
    }

    /* A non-negative search result is added to the evaluation counter. */
    eval_count += ls_ret;

    vec2norm(&xnorm, x, n);
    vec2norm(&gnorm, grad, n);
    if (progress(instance, n, x, grad, fx, xnorm, gnorm, step, iter, eval_count) != 0) {
      status = LBFGSERR_CANCELED;
      break;
    }

    /* Gradient test: |g| / max(1, |x|) <= epsilon. */
    xnorm = (xnorm < 1.0) ? 1.0 : xnorm;
    if (gnorm / xnorm <= param.epsilon) {
      status = LBFGS_CONVERGENCE;
      break;
    }

    /* Relative-decrease test against the value stored `past` iterations ago. */
    if (past_fx) {
      const int slot = iter % param.past;
      if (param.past <= iter) {
        const double rate = (past_fx[slot] - fx) / fx;
        if (rate < param.delta) {
          status = LBFGS_CONVERGENCE_DELTA;
          break;
        }
      }
      past_fx[slot] = fx;
    }

    if (param.max_iterations != 0 && param.max_iterations < iter + 1) {
      status = LBFGSERR_MAXIMUMITERATION;
      break;
    }

    /* Next direction: the negated gradient at the new point. */
    vecncpy(dir, grad, n);
  }

gd_exit:
  if (pfx) {
    *pfx = fx;
  }

  vecfree(past_fx);
  vecfree(grad_prev);
  vecfree(x_prev);
  vecfree(dir);
  vecfree(grad);
  return status;
}
コード例 #4
0
ファイル: lbfgs.c プロジェクト: zhuoxiongzhao/ml-pack
/**
* Minimize an objective function with the limited-memory BFGS method.
* When param.orthantwise_c is nonzero the orthant-wise variant is used: an
* L1 term over x[orthantwise_start .. orthantwise_end) is added to the
* objective and the pseudo-gradient pg replaces the gradient.
*
* n         number of variables; must be positive.
* x         in: starting point; updated in place, and restored to the
*           previous point when the line search fails.
* pfx       if non-null, receives the last objective value (only on paths
*           that reach lbfgs_exit).
* evaluate  objective/gradient callback.
* progress  progress callback; default_lbfgs_progress is used when null.
*           Any nonzero return stops the run with LBFGSERR_CANCELED.
* instance  opaque pointer handed to both callbacks.
* _param    optional parameters; default_param is copied when null.
*
* Returns LBFGS_ALREADY_MINIMIZED, LBFGS_CONVERGENCE,
* LBFGS_CONVERGENCE_DELTA, a negative line-search status, or an
* LBFGSERR_* code.
*/
int lbfgs(
  int n,
  double* x,
  double* pfx,
  lbfgs_evaluate_t evaluate,
  lbfgs_progress_t progress,
  void* instance,
  const lbfgs_parameter_t* _param
) {
  int ret;
  int i, j, k, ls, end, bound, n_evaluate = 0;
  /* Nonzero when orthant-wise (L1) mode is active; the spelling is kept as in the original. */
  int enalbe_owlqn;
  double step;
  /* Private copy of the parameters (orthantwise_end may be patched below). */
  lbfgs_parameter_t param = (_param) ? (*_param) : default_param;
  const int m = param.m;
  double* xp;
  double* g, *gp, *pg = 0;
  double* d, *w, *pf = 0;
  iteration_data_t* lm = 0, *it = 0;
  double ys, yy;
  double xnorm, gnorm, rate, beta;
  double fx;
  line_search_proc_t linesearch = line_search_morethuente;

  callback_data_t cd;
  cd.n = n;
  cd.instance = instance;
  cd.evaluate = evaluate;
  cd.progress = (progress) ? progress : default_lbfgs_progress;

  /* Check the input parameters for errors. Nothing is allocated yet, so plain returns are safe. */
  if (n <= 0) {
    return LBFGSERR_INVALID_N;
  }
  if (param.epsilon < 0.0) {
    return LBFGSERR_INVALID_EPSILON;
  }
  if (param.past < 0) {
    return LBFGSERR_INVALID_TESTPERIOD;
  }
  if (param.delta < 0.0) {
    return LBFGSERR_INVALID_DELTA;
  }
  if (param.min_step < 0.0) {
    return LBFGSERR_INVALID_MINSTEP;
  }
  if (param.max_step < param.min_step) {
    return LBFGSERR_INVALID_MAXSTEP;
  }
  if (param.ftol < 0.0) {
    return LBFGSERR_INVALID_FTOL;
  }
  if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE ||
      param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
    /* The Wolfe coefficient must lie strictly between ftol and 1. */
    if (param.wolfe <= param.ftol || 1. <= param.wolfe) {
      return LBFGSERR_INVALID_WOLFE;
    }
  }
  if (param.gtol < 0.0) {
    return LBFGSERR_INVALID_GTOL;
  }
  if (param.xtol < 0.0) {
    return LBFGSERR_INVALID_XTOL;
  }
  if (param.max_linesearch <= 0) {
    return LBFGSERR_INVALID_MAXLINESEARCH;
  }
  if (param.orthantwise_c < 0.0) {
    return LBFGSERR_INVALID_ORTHANTWISE;
  }
  if (param.orthantwise_start < 0 || param.orthantwise_start > n) {
    return LBFGSERR_INVALID_ORTHANTWISE_START;
  }
  if (param.orthantwise_end < 0) {
    /* A negative end index means "up to the last variable". */
    param.orthantwise_end = n;
  }
  if (param.orthantwise_end > n) {
    return LBFGSERR_INVALID_ORTHANTWISE_END;
  }

  enalbe_owlqn = (param.orthantwise_c != 0.0);
  if (enalbe_owlqn) {
    /* In orthant-wise mode only LBFGS_LINESEARCH_BACKTRACKING_WOLFE is accepted. */
    switch (param.linesearch) {
    case LBFGS_LINESEARCH_BACKTRACKING_WOLFE:
      linesearch = line_search_backtracking_owlqn;
      break;
    default:
      /* Only the backtracking method is available. */
      return LBFGSERR_INVALID_LINESEARCH;
    }
  } else {
    switch (param.linesearch) {
    case LBFGS_LINESEARCH_MORETHUENTE:
      linesearch = line_search_morethuente;
      break;
    case LBFGS_LINESEARCH_BACKTRACKING_ARMIJO:
    case LBFGS_LINESEARCH_BACKTRACKING_WOLFE:
    case LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE:
      linesearch = line_search_backtracking;
      break;
    default:
      return LBFGSERR_INVALID_LINESEARCH;
    }
  }

  /*
  * Allocate working space.
  * NOTE(review): none of the allocation results in this function are
  * checked; presumably vecalloc/xalloc do not return on failure - verify.
  */
  xp = vecalloc(n);
  g = vecalloc(n);
  gp = vecalloc(n);
  d = vecalloc(n);
  w = vecalloc(n);

  /* Allocate pseudo gradient. */
  if (enalbe_owlqn) {
    pg = vecalloc(n);
  }

  /* Allocate and initialize the limited memory storage. */
  lm = (iteration_data_t*)xalloc(m * sizeof(iteration_data_t));
  for (i = 0; i < m; i++) {
    it = &lm[i];
    it->alpha = 0.0;
    it->s = vecalloc(n);
    it->y = vecalloc(n);
    it->ys = 0.0;
  }

  /* Allocate an array for storing previous values of the objective function. */
  if (param.past > 0) {
    pf = vecalloc((size_t)param.past);
  }

  /* Evaluate the objective and its gradient at the starting point. */
  fx = cd.evaluate(cd.instance, cd.n, x, g, 0);
  n_evaluate++;

  if (enalbe_owlqn) {
    /* Add the L1 term to the objective and build the pseudo-gradient. */
    xnorm = owlqn_x1norm(x, param.orthantwise_start, param.orthantwise_end);
    fx += xnorm * param.orthantwise_c;
    owlqn_pseudo_gradient(
      pg, x, g, n,
      param.orthantwise_c, param.orthantwise_start, param.orthantwise_end);
  }

  /* Store the initial value of the objective function. */
  if (pf) {
    pf[0] = fx;
  }

  /**
  * Compute the direction.
  * we assume the initial hessian matrix H_0 as the identity matrix.
  */
  if (!enalbe_owlqn) {
    vecncpy(d, g, n);
  } else {
    vecncpy(d, pg, n);
  }

  /**
  * Make sure that the initial variables are not a minimizer.
  */
  vec2norm(&xnorm, x, n);
  if (!enalbe_owlqn) {
    vec2norm(&gnorm, g, n);
  } else {
    vec2norm(&gnorm, pg, n);
  }
  if (xnorm < 1.0) {
    xnorm = 1.0;
  }
  if (gnorm / xnorm <= param.epsilon) {
    ret = LBFGS_ALREADY_MINIMIZED;
    goto lbfgs_exit;
  }

  /**
  * Compute the initial step:
  * step = 1.0 / ||d||
  */
  vec2norminv(&step, d, n);

  k = 1;   /* iteration counter, 1-based */
  end = 0; /* slot of lm[] that receives the next (s, y) pair */
  for (;;) {
    /* Store the current position and gradient vectors. */
    veccpy(xp, x, n);
    veccpy(gp, g, n);

    /* Search for an optimal step. */
    if (!enalbe_owlqn) {
      ls = linesearch(n, x, &fx, g, d, &step, xp, gp, w, &cd, &param);
    } else {
      /* The orthant-wise search is given the pseudo-gradient, which is then refreshed at the new x. */
      ls = linesearch(n, x, &fx, g, d, &step, xp, pg, w, &cd, &param);
      owlqn_pseudo_gradient(
        pg, x, g, n,
        param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
      );
    }

    if (ls < 0) {
      /* Revert to the previous point. */
      veccpy(x, xp, n);
      veccpy(g, gp, n);
      ret = ls;
      break;
    }

    /* Assumes a non-negative line-search result is its number of evaluations - TODO confirm. */
    n_evaluate += ls;

    /* Compute x and g norms. */
    vec2norm(&xnorm, x, n);
    if (!enalbe_owlqn) {
      vec2norm(&gnorm, g, n);
    } else {
      vec2norm(&gnorm, pg, n);
    }

    /* Report the progress. The callback's own value is discarded; any nonzero result maps to LBFGSERR_CANCELED. */
    if ((ret = cd.progress(cd.instance, cd.n, x, g, fx, xnorm, gnorm, step, k, n_evaluate)) != 0) {
      ret = LBFGSERR_CANCELED;
      break;
    }

    /* Convergence test: |g| / max(1, |x|) <= epsilon. */
    if (xnorm < 1.0) {
      xnorm = 1.0;
    }
    if (gnorm / xnorm <= param.epsilon) {
      ret = LBFGS_CONVERGENCE;
      break;
    }

    /* Stopping criterion test. pf is used as a ring buffer indexed by k % past. */
    if (pf) {
      /* We don't test the stopping criterion while k < past. */
      if (param.past <= k) {
        /* Compute the relative improvement from the past. */
        rate = (pf[k % param.past] - fx) / fx;

        /* The stopping criterion. */
        if (rate < param.delta) {
          ret = LBFGS_CONVERGENCE_DELTA;
          break;
        }
      }

      /* Store the current value of the objective function. */
      pf[k % param.past] = fx;
    }

    /* Iteration limit (0 disables the limit). */
    if (param.max_iterations != 0 && param.max_iterations < k + 1) {
      ret = LBFGSERR_MAXIMUMITERATION;
      break;
    }

    /**
    * Update s and y:
    * s_{k+1} = x_{k+1} - x_{k} = step * d_{k}
    * y_{k+1} = g_{k+1} - g_{k}
    */
    it = &lm[end];
    vecdiff(it->s, x, xp, n);
    vecdiff(it->y, g, gp, n);

    /**
    * Compute scalars ys and yy:
    * ys = y^t s = 1 / \rho
    * yy = y^t y
    * Notice that yy is used for scaling the hessian matrix H_0 (Cholesky factor).
    */
    vecdot(&ys, it->y, it->s, n);
    vecdot(&yy, it->y, it->y, n);
    it->ys = ys;

    /**
    * Recursive formula to compute d = -(H g).
    * This is described in page 779 of:
    * Jorge Nocedal.
    * Updating Quasi-Newton Matrices with Limited Storage.
    * Mathematics of Computation, Vol. 35, No. 151,
    * pp. 773--782, 1980.
    */
    /* bound = min(m, k): number of stored pairs used in the two loops below. */
    bound = (m <= k) ? m : k;
    k++;
    end = (end + 1) % m;

    /* Compute the steepest direction. */
    /* Compute the negative of (pseudo) gradient. */
    if (!enalbe_owlqn) {
      vecncpy(d, g, n);
    } else {
      vecncpy(d, pg, n);
    }

    /* First loop: walk the stored pairs from newest to oldest. */
    j = end;
    for (i = 0; i < bound; i++) {
      j = (j + m - 1) % m; /* if (--j == -1) j = m-1; */
      it = &lm[j];
      /* \alpha_{j} = \rho_{j} s^{t}_{j} q_{k+1} */
      vecdot(&it->alpha, it->s, d, n);
      it->alpha /= it->ys;
      /* q_{i} = q_{i+1} - \alpha_{i} y_{i} */
      vecadd(d, it->y, -it->alpha, n);
    }

    vecscale(d, ys / yy, n);

    /* Second loop: walk back from oldest to newest (j starts where the first loop stopped). */
    for (i = 0; i < bound; i++) {
      it = &lm[j];
      /* \beta_{j} = \rho_{j} y^t_{j} \gamma_{i} */
      vecdot(&beta, it->y, d, n);
      beta /= it->ys;
      /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} */
      vecadd(d, it->s, it->alpha - beta, n);
      j = (j + 1) % m; /* if (++j == m) j = 0; */
    }

    /* Constrain the search direction for orthant-wise updates. */
    if (enalbe_owlqn) {
      owlqn_contrain_line_search(d, pg, param.orthantwise_start, param.orthantwise_end);
    }

    /* Now the search direction d is ready. We try step = 1 first. */
    step = 1.0;
  }

lbfgs_exit:
  /* Return the final value of the objective function. */
  if (pfx) {
    *pfx = fx;
  }

  /* Release everything allocated above; pf and pg may still be null here. */
  vecfree(pf);
  if (lm != 0) {
    for (i = 0; i < m; i++) {
      vecfree(lm[i].s);
      vecfree(lm[i].y);
    }
    xfree(lm);
  }
  vecfree(pg);
  vecfree(w);
  vecfree(d);
  vecfree(gp);
  vecfree(g);
  vecfree(xp);
  return ret;
}
コード例 #5
0
ファイル: lbfgs.c プロジェクト: zhuoxiongzhao/ml-pack
/**
 * Minimize a differentiable function with nonlinear conjugate gradients.
 *
 * Search directions are updated with the Polak-Ribiere formula and each
 * direction is explored by a line search that combines quadratic/cubic
 * interpolation with cubic extrapolation. A step is accepted once both the
 * sufficient-decrease test (RHO) and the slope test (SIG) hold.
 * NOTE(review): the structure looks like a port of C. E. Rasmussen's
 * minimize.m -- confirm against that reference before changing the numerics.
 *
 * @param n         Number of variables; must be positive.
 * @param x         In: starting point. Out: the point reached when the
 *                  routine stops (the pre-line-search point if the line
 *                  search failed).
 * @param pfx       Optional (may be 0). Receives the objective value that
 *                  belongs to the returned x. Not written when a parameter
 *                  check fails before the first evaluation.
 * @param evaluate  Callback invoked as evaluate(instance, n, x, g, 0); its
 *                  return value is used as the objective value and the
 *                  buffer g is afterwards read as the gradient.
 * @param progress  Callback invoked after every successful line search; a
 *                  non-zero return cancels the optimization. When 0,
 *                  default_lbfgs_progress is used.
 * @param instance  Opaque pointer forwarded to both callbacks.
 * @param _param    Optional parameters; default_param is used when 0.
 *
 * @return LBFGS_ALREADY_MINIMIZED, LBFGS_CONVERGENCE or
 *         LBFGS_CONVERGENCE_DELTA when a stopping test is met, otherwise one
 *         of the LBFGSERR_* codes (invalid parameter, canceled, maximum
 *         iterations, maximum line search evaluations, line search failed).
 */
int cg(
  int n,
  double* x,
  double* pfx,
  lbfgs_evaluate_t evaluate,
  lbfgs_progress_t progress,
  void* instance,
  const lbfgs_parameter_t* _param
) {
  /* RHO and SIG are the constants of the sufficient-decrease and slope tests. */
  static const double RHO = 0.01;
  static const double SIG = 0.5;
  /* Interpolated steps are kept at least a fraction INT away from the bracket ends. */
  static const double INT = 0.1;
  /* Extrapolation may enlarge the current step by at most a factor of EXT. */
  static const double EXT = 3.0;
  /* Upper bound on the slope ratio used to scale the next initial step. */
  static const double RATIO = 100.0;

  int ret;
  int k, ls_count, ls_success, ls_failed = 0, n_evaluate = 0;
  lbfgs_parameter_t param = (_param) ? (*_param) : default_param;
  double f0, f1, f2 = 0.0, f3, d1, d2, d3, z1, z2 = 0.0, z3, limit, A, B, C;
  double fx = 0.0; /* objective value that belongs to the current contents of x */
  double xnorm, gnorm, rate;
  double* df0, *df1, *df2, *s, *x0;
  double* pf = 0;

  if (progress == 0) {
    progress = default_lbfgs_progress;
  }

  /* Validate the parameters that this routine actually reads. */
  if (n <= 0) {
    return LBFGSERR_INVALID_N;
  }
  if (param.epsilon < 0.0) {
    return LBFGSERR_INVALID_EPSILON;
  }
  if (param.past < 0) {
    return LBFGSERR_INVALID_TESTPERIOD;
  }
  if (param.delta < 0.0) {
    return LBFGSERR_INVALID_DELTA;
  }
  if (param.max_linesearch <= 0) {
    return LBFGSERR_INVALID_MAXLINESEARCH;
  }

  /*
   * Work vectors: df0/df1/df2 hold gradients, s the search direction and x0
   * a backup of x taken before each line search.
   * NOTE(review): the results of vecalloc are not checked here -- confirm
   * that vecalloc cannot return 0, or add an out-of-memory path.
   */
  df0 = vecalloc(n);
  df1 = vecalloc(n);
  df2 = vecalloc(n);
  s = vecalloc(n);
  x0 = vecalloc(n);

  /* History of past objective values for the delta-based stopping test. */
  if (param.past > 0) {
    pf = vecalloc((size_t)param.past);
  }

  /* Evaluate the function value and its gradient at the starting point. */
  f1 = evaluate(instance, n, x, df1, 0);
  n_evaluate++;
  fx = f1;

  if (pf) {
    pf[0] = f1;
  }

  /* Stop right away if the starting point already satisfies the gradient test. */
  vec2norm(&xnorm, x, n);
  vec2norm(&gnorm, df1, n);
  if (xnorm < 1.0) {
    xnorm = 1.0;
  }
  if (gnorm / xnorm <= param.epsilon) {
    ret = LBFGS_ALREADY_MINIMIZED;
    goto cg_exit;
  }

  /* The first direction is steepest descent; d1 is the slope along s. */
  vecncpy(s, df1, n);
  vecdot(&d1, s, s, n);
  d1 = -d1;
  /**
  * Compute the initial step z1:
  */
  z1 = 1.0 / (1.0 - d1);

  k = 1;
  for (;;) {
    /* Store the current position and gradient vectors. */
    f0 = f1;
    veccpy(x0, x, n);
    veccpy(df0, df1, n);

    /* update x using current step: x=x+z1*s */
    vecadd(x, s, z1, n);

    f2 = evaluate(instance, n, x, df2, 0);
    n_evaluate++;

    vecdot(&d2, df2, s, n);
    /* set point 3 equal to point 1 */
    f3 = f1;
    d3 = d1;
    z3 = -z1;

    /* begin line search */
    ls_success = 0;
    ls_count = 0;
    limit = -1.0; /* negative means "no upper bracket found yet" */
    for (;;) {
      /*
       * Interpolate while the trial point violates either acceptance test.
       * The loop is bounded by max_linesearch: without the bound it could
       * run forever and the failure test right below it would be
       * unreachable, which disables the steepest-descent fallback.
       */
      while ((f2 > f1 + RHO * z1 * d1 || d2 > -SIG * d1) &&
             ls_count < param.max_linesearch) {
        limit = z1;
        if (f2 > f1) {
          /* quadratic fit */
          z2 = z3 - (0.5 * d3 * z3 * z3) / (d3 * z3 + f2 - f3);
        } else {
          /* cubic fit */
          A = 6 * (f2 - f3) / z3 + 3 * (d2 + d3);
          B = 3 * (f3 - f2) - z3 * (d3 + 2 * d2);
          z2 = (sqrt(B * B - A * d2 * z3 * z3) - B) / A;
        }

        if (isinf(z2) || isnan(z2)) {
          /* if we had a numerical problem then bisect */
          z2 = z3 / 2.0;
        }

        /* don't accept too close to limits */
        z2 = max2(min2(z2, INT * z3), (1.0 - INT) * z3);
        /* update step and x */
        z1 = z1 + z2;
        vecadd(x, s, z2, n);

        f2 = evaluate(instance, n, x, df2, 0);
        n_evaluate++;
        ls_count++;

        vecdot(&d2, df2, s, n);
        z3 = z3 - z2;
      }

      if (f2 > f1 + z1 * RHO * d1 || d2 > -SIG * d1) {
        /* a line search failure (evaluation budget used up while interpolating) */
        break;
      } else if (d2 > SIG * d1) {
        /* a line search success */
        ls_success = 1;
        break;
      } else if (ls_count >= param.max_linesearch) {
        /* x currently holds the trial point whose value is f2. */
        fx = f2;
        ret = LBFGSERR_MAXIMUMLINESEARCH;
        goto cg_exit;
      }

      /* cubic extrapolation */
      A = 6.0 * (f2 - f3) / z3 + 3.0 * (d2 + d3);
      B = 3.0 * (f3 - f2) - z3 * (d3 + 2 * d2);
      z2 = -d2 * z3 * z3 / (B + sqrt(B * B - A * d2 * z3 * z3));
      /* adjust current step z2 for many cases */
      if (isnan(z2) || isinf(z2) || z2 < 0.0) {
        /* numerical problem or wrong sign */
        if (limit < -0.5) {
          /* no upper limit: extrapolate the maximum amount */
          z2 = z1 * (EXT - 1.0);
        } else {
          /* otherwise bisect */
          z2 = (limit - z1) / 2.0;
        }
      } else if (limit > -0.5 && z2 + z1 > limit) {
        /* extrapolation beyond the known limit: bisect */
        z2 = (limit - z1) / 2.0;
      } else if (limit < -0.5 && z2 + z1 > z1 * EXT) {
        /* extrapolation beyond the allowed growth: clamp */
        z2 = z1 * (EXT - 1.0);
      } else if (z2 < -z3 * INT) {
        /* step too small relative to the previous one */
        z2 = -z3 * INT;
      } else if (limit > -0.5 && z2 < (limit - z1) * (1.0 - INT)) {
        /* too close to the limit */
        z2 = (limit - z1) * (1.0 - INT);
      }

      /* set point 3 equal to point 2 */
      f3 = f2;
      d3 = d2;
      z3 = -z2;

      z1 = z1 + z2;
      vecadd(x, s, z2, n);

      f2 = evaluate(instance, n, x, df2, 0);
      n_evaluate++;
      ls_count++;

      vecdot(&d2, df2, s, n);
    }

    if (ls_success) {
      /* x now holds the accepted point; remember its objective value. */
      fx = f2;

      vec2norm(&xnorm, x, n);
      vec2norm(&gnorm, df2, n);
      if ((ret = progress(instance, n, x, df2, f2, xnorm, gnorm, z2, k, n_evaluate)) != 0) {
        ret = LBFGSERR_CANCELED;
        break;
      }
      /* Gradient-norm stopping test, relative to max(1, ||x||). */
      if (xnorm < 1.0) {
        xnorm = 1.0;
      }
      if (gnorm / xnorm <= param.epsilon) {
        ret = LBFGS_CONVERGENCE;
        break;
      }

      /* Relative-improvement stopping test over the last `past` iterations. */
      if (pf) {
        if (param.past <= k) {
          rate = (pf[k % param.past] - f2) / f2;
          if (rate < param.delta) {
            ret = LBFGS_CONVERGENCE_DELTA;
            break;
          }
        }
        pf[k % param.past] = f2;
      }

      if (param.max_iterations != 0 && param.max_iterations < k + 1) {
        ret = LBFGSERR_MAXIMUMITERATION;
        break;
      }
      k++;

      f1 = f2;
      /**
      * Polack-Ribiere direction
      * s = (df2'*df2-df1'*df2)/(df1'*df1)*s - df2
      */
      vecdot(&A, df2, df2, n);
      vecdot(&B, df1, df2, n);
      vecdot(&C, df1, df1, n);
      vecscale(s, (A - B) / C, n);
      vecadd(s, df2, -1.0, n);

      /* df1 becomes the gradient at the new point. */
      vecswap(df1, df2, n);
      vecdot(&d2, df1, s, n);

      if (d2 > 0) {
        /* The new direction is not a descent direction: fall back to steepest descent. */
        vecncpy(s, df1, n);
        vecdot(&d2, s, s, n);
        d2 = -d2;
      }

      /* Scale the next initial step by the slope ratio, capped at RATIO. */
      z1 = z1 * min2(RATIO, d1 / (d2 - DBL_MIN));
      d1 = d2;
      ls_failed = 0;
    } else {
      /* restore previous point */
      f1 = f0;
      veccpy(x, x0, n);
      veccpy(df1, df0, n);
      /* x is back at the pre-line-search point, so report its value. */
      fx = f1;

      if (ls_failed) {
        /* line search failed twice */
        ret = LBFGSERR_LINE_SEARCH_FAILED;
        break;
      }

      vecswap(df1, df2, n);
      vecncpy(s, df1, n);/* try steepest */
      vecdot(&d1, s, s, n);
      d1 = -d1;
      z1 = 1.0 / (1.0 - d1);
      ls_failed = 1;
    }
  }

cg_exit:
  /* Return the objective value that matches the returned x. */
  if (pfx) {
    *pfx = fx;
  }

  vecfree(pf);
  vecfree(x0);
  vecfree(s);
  vecfree(df2);
  vecfree(df1);
  vecfree(df0);
  return ret;
}