Ejemplo n.º 1
0
/**
 * @brief Low level fitting routine for non-Gaussian trend filtering problems.
 * Can be configured to handle arbirary losses, as it takes the link function
 * and it first two derivaties as inputs. Fits the solution for a single value
 * of lambda. Most users will want to call tf_admm, rather than tf_admm_glm directly.
 *
 * @param y                    a vector of responses
 * @param x                    a vector of response locations; must be in increasing order
 * @param w                    a vector of sample weights
 * @param n                    the length of y, x, and w
 * @param k                    degree of the trendfilter; i.e., k=1 linear
 * @param max_iter             maximum number of ADMM interations; ignored for k=0
 * @param lam                  the value of lambda
 * @param beta                 allocated space for output coefficents; must pre-fill as it is used in warm start
 * @param alpha                allocated space for ADMM alpha covariates; must pre-fill as it is used in warm start
 * @param u                    allocated space for ADMM u covariates; must pre-fill as it is used in warm start
 * @param obj                  allocated space to store the objective; will fill at most max_iter elements
 * @param iter                 allocated space to store the number of iterations; will fill just one element
 * @param status               allocated space of size nlambda to store the status of each run
 * @param rho                  tuning parameter for the ADMM algorithm; set to 1 for default
 * @param obj_tol              stopping criteria tolerance; set to 1e-10 for default
 * @param alpha_ls             for family != 0, line search tuning parameter
 * @param gamma_ls             for family != 0, line search tuning parameter
 * @param max_iter_ls          for family != 0, max number of iterations in line search
 * @param max_iter_newton      for family != 0, max number of iterations in inner ADMM
 * @param DktDk                pointer to the inner product of DktDk
 * @param b                    the link function for a given loss
 * @param b1                   first derivative of the link function for a given loss
 * @param b2                   second derivative of the link function for a given loss
 * @param verbose              0/1 flag for printing progress
 * @return void
 * @see tf_admm
 */
void tf_admm_glm (double * y, double * x, double * w, int n, int k,
        int max_iter, double lam,
        double * beta, double * alpha, double * u,
        double * obj, int * iter,
        double rho, double obj_tol, double alpha_ls, double gamma_ls,
        int max_iter_ls, int max_iter_newton,
        cs * DktDk,
        func_RtoR b, func_RtoR b1, func_RtoR b2, int verbose)
{
  double * d; /* line search direction */
  double * yt;/* working response: ytilde */
  double * H; /* weighted Hessian */
  double * z;
  double * obj_admm;

  int i;
  int * iter_ls;
  int it;
  int iter_admm;
  double * Db;
  double * Dd;
  double loss;
  double pen;
  double t; /* stepsize */

  d  = (double*) malloc(n*sizeof(double)); /* line search direction */
  yt = (double*) malloc(n*sizeof(double));/* working response: ytilde */
  H  = (double*) malloc(n*sizeof(double)); /* weighted Hessian */
  z  = (double*) malloc(n*sizeof(double));

  /* Buffers for line search */
  Db      = (double *) malloc(n*sizeof(double));
  Dd      = (double *) malloc(n*sizeof(double));
  iter_ls = (int *)    malloc(sizeof(int));

  obj_admm = (double*)malloc(max_iter*sizeof(double));

  if (verbose) printf("\nlambda=%0.3e\n",lam);
  if (verbose) printf("Iteration\tObjective\tLoss\tPenalty\tADMM iters\n");

  /* One Prox Newton step per iteration */
  for (it=0; it < max_iter_newton; it++)
  {
    /* Define weighted Hessian, and working response */
    for(i=0; i<n; i++)
    {
      H[i] = w[i] * b2(beta[i]);
      if (fabs(H[i])>WEIGHT_SMALL)
      {
        yt[i] = beta[i] + (y[i]-b1(beta[i]))/H[i];
      }
      else
      {
        yt[i] = beta[i] + (y[i]-b1(beta[i]));
      }
    }

    /* Prox Newton step */
    iter_admm = 0;
    tf_admm_gauss (yt, x, H, n, k,
        max_iter, lam,
        d, alpha, u,
        obj_admm, &iter_admm, rho, obj_tol,
        DktDk, 0);

    for(i=0; i<n; i++)
    {
      d[i] = d[i] - beta[i];
    }

    t = line_search(y, x, w, n, k, lam, b, b1, beta, d, alpha_ls, gamma_ls, max_iter_ls, iter_ls, Db, Dd);
    /* if (verbose) printf("Stepsize t=%.3e,\titers=%d\n", t, *iter_ls+1); */

    for(i=0; i<n; i++)
    {
      beta[i] = beta[i] + t * d[i];
    }

    /* Compute objective */
    /* Compute loss */
    loss = 0;
    for (i=0; i<n; i++) loss += w[i] * (-y[i]*beta[i] + b(beta[i]));
    /* Compute penalty */
    tf_dx(x,n,k+1,beta,z); /* IMPORTANT: use k+1 here! */
    pen = 0;
    for (i=0; i<n-k-1; i++) pen += fabs(z[i]);
    obj[it] = loss+lam*pen;

    if (verbose) printf("\t%i\t%0.3e\t%0.3e\t%0.3e\t%i\n",it+1,obj[it],loss,lam*pen,iter_admm);

    if(it > 0)
    {
      if( fabs(obj[it] - obj[it-1]) < fabs(obj[it]) * obj_tol )
      {
        break;
      }
    }
  }

  *iter = it;

  /* free */
  free(d);
  free(yt);
  free(H);
  free(z);
  free(iter_ls);
  free(Db);
  free(Dd);
  free(obj_admm);
}
Ejemplo n.º 2
0
/**
 * @brief Low level fitting routine for non-Gaussian trend filtering problems.
 * Can be configured to handle arbirary losses, as it takes the link function
 * and it first two derivaties as inputs. Fits the solution for a single value
 * of lambda. Most users will want to call tf_admm, rather than tf_admm_glm directly.
 *
 * @param x                    a vector of data locations; must be in increasing order
 * @param y                    a vector of responses
 * @param w                    a vector of sample weights
 * @param n                    the length of x, y, and w
 * @param k                    polynomial degree of the fitted trend; i.e., k=1 for linear
 * @param max_iter             maximum number of ADMM interations; ignored for k=0
 * @param lam                  the value of lambda
 * @param df                   allocated space for df value at the solution
 * @param beta                 allocated space for output coefficents; must pre-fill as it is used in warm start
 * @param alpha                allocated space for ADMM alpha variable; must pre-fill as it is used in warm start
 * @param u                    allocated space for ADMM u variable; must pre-fill as it is used in warm start
 * @param obj                  allocated space to store the objective; will fill at most max_iter elements
 * @param iter                 allocated space to store the number of iterations; will fill just one element
 * @param status               allocated space of size nlambda to store the status of each run
 * @param rho                  tuning parameter for the ADMM algorithm; set to 1 for default
 * @param obj_tol              stopping criteria tolerance; set to 1e-6 for default
 * @param obj_tol_newton       for family != 0, stopping criteria tolerance for prox Newton
 * @param alpha_ls             for family != 0, line search tuning parameter
 * @param gamma_ls             for family != 0, line search tuning parameter
 * @param max_iter_ls          for family != 0, max number of iterations in line search
 * @param max_iter_newton      for family != 0, max number of iterations in inner ADMM
 * @param DktDk                pointer to the inner product of DktDk
 * @param b                    the link function for a given loss
 * @param b1                   first derivative of the link function for a given loss
 * @param b2                   second derivative of the link function for a given loss
 * @param verbose              0/1 flag for printing progress
 * @return void
 * @see tf_admm
 */
void tf_admm_glm (double * x, double * y, double * w, int n, int k,
    int max_iter, double lam, int tridiag, int * df,
    double * beta, double * alpha, double * u,
    double * obj, int * iter,
    double rho, double obj_tol, double obj_tol_newton, 
    double alpha_ls, double gamma_ls, int max_iter_ls, int max_iter_newton,
    cs * DktDk, double * A0, double * A1, 
    func_RtoR b, func_RtoR b1, func_RtoR b2, int verbose)
{
  double * dir; /* line search direction */
  double * yt;  /* working response: ytilde */
  double * H;   /* weighted Hessian */
  double * obj_admm;
  double * betabest;
  double * alphabest;  

  int i;
  int d;
  int it, itbest;
  int iter_admm;
  int * iter_ls;
  double * Db;
  double * Dd;
  double t; /* stepsize */  

  dir = (double*) malloc(n*sizeof(double));
  yt  = (double*) malloc(n*sizeof(double));
  H   = (double*) malloc(n*sizeof(double));

  /* Buffers for line search */
  Db      = (double *) malloc(n*sizeof(double));
  Dd      = (double *) malloc(n*sizeof(double));
  iter_ls = (int *)    malloc(sizeof(int));

  obj_admm = (double*) malloc(max_iter*sizeof(double));

  betabest = (double*) malloc(n*sizeof(double));
  alphabest = (double*) malloc(n*sizeof(double));

  if (verbose) printf("\nlambda=%0.3f\n",lam);
  if (verbose) printf("Iteration\tObjective\tADMM iters\n");

  itbest = 0;
  obj[0] = tf_obj_glm(x, y, w, n, k, lam, b, beta, yt);
  
  /* One prox Newton step per iteration */
  for (it=0; it < max_iter_newton; it++)
  {
    /* Define weighted Hessian, and working response */
    for (i=0; i<n; i++)
    {
      H[i] = w[i] * b2(beta[i]);
      if (fabs(H[i])>WEIGHT_SMALL) yt[i] = beta[i] + (y[i]-b1(beta[i]))/H[i];
      else yt[i] = beta[i] + (y[i]-b1(beta[i]));
    }

    /* Prox Newton step */
    iter_admm = 0;
    // rho = rho/(double) (it+1);
    if (tridiag)
      tf_admm_gauss_tri(x, yt, H, n, k, max_iter, lam, df, dir, alpha, u,
          obj_admm, &iter_admm, rho, obj_tol, A0, A1, 0);
    else
      tf_admm_gauss(x, yt, H, n, k, max_iter, lam, df, dir, alpha, u,
          obj_admm, &iter_admm, rho, obj_tol, DktDk, 0);

    /* Line search */
    for (i=0; i<n; i++) dir[i] = dir[i] - beta[i];
    t = line_search(y, x, w, n, k, lam, b, b1, beta, dir, alpha_ls, gamma_ls,
        max_iter_ls, iter_ls, Db, Dd);

    if (t < 0) { // line search failed
      break;
    }
    for (i=0; i<n; i++) beta[i] = beta[i] + t * dir[i];

    /* Compute objective */
    obj[it+1] = tf_obj_glm(x, y, w, n, k, lam, b, beta, yt);
    if (verbose) printf("\t%i\t%0.3e\t%i\t%i\n",it+1,obj[it],iter_admm,*iter_ls);

    if (obj[it+1] - obj[itbest] <= 0 )
    {
      memcpy(betabest, beta, n * sizeof(double));
      if (k>0) memcpy(alphabest, alpha, n * sizeof(double));
      if (obj[itbest] - obj[it+1] <= fabs(obj[itbest]) * obj_tol_newton) {
        itbest = it+1; break;
      }
      itbest = it+1;
    }
    if (it >= itbest + 4) break;

  }

  memcpy(beta, betabest, n * sizeof(double));
  if (k>0) memcpy(alpha, alphabest, n * sizeof(double));

  *iter = it;

  /* Compute final df value, based on alpha */
  d = k+1;
  if (k>0) for (i=0; i<n-k-1; i++) if (alpha[i] != alpha[i+1]) d += 1;
  else for (i=0; i<n-k-1; i++) if (beta[i] != beta[i+1]) d += 1;
  *df = d;

  /* Free everything */
  free(dir);
  free(yt);
  free(H);
  free(Db);
  free(Dd);
  free(iter_ls);
  free(obj_admm);
  free(betabest);
  free(alphabest);
}
Ejemplo n.º 3
0
/**
 * @brief Main wrapper for fitting a trendfilter model.
 * Takes as input either a sequence of lambda tuning parameters, or the number
 * of desired lambda values. In the latter case the function will also calculate
 * a lambda sequence. The user must supply allocated memory to store the output,
 * with the function itself returning only @c void. For default values, and an
 * example of how to call the function, see the function tf_admm_default.
 *
 * @param y                    a vector of responses
 * @param x                    a vector of response locations; must be in increasing order
 * @param w                    a vector of sample weights
 * @param n                    the length of y, x, and w
 * @param k                    degree of the trendfilter; i.e., k=1 linear
 * @param family               family code for the type of fit; family=0 for OLS
 * @param max_iter             maximum number of ADMM interations; ignored for k=0
 * @param lam_flag             0/1 flag for whether lambda sequence needs to be estimated
 * @param lambda               either a sequence of lambda when lam_flag=0, or empty
 *                             allocated space if lam_flag=1
 * @param nlambda              number of lambda values; need for both lam_flag=0 and 1
 * @param lambda_min_ratio     minimum ratio between min and max lambda; ignored for lam_flag=0
 * @param beta                 allocated space of size n*nlambda to store the output coefficents
 * @param obj                  allocated space of size max_iter*nlambda to store the objective
 * @param iter                 allocated space of size nlambda to store the number of iterations
 * @param status               allocated space of size nlambda to store the status of each run
 * @param rho                  tuning parameter for the ADMM algorithm
 * @param obj_tol              stopping criteria tolerance
 * @param alpha_ls             for family != 0, line search tuning parameter
 * @param gamma_ls             for family != 0, line search tuning parameter
 * @param max_iter_ls          for family != 0, max number of iterations in line search
 * @param max_iter_newton      for family != 0, max number of iterations in inner ADMM
 * @param verbose              0/1 flag for printing progress
 * @return void
 * @see tf_admm_default
 */
void tf_admm (double * y, double * x, double * w, int n, int k, int family,
              int max_iter, int lam_flag, double * lambda,
              int nlambda, double lambda_min_ratio, double * beta,
              double * obj, int * iter, int * status, double rho,
              double obj_tol, double alpha_ls, double gamma_ls,
              int max_iter_ls, int max_iter_newton, int verbose)
{
  int i;
  int j;
  double max_lam;
  double min_lam;
  double * temp_n;
  double * beta_max;
  double * alpha;
  double * u;

  cs * D;
  cs * Dt;
  cs * Dk;
  cs * Dkt;
  cs * DktDk;
  gqr * Dt_qr;
  gqr * Dkt_qr;

  beta_max = (double *) malloc(n * sizeof(double));
  temp_n   = (double *) malloc(n * sizeof(double));
  alpha    = (double *) malloc(n * sizeof(double)); /* we use extra buffer (n vs n-k) */
  u        = (double *) malloc(n * sizeof(double)); /* we use extra buffer (n vs n-k) */

  /* Assume w does not have zeros */
  for(i = 0; i < n; i++) temp_n[i] = 1/sqrt(w[i]);

  D = tf_calc_dk(n, k+1, x);
  Dk = tf_calc_dktil(n, k, x);
  Dt = cs_transpose(D, 1);
  diag_times_sparse(Dt, temp_n); /* Dt = W^{-1/2} Dt */
  Dkt = cs_transpose(Dk, 1);
  Dt_qr = glmgen_qr(Dt);
  Dkt_qr = glmgen_qr(Dkt);
  DktDk = cs_multiply(Dkt,Dk);

  /* Determine the maximum lambda in the path, and initiate the path if needed
   * using the input lambda_min_ratio and equally spaced log points.
   */
  max_lam = tf_maxlam(n, y, Dt_qr, w);
  if (!lam_flag)
  {
    min_lam = max_lam * lambda_min_ratio;
    lambda[0] = max_lam;
    for (i = 1; i < nlambda; i++)
      lambda[i] = exp((log(max_lam) * (nlambda - i -1) + log(min_lam) * i) / (nlambda-1));

  }

  rho = rho * pow( (x[n-1] - x[0])/n, (double)k);

  /* Initiate alpha and u for a warm start */
  if (lambda[0] < max_lam * 1e-5)
  {
    for (i = 0; i < n - k; i++)
    {
      alpha[i] = 0;
      u[i] = 0;
    }
  } else {

    /* beta_max */
    for (i = 0; i < n; i++) temp_n[i] = -sqrt(w[i]) * y[i];
    glmgen_qrsol (Dt_qr, temp_n);
    for (i = 0; i < n; i++) beta_max[i] = 0;
    cs_gaxpy(Dt, temp_n, beta_max);
    /* Dt has a W^{-1/2}, so in the next step divide by sqrt(w) instead of w. */
    for (i = 0; i < n; i++) beta_max[i] = y[i] - beta_max[i]/sqrt(w[i]);

    /* alpha_max */
    tf_dxtil(x, n, k, beta_max, alpha);

    /* u_max */
    switch (family)
    {
    case FAMILY_GAUSSIAN:
      for (i = 0; i < n; i++) u[i] = w[i] * (beta_max[i] - y[i]) / (rho * lambda[0]);
      break;

    case FAMILY_LOGISTIC:
      for (i = 0; i < n; i++) {
        u[i] = logi_b2(beta_max[i]) * w[i] * (beta_max[i] - y[i]) / (rho * lambda[0]);
      }
      break;

    case FAMILY_POISSON:
      for (i = 0; i < n; i++) {
        u[i] = pois_b2(beta_max[i]) * w[i] *(beta_max[i] - y[i]) / (rho * lambda[0]);
      }
      break;

    default:
      for (i = 0; i < nlambda; i++) status[i] = 2;
      return;
    }

    glmgen_qrsol (Dkt_qr, u);
  }

  /* Iterate lower level functions over all lambda values;
   * the alpha and u vectors get used each time of subsequent
   * warm starts
   */
  for (i = 0; i < nlambda; i++)
  {
    /* warm start */
    double * beta_init = (i == 0) ? beta_max : beta + (i-1)*n;
    for(j = 0; j < n; j++) beta[i*n + j] = beta_init[j];

    switch (family)
    {
      case FAMILY_GAUSSIAN:
        tf_admm_gauss(y, x, w, n, k, max_iter, lambda[i], beta+i*n, alpha,
                      u, obj+i*max_iter, iter+i, rho * lambda[i], obj_tol,
                      DktDk, verbose);
        break;

      case FAMILY_LOGISTIC:
        tf_admm_glm(y, x, w, n, k, max_iter, lambda[i], beta+i*n, alpha, u, obj+i*max_iter, iter+i,
                    rho * lambda[i], obj_tol, alpha_ls, gamma_ls, max_iter_ls, max_iter_newton,
                    DktDk, &logi_b, &logi_b1, &logi_b2, verbose);
        break;

      case FAMILY_POISSON:
        tf_admm_glm(y, x, w, n, k, max_iter, lambda[i], beta+i*n, alpha, u, obj+i*max_iter, iter+i,
                    rho * lambda[i], obj_tol, alpha_ls, gamma_ls, max_iter_ls, max_iter_newton,
                    DktDk, &pois_b, &pois_b1, &pois_b2, verbose);
        break;
    }

    /* If there any NaNs in beta: reset beta, alpha, u */
    if(has_nan(beta + i * n, n))
    {
      for(j = 0; j < n; j++) beta[i*n + j] = 0;
      for(j = 0; j < n-k; j++) { alpha[j] = 0; u[j] = 0; }
      status[i] = 1;
      printf("Numerical error in lambda[%d]=%f",i,lambda[i]);
    }
  }

  cs_spfree(D);
  cs_spfree(Dt);
  cs_spfree(Dk);
  cs_spfree(Dkt);
  cs_spfree(DktDk);
  glmgen_gqr_free(Dt_qr);
  glmgen_gqr_free(Dkt_qr);

  free(temp_n);
  free(beta_max);
  free(alpha);
  free(u);
}
Ejemplo n.º 4
0
/**
 * @brief Main wrapper for fitting a trendfilter model.
 * Takes as input either a sequence of lambda tuning parameters, or the number
 * of desired lambda values. In the latter case the function will also calculate
 * a lambda sequence. The user must supply allocated memory to store the output,
 * with the function itself returning only @c void. For default values, and an
 * example of how to call the function, see the function tf_admm_default.
 *
 * @param x                    a vector of data locations; must be in increasing order
 * @param y                    a vector of responses
 * @param w                    a vector of sample weights
 * @param n                    the length of x, y, and w
 * @param k                    polynomial degree of the fitted trend; i.e., k=1 for linear
 * @param family               family code for the type of fit; family=0 for OLS
 * @param max_iter             maximum number of ADMM interations; ignored for k=0
 * @param beta0                initialization value of beta for first lambda; ignored if NULL
 * @param lam_flag             0/1 flag for whether lambda sequence needs to be estimated
 * @param lambda               either a sequence of lambda when lam_flag=0, or empty
 *                             allocated space if lam_flag=1
 * @param nlambda              number of lambda values; need for both lam_flag=0 and 1
 * @param lambda_min_ratio     minimum ratio between min and max lambda; ignored for lam_flag=0
 * @param df                   allocated space of nlambda to store the output df values
 * @param beta                 allocated space of size n*nlambda to store the output coefficents
 * @param obj                  allocated space of size max_iter*nlambda to store the objective
 * @param iter                 allocated space of size nlambda to store the number of iterations
 * @param status               allocated space of size nlambda to store the status of each run
 * @param rho                  tuning parameter for the ADMM algorithm
 * @param obj_tol              stopping criteria tolerance
 * @param obj_tol_newton       for family != 0, stopping criteria tolerance for prox Newton
 * @param alpha_ls             for family != 0, line search tuning parameter
 * @param gamma_ls             for family != 0, line search tuning parameter
 * @param max_iter_ls          for family != 0, max number of iterations in line search
 * @param max_iter_newton       for family != 0, max number of iterations in inner ADMM
 * @param verbose              0/1 flag for printing progress
 * @return void
 * @see tf_admm_default
 */
void tf_admm ( double * x, double * y, double * w, int n, int k, int family,
    int max_iter, double * beta0, int lam_flag, double * lambda,
    int nlambda, double lambda_min_ratio, int tridiag, int * df,
    double * beta, double * obj, int * iter, int * status,
    double rho, double obj_tol, double obj_tol_newton, double alpha_ls, double gamma_ls,
    int max_iter_ls, int max_iter_newton, int verbose)
{
  int i;
  int j;
  int numDualVars;
  double max_lam;
  double min_lam;
  double * temp_n;
  double * beta_max;
  double * alpha;
  double * u;
  double * A0;
  double * A1;
  double * v;

  cs * D;
  cs * Dt;
  cs * Dk;
  cs * Dkt;
  cs * DktDk;
  gqr * Dt_qr;
  gqr * Dkt_qr;

  beta_max = (double *) malloc(n * sizeof(double));
  temp_n   = (double *) malloc(n * sizeof(double));
  v        = (double *) malloc(n * sizeof(double));

  numDualVars = tridiag ? k : 1;

  /* we use extra buffer below (n vs n-k) */
  alpha    = (double *) malloc(n * numDualVars * sizeof(double)); 
  u        = (double *) malloc(n * numDualVars * sizeof(double)); 

  /* Assume w does not have zeros */
  for (i = 0; i < n; i++) temp_n[i] = 1/sqrt(w[i]);

  D 	= tf_calc_dk(n, k+1, x);
  Dk 	= tf_calc_dktil(n, k, x);
  Dt 	= cs_transpose(D, 1);

  diag_times_sparse(Dt, temp_n); /* Dt = W^{-1/2} Dt */

  Dkt 	 = cs_transpose(Dk, 1);
  Dt_qr  = glmgen_qr(Dt);
  Dkt_qr = glmgen_qr(Dkt);
  DktDk  = cs_multiply(Dkt,Dk);


  /* Determine the maximum lambda in the path */
  max_lam = tf_maxlam(n, y, Dt_qr, w);
  /* and if it is too small, return a trivial solution for Gaussian case */
  if (family == FAMILY_GAUSSIAN) {
    if (max_lam < 1e-12) {
      for (i=0; i<nlambda; i++) {
        for (j=0; j<n; j++) beta[i*n+j] = y[j];
        obj[i*(max_iter+1)] = 0;
        df[i] = n;
      }
      cs_spfree(D);
      cs_spfree(Dt);
      cs_spfree(Dk);
      cs_spfree(Dkt);
      cs_spfree(DktDk);
      glmgen_gqr_free(Dt_qr);
      glmgen_gqr_free(Dkt_qr);
      free(temp_n);
      free(beta_max);
      free(alpha);
      free(u);
      return;
    }
  }
  else {		
    max_lam += 1;
  }

  /* Initiate the path if needed using the input lambda_min_ratio and 
   * equally spaced points in log space. */
  if (!lam_flag) seq_logspace(max_lam,lambda_min_ratio,nlambda,lambda);

  /* Augmented Lagrangian parameter */
  rho = rho * pow((x[n-1] - x[0])/(double)(n-1), (double)k);
  
  /* Initiate alpha and u for a warm start */
  if (lambda[0] < max_lam * 1e-5)  
    for (i = 0; i < n - k; i++) alpha[i] = u[i] = 0;    
  else {
    /* beta_max */
    if (beta0 == NULL)
      calc_beta_max(y,w,n,Dt_qr,Dt,temp_n,beta_max);
    else
      memcpy(beta_max, beta0, n*sizeof(double));

    /* Check if beta = weighted mean(y) is better than beta */
    double yc = weighted_mean(y,w,n);
    for (i = 0; i < n; i++) temp_n[i] = yc;
    double obj1 = tf_obj(x,y,w,n,k,max_lam,family,beta_max,v);
    double obj2 = tf_obj(x,y,w,n,k,max_lam,family,temp_n,v);
    if(obj2 < obj1) memcpy(beta_max, temp_n, n*sizeof(double));

    /* alpha_max */

    if (tridiag && k>0)
    {
      tf_dx1(x, n, 1, beta_max, alpha + (n*k-n));
      for (j=k-1; j >= 1; j--)
        tf_dx1(x, n, k-j+1, alpha + (n*j), alpha + (n*j-n));      
    }
    else if (k>0)
      tf_dxtil(x, n, k, beta_max, alpha);

    /* u_max */    

    if (tridiag)
      for (j=0; j<k; j++) memset(u + (n*j), 0, (n-k+j) * sizeof(double)); 
    else {
      for (i = 0; i < n; i++) 
          u[i] = w[i] * (beta_max[i] - y[i]) / (rho * lambda[0]);

      if(family == FAMILY_LOGISTIC)
        for (i = 0; i < n; i++) u[i] *= logi_b2(beta_max[i]);
      else if(family == FAMILY_POISSON)
        for (i = 0; i < n; i++) u[i] *= pois_b2(beta_max[i]);
      glmgen_qrsol (Dkt_qr, u);
      // for (i = 0; i < n-k; i++) u[i] = 0;
    }
  }

  if (tridiag && k>0)
  {
    /* Setup tridiagonal systems */  
    A0 = (double*) malloc(n*k*sizeof(double));
    A1 = (double*) malloc(n*k*sizeof(double));

    for (j=2; j <= k; j++)
    {
      form_tridiag(x, n, k-j+2, 1, 1, A0+(n*j-n), A1+(n*j-n));
    }
  }  

  /* Iterate lower level functions over all lambda values;
   * the alpha and u vectors get used each time of subsequent
   * warm starts */
  for (i = 0; i < nlambda; i++)
  {    
    /* warm start */
    double *beta_init = (i == 0) ? beta_max : beta + (i-1)*n;
    for(j = 0; j < n; j++) beta[i*n + j] = beta_init[j];

    if (tridiag)
    {
      form_tridiag(x, n, 1, rho * lambda[i], 0, A0, A1);
      for (j=0; j < n; j++) A0[j] = A0[j] + w[j];
    }

    switch (family) {
      case FAMILY_GAUSSIAN:
        if (tridiag)        
          tf_admm_gauss_tri(x, y, w, n, k, max_iter, lambda[i], df+i, beta+i*n,
              alpha, u, obj+i*(1+max_iter), iter+i, rho * lambda[i],
              obj_tol, A0, A1, verbose);        
        else
          tf_admm_gauss(x, y, w, n, k, max_iter, lambda[i], df+i, beta+i*n,
              alpha, u, obj+i*(1+max_iter), iter+i, rho * lambda[i],
              obj_tol, DktDk, verbose);

        break;

      case FAMILY_LOGISTIC:
        tf_admm_glm(x, y, w, n, k, max_iter, lambda[i], tridiag, df+i, beta+i*n,
            alpha, u, obj+i*(1+max_iter_newton), iter+i, rho * lambda[i], obj_tol,
            obj_tol_newton, alpha_ls, gamma_ls, max_iter_ls, max_iter_newton,
            DktDk, A0, A1, &logi_b, &logi_b1, &logi_b2, verbose);
        break;

      case FAMILY_POISSON:
        tf_admm_glm(x, y, w, n, k, max_iter, lambda[i], tridiag, df+i, beta+i*n,
            alpha, u, obj+i*(1+max_iter_newton), iter+i, rho * lambda[i], obj_tol,
            obj_tol_newton, alpha_ls, gamma_ls, max_iter_ls, max_iter_newton,
            DktDk, A0, A1, &pois_b, &pois_b1, &pois_b2, verbose);
        break;

      default:
        printf("Unknown family, stopping calculation.\n");
        status[i] = 2;
    }
    

    /* If there any NaNs in beta: reset beta, alpha, u */
    if (has_nan(beta + i*n, n))
    {
      double yc = weighted_mean(y,w,n);
      switch(family) {
        case FAMILY_POISSON:
          yc = (yc > 0) ? log(yc) : -DBL_MAX;
          break;
        case FAMILY_LOGISTIC:
          yc = (yc > 0) ? ( yc < 1 ? log(yc/(1-yc)) : DBL_MAX) : -DBL_MAX;
          break;
        default: break;
      }
      for (j = 0; j < n; j++) beta[i*n + j] = yc;
      for (j = 0; j < n-k; j++) alpha[j] = 0;
      for (j = 0; j < n; j++) u[j] = w[j] * (beta[i*n+j] - y[j]) / (rho * lambda[i]);
      glmgen_qrsol (Dkt_qr, u);
      if (tridiag) for (j = 0; j < n*k; j++) alpha[j] = u[j] = 0;
      status[i] = 1;
    }
  }

  cs_spfree(D);
  cs_spfree(Dt);
  cs_spfree(Dk);
  cs_spfree(Dkt);
  cs_spfree(DktDk);
  glmgen_gqr_free(Dt_qr);
  glmgen_gqr_free(Dkt_qr);

  free(beta_max);
  free(temp_n);
  free(alpha);
  free(u);
  free(v);

  if (tridiag && k>0)
  {
    free(A0);
    free(A1);
  }
}