Beispiel #1
0
// barebones version of the lasso for fixed lambda
// Eigen library is used for linear algebra
// x .............. predictor matrix
// y .............. response
// lambda ......... penalty parameter
// useSubset ...... logical indicating whether lasso should be computed on a
//                  subset
// subset ......... indices of subset on which lasso should be computed
// normalize ...... logical indicating whether predictors should be normalized
// useIntercept ... logical indicating whether intercept should be included
// eps ............ small numerical value (effective zero)
// useGram ........ logical indicating whether Gram matrix should be computed
//                  in advance
// useCrit ........ logical indicating whether to compute objective function
void fastLasso(const MatrixXd& x, const VectorXd& y, const double& lambda,
		const bool& useSubset, const VectorXi& subset, const bool& normalize, 
    const bool& useIntercept, const double& eps, const bool& useGram, 
    const bool& useCrit,
    // intercept, coefficients, residuals and objective function are returned 
    // through the following parameters
    double& intercept, VectorXd& beta, VectorXd& residuals, double& crit) {

	// data initializations
	int n, p = x.cols();
	MatrixXd xs;
	VectorXd ys;
	if(useSubset) {
		n = subset.size();
		xs.resize(n, p);
		ys.resize(n);
		int s;
		for(int i = 0; i < n; i++) {
			s = subset(i);
			xs.row(i) = x.row(s);
			ys(i) = y(s);
		}
	} else {
		n = x.rows();
		xs = x;	// does this copy memory?
		ys = y;	// does this copy memory?
	}
	double rescaledLambda = n * lambda / 2;

	// center data and store means
	RowVectorXd meanX;
	double meanY;
	if(useIntercept) {
		meanX = xs.colwise().mean();	// columnwise means of predictors
		xs.rowwise() -= meanX;			// sweep out columnwise means
		meanY = ys.mean();				// mean of response
		for(int i = 0; i < n; i++) {
			ys(i) -= meanY;				// sweep out mean
		}
	} else {
		meanY = 0;		// just to avoid warning, this is never used
//		intercept = 0;	// zero intercept
	}

	// some initializations
	VectorXi inactive(p);	// inactive predictors
	int m = 0;				// number of inactive predictors
	VectorXi ignores;		// indicates variables to be ignored
	int s = 0;				// number of ignored variables

	// normalize predictors and store norms
	RowVectorXd normX;
  if(normalize) {
    normX = xs.colwise().norm();	// columnwise norms
	  double epsNorm = eps * sqrt(n);	// R package 'lars' uses n, not n-1
	  for(int j = 0; j < p; j++) {
		  if(normX(j) < epsNorm) {
			  // variance is too small: ignore variable
			  ignores.append(j, s);
			  s++;
			  // set norm to tolerance to avoid numerical problems
			  normX(j) = epsNorm;
		  } else {
			  inactive(m) = j;		// add variable to inactive set
			  m++;					// increase number of inactive variables
		  }
		  xs.col(j) /= normX(j);		// sweep out norm
	  }
	  // resize inactive set and update number of variables if necessary
	  if(m < p) {
		  inactive.conservativeResize(m);
		  p = m;
	  }
  } else {
    for(int j = 0; j < p; j++) inactive(j) = j;  // add variable to inactive set
    m = p;
  }

	// compute Gram matrix if requested (saves time if number of variables is
	// not too large)
	MatrixXd Gram;
	if(useGram) {
		Gram.noalias() = xs.transpose() * xs;
	}

	// further initializations for iterative steps
  RowVectorXd corY;
  corY.noalias() = ys.transpose() * xs; // current correlations
  beta.resize(p+s);                     // final coefficients

  // compute lasso solution
  if(p == 1) {

    // special case of only one variable (with sufficiently large norm)
    int j = inactive(0);          
    // set maximum step size in the direction of that variable
    double maxStep = corY(j);
    if(maxStep < 0) maxStep = -maxStep; // absolute value
    // compute coefficients for least squares solution
    VectorXd betaLS = xs.col(j).householderQr().solve(ys);
    // compute lasso coefficients
    beta.setZero();
    if(rescaledLambda < maxStep) {
      // interpolate towards least squares solution
      beta(j) = (maxStep - rescaledLambda) * betaLS(0) / maxStep;
    }

  } else {

    // further initializations for iterative steps
    VectorXi active;  // active predictors
  	int k = 0;        // number of active predictors
    // previous and current regression coefficients
  	VectorXd previousBeta = VectorXd::Zero(p+s), currentBeta = VectorXd::Zero(p+s);
  	// previous and current penalty parameter
    double previousLambda = 0, currentLambda = 0;
  	// indicates variables to be dropped
    VectorXi drops;
  	// keep track of sign of correlations for the active variables 
    // (double precision is necessary for solve())
    VectorXd signs;
  	// Cholesky L of Gram matrix of active variables
    MatrixXd L;
  	int rank = 0;		// rank of Cholesky L
    // maximum number of variables to be sequenced
  	int maxActive = findMaxActive(n, p, useIntercept);

  	// modified LARS algorithm for lasso solution
  	while(k < maxActive) {

  		// extract current correlations of inactive variables
  		VectorXd corInactiveY(m);
  		for(int j = 0; j < m; j++) {
  			corInactiveY(j) = corY(inactive(j));
  		}
  		// compute absolute values of correlations and find maximum
  		VectorXd absCorInactiveY = corInactiveY.cwiseAbs();
  		double maxCor = absCorInactiveY.maxCoeff();
  		// update current lambda
  		if(k == 0) {	// no active variables
  			previousLambda = maxCor;
  		} else {
  			previousLambda = currentLambda;
  		}
  		currentLambda = maxCor;
  		if(currentLambda <= rescaledLambda) break;

  		if(drops.size() == 0) {
  			// new active variables
  			VectorXi newActive = findNewActive(absCorInactiveY, maxCor - eps);
  			// do calculations for new active variables
  			for(int j = 0; j < newActive.size(); j++) {
  				// update Cholesky L of Gram matrix of active variables
  				// TODO: put this into void function
  				int newJ = inactive(newActive(j));
  				VectorXd xNewJ;
  				double newX;
  				if(useGram) {
  					newX = Gram(newJ, newJ);
  				} else {
  					xNewJ = xs.col(newJ);
  					newX = xNewJ.squaredNorm();
  				}
  				double normNewX = sqrt(newX);
  				if(k == 0) {	// no active variables, L is empty
  					L.resize(1,1);
  					L(0, 0) = normNewX;
  					rank = 1;
  				} else {
  					VectorXd oldX(k);
  					if(useGram) {
  						for(int j = 0; j < k; j++) {
  							oldX(j) = Gram(active(j), newJ);
  						}
  					} else {
  						for(int j = 0; j < k; j++) {
  							oldX(j) = xNewJ.dot(xs.col(active(j)));
  						}
  					}
  					VectorXd l = L.triangularView<Lower>().solve(oldX);
  					double lkk = newX - l.squaredNorm();
  					// check if L is machine singular
  					if(lkk > eps) {
  						// no singularity: update Cholesky L
  						lkk = sqrt(lkk);
  						rank++;
  						// add new row and column to Cholesky L
  						// this is a little trick: sometimes we need
  						// lower triangular matrix, sometimes upper
  						// hence we define quadratic matrix and use
  						// triangularView() to interpret matrix the
  						// correct way
  						L.conservativeResize(k+1, k+1);
  						for(int j = 0; j < k; j++) {
  							L(k, j) = l(j);
  							L(j, k) = l(j);
  						}
  						L(k,k) = lkk;
  					}
  				}
  				// add new variable to active set or drop it for good
  				// in case of singularity
  				if(rank == k) {
  					// singularity: drop variable for good
  					ignores.append(newJ, s);
  					s++;	// increase number of ignored variables
  					p--;	// decrease number of variables
  					if(p < maxActive) {
  						// adjust maximum number of active variables
  						maxActive = p;
  					}
  				} else {
  					// no singularity: add variable to active set
  					active.append(newJ, k);
  					// keep track of sign of correlation for new active variable
  					signs.append(sign(corY(newJ)), k);
  					k++;	// increase number of active variables
  				}
  			}
  			// remove new active or ignored variables from inactive variables
  			// and corresponding vector of current correlations
  			inactive.remove(newActive);
  			corInactiveY.remove(newActive);
  			m = inactive.size();	// update number of inactive variables
  		}
  		// prepare for computation of step size
  		// here double precision of signs is necessary
  		VectorXd b = L.triangularView<Lower>().solve(signs);
  		VectorXd G = L.triangularView<Upper>().solve(b);
  		// correlations of active variables with equiangular vector
  		double corActiveU = 1/sqrt(G.dot(signs));
  		// coefficients of active variables in linear combination forming the
  		// equiangular vector
  		VectorXd w = G * corActiveU;	// note that this has the right signs
  		// equiangular vector
  		VectorXd u;
  		if(!useGram) {
  			// we only need equiangular vector if we don't use the precomputed
  			// Gram matrix, otherwise we can compute the correlations directly
  			// from the Gram matrix
  			u = VectorXd::Zero(n);
  			for(int i = 0; i < n; i++) {
  				for(int j = 0; j < k; j++) {
  					u(i) += xs(i, active(j)) * w(j);
  				}
  			}
  		}
  		// compute step size in equiangular direction
  		double step;
  		if(k < maxActive) {
  			// correlations of inactive variables with equiangular vector
  			VectorXd corInactiveU(m);
  			if(useGram) {
  				for(int j = 0; j < m; j++) {
  					corInactiveU(j) = 0;
  					for(int i = 0; i < k; i++) {
  						corInactiveU(j) += w(i) * Gram(active(i), inactive(j));
  					}
  				}
  			} else {
  				for(int j = 0; j < m; j++) {
  					corInactiveU(j) = u.dot(xs.col(inactive(j)));
  				}
  			}
  			// compute step size in the direction of the equiangular vector
  			step = findStep(maxCor, corInactiveY, corActiveU, corInactiveU, eps);
  		} else {
  			// last step: take maximum possible step
  			step = maxCor/corActiveU;
  		}
  		// adjust step size if any sign changes and drop corresponding variables
  		drops = findDrops(currentBeta, active, w, eps, step);
  		// update current regression coefficients
  		previousBeta = currentBeta;
  		for(int j = 0; j < k; j++) {
  			currentBeta(active(j)) += step * w(j);
  		}
  		// update current correlations
  		if(useGram) {
  			// we also need to do this for active variables, since they may be
  			// dropped at a later stage
  			// TODO: computing a vector step * w in advance may save some computation time
  			for(int j = 0; j < Gram.cols(); j++) {
  				for(int i = 0; i < k; i++) {
  					corY(j) -= step * w(i) * Gram(active(i), j);
  				}
  			}
  		} else {
  			ys -= step * u;	// take step in equiangular direction
  			corY.noalias() = ys.transpose() * xs;	// update correlations
  		}
  		// drop variables if necessary
  		if(drops.size() > 0) {
  			// downdate Cholesky L
  			// TODO: put this into void function
  			for(int j = drops.size()-1; j >= 0; j--) {
  				// variables need to be dropped in descending order
  				int drop = drops(j);	// index with respect to active set
  				// modify upper triangular part as in R package 'lars'
  				// simply deleting columns is not enough, other computations
  				// necessary but complicated due to Fortran code
  				L.removeCol(drop);
  				VectorXd z = VectorXd::Constant(k, 1, 1);
  				k--;	// decrease number of active variables
  				for(int i = drop; i < k; i++) {
  					double a = L(i,i), b = L(i+1,i);
  					if(b != 0.0) {
  						// compute the rotation
  						double tau, s, c;
  						if(std::abs(b) > std::abs(a)) {
  							tau = -a/b;
  							s = 1.0/sqrt(1.0+tau*tau);
  							c = s * tau;
  						} else {
  							tau = -b/a;
  							c = 1.0/sqrt(1.0+tau*tau);
  							s = c * tau;
  						}
  						// update 'L' and 'z';
  						L(i,i) = c*a - s*b;
  						for(int j = i+1; j < k; j++) {
  							a = L(i,j);
  							b = L(i+1,j);
  							L(i,j) = c*a - s*b;
  							L(i+1,j) = s*a + c*b;
  						}
  						a = z(i);
  						b = z(i+1);
  						z(i) = c*a - s*b;
  						z(i+1) = s*a + c*b;
  					}
  				}
  				// TODO: removing all rows together may save some computation time
  				L.conservativeResize(k, NoChange);
  				rank--;
  			}
  			// mirror lower triangular part
  			for(int j = 0; j < k; j++) {
  				for(int i = j+1; i < k; i++) {
  					L(i,j) = L(j,i);
  				}
  			}
  			// add dropped variables to inactive set and make sure
  			// coefficients are 0
  			inactive.conservativeResize(m + drops.size());
  			for(int j = 0; j < drops.size(); j++) {
  				int newInactive = active(drops(j));
  				inactive(m + j) = newInactive;
  				currentBeta(newInactive) = 0;	// make sure coefficient is 0
  			}
  			m = inactive.size();	// update number of inactive variables
  			// drop variables from active set and sign vector
  			// number of active variables is already updated above
  			active.remove(drops);
  			signs.remove(drops);
  		}
  	}

  	// interpolate coefficients for given lambda
    if(k == 0) {
      // lambda larger than largest lambda from steps of LARS algorithm
  		beta.setZero();
    } else {
    	// penalty parameter within two steps
      if(k == maxActive) {
          // current coefficients are the least squares solution (in the 
          // high-dimensional case, as far along the solution path as possible)
          // current and previous values of the penalty parameter need to be 
          // reset for interpolation
          previousLambda = currentLambda;
          currentLambda = 0;
      }
      // interpolate coefficients
    	beta = ((rescaledLambda - currentLambda) * previousBeta +
  				(previousLambda - rescaledLambda) * currentBeta) /
  				(previousLambda - currentLambda);
    }
  }

	// transform coefficients back
  VectorXd normedBeta;
	if(normalize) {
    if(useCrit) normedBeta = beta;
    for(int j = 0; j < p; j++) beta(j) /= normX(j);
	}
	if(useIntercept) intercept = meanY - beta.dot(meanX);

  // compute residuals for all observations
  n = x.rows();
  residuals = y - x * beta;
  if(useIntercept) {
    for(int i = 0; i < n; i++) residuals(i) -= intercept;
  }

  // compute value of objective function on the subset
  if(useCrit && useSubset) {
    if(normalize) crit = objective(normedBeta, residuals, subset, lambda);
    else crit = objective(beta, residuals, subset, lambda);
  }
}