void
LOCA::BorderedSolver::HouseholderQR::applyCompactWY(
                const NOX::Abstract::MultiVector::DenseMatrix& Y1,
                const NOX::Abstract::MultiVector& Y2,
                const NOX::Abstract::MultiVector::DenseMatrix& T,
                NOX::Abstract::MultiVector::DenseMatrix& X1,
                NOX::Abstract::MultiVector& X2,
                bool isZeroX1, bool isZeroX2,
                bool useTranspose) const
{
  if (isZeroX1 && isZeroX2) {
    X1.putScalar(0.0);
    X2.init(0.0);
    return;
  }

  int m = Y2.numVectors();

  Teuchos::ETransp T_flag;
  if (useTranspose)
    T_flag = Teuchos::TRANS;
  else
    T_flag = Teuchos::NO_TRANS;

  NOX::Abstract::MultiVector::DenseMatrix tmp(m, X2.numVectors());

  // Compute Y1^T*X1 + Y2^T*X2
  if (!isZeroX2)
    X2.multiply(1.0, Y2, tmp);

  // Opportunity for optimization here since Y1 is a lower-triangular
  // matrix with unit diagonal
  if (!isZeroX2 && !isZeroX1)
    tmp.multiply(Teuchos::TRANS, Teuchos::NO_TRANS, 1.0, Y1, X1, 1.0);
  else if (!isZeroX1)
    tmp.multiply(Teuchos::TRANS, Teuchos::NO_TRANS, 1.0, Y1, X1, 0.0);

  // Compute op(T)*(Y1^T*X1 + Y2^T*X2)
  dblas.TRMM(Teuchos::LEFT_SIDE, Teuchos::UPPER_TRI, T_flag,
         Teuchos::NON_UNIT_DIAG, tmp.numRows(), tmp.numCols(), 1.0,
         T.values(), T.numRows(), tmp.values(), tmp.numRows());

  // Compute X1 = X1 + Y1*op(T)*(Y1^T*X1 + Y2^T*X2)
  // Opportunity for optimization here since Y1 is a lower-triangular
  // matrix with unit diagonal
  if (isZeroX1)
    X1.multiply(Teuchos::NO_TRANS, Teuchos::NO_TRANS, 1.0, Y1, tmp, 0.0);
  else
    X1.multiply(Teuchos::NO_TRANS, Teuchos::NO_TRANS, 1.0, Y1, tmp, 1.0);

  // Compute X2 = X2 + Y1*op(T)*(Y1^T*X1 + Y2^T*X2)
  if (isZeroX2)
    X2.update(Teuchos::NO_TRANS, 1.0, Y2, tmp, 0.0);
  else
    X2.update(Teuchos::NO_TRANS, 1.0, Y2, tmp, 1.0);
}
void 
LOCA::Extended::MultiVector::multiply(
			    double alpha, 
			    const LOCA::Extended::MultiVector& y,
			    NOX::Abstract::MultiVector::DenseMatrix& b) const
{
  // Verify dimensions are consistent
  if (y.numMultiVecRows != numMultiVecRows || y.numColumns != b.numRows() ||
      y.numScalarRows != numScalarRows || numColumns != b.numCols()) 
    globalData->locaErrorCheck->throwError(
  "LOCA::Extended::MultiVector::multiply()",
  "Size of supplied multivector/matrix is incompatible with this multivector");

  // Zero out b
  b.putScalar(0.0);

  // Create temporary matrix to hold product for each multivec
  NOX::Abstract::MultiVector::DenseMatrix tmp(b);

  // Compute and sum products for each multivec
  for (int i=0; i<numMultiVecRows; i++) {
    multiVectorPtrs[i]->multiply(alpha, *(y.multiVectorPtrs[i]), tmp);
    b += tmp;
  }

  // Compute and add in product for scalars
  if (numScalarRows > 0)
    b.multiply(Teuchos::TRANS, Teuchos::NO_TRANS, alpha, *y.scalarsPtr,
	       *scalarsPtr, 1.0);

}
void
LOCA::BorderedSolver::HouseholderQR::applyHouseholderVector(
               const NOX::Abstract::MultiVector::DenseMatrix& V1,
               const NOX::Abstract::MultiVector& V2,
               double beta,
               NOX::Abstract::MultiVector::DenseMatrix& A1,
               NOX::Abstract::MultiVector& A2)
{
  int nColsA = A2.numVectors();

  // Compute u = V2^T * A2
  NOX::Abstract::MultiVector::DenseMatrix u(1, nColsA);
  A2.multiply(1.0, V2, u);

  // Compute u = u + V1^T * A_P
  u.multiply(Teuchos::TRANS, Teuchos::NO_TRANS, 1.0, V1, A1, 1.0);

  // Compute A1 = A1 - b*V1*u
  A1.multiply(Teuchos::NO_TRANS, Teuchos::NO_TRANS, -beta, V1, u, 1.0);

  // Compute A2 = A2 - b*V2*u
  A2.update(Teuchos::NO_TRANS, -beta, V2, u, 1.0);
}