void
LOCA::BorderedSolver::HouseholderQR::applyCompactWY(
                const NOX::Abstract::MultiVector::DenseMatrix& Y1,
                const NOX::Abstract::MultiVector& Y2,
                const NOX::Abstract::MultiVector::DenseMatrix& T,
                NOX::Abstract::MultiVector::DenseMatrix& X1,
                NOX::Abstract::MultiVector& X2,
                bool isZeroX1, bool isZeroX2,
                bool useTranspose) const
{
  if (isZeroX1 && isZeroX2) {
    X1.putScalar(0.0);
    X2.init(0.0);
    return;
  }

  int m = Y2.numVectors();

  Teuchos::ETransp T_flag;
  if (useTranspose)
    T_flag = Teuchos::TRANS;
  else
    T_flag = Teuchos::NO_TRANS;

  NOX::Abstract::MultiVector::DenseMatrix tmp(m, X2.numVectors());

  // Compute Y1^T*X1 + Y2^T*X2
  if (!isZeroX2)
    X2.multiply(1.0, Y2, tmp);

  // Opportunity for optimization here since Y1 is a lower-triangular
  // matrix with unit diagonal
  if (!isZeroX2 && !isZeroX1)
    tmp.multiply(Teuchos::TRANS, Teuchos::NO_TRANS, 1.0, Y1, X1, 1.0);
  else if (!isZeroX1)
    tmp.multiply(Teuchos::TRANS, Teuchos::NO_TRANS, 1.0, Y1, X1, 0.0);

  // Compute op(T)*(Y1^T*X1 + Y2^T*X2)
  dblas.TRMM(Teuchos::LEFT_SIDE, Teuchos::UPPER_TRI, T_flag,
         Teuchos::NON_UNIT_DIAG, tmp.numRows(), tmp.numCols(), 1.0,
         T.values(), T.numRows(), tmp.values(), tmp.numRows());

  // Compute X1 = X1 + Y1*op(T)*(Y1^T*X1 + Y2^T*X2)
  // Opportunity for optimization here since Y1 is a lower-triangular
  // matrix with unit diagonal
  if (isZeroX1)
    X1.multiply(Teuchos::NO_TRANS, Teuchos::NO_TRANS, 1.0, Y1, tmp, 0.0);
  else
    X1.multiply(Teuchos::NO_TRANS, Teuchos::NO_TRANS, 1.0, Y1, tmp, 1.0);

  // Compute X2 = X2 + Y1*op(T)*(Y1^T*X1 + Y2^T*X2)
  if (isZeroX2)
    X2.update(Teuchos::NO_TRANS, 1.0, Y2, tmp, 0.0);
  else
    X2.update(Teuchos::NO_TRANS, 1.0, Y2, tmp, 1.0);
}
void
NOX::Thyra::MultiVector::
multiply(double alpha,
     const NOX::Abstract::MultiVector& y,
     NOX::Abstract::MultiVector::DenseMatrix& b) const
{
  const NOX::Thyra::MultiVector& yy =
    dynamic_cast<const NOX::Thyra::MultiVector&>(y);

  int m = b.numRows();
  int n = b.numCols();
  Teuchos::RCP< ::Thyra::MultiVectorBase<double> > bb =
    ::Thyra::createMembersView(
      yy.thyraMultiVec->domain(),
      RTOpPack::SubMultiVectorView<double>(0, m, 0, n,
        Teuchos::arcp(b.values(), 0, b.stride()*b.numCols(), false),
        b.stride()
        )
      );

  ::Thyra::apply(*yy.thyraMultiVec, ::Thyra::CONJTRANS, *thyraMultiVec,
                   bb.ptr(), alpha, 0.0);
}
NOX::Abstract::Group::ReturnType 
LOCA::BorderedSolver::LowerTriangularBlockElimination::
solve(Teuchos::ParameterList& params,
      const LOCA::BorderedSolver::AbstractOperator& op,
      const LOCA::MultiContinuation::ConstraintInterface& B,
      const NOX::Abstract::MultiVector::DenseMatrix& C,
      const NOX::Abstract::MultiVector* F,
      const NOX::Abstract::MultiVector::DenseMatrix* G,
      NOX::Abstract::MultiVector& X,
      NOX::Abstract::MultiVector::DenseMatrix& Y) const
{
  string callingFunction = 
    "LOCA::BorderedSolver::LowerTriangularBlockElimination::solve()";
  NOX::Abstract::Group::ReturnType finalStatus = NOX::Abstract::Group::Ok;
  NOX::Abstract::Group::ReturnType status;

  // Determine if X or Y is zero
  bool isZeroF = (F == NULL);
  bool isZeroG = (G == NULL);
  bool isZeroB = B.isDXZero();
  bool isZeroX = isZeroF;
  bool isZeroY = isZeroG && (isZeroB  || isZeroX);

  // First compute X
  if (isZeroX)
    X.init(0.0);
  else {
    // Solve X = J^-1 F, note F must be nonzero
    status = op.applyInverse(params, *F, X);
    finalStatus = 
      globalData->locaErrorCheck->combineAndCheckReturnTypes(status, 
							     finalStatus,
							     callingFunction);
  }

  // Now compute Y
  if (isZeroY)
    Y.putScalar(0.0);
  else {
    // Compute G - B^T*X and store in Y
    if (isZeroG) 
      B.multiplyDX(-1.0, X, Y);
    else {
      Y.assign(*G);
      if (!isZeroB && !isZeroX) {
	NOX::Abstract::MultiVector::DenseMatrix T(Y.numRows(),Y.numCols());
	B.multiplyDX(1.0, X, T);
	Y -= T;
      }
    }

    // Overwrite Y with Y = C^-1 * (G - B^T*X)
    NOX::Abstract::MultiVector::DenseMatrix M(C);
    int *ipiv = new int[M.numRows()];
    Teuchos::LAPACK<int,double> L;
    int info;
    L.GETRF(M.numRows(), M.numCols(), M.values(), M.stride(), ipiv, &info);
    if (info != 0) {
      status = NOX::Abstract::Group::Failed;
      finalStatus = 
	globalData->locaErrorCheck->combineAndCheckReturnTypes(
							      status, 
							      finalStatus,
							      callingFunction);
    }
    L.GETRS('N', M.numRows(), Y.numCols(), M.values(), M.stride(), ipiv, 
	    Y.values(), Y.stride(), &info);
    delete [] ipiv;
    if (info != 0) {
      status = NOX::Abstract::Group::Failed;
      finalStatus = 
	globalData->locaErrorCheck->combineAndCheckReturnTypes(
							     status, 
							     finalStatus,
							     callingFunction);
    }
  }

  return finalStatus;
}
void
LOCA::BorderedSolver::HouseholderQR::computeQR(
                const NOX::Abstract::MultiVector::DenseMatrix& C,
                const NOX::Abstract::MultiVector& B,
                bool use_c_transpose,
                NOX::Abstract::MultiVector::DenseMatrix& Y1,
                NOX::Abstract::MultiVector& Y2,
                NOX::Abstract::MultiVector::DenseMatrix& T,
                NOX::Abstract::MultiVector::DenseMatrix& R)
{
  double beta;
  int m = B.numVectors();

  // Initialize
  Y1.putScalar(0.0);
  T.putScalar(0.0);
  Y2 = B;
  if (use_c_transpose) {
    for (int i=0; i<m; i++)
      for (int j=0; j<m; j++)
    R(i,j) = C(j,i);        // Copy transpose of C into R
  }
  else
    R.assign(C);

  // A temporary vector
  Teuchos::RCP<NOX::Abstract::MultiVector> v2 = Y2.clone(1);

  Teuchos::RCP<NOX::Abstract::MultiVector::DenseMatrix> v1;
  Teuchos::RCP<NOX::Abstract::MultiVector> h2;
  Teuchos::RCP<NOX::Abstract::MultiVector::DenseMatrix> h1;
  Teuchos::RCP<NOX::Abstract::MultiVector> y2;
  Teuchos::RCP<NOX::Abstract::MultiVector::DenseMatrix> y1;
  Teuchos::RCP<NOX::Abstract::MultiVector::DenseMatrix> z;
  std::vector<int> h_idx;
  std::vector<int> y_idx;
  y_idx.reserve(m);

  for (int i=0; i<m; i++) {

    // Create view of column i of Y1 starting at row i
    v1 =
      Teuchos::rcp(new NOX::Abstract::MultiVector::DenseMatrix(Teuchos::View,
                                   Y1,
                                   m-i,
                                   1, i, i));

    // Create view of columns i through m-1 of Y2
    h_idx.resize(m-i);
    for (unsigned int j=0; j<h_idx.size(); j++)
      h_idx[j] = i+j;
    h2 = Y2.subView(h_idx);

    // Create view of columns i thru m-1 of R, starting at row i
    h1 =
      Teuchos::rcp(new NOX::Abstract::MultiVector::DenseMatrix(Teuchos::View,
                                   R,
                                   m-i,
                                   m-i,
                                   i, i));

    if (i > 0) {

      // Create view of columns 0 through i-1 of Y2
      y_idx.push_back(i-1);
      y2 = Y2.subView(y_idx);

      // Create view of columns 0 through i-1 of Y1, starting at row i
      y1 =
    Teuchos::rcp(new NOX::Abstract::MultiVector::DenseMatrix(Teuchos::View,
                                 Y1,
                                 m-i,
                                 i, i, 0));

      // Create view of column i, row 0 through i-1 of T
      z =
    Teuchos::rcp(new NOX::Abstract::MultiVector::DenseMatrix(Teuchos::View,
                                 T,
                                 i,
                                 1,
                                 0, i));
    }

    // Compute Householder Vector
    computeHouseholderVector(i, R, Y2, *v1, *v2, beta);

    // Apply Householder reflection
    applyHouseholderVector(*v1, *v2, beta, *h1, *h2);

    // Copy v2 into Y2
    Y2[i] = (*v2)[0];

    T(i,i) = -beta;

    if (i > 0) {

      // Compute z = y2^T * v2
      v2->multiply(1.0, *y2, *z);

      // Compute z = -beta * (z + y1^T * v1)
      z->multiply(Teuchos::TRANS, Teuchos::NO_TRANS, -beta, *y1, *v1, -beta);

      // Compute z = T * z
      dblas.TRMV(Teuchos::UPPER_TRI, Teuchos::NO_TRANS, Teuchos::NON_UNIT_DIAG,
         i, T.values(), m, z->values(), 1);

    }
  }

}