void
LOCA::Epetra::CompactWYOp::applyCompactWY(const Epetra_MultiVector& x, 
					  Epetra_MultiVector& result_x, 
					  Epetra_MultiVector& result_p) const
{
  // Compute Y_x^T*x
  result_p.Multiply('T', 'N', 1.0, *Y_x, x, 0.0);

  // Compute T*(Y_x^T*x)
  dblas.TRMM(Teuchos::LEFT_SIDE, Teuchos::UPPER_TRI, Teuchos::NO_TRANS, 
	     Teuchos::NON_UNIT_DIAG, result_p.MyLength(), 
	     result_p.NumVectors(), 1.0, T.Values(), T.MyLength(), 
	     result_p.Values(), result_p.MyLength());

  // Compute x = x + Y_x*T*(Y_x^T*x)
  result_x = x;
  result_x.Multiply('N', 'N', 1.0, *Y_x, result_p, 1.0);

  // Compute result_p = Y_p*T*(Y_x^T*x)
  dblas.TRMM(Teuchos::LEFT_SIDE, Teuchos::LOWER_TRI, Teuchos::NO_TRANS, 
	     Teuchos::UNIT_DIAG, result_p.MyLength(), 
	     result_p.NumVectors(), 1.0, Y_p.Values(), Y_p.MyLength(), 
	     result_p.Values(), result_p.MyLength());

}
int
Stokhos::MeanBasedPreconditioner::
ApplyInverse(const Epetra_MultiVector& Input, Epetra_MultiVector& Result) const
{
  int myBlockRows = epetraCijk->numMyRows();

  if (!use_block_apply) {
    EpetraExt::BlockMultiVector sg_input(View, *base_map, Input);
    EpetraExt::BlockMultiVector sg_result(View, *base_map, Result);
    for (int i=0; i<myBlockRows; i++) {
      mean_prec->ApplyInverse(*(sg_input.GetBlock(i)),
                              *(sg_result.GetBlock(i)));
    }
  }

  else {
    int m = Input.NumVectors();
    Epetra_MultiVector input_block(
      View, *base_map, Input.Values(), base_map->NumMyElements(),
      m*myBlockRows);
    Epetra_MultiVector result_block(
      View, *base_map, Result.Values(), base_map->NumMyElements(),
      m*myBlockRows);
    mean_prec->ApplyInverse(input_block, result_block);
  }

  return 0;
}
int 
Stokhos::ApproxJacobiPreconditioner::
ApplyInverse(const Epetra_MultiVector& Input, Epetra_MultiVector& Result) const
{
#ifdef STOKHOS_TEUCHOS_TIME_MONITOR
  TEUCHOS_FUNC_TIME_MONITOR("Stokhos: Total Approximate Jacobi Time");
#endif

  // We have to be careful if Input and Result are the same vector.
  // If this is the case, the only possible solution is to make a copy
  const Epetra_MultiVector *input = &Input;
  bool made_copy = false;
  if (Input.Values() == Result.Values()) {
    input = new Epetra_MultiVector(Input);
    made_copy = true;
  } 

  int m = input->NumVectors();
  if (rhs_block == Teuchos::null || rhs_block->NumVectors() != m)
    rhs_block = 
      Teuchos::rcp(new EpetraExt::BlockMultiVector(*base_map, *sg_map, m));

  // Extract blocks
  EpetraExt::BlockMultiVector input_block(View, *base_map, *input);
  EpetraExt::BlockMultiVector result_block(View, *base_map, Result);

  int myBlockRows = epetraCijk->numMyRows();
  result_block.PutScalar(0.0);
  for (int iter=0; iter<num_iter; iter++) {

    // Compute RHS
    if (iter == 0)
      rhs_block->Update(1.0, input_block, 0.0);
    else {
      mat_free_op->Apply(result_block, *rhs_block);
      rhs_block->Update(1.0, input_block, -1.0);
    }

    // Apply deterministic preconditioner
    for(int i=0; i<myBlockRows; i++) {
#ifdef STOKHOS_TEUCHOS_TIME_MONITOR
      TEUCHOS_FUNC_TIME_MONITOR("Stokhos: Total AJ Deterministic Preconditioner Time");
#endif
      mean_prec->ApplyInverse(*(rhs_block->GetBlock(i)),
			      *(result_block.GetBlock(i)));
    }

  }

  if (made_copy)
    delete input;

  return 0; 
}
int EpetraOperator::ApplyInverse(const Epetra_MultiVector& X, Epetra_MultiVector& Y) const {
    try {
        // There is no rcpFromRef(const T&), so we need to do const_cast
        const Xpetra::EpetraMultiVector eX(rcpFromRef(const_cast<Epetra_MultiVector&>(X)));
        Xpetra::EpetraMultiVector       eY(rcpFromRef(Y));

        // Generally, we assume two different vectors, but AztecOO uses a single vector
        if (X.Values() == Y.Values()) {
            // X and Y point to the same memory, use an additional vector
            RCP<Xpetra::EpetraMultiVector> tmpY = Teuchos::rcp(new Xpetra::EpetraMultiVector(eY.getMap(), eY.getNumVectors()));

            // InitialGuessIsZero in MueLu::Hierarchy.Iterate() does not zero out components, it
            // only assumes that user provided an already zeroed out vector
            bool initialGuessZero = true;
            tmpY->putScalar(0.0);

            // apply one V-cycle as preconditioner
            Hierarchy_->Iterate(eX, 1, *tmpY, initialGuessZero);

            // deep copy solution from MueLu
            eY.update(1.0, *tmpY, 0.0);

        } else {
            // X and Y point to different memory, pass the vectors through

            // InitialGuessIsZero in MueLu::Hierarchy.Iterate() does not zero out components, it
            // only assumes that user provided an already zeroed out vector
            bool initialGuessZero = true;
            eY.putScalar(0.0);

            Hierarchy_->Iterate(eX, 1, eY, initialGuessZero);
        }

    } catch (std::exception& e) {
        //TODO: error msg directly on std::cerr?
        std::cerr << "Caught an exception in MueLu::EpetraOperator::ApplyInverse():" << std::endl
                  << e.what() << std::endl;
        return -1;
    }

    return 0;
}
int DoCopyMultiVector(double** matlabApr, const Epetra_MultiVector& A) {

  int ierr = 0;
  int length = A.GlobalLength();
  int numVectors = A.NumVectors();
  const Epetra_Comm & comm = A.Map().Comm();
  if (comm.MyPID()!=0) {
    if (A.MyLength()!=0) ierr = -1;
  }
  else {
    if (length!=A.MyLength()) ierr = -1;
    double* matlabAvalues = *matlabApr;
    double* Aptr = A.Values();
    memcpy((void *)matlabAvalues, (void *)Aptr, sizeof(*Aptr) * length * numVectors);
    *matlabApr += length;   
  }
  int ierrGlobal;
  comm.MinAll(&ierr, &ierrGlobal, 1); // If any processor has -1, all return -1
  return(ierrGlobal);
}
Exemple #6
0
int ARPACKm3::reSolve(int numEigen, Epetra_MultiVector &Q, double *lambda, int startingEV) {

  // Computes eigenvalues and the corresponding eigenvectors
  // of the generalized eigenvalue problem
  // 
  //      K X = M X Lambda
  // 
  // using ARPACK (mode 3).
  //
  // The convergence test is provided by ARPACK.
  //
  // Note that if M is not specified, then  K X = X Lambda is solved.
  // (using the mode for generalized eigenvalue problem).
  // 
  // Input variables:
  // 
  // numEigen  (integer) = Number of eigenmodes requested
  // 
  // Q (Epetra_MultiVector) = Initial search space
  //                   The number of columns of Q defines the size of search space (=NCV).
  //                   The rows of X are distributed across processors.
  //                   As a rule of thumb in ARPACK User's guide, NCV >= 2*numEigen.
  //                   At exit, the first numEigen locations contain the eigenvectors requested.
  // 
  // lambda (array of doubles) = Converged eigenvalues
  //                   The length of this array is equal to the number of columns in Q.
  //                   At exit, the first numEigen locations contain the eigenvalues requested.
  // 
  // startingEV (integer) = Number of eigenmodes already stored in Q
  //                   A linear combination of these vectors is made to define the starting
  //                   vector, placed in resid.
  //
  // Return information on status of computation
  // 
  // info >=   0 >> Number of converged eigenpairs at the end of computation
  // 
  // // Failure due to input arguments
  // 
  // info = -  1 >> The stiffness matrix K has not been specified.
  // info = -  2 >> The maps for the matrix K and the matrix M differ.
  // info = -  3 >> The maps for the matrix K and the preconditioner P differ.
  // info = -  4 >> The maps for the vectors and the matrix K differ.
  // info = -  5 >> Q is too small for the number of eigenvalues requested.
  // info = -  6 >> Q is too small for the computation parameters.
  // 
  // info = -  8 >> numEigen must be smaller than the dimension of the matrix.
  //
  // info = - 30 >> MEMORY
  //
  // See ARPACK documentation for the meaning of INFO

  if (numEigen <= startingEV) {
    return numEigen;
  }

  int info = myVerify.inputArguments(numEigen, K, M, 0, Q, minimumSpaceDimension(numEigen));
  if (info < 0)
    return info;

  int myPid = MyComm.MyPID();

  int localSize = Q.MyLength();
  int NCV = Q.NumVectors();
  int knownEV = 0;

  if (NCV > Q.GlobalLength()) {
    if (numEigen >= Q.GlobalLength()) {
      cerr << endl;
      cerr << " !! The number of requested eigenvalues must be smaller than the dimension";
      cerr << " of the matrix !!\n";
      cerr << endl;
      return -8;
    }
    NCV = Q.GlobalLength();
  }

  int localVerbose = verbose*(myPid == 0);

  // Define data for ARPACK
  highMem = (highMem > currentSize()) ? highMem : currentSize();

  int ido = 0;

  int lwI = 22 + NCV;
  int *wI = new (nothrow) int[lwI];
  if (wI == 0) {
    return -30;
  }
  memRequested += sizeof(int)*lwI/(1024.0*1024.0);

  int *iparam = wI;
  int *ipntr = wI + 11;
  int *select = wI + 22;

  int lworkl = NCV*(NCV+8);
  int lwD = lworkl + 4*localSize;
  double *wD = new (nothrow) double[lwD];
  if (wD == 0) {
    delete[] wI;
    return -30;
  }
  memRequested += sizeof(double)*(4*localSize+lworkl)/(1024.0*1024.0);

  double *pointer = wD;

  double *workl = pointer;
  pointer = pointer + lworkl;

  double *resid = pointer;
  pointer = pointer + localSize;

  double *workd = pointer;

  double *v = Q.Values();

  highMem = (highMem > currentSize()) ? highMem : currentSize();

  double sigma = 0.0;

  if (startingEV > 0) {
    // Define the initial starting vector
    memset(resid, 0, localSize*sizeof(double));
    for (int jj = 0; jj < startingEV; ++jj)
      for (int ii = 0; ii < localSize; ++ii)
         resid[ii] += v[ii + jj*localSize];
    info = 1;
  }

  iparam[1-1] = 1;
  iparam[3-1] = maxIterEigenSolve;
  iparam[7-1] = 3;

  // The fourth parameter forces to use the convergence test provided by ARPACK.
  // This requires a customization of ARPACK (provided by R. Lehoucq).

  iparam[4-1] = 0;

  Epetra_Vector v1(View, Q.Map(), workd);
  Epetra_Vector v2(View, Q.Map(), workd + localSize);
  Epetra_Vector v3(View, Q.Map(), workd + 2*localSize);

  double *vTmp = new (nothrow) double[localSize];
  if (vTmp == 0) {
    delete[] wI;
    delete[] wD;
    return -30;
  }
  memRequested += sizeof(double)*localSize/(1024.0*1024.0);

  highMem = (highMem > currentSize()) ? highMem : currentSize();

  if (localVerbose > 0) {
    cout << endl;
    cout << " *|* Problem: ";
    if (M) 
      cout << "K*Q = M*Q D ";
    else
      cout << "K*Q = Q D ";
    cout << endl;
    cout << " *|* Algorithm = ARPACK (mode 3)" << endl;
    cout << " *|* Number of requested eigenvalues = " << numEigen << endl;
    cout.precision(2);
    cout.setf(ios::scientific, ios::floatfield);
    cout << " *|* Tolerance for convergence = " << tolEigenSolve << endl;
    if (startingEV > 0)
      cout << " *|* User-defined starting vector (Combination of " << startingEV << " vectors)\n";
    cout << "\n -- Start iterations -- \n";
  }

#ifdef EPETRA_MPI
  Epetra_MpiComm *MPIComm = dynamic_cast<Epetra_MpiComm *>(const_cast<Epetra_Comm*>(&MyComm));
#endif

  timeOuterLoop -= MyWatch.WallTime();
  while (ido != 99) {

    highMem = (highMem > currentSize()) ? highMem : currentSize();

#ifdef EPETRA_MPI
    if (MPIComm)
      callFortran.PSAUPD(MPIComm->Comm(), &ido, 'G', localSize, which, numEigen, tolEigenSolve,
             resid, NCV, v, localSize, iparam, ipntr, workd, workl, lworkl, &info, localVerbose);
    else
      callFortran.SAUPD(&ido, 'G', localSize, which, numEigen, tolEigenSolve, resid, NCV, v, 
             localSize, iparam, ipntr, workd, workl, lworkl, &info, localVerbose);
#else
    callFortran.SAUPD(&ido, 'G', localSize, which, numEigen, tolEigenSolve, resid, NCV, v,
             localSize, iparam, ipntr, workd, workl, lworkl, &info, localVerbose);
#endif

    if (ido == -1) {
      // Apply the mass matrix      
      v3.ResetView(workd + ipntr[0] - 1);
      v1.ResetView(vTmp);
      timeMassOp -= MyWatch.WallTime();
      if (M)
        M->Apply(v3, v1);
      else
        memcpy(v1.Values(), v3.Values(), localSize*sizeof(double));
      timeMassOp += MyWatch.WallTime();
      massOp += 1;
      // Solve the stiffness problem
      v2.ResetView(workd + ipntr[1] - 1);
      timeStifOp -= MyWatch.WallTime();
      K->ApplyInverse(v1, v2);
      timeStifOp += MyWatch.WallTime();
      stifOp += 1;
      continue;
    } // if (ido == -1)

    if (ido == 1) {
      // Solve the stiffness problem
      v1.ResetView(workd + ipntr[2] - 1);
      v2.ResetView(workd + ipntr[1] - 1);
      timeStifOp -= MyWatch.WallTime();
      K->ApplyInverse(v1, v2);
      timeStifOp += MyWatch.WallTime();
      stifOp += 1;
      continue;
    } // if (ido == 1)

    if (ido == 2) {
      // Apply the mass matrix      
      v1.ResetView(workd + ipntr[0] - 1);
      v2.ResetView(workd + ipntr[1] - 1);
      timeMassOp -= MyWatch.WallTime();
      if (M)
        M->Apply(v1, v2);
      else
        memcpy(v2.Values(), v1.Values(), localSize*sizeof(double));
      timeMassOp += MyWatch.WallTime();
      massOp += 1;
      continue;
    } // if (ido == 2)

  } // while (ido != 99)
  timeOuterLoop += MyWatch.WallTime();
  highMem = (highMem > currentSize()) ? highMem : currentSize();

  if (info < 0) {
    if (myPid == 0) {
      cerr << endl;
      cerr << " Error with DSAUPD, info = " << info << endl;
      cerr << endl;
    }
  }
  else {

    // Compute the eigenvectors
    timePostProce -= MyWatch.WallTime();
#ifdef EPETRA_MPI
    if (MPIComm)
      callFortran.PSEUPD(MPIComm->Comm(), 1, 'A', select, lambda, v, localSize, sigma, 'G',
            localSize, which, numEigen, tolEigenSolve, resid, NCV, v, localSize, iparam, ipntr, 
            workd, workl, lworkl, &info);
    else
      callFortran.SEUPD(1, 'A', select, lambda, v, localSize, sigma, 'G', localSize, which,
            numEigen, tolEigenSolve, resid, NCV, v, localSize, iparam, ipntr, workd, workl,
            lworkl, &info);
#else
    callFortran.SEUPD(1, 'A', select, lambda, v, localSize, sigma, 'G', localSize, which,
          numEigen, tolEigenSolve, resid, NCV, v, localSize, iparam, ipntr, workd, workl,
          lworkl, &info);
#endif
    timePostProce += MyWatch.WallTime();
    highMem = (highMem > currentSize()) ? highMem : currentSize();

    // Treat the error
    if (info != 0) {
      if (myPid == 0) {
        cerr << endl;
        cerr << " Error with DSEUPD, info = " << info << endl;
        cerr << endl;
      }
    }

  } // if (info < 0)

  if (info == 0) {
    outerIter = iparam[3-1];
    knownEV = iparam[5-1];
    orthoOp = iparam[11-1];
  }

  delete[] wI;
  delete[] wD;
  delete[] vTmp;

  return (info == 0) ? knownEV : info;

}
Exemple #7
0
int BlockDACG::reSolve(int numEigen, Epetra_MultiVector &Q, double *lambda, int startingEV) {

  // Computes the smallest eigenvalues and the corresponding eigenvectors
  // of the generalized eigenvalue problem
  // 
  //      K X = M X Lambda
  // 
  // using a Block Deflation Accelerated Conjugate Gradient algorithm.
  //
  // Note that if M is not specified, then  K X = X Lambda is solved.
  //
  // Ref: P. Arbenz & R. Lehoucq, "A comparison of algorithms for modal analysis in the
  // absence of a sparse direct method", SNL, Technical Report SAND2003-1028J
  // With the notations of this report, the coefficient beta is defined as 
  //                 diag( H^T_{k} G_{k} ) / diag( H^T_{k-1} G_{k-1} )
  // 
  // Input variables:
  // 
  // numEigen  (integer) = Number of eigenmodes requested
  // 
  // Q (Epetra_MultiVector) = Converged eigenvectors
  //                   The number of columns of Q must be equal to numEigen + blockSize.
  //                   The rows of Q are distributed across processors.
  //                   At exit, the first numEigen columns contain the eigenvectors requested.
  // 
  // lambda (array of doubles) = Converged eigenvalues
  //                   At input, it must be of size numEigen + blockSize.
  //                   At exit, the first numEigen locations contain the eigenvalues requested.
  //
  // startingEV (integer) = Number of existing converged eigenmodes
  //
  // Return information on status of computation
  // 
  // info >=   0 >> Number of converged eigenpairs at the end of computation
  // 
  // // Failure due to input arguments
  // 
  // info = -  1 >> The stiffness matrix K has not been specified.
  // info = -  2 >> The maps for the matrix K and the matrix M differ.
  // info = -  3 >> The maps for the matrix K and the preconditioner P differ.
  // info = -  4 >> The maps for the vectors and the matrix K differ.
  // info = -  5 >> Q is too small for the number of eigenvalues requested.
  // info = -  6 >> Q is too small for the computation parameters.
  //
  // info = - 10 >> Failure during the mass orthonormalization
  // 
  // info = - 20 >> Error in LAPACK during the local eigensolve
  //
  // info = - 30 >> MEMORY
  //

  // Check the input parameters
  
  if (numEigen <= startingEV) {
    return startingEV;
  }

  int info = myVerify.inputArguments(numEigen, K, M, Prec, Q, numEigen + blockSize);
  if (info < 0)
    return info;

  int myPid = MyComm.MyPID();

  // Get the weight for approximating the M-inverse norm
  Epetra_Vector *vectWeight = 0;
  if (normWeight) {
    vectWeight = new Epetra_Vector(View, Q.Map(), normWeight);
  }

  int knownEV = startingEV;
  int localVerbose = verbose*(myPid==0);

  // Define local block vectors
  //
  // MX = Working vectors (storing M*X if M is specified, else pointing to X)
  // KX = Working vectors (storing K*X)
  //
  // R = Residuals
  //
  // H = Preconditioned residuals
  //
  // P = Search directions
  // MP = Working vectors (storing M*P if M is specified, else pointing to P)
  // KP = Working vectors (storing K*P)

  int xr = Q.MyLength();
  Epetra_MultiVector X(View, Q, numEigen, blockSize);
  X.Random();

  int tmp;
  tmp = (M == 0) ? 5*blockSize*xr : 7*blockSize*xr;

  double *work1 = new (nothrow) double[tmp]; 
  if (work1 == 0) {
    if (vectWeight)
      delete vectWeight;
    info = -30;
    return info;
  }
  memRequested += sizeof(double)*tmp/(1024.0*1024.0);

  highMem = (highMem > currentSize()) ? highMem : currentSize();

  double *tmpD = work1;

  Epetra_MultiVector KX(View, Q.Map(), tmpD, xr, blockSize);
  tmpD = tmpD + xr*blockSize;

  Epetra_MultiVector MX(View, Q.Map(), (M) ? tmpD : X.Values(), xr, blockSize);
  tmpD = (M) ? tmpD + xr*blockSize : tmpD;

  Epetra_MultiVector R(View, Q.Map(), tmpD, xr, blockSize);
  tmpD = tmpD + xr*blockSize;

  Epetra_MultiVector H(View, Q.Map(), tmpD, xr, blockSize);
  tmpD = tmpD + xr*blockSize;

  Epetra_MultiVector P(View, Q.Map(), tmpD, xr, blockSize);
  tmpD = tmpD + xr*blockSize;

  Epetra_MultiVector KP(View, Q.Map(), tmpD, xr, blockSize);
  tmpD = tmpD + xr*blockSize;

  Epetra_MultiVector MP(View, Q.Map(), (M) ? tmpD : P.Values(), xr, blockSize);

  // Define arrays
  //
  // theta = Store the local eigenvalues (size: 2*blockSize)
  // normR = Store the norm of residuals (size: blockSize)
  //
  // oldHtR = Store the previous H_i^T*R_i    (size: blockSize)
  // currentHtR = Store the current H_i^T*R_i (size: blockSize)
  //
  // MM = Local mass matrix              (size: 2*blockSize x 2*blockSize)
  // KK = Local stiffness matrix         (size: 2*blockSize x 2*blockSize)
  //
  // S = Local eigenvectors              (size: 2*blockSize x 2*blockSize)

  int lwork2;
  lwork2 = 5*blockSize + 12*blockSize*blockSize;
  double *work2 = new (nothrow) double[lwork2];
  if (work2 == 0) {
    if (vectWeight)
      delete vectWeight;
    delete[] work1;
    info = -30;
    return info;
  }

  highMem = (highMem > currentSize()) ? highMem : currentSize();

  tmpD = work2;

  double *theta = tmpD;
  tmpD = tmpD + 2*blockSize;

  double *normR = tmpD;
  tmpD = tmpD + blockSize;

  double *oldHtR = tmpD;
  tmpD = tmpD + blockSize;
  
  double *currentHtR = tmpD;
  tmpD = tmpD + blockSize;
  memset(currentHtR, 0, blockSize*sizeof(double));
  
  double *MM = tmpD;
  tmpD = tmpD + 4*blockSize*blockSize;

  double *KK = tmpD;
  tmpD = tmpD + 4*blockSize*blockSize;

  double *S = tmpD;

  memRequested += sizeof(double)*lwork2/(1024.0*1024.0);

  // Define an array to store the residuals history
  if (localVerbose > 2) {
    resHistory = new (nothrow) double[maxIterEigenSolve*blockSize];
    if (resHistory == 0) {
      if (vectWeight)
        delete vectWeight;
      delete[] work1;
      delete[] work2;
      info = -30;
      return info;
    }
    historyCount = 0;
  }

  // Miscellaneous definitions

  bool reStart = false;
  numRestart = 0;

  int localSize;
  int twoBlocks = 2*blockSize;
  int nFound = blockSize;
  int i, j;

  if (localVerbose > 0) {
    cout << endl;
    cout << " *|* Problem: ";
    if (M) 
      cout << "K*Q = M*Q D ";
    else
      cout << "K*Q = Q D ";
    if (Prec)
      cout << " with preconditioner";
    cout << endl;
    cout << " *|* Algorithm = DACG (block version)" << endl;
    cout << " *|* Size of blocks = " << blockSize << endl;
    cout << " *|* Number of requested eigenvalues = " << numEigen << endl;
    cout.precision(2);
    cout.setf(ios::scientific, ios::floatfield);
    cout << " *|* Tolerance for convergence = " << tolEigenSolve << endl;
    cout << " *|* Norm used for convergence: ";
    if (normWeight)
      cout << "weighted L2-norm with user-provided weights" << endl;
    else
      cout << "L^2-norm" << endl;
    if (startingEV > 0)
      cout << " *|* Input converged eigenvectors = " << startingEV << endl;
    cout << "\n -- Start iterations -- \n";
  }

  timeOuterLoop -= MyWatch.WallTime();
  for (outerIter = 1; outerIter <= maxIterEigenSolve; ++outerIter) {

    highMem = (highMem > currentSize()) ? highMem : currentSize();

    if ((outerIter == 1) || (reStart == true)) {

      reStart = false;
      localSize = blockSize;

      if (nFound > 0) {

        Epetra_MultiVector X2(View, X, blockSize-nFound, nFound);
        Epetra_MultiVector MX2(View, MX, blockSize-nFound, nFound);
        Epetra_MultiVector KX2(View, KX, blockSize-nFound, nFound);

        // Apply the mass matrix to X
        timeMassOp -= MyWatch.WallTime();
        if (M)
          M->Apply(X2, MX2);
        timeMassOp += MyWatch.WallTime();
        massOp += nFound;

        if (knownEV > 0) {
          // Orthonormalize X against the known eigenvectors with Gram-Schmidt
          // Note: Use R as a temporary work space
          Epetra_MultiVector copyQ(View, Q, 0, knownEV);
          timeOrtho -= MyWatch.WallTime();
          info = modalTool.massOrthonormalize(X, MX, M, copyQ, nFound, 0, R.Values());
          timeOrtho += MyWatch.WallTime();
          // Exit the code if the orthogonalization did not succeed
          if (info < 0) {
            info = -10;
            delete[] work1;
            delete[] work2;
            if (vectWeight)
              delete vectWeight;
            return info;
          }
        }

        // Apply the stiffness matrix to X
        timeStifOp -= MyWatch.WallTime();
        K->Apply(X2, KX2);
        timeStifOp += MyWatch.WallTime();
        stifOp += nFound;

      } // if (nFound > 0)

    } // if ((outerIter == 1) || (reStart == true))
    else {

      // Apply the preconditioner on the residuals
      if (Prec != 0) {
        timePrecOp -= MyWatch.WallTime();
        Prec->ApplyInverse(R, H);
        timePrecOp += MyWatch.WallTime();
        precOp += blockSize;
      }
      else {
        memcpy(H.Values(), R.Values(), xr*blockSize*sizeof(double));
      }

      // Compute the product H^T*R
      timeSearchP -= MyWatch.WallTime();      
      memcpy(oldHtR, currentHtR, blockSize*sizeof(double));
      H.Dot(R, currentHtR);
      // Define the new search directions
      if (localSize == blockSize) {
        P.Scale(-1.0, H);
        localSize = twoBlocks;
      } // if (localSize == blockSize)
      else {
        bool hasZeroDot = false;
        for (j = 0; j < blockSize; ++j) {
          if (oldHtR[j] == 0.0) {
            hasZeroDot = true;
            break; 
          }
          callBLAS.SCAL(xr, currentHtR[j]/oldHtR[j], P.Values() + j*xr);
        }
        if (hasZeroDot == true) {
          // Restart the computation when there is a null dot product
          if (localVerbose > 0) {
            cout << endl;
            cout << " !! Null dot product -- Restart the search space !!\n";
            cout << endl;
          }
          if (blockSize == 1) {
            X.Random();
            nFound = blockSize;
          }
          else {
            Epetra_MultiVector Xinit(View, X, j, blockSize-j);
            Xinit.Random();
            nFound = blockSize - j;
          } // if (blockSize == 1)
          reStart = true;
          numRestart += 1;
          info = 0;
          continue;
        }
        callBLAS.AXPY(xr*blockSize, -1.0, H.Values(), P.Values());
      } // if (localSize == blockSize)
      timeSearchP += MyWatch.WallTime();

      // Apply the mass matrix on P
      timeMassOp -= MyWatch.WallTime();
      if (M)
        M->Apply(P, MP);
      timeMassOp += MyWatch.WallTime();
      massOp += blockSize;

      if (knownEV > 0) {
        // Orthogonalize P against the known eigenvectors
        // Note: Use R as a temporary work space
        Epetra_MultiVector copyQ(View, Q, 0, knownEV);
        timeOrtho -= MyWatch.WallTime();
        modalTool.massOrthonormalize(P, MP, M, copyQ, blockSize, 1, R.Values());
        timeOrtho += MyWatch.WallTime();
      }

      // Apply the stiffness matrix to P
      timeStifOp -= MyWatch.WallTime();
      K->Apply(P, KP);
      timeStifOp += MyWatch.WallTime();
      stifOp += blockSize;

    } // if ((outerIter == 1) || (reStart == true))

    // Form "local" mass and stiffness matrices
    // Note: Use S as a temporary workspace
    timeLocalProj -= MyWatch.WallTime();
    modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, KX.Values(), xr,
                    KK, localSize, S);
    modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, MX.Values(), xr,
                    MM, localSize, S);
    if (localSize > blockSize) {
      modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, KP.Values(), xr,
                      KK + blockSize*localSize, localSize, S);
      modalTool.localProjection(blockSize, blockSize, xr, P.Values(), xr, KP.Values(), xr,
                      KK + blockSize*localSize + blockSize, localSize, S);
      modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, MP.Values(), xr,
                      MM + blockSize*localSize, localSize, S);
      modalTool.localProjection(blockSize, blockSize, xr, P.Values(), xr, MP.Values(), xr,
                      MM + blockSize*localSize + blockSize, localSize, S);
    } // if (localSize > blockSize)
    timeLocalProj += MyWatch.WallTime();

    // Perform a spectral decomposition
    timeLocalSolve -= MyWatch.WallTime();
    int nevLocal = localSize;
    info = modalTool.directSolver(localSize, KK, localSize, MM, localSize, nevLocal,
                                  S, localSize, theta, localVerbose,
                                  (blockSize == 1) ? 1: 0);
    timeLocalSolve += MyWatch.WallTime();

    if (info < 0) {
      // Stop when spectral decomposition has a critical failure
      break;
    }

    // Check for restarting
    if ((theta[0] < 0.0) || (nevLocal < blockSize)) {
      if (localVerbose > 0) {
        cout << " Iteration " << outerIter;
        cout << "- Failure for spectral decomposition - RESTART with new random search\n";
      }
      if (blockSize == 1) {
        X.Random();
        nFound = blockSize;
      }
      else {
        Epetra_MultiVector Xinit(View, X, 1, blockSize-1);
        Xinit.Random();
        nFound = blockSize - 1;
      } // if (blockSize == 1)
      reStart = true;
      numRestart += 1;
      info = 0;
      continue;
    } // if ((theta[0] < 0.0) || (nevLocal < blockSize))

    if ((localSize == twoBlocks) && (nevLocal == blockSize)) {
      for (j = 0; j < nevLocal; ++j)
        memcpy(S + j*blockSize, S + j*twoBlocks, blockSize*sizeof(double));
      localSize = blockSize;
    }

    // Check the direction of eigenvectors
    // Note: This sign check is important for convergence
    for (j = 0; j < nevLocal; ++j) {
      double coeff = S[j + j*localSize];
      if (coeff < 0.0)
        callBLAS.SCAL(localSize, -1.0, S + j*localSize);
    }

    // Compute the residuals
    timeResidual -= MyWatch.WallTime();
    callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, KX.Values(), xr,
                  S, localSize, 0.0, R.Values(), xr);
    if (localSize == twoBlocks) {
      callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, KP.Values(), xr,
                    S + blockSize, localSize, 1.0, R.Values(), xr);
    }
    for (j = 0; j < blockSize; ++j)
      callBLAS.SCAL(localSize, theta[j], S + j*localSize);
    callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, -1.0, MX.Values(), xr,
                  S, localSize, 1.0, R.Values(), xr);
    if (localSize == twoBlocks) {
      callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, -1.0, MP.Values(), xr,
                  S + blockSize, localSize, 1.0, R.Values(), xr);
    }
    for (j = 0; j < blockSize; ++j)
      callBLAS.SCAL(localSize, 1.0/theta[j], S + j*localSize);
    timeResidual += MyWatch.WallTime();
    
    // Compute the norms of the residuals
    timeNorm -= MyWatch.WallTime();
    if (vectWeight)
      R.NormWeighted(*vectWeight, normR);
    else
      R.Norm2(normR);
    // Scale the norms of residuals with the eigenvalues
    // Count the converged eigenvectors
    nFound = 0;
    for (j = 0; j < blockSize; ++j) {
      normR[j] = (theta[j] == 0.0) ? normR[j] : normR[j]/theta[j];
      if (normR[j] < tolEigenSolve) 
        nFound += 1;
    }
    timeNorm += MyWatch.WallTime();

    // Store the residual history
    if (localVerbose > 2) {
      memcpy(resHistory + historyCount*blockSize, normR, blockSize*sizeof(double));
      historyCount += 1;
    }

    // Print information on current iteration
    if (localVerbose > 0) {
      cout << " Iteration " << outerIter << " - Number of converged eigenvectors ";
      cout << knownEV + nFound << endl;
    }

    if (localVerbose > 1) {
      cout << endl;
      cout.precision(2);
      cout.setf(ios::scientific, ios::floatfield);
      for (i=0; i<blockSize; ++i) {
        cout << " Iteration " << outerIter << " - Scaled Norm of Residual " << i;
        cout << " = " << normR[i] << endl;
      }
      cout << endl;
      cout.precision(2);
      for (i=0; i<blockSize; ++i) {
        cout << " Iteration " << outerIter << " - Ritz eigenvalue " << i;
        cout.setf((fabs(theta[i]) < 0.01) ? ios::scientific : ios::fixed, ios::floatfield);
        cout << " = " << theta[i] << endl;
      }
      cout << endl;
    }

    if (nFound == 0) {
      // Update the spaces
      // Note: Use H as a temporary work space
      timeLocalUpdate -= MyWatch.WallTime();
      memcpy(H.Values(), X.Values(), xr*blockSize*sizeof(double));
      callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, H.Values(), xr, S, localSize,
                    0.0, X.Values(), xr);
      memcpy(H.Values(), KX.Values(), xr*blockSize*sizeof(double));
      callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, H.Values(), xr, S, localSize,
                    0.0, KX.Values(), xr);
      if (M) {
        memcpy(H.Values(), MX.Values(), xr*blockSize*sizeof(double));
        callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, H.Values(), xr, S, localSize,
                      0.0, MX.Values(), xr);
      }
      if (localSize == twoBlocks) {
        callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, P.Values(), xr,
                      S + blockSize, localSize, 1.0, X.Values(), xr);
        callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, KP.Values(), xr,
                      S + blockSize, localSize, 1.0, KX.Values(), xr);
        if (M) {
          callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, MP.Values(), xr,
                      S + blockSize, localSize, 1.0, MX.Values(), xr);
        }
      } // if (localSize == twoBlocks)
      timeLocalUpdate += MyWatch.WallTime();
      // When required, monitor some orthogonalities
      if (verbose > 2) {
        if (knownEV == 0) {
          accuracyCheck(&X, &MX, &R, 0, (localSize>blockSize) ? &P : 0);
        }
        else {
          Epetra_MultiVector copyQ(View, Q, 0, knownEV);
          accuracyCheck(&X, &MX, &R, &copyQ, (localSize>blockSize) ? &P : 0);
        }
      } // if (verbose > 2)
      continue;
    } // if (nFound == 0)

    // Order the Ritz eigenvectors by putting the converged vectors at the beginning
    int firstIndex = blockSize;
    for (j = 0; j < blockSize; ++j) {
      if (normR[j] >= tolEigenSolve) {
        firstIndex = j;
        break;
      }
    } // for (j = 0; j < blockSize; ++j)
    while (firstIndex < nFound) {
      for (j = firstIndex; j < blockSize; ++j) {
        if (normR[j] < tolEigenSolve) {
          // Swap the j-th and firstIndex-th position
          callFortran.SWAP(localSize, S + j*localSize, 1, S + firstIndex*localSize, 1);
          callFortran.SWAP(1, theta + j, 1, theta + firstIndex, 1);
          callFortran.SWAP(1, normR + j, 1, normR + firstIndex, 1);
          break;
        }
      } // for (j = firstIndex; j < blockSize; ++j)
      for (j = 0; j < blockSize; ++j) {
        if (normR[j] >= tolEigenSolve) {
          firstIndex = j;
          break;
        }
      } // for (j = 0; j < blockSize; ++j)
    } // while (firstIndex < nFound)

    // Copy the converged eigenvalues
    memcpy(lambda + knownEV, theta, nFound*sizeof(double));

    // Convergence test
    if (knownEV + nFound >= numEigen) {
      callBLAS.GEMM('N', 'N', xr, nFound, blockSize, 1.0, X.Values(), xr,
                    S, localSize, 0.0, R.Values(), xr);
      if (localSize > blockSize) {
        callBLAS.GEMM('N', 'N', xr, nFound, blockSize, 1.0, P.Values(), xr,
                      S + blockSize, localSize, 1.0, R.Values(), xr);
      }
      memcpy(Q.Values() + knownEV*xr, R.Values(), nFound*xr*sizeof(double));
      knownEV += nFound;
      if (localVerbose == 1) {
        cout << endl;
        cout.precision(2);
        cout.setf(ios::scientific, ios::floatfield);
        for (i=0; i<blockSize; ++i) {
          cout << " Iteration " << outerIter << " - Scaled Norm of Residual " << i;
          cout << " = " << normR[i] << endl;
        }
        cout << endl;
      }
      break;
    }

    // Store the converged eigenvalues and eigenvectors
    callBLAS.GEMM('N', 'N', xr, nFound, blockSize, 1.0, X.Values(), xr,
                  S, localSize, 0.0, Q.Values() + knownEV*xr, xr);
    if (localSize == twoBlocks) {
      callBLAS.GEMM('N', 'N', xr, nFound, blockSize, 1.0, P.Values(), xr,
                    S + blockSize, localSize, 1.0, Q.Values() + knownEV*xr, xr);
    }
    knownEV += nFound;

    // Define the restarting vectors
    timeRestart -= MyWatch.WallTime();
    int leftOver = (nevLocal < blockSize + nFound) ? nevLocal - nFound : blockSize;
    double *Snew = S + nFound*localSize;
    memcpy(H.Values(), X.Values(), blockSize*xr*sizeof(double));
    callBLAS.GEMM('N', 'N', xr, leftOver, blockSize, 1.0, H.Values(), xr,
                  Snew, localSize, 0.0, X.Values(), xr);
    memcpy(H.Values(), KX.Values(), blockSize*xr*sizeof(double));
    callBLAS.GEMM('N', 'N', xr, leftOver, blockSize, 1.0, H.Values(), xr,
                  Snew, localSize, 0.0, KX.Values(), xr);
    if (M) {
      memcpy(H.Values(), MX.Values(), blockSize*xr*sizeof(double));
      callBLAS.GEMM('N', 'N', xr, leftOver, blockSize, 1.0, H.Values(), xr,
                    Snew, localSize, 0.0, MX.Values(), xr);
    }
    if (localSize == twoBlocks) {
      callBLAS.GEMM('N', 'N', xr, leftOver, blockSize, 1.0, P.Values(), xr,
                    Snew+blockSize, localSize, 1.0, X.Values(), xr);
      callBLAS.GEMM('N', 'N', xr, leftOver, blockSize, 1.0, KP.Values(), xr,
                    Snew+blockSize, localSize, 1.0, KX.Values(), xr);
      if (M) {
        callBLAS.GEMM('N', 'N', xr, leftOver, blockSize, 1.0, MP.Values(), xr,
                      Snew+blockSize, localSize, 1.0, MX.Values(), xr);
      }
    } // if (localSize == twoBlocks)
    if (nevLocal < blockSize + nFound) {
      // Put new random vectors at the end of the block
      Epetra_MultiVector Xtmp(View, X, leftOver, blockSize - leftOver);
      Xtmp.Random();
    }
    else {
      nFound = 0;
    } // if (nevLocal < blockSize + nFound)
    reStart = true;
    timeRestart += MyWatch.WallTime();

  } // for (outerIter = 1; outerIter <= maxIterEigenSolve; ++outerIter)
  timeOuterLoop += MyWatch.WallTime();
  highMem = (highMem > currentSize()) ? highMem : currentSize();

  // Clean memory
  delete[] work1;
  delete[] work2;
  if (vectWeight)
    delete vectWeight;

  // Sort the eigenpairs
  timePostProce -= MyWatch.WallTime();
  if ((info == 0) && (knownEV > 0)) {
    mySort.sortScalars_Vectors(knownEV, lambda, Q.Values(), Q.MyLength());
  }
  timePostProce += MyWatch.WallTime();

  return (info == 0) ? knownEV : info;

}
int BlockPCGSolver::Solve(const Epetra_MultiVector &X, Epetra_MultiVector &Y, int blkSize) const {

  int xrow = X.MyLength();
  int xcol = X.NumVectors();
  int ycol = Y.NumVectors();

  int info = 0;
  int localVerbose = verbose*(MyComm.MyPID() == 0);
  double *valX = X.Values();
  int NB = 3 + callLAPACK.ILAENV(1, "hetrd", "u", blkSize);
  int lworkD = (blkSize > NB) ? blkSize*blkSize : NB*blkSize;
  int wSize = 4*blkSize*xrow + 3*blkSize + 2*blkSize*blkSize + lworkD;

  bool useY = true;
  if (ycol % blkSize != 0) {
    // Allocate an extra block to store the solutions
    wSize += blkSize*xrow;
    useY = false;
  }

  if (lWorkSpace < wSize) {
    delete[] workSpace;
    workSpace = new (std::nothrow) double[wSize];
    if (workSpace == 0) {
      info = -1;
      return info;
    }
    lWorkSpace = wSize;
  } // if (lWorkSpace < wSize)

  double *pointer = workSpace;

  // Array to store the matrix PtKP
  double *PtKP = pointer;
  pointer = pointer + blkSize*blkSize;

  // Array to store coefficient matrices
  double *coeff = pointer;
  pointer = pointer + blkSize*blkSize;

  // Workspace array
  double *workD = pointer;
  pointer = pointer + lworkD;

  // Array to store the eigenvalues of P^t K P
  double *da = pointer;
  pointer = pointer + blkSize;

  // Array to store the norms of right hand sides
  double *initNorm = pointer;
  pointer = pointer + blkSize;

  // Array to store the norms of residuals
  double *resNorm = pointer;
  pointer = pointer + blkSize;

  // Array to store the residuals
  double *valR = pointer;
  pointer = pointer + xrow*blkSize;
  Epetra_MultiVector R(View, X.Map(), valR, xrow, blkSize);

  // Array to store the preconditioned residuals
  double *valZ = pointer;
  pointer = pointer + xrow*blkSize;
  Epetra_MultiVector Z(View, X.Map(), valZ, xrow, blkSize);

  // Array to store the search directions
  double *valP = pointer;
  pointer = pointer + xrow*blkSize;
  Epetra_MultiVector P(View, X.Map(), valP, xrow, blkSize);

  // Array to store the image of the search directions
  double *valKP = pointer;
  pointer = pointer + xrow*blkSize;
  Epetra_MultiVector KP(View, X.Map(), valKP, xrow, blkSize);

  // Pointer to store the solutions
  double *valSOL = (useY == true) ? Y.Values() : pointer;

  int iRHS;
  for (iRHS = 0; iRHS < xcol; iRHS += blkSize) {

    int numVec = (iRHS + blkSize < xcol) ? blkSize : xcol - iRHS;

    // Set the initial residuals to the right hand sides
    if (numVec < blkSize) {
      R.Random();
    }
    memcpy(valR, valX + iRHS*xrow, numVec*xrow*sizeof(double));

    // Set the initial guess to zero
    valSOL = (useY == true) ? Y.Values() + iRHS*xrow : valSOL;
    Epetra_MultiVector SOL(View, X.Map(), valSOL, xrow, blkSize);
    SOL.PutScalar(0.0);

    int ii = 0;
    int iter = 0;
    int nFound = 0;

    R.Norm2(initNorm);

    if (localVerbose > 1) {
      std::cout << std::endl;
      std::cout << " Vectors " << iRHS << " to " << iRHS + numVec - 1 << std::endl;
      if (localVerbose > 2) {
        std::fprintf(stderr,"\n");
        for (ii = 0; ii < numVec; ++ii) {
          std::cout << " ... Initial Residual Norm " << ii << " = " << initNorm[ii] << std::endl;
        }
        std::cout << std::endl;
      }
    }

    // Iteration loop
    for (iter = 1; iter <= iterMax; ++iter) {

      // Apply the preconditioner
      if (Prec)
        Prec->ApplyInverse(R, Z);
      else
        Z = R;

      // Define the new search directions
      if (iter == 1) {
        P = Z;
      }
      else {
        // Compute P^t K Z
        callBLAS.GEMM(Teuchos::TRANS, Teuchos::NO_TRANS, blkSize, blkSize, xrow, 1.0, KP.Values(), xrow, Z.Values(), xrow,
                      0.0, workD, blkSize);
        MyComm.SumAll(workD, coeff, blkSize*blkSize);

        // Compute the coefficient (P^t K P)^{-1} P^t K Z
        callBLAS.GEMM(Teuchos::TRANS, Teuchos::NO_TRANS, blkSize, blkSize, blkSize, 1.0, PtKP, blkSize, coeff, blkSize,
                      0.0, workD, blkSize);
        for (ii = 0; ii < blkSize; ++ii)
          callBLAS.SCAL(blkSize, da[ii], workD + ii, blkSize);
        callBLAS.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, blkSize, blkSize, blkSize, 1.0, PtKP, blkSize, workD, blkSize,
                      0.0, coeff, blkSize);

        // Update the search directions
        // Note: Use KP as a workspace
        memcpy(KP.Values(), P.Values(), xrow*blkSize*sizeof(double));
        callBLAS.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, xrow, blkSize, blkSize, 1.0, KP.Values(), xrow, coeff, blkSize,
                      0.0, P.Values(), xrow);

        P.Update(1.0, Z, -1.0);

      } // if (iter == 1)

      K->Apply(P, KP);

      // Compute P^t K P
      callBLAS.GEMM(Teuchos::TRANS, Teuchos::NO_TRANS, blkSize, blkSize, xrow, 1.0, P.Values(), xrow, KP.Values(), xrow,
                    0.0, workD, blkSize);
      MyComm.SumAll(workD, PtKP, blkSize*blkSize);

      // Eigenvalue decomposition of P^t K P
      callLAPACK.SYEV('V', 'U', blkSize, PtKP, blkSize, da, workD, lworkD, &info);
      if (info) {
        // Break the loop as spectral decomposition failed
        break;
      } // if (info)

      // Compute the pseudo-inverse of the eigenvalues
      for (ii = 0; ii < blkSize; ++ii) {
        TEUCHOS_TEST_FOR_EXCEPTION(da[ii] < 0.0, std::runtime_error, "Negative "
                           "eigenvalue for P^T K P: da[" << ii << "] = "
                           << da[ii] << ".");
        da[ii] = (da[ii] == 0.0) ? 0.0 : 1.0/da[ii];
      } // for (ii = 0; ii < blkSize; ++ii)

      // Compute P^t R
      callBLAS.GEMM(Teuchos::TRANS, Teuchos::NO_TRANS, blkSize, blkSize, xrow, 1.0, P.Values(), xrow, R.Values(), xrow,
                    0.0, workD, blkSize);
      MyComm.SumAll(workD, coeff, blkSize*blkSize);

      // Compute the coefficient (P^t K P)^{-1} P^t R
      callBLAS.GEMM(Teuchos::TRANS, Teuchos::NO_TRANS, blkSize, blkSize, blkSize, 1.0, PtKP, blkSize, coeff, blkSize,
                    0.0, workD, blkSize);
      for (ii = 0; ii < blkSize; ++ii)
        callBLAS.SCAL(blkSize, da[ii], workD + ii, blkSize);
      callBLAS.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, blkSize, blkSize, blkSize, 1.0, PtKP, blkSize, workD, blkSize,
                    0.0, coeff, blkSize);

      // Update the solutions
      callBLAS.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, xrow, blkSize, blkSize, 1.0, P.Values(), xrow, coeff, blkSize,
                    1.0, valSOL, xrow);

      // Update the residuals
      callBLAS.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, xrow, blkSize, blkSize, -1.0, KP.Values(), xrow, coeff, blkSize,
                    1.0, R.Values(), xrow);

      // Check convergence
      R.Norm2(resNorm);
      nFound = 0;
      for (ii = 0; ii < numVec; ++ii) {
        if (resNorm[ii] <= tolCG*initNorm[ii])
          nFound += 1;
      }

      if (localVerbose > 1) {
        std::cout << " Vectors " << iRHS << " to " << iRHS + numVec - 1;
        std::cout << " -- Iteration " << iter << " -- " << nFound << " converged vectors\n";
        if (localVerbose > 2) {
          std::cout << std::endl;
          for (ii = 0; ii < numVec; ++ii) {
            std::cout << " ... ";
            std::cout.width(5);
            std::cout << ii << " ... Residual = ";
            std::cout.precision(2);
            std::cout.setf(std::ios::scientific, std::ios::floatfield);
            std::cout << resNorm[ii] << " ... Right Hand Side = " << initNorm[ii] << std::endl;
          }
          std::cout << std::endl;
        }
      }

      if (nFound == numVec) {
        break;
      }

    }  // for (iter = 1; iter <= maxIter; ++iter)

    if (useY == false) {
      // Copy the solutions back into Y
      memcpy(Y.Values() + xrow*iRHS, valSOL, numVec*xrow*sizeof(double));
    }

    numSolve += nFound;

    if (nFound == numVec) {
      minIter = (iter < minIter) ? iter : minIter;
      maxIter = (iter > maxIter) ? iter : maxIter;
      sumIter += iter;
    }

  } // for (iRHS = 0; iRHS < xcol; iRHS += blkSize)

  return info;
}
int BlockPCGSolver::Solve(const Epetra_MultiVector &X, Epetra_MultiVector &Y) const {

  int info = 0;
  int localVerbose = verbose*(MyComm.MyPID() == 0);

  int xr = X.MyLength();

  int wSize = 3*xr;

  if (lWorkSpace < wSize) {
    if (workSpace)
      delete[] workSpace;
    workSpace = new (std::nothrow) double[wSize];
    if (workSpace == 0) {
      info = -1;
      return info;
    }
    lWorkSpace = wSize;
  } // if (lWorkSpace < wSize)

  double *pointer = workSpace;

  Epetra_Vector r(View, X.Map(), pointer);
  pointer = pointer + xr;

  Epetra_Vector p(View, X.Map(), pointer);
  pointer = pointer + xr;

  // Note: Kp and z uses the same memory space
  Epetra_Vector Kp(View, X.Map(), pointer);
  Epetra_Vector z(View, X.Map(), pointer);

  double tmp;
  double initNorm = 0.0, rNorm = 0.0, newRZ = 0.0, oldRZ = 0.0, alpha = 0.0;
  double tolSquare = tolCG*tolCG;

  memcpy(r.Values(), X.Values(), xr*sizeof(double));
  tmp = callBLAS.DOT(xr, r.Values(), 1, r.Values(), 1);
  MyComm.SumAll(&tmp, &initNorm, 1);

  Y.PutScalar(0.0);

  if (localVerbose > 1) {
    std::cout << std::endl;
    std::cout  << " --- PCG Iterations --- " << std::endl;
  }

  int iter;
  for (iter = 1; iter <= iterMax; ++iter) {

    if (Prec) {
      Prec->ApplyInverse(r, z);
    }
    else {
      memcpy(z.Values(), r.Values(), xr*sizeof(double));
    }

    if (iter == 1) {
      tmp = callBLAS.DOT(xr, r.Values(), 1, z.Values(), 1);
      MyComm.SumAll(&tmp, &newRZ, 1);
      memcpy(p.Values(), z.Values(), xr*sizeof(double));
    }
    else {
      oldRZ = newRZ;
      tmp = callBLAS.DOT(xr, r.Values(), 1, z.Values(), 1);
      MyComm.SumAll(&tmp, &newRZ, 1);
      p.Update(1.0, z, newRZ/oldRZ);
    }

    K->Apply(p, Kp);

    tmp = callBLAS.DOT(xr, p.Values(), 1, Kp.Values(), 1);
    MyComm.SumAll(&tmp, &alpha, 1);
    alpha = newRZ/alpha;

    TEUCHOS_TEST_FOR_EXCEPTION(alpha <= 0.0, std::runtime_error,
                         " !!! Non-positive value for p^TKp (" << alpha << ") !!!");

    callBLAS.AXPY(xr, alpha, p.Values(), 1, Y.Values(), 1);

    alpha *= -1.0;
    callBLAS.AXPY(xr, alpha, Kp.Values(), 1, r.Values(), 1);

    // Check convergence
    tmp = callBLAS.DOT(xr, r.Values(), 1, r.Values(), 1);
    MyComm.SumAll(&tmp, &rNorm, 1);

    if (localVerbose > 1) {
      std::cout  << "   Iter. " << iter;
      std::cout.precision(4);
      std::cout.setf(std::ios::scientific, std::ios::floatfield);
      std::cout << " Residual reduction " << std::sqrt(rNorm/initNorm) << std::endl;
    }

    if (rNorm <= tolSquare*initNorm)
      break;

  } // for (iter = 1; iter <= iterMax; ++iter)

  if (localVerbose == 1) {
    std::cout << std::endl;
    std::cout << " --- End of PCG solve ---" << std::endl;
    std::cout << "   Iter. " << iter;
    std::cout.precision(4);
    std::cout.setf(std::ios::scientific, std::ios::floatfield);
    std::cout << " Residual reduction " << std::sqrt(rNorm/initNorm) << std::endl;
    std::cout << std::endl;
  }

  if (localVerbose > 1) {
    std::cout << std::endl;
  }

  numSolve += 1;

  minIter = (iter < minIter) ? iter : minIter;
  maxIter = (iter > maxIter) ? iter : maxIter;
  sumIter += iter;

  return info;
}
Exemple #10
0
int Davidson::reSolve(int numEigen, Epetra_MultiVector &Q, double *lambda, int startingEV) {

  // Computes the smallest eigenvalues and the corresponding eigenvectors
  // of the generalized eigenvalue problem
  // 
  //      K X = M X Lambda
  // 
  // using a generalized Davidson algorithm
  //
  // Note that if M is not specified, then  K X = X Lambda is solved.
  // 
  // Input variables:
  // 
  // numEigen  (integer) = Number of eigenmodes requested
  // 
  // Q (Epetra_MultiVector) = Converged eigenvectors
  //                   The number of columns of Q must be at least numEigen + blockSize.
  //                   The rows of Q are distributed across processors.
  //                   At exit, the first numEigen columns contain the eigenvectors requested.
  // 
  // lambda (array of doubles) = Converged eigenvalues
  //                   At input, it must be of size numEigen + blockSize.
  //                   At exit, the first numEigen locations contain the eigenvalues requested.
  //
  // startingEV (integer) = Number of existing converged eigenvectors
  //                   We assume that the user has check the eigenvectors and 
  //                   their M-orthonormality.
  //
  // Return information on status of computation
  // 
  // info >=   0 >> Number of converged eigenpairs at the end of computation
  // 
  // // Failure due to input arguments
  // 
  // info = -  1 >> The stiffness matrix K has not been specified.
  // info = -  2 >> The maps for the matrix K and the matrix M differ.
  // info = -  3 >> The maps for the matrix K and the preconditioner P differ.
  // info = -  4 >> The maps for the vectors and the matrix K differ.
  // info = -  5 >> Q is too small for the number of eigenvalues requested.
  // info = -  6 >> Q is too small for the computation parameters.
  //
  // info = -  8 >> The number of blocks is too small for the number of eigenvalues.
  // 
  // info = - 10 >> Failure during the mass orthonormalization
  // 
  // info = - 30 >> MEMORY
  //

  // Check the input parameters
  
  if (numEigen <= startingEV) {
    return startingEV;
  }

  int info = myVerify.inputArguments(numEigen, K, M, Prec, Q, minimumSpaceDimension(numEigen));
  if (info < 0)
    return info;

  int myPid = MyComm.MyPID();

  if (numBlock*blockSize < numEigen) {
    if (myPid == 0) {
      cerr << endl;
      cerr << " !!! The space dimension (# of blocks x size of blocks) must be greater than ";
      cerr << " the number of eigenvalues !!!\n";
      cerr << " Number of blocks = " << numBlock << endl;
      cerr << " Size of blocks = " << blockSize << endl;
      cerr << " Number of eigenvalues = " << numEigen << endl;
      cerr << endl;
    }
    return -8;
  }

  // Get the weight for approximating the M-inverse norm
  Epetra_Vector *vectWeight = 0;
  if (normWeight) {
    vectWeight = new Epetra_Vector(View, Q.Map(), normWeight);
  }

  int knownEV = startingEV;
  int localVerbose = verbose*(myPid==0);

  // Define local block vectors
  //
  // MX = Working vectors (storing M*X if M is specified, else pointing to X)
  // KX = Working vectors (storing K*X)
  //
  // R = Residuals

  int xr = Q.MyLength();
  int dimSearch = blockSize*numBlock;

  Epetra_MultiVector X(View, Q, 0, dimSearch + blockSize);
  if (knownEV > 0) {
    Epetra_MultiVector copyX(View, Q, knownEV, blockSize);
    copyX.Random();
  }
  else {
    X.Random();
  }

  int tmp;
  tmp = (M == 0) ? 2*blockSize*xr : 3*blockSize*xr;

  double *work1 = new (nothrow) double[tmp]; 
  if (work1 == 0) {
    if (vectWeight)
      delete vectWeight;
    info = -30;
    return info;
  }
  memRequested += sizeof(double)*tmp/(1024.0*1024.0);

  highMem = (highMem > currentSize()) ? highMem : currentSize();

  double *tmpD = work1;

  Epetra_MultiVector KX(View, Q.Map(), tmpD, xr, blockSize);
  tmpD = tmpD + xr*blockSize;

  Epetra_MultiVector MX(View, Q.Map(), (M) ? tmpD : X.Values(), xr, blockSize);
  tmpD = (M) ? tmpD + xr*blockSize : tmpD;

  Epetra_MultiVector R(View, Q.Map(), tmpD, xr, blockSize);

  // Define arrays
  //
  // theta = Store the local eigenvalues (size: dimSearch)
  // normR = Store the norm of residuals (size: blockSize)
  //
  // KK = Local stiffness matrix         (size: dimSearch x dimSearch)
  //
  // S = Local eigenvectors              (size: dimSearch x dimSearch)
  //
  // tmpKK = Local workspace             (size: blockSize x blockSize)

  int lwork2 = blockSize + dimSearch + 2*dimSearch*dimSearch + blockSize*blockSize;
  double *work2 = new (nothrow) double[lwork2];
  if (work2 == 0) {
    if (vectWeight)
      delete vectWeight;
    delete[] work1;
    info = -30;
    return info;
  }

  memRequested += sizeof(double)*lwork2/(1024.0*1024.0);
  highMem = (highMem > currentSize()) ? highMem : currentSize();

  tmpD = work2;

  double *theta = tmpD;
  tmpD = tmpD + dimSearch;

  double *normR = tmpD;
  tmpD = tmpD + blockSize;

  double *KK = tmpD;
  tmpD = tmpD + dimSearch*dimSearch;
  memset(KK, 0, dimSearch*dimSearch*sizeof(double));

  double *S = tmpD;
  tmpD = tmpD + dimSearch*dimSearch;

  double *tmpKK = tmpD;

  // Define an array to store the residuals history
  if (localVerbose > 2) {
    resHistory = new (nothrow) double[maxIterEigenSolve*blockSize];
    spaceSizeHistory = new (nothrow) int[maxIterEigenSolve];
    if ((resHistory == 0) || (spaceSizeHistory == 0)) {
      if (vectWeight)
        delete vectWeight;
      delete[] work1;
      delete[] work2;
      info = -30;
      return info;
    }
    historyCount = 0;
  }

  // Miscellaneous definitions

  bool reStart = false;
  numRestart = 0;

  bool criticalExit = false;

  int bStart = 0;
  int offSet = 0;
  numBlock = (dimSearch/blockSize) - (knownEV/blockSize);

  int nFound = blockSize;
  int i, j;

  if (localVerbose > 0) {
    cout << endl;
    cout << " *|* Problem: ";
    if (M)
      cout << "K*Q = M*Q D ";
    else
      cout << "K*Q = Q D ";
    if (Prec)
      cout << " with preconditioner";
    cout << endl;
    cout << " *|* Algorithm = Davidson algorithm (block version)" << endl;
    cout << " *|* Size of blocks = " << blockSize << endl;
    cout << " *|* Largest size of search space = " << numBlock*blockSize << endl;
    cout << " *|* Number of requested eigenvalues = " << numEigen << endl;
    cout.precision(2);
    cout.setf(ios::scientific, ios::floatfield);
    cout << " *|* Tolerance for convergence = " << tolEigenSolve << endl;
    cout << " *|* Norm used for convergence: ";
    if (vectWeight)
      cout << "weighted L2-norm with user-provided weights" << endl;
    else
      cout << "L^2-norm" << endl;
    if (startingEV > 0)
      cout << " *|* Input converged eigenvectors = " << startingEV << endl;
    cout << "\n -- Start iterations -- \n";
  }

  int maxBlock = (dimSearch/blockSize) - (knownEV/blockSize);

  timeOuterLoop -= MyWatch.WallTime();
  outerIter = 0;
  while (outerIter <= maxIterEigenSolve) {

    highMem = (highMem > currentSize()) ? highMem : currentSize();

    int nb;
    for (nb = bStart; nb < maxBlock; ++nb) {

      outerIter += 1;
      if (outerIter > maxIterEigenSolve)
        break;

      int localSize = nb*blockSize;

      Epetra_MultiVector Xcurrent(View, X, localSize + knownEV, blockSize);

      timeMassOp -= MyWatch.WallTime();
      if (M)
        M->Apply(Xcurrent, MX);
      timeMassOp += MyWatch.WallTime();
      massOp += blockSize;

      // Orthonormalize X against the known eigenvectors and the previous vectors
      // Note: Use R as a temporary work space
      timeOrtho -= MyWatch.WallTime();
      if (nb == bStart) {
        if (nFound > 0) {
          if (knownEV == 0) {
            info = modalTool.massOrthonormalize(Xcurrent, MX, M, Q, nFound, 2, R.Values());
          }
          else {
            Epetra_MultiVector copyQ(View, X, 0, knownEV + localSize);
            info = modalTool.massOrthonormalize(Xcurrent, MX, M, copyQ, nFound, 0, R.Values());
          }
        }
        nFound = 0;
      }
      else {
        Epetra_MultiVector copyQ(View, X, 0, knownEV + localSize);
        info = modalTool.massOrthonormalize(Xcurrent, MX, M, copyQ, blockSize, 0, R.Values());
      }
      timeOrtho += MyWatch.WallTime();

      // Exit the code when the number of vectors exceeds the space dimension
      if (info < 0) {
        delete[] work1;
        delete[] work2;
        if (vectWeight)
          delete vectWeight;
        return -10;
      }

      timeStifOp -= MyWatch.WallTime();
      K->Apply(Xcurrent, KX);
      timeStifOp += MyWatch.WallTime();
      stifOp += blockSize;

      // Check the orthogonality properties of X
      if (verbose > 2) {
        if (knownEV + localSize == 0)
          accuracyCheck(&Xcurrent, &MX, 0);
        else {
          Epetra_MultiVector copyQ(View, X, 0, knownEV + localSize);
          accuracyCheck(&Xcurrent, &MX, &copyQ);
        }
        if (localVerbose > 0)
          cout << endl;
      } // if (verbose > 2)

      // Define the local stiffness matrix
      // Note: S is used as a workspace
      timeLocalProj -= MyWatch.WallTime();
      for (j = 0; j <= nb; ++j) {
        callBLAS.GEMM('T', 'N', blockSize, blockSize, xr,
                      1.0, X.Values()+(knownEV+j*blockSize)*xr, xr, KX.Values(), xr,
                      0.0, tmpKK, blockSize);
        MyComm.SumAll(tmpKK, S, blockSize*blockSize);
        int iC;
        for (iC = 0; iC < blockSize; ++iC) {
          double *Kpointer = KK + localSize*dimSearch + j*blockSize + iC*dimSearch;
          memcpy(Kpointer, S + iC*blockSize, blockSize*sizeof(double));
        }
      }
      timeLocalProj += MyWatch.WallTime();

      // Perform a spectral decomposition
      timeLocalSolve -= MyWatch.WallTime();
      int nevLocal = localSize + blockSize;
      info = modalTool.directSolver(localSize+blockSize, KK, dimSearch, 0, 0,
                                    nevLocal, S, dimSearch, theta, localVerbose, 10);
      timeLocalSolve += MyWatch.WallTime();

      if (info != 0) {
        // Stop as spectral decomposition has a critical failure
        if (info < 0) {
          criticalExit = true;
          break;
        }
        // Restart as spectral decomposition failed
        if (localVerbose > 0) {
          cout << " Iteration " << outerIter;
          cout << "- Failure for spectral decomposition - RESTART with new random search\n";
        }
        reStart = true;
        numRestart += 1;
        timeRestart -= MyWatch.WallTime();
        Epetra_MultiVector Xinit(View, X, knownEV, blockSize);
        Xinit.Random();
        timeRestart += MyWatch.WallTime();
        nFound = blockSize;
        bStart = 0;
        break;
      } // if (info != 0)

      // Update the search space
      // Note: Use KX as a workspace
      timeLocalUpdate -= MyWatch.WallTime();
      callBLAS.GEMM('N', 'N', xr, blockSize, localSize+blockSize, 1.0, X.Values()+knownEV*xr, xr,
                    S, dimSearch, 0.0, KX.Values(), xr);
      timeLocalUpdate += MyWatch.WallTime();

      // Apply the mass matrix for the next block
      timeMassOp -= MyWatch.WallTime();
      if (M)
        M->Apply(KX, MX);
      timeMassOp += MyWatch.WallTime();
      massOp += blockSize;

      // Apply the stiffness matrix for the next block
      timeStifOp -= MyWatch.WallTime();
      K->Apply(KX, R);
      timeStifOp += MyWatch.WallTime();
      stifOp += blockSize;

      // Form the residuals
      timeResidual -= MyWatch.WallTime();
      if (M) {
        for (j = 0; j < blockSize; ++j) {
          callBLAS.AXPY(xr, -theta[j], MX.Values() + j*xr, R.Values() + j*xr);
        }
      }
      else {
        // Note KX contains the updated block
        for (j = 0; j < blockSize; ++j) {
          callBLAS.AXPY(xr, -theta[j], KX.Values() + j*xr, R.Values() + j*xr);
        }
      }
      timeResidual += MyWatch.WallTime();
      residual += blockSize;

      // Compute the norm of residuals
      timeNorm -= MyWatch.WallTime();
      if (vectWeight) {
        R.NormWeighted(*vectWeight, normR);
      }
      else {
        R.Norm2(normR);
      }
      // Scale the norms of residuals with the eigenvalues
      // Count the number of converged eigenvectors
      nFound = 0;
      for (j = 0; j < blockSize; ++j) {
        normR[j] = (theta[j] == 0.0) ? normR[j] : normR[j]/theta[j];
        if (normR[j] < tolEigenSolve)
          nFound += 1;
      } // for (j = 0; j < blockSize; ++j)
      timeNorm += MyWatch.WallTime();

      // Store the residual history
      if (localVerbose > 2) {
        memcpy(resHistory + historyCount*blockSize, normR, blockSize*sizeof(double));
        spaceSizeHistory[historyCount] = localSize + blockSize;
        historyCount += 1;
      }
      maxSpaceSize = (maxSpaceSize > localSize+blockSize) ? maxSpaceSize : localSize+blockSize;
      sumSpaceSize += localSize + blockSize;

      // Print information on current iteration
      if (localVerbose > 0) {
        cout << " Iteration " << outerIter << " - Number of converged eigenvectors ";
        cout << knownEV + nFound << endl;
      } // if (localVerbose > 0)

      if (localVerbose > 1) {
        cout << endl;
        cout.precision(2);
        cout.setf(ios::scientific, ios::floatfield);
        for (i=0; i<blockSize; ++i) {
          cout << " Iteration " << outerIter << " - Scaled Norm of Residual " << i;
          cout << " = " << normR[i] << endl;
        }
        cout << endl;
        cout.precision(2);
        for (i=0; i<nevLocal; ++i) {
          cout << " Iteration " << outerIter << " - Ritz eigenvalue " << i;
          cout.setf((fabs(theta[i]) < 0.01) ? ios::scientific : ios::fixed, ios::floatfield);  
          cout << " = " << theta[i] << endl;
        }
        cout << endl;
      }

      // Exit the loop to treat the converged eigenvectors
      if (nFound > 0) {
        nb += 1;
        offSet = 0;
        break;
      }

      // Apply the preconditioner on the residuals
      // Note: Use KX as a workspace
      if (maxBlock == 1) {
        if (Prec) {
          timePrecOp -= MyWatch.WallTime();
          Prec->ApplyInverse(R, Xcurrent);
          timePrecOp += MyWatch.WallTime();
          precOp += blockSize;
        }
        else {
          memcpy(Xcurrent.Values(), R.Values(), blockSize*xr*sizeof(double));
        }
        timeRestart -= MyWatch.WallTime();
        Xcurrent.Update(1.0, KX, -1.0);
        timeRestart += MyWatch.WallTime();
        break;
      } // if (maxBlock == 1)

      if (nb == maxBlock - 1) {
        nb += 1;
        break;
      }

      Epetra_MultiVector Xnext(View, X, knownEV+localSize+blockSize, blockSize);
      if (Prec) {
        timePrecOp -= MyWatch.WallTime();
        Prec->ApplyInverse(R, Xnext);
        timePrecOp += MyWatch.WallTime();
        precOp += blockSize;
      }
      else {
        memcpy(Xnext.Values(), R.Values(), blockSize*xr*sizeof(double));
      }

    } // for (nb = bStart; nb < maxBlock; ++nb)

    if (outerIter > maxIterEigenSolve)
      break;

    if (reStart == true) {
      reStart = false;
      continue;
    }

    if (criticalExit == true)
      break;

    // Store the final converged eigenvectors
    if (knownEV + nFound >= numEigen) {
      for (j = 0; j < blockSize; ++j) {
        if (normR[j] < tolEigenSolve) {
          memcpy(X.Values() + knownEV*xr, KX.Values() + j*xr, xr*sizeof(double));
          lambda[knownEV] = theta[j];
          knownEV += 1;
        }
      }
      if (localVerbose == 1) {
        cout << endl;
        cout.precision(2);
        cout.setf(ios::scientific, ios::floatfield);
        for (i=0; i<blockSize; ++i) {
          cout << " Iteration " << outerIter << " - Scaled Norm of Residual " << i;
          cout << " = " << normR[i] << endl;
        }
        cout << endl;
      }  
      break;
    } // if (knownEV + nFound >= numEigen)

    // Treat the particular case of 1 block
    if (maxBlock == 1) {
      if (nFound > 0) {
        double *Xpointer = X.Values() + (knownEV+nFound)*xr;
        nFound = 0;
        for (j = 0; j < blockSize; ++j) {
          if (normR[j] < tolEigenSolve) {
            memcpy(X.Values() + knownEV*xr, KX.Values() + j*xr, xr*sizeof(double));
            lambda[knownEV] = theta[j];
            knownEV += 1;
            nFound += 1;
          }
          else {
            memcpy(Xpointer + (j-nFound)*xr, KX.Values() + j*xr, xr*sizeof(double));
          }
        }
        Epetra_MultiVector Xnext(View, X, knownEV + blockSize - nFound, nFound);
        Xnext.Random();
      }
      else {
        nFound = blockSize;
      }
      continue;
    }

    // Define the restarting block when maxBlock > 1
    if (nFound > 0) {
      int firstIndex = blockSize;
      for (j = 0; j < blockSize; ++j) {
        if (normR[j] >= tolEigenSolve) {
          firstIndex = j;
          break;
        }
      } // for (j = 0; j < blockSize; ++j)
      while (firstIndex < nFound) {
        for (j = firstIndex; j < blockSize; ++j) {
          if (normR[j] < tolEigenSolve) {
            // Swap the j-th and firstIndex-th position
            callFortran.SWAP(nb*blockSize, S + j*dimSearch, 1, S + firstIndex*dimSearch, 1);
            callFortran.SWAP(1, theta + j, 1, theta + firstIndex, 1);
            callFortran.SWAP(1, normR + j, 1, normR + firstIndex, 1);
            break;
          }
        } // for (j = firstIndex; j < blockSize; ++j)
        for (j = 0; j < blockSize; ++j) {
          if (normR[j] >= tolEigenSolve) {
            firstIndex = j;
            break;
          }
        } // for (j = 0; j < blockSize; ++j)
      } // while (firstIndex < nFound)

      // Copy the converged eigenvalues
      memcpy(lambda + knownEV, theta, nFound*sizeof(double));

    } // if (nFound > 0)

    // Define the restarting size
    bStart = ((nb - offSet) > 2) ? (nb - offSet)/2 : 0;

    // Define the restarting space and local stiffness
    timeRestart -= MyWatch.WallTime();
    memset(KK, 0, nb*blockSize*dimSearch*sizeof(double));
    for (j = 0; j < bStart*blockSize; ++j) {
      KK[j + j*dimSearch] = theta[j + nFound];
    }
    // Form the restarting space
    int oldCol = nb*blockSize;
    int newCol = nFound + (bStart+1)*blockSize;
    newCol = (newCol > oldCol) ? oldCol : newCol;
    callFortran.GEQRF(oldCol, newCol, S, dimSearch, theta, R.Values(), xr*blockSize, &info);
    callFortran.ORMQR('R', 'N', xr, oldCol, newCol, S, dimSearch, theta,
                      X.Values()+knownEV*xr, xr, R.Values(), blockSize*xr, &info);
    timeRestart += MyWatch.WallTime();

    if (nFound == 0)
      offSet += 1;

    knownEV += nFound;
    maxBlock = (dimSearch/blockSize) - (knownEV/blockSize);

    // Put random vectors if the Rayleigh Ritz vectors are not enough
    newCol = nFound + (bStart+1)*blockSize;
    if (newCol > oldCol) {
      Epetra_MultiVector Xnext(View, X, knownEV+blockSize-nFound, nFound);
      Xnext.Random();
      continue;
    }

    nFound = 0;

  } // while (outerIter <= maxIterEigenSolve)
  timeOuterLoop += MyWatch.WallTime();
  highMem = (highMem > currentSize()) ? highMem : currentSize();

  // Clean memory
  delete[] work1;
  delete[] work2;
  if (vectWeight)
    delete vectWeight;

  // Sort the eigenpairs
  timePostProce -= MyWatch.WallTime();
  if ((info == 0) && (knownEV > 0)) {
    mySort.sortScalars_Vectors(knownEV, lambda, Q.Values(), Q.MyLength());
  }
  timePostProce += MyWatch.WallTime();

  return (info == 0) ? knownEV : info;

}
int 
Stokhos::ApproxSchurComplementPreconditioner::
ApplyInverse(const Epetra_MultiVector& Input, Epetra_MultiVector& Result) const
{
#ifdef STOKHOS_TEUCHOS_TIME_MONITOR
  TEUCHOS_FUNC_TIME_MONITOR("Stokhos: Total Approximate Schur Complement Time");
#endif

  // We have to be careful if Input and Result are the same vector.
  // If this is the case, the only possible solution is to make a copy
  const Epetra_MultiVector *input = &Input;
  bool made_copy = false;
  if (Input.Values() == Result.Values()) {
    input = new Epetra_MultiVector(Input);
    made_copy = true;
  } 

  // Allocate temporary storage
  int m = input->NumVectors();
  if (rhs_block == Teuchos::null || rhs_block->NumVectors() != m)
    rhs_block = 
      Teuchos::rcp(new EpetraExt::BlockMultiVector(*base_map, *sg_map, m));
  if (tmp == Teuchos::null || tmp->NumVectors() != m*max_num_mat_vec)
    tmp = Teuchos::rcp(new Epetra_MultiVector(*base_map, 
					      m*max_num_mat_vec));
  j_ptr.resize(m*max_num_mat_vec);
  mj_indices.resize(m*max_num_mat_vec);
  
  // Extract blocks
  EpetraExt::BlockMultiVector input_block(View, *base_map, *input);
  EpetraExt::BlockMultiVector result_block(View, *base_map, Result);

  result_block.PutScalar(0.0);

  // Set right-hand-side to input_block
  rhs_block->Update(1.0, input_block, 0.0);

  // At level l, linear system has the structure
  // [ A_{l-1} B_l ][ u_l^{l-1} ] = [ r_l^{l-1} ]
  // [ C_l     D_l ][ u_l^l     ]   [ r_l^l     ]

  for (int l=P; l>=1; l--) {
    // Compute D_l^{-1} r_l^l
    divide_diagonal_block(block_indices[l], block_indices[l+1], 
			  *rhs_block, result_block);

    // Compute r_l^{l-1} = r_l^{l-1} - B_l D_l^{-1} r_l^l
    multiply_block(upper_block_Cijk[l], -1.0, result_block, *rhs_block);
  }

  // Solve A_0 u_0 = r_0
  divide_diagonal_block(0, 1, *rhs_block, result_block);

  for (int l=1; l<=P; l++) {
    // Compute r_l^l - C_l*u_l^{l-1}
    multiply_block(lower_block_Cijk[l], -1.0, result_block, *rhs_block);

    // Compute D_l^{-1} (r_l^l - C_l*u_l^{l-1})
    divide_diagonal_block(block_indices[l], block_indices[l+1], 
			  *rhs_block, result_block);
  }

  if (made_copy)
    delete input;

  return 0; 
}
int
Stokhos::MatrixFreeOperator::
Apply(const Epetra_MultiVector& Input, Epetra_MultiVector& Result) const
{
#ifdef STOKHOS_TEUCHOS_TIME_MONITOR
  TEUCHOS_FUNC_TIME_MONITOR("Stokhos: SG Operator Apply()");
#endif

  // Note for transpose:
  // The stochastic matrix is symmetric, however the matrix blocks may not
  // be.  So the algorithm here is the same whether we are using the transpose
  // or not.  We just apply the transpose of the blocks in the case of
  // applying the global transpose, and make sure the imported Input
  // vectors use the right map.

  // We have to be careful if Input and Result are the same vector.
  // If this is the case, the only possible solution is to make a copy
  const Epetra_MultiVector *input = &Input;
  bool made_copy = false;
  if (Input.Values() == Result.Values() && !is_stoch_parallel) {
    input = new Epetra_MultiVector(Input);
    made_copy = true;
  }

  // Initialize
  Result.PutScalar(0.0);

  const Epetra_Map* input_base_map = domain_base_map.get();
  const Epetra_Map* result_base_map = range_base_map.get();
  if (useTranspose == true) {
    input_base_map = range_base_map.get();
    result_base_map = domain_base_map.get();
  }

  // Allocate temporary storage
  int m = Input.NumVectors();
  if (useTranspose == false &&
      (tmp == Teuchos::null || tmp->NumVectors() != m*max_num_mat_vec))
    tmp = Teuchos::rcp(new Epetra_MultiVector(*result_base_map,
                                              m*max_num_mat_vec));
  else if (useTranspose == true &&
           (tmp_trans == Teuchos::null ||
            tmp_trans->NumVectors() != m*max_num_mat_vec))
    tmp_trans = Teuchos::rcp(new Epetra_MultiVector(*result_base_map,
                                                    m*max_num_mat_vec));
  Epetra_MultiVector *tmp_result;
  if (useTranspose == false)
    tmp_result = tmp.get();
  else
    tmp_result = tmp_trans.get();

  // Map input into column map
  const Epetra_MultiVector *tmp_col;
  if (!is_stoch_parallel)
    tmp_col = input;
  else {
    if (useTranspose == false) {
      if (input_col == Teuchos::null || input_col->NumVectors() != m)
        input_col = Teuchos::rcp(new Epetra_MultiVector(*global_col_map, m));
      input_col->Import(*input, *col_importer, Insert);
      tmp_col = input_col.get();
    }
    else {
      if (input_col_trans == Teuchos::null ||
          input_col_trans->NumVectors() != m)
        input_col_trans =
          Teuchos::rcp(new Epetra_MultiVector(*global_col_map_trans, m));
      input_col_trans->Import(*input, *col_importer_trans, Insert);
      tmp_col = input_col_trans.get();
    }
  }

  // Extract blocks
  EpetraExt::BlockMultiVector sg_input(View, *input_base_map, *tmp_col);
  EpetraExt::BlockMultiVector sg_result(View, *result_base_map, Result);
  for (int i=0; i<input_block.size(); i++)
    input_block[i] = sg_input.GetBlock(i);
  for (int i=0; i<result_block.size(); i++)
    result_block[i] = sg_result.GetBlock(i);

  // Apply block SG operator via
  // w_i =
  //    \sum_{j=0}^P \sum_{k=0}^L J_k v_j < \psi_i \psi_j \psi_k > / <\psi_i^2>
  // for i=0,...,P where P = expansion_size, L = num_blocks, w_j is the jth
  // input block, w_i is the ith result block, and J_k is the kth block operator

  // k_begin and k_end are initialized in the constructor
  const Teuchos::Array<double>& norms = sg_basis->norm_squared();
  for (Cijk_type::k_iterator k_it=k_begin; k_it!=k_end; ++k_it) {
    int k = index(k_it);
    Cijk_type::kj_iterator j_begin = Cijk->j_begin(k_it);
    Cijk_type::kj_iterator j_end = Cijk->j_end(k_it);
    int nj = Cijk->num_j(k_it);
    if (nj > 0) {
      Teuchos::Array<double*> j_ptr(nj*m);
      Teuchos::Array<int> mj_indices(nj*m);
      int l = 0;
      for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
        int j = index(j_it);
        for (int mm=0; mm<m; mm++) {
          j_ptr[l*m+mm] = (*input_block[j])[mm];
          mj_indices[l*m+mm] = l*m+mm;
        }
        l++;
      }
      Epetra_MultiVector input_tmp(View, *input_base_map, &j_ptr[0], nj*m);
      Epetra_MultiVector result_tmp(View, *tmp_result, &mj_indices[0], nj*m);
      if (use_block_apply) {
        (*block_ops)[k].Apply(input_tmp, result_tmp);
      }
      else {
        for (int jj=0; jj<nj*m; jj++)
          (*block_ops)[k].Apply(*(input_tmp(jj)), *(result_tmp(jj)));
      }
      l = 0;
      for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
        int j = index(j_it);
        for (Cijk_type::kji_iterator i_it = Cijk->i_begin(j_it);
             i_it != Cijk->i_end(j_it); ++i_it) {
          int i = index(i_it);
          double c = value(i_it);
          if (scale_op) {
            int i_gid;
            if (useTranspose)
              i_gid = epetraCijk->GCID(j);
            else
              i_gid = epetraCijk->GRID(i);
            c /= norms[i_gid];
          }
          for (int mm=0; mm<m; mm++)
            (*result_block[i])(mm)->Update(c, *result_tmp(l*m+mm), 1.0);
        }
        l++;
      }
    }
  }

  // Destroy blocks
  for (int i=0; i<input_block.size(); i++)
    input_block[i] = Teuchos::null;
  for (int i=0; i<result_block.size(); i++)
    result_block[i] = Teuchos::null;

  if (made_copy)
    delete input;

  return 0;
}
Exemple #13
0
int ModifiedARPACKm3::reSolve(int numEigen, Epetra_MultiVector &Q, double *lambda, 
                              int startingEV, const Epetra_MultiVector *orthoVec) {

  // Computes the smallest eigenvalues and the corresponding eigenvectors
  // of the generalized eigenvalue problem
  // 
  //      K X = M X Lambda
  // 
  // using ModifiedARPACK (mode 3).
  //
  // The convergence test is performed outisde of ARPACK
  //
  //                      || Kx - Mx lambda || < tol*lambda
  //
  // The norm ||.|| can be specified by the user through the array normWeight.
  // By default, the L2 Euclidean norm is used.
  //
  // Note that if M is not specified, then  K X = X Lambda is solved.
  // (using the mode for generalized eigenvalue problem).
  // 
  // Input variables:
  // 
  // numEigen  (integer) = Number of eigenmodes requested
  // 
  // Q (Epetra_MultiVector) = Initial search space
  //                   The number of columns of Q defines the size of search space (=NCV).
  //                   The rows of X are distributed across processors.
  //                   As a rule of thumb in ARPACK User's guide, NCV >= 2*numEigen.
  //                   At exit, the first numEigen locations contain the eigenvectors requested.
  // 
  // lambda (array of doubles) = Converged eigenvalues
  //                   The length of this array is equal to the number of columns in Q.
  //                   At exit, the first numEigen locations contain the eigenvalues requested.
  // 
  // startingEV (integer) = Number of eigenmodes already stored in Q
  //                   A linear combination of these vectors is made to define the starting
  //                   vector, placed in resid.
  //
  // orthoVec (Pointer to Epetra_MultiVector) = Space to be orthogonal to
  //                   The computation is performed in the orthogonal of the space spanned
  //                   by the columns vectors in orthoVec.
  //
  // Return information on status of computation
  // 
  // info >=   0 >> Number of converged eigenpairs at the end of computation
  // 
  // // Failure due to input arguments
  // 
  // info = -  1 >> The stiffness matrix K has not been specified.
  // info = -  2 >> The maps for the matrix K and the matrix M differ.
  // info = -  3 >> The maps for the matrix K and the preconditioner P differ.
  // info = -  4 >> The maps for the vectors and the matrix K differ.
  // info = -  5 >> Q is too small for the number of eigenvalues requested.
  // info = -  6 >> Q is too small for the computation parameters.
  // 
  // info = -  8 >> numEigen must be smaller than the dimension of the matrix.
  //
  // info = - 30 >> MEMORY
  //
  // See ARPACK documentation for the meaning of INFO

  if (numEigen <= startingEV) {
    return numEigen;
  }

  int info = myVerify.inputArguments(numEigen, K, M, 0, Q, minimumSpaceDimension(numEigen));
  if (info < 0)
    return info;

  int myPid = MyComm.MyPID();

  int localSize = Q.MyLength();
  int NCV = Q.NumVectors();
  int knownEV = 0;

  if (NCV > Q.GlobalLength()) {
    if (numEigen >= Q.GlobalLength()) {
      cerr << endl;
      cerr << " !! The number of requested eigenvalues must be smaller than the dimension";
      cerr << " of the matrix !!\n";
      cerr << endl;
      return -8;
    }
    NCV = Q.GlobalLength();
  }

  // Get the weight for approximating the M-inverse norm
  Epetra_Vector *vectWeight = 0;
  if (normWeight) {
    vectWeight = new Epetra_Vector(View, Q.Map(), normWeight);
  }

  int localVerbose = verbose*(myPid == 0);

  // Define data for ARPACK
  //
  // UH (10/17/03) Note that workl is also used 
  //               * to store the eigenvectors of the tridiagonal matrix
  //               * as a workspace for DSTEQR
  //               * as a workspace for recovering the global eigenvectors

  highMem = (highMem > currentSize()) ? highMem : currentSize();

  int ido = 0;

  int lwI = 22;
  int *wI = new (nothrow) int[lwI];
  if (wI == 0) {
    if (vectWeight)
      delete vectWeight;
    return -30;
  }
  memRequested += sizeof(int)*lwI/(1024.0*1024.0);

  int *iparam = wI;
  int *ipntr = wI + 11;

  int lworkl = NCV*(NCV+8);
  int lwD = lworkl + 4*localSize;
  double *wD = new (nothrow) double[lwD];
  if (wD == 0) {
    if (vectWeight)
      delete vectWeight;
    delete[] wI;
    return -30;
  }
  memRequested += sizeof(double)*(4*localSize+lworkl)/(1024.0*1024.0);

  double *pointer = wD;

  double *workl = pointer;
  pointer = pointer + lworkl;

  double *resid = pointer;
  pointer = pointer + localSize;

  double *workd = pointer;

  double *v = Q.Values();

  highMem = (highMem > currentSize()) ? highMem : currentSize();

  if (startingEV > 0) {
    // Define the initial starting vector
    memset(resid, 0, localSize*sizeof(double));
    for (int jj = 0; jj < startingEV; ++jj)
      for (int ii = 0; ii < localSize; ++ii)
         resid[ii] += v[ii + jj*localSize];
    info = 1;
  }

  iparam[1-1] = 1;
  iparam[3-1] = maxIterEigenSolve;
  iparam[7-1] = 3;

  // The fourth parameter forces to use the convergence test provided by ARPACK.
  // This requires a customization of ARPACK (provided by R. Lehoucq).

  iparam[4-1] = 1;

  Epetra_Vector v1(View, Q.Map(), workd);
  Epetra_Vector v2(View, Q.Map(), workd + localSize);
  Epetra_Vector v3(View, Q.Map(), workd + 2*localSize);

  // Define further storage for the new residual check
  // Use a block of vectors to compute the residuals more quickly.
  // Note that workd could be used if memory becomes an issue.
  int loopZ = (NCV > 10) ? 10 : NCV;

  int lwD2 = localSize + 2*NCV-1 + NCV;
  lwD2 += (M) ? 3*loopZ*localSize : 2*loopZ*localSize;
  double *wD2 = new (nothrow) double[lwD2];
  if (wD2 == 0) {
    if (vectWeight)
      delete vectWeight;
    delete[] wI;
    delete[] wD;
    return -30;
  }
  memRequested += sizeof(double)*lwD2/(1024.0*1024.0);

  pointer = wD2;
  
  // vTmp is used when ido = -1
  double *vTmp = pointer;
  pointer = pointer + localSize;

  // dd and ee stores the tridiagonal matrix.
  // Note that DSTEQR destroys the contents of the input arrays.
  double *dd = pointer;
  pointer = pointer + NCV;

  double *ee = pointer;
  pointer = pointer + NCV-1;

  double *vz = pointer;
  pointer = pointer + loopZ*localSize;
  Epetra_MultiVector approxEV(View, Q.Map(), vz, localSize, loopZ);

  double *kvz = pointer;
  pointer = pointer + loopZ*localSize;
  Epetra_MultiVector KapproxEV(View, Q.Map(), kvz, localSize, loopZ);

  double *mvz = (M) ? pointer : vz;
  pointer = (M) ? pointer + loopZ*localSize : pointer;
  Epetra_MultiVector MapproxEV(View, Q.Map(), mvz, localSize, loopZ);

  double *normR = pointer;

  // zz contains the eigenvectors of the tridiagonal matrix.
  // workt is a workspace for DSTEQR.
  // Note that zz and workt will use parts of workl.
  double *zz, *workt;

  highMem = (highMem > currentSize()) ? highMem : currentSize();

  // Define an array to store the residuals history
  if (localVerbose > 2) {
    resHistory = new (nothrow) double[maxIterEigenSolve*NCV];
    if (resHistory == 0) {
      if (vectWeight)
        delete vectWeight;
      delete[] wI;
      delete[] wD;
      delete[] wD2;
      return -30;
    }
    historyCount = 0;
  }

  highMem = (highMem > currentSize()) ? highMem : currentSize();

  if (localVerbose > 0) {
    cout << endl;
    cout << " *|* Problem: ";
    if (M) 
      cout << "K*Q = M*Q D ";
    else
      cout << "K*Q = Q D ";
    cout << endl;
    cout << " *|* Algorithm = ARPACK (Mode 3, modified such that user checks convergence)" << endl;
    cout << " *|* Number of requested eigenvalues = " << numEigen << endl;
    cout.precision(2);
    cout.setf(ios::scientific, ios::floatfield);
    cout << " *|* Tolerance for convergence = " << tolEigenSolve << endl;
    if (startingEV > 0)
      cout << " *|* User-defined starting vector (Combination of " << startingEV << " vectors)\n";
    cout << " *|* Norm used for convergence: ";
    if (normWeight)
      cout << "weighted L2-norm with user-provided weights" << endl;
    else
      cout << "L^2-norm" << endl;
    if (orthoVec)
      cout << " *|* Size of orthogonal subspace = " << orthoVec->NumVectors() << endl;
    cout << "\n -- Start iterations -- \n";
  }

#ifdef EPETRA_MPI
  Epetra_MpiComm *MPIComm = dynamic_cast<Epetra_MpiComm *>(const_cast<Epetra_Comm*>(&MyComm));
#endif

  timeOuterLoop -= MyWatch.WallTime();
  while (ido != 99) {

    highMem = (highMem > currentSize()) ? highMem : currentSize();

#ifdef EPETRA_MPI
    if (MPIComm)
      callFortran.PSAUPD(MPIComm->Comm(), &ido, 'G', localSize, "LM", numEigen, tolEigenSolve,
             resid, NCV, v, localSize, iparam, ipntr, workd, workl, lworkl, &info, 0);
    else
      callFortran.SAUPD(&ido, 'G', localSize, "LM", numEigen, tolEigenSolve, resid, NCV, v, 
             localSize, iparam, ipntr, workd, workl, lworkl, &info, 0);
#else
    callFortran.SAUPD(&ido, 'G', localSize, "LM", numEigen, tolEigenSolve, resid, NCV, v,
             localSize, iparam, ipntr, workd, workl, lworkl, &info, 0);
#endif

    if (ido == -1) {
      // Apply the mass matrix      
      v3.ResetView(workd + ipntr[0] - 1);
      v1.ResetView(vTmp);
      timeMassOp -= MyWatch.WallTime();
      if (M)
        M->Apply(v3, v1);
      else
        memcpy(v1.Values(), v3.Values(), localSize*sizeof(double));
      timeMassOp += MyWatch.WallTime();
      massOp += 1;
      if ((orthoVec) && (verbose > 3)) {
        // Check the orthogonality
        double maxDot = myVerify.errorOrthogonality(orthoVec, &v1, 0);
        if (myPid == 0) {
          cout << " Maximum Euclidean dot product against orthogonal space (Before Solve) = ";
          cout << maxDot << endl;
        }
      }
      // Solve the stiffness problem
      v2.ResetView(workd + ipntr[1] - 1);
      timeStifOp -= MyWatch.WallTime();
      K->ApplyInverse(v1, v2);
      timeStifOp += MyWatch.WallTime();
      stifOp += 1;
      // Project the solution vector if needed
      // Note: Use mvz as workspace
      if (orthoVec) {
        Epetra_Vector Mv2(View, v2.Map(), mvz);
        if (M)
          M->Apply(v2, Mv2);
        else
          memcpy(Mv2.Values(), v2.Values(), localSize*sizeof(double));
        modalTool.massOrthonormalize(v2, Mv2, M, *orthoVec, 1, 1);
      }
      if ((orthoVec) && (verbose > 3)) {
        // Check the orthogonality
        double maxDot = myVerify.errorOrthogonality(orthoVec, &v2, M);
        if (myPid == 0) {
          cout << " Maximum M-dot product against orthogonal space (After Solve) = ";
          cout << maxDot << endl;
        }
      }
      continue;
    } // if (ido == -1)

    if (ido == 1) {
      // Solve the stiffness problem
      v1.ResetView(workd + ipntr[2] - 1);
      v2.ResetView(workd + ipntr[1] - 1);
      if ((orthoVec) && (verbose > 3)) {
        // Check the orthogonality
        double maxDot = myVerify.errorOrthogonality(orthoVec, &v1, 0);
        if (myPid == 0) {
          cout << " Maximum Euclidean dot product against orthogonal space (Before Solve) = ";
          cout << maxDot << endl;
        }
      }
      timeStifOp -= MyWatch.WallTime();
      K->ApplyInverse(v1, v2);
      timeStifOp += MyWatch.WallTime();
      stifOp += 1;
      // Project the solution vector if needed
      // Note: Use mvz as workspace
      if (orthoVec) {
        Epetra_Vector Mv2(View, v2.Map(), mvz);
        if (M)
          M->Apply(v2, Mv2);
        else
          memcpy(Mv2.Values(), v2.Values(), localSize*sizeof(double));
        modalTool.massOrthonormalize(v2, Mv2, M, *orthoVec, 1, 1);
      }
      if ((orthoVec) && (verbose > 3)) {
        // Check the orthogonality
        double maxDot = myVerify.errorOrthogonality(orthoVec, &v2, M);
        if (myPid == 0) {
          cout << " Maximum M-dot product against orthogonal space (After Solve) = ";
          cout << maxDot << endl;
        }
      }
      continue;
    } // if (ido == 1)

    if (ido == 2) {
      // Apply the mass matrix      
      v1.ResetView(workd + ipntr[0] - 1);
      v2.ResetView(workd + ipntr[1] - 1);
      timeMassOp -= MyWatch.WallTime();
      if (M)
        M->Apply(v1, v2);
      else
        memcpy(v2.Values(), v1.Values(), localSize*sizeof(double));
      timeMassOp += MyWatch.WallTime();
      massOp += 1;
      continue;
    } // if (ido == 2)

    if (ido == 4) {
      timeResidual -= MyWatch.WallTime();
      // Copy the main diagonal of T
      memcpy(dd, workl + NCV + ipntr[4] - 1, NCV*sizeof(double));
      // Copy the lower diagonal of T
      memcpy(ee, workl + ipntr[4], (NCV-1)*sizeof(double));
      // Compute the eigenpairs of the tridiagonal matrix
      zz = workl + 4*NCV;
      workt = workl + 4*NCV + NCV*NCV;
      callFortran.STEQR('I', NCV, dd, ee, zz, NCV, workt, &info);
      if (info != 0) {
        if (localVerbose > 0) {
          cerr << endl;
          cerr << " Error with DSTEQR, info = " << info << endl;
          cerr << endl;
        }
        break;
      }
      // dd contains the eigenvalues in ascending order 
      // Check the residual of the proposed eigenvectors of (K, M)
      int ii, jz;
      iparam[4] = 0;
      for (jz = 0; jz < NCV; jz += loopZ) {
        int colZ = (jz + loopZ < NCV) ? loopZ : NCV - jz;
        callBLAS.GEMM('N', 'N', localSize, colZ, NCV, 1.0, v, localSize,
                      zz + jz*NCV, NCV, 0.0, vz, localSize);
        // Form the residuals
        if (M)
          M->Apply(approxEV, MapproxEV); 
        K->Apply(approxEV, KapproxEV); 
        for (ii = 0; ii < colZ; ++ii) {
          callBLAS.AXPY(localSize, -1.0/dd[ii+jz], MapproxEV.Values() + ii*localSize, 
                        KapproxEV.Values() + ii*localSize);
        }
        // Compute the norms of the residuals
        if (vectWeight) {
          KapproxEV.NormWeighted(*vectWeight, normR + jz);
        }
        else {
          KapproxEV.Norm2(normR + jz);
        }
        // Scale the norms of residuals with the eigenvalues
        for (ii = 0; ii < colZ; ++ii) {
          normR[ii+jz] = normR[ii+jz]*dd[ii+jz];
        }
        // Put the number of converged pairs in iparam[5-1]
        for (ii=0; ii<colZ; ++ii) {
          if (normR[ii+jz] < tolEigenSolve)
            iparam[4] += 1;
        }
      }
      timeResidual += MyWatch.WallTime();
      numResidual += NCV;
      outerIter += 1;
      if (localVerbose > 0) {
        cout << " Iteration " << outerIter;
        cout << " - Number of converged eigenvalues " << iparam[4] << endl;
      }
      if (localVerbose > 2) {
        memcpy(resHistory + historyCount, normR, NCV*sizeof(double));
        historyCount += NCV;
      }
      if (localVerbose > 1) {
        cout.precision(2);
        cout.setf(ios::scientific, ios::floatfield);
        for (ii=0; ii < NCV; ++ii) {
          cout << " Iteration " << outerIter;
          cout << " - Scaled Norm of Residual " << ii << " = " << normR[ii] << endl;
        }
        cout << endl;
        cout.precision(2);
        for (ii = 0; ii < NCV; ++ii) {
          cout << " Iteration " << outerIter << " - Ritz eigenvalue " << ii;
          cout.setf((fabs(dd[ii]) > 100) ? ios::scientific : ios::fixed, ios::floatfield);
          cout << " = " << 1.0/dd[ii] << endl;
        }
        cout << endl;
      }
    } // if (ido == 4)

  } // while (ido != 99)
  timeOuterLoop += MyWatch.WallTime();
  highMem = (highMem > currentSize()) ? highMem : currentSize();

  if (info < 0) {
    if (myPid == 0) {
      cerr << endl;
      cerr << " Error with DSAUPD, info = " << info << endl;
      cerr << endl;
    }
  }
  else {
    // Get the eigenvalues
    timePostProce -= MyWatch.WallTime();
    int ii, jj;
    double *pointer = workl + 4*NCV + NCV*NCV;
    for (ii=0; ii < localSize; ii += 3) {
      int nRow = (ii + 3 < localSize) ? 3 : localSize - ii;
      for (jj=0; jj<NCV; ++jj)
        memcpy(pointer + jj*nRow, v + ii + jj*localSize, nRow*sizeof(double));
      callBLAS.GEMM('N', 'N', nRow, NCV, NCV, 1.0, pointer, nRow, zz, NCV,
                    0.0, Q.Values() + ii, localSize);
    }
    // Put the converged eigenpairs at the beginning
    knownEV = 0;
    for (ii=0; ii < NCV; ++ii) {
      if (normR[ii] < tolEigenSolve) {
        lambda[knownEV] = 1.0/dd[ii];
        memcpy(Q.Values()+knownEV*localSize, Q.Values()+ii*localSize, localSize*sizeof(double));
        knownEV += 1;
        if (knownEV == Q.NumVectors())
          break;
      }
    }   
    // Sort the eigenpairs
    if (knownEV > 0) {
      mySort.sortScalars_Vectors(knownEV, lambda, Q.Values(), localSize);
    }
    timePostProce += MyWatch.WallTime();
  } // if (info < 0)

  if (info == 0) {
    orthoOp = iparam[11-1];
  }

  delete[] wI;
  delete[] wD;
  delete[] wD2;
  if (vectWeight)
    delete vectWeight;

  return (info == 0) ? knownEV : info;

}
int
Stokhos::ApproxGaussSeidelPreconditioner::
ApplyInverse(const Epetra_MultiVector& Input, Epetra_MultiVector& Result) const
{
#ifdef STOKHOS_TEUCHOS_TIME_MONITOR
  TEUCHOS_FUNC_TIME_MONITOR("Stokhos: Total Approximate Gauss-Seidel Time");
#endif

  // We have to be careful if Input and Result are the same vector.
  // If this is the case, the only possible solution is to make a copy
  const Epetra_MultiVector *input = &Input;
  bool made_copy = false;
  if (Input.Values() == Result.Values()) {
    input = new Epetra_MultiVector(Input);
    made_copy = true;
  }

  int m = input->NumVectors();
  if (mat_vec_tmp == Teuchos::null || mat_vec_tmp->NumVectors() != m)
    mat_vec_tmp = Teuchos::rcp(new Epetra_MultiVector(*base_map, m));
  if (rhs_block == Teuchos::null || rhs_block->NumVectors() != m)
    rhs_block =
      Teuchos::rcp(new EpetraExt::BlockMultiVector(*base_map, *sg_map, m));

  // Extract blocks
  EpetraExt::BlockMultiVector input_block(View, *base_map, *input);
  EpetraExt::BlockMultiVector result_block(View, *base_map, Result);

  result_block.PutScalar(0.0);

  int k_limit = sg_poly->size();
  if (only_use_linear)
    k_limit = sg_poly->basis()->dimension() + 1;
  const Teuchos::Array<double>& norms = sg_basis->norm_squared();

  rhs_block->Update(1.0, input_block, 0.0);

  for (Cijk_type::i_iterator i_it=Cijk->i_begin();
       i_it!=Cijk->i_end(); ++i_it) {
    int i = index(i_it);

    Teuchos::RCP<Epetra_MultiVector> res_i = result_block.GetBlock(i);
    {
      // Apply deterministic preconditioner
#ifdef STOKHOS_TEUCHOS_TIME_MONITOR
      TEUCHOS_FUNC_TIME_MONITOR("Stokhos: Total AGS Deterministic Preconditioner Time");
#endif
      mean_prec->ApplyInverse(*(rhs_block->GetBlock(i)), *res_i);
    }

    int i_gid = epetraCijk->GRID(i);
    for (Cijk_type::ik_iterator k_it = Cijk->k_begin(i_it);
         k_it != Cijk->k_end(i_it); ++k_it) {
      int k = index(k_it);
      if (k!=0 && k<k_limit) {
        bool do_mat_vec = false;
        for (Cijk_type::ikj_iterator j_it = Cijk->j_begin(k_it);
             j_it != Cijk->j_end(k_it); ++j_it) {
          int j = index(j_it);
          int j_gid = epetraCijk->GCID(j);
          if (j_gid > i_gid) {
            bool on_proc = epetraCijk->myGRID(j_gid);
            if (on_proc) {
              do_mat_vec = true;
              break;
            }
          }
        }
        if (do_mat_vec) {
          (*sg_poly)[k].Apply(*res_i, *mat_vec_tmp);
          for (Cijk_type::ikj_iterator j_it = Cijk->j_begin(k_it);
               j_it != Cijk->j_end(k_it); ++j_it) {
            int j = index(j_it);
            int j_gid = epetraCijk->GCID(j);
            double c = value(j_it);
            if (scale_op) {
              if (useTranspose)
                c /= norms[i_gid];
              else
                c /= norms[j_gid];
            }
            if (j_gid > i_gid) {
              bool on_proc = epetraCijk->myGRID(j_gid);
              if (on_proc) {
                rhs_block->GetBlock(j)->Update(-c, *mat_vec_tmp, 1.0);
              }
            }
          }
        }
      }
    }
  }

  // For symmetric Gauss-Seidel
  if (symmetric) {

    for (Cijk_type::i_reverse_iterator i_it= Cijk->i_rbegin();
       i_it!=Cijk->i_rend(); ++i_it) {
      int i = index(i_it);

      Teuchos::RCP<Epetra_MultiVector> res_i = result_block.GetBlock(i);
      {
        // Apply deterministic preconditioner
#ifdef STOKHOS_TEUCHOS_TIME_MONITOR
        TEUCHOS_FUNC_TIME_MONITOR("Stokhos: Total AGS Deterministic Preconditioner Time");
#endif
        mean_prec->ApplyInverse(*(rhs_block->GetBlock(i)), *res_i);
      }

      int i_gid = epetraCijk->GRID(i);
      for (Cijk_type::ik_iterator k_it = Cijk->k_begin(i_it);
           k_it != Cijk->k_end(i_it); ++k_it) {
        int k = index(k_it);
        if (k!=0 && k<k_limit) {
          bool do_mat_vec = false;
          for (Cijk_type::ikj_iterator j_it = Cijk->j_begin(k_it);
               j_it != Cijk->j_end(k_it); ++j_it) {
            int j = index(j_it);
            int j_gid = epetraCijk->GCID(j);
            if (j_gid < i_gid) {
              bool on_proc = epetraCijk->myGRID(j_gid);
              if (on_proc) {
                do_mat_vec = true;
                break;
              }
            }
          }
          if (do_mat_vec) {
            (*sg_poly)[k].Apply(*res_i, *mat_vec_tmp);
            for (Cijk_type::ikj_iterator j_it = Cijk->j_begin(k_it);
                 j_it != Cijk->j_end(k_it); ++j_it) {
              int j = index(j_it);
              int j_gid = epetraCijk->GCID(j);
              double c = value(j_it);
              if (scale_op)
                c /= norms[j_gid];
              if (j_gid < i_gid) {
                bool on_proc = epetraCijk->myGRID(j_gid);
                if (on_proc) {
                  rhs_block->GetBlock(j)->Update(-c, *mat_vec_tmp, 1.0);
                }
              }
            }
          }
        }
      }
    }
  }

  if (made_copy)
    delete input;

  return 0;
}
Exemple #15
0
int 
Stokhos::KLMatrixFreeOperator::
Apply(const Epetra_MultiVector& Input, Epetra_MultiVector& Result) const
{
  // We have to be careful if Input and Result are the same vector.
  // If this is the case, the only possible solution is to make a copy
  const Epetra_MultiVector *input = &Input;
  bool made_copy = false;
  if (Input.Values() == Result.Values() && !is_stoch_parallel) {
    input = new Epetra_MultiVector(Input);
    made_copy = true;
  }

  // Initialize
  Result.PutScalar(0.0);

  const Epetra_Map* input_base_map = domain_base_map.get();
  const Epetra_Map* result_base_map = range_base_map.get();
  if (useTranspose == true) {
    input_base_map = range_base_map.get();
    result_base_map = domain_base_map.get();
  }

  // Allocate temporary storage
  int m = Input.NumVectors();
  if (useTranspose == false && 
      (tmp == Teuchos::null || tmp->NumVectors() != m*max_num_mat_vec))
    tmp = Teuchos::rcp(new Epetra_MultiVector(*result_base_map, 
					      m*max_num_mat_vec));
  else if (useTranspose == true && 
	   (tmp_trans == Teuchos::null || 
	    tmp_trans->NumVectors() != m*max_num_mat_vec))
    tmp_trans = Teuchos::rcp(new Epetra_MultiVector(*result_base_map, 
						    m*max_num_mat_vec));
  Epetra_MultiVector *tmp_result;
  if (useTranspose == false)
    tmp_result = tmp.get();
  else
    tmp_result = tmp_trans.get();

  // Map input into column map
  const Epetra_MultiVector *tmp_col;
  if (!is_stoch_parallel)
    tmp_col = input;
  else {
    if (useTranspose == false) {
      if (input_col == Teuchos::null || input_col->NumVectors() != m)
	input_col = Teuchos::rcp(new Epetra_MultiVector(*global_col_map, m));
      input_col->Import(*input, *col_importer, Insert);
      tmp_col = input_col.get();
    }
    else {
      if (input_col_trans == Teuchos::null || 
	  input_col_trans->NumVectors() != m)
	input_col_trans = 
	  Teuchos::rcp(new Epetra_MultiVector(*global_col_map_trans, m));
      input_col_trans->Import(*input, *col_importer_trans, Insert);
      tmp_col = input_col_trans.get();
    }
  }

  // Extract blocks
  EpetraExt::BlockMultiVector sg_input(View, *input_base_map, *tmp_col);
  EpetraExt::BlockMultiVector sg_result(View, *result_base_map, Result);
  for (int i=0; i<input_block.size(); i++)
    input_block[i] = sg_input.GetBlock(i);
  for (int i=0; i<result_block.size(); i++)
    result_block[i] = sg_result.GetBlock(i);
  int N = result_block[0]->MyLength();

  const Teuchos::Array<double>& norms = sg_basis->norm_squared();
  int d = sg_basis->dimension();
  Teuchos::Array<double> zero(d), one(d);
  for(int j = 0; j<d; j++) {
    zero[j] = 0.0;
    one[j] = 1.0;
  }
  Teuchos::Array< double > phi_0(expansion_size), phi_1(expansion_size);
  sg_basis->evaluateBases(zero, phi_0);
  sg_basis->evaluateBases(one, phi_1);

  // k_begin and k_end are initialized in the constructor
  for (Cijk_type::k_iterator k_it=k_begin; k_it!=k_end; ++k_it) {
    Cijk_type::kj_iterator j_begin = Cijk->j_begin(k_it);
    Cijk_type::kj_iterator j_end = Cijk->j_end(k_it);
    int k = index(k_it);
    int nj = Cijk->num_j(k_it);
    if (nj > 0) {
      Teuchos::Array<double*> j_ptr(nj*m);
      Teuchos::Array<int> mj_indices(nj*m);
      int l = 0;
      for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
	int j = index(j_it);
	for (int mm=0; mm<m; mm++) {
	  j_ptr[l*m+mm] = input_block[j]->Values()+mm*N;
	  mj_indices[l*m+mm] = l*m+mm;
	}
	l++;
      }
      Epetra_MultiVector input_tmp(View, *input_base_map, &j_ptr[0], nj*m);
      Epetra_MultiVector result_tmp(View, *tmp_result, &mj_indices[0], nj*m);
      (*block_ops)[k].Apply(input_tmp, result_tmp);
      l = 0;
      for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
	int j = index(j_it);
	int j_gid = epetraCijk->GCID(j);
	for (Cijk_type::kji_iterator i_it = Cijk->i_begin(j_it);
	     i_it != Cijk->i_end(j_it); ++i_it) {
	  int i = index(i_it);
	  int i_gid = epetraCijk->GRID(i);
	  double c = value(i_it);
	  if (k == 0)
	    c /= phi_0[0];
	  else {
	    c /= phi_1[k];
	    if (i_gid == j_gid)
	      c -= phi_0[k]/(phi_1[k]*phi_0[0])*norms[i_gid];
	  }
	  if (scale_op) {
	    if (useTranspose)
	      c /= norms[j_gid];
	    else
	      c /= norms[i_gid];
	  }
	  for (int mm=0; mm<m; mm++)
	    (*result_block[i])(mm)->Update(c, *result_tmp(l*m+mm), 1.0);
	}
	l++;
      }
    }
  }

  // Destroy blocks
  for (int i=0; i<input_block.size(); i++)
    input_block[i] = Teuchos::null;
  for (int i=0; i<result_block.size(); i++)
    result_block[i] = Teuchos::null;

  if (made_copy)
    delete input;

  return 0;
}
// ============================================================================ 
int ML_Epetra::MatrixFreePreconditioner::
Compute(const Epetra_CrsGraph& Graph, Epetra_MultiVector& NullSpace)
{
  Epetra_Time TotalTime(Comm());

  const int NullSpaceDim = NullSpace.NumVectors();
  // get parameters from the list
  std::string PrecType = List_.get("prec: type", "hybrid");
  std::string SmootherType = List_.get("smoother: type", "Jacobi");
  std::string ColoringType = List_.get("coloring: type", "JONES_PLASSMAN");
  int PolynomialDegree = List_.get("smoother: degree", 3);
  std::string DiagonalColoringType = List_.get("diagonal coloring: type", "JONES_PLASSMAN");
  int MaximumIterations = List_.get("eigen-analysis: max iters", 10);
  std::string EigenType_ = List_.get("eigen-analysis: type", "cg");
  double boost = List_.get("eigen-analysis: boost for lambda max", 1.0);
  int OutputLevel = List_.get("ML output", -47);
  if (OutputLevel == -47) OutputLevel =  List_.get("output", 10);
  omega_ = List_.get("smoother: damping", omega_);
  ML_Set_PrintLevel(OutputLevel);
  bool LowMemory = List_.get("low memory", true);
  double AllocationFactor = List_.get("AP allocation factor", 0.5);

  verbose_ = (MyPID() == 0 && ML_Get_PrintLevel() > 5);

  // ================ //
  // check parameters //
  // ================ //

  if (PrecType == "presmoother only")
    PrecType_ = ML_MFP_PRESMOOTHER_ONLY;
  else if (PrecType == "hybrid")
    PrecType_ = ML_MFP_HYBRID;
  else if (PrecType == "additive")
    PrecType_ = ML_MFP_ADDITIVE;
  else
    ML_CHK_ERR(-3); // not recognized

  if (SmootherType == "none")
    SmootherType_ = ML_MFP_NONE;
  else if (SmootherType == "Jacobi")
    SmootherType_ = ML_MFP_JACOBI;
  else if (SmootherType == "block Jacobi")
    SmootherType_ = ML_MFP_BLOCK_JACOBI;
  else if (SmootherType == "Chebyshev")
    SmootherType_ = ML_MFP_CHEBY;
  else
    ML_CHK_ERR(-4); // not recognized

  if (AllocationFactor <= 0.0)
    ML_CHK_ERR(-1); // should be positive

  // =============================== //
  // basic checkings and some output //
  // =============================== //
  
  int OperatorDomainPoints =  Operator_.OperatorDomainMap().NumGlobalPoints();
  int OperatorRangePoints =  Operator_.OperatorRangeMap().NumGlobalPoints();
  int GraphBlockRows = Graph.NumGlobalBlockRows();
  int GraphNnz = Graph.NumGlobalNonzeros();
  NumPDEEqns_ = OperatorRangePoints / GraphBlockRows;
  NumMyBlockRows_ = Graph.NumMyBlockRows();

  if (OperatorDomainPoints != OperatorRangePoints)
    ML_CHK_ERR(-1); // only square matrices

  if (OperatorRangePoints % NumPDEEqns_ != 0)
    ML_CHK_ERR(-2); // num PDEs seems not constant

  if (verbose_)
  {
    ML_print_line("=",78);
    std::cout << "*** " << std::endl;
    std::cout << "*** ML_Epetra::MatrixFreePreconditioner" << std::endl;
    std::cout << "***" << std::endl;
    std::cout << "Number of rows and columns      = " << OperatorDomainPoints << std::endl;
    std::cout << "Number of rows per processor    = " << OperatorDomainPoints / Comm().NumProc()
         << " (on average)" << std::endl;
    std::cout << "Number of rows in the graph     = " << GraphBlockRows << std::endl;
    std::cout << "Number of nonzeros in the graph = " << GraphNnz << std::endl;
    std::cout << "Processors used in computation  = " << Comm().NumProc() << std::endl;
    std::cout << "Number of PDE equations         = " << NumPDEEqns_ << std::endl;
    std::cout << "Null space dimension            = " << NullSpaceDim << std::endl;
    std::cout << "Preconditioner type             = " << PrecType << std::endl;
    std::cout << "Smoother type                   = " << SmootherType << std::endl;
    std::cout << "Coloring type                   = " << ColoringType << std::endl;
    std::cout << "Allocation factor               = " << AllocationFactor << std::endl;
    std::cout << "Number of V-cycles for C        = " << List_.sublist("ML list").get("cycle applications", 1) << std::endl;
    std::cout << std::endl;
  }

  ResetStartTime();

  // ==================================== //
  // compute the inverse of the diagonal, //
  // control that no elements are zero.   //
  // ==================================== //
  
  for (int i = 0; i < InvPointDiagonal_->MyLength(); ++i)
    if ((*InvPointDiagonal_)[i] != 0.0)
      (*InvPointDiagonal_)[i] = 1.0 / (*InvPointDiagonal_)[i];

  // ========================================================= //
  // Setup the smoother. I need to extract the block diagonal  //
  // only if block Jacobi is used. For Chebyshev, I scale with //
  // the point diagonal only. In this latter case, I need to   //
  // compute lambda_max of the scaled operator.                //
  // ========================================================= //
  
  // probes for the block diagonal of the matrix.
  if (SmootherType_ == ML_MFP_JACOBI ||
      SmootherType_ == ML_MFP_NONE)
  {
    // do-nothing here
  }
  else if (SmootherType_ == ML_MFP_BLOCK_JACOBI)
  {
    if (verbose_);
      std::cout << "Diagonal coloring type         = " << DiagonalColoringType << std::endl;
    ML_CHK_ERR(GetBlockDiagonal(Graph, DiagonalColoringType));

    AddAndResetStartTime("block diagonal construction", true);
  }
  else if (SmootherType_ == ML_MFP_CHEBY)
  {
    double lambda_min = 0.0;
    double lambda_max = 0.0;
    Teuchos::ParameterList IFPACKList;

    if (EigenType_ == "power-method")
    {
      ML_CHK_ERR(Ifpack_Chebyshev::PowerMethod(Operator_, *InvPointDiagonal_,
                                               MaximumIterations, lambda_max));
    }
    else if(EigenType_ == "cg")
    {
      ML_CHK_ERR(Ifpack_Chebyshev::CG(Operator_, *InvPointDiagonal_,
                                      MaximumIterations, lambda_min, 
                                      lambda_max));
    }
    else
      ML_CHK_ERR(-1); // not recognized

    if (verbose_)
    {
      std::cout << "Using Chebyshev smoother of degree " << PolynomialDegree << std::endl;
      std::cout << "Estimating eigenvalues using " <<  EigenType_ << std::endl;
      std::cout << "lambda_min = " << lambda_min << ", ";
      std::cout << "lambda_max = " << lambda_max << std::endl;
    }

    IFPACKList.set("chebyshev: min eigenvalue", lambda_min);
    IFPACKList.set("chebyshev: max eigenvalue", boost * lambda_max);
    // FIXME: this allocates a new std::vector inside
    IFPACKList.set("chebyshev: operator inv diagonal", InvPointDiagonal_.get());
    IFPACKList.set("chebyshev: degree", PolynomialDegree);

    PreSmoother_ = rcp(new Ifpack_Chebyshev((Epetra_Operator*)(&Operator_)));
    if (PreSmoother_.get() == 0) ML_CHK_ERR(-1); // memory error?

    IFPACKList.set("chebyshev: zero starting solution", true);
    ML_CHK_ERR(PreSmoother_->SetParameters(IFPACKList));
    ML_CHK_ERR(PreSmoother_->Initialize());
    ML_CHK_ERR(PreSmoother_->Compute());

    PostSmoother_ = rcp(new Ifpack_Chebyshev((Epetra_Operator*)(&Operator_)));
    if (PostSmoother_.get() == 0) ML_CHK_ERR(-1); // memory error?

    IFPACKList.set("chebyshev: zero starting solution", false);
    ML_CHK_ERR(PostSmoother_->SetParameters(IFPACKList));
    ML_CHK_ERR(PostSmoother_->Initialize());
    ML_CHK_ERR(PostSmoother_->Compute());
  }

  // ========================================================= //
  // building P and R for block graph. This is done by working //
  // on the Graph_ object. Support is provided for local       //
  // aggregation schemes only so that all is basically local.  //
  // Then, build the block graph coarse problem.               //
  // ========================================================= //
  
  // ML wrapper for Graph_
  ML_Operator* Graph_ML = ML_Operator_Create(Comm_ML());
  ML_Operator_WrapEpetraCrsGraph(const_cast<Epetra_CrsGraph*>(&Graph), Graph_ML);

  ML_Aggregate* BlockAggr_ML = 0;
  ML_Operator* BlockPtent_ML = 0, *BlockRtent_ML = 0,* CoarseGraph_ML = 0;

  if (verbose_) std::cout << std::endl;

  ML_CHK_ERR(Coarsen(Graph_ML, &BlockAggr_ML, &BlockPtent_ML, &BlockRtent_ML, 
                     &CoarseGraph_ML));

  if (verbose_) std::cout << std::endl;

  Epetra_CrsMatrix* GraphCoarse;
  ML_CHK_ERR(ML_Operator2EpetraCrsMatrix(CoarseGraph_ML, GraphCoarse));

  // used later to estimate the entries in AP
  ML_Operator* CoarseAP_ML = ML_Operator_Create(Comm_ML());
  ML_2matmult(Graph_ML, BlockPtent_ML, CoarseAP_ML, ML_CSR_MATRIX);

  int AP_MaxNnzRow, itmp = CoarseAP_ML->max_nz_per_row;
  Comm().MaxAll(&itmp, &AP_MaxNnzRow, 1);
  ML_Operator_Destroy(&CoarseAP_ML);

  int NumAggregates = BlockPtent_ML->invec_leng;
  ML_Operator_Destroy(&BlockRtent_ML);
  ML_Operator_Destroy(&CoarseGraph_ML);

  AddAndResetStartTime("construction of block C, R, and P", true);
  if (verbose_) std::cout << std::endl;

  // ================================================== //
  // coloring of block graph:                           //
  // - color of block row `i' is given by `ColorMap[i]' //
  // - number of colors is ColorMap.NumColors().        //
  // ================================================== //
  
  ResetStartTime();

  CrsGraph_MapColoring* MapColoringTransform;
  
  if (ColoringType == "JONES_PLASSMAN")
    MapColoringTransform = new CrsGraph_MapColoring (CrsGraph_MapColoring::JONES_PLASSMAN,
                                                     0, false, 0);
  else if (ColoringType == "PSEUDO_PARALLEL")
    MapColoringTransform = new CrsGraph_MapColoring (CrsGraph_MapColoring::PSEUDO_PARALLEL,
                                                     0, false, 0);
  else if (ColoringType == "GREEDY")
    MapColoringTransform = new CrsGraph_MapColoring (CrsGraph_MapColoring::GREEDY,
                                                     0, false, 0);
  else if (ColoringType == "LUBY")
    MapColoringTransform = new CrsGraph_MapColoring (CrsGraph_MapColoring::LUBY,
                                                     0, false, 0);
  else 
    ML_CHK_ERR(-1);

  Epetra_MapColoring* ColorMap = &(*MapColoringTransform)(const_cast<Epetra_CrsGraph&>(GraphCoarse->Graph()));

  // move the information from ColorMap to std::vector Colors
  const int NumColors = ColorMap->MaxNumColors();
  RefCountPtr<Epetra_IntSerialDenseVector> Colors = rcp(new Epetra_IntSerialDenseVector(GraphCoarse->Graph().NumMyRows()));
  for (int i = 0; i < GraphCoarse->Graph().NumMyRows(); ++i)
    (*Colors)[i] = (*ColorMap)[i];

  delete MapColoringTransform;
  delete ColorMap; ColorMap = 0;
  delete GraphCoarse;

  AddAndResetStartTime("coarse graph coloring", true);
  if (verbose_) std::cout << std::endl;

  // get some other information about the aggregates, to be used
  // in the QR factorization of the null space. NodesOfAggregate
  // contains the local ID of block rows contained in each aggregate.

  // FIXME: make it faster
  std::vector< std::vector<int> > NodesOfAggregate(NumAggregates);

  for (int i = 0; i < Graph.NumMyBlockRows(); ++i)
  {
    int AID = BlockAggr_ML->aggr_info[0][i];
    NodesOfAggregate[AID].push_back(i);
  }

  int MaxAggrSize = 0;
  for (int i = 0; i < NumAggregates; ++i)
  {
    const int& MySize = NodesOfAggregate[i].size();
    if (MySize > MaxAggrSize) MaxAggrSize = MySize;
  }

  // collect aggregate information, and mark all nodes that are
  // connected with each aggregate. These nodes will have a possible
  // nonzero entry after the matrix-matrix product between the Operator_
  // and the tentative prolongator.

  std::vector<vector<int> > aggregates(NumAggregates);
  std::vector<int>::iterator iter;

  for (int i = 0; i < NumAggregates; ++i)
    aggregates[i].reserve(MaxAggrSize);

  for (int i = 0; i < Graph.NumMyBlockRows(); ++i)
  {
    int AID = BlockAggr_ML->aggr_info[0][i];

    int NumEntries;
    int* Indices;

    Graph.ExtractMyRowView(i, NumEntries, Indices);

    for (int k = 0; k < NumEntries; ++k)
    {
      // FIXME: use hash??
      const int& GCID = Graph.ColMap().GID(Indices[k]);

      iter = find(aggregates[AID].begin(), aggregates[AID].end(), GCID);
      if (iter == aggregates[AID].end())
        aggregates[AID].push_back(GCID);
    }
  }
  
  int* BlockNodeList = Graph.ColMap().MyGlobalElements();

  // finally get rid of the ML_Aggregate structure.
  ML_Aggregate_Destroy(&BlockAggr_ML);

  const Epetra_Map& FineMap = Operator_.OperatorDomainMap();
  Epetra_Map CoarseMap(-1, NumAggregates * NullSpaceDim, 0, Comm());
  RefCountPtr<Epetra_Map> BlockNodeListMap = 
    rcp(new Epetra_Map(-1, Graph.ColMap().NumMyElements(),
                       BlockNodeList, 0, Comm()));

  std::vector<int> NodeList(Graph.ColMap().NumMyElements() * NumPDEEqns_);
  for (int i = 0; i < Graph.ColMap().NumMyElements(); ++i)
    for (int m = 0; m < NumPDEEqns_; ++m)
      NodeList[i * NumPDEEqns_ + m] = BlockNodeList[i] * NumPDEEqns_ + m;
  RefCountPtr<Epetra_Map> NodeListMap = 
    rcp(new Epetra_Map(-1, NodeList.size(), &NodeList[0], 0, Comm()));

  AddAndResetStartTime("data structures", true);

  // ====================== //
  // process the null space //
  // ====================== //

  // CHECKME
  Epetra_MultiVector NewNullSpace(CoarseMap, NullSpaceDim);
  NewNullSpace.PutScalar(0.0);

  if (NullSpaceDim == 1)
  {
    double* ns_ptr = NullSpace.Values();

    for (int AID = 0; AID < NumAggregates; ++AID)
    {
      double dtemp = 0.0;
      for (int j = 0; j < (int) (NodesOfAggregate[AID].size()); j++)
        for (int m = 0; m < NumPDEEqns_; ++m)
        {
          const int& pos = NodesOfAggregate[AID][j] * NumPDEEqns_ + m;
          dtemp += (ns_ptr[pos] * ns_ptr[pos]);
        }
      dtemp = std::sqrt(dtemp);

      NewNullSpace[0][AID] = dtemp;

      dtemp = 1.0 / dtemp;

      for (int j = 0; j < (int) (NodesOfAggregate[AID].size()); j++)
        for (int m = 0; m < NumPDEEqns_; ++m)
          ns_ptr[NodesOfAggregate[AID][j] * NumPDEEqns_ + m] *= dtemp;
    }
  }
  else
  {
    // FIXME
    std::vector<double> qr_ptr(MaxAggrSize * NumPDEEqns_ * MaxAggrSize * NumPDEEqns_);
    std::vector<double> tmp_ptr(MaxAggrSize * NumPDEEqns_ * NullSpaceDim);

    std::vector<double> work(NullSpaceDim);
    int info;

    for (int AID = 0; AID < NumAggregates; ++AID)
    {
      int MySize = NodesOfAggregate[AID].size();
      int MyFullSize = NodesOfAggregate[AID].size() * NumPDEEqns_;
      int lwork = NullSpaceDim;

      for (int k = 0; k < NullSpaceDim; ++k)
        for (int j = 0; j < MySize; ++j)
          for (int m = 0; m < NumPDEEqns_; ++m)
            qr_ptr[k * MyFullSize + j * NumPDEEqns_ + m] = 
              NullSpace[k][NodesOfAggregate[AID][j] * NumPDEEqns_ + m];

      DGEQRF_F77(&MyFullSize, (int*)&NullSpaceDim, &qr_ptr[0], 
                 &MyFullSize, &tmp_ptr[0], &work[0], &lwork, &info);

      ML_CHK_ERR(info);

      if (work[0] > lwork) work.resize((int) work[0]);

      // the upper triangle of qr_tmp is now R, so copy that into the 
      //  new nullspace

      for (int j = 0; j < NullSpaceDim; j++)
        for (int k = j; k < NullSpaceDim; k++)
          NewNullSpace[k][AID * NullSpaceDim + j] = qr_ptr[j + MyFullSize * k];
		 
      // to get this block of P, need to run qr_tmp through another LAPACK 
      // function:

      DORGQR_F77(&MyFullSize, (int*)&NullSpaceDim, (int*)&NullSpaceDim, 
                 &qr_ptr[0], &MyFullSize, &tmp_ptr[0], &work[0], &lwork, &info);
      ML_CHK_ERR(info); // dgeqtr returned a non-zero

      if (work[0] > lwork) work.resize((int) work[0]);

      // insert the Q block into the null space

      for (int k = 0; k < NullSpaceDim; ++k)
        for (int j = 0; j < MySize; ++j)
          for (int m = 0; m < NumPDEEqns_; ++m)
          {
            int LRID = NodesOfAggregate[AID][j] * NumPDEEqns_ + m;
            double& val = qr_ptr[k * MyFullSize + j * NumPDEEqns_ + m];
            NullSpace[k][LRID] = val;
          }
    }
  }

  AddAndResetStartTime("null space setup", true);

  if (verbose_)
    std::cout << "Number of colors on processor " << Comm().MyPID() << " = "
        << NumColors << std::endl;
  if (verbose_)
    std::cout << "Maximum number of colors = " << NumColors << std::endl;

  RefCountPtr<Epetra_FECrsMatrix> AP;
  
  // try to get a good estimate of the nonzeros per row.
  // This is a compromize between efficiency -- that is, reduce
  // the memory allocation processes, and memory usage -- that, is
  // overestimating can actually kill the code. Basically, this is
  // all junk due to our dear friend, the Cray XT3.
  
  AP = rcp(new Epetra_FECrsMatrix(Copy, FineMap, (int)
                                  (AllocationFactor * AP_MaxNnzRow * NullSpaceDim)));
  if (AP.get() == 0) throw(-1);

  if (!LowMemory)
  {
    // ================================================= //
    // allocate one big chunk of memory, and use View    //             
    // to create Epetra_MultiVectors. Note that          //
    // NumColors * NullSpace can indeed be a quite large //
    // value. To reduce the memory consumption, both     //
    // ColoredAP and ExtColoredAP use the same memory    //
    // array.                                            //
    // ================================================= //
    
    Epetra_MultiVector* ColoredP;
    std::vector<double> ColoredAP_ptr;

    try
    {
      ColoredP = new Epetra_MultiVector(FineMap, NumColors * NullSpaceDim);
      ColoredAP_ptr.resize(NumColors * NullSpaceDim * NodeListMap->NumMyPoints());
    }
    catch (std::exception& rhs)
    {
      catch_message("the allocation of ColoredP", rhs.what(), __FILE__, __LINE__);
      ML_CHK_ERR(-1);
    }
    catch (...)
    {
      catch_message("the allocation of ColoredP", "", __FILE__, __LINE__);
      ML_CHK_ERR(-1);
    }

    int ColoredAP_LDA = NodeListMap->NumMyPoints();

    ColoredP->PutScalar(0.0);

    for (int i = 0; i < BlockPtent_ML->outvec_leng; ++i)
    {
      int allocated = 1;
      int NumEntries;
      int Indices;
      double Values;
      int ierr = ML_Operator_Getrow(BlockPtent_ML, 1 ,&i, allocated,
                                    &Indices,&Values,&NumEntries);
      if (ierr < 0)
        ML_CHK_ERR(-1);

      assert (NumEntries == 1); // this is the block P
      const int& Color = (*Colors)[Indices] - 1;
      for (int k = 0; k < NumPDEEqns_; ++k)
        for (int j = 0; j < NullSpaceDim; ++j)
          (*ColoredP)[(Color * NullSpaceDim + j)][i * NumPDEEqns_ + k] = 
            NullSpace[j][i * NumPDEEqns_ + k];
    }

    ML_Operator_Destroy(&BlockPtent_ML);

    Epetra_MultiVector ColoredAP(View, Operator_.OperatorRangeMap(), 
                                 &ColoredAP_ptr[0], ColoredAP_LDA, 
                                 NumColors * NullSpaceDim);
    // move ColoredAP into ColoredP. This should not be required.
    // but I prefer to skip strange games with View pointers
    Operator_.Apply(*ColoredP, ColoredAP);
    *ColoredP = ColoredAP;

    // FIXME: only if NumProc > 1
    Epetra_MultiVector ExtColoredAP(View, *NodeListMap, 
                                 &ColoredAP_ptr[0], ColoredAP_LDA, 
                                 NumColors * NullSpaceDim);

    try 
    {
      Epetra_Import Importer(*NodeListMap, Operator_.OperatorRangeMap());
      ExtColoredAP.Import(*ColoredP, Importer, Insert);
    }
    catch (std::exception& rhs)
    {
      catch_message("importing of ExtColoredAP", rhs.what(), __FILE__, __LINE__);
      ML_CHK_ERR(-1);
    }
    catch (...)
    {
      catch_message("importing of ExtColoredAP", "", __FILE__, __LINE__);
      ML_CHK_ERR(-1);
    }

    delete ColoredP;

    AddAndResetStartTime("computation of AP", true); 

    // populate the actual AP operator, skip some controls to make it faster

    for (int i = 0; i < NumAggregates; ++i)
    {
      for (int j = 0; j < (int) (aggregates[i].size()); ++j)
      {
        int GRID = aggregates[i][j];
        int LRID = BlockNodeListMap->LID(GRID); // this is the block ID
        //assert (LRID != -1);
        int GCID = CoarseMap.GID(i * NullSpaceDim);
        //assert (GCID != -1); 
        int color = (*Colors)[i] - 1;
        for (int k = 0; k < NumPDEEqns_; ++k)
          for (int j = 0; j < NullSpaceDim; ++j)
          {
            double val = ExtColoredAP[color * NullSpaceDim + j][LRID * NumPDEEqns_ + k];
            if (val != 0.0)
            {
              int GRID2 = GRID * NumPDEEqns_ + k;
              int GCID2 = GCID + j;
              AP->InsertGlobalValues(1, &GRID2, 1, &GCID2, &val);
              //if (ierr < 0) ML_CHK_ERR(ierr);
            }
          }
      }
    }
  }
  else
  {
    // =============================================================== //
    // apply the operator one color at-a-time. This requires NumColors //
    // cycles over BlockPtent. However, the memory requirements are    //
    // drastically reduced. As for low-memory == false, both ColoredAP //
    // and ExtColoredAP point to the same memory location.             //
    // =============================================================== //
    
    if (verbose_)
      std::cout << "Using low-memory computation for AP" << std::endl;

    Epetra_MultiVector ColoredP(FineMap, NullSpaceDim);
    std::vector<double> ColoredAP_ptr;
    try
    {
      ColoredAP_ptr.resize(NullSpaceDim * NodeListMap->NumMyPoints());
    }
    catch (std::exception& rhs)
    {
      catch_message("resizing of ColoredAP_pt", rhs.what(), __FILE__, __LINE__);
      ML_CHK_ERR(-1);
    }
    catch (...)
    {
      catch_message("resizing of ColoredAP_pt", "", __FILE__, __LINE__);
      ML_CHK_ERR(-1);
    }

    Epetra_MultiVector ColoredAP(View, Operator_.OperatorRangeMap(), 
                                 &ColoredAP_ptr[0], NodeListMap->NumMyPoints(), 
                                 NullSpaceDim);
    Epetra_MultiVector ExtColoredAP(View, *NodeListMap, 
                                 &ColoredAP_ptr[0], NodeListMap->NumMyPoints(), 
                                 NullSpaceDim);
    Epetra_Import Importer(*NodeListMap, Operator_.OperatorRangeMap());

    for (int ic = 0; ic < NumColors; ++ic)
    {
      if (ML_Get_PrintLevel() > 8 && Comm().MyPID() == 0)
      {
        if (ic % 20 == 0)
          std::cout << "Processing color " << flush;

        std::cout << ic << " " << flush;
        if (ic % 20 == 19 || ic == NumColors - 1)
          std::cout << std::endl;
        if (ic == NumColors - 1) std::cout << std::endl;
      }

      ColoredP.PutScalar(0.0);

      for (int i = 0; i < BlockPtent_ML->outvec_leng; ++i)
      {
        int allocated = 1;
        int NumEntries;
        int Indices;
        double Values;
        int ierr = ML_Operator_Getrow(BlockPtent_ML, 1 ,&i, allocated,
                                      &Indices,&Values,&NumEntries);
        if (ierr < 0 ||  // something strange in getrow
            NumEntries != 1) // this is the block P
          ML_CHK_ERR(-1);

        const int& Color = (*Colors)[Indices] - 1;
        if (Color != ic)
          continue; // skip this color for this cycle

        for (int k = 0; k < NumPDEEqns_; ++k)
          for (int j = 0; j < NullSpaceDim; ++j)
            ColoredP[j][i * NumPDEEqns_ + k] = 
              NullSpace[j][i * NumPDEEqns_ + k];
      }

      Operator_.Apply(ColoredP, ColoredAP);
      ColoredP = ColoredAP; // just to be safe

      ExtColoredAP.Import(ColoredP, Importer, Insert);

      // populate the actual AP operator, skip some controls to make it faster

      std::vector<int> InsertCols(NullSpaceDim * NumPDEEqns_);
      std::vector<double> InsertValues(NullSpaceDim * NumPDEEqns_);

      for (int i = 0; i < NumAggregates; ++i)
      {
        for (int j = 0; j < (int) (aggregates[i].size()); ++j)
        {
          int GRID = aggregates[i][j];
          int LRID = BlockNodeListMap->LID(GRID); // this is the block ID
          //assert (LRID != -1);
          int GCID = CoarseMap.GID(i * NullSpaceDim);
          //assert (GCID != -1); 
          int color = (*Colors)[i] - 1;
          if (color != ic) continue;

          for (int k = 0; k < NumPDEEqns_; ++k)
          {
            int count = 0;
            int GRID2 = GRID * NumPDEEqns_ + k;
            for (int j = 0; j < NullSpaceDim; ++j)
            {
              double val = ExtColoredAP[j][LRID * NumPDEEqns_ + k];
              if (val != 0.0)
              {
                InsertCols[count] = GCID + j;
                InsertValues[count] = val;
                ++count;
              }
            }
            AP->InsertGlobalValues(1, &GRID2, count, &InsertCols[0], 
                                   &InsertValues[0]);
          }
        }
      }
    }

    ML_Operator_Destroy(&BlockPtent_ML);
  }

  aggregates.resize(0);
  BlockNodeListMap = Teuchos::null;
  NodeListMap = Teuchos::null;

  Colors = Teuchos::null;

  AP->GlobalAssemble(false);
  AP->FillComplete(CoarseMap, FineMap);

#if 0
  try
  {
    AP->OptimizeStorage();
  }
  catch(...)
  {
    // a memory error was reported, typically ReportError.
    // We just continue with fingers crossed.
  }
#endif

  AddAndResetStartTime("computation of the final AP", true); 

  ML_Operator* AP_ML = ML_Operator_Create(Comm_ML());
  ML_Operator_WrapEpetraMatrix(AP.get(), AP_ML);

  // ======== //
  // create R //
  // ======== //
  
  std::vector<int> REntries(NumAggregates * NullSpaceDim);
  for (int AID = 0; AID < NumAggregates; ++AID)
  {
    for (int m = 0; m < NullSpaceDim; ++m)
      REntries[AID * NullSpaceDim + m] = NodesOfAggregate[AID].size() * NumPDEEqns_;
  }

  R_ = rcp(new Epetra_CrsMatrix(Copy, CoarseMap, &REntries[0], true));
  REntries.resize(0);

  for (int AID = 0; AID < NumAggregates; ++AID)
  {
    const int& MySize = NodesOfAggregate[AID].size();

    // FIXME: make it faster
    for (int j = 0; j < MySize; ++j)
      for (int m = 0; m < NumPDEEqns_; ++m)
        for (int k = 0; k < NullSpaceDim; ++k)
        {
          int LCID = NodesOfAggregate[AID][j] * NumPDEEqns_ + m;
          int GCID = FineMap.GID(LCID);
          assert (GCID != -1);

          double& val = NullSpace[k][LCID];

          int GRID = CoarseMap.GID(AID * NullSpaceDim + k);
          int ierr = R_->InsertGlobalValues(GRID, 1, &val, &GCID);
          if (ierr < 0)
            ML_CHK_ERR(-1);
        }
  }

  NodesOfAggregate.resize(0);

  R_->FillComplete(FineMap, CoarseMap);
#if 0
  try
  {
    R_->OptimizeStorage();
  }
  catch(...)
  {
    // a memory error was reported, typically ReportError.
    // We just continue with fingers crossed.
  }
#endif

  ML_Operator* R_ML = ML_Operator_Create(Comm_ML());
  ML_Operator_WrapEpetraMatrix(R_.get(), R_ML);

  AddAndResetStartTime("computation of R", true); 

  // ======== //
  // Create C //
  // ======== //

  C_ML_ = ML_Operator_Create(Comm_ML());
  ML_2matmult(R_ML, AP_ML, C_ML_, ML_MSR_MATRIX);

  ML_Operator_Destroy(&AP_ML);
  ML_Operator_Destroy(&R_ML);
  AP = Teuchos::null;

  C_ = rcp(new ML_Epetra::RowMatrix(C_ML_, &Comm(), false));
  assert (R_->OperatorRangeMap().SameAs(C_->OperatorDomainMap()));

  TotalTime.ResetStartTime();

  AddAndResetStartTime("computation of C", true); 

  if (verbose_)
  {
    std::cout << "Matrix-free preconditioner built. Now building solver for C..." << std::endl; 
  }

  Teuchos::ParameterList& sublist = List_.sublist("ML list");
  sublist.set("PDE equations", NullSpaceDim);
  sublist.set("null space: type", "pre-computed");
  sublist.set("null space: dimension", NewNullSpace.NumVectors());
  sublist.set("null space: vectors", NewNullSpace.Values());

  MLP_ = rcp(new MultiLevelPreconditioner(*C_, sublist, true));

  assert (MLP_.get() != 0);

  IsComputed_ = true;

  AddAndResetStartTime("computation of the preconditioner for C", true); 

  if (verbose_)
  {
    std::cout << std::endl;
    std::cout << "Total CPU time for construction (all included) = ";
    std::cout << TotalCPUTime() << std::endl;
    ML_print_line("=",78);
  }

  return(0);
}