void
LOCA::Epetra::CompactWYOp::applyCompactWY(const Epetra_MultiVector& x,
                                          Epetra_MultiVector& result_x,
                                          Epetra_MultiVector& result_p) const
{
  // Compute Y_x^T*x
  result_p.Multiply('T', 'N', 1.0, *Y_x, x, 0.0);

  // Compute T*(Y_x^T*x)
  dblas.TRMM(Teuchos::LEFT_SIDE, Teuchos::UPPER_TRI, Teuchos::NO_TRANS,
             Teuchos::NON_UNIT_DIAG, result_p.MyLength(), result_p.NumVectors(),
             1.0, T.Values(), T.MyLength(), result_p.Values(),
             result_p.MyLength());

  // Compute result_x = x + Y_x*T*(Y_x^T*x)
  result_x = x;
  result_x.Multiply('N', 'N', 1.0, *Y_x, result_p, 1.0);

  // Compute result_p = Y_p*T*(Y_x^T*x)
  dblas.TRMM(Teuchos::LEFT_SIDE, Teuchos::LOWER_TRI, Teuchos::NO_TRANS,
             Teuchos::UNIT_DIAG, result_p.MyLength(), result_p.NumVectors(),
             1.0, Y_p.Values(), Y_p.MyLength(), result_p.Values(),
             result_p.MyLength());
}
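The routine above is a compact WY (block Householder) update: p = Y_x^T x, p = T p, result_x = x + Y_x p, result_p = Y_p p. Below is a minimal dense sketch of the same four-step sequence, with hand-rolled column-major matvecs standing in for the Epetra/BLAS calls; the sizes and matrix values are purely illustrative.

#include <vector>
#include <cstdio>

// y = A*x (or A^T*x) for an n-by-k column-major matrix A.
static std::vector<double> matvec(const std::vector<double>& A,
                                  int n, int k, const std::vector<double>& x,
                                  bool transA) {
  std::vector<double> y(transA ? k : n, 0.0);
  for (int j = 0; j < k; ++j)
    for (int i = 0; i < n; ++i) {
      if (transA) y[j] += A[i + j*n] * x[i];
      else        y[i] += A[i + j*n] * x[j];
    }
  return y;
}

int main() {
  const int n = 3, k = 2;                       // vector length, block size
  std::vector<double> Yx = {1,0,0, 0,1,0};      // Y_x: 3x2, column-major
  std::vector<double> T  = {2,0, 1,3};          // T: 2x2 upper triangular
  std::vector<double> Yp = {1,0.5, 0,1};        // Y_p: 2x2 unit lower triangular
  std::vector<double> x  = {1,2,3};

  std::vector<double> p = matvec(Yx, n, k, x, true);    // p = Y_x^T * x
  p = matvec(T, k, k, p, false);                        // p = T * p
  std::vector<double> rx = matvec(Yx, n, k, p, false);  // rx = Y_x * p
  for (int i = 0; i < n; ++i) rx[i] += x[i];            // rx = x + Y_x*T*Y_x^T*x
  std::vector<double> rp = matvec(Yp, k, k, p, false);  // rp = Y_p * p

  for (double v : rx) std::printf("rx %g\n", v);
  for (double v : rp) std::printf("rp %g\n", v);
  return 0;
}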
int
Stokhos::MeanBasedPreconditioner::
ApplyInverse(const Epetra_MultiVector& Input, Epetra_MultiVector& Result) const
{
  int myBlockRows = epetraCijk->numMyRows();

  if (!use_block_apply) {
    EpetraExt::BlockMultiVector sg_input(View, *base_map, Input);
    EpetraExt::BlockMultiVector sg_result(View, *base_map, Result);
    for (int i=0; i<myBlockRows; i++) {
      mean_prec->ApplyInverse(*(sg_input.GetBlock(i)),
                              *(sg_result.GetBlock(i)));
    }
  }
  else {
    int m = Input.NumVectors();
    Epetra_MultiVector input_block(
      View, *base_map, Input.Values(), base_map->NumMyElements(),
      m*myBlockRows);
    Epetra_MultiVector result_block(
      View, *base_map, Result.Values(), base_map->NumMyElements(),
      m*myBlockRows);
    mean_prec->ApplyInverse(input_block, result_block);
  }

  return 0;
}
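A minimal sketch (assumed names and layout, not the Stokhos classes) of why the use_block_apply branch works: when the block vectors sit contiguously in memory, the per-block loop over GetBlock(i) collapses into a single apply on a wider multivector viewing the same storage.

#include <vector>

// Stand-in "mean preconditioner": divide each entry by d[i]; x holds
// cols columns of length n, stored column-major.
static void apply_inverse(const std::vector<double>& d,
                          double* x, int n, int cols) {
  for (int c = 0; c < cols; ++c)
    for (int i = 0; i < n; ++i)
      x[c*n + i] /= d[i];
}

int main() {
  const int n = 4, m = 2, B = 3;            // base length, vectors, blocks
  std::vector<double> d = {2, 4, 5, 10};
  std::vector<double> v(n*B*m, 1.0);        // m columns, each of length n*B
  std::vector<double> w = v;                // copy for the fused variant

  // Per-block variant: block b of column j starts at j*(n*B) + b*n.
  for (int j = 0; j < m; ++j)
    for (int b = 0; b < B; ++b)
      apply_inverse(d, v.data() + j*(n*B) + b*n, n, 1);

  // Fused variant: the same memory reinterpreted as m*B columns of
  // length n, handled by one call (the use_block_apply branch).
  apply_inverse(d, w.data(), n, m*B);
  return 0;                                 // v and w now hold equal values
}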
int
Stokhos::ApproxJacobiPreconditioner::
ApplyInverse(const Epetra_MultiVector& Input, Epetra_MultiVector& Result) const
{
#ifdef STOKHOS_TEUCHOS_TIME_MONITOR
  TEUCHOS_FUNC_TIME_MONITOR("Stokhos: Total Approximate Jacobi Time");
#endif

  // We have to be careful if Input and Result are the same vector.
  // If this is the case, the only possible solution is to make a copy.
  const Epetra_MultiVector *input = &Input;
  bool made_copy = false;
  if (Input.Values() == Result.Values()) {
    input = new Epetra_MultiVector(Input);
    made_copy = true;
  }

  int m = input->NumVectors();
  if (rhs_block == Teuchos::null || rhs_block->NumVectors() != m)
    rhs_block =
      Teuchos::rcp(new EpetraExt::BlockMultiVector(*base_map, *sg_map, m));

  // Extract blocks
  EpetraExt::BlockMultiVector input_block(View, *base_map, *input);
  EpetraExt::BlockMultiVector result_block(View, *base_map, Result);

  int myBlockRows = epetraCijk->numMyRows();
  result_block.PutScalar(0.0);
  for (int iter=0; iter<num_iter; iter++) {

    // Compute RHS
    if (iter == 0)
      rhs_block->Update(1.0, input_block, 0.0);
    else {
      mat_free_op->Apply(result_block, *rhs_block);
      rhs_block->Update(1.0, input_block, -1.0);
    }

    // Apply deterministic preconditioner
    for (int i=0; i<myBlockRows; i++) {
#ifdef STOKHOS_TEUCHOS_TIME_MONITOR
      TEUCHOS_FUNC_TIME_MONITOR("Stokhos: Total AJ Deterministic Preconditioner Time");
#endif
      mean_prec->ApplyInverse(*(rhs_block->GetBlock(i)),
                              *(result_block.GetBlock(i)));
    }

  }

  if (made_copy)
    delete input;

  return 0;
}
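A plain dense sketch (assumed setup, not the Stokhos operators) of the sweep this preconditioner performs: x_{k+1} = P^{-1}(b - N x_k), where P is the block handled by the deterministic preconditioner (here just diag(A)) and N = A - P is the remainder, the mat_free_op analogue. Iteration 0 reduces to rhs = b exactly as in the code.

#include <vector>
#include <cstdio>

int main() {
  const int n = 3, num_iter = 25;
  double A[n][n] = {{4,1,0},{1,4,1},{0,1,4}};
  std::vector<double> b = {1, 2, 3}, x(n, 0.0), rhs(n);

  for (int iter = 0; iter < num_iter; ++iter) {
    // rhs = b - (A - diag(A))*x  (for x = 0 this is just rhs = b)
    for (int i = 0; i < n; ++i) {
      rhs[i] = b[i];
      for (int j = 0; j < n; ++j)
        if (j != i) rhs[i] -= A[i][j]*x[j];
    }
    // "Deterministic preconditioner": solve with the diagonal block.
    for (int i = 0; i < n; ++i) x[i] = rhs[i]/A[i][i];
  }
  for (int i = 0; i < n; ++i) std::printf("x[%d] = %g\n", i, x[i]);
  return 0;
}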
int EpetraOperator::ApplyInverse(const Epetra_MultiVector& X,
                                 Epetra_MultiVector& Y) const {
  try {
    // There is no rcpFromRef(const T&), so we need to do a const_cast
    const Xpetra::EpetraMultiVector eX(rcpFromRef(const_cast<Epetra_MultiVector&>(X)));
    Xpetra::EpetraMultiVector eY(rcpFromRef(Y));

    // Generally, we assume two different vectors, but AztecOO uses a single vector
    if (X.Values() == Y.Values()) {
      // X and Y point to the same memory, use an additional vector
      RCP<Xpetra::EpetraMultiVector> tmpY =
        Teuchos::rcp(new Xpetra::EpetraMultiVector(eY.getMap(), eY.getNumVectors()));

      // InitialGuessIsZero in MueLu::Hierarchy::Iterate() does not zero out
      // the components; it only assumes that the user provided an already
      // zeroed-out vector.
      bool initialGuessZero = true;
      tmpY->putScalar(0.0);

      // Apply one V-cycle as preconditioner
      Hierarchy_->Iterate(eX, 1, *tmpY, initialGuessZero);

      // Deep copy the solution from MueLu
      eY.update(1.0, *tmpY, 0.0);
    } else {
      // X and Y point to different memory, pass the vectors through
      bool initialGuessZero = true;
      eY.putScalar(0.0);
      Hierarchy_->Iterate(eX, 1, eY, initialGuessZero);
    }

  } catch (std::exception& e) {
    // TODO: error msg directly on std::cerr?
    std::cerr << "Caught an exception in MueLu::EpetraOperator::ApplyInverse():"
              << std::endl << e.what() << std::endl;
    return -1;
  }
  return 0;
}
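A generic sketch of the aliasing guard above, with hypothetical names: when the caller hands the same buffer for input and output (as AztecOO can), solve into a zeroed temporary and copy back; otherwise zero the output and iterate in place.

#include <vector>
#include <cstring>

struct Solver {                      // stand-in for Hierarchy_->Iterate
  void iterate(const double* x, double* y, int n) const {
    for (int i = 0; i < n; ++i) y[i] += 0.5*x[i];   // toy "V-cycle"
  }
};

void apply_inverse(const Solver& s, const double* x, double* y, int n) {
  if (x == y) {                      // X.Values() == Y.Values() analogue
    std::vector<double> tmp(n, 0.0); // zeroed: the initialGuessZero contract
    s.iterate(x, tmp.data(), n);
    std::memcpy(y, tmp.data(), n*sizeof(double));
  } else {
    std::memset(y, 0, n*sizeof(double));
    s.iterate(x, y, n);
  }
}

int main() {
  Solver s;
  std::vector<double> buf = {1, 2, 3};
  apply_inverse(s, buf.data(), buf.data(), 3);   // aliased call is safe
  return 0;
}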
int DoCopyMultiVector(double** matlabApr, const Epetra_MultiVector& A) {

  int ierr = 0;
  int length = A.GlobalLength();
  int numVectors = A.NumVectors();
  const Epetra_Comm & comm = A.Map().Comm();

  if (comm.MyPID() != 0) {
    if (A.MyLength() != 0) ierr = -1;
  }
  else {
    if (length != A.MyLength()) ierr = -1;
    double* matlabAvalues = *matlabApr;
    double* Aptr = A.Values();
    memcpy((void *)matlabAvalues, (void *)Aptr, sizeof(*Aptr) * length * numVectors);
    *matlabApr += length;
  }

  int ierrGlobal;
  comm.MinAll(&ierr, &ierrGlobal, 1); // If any processor has -1, all return -1
  return ierrGlobal;
}
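A small sketch of the error-propagation idiom at the end of DoCopyMultiVector: each rank contributes a local status (0 = OK, -1 = failure) and a min-reduction (what Epetra's MinAll does; here raw MPI_Allreduce with MPI_MIN) makes every rank agree on the worst status. Assumes an MPI build; compile with mpicxx.

#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  int ierr = (rank == 1) ? -1 : 0;   // pretend rank 1 failed
  int ierrGlobal = 0;
  MPI_Allreduce(&ierr, &ierrGlobal, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);

  if (rank == 0) std::printf("global status = %d\n", ierrGlobal);
  MPI_Finalize();
  return 0;
}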
int ARPACKm3::reSolve(int numEigen, Epetra_MultiVector &Q, double *lambda,
                      int startingEV) {

  // Computes the eigenvalues and the corresponding eigenvectors
  // of the generalized eigenvalue problem
  //
  //      K X = M X Lambda
  //
  // using ARPACK (mode 3).
  //
  // The convergence test is provided by ARPACK.
  //
  // Note that if M is not specified, then K X = X Lambda is solved
  // (still using the mode for the generalized eigenvalue problem).
  //
  // Input variables:
  //
  // numEigen (integer) = Number of eigenmodes requested
  //
  // Q (Epetra_MultiVector) = Initial search space
  //                  The number of columns of Q defines the size of the search space (= NCV).
  //                  The rows of Q are distributed across processors.
  //                  As a rule of thumb in the ARPACK User's Guide, NCV >= 2*numEigen.
  //                  At exit, the first numEigen locations contain the eigenvectors requested.
  //
  // lambda (array of doubles) = Converged eigenvalues
  //                  The length of this array is equal to the number of columns in Q.
  //                  At exit, the first numEigen locations contain the eigenvalues requested.
  //
  // startingEV (integer) = Number of eigenmodes already stored in Q
  //                  A linear combination of these vectors is made to define the starting
  //                  vector, placed in resid.
  //
  // Return information on status of computation
  //
  // info >= 0 >> Number of converged eigenpairs at the end of computation
  //
  // Failure due to input arguments
  //
  // info = -1 >> The stiffness matrix K has not been specified.
  // info = -2 >> The maps for the matrix K and the matrix M differ.
  // info = -3 >> The maps for the matrix K and the preconditioner P differ.
  // info = -4 >> The maps for the vectors and the matrix K differ.
  // info = -5 >> Q is too small for the number of eigenvalues requested.
  // info = -6 >> Q is too small for the computation parameters.
  //
  // info = -8 >> numEigen must be smaller than the dimension of the matrix.
  //
  // info = -30 >> MEMORY
  //
  // See the ARPACK documentation for the meaning of INFO

  if (numEigen <= startingEV) {
    return numEigen;
  }

  int info = myVerify.inputArguments(numEigen, K, M, 0, Q,
                                     minimumSpaceDimension(numEigen));
  if (info < 0)
    return info;

  int myPid = MyComm.MyPID();

  int localSize = Q.MyLength();
  int NCV = Q.NumVectors();
  int knownEV = 0;

  if (NCV > Q.GlobalLength()) {
    if (numEigen >= Q.GlobalLength()) {
      cerr << endl;
      cerr << " !! The number of requested eigenvalues must be smaller than the dimension";
      cerr << " of the matrix !!\n";
      cerr << endl;
      return -8;
    }
    NCV = Q.GlobalLength();
  }

  int localVerbose = verbose*(myPid == 0);

  // Define data for ARPACK
  highMem = (highMem > currentSize()) ? highMem : currentSize();

  int ido = 0;

  int lwI = 22 + NCV;
  int *wI = new (nothrow) int[lwI];
  if (wI == 0) {
    return -30;
  }
  memRequested += sizeof(int)*lwI/(1024.0*1024.0);

  int *iparam = wI;
  int *ipntr = wI + 11;
  int *select = wI + 22;

  int lworkl = NCV*(NCV+8);
  int lwD = lworkl + 4*localSize;
  double *wD = new (nothrow) double[lwD];
  if (wD == 0) {
    delete[] wI;
    return -30;
  }
  memRequested += sizeof(double)*(4*localSize+lworkl)/(1024.0*1024.0);

  double *pointer = wD;

  double *workl = pointer;
  pointer = pointer + lworkl;

  double *resid = pointer;
  pointer = pointer + localSize;

  double *workd = pointer;

  double *v = Q.Values();

  highMem = (highMem > currentSize()) ? highMem : currentSize();

  double sigma = 0.0;

  if (startingEV > 0) {
    // Define the initial starting vector
    memset(resid, 0, localSize*sizeof(double));
    for (int jj = 0; jj < startingEV; ++jj)
      for (int ii = 0; ii < localSize; ++ii)
        resid[ii] += v[ii + jj*localSize];
    info = 1;
  }

  iparam[1-1] = 1;
  iparam[3-1] = maxIterEigenSolve;
  iparam[7-1] = 3;

  // The fourth parameter forces ARPACK to use the convergence test it provides.
  // This requires a customization of ARPACK (provided by R. Lehoucq).
  iparam[4-1] = 0;

  Epetra_Vector v1(View, Q.Map(), workd);
  Epetra_Vector v2(View, Q.Map(), workd + localSize);
  Epetra_Vector v3(View, Q.Map(), workd + 2*localSize);

  double *vTmp = new (nothrow) double[localSize];
  if (vTmp == 0) {
    delete[] wI;
    delete[] wD;
    return -30;
  }
  memRequested += sizeof(double)*localSize/(1024.0*1024.0);

  highMem = (highMem > currentSize()) ? highMem : currentSize();

  if (localVerbose > 0) {
    cout << endl;
    cout << " *|* Problem: ";
    if (M)
      cout << "K*Q = M*Q D ";
    else
      cout << "K*Q = Q D ";
    cout << endl;
    cout << " *|* Algorithm = ARPACK (mode 3)" << endl;
    cout << " *|* Number of requested eigenvalues = " << numEigen << endl;
    cout.precision(2);
    cout.setf(ios::scientific, ios::floatfield);
    cout << " *|* Tolerance for convergence = " << tolEigenSolve << endl;
    if (startingEV > 0)
      cout << " *|* User-defined starting vector (Combination of " << startingEV
           << " vectors)\n";
    cout << "\n -- Start iterations -- \n";
  }

#ifdef EPETRA_MPI
  Epetra_MpiComm *MPIComm = dynamic_cast<Epetra_MpiComm *>(const_cast<Epetra_Comm*>(&MyComm));
#endif

  timeOuterLoop -= MyWatch.WallTime();
  while (ido != 99) {

    highMem = (highMem > currentSize()) ? highMem : currentSize();

#ifdef EPETRA_MPI
    if (MPIComm)
      callFortran.PSAUPD(MPIComm->Comm(), &ido, 'G', localSize, which, numEigen,
                         tolEigenSolve, resid, NCV, v, localSize, iparam, ipntr,
                         workd, workl, lworkl, &info, localVerbose);
    else
      callFortran.SAUPD(&ido, 'G', localSize, which, numEigen, tolEigenSolve,
                        resid, NCV, v, localSize, iparam, ipntr, workd, workl,
                        lworkl, &info, localVerbose);
#else
    callFortran.SAUPD(&ido, 'G', localSize, which, numEigen, tolEigenSolve,
                      resid, NCV, v, localSize, iparam, ipntr, workd, workl,
                      lworkl, &info, localVerbose);
#endif

    if (ido == -1) {
      // Apply the mass matrix
      v3.ResetView(workd + ipntr[0] - 1);
      v1.ResetView(vTmp);
      timeMassOp -= MyWatch.WallTime();
      if (M)
        M->Apply(v3, v1);
      else
        memcpy(v1.Values(), v3.Values(), localSize*sizeof(double));
      timeMassOp += MyWatch.WallTime();
      massOp += 1;
      // Solve the stiffness problem
      v2.ResetView(workd + ipntr[1] - 1);
      timeStifOp -= MyWatch.WallTime();
      K->ApplyInverse(v1, v2);
      timeStifOp += MyWatch.WallTime();
      stifOp += 1;
      continue;
    } // if (ido == -1)

    if (ido == 1) {
      // Solve the stiffness problem
      v1.ResetView(workd + ipntr[2] - 1);
      v2.ResetView(workd + ipntr[1] - 1);
      timeStifOp -= MyWatch.WallTime();
      K->ApplyInverse(v1, v2);
      timeStifOp += MyWatch.WallTime();
      stifOp += 1;
      continue;
    } // if (ido == 1)

    if (ido == 2) {
      // Apply the mass matrix
      v1.ResetView(workd + ipntr[0] - 1);
      v2.ResetView(workd + ipntr[1] - 1);
      timeMassOp -= MyWatch.WallTime();
      if (M)
        M->Apply(v1, v2);
      else
        memcpy(v2.Values(), v1.Values(), localSize*sizeof(double));
      timeMassOp += MyWatch.WallTime();
      massOp += 1;
      continue;
    } // if (ido == 2)

  } // while (ido != 99)
  timeOuterLoop += MyWatch.WallTime();
  highMem = (highMem > currentSize()) ? highMem : currentSize();

  if (info < 0) {
    if (myPid == 0) {
      cerr << endl;
      cerr << " Error with DSAUPD, info = " << info << endl;
      cerr << endl;
    }
  }
  else {
    // Compute the eigenvectors
    timePostProce -= MyWatch.WallTime();
#ifdef EPETRA_MPI
    if (MPIComm)
      callFortran.PSEUPD(MPIComm->Comm(), 1, 'A', select, lambda, v, localSize,
                         sigma, 'G', localSize, which, numEigen, tolEigenSolve,
                         resid, NCV, v, localSize, iparam, ipntr, workd, workl,
                         lworkl, &info);
    else
      callFortran.SEUPD(1, 'A', select, lambda, v, localSize, sigma, 'G',
                        localSize, which, numEigen, tolEigenSolve, resid, NCV,
                        v, localSize, iparam, ipntr, workd, workl, lworkl,
                        &info);
#else
    callFortran.SEUPD(1, 'A', select, lambda, v, localSize, sigma, 'G',
                      localSize, which, numEigen, tolEigenSolve, resid, NCV,
                      v, localSize, iparam, ipntr, workd, workl, lworkl,
                      &info);
#endif
    timePostProce += MyWatch.WallTime();
    highMem = (highMem > currentSize()) ? highMem : currentSize();
    // Treat the error
    if (info != 0) {
      if (myPid == 0) {
        cerr << endl;
        cerr << " Error with DSEUPD, info = " << info << endl;
        cerr << endl;
      }
    }
  } // if (info < 0)

  if (info == 0) {
    outerIter = iparam[3-1];
    knownEV = iparam[5-1];
    orthoOp = iparam[11-1];
  }

  delete[] wI;
  delete[] wD;
  delete[] vTmp;

  return (info == 0) ? knownEV : info;
}
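The while (ido != 99) loop above is ARPACK's reverse-communication protocol: the solver never sees the matrix; it returns a request code plus buffer offsets, and the caller performs the operator application. The toy below (not ARPACK; all names are illustrative) runs a plain power iteration through the same pattern, where code 1 means "please compute y = A*x" and 99 means done.

#include <vector>
#include <cmath>
#include <cstdio>

struct RevComm {
  int it = 0, maxit = 50;
  int step(std::vector<double>& x, const std::vector<double>& y) {
    if (it > 0) {                    // normalize the product from last call
      double nrm = 0.0;
      for (double v : y) nrm += v*v;
      nrm = std::sqrt(nrm);
      for (size_t i = 0; i < x.size(); ++i) x[i] = y[i]/nrm;
    }
    return (++it > maxit) ? 99 : 1;  // 1: request y = A*x; 99: converged/done
  }
};

int main() {
  const int n = 3;
  double A[3][3] = {{2,1,0},{1,3,1},{0,1,4}};
  std::vector<double> x = {1,1,1}, y(n, 0.0);
  RevComm engine;
  int ido = 0;
  while (ido != 99) {
    ido = engine.step(x, y);
    if (ido == 1)                    // caller-side operator application
      for (int i = 0; i < n; ++i) {
        y[i] = 0.0;
        for (int j = 0; j < n; ++j) y[i] += A[i][j]*x[j];
      }
  }
  double lambda = 0.0;               // Rayleigh quotient estimate x^T(Ax)
  for (int i = 0; i < n; ++i) lambda += x[i]*y[i];
  std::printf("dominant eigenvalue ~ %g\n", lambda);
  return 0;
}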
int BlockDACG::reSolve(int numEigen, Epetra_MultiVector &Q, double *lambda,
                       int startingEV) {

  // Computes the smallest eigenvalues and the corresponding eigenvectors
  // of the generalized eigenvalue problem
  //
  //      K X = M X Lambda
  //
  // using a Block Deflation Accelerated Conjugate Gradient algorithm.
  //
  // Note that if M is not specified, then K X = X Lambda is solved.
  //
  // Ref: P. Arbenz & R. Lehoucq, "A comparison of algorithms for modal analysis in the
  //      absence of a sparse direct method", SNL, Technical Report SAND2003-1028J.
  //      With the notations of this report, the coefficient beta is defined as
  //      diag( H^T_{k} G_{k} ) / diag( H^T_{k-1} G_{k-1} )
  //
  // Input variables:
  //
  // numEigen (integer) = Number of eigenmodes requested
  //
  // Q (Epetra_MultiVector) = Converged eigenvectors
  //                  The number of columns of Q must be equal to numEigen + blockSize.
  //                  The rows of Q are distributed across processors.
  //                  At exit, the first numEigen columns contain the eigenvectors requested.
  //
  // lambda (array of doubles) = Converged eigenvalues
  //                  At input, it must be of size numEigen + blockSize.
  //                  At exit, the first numEigen locations contain the eigenvalues requested.
  //
  // startingEV (integer) = Number of existing converged eigenmodes
  //
  // Return information on status of computation
  //
  // info >= 0 >> Number of converged eigenpairs at the end of computation
  //
  // Failure due to input arguments
  //
  // info = -1 >> The stiffness matrix K has not been specified.
  // info = -2 >> The maps for the matrix K and the matrix M differ.
  // info = -3 >> The maps for the matrix K and the preconditioner P differ.
  // info = -4 >> The maps for the vectors and the matrix K differ.
  // info = -5 >> Q is too small for the number of eigenvalues requested.
  // info = -6 >> Q is too small for the computation parameters.
  //
  // info = -10 >> Failure during the mass orthonormalization
  //
  // info = -20 >> Error in LAPACK during the local eigensolve
  //
  // info = -30 >> MEMORY
  //
  // Check the input parameters

  if (numEigen <= startingEV) {
    return startingEV;
  }

  int info = myVerify.inputArguments(numEigen, K, M, Prec, Q, numEigen + blockSize);
  if (info < 0)
    return info;

  int myPid = MyComm.MyPID();

  // Get the weight for approximating the M-inverse norm
  Epetra_Vector *vectWeight = 0;
  if (normWeight) {
    vectWeight = new Epetra_Vector(View, Q.Map(), normWeight);
  }

  int knownEV = startingEV;
  int localVerbose = verbose*(myPid==0);

  // Define local block vectors
  //
  // MX = Working vectors (storing M*X if M is specified, else pointing to X)
  // KX = Working vectors (storing K*X)
  //
  // R = Residuals
  //
  // H = Preconditioned residuals
  //
  // P = Search directions
  // MP = Working vectors (storing M*P if M is specified, else pointing to P)
  // KP = Working vectors (storing K*P)

  int xr = Q.MyLength();

  Epetra_MultiVector X(View, Q, numEigen, blockSize);
  X.Random();

  int tmp;
  tmp = (M == 0) ? 5*blockSize*xr : 7*blockSize*xr;

  double *work1 = new (nothrow) double[tmp];
  if (work1 == 0) {
    if (vectWeight)
      delete vectWeight;
    info = -30;
    return info;
  }
  memRequested += sizeof(double)*tmp/(1024.0*1024.0);

  highMem = (highMem > currentSize()) ? highMem : currentSize();

  double *tmpD = work1;

  Epetra_MultiVector KX(View, Q.Map(), tmpD, xr, blockSize);
  tmpD = tmpD + xr*blockSize;

  Epetra_MultiVector MX(View, Q.Map(), (M) ? tmpD : X.Values(), xr, blockSize);
  tmpD = (M) ? tmpD + xr*blockSize : tmpD;

  Epetra_MultiVector R(View, Q.Map(), tmpD, xr, blockSize);
  tmpD = tmpD + xr*blockSize;

  Epetra_MultiVector H(View, Q.Map(), tmpD, xr, blockSize);
  tmpD = tmpD + xr*blockSize;

  Epetra_MultiVector P(View, Q.Map(), tmpD, xr, blockSize);
  tmpD = tmpD + xr*blockSize;

  Epetra_MultiVector KP(View, Q.Map(), tmpD, xr, blockSize);
  tmpD = tmpD + xr*blockSize;

  Epetra_MultiVector MP(View, Q.Map(), (M) ? tmpD : P.Values(), xr, blockSize);

  // Define arrays
  //
  // theta = Store the local eigenvalues (size: 2*blockSize)
  // normR = Store the norm of residuals (size: blockSize)
  //
  // oldHtR = Store the previous H_i^T*R_i (size: blockSize)
  // currentHtR = Store the current H_i^T*R_i (size: blockSize)
  //
  // MM = Local mass matrix (size: 2*blockSize x 2*blockSize)
  // KK = Local stiffness matrix (size: 2*blockSize x 2*blockSize)
  //
  // S = Local eigenvectors (size: 2*blockSize x 2*blockSize)

  int lwork2;
  lwork2 = 5*blockSize + 12*blockSize*blockSize;
  double *work2 = new (nothrow) double[lwork2];
  if (work2 == 0) {
    if (vectWeight)
      delete vectWeight;
    delete[] work1;
    info = -30;
    return info;
  }

  highMem = (highMem > currentSize()) ? highMem : currentSize();

  tmpD = work2;

  double *theta = tmpD;
  tmpD = tmpD + 2*blockSize;

  double *normR = tmpD;
  tmpD = tmpD + blockSize;

  double *oldHtR = tmpD;
  tmpD = tmpD + blockSize;

  double *currentHtR = tmpD;
  tmpD = tmpD + blockSize;
  memset(currentHtR, 0, blockSize*sizeof(double));

  double *MM = tmpD;
  tmpD = tmpD + 4*blockSize*blockSize;

  double *KK = tmpD;
  tmpD = tmpD + 4*blockSize*blockSize;

  double *S = tmpD;

  memRequested += sizeof(double)*lwork2/(1024.0*1024.0);

  // Define an array to store the residuals history
  if (localVerbose > 2) {
    resHistory = new (nothrow) double[maxIterEigenSolve*blockSize];
    if (resHistory == 0) {
      if (vectWeight)
        delete vectWeight;
      delete[] work1;
      delete[] work2;
      info = -30;
      return info;
    }
    historyCount = 0;
  }

  // Miscellaneous definitions
  bool reStart = false;
  numRestart = 0;

  int localSize;
  int twoBlocks = 2*blockSize;
  int nFound = blockSize;
  int i, j;

  if (localVerbose > 0) {
    cout << endl;
    cout << " *|* Problem: ";
    if (M)
      cout << "K*Q = M*Q D ";
    else
      cout << "K*Q = Q D ";
    if (Prec)
      cout << " with preconditioner";
    cout << endl;
    cout << " *|* Algorithm = DACG (block version)" << endl;
    cout << " *|* Size of blocks = " << blockSize << endl;
    cout << " *|* Number of requested eigenvalues = " << numEigen << endl;
    cout.precision(2);
    cout.setf(ios::scientific, ios::floatfield);
    cout << " *|* Tolerance for convergence = " << tolEigenSolve << endl;
    cout << " *|* Norm used for convergence: ";
    if (normWeight)
      cout << "weighted L2-norm with user-provided weights" << endl;
    else
      cout << "L^2-norm" << endl;
    if (startingEV > 0)
      cout << " *|* Input converged eigenvectors = " << startingEV << endl;
    cout << "\n -- Start iterations -- \n";
  }

  timeOuterLoop -= MyWatch.WallTime();
  for (outerIter = 1; outerIter <= maxIterEigenSolve; ++outerIter) {

    highMem = (highMem > currentSize()) ? highMem : currentSize();

    if ((outerIter == 1) || (reStart == true)) {

      reStart = false;
      localSize = blockSize;

      if (nFound > 0) {

        Epetra_MultiVector X2(View, X, blockSize-nFound, nFound);
        Epetra_MultiVector MX2(View, MX, blockSize-nFound, nFound);
        Epetra_MultiVector KX2(View, KX, blockSize-nFound, nFound);

        // Apply the mass matrix to X
        timeMassOp -= MyWatch.WallTime();
        if (M)
          M->Apply(X2, MX2);
        timeMassOp += MyWatch.WallTime();
        massOp += nFound;

        if (knownEV > 0) {
          // Orthonormalize X against the known eigenvectors with Gram-Schmidt
          // Note: Use R as a temporary work space
          Epetra_MultiVector copyQ(View, Q, 0, knownEV);
          timeOrtho -= MyWatch.WallTime();
          info = modalTool.massOrthonormalize(X, MX, M, copyQ, nFound, 0, R.Values());
          timeOrtho += MyWatch.WallTime();
          // Exit the code if the orthogonalization did not succeed
          if (info < 0) {
            info = -10;
            delete[] work1;
            delete[] work2;
            if (vectWeight)
              delete vectWeight;
            return info;
          }
        }

        // Apply the stiffness matrix to X
        timeStifOp -= MyWatch.WallTime();
        K->Apply(X2, KX2);
        timeStifOp += MyWatch.WallTime();
        stifOp += nFound;

      } // if (nFound > 0)

    } // if ((outerIter == 1) || (reStart == true))
    else {

      // Apply the preconditioner on the residuals
      if (Prec != 0) {
        timePrecOp -= MyWatch.WallTime();
        Prec->ApplyInverse(R, H);
        timePrecOp += MyWatch.WallTime();
        precOp += blockSize;
      }
      else {
        memcpy(H.Values(), R.Values(), xr*blockSize*sizeof(double));
      }

      // Compute the product H^T*R
      timeSearchP -= MyWatch.WallTime();
      memcpy(oldHtR, currentHtR, blockSize*sizeof(double));
      H.Dot(R, currentHtR);

      // Define the new search directions
      if (localSize == blockSize) {
        P.Scale(-1.0, H);
        localSize = twoBlocks;
      } // if (localSize == blockSize)
      else {
        bool hasZeroDot = false;
        for (j = 0; j < blockSize; ++j) {
          if (oldHtR[j] == 0.0) {
            hasZeroDot = true;
            break;
          }
          callBLAS.SCAL(xr, currentHtR[j]/oldHtR[j], P.Values() + j*xr);
        }
        if (hasZeroDot == true) {
          // Restart the computation when there is a null dot product
          if (localVerbose > 0) {
            cout << endl;
            cout << " !! Null dot product -- Restart the search space !!\n";
            cout << endl;
          }
          if (blockSize == 1) {
            X.Random();
            nFound = blockSize;
          }
          else {
            Epetra_MultiVector Xinit(View, X, j, blockSize-j);
            Xinit.Random();
            nFound = blockSize - j;
          } // if (blockSize == 1)
          reStart = true;
          numRestart += 1;
          info = 0;
          continue;
        }
        callBLAS.AXPY(xr*blockSize, -1.0, H.Values(), P.Values());
      } // if (localSize == blockSize)
      timeSearchP += MyWatch.WallTime();

      // Apply the mass matrix on P
      timeMassOp -= MyWatch.WallTime();
      if (M)
        M->Apply(P, MP);
      timeMassOp += MyWatch.WallTime();
      massOp += blockSize;

      if (knownEV > 0) {
        // Orthogonalize P against the known eigenvectors
        // Note: Use R as a temporary work space
        Epetra_MultiVector copyQ(View, Q, 0, knownEV);
        timeOrtho -= MyWatch.WallTime();
        modalTool.massOrthonormalize(P, MP, M, copyQ, blockSize, 1, R.Values());
        timeOrtho += MyWatch.WallTime();
      }

      // Apply the stiffness matrix to P
      timeStifOp -= MyWatch.WallTime();
      K->Apply(P, KP);
      timeStifOp += MyWatch.WallTime();
      stifOp += blockSize;

    } // if ((outerIter == 1) || (reStart == true))

    // Form the "local" mass and stiffness matrices
    // Note: Use S as a temporary workspace
    timeLocalProj -= MyWatch.WallTime();
    modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, KX.Values(), xr,
                              KK, localSize, S);
    modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, MX.Values(), xr,
                              MM, localSize, S);
    if (localSize > blockSize) {
      modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, KP.Values(), xr,
                                KK + blockSize*localSize, localSize, S);
      modalTool.localProjection(blockSize, blockSize, xr, P.Values(), xr, KP.Values(), xr,
                                KK + blockSize*localSize + blockSize, localSize, S);
      modalTool.localProjection(blockSize, blockSize, xr, X.Values(), xr, MP.Values(), xr,
                                MM + blockSize*localSize, localSize, S);
      modalTool.localProjection(blockSize, blockSize, xr, P.Values(), xr, MP.Values(), xr,
                                MM + blockSize*localSize + blockSize, localSize, S);
    } // if (localSize > blockSize)
    timeLocalProj += MyWatch.WallTime();

    // Perform a spectral decomposition
    timeLocalSolve -= MyWatch.WallTime();
    int nevLocal = localSize;
    info = modalTool.directSolver(localSize, KK, localSize, MM, localSize, nevLocal,
                                  S, localSize, theta, localVerbose,
                                  (blockSize == 1) ? 1 : 0);
    timeLocalSolve += MyWatch.WallTime();

    if (info < 0) {
      // Stop when the spectral decomposition has a critical failure
      break;
    }

    // Check for restarting
    if ((theta[0] < 0.0) || (nevLocal < blockSize)) {
      if (localVerbose > 0) {
        cout << " Iteration " << outerIter;
        cout << " - Failure for spectral decomposition - RESTART with new random search\n";
      }
      if (blockSize == 1) {
        X.Random();
        nFound = blockSize;
      }
      else {
        Epetra_MultiVector Xinit(View, X, 1, blockSize-1);
        Xinit.Random();
        nFound = blockSize - 1;
      } // if (blockSize == 1)
      reStart = true;
      numRestart += 1;
      info = 0;
      continue;
    } // if ((theta[0] < 0.0) || (nevLocal < blockSize))

    if ((localSize == twoBlocks) && (nevLocal == blockSize)) {
      for (j = 0; j < nevLocal; ++j)
        memcpy(S + j*blockSize, S + j*twoBlocks, blockSize*sizeof(double));
      localSize = blockSize;
    }

    // Check the direction of the eigenvectors
    // Note: This sign check is important for convergence
    for (j = 0; j < nevLocal; ++j) {
      double coeff = S[j + j*localSize];
      if (coeff < 0.0)
        callBLAS.SCAL(localSize, -1.0, S + j*localSize);
    }

    // Compute the residuals
    timeResidual -= MyWatch.WallTime();
    callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, KX.Values(), xr,
                  S, localSize, 0.0, R.Values(), xr);
    if (localSize == twoBlocks) {
      callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, KP.Values(), xr,
                    S + blockSize, localSize, 1.0, R.Values(), xr);
    }
    for (j = 0; j < blockSize; ++j)
      callBLAS.SCAL(localSize, theta[j], S + j*localSize);
    callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, -1.0, MX.Values(), xr,
                  S, localSize, 1.0, R.Values(), xr);
    if (localSize == twoBlocks) {
      callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, -1.0, MP.Values(), xr,
                    S + blockSize, localSize, 1.0, R.Values(), xr);
    }
    for (j = 0; j < blockSize; ++j)
      callBLAS.SCAL(localSize, 1.0/theta[j], S + j*localSize);
    timeResidual += MyWatch.WallTime();

    // Compute the norms of the residuals
    timeNorm -= MyWatch.WallTime();
    if (vectWeight)
      R.NormWeighted(*vectWeight, normR);
    else
      R.Norm2(normR);

    // Scale the norms of residuals with the eigenvalues
    // Count the converged eigenvectors
    nFound = 0;
    for (j = 0; j < blockSize; ++j) {
      normR[j] = (theta[j] == 0.0) ? normR[j] : normR[j]/theta[j];
      if (normR[j] < tolEigenSolve)
        nFound += 1;
    }
    timeNorm += MyWatch.WallTime();

    // Store the residual history
    if (localVerbose > 2) {
      memcpy(resHistory + historyCount*blockSize, normR, blockSize*sizeof(double));
      historyCount += 1;
    }

    // Print information on the current iteration
    if (localVerbose > 0) {
      cout << " Iteration " << outerIter << " - Number of converged eigenvectors ";
      cout << knownEV + nFound << endl;
    }

    if (localVerbose > 1) {
      cout << endl;
      cout.precision(2);
      cout.setf(ios::scientific, ios::floatfield);
      for (i=0; i<blockSize; ++i) {
        cout << " Iteration " << outerIter << " - Scaled Norm of Residual " << i;
        cout << " = " << normR[i] << endl;
      }
      cout << endl;
      cout.precision(2);
      for (i=0; i<blockSize; ++i) {
        cout << " Iteration " << outerIter << " - Ritz eigenvalue " << i;
        cout.setf((fabs(theta[i]) < 0.01) ? ios::scientific : ios::fixed,
                  ios::floatfield);
        cout << " = " << theta[i] << endl;
      }
      cout << endl;
    }

    if (nFound == 0) {
      // Update the spaces
      // Note: Use H as a temporary work space
      timeLocalUpdate -= MyWatch.WallTime();
      memcpy(H.Values(), X.Values(), xr*blockSize*sizeof(double));
      callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, H.Values(), xr,
                    S, localSize, 0.0, X.Values(), xr);
      memcpy(H.Values(), KX.Values(), xr*blockSize*sizeof(double));
      callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, H.Values(), xr,
                    S, localSize, 0.0, KX.Values(), xr);
      if (M) {
        memcpy(H.Values(), MX.Values(), xr*blockSize*sizeof(double));
        callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, H.Values(), xr,
                      S, localSize, 0.0, MX.Values(), xr);
      }
      if (localSize == twoBlocks) {
        callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, P.Values(), xr,
                      S + blockSize, localSize, 1.0, X.Values(), xr);
        callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, KP.Values(), xr,
                      S + blockSize, localSize, 1.0, KX.Values(), xr);
        if (M) {
          callBLAS.GEMM('N', 'N', xr, blockSize, blockSize, 1.0, MP.Values(), xr,
                        S + blockSize, localSize, 1.0, MX.Values(), xr);
        }
      } // if (localSize == twoBlocks)
      timeLocalUpdate += MyWatch.WallTime();
      // When required, monitor some orthogonalities
      if (verbose > 2) {
        if (knownEV == 0) {
          accuracyCheck(&X, &MX, &R, 0, (localSize>blockSize) ? &P : 0);
        }
        else {
          Epetra_MultiVector copyQ(View, Q, 0, knownEV);
          accuracyCheck(&X, &MX, &R, &copyQ, (localSize>blockSize) ? &P : 0);
        }
      } // if (verbose > 2)
      continue;
    } // if (nFound == 0)

    // Order the Ritz eigenvectors by putting the converged vectors at the beginning
    int firstIndex = blockSize;
    for (j = 0; j < blockSize; ++j) {
      if (normR[j] >= tolEigenSolve) {
        firstIndex = j;
        break;
      }
    } // for (j = 0; j < blockSize; ++j)
    while (firstIndex < nFound) {
      for (j = firstIndex; j < blockSize; ++j) {
        if (normR[j] < tolEigenSolve) {
          // Swap the j-th and firstIndex-th position
          callFortran.SWAP(localSize, S + j*localSize, 1, S + firstIndex*localSize, 1);
          callFortran.SWAP(1, theta + j, 1, theta + firstIndex, 1);
          callFortran.SWAP(1, normR + j, 1, normR + firstIndex, 1);
          break;
        }
      } // for (j = firstIndex; j < blockSize; ++j)
      for (j = 0; j < blockSize; ++j) {
        if (normR[j] >= tolEigenSolve) {
          firstIndex = j;
          break;
        }
      } // for (j = 0; j < blockSize; ++j)
    } // while (firstIndex < nFound)

    // Copy the converged eigenvalues
    memcpy(lambda + knownEV, theta, nFound*sizeof(double));

    // Convergence test
    if (knownEV + nFound >= numEigen) {
      callBLAS.GEMM('N', 'N', xr, nFound, blockSize, 1.0, X.Values(), xr,
                    S, localSize, 0.0, R.Values(), xr);
      if (localSize > blockSize) {
        callBLAS.GEMM('N', 'N', xr, nFound, blockSize, 1.0, P.Values(), xr,
                      S + blockSize, localSize, 1.0, R.Values(), xr);
      }
      memcpy(Q.Values() + knownEV*xr, R.Values(), nFound*xr*sizeof(double));
      knownEV += nFound;
      if (localVerbose == 1) {
        cout << endl;
        cout.precision(2);
        cout.setf(ios::scientific, ios::floatfield);
        for (i=0; i<blockSize; ++i) {
          cout << " Iteration " << outerIter << " - Scaled Norm of Residual " << i;
          cout << " = " << normR[i] << endl;
        }
        cout << endl;
      }
      break;
    }

    // Store the converged eigenvalues and eigenvectors
    callBLAS.GEMM('N', 'N', xr, nFound, blockSize, 1.0, X.Values(), xr,
                  S, localSize, 0.0, Q.Values() + knownEV*xr, xr);
    if (localSize == twoBlocks) {
      callBLAS.GEMM('N', 'N', xr, nFound, blockSize, 1.0, P.Values(), xr,
                    S + blockSize, localSize, 1.0, Q.Values() + knownEV*xr, xr);
    }
    knownEV += nFound;

    // Define the restarting vectors
    timeRestart -= MyWatch.WallTime();
    int leftOver = (nevLocal < blockSize + nFound) ? nevLocal - nFound : blockSize;
    double *Snew = S + nFound*localSize;
    memcpy(H.Values(), X.Values(), blockSize*xr*sizeof(double));
    callBLAS.GEMM('N', 'N', xr, leftOver, blockSize, 1.0, H.Values(), xr,
                  Snew, localSize, 0.0, X.Values(), xr);
    memcpy(H.Values(), KX.Values(), blockSize*xr*sizeof(double));
    callBLAS.GEMM('N', 'N', xr, leftOver, blockSize, 1.0, H.Values(), xr,
                  Snew, localSize, 0.0, KX.Values(), xr);
    if (M) {
      memcpy(H.Values(), MX.Values(), blockSize*xr*sizeof(double));
      callBLAS.GEMM('N', 'N', xr, leftOver, blockSize, 1.0, H.Values(), xr,
                    Snew, localSize, 0.0, MX.Values(), xr);
    }
    if (localSize == twoBlocks) {
      callBLAS.GEMM('N', 'N', xr, leftOver, blockSize, 1.0, P.Values(), xr,
                    Snew+blockSize, localSize, 1.0, X.Values(), xr);
      callBLAS.GEMM('N', 'N', xr, leftOver, blockSize, 1.0, KP.Values(), xr,
                    Snew+blockSize, localSize, 1.0, KX.Values(), xr);
      if (M) {
        callBLAS.GEMM('N', 'N', xr, leftOver, blockSize, 1.0, MP.Values(), xr,
                      Snew+blockSize, localSize, 1.0, MX.Values(), xr);
      }
    } // if (localSize == twoBlocks)
    if (nevLocal < blockSize + nFound) {
      // Put new random vectors at the end of the block
      Epetra_MultiVector Xtmp(View, X, leftOver, blockSize - leftOver);
      Xtmp.Random();
    }
    else {
      nFound = 0;
    } // if (nevLocal < blockSize + nFound)
    reStart = true;
    timeRestart += MyWatch.WallTime();

  } // for (outerIter = 1; outerIter <= maxIterEigenSolve; ++outerIter)
  timeOuterLoop += MyWatch.WallTime();
  highMem = (highMem > currentSize()) ? highMem : currentSize();

  // Clean memory
  delete[] work1;
  delete[] work2;
  if (vectWeight)
    delete vectWeight;

  // Sort the eigenpairs
  timePostProce -= MyWatch.WallTime();
  if ((info == 0) && (knownEV > 0)) {
    mySort.sortScalars_Vectors(knownEV, lambda, Q.Values(), Q.MyLength());
  }
  timePostProce += MyWatch.WallTime();

  return (info == 0) ? knownEV : info;
}
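A small standalone sketch of the reordering step in the loop above: converged Ritz pairs are moved in front of the unconverged ones while the parallel arrays stay in sync, which is what the SWAP calls do for S, theta and normR. Names and data here are illustrative.

#include <algorithm>
#include <vector>
#include <cstdio>

int main() {
  const double tol = 1e-3;
  std::vector<double> normR = {1e-5, 0.2, 1e-6, 0.4};   // scaled residuals
  std::vector<double> theta = {1.0, 2.0, 3.0, 4.0};     // Ritz values

  const int n = static_cast<int>(normR.size());
  int firstIndex = 0;                     // first unconverged position
  while (firstIndex < n && normR[firstIndex] < tol) ++firstIndex;
  for (int j = firstIndex + 1; j < n; ++j)
    if (normR[j] < tol) {                 // converged pair found further out
      std::swap(normR[j], normR[firstIndex]);
      std::swap(theta[j], theta[firstIndex]);
      ++firstIndex;
      while (firstIndex < n && normR[firstIndex] < tol) ++firstIndex;
    }
  for (int j = 0; j < n; ++j)
    std::printf("theta = %g, normR = %g\n", theta[j], normR[j]);
  return 0;
}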
int BlockPCGSolver::Solve(const Epetra_MultiVector &X, Epetra_MultiVector &Y,
                          int blkSize) const {

  int xrow = X.MyLength();
  int xcol = X.NumVectors();
  int ycol = Y.NumVectors();

  int info = 0;
  int localVerbose = verbose*(MyComm.MyPID() == 0);

  double *valX = X.Values();

  int NB = 3 + callLAPACK.ILAENV(1, "hetrd", "u", blkSize);
  int lworkD = (blkSize > NB) ? blkSize*blkSize : NB*blkSize;

  int wSize = 4*blkSize*xrow + 3*blkSize + 2*blkSize*blkSize + lworkD;

  bool useY = true;
  if (ycol % blkSize != 0) {
    // Allocate an extra block to store the solutions
    wSize += blkSize*xrow;
    useY = false;
  }

  if (lWorkSpace < wSize) {
    delete[] workSpace;
    workSpace = new (std::nothrow) double[wSize];
    if (workSpace == 0) {
      info = -1;
      return info;
    }
    lWorkSpace = wSize;
  } // if (lWorkSpace < wSize)

  double *pointer = workSpace;

  // Array to store the matrix PtKP
  double *PtKP = pointer;
  pointer = pointer + blkSize*blkSize;

  // Array to store coefficient matrices
  double *coeff = pointer;
  pointer = pointer + blkSize*blkSize;

  // Workspace array
  double *workD = pointer;
  pointer = pointer + lworkD;

  // Array to store the eigenvalues of P^t K P
  double *da = pointer;
  pointer = pointer + blkSize;

  // Array to store the norms of right hand sides
  double *initNorm = pointer;
  pointer = pointer + blkSize;

  // Array to store the norms of residuals
  double *resNorm = pointer;
  pointer = pointer + blkSize;

  // Array to store the residuals
  double *valR = pointer;
  pointer = pointer + xrow*blkSize;
  Epetra_MultiVector R(View, X.Map(), valR, xrow, blkSize);

  // Array to store the preconditioned residuals
  double *valZ = pointer;
  pointer = pointer + xrow*blkSize;
  Epetra_MultiVector Z(View, X.Map(), valZ, xrow, blkSize);

  // Array to store the search directions
  double *valP = pointer;
  pointer = pointer + xrow*blkSize;
  Epetra_MultiVector P(View, X.Map(), valP, xrow, blkSize);

  // Array to store the image of the search directions
  double *valKP = pointer;
  pointer = pointer + xrow*blkSize;
  Epetra_MultiVector KP(View, X.Map(), valKP, xrow, blkSize);

  // Pointer to store the solutions
  double *valSOL = (useY == true) ? Y.Values() : pointer;

  int iRHS;
  for (iRHS = 0; iRHS < xcol; iRHS += blkSize) {

    int numVec = (iRHS + blkSize < xcol) ? blkSize : xcol - iRHS;

    // Set the initial residuals to the right hand sides
    // (fill any extra columns of a partial block with random vectors)
    if (numVec < blkSize) {
      R.Random();
    }
    memcpy(valR, valX + iRHS*xrow, numVec*xrow*sizeof(double));

    // Set the initial guess to zero
    valSOL = (useY == true) ? Y.Values() + iRHS*xrow : valSOL;
    Epetra_MultiVector SOL(View, X.Map(), valSOL, xrow, blkSize);
    SOL.PutScalar(0.0);

    int ii = 0;
    int iter = 0;
    int nFound = 0;

    R.Norm2(initNorm);

    if (localVerbose > 1) {
      std::cout << std::endl;
      std::cout << " Vectors " << iRHS << " to " << iRHS + numVec - 1 << std::endl;
      if (localVerbose > 2) {
        std::cout << std::endl;
        for (ii = 0; ii < numVec; ++ii) {
          std::cout << " ... Initial Residual Norm " << ii << " = " << initNorm[ii]
                    << std::endl;
        }
        std::cout << std::endl;
      }
    }

    // Iteration loop
    for (iter = 1; iter <= iterMax; ++iter) {

      // Apply the preconditioner
      if (Prec)
        Prec->ApplyInverse(R, Z);
      else
        Z = R;

      // Define the new search directions
      if (iter == 1) {
        P = Z;
      }
      else {
        // Compute P^t K Z
        callBLAS.GEMM(Teuchos::TRANS, Teuchos::NO_TRANS, blkSize, blkSize, xrow,
                      1.0, KP.Values(), xrow, Z.Values(), xrow, 0.0, workD, blkSize);
        MyComm.SumAll(workD, coeff, blkSize*blkSize);

        // Compute the coefficient (P^t K P)^{-1} P^t K Z
        callBLAS.GEMM(Teuchos::TRANS, Teuchos::NO_TRANS, blkSize, blkSize, blkSize,
                      1.0, PtKP, blkSize, coeff, blkSize, 0.0, workD, blkSize);
        for (ii = 0; ii < blkSize; ++ii)
          callBLAS.SCAL(blkSize, da[ii], workD + ii, blkSize);
        callBLAS.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, blkSize, blkSize, blkSize,
                      1.0, PtKP, blkSize, workD, blkSize, 0.0, coeff, blkSize);

        // Update the search directions
        // Note: Use KP as a workspace
        memcpy(KP.Values(), P.Values(), xrow*blkSize*sizeof(double));
        callBLAS.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, xrow, blkSize, blkSize,
                      1.0, KP.Values(), xrow, coeff, blkSize, 0.0, P.Values(), xrow);

        P.Update(1.0, Z, -1.0);

      } // if (iter == 1)

      K->Apply(P, KP);

      // Compute P^t K P
      callBLAS.GEMM(Teuchos::TRANS, Teuchos::NO_TRANS, blkSize, blkSize, xrow,
                    1.0, P.Values(), xrow, KP.Values(), xrow, 0.0, workD, blkSize);
      MyComm.SumAll(workD, PtKP, blkSize*blkSize);

      // Eigenvalue decomposition of P^t K P
      callLAPACK.SYEV('V', 'U', blkSize, PtKP, blkSize, da, workD, lworkD, &info);
      if (info) {
        // Break the loop as the spectral decomposition failed
        break;
      } // if (info)

      // Compute the pseudo-inverse of the eigenvalues
      for (ii = 0; ii < blkSize; ++ii) {
        TEUCHOS_TEST_FOR_EXCEPTION(da[ii] < 0.0, std::runtime_error, "Negative "
                                   "eigenvalue for P^T K P: da[" << ii << "] = "
                                   << da[ii] << ".");
        da[ii] = (da[ii] == 0.0) ? 0.0 : 1.0/da[ii];
      } // for (ii = 0; ii < blkSize; ++ii)

      // Compute P^t R
      callBLAS.GEMM(Teuchos::TRANS, Teuchos::NO_TRANS, blkSize, blkSize, xrow,
                    1.0, P.Values(), xrow, R.Values(), xrow, 0.0, workD, blkSize);
      MyComm.SumAll(workD, coeff, blkSize*blkSize);

      // Compute the coefficient (P^t K P)^{-1} P^t R
      callBLAS.GEMM(Teuchos::TRANS, Teuchos::NO_TRANS, blkSize, blkSize, blkSize,
                    1.0, PtKP, blkSize, coeff, blkSize, 0.0, workD, blkSize);
      for (ii = 0; ii < blkSize; ++ii)
        callBLAS.SCAL(blkSize, da[ii], workD + ii, blkSize);
      callBLAS.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, blkSize, blkSize, blkSize,
                    1.0, PtKP, blkSize, workD, blkSize, 0.0, coeff, blkSize);

      // Update the solutions
      callBLAS.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, xrow, blkSize, blkSize,
                    1.0, P.Values(), xrow, coeff, blkSize, 1.0, valSOL, xrow);

      // Update the residuals
      callBLAS.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, xrow, blkSize, blkSize,
                    -1.0, KP.Values(), xrow, coeff, blkSize, 1.0, R.Values(), xrow);

      // Check convergence
      R.Norm2(resNorm);
      nFound = 0;
      for (ii = 0; ii < numVec; ++ii) {
        if (resNorm[ii] <= tolCG*initNorm[ii])
          nFound += 1;
      }

      if (localVerbose > 1) {
        std::cout << " Vectors " << iRHS << " to " << iRHS + numVec - 1;
        std::cout << " -- Iteration " << iter << " -- " << nFound
                  << " converged vectors\n";
        if (localVerbose > 2) {
          std::cout << std::endl;
          for (ii = 0; ii < numVec; ++ii) {
            std::cout << " ... ";
            std::cout.width(5);
            std::cout << ii << " ... Residual = ";
            std::cout.precision(2);
            std::cout.setf(std::ios::scientific, std::ios::floatfield);
            std::cout << resNorm[ii] << " ... Right Hand Side = " << initNorm[ii]
                      << std::endl;
          }
          std::cout << std::endl;
        }
      }

      if (nFound == numVec) {
        break;
      }

    } // for (iter = 1; iter <= iterMax; ++iter)

    if (useY == false) {
      // Copy the solutions back into Y
      memcpy(Y.Values() + xrow*iRHS, valSOL, numVec*xrow*sizeof(double));
    }

    numSolve += nFound;

    if (nFound == numVec) {
      minIter = (iter < minIter) ? iter : minIter;
      maxIter = (iter > maxIter) ? iter : maxIter;
      sumIter += iter;
    }

  } // for (iRHS = 0; iRHS < xcol; iRHS += blkSize)

  return info;
}
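A sketch of the coefficient solve inside the block CG loop above: once the spectral factorization P^T K P = V diag(d) V^T is in hand (V and d are hardcoded here for a 2x2 example; the real code gets them from SYEV), alpha = (P^T K P)^+ b is applied as V diag(1/d_i for d_i != 0) V^T b, which tolerates a rank-deficient block exactly as the da[ii] pseudo-inverse does.

#include <vector>
#include <cstdio>

int main() {
  const int k = 2;
  // Orthonormal eigenvectors (columns, column-major) and eigenvalues of S.
  double V[k*k] = {0.8, 0.6, -0.6, 0.8};
  double d[k]   = {5.0, 0.0};              // one zero eigenvalue: use pinv
  std::vector<double> b = {1.0, 2.0}, t(k, 0.0), alpha(k, 0.0);

  for (int j = 0; j < k; ++j)              // t = V^T b
    for (int i = 0; i < k; ++i) t[j] += V[i + j*k]*b[i];
  for (int j = 0; j < k; ++j)              // t = diag(d)^+ t
    t[j] = (d[j] == 0.0) ? 0.0 : t[j]/d[j];
  for (int j = 0; j < k; ++j)              // alpha = V t
    for (int i = 0; i < k; ++i) alpha[i] += V[i + j*k]*t[j];

  std::printf("alpha = (%g, %g)\n", alpha[0], alpha[1]);
  return 0;
}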
int BlockPCGSolver::Solve(const Epetra_MultiVector &X, Epetra_MultiVector &Y) const {

  int info = 0;
  int localVerbose = verbose*(MyComm.MyPID() == 0);

  int xr = X.MyLength();

  int wSize = 3*xr;

  if (lWorkSpace < wSize) {
    if (workSpace)
      delete[] workSpace;
    workSpace = new (std::nothrow) double[wSize];
    if (workSpace == 0) {
      info = -1;
      return info;
    }
    lWorkSpace = wSize;
  } // if (lWorkSpace < wSize)

  double *pointer = workSpace;

  Epetra_Vector r(View, X.Map(), pointer);
  pointer = pointer + xr;

  Epetra_Vector p(View, X.Map(), pointer);
  pointer = pointer + xr;

  // Note: Kp and z use the same memory space
  Epetra_Vector Kp(View, X.Map(), pointer);
  Epetra_Vector z(View, X.Map(), pointer);

  double tmp;
  double initNorm = 0.0, rNorm = 0.0, newRZ = 0.0, oldRZ = 0.0, alpha = 0.0;
  double tolSquare = tolCG*tolCG;

  memcpy(r.Values(), X.Values(), xr*sizeof(double));
  tmp = callBLAS.DOT(xr, r.Values(), 1, r.Values(), 1);
  MyComm.SumAll(&tmp, &initNorm, 1);

  Y.PutScalar(0.0);

  if (localVerbose > 1) {
    std::cout << std::endl;
    std::cout << " --- PCG Iterations --- " << std::endl;
  }

  int iter;
  for (iter = 1; iter <= iterMax; ++iter) {

    if (Prec) {
      Prec->ApplyInverse(r, z);
    }
    else {
      memcpy(z.Values(), r.Values(), xr*sizeof(double));
    }

    if (iter == 1) {
      tmp = callBLAS.DOT(xr, r.Values(), 1, z.Values(), 1);
      MyComm.SumAll(&tmp, &newRZ, 1);
      memcpy(p.Values(), z.Values(), xr*sizeof(double));
    }
    else {
      oldRZ = newRZ;
      tmp = callBLAS.DOT(xr, r.Values(), 1, z.Values(), 1);
      MyComm.SumAll(&tmp, &newRZ, 1);
      p.Update(1.0, z, newRZ/oldRZ);
    }

    K->Apply(p, Kp);

    tmp = callBLAS.DOT(xr, p.Values(), 1, Kp.Values(), 1);
    MyComm.SumAll(&tmp, &alpha, 1);
    alpha = newRZ/alpha;

    TEUCHOS_TEST_FOR_EXCEPTION(alpha <= 0.0, std::runtime_error,
                               " !!! Non-positive value for p^TKp (" << alpha
                               << ") !!!");

    callBLAS.AXPY(xr, alpha, p.Values(), 1, Y.Values(), 1);

    alpha *= -1.0;
    callBLAS.AXPY(xr, alpha, Kp.Values(), 1, r.Values(), 1);

    // Check convergence
    tmp = callBLAS.DOT(xr, r.Values(), 1, r.Values(), 1);
    MyComm.SumAll(&tmp, &rNorm, 1);

    if (localVerbose > 1) {
      std::cout << " Iter. " << iter;
      std::cout.precision(4);
      std::cout.setf(std::ios::scientific, std::ios::floatfield);
      std::cout << " Residual reduction " << std::sqrt(rNorm/initNorm) << std::endl;
    }

    if (rNorm <= tolSquare*initNorm)
      break;

  } // for (iter = 1; iter <= iterMax; ++iter)

  if (localVerbose == 1) {
    std::cout << std::endl;
    std::cout << " --- End of PCG solve ---" << std::endl;
    std::cout << " Iter. " << iter;
    std::cout.precision(4);
    std::cout.setf(std::ios::scientific, std::ios::floatfield);
    std::cout << " Residual reduction " << std::sqrt(rNorm/initNorm) << std::endl;
    std::cout << std::endl;
  }

  if (localVerbose > 1) {
    std::cout << std::endl;
  }

  numSolve += 1;
  minIter = (iter < minIter) ? iter : minIter;
  maxIter = (iter > maxIter) ? iter : maxIter;
  sumIter += iter;

  return info;
}
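A compact, self-contained PCG on a small SPD system, mirroring the structure above: z = M^{-1} r, beta = (r,z)_new/(r,z)_old, p = z + beta p, alpha = (r,z)/(p,Kp), with the same squared-norm convergence test (rNorm <= tol^2 * initNorm). A Jacobi preconditioner stands in for Prec; all data is illustrative.

#include <vector>
#include <cstdio>

int main() {
  const int n = 3, iterMax = 50;
  const double tol = 1e-10;
  double K[3][3] = {{4,1,0},{1,4,1},{0,1,4}};
  std::vector<double> b = {1,2,3}, x(n,0.0), r = b, z(n), p(n), Kp(n);

  double initNorm = 0.0;
  for (int i = 0; i < n; ++i) initNorm += r[i]*r[i];
  double newRZ = 0.0, oldRZ = 0.0;

  for (int iter = 1; iter <= iterMax; ++iter) {
    for (int i = 0; i < n; ++i) z[i] = r[i]/K[i][i];   // Jacobi "Prec"
    oldRZ = newRZ;
    newRZ = 0.0;
    for (int i = 0; i < n; ++i) newRZ += r[i]*z[i];
    if (iter == 1) p = z;
    else
      for (int i = 0; i < n; ++i) p[i] = z[i] + (newRZ/oldRZ)*p[i];
    for (int i = 0; i < n; ++i) {                      // Kp = K*p
      Kp[i] = 0.0;
      for (int j = 0; j < n; ++j) Kp[i] += K[i][j]*p[j];
    }
    double pKp = 0.0;
    for (int i = 0; i < n; ++i) pKp += p[i]*Kp[i];
    const double alpha = newRZ/pKp;
    double rNorm = 0.0;
    for (int i = 0; i < n; ++i) {
      x[i] += alpha*p[i];
      r[i] -= alpha*Kp[i];
      rNorm += r[i]*r[i];
    }
    if (rNorm <= tol*tol*initNorm) break;              // squared-norm test
  }
  std::printf("x = (%g, %g, %g)\n", x[0], x[1], x[2]);
  return 0;
}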
int Davidson::reSolve(int numEigen, Epetra_MultiVector &Q, double *lambda,
                      int startingEV) {

  // Computes the smallest eigenvalues and the corresponding eigenvectors
  // of the generalized eigenvalue problem
  //
  //      K X = M X Lambda
  //
  // using a generalized Davidson algorithm.
  //
  // Note that if M is not specified, then K X = X Lambda is solved.
  //
  // Input variables:
  //
  // numEigen (integer) = Number of eigenmodes requested
  //
  // Q (Epetra_MultiVector) = Converged eigenvectors
  //                  The number of columns of Q must be at least numEigen + blockSize.
  //                  The rows of Q are distributed across processors.
  //                  At exit, the first numEigen columns contain the eigenvectors requested.
  //
  // lambda (array of doubles) = Converged eigenvalues
  //                  At input, it must be of size numEigen + blockSize.
  //                  At exit, the first numEigen locations contain the eigenvalues requested.
  //
  // startingEV (integer) = Number of existing converged eigenvectors
  //                  We assume that the user has checked the eigenvectors and
  //                  their M-orthonormality.
  //
  // Return information on status of computation
  //
  // info >= 0 >> Number of converged eigenpairs at the end of computation
  //
  // Failure due to input arguments
  //
  // info = -1 >> The stiffness matrix K has not been specified.
  // info = -2 >> The maps for the matrix K and the matrix M differ.
  // info = -3 >> The maps for the matrix K and the preconditioner P differ.
  // info = -4 >> The maps for the vectors and the matrix K differ.
  // info = -5 >> Q is too small for the number of eigenvalues requested.
  // info = -6 >> Q is too small for the computation parameters.
  //
  // info = -8 >> The number of blocks is too small for the number of eigenvalues.
  //
  // info = -10 >> Failure during the mass orthonormalization
  //
  // info = -30 >> MEMORY
  //
  // Check the input parameters

  if (numEigen <= startingEV) {
    return startingEV;
  }

  int info = myVerify.inputArguments(numEigen, K, M, Prec, Q,
                                     minimumSpaceDimension(numEigen));
  if (info < 0)
    return info;

  int myPid = MyComm.MyPID();

  if (numBlock*blockSize < numEigen) {
    if (myPid == 0) {
      cerr << endl;
      cerr << " !!! The space dimension (# of blocks x size of blocks) must be greater than ";
      cerr << " the number of eigenvalues !!!\n";
      cerr << " Number of blocks = " << numBlock << endl;
      cerr << " Size of blocks = " << blockSize << endl;
      cerr << " Number of eigenvalues = " << numEigen << endl;
      cerr << endl;
    }
    return -8;
  }

  // Get the weight for approximating the M-inverse norm
  Epetra_Vector *vectWeight = 0;
  if (normWeight) {
    vectWeight = new Epetra_Vector(View, Q.Map(), normWeight);
  }

  int knownEV = startingEV;
  int localVerbose = verbose*(myPid==0);

  // Define local block vectors
  //
  // MX = Working vectors (storing M*X if M is specified, else pointing to X)
  // KX = Working vectors (storing K*X)
  //
  // R = Residuals

  int xr = Q.MyLength();
  int dimSearch = blockSize*numBlock;

  Epetra_MultiVector X(View, Q, 0, dimSearch + blockSize);
  if (knownEV > 0) {
    Epetra_MultiVector copyX(View, Q, knownEV, blockSize);
    copyX.Random();
  }
  else {
    X.Random();
  }

  int tmp;
  tmp = (M == 0) ? 2*blockSize*xr : 3*blockSize*xr;

  double *work1 = new (nothrow) double[tmp];
  if (work1 == 0) {
    if (vectWeight)
      delete vectWeight;
    info = -30;
    return info;
  }
  memRequested += sizeof(double)*tmp/(1024.0*1024.0);

  highMem = (highMem > currentSize()) ? highMem : currentSize();

  double *tmpD = work1;

  Epetra_MultiVector KX(View, Q.Map(), tmpD, xr, blockSize);
  tmpD = tmpD + xr*blockSize;

  Epetra_MultiVector MX(View, Q.Map(), (M) ? tmpD : X.Values(), xr, blockSize);
  tmpD = (M) ? tmpD + xr*blockSize : tmpD;

  Epetra_MultiVector R(View, Q.Map(), tmpD, xr, blockSize);

  // Define arrays
  //
  // theta = Store the local eigenvalues (size: dimSearch)
  // normR = Store the norm of residuals (size: blockSize)
  //
  // KK = Local stiffness matrix (size: dimSearch x dimSearch)
  //
  // S = Local eigenvectors (size: dimSearch x dimSearch)
  //
  // tmpKK = Local workspace (size: blockSize x blockSize)

  int lwork2 = blockSize + dimSearch + 2*dimSearch*dimSearch + blockSize*blockSize;
  double *work2 = new (nothrow) double[lwork2];
  if (work2 == 0) {
    if (vectWeight)
      delete vectWeight;
    delete[] work1;
    info = -30;
    return info;
  }
  memRequested += sizeof(double)*lwork2/(1024.0*1024.0);

  highMem = (highMem > currentSize()) ? highMem : currentSize();

  tmpD = work2;

  double *theta = tmpD;
  tmpD = tmpD + dimSearch;

  double *normR = tmpD;
  tmpD = tmpD + blockSize;

  double *KK = tmpD;
  tmpD = tmpD + dimSearch*dimSearch;
  memset(KK, 0, dimSearch*dimSearch*sizeof(double));

  double *S = tmpD;
  tmpD = tmpD + dimSearch*dimSearch;

  double *tmpKK = tmpD;

  // Define an array to store the residuals history
  if (localVerbose > 2) {
    resHistory = new (nothrow) double[maxIterEigenSolve*blockSize];
    spaceSizeHistory = new (nothrow) int[maxIterEigenSolve];
    if ((resHistory == 0) || (spaceSizeHistory == 0)) {
      if (vectWeight)
        delete vectWeight;
      delete[] work1;
      delete[] work2;
      info = -30;
      return info;
    }
    historyCount = 0;
  }

  // Miscellaneous definitions
  bool reStart = false;
  numRestart = 0;
  bool criticalExit = false;

  int bStart = 0;
  int offSet = 0;
  numBlock = (dimSearch/blockSize) - (knownEV/blockSize);

  int nFound = blockSize;
  int i, j;

  if (localVerbose > 0) {
    cout << endl;
    cout << " *|* Problem: ";
    if (M)
      cout << "K*Q = M*Q D ";
    else
      cout << "K*Q = Q D ";
    if (Prec)
      cout << " with preconditioner";
    cout << endl;
    cout << " *|* Algorithm = Davidson algorithm (block version)" << endl;
    cout << " *|* Size of blocks = " << blockSize << endl;
    cout << " *|* Largest size of search space = " << numBlock*blockSize << endl;
    cout << " *|* Number of requested eigenvalues = " << numEigen << endl;
    cout.precision(2);
    cout.setf(ios::scientific, ios::floatfield);
    cout << " *|* Tolerance for convergence = " << tolEigenSolve << endl;
    cout << " *|* Norm used for convergence: ";
    if (vectWeight)
      cout << "weighted L2-norm with user-provided weights" << endl;
    else
      cout << "L^2-norm" << endl;
    if (startingEV > 0)
      cout << " *|* Input converged eigenvectors = " << startingEV << endl;
    cout << "\n -- Start iterations -- \n";
  }

  int maxBlock = (dimSearch/blockSize) - (knownEV/blockSize);

  timeOuterLoop -= MyWatch.WallTime();
  outerIter = 0;
  while (outerIter <= maxIterEigenSolve) {

    highMem = (highMem > currentSize()) ? highMem : currentSize();

    int nb;
    for (nb = bStart; nb < maxBlock; ++nb) {

      outerIter += 1;
      if (outerIter > maxIterEigenSolve)
        break;

      int localSize = nb*blockSize;

      Epetra_MultiVector Xcurrent(View, X, localSize + knownEV, blockSize);

      timeMassOp -= MyWatch.WallTime();
      if (M)
        M->Apply(Xcurrent, MX);
      timeMassOp += MyWatch.WallTime();
      massOp += blockSize;

      // Orthonormalize X against the known eigenvectors and the previous vectors
      // Note: Use R as a temporary work space
      timeOrtho -= MyWatch.WallTime();
      if (nb == bStart) {
        if (nFound > 0) {
          if (knownEV == 0) {
            info = modalTool.massOrthonormalize(Xcurrent, MX, M, Q, nFound, 2,
                                                R.Values());
          }
          else {
            Epetra_MultiVector copyQ(View, X, 0, knownEV + localSize);
            info = modalTool.massOrthonormalize(Xcurrent, MX, M, copyQ, nFound, 0,
                                                R.Values());
          }
        }
        nFound = 0;
      }
      else {
        Epetra_MultiVector copyQ(View, X, 0, knownEV + localSize);
        info = modalTool.massOrthonormalize(Xcurrent, MX, M, copyQ, blockSize, 0,
                                            R.Values());
      }
      timeOrtho += MyWatch.WallTime();

      // Exit the code when the number of vectors exceeds the space dimension
      if (info < 0) {
        delete[] work1;
        delete[] work2;
        if (vectWeight)
          delete vectWeight;
        return -10;
      }

      timeStifOp -= MyWatch.WallTime();
      K->Apply(Xcurrent, KX);
      timeStifOp += MyWatch.WallTime();
      stifOp += blockSize;

      // Check the orthogonality properties of X
      if (verbose > 2) {
        if (knownEV + localSize == 0)
          accuracyCheck(&Xcurrent, &MX, 0);
        else {
          Epetra_MultiVector copyQ(View, X, 0, knownEV + localSize);
          accuracyCheck(&Xcurrent, &MX, &copyQ);
        }
        if (localVerbose > 0)
          cout << endl;
      } // if (verbose > 2)

      // Define the local stiffness matrix
      // Note: S is used as a workspace
      timeLocalProj -= MyWatch.WallTime();
      for (j = 0; j <= nb; ++j) {
        callBLAS.GEMM('T', 'N', blockSize, blockSize, xr,
                      1.0, X.Values()+(knownEV+j*blockSize)*xr, xr, KX.Values(), xr,
                      0.0, tmpKK, blockSize);
        MyComm.SumAll(tmpKK, S, blockSize*blockSize);
        int iC;
        for (iC = 0; iC < blockSize; ++iC) {
          double *Kpointer = KK + localSize*dimSearch + j*blockSize + iC*dimSearch;
          memcpy(Kpointer, S + iC*blockSize, blockSize*sizeof(double));
        }
      }
      timeLocalProj += MyWatch.WallTime();

      // Perform a spectral decomposition
      timeLocalSolve -= MyWatch.WallTime();
      int nevLocal = localSize + blockSize;
      info = modalTool.directSolver(localSize+blockSize, KK, dimSearch, 0, 0,
                                    nevLocal, S, dimSearch, theta, localVerbose, 10);
      timeLocalSolve += MyWatch.WallTime();

      if (info != 0) {
        // Stop as the spectral decomposition has a critical failure
        if (info < 0) {
          criticalExit = true;
          break;
        }
        // Restart as the spectral decomposition failed
        if (localVerbose > 0) {
          cout << " Iteration " << outerIter;
          cout << " - Failure for spectral decomposition - RESTART with new random search\n";
        }
        reStart = true;
        numRestart += 1;
        timeRestart -= MyWatch.WallTime();
        Epetra_MultiVector Xinit(View, X, knownEV, blockSize);
        Xinit.Random();
        timeRestart += MyWatch.WallTime();
        nFound = blockSize;
        bStart = 0;
        break;
      } // if (info != 0)

      // Update the search space
      // Note: Use KX as a workspace
      timeLocalUpdate -= MyWatch.WallTime();
      callBLAS.GEMM('N', 'N', xr, blockSize, localSize+blockSize,
                    1.0, X.Values()+knownEV*xr, xr, S, dimSearch,
                    0.0, KX.Values(), xr);
      timeLocalUpdate += MyWatch.WallTime();

      // Apply the mass matrix for the next block
      timeMassOp -= MyWatch.WallTime();
      if (M)
        M->Apply(KX, MX);
      timeMassOp += MyWatch.WallTime();
      massOp += blockSize;

      // Apply the stiffness matrix for the next block
      timeStifOp -= MyWatch.WallTime();
      K->Apply(KX, R);
      timeStifOp += MyWatch.WallTime();
      stifOp += blockSize;

      // Form the residuals
      timeResidual -= MyWatch.WallTime();
      if (M) {
        for (j = 0; j < blockSize; ++j) {
          callBLAS.AXPY(xr, -theta[j], MX.Values() + j*xr, R.Values() + j*xr);
        }
      }
      else {
        // Note: KX contains the updated block
        for (j = 0; j < blockSize; ++j) {
          callBLAS.AXPY(xr, -theta[j], KX.Values() + j*xr, R.Values() + j*xr);
        }
      }
      timeResidual += MyWatch.WallTime();
      residual += blockSize;

      // Compute the norm of the residuals
      timeNorm -= MyWatch.WallTime();
      if (vectWeight) {
        R.NormWeighted(*vectWeight, normR);
      }
      else {
        R.Norm2(normR);
      }

      // Scale the norms of residuals with the eigenvalues
      // Count the number of converged eigenvectors
      nFound = 0;
      for (j = 0; j < blockSize; ++j) {
        normR[j] = (theta[j] == 0.0) ? normR[j] : normR[j]/theta[j];
        if (normR[j] < tolEigenSolve)
          nFound += 1;
      } // for (j = 0; j < blockSize; ++j)
      timeNorm += MyWatch.WallTime();

      // Store the residual history
      if (localVerbose > 2) {
        memcpy(resHistory + historyCount*blockSize, normR, blockSize*sizeof(double));
        spaceSizeHistory[historyCount] = localSize + blockSize;
        historyCount += 1;
      }
      maxSpaceSize = (maxSpaceSize > localSize+blockSize) ? maxSpaceSize
                                                          : localSize+blockSize;
      sumSpaceSize += localSize + blockSize;

      // Print information on the current iteration
      if (localVerbose > 0) {
        cout << " Iteration " << outerIter << " - Number of converged eigenvectors ";
        cout << knownEV + nFound << endl;
      } // if (localVerbose > 0)

      if (localVerbose > 1) {
        cout << endl;
        cout.precision(2);
        cout.setf(ios::scientific, ios::floatfield);
        for (i=0; i<blockSize; ++i) {
          cout << " Iteration " << outerIter << " - Scaled Norm of Residual " << i;
          cout << " = " << normR[i] << endl;
        }
        cout << endl;
        cout.precision(2);
        for (i=0; i<nevLocal; ++i) {
          cout << " Iteration " << outerIter << " - Ritz eigenvalue " << i;
          cout.setf((fabs(theta[i]) < 0.01) ? ios::scientific : ios::fixed,
                    ios::floatfield);
          cout << " = " << theta[i] << endl;
        }
        cout << endl;
      }

      // Exit the loop to treat the converged eigenvectors
      if (nFound > 0) {
        nb += 1;
        offSet = 0;
        break;
      }

      // Apply the preconditioner on the residuals
      // Note: Use KX as a workspace
      if (maxBlock == 1) {
        if (Prec) {
          timePrecOp -= MyWatch.WallTime();
          Prec->ApplyInverse(R, Xcurrent);
          timePrecOp += MyWatch.WallTime();
          precOp += blockSize;
        }
        else {
          memcpy(Xcurrent.Values(), R.Values(), blockSize*xr*sizeof(double));
        }
        timeRestart -= MyWatch.WallTime();
        Xcurrent.Update(1.0, KX, -1.0);
        timeRestart += MyWatch.WallTime();
        break;
      } // if (maxBlock == 1)

      if (nb == maxBlock - 1) {
        nb += 1;
        break;
      }

      Epetra_MultiVector Xnext(View, X, knownEV+localSize+blockSize, blockSize);
      if (Prec) {
        timePrecOp -= MyWatch.WallTime();
        Prec->ApplyInverse(R, Xnext);
        timePrecOp += MyWatch.WallTime();
        precOp += blockSize;
      }
      else {
        memcpy(Xnext.Values(), R.Values(), blockSize*xr*sizeof(double));
      }

    } // for (nb = bStart; nb < maxBlock; ++nb)

    if (outerIter > maxIterEigenSolve)
      break;

    if (reStart == true) {
      reStart = false;
      continue;
    }

    if (criticalExit == true)
      break;

    // Store the final converged eigenvectors
    if (knownEV + nFound >= numEigen) {
      for (j = 0; j < blockSize; ++j) {
        if (normR[j] < tolEigenSolve) {
          memcpy(X.Values() + knownEV*xr, KX.Values() + j*xr, xr*sizeof(double));
          lambda[knownEV] = theta[j];
          knownEV += 1;
        }
      }
      if (localVerbose == 1) {
        cout << endl;
        cout.precision(2);
        cout.setf(ios::scientific, ios::floatfield);
        for (i=0; i<blockSize; ++i) {
          cout << " Iteration " << outerIter << " - Scaled Norm of Residual " << i;
          cout << " = " << normR[i] << endl;
        }
        cout << endl;
      }
      break;
    } // if (knownEV + nFound >= numEigen)

    // Treat the particular case of 1 block
    if (maxBlock == 1) {
      if (nFound > 0) {
        double *Xpointer = X.Values() + (knownEV+nFound)*xr;
        nFound = 0;
        for (j = 0; j < blockSize; ++j) {
          if (normR[j] < tolEigenSolve) {
            memcpy(X.Values() + knownEV*xr, KX.Values() + j*xr, xr*sizeof(double));
            lambda[knownEV] = theta[j];
            knownEV += 1;
            nFound += 1;
          }
          else {
            memcpy(Xpointer + (j-nFound)*xr, KX.Values() + j*xr, xr*sizeof(double));
          }
        }
        Epetra_MultiVector Xnext(View, X, knownEV + blockSize - nFound, nFound);
        Xnext.Random();
      }
      else {
        nFound = blockSize;
      }
      continue;
    }

    // Define the restarting block when maxBlock > 1
    if (nFound > 0) {
      int firstIndex = blockSize;
      for (j = 0; j < blockSize; ++j) {
        if (normR[j] >= tolEigenSolve) {
          firstIndex = j;
          break;
        }
      } // for (j = 0; j < blockSize; ++j)
      while (firstIndex < nFound) {
        for (j = firstIndex; j < blockSize; ++j) {
          if (normR[j] < tolEigenSolve) {
            // Swap the j-th and firstIndex-th position
            callFortran.SWAP(nb*blockSize, S + j*dimSearch, 1, S + firstIndex*dimSearch, 1);
            callFortran.SWAP(1, theta + j, 1, theta + firstIndex, 1);
            callFortran.SWAP(1, normR + j, 1, normR + firstIndex, 1);
            break;
          }
        } // for (j = firstIndex; j < blockSize; ++j)
        for (j = 0; j < blockSize; ++j) {
          if (normR[j] >= tolEigenSolve) {
            firstIndex = j;
            break;
          }
        } // for (j = 0; j < blockSize; ++j)
      } // while (firstIndex < nFound)

      // Copy the converged eigenvalues
      memcpy(lambda + knownEV, theta, nFound*sizeof(double));

    } // if (nFound > 0)

    // Define the restarting size
    bStart = ((nb - offSet) > 2) ? (nb - offSet)/2 : 0;

    // Define the restarting space and the local stiffness matrix
    timeRestart -= MyWatch.WallTime();
    memset(KK, 0, nb*blockSize*dimSearch*sizeof(double));
    for (j = 0; j < bStart*blockSize; ++j) {
      KK[j + j*dimSearch] = theta[j + nFound];
    }

    // Form the restarting space
    int oldCol = nb*blockSize;
    int newCol = nFound + (bStart+1)*blockSize;
    newCol = (newCol > oldCol) ? oldCol : newCol;
    callFortran.GEQRF(oldCol, newCol, S, dimSearch, theta, R.Values(),
                      xr*blockSize, &info);
    callFortran.ORMQR('R', 'N', xr, oldCol, newCol, S, dimSearch, theta,
                      X.Values()+knownEV*xr, xr, R.Values(), blockSize*xr, &info);
    timeRestart += MyWatch.WallTime();

    if (nFound == 0)
      offSet += 1;

    knownEV += nFound;
    maxBlock = (dimSearch/blockSize) - (knownEV/blockSize);

    // Put random vectors if the Rayleigh-Ritz vectors are not enough
    newCol = nFound + (bStart+1)*blockSize;
    if (newCol > oldCol) {
      Epetra_MultiVector Xnext(View, X, knownEV+blockSize-nFound, nFound);
      Xnext.Random();
      continue;
    }

    nFound = 0;

  } // while (outerIter <= maxIterEigenSolve)
  timeOuterLoop += MyWatch.WallTime();
  highMem = (highMem > currentSize()) ? highMem : currentSize();

  // Clean memory
  delete[] work1;
  delete[] work2;
  if (vectWeight)
    delete vectWeight;

  // Sort the eigenpairs
  timePostProce -= MyWatch.WallTime();
  if ((info == 0) && (knownEV > 0)) {
    mySort.sortScalars_Vectors(knownEV, lambda, Q.Values(), Q.MyLength());
  }
  timePostProce += MyWatch.WallTime();

  return (info == 0) ? knownEV : info;
}
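The Davidson loop above leans on the massOrthonormalize step: each new block is made orthonormal against the accepted basis before it is appended to the search space. Below is a plain-Euclidean sketch (M = I, classical Gram-Schmidt, hypothetical standalone names); the Trilinos routine additionally handles M-inner products and rank deficiency.

#include <vector>
#include <cmath>
#include <cstdio>

using Vec = std::vector<double>;

static double dot(const Vec& a, const Vec& b) {
  double s = 0.0;
  for (size_t i = 0; i < a.size(); ++i) s += a[i]*b[i];
  return s;
}

// Orthogonalize the block X against the accepted basis Q and against the
// earlier columns of X, then normalize each column (assumes no breakdown).
static void orthonormalize(std::vector<Vec>& X, const std::vector<Vec>& Q) {
  for (size_t j = 0; j < X.size(); ++j) {
    for (const Vec& q : Q) {
      const double c = dot(q, X[j]);
      for (size_t i = 0; i < X[j].size(); ++i) X[j][i] -= c*q[i];
    }
    for (size_t k = 0; k < j; ++k) {
      const double c = dot(X[k], X[j]);
      for (size_t i = 0; i < X[j].size(); ++i) X[j][i] -= c*X[k][i];
    }
    const double nrm = std::sqrt(dot(X[j], X[j]));
    for (double& v : X[j]) v /= nrm;
  }
}

int main() {
  std::vector<Vec> Q = {{1,0,0,0}};              // already accepted vector
  std::vector<Vec> X = {{1,1,0,0}, {1,0,1,1}};   // new block to insert
  orthonormalize(X, Q);
  std::printf("dot(Q0,X0) = %g, dot(X0,X1) = %g\n",
              dot(Q[0],X[0]), dot(X[0],X[1]));
  return 0;
}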
int Stokhos::ApproxSchurComplementPreconditioner:: ApplyInverse(const Epetra_MultiVector& Input, Epetra_MultiVector& Result) const { #ifdef STOKHOS_TEUCHOS_TIME_MONITOR TEUCHOS_FUNC_TIME_MONITOR("Stokhos: Total Approximate Schur Complement Time"); #endif // We have to be careful if Input and Result are the same vector. // If this is the case, the only possible solution is to make a copy const Epetra_MultiVector *input = &Input; bool made_copy = false; if (Input.Values() == Result.Values()) { input = new Epetra_MultiVector(Input); made_copy = true; } // Allocate temporary storage int m = input->NumVectors(); if (rhs_block == Teuchos::null || rhs_block->NumVectors() != m) rhs_block = Teuchos::rcp(new EpetraExt::BlockMultiVector(*base_map, *sg_map, m)); if (tmp == Teuchos::null || tmp->NumVectors() != m*max_num_mat_vec) tmp = Teuchos::rcp(new Epetra_MultiVector(*base_map, m*max_num_mat_vec)); j_ptr.resize(m*max_num_mat_vec); mj_indices.resize(m*max_num_mat_vec); // Extract blocks EpetraExt::BlockMultiVector input_block(View, *base_map, *input); EpetraExt::BlockMultiVector result_block(View, *base_map, Result); result_block.PutScalar(0.0); // Set right-hand-side to input_block rhs_block->Update(1.0, input_block, 0.0); // At level l, linear system has the structure // [ A_{l-1} B_l ][ u_l^{l-1} ] = [ r_l^{l-1} ] // [ C_l D_l ][ u_l^l ] [ r_l^l ] for (int l=P; l>=1; l--) { // Compute D_l^{-1} r_l^l divide_diagonal_block(block_indices[l], block_indices[l+1], *rhs_block, result_block); // Compute r_l^{l-1} = r_l^{l-1} - B_l D_l^{-1} r_l^l multiply_block(upper_block_Cijk[l], -1.0, result_block, *rhs_block); } // Solve A_0 u_0 = r_0 divide_diagonal_block(0, 1, *rhs_block, result_block); for (int l=1; l<=P; l++) { // Compute r_l^l - C_l*u_l^{l-1} multiply_block(lower_block_Cijk[l], -1.0, result_block, *rhs_block); // Compute D_l^{-1} (r_l^l - C_l*u_l^{l-1}) divide_diagonal_block(block_indices[l], block_indices[l+1], *rhs_block, result_block); } if (made_copy) delete input; return 0; }
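// A scalar sketch of the two-pass sweep ApplyInverse() above performs, with
// scalars a, b, c, d standing in for the blocks A_{l-1}, B_l, C_l, D_l
// (hypothetical helper; in the 2x2 scalar case the elimination is exact,
// while the block version is approximate because D_l is only solved
// block-diagonally):
void approxSchurSweep2x2(double a, double b, double c, double d,
                         double r0, double r1, double& u0, double& u1)
{
  u1 = r1 / d;    // u1 ~= D^{-1} r1              (divide_diagonal_block)
  r0 -= b * u1;   // r0  = r0 - B D^{-1} r1       (multiply_block, upper)
  u0 = r0 / a;    // solve A u0 = r0              (coarsest-level solve)
  r1 -= c * u0;   // r1  = r1 - C u0              (multiply_block, lower)
  u1 = r1 / d;    // u1  = D^{-1} (r1 - C u0)     (back-substitution)
}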
int Stokhos::MatrixFreeOperator:: Apply(const Epetra_MultiVector& Input, Epetra_MultiVector& Result) const { #ifdef STOKHOS_TEUCHOS_TIME_MONITOR TEUCHOS_FUNC_TIME_MONITOR("Stokhos: SG Operator Apply()"); #endif // Note for transpose: // The stochastic matrix is symmetric, however the matrix blocks may not // be. So the algorithm here is the same whether we are using the transpose // or not. We just apply the transpose of the blocks in the case of // applying the global transpose, and make sure the imported Input // vectors use the right map. // We have to be careful if Input and Result are the same vector. // If this is the case, the only possible solution is to make a copy const Epetra_MultiVector *input = &Input; bool made_copy = false; if (Input.Values() == Result.Values() && !is_stoch_parallel) { input = new Epetra_MultiVector(Input); made_copy = true; } // Initialize Result.PutScalar(0.0); const Epetra_Map* input_base_map = domain_base_map.get(); const Epetra_Map* result_base_map = range_base_map.get(); if (useTranspose == true) { input_base_map = range_base_map.get(); result_base_map = domain_base_map.get(); } // Allocate temporary storage int m = Input.NumVectors(); if (useTranspose == false && (tmp == Teuchos::null || tmp->NumVectors() != m*max_num_mat_vec)) tmp = Teuchos::rcp(new Epetra_MultiVector(*result_base_map, m*max_num_mat_vec)); else if (useTranspose == true && (tmp_trans == Teuchos::null || tmp_trans->NumVectors() != m*max_num_mat_vec)) tmp_trans = Teuchos::rcp(new Epetra_MultiVector(*result_base_map, m*max_num_mat_vec)); Epetra_MultiVector *tmp_result; if (useTranspose == false) tmp_result = tmp.get(); else tmp_result = tmp_trans.get(); // Map input into column map const Epetra_MultiVector *tmp_col; if (!is_stoch_parallel) tmp_col = input; else { if (useTranspose == false) { if (input_col == Teuchos::null || input_col->NumVectors() != m) input_col = Teuchos::rcp(new Epetra_MultiVector(*global_col_map, m)); input_col->Import(*input, *col_importer, Insert); tmp_col = input_col.get(); } else { if (input_col_trans == Teuchos::null || input_col_trans->NumVectors() != m) input_col_trans = Teuchos::rcp(new Epetra_MultiVector(*global_col_map_trans, m)); input_col_trans->Import(*input, *col_importer_trans, Insert); tmp_col = input_col_trans.get(); } } // Extract blocks EpetraExt::BlockMultiVector sg_input(View, *input_base_map, *tmp_col); EpetraExt::BlockMultiVector sg_result(View, *result_base_map, Result); for (int i=0; i<input_block.size(); i++) input_block[i] = sg_input.GetBlock(i); for (int i=0; i<result_block.size(); i++) result_block[i] = sg_result.GetBlock(i); // Apply block SG operator via // w_i = // \sum_{j=0}^P \sum_{k=0}^L J_k v_j < \psi_i \psi_j \psi_k > / <\psi_i^2> // for i=0,...,P where P = expansion_size, L = num_blocks, v_j is the jth // input block, w_i is the ith result block, and J_k is the kth block operator // k_begin and k_end are initialized in the constructor const Teuchos::Array<double>& norms = sg_basis->norm_squared(); for (Cijk_type::k_iterator k_it=k_begin; k_it!=k_end; ++k_it) { int k = index(k_it); Cijk_type::kj_iterator j_begin = Cijk->j_begin(k_it); Cijk_type::kj_iterator j_end = Cijk->j_end(k_it); int nj = Cijk->num_j(k_it); if (nj > 0) { Teuchos::Array<double*> j_ptr(nj*m); Teuchos::Array<int> mj_indices(nj*m); int l = 0; for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) { int j = index(j_it); for (int mm=0; mm<m; mm++) { j_ptr[l*m+mm] = (*input_block[j])[mm]; mj_indices[l*m+mm] = l*m+mm; } l++; } Epetra_MultiVector
input_tmp(View, *input_base_map, &j_ptr[0], nj*m); Epetra_MultiVector result_tmp(View, *tmp_result, &mj_indices[0], nj*m); if (use_block_apply) { (*block_ops)[k].Apply(input_tmp, result_tmp); } else { for (int jj=0; jj<nj*m; jj++) (*block_ops)[k].Apply(*(input_tmp(jj)), *(result_tmp(jj))); } l = 0; for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) { int j = index(j_it); for (Cijk_type::kji_iterator i_it = Cijk->i_begin(j_it); i_it != Cijk->i_end(j_it); ++i_it) { int i = index(i_it); double c = value(i_it); if (scale_op) { int i_gid; if (useTranspose) i_gid = epetraCijk->GCID(j); else i_gid = epetraCijk->GRID(i); c /= norms[i_gid]; } for (int mm=0; mm<m; mm++) (*result_block[i])(mm)->Update(c, *result_tmp(l*m+mm), 1.0); } l++; } } } // Destroy blocks for (int i=0; i<input_block.size(); i++) input_block[i] = Teuchos::null; for (int i=0; i<result_block.size(); i++) result_block[i] = Teuchos::null; if (made_copy) delete input; return 0; }
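// A minimal dense sketch (hypothetical types, for exposition only) of the sum
// Apply() above evaluates: w_i += (c_ijk / <psi_i^2>) J_k v_j over the nonzero
// triple products, with function objects standing in for the block operators.
// The real operator batches all v_j sharing the same k into one multivector
// apply; the mathematics is the one-triple-at-a-time loop below.
#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

typedef std::vector<double> Vec;
typedef std::function<void(const Vec&, Vec&)> BlockOp; // y = J_k x

struct Triple { int i, j, k; double cijk; }; // nonzero <psi_i psi_j psi_k>

void sgBlockApply(const std::vector<BlockOp>& J,
                  const std::vector<Triple>& Cijk,
                  const std::vector<double>& norms, // <psi_i^2>
                  const std::vector<Vec>& v,        // input blocks
                  std::vector<Vec>& w)              // result blocks
{
  for (std::size_t i = 0; i < w.size(); ++i)
    std::fill(w[i].begin(), w[i].end(), 0.0);
  Vec t(v[0].size());
  for (std::size_t n = 0; n < Cijk.size(); ++n) {
    const Triple& c = Cijk[n];
    J[c.k](v[c.j], t);                        // t = J_k v_j
    const double s = c.cijk / norms[c.i];     // the scale_op normalization
    for (std::size_t r = 0; r < t.size(); ++r)
      w[c.i][r] += s * t[r];
  }
}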
int ModifiedARPACKm3::reSolve(int numEigen, Epetra_MultiVector &Q, double *lambda, int startingEV, const Epetra_MultiVector *orthoVec) { // Computes the smallest eigenvalues and the corresponding eigenvectors // of the generalized eigenvalue problem // // K X = M X Lambda // // using ModifiedARPACK (mode 3). // // The convergence test is performed outside of ARPACK // // || Kx - Mx lambda || < tol*lambda // // The norm ||.|| can be specified by the user through the array normWeight. // By default, the L2 Euclidean norm is used. // // Note that if M is not specified, then K X = X Lambda is solved. // (using the mode for the generalized eigenvalue problem). // // Input variables: // // numEigen (integer) = Number of eigenmodes requested // // Q (Epetra_MultiVector) = Initial search space // The number of columns of Q defines the size of search space (=NCV). // The rows of Q are distributed across processors. // As a rule of thumb from the ARPACK Users' Guide, NCV >= 2*numEigen. // At exit, the first numEigen locations contain the eigenvectors requested. // // lambda (array of doubles) = Converged eigenvalues // The length of this array is equal to the number of columns in Q. // At exit, the first numEigen locations contain the eigenvalues requested. // // startingEV (integer) = Number of eigenmodes already stored in Q // A linear combination of these vectors is made to define the starting // vector, placed in resid. // // orthoVec (Pointer to Epetra_MultiVector) = Space to be orthogonal to // The computation is performed in the orthogonal complement of the space // spanned by the column vectors in orthoVec. // // Return information on status of computation // // info >= 0 >> Number of converged eigenpairs at the end of computation // // // Failure due to input arguments // // info = - 1 >> The stiffness matrix K has not been specified. // info = - 2 >> The maps for the matrix K and the matrix M differ. // info = - 3 >> The maps for the matrix K and the preconditioner P differ. // info = - 4 >> The maps for the vectors and the matrix K differ. // info = - 5 >> Q is too small for the number of eigenvalues requested. // info = - 6 >> Q is too small for the computation parameters. // // info = - 8 >> numEigen must be smaller than the dimension of the matrix. // // info = - 30 >> MEMORY // // See ARPACK documentation for the meaning of INFO if (numEigen <= startingEV) { return numEigen; } int info = myVerify.inputArguments(numEigen, K, M, 0, Q, minimumSpaceDimension(numEigen)); if (info < 0) return info; int myPid = MyComm.MyPID(); int localSize = Q.MyLength(); int NCV = Q.NumVectors(); int knownEV = 0; if (NCV > Q.GlobalLength()) { if (numEigen >= Q.GlobalLength()) { cerr << endl; cerr << " !! The number of requested eigenvalues must be smaller than the dimension"; cerr << " of the matrix !!\n"; cerr << endl; return -8; } NCV = Q.GlobalLength(); } // Get the weight for approximating the M-inverse norm Epetra_Vector *vectWeight = 0; if (normWeight) { vectWeight = new Epetra_Vector(View, Q.Map(), normWeight); } int localVerbose = verbose*(myPid == 0); // Define data for ARPACK // // UH (10/17/03) Note that workl is also used // * to store the eigenvectors of the tridiagonal matrix // * as a workspace for DSTEQR // * as a workspace for recovering the global eigenvectors highMem = (highMem > currentSize()) ?
highMem : currentSize(); int ido = 0; int lwI = 22; int *wI = new (nothrow) int[lwI]; if (wI == 0) { if (vectWeight) delete vectWeight; return -30; } memRequested += sizeof(int)*lwI/(1024.0*1024.0); int *iparam = wI; int *ipntr = wI + 11; int lworkl = NCV*(NCV+8); int lwD = lworkl + 4*localSize; double *wD = new (nothrow) double[lwD]; if (wD == 0) { if (vectWeight) delete vectWeight; delete[] wI; return -30; } memRequested += sizeof(double)*(4*localSize+lworkl)/(1024.0*1024.0); double *pointer = wD; double *workl = pointer; pointer = pointer + lworkl; double *resid = pointer; pointer = pointer + localSize; double *workd = pointer; double *v = Q.Values(); highMem = (highMem > currentSize()) ? highMem : currentSize(); if (startingEV > 0) { // Define the initial starting vector memset(resid, 0, localSize*sizeof(double)); for (int jj = 0; jj < startingEV; ++jj) for (int ii = 0; ii < localSize; ++ii) resid[ii] += v[ii + jj*localSize]; info = 1; } iparam[1-1] = 1; iparam[3-1] = maxIterEigenSolve; iparam[7-1] = 3; // The fourth parameter forces the use of the user-provided convergence test. // This requires a customization of ARPACK (provided by R. Lehoucq). iparam[4-1] = 1; Epetra_Vector v1(View, Q.Map(), workd); Epetra_Vector v2(View, Q.Map(), workd + localSize); Epetra_Vector v3(View, Q.Map(), workd + 2*localSize); // Define further storage for the new residual check // Use a block of vectors to compute the residuals more quickly. // Note that workd could be used if memory becomes an issue. int loopZ = (NCV > 10) ? 10 : NCV; int lwD2 = localSize + 2*NCV-1 + NCV; lwD2 += (M) ? 3*loopZ*localSize : 2*loopZ*localSize; double *wD2 = new (nothrow) double[lwD2]; if (wD2 == 0) { if (vectWeight) delete vectWeight; delete[] wI; delete[] wD; return -30; } memRequested += sizeof(double)*lwD2/(1024.0*1024.0); pointer = wD2; // vTmp is used when ido = -1 double *vTmp = pointer; pointer = pointer + localSize; // dd and ee store the tridiagonal matrix. // Note that DSTEQR destroys the contents of the input arrays. double *dd = pointer; pointer = pointer + NCV; double *ee = pointer; pointer = pointer + NCV-1; double *vz = pointer; pointer = pointer + loopZ*localSize; Epetra_MultiVector approxEV(View, Q.Map(), vz, localSize, loopZ); double *kvz = pointer; pointer = pointer + loopZ*localSize; Epetra_MultiVector KapproxEV(View, Q.Map(), kvz, localSize, loopZ); double *mvz = (M) ? pointer : vz; pointer = (M) ? pointer + loopZ*localSize : pointer; Epetra_MultiVector MapproxEV(View, Q.Map(), mvz, localSize, loopZ); double *normR = pointer; // zz contains the eigenvectors of the tridiagonal matrix. // workt is a workspace for DSTEQR. // Note that zz and workt will use parts of workl. double *zz, *workt; highMem = (highMem > currentSize()) ? highMem : currentSize(); // Define an array to store the residual history if (localVerbose > 2) { resHistory = new (nothrow) double[maxIterEigenSolve*NCV]; if (resHistory == 0) { if (vectWeight) delete vectWeight; delete[] wI; delete[] wD; delete[] wD2; return -30; } historyCount = 0; } highMem = (highMem > currentSize()) ?
highMem : currentSize(); if (localVerbose > 0) { cout << endl; cout << " *|* Problem: "; if (M) cout << "K*Q = M*Q D "; else cout << "K*Q = Q D "; cout << endl; cout << " *|* Algorithm = ARPACK (Mode 3, modified such that user checks convergence)" << endl; cout << " *|* Number of requested eigenvalues = " << numEigen << endl; cout.precision(2); cout.setf(ios::scientific, ios::floatfield); cout << " *|* Tolerance for convergence = " << tolEigenSolve << endl; if (startingEV > 0) cout << " *|* User-defined starting vector (Combination of " << startingEV << " vectors)\n"; cout << " *|* Norm used for convergence: "; if (normWeight) cout << "weighted L2-norm with user-provided weights" << endl; else cout << "L^2-norm" << endl; if (orthoVec) cout << " *|* Size of orthogonal subspace = " << orthoVec->NumVectors() << endl; cout << "\n -- Start iterations -- \n"; } #ifdef EPETRA_MPI Epetra_MpiComm *MPIComm = dynamic_cast<Epetra_MpiComm *>(const_cast<Epetra_Comm*>(&MyComm)); #endif timeOuterLoop -= MyWatch.WallTime(); while (ido != 99) { highMem = (highMem > currentSize()) ? highMem : currentSize(); #ifdef EPETRA_MPI if (MPIComm) callFortran.PSAUPD(MPIComm->Comm(), &ido, 'G', localSize, "LM", numEigen, tolEigenSolve, resid, NCV, v, localSize, iparam, ipntr, workd, workl, lworkl, &info, 0); else callFortran.SAUPD(&ido, 'G', localSize, "LM", numEigen, tolEigenSolve, resid, NCV, v, localSize, iparam, ipntr, workd, workl, lworkl, &info, 0); #else callFortran.SAUPD(&ido, 'G', localSize, "LM", numEigen, tolEigenSolve, resid, NCV, v, localSize, iparam, ipntr, workd, workl, lworkl, &info, 0); #endif if (ido == -1) { // Apply the mass matrix v3.ResetView(workd + ipntr[0] - 1); v1.ResetView(vTmp); timeMassOp -= MyWatch.WallTime(); if (M) M->Apply(v3, v1); else memcpy(v1.Values(), v3.Values(), localSize*sizeof(double)); timeMassOp += MyWatch.WallTime(); massOp += 1; if ((orthoVec) && (verbose > 3)) { // Check the orthogonality double maxDot = myVerify.errorOrthogonality(orthoVec, &v1, 0); if (myPid == 0) { cout << " Maximum Euclidean dot product against orthogonal space (Before Solve) = "; cout << maxDot << endl; } } // Solve the stiffness problem v2.ResetView(workd + ipntr[1] - 1); timeStifOp -= MyWatch.WallTime(); K->ApplyInverse(v1, v2); timeStifOp += MyWatch.WallTime(); stifOp += 1; // Project the solution vector if needed // Note: Use mvz as workspace if (orthoVec) { Epetra_Vector Mv2(View, v2.Map(), mvz); if (M) M->Apply(v2, Mv2); else memcpy(Mv2.Values(), v2.Values(), localSize*sizeof(double)); modalTool.massOrthonormalize(v2, Mv2, M, *orthoVec, 1, 1); } if ((orthoVec) && (verbose > 3)) { // Check the orthogonality double maxDot = myVerify.errorOrthogonality(orthoVec, &v2, M); if (myPid == 0) { cout << " Maximum M-dot product against orthogonal space (After Solve) = "; cout << maxDot << endl; } } continue; } // if (ido == -1) if (ido == 1) { // Solve the stiffness problem v1.ResetView(workd + ipntr[2] - 1); v2.ResetView(workd + ipntr[1] - 1); if ((orthoVec) && (verbose > 3)) { // Check the orthogonality double maxDot = myVerify.errorOrthogonality(orthoVec, &v1, 0); if (myPid == 0) { cout << " Maximum Euclidean dot product against orthogonal space (Before Solve) = "; cout << maxDot << endl; } } timeStifOp -= MyWatch.WallTime(); K->ApplyInverse(v1, v2); timeStifOp += MyWatch.WallTime(); stifOp += 1; // Project the solution vector if needed // Note: Use mvz as workspace if (orthoVec) { Epetra_Vector Mv2(View, v2.Map(), mvz); if (M) M->Apply(v2, Mv2); else memcpy(Mv2.Values(), v2.Values(), 
localSize*sizeof(double)); modalTool.massOrthonormalize(v2, Mv2, M, *orthoVec, 1, 1); } if ((orthoVec) && (verbose > 3)) { // Check the orthogonality double maxDot = myVerify.errorOrthogonality(orthoVec, &v2, M); if (myPid == 0) { cout << " Maximum M-dot product against orthogonal space (After Solve) = "; cout << maxDot << endl; } } continue; } // if (ido == 1) if (ido == 2) { // Apply the mass matrix v1.ResetView(workd + ipntr[0] - 1); v2.ResetView(workd + ipntr[1] - 1); timeMassOp -= MyWatch.WallTime(); if (M) M->Apply(v1, v2); else memcpy(v2.Values(), v1.Values(), localSize*sizeof(double)); timeMassOp += MyWatch.WallTime(); massOp += 1; continue; } // if (ido == 2) if (ido == 4) { timeResidual -= MyWatch.WallTime(); // Copy the main diagonal of T memcpy(dd, workl + NCV + ipntr[4] - 1, NCV*sizeof(double)); // Copy the lower diagonal of T memcpy(ee, workl + ipntr[4], (NCV-1)*sizeof(double)); // Compute the eigenpairs of the tridiagonal matrix zz = workl + 4*NCV; workt = workl + 4*NCV + NCV*NCV; callFortran.STEQR('I', NCV, dd, ee, zz, NCV, workt, &info); if (info != 0) { if (localVerbose > 0) { cerr << endl; cerr << " Error with DSTEQR, info = " << info << endl; cerr << endl; } break; } // dd contains the eigenvalues in ascending order // Check the residual of the proposed eigenvectors of (K, M) int ii, jz; iparam[4] = 0; for (jz = 0; jz < NCV; jz += loopZ) { int colZ = (jz + loopZ < NCV) ? loopZ : NCV - jz; callBLAS.GEMM('N', 'N', localSize, colZ, NCV, 1.0, v, localSize, zz + jz*NCV, NCV, 0.0, vz, localSize); // Form the residuals if (M) M->Apply(approxEV, MapproxEV); K->Apply(approxEV, KapproxEV); for (ii = 0; ii < colZ; ++ii) { callBLAS.AXPY(localSize, -1.0/dd[ii+jz], MapproxEV.Values() + ii*localSize, KapproxEV.Values() + ii*localSize); } // Compute the norms of the residuals if (vectWeight) { KapproxEV.NormWeighted(*vectWeight, normR + jz); } else { KapproxEV.Norm2(normR + jz); } // Scale the norms of residuals with the eigenvalues for (ii = 0; ii < colZ; ++ii) { normR[ii+jz] = normR[ii+jz]*dd[ii+jz]; } // Put the number of converged pairs in iparam[5-1] for (ii=0; ii<colZ; ++ii) { if (normR[ii+jz] < tolEigenSolve) iparam[4] += 1; } } timeResidual += MyWatch.WallTime(); numResidual += NCV; outerIter += 1; if (localVerbose > 0) { cout << " Iteration " << outerIter; cout << " - Number of converged eigenvalues " << iparam[4] << endl; } if (localVerbose > 2) { memcpy(resHistory + historyCount, normR, NCV*sizeof(double)); historyCount += NCV; } if (localVerbose > 1) { cout.precision(2); cout.setf(ios::scientific, ios::floatfield); for (ii=0; ii < NCV; ++ii) { cout << " Iteration " << outerIter; cout << " - Scaled Norm of Residual " << ii << " = " << normR[ii] << endl; } cout << endl; cout.precision(2); for (ii = 0; ii < NCV; ++ii) { cout << " Iteration " << outerIter << " - Ritz eigenvalue " << ii; cout.setf((fabs(dd[ii]) > 100) ? ios::scientific : ios::fixed, ios::floatfield); cout << " = " << 1.0/dd[ii] << endl; } cout << endl; } } // if (ido == 4) } // while (ido != 99) timeOuterLoop += MyWatch.WallTime(); highMem = (highMem > currentSize()) ? highMem : currentSize(); if (info < 0) { if (myPid == 0) { cerr << endl; cerr << " Error with DSAUPD, info = " << info << endl; cerr << endl; } } else { // Get the eigenvalues timePostProce -= MyWatch.WallTime(); int ii, jj; double *pointer = workl + 4*NCV + NCV*NCV; for (ii=0; ii < localSize; ii += 3) { int nRow = (ii + 3 < localSize) ? 
3 : localSize - ii; for (jj=0; jj<NCV; ++jj) memcpy(pointer + jj*nRow, v + ii + jj*localSize, nRow*sizeof(double)); callBLAS.GEMM('N', 'N', nRow, NCV, NCV, 1.0, pointer, nRow, zz, NCV, 0.0, Q.Values() + ii, localSize); } // Put the converged eigenpairs at the beginning knownEV = 0; for (ii=0; ii < NCV; ++ii) { if (normR[ii] < tolEigenSolve) { lambda[knownEV] = 1.0/dd[ii]; memcpy(Q.Values()+knownEV*localSize, Q.Values()+ii*localSize, localSize*sizeof(double)); knownEV += 1; if (knownEV == Q.NumVectors()) break; } } // Sort the eigenpairs if (knownEV > 0) { mySort.sortScalars_Vectors(knownEV, lambda, Q.Values(), localSize); } timePostProce += MyWatch.WallTime(); } // if (info < 0) if (info == 0) { orthoOp = iparam[11-1]; } delete[] wI; delete[] wD; delete[] wD2; if (vectWeight) delete vectWeight; return (info == 0) ? knownEV : info; }
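// The loop above follows ARPACK's reverse-communication protocol: each call to
// dsaupd returns an ido code telling the caller which operator application it
// needs before being re-entered. A minimal sketch of that contract, with
// hypothetical callbacks standing in for the M->Apply, K->ApplyInverse, and
// custom ido == 4 residual-check branches of reSolve():
#include <functional>

void driveReverseCommunication(std::function<int()> saupdStep, // advances ARPACK, returns ido
                               std::function<void()> applyM,   // y = M x
                               std::function<void()> solveK,   // y = K^{-1} x
                               std::function<void()> checkRitzResiduals)
{
  int ido = 0;
  while (ido != 99) {                            // 99: ARPACK is done
    ido = saupdStep();
    if (ido == -1) { applyM(); solveK(); }       // y = K^{-1} M x, fresh x
    else if (ido == 1) { solveK(); }             // M x already stored in workd
    else if (ido == 2) { applyM(); }             // y = M x only
    else if (ido == 4) { checkRitzResiduals(); } // user-side convergence test
  }
}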
int Stokhos::ApproxGaussSeidelPreconditioner:: ApplyInverse(const Epetra_MultiVector& Input, Epetra_MultiVector& Result) const { #ifdef STOKHOS_TEUCHOS_TIME_MONITOR TEUCHOS_FUNC_TIME_MONITOR("Stokhos: Total Approximate Gauss-Seidel Time"); #endif // We have to be careful if Input and Result are the same vector. // If this is the case, the only possible solution is to make a copy const Epetra_MultiVector *input = &Input; bool made_copy = false; if (Input.Values() == Result.Values()) { input = new Epetra_MultiVector(Input); made_copy = true; } int m = input->NumVectors(); if (mat_vec_tmp == Teuchos::null || mat_vec_tmp->NumVectors() != m) mat_vec_tmp = Teuchos::rcp(new Epetra_MultiVector(*base_map, m)); if (rhs_block == Teuchos::null || rhs_block->NumVectors() != m) rhs_block = Teuchos::rcp(new EpetraExt::BlockMultiVector(*base_map, *sg_map, m)); // Extract blocks EpetraExt::BlockMultiVector input_block(View, *base_map, *input); EpetraExt::BlockMultiVector result_block(View, *base_map, Result); result_block.PutScalar(0.0); int k_limit = sg_poly->size(); if (only_use_linear) k_limit = sg_poly->basis()->dimension() + 1; const Teuchos::Array<double>& norms = sg_basis->norm_squared(); rhs_block->Update(1.0, input_block, 0.0); for (Cijk_type::i_iterator i_it=Cijk->i_begin(); i_it!=Cijk->i_end(); ++i_it) { int i = index(i_it); Teuchos::RCP<Epetra_MultiVector> res_i = result_block.GetBlock(i); { // Apply deterministic preconditioner #ifdef STOKHOS_TEUCHOS_TIME_MONITOR TEUCHOS_FUNC_TIME_MONITOR("Stokhos: Total AGS Deterministic Preconditioner Time"); #endif mean_prec->ApplyInverse(*(rhs_block->GetBlock(i)), *res_i); } int i_gid = epetraCijk->GRID(i); for (Cijk_type::ik_iterator k_it = Cijk->k_begin(i_it); k_it != Cijk->k_end(i_it); ++k_it) { int k = index(k_it); if (k!=0 && k<k_limit) { bool do_mat_vec = false; for (Cijk_type::ikj_iterator j_it = Cijk->j_begin(k_it); j_it != Cijk->j_end(k_it); ++j_it) { int j = index(j_it); int j_gid = epetraCijk->GCID(j); if (j_gid > i_gid) { bool on_proc = epetraCijk->myGRID(j_gid); if (on_proc) { do_mat_vec = true; break; } } } if (do_mat_vec) { (*sg_poly)[k].Apply(*res_i, *mat_vec_tmp); for (Cijk_type::ikj_iterator j_it = Cijk->j_begin(k_it); j_it != Cijk->j_end(k_it); ++j_it) { int j = index(j_it); int j_gid = epetraCijk->GCID(j); double c = value(j_it); if (scale_op) { if (useTranspose) c /= norms[i_gid]; else c /= norms[j_gid]; } if (j_gid > i_gid) { bool on_proc = epetraCijk->myGRID(j_gid); if (on_proc) { rhs_block->GetBlock(j)->Update(-c, *mat_vec_tmp, 1.0); } } } } } } } // For symmetric Gauss-Seidel if (symmetric) { for (Cijk_type::i_reverse_iterator i_it= Cijk->i_rbegin(); i_it!=Cijk->i_rend(); ++i_it) { int i = index(i_it); Teuchos::RCP<Epetra_MultiVector> res_i = result_block.GetBlock(i); { // Apply deterministic preconditioner #ifdef STOKHOS_TEUCHOS_TIME_MONITOR TEUCHOS_FUNC_TIME_MONITOR("Stokhos: Total AGS Deterministic Preconditioner Time"); #endif mean_prec->ApplyInverse(*(rhs_block->GetBlock(i)), *res_i); } int i_gid = epetraCijk->GRID(i); for (Cijk_type::ik_iterator k_it = Cijk->k_begin(i_it); k_it != Cijk->k_end(i_it); ++k_it) { int k = index(k_it); if (k!=0 && k<k_limit) { bool do_mat_vec = false; for (Cijk_type::ikj_iterator j_it = Cijk->j_begin(k_it); j_it != Cijk->j_end(k_it); ++j_it) { int j = index(j_it); int j_gid = epetraCijk->GCID(j); if (j_gid < i_gid) { bool on_proc = epetraCijk->myGRID(j_gid); if (on_proc) { do_mat_vec = true; break; } } } if (do_mat_vec) { (*sg_poly)[k].Apply(*res_i, *mat_vec_tmp); for (Cijk_type::ikj_iterator j_it 
= Cijk->j_begin(k_it); j_it != Cijk->j_end(k_it); ++j_it) { int j = index(j_it); int j_gid = epetraCijk->GCID(j); double c = value(j_it); if (scale_op) c /= norms[j_gid]; if (j_gid < i_gid) { bool on_proc = epetraCijk->myGRID(j_gid); if (on_proc) { rhs_block->GetBlock(j)->Update(-c, *mat_vec_tmp, 1.0); } } } } } } } } if (made_copy) delete input; return 0; }
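// A dense scalar sketch of the forward sweep ApplyInverse() above performs,
// with entries A[i][j] standing in for the stochastic coupling blocks and the
// division standing in for the mean-based preconditioner solve (hypothetical
// helper; the symmetric option adds the mirrored sweep in reverse order):
#include <vector>

void gaussSeidelForwardSweep(const std::vector<std::vector<double> >& A,
                             std::vector<double> rhs,  // local copy of b
                             std::vector<double>& x)
{
  const int n = static_cast<int>(rhs.size());
  for (int i = 0; i < n; ++i) {
    x[i] = rhs[i] / A[i][i];        // "divide" by the diagonal (mean) block
    for (int j = i + 1; j < n; ++j)
      rhs[j] -= A[j][i] * x[i];     // fold x_i into later right-hand sides
  }
}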
int Stokhos::KLMatrixFreeOperator:: Apply(const Epetra_MultiVector& Input, Epetra_MultiVector& Result) const { // We have to be careful if Input and Result are the same vector. // If this is the case, the only possible solution is to make a copy const Epetra_MultiVector *input = &Input; bool made_copy = false; if (Input.Values() == Result.Values() && !is_stoch_parallel) { input = new Epetra_MultiVector(Input); made_copy = true; } // Initialize Result.PutScalar(0.0); const Epetra_Map* input_base_map = domain_base_map.get(); const Epetra_Map* result_base_map = range_base_map.get(); if (useTranspose == true) { input_base_map = range_base_map.get(); result_base_map = domain_base_map.get(); } // Allocate temporary storage int m = Input.NumVectors(); if (useTranspose == false && (tmp == Teuchos::null || tmp->NumVectors() != m*max_num_mat_vec)) tmp = Teuchos::rcp(new Epetra_MultiVector(*result_base_map, m*max_num_mat_vec)); else if (useTranspose == true && (tmp_trans == Teuchos::null || tmp_trans->NumVectors() != m*max_num_mat_vec)) tmp_trans = Teuchos::rcp(new Epetra_MultiVector(*result_base_map, m*max_num_mat_vec)); Epetra_MultiVector *tmp_result; if (useTranspose == false) tmp_result = tmp.get(); else tmp_result = tmp_trans.get(); // Map input into column map const Epetra_MultiVector *tmp_col; if (!is_stoch_parallel) tmp_col = input; else { if (useTranspose == false) { if (input_col == Teuchos::null || input_col->NumVectors() != m) input_col = Teuchos::rcp(new Epetra_MultiVector(*global_col_map, m)); input_col->Import(*input, *col_importer, Insert); tmp_col = input_col.get(); } else { if (input_col_trans == Teuchos::null || input_col_trans->NumVectors() != m) input_col_trans = Teuchos::rcp(new Epetra_MultiVector(*global_col_map_trans, m)); input_col_trans->Import(*input, *col_importer_trans, Insert); tmp_col = input_col_trans.get(); } } // Extract blocks EpetraExt::BlockMultiVector sg_input(View, *input_base_map, *tmp_col); EpetraExt::BlockMultiVector sg_result(View, *result_base_map, Result); for (int i=0; i<input_block.size(); i++) input_block[i] = sg_input.GetBlock(i); for (int i=0; i<result_block.size(); i++) result_block[i] = sg_result.GetBlock(i); int N = result_block[0]->MyLength(); const Teuchos::Array<double>& norms = sg_basis->norm_squared(); int d = sg_basis->dimension(); Teuchos::Array<double> zero(d), one(d); for(int j = 0; j<d; j++) { zero[j] = 0.0; one[j] = 1.0; } Teuchos::Array< double > phi_0(expansion_size), phi_1(expansion_size); sg_basis->evaluateBases(zero, phi_0); sg_basis->evaluateBases(one, phi_1); // k_begin and k_end are initialized in the constructor for (Cijk_type::k_iterator k_it=k_begin; k_it!=k_end; ++k_it) { Cijk_type::kj_iterator j_begin = Cijk->j_begin(k_it); Cijk_type::kj_iterator j_end = Cijk->j_end(k_it); int k = index(k_it); int nj = Cijk->num_j(k_it); if (nj > 0) { Teuchos::Array<double*> j_ptr(nj*m); Teuchos::Array<int> mj_indices(nj*m); int l = 0; for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) { int j = index(j_it); for (int mm=0; mm<m; mm++) { j_ptr[l*m+mm] = input_block[j]->Values()+mm*N; mj_indices[l*m+mm] = l*m+mm; } l++; } Epetra_MultiVector input_tmp(View, *input_base_map, &j_ptr[0], nj*m); Epetra_MultiVector result_tmp(View, *tmp_result, &mj_indices[0], nj*m); (*block_ops)[k].Apply(input_tmp, result_tmp); l = 0; for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) { int j = index(j_it); int j_gid = epetraCijk->GCID(j); for (Cijk_type::kji_iterator i_it = Cijk->i_begin(j_it); i_it != Cijk->i_end(j_it); ++i_it) { 
int i = index(i_it); int i_gid = epetraCijk->GRID(i); double c = value(i_it); if (k == 0) c /= phi_0[0]; else { c /= phi_1[k]; if (i_gid == j_gid) c -= phi_0[k]/(phi_1[k]*phi_0[0])*norms[i_gid]; } if (scale_op) { if (useTranspose) c /= norms[j_gid]; else c /= norms[i_gid]; } for (int mm=0; mm<m; mm++) (*result_block[i])(mm)->Update(c, *result_tmp(l*m+mm), 1.0); } l++; } } } // Destroy blocks for (int i=0; i<input_block.size(); i++) input_block[i] = Teuchos::null; for (int i=0; i<result_block.size(); i++) result_block[i] = Teuchos::null; if (made_copy) delete input; return 0; }
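// The Input/Result aliasing guard repeated across these Apply() and
// ApplyInverse() methods, shown in isolation: callers such as AztecOO may pass
// the same vector for input and output, so the input is deep-copied before it
// can be overwritten (hypothetical free function sketching the inline pattern
// used above):
#include "Epetra_MultiVector.h"

static const Epetra_MultiVector*
guardAliasing(const Epetra_MultiVector& Input, const Epetra_MultiVector& Result,
              bool& made_copy)
{
  made_copy = (Input.Values() == Result.Values());
  // The caller must delete the copy after use, exactly as the methods above
  // do with "if (made_copy) delete input;".
  return made_copy ? new Epetra_MultiVector(Input) : &Input;
}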
// ============================================================================ int ML_Epetra::MatrixFreePreconditioner:: Compute(const Epetra_CrsGraph& Graph, Epetra_MultiVector& NullSpace) { Epetra_Time TotalTime(Comm()); const int NullSpaceDim = NullSpace.NumVectors(); // get parameters from the list std::string PrecType = List_.get("prec: type", "hybrid"); std::string SmootherType = List_.get("smoother: type", "Jacobi"); std::string ColoringType = List_.get("coloring: type", "JONES_PLASSMAN"); int PolynomialDegree = List_.get("smoother: degree", 3); std::string DiagonalColoringType = List_.get("diagonal coloring: type", "JONES_PLASSMAN"); int MaximumIterations = List_.get("eigen-analysis: max iters", 10); std::string EigenType_ = List_.get("eigen-analysis: type", "cg"); double boost = List_.get("eigen-analysis: boost for lambda max", 1.0); int OutputLevel = List_.get("ML output", -47); if (OutputLevel == -47) OutputLevel = List_.get("output", 10); omega_ = List_.get("smoother: damping", omega_); ML_Set_PrintLevel(OutputLevel); bool LowMemory = List_.get("low memory", true); double AllocationFactor = List_.get("AP allocation factor", 0.5); verbose_ = (MyPID() == 0 && ML_Get_PrintLevel() > 5); // ================ // // check parameters // // ================ // if (PrecType == "presmoother only") PrecType_ = ML_MFP_PRESMOOTHER_ONLY; else if (PrecType == "hybrid") PrecType_ = ML_MFP_HYBRID; else if (PrecType == "additive") PrecType_ = ML_MFP_ADDITIVE; else ML_CHK_ERR(-3); // not recognized if (SmootherType == "none") SmootherType_ = ML_MFP_NONE; else if (SmootherType == "Jacobi") SmootherType_ = ML_MFP_JACOBI; else if (SmootherType == "block Jacobi") SmootherType_ = ML_MFP_BLOCK_JACOBI; else if (SmootherType == "Chebyshev") SmootherType_ = ML_MFP_CHEBY; else ML_CHK_ERR(-4); // not recognized if (AllocationFactor <= 0.0) ML_CHK_ERR(-1); // should be positive // =============================== // // basic checkings and some output // // =============================== // int OperatorDomainPoints = Operator_.OperatorDomainMap().NumGlobalPoints(); int OperatorRangePoints = Operator_.OperatorRangeMap().NumGlobalPoints(); int GraphBlockRows = Graph.NumGlobalBlockRows(); int GraphNnz = Graph.NumGlobalNonzeros(); NumPDEEqns_ = OperatorRangePoints / GraphBlockRows; NumMyBlockRows_ = Graph.NumMyBlockRows(); if (OperatorDomainPoints != OperatorRangePoints) ML_CHK_ERR(-1); // only square matrices if (OperatorRangePoints % NumPDEEqns_ != 0) ML_CHK_ERR(-2); // num PDEs seems not constant if (verbose_) { ML_print_line("=",78); std::cout << "*** " << std::endl; std::cout << "*** ML_Epetra::MatrixFreePreconditioner" << std::endl; std::cout << "***" << std::endl; std::cout << "Number of rows and columns = " << OperatorDomainPoints << std::endl; std::cout << "Number of rows per processor = " << OperatorDomainPoints / Comm().NumProc() << " (on average)" << std::endl; std::cout << "Number of rows in the graph = " << GraphBlockRows << std::endl; std::cout << "Number of nonzeros in the graph = " << GraphNnz << std::endl; std::cout << "Processors used in computation = " << Comm().NumProc() << std::endl; std::cout << "Number of PDE equations = " << NumPDEEqns_ << std::endl; std::cout << "Null space dimension = " << NullSpaceDim << std::endl; std::cout << "Preconditioner type = " << PrecType << std::endl; std::cout << "Smoother type = " << SmootherType << std::endl; std::cout << "Coloring type = " << ColoringType << std::endl; std::cout << "Allocation factor = " << AllocationFactor << std::endl; std::cout << 
"Number of V-cycles for C = " << List_.sublist("ML list").get("cycle applications", 1) << std::endl; std::cout << std::endl; } ResetStartTime(); // ==================================== // // compute the inverse of the diagonal, // // control that no elements are zero. // // ==================================== // for (int i = 0; i < InvPointDiagonal_->MyLength(); ++i) if ((*InvPointDiagonal_)[i] != 0.0) (*InvPointDiagonal_)[i] = 1.0 / (*InvPointDiagonal_)[i]; // ========================================================= // // Setup the smoother. I need to extract the block diagonal // // only if block Jacobi is used. For Chebyshev, I scale with // // the point diagonal only. In this latter case, I need to // // compute lambda_max of the scaled operator. // // ========================================================= // // probes for the block diagonal of the matrix. if (SmootherType_ == ML_MFP_JACOBI || SmootherType_ == ML_MFP_NONE) { // do-nothing here } else if (SmootherType_ == ML_MFP_BLOCK_JACOBI) { if (verbose_); std::cout << "Diagonal coloring type = " << DiagonalColoringType << std::endl; ML_CHK_ERR(GetBlockDiagonal(Graph, DiagonalColoringType)); AddAndResetStartTime("block diagonal construction", true); } else if (SmootherType_ == ML_MFP_CHEBY) { double lambda_min = 0.0; double lambda_max = 0.0; Teuchos::ParameterList IFPACKList; if (EigenType_ == "power-method") { ML_CHK_ERR(Ifpack_Chebyshev::PowerMethod(Operator_, *InvPointDiagonal_, MaximumIterations, lambda_max)); } else if(EigenType_ == "cg") { ML_CHK_ERR(Ifpack_Chebyshev::CG(Operator_, *InvPointDiagonal_, MaximumIterations, lambda_min, lambda_max)); } else ML_CHK_ERR(-1); // not recognized if (verbose_) { std::cout << "Using Chebyshev smoother of degree " << PolynomialDegree << std::endl; std::cout << "Estimating eigenvalues using " << EigenType_ << std::endl; std::cout << "lambda_min = " << lambda_min << ", "; std::cout << "lambda_max = " << lambda_max << std::endl; } IFPACKList.set("chebyshev: min eigenvalue", lambda_min); IFPACKList.set("chebyshev: max eigenvalue", boost * lambda_max); // FIXME: this allocates a new std::vector inside IFPACKList.set("chebyshev: operator inv diagonal", InvPointDiagonal_.get()); IFPACKList.set("chebyshev: degree", PolynomialDegree); PreSmoother_ = rcp(new Ifpack_Chebyshev((Epetra_Operator*)(&Operator_))); if (PreSmoother_.get() == 0) ML_CHK_ERR(-1); // memory error? IFPACKList.set("chebyshev: zero starting solution", true); ML_CHK_ERR(PreSmoother_->SetParameters(IFPACKList)); ML_CHK_ERR(PreSmoother_->Initialize()); ML_CHK_ERR(PreSmoother_->Compute()); PostSmoother_ = rcp(new Ifpack_Chebyshev((Epetra_Operator*)(&Operator_))); if (PostSmoother_.get() == 0) ML_CHK_ERR(-1); // memory error? IFPACKList.set("chebyshev: zero starting solution", false); ML_CHK_ERR(PostSmoother_->SetParameters(IFPACKList)); ML_CHK_ERR(PostSmoother_->Initialize()); ML_CHK_ERR(PostSmoother_->Compute()); } // ========================================================= // // building P and R for block graph. This is done by working // // on the Graph_ object. Support is provided for local // // aggregation schemes only so that all is basically local. // // Then, build the block graph coarse problem. 
// // ========================================================= // // ML wrapper for Graph_ ML_Operator* Graph_ML = ML_Operator_Create(Comm_ML()); ML_Operator_WrapEpetraCrsGraph(const_cast<Epetra_CrsGraph*>(&Graph), Graph_ML); ML_Aggregate* BlockAggr_ML = 0; ML_Operator* BlockPtent_ML = 0, *BlockRtent_ML = 0,* CoarseGraph_ML = 0; if (verbose_) std::cout << std::endl; ML_CHK_ERR(Coarsen(Graph_ML, &BlockAggr_ML, &BlockPtent_ML, &BlockRtent_ML, &CoarseGraph_ML)); if (verbose_) std::cout << std::endl; Epetra_CrsMatrix* GraphCoarse; ML_CHK_ERR(ML_Operator2EpetraCrsMatrix(CoarseGraph_ML, GraphCoarse)); // used later to estimate the entries in AP ML_Operator* CoarseAP_ML = ML_Operator_Create(Comm_ML()); ML_2matmult(Graph_ML, BlockPtent_ML, CoarseAP_ML, ML_CSR_MATRIX); int AP_MaxNnzRow, itmp = CoarseAP_ML->max_nz_per_row; Comm().MaxAll(&itmp, &AP_MaxNnzRow, 1); ML_Operator_Destroy(&CoarseAP_ML); int NumAggregates = BlockPtent_ML->invec_leng; ML_Operator_Destroy(&BlockRtent_ML); ML_Operator_Destroy(&CoarseGraph_ML); AddAndResetStartTime("construction of block C, R, and P", true); if (verbose_) std::cout << std::endl; // ================================================== // // coloring of block graph: // // - color of block row `i' is given by `ColorMap[i]' // // - number of colors is ColorMap.NumColors(). // // ================================================== // ResetStartTime(); CrsGraph_MapColoring* MapColoringTransform; if (ColoringType == "JONES_PLASSMAN") MapColoringTransform = new CrsGraph_MapColoring (CrsGraph_MapColoring::JONES_PLASSMAN, 0, false, 0); else if (ColoringType == "PSEUDO_PARALLEL") MapColoringTransform = new CrsGraph_MapColoring (CrsGraph_MapColoring::PSEUDO_PARALLEL, 0, false, 0); else if (ColoringType == "GREEDY") MapColoringTransform = new CrsGraph_MapColoring (CrsGraph_MapColoring::GREEDY, 0, false, 0); else if (ColoringType == "LUBY") MapColoringTransform = new CrsGraph_MapColoring (CrsGraph_MapColoring::LUBY, 0, false, 0); else ML_CHK_ERR(-1); Epetra_MapColoring* ColorMap = &(*MapColoringTransform)(const_cast<Epetra_CrsGraph&>(GraphCoarse->Graph())); // move the information from ColorMap to std::vector Colors const int NumColors = ColorMap->MaxNumColors(); RefCountPtr<Epetra_IntSerialDenseVector> Colors = rcp(new Epetra_IntSerialDenseVector(GraphCoarse->Graph().NumMyRows())); for (int i = 0; i < GraphCoarse->Graph().NumMyRows(); ++i) (*Colors)[i] = (*ColorMap)[i]; delete MapColoringTransform; delete ColorMap; ColorMap = 0; delete GraphCoarse; AddAndResetStartTime("coarse graph coloring", true); if (verbose_) std::cout << std::endl; // get some other information about the aggregates, to be used // in the QR factorization of the null space. NodesOfAggregate // contains the local ID of block rows contained in each aggregate. // FIXME: make it faster std::vector< std::vector<int> > NodesOfAggregate(NumAggregates); for (int i = 0; i < Graph.NumMyBlockRows(); ++i) { int AID = BlockAggr_ML->aggr_info[0][i]; NodesOfAggregate[AID].push_back(i); } int MaxAggrSize = 0; for (int i = 0; i < NumAggregates; ++i) { const int& MySize = NodesOfAggregate[i].size(); if (MySize > MaxAggrSize) MaxAggrSize = MySize; } // collect aggregate information, and mark all nodes that are // connected with each aggregate. These nodes will have a possible // nonzero entry after the matrix-matrix product between the Operator_ // and the tentative prolongator. 
std::vector<std::vector<int> > aggregates(NumAggregates); std::vector<int>::iterator iter; for (int i = 0; i < NumAggregates; ++i) aggregates[i].reserve(MaxAggrSize); for (int i = 0; i < Graph.NumMyBlockRows(); ++i) { int AID = BlockAggr_ML->aggr_info[0][i]; int NumEntries; int* Indices; Graph.ExtractMyRowView(i, NumEntries, Indices); for (int k = 0; k < NumEntries; ++k) { // FIXME: use hash?? const int& GCID = Graph.ColMap().GID(Indices[k]); iter = find(aggregates[AID].begin(), aggregates[AID].end(), GCID); if (iter == aggregates[AID].end()) aggregates[AID].push_back(GCID); } } int* BlockNodeList = Graph.ColMap().MyGlobalElements(); // finally get rid of the ML_Aggregate structure. ML_Aggregate_Destroy(&BlockAggr_ML); const Epetra_Map& FineMap = Operator_.OperatorDomainMap(); Epetra_Map CoarseMap(-1, NumAggregates * NullSpaceDim, 0, Comm()); RefCountPtr<Epetra_Map> BlockNodeListMap = rcp(new Epetra_Map(-1, Graph.ColMap().NumMyElements(), BlockNodeList, 0, Comm())); std::vector<int> NodeList(Graph.ColMap().NumMyElements() * NumPDEEqns_); for (int i = 0; i < Graph.ColMap().NumMyElements(); ++i) for (int m = 0; m < NumPDEEqns_; ++m) NodeList[i * NumPDEEqns_ + m] = BlockNodeList[i] * NumPDEEqns_ + m; RefCountPtr<Epetra_Map> NodeListMap = rcp(new Epetra_Map(-1, NodeList.size(), &NodeList[0], 0, Comm())); AddAndResetStartTime("data structures", true); // ====================== // // process the null space // // ====================== // // CHECKME Epetra_MultiVector NewNullSpace(CoarseMap, NullSpaceDim); NewNullSpace.PutScalar(0.0); if (NullSpaceDim == 1) { double* ns_ptr = NullSpace.Values(); for (int AID = 0; AID < NumAggregates; ++AID) { double dtemp = 0.0; for (int j = 0; j < (int) (NodesOfAggregate[AID].size()); j++) for (int m = 0; m < NumPDEEqns_; ++m) { const int& pos = NodesOfAggregate[AID][j] * NumPDEEqns_ + m; dtemp += (ns_ptr[pos] * ns_ptr[pos]); } dtemp = std::sqrt(dtemp); NewNullSpace[0][AID] = dtemp; dtemp = 1.0 / dtemp; for (int j = 0; j < (int) (NodesOfAggregate[AID].size()); j++) for (int m = 0; m < NumPDEEqns_; ++m) ns_ptr[NodesOfAggregate[AID][j] * NumPDEEqns_ + m] *= dtemp; } } else { // FIXME std::vector<double> qr_ptr(MaxAggrSize * NumPDEEqns_ * MaxAggrSize * NumPDEEqns_); std::vector<double> tmp_ptr(MaxAggrSize * NumPDEEqns_ * NullSpaceDim); std::vector<double> work(NullSpaceDim); int info; for (int AID = 0; AID < NumAggregates; ++AID) { int MySize = NodesOfAggregate[AID].size(); int MyFullSize = NodesOfAggregate[AID].size() * NumPDEEqns_; int lwork = NullSpaceDim; for (int k = 0; k < NullSpaceDim; ++k) for (int j = 0; j < MySize; ++j) for (int m = 0; m < NumPDEEqns_; ++m) qr_ptr[k * MyFullSize + j * NumPDEEqns_ + m] = NullSpace[k][NodesOfAggregate[AID][j] * NumPDEEqns_ + m]; DGEQRF_F77(&MyFullSize, (int*)&NullSpaceDim, &qr_ptr[0], &MyFullSize, &tmp_ptr[0], &work[0], &lwork, &info); ML_CHK_ERR(info); if (work[0] > lwork) work.resize((int) work[0]); // the upper triangle of qr_ptr is now R, so copy that into the // new nullspace for (int j = 0; j < NullSpaceDim; j++) for (int k = j; k < NullSpaceDim; k++) NewNullSpace[k][AID * NullSpaceDim + j] = qr_ptr[j + MyFullSize * k]; // to get this block of P, need to run qr_ptr through another LAPACK // function: DORGQR_F77(&MyFullSize, (int*)&NullSpaceDim, (int*)&NullSpaceDim, &qr_ptr[0], &MyFullSize, &tmp_ptr[0], &work[0], &lwork, &info); ML_CHK_ERR(info); // DORGQR returned a non-zero if (work[0] > lwork) work.resize((int) work[0]); // insert the Q block into the null space for (int k = 0; k < NullSpaceDim; ++k) for (int j = 0; j <
MySize; ++j) for (int m = 0; m < NumPDEEqns_; ++m) { int LRID = NodesOfAggregate[AID][j] * NumPDEEqns_ + m; double& val = qr_ptr[k * MyFullSize + j * NumPDEEqns_ + m]; NullSpace[k][LRID] = val; } } } AddAndResetStartTime("null space setup", true); if (verbose_) std::cout << "Number of colors on processor " << Comm().MyPID() << " = " << NumColors << std::endl; if (verbose_) std::cout << "Maximum number of colors = " << NumColors << std::endl; RefCountPtr<Epetra_FECrsMatrix> AP; // try to get a good estimate of the nonzeros per row. // This is a compromise between efficiency -- that is, reducing // the number of memory allocations -- and memory usage -- that is, // overestimating can actually kill the code. Basically, this is // all due to our dear friend, the Cray XT3. AP = rcp(new Epetra_FECrsMatrix(Copy, FineMap, (int) (AllocationFactor * AP_MaxNnzRow * NullSpaceDim))); if (AP.get() == 0) throw(-1); if (!LowMemory) { // ================================================= // // allocate one big chunk of memory, and use View // // to create Epetra_MultiVectors. Note that // // NumColors * NullSpaceDim can indeed be quite a // // large value. To reduce the memory consumption, // // both ColoredAP and ExtColoredAP use the same // // memory array. // // ================================================= // Epetra_MultiVector* ColoredP; std::vector<double> ColoredAP_ptr; try { ColoredP = new Epetra_MultiVector(FineMap, NumColors * NullSpaceDim); ColoredAP_ptr.resize(NumColors * NullSpaceDim * NodeListMap->NumMyPoints()); } catch (std::exception& rhs) { catch_message("the allocation of ColoredP", rhs.what(), __FILE__, __LINE__); ML_CHK_ERR(-1); } catch (...) { catch_message("the allocation of ColoredP", "", __FILE__, __LINE__); ML_CHK_ERR(-1); } int ColoredAP_LDA = NodeListMap->NumMyPoints(); ColoredP->PutScalar(0.0); for (int i = 0; i < BlockPtent_ML->outvec_leng; ++i) { int allocated = 1; int NumEntries; int Indices; double Values; int ierr = ML_Operator_Getrow(BlockPtent_ML, 1, &i, allocated, &Indices, &Values, &NumEntries); if (ierr < 0) ML_CHK_ERR(-1); assert (NumEntries == 1); // this is the block P const int& Color = (*Colors)[Indices] - 1; for (int k = 0; k < NumPDEEqns_; ++k) for (int j = 0; j < NullSpaceDim; ++j) (*ColoredP)[(Color * NullSpaceDim + j)][i * NumPDEEqns_ + k] = NullSpace[j][i * NumPDEEqns_ + k]; } ML_Operator_Destroy(&BlockPtent_ML); Epetra_MultiVector ColoredAP(View, Operator_.OperatorRangeMap(), &ColoredAP_ptr[0], ColoredAP_LDA, NumColors * NullSpaceDim); // move ColoredAP into ColoredP. This should not be required, // but I prefer to skip strange games with View pointers Operator_.Apply(*ColoredP, ColoredAP); *ColoredP = ColoredAP; // FIXME: only if NumProc > 1 Epetra_MultiVector ExtColoredAP(View, *NodeListMap, &ColoredAP_ptr[0], ColoredAP_LDA, NumColors * NullSpaceDim); try { Epetra_Import Importer(*NodeListMap, Operator_.OperatorRangeMap()); ExtColoredAP.Import(*ColoredP, Importer, Insert); } catch (std::exception& rhs) { catch_message("importing of ExtColoredAP", rhs.what(), __FILE__, __LINE__); ML_CHK_ERR(-1); } catch (...)
{ catch_message("importing of ExtColoredAP", "", __FILE__, __LINE__); ML_CHK_ERR(-1); } delete ColoredP; AddAndResetStartTime("computation of AP", true); // populate the actual AP operator, skip some controls to make it faster for (int i = 0; i < NumAggregates; ++i) { for (int j = 0; j < (int) (aggregates[i].size()); ++j) { int GRID = aggregates[i][j]; int LRID = BlockNodeListMap->LID(GRID); // this is the block ID //assert (LRID != -1); int GCID = CoarseMap.GID(i * NullSpaceDim); //assert (GCID != -1); int color = (*Colors)[i] - 1; for (int k = 0; k < NumPDEEqns_; ++k) for (int j = 0; j < NullSpaceDim; ++j) { double val = ExtColoredAP[color * NullSpaceDim + j][LRID * NumPDEEqns_ + k]; if (val != 0.0) { int GRID2 = GRID * NumPDEEqns_ + k; int GCID2 = GCID + j; AP->InsertGlobalValues(1, &GRID2, 1, &GCID2, &val); //if (ierr < 0) ML_CHK_ERR(ierr); } } } } } else { // =============================================================== // // apply the operator one color at-a-time. This requires NumColors // // cycles over BlockPtent. However, the memory requirements are // // drastically reduced. As for low-memory == false, both ColoredAP // // and ExtColoredAP point to the same memory location. // // =============================================================== // if (verbose_) std::cout << "Using low-memory computation for AP" << std::endl; Epetra_MultiVector ColoredP(FineMap, NullSpaceDim); std::vector<double> ColoredAP_ptr; try { ColoredAP_ptr.resize(NullSpaceDim * NodeListMap->NumMyPoints()); } catch (std::exception& rhs) { catch_message("resizing of ColoredAP_pt", rhs.what(), __FILE__, __LINE__); ML_CHK_ERR(-1); } catch (...) { catch_message("resizing of ColoredAP_pt", "", __FILE__, __LINE__); ML_CHK_ERR(-1); } Epetra_MultiVector ColoredAP(View, Operator_.OperatorRangeMap(), &ColoredAP_ptr[0], NodeListMap->NumMyPoints(), NullSpaceDim); Epetra_MultiVector ExtColoredAP(View, *NodeListMap, &ColoredAP_ptr[0], NodeListMap->NumMyPoints(), NullSpaceDim); Epetra_Import Importer(*NodeListMap, Operator_.OperatorRangeMap()); for (int ic = 0; ic < NumColors; ++ic) { if (ML_Get_PrintLevel() > 8 && Comm().MyPID() == 0) { if (ic % 20 == 0) std::cout << "Processing color " << flush; std::cout << ic << " " << flush; if (ic % 20 == 19 || ic == NumColors - 1) std::cout << std::endl; if (ic == NumColors - 1) std::cout << std::endl; } ColoredP.PutScalar(0.0); for (int i = 0; i < BlockPtent_ML->outvec_leng; ++i) { int allocated = 1; int NumEntries; int Indices; double Values; int ierr = ML_Operator_Getrow(BlockPtent_ML, 1 ,&i, allocated, &Indices,&Values,&NumEntries); if (ierr < 0 || // something strange in getrow NumEntries != 1) // this is the block P ML_CHK_ERR(-1); const int& Color = (*Colors)[Indices] - 1; if (Color != ic) continue; // skip this color for this cycle for (int k = 0; k < NumPDEEqns_; ++k) for (int j = 0; j < NullSpaceDim; ++j) ColoredP[j][i * NumPDEEqns_ + k] = NullSpace[j][i * NumPDEEqns_ + k]; } Operator_.Apply(ColoredP, ColoredAP); ColoredP = ColoredAP; // just to be safe ExtColoredAP.Import(ColoredP, Importer, Insert); // populate the actual AP operator, skip some controls to make it faster std::vector<int> InsertCols(NullSpaceDim * NumPDEEqns_); std::vector<double> InsertValues(NullSpaceDim * NumPDEEqns_); for (int i = 0; i < NumAggregates; ++i) { for (int j = 0; j < (int) (aggregates[i].size()); ++j) { int GRID = aggregates[i][j]; int LRID = BlockNodeListMap->LID(GRID); // this is the block ID //assert (LRID != -1); int GCID = CoarseMap.GID(i * NullSpaceDim); //assert (GCID != -1); int 
color = (*Colors)[i] - 1; if (color != ic) continue; for (int k = 0; k < NumPDEEqns_; ++k) { int count = 0; int GRID2 = GRID * NumPDEEqns_ + k; for (int j = 0; j < NullSpaceDim; ++j) { double val = ExtColoredAP[j][LRID * NumPDEEqns_ + k]; if (val != 0.0) { InsertCols[count] = GCID + j; InsertValues[count] = val; ++count; } } AP->InsertGlobalValues(1, &GRID2, count, &InsertCols[0], &InsertValues[0]); } } } } ML_Operator_Destroy(&BlockPtent_ML); } aggregates.resize(0); BlockNodeListMap = Teuchos::null; NodeListMap = Teuchos::null; Colors = Teuchos::null; AP->GlobalAssemble(false); AP->FillComplete(CoarseMap, FineMap); #if 0 try { AP->OptimizeStorage(); } catch(...) { // a memory error was reported, typically ReportError. // We just continue with fingers crossed. } #endif AddAndResetStartTime("computation of the final AP", true); ML_Operator* AP_ML = ML_Operator_Create(Comm_ML()); ML_Operator_WrapEpetraMatrix(AP.get(), AP_ML); // ======== // // create R // // ======== // std::vector<int> REntries(NumAggregates * NullSpaceDim); for (int AID = 0; AID < NumAggregates; ++AID) { for (int m = 0; m < NullSpaceDim; ++m) REntries[AID * NullSpaceDim + m] = NodesOfAggregate[AID].size() * NumPDEEqns_; } R_ = rcp(new Epetra_CrsMatrix(Copy, CoarseMap, &REntries[0], true)); REntries.resize(0); for (int AID = 0; AID < NumAggregates; ++AID) { const int& MySize = NodesOfAggregate[AID].size(); // FIXME: make it faster for (int j = 0; j < MySize; ++j) for (int m = 0; m < NumPDEEqns_; ++m) for (int k = 0; k < NullSpaceDim; ++k) { int LCID = NodesOfAggregate[AID][j] * NumPDEEqns_ + m; int GCID = FineMap.GID(LCID); assert (GCID != -1); double& val = NullSpace[k][LCID]; int GRID = CoarseMap.GID(AID * NullSpaceDim + k); int ierr = R_->InsertGlobalValues(GRID, 1, &val, &GCID); if (ierr < 0) ML_CHK_ERR(-1); } } NodesOfAggregate.resize(0); R_->FillComplete(FineMap, CoarseMap); #if 0 try { R_->OptimizeStorage(); } catch(...) { // a memory error was reported, typically ReportError. // We just continue with fingers crossed. } #endif ML_Operator* R_ML = ML_Operator_Create(Comm_ML()); ML_Operator_WrapEpetraMatrix(R_.get(), R_ML); AddAndResetStartTime("computation of R", true); // ======== // // Create C // // ======== // C_ML_ = ML_Operator_Create(Comm_ML()); ML_2matmult(R_ML, AP_ML, C_ML_, ML_MSR_MATRIX); ML_Operator_Destroy(&AP_ML); ML_Operator_Destroy(&R_ML); AP = Teuchos::null; C_ = rcp(new ML_Epetra::RowMatrix(C_ML_, &Comm(), false)); assert (R_->OperatorRangeMap().SameAs(C_->OperatorDomainMap())); TotalTime.ResetStartTime(); AddAndResetStartTime("computation of C", true); if (verbose_) { std::cout << "Matrix-free preconditioner built. Now building solver for C..." << std::endl; } Teuchos::ParameterList& sublist = List_.sublist("ML list"); sublist.set("PDE equations", NullSpaceDim); sublist.set("null space: type", "pre-computed"); sublist.set("null space: dimension", NewNullSpace.NumVectors()); sublist.set("null space: vectors", NewNullSpace.Values()); MLP_ = rcp(new MultiLevelPreconditioner(*C_, sublist, true)); assert (MLP_.get() != 0); IsComputed_ = true; AddAndResetStartTime("computation of the preconditioner for C", true); if (verbose_) { std::cout << std::endl; std::cout << "Total CPU time for construction (all included) = "; std::cout << TotalCPUTime() << std::endl; ML_print_line("=",78); } return(0); }
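// Compute() above never forms the fine-level matrix explicitly: the product
// A*P is probed by coloring the coarse graph, summing same-colored columns of
// P, and applying the operator once per color. A minimal dense sketch of that
// idea (hypothetical names; rows[j] is the known sparsity pattern of column j
// of A*P, which is what makes the un-mixing after each apply well defined):
#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

typedef std::vector<double> Vec;

void probeAP(std::function<void(const Vec&, Vec&)> applyA,
             const std::vector<Vec>& P,                    // columns of P
             const std::vector<std::vector<int> >& colors, // color -> columns
             const std::vector<std::vector<int> >& rows,   // column -> rows
             std::vector<Vec>& AP)                         // columns of A*P
{
  const std::size_t n = P[0].size();
  Vec x(n), y(n);
  for (std::size_t c = 0; c < colors.size(); ++c) {
    std::fill(x.begin(), x.end(), 0.0);
    for (std::size_t q = 0; q < colors[c].size(); ++q) {   // mix columns
      const int j = colors[c][q];
      for (std::size_t r = 0; r < n; ++r) x[r] += P[j][r];
    }
    applyA(x, y);                                          // one apply per color
    for (std::size_t q = 0; q < colors[c].size(); ++q) {   // un-mix by pattern
      const int j = colors[c][q];
      for (std::size_t t = 0; t < rows[j].size(); ++t)
        AP[j][rows[j][t]] = y[rows[j][t]];
    }
  }
}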