int main(int argc, char* argv[]) { #ifdef HAVE_MPI MPI_Init(&argc,&argv); #endif // Creating an instance of the BLAS class for double-precision kernels looks like: Teuchos::BLAS<int, double> blas; // This instance provides the access to all the BLAS kernels listed in Figure \ref{blas_kernels}: const int n = 10; double alpha = 2.0; double x[ n ]; for ( int i=0; i<n; i++ ) { x[i] = i; } blas.SCAL( n, alpha, x, 1 ); int max_idx = blas.IAMAX( n, x, 1 ); cout<< "The index of the maximum magnitude entry of x[] is the " << max_idx <<"-th and x[ " << max_idx-1 << " ] = "<< x[max_idx-1] << endl; #ifdef HAVE_MPI MPI_Finalize(); #endif return 0; }
int main(int argc, char* argv[]) { std::cout << Teuchos::Teuchos_Version() << std::endl << std::endl; // Creating an instance of the BLAS class for double-precision kernels looks like: Teuchos::BLAS<int, double> blas; // This instance provides the access to all the BLAS kernels like _SCAL. const int n = 10; double alpha = 2.0; double x[ n ]; for ( int i=0; i<n; i++ ) { x[i] = i; } blas.SCAL( n, alpha, x, 1 ); int max_idx = blas.IAMAX( n, x, 1 ); std::cout<< "The index of the maximum magnitude entry of x[] is the " << max_idx <<"-th and x[ " << max_idx-1 << " ] = "<< x[max_idx-1] << std::endl; return 0; }
double do_time_teuchos_fad_dot(unsigned int m, unsigned int ndot, unsigned int nloop) { Sacado::Random<double> urand(0.0, 1.0); Teuchos::BLAS<int,FadType> blas; std::vector<FadType> X(m), Y(m); for (unsigned int i=0; i<m; i++) { X[i] = FadType(ndot, urand.number()); Y[i] = FadType(ndot, urand.number()); for (unsigned int k=0; k<ndot; k++) { X[i].fastAccessDx(k) = urand.number(); Y[i].fastAccessDx(k) = urand.number(); } } Teuchos::Time timer("Teuchos Fad DOT", false); timer.start(true); for (unsigned int j=0; j<nloop; j++) { FadType z = blas.DOT(m, &X[0], 1, &Y[0], 1); } timer.stop(); return timer.totalElapsedTime() / nloop; }
/* Computes integrals of monomials over a given reference cell. */ double computeIntegral(shards::CellTopology & cellTopology, int cubDegree, int xDeg, int yDeg, int zDeg) { DefaultCubatureFactory<double> cubFactory; // create factory Teuchos::RCP<Cubature<double> > myCub = cubFactory.create(cellTopology, cubDegree); // create default cubature double val = 0.0; int cubDim = myCub->getDimension(); int numCubPoints = myCub->getNumPoints(); FieldContainer<double> point(cubDim); FieldContainer<double> cubPoints(numCubPoints, cubDim); FieldContainer<double> cubWeights(numCubPoints); FieldContainer<double> functValues(numCubPoints); myCub->getCubature(cubPoints, cubWeights); for (int i=0; i<numCubPoints; i++) { for (int j=0; j<cubDim; j++) { point(j) = cubPoints(i,j); } functValues(i) = computeMonomial(point, xDeg, yDeg, zDeg); } Teuchos::BLAS<int, double> myblas; int inc = 1; val = myblas.DOT(numCubPoints, &functValues[0], inc, &cubWeights[0], inc); return val; }
static void GEMM (const Teuchos::ETransp transA, const Teuchos::ETransp transB, const Scalar& alpha, const View<const Scalar**, LayoutLeft, DeviceType>& A, const View<const Scalar**, LayoutLeft, DeviceType>& B, const Scalar& beta, const View<Scalar**, LayoutLeft, DeviceType>& C) { const int n = static_cast<int> (C.dimension_1 ()); const int lda = static_cast<int> (Impl::getStride2DView (A)); Teuchos::BLAS<int,Scalar> blas; // For some BLAS implementations (e.g., MKL), GEMM when B has // one column may be signficantly less efficient than GEMV. if (n == 1 && transB == Teuchos::NO_TRANS) { blas.GEMV (transA, A.dimension_0 (), A.dimension_1 (), alpha, A.ptr_on_device (), lda, B.ptr_on_device (), static_cast<int> (1), beta, C.ptr_on_device (), static_cast<int> (1)); } else { const int m = static_cast<int> (C.dimension_0 ()); const int k = static_cast<int> (transA == Teuchos::NO_TRANS ? A.dimension_1 () : A.dimension_0 ()); const int ldb = static_cast<int> (Impl::getStride2DView (B)); const int ldc = static_cast<int> (Impl::getStride2DView (C)); blas.GEMM (transA, transB, m, n, k, alpha, A.ptr_on_device(), lda, B.ptr_on_device(), ldb, beta, C.ptr_on_device(), ldc); } }
double do_time_teuchos_double_gemv(unsigned int m, unsigned int n, unsigned int nloop) { Sacado::Random<double> urand(0.0, 1.0); Teuchos::BLAS<int,double> blas; std::vector<double> A(m*n), B(n), C(m); for (unsigned int j=0; j<n; j++) { for (unsigned int i=0; i<m; i++) A[i+j*m] = urand.number(); B[j] = urand.number(); } for (unsigned int i=0; i<m; i++) C[i] = urand.number(); double alpha = urand.number(); double beta = urand.number(); Teuchos::Time timer("Teuchos Double GEMV", false); timer.start(true); for (unsigned int j=0; j<nloop; j++) { blas.GEMV(Teuchos::NO_TRANS, m, n, alpha, &A[0], m, &B[0], 1, beta, &C[0], 1); } timer.stop(); return timer.totalElapsedTime() / nloop; }
void Stokhos::StieltjesPCEBasis<ordinal_type, value_type>:: transformCoeffsFromStieltjes(const value_type *in, value_type *out) const { Teuchos::BLAS<ordinal_type, value_type> blas; blas.GEMV(Teuchos::TRANS, fromStieltjesMat.numRows(), fromStieltjesMat.numCols(), 1.0, fromStieltjesMat.values(), fromStieltjesMat.numRows(), in, 1, 0.0, out, 1); }
void Stokhos::LanczosPCEBasis<ordinal_type, value_type>:: transformCoeffsFromLanczos(const value_type *in, value_type *out) const { Teuchos::BLAS<ordinal_type, value_type> blas; ordinal_type sz = fromStieltjesMat.numRows(); blas.GEMV(Teuchos::NO_TRANS, sz, this->p+1, value_type(1.0), fromStieltjesMat.values(), sz, in, ordinal_type(1), value_type(0.0), out, ordinal_type(1)); }
void Stokhos::GramSchmidtBasis<ordinal_type, value_type>:: transformCoeffs(const value_type *in, value_type *out) const { Teuchos::BLAS<ordinal_type, value_type> blas; for (ordinal_type i=0; i<sz; i++) out[i] = in[i]; blas.TRSM(Teuchos::LEFT_SIDE, Teuchos::LOWER_TRI, Teuchos::TRANS, Teuchos::UNIT_DIAG, sz, 1, 1.0, gs_mat.values(), sz, out, sz); }
void Stokhos::MonoProjPCEBasis<ordinal_type, value_type>:: transformCoeffs(const value_type *in, value_type *out) const { // Transform coefficients to normalized basis Teuchos::BLAS<ordinal_type, value_type> blas; blas.GEMV(Teuchos::NO_TRANS, pce_sz, this->p+1, value_type(1.0), basis_vecs.values(), pce_sz, in, ordinal_type(1), value_type(0.0), out, ordinal_type(1)); // Transform from normalized to original for (ordinal_type i=0; i<pce_sz; i++) out[i] /= pce_norms[i]; }
void Arnoldi::_Orthogonalize(cSDV& q){ Teuchos::BLAS<int, complex<double> > blas; complex<double> OrthogFactor; for( int j = 0; j <= _k; ++j ){ OrthogFactor = blas.DOT(_length, q.values(), 1, _Q[j], 1); _H[_k][j] = OrthogFactor; complex<double>* iterQ = _Q[j]; complex<double>* iterq = q.values(); for( int i = 0; i < _length; ++i, ++iterQ, ++iterq ){ *iterq -= OrthogFactor*(*iterQ); } } }
KOKKOS_INLINE_FUNCTION int Gemm<Trans::ConjTranspose,Trans::NoTranspose, AlgoGemm::ExternalBlas,Variant::One> ::invoke(PolicyType &policy, MemberType &member, const ScalarType alpha, DenseExecViewTypeA &A, DenseExecViewTypeB &B, const ScalarType beta, DenseExecViewTypeC &C) { // static_assert( Kokkos::Impl::is_same< // typename DenseMatrixTypeA::space_type, // typename DenseMatrixTypeB::space_type // >::value && // Kokkos::Impl::is_same< // typename DenseMatrixTypeB::space_type, // typename DenseMatrixTypeC::space_type // >::value, // "Space type of input matrices does not match" ); if (member.team_rank() == 0) { #if \ defined( HAVE_SHYLUTACHO_TEUCHOS ) && \ defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) typedef typename DenseExecViewTypeA::ordinal_type ordinal_type; typedef typename DenseExecViewTypeA::value_type value_type; Teuchos::BLAS<ordinal_type,value_type> blas; const ordinal_type m = C.NumRows(); const ordinal_type n = C.NumCols(); const ordinal_type k = B.NumRows(); if (m > 0 && n > 0 && k > 0) blas.GEMM(Teuchos::CONJ_TRANS, Teuchos::NO_TRANS, m, n, k, alpha, A.ValuePtr(), A.BaseObject().ColStride(), B.ValuePtr(), B.BaseObject().ColStride(), beta, C.ValuePtr(), C.BaseObject().ColStride()); #else TACHO_TEST_FOR_ABORT( true, MSG_NOT_HAVE_PACKAGE("Teuchos") ); #endif } return 0; }
double do_time_teuchos_fad_gemm(unsigned int m, unsigned int n, unsigned int k, unsigned int ndot, unsigned int nloop) { Sacado::Random<double> urand(0.0, 1.0); Teuchos::BLAS<int,FadType> blas; std::vector<FadType> A(m*k), B(k*n), C(m*n); for (unsigned int j=0; j<k; j++) { for (unsigned int i=0; i<m; i++) { A[i+j*m] = FadType(ndot, urand.number()); for (unsigned int l=0; l<ndot; l++) A[i+j*m].fastAccessDx(l) = urand.number(); } } for (unsigned int j=0; j<n; j++) { for (unsigned int i=0; i<k; i++) { B[i+j*k] = FadType(ndot, urand.number()); for (unsigned int l=0; l<ndot; l++) B[i+j*k].fastAccessDx(l) = urand.number(); } } for (unsigned int j=0; j<n; j++) { for (unsigned int i=0; i<m; i++) { C[i+j*m] = FadType(ndot, urand.number()); for (unsigned int l=0; l<ndot; l++) C[i+j*m].fastAccessDx(l) = urand.number(); } } FadType alpha(ndot, urand.number()); FadType beta(ndot, urand.number()); for (unsigned int l=0; l<ndot; l++) { alpha.fastAccessDx(l) = urand.number(); beta.fastAccessDx(l) = urand.number(); } Teuchos::Time timer("Teuchos Fad GEMM", false); timer.start(true); for (unsigned int j=0; j<nloop; j++) { blas.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, m, n, k, alpha, &A[0], m, &B[0], k, beta, &C[0], m); } timer.stop(); return timer.totalElapsedTime() / nloop; }
KOKKOS_INLINE_FUNCTION int Trsm<Side::Left,Uplo::Upper,Trans::NoTranspose, AlgoTrsm::ExternalBlas,Variant::One> ::invoke(PolicyType &policy, const MemberType &member, const int diagA, const ScalarType alpha, DenseExecViewTypeA &A, DenseExecViewTypeB &B) { // static_assert( Kokkos::Impl::is_same< // typename DenseMatrixTypeA::space_type, // Kokkos::Cuda // >::value, // "Cuda space is not available for calling external BLAS" ); // static_assert( Kokkos::Impl::is_same< // typename DenseMatrixTypeA::space_type, // typename DenseMatrixTypeB::space_type // >::value, // "Space type of input matrices does not match" ); //typedef typename DenseExecViewTypeA::space_type space_type; typedef typename DenseExecViewTypeA::ordinal_type ordinal_type; typedef typename DenseExecViewTypeA::value_type value_type; if (member.team_rank() == 0) { #ifdef HAVE_SHYLUTACHO_TEUCHOS Teuchos::BLAS<ordinal_type,value_type> blas; const ordinal_type m = A.NumRows(); const ordinal_type n = B.NumCols(); blas.TRSM(Teuchos::LEFT_SIDE, Teuchos::UPPER_TRI, Teuchos::NO_TRANS, (diagA == Diag::Unit ? Teuchos::UNIT_DIAG : Teuchos::NON_UNIT_DIAG), m, n, alpha, A.ValuePtr(), A.BaseObject().ColStride(), B.ValuePtr(), B.BaseObject().ColStride()); #else TACHO_TEST_FOR_ABORT( true, MSG_NOT_HAVE_PACKAGE("Teuchos") ); #endif } return 0; }
void Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Setup(const MultiVector& B, const MultiVector& Bc, RCP<const CrsGraph> Ppattern) { Ppattern_ = Ppattern; const RCP<const Map> uniqueMap = Ppattern_->getDomainMap(); const RCP<const Map> nonUniqueMap = Ppattern_->getColMap(); RCP<const Import> importer = ImportFactory::Build(uniqueMap, nonUniqueMap); const size_t NSDim = Bc.getNumVectors(); X_ = MultiVectorFactory::Build(nonUniqueMap, NSDim); X_->doImport(Bc, *importer, Xpetra::INSERT); size_t numRows = Ppattern_->getNodeNumRows(); XXtInv_.resize(numRows); Teuchos::SerialDenseVector<LO,SC> BcRow(NSDim, false); for (size_t i = 0; i < numRows; i++) { Teuchos::ArrayView<const LO> indices; Ppattern_->getLocalRowView(i, indices); size_t nnz = indices.size(); Teuchos::SerialDenseMatrix<LO,SC> locX(NSDim, nnz, false); for (size_t j = 0; j < nnz; j++) { for (size_t k = 0; k < NSDim; k++) BcRow[k] = X_->getData(k)[indices[j]]; Teuchos::setCol(BcRow, (LO)j, locX); } XXtInv_[i] = Teuchos::SerialDenseMatrix<LO,SC>(NSDim, NSDim, false); Teuchos::BLAS<LO,SC> blas; blas.GEMM(Teuchos::NO_TRANS, Teuchos::CONJ_TRANS, NSDim, NSDim, nnz, Teuchos::ScalarTraits<SC>::one(), locX.values(), locX.stride(), locX.values(), locX.stride(), Teuchos::ScalarTraits<SC>::zero(), XXtInv_[i].values(), XXtInv_[i].stride()); Teuchos::LAPACK<LO,SC> lapack; LO info, lwork = 3*NSDim; ArrayRCP<LO> IPIV(NSDim); ArrayRCP<SC> WORK(lwork); lapack.GETRF(NSDim, NSDim, XXtInv_[i].values(), XXtInv_[i].stride(), IPIV.get(), &info); lapack.GETRI(NSDim, XXtInv_[i].values(), XXtInv_[i].stride(), IPIV.get(), WORK.get(), lwork, &info); } }
double do_time_teuchos_fad_gemv(unsigned int m, unsigned int n, unsigned int ndot, unsigned int nloop) { Sacado::Random<double> urand(0.0, 1.0); Teuchos::BLAS<int,FadType> blas; std::vector<FadType> A(m*n), B(n), C(m); for (unsigned int j=0; j<n; j++) { for (unsigned int i=0; i<m; i++) { //A[i+j*m] = urand.number(); A[i+j*m] = FadType(ndot, urand.number()); for (unsigned int k=0; k<ndot; k++) A[i+j*m].fastAccessDx(k) = urand.number(); } B[j] = FadType(ndot, urand.number()); for (unsigned int k=0; k<ndot; k++) B[j].fastAccessDx(k) = urand.number(); } for (unsigned int i=0; i<m; i++) { C[i] = FadType(ndot, urand.number()); for (unsigned int k=0; k<ndot; k++) C[i].fastAccessDx(k) = urand.number(); } FadType alpha(ndot, urand.number()); FadType beta(ndot, urand.number()); for (unsigned int k=0; k<ndot; k++) { alpha.fastAccessDx(k) = urand.number(); beta.fastAccessDx(k) = urand.number(); } Teuchos::Time timer("Teuchos Fad GEMV", false); timer.start(true); for (unsigned int j=0; j<nloop; j++) { blas.GEMV(Teuchos::NO_TRANS, m, n, alpha, &A[0], m, &B[0], 1, beta, &C[0], 1); } timer.stop(); return timer.totalElapsedTime() / nloop; }
double do_time_teuchos_double_dot(unsigned int m, unsigned int nloop) { Sacado::Random<double> urand(0.0, 1.0); Teuchos::BLAS<int,double> blas; std::vector<double> X(m), Y(m); for (unsigned int i=0; i<m; i++) { X[i] = urand.number(); Y[i] = urand.number(); } Teuchos::Time timer("Teuchos Double DOT", false); timer.start(true); double z; for (unsigned int j=0; j<nloop; j++) { z += blas.DOT(m, &X[0], 1, &Y[0], 1); } timer.stop(); return timer.totalElapsedTime() / nloop; }
static void GEMM (const Teuchos::ETransp transA, const Teuchos::ETransp transB, const double& alpha, const View<const double**, LayoutLeft, DeviceType>& A, const View<const double**, LayoutLeft, DeviceType>& B, const double& beta, const View<double**, LayoutLeft, DeviceType>& C) { const int n = static_cast<int> (C.dimension_1 ()); // For some BLAS implementations (e.g., MKL), GEMM when B has // one column may be signficantly less efficient than GEMV. if (n == 1 && transB == Teuchos::NO_TRANS) { char trans = 'N'; if (transA == Teuchos::TRANS) { trans = 'T'; } else if (transA == Teuchos::CONJ_TRANS) { trans = 'C'; } auto B_0 = Kokkos::subview (B, Kokkos::ALL (), 0); auto C_0 = Kokkos::subview (C, Kokkos::ALL (), 0); KokkosBlas::gemv (&trans, alpha, A, B_0, beta, C_0); } else { const int m = static_cast<int> (C.dimension_0 ()); const int k = static_cast<int> (transA == Teuchos::NO_TRANS ? A.dimension_1 () : A.dimension_0 ()); const int lda = static_cast<int> (Impl::getStride2DView (A)); const int ldb = static_cast<int> (Impl::getStride2DView (B)); const int ldc = static_cast<int> (Impl::getStride2DView (C)); Teuchos::BLAS<int,double> blas; blas.GEMM (transA, transB, m, n, k, alpha, A.ptr_on_device(), lda, B.ptr_on_device(), ldb, beta, C.ptr_on_device(), ldc); } }
/* Computes integrals of monomials over a given reference cell. */ void computeIntegral(Teuchos::Array<double>& testIntFixDeg, shards::CellTopology & cellTopology, int cubDegree) { DefaultCubatureFactory<double> cubFactory; // create factory Teuchos::RCP<Cubature<double> > myCub = cubFactory.create(cellTopology, cubDegree); // create default cubature int cubDim = myCub->getDimension(); int numCubPoints = myCub->getNumPoints(); int numPolys = (cubDegree+1)*(cubDegree+2)*(cubDegree+3)/6; FieldContainer<double> point(cubDim); FieldContainer<double> cubPoints(numCubPoints, cubDim); FieldContainer<double> cubWeights(numCubPoints); FieldContainer<double> functValues(numCubPoints, numPolys); myCub->getCubature(cubPoints, cubWeights); int polyCt = 0; for (int xDeg=0; xDeg <= cubDegree; xDeg++) { for (int yDeg=0; yDeg <= cubDegree-xDeg; yDeg++) { for (int zDeg=0; zDeg <= cubDegree-xDeg-yDeg; zDeg++) { for (int i=0; i<numCubPoints; i++) { for (int j=0; j<cubDim; j++) { point(j) = cubPoints(i,j); } functValues(i,polyCt) = computeMonomial(point, xDeg, yDeg, zDeg); } polyCt++; } } } Teuchos::BLAS<int, double> myblas; int inc = 1; double alpha = 1.0; double beta = 0.0; myblas.GEMV(Teuchos::NO_TRANS, numPolys, numCubPoints, alpha, &functValues(0,0), numPolys, &cubWeights(0), inc, beta, &testIntFixDeg[0], inc); }
/* Computes integrals of monomials over a given reference cell. */ void computeIntegral(Teuchos::Array<double>& testIntFixDeg, int cubDegree) { CubatureGenSparse<double,3> myCub(cubDegree); int cubDim = myCub.getDimension(); int numCubPoints = myCub.getNumPoints(); int numPolys = (cubDegree+1)*(cubDegree+2)*(cubDegree+3)/6; FieldContainer<double> point(cubDim); FieldContainer<double> cubPoints(numCubPoints, cubDim); FieldContainer<double> cubWeights(numCubPoints); FieldContainer<double> functValues(numCubPoints, numPolys); myCub.getCubature(cubPoints, cubWeights); int polyCt = 0; for (int xDeg=0; xDeg <= cubDegree; xDeg++) { for (int yDeg=0; yDeg <= cubDegree-xDeg; yDeg++) { for (int zDeg=0; zDeg <= cubDegree-xDeg-yDeg; zDeg++) { for (int i=0; i<numCubPoints; i++) { for (int j=0; j<cubDim; j++) { point(j) = cubPoints(i,j); } functValues(i,polyCt) = computeMonomial(point, xDeg, yDeg, zDeg); } polyCt++; } } } Teuchos::BLAS<int, double> myblas; int inc = 1; double alpha = 1.0; double beta = 0.0; myblas.GEMV(Teuchos::NO_TRANS, numPolys, numCubPoints, alpha, &functValues(0,0), numPolys, &cubWeights(0), inc, beta, &testIntFixDeg[0], inc); }
int main(int argc, char **argv) { const unsigned int n = 5; Sacado::Fad::Vector<unsigned int, FadType> A(n*n,0),B(n,n), C(n,n); for (unsigned int i=0; i<n; i++) { for (unsigned int j=0; j<n; j++) A[i+j*n] = FadType(Teuchos::ScalarTraits<double>::random()); B[i] = FadType(n, Teuchos::ScalarTraits<double>::random()); for (unsigned int j=0; j<n; j++) B[i].fastAccessDx(j) = Teuchos::ScalarTraits<double>::random(); C[i] = 0.0; } double *a = A.vals(); double *b = B.vals(); double *bdx = B.dx(); std::vector<double> c(n), cdx(n*n); Teuchos::BLAS<int,double> blas; blas.GEMV(Teuchos::NO_TRANS, n, n, 1.0, &a[0], n, &b[0], 1, 0.0, &c[0], 1); blas.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, n, n, n, 1.0, &a[0], n, &bdx[0], n, 0.0, &cdx[0], n); // Teuchos::BLAS<int,FadType> blas_fad; // blas_fad.GEMV(Teuchos::NO_TRANS, n, n, 1.0, &A[0], n, &B[0], 1, 0.0, &C[0], 1); Teuchos::BLAS<int,FadType> sacado_fad_blas(false); sacado_fad_blas.GEMV(Teuchos::NO_TRANS, n, n, 1.0, &A[0], n, &B[0], 1, 0.0, &C[0], 1); // Print the results int p = 4; int w = p+7; std::cout.setf(std::ios::scientific); std::cout.precision(p); std::cout << "BLAS GEMV calculation:" << std::endl; std::cout << "a = " << std::endl; for (unsigned int i=0; i<n; i++) { for (unsigned int j=0; j<n; j++) std::cout << " " << std::setw(w) << a[i+j*n]; std::cout << std::endl; } std::cout << "b = " << std::endl; for (unsigned int i=0; i<n; i++) { std::cout << " " << std::setw(w) << b[i]; } std::cout << std::endl; std::cout << "bdot = " << std::endl; for (unsigned int i=0; i<n; i++) { for (unsigned int j=0; j<n; j++) std::cout << " " << std::setw(w) << bdx[i+j*n]; std::cout << std::endl; } std::cout << "c = " << std::endl; for (unsigned int i=0; i<n; i++) { std::cout << " " << std::setw(w) << c[i]; } std::cout << std::endl; std::cout << "cdot = " << std::endl; for (unsigned int i=0; i<n; i++) { for (unsigned int j=0; j<n; j++) std::cout << " " << std::setw(w) << cdx[i+j*n]; std::cout << std::endl; } std::cout << std::endl << std::endl; std::cout << "FAD BLAS GEMV calculation:" << std::endl; std::cout << "A.val() (should = a) = " << std::endl; for (unsigned int i=0; i<n; i++) { for (unsigned int j=0; j<n; j++) std::cout << " " << std::setw(w) << A[i+j*n].val(); std::cout << std::endl; } std::cout << "B.val() (should = b) = " << std::endl; for (unsigned int i=0; i<n; i++) { std::cout << " " << std::setw(w) << B[i].val(); } std::cout << std::endl; std::cout << "B.dx() (should = bdot) = " << std::endl; double *Bdx = B.dx(); for (unsigned int i=0; i<n; i++) { for (unsigned int j=0; j<n; j++) std::cout << " " << std::setw(w) << Bdx[i+j*n]; std::cout << std::endl; } std::cout << "C.val() (should = c) = " << std::endl; for (unsigned int i=0; i<n; i++) { std::cout << " " << std::setw(w) << C[i].val(); } std::cout << std::endl; std::cout << "C.dx() (should = cdot) = " << std::endl; double *Cdx = C.dx(); for (unsigned int i=0; i<n; i++) { for (unsigned int j=0; j<n; j++) std::cout << " " << std::setw(w) << Cdx[i+j*n]; std::cout << std::endl; } double tol = 1.0e-14; bool failed = false; for (unsigned int i=0; i<n; i++) { if (std::fabs(C[i].val() - c[i]) > tol) failed = true; for (unsigned int j=0; j<n; j++) { if (std::fabs(C[i].dx(j) - cdx[i+j*n]) > tol) failed = true; } } if (!failed) { std::cout << "\nExample passed!" << std::endl; return 0; } else { std::cout <<"\nSomething is wrong, example failed!" << std::endl; return 1; } }
void randomGlobalMatrix (Generator* const pGenerator, MatrixViewType& A_local, const typename Teuchos::ScalarTraits< typename MatrixViewType::scalar_type >::magnitudeType singular_values[], MessengerBase< typename MatrixViewType::ordinal_type >* const ordinalMessenger, MessengerBase< typename MatrixViewType::scalar_type >* const scalarMessenger) { using Teuchos::NO_TRANS; using std::vector; typedef typename MatrixViewType::ordinal_type ordinal_type; typedef typename MatrixViewType::scalar_type scalar_type; const bool b_local_debug = false; const int rootProc = 0; const int nprocs = ordinalMessenger->size(); const int myRank = ordinalMessenger->rank(); Teuchos::BLAS<ordinal_type, scalar_type> blas; const ordinal_type nrowsLocal = A_local.nrows(); const ordinal_type ncols = A_local.ncols(); // Theory: Suppose there are P processors. Proc q wants an m_q by n // component of the matrix A, which we write as A_q. On Proc 0, we // generate random m_q by n orthogonal matrices Q_q (in explicit // form), and send Q_q to Proc q. The m by n matrix [Q_0; Q_1; ...; // Q_{P-1}] is not itself orthogonal. However, the m by n matrix // Q = [Q_0 / P; Q_1 / P; ...; Q_{P-1} / P] is orthogonal: // // \sum_{q = 0}^{P-1} (Q_q^T * Q_q) / P = I. if (myRank == rootProc) { typedef Random::MatrixGenerator< ordinal_type, scalar_type, Generator > matgen_type; matgen_type matGen (*pGenerator); // Generate a random ncols by ncols upper triangular matrix // R with the given singular values. Matrix< ordinal_type, scalar_type > R (ncols, ncols, scalar_type(0)); matGen.fill_random_R (ncols, R.get(), R.lda(), singular_values); // Broadcast R to all the processors. scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc); // Generate (for myself) a random nrowsLocal x ncols // orthogonal matrix, stored in explicit form. Matrix< ordinal_type, scalar_type > Q_local (nrowsLocal, ncols); matGen.explicit_Q (nrowsLocal, ncols, Q_local.get(), Q_local.lda()); // Scale the (local) orthogonal matrix by the number of // processors P, to make the columns of the global matrix Q // orthogonal. (Otherwise the norm of each column will be P // instead of 1.) const scalar_type P = static_cast< scalar_type > (nprocs); // Do overflow check. If casting P back to scalar_type // doesn't produce the same value as nprocs, the cast // overflowed. We take the real part, because scalar_type // might be complex. if (nprocs != static_cast<int> (Teuchos::ScalarTraits<scalar_type>::real (P))) throw std::runtime_error ("Casting nprocs to Scalar failed"); scaleMatrix (Q_local, P); // A_local := Q_local * R blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols, scalar_type(1), Q_local.get(), Q_local.lda(), R.get(), R.lda(), scalar_type(0), A_local.get(), A_local.lda()); for (int recvProc = 1; recvProc < nprocs; ++recvProc) { // Ask the receiving processor how big (i.e., how many rows) // its local component of the matrix is. ordinal_type nrowsRemote = 0; ordinalMessenger->recv (&nrowsRemote, 1, recvProc, 0); if (b_local_debug) { std::ostringstream os; os << "For Proc " << recvProc << ": local block is " << nrowsRemote << " by " << ncols << std::endl; std::cerr << os.str(); } // Make sure Q_local is big enough to hold the data for // the current receiver proc. Q_local.reshape (nrowsRemote, ncols); // Compute a random nrowsRemote * ncols orthogonal // matrix Q_local, for the current receiving processor. matGen.explicit_Q (nrowsRemote, ncols, Q_local.get(), Q_local.lda()); // Send Q_local to the current receiving processor. scalarMessenger->send (Q_local.get(), nrowsRemote*ncols, recvProc, 0); } } else { // Receive the R factor from Proc 0. There's only 1 R // factor for all the processes. Matrix< ordinal_type, scalar_type > R (ncols, ncols, scalar_type (0)); scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc); // Q_local (nrows_local by ncols, random orthogonal matrix) // will be received from Proc 0, where it was generated. const ordinal_type recvSize = nrowsLocal * ncols; Matrix< ordinal_type, scalar_type > Q_local (nrowsLocal, ncols); // Tell Proc 0 how many rows there are in the random orthogonal // matrix I want to receive from Proc 0. ordinalMessenger->send (&nrowsLocal, 1, rootProc, 0); // Receive the orthogonal matrix from Proc 0. scalarMessenger->recv (Q_local.get(), recvSize, rootProc, 0); // Scale the (local) orthogonal matrix by the number of // processors, to make the global matrix Q orthogonal. const scalar_type P = static_cast< scalar_type > (nprocs); // Do overflow check. If casting P back to scalar_type // doesn't produce the same value as nprocs, the cast // overflowed. We take the real part, because scalar_type // might be complex. if (nprocs != static_cast<int> (Teuchos::ScalarTraits<scalar_type>::real (P))) throw std::runtime_error ("Casting nprocs to Scalar failed"); scaleMatrix (Q_local, P); // A_local := Q_local * R blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols, scalar_type(1), Q_local.get(), Q_local.lda(), R.get(), R.lda(), scalar_type(0), A_local.get(), A_local.lda()); } }
void Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Apply(const Matrix& P, Matrix& Projected) const { const size_t NSDim = X_->getNumVectors(); const size_t numRows = P.getNodeNumRows(); Projected.resumeFill(); Teuchos::SerialDenseVector<LO,SC> BcRow(NSDim, false); for (size_t i = 0; i < numRows; i++) { Teuchos::ArrayView<const LO> indices, pindices; Teuchos::ArrayView<const SC> values, pvalues; P .getLocalRowView(i, indices, values); Projected.getLocalRowView(i, pindices, pvalues); size_t nnz = pindices.size(); // number of nonzeros in the constrained matrix size_t nnz1 = indices.size(); // number of nonzeros in the supplied matrix Teuchos::Array<SC> newValues(nnz, Teuchos::ScalarTraits<SC>::zero()); // step 1: fix stencil // Projected already has the correct stencil // step 2: copy correct stencil values for (size_t j = 0; j < nnz1; j++) { // this might be accelerated if we know smth about ordering size_t k = 0; for (; k < nnz; k++) if (pindices[k] == indices[j]) break; if (k != nnz) { // index indices[j] is part of template, copy corresponding value newValues[k] = values[j]; } } // step 3: project to the space Teuchos::SerialDenseMatrix<LO,SC> locX(NSDim, nnz, false); for (size_t j = 0; j < nnz; j++) { for (size_t k = 0; k < NSDim; k++) BcRow[k] = X_->getData(k)[pindices[j]]; Teuchos::setCol(BcRow, (LO)j, locX); } Teuchos::SerialDenseVector<LO,SC> val(nnz, false), val1(NSDim, false), val2(NSDim, false); for (size_t j = 0; j < nnz; j++) val[j] = newValues[j]; Teuchos::BLAS<LO,SC> blas; blas.GEMV(Teuchos::NO_TRANS, NSDim, nnz, Teuchos::ScalarTraits<SC>::one(), locX.values(), locX.stride(), val.values(), (LO)1, Teuchos::ScalarTraits<SC>::zero(), val1.values(), (LO)1); blas.GEMV(Teuchos::NO_TRANS, NSDim, NSDim, Teuchos::ScalarTraits<SC>::one(), XXtInv_[i].values(), XXtInv_[i].stride(), val1.values(), (LO)1, Teuchos::ScalarTraits<SC>::zero(), val2.values(), (LO)1); blas.GEMV(Teuchos::CONJ_TRANS, NSDim, nnz, Teuchos::ScalarTraits<SC>::one(), locX.values(), locX.stride(), val2.values(), (LO)1, Teuchos::ScalarTraits<SC>::zero(), val.values(), (LO)1); for (size_t j = 0; j < nnz; j++) newValues[j] -= val[j]; Projected.replaceLocalValues(i, pindices, newValues); } Projected.fillComplete(Projected.getDomainMap(), Projected.getRangeMap()); //FIXME: maps needed? }
int main(int argc, char *argv[]) { Kokkos::initialize(); //Check number of arguments if (argc < 4) { std::cout <<"\n>>> ERROR: Invalid number of arguments.\n\n"; std::cout <<"Usage:\n\n"; std::cout <<" ./Intrepid_example_Drivers_Example_06.exe deg NX NY verbose\n\n"; std::cout <<" where \n"; std::cout <<" int deg - polynomial degree to be used (assumed > 1) \n"; std::cout <<" int NX - num intervals in x direction (assumed box domain, 0,1) \n"; std::cout <<" int NY - num intervals in y direction (assumed box domain, 0,1) \n"; std::cout <<" verbose (optional) - any character, indicates verbose output \n\n"; exit(1); } // This little trick lets us print to std::cout only if // a (dummy) command-line argument is provided. int iprint = argc - 1; Teuchos::RCP<std::ostream> outStream; Teuchos::oblackholestream bhs; // outputs nothing if (iprint > 2) outStream = Teuchos::rcp(&std::cout, false); else outStream = Teuchos::rcp(&bhs, false); // Save the format state of the original std::cout. Teuchos::oblackholestream oldFormatState; oldFormatState.copyfmt(std::cout); *outStream \ << "===============================================================================\n" \ << "| |\n" \ << "| Example: Apply Stiffness Matrix for |\n" \ << "| Poisson Equation on Quadrilateral Mesh |\n" \ << "| |\n" \ << "| Questions? Contact Pavel Bochev ([email protected]), |\n" \ << "| Denis Ridzal ([email protected]), |\n" \ << "| Kara Peterson ([email protected]). |\n" \ << "| |\n" \ << "| Intrepid's website: http://trilinos.sandia.gov/packages/intrepid |\n" \ << "| Trilinos website: http://trilinos.sandia.gov |\n" \ << "| |\n" \ << "===============================================================================\n"; // ************************************ GET INPUTS ************************************** int deg = atoi(argv[1]); // polynomial degree to use int NX = atoi(argv[2]); // num intervals in x direction (assumed box domain, 0,1) int NY = atoi(argv[3]); // num intervals in y direction (assumed box domain, 0,1) // *********************************** CELL TOPOLOGY ********************************** // Get cell topology for base hexahedron typedef shards::CellTopology CellTopology; CellTopology quad_4(shards::getCellTopologyData<shards::Quadrilateral<4> >() ); // Get dimensions int numNodesPerElem = quad_4.getNodeCount(); int spaceDim = quad_4.getDimension(); // *********************************** GENERATE MESH ************************************ *outStream << "Generating mesh ... \n\n"; *outStream << " NX" << " NY\n"; *outStream << std::setw(5) << NX << std::setw(5) << NY << "\n\n"; // Print mesh information int numElems = NX*NY; int numNodes = (NX+1)*(NY+1); *outStream << " Number of Elements: " << numElems << " \n"; *outStream << " Number of Nodes: " << numNodes << " \n\n"; // Square double leftX = 0.0, rightX = 1.0; double leftY = 0.0, rightY = 1.0; // Mesh spacing double hx = (rightX-leftX)/((double)NX); double hy = (rightY-leftY)/((double)NY); // Get nodal coordinates FieldContainer<double> nodeCoord(numNodes, spaceDim); FieldContainer<int> nodeOnBoundary(numNodes); int inode = 0; for (int j=0; j<NY+1; j++) { for (int i=0; i<NX+1; i++) { nodeCoord(inode,0) = leftX + (double)i*hx; nodeCoord(inode,1) = leftY + (double)j*hy; if (j==0 || i==0 || j==NY || i==NX){ nodeOnBoundary(inode)=1; } else { nodeOnBoundary(inode)=0; } inode++; } } #define DUMP_DATA #ifdef DUMP_DATA // Print nodal coords ofstream fcoordout("coords.dat"); for (int i=0; i<numNodes; i++) { fcoordout << nodeCoord(i,0) <<" "; fcoordout << nodeCoord(i,1) <<"\n"; } fcoordout.close(); #endif // Element to Node map // We'll keep it around, but this is only the DOFMap if you are in the lowest order case. FieldContainer<int> elemToNode(numElems, numNodesPerElem); int ielem = 0; for (int j=0; j<NY; j++) { for (int i=0; i<NX; i++) { elemToNode(ielem,0) = (NX + 1)*j + i; elemToNode(ielem,1) = (NX + 1)*j + i + 1; elemToNode(ielem,2) = (NX + 1)*(j + 1) + i + 1; elemToNode(ielem,3) = (NX + 1)*(j + 1) + i; ielem++; } } #ifdef DUMP_DATA // Output connectivity ofstream fe2nout("elem2node.dat"); for (int j=0; j<NY; j++) { for (int i=0; i<NX; i++) { int ielem = i + j * NX; for (int m=0; m<numNodesPerElem; m++){ fe2nout << elemToNode(ielem,m) <<" "; } fe2nout <<"\n"; } } fe2nout.close(); #endif // ************************************ CUBATURE ************************************** *outStream << "Getting cubature ... \n\n"; // Get numerical integration points and weights DefaultCubatureFactory<double> cubFactory; int cubDegree = 2*deg; Teuchos::RCP<Cubature<double> > quadCub = cubFactory.create(quad_4, cubDegree); int cubDim = quadCub->getDimension(); int numCubPoints = quadCub->getNumPoints(); FieldContainer<double> cubPoints(numCubPoints, cubDim); FieldContainer<double> cubWeights(numCubPoints); quadCub->getCubature(cubPoints, cubWeights); // ************************************** BASIS *************************************** *outStream << "Getting basis ... \n\n"; // Define basis Basis_HGRAD_QUAD_Cn_FEM<double, FieldContainer<double> > quadHGradBasis(deg,POINTTYPE_SPECTRAL); int numFieldsG = quadHGradBasis.getCardinality(); FieldContainer<double> quadGVals(numFieldsG, numCubPoints); FieldContainer<double> quadGrads(numFieldsG, numCubPoints, spaceDim); // Evaluate basis values and gradients at cubature points quadHGradBasis.getValues(quadGVals, cubPoints, OPERATOR_VALUE); quadHGradBasis.getValues(quadGrads, cubPoints, OPERATOR_GRAD); // create the local-global mapping for higher order elements FieldContainer<int> ltgMapping(numElems,numFieldsG); const int numDOF = (NX*deg+1)*(NY*deg+1); ielem=0; for (int j=0;j<NY;j++) { for (int i=0;i<NX;i++) { const int start = deg * j * ( NX * deg + 1 ) + i * deg; // loop over local dof on this cell int local_dof_cur=0; for (int vertical=0;vertical<=deg;vertical++) { for (int horizontal=0;horizontal<=deg;horizontal++) { ltgMapping(ielem,local_dof_cur) = start + vertical*(NX*deg+1)+horizontal; local_dof_cur++; } } ielem++; } } #ifdef DUMP_DATA // Output ltg mapping // ofstream ltgout("ltg.dat"); // for (int j=0; j<NY; j++) { // for (int i=0; i<NX; i++) { // int ielem = i + j * NX; // for (int m=0; m<numFieldsG; m++){ // ltgout << ltgMapping(ielem,m) <<" "; // } // ltgout <<"\n"; // } // } // ltgout.close(); #endif // ******** CREATE A SINGLE STIFFNESS MATRIX, WHICH IS REPLICATED ON ALL ELEMENTS ********* *outStream << "Applying stiffness matrix and right hand side ... \n\n"; // Settings and data structures for mass and stiffness matrices typedef CellTools<double> CellTools; typedef FunctionSpaceTools fst; int numCells = 1; // Container for nodes FieldContainer<double> refQuadNodes(numCells, numNodesPerElem, spaceDim); // Containers for Jacobian FieldContainer<double> refQuadJacobian(numCells, numCubPoints, spaceDim, spaceDim); FieldContainer<double> refQuadJacobInv(numCells, numCubPoints, spaceDim, spaceDim); FieldContainer<double> refQuadJacobDet(numCells, numCubPoints); // Containers for element HGRAD stiffness matrix FieldContainer<double> localStiffMatrix(numCells, numFieldsG, numFieldsG); FieldContainer<double> weightedMeasure(numCells, numCubPoints); FieldContainer<double> quadGradsTransformed(numCells, numFieldsG, numCubPoints, spaceDim); FieldContainer<double> quadGradsTransformedWeighted(numCells, numFieldsG, numCubPoints, spaceDim); // Containers for right hand side vectors FieldContainer<double> rhsData(numCells, numCubPoints); FieldContainer<double> localRHS(numCells, numFieldsG); FieldContainer<double> quadGValsTransformed(numCells, numFieldsG, numCubPoints); FieldContainer<double> quadGValsTransformedWeighted(numCells, numFieldsG, numCubPoints); // Container for cubature points in physical space FieldContainer<double> physCubPoints(numCells, numCubPoints, cubDim); // Global arrays in Epetra format Epetra_SerialComm Comm; Epetra_Map globalMapG(numDOF, 0, Comm); Epetra_FEVector u(globalMapG); Epetra_FEVector Ku(globalMapG); u.Random(); std::cout << "About to start ref element matrix\n"; // ************************** Compute element HGrad stiffness matrices ******************************* refQuadNodes(0,0,0) = 0.0; refQuadNodes(0,0,1) = 0.0; refQuadNodes(0,1,0) = hx; refQuadNodes(0,1,1) = 0.0; refQuadNodes(0,2,0) = hx; refQuadNodes(0,2,1) = hy; refQuadNodes(0,3,0) = 0.0; refQuadNodes(0,3,1) = hy; // Compute cell Jacobians, their inverses and their determinants CellTools::setJacobian(refQuadJacobian, cubPoints, refQuadNodes, quad_4); CellTools::setJacobianInv(refQuadJacobInv, refQuadJacobian ); CellTools::setJacobianDet(refQuadJacobDet, refQuadJacobian ); // transform from [-1,1]^2 to [0,hx]x[0,hy] fst::HGRADtransformGRAD<double>(quadGradsTransformed, refQuadJacobInv, quadGrads); // compute weighted measure fst::computeCellMeasure<double>(weightedMeasure, refQuadJacobDet, cubWeights); // multiply values with weighted measure fst::multiplyMeasure<double>(quadGradsTransformedWeighted, weightedMeasure, quadGradsTransformed); // integrate to compute element stiffness matrix fst::integrate<double>(localStiffMatrix, quadGradsTransformed, quadGradsTransformedWeighted, COMP_BLAS); std::cout << "Finished with reference element matrix\n"; // now we will scatter global degrees of freedom, apply the local stiffness matrix // with BLAS, and then gather the results FieldContainer<double> uScattered(numElems,numFieldsG); FieldContainer<double> KuScattered(numElems,numFieldsG); // to extract info from u u.GlobalAssemble(); Epetra_Time multTimer(Comm); Ku.PutScalar(0.0); Ku.GlobalAssemble(); double *uVals = u[0]; double *KuVals = Ku[0]; Teuchos::BLAS<int,double> blas; Epetra_Time scatterTime(Comm); std::cout << "Scattering\n"; // Scatter for (int k=0; k<numElems; k++) { for (int i=0;i<numFieldsG;i++) { uScattered(k,i) = uVals[ltgMapping(k,i)]; } } const double scatTime = scatterTime.ElapsedTime(); std::cout << "Scattered in time " << scatTime << "\n"; Epetra_Time blasTimer(Comm); blas.GEMM(Teuchos::NO_TRANS , Teuchos::NO_TRANS , numFieldsG , numElems, numFieldsG , 1.0 , &localStiffMatrix(0,0,0) , numFieldsG , &uScattered(0,0) , numFieldsG , 0.0 , &KuScattered(0,0) , numFieldsG ); const double blasTime = blasTimer.ElapsedTime(); std::cout << "Element matrices applied in " << blasTime << "\n"; Epetra_Time gatherTimer(Comm); // Gather for (int k=0;k<numElems;k++) { for (int i=0;i<numFieldsG;i++) { KuVals[ltgMapping(k,i)] += KuScattered(k,i); } } const double gatherTime = gatherTimer.ElapsedTime(); std::cout << "Gathered in " << gatherTime << "\n"; const double applyTime = gatherTime + blasTime + scatTime; std::cout << "Time to do matrix-free product: " << applyTime << std::endl; std::cout << "End Result: TEST PASSED\n"; // reset format state of std::cout std::cout.copyfmt(oldFormatState); Kokkos::finalize(); return 0; }
KOKKOS_INLINE_FUNCTION int exampleDenseCholByBlocks(const OrdinalType mmin, const OrdinalType mmax, const OrdinalType minc, const OrdinalType mb, const int max_concurrency, const int max_task_dependence, const int team_size, const bool check, const bool verbose) { typedef ValueType value_type; typedef OrdinalType ordinal_type; typedef SizeType size_type; typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>, Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType; typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType; typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType; typedef DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> DenseHierMatrixBaseType; typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType; typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; cout << "DenseCholByBlocks:: test matrices " <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << endl; const size_t max_task_size = (3*sizeof(DenseTaskViewType)+196); // when 128 error //cout << "max task size = "<< max_task_size << endl; typename TaskFactoryType::policy_type policy(max_concurrency, max_task_size, max_task_dependence, team_size); TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); for (ordinal_type m=mmin;m<=mmax;m+=minc) { DenseMatrixBaseType TT("TT", m, m), AA("AA", m, m), AB("AB", m, m); // random T matrix for (ordinal_type j=0;j<AA.NumCols();++j) { for (ordinal_type i=0;i<AA.NumRows();++i) TT.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; TT.Value(j,j) = abs(TT.Value(j,j)); } { Teuchos::BLAS<ordinal_type,value_type> blas; // should be square const ordinal_type nn = AA.NumRows(); const ordinal_type kk = TT.NumRows(); blas.HERK(Teuchos::UPPER_TRI, Teuchos::CONJ_TRANS, nn, kk, 1.0, TT.ValuePtr(), TT.ColStride(), 0.0, AA.ValuePtr(), AA.ColStride()); AB.copy(AA); } const double flop = get_flop_chol<value_type>(m); #ifdef HAVE_SHYLUTACHO_MKL mkl_set_num_threads(1); #endif int ierr = 0; if (check) { timer.reset(); DenseTaskViewType A(&AB); ierr = Chol<Uplo::Upper,AlgoChol::ExternalLapack>::invoke (TaskFactoryType::Policy(), TaskFactoryType::Policy().member_single(), A); t = timer.seconds(); if (ierr) ERROR(">> Fail to perform Cholesky (serial) : no reference solution -> no numeric error information"); cout << "DenseCholByBlocks:: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]" << endl; } { DenseHierMatrixBaseType HA; DenseMatrixHelper::flat2hier(AA, HA, mb, mb); DenseHierTaskViewType TA(&HA); timer.reset(); auto future = TaskFactoryType::Policy().create_team (Chol<Uplo::Upper,AlgoChol::DenseByBlocks> ::TaskFunctor<DenseHierTaskViewType>(TA), 0); TaskFactoryType::Policy().spawn(future); Kokkos::Experimental::wait(TaskFactoryType::Policy()); t = timer.seconds(); cout << "DenseCholByBlocks:: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]" << endl; } if (!ierr && check) { typedef typename Teuchos::ScalarTraits<value_type>::magnitudeType real_type; real_type err = 0.0, norm = 0.0; for (ordinal_type j=0;j<AA.NumCols();++j) for (ordinal_type i=0;i<=j;++i) { const real_type diff = abs(AA.Value(i,j) - AB.Value(i,j)); const real_type val = AB.Value(i,j); err += diff*diff; norm += val*val; } cout << "DenseCholByBlocks:: Check result::err = " << sqrt(err) << ", norm = " << sqrt(norm) << endl; } } return r_val; }
int exampleDenseCholByBlocks(const ordinal_type mmin, const ordinal_type mmax, const ordinal_type minc, const ordinal_type mb, const int max_concurrency, const int max_task_dependence, const int team_size, const int mkl_nthreads, const bool check, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType; typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType; typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType; typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType; typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType; typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType; typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; std::cout << "DenseCholByBlocks:: test matrices " <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , mb = " << mb << std::endl; const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); std::ostringstream os; os.precision(3); os << std::scientific; for (ordinal_type m=mmin;m<=mmax;m+=minc) { os.str(""); // host matrices DenseMatrixBaseHostType AA_host("AA_host", m, m), AB_host("AB_host"), TT_host("TT_host"); // random T matrix { TT_host.createConfTo(AA_host); for (ordinal_type j=0;j<TT_host.NumCols();++j) { for (ordinal_type i=0;i<TT_host.NumRows();++i) TT_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; TT_host.Value(j,j) = std::fabs(TT_host.Value(j,j)); } } // create SPD matrix { Teuchos::BLAS<ordinal_type,value_type> blas; blas.HERK(ArgUplo == Uplo::Upper ? Teuchos::UPPER_TRI : Teuchos::LOWER_TRI, Teuchos::CONJ_TRANS, m, m, 1.0, TT_host.ValuePtr(), TT_host.ColStride(), 0.0, AA_host.ValuePtr(), AA_host.ColStride()); // preserve a copy of A AB_host.createConfTo(AA_host); DenseMatrixTools::copy(AB_host, AA_host); } const double flop = DenseFlopCount<value_type>::Chol(m); #ifdef HAVE_SHYLUTACHO_MKL mkl_set_num_threads(mkl_nthreads); #endif os << "DenseCholByBlocks:: m = " << m << " "; int ierr = 0; if (check) { timer.reset(); DenseMatrixViewHostType A_host(AB_host); ierr = Chol<ArgUplo,AlgoChol::ExternalLapack,Variant::One>::invoke (policy, policy.member_single(), A_host); t = timer.seconds(); TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)"); os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } DenseMatrixBaseDeviceType AA_device("AA_device"); { timer.reset(); AA_device.mirror(AA_host); t = timer.seconds(); os << ":: Mirror = " << t << " [sec] "; } { DenseHierMatrixBaseDeviceType HA_device("HA_device"); DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb); DenseHierTaskViewDeviceType TA_device(HA_device); timer.reset(); auto future = policy.proc_create_team (Chol<ArgUplo,AlgoChol::DenseByBlocks,ArgVariant> ::createTaskFunctor(policy, TA_device), 0); policy.spawn(future); Kokkos::Experimental::wait(policy); t = timer.seconds(); os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } AA_host.mirror(AA_device); if (!ierr && check) { double err = 0.0, norm = 0.0; for (ordinal_type j=0;j<AA_host.NumCols();++j) for (ordinal_type i=0;i<=j;++i) { const double diff = abs(AA_host.Value(i,j) - AB_host.Value(i,j)); const double val = AB_host.Value(i,j); err += diff*diff; norm += val*val; } os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err); } std::cout << os.str() << std::endl; } return r_val; }
void Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Setup(const MultiVector& B, const MultiVector& Bc, RCP<const CrsGraph> Ppattern) { const size_t NSDim = Bc.getNumVectors(); Ppattern_ = Ppattern; size_t numRows = Ppattern_->getNodeNumRows(); XXtInv_.resize(numRows); RCP<const Import> importer = Ppattern_->getImporter(); X_ = MultiVectorFactory::Build(Ppattern_->getColMap(), NSDim); if (!importer.is_null()) X_->doImport(Bc, *importer, Xpetra::INSERT); else *X_ = Bc; std::vector<const SC*> Xval(NSDim); for (size_t j = 0; j < NSDim; j++) Xval[j] = X_->getData(j).get(); SC zero = Teuchos::ScalarTraits<SC>::zero(); SC one = Teuchos::ScalarTraits<SC>::one(); Teuchos::BLAS <LO,SC> blas; Teuchos::LAPACK<LO,SC> lapack; LO lwork = 3*NSDim; ArrayRCP<LO> IPIV(NSDim); ArrayRCP<SC> WORK(lwork); for (size_t i = 0; i < numRows; i++) { Teuchos::ArrayView<const LO> indices; Ppattern_->getLocalRowView(i, indices); size_t nnz = indices.size(); XXtInv_[i] = Teuchos::SerialDenseMatrix<LO,SC>(NSDim, NSDim, false/*zeroOut*/); Teuchos::SerialDenseMatrix<LO,SC>& XXtInv = XXtInv_[i]; if (NSDim == 1) { SC d = zero; for (size_t j = 0; j < nnz; j++) d += Xval[0][indices[j]] * Xval[0][indices[j]]; XXtInv(0,0) = one/d; } else { Teuchos::SerialDenseMatrix<LO,SC> locX(NSDim, nnz, false/*zeroOut*/); for (size_t j = 0; j < nnz; j++) for (size_t k = 0; k < NSDim; k++) locX(k,j) = Xval[k][indices[j]]; // XXtInv_ = (locX*locX^T)^{-1} blas.GEMM(Teuchos::NO_TRANS, Teuchos::CONJ_TRANS, NSDim, NSDim, nnz, one, locX.values(), locX.stride(), locX.values(), locX.stride(), zero, XXtInv.values(), XXtInv.stride()); LO info; // Compute LU factorization using partial pivoting with row exchanges lapack.GETRF(NSDim, NSDim, XXtInv.values(), XXtInv.stride(), IPIV.get(), &info); // Use the computed factorization to compute the inverse lapack.GETRI(NSDim, XXtInv.values(), XXtInv.stride(), IPIV.get(), WORK.get(), lwork, &info); } } }
void Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Apply(const Matrix& P, Matrix& Projected) const { // We check only row maps. Column may be different. TEUCHOS_TEST_FOR_EXCEPTION(!P.getRowMap()->isSameAs(*Projected.getRowMap()), Exceptions::Incompatible, "Row maps are incompatible"); const size_t NSDim = X_->getNumVectors(); const size_t numRows = P.getNodeNumRows(); const Map& colMap = *P.getColMap(); const Map& PColMap = *Projected.getColMap(); Projected.resumeFill(); Teuchos::ArrayView<const LO> indices, pindices; Teuchos::ArrayView<const SC> values, pvalues; Teuchos::Array<SC> valuesAll(colMap.getNodeNumElements()), newValues; LO invalid = Teuchos::OrdinalTraits<LO>::invalid(); LO oneLO = Teuchos::OrdinalTraits<LO>::one(); SC zero = Teuchos::ScalarTraits<SC> ::zero(); SC one = Teuchos::ScalarTraits<SC> ::one(); std::vector<const SC*> Xval(NSDim); for (size_t j = 0; j < NSDim; j++) Xval[j] = X_->getData(j).get(); for (size_t i = 0; i < numRows; i++) { P .getLocalRowView(i, indices, values); Projected.getLocalRowView(i, pindices, pvalues); size_t nnz = indices.size(); // number of nonzeros in the supplied matrix size_t pnnz = pindices.size(); // number of nonzeros in the constrained matrix newValues.resize(pnnz); // Step 1: fix stencil // Projected *must* already have the correct stencil // Step 2: copy correct stencil values // The algorithm is very similar to the one used in the calculation of // Frobenius dot product, see src/Transfers/Energy-Minimization/Solvers/MueLu_CGSolver_def.hpp // NOTE: using extra array allows us to skip the search among indices for (size_t j = 0; j < nnz; j++) valuesAll[indices[j]] = values[j]; for (size_t j = 0; j < pnnz; j++) { LO ind = colMap.getLocalElement(PColMap.getGlobalElement(pindices[j])); // FIXME: we could do that before the full loop just once if (ind != invalid) // index indices[j] is part of template, copy corresponding value newValues[j] = valuesAll[ind]; else newValues[j] = zero; } for (size_t j = 0; j < nnz; j++) valuesAll[indices[j]] = zero; // Step 3: project to the space Teuchos::SerialDenseMatrix<LO,SC>& XXtInv = XXtInv_[i]; Teuchos::SerialDenseMatrix<LO,SC> locX(NSDim, pnnz, false); for (size_t j = 0; j < pnnz; j++) for (size_t k = 0; k < NSDim; k++) locX(k,j) = Xval[k][pindices[j]]; Teuchos::SerialDenseVector<LO,SC> val(pnnz, false), val1(NSDim, false), val2(NSDim, false); for (size_t j = 0; j < pnnz; j++) val[j] = newValues[j]; Teuchos::BLAS<LO,SC> blas; // val1 = locX * val; blas.GEMV(Teuchos::NO_TRANS, NSDim, pnnz, one, locX.values(), locX.stride(), val.values(), oneLO, zero, val1.values(), oneLO); // val2 = XXtInv * val1 blas.GEMV(Teuchos::NO_TRANS, NSDim, NSDim, one, XXtInv.values(), XXtInv.stride(), val1.values(), oneLO, zero, val2.values(), oneLO); // val = X^T * val2 blas.GEMV(Teuchos::CONJ_TRANS, NSDim, pnnz, one, locX.values(), locX.stride(), val2.values(), oneLO, zero, val.values(), oneLO); for (size_t j = 0; j < pnnz; j++) newValues[j] -= val[j]; Projected.replaceLocalValues(i, pindices, newValues); } Projected.fillComplete(Projected.getDomainMap(), Projected.getRangeMap()); //FIXME: maps needed? }
KOKKOS_INLINE_FUNCTION int exampleDenseTrsmMKL(const OrdinalType mmin, const OrdinalType mmax, const OrdinalType minc, const OrdinalType k, const bool verbose) { typedef ValueType value_type; typedef OrdinalType ordinal_type; typedef SizeType size_type; typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; cout << "DenseGemmMKL:: test matrices " <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << endl; ostringstream os; os.precision(3); os << scientific; for (ordinal_type m=mmin;m<=mmax;m+=minc) { os.str(""); DenseMatrixBaseType AA("AA", m, m), BB("BB", m, k), BC("BC", m, k); // setup upper triangular for (ordinal_type j=0;j<AA.NumCols();++j) { AA.Value(j,j) = 10.0; for (ordinal_type i=0;i<j;++i) AA.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; } // setup one and right hand side is going to be overwritten by the product of AB for (ordinal_type j=0;j<BB.NumCols();++j) for (ordinal_type i=0;i<BB.NumRows();++i) BB.Value(i,j) = 1.0; Teuchos::BLAS<ordinal_type,value_type> blas; blas.GEMM(Teuchos::CONJ_TRANS, Teuchos::NO_TRANS, m, k, m, 1.0, AA.ValuePtr(), AA.ColStride(), BB.ValuePtr(), BB.ColStride(), 0.0, BC.ValuePtr(), BC.ColStride()); BB.copy(BC); const double flop = get_flop_trsm_upper<value_type>(m, k); os << "DenseTrsmMKL:: m = " << m << " k = " << k; { timer.reset(); Teuchos::BLAS<ordinal_type,value_type> blas; const ordinal_type mm = AA.NumRows(); const ordinal_type nn = BB.NumCols(); blas.TRSM(Teuchos::LEFT_SIDE, Teuchos::UPPER_TRI, Teuchos::CONJ_TRANS, Teuchos::NON_UNIT_DIAG, mm, nn, 1.0, AA.ValuePtr(), AA.ColStride(), BB.ValuePtr(), BB.ColStride()); t = timer.seconds(); os << ":: MKL Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } cout << os.str() << endl; } return r_val; }