double do_time_teuchos_double_gemv(unsigned int m, unsigned int n, unsigned int nloop) { Sacado::Random<double> urand(0.0, 1.0); Teuchos::BLAS<int,double> blas; std::vector<double> A(m*n), B(n), C(m); for (unsigned int j=0; j<n; j++) { for (unsigned int i=0; i<m; i++) A[i+j*m] = urand.number(); B[j] = urand.number(); } for (unsigned int i=0; i<m; i++) C[i] = urand.number(); double alpha = urand.number(); double beta = urand.number(); Teuchos::Time timer("Teuchos Double GEMV", false); timer.start(true); for (unsigned int j=0; j<nloop; j++) { blas.GEMV(Teuchos::NO_TRANS, m, n, alpha, &A[0], m, &B[0], 1, beta, &C[0], 1); } timer.stop(); return timer.totalElapsedTime() / nloop; }
static void GEMM (const Teuchos::ETransp transA, const Teuchos::ETransp transB, const Scalar& alpha, const View<const Scalar**, LayoutLeft, DeviceType>& A, const View<const Scalar**, LayoutLeft, DeviceType>& B, const Scalar& beta, const View<Scalar**, LayoutLeft, DeviceType>& C) { const int n = static_cast<int> (C.dimension_1 ()); const int lda = static_cast<int> (Impl::getStride2DView (A)); Teuchos::BLAS<int,Scalar> blas; // For some BLAS implementations (e.g., MKL), GEMM when B has // one column may be signficantly less efficient than GEMV. if (n == 1 && transB == Teuchos::NO_TRANS) { blas.GEMV (transA, A.dimension_0 (), A.dimension_1 (), alpha, A.ptr_on_device (), lda, B.ptr_on_device (), static_cast<int> (1), beta, C.ptr_on_device (), static_cast<int> (1)); } else { const int m = static_cast<int> (C.dimension_0 ()); const int k = static_cast<int> (transA == Teuchos::NO_TRANS ? A.dimension_1 () : A.dimension_0 ()); const int ldb = static_cast<int> (Impl::getStride2DView (B)); const int ldc = static_cast<int> (Impl::getStride2DView (C)); blas.GEMM (transA, transB, m, n, k, alpha, A.ptr_on_device(), lda, B.ptr_on_device(), ldb, beta, C.ptr_on_device(), ldc); } }
void Stokhos::StieltjesPCEBasis<ordinal_type, value_type>:: transformCoeffsFromStieltjes(const value_type *in, value_type *out) const { Teuchos::BLAS<ordinal_type, value_type> blas; blas.GEMV(Teuchos::TRANS, fromStieltjesMat.numRows(), fromStieltjesMat.numCols(), 1.0, fromStieltjesMat.values(), fromStieltjesMat.numRows(), in, 1, 0.0, out, 1); }
void Stokhos::LanczosPCEBasis<ordinal_type, value_type>:: transformCoeffsFromLanczos(const value_type *in, value_type *out) const { Teuchos::BLAS<ordinal_type, value_type> blas; ordinal_type sz = fromStieltjesMat.numRows(); blas.GEMV(Teuchos::NO_TRANS, sz, this->p+1, value_type(1.0), fromStieltjesMat.values(), sz, in, ordinal_type(1), value_type(0.0), out, ordinal_type(1)); }
void Stokhos::MonoProjPCEBasis<ordinal_type, value_type>:: transformCoeffs(const value_type *in, value_type *out) const { // Transform coefficients to normalized basis Teuchos::BLAS<ordinal_type, value_type> blas; blas.GEMV(Teuchos::NO_TRANS, pce_sz, this->p+1, value_type(1.0), basis_vecs.values(), pce_sz, in, ordinal_type(1), value_type(0.0), out, ordinal_type(1)); // Transform from normalized to original for (ordinal_type i=0; i<pce_sz; i++) out[i] /= pce_norms[i]; }
double do_time_teuchos_fad_gemv(unsigned int m, unsigned int n, unsigned int ndot, unsigned int nloop) { Sacado::Random<double> urand(0.0, 1.0); Teuchos::BLAS<int,FadType> blas; std::vector<FadType> A(m*n), B(n), C(m); for (unsigned int j=0; j<n; j++) { for (unsigned int i=0; i<m; i++) { //A[i+j*m] = urand.number(); A[i+j*m] = FadType(ndot, urand.number()); for (unsigned int k=0; k<ndot; k++) A[i+j*m].fastAccessDx(k) = urand.number(); } B[j] = FadType(ndot, urand.number()); for (unsigned int k=0; k<ndot; k++) B[j].fastAccessDx(k) = urand.number(); } for (unsigned int i=0; i<m; i++) { C[i] = FadType(ndot, urand.number()); for (unsigned int k=0; k<ndot; k++) C[i].fastAccessDx(k) = urand.number(); } FadType alpha(ndot, urand.number()); FadType beta(ndot, urand.number()); for (unsigned int k=0; k<ndot; k++) { alpha.fastAccessDx(k) = urand.number(); beta.fastAccessDx(k) = urand.number(); } Teuchos::Time timer("Teuchos Fad GEMV", false); timer.start(true); for (unsigned int j=0; j<nloop; j++) { blas.GEMV(Teuchos::NO_TRANS, m, n, alpha, &A[0], m, &B[0], 1, beta, &C[0], 1); } timer.stop(); return timer.totalElapsedTime() / nloop; }
/* Computes integrals of monomials over a given reference cell. */ void computeIntegral(Teuchos::Array<double>& testIntFixDeg, shards::CellTopology & cellTopology, int cubDegree) { DefaultCubatureFactory<double> cubFactory; // create factory Teuchos::RCP<Cubature<double> > myCub = cubFactory.create(cellTopology, cubDegree); // create default cubature int cubDim = myCub->getDimension(); int numCubPoints = myCub->getNumPoints(); int numPolys = (cubDegree+1)*(cubDegree+2)*(cubDegree+3)/6; FieldContainer<double> point(cubDim); FieldContainer<double> cubPoints(numCubPoints, cubDim); FieldContainer<double> cubWeights(numCubPoints); FieldContainer<double> functValues(numCubPoints, numPolys); myCub->getCubature(cubPoints, cubWeights); int polyCt = 0; for (int xDeg=0; xDeg <= cubDegree; xDeg++) { for (int yDeg=0; yDeg <= cubDegree-xDeg; yDeg++) { for (int zDeg=0; zDeg <= cubDegree-xDeg-yDeg; zDeg++) { for (int i=0; i<numCubPoints; i++) { for (int j=0; j<cubDim; j++) { point(j) = cubPoints(i,j); } functValues(i,polyCt) = computeMonomial(point, xDeg, yDeg, zDeg); } polyCt++; } } } Teuchos::BLAS<int, double> myblas; int inc = 1; double alpha = 1.0; double beta = 0.0; myblas.GEMV(Teuchos::NO_TRANS, numPolys, numCubPoints, alpha, &functValues(0,0), numPolys, &cubWeights(0), inc, beta, &testIntFixDeg[0], inc); }
/* Computes integrals of monomials over a given reference cell. */ void computeIntegral(Teuchos::Array<double>& testIntFixDeg, int cubDegree) { CubatureGenSparse<double,3> myCub(cubDegree); int cubDim = myCub.getDimension(); int numCubPoints = myCub.getNumPoints(); int numPolys = (cubDegree+1)*(cubDegree+2)*(cubDegree+3)/6; FieldContainer<double> point(cubDim); FieldContainer<double> cubPoints(numCubPoints, cubDim); FieldContainer<double> cubWeights(numCubPoints); FieldContainer<double> functValues(numCubPoints, numPolys); myCub.getCubature(cubPoints, cubWeights); int polyCt = 0; for (int xDeg=0; xDeg <= cubDegree; xDeg++) { for (int yDeg=0; yDeg <= cubDegree-xDeg; yDeg++) { for (int zDeg=0; zDeg <= cubDegree-xDeg-yDeg; zDeg++) { for (int i=0; i<numCubPoints; i++) { for (int j=0; j<cubDim; j++) { point(j) = cubPoints(i,j); } functValues(i,polyCt) = computeMonomial(point, xDeg, yDeg, zDeg); } polyCt++; } } } Teuchos::BLAS<int, double> myblas; int inc = 1; double alpha = 1.0; double beta = 0.0; myblas.GEMV(Teuchos::NO_TRANS, numPolys, numCubPoints, alpha, &functValues(0,0), numPolys, &cubWeights(0), inc, beta, &testIntFixDeg[0], inc); }
void Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Apply(const Matrix& P, Matrix& Projected) const { // We check only row maps. Column may be different. TEUCHOS_TEST_FOR_EXCEPTION(!P.getRowMap()->isSameAs(*Projected.getRowMap()), Exceptions::Incompatible, "Row maps are incompatible"); const size_t NSDim = X_->getNumVectors(); const size_t numRows = P.getNodeNumRows(); const Map& colMap = *P.getColMap(); const Map& PColMap = *Projected.getColMap(); Projected.resumeFill(); Teuchos::ArrayView<const LO> indices, pindices; Teuchos::ArrayView<const SC> values, pvalues; Teuchos::Array<SC> valuesAll(colMap.getNodeNumElements()), newValues; LO invalid = Teuchos::OrdinalTraits<LO>::invalid(); LO oneLO = Teuchos::OrdinalTraits<LO>::one(); SC zero = Teuchos::ScalarTraits<SC> ::zero(); SC one = Teuchos::ScalarTraits<SC> ::one(); std::vector<const SC*> Xval(NSDim); for (size_t j = 0; j < NSDim; j++) Xval[j] = X_->getData(j).get(); for (size_t i = 0; i < numRows; i++) { P .getLocalRowView(i, indices, values); Projected.getLocalRowView(i, pindices, pvalues); size_t nnz = indices.size(); // number of nonzeros in the supplied matrix size_t pnnz = pindices.size(); // number of nonzeros in the constrained matrix newValues.resize(pnnz); // Step 1: fix stencil // Projected *must* already have the correct stencil // Step 2: copy correct stencil values // The algorithm is very similar to the one used in the calculation of // Frobenius dot product, see src/Transfers/Energy-Minimization/Solvers/MueLu_CGSolver_def.hpp // NOTE: using extra array allows us to skip the search among indices for (size_t j = 0; j < nnz; j++) valuesAll[indices[j]] = values[j]; for (size_t j = 0; j < pnnz; j++) { LO ind = colMap.getLocalElement(PColMap.getGlobalElement(pindices[j])); // FIXME: we could do that before the full loop just once if (ind != invalid) // index indices[j] is part of template, copy corresponding value newValues[j] = valuesAll[ind]; else newValues[j] = zero; } for (size_t j = 0; j < nnz; j++) valuesAll[indices[j]] = zero; // Step 3: project to the space Teuchos::SerialDenseMatrix<LO,SC>& XXtInv = XXtInv_[i]; Teuchos::SerialDenseMatrix<LO,SC> locX(NSDim, pnnz, false); for (size_t j = 0; j < pnnz; j++) for (size_t k = 0; k < NSDim; k++) locX(k,j) = Xval[k][pindices[j]]; Teuchos::SerialDenseVector<LO,SC> val(pnnz, false), val1(NSDim, false), val2(NSDim, false); for (size_t j = 0; j < pnnz; j++) val[j] = newValues[j]; Teuchos::BLAS<LO,SC> blas; // val1 = locX * val; blas.GEMV(Teuchos::NO_TRANS, NSDim, pnnz, one, locX.values(), locX.stride(), val.values(), oneLO, zero, val1.values(), oneLO); // val2 = XXtInv * val1 blas.GEMV(Teuchos::NO_TRANS, NSDim, NSDim, one, XXtInv.values(), XXtInv.stride(), val1.values(), oneLO, zero, val2.values(), oneLO); // val = X^T * val2 blas.GEMV(Teuchos::CONJ_TRANS, NSDim, pnnz, one, locX.values(), locX.stride(), val2.values(), oneLO, zero, val.values(), oneLO); for (size_t j = 0; j < pnnz; j++) newValues[j] -= val[j]; Projected.replaceLocalValues(i, pindices, newValues); } Projected.fillComplete(Projected.getDomainMap(), Projected.getRangeMap()); //FIXME: maps needed? }
void Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Apply(const Matrix& P, Matrix& Projected) const { const size_t NSDim = X_->getNumVectors(); const size_t numRows = P.getNodeNumRows(); Projected.resumeFill(); Teuchos::SerialDenseVector<LO,SC> BcRow(NSDim, false); for (size_t i = 0; i < numRows; i++) { Teuchos::ArrayView<const LO> indices, pindices; Teuchos::ArrayView<const SC> values, pvalues; P .getLocalRowView(i, indices, values); Projected.getLocalRowView(i, pindices, pvalues); size_t nnz = pindices.size(); // number of nonzeros in the constrained matrix size_t nnz1 = indices.size(); // number of nonzeros in the supplied matrix Teuchos::Array<SC> newValues(nnz, Teuchos::ScalarTraits<SC>::zero()); // step 1: fix stencil // Projected already has the correct stencil // step 2: copy correct stencil values for (size_t j = 0; j < nnz1; j++) { // this might be accelerated if we know smth about ordering size_t k = 0; for (; k < nnz; k++) if (pindices[k] == indices[j]) break; if (k != nnz) { // index indices[j] is part of template, copy corresponding value newValues[k] = values[j]; } } // step 3: project to the space Teuchos::SerialDenseMatrix<LO,SC> locX(NSDim, nnz, false); for (size_t j = 0; j < nnz; j++) { for (size_t k = 0; k < NSDim; k++) BcRow[k] = X_->getData(k)[pindices[j]]; Teuchos::setCol(BcRow, (LO)j, locX); } Teuchos::SerialDenseVector<LO,SC> val(nnz, false), val1(NSDim, false), val2(NSDim, false); for (size_t j = 0; j < nnz; j++) val[j] = newValues[j]; Teuchos::BLAS<LO,SC> blas; blas.GEMV(Teuchos::NO_TRANS, NSDim, nnz, Teuchos::ScalarTraits<SC>::one(), locX.values(), locX.stride(), val.values(), (LO)1, Teuchos::ScalarTraits<SC>::zero(), val1.values(), (LO)1); blas.GEMV(Teuchos::NO_TRANS, NSDim, NSDim, Teuchos::ScalarTraits<SC>::one(), XXtInv_[i].values(), XXtInv_[i].stride(), val1.values(), (LO)1, Teuchos::ScalarTraits<SC>::zero(), val2.values(), (LO)1); blas.GEMV(Teuchos::CONJ_TRANS, NSDim, nnz, Teuchos::ScalarTraits<SC>::one(), locX.values(), locX.stride(), val2.values(), (LO)1, Teuchos::ScalarTraits<SC>::zero(), val.values(), (LO)1); for (size_t j = 0; j < nnz; j++) newValues[j] -= val[j]; Projected.replaceLocalValues(i, pindices, newValues); } Projected.fillComplete(Projected.getDomainMap(), Projected.getRangeMap()); //FIXME: maps needed? }
int main(int argc, char **argv) { const unsigned int n = 5; Sacado::Fad::Vector<unsigned int, FadType> A(n*n,0),B(n,n), C(n,n); for (unsigned int i=0; i<n; i++) { for (unsigned int j=0; j<n; j++) A[i+j*n] = FadType(Teuchos::ScalarTraits<double>::random()); B[i] = FadType(n, Teuchos::ScalarTraits<double>::random()); for (unsigned int j=0; j<n; j++) B[i].fastAccessDx(j) = Teuchos::ScalarTraits<double>::random(); C[i] = 0.0; } double *a = A.vals(); double *b = B.vals(); double *bdx = B.dx(); std::vector<double> c(n), cdx(n*n); Teuchos::BLAS<int,double> blas; blas.GEMV(Teuchos::NO_TRANS, n, n, 1.0, &a[0], n, &b[0], 1, 0.0, &c[0], 1); blas.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, n, n, n, 1.0, &a[0], n, &bdx[0], n, 0.0, &cdx[0], n); // Teuchos::BLAS<int,FadType> blas_fad; // blas_fad.GEMV(Teuchos::NO_TRANS, n, n, 1.0, &A[0], n, &B[0], 1, 0.0, &C[0], 1); Teuchos::BLAS<int,FadType> sacado_fad_blas(false); sacado_fad_blas.GEMV(Teuchos::NO_TRANS, n, n, 1.0, &A[0], n, &B[0], 1, 0.0, &C[0], 1); // Print the results int p = 4; int w = p+7; std::cout.setf(std::ios::scientific); std::cout.precision(p); std::cout << "BLAS GEMV calculation:" << std::endl; std::cout << "a = " << std::endl; for (unsigned int i=0; i<n; i++) { for (unsigned int j=0; j<n; j++) std::cout << " " << std::setw(w) << a[i+j*n]; std::cout << std::endl; } std::cout << "b = " << std::endl; for (unsigned int i=0; i<n; i++) { std::cout << " " << std::setw(w) << b[i]; } std::cout << std::endl; std::cout << "bdot = " << std::endl; for (unsigned int i=0; i<n; i++) { for (unsigned int j=0; j<n; j++) std::cout << " " << std::setw(w) << bdx[i+j*n]; std::cout << std::endl; } std::cout << "c = " << std::endl; for (unsigned int i=0; i<n; i++) { std::cout << " " << std::setw(w) << c[i]; } std::cout << std::endl; std::cout << "cdot = " << std::endl; for (unsigned int i=0; i<n; i++) { for (unsigned int j=0; j<n; j++) std::cout << " " << std::setw(w) << cdx[i+j*n]; std::cout << std::endl; } std::cout << std::endl << std::endl; std::cout << "FAD BLAS GEMV calculation:" << std::endl; std::cout << "A.val() (should = a) = " << std::endl; for (unsigned int i=0; i<n; i++) { for (unsigned int j=0; j<n; j++) std::cout << " " << std::setw(w) << A[i+j*n].val(); std::cout << std::endl; } std::cout << "B.val() (should = b) = " << std::endl; for (unsigned int i=0; i<n; i++) { std::cout << " " << std::setw(w) << B[i].val(); } std::cout << std::endl; std::cout << "B.dx() (should = bdot) = " << std::endl; double *Bdx = B.dx(); for (unsigned int i=0; i<n; i++) { for (unsigned int j=0; j<n; j++) std::cout << " " << std::setw(w) << Bdx[i+j*n]; std::cout << std::endl; } std::cout << "C.val() (should = c) = " << std::endl; for (unsigned int i=0; i<n; i++) { std::cout << " " << std::setw(w) << C[i].val(); } std::cout << std::endl; std::cout << "C.dx() (should = cdot) = " << std::endl; double *Cdx = C.dx(); for (unsigned int i=0; i<n; i++) { for (unsigned int j=0; j<n; j++) std::cout << " " << std::setw(w) << Cdx[i+j*n]; std::cout << std::endl; } double tol = 1.0e-14; bool failed = false; for (unsigned int i=0; i<n; i++) { if (std::fabs(C[i].val() - c[i]) > tol) failed = true; for (unsigned int j=0; j<n; j++) { if (std::fabs(C[i].dx(j) - cdx[i+j*n]) > tol) failed = true; } } if (!failed) { std::cout << "\nExample passed!" << std::endl; return 0; } else { std::cout <<"\nSomething is wrong, example failed!" << std::endl; return 1; } }