Пример #1
0
double
do_time_teuchos_double_gemv(unsigned int m, unsigned int n, unsigned int nloop)
{
  Sacado::Random<double> urand(0.0, 1.0);
  Teuchos::BLAS<int,double> blas;

  std::vector<double> A(m*n), B(n), C(m);
  for (unsigned int j=0; j<n; j++) {
    for (unsigned int i=0; i<m; i++)
      A[i+j*m] = urand.number();
    B[j] = urand.number();
  }
  for (unsigned int i=0; i<m; i++)
    C[i] = urand.number();
  double alpha = urand.number();
  double beta = urand.number();
  
  Teuchos::Time timer("Teuchos Double GEMV", false);
  timer.start(true);
  for (unsigned int j=0; j<nloop; j++) {
    blas.GEMV(Teuchos::NO_TRANS, m, n, alpha, &A[0], m, &B[0], 1, beta, &C[0], 1);
  }
  timer.stop();

  return timer.totalElapsedTime() / nloop;
}
Пример #2
0
    static void
    GEMM (const Teuchos::ETransp transA,
          const Teuchos::ETransp transB,
          const Scalar& alpha,
          const View<const Scalar**, LayoutLeft, DeviceType>& A,
          const View<const Scalar**, LayoutLeft, DeviceType>& B,
          const Scalar& beta,
          const View<Scalar**, LayoutLeft, DeviceType>& C)
    {
      const int n = static_cast<int> (C.dimension_1 ());
      const int lda = static_cast<int> (Impl::getStride2DView (A));
      Teuchos::BLAS<int,Scalar> blas;

      // For some BLAS implementations (e.g., MKL), GEMM when B has
      // one column may be signficantly less efficient than GEMV.
      if (n == 1 && transB == Teuchos::NO_TRANS) {
        blas.GEMV (transA, A.dimension_0 (), A.dimension_1 (),
                   alpha, A.ptr_on_device (), lda,
                   B.ptr_on_device (), static_cast<int> (1),
                   beta, C.ptr_on_device (), static_cast<int> (1));
      }
      else {
        const int m = static_cast<int> (C.dimension_0 ());
        const int k = static_cast<int> (transA == Teuchos::NO_TRANS ?
                                        A.dimension_1 () : A.dimension_0 ());
        const int ldb = static_cast<int> (Impl::getStride2DView (B));
        const int ldc = static_cast<int> (Impl::getStride2DView (C));

        blas.GEMM (transA, transB, m, n, k, alpha,
                   A.ptr_on_device(), lda,
                   B.ptr_on_device(), ldb,
                   beta, C.ptr_on_device(), ldc);
      }
    }
Пример #3
0
void
Stokhos::StieltjesPCEBasis<ordinal_type, value_type>::
transformCoeffsFromStieltjes(const value_type *in, value_type *out) const
{
  Teuchos::BLAS<ordinal_type, value_type> blas;
  blas.GEMV(Teuchos::TRANS, fromStieltjesMat.numRows(), 
	    fromStieltjesMat.numCols(), 1.0, fromStieltjesMat.values(), 
	    fromStieltjesMat.numRows(), in, 1, 0.0, out, 1);
}
Пример #4
0
void
Stokhos::LanczosPCEBasis<ordinal_type, value_type>::
transformCoeffsFromLanczos(const value_type *in, value_type *out) const
{
  Teuchos::BLAS<ordinal_type, value_type> blas;
  ordinal_type sz = fromStieltjesMat.numRows();
  blas.GEMV(Teuchos::NO_TRANS, sz, this->p+1, 
	    value_type(1.0), fromStieltjesMat.values(), sz, 
	    in, ordinal_type(1), value_type(0.0), out, ordinal_type(1));
}
void
Stokhos::MonoProjPCEBasis<ordinal_type, value_type>::
transformCoeffs(const value_type *in, value_type *out) const
{
  // Transform coefficients to normalized basis
  Teuchos::BLAS<ordinal_type, value_type> blas;
  blas.GEMV(Teuchos::NO_TRANS, pce_sz, this->p+1, 
	    value_type(1.0), basis_vecs.values(), pce_sz, 
	    in, ordinal_type(1), value_type(0.0), out, ordinal_type(1));

  // Transform from normalized to original
  for (ordinal_type i=0; i<pce_sz; i++)
    out[i] /= pce_norms[i];
}
Пример #6
0
double
do_time_teuchos_fad_gemv(unsigned int m, unsigned int n, unsigned int ndot, 
			 unsigned int nloop)
{
  Sacado::Random<double> urand(0.0, 1.0);
  Teuchos::BLAS<int,FadType> blas;

  std::vector<FadType> A(m*n), B(n), C(m);
  for (unsigned int j=0; j<n; j++) {
    for (unsigned int i=0; i<m; i++) {
      //A[i+j*m] = urand.number();
      A[i+j*m] = FadType(ndot, urand.number());
      for (unsigned int k=0; k<ndot; k++)
      	A[i+j*m].fastAccessDx(k) = urand.number();
    }
    B[j] = FadType(ndot, urand.number());
    for (unsigned int k=0; k<ndot; k++)
      B[j].fastAccessDx(k) = urand.number();
  }
  for (unsigned int i=0; i<m; i++) {
    C[i] = FadType(ndot, urand.number());
    for (unsigned int k=0; k<ndot; k++)
      C[i].fastAccessDx(k) = urand.number();
  }
  FadType alpha(ndot, urand.number());
  FadType beta(ndot, urand.number());
  for (unsigned int k=0; k<ndot; k++) {
    alpha.fastAccessDx(k) = urand.number();
    beta.fastAccessDx(k) = urand.number();
  }
  
  Teuchos::Time timer("Teuchos Fad GEMV", false);
  timer.start(true);
  for (unsigned int j=0; j<nloop; j++) {
    blas.GEMV(Teuchos::NO_TRANS, m, n, alpha, &A[0], m, &B[0], 1, beta, &C[0], 1);
  }
  timer.stop();

  return timer.totalElapsedTime() / nloop;
}
Пример #7
0
/*
  Computes integrals of monomials over a given reference cell.
*/
void computeIntegral(Teuchos::Array<double>& testIntFixDeg, shards::CellTopology & cellTopology, int cubDegree) {

  DefaultCubatureFactory<double>  cubFactory;                                         // create factory
  Teuchos::RCP<Cubature<double> > myCub = cubFactory.create(cellTopology, cubDegree); // create default cubature

  int cubDim       = myCub->getDimension();
  int numCubPoints = myCub->getNumPoints();
  int numPolys     = (cubDegree+1)*(cubDegree+2)*(cubDegree+3)/6;

  FieldContainer<double> point(cubDim);
  FieldContainer<double> cubPoints(numCubPoints, cubDim);
  FieldContainer<double> cubWeights(numCubPoints);
  FieldContainer<double> functValues(numCubPoints, numPolys);

  myCub->getCubature(cubPoints, cubWeights);

  int polyCt = 0;
  for (int xDeg=0; xDeg <= cubDegree; xDeg++) {
    for (int yDeg=0; yDeg <= cubDegree-xDeg; yDeg++) {
      for (int zDeg=0; zDeg <= cubDegree-xDeg-yDeg; zDeg++) {
        for (int i=0; i<numCubPoints; i++) {
          for (int j=0; j<cubDim; j++) {
            point(j) = cubPoints(i,j);
          }
          functValues(i,polyCt) = computeMonomial(point, xDeg, yDeg, zDeg);
        }
        polyCt++;
      }
    }
  }

  Teuchos::BLAS<int, double> myblas;
  int inc = 1;
  double alpha = 1.0;
  double beta  = 0.0;
  myblas.GEMV(Teuchos::NO_TRANS, numPolys, numCubPoints, alpha, &functValues(0,0), numPolys,
              &cubWeights(0), inc, beta, &testIntFixDeg[0], inc);
}
Пример #8
0
/*
  Computes integrals of monomials over a given reference cell.
*/
void computeIntegral(Teuchos::Array<double>& testIntFixDeg, int cubDegree) {

  CubatureGenSparse<double,3> myCub(cubDegree);

  int cubDim       = myCub.getDimension();
  int numCubPoints = myCub.getNumPoints();
  int numPolys     = (cubDegree+1)*(cubDegree+2)*(cubDegree+3)/6;

  FieldContainer<double> point(cubDim);
  FieldContainer<double> cubPoints(numCubPoints, cubDim);
  FieldContainer<double> cubWeights(numCubPoints);
  FieldContainer<double> functValues(numCubPoints, numPolys);

  myCub.getCubature(cubPoints, cubWeights);

  int polyCt = 0;
  for (int xDeg=0; xDeg <= cubDegree; xDeg++) {
    for (int yDeg=0; yDeg <= cubDegree-xDeg; yDeg++) {
      for (int zDeg=0; zDeg <= cubDegree-xDeg-yDeg; zDeg++) {
        for (int i=0; i<numCubPoints; i++) {
          for (int j=0; j<cubDim; j++) {
            point(j) = cubPoints(i,j);
          }
          functValues(i,polyCt) = computeMonomial(point, xDeg, yDeg, zDeg);
        }
        polyCt++;
      }
    }
  }

  Teuchos::BLAS<int, double> myblas;
  int inc = 1;
  double alpha = 1.0;
  double beta  = 0.0;
  myblas.GEMV(Teuchos::NO_TRANS, numPolys, numCubPoints, alpha, &functValues(0,0), numPolys,
              &cubWeights(0), inc, beta, &testIntFixDeg[0], inc);
}
  void Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Apply(const Matrix& P, Matrix& Projected) const {
    // We check only row maps. Column may be different.
    TEUCHOS_TEST_FOR_EXCEPTION(!P.getRowMap()->isSameAs(*Projected.getRowMap()), Exceptions::Incompatible,
                               "Row maps are incompatible");
    const size_t NSDim   = X_->getNumVectors();
    const size_t numRows = P.getNodeNumRows();

    const Map& colMap  = *P.getColMap();
    const Map& PColMap = *Projected.getColMap();

    Projected.resumeFill();

    Teuchos::ArrayView<const LO> indices, pindices;
    Teuchos::ArrayView<const SC> values,  pvalues;
    Teuchos::Array<SC> valuesAll(colMap.getNodeNumElements()), newValues;

    LO invalid = Teuchos::OrdinalTraits<LO>::invalid();
    LO oneLO   = Teuchos::OrdinalTraits<LO>::one();
    SC zero    = Teuchos::ScalarTraits<SC> ::zero();
    SC one     = Teuchos::ScalarTraits<SC> ::one();

    std::vector<const SC*> Xval(NSDim);
    for (size_t j = 0; j < NSDim; j++)
      Xval[j] = X_->getData(j).get();

    for (size_t i = 0; i < numRows; i++) {
      P        .getLocalRowView(i,  indices,  values);
      Projected.getLocalRowView(i, pindices, pvalues);

      size_t nnz  = indices.size();     // number of nonzeros in the supplied matrix
      size_t pnnz = pindices.size();    // number of nonzeros in the constrained matrix

      newValues.resize(pnnz);

      // Step 1: fix stencil
      // Projected *must* already have the correct stencil

      // Step 2: copy correct stencil values
      // The algorithm is very similar to the one used in the calculation of
      // Frobenius dot product, see src/Transfers/Energy-Minimization/Solvers/MueLu_CGSolver_def.hpp

      // NOTE: using extra array allows us to skip the search among indices
      for (size_t j = 0; j < nnz; j++)
        valuesAll[indices[j]] = values[j];
      for (size_t j = 0; j < pnnz; j++) {
        LO ind = colMap.getLocalElement(PColMap.getGlobalElement(pindices[j])); // FIXME: we could do that before the full loop just once
        if (ind != invalid)
          // index indices[j] is part of template, copy corresponding value
          newValues[j] = valuesAll[ind];
        else
          newValues[j] = zero;
      }
      for (size_t j = 0; j < nnz; j++)
        valuesAll[indices[j]] = zero;

      // Step 3: project to the space
      Teuchos::SerialDenseMatrix<LO,SC>& XXtInv = XXtInv_[i];

      Teuchos::SerialDenseMatrix<LO,SC> locX(NSDim, pnnz, false);
      for (size_t j = 0; j < pnnz; j++)
        for (size_t k = 0; k < NSDim; k++)
          locX(k,j) = Xval[k][pindices[j]];

      Teuchos::SerialDenseVector<LO,SC> val(pnnz, false), val1(NSDim, false), val2(NSDim, false);
      for (size_t j = 0; j < pnnz; j++)
        val[j] = newValues[j];

      Teuchos::BLAS<LO,SC> blas;
      // val1 = locX * val;
      blas.GEMV(Teuchos::NO_TRANS, NSDim, pnnz,
                one, locX.values(), locX.stride(),
                val.values(), oneLO,
                zero, val1.values(), oneLO);
      // val2 = XXtInv * val1
      blas.GEMV(Teuchos::NO_TRANS, NSDim, NSDim,
                one, XXtInv.values(), XXtInv.stride(),
                val1.values(), oneLO,
                zero,   val2.values(), oneLO);
      // val = X^T * val2
      blas.GEMV(Teuchos::CONJ_TRANS, NSDim, pnnz,
                one, locX.values(), locX.stride(),
                val2.values(), oneLO,
                zero,  val.values(), oneLO);

      for (size_t j = 0; j < pnnz; j++)
        newValues[j] -= val[j];

      Projected.replaceLocalValues(i, pindices, newValues);
    }

    Projected.fillComplete(Projected.getDomainMap(), Projected.getRangeMap()); //FIXME: maps needed?
  }
Пример #10
0
  void Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Apply(const Matrix& P, Matrix& Projected) const {
    const size_t NSDim   = X_->getNumVectors();
    const size_t numRows = P.getNodeNumRows();

    Projected.resumeFill();

    Teuchos::SerialDenseVector<LO,SC> BcRow(NSDim, false);
    for (size_t i = 0; i < numRows; i++) {
      Teuchos::ArrayView<const LO> indices, pindices;
      Teuchos::ArrayView<const SC> values,  pvalues;

      P        .getLocalRowView(i,  indices,  values);
      Projected.getLocalRowView(i, pindices, pvalues);

      size_t nnz  = pindices.size();    // number of nonzeros in the constrained matrix
      size_t nnz1 = indices.size();     // number of nonzeros in the supplied matrix

      Teuchos::Array<SC> newValues(nnz, Teuchos::ScalarTraits<SC>::zero());

      // step 1: fix stencil
      // Projected already has the correct stencil

      // step 2: copy correct stencil values
      for (size_t j = 0; j < nnz1; j++) {
        // this might be accelerated if we know smth about ordering
        size_t k = 0;
        for (; k < nnz; k++)
          if (pindices[k] == indices[j])
            break;
        if (k != nnz) {
          // index indices[j] is part of template, copy corresponding value
          newValues[k] = values[j];
        }
      }

      // step 3: project to the space
      Teuchos::SerialDenseMatrix<LO,SC> locX(NSDim, nnz, false);
      for (size_t j = 0; j < nnz; j++) {
        for (size_t k = 0; k < NSDim; k++)
          BcRow[k] = X_->getData(k)[pindices[j]];

        Teuchos::setCol(BcRow, (LO)j, locX);
      }

      Teuchos::SerialDenseVector<LO,SC> val(nnz, false), val1(NSDim, false), val2(NSDim, false);
      for (size_t j = 0; j < nnz; j++)
        val[j] = newValues[j];

      Teuchos::BLAS<LO,SC> blas;
      blas.GEMV(Teuchos::NO_TRANS, NSDim, nnz, Teuchos::ScalarTraits<SC>::one(), locX.values(),
                locX.stride(), val.values(), (LO)1, Teuchos::ScalarTraits<SC>::zero(), val1.values(), (LO)1);
      blas.GEMV(Teuchos::NO_TRANS, NSDim, NSDim, Teuchos::ScalarTraits<SC>::one(), XXtInv_[i].values(),
                XXtInv_[i].stride(), val1.values(), (LO)1, Teuchos::ScalarTraits<SC>::zero(), val2.values(), (LO)1);
      blas.GEMV(Teuchos::CONJ_TRANS, NSDim, nnz, Teuchos::ScalarTraits<SC>::one(), locX.values(),
                locX.stride(), val2.values(), (LO)1, Teuchos::ScalarTraits<SC>::zero(), val.values(), (LO)1);

      for (size_t j = 0; j < nnz; j++)
        newValues[j] -= val[j];

      Projected.replaceLocalValues(i, pindices, newValues);
    }

    Projected.fillComplete(Projected.getDomainMap(), Projected.getRangeMap()); //FIXME: maps needed?
  }
Пример #11
0
int main(int argc, char **argv)
{
  const unsigned int n = 5;
  Sacado::Fad::Vector<unsigned int, FadType> A(n*n,0),B(n,n), C(n,n);
  for (unsigned int i=0; i<n; i++) {
    for (unsigned int j=0; j<n; j++)
      A[i+j*n] = FadType(Teuchos::ScalarTraits<double>::random());
    B[i] = FadType(n, Teuchos::ScalarTraits<double>::random());
    for (unsigned int j=0; j<n; j++)
      B[i].fastAccessDx(j) = Teuchos::ScalarTraits<double>::random();
    C[i] = 0.0;
  }

  double *a = A.vals();
  double *b = B.vals();
  double *bdx = B.dx();
  std::vector<double> c(n), cdx(n*n);

  Teuchos::BLAS<int,double> blas;
  blas.GEMV(Teuchos::NO_TRANS, n, n, 1.0, &a[0], n, &b[0], 1, 0.0, &c[0], 1);
  blas.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, n, n, n, 1.0, &a[0], n, &bdx[0], n, 0.0, &cdx[0], n);

  // Teuchos::BLAS<int,FadType> blas_fad;
  // blas_fad.GEMV(Teuchos::NO_TRANS, n, n, 1.0, &A[0], n, &B[0], 1, 0.0, &C[0], 1);

  Teuchos::BLAS<int,FadType> sacado_fad_blas(false);
  sacado_fad_blas.GEMV(Teuchos::NO_TRANS, n, n, 1.0, &A[0], n, &B[0], 1, 0.0, &C[0], 1);

  // Print the results
  int p = 4;
  int w = p+7;
  std::cout.setf(std::ios::scientific);
  std::cout.precision(p);

  std::cout << "BLAS GEMV calculation:" << std::endl;
  std::cout << "a = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    for (unsigned int j=0; j<n; j++)
      std::cout << " " << std::setw(w) << a[i+j*n];
    std::cout << std::endl;
  }
  std::cout << "b = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    std::cout << " " << std::setw(w) << b[i];
  }
  std::cout << std::endl;
  std::cout << "bdot = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    for (unsigned int j=0; j<n; j++)
      std::cout << " " << std::setw(w) << bdx[i+j*n];
    std::cout << std::endl;
  }
  std::cout << "c = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    std::cout << " " << std::setw(w) << c[i];
  }
  std::cout << std::endl;
  std::cout << "cdot = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    for (unsigned int j=0; j<n; j++)
      std::cout << " " << std::setw(w) << cdx[i+j*n];
    std::cout << std::endl;
  }
  std::cout << std::endl << std::endl;

  std::cout << "FAD BLAS GEMV calculation:" << std::endl;
  std::cout << "A.val() (should = a) = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    for (unsigned int j=0; j<n; j++)
      std::cout << " " << std::setw(w) << A[i+j*n].val();
    std::cout << std::endl;
  }
  std::cout << "B.val() (should = b) = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    std::cout << " " << std::setw(w) << B[i].val();
  }
  std::cout << std::endl;
  std::cout << "B.dx() (should = bdot) = " << std::endl;
  double *Bdx = B.dx();
  for (unsigned int i=0; i<n; i++) {
    for (unsigned int j=0; j<n; j++)
      std::cout << " " << std::setw(w) << Bdx[i+j*n];
    std::cout << std::endl;
  }
  std::cout << "C.val() (should = c) = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    std::cout << " " << std::setw(w) << C[i].val();
  }
  std::cout << std::endl;
  std::cout << "C.dx() (should = cdot) = " << std::endl;
  double *Cdx = C.dx();
  for (unsigned int i=0; i<n; i++) {
    for (unsigned int j=0; j<n; j++)
      std::cout << " " << std::setw(w) << Cdx[i+j*n];
    std::cout << std::endl;
  }

  double tol = 1.0e-14;
  bool failed = false;
  for (unsigned int i=0; i<n; i++) {
    if (std::fabs(C[i].val() - c[i]) > tol)
      failed = true;
    for (unsigned int j=0; j<n; j++) {
      if (std::fabs(C[i].dx(j) - cdx[i+j*n]) > tol) 
	failed = true;
    }
  }
  if (!failed) {
    std::cout << "\nExample passed!" << std::endl;
    return 0;
  }
  else {
    std::cout <<"\nSomething is wrong, example failed!" << std::endl;
    return 1;
  }
}