Exemple #1
0
int main(int argc, char* argv[])
{
#ifdef HAVE_MPI
  MPI_Init(&argc,&argv);
#endif

  // Creating an instance of the BLAS class for double-precision kernels looks like:
  Teuchos::BLAS<int, double> blas;

  // This instance provides the access to all the BLAS kernels listed in Figure \ref{blas_kernels}:
  const int n = 10;
  double alpha = 2.0;
  double x[ n ];
  for ( int i=0; i<n; i++ ) { x[i] = i; }
  blas.SCAL( n, alpha, x, 1 );
  int max_idx = blas.IAMAX( n, x, 1 );
  cout<< "The index of the maximum magnitude entry of x[] is the "
    <<  max_idx <<"-th and x[ " << max_idx-1 << " ] = "<< x[max_idx-1]
    << endl;

#ifdef HAVE_MPI
  MPI_Finalize();
#endif
  return 0;
}
int main(int argc, char* argv[])
{

  std::cout << Teuchos::Teuchos_Version() << std::endl << std::endl;

  // Creating an instance of the BLAS class for double-precision kernels looks like:
  Teuchos::BLAS<int, double> blas;

  // This instance provides the access to all the BLAS kernels like _SCAL.

  const int n = 10;
  double alpha = 2.0;

  double x[ n ];
  for ( int i=0; i<n; i++ ) { x[i] = i; }

  blas.SCAL( n, alpha, x, 1 );

  int max_idx = blas.IAMAX( n, x, 1 );

  std::cout<< "The index of the maximum magnitude entry of x[] is the "
      <<  max_idx <<"-th and x[ " << max_idx-1 << " ] = "<< x[max_idx-1]
      << std::endl;

  return 0;
}
Exemple #3
0
double
do_time_teuchos_fad_dot(unsigned int m, unsigned int ndot, unsigned int nloop)
{
  Sacado::Random<double> urand(0.0, 1.0);
  Teuchos::BLAS<int,FadType> blas;

  std::vector<FadType> X(m), Y(m);
  for (unsigned int i=0; i<m; i++) {
    X[i] = FadType(ndot, urand.number());
    Y[i] = FadType(ndot, urand.number());
    for (unsigned int k=0; k<ndot; k++) {
      X[i].fastAccessDx(k) = urand.number();
      Y[i].fastAccessDx(k) = urand.number();
    }
  }
  
  Teuchos::Time timer("Teuchos Fad DOT", false);
  timer.start(true);
  for (unsigned int j=0; j<nloop; j++) {
    FadType z = blas.DOT(m, &X[0], 1, &Y[0], 1);
  }
  timer.stop();

  return timer.totalElapsedTime() / nloop;
}
Exemple #4
0
/*
  Computes integrals of monomials over a given reference cell.
*/
double computeIntegral(shards::CellTopology & cellTopology, int cubDegree, int xDeg, int yDeg, int zDeg) {

  DefaultCubatureFactory<double>  cubFactory;                                         // create factory
  Teuchos::RCP<Cubature<double> > myCub = cubFactory.create(cellTopology, cubDegree); // create default cubature

  double val       = 0.0;
  int cubDim       = myCub->getDimension();
  int numCubPoints = myCub->getNumPoints();

  FieldContainer<double> point(cubDim);
  FieldContainer<double> cubPoints(numCubPoints, cubDim);
  FieldContainer<double> cubWeights(numCubPoints);
  FieldContainer<double> functValues(numCubPoints);

  myCub->getCubature(cubPoints, cubWeights);

  for (int i=0; i<numCubPoints; i++) {
    for (int j=0; j<cubDim; j++) {
      point(j) = cubPoints(i,j);
    }
    functValues(i) = computeMonomial(point, xDeg, yDeg, zDeg);
  }

  Teuchos::BLAS<int, double> myblas;
  int inc = 1;
  val = myblas.DOT(numCubPoints, &functValues[0], inc, &cubWeights[0], inc);

  return val;
}
Exemple #5
0
    static void
    GEMM (const Teuchos::ETransp transA,
          const Teuchos::ETransp transB,
          const Scalar& alpha,
          const View<const Scalar**, LayoutLeft, DeviceType>& A,
          const View<const Scalar**, LayoutLeft, DeviceType>& B,
          const Scalar& beta,
          const View<Scalar**, LayoutLeft, DeviceType>& C)
    {
      const int n = static_cast<int> (C.dimension_1 ());
      const int lda = static_cast<int> (Impl::getStride2DView (A));
      Teuchos::BLAS<int,Scalar> blas;

      // For some BLAS implementations (e.g., MKL), GEMM when B has
      // one column may be signficantly less efficient than GEMV.
      if (n == 1 && transB == Teuchos::NO_TRANS) {
        blas.GEMV (transA, A.dimension_0 (), A.dimension_1 (),
                   alpha, A.ptr_on_device (), lda,
                   B.ptr_on_device (), static_cast<int> (1),
                   beta, C.ptr_on_device (), static_cast<int> (1));
      }
      else {
        const int m = static_cast<int> (C.dimension_0 ());
        const int k = static_cast<int> (transA == Teuchos::NO_TRANS ?
                                        A.dimension_1 () : A.dimension_0 ());
        const int ldb = static_cast<int> (Impl::getStride2DView (B));
        const int ldc = static_cast<int> (Impl::getStride2DView (C));

        blas.GEMM (transA, transB, m, n, k, alpha,
                   A.ptr_on_device(), lda,
                   B.ptr_on_device(), ldb,
                   beta, C.ptr_on_device(), ldc);
      }
    }
Exemple #6
0
double
do_time_teuchos_double_gemv(unsigned int m, unsigned int n, unsigned int nloop)
{
  Sacado::Random<double> urand(0.0, 1.0);
  Teuchos::BLAS<int,double> blas;

  std::vector<double> A(m*n), B(n), C(m);
  for (unsigned int j=0; j<n; j++) {
    for (unsigned int i=0; i<m; i++)
      A[i+j*m] = urand.number();
    B[j] = urand.number();
  }
  for (unsigned int i=0; i<m; i++)
    C[i] = urand.number();
  double alpha = urand.number();
  double beta = urand.number();
  
  Teuchos::Time timer("Teuchos Double GEMV", false);
  timer.start(true);
  for (unsigned int j=0; j<nloop; j++) {
    blas.GEMV(Teuchos::NO_TRANS, m, n, alpha, &A[0], m, &B[0], 1, beta, &C[0], 1);
  }
  timer.stop();

  return timer.totalElapsedTime() / nloop;
}
Exemple #7
0
void
Stokhos::StieltjesPCEBasis<ordinal_type, value_type>::
transformCoeffsFromStieltjes(const value_type *in, value_type *out) const
{
  Teuchos::BLAS<ordinal_type, value_type> blas;
  blas.GEMV(Teuchos::TRANS, fromStieltjesMat.numRows(), 
	    fromStieltjesMat.numCols(), 1.0, fromStieltjesMat.values(), 
	    fromStieltjesMat.numRows(), in, 1, 0.0, out, 1);
}
Exemple #8
0
void
Stokhos::LanczosPCEBasis<ordinal_type, value_type>::
transformCoeffsFromLanczos(const value_type *in, value_type *out) const
{
  Teuchos::BLAS<ordinal_type, value_type> blas;
  ordinal_type sz = fromStieltjesMat.numRows();
  blas.GEMV(Teuchos::NO_TRANS, sz, this->p+1, 
	    value_type(1.0), fromStieltjesMat.values(), sz, 
	    in, ordinal_type(1), value_type(0.0), out, ordinal_type(1));
}
void
Stokhos::GramSchmidtBasis<ordinal_type, value_type>::
transformCoeffs(const value_type *in, value_type *out) const
{
  Teuchos::BLAS<ordinal_type, value_type> blas;
  for (ordinal_type i=0; i<sz; i++)
    out[i] = in[i];
  blas.TRSM(Teuchos::LEFT_SIDE, Teuchos::LOWER_TRI, Teuchos::TRANS,
	    Teuchos::UNIT_DIAG, sz, 1, 1.0, gs_mat.values(), sz, out, sz);
}
void
Stokhos::MonoProjPCEBasis<ordinal_type, value_type>::
transformCoeffs(const value_type *in, value_type *out) const
{
  // Transform coefficients to normalized basis
  Teuchos::BLAS<ordinal_type, value_type> blas;
  blas.GEMV(Teuchos::NO_TRANS, pce_sz, this->p+1, 
	    value_type(1.0), basis_vecs.values(), pce_sz, 
	    in, ordinal_type(1), value_type(0.0), out, ordinal_type(1));

  // Transform from normalized to original
  for (ordinal_type i=0; i<pce_sz; i++)
    out[i] /= pce_norms[i];
}
Exemple #11
0
void Arnoldi::_Orthogonalize(cSDV& q){
    Teuchos::BLAS<int, complex<double> > blas;
    complex<double> OrthogFactor;

    for( int j = 0; j <= _k; ++j ){
        OrthogFactor = blas.DOT(_length, q.values(), 1, _Q[j], 1);
        _H[_k][j] = OrthogFactor;
        complex<double>* iterQ = _Q[j];
        complex<double>* iterq = q.values();
        for( int i = 0; i < _length; ++i, ++iterQ, ++iterq ){
           *iterq -= OrthogFactor*(*iterQ);
        }
    }
}
  KOKKOS_INLINE_FUNCTION
  int
  Gemm<Trans::ConjTranspose,Trans::NoTranspose,
       AlgoGemm::ExternalBlas,Variant::One>
  ::invoke(PolicyType &policy,
           MemberType &member,
           const ScalarType alpha,
           DenseExecViewTypeA &A,
           DenseExecViewTypeB &B,
           const ScalarType beta,
           DenseExecViewTypeC &C) {
    // static_assert( Kokkos::Impl::is_same<
    //                typename DenseMatrixTypeA::space_type,
    //                typename DenseMatrixTypeB::space_type
    //                >::value && 
    //                Kokkos::Impl::is_same<
    //                typename DenseMatrixTypeB::space_type,
    //                typename DenseMatrixTypeC::space_type
    //                >::value,
    //                "Space type of input matrices does not match" );
    
    if (member.team_rank() == 0) {
#if                                                     \
  defined( HAVE_SHYLUTACHO_TEUCHOS ) &&                 \
  defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
      typedef typename DenseExecViewTypeA::ordinal_type ordinal_type;
      typedef typename DenseExecViewTypeA::value_type   value_type;

      Teuchos::BLAS<ordinal_type,value_type> blas;
      
      const ordinal_type m = C.NumRows();
      const ordinal_type n = C.NumCols();
      const ordinal_type k = B.NumRows();

      if (m > 0 && n > 0 && k > 0)
        blas.GEMM(Teuchos::CONJ_TRANS, Teuchos::NO_TRANS,
                  m, n, k,
                  alpha,
                  A.ValuePtr(), A.BaseObject().ColStride(),
                  B.ValuePtr(), B.BaseObject().ColStride(),
                  beta,
                  C.ValuePtr(), C.BaseObject().ColStride());
#else
    TACHO_TEST_FOR_ABORT( true, MSG_NOT_HAVE_PACKAGE("Teuchos") );        
#endif
    } 

    return 0;
  }
Exemple #13
0
double
do_time_teuchos_fad_gemm(unsigned int m, unsigned int n, unsigned int k,
			 unsigned int ndot, unsigned int nloop)
{
  Sacado::Random<double> urand(0.0, 1.0);
  Teuchos::BLAS<int,FadType> blas;

  std::vector<FadType> A(m*k), B(k*n), C(m*n);
  for (unsigned int j=0; j<k; j++) {
    for (unsigned int i=0; i<m; i++) {
      A[i+j*m] = FadType(ndot, urand.number());
      for (unsigned int l=0; l<ndot; l++)
      	A[i+j*m].fastAccessDx(l) = urand.number();
    }
  }
  for (unsigned int j=0; j<n; j++) {
    for (unsigned int i=0; i<k; i++) {
      B[i+j*k] = FadType(ndot, urand.number());
      for (unsigned int l=0; l<ndot; l++)
	B[i+j*k].fastAccessDx(l) = urand.number();
    }
  }
  for (unsigned int j=0; j<n; j++) {
    for (unsigned int i=0; i<m; i++) {
      C[i+j*m] = FadType(ndot, urand.number());
      for (unsigned int l=0; l<ndot; l++)
	C[i+j*m].fastAccessDx(l) = urand.number();
    }
  }
  FadType alpha(ndot, urand.number());
  FadType beta(ndot, urand.number());
  for (unsigned int l=0; l<ndot; l++) {
    alpha.fastAccessDx(l) = urand.number();
    beta.fastAccessDx(l) = urand.number();
  }
  
  Teuchos::Time timer("Teuchos Fad GEMM", false);
  timer.start(true);
  for (unsigned int j=0; j<nloop; j++) {
    blas.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, m, n, k, alpha, &A[0], m, 
	      &B[0], k, beta, &C[0], m);
  }
  timer.stop();

  return timer.totalElapsedTime() / nloop;
}
  KOKKOS_INLINE_FUNCTION
  int
  Trsm<Side::Left,Uplo::Upper,Trans::NoTranspose,
       AlgoTrsm::ExternalBlas,Variant::One>
  ::invoke(PolicyType &policy,
           const MemberType &member,
           const int diagA,
           const ScalarType alpha,
           DenseExecViewTypeA &A,
           DenseExecViewTypeB &B) {
    // static_assert( Kokkos::Impl::is_same<
    //                typename DenseMatrixTypeA::space_type,
    //                Kokkos::Cuda
    //                >::value,
    //                "Cuda space is not available for calling external BLAS" );

    // static_assert( Kokkos::Impl::is_same<
    //                typename DenseMatrixTypeA::space_type,
    //                typename DenseMatrixTypeB::space_type
    //                >::value,
    //                "Space type of input matrices does not match" );

    //typedef typename DenseExecViewTypeA::space_type   space_type;
    typedef typename DenseExecViewTypeA::ordinal_type ordinal_type;
    typedef typename DenseExecViewTypeA::value_type   value_type;

    if (member.team_rank() == 0) {
#ifdef HAVE_SHYLUTACHO_TEUCHOS
      Teuchos::BLAS<ordinal_type,value_type> blas;

      const ordinal_type m = A.NumRows();
      const ordinal_type n = B.NumCols();

      blas.TRSM(Teuchos::LEFT_SIDE, Teuchos::UPPER_TRI, Teuchos::NO_TRANS,
                (diagA == Diag::Unit ? Teuchos::UNIT_DIAG : Teuchos::NON_UNIT_DIAG),
                m, n,
                alpha,
                A.ValuePtr(), A.BaseObject().ColStride(),
                B.ValuePtr(), B.BaseObject().ColStride());
#else
    TACHO_TEST_FOR_ABORT( true, MSG_NOT_HAVE_PACKAGE("Teuchos") );
#endif
    }

    return 0;
  }
Exemple #15
0
  void Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Setup(const MultiVector& B, const MultiVector& Bc, RCP<const CrsGraph> Ppattern) {
    Ppattern_ = Ppattern;

    const RCP<const Map> uniqueMap    = Ppattern_->getDomainMap();
    const RCP<const Map> nonUniqueMap = Ppattern_->getColMap();
    RCP<const Import> importer = ImportFactory::Build(uniqueMap, nonUniqueMap);

    const size_t NSDim = Bc.getNumVectors();
    X_ = MultiVectorFactory::Build(nonUniqueMap, NSDim);
    X_->doImport(Bc, *importer, Xpetra::INSERT);

    size_t numRows = Ppattern_->getNodeNumRows();
    XXtInv_.resize(numRows);
    Teuchos::SerialDenseVector<LO,SC> BcRow(NSDim, false);
    for (size_t i = 0; i < numRows; i++) {
      Teuchos::ArrayView<const LO> indices;
      Ppattern_->getLocalRowView(i, indices);

      size_t nnz = indices.size();

      Teuchos::SerialDenseMatrix<LO,SC> locX(NSDim, nnz, false);
      for (size_t j = 0; j < nnz; j++) {
        for (size_t k = 0; k < NSDim; k++)
          BcRow[k] = X_->getData(k)[indices[j]];

        Teuchos::setCol(BcRow, (LO)j, locX);
      }

      XXtInv_[i] = Teuchos::SerialDenseMatrix<LO,SC>(NSDim, NSDim, false);

      Teuchos::BLAS<LO,SC> blas;
      blas.GEMM(Teuchos::NO_TRANS, Teuchos::CONJ_TRANS, NSDim, NSDim, nnz,
                Teuchos::ScalarTraits<SC>::one(), locX.values(), locX.stride(),
                locX.values(), locX.stride(), Teuchos::ScalarTraits<SC>::zero(),
                XXtInv_[i].values(), XXtInv_[i].stride());

      Teuchos::LAPACK<LO,SC> lapack;
      LO info, lwork = 3*NSDim;
      ArrayRCP<LO> IPIV(NSDim);
      ArrayRCP<SC> WORK(lwork);
      lapack.GETRF(NSDim, NSDim, XXtInv_[i].values(), XXtInv_[i].stride(), IPIV.get(), &info);
      lapack.GETRI(NSDim, XXtInv_[i].values(), XXtInv_[i].stride(), IPIV.get(), WORK.get(), lwork, &info);
    }
  }
Exemple #16
0
double
do_time_teuchos_fad_gemv(unsigned int m, unsigned int n, unsigned int ndot, 
			 unsigned int nloop)
{
  Sacado::Random<double> urand(0.0, 1.0);
  Teuchos::BLAS<int,FadType> blas;

  std::vector<FadType> A(m*n), B(n), C(m);
  for (unsigned int j=0; j<n; j++) {
    for (unsigned int i=0; i<m; i++) {
      //A[i+j*m] = urand.number();
      A[i+j*m] = FadType(ndot, urand.number());
      for (unsigned int k=0; k<ndot; k++)
      	A[i+j*m].fastAccessDx(k) = urand.number();
    }
    B[j] = FadType(ndot, urand.number());
    for (unsigned int k=0; k<ndot; k++)
      B[j].fastAccessDx(k) = urand.number();
  }
  for (unsigned int i=0; i<m; i++) {
    C[i] = FadType(ndot, urand.number());
    for (unsigned int k=0; k<ndot; k++)
      C[i].fastAccessDx(k) = urand.number();
  }
  FadType alpha(ndot, urand.number());
  FadType beta(ndot, urand.number());
  for (unsigned int k=0; k<ndot; k++) {
    alpha.fastAccessDx(k) = urand.number();
    beta.fastAccessDx(k) = urand.number();
  }
  
  Teuchos::Time timer("Teuchos Fad GEMV", false);
  timer.start(true);
  for (unsigned int j=0; j<nloop; j++) {
    blas.GEMV(Teuchos::NO_TRANS, m, n, alpha, &A[0], m, &B[0], 1, beta, &C[0], 1);
  }
  timer.stop();

  return timer.totalElapsedTime() / nloop;
}
Exemple #17
0
double
do_time_teuchos_double_dot(unsigned int m, unsigned int nloop)
{
  Sacado::Random<double> urand(0.0, 1.0);
  Teuchos::BLAS<int,double> blas;

  std::vector<double> X(m), Y(m);
  for (unsigned int i=0; i<m; i++) {
    X[i] = urand.number();
    Y[i] = urand.number();
  }
  
  Teuchos::Time timer("Teuchos Double DOT", false);
  timer.start(true);
  double z;
  for (unsigned int j=0; j<nloop; j++) {
    z += blas.DOT(m, &X[0], 1, &Y[0], 1);
  }
  timer.stop();

  return timer.totalElapsedTime() / nloop;
}
Exemple #18
0
    static void
    GEMM (const Teuchos::ETransp transA,
          const Teuchos::ETransp transB,
          const double& alpha,
          const View<const double**, LayoutLeft, DeviceType>& A,
          const View<const double**, LayoutLeft, DeviceType>& B,
          const double& beta,
          const View<double**, LayoutLeft, DeviceType>& C)
    {
      const int n = static_cast<int> (C.dimension_1 ());

      // For some BLAS implementations (e.g., MKL), GEMM when B has
      // one column may be signficantly less efficient than GEMV.
      if (n == 1 && transB == Teuchos::NO_TRANS) {
        char trans = 'N';
        if (transA == Teuchos::TRANS) {
          trans = 'T';
        }
        else if (transA == Teuchos::CONJ_TRANS) {
          trans = 'C';
        }
        auto B_0 = Kokkos::subview (B, Kokkos::ALL (), 0);
        auto C_0 = Kokkos::subview (C, Kokkos::ALL (), 0);
        KokkosBlas::gemv (&trans, alpha, A, B_0, beta, C_0);
      }
      else {
        const int m = static_cast<int> (C.dimension_0 ());
        const int k = static_cast<int> (transA == Teuchos::NO_TRANS ? A.dimension_1 () : A.dimension_0 ());
        const int lda = static_cast<int> (Impl::getStride2DView (A));
        const int ldb = static_cast<int> (Impl::getStride2DView (B));
        const int ldc = static_cast<int> (Impl::getStride2DView (C));

        Teuchos::BLAS<int,double> blas;
        blas.GEMM (transA, transB, m, n, k, alpha,
                   A.ptr_on_device(), lda,
                   B.ptr_on_device(), ldb,
                   beta, C.ptr_on_device(), ldc);
      }
    }
Exemple #19
0
/*
  Computes integrals of monomials over a given reference cell.
*/
void computeIntegral(Teuchos::Array<double>& testIntFixDeg, shards::CellTopology & cellTopology, int cubDegree) {

  DefaultCubatureFactory<double>  cubFactory;                                         // create factory
  Teuchos::RCP<Cubature<double> > myCub = cubFactory.create(cellTopology, cubDegree); // create default cubature

  int cubDim       = myCub->getDimension();
  int numCubPoints = myCub->getNumPoints();
  int numPolys     = (cubDegree+1)*(cubDegree+2)*(cubDegree+3)/6;

  FieldContainer<double> point(cubDim);
  FieldContainer<double> cubPoints(numCubPoints, cubDim);
  FieldContainer<double> cubWeights(numCubPoints);
  FieldContainer<double> functValues(numCubPoints, numPolys);

  myCub->getCubature(cubPoints, cubWeights);

  int polyCt = 0;
  for (int xDeg=0; xDeg <= cubDegree; xDeg++) {
    for (int yDeg=0; yDeg <= cubDegree-xDeg; yDeg++) {
      for (int zDeg=0; zDeg <= cubDegree-xDeg-yDeg; zDeg++) {
        for (int i=0; i<numCubPoints; i++) {
          for (int j=0; j<cubDim; j++) {
            point(j) = cubPoints(i,j);
          }
          functValues(i,polyCt) = computeMonomial(point, xDeg, yDeg, zDeg);
        }
        polyCt++;
      }
    }
  }

  Teuchos::BLAS<int, double> myblas;
  int inc = 1;
  double alpha = 1.0;
  double beta  = 0.0;
  myblas.GEMV(Teuchos::NO_TRANS, numPolys, numCubPoints, alpha, &functValues(0,0), numPolys,
              &cubWeights(0), inc, beta, &testIntFixDeg[0], inc);
}
Exemple #20
0
/*
  Computes integrals of monomials over a given reference cell.
*/
void computeIntegral(Teuchos::Array<double>& testIntFixDeg, int cubDegree) {

  CubatureGenSparse<double,3> myCub(cubDegree);

  int cubDim       = myCub.getDimension();
  int numCubPoints = myCub.getNumPoints();
  int numPolys     = (cubDegree+1)*(cubDegree+2)*(cubDegree+3)/6;

  FieldContainer<double> point(cubDim);
  FieldContainer<double> cubPoints(numCubPoints, cubDim);
  FieldContainer<double> cubWeights(numCubPoints);
  FieldContainer<double> functValues(numCubPoints, numPolys);

  myCub.getCubature(cubPoints, cubWeights);

  int polyCt = 0;
  for (int xDeg=0; xDeg <= cubDegree; xDeg++) {
    for (int yDeg=0; yDeg <= cubDegree-xDeg; yDeg++) {
      for (int zDeg=0; zDeg <= cubDegree-xDeg-yDeg; zDeg++) {
        for (int i=0; i<numCubPoints; i++) {
          for (int j=0; j<cubDim; j++) {
            point(j) = cubPoints(i,j);
          }
          functValues(i,polyCt) = computeMonomial(point, xDeg, yDeg, zDeg);
        }
        polyCt++;
      }
    }
  }

  Teuchos::BLAS<int, double> myblas;
  int inc = 1;
  double alpha = 1.0;
  double beta  = 0.0;
  myblas.GEMV(Teuchos::NO_TRANS, numPolys, numCubPoints, alpha, &functValues(0,0), numPolys,
              &cubWeights(0), inc, beta, &testIntFixDeg[0], inc);
}
int main(int argc, char **argv)
{
  const unsigned int n = 5;
  Sacado::Fad::Vector<unsigned int, FadType> A(n*n,0),B(n,n), C(n,n);
  for (unsigned int i=0; i<n; i++) {
    for (unsigned int j=0; j<n; j++)
      A[i+j*n] = FadType(Teuchos::ScalarTraits<double>::random());
    B[i] = FadType(n, Teuchos::ScalarTraits<double>::random());
    for (unsigned int j=0; j<n; j++)
      B[i].fastAccessDx(j) = Teuchos::ScalarTraits<double>::random();
    C[i] = 0.0;
  }

  double *a = A.vals();
  double *b = B.vals();
  double *bdx = B.dx();
  std::vector<double> c(n), cdx(n*n);

  Teuchos::BLAS<int,double> blas;
  blas.GEMV(Teuchos::NO_TRANS, n, n, 1.0, &a[0], n, &b[0], 1, 0.0, &c[0], 1);
  blas.GEMM(Teuchos::NO_TRANS, Teuchos::NO_TRANS, n, n, n, 1.0, &a[0], n, &bdx[0], n, 0.0, &cdx[0], n);

  // Teuchos::BLAS<int,FadType> blas_fad;
  // blas_fad.GEMV(Teuchos::NO_TRANS, n, n, 1.0, &A[0], n, &B[0], 1, 0.0, &C[0], 1);

  Teuchos::BLAS<int,FadType> sacado_fad_blas(false);
  sacado_fad_blas.GEMV(Teuchos::NO_TRANS, n, n, 1.0, &A[0], n, &B[0], 1, 0.0, &C[0], 1);

  // Print the results
  int p = 4;
  int w = p+7;
  std::cout.setf(std::ios::scientific);
  std::cout.precision(p);

  std::cout << "BLAS GEMV calculation:" << std::endl;
  std::cout << "a = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    for (unsigned int j=0; j<n; j++)
      std::cout << " " << std::setw(w) << a[i+j*n];
    std::cout << std::endl;
  }
  std::cout << "b = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    std::cout << " " << std::setw(w) << b[i];
  }
  std::cout << std::endl;
  std::cout << "bdot = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    for (unsigned int j=0; j<n; j++)
      std::cout << " " << std::setw(w) << bdx[i+j*n];
    std::cout << std::endl;
  }
  std::cout << "c = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    std::cout << " " << std::setw(w) << c[i];
  }
  std::cout << std::endl;
  std::cout << "cdot = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    for (unsigned int j=0; j<n; j++)
      std::cout << " " << std::setw(w) << cdx[i+j*n];
    std::cout << std::endl;
  }
  std::cout << std::endl << std::endl;

  std::cout << "FAD BLAS GEMV calculation:" << std::endl;
  std::cout << "A.val() (should = a) = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    for (unsigned int j=0; j<n; j++)
      std::cout << " " << std::setw(w) << A[i+j*n].val();
    std::cout << std::endl;
  }
  std::cout << "B.val() (should = b) = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    std::cout << " " << std::setw(w) << B[i].val();
  }
  std::cout << std::endl;
  std::cout << "B.dx() (should = bdot) = " << std::endl;
  double *Bdx = B.dx();
  for (unsigned int i=0; i<n; i++) {
    for (unsigned int j=0; j<n; j++)
      std::cout << " " << std::setw(w) << Bdx[i+j*n];
    std::cout << std::endl;
  }
  std::cout << "C.val() (should = c) = " << std::endl;
  for (unsigned int i=0; i<n; i++) {
    std::cout << " " << std::setw(w) << C[i].val();
  }
  std::cout << std::endl;
  std::cout << "C.dx() (should = cdot) = " << std::endl;
  double *Cdx = C.dx();
  for (unsigned int i=0; i<n; i++) {
    for (unsigned int j=0; j<n; j++)
      std::cout << " " << std::setw(w) << Cdx[i+j*n];
    std::cout << std::endl;
  }

  double tol = 1.0e-14;
  bool failed = false;
  for (unsigned int i=0; i<n; i++) {
    if (std::fabs(C[i].val() - c[i]) > tol)
      failed = true;
    for (unsigned int j=0; j<n; j++) {
      if (std::fabs(C[i].dx(j) - cdx[i+j*n]) > tol) 
	failed = true;
    }
  }
  if (!failed) {
    std::cout << "\nExample passed!" << std::endl;
    return 0;
  }
  else {
    std::cout <<"\nSomething is wrong, example failed!" << std::endl;
    return 1;
  }
}
    void
    randomGlobalMatrix (Generator* const pGenerator,
                        MatrixViewType& A_local,
                        const typename Teuchos::ScalarTraits< typename MatrixViewType::scalar_type >::magnitudeType singular_values[],
                        MessengerBase< typename MatrixViewType::ordinal_type >* const ordinalMessenger,
                        MessengerBase< typename MatrixViewType::scalar_type >* const scalarMessenger)
    {
      using Teuchos::NO_TRANS;
      using std::vector;
      typedef typename MatrixViewType::ordinal_type ordinal_type;
      typedef typename MatrixViewType::scalar_type scalar_type;


      const bool b_local_debug = false;

      const int rootProc = 0;
      const int nprocs = ordinalMessenger->size();
      const int myRank = ordinalMessenger->rank();
      Teuchos::BLAS<ordinal_type, scalar_type> blas;

      const ordinal_type nrowsLocal = A_local.nrows();
      const ordinal_type ncols = A_local.ncols();

      // Theory: Suppose there are P processors.  Proc q wants an m_q by n
      // component of the matrix A, which we write as A_q.  On Proc 0, we
      // generate random m_q by n orthogonal matrices Q_q (in explicit
      // form), and send Q_q to Proc q.  The m by n matrix [Q_0; Q_1; ...;
      // Q_{P-1}] is not itself orthogonal.  However, the m by n matrix
      // Q = [Q_0 / P; Q_1 / P; ...; Q_{P-1} / P] is orthogonal:
      //
      // \sum_{q = 0}^{P-1} (Q_q^T * Q_q) / P = I.

      if (myRank == rootProc)
        {
          typedef Random::MatrixGenerator< ordinal_type, scalar_type, Generator > matgen_type;
          matgen_type matGen (*pGenerator);

          // Generate a random ncols by ncols upper triangular matrix
          // R with the given singular values.
          Matrix< ordinal_type, scalar_type > R (ncols, ncols, scalar_type(0));
          matGen.fill_random_R (ncols, R.get(), R.lda(), singular_values);

          // Broadcast R to all the processors.
          scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc);

          // Generate (for myself) a random nrowsLocal x ncols
          // orthogonal matrix, stored in explicit form.
          Matrix< ordinal_type, scalar_type > Q_local (nrowsLocal, ncols);
          matGen.explicit_Q (nrowsLocal, ncols, Q_local.get(), Q_local.lda());

          // Scale the (local) orthogonal matrix by the number of
          // processors P, to make the columns of the global matrix Q
          // orthogonal.  (Otherwise the norm of each column will be P
          // instead of 1.)
          const scalar_type P = static_cast< scalar_type > (nprocs);
          // Do overflow check.  If casting P back to scalar_type
          // doesn't produce the same value as nprocs, the cast
          // overflowed.  We take the real part, because scalar_type
          // might be complex.
          if (nprocs != static_cast<int> (Teuchos::ScalarTraits<scalar_type>::real (P)))
            throw std::runtime_error ("Casting nprocs to Scalar failed");

          scaleMatrix (Q_local, P);

          // A_local := Q_local * R
          blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols,
                     scalar_type(1), Q_local.get(), Q_local.lda(),
                     R.get(), R.lda(),
                     scalar_type(0), A_local.get(), A_local.lda());

          for (int recvProc = 1; recvProc < nprocs; ++recvProc)
            {
              // Ask the receiving processor how big (i.e., how many rows)
              // its local component of the matrix is.
              ordinal_type nrowsRemote = 0;
              ordinalMessenger->recv (&nrowsRemote, 1, recvProc, 0);

              if (b_local_debug)
                {
                  std::ostringstream os;
                  os << "For Proc " << recvProc << ": local block is "
                     << nrowsRemote << " by " << ncols << std::endl;
                  std::cerr << os.str();
                }

              // Make sure Q_local is big enough to hold the data for
              // the current receiver proc.
              Q_local.reshape (nrowsRemote, ncols);

              // Compute a random nrowsRemote * ncols orthogonal
              // matrix Q_local, for the current receiving processor.
              matGen.explicit_Q (nrowsRemote, ncols, Q_local.get(), Q_local.lda());

              // Send Q_local to the current receiving processor.
              scalarMessenger->send (Q_local.get(), nrowsRemote*ncols, recvProc, 0);
            }
        }
      else
        {
          // Receive the R factor from Proc 0.  There's only 1 R
          // factor for all the processes.
          Matrix< ordinal_type, scalar_type > R (ncols, ncols, scalar_type (0));
          scalarMessenger->broadcast (R.get(), ncols*ncols, rootProc);

          // Q_local (nrows_local by ncols, random orthogonal matrix)
          // will be received from Proc 0, where it was generated.
          const ordinal_type recvSize = nrowsLocal * ncols;
          Matrix< ordinal_type, scalar_type > Q_local (nrowsLocal, ncols);

          // Tell Proc 0 how many rows there are in the random orthogonal
          // matrix I want to receive from Proc 0.
          ordinalMessenger->send (&nrowsLocal, 1, rootProc, 0);

          // Receive the orthogonal matrix from Proc 0.
          scalarMessenger->recv (Q_local.get(), recvSize, rootProc, 0);

          // Scale the (local) orthogonal matrix by the number of
          // processors, to make the global matrix Q orthogonal.
          const scalar_type P = static_cast< scalar_type > (nprocs);
          // Do overflow check.  If casting P back to scalar_type
          // doesn't produce the same value as nprocs, the cast
          // overflowed.  We take the real part, because scalar_type
          // might be complex.
          if (nprocs != static_cast<int> (Teuchos::ScalarTraits<scalar_type>::real (P)))
            throw std::runtime_error ("Casting nprocs to Scalar failed");
          scaleMatrix (Q_local, P);

          // A_local := Q_local * R
          blas.GEMM (NO_TRANS, NO_TRANS, nrowsLocal, ncols, ncols,
                     scalar_type(1), Q_local.get(), Q_local.lda(),
                     R.get(), R.lda(),
                     scalar_type(0), A_local.get(), A_local.lda());
        }
    }
Exemple #23
0
  void Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps>::Apply(const Matrix& P, Matrix& Projected) const {
    const size_t NSDim   = X_->getNumVectors();
    const size_t numRows = P.getNodeNumRows();

    Projected.resumeFill();

    Teuchos::SerialDenseVector<LO,SC> BcRow(NSDim, false);
    for (size_t i = 0; i < numRows; i++) {
      Teuchos::ArrayView<const LO> indices, pindices;
      Teuchos::ArrayView<const SC> values,  pvalues;

      P        .getLocalRowView(i,  indices,  values);
      Projected.getLocalRowView(i, pindices, pvalues);

      size_t nnz  = pindices.size();    // number of nonzeros in the constrained matrix
      size_t nnz1 = indices.size();     // number of nonzeros in the supplied matrix

      Teuchos::Array<SC> newValues(nnz, Teuchos::ScalarTraits<SC>::zero());

      // step 1: fix stencil
      // Projected already has the correct stencil

      // step 2: copy correct stencil values
      for (size_t j = 0; j < nnz1; j++) {
        // this might be accelerated if we know smth about ordering
        size_t k = 0;
        for (; k < nnz; k++)
          if (pindices[k] == indices[j])
            break;
        if (k != nnz) {
          // index indices[j] is part of template, copy corresponding value
          newValues[k] = values[j];
        }
      }

      // step 3: project to the space
      Teuchos::SerialDenseMatrix<LO,SC> locX(NSDim, nnz, false);
      for (size_t j = 0; j < nnz; j++) {
        for (size_t k = 0; k < NSDim; k++)
          BcRow[k] = X_->getData(k)[pindices[j]];

        Teuchos::setCol(BcRow, (LO)j, locX);
      }

      Teuchos::SerialDenseVector<LO,SC> val(nnz, false), val1(NSDim, false), val2(NSDim, false);
      for (size_t j = 0; j < nnz; j++)
        val[j] = newValues[j];

      Teuchos::BLAS<LO,SC> blas;
      blas.GEMV(Teuchos::NO_TRANS, NSDim, nnz, Teuchos::ScalarTraits<SC>::one(), locX.values(),
                locX.stride(), val.values(), (LO)1, Teuchos::ScalarTraits<SC>::zero(), val1.values(), (LO)1);
      blas.GEMV(Teuchos::NO_TRANS, NSDim, NSDim, Teuchos::ScalarTraits<SC>::one(), XXtInv_[i].values(),
                XXtInv_[i].stride(), val1.values(), (LO)1, Teuchos::ScalarTraits<SC>::zero(), val2.values(), (LO)1);
      blas.GEMV(Teuchos::CONJ_TRANS, NSDim, nnz, Teuchos::ScalarTraits<SC>::one(), locX.values(),
                locX.stride(), val2.values(), (LO)1, Teuchos::ScalarTraits<SC>::zero(), val.values(), (LO)1);

      for (size_t j = 0; j < nnz; j++)
        newValues[j] -= val[j];

      Projected.replaceLocalValues(i, pindices, newValues);
    }

    Projected.fillComplete(Projected.getDomainMap(), Projected.getRangeMap()); //FIXME: maps needed?
  }
Exemple #24
0
int main(int argc, char *argv[]) {
Kokkos::initialize();
  //Check number of arguments
   if (argc < 4) {
      std::cout <<"\n>>> ERROR: Invalid number of arguments.\n\n";
      std::cout <<"Usage:\n\n";
      std::cout <<"  ./Intrepid_example_Drivers_Example_06.exe deg NX NY verbose\n\n";
      std::cout <<" where \n";
      std::cout <<"   int deg             - polynomial degree to be used (assumed > 1) \n";
      std::cout <<"   int NX              - num intervals in x direction (assumed box domain, 0,1) \n";
      std::cout <<"   int NY              - num intervals in y direction (assumed box domain, 0,1) \n";
      std::cout <<"   verbose (optional)  - any character, indicates verbose output \n\n";
      exit(1);
   }
  
  // This little trick lets us print to std::cout only if
  // a (dummy) command-line argument is provided.
  int iprint     = argc - 1;
  Teuchos::RCP<std::ostream> outStream;
  Teuchos::oblackholestream bhs; // outputs nothing
  if (iprint > 2)
    outStream = Teuchos::rcp(&std::cout, false);
  else
    outStream = Teuchos::rcp(&bhs, false);
  
  // Save the format state of the original std::cout.
  Teuchos::oblackholestream oldFormatState;
  oldFormatState.copyfmt(std::cout);
  
  *outStream \
    << "===============================================================================\n" \
    << "|                                                                             |\n" \
    << "|  Example: Apply Stiffness Matrix for                                        |\n" \
    << "|                   Poisson Equation on Quadrilateral Mesh                    |\n" \
    << "|                                                                             |\n" \
    << "|  Questions? Contact  Pavel Bochev  ([email protected]),                    |\n" \
    << "|                      Denis Ridzal  ([email protected]),                    |\n" \
    << "|                      Kara Peterson ([email protected]).                    |\n" \
    << "|                                                                             |\n" \
    << "|  Intrepid's website: http://trilinos.sandia.gov/packages/intrepid           |\n" \
    << "|  Trilinos website:   http://trilinos.sandia.gov                             |\n" \
    << "|                                                                             |\n" \
    << "===============================================================================\n";

  
  // ************************************ GET INPUTS **************************************
  
  int deg          = atoi(argv[1]);  // polynomial degree to use
  int NX            = atoi(argv[2]);  // num intervals in x direction (assumed box domain, 0,1)
  int NY            = atoi(argv[3]);  // num intervals in y direction (assumed box domain, 0,1)
  

  // *********************************** CELL TOPOLOGY **********************************
  
  // Get cell topology for base hexahedron
  typedef shards::CellTopology    CellTopology;
  CellTopology quad_4(shards::getCellTopologyData<shards::Quadrilateral<4> >() );
  
  // Get dimensions 
  int numNodesPerElem = quad_4.getNodeCount();
  int spaceDim = quad_4.getDimension();
  
  // *********************************** GENERATE MESH ************************************
  
  *outStream << "Generating mesh ... \n\n";
  
  *outStream << "   NX" << "   NY\n";
  *outStream << std::setw(5) << NX <<
    std::setw(5) << NY << "\n\n";
  
  // Print mesh information
  int numElems = NX*NY;
  int numNodes = (NX+1)*(NY+1);
  *outStream << " Number of Elements: " << numElems << " \n";
  *outStream << "    Number of Nodes: " << numNodes << " \n\n";
  
  // Square
  double leftX = 0.0, rightX = 1.0;
  double leftY = 0.0, rightY = 1.0;

  // Mesh spacing
  double hx = (rightX-leftX)/((double)NX);
  double hy = (rightY-leftY)/((double)NY);

  // Get nodal coordinates
  FieldContainer<double> nodeCoord(numNodes, spaceDim);
  FieldContainer<int> nodeOnBoundary(numNodes);
  int inode = 0;
  for (int j=0; j<NY+1; j++) {
    for (int i=0; i<NX+1; i++) {
      nodeCoord(inode,0) = leftX + (double)i*hx;
      nodeCoord(inode,1) = leftY + (double)j*hy;
      if (j==0 || i==0 || j==NY || i==NX){
	nodeOnBoundary(inode)=1;
      }
      else {
	nodeOnBoundary(inode)=0;
      }
      inode++;
    }
  }
#define DUMP_DATA
#ifdef DUMP_DATA
  // Print nodal coords
  ofstream fcoordout("coords.dat");
  for (int i=0; i<numNodes; i++) {
    fcoordout << nodeCoord(i,0) <<" ";
    fcoordout << nodeCoord(i,1) <<"\n";
  }
  fcoordout.close();
#endif
  
  
  // Element to Node map
  // We'll keep it around, but this is only the DOFMap if you are in the lowest order case.
  FieldContainer<int> elemToNode(numElems, numNodesPerElem);
  int ielem = 0;
  for (int j=0; j<NY; j++) {
    for (int i=0; i<NX; i++) {
      elemToNode(ielem,0) = (NX + 1)*j + i;
      elemToNode(ielem,1) = (NX + 1)*j + i + 1;
      elemToNode(ielem,2) = (NX + 1)*(j + 1) + i + 1;
      elemToNode(ielem,3) = (NX + 1)*(j + 1) + i;
      ielem++;
    }
  }
#ifdef DUMP_DATA
  // Output connectivity
  ofstream fe2nout("elem2node.dat");
  for (int j=0; j<NY; j++) {
    for (int i=0; i<NX; i++) {
      int ielem = i + j * NX;
      for (int m=0; m<numNodesPerElem; m++){
	fe2nout << elemToNode(ielem,m) <<"  ";
      }
      fe2nout <<"\n";
    }
  }
  fe2nout.close();
#endif
  
  // ************************************ CUBATURE ************************************** 
  *outStream << "Getting cubature ... \n\n";
  
  // Get numerical integration points and weights
  DefaultCubatureFactory<double>  cubFactory;                                   
  int cubDegree = 2*deg;
  Teuchos::RCP<Cubature<double> > quadCub = cubFactory.create(quad_4, cubDegree); 
  
  int cubDim       = quadCub->getDimension();
  int numCubPoints = quadCub->getNumPoints();
  
  FieldContainer<double> cubPoints(numCubPoints, cubDim);
  FieldContainer<double> cubWeights(numCubPoints);
  
  quadCub->getCubature(cubPoints, cubWeights);
  

  // ************************************** BASIS ***************************************
  
  *outStream << "Getting basis ... \n\n";
  
  // Define basis 
  Basis_HGRAD_QUAD_Cn_FEM<double, FieldContainer<double> > quadHGradBasis(deg,POINTTYPE_SPECTRAL);
  int numFieldsG = quadHGradBasis.getCardinality();
  FieldContainer<double> quadGVals(numFieldsG, numCubPoints); 
  FieldContainer<double> quadGrads(numFieldsG, numCubPoints, spaceDim); 
  
  // Evaluate basis values and gradients at cubature points
  quadHGradBasis.getValues(quadGVals, cubPoints, OPERATOR_VALUE);
  quadHGradBasis.getValues(quadGrads, cubPoints, OPERATOR_GRAD);

  // create the local-global mapping for higher order elements
  FieldContainer<int> ltgMapping(numElems,numFieldsG);
  const int numDOF = (NX*deg+1)*(NY*deg+1);
  ielem=0;
  for (int j=0;j<NY;j++) {
    for (int i=0;i<NX;i++) {
      const int start = deg * j * ( NX * deg + 1 ) + i * deg;
      // loop over local dof on this cell
      int local_dof_cur=0;
      for (int vertical=0;vertical<=deg;vertical++) {
	for (int horizontal=0;horizontal<=deg;horizontal++) {
	  ltgMapping(ielem,local_dof_cur) = start + vertical*(NX*deg+1)+horizontal;
	  local_dof_cur++;
	}
      }
      ielem++;
    }
  }
#ifdef DUMP_DATA
  // Output ltg mapping
//   ofstream ltgout("ltg.dat");
//   for (int j=0; j<NY; j++) {
//     for (int i=0; i<NX; i++) {
//       int ielem = i + j * NX;
//       for (int m=0; m<numFieldsG; m++){
// 	ltgout << ltgMapping(ielem,m) <<"  ";
//       }
//       ltgout <<"\n";
//     }
//   }
//   ltgout.close();
#endif
  
  // ******** CREATE A SINGLE STIFFNESS MATRIX, WHICH IS REPLICATED ON ALL ELEMENTS *********
  *outStream << "Applying stiffness matrix and right hand side ... \n\n";

  // Settings and data structures for mass and stiffness matrices
  typedef CellTools<double>  CellTools;
  typedef FunctionSpaceTools fst;
  int numCells = 1; 

  // Container for nodes
  FieldContainer<double> refQuadNodes(numCells, numNodesPerElem, spaceDim);
  // Containers for Jacobian
  FieldContainer<double> refQuadJacobian(numCells, numCubPoints, spaceDim, spaceDim);
  FieldContainer<double> refQuadJacobInv(numCells, numCubPoints, spaceDim, spaceDim);
  FieldContainer<double> refQuadJacobDet(numCells, numCubPoints);
  // Containers for element HGRAD stiffness matrix
  FieldContainer<double> localStiffMatrix(numCells, numFieldsG, numFieldsG);
  FieldContainer<double> weightedMeasure(numCells, numCubPoints);
  FieldContainer<double> quadGradsTransformed(numCells, numFieldsG, numCubPoints, spaceDim);
  FieldContainer<double> quadGradsTransformedWeighted(numCells, numFieldsG, numCubPoints, spaceDim);
  // Containers for right hand side vectors
  FieldContainer<double> rhsData(numCells, numCubPoints);
  FieldContainer<double> localRHS(numCells, numFieldsG);
  FieldContainer<double> quadGValsTransformed(numCells, numFieldsG, numCubPoints);
  FieldContainer<double> quadGValsTransformedWeighted(numCells, numFieldsG, numCubPoints);
  // Container for cubature points in physical space
  FieldContainer<double> physCubPoints(numCells, numCubPoints, cubDim);
  
  // Global arrays in Epetra format 
  Epetra_SerialComm Comm;
  Epetra_Map globalMapG(numDOF, 0, Comm);
  Epetra_FEVector u(globalMapG);
  Epetra_FEVector Ku(globalMapG);
  u.Random();

  std::cout << "About to start ref element matrix\n";

  // ************************** Compute element HGrad stiffness matrices *******************************  
  refQuadNodes(0,0,0) = 0.0;
  refQuadNodes(0,0,1) = 0.0;
  refQuadNodes(0,1,0) = hx;
  refQuadNodes(0,1,1) = 0.0;
  refQuadNodes(0,2,0) = hx;
  refQuadNodes(0,2,1) = hy;
  refQuadNodes(0,3,0) = 0.0;
  refQuadNodes(0,3,1) = hy;

  // Compute cell Jacobians, their inverses and their determinants
  CellTools::setJacobian(refQuadJacobian, cubPoints, refQuadNodes, quad_4);
  CellTools::setJacobianInv(refQuadJacobInv, refQuadJacobian );
  CellTools::setJacobianDet(refQuadJacobDet, refQuadJacobian );
  
  // transform from [-1,1]^2 to [0,hx]x[0,hy]
  fst::HGRADtransformGRAD<double>(quadGradsTransformed, refQuadJacobInv, quadGrads);
      
  // compute weighted measure
  fst::computeCellMeasure<double>(weightedMeasure, refQuadJacobDet, cubWeights);

  // multiply values with weighted measure
  fst::multiplyMeasure<double>(quadGradsTransformedWeighted,
			       weightedMeasure, quadGradsTransformed);

  // integrate to compute element stiffness matrix
  fst::integrate<double>(localStiffMatrix,
			 quadGradsTransformed, quadGradsTransformedWeighted, COMP_BLAS);

  std::cout << "Finished with reference element matrix\n";

  
  // now we will scatter global degrees of freedom, apply the local stiffness matrix 
  // with BLAS, and then gather the results
  FieldContainer<double> uScattered(numElems,numFieldsG);
  FieldContainer<double> KuScattered(numElems,numFieldsG);

  // to extract info from u

  u.GlobalAssemble();

  Epetra_Time multTimer(Comm);

  Ku.PutScalar(0.0);
  Ku.GlobalAssemble();

  double *uVals = u[0];
  double *KuVals = Ku[0];

  Teuchos::BLAS<int,double> blas;
  Epetra_Time scatterTime(Comm);
  std::cout << "Scattering\n";
  // Scatter
  for (int k=0; k<numElems; k++) 
    {
      for (int i=0;i<numFieldsG;i++) 
	{
	  uScattered(k,i) = uVals[ltgMapping(k,i)];
	}
    }
  const double scatTime = scatterTime.ElapsedTime();
  std::cout << "Scattered in time " << scatTime << "\n";

  Epetra_Time blasTimer(Comm);
  blas.GEMM(Teuchos::NO_TRANS , Teuchos::NO_TRANS , 
	    numFieldsG , numElems, numFieldsG  , 
	    1.0 , 
	    &localStiffMatrix(0,0,0) , 
	    numFieldsG ,
	    &uScattered(0,0) , 
	    numFieldsG , 
	    0.0 , 
	     &KuScattered(0,0) , 
	    numFieldsG );
  const double blasTime = blasTimer.ElapsedTime();
  std::cout << "Element matrices applied in " << blasTime << "\n";

  Epetra_Time gatherTimer(Comm);
  // Gather
  for (int k=0;k<numElems;k++)
    {
      for (int i=0;i<numFieldsG;i++)
	{
	  KuVals[ltgMapping(k,i)] += KuScattered(k,i);
	}
    }

  const double gatherTime = gatherTimer.ElapsedTime();
  std::cout << "Gathered in " << gatherTime << "\n";
  

  const double applyTime = gatherTime + blasTime + scatTime;
  std::cout << "Time to do matrix-free product: " << applyTime << std::endl;
  
  
  std::cout << "End Result: TEST PASSED\n";
  
  // reset format state of std::cout
  std::cout.copyfmt(oldFormatState);
 Kokkos::finalize(); 
  return 0;
}
  KOKKOS_INLINE_FUNCTION
  int exampleDenseCholByBlocks(const OrdinalType mmin,
                               const OrdinalType mmax,
                               const OrdinalType minc,
                               const OrdinalType mb,
                               const int max_concurrency,
                               const int max_task_dependence,
                               const int team_size,
                               const bool check,
                               const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

    typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType;
    typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType;

    typedef DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> DenseHierMatrixBaseType;

    typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType;
    typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "DenseCholByBlocks:: test matrices "
         <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << endl;

    const size_t max_task_size = (3*sizeof(DenseTaskViewType)+196); // when 128 error
    //cout << "max task size = "<< max_task_size << endl;
    typename TaskFactoryType::policy_type policy(max_concurrency,
                                                 max_task_size,
                                                 max_task_dependence,
                                                 team_size);

    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      DenseMatrixBaseType TT("TT", m, m), AA("AA", m, m), AB("AB", m, m);

      // random T matrix
      for (ordinal_type j=0;j<AA.NumCols();++j) {
        for (ordinal_type i=0;i<AA.NumRows();++i)
          TT.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        TT.Value(j,j) = abs(TT.Value(j,j));
      }

      {
        Teuchos::BLAS<ordinal_type,value_type> blas;

        // should be square
        const ordinal_type nn = AA.NumRows();
        const ordinal_type kk = TT.NumRows();

        blas.HERK(Teuchos::UPPER_TRI, Teuchos::CONJ_TRANS,
                  nn, kk,
                  1.0,
                  TT.ValuePtr(), TT.ColStride(),
                  0.0,
                  AA.ValuePtr(), AA.ColStride());
        AB.copy(AA);
      }

      const double flop = get_flop_chol<value_type>(m);

#ifdef HAVE_SHYLUTACHO_MKL
      mkl_set_num_threads(1);
#endif
      int ierr = 0;
      if (check) {
        timer.reset();
        DenseTaskViewType A(&AB);
        ierr = Chol<Uplo::Upper,AlgoChol::ExternalLapack>::invoke
          (TaskFactoryType::Policy(),
           TaskFactoryType::Policy().member_single(),
           A);
        t = timer.seconds();
        if (ierr) 
          ERROR(">> Fail to perform Cholesky (serial) : no reference solution -> no numeric error information");
        cout << "DenseCholByBlocks:: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]" << endl;
      }

      {
        DenseHierMatrixBaseType HA;
        DenseMatrixHelper::flat2hier(AA, HA, mb, mb);

        DenseHierTaskViewType TA(&HA);
        timer.reset();
        auto future = TaskFactoryType::Policy().create_team
          (Chol<Uplo::Upper,AlgoChol::DenseByBlocks>
           ::TaskFunctor<DenseHierTaskViewType>(TA), 0);
        TaskFactoryType::Policy().spawn(future);
        Kokkos::Experimental::wait(TaskFactoryType::Policy());
        t = timer.seconds();
        cout << "DenseCholByBlocks:: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]" << endl;
      }

      if (!ierr && check) {
        typedef typename Teuchos::ScalarTraits<value_type>::magnitudeType real_type;
        real_type err = 0.0, norm = 0.0;
        for (ordinal_type j=0;j<AA.NumCols();++j)
          for (ordinal_type i=0;i<=j;++i) {
            const real_type diff = abs(AA.Value(i,j) - AB.Value(i,j));
            const real_type val  = AB.Value(i,j);
            err  += diff*diff;
            norm += val*val;
          }
        cout << "DenseCholByBlocks:: Check result::err = " << sqrt(err) << ", norm = " << sqrt(norm) << endl;
      }
    }

    return r_val;
  }
  int exampleDenseCholByBlocks(const ordinal_type mmin,
                               const ordinal_type mmax,
                               const ordinal_type minc,
                               const ordinal_type mb,
                               const int max_concurrency,
                               const int max_task_dependence,
                               const int team_size,
                               const int mkl_nthreads,
                               const bool check,
                               const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType;
    typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType;
    typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType;
    typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType;

    typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType;

    typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType;
    typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    std::cout << "DenseCholByBlocks:: test matrices "
              <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc 
              << " , mb = " << mb << std::endl;

    const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); 
    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);

    std::ostringstream os;
    os.precision(3);
    os << std::scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");
      
      // host matrices
      DenseMatrixBaseHostType AA_host("AA_host", m, m), AB_host("AB_host"), TT_host("TT_host");

      // random T matrix
      {
        TT_host.createConfTo(AA_host);
        for (ordinal_type j=0;j<TT_host.NumCols();++j) {
          for (ordinal_type i=0;i<TT_host.NumRows();++i)
            TT_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
          TT_host.Value(j,j) = std::fabs(TT_host.Value(j,j));
        }
      }
      // create SPD matrix
      {
        Teuchos::BLAS<ordinal_type,value_type> blas;

        blas.HERK(ArgUplo == Uplo::Upper ? Teuchos::UPPER_TRI : Teuchos::LOWER_TRI, 
                  Teuchos::CONJ_TRANS,
                  m, m,
                  1.0,
                  TT_host.ValuePtr(), TT_host.ColStride(),
                  0.0,
                  AA_host.ValuePtr(), AA_host.ColStride());

        // preserve a copy of A
        AB_host.createConfTo(AA_host);        
        DenseMatrixTools::copy(AB_host, AA_host);
      }

      const double flop = DenseFlopCount<value_type>::Chol(m);

#ifdef HAVE_SHYLUTACHO_MKL
      mkl_set_num_threads(mkl_nthreads);
#endif
      os << "DenseCholByBlocks:: m = " << m << "  ";
      int ierr = 0;
      if (check) {
        timer.reset();
        DenseMatrixViewHostType A_host(AB_host); 
        ierr = Chol<ArgUplo,AlgoChol::ExternalLapack,Variant::One>::invoke
          (policy, policy.member_single(),
           A_host);
        t = timer.seconds();
        TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)");
        os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      DenseMatrixBaseDeviceType AA_device("AA_device");
      {
        timer.reset();
        AA_device.mirror(AA_host);
        t = timer.seconds();
        os << ":: Mirror = " << t << " [sec]  ";
      }

      {
        DenseHierMatrixBaseDeviceType HA_device("HA_device");
        
        DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb);

        DenseHierTaskViewDeviceType TA_device(HA_device);

        timer.reset();
        auto future = policy.proc_create_team
          (Chol<ArgUplo,AlgoChol::DenseByBlocks,ArgVariant>
           ::createTaskFunctor(policy, TA_device), 
           0);
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        t = timer.seconds();
        os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      AA_host.mirror(AA_device);
      if (!ierr && check) {
        double err = 0.0, norm = 0.0;
        for (ordinal_type j=0;j<AA_host.NumCols();++j)
          for (ordinal_type i=0;i<=j;++i) {
            const double diff = abs(AA_host.Value(i,j) - AB_host.Value(i,j));
            const double val  = AB_host.Value(i,j);
            err  += diff*diff;
            norm += val*val;
          }
        os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err);
      }
      std::cout << os.str() << std::endl;
    }
    
    return r_val;
  }
  void Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Setup(const MultiVector& B, const MultiVector& Bc, RCP<const CrsGraph> Ppattern) {
    const size_t NSDim = Bc.getNumVectors();

    Ppattern_ = Ppattern;

    size_t numRows = Ppattern_->getNodeNumRows();
    XXtInv_.resize(numRows);

    RCP<const Import> importer = Ppattern_->getImporter();

    X_ = MultiVectorFactory::Build(Ppattern_->getColMap(), NSDim);
    if (!importer.is_null())
      X_->doImport(Bc, *importer, Xpetra::INSERT);
    else
      *X_ = Bc;

    std::vector<const SC*> Xval(NSDim);
    for (size_t j = 0; j < NSDim; j++)
      Xval[j] = X_->getData(j).get();

    SC zero = Teuchos::ScalarTraits<SC>::zero();
    SC one  = Teuchos::ScalarTraits<SC>::one();

    Teuchos::BLAS  <LO,SC> blas;
    Teuchos::LAPACK<LO,SC> lapack;
    LO lwork = 3*NSDim;
    ArrayRCP<LO> IPIV(NSDim);
    ArrayRCP<SC> WORK(lwork);

    for (size_t i = 0; i < numRows; i++) {
      Teuchos::ArrayView<const LO> indices;
      Ppattern_->getLocalRowView(i, indices);

      size_t nnz = indices.size();

      XXtInv_[i] = Teuchos::SerialDenseMatrix<LO,SC>(NSDim, NSDim, false/*zeroOut*/);
      Teuchos::SerialDenseMatrix<LO,SC>& XXtInv = XXtInv_[i];

      if (NSDim == 1) {
        SC d = zero;
        for (size_t j = 0; j < nnz; j++)
          d += Xval[0][indices[j]] * Xval[0][indices[j]];
        XXtInv(0,0) = one/d;

      } else {
        Teuchos::SerialDenseMatrix<LO,SC> locX(NSDim, nnz, false/*zeroOut*/);
        for (size_t j = 0; j < nnz; j++)
          for (size_t k = 0; k < NSDim; k++)
            locX(k,j) = Xval[k][indices[j]];

        // XXtInv_ = (locX*locX^T)^{-1}
        blas.GEMM(Teuchos::NO_TRANS, Teuchos::CONJ_TRANS, NSDim, NSDim, nnz,
                   one,   locX.values(),   locX.stride(),
                          locX.values(),   locX.stride(),
                  zero, XXtInv.values(), XXtInv.stride());

        LO info;
        // Compute LU factorization using partial pivoting with row exchanges
        lapack.GETRF(NSDim, NSDim, XXtInv.values(), XXtInv.stride(), IPIV.get(), &info);
        // Use the computed factorization to compute the inverse
        lapack.GETRI(NSDim, XXtInv.values(), XXtInv.stride(), IPIV.get(), WORK.get(), lwork, &info);
      }
    }
  }
  void Constraint<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Apply(const Matrix& P, Matrix& Projected) const {
    // We check only row maps. Column may be different.
    TEUCHOS_TEST_FOR_EXCEPTION(!P.getRowMap()->isSameAs(*Projected.getRowMap()), Exceptions::Incompatible,
                               "Row maps are incompatible");
    const size_t NSDim   = X_->getNumVectors();
    const size_t numRows = P.getNodeNumRows();

    const Map& colMap  = *P.getColMap();
    const Map& PColMap = *Projected.getColMap();

    Projected.resumeFill();

    Teuchos::ArrayView<const LO> indices, pindices;
    Teuchos::ArrayView<const SC> values,  pvalues;
    Teuchos::Array<SC> valuesAll(colMap.getNodeNumElements()), newValues;

    LO invalid = Teuchos::OrdinalTraits<LO>::invalid();
    LO oneLO   = Teuchos::OrdinalTraits<LO>::one();
    SC zero    = Teuchos::ScalarTraits<SC> ::zero();
    SC one     = Teuchos::ScalarTraits<SC> ::one();

    std::vector<const SC*> Xval(NSDim);
    for (size_t j = 0; j < NSDim; j++)
      Xval[j] = X_->getData(j).get();

    for (size_t i = 0; i < numRows; i++) {
      P        .getLocalRowView(i,  indices,  values);
      Projected.getLocalRowView(i, pindices, pvalues);

      size_t nnz  = indices.size();     // number of nonzeros in the supplied matrix
      size_t pnnz = pindices.size();    // number of nonzeros in the constrained matrix

      newValues.resize(pnnz);

      // Step 1: fix stencil
      // Projected *must* already have the correct stencil

      // Step 2: copy correct stencil values
      // The algorithm is very similar to the one used in the calculation of
      // Frobenius dot product, see src/Transfers/Energy-Minimization/Solvers/MueLu_CGSolver_def.hpp

      // NOTE: using extra array allows us to skip the search among indices
      for (size_t j = 0; j < nnz; j++)
        valuesAll[indices[j]] = values[j];
      for (size_t j = 0; j < pnnz; j++) {
        LO ind = colMap.getLocalElement(PColMap.getGlobalElement(pindices[j])); // FIXME: we could do that before the full loop just once
        if (ind != invalid)
          // index indices[j] is part of template, copy corresponding value
          newValues[j] = valuesAll[ind];
        else
          newValues[j] = zero;
      }
      for (size_t j = 0; j < nnz; j++)
        valuesAll[indices[j]] = zero;

      // Step 3: project to the space
      Teuchos::SerialDenseMatrix<LO,SC>& XXtInv = XXtInv_[i];

      Teuchos::SerialDenseMatrix<LO,SC> locX(NSDim, pnnz, false);
      for (size_t j = 0; j < pnnz; j++)
        for (size_t k = 0; k < NSDim; k++)
          locX(k,j) = Xval[k][pindices[j]];

      Teuchos::SerialDenseVector<LO,SC> val(pnnz, false), val1(NSDim, false), val2(NSDim, false);
      for (size_t j = 0; j < pnnz; j++)
        val[j] = newValues[j];

      Teuchos::BLAS<LO,SC> blas;
      // val1 = locX * val;
      blas.GEMV(Teuchos::NO_TRANS, NSDim, pnnz,
                one, locX.values(), locX.stride(),
                val.values(), oneLO,
                zero, val1.values(), oneLO);
      // val2 = XXtInv * val1
      blas.GEMV(Teuchos::NO_TRANS, NSDim, NSDim,
                one, XXtInv.values(), XXtInv.stride(),
                val1.values(), oneLO,
                zero,   val2.values(), oneLO);
      // val = X^T * val2
      blas.GEMV(Teuchos::CONJ_TRANS, NSDim, pnnz,
                one, locX.values(), locX.stride(),
                val2.values(), oneLO,
                zero,  val.values(), oneLO);

      for (size_t j = 0; j < pnnz; j++)
        newValues[j] -= val[j];

      Projected.replaceLocalValues(i, pindices, newValues);
    }

    Projected.fillComplete(Projected.getDomainMap(), Projected.getRangeMap()); //FIXME: maps needed?
  }
  KOKKOS_INLINE_FUNCTION
  int exampleDenseTrsmMKL(const OrdinalType mmin,
                          const OrdinalType mmax,
                          const OrdinalType minc,
                          const OrdinalType k,
                          const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "DenseGemmMKL:: test matrices "
         <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << endl;

    ostringstream os;
    os.precision(3);
    os << scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");

      DenseMatrixBaseType AA("AA", m, m), BB("BB", m, k), BC("BC", m, k);
      
      // setup upper triangular
      for (ordinal_type j=0;j<AA.NumCols();++j) {
        AA.Value(j,j) = 10.0;
        for (ordinal_type i=0;i<j;++i)
          AA.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
      }

      // setup one and right hand side is going to be overwritten by the product of AB
      for (ordinal_type j=0;j<BB.NumCols();++j)
        for (ordinal_type i=0;i<BB.NumRows();++i)
          BB.Value(i,j) = 1.0;

      Teuchos::BLAS<ordinal_type,value_type> blas;

      blas.GEMM(Teuchos::CONJ_TRANS, Teuchos::NO_TRANS,
                m, k, m,
                1.0,
                AA.ValuePtr(), AA.ColStride(),
                BB.ValuePtr(), BB.ColStride(),
                0.0,
                BC.ValuePtr(), BC.ColStride());
      BB.copy(BC);

      const double flop = get_flop_trsm_upper<value_type>(m, k);

      os << "DenseTrsmMKL:: m = " << m << " k = " << k;
      {
        timer.reset();
        Teuchos::BLAS<ordinal_type,value_type> blas;

        const ordinal_type mm = AA.NumRows();
        const ordinal_type nn = BB.NumCols();

        blas.TRSM(Teuchos::LEFT_SIDE, Teuchos::UPPER_TRI, Teuchos::CONJ_TRANS,
                  Teuchos::NON_UNIT_DIAG,
                  mm, nn,
                  1.0,
                  AA.ValuePtr(), AA.ColStride(),
                  BB.ValuePtr(), BB.ColStride());
        t = timer.seconds();
        os << ":: MKL Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }
      cout << os.str() << endl;
    }

    return r_val;
  }