void mkl_apply(
      KernelHandle *handle,
      typename KernelHandle::row_lno_t m,
      typename KernelHandle::row_lno_t n,
      typename KernelHandle::row_lno_t k,
      in_row_index_view_type row_mapA,
      in_nonzero_index_view_type entriesA,
      in_nonzero_value_view_type valuesA,

      bool transposeA,
      in_row_index_view_type row_mapB,
      in_nonzero_index_view_type entriesB,
      in_nonzero_value_view_type valuesB,
      bool transposeB,
      typename in_row_index_view_type::non_const_type &row_mapC,
      typename in_nonzero_index_view_type::non_const_type &entriesC,
      typename in_nonzero_value_view_type::non_const_type &valuesC){

#ifdef KERNELS_HAVE_MKL

    typedef typename KernelHandle::row_lno_t idx;
    typedef in_row_index_view_type idx_array_type;

    typedef typename KernelHandle::nnz_scalar_t value_type;


    typedef typename in_row_index_view_type::device_type device1;
    typedef typename in_nonzero_index_view_type::device_type device2;
    typedef typename in_nonzero_value_view_type::device_type device3;

    typedef typename KernelHandle::HandleExecSpace MyExecSpace;

    std::cout << "RUNNING MKL" << std::endl;

#if defined( KOKKOS_HAVE_CUDA )
    if (Kokkos::Impl::is_same<Kokkos::Cuda, typename device1::execution_space>::value){
      std::cerr << "MEMORY IS NOT ALLOCATED IN HOST DEVICE for MKL" << std::endl;
      return;
    }
    if (Kokkos::Impl::is_same<Kokkos::Cuda, typename device2::execution_space>::value){
      std::cerr << "MEMORY IS NOT ALLOCATED IN HOST DEVICE for MKL" << std::endl;
      return;
    }
    if (Kokkos::Impl::is_same<Kokkos::Cuda, typename device3::execution_space>::value){
      std::cerr << "MEMORY IS NOT ALLOCATED IN HOST DEVICE for MKL" << std::endl;
      return;
    }
#endif

    if (Kokkos::Impl::is_same<idx, int>::value){
      int *a_xadj = (int *)row_mapA.ptr_on_device();
      int *b_xadj = (int *)row_mapB.ptr_on_device();
      int *c_xadj = (int *)row_mapC.ptr_on_device();

      int *a_adj = (int *)entriesA.ptr_on_device();
      int *b_adj = (int *)entriesB.ptr_on_device();
      int *c_adj = (int *)entriesC.ptr_on_device();

      int nnzA = entriesA.dimension_0();
      int nnzB = entriesB.dimension_0();

      value_type *a_ew = valuesA.ptr_on_device();
      value_type *b_ew = valuesB.ptr_on_device();
      value_type *c_ew = valuesC.ptr_on_device();

      sparse_matrix_t A;
      sparse_matrix_t B;
      sparse_matrix_t C;
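      // Overall MKL flow: wrap the existing CSR arrays in sparse_matrix_t
      // handles without copying, call mkl_sparse_spmm, export the CSR arrays
      // of the result (owned by the C handle), copy them into freshly
      // allocated Kokkos views, and destroy all three handles.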

      if (Kokkos::Impl::is_same<value_type, float>::value){
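        // mkl_sparse_s_create_csr uses the 4-array CSR variant with separate
        // rows_start / rows_end pointers; for a standard CSR row map, passing
        // a_xadj and a_xadj + 1 covers both without any extra storage.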



        if (SPARSE_STATUS_SUCCESS != mkl_sparse_s_create_csr (&A, SPARSE_INDEX_BASE_ZERO, m, n, a_xadj, a_xadj + 1, a_adj, (float *)a_ew)){
          std::cerr << "CANNOT CREATE mkl_sparse_s_create_csr A" << std::endl;
          return;
        }

        if (SPARSE_STATUS_SUCCESS != mkl_sparse_s_create_csr (&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj, b_xadj + 1, b_adj, (float *)b_ew)){
          std::cerr << "CANNOT CREATE mkl_sparse_s_create_csr B" << std::endl;
          return;
        }


        sparse_operation_t operation;
        if (transposeA && transposeB){
          operation = SPARSE_OPERATION_TRANSPOSE;
        }
        else if (!(transposeA || transposeB)){
          operation = SPARSE_OPERATION_NON_TRANSPOSE;
        }
        else {
          std::cerr << "Ask both to transpose or non transpose for MKL SPGEMM" << std::endl;
          return;
        }


        Kokkos::Impl::Timer timer1;
        bool failed = SPARSE_STATUS_SUCCESS != mkl_sparse_spmm (operation, A, B, &C);
        std::cout << "Actual FLOAT MKL SPMM Time:" << timer1.seconds() << std::endl;

        if (failed){
          std::cerr << "CANNOT multiply mkl_sparse_spmm " << std::endl;
          return;
        }
        else{

          sparse_index_base_t c_indexing;
          MKL_INT c_rows, c_cols, *rows_start, *rows_end, *columns;
          float *values;
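          // mkl_sparse_s_export_csr returns pointers into buffers owned by the
          // C handle, so the data must be copied into the output views before
          // mkl_sparse_destroy(C) is called below.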

          if (SPARSE_STATUS_SUCCESS !=
              mkl_sparse_s_export_csr (C,
                  &c_indexing, &c_rows, &c_cols, &rows_start, &rows_end, &columns, &values)){
            std::cerr << "CANNOT export result matrix " << std::endl;
            return;
          }

          if (SPARSE_INDEX_BASE_ZERO != c_indexing){
            std::cerr << "C is not zero based indexed." << std::endl;
            return;
          }


          row_mapC = typename in_row_index_view_type::non_const_type(Kokkos::ViewAllocateWithoutInitializing("rowmapC"), c_rows + 1);
          entriesC = typename in_nonzero_index_view_type::non_const_type (Kokkos::ViewAllocateWithoutInitializing("EntriesC") , rows_end[m - 1] );
          valuesC = typename in_nonzero_value_view_type::non_const_type (Kokkos::ViewAllocateWithoutInitializing("valuesC") ,  rows_end[m - 1]);

          KokkosKernels::Experimental::Util::copy_vector<MKL_INT *, typename in_row_index_view_type::non_const_type, MyExecSpace> (m, rows_start, row_mapC);
          idx nnz = row_mapC(m) =  rows_end[m - 1];

          KokkosKernels::Experimental::Util::copy_vector<MKL_INT *, typename in_nonzero_index_view_type::non_const_type , MyExecSpace> (nnz, columns, entriesC);
          KokkosKernels::Experimental::Util::copy_vector<float *, typename in_nonzero_value_view_type::non_const_type, MyExecSpace> (nnz, values, valuesC);
        }


        if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy (A)){
          std::cerr << "CANNOT DESTROY mkl_sparse_destroy A" << std::endl;
          return;
        }

        if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy (B)){
          std::cerr << "CANNOT DESTROY mkl_sparse_destroy B" << std::endl;
          return;
        }
        if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy (C)){
          std::cerr << "CANNOT DESTROY mkl_sparse_destroy C" << std::endl;
          return;
        }
      }
      else if (Kokkos::Impl::is_same<value_type, double>::value){

        /*
        std::cout << "create a" << std::endl;
        std::cout << "m:" << m << " n:" << n << std::endl;
        std::cout << "a_xadj[0]:" << a_xadj[0] << " a_xadj[m]:" << a_xadj[m] << std::endl;
        std::cout << "a_adj[a_xadj[m] - 1]:" << a_adj[a_xadj[m] - 1] << " a_ew[a_xadj[m] - 1]:" << a_ew[a_xadj[m] - 1] << std::endl;
        */
        if (SPARSE_STATUS_SUCCESS != mkl_sparse_d_create_csr (&A, SPARSE_INDEX_BASE_ZERO, m, n, a_xadj, a_xadj + 1, a_adj, (double *)a_ew)){
          std::cerr << "CANNOT CREATE mkl_sparse_d_create_csr A" << std::endl;
          return;
        }

        //std::cout << "create b" << std::endl;
        if (SPARSE_STATUS_SUCCESS != mkl_sparse_d_create_csr (&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj, b_xadj + 1, b_adj, (double *) b_ew)){
          std::cerr << "CANNOT CREATE mkl_sparse_d_create_csr B" << std::endl;
          return;
        }

        sparse_operation_t operation;
        if (transposeA && transposeB){
          operation = SPARSE_OPERATION_TRANSPOSE;
        }
        else if (!(transposeA || transposeB)){
          operation = SPARSE_OPERATION_NON_TRANSPOSE;
        }
        else {
          std::cerr << "Ask both to transpose or non transpose for MKL SPGEMM" << std::endl;
          return;
        }


        Kokkos::Impl::Timer timer1;
        bool failed = SPARSE_STATUS_SUCCESS != mkl_sparse_spmm (operation, A, B, &C);
        std::cout << "Actual DOUBLE MKL SPMM Time:" << timer1.seconds() << std::endl;

        if (failed){
          std::cerr << "CANNOT multiply mkl_sparse_spmm " << std::endl;
          return;
        }
        else{


          sparse_index_base_t c_indexing;
          MKL_INT c_rows, c_cols, *rows_start, *rows_end, *columns;
          double *values;

          if (SPARSE_STATUS_SUCCESS !=
              mkl_sparse_d_export_csr (C,
                  &c_indexing, &c_rows, &c_cols, &rows_start, &rows_end, &columns, &values)){
            std::cerr << "CANNOT export result matrix " << std::endl;
            return;
          }

          if (SPARSE_INDEX_BASE_ZERO != c_indexing){
            std::cerr << "C is not zero based indexed." << std::endl;
            return;
          }
          {
            Kokkos::Impl::Timer copy_time;
            row_mapC = typename in_row_index_view_type::non_const_type(Kokkos::ViewAllocateWithoutInitializing("rowmapC"), c_rows + 1);
            entriesC = typename in_nonzero_index_view_type::non_const_type (Kokkos::ViewAllocateWithoutInitializing("EntriesC") , rows_end[m - 1] );
            valuesC = typename in_nonzero_value_view_type::non_const_type (Kokkos::ViewAllocateWithoutInitializing("valuesC") ,  rows_end[m - 1]);

            KokkosKernels::Experimental::Util::copy_vector<MKL_INT *, typename in_row_index_view_type::non_const_type, MyExecSpace> (m, rows_start, row_mapC);
            idx nnz = row_mapC(m) =  rows_end[m - 1];

            KokkosKernels::Experimental::Util::copy_vector<MKL_INT *, typename in_nonzero_index_view_type::non_const_type, MyExecSpace> (nnz, columns, entriesC);
            KokkosKernels::Experimental::Util::copy_vector<double *, typename in_nonzero_value_view_type::non_const_type, MyExecSpace> (nnz, values, valuesC);
            double copy_time_d = copy_time.seconds();
            std::cout << "MKL COPYTIME:" << copy_time_d << std::endl;
          }

        }


        if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy (A)){
          std::cerr << "CANNOT DESTROY mkl_sparse_destroy A" << std::endl;
          return;
        }

        if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy (B)){
          std::cerr << "CANNOT DESTROY mkl_sparse_destroy B" << std::endl;
          return;
        }
        if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy (C)){
          std::cerr << "CANNOT DESTROY mkl_sparse_destroy C" << std::endl;
          return;
        }

      }
      else {
        std::cerr << "CUSPARSE requires float or double values. cuComplex and cuDoubleComplex are not implemented yet." << std::endl;
        return;
      }
    }
    else {

      //int *a_xadj = row_mapA.ptr_on_device();
      std::cerr << "MKL requires integer values" << std::endl;

      if (Kokkos::Impl::is_same<idx, unsigned int>::value){
        std::cerr << "MKL is given unsigned integer" << std::endl;
      }
      else if (Kokkos::Impl::is_same<idx, long>::value){
        std::cerr << "MKL is given long" << std::endl;
      }
      else if (Kokkos::Impl::is_same<idx, const int>::value){
        std::cerr << "MKL is given const int" << std::endl;
      }
      else if (Kokkos::Impl::is_same<idx, unsigned long>::value){
        std::cerr << "MKL is given unsigned long" << std::endl;
      }
      else if (Kokkos::Impl::is_same<idx, const unsigned long>::value){
        std::cerr << "MKL is given const unsigned long" << std::endl;
      }
      else{
        std::cerr << "MKL is given something else" << std::endl;
      }
      return;
    }
#else
    std::cerr << "MKL IS NOT DEFINED" << std::endl;
    return;
#endif
  }
  void viennaCL_apply(
      KernelHandle *handle,
      typename KernelHandle::nnz_lno_t m,
      typename KernelHandle::nnz_lno_t n,
      typename KernelHandle::nnz_lno_t k,
      in_row_index_view_type row_mapA,
      in_nonzero_index_view_type entriesA,
      in_nonzero_value_view_type valuesA,

      bool transposeA,
      bin_row_index_view_type row_mapB,
      bin_nonzero_index_view_type entriesB,
      bin_nonzero_value_view_type valuesB,
      bool transposeB,
      cin_row_index_view_type &row_mapC,
      cin_nonzero_index_view_type &entriesC,
      cin_nonzero_value_view_type &valuesC){

#ifdef KERNELS_HAVE_VIENNACL

    typedef typename KernelHandle::nnz_lno_t idx;
    typedef in_row_index_view_type idx_array_type;

    typedef typename KernelHandle::nnz_scalar_t value_type;


    typedef typename in_row_index_view_type::device_type device1;
    typedef typename in_nonzero_index_view_type::device_type device2;
    typedef typename in_nonzero_value_view_type::device_type device3;

    typedef typename KernelHandle::HandleExecSpace MyExecSpace;

    std::cout << "RUNNING VIENNACL" << std::endl;

    typedef typename viennacl::compressed_matrix<value_type>::handle_type it;
    typedef typename viennacl::compressed_matrix<value_type>::value_type vt;

    if ((Kokkos::Impl::is_same<idx, int>::value && Kokkos::Impl::is_same<typename KernelHandle::size_type, int>::value )||
        (Kokkos::Impl::is_same<idx, unsigned int>::value && Kokkos::Impl::is_same<typename KernelHandle::size_type, unsigned int>::value ) ||
        (Kokkos::Impl::is_same<idx, it>::value && Kokkos::Impl::is_same<typename KernelHandle::size_type, it>::value )
        ){

      unsigned int * a_xadj = (unsigned int *)row_mapA.ptr_on_device();
      unsigned int * b_xadj = (unsigned int * )row_mapB.ptr_on_device();
      unsigned int * c_xadj = (unsigned int * )row_mapC.ptr_on_device();

      unsigned int * a_adj = (unsigned int * )entriesA.ptr_on_device();
      unsigned int * b_adj = (unsigned int * )entriesB.ptr_on_device();
      unsigned int * c_adj = (unsigned int * )entriesC.ptr_on_device();

      int nnzA = entriesA.dimension_0();
      int nnzB = entriesB.dimension_0();

      value_type *a_ew = valuesA.ptr_on_device();
      value_type *b_ew = valuesB.ptr_on_device();
      value_type *c_ew = valuesC.ptr_on_device();



      /*
        std::cout << "create a" << std::endl;
        std::cout << "m:" << m << " n:" << n << std::endl;
        std::cout << "a_xadj[0]:" << a_xadj[0] << " a_xadj[m]:" << a_xadj[m] << std::endl;
        std::cout << "a_adj[a_xadj[m] - 1]:" << a_adj[a_xadj[m] - 1] << " a_ew[a_xadj[m] - 1]:" << a_ew[a_xadj[m] - 1] << std::endl;
       */


      Kokkos::Impl::Timer timerset;

      viennacl::compressed_matrix<value_type> A;
      viennacl::compressed_matrix<value_type> B;
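      // compressed_matrix::set() takes copies of the raw CSR arrays (row
      // pointer, column indices, values) into ViennaCL's own buffers, so
      // building A and B here is a full copy; that is why the setup is timed
      // separately from the multiply below.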
      A.set(a_xadj, a_adj, a_ew, m, n, nnzA);
      B.set(b_xadj, b_adj, b_ew, n, k, nnzB);
      std::cout << "compress matrix create:" << timerset.seconds() << std::endl;


      std::cout << "Now running ViennaCL" << std::endl;
      Kokkos::Impl::Timer timer1;
      viennacl::compressed_matrix<value_type> C = viennacl::linalg::prod(A, B);
      std::cout << "Actual VIENNACL SPMM Time:" << timer1.seconds() << std::endl;

      {



        unsigned int c_rows = m, c_cols = k, cnnz = C.nnz();
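        // For ViennaCL's host-based backend, handle() holds the values,
        // handle1() the row pointer, and handle2() the column indices of C;
        // extract_raw_pointer gives direct access to those buffers so they can
        // be copied into the output views below.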


        value_type   const * values   = viennacl::linalg::host_based::detail::extract_raw_pointer<value_type>(C.handle());
        unsigned int const * rows_start = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(C.handle1());
        unsigned int const * columns = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(C.handle2());


        {
          Kokkos::Impl::Timer copy_time;
          row_mapC = typename cin_row_index_view_type::non_const_type(Kokkos::ViewAllocateWithoutInitializing("rowmapC"), c_rows + 1);
          entriesC = typename cin_nonzero_index_view_type::non_const_type (Kokkos::ViewAllocateWithoutInitializing("EntriesC") , cnnz);
          valuesC = typename cin_nonzero_value_view_type::non_const_type (Kokkos::ViewAllocateWithoutInitializing("valuesC") ,  cnnz);

          KokkosKernels::Experimental::Util::copy_vector<unsigned int const *, typename cin_row_index_view_type::non_const_type, MyExecSpace> (m + 1, rows_start, row_mapC);
          idx nnz = cnnz;

          KokkosKernels::Experimental::Util::copy_vector<unsigned int const *, typename cin_nonzero_index_view_type::non_const_type, MyExecSpace> (nnz, columns, entriesC);
          KokkosKernels::Experimental::Util::copy_vector<value_type   const *, typename cin_nonzero_value_view_type::non_const_type, MyExecSpace> (nnz, values, valuesC);
          double copy_time_d = copy_time.seconds();
          std::cout << "VIENNACL COPYTIME:" << copy_time_d << std::endl;
        }

      }
    }
    else {

      //int *a_xadj = row_mapA.ptr_on_device();
      std::cerr << "vienna requires (u) integer values" << std::endl;

      if (Kokkos::Impl::is_same<idx, long>::value){
        std::cerr << "MKL is given long" << std::endl;
      }
      else if (Kokkos::Impl::is_same<idx, const int>::value){
        std::cerr << "MKL is given const int" << std::endl;
      }
      else if (Kokkos::Impl::is_same<idx, unsigned long>::value){
        std::cerr << "MKL is given unsigned long" << std::endl;
      }
      else if (Kokkos::Impl::is_same<idx, const unsigned long>::value){
        std::cerr << "MKL is given const unsigned long" << std::endl;
      }
      else{
        std::cerr << "MKL is given something else" << std::endl;
      }
      return;
    }
#else
    std::cerr << "VIENNACL IS NOT DEFINED" << std::endl;
    return;
#endif
  }
  void cuSPARSE_symbolic(
      KernelHandle *handle,
      typename KernelHandle::row_lno_t m,
      typename KernelHandle::row_lno_t n,
      typename KernelHandle::row_lno_t k,
      in_row_index_view_type row_mapA,
      in_nonzero_index_view_type entriesA,

      bool transposeA,
      in_row_index_view_type row_mapB,
      in_nonzero_index_view_type entriesB,
      bool transposeB,
      typename in_row_index_view_type::non_const_type &row_mapC,
      typename in_nonzero_index_view_type::non_const_type &entriesC){

#ifdef KERNELS_HAVE_CUSPARSE

    typedef typename in_row_index_view_type::device_type device1;
    typedef typename in_nonzero_index_view_type::device_type device2;

    typedef typename KernelHandle::row_lno_t idx;
    typedef typename in_row_index_view_type::non_const_type idx_array_type;


    if (!Kokkos::Impl::is_same<Kokkos::Cuda, typename device1::execution_space>::value){
      std::cerr << "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSPARSE" << std::endl;
      return;
    }
    if (!Kokkos::Impl::is_same<Kokkos::Cuda, typename device2::execution_space>::value){
      std::cerr << "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSPARSE" << std::endl;
      return;
    }

    if (Kokkos::Impl::is_same<idx, int>::value){
      row_mapC = idx_array_type("rowMapC", m + 1);
      const idx *a_xadj = row_mapA.ptr_on_device();
      const idx *b_xadj = row_mapB.ptr_on_device();
      idx *c_xadj = row_mapC.ptr_on_device();

      const idx *a_adj = entriesA.ptr_on_device();
      const idx *b_adj = entriesB.ptr_on_device();
      handle->create_cuSPARSE_Handle(transposeA, transposeB);
      typename KernelHandle::SPGEMMcuSparseHandleType *h = handle->get_cuSparseHandle();

      int nnzA = entriesA.dimension_0();
      int nnzB = entriesB.dimension_0();

      int baseC, nnzC;
      int *nnzTotalDevHostPtr = &nnzC;
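      // Symbolic phase: cusparseXcsrgemmNnz fills the row pointer of C and
      // reports the total number of nonzeros, either through the host pointer
      // nnzTotalDevHostPtr or, if that is NULL, through the last entry of
      // c_xadj on the device (fallback below).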

      cusparseXcsrgemmNnz(h->handle,
                          h->transA,
                          h->transB,
                          (int)m,
                          (int)n,
                          (int)k,
                          h->a_descr,
                          nnzA,
                          (int *) a_xadj,
                          (int *)a_adj,
                          h->b_descr,
                          nnzB,
                          (int *)b_xadj,
                          (int *)b_adj,
                          h->c_descr,
                          (int *)c_xadj,
                          nnzTotalDevHostPtr );

      if (NULL != nnzTotalDevHostPtr){
          nnzC = *nnzTotalDevHostPtr;
      }else{
          cudaMemcpy(&nnzC, c_xadj+m, sizeof(int), cudaMemcpyDeviceToHost);
          cudaMemcpy(&baseC, c_xadj, sizeof(int), cudaMemcpyDeviceToHost);
          nnzC -= baseC;
      }
      entriesC = typename in_nonzero_index_view_type::non_const_type("entriesC", nnzC);
    }
    else {
      std::cerr << "CUSPARSE requires integer values" << std::endl;
      return;
    }
#else
    std::cerr << "CUSPARSE IS NOT DEFINED" << std::endl;
    return;
#endif

  }
void CUSP_apply(
    KernelHandle *handle,
    typename KernelHandle::row_index_type m,
    typename KernelHandle::row_index_type n,
    typename KernelHandle::row_index_type k,
    in_row_index_view_type row_mapA,
    in_nonzero_index_view_type entriesA,
    in_nonzero_value_view_type valuesA,

    bool transposeA,
    in_row_index_view_type row_mapB,
    in_nonzero_index_view_type entriesB,
    in_nonzero_value_view_type valuesB,
    bool transposeB,
    in_row_index_view_type &row_mapC,
    in_nonzero_index_view_type &entriesC,
    in_nonzero_value_view_type &valuesC){
#ifdef KERNELS_HAVE_CUSP
  typedef typename KernelHandle::row_index_type idx;
  typedef typename KernelHandle::nonzero_value_type value_type;

  typedef typename in_row_index_view_type::device_type device1;
  typedef typename in_nonzero_index_view_type::device_type device2;
  typedef typename in_nonzero_value_view_type::device_type device3;

  std::cout << "RUNNING CUSP" << std::endl;
  if (!Kokkos::Impl::is_same<Kokkos::Cuda, typename device1::execution_space>::value){
    std::cerr << "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSP" << std::endl;
    return;
  }
  if (!Kokkos::Impl::is_same<Kokkos::Cuda, typename device2::execution_space>::value){
    std::cerr << "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSP" << std::endl;
    return;
  }
  if (!Kokkos::Impl::is_same<Kokkos::Cuda, typename device3::execution_space>::value){
    std::cerr << "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSP" << std::endl;
    return;
  }

  typedef in_row_index_view_type idx_array_type;

  typedef typename Kokkos::RangePolicy<typename KernelHandle::HandleExecSpace> my_exec_space;




  idx nnzA = entriesA.dimension_0();
  idx nnzB = entriesB.dimension_0();


  idx *a_xadj = (idx *)row_mapA.ptr_on_device();
  idx *b_xadj = (idx *)row_mapB.ptr_on_device();


  idx *a_adj = (idx *)entriesA.ptr_on_device();
  idx *b_adj = (idx *)entriesB.ptr_on_device();

  value_type *a_ew = valuesA.ptr_on_device();
  value_type *b_ew = valuesB.ptr_on_device();

  /*
  thrust::device_ptr<idx> dev_a_xadj(a_xadj);
  thrust::device_ptr<idx> dev_a_adj(a_adj);
  thrust::device_ptr<idx> dev_b_xadj(b_xadj);
  thrust::device_ptr<idx> dev_b_adj(b_adj);
  thrust::device_ptr<value_type> dev_a_ew(a_ew);
  thrust::device_ptr<value_type> dev_b_ew(b_ew);

  */
  typedef typename cusp::array1d_view< thrust::device_ptr<idx> > IDXArray1dView;
  typedef typename cusp::array1d_view< thrust::device_ptr<value_type> > VALUEArray1dView;
  //typedef typename cusp::array1d<idx, cusp::device_memory> IDXArray1dView;
  //typedef typename cusp::array1d<value_type, cusp::device_memory> VALUEArray1dView;
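  // Wrap the existing Kokkos device allocations with thrust::device_ptr and
  // cusp::array1d_view so A and B can be handed to CUSP as csr_matrix_view
  // objects without copying any data.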
  IDXArray1dView arraya_xadj(thrust::device_pointer_cast(a_xadj), thrust::device_pointer_cast(a_xadj) + m + 1);
  IDXArray1dView arraya_adj(thrust::device_pointer_cast(a_adj), thrust::device_pointer_cast(a_adj) + nnzA);
  IDXArray1dView arrayb_xadj(thrust::device_pointer_cast(b_xadj), thrust::device_pointer_cast(b_xadj) + n + 1);
  IDXArray1dView arrayb_adj(thrust::device_pointer_cast(b_adj), thrust::device_pointer_cast(b_adj) + nnzB);
  VALUEArray1dView arraya_ew(thrust::device_pointer_cast(a_ew), thrust::device_pointer_cast(a_ew) + nnzA);
  VALUEArray1dView arrayb_ew(thrust::device_pointer_cast(b_ew), thrust::device_pointer_cast(b_ew)+ nnzB);

  typedef typename cusp::csr_matrix_view<IDXArray1dView, IDXArray1dView, VALUEArray1dView, idx,value_type,cusp::device_memory> cuspMatrix_View;

  cuspMatrix_View A(m, n, entriesA.dimension_0(), arraya_xadj, arraya_adj, arraya_ew);
  cuspMatrix_View B(n, k, entriesB.dimension_0(), arrayb_xadj, arrayb_adj, arrayb_ew);

  /*
  CopyArrayToCuspArray<typename cuspMatrix::row_offsets_array_type, typename KernelHandle::idx_array_type> Aforward(A.row_offsets, row_mapA);
  Kokkos::parallel_for (my_exec_space (0, m + 1) , Aforward);
  Kokkos::parallel_for (my_exec_space (0, n + 1) , CopyArrayToCuspArray<typename cuspMatrix::row_offsets_array_type, typename KernelHandle::idx_array_type>(B.row_offsets, row_mapB));

  Kokkos::parallel_for (my_exec_space (0, entriesA.dimension_0()) , CopyArrayToCuspArray<typename cuspMatrix::column_indices_array_type, typename KernelHandle::idx_edge_array_type>(A.column_indices, entriesA));
  Kokkos::parallel_for (my_exec_space (0, entriesB.dimension_0()) , CopyArrayToCuspArray<typename cuspMatrix::column_indices_array_type, typename KernelHandle::idx_edge_array_type>(B.column_indices, entriesB));

  Kokkos::parallel_for (my_exec_space (0, valuesA.dimension_0()) , CopyArrayToCuspArray<typename cuspMatrix::values_array_type, typename KernelHandle::value_array_type>(A.values, valuesA));
  Kokkos::parallel_for (my_exec_space (0, valuesB.dimension_0()) , CopyArrayToCuspArray<typename cuspMatrix::values_array_type, typename KernelHandle::value_array_type>(B.values, valuesB));
  */

  typedef typename cusp::csr_matrix<idx,value_type,cusp::device_memory> cuspMatrix;
  //typedef cuspMatrix_View cuspMatrix;
  cuspMatrix C;
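  // cusp::multiply sizes and fills C, an owning csr_matrix in device memory;
  // its row offsets, column indices, and values are copied back into the
  // Kokkos output views below.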



  cusp::multiply(A,B,C);

  std::cout << " C.column_indices.size():" <<  C.column_indices.size() << std::endl;
  std::cout << " C.values.size():" <<  C.values.size() << std::endl;
  row_mapC = typename in_row_index_view_type::non_const_type("rowmapC", m + 1);
  entriesC = typename in_nonzero_index_view_type::non_const_type ("EntriesC" ,  C.column_indices.size());
  valuesC = typename in_nonzero_value_view_type::non_const_type ("valuesC" ,  C.values.size());

  Kokkos::parallel_for (my_exec_space (0, m + 1) , CopyArrayToCuspArray<in_row_index_view_type,
      idx >(row_mapC, (idx *) thrust::raw_pointer_cast(C.row_offsets.data())));
  Kokkos::parallel_for (my_exec_space (0, C.column_indices.size()) , CopyArrayToCuspArray<in_nonzero_index_view_type,
      idx >(entriesC, (idx *) thrust::raw_pointer_cast(C.column_indices.data())));
  Kokkos::parallel_for (my_exec_space (0, C.values.size()) , CopyArrayToCuspArray<in_nonzero_value_view_type,
      value_type>(valuesC, (value_type *) thrust::raw_pointer_cast(C.values.data())));

#else
  std::cerr << "CUSP IS NOT DEFINED" << std::endl;
  return;
#endif
}
  void cuSPARSE_apply(
      KernelHandle *handle,
      typename KernelHandle::row_lno_t m,
      typename KernelHandle::row_lno_t n,
      typename KernelHandle::row_lno_t k,
      in_row_index_view_type row_mapA,
      in_nonzero_index_view_type entriesA,
      in_nonzero_value_view_type valuesA,

      bool transposeA,
      in_row_index_view_type row_mapB,
      in_nonzero_index_view_type entriesB,
      in_nonzero_value_view_type valuesB,
      bool transposeB,
      typename in_row_index_view_type::non_const_type &row_mapC,
      typename in_nonzero_index_view_type::non_const_type &entriesC,
      typename in_nonzero_value_view_type::non_const_type &valuesC){

#ifdef KERNELS_HAVE_CUSPARSE
    typedef typename KernelHandle::row_lno_t idx;
    typedef in_row_index_view_type idx_array_type;

    typedef typename KernelHandle::nnz_scalar_t value_type;


    typedef typename in_row_index_view_type::device_type device1;
    typedef typename in_nonzero_index_view_type::device_type device2;
    typedef typename in_nonzero_value_view_type::device_type device3;
    std::cout << "RUNNING CUSParse" << std::endl;

    if (!Kokkos::Impl::is_same<Kokkos::Cuda, typename device1::execution_space>::value){
      std::cerr << "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSPARSE" << std::endl;
      return;
    }
    if (!Kokkos::Impl::is_same<Kokkos::Cuda, typename device2::execution_space>::value){
      std::cerr << "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSPARSE" << std::endl;
      return;
    }
    if (!Kokkos::Impl::is_same<Kokkos::Cuda, typename device3::execution_space>::value){
      std::cerr << "MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSPARSE" << std::endl;
      return;
    }



    if (Kokkos::Impl::is_same<idx, int>::value){
      int *a_xadj = (int *)row_mapA.ptr_on_device();
      int *b_xadj = (int *)row_mapB.ptr_on_device();
      int *c_xadj = (int *)row_mapC.ptr_on_device();

      int *a_adj = (int *)entriesA.ptr_on_device();
      int *b_adj = (int *)entriesB.ptr_on_device();
      int *c_adj = (int *)entriesC.ptr_on_device();


      typename KernelHandle::SPGEMMcuSparseHandleType *h = handle->get_cuSparseHandle();

      int nnzA = entriesA.dimension_0();
      int nnzB = entriesB.dimension_0();

      value_type *a_ew = valuesA.ptr_on_device();
      value_type *b_ew = valuesB.ptr_on_device();
      value_type *c_ew = valuesC.ptr_on_device();
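      // Numeric phase: this assumes cuSPARSE_symbolic above has already
      // created the cuSPARSE handle and allocated row_mapC and entriesC, and
      // that valuesC has been sized to match by the caller (assumed, not
      // checked here); csrgemm then fills the column indices and values of C.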

      if (Kokkos::Impl::is_same<value_type, float>::value){
        cusparseScsrgemm(
            h->handle,
            h->transA,
            h->transB,
            m,
            n,
            k,
            h->a_descr,
            nnzA,
            (float *)a_ew,
            a_xadj,
            a_adj,
            h->b_descr,
            nnzB,
            (float *)b_ew,
            b_xadj,
            b_adj,
            h->c_descr,
            (float *)c_ew,
            c_xadj,
            c_adj);
      }
      else if (Kokkos::Impl::is_same<value_type, double>::value){
        cusparseDcsrgemm(
            h->handle,
            h->transA,
            h->transB,
            m,
            n,
            k,
            h->a_descr,
            nnzA,
            (double *)a_ew,
            a_xadj,
            a_adj,
            h->b_descr,
            nnzB,
            (double *)b_ew,
            b_xadj,
            b_adj,
            h->c_descr,
            (double *)c_ew,
            c_xadj,
            c_adj);
      }
      else {
        std::cerr << "CUSPARSE requires float or double values. cuComplex and cuDoubleComplex are not implemented yet." << std::endl;
        return;
      }




    }
    else {
      std::cerr << "CUSPARSE requires integer values" << std::endl;
      return;
    }
#else
    std::cerr << "CUSPARSE IS NOT DEFINED" << std::endl;
    return;
#endif
  }