void ilu_transpose(compressed_matrix<NumericT> const & A,
                   compressed_matrix<NumericT>       & B)
{
  viennacl::context orig_ctx = viennacl::traits::context(A);
  viennacl::context cpu_ctx(viennacl::MAIN_MEMORY);
  (void)orig_ctx;
  (void)cpu_ctx;

  viennacl::compressed_matrix<NumericT> A_host(0, 0, 0, cpu_ctx);
  (void)A_host;

  switch (viennacl::traits::handle(A).get_active_handle_id())
  {
  case viennacl::MAIN_MEMORY:
    viennacl::linalg::host_based::ilu_transpose(A, B);
    break;
#ifdef VIENNACL_WITH_OPENCL
  case viennacl::OPENCL_MEMORY:
    A_host = A;
    B.switch_memory_context(cpu_ctx);
    viennacl::linalg::host_based::ilu_transpose(A_host, B);
    B.switch_memory_context(orig_ctx);
    break;
#endif
#ifdef VIENNACL_WITH_CUDA
  case viennacl::CUDA_MEMORY:
    A_host = A;
    B.switch_memory_context(cpu_ctx);
    viennacl::linalg::host_based::ilu_transpose(A_host, B);
    B.switch_memory_context(orig_ctx);
    break;
#endif
  case viennacl::MEMORY_NOT_INITIALIZED:
    throw memory_exception("not initialised!");
  default:
    throw memory_exception("not implemented");
  }
}
  int exampleDenseGemmByBlocks(const ordinal_type mmin,
                               const ordinal_type mmax,
                               const ordinal_type minc,
                               const ordinal_type k,
                               const ordinal_type mb,
                               const int max_concurrency,
                               const int max_task_dependence,
                               const int team_size,
                               const int mkl_nthreads,
                               const bool check,
                               const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType;
    typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType;
    typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType;
    typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType;

    typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType;

    typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType;
    typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    std::cout << "DenseGemmByBlocks:: test matrices "
              <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc 
              << " , k = "<< k << " , mb = " << mb << std::endl;

    const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); 
    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);

    std::ostringstream os;
    os.precision(3);
    os << std::scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");
      
      // host matrices
      DenseMatrixBaseHostType AA_host, BB_host, CC_host("CC_host", m, m), CB_host("CB_host", m, m);
      {
        if (ArgTransA == Trans::NoTranspose) 
          AA_host = DenseMatrixBaseHostType("AA_host", m, k); 
        else 
          AA_host = DenseMatrixBaseHostType("AA_host", k, m);
        
        if (ArgTransB == Trans::NoTranspose) 
          BB_host = DenseMatrixBaseHostType("BB_host", k, m);
        else 
          BB_host = DenseMatrixBaseHostType("BB_host", m, k);
        
        for (ordinal_type j=0;j<AA_host.NumCols();++j)
          for (ordinal_type i=0;i<AA_host.NumRows();++i)
            AA_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        
        for (ordinal_type j=0;j<BB_host.NumCols();++j)
          for (ordinal_type i=0;i<BB_host.NumRows();++i)
            BB_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        
        for (ordinal_type j=0;j<CC_host.NumCols();++j)
          for (ordinal_type i=0;i<CC_host.NumRows();++i)
            CC_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        
        DenseMatrixTools::copy(CB_host, CC_host);
      }

      const double flop = DenseFlopCount<value_type>::Gemm(m, m, k);

#ifdef HAVE_SHYLUTACHO_MKL
      mkl_set_num_threads(mkl_nthreads);
#endif

      os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k << "  ";
      if (check) {
        timer.reset();
        DenseMatrixViewHostType A_host(AA_host), B_host(BB_host), C_host(CB_host);
        Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas,Variant::One>::invoke
          (policy, policy.member_single(),
           1.0, A_host, B_host, 1.0, C_host);
        t = timer.seconds();
        os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      DenseMatrixBaseDeviceType AA_device("AA_device"), BB_device("BB_device"), CC_device("CC_device");
      {
        timer.reset();
        AA_device.mirror(AA_host);
        BB_device.mirror(BB_host);
        CC_device.mirror(CC_host);
        t = timer.seconds();
        os << ":: Mirror = " << t << " [sec]  ";
      }

      {
        DenseHierMatrixBaseDeviceType HA_device("HA_device"), HB_device("HB_device"), HC_device("HC_device");

        DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb);        
        DenseMatrixTools::createHierMatrix(HB_device, BB_device, mb, mb);        
        DenseMatrixTools::createHierMatrix(HC_device, CC_device, mb, mb);        

        DenseHierTaskViewDeviceType TA_device(HA_device), TB_device(HB_device), TC_device(HC_device);
        timer.reset();
        auto future = policy.proc_create_team
          (Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,ArgVariant>
           ::createTaskFunctor(policy, 1.0, TA_device, TB_device, 1.0, TC_device),
           0);
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        t = timer.seconds();       
        os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      } 

      CC_host.mirror(CC_device);
      if (check) {
        double err = 0.0, norm = 0.0;
        for (ordinal_type j=0;j<CC_host.NumCols();++j)
          for (ordinal_type i=0;i<CC_host.NumRows();++i) {
            const double diff = abs(CC_host.Value(i,j) - CB_host.Value(i,j));
            const double val  = CB_host.Value(i,j);
            err  += diff*diff;
            norm += val*val;
          }
        os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err);
      }
      std::cout << os.str() << std::endl;
    }

    return r_val;
  }
  int exampleDenseCholByBlocks(const ordinal_type mmin,
                               const ordinal_type mmax,
                               const ordinal_type minc,
                               const ordinal_type mb,
                               const int max_concurrency,
                               const int max_task_dependence,
                               const int team_size,
                               const int mkl_nthreads,
                               const bool check,
                               const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType;
    typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType;
    typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType;
    typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType;

    typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType;

    typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType;
    typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    std::cout << "DenseCholByBlocks:: test matrices "
              <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc 
              << " , mb = " << mb << std::endl;

    const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); 
    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);

    std::ostringstream os;
    os.precision(3);
    os << std::scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");
      
      // host matrices
      DenseMatrixBaseHostType AA_host("AA_host", m, m), AB_host("AB_host"), TT_host("TT_host");

      // random T matrix
      {
        TT_host.createConfTo(AA_host);
        for (ordinal_type j=0;j<TT_host.NumCols();++j) {
          for (ordinal_type i=0;i<TT_host.NumRows();++i)
            TT_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
          TT_host.Value(j,j) = std::fabs(TT_host.Value(j,j));
        }
      }
      // create SPD matrix
      {
        Teuchos::BLAS<ordinal_type,value_type> blas;

        blas.HERK(ArgUplo == Uplo::Upper ? Teuchos::UPPER_TRI : Teuchos::LOWER_TRI, 
                  Teuchos::CONJ_TRANS,
                  m, m,
                  1.0,
                  TT_host.ValuePtr(), TT_host.ColStride(),
                  0.0,
                  AA_host.ValuePtr(), AA_host.ColStride());

        // preserve a copy of A
        AB_host.createConfTo(AA_host);        
        DenseMatrixTools::copy(AB_host, AA_host);
      }

      const double flop = DenseFlopCount<value_type>::Chol(m);

#ifdef HAVE_SHYLUTACHO_MKL
      mkl_set_num_threads(mkl_nthreads);
#endif
      os << "DenseCholByBlocks:: m = " << m << "  ";
      int ierr = 0;
      if (check) {
        timer.reset();
        DenseMatrixViewHostType A_host(AB_host); 
        ierr = Chol<ArgUplo,AlgoChol::ExternalLapack,Variant::One>::invoke
          (policy, policy.member_single(),
           A_host);
        t = timer.seconds();
        TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)");
        os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      DenseMatrixBaseDeviceType AA_device("AA_device");
      {
        timer.reset();
        AA_device.mirror(AA_host);
        t = timer.seconds();
        os << ":: Mirror = " << t << " [sec]  ";
      }

      {
        DenseHierMatrixBaseDeviceType HA_device("HA_device");
        
        DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb);

        DenseHierTaskViewDeviceType TA_device(HA_device);

        timer.reset();
        auto future = policy.proc_create_team
          (Chol<ArgUplo,AlgoChol::DenseByBlocks,ArgVariant>
           ::createTaskFunctor(policy, TA_device), 
           0);
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        t = timer.seconds();
        os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      AA_host.mirror(AA_device);
      if (!ierr && check) {
        double err = 0.0, norm = 0.0;
        for (ordinal_type j=0;j<AA_host.NumCols();++j)
          for (ordinal_type i=0;i<=j;++i) {
            const double diff = abs(AA_host.Value(i,j) - AB_host.Value(i,j));
            const double val  = AB_host.Value(i,j);
            err  += diff*diff;
            norm += val*val;
          }
        os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err);
      }
      std::cout << os.str() << std::endl;
    }
    
    return r_val;
  }