Пример #1
0
void Loop(int loop, int test, const char* type_name) {
  LoopVariant<T>(loop,test);

  Kokkos::Impl::Timer timer;
  T res = LoopVariant<T>(loop,test);
  double time1 = timer.seconds();

  timer.reset();
  T resNonAtomic = LoopVariantNonAtomic<T>(loop,test);
  double time2 = timer.seconds();

  timer.reset();
  T resSerial = LoopVariantSerial<T>(loop,test);
  double time3 = timer.seconds();

  time1*=1e6/loop;
  time2*=1e6/loop;
  time3*=1e6/loop;
  textcolor_standard();
  bool passed = true;
  if(resSerial!=res) passed = false;
  if(!passed) textcolor(RESET,BLACK,YELLOW);
  printf("%s Test %i %s  --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",type_name,test,passed?"PASSED":"FAILED",loop,1.0*resSerial,1.0*res,1.0*resNonAtomic,time1,time2,time3,(int)sizeof(T));
  if(!passed) textcolor_standard();
  printf("\n");
}
Пример #2
0
size_t test_global_to_local_ids(unsigned num_ids, unsigned capacity, unsigned num_find_iterations)
{

  typedef Device execution_space;
  typedef typename execution_space::size_type size_type;

  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;

  double elasped_time = 0;
  Kokkos::Impl::Timer timer;

  local_id_view local_2_global("local_ids", num_ids);
  global_id_view global_2_local(capacity);

  int shiftw = 15;

  //create
  elasped_time = timer.seconds();
  std::cout << std::setw(shiftw) <<  "allocate: " <<  elasped_time << std::endl;
  timer.reset();

  // generate unique ids
  {
    generate_ids<Device> gen(local_2_global);
  }

  // generate
  elasped_time = timer.seconds();
  std::cout << std::setw(shiftw) << "generate: " <<  elasped_time << std::endl;
  timer.reset();

  {
    fill_map<Device> fill(global_2_local, local_2_global);
  }

  // fill
  elasped_time = timer.seconds();
  std::cout << std::setw(shiftw) << "fill: " <<  elasped_time << std::endl;
  timer.reset();


  size_t num_errors = global_2_local.failed_insert();

  if (num_errors == 0u) {
    for (unsigned i=0; i<num_find_iterations; ++i)
    {
      find_test<Device> find(global_2_local, local_2_global,num_errors);
    }

    // find
    elasped_time = timer.seconds();
    std::cout << std::setw(shiftw) << "lookup: " <<  elasped_time << std::endl;
  }
  else {
    std::cout << "    !!! Fill Failed !!!" << std::endl;
  }

  return num_errors;
}
Пример #3
0
int main(int narg, char* arg[]) {
  Kokkos::initialize(narg,arg);

  int size = 1000000;

  // Create DualViews. This will allocate on both the device and its
  // host_mirror_device.
  idx_type idx("Idx",size,64);
  view_type dest("Dest",size);
  view_type src("Src",size);

  srand(134231);

  // Get a reference to the host view of idx directly (equivalent to
  // idx.view<idx_type::host_mirror_device_type>() )
  idx_type::t_host h_idx = idx.h_view;
  for (int i = 0; i < size; ++i) {
    for (view_type::size_type j=0; j < h_idx.dimension_1 (); ++j) {
      h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size;
    }
  }

  // Mark idx as modified on the host_mirror_device_type so that a
  // sync to the device will actually move data.  The sync happens in
  // the functor's constructor.
  idx.modify<idx_type::host_mirror_device_type>();

  // Run on the device.  This will cause a sync of idx to the device,
  // since it was marked as modified on the host.
  Kokkos::Impl::Timer timer;
  Kokkos::parallel_for(size,localsum<view_type::device_type>(idx,dest,src));
  Kokkos::fence();
  double sec1_dev = timer.seconds();

  timer.reset();
  Kokkos::parallel_for(size,localsum<view_type::device_type>(idx,dest,src));
  Kokkos::fence();
  double sec2_dev = timer.seconds();

  // Run on the host (could be the same as device).  This will cause a
  // sync back to the host of dest.  Note that if the Device is CUDA,
  // the data layout will not be optimal on host, so performance is
  // lower than what it would be for a pure host compilation.
  timer.reset();
  Kokkos::parallel_for(size,localsum<view_type::host_mirror_device_type>(idx,dest,src));
  Kokkos::fence();
  double sec1_host = timer.seconds();

  timer.reset();
  Kokkos::parallel_for(size,localsum<view_type::host_mirror_device_type>(idx,dest,src));
  Kokkos::fence();
  double sec2_host = timer.seconds();

  printf("Device Time with Sync: %lf without Sync: %lf \n",sec1_dev,sec2_dev);
  printf("Host   Time with Sync: %lf without Sync: %lf \n",sec1_host,sec2_host);

  Kokkos::finalize();
}
Пример #4
0
void test_global_to_local_ids(unsigned num_ids)
{

  typedef Device device_type;
  typedef typename device_type::size_type size_type;

  typedef Kokkos::View<uint32_t*,device_type> local_id_view;
  typedef Kokkos::UnorderedMap<uint32_t,size_type,device_type> global_id_view;

  //size
  std::cout << num_ids << ", ";

  double elasped_time = 0;
  Kokkos::Impl::Timer timer;

  local_id_view local_2_global("local_ids", num_ids);
  global_id_view global_2_local((3u*num_ids)/2u);

  //create
  elasped_time = timer.seconds();
  std::cout << elasped_time << ", ";
  timer.reset();

  // generate unique ids
  {
    generate_ids<Device> gen(local_2_global);
  }
  Device::fence();
  // generate
  elasped_time = timer.seconds();
  std::cout << elasped_time << ", ";
  timer.reset();

  {
    fill_map<Device> fill(global_2_local, local_2_global);
  }
  Device::fence();

  // fill
  elasped_time = timer.seconds();
  std::cout << elasped_time << ", ";
  timer.reset();


  size_t num_errors = 0;
  for (int i=0; i<100; ++i)
  {
    find_test<Device> find(global_2_local, local_2_global,num_errors);
  }
  Device::fence();

  // find
  elasped_time = timer.seconds();
  std::cout << elasped_time << std::endl;

  ASSERT_EQ( num_errors, 0u);
}
Пример #5
0
int main(int narg, char* arg[]) {
  Kokkos::initialize(narg,arg);

  int size = 1000000;

  // Create Views
  idx_type idx("Idx",size,64);
  view_type dest("Dest",size);
  view_type src("Src",size);

  srand(134231);

  // When using UVM Cuda views can be accessed on the Host directly
  for(int i=0; i<size; i++) {
    for(int j=0; j<idx.dimension_1(); j++)
      idx(i,j) = (size + i + (rand()%500 - 250))%size;
  }

  Kokkos::fence();
  // Run on the device
  // This will cause a sync of idx to the device since it was modified on the host
  Kokkos::Impl::Timer timer;
  Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
  Kokkos::fence();
  double sec1_dev = timer.seconds();

  // No data transfer will happen now, since nothing is accessed on the host
  timer.reset();
  Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
  Kokkos::fence();
  double sec2_dev = timer.seconds();

  // Run on the host
  // This will cause a sync back to the host of dest which was changed on the device
  // Compare runtime here with the dual_view example: dest will be copied back in 4k blocks
  // when they are accessed the first time during the parallel_for. Due to the latency of a memcpy
  // this gives lower effective bandwidth when doing a manual copy via dual views
  timer.reset();
  Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
  Kokkos::fence();
  double sec1_host = timer.seconds();

  // No data transfers will happen now
  timer.reset();
  Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
  Kokkos::fence();
  double sec2_host = timer.seconds();



  printf("Device Time with Sync: %lf without Sync: %lf \n",sec1_dev,sec2_dev);
  printf("Host   Time with Sync: %lf without Sync: %lf \n",sec1_host,sec2_host);

  Kokkos::finalize();
}
Пример #6
0
  KOKKOS_INLINE_FUNCTION
  int exampleCholByBlocks(const string file_input,
                          const int nthreads,
                          const int max_task_dependence,
                          const int team_size,
                          const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;

    typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
    typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "CholByBlocks:: import input file = " << file_input << endl;
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      t = timer.seconds();

      if (verbose)
        cout << AA << endl;
    }
    cout << "CholByBlocks:: import input file::time = " << t << endl;


    cout << "CholByBlocks:: reorder the matrix" << endl;
    CrsMatrixBaseType UU("UU");     // permuted base matrix
    CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views
    {
      timer.reset();

      typename GraphHelperType::size_type_array rptr(AA.Label()+"Graph::RowPtrArray", AA.NumRows() + 1);
      typename GraphHelperType::ordinal_type_array cidx(AA.Label()+"Graph::ColIndexArray", AA.NumNonZeros());

      AA.convertGraph(rptr, cidx);
      GraphHelperType S(AA.Label()+"ScotchHelper",
                        AA.NumRows(),
                        rptr,
                        cidx);
      S.computeOrdering();

      CrsMatrixBaseType PA("Permuted AA");
      PA.copy(S.PermVector(), S.InvPermVector(), AA);

      UU.copy(Uplo::Upper, PA);

      CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                 S.NumBlocks(),
                                 S.RangeVector(),
                                 S.TreeVector());

      for (ordinal_type k=0;k<HU.NumNonZeros();++k)
        HU.Value(k).fillRowViewArray();

      t = timer.seconds();

      if (verbose)
        cout << UU << endl;
    }
    cout << "CholByBlocks:: reorder the matrix::time = " << t << endl;

    const size_t max_concurrency = 16384;
    cout << "CholByBlocks:: max concurrency = " << max_concurrency << endl;

    const size_t max_task_size = 3*sizeof(CrsTaskViewType)+128;
    cout << "CholByBlocks:: max task size   = " << max_task_size << endl;

    typename TaskFactoryType::policy_type policy(max_concurrency,
                                                 max_task_size,
                                                 max_task_dependence,
                                                 team_size);

    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    cout << "CholByBlocks:: factorize the matrix" << endl;
    CrsHierTaskViewType H(&HU);
    {
      timer.reset();

      auto future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks>::
                                                          TaskFunctor<CrsHierTaskViewType>(H), 0);
      TaskFactoryType::Policy().spawn(future);
      Kokkos::Experimental::wait(TaskFactoryType::Policy());

      t = timer.seconds();

      if (verbose)
        cout << UU << endl;
    }
    cout << "CholByBlocks:: factorize the matrix::time = " << t << endl;

    return r_val;
  }
  KOKKOS_INLINE_FUNCTION
  int exampleSymbolicFactor(const string file_input,
                            const int treecut,
                            const int minblksize,
                            const int seed,
                            const int fill_level,
                            const int league_size,
                            const bool reorder,
                            const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
    typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "SymbolicFactor:: import input file = " << file_input << endl;
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);
      t = timer.seconds();

      cout << "SymbolicFactor:: AA nnz = " << AA.NumNonZeros() << endl;
      if (verbose)
        cout << AA << endl;
    }
    cout << "SymbolicFactor:: import input file::time = " << t << endl;

    CrsMatrixBaseType PA("Permuted AA");
    GraphHelperType S(AA, seed);
    if (reorder) {
      timer.reset();

      S.computeOrdering(treecut, minblksize);

      PA.copy(S.PermVector(), S.InvPermVector(), AA);

      t = timer.seconds();

      if (verbose)
        cout << S << endl
             << PA << endl;
    } else {
      PA = AA;

      t = 0.0;
    }
    cout << "SymbolicFactor:: reorder the matrix::time = " << t << endl;


    CrsMatrixBaseType UU("UU");
    {
      timer.reset();

      SymbolicFactorHelperType symbolic(PA, league_size);
      symbolic.createNonZeroPattern(fill_level, Uplo::Upper, UU);

      t = timer.seconds();

      cout << "SymbolicFactor:: UU nnz = " << UU.NumNonZeros() << endl;

      if (verbose) {
        cout << symbolic << endl;
        cout << UU << endl;
      }
    }
    cout << "SymbolicFactor:: factorize the matrix::time = " << t << endl;

    return r_val;
  }
Пример #8
0
  BASKER_INLINE
  int Basker<Int, Entry, Exe_Space>::factor_inc_lvl(Int option)
  {

    printf("Factor Inc Level Called \n");

    gn = A.ncol;
    gm = A.nrow;

    if(Options.btf == BASKER_TRUE)
      {

	//JDB: We can change this for the new inteface

	//call reference copy constructor
	gn = A.ncol;
	gm = A.nrow;
	A = BTF_A; 
	//printf("\n\n Switching A, newsize: %d \n",
	//   A.ncol);
	//printMTX("A_FACTOR.mtx", A);
      }
   

    //Spit into Domain and Sep
    //----------------------Domain-------------------------//
    #ifdef BASKER_KOKKOS

    //====TIMER==
    #ifdef BASKER_TIME
    Kokkos::Impl::Timer       timer;
    #endif
    //===TIMER===

    typedef Kokkos::TeamPolicy<Exe_Space>        TeamPolicy;

    if(btf_tabs_offset != 0)
      {

	kokkos_nfactor_domain_inc_lvl <Int,Entry,Exe_Space>
	  domain_nfactor(this);
	Kokkos::parallel_for(TeamPolicy(num_threads,1), 
			     domain_nfactor);
	Kokkos::fence();
    

	//=====Check for error======
	while(true)
	  {
	INT_1DARRAY thread_start;
	MALLOC_INT_1DARRAY(thread_start, num_threads+1);
	init_value(thread_start, num_threads+1, 
		   (Int) BASKER_MAX_IDX);
	int nt = nfactor_domain_error(thread_start);
	if(nt == BASKER_SUCCESS)
	  {
	    break;
	  }
	else
	  {
	    printf("restart \n");
	    kokkos_nfactor_domain_remalloc <Int, Entry, Exe_Space>
	      diag_nfactor_remalloc(this, thread_start);
	    Kokkos::parallel_for(TeamPolicy(num_threads,1),
				 diag_nfactor_remalloc);
	    Kokkos::fence();
	  }
      }//end while
   

    //====TIMER===
    #ifdef BASKER_TIME
    printf("Time DOMAIN: %f \n", timer.seconds());
    timer.reset();
    #endif
    //====TIMER====
    

    #else// else basker_kokkos
    #pragma omp parallel
    {


    }//end omp parallel
    #endif //end basker_kokkos

      }
    //-------------------End--Domian--------------------------//

   
    //---------------------------Sep--------------------------//

    
    
    if(btf_tabs_offset != 0)
      {
        //for(Int l=1; l<=1; l++)
       for(Int l=1; l <= tree.nlvls; l++)
      {

        //Come back for syncs
        
	//#ifdef BASKER_OLD_BARRIER
	Int lthreads = pow(2,l);
	Int lnteams = num_threads/lthreads;
	//#else
	//Int lthreads = 1;
	//Int lnteams = num_threads/lthreads;
	//#endif

	
	//printf("\n\n   ============ SEP: %d ======\n\n",l);

	#ifdef BASKER_KOKKOS
	Kokkos::Impl::Timer  timer_inner_sep;
	#ifdef BASKER_NO_LAMBDA
       
	kokkos_nfactor_sep2_inc_lvl <Int, Entry, Exe_Space>
	  sep_nfactor(this,l);
	
	Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),
			     sep_nfactor);
	Kokkos::fence();
	
	#ifdef BASKER_TIME
	printf("Time INNERSEP: %d %f \n", 
	       l, timer_inner_sep.seconds());
	#endif
        #else //ELSE BASKER_NO_LAMBDA
	//Note: to be added
        #endif //end BASKER_NO_LAMBDA
	#else
	#pragma omp parallel
	{

	}//end omp parallel
	#endif
      }//end over each level

    #ifdef BASKER_TIME
    printf("Time SEP: %f \n", timer.seconds());
    #endif
      }

   
    //-------------------------End Sep----------------//


    //-------------------IF BTF-----------------------//
    if(Options.btf == BASKER_TRUE)
      {
	//=====Timer
	#ifdef BASKER_TIME
	Kokkos::Impl::Timer  timer_btf;
	#endif
	//====Timer
	
	//======Call diag factor====
	/*
	kokkos_nfactor_diag <Int, Entry, Exe_Space> 
	  diag_nfactor(this);
	Kokkos::parallel_for(TeamPolicy(num_threads,1),
			     diag_nfactor);
	Kokkos::fence();
	*/
	//=====Check for error======
	//while(true)
	// {
	    //INT_1DARRAY thread_start;
	    // MALLOC_INT_1DARRAY(thread_start, num_threads+1);
	    //init_value(thread_start, num_threads+1, 
	    //	       (Int) BASKER_MAX_IDX);
	    //int nt = nfactor_diag_error(thread_start);
	    // if(nt == BASKER_SUCCESS)
	    //  {
	    ///		break;
	    // }
	    //else
	    // {
		/*
		break;
		printf("restart \n");
		kokkos_nfactor_diag_remalloc <Int, Entry, Exe_Space>
		  diag_nfactor_remalloc(this, thread_start);
		Kokkos::parallel_for(TeamPolicy(num_threads,1),
				     diag_nfactor);
		Kokkos::fence();
		*/
	    //}
	    // }//end while

	//====TIMER
	#ifdef BASKER_TIME
	printf("Time BTF: %f \n", 
	       timer_btf.seconds());
	#endif
	//===TIMER

      }//end btf call

    
    return 0;
  }//end factor_lvl_inc()
  KOKKOS_INLINE_FUNCTION
  int exampleCholUnblocked(const string file_input,
                           const int max_task_dependence,
                           const int team_size,
                           const int algo,
                           const int variant,
                           const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "CholUnblocked:: import input file = " << file_input << endl;        
    CrsMatrixBaseType AA("AA"), UU("UU");    
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      UU.copy(Uplo::Upper, AA);

      t = timer.seconds();

      if (verbose)
        cout << UU << endl;
    }
    cout << "CholUnblocked:: import input file::time = " << t << endl;        

#ifdef __USE_FIXED_TEAM_SIZE__
    typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
    typename TaskFactoryType::policy_type policy(max_task_dependence, team_size);
#endif
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    cout << "CholUnblocked:: factorize the matrix" << endl;
    CrsTaskViewType U(&UU);
    U.fillRowViewArray();
    {
      timer.reset();
    
      typename TaskFactoryType::future_type future;
      switch (algo) {
      case AlgoChol::UnblockedOpt: {
        if (variant == Variant::One)
          future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
                                                         ::TaskFunctor<CrsTaskViewType>(U), 0);
        else if (variant == Variant::Two)
          future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::Two>
                                                         ::TaskFunctor<CrsTaskViewType>(U), 0);
        else {
          ERROR(">> Not supported algorithm variant");          
        }
        break;
      }
      case AlgoChol::Dummy: {
        future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::Dummy>
                                                       ::TaskFunctor<CrsTaskViewType>(U), 0);
        break;
      }
      default:
        ERROR(">> Not supported algorithm");
        break;
      }
      TaskFactoryType::Policy().spawn(future);
      Kokkos::Experimental::wait(TaskFactoryType::Policy());

      t = timer.seconds();

      if (verbose)
        cout << UU << endl;
    }   
    cout << "CholUnblocked:: factorize the matrix::time = " << t << endl; 
    
    return r_val;
  }
Пример #10
0
  int exampleMatrixMarket(const std::string file_input,
                          const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);
  
    typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType>   CrsMatrixBaseHostType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    
    CrsMatrixBaseHostType AA("AA");
    timer.reset();
    {
      std::ifstream in;
      in.open(file_input);
      if (!in.good()) {
        std::cout << "Failed in open the file: " << file_input << std::endl;
        return -1;
      }
      MatrixMarket::read(AA, in);
    }
    double t_read = timer.seconds();
    
    timer.reset();
    {
      std::string file_output = "mm-test-output.mtx";
      std::ofstream out;
      out.open(file_output);
      if (!out.good()) {
        std::cout << "Failed in open the file: " << file_output << std::endl;
        return -1;
      }
      MatrixMarket::write(out, AA, "%% Test output");
    }
    double t_write = timer.seconds();

    {
      const auto prec = std::cout.precision();
      std::cout.precision(4);

      std::cout << std::scientific
                << "MatrixMarket:: dimension = " << AA.NumRows() << " x " << AA.NumCols() 
                << ", " << " nnz = " << AA.NumNonZeros() << ", "
                << "read = " << t_read << " [sec], "
                << "write = " << t_write << " [sec] "
                << std::endl;

      std::cout.unsetf(std::ios::scientific);
      std::cout.precision(prec);
    }
    
    if (verbose) {
      AA.showMe(std::cout) << std::endl;
    }

    CrsMatrixBaseHostType BB("BB");
    BB.createConfTo(AA);

    CrsMatrixTools::copy(BB, Uplo::Upper, 0, AA);
    if (verbose) {
      BB.setLabel("Copy::AA:Upper::0"); BB.showMe(std::cout) << std::endl;
    }

    CrsMatrixTools::copy(BB, Uplo::Upper, 1, AA);
    if (verbose) {
      BB.setLabel("Copy::AA:Upper::1"); BB.showMe(std::cout) << std::endl;    
    }

    CrsMatrixTools::copy(BB, Uplo::Lower, 0, AA);
    if (verbose) {
      BB.setLabel("Copy::AA:Lower::0"); BB.showMe(std::cout) << std::endl;    
    }

    CrsMatrixTools::copy(BB, Uplo::Lower, 1, AA);
    if (verbose) {
      BB.setLabel("Copy::AA:Lower::1"); BB.showMe(std::cout) << std::endl;    
    }

    return r_val;
  }
  KOKKOS_INLINE_FUNCTION
  int exampleTriSolvePerformance(const string file_input,
                                 const OrdinalType nrhs,
                                 const OrdinalType nb,
                                 const int niter,
                                 const int nthreads,
                                 const int max_task_dependence,
                                 const int team_size, 
                                 const bool team_interface,
                                 const bool skip_serial,
                                 const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
    typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

    typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType;
    typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType;

    typedef DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> DenseHierMatrixBaseType;

    typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType;
    typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double 
      t_import = 0.0,
      t_reorder = 0.0,
      t_solve_seq = 0.0,
      t_solve_task = 0.0;
    const int start = -2;

    cout << "TriSolvePerformance:: import input file = " << file_input << endl;
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      t_import = timer.seconds();

      if (verbose)
        cout << AA << endl;
    }
    cout << "TriSolvePerformance:: import input file::time = " << t_import << endl;

    CrsMatrixBaseType   UU("UU");
    DenseMatrixBaseType BB("BB",  AA.NumRows(), nrhs);

    cout << "TriSolvePerformance:: reorder the matrix and partition right hand side, nb = " << nb << endl;
    CrsHierMatrixBaseType   HU("HU");
    DenseHierMatrixBaseType HB("HB");
    {
      timer.reset();

      GraphHelperType S(AA);
      S.computeOrdering();

      CrsMatrixBaseType PA("Permuted AA");
      PA.copy(S.PermVector(), S.InvPermVector(), AA);

      UU.copy(Uplo::Upper, PA);

      CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                 S.NumBlocks(),
                                 S.RangeVector(),
                                 S.TreeVector());

      DenseMatrixHelper::flat2hier(BB, HB,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   nb);

      t_reorder = timer.seconds();

      cout << "TriSolvePerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;

      if (verbose)
        cout << UU << endl;
    }
    cout << "TriSolvePerformance:: reorder the matrix and partition right hand side::time = " << t_reorder << endl;

    const size_t max_concurrency = 16384;
    cout << "TriSolvePerformance:: max concurrency = " << max_concurrency << endl;

    const size_t max_task_size = 3*sizeof(CrsTaskViewType)+128;
    cout << "TriSolvePerformance:: max task size   = " << max_task_size << endl;

    if (!skip_serial) {
      __INIT_DENSE_MATRIX__(BB, 1.0);
      typename TaskFactoryType::policy_type policy(max_concurrency,
                                                   max_task_size,
                                                   max_task_dependence, 
                                                   1);

      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);
      
      CrsTaskViewType U(&UU);
      DenseTaskViewType B(&BB);
      U.fillRowViewArray();

      cout << "TriSolvePerformance:: Serial forward and backward solve of the matrix" << endl;
      {
        for (int i=start;i<niter;++i) {
          timer.reset();
          // {
          //   auto future = TaskFactoryType::Policy().create_team(TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
          //                                                       ::TaskFunctor<CrsTaskViewType,DenseTaskViewType>
          //                                                       (Diag::NonUnit, U, B), 0);
            
          //   TaskFactoryType::Policy().spawn(future);
          //   Kokkos::Experimental::wait(TaskFactoryType::Policy());
          // }
          {
            TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
              ::invoke(TaskFactoryType::Policy(),
                                TaskFactoryType::Policy().member_single(),
                                Diag::NonUnit, U, B);
            
          }
          // {
          //   auto future = TaskFactoryType::Policy().create_team(TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
          //                                                       ::TaskFunctor<CrsTaskViewType,DenseTaskViewType>
          //                                                       (Diag::NonUnit, U, B), 0);
            
          //   TaskFactoryType::Policy().spawn(future);
          //   Kokkos::Experimental::wait(TaskFactoryType::Policy());
          // }
          {
            TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
              ::invoke(TaskFactoryType::Policy(),
                                TaskFactoryType::Policy().member_single(),
                                Diag::NonUnit, U, B);
          }
          t_solve_seq += timer.seconds() * (i>=0);
        }
        t_solve_seq /= niter;
        
        if (verbose)
          cout << BB << endl;
      }
      cout << "TriSolvePerformance:: Serial forward and backward solve of the matrix::time = " << t_solve_seq << endl;
    }
    
    {
      __INIT_DENSE_MATRIX__(BB, 1.0);
      typename TaskFactoryType::policy_type policy(max_concurrency,
                                                   max_task_size,
                                                   max_task_dependence, 
                                                   team_size);

      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);

      // wrap the hierarchically partitioned matrix with task handler
      CrsHierTaskViewType TU(&HU);
      for (ordinal_type k=0;k<HU.NumNonZeros();++k)
        HU.Value(k).fillRowViewArray();
      
      DenseHierTaskViewType TB(&HB);
      
      cout << "TriSolvePerformance:: ByBlocks forward and backward solve of the matrix" << endl;
      {
        for (int i=start;i<niter;++i) {
          timer.reset(); 
          {
            auto future_forward_solve = TaskFactoryType::Policy().create_team
              (TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::ByBlocks>
               ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
               (Diag::NonUnit, TU, TB), 0);
            
            TaskFactoryType::Policy().spawn(future_forward_solve);
            
            auto future_backward_solve = TaskFactoryType::Policy().create_team
              (TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::ByBlocks>
               ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
               (Diag::NonUnit, TU, TB), 1);
            
            TaskFactoryType::Policy().add_dependence(future_backward_solve, future_forward_solve);
            TaskFactoryType::Policy().spawn(future_backward_solve);
            
            Kokkos::Experimental::wait(TaskFactoryType::Policy());
          }
          t_solve_task += timer.seconds() * (i>=0);
        }
        t_solve_task /= niter;

        if (verbose)
          cout << BB << endl;
      }

      cout << "TriSolvePerformance:: ByBlocks forward and backward solve of the matrix::time = " << t_solve_task << endl;
    }

    if (!skip_serial) {
      cout << "TriSolvePerformance:: task scale [seq/task] = " << t_solve_seq/t_solve_task << endl;
    }

    return r_val;
  }
  KOKKOS_INLINE_FUNCTION
  int exampleDenseGemmByBlocks(const OrdinalType mmin,
                               const OrdinalType mmax,
                               const OrdinalType minc,
                               const OrdinalType k,
                               const OrdinalType mb,
                               const int max_concurrency,
                               const int max_task_dependence,
                               const int team_size,
                               const int mkl_nthreads,
                               const bool check,
                               const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

    typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType;
    typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType;

    typedef DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> DenseHierMatrixBaseType;

    typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType;
    typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "DenseGemmByBlocks:: test matrices "
         <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << " , mb = " << mb << endl;

    const size_t max_task_size = (3*sizeof(DenseTaskViewType)+196); // when 128 error
    //cout << "max task size = "<< max_task_size << endl;
    typename TaskFactoryType::policy_type policy(max_concurrency,
                                                 max_task_size,
                                                 max_task_dependence, 
                                                 team_size);
    
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    ostringstream os;
    os.precision(3);
    os << scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");

      DenseMatrixBaseType AA, BB, CC("CC", m, m), CB("CB", m, m);

      if (ArgTransA == Trans::NoTranspose) 
        AA = DenseMatrixBaseType("AA", m, k); 
      else 
        AA = DenseMatrixBaseType("AA", k, m);
      
      if (ArgTransB == Trans::NoTranspose) 
        BB = DenseMatrixBaseType("BB", k, m);
      else 
        BB = DenseMatrixBaseType("BB", m, k);
      
      for (ordinal_type j=0;j<AA.NumCols();++j)
        for (ordinal_type i=0;i<AA.NumRows();++i)
          AA.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
      
      for (ordinal_type j=0;j<BB.NumCols();++j)
        for (ordinal_type i=0;i<BB.NumRows();++i)
          BB.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
      
      for (ordinal_type j=0;j<CC.NumCols();++j)
        for (ordinal_type i=0;i<CC.NumRows();++i)
          CC.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
      CB.copy(CC);

      const double flop = get_flop_gemm<value_type>(m, m, k);

#ifdef HAVE_SHYLUTACHO_MKL
      mkl_set_num_threads(mkl_nthreads);
#endif

      os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k;
      if (check) {
        timer.reset();
        DenseTaskViewType A(&AA), B(&BB), C(&CB);
        Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas>::invoke
          (TaskFactoryType::Policy(),
           TaskFactoryType::Policy().member_single(),
           1.0, A, B, 1.0, C);
        t = timer.seconds();
        os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      {
        DenseHierMatrixBaseType HA, HB, HC;
        DenseMatrixHelper::flat2hier(AA, HA, mb, mb);
        DenseMatrixHelper::flat2hier(BB, HB, mb, mb);
        DenseMatrixHelper::flat2hier(CC, HC, mb, mb);

        DenseHierTaskViewType TA(&HA), TB(&HB), TC(&HC);
        timer.reset();
        auto future = TaskFactoryType::Policy().create_team
          (typename Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,Variant::One>
           ::template TaskFunctor<value_type,DenseHierTaskViewType,DenseHierTaskViewType,DenseHierTaskViewType>
           (1.0, TA, TB, 1.0, TC), 0);
        TaskFactoryType::Policy().spawn(future);
        Kokkos::Experimental::wait(TaskFactoryType::Policy());
        t = timer.seconds();       
        os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      } 

      if (check) {
        typedef typename Teuchos::ScalarTraits<value_type>::magnitudeType real_type; 
        real_type err = 0.0, norm = 0.0;
        for (ordinal_type j=0;j<CC.NumCols();++j)
          for (ordinal_type i=0;i<CC.NumRows();++i) {
            const real_type diff = abs(CC.Value(i,j) - CB.Value(i,j));
            const real_type val  = CB.Value(i,j);
            err  += diff*diff;
            norm += val*val;
          }
        os << ":: Check result ::err = " << sqrt(err) << ", norm = " << sqrt(norm);
      }
      cout << os.str() << endl;
    }

    return r_val;
  }
int exampleDenseMatrixBase(const ordinal_type mmin,
                           const ordinal_type mmax,
                           const ordinal_type minc,
                           const bool verbose) {
    typedef typename
    Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  ";
    DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";
    HostSpaceType::print_configuration(std::cout, detail);
    std::cout << std::endl;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType>   DenseMatrixBaseHostType;
    typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;

    std::cout << "DenseMatrixBase:: test matrices "
              <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << std::endl;

    for (auto m=mmin; m<=mmax; m+=minc) {
        // random test matrix on host
        DenseMatrixBaseHostType TT("TT", m, m);
        for (ordinal_type j=0; j<TT.NumCols(); ++j) {
            for (ordinal_type i=0; i<TT.NumRows(); ++i)
                TT.Value(i,j) = 2.0*((value_type)std::rand()/(RAND_MAX)) - 1.0;
            TT.Value(j,j) = std::fabs(TT.Value(j,j));
        }
        if (verbose)
            TT.showMe(std::cout) << std::endl;

        DenseMatrixBaseDeviceType AA("AA");

        timer.reset();
        AA.mirror(TT);
        double t_mirror = timer.seconds();

        DenseMatrixBaseDeviceType BB("BB");
        BB.createConfTo(AA);

        timer.reset();
        DenseMatrixTools::copy(BB, AA);
        double t_copy = timer.seconds();

        // check
        DenseMatrixBaseHostType RR("RR");
        RR.createConfTo(BB);
        RR.mirror(BB);
        if (verbose)
            RR.showMe(std::cout) << std::endl;

        double err = 0.0;
        for (ordinal_type j=0; j<TT.NumCols(); ++j)
            for (ordinal_type i=0; i<TT.NumRows(); ++i)
                err += std::fabs(TT.Value(i,j) - RR.Value(i,j));

        {
            const auto prec = std::cout.precision();
            std::cout.precision(4);

            std::cout << std::scientific
                      << "DenseMatrixBase:: dimension = " << m << " x " << m << ", "
                      << "Mirroring to device  = " << t_mirror << " [sec], "
                      << "Elementwise copy on device = " << t_copy << " [sec], "
                      << "Error = " << err
                      << std::endl;

            std::cout.unsetf(std::ios::scientific);
            std::cout.precision(prec);
        }
    }

    return r_val;
}
  KOKKOS_INLINE_FUNCTION
  int exampleICholUnblocked(const string file_input,
                            const int max_task_dependence,
                            const int team_size,
                            const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;

    typedef TaskTeamFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType>,
      Kokkos::Impl::TeamThreadRangeBoundariesStruct> TaskFactoryType;

    typedef ParallelFor ForType;
    
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "ICholUnblocked:: import input file = " << file_input << endl;        
    CrsMatrixBaseType AA("AA"), UU("UU");    
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      UU.copy(Uplo::Upper, AA);

      t = timer.seconds();

      if (verbose)
        cout << UU << endl;
    }
    cout << "ICholUnblocked:: import input file::time = " << t << endl;        

#ifdef __USE_FIXED_TEAM_SIZE__
    typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
    typename TaskFactoryType::policy_type policy(max_task_dependence, team_size);
#endif
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    cout << "ICholUnblocked:: factorize the matrix" << endl;
    CrsTaskViewType U(&UU);
    U.fillRowViewArray();
    {
      timer.reset();
    
      auto future = TaskFactoryType::Policy().create_team(IChol<Uplo::Upper,AlgoIChol::UnblockedOpt1>
                                                          ::TaskFunctor<ForType,CrsTaskViewType>(U), 0);
      TaskFactoryType::Policy().spawn(future);
      Kokkos::Experimental::wait(TaskFactoryType::Policy());

      t = timer.seconds();

      if (verbose)
        cout << UU << endl;
    }   
    cout << "ICholUnblocked:: factorize the matrix::time = " << t << endl; 
    
    return r_val;
  }
Пример #15
0
	void color(bool useConflictList, bool serialConflictResolution, bool ticToc){
		Ordinal numUncolored = _size; // on host
		double t, total = 0.0;
		Kokkos::Impl::Timer timer;

		if(useConflictList)
			_conflictType = CONFLICT_LIST;

		// While vertices to color, do speculative coloring.
		int iter = 0;
		for(iter = 0; (iter<20) && (numUncolored>0); iter++){
			std::cout<< "Start iteration " << iter << std::endl;

			// First color greedy speculatively, some conflicts expected
			this -> colorGreedy();
			ExecSpace::fence();
			if(ticToc){
				t = timer.seconds();
				total += t;
				std::cout << "Time speculative greedy phase " << iter << " : " << std::endl;
				timer.reset();
			}

#ifdef DEBUG
			// UVM required - will be slow!
			printf("\n 100 first vertices: ");
			for(int i = 0; i < 100; i++){
				printf(" %i", _colors[i]);
			}
			printf("\n");
#endif

			// Check for conflicts (parallel), find vertices to recolor
			numUncolored = this -> findConflicts();

			ExecSpace::fence();
			if(ticToc){
			t = timer.seconds();
			total += t;
			std::cout << "Time conflict detection " << iter << " : " << t << std::endl;
			timer.reset();
			}
			if (serialConflictResolution) break; // Break after first iteration
/*			if(_conflictType == CONFLICT_LIST){
				array_type temp = _vertexList;
				_vertexList = _recolorList;
				_vertexListLength() = _recolorListLength();
				_recolorList = temp;
				_recolorListLength() = 0;
			}
*/			if(_conflictType == CONFLICT_LIST){
				array_type temp = _vertexList;
				_vertexList = _recolorList;
				host_vertexListLength() = host_recolorListLength();
				_recolorList = temp;
				host_recolorListLength() = 0;
				Kokkos::deep_copy(_vertexListLength, host_vertexListLength);
				Kokkos::deep_copy(_recolorListLength, host_recolorListLength);
			}
		}

		std::cout << "Number of coloring iterations: " << iter << std::endl;

		if(numUncolored > 0){
			// Resolve conflicts by recolor in serial
			this -> resolveConflicts();
			ExecSpace::fence();
			if(ticToc){
				t = timer.seconds();
				total += t;
				std::cout << "Time conflict resolution: " << t << std::endl;
				std::cout << "Total time: " << total << std::endl;
			}
		}
	}
  int exampleDenseCholByBlocks(const ordinal_type mmin,
                               const ordinal_type mmax,
                               const ordinal_type minc,
                               const ordinal_type mb,
                               const int max_concurrency,
                               const int max_task_dependence,
                               const int team_size,
                               const int mkl_nthreads,
                               const bool check,
                               const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType;
    typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType;
    typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType;
    typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType;

    typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType;

    typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType;
    typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    std::cout << "DenseCholByBlocks:: test matrices "
              <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc 
              << " , mb = " << mb << std::endl;

    const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); 
    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);

    std::ostringstream os;
    os.precision(3);
    os << std::scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");
      
      // host matrices
      DenseMatrixBaseHostType AA_host("AA_host", m, m), AB_host("AB_host"), TT_host("TT_host");

      // random T matrix
      {
        TT_host.createConfTo(AA_host);
        for (ordinal_type j=0;j<TT_host.NumCols();++j) {
          for (ordinal_type i=0;i<TT_host.NumRows();++i)
            TT_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
          TT_host.Value(j,j) = std::fabs(TT_host.Value(j,j));
        }
      }
      // create SPD matrix
      {
        Teuchos::BLAS<ordinal_type,value_type> blas;

        blas.HERK(ArgUplo == Uplo::Upper ? Teuchos::UPPER_TRI : Teuchos::LOWER_TRI, 
                  Teuchos::CONJ_TRANS,
                  m, m,
                  1.0,
                  TT_host.ValuePtr(), TT_host.ColStride(),
                  0.0,
                  AA_host.ValuePtr(), AA_host.ColStride());

        // preserve a copy of A
        AB_host.createConfTo(AA_host);        
        DenseMatrixTools::copy(AB_host, AA_host);
      }

      const double flop = DenseFlopCount<value_type>::Chol(m);

#ifdef HAVE_SHYLUTACHO_MKL
      mkl_set_num_threads(mkl_nthreads);
#endif
      os << "DenseCholByBlocks:: m = " << m << "  ";
      int ierr = 0;
      if (check) {
        timer.reset();
        DenseMatrixViewHostType A_host(AB_host); 
        ierr = Chol<ArgUplo,AlgoChol::ExternalLapack,Variant::One>::invoke
          (policy, policy.member_single(),
           A_host);
        t = timer.seconds();
        TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)");
        os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      DenseMatrixBaseDeviceType AA_device("AA_device");
      {
        timer.reset();
        AA_device.mirror(AA_host);
        t = timer.seconds();
        os << ":: Mirror = " << t << " [sec]  ";
      }

      {
        DenseHierMatrixBaseDeviceType HA_device("HA_device");
        
        DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb);

        DenseHierTaskViewDeviceType TA_device(HA_device);

        timer.reset();
        auto future = policy.proc_create_team
          (Chol<ArgUplo,AlgoChol::DenseByBlocks,ArgVariant>
           ::createTaskFunctor(policy, TA_device), 
           0);
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        t = timer.seconds();
        os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      AA_host.mirror(AA_device);
      if (!ierr && check) {
        double err = 0.0, norm = 0.0;
        for (ordinal_type j=0;j<AA_host.NumCols();++j)
          for (ordinal_type i=0;i<=j;++i) {
            const double diff = abs(AA_host.Value(i,j) - AB_host.Value(i,j));
            const double val  = AB_host.Value(i,j);
            err  += diff*diff;
            norm += val*val;
          }
        os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err);
      }
      std::cout << os.str() << std::endl;
    }
    
    return r_val;
  }
  int exampleCholUnblocked(const std::string file_input,
                           const int treecut,
                           const int prunecut,
                           const int fill_level,
                           const int rows_per_team,
                           const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType;
    typedef GraphTools<ordinal_type,size_type,HostSpaceType> GraphToolsHostType;

    typedef GraphTools_Scotch<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_Scotch;
    typedef GraphTools_CAMD<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_CAMD;

    typedef IncompleteSymbolicFactorization<CrsMatrixBaseHostType> IncompleteSymbolicFactorizationType;

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> CrsMatrixBaseDeviceType;

    typedef CrsMatrixView<CrsMatrixBaseDeviceType> CrsMatrixViewDeviceType;
    typedef TaskView<CrsMatrixViewDeviceType> CrsTaskViewDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;

    CrsMatrixBaseHostType AA_host("AA_host");
    timer.reset();
    {
      std::ifstream in;
      in.open(file_input);
      if (!in.good()) {
        std::cout << "Failed in open the file: " << file_input << std::endl;
        return -1;
      }
      MatrixMarket::read(AA_host, in);
    }
    double t_read = timer.seconds();

    if (verbose)
      AA_host.showMe(std::cout) << std::endl;

    typename GraphToolsHostType::size_type_array rptr("Graph::RowPtrArray", AA_host.NumRows() + 1);
    typename GraphToolsHostType::ordinal_type_array cidx("Graph::ColIndexArray", AA_host.NumNonZeros());

    timer.reset();
    GraphToolsHostType::getGraph(rptr, cidx, AA_host);
    double t_graph = timer.seconds();

    GraphToolsHostType_Scotch S;
    S.setGraph(AA_host.NumRows(), rptr, cidx);
    S.setSeed(0);
    S.setTreeLevel();
    S.setStrategy( SCOTCH_STRATSPEED
                   | SCOTCH_STRATLEVELMAX  
                   | SCOTCH_STRATLEVELMIN  
                   | SCOTCH_STRATLEAFSIMPLE
                   | SCOTCH_STRATSEPASIMPLE
                   );

    timer.reset();
    S.computeOrdering(treecut);
    double t_scotch = timer.seconds();

    S.pruneTree(prunecut);
    if (verbose)
      S.showMe(std::cout) << std::endl;

    CrsMatrixBaseHostType BB_host("BB_host");
    BB_host.createConfTo(AA_host);

    CrsMatrixTools::copy(BB_host,
                         S.PermVector(),
                         S.InvPermVector(),
                         AA_host);

    if (verbose)
      BB_host.showMe(std::cout) << std::endl;

    timer.reset();
    GraphToolsHostType::getGraph(rptr, cidx, BB_host);
    t_graph += timer.seconds();

    GraphToolsHostType_CAMD C;
    C.setGraph(BB_host.NumRows(),
               rptr, cidx,
               S.NumBlocks(),
               S.RangeVector());

    timer.reset();
    C.computeOrdering();
    double t_camd = timer.seconds();

    if (verbose)
      C.showMe(std::cout) << std::endl;

    CrsMatrixBaseHostType CC_host("CC_host");
    CC_host.createConfTo(BB_host);

    CrsMatrixTools::copy(CC_host,
                         C.PermVector(),
                         C.InvPermVector(),
                         BB_host);

    if (verbose)
      CC_host.showMe(std::cout) << std::endl;

    CrsMatrixBaseHostType DD_host("DD_host");

    timer.reset();
    IncompleteSymbolicFactorizationType::createNonZeroPattern(DD_host,
                                                    fill_level,
                                                    Uplo::Upper,
                                                    CC_host,
                                                    rows_per_team);
    double t_symbolic = timer.seconds();

    if (verbose)
      DD_host.showMe(std::cout) << std::endl;

    // ==================================================================================

    CrsMatrixBaseDeviceType AA_device("AA_device");
    AA_device.mirror(DD_host);

    const size_type max_concurrency = 10;
    const size_type max_task_size = (3*sizeof(CrsTaskViewDeviceType)+sizeof(PolicyType)+128);
    const size_type max_task_dependence = 0;
    const size_type team_size = 1;

    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);

    CrsMatrixViewDeviceType A_device(AA_device);
    Kokkos::View<typename CrsMatrixViewDeviceType::row_view_type*,DeviceSpaceType> 
      rowviews("RowViewInMatView", A_device.NumRows());
    A_device.setRowViewArray(rowviews);
    
    timer.reset();
    int ierr = Chol<Uplo::Upper,AlgoChol::Unblocked,Variant::One>::invoke
      (policy, policy.member_single(),
       A_device);
    double t_chol = timer.seconds();
    TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)");

    if (verbose) {
      DD_host.mirror(AA_device);
      DD_host.showMe(std::cout) << std::endl;
    }

    {
      const auto prec = std::cout.precision();
      std::cout.precision(4);

      std::cout << std::scientific;
      std::cout << "SymbolicFactorization:: Given matrix  dimension = " << AA_host.NumRows() << " x " << AA_host.NumCols()
                << ", " << " nnz = " << AA_host.NumNonZeros() << std::endl;
      std::cout << "SymbolicFactorization:: Upper factors dimension = " << DD_host.NumRows() << " x " << DD_host.NumCols()
                << ", " << " nnz = " << DD_host.NumNonZeros() << std::endl;

      std::cout << "SymbolicFactorization:: "
                << "read = " << t_read << " [sec], "
                << "graph generation = " << (t_graph/2.0) << " [sec] "
                << "scotch reordering = " << t_scotch << " [sec] "
                << "camd reordering = " << t_camd << " [sec] "
                << "symbolic factorization = " << t_symbolic << " [sec] "
                << "Cholesky factorization = " << t_chol << " [sec] "
                << std::endl;

      std::cout.unsetf(std::ios::scientific);
      std::cout.precision(prec);
    }

    return r_val;
  }
Пример #18
0
  KOKKOS_INLINE_FUNCTION
  int exampleStatByBlocks(const string file_input,
                          const int treecut,
                          const int minblksize,
                          const int prunecut,
                          const int seed,
                          const int fill_level,
                          const int league_size,
                          const int histogram_size,
                          const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
    typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef CrsMatrixBase<CrsMatrixViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "StatByBlocks:: import input file = " << file_input << endl;
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      t = timer.seconds();

      if (verbose)
        cout << AA << endl;
    }
    cout << "StatByBlocks:: import input file::time = " << t << endl;


    CrsMatrixBaseType UU("UU");
    CrsHierMatrixBaseType HU("HU");
    {
      CrsMatrixBaseType PA("Permuted AA");
      typename GraphHelperType::size_type_array rptr(AA.Label()+"Graph::RowPtrArray", AA.NumRows() + 1);
      typename GraphHelperType::ordinal_type_array cidx(AA.Label()+"Graph::ColIndexArray", AA.NumNonZeros());

      AA.convertGraph(rptr, cidx);
      GraphHelperType S(AA.Label()+"ScotchHelper",
                        AA.NumRows(),
                        rptr,
                        cidx,
                        seed);
      {
        timer.reset();

        S.computeOrdering(treecut, minblksize);
        S.pruneTree(prunecut);

        PA.copy(S.PermVector(), S.InvPermVector(), AA);

        t = timer.seconds();

        if (verbose)
          cout << S << endl;
      }
      cout << "StatByBlocks:: reorder the matrix::time = " << t << endl;

      {
        SymbolicFactorHelperType F(PA, league_size);
        timer.reset();

        F.createNonZeroPattern(fill_level, Uplo::Upper, UU);

        t = timer.seconds();
        cout << "StatByBlocks:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl;
      }
      cout << "StatByBlocks:: symbolic factorization::time = " << t << endl;

      {
        timer.reset();

        CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   S.TreeVector());

        for (ordinal_type k=0;k<HU.NumNonZeros();++k)
          HU.Value(k).fillRowViewArray();

        t = timer.seconds();
        cout << "StatByBlocks:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
      }
      cout << "StatByBlocks:: construct hierarchical matrix::time = " << t << endl;
    }

    {
      cout << endl;
      cout << " -- Flat matrix: UU --" << endl;
      cout << "    # of Rows     = " << UU.NumRows() << endl;
      cout << "    # of Cols     = " << UU.NumCols() << endl;
      cout << "    # of Nonzeros = " << UU.NumNonZeros() << endl;
      cout << endl;
      cout << " -- Hierarchical matrix: HU --" << endl;
      cout << "    # of Rows     = " << HU.NumRows() << endl;
      cout << "    # of Cols     = " << HU.NumCols() << endl;
      cout << "    # of Nonzeros = " << HU.NumNonZeros() << endl;
      cout << endl;
      cout << " -- Blocks of HU --" << endl;

      map<size_type,size_type> histogram;

      if (HU.NumNonZeros()) {

        size_type
          nnz_min = HU.Value(0).countNumNonZeros(),
          nnz_max = nnz_min,
          nnz_sum = 0,
          nnz_ave = 0;

        size_type nnz_cnt = 0;
        for (ordinal_type k=0;k<HU.NumNonZeros();++k) {
          const auto nnz_blk = HU.Value(k).countNumNonZeros();
          if (nnz_blk) {
            nnz_min  = min(nnz_min, nnz_blk);
            nnz_max  = max(nnz_max, nnz_blk);
            nnz_sum += nnz_blk;
            ++nnz_cnt;

            if (histogram_size)
              ++histogram[nnz_blk/histogram_size];
          }
        }
        nnz_ave = nnz_sum/nnz_cnt;

        cout << "    Min # of Nonzeros = " << nnz_min << endl;
        cout << "    Max # of Nonzeros = " << nnz_max << endl;
        cout << "    Ave # of Nonzeros = " << nnz_ave << endl;
        cout << "    Sum # of Nonzeros = " << nnz_sum << endl;
        cout << "    # of empty Blocks = " << (HU.NumNonZeros() - nnz_cnt) << endl;

        if (histogram_size) {
          cout << "    Histogram" << endl;
          for (auto it=histogram.begin();it!=histogram.end();++it)
            cout << (it->first*histogram_size) << " , " << it->second << endl;
        }
      } else {
        cout << "    No registered blocks" << endl;
      }

    }

    return r_val;
  }
Пример #19
0
  KOKKOS_INLINE_FUNCTION
  int exampleDenseTrsmMKL(const OrdinalType mmin,
                          const OrdinalType mmax,
                          const OrdinalType minc,
                          const OrdinalType k,
                          const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "DenseGemmMKL:: test matrices "
         <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << endl;

    ostringstream os;
    os.precision(3);
    os << scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");

      DenseMatrixBaseType AA("AA", m, m), BB("BB", m, k), BC("BC", m, k);
      
      // setup upper triangular
      for (ordinal_type j=0;j<AA.NumCols();++j) {
        AA.Value(j,j) = 10.0;
        for (ordinal_type i=0;i<j;++i)
          AA.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
      }

      // setup one and right hand side is going to be overwritten by the product of AB
      for (ordinal_type j=0;j<BB.NumCols();++j)
        for (ordinal_type i=0;i<BB.NumRows();++i)
          BB.Value(i,j) = 1.0;

      Teuchos::BLAS<ordinal_type,value_type> blas;

      blas.GEMM(Teuchos::CONJ_TRANS, Teuchos::NO_TRANS,
                m, k, m,
                1.0,
                AA.ValuePtr(), AA.ColStride(),
                BB.ValuePtr(), BB.ColStride(),
                0.0,
                BC.ValuePtr(), BC.ColStride());
      BB.copy(BC);

      const double flop = get_flop_trsm_upper<value_type>(m, k);

      os << "DenseTrsmMKL:: m = " << m << " k = " << k;
      {
        timer.reset();
        Teuchos::BLAS<ordinal_type,value_type> blas;

        const ordinal_type mm = AA.NumRows();
        const ordinal_type nn = BB.NumCols();

        blas.TRSM(Teuchos::LEFT_SIDE, Teuchos::UPPER_TRI, Teuchos::CONJ_TRANS,
                  Teuchos::NON_UNIT_DIAG,
                  mm, nn,
                  1.0,
                  AA.ValuePtr(), AA.ColStride(),
                  BB.ValuePtr(), BB.ColStride());
        t = timer.seconds();
        os << ":: MKL Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }
      cout << os.str() << endl;
    }

    return r_val;
  }
  int exampleCholByBlocks(const std::string file_input,
                          const int treecut,
                          const int prunecut,
                          const int fill_level,
                          const int rows_per_team,
                          const int max_concurrency,
                          const int max_task_dependence,
                          const int team_size,
                          const bool check,
                          const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    // for simple test, let's use host space only here, for device it needs mirroring.

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType;
    typedef CrsMatrixView<CrsMatrixBaseHostType> CrsMatrixViewHostType;

    typedef GraphTools<ordinal_type,size_type,HostSpaceType> GraphToolsHostType;

    typedef GraphTools_Scotch<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_Scotch;
    typedef GraphTools_CAMD<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_CAMD;

    typedef IncompleteSymbolicFactorization<CrsMatrixBaseHostType> IncompleteSymbolicFactorizationType;

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef TaskView<CrsMatrixViewHostType> CrsTaskViewHostType;
    typedef CrsMatrixBase<CrsTaskViewHostType,ordinal_type,size_type,HostSpaceType> CrsHierBaseHostType;
    typedef CrsMatrixView<CrsHierBaseHostType> CrsHierViewHostType;
    typedef TaskView<CrsHierViewHostType> CrsTaskHierViewHostType;

    int r_val = 0;
    
    Kokkos::Impl::Timer timer;

    ///
    /// Read from matrix market
    ///
    ///     input  - file
    ///     output - AA_host
    ///
    CrsMatrixBaseHostType AA_host("AA_host");
    timer.reset();
    {
      std::ifstream in;
      in.open(file_input);
      if (!in.good()) {
        std::cout << "Failed in open the file: " << file_input << std::endl;
        return -1;
      }
      MatrixMarket::read(AA_host, in);
    }
    double t_read = timer.seconds();

    if (verbose)
      AA_host.showMe(std::cout) << std::endl;

    ///
    /// Create a graph structure for Scotch and CAMD (rptr, cidx)
    ///     
    ///     rptr and cidx are need to be set up for Scotch and CAMD
    ///
    typename GraphToolsHostType::size_type_array rptr("Graph::RowPtrArray", AA_host.NumRows() + 1);
    typename GraphToolsHostType::ordinal_type_array cidx("Graph::ColIndexArray", AA_host.NumNonZeros());

    ///
    /// Run Scotch
    ///
    ///     input  - rptr, cidx, A_host
    ///     output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted)
    ///
    timer.reset();
    GraphToolsHostType::getGraph(rptr, cidx, AA_host);
    double t_graph = timer.seconds();

    GraphToolsHostType_Scotch S;
    S.setGraph(AA_host.NumRows(), rptr, cidx);
    S.setSeed(0);
    S.setTreeLevel();
    S.setStrategy( SCOTCH_STRATSPEED 
                   | SCOTCH_STRATLEVELMAX   
                   | SCOTCH_STRATLEVELMIN   
                   | SCOTCH_STRATLEAFSIMPLE 
                   | SCOTCH_STRATSEPASIMPLE 
                   );
    
    timer.reset();
    S.computeOrdering(treecut);
    double t_scotch = timer.seconds();
    
    if (verbose)
      S.showMe(std::cout) << std::endl;

    CrsMatrixBaseHostType AA_scotch_host("AA_scotch_host");
    AA_scotch_host.createConfTo(AA_host);
    
    CrsMatrixTools::copy(AA_scotch_host,
                         S.PermVector(),
                         S.InvPermVector(),
                         AA_host);

    if (verbose)
      AA_scotch_host.showMe(std::cout) << std::endl;

    ///
    /// Run CAMD
    ///
    ///     input  - rptr, cidx, A_host
    ///     output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted)
    ///
    timer.reset();
    GraphToolsHostType::getGraph(rptr, cidx, AA_scotch_host);
    t_graph += timer.seconds();

    GraphToolsHostType_CAMD C;
    C.setGraph(AA_scotch_host.NumRows(),
               rptr, cidx,
               S.NumBlocks(),
               S.RangeVector());

    timer.reset();
    C.computeOrdering();
    double t_camd = timer.seconds();

    if (verbose)
      C.showMe(std::cout) << std::endl;
    
    CrsMatrixBaseHostType AA_camd_host("AA_camd_host");
    AA_camd_host.createConfTo(AA_scotch_host);
    
    CrsMatrixTools::copy(AA_camd_host,
                         C.PermVector(),
                         C.InvPermVector(),
                         AA_scotch_host);
    
    if (verbose)
      AA_camd_host.showMe(std::cout) << std::endl;

    ///
    /// Symbolic factorization
    ///
    ///     input  - 
    ///     output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted)
    ///
    CrsMatrixBaseHostType AA_factor_host("AA_factor_host");

    timer.reset();
    IncompleteSymbolicFactorizationType::createNonZeroPattern(AA_factor_host,
                                                    fill_level,
                                                    Uplo::Upper,
                                                    AA_camd_host,
                                                    rows_per_team);
    double t_symbolic = timer.seconds();

    if (verbose)
      AA_factor_host.showMe(std::cout) << std::endl;

    ///
    /// Clean tempoerary matrices
    ///
    ///     input  - AA_scotch_host, AA_camd_host, C, rptr, cidx
    ///     output - none
    ///
    AA_scotch_host = CrsMatrixBaseHostType();
    AA_camd_host   = CrsMatrixBaseHostType();

    C = GraphToolsHostType_CAMD();
    rptr = typename GraphToolsHostType::size_type_array();
    cidx = typename GraphToolsHostType::ordinal_type_array();

    ///
    /// Create task policy
    ///
    ///     input  - max_task_size
    ///     output - policy
    ///
    const size_type max_task_size = (3*sizeof(CrsTaskViewHostType)+sizeof(PolicyType)+128);

    timer.reset();
    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);
    double t_policy = timer.seconds();

    ///
    /// Sequential execution
    ///
    ///     input  - AA_factor_host (matrix to be compared), rowviews
    ///     output - BB_factor_host, B_factor_host
    ///
    double t_chol_serial = 0;
    CrsMatrixBaseHostType BB_factor_host("BB_factor_host");
    if (check) {
      BB_factor_host.createConfTo(AA_factor_host);
      CrsMatrixTools::copy(BB_factor_host, AA_factor_host);
      
      CrsTaskViewHostType B_factor_host(BB_factor_host);
      Kokkos::View<typename CrsTaskViewHostType::row_view_type*,HostSpaceType>
        rowviews("RowViewInMatView", B_factor_host.NumRows());
      B_factor_host.setRowViewArray(rowviews);
      
      timer.reset();
      {
        auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::Unblocked,Variant::One>
                                              ::createTaskFunctor(policy, B_factor_host));
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)");
      }
      t_chol_serial = timer.seconds();

      if (verbose)
        BB_factor_host.showMe(std::cout) << std::endl;
    }

    ///
    /// Task parallel execution
    ///
    ///    input  - AA_factor_host, rowviews
    ///    output - HA_factor_host, AA_factor_host, B_factor_host
    ///
    double t_hier = 0, t_blocks = 0, t_chol_parallel = 0;
    CrsHierBaseHostType HA_factor_host("HA_factor_host");
    {
      timer.reset();
      S.pruneTree(prunecut);
      CrsMatrixTools::createHierMatrix(HA_factor_host, 
                                       AA_factor_host, 
                                       S.NumBlocks(),
                                       S.RangeVector(),
                                       S.TreeVector());
      t_hier = timer.seconds();    
      
      timer.reset();
      size_type nblocks = HA_factor_host.NumNonZeros();
      
      Kokkos::View<ordinal_type*,HostSpaceType> 
        ap_rowview_blocks("NumRowViewInBlocks", nblocks + 1);
      
      ap_rowview_blocks(0) = 0;
      for (ordinal_type k=0;k<nblocks;++k) 
        ap_rowview_blocks(k+1) = ap_rowview_blocks(k) + HA_factor_host.Value(k).NumRows();
      
      Kokkos::View<typename CrsMatrixViewHostType::row_view_type*,HostSpaceType>
        rowview_blocks("RowViewInBlocks", ap_rowview_blocks(nblocks));
      
      Kokkos::parallel_for(Kokkos::RangePolicy<HostSpaceType>(0, nblocks),
                           [&](const ordinal_type k) {
                             const ordinal_type begin = ap_rowview_blocks(k);
                             const ordinal_type end   = ap_rowview_blocks(k+1);
                             HA_factor_host.Value(k).setRowViewArray
                               (Kokkos::subview(rowview_blocks, Kokkos::pair<ordinal_type,ordinal_type>(begin, end)));
                           } );
      CrsMatrixTools::filterEmptyBlocks(HA_factor_host);
      t_blocks = timer.seconds();    

      {
        size_type nblocks_filtered = HA_factor_host.NumNonZeros(), nnz_blocks = 0;
        for (size_type k=0;k<nblocks_filtered; ++k) 
          nnz_blocks += HA_factor_host.Value(k).NumNonZeros();
        
        TACHO_TEST_FOR_ABORT( nnz_blocks != AA_factor_host.NumNonZeros(),
                              "nnz counted in blocks is different from nnz in the base matrix.");
      }
      
      CrsTaskHierViewHostType H_factor_host(HA_factor_host);
      timer.reset();
      {
        auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks,Variant::One>
                                              ::createTaskFunctor(policy, H_factor_host));
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)");
      }
      t_chol_parallel = timer.seconds();
      
      if (verbose)
        AA_factor_host.showMe(std::cout) << std::endl;
    }

    if (check) {
      double diff = 0, norm = 0;    
      TACHO_TEST_FOR_ABORT( BB_factor_host.NumNonZeros() != AA_factor_host.NumNonZeros(),
                            "nnz used in serial is not same as nnz used in parallel");

      const size_type nnz = AA_factor_host.NumNonZeros();
      for (size_type k=0;k<nnz;++k) {
        norm += Util::abs(BB_factor_host.Value(k));
        diff += Util::abs(AA_factor_host.Value(k) - BB_factor_host.Value(k));
      }
      std::cout << std::scientific;
      std::cout << "CholByBlocks:: check with serial execution " << std::endl
                << "               diff = " << diff << ", norm = " << norm << ", rel err = " << (diff/norm) << std::endl;
      std::cout.unsetf(std::ios::scientific);      
    }

    {
      const auto prec = std::cout.precision();
      std::cout.precision(4);

      std::cout << std::scientific;
      std::cout << "CholByBlocks:: Given    matrix = " << AA_host.NumRows() << " x " << AA_host.NumCols() << ", nnz = " << AA_host.NumNonZeros() << std::endl;
      std::cout << "CholByBlocks:: Factored matrix = " << AA_factor_host.NumRows() << " x " << AA_factor_host.NumCols() << ", nnz = " << AA_factor_host.NumNonZeros() << std::endl;
      std::cout << "CholByBlocks:: Hier     matrix = " << HA_factor_host.NumRows() << " x " << HA_factor_host.NumCols() << ", nnz = " << HA_factor_host.NumNonZeros() << std::endl;

      std::cout << "CholByBlocks:: "
                << "read = " << t_read << " [sec], "
                << "graph generation = " << (t_graph/2.0) << " [sec] "
                << "scotch reordering = " << t_scotch << " [sec] "
                << "camd reordering = " << t_camd << " [sec] " 
                << std::endl
                << "CholByBlocks:: "
                << "symbolic factorization = " << t_symbolic << " [sec] "
                << std::endl
                << "CholByBlocks:: "
                << "policy creation = " << t_policy << " [sec] "
                << "hier creation = " << t_hier << " [sec] "
                << "block specification = " << t_blocks << " [sec] "
                << std::endl
                << "CholByBlocks:: "
                << "Chol Parallel = " << t_chol_parallel << " [sec] ";
      if (check) 
        std::cout << "Chol Serial = " << (check ? t_chol_serial : -1) << " [sec] "
                  << "speed-up = " << (t_chol_serial/t_chol_parallel) << " [sec] ";
      
      std::cout << std::endl;
      
      std::cout.unsetf(std::ios::scientific);
      std::cout.precision(prec);
    }

    return r_val;
  }
Пример #21
0
  KOKKOS_INLINE_FUNCTION
  int exampleCholPerformance(const string file_input,
                              const int treecut,
                              const int minblksize,
                              const int prunecut,
                              const int seed,
                              const int niter,
                              const int nthreads,
                              const int max_task_dependence,
                              const int team_size,
                              const int fill_level,
                              const int league_size,
                              const bool team_interface,
                              const bool skip_serial,
                              const bool mkl_interface,
                              const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
    typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

#ifdef HAVE_SHYLUTACHO_MKL 
    typedef typename CrsMatrixBaseType::value_type_array value_type_array;
#endif

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
    typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double 
      t_import = 0.0,
      t_reorder = 0.0,
      t_symbolic = 0.0,
      t_flat2hier = 0.0,
#ifdef HAVE_SHYLUTACHO_MKL 
      t_mkl_seq = 0.0,
#endif
      t_factor_seq = 0.0,
      t_factor_task = 0.0;
    const int start = -2;
    
    cout << "CholPerformance:: import input file = " << file_input << endl;        
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      t_import = timer.seconds();

      if (verbose)
        cout << AA << endl;
    }
    cout << "CholPerformance:: import input file::time = " << t_import << endl;

    cout << "CholPerformance:: reorder the matrix" << endl;        
    CrsMatrixBaseType PA("Permuted AA");
    CrsMatrixBaseType UU("UU");     // permuted base upper triangular matrix
    CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views

    {
      GraphHelperType S(AA, seed);
      {
        timer.reset();
        
        S.computeOrdering(treecut, minblksize);
        S.pruneTree(prunecut);
        
        PA.copy(S.PermVector(), S.InvPermVector(), AA);
        
        t_reorder = timer.seconds();

        if (verbose)
          cout << S << endl
               << PA << endl;
      }

      cout << "CholPerformance:: reorder the matrix::time = " << t_reorder << endl;            
      {
        SymbolicFactorHelperType F(PA, league_size);
        for (int i=start;i<niter;++i) {
          timer.reset();
        
          F.createNonZeroPattern(fill_level, Uplo::Upper, UU);

          // UU.copy(Uplo::Upper, PA);

          t_symbolic += timer.seconds() * (i>=0);
        }
        t_symbolic /= niter;

        cout << "CholPerformance:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl;

        if (verbose)
          cout << F << endl
               << UU << endl;
      }
      cout << "CholPerformance:: symbolic factorization::time = " << t_symbolic << endl;            
      {
        timer.reset();

        CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   S.TreeVector());
        
        for (ordinal_type k=0;k<HU.NumNonZeros();++k)
          HU.Value(k).fillRowViewArray();
        
        t_flat2hier = timer.seconds();
        
        cout << "CholPerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
      }
      cout << "CholPerformance:: construct hierarchical matrix::time = " << t_flat2hier << endl;            
    }

    // copy of UU
    CrsMatrixBaseType RR("RR");
    RR.copy(UU);

#ifdef HAVE_SHYLUTACHO_MKL
    if (!skip_serial && mkl_interface) {
      cout << "CholPerformance:: MKL factorize the matrix" << endl;
      CrsMatrixBaseType MM("MM");
      for (int i=start;i<niter;++i) {
        MM.copy(RR);
        MM.hermitianize(Uplo::Upper);

        MKL_INT  n  = static_cast<MKL_INT>(MM.NumRows());
        double  *a  = static_cast<double*>(MM.ValuePtr());
        MKL_INT *ia = static_cast<MKL_INT*>(MM.RowPtr());
        MKL_INT *ja = static_cast<MKL_INT*>(MM.ColPtr());

        // convert to 1-based matrix
        {
          for (ordinal_type k=0;k<(MM.NumRows()+1);++k)
            ++ia[k];
          
          for (size_type k=0;k<MM.NumNonZeros();++k)
            ++ja[k];
        }
        value_type_array mkl_result = value_type_array("mkl-ilu-values", MM.NumNonZeros());
        double  *bilu0 = static_cast<double*>(&mkl_result[0]);
        MKL_INT ipar[128];
        double dpar[128];
        MKL_INT ierr;
        
        // we provide ilu-k pattern 
        timer.reset();
        dcsrilu0(&n, a, ia, ja, bilu0, ipar, dpar, &ierr);
        t_mkl_seq += timer.seconds() * (i>=0) * 0.5;
        
        if (ierr != 0) 
          cout << " MKL Error = " << ierr << endl;
      }
      t_mkl_seq /= niter;
      cout << "CholPerformance:: MKL factorize the matrix::time = " << t_mkl_seq << endl;
    }
#endif

    if (!skip_serial) {
#ifdef __USE_FIXED_TEAM_SIZE__
      typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
      typename TaskFactoryType::policy_type policy(max_task_dependence, 1);
#endif
      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);

      CrsTaskViewType U(&UU);
      U.fillRowViewArray();

      cout << "CholPerformance:: Serial factorize the matrix" << endl;
      {
        for (int i=start;i<niter;++i) {
          UU.copy(RR);
          timer.reset();          
          // {
          //   auto future = TaskFactoryType::Policy().create(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
          //                                                  ::TaskFunctor<CrsTaskViewType>(U), 0);
          //   TaskFactoryType::Policy().spawn(future);
          //   Kokkos::Experimental::wait(TaskFactoryType::Policy());
          // }
          {
            Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
              ::invoke(TaskFactoryType::Policy(),
                                TaskFactoryType::Policy().member_single(),
                                U);
          }
          t_factor_seq += timer.seconds() * (i>=0);
        }
        t_factor_seq /= niter;

        if (verbose)
          cout << UU << endl;
      }
      cout << "CholPerformance:: Serial factorize the matrix::time = " << t_factor_seq << endl;
    }

//     {
// #ifdef __USE_FIXED_TEAM_SIZE__
//       typename TaskFactoryType::policy_type policy(max_task_dependence);
// #else
//       typename TaskFactoryType::policy_type policy(max_task_dependence, nthreads);
// #endif
//       TaskFactoryType::setPolicy(&policy);

//       CrsTaskViewType U(&UU);
//       U.fillRowViewArray();

//       cout << "CholPerformance:: Team factorize the matrix:: team_size = " << nthreads << endl;
//       {
//         timer.reset();
        
//         auto future = TaskFactoryType::Policy().create(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
//                                                        ::TaskFunctor<CrsTaskViewType>(U), 0);
//         TaskFactoryType::Policy().spawn(future);
//         Kokkos::Experimental::wait(TaskFactoryType::Policy());
        
//         t_factor_team = timer.seconds();
        
//         if (verbose)
//           cout << UU << endl;
//       }
//       cout << "CholPerformance:: Team factorize the matrix::time = " << t_factor_team << endl;
//     }

    {
#ifdef __USE_FIXED_TEAM_SIZE__
      typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
      typename TaskFactoryType::policy_type policy(max_task_dependence, team_size);
#endif
      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);
      
      cout << "CholPerformance:: ByBlocks factorize the matrix:: team_size = " << team_size << endl;
      CrsHierTaskViewType H(&HU);
      {
        for (int i=start;i<niter;++i) {
          UU.copy(RR);
          timer.reset();
          {
            auto future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks>::
                                                                TaskFunctor<CrsHierTaskViewType>(H), 0);
            TaskFactoryType::Policy().spawn(future);
            Kokkos::Experimental::wait(TaskFactoryType::Policy());
          }
          t_factor_task += timer.seconds() * (i>=0);
        }
        t_factor_task /= niter;

        if (verbose)
          cout << UU << endl;
      }  
      cout << "CholPerformance:: ByBlocks factorize the matrix::time = " << t_factor_task << endl;
    }

    if (!skip_serial) {
#ifdef HAVE_SHYLUTACHO_MKL
      cout << "CholPerformance:: mkl/chol scale [mkl/chol] = " << t_mkl_seq/t_factor_seq << endl;    
      cout << "CholPerformance:: mkl/task  scale [mkl/task]  = " << t_mkl_seq/t_factor_task << endl;    
#else
      cout << "CholPerformance:: task scale [seq/task] = " << t_factor_seq/t_factor_task << endl;    
#endif
      //cout << "CholPerformance:: team scale [seq/team] = " << t_factor_seq/t_factor_team << endl;    
    }

    return r_val;
  }
Пример #22
0
void VerletKokkos::run(int n)
{
  bigint ntimestep;
  int nflag,sortflag;

  int n_post_integrate = modify->n_post_integrate;
  int n_pre_exchange = modify->n_pre_exchange;
  int n_pre_neighbor = modify->n_pre_neighbor;
  int n_pre_force = modify->n_pre_force;
  int n_post_force = modify->n_post_force;
  int n_end_of_step = modify->n_end_of_step;

  if (atomKK->sortfreq > 0) sortflag = 1;
  else sortflag = 0;

  static double time = 0.0;
  static int count = 0;
  atomKK->sync(Device,ALL_MASK);
  Kokkos::Impl::Timer ktimer;

  for (int i = 0; i < n; i++) {

    ntimestep = ++update->ntimestep;
    ev_set(ntimestep);

    // initial time integration

    ktimer.reset();
    timer->stamp();
    modify->initial_integrate(vflag);
    time += ktimer.seconds();
    if (n_post_integrate) modify->post_integrate();
    timer->stamp(Timer::MODIFY);

    // regular communication vs neighbor list rebuild

    nflag = neighbor->decide();

    if (nflag == 0) {
      timer->stamp();
      comm->forward_comm();
      timer->stamp(Timer::COMM);
    } else {
      // added debug
      //atomKK->sync(Host,ALL_MASK);
      //atomKK->modified(Host,ALL_MASK);

      if (n_pre_exchange) {
        timer->stamp();
        modify->pre_exchange();
        timer->stamp(Timer::MODIFY);
      }
      // debug
      //atomKK->sync(Host,ALL_MASK);
      //atomKK->modified(Host,ALL_MASK);
      if (triclinic) domain->x2lamda(atomKK->nlocal);
      domain->pbc();
      if (domain->box_change) {
        domain->reset_box();
        comm->setup();
        if (neighbor->style) neighbor->setup_bins();
      }
      timer->stamp();

      // added debug
      //atomKK->sync(Device,ALL_MASK);
      //atomKK->modified(Device,ALL_MASK);

      comm->exchange();
      if (sortflag && ntimestep >= atomKK->nextsort) atomKK->sort();
      comm->borders();

      // added debug
      //atomKK->sync(Host,ALL_MASK);
      //atomKK->modified(Host,ALL_MASK);

      if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost);

      timer->stamp(Timer::COMM);
      if (n_pre_neighbor) {
        modify->pre_neighbor();
        timer->stamp(Timer::MODIFY);
      }
      neighbor->build();
      timer->stamp(Timer::NEIGH);
    }

    // force computations
    // important for pair to come before bonded contributions
    // since some bonded potentials tally pairwise energy/virial
    // and Pair:ev_tally() needs to be called before any tallying

    force_clear();

    timer->stamp();

    // added for debug
    //atomKK->k_x.sync<LMPHostType>();
    //atomKK->k_f.sync<LMPHostType>();
    //atomKK->k_f.modify<LMPHostType>();
    if (n_pre_force) {
      modify->pre_force(vflag);
      timer->stamp(Timer::MODIFY);
    }


    if (pair_compute_flag) {
      atomKK->sync(force->pair->execution_space,force->pair->datamask_read);
      atomKK->modified(force->pair->execution_space,force->pair->datamask_modify);
      force->pair->compute(eflag,vflag);
      timer->stamp(Timer::PAIR);
    }

    if (atomKK->molecular) {
      if (force->bond) {
        atomKK->sync(force->bond->execution_space,force->bond->datamask_read);
        atomKK->modified(force->bond->execution_space,force->bond->datamask_modify);
        force->bond->compute(eflag,vflag);
      }
      if (force->angle) {
        atomKK->sync(force->angle->execution_space,force->angle->datamask_read);
        atomKK->modified(force->angle->execution_space,force->angle->datamask_modify);
        force->angle->compute(eflag,vflag);
      }
      if (force->dihedral) {
        atomKK->sync(force->dihedral->execution_space,force->dihedral->datamask_read);
        atomKK->modified(force->dihedral->execution_space,force->dihedral->datamask_modify);
        force->dihedral->compute(eflag,vflag);
      }
      if (force->improper) {
        atomKK->sync(force->improper->execution_space,force->improper->datamask_read);
        atomKK->modified(force->improper->execution_space,force->improper->datamask_modify);
        force->improper->compute(eflag,vflag);
      }
      timer->stamp(Timer::BOND);
    }

    if (kspace_compute_flag) {
      atomKK->sync(force->kspace->execution_space,force->kspace->datamask_read);
      atomKK->modified(force->kspace->execution_space,force->kspace->datamask_modify);
      force->kspace->compute(eflag,vflag);
      timer->stamp(Timer::KSPACE);
    }

    // reverse communication of forces

    if (force->newton) comm->reverse_comm();
    timer->stamp(Timer::COMM);

    // force modifications, final time integration, diagnostics

    ktimer.reset();

    if (n_post_force) modify->post_force(vflag);
    modify->final_integrate();
    if (n_end_of_step) modify->end_of_step();
    timer->stamp(Timer::MODIFY);

    time += ktimer.seconds();

    // all output

    if (ntimestep == output->next) {
       atomKK->sync(Host,ALL_MASK);

      timer->stamp();
      output->write(ntimestep);
      timer->stamp(Timer::OUTPUT);
    }
  }
}
Пример #23
0
  BASKER_INLINE
  int Basker<Int, Entry, Exe_Space>::factor_notoken(Int option)
  {

    //printf("factor no token called \n");

    gn = A.ncol;
    gm = A.nrow;
    BASKER_MATRIX ATEMP;

    //Kokkos::Impl::Timer tza;
    if(Options.btf == BASKER_TRUE)
      {
	//JDB: We can change this for the new inteface
	gn = A.ncol;
	gm = A.nrow;
	ATEMP = A;
	A = BTF_A; 
      }
    //printf("Switch time: %f \n", tza.seconds());

   

    //Spit into Domain and Sep
    //----------------------Domain-------------------------//
    #ifdef BASKER_KOKKOS

    //====TIMER==
    #ifdef BASKER_TIME
    Kokkos::Impl::Timer       timer;
    #endif
    //===TIMER===

    typedef Kokkos::TeamPolicy<Exe_Space>        TeamPolicy;

    if(btf_tabs_offset != 0)
      {

	if(Options.verbose == BASKER_TRUE)
	  {
	    printf("Factoring Dom num_threads: %d \n",
		   num_threads);
	  }


	Int domain_restart = 0;
	kokkos_nfactor_domain <Int,Entry,Exe_Space>
	  domain_nfactor(this);
	Kokkos::parallel_for(TeamPolicy(num_threads,1), 
			     domain_nfactor);
	Kokkos::fence();
    

    //=====Check for error======
    while(true)
      {
	INT_1DARRAY thread_start;
	MALLOC_INT_1DARRAY(thread_start, num_threads+1);
	init_value(thread_start, num_threads+1, 
		   (Int) BASKER_MAX_IDX);
	int nt = nfactor_domain_error(thread_start);
	if((nt == BASKER_SUCCESS) ||
	   (nt == BASKER_ERROR) ||
	   (domain_restart > BASKER_RESTART))
	  {
	    break;
	  }
	else
	  {
	    domain_restart++;
	    if(Options.verbose == BASKER_TRUE)
	      {
		printf("restart \n");
	      }
	    kokkos_nfactor_domain_remalloc <Int, Entry, Exe_Space>
	      diag_nfactor_remalloc(this, thread_start);
	    Kokkos::parallel_for(TeamPolicy(num_threads,1),
				 diag_nfactor_remalloc);
	    Kokkos::fence();
	  }
      }//end while

    //====TIMER===
    #ifdef BASKER_TIME
    printf("Time DOMAIN: %f \n", timer.seconds());
    timer.reset();
    #endif
    //====TIMER====
    

    #else// else basker_kokkos
    #pragma omp parallel
    {


    }//end omp parallel
    #endif //end basker_kokkos

      }
    //-------------------End--Domian--------------------------//
    
    //printVec("domperm.csc", gpermi, A.nrow);
   
    //---------------------------Sep--------------------------//
   
    if(btf_tabs_offset != 0)
      {
    //for(Int l=1; l<=4; l++)
    for(Int l=1; l <= tree.nlvls; l++)
      {

	//#ifdef BASKER_OLD_BARRIER
	//Int lthreads = pow(2,l);
	//Int lnteams = num_threads/lthreads;
	//#else
	Int lthreads = 1;
	Int lnteams = num_threads/lthreads;
	//#endif

	Int sep_restart = 0;


	if(Options.verbose == BASKER_TRUE)
	  {
	    printf("Factoring Sep num_threads: %d %d \n",
		   lnteams, lthreads);
	  }

	#ifdef BASKER_KOKKOS
	Kokkos::Impl::Timer  timer_inner_sep;
	#ifdef BASKER_NO_LAMBDA

	//kokkos_nfactor_sep <Int, Entry, Exe_Space> 
	//sep_nfactor(this, l);

	kokkos_nfactor_sep2 <Int, Entry, Exe_Space>
	  sep_nfactor(this,l);
	
	Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),
			     sep_nfactor);
	Kokkos::fence();

	//======Check for error=====
	while(true)
	  {
	    INT_1DARRAY thread_start;
	    MALLOC_INT_1DARRAY(thread_start, num_threads+1);
	    init_value(thread_start, num_threads+1,
		      (Int) BASKER_MAX_IDX);
	    int nt = nfactor_sep_error(thread_start);
	    if((nt == BASKER_SUCCESS)||
	       (nt == BASKER_ERROR) ||
	       (sep_restart > BASKER_RESTART))
	      {
		FREE_INT_1DARRAY(thread_start);
		break;
	      }
	    else
	      {
		sep_restart++;
		if (Options.verbose == BASKER_TRUE)
		  {
		    printf("restart \n");
		  }
		Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),  sep_nfactor);
		Kokkos::fence();

	      }
	  }//end while-true


	#ifdef BASKER_TIME
	printf("Time INNERSEP: %d %f \n", 
	       l, timer_inner_sep.seconds());
	#endif
        #else //ELSE BASKER_NO_LAMBDA
	//Note: to be added
        #endif //end BASKER_NO_LAMBDA
	#else
	#pragma omp parallel
	{

	}//end omp parallel
	#endif
      }//end over each level

    #ifdef BASKER_TIME
    printf("Time SEP: %f \n", timer.seconds());
    #endif
      }
    
    //-------------------------End Sep----------------//


    //-------------------IF BTF-----------------------//
    if(Options.btf == BASKER_TRUE)
      {

	Int btf_restart = 0;

	if(Options.verbose == BASKER_TRUE)
	  {
	    printf("Factoring BLKs num_threads: %d \n",
		   num_threads);
	  }

	//=====Timer
	#ifdef BASKER_TIME
	Kokkos::Impl::Timer  timer_btf;
	#endif
	//====Timer
	
	//======Call diag factor====
	kokkos_nfactor_diag <Int, Entry, Exe_Space> 
	  diag_nfactor(this);
	Kokkos::parallel_for(TeamPolicy(num_threads,1),
			     diag_nfactor);
	Kokkos::fence();
	
	//=====Check for error======
	while(true)
	  {
	    INT_1DARRAY thread_start;
	    MALLOC_INT_1DARRAY(thread_start, num_threads+1);
	    init_value(thread_start, num_threads+1, 
		       (Int) BASKER_MAX_IDX);
	    int nt = nfactor_diag_error(thread_start);
	    //printf("RETURNED: %d \n", nt);
	    if((nt == BASKER_SUCCESS) || 
	       (nt == BASKER_ERROR) ||
	       (btf_restart > BASKER_RESTART))
	      {
		break;
	      }
	    else
	      {
		btf_restart++;
		if (Options.verbose == BASKER_TRUE)
		  {
		    printf("restart \n");
		  }
		kokkos_nfactor_diag_remalloc <Int, Entry, Exe_Space>
		  diag_nfactor_remalloc(this, thread_start);
		Kokkos::parallel_for(TeamPolicy(num_threads,1),
				     diag_nfactor_remalloc);
		Kokkos::fence();
	      }
	  }//end while

	//====TIMER
	#ifdef BASKER_TIME
	printf("Time BTF: %f \n", 
	       timer_btf.seconds());
	#endif
	//===TIMER

      }//end btf call

    Kokkos::Impl::Timer tzback;
     if(Options.btf == BASKER_TRUE)
      {
	A = ATEMP;
      }
     //printf("Switch back: %f \n",
     //	    tzback.seconds());
    
    return 0;
  }//end factor_notoken()
Пример #24
0
inline
void pcgsolve( //const ImportType & import,
              KernelHandle &kh
            ,  const CrsMatrix <typename KernelHandle::nonzero_value_type , typename KernelHandle::row_index_type, typename KernelHandle::HandleExecSpace >      & A
            , const Kokkos::View <typename KernelHandle::nonzero_value_type *,
                                  typename KernelHandle::HandleExecSpace> & b
            , const Kokkos::View <typename KernelHandle::nonzero_value_type * ,
                                  typename KernelHandle::HandleExecSpace > & x
            , const size_t  maximum_iteration = 200
            , const double  tolerance = std::numeric_limits<double>::epsilon()
            , CGSolveResult * result = 0
            , bool use_sgs = true
            )
{
  typedef typename KernelHandle::HandleExecSpace Space;
  //typedef typename KernelHandle::nonzero_value_type MScalar;
  typedef typename KernelHandle::nonzero_value_type VScalar;
  //typedef typename KernelHandle::row_index_type Idx_Type;
  //typedef typename KernelHandle::idx_array_type idx_array_type;
  typedef typename Kokkos::View< VScalar * , Space >  VectorType ;

  //const size_t count_owned = import.count_owned ;
  //const size_t count_total = import.count_owned + import.count_receive;
  const size_t count_owned = A.graph.nv;
  const size_t count_total  = count_owned;

  size_t  iteration = 0 ;
  double  iter_time = 0 ;
  double  matvec_time = 0 ;
  double  norm_res = 0 ;
  double precond_time = 0;
  double precond_init_time = 0;

  Kokkos::Impl::Timer wall_clock ;
  Kokkos::Impl::Timer timer;
  // Need input vector to matvec to be owned + received
  VectorType pAll ( "cg::p" , count_total );

  VectorType p = Kokkos::subview( pAll , std::pair<size_t,size_t>(0,count_owned) );
  VectorType r ( "cg::r" , count_owned );
  VectorType Ap( "cg::Ap", count_owned );

  /* r = b - A * x ; */

  /* p  = x       */  Kokkos::deep_copy( p , x );
  ///* import p     */  import( pAll );
  /* Ap = A * p   */  multiply( count_owned , Ap , A , pAll );
  /* r = b - Ap   */  waxpby( count_owned , r , 1.0 , b , -1.0 , Ap );
  /* p  = r       */  Kokkos::deep_copy( p , r );

  //double old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm );
  double old_rdot = dot( count_owned , r , r );

  norm_res  = sqrt( old_rdot );



  int apply_count = 1;
  VectorType z;
  //double precond_old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm );
  double precond_old_rdot = 1;
#ifdef PRECOND_NORM
  double precond_norm_res  = 1;
#endif
  Kokkos::deep_copy( p , z );

  //typename KernelHandle::GaussSeidelHandleType *gsHandler;
  bool owner_handle = false;
  if (use_sgs){
    if (kh.get_gs_handle() == NULL){

      owner_handle = true;
      kh.create_gs_handle();
    }
    //gsHandler = kh.get_gs_handle();
    timer.reset();

    KokkosKernels::Experimental::Graph::gauss_seidel_numeric
      (&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff);

    Space::fence();
    precond_init_time += timer.seconds();

    z = VectorType( "pcg::z" , count_owned );
    Space::fence();
    timer.reset();

    KokkosKernels::Experimental::Graph::symmetric_gauss_seidel_apply
        (&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff, z, r, true, apply_count);

    Space::fence();
    precond_time += timer.seconds();
    //double precond_old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm );
    precond_old_rdot = dot( count_owned , r , z );
#ifdef PRECOND_NORM
    precond_norm_res  = sqrt( precond_old_rdot );
#endif

    Kokkos::deep_copy( p , z );
  }

  iteration = 0 ;

#ifdef PRINTRES

  std::cout << "norm_res:" << norm_res << " old_rdot:" << old_rdot<<  std::endl;
#ifdef PRECOND_NORM
  if (use_sgs)
  std::cout << "precond_norm_res:" << precond_norm_res << " precond_old_rdot:" << precond_old_rdot<<  std::endl;
#endif

#endif
  while ( tolerance < norm_res && iteration < maximum_iteration ) {

    /* pAp_dot = dot( p , Ap = A * p ) */

    timer.reset();
    ///* import p    */  import( pAll );
    /* Ap = A * p  */  multiply( count_owned , Ap , A , pAll );
    Space::fence();
    matvec_time += timer.seconds();

    //const double pAp_dot = Kokkos::Example::all_reduce( dot( count_owned , p , Ap ) , import.comm );
    const double pAp_dot = dot( count_owned , p , Ap ) ;

    double alpha  = 0;
    if (use_sgs){
      alpha = precond_old_rdot / pAp_dot ;
    }
    else {
      alpha = old_rdot / pAp_dot ;
    }

    /* x +=  alpha * p ;  */ waxpby( count_owned , x ,  alpha, p  , 1.0 , x );
    /* r += -alpha * Ap ; */ waxpby( count_owned , r , -alpha, Ap , 1.0 , r );

    //const double r_dot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm );
    const double r_dot = dot( count_owned , r , r );
    const double beta_original  = r_dot / old_rdot ;

    double precond_r_dot = 1;
    double precond_beta = 1;
    if (use_sgs){
      Space::fence();
      timer.reset();
      KokkosKernels::Experimental::Graph::symmetric_gauss_seidel_apply(&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff, z, r, true, apply_count);

      Space::fence();
      precond_time += timer.seconds();
      //const double precond_r_dot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm );
      precond_r_dot = dot( count_owned , r , z );
      precond_beta  = precond_r_dot / precond_old_rdot ;
    }

    double beta  = 1;
    if (!use_sgs){
      beta = beta_original;
      /* p = r + beta * p ; */ waxpby( count_owned , p , 1.0 , r , beta , p );
    }
    else {
      beta = precond_beta;
      waxpby( count_owned , p , 1.0 , z , beta , p );
    }

#ifdef PRINTRES
    std::cout << "\tbeta_original:" << beta_original <<  std::endl;

    if (use_sgs)
    std::cout << "\tprecond_beta:" << precond_beta <<  std::endl;

#endif


    norm_res = sqrt( old_rdot = r_dot );
#ifdef PRECOND_NORM
    if (use_sgs){
      precond_norm_res = sqrt( precond_old_rdot = precond_r_dot );
    }
#else
    precond_old_rdot = precond_r_dot;
#endif

#ifdef PRINTRES
    std::cout << "\tnorm_res:" << norm_res << " old_rdot:" << old_rdot<<  std::endl;
#ifdef PRECOND_NORM

    if (use_sgs)
    std::cout << "\tprecond_norm_res:" << precond_norm_res << " precond_old_rdot:" << precond_old_rdot<<  std::endl;
#endif
#endif
    ++iteration ;
  }

  Space::fence();
  iter_time = wall_clock.seconds();

  if ( 0 != result ) {
    result->iteration   = iteration ;
    result->iter_time   = iter_time ;
    result->matvec_time = matvec_time ;
    result->norm_res    = norm_res ;
    result->precond_time = precond_time;
    result->precond_init_time = precond_init_time;
  }

  if (use_sgs & owner_handle ){

    kh.destroy_gs_handle();
  }
}
  KOKKOS_INLINE_FUNCTION
  int exampleKokkosTaskData(const int ntasks,
                            const int max_task_dependence,
                            const int team_size, 
                            const bool verbose) {

    typedef Kokkos::Experimental::TaskPolicy<SpaceType> policy_type ;
    typedef SimpleTask<policy_type> simple_task_type;

    typedef Kokkos::Experimental::Future<typename simple_task_type::value_type,SpaceType> future_type ;
    
    policy_type policy;

    Kokkos::Impl::Timer timer;

    for (int use_barrier=0;use_barrier<2;++use_barrier) {
      cout << "KokkosTaskData:: use barrier " << (use_barrier ? "yes" : "no") << endl;
      {
        timer.reset();
        
        future_type f = policy.create(simple_task_type(use_barrier), max_task_dependence);
        policy.spawn(f);
        
        Kokkos::Experimental::wait( policy );
        
        const double t = timer.seconds();
        cout << "KokkosTaskData:: single task is spawned :: time = " << t << endl;
      }
      
      {
        timer.reset();
        
        future_type f = policy.create_team(simple_task_type(use_barrier), max_task_dependence);
        policy.spawn(f);
        
        Kokkos::Experimental::wait( policy );
        
        const double t = timer.seconds();
        cout << "KokkosTaskData:: single team task is spawned :: time = " << t << endl;
      }
      
      {
        timer.reset();
        
        future_type f[MAXTASKS];
        for (int i=0;i<ntasks;++i) {
          f[i] = policy.create(simple_task_type(use_barrier), max_task_dependence);
          policy.spawn(f[i]);
        }
        Kokkos::Experimental::wait( policy );
      
        const double t = timer.seconds();
        cout << "KokkosTaskData:: " << ntasks << " tasks are spawned :: time = " << t << endl;
      }
      
      {
        timer.reset();
        
        future_type f[MAXTASKS];
        for (int i=0;i<ntasks;++i) {
          f[i] = policy.create_team(simple_task_type(use_barrier), max_task_dependence);
          policy.spawn(f[i]);
        }
        
        Kokkos::Experimental::wait( policy );
        
        const double t = timer.seconds();
        cout << "KokkosTaskData:: " << ntasks << " team tasks are spawned :: time = " << t << endl;
      }
    }
    return 0;
  }
Пример #26
0
    int ComputeBasis_HGRAD_Vector(const ordinal_type nworkset,
                                  const ordinal_type C,
                                  const ordinal_type order,
                                  const bool verbose) {
      typedef Vector<VectorTagType> VectorType;

      typedef typename VectorTagType::value_type ValueType;
      constexpr int VectorLength = VectorTagType::length;

      Teuchos::RCP<std::ostream> verboseStream;
      Teuchos::oblackholestream bhs; // outputs nothing

      if (verbose) 
        verboseStream = Teuchos::rcp(&std::cout, false);
      else
        verboseStream = Teuchos::rcp(&bhs,       false);

      Teuchos::oblackholestream oldFormatState;
      oldFormatState.copyfmt(std::cout);

      typedef typename
        Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

      *verboseStream << "DeviceSpace::  "; DeviceSpaceType::print_configuration(*verboseStream, false);
      *verboseStream << "HostSpace::    ";   HostSpaceType::print_configuration(*verboseStream, false);
      *verboseStream << "VectorLength::  " << (VectorLength) << "\n";

      using BasisTypeHost = Basis_HGRAD_HEX_C1_FEM<HostSpaceType,ValueType,ValueType>;
      using ImplBasisType = Impl::Basis_HGRAD_HEX_C1_FEM;
      using range_type = Kokkos::pair<ordinal_type,ordinal_type>;

      constexpr size_t LLC_CAPACITY = 32*1024*1024;
      Intrepid2::Test::Flush<LLC_CAPACITY,DeviceSpaceType> flush;
      
      Kokkos::Impl::Timer timer;
      double t_vectorize = 0;
      int errorFlag = 0;

      BasisTypeHost hostBasis;
      const auto cellTopo = hostBasis.getBaseCellTopology();

      auto cubature = DefaultCubatureFactory::create<DeviceSpaceType,ValueType,ValueType>(cellTopo, order);

      const ordinal_type 
        numCells = C,
        numCellsAdjusted = C/VectorLength + (C%VectorLength > 0),
        numVerts = cellTopo.getVertexCount(),
        numDofs = hostBasis.getCardinality(),
        numPoints = cubature->getNumPoints(), 
        spaceDim = cubature->getDimension();

      Kokkos::DynRankView<ValueType,HostSpaceType> dofCoordsHost("dofCoordsHost", numDofs, spaceDim);
      hostBasis.getDofCoords(dofCoordsHost);
      const auto refNodesHost = Kokkos::subview(dofCoordsHost, range_type(0, numVerts), Kokkos::ALL());
      
      // pertub nodes
      Kokkos::DynRankView<VectorType,HostSpaceType> worksetCellsHost("worksetCellsHost", numCellsAdjusted, numVerts, spaceDim);
      for (ordinal_type cell=0;cell<numCells;++cell) {
        for (ordinal_type i=0;i<numVerts;++i)
          for (ordinal_type j=0;j<spaceDim;++j) {
            ValueType val = (rand()/(RAND_MAX + 1.0))*0.2 -0.1;
            worksetCellsHost(cell/VectorLength, i, j)[cell%VectorLength] = refNodesHost(i, j) + val; 
          }
      }

      auto worksetCells = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), worksetCellsHost);
      Kokkos::deep_copy(worksetCells, worksetCellsHost);

      Kokkos::DynRankView<ValueType,DeviceSpaceType> refPoints("refPoints", numPoints, spaceDim), refWeights("refWeights", numPoints);
      cubature->getCubature(refPoints, refWeights);

      std::cout
        << "===============================================================================\n" 
        << " Performance Test evaluating ComputeBasis \n"
        << " # of workset = " << nworkset << "\n" 
        << " Test Array Structure (C,F,P,D) = " << numCells << ", " << numDofs << ", " << numPoints << ", " << spaceDim << "\n"
        << "===============================================================================\n";

      *verboseStream
        << "\n"
        << "===============================================================================\n"
        << "TEST 1: evaluateFields vector version\n"
        << "===============================================================================\n";
      
      try {
        Kokkos::DynRankView<ValueType,DeviceSpaceType> 
          refBasisValues("refBasisValues", numDofs, numPoints),
          refBasisGrads ("refBasisGrads",  numDofs, numPoints, spaceDim);
        
        ImplBasisType::getValues<DeviceSpaceType>(refBasisValues, refPoints, OPERATOR_VALUE);
        ImplBasisType::getValues<DeviceSpaceType>(refBasisGrads,  refPoints, OPERATOR_GRAD);
        
        const ordinal_type ibegin = -3;

        // testing vertical approach
        {          
          Kokkos::DynRankView<VectorType,DeviceSpaceType> 
            weightedBasisValues("weightedBasisValues", numCellsAdjusted, numDofs, numPoints),
            weightedBasisGrads ("weightedBasisGrads",  numCellsAdjusted, numDofs, numPoints, spaceDim);

          typedef F_hgrad_eval<VectorType,ValueType,DeviceSpaceType> FunctorType;

          using range_policy_type = Kokkos::Experimental::MDRangePolicy
            < DeviceSpaceType, Kokkos::Experimental::Rank<2>, Kokkos::IndexType<ordinal_type> >;
          range_policy_type policy( {                0,         0 },
                                    { numCellsAdjusted, numPoints } );

          FunctorType functor(weightedBasisValues,
                              weightedBasisGrads,
                              refBasisGrads,
                              worksetCells,
                              refWeights,
                              refBasisValues,
                              refBasisGrads);
          
          for (ordinal_type iwork=ibegin;iwork<nworkset;++iwork) {
            flush.run();

            DeviceSpaceType::fence();
            timer.reset();
            
            Kokkos::parallel_for(policy, functor);

            DeviceSpaceType::fence();
            t_vectorize += (iwork >= 0)*timer.seconds();
          }

        }

      } catch (std::exception err) {
        *verboseStream << "UNEXPECTED ERROR !!! ----------------------------------------------------------\n";
        *verboseStream << err.what() << '\n';
        *verboseStream << "-------------------------------------------------------------------------------" << "\n\n";
        errorFlag = -1000;
      }

      std::cout 
        << "TEST HGRAD " 
        << " t_vectorize = " << (t_vectorize/nworkset)
        << std::endl;
      
      if (errorFlag != 0)
        std::cout << "End Result: TEST FAILED\n";
      else
        std::cout << "End Result: TEST PASSED\n";
      
      // reset format state of std::cout
      std::cout.copyfmt(oldFormatState);
      
      return errorFlag;
    }
  KOKKOS_INLINE_FUNCTION
  int exampleCholDirectPerformance(const string file_input,
                                   const int treecut,
                                   const int minblksize,
                                   const int prunecut,
                                   const int seed,
                                   const int niter,
                                   const int nthreads,
                                   const int max_task_dependence,
                                   const int team_size,
                                   const int league_size,
                                   const bool team_interface,
                                   const bool skip_serial,
                                   const bool mkl_interface,
                                   const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
    typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

#ifdef HAVE_SHYLUTACHO_MKL 
    typedef typename CrsMatrixBaseType::value_type_array value_type_array;
#endif

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
    typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double 
      t_import = 0.0,
      t_reorder = 0.0,
      t_symbolic = 0.0,
      t_flat2hier = 0.0,
#ifdef HAVE_SHYLUTACHO_MKL 
      t_mkl = 0.0,
#endif
      t_factor_seq = 0.0,   t_solve_seq = 0.0,
      t_factor_task = 0.0,  t_solve_task = 0.0;
    const int start = 0;
    
    cout << "CholDirectPerformance:: import input file = " << file_input << endl;        
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      t_import = timer.seconds();
    }
    cout << "CholDirectPerformance:: import input file::time = " << t_import << endl;

    cout << "CholDirectPerformance:: reorder the matrix" << endl;        
    CrsMatrixBaseType PA("Permuted AA");
    CrsMatrixBaseType UU("UU");     // permuted base upper triangular matrix
    CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views

    DenseMatrixBaseType BB("BB", AA.NumRows(), nrhs);
    DenseHierMatrixBaseType HB("HB");

    {
      GraphHelperType S(AA, seed);
      {
        timer.reset();
        
        S.computeOrdering(treecut, minblksize);
        S.pruneTree(prunecut);
        
        PA.copy(S.PermVector(), S.InvPermVector(), AA);
        
        t_reorder = timer.seconds();
      }

      cout << "CholDirectPerformance:: reorder the matrix::time = " << t_reorder << endl;            
      {
        SymbolicFactorHelperType F(PA, league_size);
        for (int i=start;i<niter;++i) {
          timer.reset();
        
          F.createNonZeroPattern(Uplo::Upper, UU);

          t_symbolic += timer.seconds() * (i>=0);
        }
        t_symbolic /= niter;

        cout << "CholDirectPerformance:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl;
      }
      cout << "CholDirectPerformance:: symbolic factorization::time = " << t_symbolic << endl;            
      {
        timer.reset();

        CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   S.TreeVector());
        
        for (ordinal_type k=0;k<HU.NumNonZeros();++k)
          HU.Value(k).fillRowViewArray();

        DenseMatrixHelper::flat2hier(BB, HB,
                                     S.NumBlocks(),
                                     S.RangeVector(),
                                     nb);
        
        t_flat2hier = timer.seconds();
        
        cout << "CholDirectPerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
      }
      cout << "CholDirectPerformance:: construct hierarchical matrix::time = " << t_flat2hier << endl;            
    }

    // copy of UU
    CrsMatrixBaseType RR("RR");
    RR.copy(UU);

    /////////////////////////// Serial Numeric Factorization
    if (!skip_serial) {
#ifdef __USE_FIXED_TEAM_SIZE__
      typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
      typename TaskFactoryType::policy_type policy(max_task_dependence, 1);
#endif
      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);

      CrsTaskViewType U(&UU);
      U.fillRowViewArray();

      cout << "CholDirectPerformance:: Serial factorize the matrix" << endl;
      {
        for (int i=start;i<niter;++i) {
          UU.copy(RR);
          timer.reset();          
          {
            Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
              ::invoke(TaskFactoryType::Policy(),
                       TaskFactoryType::Policy().member_single(),
                       U);
          }
          t_factor_seq += timer.seconds() * (i>=0);
        }
        t_factor_seq /= niter;
      }
      cout << "CholDirectPerformance:: Serial factorize the matrix::time = " << t_factor_seq << endl;

      cout << "CholDirectPerformance:: Serial forward/backward solve" << endl;      
      {
        for (int i=start;i<niter;++i) {
          XX.copy(BB);
          timer.reset();          
          {
            TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
              ::invoke(TaskFactoryType::Policy(),
                       TaskFactoryType::Policy().member_single(),
                       Diag::NonUnit, U, X);
            TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
              ::invoke(TaskFactoryType::Policy(),
                       TaskFactoryType::Policy().member_single(),
                       Diag::NonUnit, U, X);
          }
          t_factor_seq += timer.seconds() * (i>=0);
        }
        t_factor_seq /= niter;
      }
      cout << "CholDirectPerformance:: Serial forward/backward solve::time = " << t_solve_seq << endl;
    }

    // if (!skip_serial) 
    //   cout << "CholDirectPerformance:: task scale [seq/task] = " << t_factor_seq/t_factor_task << endl;    

    return r_val;
  }
  int exampleDenseGemmByBlocks(const ordinal_type mmin,
                               const ordinal_type mmax,
                               const ordinal_type minc,
                               const ordinal_type k,
                               const ordinal_type mb,
                               const int max_concurrency,
                               const int max_task_dependence,
                               const int team_size,
                               const int mkl_nthreads,
                               const bool check,
                               const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType;
    typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType;
    typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType;
    typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType;

    typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType;

    typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType;
    typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    std::cout << "DenseGemmByBlocks:: test matrices "
              <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc 
              << " , k = "<< k << " , mb = " << mb << std::endl;

    const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); 
    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);

    std::ostringstream os;
    os.precision(3);
    os << std::scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");
      
      // host matrices
      DenseMatrixBaseHostType AA_host, BB_host, CC_host("CC_host", m, m), CB_host("CB_host", m, m);
      {
        if (ArgTransA == Trans::NoTranspose) 
          AA_host = DenseMatrixBaseHostType("AA_host", m, k); 
        else 
          AA_host = DenseMatrixBaseHostType("AA_host", k, m);
        
        if (ArgTransB == Trans::NoTranspose) 
          BB_host = DenseMatrixBaseHostType("BB_host", k, m);
        else 
          BB_host = DenseMatrixBaseHostType("BB_host", m, k);
        
        for (ordinal_type j=0;j<AA_host.NumCols();++j)
          for (ordinal_type i=0;i<AA_host.NumRows();++i)
            AA_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        
        for (ordinal_type j=0;j<BB_host.NumCols();++j)
          for (ordinal_type i=0;i<BB_host.NumRows();++i)
            BB_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        
        for (ordinal_type j=0;j<CC_host.NumCols();++j)
          for (ordinal_type i=0;i<CC_host.NumRows();++i)
            CC_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        
        DenseMatrixTools::copy(CB_host, CC_host);
      }

      const double flop = DenseFlopCount<value_type>::Gemm(m, m, k);

#ifdef HAVE_SHYLUTACHO_MKL
      mkl_set_num_threads(mkl_nthreads);
#endif

      os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k << "  ";
      if (check) {
        timer.reset();
        DenseMatrixViewHostType A_host(AA_host), B_host(BB_host), C_host(CB_host);
        Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas,Variant::One>::invoke
          (policy, policy.member_single(),
           1.0, A_host, B_host, 1.0, C_host);
        t = timer.seconds();
        os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      DenseMatrixBaseDeviceType AA_device("AA_device"), BB_device("BB_device"), CC_device("CC_device");
      {
        timer.reset();
        AA_device.mirror(AA_host);
        BB_device.mirror(BB_host);
        CC_device.mirror(CC_host);
        t = timer.seconds();
        os << ":: Mirror = " << t << " [sec]  ";
      }

      {
        DenseHierMatrixBaseDeviceType HA_device("HA_device"), HB_device("HB_device"), HC_device("HC_device");

        DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb);        
        DenseMatrixTools::createHierMatrix(HB_device, BB_device, mb, mb);        
        DenseMatrixTools::createHierMatrix(HC_device, CC_device, mb, mb);        

        DenseHierTaskViewDeviceType TA_device(HA_device), TB_device(HB_device), TC_device(HC_device);
        timer.reset();
        auto future = policy.proc_create_team
          (Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,ArgVariant>
           ::createTaskFunctor(policy, 1.0, TA_device, TB_device, 1.0, TC_device),
           0);
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        t = timer.seconds();       
        os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      } 

      CC_host.mirror(CC_device);
      if (check) {
        double err = 0.0, norm = 0.0;
        for (ordinal_type j=0;j<CC_host.NumCols();++j)
          for (ordinal_type i=0;i<CC_host.NumRows();++i) {
            const double diff = abs(CC_host.Value(i,j) - CB_host.Value(i,j));
            const double val  = CB_host.Value(i,j);
            err  += diff*diff;
            norm += val*val;
          }
        os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err);
      }
      std::cout << os.str() << std::endl;
    }

    return r_val;
  }
  KOKKOS_INLINE_FUNCTION
  int exampleCholDirectPlain(const string file_input,
                             const int prunecut,
                             const int seed,
                             const int nrhs,
                             const int nb, 
                             const int nthreads,
                             const int max_task_dependence,
                             const int team_size,
                             const int league_size,
                             const bool team_interface,
                             const bool serial,
                             const bool solve,
                             const bool check,
                             const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
    typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
    typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

    typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType;
    typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType;

    typedef DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> DenseHierMatrixBaseType;
    
    typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType;
    typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double 
      t_import = 0.0,
      t_reorder = 0.0,
      t_symbolic = 0.0,
      t_flat2hier = 0.0,
      t_factor = 0.0, 
      t_solve = 0.0;
    
    cout << "CholDirectPlain:: import input file = " << file_input << endl;        
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();
      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);
      t_import = timer.seconds();
    }
    cout << "CholDirectPlain:: import input file::time = " << t_import << endl;

    // matrix A and its upper triangular factors U
    CrsMatrixBaseType PA("Permuted AA");
    CrsMatrixBaseType UU("UU"); // permuted base upper triangular matrix
    CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views

    // right hand side and solution matrix
    DenseMatrixBaseType BB("BB", AA.NumRows(), nrhs), XX("XX", AA.NumRows(), nrhs);
    DenseHierMatrixBaseType HB("HB"), HX("HX");

    {
      cout << "CholDirectPlain:: reorder the matrix" << endl;        
      GraphHelperType S(AA, seed);
      {
        timer.reset();
        S.computeOrdering();
        S.pruneTree(prunecut);
        PA.copy(S.PermVector(), S.InvPermVector(), AA);
        t_reorder = timer.seconds();
      }
      cout << "CholDirectPlain:: reorder the matrix::time = " << t_reorder << endl;            

      {
        SymbolicFactorHelperType F(PA, league_size);
        timer.reset();
        F.createNonZeroPattern(Uplo::Upper, UU);
        t_symbolic = timer.seconds();
        cout << "CholDirectPlain:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl;
      }
      cout << "CholDirectPlain:: symbolic factorization::time = " << t_symbolic << endl;            

      {
        timer.reset();
        CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   S.TreeVector());

        // fill internal meta data for sparse blocs
        for (ordinal_type k=0;k<HU.NumNonZeros();++k)
          HU.Value(k).fillRowViewArray();

        DenseMatrixHelper::flat2hier(BB, HB,
                                     S.NumBlocks(),
                                     S.RangeVector(),
                                     nb);        

        DenseMatrixHelper::flat2hier(XX, HX,
                                     S.NumBlocks(),
                                     S.RangeVector(),
                                     nb);        
        t_flat2hier = timer.seconds();
        cout << "CholDirectPlain:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
      }
      cout << "CholDirectPlain:: construct hierarchical matrix::time = " << t_flat2hier << endl;            
    }


    {
      // Policy setup
#ifdef __USE_FIXED_TEAM_SIZE__ 
      typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
      typename TaskFactoryType::policy_type policy(max_task_dependence, 1);
#endif
      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);
      
      CrsTaskViewType A(&PA), U(&UU);
      DenseTaskViewType X(&XX), B(&BB);
      
      A.fillRowViewArray();      
      U.fillRowViewArray();      

      CrsHierTaskViewType TU(&HU);
      DenseHierTaskViewType TB(&HB), TX(&HX);

      {
        // Manufacture B = AX
        const int m = A.NumRows();
        for (int j=0;j<nrhs;++j)
          for (int i=0;i<m;++i)
            X.Value(i,j) = (j+1);
        
        Gemm<Trans::NoTranspose,Trans::NoTranspose,AlgoGemm::ForTriSolveBlocked>
          ::invoke(TaskFactoryType::Policy(),
                   TaskFactoryType::Policy().member_single(),
                   1.0, A, X, 0.0, B);
        XX.copy(BB);        
      }
      
      if (serial) {
        cout << "CholDirectPlain:: Serial factorize the matrix" << endl;
        timer.reset();          
        Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
          ::invoke(TaskFactoryType::Policy(),
                   TaskFactoryType::Policy().member_single(),
                   U);
        t_factor = timer.seconds();
        cout << "CholDirectPlain:: Serial factorize the matrix::time = " << t_factor << endl;
      } else {
        cout << "CholDirectPlain:: ByBlocks factorize the matrix:: team_size = " << team_size << endl;
        timer.reset();    
        auto future = TaskFactoryType::Policy().create_team
          (Chol<Uplo::Upper,AlgoChol::ByBlocks>
           ::TaskFunctor<CrsHierTaskViewType>(TU), 0);
        TaskFactoryType::Policy().spawn(future);
        Kokkos::Experimental::wait(TaskFactoryType::Policy());
        t_factor = timer.seconds();
        cout << "CholDirectPlain:: ByBlocks factorize the matrix::time = " << t_factor << endl;
      }

      if (solve) {
        if (serial) {
          cout << "CholDirectPlain:: Serial forward/backward solve" << endl;
          timer.reset();          
          TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
            ::invoke(TaskFactoryType::Policy(),
                     TaskFactoryType::Policy().member_single(),
                     Diag::NonUnit, U, X);
          TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
            ::invoke(TaskFactoryType::Policy(),
                     TaskFactoryType::Policy().member_single(),
                     Diag::NonUnit, U, X);
          t_solve = timer.seconds();
          cout << "CholDirectPlain:: Serial forward/backward solve::time = " << t_solve << endl;
        } else {
          cout << "CholDirectPlain:: ByBlocks forward/backward solve" << endl;
          timer.reset();          
          auto future_forward_solve = TaskFactoryType::Policy().create_team
            (TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::ByBlocks>
             ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
             (Diag::NonUnit, TU, TX), 0);
          
          TaskFactoryType::Policy().spawn(future_forward_solve);
          
          auto future_backward_solve = TaskFactoryType::Policy().create_team
            (TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::ByBlocks>
             ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
             (Diag::NonUnit, TU, TX), 1);
          
          TaskFactoryType::Policy().add_dependence(future_backward_solve, future_forward_solve);
          TaskFactoryType::Policy().spawn(future_backward_solve);
          
          Kokkos::Experimental::wait(TaskFactoryType::Policy());
          t_solve = timer.seconds();
          cout << "CholDirectPlain:: ByBlocks forward/backward solve::time = " << t_solve << endl;
        }
      }

      if (solve && check) {
        // Check manufactured solution
        double l2 = 0.0, linf = 0.0;
        const int m = A.NumRows();
        for (int j=0;j<nrhs;++j)
          for (int i=0;i<m;++i) {
            double diff = abs(X.Value(i,j) - (j+1));
            l2 += diff*diff;
            linf = max(diff, linf);
          }
        l2 = sqrt(l2);
        cout << "CholDirectPlain:: Check solution::L2 = " << l2 << ", Linf = " << linf << endl;
      }
    }

    return r_val;
  }
  KOKKOS_INLINE_FUNCTION
  int exampleCholPerformanceSingle(const string file_input,
                                   const int treecut,
                                   const int minblksize,
                                   const int prunecut,
                                   const int seed,
                                   const int nthreads,
                                   const int max_task_dependence,
                                   const int team_size,
                                   const int fill_level,
                                   const int league_size,
                                   const bool team_interface,
                                   const bool skip_serial,
                                   const bool vtune_symbolic,
                                   const bool vtune_serial,
                                   const bool vtune_task,
                                   const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
    typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
    typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

#ifdef HAVE_SHYLUTACHO_VTUNE
    __itt_pause();
#endif

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double 
      t_import = 0.0,
      t_reorder = 0.0,
      t_symbolic = 0.0,
      t_flat2hier = 0.0,
      t_factor_seq = 0.0,
      t_factor_task = 0.0;
    
    cout << "CholPerformance:: import input file = " << file_input << endl;        
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      t_import = timer.seconds();

      if (verbose)
        cout << AA << endl;
    }
    cout << "CholPerformance:: import input file::time = " << t_import << endl;

    cout << "CholPerformance:: reorder the matrix" << endl;        
    CrsMatrixBaseType PA("Permuted AA");
    CrsMatrixBaseType UU("UU");     // permuted base upper triangular matrix
    CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views

    {
      GraphHelperType S(AA, seed);
      {
        timer.reset();
        
        S.computeOrdering(treecut, minblksize);
        S.pruneTree(prunecut);
        
        PA.copy(S.PermVector(), S.InvPermVector(), AA);
        
        t_reorder = timer.seconds();

        if (verbose)
          cout << S << endl
               << PA << endl;
      }

      cout << "CholPerformance:: reorder the matrix::time = " << t_reorder << endl;            
      {
        SymbolicFactorHelperType F(PA, league_size);
        if (vtune_symbolic) {
#ifdef HAVE_SHYLUTACHO_VTUNE
          __itt_resume();
#endif
        }
        timer.reset();
        F.createNonZeroPattern(fill_level, Uplo::Upper, UU);
        t_symbolic = timer.seconds();
        if (vtune_symbolic) {
#ifdef HAVE_SHYLUTACHO_VTUNE
          __itt_pause();
#endif
        }

        cout << "CholPerformance:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl;

        if (verbose)
          cout << F << endl
               << UU << endl;
      }
      cout << "CholPerformance:: symbolic factorization::time = " << t_symbolic << endl;            
      {
        timer.reset();
        CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   S.TreeVector());
        
        for (ordinal_type k=0;k<HU.NumNonZeros();++k)
          HU.Value(k).fillRowViewArray();
        
        t_flat2hier = timer.seconds();
        
        cout << "CholPerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
      }
      cout << "CholPerformance:: construct hierarchical matrix::time = " << t_flat2hier << endl;            
    }

    // copy of UU
    CrsMatrixBaseType RR("RR");
    RR.copy(UU);

    const size_t max_concurrency = 16384;
    cout << "CholPerformance:: max concurrency = " << max_concurrency << endl;

    const size_t max_task_size = 3*sizeof(CrsTaskViewType)+128;
    cout << "CholPerformance:: max task size   = " << max_task_size << endl;

    if (!skip_serial) {
      typename TaskFactoryType::policy_type policy(max_concurrency,
                                                   max_task_size,
                                                   max_task_dependence, 
                                                   1);

      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);

      CrsTaskViewType U(&UU);
      U.fillRowViewArray();

      cout << "CholPerformance:: Serial factorize the matrix" << endl;
      {
        UU.copy(RR);
        if (vtune_serial) {
#ifdef HAVE_SHYLUTACHO_VTUNE
          __itt_resume();
#endif
        }
        timer.reset();          
        {
          Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
            ::invoke(TaskFactoryType::Policy(),
                              TaskFactoryType::Policy().member_single(),
                              U);
        }
        t_factor_seq = timer.seconds();
        if (vtune_serial) {
#ifdef HAVE_SHYLUTACHO_VTUNE
          __itt_pause();
#endif
        }
        if (verbose)
          cout << UU << endl;
      }
      cout << "CholPerformance:: Serial factorize the matrix::time = " << t_factor_seq << endl;
    }

    {
      typename TaskFactoryType::policy_type policy(max_concurrency,
                                                   max_task_size,
                                                   max_task_dependence, 
                                                   team_size);
      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);
      
      cout << "CholPerformance:: ByBlocks factorize the matrix:: team_size = " << team_size << endl;
      CrsHierTaskViewType H(&HU);
      {
        UU.copy(RR);
        if (vtune_task) {
#ifdef HAVE_SHYLUTACHO_VTUNE
          __itt_resume();
#endif
        }
        timer.reset();
        {
          auto future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks>::
                                                              TaskFunctor<CrsHierTaskViewType>(H), 0);
          TaskFactoryType::Policy().spawn(future);
          Kokkos::Experimental::wait(TaskFactoryType::Policy());
        }
        t_factor_task += timer.seconds();
        if (vtune_task) {        
#ifdef HAVE_SHYLUTACHO_VTUNE
        __itt_pause();
#endif
        }
        if (verbose)
          cout << UU << endl;
      }  
      cout << "CholPerformance:: ByBlocks factorize the matrix::time = " << t_factor_task << endl;
    }
    
    if (!skip_serial) {
      cout << "CholPerformance:: task scale [seq/task] = " << t_factor_seq/t_factor_task << endl;    
    }

    return r_val;
  }