Beispiel #1
0
size_t test_global_to_local_ids(unsigned num_ids, unsigned capacity, unsigned num_find_iterations)
{

  typedef Device execution_space;
  typedef typename execution_space::size_type size_type;

  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;

  double elasped_time = 0;
  Kokkos::Impl::Timer timer;

  local_id_view local_2_global("local_ids", num_ids);
  global_id_view global_2_local(capacity);

  int shiftw = 15;

  //create
  elasped_time = timer.seconds();
  std::cout << std::setw(shiftw) <<  "allocate: " <<  elasped_time << std::endl;
  timer.reset();

  // generate unique ids
  {
    generate_ids<Device> gen(local_2_global);
  }

  // generate
  elasped_time = timer.seconds();
  std::cout << std::setw(shiftw) << "generate: " <<  elasped_time << std::endl;
  timer.reset();

  {
    fill_map<Device> fill(global_2_local, local_2_global);
  }

  // fill
  elasped_time = timer.seconds();
  std::cout << std::setw(shiftw) << "fill: " <<  elasped_time << std::endl;
  timer.reset();


  size_t num_errors = global_2_local.failed_insert();

  if (num_errors == 0u) {
    for (unsigned i=0; i<num_find_iterations; ++i)
    {
      find_test<Device> find(global_2_local, local_2_global,num_errors);
    }

    // find
    elasped_time = timer.seconds();
    std::cout << std::setw(shiftw) << "lookup: " <<  elasped_time << std::endl;
  }
  else {
    std::cout << "    !!! Fill Failed !!!" << std::endl;
  }

  return num_errors;
}
Beispiel #2
0
void Loop(int loop, int test, const char* type_name) {
  LoopVariant<T>(loop,test);

  Kokkos::Impl::Timer timer;
  T res = LoopVariant<T>(loop,test);
  double time1 = timer.seconds();

  timer.reset();
  T resNonAtomic = LoopVariantNonAtomic<T>(loop,test);
  double time2 = timer.seconds();

  timer.reset();
  T resSerial = LoopVariantSerial<T>(loop,test);
  double time3 = timer.seconds();

  time1*=1e6/loop;
  time2*=1e6/loop;
  time3*=1e6/loop;
  textcolor_standard();
  bool passed = true;
  if(resSerial!=res) passed = false;
  if(!passed) textcolor(RESET,BLACK,YELLOW);
  printf("%s Test %i %s  --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",type_name,test,passed?"PASSED":"FAILED",loop,1.0*resSerial,1.0*res,1.0*resNonAtomic,time1,time2,time3,(int)sizeof(T));
  if(!passed) textcolor_standard();
  printf("\n");
}
Beispiel #3
0
int main(int narg, char* arg[]) {
  Kokkos::initialize(narg,arg);

  int size = 1000000;

  // Create DualViews. This will allocate on both the device and its
  // host_mirror_device.
  idx_type idx("Idx",size,64);
  view_type dest("Dest",size);
  view_type src("Src",size);

  srand(134231);

  // Get a reference to the host view of idx directly (equivalent to
  // idx.view<idx_type::host_mirror_device_type>() )
  idx_type::t_host h_idx = idx.h_view;
  for (int i = 0; i < size; ++i) {
    for (view_type::size_type j=0; j < h_idx.dimension_1 (); ++j) {
      h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size;
    }
  }

  // Mark idx as modified on the host_mirror_device_type so that a
  // sync to the device will actually move data.  The sync happens in
  // the functor's constructor.
  idx.modify<idx_type::host_mirror_device_type>();

  // Run on the device.  This will cause a sync of idx to the device,
  // since it was marked as modified on the host.
  Kokkos::Impl::Timer timer;
  Kokkos::parallel_for(size,localsum<view_type::device_type>(idx,dest,src));
  Kokkos::fence();
  double sec1_dev = timer.seconds();

  timer.reset();
  Kokkos::parallel_for(size,localsum<view_type::device_type>(idx,dest,src));
  Kokkos::fence();
  double sec2_dev = timer.seconds();

  // Run on the host (could be the same as device).  This will cause a
  // sync back to the host of dest.  Note that if the Device is CUDA,
  // the data layout will not be optimal on host, so performance is
  // lower than what it would be for a pure host compilation.
  timer.reset();
  Kokkos::parallel_for(size,localsum<view_type::host_mirror_device_type>(idx,dest,src));
  Kokkos::fence();
  double sec1_host = timer.seconds();

  timer.reset();
  Kokkos::parallel_for(size,localsum<view_type::host_mirror_device_type>(idx,dest,src));
  Kokkos::fence();
  double sec2_host = timer.seconds();

  printf("Device Time with Sync: %lf without Sync: %lf \n",sec1_dev,sec2_dev);
  printf("Host   Time with Sync: %lf without Sync: %lf \n",sec1_host,sec2_host);

  Kokkos::finalize();
}
void test_global_to_local_ids(unsigned num_ids)
{

  typedef Device device_type;
  typedef typename device_type::size_type size_type;

  typedef Kokkos::View<uint32_t*,device_type> local_id_view;
  typedef Kokkos::UnorderedMap<uint32_t,size_type,device_type> global_id_view;

  //size
  std::cout << num_ids << ", ";

  double elasped_time = 0;
  Kokkos::Impl::Timer timer;

  local_id_view local_2_global("local_ids", num_ids);
  global_id_view global_2_local((3u*num_ids)/2u);

  //create
  elasped_time = timer.seconds();
  std::cout << elasped_time << ", ";
  timer.reset();

  // generate unique ids
  {
    generate_ids<Device> gen(local_2_global);
  }
  Device::fence();
  // generate
  elasped_time = timer.seconds();
  std::cout << elasped_time << ", ";
  timer.reset();

  {
    fill_map<Device> fill(global_2_local, local_2_global);
  }
  Device::fence();

  // fill
  elasped_time = timer.seconds();
  std::cout << elasped_time << ", ";
  timer.reset();


  size_t num_errors = 0;
  for (int i=0; i<100; ++i)
  {
    find_test<Device> find(global_2_local, local_2_global,num_errors);
  }
  Device::fence();

  // find
  elasped_time = timer.seconds();
  std::cout << elasped_time << std::endl;

  ASSERT_EQ( num_errors, 0u);
}
Beispiel #5
0
int main(int narg, char* arg[]) {
  Kokkos::initialize(narg,arg);

  int size = 1000000;

  // Create Views
  idx_type idx("Idx",size,64);
  view_type dest("Dest",size);
  view_type src("Src",size);

  srand(134231);

  // When using UVM Cuda views can be accessed on the Host directly
  for(int i=0; i<size; i++) {
    for(int j=0; j<idx.dimension_1(); j++)
      idx(i,j) = (size + i + (rand()%500 - 250))%size;
  }

  Kokkos::fence();
  // Run on the device
  // This will cause a sync of idx to the device since it was modified on the host
  Kokkos::Impl::Timer timer;
  Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
  Kokkos::fence();
  double sec1_dev = timer.seconds();

  // No data transfer will happen now, since nothing is accessed on the host
  timer.reset();
  Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
  Kokkos::fence();
  double sec2_dev = timer.seconds();

  // Run on the host
  // This will cause a sync back to the host of dest which was changed on the device
  // Compare runtime here with the dual_view example: dest will be copied back in 4k blocks
  // when they are accessed the first time during the parallel_for. Due to the latency of a memcpy
  // this gives lower effective bandwidth when doing a manual copy via dual views
  timer.reset();
  Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
  Kokkos::fence();
  double sec1_host = timer.seconds();

  // No data transfers will happen now
  timer.reset();
  Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
  Kokkos::fence();
  double sec2_host = timer.seconds();



  printf("Device Time with Sync: %lf without Sync: %lf \n",sec1_dev,sec2_dev);
  printf("Host   Time with Sync: %lf without Sync: %lf \n",sec1_host,sec2_host);

  Kokkos::finalize();
}
Beispiel #6
0
  BASKER_INLINE
  int Basker<Int, Entry, Exe_Space>::Symbolic(Int option)
  {


    printf("calling symbolic \n");
    #ifdef BASKER_KOKKOS_TIME 
    Kokkos::Impl::Timer timer;
    #endif

    //symmetric_sfactor();
    sfactor();


    if(option == 0)
      {
        
      }
    else if(option == 1)
      {
	
        
      }

    #ifdef BASKER_KOKKOS_TIME
    stats.time_sfactor += timer.seconds();
    #endif

    return 0;
  }//end Symbolic
Beispiel #7
0
  BASKER_INLINE
  int Basker<Int, Entry, Exe_Space>::Factor(Int option)
  {
#ifdef BASKER_KOKKOS_TIME
    Kokkos::Impl::Timer timer;
#endif

    factor_notoken(option);

#ifdef BASKER_KOKKOS_TIME
    stats.time_nfactor += timer.seconds();
#endif

    // NDE
    MALLOC_ENTRY_1DARRAY(x_view_ptr_copy, gn); //used in basker_solve_rhs - move alloc
    MALLOC_ENTRY_1DARRAY(y_view_ptr_copy, gm);
    MALLOC_INT_1DARRAY(perm_inv_comp_array , gm); //y
    MALLOC_INT_1DARRAY(perm_comp_array, gn); //x

    MALLOC_INT_1DARRAY(perm_comp_iworkspace_array, gn); 
    MALLOC_ENTRY_1DARRAY(perm_comp_fworkspace_array, gn);
    permute_composition_for_solve();

    factor_flag = BASKER_TRUE;

    return 0;
  }//end Factor()
Beispiel #8
0
void graph_color_symbolic(KernelHandle *handle){

  Kokkos::Impl::Timer timer;

  typename KernelHandle::idx_array_type row_map = handle->get_row_map();
  typename KernelHandle::idx_edge_array_type entries = handle->get_entries();

  typename KernelHandle::GraphColoringHandleType *gch = handle->get_graph_coloring_handle();

  Experimental::KokkosKernels::Graph::ColoringAlgorithm algorithm = gch->get_coloring_type();

  typedef typename KernelHandle::GraphColoringHandleType::color_array_type color_view_type;
  color_view_type colors_out = color_view_type("Graph Colors", row_map.dimension_0() - 1);

  typedef typename Experimental::KokkosKernels::Graph::Impl::GraphColor
      <typename KernelHandle::GraphColoringHandleType> BaseGraphColoring;
  BaseGraphColoring *gc = NULL;



  switch (algorithm){
  case Experimental::KokkosKernels::Graph::COLORING_SERIAL:

    gc = new BaseGraphColoring(
        row_map.dimension_0() - 1, entries.dimension_0(),
        row_map, entries, gch);
    break;
  case Experimental::KokkosKernels::Graph::COLORING_VB:
  case Experimental::KokkosKernels::Graph::COLORING_VBBIT:
  case Experimental::KokkosKernels::Graph::COLORING_VBCS:

    typedef typename Experimental::KokkosKernels::Graph::Impl::GraphColor_VB
        <typename KernelHandle::GraphColoringHandleType> VBGraphColoring;
    gc = new VBGraphColoring(
        row_map.dimension_0() - 1, entries.dimension_0(),
        row_map, entries, gch);
    break;
  case Experimental::KokkosKernels::Graph::COLORING_EB:

    typedef typename Experimental::KokkosKernels::Graph::Impl::GraphColor_EB
        <typename KernelHandle::GraphColoringHandleType> EBGraphColoring;

    gc = new EBGraphColoring(row_map.dimension_0() - 1, entries.dimension_0(),row_map, entries, gch);
    break;
  case Experimental::KokkosKernels::Graph::COLORING_DEFAULT:
    break;

  }

  int num_phases = 0;
  gc->color_graph(colors_out, num_phases);
  delete gc;
  double coloring_time = timer.seconds();
  gch->add_to_overall_coloring_time(coloring_time);
  gch->set_coloring_time(coloring_time);
  gch->set_num_phases(num_phases);
  gch->set_vertex_colors(colors_out);
}
Beispiel #9
0
 BASKER_INLINE
 int Basker<Int, Entry, Exe_Space>::Factor(Int option)
 {
   #ifdef BASKER_KOKKOS_TIME
   Kokkos::Impl::Timer timer;
   #endif
   
   factor_notoken(option);
   
   #ifdef BASKER_KOKKOS_TIME
   stats.time_nfactor += timer.seconds();
   #endif
   return 0;
 }//end Factor()
Beispiel #10
0
void mexFunction(int nlhs,
                 mxArray *plhs [],
                 int nrhs,
                 const mxArray *prhs []) {

  Kokkos::Impl::Timer time;

  Kokkos::initialize();
  string name = typeid(Kokkos::DefaultExecutionSpace).name();
  mexPrintf("\n Kokkos is initialized with a default spaceL: %s\n", name.c_str());

  Kokkos::finalize();
  mexPrintf("\n Kokkos is finalized\n");

  plhs[0] = mxCreateDoubleScalar(time.seconds());
}
Beispiel #11
0
  static double factorization( const multivector_type Q_ ,
                               const multivector_type R_ )
  {
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
    using Kokkos::Experimental::ALL ;
#else
    const Kokkos::ALL ALL ;
#endif
    const size_type count  = Q_.dimension_1();
    value_view tmp("tmp");
    value_view one("one");

    Kokkos::deep_copy( one , (Scalar) 1 );

    Kokkos::Impl::Timer timer ;

    for ( size_type j = 0 ; j < count ; ++j ) {
      // Reduction   : tmp = dot( Q(:,j) , Q(:,j) );
      // PostProcess : tmp = sqrt( tmp ); R(j,j) = tmp ; tmp = 1 / tmp ;
      const vector_type Qj  = Kokkos::subview( Q_ , ALL , j );
      const value_view  Rjj = Kokkos::subview( R_ , j , j );

      invnorm2( Qj , Rjj , tmp );

      // Q(:,j) *= ( 1 / R(j,j) ); => Q(:,j) *= tmp ;
      Kokkos::scale( tmp , Qj );

      for ( size_t k = j + 1 ; k < count ; ++k ) {
        const vector_type Qk = Kokkos::subview( Q_ , ALL , k );
        const value_view  Rjk = Kokkos::subview( R_ , j , k );

        // Reduction   : R(j,k) = dot( Q(:,j) , Q(:,k) );
        // PostProcess : tmp = - R(j,k);
        dot_neg( Qj , Qk , Rjk , tmp );

        // Q(:,k) -= R(j,k) * Q(:,j); => Q(:,k) += tmp * Q(:,j)
        Kokkos::axpby( tmp , Qj , one , Qk );
      }
    }

    execution_space::fence();

    return timer.seconds();
  }
void profile_end_kernel(const std::string& kernel_name, const std::string& exec_space) {
  Kokkos::Impl::Timer* timer = get_timer();
  double time = timer->seconds();

  KernelEntry* entry = get_kernel_list_head();
  if(entry == NULL) {
    KernelEntry* entry_new = new KernelEntry(entry,time,kernel_name,exec_space);
    get_kernel_list_head(entry_new);
    return;
  }

  bool found = entry->matches(kernel_name,exec_space);
  while(!found && (entry->next!=NULL) ) {
    entry = entry->next;
    found = entry->matches(kernel_name,exec_space);
  }
  if(found)
    entry->add_time(time);
  else
    entry->next = new KernelEntry(entry,time,kernel_name,exec_space);
}
Beispiel #13
0
int main(int narg, char* args[]) {
  Kokkos::initialize(narg,args);
  
  int chunk_size = 1024;
  int nchunks = 100000; //1024*1024;
  Kokkos::DualView<int*> data("data",nchunks*chunk_size+1);

  srand(1231093);

  for(int i = 0; i < data.dimension_0(); i++) {
    data.h_view(i) = rand()%TS;
  }
  data.modify<Host>();
  data.sync<Device>();

  Kokkos::DualView<int**> histogram("histogram",TS,TS);


  Kokkos::Impl::Timer timer;
  // Threads/team (TS) is automically limited to the maximum supported by the device.
  Kokkos::parallel_for( team_policy( nchunks , TS )
                      , find_2_tuples(chunk_size,data,histogram) );
  Kokkos::fence();
  double time = timer.seconds();

  histogram.sync<Host>();

  printf("Time: %lf \n\n",time);
  int sum = 0;
  for(int k=0; k<TS; k++) {
    for(int l=0; l<TS; l++) {
      printf("%i ",histogram.h_view(k,l));
      sum += histogram.h_view(k,l);
    }
    printf("\n");
  }
  printf("Result: %i %i\n",sum,chunk_size*nchunks);
  Kokkos::finalize();
}
Beispiel #14
0
int main(int narg, char* args[]) {
  Kokkos::initialize(narg,args);
  
  int chunk_size = 1024;
  int nchunks = 100000; //1024*1024;
  Kokkos::DualView<int*> data("data",nchunks*chunk_size+1);

  srand(1231093);

  for(int i = 0; i < data.dimension_0(); i++) {
    data.h_view(i) = rand()%TS;
  }
  data.modify<Host>();
  data.sync<Device>();

  Kokkos::DualView<int**> histogram("histogram",TS,TS);


  Kokkos::Impl::Timer timer;
  Kokkos::parallel_for(
      Kokkos::ParallelWorkRequest(nchunks,TS<Device::team_max()?TS:Device::team_max()),
      find_2_tuples(chunk_size,data,histogram));
  Kokkos::fence();
  double time = timer.seconds();

  histogram.sync<Host>();

  printf("Time: %lf \n\n",time);
  int sum = 0;
  for(int k=0; k<TS; k++) {
    for(int l=0; l<TS; l++) {
      printf("%i ",histogram.h_view(k,l));
      sum += histogram.h_view(k,l);
    }
    printf("\n");
  }
  printf("Result: %i %i\n",sum,chunk_size*nchunks);
  Kokkos::finalize();
}
Beispiel #15
0
  static double test( const int count , const int iter = 1 )
  {
    elem_coord_type coord( "coord" , count );
    elem_grad_type  grad ( "grad" , count );

    // Execute the parallel kernels on the arrays:

    double dt_min = 0 ;

    Kokkos::parallel_for( count , Init( coord ) );
    device_type::fence();

    for ( int i = 0 ; i < iter ; ++i ) {
      Kokkos::Impl::Timer timer ;
      Kokkos::parallel_for( count , HexGrad<device_type>( coord , grad ) );
      device_type::fence();
      const double dt = timer.seconds();
      if ( 0 == i ) dt_min = dt ;
      else dt_min = dt < dt_min ? dt : dt_min ;
    }

    return dt_min ;
  }
  KOKKOS_INLINE_FUNCTION
  int exampleCholPerformance(const string file_input,
                              const int treecut,
                              const int minblksize,
                              const int prunecut,
                              const int seed,
                              const int niter,
                              const int nthreads,
                              const int max_task_dependence,
                              const int team_size,
                              const int fill_level,
                              const int league_size,
                              const bool team_interface,
                              const bool skip_serial,
                              const bool mkl_interface,
                              const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
    typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

#ifdef HAVE_SHYLUTACHO_MKL 
    typedef typename CrsMatrixBaseType::value_type_array value_type_array;
#endif

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
    typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double 
      t_import = 0.0,
      t_reorder = 0.0,
      t_symbolic = 0.0,
      t_flat2hier = 0.0,
#ifdef HAVE_SHYLUTACHO_MKL 
      t_mkl_seq = 0.0,
#endif
      t_factor_seq = 0.0,
      t_factor_task = 0.0;
    const int start = -2;
    
    cout << "CholPerformance:: import input file = " << file_input << endl;        
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      t_import = timer.seconds();

      if (verbose)
        cout << AA << endl;
    }
    cout << "CholPerformance:: import input file::time = " << t_import << endl;

    cout << "CholPerformance:: reorder the matrix" << endl;        
    CrsMatrixBaseType PA("Permuted AA");
    CrsMatrixBaseType UU("UU");     // permuted base upper triangular matrix
    CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views

    {
      GraphHelperType S(AA, seed);
      {
        timer.reset();
        
        S.computeOrdering(treecut, minblksize);
        S.pruneTree(prunecut);
        
        PA.copy(S.PermVector(), S.InvPermVector(), AA);
        
        t_reorder = timer.seconds();

        if (verbose)
          cout << S << endl
               << PA << endl;
      }

      cout << "CholPerformance:: reorder the matrix::time = " << t_reorder << endl;            
      {
        SymbolicFactorHelperType F(PA, league_size);
        for (int i=start;i<niter;++i) {
          timer.reset();
        
          F.createNonZeroPattern(fill_level, Uplo::Upper, UU);

          // UU.copy(Uplo::Upper, PA);

          t_symbolic += timer.seconds() * (i>=0);
        }
        t_symbolic /= niter;

        cout << "CholPerformance:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl;

        if (verbose)
          cout << F << endl
               << UU << endl;
      }
      cout << "CholPerformance:: symbolic factorization::time = " << t_symbolic << endl;            
      {
        timer.reset();

        CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   S.TreeVector());
        
        for (ordinal_type k=0;k<HU.NumNonZeros();++k)
          HU.Value(k).fillRowViewArray();
        
        t_flat2hier = timer.seconds();
        
        cout << "CholPerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
      }
      cout << "CholPerformance:: construct hierarchical matrix::time = " << t_flat2hier << endl;            
    }

    // copy of UU
    CrsMatrixBaseType RR("RR");
    RR.copy(UU);

#ifdef HAVE_SHYLUTACHO_MKL
    if (!skip_serial && mkl_interface) {
      cout << "CholPerformance:: MKL factorize the matrix" << endl;
      CrsMatrixBaseType MM("MM");
      for (int i=start;i<niter;++i) {
        MM.copy(RR);
        MM.hermitianize(Uplo::Upper);

        MKL_INT  n  = static_cast<MKL_INT>(MM.NumRows());
        double  *a  = static_cast<double*>(MM.ValuePtr());
        MKL_INT *ia = static_cast<MKL_INT*>(MM.RowPtr());
        MKL_INT *ja = static_cast<MKL_INT*>(MM.ColPtr());

        // convert to 1-based matrix
        {
          for (ordinal_type k=0;k<(MM.NumRows()+1);++k)
            ++ia[k];
          
          for (size_type k=0;k<MM.NumNonZeros();++k)
            ++ja[k];
        }
        value_type_array mkl_result = value_type_array("mkl-ilu-values", MM.NumNonZeros());
        double  *bilu0 = static_cast<double*>(&mkl_result[0]);
        MKL_INT ipar[128];
        double dpar[128];
        MKL_INT ierr;
        
        // we provide ilu-k pattern 
        timer.reset();
        dcsrilu0(&n, a, ia, ja, bilu0, ipar, dpar, &ierr);
        t_mkl_seq += timer.seconds() * (i>=0) * 0.5;
        
        if (ierr != 0) 
          cout << " MKL Error = " << ierr << endl;
      }
      t_mkl_seq /= niter;
      cout << "CholPerformance:: MKL factorize the matrix::time = " << t_mkl_seq << endl;
    }
#endif

    if (!skip_serial) {
#ifdef __USE_FIXED_TEAM_SIZE__
      typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
      typename TaskFactoryType::policy_type policy(max_task_dependence, 1);
#endif
      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);

      CrsTaskViewType U(&UU);
      U.fillRowViewArray();

      cout << "CholPerformance:: Serial factorize the matrix" << endl;
      {
        for (int i=start;i<niter;++i) {
          UU.copy(RR);
          timer.reset();          
          // {
          //   auto future = TaskFactoryType::Policy().create(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
          //                                                  ::TaskFunctor<CrsTaskViewType>(U), 0);
          //   TaskFactoryType::Policy().spawn(future);
          //   Kokkos::Experimental::wait(TaskFactoryType::Policy());
          // }
          {
            Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
              ::invoke(TaskFactoryType::Policy(),
                                TaskFactoryType::Policy().member_single(),
                                U);
          }
          t_factor_seq += timer.seconds() * (i>=0);
        }
        t_factor_seq /= niter;

        if (verbose)
          cout << UU << endl;
      }
      cout << "CholPerformance:: Serial factorize the matrix::time = " << t_factor_seq << endl;
    }

//     {
// #ifdef __USE_FIXED_TEAM_SIZE__
//       typename TaskFactoryType::policy_type policy(max_task_dependence);
// #else
//       typename TaskFactoryType::policy_type policy(max_task_dependence, nthreads);
// #endif
//       TaskFactoryType::setPolicy(&policy);

//       CrsTaskViewType U(&UU);
//       U.fillRowViewArray();

//       cout << "CholPerformance:: Team factorize the matrix:: team_size = " << nthreads << endl;
//       {
//         timer.reset();
        
//         auto future = TaskFactoryType::Policy().create(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
//                                                        ::TaskFunctor<CrsTaskViewType>(U), 0);
//         TaskFactoryType::Policy().spawn(future);
//         Kokkos::Experimental::wait(TaskFactoryType::Policy());
        
//         t_factor_team = timer.seconds();
        
//         if (verbose)
//           cout << UU << endl;
//       }
//       cout << "CholPerformance:: Team factorize the matrix::time = " << t_factor_team << endl;
//     }

    {
#ifdef __USE_FIXED_TEAM_SIZE__
      typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
      typename TaskFactoryType::policy_type policy(max_task_dependence, team_size);
#endif
      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);
      
      cout << "CholPerformance:: ByBlocks factorize the matrix:: team_size = " << team_size << endl;
      CrsHierTaskViewType H(&HU);
      {
        for (int i=start;i<niter;++i) {
          UU.copy(RR);
          timer.reset();
          {
            auto future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks>::
                                                                TaskFunctor<CrsHierTaskViewType>(H), 0);
            TaskFactoryType::Policy().spawn(future);
            Kokkos::Experimental::wait(TaskFactoryType::Policy());
          }
          t_factor_task += timer.seconds() * (i>=0);
        }
        t_factor_task /= niter;

        if (verbose)
          cout << UU << endl;
      }  
      cout << "CholPerformance:: ByBlocks factorize the matrix::time = " << t_factor_task << endl;
    }

    if (!skip_serial) {
#ifdef HAVE_SHYLUTACHO_MKL
      cout << "CholPerformance:: mkl/chol scale [mkl/chol] = " << t_mkl_seq/t_factor_seq << endl;    
      cout << "CholPerformance:: mkl/task  scale [mkl/task]  = " << t_mkl_seq/t_factor_task << endl;    
#else
      cout << "CholPerformance:: task scale [seq/task] = " << t_factor_seq/t_factor_task << endl;    
#endif
      //cout << "CholPerformance:: team scale [seq/team] = " << t_factor_seq/t_factor_team << endl;    
    }

    return r_val;
  }
void graph_color_symbolic(
    KernelHandle *handle,
    typename KernelHandle::row_lno_t num_rows,
    typename KernelHandle::row_lno_t num_cols,
    lno_row_view_t_ row_map,
    lno_nnz_view_t_ entries,
    bool is_symmetric = true){

  Kokkos::Impl::Timer timer;

  typename KernelHandle::GraphColoringHandleType *gch = handle->get_graph_coloring_handle();

  ColoringAlgorithm algorithm = gch->get_coloring_type();

  typedef typename KernelHandle::GraphColoringHandleType::color_view_t color_view_type;

  color_view_type colors_out = color_view_type("Graph Colors", num_rows);


  typedef typename Impl::GraphColor
      <typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, lno_nnz_view_t_> BaseGraphColoring;
  BaseGraphColoring *gc = NULL;


  switch (algorithm){
  case COLORING_SERIAL:

    gc = new BaseGraphColoring(
        num_rows, entries.dimension_0(),
        row_map, entries, gch);
    break;
  case COLORING_VB:
  case COLORING_VBBIT:
  case COLORING_VBCS:

    typedef typename Impl::GraphColor_VB
        <typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, lno_nnz_view_t_> VBGraphColoring;
    gc = new VBGraphColoring(
        num_rows, entries.dimension_0(),
        row_map, entries, gch);
    break;
  case COLORING_EB:

    typedef typename Impl::GraphColor_EB
        <typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, lno_nnz_view_t_> EBGraphColoring;

    gc = new EBGraphColoring(num_rows, entries.dimension_0(),row_map, entries, gch);
    break;
  case COLORING_DEFAULT:
    break;

  }

  int num_phases = 0;
  gc->color_graph(colors_out, num_phases);
  delete gc;
  double coloring_time = timer.seconds();
  gch->add_to_overall_coloring_time(coloring_time);
  gch->set_coloring_time(coloring_time);
  gch->set_num_phases(num_phases);
  gch->set_vertex_colors(colors_out);
}
  KOKKOS_INLINE_FUNCTION
  int exampleCholDirectPerformance(const string file_input,
                                   const int treecut,
                                   const int minblksize,
                                   const int prunecut,
                                   const int seed,
                                   const int niter,
                                   const int nthreads,
                                   const int max_task_dependence,
                                   const int team_size,
                                   const int league_size,
                                   const bool team_interface,
                                   const bool skip_serial,
                                   const bool mkl_interface,
                                   const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
    typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

#ifdef HAVE_SHYLUTACHO_MKL 
    typedef typename CrsMatrixBaseType::value_type_array value_type_array;
#endif

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
    typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double 
      t_import = 0.0,
      t_reorder = 0.0,
      t_symbolic = 0.0,
      t_flat2hier = 0.0,
#ifdef HAVE_SHYLUTACHO_MKL 
      t_mkl = 0.0,
#endif
      t_factor_seq = 0.0,   t_solve_seq = 0.0,
      t_factor_task = 0.0,  t_solve_task = 0.0;
    const int start = 0;
    
    cout << "CholDirectPerformance:: import input file = " << file_input << endl;        
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      t_import = timer.seconds();
    }
    cout << "CholDirectPerformance:: import input file::time = " << t_import << endl;

    cout << "CholDirectPerformance:: reorder the matrix" << endl;        
    CrsMatrixBaseType PA("Permuted AA");
    CrsMatrixBaseType UU("UU");     // permuted base upper triangular matrix
    CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views

    DenseMatrixBaseType BB("BB", AA.NumRows(), nrhs);
    DenseHierMatrixBaseType HB("HB");

    {
      GraphHelperType S(AA, seed);
      {
        timer.reset();
        
        S.computeOrdering(treecut, minblksize);
        S.pruneTree(prunecut);
        
        PA.copy(S.PermVector(), S.InvPermVector(), AA);
        
        t_reorder = timer.seconds();
      }

      cout << "CholDirectPerformance:: reorder the matrix::time = " << t_reorder << endl;            
      {
        SymbolicFactorHelperType F(PA, league_size);
        for (int i=start;i<niter;++i) {
          timer.reset();
        
          F.createNonZeroPattern(Uplo::Upper, UU);

          t_symbolic += timer.seconds() * (i>=0);
        }
        t_symbolic /= niter;

        cout << "CholDirectPerformance:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl;
      }
      cout << "CholDirectPerformance:: symbolic factorization::time = " << t_symbolic << endl;            
      {
        timer.reset();

        CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   S.TreeVector());
        
        for (ordinal_type k=0;k<HU.NumNonZeros();++k)
          HU.Value(k).fillRowViewArray();

        DenseMatrixHelper::flat2hier(BB, HB,
                                     S.NumBlocks(),
                                     S.RangeVector(),
                                     nb);
        
        t_flat2hier = timer.seconds();
        
        cout << "CholDirectPerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
      }
      cout << "CholDirectPerformance:: construct hierarchical matrix::time = " << t_flat2hier << endl;            
    }

    // copy of UU
    CrsMatrixBaseType RR("RR");
    RR.copy(UU);

    /////////////////////////// Serial Numeric Factorization
    if (!skip_serial) {
#ifdef __USE_FIXED_TEAM_SIZE__
      typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
      typename TaskFactoryType::policy_type policy(max_task_dependence, 1);
#endif
      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);

      CrsTaskViewType U(&UU);
      U.fillRowViewArray();

      cout << "CholDirectPerformance:: Serial factorize the matrix" << endl;
      {
        for (int i=start;i<niter;++i) {
          UU.copy(RR);
          timer.reset();          
          {
            Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
              ::invoke(TaskFactoryType::Policy(),
                       TaskFactoryType::Policy().member_single(),
                       U);
          }
          t_factor_seq += timer.seconds() * (i>=0);
        }
        t_factor_seq /= niter;
      }
      cout << "CholDirectPerformance:: Serial factorize the matrix::time = " << t_factor_seq << endl;

      cout << "CholDirectPerformance:: Serial forward/backward solve" << endl;      
      {
        for (int i=start;i<niter;++i) {
          XX.copy(BB);
          timer.reset();          
          {
            TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
              ::invoke(TaskFactoryType::Policy(),
                       TaskFactoryType::Policy().member_single(),
                       Diag::NonUnit, U, X);
            TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
              ::invoke(TaskFactoryType::Policy(),
                       TaskFactoryType::Policy().member_single(),
                       Diag::NonUnit, U, X);
          }
          t_factor_seq += timer.seconds() * (i>=0);
        }
        t_factor_seq /= niter;
      }
      cout << "CholDirectPerformance:: Serial forward/backward solve::time = " << t_solve_seq << endl;
    }

    // if (!skip_serial) 
    //   cout << "CholDirectPerformance:: task scale [seq/task] = " << t_factor_seq/t_factor_task << endl;    

    return r_val;
  }
Beispiel #19
0
void VerletKokkos::run(int n)
{
  bigint ntimestep;
  int nflag,sortflag;

  int n_post_integrate = modify->n_post_integrate;
  int n_pre_exchange = modify->n_pre_exchange;
  int n_pre_neighbor = modify->n_pre_neighbor;
  int n_pre_force = modify->n_pre_force;
  int n_post_force = modify->n_post_force;
  int n_end_of_step = modify->n_end_of_step;

  if (atomKK->sortfreq > 0) sortflag = 1;
  else sortflag = 0;

  static double time = 0.0;
  static int count = 0;
  atomKK->sync(Device,ALL_MASK);
  Kokkos::Impl::Timer ktimer;

  for (int i = 0; i < n; i++) {

    ntimestep = ++update->ntimestep;
    ev_set(ntimestep);

    // initial time integration

    ktimer.reset();
    timer->stamp();
    modify->initial_integrate(vflag);
    time += ktimer.seconds();
    if (n_post_integrate) modify->post_integrate();
    timer->stamp(Timer::MODIFY);

    // regular communication vs neighbor list rebuild

    nflag = neighbor->decide();

    if (nflag == 0) {
      timer->stamp();
      comm->forward_comm();
      timer->stamp(Timer::COMM);
    } else {
      // added debug
      //atomKK->sync(Host,ALL_MASK);
      //atomKK->modified(Host,ALL_MASK);

      if (n_pre_exchange) {
        timer->stamp();
        modify->pre_exchange();
        timer->stamp(Timer::MODIFY);
      }
      // debug
      //atomKK->sync(Host,ALL_MASK);
      //atomKK->modified(Host,ALL_MASK);
      if (triclinic) domain->x2lamda(atomKK->nlocal);
      domain->pbc();
      if (domain->box_change) {
        domain->reset_box();
        comm->setup();
        if (neighbor->style) neighbor->setup_bins();
      }
      timer->stamp();

      // added debug
      //atomKK->sync(Device,ALL_MASK);
      //atomKK->modified(Device,ALL_MASK);

      comm->exchange();
      if (sortflag && ntimestep >= atomKK->nextsort) atomKK->sort();
      comm->borders();

      // added debug
      //atomKK->sync(Host,ALL_MASK);
      //atomKK->modified(Host,ALL_MASK);

      if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost);

      timer->stamp(Timer::COMM);
      if (n_pre_neighbor) {
        modify->pre_neighbor();
        timer->stamp(Timer::MODIFY);
      }
      neighbor->build();
      timer->stamp(Timer::NEIGH);
    }

    // force computations
    // important for pair to come before bonded contributions
    // since some bonded potentials tally pairwise energy/virial
    // and Pair:ev_tally() needs to be called before any tallying

    force_clear();

    timer->stamp();

    // added for debug
    //atomKK->k_x.sync<LMPHostType>();
    //atomKK->k_f.sync<LMPHostType>();
    //atomKK->k_f.modify<LMPHostType>();
    if (n_pre_force) {
      modify->pre_force(vflag);
      timer->stamp(Timer::MODIFY);
    }


    if (pair_compute_flag) {
      atomKK->sync(force->pair->execution_space,force->pair->datamask_read);
      atomKK->modified(force->pair->execution_space,force->pair->datamask_modify);
      force->pair->compute(eflag,vflag);
      timer->stamp(Timer::PAIR);
    }

    if (atomKK->molecular) {
      if (force->bond) {
        atomKK->sync(force->bond->execution_space,force->bond->datamask_read);
        atomKK->modified(force->bond->execution_space,force->bond->datamask_modify);
        force->bond->compute(eflag,vflag);
      }
      if (force->angle) {
        atomKK->sync(force->angle->execution_space,force->angle->datamask_read);
        atomKK->modified(force->angle->execution_space,force->angle->datamask_modify);
        force->angle->compute(eflag,vflag);
      }
      if (force->dihedral) {
        atomKK->sync(force->dihedral->execution_space,force->dihedral->datamask_read);
        atomKK->modified(force->dihedral->execution_space,force->dihedral->datamask_modify);
        force->dihedral->compute(eflag,vflag);
      }
      if (force->improper) {
        atomKK->sync(force->improper->execution_space,force->improper->datamask_read);
        atomKK->modified(force->improper->execution_space,force->improper->datamask_modify);
        force->improper->compute(eflag,vflag);
      }
      timer->stamp(Timer::BOND);
    }

    if (kspace_compute_flag) {
      atomKK->sync(force->kspace->execution_space,force->kspace->datamask_read);
      atomKK->modified(force->kspace->execution_space,force->kspace->datamask_modify);
      force->kspace->compute(eflag,vflag);
      timer->stamp(Timer::KSPACE);
    }

    // reverse communication of forces

    if (force->newton) comm->reverse_comm();
    timer->stamp(Timer::COMM);

    // force modifications, final time integration, diagnostics

    ktimer.reset();

    if (n_post_force) modify->post_force(vflag);
    modify->final_integrate();
    if (n_end_of_step) modify->end_of_step();
    timer->stamp(Timer::MODIFY);

    time += ktimer.seconds();

    // all output

    if (ntimestep == output->next) {
       atomKK->sync(Host,ALL_MASK);

      timer->stamp();
      output->write(ntimestep);
      timer->stamp(Timer::OUTPUT);
    }
  }
}
  KOKKOS_INLINE_FUNCTION
  int exampleTriSolvePerformance(const string file_input,
                                 const OrdinalType nrhs,
                                 const OrdinalType nb,
                                 const int niter,
                                 const int nthreads,
                                 const int max_task_dependence,
                                 const int team_size, 
                                 const bool team_interface,
                                 const bool skip_serial,
                                 const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
    typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

    typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType;
    typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType;

    typedef DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> DenseHierMatrixBaseType;

    typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType;
    typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double 
      t_import = 0.0,
      t_reorder = 0.0,
      t_solve_seq = 0.0,
      t_solve_task = 0.0;
    const int start = -2;

    cout << "TriSolvePerformance:: import input file = " << file_input << endl;
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      t_import = timer.seconds();

      if (verbose)
        cout << AA << endl;
    }
    cout << "TriSolvePerformance:: import input file::time = " << t_import << endl;

    CrsMatrixBaseType   UU("UU");
    DenseMatrixBaseType BB("BB",  AA.NumRows(), nrhs);

    cout << "TriSolvePerformance:: reorder the matrix and partition right hand side, nb = " << nb << endl;
    CrsHierMatrixBaseType   HU("HU");
    DenseHierMatrixBaseType HB("HB");
    {
      timer.reset();

      GraphHelperType S(AA);
      S.computeOrdering();

      CrsMatrixBaseType PA("Permuted AA");
      PA.copy(S.PermVector(), S.InvPermVector(), AA);

      UU.copy(Uplo::Upper, PA);

      CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                 S.NumBlocks(),
                                 S.RangeVector(),
                                 S.TreeVector());

      DenseMatrixHelper::flat2hier(BB, HB,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   nb);

      t_reorder = timer.seconds();

      cout << "TriSolvePerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;

      if (verbose)
        cout << UU << endl;
    }
    cout << "TriSolvePerformance:: reorder the matrix and partition right hand side::time = " << t_reorder << endl;

    const size_t max_concurrency = 16384;
    cout << "TriSolvePerformance:: max concurrency = " << max_concurrency << endl;

    const size_t max_task_size = 3*sizeof(CrsTaskViewType)+128;
    cout << "TriSolvePerformance:: max task size   = " << max_task_size << endl;

    if (!skip_serial) {
      __INIT_DENSE_MATRIX__(BB, 1.0);
      typename TaskFactoryType::policy_type policy(max_concurrency,
                                                   max_task_size,
                                                   max_task_dependence, 
                                                   1);

      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);
      
      CrsTaskViewType U(&UU);
      DenseTaskViewType B(&BB);
      U.fillRowViewArray();

      cout << "TriSolvePerformance:: Serial forward and backward solve of the matrix" << endl;
      {
        for (int i=start;i<niter;++i) {
          timer.reset();
          // {
          //   auto future = TaskFactoryType::Policy().create_team(TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
          //                                                       ::TaskFunctor<CrsTaskViewType,DenseTaskViewType>
          //                                                       (Diag::NonUnit, U, B), 0);
            
          //   TaskFactoryType::Policy().spawn(future);
          //   Kokkos::Experimental::wait(TaskFactoryType::Policy());
          // }
          {
            TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
              ::invoke(TaskFactoryType::Policy(),
                                TaskFactoryType::Policy().member_single(),
                                Diag::NonUnit, U, B);
            
          }
          // {
          //   auto future = TaskFactoryType::Policy().create_team(TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
          //                                                       ::TaskFunctor<CrsTaskViewType,DenseTaskViewType>
          //                                                       (Diag::NonUnit, U, B), 0);
            
          //   TaskFactoryType::Policy().spawn(future);
          //   Kokkos::Experimental::wait(TaskFactoryType::Policy());
          // }
          {
            TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
              ::invoke(TaskFactoryType::Policy(),
                                TaskFactoryType::Policy().member_single(),
                                Diag::NonUnit, U, B);
          }
          t_solve_seq += timer.seconds() * (i>=0);
        }
        t_solve_seq /= niter;
        
        if (verbose)
          cout << BB << endl;
      }
      cout << "TriSolvePerformance:: Serial forward and backward solve of the matrix::time = " << t_solve_seq << endl;
    }
    
    {
      __INIT_DENSE_MATRIX__(BB, 1.0);
      typename TaskFactoryType::policy_type policy(max_concurrency,
                                                   max_task_size,
                                                   max_task_dependence, 
                                                   team_size);

      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);

      // wrap the hierarchically partitioned matrix with task handler
      CrsHierTaskViewType TU(&HU);
      for (ordinal_type k=0;k<HU.NumNonZeros();++k)
        HU.Value(k).fillRowViewArray();
      
      DenseHierTaskViewType TB(&HB);
      
      cout << "TriSolvePerformance:: ByBlocks forward and backward solve of the matrix" << endl;
      {
        for (int i=start;i<niter;++i) {
          timer.reset(); 
          {
            auto future_forward_solve = TaskFactoryType::Policy().create_team
              (TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::ByBlocks>
               ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
               (Diag::NonUnit, TU, TB), 0);
            
            TaskFactoryType::Policy().spawn(future_forward_solve);
            
            auto future_backward_solve = TaskFactoryType::Policy().create_team
              (TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::ByBlocks>
               ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
               (Diag::NonUnit, TU, TB), 1);
            
            TaskFactoryType::Policy().add_dependence(future_backward_solve, future_forward_solve);
            TaskFactoryType::Policy().spawn(future_backward_solve);
            
            Kokkos::Experimental::wait(TaskFactoryType::Policy());
          }
          t_solve_task += timer.seconds() * (i>=0);
        }
        t_solve_task /= niter;

        if (verbose)
          cout << BB << endl;
      }

      cout << "TriSolvePerformance:: ByBlocks forward and backward solve of the matrix::time = " << t_solve_task << endl;
    }

    if (!skip_serial) {
      cout << "TriSolvePerformance:: task scale [seq/task] = " << t_solve_seq/t_solve_task << endl;
    }

    return r_val;
  }
  KOKKOS_INLINE_FUNCTION
  int exampleCholUnblocked(const string file_input,
                           const int max_task_dependence,
                           const int team_size,
                           const int algo,
                           const int variant,
                           const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "CholUnblocked:: import input file = " << file_input << endl;        
    CrsMatrixBaseType AA("AA"), UU("UU");    
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      UU.copy(Uplo::Upper, AA);

      t = timer.seconds();

      if (verbose)
        cout << UU << endl;
    }
    cout << "CholUnblocked:: import input file::time = " << t << endl;        

#ifdef __USE_FIXED_TEAM_SIZE__
    typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
    typename TaskFactoryType::policy_type policy(max_task_dependence, team_size);
#endif
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    cout << "CholUnblocked:: factorize the matrix" << endl;
    CrsTaskViewType U(&UU);
    U.fillRowViewArray();
    {
      timer.reset();
    
      typename TaskFactoryType::future_type future;
      switch (algo) {
      case AlgoChol::UnblockedOpt: {
        if (variant == Variant::One)
          future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
                                                         ::TaskFunctor<CrsTaskViewType>(U), 0);
        else if (variant == Variant::Two)
          future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::Two>
                                                         ::TaskFunctor<CrsTaskViewType>(U), 0);
        else {
          ERROR(">> Not supported algorithm variant");          
        }
        break;
      }
      case AlgoChol::Dummy: {
        future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::Dummy>
                                                       ::TaskFunctor<CrsTaskViewType>(U), 0);
        break;
      }
      default:
        ERROR(">> Not supported algorithm");
        break;
      }
      TaskFactoryType::Policy().spawn(future);
      Kokkos::Experimental::wait(TaskFactoryType::Policy());

      t = timer.seconds();

      if (verbose)
        cout << UU << endl;
    }   
    cout << "CholUnblocked:: factorize the matrix::time = " << t << endl; 
    
    return r_val;
  }
  BASKER_INLINE
  int Basker<Int, Entry, Exe_Space>::factor_inc_lvl(Int option)
  {

    printf("Factor Inc Level Called \n");

    gn = A.ncol;
    gm = A.nrow;

    if(Options.btf == BASKER_TRUE)
      {

	//JDB: We can change this for the new inteface

	//call reference copy constructor
	gn = A.ncol;
	gm = A.nrow;
	A = BTF_A; 
	//printf("\n\n Switching A, newsize: %d \n",
	//   A.ncol);
	//printMTX("A_FACTOR.mtx", A);
      }
   

    //Spit into Domain and Sep
    //----------------------Domain-------------------------//
    #ifdef BASKER_KOKKOS

    //====TIMER==
    #ifdef BASKER_TIME
    Kokkos::Impl::Timer       timer;
    #endif
    //===TIMER===

    typedef Kokkos::TeamPolicy<Exe_Space>        TeamPolicy;

    if(btf_tabs_offset != 0)
      {

	kokkos_nfactor_domain_inc_lvl <Int,Entry,Exe_Space>
	  domain_nfactor(this);
	Kokkos::parallel_for(TeamPolicy(num_threads,1), 
			     domain_nfactor);
	Kokkos::fence();
    

	//=====Check for error======
	while(true)
	  {
	INT_1DARRAY thread_start;
	MALLOC_INT_1DARRAY(thread_start, num_threads+1);
	init_value(thread_start, num_threads+1, 
		   (Int) BASKER_MAX_IDX);
	int nt = nfactor_domain_error(thread_start);
	if(nt == BASKER_SUCCESS)
	  {
	    break;
	  }
	else
	  {
	    printf("restart \n");
	    kokkos_nfactor_domain_remalloc <Int, Entry, Exe_Space>
	      diag_nfactor_remalloc(this, thread_start);
	    Kokkos::parallel_for(TeamPolicy(num_threads,1),
				 diag_nfactor_remalloc);
	    Kokkos::fence();
	  }
      }//end while
   

    //====TIMER===
    #ifdef BASKER_TIME
    printf("Time DOMAIN: %f \n", timer.seconds());
    timer.reset();
    #endif
    //====TIMER====
    

    #else// else basker_kokkos
    #pragma omp parallel
    {


    }//end omp parallel
    #endif //end basker_kokkos

      }
    //-------------------End--Domian--------------------------//

   
    //---------------------------Sep--------------------------//

    
    
    if(btf_tabs_offset != 0)
      {
        //for(Int l=1; l<=1; l++)
       for(Int l=1; l <= tree.nlvls; l++)
      {

        //Come back for syncs
        
	//#ifdef BASKER_OLD_BARRIER
	Int lthreads = pow(2,l);
	Int lnteams = num_threads/lthreads;
	//#else
	//Int lthreads = 1;
	//Int lnteams = num_threads/lthreads;
	//#endif

	
	//printf("\n\n   ============ SEP: %d ======\n\n",l);

	#ifdef BASKER_KOKKOS
	Kokkos::Impl::Timer  timer_inner_sep;
	#ifdef BASKER_NO_LAMBDA
       
	kokkos_nfactor_sep2_inc_lvl <Int, Entry, Exe_Space>
	  sep_nfactor(this,l);
	
	Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),
			     sep_nfactor);
	Kokkos::fence();
	
	#ifdef BASKER_TIME
	printf("Time INNERSEP: %d %f \n", 
	       l, timer_inner_sep.seconds());
	#endif
        #else //ELSE BASKER_NO_LAMBDA
	//Note: to be added
        #endif //end BASKER_NO_LAMBDA
	#else
	#pragma omp parallel
	{

	}//end omp parallel
	#endif
      }//end over each level

    #ifdef BASKER_TIME
    printf("Time SEP: %f \n", timer.seconds());
    #endif
      }

   
    //-------------------------End Sep----------------//


    //-------------------IF BTF-----------------------//
    if(Options.btf == BASKER_TRUE)
      {
	//=====Timer
	#ifdef BASKER_TIME
	Kokkos::Impl::Timer  timer_btf;
	#endif
	//====Timer
	
	//======Call diag factor====
	/*
	kokkos_nfactor_diag <Int, Entry, Exe_Space> 
	  diag_nfactor(this);
	Kokkos::parallel_for(TeamPolicy(num_threads,1),
			     diag_nfactor);
	Kokkos::fence();
	*/
	//=====Check for error======
	//while(true)
	// {
	    //INT_1DARRAY thread_start;
	    // MALLOC_INT_1DARRAY(thread_start, num_threads+1);
	    //init_value(thread_start, num_threads+1, 
	    //	       (Int) BASKER_MAX_IDX);
	    //int nt = nfactor_diag_error(thread_start);
	    // if(nt == BASKER_SUCCESS)
	    //  {
	    ///		break;
	    // }
	    //else
	    // {
		/*
		break;
		printf("restart \n");
		kokkos_nfactor_diag_remalloc <Int, Entry, Exe_Space>
		  diag_nfactor_remalloc(this, thread_start);
		Kokkos::parallel_for(TeamPolicy(num_threads,1),
				     diag_nfactor);
		Kokkos::fence();
		*/
	    //}
	    // }//end while

	//====TIMER
	#ifdef BASKER_TIME
	printf("Time BTF: %f \n", 
	       timer_btf.seconds());
	#endif
	//===TIMER

      }//end btf call

    
    return 0;
  }//end factor_lvl_inc()
int main (int argc, char ** argv){
  if (argc < 2){
    std::cerr << "Usage:" << argv[0] << " input_bin_file" << std::endl;
    exit(1);
  }


  Kokkos::initialize(argc, argv);
  MyExecSpace::print_configuration(std::cout);
  idx nv = 0, ne = 0;
  idx *xadj, *adj;
  wt *ew;

  KokkosKernels::Experimental::Graph::Utils::read_graph_bin<idx, wt> (&nv, &ne, &xadj, &adj, &ew, argv[1]);

  std::cout << "nv:" << nv << " ne:" << ne << std::endl;

  um_array_type _xadj (xadj, nv + 1);
  um_edge_array_type _adj (adj, ne);

  idx_array_type kok_xadj ("xadj", nv + 1);
  idx_edge_array_type kok_adj("adj", ne);

  idx_array_type sym_xadj;
  idx_edge_array_type sym_adj;

  Kokkos::deep_copy (kok_xadj, _xadj);
  Kokkos::deep_copy (kok_adj, _adj);

  wt_um_edge_array_type _mtx_vals (ew, ne);
  value_array_type kok_mtx_vals ("MTX_VALS", ne);
  Kokkos::deep_copy (kok_mtx_vals, _mtx_vals);

  delete [] xadj;
  delete [] adj;
  delete [] ew;

  std::cout << "Symetrizing Graph" << std::endl;

  Kokkos::Impl::Timer timer;

  KokkosKernels::Experimental::Util::symmetrize_graph_symbolic_hashmap<
  idx_array_type, idx_edge_array_type, idx_array_type, idx_edge_array_type, MyExecSpace>
    (nv, kok_xadj, kok_adj,sym_xadj, sym_adj);
  Kokkos::fence();

  double t = timer.seconds();
  std::cout << "Time to symmetrize:" << t << std::endl;
  KokkosKernels::Experimental::Util::print_1Dview(kok_xadj);
  KokkosKernels::Experimental::Util::print_1Dview(kok_adj);

  std::cout << "Symetric Graph" << std::endl;
  KokkosKernels::Experimental::Util::print_1Dview(sym_xadj);

  KokkosKernels::Experimental::Util::print_1Dview(sym_adj);

  Kokkos::finalize();



  return 0;
}
  int exampleCholByBlocks(const std::string file_input,
                          const int treecut,
                          const int prunecut,
                          const int fill_level,
                          const int rows_per_team,
                          const int max_concurrency,
                          const int max_task_dependence,
                          const int team_size,
                          const bool check,
                          const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    // for simple test, let's use host space only here, for device it needs mirroring.

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType;
    typedef CrsMatrixView<CrsMatrixBaseHostType> CrsMatrixViewHostType;

    typedef GraphTools<ordinal_type,size_type,HostSpaceType> GraphToolsHostType;

    typedef GraphTools_Scotch<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_Scotch;
    typedef GraphTools_CAMD<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_CAMD;

    typedef IncompleteSymbolicFactorization<CrsMatrixBaseHostType> IncompleteSymbolicFactorizationType;

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef TaskView<CrsMatrixViewHostType> CrsTaskViewHostType;
    typedef CrsMatrixBase<CrsTaskViewHostType,ordinal_type,size_type,HostSpaceType> CrsHierBaseHostType;
    typedef CrsMatrixView<CrsHierBaseHostType> CrsHierViewHostType;
    typedef TaskView<CrsHierViewHostType> CrsTaskHierViewHostType;

    int r_val = 0;
    
    Kokkos::Impl::Timer timer;

    ///
    /// Read from matrix market
    ///
    ///     input  - file
    ///     output - AA_host
    ///
    CrsMatrixBaseHostType AA_host("AA_host");
    timer.reset();
    {
      std::ifstream in;
      in.open(file_input);
      if (!in.good()) {
        std::cout << "Failed in open the file: " << file_input << std::endl;
        return -1;
      }
      MatrixMarket::read(AA_host, in);
    }
    double t_read = timer.seconds();

    if (verbose)
      AA_host.showMe(std::cout) << std::endl;

    ///
    /// Create a graph structure for Scotch and CAMD (rptr, cidx)
    ///     
    ///     rptr and cidx are need to be set up for Scotch and CAMD
    ///
    typename GraphToolsHostType::size_type_array rptr("Graph::RowPtrArray", AA_host.NumRows() + 1);
    typename GraphToolsHostType::ordinal_type_array cidx("Graph::ColIndexArray", AA_host.NumNonZeros());

    ///
    /// Run Scotch
    ///
    ///     input  - rptr, cidx, A_host
    ///     output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted)
    ///
    timer.reset();
    GraphToolsHostType::getGraph(rptr, cidx, AA_host);
    double t_graph = timer.seconds();

    GraphToolsHostType_Scotch S;
    S.setGraph(AA_host.NumRows(), rptr, cidx);
    S.setSeed(0);
    S.setTreeLevel();
    S.setStrategy( SCOTCH_STRATSPEED 
                   | SCOTCH_STRATLEVELMAX   
                   | SCOTCH_STRATLEVELMIN   
                   | SCOTCH_STRATLEAFSIMPLE 
                   | SCOTCH_STRATSEPASIMPLE 
                   );
    
    timer.reset();
    S.computeOrdering(treecut);
    double t_scotch = timer.seconds();
    
    if (verbose)
      S.showMe(std::cout) << std::endl;

    CrsMatrixBaseHostType AA_scotch_host("AA_scotch_host");
    AA_scotch_host.createConfTo(AA_host);
    
    CrsMatrixTools::copy(AA_scotch_host,
                         S.PermVector(),
                         S.InvPermVector(),
                         AA_host);

    if (verbose)
      AA_scotch_host.showMe(std::cout) << std::endl;

    ///
    /// Run CAMD
    ///
    ///     input  - rptr, cidx, A_host
    ///     output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted)
    ///
    timer.reset();
    GraphToolsHostType::getGraph(rptr, cidx, AA_scotch_host);
    t_graph += timer.seconds();

    GraphToolsHostType_CAMD C;
    C.setGraph(AA_scotch_host.NumRows(),
               rptr, cidx,
               S.NumBlocks(),
               S.RangeVector());

    timer.reset();
    C.computeOrdering();
    double t_camd = timer.seconds();

    if (verbose)
      C.showMe(std::cout) << std::endl;
    
    CrsMatrixBaseHostType AA_camd_host("AA_camd_host");
    AA_camd_host.createConfTo(AA_scotch_host);
    
    CrsMatrixTools::copy(AA_camd_host,
                         C.PermVector(),
                         C.InvPermVector(),
                         AA_scotch_host);
    
    if (verbose)
      AA_camd_host.showMe(std::cout) << std::endl;

    ///
    /// Symbolic factorization
    ///
    ///     input  - 
    ///     output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted)
    ///
    CrsMatrixBaseHostType AA_factor_host("AA_factor_host");

    timer.reset();
    IncompleteSymbolicFactorizationType::createNonZeroPattern(AA_factor_host,
                                                    fill_level,
                                                    Uplo::Upper,
                                                    AA_camd_host,
                                                    rows_per_team);
    double t_symbolic = timer.seconds();

    if (verbose)
      AA_factor_host.showMe(std::cout) << std::endl;

    ///
    /// Clean tempoerary matrices
    ///
    ///     input  - AA_scotch_host, AA_camd_host, C, rptr, cidx
    ///     output - none
    ///
    AA_scotch_host = CrsMatrixBaseHostType();
    AA_camd_host   = CrsMatrixBaseHostType();

    C = GraphToolsHostType_CAMD();
    rptr = typename GraphToolsHostType::size_type_array();
    cidx = typename GraphToolsHostType::ordinal_type_array();

    ///
    /// Create task policy
    ///
    ///     input  - max_task_size
    ///     output - policy
    ///
    const size_type max_task_size = (3*sizeof(CrsTaskViewHostType)+sizeof(PolicyType)+128);

    timer.reset();
    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);
    double t_policy = timer.seconds();

    ///
    /// Sequential execution
    ///
    ///     input  - AA_factor_host (matrix to be compared), rowviews
    ///     output - BB_factor_host, B_factor_host
    ///
    double t_chol_serial = 0;
    CrsMatrixBaseHostType BB_factor_host("BB_factor_host");
    if (check) {
      BB_factor_host.createConfTo(AA_factor_host);
      CrsMatrixTools::copy(BB_factor_host, AA_factor_host);
      
      CrsTaskViewHostType B_factor_host(BB_factor_host);
      Kokkos::View<typename CrsTaskViewHostType::row_view_type*,HostSpaceType>
        rowviews("RowViewInMatView", B_factor_host.NumRows());
      B_factor_host.setRowViewArray(rowviews);
      
      timer.reset();
      {
        auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::Unblocked,Variant::One>
                                              ::createTaskFunctor(policy, B_factor_host));
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)");
      }
      t_chol_serial = timer.seconds();

      if (verbose)
        BB_factor_host.showMe(std::cout) << std::endl;
    }

    ///
    /// Task parallel execution
    ///
    ///    input  - AA_factor_host, rowviews
    ///    output - HA_factor_host, AA_factor_host, B_factor_host
    ///
    double t_hier = 0, t_blocks = 0, t_chol_parallel = 0;
    CrsHierBaseHostType HA_factor_host("HA_factor_host");
    {
      timer.reset();
      S.pruneTree(prunecut);
      CrsMatrixTools::createHierMatrix(HA_factor_host, 
                                       AA_factor_host, 
                                       S.NumBlocks(),
                                       S.RangeVector(),
                                       S.TreeVector());
      t_hier = timer.seconds();    
      
      timer.reset();
      size_type nblocks = HA_factor_host.NumNonZeros();
      
      Kokkos::View<ordinal_type*,HostSpaceType> 
        ap_rowview_blocks("NumRowViewInBlocks", nblocks + 1);
      
      ap_rowview_blocks(0) = 0;
      for (ordinal_type k=0;k<nblocks;++k) 
        ap_rowview_blocks(k+1) = ap_rowview_blocks(k) + HA_factor_host.Value(k).NumRows();
      
      Kokkos::View<typename CrsMatrixViewHostType::row_view_type*,HostSpaceType>
        rowview_blocks("RowViewInBlocks", ap_rowview_blocks(nblocks));
      
      Kokkos::parallel_for(Kokkos::RangePolicy<HostSpaceType>(0, nblocks),
                           [&](const ordinal_type k) {
                             const ordinal_type begin = ap_rowview_blocks(k);
                             const ordinal_type end   = ap_rowview_blocks(k+1);
                             HA_factor_host.Value(k).setRowViewArray
                               (Kokkos::subview(rowview_blocks, Kokkos::pair<ordinal_type,ordinal_type>(begin, end)));
                           } );
      CrsMatrixTools::filterEmptyBlocks(HA_factor_host);
      t_blocks = timer.seconds();    

      {
        size_type nblocks_filtered = HA_factor_host.NumNonZeros(), nnz_blocks = 0;
        for (size_type k=0;k<nblocks_filtered; ++k) 
          nnz_blocks += HA_factor_host.Value(k).NumNonZeros();
        
        TACHO_TEST_FOR_ABORT( nnz_blocks != AA_factor_host.NumNonZeros(),
                              "nnz counted in blocks is different from nnz in the base matrix.");
      }
      
      CrsTaskHierViewHostType H_factor_host(HA_factor_host);
      timer.reset();
      {
        auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks,Variant::One>
                                              ::createTaskFunctor(policy, H_factor_host));
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)");
      }
      t_chol_parallel = timer.seconds();
      
      if (verbose)
        AA_factor_host.showMe(std::cout) << std::endl;
    }

    if (check) {
      double diff = 0, norm = 0;    
      TACHO_TEST_FOR_ABORT( BB_factor_host.NumNonZeros() != AA_factor_host.NumNonZeros(),
                            "nnz used in serial is not same as nnz used in parallel");

      const size_type nnz = AA_factor_host.NumNonZeros();
      for (size_type k=0;k<nnz;++k) {
        norm += Util::abs(BB_factor_host.Value(k));
        diff += Util::abs(AA_factor_host.Value(k) - BB_factor_host.Value(k));
      }
      std::cout << std::scientific;
      std::cout << "CholByBlocks:: check with serial execution " << std::endl
                << "               diff = " << diff << ", norm = " << norm << ", rel err = " << (diff/norm) << std::endl;
      std::cout.unsetf(std::ios::scientific);      
    }

    {
      const auto prec = std::cout.precision();
      std::cout.precision(4);

      std::cout << std::scientific;
      std::cout << "CholByBlocks:: Given    matrix = " << AA_host.NumRows() << " x " << AA_host.NumCols() << ", nnz = " << AA_host.NumNonZeros() << std::endl;
      std::cout << "CholByBlocks:: Factored matrix = " << AA_factor_host.NumRows() << " x " << AA_factor_host.NumCols() << ", nnz = " << AA_factor_host.NumNonZeros() << std::endl;
      std::cout << "CholByBlocks:: Hier     matrix = " << HA_factor_host.NumRows() << " x " << HA_factor_host.NumCols() << ", nnz = " << HA_factor_host.NumNonZeros() << std::endl;

      std::cout << "CholByBlocks:: "
                << "read = " << t_read << " [sec], "
                << "graph generation = " << (t_graph/2.0) << " [sec] "
                << "scotch reordering = " << t_scotch << " [sec] "
                << "camd reordering = " << t_camd << " [sec] " 
                << std::endl
                << "CholByBlocks:: "
                << "symbolic factorization = " << t_symbolic << " [sec] "
                << std::endl
                << "CholByBlocks:: "
                << "policy creation = " << t_policy << " [sec] "
                << "hier creation = " << t_hier << " [sec] "
                << "block specification = " << t_blocks << " [sec] "
                << std::endl
                << "CholByBlocks:: "
                << "Chol Parallel = " << t_chol_parallel << " [sec] ";
      if (check) 
        std::cout << "Chol Serial = " << (check ? t_chol_serial : -1) << " [sec] "
                  << "speed-up = " << (t_chol_serial/t_chol_parallel) << " [sec] ";
      
      std::cout << std::endl;
      
      std::cout.unsetf(std::ios::scientific);
      std::cout.precision(prec);
    }

    return r_val;
  }
  void viennaCL_apply(
      KernelHandle *handle,
      typename KernelHandle::nnz_lno_t m,
      typename KernelHandle::nnz_lno_t n,
      typename KernelHandle::nnz_lno_t k,
      in_row_index_view_type row_mapA,
      in_nonzero_index_view_type entriesA,
      in_nonzero_value_view_type valuesA,

      bool transposeA,
      bin_row_index_view_type row_mapB,
      bin_nonzero_index_view_type entriesB,
      bin_nonzero_value_view_type valuesB,
      bool transposeB,
      cin_row_index_view_type &row_mapC,
      cin_nonzero_index_view_type &entriesC,
      cin_nonzero_value_view_type &valuesC){

#ifdef KERNELS_HAVE_VIENNACL

    typedef typename KernelHandle::nnz_lno_t idx;
    typedef in_row_index_view_type idx_array_type;

    typedef typename KernelHandle::nnz_scalar_t value_type;


    typedef typename in_row_index_view_type::device_type device1;
    typedef typename in_nonzero_index_view_type::device_type device2;
    typedef typename in_nonzero_value_view_type::device_type device3;

    typedef typename KernelHandle::HandleExecSpace MyExecSpace;

    std::cout << "RUNNING VIENNACL" << std::endl;

    typedef typename viennacl::compressed_matrix<value_type>::handle_type it;
    typedef typename viennacl::compressed_matrix<value_type>::value_type vt;

    if ((Kokkos::Impl::is_same<idx, int>::value && Kokkos::Impl::is_same<typename KernelHandle::size_type, int>::value )||
        (Kokkos::Impl::is_same<idx, unsigned int>::value && Kokkos::Impl::is_same<typename KernelHandle::size_type, unsigned int>::value ) ||
        (Kokkos::Impl::is_same<idx, it>::value && Kokkos::Impl::is_same<typename KernelHandle::size_type, it>::value )
        ){

      unsigned int * a_xadj = (unsigned int *)row_mapA.ptr_on_device();
      unsigned int * b_xadj = (unsigned int * )row_mapB.ptr_on_device();
      unsigned int * c_xadj = (unsigned int * )row_mapC.ptr_on_device();

      unsigned int * a_adj = (unsigned int * )entriesA.ptr_on_device();
      unsigned int * b_adj = (unsigned int * )entriesB.ptr_on_device();
      unsigned int * c_adj = (unsigned int * )entriesC.ptr_on_device();

      int nnzA = entriesA.dimension_0();
      int nnzB = entriesB.dimension_0();

      value_type *a_ew = valuesA.ptr_on_device();
      value_type *b_ew = valuesB.ptr_on_device();
      value_type *c_ew = valuesC.ptr_on_device();



      /*
        std::cout << "create a" << std::endl;
        std::cout << "m:" << m << " n:" << n << std::endl;
        std::cout << "a_xadj[0]:" << a_xadj[0] << " a_xadj[m]:" << a_xadj[m] << std::endl;
        std::cout << "a_adj[a_xadj[m] - 1]:" << a_adj[a_xadj[m] - 1] << " a_ew[a_xadj[m] - 1]:" << a_ew[a_xadj[m] - 1] << std::endl;
       */


      Kokkos::Impl::Timer timerset;

      viennacl::compressed_matrix<value_type> A;
      viennacl::compressed_matrix<value_type> B;
      A.set(a_xadj, a_adj, a_ew, m, n, nnzA);
      B.set(b_xadj, b_adj, b_ew, n, k, nnzB);
      std::cout << "compress matrix create:" << timerset.seconds() << std::endl;


      std::cout << "Now running ViennaCL" << std::endl;
      Kokkos::Impl::Timer timer1;
      viennacl::compressed_matrix<value_type> C = viennacl::linalg::prod(A, B);
      std::cout << "Actual VIENNACL SPMM Time:" << timer1.seconds() << std::endl;

      {



        unsigned int c_rows = m, c_cols = k, cnnz = C.nnz();


        value_type   const * values   = viennacl::linalg::host_based::detail::extract_raw_pointer<value_type>(C.handle());
        unsigned int const * rows_start = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(C.handle1());
        unsigned int const * columns = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(C.handle2());


        {
          Kokkos::Impl::Timer copy_time;
          row_mapC = typename cin_row_index_view_type::non_const_type(Kokkos::ViewAllocateWithoutInitializing("rowmapC"), c_rows + 1);
          entriesC = typename cin_nonzero_index_view_type::non_const_type (Kokkos::ViewAllocateWithoutInitializing("EntriesC") , cnnz);
          valuesC = typename cin_nonzero_value_view_type::non_const_type (Kokkos::ViewAllocateWithoutInitializing("valuesC") ,  cnnz);

          KokkosKernels::Experimental::Util::copy_vector<unsigned int const *, typename cin_row_index_view_type::non_const_type, MyExecSpace> (m, rows_start, row_mapC);
          idx nnz = cnnz;

          KokkosKernels::Experimental::Util::copy_vector<unsigned int const *, typename cin_nonzero_index_view_type::non_const_type, MyExecSpace> (nnz, columns, entriesC);
          KokkosKernels::Experimental::Util::copy_vector<value_type   const *, typename cin_nonzero_value_view_type::non_const_type, MyExecSpace> (m, values, valuesC);
          double copy_time_d = copy_time.seconds();
          std::cout << "VIENNACL COPYTIME:" << copy_time_d << std::endl;
        }

      }
    }
    else {

      //int *a_xadj = row_mapA.ptr_on_device();
      std::cerr << "vienna requires (u) integer values" << std::endl;

      if (Kokkos::Impl::is_same<idx, long>::value){
        std::cerr << "MKL is given long" << std::endl;
      }
      else if (Kokkos::Impl::is_same<idx, const int>::value){
        std::cerr << "MKL is given const int" << std::endl;
      }
      else if (Kokkos::Impl::is_same<idx, unsigned long>::value){
        std::cerr << "MKL is given unsigned long" << std::endl;
      }
      else if (Kokkos::Impl::is_same<idx, const unsigned long>::value){
        std::cerr << "MKL is given const unsigned long" << std::endl;
      }
      else{
        std::cerr << "MKL is given something else" << std::endl;
      }
      return;
    }
#else
    std::cerr << "VIENNACL IS NOT DEFINED" << std::endl;
    return;
#endif
  }
  KOKKOS_INLINE_FUNCTION
  int exampleSymbolicFactor(const string file_input,
                            const int treecut,
                            const int minblksize,
                            const int seed,
                            const int fill_level,
                            const int league_size,
                            const bool reorder,
                            const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
    typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "SymbolicFactor:: import input file = " << file_input << endl;
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);
      t = timer.seconds();

      cout << "SymbolicFactor:: AA nnz = " << AA.NumNonZeros() << endl;
      if (verbose)
        cout << AA << endl;
    }
    cout << "SymbolicFactor:: import input file::time = " << t << endl;

    CrsMatrixBaseType PA("Permuted AA");
    GraphHelperType S(AA, seed);
    if (reorder) {
      timer.reset();

      S.computeOrdering(treecut, minblksize);

      PA.copy(S.PermVector(), S.InvPermVector(), AA);

      t = timer.seconds();

      if (verbose)
        cout << S << endl
             << PA << endl;
    } else {
      PA = AA;

      t = 0.0;
    }
    cout << "SymbolicFactor:: reorder the matrix::time = " << t << endl;


    CrsMatrixBaseType UU("UU");
    {
      timer.reset();

      SymbolicFactorHelperType symbolic(PA, league_size);
      symbolic.createNonZeroPattern(fill_level, Uplo::Upper, UU);

      t = timer.seconds();

      cout << "SymbolicFactor:: UU nnz = " << UU.NumNonZeros() << endl;

      if (verbose) {
        cout << symbolic << endl;
        cout << UU << endl;
      }
    }
    cout << "SymbolicFactor:: factorize the matrix::time = " << t << endl;

    return r_val;
  }
  KOKKOS_INLINE_FUNCTION
  int exampleCholDirectPlain(const string file_input,
                             const int prunecut,
                             const int seed,
                             const int nrhs,
                             const int nb, 
                             const int nthreads,
                             const int max_task_dependence,
                             const int team_size,
                             const int league_size,
                             const bool team_interface,
                             const bool serial,
                             const bool solve,
                             const bool check,
                             const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
    typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
    typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

    typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType;
    typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType;

    typedef DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> DenseHierMatrixBaseType;
    
    typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType;
    typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double 
      t_import = 0.0,
      t_reorder = 0.0,
      t_symbolic = 0.0,
      t_flat2hier = 0.0,
      t_factor = 0.0, 
      t_solve = 0.0;
    
    cout << "CholDirectPlain:: import input file = " << file_input << endl;        
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();
      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);
      t_import = timer.seconds();
    }
    cout << "CholDirectPlain:: import input file::time = " << t_import << endl;

    // matrix A and its upper triangular factors U
    CrsMatrixBaseType PA("Permuted AA");
    CrsMatrixBaseType UU("UU"); // permuted base upper triangular matrix
    CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views

    // right hand side and solution matrix
    DenseMatrixBaseType BB("BB", AA.NumRows(), nrhs), XX("XX", AA.NumRows(), nrhs);
    DenseHierMatrixBaseType HB("HB"), HX("HX");

    {
      cout << "CholDirectPlain:: reorder the matrix" << endl;        
      GraphHelperType S(AA, seed);
      {
        timer.reset();
        S.computeOrdering();
        S.pruneTree(prunecut);
        PA.copy(S.PermVector(), S.InvPermVector(), AA);
        t_reorder = timer.seconds();
      }
      cout << "CholDirectPlain:: reorder the matrix::time = " << t_reorder << endl;            

      {
        SymbolicFactorHelperType F(PA, league_size);
        timer.reset();
        F.createNonZeroPattern(Uplo::Upper, UU);
        t_symbolic = timer.seconds();
        cout << "CholDirectPlain:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl;
      }
      cout << "CholDirectPlain:: symbolic factorization::time = " << t_symbolic << endl;            

      {
        timer.reset();
        CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   S.TreeVector());

        // fill internal meta data for sparse blocs
        for (ordinal_type k=0;k<HU.NumNonZeros();++k)
          HU.Value(k).fillRowViewArray();

        DenseMatrixHelper::flat2hier(BB, HB,
                                     S.NumBlocks(),
                                     S.RangeVector(),
                                     nb);        

        DenseMatrixHelper::flat2hier(XX, HX,
                                     S.NumBlocks(),
                                     S.RangeVector(),
                                     nb);        
        t_flat2hier = timer.seconds();
        cout << "CholDirectPlain:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
      }
      cout << "CholDirectPlain:: construct hierarchical matrix::time = " << t_flat2hier << endl;            
    }


    {
      // Policy setup
#ifdef __USE_FIXED_TEAM_SIZE__ 
      typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
      typename TaskFactoryType::policy_type policy(max_task_dependence, 1);
#endif
      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);
      
      CrsTaskViewType A(&PA), U(&UU);
      DenseTaskViewType X(&XX), B(&BB);
      
      A.fillRowViewArray();      
      U.fillRowViewArray();      

      CrsHierTaskViewType TU(&HU);
      DenseHierTaskViewType TB(&HB), TX(&HX);

      {
        // Manufacture B = AX
        const int m = A.NumRows();
        for (int j=0;j<nrhs;++j)
          for (int i=0;i<m;++i)
            X.Value(i,j) = (j+1);
        
        Gemm<Trans::NoTranspose,Trans::NoTranspose,AlgoGemm::ForTriSolveBlocked>
          ::invoke(TaskFactoryType::Policy(),
                   TaskFactoryType::Policy().member_single(),
                   1.0, A, X, 0.0, B);
        XX.copy(BB);        
      }
      
      if (serial) {
        cout << "CholDirectPlain:: Serial factorize the matrix" << endl;
        timer.reset();          
        Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
          ::invoke(TaskFactoryType::Policy(),
                   TaskFactoryType::Policy().member_single(),
                   U);
        t_factor = timer.seconds();
        cout << "CholDirectPlain:: Serial factorize the matrix::time = " << t_factor << endl;
      } else {
        cout << "CholDirectPlain:: ByBlocks factorize the matrix:: team_size = " << team_size << endl;
        timer.reset();    
        auto future = TaskFactoryType::Policy().create_team
          (Chol<Uplo::Upper,AlgoChol::ByBlocks>
           ::TaskFunctor<CrsHierTaskViewType>(TU), 0);
        TaskFactoryType::Policy().spawn(future);
        Kokkos::Experimental::wait(TaskFactoryType::Policy());
        t_factor = timer.seconds();
        cout << "CholDirectPlain:: ByBlocks factorize the matrix::time = " << t_factor << endl;
      }

      if (solve) {
        if (serial) {
          cout << "CholDirectPlain:: Serial forward/backward solve" << endl;
          timer.reset();          
          TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
            ::invoke(TaskFactoryType::Policy(),
                     TaskFactoryType::Policy().member_single(),
                     Diag::NonUnit, U, X);
          TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
            ::invoke(TaskFactoryType::Policy(),
                     TaskFactoryType::Policy().member_single(),
                     Diag::NonUnit, U, X);
          t_solve = timer.seconds();
          cout << "CholDirectPlain:: Serial forward/backward solve::time = " << t_solve << endl;
        } else {
          cout << "CholDirectPlain:: ByBlocks forward/backward solve" << endl;
          timer.reset();          
          auto future_forward_solve = TaskFactoryType::Policy().create_team
            (TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::ByBlocks>
             ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
             (Diag::NonUnit, TU, TX), 0);
          
          TaskFactoryType::Policy().spawn(future_forward_solve);
          
          auto future_backward_solve = TaskFactoryType::Policy().create_team
            (TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::ByBlocks>
             ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
             (Diag::NonUnit, TU, TX), 1);
          
          TaskFactoryType::Policy().add_dependence(future_backward_solve, future_forward_solve);
          TaskFactoryType::Policy().spawn(future_backward_solve);
          
          Kokkos::Experimental::wait(TaskFactoryType::Policy());
          t_solve = timer.seconds();
          cout << "CholDirectPlain:: ByBlocks forward/backward solve::time = " << t_solve << endl;
        }
      }

      if (solve && check) {
        // Check manufactured solution
        double l2 = 0.0, linf = 0.0;
        const int m = A.NumRows();
        for (int j=0;j<nrhs;++j)
          for (int i=0;i<m;++i) {
            double diff = abs(X.Value(i,j) - (j+1));
            l2 += diff*diff;
            linf = max(diff, linf);
          }
        l2 = sqrt(l2);
        cout << "CholDirectPlain:: Check solution::L2 = " << l2 << ", Linf = " << linf << endl;
      }
    }

    return r_val;
  }
  KOKKOS_INLINE_FUNCTION
  int exampleDenseGemmByBlocks(const OrdinalType mmin,
                               const OrdinalType mmax,
                               const OrdinalType minc,
                               const OrdinalType k,
                               const OrdinalType mb,
                               const int max_concurrency,
                               const int max_task_dependence,
                               const int team_size,
                               const int mkl_nthreads,
                               const bool check,
                               const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

    typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType;
    typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType;

    typedef DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> DenseHierMatrixBaseType;

    typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType;
    typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "DenseGemmByBlocks:: test matrices "
         <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << " , mb = " << mb << endl;

    const size_t max_task_size = (3*sizeof(DenseTaskViewType)+196); // when 128 error
    //cout << "max task size = "<< max_task_size << endl;
    typename TaskFactoryType::policy_type policy(max_concurrency,
                                                 max_task_size,
                                                 max_task_dependence, 
                                                 team_size);
    
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    ostringstream os;
    os.precision(3);
    os << scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");

      DenseMatrixBaseType AA, BB, CC("CC", m, m), CB("CB", m, m);

      if (ArgTransA == Trans::NoTranspose) 
        AA = DenseMatrixBaseType("AA", m, k); 
      else 
        AA = DenseMatrixBaseType("AA", k, m);
      
      if (ArgTransB == Trans::NoTranspose) 
        BB = DenseMatrixBaseType("BB", k, m);
      else 
        BB = DenseMatrixBaseType("BB", m, k);
      
      for (ordinal_type j=0;j<AA.NumCols();++j)
        for (ordinal_type i=0;i<AA.NumRows();++i)
          AA.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
      
      for (ordinal_type j=0;j<BB.NumCols();++j)
        for (ordinal_type i=0;i<BB.NumRows();++i)
          BB.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
      
      for (ordinal_type j=0;j<CC.NumCols();++j)
        for (ordinal_type i=0;i<CC.NumRows();++i)
          CC.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
      CB.copy(CC);

      const double flop = get_flop_gemm<value_type>(m, m, k);

#ifdef HAVE_SHYLUTACHO_MKL
      mkl_set_num_threads(mkl_nthreads);
#endif

      os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k;
      if (check) {
        timer.reset();
        DenseTaskViewType A(&AA), B(&BB), C(&CB);
        Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas>::invoke
          (TaskFactoryType::Policy(),
           TaskFactoryType::Policy().member_single(),
           1.0, A, B, 1.0, C);
        t = timer.seconds();
        os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      {
        DenseHierMatrixBaseType HA, HB, HC;
        DenseMatrixHelper::flat2hier(AA, HA, mb, mb);
        DenseMatrixHelper::flat2hier(BB, HB, mb, mb);
        DenseMatrixHelper::flat2hier(CC, HC, mb, mb);

        DenseHierTaskViewType TA(&HA), TB(&HB), TC(&HC);
        timer.reset();
        auto future = TaskFactoryType::Policy().create_team
          (typename Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,Variant::One>
           ::template TaskFunctor<value_type,DenseHierTaskViewType,DenseHierTaskViewType,DenseHierTaskViewType>
           (1.0, TA, TB, 1.0, TC), 0);
        TaskFactoryType::Policy().spawn(future);
        Kokkos::Experimental::wait(TaskFactoryType::Policy());
        t = timer.seconds();       
        os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      } 

      if (check) {
        typedef typename Teuchos::ScalarTraits<value_type>::magnitudeType real_type; 
        real_type err = 0.0, norm = 0.0;
        for (ordinal_type j=0;j<CC.NumCols();++j)
          for (ordinal_type i=0;i<CC.NumRows();++i) {
            const real_type diff = abs(CC.Value(i,j) - CB.Value(i,j));
            const real_type val  = CB.Value(i,j);
            err  += diff*diff;
            norm += val*val;
          }
        os << ":: Check result ::err = " << sqrt(err) << ", norm = " << sqrt(norm);
      }
      cout << os.str() << endl;
    }

    return r_val;
  }
Beispiel #29
0
  BASKER_INLINE
  int Basker<Int, Entry, Exe_Space>::factor_notoken(Int option)
  {

    //printf("factor no token called \n");

    gn = A.ncol;
    gm = A.nrow;
    BASKER_MATRIX ATEMP;

    //Kokkos::Impl::Timer tza;
    if(Options.btf == BASKER_TRUE)
      {
	//JDB: We can change this for the new inteface
	gn = A.ncol;
	gm = A.nrow;
	ATEMP = A;
	A = BTF_A; 
      }
    //printf("Switch time: %f \n", tza.seconds());

   

    //Spit into Domain and Sep
    //----------------------Domain-------------------------//
    #ifdef BASKER_KOKKOS

    //====TIMER==
    #ifdef BASKER_TIME
    Kokkos::Impl::Timer       timer;
    #endif
    //===TIMER===

    typedef Kokkos::TeamPolicy<Exe_Space>        TeamPolicy;

    if(btf_tabs_offset != 0)
      {

	if(Options.verbose == BASKER_TRUE)
	  {
	    printf("Factoring Dom num_threads: %d \n",
		   num_threads);
	  }


	Int domain_restart = 0;
	kokkos_nfactor_domain <Int,Entry,Exe_Space>
	  domain_nfactor(this);
	Kokkos::parallel_for(TeamPolicy(num_threads,1), 
			     domain_nfactor);
	Kokkos::fence();
    

    //=====Check for error======
    while(true)
      {
	INT_1DARRAY thread_start;
	MALLOC_INT_1DARRAY(thread_start, num_threads+1);
	init_value(thread_start, num_threads+1, 
		   (Int) BASKER_MAX_IDX);
	int nt = nfactor_domain_error(thread_start);
	if((nt == BASKER_SUCCESS) ||
	   (nt == BASKER_ERROR) ||
	   (domain_restart > BASKER_RESTART))
	  {
	    break;
	  }
	else
	  {
	    domain_restart++;
	    if(Options.verbose == BASKER_TRUE)
	      {
		printf("restart \n");
	      }
	    kokkos_nfactor_domain_remalloc <Int, Entry, Exe_Space>
	      diag_nfactor_remalloc(this, thread_start);
	    Kokkos::parallel_for(TeamPolicy(num_threads,1),
				 diag_nfactor_remalloc);
	    Kokkos::fence();
	  }
      }//end while

    //====TIMER===
    #ifdef BASKER_TIME
    printf("Time DOMAIN: %f \n", timer.seconds());
    timer.reset();
    #endif
    //====TIMER====
    

    #else// else basker_kokkos
    #pragma omp parallel
    {


    }//end omp parallel
    #endif //end basker_kokkos

      }
    //-------------------End--Domian--------------------------//
    
    //printVec("domperm.csc", gpermi, A.nrow);
   
    //---------------------------Sep--------------------------//
   
    if(btf_tabs_offset != 0)
      {
    //for(Int l=1; l<=4; l++)
    for(Int l=1; l <= tree.nlvls; l++)
      {

	//#ifdef BASKER_OLD_BARRIER
	//Int lthreads = pow(2,l);
	//Int lnteams = num_threads/lthreads;
	//#else
	Int lthreads = 1;
	Int lnteams = num_threads/lthreads;
	//#endif

	Int sep_restart = 0;


	if(Options.verbose == BASKER_TRUE)
	  {
	    printf("Factoring Sep num_threads: %d %d \n",
		   lnteams, lthreads);
	  }

	#ifdef BASKER_KOKKOS
	Kokkos::Impl::Timer  timer_inner_sep;
	#ifdef BASKER_NO_LAMBDA

	//kokkos_nfactor_sep <Int, Entry, Exe_Space> 
	//sep_nfactor(this, l);

	kokkos_nfactor_sep2 <Int, Entry, Exe_Space>
	  sep_nfactor(this,l);
	
	Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),
			     sep_nfactor);
	Kokkos::fence();

	//======Check for error=====
	while(true)
	  {
	    INT_1DARRAY thread_start;
	    MALLOC_INT_1DARRAY(thread_start, num_threads+1);
	    init_value(thread_start, num_threads+1,
		      (Int) BASKER_MAX_IDX);
	    int nt = nfactor_sep_error(thread_start);
	    if((nt == BASKER_SUCCESS)||
	       (nt == BASKER_ERROR) ||
	       (sep_restart > BASKER_RESTART))
	      {
		FREE_INT_1DARRAY(thread_start);
		break;
	      }
	    else
	      {
		sep_restart++;
		if (Options.verbose == BASKER_TRUE)
		  {
		    printf("restart \n");
		  }
		Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),  sep_nfactor);
		Kokkos::fence();

	      }
	  }//end while-true


	#ifdef BASKER_TIME
	printf("Time INNERSEP: %d %f \n", 
	       l, timer_inner_sep.seconds());
	#endif
        #else //ELSE BASKER_NO_LAMBDA
	//Note: to be added
        #endif //end BASKER_NO_LAMBDA
	#else
	#pragma omp parallel
	{

	}//end omp parallel
	#endif
      }//end over each level

    #ifdef BASKER_TIME
    printf("Time SEP: %f \n", timer.seconds());
    #endif
      }
    
    //-------------------------End Sep----------------//


    //-------------------IF BTF-----------------------//
    if(Options.btf == BASKER_TRUE)
      {

	Int btf_restart = 0;

	if(Options.verbose == BASKER_TRUE)
	  {
	    printf("Factoring BLKs num_threads: %d \n",
		   num_threads);
	  }

	//=====Timer
	#ifdef BASKER_TIME
	Kokkos::Impl::Timer  timer_btf;
	#endif
	//====Timer
	
	//======Call diag factor====
	kokkos_nfactor_diag <Int, Entry, Exe_Space> 
	  diag_nfactor(this);
	Kokkos::parallel_for(TeamPolicy(num_threads,1),
			     diag_nfactor);
	Kokkos::fence();
	
	//=====Check for error======
	while(true)
	  {
	    INT_1DARRAY thread_start;
	    MALLOC_INT_1DARRAY(thread_start, num_threads+1);
	    init_value(thread_start, num_threads+1, 
		       (Int) BASKER_MAX_IDX);
	    int nt = nfactor_diag_error(thread_start);
	    //printf("RETURNED: %d \n", nt);
	    if((nt == BASKER_SUCCESS) || 
	       (nt == BASKER_ERROR) ||
	       (btf_restart > BASKER_RESTART))
	      {
		break;
	      }
	    else
	      {
		btf_restart++;
		if (Options.verbose == BASKER_TRUE)
		  {
		    printf("restart \n");
		  }
		kokkos_nfactor_diag_remalloc <Int, Entry, Exe_Space>
		  diag_nfactor_remalloc(this, thread_start);
		Kokkos::parallel_for(TeamPolicy(num_threads,1),
				     diag_nfactor_remalloc);
		Kokkos::fence();
	      }
	  }//end while

	//====TIMER
	#ifdef BASKER_TIME
	printf("Time BTF: %f \n", 
	       timer_btf.seconds());
	#endif
	//===TIMER

      }//end btf call

    Kokkos::Impl::Timer tzback;
     if(Options.btf == BASKER_TRUE)
      {
	A = ATEMP;
      }
     //printf("Switch back: %f \n",
     //	    tzback.seconds());
    
    return 0;
  }//end factor_notoken()
  int exampleDenseGemmByBlocks(const ordinal_type mmin,
                               const ordinal_type mmax,
                               const ordinal_type minc,
                               const ordinal_type k,
                               const ordinal_type mb,
                               const int max_concurrency,
                               const int max_task_dependence,
                               const int team_size,
                               const int mkl_nthreads,
                               const bool check,
                               const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType;
    typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType;
    typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType;
    typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType;

    typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType;

    typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType;
    typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    std::cout << "DenseGemmByBlocks:: test matrices "
              <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc 
              << " , k = "<< k << " , mb = " << mb << std::endl;

    const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); 
    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);

    std::ostringstream os;
    os.precision(3);
    os << std::scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");
      
      // host matrices
      DenseMatrixBaseHostType AA_host, BB_host, CC_host("CC_host", m, m), CB_host("CB_host", m, m);
      {
        if (ArgTransA == Trans::NoTranspose) 
          AA_host = DenseMatrixBaseHostType("AA_host", m, k); 
        else 
          AA_host = DenseMatrixBaseHostType("AA_host", k, m);
        
        if (ArgTransB == Trans::NoTranspose) 
          BB_host = DenseMatrixBaseHostType("BB_host", k, m);
        else 
          BB_host = DenseMatrixBaseHostType("BB_host", m, k);
        
        for (ordinal_type j=0;j<AA_host.NumCols();++j)
          for (ordinal_type i=0;i<AA_host.NumRows();++i)
            AA_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        
        for (ordinal_type j=0;j<BB_host.NumCols();++j)
          for (ordinal_type i=0;i<BB_host.NumRows();++i)
            BB_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        
        for (ordinal_type j=0;j<CC_host.NumCols();++j)
          for (ordinal_type i=0;i<CC_host.NumRows();++i)
            CC_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        
        DenseMatrixTools::copy(CB_host, CC_host);
      }

      const double flop = DenseFlopCount<value_type>::Gemm(m, m, k);

#ifdef HAVE_SHYLUTACHO_MKL
      mkl_set_num_threads(mkl_nthreads);
#endif

      os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k << "  ";
      if (check) {
        timer.reset();
        DenseMatrixViewHostType A_host(AA_host), B_host(BB_host), C_host(CB_host);
        Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas,Variant::One>::invoke
          (policy, policy.member_single(),
           1.0, A_host, B_host, 1.0, C_host);
        t = timer.seconds();
        os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      DenseMatrixBaseDeviceType AA_device("AA_device"), BB_device("BB_device"), CC_device("CC_device");
      {
        timer.reset();
        AA_device.mirror(AA_host);
        BB_device.mirror(BB_host);
        CC_device.mirror(CC_host);
        t = timer.seconds();
        os << ":: Mirror = " << t << " [sec]  ";
      }

      {
        DenseHierMatrixBaseDeviceType HA_device("HA_device"), HB_device("HB_device"), HC_device("HC_device");

        DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb);        
        DenseMatrixTools::createHierMatrix(HB_device, BB_device, mb, mb);        
        DenseMatrixTools::createHierMatrix(HC_device, CC_device, mb, mb);        

        DenseHierTaskViewDeviceType TA_device(HA_device), TB_device(HB_device), TC_device(HC_device);
        timer.reset();
        auto future = policy.proc_create_team
          (Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,ArgVariant>
           ::createTaskFunctor(policy, 1.0, TA_device, TB_device, 1.0, TC_device),
           0);
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        t = timer.seconds();       
        os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      } 

      CC_host.mirror(CC_device);
      if (check) {
        double err = 0.0, norm = 0.0;
        for (ordinal_type j=0;j<CC_host.NumCols();++j)
          for (ordinal_type i=0;i<CC_host.NumRows();++i) {
            const double diff = abs(CC_host.Value(i,j) - CB_host.Value(i,j));
            const double val  = CB_host.Value(i,j);
            err  += diff*diff;
            norm += val*val;
          }
        os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err);
      }
      std::cout << os.str() << std::endl;
    }

    return r_val;
  }