Example #1
0
void Loop(int loop, int test, const char* type_name) {
  LoopVariant<T>(loop,test);

  Kokkos::Impl::Timer timer;
  T res = LoopVariant<T>(loop,test);
  double time = timer.seconds();

  timer.reset();
  T resNonAtomic = LoopVariantNonAtomic<T>(loop,test);
  double timeNonAtomic = timer.seconds();

  timer.reset();
  T resSerial = LoopVariantSerial<T>(loop,test);
  double timeSerial = timer.seconds();

  time         *=1e6/loop;
  timeNonAtomic*=1e6/loop;
  timeSerial   *=1e6/loop;
  //textcolor_standard();
  bool passed = true;
  if(resSerial!=res) passed = false;
  //if(!passed) textcolor(RESET,BLACK,YELLOW);
  printf("%s Test %i %s  --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",
         type_name,test,passed?"PASSED":"FAILED",loop,
         1.0*resSerial,1.0*res,1.0*resNonAtomic,
         timeSerial,time,timeNonAtomic,(int)sizeof(T));
  //if(!passed) textcolor_standard();
  printf("\n");
}
Example #2
0
int main(int narg, char* args[]) {
  Kokkos::initialize(narg,args);
  
  int chunk_size = 1024;
  int nchunks = 100000; //1024*1024;
  Kokkos::DualView<int*> data("data",nchunks*chunk_size+1);

  srand(1231093);

  for(int i = 0; i < data.dimension_0(); i++) {
    data.h_view(i) = rand()%TS;
  }
  data.modify<Host>();
  data.sync<Device>();

  Kokkos::DualView<int**> histogram("histogram",TS,TS);


  Kokkos::Impl::Timer timer;
  // Threads/team (TS) is automically limited to the maximum supported by the device.
  Kokkos::parallel_for( team_policy( nchunks , TS )
                      , find_2_tuples(chunk_size,data,histogram) );
  Kokkos::fence();
  double time = timer.seconds();

  histogram.sync<Host>();

  printf("Time: %lf \n\n",time);
  int sum = 0;
  for(int k=0; k<TS; k++) {
    for(int l=0; l<TS; l++) {
      printf("%i ",histogram.h_view(k,l));
      sum += histogram.h_view(k,l);
    }
    printf("\n");
  }
  printf("Result: %i %i\n",sum,chunk_size*nchunks);
  Kokkos::finalize();
}
Example #3
0
int main(int narg, char* args[]) {
  Kokkos::initialize(narg,args);
  
  int chunk_size = 1024;
  int nchunks = 100000; //1024*1024;
  Kokkos::DualView<int*> data("data",nchunks*chunk_size+1);

  srand(1231093);

  for(int i = 0; i < data.dimension_0(); i++) {
    data.h_view(i) = rand()%TS;
  }
  data.modify<Host>();
  data.sync<Device>();

  Kokkos::DualView<int**> histogram("histogram",TS,TS);


  Kokkos::Impl::Timer timer;
  Kokkos::parallel_for(
      Kokkos::ParallelWorkRequest(nchunks,TS<Device::team_max()?TS:Device::team_max()),
      find_2_tuples(chunk_size,data,histogram));
  Kokkos::fence();
  double time = timer.seconds();

  histogram.sync<Host>();

  printf("Time: %lf \n\n",time);
  int sum = 0;
  for(int k=0; k<TS; k++) {
    for(int l=0; l<TS; l++) {
      printf("%i ",histogram.h_view(k,l));
      sum += histogram.h_view(k,l);
    }
    printf("\n");
  }
  printf("Result: %i %i\n",sum,chunk_size*nchunks);
  Kokkos::finalize();
}
Example #4
0
  static double test( const int count , const int iter = 1 )
  {
    elem_coord_type coord( "coord" , count );
    elem_grad_type  grad ( "grad" , count );

    // Execute the parallel kernels on the arrays:

    double dt_min = 0 ;

    Kokkos::parallel_for( count , Init( coord ) );
    device_type::fence();

    for ( int i = 0 ; i < iter ; ++i ) {
      Kokkos::Impl::Timer timer ;
      Kokkos::parallel_for( count , HexGrad<device_type>( coord , grad ) );
      device_type::fence();
      const double dt = timer.seconds();
      if ( 0 == i ) dt_min = dt ;
      else dt_min = dt < dt_min ? dt : dt_min ;
    }

    return dt_min ;
  }
int main (int argc, char ** argv){
  if (argc < 2){
    std::cerr << "Usage:" << argv[0] << " input_bin_file" << std::endl;
    exit(1);
  }


  Kokkos::initialize(argc, argv);
  MyExecSpace::print_configuration(std::cout);
  idx nv = 0, ne = 0;
  idx *xadj, *adj;
  wt *ew;

  KokkosKernels::Experimental::Graph::Utils::read_graph_bin<idx, wt> (&nv, &ne, &xadj, &adj, &ew, argv[1]);

  std::cout << "nv:" << nv << " ne:" << ne << std::endl;

  um_array_type _xadj (xadj, nv + 1);
  um_edge_array_type _adj (adj, ne);

  idx_array_type kok_xadj ("xadj", nv + 1);
  idx_edge_array_type kok_adj("adj", ne);

  idx_array_type sym_xadj;
  idx_edge_array_type sym_adj;

  Kokkos::deep_copy (kok_xadj, _xadj);
  Kokkos::deep_copy (kok_adj, _adj);

  wt_um_edge_array_type _mtx_vals (ew, ne);
  value_array_type kok_mtx_vals ("MTX_VALS", ne);
  Kokkos::deep_copy (kok_mtx_vals, _mtx_vals);

  delete [] xadj;
  delete [] adj;
  delete [] ew;

  std::cout << "Symetrizing Graph" << std::endl;

  Kokkos::Impl::Timer timer;

  KokkosKernels::Experimental::Util::symmetrize_graph_symbolic_hashmap<
  idx_array_type, idx_edge_array_type, idx_array_type, idx_edge_array_type, MyExecSpace>
    (nv, kok_xadj, kok_adj,sym_xadj, sym_adj);
  Kokkos::fence();

  double t = timer.seconds();
  std::cout << "Time to symmetrize:" << t << std::endl;
  KokkosKernels::Experimental::Util::print_1Dview(kok_xadj);
  KokkosKernels::Experimental::Util::print_1Dview(kok_adj);

  std::cout << "Symetric Graph" << std::endl;
  KokkosKernels::Experimental::Util::print_1Dview(sym_xadj);

  KokkosKernels::Experimental::Util::print_1Dview(sym_adj);

  Kokkos::finalize();



  return 0;
}
  KOKKOS_INLINE_FUNCTION
  int exampleCholDirectPerformance(const string file_input,
                                   const int treecut,
                                   const int minblksize,
                                   const int prunecut,
                                   const int seed,
                                   const int niter,
                                   const int nthreads,
                                   const int max_task_dependence,
                                   const int team_size,
                                   const int league_size,
                                   const bool team_interface,
                                   const bool skip_serial,
                                   const bool mkl_interface,
                                   const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
    typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

#ifdef HAVE_SHYLUTACHO_MKL 
    typedef typename CrsMatrixBaseType::value_type_array value_type_array;
#endif

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
    typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double 
      t_import = 0.0,
      t_reorder = 0.0,
      t_symbolic = 0.0,
      t_flat2hier = 0.0,
#ifdef HAVE_SHYLUTACHO_MKL 
      t_mkl = 0.0,
#endif
      t_factor_seq = 0.0,   t_solve_seq = 0.0,
      t_factor_task = 0.0,  t_solve_task = 0.0;
    const int start = 0;
    
    cout << "CholDirectPerformance:: import input file = " << file_input << endl;        
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      t_import = timer.seconds();
    }
    cout << "CholDirectPerformance:: import input file::time = " << t_import << endl;

    cout << "CholDirectPerformance:: reorder the matrix" << endl;        
    CrsMatrixBaseType PA("Permuted AA");
    CrsMatrixBaseType UU("UU");     // permuted base upper triangular matrix
    CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views

    DenseMatrixBaseType BB("BB", AA.NumRows(), nrhs);
    DenseHierMatrixBaseType HB("HB");

    {
      GraphHelperType S(AA, seed);
      {
        timer.reset();
        
        S.computeOrdering(treecut, minblksize);
        S.pruneTree(prunecut);
        
        PA.copy(S.PermVector(), S.InvPermVector(), AA);
        
        t_reorder = timer.seconds();
      }

      cout << "CholDirectPerformance:: reorder the matrix::time = " << t_reorder << endl;            
      {
        SymbolicFactorHelperType F(PA, league_size);
        for (int i=start;i<niter;++i) {
          timer.reset();
        
          F.createNonZeroPattern(Uplo::Upper, UU);

          t_symbolic += timer.seconds() * (i>=0);
        }
        t_symbolic /= niter;

        cout << "CholDirectPerformance:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl;
      }
      cout << "CholDirectPerformance:: symbolic factorization::time = " << t_symbolic << endl;            
      {
        timer.reset();

        CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   S.TreeVector());
        
        for (ordinal_type k=0;k<HU.NumNonZeros();++k)
          HU.Value(k).fillRowViewArray();

        DenseMatrixHelper::flat2hier(BB, HB,
                                     S.NumBlocks(),
                                     S.RangeVector(),
                                     nb);
        
        t_flat2hier = timer.seconds();
        
        cout << "CholDirectPerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
      }
      cout << "CholDirectPerformance:: construct hierarchical matrix::time = " << t_flat2hier << endl;            
    }

    // copy of UU
    CrsMatrixBaseType RR("RR");
    RR.copy(UU);

    /////////////////////////// Serial Numeric Factorization
    if (!skip_serial) {
#ifdef __USE_FIXED_TEAM_SIZE__
      typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
      typename TaskFactoryType::policy_type policy(max_task_dependence, 1);
#endif
      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);

      CrsTaskViewType U(&UU);
      U.fillRowViewArray();

      cout << "CholDirectPerformance:: Serial factorize the matrix" << endl;
      {
        for (int i=start;i<niter;++i) {
          UU.copy(RR);
          timer.reset();          
          {
            Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
              ::invoke(TaskFactoryType::Policy(),
                       TaskFactoryType::Policy().member_single(),
                       U);
          }
          t_factor_seq += timer.seconds() * (i>=0);
        }
        t_factor_seq /= niter;
      }
      cout << "CholDirectPerformance:: Serial factorize the matrix::time = " << t_factor_seq << endl;

      cout << "CholDirectPerformance:: Serial forward/backward solve" << endl;      
      {
        for (int i=start;i<niter;++i) {
          XX.copy(BB);
          timer.reset();          
          {
            TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
              ::invoke(TaskFactoryType::Policy(),
                       TaskFactoryType::Policy().member_single(),
                       Diag::NonUnit, U, X);
            TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
              ::invoke(TaskFactoryType::Policy(),
                       TaskFactoryType::Policy().member_single(),
                       Diag::NonUnit, U, X);
          }
          t_factor_seq += timer.seconds() * (i>=0);
        }
        t_factor_seq /= niter;
      }
      cout << "CholDirectPerformance:: Serial forward/backward solve::time = " << t_solve_seq << endl;
    }

    // if (!skip_serial) 
    //   cout << "CholDirectPerformance:: task scale [seq/task] = " << t_factor_seq/t_factor_task << endl;    

    return r_val;
  }
  KOKKOS_INLINE_FUNCTION
  int exampleTriSolvePerformance(const string file_input,
                                 const OrdinalType nrhs,
                                 const OrdinalType nb,
                                 const int niter,
                                 const int nthreads,
                                 const int max_task_dependence,
                                 const int team_size, 
                                 const bool team_interface,
                                 const bool skip_serial,
                                 const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
    typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

    typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType;
    typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType;

    typedef DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> DenseHierMatrixBaseType;

    typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType;
    typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double 
      t_import = 0.0,
      t_reorder = 0.0,
      t_solve_seq = 0.0,
      t_solve_task = 0.0;
    const int start = -2;

    cout << "TriSolvePerformance:: import input file = " << file_input << endl;
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      t_import = timer.seconds();

      if (verbose)
        cout << AA << endl;
    }
    cout << "TriSolvePerformance:: import input file::time = " << t_import << endl;

    CrsMatrixBaseType   UU("UU");
    DenseMatrixBaseType BB("BB",  AA.NumRows(), nrhs);

    cout << "TriSolvePerformance:: reorder the matrix and partition right hand side, nb = " << nb << endl;
    CrsHierMatrixBaseType   HU("HU");
    DenseHierMatrixBaseType HB("HB");
    {
      timer.reset();

      GraphHelperType S(AA);
      S.computeOrdering();

      CrsMatrixBaseType PA("Permuted AA");
      PA.copy(S.PermVector(), S.InvPermVector(), AA);

      UU.copy(Uplo::Upper, PA);

      CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                 S.NumBlocks(),
                                 S.RangeVector(),
                                 S.TreeVector());

      DenseMatrixHelper::flat2hier(BB, HB,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   nb);

      t_reorder = timer.seconds();

      cout << "TriSolvePerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;

      if (verbose)
        cout << UU << endl;
    }
    cout << "TriSolvePerformance:: reorder the matrix and partition right hand side::time = " << t_reorder << endl;

    const size_t max_concurrency = 16384;
    cout << "TriSolvePerformance:: max concurrency = " << max_concurrency << endl;

    const size_t max_task_size = 3*sizeof(CrsTaskViewType)+128;
    cout << "TriSolvePerformance:: max task size   = " << max_task_size << endl;

    if (!skip_serial) {
      __INIT_DENSE_MATRIX__(BB, 1.0);
      typename TaskFactoryType::policy_type policy(max_concurrency,
                                                   max_task_size,
                                                   max_task_dependence, 
                                                   1);

      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);
      
      CrsTaskViewType U(&UU);
      DenseTaskViewType B(&BB);
      U.fillRowViewArray();

      cout << "TriSolvePerformance:: Serial forward and backward solve of the matrix" << endl;
      {
        for (int i=start;i<niter;++i) {
          timer.reset();
          // {
          //   auto future = TaskFactoryType::Policy().create_team(TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
          //                                                       ::TaskFunctor<CrsTaskViewType,DenseTaskViewType>
          //                                                       (Diag::NonUnit, U, B), 0);
            
          //   TaskFactoryType::Policy().spawn(future);
          //   Kokkos::Experimental::wait(TaskFactoryType::Policy());
          // }
          {
            TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
              ::invoke(TaskFactoryType::Policy(),
                                TaskFactoryType::Policy().member_single(),
                                Diag::NonUnit, U, B);
            
          }
          // {
          //   auto future = TaskFactoryType::Policy().create_team(TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
          //                                                       ::TaskFunctor<CrsTaskViewType,DenseTaskViewType>
          //                                                       (Diag::NonUnit, U, B), 0);
            
          //   TaskFactoryType::Policy().spawn(future);
          //   Kokkos::Experimental::wait(TaskFactoryType::Policy());
          // }
          {
            TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
              ::invoke(TaskFactoryType::Policy(),
                                TaskFactoryType::Policy().member_single(),
                                Diag::NonUnit, U, B);
          }
          t_solve_seq += timer.seconds() * (i>=0);
        }
        t_solve_seq /= niter;
        
        if (verbose)
          cout << BB << endl;
      }
      cout << "TriSolvePerformance:: Serial forward and backward solve of the matrix::time = " << t_solve_seq << endl;
    }
    
    {
      __INIT_DENSE_MATRIX__(BB, 1.0);
      typename TaskFactoryType::policy_type policy(max_concurrency,
                                                   max_task_size,
                                                   max_task_dependence, 
                                                   team_size);

      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);

      // wrap the hierarchically partitioned matrix with task handler
      CrsHierTaskViewType TU(&HU);
      for (ordinal_type k=0;k<HU.NumNonZeros();++k)
        HU.Value(k).fillRowViewArray();
      
      DenseHierTaskViewType TB(&HB);
      
      cout << "TriSolvePerformance:: ByBlocks forward and backward solve of the matrix" << endl;
      {
        for (int i=start;i<niter;++i) {
          timer.reset(); 
          {
            auto future_forward_solve = TaskFactoryType::Policy().create_team
              (TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::ByBlocks>
               ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
               (Diag::NonUnit, TU, TB), 0);
            
            TaskFactoryType::Policy().spawn(future_forward_solve);
            
            auto future_backward_solve = TaskFactoryType::Policy().create_team
              (TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::ByBlocks>
               ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
               (Diag::NonUnit, TU, TB), 1);
            
            TaskFactoryType::Policy().add_dependence(future_backward_solve, future_forward_solve);
            TaskFactoryType::Policy().spawn(future_backward_solve);
            
            Kokkos::Experimental::wait(TaskFactoryType::Policy());
          }
          t_solve_task += timer.seconds() * (i>=0);
        }
        t_solve_task /= niter;

        if (verbose)
          cout << BB << endl;
      }

      cout << "TriSolvePerformance:: ByBlocks forward and backward solve of the matrix::time = " << t_solve_task << endl;
    }

    if (!skip_serial) {
      cout << "TriSolvePerformance:: task scale [seq/task] = " << t_solve_seq/t_solve_task << endl;
    }

    return r_val;
  }
  KOKKOS_INLINE_FUNCTION
  int exampleDenseGemmByBlocks(const OrdinalType mmin,
                               const OrdinalType mmax,
                               const OrdinalType minc,
                               const OrdinalType k,
                               const OrdinalType mb,
                               const int max_concurrency,
                               const int max_task_dependence,
                               const int team_size,
                               const int mkl_nthreads,
                               const bool check,
                               const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

    typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType;
    typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType;

    typedef DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> DenseHierMatrixBaseType;

    typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType;
    typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "DenseGemmByBlocks:: test matrices "
         <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << " , mb = " << mb << endl;

    const size_t max_task_size = (3*sizeof(DenseTaskViewType)+196); // when 128 error
    //cout << "max task size = "<< max_task_size << endl;
    typename TaskFactoryType::policy_type policy(max_concurrency,
                                                 max_task_size,
                                                 max_task_dependence, 
                                                 team_size);
    
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    ostringstream os;
    os.precision(3);
    os << scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");

      DenseMatrixBaseType AA, BB, CC("CC", m, m), CB("CB", m, m);

      if (ArgTransA == Trans::NoTranspose) 
        AA = DenseMatrixBaseType("AA", m, k); 
      else 
        AA = DenseMatrixBaseType("AA", k, m);
      
      if (ArgTransB == Trans::NoTranspose) 
        BB = DenseMatrixBaseType("BB", k, m);
      else 
        BB = DenseMatrixBaseType("BB", m, k);
      
      for (ordinal_type j=0;j<AA.NumCols();++j)
        for (ordinal_type i=0;i<AA.NumRows();++i)
          AA.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
      
      for (ordinal_type j=0;j<BB.NumCols();++j)
        for (ordinal_type i=0;i<BB.NumRows();++i)
          BB.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
      
      for (ordinal_type j=0;j<CC.NumCols();++j)
        for (ordinal_type i=0;i<CC.NumRows();++i)
          CC.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
      CB.copy(CC);

      const double flop = get_flop_gemm<value_type>(m, m, k);

#ifdef HAVE_SHYLUTACHO_MKL
      mkl_set_num_threads(mkl_nthreads);
#endif

      os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k;
      if (check) {
        timer.reset();
        DenseTaskViewType A(&AA), B(&BB), C(&CB);
        Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas>::invoke
          (TaskFactoryType::Policy(),
           TaskFactoryType::Policy().member_single(),
           1.0, A, B, 1.0, C);
        t = timer.seconds();
        os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      {
        DenseHierMatrixBaseType HA, HB, HC;
        DenseMatrixHelper::flat2hier(AA, HA, mb, mb);
        DenseMatrixHelper::flat2hier(BB, HB, mb, mb);
        DenseMatrixHelper::flat2hier(CC, HC, mb, mb);

        DenseHierTaskViewType TA(&HA), TB(&HB), TC(&HC);
        timer.reset();
        auto future = TaskFactoryType::Policy().create_team
          (typename Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,Variant::One>
           ::template TaskFunctor<value_type,DenseHierTaskViewType,DenseHierTaskViewType,DenseHierTaskViewType>
           (1.0, TA, TB, 1.0, TC), 0);
        TaskFactoryType::Policy().spawn(future);
        Kokkos::Experimental::wait(TaskFactoryType::Policy());
        t = timer.seconds();       
        os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      } 

      if (check) {
        typedef typename Teuchos::ScalarTraits<value_type>::magnitudeType real_type; 
        real_type err = 0.0, norm = 0.0;
        for (ordinal_type j=0;j<CC.NumCols();++j)
          for (ordinal_type i=0;i<CC.NumRows();++i) {
            const real_type diff = abs(CC.Value(i,j) - CB.Value(i,j));
            const real_type val  = CB.Value(i,j);
            err  += diff*diff;
            norm += val*val;
          }
        os << ":: Check result ::err = " << sqrt(err) << ", norm = " << sqrt(norm);
      }
      cout << os.str() << endl;
    }

    return r_val;
  }
  int exampleCholByBlocks(const std::string file_input,
                          const int treecut,
                          const int prunecut,
                          const int fill_level,
                          const int rows_per_team,
                          const int max_concurrency,
                          const int max_task_dependence,
                          const int team_size,
                          const bool check,
                          const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    // for simple test, let's use host space only here, for device it needs mirroring.

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType;
    typedef CrsMatrixView<CrsMatrixBaseHostType> CrsMatrixViewHostType;

    typedef GraphTools<ordinal_type,size_type,HostSpaceType> GraphToolsHostType;

    typedef GraphTools_Scotch<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_Scotch;
    typedef GraphTools_CAMD<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_CAMD;

    typedef IncompleteSymbolicFactorization<CrsMatrixBaseHostType> IncompleteSymbolicFactorizationType;

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef TaskView<CrsMatrixViewHostType> CrsTaskViewHostType;
    typedef CrsMatrixBase<CrsTaskViewHostType,ordinal_type,size_type,HostSpaceType> CrsHierBaseHostType;
    typedef CrsMatrixView<CrsHierBaseHostType> CrsHierViewHostType;
    typedef TaskView<CrsHierViewHostType> CrsTaskHierViewHostType;

    int r_val = 0;
    
    Kokkos::Impl::Timer timer;

    ///
    /// Read from matrix market
    ///
    ///     input  - file
    ///     output - AA_host
    ///
    CrsMatrixBaseHostType AA_host("AA_host");
    timer.reset();
    {
      std::ifstream in;
      in.open(file_input);
      if (!in.good()) {
        std::cout << "Failed in open the file: " << file_input << std::endl;
        return -1;
      }
      MatrixMarket::read(AA_host, in);
    }
    double t_read = timer.seconds();

    if (verbose)
      AA_host.showMe(std::cout) << std::endl;

    ///
    /// Create a graph structure for Scotch and CAMD (rptr, cidx)
    ///     
    ///     rptr and cidx are need to be set up for Scotch and CAMD
    ///
    typename GraphToolsHostType::size_type_array rptr("Graph::RowPtrArray", AA_host.NumRows() + 1);
    typename GraphToolsHostType::ordinal_type_array cidx("Graph::ColIndexArray", AA_host.NumNonZeros());

    ///
    /// Run Scotch
    ///
    ///     input  - rptr, cidx, A_host
    ///     output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted)
    ///
    timer.reset();
    GraphToolsHostType::getGraph(rptr, cidx, AA_host);
    double t_graph = timer.seconds();

    GraphToolsHostType_Scotch S;
    S.setGraph(AA_host.NumRows(), rptr, cidx);
    S.setSeed(0);
    S.setTreeLevel();
    S.setStrategy( SCOTCH_STRATSPEED 
                   | SCOTCH_STRATLEVELMAX   
                   | SCOTCH_STRATLEVELMIN   
                   | SCOTCH_STRATLEAFSIMPLE 
                   | SCOTCH_STRATSEPASIMPLE 
                   );
    
    timer.reset();
    S.computeOrdering(treecut);
    double t_scotch = timer.seconds();
    
    if (verbose)
      S.showMe(std::cout) << std::endl;

    CrsMatrixBaseHostType AA_scotch_host("AA_scotch_host");
    AA_scotch_host.createConfTo(AA_host);
    
    CrsMatrixTools::copy(AA_scotch_host,
                         S.PermVector(),
                         S.InvPermVector(),
                         AA_host);

    if (verbose)
      AA_scotch_host.showMe(std::cout) << std::endl;

    ///
    /// Run CAMD
    ///
    ///     input  - rptr, cidx, A_host
    ///     output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted)
    ///
    timer.reset();
    GraphToolsHostType::getGraph(rptr, cidx, AA_scotch_host);
    t_graph += timer.seconds();

    GraphToolsHostType_CAMD C;
    C.setGraph(AA_scotch_host.NumRows(),
               rptr, cidx,
               S.NumBlocks(),
               S.RangeVector());

    timer.reset();
    C.computeOrdering();
    double t_camd = timer.seconds();

    if (verbose)
      C.showMe(std::cout) << std::endl;
    
    CrsMatrixBaseHostType AA_camd_host("AA_camd_host");
    AA_camd_host.createConfTo(AA_scotch_host);
    
    CrsMatrixTools::copy(AA_camd_host,
                         C.PermVector(),
                         C.InvPermVector(),
                         AA_scotch_host);
    
    if (verbose)
      AA_camd_host.showMe(std::cout) << std::endl;

    ///
    /// Symbolic factorization
    ///
    ///     input  - 
    ///     output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted)
    ///
    CrsMatrixBaseHostType AA_factor_host("AA_factor_host");

    timer.reset();
    IncompleteSymbolicFactorizationType::createNonZeroPattern(AA_factor_host,
                                                    fill_level,
                                                    Uplo::Upper,
                                                    AA_camd_host,
                                                    rows_per_team);
    double t_symbolic = timer.seconds();

    if (verbose)
      AA_factor_host.showMe(std::cout) << std::endl;

    ///
    /// Clean tempoerary matrices
    ///
    ///     input  - AA_scotch_host, AA_camd_host, C, rptr, cidx
    ///     output - none
    ///
    AA_scotch_host = CrsMatrixBaseHostType();
    AA_camd_host   = CrsMatrixBaseHostType();

    C = GraphToolsHostType_CAMD();
    rptr = typename GraphToolsHostType::size_type_array();
    cidx = typename GraphToolsHostType::ordinal_type_array();

    ///
    /// Create task policy
    ///
    ///     input  - max_task_size
    ///     output - policy
    ///
    const size_type max_task_size = (3*sizeof(CrsTaskViewHostType)+sizeof(PolicyType)+128);

    timer.reset();
    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);
    double t_policy = timer.seconds();

    ///
    /// Sequential execution
    ///
    ///     input  - AA_factor_host (matrix to be compared), rowviews
    ///     output - BB_factor_host, B_factor_host
    ///
    double t_chol_serial = 0;
    CrsMatrixBaseHostType BB_factor_host("BB_factor_host");
    if (check) {
      BB_factor_host.createConfTo(AA_factor_host);
      CrsMatrixTools::copy(BB_factor_host, AA_factor_host);
      
      CrsTaskViewHostType B_factor_host(BB_factor_host);
      Kokkos::View<typename CrsTaskViewHostType::row_view_type*,HostSpaceType>
        rowviews("RowViewInMatView", B_factor_host.NumRows());
      B_factor_host.setRowViewArray(rowviews);
      
      timer.reset();
      {
        auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::Unblocked,Variant::One>
                                              ::createTaskFunctor(policy, B_factor_host));
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)");
      }
      t_chol_serial = timer.seconds();

      if (verbose)
        BB_factor_host.showMe(std::cout) << std::endl;
    }

    ///
    /// Task parallel execution
    ///
    ///    input  - AA_factor_host, rowviews
    ///    output - HA_factor_host, AA_factor_host, B_factor_host
    ///
    double t_hier = 0, t_blocks = 0, t_chol_parallel = 0;
    CrsHierBaseHostType HA_factor_host("HA_factor_host");
    {
      timer.reset();
      S.pruneTree(prunecut);
      CrsMatrixTools::createHierMatrix(HA_factor_host, 
                                       AA_factor_host, 
                                       S.NumBlocks(),
                                       S.RangeVector(),
                                       S.TreeVector());
      t_hier = timer.seconds();    
      
      timer.reset();
      size_type nblocks = HA_factor_host.NumNonZeros();
      
      Kokkos::View<ordinal_type*,HostSpaceType> 
        ap_rowview_blocks("NumRowViewInBlocks", nblocks + 1);
      
      ap_rowview_blocks(0) = 0;
      for (ordinal_type k=0;k<nblocks;++k) 
        ap_rowview_blocks(k+1) = ap_rowview_blocks(k) + HA_factor_host.Value(k).NumRows();
      
      Kokkos::View<typename CrsMatrixViewHostType::row_view_type*,HostSpaceType>
        rowview_blocks("RowViewInBlocks", ap_rowview_blocks(nblocks));
      
      Kokkos::parallel_for(Kokkos::RangePolicy<HostSpaceType>(0, nblocks),
                           [&](const ordinal_type k) {
                             const ordinal_type begin = ap_rowview_blocks(k);
                             const ordinal_type end   = ap_rowview_blocks(k+1);
                             HA_factor_host.Value(k).setRowViewArray
                               (Kokkos::subview(rowview_blocks, Kokkos::pair<ordinal_type,ordinal_type>(begin, end)));
                           } );
      CrsMatrixTools::filterEmptyBlocks(HA_factor_host);
      t_blocks = timer.seconds();    

      {
        size_type nblocks_filtered = HA_factor_host.NumNonZeros(), nnz_blocks = 0;
        for (size_type k=0;k<nblocks_filtered; ++k) 
          nnz_blocks += HA_factor_host.Value(k).NumNonZeros();
        
        TACHO_TEST_FOR_ABORT( nnz_blocks != AA_factor_host.NumNonZeros(),
                              "nnz counted in blocks is different from nnz in the base matrix.");
      }
      
      CrsTaskHierViewHostType H_factor_host(HA_factor_host);
      timer.reset();
      {
        auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks,Variant::One>
                                              ::createTaskFunctor(policy, H_factor_host));
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)");
      }
      t_chol_parallel = timer.seconds();
      
      if (verbose)
        AA_factor_host.showMe(std::cout) << std::endl;
    }

    if (check) {
      double diff = 0, norm = 0;    
      TACHO_TEST_FOR_ABORT( BB_factor_host.NumNonZeros() != AA_factor_host.NumNonZeros(),
                            "nnz used in serial is not same as nnz used in parallel");

      const size_type nnz = AA_factor_host.NumNonZeros();
      for (size_type k=0;k<nnz;++k) {
        norm += Util::abs(BB_factor_host.Value(k));
        diff += Util::abs(AA_factor_host.Value(k) - BB_factor_host.Value(k));
      }
      std::cout << std::scientific;
      std::cout << "CholByBlocks:: check with serial execution " << std::endl
                << "               diff = " << diff << ", norm = " << norm << ", rel err = " << (diff/norm) << std::endl;
      std::cout.unsetf(std::ios::scientific);      
    }

    {
      const auto prec = std::cout.precision();
      std::cout.precision(4);

      std::cout << std::scientific;
      std::cout << "CholByBlocks:: Given    matrix = " << AA_host.NumRows() << " x " << AA_host.NumCols() << ", nnz = " << AA_host.NumNonZeros() << std::endl;
      std::cout << "CholByBlocks:: Factored matrix = " << AA_factor_host.NumRows() << " x " << AA_factor_host.NumCols() << ", nnz = " << AA_factor_host.NumNonZeros() << std::endl;
      std::cout << "CholByBlocks:: Hier     matrix = " << HA_factor_host.NumRows() << " x " << HA_factor_host.NumCols() << ", nnz = " << HA_factor_host.NumNonZeros() << std::endl;

      std::cout << "CholByBlocks:: "
                << "read = " << t_read << " [sec], "
                << "graph generation = " << (t_graph/2.0) << " [sec] "
                << "scotch reordering = " << t_scotch << " [sec] "
                << "camd reordering = " << t_camd << " [sec] " 
                << std::endl
                << "CholByBlocks:: "
                << "symbolic factorization = " << t_symbolic << " [sec] "
                << std::endl
                << "CholByBlocks:: "
                << "policy creation = " << t_policy << " [sec] "
                << "hier creation = " << t_hier << " [sec] "
                << "block specification = " << t_blocks << " [sec] "
                << std::endl
                << "CholByBlocks:: "
                << "Chol Parallel = " << t_chol_parallel << " [sec] ";
      if (check) 
        std::cout << "Chol Serial = " << (check ? t_chol_serial : -1) << " [sec] "
                  << "speed-up = " << (t_chol_serial/t_chol_parallel) << " [sec] ";
      
      std::cout << std::endl;
      
      std::cout.unsetf(std::ios::scientific);
      std::cout.precision(prec);
    }

    return r_val;
  }
  KOKKOS_INLINE_FUNCTION
  int exampleSymbolicFactor(const string file_input,
                            const int treecut,
                            const int minblksize,
                            const int seed,
                            const int fill_level,
                            const int league_size,
                            const bool reorder,
                            const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
    typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "SymbolicFactor:: import input file = " << file_input << endl;
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);
      t = timer.seconds();

      cout << "SymbolicFactor:: AA nnz = " << AA.NumNonZeros() << endl;
      if (verbose)
        cout << AA << endl;
    }
    cout << "SymbolicFactor:: import input file::time = " << t << endl;

    CrsMatrixBaseType PA("Permuted AA");
    GraphHelperType S(AA, seed);
    if (reorder) {
      timer.reset();

      S.computeOrdering(treecut, minblksize);

      PA.copy(S.PermVector(), S.InvPermVector(), AA);

      t = timer.seconds();

      if (verbose)
        cout << S << endl
             << PA << endl;
    } else {
      PA = AA;

      t = 0.0;
    }
    cout << "SymbolicFactor:: reorder the matrix::time = " << t << endl;


    CrsMatrixBaseType UU("UU");
    {
      timer.reset();

      SymbolicFactorHelperType symbolic(PA, league_size);
      symbolic.createNonZeroPattern(fill_level, Uplo::Upper, UU);

      t = timer.seconds();

      cout << "SymbolicFactor:: UU nnz = " << UU.NumNonZeros() << endl;

      if (verbose) {
        cout << symbolic << endl;
        cout << UU << endl;
      }
    }
    cout << "SymbolicFactor:: factorize the matrix::time = " << t << endl;

    return r_val;
  }
  int exampleCholUnblocked(const std::string file_input,
                           const int treecut,
                           const int prunecut,
                           const int fill_level,
                           const int rows_per_team,
                           const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType;
    typedef GraphTools<ordinal_type,size_type,HostSpaceType> GraphToolsHostType;

    typedef GraphTools_Scotch<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_Scotch;
    typedef GraphTools_CAMD<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_CAMD;

    typedef IncompleteSymbolicFactorization<CrsMatrixBaseHostType> IncompleteSymbolicFactorizationType;

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> CrsMatrixBaseDeviceType;

    typedef CrsMatrixView<CrsMatrixBaseDeviceType> CrsMatrixViewDeviceType;
    typedef TaskView<CrsMatrixViewDeviceType> CrsTaskViewDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;

    CrsMatrixBaseHostType AA_host("AA_host");
    timer.reset();
    {
      std::ifstream in;
      in.open(file_input);
      if (!in.good()) {
        std::cout << "Failed in open the file: " << file_input << std::endl;
        return -1;
      }
      MatrixMarket::read(AA_host, in);
    }
    double t_read = timer.seconds();

    if (verbose)
      AA_host.showMe(std::cout) << std::endl;

    typename GraphToolsHostType::size_type_array rptr("Graph::RowPtrArray", AA_host.NumRows() + 1);
    typename GraphToolsHostType::ordinal_type_array cidx("Graph::ColIndexArray", AA_host.NumNonZeros());

    timer.reset();
    GraphToolsHostType::getGraph(rptr, cidx, AA_host);
    double t_graph = timer.seconds();

    GraphToolsHostType_Scotch S;
    S.setGraph(AA_host.NumRows(), rptr, cidx);
    S.setSeed(0);
    S.setTreeLevel();
    S.setStrategy( SCOTCH_STRATSPEED
                   | SCOTCH_STRATLEVELMAX  
                   | SCOTCH_STRATLEVELMIN  
                   | SCOTCH_STRATLEAFSIMPLE
                   | SCOTCH_STRATSEPASIMPLE
                   );

    timer.reset();
    S.computeOrdering(treecut);
    double t_scotch = timer.seconds();

    S.pruneTree(prunecut);
    if (verbose)
      S.showMe(std::cout) << std::endl;

    CrsMatrixBaseHostType BB_host("BB_host");
    BB_host.createConfTo(AA_host);

    CrsMatrixTools::copy(BB_host,
                         S.PermVector(),
                         S.InvPermVector(),
                         AA_host);

    if (verbose)
      BB_host.showMe(std::cout) << std::endl;

    timer.reset();
    GraphToolsHostType::getGraph(rptr, cidx, BB_host);
    t_graph += timer.seconds();

    GraphToolsHostType_CAMD C;
    C.setGraph(BB_host.NumRows(),
               rptr, cidx,
               S.NumBlocks(),
               S.RangeVector());

    timer.reset();
    C.computeOrdering();
    double t_camd = timer.seconds();

    if (verbose)
      C.showMe(std::cout) << std::endl;

    CrsMatrixBaseHostType CC_host("CC_host");
    CC_host.createConfTo(BB_host);

    CrsMatrixTools::copy(CC_host,
                         C.PermVector(),
                         C.InvPermVector(),
                         BB_host);

    if (verbose)
      CC_host.showMe(std::cout) << std::endl;

    CrsMatrixBaseHostType DD_host("DD_host");

    timer.reset();
    IncompleteSymbolicFactorizationType::createNonZeroPattern(DD_host,
                                                    fill_level,
                                                    Uplo::Upper,
                                                    CC_host,
                                                    rows_per_team);
    double t_symbolic = timer.seconds();

    if (verbose)
      DD_host.showMe(std::cout) << std::endl;

    // ==================================================================================

    CrsMatrixBaseDeviceType AA_device("AA_device");
    AA_device.mirror(DD_host);

    const size_type max_concurrency = 10;
    const size_type max_task_size = (3*sizeof(CrsTaskViewDeviceType)+sizeof(PolicyType)+128);
    const size_type max_task_dependence = 0;
    const size_type team_size = 1;

    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);

    CrsMatrixViewDeviceType A_device(AA_device);
    Kokkos::View<typename CrsMatrixViewDeviceType::row_view_type*,DeviceSpaceType> 
      rowviews("RowViewInMatView", A_device.NumRows());
    A_device.setRowViewArray(rowviews);
    
    timer.reset();
    int ierr = Chol<Uplo::Upper,AlgoChol::Unblocked,Variant::One>::invoke
      (policy, policy.member_single(),
       A_device);
    double t_chol = timer.seconds();
    TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)");

    if (verbose) {
      DD_host.mirror(AA_device);
      DD_host.showMe(std::cout) << std::endl;
    }

    {
      const auto prec = std::cout.precision();
      std::cout.precision(4);

      std::cout << std::scientific;
      std::cout << "SymbolicFactorization:: Given matrix  dimension = " << AA_host.NumRows() << " x " << AA_host.NumCols()
                << ", " << " nnz = " << AA_host.NumNonZeros() << std::endl;
      std::cout << "SymbolicFactorization:: Upper factors dimension = " << DD_host.NumRows() << " x " << DD_host.NumCols()
                << ", " << " nnz = " << DD_host.NumNonZeros() << std::endl;

      std::cout << "SymbolicFactorization:: "
                << "read = " << t_read << " [sec], "
                << "graph generation = " << (t_graph/2.0) << " [sec] "
                << "scotch reordering = " << t_scotch << " [sec] "
                << "camd reordering = " << t_camd << " [sec] "
                << "symbolic factorization = " << t_symbolic << " [sec] "
                << "Cholesky factorization = " << t_chol << " [sec] "
                << std::endl;

      std::cout.unsetf(std::ios::scientific);
      std::cout.precision(prec);
    }

    return r_val;
  }
  KOKKOS_INLINE_FUNCTION
  int exampleStatByBlocks(const string file_input,
                          const int treecut,
                          const int minblksize,
                          const int prunecut,
                          const int seed,
                          const int fill_level,
                          const int league_size,
                          const int histogram_size,
                          const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
    typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef CrsMatrixBase<CrsMatrixViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "StatByBlocks:: import input file = " << file_input << endl;
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      t = timer.seconds();

      if (verbose)
        cout << AA << endl;
    }
    cout << "StatByBlocks:: import input file::time = " << t << endl;


    CrsMatrixBaseType UU("UU");
    CrsHierMatrixBaseType HU("HU");
    {
      CrsMatrixBaseType PA("Permuted AA");
      typename GraphHelperType::size_type_array rptr(AA.Label()+"Graph::RowPtrArray", AA.NumRows() + 1);
      typename GraphHelperType::ordinal_type_array cidx(AA.Label()+"Graph::ColIndexArray", AA.NumNonZeros());

      AA.convertGraph(rptr, cidx);
      GraphHelperType S(AA.Label()+"ScotchHelper",
                        AA.NumRows(),
                        rptr,
                        cidx,
                        seed);
      {
        timer.reset();

        S.computeOrdering(treecut, minblksize);
        S.pruneTree(prunecut);

        PA.copy(S.PermVector(), S.InvPermVector(), AA);

        t = timer.seconds();

        if (verbose)
          cout << S << endl;
      }
      cout << "StatByBlocks:: reorder the matrix::time = " << t << endl;

      {
        SymbolicFactorHelperType F(PA, league_size);
        timer.reset();

        F.createNonZeroPattern(fill_level, Uplo::Upper, UU);

        t = timer.seconds();
        cout << "StatByBlocks:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl;
      }
      cout << "StatByBlocks:: symbolic factorization::time = " << t << endl;

      {
        timer.reset();

        CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   S.TreeVector());

        for (ordinal_type k=0;k<HU.NumNonZeros();++k)
          HU.Value(k).fillRowViewArray();

        t = timer.seconds();
        cout << "StatByBlocks:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
      }
      cout << "StatByBlocks:: construct hierarchical matrix::time = " << t << endl;
    }

    {
      cout << endl;
      cout << " -- Flat matrix: UU --" << endl;
      cout << "    # of Rows     = " << UU.NumRows() << endl;
      cout << "    # of Cols     = " << UU.NumCols() << endl;
      cout << "    # of Nonzeros = " << UU.NumNonZeros() << endl;
      cout << endl;
      cout << " -- Hierarchical matrix: HU --" << endl;
      cout << "    # of Rows     = " << HU.NumRows() << endl;
      cout << "    # of Cols     = " << HU.NumCols() << endl;
      cout << "    # of Nonzeros = " << HU.NumNonZeros() << endl;
      cout << endl;
      cout << " -- Blocks of HU --" << endl;

      map<size_type,size_type> histogram;

      if (HU.NumNonZeros()) {

        size_type
          nnz_min = HU.Value(0).countNumNonZeros(),
          nnz_max = nnz_min,
          nnz_sum = 0,
          nnz_ave = 0;

        size_type nnz_cnt = 0;
        for (ordinal_type k=0;k<HU.NumNonZeros();++k) {
          const auto nnz_blk = HU.Value(k).countNumNonZeros();
          if (nnz_blk) {
            nnz_min  = min(nnz_min, nnz_blk);
            nnz_max  = max(nnz_max, nnz_blk);
            nnz_sum += nnz_blk;
            ++nnz_cnt;

            if (histogram_size)
              ++histogram[nnz_blk/histogram_size];
          }
        }
        nnz_ave = nnz_sum/nnz_cnt;

        cout << "    Min # of Nonzeros = " << nnz_min << endl;
        cout << "    Max # of Nonzeros = " << nnz_max << endl;
        cout << "    Ave # of Nonzeros = " << nnz_ave << endl;
        cout << "    Sum # of Nonzeros = " << nnz_sum << endl;
        cout << "    # of empty Blocks = " << (HU.NumNonZeros() - nnz_cnt) << endl;

        if (histogram_size) {
          cout << "    Histogram" << endl;
          for (auto it=histogram.begin();it!=histogram.end();++it)
            cout << (it->first*histogram_size) << " , " << it->second << endl;
        }
      } else {
        cout << "    No registered blocks" << endl;
      }

    }

    return r_val;
  }
  KOKKOS_INLINE_FUNCTION
  int exampleDenseTrsmMKL(const OrdinalType mmin,
                          const OrdinalType mmax,
                          const OrdinalType minc,
                          const OrdinalType k,
                          const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "DenseGemmMKL:: test matrices "
         <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << endl;

    ostringstream os;
    os.precision(3);
    os << scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");

      DenseMatrixBaseType AA("AA", m, m), BB("BB", m, k), BC("BC", m, k);
      
      // setup upper triangular
      for (ordinal_type j=0;j<AA.NumCols();++j) {
        AA.Value(j,j) = 10.0;
        for (ordinal_type i=0;i<j;++i)
          AA.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
      }

      // setup one and right hand side is going to be overwritten by the product of AB
      for (ordinal_type j=0;j<BB.NumCols();++j)
        for (ordinal_type i=0;i<BB.NumRows();++i)
          BB.Value(i,j) = 1.0;

      Teuchos::BLAS<ordinal_type,value_type> blas;

      blas.GEMM(Teuchos::CONJ_TRANS, Teuchos::NO_TRANS,
                m, k, m,
                1.0,
                AA.ValuePtr(), AA.ColStride(),
                BB.ValuePtr(), BB.ColStride(),
                0.0,
                BC.ValuePtr(), BC.ColStride());
      BB.copy(BC);

      const double flop = get_flop_trsm_upper<value_type>(m, k);

      os << "DenseTrsmMKL:: m = " << m << " k = " << k;
      {
        timer.reset();
        Teuchos::BLAS<ordinal_type,value_type> blas;

        const ordinal_type mm = AA.NumRows();
        const ordinal_type nn = BB.NumCols();

        blas.TRSM(Teuchos::LEFT_SIDE, Teuchos::UPPER_TRI, Teuchos::CONJ_TRANS,
                  Teuchos::NON_UNIT_DIAG,
                  mm, nn,
                  1.0,
                  AA.ValuePtr(), AA.ColStride(),
                  BB.ValuePtr(), BB.ColStride());
        t = timer.seconds();
        os << ":: MKL Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }
      cout << os.str() << endl;
    }

    return r_val;
  }
  int exampleMatrixMarket(const std::string file_input,
                          const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);
  
    typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType>   CrsMatrixBaseHostType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    
    CrsMatrixBaseHostType AA("AA");
    timer.reset();
    {
      std::ifstream in;
      in.open(file_input);
      if (!in.good()) {
        std::cout << "Failed in open the file: " << file_input << std::endl;
        return -1;
      }
      MatrixMarket::read(AA, in);
    }
    double t_read = timer.seconds();
    
    timer.reset();
    {
      std::string file_output = "mm-test-output.mtx";
      std::ofstream out;
      out.open(file_output);
      if (!out.good()) {
        std::cout << "Failed in open the file: " << file_output << std::endl;
        return -1;
      }
      MatrixMarket::write(out, AA, "%% Test output");
    }
    double t_write = timer.seconds();

    {
      const auto prec = std::cout.precision();
      std::cout.precision(4);

      std::cout << std::scientific
                << "MatrixMarket:: dimension = " << AA.NumRows() << " x " << AA.NumCols() 
                << ", " << " nnz = " << AA.NumNonZeros() << ", "
                << "read = " << t_read << " [sec], "
                << "write = " << t_write << " [sec] "
                << std::endl;

      std::cout.unsetf(std::ios::scientific);
      std::cout.precision(prec);
    }
    
    if (verbose) {
      AA.showMe(std::cout) << std::endl;
    }

    CrsMatrixBaseHostType BB("BB");
    BB.createConfTo(AA);

    CrsMatrixTools::copy(BB, Uplo::Upper, 0, AA);
    if (verbose) {
      BB.setLabel("Copy::AA:Upper::0"); BB.showMe(std::cout) << std::endl;
    }

    CrsMatrixTools::copy(BB, Uplo::Upper, 1, AA);
    if (verbose) {
      BB.setLabel("Copy::AA:Upper::1"); BB.showMe(std::cout) << std::endl;    
    }

    CrsMatrixTools::copy(BB, Uplo::Lower, 0, AA);
    if (verbose) {
      BB.setLabel("Copy::AA:Lower::0"); BB.showMe(std::cout) << std::endl;    
    }

    CrsMatrixTools::copy(BB, Uplo::Lower, 1, AA);
    if (verbose) {
      BB.setLabel("Copy::AA:Lower::1"); BB.showMe(std::cout) << std::endl;    
    }

    return r_val;
  }
    int ComputeBasis_HGRAD_Vector(const ordinal_type nworkset,
                                  const ordinal_type C,
                                  const ordinal_type order,
                                  const bool verbose) {
      typedef Vector<VectorTagType> VectorType;

      typedef typename VectorTagType::value_type ValueType;
      constexpr int VectorLength = VectorTagType::length;

      Teuchos::RCP<std::ostream> verboseStream;
      Teuchos::oblackholestream bhs; // outputs nothing

      if (verbose) 
        verboseStream = Teuchos::rcp(&std::cout, false);
      else
        verboseStream = Teuchos::rcp(&bhs,       false);

      Teuchos::oblackholestream oldFormatState;
      oldFormatState.copyfmt(std::cout);

      typedef typename
        Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

      *verboseStream << "DeviceSpace::  "; DeviceSpaceType::print_configuration(*verboseStream, false);
      *verboseStream << "HostSpace::    ";   HostSpaceType::print_configuration(*verboseStream, false);
      *verboseStream << "VectorLength::  " << (VectorLength) << "\n";

      using BasisTypeHost = Basis_HGRAD_HEX_C1_FEM<HostSpaceType,ValueType,ValueType>;
      using ImplBasisType = Impl::Basis_HGRAD_HEX_C1_FEM;
      using range_type = Kokkos::pair<ordinal_type,ordinal_type>;

      constexpr size_t LLC_CAPACITY = 32*1024*1024;
      Intrepid2::Test::Flush<LLC_CAPACITY,DeviceSpaceType> flush;
      
      Kokkos::Impl::Timer timer;
      double t_vectorize = 0;
      int errorFlag = 0;

      BasisTypeHost hostBasis;
      const auto cellTopo = hostBasis.getBaseCellTopology();

      auto cubature = DefaultCubatureFactory::create<DeviceSpaceType,ValueType,ValueType>(cellTopo, order);

      const ordinal_type 
        numCells = C,
        numCellsAdjusted = C/VectorLength + (C%VectorLength > 0),
        numVerts = cellTopo.getVertexCount(),
        numDofs = hostBasis.getCardinality(),
        numPoints = cubature->getNumPoints(), 
        spaceDim = cubature->getDimension();

      Kokkos::DynRankView<ValueType,HostSpaceType> dofCoordsHost("dofCoordsHost", numDofs, spaceDim);
      hostBasis.getDofCoords(dofCoordsHost);
      const auto refNodesHost = Kokkos::subview(dofCoordsHost, range_type(0, numVerts), Kokkos::ALL());
      
      // pertub nodes
      Kokkos::DynRankView<VectorType,HostSpaceType> worksetCellsHost("worksetCellsHost", numCellsAdjusted, numVerts, spaceDim);
      for (ordinal_type cell=0;cell<numCells;++cell) {
        for (ordinal_type i=0;i<numVerts;++i)
          for (ordinal_type j=0;j<spaceDim;++j) {
            ValueType val = (rand()/(RAND_MAX + 1.0))*0.2 -0.1;
            worksetCellsHost(cell/VectorLength, i, j)[cell%VectorLength] = refNodesHost(i, j) + val; 
          }
      }

      auto worksetCells = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), worksetCellsHost);
      Kokkos::deep_copy(worksetCells, worksetCellsHost);

      Kokkos::DynRankView<ValueType,DeviceSpaceType> refPoints("refPoints", numPoints, spaceDim), refWeights("refWeights", numPoints);
      cubature->getCubature(refPoints, refWeights);

      std::cout
        << "===============================================================================\n" 
        << " Performance Test evaluating ComputeBasis \n"
        << " # of workset = " << nworkset << "\n" 
        << " Test Array Structure (C,F,P,D) = " << numCells << ", " << numDofs << ", " << numPoints << ", " << spaceDim << "\n"
        << "===============================================================================\n";

      *verboseStream
        << "\n"
        << "===============================================================================\n"
        << "TEST 1: evaluateFields vector version\n"
        << "===============================================================================\n";
      
      try {
        Kokkos::DynRankView<ValueType,DeviceSpaceType> 
          refBasisValues("refBasisValues", numDofs, numPoints),
          refBasisGrads ("refBasisGrads",  numDofs, numPoints, spaceDim);
        
        ImplBasisType::getValues<DeviceSpaceType>(refBasisValues, refPoints, OPERATOR_VALUE);
        ImplBasisType::getValues<DeviceSpaceType>(refBasisGrads,  refPoints, OPERATOR_GRAD);
        
        const ordinal_type ibegin = -3;

        // testing vertical approach
        {          
          Kokkos::DynRankView<VectorType,DeviceSpaceType> 
            weightedBasisValues("weightedBasisValues", numCellsAdjusted, numDofs, numPoints),
            weightedBasisGrads ("weightedBasisGrads",  numCellsAdjusted, numDofs, numPoints, spaceDim);

          typedef F_hgrad_eval<VectorType,ValueType,DeviceSpaceType> FunctorType;

          using range_policy_type = Kokkos::Experimental::MDRangePolicy
            < DeviceSpaceType, Kokkos::Experimental::Rank<2>, Kokkos::IndexType<ordinal_type> >;
          range_policy_type policy( {                0,         0 },
                                    { numCellsAdjusted, numPoints } );

          FunctorType functor(weightedBasisValues,
                              weightedBasisGrads,
                              refBasisGrads,
                              worksetCells,
                              refWeights,
                              refBasisValues,
                              refBasisGrads);
          
          for (ordinal_type iwork=ibegin;iwork<nworkset;++iwork) {
            flush.run();

            DeviceSpaceType::fence();
            timer.reset();
            
            Kokkos::parallel_for(policy, functor);

            DeviceSpaceType::fence();
            t_vectorize += (iwork >= 0)*timer.seconds();
          }

        }

      } catch (std::exception err) {
        *verboseStream << "UNEXPECTED ERROR !!! ----------------------------------------------------------\n";
        *verboseStream << err.what() << '\n';
        *verboseStream << "-------------------------------------------------------------------------------" << "\n\n";
        errorFlag = -1000;
      }

      std::cout 
        << "TEST HGRAD " 
        << " t_vectorize = " << (t_vectorize/nworkset)
        << std::endl;
      
      if (errorFlag != 0)
        std::cout << "End Result: TEST FAILED\n";
      else
        std::cout << "End Result: TEST PASSED\n";
      
      // reset format state of std::cout
      std::cout.copyfmt(oldFormatState);
      
      return errorFlag;
    }
  KOKKOS_INLINE_FUNCTION
  int exampleCholUnblocked(const string file_input,
                           const int max_task_dependence,
                           const int team_size,
                           const int algo,
                           const int variant,
                           const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "CholUnblocked:: import input file = " << file_input << endl;        
    CrsMatrixBaseType AA("AA"), UU("UU");    
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      UU.copy(Uplo::Upper, AA);

      t = timer.seconds();

      if (verbose)
        cout << UU << endl;
    }
    cout << "CholUnblocked:: import input file::time = " << t << endl;        

#ifdef __USE_FIXED_TEAM_SIZE__
    typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
    typename TaskFactoryType::policy_type policy(max_task_dependence, team_size);
#endif
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    cout << "CholUnblocked:: factorize the matrix" << endl;
    CrsTaskViewType U(&UU);
    U.fillRowViewArray();
    {
      timer.reset();
    
      typename TaskFactoryType::future_type future;
      switch (algo) {
      case AlgoChol::UnblockedOpt: {
        if (variant == Variant::One)
          future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
                                                         ::TaskFunctor<CrsTaskViewType>(U), 0);
        else if (variant == Variant::Two)
          future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::Two>
                                                         ::TaskFunctor<CrsTaskViewType>(U), 0);
        else {
          ERROR(">> Not supported algorithm variant");          
        }
        break;
      }
      case AlgoChol::Dummy: {
        future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::Dummy>
                                                       ::TaskFunctor<CrsTaskViewType>(U), 0);
        break;
      }
      default:
        ERROR(">> Not supported algorithm");
        break;
      }
      TaskFactoryType::Policy().spawn(future);
      Kokkos::Experimental::wait(TaskFactoryType::Policy());

      t = timer.seconds();

      if (verbose)
        cout << UU << endl;
    }   
    cout << "CholUnblocked:: factorize the matrix::time = " << t << endl; 
    
    return r_val;
  }
Example #17
0
  BASKER_INLINE
  int Basker<Int, Entry, Exe_Space>::factor_inc_lvl(Int option)
  {

    printf("Factor Inc Level Called \n");

    gn = A.ncol;
    gm = A.nrow;

    if(Options.btf == BASKER_TRUE)
      {

	//JDB: We can change this for the new inteface

	//call reference copy constructor
	gn = A.ncol;
	gm = A.nrow;
	A = BTF_A; 
	//printf("\n\n Switching A, newsize: %d \n",
	//   A.ncol);
	//printMTX("A_FACTOR.mtx", A);
      }
   

    //Spit into Domain and Sep
    //----------------------Domain-------------------------//
    #ifdef BASKER_KOKKOS

    //====TIMER==
    #ifdef BASKER_TIME
    Kokkos::Impl::Timer       timer;
    #endif
    //===TIMER===

    typedef Kokkos::TeamPolicy<Exe_Space>        TeamPolicy;

    if(btf_tabs_offset != 0)
      {

	kokkos_nfactor_domain_inc_lvl <Int,Entry,Exe_Space>
	  domain_nfactor(this);
	Kokkos::parallel_for(TeamPolicy(num_threads,1), 
			     domain_nfactor);
	Kokkos::fence();
    

	//=====Check for error======
	while(true)
	  {
	INT_1DARRAY thread_start;
	MALLOC_INT_1DARRAY(thread_start, num_threads+1);
	init_value(thread_start, num_threads+1, 
		   (Int) BASKER_MAX_IDX);
	int nt = nfactor_domain_error(thread_start);
	if(nt == BASKER_SUCCESS)
	  {
	    break;
	  }
	else
	  {
	    printf("restart \n");
	    kokkos_nfactor_domain_remalloc <Int, Entry, Exe_Space>
	      diag_nfactor_remalloc(this, thread_start);
	    Kokkos::parallel_for(TeamPolicy(num_threads,1),
				 diag_nfactor_remalloc);
	    Kokkos::fence();
	  }
      }//end while
   

    //====TIMER===
    #ifdef BASKER_TIME
    printf("Time DOMAIN: %f \n", timer.seconds());
    timer.reset();
    #endif
    //====TIMER====
    

    #else// else basker_kokkos
    #pragma omp parallel
    {


    }//end omp parallel
    #endif //end basker_kokkos

      }
    //-------------------End--Domian--------------------------//

   
    //---------------------------Sep--------------------------//

    
    
    if(btf_tabs_offset != 0)
      {
        //for(Int l=1; l<=1; l++)
       for(Int l=1; l <= tree.nlvls; l++)
      {

        //Come back for syncs
        
	//#ifdef BASKER_OLD_BARRIER
	Int lthreads = pow(2,l);
	Int lnteams = num_threads/lthreads;
	//#else
	//Int lthreads = 1;
	//Int lnteams = num_threads/lthreads;
	//#endif

	
	//printf("\n\n   ============ SEP: %d ======\n\n",l);

	#ifdef BASKER_KOKKOS
	Kokkos::Impl::Timer  timer_inner_sep;
	#ifdef BASKER_NO_LAMBDA
       
	kokkos_nfactor_sep2_inc_lvl <Int, Entry, Exe_Space>
	  sep_nfactor(this,l);
	
	Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),
			     sep_nfactor);
	Kokkos::fence();
	
	#ifdef BASKER_TIME
	printf("Time INNERSEP: %d %f \n", 
	       l, timer_inner_sep.seconds());
	#endif
        #else //ELSE BASKER_NO_LAMBDA
	//Note: to be added
        #endif //end BASKER_NO_LAMBDA
	#else
	#pragma omp parallel
	{

	}//end omp parallel
	#endif
      }//end over each level

    #ifdef BASKER_TIME
    printf("Time SEP: %f \n", timer.seconds());
    #endif
      }

   
    //-------------------------End Sep----------------//


    //-------------------IF BTF-----------------------//
    if(Options.btf == BASKER_TRUE)
      {
	//=====Timer
	#ifdef BASKER_TIME
	Kokkos::Impl::Timer  timer_btf;
	#endif
	//====Timer
	
	//======Call diag factor====
	/*
	kokkos_nfactor_diag <Int, Entry, Exe_Space> 
	  diag_nfactor(this);
	Kokkos::parallel_for(TeamPolicy(num_threads,1),
			     diag_nfactor);
	Kokkos::fence();
	*/
	//=====Check for error======
	//while(true)
	// {
	    //INT_1DARRAY thread_start;
	    // MALLOC_INT_1DARRAY(thread_start, num_threads+1);
	    //init_value(thread_start, num_threads+1, 
	    //	       (Int) BASKER_MAX_IDX);
	    //int nt = nfactor_diag_error(thread_start);
	    // if(nt == BASKER_SUCCESS)
	    //  {
	    ///		break;
	    // }
	    //else
	    // {
		/*
		break;
		printf("restart \n");
		kokkos_nfactor_diag_remalloc <Int, Entry, Exe_Space>
		  diag_nfactor_remalloc(this, thread_start);
		Kokkos::parallel_for(TeamPolicy(num_threads,1),
				     diag_nfactor);
		Kokkos::fence();
		*/
	    //}
	    // }//end while

	//====TIMER
	#ifdef BASKER_TIME
	printf("Time BTF: %f \n", 
	       timer_btf.seconds());
	#endif
	//===TIMER

      }//end btf call

    
    return 0;
  }//end factor_lvl_inc()
  int exampleDenseCholByBlocks(const ordinal_type mmin,
                               const ordinal_type mmax,
                               const ordinal_type minc,
                               const ordinal_type mb,
                               const int max_concurrency,
                               const int max_task_dependence,
                               const int team_size,
                               const int mkl_nthreads,
                               const bool check,
                               const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType;
    typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType;
    typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType;
    typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType;

    typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType;

    typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType;
    typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    std::cout << "DenseCholByBlocks:: test matrices "
              <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc 
              << " , mb = " << mb << std::endl;

    const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); 
    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);

    std::ostringstream os;
    os.precision(3);
    os << std::scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");
      
      // host matrices
      DenseMatrixBaseHostType AA_host("AA_host", m, m), AB_host("AB_host"), TT_host("TT_host");

      // random T matrix
      {
        TT_host.createConfTo(AA_host);
        for (ordinal_type j=0;j<TT_host.NumCols();++j) {
          for (ordinal_type i=0;i<TT_host.NumRows();++i)
            TT_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
          TT_host.Value(j,j) = std::fabs(TT_host.Value(j,j));
        }
      }
      // create SPD matrix
      {
        Teuchos::BLAS<ordinal_type,value_type> blas;

        blas.HERK(ArgUplo == Uplo::Upper ? Teuchos::UPPER_TRI : Teuchos::LOWER_TRI, 
                  Teuchos::CONJ_TRANS,
                  m, m,
                  1.0,
                  TT_host.ValuePtr(), TT_host.ColStride(),
                  0.0,
                  AA_host.ValuePtr(), AA_host.ColStride());

        // preserve a copy of A
        AB_host.createConfTo(AA_host);        
        DenseMatrixTools::copy(AB_host, AA_host);
      }

      const double flop = DenseFlopCount<value_type>::Chol(m);

#ifdef HAVE_SHYLUTACHO_MKL
      mkl_set_num_threads(mkl_nthreads);
#endif
      os << "DenseCholByBlocks:: m = " << m << "  ";
      int ierr = 0;
      if (check) {
        timer.reset();
        DenseMatrixViewHostType A_host(AB_host); 
        ierr = Chol<ArgUplo,AlgoChol::ExternalLapack,Variant::One>::invoke
          (policy, policy.member_single(),
           A_host);
        t = timer.seconds();
        TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)");
        os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      DenseMatrixBaseDeviceType AA_device("AA_device");
      {
        timer.reset();
        AA_device.mirror(AA_host);
        t = timer.seconds();
        os << ":: Mirror = " << t << " [sec]  ";
      }

      {
        DenseHierMatrixBaseDeviceType HA_device("HA_device");
        
        DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb);

        DenseHierTaskViewDeviceType TA_device(HA_device);

        timer.reset();
        auto future = policy.proc_create_team
          (Chol<ArgUplo,AlgoChol::DenseByBlocks,ArgVariant>
           ::createTaskFunctor(policy, TA_device), 
           0);
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        t = timer.seconds();
        os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      AA_host.mirror(AA_device);
      if (!ierr && check) {
        double err = 0.0, norm = 0.0;
        for (ordinal_type j=0;j<AA_host.NumCols();++j)
          for (ordinal_type i=0;i<=j;++i) {
            const double diff = abs(AA_host.Value(i,j) - AB_host.Value(i,j));
            const double val  = AB_host.Value(i,j);
            err  += diff*diff;
            norm += val*val;
          }
        os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err);
      }
      std::cout << os.str() << std::endl;
    }
    
    return r_val;
  }
  void viennaCL_apply(
      KernelHandle *handle,
      typename KernelHandle::nnz_lno_t m,
      typename KernelHandle::nnz_lno_t n,
      typename KernelHandle::nnz_lno_t k,
      in_row_index_view_type row_mapA,
      in_nonzero_index_view_type entriesA,
      in_nonzero_value_view_type valuesA,

      bool transposeA,
      bin_row_index_view_type row_mapB,
      bin_nonzero_index_view_type entriesB,
      bin_nonzero_value_view_type valuesB,
      bool transposeB,
      cin_row_index_view_type &row_mapC,
      cin_nonzero_index_view_type &entriesC,
      cin_nonzero_value_view_type &valuesC){

#ifdef KERNELS_HAVE_VIENNACL

    typedef typename KernelHandle::nnz_lno_t idx;
    typedef in_row_index_view_type idx_array_type;

    typedef typename KernelHandle::nnz_scalar_t value_type;


    typedef typename in_row_index_view_type::device_type device1;
    typedef typename in_nonzero_index_view_type::device_type device2;
    typedef typename in_nonzero_value_view_type::device_type device3;

    typedef typename KernelHandle::HandleExecSpace MyExecSpace;

    std::cout << "RUNNING VIENNACL" << std::endl;

    typedef typename viennacl::compressed_matrix<value_type>::handle_type it;
    typedef typename viennacl::compressed_matrix<value_type>::value_type vt;

    if ((Kokkos::Impl::is_same<idx, int>::value && Kokkos::Impl::is_same<typename KernelHandle::size_type, int>::value )||
        (Kokkos::Impl::is_same<idx, unsigned int>::value && Kokkos::Impl::is_same<typename KernelHandle::size_type, unsigned int>::value ) ||
        (Kokkos::Impl::is_same<idx, it>::value && Kokkos::Impl::is_same<typename KernelHandle::size_type, it>::value )
        ){

      unsigned int * a_xadj = (unsigned int *)row_mapA.ptr_on_device();
      unsigned int * b_xadj = (unsigned int * )row_mapB.ptr_on_device();
      unsigned int * c_xadj = (unsigned int * )row_mapC.ptr_on_device();

      unsigned int * a_adj = (unsigned int * )entriesA.ptr_on_device();
      unsigned int * b_adj = (unsigned int * )entriesB.ptr_on_device();
      unsigned int * c_adj = (unsigned int * )entriesC.ptr_on_device();

      int nnzA = entriesA.dimension_0();
      int nnzB = entriesB.dimension_0();

      value_type *a_ew = valuesA.ptr_on_device();
      value_type *b_ew = valuesB.ptr_on_device();
      value_type *c_ew = valuesC.ptr_on_device();



      /*
        std::cout << "create a" << std::endl;
        std::cout << "m:" << m << " n:" << n << std::endl;
        std::cout << "a_xadj[0]:" << a_xadj[0] << " a_xadj[m]:" << a_xadj[m] << std::endl;
        std::cout << "a_adj[a_xadj[m] - 1]:" << a_adj[a_xadj[m] - 1] << " a_ew[a_xadj[m] - 1]:" << a_ew[a_xadj[m] - 1] << std::endl;
       */


      Kokkos::Impl::Timer timerset;

      viennacl::compressed_matrix<value_type> A;
      viennacl::compressed_matrix<value_type> B;
      A.set(a_xadj, a_adj, a_ew, m, n, nnzA);
      B.set(b_xadj, b_adj, b_ew, n, k, nnzB);
      std::cout << "compress matrix create:" << timerset.seconds() << std::endl;


      std::cout << "Now running ViennaCL" << std::endl;
      Kokkos::Impl::Timer timer1;
      viennacl::compressed_matrix<value_type> C = viennacl::linalg::prod(A, B);
      std::cout << "Actual VIENNACL SPMM Time:" << timer1.seconds() << std::endl;

      {



        unsigned int c_rows = m, c_cols = k, cnnz = C.nnz();


        value_type   const * values   = viennacl::linalg::host_based::detail::extract_raw_pointer<value_type>(C.handle());
        unsigned int const * rows_start = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(C.handle1());
        unsigned int const * columns = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(C.handle2());


        {
          Kokkos::Impl::Timer copy_time;
          row_mapC = typename cin_row_index_view_type::non_const_type(Kokkos::ViewAllocateWithoutInitializing("rowmapC"), c_rows + 1);
          entriesC = typename cin_nonzero_index_view_type::non_const_type (Kokkos::ViewAllocateWithoutInitializing("EntriesC") , cnnz);
          valuesC = typename cin_nonzero_value_view_type::non_const_type (Kokkos::ViewAllocateWithoutInitializing("valuesC") ,  cnnz);

          KokkosKernels::Experimental::Util::copy_vector<unsigned int const *, typename cin_row_index_view_type::non_const_type, MyExecSpace> (m, rows_start, row_mapC);
          idx nnz = cnnz;

          KokkosKernels::Experimental::Util::copy_vector<unsigned int const *, typename cin_nonzero_index_view_type::non_const_type, MyExecSpace> (nnz, columns, entriesC);
          KokkosKernels::Experimental::Util::copy_vector<value_type   const *, typename cin_nonzero_value_view_type::non_const_type, MyExecSpace> (m, values, valuesC);
          double copy_time_d = copy_time.seconds();
          std::cout << "VIENNACL COPYTIME:" << copy_time_d << std::endl;
        }

      }
    }
    else {

      //int *a_xadj = row_mapA.ptr_on_device();
      std::cerr << "vienna requires (u) integer values" << std::endl;

      if (Kokkos::Impl::is_same<idx, long>::value){
        std::cerr << "MKL is given long" << std::endl;
      }
      else if (Kokkos::Impl::is_same<idx, const int>::value){
        std::cerr << "MKL is given const int" << std::endl;
      }
      else if (Kokkos::Impl::is_same<idx, unsigned long>::value){
        std::cerr << "MKL is given unsigned long" << std::endl;
      }
      else if (Kokkos::Impl::is_same<idx, const unsigned long>::value){
        std::cerr << "MKL is given const unsigned long" << std::endl;
      }
      else{
        std::cerr << "MKL is given something else" << std::endl;
      }
      return;
    }
#else
    std::cerr << "VIENNACL IS NOT DEFINED" << std::endl;
    return;
#endif
  }
Example #20
0
	void color(bool useConflictList, bool serialConflictResolution, bool ticToc){
		Ordinal numUncolored = _size; // on host
		double t, total = 0.0;
		Kokkos::Impl::Timer timer;

		if(useConflictList)
			_conflictType = CONFLICT_LIST;

		// While vertices to color, do speculative coloring.
		int iter = 0;
		for(iter = 0; (iter<20) && (numUncolored>0); iter++){
			std::cout<< "Start iteration " << iter << std::endl;

			// First color greedy speculatively, some conflicts expected
			this -> colorGreedy();
			ExecSpace::fence();
			if(ticToc){
				t = timer.seconds();
				total += t;
				std::cout << "Time speculative greedy phase " << iter << " : " << std::endl;
				timer.reset();
			}

#ifdef DEBUG
			// UVM required - will be slow!
			printf("\n 100 first vertices: ");
			for(int i = 0; i < 100; i++){
				printf(" %i", _colors[i]);
			}
			printf("\n");
#endif

			// Check for conflicts (parallel), find vertices to recolor
			numUncolored = this -> findConflicts();

			ExecSpace::fence();
			if(ticToc){
			t = timer.seconds();
			total += t;
			std::cout << "Time conflict detection " << iter << " : " << t << std::endl;
			timer.reset();
			}
			if (serialConflictResolution) break; // Break after first iteration
/*			if(_conflictType == CONFLICT_LIST){
				array_type temp = _vertexList;
				_vertexList = _recolorList;
				_vertexListLength() = _recolorListLength();
				_recolorList = temp;
				_recolorListLength() = 0;
			}
*/			if(_conflictType == CONFLICT_LIST){
				array_type temp = _vertexList;
				_vertexList = _recolorList;
				host_vertexListLength() = host_recolorListLength();
				_recolorList = temp;
				host_recolorListLength() = 0;
				Kokkos::deep_copy(_vertexListLength, host_vertexListLength);
				Kokkos::deep_copy(_recolorListLength, host_recolorListLength);
			}
		}

		std::cout << "Number of coloring iterations: " << iter << std::endl;

		if(numUncolored > 0){
			// Resolve conflicts by recolor in serial
			this -> resolveConflicts();
			ExecSpace::fence();
			if(ticToc){
				t = timer.seconds();
				total += t;
				std::cout << "Time conflict resolution: " << t << std::endl;
				std::cout << "Total time: " << total << std::endl;
			}
		}
	}
  KOKKOS_INLINE_FUNCTION
  int exampleCholPerformance(const string file_input,
                              const int treecut,
                              const int minblksize,
                              const int prunecut,
                              const int seed,
                              const int niter,
                              const int nthreads,
                              const int max_task_dependence,
                              const int team_size,
                              const int fill_level,
                              const int league_size,
                              const bool team_interface,
                              const bool skip_serial,
                              const bool mkl_interface,
                              const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
    typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

#ifdef HAVE_SHYLUTACHO_MKL 
    typedef typename CrsMatrixBaseType::value_type_array value_type_array;
#endif

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
    typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double 
      t_import = 0.0,
      t_reorder = 0.0,
      t_symbolic = 0.0,
      t_flat2hier = 0.0,
#ifdef HAVE_SHYLUTACHO_MKL 
      t_mkl_seq = 0.0,
#endif
      t_factor_seq = 0.0,
      t_factor_task = 0.0;
    const int start = -2;
    
    cout << "CholPerformance:: import input file = " << file_input << endl;        
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      t_import = timer.seconds();

      if (verbose)
        cout << AA << endl;
    }
    cout << "CholPerformance:: import input file::time = " << t_import << endl;

    cout << "CholPerformance:: reorder the matrix" << endl;        
    CrsMatrixBaseType PA("Permuted AA");
    CrsMatrixBaseType UU("UU");     // permuted base upper triangular matrix
    CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views

    {
      GraphHelperType S(AA, seed);
      {
        timer.reset();
        
        S.computeOrdering(treecut, minblksize);
        S.pruneTree(prunecut);
        
        PA.copy(S.PermVector(), S.InvPermVector(), AA);
        
        t_reorder = timer.seconds();

        if (verbose)
          cout << S << endl
               << PA << endl;
      }

      cout << "CholPerformance:: reorder the matrix::time = " << t_reorder << endl;            
      {
        SymbolicFactorHelperType F(PA, league_size);
        for (int i=start;i<niter;++i) {
          timer.reset();
        
          F.createNonZeroPattern(fill_level, Uplo::Upper, UU);

          // UU.copy(Uplo::Upper, PA);

          t_symbolic += timer.seconds() * (i>=0);
        }
        t_symbolic /= niter;

        cout << "CholPerformance:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl;

        if (verbose)
          cout << F << endl
               << UU << endl;
      }
      cout << "CholPerformance:: symbolic factorization::time = " << t_symbolic << endl;            
      {
        timer.reset();

        CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   S.TreeVector());
        
        for (ordinal_type k=0;k<HU.NumNonZeros();++k)
          HU.Value(k).fillRowViewArray();
        
        t_flat2hier = timer.seconds();
        
        cout << "CholPerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
      }
      cout << "CholPerformance:: construct hierarchical matrix::time = " << t_flat2hier << endl;            
    }

    // copy of UU
    CrsMatrixBaseType RR("RR");
    RR.copy(UU);

#ifdef HAVE_SHYLUTACHO_MKL
    if (!skip_serial && mkl_interface) {
      cout << "CholPerformance:: MKL factorize the matrix" << endl;
      CrsMatrixBaseType MM("MM");
      for (int i=start;i<niter;++i) {
        MM.copy(RR);
        MM.hermitianize(Uplo::Upper);

        MKL_INT  n  = static_cast<MKL_INT>(MM.NumRows());
        double  *a  = static_cast<double*>(MM.ValuePtr());
        MKL_INT *ia = static_cast<MKL_INT*>(MM.RowPtr());
        MKL_INT *ja = static_cast<MKL_INT*>(MM.ColPtr());

        // convert to 1-based matrix
        {
          for (ordinal_type k=0;k<(MM.NumRows()+1);++k)
            ++ia[k];
          
          for (size_type k=0;k<MM.NumNonZeros();++k)
            ++ja[k];
        }
        value_type_array mkl_result = value_type_array("mkl-ilu-values", MM.NumNonZeros());
        double  *bilu0 = static_cast<double*>(&mkl_result[0]);
        MKL_INT ipar[128];
        double dpar[128];
        MKL_INT ierr;
        
        // we provide ilu-k pattern 
        timer.reset();
        dcsrilu0(&n, a, ia, ja, bilu0, ipar, dpar, &ierr);
        t_mkl_seq += timer.seconds() * (i>=0) * 0.5;
        
        if (ierr != 0) 
          cout << " MKL Error = " << ierr << endl;
      }
      t_mkl_seq /= niter;
      cout << "CholPerformance:: MKL factorize the matrix::time = " << t_mkl_seq << endl;
    }
#endif

    if (!skip_serial) {
#ifdef __USE_FIXED_TEAM_SIZE__
      typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
      typename TaskFactoryType::policy_type policy(max_task_dependence, 1);
#endif
      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);

      CrsTaskViewType U(&UU);
      U.fillRowViewArray();

      cout << "CholPerformance:: Serial factorize the matrix" << endl;
      {
        for (int i=start;i<niter;++i) {
          UU.copy(RR);
          timer.reset();          
          // {
          //   auto future = TaskFactoryType::Policy().create(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
          //                                                  ::TaskFunctor<CrsTaskViewType>(U), 0);
          //   TaskFactoryType::Policy().spawn(future);
          //   Kokkos::Experimental::wait(TaskFactoryType::Policy());
          // }
          {
            Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
              ::invoke(TaskFactoryType::Policy(),
                                TaskFactoryType::Policy().member_single(),
                                U);
          }
          t_factor_seq += timer.seconds() * (i>=0);
        }
        t_factor_seq /= niter;

        if (verbose)
          cout << UU << endl;
      }
      cout << "CholPerformance:: Serial factorize the matrix::time = " << t_factor_seq << endl;
    }

//     {
// #ifdef __USE_FIXED_TEAM_SIZE__
//       typename TaskFactoryType::policy_type policy(max_task_dependence);
// #else
//       typename TaskFactoryType::policy_type policy(max_task_dependence, nthreads);
// #endif
//       TaskFactoryType::setPolicy(&policy);

//       CrsTaskViewType U(&UU);
//       U.fillRowViewArray();

//       cout << "CholPerformance:: Team factorize the matrix:: team_size = " << nthreads << endl;
//       {
//         timer.reset();
        
//         auto future = TaskFactoryType::Policy().create(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
//                                                        ::TaskFunctor<CrsTaskViewType>(U), 0);
//         TaskFactoryType::Policy().spawn(future);
//         Kokkos::Experimental::wait(TaskFactoryType::Policy());
        
//         t_factor_team = timer.seconds();
        
//         if (verbose)
//           cout << UU << endl;
//       }
//       cout << "CholPerformance:: Team factorize the matrix::time = " << t_factor_team << endl;
//     }

    {
#ifdef __USE_FIXED_TEAM_SIZE__
      typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
      typename TaskFactoryType::policy_type policy(max_task_dependence, team_size);
#endif
      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);
      
      cout << "CholPerformance:: ByBlocks factorize the matrix:: team_size = " << team_size << endl;
      CrsHierTaskViewType H(&HU);
      {
        for (int i=start;i<niter;++i) {
          UU.copy(RR);
          timer.reset();
          {
            auto future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks>::
                                                                TaskFunctor<CrsHierTaskViewType>(H), 0);
            TaskFactoryType::Policy().spawn(future);
            Kokkos::Experimental::wait(TaskFactoryType::Policy());
          }
          t_factor_task += timer.seconds() * (i>=0);
        }
        t_factor_task /= niter;

        if (verbose)
          cout << UU << endl;
      }  
      cout << "CholPerformance:: ByBlocks factorize the matrix::time = " << t_factor_task << endl;
    }

    if (!skip_serial) {
#ifdef HAVE_SHYLUTACHO_MKL
      cout << "CholPerformance:: mkl/chol scale [mkl/chol] = " << t_mkl_seq/t_factor_seq << endl;    
      cout << "CholPerformance:: mkl/task  scale [mkl/task]  = " << t_mkl_seq/t_factor_task << endl;    
#else
      cout << "CholPerformance:: task scale [seq/task] = " << t_factor_seq/t_factor_task << endl;    
#endif
      //cout << "CholPerformance:: team scale [seq/team] = " << t_factor_seq/t_factor_team << endl;    
    }

    return r_val;
  }
  KOKKOS_INLINE_FUNCTION
  int exampleICholUnblocked(const string file_input,
                            const int max_task_dependence,
                            const int team_size,
                            const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;

    typedef TaskTeamFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType>,
      Kokkos::Impl::TeamThreadRangeBoundariesStruct> TaskFactoryType;

    typedef ParallelFor ForType;
    
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    cout << "ICholUnblocked:: import input file = " << file_input << endl;        
    CrsMatrixBaseType AA("AA"), UU("UU");    
    {
      timer.reset();

      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);

      UU.copy(Uplo::Upper, AA);

      t = timer.seconds();

      if (verbose)
        cout << UU << endl;
    }
    cout << "ICholUnblocked:: import input file::time = " << t << endl;        

#ifdef __USE_FIXED_TEAM_SIZE__
    typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
    typename TaskFactoryType::policy_type policy(max_task_dependence, team_size);
#endif
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    cout << "ICholUnblocked:: factorize the matrix" << endl;
    CrsTaskViewType U(&UU);
    U.fillRowViewArray();
    {
      timer.reset();
    
      auto future = TaskFactoryType::Policy().create_team(IChol<Uplo::Upper,AlgoIChol::UnblockedOpt1>
                                                          ::TaskFunctor<ForType,CrsTaskViewType>(U), 0);
      TaskFactoryType::Policy().spawn(future);
      Kokkos::Experimental::wait(TaskFactoryType::Policy());

      t = timer.seconds();

      if (verbose)
        cout << UU << endl;
    }   
    cout << "ICholUnblocked:: factorize the matrix::time = " << t << endl; 
    
    return r_val;
  }
Example #23
0
  BASKER_INLINE
  int Basker<Int, Entry, Exe_Space>::factor_notoken(Int option)
  {

    //printf("factor no token called \n");

    gn = A.ncol;
    gm = A.nrow;
    BASKER_MATRIX ATEMP;

    //Kokkos::Impl::Timer tza;
    if(Options.btf == BASKER_TRUE)
      {
	//JDB: We can change this for the new inteface
	gn = A.ncol;
	gm = A.nrow;
	ATEMP = A;
	A = BTF_A; 
      }
    //printf("Switch time: %f \n", tza.seconds());

   

    //Spit into Domain and Sep
    //----------------------Domain-------------------------//
    #ifdef BASKER_KOKKOS

    //====TIMER==
    #ifdef BASKER_TIME
    Kokkos::Impl::Timer       timer;
    #endif
    //===TIMER===

    typedef Kokkos::TeamPolicy<Exe_Space>        TeamPolicy;

    if(btf_tabs_offset != 0)
      {

	if(Options.verbose == BASKER_TRUE)
	  {
	    printf("Factoring Dom num_threads: %d \n",
		   num_threads);
	  }


	Int domain_restart = 0;
	kokkos_nfactor_domain <Int,Entry,Exe_Space>
	  domain_nfactor(this);
	Kokkos::parallel_for(TeamPolicy(num_threads,1), 
			     domain_nfactor);
	Kokkos::fence();
    

    //=====Check for error======
    while(true)
      {
	INT_1DARRAY thread_start;
	MALLOC_INT_1DARRAY(thread_start, num_threads+1);
	init_value(thread_start, num_threads+1, 
		   (Int) BASKER_MAX_IDX);
	int nt = nfactor_domain_error(thread_start);
	if((nt == BASKER_SUCCESS) ||
	   (nt == BASKER_ERROR) ||
	   (domain_restart > BASKER_RESTART))
	  {
	    break;
	  }
	else
	  {
	    domain_restart++;
	    if(Options.verbose == BASKER_TRUE)
	      {
		printf("restart \n");
	      }
	    kokkos_nfactor_domain_remalloc <Int, Entry, Exe_Space>
	      diag_nfactor_remalloc(this, thread_start);
	    Kokkos::parallel_for(TeamPolicy(num_threads,1),
				 diag_nfactor_remalloc);
	    Kokkos::fence();
	  }
      }//end while

    //====TIMER===
    #ifdef BASKER_TIME
    printf("Time DOMAIN: %f \n", timer.seconds());
    timer.reset();
    #endif
    //====TIMER====
    

    #else// else basker_kokkos
    #pragma omp parallel
    {


    }//end omp parallel
    #endif //end basker_kokkos

      }
    //-------------------End--Domian--------------------------//
    
    //printVec("domperm.csc", gpermi, A.nrow);
   
    //---------------------------Sep--------------------------//
   
    if(btf_tabs_offset != 0)
      {
    //for(Int l=1; l<=4; l++)
    for(Int l=1; l <= tree.nlvls; l++)
      {

	//#ifdef BASKER_OLD_BARRIER
	//Int lthreads = pow(2,l);
	//Int lnteams = num_threads/lthreads;
	//#else
	Int lthreads = 1;
	Int lnteams = num_threads/lthreads;
	//#endif

	Int sep_restart = 0;


	if(Options.verbose == BASKER_TRUE)
	  {
	    printf("Factoring Sep num_threads: %d %d \n",
		   lnteams, lthreads);
	  }

	#ifdef BASKER_KOKKOS
	Kokkos::Impl::Timer  timer_inner_sep;
	#ifdef BASKER_NO_LAMBDA

	//kokkos_nfactor_sep <Int, Entry, Exe_Space> 
	//sep_nfactor(this, l);

	kokkos_nfactor_sep2 <Int, Entry, Exe_Space>
	  sep_nfactor(this,l);
	
	Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),
			     sep_nfactor);
	Kokkos::fence();

	//======Check for error=====
	while(true)
	  {
	    INT_1DARRAY thread_start;
	    MALLOC_INT_1DARRAY(thread_start, num_threads+1);
	    init_value(thread_start, num_threads+1,
		      (Int) BASKER_MAX_IDX);
	    int nt = nfactor_sep_error(thread_start);
	    if((nt == BASKER_SUCCESS)||
	       (nt == BASKER_ERROR) ||
	       (sep_restart > BASKER_RESTART))
	      {
		FREE_INT_1DARRAY(thread_start);
		break;
	      }
	    else
	      {
		sep_restart++;
		if (Options.verbose == BASKER_TRUE)
		  {
		    printf("restart \n");
		  }
		Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),  sep_nfactor);
		Kokkos::fence();

	      }
	  }//end while-true


	#ifdef BASKER_TIME
	printf("Time INNERSEP: %d %f \n", 
	       l, timer_inner_sep.seconds());
	#endif
        #else //ELSE BASKER_NO_LAMBDA
	//Note: to be added
        #endif //end BASKER_NO_LAMBDA
	#else
	#pragma omp parallel
	{

	}//end omp parallel
	#endif
      }//end over each level

    #ifdef BASKER_TIME
    printf("Time SEP: %f \n", timer.seconds());
    #endif
      }
    
    //-------------------------End Sep----------------//


    //-------------------IF BTF-----------------------//
    if(Options.btf == BASKER_TRUE)
      {

	Int btf_restart = 0;

	if(Options.verbose == BASKER_TRUE)
	  {
	    printf("Factoring BLKs num_threads: %d \n",
		   num_threads);
	  }

	//=====Timer
	#ifdef BASKER_TIME
	Kokkos::Impl::Timer  timer_btf;
	#endif
	//====Timer
	
	//======Call diag factor====
	kokkos_nfactor_diag <Int, Entry, Exe_Space> 
	  diag_nfactor(this);
	Kokkos::parallel_for(TeamPolicy(num_threads,1),
			     diag_nfactor);
	Kokkos::fence();
	
	//=====Check for error======
	while(true)
	  {
	    INT_1DARRAY thread_start;
	    MALLOC_INT_1DARRAY(thread_start, num_threads+1);
	    init_value(thread_start, num_threads+1, 
		       (Int) BASKER_MAX_IDX);
	    int nt = nfactor_diag_error(thread_start);
	    //printf("RETURNED: %d \n", nt);
	    if((nt == BASKER_SUCCESS) || 
	       (nt == BASKER_ERROR) ||
	       (btf_restart > BASKER_RESTART))
	      {
		break;
	      }
	    else
	      {
		btf_restart++;
		if (Options.verbose == BASKER_TRUE)
		  {
		    printf("restart \n");
		  }
		kokkos_nfactor_diag_remalloc <Int, Entry, Exe_Space>
		  diag_nfactor_remalloc(this, thread_start);
		Kokkos::parallel_for(TeamPolicy(num_threads,1),
				     diag_nfactor_remalloc);
		Kokkos::fence();
	      }
	  }//end while

	//====TIMER
	#ifdef BASKER_TIME
	printf("Time BTF: %f \n", 
	       timer_btf.seconds());
	#endif
	//===TIMER

      }//end btf call

    Kokkos::Impl::Timer tzback;
     if(Options.btf == BASKER_TRUE)
      {
	A = ATEMP;
      }
     //printf("Switch back: %f \n",
     //	    tzback.seconds());
    
    return 0;
  }//end factor_notoken()
int exampleDenseMatrixBase(const ordinal_type mmin,
                           const ordinal_type mmax,
                           const ordinal_type minc,
                           const bool verbose) {
    typedef typename
    Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  ";
    DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";
    HostSpaceType::print_configuration(std::cout, detail);
    std::cout << std::endl;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType>   DenseMatrixBaseHostType;
    typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;

    std::cout << "DenseMatrixBase:: test matrices "
              <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << std::endl;

    for (auto m=mmin; m<=mmax; m+=minc) {
        // random test matrix on host
        DenseMatrixBaseHostType TT("TT", m, m);
        for (ordinal_type j=0; j<TT.NumCols(); ++j) {
            for (ordinal_type i=0; i<TT.NumRows(); ++i)
                TT.Value(i,j) = 2.0*((value_type)std::rand()/(RAND_MAX)) - 1.0;
            TT.Value(j,j) = std::fabs(TT.Value(j,j));
        }
        if (verbose)
            TT.showMe(std::cout) << std::endl;

        DenseMatrixBaseDeviceType AA("AA");

        timer.reset();
        AA.mirror(TT);
        double t_mirror = timer.seconds();

        DenseMatrixBaseDeviceType BB("BB");
        BB.createConfTo(AA);

        timer.reset();
        DenseMatrixTools::copy(BB, AA);
        double t_copy = timer.seconds();

        // check
        DenseMatrixBaseHostType RR("RR");
        RR.createConfTo(BB);
        RR.mirror(BB);
        if (verbose)
            RR.showMe(std::cout) << std::endl;

        double err = 0.0;
        for (ordinal_type j=0; j<TT.NumCols(); ++j)
            for (ordinal_type i=0; i<TT.NumRows(); ++i)
                err += std::fabs(TT.Value(i,j) - RR.Value(i,j));

        {
            const auto prec = std::cout.precision();
            std::cout.precision(4);

            std::cout << std::scientific
                      << "DenseMatrixBase:: dimension = " << m << " x " << m << ", "
                      << "Mirroring to device  = " << t_mirror << " [sec], "
                      << "Elementwise copy on device = " << t_copy << " [sec], "
                      << "Error = " << err
                      << std::endl;

            std::cout.unsetf(std::ios::scientific);
            std::cout.precision(prec);
        }
    }

    return r_val;
}
Example #25
0
void VerletKokkos::run(int n)
{
  bigint ntimestep;
  int nflag,sortflag;

  int n_post_integrate = modify->n_post_integrate;
  int n_pre_exchange = modify->n_pre_exchange;
  int n_pre_neighbor = modify->n_pre_neighbor;
  int n_pre_force = modify->n_pre_force;
  int n_post_force = modify->n_post_force;
  int n_end_of_step = modify->n_end_of_step;

  if (atomKK->sortfreq > 0) sortflag = 1;
  else sortflag = 0;

  static double time = 0.0;
  static int count = 0;
  atomKK->sync(Device,ALL_MASK);
  Kokkos::Impl::Timer ktimer;

  for (int i = 0; i < n; i++) {

    ntimestep = ++update->ntimestep;
    ev_set(ntimestep);

    // initial time integration

    ktimer.reset();
    timer->stamp();
    modify->initial_integrate(vflag);
    time += ktimer.seconds();
    if (n_post_integrate) modify->post_integrate();
    timer->stamp(Timer::MODIFY);

    // regular communication vs neighbor list rebuild

    nflag = neighbor->decide();

    if (nflag == 0) {
      timer->stamp();
      comm->forward_comm();
      timer->stamp(Timer::COMM);
    } else {
      // added debug
      //atomKK->sync(Host,ALL_MASK);
      //atomKK->modified(Host,ALL_MASK);

      if (n_pre_exchange) {
        timer->stamp();
        modify->pre_exchange();
        timer->stamp(Timer::MODIFY);
      }
      // debug
      //atomKK->sync(Host,ALL_MASK);
      //atomKK->modified(Host,ALL_MASK);
      if (triclinic) domain->x2lamda(atomKK->nlocal);
      domain->pbc();
      if (domain->box_change) {
        domain->reset_box();
        comm->setup();
        if (neighbor->style) neighbor->setup_bins();
      }
      timer->stamp();

      // added debug
      //atomKK->sync(Device,ALL_MASK);
      //atomKK->modified(Device,ALL_MASK);

      comm->exchange();
      if (sortflag && ntimestep >= atomKK->nextsort) atomKK->sort();
      comm->borders();

      // added debug
      //atomKK->sync(Host,ALL_MASK);
      //atomKK->modified(Host,ALL_MASK);

      if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost);

      timer->stamp(Timer::COMM);
      if (n_pre_neighbor) {
        modify->pre_neighbor();
        timer->stamp(Timer::MODIFY);
      }
      neighbor->build();
      timer->stamp(Timer::NEIGH);
    }

    // force computations
    // important for pair to come before bonded contributions
    // since some bonded potentials tally pairwise energy/virial
    // and Pair:ev_tally() needs to be called before any tallying

    force_clear();

    timer->stamp();

    // added for debug
    //atomKK->k_x.sync<LMPHostType>();
    //atomKK->k_f.sync<LMPHostType>();
    //atomKK->k_f.modify<LMPHostType>();
    if (n_pre_force) {
      modify->pre_force(vflag);
      timer->stamp(Timer::MODIFY);
    }


    if (pair_compute_flag) {
      atomKK->sync(force->pair->execution_space,force->pair->datamask_read);
      atomKK->modified(force->pair->execution_space,force->pair->datamask_modify);
      force->pair->compute(eflag,vflag);
      timer->stamp(Timer::PAIR);
    }

    if (atomKK->molecular) {
      if (force->bond) {
        atomKK->sync(force->bond->execution_space,force->bond->datamask_read);
        atomKK->modified(force->bond->execution_space,force->bond->datamask_modify);
        force->bond->compute(eflag,vflag);
      }
      if (force->angle) {
        atomKK->sync(force->angle->execution_space,force->angle->datamask_read);
        atomKK->modified(force->angle->execution_space,force->angle->datamask_modify);
        force->angle->compute(eflag,vflag);
      }
      if (force->dihedral) {
        atomKK->sync(force->dihedral->execution_space,force->dihedral->datamask_read);
        atomKK->modified(force->dihedral->execution_space,force->dihedral->datamask_modify);
        force->dihedral->compute(eflag,vflag);
      }
      if (force->improper) {
        atomKK->sync(force->improper->execution_space,force->improper->datamask_read);
        atomKK->modified(force->improper->execution_space,force->improper->datamask_modify);
        force->improper->compute(eflag,vflag);
      }
      timer->stamp(Timer::BOND);
    }

    if (kspace_compute_flag) {
      atomKK->sync(force->kspace->execution_space,force->kspace->datamask_read);
      atomKK->modified(force->kspace->execution_space,force->kspace->datamask_modify);
      force->kspace->compute(eflag,vflag);
      timer->stamp(Timer::KSPACE);
    }

    // reverse communication of forces

    if (force->newton) comm->reverse_comm();
    timer->stamp(Timer::COMM);

    // force modifications, final time integration, diagnostics

    ktimer.reset();

    if (n_post_force) modify->post_force(vflag);
    modify->final_integrate();
    if (n_end_of_step) modify->end_of_step();
    timer->stamp(Timer::MODIFY);

    time += ktimer.seconds();

    // all output

    if (ntimestep == output->next) {
       atomKK->sync(Host,ALL_MASK);

      timer->stamp();
      output->write(ntimestep);
      timer->stamp(Timer::OUTPUT);
    }
  }
}
  KOKKOS_INLINE_FUNCTION
  int exampleKokkosTaskData(const int ntasks,
                            const int max_task_dependence,
                            const int team_size, 
                            const bool verbose) {

    typedef Kokkos::Experimental::TaskPolicy<SpaceType> policy_type ;
    typedef SimpleTask<policy_type> simple_task_type;

    typedef Kokkos::Experimental::Future<typename simple_task_type::value_type,SpaceType> future_type ;
    
    policy_type policy;

    Kokkos::Impl::Timer timer;

    for (int use_barrier=0;use_barrier<2;++use_barrier) {
      cout << "KokkosTaskData:: use barrier " << (use_barrier ? "yes" : "no") << endl;
      {
        timer.reset();
        
        future_type f = policy.create(simple_task_type(use_barrier), max_task_dependence);
        policy.spawn(f);
        
        Kokkos::Experimental::wait( policy );
        
        const double t = timer.seconds();
        cout << "KokkosTaskData:: single task is spawned :: time = " << t << endl;
      }
      
      {
        timer.reset();
        
        future_type f = policy.create_team(simple_task_type(use_barrier), max_task_dependence);
        policy.spawn(f);
        
        Kokkos::Experimental::wait( policy );
        
        const double t = timer.seconds();
        cout << "KokkosTaskData:: single team task is spawned :: time = " << t << endl;
      }
      
      {
        timer.reset();
        
        future_type f[MAXTASKS];
        for (int i=0;i<ntasks;++i) {
          f[i] = policy.create(simple_task_type(use_barrier), max_task_dependence);
          policy.spawn(f[i]);
        }
        Kokkos::Experimental::wait( policy );
      
        const double t = timer.seconds();
        cout << "KokkosTaskData:: " << ntasks << " tasks are spawned :: time = " << t << endl;
      }
      
      {
        timer.reset();
        
        future_type f[MAXTASKS];
        for (int i=0;i<ntasks;++i) {
          f[i] = policy.create_team(simple_task_type(use_barrier), max_task_dependence);
          policy.spawn(f[i]);
        }
        
        Kokkos::Experimental::wait( policy );
        
        const double t = timer.seconds();
        cout << "KokkosTaskData:: " << ntasks << " team tasks are spawned :: time = " << t << endl;
      }
    }
    return 0;
  }
void graph_color_symbolic(
    KernelHandle *handle,
    typename KernelHandle::row_lno_t num_rows,
    typename KernelHandle::row_lno_t num_cols,
    lno_row_view_t_ row_map,
    lno_nnz_view_t_ entries,
    bool is_symmetric = true){

  Kokkos::Impl::Timer timer;

  typename KernelHandle::GraphColoringHandleType *gch = handle->get_graph_coloring_handle();

  ColoringAlgorithm algorithm = gch->get_coloring_type();

  typedef typename KernelHandle::GraphColoringHandleType::color_view_t color_view_type;

  color_view_type colors_out = color_view_type("Graph Colors", num_rows);


  typedef typename Impl::GraphColor
      <typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, lno_nnz_view_t_> BaseGraphColoring;
  BaseGraphColoring *gc = NULL;


  switch (algorithm){
  case COLORING_SERIAL:

    gc = new BaseGraphColoring(
        num_rows, entries.dimension_0(),
        row_map, entries, gch);
    break;
  case COLORING_VB:
  case COLORING_VBBIT:
  case COLORING_VBCS:

    typedef typename Impl::GraphColor_VB
        <typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, lno_nnz_view_t_> VBGraphColoring;
    gc = new VBGraphColoring(
        num_rows, entries.dimension_0(),
        row_map, entries, gch);
    break;
  case COLORING_EB:

    typedef typename Impl::GraphColor_EB
        <typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, lno_nnz_view_t_> EBGraphColoring;

    gc = new EBGraphColoring(num_rows, entries.dimension_0(),row_map, entries, gch);
    break;
  case COLORING_DEFAULT:
    break;

  }

  int num_phases = 0;
  gc->color_graph(colors_out, num_phases);
  delete gc;
  double coloring_time = timer.seconds();
  gch->add_to_overall_coloring_time(coloring_time);
  gch->set_coloring_time(coloring_time);
  gch->set_num_phases(num_phases);
  gch->set_vertex_colors(colors_out);
}
Example #28
0
void driver_modified_gram_schmidt
<
#if defined( __CUDACC__ )
Kokkos::Cuda
#else
Kokkos::Threads
#endif
>
  ( const int length_begin ,
    const int length_end ,
    const int count ,
    const int iter ,
    comm::Machine machine )
{
#if defined( __CUDACC__ )
  typedef Kokkos::Cuda Device ;
#else
  typedef Kokkos::Threads Device ;
#endif

  const int comm_size = comm::size( machine );
  const int comm_rank = comm::rank( machine );

  if ( comm_rank == 0 ) {

    std::cout << ( Kokkos::Impl::is_same<Device,Kokkos::Cuda>::value ?
                   "\"Cuda\"" : "\"Threads\"" )
              << " , \"Double Precision\""
              << std::endl ;

    std::cout << "\"Length\" , \"Count\" , \"millisec\""
              << " , \"Gflops\" , \"Greads\" , \"Gwrites\""
              << " ,   \"Gflops/s\" , \"Read GB/s\" , \"Write GB/s\""
              << std::endl ;
  }

  for ( int length = length_begin ; length < length_end ; length *= 2 ) {

    const ModifedGramSchmidCounts counts( length , count );

    const int local_length_upper = ( length + comm_size - 1 ) / comm_size ;
    const int local_begin  = std::min( length , local_length_upper * comm_rank );
    const int local_next   = std::min( length , local_length_upper * ( comm_rank + 1 ) );
    const int local_length = local_next - local_begin ;

    typedef Kokkos::View< double ** ,
                               Kokkos::LayoutLeft ,
                               Device > matrix_double_type ;

    const matrix_double_type Q( "Q" , local_length , count );
    const matrix_double_type R( "R" , count , count );

    const matrix_double_type::HostMirror hQ =
      Kokkos::create_mirror_view( Q );

    for ( int j = 0 ; j < count ; ++j ) {
      for ( int i = 0 ; i < local_length ; ++i ) {
        hQ(i,j) = ( i + 1 ) * ( j + 1 );
      }
    }

    double dt_min = 0 ;

    for ( int j = 0 ; j < iter ; ++j ) {
      Kokkos::deep_copy( Q , hQ );

      Kokkos::Impl::Timer timer ;

      modified_gram_schmidt( Q , R , machine );

      const double dt = comm::max( machine , timer.seconds() );

      if ( 0 == j || dt < dt_min ) dt_min = dt ;
    }

    if ( 0 == comm_rank ) {

      const double milli_sec   = dt_min * 1.0e3 ;
      const double giga_flops  = ( counts.flops / dt_min ) / 1.0e9 ;
      const double GB_reads  = ( counts.reads * sizeof(double) ) / ( dt_min * 1.0e9 );
      const double GB_writes = ( counts.writes * sizeof(double) ) / ( dt_min * 1.0e9 );

      std::cout << length     << " , "
                << count      << " , "
                << milli_sec  << " , "
                << double(counts.flops) / 1.0e9 << " , "
                << double(counts.reads) / 1.0e9 << " , "
                << double(counts.writes) / 1.0e9 << " ,   "
                << giga_flops << " , "
                << GB_reads << " , "
                << GB_writes
                << std::endl ;
    }
  }
}
  KOKKOS_INLINE_FUNCTION
  int exampleCholDirectPlain(const string file_input,
                             const int prunecut,
                             const int seed,
                             const int nrhs,
                             const int nb, 
                             const int nthreads,
                             const int max_task_dependence,
                             const int team_size,
                             const int league_size,
                             const bool team_interface,
                             const bool serial,
                             const bool solve,
                             const bool check,
                             const bool verbose) {
    typedef ValueType   value_type;
    typedef OrdinalType ordinal_type;
    typedef SizeType    size_type;

    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
    typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
    typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

    typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
    
    typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

    typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
    typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

    typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType;
    typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType;

    typedef DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> DenseHierMatrixBaseType;
    
    typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType;
    typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double 
      t_import = 0.0,
      t_reorder = 0.0,
      t_symbolic = 0.0,
      t_flat2hier = 0.0,
      t_factor = 0.0, 
      t_solve = 0.0;
    
    cout << "CholDirectPlain:: import input file = " << file_input << endl;        
    CrsMatrixBaseType AA("AA");
    {
      timer.reset();
      ifstream in;
      in.open(file_input);
      if (!in.good()) {
        cout << "Failed in open the file: " << file_input << endl;
        return ++r_val;
      }
      AA.importMatrixMarket(in);
      t_import = timer.seconds();
    }
    cout << "CholDirectPlain:: import input file::time = " << t_import << endl;

    // matrix A and its upper triangular factors U
    CrsMatrixBaseType PA("Permuted AA");
    CrsMatrixBaseType UU("UU"); // permuted base upper triangular matrix
    CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views

    // right hand side and solution matrix
    DenseMatrixBaseType BB("BB", AA.NumRows(), nrhs), XX("XX", AA.NumRows(), nrhs);
    DenseHierMatrixBaseType HB("HB"), HX("HX");

    {
      cout << "CholDirectPlain:: reorder the matrix" << endl;        
      GraphHelperType S(AA, seed);
      {
        timer.reset();
        S.computeOrdering();
        S.pruneTree(prunecut);
        PA.copy(S.PermVector(), S.InvPermVector(), AA);
        t_reorder = timer.seconds();
      }
      cout << "CholDirectPlain:: reorder the matrix::time = " << t_reorder << endl;            

      {
        SymbolicFactorHelperType F(PA, league_size);
        timer.reset();
        F.createNonZeroPattern(Uplo::Upper, UU);
        t_symbolic = timer.seconds();
        cout << "CholDirectPlain:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl;
      }
      cout << "CholDirectPlain:: symbolic factorization::time = " << t_symbolic << endl;            

      {
        timer.reset();
        CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   S.TreeVector());

        // fill internal meta data for sparse blocs
        for (ordinal_type k=0;k<HU.NumNonZeros();++k)
          HU.Value(k).fillRowViewArray();

        DenseMatrixHelper::flat2hier(BB, HB,
                                     S.NumBlocks(),
                                     S.RangeVector(),
                                     nb);        

        DenseMatrixHelper::flat2hier(XX, HX,
                                     S.NumBlocks(),
                                     S.RangeVector(),
                                     nb);        
        t_flat2hier = timer.seconds();
        cout << "CholDirectPlain:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
      }
      cout << "CholDirectPlain:: construct hierarchical matrix::time = " << t_flat2hier << endl;            
    }


    {
      // Policy setup
#ifdef __USE_FIXED_TEAM_SIZE__ 
      typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
      typename TaskFactoryType::policy_type policy(max_task_dependence, 1);
#endif
      TaskFactoryType::setUseTeamInterface(team_interface);
      TaskFactoryType::setMaxTaskDependence(max_task_dependence);
      TaskFactoryType::setPolicy(&policy);
      
      CrsTaskViewType A(&PA), U(&UU);
      DenseTaskViewType X(&XX), B(&BB);
      
      A.fillRowViewArray();      
      U.fillRowViewArray();      

      CrsHierTaskViewType TU(&HU);
      DenseHierTaskViewType TB(&HB), TX(&HX);

      {
        // Manufacture B = AX
        const int m = A.NumRows();
        for (int j=0;j<nrhs;++j)
          for (int i=0;i<m;++i)
            X.Value(i,j) = (j+1);
        
        Gemm<Trans::NoTranspose,Trans::NoTranspose,AlgoGemm::ForTriSolveBlocked>
          ::invoke(TaskFactoryType::Policy(),
                   TaskFactoryType::Policy().member_single(),
                   1.0, A, X, 0.0, B);
        XX.copy(BB);        
      }
      
      if (serial) {
        cout << "CholDirectPlain:: Serial factorize the matrix" << endl;
        timer.reset();          
        Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
          ::invoke(TaskFactoryType::Policy(),
                   TaskFactoryType::Policy().member_single(),
                   U);
        t_factor = timer.seconds();
        cout << "CholDirectPlain:: Serial factorize the matrix::time = " << t_factor << endl;
      } else {
        cout << "CholDirectPlain:: ByBlocks factorize the matrix:: team_size = " << team_size << endl;
        timer.reset();    
        auto future = TaskFactoryType::Policy().create_team
          (Chol<Uplo::Upper,AlgoChol::ByBlocks>
           ::TaskFunctor<CrsHierTaskViewType>(TU), 0);
        TaskFactoryType::Policy().spawn(future);
        Kokkos::Experimental::wait(TaskFactoryType::Policy());
        t_factor = timer.seconds();
        cout << "CholDirectPlain:: ByBlocks factorize the matrix::time = " << t_factor << endl;
      }

      if (solve) {
        if (serial) {
          cout << "CholDirectPlain:: Serial forward/backward solve" << endl;
          timer.reset();          
          TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
            ::invoke(TaskFactoryType::Policy(),
                     TaskFactoryType::Policy().member_single(),
                     Diag::NonUnit, U, X);
          TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
            ::invoke(TaskFactoryType::Policy(),
                     TaskFactoryType::Policy().member_single(),
                     Diag::NonUnit, U, X);
          t_solve = timer.seconds();
          cout << "CholDirectPlain:: Serial forward/backward solve::time = " << t_solve << endl;
        } else {
          cout << "CholDirectPlain:: ByBlocks forward/backward solve" << endl;
          timer.reset();          
          auto future_forward_solve = TaskFactoryType::Policy().create_team
            (TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::ByBlocks>
             ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
             (Diag::NonUnit, TU, TX), 0);
          
          TaskFactoryType::Policy().spawn(future_forward_solve);
          
          auto future_backward_solve = TaskFactoryType::Policy().create_team
            (TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::ByBlocks>
             ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
             (Diag::NonUnit, TU, TX), 1);
          
          TaskFactoryType::Policy().add_dependence(future_backward_solve, future_forward_solve);
          TaskFactoryType::Policy().spawn(future_backward_solve);
          
          Kokkos::Experimental::wait(TaskFactoryType::Policy());
          t_solve = timer.seconds();
          cout << "CholDirectPlain:: ByBlocks forward/backward solve::time = " << t_solve << endl;
        }
      }

      if (solve && check) {
        // Check manufactured solution
        double l2 = 0.0, linf = 0.0;
        const int m = A.NumRows();
        for (int j=0;j<nrhs;++j)
          for (int i=0;i<m;++i) {
            double diff = abs(X.Value(i,j) - (j+1));
            l2 += diff*diff;
            linf = max(diff, linf);
          }
        l2 = sqrt(l2);
        cout << "CholDirectPlain:: Check solution::L2 = " << l2 << ", Linf = " << linf << endl;
      }
    }

    return r_val;
  }
  int exampleDenseGemmByBlocks(const ordinal_type mmin,
                               const ordinal_type mmax,
                               const ordinal_type minc,
                               const ordinal_type k,
                               const ordinal_type mb,
                               const int max_concurrency,
                               const int max_task_dependence,
                               const int team_size,
                               const int mkl_nthreads,
                               const bool check,
                               const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType;
    typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType;
    typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType;
    typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType;

    typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType;

    typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType;
    typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    std::cout << "DenseGemmByBlocks:: test matrices "
              <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc 
              << " , k = "<< k << " , mb = " << mb << std::endl;

    const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); 
    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);

    std::ostringstream os;
    os.precision(3);
    os << std::scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");
      
      // host matrices
      DenseMatrixBaseHostType AA_host, BB_host, CC_host("CC_host", m, m), CB_host("CB_host", m, m);
      {
        if (ArgTransA == Trans::NoTranspose) 
          AA_host = DenseMatrixBaseHostType("AA_host", m, k); 
        else 
          AA_host = DenseMatrixBaseHostType("AA_host", k, m);
        
        if (ArgTransB == Trans::NoTranspose) 
          BB_host = DenseMatrixBaseHostType("BB_host", k, m);
        else 
          BB_host = DenseMatrixBaseHostType("BB_host", m, k);
        
        for (ordinal_type j=0;j<AA_host.NumCols();++j)
          for (ordinal_type i=0;i<AA_host.NumRows();++i)
            AA_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        
        for (ordinal_type j=0;j<BB_host.NumCols();++j)
          for (ordinal_type i=0;i<BB_host.NumRows();++i)
            BB_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        
        for (ordinal_type j=0;j<CC_host.NumCols();++j)
          for (ordinal_type i=0;i<CC_host.NumRows();++i)
            CC_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        
        DenseMatrixTools::copy(CB_host, CC_host);
      }

      const double flop = DenseFlopCount<value_type>::Gemm(m, m, k);

#ifdef HAVE_SHYLUTACHO_MKL
      mkl_set_num_threads(mkl_nthreads);
#endif

      os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k << "  ";
      if (check) {
        timer.reset();
        DenseMatrixViewHostType A_host(AA_host), B_host(BB_host), C_host(CB_host);
        Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas,Variant::One>::invoke
          (policy, policy.member_single(),
           1.0, A_host, B_host, 1.0, C_host);
        t = timer.seconds();
        os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      DenseMatrixBaseDeviceType AA_device("AA_device"), BB_device("BB_device"), CC_device("CC_device");
      {
        timer.reset();
        AA_device.mirror(AA_host);
        BB_device.mirror(BB_host);
        CC_device.mirror(CC_host);
        t = timer.seconds();
        os << ":: Mirror = " << t << " [sec]  ";
      }

      {
        DenseHierMatrixBaseDeviceType HA_device("HA_device"), HB_device("HB_device"), HC_device("HC_device");

        DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb);        
        DenseMatrixTools::createHierMatrix(HB_device, BB_device, mb, mb);        
        DenseMatrixTools::createHierMatrix(HC_device, CC_device, mb, mb);        

        DenseHierTaskViewDeviceType TA_device(HA_device), TB_device(HB_device), TC_device(HC_device);
        timer.reset();
        auto future = policy.proc_create_team
          (Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,ArgVariant>
           ::createTaskFunctor(policy, 1.0, TA_device, TB_device, 1.0, TC_device),
           0);
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        t = timer.seconds();       
        os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      } 

      CC_host.mirror(CC_device);
      if (check) {
        double err = 0.0, norm = 0.0;
        for (ordinal_type j=0;j<CC_host.NumCols();++j)
          for (ordinal_type i=0;i<CC_host.NumRows();++i) {
            const double diff = abs(CC_host.Value(i,j) - CB_host.Value(i,j));
            const double val  = CB_host.Value(i,j);
            err  += diff*diff;
            norm += val*val;
          }
        os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err);
      }
      std::cout << os.str() << std::endl;
    }

    return r_val;
  }