void Loop(int loop, int test, const char* type_name) { LoopVariant<T>(loop,test); Kokkos::Impl::Timer timer; T res = LoopVariant<T>(loop,test); double time = timer.seconds(); timer.reset(); T resNonAtomic = LoopVariantNonAtomic<T>(loop,test); double timeNonAtomic = timer.seconds(); timer.reset(); T resSerial = LoopVariantSerial<T>(loop,test); double timeSerial = timer.seconds(); time *=1e6/loop; timeNonAtomic*=1e6/loop; timeSerial *=1e6/loop; //textcolor_standard(); bool passed = true; if(resSerial!=res) passed = false; //if(!passed) textcolor(RESET,BLACK,YELLOW); printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)", type_name,test,passed?"PASSED":"FAILED",loop, 1.0*resSerial,1.0*res,1.0*resNonAtomic, timeSerial,time,timeNonAtomic,(int)sizeof(T)); //if(!passed) textcolor_standard(); printf("\n"); }
int main(int narg, char* args[]) { Kokkos::initialize(narg,args); int chunk_size = 1024; int nchunks = 100000; //1024*1024; Kokkos::DualView<int*> data("data",nchunks*chunk_size+1); srand(1231093); for(int i = 0; i < data.dimension_0(); i++) { data.h_view(i) = rand()%TS; } data.modify<Host>(); data.sync<Device>(); Kokkos::DualView<int**> histogram("histogram",TS,TS); Kokkos::Impl::Timer timer; // Threads/team (TS) is automically limited to the maximum supported by the device. Kokkos::parallel_for( team_policy( nchunks , TS ) , find_2_tuples(chunk_size,data,histogram) ); Kokkos::fence(); double time = timer.seconds(); histogram.sync<Host>(); printf("Time: %lf \n\n",time); int sum = 0; for(int k=0; k<TS; k++) { for(int l=0; l<TS; l++) { printf("%i ",histogram.h_view(k,l)); sum += histogram.h_view(k,l); } printf("\n"); } printf("Result: %i %i\n",sum,chunk_size*nchunks); Kokkos::finalize(); }
int main(int narg, char* args[]) { Kokkos::initialize(narg,args); int chunk_size = 1024; int nchunks = 100000; //1024*1024; Kokkos::DualView<int*> data("data",nchunks*chunk_size+1); srand(1231093); for(int i = 0; i < data.dimension_0(); i++) { data.h_view(i) = rand()%TS; } data.modify<Host>(); data.sync<Device>(); Kokkos::DualView<int**> histogram("histogram",TS,TS); Kokkos::Impl::Timer timer; Kokkos::parallel_for( Kokkos::ParallelWorkRequest(nchunks,TS<Device::team_max()?TS:Device::team_max()), find_2_tuples(chunk_size,data,histogram)); Kokkos::fence(); double time = timer.seconds(); histogram.sync<Host>(); printf("Time: %lf \n\n",time); int sum = 0; for(int k=0; k<TS; k++) { for(int l=0; l<TS; l++) { printf("%i ",histogram.h_view(k,l)); sum += histogram.h_view(k,l); } printf("\n"); } printf("Result: %i %i\n",sum,chunk_size*nchunks); Kokkos::finalize(); }
static double test( const int count , const int iter = 1 ) { elem_coord_type coord( "coord" , count ); elem_grad_type grad ( "grad" , count ); // Execute the parallel kernels on the arrays: double dt_min = 0 ; Kokkos::parallel_for( count , Init( coord ) ); device_type::fence(); for ( int i = 0 ; i < iter ; ++i ) { Kokkos::Impl::Timer timer ; Kokkos::parallel_for( count , HexGrad<device_type>( coord , grad ) ); device_type::fence(); const double dt = timer.seconds(); if ( 0 == i ) dt_min = dt ; else dt_min = dt < dt_min ? dt : dt_min ; } return dt_min ; }
int main (int argc, char ** argv){ if (argc < 2){ std::cerr << "Usage:" << argv[0] << " input_bin_file" << std::endl; exit(1); } Kokkos::initialize(argc, argv); MyExecSpace::print_configuration(std::cout); idx nv = 0, ne = 0; idx *xadj, *adj; wt *ew; KokkosKernels::Experimental::Graph::Utils::read_graph_bin<idx, wt> (&nv, &ne, &xadj, &adj, &ew, argv[1]); std::cout << "nv:" << nv << " ne:" << ne << std::endl; um_array_type _xadj (xadj, nv + 1); um_edge_array_type _adj (adj, ne); idx_array_type kok_xadj ("xadj", nv + 1); idx_edge_array_type kok_adj("adj", ne); idx_array_type sym_xadj; idx_edge_array_type sym_adj; Kokkos::deep_copy (kok_xadj, _xadj); Kokkos::deep_copy (kok_adj, _adj); wt_um_edge_array_type _mtx_vals (ew, ne); value_array_type kok_mtx_vals ("MTX_VALS", ne); Kokkos::deep_copy (kok_mtx_vals, _mtx_vals); delete [] xadj; delete [] adj; delete [] ew; std::cout << "Symetrizing Graph" << std::endl; Kokkos::Impl::Timer timer; KokkosKernels::Experimental::Util::symmetrize_graph_symbolic_hashmap< idx_array_type, idx_edge_array_type, idx_array_type, idx_edge_array_type, MyExecSpace> (nv, kok_xadj, kok_adj,sym_xadj, sym_adj); Kokkos::fence(); double t = timer.seconds(); std::cout << "Time to symmetrize:" << t << std::endl; KokkosKernels::Experimental::Util::print_1Dview(kok_xadj); KokkosKernels::Experimental::Util::print_1Dview(kok_adj); std::cout << "Symetric Graph" << std::endl; KokkosKernels::Experimental::Util::print_1Dview(sym_xadj); KokkosKernels::Experimental::Util::print_1Dview(sym_adj); Kokkos::finalize(); return 0; }
KOKKOS_INLINE_FUNCTION int exampleCholDirectPerformance(const string file_input, const int treecut, const int minblksize, const int prunecut, const int seed, const int niter, const int nthreads, const int max_task_dependence, const int team_size, const int league_size, const bool team_interface, const bool skip_serial, const bool mkl_interface, const bool verbose) { typedef ValueType value_type; typedef OrdinalType ordinal_type; typedef SizeType size_type; typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>, Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType; typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType; typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType; typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType; #ifdef HAVE_SHYLUTACHO_MKL typedef typename CrsMatrixBaseType::value_type_array value_type_array; #endif typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType; typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType; typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType; typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType; typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType; int r_val = 0; Kokkos::Impl::Timer timer; double t_import = 0.0, t_reorder = 0.0, t_symbolic = 0.0, t_flat2hier = 0.0, #ifdef HAVE_SHYLUTACHO_MKL t_mkl = 0.0, #endif t_factor_seq = 0.0, t_solve_seq = 0.0, t_factor_task = 0.0, t_solve_task = 0.0; const int start = 0; cout << "CholDirectPerformance:: import input file = " << file_input << endl; CrsMatrixBaseType AA("AA"); { timer.reset(); ifstream in; in.open(file_input); if (!in.good()) { cout << "Failed in open the file: " << file_input << endl; return ++r_val; } AA.importMatrixMarket(in); t_import = timer.seconds(); } cout << "CholDirectPerformance:: import input file::time = " << t_import << endl; cout << 
"CholDirectPerformance:: reorder the matrix" << endl; CrsMatrixBaseType PA("Permuted AA"); CrsMatrixBaseType UU("UU"); // permuted base upper triangular matrix CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views DenseMatrixBaseType BB("BB", AA.NumRows(), nrhs); DenseHierMatrixBaseType HB("HB"); { GraphHelperType S(AA, seed); { timer.reset(); S.computeOrdering(treecut, minblksize); S.pruneTree(prunecut); PA.copy(S.PermVector(), S.InvPermVector(), AA); t_reorder = timer.seconds(); } cout << "CholDirectPerformance:: reorder the matrix::time = " << t_reorder << endl; { SymbolicFactorHelperType F(PA, league_size); for (int i=start;i<niter;++i) { timer.reset(); F.createNonZeroPattern(Uplo::Upper, UU); t_symbolic += timer.seconds() * (i>=0); } t_symbolic /= niter; cout << "CholDirectPerformance:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl; } cout << "CholDirectPerformance:: symbolic factorization::time = " << t_symbolic << endl; { timer.reset(); CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU, S.NumBlocks(), S.RangeVector(), S.TreeVector()); for (ordinal_type k=0;k<HU.NumNonZeros();++k) HU.Value(k).fillRowViewArray(); DenseMatrixHelper::flat2hier(BB, HB, S.NumBlocks(), S.RangeVector(), nb); t_flat2hier = timer.seconds(); cout << "CholDirectPerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl; } cout << "CholDirectPerformance:: construct hierarchical matrix::time = " << t_flat2hier << endl; } // copy of UU CrsMatrixBaseType RR("RR"); RR.copy(UU); /////////////////////////// Serial Numeric Factorization if (!skip_serial) { #ifdef __USE_FIXED_TEAM_SIZE__ typename TaskFactoryType::policy_type policy(max_task_dependence); #else typename TaskFactoryType::policy_type policy(max_task_dependence, 1); #endif TaskFactoryType::setUseTeamInterface(team_interface); TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); CrsTaskViewType U(&UU); U.fillRowViewArray(); 
cout << "CholDirectPerformance:: Serial factorize the matrix" << endl; { for (int i=start;i<niter;++i) { UU.copy(RR); timer.reset(); { Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> ::invoke(TaskFactoryType::Policy(), TaskFactoryType::Policy().member_single(), U); } t_factor_seq += timer.seconds() * (i>=0); } t_factor_seq /= niter; } cout << "CholDirectPerformance:: Serial factorize the matrix::time = " << t_factor_seq << endl; cout << "CholDirectPerformance:: Serial forward/backward solve" << endl; { for (int i=start;i<niter;++i) { XX.copy(BB); timer.reset(); { TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked> ::invoke(TaskFactoryType::Policy(), TaskFactoryType::Policy().member_single(), Diag::NonUnit, U, X); TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked> ::invoke(TaskFactoryType::Policy(), TaskFactoryType::Policy().member_single(), Diag::NonUnit, U, X); } t_factor_seq += timer.seconds() * (i>=0); } t_factor_seq /= niter; } cout << "CholDirectPerformance:: Serial forward/backward solve::time = " << t_solve_seq << endl; } // if (!skip_serial) // cout << "CholDirectPerformance:: task scale [seq/task] = " << t_factor_seq/t_factor_task << endl; return r_val; }
// Benchmark driver for sparse triangular solves: imports a Matrix Market
// file, reorders it, builds flat and hierarchical upper-triangular matrices
// and a blocked right-hand side, then times
//   (1) the serial unblocked forward + backward solve (unless skip_serial), and
//   (2) the task-based ByBlocks forward + backward solve.
//
// Both timed loops start at a negative index; iterations with a negative
// index run the solve but contribute nothing to the accumulated time.
// Returns 0 on success, or 1 when the input file cannot be opened.
KOKKOS_INLINE_FUNCTION
int exampleTriSolvePerformance(const string file_input,
                               const OrdinalType nrhs,
                               const OrdinalType nb,
                               const int niter,
                               const int nthreads,
                               const int max_task_dependence,
                               const int team_size,
                               const bool team_interface,
                               const bool skip_serial,
                               const bool verbose) {
  using value_type   = ValueType;
  using ordinal_type = OrdinalType;
  using size_type    = SizeType;

  using TaskFactoryType = TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
                                      Kokkos::Experimental::Future<int,SpaceType> >;

  using CrsMatrixBaseType = CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits>;
  using GraphHelperType   = GraphHelper_Scotch<CrsMatrixBaseType>;

  using CrsMatrixViewType = CrsMatrixView<CrsMatrixBaseType>;
  using CrsTaskViewType   = TaskView<CrsMatrixViewType,TaskFactoryType>;

  using CrsHierMatrixBaseType = CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits>;
  using CrsHierMatrixViewType = CrsMatrixView<CrsHierMatrixBaseType>;
  using CrsHierTaskViewType   = TaskView<CrsHierMatrixViewType,TaskFactoryType>;

  using DenseMatrixBaseType = DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits>;
  using DenseMatrixViewType = DenseMatrixView<DenseMatrixBaseType>;
  using DenseTaskViewType   = TaskView<DenseMatrixViewType,TaskFactoryType>;

  using DenseHierMatrixBaseType = DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits>;
  using DenseHierMatrixViewType = DenseMatrixView<DenseHierMatrixBaseType>;
  using DenseHierTaskViewType   = TaskView<DenseHierMatrixViewType,TaskFactoryType>;

  int r_val = 0;

  Kokkos::Impl::Timer timer;
  double t_import = 0.0;
  double t_reorder = 0.0;
  double t_solve_seq = 0.0;
  double t_solve_task = 0.0;

  // Timed loops run from this index up to niter; negative indices are
  // excluded from the accumulated time.
  const int first_iter = -2;

  // ---- import -------------------------------------------------------------
  cout << "TriSolvePerformance:: import input file = " << file_input << endl;
  CrsMatrixBaseType AA("AA");
  {
    timer.reset();

    ifstream matrix_file;
    matrix_file.open(file_input);
    if (!matrix_file.good()) {
      cout << "Failed in open the file: " << file_input << endl;
      return ++r_val;
    }
    AA.importMatrixMarket(matrix_file);

    t_import = timer.seconds();

    if (verbose)
      cout << AA << endl;
  }
  cout << "TriSolvePerformance:: import input file::time = " << t_import << endl;

  CrsMatrixBaseType UU("UU");
  DenseMatrixBaseType BB("BB", AA.NumRows(), nrhs);

  // ---- reorder and partition ----------------------------------------------
  cout << "TriSolvePerformance:: reorder the matrix and partition right hand side, nb = " << nb << endl;
  CrsHierMatrixBaseType HU("HU");
  DenseHierMatrixBaseType HB("HB");
  {
    timer.reset();

    GraphHelperType ordering(AA);
    ordering.computeOrdering();

    CrsMatrixBaseType permuted("Permuted AA");
    permuted.copy(ordering.PermVector(), ordering.InvPermVector(), AA);

    UU.copy(Uplo::Upper, permuted);

    CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                               ordering.NumBlocks(),
                               ordering.RangeVector(),
                               ordering.TreeVector());

    DenseMatrixHelper::flat2hier(BB, HB,
                                 ordering.NumBlocks(),
                                 ordering.RangeVector(),
                                 nb);

    t_reorder = timer.seconds();

    cout << "TriSolvePerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;

    if (verbose)
      cout << UU << endl;
  }
  cout << "TriSolvePerformance:: reorder the matrix and partition right hand side::time = " << t_reorder << endl;

  // ---- task policy sizing -------------------------------------------------
  const size_t max_concurrency = 16384;
  cout << "TriSolvePerformance:: max concurrency = " << max_concurrency << endl;

  const size_t max_task_size = 3*sizeof(CrsTaskViewType)+128;
  cout << "TriSolvePerformance:: max task size = " << max_task_size << endl;

  // ---- serial reference solve ---------------------------------------------
  if (!skip_serial) {
    __INIT_DENSE_MATRIX__(BB, 1.0);

    typename TaskFactoryType::policy_type policy(max_concurrency, max_task_size, max_task_dependence, 1);

    TaskFactoryType::setUseTeamInterface(team_interface);
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    CrsTaskViewType U(&UU);
    DenseTaskViewType B(&BB);
    U.fillRowViewArray();

    cout << "TriSolvePerformance:: Serial forward and backward solve of the matrix" << endl;
    {
      for (int iter = first_iter; iter < niter; ++iter) {
        timer.reset();

        // Forward solve. Disabled alternative that spawns it as a team task:
        // {
        //   auto future = TaskFactoryType::Policy().create_team(TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
        //                                                       ::TaskFunctor<CrsTaskViewType,DenseTaskViewType>
        //                                                       (Diag::NonUnit, U, B), 0);
        //   TaskFactoryType::Policy().spawn(future);
        //   Kokkos::Experimental::wait(TaskFactoryType::Policy());
        // }
        {
          TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
            ::invoke(TaskFactoryType::Policy(),
                     TaskFactoryType::Policy().member_single(),
                     Diag::NonUnit, U, B);
        }

        // Backward solve. Disabled alternative that spawns it as a team task:
        // {
        //   auto future = TaskFactoryType::Policy().create_team(TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
        //                                                       ::TaskFunctor<CrsTaskViewType,DenseTaskViewType>
        //                                                       (Diag::NonUnit, U, B), 0);
        //   TaskFactoryType::Policy().spawn(future);
        //   Kokkos::Experimental::wait(TaskFactoryType::Policy());
        // }
        {
          TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
            ::invoke(TaskFactoryType::Policy(),
                     TaskFactoryType::Policy().member_single(),
                     Diag::NonUnit, U, B);
        }

        t_solve_seq += timer.seconds() * (iter >= 0);
      }
      t_solve_seq /= niter;

      if (verbose)
        cout << BB << endl;
    }
    cout << "TriSolvePerformance:: Serial forward and backward solve of the matrix::time = " << t_solve_seq << endl;
  }

  // ---- task-parallel ByBlocks solve ---------------------------------------
  {
    __INIT_DENSE_MATRIX__(BB, 1.0);

    typename TaskFactoryType::policy_type policy(max_concurrency, max_task_size, max_task_dependence, team_size);

    TaskFactoryType::setUseTeamInterface(team_interface);
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    // wrap the hierarchically partitioned matrix with task handler
    CrsHierTaskViewType TU(&HU);
    for (ordinal_type blk = 0; blk < HU.NumNonZeros(); ++blk)
      HU.Value(blk).fillRowViewArray();

    DenseHierTaskViewType TB(&HB);

    cout << "TriSolvePerformance:: ByBlocks forward and backward solve of the matrix" << endl;
    {
      for (int iter = first_iter; iter < niter; ++iter) {
        timer.reset();
        {
          auto forward_task = TaskFactoryType::Policy().create_team
            (TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::ByBlocks>
             ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
             (Diag::NonUnit, TU, TB), 0);
          TaskFactoryType::Policy().spawn(forward_task);

          // The backward solve is registered as dependent on the forward
          // solve before it is spawned.
          auto backward_task = TaskFactoryType::Policy().create_team
            (TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::ByBlocks>
             ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
             (Diag::NonUnit, TU, TB), 1);
          TaskFactoryType::Policy().add_dependence(backward_task, forward_task);
          TaskFactoryType::Policy().spawn(backward_task);

          Kokkos::Experimental::wait(TaskFactoryType::Policy());
        }
        t_solve_task += timer.seconds() * (iter >= 0);
      }
      t_solve_task /= niter;

      if (verbose)
        cout << BB << endl;
    }
    cout << "TriSolvePerformance:: ByBlocks forward and backward solve of the matrix::time = " << t_solve_task << endl;
  }

  if (!skip_serial) {
    cout << "TriSolvePerformance:: task scale [seq/task] = " << t_solve_seq/t_solve_task << endl;
  }

  return r_val;
}
// Benchmark driver for dense GEMM by blocks. For every m in
// [mmin, mmax] stepping by minc it builds random operands (C is m x m, inner
// dimension k), runs the task-based DenseByBlocks GEMM on mb x mb blocks and
// prints its GFLOP rate. When `check` is set it also runs the ExternalBlas
// GEMM on a copy of C first, and afterwards prints the 2-norm of the
// difference and of the reference result.
//
// Always returns 0.
KOKKOS_INLINE_FUNCTION
int exampleDenseGemmByBlocks(const OrdinalType mmin,
                             const OrdinalType mmax,
                             const OrdinalType minc,
                             const OrdinalType k,
                             const OrdinalType mb,
                             const int max_concurrency,
                             const int max_task_dependence,
                             const int team_size,
                             const int mkl_nthreads,
                             const bool check,
                             const bool verbose) {
  using value_type   = ValueType;
  using ordinal_type = OrdinalType;
  using size_type    = SizeType;

  using TaskFactoryType = TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
                                      Kokkos::Experimental::Future<int,SpaceType> >;

  using DenseMatrixBaseType = DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits>;
  using DenseMatrixViewType = DenseMatrixView<DenseMatrixBaseType>;
  using DenseTaskViewType   = TaskView<DenseMatrixViewType,TaskFactoryType>;

  using DenseHierMatrixBaseType = DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits>;
  using DenseHierMatrixViewType = DenseMatrixView<DenseHierMatrixBaseType>;
  using DenseHierTaskViewType   = TaskView<DenseHierMatrixViewType,TaskFactoryType>;

  int r_val = 0;

  Kokkos::Impl::Timer timer;
  double t = 0.0;

  cout << "DenseGemmByBlocks:: test matrices "
       <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << " , mb = " << mb << endl;

  const size_t max_task_size = (3*sizeof(DenseTaskViewType)+196); // when 128 error
  //cout << "max task size = "<< max_task_size << endl;

  typename TaskFactoryType::policy_type policy(max_concurrency, max_task_size, max_task_dependence, team_size);

  TaskFactoryType::setMaxTaskDependence(max_task_dependence);
  TaskFactoryType::setPolicy(&policy);

  // One report line is assembled per problem size and flushed at the end of
  // each iteration.
  ostringstream os;
  os.precision(3);
  os << scientific;

  for (ordinal_type m = mmin; m <= mmax; m += minc) {
    os.str("");

    // Operand shapes depend on whether each one is used transposed.
    DenseMatrixBaseType AA, BB, CC("CC", m, m), CB("CB", m, m);

    const ordinal_type a_rows = (ArgTransA == Trans::NoTranspose) ? m : k;
    const ordinal_type a_cols = (ArgTransA == Trans::NoTranspose) ? k : m;
    AA = DenseMatrixBaseType("AA", a_rows, a_cols);

    const ordinal_type b_rows = (ArgTransB == Trans::NoTranspose) ? k : m;
    const ordinal_type b_cols = (ArgTransB == Trans::NoTranspose) ? m : k;
    BB = DenseMatrixBaseType("BB", b_rows, b_cols);

    // Fill A, B and C (in that order, column by column) with values drawn
    // from rand() and mapped onto [-1, 1].
    for (ordinal_type col = 0; col < AA.NumCols(); ++col)
      for (ordinal_type row = 0; row < AA.NumRows(); ++row)
        AA.Value(row,col) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;

    for (ordinal_type col = 0; col < BB.NumCols(); ++col)
      for (ordinal_type row = 0; row < BB.NumRows(); ++row)
        BB.Value(row,col) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;

    for (ordinal_type col = 0; col < CC.NumCols(); ++col)
      for (ordinal_type row = 0; row < CC.NumRows(); ++row)
        CC.Value(row,col) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;

    // CB receives the reference result; CC receives the by-blocks result.
    CB.copy(CC);

    const double flop = get_flop_gemm<value_type>(m, m, k);

#ifdef HAVE_SHYLUTACHO_MKL
    mkl_set_num_threads(mkl_nthreads);
#endif

    os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k;

    if (check) {
      // Reference run: ExternalBlas GEMM on the flat matrices.
      timer.reset();
      DenseTaskViewType A(&AA), B(&BB), C(&CB);
      Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas>::invoke
        (TaskFactoryType::Policy(),
         TaskFactoryType::Policy().member_single(),
         1.0, A, B, 1.0, C);
      t = timer.seconds();
      os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs] ";
    }

    {
      // Partition the flat matrices into mb x mb blocks; only the task
      // execution itself is timed.
      DenseHierMatrixBaseType HA, HB, HC;

      DenseMatrixHelper::flat2hier(AA, HA, mb, mb);
      DenseMatrixHelper::flat2hier(BB, HB, mb, mb);
      DenseMatrixHelper::flat2hier(CC, HC, mb, mb);

      DenseHierTaskViewType TA(&HA), TB(&HB), TC(&HC);

      timer.reset();
      auto gemm_task = TaskFactoryType::Policy().create_team
        (typename Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,Variant::One>
         ::template TaskFunctor<value_type,DenseHierTaskViewType,DenseHierTaskViewType,DenseHierTaskViewType>
         (1.0, TA, TB, 1.0, TC), 0);
      TaskFactoryType::Policy().spawn(gemm_task);
      Kokkos::Experimental::wait(TaskFactoryType::Policy());
      t = timer.seconds();
      os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs] ";
    }

    if (check) {
      using real_type = typename Teuchos::ScalarTraits<value_type>::magnitudeType;
      real_type err = 0.0, norm = 0.0;

      // Accumulate squared entry-wise differences and squared reference
      // entries; square roots are taken when printing.
      for (ordinal_type col = 0; col < CC.NumCols(); ++col)
        for (ordinal_type row = 0; row < CC.NumRows(); ++row) {
          const real_type diff = abs(CC.Value(row,col) - CB.Value(row,col));
          const real_type val  = CB.Value(row,col);
          err  += diff*diff;
          norm += val*val;
        }
      os << ":: Check result ::err = " << sqrt(err) << ", norm = " << sqrt(norm);
    }

    cout << os.str() << endl;
  }

  return r_val;
}
int exampleCholByBlocks(const std::string file_input, const int treecut, const int prunecut, const int fill_level, const int rows_per_team, const int max_concurrency, const int max_task_dependence, const int team_size, const bool check, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); // for simple test, let's use host space only here, for device it needs mirroring. typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType; typedef CrsMatrixView<CrsMatrixBaseHostType> CrsMatrixViewHostType; typedef GraphTools<ordinal_type,size_type,HostSpaceType> GraphToolsHostType; typedef GraphTools_Scotch<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_Scotch; typedef GraphTools_CAMD<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_CAMD; typedef IncompleteSymbolicFactorization<CrsMatrixBaseHostType> IncompleteSymbolicFactorizationType; typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef TaskView<CrsMatrixViewHostType> CrsTaskViewHostType; typedef CrsMatrixBase<CrsTaskViewHostType,ordinal_type,size_type,HostSpaceType> CrsHierBaseHostType; typedef CrsMatrixView<CrsHierBaseHostType> CrsHierViewHostType; typedef TaskView<CrsHierViewHostType> CrsTaskHierViewHostType; int r_val = 0; Kokkos::Impl::Timer timer; /// /// Read from matrix market /// /// input - file /// output - AA_host /// CrsMatrixBaseHostType AA_host("AA_host"); timer.reset(); { std::ifstream in; in.open(file_input); if (!in.good()) { std::cout << "Failed in open the file: " << file_input << std::endl; return -1; } MatrixMarket::read(AA_host, in); } double t_read = timer.seconds(); if (verbose) AA_host.showMe(std::cout) << std::endl; /// /// Create a graph structure 
for Scotch and CAMD (rptr, cidx) /// /// rptr and cidx are need to be set up for Scotch and CAMD /// typename GraphToolsHostType::size_type_array rptr("Graph::RowPtrArray", AA_host.NumRows() + 1); typename GraphToolsHostType::ordinal_type_array cidx("Graph::ColIndexArray", AA_host.NumNonZeros()); /// /// Run Scotch /// /// input - rptr, cidx, A_host /// output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted) /// timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, AA_host); double t_graph = timer.seconds(); GraphToolsHostType_Scotch S; S.setGraph(AA_host.NumRows(), rptr, cidx); S.setSeed(0); S.setTreeLevel(); S.setStrategy( SCOTCH_STRATSPEED | SCOTCH_STRATLEVELMAX | SCOTCH_STRATLEVELMIN | SCOTCH_STRATLEAFSIMPLE | SCOTCH_STRATSEPASIMPLE ); timer.reset(); S.computeOrdering(treecut); double t_scotch = timer.seconds(); if (verbose) S.showMe(std::cout) << std::endl; CrsMatrixBaseHostType AA_scotch_host("AA_scotch_host"); AA_scotch_host.createConfTo(AA_host); CrsMatrixTools::copy(AA_scotch_host, S.PermVector(), S.InvPermVector(), AA_host); if (verbose) AA_scotch_host.showMe(std::cout) << std::endl; /// /// Run CAMD /// /// input - rptr, cidx, A_host /// output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted) /// timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, AA_scotch_host); t_graph += timer.seconds(); GraphToolsHostType_CAMD C; C.setGraph(AA_scotch_host.NumRows(), rptr, cidx, S.NumBlocks(), S.RangeVector()); timer.reset(); C.computeOrdering(); double t_camd = timer.seconds(); if (verbose) C.showMe(std::cout) << std::endl; CrsMatrixBaseHostType AA_camd_host("AA_camd_host"); AA_camd_host.createConfTo(AA_scotch_host); CrsMatrixTools::copy(AA_camd_host, C.PermVector(), C.InvPermVector(), AA_scotch_host); if (verbose) AA_camd_host.showMe(std::cout) << std::endl; /// /// Symbolic factorization /// /// input - /// output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted) /// CrsMatrixBaseHostType 
AA_factor_host("AA_factor_host"); timer.reset(); IncompleteSymbolicFactorizationType::createNonZeroPattern(AA_factor_host, fill_level, Uplo::Upper, AA_camd_host, rows_per_team); double t_symbolic = timer.seconds(); if (verbose) AA_factor_host.showMe(std::cout) << std::endl; /// /// Clean tempoerary matrices /// /// input - AA_scotch_host, AA_camd_host, C, rptr, cidx /// output - none /// AA_scotch_host = CrsMatrixBaseHostType(); AA_camd_host = CrsMatrixBaseHostType(); C = GraphToolsHostType_CAMD(); rptr = typename GraphToolsHostType::size_type_array(); cidx = typename GraphToolsHostType::ordinal_type_array(); /// /// Create task policy /// /// input - max_task_size /// output - policy /// const size_type max_task_size = (3*sizeof(CrsTaskViewHostType)+sizeof(PolicyType)+128); timer.reset(); PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); double t_policy = timer.seconds(); /// /// Sequential execution /// /// input - AA_factor_host (matrix to be compared), rowviews /// output - BB_factor_host, B_factor_host /// double t_chol_serial = 0; CrsMatrixBaseHostType BB_factor_host("BB_factor_host"); if (check) { BB_factor_host.createConfTo(AA_factor_host); CrsMatrixTools::copy(BB_factor_host, AA_factor_host); CrsTaskViewHostType B_factor_host(BB_factor_host); Kokkos::View<typename CrsTaskViewHostType::row_view_type*,HostSpaceType> rowviews("RowViewInMatView", B_factor_host.NumRows()); B_factor_host.setRowViewArray(rowviews); timer.reset(); { auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::Unblocked,Variant::One> ::createTaskFunctor(policy, B_factor_host)); policy.spawn(future); Kokkos::Experimental::wait(policy); TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)"); } t_chol_serial = timer.seconds(); if (verbose) BB_factor_host.showMe(std::cout) << std::endl; } /// /// Task parallel execution /// /// input - AA_factor_host, rowviews /// output - HA_factor_host, AA_factor_host, B_factor_host /// double 
t_hier = 0, t_blocks = 0, t_chol_parallel = 0; CrsHierBaseHostType HA_factor_host("HA_factor_host"); { timer.reset(); S.pruneTree(prunecut); CrsMatrixTools::createHierMatrix(HA_factor_host, AA_factor_host, S.NumBlocks(), S.RangeVector(), S.TreeVector()); t_hier = timer.seconds(); timer.reset(); size_type nblocks = HA_factor_host.NumNonZeros(); Kokkos::View<ordinal_type*,HostSpaceType> ap_rowview_blocks("NumRowViewInBlocks", nblocks + 1); ap_rowview_blocks(0) = 0; for (ordinal_type k=0;k<nblocks;++k) ap_rowview_blocks(k+1) = ap_rowview_blocks(k) + HA_factor_host.Value(k).NumRows(); Kokkos::View<typename CrsMatrixViewHostType::row_view_type*,HostSpaceType> rowview_blocks("RowViewInBlocks", ap_rowview_blocks(nblocks)); Kokkos::parallel_for(Kokkos::RangePolicy<HostSpaceType>(0, nblocks), [&](const ordinal_type k) { const ordinal_type begin = ap_rowview_blocks(k); const ordinal_type end = ap_rowview_blocks(k+1); HA_factor_host.Value(k).setRowViewArray (Kokkos::subview(rowview_blocks, Kokkos::pair<ordinal_type,ordinal_type>(begin, end))); } ); CrsMatrixTools::filterEmptyBlocks(HA_factor_host); t_blocks = timer.seconds(); { size_type nblocks_filtered = HA_factor_host.NumNonZeros(), nnz_blocks = 0; for (size_type k=0;k<nblocks_filtered; ++k) nnz_blocks += HA_factor_host.Value(k).NumNonZeros(); TACHO_TEST_FOR_ABORT( nnz_blocks != AA_factor_host.NumNonZeros(), "nnz counted in blocks is different from nnz in the base matrix."); } CrsTaskHierViewHostType H_factor_host(HA_factor_host); timer.reset(); { auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks,Variant::One> ::createTaskFunctor(policy, H_factor_host)); policy.spawn(future); Kokkos::Experimental::wait(policy); TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)"); } t_chol_parallel = timer.seconds(); if (verbose) AA_factor_host.showMe(std::cout) << std::endl; } if (check) { double diff = 0, norm = 0; TACHO_TEST_FOR_ABORT( BB_factor_host.NumNonZeros() != 
AA_factor_host.NumNonZeros(), "nnz used in serial is not same as nnz used in parallel"); const size_type nnz = AA_factor_host.NumNonZeros(); for (size_type k=0;k<nnz;++k) { norm += Util::abs(BB_factor_host.Value(k)); diff += Util::abs(AA_factor_host.Value(k) - BB_factor_host.Value(k)); } std::cout << std::scientific; std::cout << "CholByBlocks:: check with serial execution " << std::endl << " diff = " << diff << ", norm = " << norm << ", rel err = " << (diff/norm) << std::endl; std::cout.unsetf(std::ios::scientific); } { const auto prec = std::cout.precision(); std::cout.precision(4); std::cout << std::scientific; std::cout << "CholByBlocks:: Given matrix = " << AA_host.NumRows() << " x " << AA_host.NumCols() << ", nnz = " << AA_host.NumNonZeros() << std::endl; std::cout << "CholByBlocks:: Factored matrix = " << AA_factor_host.NumRows() << " x " << AA_factor_host.NumCols() << ", nnz = " << AA_factor_host.NumNonZeros() << std::endl; std::cout << "CholByBlocks:: Hier matrix = " << HA_factor_host.NumRows() << " x " << HA_factor_host.NumCols() << ", nnz = " << HA_factor_host.NumNonZeros() << std::endl; std::cout << "CholByBlocks:: " << "read = " << t_read << " [sec], " << "graph generation = " << (t_graph/2.0) << " [sec] " << "scotch reordering = " << t_scotch << " [sec] " << "camd reordering = " << t_camd << " [sec] " << std::endl << "CholByBlocks:: " << "symbolic factorization = " << t_symbolic << " [sec] " << std::endl << "CholByBlocks:: " << "policy creation = " << t_policy << " [sec] " << "hier creation = " << t_hier << " [sec] " << "block specification = " << t_blocks << " [sec] " << std::endl << "CholByBlocks:: " << "Chol Parallel = " << t_chol_parallel << " [sec] "; if (check) std::cout << "Chol Serial = " << (check ? t_chol_serial : -1) << " [sec] " << "speed-up = " << (t_chol_serial/t_chol_parallel) << " [sec] "; std::cout << std::endl; std::cout.unsetf(std::ios::scientific); std::cout.precision(prec); } return r_val; }
// Example driver for symbolic factorization: imports a Matrix Market file,
// optionally reorders it with the Scotch graph helper, builds the
// upper-triangular nonzero pattern for the given fill level, and prints the
// time taken by each stage.
//
// Returns 0 on success, or 1 when the input file cannot be opened.
KOKKOS_INLINE_FUNCTION
int exampleSymbolicFactor(const string file_input,
                          const int treecut,
                          const int minblksize,
                          const int seed,
                          const int fill_level,
                          const int league_size,
                          const bool reorder,
                          const bool verbose) {
  typedef ValueType   value_type;
  typedef OrdinalType ordinal_type;
  typedef SizeType    size_type;

  typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
  typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
  typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

  int r_val = 0;

  Kokkos::Impl::Timer timer;
  double t = 0.0;   // reused for every stage's elapsed time

  // ---- stage 1: import ----------------------------------------------------
  cout << "SymbolicFactor:: import input file = " << file_input << endl;
  CrsMatrixBaseType AA("AA");
  {
    timer.reset();

    ifstream in;
    in.open(file_input);
    if (!in.good()) {
      cout << "Failed in open the file: " << file_input << endl;
      r_val += 1;
      return r_val;
    }
    AA.importMatrixMarket(in);

    t = timer.seconds();

    cout << "SymbolicFactor:: AA nnz = " << AA.NumNonZeros() << endl;

    if (verbose)
      cout << AA << endl;
  }
  cout << "SymbolicFactor:: import input file::time = " << t << endl;

  // ---- stage 2: optional reordering ---------------------------------------
  CrsMatrixBaseType PA("Permuted AA");
  GraphHelperType S(AA, seed);
  if (!reorder) {
    // No reordering requested: use the input matrix as is.
    PA = AA;
    t = 0.0;
  } else {
    timer.reset();

    S.computeOrdering(treecut, minblksize);
    PA.copy(S.PermVector(), S.InvPermVector(), AA);

    t = timer.seconds();

    if (verbose)
      cout << S << endl
           << PA << endl;
  }
  cout << "SymbolicFactor:: reorder the matrix::time = " << t << endl;

  // ---- stage 3: symbolic factorization ------------------------------------
  CrsMatrixBaseType UU("UU");
  {
    timer.reset();

    SymbolicFactorHelperType symbolic(PA, league_size);
    symbolic.createNonZeroPattern(fill_level, Uplo::Upper, UU);

    t = timer.seconds();

    cout << "SymbolicFactor:: UU nnz = " << UU.NumNonZeros() << endl;

    if (verbose) {
      cout << symbolic << endl;
      cout << UU << endl;
    }
  }
  cout << "SymbolicFactor:: factorize the matrix::time = " << t << endl;

  return r_val;
}
int exampleCholUnblocked(const std::string file_input, const int treecut, const int prunecut, const int fill_level, const int rows_per_team, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType; typedef GraphTools<ordinal_type,size_type,HostSpaceType> GraphToolsHostType; typedef GraphTools_Scotch<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_Scotch; typedef GraphTools_CAMD<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_CAMD; typedef IncompleteSymbolicFactorization<CrsMatrixBaseHostType> IncompleteSymbolicFactorizationType; typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef CrsMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> CrsMatrixBaseDeviceType; typedef CrsMatrixView<CrsMatrixBaseDeviceType> CrsMatrixViewDeviceType; typedef TaskView<CrsMatrixViewDeviceType> CrsTaskViewDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; CrsMatrixBaseHostType AA_host("AA_host"); timer.reset(); { std::ifstream in; in.open(file_input); if (!in.good()) { std::cout << "Failed in open the file: " << file_input << std::endl; return -1; } MatrixMarket::read(AA_host, in); } double t_read = timer.seconds(); if (verbose) AA_host.showMe(std::cout) << std::endl; typename GraphToolsHostType::size_type_array rptr("Graph::RowPtrArray", AA_host.NumRows() + 1); typename GraphToolsHostType::ordinal_type_array cidx("Graph::ColIndexArray", AA_host.NumNonZeros()); timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, AA_host); double t_graph = timer.seconds(); GraphToolsHostType_Scotch S; S.setGraph(AA_host.NumRows(), rptr, cidx); S.setSeed(0); S.setTreeLevel(); 
S.setStrategy( SCOTCH_STRATSPEED | SCOTCH_STRATLEVELMAX | SCOTCH_STRATLEVELMIN | SCOTCH_STRATLEAFSIMPLE | SCOTCH_STRATSEPASIMPLE ); timer.reset(); S.computeOrdering(treecut); double t_scotch = timer.seconds(); S.pruneTree(prunecut); if (verbose) S.showMe(std::cout) << std::endl; CrsMatrixBaseHostType BB_host("BB_host"); BB_host.createConfTo(AA_host); CrsMatrixTools::copy(BB_host, S.PermVector(), S.InvPermVector(), AA_host); if (verbose) BB_host.showMe(std::cout) << std::endl; timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, BB_host); t_graph += timer.seconds(); GraphToolsHostType_CAMD C; C.setGraph(BB_host.NumRows(), rptr, cidx, S.NumBlocks(), S.RangeVector()); timer.reset(); C.computeOrdering(); double t_camd = timer.seconds(); if (verbose) C.showMe(std::cout) << std::endl; CrsMatrixBaseHostType CC_host("CC_host"); CC_host.createConfTo(BB_host); CrsMatrixTools::copy(CC_host, C.PermVector(), C.InvPermVector(), BB_host); if (verbose) CC_host.showMe(std::cout) << std::endl; CrsMatrixBaseHostType DD_host("DD_host"); timer.reset(); IncompleteSymbolicFactorizationType::createNonZeroPattern(DD_host, fill_level, Uplo::Upper, CC_host, rows_per_team); double t_symbolic = timer.seconds(); if (verbose) DD_host.showMe(std::cout) << std::endl; // ================================================================================== CrsMatrixBaseDeviceType AA_device("AA_device"); AA_device.mirror(DD_host); const size_type max_concurrency = 10; const size_type max_task_size = (3*sizeof(CrsTaskViewDeviceType)+sizeof(PolicyType)+128); const size_type max_task_dependence = 0; const size_type team_size = 1; PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); CrsMatrixViewDeviceType A_device(AA_device); Kokkos::View<typename CrsMatrixViewDeviceType::row_view_type*,DeviceSpaceType> rowviews("RowViewInMatView", A_device.NumRows()); A_device.setRowViewArray(rowviews); timer.reset(); int ierr = Chol<Uplo::Upper,AlgoChol::Unblocked,Variant::One>::invoke 
(policy, policy.member_single(), A_device); double t_chol = timer.seconds(); TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)"); if (verbose) { DD_host.mirror(AA_device); DD_host.showMe(std::cout) << std::endl; } { const auto prec = std::cout.precision(); std::cout.precision(4); std::cout << std::scientific; std::cout << "SymbolicFactorization:: Given matrix dimension = " << AA_host.NumRows() << " x " << AA_host.NumCols() << ", " << " nnz = " << AA_host.NumNonZeros() << std::endl; std::cout << "SymbolicFactorization:: Upper factors dimension = " << DD_host.NumRows() << " x " << DD_host.NumCols() << ", " << " nnz = " << DD_host.NumNonZeros() << std::endl; std::cout << "SymbolicFactorization:: " << "read = " << t_read << " [sec], " << "graph generation = " << (t_graph/2.0) << " [sec] " << "scotch reordering = " << t_scotch << " [sec] " << "camd reordering = " << t_camd << " [sec] " << "symbolic factorization = " << t_symbolic << " [sec] " << "Cholesky factorization = " << t_chol << " [sec] " << std::endl; std::cout.unsetf(std::ios::scientific); std::cout.precision(prec); } return r_val; }
// Example driver: print block statistics of the hierarchical (by-blocks) upper
// factor of a sparse matrix read from a MatrixMarket file.
//
// Steps, each timed and reported on cout:
//   1. import the matrix from file_input,
//   2. compute a Scotch ordering, prune the tree, and build the permuted matrix,
//   3. create the non-zero pattern of the upper factor UU,
//   4. convert UU into the hierarchical matrix HU and fill the row views of
//      every block,
//   5. print the dimensions of UU and HU and the min/max/average/sum of the
//      non-zero counts over the non-empty blocks of HU, plus an optional
//      histogram.
//
// Parameters:
//   file_input           path of the MatrixMarket file to import
//   treecut, minblksize  forwarded to the graph helper's computeOrdering()
//   prunecut             forwarded to the graph helper's pruneTree()
//   seed                 forwarded to the graph helper constructor
//   fill_level           forwarded to createNonZeroPattern()
//   league_size          forwarded to the symbolic factor helper constructor
//   histogram_size       bucket width of the histogram; 0 disables it
//   verbose              additionally stream the matrix / graph helper to cout
//
// Returns 0 on success and 1 when the input file cannot be opened.
KOKKOS_INLINE_FUNCTION
int exampleStatByBlocks(const string file_input,
                        const int treecut,
                        const int minblksize,
                        const int prunecut,
                        const int seed,
                        const int fill_level,
                        const int league_size,
                        const int histogram_size,
                        const bool verbose) {
  typedef ValueType   value_type;
  typedef OrdinalType ordinal_type;
  typedef SizeType    size_type;

  typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
  typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
  typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

  typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
  typedef CrsMatrixBase<CrsMatrixViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

  int r_val = 0;

  Kokkos::Impl::Timer timer;
  double t = 0.0;

  cout << "StatByBlocks:: import input file = " << file_input << endl;
  CrsMatrixBaseType AA("AA");
  {
    timer.reset();

    ifstream in;
    in.open(file_input);
    if (!in.good()) {
      cout << "Failed in open the file: " << file_input << endl;
      return ++r_val;
    }
    AA.importMatrixMarket(in);

    t = timer.seconds();

    if (verbose)
      cout << AA << endl;
  }
  cout << "StatByBlocks:: import input file::time = " << t << endl;

  CrsMatrixBaseType UU("UU");
  CrsHierMatrixBaseType HU("HU");
  {
    CrsMatrixBaseType PA("Permuted AA");

    typename GraphHelperType::size_type_array rptr(AA.Label()+"Graph::RowPtrArray", AA.NumRows() + 1);
    typename GraphHelperType::ordinal_type_array cidx(AA.Label()+"Graph::ColIndexArray", AA.NumNonZeros());

    AA.convertGraph(rptr, cidx);
    GraphHelperType S(AA.Label()+"ScotchHelper",
                      AA.NumRows(),
                      rptr,
                      cidx,
                      seed);
    {
      timer.reset();

      S.computeOrdering(treecut, minblksize);
      S.pruneTree(prunecut);

      PA.copy(S.PermVector(), S.InvPermVector(), AA);

      t = timer.seconds();

      if (verbose)
        cout << S << endl;
    }
    cout << "StatByBlocks:: reorder the matrix::time = " << t << endl;

    {
      SymbolicFactorHelperType F(PA, league_size);
      timer.reset();
      F.createNonZeroPattern(fill_level, Uplo::Upper, UU);
      t = timer.seconds();
      cout << "StatByBlocks:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl;
    }
    cout << "StatByBlocks:: symbolic factorization::time = " << t << endl;

    {
      timer.reset();
      CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                 S.NumBlocks(),
                                 S.RangeVector(),
                                 S.TreeVector());

      for (ordinal_type k=0;k<HU.NumNonZeros();++k)
        HU.Value(k).fillRowViewArray();

      t = timer.seconds();
      cout << "StatByBlocks:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
    }
    cout << "StatByBlocks:: construct hierarchical matrix::time = " << t << endl;
  }

  {
    cout << endl;
    cout << " -- Flat matrix: UU --" << endl;
    cout << " # of Rows = " << UU.NumRows() << endl;
    cout << " # of Cols = " << UU.NumCols() << endl;
    cout << " # of Nonzeros = " << UU.NumNonZeros() << endl;
    cout << endl;
    cout << " -- Hierarchical matrix: HU --" << endl;
    cout << " # of Rows = " << HU.NumRows() << endl;
    cout << " # of Cols = " << HU.NumCols() << endl;
    cout << " # of Nonzeros = " << HU.NumNonZeros() << endl;
    cout << endl;
    cout << " -- Blocks of HU --" << endl;

    // Maps bucket index (block nnz / histogram_size) to the number of blocks.
    map<size_type,size_type> histogram;
    if (HU.NumNonZeros()) {
      // Statistics are taken over non-empty blocks only; empty blocks are
      // reported separately. All values stay 0 when no block has entries.
      size_type nnz_min = 0, nnz_max = 0, nnz_sum = 0;
      size_type nnz_cnt = 0;

      for (ordinal_type k=0;k<HU.NumNonZeros();++k) {
        const size_type nnz_blk = HU.Value(k).countNumNonZeros();
        if (!nnz_blk)
          continue;

        // Seed the minimum from the first non-empty block; seeding it from
        // block 0 would pin the minimum at 0 whenever block 0 is empty.
        nnz_min = (nnz_cnt ? min(nnz_min, nnz_blk) : nnz_blk);
        nnz_max = max(nnz_max, nnz_blk);
        nnz_sum += nnz_blk;
        ++nnz_cnt;

        if (histogram_size)
          ++histogram[nnz_blk/histogram_size];
      }

      // Guard against a division by zero when every block is empty.
      const size_type nnz_ave = (nnz_cnt ? nnz_sum/nnz_cnt : 0);

      cout << " Min # of Nonzeros = " << nnz_min << endl;
      cout << " Max # of Nonzeros = " << nnz_max << endl;
      cout << " Ave # of Nonzeros = " << nnz_ave << endl;
      cout << " Sum # of Nonzeros = " << nnz_sum << endl;
      cout << " # of empty Blocks = " << (HU.NumNonZeros() - nnz_cnt) << endl;

      if (histogram_size) {
        cout << " Histogram" << endl;
        // Each line: lower bound of the bucket , number of blocks in it.
        for (auto it=histogram.begin();it!=histogram.end();++it)
          cout << (it->first*histogram_size) << " , " << it->second << endl;
      }
    } else {
      cout << " No registered blocks" << endl;
    }
  }

  return r_val;
}
// Example driver: measure the performance of the BLAS triangular solve (TRSM)
// on dense matrices of increasing size.
//
// For every m in [mmin, mmax] with stride minc:
//   - AA (m x m) is filled as an upper triangular matrix with 10 on the
//     diagonal and random values in [-1, 1] above it,
//   - BB (m x k) is set to all ones and then overwritten with AA^H * BB
//     (computed by GEMM into BC and copied back),
//   - the solve AA^H X = BB is timed and reported in GFLOPs.
//
// Parameters:
//   mmin, mmax, minc  first size, last size and stride of the size sweep
//   k                 number of right-hand-side columns
//   verbose           not used by this example
//
// Always returns 0.
KOKKOS_INLINE_FUNCTION
int exampleDenseTrsmMKL(const OrdinalType mmin,
                        const OrdinalType mmax,
                        const OrdinalType minc,
                        const OrdinalType k,
                        const bool verbose) {
  typedef ValueType   value_type;
  typedef OrdinalType ordinal_type;
  typedef SizeType    size_type;

  typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

  int r_val = 0;

  Kokkos::Impl::Timer timer;
  double t = 0.0;

  // The banner uses the same "DenseTrsmMKL" tag as the per-size lines below.
  cout << "DenseTrsmMKL:: test matrices "
       <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << endl;

  ostringstream os;
  os.precision(3);
  os << scientific;

  for (ordinal_type m=mmin;m<=mmax;m+=minc) {
    os.str("");

    DenseMatrixBaseType AA("AA", m, m), BB("BB", m, k), BC("BC", m, k);

    // setup upper triangular
    for (ordinal_type j=0;j<AA.NumCols();++j) {
      AA.Value(j,j) = 10.0;
      for (ordinal_type i=0;i<j;++i)
        AA.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
    }

    // setup ones; the right hand side is then overwritten by the product A^H B
    for (ordinal_type j=0;j<BB.NumCols();++j)
      for (ordinal_type i=0;i<BB.NumRows();++i)
        BB.Value(i,j) = 1.0;

    // One BLAS wrapper serves both the setup product and the timed solve.
    Teuchos::BLAS<ordinal_type,value_type> blas;

    blas.GEMM(Teuchos::CONJ_TRANS, Teuchos::NO_TRANS,
              m, k, m,
              1.0,
              AA.ValuePtr(), AA.ColStride(),
              BB.ValuePtr(), BB.ColStride(),
              0.0,
              BC.ValuePtr(), BC.ColStride());
    BB.copy(BC);

    const double flop = get_flop_trsm_upper<value_type>(m, k);

    os << "DenseTrsmMKL:: m = " << m << " k = " << k;
    {
      timer.reset();

      const ordinal_type mm = AA.NumRows();
      const ordinal_type nn = BB.NumCols();

      blas.TRSM(Teuchos::LEFT_SIDE, Teuchos::UPPER_TRI,
                Teuchos::CONJ_TRANS, Teuchos::NON_UNIT_DIAG,
                mm, nn,
                1.0,
                AA.ValuePtr(), AA.ColStride(),
                BB.ValuePtr(), BB.ColStride());

      t = timer.seconds();
      os << ":: MKL Performance = " << (flop/t/1.0e9) << " [GFLOPs] ";
    }
    cout << os.str() << endl;
  }

  return r_val;
}
int exampleMatrixMarket(const std::string file_input, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType; int r_val = 0; Kokkos::Impl::Timer timer; CrsMatrixBaseHostType AA("AA"); timer.reset(); { std::ifstream in; in.open(file_input); if (!in.good()) { std::cout << "Failed in open the file: " << file_input << std::endl; return -1; } MatrixMarket::read(AA, in); } double t_read = timer.seconds(); timer.reset(); { std::string file_output = "mm-test-output.mtx"; std::ofstream out; out.open(file_output); if (!out.good()) { std::cout << "Failed in open the file: " << file_output << std::endl; return -1; } MatrixMarket::write(out, AA, "%% Test output"); } double t_write = timer.seconds(); { const auto prec = std::cout.precision(); std::cout.precision(4); std::cout << std::scientific << "MatrixMarket:: dimension = " << AA.NumRows() << " x " << AA.NumCols() << ", " << " nnz = " << AA.NumNonZeros() << ", " << "read = " << t_read << " [sec], " << "write = " << t_write << " [sec] " << std::endl; std::cout.unsetf(std::ios::scientific); std::cout.precision(prec); } if (verbose) { AA.showMe(std::cout) << std::endl; } CrsMatrixBaseHostType BB("BB"); BB.createConfTo(AA); CrsMatrixTools::copy(BB, Uplo::Upper, 0, AA); if (verbose) { BB.setLabel("Copy::AA:Upper::0"); BB.showMe(std::cout) << std::endl; } CrsMatrixTools::copy(BB, Uplo::Upper, 1, AA); if (verbose) { BB.setLabel("Copy::AA:Upper::1"); BB.showMe(std::cout) << std::endl; } CrsMatrixTools::copy(BB, Uplo::Lower, 0, AA); if (verbose) { BB.setLabel("Copy::AA:Lower::0"); BB.showMe(std::cout) << std::endl; } CrsMatrixTools::copy(BB, Uplo::Lower, 1, AA); if 
(verbose) { BB.setLabel("Copy::AA:Lower::1"); BB.showMe(std::cout) << std::endl; } return r_val; }
int ComputeBasis_HGRAD_Vector(const ordinal_type nworkset, const ordinal_type C, const ordinal_type order, const bool verbose) { typedef Vector<VectorTagType> VectorType; typedef typename VectorTagType::value_type ValueType; constexpr int VectorLength = VectorTagType::length; Teuchos::RCP<std::ostream> verboseStream; Teuchos::oblackholestream bhs; // outputs nothing if (verbose) verboseStream = Teuchos::rcp(&std::cout, false); else verboseStream = Teuchos::rcp(&bhs, false); Teuchos::oblackholestream oldFormatState; oldFormatState.copyfmt(std::cout); typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; *verboseStream << "DeviceSpace:: "; DeviceSpaceType::print_configuration(*verboseStream, false); *verboseStream << "HostSpace:: "; HostSpaceType::print_configuration(*verboseStream, false); *verboseStream << "VectorLength:: " << (VectorLength) << "\n"; using BasisTypeHost = Basis_HGRAD_HEX_C1_FEM<HostSpaceType,ValueType,ValueType>; using ImplBasisType = Impl::Basis_HGRAD_HEX_C1_FEM; using range_type = Kokkos::pair<ordinal_type,ordinal_type>; constexpr size_t LLC_CAPACITY = 32*1024*1024; Intrepid2::Test::Flush<LLC_CAPACITY,DeviceSpaceType> flush; Kokkos::Impl::Timer timer; double t_vectorize = 0; int errorFlag = 0; BasisTypeHost hostBasis; const auto cellTopo = hostBasis.getBaseCellTopology(); auto cubature = DefaultCubatureFactory::create<DeviceSpaceType,ValueType,ValueType>(cellTopo, order); const ordinal_type numCells = C, numCellsAdjusted = C/VectorLength + (C%VectorLength > 0), numVerts = cellTopo.getVertexCount(), numDofs = hostBasis.getCardinality(), numPoints = cubature->getNumPoints(), spaceDim = cubature->getDimension(); Kokkos::DynRankView<ValueType,HostSpaceType> dofCoordsHost("dofCoordsHost", numDofs, spaceDim); hostBasis.getDofCoords(dofCoordsHost); const auto refNodesHost = Kokkos::subview(dofCoordsHost, range_type(0, numVerts), Kokkos::ALL()); // pertub nodes 
Kokkos::DynRankView<VectorType,HostSpaceType> worksetCellsHost("worksetCellsHost", numCellsAdjusted, numVerts, spaceDim); for (ordinal_type cell=0;cell<numCells;++cell) { for (ordinal_type i=0;i<numVerts;++i) for (ordinal_type j=0;j<spaceDim;++j) { ValueType val = (rand()/(RAND_MAX + 1.0))*0.2 -0.1; worksetCellsHost(cell/VectorLength, i, j)[cell%VectorLength] = refNodesHost(i, j) + val; } } auto worksetCells = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), worksetCellsHost); Kokkos::deep_copy(worksetCells, worksetCellsHost); Kokkos::DynRankView<ValueType,DeviceSpaceType> refPoints("refPoints", numPoints, spaceDim), refWeights("refWeights", numPoints); cubature->getCubature(refPoints, refWeights); std::cout << "===============================================================================\n" << " Performance Test evaluating ComputeBasis \n" << " # of workset = " << nworkset << "\n" << " Test Array Structure (C,F,P,D) = " << numCells << ", " << numDofs << ", " << numPoints << ", " << spaceDim << "\n" << "===============================================================================\n"; *verboseStream << "\n" << "===============================================================================\n" << "TEST 1: evaluateFields vector version\n" << "===============================================================================\n"; try { Kokkos::DynRankView<ValueType,DeviceSpaceType> refBasisValues("refBasisValues", numDofs, numPoints), refBasisGrads ("refBasisGrads", numDofs, numPoints, spaceDim); ImplBasisType::getValues<DeviceSpaceType>(refBasisValues, refPoints, OPERATOR_VALUE); ImplBasisType::getValues<DeviceSpaceType>(refBasisGrads, refPoints, OPERATOR_GRAD); const ordinal_type ibegin = -3; // testing vertical approach { Kokkos::DynRankView<VectorType,DeviceSpaceType> weightedBasisValues("weightedBasisValues", numCellsAdjusted, numDofs, numPoints), weightedBasisGrads ("weightedBasisGrads", numCellsAdjusted, numDofs, numPoints, spaceDim); typedef 
F_hgrad_eval<VectorType,ValueType,DeviceSpaceType> FunctorType; using range_policy_type = Kokkos::Experimental::MDRangePolicy < DeviceSpaceType, Kokkos::Experimental::Rank<2>, Kokkos::IndexType<ordinal_type> >; range_policy_type policy( { 0, 0 }, { numCellsAdjusted, numPoints } ); FunctorType functor(weightedBasisValues, weightedBasisGrads, refBasisGrads, worksetCells, refWeights, refBasisValues, refBasisGrads); for (ordinal_type iwork=ibegin;iwork<nworkset;++iwork) { flush.run(); DeviceSpaceType::fence(); timer.reset(); Kokkos::parallel_for(policy, functor); DeviceSpaceType::fence(); t_vectorize += (iwork >= 0)*timer.seconds(); } } } catch (std::exception err) { *verboseStream << "UNEXPECTED ERROR !!! ----------------------------------------------------------\n"; *verboseStream << err.what() << '\n'; *verboseStream << "-------------------------------------------------------------------------------" << "\n\n"; errorFlag = -1000; } std::cout << "TEST HGRAD " << " t_vectorize = " << (t_vectorize/nworkset) << std::endl; if (errorFlag != 0) std::cout << "End Result: TEST FAILED\n"; else std::cout << "End Result: TEST PASSED\n"; // reset format state of std::cout std::cout.copyfmt(oldFormatState); return errorFlag; }
KOKKOS_INLINE_FUNCTION int exampleCholUnblocked(const string file_input, const int max_task_dependence, const int team_size, const int algo, const int variant, const bool verbose) { typedef ValueType value_type; typedef OrdinalType ordinal_type; typedef SizeType size_type; typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType; typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType; typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>, Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType; typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; cout << "CholUnblocked:: import input file = " << file_input << endl; CrsMatrixBaseType AA("AA"), UU("UU"); { timer.reset(); ifstream in; in.open(file_input); if (!in.good()) { cout << "Failed in open the file: " << file_input << endl; return ++r_val; } AA.importMatrixMarket(in); UU.copy(Uplo::Upper, AA); t = timer.seconds(); if (verbose) cout << UU << endl; } cout << "CholUnblocked:: import input file::time = " << t << endl; #ifdef __USE_FIXED_TEAM_SIZE__ typename TaskFactoryType::policy_type policy(max_task_dependence); #else typename TaskFactoryType::policy_type policy(max_task_dependence, team_size); #endif TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); cout << "CholUnblocked:: factorize the matrix" << endl; CrsTaskViewType U(&UU); U.fillRowViewArray(); { timer.reset(); typename TaskFactoryType::future_type future; switch (algo) { case AlgoChol::UnblockedOpt: { if (variant == Variant::One) future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> ::TaskFunctor<CrsTaskViewType>(U), 0); else if (variant == Variant::Two) future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::Two> ::TaskFunctor<CrsTaskViewType>(U), 0); else { ERROR(">> Not supported algorithm 
variant"); } break; } case AlgoChol::Dummy: { future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::Dummy> ::TaskFunctor<CrsTaskViewType>(U), 0); break; } default: ERROR(">> Not supported algorithm"); break; } TaskFactoryType::Policy().spawn(future); Kokkos::Experimental::wait(TaskFactoryType::Policy()); t = timer.seconds(); if (verbose) cout << UU << endl; } cout << "CholUnblocked:: factorize the matrix::time = " << t << endl; return r_val; }
// Numeric factorization driver for the incomplete (level-based) variant.
//
// Phases, in order:
//   1. Domain: one team per thread runs kokkos_nfactor_domain_inc_lvl; while
//      nfactor_domain_error() does not report BASKER_SUCCESS, the
//      kokkos_nfactor_domain_remalloc functor is run and the check repeated.
//   2. Separators: for each tree level l = 1..tree.nlvls the
//      kokkos_nfactor_sep2_inc_lvl functor is run with 2^l threads per team
//      (only when BASKER_NO_LAMBDA is defined).
//   3. BTF: when Options.btf is BASKER_TRUE; the diagonal-block factorization
//      is currently commented out, so only the timer output remains.
// Phases 1 and 2 are skipped when btf_tabs_offset == 0.
//
// Parameter `option` is not referenced in this function.
// Always returns 0.
//
// NOTE(review): the domain-phase `if(btf_tabs_offset != 0) {` is opened inside
// the `#ifdef BASKER_KOKKOS` branch but closed after the matching `#endif`, so
// the braces look unbalanced when BASKER_KOKKOS is not defined - confirm that
// the non-Kokkos path is still expected to build.
BASKER_INLINE
int Basker<Int, Entry, Exe_Space>::factor_inc_lvl(Int option)
{

  printf("Factor Inc Level Called \n");

  gn = A.ncol;
  gm = A.nrow;

  if(Options.btf == BASKER_TRUE)
    {
      //JDB: We can change this for the new interface
      //call reference copy constructor
      // With BTF enabled the factorization operates on BTF_A; note that gn/gm
      // are (re)assigned from A before A is switched.
      gn = A.ncol;
      gm = A.nrow;
      A = BTF_A;
      //printf("\n\n Switching A, newsize: %d \n",
      //   A.ncol);
      //printMTX("A_FACTOR.mtx", A);
    }

  //Split into Domain and Sep
  //----------------------Domain-------------------------//
  #ifdef BASKER_KOKKOS

  //====TIMER==
  #ifdef BASKER_TIME
  Kokkos::Impl::Timer timer;
  #endif
  //===TIMER===

  typedef Kokkos::TeamPolicy<Exe_Space> TeamPolicy;

  if(btf_tabs_offset != 0)
    {
      // One team of size 1 per thread.
      kokkos_nfactor_domain_inc_lvl <Int,Entry,Exe_Space>
        domain_nfactor(this);
      Kokkos::parallel_for(TeamPolicy(num_threads,1),
                           domain_nfactor);
      Kokkos::fence();

      //=====Check for error======
      // Repeat until nfactor_domain_error() reports success; thread_start is
      // re-created and reset to BASKER_MAX_IDX on every pass and handed to
      // the remalloc functor.
      while(true)
        {
          INT_1DARRAY thread_start;
          MALLOC_INT_1DARRAY(thread_start, num_threads+1);
          init_value(thread_start, num_threads+1,
                     (Int) BASKER_MAX_IDX);
          int nt = nfactor_domain_error(thread_start);
          if(nt == BASKER_SUCCESS)
            {
              break;
            }
          else
            {
              printf("restart \n");
              kokkos_nfactor_domain_remalloc <Int, Entry, Exe_Space>
                diag_nfactor_remalloc(this, thread_start);
              Kokkos::parallel_for(TeamPolicy(num_threads,1),
                                   diag_nfactor_remalloc);
              Kokkos::fence();
            }
        }//end while

      //====TIMER===
      #ifdef BASKER_TIME
      printf("Time DOMAIN: %f \n", timer.seconds());
      timer.reset();
      #endif
      //====TIMER====

  #else// else basker_kokkos
  #pragma omp parallel
  {

  }//end omp parallel
  #endif //end basker_kokkos

    }
  //-------------------End--Domain--------------------------//

  //---------------------------Sep--------------------------//
  if(btf_tabs_offset != 0)
    {
      //for(Int l=1; l<=1; l++)
      for(Int l=1; l <= tree.nlvls; l++)
        {
          //Come back for syncs

          //#ifdef BASKER_OLD_BARRIER
          // At level l each team has 2^l threads, so the number of teams
          // is num_threads / 2^l.
          Int lthreads = pow(2,l);
          Int lnteams = num_threads/lthreads;
          //#else
          //Int lthreads = 1;
          //Int lnteams = num_threads/lthreads;
          //#endif

          //printf("\n\n ============ SEP: %d ======\n\n",l);

          #ifdef BASKER_KOKKOS
          Kokkos::Impl::Timer timer_inner_sep;
          #ifdef BASKER_NO_LAMBDA

          kokkos_nfactor_sep2_inc_lvl <Int, Entry, Exe_Space>
            sep_nfactor(this,l);

          Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),
                               sep_nfactor);
          Kokkos::fence();

          #ifdef BASKER_TIME
          printf("Time INNERSEP: %d %f \n",
                 l, timer_inner_sep.seconds());
          #endif
          #else //ELSE BASKER_NO_LAMBDA
          //Note: to be added
          #endif //end BASKER_NO_LAMBDA
          #else
          #pragma omp parallel
          {

          }//end omp parallel
          #endif
        }//end over each level

      // NOTE(review): `timer` is only declared when both BASKER_KOKKOS and
      // BASKER_TIME are defined, while this use is guarded by BASKER_TIME alone.
      #ifdef BASKER_TIME
      printf("Time SEP: %f \n", timer.seconds());
      #endif
    }
  //-------------------------End Sep----------------//

  //-------------------IF BTF-----------------------//
  if(Options.btf == BASKER_TRUE)
    {
      //=====Timer
      #ifdef BASKER_TIME
      Kokkos::Impl::Timer timer_btf;
      #endif
      //====Timer

      // The diagonal factorization and its error/restart loop below are
      // disabled; no work is performed in this branch besides the timer print.

      //======Call diag factor====
      /*
      kokkos_nfactor_diag <Int, Entry, Exe_Space>
        diag_nfactor(this);
      Kokkos::parallel_for(TeamPolicy(num_threads,1),
                           diag_nfactor);
      Kokkos::fence();
      */
      //=====Check for error======
      //while(true)
      // {
      //INT_1DARRAY thread_start;
      // MALLOC_INT_1DARRAY(thread_start, num_threads+1);
      //init_value(thread_start, num_threads+1,
      //	     (Int) BASKER_MAX_IDX);
      //int nt = nfactor_diag_error(thread_start);
      // if(nt == BASKER_SUCCESS)
      //  {
      ///    break;
      //  }
      //else
      //  {
      /*
      break;
      printf("restart \n");
      kokkos_nfactor_diag_remalloc <Int, Entry, Exe_Space>
        diag_nfactor_remalloc(this, thread_start);
      Kokkos::parallel_for(TeamPolicy(num_threads,1),
                           diag_nfactor);
      Kokkos::fence();
      */
      //}
      // }//end while

      //====TIMER
      #ifdef BASKER_TIME
      printf("Time BTF: %f \n", timer_btf.seconds());
      #endif
      //===TIMER

    }//end btf call

  return 0;
}//end factor_lvl_inc()
int exampleDenseCholByBlocks(const ordinal_type mmin, const ordinal_type mmax, const ordinal_type minc, const ordinal_type mb, const int max_concurrency, const int max_task_dependence, const int team_size, const int mkl_nthreads, const bool check, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType; typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType; typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType; typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType; typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType; typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType; typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; std::cout << "DenseCholByBlocks:: test matrices " <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , mb = " << mb << std::endl; const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); std::ostringstream os; os.precision(3); os << std::scientific; for (ordinal_type m=mmin;m<=mmax;m+=minc) { os.str(""); // host matrices DenseMatrixBaseHostType AA_host("AA_host", m, m), AB_host("AB_host"), TT_host("TT_host"); // random T matrix { 
TT_host.createConfTo(AA_host); for (ordinal_type j=0;j<TT_host.NumCols();++j) { for (ordinal_type i=0;i<TT_host.NumRows();++i) TT_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; TT_host.Value(j,j) = std::fabs(TT_host.Value(j,j)); } } // create SPD matrix { Teuchos::BLAS<ordinal_type,value_type> blas; blas.HERK(ArgUplo == Uplo::Upper ? Teuchos::UPPER_TRI : Teuchos::LOWER_TRI, Teuchos::CONJ_TRANS, m, m, 1.0, TT_host.ValuePtr(), TT_host.ColStride(), 0.0, AA_host.ValuePtr(), AA_host.ColStride()); // preserve a copy of A AB_host.createConfTo(AA_host); DenseMatrixTools::copy(AB_host, AA_host); } const double flop = DenseFlopCount<value_type>::Chol(m); #ifdef HAVE_SHYLUTACHO_MKL mkl_set_num_threads(mkl_nthreads); #endif os << "DenseCholByBlocks:: m = " << m << " "; int ierr = 0; if (check) { timer.reset(); DenseMatrixViewHostType A_host(AB_host); ierr = Chol<ArgUplo,AlgoChol::ExternalLapack,Variant::One>::invoke (policy, policy.member_single(), A_host); t = timer.seconds(); TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)"); os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } DenseMatrixBaseDeviceType AA_device("AA_device"); { timer.reset(); AA_device.mirror(AA_host); t = timer.seconds(); os << ":: Mirror = " << t << " [sec] "; } { DenseHierMatrixBaseDeviceType HA_device("HA_device"); DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb); DenseHierTaskViewDeviceType TA_device(HA_device); timer.reset(); auto future = policy.proc_create_team (Chol<ArgUplo,AlgoChol::DenseByBlocks,ArgVariant> ::createTaskFunctor(policy, TA_device), 0); policy.spawn(future); Kokkos::Experimental::wait(policy); t = timer.seconds(); os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } AA_host.mirror(AA_device); if (!ierr && check) { double err = 0.0, norm = 0.0; for (ordinal_type j=0;j<AA_host.NumCols();++j) for (ordinal_type i=0;i<=j;++i) { const double diff = abs(AA_host.Value(i,j) - AB_host.Value(i,j)); const double 
val = AB_host.Value(i,j); err += diff*diff; norm += val*val; } os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err); } std::cout << os.str() << std::endl; } return r_val; }
void viennaCL_apply( KernelHandle *handle, typename KernelHandle::nnz_lno_t m, typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, in_row_index_view_type row_mapA, in_nonzero_index_view_type entriesA, in_nonzero_value_view_type valuesA, bool transposeA, bin_row_index_view_type row_mapB, bin_nonzero_index_view_type entriesB, bin_nonzero_value_view_type valuesB, bool transposeB, cin_row_index_view_type &row_mapC, cin_nonzero_index_view_type &entriesC, cin_nonzero_value_view_type &valuesC){ #ifdef KERNELS_HAVE_VIENNACL typedef typename KernelHandle::nnz_lno_t idx; typedef in_row_index_view_type idx_array_type; typedef typename KernelHandle::nnz_scalar_t value_type; typedef typename in_row_index_view_type::device_type device1; typedef typename in_nonzero_index_view_type::device_type device2; typedef typename in_nonzero_value_view_type::device_type device3; typedef typename KernelHandle::HandleExecSpace MyExecSpace; std::cout << "RUNNING VIENNACL" << std::endl; typedef typename viennacl::compressed_matrix<value_type>::handle_type it; typedef typename viennacl::compressed_matrix<value_type>::value_type vt; if ((Kokkos::Impl::is_same<idx, int>::value && Kokkos::Impl::is_same<typename KernelHandle::size_type, int>::value )|| (Kokkos::Impl::is_same<idx, unsigned int>::value && Kokkos::Impl::is_same<typename KernelHandle::size_type, unsigned int>::value ) || (Kokkos::Impl::is_same<idx, it>::value && Kokkos::Impl::is_same<typename KernelHandle::size_type, it>::value ) ){ unsigned int * a_xadj = (unsigned int *)row_mapA.ptr_on_device(); unsigned int * b_xadj = (unsigned int * )row_mapB.ptr_on_device(); unsigned int * c_xadj = (unsigned int * )row_mapC.ptr_on_device(); unsigned int * a_adj = (unsigned int * )entriesA.ptr_on_device(); unsigned int * b_adj = (unsigned int * )entriesB.ptr_on_device(); unsigned int * c_adj = (unsigned int * )entriesC.ptr_on_device(); int nnzA = entriesA.dimension_0(); int nnzB = entriesB.dimension_0(); value_type *a_ew = 
valuesA.ptr_on_device(); value_type *b_ew = valuesB.ptr_on_device(); value_type *c_ew = valuesC.ptr_on_device(); /* std::cout << "create a" << std::endl; std::cout << "m:" << m << " n:" << n << std::endl; std::cout << "a_xadj[0]:" << a_xadj[0] << " a_xadj[m]:" << a_xadj[m] << std::endl; std::cout << "a_adj[a_xadj[m] - 1]:" << a_adj[a_xadj[m] - 1] << " a_ew[a_xadj[m] - 1]:" << a_ew[a_xadj[m] - 1] << std::endl; */ Kokkos::Impl::Timer timerset; viennacl::compressed_matrix<value_type> A; viennacl::compressed_matrix<value_type> B; A.set(a_xadj, a_adj, a_ew, m, n, nnzA); B.set(b_xadj, b_adj, b_ew, n, k, nnzB); std::cout << "compress matrix create:" << timerset.seconds() << std::endl; std::cout << "Now running ViennaCL" << std::endl; Kokkos::Impl::Timer timer1; viennacl::compressed_matrix<value_type> C = viennacl::linalg::prod(A, B); std::cout << "Actual VIENNACL SPMM Time:" << timer1.seconds() << std::endl; { unsigned int c_rows = m, c_cols = k, cnnz = C.nnz(); value_type const * values = viennacl::linalg::host_based::detail::extract_raw_pointer<value_type>(C.handle()); unsigned int const * rows_start = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(C.handle1()); unsigned int const * columns = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(C.handle2()); { Kokkos::Impl::Timer copy_time; row_mapC = typename cin_row_index_view_type::non_const_type(Kokkos::ViewAllocateWithoutInitializing("rowmapC"), c_rows + 1); entriesC = typename cin_nonzero_index_view_type::non_const_type (Kokkos::ViewAllocateWithoutInitializing("EntriesC") , cnnz); valuesC = typename cin_nonzero_value_view_type::non_const_type (Kokkos::ViewAllocateWithoutInitializing("valuesC") , cnnz); KokkosKernels::Experimental::Util::copy_vector<unsigned int const *, typename cin_row_index_view_type::non_const_type, MyExecSpace> (m, rows_start, row_mapC); idx nnz = cnnz; KokkosKernels::Experimental::Util::copy_vector<unsigned int const *, typename 
cin_nonzero_index_view_type::non_const_type, MyExecSpace> (nnz, columns, entriesC); KokkosKernels::Experimental::Util::copy_vector<value_type const *, typename cin_nonzero_value_view_type::non_const_type, MyExecSpace> (m, values, valuesC); double copy_time_d = copy_time.seconds(); std::cout << "VIENNACL COPYTIME:" << copy_time_d << std::endl; } } } else { //int *a_xadj = row_mapA.ptr_on_device(); std::cerr << "vienna requires (u) integer values" << std::endl; if (Kokkos::Impl::is_same<idx, long>::value){ std::cerr << "MKL is given long" << std::endl; } else if (Kokkos::Impl::is_same<idx, const int>::value){ std::cerr << "MKL is given const int" << std::endl; } else if (Kokkos::Impl::is_same<idx, unsigned long>::value){ std::cerr << "MKL is given unsigned long" << std::endl; } else if (Kokkos::Impl::is_same<idx, const unsigned long>::value){ std::cerr << "MKL is given const unsigned long" << std::endl; } else{ std::cerr << "MKL is given something else" << std::endl; } return; } #else std::cerr << "VIENNACL IS NOT DEFINED" << std::endl; return; #endif }
void color(bool useConflictList, bool serialConflictResolution, bool ticToc){ Ordinal numUncolored = _size; // on host double t, total = 0.0; Kokkos::Impl::Timer timer; if(useConflictList) _conflictType = CONFLICT_LIST; // While vertices to color, do speculative coloring. int iter = 0; for(iter = 0; (iter<20) && (numUncolored>0); iter++){ std::cout<< "Start iteration " << iter << std::endl; // First color greedy speculatively, some conflicts expected this -> colorGreedy(); ExecSpace::fence(); if(ticToc){ t = timer.seconds(); total += t; std::cout << "Time speculative greedy phase " << iter << " : " << std::endl; timer.reset(); } #ifdef DEBUG // UVM required - will be slow! printf("\n 100 first vertices: "); for(int i = 0; i < 100; i++){ printf(" %i", _colors[i]); } printf("\n"); #endif // Check for conflicts (parallel), find vertices to recolor numUncolored = this -> findConflicts(); ExecSpace::fence(); if(ticToc){ t = timer.seconds(); total += t; std::cout << "Time conflict detection " << iter << " : " << t << std::endl; timer.reset(); } if (serialConflictResolution) break; // Break after first iteration /* if(_conflictType == CONFLICT_LIST){ array_type temp = _vertexList; _vertexList = _recolorList; _vertexListLength() = _recolorListLength(); _recolorList = temp; _recolorListLength() = 0; } */ if(_conflictType == CONFLICT_LIST){ array_type temp = _vertexList; _vertexList = _recolorList; host_vertexListLength() = host_recolorListLength(); _recolorList = temp; host_recolorListLength() = 0; Kokkos::deep_copy(_vertexListLength, host_vertexListLength); Kokkos::deep_copy(_recolorListLength, host_recolorListLength); } } std::cout << "Number of coloring iterations: " << iter << std::endl; if(numUncolored > 0){ // Resolve conflicts by recolor in serial this -> resolveConflicts(); ExecSpace::fence(); if(ticToc){ t = timer.seconds(); total += t; std::cout << "Time conflict resolution: " << t << std::endl; std::cout << "Total time: " << total << std::endl; } } }
KOKKOS_INLINE_FUNCTION int exampleCholPerformance(const string file_input, const int treecut, const int minblksize, const int prunecut, const int seed, const int niter, const int nthreads, const int max_task_dependence, const int team_size, const int fill_level, const int league_size, const bool team_interface, const bool skip_serial, const bool mkl_interface, const bool verbose) { typedef ValueType value_type; typedef OrdinalType ordinal_type; typedef SizeType size_type; typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>, Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType; typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType; typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType; typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType; #ifdef HAVE_SHYLUTACHO_MKL typedef typename CrsMatrixBaseType::value_type_array value_type_array; #endif typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType; typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType; typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType; typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType; typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType; int r_val = 0; Kokkos::Impl::Timer timer; double t_import = 0.0, t_reorder = 0.0, t_symbolic = 0.0, t_flat2hier = 0.0, #ifdef HAVE_SHYLUTACHO_MKL t_mkl_seq = 0.0, #endif t_factor_seq = 0.0, t_factor_task = 0.0; const int start = -2; cout << "CholPerformance:: import input file = " << file_input << endl; CrsMatrixBaseType AA("AA"); { timer.reset(); ifstream in; in.open(file_input); if (!in.good()) { cout << "Failed in open the file: " << file_input << endl; return ++r_val; } AA.importMatrixMarket(in); t_import = timer.seconds(); if (verbose) cout << AA << endl; } cout << "CholPerformance:: import input file::time = " << t_import << endl; cout << "CholPerformance:: 
reorder the matrix" << endl; CrsMatrixBaseType PA("Permuted AA"); CrsMatrixBaseType UU("UU"); // permuted base upper triangular matrix CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views { GraphHelperType S(AA, seed); { timer.reset(); S.computeOrdering(treecut, minblksize); S.pruneTree(prunecut); PA.copy(S.PermVector(), S.InvPermVector(), AA); t_reorder = timer.seconds(); if (verbose) cout << S << endl << PA << endl; } cout << "CholPerformance:: reorder the matrix::time = " << t_reorder << endl; { SymbolicFactorHelperType F(PA, league_size); for (int i=start;i<niter;++i) { timer.reset(); F.createNonZeroPattern(fill_level, Uplo::Upper, UU); // UU.copy(Uplo::Upper, PA); t_symbolic += timer.seconds() * (i>=0); } t_symbolic /= niter; cout << "CholPerformance:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl; if (verbose) cout << F << endl << UU << endl; } cout << "CholPerformance:: symbolic factorization::time = " << t_symbolic << endl; { timer.reset(); CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU, S.NumBlocks(), S.RangeVector(), S.TreeVector()); for (ordinal_type k=0;k<HU.NumNonZeros();++k) HU.Value(k).fillRowViewArray(); t_flat2hier = timer.seconds(); cout << "CholPerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl; } cout << "CholPerformance:: construct hierarchical matrix::time = " << t_flat2hier << endl; } // copy of UU CrsMatrixBaseType RR("RR"); RR.copy(UU); #ifdef HAVE_SHYLUTACHO_MKL if (!skip_serial && mkl_interface) { cout << "CholPerformance:: MKL factorize the matrix" << endl; CrsMatrixBaseType MM("MM"); for (int i=start;i<niter;++i) { MM.copy(RR); MM.hermitianize(Uplo::Upper); MKL_INT n = static_cast<MKL_INT>(MM.NumRows()); double *a = static_cast<double*>(MM.ValuePtr()); MKL_INT *ia = static_cast<MKL_INT*>(MM.RowPtr()); MKL_INT *ja = static_cast<MKL_INT*>(MM.ColPtr()); // convert to 1-based matrix { for (ordinal_type k=0;k<(MM.NumRows()+1);++k) ++ia[k]; for (size_type 
k=0;k<MM.NumNonZeros();++k) ++ja[k]; } value_type_array mkl_result = value_type_array("mkl-ilu-values", MM.NumNonZeros()); double *bilu0 = static_cast<double*>(&mkl_result[0]); MKL_INT ipar[128]; double dpar[128]; MKL_INT ierr; // we provide ilu-k pattern timer.reset(); dcsrilu0(&n, a, ia, ja, bilu0, ipar, dpar, &ierr); t_mkl_seq += timer.seconds() * (i>=0) * 0.5; if (ierr != 0) cout << " MKL Error = " << ierr << endl; } t_mkl_seq /= niter; cout << "CholPerformance:: MKL factorize the matrix::time = " << t_mkl_seq << endl; } #endif if (!skip_serial) { #ifdef __USE_FIXED_TEAM_SIZE__ typename TaskFactoryType::policy_type policy(max_task_dependence); #else typename TaskFactoryType::policy_type policy(max_task_dependence, 1); #endif TaskFactoryType::setUseTeamInterface(team_interface); TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); CrsTaskViewType U(&UU); U.fillRowViewArray(); cout << "CholPerformance:: Serial factorize the matrix" << endl; { for (int i=start;i<niter;++i) { UU.copy(RR); timer.reset(); // { // auto future = TaskFactoryType::Policy().create(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> // ::TaskFunctor<CrsTaskViewType>(U), 0); // TaskFactoryType::Policy().spawn(future); // Kokkos::Experimental::wait(TaskFactoryType::Policy()); // } { Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> ::invoke(TaskFactoryType::Policy(), TaskFactoryType::Policy().member_single(), U); } t_factor_seq += timer.seconds() * (i>=0); } t_factor_seq /= niter; if (verbose) cout << UU << endl; } cout << "CholPerformance:: Serial factorize the matrix::time = " << t_factor_seq << endl; } // { // #ifdef __USE_FIXED_TEAM_SIZE__ // typename TaskFactoryType::policy_type policy(max_task_dependence); // #else // typename TaskFactoryType::policy_type policy(max_task_dependence, nthreads); // #endif // TaskFactoryType::setPolicy(&policy); // CrsTaskViewType U(&UU); // U.fillRowViewArray(); // cout << "CholPerformance:: Team 
factorize the matrix:: team_size = " << nthreads << endl; // { // timer.reset(); // auto future = TaskFactoryType::Policy().create(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> // ::TaskFunctor<CrsTaskViewType>(U), 0); // TaskFactoryType::Policy().spawn(future); // Kokkos::Experimental::wait(TaskFactoryType::Policy()); // t_factor_team = timer.seconds(); // if (verbose) // cout << UU << endl; // } // cout << "CholPerformance:: Team factorize the matrix::time = " << t_factor_team << endl; // } { #ifdef __USE_FIXED_TEAM_SIZE__ typename TaskFactoryType::policy_type policy(max_task_dependence); #else typename TaskFactoryType::policy_type policy(max_task_dependence, team_size); #endif TaskFactoryType::setUseTeamInterface(team_interface); TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); cout << "CholPerformance:: ByBlocks factorize the matrix:: team_size = " << team_size << endl; CrsHierTaskViewType H(&HU); { for (int i=start;i<niter;++i) { UU.copy(RR); timer.reset(); { auto future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks>:: TaskFunctor<CrsHierTaskViewType>(H), 0); TaskFactoryType::Policy().spawn(future); Kokkos::Experimental::wait(TaskFactoryType::Policy()); } t_factor_task += timer.seconds() * (i>=0); } t_factor_task /= niter; if (verbose) cout << UU << endl; } cout << "CholPerformance:: ByBlocks factorize the matrix::time = " << t_factor_task << endl; } if (!skip_serial) { #ifdef HAVE_SHYLUTACHO_MKL cout << "CholPerformance:: mkl/chol scale [mkl/chol] = " << t_mkl_seq/t_factor_seq << endl; cout << "CholPerformance:: mkl/task scale [mkl/task] = " << t_mkl_seq/t_factor_task << endl; #else cout << "CholPerformance:: task scale [seq/task] = " << t_factor_seq/t_factor_task << endl; #endif //cout << "CholPerformance:: team scale [seq/team] = " << t_factor_seq/t_factor_team << endl; } return r_val; }
KOKKOS_INLINE_FUNCTION int exampleICholUnblocked(const string file_input, const int max_task_dependence, const int team_size, const bool verbose) { typedef ValueType value_type; typedef OrdinalType ordinal_type; typedef SizeType size_type; typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType; typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType; typedef TaskTeamFactory<Kokkos::Experimental::TaskPolicy<SpaceType>, Kokkos::Experimental::Future<int,SpaceType>, Kokkos::Impl::TeamThreadRangeBoundariesStruct> TaskFactoryType; typedef ParallelFor ForType; typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; cout << "ICholUnblocked:: import input file = " << file_input << endl; CrsMatrixBaseType AA("AA"), UU("UU"); { timer.reset(); ifstream in; in.open(file_input); if (!in.good()) { cout << "Failed in open the file: " << file_input << endl; return ++r_val; } AA.importMatrixMarket(in); UU.copy(Uplo::Upper, AA); t = timer.seconds(); if (verbose) cout << UU << endl; } cout << "ICholUnblocked:: import input file::time = " << t << endl; #ifdef __USE_FIXED_TEAM_SIZE__ typename TaskFactoryType::policy_type policy(max_task_dependence); #else typename TaskFactoryType::policy_type policy(max_task_dependence, team_size); #endif TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); cout << "ICholUnblocked:: factorize the matrix" << endl; CrsTaskViewType U(&UU); U.fillRowViewArray(); { timer.reset(); auto future = TaskFactoryType::Policy().create_team(IChol<Uplo::Upper,AlgoIChol::UnblockedOpt1> ::TaskFunctor<ForType,CrsTaskViewType>(U), 0); TaskFactoryType::Policy().spawn(future); Kokkos::Experimental::wait(TaskFactoryType::Policy()); t = timer.seconds(); if (verbose) cout << UU << endl; } cout << "ICholUnblocked:: factorize the matrix::time = " << t << endl; return r_val; }
// Numeric factorization driver (variant without tokens).
//
// Phases, in order:
//   1. When Options.btf is set, the current A is saved in ATEMP and BTF_A is
//      swapped in as A for the duration of the factorization.
//   2. Domain phase (only if btf_tabs_offset != 0): launches
//      kokkos_nfactor_domain over num_threads teams, then loops on
//      nfactor_domain_error(), relaunching kokkos_nfactor_domain_remalloc
//      until success, error, or more than BASKER_RESTART restarts.
//   3. Separator phase (only if btf_tabs_offset != 0): for each tree level
//      l = 1..tree.nlvls, launches kokkos_nfactor_sep2 and retries it in the
//      same way using nfactor_sep_error().
//   4. Diagonal-block phase (only if Options.btf): launches
//      kokkos_nfactor_diag and retries with kokkos_nfactor_diag_remalloc
//      using nfactor_diag_error().
//   5. Restores A from ATEMP when Options.btf is set.
//
// The parameter `option` is not referenced in this function.
// Always returns 0; errors detected by the *_error helpers only end the
// retry loops.
BASKER_INLINE
int Basker<Int, Entry, Exe_Space>::factor_notoken(Int option)
{
  //printf("factor no token called \n");

  gn = A.ncol;
  gm = A.nrow;
  BASKER_MATRIX ATEMP;

  //Kokkos::Impl::Timer tza;
  if(Options.btf == BASKER_TRUE)
    {
      //JDB: We can change this for the new interface
      gn = A.ncol;
      gm = A.nrow;
      ATEMP = A;
      A = BTF_A;
    }
  //printf("Switch time: %f \n", tza.seconds());

  //Split into Domain and Sep
  //----------------------Domain-------------------------//
  #ifdef BASKER_KOKKOS

  //====TIMER==
  #ifdef BASKER_TIME
  Kokkos::Impl::Timer timer;
  #endif
  //===TIMER===

  typedef Kokkos::TeamPolicy<Exe_Space> TeamPolicy;

  if(btf_tabs_offset != 0)
    {
      if(Options.verbose == BASKER_TRUE)
        {
          printf("Factoring Dom num_threads: %d \n", num_threads);
        }

      Int domain_restart = 0;
      kokkos_nfactor_domain <Int,Entry,Exe_Space> domain_nfactor(this);
      Kokkos::parallel_for(TeamPolicy(num_threads,1), domain_nfactor);
      Kokkos::fence();

      //=====Check for error======
      // Retry loop: thread_start is re-created and filled with
      // BASKER_MAX_IDX on every pass before the error check.
      while(true)
        {
          INT_1DARRAY thread_start;
          MALLOC_INT_1DARRAY(thread_start, num_threads+1);
          init_value(thread_start, num_threads+1, (Int) BASKER_MAX_IDX);
          int nt = nfactor_domain_error(thread_start);
          if((nt == BASKER_SUCCESS) || (nt == BASKER_ERROR) || (domain_restart > BASKER_RESTART))
            {
              break;
            }
          else
            {
              domain_restart++;
              if(Options.verbose == BASKER_TRUE)
                {
                  printf("restart \n");
                }
              kokkos_nfactor_domain_remalloc <Int, Entry, Exe_Space> diag_nfactor_remalloc(this, thread_start);
              Kokkos::parallel_for(TeamPolicy(num_threads,1), diag_nfactor_remalloc);
              Kokkos::fence();
            }
        }//end while

      //====TIMER===
      #ifdef BASKER_TIME
      printf("Time DOMAIN: %f \n", timer.seconds());
      timer.reset();
      #endif
      //====TIMER====

  #else// else basker_kokkos
  // NOTE(review): in this branch the closing brace after #endif has no
  // matching `if(btf_tabs_offset != 0) {` -- it looks like the non-Kokkos
  // build of this function is not expected to compile; confirm.
  #pragma omp parallel
  {
  }//end omp parallel
  #endif //end basker_kokkos
    }
  //-------------------End--Domain--------------------------//

  //printVec("domperm.csc", gpermi, A.nrow);

  //---------------------------Sep--------------------------//
  if(btf_tabs_offset != 0)
    {
      //for(Int l=1; l<=4; l++)
      for(Int l=1; l <= tree.nlvls; l++)
        {
          //#ifdef BASKER_OLD_BARRIER
          //Int lthreads = pow(2,l);
          //Int lnteams = num_threads/lthreads;
          //#else
          Int lthreads = 1;
          Int lnteams = num_threads/lthreads;
          //#endif

          Int sep_restart = 0;

          if(Options.verbose == BASKER_TRUE)
            {
              printf("Factoring Sep num_threads: %d %d \n", lnteams, lthreads);
            }

          #ifdef BASKER_KOKKOS
          Kokkos::Impl::Timer timer_inner_sep;
          #ifdef BASKER_NO_LAMBDA

          //kokkos_nfactor_sep <Int, Entry, Exe_Space>
          //sep_nfactor(this, l);
          kokkos_nfactor_sep2 <Int, Entry, Exe_Space> sep_nfactor(this,l);
          Kokkos::parallel_for(TeamPolicy(lnteams,lthreads), sep_nfactor);
          Kokkos::fence();

          //======Check for error=====
          // Unlike the domain and diagonal phases, a restart here relaunches
          // the same sep_nfactor functor rather than a *_remalloc functor.
          while(true)
            {
              INT_1DARRAY thread_start;
              MALLOC_INT_1DARRAY(thread_start, num_threads+1);
              init_value(thread_start, num_threads+1, (Int) BASKER_MAX_IDX);
              int nt = nfactor_sep_error(thread_start);
              if((nt == BASKER_SUCCESS)|| (nt == BASKER_ERROR) || (sep_restart > BASKER_RESTART))
                {
                  FREE_INT_1DARRAY(thread_start);
                  break;
                }
              else
                {
                  sep_restart++;
                  if (Options.verbose == BASKER_TRUE)
                    {
                      printf("restart \n");
                    }
                  Kokkos::parallel_for(TeamPolicy(lnteams,lthreads), sep_nfactor);
                  Kokkos::fence();
                }
            }//end while-true

          #ifdef BASKER_TIME
          printf("Time INNERSEP: %d %f \n", l, timer_inner_sep.seconds());
          #endif
          #else //ELSE BASKER_NO_LAMBDA
          //Note: to be added
          #endif //end BASKER_NO_LAMBDA
          #else
          #pragma omp parallel
          {
          }//end omp parallel
          #endif
        }//end over each level

      #ifdef BASKER_TIME
      printf("Time SEP: %f \n", timer.seconds());
      #endif
    }
  //-------------------------End Sep----------------//

  //-------------------IF BTF-----------------------//
  if(Options.btf == BASKER_TRUE)
    {
      Int btf_restart = 0;

      if(Options.verbose == BASKER_TRUE)
        {
          printf("Factoring BLKs num_threads: %d \n", num_threads);
        }

      //=====Timer
      #ifdef BASKER_TIME
      Kokkos::Impl::Timer timer_btf;
      #endif
      //====Timer

      //======Call diag factor====
      kokkos_nfactor_diag <Int, Entry, Exe_Space> diag_nfactor(this);
      Kokkos::parallel_for(TeamPolicy(num_threads,1), diag_nfactor);
      Kokkos::fence();

      //=====Check for error======
      while(true)
        {
          INT_1DARRAY thread_start;
          MALLOC_INT_1DARRAY(thread_start, num_threads+1);
          init_value(thread_start, num_threads+1, (Int) BASKER_MAX_IDX);
          int nt = nfactor_diag_error(thread_start);
          //printf("RETURNED: %d \n", nt);
          if((nt == BASKER_SUCCESS) || (nt == BASKER_ERROR) || (btf_restart > BASKER_RESTART))
            {
              break;
            }
          else
            {
              btf_restart++;
              if (Options.verbose == BASKER_TRUE)
                {
                  printf("restart \n");
                }
              kokkos_nfactor_diag_remalloc <Int, Entry, Exe_Space> diag_nfactor_remalloc(this, thread_start);
              Kokkos::parallel_for(TeamPolicy(num_threads,1), diag_nfactor_remalloc);
              Kokkos::fence();
            }
        }//end while

      //====TIMER
      #ifdef BASKER_TIME
      printf("Time BTF: %f \n", timer_btf.seconds());
      #endif
      //===TIMER
    }//end btf call

  // tzback is only read by the commented-out printf below.
  Kokkos::Impl::Timer tzback;
  if(Options.btf == BASKER_TRUE)
    {
      // Undo the A <-> BTF_A swap performed at the top of the function.
      A = ATEMP;
    }
  //printf("Switch back: %f \n",
  //	    tzback.seconds());

  return 0;
}//end factor_notoken()
int exampleDenseMatrixBase(const ordinal_type mmin, const ordinal_type mmax, const ordinal_type minc, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); std::cout << std::endl; typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; std::cout << "DenseMatrixBase:: test matrices " <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << std::endl; for (auto m=mmin; m<=mmax; m+=minc) { // random test matrix on host DenseMatrixBaseHostType TT("TT", m, m); for (ordinal_type j=0; j<TT.NumCols(); ++j) { for (ordinal_type i=0; i<TT.NumRows(); ++i) TT.Value(i,j) = 2.0*((value_type)std::rand()/(RAND_MAX)) - 1.0; TT.Value(j,j) = std::fabs(TT.Value(j,j)); } if (verbose) TT.showMe(std::cout) << std::endl; DenseMatrixBaseDeviceType AA("AA"); timer.reset(); AA.mirror(TT); double t_mirror = timer.seconds(); DenseMatrixBaseDeviceType BB("BB"); BB.createConfTo(AA); timer.reset(); DenseMatrixTools::copy(BB, AA); double t_copy = timer.seconds(); // check DenseMatrixBaseHostType RR("RR"); RR.createConfTo(BB); RR.mirror(BB); if (verbose) RR.showMe(std::cout) << std::endl; double err = 0.0; for (ordinal_type j=0; j<TT.NumCols(); ++j) for (ordinal_type i=0; i<TT.NumRows(); ++i) err += std::fabs(TT.Value(i,j) - RR.Value(i,j)); { const auto prec = std::cout.precision(); std::cout.precision(4); std::cout << std::scientific << "DenseMatrixBase:: dimension = " << m << " x " << m << ", " << "Mirroring to device = " << t_mirror << " [sec], " << "Elementwise copy on device = " << t_copy << " [sec], " << "Error = " << err 
<< std::endl; std::cout.unsetf(std::ios::scientific); std::cout.precision(prec); } } return r_val; }
// Run n timesteps of the Verlet integrator.
//
// Each step performs, in order: initial integration, either forward
// communication or a full exchange/borders/neighbor rebuild (depending on
// neighbor->decide()), force clearing, the enabled force computations
// (pair, bonded terms, kspace), optional reverse communication, post-force
// fixes, final integration, end-of-step fixes, and output when the step
// matches output->next.
//
// Before each force-style compute, atomKK->sync()/modified() are called with
// that style's execution_space and datamask_read/datamask_modify.
void VerletKokkos::run(int n)
{
  bigint ntimestep;
  int nflag,sortflag;

  // Cache the fix counts so the loop only tests local ints.
  int n_post_integrate = modify->n_post_integrate;
  int n_pre_exchange = modify->n_pre_exchange;
  int n_pre_neighbor = modify->n_pre_neighbor;
  int n_pre_force = modify->n_pre_force;
  int n_post_force = modify->n_post_force;
  int n_end_of_step = modify->n_end_of_step;

  if (atomKK->sortfreq > 0) sortflag = 1;
  else sortflag = 0;

  // `time` accumulates the ktimer intervals measured below; neither `time`
  // nor `count` is read anywhere else in this function.
  static double time = 0.0;
  static int count = 0;
  atomKK->sync(Device,ALL_MASK);
  Kokkos::Impl::Timer ktimer;

  for (int i = 0; i < n; i++) {

    ntimestep = ++update->ntimestep;
    ev_set(ntimestep);

    // initial time integration

    ktimer.reset();
    timer->stamp();
    modify->initial_integrate(vflag);
    time += ktimer.seconds();
    if (n_post_integrate) modify->post_integrate();
    timer->stamp(Timer::MODIFY);

    // regular communication vs neighbor list rebuild

    nflag = neighbor->decide();

    if (nflag == 0) {
      timer->stamp();
      comm->forward_comm();
      timer->stamp(Timer::COMM);
    } else {
      // added debug
      //atomKK->sync(Host,ALL_MASK);
      //atomKK->modified(Host,ALL_MASK);

      if (n_pre_exchange) {
        timer->stamp();
        modify->pre_exchange();
        timer->stamp(Timer::MODIFY);
      }
      // debug
      //atomKK->sync(Host,ALL_MASK);
      //atomKK->modified(Host,ALL_MASK);
      if (triclinic) domain->x2lamda(atomKK->nlocal);
      domain->pbc();
      if (domain->box_change) {
        domain->reset_box();
        comm->setup();
        if (neighbor->style) neighbor->setup_bins();
      }
      timer->stamp();

      // added debug
      //atomKK->sync(Device,ALL_MASK);
      //atomKK->modified(Device,ALL_MASK);

      comm->exchange();
      if (sortflag && ntimestep >= atomKK->nextsort) atomKK->sort();
      comm->borders();

      // added debug
      //atomKK->sync(Host,ALL_MASK);
      //atomKK->modified(Host,ALL_MASK);

      if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost);

      timer->stamp(Timer::COMM);
      if (n_pre_neighbor) {
        modify->pre_neighbor();
        timer->stamp(Timer::MODIFY);
      }
      neighbor->build();
      timer->stamp(Timer::NEIGH);
    }

    // force computations
    // important for pair to come before bonded contributions
    // since some bonded potentials tally pairwise energy/virial
    // and Pair:ev_tally() needs to be called before any tallying

    force_clear();

    timer->stamp();

    // added for debug
    //atomKK->k_x.sync<LMPHostType>();
    //atomKK->k_f.sync<LMPHostType>();
    //atomKK->k_f.modify<LMPHostType>();
    if (n_pre_force) {
      modify->pre_force(vflag);
      timer->stamp(Timer::MODIFY);
    }

    if (pair_compute_flag) {
      atomKK->sync(force->pair->execution_space,force->pair->datamask_read);
      atomKK->modified(force->pair->execution_space,force->pair->datamask_modify);
      force->pair->compute(eflag,vflag);
      timer->stamp(Timer::PAIR);
    }

    if (atomKK->molecular) {
      if (force->bond) {
        atomKK->sync(force->bond->execution_space,force->bond->datamask_read);
        atomKK->modified(force->bond->execution_space,force->bond->datamask_modify);
        force->bond->compute(eflag,vflag);
      }
      if (force->angle) {
        atomKK->sync(force->angle->execution_space,force->angle->datamask_read);
        atomKK->modified(force->angle->execution_space,force->angle->datamask_modify);
        force->angle->compute(eflag,vflag);
      }
      if (force->dihedral) {
        atomKK->sync(force->dihedral->execution_space,force->dihedral->datamask_read);
        atomKK->modified(force->dihedral->execution_space,force->dihedral->datamask_modify);
        force->dihedral->compute(eflag,vflag);
      }
      if (force->improper) {
        atomKK->sync(force->improper->execution_space,force->improper->datamask_read);
        atomKK->modified(force->improper->execution_space,force->improper->datamask_modify);
        force->improper->compute(eflag,vflag);
      }
      // All bonded terms are charged to a single BOND timer slot.
      timer->stamp(Timer::BOND);
    }

    if (kspace_compute_flag) {
      atomKK->sync(force->kspace->execution_space,force->kspace->datamask_read);
      atomKK->modified(force->kspace->execution_space,force->kspace->datamask_modify);
      force->kspace->compute(eflag,vflag);
      timer->stamp(Timer::KSPACE);
    }

    // reverse communication of forces
    // (only reverse_comm() is conditional; the COMM stamp always runs)

    if (force->newton) comm->reverse_comm();
    timer->stamp(Timer::COMM);

    // force modifications, final time integration, diagnostics

    ktimer.reset();

    if (n_post_force) modify->post_force(vflag);
    modify->final_integrate();
    if (n_end_of_step) modify->end_of_step();
    timer->stamp(Timer::MODIFY);

    time += ktimer.seconds();

    // all output

    if (ntimestep == output->next) {
      // Sync everything to the host before writing output.
      atomKK->sync(Host,ALL_MASK);

      timer->stamp();
      output->write(ntimestep);
      timer->stamp(Timer::OUTPUT);
    }
  }
}
// Task-spawn timing example.
//
// For use_barrier = 0 and 1, times four scenarios with SimpleTask:
//   - one task created with create(),
//   - one task created with create_team(),
//   - ntasks tasks created with create(),
//   - ntasks tasks created with create_team().
// Each scenario spawns its tasks, waits on the policy, and prints the
// elapsed time.
//
//   ntasks              number of tasks in the multi-task scenarios; must
//                       not exceed MAXTASKS (size of the local future array)
//   max_task_dependence forwarded as the dependence capacity on creation
//   team_size, verbose  not referenced in this function
//
// Returns 0 on success, -1 when ntasks exceeds MAXTASKS.
KOKKOS_INLINE_FUNCTION
int exampleKokkosTaskData(const int ntasks,
                          const int max_task_dependence,
                          const int team_size,
                          const bool verbose) {
  typedef Kokkos::Experimental::TaskPolicy<SpaceType> policy_type ;
  typedef SimpleTask<policy_type> simple_task_type;
  typedef Kokkos::Experimental::Future<typename simple_task_type::value_type,SpaceType> future_type ;

  // The futures below live in fixed-size arrays of MAXTASKS entries; refuse
  // larger requests instead of writing past the end of those arrays.
  if (ntasks > MAXTASKS) {
    cout << "KokkosTaskData:: ntasks (" << ntasks
         << ") exceeds MAXTASKS (" << MAXTASKS << ")" << endl;
    return -1;
  }

  policy_type policy;

  Kokkos::Impl::Timer timer;

  for (int use_barrier=0;use_barrier<2;++use_barrier) {
    cout << "KokkosTaskData:: use barrier " << (use_barrier ? "yes" : "no") << endl;

    {
      // single task
      timer.reset();

      future_type f = policy.create(simple_task_type(use_barrier), max_task_dependence);
      policy.spawn(f);

      Kokkos::Experimental::wait( policy );

      const double t = timer.seconds();
      cout << "KokkosTaskData:: single task is spawned :: time = " << t << endl;
    }

    {
      // single team task
      timer.reset();

      future_type f = policy.create_team(simple_task_type(use_barrier), max_task_dependence);
      policy.spawn(f);

      Kokkos::Experimental::wait( policy );

      const double t = timer.seconds();
      cout << "KokkosTaskData:: single team task is spawned :: time = " << t << endl;
    }

    {
      // ntasks independent tasks
      timer.reset();

      future_type f[MAXTASKS];
      for (int i=0;i<ntasks;++i) {
        f[i] = policy.create(simple_task_type(use_barrier), max_task_dependence);
        policy.spawn(f[i]);
      }

      Kokkos::Experimental::wait( policy );

      const double t = timer.seconds();
      cout << "KokkosTaskData:: " << ntasks << " tasks are spawned :: time = " << t << endl;
    }

    {
      // ntasks independent team tasks
      timer.reset();

      future_type f[MAXTASKS];
      for (int i=0;i<ntasks;++i) {
        f[i] = policy.create_team(simple_task_type(use_barrier), max_task_dependence);
        policy.spawn(f[i]);
      }

      Kokkos::Experimental::wait( policy );

      const double t = timer.seconds();
      cout << "KokkosTaskData:: " << ntasks << " team tasks are spawned :: time = " << t << endl;
    }
  }

  return 0;
}
void graph_color_symbolic( KernelHandle *handle, typename KernelHandle::row_lno_t num_rows, typename KernelHandle::row_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_symmetric = true){ Kokkos::Impl::Timer timer; typename KernelHandle::GraphColoringHandleType *gch = handle->get_graph_coloring_handle(); ColoringAlgorithm algorithm = gch->get_coloring_type(); typedef typename KernelHandle::GraphColoringHandleType::color_view_t color_view_type; color_view_type colors_out = color_view_type("Graph Colors", num_rows); typedef typename Impl::GraphColor <typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, lno_nnz_view_t_> BaseGraphColoring; BaseGraphColoring *gc = NULL; switch (algorithm){ case COLORING_SERIAL: gc = new BaseGraphColoring( num_rows, entries.dimension_0(), row_map, entries, gch); break; case COLORING_VB: case COLORING_VBBIT: case COLORING_VBCS: typedef typename Impl::GraphColor_VB <typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, lno_nnz_view_t_> VBGraphColoring; gc = new VBGraphColoring( num_rows, entries.dimension_0(), row_map, entries, gch); break; case COLORING_EB: typedef typename Impl::GraphColor_EB <typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, lno_nnz_view_t_> EBGraphColoring; gc = new EBGraphColoring(num_rows, entries.dimension_0(),row_map, entries, gch); break; case COLORING_DEFAULT: break; } int num_phases = 0; gc->color_graph(colors_out, num_phases); delete gc; double coloring_time = timer.seconds(); gch->add_to_overall_coloring_time(coloring_time); gch->set_coloring_time(coloring_time); gch->set_num_phases(num_phases); gch->set_vertex_colors(colors_out); }
void driver_modified_gram_schmidt < #if defined( __CUDACC__ ) Kokkos::Cuda #else Kokkos::Threads #endif > ( const int length_begin , const int length_end , const int count , const int iter , comm::Machine machine ) { #if defined( __CUDACC__ ) typedef Kokkos::Cuda Device ; #else typedef Kokkos::Threads Device ; #endif const int comm_size = comm::size( machine ); const int comm_rank = comm::rank( machine ); if ( comm_rank == 0 ) { std::cout << ( Kokkos::Impl::is_same<Device,Kokkos::Cuda>::value ? "\"Cuda\"" : "\"Threads\"" ) << " , \"Double Precision\"" << std::endl ; std::cout << "\"Length\" , \"Count\" , \"millisec\"" << " , \"Gflops\" , \"Greads\" , \"Gwrites\"" << " , \"Gflops/s\" , \"Read GB/s\" , \"Write GB/s\"" << std::endl ; } for ( int length = length_begin ; length < length_end ; length *= 2 ) { const ModifedGramSchmidCounts counts( length , count ); const int local_length_upper = ( length + comm_size - 1 ) / comm_size ; const int local_begin = std::min( length , local_length_upper * comm_rank ); const int local_next = std::min( length , local_length_upper * ( comm_rank + 1 ) ); const int local_length = local_next - local_begin ; typedef Kokkos::View< double ** , Kokkos::LayoutLeft , Device > matrix_double_type ; const matrix_double_type Q( "Q" , local_length , count ); const matrix_double_type R( "R" , count , count ); const matrix_double_type::HostMirror hQ = Kokkos::create_mirror_view( Q ); for ( int j = 0 ; j < count ; ++j ) { for ( int i = 0 ; i < local_length ; ++i ) { hQ(i,j) = ( i + 1 ) * ( j + 1 ); } } double dt_min = 0 ; for ( int j = 0 ; j < iter ; ++j ) { Kokkos::deep_copy( Q , hQ ); Kokkos::Impl::Timer timer ; modified_gram_schmidt( Q , R , machine ); const double dt = comm::max( machine , timer.seconds() ); if ( 0 == j || dt < dt_min ) dt_min = dt ; } if ( 0 == comm_rank ) { const double milli_sec = dt_min * 1.0e3 ; const double giga_flops = ( counts.flops / dt_min ) / 1.0e9 ; const double GB_reads = ( counts.reads * sizeof(double) ) / ( 
dt_min * 1.0e9 ); const double GB_writes = ( counts.writes * sizeof(double) ) / ( dt_min * 1.0e9 ); std::cout << length << " , " << count << " , " << milli_sec << " , " << double(counts.flops) / 1.0e9 << " , " << double(counts.reads) / 1.0e9 << " , " << double(counts.writes) / 1.0e9 << " , " << giga_flops << " , " << GB_reads << " , " << GB_writes << std::endl ; } } }
// Direct sparse Cholesky example with a manufactured solution.
//
// Steps: import a MatrixMarket file, reorder (GraphHelper_Scotch), build the
// symbolic nonzero pattern UU, build hierarchical views of UU and of the
// dense right-hand-side / solution matrices, manufacture B = A*X with
// X(i,j) = j+1, factorize (serial unblocked when `serial`, otherwise
// task-parallel ByBlocks), optionally solve, and optionally compare the
// computed X against the manufactured values.
//
// nthreads and verbose are not referenced in this function.
//
// Returns 0 on success, 1 when the input file cannot be opened.
KOKKOS_INLINE_FUNCTION
int exampleCholDirectPlain(const string file_input,
                           const int prunecut,
                           const int seed,
                           const int nrhs,
                           const int nb,
                           const int nthreads,
                           const int max_task_dependence,
                           const int team_size,
                           const int league_size,
                           const bool team_interface,
                           const bool serial,
                           const bool solve,
                           const bool check,
                           const bool verbose) {
  typedef ValueType value_type;
  typedef OrdinalType ordinal_type;
  typedef SizeType size_type;

  typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>, Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

  typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
  typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
  typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

  typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
  typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;

  typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

  typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
  typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

  typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

  typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType;
  typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType;

  typedef DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> DenseHierMatrixBaseType;

  typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType;
  typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType;

  int r_val = 0;

  Kokkos::Impl::Timer timer;
  double t_import = 0.0, t_reorder = 0.0, t_symbolic = 0.0, t_flat2hier = 0.0, t_factor = 0.0, t_solve = 0.0;

  cout << "CholDirectPlain:: import input file = " << file_input << endl;
  CrsMatrixBaseType AA("AA");
  {
    timer.reset();

    ifstream in;
    in.open(file_input);
    if (!in.good()) {
      cout << "Failed in open the file: " << file_input << endl;
      return ++r_val;
    }
    AA.importMatrixMarket(in);

    t_import = timer.seconds();
  }
  cout << "CholDirectPlain:: import input file::time = " << t_import << endl;

  // matrix A and its upper triangular factors U
  CrsMatrixBaseType PA("Permuted AA");
  CrsMatrixBaseType UU("UU"); // permuted base upper triangular matrix
  CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views

  // right hand side and solution matrix
  DenseMatrixBaseType BB("BB", AA.NumRows(), nrhs), XX("XX", AA.NumRows(), nrhs);
  DenseHierMatrixBaseType HB("HB"), HX("HX");

  {
    cout << "CholDirectPlain:: reorder the matrix" << endl;
    GraphHelperType S(AA, seed);
    {
      timer.reset();

      S.computeOrdering();
      S.pruneTree(prunecut);

      PA.copy(S.PermVector(), S.InvPermVector(), AA);

      t_reorder = timer.seconds();
    }
    cout << "CholDirectPlain:: reorder the matrix::time = " << t_reorder << endl;

    {
      SymbolicFactorHelperType F(PA, league_size);
      timer.reset();
      F.createNonZeroPattern(Uplo::Upper, UU);
      t_symbolic = timer.seconds();
      cout << "CholDirectPlain:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl;
    }
    cout << "CholDirectPlain:: symbolic factorization::time = " << t_symbolic << endl;

    {
      timer.reset();
      CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                 S.NumBlocks(),
                                 S.RangeVector(),
                                 S.TreeVector());

      // fill internal meta data for sparse blocks
      for (ordinal_type k=0;k<HU.NumNonZeros();++k)
        HU.Value(k).fillRowViewArray();

      // The dense matrices use the same block ranges as HU, with nb passed
      // as the last argument.
      DenseMatrixHelper::flat2hier(BB, HB,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   nb);

      DenseMatrixHelper::flat2hier(XX, HX,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   nb);

      t_flat2hier = timer.seconds();
      cout << "CholDirectPlain:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
    }
    cout << "CholDirectPlain:: construct hierarchical matrix::time = " << t_flat2hier << endl;
  }

  {
    // Policy setup
#ifdef __USE_FIXED_TEAM_SIZE__
    typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
    typename TaskFactoryType::policy_type policy(max_task_dependence, 1);
#endif
    TaskFactoryType::setUseTeamInterface(team_interface);
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    // Flat task views (used by the serial path and the check) ...
    CrsTaskViewType A(&PA), U(&UU);
    DenseTaskViewType X(&XX), B(&BB);
    A.fillRowViewArray();
    U.fillRowViewArray();

    // ... and hierarchical task views (used by the ByBlocks path).
    CrsHierTaskViewType TU(&HU);
    DenseHierTaskViewType TB(&HB), TX(&HX);

    {
      // Manufacture B = AX
      const int m = A.NumRows();
      for (int j=0;j<nrhs;++j)
        for (int i=0;i<m;++i)
          X.Value(i,j) = (j+1);

      Gemm<Trans::NoTranspose,Trans::NoTranspose,AlgoGemm::ForTriSolveBlocked>
        ::invoke(TaskFactoryType::Policy(),
                 TaskFactoryType::Policy().member_single(),
                 1.0, A, X, 0.0, B);

      // XX now holds the right-hand side; the solves below overwrite it.
      XX.copy(BB);
    }

    if (serial) {
      cout << "CholDirectPlain:: Serial factorize the matrix" << endl;
      timer.reset();
      Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
        ::invoke(TaskFactoryType::Policy(),
                 TaskFactoryType::Policy().member_single(),
                 U);
      t_factor = timer.seconds();
      cout << "CholDirectPlain:: Serial factorize the matrix::time = " << t_factor << endl;
    } else {
      cout << "CholDirectPlain:: ByBlocks factorize the matrix:: team_size = " << team_size << endl;
      timer.reset();
      auto future = TaskFactoryType::Policy().create_team
        (Chol<Uplo::Upper,AlgoChol::ByBlocks>
         ::TaskFunctor<CrsHierTaskViewType>(TU), 0);
      TaskFactoryType::Policy().spawn(future);
      Kokkos::Experimental::wait(TaskFactoryType::Policy());
      t_factor = timer.seconds();
      cout << "CholDirectPlain:: ByBlocks factorize the matrix::time = " << t_factor << endl;
    }

    if (solve) {
      if (serial) {
        cout << "CholDirectPlain:: Serial forward/backward solve" << endl;
        timer.reset();
        // forward solve with the conjugate transpose of U, then backward
        // solve with U itself
        TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
          ::invoke(TaskFactoryType::Policy(),
                   TaskFactoryType::Policy().member_single(),
                   Diag::NonUnit, U, X);
        TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
          ::invoke(TaskFactoryType::Policy(),
                   TaskFactoryType::Policy().member_single(),
                   Diag::NonUnit, U, X);
        t_solve = timer.seconds();
        cout << "CholDirectPlain:: Serial forward/backward solve::time = " << t_solve << endl;
      } else {
        cout << "CholDirectPlain:: ByBlocks forward/backward solve" << endl;
        timer.reset();
        auto future_forward_solve = TaskFactoryType::Policy().create_team
          (TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::ByBlocks>
           ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
           (Diag::NonUnit, TU, TX), 0);
        TaskFactoryType::Policy().spawn(future_forward_solve);

        // The backward solve is created with room for one dependence and is
        // made to depend on the forward solve before it is spawned.
        auto future_backward_solve = TaskFactoryType::Policy().create_team
          (TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::ByBlocks>
           ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
           (Diag::NonUnit, TU, TX), 1);

        TaskFactoryType::Policy().add_dependence(future_backward_solve, future_forward_solve);
        TaskFactoryType::Policy().spawn(future_backward_solve);

        Kokkos::Experimental::wait(TaskFactoryType::Policy());
        t_solve = timer.seconds();
        cout << "CholDirectPlain:: ByBlocks forward/backward solve::time = " << t_solve << endl;
      }
    }

    if (solve && check) {
      // Check manufactured solution
      double l2 = 0.0, linf = 0.0;
      const int m = A.NumRows();
      for (int j=0;j<nrhs;++j)
        for (int i=0;i<m;++i) {
          // NOTE(review): unqualified abs -- presumably resolves to a
          // floating-point overload via a using-directive; confirm.
          double diff = abs(X.Value(i,j) - (j+1));
          l2 += diff*diff;
          linf = max(diff, linf);
        }
      l2 = sqrt(l2);
      cout << "CholDirectPlain:: Check solution::L2 = " << l2 << ", Linf = " << linf << endl;
    }
  }

  return r_val;
}
int exampleDenseGemmByBlocks(const ordinal_type mmin, const ordinal_type mmax, const ordinal_type minc, const ordinal_type k, const ordinal_type mb, const int max_concurrency, const int max_task_dependence, const int team_size, const int mkl_nthreads, const bool check, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType; typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType; typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType; typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType; typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType; typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType; typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; std::cout << "DenseGemmByBlocks:: test matrices " <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << " , mb = " << mb << std::endl; const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); std::ostringstream os; os.precision(3); os << std::scientific; for (ordinal_type m=mmin;m<=mmax;m+=minc) { os.str(""); // host matrices DenseMatrixBaseHostType AA_host, BB_host, CC_host("CC_host", m, m), CB_host("CB_host", 
m, m); { if (ArgTransA == Trans::NoTranspose) AA_host = DenseMatrixBaseHostType("AA_host", m, k); else AA_host = DenseMatrixBaseHostType("AA_host", k, m); if (ArgTransB == Trans::NoTranspose) BB_host = DenseMatrixBaseHostType("BB_host", k, m); else BB_host = DenseMatrixBaseHostType("BB_host", m, k); for (ordinal_type j=0;j<AA_host.NumCols();++j) for (ordinal_type i=0;i<AA_host.NumRows();++i) AA_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; for (ordinal_type j=0;j<BB_host.NumCols();++j) for (ordinal_type i=0;i<BB_host.NumRows();++i) BB_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; for (ordinal_type j=0;j<CC_host.NumCols();++j) for (ordinal_type i=0;i<CC_host.NumRows();++i) CC_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; DenseMatrixTools::copy(CB_host, CC_host); } const double flop = DenseFlopCount<value_type>::Gemm(m, m, k); #ifdef HAVE_SHYLUTACHO_MKL mkl_set_num_threads(mkl_nthreads); #endif os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k << " "; if (check) { timer.reset(); DenseMatrixViewHostType A_host(AA_host), B_host(BB_host), C_host(CB_host); Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas,Variant::One>::invoke (policy, policy.member_single(), 1.0, A_host, B_host, 1.0, C_host); t = timer.seconds(); os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } DenseMatrixBaseDeviceType AA_device("AA_device"), BB_device("BB_device"), CC_device("CC_device"); { timer.reset(); AA_device.mirror(AA_host); BB_device.mirror(BB_host); CC_device.mirror(CC_host); t = timer.seconds(); os << ":: Mirror = " << t << " [sec] "; } { DenseHierMatrixBaseDeviceType HA_device("HA_device"), HB_device("HB_device"), HC_device("HC_device"); DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb); DenseMatrixTools::createHierMatrix(HB_device, BB_device, mb, mb); DenseMatrixTools::createHierMatrix(HC_device, CC_device, mb, mb); DenseHierTaskViewDeviceType TA_device(HA_device), TB_device(HB_device), 
TC_device(HC_device); timer.reset(); auto future = policy.proc_create_team (Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,ArgVariant> ::createTaskFunctor(policy, 1.0, TA_device, TB_device, 1.0, TC_device), 0); policy.spawn(future); Kokkos::Experimental::wait(policy); t = timer.seconds(); os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } CC_host.mirror(CC_device); if (check) { double err = 0.0, norm = 0.0; for (ordinal_type j=0;j<CC_host.NumCols();++j) for (ordinal_type i=0;i<CC_host.NumRows();++i) { const double diff = abs(CC_host.Value(i,j) - CB_host.Value(i,j)); const double val = CB_host.Value(i,j); err += diff*diff; norm += val*val; } os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err); } std::cout << os.str() << std::endl; } return r_val; }