void Loop(int loop, int test, const char* type_name) { LoopVariant<T>(loop,test); Kokkos::Impl::Timer timer; T res = LoopVariant<T>(loop,test); double time1 = timer.seconds(); timer.reset(); T resNonAtomic = LoopVariantNonAtomic<T>(loop,test); double time2 = timer.seconds(); timer.reset(); T resSerial = LoopVariantSerial<T>(loop,test); double time3 = timer.seconds(); time1*=1e6/loop; time2*=1e6/loop; time3*=1e6/loop; textcolor_standard(); bool passed = true; if(resSerial!=res) passed = false; if(!passed) textcolor(RESET,BLACK,YELLOW); printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",type_name,test,passed?"PASSED":"FAILED",loop,1.0*resSerial,1.0*res,1.0*resNonAtomic,time1,time2,time3,(int)sizeof(T)); if(!passed) textcolor_standard(); printf("\n"); }
size_t test_global_to_local_ids(unsigned num_ids, unsigned capacity, unsigned num_find_iterations) { typedef Device execution_space; typedef typename execution_space::size_type size_type; typedef Kokkos::View<uint32_t*,execution_space> local_id_view; typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view; double elasped_time = 0; Kokkos::Impl::Timer timer; local_id_view local_2_global("local_ids", num_ids); global_id_view global_2_local(capacity); int shiftw = 15; //create elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "allocate: " << elasped_time << std::endl; timer.reset(); // generate unique ids { generate_ids<Device> gen(local_2_global); } // generate elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "generate: " << elasped_time << std::endl; timer.reset(); { fill_map<Device> fill(global_2_local, local_2_global); } // fill elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "fill: " << elasped_time << std::endl; timer.reset(); size_t num_errors = global_2_local.failed_insert(); if (num_errors == 0u) { for (unsigned i=0; i<num_find_iterations; ++i) { find_test<Device> find(global_2_local, local_2_global,num_errors); } // find elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "lookup: " << elasped_time << std::endl; } else { std::cout << " !!! Fill Failed !!!" << std::endl; } return num_errors; }
int main(int narg, char* arg[]) { Kokkos::initialize(narg,arg); int size = 1000000; // Create DualViews. This will allocate on both the device and its // host_mirror_device. idx_type idx("Idx",size,64); view_type dest("Dest",size); view_type src("Src",size); srand(134231); // Get a reference to the host view of idx directly (equivalent to // idx.view<idx_type::host_mirror_device_type>() ) idx_type::t_host h_idx = idx.h_view; for (int i = 0; i < size; ++i) { for (view_type::size_type j=0; j < h_idx.dimension_1 (); ++j) { h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size; } } // Mark idx as modified on the host_mirror_device_type so that a // sync to the device will actually move data. The sync happens in // the functor's constructor. idx.modify<idx_type::host_mirror_device_type>(); // Run on the device. This will cause a sync of idx to the device, // since it was marked as modified on the host. Kokkos::Impl::Timer timer; Kokkos::parallel_for(size,localsum<view_type::device_type>(idx,dest,src)); Kokkos::fence(); double sec1_dev = timer.seconds(); timer.reset(); Kokkos::parallel_for(size,localsum<view_type::device_type>(idx,dest,src)); Kokkos::fence(); double sec2_dev = timer.seconds(); // Run on the host (could be the same as device). This will cause a // sync back to the host of dest. Note that if the Device is CUDA, // the data layout will not be optimal on host, so performance is // lower than what it would be for a pure host compilation. timer.reset(); Kokkos::parallel_for(size,localsum<view_type::host_mirror_device_type>(idx,dest,src)); Kokkos::fence(); double sec1_host = timer.seconds(); timer.reset(); Kokkos::parallel_for(size,localsum<view_type::host_mirror_device_type>(idx,dest,src)); Kokkos::fence(); double sec2_host = timer.seconds(); printf("Device Time with Sync: %lf without Sync: %lf \n",sec1_dev,sec2_dev); printf("Host Time with Sync: %lf without Sync: %lf \n",sec1_host,sec2_host); Kokkos::finalize(); }
void test_global_to_local_ids(unsigned num_ids) { typedef Device device_type; typedef typename device_type::size_type size_type; typedef Kokkos::View<uint32_t*,device_type> local_id_view; typedef Kokkos::UnorderedMap<uint32_t,size_type,device_type> global_id_view; //size std::cout << num_ids << ", "; double elasped_time = 0; Kokkos::Impl::Timer timer; local_id_view local_2_global("local_ids", num_ids); global_id_view global_2_local((3u*num_ids)/2u); //create elasped_time = timer.seconds(); std::cout << elasped_time << ", "; timer.reset(); // generate unique ids { generate_ids<Device> gen(local_2_global); } Device::fence(); // generate elasped_time = timer.seconds(); std::cout << elasped_time << ", "; timer.reset(); { fill_map<Device> fill(global_2_local, local_2_global); } Device::fence(); // fill elasped_time = timer.seconds(); std::cout << elasped_time << ", "; timer.reset(); size_t num_errors = 0; for (int i=0; i<100; ++i) { find_test<Device> find(global_2_local, local_2_global,num_errors); } Device::fence(); // find elasped_time = timer.seconds(); std::cout << elasped_time << std::endl; ASSERT_EQ( num_errors, 0u); }
int main(int narg, char* arg[]) { Kokkos::initialize(narg,arg); int size = 1000000; // Create Views idx_type idx("Idx",size,64); view_type dest("Dest",size); view_type src("Src",size); srand(134231); // When using UVM Cuda views can be accessed on the Host directly for(int i=0; i<size; i++) { for(int j=0; j<idx.dimension_1(); j++) idx(i,j) = (size + i + (rand()%500 - 250))%size; } Kokkos::fence(); // Run on the device // This will cause a sync of idx to the device since it was modified on the host Kokkos::Impl::Timer timer; Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src)); Kokkos::fence(); double sec1_dev = timer.seconds(); // No data transfer will happen now, since nothing is accessed on the host timer.reset(); Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src)); Kokkos::fence(); double sec2_dev = timer.seconds(); // Run on the host // This will cause a sync back to the host of dest which was changed on the device // Compare runtime here with the dual_view example: dest will be copied back in 4k blocks // when they are accessed the first time during the parallel_for. Due to the latency of a memcpy // this gives lower effective bandwidth when doing a manual copy via dual views timer.reset(); Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src)); Kokkos::fence(); double sec1_host = timer.seconds(); // No data transfers will happen now timer.reset(); Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src)); Kokkos::fence(); double sec2_host = timer.seconds(); printf("Device Time with Sync: %lf without Sync: %lf \n",sec1_dev,sec2_dev); printf("Host Time with Sync: %lf without Sync: %lf \n",sec1_host,sec2_host); Kokkos::finalize(); }
KOKKOS_INLINE_FUNCTION
int exampleCholByBlocks(const string file_input,
                        const int nthreads,
                        const int max_task_dependence,
                        const int team_size,
                        const bool verbose) {
  // Example driver: reads a Matrix Market file, reorders it with the Scotch
  // graph helper, builds a hierarchical (by-blocks) view of the upper part
  // and runs the task-based Cholesky on it, printing the time of each phase.
  // Returns 0 on success and 1 when the input file cannot be opened.

  typedef ValueType   value_type;
  typedef OrdinalType ordinal_type;
  typedef SizeType    size_type;

  typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
                      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

  typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
  typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;

  typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
  typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;

  typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;
  typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
  typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

  int r_val = 0;

  Kokkos::Impl::Timer timer;
  double elapsed = 0.0;

  // ---- Phase 1: import ----------------------------------------------------
  cout << "CholByBlocks:: import input file = " << file_input << endl;
  CrsMatrixBaseType AA("AA");
  {
    timer.reset();

    ifstream in;
    in.open(file_input);
    if (!in.good()) {
      cout << "Failed in open the file: " << file_input << endl;
      return ++r_val;
    }
    AA.importMatrixMarket(in);

    elapsed = timer.seconds();

    if (verbose)
      cout << AA << endl;
  }
  cout << "CholByBlocks:: import input file::time = " << elapsed << endl;

  // ---- Phase 2: reorder and build the hierarchical matrix -----------------
  cout << "CholByBlocks:: reorder the matrix" << endl;
  CrsMatrixBaseType     UU("UU");   // permuted base matrix
  CrsHierMatrixBaseType HU("HU");   // hierarchical matrix of views
  {
    timer.reset();

    // Graph of AA in row-pointer / column-index form for the ordering helper.
    typename GraphHelperType::size_type_array    rptr(AA.Label()+"Graph::RowPtrArray", AA.NumRows() + 1);
    typename GraphHelperType::ordinal_type_array cidx(AA.Label()+"Graph::ColIndexArray", AA.NumNonZeros());
    AA.convertGraph(rptr, cidx);

    GraphHelperType S(AA.Label()+"ScotchHelper", AA.NumRows(), rptr, cidx);
    S.computeOrdering();

    // Apply the permutation, keep the upper part, then partition into blocks.
    CrsMatrixBaseType PA("Permuted AA");
    PA.copy(S.PermVector(), S.InvPermVector(), AA);
    UU.copy(Uplo::Upper, PA);

    CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                               S.NumBlocks(),
                               S.RangeVector(),
                               S.TreeVector());

    for (ordinal_type blk = 0; blk < HU.NumNonZeros(); ++blk)
      HU.Value(blk).fillRowViewArray();

    elapsed = timer.seconds();

    if (verbose)
      cout << UU << endl;
  }
  cout << "CholByBlocks:: reorder the matrix::time = " << elapsed << endl;

  // ---- Task policy setup --------------------------------------------------
  const size_t max_concurrency = 16384;
  cout << "CholByBlocks:: max concurrency = " << max_concurrency << endl;

  const size_t max_task_size = 3*sizeof(CrsTaskViewType)+128;
  cout << "CholByBlocks:: max task size = " << max_task_size << endl;

  typename TaskFactoryType::policy_type policy(max_concurrency,
                                               max_task_size,
                                               max_task_dependence,
                                               team_size);

  TaskFactoryType::setMaxTaskDependence(max_task_dependence);
  TaskFactoryType::setPolicy(&policy);

  // ---- Phase 3: factorize -------------------------------------------------
  cout << "CholByBlocks:: factorize the matrix" << endl;
  CrsHierTaskViewType H(&HU);
  {
    timer.reset();

    // Spawn the root by-blocks Cholesky task and wait for the policy to drain.
    auto root_task = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks>::
                                                           TaskFunctor<CrsHierTaskViewType>(H), 0);
    TaskFactoryType::Policy().spawn(root_task);
    Kokkos::Experimental::wait(TaskFactoryType::Policy());

    elapsed = timer.seconds();

    if (verbose)
      cout << UU << endl;
  }
  cout << "CholByBlocks:: factorize the matrix::time = " << elapsed << endl;

  return r_val;
}
KOKKOS_INLINE_FUNCTION
int exampleSymbolicFactor(const string file_input,
                          const int treecut,
                          const int minblksize,
                          const int seed,
                          const int fill_level,
                          const int league_size,
                          const bool reorder,
                          const bool verbose) {
  // Example driver: reads a Matrix Market file, optionally reorders it with
  // the Scotch graph helper, and builds the non-zero pattern of the upper
  // factor for the requested fill level, printing the time of each phase.
  // Returns 0 on success and 1 when the input file cannot be opened.

  typedef ValueType   value_type;
  typedef OrdinalType ordinal_type;
  typedef SizeType    size_type;

  typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
  typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
  typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

  int r_val = 0;

  Kokkos::Impl::Timer timer;
  double elapsed = 0.0;

  // ---- Phase 1: import ----------------------------------------------------
  cout << "SymbolicFactor:: import input file = " << file_input << endl;
  CrsMatrixBaseType AA("AA");
  {
    timer.reset();

    ifstream in;
    in.open(file_input);
    if (!in.good()) {
      cout << "Failed in open the file: " << file_input << endl;
      return ++r_val;
    }
    AA.importMatrixMarket(in);

    elapsed = timer.seconds();

    cout << "SymbolicFactor:: AA nnz = " << AA.NumNonZeros() << endl;

    if (verbose)
      cout << AA << endl;
  }
  cout << "SymbolicFactor:: import input file::time = " << elapsed << endl;

  // ---- Phase 2: optional reordering ---------------------------------------
  CrsMatrixBaseType PA("Permuted AA");
  GraphHelperType S(AA, seed);
  if (!reorder) {
    // No reordering requested: work on the matrix as read.
    PA = AA;
    elapsed = 0.0;
  } else {
    timer.reset();

    S.computeOrdering(treecut, minblksize);
    PA.copy(S.PermVector(), S.InvPermVector(), AA);

    elapsed = timer.seconds();

    if (verbose)
      cout << S << endl
           << PA << endl;
  }
  cout << "SymbolicFactor:: reorder the matrix::time = " << elapsed << endl;

  // ---- Phase 3: symbolic factorization ------------------------------------
  CrsMatrixBaseType UU("UU");
  {
    timer.reset();

    SymbolicFactorHelperType symbolic(PA, league_size);
    symbolic.createNonZeroPattern(fill_level, Uplo::Upper, UU);

    elapsed = timer.seconds();

    cout << "SymbolicFactor:: UU nnz = " << UU.NumNonZeros() << endl;

    if (verbose) {
      cout << symbolic << endl;
      cout << UU << endl;
    }
  }
  cout << "SymbolicFactor:: factorize the matrix::time = " << elapsed << endl;

  return r_val;
}
BASKER_INLINE
int Basker<Int, Entry, Exe_Space>::factor_inc_lvl(Int option)
{
  // Incomplete-level numeric factorization driver.
  //
  // Steps, as visible in this routine:
  //   1. record the matrix dimensions in gn/gm and, when BTF is enabled,
  //      switch A to BTF_A;
  //   2. factor the domain blocks in parallel, re-running the remalloc
  //      kernel until nfactor_domain_error() reports BASKER_SUCCESS;
  //   3. factor the separators level by level (tree.nlvls levels);
  //   4. BTF diagonal blocks: the factorization itself is currently disabled,
  //      only the timer is reported.
  //
  // `option` is not used by this routine. Always returns 0.

  printf("Factor Inc Level Called \n");

  gn = A.ncol;
  gm = A.nrow;

  if(Options.btf == BASKER_TRUE)
    {
      //JDB: We can change this for the new interface
      //call reference copy constructor
      // (gn/gm were already taken from A above; nothing changes A in
      //  between, so they are not assigned a second time here.)
      A = BTF_A;
      //printf("\n\n Switching A, newsize: %d \n",
      //   A.ncol);
      //printMTX("A_FACTOR.mtx", A);
    }

  //Split into Domain and Sep
  //----------------------Domain-------------------------//
#ifdef BASKER_KOKKOS

  //====TIMER==
  #ifdef BASKER_TIME
  Kokkos::Impl::Timer timer;
  #endif
  //===TIMER===

  typedef Kokkos::TeamPolicy<Exe_Space> TeamPolicy;

  if(btf_tabs_offset != 0)
    {
      kokkos_nfactor_domain_inc_lvl <Int,Entry,Exe_Space>
        domain_nfactor(this);
      Kokkos::parallel_for(TeamPolicy(num_threads,1),
                           domain_nfactor);
      Kokkos::fence();

      //=====Check for error======
      // Repeat until no thread reports an error: each pass collects the
      // per-thread restart information and, on failure, launches the
      // remalloc kernel with it.
      while(true)
        {
          INT_1DARRAY thread_start;
          MALLOC_INT_1DARRAY(thread_start, num_threads+1);
          init_value(thread_start, num_threads+1,
                     (Int) BASKER_MAX_IDX);
          int nt = nfactor_domain_error(thread_start);
          if(nt == BASKER_SUCCESS)
            {
              break;
            }
          else
            {
              printf("restart \n");
              kokkos_nfactor_domain_remalloc <Int, Entry, Exe_Space>
                diag_nfactor_remalloc(this, thread_start);
              Kokkos::parallel_for(TeamPolicy(num_threads,1),
                                   diag_nfactor_remalloc);
              Kokkos::fence();
            }
        }//end while

      //====TIMER===
      #ifdef BASKER_TIME
      printf("Time DOMAIN: %f \n", timer.seconds());
      timer.reset();
      #endif
      //====TIMER====
    }
  // The closing brace of the if-block above is kept inside the
  // BASKER_KOKKOS branch so braces stay balanced in both configurations.

#else// else basker_kokkos
  #pragma omp parallel
  {
  }//end omp parallel
#endif //end basker_kokkos
  //-------------------End--Domain--------------------------//

  //---------------------------Sep--------------------------//
  if(btf_tabs_offset != 0)
    {
      //for(Int l=1; l<=1; l++)
      for(Int l=1; l <= tree.nlvls; l++)
        {
          //Come back for syncs

          //#ifdef BASKER_OLD_BARRIER
          // 2^l threads per team at level l; integer shift instead of
          // floating-point pow().
          Int lthreads = ((Int) 1) << l;
          Int lnteams  = num_threads/lthreads;
          //#else
          //Int lthreads = 1;
          //Int lnteams = num_threads/lthreads;
          //#endif

          //printf("\n\n   ============ SEP: %d ======\n\n",l);

#ifdef BASKER_KOKKOS
          Kokkos::Impl::Timer timer_inner_sep;
  #ifdef BASKER_NO_LAMBDA
          kokkos_nfactor_sep2_inc_lvl <Int, Entry, Exe_Space>
            sep_nfactor(this,l);
          Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),
                               sep_nfactor);
          Kokkos::fence();

    #ifdef BASKER_TIME
          // l has type Int, which need not be int; convert explicitly so
          // the argument matches the %d conversion.
          printf("Time INNERSEP: %d %f \n",
                 static_cast<int>(l), timer_inner_sep.seconds());
    #endif
  #else //ELSE BASKER_NO_LAMBDA
          //Note: to be added
  #endif //end BASKER_NO_LAMBDA
#else
          #pragma omp parallel
          {
          }//end omp parallel
#endif
        }//end over each level

#ifdef BASKER_TIME
      printf("Time SEP: %f \n", timer.seconds());
#endif
    }
  //-------------------------End Sep----------------//

  //-------------------IF BTF-----------------------//
  if(Options.btf == BASKER_TRUE)
    {
      //=====Timer
#ifdef BASKER_TIME
      Kokkos::Impl::Timer timer_btf;
#endif
      //====Timer

      // Disabled: diagonal-block factorization and its restart loop.
      //
      //======Call diag factor====
      // kokkos_nfactor_diag <Int, Entry, Exe_Space>
      //   diag_nfactor(this);
      // Kokkos::parallel_for(TeamPolicy(num_threads,1),
      //                      diag_nfactor);
      // Kokkos::fence();
      //
      //=====Check for error======
      // while(true)
      //   {
      //     INT_1DARRAY thread_start;
      //     MALLOC_INT_1DARRAY(thread_start, num_threads+1);
      //     init_value(thread_start, num_threads+1,
      //                (Int) BASKER_MAX_IDX);
      //     int nt = nfactor_diag_error(thread_start);
      //     if(nt == BASKER_SUCCESS)
      //       {
      //         break;
      //       }
      //     else
      //       {
      //         break;
      //         printf("restart \n");
      //         kokkos_nfactor_diag_remalloc <Int, Entry, Exe_Space>
      //           diag_nfactor_remalloc(this, thread_start);
      //         Kokkos::parallel_for(TeamPolicy(num_threads,1),
      //                              diag_nfactor);
      //         Kokkos::fence();
      //       }
      //   }//end while

      //====TIMER
#ifdef BASKER_TIME
      printf("Time BTF: %f \n", timer_btf.seconds());
#endif
      //===TIMER
    }//end btf call

  return 0;
}//end factor_lvl_inc()
KOKKOS_INLINE_FUNCTION int exampleCholUnblocked(const string file_input, const int max_task_dependence, const int team_size, const int algo, const int variant, const bool verbose) { typedef ValueType value_type; typedef OrdinalType ordinal_type; typedef SizeType size_type; typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType; typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType; typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>, Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType; typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; cout << "CholUnblocked:: import input file = " << file_input << endl; CrsMatrixBaseType AA("AA"), UU("UU"); { timer.reset(); ifstream in; in.open(file_input); if (!in.good()) { cout << "Failed in open the file: " << file_input << endl; return ++r_val; } AA.importMatrixMarket(in); UU.copy(Uplo::Upper, AA); t = timer.seconds(); if (verbose) cout << UU << endl; } cout << "CholUnblocked:: import input file::time = " << t << endl; #ifdef __USE_FIXED_TEAM_SIZE__ typename TaskFactoryType::policy_type policy(max_task_dependence); #else typename TaskFactoryType::policy_type policy(max_task_dependence, team_size); #endif TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); cout << "CholUnblocked:: factorize the matrix" << endl; CrsTaskViewType U(&UU); U.fillRowViewArray(); { timer.reset(); typename TaskFactoryType::future_type future; switch (algo) { case AlgoChol::UnblockedOpt: { if (variant == Variant::One) future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> ::TaskFunctor<CrsTaskViewType>(U), 0); else if (variant == Variant::Two) future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::Two> ::TaskFunctor<CrsTaskViewType>(U), 0); else { ERROR(">> Not supported algorithm 
variant"); } break; } case AlgoChol::Dummy: { future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::Dummy> ::TaskFunctor<CrsTaskViewType>(U), 0); break; } default: ERROR(">> Not supported algorithm"); break; } TaskFactoryType::Policy().spawn(future); Kokkos::Experimental::wait(TaskFactoryType::Policy()); t = timer.seconds(); if (verbose) cout << UU << endl; } cout << "CholUnblocked:: factorize the matrix::time = " << t << endl; return r_val; }
int exampleMatrixMarket(const std::string file_input, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType; int r_val = 0; Kokkos::Impl::Timer timer; CrsMatrixBaseHostType AA("AA"); timer.reset(); { std::ifstream in; in.open(file_input); if (!in.good()) { std::cout << "Failed in open the file: " << file_input << std::endl; return -1; } MatrixMarket::read(AA, in); } double t_read = timer.seconds(); timer.reset(); { std::string file_output = "mm-test-output.mtx"; std::ofstream out; out.open(file_output); if (!out.good()) { std::cout << "Failed in open the file: " << file_output << std::endl; return -1; } MatrixMarket::write(out, AA, "%% Test output"); } double t_write = timer.seconds(); { const auto prec = std::cout.precision(); std::cout.precision(4); std::cout << std::scientific << "MatrixMarket:: dimension = " << AA.NumRows() << " x " << AA.NumCols() << ", " << " nnz = " << AA.NumNonZeros() << ", " << "read = " << t_read << " [sec], " << "write = " << t_write << " [sec] " << std::endl; std::cout.unsetf(std::ios::scientific); std::cout.precision(prec); } if (verbose) { AA.showMe(std::cout) << std::endl; } CrsMatrixBaseHostType BB("BB"); BB.createConfTo(AA); CrsMatrixTools::copy(BB, Uplo::Upper, 0, AA); if (verbose) { BB.setLabel("Copy::AA:Upper::0"); BB.showMe(std::cout) << std::endl; } CrsMatrixTools::copy(BB, Uplo::Upper, 1, AA); if (verbose) { BB.setLabel("Copy::AA:Upper::1"); BB.showMe(std::cout) << std::endl; } CrsMatrixTools::copy(BB, Uplo::Lower, 0, AA); if (verbose) { BB.setLabel("Copy::AA:Lower::0"); BB.showMe(std::cout) << std::endl; } CrsMatrixTools::copy(BB, Uplo::Lower, 1, AA); if 
(verbose) { BB.setLabel("Copy::AA:Lower::1"); BB.showMe(std::cout) << std::endl; } return r_val; }
KOKKOS_INLINE_FUNCTION
int exampleTriSolvePerformance(const string file_input,
                               const OrdinalType nrhs,
                               const OrdinalType nb,
                               const int niter,
                               const int nthreads,
                               const int max_task_dependence,
                               const int team_size,
                               const bool team_interface,
                               const bool skip_serial,
                               const bool verbose) {
  // Benchmark driver: reads a Matrix Market file, reorders it, and times a
  // forward + backward triangular solve with `nrhs` right-hand sides, first
  // with the unblocked serial kernels (unless skip_serial) and then with the
  // task-based by-blocks kernels. Each timing is averaged over `niter`
  // iterations after two untimed warm-up iterations.
  // Returns 0 on success and 1 when the input file cannot be opened.

  typedef ValueType   value_type;
  typedef OrdinalType ordinal_type;
  typedef SizeType    size_type;

  typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
                      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

  typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
  typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;

  typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
  typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;

  typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;
  typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
  typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

  typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;
  typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType;
  typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType;

  typedef DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> DenseHierMatrixBaseType;
  typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType;
  typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType;

  int r_val = 0;

  Kokkos::Impl::Timer timer;
  double
    t_import = 0.0,
    t_reorder = 0.0,
    t_solve_seq = 0.0,
    t_solve_task = 0.0;

  // Iterations with a negative index are warm-up runs and are not timed.
  const int start = -2;

  // ---- Phase 1: import ----------------------------------------------------
  cout << "TriSolvePerformance:: import input file = " << file_input << endl;
  CrsMatrixBaseType AA("AA");
  {
    timer.reset();

    ifstream in;
    in.open(file_input);
    if (!in.good()) {
      cout << "Failed in open the file: " << file_input << endl;
      return ++r_val;
    }
    AA.importMatrixMarket(in);

    t_import = timer.seconds();

    if (verbose)
      cout << AA << endl;
  }
  cout << "TriSolvePerformance:: import input file::time = " << t_import << endl;

  CrsMatrixBaseType UU("UU");
  DenseMatrixBaseType BB("BB", AA.NumRows(), nrhs);

  // ---- Phase 2: reorder and partition -------------------------------------
  cout << "TriSolvePerformance:: reorder the matrix and partition right hand side, nb = " << nb << endl;
  CrsHierMatrixBaseType HU("HU");
  DenseHierMatrixBaseType HB("HB");
  {
    timer.reset();

    GraphHelperType S(AA);
    S.computeOrdering();

    CrsMatrixBaseType PA("Permuted AA");
    PA.copy(S.PermVector(), S.InvPermVector(), AA);

    UU.copy(Uplo::Upper, PA);

    CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                               S.NumBlocks(),
                               S.RangeVector(),
                               S.TreeVector());

    DenseMatrixHelper::flat2hier(BB, HB,
                                 S.NumBlocks(),
                                 S.RangeVector(),
                                 nb);

    t_reorder = timer.seconds();

    cout << "TriSolvePerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;

    if (verbose)
      cout << UU << endl;
  }
  cout << "TriSolvePerformance:: reorder the matrix and partition right hand side::time = " << t_reorder << endl;

  const size_t max_concurrency = 16384;
  cout << "TriSolvePerformance:: max concurrency = " << max_concurrency << endl;

  const size_t max_task_size = 3*sizeof(CrsTaskViewType)+128;
  cout << "TriSolvePerformance:: max task size   = " << max_task_size << endl;

  // ---- Phase 3: serial (unblocked) solve ----------------------------------
  if (!skip_serial) {
    __INIT_DENSE_MATRIX__(BB, 1.0);

    // The serial reference uses a policy with a team size of one.
    typename TaskFactoryType::policy_type policy(max_concurrency,
                                                 max_task_size,
                                                 max_task_dependence,
                                                 1);

    TaskFactoryType::setUseTeamInterface(team_interface);
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    CrsTaskViewType U(&UU);
    DenseTaskViewType B(&BB);
    U.fillRowViewArray();

    cout << "TriSolvePerformance:: Serial forward and backward solve of the matrix" << endl;
    {
      for (int iter = start; iter < niter; ++iter) {
        timer.reset();

        // The kernels are invoked directly with the policy's single member;
        // the alternative of wrapping each solve in a team task
        // (create_team / spawn / wait) is not used here.

        // Forward solve (conjugate transpose of the upper factor).
        {
          TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
            ::invoke(TaskFactoryType::Policy(),
                     TaskFactoryType::Policy().member_single(),
                     Diag::NonUnit, U, B);
        }

        // Backward solve (upper factor, no transpose).
        {
          TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
            ::invoke(TaskFactoryType::Policy(),
                     TaskFactoryType::Policy().member_single(),
                     Diag::NonUnit, U, B);
        }

        // Only non-warm-up iterations contribute to the average.
        t_solve_seq += timer.seconds() * (iter>=0);
      }
      t_solve_seq /= niter;

      if (verbose)
        cout << BB << endl;
    }
    cout << "TriSolvePerformance:: Serial forward and backward solve of the matrix::time = " << t_solve_seq << endl;
  }

  // ---- Phase 4: task-parallel (by-blocks) solve ---------------------------
  {
    __INIT_DENSE_MATRIX__(BB, 1.0);

    typename TaskFactoryType::policy_type policy(max_concurrency,
                                                 max_task_size,
                                                 max_task_dependence,
                                                 team_size);

    TaskFactoryType::setUseTeamInterface(team_interface);
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    // Wrap the hierarchically partitioned matrices with task handlers.
    CrsHierTaskViewType TU(&HU);
    for (ordinal_type blk = 0; blk < HU.NumNonZeros(); ++blk)
      HU.Value(blk).fillRowViewArray();

    DenseHierTaskViewType TB(&HB);

    cout << "TriSolvePerformance:: ByBlocks forward and backward solve of the matrix" << endl;
    {
      for (int iter = start; iter < niter; ++iter) {
        timer.reset();
        {
          auto future_forward_solve = TaskFactoryType::Policy().create_team
            (TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::ByBlocks>
             ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
             (Diag::NonUnit, TU, TB), 0);
          TaskFactoryType::Policy().spawn(future_forward_solve);

          // The backward solve is created with room for one dependence and
          // made to depend on the forward solve.
          auto future_backward_solve = TaskFactoryType::Policy().create_team
            (TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::ByBlocks>
             ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
             (Diag::NonUnit, TU, TB), 1);

          TaskFactoryType::Policy().add_dependence(future_backward_solve, future_forward_solve);
          TaskFactoryType::Policy().spawn(future_backward_solve);

          Kokkos::Experimental::wait(TaskFactoryType::Policy());
        }
        t_solve_task += timer.seconds() * (iter>=0);
      }
      t_solve_task /= niter;

      if (verbose)
        cout << BB << endl;
    }
    cout << "TriSolvePerformance:: ByBlocks forward and backward solve of the matrix::time = " << t_solve_task << endl;
  }

  // ---- Summary ------------------------------------------------------------
  if (!skip_serial) {
    cout << "TriSolvePerformance:: task scale [seq/task] = " << t_solve_seq/t_solve_task << endl;
  }

  return r_val;
}
KOKKOS_INLINE_FUNCTION
int exampleDenseGemmByBlocks(const OrdinalType mmin,
                             const OrdinalType mmax,
                             const OrdinalType minc,
                             const OrdinalType k,
                             const OrdinalType mb,
                             const int max_concurrency,
                             const int max_task_dependence,
                             const int team_size,
                             const int mkl_nthreads,
                             const bool check,
                             const bool verbose) {
  // Benchmark driver for the task-based dense GEMM. For every m in
  // [mmin, mmax] with stride minc it builds random operands (inner
  // dimension k), runs the by-blocks GEMM with block size mb and prints
  // its GFLOP rate. With `check` it also runs the external-BLAS GEMM on a
  // copy of C and prints the 2-norm of the difference and of the reference.
  // Always returns 0.

  typedef ValueType   value_type;
  typedef OrdinalType ordinal_type;
  typedef SizeType    size_type;

  typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
                      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

  typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;
  typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType;
  typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType;

  typedef DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> DenseHierMatrixBaseType;
  typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType;
  typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType;

  int r_val = 0;

  Kokkos::Impl::Timer timer;
  double elapsed = 0.0;

  cout << "DenseGemmByBlocks:: test matrices "
       <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << " , mb = " << mb << endl;

  const size_t max_task_size = (3*sizeof(DenseTaskViewType)+196); // when 128 error
  //cout << "max task size = "<< max_task_size << endl;

  typename TaskFactoryType::policy_type policy(max_concurrency,
                                               max_task_size,
                                               max_task_dependence,
                                               team_size);

  TaskFactoryType::setMaxTaskDependence(max_task_dependence);
  TaskFactoryType::setPolicy(&policy);

  // One report line is assembled per problem size.
  ostringstream os;
  os.precision(3);
  os << scientific;

  for (ordinal_type m = mmin; m <= mmax; m += minc) {
    os.str("");

    // CC is the by-blocks result; CB receives the reference result.
    DenseMatrixBaseType AA, BB, CC("CC", m, m), CB("CB", m, m);

    // Operand shapes depend on the transpose arguments.
    if (ArgTransA == Trans::NoTranspose)
      AA = DenseMatrixBaseType("AA", m, k);
    else
      AA = DenseMatrixBaseType("AA", k, m);

    if (ArgTransB == Trans::NoTranspose)
      BB = DenseMatrixBaseType("BB", k, m);
    else
      BB = DenseMatrixBaseType("BB", m, k);

    // Random entries in [-1, 1], filled column by column: AA, then BB,
    // then CC.
    for (ordinal_type col = 0; col < AA.NumCols(); ++col)
      for (ordinal_type row = 0; row < AA.NumRows(); ++row)
        AA.Value(row,col) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;

    for (ordinal_type col = 0; col < BB.NumCols(); ++col)
      for (ordinal_type row = 0; row < BB.NumRows(); ++row)
        BB.Value(row,col) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;

    for (ordinal_type col = 0; col < CC.NumCols(); ++col)
      for (ordinal_type row = 0; row < CC.NumRows(); ++row)
        CC.Value(row,col) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;

    CB.copy(CC);

    const double flop = get_flop_gemm<value_type>(m, m, k);

#ifdef HAVE_SHYLUTACHO_MKL
    mkl_set_num_threads(mkl_nthreads);
#endif

    os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k;

    // Reference: external BLAS GEMM into CB.
    if (check) {
      timer.reset();
      DenseTaskViewType A(&AA), B(&BB), C(&CB);
      Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas>::invoke
        (TaskFactoryType::Policy(),
         TaskFactoryType::Policy().member_single(),
         1.0, A, B, 1.0, C);
      elapsed = timer.seconds();
      os << ":: Serial Performance = " << (flop/elapsed/1.0e9) << " [GFLOPs]  ";
    }

    // By-blocks GEMM into CC; partitioning is done before the timer starts.
    {
      DenseHierMatrixBaseType HA, HB, HC;

      DenseMatrixHelper::flat2hier(AA, HA, mb, mb);
      DenseMatrixHelper::flat2hier(BB, HB, mb, mb);
      DenseMatrixHelper::flat2hier(CC, HC, mb, mb);

      DenseHierTaskViewType TA(&HA), TB(&HB), TC(&HC);

      timer.reset();
      auto root_task = TaskFactoryType::Policy().create_team
        (typename Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,Variant::One>
         ::template TaskFunctor<value_type,DenseHierTaskViewType,DenseHierTaskViewType,DenseHierTaskViewType>
         (1.0, TA, TB, 1.0, TC), 0);
      TaskFactoryType::Policy().spawn(root_task);
      Kokkos::Experimental::wait(TaskFactoryType::Policy());
      elapsed = timer.seconds();
      os << ":: Parallel Performance = " << (flop/elapsed/1.0e9) << " [GFLOPs]  ";
    }

    // Compare CC against the reference CB.
    if (check) {
      typedef typename Teuchos::ScalarTraits<value_type>::magnitudeType real_type;
      real_type err = 0.0, norm = 0.0;
      for (ordinal_type col = 0; col < CC.NumCols(); ++col)
        for (ordinal_type row = 0; row < CC.NumRows(); ++row) {
          const real_type diff = abs(CC.Value(row,col) - CB.Value(row,col));
          const real_type val  = CB.Value(row,col);
          err  += diff*diff;
          norm += val*val;
        }
      os << ":: Check result ::err = " << sqrt(err) << ", norm = " << sqrt(norm);
    }

    cout << os.str() << endl;
  }

  return r_val;
}
int exampleDenseMatrixBase(const ordinal_type mmin, const ordinal_type mmax, const ordinal_type minc, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); std::cout << std::endl; typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; std::cout << "DenseMatrixBase:: test matrices " <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << std::endl; for (auto m=mmin; m<=mmax; m+=minc) { // random test matrix on host DenseMatrixBaseHostType TT("TT", m, m); for (ordinal_type j=0; j<TT.NumCols(); ++j) { for (ordinal_type i=0; i<TT.NumRows(); ++i) TT.Value(i,j) = 2.0*((value_type)std::rand()/(RAND_MAX)) - 1.0; TT.Value(j,j) = std::fabs(TT.Value(j,j)); } if (verbose) TT.showMe(std::cout) << std::endl; DenseMatrixBaseDeviceType AA("AA"); timer.reset(); AA.mirror(TT); double t_mirror = timer.seconds(); DenseMatrixBaseDeviceType BB("BB"); BB.createConfTo(AA); timer.reset(); DenseMatrixTools::copy(BB, AA); double t_copy = timer.seconds(); // check DenseMatrixBaseHostType RR("RR"); RR.createConfTo(BB); RR.mirror(BB); if (verbose) RR.showMe(std::cout) << std::endl; double err = 0.0; for (ordinal_type j=0; j<TT.NumCols(); ++j) for (ordinal_type i=0; i<TT.NumRows(); ++i) err += std::fabs(TT.Value(i,j) - RR.Value(i,j)); { const auto prec = std::cout.precision(); std::cout.precision(4); std::cout << std::scientific << "DenseMatrixBase:: dimension = " << m << " x " << m << ", " << "Mirroring to device = " << t_mirror << " [sec], " << "Elementwise copy on device = " << t_copy << " [sec], " << "Error = " << err 
<< std::endl; std::cout.unsetf(std::ios::scientific); std::cout.precision(prec); } } return r_val; }
KOKKOS_INLINE_FUNCTION int exampleICholUnblocked(const string file_input, const int max_task_dependence, const int team_size, const bool verbose) { typedef ValueType value_type; typedef OrdinalType ordinal_type; typedef SizeType size_type; typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType; typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType; typedef TaskTeamFactory<Kokkos::Experimental::TaskPolicy<SpaceType>, Kokkos::Experimental::Future<int,SpaceType>, Kokkos::Impl::TeamThreadRangeBoundariesStruct> TaskFactoryType; typedef ParallelFor ForType; typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; cout << "ICholUnblocked:: import input file = " << file_input << endl; CrsMatrixBaseType AA("AA"), UU("UU"); { timer.reset(); ifstream in; in.open(file_input); if (!in.good()) { cout << "Failed in open the file: " << file_input << endl; return ++r_val; } AA.importMatrixMarket(in); UU.copy(Uplo::Upper, AA); t = timer.seconds(); if (verbose) cout << UU << endl; } cout << "ICholUnblocked:: import input file::time = " << t << endl; #ifdef __USE_FIXED_TEAM_SIZE__ typename TaskFactoryType::policy_type policy(max_task_dependence); #else typename TaskFactoryType::policy_type policy(max_task_dependence, team_size); #endif TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); cout << "ICholUnblocked:: factorize the matrix" << endl; CrsTaskViewType U(&UU); U.fillRowViewArray(); { timer.reset(); auto future = TaskFactoryType::Policy().create_team(IChol<Uplo::Upper,AlgoIChol::UnblockedOpt1> ::TaskFunctor<ForType,CrsTaskViewType>(U), 0); TaskFactoryType::Policy().spawn(future); Kokkos::Experimental::wait(TaskFactoryType::Policy()); t = timer.seconds(); if (verbose) cout << UU << endl; } cout << "ICholUnblocked:: factorize the matrix::time = " << t << endl; return r_val; }
void color(bool useConflictList, bool serialConflictResolution, bool ticToc){ Ordinal numUncolored = _size; // on host double t, total = 0.0; Kokkos::Impl::Timer timer; if(useConflictList) _conflictType = CONFLICT_LIST; // While vertices to color, do speculative coloring. int iter = 0; for(iter = 0; (iter<20) && (numUncolored>0); iter++){ std::cout<< "Start iteration " << iter << std::endl; // First color greedy speculatively, some conflicts expected this -> colorGreedy(); ExecSpace::fence(); if(ticToc){ t = timer.seconds(); total += t; std::cout << "Time speculative greedy phase " << iter << " : " << std::endl; timer.reset(); } #ifdef DEBUG // UVM required - will be slow! printf("\n 100 first vertices: "); for(int i = 0; i < 100; i++){ printf(" %i", _colors[i]); } printf("\n"); #endif // Check for conflicts (parallel), find vertices to recolor numUncolored = this -> findConflicts(); ExecSpace::fence(); if(ticToc){ t = timer.seconds(); total += t; std::cout << "Time conflict detection " << iter << " : " << t << std::endl; timer.reset(); } if (serialConflictResolution) break; // Break after first iteration /* if(_conflictType == CONFLICT_LIST){ array_type temp = _vertexList; _vertexList = _recolorList; _vertexListLength() = _recolorListLength(); _recolorList = temp; _recolorListLength() = 0; } */ if(_conflictType == CONFLICT_LIST){ array_type temp = _vertexList; _vertexList = _recolorList; host_vertexListLength() = host_recolorListLength(); _recolorList = temp; host_recolorListLength() = 0; Kokkos::deep_copy(_vertexListLength, host_vertexListLength); Kokkos::deep_copy(_recolorListLength, host_recolorListLength); } } std::cout << "Number of coloring iterations: " << iter << std::endl; if(numUncolored > 0){ // Resolve conflicts by recolor in serial this -> resolveConflicts(); ExecSpace::fence(); if(ticToc){ t = timer.seconds(); total += t; std::cout << "Time conflict resolution: " << t << std::endl; std::cout << "Total time: " << total << std::endl; } } }
int exampleDenseCholByBlocks(const ordinal_type mmin, const ordinal_type mmax, const ordinal_type minc, const ordinal_type mb, const int max_concurrency, const int max_task_dependence, const int team_size, const int mkl_nthreads, const bool check, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType; typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType; typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType; typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType; typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType; typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType; typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; std::cout << "DenseCholByBlocks:: test matrices " <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , mb = " << mb << std::endl; const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); std::ostringstream os; os.precision(3); os << std::scientific; for (ordinal_type m=mmin;m<=mmax;m+=minc) { os.str(""); // host matrices DenseMatrixBaseHostType AA_host("AA_host", m, m), AB_host("AB_host"), TT_host("TT_host"); // random T matrix { 
TT_host.createConfTo(AA_host); for (ordinal_type j=0;j<TT_host.NumCols();++j) { for (ordinal_type i=0;i<TT_host.NumRows();++i) TT_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; TT_host.Value(j,j) = std::fabs(TT_host.Value(j,j)); } } // create SPD matrix { Teuchos::BLAS<ordinal_type,value_type> blas; blas.HERK(ArgUplo == Uplo::Upper ? Teuchos::UPPER_TRI : Teuchos::LOWER_TRI, Teuchos::CONJ_TRANS, m, m, 1.0, TT_host.ValuePtr(), TT_host.ColStride(), 0.0, AA_host.ValuePtr(), AA_host.ColStride()); // preserve a copy of A AB_host.createConfTo(AA_host); DenseMatrixTools::copy(AB_host, AA_host); } const double flop = DenseFlopCount<value_type>::Chol(m); #ifdef HAVE_SHYLUTACHO_MKL mkl_set_num_threads(mkl_nthreads); #endif os << "DenseCholByBlocks:: m = " << m << " "; int ierr = 0; if (check) { timer.reset(); DenseMatrixViewHostType A_host(AB_host); ierr = Chol<ArgUplo,AlgoChol::ExternalLapack,Variant::One>::invoke (policy, policy.member_single(), A_host); t = timer.seconds(); TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)"); os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } DenseMatrixBaseDeviceType AA_device("AA_device"); { timer.reset(); AA_device.mirror(AA_host); t = timer.seconds(); os << ":: Mirror = " << t << " [sec] "; } { DenseHierMatrixBaseDeviceType HA_device("HA_device"); DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb); DenseHierTaskViewDeviceType TA_device(HA_device); timer.reset(); auto future = policy.proc_create_team (Chol<ArgUplo,AlgoChol::DenseByBlocks,ArgVariant> ::createTaskFunctor(policy, TA_device), 0); policy.spawn(future); Kokkos::Experimental::wait(policy); t = timer.seconds(); os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } AA_host.mirror(AA_device); if (!ierr && check) { double err = 0.0, norm = 0.0; for (ordinal_type j=0;j<AA_host.NumCols();++j) for (ordinal_type i=0;i<=j;++i) { const double diff = abs(AA_host.Value(i,j) - AB_host.Value(i,j)); const double 
val = AB_host.Value(i,j); err += diff*diff; norm += val*val; } os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err); } std::cout << os.str() << std::endl; } return r_val; }
int exampleCholUnblocked(const std::string file_input, const int treecut, const int prunecut, const int fill_level, const int rows_per_team, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType; typedef GraphTools<ordinal_type,size_type,HostSpaceType> GraphToolsHostType; typedef GraphTools_Scotch<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_Scotch; typedef GraphTools_CAMD<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_CAMD; typedef IncompleteSymbolicFactorization<CrsMatrixBaseHostType> IncompleteSymbolicFactorizationType; typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef CrsMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> CrsMatrixBaseDeviceType; typedef CrsMatrixView<CrsMatrixBaseDeviceType> CrsMatrixViewDeviceType; typedef TaskView<CrsMatrixViewDeviceType> CrsTaskViewDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; CrsMatrixBaseHostType AA_host("AA_host"); timer.reset(); { std::ifstream in; in.open(file_input); if (!in.good()) { std::cout << "Failed in open the file: " << file_input << std::endl; return -1; } MatrixMarket::read(AA_host, in); } double t_read = timer.seconds(); if (verbose) AA_host.showMe(std::cout) << std::endl; typename GraphToolsHostType::size_type_array rptr("Graph::RowPtrArray", AA_host.NumRows() + 1); typename GraphToolsHostType::ordinal_type_array cidx("Graph::ColIndexArray", AA_host.NumNonZeros()); timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, AA_host); double t_graph = timer.seconds(); GraphToolsHostType_Scotch S; S.setGraph(AA_host.NumRows(), rptr, cidx); S.setSeed(0); S.setTreeLevel(); 
S.setStrategy( SCOTCH_STRATSPEED | SCOTCH_STRATLEVELMAX | SCOTCH_STRATLEVELMIN | SCOTCH_STRATLEAFSIMPLE | SCOTCH_STRATSEPASIMPLE ); timer.reset(); S.computeOrdering(treecut); double t_scotch = timer.seconds(); S.pruneTree(prunecut); if (verbose) S.showMe(std::cout) << std::endl; CrsMatrixBaseHostType BB_host("BB_host"); BB_host.createConfTo(AA_host); CrsMatrixTools::copy(BB_host, S.PermVector(), S.InvPermVector(), AA_host); if (verbose) BB_host.showMe(std::cout) << std::endl; timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, BB_host); t_graph += timer.seconds(); GraphToolsHostType_CAMD C; C.setGraph(BB_host.NumRows(), rptr, cidx, S.NumBlocks(), S.RangeVector()); timer.reset(); C.computeOrdering(); double t_camd = timer.seconds(); if (verbose) C.showMe(std::cout) << std::endl; CrsMatrixBaseHostType CC_host("CC_host"); CC_host.createConfTo(BB_host); CrsMatrixTools::copy(CC_host, C.PermVector(), C.InvPermVector(), BB_host); if (verbose) CC_host.showMe(std::cout) << std::endl; CrsMatrixBaseHostType DD_host("DD_host"); timer.reset(); IncompleteSymbolicFactorizationType::createNonZeroPattern(DD_host, fill_level, Uplo::Upper, CC_host, rows_per_team); double t_symbolic = timer.seconds(); if (verbose) DD_host.showMe(std::cout) << std::endl; // ================================================================================== CrsMatrixBaseDeviceType AA_device("AA_device"); AA_device.mirror(DD_host); const size_type max_concurrency = 10; const size_type max_task_size = (3*sizeof(CrsTaskViewDeviceType)+sizeof(PolicyType)+128); const size_type max_task_dependence = 0; const size_type team_size = 1; PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); CrsMatrixViewDeviceType A_device(AA_device); Kokkos::View<typename CrsMatrixViewDeviceType::row_view_type*,DeviceSpaceType> rowviews("RowViewInMatView", A_device.NumRows()); A_device.setRowViewArray(rowviews); timer.reset(); int ierr = Chol<Uplo::Upper,AlgoChol::Unblocked,Variant::One>::invoke 
(policy, policy.member_single(), A_device); double t_chol = timer.seconds(); TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)"); if (verbose) { DD_host.mirror(AA_device); DD_host.showMe(std::cout) << std::endl; } { const auto prec = std::cout.precision(); std::cout.precision(4); std::cout << std::scientific; std::cout << "SymbolicFactorization:: Given matrix dimension = " << AA_host.NumRows() << " x " << AA_host.NumCols() << ", " << " nnz = " << AA_host.NumNonZeros() << std::endl; std::cout << "SymbolicFactorization:: Upper factors dimension = " << DD_host.NumRows() << " x " << DD_host.NumCols() << ", " << " nnz = " << DD_host.NumNonZeros() << std::endl; std::cout << "SymbolicFactorization:: " << "read = " << t_read << " [sec], " << "graph generation = " << (t_graph/2.0) << " [sec] " << "scotch reordering = " << t_scotch << " [sec] " << "camd reordering = " << t_camd << " [sec] " << "symbolic factorization = " << t_symbolic << " [sec] " << "Cholesky factorization = " << t_chol << " [sec] " << std::endl; std::cout.unsetf(std::ios::scientific); std::cout.precision(prec); } return r_val; }
/// Print block statistics of the hierarchical (by-blocks) factor pattern.
///
/// Reads a matrix from \p file_input, reorders it with the Scotch helper,
/// creates the upper factor's nonzero pattern UU, converts it to a
/// hierarchical matrix HU, and prints dimensions plus min/max/average/sum of
/// nonzeros over the non-empty blocks of HU. Optionally prints a histogram
/// of block nonzero counts with buckets of width \p histogram_size.
///
/// \param file_input     path of the input matrix file
/// \param treecut        passed to computeOrdering
/// \param minblksize     passed to computeOrdering
/// \param prunecut       passed to pruneTree
/// \param seed           passed to the Scotch helper constructor
/// \param fill_level     passed to createNonZeroPattern
/// \param league_size    passed to the symbolic factor helper constructor
/// \param histogram_size bucket width of the histogram; values <= 0 disable it
/// \param verbose        when true, print the matrix and ordering objects
/// \return 0 on success, 1 if the input file cannot be opened
KOKKOS_INLINE_FUNCTION
int exampleStatByBlocks(const string file_input,
                        const int treecut,
                        const int minblksize,
                        const int prunecut,
                        const int seed,
                        const int fill_level,
                        const int league_size,
                        const int histogram_size,
                        const bool verbose) {
  typedef ValueType   value_type;
  typedef OrdinalType ordinal_type;
  typedef SizeType    size_type;

  typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType;
  typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
  typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

  typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
  typedef CrsMatrixBase<CrsMatrixViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType;

  int r_val = 0;

  Kokkos::Impl::Timer timer;
  double t = 0.0;

  cout << "StatByBlocks:: import input file = " << file_input << endl;
  CrsMatrixBaseType AA("AA");
  {
    timer.reset();

    ifstream in;
    in.open(file_input);
    if (!in.good()) {
      cout << "Failed in open the file: " << file_input << endl;
      return ++r_val;
    }
    AA.importMatrixMarket(in);

    t = timer.seconds();

    if (verbose)
      cout << AA << endl;
  }
  cout << "StatByBlocks:: import input file::time = " << t << endl;

  CrsMatrixBaseType UU("UU");
  CrsHierMatrixBaseType HU("HU");
  {
    CrsMatrixBaseType PA("Permuted AA");

    typename GraphHelperType::size_type_array rptr(AA.Label()+"Graph::RowPtrArray", AA.NumRows() + 1);
    typename GraphHelperType::ordinal_type_array cidx(AA.Label()+"Graph::ColIndexArray", AA.NumNonZeros());

    AA.convertGraph(rptr, cidx);
    GraphHelperType S(AA.Label()+"ScotchHelper",
                      AA.NumRows(),
                      rptr,
                      cidx,
                      seed);
    {
      timer.reset();

      S.computeOrdering(treecut, minblksize);
      S.pruneTree(prunecut);

      PA.copy(S.PermVector(), S.InvPermVector(), AA);

      t = timer.seconds();

      if (verbose)
        cout << S << endl;
    }
    cout << "StatByBlocks:: reorder the matrix::time = " << t << endl;

    {
      SymbolicFactorHelperType F(PA, league_size);
      timer.reset();
      F.createNonZeroPattern(fill_level, Uplo::Upper, UU);
      t = timer.seconds();
      cout << "StatByBlocks:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl;
    }
    cout << "StatByBlocks:: symbolic factorization::time = " << t << endl;

    {
      timer.reset();
      CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                 S.NumBlocks(),
                                 S.RangeVector(),
                                 S.TreeVector());

      for (ordinal_type k=0;k<HU.NumNonZeros();++k)
        HU.Value(k).fillRowViewArray();

      t = timer.seconds();
      cout << "StatByBlocks:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
    }
    cout << "StatByBlocks:: construct hierarchical matrix::time = " << t << endl;
  }

  {
    cout << endl;
    cout << " -- Flat matrix: UU --" << endl;
    cout << " # of Rows = " << UU.NumRows() << endl;
    cout << " # of Cols = " << UU.NumCols() << endl;
    cout << " # of Nonzeros = " << UU.NumNonZeros() << endl;
    cout << endl;
    cout << " -- Hierarchical matrix: HU --" << endl;
    cout << " # of Rows = " << HU.NumRows() << endl;
    cout << " # of Cols = " << HU.NumCols() << endl;
    cout << " # of Nonzeros = " << HU.NumNonZeros() << endl;
    cout << endl;
    cout << " -- Blocks of HU --" << endl;

    // Only a strictly positive bucket width yields a meaningful histogram.
    const bool use_histogram = (histogram_size > 0);
    map<size_type,size_type> histogram;

    if (HU.NumNonZeros()) {
      // Statistics are taken over non-empty blocks only. The minimum is
      // seeded by the first non-empty block, so an empty leading block
      // cannot pin it to zero.
      size_type nnz_min = 0, nnz_max = 0, nnz_sum = 0, nnz_cnt = 0;

      for (ordinal_type k=0;k<HU.NumNonZeros();++k) {
        const auto nnz_blk = HU.Value(k).countNumNonZeros();
        if (nnz_blk) {
          nnz_min  = nnz_cnt ? min(nnz_min, nnz_blk) : nnz_blk;
          nnz_max  = max(nnz_max, nnz_blk);
          nnz_sum += nnz_blk;
          ++nnz_cnt;
          if (use_histogram)
            ++histogram[nnz_blk/histogram_size];
        }
      }

      // Guard the average: every registered block may be empty.
      const size_type nnz_ave = nnz_cnt ? nnz_sum/nnz_cnt : 0;

      cout << " Min # of Nonzeros = " << nnz_min << endl;
      cout << " Max # of Nonzeros = " << nnz_max << endl;
      cout << " Ave # of Nonzeros = " << nnz_ave << endl;
      cout << " Sum # of Nonzeros = " << nnz_sum << endl;
      cout << " # of empty Blocks = " << (HU.NumNonZeros() - nnz_cnt) << endl;

      if (use_histogram) {
        cout << " Histogram" << endl;
        // Each entry prints the lower bound of its bucket and the block count.
        for (auto it=histogram.begin();it!=histogram.end();++it)
          cout << (it->first*histogram_size) << " , " << it->second << endl;
      }
    } else {
      cout << " No registered blocks" << endl;
    }
  }

  return r_val;
}
/// Benchmark the BLAS TRSM call for a range of matrix sizes.
///
/// For each m = mmin, mmin+minc, ... (while m <= mmax) an m x m upper
/// triangular matrix AA (diagonal 10, random strict upper part) and an
/// m x k right-hand side are set up. The right-hand side is AA^H times a
/// matrix of ones, computed with GEMM. TRSM then solves with AA^H in place
/// and its rate is reported in GFLOPs.
///
/// \param mmin, mmax, minc range and step of the triangular matrix dimension
/// \param k                number of right-hand-side columns
/// \param verbose          not used by this function
/// \return always 0
KOKKOS_INLINE_FUNCTION
int exampleDenseTrsmMKL(const OrdinalType mmin,
                        const OrdinalType mmax,
                        const OrdinalType minc,
                        const OrdinalType k,
                        const bool verbose) {
  typedef ValueType   value_type;
  typedef OrdinalType ordinal_type;
  typedef SizeType    size_type;

  typedef DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> DenseMatrixBaseType;

  int r_val = 0;

  Kokkos::Impl::Timer timer;
  double t = 0.0;

  // Banner label matches the per-size report lines below ("DenseTrsmMKL");
  // it previously carried the label of the GEMM example.
  cout << "DenseTrsmMKL:: test matrices "
       << ":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << endl;

  ostringstream os;
  os.precision(3);
  os << scientific;

  // A single BLAS wrapper serves both the setup GEMM and the timed TRSM.
  Teuchos::BLAS<ordinal_type,value_type> blas;

  for (ordinal_type m=mmin;m<=mmax;m+=minc) {
    os.str("");

    DenseMatrixBaseType AA("AA", m, m), BB("BB", m, k), BC("BC", m, k);

    // setup upper triangular
    for (ordinal_type j=0;j<AA.NumCols();++j) {
      AA.Value(j,j) = 10.0;
      for (ordinal_type i=0;i<j;++i)
        AA.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
    }

    // setup one and right hand side is going to be overwritten by the product of AB
    for (ordinal_type j=0;j<BB.NumCols();++j)
      for (ordinal_type i=0;i<BB.NumRows();++i)
        BB.Value(i,j) = 1.0;

    // BC = AA^H * BB, then BB takes over BC as the right-hand side.
    blas.GEMM(Teuchos::CONJ_TRANS, Teuchos::NO_TRANS,
              m, k, m,
              1.0,
              AA.ValuePtr(), AA.ColStride(),
              BB.ValuePtr(), BB.ColStride(),
              0.0,
              BC.ValuePtr(), BC.ColStride());
    BB.copy(BC);

    const double flop = get_flop_trsm_upper<value_type>(m, k);

    os << "DenseTrsmMKL:: m = " << m << " k = " << k;
    {
      timer.reset();

      const ordinal_type mm = AA.NumRows();
      const ordinal_type nn = BB.NumCols();

      // Solve AA^H * X = BB in place (BB is overwritten with X).
      blas.TRSM(Teuchos::LEFT_SIDE, Teuchos::UPPER_TRI,
                Teuchos::CONJ_TRANS, Teuchos::NON_UNIT_DIAG,
                mm, nn,
                1.0,
                AA.ValuePtr(), AA.ColStride(),
                BB.ValuePtr(), BB.ColStride());

      t = timer.seconds();
      os << ":: MKL Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
    }
    cout << os.str() << endl;
  }

  return r_val;
}
int exampleCholByBlocks(const std::string file_input, const int treecut, const int prunecut, const int fill_level, const int rows_per_team, const int max_concurrency, const int max_task_dependence, const int team_size, const bool check, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); // for simple test, let's use host space only here, for device it needs mirroring. typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType; typedef CrsMatrixView<CrsMatrixBaseHostType> CrsMatrixViewHostType; typedef GraphTools<ordinal_type,size_type,HostSpaceType> GraphToolsHostType; typedef GraphTools_Scotch<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_Scotch; typedef GraphTools_CAMD<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_CAMD; typedef IncompleteSymbolicFactorization<CrsMatrixBaseHostType> IncompleteSymbolicFactorizationType; typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef TaskView<CrsMatrixViewHostType> CrsTaskViewHostType; typedef CrsMatrixBase<CrsTaskViewHostType,ordinal_type,size_type,HostSpaceType> CrsHierBaseHostType; typedef CrsMatrixView<CrsHierBaseHostType> CrsHierViewHostType; typedef TaskView<CrsHierViewHostType> CrsTaskHierViewHostType; int r_val = 0; Kokkos::Impl::Timer timer; /// /// Read from matrix market /// /// input - file /// output - AA_host /// CrsMatrixBaseHostType AA_host("AA_host"); timer.reset(); { std::ifstream in; in.open(file_input); if (!in.good()) { std::cout << "Failed in open the file: " << file_input << std::endl; return -1; } MatrixMarket::read(AA_host, in); } double t_read = timer.seconds(); if (verbose) AA_host.showMe(std::cout) << std::endl; /// /// Create a graph structure 
for Scotch and CAMD (rptr, cidx) /// /// rptr and cidx are need to be set up for Scotch and CAMD /// typename GraphToolsHostType::size_type_array rptr("Graph::RowPtrArray", AA_host.NumRows() + 1); typename GraphToolsHostType::ordinal_type_array cidx("Graph::ColIndexArray", AA_host.NumNonZeros()); /// /// Run Scotch /// /// input - rptr, cidx, A_host /// output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted) /// timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, AA_host); double t_graph = timer.seconds(); GraphToolsHostType_Scotch S; S.setGraph(AA_host.NumRows(), rptr, cidx); S.setSeed(0); S.setTreeLevel(); S.setStrategy( SCOTCH_STRATSPEED | SCOTCH_STRATLEVELMAX | SCOTCH_STRATLEVELMIN | SCOTCH_STRATLEAFSIMPLE | SCOTCH_STRATSEPASIMPLE ); timer.reset(); S.computeOrdering(treecut); double t_scotch = timer.seconds(); if (verbose) S.showMe(std::cout) << std::endl; CrsMatrixBaseHostType AA_scotch_host("AA_scotch_host"); AA_scotch_host.createConfTo(AA_host); CrsMatrixTools::copy(AA_scotch_host, S.PermVector(), S.InvPermVector(), AA_host); if (verbose) AA_scotch_host.showMe(std::cout) << std::endl; /// /// Run CAMD /// /// input - rptr, cidx, A_host /// output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted) /// timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, AA_scotch_host); t_graph += timer.seconds(); GraphToolsHostType_CAMD C; C.setGraph(AA_scotch_host.NumRows(), rptr, cidx, S.NumBlocks(), S.RangeVector()); timer.reset(); C.computeOrdering(); double t_camd = timer.seconds(); if (verbose) C.showMe(std::cout) << std::endl; CrsMatrixBaseHostType AA_camd_host("AA_camd_host"); AA_camd_host.createConfTo(AA_scotch_host); CrsMatrixTools::copy(AA_camd_host, C.PermVector(), C.InvPermVector(), AA_scotch_host); if (verbose) AA_camd_host.showMe(std::cout) << std::endl; /// /// Symbolic factorization /// /// input - /// output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted) /// CrsMatrixBaseHostType 
AA_factor_host("AA_factor_host"); timer.reset(); IncompleteSymbolicFactorizationType::createNonZeroPattern(AA_factor_host, fill_level, Uplo::Upper, AA_camd_host, rows_per_team); double t_symbolic = timer.seconds(); if (verbose) AA_factor_host.showMe(std::cout) << std::endl; /// /// Clean tempoerary matrices /// /// input - AA_scotch_host, AA_camd_host, C, rptr, cidx /// output - none /// AA_scotch_host = CrsMatrixBaseHostType(); AA_camd_host = CrsMatrixBaseHostType(); C = GraphToolsHostType_CAMD(); rptr = typename GraphToolsHostType::size_type_array(); cidx = typename GraphToolsHostType::ordinal_type_array(); /// /// Create task policy /// /// input - max_task_size /// output - policy /// const size_type max_task_size = (3*sizeof(CrsTaskViewHostType)+sizeof(PolicyType)+128); timer.reset(); PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); double t_policy = timer.seconds(); /// /// Sequential execution /// /// input - AA_factor_host (matrix to be compared), rowviews /// output - BB_factor_host, B_factor_host /// double t_chol_serial = 0; CrsMatrixBaseHostType BB_factor_host("BB_factor_host"); if (check) { BB_factor_host.createConfTo(AA_factor_host); CrsMatrixTools::copy(BB_factor_host, AA_factor_host); CrsTaskViewHostType B_factor_host(BB_factor_host); Kokkos::View<typename CrsTaskViewHostType::row_view_type*,HostSpaceType> rowviews("RowViewInMatView", B_factor_host.NumRows()); B_factor_host.setRowViewArray(rowviews); timer.reset(); { auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::Unblocked,Variant::One> ::createTaskFunctor(policy, B_factor_host)); policy.spawn(future); Kokkos::Experimental::wait(policy); TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)"); } t_chol_serial = timer.seconds(); if (verbose) BB_factor_host.showMe(std::cout) << std::endl; } /// /// Task parallel execution /// /// input - AA_factor_host, rowviews /// output - HA_factor_host, AA_factor_host, B_factor_host /// double 
t_hier = 0, t_blocks = 0, t_chol_parallel = 0; CrsHierBaseHostType HA_factor_host("HA_factor_host"); { timer.reset(); S.pruneTree(prunecut); CrsMatrixTools::createHierMatrix(HA_factor_host, AA_factor_host, S.NumBlocks(), S.RangeVector(), S.TreeVector()); t_hier = timer.seconds(); timer.reset(); size_type nblocks = HA_factor_host.NumNonZeros(); Kokkos::View<ordinal_type*,HostSpaceType> ap_rowview_blocks("NumRowViewInBlocks", nblocks + 1); ap_rowview_blocks(0) = 0; for (ordinal_type k=0;k<nblocks;++k) ap_rowview_blocks(k+1) = ap_rowview_blocks(k) + HA_factor_host.Value(k).NumRows(); Kokkos::View<typename CrsMatrixViewHostType::row_view_type*,HostSpaceType> rowview_blocks("RowViewInBlocks", ap_rowview_blocks(nblocks)); Kokkos::parallel_for(Kokkos::RangePolicy<HostSpaceType>(0, nblocks), [&](const ordinal_type k) { const ordinal_type begin = ap_rowview_blocks(k); const ordinal_type end = ap_rowview_blocks(k+1); HA_factor_host.Value(k).setRowViewArray (Kokkos::subview(rowview_blocks, Kokkos::pair<ordinal_type,ordinal_type>(begin, end))); } ); CrsMatrixTools::filterEmptyBlocks(HA_factor_host); t_blocks = timer.seconds(); { size_type nblocks_filtered = HA_factor_host.NumNonZeros(), nnz_blocks = 0; for (size_type k=0;k<nblocks_filtered; ++k) nnz_blocks += HA_factor_host.Value(k).NumNonZeros(); TACHO_TEST_FOR_ABORT( nnz_blocks != AA_factor_host.NumNonZeros(), "nnz counted in blocks is different from nnz in the base matrix."); } CrsTaskHierViewHostType H_factor_host(HA_factor_host); timer.reset(); { auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks,Variant::One> ::createTaskFunctor(policy, H_factor_host)); policy.spawn(future); Kokkos::Experimental::wait(policy); TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)"); } t_chol_parallel = timer.seconds(); if (verbose) AA_factor_host.showMe(std::cout) << std::endl; } if (check) { double diff = 0, norm = 0; TACHO_TEST_FOR_ABORT( BB_factor_host.NumNonZeros() != 
AA_factor_host.NumNonZeros(), "nnz used in serial is not same as nnz used in parallel"); const size_type nnz = AA_factor_host.NumNonZeros(); for (size_type k=0;k<nnz;++k) { norm += Util::abs(BB_factor_host.Value(k)); diff += Util::abs(AA_factor_host.Value(k) - BB_factor_host.Value(k)); } std::cout << std::scientific; std::cout << "CholByBlocks:: check with serial execution " << std::endl << " diff = " << diff << ", norm = " << norm << ", rel err = " << (diff/norm) << std::endl; std::cout.unsetf(std::ios::scientific); } { const auto prec = std::cout.precision(); std::cout.precision(4); std::cout << std::scientific; std::cout << "CholByBlocks:: Given matrix = " << AA_host.NumRows() << " x " << AA_host.NumCols() << ", nnz = " << AA_host.NumNonZeros() << std::endl; std::cout << "CholByBlocks:: Factored matrix = " << AA_factor_host.NumRows() << " x " << AA_factor_host.NumCols() << ", nnz = " << AA_factor_host.NumNonZeros() << std::endl; std::cout << "CholByBlocks:: Hier matrix = " << HA_factor_host.NumRows() << " x " << HA_factor_host.NumCols() << ", nnz = " << HA_factor_host.NumNonZeros() << std::endl; std::cout << "CholByBlocks:: " << "read = " << t_read << " [sec], " << "graph generation = " << (t_graph/2.0) << " [sec] " << "scotch reordering = " << t_scotch << " [sec] " << "camd reordering = " << t_camd << " [sec] " << std::endl << "CholByBlocks:: " << "symbolic factorization = " << t_symbolic << " [sec] " << std::endl << "CholByBlocks:: " << "policy creation = " << t_policy << " [sec] " << "hier creation = " << t_hier << " [sec] " << "block specification = " << t_blocks << " [sec] " << std::endl << "CholByBlocks:: " << "Chol Parallel = " << t_chol_parallel << " [sec] "; if (check) std::cout << "Chol Serial = " << (check ? t_chol_serial : -1) << " [sec] " << "speed-up = " << (t_chol_serial/t_chol_parallel) << " [sec] "; std::cout << std::endl; std::cout.unsetf(std::ios::scientific); std::cout.precision(prec); } return r_val; }
KOKKOS_INLINE_FUNCTION int exampleCholPerformance(const string file_input, const int treecut, const int minblksize, const int prunecut, const int seed, const int niter, const int nthreads, const int max_task_dependence, const int team_size, const int fill_level, const int league_size, const bool team_interface, const bool skip_serial, const bool mkl_interface, const bool verbose) { typedef ValueType value_type; typedef OrdinalType ordinal_type; typedef SizeType size_type; typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>, Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType; typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType; typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType; typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType; #ifdef HAVE_SHYLUTACHO_MKL typedef typename CrsMatrixBaseType::value_type_array value_type_array; #endif typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType; typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType; typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType; typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType; typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType; int r_val = 0; Kokkos::Impl::Timer timer; double t_import = 0.0, t_reorder = 0.0, t_symbolic = 0.0, t_flat2hier = 0.0, #ifdef HAVE_SHYLUTACHO_MKL t_mkl_seq = 0.0, #endif t_factor_seq = 0.0, t_factor_task = 0.0; const int start = -2; cout << "CholPerformance:: import input file = " << file_input << endl; CrsMatrixBaseType AA("AA"); { timer.reset(); ifstream in; in.open(file_input); if (!in.good()) { cout << "Failed in open the file: " << file_input << endl; return ++r_val; } AA.importMatrixMarket(in); t_import = timer.seconds(); if (verbose) cout << AA << endl; } cout << "CholPerformance:: import input file::time = " << t_import << endl; cout << "CholPerformance:: 
reorder the matrix" << endl; CrsMatrixBaseType PA("Permuted AA"); CrsMatrixBaseType UU("UU"); // permuted base upper triangular matrix CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views { GraphHelperType S(AA, seed); { timer.reset(); S.computeOrdering(treecut, minblksize); S.pruneTree(prunecut); PA.copy(S.PermVector(), S.InvPermVector(), AA); t_reorder = timer.seconds(); if (verbose) cout << S << endl << PA << endl; } cout << "CholPerformance:: reorder the matrix::time = " << t_reorder << endl; { SymbolicFactorHelperType F(PA, league_size); for (int i=start;i<niter;++i) { timer.reset(); F.createNonZeroPattern(fill_level, Uplo::Upper, UU); // UU.copy(Uplo::Upper, PA); t_symbolic += timer.seconds() * (i>=0); } t_symbolic /= niter; cout << "CholPerformance:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl; if (verbose) cout << F << endl << UU << endl; } cout << "CholPerformance:: symbolic factorization::time = " << t_symbolic << endl; { timer.reset(); CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU, S.NumBlocks(), S.RangeVector(), S.TreeVector()); for (ordinal_type k=0;k<HU.NumNonZeros();++k) HU.Value(k).fillRowViewArray(); t_flat2hier = timer.seconds(); cout << "CholPerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl; } cout << "CholPerformance:: construct hierarchical matrix::time = " << t_flat2hier << endl; } // copy of UU CrsMatrixBaseType RR("RR"); RR.copy(UU); #ifdef HAVE_SHYLUTACHO_MKL if (!skip_serial && mkl_interface) { cout << "CholPerformance:: MKL factorize the matrix" << endl; CrsMatrixBaseType MM("MM"); for (int i=start;i<niter;++i) { MM.copy(RR); MM.hermitianize(Uplo::Upper); MKL_INT n = static_cast<MKL_INT>(MM.NumRows()); double *a = static_cast<double*>(MM.ValuePtr()); MKL_INT *ia = static_cast<MKL_INT*>(MM.RowPtr()); MKL_INT *ja = static_cast<MKL_INT*>(MM.ColPtr()); // convert to 1-based matrix { for (ordinal_type k=0;k<(MM.NumRows()+1);++k) ++ia[k]; for (size_type 
k=0;k<MM.NumNonZeros();++k) ++ja[k]; } value_type_array mkl_result = value_type_array("mkl-ilu-values", MM.NumNonZeros()); double *bilu0 = static_cast<double*>(&mkl_result[0]); MKL_INT ipar[128]; double dpar[128]; MKL_INT ierr; // we provide ilu-k pattern timer.reset(); dcsrilu0(&n, a, ia, ja, bilu0, ipar, dpar, &ierr); t_mkl_seq += timer.seconds() * (i>=0) * 0.5; if (ierr != 0) cout << " MKL Error = " << ierr << endl; } t_mkl_seq /= niter; cout << "CholPerformance:: MKL factorize the matrix::time = " << t_mkl_seq << endl; } #endif if (!skip_serial) { #ifdef __USE_FIXED_TEAM_SIZE__ typename TaskFactoryType::policy_type policy(max_task_dependence); #else typename TaskFactoryType::policy_type policy(max_task_dependence, 1); #endif TaskFactoryType::setUseTeamInterface(team_interface); TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); CrsTaskViewType U(&UU); U.fillRowViewArray(); cout << "CholPerformance:: Serial factorize the matrix" << endl; { for (int i=start;i<niter;++i) { UU.copy(RR); timer.reset(); // { // auto future = TaskFactoryType::Policy().create(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> // ::TaskFunctor<CrsTaskViewType>(U), 0); // TaskFactoryType::Policy().spawn(future); // Kokkos::Experimental::wait(TaskFactoryType::Policy()); // } { Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> ::invoke(TaskFactoryType::Policy(), TaskFactoryType::Policy().member_single(), U); } t_factor_seq += timer.seconds() * (i>=0); } t_factor_seq /= niter; if (verbose) cout << UU << endl; } cout << "CholPerformance:: Serial factorize the matrix::time = " << t_factor_seq << endl; } // { // #ifdef __USE_FIXED_TEAM_SIZE__ // typename TaskFactoryType::policy_type policy(max_task_dependence); // #else // typename TaskFactoryType::policy_type policy(max_task_dependence, nthreads); // #endif // TaskFactoryType::setPolicy(&policy); // CrsTaskViewType U(&UU); // U.fillRowViewArray(); // cout << "CholPerformance:: Team 
factorize the matrix:: team_size = " << nthreads << endl; // { // timer.reset(); // auto future = TaskFactoryType::Policy().create(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> // ::TaskFunctor<CrsTaskViewType>(U), 0); // TaskFactoryType::Policy().spawn(future); // Kokkos::Experimental::wait(TaskFactoryType::Policy()); // t_factor_team = timer.seconds(); // if (verbose) // cout << UU << endl; // } // cout << "CholPerformance:: Team factorize the matrix::time = " << t_factor_team << endl; // } { #ifdef __USE_FIXED_TEAM_SIZE__ typename TaskFactoryType::policy_type policy(max_task_dependence); #else typename TaskFactoryType::policy_type policy(max_task_dependence, team_size); #endif TaskFactoryType::setUseTeamInterface(team_interface); TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); cout << "CholPerformance:: ByBlocks factorize the matrix:: team_size = " << team_size << endl; CrsHierTaskViewType H(&HU); { for (int i=start;i<niter;++i) { UU.copy(RR); timer.reset(); { auto future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks>:: TaskFunctor<CrsHierTaskViewType>(H), 0); TaskFactoryType::Policy().spawn(future); Kokkos::Experimental::wait(TaskFactoryType::Policy()); } t_factor_task += timer.seconds() * (i>=0); } t_factor_task /= niter; if (verbose) cout << UU << endl; } cout << "CholPerformance:: ByBlocks factorize the matrix::time = " << t_factor_task << endl; } if (!skip_serial) { #ifdef HAVE_SHYLUTACHO_MKL cout << "CholPerformance:: mkl/chol scale [mkl/chol] = " << t_mkl_seq/t_factor_seq << endl; cout << "CholPerformance:: mkl/task scale [mkl/task] = " << t_mkl_seq/t_factor_task << endl; #else cout << "CholPerformance:: task scale [seq/task] = " << t_factor_seq/t_factor_task << endl; #endif //cout << "CholPerformance:: team scale [seq/team] = " << t_factor_seq/t_factor_team << endl; } return r_val; }
void VerletKokkos::run(int n) { bigint ntimestep; int nflag,sortflag; int n_post_integrate = modify->n_post_integrate; int n_pre_exchange = modify->n_pre_exchange; int n_pre_neighbor = modify->n_pre_neighbor; int n_pre_force = modify->n_pre_force; int n_post_force = modify->n_post_force; int n_end_of_step = modify->n_end_of_step; if (atomKK->sortfreq > 0) sortflag = 1; else sortflag = 0; static double time = 0.0; static int count = 0; atomKK->sync(Device,ALL_MASK); Kokkos::Impl::Timer ktimer; for (int i = 0; i < n; i++) { ntimestep = ++update->ntimestep; ev_set(ntimestep); // initial time integration ktimer.reset(); timer->stamp(); modify->initial_integrate(vflag); time += ktimer.seconds(); if (n_post_integrate) modify->post_integrate(); timer->stamp(Timer::MODIFY); // regular communication vs neighbor list rebuild nflag = neighbor->decide(); if (nflag == 0) { timer->stamp(); comm->forward_comm(); timer->stamp(Timer::COMM); } else { // added debug //atomKK->sync(Host,ALL_MASK); //atomKK->modified(Host,ALL_MASK); if (n_pre_exchange) { timer->stamp(); modify->pre_exchange(); timer->stamp(Timer::MODIFY); } // debug //atomKK->sync(Host,ALL_MASK); //atomKK->modified(Host,ALL_MASK); if (triclinic) domain->x2lamda(atomKK->nlocal); domain->pbc(); if (domain->box_change) { domain->reset_box(); comm->setup(); if (neighbor->style) neighbor->setup_bins(); } timer->stamp(); // added debug //atomKK->sync(Device,ALL_MASK); //atomKK->modified(Device,ALL_MASK); comm->exchange(); if (sortflag && ntimestep >= atomKK->nextsort) atomKK->sort(); comm->borders(); // added debug //atomKK->sync(Host,ALL_MASK); //atomKK->modified(Host,ALL_MASK); if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost); timer->stamp(Timer::COMM); if (n_pre_neighbor) { modify->pre_neighbor(); timer->stamp(Timer::MODIFY); } neighbor->build(); timer->stamp(Timer::NEIGH); } // force computations // important for pair to come before bonded contributions // since some bonded potentials tally pairwise 
energy/virial // and Pair:ev_tally() needs to be called before any tallying force_clear(); timer->stamp(); // added for debug //atomKK->k_x.sync<LMPHostType>(); //atomKK->k_f.sync<LMPHostType>(); //atomKK->k_f.modify<LMPHostType>(); if (n_pre_force) { modify->pre_force(vflag); timer->stamp(Timer::MODIFY); } if (pair_compute_flag) { atomKK->sync(force->pair->execution_space,force->pair->datamask_read); atomKK->modified(force->pair->execution_space,force->pair->datamask_modify); force->pair->compute(eflag,vflag); timer->stamp(Timer::PAIR); } if (atomKK->molecular) { if (force->bond) { atomKK->sync(force->bond->execution_space,force->bond->datamask_read); atomKK->modified(force->bond->execution_space,force->bond->datamask_modify); force->bond->compute(eflag,vflag); } if (force->angle) { atomKK->sync(force->angle->execution_space,force->angle->datamask_read); atomKK->modified(force->angle->execution_space,force->angle->datamask_modify); force->angle->compute(eflag,vflag); } if (force->dihedral) { atomKK->sync(force->dihedral->execution_space,force->dihedral->datamask_read); atomKK->modified(force->dihedral->execution_space,force->dihedral->datamask_modify); force->dihedral->compute(eflag,vflag); } if (force->improper) { atomKK->sync(force->improper->execution_space,force->improper->datamask_read); atomKK->modified(force->improper->execution_space,force->improper->datamask_modify); force->improper->compute(eflag,vflag); } timer->stamp(Timer::BOND); } if (kspace_compute_flag) { atomKK->sync(force->kspace->execution_space,force->kspace->datamask_read); atomKK->modified(force->kspace->execution_space,force->kspace->datamask_modify); force->kspace->compute(eflag,vflag); timer->stamp(Timer::KSPACE); } // reverse communication of forces if (force->newton) comm->reverse_comm(); timer->stamp(Timer::COMM); // force modifications, final time integration, diagnostics ktimer.reset(); if (n_post_force) modify->post_force(vflag); modify->final_integrate(); if (n_end_of_step) 
modify->end_of_step(); timer->stamp(Timer::MODIFY); time += ktimer.seconds(); // all output if (ntimestep == output->next) { atomKK->sync(Host,ALL_MASK); timer->stamp(); output->write(ntimestep); timer->stamp(Timer::OUTPUT); } } }
BASKER_INLINE int Basker<Int, Entry, Exe_Space>::factor_notoken(Int option) { //printf("factor no token called \n"); gn = A.ncol; gm = A.nrow; BASKER_MATRIX ATEMP; //Kokkos::Impl::Timer tza; if(Options.btf == BASKER_TRUE) { //JDB: We can change this for the new inteface gn = A.ncol; gm = A.nrow; ATEMP = A; A = BTF_A; } //printf("Switch time: %f \n", tza.seconds()); //Spit into Domain and Sep //----------------------Domain-------------------------// #ifdef BASKER_KOKKOS //====TIMER== #ifdef BASKER_TIME Kokkos::Impl::Timer timer; #endif //===TIMER=== typedef Kokkos::TeamPolicy<Exe_Space> TeamPolicy; if(btf_tabs_offset != 0) { if(Options.verbose == BASKER_TRUE) { printf("Factoring Dom num_threads: %d \n", num_threads); } Int domain_restart = 0; kokkos_nfactor_domain <Int,Entry,Exe_Space> domain_nfactor(this); Kokkos::parallel_for(TeamPolicy(num_threads,1), domain_nfactor); Kokkos::fence(); //=====Check for error====== while(true) { INT_1DARRAY thread_start; MALLOC_INT_1DARRAY(thread_start, num_threads+1); init_value(thread_start, num_threads+1, (Int) BASKER_MAX_IDX); int nt = nfactor_domain_error(thread_start); if((nt == BASKER_SUCCESS) || (nt == BASKER_ERROR) || (domain_restart > BASKER_RESTART)) { break; } else { domain_restart++; if(Options.verbose == BASKER_TRUE) { printf("restart \n"); } kokkos_nfactor_domain_remalloc <Int, Entry, Exe_Space> diag_nfactor_remalloc(this, thread_start); Kokkos::parallel_for(TeamPolicy(num_threads,1), diag_nfactor_remalloc); Kokkos::fence(); } }//end while //====TIMER=== #ifdef BASKER_TIME printf("Time DOMAIN: %f \n", timer.seconds()); timer.reset(); #endif //====TIMER==== #else// else basker_kokkos #pragma omp parallel { }//end omp parallel #endif //end basker_kokkos } //-------------------End--Domian--------------------------// //printVec("domperm.csc", gpermi, A.nrow); //---------------------------Sep--------------------------// if(btf_tabs_offset != 0) { //for(Int l=1; l<=4; l++) for(Int l=1; l <= tree.nlvls; l++) { //#ifdef 
BASKER_OLD_BARRIER //Int lthreads = pow(2,l); //Int lnteams = num_threads/lthreads; //#else Int lthreads = 1; Int lnteams = num_threads/lthreads; //#endif Int sep_restart = 0; if(Options.verbose == BASKER_TRUE) { printf("Factoring Sep num_threads: %d %d \n", lnteams, lthreads); } #ifdef BASKER_KOKKOS Kokkos::Impl::Timer timer_inner_sep; #ifdef BASKER_NO_LAMBDA //kokkos_nfactor_sep <Int, Entry, Exe_Space> //sep_nfactor(this, l); kokkos_nfactor_sep2 <Int, Entry, Exe_Space> sep_nfactor(this,l); Kokkos::parallel_for(TeamPolicy(lnteams,lthreads), sep_nfactor); Kokkos::fence(); //======Check for error===== while(true) { INT_1DARRAY thread_start; MALLOC_INT_1DARRAY(thread_start, num_threads+1); init_value(thread_start, num_threads+1, (Int) BASKER_MAX_IDX); int nt = nfactor_sep_error(thread_start); if((nt == BASKER_SUCCESS)|| (nt == BASKER_ERROR) || (sep_restart > BASKER_RESTART)) { FREE_INT_1DARRAY(thread_start); break; } else { sep_restart++; if (Options.verbose == BASKER_TRUE) { printf("restart \n"); } Kokkos::parallel_for(TeamPolicy(lnteams,lthreads), sep_nfactor); Kokkos::fence(); } }//end while-true #ifdef BASKER_TIME printf("Time INNERSEP: %d %f \n", l, timer_inner_sep.seconds()); #endif #else //ELSE BASKER_NO_LAMBDA //Note: to be added #endif //end BASKER_NO_LAMBDA #else #pragma omp parallel { }//end omp parallel #endif }//end over each level #ifdef BASKER_TIME printf("Time SEP: %f \n", timer.seconds()); #endif } //-------------------------End Sep----------------// //-------------------IF BTF-----------------------// if(Options.btf == BASKER_TRUE) { Int btf_restart = 0; if(Options.verbose == BASKER_TRUE) { printf("Factoring BLKs num_threads: %d \n", num_threads); } //=====Timer #ifdef BASKER_TIME Kokkos::Impl::Timer timer_btf; #endif //====Timer //======Call diag factor==== kokkos_nfactor_diag <Int, Entry, Exe_Space> diag_nfactor(this); Kokkos::parallel_for(TeamPolicy(num_threads,1), diag_nfactor); Kokkos::fence(); //=====Check for error====== while(true) { 
INT_1DARRAY thread_start; MALLOC_INT_1DARRAY(thread_start, num_threads+1); init_value(thread_start, num_threads+1, (Int) BASKER_MAX_IDX); int nt = nfactor_diag_error(thread_start); //printf("RETURNED: %d \n", nt); if((nt == BASKER_SUCCESS) || (nt == BASKER_ERROR) || (btf_restart > BASKER_RESTART)) { break; } else { btf_restart++; if (Options.verbose == BASKER_TRUE) { printf("restart \n"); } kokkos_nfactor_diag_remalloc <Int, Entry, Exe_Space> diag_nfactor_remalloc(this, thread_start); Kokkos::parallel_for(TeamPolicy(num_threads,1), diag_nfactor_remalloc); Kokkos::fence(); } }//end while //====TIMER #ifdef BASKER_TIME printf("Time BTF: %f \n", timer_btf.seconds()); #endif //===TIMER }//end btf call Kokkos::Impl::Timer tzback; if(Options.btf == BASKER_TRUE) { A = ATEMP; } //printf("Switch back: %f \n", // tzback.seconds()); return 0; }//end factor_notoken()
inline void pcgsolve( //const ImportType & import, KernelHandle &kh , const CrsMatrix <typename KernelHandle::nonzero_value_type , typename KernelHandle::row_index_type, typename KernelHandle::HandleExecSpace > & A , const Kokkos::View <typename KernelHandle::nonzero_value_type *, typename KernelHandle::HandleExecSpace> & b , const Kokkos::View <typename KernelHandle::nonzero_value_type * , typename KernelHandle::HandleExecSpace > & x , const size_t maximum_iteration = 200 , const double tolerance = std::numeric_limits<double>::epsilon() , CGSolveResult * result = 0 , bool use_sgs = true ) { typedef typename KernelHandle::HandleExecSpace Space; //typedef typename KernelHandle::nonzero_value_type MScalar; typedef typename KernelHandle::nonzero_value_type VScalar; //typedef typename KernelHandle::row_index_type Idx_Type; //typedef typename KernelHandle::idx_array_type idx_array_type; typedef typename Kokkos::View< VScalar * , Space > VectorType ; //const size_t count_owned = import.count_owned ; //const size_t count_total = import.count_owned + import.count_receive; const size_t count_owned = A.graph.nv; const size_t count_total = count_owned; size_t iteration = 0 ; double iter_time = 0 ; double matvec_time = 0 ; double norm_res = 0 ; double precond_time = 0; double precond_init_time = 0; Kokkos::Impl::Timer wall_clock ; Kokkos::Impl::Timer timer; // Need input vector to matvec to be owned + received VectorType pAll ( "cg::p" , count_total ); VectorType p = Kokkos::subview( pAll , std::pair<size_t,size_t>(0,count_owned) ); VectorType r ( "cg::r" , count_owned ); VectorType Ap( "cg::Ap", count_owned ); /* r = b - A * x ; */ /* p = x */ Kokkos::deep_copy( p , x ); ///* import p */ import( pAll ); /* Ap = A * p */ multiply( count_owned , Ap , A , pAll ); /* r = b - Ap */ waxpby( count_owned , r , 1.0 , b , -1.0 , Ap ); /* p = r */ Kokkos::deep_copy( p , r ); //double old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm ); double old_rdot = 
dot( count_owned , r , r ); norm_res = sqrt( old_rdot ); int apply_count = 1; VectorType z; //double precond_old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm ); double precond_old_rdot = 1; #ifdef PRECOND_NORM double precond_norm_res = 1; #endif Kokkos::deep_copy( p , z ); //typename KernelHandle::GaussSeidelHandleType *gsHandler; bool owner_handle = false; if (use_sgs){ if (kh.get_gs_handle() == NULL){ owner_handle = true; kh.create_gs_handle(); } //gsHandler = kh.get_gs_handle(); timer.reset(); KokkosKernels::Experimental::Graph::gauss_seidel_numeric (&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff); Space::fence(); precond_init_time += timer.seconds(); z = VectorType( "pcg::z" , count_owned ); Space::fence(); timer.reset(); KokkosKernels::Experimental::Graph::symmetric_gauss_seidel_apply (&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff, z, r, true, apply_count); Space::fence(); precond_time += timer.seconds(); //double precond_old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm ); precond_old_rdot = dot( count_owned , r , z ); #ifdef PRECOND_NORM precond_norm_res = sqrt( precond_old_rdot ); #endif Kokkos::deep_copy( p , z ); } iteration = 0 ; #ifdef PRINTRES std::cout << "norm_res:" << norm_res << " old_rdot:" << old_rdot<< std::endl; #ifdef PRECOND_NORM if (use_sgs) std::cout << "precond_norm_res:" << precond_norm_res << " precond_old_rdot:" << precond_old_rdot<< std::endl; #endif #endif while ( tolerance < norm_res && iteration < maximum_iteration ) { /* pAp_dot = dot( p , Ap = A * p ) */ timer.reset(); ///* import p */ import( pAll ); /* Ap = A * p */ multiply( count_owned , Ap , A , pAll ); Space::fence(); matvec_time += timer.seconds(); //const double pAp_dot = Kokkos::Example::all_reduce( dot( count_owned , p , Ap ) , import.comm ); const double pAp_dot = dot( count_owned , p , Ap ) ; double alpha = 0; if (use_sgs){ alpha = precond_old_rdot / 
pAp_dot ; } else { alpha = old_rdot / pAp_dot ; } /* x += alpha * p ; */ waxpby( count_owned , x , alpha, p , 1.0 , x ); /* r += -alpha * Ap ; */ waxpby( count_owned , r , -alpha, Ap , 1.0 , r ); //const double r_dot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm ); const double r_dot = dot( count_owned , r , r ); const double beta_original = r_dot / old_rdot ; double precond_r_dot = 1; double precond_beta = 1; if (use_sgs){ Space::fence(); timer.reset(); KokkosKernels::Experimental::Graph::symmetric_gauss_seidel_apply(&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff, z, r, true, apply_count); Space::fence(); precond_time += timer.seconds(); //const double precond_r_dot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm ); precond_r_dot = dot( count_owned , r , z ); precond_beta = precond_r_dot / precond_old_rdot ; } double beta = 1; if (!use_sgs){ beta = beta_original; /* p = r + beta * p ; */ waxpby( count_owned , p , 1.0 , r , beta , p ); } else { beta = precond_beta; waxpby( count_owned , p , 1.0 , z , beta , p ); } #ifdef PRINTRES std::cout << "\tbeta_original:" << beta_original << std::endl; if (use_sgs) std::cout << "\tprecond_beta:" << precond_beta << std::endl; #endif norm_res = sqrt( old_rdot = r_dot ); #ifdef PRECOND_NORM if (use_sgs){ precond_norm_res = sqrt( precond_old_rdot = precond_r_dot ); } #else precond_old_rdot = precond_r_dot; #endif #ifdef PRINTRES std::cout << "\tnorm_res:" << norm_res << " old_rdot:" << old_rdot<< std::endl; #ifdef PRECOND_NORM if (use_sgs) std::cout << "\tprecond_norm_res:" << precond_norm_res << " precond_old_rdot:" << precond_old_rdot<< std::endl; #endif #endif ++iteration ; } Space::fence(); iter_time = wall_clock.seconds(); if ( 0 != result ) { result->iteration = iteration ; result->iter_time = iter_time ; result->matvec_time = matvec_time ; result->norm_res = norm_res ; result->precond_time = precond_time; result->precond_init_time = 
precond_init_time; } if (use_sgs & owner_handle ){ kh.destroy_gs_handle(); } }
/// Measures the cost of creating, spawning and waiting on SimpleTask tasks
/// through Kokkos::Experimental::TaskPolicy.
///
/// For use_barrier = 0 and 1 the following are timed and printed:
///   - one single task,
///   - one team task,
///   - ntasks single tasks,
///   - ntasks team tasks.
///
/// @param ntasks              number of tasks in the bulk experiments; must
///                            not exceed MAXTASKS (size of the future arrays)
/// @param max_task_dependence forwarded to policy.create / create_team
/// @param team_size           not used by this routine
/// @param verbose             not used by this routine
/// @return 0 on success, -1 if ntasks exceeds MAXTASKS
KOKKOS_INLINE_FUNCTION
int exampleKokkosTaskData(const int ntasks,
                          const int max_task_dependence,
                          const int team_size,
                          const bool verbose) {
  typedef Kokkos::Experimental::TaskPolicy<SpaceType> policy_type ;
  typedef SimpleTask<policy_type> simple_task_type;
  typedef Kokkos::Experimental::Future<typename simple_task_type::value_type,SpaceType> future_type ;

  // The futures below live in fixed-size arrays of MAXTASKS entries;
  // refuse requests that would write past their end.
  if (ntasks > MAXTASKS) {
    cout << "KokkosTaskData:: ntasks = " << ntasks
         << " exceeds MAXTASKS = " << MAXTASKS << endl;
    return -1;
  }

  policy_type policy;

  Kokkos::Impl::Timer timer;

  for (int use_barrier=0;use_barrier<2;++use_barrier) {
    cout << "KokkosTaskData:: use barrier " << (use_barrier ? "yes" : "no") << endl;

    // one single task
    {
      timer.reset();

      future_type f = policy.create(simple_task_type(use_barrier), max_task_dependence);
      policy.spawn(f);

      Kokkos::Experimental::wait( policy );

      const double t = timer.seconds();
      cout << "KokkosTaskData:: single task is spawned :: time = " << t << endl;
    }

    // one team task
    {
      timer.reset();

      future_type f = policy.create_team(simple_task_type(use_barrier), max_task_dependence);
      policy.spawn(f);

      Kokkos::Experimental::wait( policy );

      const double t = timer.seconds();
      cout << "KokkosTaskData:: single team task is spawned :: time = " << t << endl;
    }

    // ntasks single tasks
    {
      timer.reset();

      future_type f[MAXTASKS];
      for (int i=0;i<ntasks;++i) {
        f[i] = policy.create(simple_task_type(use_barrier), max_task_dependence);
        policy.spawn(f[i]);
      }
      Kokkos::Experimental::wait( policy );

      const double t = timer.seconds();
      cout << "KokkosTaskData:: " << ntasks << " tasks are spawned :: time = " << t << endl;
    }

    // ntasks team tasks
    {
      timer.reset();

      future_type f[MAXTASKS];
      for (int i=0;i<ntasks;++i) {
        f[i] = policy.create_team(simple_task_type(use_barrier), max_task_dependence);
        policy.spawn(f[i]);
      }
      Kokkos::Experimental::wait( policy );

      const double t = timer.seconds();
      cout << "KokkosTaskData:: " << ntasks << " team tasks are spawned :: time = " << t << endl;
    }
  }

  return 0;
}
int ComputeBasis_HGRAD_Vector(const ordinal_type nworkset, const ordinal_type C, const ordinal_type order, const bool verbose) { typedef Vector<VectorTagType> VectorType; typedef typename VectorTagType::value_type ValueType; constexpr int VectorLength = VectorTagType::length; Teuchos::RCP<std::ostream> verboseStream; Teuchos::oblackholestream bhs; // outputs nothing if (verbose) verboseStream = Teuchos::rcp(&std::cout, false); else verboseStream = Teuchos::rcp(&bhs, false); Teuchos::oblackholestream oldFormatState; oldFormatState.copyfmt(std::cout); typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; *verboseStream << "DeviceSpace:: "; DeviceSpaceType::print_configuration(*verboseStream, false); *verboseStream << "HostSpace:: "; HostSpaceType::print_configuration(*verboseStream, false); *verboseStream << "VectorLength:: " << (VectorLength) << "\n"; using BasisTypeHost = Basis_HGRAD_HEX_C1_FEM<HostSpaceType,ValueType,ValueType>; using ImplBasisType = Impl::Basis_HGRAD_HEX_C1_FEM; using range_type = Kokkos::pair<ordinal_type,ordinal_type>; constexpr size_t LLC_CAPACITY = 32*1024*1024; Intrepid2::Test::Flush<LLC_CAPACITY,DeviceSpaceType> flush; Kokkos::Impl::Timer timer; double t_vectorize = 0; int errorFlag = 0; BasisTypeHost hostBasis; const auto cellTopo = hostBasis.getBaseCellTopology(); auto cubature = DefaultCubatureFactory::create<DeviceSpaceType,ValueType,ValueType>(cellTopo, order); const ordinal_type numCells = C, numCellsAdjusted = C/VectorLength + (C%VectorLength > 0), numVerts = cellTopo.getVertexCount(), numDofs = hostBasis.getCardinality(), numPoints = cubature->getNumPoints(), spaceDim = cubature->getDimension(); Kokkos::DynRankView<ValueType,HostSpaceType> dofCoordsHost("dofCoordsHost", numDofs, spaceDim); hostBasis.getDofCoords(dofCoordsHost); const auto refNodesHost = Kokkos::subview(dofCoordsHost, range_type(0, numVerts), Kokkos::ALL()); // pertub nodes 
Kokkos::DynRankView<VectorType,HostSpaceType> worksetCellsHost("worksetCellsHost", numCellsAdjusted, numVerts, spaceDim); for (ordinal_type cell=0;cell<numCells;++cell) { for (ordinal_type i=0;i<numVerts;++i) for (ordinal_type j=0;j<spaceDim;++j) { ValueType val = (rand()/(RAND_MAX + 1.0))*0.2 -0.1; worksetCellsHost(cell/VectorLength, i, j)[cell%VectorLength] = refNodesHost(i, j) + val; } } auto worksetCells = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), worksetCellsHost); Kokkos::deep_copy(worksetCells, worksetCellsHost); Kokkos::DynRankView<ValueType,DeviceSpaceType> refPoints("refPoints", numPoints, spaceDim), refWeights("refWeights", numPoints); cubature->getCubature(refPoints, refWeights); std::cout << "===============================================================================\n" << " Performance Test evaluating ComputeBasis \n" << " # of workset = " << nworkset << "\n" << " Test Array Structure (C,F,P,D) = " << numCells << ", " << numDofs << ", " << numPoints << ", " << spaceDim << "\n" << "===============================================================================\n"; *verboseStream << "\n" << "===============================================================================\n" << "TEST 1: evaluateFields vector version\n" << "===============================================================================\n"; try { Kokkos::DynRankView<ValueType,DeviceSpaceType> refBasisValues("refBasisValues", numDofs, numPoints), refBasisGrads ("refBasisGrads", numDofs, numPoints, spaceDim); ImplBasisType::getValues<DeviceSpaceType>(refBasisValues, refPoints, OPERATOR_VALUE); ImplBasisType::getValues<DeviceSpaceType>(refBasisGrads, refPoints, OPERATOR_GRAD); const ordinal_type ibegin = -3; // testing vertical approach { Kokkos::DynRankView<VectorType,DeviceSpaceType> weightedBasisValues("weightedBasisValues", numCellsAdjusted, numDofs, numPoints), weightedBasisGrads ("weightedBasisGrads", numCellsAdjusted, numDofs, numPoints, spaceDim); typedef 
F_hgrad_eval<VectorType,ValueType,DeviceSpaceType> FunctorType; using range_policy_type = Kokkos::Experimental::MDRangePolicy < DeviceSpaceType, Kokkos::Experimental::Rank<2>, Kokkos::IndexType<ordinal_type> >; range_policy_type policy( { 0, 0 }, { numCellsAdjusted, numPoints } ); FunctorType functor(weightedBasisValues, weightedBasisGrads, refBasisGrads, worksetCells, refWeights, refBasisValues, refBasisGrads); for (ordinal_type iwork=ibegin;iwork<nworkset;++iwork) { flush.run(); DeviceSpaceType::fence(); timer.reset(); Kokkos::parallel_for(policy, functor); DeviceSpaceType::fence(); t_vectorize += (iwork >= 0)*timer.seconds(); } } } catch (std::exception err) { *verboseStream << "UNEXPECTED ERROR !!! ----------------------------------------------------------\n"; *verboseStream << err.what() << '\n'; *verboseStream << "-------------------------------------------------------------------------------" << "\n\n"; errorFlag = -1000; } std::cout << "TEST HGRAD " << " t_vectorize = " << (t_vectorize/nworkset) << std::endl; if (errorFlag != 0) std::cout << "End Result: TEST FAILED\n"; else std::cout << "End Result: TEST PASSED\n"; // reset format state of std::cout std::cout.copyfmt(oldFormatState); return errorFlag; }
KOKKOS_INLINE_FUNCTION int exampleCholDirectPerformance(const string file_input, const int treecut, const int minblksize, const int prunecut, const int seed, const int niter, const int nthreads, const int max_task_dependence, const int team_size, const int league_size, const bool team_interface, const bool skip_serial, const bool mkl_interface, const bool verbose) { typedef ValueType value_type; typedef OrdinalType ordinal_type; typedef SizeType size_type; typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>, Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType; typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType; typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType; typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType; #ifdef HAVE_SHYLUTACHO_MKL typedef typename CrsMatrixBaseType::value_type_array value_type_array; #endif typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType; typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType; typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType; typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType; typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType; int r_val = 0; Kokkos::Impl::Timer timer; double t_import = 0.0, t_reorder = 0.0, t_symbolic = 0.0, t_flat2hier = 0.0, #ifdef HAVE_SHYLUTACHO_MKL t_mkl = 0.0, #endif t_factor_seq = 0.0, t_solve_seq = 0.0, t_factor_task = 0.0, t_solve_task = 0.0; const int start = 0; cout << "CholDirectPerformance:: import input file = " << file_input << endl; CrsMatrixBaseType AA("AA"); { timer.reset(); ifstream in; in.open(file_input); if (!in.good()) { cout << "Failed in open the file: " << file_input << endl; return ++r_val; } AA.importMatrixMarket(in); t_import = timer.seconds(); } cout << "CholDirectPerformance:: import input file::time = " << t_import << endl; cout << 
"CholDirectPerformance:: reorder the matrix" << endl; CrsMatrixBaseType PA("Permuted AA"); CrsMatrixBaseType UU("UU"); // permuted base upper triangular matrix CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views DenseMatrixBaseType BB("BB", AA.NumRows(), nrhs); DenseHierMatrixBaseType HB("HB"); { GraphHelperType S(AA, seed); { timer.reset(); S.computeOrdering(treecut, minblksize); S.pruneTree(prunecut); PA.copy(S.PermVector(), S.InvPermVector(), AA); t_reorder = timer.seconds(); } cout << "CholDirectPerformance:: reorder the matrix::time = " << t_reorder << endl; { SymbolicFactorHelperType F(PA, league_size); for (int i=start;i<niter;++i) { timer.reset(); F.createNonZeroPattern(Uplo::Upper, UU); t_symbolic += timer.seconds() * (i>=0); } t_symbolic /= niter; cout << "CholDirectPerformance:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl; } cout << "CholDirectPerformance:: symbolic factorization::time = " << t_symbolic << endl; { timer.reset(); CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU, S.NumBlocks(), S.RangeVector(), S.TreeVector()); for (ordinal_type k=0;k<HU.NumNonZeros();++k) HU.Value(k).fillRowViewArray(); DenseMatrixHelper::flat2hier(BB, HB, S.NumBlocks(), S.RangeVector(), nb); t_flat2hier = timer.seconds(); cout << "CholDirectPerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl; } cout << "CholDirectPerformance:: construct hierarchical matrix::time = " << t_flat2hier << endl; } // copy of UU CrsMatrixBaseType RR("RR"); RR.copy(UU); /////////////////////////// Serial Numeric Factorization if (!skip_serial) { #ifdef __USE_FIXED_TEAM_SIZE__ typename TaskFactoryType::policy_type policy(max_task_dependence); #else typename TaskFactoryType::policy_type policy(max_task_dependence, 1); #endif TaskFactoryType::setUseTeamInterface(team_interface); TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); CrsTaskViewType U(&UU); U.fillRowViewArray(); 
cout << "CholDirectPerformance:: Serial factorize the matrix" << endl; { for (int i=start;i<niter;++i) { UU.copy(RR); timer.reset(); { Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> ::invoke(TaskFactoryType::Policy(), TaskFactoryType::Policy().member_single(), U); } t_factor_seq += timer.seconds() * (i>=0); } t_factor_seq /= niter; } cout << "CholDirectPerformance:: Serial factorize the matrix::time = " << t_factor_seq << endl; cout << "CholDirectPerformance:: Serial forward/backward solve" << endl; { for (int i=start;i<niter;++i) { XX.copy(BB); timer.reset(); { TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked> ::invoke(TaskFactoryType::Policy(), TaskFactoryType::Policy().member_single(), Diag::NonUnit, U, X); TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked> ::invoke(TaskFactoryType::Policy(), TaskFactoryType::Policy().member_single(), Diag::NonUnit, U, X); } t_factor_seq += timer.seconds() * (i>=0); } t_factor_seq /= niter; } cout << "CholDirectPerformance:: Serial forward/backward solve::time = " << t_solve_seq << endl; } // if (!skip_serial) // cout << "CholDirectPerformance:: task scale [seq/task] = " << t_factor_seq/t_factor_task << endl; return r_val; }
int exampleDenseGemmByBlocks(const ordinal_type mmin, const ordinal_type mmax, const ordinal_type minc, const ordinal_type k, const ordinal_type mb, const int max_concurrency, const int max_task_dependence, const int team_size, const int mkl_nthreads, const bool check, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType; typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType; typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType; typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType; typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType; typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType; typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; std::cout << "DenseGemmByBlocks:: test matrices " <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << " , mb = " << mb << std::endl; const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); std::ostringstream os; os.precision(3); os << std::scientific; for (ordinal_type m=mmin;m<=mmax;m+=minc) { os.str(""); // host matrices DenseMatrixBaseHostType AA_host, BB_host, CC_host("CC_host", m, m), CB_host("CB_host", 
m, m); { if (ArgTransA == Trans::NoTranspose) AA_host = DenseMatrixBaseHostType("AA_host", m, k); else AA_host = DenseMatrixBaseHostType("AA_host", k, m); if (ArgTransB == Trans::NoTranspose) BB_host = DenseMatrixBaseHostType("BB_host", k, m); else BB_host = DenseMatrixBaseHostType("BB_host", m, k); for (ordinal_type j=0;j<AA_host.NumCols();++j) for (ordinal_type i=0;i<AA_host.NumRows();++i) AA_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; for (ordinal_type j=0;j<BB_host.NumCols();++j) for (ordinal_type i=0;i<BB_host.NumRows();++i) BB_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; for (ordinal_type j=0;j<CC_host.NumCols();++j) for (ordinal_type i=0;i<CC_host.NumRows();++i) CC_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; DenseMatrixTools::copy(CB_host, CC_host); } const double flop = DenseFlopCount<value_type>::Gemm(m, m, k); #ifdef HAVE_SHYLUTACHO_MKL mkl_set_num_threads(mkl_nthreads); #endif os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k << " "; if (check) { timer.reset(); DenseMatrixViewHostType A_host(AA_host), B_host(BB_host), C_host(CB_host); Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas,Variant::One>::invoke (policy, policy.member_single(), 1.0, A_host, B_host, 1.0, C_host); t = timer.seconds(); os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } DenseMatrixBaseDeviceType AA_device("AA_device"), BB_device("BB_device"), CC_device("CC_device"); { timer.reset(); AA_device.mirror(AA_host); BB_device.mirror(BB_host); CC_device.mirror(CC_host); t = timer.seconds(); os << ":: Mirror = " << t << " [sec] "; } { DenseHierMatrixBaseDeviceType HA_device("HA_device"), HB_device("HB_device"), HC_device("HC_device"); DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb); DenseMatrixTools::createHierMatrix(HB_device, BB_device, mb, mb); DenseMatrixTools::createHierMatrix(HC_device, CC_device, mb, mb); DenseHierTaskViewDeviceType TA_device(HA_device), TB_device(HB_device), 
TC_device(HC_device); timer.reset(); auto future = policy.proc_create_team (Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,ArgVariant> ::createTaskFunctor(policy, 1.0, TA_device, TB_device, 1.0, TC_device), 0); policy.spawn(future); Kokkos::Experimental::wait(policy); t = timer.seconds(); os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } CC_host.mirror(CC_device); if (check) { double err = 0.0, norm = 0.0; for (ordinal_type j=0;j<CC_host.NumCols();++j) for (ordinal_type i=0;i<CC_host.NumRows();++i) { const double diff = abs(CC_host.Value(i,j) - CB_host.Value(i,j)); const double val = CB_host.Value(i,j); err += diff*diff; norm += val*val; } os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err); } std::cout << os.str() << std::endl; } return r_val; }
/// Sparse Cholesky example driver ("CholDirectPlain").
///
/// Pipeline, each phase timed and reported on cout:
///   1. import the matrix from `file_input` via importMatrixMarket,
///   2. compute an ordering with GraphHelper_Scotch, prune the tree and build
///      the permuted matrix PA,
///   3. create the non-zero pattern of the upper factor UU,
///   4. build the hierarchical (block) views HU, HB, HX,
///   5. manufacture a right-hand side B = PA * X with X(i,j) = j+1 and copy it
///      into XX so the solve runs in place,
///   6. factorize either with the unblocked serial kernel (`serial`) or with the
///      ByBlocks task functor,
///   7. optionally run the forward/backward triangular solves (`solve`) and
///      print the L2 / Linf distance to the manufactured solution (`check`).
///
/// @param file_input           path of the input matrix file
/// @param prunecut             argument of GraphHelper::pruneTree
/// @param seed                 argument of the GraphHelper constructor
/// @param nrhs                 number of right-hand-side columns
/// @param nb                   argument of DenseMatrixHelper::flat2hier
/// @param nthreads             not referenced by this function
/// @param max_task_dependence  task policy / TaskFactory setting
/// @param team_size            only printed by this function
/// @param league_size          argument of the SymbolicFactorHelper constructor
/// @param team_interface       forwarded to TaskFactory::setUseTeamInterface
/// @param serial               use the unblocked kernels instead of ByBlocks tasks
/// @param solve                run the triangular solves after factorization
/// @param check                with `solve`, print the solution error norms
/// @param verbose              not referenced by this function
/// @return 0 on success, 1 when the input file cannot be opened
KOKKOS_INLINE_FUNCTION int exampleCholDirectPlain(const string file_input,
                                                  const int prunecut,
                                                  const int seed,
                                                  const int nrhs,
                                                  const int nb,
                                                  const int nthreads,
                                                  const int max_task_dependence,
                                                  const int team_size,
                                                  const int league_size,
                                                  const bool team_interface,
                                                  const bool serial,
                                                  const bool solve,
                                                  const bool check,
                                                  const bool verbose) {
  using value_type   = ValueType;
  using ordinal_type = OrdinalType;
  using size_type    = SizeType;

  using TaskFactoryType = TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
                                      Kokkos::Experimental::Future<int,SpaceType> >;

  using CrsMatrixBaseType        = CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits>;
  using GraphHelperType          = GraphHelper_Scotch<CrsMatrixBaseType>;
  using SymbolicFactorHelperType = SymbolicFactorHelper<CrsMatrixBaseType>;

  using CrsMatrixViewType = CrsMatrixView<CrsMatrixBaseType>;
  using CrsTaskViewType   = TaskView<CrsMatrixViewType,TaskFactoryType>;

  using CrsHierMatrixBaseType = CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits>;
  using CrsHierMatrixViewType = CrsMatrixView<CrsHierMatrixBaseType>;
  using CrsHierTaskViewType   = TaskView<CrsHierMatrixViewType,TaskFactoryType>;

  using DenseMatrixBaseType = DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits>;
  using DenseMatrixViewType = DenseMatrixView<DenseMatrixBaseType>;
  using DenseTaskViewType   = TaskView<DenseMatrixViewType,TaskFactoryType>;

  using DenseHierMatrixBaseType = DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits>;
  using DenseHierMatrixViewType = DenseMatrixView<DenseHierMatrixBaseType>;
  using DenseHierTaskViewType   = TaskView<DenseHierMatrixViewType,TaskFactoryType>;

  int status = 0;

  Kokkos::Impl::Timer stopwatch;
  double t_import    = 0.0;
  double t_reorder   = 0.0;
  double t_symbolic  = 0.0;
  double t_flat2hier = 0.0;
  double t_factor    = 0.0;
  double t_solve     = 0.0;

  // ---- phase 1: read the input matrix ------------------------------------
  cout << "CholDirectPlain:: import input file = " << file_input << endl;
  CrsMatrixBaseType AA("AA");
  {
    stopwatch.reset();

    ifstream matrix_file;
    matrix_file.open(file_input);
    if (!matrix_file.good()) {
      cout << "Failed in open the file: " << file_input << endl;
      return ++status;
    }
    AA.importMatrixMarket(matrix_file);

    t_import = stopwatch.seconds();
  }
  cout << "CholDirectPlain:: import input file::time = " << t_import << endl;

  // Permuted input, flat upper factor and its hierarchical counterpart.
  CrsMatrixBaseType PA("Permuted AA");
  CrsMatrixBaseType UU("UU");
  CrsHierMatrixBaseType HU("HU");

  // Right-hand side and solution storage (flat and hierarchical).
  DenseMatrixBaseType BB("BB", AA.NumRows(), nrhs);
  DenseMatrixBaseType XX("XX", AA.NumRows(), nrhs);
  DenseHierMatrixBaseType HB("HB");
  DenseHierMatrixBaseType HX("HX");

  // ---- phases 2-4: ordering, symbolic factorization, block views ---------
  {
    cout << "CholDirectPlain:: reorder the matrix" << endl;
    GraphHelperType ordering(AA, seed);
    {
      stopwatch.reset();

      ordering.computeOrdering();
      ordering.pruneTree(prunecut);
      PA.copy(ordering.PermVector(), ordering.InvPermVector(), AA);

      t_reorder = stopwatch.seconds();
    }
    cout << "CholDirectPlain:: reorder the matrix::time = " << t_reorder << endl;

    {
      SymbolicFactorHelperType symbolic(PA, league_size);

      stopwatch.reset();
      symbolic.createNonZeroPattern(Uplo::Upper, UU);
      t_symbolic = stopwatch.seconds();

      cout << "CholDirectPlain:: AA (nnz) = " << AA.NumNonZeros()
           << ", UU (nnz) = " << UU.NumNonZeros() << endl;
    }
    cout << "CholDirectPlain:: symbolic factorization::time = " << t_symbolic << endl;

    {
      stopwatch.reset();

      CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                 ordering.NumBlocks(),
                                 ordering.RangeVector(),
                                 ordering.TreeVector());

      // Every sparse block needs its row-view array before it can be used.
      for (ordinal_type blk = 0; blk < HU.NumNonZeros(); ++blk) {
        HU.Value(blk).fillRowViewArray();
      }

      DenseMatrixHelper::flat2hier(BB, HB, ordering.NumBlocks(), ordering.RangeVector(), nb);
      DenseMatrixHelper::flat2hier(XX, HX, ordering.NumBlocks(), ordering.RangeVector(), nb);

      t_flat2hier = stopwatch.seconds();

      cout << "CholDirectPlain:: Hier (dof, nnz) = " << HU.NumRows()
           << ", " << HU.NumNonZeros() << endl;
    }
    cout << "CholDirectPlain:: construct hierarchical matrix::time = " << t_flat2hier << endl;
  }

  // ---- phases 5-7: numeric factorization and solve ------------------------
  {
    // The policy object must outlive every task spawned below; TaskFactory
    // only stores its address.
#ifdef __USE_FIXED_TEAM_SIZE__
    typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
    typename TaskFactoryType::policy_type policy(max_task_dependence, 1);
#endif

    TaskFactoryType::setUseTeamInterface(team_interface);
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    CrsTaskViewType A_view(&PA);
    CrsTaskViewType U_view(&UU);
    DenseTaskViewType X_view(&XX);
    DenseTaskViewType B_view(&BB);
    A_view.fillRowViewArray();
    U_view.fillRowViewArray();

    CrsHierTaskViewType TU(&HU);
    DenseHierTaskViewType TB(&HB);
    DenseHierTaskViewType TX(&HX);

    {
      // Manufactured problem: column j of X is the constant j+1, B = A * X,
      // then XX is overwritten with B so the solves can work in place.
      const int num_rows = A_view.NumRows();
      for (int col = 0; col < nrhs; ++col) {
        for (int row = 0; row < num_rows; ++row) {
          X_view.Value(row, col) = (col + 1);
        }
      }

      Gemm<Trans::NoTranspose,Trans::NoTranspose,AlgoGemm::ForTriSolveBlocked>
        ::invoke(TaskFactoryType::Policy(),
                 TaskFactoryType::Policy().member_single(),
                 1.0, A_view, X_view, 0.0, B_view);

      XX.copy(BB);
    }

    if (serial) {
      cout << "CholDirectPlain:: Serial factorize the matrix" << endl;

      stopwatch.reset();
      Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
        ::invoke(TaskFactoryType::Policy(),
                 TaskFactoryType::Policy().member_single(),
                 U_view);
      t_factor = stopwatch.seconds();

      cout << "CholDirectPlain:: Serial factorize the matrix::time = " << t_factor << endl;
    } else {
      cout << "CholDirectPlain:: ByBlocks factorize the matrix:: team_size = " << team_size << endl;

      stopwatch.reset();
      auto factor_task = TaskFactoryType::Policy().create_team
        (Chol<Uplo::Upper,AlgoChol::ByBlocks>
         ::TaskFunctor<CrsHierTaskViewType>(TU), 0);
      TaskFactoryType::Policy().spawn(factor_task);
      Kokkos::Experimental::wait(TaskFactoryType::Policy());
      t_factor = stopwatch.seconds();

      cout << "CholDirectPlain:: ByBlocks factorize the matrix::time = " << t_factor << endl;
    }

    if (solve) {
      if (serial) {
        cout << "CholDirectPlain:: Serial forward/backward solve" << endl;

        stopwatch.reset();
        TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
          ::invoke(TaskFactoryType::Policy(),
                   TaskFactoryType::Policy().member_single(),
                   Diag::NonUnit, U_view, X_view);
        TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
          ::invoke(TaskFactoryType::Policy(),
                   TaskFactoryType::Policy().member_single(),
                   Diag::NonUnit, U_view, X_view);
        t_solve = stopwatch.seconds();

        cout << "CholDirectPlain:: Serial forward/backward solve::time = " << t_solve << endl;
      } else {
        cout << "CholDirectPlain:: ByBlocks forward/backward solve" << endl;

        stopwatch.reset();

        auto forward_task = TaskFactoryType::Policy().create_team
          (TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::ByBlocks>
           ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
           (Diag::NonUnit, TU, TX), 0);
        TaskFactoryType::Policy().spawn(forward_task);

        // The backward sweep is created with one dependence slot and chained
        // onto the forward sweep before it is spawned.
        auto backward_task = TaskFactoryType::Policy().create_team
          (TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::ByBlocks>
           ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
           (Diag::NonUnit, TU, TX), 1);
        TaskFactoryType::Policy().add_dependence(backward_task, forward_task);
        TaskFactoryType::Policy().spawn(backward_task);

        Kokkos::Experimental::wait(TaskFactoryType::Policy());

        t_solve = stopwatch.seconds();

        cout << "CholDirectPlain:: ByBlocks forward/backward solve::time = " << t_solve << endl;
      }

      if (check) {
        // Distance between the computed X and the manufactured X(i,j) = j+1.
        double err_l2   = 0.0;
        double err_linf = 0.0;
        const int num_rows = A_view.NumRows();
        for (int col = 0; col < nrhs; ++col) {
          for (int row = 0; row < num_rows; ++row) {
            double diff = abs(X_view.Value(row, col) - (col + 1));
            err_l2  += diff*diff;
            err_linf = max(diff, err_linf);
          }
        }
        err_l2 = sqrt(err_l2);

        cout << "CholDirectPlain:: Check solution::L2 = " << err_l2
             << ", Linf = " << err_linf << endl;
      }
    }
  }

  return status;
}
KOKKOS_INLINE_FUNCTION int exampleCholPerformanceSingle(const string file_input, const int treecut, const int minblksize, const int prunecut, const int seed, const int nthreads, const int max_task_dependence, const int team_size, const int fill_level, const int league_size, const bool team_interface, const bool skip_serial, const bool vtune_symbolic, const bool vtune_serial, const bool vtune_task, const bool verbose) { typedef ValueType value_type; typedef OrdinalType ordinal_type; typedef SizeType size_type; typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>, Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType; typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType; typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType; typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType; typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType; typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType; typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType; typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType; typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType; #ifdef HAVE_SHYLUTACHO_VTUNE __itt_pause(); #endif int r_val = 0; Kokkos::Impl::Timer timer; double t_import = 0.0, t_reorder = 0.0, t_symbolic = 0.0, t_flat2hier = 0.0, t_factor_seq = 0.0, t_factor_task = 0.0; cout << "CholPerformance:: import input file = " << file_input << endl; CrsMatrixBaseType AA("AA"); { timer.reset(); ifstream in; in.open(file_input); if (!in.good()) { cout << "Failed in open the file: " << file_input << endl; return ++r_val; } AA.importMatrixMarket(in); t_import = timer.seconds(); if (verbose) cout << AA << endl; } cout << "CholPerformance:: import input file::time = " << t_import << endl; cout << "CholPerformance:: reorder the matrix" << endl; CrsMatrixBaseType PA("Permuted AA"); CrsMatrixBaseType 
UU("UU"); // permuted base upper triangular matrix CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views { GraphHelperType S(AA, seed); { timer.reset(); S.computeOrdering(treecut, minblksize); S.pruneTree(prunecut); PA.copy(S.PermVector(), S.InvPermVector(), AA); t_reorder = timer.seconds(); if (verbose) cout << S << endl << PA << endl; } cout << "CholPerformance:: reorder the matrix::time = " << t_reorder << endl; { SymbolicFactorHelperType F(PA, league_size); if (vtune_symbolic) { #ifdef HAVE_SHYLUTACHO_VTUNE __itt_resume(); #endif } timer.reset(); F.createNonZeroPattern(fill_level, Uplo::Upper, UU); t_symbolic = timer.seconds(); if (vtune_symbolic) { #ifdef HAVE_SHYLUTACHO_VTUNE __itt_pause(); #endif } cout << "CholPerformance:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl; if (verbose) cout << F << endl << UU << endl; } cout << "CholPerformance:: symbolic factorization::time = " << t_symbolic << endl; { timer.reset(); CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU, S.NumBlocks(), S.RangeVector(), S.TreeVector()); for (ordinal_type k=0;k<HU.NumNonZeros();++k) HU.Value(k).fillRowViewArray(); t_flat2hier = timer.seconds(); cout << "CholPerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl; } cout << "CholPerformance:: construct hierarchical matrix::time = " << t_flat2hier << endl; } // copy of UU CrsMatrixBaseType RR("RR"); RR.copy(UU); const size_t max_concurrency = 16384; cout << "CholPerformance:: max concurrency = " << max_concurrency << endl; const size_t max_task_size = 3*sizeof(CrsTaskViewType)+128; cout << "CholPerformance:: max task size = " << max_task_size << endl; if (!skip_serial) { typename TaskFactoryType::policy_type policy(max_concurrency, max_task_size, max_task_dependence, 1); TaskFactoryType::setUseTeamInterface(team_interface); TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); CrsTaskViewType U(&UU); U.fillRowViewArray(); 
cout << "CholPerformance:: Serial factorize the matrix" << endl; { UU.copy(RR); if (vtune_serial) { #ifdef HAVE_SHYLUTACHO_VTUNE __itt_resume(); #endif } timer.reset(); { Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> ::invoke(TaskFactoryType::Policy(), TaskFactoryType::Policy().member_single(), U); } t_factor_seq = timer.seconds(); if (vtune_serial) { #ifdef HAVE_SHYLUTACHO_VTUNE __itt_pause(); #endif } if (verbose) cout << UU << endl; } cout << "CholPerformance:: Serial factorize the matrix::time = " << t_factor_seq << endl; } { typename TaskFactoryType::policy_type policy(max_concurrency, max_task_size, max_task_dependence, team_size); TaskFactoryType::setUseTeamInterface(team_interface); TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); cout << "CholPerformance:: ByBlocks factorize the matrix:: team_size = " << team_size << endl; CrsHierTaskViewType H(&HU); { UU.copy(RR); if (vtune_task) { #ifdef HAVE_SHYLUTACHO_VTUNE __itt_resume(); #endif } timer.reset(); { auto future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks>:: TaskFunctor<CrsHierTaskViewType>(H), 0); TaskFactoryType::Policy().spawn(future); Kokkos::Experimental::wait(TaskFactoryType::Policy()); } t_factor_task += timer.seconds(); if (vtune_task) { #ifdef HAVE_SHYLUTACHO_VTUNE __itt_pause(); #endif } if (verbose) cout << UU << endl; } cout << "CholPerformance:: ByBlocks factorize the matrix::time = " << t_factor_task << endl; } if (!skip_serial) { cout << "CholPerformance:: task scale [seq/task] = " << t_factor_seq/t_factor_task << endl; } return r_val; }