size_t test_global_to_local_ids(unsigned num_ids, unsigned capacity, unsigned num_find_iterations) { typedef Device execution_space; typedef typename execution_space::size_type size_type; typedef Kokkos::View<uint32_t*,execution_space> local_id_view; typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view; double elasped_time = 0; Kokkos::Impl::Timer timer; local_id_view local_2_global("local_ids", num_ids); global_id_view global_2_local(capacity); int shiftw = 15; //create elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "allocate: " << elasped_time << std::endl; timer.reset(); // generate unique ids { generate_ids<Device> gen(local_2_global); } // generate elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "generate: " << elasped_time << std::endl; timer.reset(); { fill_map<Device> fill(global_2_local, local_2_global); } // fill elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "fill: " << elasped_time << std::endl; timer.reset(); size_t num_errors = global_2_local.failed_insert(); if (num_errors == 0u) { for (unsigned i=0; i<num_find_iterations; ++i) { find_test<Device> find(global_2_local, local_2_global,num_errors); } // find elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "lookup: " << elasped_time << std::endl; } else { std::cout << " !!! Fill Failed !!!" << std::endl; } return num_errors; }
void Loop(int loop, int test, const char* type_name) { LoopVariant<T>(loop,test); Kokkos::Impl::Timer timer; T res = LoopVariant<T>(loop,test); double time1 = timer.seconds(); timer.reset(); T resNonAtomic = LoopVariantNonAtomic<T>(loop,test); double time2 = timer.seconds(); timer.reset(); T resSerial = LoopVariantSerial<T>(loop,test); double time3 = timer.seconds(); time1*=1e6/loop; time2*=1e6/loop; time3*=1e6/loop; textcolor_standard(); bool passed = true; if(resSerial!=res) passed = false; if(!passed) textcolor(RESET,BLACK,YELLOW); printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",type_name,test,passed?"PASSED":"FAILED",loop,1.0*resSerial,1.0*res,1.0*resNonAtomic,time1,time2,time3,(int)sizeof(T)); if(!passed) textcolor_standard(); printf("\n"); }
int main(int narg, char* arg[]) { Kokkos::initialize(narg,arg); int size = 1000000; // Create DualViews. This will allocate on both the device and its // host_mirror_device. idx_type idx("Idx",size,64); view_type dest("Dest",size); view_type src("Src",size); srand(134231); // Get a reference to the host view of idx directly (equivalent to // idx.view<idx_type::host_mirror_device_type>() ) idx_type::t_host h_idx = idx.h_view; for (int i = 0; i < size; ++i) { for (view_type::size_type j=0; j < h_idx.dimension_1 (); ++j) { h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size; } } // Mark idx as modified on the host_mirror_device_type so that a // sync to the device will actually move data. The sync happens in // the functor's constructor. idx.modify<idx_type::host_mirror_device_type>(); // Run on the device. This will cause a sync of idx to the device, // since it was marked as modified on the host. Kokkos::Impl::Timer timer; Kokkos::parallel_for(size,localsum<view_type::device_type>(idx,dest,src)); Kokkos::fence(); double sec1_dev = timer.seconds(); timer.reset(); Kokkos::parallel_for(size,localsum<view_type::device_type>(idx,dest,src)); Kokkos::fence(); double sec2_dev = timer.seconds(); // Run on the host (could be the same as device). This will cause a // sync back to the host of dest. Note that if the Device is CUDA, // the data layout will not be optimal on host, so performance is // lower than what it would be for a pure host compilation. timer.reset(); Kokkos::parallel_for(size,localsum<view_type::host_mirror_device_type>(idx,dest,src)); Kokkos::fence(); double sec1_host = timer.seconds(); timer.reset(); Kokkos::parallel_for(size,localsum<view_type::host_mirror_device_type>(idx,dest,src)); Kokkos::fence(); double sec2_host = timer.seconds(); printf("Device Time with Sync: %lf without Sync: %lf \n",sec1_dev,sec2_dev); printf("Host Time with Sync: %lf without Sync: %lf \n",sec1_host,sec2_host); Kokkos::finalize(); }
void test_global_to_local_ids(unsigned num_ids) { typedef Device device_type; typedef typename device_type::size_type size_type; typedef Kokkos::View<uint32_t*,device_type> local_id_view; typedef Kokkos::UnorderedMap<uint32_t,size_type,device_type> global_id_view; //size std::cout << num_ids << ", "; double elasped_time = 0; Kokkos::Impl::Timer timer; local_id_view local_2_global("local_ids", num_ids); global_id_view global_2_local((3u*num_ids)/2u); //create elasped_time = timer.seconds(); std::cout << elasped_time << ", "; timer.reset(); // generate unique ids { generate_ids<Device> gen(local_2_global); } Device::fence(); // generate elasped_time = timer.seconds(); std::cout << elasped_time << ", "; timer.reset(); { fill_map<Device> fill(global_2_local, local_2_global); } Device::fence(); // fill elasped_time = timer.seconds(); std::cout << elasped_time << ", "; timer.reset(); size_t num_errors = 0; for (int i=0; i<100; ++i) { find_test<Device> find(global_2_local, local_2_global,num_errors); } Device::fence(); // find elasped_time = timer.seconds(); std::cout << elasped_time << std::endl; ASSERT_EQ( num_errors, 0u); }
int main(int narg, char* arg[]) { Kokkos::initialize(narg,arg); int size = 1000000; // Create Views idx_type idx("Idx",size,64); view_type dest("Dest",size); view_type src("Src",size); srand(134231); // When using UVM Cuda views can be accessed on the Host directly for(int i=0; i<size; i++) { for(int j=0; j<idx.dimension_1(); j++) idx(i,j) = (size + i + (rand()%500 - 250))%size; } Kokkos::fence(); // Run on the device // This will cause a sync of idx to the device since it was modified on the host Kokkos::Impl::Timer timer; Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src)); Kokkos::fence(); double sec1_dev = timer.seconds(); // No data transfer will happen now, since nothing is accessed on the host timer.reset(); Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src)); Kokkos::fence(); double sec2_dev = timer.seconds(); // Run on the host // This will cause a sync back to the host of dest which was changed on the device // Compare runtime here with the dual_view example: dest will be copied back in 4k blocks // when they are accessed the first time during the parallel_for. Due to the latency of a memcpy // this gives lower effective bandwidth when doing a manual copy via dual views timer.reset(); Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src)); Kokkos::fence(); double sec1_host = timer.seconds(); // No data transfers will happen now timer.reset(); Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src)); Kokkos::fence(); double sec2_host = timer.seconds(); printf("Device Time with Sync: %lf without Sync: %lf \n",sec1_dev,sec2_dev); printf("Host Time with Sync: %lf without Sync: %lf \n",sec1_host,sec2_host); Kokkos::finalize(); }
// Symbolic factorization entry point.
//
// Prints a trace line, runs sfactor(), and (when BASKER_KOKKOS_TIME is
// defined) adds the elapsed seconds to stats.time_sfactor. `option` is
// compared against 0 and 1, but neither branch currently does any work.
// Always returns 0.
BASKER_INLINE
int Basker<Int, Entry, Exe_Space>::Symbolic(Int option)
{
  printf("calling symbolic \n");

#ifdef BASKER_KOKKOS_TIME
  Kokkos::Impl::Timer timer;
#endif

  // symmetric_sfactor() is the disabled alternative.
  sfactor();

  if (option == 0)
  {
    // nothing option-specific
  }
  else if (option == 1)
  {
    // nothing option-specific
  }

#ifdef BASKER_KOKKOS_TIME
  const double sfactor_seconds = timer.seconds();
  stats.time_sfactor += sfactor_seconds;
#endif

  return 0;
}//end Symbolic
// Numeric factorization entry point.
//
// Runs factor_notoken(option) and, when BASKER_KOKKOS_TIME is defined, adds
// its elapsed seconds to stats.time_nfactor. Afterwards the permutation and
// workspace arrays are allocated, permute_composition_for_solve() is called,
// and factor_flag is set. Always returns 0.
BASKER_INLINE
int Basker<Int, Entry, Exe_Space>::Factor(Int option)
{
#ifdef BASKER_KOKKOS_TIME
  Kokkos::Impl::Timer timer;
#endif

  factor_notoken(option);

#ifdef BASKER_KOKKOS_TIME
  const double nfactor_seconds = timer.seconds();
  stats.time_nfactor += nfactor_seconds;
#endif

  // NDE: copies of the x/y views; per the original note they are used in
  // basker_solve_rhs and the allocation was moved here.
  MALLOC_ENTRY_1DARRAY(x_view_ptr_copy, gn);
  MALLOC_ENTRY_1DARRAY(y_view_ptr_copy, gm);

  // Composed permutation arrays: the inverse is sized gm (y side), the
  // forward one gn (x side).
  MALLOC_INT_1DARRAY(perm_inv_comp_array , gm);
  MALLOC_INT_1DARRAY(perm_comp_array, gn);

  // Integer and Entry-typed workspaces, both sized gn.
  MALLOC_INT_1DARRAY(perm_comp_iworkspace_array, gn);
  MALLOC_ENTRY_1DARRAY(perm_comp_fworkspace_array, gn);

  permute_composition_for_solve();

  factor_flag = BASKER_TRUE;

  return 0;
}//end Factor()
// Colors the graph described by `handle` and stores the result back on the
// handle's graph-coloring sub-handle.
//
// The row map and entries are read from `handle`; the coloring algorithm is
// read from the graph-coloring handle and selects which GraphColor
// implementation is instantiated. On return the coloring handle holds the
// vertex colors, the number of phases, and the elapsed time (also added to
// its overall coloring time).
//
// Throws (via Kokkos::Impl::throw_runtime_exception) when the selected
// algorithm has no implementation here -- in particular COLORING_DEFAULT,
// which must be resolved to a concrete algorithm before this call.
// Previously that case dereferenced a NULL pointer.
void graph_color_symbolic(KernelHandle *handle){

  Kokkos::Impl::Timer timer;

  typename KernelHandle::idx_array_type row_map = handle->get_row_map();
  typename KernelHandle::idx_edge_array_type entries = handle->get_entries();

  typename KernelHandle::GraphColoringHandleType *gch = handle->get_graph_coloring_handle();

  Experimental::KokkosKernels::Graph::ColoringAlgorithm algorithm = gch->get_coloring_type();

  typedef typename KernelHandle::GraphColoringHandleType::color_array_type color_view_type;

  // One color per vertex; the vertex count is row_map's extent minus one.
  color_view_type colors_out = color_view_type("Graph Colors", row_map.dimension_0() - 1);

  typedef typename Experimental::KokkosKernels::Graph::Impl::GraphColor
      <typename KernelHandle::GraphColoringHandleType> BaseGraphColoring;
  BaseGraphColoring *gc = NULL;

  switch (algorithm){
  case Experimental::KokkosKernels::Graph::COLORING_SERIAL:
    gc = new BaseGraphColoring(
        row_map.dimension_0() - 1, entries.dimension_0(),
        row_map, entries, gch);
    break;

  case Experimental::KokkosKernels::Graph::COLORING_VB:
  case Experimental::KokkosKernels::Graph::COLORING_VBBIT:
  case Experimental::KokkosKernels::Graph::COLORING_VBCS:
    typedef typename Experimental::KokkosKernels::Graph::Impl::GraphColor_VB
        <typename KernelHandle::GraphColoringHandleType> VBGraphColoring;
    gc = new VBGraphColoring(
        row_map.dimension_0() - 1, entries.dimension_0(),
        row_map, entries, gch);
    break;

  case Experimental::KokkosKernels::Graph::COLORING_EB:
    typedef typename Experimental::KokkosKernels::Graph::Impl::GraphColor_EB
        <typename KernelHandle::GraphColoringHandleType> EBGraphColoring;
    gc = new EBGraphColoring(
        row_map.dimension_0() - 1, entries.dimension_0(),
        row_map, entries, gch);
    break;

  case Experimental::KokkosKernels::Graph::COLORING_DEFAULT:
    // No implementation is bound to the placeholder value; handled below.
    break;
  }

  // Fail loudly instead of dereferencing NULL when nothing was selected.
  if (gc == NULL) {
    Kokkos::Impl::throw_runtime_exception(
        "graph_color_symbolic: no coloring implementation for the selected algorithm "
        "(COLORING_DEFAULT must be resolved to a concrete algorithm first)");
  }

  int num_phases = 0;
  try {
    gc->color_graph(colors_out, num_phases);
  }
  catch (...) {
    // Do not leak the implementation object if coloring throws.
    delete gc;
    throw;
  }
  delete gc;

  double coloring_time = timer.seconds();
  gch->add_to_overall_coloring_time(coloring_time);
  gch->set_coloring_time(coloring_time);
  gch->set_num_phases(num_phases);
  gch->set_vertex_colors(colors_out);
}
// Numeric factorization entry point.
//
// Delegates to factor_notoken(option). When BASKER_KOKKOS_TIME is defined,
// the elapsed seconds of that call are added to stats.time_nfactor.
// Always returns 0.
BASKER_INLINE
int Basker<Int, Entry, Exe_Space>::Factor(Int option)
{
#ifdef BASKER_KOKKOS_TIME
  Kokkos::Impl::Timer timer;
#endif

  factor_notoken(option);

#ifdef BASKER_KOKKOS_TIME
  const double nfactor_seconds = timer.seconds();
  stats.time_nfactor += nfactor_seconds;
#endif

  return 0;
}//end Factor()
void mexFunction(int nlhs, mxArray *plhs [], int nrhs, const mxArray *prhs []) { Kokkos::Impl::Timer time; Kokkos::initialize(); string name = typeid(Kokkos::DefaultExecutionSpace).name(); mexPrintf("\n Kokkos is initialized with a default spaceL: %s\n", name.c_str()); Kokkos::finalize(); mexPrintf("\n Kokkos is finalized\n"); plhs[0] = mxCreateDoubleScalar(time.seconds()); }
static double factorization( const multivector_type Q_ , const multivector_type R_ ) { #if defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) using Kokkos::Experimental::ALL ; #else const Kokkos::ALL ALL ; #endif const size_type count = Q_.dimension_1(); value_view tmp("tmp"); value_view one("one"); Kokkos::deep_copy( one , (Scalar) 1 ); Kokkos::Impl::Timer timer ; for ( size_type j = 0 ; j < count ; ++j ) { // Reduction : tmp = dot( Q(:,j) , Q(:,j) ); // PostProcess : tmp = sqrt( tmp ); R(j,j) = tmp ; tmp = 1 / tmp ; const vector_type Qj = Kokkos::subview( Q_ , ALL , j ); const value_view Rjj = Kokkos::subview( R_ , j , j ); invnorm2( Qj , Rjj , tmp ); // Q(:,j) *= ( 1 / R(j,j) ); => Q(:,j) *= tmp ; Kokkos::scale( tmp , Qj ); for ( size_t k = j + 1 ; k < count ; ++k ) { const vector_type Qk = Kokkos::subview( Q_ , ALL , k ); const value_view Rjk = Kokkos::subview( R_ , j , k ); // Reduction : R(j,k) = dot( Q(:,j) , Q(:,k) ); // PostProcess : tmp = - R(j,k); dot_neg( Qj , Qk , Rjk , tmp ); // Q(:,k) -= R(j,k) * Q(:,j); => Q(:,k) += tmp * Q(:,j) Kokkos::axpby( tmp , Qj , one , Qk ); } } execution_space::fence(); return timer.seconds(); }
void profile_end_kernel(const std::string& kernel_name, const std::string& exec_space) { Kokkos::Impl::Timer* timer = get_timer(); double time = timer->seconds(); KernelEntry* entry = get_kernel_list_head(); if(entry == NULL) { KernelEntry* entry_new = new KernelEntry(entry,time,kernel_name,exec_space); get_kernel_list_head(entry_new); return; } bool found = entry->matches(kernel_name,exec_space); while(!found && (entry->next!=NULL) ) { entry = entry->next; found = entry->matches(kernel_name,exec_space); } if(found) entry->add_time(time); else entry->next = new KernelEntry(entry,time,kernel_name,exec_space); }
int main(int narg, char* args[]) { Kokkos::initialize(narg,args); int chunk_size = 1024; int nchunks = 100000; //1024*1024; Kokkos::DualView<int*> data("data",nchunks*chunk_size+1); srand(1231093); for(int i = 0; i < data.dimension_0(); i++) { data.h_view(i) = rand()%TS; } data.modify<Host>(); data.sync<Device>(); Kokkos::DualView<int**> histogram("histogram",TS,TS); Kokkos::Impl::Timer timer; // Threads/team (TS) is automically limited to the maximum supported by the device. Kokkos::parallel_for( team_policy( nchunks , TS ) , find_2_tuples(chunk_size,data,histogram) ); Kokkos::fence(); double time = timer.seconds(); histogram.sync<Host>(); printf("Time: %lf \n\n",time); int sum = 0; for(int k=0; k<TS; k++) { for(int l=0; l<TS; l++) { printf("%i ",histogram.h_view(k,l)); sum += histogram.h_view(k,l); } printf("\n"); } printf("Result: %i %i\n",sum,chunk_size*nchunks); Kokkos::finalize(); }
int main(int narg, char* args[]) { Kokkos::initialize(narg,args); int chunk_size = 1024; int nchunks = 100000; //1024*1024; Kokkos::DualView<int*> data("data",nchunks*chunk_size+1); srand(1231093); for(int i = 0; i < data.dimension_0(); i++) { data.h_view(i) = rand()%TS; } data.modify<Host>(); data.sync<Device>(); Kokkos::DualView<int**> histogram("histogram",TS,TS); Kokkos::Impl::Timer timer; Kokkos::parallel_for( Kokkos::ParallelWorkRequest(nchunks,TS<Device::team_max()?TS:Device::team_max()), find_2_tuples(chunk_size,data,histogram)); Kokkos::fence(); double time = timer.seconds(); histogram.sync<Host>(); printf("Time: %lf \n\n",time); int sum = 0; for(int k=0; k<TS; k++) { for(int l=0; l<TS; l++) { printf("%i ",histogram.h_view(k,l)); sum += histogram.h_view(k,l); } printf("\n"); } printf("Result: %i %i\n",sum,chunk_size*nchunks); Kokkos::finalize(); }
static double test( const int count , const int iter = 1 ) { elem_coord_type coord( "coord" , count ); elem_grad_type grad ( "grad" , count ); // Execute the parallel kernels on the arrays: double dt_min = 0 ; Kokkos::parallel_for( count , Init( coord ) ); device_type::fence(); for ( int i = 0 ; i < iter ; ++i ) { Kokkos::Impl::Timer timer ; Kokkos::parallel_for( count , HexGrad<device_type>( coord , grad ) ); device_type::fence(); const double dt = timer.seconds(); if ( 0 == i ) dt_min = dt ; else dt_min = dt < dt_min ? dt : dt_min ; } return dt_min ; }
KOKKOS_INLINE_FUNCTION int exampleCholPerformance(const string file_input, const int treecut, const int minblksize, const int prunecut, const int seed, const int niter, const int nthreads, const int max_task_dependence, const int team_size, const int fill_level, const int league_size, const bool team_interface, const bool skip_serial, const bool mkl_interface, const bool verbose) { typedef ValueType value_type; typedef OrdinalType ordinal_type; typedef SizeType size_type; typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>, Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType; typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType; typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType; typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType; #ifdef HAVE_SHYLUTACHO_MKL typedef typename CrsMatrixBaseType::value_type_array value_type_array; #endif typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType; typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType; typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType; typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType; typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType; int r_val = 0; Kokkos::Impl::Timer timer; double t_import = 0.0, t_reorder = 0.0, t_symbolic = 0.0, t_flat2hier = 0.0, #ifdef HAVE_SHYLUTACHO_MKL t_mkl_seq = 0.0, #endif t_factor_seq = 0.0, t_factor_task = 0.0; const int start = -2; cout << "CholPerformance:: import input file = " << file_input << endl; CrsMatrixBaseType AA("AA"); { timer.reset(); ifstream in; in.open(file_input); if (!in.good()) { cout << "Failed in open the file: " << file_input << endl; return ++r_val; } AA.importMatrixMarket(in); t_import = timer.seconds(); if (verbose) cout << AA << endl; } cout << "CholPerformance:: import input file::time = " << t_import << endl; cout << "CholPerformance:: 
reorder the matrix" << endl; CrsMatrixBaseType PA("Permuted AA"); CrsMatrixBaseType UU("UU"); // permuted base upper triangular matrix CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views { GraphHelperType S(AA, seed); { timer.reset(); S.computeOrdering(treecut, minblksize); S.pruneTree(prunecut); PA.copy(S.PermVector(), S.InvPermVector(), AA); t_reorder = timer.seconds(); if (verbose) cout << S << endl << PA << endl; } cout << "CholPerformance:: reorder the matrix::time = " << t_reorder << endl; { SymbolicFactorHelperType F(PA, league_size); for (int i=start;i<niter;++i) { timer.reset(); F.createNonZeroPattern(fill_level, Uplo::Upper, UU); // UU.copy(Uplo::Upper, PA); t_symbolic += timer.seconds() * (i>=0); } t_symbolic /= niter; cout << "CholPerformance:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl; if (verbose) cout << F << endl << UU << endl; } cout << "CholPerformance:: symbolic factorization::time = " << t_symbolic << endl; { timer.reset(); CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU, S.NumBlocks(), S.RangeVector(), S.TreeVector()); for (ordinal_type k=0;k<HU.NumNonZeros();++k) HU.Value(k).fillRowViewArray(); t_flat2hier = timer.seconds(); cout << "CholPerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl; } cout << "CholPerformance:: construct hierarchical matrix::time = " << t_flat2hier << endl; } // copy of UU CrsMatrixBaseType RR("RR"); RR.copy(UU); #ifdef HAVE_SHYLUTACHO_MKL if (!skip_serial && mkl_interface) { cout << "CholPerformance:: MKL factorize the matrix" << endl; CrsMatrixBaseType MM("MM"); for (int i=start;i<niter;++i) { MM.copy(RR); MM.hermitianize(Uplo::Upper); MKL_INT n = static_cast<MKL_INT>(MM.NumRows()); double *a = static_cast<double*>(MM.ValuePtr()); MKL_INT *ia = static_cast<MKL_INT*>(MM.RowPtr()); MKL_INT *ja = static_cast<MKL_INT*>(MM.ColPtr()); // convert to 1-based matrix { for (ordinal_type k=0;k<(MM.NumRows()+1);++k) ++ia[k]; for (size_type 
k=0;k<MM.NumNonZeros();++k) ++ja[k]; } value_type_array mkl_result = value_type_array("mkl-ilu-values", MM.NumNonZeros()); double *bilu0 = static_cast<double*>(&mkl_result[0]); MKL_INT ipar[128]; double dpar[128]; MKL_INT ierr; // we provide ilu-k pattern timer.reset(); dcsrilu0(&n, a, ia, ja, bilu0, ipar, dpar, &ierr); t_mkl_seq += timer.seconds() * (i>=0) * 0.5; if (ierr != 0) cout << " MKL Error = " << ierr << endl; } t_mkl_seq /= niter; cout << "CholPerformance:: MKL factorize the matrix::time = " << t_mkl_seq << endl; } #endif if (!skip_serial) { #ifdef __USE_FIXED_TEAM_SIZE__ typename TaskFactoryType::policy_type policy(max_task_dependence); #else typename TaskFactoryType::policy_type policy(max_task_dependence, 1); #endif TaskFactoryType::setUseTeamInterface(team_interface); TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); CrsTaskViewType U(&UU); U.fillRowViewArray(); cout << "CholPerformance:: Serial factorize the matrix" << endl; { for (int i=start;i<niter;++i) { UU.copy(RR); timer.reset(); // { // auto future = TaskFactoryType::Policy().create(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> // ::TaskFunctor<CrsTaskViewType>(U), 0); // TaskFactoryType::Policy().spawn(future); // Kokkos::Experimental::wait(TaskFactoryType::Policy()); // } { Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> ::invoke(TaskFactoryType::Policy(), TaskFactoryType::Policy().member_single(), U); } t_factor_seq += timer.seconds() * (i>=0); } t_factor_seq /= niter; if (verbose) cout << UU << endl; } cout << "CholPerformance:: Serial factorize the matrix::time = " << t_factor_seq << endl; } // { // #ifdef __USE_FIXED_TEAM_SIZE__ // typename TaskFactoryType::policy_type policy(max_task_dependence); // #else // typename TaskFactoryType::policy_type policy(max_task_dependence, nthreads); // #endif // TaskFactoryType::setPolicy(&policy); // CrsTaskViewType U(&UU); // U.fillRowViewArray(); // cout << "CholPerformance:: Team 
factorize the matrix:: team_size = " << nthreads << endl; // { // timer.reset(); // auto future = TaskFactoryType::Policy().create(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> // ::TaskFunctor<CrsTaskViewType>(U), 0); // TaskFactoryType::Policy().spawn(future); // Kokkos::Experimental::wait(TaskFactoryType::Policy()); // t_factor_team = timer.seconds(); // if (verbose) // cout << UU << endl; // } // cout << "CholPerformance:: Team factorize the matrix::time = " << t_factor_team << endl; // } { #ifdef __USE_FIXED_TEAM_SIZE__ typename TaskFactoryType::policy_type policy(max_task_dependence); #else typename TaskFactoryType::policy_type policy(max_task_dependence, team_size); #endif TaskFactoryType::setUseTeamInterface(team_interface); TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); cout << "CholPerformance:: ByBlocks factorize the matrix:: team_size = " << team_size << endl; CrsHierTaskViewType H(&HU); { for (int i=start;i<niter;++i) { UU.copy(RR); timer.reset(); { auto future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks>:: TaskFunctor<CrsHierTaskViewType>(H), 0); TaskFactoryType::Policy().spawn(future); Kokkos::Experimental::wait(TaskFactoryType::Policy()); } t_factor_task += timer.seconds() * (i>=0); } t_factor_task /= niter; if (verbose) cout << UU << endl; } cout << "CholPerformance:: ByBlocks factorize the matrix::time = " << t_factor_task << endl; } if (!skip_serial) { #ifdef HAVE_SHYLUTACHO_MKL cout << "CholPerformance:: mkl/chol scale [mkl/chol] = " << t_mkl_seq/t_factor_seq << endl; cout << "CholPerformance:: mkl/task scale [mkl/task] = " << t_mkl_seq/t_factor_task << endl; #else cout << "CholPerformance:: task scale [seq/task] = " << t_factor_seq/t_factor_task << endl; #endif //cout << "CholPerformance:: team scale [seq/team] = " << t_factor_seq/t_factor_team << endl; } return r_val; }
void graph_color_symbolic( KernelHandle *handle, typename KernelHandle::row_lno_t num_rows, typename KernelHandle::row_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_symmetric = true){ Kokkos::Impl::Timer timer; typename KernelHandle::GraphColoringHandleType *gch = handle->get_graph_coloring_handle(); ColoringAlgorithm algorithm = gch->get_coloring_type(); typedef typename KernelHandle::GraphColoringHandleType::color_view_t color_view_type; color_view_type colors_out = color_view_type("Graph Colors", num_rows); typedef typename Impl::GraphColor <typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, lno_nnz_view_t_> BaseGraphColoring; BaseGraphColoring *gc = NULL; switch (algorithm){ case COLORING_SERIAL: gc = new BaseGraphColoring( num_rows, entries.dimension_0(), row_map, entries, gch); break; case COLORING_VB: case COLORING_VBBIT: case COLORING_VBCS: typedef typename Impl::GraphColor_VB <typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, lno_nnz_view_t_> VBGraphColoring; gc = new VBGraphColoring( num_rows, entries.dimension_0(), row_map, entries, gch); break; case COLORING_EB: typedef typename Impl::GraphColor_EB <typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, lno_nnz_view_t_> EBGraphColoring; gc = new EBGraphColoring(num_rows, entries.dimension_0(),row_map, entries, gch); break; case COLORING_DEFAULT: break; } int num_phases = 0; gc->color_graph(colors_out, num_phases); delete gc; double coloring_time = timer.seconds(); gch->add_to_overall_coloring_time(coloring_time); gch->set_coloring_time(coloring_time); gch->set_num_phases(num_phases); gch->set_vertex_colors(colors_out); }
KOKKOS_INLINE_FUNCTION int exampleCholDirectPerformance(const string file_input, const int treecut, const int minblksize, const int prunecut, const int seed, const int niter, const int nthreads, const int max_task_dependence, const int team_size, const int league_size, const bool team_interface, const bool skip_serial, const bool mkl_interface, const bool verbose) { typedef ValueType value_type; typedef OrdinalType ordinal_type; typedef SizeType size_type; typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>, Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType; typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType; typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType; typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType; #ifdef HAVE_SHYLUTACHO_MKL typedef typename CrsMatrixBaseType::value_type_array value_type_array; #endif typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType; typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType; typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits> CrsHierMatrixBaseType; typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType; typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType; int r_val = 0; Kokkos::Impl::Timer timer; double t_import = 0.0, t_reorder = 0.0, t_symbolic = 0.0, t_flat2hier = 0.0, #ifdef HAVE_SHYLUTACHO_MKL t_mkl = 0.0, #endif t_factor_seq = 0.0, t_solve_seq = 0.0, t_factor_task = 0.0, t_solve_task = 0.0; const int start = 0; cout << "CholDirectPerformance:: import input file = " << file_input << endl; CrsMatrixBaseType AA("AA"); { timer.reset(); ifstream in; in.open(file_input); if (!in.good()) { cout << "Failed in open the file: " << file_input << endl; return ++r_val; } AA.importMatrixMarket(in); t_import = timer.seconds(); } cout << "CholDirectPerformance:: import input file::time = " << t_import << endl; cout << 
"CholDirectPerformance:: reorder the matrix" << endl; CrsMatrixBaseType PA("Permuted AA"); CrsMatrixBaseType UU("UU"); // permuted base upper triangular matrix CrsHierMatrixBaseType HU("HU"); // hierarchical matrix of views DenseMatrixBaseType BB("BB", AA.NumRows(), nrhs); DenseHierMatrixBaseType HB("HB"); { GraphHelperType S(AA, seed); { timer.reset(); S.computeOrdering(treecut, minblksize); S.pruneTree(prunecut); PA.copy(S.PermVector(), S.InvPermVector(), AA); t_reorder = timer.seconds(); } cout << "CholDirectPerformance:: reorder the matrix::time = " << t_reorder << endl; { SymbolicFactorHelperType F(PA, league_size); for (int i=start;i<niter;++i) { timer.reset(); F.createNonZeroPattern(Uplo::Upper, UU); t_symbolic += timer.seconds() * (i>=0); } t_symbolic /= niter; cout << "CholDirectPerformance:: AA (nnz) = " << AA.NumNonZeros() << ", UU (nnz) = " << UU.NumNonZeros() << endl; } cout << "CholDirectPerformance:: symbolic factorization::time = " << t_symbolic << endl; { timer.reset(); CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU, S.NumBlocks(), S.RangeVector(), S.TreeVector()); for (ordinal_type k=0;k<HU.NumNonZeros();++k) HU.Value(k).fillRowViewArray(); DenseMatrixHelper::flat2hier(BB, HB, S.NumBlocks(), S.RangeVector(), nb); t_flat2hier = timer.seconds(); cout << "CholDirectPerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl; } cout << "CholDirectPerformance:: construct hierarchical matrix::time = " << t_flat2hier << endl; } // copy of UU CrsMatrixBaseType RR("RR"); RR.copy(UU); /////////////////////////// Serial Numeric Factorization if (!skip_serial) { #ifdef __USE_FIXED_TEAM_SIZE__ typename TaskFactoryType::policy_type policy(max_task_dependence); #else typename TaskFactoryType::policy_type policy(max_task_dependence, 1); #endif TaskFactoryType::setUseTeamInterface(team_interface); TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); CrsTaskViewType U(&UU); U.fillRowViewArray(); 
cout << "CholDirectPerformance:: Serial factorize the matrix" << endl; { for (int i=start;i<niter;++i) { UU.copy(RR); timer.reset(); { Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> ::invoke(TaskFactoryType::Policy(), TaskFactoryType::Policy().member_single(), U); } t_factor_seq += timer.seconds() * (i>=0); } t_factor_seq /= niter; } cout << "CholDirectPerformance:: Serial factorize the matrix::time = " << t_factor_seq << endl; cout << "CholDirectPerformance:: Serial forward/backward solve" << endl; { for (int i=start;i<niter;++i) { XX.copy(BB); timer.reset(); { TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked> ::invoke(TaskFactoryType::Policy(), TaskFactoryType::Policy().member_single(), Diag::NonUnit, U, X); TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked> ::invoke(TaskFactoryType::Policy(), TaskFactoryType::Policy().member_single(), Diag::NonUnit, U, X); } t_factor_seq += timer.seconds() * (i>=0); } t_factor_seq /= niter; } cout << "CholDirectPerformance:: Serial forward/backward solve::time = " << t_solve_seq << endl; } // if (!skip_serial) // cout << "CholDirectPerformance:: task scale [seq/task] = " << t_factor_seq/t_factor_task << endl; return r_val; }
void VerletKokkos::run(int n) { bigint ntimestep; int nflag,sortflag; int n_post_integrate = modify->n_post_integrate; int n_pre_exchange = modify->n_pre_exchange; int n_pre_neighbor = modify->n_pre_neighbor; int n_pre_force = modify->n_pre_force; int n_post_force = modify->n_post_force; int n_end_of_step = modify->n_end_of_step; if (atomKK->sortfreq > 0) sortflag = 1; else sortflag = 0; static double time = 0.0; static int count = 0; atomKK->sync(Device,ALL_MASK); Kokkos::Impl::Timer ktimer; for (int i = 0; i < n; i++) { ntimestep = ++update->ntimestep; ev_set(ntimestep); // initial time integration ktimer.reset(); timer->stamp(); modify->initial_integrate(vflag); time += ktimer.seconds(); if (n_post_integrate) modify->post_integrate(); timer->stamp(Timer::MODIFY); // regular communication vs neighbor list rebuild nflag = neighbor->decide(); if (nflag == 0) { timer->stamp(); comm->forward_comm(); timer->stamp(Timer::COMM); } else { // added debug //atomKK->sync(Host,ALL_MASK); //atomKK->modified(Host,ALL_MASK); if (n_pre_exchange) { timer->stamp(); modify->pre_exchange(); timer->stamp(Timer::MODIFY); } // debug //atomKK->sync(Host,ALL_MASK); //atomKK->modified(Host,ALL_MASK); if (triclinic) domain->x2lamda(atomKK->nlocal); domain->pbc(); if (domain->box_change) { domain->reset_box(); comm->setup(); if (neighbor->style) neighbor->setup_bins(); } timer->stamp(); // added debug //atomKK->sync(Device,ALL_MASK); //atomKK->modified(Device,ALL_MASK); comm->exchange(); if (sortflag && ntimestep >= atomKK->nextsort) atomKK->sort(); comm->borders(); // added debug //atomKK->sync(Host,ALL_MASK); //atomKK->modified(Host,ALL_MASK); if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost); timer->stamp(Timer::COMM); if (n_pre_neighbor) { modify->pre_neighbor(); timer->stamp(Timer::MODIFY); } neighbor->build(); timer->stamp(Timer::NEIGH); } // force computations // important for pair to come before bonded contributions // since some bonded potentials tally pairwise 
energy/virial // and Pair:ev_tally() needs to be called before any tallying force_clear(); timer->stamp(); // added for debug //atomKK->k_x.sync<LMPHostType>(); //atomKK->k_f.sync<LMPHostType>(); //atomKK->k_f.modify<LMPHostType>(); if (n_pre_force) { modify->pre_force(vflag); timer->stamp(Timer::MODIFY); } if (pair_compute_flag) { atomKK->sync(force->pair->execution_space,force->pair->datamask_read); atomKK->modified(force->pair->execution_space,force->pair->datamask_modify); force->pair->compute(eflag,vflag); timer->stamp(Timer::PAIR); } if (atomKK->molecular) { if (force->bond) { atomKK->sync(force->bond->execution_space,force->bond->datamask_read); atomKK->modified(force->bond->execution_space,force->bond->datamask_modify); force->bond->compute(eflag,vflag); } if (force->angle) { atomKK->sync(force->angle->execution_space,force->angle->datamask_read); atomKK->modified(force->angle->execution_space,force->angle->datamask_modify); force->angle->compute(eflag,vflag); } if (force->dihedral) { atomKK->sync(force->dihedral->execution_space,force->dihedral->datamask_read); atomKK->modified(force->dihedral->execution_space,force->dihedral->datamask_modify); force->dihedral->compute(eflag,vflag); } if (force->improper) { atomKK->sync(force->improper->execution_space,force->improper->datamask_read); atomKK->modified(force->improper->execution_space,force->improper->datamask_modify); force->improper->compute(eflag,vflag); } timer->stamp(Timer::BOND); } if (kspace_compute_flag) { atomKK->sync(force->kspace->execution_space,force->kspace->datamask_read); atomKK->modified(force->kspace->execution_space,force->kspace->datamask_modify); force->kspace->compute(eflag,vflag); timer->stamp(Timer::KSPACE); } // reverse communication of forces if (force->newton) comm->reverse_comm(); timer->stamp(Timer::COMM); // force modifications, final time integration, diagnostics ktimer.reset(); if (n_post_force) modify->post_force(vflag); modify->final_integrate(); if (n_end_of_step) 
modify->end_of_step(); timer->stamp(Timer::MODIFY); time += ktimer.seconds(); // all output if (ntimestep == output->next) { atomKK->sync(Host,ALL_MASK); timer->stamp(); output->write(ntimestep); timer->stamp(Timer::OUTPUT); } } }
/// Benchmark of the sparse triangular solve (forward + backward) on a matrix
/// read from a MatrixMarket file.
///
/// The matrix is imported, reordered with the graph helper, its upper part is
/// copied into UU and partitioned into a hierarchical (block) matrix. The
/// solve is then timed twice: once with the Unblocked algorithm invoked
/// directly (skipped when skip_serial is set) and once with the ByBlocks
/// algorithm spawned as two dependent tasks. Iterations with a negative index
/// (the loop starts at -2) are executed but excluded from the accumulated
/// time; the accumulated time is divided by niter.
///
/// Returns 0 on success and 1 when the input file cannot be opened.
KOKKOS_INLINE_FUNCTION
int exampleTriSolvePerformance(const string file_input,
                               const OrdinalType nrhs,
                               const OrdinalType nb,
                               const int niter,
                               const int nthreads,
                               const int max_task_dependence,
                               const int team_size,
                               const bool team_interface,
                               const bool skip_serial,
                               const bool verbose) {
  using value_type   = ValueType;
  using ordinal_type = OrdinalType;
  using size_type    = SizeType;

  using TaskFactoryType = TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
                                      Kokkos::Experimental::Future<int,SpaceType> >;

  // flat sparse matrix and its task view
  using CrsMatrixBaseType = CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits>;
  using GraphHelperType   = GraphHelper_Scotch<CrsMatrixBaseType>;
  using CrsMatrixViewType = CrsMatrixView<CrsMatrixBaseType>;
  using CrsTaskViewType   = TaskView<CrsMatrixViewType,TaskFactoryType>;

  // hierarchical sparse matrix (matrix of task views)
  using CrsHierMatrixBaseType = CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits>;
  using CrsHierMatrixViewType = CrsMatrixView<CrsHierMatrixBaseType>;
  using CrsHierTaskViewType   = TaskView<CrsHierMatrixViewType,TaskFactoryType>;

  // flat dense matrix and its task view
  using DenseMatrixBaseType = DenseMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits>;
  using DenseMatrixViewType = DenseMatrixView<DenseMatrixBaseType>;
  using DenseTaskViewType   = TaskView<DenseMatrixViewType,TaskFactoryType>;

  // hierarchical dense matrix
  using DenseHierMatrixBaseType = DenseMatrixBase<DenseTaskViewType,ordinal_type,size_type,SpaceType,MemoryTraits>;
  using DenseHierMatrixViewType = DenseMatrixView<DenseHierMatrixBaseType>;
  using DenseHierTaskViewType   = TaskView<DenseHierMatrixViewType,TaskFactoryType>;

  // solver flavours used below
  using ForwardUnblocked  = TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>;
  using BackwardUnblocked = TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>;
  using ForwardByBlocks   = TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::ByBlocks>;
  using BackwardByBlocks  = TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::ByBlocks>;

  int status = 0;

  Kokkos::Impl::Timer timer;
  double t_import = 0.0;
  double t_reorder = 0.0;
  double t_solve_seq = 0.0;
  double t_solve_task = 0.0;

  // first timed iteration is 0; negative iterations are warm-up
  const int first_iter = -2;

  cout << "TriSolvePerformance:: import input file = " << file_input << endl;
  CrsMatrixBaseType AA("AA");
  {
    timer.reset();

    ifstream input;
    input.open(file_input);
    if (!input.good()) {
      cout << "Failed in open the file: " << file_input << endl;
      return ++status;
    }
    AA.importMatrixMarket(input);

    t_import = timer.seconds();

    if (verbose)
      cout << AA << endl;
  }
  cout << "TriSolvePerformance:: import input file::time = " << t_import << endl;

  CrsMatrixBaseType UU("UU");
  DenseMatrixBaseType BB("BB", AA.NumRows(), nrhs);

  cout << "TriSolvePerformance:: reorder the matrix and partition right hand side, nb = " << nb << endl;
  CrsHierMatrixBaseType HU("HU");
  DenseHierMatrixBaseType HB("HB");
  {
    timer.reset();

    GraphHelperType S(AA);
    S.computeOrdering();

    CrsMatrixBaseType PA("Permuted AA");
    PA.copy(S.PermVector(), S.InvPermVector(), AA);

    UU.copy(Uplo::Upper, PA);

    CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                               S.NumBlocks(),
                               S.RangeVector(),
                               S.TreeVector());

    DenseMatrixHelper::flat2hier(BB, HB,
                                 S.NumBlocks(),
                                 S.RangeVector(),
                                 nb);

    t_reorder = timer.seconds();

    cout << "TriSolvePerformance:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;

    if (verbose)
      cout << UU << endl;
  }
  cout << "TriSolvePerformance:: reorder the matrix and partition right hand side::time = " << t_reorder << endl;

  const size_t max_concurrency = 16384;
  cout << "TriSolvePerformance:: max concurrency = " << max_concurrency << endl;

  const size_t max_task_size = 3*sizeof(CrsTaskViewType)+128;
  cout << "TriSolvePerformance:: max task size = " << max_task_size << endl;

  // ---- reference run: Unblocked algorithm invoked directly, team size 1 ----
  if (!skip_serial) {
    __INIT_DENSE_MATRIX__(BB, 1.0);

    typename TaskFactoryType::policy_type policy(max_concurrency, max_task_size, max_task_dependence, 1);

    TaskFactoryType::setUseTeamInterface(team_interface);
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    CrsTaskViewType U(&UU);
    DenseTaskViewType B(&BB);
    U.fillRowViewArray();

    cout << "TriSolvePerformance:: Serial forward and backward solve of the matrix" << endl;
    {
      for (int iter = first_iter; iter < niter; ++iter) {
        timer.reset();

        // The task-based form of these two solves (create_team + spawn +
        // wait on the policy) is disabled; the solvers are invoked directly.
        {
          ForwardUnblocked::invoke(TaskFactoryType::Policy(),
                                   TaskFactoryType::Policy().member_single(),
                                   Diag::NonUnit, U, B);
        }
        {
          BackwardUnblocked::invoke(TaskFactoryType::Policy(),
                                    TaskFactoryType::Policy().member_single(),
                                    Diag::NonUnit, U, B);
        }

        const double elapsed = timer.seconds();
        if (iter >= 0)
          t_solve_seq += elapsed;
      }
      t_solve_seq /= niter;

      if (verbose)
        cout << BB << endl;
    }
    cout << "TriSolvePerformance:: Serial forward and backward solve of the matrix::time = " << t_solve_seq << endl;
  }

  // ---- task run: ByBlocks algorithm on the hierarchical matrices ----
  {
    __INIT_DENSE_MATRIX__(BB, 1.0);

    typename TaskFactoryType::policy_type policy(max_concurrency, max_task_size, max_task_dependence, team_size);

    TaskFactoryType::setUseTeamInterface(team_interface);
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    // wrap the hierarchically partitioned matrix with task handler
    CrsHierTaskViewType TU(&HU);
    for (ordinal_type blk = 0; blk < HU.NumNonZeros(); ++blk)
      HU.Value(blk).fillRowViewArray();

    DenseHierTaskViewType TB(&HB);

    cout << "TriSolvePerformance:: ByBlocks forward and backward solve of the matrix" << endl;
    {
      for (int iter = first_iter; iter < niter; ++iter) {
        timer.reset();
        {
          auto future_forward_solve = TaskFactoryType::Policy().create_team
            (ForwardByBlocks
             ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
             (Diag::NonUnit, TU, TB), 0);
          TaskFactoryType::Policy().spawn(future_forward_solve);

          // the backward solve is made dependent on the forward solve
          auto future_backward_solve = TaskFactoryType::Policy().create_team
            (BackwardByBlocks
             ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
             (Diag::NonUnit, TU, TB), 1);
          TaskFactoryType::Policy().add_dependence(future_backward_solve, future_forward_solve);
          TaskFactoryType::Policy().spawn(future_backward_solve);

          Kokkos::Experimental::wait(TaskFactoryType::Policy());
        }

        const double elapsed = timer.seconds();
        if (iter >= 0)
          t_solve_task += elapsed;
      }
      t_solve_task /= niter;

      if (verbose)
        cout << BB << endl;
    }
    cout << "TriSolvePerformance:: ByBlocks forward and backward solve of the matrix::time = " << t_solve_task << endl;
  }

  if (!skip_serial) {
    cout << "TriSolvePerformance:: task scale [seq/task] = " << t_solve_seq/t_solve_task << endl;
  }

  return status;
}
KOKKOS_INLINE_FUNCTION int exampleCholUnblocked(const string file_input, const int max_task_dependence, const int team_size, const int algo, const int variant, const bool verbose) { typedef ValueType value_type; typedef OrdinalType ordinal_type; typedef SizeType size_type; typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryTraits> CrsMatrixBaseType; typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType; typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>, Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType; typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; cout << "CholUnblocked:: import input file = " << file_input << endl; CrsMatrixBaseType AA("AA"), UU("UU"); { timer.reset(); ifstream in; in.open(file_input); if (!in.good()) { cout << "Failed in open the file: " << file_input << endl; return ++r_val; } AA.importMatrixMarket(in); UU.copy(Uplo::Upper, AA); t = timer.seconds(); if (verbose) cout << UU << endl; } cout << "CholUnblocked:: import input file::time = " << t << endl; #ifdef __USE_FIXED_TEAM_SIZE__ typename TaskFactoryType::policy_type policy(max_task_dependence); #else typename TaskFactoryType::policy_type policy(max_task_dependence, team_size); #endif TaskFactoryType::setMaxTaskDependence(max_task_dependence); TaskFactoryType::setPolicy(&policy); cout << "CholUnblocked:: factorize the matrix" << endl; CrsTaskViewType U(&UU); U.fillRowViewArray(); { timer.reset(); typename TaskFactoryType::future_type future; switch (algo) { case AlgoChol::UnblockedOpt: { if (variant == Variant::One) future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One> ::TaskFunctor<CrsTaskViewType>(U), 0); else if (variant == Variant::Two) future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::Two> ::TaskFunctor<CrsTaskViewType>(U), 0); else { ERROR(">> Not supported algorithm 
variant"); } break; } case AlgoChol::Dummy: { future = TaskFactoryType::Policy().create_team(Chol<Uplo::Upper,AlgoChol::Dummy> ::TaskFunctor<CrsTaskViewType>(U), 0); break; } default: ERROR(">> Not supported algorithm"); break; } TaskFactoryType::Policy().spawn(future); Kokkos::Experimental::wait(TaskFactoryType::Policy()); t = timer.seconds(); if (verbose) cout << UU << endl; } cout << "CholUnblocked:: factorize the matrix::time = " << t << endl; return r_val; }
// Numeric factorization driver (incomplete-level variant).
//
// Steps, in order:
//   1. Record gn/gm from A; when Options.btf is set, replace A by BTF_A.
//   2. Domain phase (only if btf_tabs_offset != 0): launch the domain kernel
//      on num_threads teams of size 1, then loop: query
//      nfactor_domain_error(); on anything other than BASKER_SUCCESS launch
//      the remalloc kernel and check again.
//   3. Separator phase (only if btf_tabs_offset != 0): for each level
//      l = 1..tree.nlvls launch the separator kernel with 2^l threads per
//      team and num_threads / 2^l teams (kernel only built with
//      BASKER_NO_LAMBDA).
//   4. BTF phase: the diagonal factorization is currently disabled (kept as
//      commented-out code); only its timer remains.
//
// `option` is not read by this function. Always returns 0.
//
// Preprocessor structure: every brace is opened and closed inside the same
// conditional branch so the function is well-formed with or without
// BASKER_KOKKOS, and the timing printfs only reference timers that exist in
// the active configuration.
BASKER_INLINE
int Basker<Int, Entry, Exe_Space>::factor_inc_lvl(Int option)
{
  printf("Factor Inc Level Called \n");

  gn = A.ncol;
  gm = A.nrow;

  if(Options.btf == BASKER_TRUE)
    {
      //JDB: We can change this for the new interface
      //call reference copy constructor
      // NOTE(review): gn/gm are re-read from A *before* A is replaced by
      // BTF_A, so they keep the values assigned above - confirm that this is
      // intended rather than taking the dimensions of BTF_A.
      gn = A.ncol;
      gm = A.nrow;
      A = BTF_A;
      //printf("\n\n Switching A, newsize: %d \n",
      //   A.ncol);
      //printMTX("A_FACTOR.mtx", A);
    }

  //Split into Domain and Sep
  //----------------------Domain-------------------------//
#ifdef BASKER_KOKKOS

  //====TIMER==
  #ifdef BASKER_TIME
  Kokkos::Impl::Timer timer;
  #endif
  //===TIMER===

  typedef Kokkos::TeamPolicy<Exe_Space> TeamPolicy;

  if(btf_tabs_offset != 0)
    {
      kokkos_nfactor_domain_inc_lvl <Int,Entry,Exe_Space>
        domain_nfactor(this);
      Kokkos::parallel_for(TeamPolicy(num_threads,1),
                           domain_nfactor);
      Kokkos::fence();

      //=====Check for error======
      // Repeat until nfactor_domain_error() reports BASKER_SUCCESS; after
      // each failure the remalloc kernel is run with the thread_start array
      // filled in by the error check.
      while(true)
        {
          INT_1DARRAY thread_start;
          MALLOC_INT_1DARRAY(thread_start, num_threads+1);
          init_value(thread_start, num_threads+1,
                     (Int) BASKER_MAX_IDX);
          int nt = nfactor_domain_error(thread_start);
          if(nt == BASKER_SUCCESS)
            {
              break;
            }
          else
            {
              printf("restart \n");
              kokkos_nfactor_domain_remalloc <Int, Entry, Exe_Space>
                diag_nfactor_remalloc(this, thread_start);
              Kokkos::parallel_for(TeamPolicy(num_threads,1),
                                   diag_nfactor_remalloc);
              Kokkos::fence();
            }
        }//end while

      //====TIMER===
      #ifdef BASKER_TIME
      printf("Time DOMAIN: %f \n", timer.seconds());
      timer.reset();
      #endif
      //====TIMER====
    }

#else// else basker_kokkos
  #pragma omp parallel
  {
  }//end omp parallel
#endif //end basker_kokkos
  //-------------------End--Domain--------------------------//

  //---------------------------Sep--------------------------//
  if(btf_tabs_offset != 0)
    {
      //for(Int l=1; l<=1; l++)
      for(Int l=1; l <= tree.nlvls; l++)
        {
          //Come back for syncs

          //#ifdef BASKER_OLD_BARRIER
          // 2^l computed with an integer shift instead of floating-point pow()
          Int lthreads = ((Int) 1) << l;
          Int lnteams  = num_threads/lthreads;
          //#else
          //Int lthreads = 1;
          //Int lnteams = num_threads/lthreads;
          //#endif

          //printf("\n\n   ============ SEP: %d ======\n\n",l);

#ifdef BASKER_KOKKOS
          #ifdef BASKER_TIME
          Kokkos::Impl::Timer timer_inner_sep;
          #endif

          #ifdef BASKER_NO_LAMBDA
          kokkos_nfactor_sep2_inc_lvl <Int, Entry, Exe_Space>
            sep_nfactor(this,l);
          Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),
                               sep_nfactor);
          Kokkos::fence();

          #ifdef BASKER_TIME
          // l is an Int (not necessarily int): cast to match the specifier
          printf("Time INNERSEP: %ld %f \n",
                 (long) l, timer_inner_sep.seconds());
          #endif
          #else //ELSE BASKER_NO_LAMBDA
          //Note: to be added
          #endif //end BASKER_NO_LAMBDA
#else
          #pragma omp parallel
          {
          }//end omp parallel
#endif
        }//end over each level

      // `timer` is only declared in the Kokkos build
#if defined(BASKER_KOKKOS) && defined(BASKER_TIME)
      printf("Time SEP: %f \n", timer.seconds());
#endif
    }
  //-------------------------End Sep----------------//

  //-------------------IF BTF-----------------------//
  if(Options.btf == BASKER_TRUE)
    {
      //=====Timer
      #ifdef BASKER_TIME
      Kokkos::Impl::Timer timer_btf;
      #endif
      //====Timer

      // The diagonal (BTF) factorization and its error/restart loop are
      // disabled. The original calls are kept here for reference:
      //
      //======Call diag factor====
      //   kokkos_nfactor_diag <Int, Entry, Exe_Space>
      //     diag_nfactor(this);
      //   Kokkos::parallel_for(TeamPolicy(num_threads,1),
      //                        diag_nfactor);
      //   Kokkos::fence();
      //
      //=====Check for error======
      //   while(true)
      //     {
      //       INT_1DARRAY thread_start;
      //       MALLOC_INT_1DARRAY(thread_start, num_threads+1);
      //       init_value(thread_start, num_threads+1,
      //                  (Int) BASKER_MAX_IDX);
      //       int nt = nfactor_diag_error(thread_start);
      //       if(nt == BASKER_SUCCESS)
      //         {
      //           break;
      //         }
      //       else
      //         {
      //           break;
      //           printf("restart \n");
      //           kokkos_nfactor_diag_remalloc <Int, Entry, Exe_Space>
      //             diag_nfactor_remalloc(this, thread_start);
      //           Kokkos::parallel_for(TeamPolicy(num_threads,1),
      //                                diag_nfactor);
      //           Kokkos::fence();
      //         }
      //     }//end while

      //====TIMER
      #ifdef BASKER_TIME
      printf("Time BTF: %f \n", timer_btf.seconds());
      #endif
      //===TIMER
    }//end btf call

  return 0;
}//end factor_lvl_inc()
int main (int argc, char ** argv){ if (argc < 2){ std::cerr << "Usage:" << argv[0] << " input_bin_file" << std::endl; exit(1); } Kokkos::initialize(argc, argv); MyExecSpace::print_configuration(std::cout); idx nv = 0, ne = 0; idx *xadj, *adj; wt *ew; KokkosKernels::Experimental::Graph::Utils::read_graph_bin<idx, wt> (&nv, &ne, &xadj, &adj, &ew, argv[1]); std::cout << "nv:" << nv << " ne:" << ne << std::endl; um_array_type _xadj (xadj, nv + 1); um_edge_array_type _adj (adj, ne); idx_array_type kok_xadj ("xadj", nv + 1); idx_edge_array_type kok_adj("adj", ne); idx_array_type sym_xadj; idx_edge_array_type sym_adj; Kokkos::deep_copy (kok_xadj, _xadj); Kokkos::deep_copy (kok_adj, _adj); wt_um_edge_array_type _mtx_vals (ew, ne); value_array_type kok_mtx_vals ("MTX_VALS", ne); Kokkos::deep_copy (kok_mtx_vals, _mtx_vals); delete [] xadj; delete [] adj; delete [] ew; std::cout << "Symetrizing Graph" << std::endl; Kokkos::Impl::Timer timer; KokkosKernels::Experimental::Util::symmetrize_graph_symbolic_hashmap< idx_array_type, idx_edge_array_type, idx_array_type, idx_edge_array_type, MyExecSpace> (nv, kok_xadj, kok_adj,sym_xadj, sym_adj); Kokkos::fence(); double t = timer.seconds(); std::cout << "Time to symmetrize:" << t << std::endl; KokkosKernels::Experimental::Util::print_1Dview(kok_xadj); KokkosKernels::Experimental::Util::print_1Dview(kok_adj); std::cout << "Symetric Graph" << std::endl; KokkosKernels::Experimental::Util::print_1Dview(sym_xadj); KokkosKernels::Experimental::Util::print_1Dview(sym_adj); Kokkos::finalize(); return 0; }
int exampleCholByBlocks(const std::string file_input, const int treecut, const int prunecut, const int fill_level, const int rows_per_team, const int max_concurrency, const int max_task_dependence, const int team_size, const bool check, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); // for simple test, let's use host space only here, for device it needs mirroring. typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> CrsMatrixBaseHostType; typedef CrsMatrixView<CrsMatrixBaseHostType> CrsMatrixViewHostType; typedef GraphTools<ordinal_type,size_type,HostSpaceType> GraphToolsHostType; typedef GraphTools_Scotch<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_Scotch; typedef GraphTools_CAMD<ordinal_type,size_type,HostSpaceType> GraphToolsHostType_CAMD; typedef IncompleteSymbolicFactorization<CrsMatrixBaseHostType> IncompleteSymbolicFactorizationType; typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef TaskView<CrsMatrixViewHostType> CrsTaskViewHostType; typedef CrsMatrixBase<CrsTaskViewHostType,ordinal_type,size_type,HostSpaceType> CrsHierBaseHostType; typedef CrsMatrixView<CrsHierBaseHostType> CrsHierViewHostType; typedef TaskView<CrsHierViewHostType> CrsTaskHierViewHostType; int r_val = 0; Kokkos::Impl::Timer timer; /// /// Read from matrix market /// /// input - file /// output - AA_host /// CrsMatrixBaseHostType AA_host("AA_host"); timer.reset(); { std::ifstream in; in.open(file_input); if (!in.good()) { std::cout << "Failed in open the file: " << file_input << std::endl; return -1; } MatrixMarket::read(AA_host, in); } double t_read = timer.seconds(); if (verbose) AA_host.showMe(std::cout) << std::endl; /// /// Create a graph structure 
for Scotch and CAMD (rptr, cidx) /// /// rptr and cidx are need to be set up for Scotch and CAMD /// typename GraphToolsHostType::size_type_array rptr("Graph::RowPtrArray", AA_host.NumRows() + 1); typename GraphToolsHostType::ordinal_type_array cidx("Graph::ColIndexArray", AA_host.NumNonZeros()); /// /// Run Scotch /// /// input - rptr, cidx, A_host /// output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted) /// timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, AA_host); double t_graph = timer.seconds(); GraphToolsHostType_Scotch S; S.setGraph(AA_host.NumRows(), rptr, cidx); S.setSeed(0); S.setTreeLevel(); S.setStrategy( SCOTCH_STRATSPEED | SCOTCH_STRATLEVELMAX | SCOTCH_STRATLEVELMIN | SCOTCH_STRATLEAFSIMPLE | SCOTCH_STRATSEPASIMPLE ); timer.reset(); S.computeOrdering(treecut); double t_scotch = timer.seconds(); if (verbose) S.showMe(std::cout) << std::endl; CrsMatrixBaseHostType AA_scotch_host("AA_scotch_host"); AA_scotch_host.createConfTo(AA_host); CrsMatrixTools::copy(AA_scotch_host, S.PermVector(), S.InvPermVector(), AA_host); if (verbose) AA_scotch_host.showMe(std::cout) << std::endl; /// /// Run CAMD /// /// input - rptr, cidx, A_host /// output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted) /// timer.reset(); GraphToolsHostType::getGraph(rptr, cidx, AA_scotch_host); t_graph += timer.seconds(); GraphToolsHostType_CAMD C; C.setGraph(AA_scotch_host.NumRows(), rptr, cidx, S.NumBlocks(), S.RangeVector()); timer.reset(); C.computeOrdering(); double t_camd = timer.seconds(); if (verbose) C.showMe(std::cout) << std::endl; CrsMatrixBaseHostType AA_camd_host("AA_camd_host"); AA_camd_host.createConfTo(AA_scotch_host); CrsMatrixTools::copy(AA_camd_host, C.PermVector(), C.InvPermVector(), AA_scotch_host); if (verbose) AA_camd_host.showMe(std::cout) << std::endl; /// /// Symbolic factorization /// /// input - /// output - S (perm, iperm, nblks, range, tree), AA_scotch_host (permuted) /// CrsMatrixBaseHostType 
AA_factor_host("AA_factor_host"); timer.reset(); IncompleteSymbolicFactorizationType::createNonZeroPattern(AA_factor_host, fill_level, Uplo::Upper, AA_camd_host, rows_per_team); double t_symbolic = timer.seconds(); if (verbose) AA_factor_host.showMe(std::cout) << std::endl; /// /// Clean tempoerary matrices /// /// input - AA_scotch_host, AA_camd_host, C, rptr, cidx /// output - none /// AA_scotch_host = CrsMatrixBaseHostType(); AA_camd_host = CrsMatrixBaseHostType(); C = GraphToolsHostType_CAMD(); rptr = typename GraphToolsHostType::size_type_array(); cidx = typename GraphToolsHostType::ordinal_type_array(); /// /// Create task policy /// /// input - max_task_size /// output - policy /// const size_type max_task_size = (3*sizeof(CrsTaskViewHostType)+sizeof(PolicyType)+128); timer.reset(); PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); double t_policy = timer.seconds(); /// /// Sequential execution /// /// input - AA_factor_host (matrix to be compared), rowviews /// output - BB_factor_host, B_factor_host /// double t_chol_serial = 0; CrsMatrixBaseHostType BB_factor_host("BB_factor_host"); if (check) { BB_factor_host.createConfTo(AA_factor_host); CrsMatrixTools::copy(BB_factor_host, AA_factor_host); CrsTaskViewHostType B_factor_host(BB_factor_host); Kokkos::View<typename CrsTaskViewHostType::row_view_type*,HostSpaceType> rowviews("RowViewInMatView", B_factor_host.NumRows()); B_factor_host.setRowViewArray(rowviews); timer.reset(); { auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::Unblocked,Variant::One> ::createTaskFunctor(policy, B_factor_host)); policy.spawn(future); Kokkos::Experimental::wait(policy); TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)"); } t_chol_serial = timer.seconds(); if (verbose) BB_factor_host.showMe(std::cout) << std::endl; } /// /// Task parallel execution /// /// input - AA_factor_host, rowviews /// output - HA_factor_host, AA_factor_host, B_factor_host /// double 
t_hier = 0, t_blocks = 0, t_chol_parallel = 0; CrsHierBaseHostType HA_factor_host("HA_factor_host"); { timer.reset(); S.pruneTree(prunecut); CrsMatrixTools::createHierMatrix(HA_factor_host, AA_factor_host, S.NumBlocks(), S.RangeVector(), S.TreeVector()); t_hier = timer.seconds(); timer.reset(); size_type nblocks = HA_factor_host.NumNonZeros(); Kokkos::View<ordinal_type*,HostSpaceType> ap_rowview_blocks("NumRowViewInBlocks", nblocks + 1); ap_rowview_blocks(0) = 0; for (ordinal_type k=0;k<nblocks;++k) ap_rowview_blocks(k+1) = ap_rowview_blocks(k) + HA_factor_host.Value(k).NumRows(); Kokkos::View<typename CrsMatrixViewHostType::row_view_type*,HostSpaceType> rowview_blocks("RowViewInBlocks", ap_rowview_blocks(nblocks)); Kokkos::parallel_for(Kokkos::RangePolicy<HostSpaceType>(0, nblocks), [&](const ordinal_type k) { const ordinal_type begin = ap_rowview_blocks(k); const ordinal_type end = ap_rowview_blocks(k+1); HA_factor_host.Value(k).setRowViewArray (Kokkos::subview(rowview_blocks, Kokkos::pair<ordinal_type,ordinal_type>(begin, end))); } ); CrsMatrixTools::filterEmptyBlocks(HA_factor_host); t_blocks = timer.seconds(); { size_type nblocks_filtered = HA_factor_host.NumNonZeros(), nnz_blocks = 0; for (size_type k=0;k<nblocks_filtered; ++k) nnz_blocks += HA_factor_host.Value(k).NumNonZeros(); TACHO_TEST_FOR_ABORT( nnz_blocks != AA_factor_host.NumNonZeros(), "nnz counted in blocks is different from nnz in the base matrix."); } CrsTaskHierViewHostType H_factor_host(HA_factor_host); timer.reset(); { auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks,Variant::One> ::createTaskFunctor(policy, H_factor_host)); policy.spawn(future); Kokkos::Experimental::wait(policy); TACHO_TEST_FOR_ABORT( future.get(), "Fail to perform Cholesky (serial)"); } t_chol_parallel = timer.seconds(); if (verbose) AA_factor_host.showMe(std::cout) << std::endl; } if (check) { double diff = 0, norm = 0; TACHO_TEST_FOR_ABORT( BB_factor_host.NumNonZeros() != 
AA_factor_host.NumNonZeros(), "nnz used in serial is not same as nnz used in parallel"); const size_type nnz = AA_factor_host.NumNonZeros(); for (size_type k=0;k<nnz;++k) { norm += Util::abs(BB_factor_host.Value(k)); diff += Util::abs(AA_factor_host.Value(k) - BB_factor_host.Value(k)); } std::cout << std::scientific; std::cout << "CholByBlocks:: check with serial execution " << std::endl << " diff = " << diff << ", norm = " << norm << ", rel err = " << (diff/norm) << std::endl; std::cout.unsetf(std::ios::scientific); } { const auto prec = std::cout.precision(); std::cout.precision(4); std::cout << std::scientific; std::cout << "CholByBlocks:: Given matrix = " << AA_host.NumRows() << " x " << AA_host.NumCols() << ", nnz = " << AA_host.NumNonZeros() << std::endl; std::cout << "CholByBlocks:: Factored matrix = " << AA_factor_host.NumRows() << " x " << AA_factor_host.NumCols() << ", nnz = " << AA_factor_host.NumNonZeros() << std::endl; std::cout << "CholByBlocks:: Hier matrix = " << HA_factor_host.NumRows() << " x " << HA_factor_host.NumCols() << ", nnz = " << HA_factor_host.NumNonZeros() << std::endl; std::cout << "CholByBlocks:: " << "read = " << t_read << " [sec], " << "graph generation = " << (t_graph/2.0) << " [sec] " << "scotch reordering = " << t_scotch << " [sec] " << "camd reordering = " << t_camd << " [sec] " << std::endl << "CholByBlocks:: " << "symbolic factorization = " << t_symbolic << " [sec] " << std::endl << "CholByBlocks:: " << "policy creation = " << t_policy << " [sec] " << "hier creation = " << t_hier << " [sec] " << "block specification = " << t_blocks << " [sec] " << std::endl << "CholByBlocks:: " << "Chol Parallel = " << t_chol_parallel << " [sec] "; if (check) std::cout << "Chol Serial = " << (check ? t_chol_serial : -1) << " [sec] " << "speed-up = " << (t_chol_serial/t_chol_parallel) << " [sec] "; std::cout << std::endl; std::cout.unsetf(std::ios::scientific); std::cout.precision(prec); } return r_val; }
void viennaCL_apply( KernelHandle *handle, typename KernelHandle::nnz_lno_t m, typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, in_row_index_view_type row_mapA, in_nonzero_index_view_type entriesA, in_nonzero_value_view_type valuesA, bool transposeA, bin_row_index_view_type row_mapB, bin_nonzero_index_view_type entriesB, bin_nonzero_value_view_type valuesB, bool transposeB, cin_row_index_view_type &row_mapC, cin_nonzero_index_view_type &entriesC, cin_nonzero_value_view_type &valuesC){ #ifdef KERNELS_HAVE_VIENNACL typedef typename KernelHandle::nnz_lno_t idx; typedef in_row_index_view_type idx_array_type; typedef typename KernelHandle::nnz_scalar_t value_type; typedef typename in_row_index_view_type::device_type device1; typedef typename in_nonzero_index_view_type::device_type device2; typedef typename in_nonzero_value_view_type::device_type device3; typedef typename KernelHandle::HandleExecSpace MyExecSpace; std::cout << "RUNNING VIENNACL" << std::endl; typedef typename viennacl::compressed_matrix<value_type>::handle_type it; typedef typename viennacl::compressed_matrix<value_type>::value_type vt; if ((Kokkos::Impl::is_same<idx, int>::value && Kokkos::Impl::is_same<typename KernelHandle::size_type, int>::value )|| (Kokkos::Impl::is_same<idx, unsigned int>::value && Kokkos::Impl::is_same<typename KernelHandle::size_type, unsigned int>::value ) || (Kokkos::Impl::is_same<idx, it>::value && Kokkos::Impl::is_same<typename KernelHandle::size_type, it>::value ) ){ unsigned int * a_xadj = (unsigned int *)row_mapA.ptr_on_device(); unsigned int * b_xadj = (unsigned int * )row_mapB.ptr_on_device(); unsigned int * c_xadj = (unsigned int * )row_mapC.ptr_on_device(); unsigned int * a_adj = (unsigned int * )entriesA.ptr_on_device(); unsigned int * b_adj = (unsigned int * )entriesB.ptr_on_device(); unsigned int * c_adj = (unsigned int * )entriesC.ptr_on_device(); int nnzA = entriesA.dimension_0(); int nnzB = entriesB.dimension_0(); value_type *a_ew = 
valuesA.ptr_on_device(); value_type *b_ew = valuesB.ptr_on_device(); value_type *c_ew = valuesC.ptr_on_device(); /* std::cout << "create a" << std::endl; std::cout << "m:" << m << " n:" << n << std::endl; std::cout << "a_xadj[0]:" << a_xadj[0] << " a_xadj[m]:" << a_xadj[m] << std::endl; std::cout << "a_adj[a_xadj[m] - 1]:" << a_adj[a_xadj[m] - 1] << " a_ew[a_xadj[m] - 1]:" << a_ew[a_xadj[m] - 1] << std::endl; */ Kokkos::Impl::Timer timerset; viennacl::compressed_matrix<value_type> A; viennacl::compressed_matrix<value_type> B; A.set(a_xadj, a_adj, a_ew, m, n, nnzA); B.set(b_xadj, b_adj, b_ew, n, k, nnzB); std::cout << "compress matrix create:" << timerset.seconds() << std::endl; std::cout << "Now running ViennaCL" << std::endl; Kokkos::Impl::Timer timer1; viennacl::compressed_matrix<value_type> C = viennacl::linalg::prod(A, B); std::cout << "Actual VIENNACL SPMM Time:" << timer1.seconds() << std::endl; { unsigned int c_rows = m, c_cols = k, cnnz = C.nnz(); value_type const * values = viennacl::linalg::host_based::detail::extract_raw_pointer<value_type>(C.handle()); unsigned int const * rows_start = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(C.handle1()); unsigned int const * columns = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(C.handle2()); { Kokkos::Impl::Timer copy_time; row_mapC = typename cin_row_index_view_type::non_const_type(Kokkos::ViewAllocateWithoutInitializing("rowmapC"), c_rows + 1); entriesC = typename cin_nonzero_index_view_type::non_const_type (Kokkos::ViewAllocateWithoutInitializing("EntriesC") , cnnz); valuesC = typename cin_nonzero_value_view_type::non_const_type (Kokkos::ViewAllocateWithoutInitializing("valuesC") , cnnz); KokkosKernels::Experimental::Util::copy_vector<unsigned int const *, typename cin_row_index_view_type::non_const_type, MyExecSpace> (m, rows_start, row_mapC); idx nnz = cnnz; KokkosKernels::Experimental::Util::copy_vector<unsigned int const *, typename 
cin_nonzero_index_view_type::non_const_type, MyExecSpace> (nnz, columns, entriesC); KokkosKernels::Experimental::Util::copy_vector<value_type const *, typename cin_nonzero_value_view_type::non_const_type, MyExecSpace> (m, values, valuesC); double copy_time_d = copy_time.seconds(); std::cout << "VIENNACL COPYTIME:" << copy_time_d << std::endl; } } } else { //int *a_xadj = row_mapA.ptr_on_device(); std::cerr << "vienna requires (u) integer values" << std::endl; if (Kokkos::Impl::is_same<idx, long>::value){ std::cerr << "MKL is given long" << std::endl; } else if (Kokkos::Impl::is_same<idx, const int>::value){ std::cerr << "MKL is given const int" << std::endl; } else if (Kokkos::Impl::is_same<idx, unsigned long>::value){ std::cerr << "MKL is given unsigned long" << std::endl; } else if (Kokkos::Impl::is_same<idx, const unsigned long>::value){ std::cerr << "MKL is given const unsigned long" << std::endl; } else{ std::cerr << "MKL is given something else" << std::endl; } return; } #else std::cerr << "VIENNACL IS NOT DEFINED" << std::endl; return; #endif }
/// Example driver: reads a Matrix Market file, optionally reorders the matrix
/// with GraphHelper_Scotch, and builds the non-zero pattern of the upper
/// factor with SymbolicFactorHelper. Sizes and the time spent in each stage
/// are printed to cout.
///
/// \param file_input   path of the Matrix Market input file
/// \param treecut      forwarded to computeOrdering() when reordering
/// \param minblksize   forwarded to computeOrdering() when reordering
/// \param seed         forwarded to the graph helper constructor
/// \param fill_level   forwarded to createNonZeroPattern()
/// \param league_size  forwarded to the symbolic factor helper constructor
/// \param reorder      when false the input matrix is used as is
/// \param verbose      when true the intermediate objects are streamed to cout
/// \return 0 on success, 1 when the input file could not be opened
KOKKOS_INLINE_FUNCTION
int exampleSymbolicFactor(const string file_input,
                          const int treecut,
                          const int minblksize,
                          const int seed,
                          const int fill_level,
                          const int league_size,
                          const bool reorder,
                          const bool verbose) {
  typedef CrsMatrixBase<ValueType,OrdinalType,SizeType,SpaceType,MemoryTraits> CrsMatrixBaseType;
  typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
  typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;

  int r_val = 0;

  Kokkos::Impl::Timer stopwatch;
  double elapsed = 0.0;

  // ---- stage 1: import --------------------------------------------------
  cout << "SymbolicFactor:: import input file = " << file_input << endl;
  CrsMatrixBaseType AA("AA");
  {
    stopwatch.reset();

    ifstream input(file_input);
    if (!input.good()) {
      cout << "Failed in open the file: " << file_input << endl;
      ++r_val;
      return r_val;
    }
    AA.importMatrixMarket(input);

    elapsed = stopwatch.seconds();

    cout << "SymbolicFactor:: AA nnz = " << AA.NumNonZeros() << endl;
    if (verbose)
      cout << AA << endl;
  }
  cout << "SymbolicFactor:: import input file::time = " << elapsed << endl;

  // ---- stage 2: (optional) reordering ------------------------------------
  CrsMatrixBaseType PA("Permuted AA");
  GraphHelperType S(AA, seed);
  if (!reorder) {
    // No permutation requested: work on the imported matrix directly.
    PA = AA;
    elapsed = 0.0;
  } else {
    stopwatch.reset();

    S.computeOrdering(treecut, minblksize);
    PA.copy(S.PermVector(), S.InvPermVector(), AA);

    elapsed = stopwatch.seconds();

    if (verbose)
      cout << S << endl
           << PA << endl;
  }
  cout << "SymbolicFactor:: reorder the matrix::time = " << elapsed << endl;

  // ---- stage 3: symbolic factorization -----------------------------------
  CrsMatrixBaseType UU("UU");
  {
    stopwatch.reset();

    SymbolicFactorHelperType symbolic(PA, league_size);
    symbolic.createNonZeroPattern(fill_level, Uplo::Upper, UU);

    elapsed = stopwatch.seconds();

    cout << "SymbolicFactor:: UU nnz = " << UU.NumNonZeros() << endl;
    if (verbose) {
      cout << symbolic << endl;
      cout << UU << endl;
    }
  }
  cout << "SymbolicFactor:: factorize the matrix::time = " << elapsed << endl;

  return r_val;
}
/// Example driver for a sparse Cholesky factorization followed by an optional
/// forward/backward triangular solve.
///
/// Stages, each timed and reported on cout:
///   1. import the matrix from a Matrix Market file,
///   2. reorder it (computeOrdering + pruneTree) and permute,
///   3. create the non-zero pattern of the upper factor,
///   4. build hierarchical (block) views of the factor and of the dense
///      right-hand side / solution matrices,
///   5. factorize, either with the serial unblocked kernel or with the
///      task-based by-blocks kernel,
///   6. optionally solve and compare against the manufactured solution.
///
/// \param file_input           path of the Matrix Market input file
/// \param prunecut             forwarded to pruneTree()
/// \param seed                 forwarded to the graph helper constructor
/// \param nrhs                 number of right-hand-side columns
/// \param nb                   forwarded to the dense flat2hier() calls
/// \param nthreads             not used in this function
/// \param max_task_dependence  forwarded to the task policy and task factory
/// \param team_size            only printed
/// \param league_size          forwarded to the symbolic factor helper
/// \param team_interface       forwarded to setUseTeamInterface()
/// \param serial               choose the serial kernels instead of by-blocks
/// \param solve                run the triangular solves after factorizing
/// \param check                with solve, report L2 / Linf of the error
/// \param verbose              not used in this function
/// \return 0 on success, 1 when the input file could not be opened
KOKKOS_INLINE_FUNCTION
int exampleCholDirectPlain(const string file_input,
                           const int prunecut,
                           const int seed,
                           const int nrhs,
                           const int nb,
                           const int nthreads,
                           const int max_task_dependence,
                           const int team_size,
                           const int league_size,
                           const bool team_interface,
                           const bool serial,
                           const bool solve,
                           const bool check,
                           const bool verbose) {
  typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
                      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

  // sparse types
  typedef CrsMatrixBase<ValueType,OrdinalType,SizeType,SpaceType,MemoryTraits> CrsMatrixBaseType;
  typedef GraphHelper_Scotch<CrsMatrixBaseType> GraphHelperType;
  typedef SymbolicFactorHelper<CrsMatrixBaseType> SymbolicFactorHelperType;
  typedef CrsMatrixView<CrsMatrixBaseType> CrsMatrixViewType;
  typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
  typedef CrsMatrixBase<CrsTaskViewType,OrdinalType,SizeType,SpaceType,MemoryTraits> CrsHierMatrixBaseType;
  typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
  typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;

  // dense types
  typedef DenseMatrixBase<ValueType,OrdinalType,SizeType,SpaceType,MemoryTraits> DenseMatrixBaseType;
  typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType;
  typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType;
  typedef DenseMatrixBase<DenseTaskViewType,OrdinalType,SizeType,SpaceType,MemoryTraits> DenseHierMatrixBaseType;
  typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType;
  typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType;

  int r_val = 0;

  Kokkos::Impl::Timer watch;
  double t_import = 0.0;
  double t_reorder = 0.0;
  double t_symbolic = 0.0;
  double t_flat2hier = 0.0;
  double t_factor = 0.0;
  double t_solve = 0.0;

  // ---- import ------------------------------------------------------------
  cout << "CholDirectPlain:: import input file = " << file_input << endl;
  CrsMatrixBaseType AA("AA");
  {
    watch.reset();

    ifstream input(file_input);
    if (!input.good()) {
      cout << "Failed in open the file: " << file_input << endl;
      ++r_val;
      return r_val;
    }
    AA.importMatrixMarket(input);

    t_import = watch.seconds();
  }
  cout << "CholDirectPlain:: import input file::time = " << t_import << endl;

  // Permuted matrix, its upper-triangular factor, and the hierarchical
  // matrix of views built on top of the factor.
  CrsMatrixBaseType PA("Permuted AA");
  CrsMatrixBaseType UU("UU");
  CrsHierMatrixBaseType HU("HU");

  // Right-hand side and solution, flat and hierarchical.
  DenseMatrixBaseType BB("BB", AA.NumRows(), nrhs), XX("XX", AA.NumRows(), nrhs);
  DenseHierMatrixBaseType HB("HB"), HX("HX");

  {
    // ---- reorder ---------------------------------------------------------
    cout << "CholDirectPlain:: reorder the matrix" << endl;
    GraphHelperType S(AA, seed);
    {
      watch.reset();

      S.computeOrdering();
      S.pruneTree(prunecut);
      PA.copy(S.PermVector(), S.InvPermVector(), AA);

      t_reorder = watch.seconds();
    }
    cout << "CholDirectPlain:: reorder the matrix::time = " << t_reorder << endl;

    // ---- symbolic factorization -------------------------------------------
    {
      // The helper is constructed before the timer restarts, so only the
      // pattern creation is timed.
      SymbolicFactorHelperType F(PA, league_size);

      watch.reset();
      F.createNonZeroPattern(Uplo::Upper, UU);
      t_symbolic = watch.seconds();

      cout << "CholDirectPlain:: AA (nnz) = " << AA.NumNonZeros()
           << ", UU (nnz) = " << UU.NumNonZeros() << endl;
    }
    cout << "CholDirectPlain:: symbolic factorization::time = " << t_symbolic << endl;

    // ---- flat -> hierarchical ---------------------------------------------
    {
      watch.reset();

      CrsMatrixHelper::flat2hier(Uplo::Upper, UU, HU,
                                 S.NumBlocks(),
                                 S.RangeVector(),
                                 S.TreeVector());

      // Every sparse block needs its row-view meta data filled in.
      for (OrdinalType blk=0; blk<HU.NumNonZeros(); ++blk)
        HU.Value(blk).fillRowViewArray();

      DenseMatrixHelper::flat2hier(BB, HB,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   nb);
      DenseMatrixHelper::flat2hier(XX, HX,
                                   S.NumBlocks(),
                                   S.RangeVector(),
                                   nb);

      t_flat2hier = watch.seconds();

      cout << "CholDirectPlain:: Hier (dof, nnz) = " << HU.NumRows() << ", " << HU.NumNonZeros() << endl;
    }
    cout << "CholDirectPlain:: construct hierarchical matrix::time = " << t_flat2hier << endl;
  }

  {
    // ---- task policy -------------------------------------------------------
#ifdef __USE_FIXED_TEAM_SIZE__
    typename TaskFactoryType::policy_type policy(max_task_dependence);
#else
    typename TaskFactoryType::policy_type policy(max_task_dependence, 1);
#endif
    TaskFactoryType::setUseTeamInterface(team_interface);
    TaskFactoryType::setMaxTaskDependence(max_task_dependence);
    TaskFactoryType::setPolicy(&policy);

    CrsTaskViewType A(&PA), U(&UU);
    DenseTaskViewType X(&XX), B(&BB);
    A.fillRowViewArray();
    U.fillRowViewArray();

    CrsHierTaskViewType TU(&HU);
    DenseHierTaskViewType TB(&HB), TX(&HX);

    {
      // Manufacture the right-hand side: column j of X is filled with j+1,
      // B is computed from A and X, and B is then copied into XX.
      const int nrows = A.NumRows();
      for (int col=0; col<nrhs; ++col)
        for (int row=0; row<nrows; ++row)
          X.Value(row,col) = (col+1);

      Gemm<Trans::NoTranspose,Trans::NoTranspose,AlgoGemm::ForTriSolveBlocked>
        ::invoke(TaskFactoryType::Policy(),
                 TaskFactoryType::Policy().member_single(),
                 1.0, A, X, 0.0, B);

      XX.copy(BB);
    }

    // ---- numeric factorization ----------------------------------------------
    if (!serial) {
      cout << "CholDirectPlain:: ByBlocks factorize the matrix:: team_size = " << team_size << endl;
      watch.reset();

      auto factor_task = TaskFactoryType::Policy().create_team
        (Chol<Uplo::Upper,AlgoChol::ByBlocks>
         ::TaskFunctor<CrsHierTaskViewType>(TU), 0);
      TaskFactoryType::Policy().spawn(factor_task);
      Kokkos::Experimental::wait(TaskFactoryType::Policy());

      t_factor = watch.seconds();
      cout << "CholDirectPlain:: ByBlocks factorize the matrix::time = " << t_factor << endl;
    } else {
      cout << "CholDirectPlain:: Serial factorize the matrix" << endl;
      watch.reset();

      Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
        ::invoke(TaskFactoryType::Policy(),
                 TaskFactoryType::Policy().member_single(),
                 U);

      t_factor = watch.seconds();
      cout << "CholDirectPlain:: Serial factorize the matrix::time = " << t_factor << endl;
    }

    // ---- triangular solves ----------------------------------------------------
    if (solve && serial) {
      cout << "CholDirectPlain:: Serial forward/backward solve" << endl;
      watch.reset();

      TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::Unblocked>
        ::invoke(TaskFactoryType::Policy(),
                 TaskFactoryType::Policy().member_single(),
                 Diag::NonUnit, U, X);
      TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::Unblocked>
        ::invoke(TaskFactoryType::Policy(),
                 TaskFactoryType::Policy().member_single(),
                 Diag::NonUnit, U, X);

      t_solve = watch.seconds();
      cout << "CholDirectPlain:: Serial forward/backward solve::time = " << t_solve << endl;
    } else if (solve) {
      cout << "CholDirectPlain:: ByBlocks forward/backward solve" << endl;
      watch.reset();

      auto forward_task = TaskFactoryType::Policy().create_team
        (TriSolve<Uplo::Upper,Trans::ConjTranspose,AlgoTriSolve::ByBlocks>
         ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
         (Diag::NonUnit, TU, TX), 0);
      TaskFactoryType::Policy().spawn(forward_task);

      // The backward solve is registered as depending on the forward solve.
      auto backward_task = TaskFactoryType::Policy().create_team
        (TriSolve<Uplo::Upper,Trans::NoTranspose,AlgoTriSolve::ByBlocks>
         ::TaskFunctor<CrsHierTaskViewType,DenseHierTaskViewType>
         (Diag::NonUnit, TU, TX), 1);
      TaskFactoryType::Policy().add_dependence(backward_task, forward_task);
      TaskFactoryType::Policy().spawn(backward_task);

      Kokkos::Experimental::wait(TaskFactoryType::Policy());

      t_solve = watch.seconds();
      cout << "CholDirectPlain:: ByBlocks forward/backward solve::time = " << t_solve << endl;
    }

    // ---- compare with the manufactured solution ---------------------------------
    if (solve && check) {
      double l2 = 0.0, linf = 0.0;
      const int nrows = A.NumRows();
      for (int col=0; col<nrhs; ++col)
        for (int row=0; row<nrows; ++row) {
          double delta = abs(X.Value(row,col) - (col+1));
          l2 += delta*delta;
          linf = max(delta, linf);
        }
      l2 = sqrt(l2);
      cout << "CholDirectPlain:: Check solution::L2 = " << l2 << ", Linf = " << linf << endl;
    }
  }

  return r_val;
}
/// Example driver that times a task-based, by-blocks dense GEMM for a range
/// of square output sizes and prints one summary line per size to cout.
///
/// For every m in [mmin, mmax] (step minc) random matrices are generated,
/// the by-blocks kernel is run through the task policy, and its rate in
/// GFLOPs is reported. With check enabled the ExternalBlas kernel is run
/// first on a copy of C and the two results are compared.
///
/// \param mmin                 first value of m
/// \param mmax                 last value of m (inclusive)
/// \param minc                 increment of m
/// \param k                    inner dimension of the product
/// \param mb                   block size given to flat2hier() in both directions
/// \param max_concurrency      forwarded to the task policy constructor
/// \param max_task_dependence  forwarded to the task policy and task factory
/// \param team_size            forwarded to the task policy constructor
/// \param mkl_nthreads         passed to mkl_set_num_threads() when
///                             HAVE_SHYLUTACHO_MKL is defined
/// \param check                also run and compare against ExternalBlas
/// \param verbose              not used in this function
/// \return always 0
KOKKOS_INLINE_FUNCTION
int exampleDenseGemmByBlocks(const OrdinalType mmin,
                             const OrdinalType mmax,
                             const OrdinalType minc,
                             const OrdinalType k,
                             const OrdinalType mb,
                             const int max_concurrency,
                             const int max_task_dependence,
                             const int team_size,
                             const int mkl_nthreads,
                             const bool check,
                             const bool verbose) {
  typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
                      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;

  typedef DenseMatrixBase<ValueType,OrdinalType,SizeType,SpaceType,MemoryTraits> DenseMatrixBaseType;
  typedef DenseMatrixView<DenseMatrixBaseType> DenseMatrixViewType;
  typedef TaskView<DenseMatrixViewType,TaskFactoryType> DenseTaskViewType;

  typedef DenseMatrixBase<DenseTaskViewType,OrdinalType,SizeType,SpaceType,MemoryTraits> DenseHierMatrixBaseType;
  typedef DenseMatrixView<DenseHierMatrixBaseType> DenseHierMatrixViewType;
  typedef TaskView<DenseHierMatrixViewType,TaskFactoryType> DenseHierTaskViewType;

  int r_val = 0;

  Kokkos::Impl::Timer stopwatch;
  double elapsed = 0.0;

  cout << "DenseGemmByBlocks:: test matrices "
       <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << " , mb = " << mb << endl;

  // The original author's note on the constant: a slack of 128 led to an error.
  const size_t max_task_size = (3*sizeof(DenseTaskViewType)+196);

  typename TaskFactoryType::policy_type policy(max_concurrency,
                                               max_task_size,
                                               max_task_dependence,
                                               team_size);

  TaskFactoryType::setMaxTaskDependence(max_task_dependence);
  TaskFactoryType::setPolicy(&policy);

  // One report line is assembled per problem size and printed at the end of
  // the iteration.
  ostringstream os;
  os.precision(3);
  os << scientific;

  const bool a_not_transposed = (ArgTransA == Trans::NoTranspose);
  const bool b_not_transposed = (ArgTransB == Trans::NoTranspose);

  for (OrdinalType m=mmin; m<=mmax; m+=minc) {
    os.str("");

    DenseMatrixBaseType AA, BB, CC("CC", m, m), CB("CB", m, m);

    // A and B are stored with the shape their transposition flag implies.
    AA = DenseMatrixBaseType("AA", a_not_transposed ? m : k, a_not_transposed ? k : m);
    BB = DenseMatrixBaseType("BB", b_not_transposed ? k : m, b_not_transposed ? m : k);

    // Random entries in [-1, 1], filled column by column: A, then B, then C.
    for (OrdinalType col=0; col<AA.NumCols(); ++col)
      for (OrdinalType row=0; row<AA.NumRows(); ++row)
        AA.Value(row,col) = 2.0*((ValueType)rand()/(RAND_MAX)) - 1.0;

    for (OrdinalType col=0; col<BB.NumCols(); ++col)
      for (OrdinalType row=0; row<BB.NumRows(); ++row)
        BB.Value(row,col) = 2.0*((ValueType)rand()/(RAND_MAX)) - 1.0;

    for (OrdinalType col=0; col<CC.NumCols(); ++col)
      for (OrdinalType row=0; row<CC.NumRows(); ++row)
        CC.Value(row,col) = 2.0*((ValueType)rand()/(RAND_MAX)) - 1.0;

    // CB keeps a copy of the initial C for the reference computation.
    CB.copy(CC);

    const double flop = get_flop_gemm<ValueType>(m, m, k);

#ifdef HAVE_SHYLUTACHO_MKL
    mkl_set_num_threads(mkl_nthreads);
#endif

    os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k;

    if (check) {
      // Reference run on the copy CB.
      stopwatch.reset();

      DenseTaskViewType A(&AA), B(&BB), C(&CB);
      Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas>::invoke
        (TaskFactoryType::Policy(),
         TaskFactoryType::Policy().member_single(),
         1.0, A, B, 1.0, C);

      elapsed = stopwatch.seconds();
      os << ":: Serial Performance = " << (flop/elapsed/1.0e9) << " [GFLOPs] ";
    }

    {
      // Task-based by-blocks run on CC; only the task execution is timed.
      DenseHierMatrixBaseType HA, HB, HC;

      DenseMatrixHelper::flat2hier(AA, HA, mb, mb);
      DenseMatrixHelper::flat2hier(BB, HB, mb, mb);
      DenseMatrixHelper::flat2hier(CC, HC, mb, mb);

      DenseHierTaskViewType TA(&HA), TB(&HB), TC(&HC);

      stopwatch.reset();

      auto gemm_task = TaskFactoryType::Policy().create_team
        (typename Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,Variant::One>
         ::template TaskFunctor<ValueType,DenseHierTaskViewType,DenseHierTaskViewType,DenseHierTaskViewType>
         (1.0, TA, TB, 1.0, TC), 0);
      TaskFactoryType::Policy().spawn(gemm_task);
      Kokkos::Experimental::wait(TaskFactoryType::Policy());

      elapsed = stopwatch.seconds();
      os << ":: Parallel Performance = " << (flop/elapsed/1.0e9) << " [GFLOPs] ";
    }

    if (check) {
      typedef typename Teuchos::ScalarTraits<ValueType>::magnitudeType real_type;

      // Sums of squares of the difference and of the reference values.
      real_type err = 0.0, norm = 0.0;
      for (OrdinalType col=0; col<CC.NumCols(); ++col)
        for (OrdinalType row=0; row<CC.NumRows(); ++row) {
          const real_type diff = abs(CC.Value(row,col) - CB.Value(row,col));
          const real_type val  = CB.Value(row,col);
          err  += diff*diff;
          norm += val*val;
        }
      os << ":: Check result ::err = " << sqrt(err) << ", norm = " << sqrt(norm);
    }

    cout << os.str() << endl;
  }

  return r_val;
}
BASKER_INLINE int Basker<Int, Entry, Exe_Space>::factor_notoken(Int option) { //printf("factor no token called \n"); gn = A.ncol; gm = A.nrow; BASKER_MATRIX ATEMP; //Kokkos::Impl::Timer tza; if(Options.btf == BASKER_TRUE) { //JDB: We can change this for the new inteface gn = A.ncol; gm = A.nrow; ATEMP = A; A = BTF_A; } //printf("Switch time: %f \n", tza.seconds()); //Spit into Domain and Sep //----------------------Domain-------------------------// #ifdef BASKER_KOKKOS //====TIMER== #ifdef BASKER_TIME Kokkos::Impl::Timer timer; #endif //===TIMER=== typedef Kokkos::TeamPolicy<Exe_Space> TeamPolicy; if(btf_tabs_offset != 0) { if(Options.verbose == BASKER_TRUE) { printf("Factoring Dom num_threads: %d \n", num_threads); } Int domain_restart = 0; kokkos_nfactor_domain <Int,Entry,Exe_Space> domain_nfactor(this); Kokkos::parallel_for(TeamPolicy(num_threads,1), domain_nfactor); Kokkos::fence(); //=====Check for error====== while(true) { INT_1DARRAY thread_start; MALLOC_INT_1DARRAY(thread_start, num_threads+1); init_value(thread_start, num_threads+1, (Int) BASKER_MAX_IDX); int nt = nfactor_domain_error(thread_start); if((nt == BASKER_SUCCESS) || (nt == BASKER_ERROR) || (domain_restart > BASKER_RESTART)) { break; } else { domain_restart++; if(Options.verbose == BASKER_TRUE) { printf("restart \n"); } kokkos_nfactor_domain_remalloc <Int, Entry, Exe_Space> diag_nfactor_remalloc(this, thread_start); Kokkos::parallel_for(TeamPolicy(num_threads,1), diag_nfactor_remalloc); Kokkos::fence(); } }//end while //====TIMER=== #ifdef BASKER_TIME printf("Time DOMAIN: %f \n", timer.seconds()); timer.reset(); #endif //====TIMER==== #else// else basker_kokkos #pragma omp parallel { }//end omp parallel #endif //end basker_kokkos } //-------------------End--Domian--------------------------// //printVec("domperm.csc", gpermi, A.nrow); //---------------------------Sep--------------------------// if(btf_tabs_offset != 0) { //for(Int l=1; l<=4; l++) for(Int l=1; l <= tree.nlvls; l++) { //#ifdef 
BASKER_OLD_BARRIER //Int lthreads = pow(2,l); //Int lnteams = num_threads/lthreads; //#else Int lthreads = 1; Int lnteams = num_threads/lthreads; //#endif Int sep_restart = 0; if(Options.verbose == BASKER_TRUE) { printf("Factoring Sep num_threads: %d %d \n", lnteams, lthreads); } #ifdef BASKER_KOKKOS Kokkos::Impl::Timer timer_inner_sep; #ifdef BASKER_NO_LAMBDA //kokkos_nfactor_sep <Int, Entry, Exe_Space> //sep_nfactor(this, l); kokkos_nfactor_sep2 <Int, Entry, Exe_Space> sep_nfactor(this,l); Kokkos::parallel_for(TeamPolicy(lnteams,lthreads), sep_nfactor); Kokkos::fence(); //======Check for error===== while(true) { INT_1DARRAY thread_start; MALLOC_INT_1DARRAY(thread_start, num_threads+1); init_value(thread_start, num_threads+1, (Int) BASKER_MAX_IDX); int nt = nfactor_sep_error(thread_start); if((nt == BASKER_SUCCESS)|| (nt == BASKER_ERROR) || (sep_restart > BASKER_RESTART)) { FREE_INT_1DARRAY(thread_start); break; } else { sep_restart++; if (Options.verbose == BASKER_TRUE) { printf("restart \n"); } Kokkos::parallel_for(TeamPolicy(lnteams,lthreads), sep_nfactor); Kokkos::fence(); } }//end while-true #ifdef BASKER_TIME printf("Time INNERSEP: %d %f \n", l, timer_inner_sep.seconds()); #endif #else //ELSE BASKER_NO_LAMBDA //Note: to be added #endif //end BASKER_NO_LAMBDA #else #pragma omp parallel { }//end omp parallel #endif }//end over each level #ifdef BASKER_TIME printf("Time SEP: %f \n", timer.seconds()); #endif } //-------------------------End Sep----------------// //-------------------IF BTF-----------------------// if(Options.btf == BASKER_TRUE) { Int btf_restart = 0; if(Options.verbose == BASKER_TRUE) { printf("Factoring BLKs num_threads: %d \n", num_threads); } //=====Timer #ifdef BASKER_TIME Kokkos::Impl::Timer timer_btf; #endif //====Timer //======Call diag factor==== kokkos_nfactor_diag <Int, Entry, Exe_Space> diag_nfactor(this); Kokkos::parallel_for(TeamPolicy(num_threads,1), diag_nfactor); Kokkos::fence(); //=====Check for error====== while(true) { 
INT_1DARRAY thread_start; MALLOC_INT_1DARRAY(thread_start, num_threads+1); init_value(thread_start, num_threads+1, (Int) BASKER_MAX_IDX); int nt = nfactor_diag_error(thread_start); //printf("RETURNED: %d \n", nt); if((nt == BASKER_SUCCESS) || (nt == BASKER_ERROR) || (btf_restart > BASKER_RESTART)) { break; } else { btf_restart++; if (Options.verbose == BASKER_TRUE) { printf("restart \n"); } kokkos_nfactor_diag_remalloc <Int, Entry, Exe_Space> diag_nfactor_remalloc(this, thread_start); Kokkos::parallel_for(TeamPolicy(num_threads,1), diag_nfactor_remalloc); Kokkos::fence(); } }//end while //====TIMER #ifdef BASKER_TIME printf("Time BTF: %f \n", timer_btf.seconds()); #endif //===TIMER }//end btf call Kokkos::Impl::Timer tzback; if(Options.btf == BASKER_TRUE) { A = ATEMP; } //printf("Switch back: %f \n", // tzback.seconds()); return 0; }//end factor_notoken()
int exampleDenseGemmByBlocks(const ordinal_type mmin, const ordinal_type mmax, const ordinal_type minc, const ordinal_type k, const ordinal_type mb, const int max_concurrency, const int max_task_dependence, const int team_size, const int mkl_nthreads, const bool check, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType; typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType; typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType; typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType; typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType; typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType; typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; std::cout << "DenseGemmByBlocks:: test matrices " <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << " , mb = " << mb << std::endl; const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); std::ostringstream os; os.precision(3); os << std::scientific; for (ordinal_type m=mmin;m<=mmax;m+=minc) { os.str(""); // host matrices DenseMatrixBaseHostType AA_host, BB_host, CC_host("CC_host", m, m), CB_host("CB_host", 
m, m); { if (ArgTransA == Trans::NoTranspose) AA_host = DenseMatrixBaseHostType("AA_host", m, k); else AA_host = DenseMatrixBaseHostType("AA_host", k, m); if (ArgTransB == Trans::NoTranspose) BB_host = DenseMatrixBaseHostType("BB_host", k, m); else BB_host = DenseMatrixBaseHostType("BB_host", m, k); for (ordinal_type j=0;j<AA_host.NumCols();++j) for (ordinal_type i=0;i<AA_host.NumRows();++i) AA_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; for (ordinal_type j=0;j<BB_host.NumCols();++j) for (ordinal_type i=0;i<BB_host.NumRows();++i) BB_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; for (ordinal_type j=0;j<CC_host.NumCols();++j) for (ordinal_type i=0;i<CC_host.NumRows();++i) CC_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; DenseMatrixTools::copy(CB_host, CC_host); } const double flop = DenseFlopCount<value_type>::Gemm(m, m, k); #ifdef HAVE_SHYLUTACHO_MKL mkl_set_num_threads(mkl_nthreads); #endif os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k << " "; if (check) { timer.reset(); DenseMatrixViewHostType A_host(AA_host), B_host(BB_host), C_host(CB_host); Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas,Variant::One>::invoke (policy, policy.member_single(), 1.0, A_host, B_host, 1.0, C_host); t = timer.seconds(); os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } DenseMatrixBaseDeviceType AA_device("AA_device"), BB_device("BB_device"), CC_device("CC_device"); { timer.reset(); AA_device.mirror(AA_host); BB_device.mirror(BB_host); CC_device.mirror(CC_host); t = timer.seconds(); os << ":: Mirror = " << t << " [sec] "; } { DenseHierMatrixBaseDeviceType HA_device("HA_device"), HB_device("HB_device"), HC_device("HC_device"); DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb); DenseMatrixTools::createHierMatrix(HB_device, BB_device, mb, mb); DenseMatrixTools::createHierMatrix(HC_device, CC_device, mb, mb); DenseHierTaskViewDeviceType TA_device(HA_device), TB_device(HB_device), 
TC_device(HC_device); timer.reset(); auto future = policy.proc_create_team (Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,ArgVariant> ::createTaskFunctor(policy, 1.0, TA_device, TB_device, 1.0, TC_device), 0); policy.spawn(future); Kokkos::Experimental::wait(policy); t = timer.seconds(); os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } CC_host.mirror(CC_device); if (check) { double err = 0.0, norm = 0.0; for (ordinal_type j=0;j<CC_host.NumCols();++j) for (ordinal_type i=0;i<CC_host.NumRows();++i) { const double diff = abs(CC_host.Value(i,j) - CB_host.Value(i,j)); const double val = CB_host.Value(i,j); err += diff*diff; norm += val*val; } os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err); } std::cout << os.str() << std::endl; } return r_val; }