size_t test_global_to_local_ids(unsigned num_ids, unsigned capacity, unsigned num_find_iterations) { typedef Device execution_space; typedef typename execution_space::size_type size_type; typedef Kokkos::View<uint32_t*,execution_space> local_id_view; typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view; double elasped_time = 0; Kokkos::Timer timer; local_id_view local_2_global("local_ids", num_ids); global_id_view global_2_local(capacity); int shiftw = 15; //create elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "allocate: " << elasped_time << std::endl; timer.reset(); // generate unique ids { generate_ids<Device> gen(local_2_global); } // generate elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "generate: " << elasped_time << std::endl; timer.reset(); { fill_map<Device> fill(global_2_local, local_2_global); } // fill elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "fill: " << elasped_time << std::endl; timer.reset(); size_t num_errors = global_2_local.failed_insert(); if (num_errors == 0u) { for (unsigned i=0; i<num_find_iterations; ++i) { find_test<Device> find(global_2_local, local_2_global,num_errors); } // find elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "lookup: " << elasped_time << std::endl; } else { std::cout << " !!! Fill Failed !!!" << std::endl; } return num_errors; }
int main(int narg, char* arg[]) { Kokkos::initialize(narg,arg); int size = 1000000; // Create Views idx_type idx("Idx",size,64); view_type dest("Dest",size); view_type src("Src",size); srand(134231); Kokkos::fence(); // When using UVM Cuda views can be accessed on the Host directly for(int i=0; i<size; i++) { for(int j=0; j<int(idx.dimension_1()); j++) idx(i,j) = (size + i + (rand()%500 - 250))%size; } Kokkos::fence(); // Run on the device // This will cause a sync of idx to the device since it was modified on the host Kokkos::Timer timer; Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src)); Kokkos::fence(); double sec1_dev = timer.seconds(); // No data transfer will happen now, since nothing is accessed on the host timer.reset(); Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src)); Kokkos::fence(); double sec2_dev = timer.seconds(); // Run on the host // This will cause a sync back to the host of dest which was changed on the device // Compare runtime here with the dual_view example: dest will be copied back in 4k blocks // when they are accessed the first time during the parallel_for. Due to the latency of a memcpy // this gives lower effective bandwidth when doing a manual copy via dual views timer.reset(); Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src)); Kokkos::fence(); double sec1_host = timer.seconds(); // No data transfers will happen now timer.reset(); Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src)); Kokkos::fence(); double sec2_host = timer.seconds(); printf("Device Time with Sync: %e without Sync: %e \n",sec1_dev,sec2_dev); printf("Host Time with Sync: %e without Sync: %e \n",sec1_host,sec2_host); Kokkos::finalize(); }
int main(int argc, char* args[]) { if (argc != 3){ printf("Please pass two integers on the command line\n"); } else { // Initialize Kokkos Kokkos::initialize(argc,args); int size = atoi(args[1]); int samples = atoi(args[2]); // Create two random number generator pools one for 64bit states and one for 1024 bit states // Both take an 64 bit unsigned integer seed to initialize a Random_XorShift64 generator which // is used to fill the generators of the pool. Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857); Kokkos::Random_XorShift1024_Pool<> rand_pool1024(5374857); Kokkos::DualView<uint64_t*> vals("Vals",size*samples); // Run some performance comparisons Kokkos::Timer timer; Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples)); Kokkos::fence(); timer.reset(); Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples)); Kokkos::fence(); double time_64 = timer.seconds(); Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift1024_Pool<> >(vals.d_view,rand_pool1024,samples)); Kokkos::fence(); timer.reset(); Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift1024_Pool<> >(vals.d_view,rand_pool1024,samples)); Kokkos::fence(); double time_1024 = timer.seconds(); printf("#Time XorShift64*: %e %e\n",time_64,1.0e-9*samples*size/time_64 ); printf("#Time XorShift1024*: %e %e\n",time_1024,1.0e-9*samples*size/time_1024 ); Kokkos::deep_copy(vals.h_view,vals.d_view); Kokkos::finalize(); } return 0; }
// C-linkage entry point that resets this file's tracking state: the first 16
// space_size counters are zeroed, the number of known spaces goes back to
// zero and the shared timer is restarted. None of the four arguments is read.
extern "C" void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, const uint32_t devInfoCount, void* deviceInfo)
{
  // Clear the per-space byte counters.
  int slot = 0;
  while (slot < 16) {
    space_size[slot] = 0;
    ++slot;
  }

  // No memory spaces have been seen yet.
  num_spaces = 0;

  // Timestamps taken from now on are relative to this call.
  timer.reset();
}
int main(int narg, char* args[]) { Kokkos::initialize(narg,args); int chunk_size = 1024; int nchunks = 100000; //1024*1024; Kokkos::DualView<int*> data("data",nchunks*chunk_size+1); srand(1231093); for(int i = 0; i < (int) data.dimension_0(); i++) { data.h_view(i) = rand()%TEAM_SIZE; } data.modify<Host>(); data.sync<Device>(); Kokkos::DualView<int**> histogram("histogram",TEAM_SIZE,TEAM_SIZE); Kokkos::Timer timer; // threads/team is automatically limited to maximum supported by the device. Kokkos::parallel_for( team_policy( nchunks , TEAM_SIZE ) , find_2_tuples(chunk_size,data,histogram) ); Kokkos::fence(); double time = timer.seconds(); histogram.sync<Host>(); printf("Time: %f \n\n",time); int sum = 0; for(int k=0; k<TEAM_SIZE; k++) { for(int l=0; l<TEAM_SIZE; l++) { printf("%i ",histogram.h_view(k,l)); sum += histogram.h_view(k,l); } printf("\n"); } printf("Result: %i %i\n",sum,chunk_size*nchunks); Kokkos::finalize(); }
extern "C" void kokkosp_allocate_data(const SpaceHandle space, const char* label, const void* const ptr, const uint64_t size) { std::lock_guard<std::mutex> lock(m); double time = timer.seconds(); int space_i = num_spaces; for(int s = 0; s<num_spaces; s++) if(strcmp(space_name[s],space.name)==0) space_i = s; if(space_i == num_spaces) { strncpy(space_name[num_spaces],space.name,64); num_spaces++; } space_size[space_i] += size; space_size_track[space_i].push_back(std::make_tuple(time,space_size[space_i],max_mem_usage())); }
void test_dynrankview_op_perf( const int par_size ) { typedef DeviceType execution_space; typedef typename execution_space::size_type size_type; const size_type dim2 = 900; const size_type dim3 = 300; double elapsed_time_view = 0; double elapsed_time_compview = 0; double elapsed_time_strideview = 0; double elapsed_time_view_rank7 = 0; double elapsed_time_drview = 0; double elapsed_time_compdrview = 0; Kokkos::Timer timer; { Kokkos::View<double***,DeviceType> testview("testview",par_size,dim2,dim3); typedef InitViewFunctor<DeviceType> FunctorType; timer.reset(); Kokkos::RangePolicy<DeviceType> policy(0,par_size); Kokkos::parallel_for( policy , FunctorType(testview) ); DeviceType::fence(); elapsed_time_view = timer.seconds(); std::cout << " View time (init only): " << elapsed_time_view << std::endl; timer.reset(); Kokkos::View<double*,DeviceType> sumview("sumview",par_size); Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) ); DeviceType::fence(); elapsed_time_compview = timer.seconds(); std::cout << " View sum computation time: " << elapsed_time_view << std::endl; Kokkos::View<double***,Kokkos::LayoutStride, DeviceType> teststrideview = Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL,Kokkos::ALL); typedef InitStrideViewFunctor<DeviceType> FunctorStrideType; timer.reset(); Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) ); DeviceType::fence(); elapsed_time_strideview = timer.seconds(); std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl; } { Kokkos::View<double*******,DeviceType> testview("testview",par_size,dim2,dim3,1,1,1,1); typedef InitViewRank7Functor<DeviceType> FunctorType; timer.reset(); Kokkos::RangePolicy<DeviceType> policy(0,par_size); Kokkos::parallel_for( policy , FunctorType(testview) ); DeviceType::fence(); elapsed_time_view_rank7 = timer.seconds(); std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl; } { 
Kokkos::DynRankView<double,DeviceType> testdrview("testdrview",par_size,dim2,dim3); typedef InitDynRankViewFunctor<DeviceType> FunctorType; timer.reset(); Kokkos::RangePolicy<DeviceType> policy(0,par_size); Kokkos::parallel_for( policy , FunctorType(testdrview) ); DeviceType::fence(); elapsed_time_drview = timer.seconds(); std::cout << " DynRankView time (init only): " << elapsed_time_drview << std::endl; timer.reset(); Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size); Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) ); DeviceType::fence(); elapsed_time_compdrview = timer.seconds(); std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl; } std::cout << " Ratio of View to DynRankView time: " << elapsed_time_view / elapsed_time_drview << std::endl; //expect < 1 std::cout << " Ratio of View to DynRankView sum computation time: " << elapsed_time_compview / elapsed_time_compdrview << std::endl; //expect < 1 std::cout << " Ratio of View to View Rank7 time: " << elapsed_time_view / elapsed_time_view_rank7 << std::endl; //expect < 1 std::cout << " Ratio of StrideView to DynRankView time: " << elapsed_time_strideview / elapsed_time_drview << std::endl; //expect < 1 std::cout << " Ratio of DynRankView to View Rank7 time: " << elapsed_time_drview / elapsed_time_view_rank7 << std::endl; //expect ? timer.reset(); } //end test_dynrankview
void sort_array( const size_t array_length /* length of spans of array to sort */ , const size_t total_length /* total length of array */ , const int print = 1 ) { typedef Device execution_space ; typedef Kokkos::View<int*,Device> device_array_type ; #if defined( KOKKOS_HAVE_CUDA ) typedef typename Kokkos::Impl::if_c< Kokkos::Impl::is_same< Device , Kokkos::Cuda >::value , Kokkos::View<int*,Kokkos::Cuda::array_layout,Kokkos::CudaHostPinnedSpace> , typename device_array_type::HostMirror >::type host_array_type ; #else typedef typename device_array_type::HostMirror host_array_type ; #endif Kokkos::Timer timer; const device_array_type work_array("work_array" , array_length ); const host_array_type host_array("host_array" , total_length ); std::cout << "sort_array length( " << total_length << " )" << " in chunks( " << array_length << " )" << std::endl ; double sec = timer.seconds(); std::cout << "declaring Views took " << sec << " seconds" << std::endl; timer.reset(); for ( size_t i = 0 ; i < total_length ; ++i ) { host_array(i) = ( lrand48() * total_length ) >> 31 ; } sec = timer.seconds(); std::cout << "initializing " << total_length << " elements on host took " << sec << " seconds" << std::endl; timer.reset(); double sec_copy_in = 0 ; double sec_sort = 0 ; double sec_copy_out = 0 ; double sec_error = 0 ; size_t error_count = 0 ; for ( size_t begin = 0 ; begin < total_length ; begin += array_length ) { const size_t end = begin + array_length < total_length ? 
begin + array_length : total_length ; const std::pair<size_t,size_t> host_range(begin,end); const host_array_type host_subarray = Kokkos::subview( host_array , host_range ); timer.reset(); Kokkos::deep_copy( work_array , host_subarray ); sec_copy_in += timer.seconds(); timer.reset(); SortView< execution_space >( work_array , 0 , end - begin ); sec_sort += timer.seconds(); timer.reset(); Kokkos::deep_copy( host_subarray , work_array ); sec_copy_out += timer.seconds(); timer.reset(); for ( size_t i = begin + 1 ; i < end ; ++i ) { if ( host_array(i) < host_array(i-1) ) ++error_count ; } sec_error += timer.seconds(); timer.reset(); } std::cout << "copy to device " << sec_copy_in << " seconds" << std::endl << "sort on device " << sec_sort << " seconds" << std::endl << "copy from device " << sec_copy_out << " seconds" << std::endl << "errors " << error_count << " took " << sec_error << " seconds" << std::endl ; }
void run_allocateview_tests(int N, int R) { const int N1 = N; const int N2 = N*N; const int N3 = N2*N; const int N4 = N2*N2; const int N8 = N4*N4; double time1,time2,time3,time4,time5,time6,time7,time8,time_raw = 100000.0; { Kokkos::Timer timer; for(int r=0; r<R; r++) { Kokkos::View<double*,Layout> a("A1",N8); } time1 = timer.seconds()/R; } { Kokkos::Timer timer; for(int r=0; r<R; r++) { Kokkos::View<double**,Layout> a("A2",N4,N4); } time2 = timer.seconds()/R; } { Kokkos::Timer timer; for(int r=0; r<R; r++) { Kokkos::View<double***,Layout> a("A3",N3,N3,N2); } time3 = timer.seconds()/R; } { Kokkos::Timer timer; for(int r=0; r<R; r++) { Kokkos::View<double****,Layout> a("A4",N2,N2,N2,N2); } time4 = timer.seconds()/R; } { Kokkos::Timer timer; for(int r=0; r<R; r++) { Kokkos::View<double*****,Layout> a("A5",N2,N2,N1,N1,N2); } time5 = timer.seconds()/R; } { Kokkos::Timer timer; for(int r=0; r<R; r++) { Kokkos::View<double******,Layout> a("A6",N2,N1,N1,N1,N1,N2); } time6 = timer.seconds()/R; } { Kokkos::Timer timer; for(int r=0; r<R; r++) { Kokkos::View<double*******,Layout> a("A7",N2,N1,N1,N1,N1,N1,N1); } time7 = timer.seconds()/R; } { Kokkos::Timer timer; for(int r=0; r<R; r++) { Kokkos::View<double********,Layout> a("A8",N1,N1,N1,N1,N1,N1,N1,N1); } time8 = timer.seconds()/R; } #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) { Kokkos::Timer timer; for(int r=0;r<R;r++) { double* a_ptr = (double*) Kokkos::kokkos_malloc("A", sizeof(double)*N8); Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) { a_ptr[i] = 0.0; }); Kokkos::kokkos_free(a_ptr); } time_raw = timer.seconds()/R; } #endif double size = 1.0*N8*8/1024/1024; printf(" Raw: %lf s %lf MB %lf GB/s\n",time_raw,size,size/1024/time_raw); printf(" Rank1: %lf s %lf MB %lf GB/s\n",time1,size,size/1024/time1); printf(" Rank2: %lf s %lf MB %lf GB/s\n",time2,size,size/1024/time2); printf(" Rank3: %lf s %lf MB %lf GB/s\n",time3,size,size/1024/time3); printf(" Rank4: %lf s %lf MB %lf 
GB/s\n",time4,size,size/1024/time4); printf(" Rank5: %lf s %lf MB %lf GB/s\n",time5,size,size/1024/time5); printf(" Rank6: %lf s %lf MB %lf GB/s\n",time6,size,size/1024/time6); printf(" Rank7: %lf s %lf MB %lf GB/s\n",time7,size,size/1024/time7); printf(" Rank8: %lf s %lf MB %lf GB/s\n",time8,size,size/1024/time8); }
int main(int argc, char** argv) { printf("Running MD Skeleton\n"); /* Thread numbers for Host */ int num_threads = 1; int teams = 1; int device = 0; // Default device for GPU runs /* avoid unused variable warnings */ (void)num_threads; (void)teams; (void)device; /* Default value for number of force calculations */ int iter = 100; /* Default value for system size (4*nx*ny*nz atoms) * nx, ny and nz are set to system_size if not specififed on commandline */ int system_size = 20; int nx = -1; int ny = -1; int nz = -1; int neighbor_size = 1; // Default bin size for neighbor list construction double rho = 0.8442; // Number density of the system double delta = 0; // Scaling factor for random offsets of atom positions /* read in command-line arguments */ for(int i = 0; i < argc; i++) { if((strcmp(argv[i], "-t") == 0) || (strcmp(argv[i], "--num_threads") == 0)) { num_threads = atoi(argv[++i]); continue; } if((strcmp(argv[i], "--teams") == 0)) { teams = atoi(argv[++i]); continue; } if((strcmp(argv[i], "-d") == 0) || (strcmp(argv[i], "--device") == 0)) { device = atoi(argv[++i]); continue; } if((strcmp(argv[i], "--delta") == 0)) { delta = atof(argv[++i]); continue; } if((strcmp(argv[i], "-i") == 0) || (strcmp(argv[i], "--iter") == 0)) { iter = atoi(argv[++i]); continue; } if((strcmp(argv[i], "-rho") == 0)) { rho = atoi(argv[++i]); continue; } if((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--size") == 0)) { system_size = atoi(argv[++i]); continue; } if((strcmp(argv[i], "-nx") == 0)) { nx = atoi(argv[++i]); continue; } if((strcmp(argv[i], "-ny") == 0)) { ny = atoi(argv[++i]); continue; } if((strcmp(argv[i], "-nz") == 0)) { nz = atoi(argv[++i]); continue; } if((strcmp(argv[i], "-b") == 0) || (strcmp(argv[i], "--neigh_bins") == 0)) { neighbor_size = atoi(argv[++i]); continue; } } if( nx < 0 ) nx = system_size; if( ny < 0 ) ny = system_size; if( nz < 0 ) nz = system_size; printf("-> Init Device\n"); #if defined( KOKKOS_HAVE_CUDA ) 
Kokkos::HostSpace::execution_space::initialize(teams*num_threads); Kokkos::Cuda::SelectDevice select_device(device); Kokkos::Cuda::initialize(select_device); #elif defined( KOKKOS_HAVE_OPENMP ) Kokkos::OpenMP::initialize(teams*num_threads); #elif defined( KOKKOS_HAVE_PTHREAD ) Kokkos::Threads::initialize(teams*num_threads); #endif System system; system.neigh_cut = 2.8; system.force_cut = 2.5; system.force_cutsq = system.force_cut*system.force_cut; system.delta = delta; printf("-> Build system\n"); create_system(system,nx,ny,nz,rho); printf("-> Created %i atoms and %i ghost atoms\n",system.nlocal,system.nghost); system.nbinx = system.box.xprd/neighbor_size+1; system.nbiny = system.box.yprd/neighbor_size+1; system.nbinz = system.box.zprd/neighbor_size+1; printf("-> Building Neighborlist\n"); neigh_setup(system); neigh_build(system); double2 ev = force(system,1); printf("-> Calculate Energy: %f Virial: %f\n",ev.x,ev.y); printf("-> Running %i force calculations\n",iter); Kokkos::Timer timer; for(int i=0;i<iter;i++) { force(system,0); } double time = timer.seconds(); printf("Time: %e s for %i iterations with %i atoms\n",time,iter,system.nlocal); execution_space::finalize(); }