size_t test_global_to_local_ids(unsigned num_ids, unsigned capacity, unsigned num_find_iterations) { typedef Device execution_space; typedef typename execution_space::size_type size_type; typedef Kokkos::View<uint32_t*,execution_space> local_id_view; typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view; double elasped_time = 0; Kokkos::Timer timer; local_id_view local_2_global("local_ids", num_ids); global_id_view global_2_local(capacity); int shiftw = 15; //create elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "allocate: " << elasped_time << std::endl; timer.reset(); // generate unique ids { generate_ids<Device> gen(local_2_global); } // generate elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "generate: " << elasped_time << std::endl; timer.reset(); { fill_map<Device> fill(global_2_local, local_2_global); } // fill elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "fill: " << elasped_time << std::endl; timer.reset(); size_t num_errors = global_2_local.failed_insert(); if (num_errors == 0u) { for (unsigned i=0; i<num_find_iterations; ++i) { find_test<Device> find(global_2_local, local_2_global,num_errors); } // find elasped_time = timer.seconds(); std::cout << std::setw(shiftw) << "lookup: " << elasped_time << std::endl; } else { std::cout << " !!! Fill Failed !!!" << std::endl; } return num_errors; }
int main(int narg, char* arg[]) { Kokkos::initialize(narg,arg); int size = 1000000; // Create Views idx_type idx("Idx",size,64); view_type dest("Dest",size); view_type src("Src",size); srand(134231); Kokkos::fence(); // When using UVM Cuda views can be accessed on the Host directly for(int i=0; i<size; i++) { for(int j=0; j<int(idx.dimension_1()); j++) idx(i,j) = (size + i + (rand()%500 - 250))%size; } Kokkos::fence(); // Run on the device // This will cause a sync of idx to the device since it was modified on the host Kokkos::Timer timer; Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src)); Kokkos::fence(); double sec1_dev = timer.seconds(); // No data transfer will happen now, since nothing is accessed on the host timer.reset(); Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src)); Kokkos::fence(); double sec2_dev = timer.seconds(); // Run on the host // This will cause a sync back to the host of dest which was changed on the device // Compare runtime here with the dual_view example: dest will be copied back in 4k blocks // when they are accessed the first time during the parallel_for. Due to the latency of a memcpy // this gives lower effective bandwidth when doing a manual copy via dual views timer.reset(); Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src)); Kokkos::fence(); double sec1_host = timer.seconds(); // No data transfers will happen now timer.reset(); Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src)); Kokkos::fence(); double sec2_host = timer.seconds(); printf("Device Time with Sync: %e without Sync: %e \n",sec1_dev,sec2_dev); printf("Host Time with Sync: %e without Sync: %e \n",sec1_host,sec2_host); Kokkos::finalize(); }
int main(int argc, char* args[]) { if (argc != 3){ printf("Please pass two integers on the command line\n"); } else { // Initialize Kokkos Kokkos::initialize(argc,args); int size = atoi(args[1]); int samples = atoi(args[2]); // Create two random number generator pools one for 64bit states and one for 1024 bit states // Both take an 64 bit unsigned integer seed to initialize a Random_XorShift64 generator which // is used to fill the generators of the pool. Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857); Kokkos::Random_XorShift1024_Pool<> rand_pool1024(5374857); Kokkos::DualView<uint64_t*> vals("Vals",size*samples); // Run some performance comparisons Kokkos::Timer timer; Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples)); Kokkos::fence(); timer.reset(); Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples)); Kokkos::fence(); double time_64 = timer.seconds(); Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift1024_Pool<> >(vals.d_view,rand_pool1024,samples)); Kokkos::fence(); timer.reset(); Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift1024_Pool<> >(vals.d_view,rand_pool1024,samples)); Kokkos::fence(); double time_1024 = timer.seconds(); printf("#Time XorShift64*: %e %e\n",time_64,1.0e-9*samples*size/time_64 ); printf("#Time XorShift1024*: %e %e\n",time_1024,1.0e-9*samples*size/time_1024 ); Kokkos::deep_copy(vals.h_view,vals.d_view); Kokkos::finalize(); } return 0; }
// Library-initialisation hook (presumably the Kokkos profiling-interface
// entry point - confirm against the tool loader).
//
// Resets the file-level bookkeeping: clears the space count, zeroes the 16
// per-space size slots and restarts the timer. None of the four arguments is
// read.
extern "C" void kokkosp_init_library(const int loadSeq,
                                     const uint64_t interfaceVer,
                                     const uint32_t devInfoCount,
                                     void* deviceInfo) {
  num_spaces = 0;

  int slot = 0;
  while (slot < 16) {
    space_size[slot] = 0;
    ++slot;
  }

  timer.reset();
}
void test_dynrankview_op_perf( const int par_size ) { typedef DeviceType execution_space; typedef typename execution_space::size_type size_type; const size_type dim2 = 900; const size_type dim3 = 300; double elapsed_time_view = 0; double elapsed_time_compview = 0; double elapsed_time_strideview = 0; double elapsed_time_view_rank7 = 0; double elapsed_time_drview = 0; double elapsed_time_compdrview = 0; Kokkos::Timer timer; { Kokkos::View<double***,DeviceType> testview("testview",par_size,dim2,dim3); typedef InitViewFunctor<DeviceType> FunctorType; timer.reset(); Kokkos::RangePolicy<DeviceType> policy(0,par_size); Kokkos::parallel_for( policy , FunctorType(testview) ); DeviceType::fence(); elapsed_time_view = timer.seconds(); std::cout << " View time (init only): " << elapsed_time_view << std::endl; timer.reset(); Kokkos::View<double*,DeviceType> sumview("sumview",par_size); Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) ); DeviceType::fence(); elapsed_time_compview = timer.seconds(); std::cout << " View sum computation time: " << elapsed_time_view << std::endl; Kokkos::View<double***,Kokkos::LayoutStride, DeviceType> teststrideview = Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL,Kokkos::ALL); typedef InitStrideViewFunctor<DeviceType> FunctorStrideType; timer.reset(); Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) ); DeviceType::fence(); elapsed_time_strideview = timer.seconds(); std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl; } { Kokkos::View<double*******,DeviceType> testview("testview",par_size,dim2,dim3,1,1,1,1); typedef InitViewRank7Functor<DeviceType> FunctorType; timer.reset(); Kokkos::RangePolicy<DeviceType> policy(0,par_size); Kokkos::parallel_for( policy , FunctorType(testview) ); DeviceType::fence(); elapsed_time_view_rank7 = timer.seconds(); std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl; } { 
Kokkos::DynRankView<double,DeviceType> testdrview("testdrview",par_size,dim2,dim3); typedef InitDynRankViewFunctor<DeviceType> FunctorType; timer.reset(); Kokkos::RangePolicy<DeviceType> policy(0,par_size); Kokkos::parallel_for( policy , FunctorType(testdrview) ); DeviceType::fence(); elapsed_time_drview = timer.seconds(); std::cout << " DynRankView time (init only): " << elapsed_time_drview << std::endl; timer.reset(); Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size); Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) ); DeviceType::fence(); elapsed_time_compdrview = timer.seconds(); std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl; } std::cout << " Ratio of View to DynRankView time: " << elapsed_time_view / elapsed_time_drview << std::endl; //expect < 1 std::cout << " Ratio of View to DynRankView sum computation time: " << elapsed_time_compview / elapsed_time_compdrview << std::endl; //expect < 1 std::cout << " Ratio of View to View Rank7 time: " << elapsed_time_view / elapsed_time_view_rank7 << std::endl; //expect < 1 std::cout << " Ratio of StrideView to DynRankView time: " << elapsed_time_strideview / elapsed_time_drview << std::endl; //expect < 1 std::cout << " Ratio of DynRankView to View Rank7 time: " << elapsed_time_drview / elapsed_time_view_rank7 << std::endl; //expect ? timer.reset(); } //end test_dynrankview
void sort_array( const size_t array_length /* length of spans of array to sort */ , const size_t total_length /* total length of array */ , const int print = 1 ) { typedef Device execution_space ; typedef Kokkos::View<int*,Device> device_array_type ; #if defined( KOKKOS_HAVE_CUDA ) typedef typename Kokkos::Impl::if_c< Kokkos::Impl::is_same< Device , Kokkos::Cuda >::value , Kokkos::View<int*,Kokkos::Cuda::array_layout,Kokkos::CudaHostPinnedSpace> , typename device_array_type::HostMirror >::type host_array_type ; #else typedef typename device_array_type::HostMirror host_array_type ; #endif Kokkos::Timer timer; const device_array_type work_array("work_array" , array_length ); const host_array_type host_array("host_array" , total_length ); std::cout << "sort_array length( " << total_length << " )" << " in chunks( " << array_length << " )" << std::endl ; double sec = timer.seconds(); std::cout << "declaring Views took " << sec << " seconds" << std::endl; timer.reset(); for ( size_t i = 0 ; i < total_length ; ++i ) { host_array(i) = ( lrand48() * total_length ) >> 31 ; } sec = timer.seconds(); std::cout << "initializing " << total_length << " elements on host took " << sec << " seconds" << std::endl; timer.reset(); double sec_copy_in = 0 ; double sec_sort = 0 ; double sec_copy_out = 0 ; double sec_error = 0 ; size_t error_count = 0 ; for ( size_t begin = 0 ; begin < total_length ; begin += array_length ) { const size_t end = begin + array_length < total_length ? 
begin + array_length : total_length ; const std::pair<size_t,size_t> host_range(begin,end); const host_array_type host_subarray = Kokkos::subview( host_array , host_range ); timer.reset(); Kokkos::deep_copy( work_array , host_subarray ); sec_copy_in += timer.seconds(); timer.reset(); SortView< execution_space >( work_array , 0 , end - begin ); sec_sort += timer.seconds(); timer.reset(); Kokkos::deep_copy( host_subarray , work_array ); sec_copy_out += timer.seconds(); timer.reset(); for ( size_t i = begin + 1 ; i < end ; ++i ) { if ( host_array(i) < host_array(i-1) ) ++error_count ; } sec_error += timer.seconds(); timer.reset(); } std::cout << "copy to device " << sec_copy_in << " seconds" << std::endl << "sort on device " << sec_sort << " seconds" << std::endl << "copy from device " << sec_copy_out << " seconds" << std::endl << "errors " << error_count << " took " << sec_error << " seconds" << std::endl ; }