int main(int narg, char* args[]) { Kokkos::initialize(narg,args); int chunk_size = 1024; int nchunks = 100000; //1024*1024; Kokkos::DualView<int*> data("data",nchunks*chunk_size+1); srand(1231093); for(int i = 0; i < (int) data.dimension_0(); i++) { data.h_view(i) = rand()%TEAM_SIZE; } data.modify<Host>(); data.sync<Device>(); Kokkos::DualView<int**> histogram("histogram",TEAM_SIZE,TEAM_SIZE); Kokkos::Timer timer; // threads/team is automatically limited to maximum supported by the device. Kokkos::parallel_for( team_policy( nchunks , TEAM_SIZE ) , find_2_tuples(chunk_size,data,histogram) ); Kokkos::fence(); double time = timer.seconds(); histogram.sync<Host>(); printf("Time: %f \n\n",time); int sum = 0; for(int k=0; k<TEAM_SIZE; k++) { for(int l=0; l<TEAM_SIZE; l++) { printf("%i ",histogram.h_view(k,l)); sum += histogram.h_view(k,l); } printf("\n"); } printf("Result: %i %i\n",sum,chunk_size*nchunks); Kokkos::finalize(); }
int main(int narg, char* args[]) { Kokkos::initialize(narg,args); int chunk_size = 1024; int nchunks = 100000; //1024*1024; Kokkos::DualView<int*> data("data",nchunks*chunk_size+1); srand(1231093); for(int i = 0; i < data.dimension_0(); i++) { data.h_view(i) = rand()%TS; } data.modify<Host>(); data.sync<Device>(); Kokkos::DualView<int**> histogram("histogram",TS,TS); Kokkos::Impl::Timer timer; Kokkos::parallel_for( Kokkos::ParallelWorkRequest(nchunks,TS<Device::team_max()?TS:Device::team_max()), find_2_tuples(chunk_size,data,histogram)); Kokkos::fence(); double time = timer.seconds(); histogram.sync<Host>(); printf("Time: %lf \n\n",time); int sum = 0; for(int k=0; k<TS; k++) { for(int l=0; l<TS; l++) { printf("%i ",histogram.h_view(k,l)); sum += histogram.h_view(k,l); } printf("\n"); } printf("Result: %i %i\n",sum,chunk_size*nchunks); Kokkos::finalize(); }