// Team kernel: counts, for this team's chunk of `data`, how often each
// ordered pair of adjacent values (a, b) occurs, and adds those counts into
// the global `histogram(a, b)`.
//
// The chunk starts at league_rank() * chunk_size and chunk_size + 1 elements
// are staged so that the last element of the chunk still has a successor.
// Only pairs with both values in [0, TEAM_SIZE) are counted.
KOKKOS_INLINE_FUNCTION
void operator() ( const team_member & dev) const {
  // Team-local scratch views, both obtained from dev.team_shmem().
  Kokkos::View<int**,Kokkos::MemoryUnmanaged> l_histogram(dev.team_shmem(), TEAM_SIZE, TEAM_SIZE);
  Kokkos::View<int*,Kokkos::MemoryUnmanaged> l_data(dev.team_shmem(), chunk_size + 1);

  const int my_rank = dev.team_rank();
  const int n_threads = dev.team_size();
  const int chunk_begin = dev.league_rank() * chunk_size;

  // Stage this team's chunk (plus one trailing element) into l_data.
  for (int pos = my_rank; pos < chunk_size + 1; pos += n_threads) {
    l_data(pos) = data(chunk_begin + pos);
  }

  // Each thread clears the histogram rows it owns (rows my_rank, my_rank + n_threads, ...).
  for (int row = my_rank; row < TEAM_SIZE; row += n_threads) {
    for (int col = 0; col < TEAM_SIZE; ++col) {
      l_histogram(row, col) = 0;
    }
  }

  // l_data is read by every thread below, so all staging writes must be done first.
  dev.team_barrier();

  // Count adjacent pairs. A thread only ever increments rows it owns, so the
  // local histogram needs no atomics.
  for (int pos = 0; pos < chunk_size; ++pos) {
    const int first = l_data(pos);
    const int second = l_data(pos + 1);
    for (int row = my_rank; row < TEAM_SIZE; row += n_threads) {
      if (first != row) {
        continue;
      }
      for (int col = 0; col < TEAM_SIZE; ++col) {
        if (second == col) {
          l_histogram(row, col)++;
        }
      }
    }
  }

  // Merge the rows owned by this thread into the global histogram; other
  // teams update the same entries, hence the atomic add.
  for (int row = my_rank; row < TEAM_SIZE; row += n_threads) {
    for (int col = 0; col < TEAM_SIZE; ++col) {
      Kokkos::atomic_fetch_add(&histogram(row, col), l_histogram(row, col));
    }
  }
  dev.team_barrier();
}
// Team kernel: for the slice data(i, :, :) selected by this team's league
// rank, counts per row j how many entries are multiples of 4, then counts how
// many consecutive rows (j, j+1) have the same count, and atomically adds
// that number to the global counter gsum().
KOKKOS_INLINE_FUNCTION
void operator() ( const team_member & thread) const {
  const int i = thread.league_rank();

  // Allocate a shared array for the team, one entry per row of the slice.
  shared_1d_int count(thread.team_shmem(), data.dimension_1());

  // With each team run a parallel_for with its threads.
  Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, data.dimension_1()), [=] (const int& j) {
    // Receives the result of the vector-level reduction below.
    int tsum = 0;

    // Run a vector loop reduction over the inner dimension of data and
    // count how many values are multiples of 4.
    // Every vector lane gets the same reduction value (tsum) back; it is
    // broadcast to all vector lanes.
    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(thread, data.dimension_2()), [=] (const int& k, int & vsum) {
      vsum += (data(i,j,k) % 4 == 0) ? 1 : 0;
    }, tsum);

    // Make sure only one vector lane adds the reduction value to the shared
    // array, i.e. execute the assignment only once per thread.
    Kokkos::single(Kokkos::PerThread(thread), [=] () {
      count(j) = tsum;
    });
  });

  // Wait for all threads to finish the parallel_for so that all shared
  // memory writes are done before count is read.
  thread.team_barrier();

  // Check with one vector lane from each thread how many consecutive data
  // segments have the same number of values divisible by 4.
  // The team reduction value is again broadcast to every team member (and
  // every vector lane). The range stops at dimension_1()-1 because each
  // iteration reads count(j+1).
  int team_sum = 0;
  Kokkos::parallel_reduce(Kokkos::TeamThreadRange(thread, data.dimension_1()-1), [=] (const int& j, int& thread_sum) {
    // It is not valid to directly add to thread_sum here.
    // Use a single function with broadcast instead: thread_sum is passed as
    // the in/out argument of the operator (it initializes sum), and the end
    // value of sum is broadcast to all vector lanes in the thread.
    Kokkos::single(Kokkos::PerThread(thread), [=] (int& sum) {
      if (count(j) == count(j+1)) sum++;
    }, thread_sum);
  }, team_sum);

  // Add the team_sum to the global value with one thread and one vector
  // lane of the team.
  Kokkos::single(Kokkos::PerTeam(thread), [=] () {
    Kokkos::atomic_add(&gsum(), team_sum);
  });
}