// This is a reduction operator which now takes as first argument the // TeamPolicy member_type. Every member of the team contributes to the // total sum. // It is helpful to think of this operator as a parallel region for a team // (i.e. every team member is active and will execute the code). KOKKOS_INLINE_FUNCTION void operator() ( const team_member & thread, int& sum) const { sum+=1; // The TeamPolicy<>::member_type provides functions to query the multi // dimensional index of a thread as well as the number of thread-teams and the size // of each team. printf("Hello World: %i %i // %i %i\n",thread.league_rank(),thread.team_rank(),thread.league_size(),thread.team_size()); }
// Team-parallel sparse matrix-vector product for one block of rows.
//
// Each team (league member) handles `rows_per_team` consecutive rows of m_A.
// The rows of the block are distributed over the threads of the team, and the
// entries of a single row are reduced across the vector lanes of a thread.
// For every row the result written is
//   m_y(row) = alpha * sum_k val_k * m_x(col_k)                 if dobeta == 0
//   m_y(row) = beta * m_y(row) + alpha * sum_k val_k * m_x(col_k)  otherwise
// where val_k is the (optionally conjugated) matrix entry.
KOKKOS_INLINE_FUNCTION
void operator() (const team_member& dev) const
{
  Kokkos::parallel_for (Kokkos::TeamThreadRange (dev, 0, rows_per_team),
    [&] (const ordinal_type& loop) {
      // Global row index: first row of this team's block plus the local offset.
      const ordinal_type iRow =
        static_cast<ordinal_type> ( dev.league_rank() ) * rows_per_team + loop;

      // The last team may own fewer than rows_per_team valid rows.
      if (iRow < m_A.numRows ()) {
        const KokkosSparse::SparseRowViewConst<AMatrix> row = m_A.rowConst (iRow);
        const ordinal_type row_length = static_cast<ordinal_type> (row.length);

        // Dot product of the row with m_x, reduced over the vector lanes.
        value_type sum = 0;
        Kokkos::parallel_reduce (Kokkos::ThreadVectorRange (dev, row_length),
          [&] (const ordinal_type& iEntry, value_type& lsum) {
            const value_type val = conjugate ?
              ATV::conj (row.value(iEntry)) :
              row.value(iEntry);
            const auto col = row.colidx(iEntry);
            lsum += val * m_x(col);
          }, sum);

        // Only one vector lane per thread scales the result and writes m_y.
        Kokkos::single (Kokkos::PerThread (dev), [&] () {
          sum *= alpha;
          if (dobeta != 0) {
            m_y(iRow) = beta * m_y(iRow) + sum;
          } else {
            // dobeta == 0: overwrite without reading the old value of m_y.
            m_y(iRow) = sum ;
          }
        });
      }
    });
}
// Three-level (team / thread / vector) example.
//
// For league member `i`, count for every j how many entries data(i,j,k) are
// multiples of 4, store those counts in a team-shared array, then count how
// many neighbouring (j, j+1) segments have equal counts and add that number
// atomically to the global sum gsum().
KOKKOS_INLINE_FUNCTION
void operator() (const team_member& thread) const
{
  const int league_idx = thread.league_rank();
  const int n_segments = data.dimension_1();

  // Per-team shared array holding one count per segment j.
  shared_1d_int count(thread.team_shmem(), n_segments);

  // Distribute the segments over the threads of the team.
  Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, n_segments),
    [=] (const int& j) {
      int tsum;
      // Vector-level reduction over the innermost dimension: count the
      // values divisible by 4. Every vector lane receives the same reduced
      // value (tsum); it is broadcast to all lanes.
      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(thread, data.dimension_2()),
        [=] (const int& k, int& vsum) {
          if (data(league_idx, j, k) % 4 == 0) {
            vsum += 1;
          } else {
            vsum += 0;
          }
        }, tsum);

      // Store the result exactly once per thread: only a single vector lane
      // writes to the shared array.
      Kokkos::single(Kokkos::PerThread(thread), [=] () {
        count(j) = tsum;
      });
    });

  // All shared-memory writes from the parallel_for must be complete before
  // any thread reads the counts.
  thread.team_barrier();

  // Team-level reduction: with one vector lane per thread, check how many
  // consecutive segments hold the same number of values divisible by 4.
  // The reduced value is broadcast to every team member (and vector lane).
  int team_sum = 0;
  Kokkos::parallel_reduce(Kokkos::TeamThreadRange(thread, n_segments - 1),
    [=] (const int& j, int& thread_sum) {
      // Adding to thread_sum directly is not valid here. Use a single with
      // broadcast instead: thread_sum is the input that initializes `sum`,
      // and the final value of `sum` is broadcast to all vector lanes of
      // the thread.
      Kokkos::single(Kokkos::PerThread(thread), [=] (int& sum) {
        if (count(j) == count(j + 1)) {
          sum++;
        }
      }, thread_sum);
    }, team_sum);

  // One thread and vector lane of the team adds the team result to the
  // global value.
  Kokkos::single(Kokkos::PerTeam(thread), [=] () {
    Kokkos::atomic_add(&gsum(), team_sum);
  });
}
// Updates p(index) = beta*p(index) + r(index) for one row of the field.
//
// The team with league rank L works on row (L + halo_depth); the threads of
// the team share the column range [halo_depth, y - halo_depth).
// NOTE(review): `y` is used both as the row stride and as the column extent -
// presumably the padded row width; confirm against the caller.
KOKKOS_INLINE_FUNCTION
void operator()(const team_member& team) const
{
  // Flat offset of the first element of this team's row.
  const int row_start = (team.league_rank() + halo_depth)*y;

  Kokkos::parallel_for(
    Kokkos::TeamThreadRange(team, halo_depth, y-halo_depth),
    [&] (const int &col)
    {
      const int cell = row_start + col;
      p(cell) = beta*p(cell) + r(cell);
    });
}
// Updates u and r for one row and accumulates the sum of r*r.
//
// For every column in [halo_depth, y - halo_depth) of row
// (league_rank + halo_depth):
//   u(index) += alpha*p(index)
//   r(index) -= alpha*w(index)
// The squares of the updated r values are reduced over the team and added to
// the reduction argument `rrn` by exactly one thread of the team.
KOKKOS_INLINE_FUNCTION
void operator()(const team_member& team, value_type& rrn) const
{
  // Flat offset of the first element of this team's row.
  const int row_start = (team.league_rank() + halo_depth)*y;

  // Team-wide partial sum of r*r for this row.
  double row_rrn = 0.0;

  Kokkos::parallel_reduce(
    Kokkos::TeamThreadRange(team, halo_depth, y-halo_depth),
    [&] (const int &col, double& lane_rrn)
    {
      const int cell = row_start + col;

      u(cell) += alpha*p(cell);
      r(cell) -= alpha*w(cell);

      // Uses the freshly updated residual.
      lane_rrn += r(cell)*r(cell);
    },
    row_rrn);

  // Contribute the team result once, not once per thread.
  Kokkos::single(Kokkos::PerTeam(team), [&] ()
  {
    rrn += row_rrn;
  });
}
// Computes w = SMVP(p) for one row and accumulates the sum of smvp*p.
//
// The team with league rank L works on row (L + halo_depth); its threads
// share the column range [halo_depth, y - halo_depth). The products
// smvp*p(index) are reduced over the team and added to the reduction
// argument `pw` by exactly one thread of the team.
KOKKOS_INLINE_FUNCTION void operator()(const team_member& team, value_type& pw) const
{
  // Team-wide partial sum for this row.
  double pw_team = 0.0;
  // Flat offset of the first element of this team's row.
  const int team_offset = (team.league_rank() + halo_depth)*y;
  Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, halo_depth, y-halo_depth), [&] (const int &j, double& pw_thread)
  {
    const int index = team_offset + j;
    // NOTE(review): SMVP receives only `p` but yields a per-cell value, so it
    // presumably is a macro that reads `index` (and possibly other names in
    // scope) by name - do not rename these locals without checking its
    // definition.
    const double smvp = SMVP(p);
    w(index) = smvp;
    pw_thread += smvp*p(index);
  }, pw_team);
  // Contribute the team result once, not once per thread.
  Kokkos::single(Kokkos::PerTeam(team), [&] ()
  {
    pw += pw_team;
  });
}
// Builds a histogram of consecutive value pairs (data(n), data(n+1)) for one
// chunk of the input, staging the counts in team scratch memory before
// merging them atomically into the global `histogram`.
//
// Rows of the TEAM_SIZE x TEAM_SIZE local histogram are split over the team:
// the thread with rank t handles rows t, t + team_size, ... in every phase
// (clear, count, merge).
KOKKOS_INLINE_FUNCTION
void operator() (const team_member& dev) const
{
  // Team scratch allocations: local histogram first, then the staged chunk
  // (chunk_size + 1 values, so the last pair of the chunk is complete).
  Kokkos::View<int**,Kokkos::MemoryUnmanaged> l_histogram(dev.team_shmem(),TEAM_SIZE,TEAM_SIZE);
  Kokkos::View<int*,Kokkos::MemoryUnmanaged> l_data(dev.team_shmem(),chunk_size+1);

  const int rank       = dev.team_rank();
  const int stride     = dev.team_size();
  const int chunk_base = dev.league_rank() * chunk_size;

  // Stage this team's chunk of the input into scratch memory.
  for (int n = rank; n < chunk_size+1; n += stride) {
    l_data(n) = data(chunk_base + n);
  }

  // Clear the rows of the local histogram owned by this thread.
  for (int row = rank; row < TEAM_SIZE; row += stride) {
    for (int col = 0; col < TEAM_SIZE; col++) {
      l_histogram(row,col) = 0;
    }
  }

  // The staged data is read by every thread, so all writes must be done.
  dev.team_barrier();

  // Count pairs: a pair (first, second) increments entry (first, second),
  // and only the thread owning row `first` performs the increment.
  for (int n = 0; n < chunk_size; n++) {
    const int first  = l_data(n);
    const int second = l_data(n+1);
    for (int row = rank; row < TEAM_SIZE; row += stride) {
      if (first != row) continue;
      for (int col = 0; col < TEAM_SIZE; col++) {
        if (second == col) {
          ++l_histogram(row,col);
        }
      }
    }
  }

  // Merge this thread's rows into the global histogram; other teams update
  // the same entries, hence the atomic add.
  for (int row = rank; row < TEAM_SIZE; row += stride) {
    for (int col = 0; col < TEAM_SIZE; col++) {
      Kokkos::atomic_fetch_add(&histogram(row,col), l_histogram(row,col));
    }
  }

  dev.team_barrier();
}