// Team kernel: tallies adjacent value pairs (data(p), data(p+1)) of this
// team's chunk into a TEAM_SIZE x TEAM_SIZE table and adds that table to
// `histogram`. Only pairs whose two values both lie in [0, TEAM_SIZE) match
// a table entry; every other pair is ignored.
//
// dev: team handle providing league/team ranks, team size, the team barrier
//      and the team_shmem() handle both scratch views are constructed from.
KOKKOS_INLINE_FUNCTION
  void operator() ( const team_member & dev) const {
    Kokkos::View<int**,Kokkos::MemoryUnmanaged> pair_counts(dev.team_shmem(),TEAM_SIZE,TEAM_SIZE);
    Kokkos::View<int*,Kokkos::MemoryUnmanaged> chunk(dev.team_shmem(),chunk_size+1);

    const int my_rank     = dev.team_rank();
    const int n_threads   = dev.team_size();
    const int chunk_begin = dev.league_rank() * chunk_size;

    // Stage chunk_size+1 entries: the extra one is the successor of the
    // chunk's last element, so that element can still form a pair.
    for(int pos = my_rank; pos < chunk_size+1; pos += n_threads)
      chunk(pos) = data(chunk_begin + pos);

    // Each thread clears the rows it owns (my_rank, my_rank + n_threads, ...).
    for(int row = my_rank; row < TEAM_SIZE; row += n_threads)
      for(int col = 0; col < TEAM_SIZE; ++col)
        pair_counts(row,col) = 0;

    // The counting pass reads chunk entries staged by other team members.
    dev.team_barrier();

    for(int pos = 0; pos < chunk_size; ++pos) {
      const int first  = chunk(pos);
      const int second = chunk(pos+1);
      // A thread only ever increments rows it owns, so no two threads of the
      // team update the same pair_counts entry.
      for(int row = my_rank; row < TEAM_SIZE; row += n_threads) {
        if(first != row) continue;
        for(int col = 0; col < TEAM_SIZE; ++col) {
          if(second == col)
            pair_counts(row,col)++;
        }
      }
    }

    // Flush the owned rows into the global table; the add is atomic because
    // `histogram` is not private to this team.
    for(int row = my_rank; row < TEAM_SIZE; row += n_threads)
      for(int col = 0; col < TEAM_SIZE; ++col)
        Kokkos::atomic_fetch_add(&histogram(row,col),pair_counts(row,col));

    // NOTE(review): trailing barrier kept from the original; nothing in the
    // visible code after this point depends on it.
    dev.team_barrier();
  }
// Example no. 2
// 0
  // Team kernel: for the league entry handled by this team, counts per index j
  // how many data(i,j,k) are multiples of 4, then counts how many neighbouring
  // indices j / j+1 ended up with the same count and atomically adds that
  // number to gsum().
  //
  // thread: team handle providing the league rank, the team barrier and the
  //         team_shmem() handle the per-team array is constructed from.
  KOKKOS_INLINE_FUNCTION
  void operator() ( const team_member & thread) const {
    const int league_idx = thread.league_rank();

    // `auto` keeps the exact return type of dimension_N(), so the range
    // arithmetic below behaves as it would on the direct call.
    const auto n_rows = data.dimension_1();
    const auto n_cols = data.dimension_2();

    // Per-team array with one slot per index of dimension 1.
    shared_1d_int multiples_per_row(thread.team_shmem(),n_rows);

    // Distribute the indices of dimension 1 over the threads of the team.
    Kokkos::parallel_for(Kokkos::TeamThreadRange(thread,n_rows), [=] (const int& row) {
      int row_total;
      // Vector-level reduction over the innermost dimension: number of
      // entries divisible by 4. Every vector lane receives the same
      // reduction result (row_total) back.
      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(thread,n_cols), [=] (const int& col, int & lane_sum) {
        if(data(league_idx,row,col) % 4 == 0)
          lane_sum += 1;
      },row_total);

      // Store the result once per thread, i.e. from a single vector lane.
      Kokkos::single(Kokkos::PerThread(thread),[=] () {
        multiples_per_row(row) = row_total;
      });
    });

    // All writes to multiples_per_row must be complete before any thread
    // compares neighbouring slots.
    thread.team_barrier();

    // Team-level reduction: number of consecutive slots holding equal counts.
    // The result is broadcast to every team member (and every vector lane).
    int equal_neighbours = 0;
    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(thread, n_rows-1), [=] (const int& row, int& thread_sum) {
      // Adding to thread_sum directly is not valid here; go through
      // single() with a broadcast argument instead. thread_sum initializes
      // `sum`, and the final value of `sum` is broadcast to all vector
      // lanes of the thread.
      Kokkos::single(Kokkos::PerThread(thread),[=] (int& sum) {
        sum += (multiples_per_row(row)==multiples_per_row(row+1)) ? 1 : 0;
      },thread_sum);
    },equal_neighbours);

    // One thread and vector lane of the team contributes the team result
    // to the global sum.
    Kokkos::single(Kokkos::PerTeam(thread),[=] () {
      Kokkos::atomic_add(&gsum(),equal_neighbours);
    });
  }