Exemplo n.º 1
0
 // Team-level reduction operator. Its first argument is the TeamPolicy
 // member_type handle and its second is the reduction accumulator.
 // Think of the body as a parallel region for a team: every team member is
 // active, executes this code, and contributes one to the total sum.
 KOKKOS_INLINE_FUNCTION
 void operator() ( const team_member & thread, int& sum) const {
   // The member handle exposes this thread's coordinates (which team it is
   // in, and its rank inside that team) together with the launch shape
   // (number of teams and threads per team).
   const auto leagueRank = thread.league_rank();
   const auto teamRank   = thread.team_rank();
   const auto leagueSize = thread.league_size();
   const auto teamSize   = thread.team_size();

   // One contribution per executing team member.
   ++sum;

   printf("Hello World: %i %i // %i %i\n", leagueRank, teamRank, leagueSize, teamSize);
 }
Exemplo n.º 2
0
  // Team-parallel sparse matrix-vector product over a band of rows.
  // Each team (indexed by league_rank) owns rows_per_team consecutive rows;
  // the team's threads split those rows, and each thread's vector lanes
  // split the entries of one row. Result: m_y(row) = beta*m_y(row) + alpha*(A*x)(row),
  // or just alpha*(A*x)(row) when dobeta is 0.
  KOKKOS_INLINE_FUNCTION void
  operator() (const team_member& dev) const
  {
    Kokkos::parallel_for(Kokkos::TeamThreadRange(dev,0,rows_per_team), [&] (const ordinal_type& rowInTeam) {

      // Global row handled by this thread in this iteration.
      const ordinal_type rowIdx = static_cast<ordinal_type> ( dev.league_rank() ) * rows_per_team + rowInTeam;

      // The last team may extend past the end of the matrix; only in-range rows do work.
      if (rowIdx < m_A.numRows ()) {
        const KokkosSparse::SparseRowViewConst<AMatrix> rowView = m_A.rowConst(rowIdx);
        const ordinal_type numEntries = static_cast<ordinal_type> (rowView.length);
        value_type sum = 0;

        // Vector-lane reduction of the row's dot product with m_x; the
        // matrix value is conjugated first when `conjugate` is set.
        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev,numEntries), [&] (const ordinal_type& entry, value_type& partial) {
          const value_type matVal = conjugate ? ATV::conj (rowView.value(entry))
                                              : rowView.value(entry);
          partial += matVal * m_x(rowView.colidx(entry));
        },sum);

        // Exactly one vector lane per thread scales the result and stores it.
        Kokkos::single(Kokkos::PerThread(dev), [&] () {
          sum *= alpha;

          if (dobeta != 0) {
            m_y(rowIdx) = beta * m_y(rowIdx) + sum;
          } else {
            m_y(rowIdx) = sum ;
          }
        });
      }
    });
  }
Exemplo n.º 3
0
  // Per-team body. For the slab of `data` selected by this team's league
  // rank, count per segment how many values are multiples of 4, then count
  // how many neighbouring segments have equal counts, and atomically add
  // that team total to the global sum gsum().
  KOKKOS_INLINE_FUNCTION
  void operator() ( const team_member & thread) const {
    const int league = thread.league_rank();
    const auto numSegments = data.dimension_1();

    // Team-shared array with one counter per segment.
    shared_1d_int count(thread.team_shmem(),numSegments);

    // Phase 1: the team's threads split the segments between them.
    Kokkos::parallel_for(Kokkos::TeamThreadRange(thread,numSegments), [=] (const int& seg) {
      int multiplesOf4;
      // Vector-level reduction over the innermost dimension of data.
      // The reduced value is broadcast back to every vector lane.
      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(thread,data.dimension_2()), [=] (const int& elem, int & laneCount) {
        if (data(league,seg,elem) % 4 == 0) {
          laneCount += 1;
        }
      },multiplesOf4);

      // Only one vector lane per thread stores the reduced value into the
      // shared array.
      Kokkos::single(Kokkos::PerThread(thread),[=] () {
        count(seg) = multiplesOf4;
      });
    });

    // All shared-array writes must be complete before any thread reads them.
    thread.team_barrier();

    // Phase 2: with one vector lane per thread, count how many consecutive
    // segments hold the same number of multiples of 4. The team reduction
    // result is broadcast to every team member (and every vector lane).
    int team_sum = 0;
    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(thread, numSegments-1), [=] (const int& seg, int& threadTotal) {
      // Adding to threadTotal directly is not valid here; go through
      // single-with-broadcast instead. threadTotal seeds the operator's
      // argument, and its final value is broadcast to all vector lanes of
      // the thread.
      Kokkos::single(Kokkos::PerThread(thread),[=] (int& acc) {
        if (count(seg) == count(seg+1)) {
          acc++;
        }
      },threadTotal);
    },team_sum);

    // One thread and vector lane of the team publishes the team total.
    Kokkos::single(Kokkos::PerTeam(thread),[=] () {
      Kokkos::atomic_add(&gsum(),team_sum);
    });
  }
Exemplo n.º 4
0
    // Per-team body: updates p in place as p = beta*p + r for the indices
    // owned by this team. The team's base offset is derived from its league
    // rank shifted by halo_depth; the team's threads split the range
    // [halo_depth, y - halo_depth) relative to that base.
    KOKKOS_INLINE_FUNCTION
    void operator()(const team_member& team) const
    {
        // Flat offset of this team's first element.
        const int base = (team.league_rank() + halo_depth)*y;

        Kokkos::parallel_for(
            Kokkos::TeamThreadRange(team, halo_depth, y-halo_depth),
            [&] (const int &inner)
        {
            const int cell = base + inner;
            p(cell) = beta*p(cell) + r(cell);
        });
    }
Exemplo n.º 5
0
    // Per-team reduction body: for the indices owned by this team, applies
    // u += alpha*p and r -= alpha*w, and accumulates the sum of squares of
    // the updated r into the reduction argument rrn.
    KOKKOS_INLINE_FUNCTION
    void operator()(const team_member& team, value_type& rrn) const
    {
        // Flat offset of this team's first element.
        const int base = (team.league_rank() + halo_depth)*y;

        // Team-wide partial sum, filled in by the nested reduction below.
        double teamTotal = 0.0;

        Kokkos::parallel_reduce(
                Kokkos::TeamThreadRange(team, halo_depth, y-halo_depth),
                [&] (const int &inner, double& threadTotal)
        {
            const int cell = base + inner;
            u(cell) += alpha*p(cell);
            r(cell) -= alpha*w(cell);
            // Uses the value of r just written above.
            threadTotal += r(cell)*r(cell);
        }, teamTotal);

        // Only one member of the team adds the team total to the outer
        // reduction value, so it is counted once per team.
        Kokkos::single(Kokkos::PerTeam(team), [&] ()
        {
            rrn += teamTotal;
        });
    }
Exemplo n.º 6
0
    // Per-team reduction body: for the indices owned by this team, stores
    // SMVP(p) into w and accumulates the sum of SMVP(p)*p(index) into the
    // reduction argument pw.
    // NOTE(review): SMVP is defined outside this excerpt and is invoked with
    // only `p`; it is presumably a macro that reads the local `index` (and
    // possibly other names in scope) — confirm before renaming any locals.
    KOKKOS_INLINE_FUNCTION
    void operator()(const team_member& team, value_type& pw) const
    {
        // Team-wide partial sum, filled in by the nested reduction below.
        double pw_team = 0.0;
        // Flat offset of this team's first element, shifted by halo_depth.
        const int team_offset = (team.league_rank() + halo_depth)*y;

        // The team's threads split the range [halo_depth, y - halo_depth).
        Kokkos::parallel_reduce(
                Kokkos::TeamThreadRange(team, halo_depth, y-halo_depth),
                [&] (const int &j, double& pw_thread)
        {
            const int index = team_offset + j;
            // Evaluated once and reused for both the store and the product.
            const double smvp = SMVP(p);
            w(index) = smvp;
            pw_thread += smvp*p(index);
        }, pw_team);

        // Only one member of the team adds the team total to the outer
        // reduction value, so it is counted once per team.
        Kokkos::single(Kokkos::PerTeam(team), [&] ()
        {
            pw += pw_team;
        });
    }
  // Per-team body: builds a TEAM_SIZE x TEAM_SIZE histogram of adjacent
  // value pairs (data(n), data(n+1)) for this team's chunk of `data` in
  // team scratch memory, then merges it into the global `histogram` with
  // atomic adds.
  KOKKOS_INLINE_FUNCTION
  void operator() ( const team_member & dev) const {
    // Unmanaged views laid over the team's scratch allocation.
    Kokkos::View<int**,Kokkos::MemoryUnmanaged> teamHist(dev.team_shmem(),TEAM_SIZE,TEAM_SIZE);
    Kokkos::View<int*,Kokkos::MemoryUnmanaged> teamData(dev.team_shmem(),chunk_size+1);

    const int rank = dev.team_rank();
    const int nthreads = dev.team_size();
    const int chunkStart = dev.league_rank() * chunk_size;

    // Stage chunk_size+1 input values (one extra so the last pair is complete).
    for(int n = rank; n < chunk_size+1; n += nthreads) {
      teamData(n) = data(chunkStart+n);
    }

    // Zero the team histogram; each thread clears the rows it owns.
    for(int row = rank; row < TEAM_SIZE; row += nthreads) {
      for(int col = 0; col < TEAM_SIZE; col++) {
        teamHist(row,col) = 0;
      }
    }

    // Staged data and zeroed bins must be visible to the whole team.
    dev.team_barrier();

    // Count pairs. A thread only ever increments bins in its own rows.
    for(int n = 0; n < chunk_size; n++) {
      const int first = teamData(n);
      const int second = teamData(n+1);
      for(int row = rank; row < TEAM_SIZE; row += nthreads) {
        for(int col = 0; col < TEAM_SIZE; col++) {
          if((first == row) && (second == col)) {
            teamHist(row,col)++;
          }
        }
      }
    }

    // Merge this thread's rows into the global histogram.
    for(int row = rank; row < TEAM_SIZE; row += nthreads) {
      for(int col = 0; col < TEAM_SIZE; col++) {
        Kokkos::atomic_fetch_add(&histogram(row,col),teamHist(row,col));
      }
    }
    dev.team_barrier();
  }