 KOKKOS_INLINE_FUNCTION
 static double reduce(const double& val) {
   double result = val;
   if (N > 1)
     result += shfl_down(result, 1,N);
   if (N > 2)
     result += shfl_down(result, 2,N);
   if (N > 4)
     result += shfl_down(result, 4,N);
   if (N > 8)
     result += shfl_down(result, 8,N);
   if (N > 16)
     result += shfl_down(result, 16,N);
   return result;
 }
 KOKKOS_INLINE_FUNCTION
 static unsigned long int reduce(const unsigned long int& val) {
   unsigned long int result = val;
   if (N > 1)
     result += shfl_down(result, 1,N);
   if (N > 2)
     result += shfl_down(result, 2,N);
   if (N > 4)
     result += shfl_down(result, 4,N);
   if (N > 8)
     result += shfl_down(result, 8,N);
   if (N > 16)
     result += shfl_down(result, 16,N);
   return result;
 }
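These overloads lean on a Kokkos-provided shfl_down(value, delta, width) wrapper around the warp-shuffle intrinsic. For reference, a minimal standalone sketch of the same reduction written directly against CUDA's __shfl_down_sync (not the Kokkos implementation; the full-warp mask and the requirement that N be a power of two are assumptions of this sketch):

// Sketch only: sum the first N lanes of a warp with the raw CUDA intrinsic.
// Assumes N is a power of two <= 32 and that all 32 lanes are converged,
// so the full mask 0xffffffff is safe to use.
template <int N>
__device__ double warp_sum_sketch(double val) {
  if (N > 1)  val += __shfl_down_sync(0xffffffffu, val, 1,  N);
  if (N > 2)  val += __shfl_down_sync(0xffffffffu, val, 2,  N);
  if (N > 4)  val += __shfl_down_sync(0xffffffffu, val, 4,  N);
  if (N > 8)  val += __shfl_down_sync(0xffffffffu, val, 8,  N);
  if (N > 16) val += __shfl_down_sync(0xffffffffu, val, 16, N);
  return val;  // lane 0 of each N-lane group ends up with the group sum
}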
Example no. 3
        __device__ static void mergeShfl(const ValTuple& val, uint delta, uint width, const OpTuple& op)
        {
            // Merge tuple element I with the copy shuffled down by `delta` lanes...
            typename GetType<typename tuple_element<I, ValTuple>::type>::type reg = shfl_down(get<I>(val), delta, width);
            get<I>(val) = get<I>(op)(get<I>(val), reg);

            // ...then recurse to element I + 1 at compile time.
            For<I + 1, N>::mergeShfl(val, delta, width, op);
        }
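The trailing call to For<I + 1, N>::mergeShfl(...) is a compile-time loop over the tuple elements, which presumably bottoms out in a specialization for I == N that does nothing. A minimal, self-contained sketch of that recursion pattern, with a plain array standing in for the tuple (all names here are illustrative, not the original ones):

// ForSketch<I, N>: apply op element-by-element, recursing at compile time.
template <int I, int N>
struct ForSketch {
  template <typename T, class Op>
  __device__ static void mergeAll(T* vals, const T* regs, const Op& op) {
    vals[I] = op(vals[I], regs[I]);                 // merge element I
    ForSketch<I + 1, N>::mergeAll(vals, regs, op);  // move on to element I + 1
  }
};

// Terminating specialization: when I reaches N there is nothing left to merge.
template <int N>
struct ForSketch<N, N> {
  template <typename T, class Op>
  __device__ static void mergeAll(T*, const T*, const Op&) {}
};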
Example no. 4
__global__ void kAggRows_wholerow_nosync(const float* mat, float* matSum, const uint width, const uint height,
                                         Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint tidx = hipThreadIdx_x;
    const uint warpIdx = tidx / WARP_SIZE;
    const uint lane = tidx % WARP_SIZE;
    
    __shared__ float accum[(WARP_SIZE + 1) * AWR_NUM_WARPS];
    __shared__ float finalAccum[AWR_NUM_WARPS];

    float* myAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane];
    float* myFinalAccum = &finalAccum[tidx];
    //volatile float* vMyAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane];
    matSum += hipBlockIdx_y;
    mat += width * hipBlockIdx_y;

    float rAccum = agg.getBaseValue(); // cache in register, a bit faster than shmem
    #pragma unroll 32
    for (uint x = tidx; x < width; x += AWR_NUM_THREADS) {
        rAccum = agg(rAccum, uop(mat[x]));
    }
    myAccum[0] = rAccum;
    
    // Each warp does a reduction that doesn't require synchronization
    #pragma unroll
    for (uint i = 0; i < LOG_WARP_SIZE; i++) {
        const uint d = 1 << i;
        myAccum[0] = agg(myAccum[0], shfl_down(myAccum[0], d));
    }
    __syncthreads();
    // The first AWR_NUM_WARPS threads reduce the per-warp partial sums
    if (tidx < AWR_NUM_WARPS) {
        //volatile float* vMyFinalAccum = &finalAccum[tidx];
        myFinalAccum[0] = accum[tidx * (WARP_SIZE + 1)];
        #pragma unroll
        for (uint i = 0; i < AWR_LOG_NUM_WARPS; i++) {
            const uint d = 1 << i;
            myFinalAccum[0] = agg(myFinalAccum[0], shfl_down(myFinalAccum[0], d));
        }
        if (tidx == 0) {
            matSum[0] = bop(matSum[0], myFinalAccum[0]);
            matSum += hipGridDim_y;
        }
    }
}
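This kernel uses a two-argument shfl_down(val, delta) helper whose definition is not shown here. A minimal sketch of what such a wrapper might look like in HIP, inferred from the call sites above (an assumption, not the original definition):

#include <hip/hip_runtime.h>

// Assumed wrapper: forward to the HIP warp-shuffle builtin with its default
// width (the hardware warp size), so each warp reduces independently.
__device__ __forceinline__ float shfl_down(float val, unsigned int delta) {
    return __shfl_down(val, delta);
}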
Example no. 5
    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();
      const size_type max_tile_size = m_A.block.max_jk_tile_size();

      volatile VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_x_j = sh_x_k+m_block_size*max_tile_size;
      volatile VectorScalar * const sh_A_k = sh_x_j+m_block_size*max_tile_size;
      volatile VectorScalar * const sh_A_j = sh_A_k+m_block_size*max_tile_size;
      volatile VectorScalar * const sh_y   = sh_A_j+m_block_size*max_tile_size;

      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // blockIdx.x == row in the deterministic (finite element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
      const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
      if (remBlock > 0) ++numBlock;

      // Loop over i tiles
      const size_type n_i_tile = m_A.block.num_i_tiles();
      for (size_type i_tile = 0; i_tile<n_i_tile; ++i_tile) {
        const size_type i_begin = m_A.block.i_begin(i_tile);
        const size_type i_size  = m_A.block.i_size(i_tile);

        // Zero y
        for (size_type i=tid; i<i_size; i+=nid) {
          sh_y[i] = 0.0;
        }

        // Loop over finite element column blocks.
        size_type iBlockEntry = iBlockEntryBeg;
        for (size_type block=0; block<numBlock; ++block,
               iBlockEntry+=m_block_size) {
          const size_type block_size =
            (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;

          // Loop over j tiles
          const size_type n_j_tile = m_A.block.num_j_tiles(i_tile);
          for (size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
            const size_type j_begin = m_A.block.j_begin(i_tile, j_tile);
            const size_type j_size  = m_A.block.j_size(i_tile, j_tile);

            // Wait for X and A to be used in the previous iteration
            // before reading new values.
            __syncthreads();

            // Coalesced read j-blocks of X and A into shared memory
            for (size_type col=0; col<block_size; ++col) {
              const size_type iBlockColumn =
                m_A.graph.entries(iBlockEntry + col);
              const VectorScalar * const x_j =
                &m_x(j_begin, iBlockColumn);
              const MatrixScalar * const A_j =
                &m_A.values(j_begin, iBlockEntry + col);
              for (size_type j=tid; j<j_size; j+=nid) {
                sh_x_j[col+j*m_block_size] = x_j[j];
                sh_A_j[col+j*m_block_size] = A_j[j];
              }
            }

            // Loop over k tiles
            const size_type n_k_tile = m_A.block.num_k_tiles(i_tile, j_tile);
            for (size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
              const size_type k_begin =
                m_A.block.k_begin(i_tile, j_tile, k_tile);
              const size_type k_size  =
                m_A.block.k_size(i_tile, j_tile, k_tile);

              // Wait for X and A to be used in the previous iteration
              // before reading new values.
              __syncthreads();

              // Coalesced read k-blocks of X and A into shared memory
              for (size_type col=0; col<block_size; ++col) {
                const size_type iBlockColumn =
                  m_A.graph.entries(iBlockEntry + col);
                const VectorScalar * const x_k =
                  &m_x(k_begin, iBlockColumn);
                const MatrixScalar * const A_k =
                  &m_A.values(k_begin, iBlockEntry + col);
                for (size_type k=tid; k<k_size; k+=nid) {
                  sh_x_k[col+k*m_block_size] = x_k[k];
                  sh_A_k[col+k*m_block_size] = A_k[k];
                }
              }

              __syncthreads(); // wait for X and A to be read

              // Loop over stochastic rows in this tile
              for (size_type i=threadIdx.y; i<i_size; i+=blockDim.y) {
                VectorScalar s = 0;

                // Product tensor entries which this warp will iterate:
                const size_type lBeg =
                  m_A.block.entry_begin(i_tile, j_tile, k_tile, i);
                const size_type lEnd =
                  m_A.block.entry_end(i_tile, j_tile, k_tile, i);

                // Loop through sparse tensor contributions with
                // coalesced reads.
                for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
                  const size_type kj   = m_A.block.coord( l );
                  const TensorScalar v = m_A.block.value( l );
                  const size_type j    = ( kj & 0x0ffff ) * m_block_size ;
                  const size_type k    = ( kj >> 16     ) * m_block_size ;

                  for ( size_type col = 0; col < block_size; ++col ) {
                    s += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
                               sh_A_k[col+k] * sh_x_j[col+j] );
                  }
                }

                // Reduction of 'y' within 'blockDim.x'
                if (blockDim.x >= 2) s += shfl_down(s, 1, blockDim.x);
                if (blockDim.x >= 4) s += shfl_down(s, 2, blockDim.x);
                if (blockDim.x >= 8) s += shfl_down(s, 4, blockDim.x);
                if (blockDim.x >= 16) s += shfl_down(s, 8, blockDim.x);
                if (blockDim.x >= 32) s += shfl_down(s, 16, blockDim.x);
                if ( threadIdx.x == 0 ) sh_y[i] += s;

              } // i-loop

            } // k-tile loop

          } // j-tile loop

        } // block column loop

        // Wait for all threads to complete the i-tile
        __syncthreads();

        // Store sum for this tile back in global memory
        for (size_type i=tid; i<i_size; i+=nid) {
          m_y( i+i_begin , blockIdx.x ) = sh_y[i];
        }

      } // i-tile loop

    } // operator()
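In the innermost loop above, each packed tensor coordinate kj carries the j index in its low 16 bits and the k index in its high 16 bits. A small illustrative sketch of that packing and the matching decode (not the Stokhos packing code):

#include <cstdint>

// Illustrative only: pack two 16-bit coordinates the way the kernel decodes them.
inline uint32_t pack_coord(uint32_t j, uint32_t k) {
  return (k << 16) | (j & 0xffffu);   // low half = j, high half = k
}

inline void unpack_coord(uint32_t kj, uint32_t& j, uint32_t& k) {
  j = kj & 0xffffu;   // matches ( kj & 0x0ffff ) in the kernel
  k = kj >> 16;       // matches ( kj >> 16     ) in the kernel
}

The kernel then multiplies both decoded indices by m_block_size because the shared arrays interleave the block columns (index = col + j*m_block_size).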
Example no. 6
 __device__ __forceinline__ void mergeShfl(T& val, uint delta, uint width, const Op& op)
 {
     T reg = shfl_down(val, delta, width);
     val = op(val, reg);
 }
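A hedged sketch of how a helper like this is typically driven: start at delta = width / 2 and halve it each step, so lane 0 of every width-lane group ends up holding the full reduction. This usage is illustrative, not taken from the original source, and assumes width is a power of two and uint is unsigned int as in the snippet above:

// Illustrative usage of mergeShfl; not part of the original code.
template <typename T, class Op>
__device__ __forceinline__ T warpReduceSketch(T val, uint width, const Op& op)
{
    for (uint delta = width / 2; delta > 0; delta /= 2)
        mergeShfl(val, delta, width, op);   // fold in the value `delta` lanes away
    return val;                             // valid in lane 0 of each group
}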