// Multiply one row of the CRS matrix against a selected set of vector
// columns: one thread handles one matrix row.
__device__ inline
void operator()( const size_type iRow ) const
{
  const size_type iEntryBegin = m_A.graph.row_map[iRow];
  const size_type iEntryEnd   = m_A.graph.row_map[iRow+1];

  for (size_type j=0; j<m_num_col; j++) {
    size_type iCol = m_col_indices[j];

    scalar_type sum = 0.0;

    // Dot product of row iRow of A with column iCol of x.
    for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
      sum += m_A.values(iEntry) * m_x( m_A.graph.entries(iEntry), iCol );
    }

    m_y( iRow, iCol ) = sum;
  }
}
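// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): the same
// one-thread-per-row CRS multiply against a selected set of right-hand-side
// columns, written as a standalone CUDA kernel with raw pointers so the
// access pattern is visible without the Kokkos view machinery. The kernel
// name, the row-major layout of x and y, and the launch shape are assumptions
// made for this example only.
__global__ void crs_multiply_selected_columns(
  const int    * __restrict__ row_map,     // CRS row offsets, length n_row+1
  const int    * __restrict__ entries,     // CRS column indices
  const double * __restrict__ values,      // CRS nonzero values
  const double * __restrict__ x,           // input multivector, ld_x values per row
  double       * __restrict__ y,           // output multivector, ld_y values per row
  const int    * __restrict__ col_indices, // which columns of x to multiply
  int n_row, int num_col, int ld_x, int ld_y )
{
  const int iRow = blockIdx.x * blockDim.x + threadIdx.x;
  if ( iRow >= n_row ) return;

  const int iEntryBegin = row_map[iRow];
  const int iEntryEnd   = row_map[iRow+1];

  for ( int j = 0; j < num_col; ++j ) {
    const int iCol = col_indices[j];
    double sum = 0.0;
    // Accumulate the dot product of row iRow of A with column iCol of x.
    for ( int iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry )
      sum += values[iEntry] * x[ (size_t)entries[iEntry]*ld_x + iCol ];
    y[ (size_t)iRow*ld_y + iCol ] = sum;
  }
}
// A possible launch, assuming 128 threads per block:
//   crs_multiply_selected_columns<<< (n_row+127)/128, 128 >>>( ... );
// ---------------------------------------------------------------------------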
__device__
void operator()(void) const
{
  const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;

  // Number of bases in the stochastic system:
  const size_type dim = m_A.block.dimension();

  // Number of Cijk tiles
  const size_type n_tile    = m_A.block.num_tiles();
  const size_type tile_size = m_A.block.tile_size();
  const size_type tile_dim  = n_tile == 1 ? dim : tile_size;
  //const size_type tile_dim = tile_size;

  VectorScalar * const sh_x_k =
    kokkos_impl_cuda_shared_memory<VectorScalar>();
  VectorScalar * const sh_x_j =
    n_tile == 1 ? sh_x_k : sh_x_k + m_block_size*tile_dim;
  VectorScalar * const sh_A_k = sh_x_j + m_block_size*tile_dim;
  VectorScalar * const sh_A_j =
    n_tile == 1 ? sh_A_k : sh_A_k + m_block_size*tile_dim;
  VectorScalar * const sh_y = sh_A_j + m_block_size*tile_dim;
  volatile VectorScalar * const sh_t = sh_y + tile_dim;

  const size_type nid = blockDim.x * blockDim.y;
  const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

  // blockIdx.x == row in the deterministic (finite element) system
  const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
  const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
  size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
  const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
  if (remBlock > 0) ++numBlock;

  // Zero y
  for (size_type i=tid; i<dim; i+=nid) {
    m_y(i,blockIdx.x) = 0.0;
  }

  // Loop over Cijk tiles
  for (size_type tile = 0; tile<n_tile; ++tile) {

    const size_type i_offset = m_A.block.offset(tile, 0);
    const size_type j_offset = m_A.block.offset(tile, 1);
    const size_type k_offset = m_A.block.offset(tile, 2);
    const size_type i_range  = m_A.block.range(tile, 0);
    const size_type j_range  = m_A.block.range(tile, 1);
    const size_type k_range  = m_A.block.range(tile, 2);
    const size_type n_row    = m_A.block.num_rows(tile);

    // Zero y
    for (size_type i=tid; i<i_range; i+=nid) {
      sh_y[i] = 0.0;
    }

    // Loop over finite element column blocks.
    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block=0; block<numBlock;
         ++block, iBlockEntry+=m_block_size) {

      const size_type block_size =
        (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;

      // Wait for X and A to be used in the previous iteration
      // before reading new values.
      __syncthreads();

      // Coalesced read blocks of X and A into shared memory
      for (size_type col=0; col<block_size; ++col) {

        const size_type iBlockColumn =
          m_A.graph.entries( iBlockEntry + col );

        const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
        const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
        const MatrixScalar * const A_k = & m_A.values( k_offset , iBlockEntry + col );
        const MatrixScalar * const A_j = & m_A.values( j_offset , iBlockEntry + col );

        for (size_type j=tid; j<j_range; j+=nid) {
          sh_x_j[col+j*m_block_size] = x_j[j];
          sh_A_j[col+j*m_block_size] = A_j[j];
        }
        if (n_tile > 1) {
          for (size_type k=tid; k<k_range; k+=nid) {
            sh_x_k[col+k*m_block_size] = x_k[k];
            sh_A_k[col+k*m_block_size] = A_k[k];
          }
        }
      }

      __syncthreads(); // wait for X and A to be read

      // Loop over stochastic rows in this tile
      for (size_type i=threadIdx.y; i<i_range; i+=blockDim.y) {
        VectorScalar s = 0;

        // Product tensor entries which this warp will iterate:
        const size_type lBeg = m_A.block.entry_begin(tile, i);
        const size_type lEnd = m_A.block.entry_end(tile, i);

        // Loop through sparse tensor contributions with coalesced reads.
        for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
          const size_type kj   = m_A.block.coord( l );
          const TensorScalar v = m_A.block.value( l );
          const size_type j    = ( kj & 0x0ffff ) * m_block_size ;
          const size_type k    = ( kj >> 16     ) * m_block_size ;

          for ( size_type col = 0; col < block_size; ++col ) {
            s += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
                       sh_A_k[col+k] * sh_x_j[col+j] );
          }
        }

        // Reduction of 'y' within 'CudaTraits::WarpSize'
        sh_t[tid] = s;
        if ( threadIdx.x + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
        if ( threadIdx.x +  8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
        if ( threadIdx.x +  4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
        if ( threadIdx.x +  2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
        if ( threadIdx.x +  1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
        if ( threadIdx.x == 0 ) sh_y[i] += sh_t[tid];
      }
    }

    // Wait for all threads to complete the tile
    __syncthreads();

    // Store partial sum for this tile back in global memory
    for (size_type i=tid; i<i_range; i+=nid) {
      m_y( i+i_offset , blockIdx.x ) += sh_y[i];
    }
  }
}
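// ---------------------------------------------------------------------------
// A minimal standalone sketch (not part of the original source) of the
// volatile-shared-memory warp reduction used above to fold each warp's
// partial sums into one value. The kernel, its array names, and the fixed
// block shape of dim3(32, 8) are assumptions for illustration only. The
// pattern relies on implicit intra-warp synchrony plus the 'volatile'
// qualifier, as the original does; on Volta and newer GPUs a __syncwarp()
// or a shuffle-based reduction (as in the next kernel) is the safer choice.
__global__ void warp_shared_reduce_demo( const double * in, double * out )
{
  const unsigned WarpSize = 32;
  __shared__ volatile double sh_t[ 32 * 8 ];   // WarpSize * blockDim.y slots

  const unsigned tid = threadIdx.x + blockDim.x * threadIdx.y;

  // Each thread contributes one partial sum.
  sh_t[tid] = in[ blockIdx.x * blockDim.x * blockDim.y + tid ];

  // Lanes fold the upper half of the warp onto the lower half in
  // log2(32) = 5 steps; no __syncthreads() is needed because only
  // threads of the same warp touch these slots.
  if ( threadIdx.x + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
  if ( threadIdx.x +  8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
  if ( threadIdx.x +  4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
  if ( threadIdx.x +  2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
  if ( threadIdx.x +  1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];

  // Lane 0 of each warp now holds that warp's total.
  if ( threadIdx.x == 0 )
    out[ blockIdx.x * blockDim.y + threadIdx.y ] = sh_t[tid];
}
// Launch with dim3 block(32, 8): each of the 8 warps reduces its 32 inputs.
// ---------------------------------------------------------------------------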
__device__
void operator()(void) const
{
  // Number of bases in the stochastic system:
  const size_type dim = m_A.block.dimension();
  const size_type max_tile_size = m_A.block.max_jk_tile_size();

  volatile VectorScalar * const sh_x_k =
    kokkos_impl_cuda_shared_memory<VectorScalar>();
  volatile VectorScalar * const sh_x_j = sh_x_k+m_block_size*max_tile_size;
  volatile VectorScalar * const sh_A_k = sh_x_j+m_block_size*max_tile_size;
  volatile VectorScalar * const sh_A_j = sh_A_k+m_block_size*max_tile_size;
  volatile VectorScalar * const sh_y   = sh_A_j+m_block_size*max_tile_size;

  const size_type nid = blockDim.x * blockDim.y;
  const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

  // blockIdx.x == row in the deterministic (finite element) system
  const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
  const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
  size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
  const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
  if (remBlock > 0) ++numBlock;

  // Loop over i tiles
  const size_type n_i_tile = m_A.block.num_i_tiles();
  for (size_type i_tile = 0; i_tile<n_i_tile; ++i_tile) {
    const size_type i_begin = m_A.block.i_begin(i_tile);
    const size_type i_size  = m_A.block.i_size(i_tile);

    // Zero y
    for (size_type i=tid; i<i_size; i+=nid) {
      sh_y[i] = 0.0;
    }

    // Loop over finite element column blocks.
    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block=0; block<numBlock;
         ++block, iBlockEntry+=m_block_size) {
      const size_type block_size =
        (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;

      // Loop over j tiles
      const size_type n_j_tile = m_A.block.num_j_tiles(i_tile);
      for (size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
        const size_type j_begin = m_A.block.j_begin(i_tile, j_tile);
        const size_type j_size  = m_A.block.j_size(i_tile, j_tile);

        // Wait for X and A to be used in the previous iteration
        // before reading new values.
        __syncthreads();

        // Coalesced read j-blocks of X and A into shared memory
        for (size_type col=0; col<block_size; ++col) {
          const size_type iBlockColumn =
            m_A.graph.entries(iBlockEntry + col);
          const VectorScalar * const x_j = &m_x(j_begin, iBlockColumn);
          const MatrixScalar * const A_j =
            &m_A.values(j_begin, iBlockEntry + col);
          for (size_type j=tid; j<j_size; j+=nid) {
            sh_x_j[col+j*m_block_size] = x_j[j];
            sh_A_j[col+j*m_block_size] = A_j[j];
          }
        }

        // Loop over k tiles
        const size_type n_k_tile = m_A.block.num_k_tiles(i_tile, j_tile);
        for (size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
          const size_type k_begin =
            m_A.block.k_begin(i_tile, j_tile, k_tile);
          const size_type k_size =
            m_A.block.k_size(i_tile, j_tile, k_tile);

          // Wait for X and A to be used in the previous iteration
          // before reading new values.
          __syncthreads();

          // Coalesced read k-blocks of X and A into shared memory
          for (size_type col=0; col<block_size; ++col) {
            const size_type iBlockColumn =
              m_A.graph.entries(iBlockEntry + col);
            const VectorScalar * const x_k = &m_x(k_begin, iBlockColumn);
            const MatrixScalar * const A_k =
              &m_A.values(k_begin, iBlockEntry + col);
            for (size_type k=tid; k<k_size; k+=nid) {
              sh_x_k[col+k*m_block_size] = x_k[k];
              sh_A_k[col+k*m_block_size] = A_k[k];
            }
          }

          __syncthreads(); // wait for X and A to be read

          // Loop over stochastic rows in this tile
          for (size_type i=threadIdx.y; i<i_size; i+=blockDim.y) {
            VectorScalar s = 0;

            // Product tensor entries which this warp will iterate:
            const size_type lBeg =
              m_A.block.entry_begin(i_tile, j_tile, k_tile, i);
            const size_type lEnd =
              m_A.block.entry_end(i_tile, j_tile, k_tile, i);

            // Loop through sparse tensor contributions with
            // coalesced reads.
            for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
              const size_type kj   = m_A.block.coord( l );
              const TensorScalar v = m_A.block.value( l );
              const size_type j    = ( kj & 0x0ffff ) * m_block_size ;
              const size_type k    = ( kj >> 16     ) * m_block_size ;

              for ( size_type col = 0; col < block_size; ++col ) {
                s += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
                           sh_A_k[col+k] * sh_x_j[col+j] );
              }
            }

            // Reduction of 'y' within 'blockDim.x'
            if (blockDim.x >= 2)  s += shfl_down(s, 1,  blockDim.x);
            if (blockDim.x >= 4)  s += shfl_down(s, 2,  blockDim.x);
            if (blockDim.x >= 8)  s += shfl_down(s, 4,  blockDim.x);
            if (blockDim.x >= 16) s += shfl_down(s, 8,  blockDim.x);
            if (blockDim.x >= 32) s += shfl_down(s, 16, blockDim.x);
            if ( threadIdx.x == 0 ) sh_y[i] += s;

          } // i-loop

        } // k-tile loop

      } // j-tile loop

    } // block column loop

    // Wait for all threads to complete the i-tile
    __syncthreads();

    // Store sum for this tile back in global memory
    for (size_type i=tid; i<i_size; i+=nid) {
      m_y( i+i_begin , blockIdx.x ) = sh_y[i];
    }

  } // i-tile loop

} // operator()
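// ---------------------------------------------------------------------------
// The reduction above replaces the shared sh_t buffer of the previous kernel
// with register shuffles; 'shfl_down' is the Kokkos/Stokhos device wrapper.
// Below is a minimal sketch (not part of the original source) of the same
// ascending-offset pattern written with the raw CUDA intrinsic
// __shfl_down_sync. Assumptions: blockDim.x is a power of two no larger than
// 32 and the full warp is converged at the call.
__device__ inline double reduce_within_row( double s, unsigned width )
{
  // After each step the running sums migrate toward lane 0 of each
  // 'width'-wide segment. Lanes whose source index falls outside the
  // segment receive their own value back, so their running sums become
  // meaningless, but lane 0's result is unaffected.
  for ( unsigned delta = 1; delta < width; delta <<= 1 )
    s += __shfl_down_sync( 0xffffffffu, s, delta, width );
  return s;  // only lane 0 of each segment holds the full sum
}
// Usage inside the i-loop would be: s = reduce_within_row(s, blockDim.x);
// followed by the same 'if ( threadIdx.x == 0 ) sh_y[i] += s;' store.
// ---------------------------------------------------------------------------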