__global__ void kAggShortRows(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint shmemX = THREADS_X + 1;
    __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y*shmemX];

    const uint tidx = hipThreadIdx_y * THREADS_X + hipThreadIdx_x;
    const uint ty = LOOPS_X == 1 ? tidx / width : hipThreadIdx_y; // when loops==1, width is gonna be smaller than block x dim
    const uint tx = LOOPS_X == 1 ? tidx % width : hipThreadIdx_x;
    const uint bidx = hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x;
    const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y;
    float* shmemWrite = shmem + MUL24(ty, shmemX) + tx;
    matSum += blockRowIdx + tidx;
//    shmem[MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x] = 0;
    mat += width * blockRowIdx + MUL24(ty, width) + tx;
    float* shmemWriteZeros = &shmem[MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x];

    bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y;

    if (blockRowIdx < height) {
#pragma unroll
        for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y*AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) {
            doAgg &= tidx + y + blockRowIdx < height;
            const bool heightIdxOK = ty < AGG_SHORT_ROWS_THREADS_Y && ty + y + blockRowIdx < height;

            shmemWriteZeros[0] = agg.getBaseValue();
            __syncthreads();
#pragma unroll
            for (uint x = 0; x < LOOPS_X * THREADS_X; x += THREADS_X) {
//                __syncthreads();
                if (heightIdxOK && x + tx < width) {
                    shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]);
                }
            }
            __syncthreads();
            if (doAgg) {
                /*
                 * I tried doing this final sum as a 4-step reduction, with 8 threads
                 * per warp participating. It was slightly slower.
                 */
                float accum = agg.getBaseValue();
                float* shmemRead = shmem + MUL24(tidx, shmemX);
                // this loops too much if the rows are really short :(
#pragma unroll
                for (uint i = 0; i < THREADS_X; i++) {
                    accum = agg(accum, shmemRead[0]);
                    shmemRead++;
                }
                matSum[0] = bop(matSum[0], accum);
                matSum += AGG_SHORT_ROWS_THREADS_Y;
            }
            __syncthreads();
            mat += width * AGG_SHORT_ROWS_THREADS_Y;
        }
    }
}
__global__ void kTotalAgg(const float* a, float* const target, const uint numElements, Agg agg) {
    __shared__ float shmem[DP_BLOCKSIZE];
    uint eidx = DP_BLOCKSIZE * hipBlockIdx_x + hipThreadIdx_x;
    shmem[hipThreadIdx_x] = agg.getBaseValue();
    // Each thread folds a grid-stride slice of the input into its shared-memory slot.
    if (eidx < hipGridDim_x * DP_BLOCKSIZE) {
        for (; eidx < numElements; eidx += hipGridDim_x * DP_BLOCKSIZE) {
            shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], a[eidx]);
        }
    }
    __syncthreads();
    // Tree reduction in shared memory down to 32 elements.
    if (hipThreadIdx_x < 256) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 256]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 128) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 128]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 64) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 64]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 32) {
        // The final steps run within a single warp and rely on warp-synchronous
        // execution through the volatile pointer, so no further __syncthreads.
        volatile float* mysh = &shmem[hipThreadIdx_x];
        *mysh = agg(*mysh, mysh[32]);
        *mysh = agg(*mysh, mysh[16]);
        *mysh = agg(*mysh, mysh[8]);
        *mysh = agg(*mysh, mysh[4]);
        *mysh = agg(*mysh, mysh[2]);
        *mysh = agg(*mysh, mysh[1]);
        if (hipThreadIdx_x == 0) {
            target[hipBlockIdx_x] = *mysh; // one partial result per block
        }
    }
}
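/*
 * Illustrative sketch, not part of the original source: every kernel in this file
 * expects an aggregator functor exposing getBaseValue() and operator()(a, b), plus
 * unary/binary functors for the element and output transforms. SumAgg and totalSum
 * below are hypothetical names, written under the assumption that Agg is a template
 * parameter of kTotalAgg (its template declaration is not shown above) and that
 * DP_BLOCKSIZE is 512, which the warp-level tail of the reduction above requires.
 */
struct SumAgg {
    __device__ inline float getBaseValue() const { return 0.0f; }
    __device__ inline float operator()(const float a, const float b) const { return a + b; }
};

// Hypothetical two-pass total reduction: pass 1 writes one partial result per block,
// pass 2 reduces those partials with a single block.
void totalSum(const float* d_a, float* d_result, const uint numElements) {
    uint numBlocks = DIVUP(numElements, DP_BLOCKSIZE);
    if (numBlocks > 128) {
        numBlocks = 128; // the grid-stride loop in kTotalAgg covers the remaining elements
    }
    float* d_partials;
    hipMalloc((void**)&d_partials, numBlocks * sizeof(float));
    kTotalAgg<<<numBlocks, DP_BLOCKSIZE>>>(d_a, d_partials, numElements, SumAgg());
    kTotalAgg<<<1, DP_BLOCKSIZE>>>(d_partials, d_result, numBlocks, SumAgg());
    hipFree(d_partials);
}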
/*
 * Like kAggShortRows, but the number of column loops (LOOPS_X) is computed at
 * runtime from width instead of being a compile-time constant.
 */
__global__ void kAggShortRows2(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint shmemX = AGG_SHORT_ROWS_THREADS_X + 1;
    __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y*shmemX];
    const uint LOOPS_X = DIVUP(width, AGG_SHORT_ROWS_THREADS_X);
    const uint tidx = hipThreadIdx_y * AGG_SHORT_ROWS_THREADS_X + hipThreadIdx_x;
    const uint bidx = hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x;
    const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y;

    float* shmemWrite = shmem + MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x;
    matSum += blockRowIdx + tidx;
//    shmem[MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x] = 0;
    mat += width * blockRowIdx + MUL24(hipThreadIdx_y, width) + hipThreadIdx_x;

    bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y;
    if (blockRowIdx < height) {
        for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y*AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) {
            doAgg &= tidx + y + blockRowIdx < height;
            const bool heightIdxOK = hipThreadIdx_y + y + blockRowIdx < height;
            float accum = agg.getBaseValue();
            shmemWrite[0] = agg.getBaseValue();

            for (uint x = 0; x < LOOPS_X * AGG_SHORT_ROWS_THREADS_X; x += AGG_SHORT_ROWS_THREADS_X) {
//                __syncthreads();
                if (heightIdxOK && x + hipThreadIdx_x < width) {
                    shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]);
                }
            }
            __syncthreads();
            if (doAgg) {
                float* shmemRead = shmem + MUL24(tidx, shmemX);
#pragma unroll
                for (uint i = 0; i < AGG_SHORT_ROWS_THREADS_X; i++) {
                    accum = agg(accum, shmemRead[0]);
                    shmemRead++;
                }
                matSum[0] = bop(matSum[0], accum);
                matSum += AGG_SHORT_ROWS_THREADS_Y;
            }
            __syncthreads();
            mat += width * AGG_SHORT_ROWS_THREADS_Y;
        }
    }
}
__global__ void kDumbAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
    // One thread per output column: walk the whole column through the texture
    // and fold the result into the existing vec entry.
    const uint idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    if (idx < width) {
        float mx = agg.getBaseValue();
        for (uint j = 0; j < height; j++) {
            mx = agg(uop(tex1Dfetch<float>(mat, width * j + idx)), mx);
        }
        vec[idx] = bop(vec[idx], mx);
    }
}
__global__ void kAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height, const uint sumLength, Agg agg, UnaryOp op) {
    // Partial column reduction: each block row handles a chunk of sumLength matrix
    // rows and writes one row of partial results to vec.
    const uint idxX = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    const uint idxY = hipBlockIdx_y * sumLength;
    if (idxX < width) {
        float mx = agg.getBaseValue();
        for (uint j = idxY; j < min(height, idxY + sumLength); j++) {
            mx = agg(op(tex1Dfetch<float>(mat, j * width + idxX)), mx);
        }
        vec[hipBlockIdx_y * width + idxX] = mx;
    }
}
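/*
 * Illustrative sketch, not part of the original source: the two column kernels read
 * the matrix through a 1D texture object bound to linear device memory. makeMatrixTexture
 * is a hypothetical helper showing one way such an object could be created, assuming the
 * CUDA runtime texture-object API is available here (the kernels take cudaTextureObject_t);
 * a pure HIP build would use the hip* equivalents of these calls.
 */
cudaTextureObject_t makeMatrixTexture(const float* d_mat, const size_t numElements) {
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeLinear;
    resDesc.res.linear.devPtr = const_cast<float*>(d_mat);
    resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
    resDesc.res.linear.sizeInBytes = numElements * sizeof(float);

    cudaTextureDesc texDesc = {};
    texDesc.readMode = cudaReadModeElementType; // return raw float values

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
    return tex;
}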
__global__ void kAggRows_wholerow_nosync(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint tidx = hipThreadIdx_x;
    const uint warpIdx = tidx / WARP_SIZE;
    const uint lane = tidx % WARP_SIZE;

    __shared__ float accum[(WARP_SIZE + 1) * AWR_NUM_WARPS];
    __shared__ float finalAccum[AWR_NUM_WARPS];

    float* myAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane];
    float* myFinalAccum = &finalAccum[tidx];
    //volatile float* vMyAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane];
    matSum += hipBlockIdx_y;
    mat += width * hipBlockIdx_y;

    float rAccum = agg.getBaseValue(); // cache in register, a bit faster than shmem
#pragma unroll 32
    for (uint x = tidx; x < width; x += AWR_NUM_THREADS) {
        rAccum = agg(rAccum, uop(mat[x]));
    }
    myAccum[0] = rAccum;

    // Each warp does a reduction that doesn't require synchronization
#pragma unroll
    for (uint i = 0; i < LOG_WARP_SIZE; i++) {
        const uint d = 1 << i;
        myAccum[0] = agg(myAccum[0], shfl_down(myAccum[0], d));
    }
    __syncthreads();

    // The first AWR_NUM_WARPS threads reduce the per-warp results
    if (tidx < AWR_NUM_WARPS) {
        //volatile float* vMyFinalAccum = &finalAccum[tidx];
        myFinalAccum[0] = accum[tidx * (WARP_SIZE + 1)];
#pragma unroll
        for (uint i = 0; i < AWR_LOG_NUM_WARPS; i++) {
            const uint d = 1 << i;
            myFinalAccum[0] = agg(myFinalAccum[0], shfl_down(myFinalAccum[0], d));
        }
        if (tidx == 0) {
            matSum[0] = bop(matSum[0], myFinalAccum[0]);
            matSum += hipGridDim_y;
        }
    }
}
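/*
 * Illustrative sketch, not part of the original source: kAggRows_wholerow_nosync
 * relies on a shfl_down() helper that is defined elsewhere in this codebase. A
 * minimal version, assuming it simply forwards to the hardware warp shuffle, could
 * look like the following; the real definition may differ (for example, a
 * __shfl_down_sync-based wrapper on newer CUDA toolkits).
 */
__device__ inline float shfl_down(const float val, const unsigned int delta) {
    return __shfl_down(val, delta); // warp-wide shuffle; no __syncthreads needed within a warp
}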
__global__ void kAggRows_wholerow(const float* mat, float* matSum, const uint width, const uint height, Agg agg, BinaryOp op) {
    const int tidx = hipThreadIdx_x;

    __shared__ float accum[AWR_NUM_THREADS];
    volatile float* vMyAccum = &accum[tidx];
    float* myAccum = &accum[tidx];

    matSum += hipBlockIdx_y;
    mat += width * hipBlockIdx_y;

    for (uint idxY = hipBlockIdx_y; idxY < height; idxY += hipGridDim_y) {
        myAccum[0] = agg.getBaseValue();
        for (uint x = tidx; x < width; x += AWR_NUM_THREADS) {
            myAccum[0] = agg(myAccum[0], mat[x]);
        }
#pragma unroll
        for (uint i = AWR_LOG_NUM_THREADS - 1; i > LOG_WARP_SIZE; i--) {
            const uint d = 1 << i;
            __syncthreads();
            if (tidx < d) {
                myAccum[0] = agg(myAccum[0], myAccum[d]);
            }
        }
        __syncthreads();
        if (tidx < WARP_SIZE) {
#pragma unroll
            for (int i = LOG_WARP_SIZE; i >= 0; i--) {
                const uint d = 1 << i;
                vMyAccum[0] = agg(vMyAccum[0], vMyAccum[d]);
            }
            if (tidx == 0) {
                matSum[0] = op(matSum[0], vMyAccum[0]);
                matSum += hipGridDim_y;
            }
        }
        __syncthreads();
        mat += width * hipGridDim_y;
    }
}
__global__ void kAggRows(const float* mat, float* matSum, const uint width, const uint height, const uint sumWidth, Agg agg, UnaryOp uop, BinaryOp bop) {
    const int idxX = hipBlockIdx_x * blockSize*2 + hipThreadIdx_x;

    __shared__ float accum[blockSize*2];

    matSum += hipBlockIdx_y * sumWidth + hipBlockIdx_x;
    /*
     * Here it's important to make sure that all threads in a block call __syncthreads,
     * so I have even the redundant threads (for which idxX >= width) enter this loop
     * just so that they may call __syncthreads at the appropriate times.
     */
    mat += width * hipBlockIdx_y + idxX;

    accum[hipThreadIdx_x] = agg.getBaseValue();
    accum[hipThreadIdx_x + blockSize] = agg.getBaseValue();
    for (uint idxY = hipBlockIdx_y; idxY < height; idxY += hipGridDim_y) {
        if (idxX < width) {
            accum[hipThreadIdx_x] = uop(mat[0]);
            if (idxX + blockSize < width)
                accum[hipThreadIdx_x + blockSize] = uop(mat[blockSize]);
        }
        if (blockSize >= 512) {
            __syncthreads();
            if (hipThreadIdx_x < 512)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x], accum[hipThreadIdx_x + 512]);
        }
        if (blockSize >= 256) {
            __syncthreads();
            if (hipThreadIdx_x < 256)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x], accum[hipThreadIdx_x + 256]);
        }
        if (blockSize >= 128) {
            __syncthreads();
            if (hipThreadIdx_x < 128)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x], accum[hipThreadIdx_x + 128]);
        }
        if (blockSize >= 64) {
            __syncthreads();
            if (hipThreadIdx_x < 64)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x], accum[hipThreadIdx_x + 64]);
        }
        __syncthreads();
        volatile float* myAccum = &accum[hipThreadIdx_x];
        if (hipThreadIdx_x < 32) { // executed only by first warp
            myAccum[0] = agg(myAccum[0], myAccum[32]);
            myAccum[0] = agg(myAccum[0], myAccum[16]);
            myAccum[0] = agg(myAccum[0], myAccum[8]);
            myAccum[0] = agg(myAccum[0], myAccum[4]);
            myAccum[0] = agg(myAccum[0], myAccum[2]);
            myAccum[0] = agg(myAccum[0], myAccum[1]);
        }
        if (hipThreadIdx_x == 0) {
            matSum[0] = bop(matSum[0], myAccum[0]);
            matSum += hipGridDim_y * sumWidth;
        }
        __syncthreads();
        mat += width * hipGridDim_y;
    }
}
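/*
 * Illustrative sketch, not part of the original source: the UnaryOp/BinaryOp
 * parameters follow the same functor convention as Agg. Identity and WriteSecond
 * below are hypothetical examples: an element transform that leaves values
 * unchanged, and an output combiner that simply overwrites the previous contents
 * of matSum/vec with the freshly computed aggregate.
 */
struct Identity {
    __device__ inline float operator()(const float a) const { return a; }
};

struct WriteSecond {
    __device__ inline float operator()(const float oldVal, const float newVal) const {
        return newVal; // discard the old output value, keep the new aggregate
    }
};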