Example #1
__global__ void kAggShortRows(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint shmemX = THREADS_X + 1;
    __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y*shmemX];

    const uint tidx = hipThreadIdx_y * THREADS_X + hipThreadIdx_x;
    const uint ty = LOOPS_X == 1 ? tidx / width : hipThreadIdx_y; // when LOOPS_X == 1, width is smaller than the block's x dimension
    const uint tx = LOOPS_X == 1 ? tidx % width : hipThreadIdx_x;
    const uint bidx = hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x;
    const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y;
    float* shmemWrite = shmem + MUL24(ty, shmemX) + tx;
    matSum += blockRowIdx + tidx;
//    shmem[MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x] = 0;
    mat += width * blockRowIdx + MUL24(ty, width) + tx;
    float* shmemWriteZeros = &shmem[MUL24(hipThreadIdx_y,shmemX) + hipThreadIdx_x];

    bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y;

    if (blockRowIdx < height) {
#pragma unroll
        for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y*AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) {
            doAgg &= tidx + y + blockRowIdx < height;
            const bool heightIdxOK = ty < AGG_SHORT_ROWS_THREADS_Y && ty + y + blockRowIdx < height;

            shmemWriteZeros[0] = agg.getBaseValue();
            __syncthreads();
#pragma unroll
            for(uint x = 0; x < LOOPS_X * THREADS_X; x+= THREADS_X) {
//                __syncthreads();
                if (heightIdxOK && x + tx < width) {
                    shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]);
                }
            }
            __syncthreads();
            if (doAgg) {
                /*
                 * I tried doing this final sum as a 4-step reduction, with 8 threads
                 * per warp participating. It was slightly slower.
                 */
                float accum = agg.getBaseValue();
                float* shmemRead = shmem + MUL24(tidx, shmemX);
                // this loops too much if the rows are really short :(
#pragma unroll
                for (uint i = 0; i < THREADS_X; i++) {
                    accum = agg(accum, shmemRead[0]);
                    shmemRead++;
                }
                matSum[0] = bop(matSum[0], accum);
                matSum += AGG_SHORT_ROWS_THREADS_Y;
            }
            __syncthreads();
            mat += width * AGG_SHORT_ROWS_THREADS_Y;
        }
    }
}
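Agg, UnaryOp, and BinaryOp are clearly functor objects (in cuda-convnet-style sources they arrive, together with tile constants such as THREADS_X and LOOPS_X, as template parameters), and the +1 in shmemX pads each shared-memory row to keep column accesses off the same bank. A minimal sketch of the functor interface these kernels rely on, with hypothetical names and a sum/identity/overwrite choice of operations:

// Hypothetical functor sketch; names and operations are illustrative, not from the original source.
struct SumAgg {                      // Agg: pairwise combine plus its identity element
    __device__ inline float getBaseValue() const { return 0.0f; }
    __device__ inline float operator()(const float a, const float b) const { return a + b; }
};
struct IdentityUop {                 // UnaryOp: applied to every element before aggregation
    __device__ inline float operator()(const float x) const { return x; }
};
struct SecondArgBop {                // BinaryOp: merges the existing output with the new aggregate
    __device__ inline float operator()(const float oldVal, const float newVal) const { return newVal; }
};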
Example #2
__global__ void kTotalAgg(const float* a, float* const target, const uint numElements, Agg agg) {
    __shared__ float shmem[DP_BLOCKSIZE];
    uint eidx = DP_BLOCKSIZE * hipBlockIdx_x + hipThreadIdx_x;
    shmem[hipThreadIdx_x] = agg.getBaseValue();
    if (eidx < hipGridDim_x * DP_BLOCKSIZE) {
        for (; eidx < numElements; eidx += hipGridDim_x * DP_BLOCKSIZE) {
            shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], a[eidx]);
        }
    }
    __syncthreads();
    if (hipThreadIdx_x < 256) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 256]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 128) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 128]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 64) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 64]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 32) {
        volatile float* mysh = &shmem[hipThreadIdx_x];
        *mysh = agg(*mysh, mysh[32]);
        *mysh = agg(*mysh, mysh[16]);
        *mysh = agg(*mysh, mysh[8]);
        *mysh = agg(*mysh, mysh[4]);
        *mysh = agg(*mysh, mysh[2]);
        *mysh = agg(*mysh, mysh[1]);
        if (hipThreadIdx_x == 0) {
            target[hipBlockIdx_x] = *mysh;
        }
    }
}
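The hard-coded 256/128/64/... reduction tree only covers a block of 512 threads, so DP_BLOCKSIZE is presumably 512. A host-side launch sketch under that assumption (the grid size, buffer names, and the SumAgg functor from the sketch above are illustrative; if the kernel is templated over Agg, the call would name kTotalAgg<SumAgg>):

// Illustrative launch, assuming DP_BLOCKSIZE == 512; d_a, d_partial and numBlocks are hypothetical.
const uint numBlocks = 128;                      // enough blocks to keep the GPU busy
hipLaunchKernelGGL(kTotalAgg, dim3(numBlocks), dim3(DP_BLOCKSIZE), 0, 0,
                   d_a, d_partial, numElements, SumAgg());
// d_partial now holds one value per block; a second pass over those numBlocks values
// (or a host-side combine) produces the final scalar.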
Example #3
__global__ void kAggShortRows2(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint shmemX = AGG_SHORT_ROWS_THREADS_X + 1;
    __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y*shmemX];
    const uint LOOPS_X = DIVUP(width, AGG_SHORT_ROWS_THREADS_X);
    const uint tidx = hipThreadIdx_y * AGG_SHORT_ROWS_THREADS_X + hipThreadIdx_x;

    const uint bidx = hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x;
    const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y;

    float* shmemWrite = shmem + MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x;
    matSum += blockRowIdx + tidx;
//    shmem[MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x] = 0;
    mat += width * blockRowIdx + MUL24(hipThreadIdx_y, width) + hipThreadIdx_x;

    bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y;
    if(blockRowIdx < height) {
        for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y*AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) {
            doAgg &= tidx + y + blockRowIdx < height;
            const bool heightIdxOK = hipThreadIdx_y + y + blockRowIdx < height;
            float accum = agg.getBaseValue();
            shmemWrite[0] = agg.getBaseValue();

            for(uint x = 0; x < LOOPS_X * AGG_SHORT_ROWS_THREADS_X; x+= AGG_SHORT_ROWS_THREADS_X) {
//                __syncthreads();
                if (heightIdxOK && x + hipThreadIdx_x < width) {
                    shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]);
                }
            }

            __syncthreads();
            if (doAgg) {
                float* shmemRead = shmem + MUL24(tidx, shmemX);

#pragma unroll
                for (uint i = 0; i < AGG_SHORT_ROWS_THREADS_X; i++) {
                    accum = agg(accum, shmemRead[0]);
                    shmemRead++;
                }

                matSum[0] = bop(matSum[0], accum);
                matSum += AGG_SHORT_ROWS_THREADS_Y;
            }
            __syncthreads();
            mat += width * AGG_SHORT_ROWS_THREADS_Y;
        }
    }
}
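DIVUP and MUL24 are helper macros used from Example #1 onward; plausible definitions are sketched below as assumptions, not copied from the original headers. DIVUP is a ceiling division, and MUL24 historically mapped to the 24-bit __mul24 intrinsic but can be an ordinary multiply on current GPUs.

// Assumed helper macros (illustrative definitions).
#define DIVUP(a, b) (((a) + (b) - 1) / (b))    // ceiling division: tiles needed to cover a
#define MUL24(a, b) ((a) * (b))                // historically __mul24(a, b); plain multiply today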
Example #4
__global__ void kDumbAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    if (idx < width) {
        float mx = agg.getBaseValue();
        for (uint j = 0; j < height; j++) {
            mx = agg(uop(tex1Dfetch<float>(mat, width * j + idx)), mx);
        }
        vec[idx] = bop(vec[idx], mx);
    }
}
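kDumbAggCols reads the matrix through a 1D texture object bound to linear memory. Below is a sketch of how such an object could be created on the host, using the CUDA runtime names that match the cudaTextureObject_t parameter (on a pure HIP build the hip* equivalents, e.g. hipCreateTextureObject, would be used throughout); d_mat is a hypothetical device buffer of width*height floats:

// Illustrative texture-object setup over a linear float buffer; error checking omitted.
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypeLinear;
resDesc.res.linear.devPtr = d_mat;
resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
resDesc.res.linear.sizeInBytes = size_t(width) * height * sizeof(float);

cudaTextureDesc texDesc = {};
texDesc.readMode = cudaReadModeElementType;

cudaTextureObject_t matTex = 0;
cudaCreateTextureObject(&matTex, &resDesc, &texDesc, NULL);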
Example #5
__global__ void kAggCols(cudaTextureObject_t mat, float* const vec, const uint width, const uint height, const uint sumLength, Agg agg, UnaryOp op) {
    const uint idxX = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    const uint idxY = hipBlockIdx_y * sumLength;
    if (idxX < width) {
        float mx = agg.getBaseValue();
        for (uint j = idxY; j < min(height,idxY + sumLength); j++) {
            mx = agg(op(tex1Dfetch<float>(mat, j * width + idxX)), mx);
        }
        vec[hipBlockIdx_y * width + idxX] = mx;
    }
}
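Each y-block of kAggCols reduces a sumLength-row stripe of every column, so the output holds DIVUP(height, sumLength) partial rows of width values. A launch-geometry sketch consistent with that (NUM_THREADS, d_vec and the functors are illustrative; matTex is the texture object from the previous sketch):

// Illustrative launch geometry for kAggCols.
const uint NUM_THREADS = 256;                              // hypothetical block width
dim3 block(NUM_THREADS);
dim3 grid(DIVUP(width, NUM_THREADS), DIVUP(height, sumLength));
hipLaunchKernelGGL(kAggCols, grid, block, 0, 0,
                   matTex, d_vec, width, height, sumLength, SumAgg(), IdentityUop());
// d_vec needs DIVUP(height, sumLength) * width floats; if more than one partial row
// remains, a follow-up pass collapses them to a single row.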
Example #6
__global__ void kAggRows_wholerow_nosync(const float* mat, float* matSum, const uint width, const uint height,
                                         Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint tidx = hipThreadIdx_x;
    const uint warpIdx = tidx / WARP_SIZE;
    const uint lane = tidx % WARP_SIZE;
    
    __shared__ float accum[(WARP_SIZE + 1) * AWR_NUM_WARPS];
    __shared__ float finalAccum[AWR_NUM_WARPS];

    float* myAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane];
    float* myFinalAccum = &finalAccum[tidx];
    //volatile float* vMyAccum = &accum[warpIdx * (WARP_SIZE + 1) + lane];
    matSum += hipBlockIdx_y;
    mat += width * hipBlockIdx_y;

    float rAccum = agg.getBaseValue(); // cache in register, a bit faster than shmem
    #pragma unroll 32
    for (uint x = tidx; x < width; x += AWR_NUM_THREADS) {
        rAccum = agg(rAccum, uop(mat[x]));
    }
    myAccum[0] = rAccum;
    
    // Each warp does a reduction that doesn't require synchronization
    #pragma unroll
    for (uint i = 0; i < LOG_WARP_SIZE; i++) {
        const uint d = 1 << i;
        myAccum[0] = agg(myAccum[0], shfl_down(myAccum[0], d));
    }
    __syncthreads();
    // The warps write their results
    if (tidx < AWR_NUM_WARPS) {
        //volatile float* vMyFinalAccum = &finalAccum[tidx];
        myFinalAccum[0] = accum[tidx * (WARP_SIZE + 1)];
        #pragma unroll
        for (uint i = 0; i < AWR_LOG_NUM_WARPS; i++) {
            const uint d = 1 << i;
            myFinalAccum[0] = agg(myFinalAccum[0], shfl_down(myFinalAccum[0], d));
        }
        if (tidx == 0) {
            matSum[0] = bop(matSum[0], myFinalAccum[0]);
            matSum += hipGridDim_y;
        }
    }
}
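shfl_down here is evidently a thin wrapper over the warp shuffle-down intrinsic. A plausible definition is sketched below; this is an assumption (the original wrapper may differ), and on CUDA 9+ the masked __shfl_down_sync variant would be used instead. The reduction also assumes WARP_SIZE matches the hardware warp/wavefront width.

// Assumed wrapper around the shuffle-down intrinsic (illustrative).
__device__ inline float shfl_down(const float val, const uint delta) {
    return __shfl_down(val, delta);                // HIP / pre-CUDA-9 intrinsic
    // CUDA 9+: return __shfl_down_sync(0xffffffff, val, delta);
}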
Example #7
__global__ void kAggRows_wholerow(const float* mat, float* matSum, const uint width, const uint height, Agg agg, BinaryOp op) {
    const int tidx = hipThreadIdx_x;

    __shared__ float accum[AWR_NUM_THREADS];
    volatile float* vMyAccum = &accum[tidx];
    float* myAccum = &accum[tidx];
    
    matSum += hipBlockIdx_y;
    mat += width * hipBlockIdx_y;

    for (uint idxY = hipBlockIdx_y; idxY < height; idxY += hipGridDim_y) {
        myAccum[0] = agg.getBaseValue();
        for (uint x = tidx; x < width; x += AWR_NUM_THREADS) {
            myAccum[0] = agg(myAccum[0], mat[x]);
        }
        #pragma unroll
        for (uint i = AWR_LOG_NUM_THREADS - 1; i > LOG_WARP_SIZE; i--) {
            const uint d = 1 << i;
            __syncthreads();
            if (tidx < d) {
                myAccum[0] = agg(myAccum[0], myAccum[d]);
            }
        }
        __syncthreads();
        if (tidx < WARP_SIZE) {
            #pragma unroll
            for (int i = LOG_WARP_SIZE; i >= 0; i--) {
                const uint d = 1 << i;
                vMyAccum[0] = agg(vMyAccum[0], vMyAccum[d]);
            }

            if (tidx == 0) {
                matSum[0] = op(matSum[0], vMyAccum[0]);
                matSum += hipGridDim_y;
            }
        }
        __syncthreads();
        mat += width * hipGridDim_y;
    }
}
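The AWR_* and warp constants are compile-time configuration. Values consistent with the code (one 256-thread block per row and 32-wide warps) are sketched below as an assumption; on AMD GPUs with 64-wide wavefronts, WARP_SIZE and LOG_WARP_SIZE would change accordingly.

// Assumed configuration constants (illustrative values).
#define WARP_SIZE            32
#define LOG_WARP_SIZE        5                         // log2(WARP_SIZE)
#define AWR_NUM_THREADS      256                       // block size for the whole-row kernels
#define AWR_LOG_NUM_THREADS  8                         // log2(AWR_NUM_THREADS)
#define AWR_NUM_WARPS        (AWR_NUM_THREADS / WARP_SIZE)
#define AWR_LOG_NUM_WARPS    3                         // log2(AWR_NUM_WARPS)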
Example #8
__global__ void kAggRows(const float* mat, float* matSum, const uint width, const uint height, const uint sumWidth, Agg agg, UnaryOp uop, BinaryOp bop) {
    const int idxX = hipBlockIdx_x * blockSize*2 + hipThreadIdx_x;

    __shared__ float accum[blockSize*2];

    matSum += hipBlockIdx_y * sumWidth + hipBlockIdx_x;
    /*
     * Here it's important to make sure that all threads in a block call __syncthreads,
     * so I have even the redundant threads (for which idxX >= width) enter this loop
     * just so that they may call __syncthreads at the appropriate times.
     */
    mat += width * hipBlockIdx_y + idxX;

    accum[hipThreadIdx_x] = agg.getBaseValue();
    accum[hipThreadIdx_x + blockSize] = agg.getBaseValue();
    for (uint idxY = hipBlockIdx_y; idxY < height; idxY += hipGridDim_y) {
        if (idxX < width) {
            accum[hipThreadIdx_x] = uop(mat[0]);
            if(idxX + blockSize < width)
                accum[hipThreadIdx_x + blockSize] = uop(mat[blockSize]);
        }
        if (blockSize >= 512) {
            __syncthreads();
            if (hipThreadIdx_x < 512)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x], accum[hipThreadIdx_x + 512]);
        }
        if (blockSize >= 256) {
            __syncthreads();
            if (hipThreadIdx_x < 256)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x],accum[hipThreadIdx_x + 256]);
        }
        if (blockSize >= 128) {
            __syncthreads();
            if (hipThreadIdx_x < 128)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x],accum[hipThreadIdx_x + 128]);
        }
        if (blockSize >= 64) {
            __syncthreads();
            if (hipThreadIdx_x < 64)
                accum[hipThreadIdx_x] = agg(accum[hipThreadIdx_x],accum[hipThreadIdx_x + 64]);
        }

        __syncthreads();
        volatile float* myAccum = &accum[hipThreadIdx_x];
        if (hipThreadIdx_x < 32) { // executed only by first warp
            myAccum[0] = agg(myAccum[0], myAccum[32]);
            myAccum[0] = agg(myAccum[0], myAccum[16]);
            myAccum[0] = agg(myAccum[0], myAccum[8]);
            myAccum[0] = agg(myAccum[0], myAccum[4]);
            myAccum[0] = agg(myAccum[0], myAccum[2]);
            myAccum[0] = agg(myAccum[0], myAccum[1]);
        }

        if (hipThreadIdx_x == 0) {
            matSum[0] = bop(matSum[0], myAccum[0]);
            matSum += hipGridDim_y * sumWidth;
        }
        __syncthreads();
        mat += width * hipGridDim_y;
    }
}
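blockSize in kAggRows must be a compile-time constant (a template parameter in cuda-convnet-style sources), since it sizes the shared array and lets the oversized reduction steps compile away. Each block covers 2*blockSize columns and writes one partial value per row into a height x sumWidth output. A launch sketch under the assumption blockSize == 32 (buffer names and the grid.y cap are illustrative):

// Illustrative launch for kAggRows, assuming blockSize == 32
// (original form would be instantiated as e.g. kAggRows<Agg, UnaryOp, BinaryOp, 32>).
const uint blockSize = 32;
const uint sumWidth  = DIVUP(width, 2 * blockSize);    // partials per row
const uint gridY     = height < 2048 ? height : 2048;  // hypothetical cap on grid.y
hipLaunchKernelGGL(kAggRows, dim3(sumWidth, gridY), dim3(blockSize), 0, 0,
                   d_mat, d_matSum, width, height, sumWidth,
                   SumAgg(), IdentityUop(), SecondArgBop());
// d_matSum is height x sumWidth; when sumWidth > 1, a follow-up pass reduces each
// row's partials to the final per-row value.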