__device__ T blockScanInclusive(T data, volatile T* smem, uint tid) { if (THREADS_NUM > WARP_SIZE) { // bottom-level inclusive warp scan T warpResult = warpScanInclusive(data, smem, tid); __syncthreads(); // save top elements of each warp for exclusive warp scan // sync to wait for warp scans to complete (because s_Data is being overwritten) if ((tid & (WARP_SIZE - 1)) == (WARP_SIZE - 1)) { smem[tid >> LOG_WARP_SIZE] = warpResult; }
// Derives an exclusive scan value from the inclusive one.
//
// Forwards all three arguments unchanged to warpScanInclusive() and removes
// the caller's own input from the result, so the returned value is
// "inclusive result minus data".
//
//   data - this thread's input element
//   smem - scratch buffer handed straight to warpScanInclusive(); it is not
//          touched directly here
//   tid  - thread index handed straight to warpScanInclusive()
//
// NOTE(review): any synchronization or buffer-size requirements are those of
// warpScanInclusive(), which is not visible from this block - confirm there.
__device__ __forceinline__ T warpScanExclusive(T data, volatile T* smem, uint tid)
{
    // Inclusive result still contains this thread's own contribution.
    const T inclusiveResult = warpScanInclusive(data, smem, tid);

    // Drop that contribution to obtain the exclusive value.
    return inclusiveResult - data;
}