__global__ void kColVectorOp(float* mat, float* vec, float* tgtMat,
                             const uint width, const uint height,
                             const uint matStride, const uint tgtStride, Op op) {
    __shared__ float shVec[ADD_VEC_THREADS_Y];
    const uint by = ADD_VEC_THREADS_Y * hipBlockIdx_y;
    const uint bx = ADD_VEC_THREADS_X * hipBlockIdx_x;
    const uint tidx = ADD_VEC_THREADS_X * hipThreadIdx_y + hipThreadIdx_x;

    mat += hipThreadIdx_y * matStride;
    vec += tidx;
    tgtMat += hipThreadIdx_y * tgtStride;

    for (uint y = by; y < height; y += hipGridDim_y * ADD_VEC_THREADS_Y) {
        __syncthreads();
        // vec was pre-offset by tidx, so this loads vec[y + tidx] into shVec[tidx]
        if (y + tidx < height && tidx < ADD_VEC_THREADS_Y) {
            shVec[tidx] = vec[y];
        }
        __syncthreads();

        if (y + hipThreadIdx_y < height) {
            for (uint x = bx + hipThreadIdx_x; x < width; x += hipGridDim_x * ADD_VEC_THREADS_X) {
                tgtMat[y * tgtStride + x] = op(mat[y * matStride + x], shVec[hipThreadIdx_y]);
            }
        }
    }
}
__global__ void kEltwiseUnaryOpTrans(const float* a, float* const dest,
                                     const uint height, const uint width,
                                     const uint strideA, const uint strideDest, Op op) {
    // +1 column of padding avoids shared-memory bank conflicts on the transpose
    __shared__ float shmem[ELTWISE_THREADS_X][ELTWISE_THREADS_X + 1];

    for (uint by = ELTWISE_THREADS_X * hipBlockIdx_y; by < height; by += ELTWISE_THREADS_X * hipGridDim_y) {
        for (uint bx = ELTWISE_THREADS_X * hipBlockIdx_x; bx < width; bx += ELTWISE_THREADS_X * hipGridDim_x) {
            const uint readX = by + hipThreadIdx_x;
            const uint readY = bx + hipThreadIdx_y;

            for (uint y = 0; y < ELTWISE_THREADS_X; y += ELTWISE_THREADS_Y) {
                if (!checkBounds || (readX < height && readY + y < width)) {
                    shmem[hipThreadIdx_x][hipThreadIdx_y + y] = op(a[(readY + y) * strideA + readX]);
                }
            }
            __syncthreads();

            const uint writeX = bx + hipThreadIdx_x;
            const uint writeY = by + hipThreadIdx_y;

            for (uint y = 0; y < ELTWISE_THREADS_X; y += ELTWISE_THREADS_Y) {
                if (!checkBounds || (writeX < width && writeY + y < height)) {
                    dest[(writeY + y) * strideDest + writeX] = shmem[hipThreadIdx_y + y][hipThreadIdx_x];
                }
            }
            __syncthreads();
        }
    }
}
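// Hedged usage sketch (not from the original source): a squaring functor and a
// HIP host launch for kEltwiseUnaryOpTrans. The template-parameter order
// <Op, checkBounds> is an assumption, as is the fixed grid; the kernel's
// grid-stride loops cover the whole matrix for any grid size, and
// checkBounds = true handles sizes that are not tile multiples. For a dense
// matrix, strideA == height and strideDest == width.
struct SquareOp {
    __device__ float operator()(float x) const { return x * x; }
};

void launchUnaryOpTrans(const float* d_a, float* d_dest, uint height, uint width) {
    dim3 block(ELTWISE_THREADS_X, ELTWISE_THREADS_Y);
    dim3 grid(16, 16); // grid-stride loops make any grid size correct
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kEltwiseUnaryOpTrans<SquareOp, true>),
                       grid, block, 0, 0,
                       d_a, d_dest, height, width, height, width, SquareOp());
}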
__global__ void prefix_sum_down_phase(T* idata, T* intermid, const int num_blocks, const int num_loops) {
    const int tid = threadIdx.x;
    const int MAX = 1U << LOG_MAX;
    const int HARF_MAX = MAX / 2;   // == blockDim.x
    const int i_start = num_loops * blockIdx.x;
    const int i_end = min(i_start + num_loops, num_blocks);

    __shared__ T s_data[(1U << LOG_MAX) * 2];
    T* s_in = s_data + 1;

    T carry;
    if (tid == 0) carry = intermid[blockIdx.x];

    for (int i = i_start; i < i_end; ++i) {
        s_in[tid + HARF_MAX * 0] = idata[i * MAX + HARF_MAX * 0 + tid];
        s_in[tid + HARF_MAX * 1] = idata[i * MAX + HARF_MAX * 1 + tid];
        if (tid == 0) s_data[0] = carry; // seed this tile with the running carry
        __syncthreads();

        dev_prefix_sum<T, LOG_MAX>(s_data, tid);

        idata[i * MAX + HARF_MAX * 0 + tid] = s_in[tid + HARF_MAX * 0];
        idata[i * MAX + HARF_MAX * 1 + tid] = s_in[tid + HARF_MAX * 1];
        if (tid == 0) carry = s_data[MAX];
        __syncthreads();
    }

    // presumably the caller offsets idata by one element; slot -1 receives the
    // leading zero of the exclusive scan
    if (tid == 0 && blockIdx.x == 0) idata[-1] = 0;
}
__global__ void gauss_jordan6(int n, T* AA) {
    extern __shared__ T A[];
    int idy = threadIdx.x;  // inverted to avoid warp divergence
    int idx = threadIdx.y;
    int blk = blockIdx.x;

    // load this block's matrix into shared memory, transposed
    // (mirrors the transposed store at the end)
    A[(idx * n) + idy] = AA[(n * n * blk) + (idy * n) + idx];
    __syncthreads();

    for (int i = 0; i < n; ++i) {
        T i_row = ((idy != n - 1) ? A[(i * n) + idy + 1] : 1) / A[i * n];
        T y_row = ((idy != n - 1) ? A[(idx * n) + idy + 1] : 0) - A[idx * n] * i_row;
        __syncthreads();
        A[(idx * n) + idy] = (idx != i) ? y_row : i_row;
        __syncthreads();
    }

    AA[(n * n * blk) + (idx * n) + idy] = A[(idy * n) + idx]; // store back, transposed
}
__global__ void csr_trans_unit_lu_forward_kernel(const unsigned int* row_indices,
                                                 const unsigned int* column_indices,
                                                 const T* elements,
                                                 T* vector,
                                                 unsigned int size) {
    __shared__ unsigned int row_index_lookahead[256];
    __shared__ unsigned int row_index_buffer[256];

    unsigned int row_index;
    unsigned int col_index;
    T matrix_entry;
    unsigned int nnz = row_indices[size];
    unsigned int row_at_window_start = 0;
    unsigned int row_at_window_end = 0;
    unsigned int loop_end = ((nnz - 1) / blockDim.x + 1) * blockDim.x;

    for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x) {
        col_index = (i < nnz) ? column_indices[i] : 0;
        matrix_entry = (i < nnz) ? elements[i] : 0;
        row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size)
                                               ? row_indices[row_at_window_start + threadIdx.x]
                                               : size - 1;
        __syncthreads();

        if (i < nnz) {
            unsigned int row_index_inc = 0;
            while (i >= row_index_lookahead[row_index_inc + 1])
                ++row_index_inc;
            row_index = row_at_window_start + row_index_inc;
            row_index_buffer[threadIdx.x] = row_index;
        } else {
            row_index = size + 1;
            row_index_buffer[threadIdx.x] = size - 1;
        }
        __syncthreads();

        row_at_window_start = row_index_buffer[0];
        row_at_window_end = row_index_buffer[blockDim.x - 1];

        // forward elimination
        for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) {
            T result_entry = vector[row];
            if ((row_index == row) && (col_index > row))
                vector[col_index] -= result_entry * matrix_entry;
            __syncthreads();
        }

        row_at_window_start = row_at_window_end;
    }
}
__global__ void kTotalAgg(const float* a, float* const target, const uint numElements, Agg agg) {
    __shared__ float shmem[DP_BLOCKSIZE];
    uint eidx = DP_BLOCKSIZE * hipBlockIdx_x + hipThreadIdx_x;
    shmem[hipThreadIdx_x] = agg.getBaseValue();
    if (eidx < hipGridDim_x * DP_BLOCKSIZE) {
        for (; eidx < numElements; eidx += hipGridDim_x * DP_BLOCKSIZE) {
            shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], a[eidx]);
        }
    }
    __syncthreads();
    // tree reduction in shared memory; the first step pairs lanes 256 apart,
    // so this assumes DP_BLOCKSIZE == 512
    if (hipThreadIdx_x < 256) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 256]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 128) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 128]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 64) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 64]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 32) {
        // final warp: volatile accesses keep the unsynchronized reads/writes ordered
        volatile float* mysh = &shmem[hipThreadIdx_x];
        *mysh = agg(*mysh, mysh[32]);
        *mysh = agg(*mysh, mysh[16]);
        *mysh = agg(*mysh, mysh[8]);
        *mysh = agg(*mysh, mysh[4]);
        *mysh = agg(*mysh, mysh[2]);
        *mysh = agg(*mysh, mysh[1]);
        if (hipThreadIdx_x == 0) {
            target[hipBlockIdx_x] = *mysh;
        }
    }
}
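// Hedged host-side sketch (assumed, not part of the original source): the usual
// two-pass total reduction with this kernel. AddAgg and the grid size are
// illustrative; the kernel itself only requires blockDim.x == DP_BLOCKSIZE.
struct AddAgg {
    __device__ float getBaseValue() const { return 0.0f; }
    __device__ float operator()(float a, float b) const { return a + b; }
};

void totalSum(const float* d_in, float* d_partials, float* d_out, uint numElements) {
    const uint blocks = 128; // assumed grid size; each block emits one partial
    kTotalAgg<<<dim3(blocks), dim3(DP_BLOCKSIZE)>>>(d_in, d_partials, numElements, AddAgg());
    // second pass: a single block folds the per-block partials into d_out[0]
    kTotalAgg<<<dim3(1), dim3(DP_BLOCKSIZE)>>>(d_partials, d_out, blocks, AddAgg());
}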
static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op) {
    int ftid = flattenedThreadId();
    int sft = stride();

    if (sft < n) {
        for (unsigned int i = sft + ftid; i < n; i += sft)
            data[ftid] = op(data[ftid], data[i]);
        __syncthreads();
        n = sft;
    }

    while (n > 1) {
        unsigned int half = n / 2;
        if (ftid < half)
            data[ftid] = op(data[ftid], data[n - ftid - 1]);
        __syncthreads();
        n = n - half;
    }
}
__device__ static void reduce(Pointer smem, Reference val, uint tid, Op op) {
    const uint laneId = Warp::laneId();

#if CV_CUDEV_ARCH >= 300
    Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);

    if (laneId == 0)
        loadToSmem(smem, val, tid / 32);
#else
    loadToSmem(smem, val, tid);

    if (laneId < 16)
        Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);

    __syncthreads();

    if (laneId == 0)
        loadToSmem(smem, val, tid / 32);
#endif

    __syncthreads();

    loadFromSmem(smem, val, tid);

    if (tid < 32) {
#if CV_CUDEV_ARCH >= 300
        Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
#else
        Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#endif
    }
}
__global__ void prefix_sum_reference_kernel(T* odata, T* idata, const int n) {
    extern __shared__ T temp[];
    const int tid = threadIdx.x;
    int offset = 1;
    const int ai = tid;
    const int bi = tid + n / 2;

    temp[ai] = idata[blockIdx.x * 2 * blockDim.x + ai];
    temp[bi] = idata[blockIdx.x * 2 * blockDim.x + bi];

    // up-sweep (reduce) phase
    for (int d = n >> 1; d > 0; d >>= 1) {
        __syncthreads();
        if (tid < d) {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            temp[bi] += temp[ai];
        }
        offset *= 2;
    }

    // clear the root, then down-sweep to produce an exclusive scan
    if (tid == 0) {
        temp[n - 1] = 0;
    }
    for (int d = 1; d < n; d *= 2) {
        offset >>= 1;
        __syncthreads();
        if (tid < d) {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            T t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();

    odata[blockIdx.x * 2 * blockDim.x + ai] = temp[ai];
    odata[blockIdx.x * 2 * blockDim.x + bi] = temp[bi];
}
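// Hedged launch sketch (assumed): this is the classic Blelloch work-efficient
// exclusive scan; each block scans one tile of n elements using n/2 threads
// and n * sizeof(T) bytes of dynamic shared memory, so n must equal
// 2 * blockDim.x and be a power of two. The explicit <float> instantiation
// is an assumption about the template declaration.
void launch_prefix_sum_reference(float* d_out, float* d_in, int n, int num_tiles) {
    prefix_sum_reference_kernel<float><<<num_tiles, n / 2, n * sizeof(float)>>>(d_out, d_in, n);
}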
__forceinline__ __device__ void readAndReconstructB(float (&Bi)[height][width],
                                                    float (&RBx)[height][width],
                                                    float (&RBy)[height][width],
                                                    unsigned int p, unsigned int q,
                                                    unsigned int bx_, unsigned int by_) {
    int bx = bx_ * blockDim.x;
    for (int j = threadIdx.y; j < height; j += blockDim.y) {
        int by = by_ * blockDim.y + j;
        float* Bi_ptr = device_address2D(fgh_ctx.Bi.ptr, fgh_ctx.Bi.pitch, bx, by);
        for (int i = threadIdx.x; i < width; i += blockDim.x) {
            Bi[j][i] = Bi_ptr[i];
        }
    }
    __syncthreads();

    /**
     * Reconstruct B at the integration points
     */
    reconstructBx(RBx, Bi, p, q);
    reconstructBy(RBy, Bi, p, q);

    if (threadIdx.y == 0) {
        // Use one warp to perform the extra reconstructions needed
        reconstructBy(RBy, Bi, p, 1);          // second row
        reconstructBy(RBy, Bi, p, height - 2); // second-to-last row
        reconstructBy(RBy, Bi, p, height - 1); // last row
        if (threadIdx.x < height - 4) {
            reconstructBx(RBx, Bi, 1, p);         // second column
            reconstructBx(RBx, Bi, width - 2, p); // second-to-last column
            reconstructBx(RBx, Bi, width - 1, p); // last column
        }
    }
    __syncthreads();
}
__global__ void prefix_sum_up_phase(T* idata, T* intermid, const int num_blocks, const int num_loops) {
    const int tid = threadIdx.x;
    const int MAX = 1U << LOG_MAX;
    const int HARF_MAX = MAX / 2;
    const int QUAT_MAX = MAX / 4;   // == blockDim.x
    const int i_start = num_loops * blockIdx.x;
    const int i_end = min(i_start + num_loops, num_blocks);

    __shared__ T s_data[(1U << LOG_MAX) / 2];

    T sum = T(0);
    for (int i = i_start; i < i_end; ++i) {
        // first reduction step happens during the load: each thread adds two elements
        s_data[tid]            = idata[i * MAX + QUAT_MAX * 0 + tid] + idata[i * MAX + QUAT_MAX * 1 + tid];
        s_data[tid + QUAT_MAX] = idata[i * MAX + QUAT_MAX * 2 + tid] + idata[i * MAX + QUAT_MAX * 3 + tid];
        __syncthreads();

        dev_reduce<T, HARF_MAX>(HARF_MAX, s_data, tid);

        if (tid == 0) sum += s_data[0];
        __syncthreads();
    }

    if (tid == 0) intermid[blockIdx.x] = sum;
}
CUGIP_GLOBAL void pushKernelMultiLevelGlobalSync(TGraph aGraph,
                                                 ParallelQueueView<int> aVertices,
                                                 int* aLevelStarts,
                                                 int aLevelCount,
                                                 int aCurrentLevel,
                                                 device_ptr<int> aLastProcessedLevel,
                                                 device_flag_view aPushSuccessfulFlag,
                                                 cub::GridBarrier barrier) {
    uint blockId = __mul24(blockIdx.y, gridDim.x) + blockIdx.x;
    int levelStart = aLevelStarts[aCurrentLevel - 1];
    int levelEnd = aLevelStarts[aCurrentLevel];
    do {
        __syncthreads();
        barrier.Sync();
        int index = levelStart + blockId * blockDim.x + threadIdx.x;
        while (index < levelEnd) {
            pushImplementation(aGraph, aVertices, index, aCurrentLevel, aPushSuccessfulFlag);
            index += blockDim.x * gridDim.x;
        }
        __syncthreads();
        barrier.Sync();
        --aCurrentLevel;
        levelStart = aLevelStarts[aCurrentLevel - 1];
        levelEnd = aLevelStarts[aCurrentLevel];
        __syncthreads();
    } while (aCurrentLevel > 0 && (levelEnd - levelStart) <= TPolicy::MULTI_LEVEL_GLOBAL_SYNC_LIMIT);
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        aLastProcessedLevel.assign_device(aCurrentLevel);
    }
}
__global__ void yourHisto(const unsigned int* const vals, // INPUT
                          unsigned int* const histo,      // OUTPUT
                          int size) {
    // per-block privatized histogram; assumes blockDim.x == 256 and 1024 bins
    __shared__ unsigned int temp[1024];
    temp[threadIdx.x + 0] = 0;
    temp[threadIdx.x + 256] = 0;
    temp[threadIdx.x + 512] = 0;
    temp[threadIdx.x + 768] = 0;
    __syncthreads();

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    while (i < size) {
        atomicAdd(&temp[vals[i]], 1);
        i += offset;
    }
    __syncthreads();

    // merge this block's histogram into the global one
    atomicAdd(&(histo[threadIdx.x + 0]), temp[threadIdx.x + 0]);
    atomicAdd(&(histo[threadIdx.x + 256]), temp[threadIdx.x + 256]);
    atomicAdd(&(histo[threadIdx.x + 512]), temp[threadIdx.x + 512]);
    atomicAdd(&(histo[threadIdx.x + 768]), temp[threadIdx.x + 768]);
}
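// Hedged launch sketch (assumed, not from the original source): the kernel
// hard-codes 256 threads per block and 1024 bins, and grid-strides over the
// input, so the grid size is a throughput knob rather than a correctness one.
void launchYourHisto(const unsigned int* d_vals, unsigned int* d_histo, int size) {
    cudaMemset(d_histo, 0, 1024 * sizeof(unsigned int)); // kernel accumulates into histo
    const int threads = 256;
    int blocks = (size + threads - 1) / threads;
    if (blocks > 1024) blocks = 1024; // cap the grid; the stride loop covers the rest
    yourHisto<<<blocks, threads>>>(d_vals, d_histo, size);
}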
__global__ void sequence_hamming_weight_kernel(device_sequence_space<IntType>* seqs,
                                               basic_data_space<unsigned int>* res,
                                               clotho::utility::algo_version<3>* v) {
    typedef device_sequence_space<IntType> space_type;
    typedef typename space_type::int_type int_type;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int _count = seqs->seq_count;
    if (bid >= _count) return;

    assert(_count <= res->size); // sanity check: enough allocated space for results

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int tpb = blockDim.x * blockDim.y; // threads per block
    unsigned int bpg = gridDim.x * gridDim.y;   // blocks per grid
    unsigned int wpb = (tpb >> 5);
    unsigned int spg = wpb * bpg; // sequences/grid = sequences(warps)/block * blocks/grid

    assert((tpb & 31) == 0); // sanity check: all warps are full

    unsigned int lane_id = (tid & 31);
    unsigned int warp_id = (tid >> 5);

    unsigned int _width = seqs->seq_width;

    int_type* sptr = seqs->sequences;
    unsigned int* countptr = res->data;

    unsigned int max_seq_id = _count / wpb; // max_rounds = sequences * block/sequences
    max_seq_id += ((_count % wpb) ? 1 : 0); // would !!(_count % spg) be more efficient?
    max_seq_id *= wpb;

    unsigned int seq_id = bid * wpb + warp_id;

    while (seq_id < max_seq_id) { // blocks of the grid may terminate early; only the tail block may diverge
        unsigned int degree = 0;

        unsigned int end = (seq_id + 1) * _width;
        unsigned int idx = end - ((seq_id < _count) ? _width : 0); // true for all threads in a warp

        while (idx < end) { // all threads in a warp read the same bit block; no divergence
            int_type b = sptr[idx++];
            degree += ((b >> lane_id) & 1); // would !!(b & lane_mask), where lane_mask = (1 << lane_id), be more efficient?
        }
        __syncthreads(); // sync all warps

        // warp-level inclusive prefix sum of the per-lane bit counts
        for (unsigned int i = 1; i < 32; i <<= 1) {
            unsigned int d = __shfl_up(degree, i);
            degree += ((lane_id >= i) * d);
        }

        if (lane_id == 31 && (seq_id < _count)) {
            countptr[seq_id] = degree; // lane 31 holds the warp total
        }
        __syncthreads();

        seq_id += spg;
    }
}
__global__ void histogram(const SrcPtr src, ResType* hist, const MaskPtr mask, const int rows, const int cols) {
#if CV_CUDEV_ARCH >= 120
    __shared__ ResType smem[BIN_COUNT];

    const int y = blockIdx.x * blockDim.y + threadIdx.y;
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;

    for (int i = tid; i < BIN_COUNT; i += BLOCK_SIZE)
        smem[i] = 0;

    __syncthreads();

    if (y < rows) {
        for (int x = threadIdx.x; x < cols; x += blockDim.x) {
            if (mask(y, x)) {
                const uint data = src(y, x);
                atomicAdd(&smem[data % BIN_COUNT], 1);
            }
        }
    }

    __syncthreads();

    for (int i = tid; i < BIN_COUNT; i += BLOCK_SIZE) {
        const ResType histVal = smem[i];
        if (histVal > 0)
            atomicAdd(hist + i, histVal);
    }
#endif
}
/// Advance system by one time unit
GPUAPI void advance() {
    double hby2 = 0.5 * min(max_timestep, _params.time_step);

    // Step 1: Drift
    if (is_in_body_component_grid())
        drift_step(hby2);

    // Step 2: Kick
    if (is_in_body_component_grid_no_star())
        sys[b][c].vel() += hby2 * acc_bc;
    __syncthreads();

    // Step 3: Kepler drift (Keplerian orbit about sun/central body)
    if ((ij > 0) && (ij < nbod))
        drift_kepler(sys[ij][0].pos(), sys[ij][1].pos(), sys[ij][2].pos(),
                     sys[ij][0].vel(), sys[ij][1].vel(), sys[ij][2].vel(),
                     sqrtGM, 2.0 * hby2);
    __syncthreads();

    // TODO: check for close encounters here
    acc_bc = calcForces.acc_planets(ij, b, c);

    // Step 4: Kick
    if (is_in_body_component_grid_no_star())
        sys[b][c].vel() += hby2 * acc_bc;
    __syncthreads();

    // Step 5: Drift
    if (is_in_body_component_grid())
        drift_step(hby2);

    if (is_first_thread_in_system())
        sys.time() += 2.0 * hby2;
}
__global__ void kernel_reduce(const IndexType n, const ValueType* data, ValueType* out,
                              const IndexType GROUP_SIZE, const IndexType LOCAL_SIZE) {
    IndexType tid = threadIdx.x;

    __shared__ ValueType sdata[BLOCK_SIZE];
    sdata[tid] = ValueType(0.0);

    // get global id
    IndexType gid = GROUP_SIZE * blockIdx.x + tid;

    // each thread accumulates LOCAL_SIZE elements, strided by BLOCK_SIZE
    for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE)
        if (gid < n)
            sdata[tid] += data[gid];

    __syncthreads();

#pragma unroll
    for (IndexType i = BLOCK_SIZE / 2; i > 0; i /= 2) {
        if (tid < i)
            sdata[tid] += sdata[tid + i];
        __syncthreads();
    }

    if (tid == 0)
        out[blockIdx.x] = sdata[tid];
}
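// Hedged host-side sketch (assumed): each block consumes GROUP_SIZE =
// BLOCK_SIZE * LOCAL_SIZE consecutive elements, so the grid size and the two
// size parameters must be chosen together; BLOCK_SIZE is presumed to be a
// compile-time constant equal to the launch block size.
void launch_kernel_reduce(int n, const double* d_data, double* d_partials, int num_blocks) {
    const int local_size = (n + num_blocks * BLOCK_SIZE - 1) / (num_blocks * BLOCK_SIZE);
    const int group_size = BLOCK_SIZE * local_size;
    // IndexType/ValueType deduced as int/double from the arguments
    kernel_reduce<<<num_blocks, BLOCK_SIZE>>>(n, d_data, d_partials, group_size, local_size);
}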
__global__ void matMultParallelTiled(float* A, float* B, float* C, int n, int m, int o) {
    __shared__ float Ads[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Bds[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int row = by * TILE_WIDTH + ty;
    int col = bx * TILE_WIDTH + tx;

    float Pvalue = 0;
    for (int k = 0; k < (m + TILE_WIDTH - 1) / TILE_WIDTH; ++k) {
        // load one tile of A and one tile of B, zero-padding past the edges
        if (row < n && (k * TILE_WIDTH + tx) < m) {
            Ads[ty][tx] = A[row * m + k * TILE_WIDTH + tx];
        } else {
            Ads[ty][tx] = 0;
        }
        if ((k * TILE_WIDTH + ty) < m && col < o) {
            Bds[ty][tx] = B[(k * TILE_WIDTH + ty) * o + col];
        } else {
            Bds[ty][tx] = 0;
        }
        __syncthreads();

        for (int kk = 0; kk < TILE_WIDTH; ++kk) {
            Pvalue += Ads[ty][kk] * Bds[kk][tx];
        }
        __syncthreads();
    }

    if (row < n && col < o) {
        C[row * o + col] = Pvalue;
    }
}
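// Hedged launch sketch (assumed): A is n x m, B is m x o, C is n x o, all
// row-major; blocks are TILE_WIDTH x TILE_WIDTH, and the boundary checks in
// the kernel make sizes that are not tile multiples safe.
void launchMatMultParallelTiled(float* d_A, float* d_B, float* d_C, int n, int m, int o) {
    dim3 block(TILE_WIDTH, TILE_WIDTH);
    dim3 grid((o + TILE_WIDTH - 1) / TILE_WIDTH,  // x covers the columns of C
              (n + TILE_WIDTH - 1) / TILE_WIDTH); // y covers the rows of C
    matMultParallelTiled<<<grid, block>>>(d_A, d_B, d_C, n, m, o);
}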
__global__ void kernelCountParticles(PBox pb, uint64_cu* gCounter, Filter filter, Mapping mapper) {
    typedef typename PBox::FrameType FRAME;
    const uint32_t Dim = Mapping::Dim;

    __shared__ FRAME* frame;
    __shared__ bool isValid;
    __shared__ int counter;
    __shared__ lcellId_t particlesInSuperCell;

    __syncthreads(); /* wait until all shared memory is initialised */

    typedef typename Mapping::SuperCellSize SuperCellSize;
    const DataSpace<Dim> threadIndex(threadIdx);
    const int linearThreadIdx = DataSpaceOperations<Dim>::template map<SuperCellSize>(threadIndex);
    const DataSpace<Dim> superCellIdx(mapper.getSuperCellIndex(DataSpace<Dim>(blockIdx)));

    if (linearThreadIdx == 0) {
        frame = &(pb.getLastFrame(superCellIdx, isValid));
        particlesInSuperCell = pb.getSuperCell(superCellIdx).getSizeLastFrame();
        counter = 0;
    }
    __syncthreads();

    if (!isValid)
        return; // end kernel if we have no frames

    filter.setSuperCellPosition((superCellIdx - mapper.getGuardingSuperCells()) * mapper.getSuperCellSize());

    // walk the frame list backwards, counting particles that pass the filter
    while (isValid) {
        if (linearThreadIdx < particlesInSuperCell) {
            if (filter(*frame, linearThreadIdx))
                atomicAdd(&counter, 1);
        }
        __syncthreads();
        if (linearThreadIdx == 0) {
            frame = &(pb.getPreviousFrame(*frame, isValid));
            particlesInSuperCell = math::CT::volume<SuperCellSize>::type::value;
        }
        __syncthreads();
    }

    __syncthreads();
    if (linearThreadIdx == 0) {
        atomicAdd(gCounter, (uint64_cu)counter);
    }
}
__global__ void kernelCollideVoxelMapsDebug(Voxel* voxelmap, const uint32_t voxelmap_size,
                                            OtherVoxel* other_map, Collider collider, uint16_t* results) {
    //#define DISABLE_STORING_OF_COLLISIONS
    __shared__ uint16_t cache[cMAX_NR_OF_THREADS_PER_BLOCK];
    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
    uint32_t cache_index = threadIdx.x;
    cache[cache_index] = 0;

    while (i < voxelmap_size) {
        // todo / note: at the moment the collision check is only used for the
        // DYNAMIC and SWEPT VOLUME types; static is used for debugging
        const bool collision = collider.collide(voxelmap[i], other_map[i]);
        if (collision) // store collision info
        {
            //#ifndef DISABLE_STORING_OF_COLLISIONS
            //      other_map[i].occupancy = 255;
            //      other_map[i].voxeltype = eVT_COLLISION;
            //#endif
            cache[cache_index] += 1;
        }
        i += blockDim.x * gridDim.x;
    }

    // debug: print collision coordinates
    //  if (temp)
    //  {
    //    Vector3ui col_coord = mapToVoxels(voxelmap, dimensions, &(voxelmap[i]));
    //    printf("Collision at voxel (%u) = (%u, %u, %u). Memory addresses are %p and %p.\n",
    //           i, col_coord.x, col_coord.y, col_coord.z, (void*)&(voxelmap[i]), (void*)&(other_map[i]));
    //  }

    __syncthreads();

    // tree reduction of the per-thread collision counts in shared memory
    uint32_t j = blockDim.x / 2;
    while (j != 0) {
        if (cache_index < j) {
            cache[cache_index] = cache[cache_index] + cache[cache_index + j];
        }
        __syncthreads();
        j /= 2;
    }

    // copy results from this block to global memory
    if (cache_index == 0) {
        results[blockIdx.x] = cache[0];
    }
#undef DISABLE_STORING_OF_COLLISIONS
}
__global__ void kAggShortRows(const float* mat, float* matSum, const uint width, const uint height,
                              Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint shmemX = THREADS_X + 1;
    __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y * shmemX];

    const uint tidx = hipThreadIdx_y * THREADS_X + hipThreadIdx_x;
    const uint ty = LOOPS_X == 1 ? tidx / width : hipThreadIdx_y; // when loops == 1, width is smaller than the block x dim
    const uint tx = LOOPS_X == 1 ? tidx % width : hipThreadIdx_x;
    const uint bidx = hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x;
    const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y;

    float* shmemWrite = shmem + MUL24(ty, shmemX) + tx;
    matSum += blockRowIdx + tidx;
    //    shmem[MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x] = 0;
    mat += width * blockRowIdx + MUL24(ty, width) + tx;
    float* shmemWriteZeros = &shmem[MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x];

    bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y;

    if (blockRowIdx < height) {
#pragma unroll
        for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) {
            doAgg &= tidx + y + blockRowIdx < height;
            const bool heightIdxOK = ty < AGG_SHORT_ROWS_THREADS_Y && ty + y + blockRowIdx < height;

            shmemWriteZeros[0] = agg.getBaseValue();
            __syncthreads();
#pragma unroll
            for (uint x = 0; x < LOOPS_X * THREADS_X; x += THREADS_X) {
                //                __syncthreads();
                if (heightIdxOK && x + tx < width) {
                    shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]);
                }
            }
            __syncthreads();
            if (doAgg) {
                /*
                 * I tried doing this final sum as a 4-step reduction, with 8 threads
                 * per warp participating. It was slightly slower.
                 */
                float accum = agg.getBaseValue();
                float* shmemRead = shmem + MUL24(tidx, shmemX);
                // this loops too much if the rows are really short :(
#pragma unroll
                for (uint i = 0; i < THREADS_X; i++) {
                    accum = agg(accum, shmemRead[0]);
                    shmemRead++;
                }
                matSum[0] = bop(matSum[0], accum);
                matSum += AGG_SHORT_ROWS_THREADS_Y;
            }
            __syncthreads();
            mat += width * AGG_SHORT_ROWS_THREADS_Y;
        }
    }
}
/**
 * (min,+) matrix product on the device: C(i,j) = min_k (A(i,k) + B(k,j)), column-major.
 * wA is A's and B's width.
 * Each block uses 2 * 16 * 16 * 4 = 2048 bytes of shared memory (BLOCK_SIZE=16, sizeof(WORD)=4).
 * nIt is at most BLOCK_DIM/2 but does not affect the amount of shared memory used
 * (the arrays declared inside the loop reuse the same static allocation).
 * Each multiprocessor can execute at most 8 blocks simultaneously (due to shared memory constraints).
 */
__global__ void matrixMul(float* C, float* A, float* B, int wA,
                          int sCx, int sCy, int sAx, int sAy, int sBx, int sBy, int add) {
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Remember... column-major
    int sa = sAx * WA + sAy;
    int sb = sBx * WA + sBy;
    int sc = sCx * WA + sCy;

    int ba = BLOCK_SIZE * by;      // y-offset
    int bb = WA * BLOCK_SIZE * bx; // x-offset

    float min = FLOATINF;
    int nIt = wA / BLOCK_SIZE; // number of blocks in one dimension

    // Do block multiplication to update the C(i,j) block:
    // C(i,j) = min( A(i,1)+B(1,j), A(i,2)+B(2,j), ..., A(i,n)+B(n,j) )
    for (int m = 0; m < nIt; ++m) {
        __shared__ float As[BLOCK_SIZE * BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE * BLOCK_SIZE];

        // load one element each
        As[tx * BLOCK_SIZE + ty] = A[sa + ba + m * BLOCK_SIZE * WA + tx * WA + ty];
        Bs[tx * BLOCK_SIZE + ty] = B[sb + bb + m * BLOCK_SIZE + tx * WA + ty];
        __syncthreads();

        for (int k = 0; k < BLOCK_SIZE; ++k) {
            float a = As[k * BLOCK_SIZE + tx]; // (tx)th row
            float b = Bs[ty * BLOCK_SIZE + k]; // (ty)th column
            min = fminf(a + b, min);
        }
        __syncthreads();
    }

    // Write the block sub-matrix to device memory; each thread writes one element
    if (add)
        C[sc + ba + bb + ty * WA + tx] = fminf(C[sc + ba + bb + ty * WA + tx], min);
    else
        C[sc + ba + bb + ty * WA + tx] = min; // (tx,ty)th element
}
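// Hedged launch sketch (assumed): the whole-matrix case with all sub-matrix
// offsets zero and wA == WA; the grid tiles the WA x WA result with
// BLOCK_SIZE x BLOCK_SIZE blocks. Pass add != 0 to fold the new minima into
// the existing C instead of overwriting it.
void launchMatrixMul(float* d_C, float* d_A, float* d_B, int wA, int add) {
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(wA / BLOCK_SIZE, wA / BLOCK_SIZE); // assumes wA is a multiple of BLOCK_SIZE
    matrixMul<<<grid, block>>>(d_C, d_A, d_B, wA, 0, 0, 0, 0, 0, 0, add);
}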
/*
 * Run the complete algorithm for computing acceleration and
 * jerk on all bodies. This is tightly coupled with the
 * BPPT integrators. ij, b and c are calculated from the thread id.
 *
 * If you need to calculate only the acceleration, use the \ref acc function
 * instead.
 *
 * @ij   the pair number for this thread
 * @b    the planet number for this thread
 * @c    coordinate number x:0, y:1, z:2
 * @pos  position for this planet's coordinate
 * @vel  velocity for this planet's coordinate
 * @acc  output variable to hold the acceleration
 * @jerk output variable to hold the jerk
 */
GPUAPI void operator()(int ij, int b, int c, double& pos, double& vel, double& acc, double& jerk) const {
    // Write positions and velocities to shared (global) memory
    if (b < nbod && c < 3)
        sys[b][c].pos() = pos, sys[b][c].vel() = vel;
    __syncthreads();

    if (ij < pair_count)
        calc_pair(ij);
    __syncthreads();

    if (b < nbod && c < 3) {
        sum(b, c, acc, jerk);
    }
}
__global__ void reduce(Src src, const uint32_t src_count, Dest dest, Functor func, Functor2 func2) {
    const uint32_t l_tid = threadIdx.x;
    const uint32_t tid = blockIdx.x * blockDim.x + l_tid;
    const uint32_t globalThreadCount = gridDim.x * blockDim.x;

    /* CUDA cannot handle extern shared memory where the type is
     * defined by a template
     * - therefore we use type int for the definition (dirty but OK) */
    extern __shared__ int s_mem_extern[];
    /* create a pointer with the right type */
    Type* s_mem = (Type*)s_mem_extern;

    if (tid >= src_count)
        return; /* end threads that are not needed */

    __syncthreads(); /* wait until all shared memory is initialized */

    /* fill shared mem */
    Type r_value = src[tid];
    /* fold global memory that has not been read yet into the register value */
    uint32_t i = tid + globalThreadCount;
    while (i < src_count) {
        func(r_value, src[i]);
        i += globalThreadCount;
    }
    s_mem[l_tid] = r_value;

    __syncthreads();

    /* now reduce shared memory */
    uint32_t chunk_count = blockDim.x;
    uint32_t active_threads;

    while (chunk_count != 1) {
        const float half_threads = (float)chunk_count / 2.0f;
        active_threads = float2uint(half_threads);
        if (threadIdx.x != 0 && l_tid >= active_threads)
            return; /* end threads that are not needed */

        chunk_count = ceilf(half_threads);

        func(s_mem[l_tid], s_mem[l_tid + chunk_count]);

        __syncthreads();
    }

    func2(dest[blockIdx.x], s_mem[0]);
}
__device__ void stop() {
    if (intervall >= 0 && intervall < numberOfIntervalls) {
        unsigned long long diff = clock64() - currentTime;
        __syncthreads();
        atomicAdd(&counter[intervall].avg, diff);
        atomicMin(&counter[intervall].min, diff);
        atomicMax(&counter[intervall].max, diff);
        atomicAdd(&counter[intervall].iter, 1);
        intervall++;
        __syncthreads();
        currentTime = clock64();
    }
}
__device__ inline unsigned int computeNumSmallerEigenvalsLarge(const NumericT* g_d, const NumericT* g_s,
                                                               const unsigned int n, const NumericT x,
                                                               const unsigned int tid,
                                                               const unsigned int num_intervals_active,
                                                               NumericT* s_d, NumericT* s_s,
                                                               unsigned int converged) {
    NumericT delta = 1.0f;
    unsigned int count = 0;

    unsigned int rem = n;

    // iterate until the whole diagonal and superdiagonal have been loaded and processed
    for (unsigned int i = 0; i < n; i += blockDim.x) {
        __syncthreads();

        // read new chunk of data into shared memory
        if ((i + threadIdx.x) < n) {
            s_d[threadIdx.x] = *(g_d + i + threadIdx.x);
            s_s[threadIdx.x] = *(g_s + i + threadIdx.x - 1);
        }

        __syncthreads();

        if (tid < num_intervals_active) {
            // perform (optimized) Gaussian elimination to determine the number
            // of eigenvalues that are smaller than x
            for (unsigned int k = 0; k < min(rem, blockDim.x); ++k) {
                delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
                // delta = (abs(delta) < (1.0e-10)) ? -(1.0e-10) : delta;
                count += (delta < 0) ? 1 : 0;
            }
        } // end if thread currently processing an interval

        rem -= blockDim.x;
    }

    return count;
}
/*! \pre This thread block shall be converged.
 */
__device__ void construct_owner() {
    __shared__ int owner;

    __syncthreads();

    if (threadIdx.x == 0) {
        m_owner = &owner;
    }

    __syncthreads();

    disown_and_synchronize();
}
template <typename T1>
__device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid) {
    if (glob_tid < len)
        smem[glob_tid] = vec1[glob_tid];
    __syncthreads();

    U* vec1ValsPtr = vec1Vals;

#pragma unroll
    for (int i = tid; i < MAX_LEN; i += THREAD_DIM)
        *vec1ValsPtr++ = smem[i];

    __syncthreads();
}
__global__ void sequence_hamming_weight_kernel(device_sequence_space<IntType>* seqs,
                                               basic_data_space<unsigned int>* res,
                                               clotho::utility::algo_version<5>* v) {
    assert(blockDim.x == 32);
    assert(blockDim.y <= 32);

    __shared__ unsigned int buffer[32];

    if (threadIdx.y == 0) {
        buffer[threadIdx.x] = 0;
    }
    __syncthreads();

    const unsigned int WIDTH = seqs->seq_width;
    IntType* seq_ptr = seqs->sequences;

    unsigned int N = 0;

    unsigned int seq_idx = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int seq_begin = seq_idx * WIDTH + threadIdx.y;
    unsigned int seq_end = (seq_idx + 1) * WIDTH;

    // each lane counts the bits at its position across the sequence's blocks
    while (seq_begin < seq_end) {
        IntType x = seq_ptr[seq_begin];
        N += ((x >> threadIdx.x) & 1);
        seq_begin += blockDim.y;
    }
    __syncthreads();

    // warp-level inclusive scan; lane 31 ends up with the warp total
    for (unsigned int i = 1; i < 32; i <<= 1) {
        unsigned int t = __shfl_up(N, i);
        N += ((unsigned int)(threadIdx.x >= i) * t);
    }

    if (threadIdx.x == 31) {
        buffer[threadIdx.y] = N;
    }
    __syncthreads();

    // second scan combines the per-warp totals (unused slots stay zero)
    N = buffer[threadIdx.x];
    __syncthreads();

    for (unsigned int i = 1; i < 32; i <<= 1) {
        unsigned int t = __shfl_up(N, i);
        N += ((unsigned int)(threadIdx.x >= i) * t);
    }

    if (threadIdx.y == 0 && threadIdx.x == 31) {
        res->data[seq_idx] = N;
    }
    __syncthreads();
}
__global__ void reduce(Src src, const uint32_t src_count, Dest dest, Functor func, Functor2 func2) {
    const uint32_t localId = threadIdx.x;
    const uint32_t tid = blockIdx.x * blockDim.x + localId;
    const uint32_t globalThreadCount = gridDim.x * blockDim.x;

    /* CUDA cannot handle extern shared memory where the type is
     * defined by a template
     * - therefore we use type int for the definition (dirty but OK) */
    extern __shared__ int s_mem_extern[];
    /* create a pointer with the right type */
    Type* s_mem = (Type*)s_mem_extern;

    if (tid >= src_count)
        return; /* end threads that are not needed */

    /* fill shared mem */
    Type r_value = src[tid];
    /* fold global memory that has not been read yet into the register value */
    uint32_t i = tid + globalThreadCount;
    while (i < src_count) {
        func(r_value, src[i]);
        i += globalThreadCount;
    }
    s_mem[localId] = r_value;

    __syncthreads();

    /* now reduce shared memory */
    uint32_t chunk_count = blockDim.x;

    while (chunk_count != 1) {
        /* halve the number of chunks (rounded down) */
        uint32_t active_threads = chunk_count / 2;
        if (localId >= active_threads)
            return; /* end threads that are not needed */

        /* the new chunk count is half the old one, rounded up for odd counts
         * --> localId == 0 reduces the leftover single element at the end
         *     when the number of values is odd */
        chunk_count = (chunk_count + 1) / 2;

        func(s_mem[localId], s_mem[localId + chunk_count]);

        __syncthreads();
    }

    func2(dest[blockIdx.x], s_mem[0]);
}