__global__ void yourHisto(const unsigned int* const vals, //INPUT
                          unsigned int* const histo,      //OUTPUT
                          int size)
{
    __shared__ unsigned int temp[1024];
    temp[threadIdx.x + 0]   = 0;
    temp[threadIdx.x + 256] = 0;
    temp[threadIdx.x + 512] = 0;
    temp[threadIdx.x + 768] = 0;
    __syncthreads();

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    while (i < size)
    {
        atomicAdd(&temp[vals[i]], 1);
        i += offset;
    }
    __syncthreads();

    atomicAdd(&(histo[threadIdx.x + 0]),   temp[threadIdx.x + 0]);
    atomicAdd(&(histo[threadIdx.x + 256]), temp[threadIdx.x + 256]);
    atomicAdd(&(histo[threadIdx.x + 512]), temp[threadIdx.x + 512]);
    atomicAdd(&(histo[threadIdx.x + 768]), temp[threadIdx.x + 768]);
}
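The kernel above implicitly assumes a block of exactly 256 threads and exactly 1024 bins. A minimal, hypothetical launch sketch under those assumptions follows; d_vals, d_histo, and numElems are placeholder device pointers and sizes, and the grid size is only a tuning choice.

// Hypothetical launch sketch: 256 threads per block is required by the 4x256
// zeroing and flush loops above; the grid size is not dictated by the kernel.
const int threads = 256;
const int blocks  = 64;
cudaMemset(d_histo, 0, 1024 * sizeof(unsigned int));       // histogram must start at zero
yourHisto<<<blocks, threads>>>(d_vals, d_histo, numElems);
cudaDeviceSynchronize();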
DINLINE void addParticle(
    T_Acc const & acc,
    const floatD_X position,
    const float3_X momentum,
    const float_X weighting)
{
    nvidia::atomicAllInc(acc, &this->numMacroParticles, ::alpaka::hierarchy::Threads{});
    atomicAdd(&this->numRealParticles, weighting, ::alpaka::hierarchy::Threads{});

    if (this->splittingStage == VoronoiSplittingStage::position)
    {
        const floatD_X position2 = position * position;
        for (int i = 0; i < simDim; i++)
        {
            atomicAdd(&this->meanValue[i], weighting * position[i], ::alpaka::hierarchy::Threads{});
            atomicAdd(&this->meanSquaredValue[i], weighting * position2[i], ::alpaka::hierarchy::Threads{});
        }
    }
    else
    {
        const float3_X momentum2 = momentum * momentum;
        for (int i = 0; i < DIM3; i++)
        {
            atomicAdd(&this->meanValue[i], weighting * momentum[i], ::alpaka::hierarchy::Threads{});
            atomicAdd(&this->meanSquaredValue[i], weighting * momentum2[i], ::alpaka::hierarchy::Threads{});
        }
    }
}
inline void splat(Vec2u pixel, Vec3f w)
{
    uint32 idx = pixel.x() + pixel.y() * _w;
    atomicAdd(_buffer[idx].x(), w.x());
    atomicAdd(_buffer[idx].y(), w.y());
    atomicAdd(_buffer[idx].z(), w.z());
}
__global__ void histogram(const SrcPtr src, ResType* hist, const MaskPtr mask, const int rows, const int cols)
{
#if CV_CUDEV_ARCH >= 120
    __shared__ ResType smem[BIN_COUNT];

    const int y = blockIdx.x * blockDim.y + threadIdx.y;
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;

    for (int i = tid; i < BIN_COUNT; i += BLOCK_SIZE)
        smem[i] = 0;

    __syncthreads();

    if (y < rows)
    {
        for (int x = threadIdx.x; x < cols; x += blockDim.x)
        {
            if (mask(y, x))
            {
                const uint data = src(y, x);
                atomicAdd(&smem[data % BIN_COUNT], 1);
            }
        }
    }

    __syncthreads();

    for (int i = tid; i < BIN_COUNT; i += BLOCK_SIZE)
    {
        const ResType histVal = smem[i];
        if (histVal > 0)
            atomicAdd(hist + i, histVal);
    }
#endif
}
__device__ __forceinline__ static void add(R* ptr, val_type val)
{
    atomicAdd(ptr,     val.x);
    atomicAdd(ptr + 1, val.y);
    atomicAdd(ptr + 2, val.z);
    atomicAdd(ptr + 3, val.w);
}
template<>
__device__ cdouble atomicAdd<cdouble>(cdouble *ptr, cdouble val)
{
    double *fptr = (double *)(ptr);
    cdouble res;
    res.x = atomicAdd(fptr + 0, val.x);
    res.y = atomicAdd(fptr + 1, val.y);
    return res;
}
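A minimal sketch of how such an overload might be used, assuming cdouble is a two-double struct along the lines of cuDoubleComplex, that the overload above is in scope, and that the device supports double-precision atomicAdd (or an equivalent CAS fallback); sumComplex, vals, sum, and n are illustrative names only.

// Hypothetical kernel: accumulates n complex values into *sum via the cdouble atomicAdd above.
// Note: each component is updated atomically, but the pair as a whole is not one atomic operation.
__global__ void sumComplex(const cdouble* vals, cdouble* sum, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        atomicAdd(sum, vals[i]);  // real and imaginary parts added with separate atomics
}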
void kernel(int iter, int batch_id)
{
    // Assume mode 0 (SGD)
    double init_step_size = 0.5;
    double step_size = init_step_size * pow(100.0 + (iter - 1), -0.5);

    int batch_start = batch_id * batch_size;
    int batch_end = batch_start + batch_size;
    if (batch_end > no_of_nodes)
    {
        batch_end = no_of_nodes;
    }

#pragma omp parallel for
    for (int vid = batch_start; vid < batch_end; vid++)
    {
        // Stack variables
        double Li_curr[K_count];
        double Rj_curr[K_count];
        double Li_update[K_count];
        double Rj_update[K_count];

        int row_index = x_row_array[vid];
        int column_index = x_col_array[vid];
        double Xij = x_val_array[vid];

        for (int k = 0; k < K_count; ++k)
        {
            Li_curr[k] = L_table[(row_index * K_count) + k];
            if (stale_mode == 0)
            {
                Rj_curr[k] = R_table[(column_index * K_count) + k];
            }
            else
            {
                Rj_curr[k] = R_table_ind[(vid * K_count) + k];
            }
        }

        double LiRj = 0.0;
        for (int k = 0; k < K_count; ++k)
        {
            LiRj = LiRj + (Li_curr[k] * Rj_curr[k]);
        }

        for (int k = 0; k < K_count; ++k)
        {
            double gradient = 0.0;
            double Li_value = Li_curr[k];
            double Rj_value = Rj_curr[k];

            gradient = (-2 * Xij * Rj_value) + (2 * LiRj * Rj_value);
            Li_update[k] = -gradient * step_size;

            gradient = (-2 * Xij * Li_value) + (2 * LiRj * Li_value);
            Rj_update[k] = -gradient * step_size;
        }

        // Commit updates
        for (int k = 0; k < K_count; ++k)
        {
            atomicAdd(&(L_table[(row_index * K_count) + k]), Li_update[k]); // Replaced by non-blocking write to original weight
            atomicAdd(&(R_table[(column_index * K_count) + k]), Rj_update[k]);
        }

        // The loss function at X(i,j) is ( X(i,j) - L(i,:)*R(:,j) )^2.
        atomicAdd(&loss_sum, (double)pow(Xij - LiRj, 2));
    }
}
__global__ void kernelCountParticles(PBox pb, uint64_cu* gCounter, Filter filter, Mapping mapper)
{
    typedef typename PBox::FrameType FRAME;
    const uint32_t Dim = Mapping::Dim;

    __shared__ FRAME *frame;
    __shared__ bool isValid;
    __shared__ int counter;
    __shared__ lcellId_t particlesInSuperCell;

    __syncthreads(); /* wait until all shared memory is initialised */

    typedef typename Mapping::SuperCellSize SuperCellSize;

    const DataSpace<Dim> threadIndex(threadIdx);
    const int linearThreadIdx = DataSpaceOperations<Dim>::template map<SuperCellSize>(threadIndex);
    const DataSpace<Dim> superCellIdx(mapper.getSuperCellIndex(DataSpace<Dim>(blockIdx)));

    if (linearThreadIdx == 0)
    {
        frame = &(pb.getLastFrame(superCellIdx, isValid));
        particlesInSuperCell = pb.getSuperCell(superCellIdx).getSizeLastFrame();
        counter = 0;
    }
    __syncthreads();

    if (!isValid)
        return; // end kernel if we have no frames

    filter.setSuperCellPosition((superCellIdx - mapper.getGuardingSuperCells()) * mapper.getSuperCellSize());

    while (isValid)
    {
        if (linearThreadIdx < particlesInSuperCell)
        {
            if (filter(*frame, linearThreadIdx))
                atomicAdd(&counter, 1);
        }
        __syncthreads();
        if (linearThreadIdx == 0)
        {
            frame = &(pb.getPreviousFrame(*frame, isValid));
            particlesInSuperCell = math::CT::volume<SuperCellSize>::type::value;
        }
        __syncthreads();
    }

    __syncthreads();
    if (linearThreadIdx == 0)
    {
        atomicAdd(gCounter, (uint64_cu) counter);
    }
}
__device__ void stop()
{
    if (intervall >= 0 && intervall < numberOfIntervalls)
    {
        unsigned long long diff = clock64() - currentTime;
        __syncthreads();
        atomicAdd(&counter[intervall].avg, diff);
        atomicMin(&counter[intervall].min, diff);
        atomicMax(&counter[intervall].max, diff);
        atomicAdd(&counter[intervall].iter, 1);
        intervall++;
        __syncthreads();
        currentTime = clock64();
    }
}
__global__ void gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block, int pshift)
{
    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    const unsigned int warp_num = hipThreadIdx_x >> pshift;
#ifdef __HIP_PLATFORM_HCC__
    atomicAdd(&device_ballot[warp_num + hipBlockIdx_x * Num_Warps_per_Block], __popcll(__ballot(tid - 245)));
#else
    atomicAdd(&device_ballot[warp_num + hipBlockIdx_x * Num_Warps_per_Block], __popc(__ballot(tid - 245)));
#endif
}
/**
 * Allocate at least the specified size in bytes.
 * Align to MallocAlignBytes (16) bytes.
 *
 * gc means garbage collector.
 *
 * @return address for allocated memory chunk
 */
__device__ int org_trifort_gc_malloc_no_fail(int size)
{
    int const lastAddress = atomicAdd(&global_free_pointer,
                                      padNumberTo(size, MallocAlignBytes) / MallocAlignBytes);
    return lastAddress;
}
// Request other attached threads that are not at safe points to park themselves on safepoints.
bool parkOthers()
{
    ASSERT(ThreadState::current()->isAtSafePoint());

    // Lock threadAttachMutex() to prevent threads from attaching.
    threadAttachMutex().lock();

    ThreadState::AttachedThreadStateSet& threads = ThreadState::attachedThreads();

    MutexLocker locker(m_mutex);
    atomicAdd(&m_unparkedThreadCount, threads.size());
    releaseStore(&m_canResume, 0);

    ThreadState* current = ThreadState::current();
    for (ThreadState* state : threads) {
        if (state == current)
            continue;

        for (ThreadState::Interruptor* interruptor : state->interruptors())
            interruptor->requestInterrupt();
    }

    while (acquireLoad(&m_unparkedThreadCount) > 0) {
        double expirationTime = currentTime() + lockingTimeout();
        if (!m_parked.timedWait(m_mutex, expirationTime)) {
            // One of the other threads did not return to a safepoint within the maximum
            // time we allow for threads to be parked. Abandon the GC and resume the
            // currently parked threads.
            resumeOthers(true);
            return false;
        }
    }
    return true;
}
CUGIP_DECL_DEVICE int append(const TType &aItem)
{
    int index = atomicAdd(mSize.get(), 1);
    mData[index] = aItem;
    return index;
}
static __global__ void histogramKernel(Param<outType> out, CParam<inType> in,
                                       int len, int nbins, float minval, float maxval, int nBBS)
{
    SharedMemory<outType> shared;
    outType * shrdMem = shared.getPointer();

    // offset input and output to account for batch ops
    unsigned b2 = blockIdx.x / nBBS;
    const inType *iptr = in.ptr + b2 * in.strides[2] + blockIdx.y * in.strides[3];
    outType *optr = out.ptr + b2 * out.strides[2] + blockIdx.y * out.strides[3];

    int start = (blockIdx.x - b2 * nBBS) * THRD_LOAD * blockDim.x + threadIdx.x;
    int end = minimum((start + THRD_LOAD * blockDim.x), len);
    float step = (maxval - minval) / (float)nbins;

    // If nbins exceeds the shared-memory bin limit, fall back to atomicAdd on global memory
    bool use_global = nbins > MAX_BINS;

    // Skip shared-memory initialization when binning directly into global memory
    if (!use_global) {
        for (int i = threadIdx.x; i < nbins; i += blockDim.x)
            shrdMem[i] = 0;
        __syncthreads();
    }

    for (int row = start; row < end; row += blockDim.x) {
        int idx = isLinear ? row : ((row % in.dims[0]) + (row / in.dims[0]) * in.strides[1]);
        int bin = (int)((iptr[idx] - minval) / step);
        bin = (bin < 0) ? 0 : bin;
        bin = (bin >= nbins) ? (nbins - 1) : bin;

        if (use_global) {
            atomicAdd((optr + bin), 1);
        } else {
            atomicAdd((shrdMem + bin), 1);
        }
    }

    // Shared-memory bins only need to be flushed to global memory when they were used
    if (!use_global) {
        __syncthreads();
        for (int i = threadIdx.x; i < nbins; i += blockDim.x) {
            atomicAdd((optr + i), shrdMem[i]);
        }
    }
}
EPP_DEVICE void calcOffset(SliceIndex *slice_index, unsigned int _index, unsigned int *current_top)
{
    slice_index[_index].offset = atomicAdd(current_top, slice_index[_index].temp_offset);
}
HDINLINE void BitData<TYPE, NUMBITS>::operator+=(const TYPE &rhs)
{
#if !defined(__CUDA_ARCH__) // Host code path
    *(this->data) += (rhs << this->bit);
#else
    atomicAdd(this->data, rhs << this->bit);
#endif
}
/**
 * Adds val to the stack in an atomic operation.
 *
 * @param val data of type VALUE to add to the stack
 */
HDINLINE void push(VALUE val)
{
#if !defined(__CUDA_ARCH__) // Host code path
    TYPE old_addr = (*currentSize)++;
#else
    TYPE old_addr = atomicAdd(currentSize, 1);
#endif
    (*this)[old_addr] = val;
}
__device__ __forceinline__ void finalise()
{
#ifdef __CUDACC__
    int lane = threadIdx.x % warpSize;
    count = warpReduceSum(count);
    if (lane == 0)
        atomicAdd(&frontier, count);
#endif
}
__device__ uint32_t get_id()
{
    /* Equivalent to:
     *
     *   value = *id_;
     *   *id_ += 1;
     */
    uint32_t value = atomicAdd(id_, 1);
    return value;
}
__device__ int org_trifort_gc_malloc_no_fail(int size)
{
    if (size % 16 != 0) {
        size += (16 - (size % 16));
    }
    size >>= 4;
    int ret;
    ret = atomicAdd(&global_free_pointer, size);
    return ret;
}
__global__ void extract_orb(
    unsigned* desc_out,
    const unsigned n_feat,
    float* x_in_out,
    float* y_in_out,
    const float* ori_in,
    float* size_out,
    CParam<T> image,
    const float scl,
    const unsigned patch_size)
{
    unsigned f = blockDim.x * blockIdx.x + threadIdx.x;

    if (f < n_feat) {
        unsigned x = (unsigned)round(x_in_out[f]);
        unsigned y = (unsigned)round(y_in_out[f]);
        float ori = ori_in[f];
        unsigned size = patch_size;

        unsigned r = ceil(patch_size * sqrt(2.f) / 2.f);
        if (x < r || y < r || x >= image.dims[1] - r || y >= image.dims[0] - r)
            return;

        // Descriptor fixed at 256 bits for now
        // Storing descriptor as a vector of 8 x 32-bit unsigned numbers
        for (unsigned i = threadIdx.y; i < 16; i += blockDim.y) {
            unsigned v = 0;

            // j < 16 for 256 bits descriptor
            for (unsigned j = 0; j < 16; j++) {
                // Get position from distribution pattern and values of points p1 and p2
                int dist_x = d_ref_pat[i*16*4 + j*4];
                int dist_y = d_ref_pat[i*16*4 + j*4+1];
                T p1 = get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size);

                dist_x = d_ref_pat[i*16*4 + j*4+2];
                dist_y = d_ref_pat[i*16*4 + j*4+3];
                T p2 = get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size);

                // Calculate bit based on p1 and p2 and shift it to the correct position
                v |= (p1 < p2) << (j + 16*(i % 2));
            }

            // Store 16 bits of the descriptor
            atomicAdd(&desc_out[f * 8 + i/2], v);
        }

        if (threadIdx.y == 0) {
            x_in_out[f] = round(x * scl);
            y_in_out[f] = round(y * scl);
            size_out[f] = patch_size * scl;
        }
    }
}
/**
 * Increases the size of the stack by count elements in an atomic operation.
 *
 * @param count number of elements to increase the stack by
 * @return a TileDataBox of size count pointing to the new stack elements
 */
HDINLINE TileDataBox<VALUE> pushN(TYPE count)
{
#if !defined(__CUDA_ARCH__) // Host code path
    TYPE old_addr = (*currentSize);
    *currentSize += count;
#else
    TYPE old_addr = atomicAdd(currentSize, count);
#endif
    return TileDataBox<VALUE>(this->fixedPointer, DataSpace<DIM1>(old_addr));
}
__device__ long long edu_syr_pcpratts_gc_malloc_no_fail(char * gc_info, long long size)
{
    unsigned long long * addr = (unsigned long long *)(gc_info + TO_SPACE_FREE_POINTER_OFFSET);
    size += 8;
    long long ret;
    ret = atomicAdd(addr, (unsigned long long) size);
    int mod = ret % 8;
    if (mod != 0)
        ret += mod;
    return ret;
}
__global__ void yourHisto(const unsigned int* const vals, //INPUT
                          unsigned int* const histo,      //OUTPUT
                          int vals_size,
                          int numBins)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Ceiling division so the last partial chunk of bins is still zeroed and flushed
    int startingPositionsPerThread = (numBins + threadsPerBlock - 1) / threadsPerBlock;
    int offset = blockDim.x * gridDim.x;

    extern __shared__ unsigned int local[];

    for (int i = 0; i < startingPositionsPerThread && threadIdx.x + i*threadsPerBlock < numBins; i++)
        local[threadIdx.x + i*threadsPerBlock] = 0;
    __syncthreads();

    for (int i = tid; i < vals_size; i += offset)
    {
        atomicAdd(&local[vals[i]], 1);
    }
    __syncthreads();

    for (int i = 0; i < startingPositionsPerThread && threadIdx.x + i*threadsPerBlock < numBins; i++)
        atomicAdd(&histo[threadIdx.x + i*threadsPerBlock], local[threadIdx.x + i*threadsPerBlock]);
}
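Because this variant takes the bin count at run time, the launch must supply matching dynamic shared memory. A hypothetical host-side sketch, assuming threadsPerBlock is the same constant the kernel compiles against and that d_vals, d_histo, numElems, and numBins are placeholder values:

// Hypothetical launch sketch: the third launch parameter sizes the extern __shared__ array.
const size_t smemBytes = numBins * sizeof(unsigned int);
cudaMemset(d_histo, 0, numBins * sizeof(unsigned int));    // output bins start at zero
yourHisto<<<64, threadsPerBlock, smemBytes>>>(d_vals, d_histo, numElems, numBins);
cudaDeviceSynchronize();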
CUGIP_DECL_DEVICE bool tryPullPush(GraphCutData<TFlow> &aGraph, int aFrom, int aTo, int aConnectionIndex, bool aConnectionSide)
{
    EdgeResidualsRecord<TFlow> &edge = aGraph.residuals(aConnectionIndex);
    TFlow residual = edge.getResidual(aConnectionSide);
    TFlow flow = tryPull(aGraph, aFrom, aTo, residual);
    if (flow > 0.0f) {
        //printf("try pull push %d %d -> %d %d %f - residual %f\n", aGraph.label(aFrom), aFrom, aGraph.label(aTo), aTo, flow, residual);
        atomicAdd(&(aGraph.excess(aTo)), flow);
        edge.getResidual(aConnectionSide) -= flow;
        edge.getResidual(!aConnectionSide) += flow;
        return true;
    }
    //printf("failed pull push %d %d -> %d %d %f - residual %f\n", aGraph.label(aFrom), aFrom, aGraph.label(aTo), aTo, flow, residual);
    return false;
}
__global__ void HIP_FUNCTION(testKernel, int *g_odata)
{
    // access thread id
    const unsigned int tid = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;

    // Test various atomic instructions

    // Arithmetic atomic instructions

    // Atomic addition
    atomicAdd(&g_odata[0], 10);

    // Atomic subtraction (final should be 0)
    atomicSub(&g_odata[1], 10);

    // Atomic exchange
    atomicExch(&g_odata[2], tid);

    // Atomic maximum
    atomicMax(&g_odata[3], tid);

    // Atomic minimum
    atomicMin(&g_odata[4], tid);

    // Atomic increment (modulo 17+1)
    //atomicInc((unsigned int *)&g_odata[5], 17);
    atomicInc((unsigned int *)&g_odata[5]);

    // Atomic decrement
    //atomicDec((unsigned int *)&g_odata[6], 137);
    atomicDec((unsigned int *)&g_odata[6]);

    // Atomic compare-and-swap
    atomicCAS(&g_odata[7], tid - 1, tid);

    // Bitwise atomic instructions

    // Atomic AND
    atomicAnd(&g_odata[8], 2 * tid + 7);

    // Atomic OR
    atomicOr(&g_odata[9], 1 << tid);

    // Atomic XOR
    atomicXor(&g_odata[10], tid);
}
int addObject (int occlusion, vec4 color, int lastSegmentOffset, coherent int *ptrCell)
{
    //Get stream size and previous node offset
    //Store new stream size and current node offset
    int nodeOffset = atomicAdd( ptrInfo + INFO_COUNTER_STREAMLEN, 5 );
    int prevOffset = atomicExchange( ptrCell + CELL_COUNTER_PREV, nodeOffset );
    coherent float *ptrNode = ptrStream + nodeOffset;

    //Store object data
    ptrNode[ 0 ] = (float) NODE_TYPE_OBJECT;
    ptrNode[ 1 ] = (float) prevOffset;
    ptrNode[ 2 ] = (float) objectId;
    ptrNode[ 3 ] = (float) occlusion;
    ptrNode[ 4 ] = (float) lastSegmentOffset;

    return nodeOffset;
}
int addLine (vec2 l0, vec2 l1, coherent int *ptrObjCell)
{
    //Get stream size and previous node offset
    //Store new stream size and current node offset
    int nodeOffset = atomicAdd( ptrInfo + INFO_COUNTER_STREAMLEN, 6 );
    int prevOffset = atomicExchange( ptrObjCell + OBJCELL_COUNTER_PREV, nodeOffset );
    coherent float *ptrNode = ptrStream + nodeOffset;

    //Store line data
    ptrNode[ 0 ] = (float) NODE_TYPE_LINE;
    ptrNode[ 1 ] = (float) prevOffset;
    ptrNode[ 2 ] = l0.x;
    ptrNode[ 3 ] = l0.y;
    ptrNode[ 4 ] = l1.x;
    ptrNode[ 5 ] = l1.y;

    return nodeOffset;
}
__host__ __device__
typename enable_if<
    sizeof(Integer64) == 8,
    Integer64
>::type
atomic_fetch_add(Integer64 *x, Integer64 y)
{
#if defined(__CUDA_ARCH__)
    return atomicAdd(x, y);
#elif defined(__GNUC__)
    return __atomic_fetch_add(x, y, __ATOMIC_SEQ_CST);
#elif defined(_MSC_VER)
    return InterlockedExchangeAdd64(x, y);
#elif defined(__clang__)
    return __c11_atomic_fetch_add(x, y);
#else
#error "No atomic_fetch_add implementation."
#endif
}
__host__ __device__
typename enable_if<
    sizeof(Integer64) == 8,
    Integer64
>::type
atomic_load(const Integer64 *x)
{
#if defined(__CUDA_ARCH__)
    return atomicAdd(const_cast<Integer64*>(x), Integer64(0));
#elif defined(__GNUC__)
    return __atomic_load_n(x, __ATOMIC_SEQ_CST);
#elif defined(_MSC_VER)
    return InterlockedExchangeAdd64(const_cast<Integer64*>(x), Integer64(0));
#elif defined(__clang__)
    return __c11_atomic_load(x);
#else
#error "No atomic_load implementation."
#endif
}
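A minimal host-side usage sketch for the two helpers above, assuming Integer64 resolves to a 64-bit signed integer such as long long; main, counter, and the printed values are illustrative only.

// Host-side usage sketch (hypothetical): bumps a 64-bit counter and reads it back.
#include <cstdio>

int main()
{
    long long counter = 0;
    long long previous = atomic_fetch_add(&counter, 5LL);  // returns the old value (0)
    std::printf("previous=%lld current=%lld\n", previous, atomic_load(&counter));
    return 0;
}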