Example No. 1
__global__
void yourHisto(const unsigned int* const vals, //INPUT
               unsigned int* const histo,      //OUTPUT
               int size)
{
    // Per-block sub-histogram in shared memory; assumes 256 threads per block
    // and at most 1024 bins.
    __shared__ unsigned int temp[1024];

    temp[threadIdx.x + 0]   = 0;
    temp[threadIdx.x + 256] = 0;
    temp[threadIdx.x + 512] = 0;
    temp[threadIdx.x + 768] = 0;
    __syncthreads();

    // Grid-stride loop over the input values.
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    while (i < size)
    {
        atomicAdd(&temp[vals[i]], 1);
        i += offset;
    }
    __syncthreads();

    // Merge this block's sub-histogram into the global histogram.
    atomicAdd(&(histo[threadIdx.x + 0]),   temp[threadIdx.x + 0]);
    atomicAdd(&(histo[threadIdx.x + 256]), temp[threadIdx.x + 256]);
    atomicAdd(&(histo[threadIdx.x + 512]), temp[threadIdx.x + 512]);
    atomicAdd(&(histo[threadIdx.x + 768]), temp[threadIdx.x + 768]);
}
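
A minimal host-side launch sketch for the kernel above, assuming 256 threads per block (each thread clears and flushes four of the 1024 bins) and using illustrative names numVals, d_vals and d_histo:

    const int threads = 256;                               // must match the 4 x 256 bin layout in the kernel
    const int blocks  = (numVals + threads - 1) / threads;
    cudaMemset(d_histo, 0, 1024 * sizeof(unsigned int));   // global bins start at zero
    yourHisto<<<blocks, threads>>>(d_vals, d_histo, numVals);
    cudaDeviceSynchronize();
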
Example No. 2
        DINLINE
        void addParticle(
            T_Acc const & acc,
            const floatD_X position,
            const float3_X momentum,
            const float_X weighting
        )
        {
            nvidia::atomicAllInc( acc, &this->numMacroParticles, ::alpaka::hierarchy::Threads{} );
            atomicAdd( &this->numRealParticles, weighting, ::alpaka::hierarchy::Threads{} );

            if( this->splittingStage == VoronoiSplittingStage::position )
            {
                const floatD_X position2 = position * position;

                for( int i = 0; i < simDim; i++ )
                {
                    atomicAdd( &this->meanValue[i], weighting * position[i], ::alpaka::hierarchy::Threads{} );
                    atomicAdd( &this->meanSquaredValue[i], weighting * position2[i], ::alpaka::hierarchy::Threads{} );
                }
            }
            else
            {
                const float3_X momentum2 = momentum * momentum;

                for( int i = 0; i < DIM3; i++ )
                {
                    atomicAdd( &this->meanValue[i], weighting * momentum[i], ::alpaka::hierarchy::Threads{} );
                    atomicAdd( &this->meanSquaredValue[i], weighting * momentum2[i], ::alpaka::hierarchy::Threads{} );
                }
            }
        }
Example No. 3
 inline void splat(Vec2u pixel, Vec3f w)
 {
     uint32 idx = pixel.x() + pixel.y()*_w;
     atomicAdd(_buffer[idx].x(), w.x());
     atomicAdd(_buffer[idx].y(), w.y());
     atomicAdd(_buffer[idx].z(), w.z());
 }
Example No. 4
    __global__ void histogram(const SrcPtr src, ResType* hist, const MaskPtr mask, const int rows, const int cols)
    {
    #if CV_CUDEV_ARCH >= 120
        __shared__ ResType smem[BIN_COUNT];

        const int y = blockIdx.x * blockDim.y + threadIdx.y;
        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        for (int i = tid; i < BIN_COUNT; i += BLOCK_SIZE)
            smem[i] = 0;

        __syncthreads();

        if (y < rows)
        {
            for (int x = threadIdx.x; x < cols; x += blockDim.x)
            {
                if (mask(y, x))
                {
                    const uint data = src(y, x);
                    atomicAdd(&smem[data % BIN_COUNT], 1);
                }
            }
        }

        __syncthreads();

        for (int i = tid; i < BIN_COUNT; i += BLOCK_SIZE)
        {
            const ResType histVal = smem[i];
            if (histVal > 0)
                atomicAdd(hist + i, histVal);
        }
    #endif
    }
Example No. 5
 __device__ __forceinline__ static void add(R* ptr, val_type val)
 {
     atomicAdd(ptr, val.x);
     atomicAdd(ptr + 1, val.y);
     atomicAdd(ptr + 2, val.z);
     atomicAdd(ptr + 3, val.w);
 }
Example No. 6
 __device__ cdouble atomicAdd<cdouble>(cdouble *ptr, cdouble val)
 {
     double *fptr = (double *)(ptr);
     cdouble res;
     res.x = atomicAdd(fptr + 0, val.x);
     res.y = atomicAdd(fptr + 1, val.y);
     return res;
 }
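
The component-wise update above relies on an atomicAdd overload for double, which CUDA provides natively only on devices of compute capability 6.0 or higher. On older hardware that overload is commonly emulated with atomicCAS; a sketch of the well-known pattern from the CUDA C Programming Guide follows (the name atomicAddDouble is ours, not part of this library):

    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
    __device__ double atomicAddDouble(double* address, double val)
    {
        unsigned long long int* address_as_ull = (unsigned long long int*)address;
        unsigned long long int old = *address_as_ull, assumed;
        do {
            assumed = old;
            // Reinterpret the bits, add in double precision, and try to swap the result back in.
            old = atomicCAS(address_as_ull, assumed,
                            __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old);   // retry if another thread changed the value in the meantime
        return __longlong_as_double(old);
    }
    #endif
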
Example No. 7
void kernel(int iter, int batch_id){
  // Assume mode 0 (SGD)
  double init_step_size = 0.5;
  double step_size = init_step_size * pow(100.0 + (iter - 1), -0.5);

  int batch_start = batch_id * batch_size;
  int batch_end = batch_start + batch_size;
  if(batch_end > no_of_nodes){
    batch_end = no_of_nodes;
  }
  
  #pragma omp parallel for  
  for(int vid = batch_start; vid < batch_end; vid++){    
    // Stack Variables
    double Li_curr[K_count]; double Rj_curr[K_count];
    double Li_update[K_count]; double Rj_update[K_count]; 
    
    int row_index = x_row_array[vid];
    int column_index = x_col_array[vid];
    double Xij = x_val_array[vid];

    for (int k = 0; k < K_count; ++k) {
      Li_curr[k] = L_table[(row_index * K_count) + k];
      if(stale_mode == 0){
        Rj_curr[k] = R_table[(column_index * K_count) + k];
      }else{
        Rj_curr[k] = R_table_ind[(vid * K_count) + k];
      }
    }
    
    double LiRj = 0.0;
    for (int k = 0; k < K_count; ++k) {
      LiRj = LiRj + (Li_curr[k] * Rj_curr[k]);
    }
    
    for (int k = 0; k < K_count; ++k) {
      double gradient = 0.0; 
      double Li_value = Li_curr[k]; 
      double Rj_value = Rj_curr[k];

      gradient = (-2 * Xij * Rj_value) + (2 * LiRj * Rj_value);
      Li_update[k] = -gradient * step_size;
      gradient = (-2 * Xij * Li_value) + (2 * LiRj * Li_value);
      Rj_update[k] = -gradient * step_size;
    }

    // Commit updates
    for (int k = 0; k < K_count; ++k) {
      atomicAdd(&(L_table[(row_index * K_count) + k]), Li_update[k]);
      // Replaced by non-blocking write to original weight
      atomicAdd(&(R_table[(column_index * K_count) + k]), Rj_update[k]); 
    }

    // The loss function at X(i,j) is ( X(i,j) - L(i,:)*R(:,j) )^2.
    atomicAdd(&loss_sum, (double)pow(Xij - LiRj, 2));
  }
}
Example No. 8
__global__ void kernelCountParticles(PBox pb,
                                     uint64_cu* gCounter,
                                     Filter filter,
                                     Mapping mapper)
{

    typedef typename PBox::FrameType FRAME;
    const uint32_t Dim = Mapping::Dim;

    __shared__ FRAME *frame;
    __shared__ bool isValid;
    __shared__ int counter;
    __shared__ lcellId_t particlesInSuperCell;


    __syncthreads(); /* wait until all shared memory is initialised */

    typedef typename Mapping::SuperCellSize SuperCellSize;

    const DataSpace<Dim > threadIndex(threadIdx);
    const int linearThreadIdx = DataSpaceOperations<Dim>::template map<SuperCellSize > (threadIndex);
    const DataSpace<Dim> superCellIdx(mapper.getSuperCellIndex(DataSpace<Dim > (blockIdx)));

    if (linearThreadIdx == 0)
    {
        frame = &(pb.getLastFrame(superCellIdx, isValid));
        particlesInSuperCell = pb.getSuperCell(superCellIdx).getSizeLastFrame();
        counter = 0;
    }
    __syncthreads();
    if (!isValid)
        return; //end kernel if we have no frames
    filter.setSuperCellPosition((superCellIdx - mapper.getGuardingSuperCells()) * mapper.getSuperCellSize());
    while (isValid)
    {
        if (linearThreadIdx < particlesInSuperCell)
        {
            if (filter(*frame, linearThreadIdx))
                atomicAdd(&counter, 1);
        }
        __syncthreads();
        if (linearThreadIdx == 0)
        {
            frame = &(pb.getPreviousFrame(*frame, isValid));
            particlesInSuperCell = math::CT::volume<SuperCellSize>::type::value;
        }
        __syncthreads();
    }

    __syncthreads();
    if (linearThreadIdx == 0)
    {
        atomicAdd(gCounter, (uint64_cu) counter);
    }
}
Example No. 9
	__device__ void stop() {
		if(intervall >= 0 && intervall < numberOfIntervalls) {
			unsigned long long diff = clock64() - currentTime;
			__syncthreads();
			atomicAdd(&counter[intervall].avg, diff);
			atomicMin(&counter[intervall].min, diff);
			atomicMax(&counter[intervall].max, diff);
			atomicAdd(&counter[intervall].iter, 1);
			intervall++;
			__syncthreads();
			currentTime = clock64();
		}
	}
Example No. 10
__global__ void
gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block, int pshift)
{

   int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   const unsigned int warp_num = hipThreadIdx_x >> pshift;
#ifdef __HIP_PLATFORM_HCC__
   atomicAdd(&device_ballot[warp_num + hipBlockIdx_x*Num_Warps_per_Block], __popcll(__ballot(tid - 245)));
#else
   atomicAdd(&device_ballot[warp_num + hipBlockIdx_x*Num_Warps_per_Block], __popc(__ballot(tid - 245)));
#endif
 
}
Example No. 11
/**
 * Allocate at least the specified size in bytes.
 * Align to MallocAlignBytes (16) Bytes.
 *
 * gc means garbage collector.
 *
 * @return address for allocated memory chunk
 */
__device__ int
org_trifort_gc_malloc_no_fail( int size )
{
    int const lastAddress = atomicAdd( &global_free_pointer,
                                       padNumberTo( size, MallocAlignBytes ) / MallocAlignBytes );
    return lastAddress;
}
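
padNumberTo is not shown in this snippet; presumably it rounds size up to the next multiple of MallocAlignBytes so that the shared free pointer advances in whole 16-byte units. A minimal sketch of such a helper, offered as an assumption rather than the original implementation:

    // Hypothetical helper: round n up to the next multiple of alignment.
    __device__ int padNumberTo( int n, int alignment )
    {
        int remainder = n % alignment;
        return ( remainder == 0 ) ? n : n + ( alignment - remainder );
    }
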
Example No. 12
    // Request other attached threads that are not at safe points to park themselves on safepoints.
    bool parkOthers()
    {
        ASSERT(ThreadState::current()->isAtSafePoint());

        // Lock threadAttachMutex() to prevent threads from attaching.
        threadAttachMutex().lock();

        ThreadState::AttachedThreadStateSet& threads = ThreadState::attachedThreads();

        MutexLocker locker(m_mutex);
        atomicAdd(&m_unparkedThreadCount, threads.size());
        releaseStore(&m_canResume, 0);

        ThreadState* current = ThreadState::current();
        for (ThreadState* state : threads) {
            if (state == current)
                continue;

            for (ThreadState::Interruptor* interruptor : state->interruptors())
                interruptor->requestInterrupt();
        }

        while (acquireLoad(&m_unparkedThreadCount) > 0) {
            double expirationTime = currentTime() + lockingTimeout();
            if (!m_parked.timedWait(m_mutex, expirationTime)) {
                // One of the other threads did not return to a safepoint within the maximum
                // time we allow for threads to be parked. Abandon the GC and resume the
                // currently parked threads.
                resumeOthers(true);
                return false;
            }
        }
        return true;
    }
Example No. 13
	CUGIP_DECL_DEVICE int
	append(const TType &aItem)
	{
		int index = atomicAdd(mSize.get(), 1);
		mData[index] = aItem;
		return index;
	}
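
atomicAdd returns the counter's previous value, so every concurrent caller of append receives a distinct slot index; this is the standard pattern for filling a device-side output buffer without locks. A self-contained sketch of the same idea (the kernel and the names outData/outCount are illustrative, not part of the original container):

    // Each thread that passes the predicate reserves exactly one output slot.
    __global__ void compactPositive(const int* in, int n, int* outData, int* outCount)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n && in[i] > 0) {
            int index = atomicAdd(outCount, 1);   // old value == this thread's slot
            outData[index] = in[i];
        }
    }
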
Example No. 14
static __global__
void histogramKernel(Param<outType> out, CParam<inType> in,
                     int len, int nbins, float minval, float maxval, int nBBS)
{
    SharedMemory<outType> shared;
    outType * shrdMem = shared.getPointer();

    // offset input and output to account for batch ops
    unsigned b2 = blockIdx.x / nBBS;
    const inType *iptr  =  in.ptr + b2 *  in.strides[2] + blockIdx.y *  in.strides[3];
    outType      *optr  = out.ptr + b2 * out.strides[2] + blockIdx.y * out.strides[3];

    int start  = (blockIdx.x-b2*nBBS) * THRD_LOAD * blockDim.x + threadIdx.x;
    int end    = minimum((start + THRD_LOAD * blockDim.x), len);
    float step = (maxval-minval) / (float)nbins;

    // If nbins > max shared memory allocated, then just use atomicAdd on global memory
    bool use_global = nbins > MAX_BINS;

    // Initialize shared-memory bins only when they will actually be used
    if (!use_global) {
        for (int i = threadIdx.x; i < nbins; i += blockDim.x)
            shrdMem[i] = 0;
        __syncthreads();
    }

    for (int row = start; row < end; row += blockDim.x) {
        int idx = isLinear ? row : ((row % in.dims[0]) + (row / in.dims[0])*in.strides[1]);
        int bin = (int)((iptr[idx] - minval) / step);
        bin     = (bin < 0)      ? 0         : bin;
        bin     = (bin >= nbins) ? (nbins-1) : bin;

        if (use_global) {
            atomicAdd((optr + bin), 1);
        } else {
            atomicAdd((shrdMem + bin), 1);
        }
    }

    // No need to write to global if use_global is true
    if (!use_global) {
        __syncthreads();
        for (int i = threadIdx.x; i < nbins; i += blockDim.x) {
            atomicAdd((optr + i), shrdMem[i]);
        }
    }
}
Example No. 15
 EPP_DEVICE void calcOffset (
   SliceIndex *slice_index,
   unsigned int _index,
   unsigned int *current_top
 ) {
   int sub_index;
   slice_index[ _index ].offset = atomicAdd( current_top, slice_index[ _index ].temp_offset );
 }
Example No. 16
    HDINLINE void BitData<TYPE, NUMBITS>::operator+=(const TYPE &rhs)
    {
#if !defined(__CUDA_ARCH__) // Host code path
        *(this->data) += (rhs << this->bit);
#else
        atomicAdd(this->data, rhs << this->bit);
#endif
    }
Example No. 17
        /**
         * Adds val to the stack in an atomic operation.
         * 
         * @param val data of type VALUE to add to the stack
         */
        HDINLINE void push(VALUE val)
        {
#if !defined(__CUDA_ARCH__) // Host code path
            TYPE old_addr = (*currentSize)++;
#else
            TYPE old_addr = atomicAdd(currentSize, 1);
#endif
            (*this)[old_addr] = val;
        }
Example No. 18
    __device__ __forceinline__ void finalise()
    {
#ifdef __CUDACC__
        int lane = threadIdx.x % warpSize;

        count = warpReduceSum(count);
        if (lane == 0) atomicAdd(&frontier, count);
#endif
    }
Example No. 19
 __device__ uint32_t get_id() {
     /* Equivalent to:
      *
      *     value = *id_;
      *     *id_ += 1;
      */
     uint32_t value = atomicAdd(id_, 1);
     return value;
 }
Example No. 20
__device__ int
org_trifort_gc_malloc_no_fail(int size){
  if(size % 16 != 0){
    size += (16 - (size % 16));
  }
  size >>= 4;

  int ret;
  ret = atomicAdd(&global_free_pointer, size);
  return ret;
}
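
As a quick check of the arithmetic, a worked 40-byte request (our illustration; how the caller converts the returned unit index back to an address is an assumption):

    /* size = 40   -> 40 % 16 == 8, so pad by 8       -> size = 48
     * size >>= 4  -> size = 3   (three 16-byte units)
     * atomicAdd   -> returns the old value of global_free_pointer and
     *                advances it by 3; the caller presumably scales the
     *                returned unit index back to a byte offset.
     */
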
Example No. 21
__global__ void extract_orb(
    unsigned* desc_out,
    const unsigned n_feat,
    float* x_in_out,
    float* y_in_out,
    const float* ori_in,
    float* size_out,
    CParam<T> image,
    const float scl,
    const unsigned patch_size)
{
    unsigned f = blockDim.x * blockIdx.x + threadIdx.x;

    if (f < n_feat) {
        unsigned x = (unsigned)round(x_in_out[f]);
        unsigned y = (unsigned)round(y_in_out[f]);
        float ori = ori_in[f];
        unsigned size = patch_size;

        unsigned r = ceil(patch_size * sqrt(2.f) / 2.f);
        if (x < r || y < r || x >= image.dims[1] - r || y >= image.dims[0] - r)
            return;

        // Descriptor fixed at 256 bits for now
        // Storing descriptor as a vector of 8 x 32-bit unsigned numbers
        for (unsigned i = threadIdx.y; i < 16; i += blockDim.y) {
            unsigned v = 0;

            // j < 16 for 256 bits descriptor
            for (unsigned j = 0; j < 16; j++) {
                // Get position from distribution pattern and values of points p1 and p2
                int dist_x = d_ref_pat[i*16*4 + j*4];
                int dist_y = d_ref_pat[i*16*4 + j*4+1];
                T p1 = get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size);

                dist_x = d_ref_pat[i*16*4 + j*4+2];
                dist_y = d_ref_pat[i*16*4 + j*4+3];
                T p2 = get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size);

                // Calculate bit based on p1 and p2 and shifts it to correct position
                v |= (p1 < p2) << (j + 16*(i % 2));
            }

            // Store 16 bits of descriptor
            atomicAdd(&desc_out[f * 8 + i/2], v);
        }

        if (threadIdx.y == 0) {
            x_in_out[f] = round(x * scl);
            y_in_out[f] = round(y * scl);
            size_out[f] = patch_size * scl;
        }
    }
}
Example No. 22
        /**
         * Increases the size of the stack with count elements in an atomic operation.
         *
         * @param count number of elements to increase stack with
         * @return a TileDataBox of size count pointing to the new stack elements
         */
        HDINLINE TileDataBox<VALUE> pushN(TYPE count)
        {
#if !defined(__CUDA_ARCH__) // Host code path
            //TYPE old_addr = (*currentSize) = (*currentSize) + count;
            //old_addr -= count;
            TYPE old_addr = (*currentSize);
            *currentSize += count;
#else
            TYPE old_addr = atomicAdd(currentSize, count);
#endif
            return TileDataBox<VALUE > (this->fixedPointer, DataSpace<DIM1>(old_addr));
        }
Example No. 23
__device__ long long
edu_syr_pcpratts_gc_malloc_no_fail(char * gc_info, long long size){
  unsigned long long * addr = (unsigned long long *) (gc_info + TO_SPACE_FREE_POINTER_OFFSET);
  size += 8;
  long long ret;
    
  ret = atomicAdd(addr, (unsigned long long) size);
  int mod = ret % 8;
  if(mod != 0)
    ret += mod;

  return ret;
}
Example No. 24
__global__
void yourHisto(const unsigned int* const vals, //INPUT
               unsigned int* const histo,      //OUTPUT
               int vals_size,
               int numBins)
{
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    int startingPositionsPerThread = numBins/threadsPerBlock;
    int offset = blockDim.x * gridDim.x;

    extern __shared__ unsigned int local[];

    for(int i = 0; i < startingPositionsPerThread && threadIdx.x + i*threadsPerBlock < numBins; i++)
        local[threadIdx.x + i*threadsPerBlock] = 0;
    __syncthreads();

    for(int i = tid; i < vals_size; i += offset) {
        atomicAdd(&local[vals[i]],1);
    }
    __syncthreads();

    for(int i = 0; i < startingPositionsPerThread && threadIdx.x + i*threadsPerBlock < numBins; i++)
        atomicAdd(&histo[threadIdx.x + i*threadsPerBlock], local[threadIdx.x + i*threadsPerBlock]);
}
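
This variant sizes its bins with dynamic shared memory (extern __shared__) and reads a threadsPerBlock constant defined elsewhere in the file. A minimal launch sketch, assuming threadsPerBlock is 1024, numBins is a multiple of it, and d_vals/d_histo are illustrative device pointers:

    // The third <<<>>> argument supplies the dynamic shared-memory size:
    // one unsigned int per histogram bin.
    const int blocks = (vals_size + threadsPerBlock - 1) / threadsPerBlock;
    yourHisto<<<blocks, threadsPerBlock, numBins * sizeof(unsigned int)>>>(
        d_vals, d_histo, vals_size, numBins);
    cudaDeviceSynchronize();
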
Example No. 25
CUGIP_DECL_DEVICE bool
tryPullPush(GraphCutData<TFlow> &aGraph, int aFrom, int aTo, int aConnectionIndex, bool aConnectionSide)
{
	EdgeResidualsRecord<TFlow> &edge = aGraph.residuals(aConnectionIndex);
	TFlow residual = edge.getResidual(aConnectionSide);
	TFlow flow = tryPull(aGraph, aFrom, aTo, residual);
	if (flow > 0.0f) {
		//printf("try pull push %d %d -> %d %d %f - residual %f\n", aGraph.label(aFrom), aFrom, aGraph.label(aTo), aTo, flow, residual);
		atomicAdd(&(aGraph.excess(aTo)), flow);
		edge.getResidual(aConnectionSide) -= flow;
		edge.getResidual(!aConnectionSide) += flow;
		return true;
	}
	//printf("failed pull push %d %d -> %d %d %f - residual %f\n", aGraph.label(aFrom), aFrom, aGraph.label(aTo), aTo, flow, residual);
	return false;
}
Example No. 26
__global__ void HIP_FUNCTION(testKernel,int *g_odata)
{
    // access thread id
    const unsigned int tid = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;

    // Test various atomic instructions

    // Arithmetic atomic instructions

    // Atomic addition
    atomicAdd(&g_odata[0], 10);

    // Atomic subtraction (final should be 0)
    atomicSub(&g_odata[1], 10);

    // Atomic exchange
    atomicExch(&g_odata[2], tid);

    // Atomic maximum
    atomicMax(&g_odata[3], tid);

    // Atomic minimum
    atomicMin(&g_odata[4], tid);

    // Atomic increment (modulo 17+1)
    //atomicInc((unsigned int *)&g_odata[5], 17);
    atomicInc((unsigned int *)&g_odata[5]);

    // Atomic decrement
   // atomicDec((unsigned int *)&g_odata[6], 137);
    atomicDec((unsigned int *)&g_odata[6]);

    // Atomic compare-and-swap
    atomicCAS(&g_odata[7], tid-1, tid);

    // Bitwise atomic instructions

    // Atomic AND
    atomicAnd(&g_odata[8], 2*tid+7);

    // Atomic OR
    atomicOr(&g_odata[9], 1 << tid);

    // Atomic XOR
    atomicXor(&g_odata[10], tid);
}
Example No. 27
int addObject (int occlusion, vec4 color, int lastSegmentOffset, coherent int *ptrCell)
{
  //Get stream size and previous node offset
  //Store new stream size and current node offset
  int nodeOffset = atomicAdd( ptrInfo + INFO_COUNTER_STREAMLEN, 5 );
  int prevOffset = atomicExchange( ptrCell + CELL_COUNTER_PREV, nodeOffset );
  coherent float *ptrNode = ptrStream + nodeOffset;

  //Store object data
  ptrNode[ 0 ] = (float) NODE_TYPE_OBJECT;
  ptrNode[ 1 ] = (float) prevOffset;
  ptrNode[ 2 ] = (float) objectId;
  ptrNode[ 3 ] = (float) occlusion;
  ptrNode[ 4 ] = (float) lastSegmentOffset;

  return nodeOffset;
}
Example No. 28
int addLine (vec2 l0, vec2 l1, coherent int *ptrObjCell)
{
  //Get stream size and previous node offset
  //Store new stream size and current node offset
  int nodeOffset = atomicAdd( ptrInfo + INFO_COUNTER_STREAMLEN, 6 );
  int prevOffset = atomicExchange( ptrObjCell + OBJCELL_COUNTER_PREV, nodeOffset );
  coherent float *ptrNode = ptrStream + nodeOffset;

  //Store line data
  ptrNode[ 0 ] = (float) NODE_TYPE_LINE;
  ptrNode[ 1 ] = (float) prevOffset;
  ptrNode[ 2 ] = l0.x;
  ptrNode[ 3 ] = l0.y;
  ptrNode[ 4 ] = l1.x;
  ptrNode[ 5 ] = l1.y;

  return nodeOffset;
}
Example No. 29
__host__ __device__
typename enable_if<
  sizeof(Integer64) == 8,
  Integer64
>::type
atomic_fetch_add(Integer64 *x, Integer64 y)
{
#if defined(__CUDA_ARCH__)
  return atomicAdd(x, y);
#elif defined(__GNUC__)
  return __atomic_fetch_add(x, y, __ATOMIC_SEQ_CST);
#elif defined(_MSC_VER)
  return InterlockedExchangeAdd64(x, y);
#elif defined(__clang__)
  return __c11_atomic_fetch_add(x, y);
#else
#error "No atomic_fetch_add implementation."
#endif
}
Example No. 30
__host__ __device__
typename enable_if<
  sizeof(Integer64) == 8,
  Integer64
>::type
atomic_load(const Integer64 *x)
{
#if defined(__CUDA_ARCH__)
  return atomicAdd(const_cast<Integer64*>(x), Integer64(0));
#elif defined(__GNUC__)
  return __atomic_load_n(x, __ATOMIC_SEQ_CST);
#elif defined(_MSC_VER)
  return InterlockedExchangeAdd64(const_cast<Integer64*>(x), Integer64(0));
#elif defined(__clang__)
  return __c11_atomic_load(x);
#else
#error "No atomic_load_n implementation."
#endif
}