Code Example #1
__global__ void kColVectorOp(float* mat, float* vec, float* tgtMat,
                             const uint width, const uint height,
                             const uint matStride, const uint tgtStride, Op op) {
    __shared__ float shVec[ADD_VEC_THREADS_Y];
    const uint by = ADD_VEC_THREADS_Y * hipBlockIdx_y;
    const uint bx = ADD_VEC_THREADS_X * hipBlockIdx_x;
    const uint tidx = ADD_VEC_THREADS_X * hipThreadIdx_y + hipThreadIdx_x;
    
    mat += hipThreadIdx_y * matStride;
    vec += tidx;
    tgtMat += hipThreadIdx_y * tgtStride;

    for (uint y = by; y < height; y += hipGridDim_y * ADD_VEC_THREADS_Y) {
        __syncthreads();
        if (y + tidx < height && tidx < ADD_VEC_THREADS_Y) {
            shVec[tidx] = vec[y];
        }
        __syncthreads();

        if (y + hipThreadIdx_y < height) {
            for (uint x = bx + hipThreadIdx_x; x < width; x += hipGridDim_x * ADD_VEC_THREADS_X) {
                tgtMat[(y) * tgtStride + x] = op(mat[(y) * matStride + x], shVec[hipThreadIdx_y]);
            }
        }
    }
}
Code Example #2
__global__ void kEltwiseUnaryOpTrans(const float* a, float* const dest,
                                     const uint height, const uint width,
                                     const uint strideA, const uint strideDest, Op op) {

    __shared__ float shmem[ELTWISE_THREADS_X][ELTWISE_THREADS_X + 1];

    for (uint by = ELTWISE_THREADS_X * hipBlockIdx_y; by < height; by += ELTWISE_THREADS_X * hipGridDim_y) {
        for (uint bx = ELTWISE_THREADS_X * hipBlockIdx_x; bx < width; bx += ELTWISE_THREADS_X * hipGridDim_x) {
            const uint readX = by + hipThreadIdx_x;
            const uint readY = bx + hipThreadIdx_y;
            for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) {
                if (!checkBounds || (readX < height && readY + y < width)) {
                    shmem[hipThreadIdx_x][hipThreadIdx_y + y] = op(a[(readY + y) * strideA + readX]);
                }
            }
            __syncthreads();

            const uint writeX = bx + hipThreadIdx_x;
            const uint writeY = by + hipThreadIdx_y;
            for (uint y = 0; y < ELTWISE_THREADS_X; y+= ELTWISE_THREADS_Y) {
                if(!checkBounds || (writeX < width && writeY + y < height)) {
                    dest[(writeY + y) * strideDest + writeX] = shmem[hipThreadIdx_y + y][hipThreadIdx_x];

                }
            }
            __syncthreads();
        }
    }
}
Code Example #3
File: prefix_sum.hpp Project: anshumang/bfs_titech
__global__ void prefix_sum_down_phase(T* idata, T* intermid, const int num_blocks, const int num_loops)
{
	const int tid = threadIdx.x;
	const int MAX = 1U << LOG_MAX;
	const int HARF_MAX = MAX / 2; // == blockDim.x

	const int i_start = num_loops * blockIdx.x;
	const int i_end = min(i_start + num_loops, num_blocks);

	__shared__ T s_data[(1U << LOG_MAX)*2];
	T* s_in = s_data + 1;
	T carry;
	if(tid == 0) carry = intermid[blockIdx.x];

	for(int i = i_start; i < i_end; ++i) {
		s_in[tid + HARF_MAX*0] = idata[i*MAX + HARF_MAX*0 + tid];
		s_in[tid + HARF_MAX*1] = idata[i*MAX + HARF_MAX*1 + tid];
		if(tid == 0) s_data[0] = carry;
		__syncthreads();

		dev_prefix_sum<T, LOG_MAX>(s_data, tid);

		idata[i*MAX + HARF_MAX*0 + tid] = s_in[tid + HARF_MAX*0];
		idata[i*MAX + HARF_MAX*1 + tid] = s_in[tid + HARF_MAX*1];
		if(tid == 0) carry = s_data[MAX];
		__syncthreads();
	}

	if(tid == 0 && blockIdx.x == 0) idata[-1] = 0;
}
Code Example #4
File: gauss-jordan6.hpp Project: paogor/gauss-jordan
__global__ void gauss_jordan6(int n, T * AA){

  extern __shared__ T A[];

  int idy = threadIdx.x; // inverted to avoid warp divergence
  int idx = threadIdx.y;
  int blk = blockIdx.x;
  
  A[(idy*n)+idy] = AA[(n*n*blk)+(idy*n)+idx];

  __syncthreads();

  for(int i=0; i < n; ++i){

    T i_row = (( idy != n-1 ) ? A[(i*n)+idy+1] : 1)  / A[i*n];
    T y_row = (( idy != n-1 ) ? A[(idx*n)+idy+1] : 0 ) - A[idx*n]*i_row;

    __syncthreads();
    
    A[(idx*n)+idy] = ( idx != i ) ? y_row : i_row ;

    __syncthreads();

  }
  
  AA[(n*n*blk)+(idx*n)+idy] = A[(idy*n)+idx];


}
Code Example #5
__global__ void csr_trans_unit_lu_forward_kernel(
    const unsigned int * row_indices,
    const unsigned int * column_indices,
    const T * elements,
    T * vector,
    unsigned int size)
{
    __shared__  unsigned int row_index_lookahead[256];
    __shared__  unsigned int row_index_buffer[256];

    unsigned int row_index;
    unsigned int col_index;
    T matrix_entry;
    unsigned int nnz = row_indices[size];
    unsigned int row_at_window_start = 0;
    unsigned int row_at_window_end = 0;
    unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

    for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
    {
        col_index    = (i < nnz) ? column_indices[i] : 0;
        matrix_entry = (i < nnz) ? elements[i]       : 0;
        row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : size - 1;

        __syncthreads();

        if (i < nnz)
        {
            unsigned int row_index_inc = 0;
            while (i >= row_index_lookahead[row_index_inc + 1])
                ++row_index_inc;
            row_index = row_at_window_start + row_index_inc;
            row_index_buffer[threadIdx.x] = row_index;
        }
        else
        {
            row_index = size+1;
            row_index_buffer[threadIdx.x] = size - 1;
        }

        __syncthreads();

        row_at_window_start = row_index_buffer[0];
        row_at_window_end   = row_index_buffer[blockDim.x - 1];

        //forward elimination
        for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
        {
            T result_entry = vector[row];

            if ( (row_index == row) && (col_index > row) )
                vector[col_index] -= result_entry * matrix_entry;

            __syncthreads();
        }

        row_at_window_start = row_at_window_end;
    }

}
Code Example #6
__global__ void kTotalAgg(const float* a, float* const target, const uint numElements, Agg agg) {
    __shared__ float shmem[DP_BLOCKSIZE];
    uint eidx = DP_BLOCKSIZE * hipBlockIdx_x + hipThreadIdx_x;
    shmem[hipThreadIdx_x] = agg.getBaseValue();
    if (eidx < hipGridDim_x * DP_BLOCKSIZE) {
        for (; eidx < numElements; eidx += hipGridDim_x * DP_BLOCKSIZE) {
            shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], a[eidx]);
        }
    }
    __syncthreads();
    if (hipThreadIdx_x < 256) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 256]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 128) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 128]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 64) {
        shmem[hipThreadIdx_x] = agg(shmem[hipThreadIdx_x], shmem[hipThreadIdx_x + 64]);
    }
    __syncthreads();
    if (hipThreadIdx_x < 32) {
        volatile float* mysh = &shmem[hipThreadIdx_x];
        *mysh = agg(*mysh, mysh[32]);
        *mysh = agg(*mysh, mysh[16]);
        *mysh = agg(*mysh, mysh[8]);
        *mysh = agg(*mysh, mysh[4]);
        *mysh = agg(*mysh, mysh[2]);
        *mysh = agg(*mysh, mysh[1]);
        if (hipThreadIdx_x == 0) {
            target[hipBlockIdx_x] = *mysh;
        }
    }
}
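A brief usage note for kTotalAgg: the staged reduction (offsets 256, 128, 64, then the warp-level tail) only covers a whole block when DP_BLOCKSIZE is 512 and the kernel is launched with exactly that many threads. Below is a minimal host-side launch sketch under that assumption; SumAgg, totalSum and the template form kTotalAgg<SumAgg> are illustrative names, not part of the original source.

#include <hip/hip_runtime.h>

#define DP_BLOCKSIZE 512  // assumed block size; the 256/128/64 reduction steps require it

// Illustrative sum aggregator matching the interface the kernel expects from Agg.
struct SumAgg {
    __device__ float getBaseValue() const { return 0.0f; }
    __device__ float operator()(float a, float b) const { return a + b; }
};

// One partial result per block is written to d_partials; a second pass (or a
// host-side sum over numBlocks values) finishes the reduction.
void totalSum(const float* d_in, float* d_partials,
              unsigned int numElements, unsigned int numBlocks) {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kTotalAgg<SumAgg>),
                       dim3(numBlocks), dim3(DP_BLOCKSIZE), 0, 0,
                       d_in, d_partials, numElements, SumAgg());
}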
Code Example #7
File: block.hpp Project: AdamRuiz/opencv
        static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
        {
            int ftid = flattenedThreadId();
            int sft = stride();

            if (sft < n)
            {
                for (unsigned int i = sft + ftid; i < n; i += sft)
                    data[ftid] = op(data[ftid], data[i]);

                __syncthreads();

                n = sft;
            }

            while (n > 1)
            {
                unsigned int half = n/2;

                if (ftid < half)
                    data[ftid] = op(data[ftid], data[n - ftid - 1]);

                __syncthreads();

                n = n - half;
            }
        }
Code Example #8
File: reduce.hpp Project: 007Indian/opencv
        __device__ static void reduce(Pointer smem, Reference val, uint tid, Op op)
        {
            const uint laneId = Warp::laneId();

        #if CV_CUDEV_ARCH >= 300
            Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);

            if (laneId == 0)
                loadToSmem(smem, val, tid / 32);
        #else
            loadToSmem(smem, val, tid);

            if (laneId < 16)
                Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);

            __syncthreads();

            if (laneId == 0)
                loadToSmem(smem, val, tid / 32);
        #endif

            __syncthreads();

            loadFromSmem(smem, val, tid);

            if (tid < 32)
            {
        #if CV_CUDEV_ARCH >= 300
                Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
        #else
                Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
        #endif
            }
        }
Code Example #9
File: prefix_sum.hpp Project: anshumang/bfs_titech
__global__ void prefix_sum_reference_kernel(T* odata, T* idata, const int n)
{
	extern __shared__ T temp[];
	const int tid = threadIdx.x;
	int offset = 1;
	const int ai = tid + 0;
	const int bi = tid + n/2;
	temp[ai] = idata[blockIdx.x*2*blockDim.x + ai];
	temp[bi] = idata[blockIdx.x*2*blockDim.x + bi];
	for(int d = n >> 1; d > 0; d >>= 1) {
		__syncthreads();
		if(tid < d) {
			int ai = offset*(2*tid+1)-1;
			int bi = offset*(2*tid+2)-1;
			temp[bi] += temp[ai];
		}
		offset *= 2;
	}
	if(tid == 0) { temp[n - 1] = 0; }
	for(int d = 1; d < n; d *= 2) {
		offset >>= 1;
		__syncthreads();
		if(tid < d) {
			int ai = offset*(2*tid+1)-1;
			int bi = offset*(2*tid+2)-1;
			T t = temp[ai];
			temp[ai] = temp[bi];
			temp[bi] += t;
		}
	}
	__syncthreads();
	odata[blockIdx.x*2*blockDim.x + ai] = temp[ai];
	odata[blockIdx.x*2*blockDim.x + bi] = temp[bi];
}
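A launch sketch for the reference scan above, assuming the kernel is a template over T (as the shared array suggests) and that T is int. Each block scans n = 2 * blockDim.x elements entirely inside the dynamically sized shared array temp, so n must be a power of two and the dynamic shared-memory size must be n * sizeof(T); launch_reference_scan is an illustrative name.

#include <cstddef>

void launch_reference_scan(int* d_out, int* d_in, int elements_per_block, int num_blocks)
{
    int threads = elements_per_block / 2;               // each thread handles two elements
    size_t shmem = elements_per_block * sizeof(int);    // temp[] holds the whole tile
    prefix_sum_reference_kernel<int><<<num_blocks, threads, shmem>>>(d_out, d_in, elements_per_block);
}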
Code Example #10
File: FGH.hpp Project: nornamor/Prosjektoppgave
__forceinline__ __device__ void readAndReconstructB(float (&Bi)[height][width], 
            float (&RBx)[height][width],
            float (&RBy)[height][width],
            unsigned int p, unsigned int q,
            unsigned int bx_, unsigned int by_) {
    int bx = bx_ * blockDim.x;
    for (int j=threadIdx.y; j<height; j+=blockDim.y) {
        int by = by_ * blockDim.y + j;
		float* Bi_ptr  = device_address2D(fgh_ctx.Bi.ptr, fgh_ctx.Bi.pitch, bx, by);

		for (int i=threadIdx.x; i<width; i+=blockDim.x) {
			Bi[j][i] = Bi_ptr[i];
		}
	}
    __syncthreads();
    
	/**
	 * Reconstruct B at the integration points
	 */
	reconstructBx(RBx, Bi, p, q);
	reconstructBy(RBy, Bi, p, q);
	if (threadIdx.y == 0) { //Use one warp to perform the extra reconstructions needed
		reconstructBy(RBy, Bi, p, 1);//second row
		reconstructBy(RBy, Bi, p, height-2);//second last row
		reconstructBy(RBy, Bi, p, height-1);//last row
		if (threadIdx.x < height-4) {
			reconstructBx(RBx, Bi, 1, p);//second column
			reconstructBx(RBx, Bi, width-2, p); //second last column
			reconstructBx(RBx, Bi, width-1, p);//last column
		}
	}
    __syncthreads();
}
Code Example #11
File: prefix_sum.hpp Project: anshumang/bfs_titech
__global__ void prefix_sum_up_phase(T* idata, T* intermid, const int num_blocks, const int num_loops)
{
	const int tid = threadIdx.x;
	const int MAX = 1U << LOG_MAX;
	const int HARF_MAX = MAX / 2;
	const int QUAT_MAX = MAX / 4; // == blockDim.x

	const int i_start = num_loops * blockIdx.x;
	const int i_end = min(i_start + num_loops, num_blocks);

	__shared__ T s_data[(1U << LOG_MAX)/2];
	T sum = T(0);

	for(int i = i_start; i < i_end; ++i) {
		s_data[tid + 0       ] = idata[i*MAX + QUAT_MAX*0 + tid]
		                       + idata[i*MAX + QUAT_MAX*1 + tid];
		s_data[tid + QUAT_MAX] = idata[i*MAX + QUAT_MAX*2 + tid]
		                       + idata[i*MAX + QUAT_MAX*3 + tid];
		__syncthreads();

		dev_reduce<T, HARF_MAX>(HARF_MAX, s_data, tid);
		if(tid == 0) sum += s_data[0];
		__syncthreads();
	}

	if(tid == 0) intermid[blockIdx.x] = sum;
}
Code Example #12
CUGIP_GLOBAL void
pushKernelMultiLevelGlobalSync(
		TGraph aGraph,
		ParallelQueueView<int> aVertices,
		int *aLevelStarts,
		int aLevelCount,
		int aCurrentLevel,
		device_ptr<int> aLastProcessedLevel,
		device_flag_view aPushSuccessfulFlag,
		cub::GridBarrier barrier)
{
	uint blockId = __mul24(blockIdx.y, gridDim.x) + blockIdx.x;
	int levelStart = aLevelStarts[aCurrentLevel - 1];
	int levelEnd = aLevelStarts[aCurrentLevel];
	do {
		__syncthreads();
		barrier.Sync();
		int index = levelStart + blockId * blockDim.x + threadIdx.x;
		while (index < levelEnd) {
			pushImplementation(aGraph, aVertices, index, aCurrentLevel, aPushSuccessfulFlag);
			index += blockDim.x * gridDim.x;
		}
		__syncthreads();
		barrier.Sync();
		--aCurrentLevel;
		levelStart = aLevelStarts[aCurrentLevel - 1];
		levelEnd = aLevelStarts[aCurrentLevel];
		__syncthreads();
	} while (aCurrentLevel > 0 && (levelEnd - levelStart) <= TPolicy::MULTI_LEVEL_GLOBAL_SYNC_LIMIT);
	if (threadIdx.x == 0 && blockIdx.x == 0) {
		aLastProcessedLevel.assign_device(aCurrentLevel);
	}
}
Code Example #13
File: another_approach.cpp Project: mirgee/exercises
__global__
void yourHisto(const unsigned int* const vals, //INPUT
               unsigned int* const histo,      //OUTPUT
               int size)
{
	__shared__ unsigned int temp[1024];
  
     temp[threadIdx.x + 0] = 0;
     temp[threadIdx.x + 256] = 0;
     temp[threadIdx.x + 512] = 0;
     temp[threadIdx.x + 768] = 0;
     __syncthreads();

     int i = threadIdx.x + blockIdx.x * blockDim.x;
     int offset = blockDim.x * gridDim.x;
     while (i < size)
     {
              atomicAdd( &temp[vals[i]], 1);
              i += offset;
     }
     __syncthreads();

    atomicAdd( &(histo[threadIdx.x + 0]), temp[threadIdx.x + 0] );
    atomicAdd( &(histo[threadIdx.x + 256]), temp[threadIdx.x + 256] );
    atomicAdd( &(histo[threadIdx.x + 512]), temp[threadIdx.x + 512] );
    atomicAdd( &(histo[threadIdx.x + 768]), temp[threadIdx.x + 768] );
}
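A host-side sketch for the histogram kernel above. Because the kernel zeroes and flushes temp at the fixed offsets 0, 256, 512 and 768, it assumes exactly 256 threads per block, 1024 bins, and input values below 1024; the helper name and the block count are illustrative.

#include <cuda_runtime.h>

void launch_histogram(const unsigned int* d_vals, unsigned int* d_histo, int size)
{
    const int threads   = 256;   // required by the fixed 0/256/512/768 offsets
    const int numBins   = 1024;  // size of d_histo and of the shared temp[] array
    const int numBlocks = 64;    // tunable; the grid-stride loop covers the rest

    cudaMemset(d_histo, 0, numBins * sizeof(unsigned int));
    yourHisto<<<numBlocks, threads>>>(d_vals, d_histo, size);
}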
Code Example #14
__global__ void sequence_hamming_weight_kernel( device_sequence_space< IntType > * seqs, basic_data_space< unsigned int > * res, clotho::utility::algo_version< 3 > * v ) {

    typedef device_sequence_space< IntType >    space_type;
    typedef typename space_type::int_type       int_type;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int _count = seqs->seq_count;

    if( bid >= _count ) return;

    assert( _count <= res->size );  // sanity check: enough allocated space for results

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int tpb = blockDim.x * blockDim.y; // threads per block
    unsigned int bpg = gridDim.x * gridDim.y;   // blocks per grid
    unsigned int wpb = (tpb >> 5);
    unsigned int spg = wpb * bpg;   // sequences/grid = sequences(warps)/block * blocks/grid

    assert( (tpb & 31) == 0); // sanity check: all warps are full

    unsigned int lane_id = (tid & 31);
    unsigned int warp_id = (tid >> 5);

    unsigned int _width = seqs->seq_width;

    int_type * sptr = seqs->sequences;

    unsigned int * countptr = res->data;

    unsigned int max_seq_id = _count / wpb; // max_rounds = sequences * block/sequences 
    max_seq_id += ((_count % wpb) ? 1 : 0); // would !!(_count % spg) be more efficient?
    max_seq_id *= wpb;

    unsigned int seq_id = bid * wpb + warp_id;

    while( seq_id < max_seq_id ) {  // blocks of grid may terminate early; only block for tail may diverge
        unsigned int degree = 0;

        unsigned int end = (seq_id + 1) * _width;
        unsigned int idx = end - ((seq_id < _count) ? _width : 0);  // true for all threads in warp

        while( idx < end ) {    // all threads in a warp read same bit block; no divergence
            int_type b = sptr[ idx++ ]; // all threads in a warp read/load same bit block
            degree += ((b >> lane_id) & 1); // would !!( b & lane_mask), where (lane_mask = (1 << lane_id)), be more efficient?
        }
        __syncthreads();    // sync all warps

        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int d = __shfl_up(degree, i );
            degree += (( lane_id >= i ) * d);
        }

        if( lane_id == 31 && (seq_id < _count) ) {
            countptr[ seq_id ] = degree;
        }
        __syncthreads();

        seq_id += spg;
    }
}
Code Example #15
File: histogram.hpp Project: Achraf33/opencv
    __global__ void histogram(const SrcPtr src, ResType* hist, const MaskPtr mask, const int rows, const int cols)
    {
    #if CV_CUDEV_ARCH >= 120
        __shared__ ResType smem[BIN_COUNT];

        const int y = blockIdx.x * blockDim.y + threadIdx.y;
        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        for (int i = tid; i < BIN_COUNT; i += BLOCK_SIZE)
            smem[i] = 0;

        __syncthreads();

        if (y < rows)
        {
            for (int x = threadIdx.x; x < cols; x += blockDim.x)
            {
                if (mask(y, x))
                {
                    const uint data = src(y, x);
                    atomicAdd(&smem[data % BIN_COUNT], 1);
                }
            }
        }

        __syncthreads();

        for (int i = tid; i < BIN_COUNT; i += BLOCK_SIZE)
        {
            const ResType histVal = smem[i];
            if (histVal > 0)
                atomicAdd(hist + i, histVal);
        }
    #endif
    }
Code Example #16
File: mvs.hpp Project: AstroGPU/swarm
	/// Advance system by one time unit
	GPUAPI void advance()
	{
		double hby2 = 0.5 * min( max_timestep ,  _params.time_step );

			// Step 1
			if ( is_in_body_component_grid() ) 
			   drift_step(hby2);

			// Step 2: Kick Step
			if( is_in_body_component_grid_no_star() ) 
			   sys[b][c].vel() += hby2 * acc_bc;

			__syncthreads();

			// Step 3: Kepler Drift Step (Keplerian orbit about sun/central body)
			if( (ij>0) && (ij<nbod)  ) 
			    drift_kepler( sys[ij][0].pos(),sys[ij][1].pos(),sys[ij][2].pos(),sys[ij][0].vel(),sys[ij][1].vel(),sys[ij][2].vel(),sqrtGM, 2.0*hby2 );
			__syncthreads();

			// TODO: check for close encounters here
			acc_bc = calcForces.acc_planets(ij,b,c);

			// Step 4: Kick Step
			if( is_in_body_component_grid_no_star() ) 
			   sys[b][c].vel() += hby2 * acc_bc;
			__syncthreads();

			// Step 5
			if ( is_in_body_component_grid() ) 
			  drift_step(hby2);

		if( is_first_thread_in_system() ) 
			sys.time() += 2.0*hby2;
	}
Code Example #17
__global__ void kernel_reduce(const IndexType n, const ValueType *data, ValueType *out,
                              const IndexType GROUP_SIZE, const IndexType LOCAL_SIZE) {

    IndexType tid = threadIdx.x;

    __shared__ ValueType sdata[BLOCK_SIZE];
    sdata[tid] = ValueType(0.0);

    // get global id
    IndexType gid = GROUP_SIZE * blockIdx.x + tid;

    for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE)
      if ( gid < n )
        sdata[tid] += data[gid];

    __syncthreads();

#pragma unroll
    for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {

      if (tid < i)
        sdata[tid] += sdata[tid + i];

      __syncthreads();

    }

    if (tid == 0)
      out[blockIdx.x] = sdata[tid];

}
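A launch sketch for kernel_reduce, assuming the kernel is a template over IndexType and ValueType and that BLOCK_SIZE is a compile-time constant equal to the launch block size (256 here). Each block reduces GROUP_SIZE = BLOCK_SIZE * LOCAL_SIZE consecutive elements and writes one partial sum to out, so a second pass (or a host-side sum) is still required; launch_reduce is an illustrative name.

void launch_reduce(int n, const double* d_data, double* d_out)
{
    const int BLOCK  = 256;                    // must match BLOCK_SIZE used by the kernel
    const int LOCAL  = 8;                      // elements handled per thread
    const int GROUP  = BLOCK * LOCAL;          // elements handled per block
    const int blocks = (n + GROUP - 1) / GROUP;

    kernel_reduce<int, double><<<blocks, BLOCK>>>(n, d_data, d_out, GROUP, LOCAL);
    // d_out[0..blocks-1] then holds per-block partial sums.
}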
Code Example #18
__global__ void matMultParallelTiled(float* A, float* B, float* C, int n, int m, int o){
  
	__shared__ float Ads[TILE_WIDTH][TILE_WIDTH];
	__shared__ float Bds[TILE_WIDTH][TILE_WIDTH];

	int bx = blockIdx.x;
	int by = blockIdx.y;
	int tx = threadIdx.x;
	int ty = threadIdx.y;

	int row = by * TILE_WIDTH + ty;
	int col = bx * TILE_WIDTH + tx;

	float Pvalue=0;
	for (int k = 0; k < (m+TILE_WIDTH-1)/TILE_WIDTH; ++k){
		if ( row < n  &&  (k*TILE_WIDTH + tx) < m){
			Ads[ty][tx] = A[row * m + k*TILE_WIDTH + tx];
		}else{
			Ads[ty][tx] = 0;
		}
		if ((k*TILE_WIDTH + ty) < m && col < o){
			Bds[ty][tx] = B[(k*TILE_WIDTH + ty) * o + col];
		}else{
			Bds[ty][tx] =0;
		}
		__syncthreads();
		for(int k = 0; k < TILE_WIDTH; ++k){
			Pvalue += Ads[ty][k] * Bds[k][tx];
		}
		__syncthreads();
	}
	if (row < n && col < o){
		C[row * o + col] = Pvalue;
	}
}
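A launch sketch for the tiled multiplication above, computing C (n x o) = A (n x m) * B (m x o) in row-major layout. TILE_WIDTH must match the compile-time constant the kernel was built with; launch_matmul is an illustrative name.

#include <cuda_runtime.h>

void launch_matmul(float* d_A, float* d_B, float* d_C, int n, int m, int o)
{
    dim3 block(TILE_WIDTH, TILE_WIDTH);
    dim3 grid((o + TILE_WIDTH - 1) / TILE_WIDTH,   // x spans the columns of C
              (n + TILE_WIDTH - 1) / TILE_WIDTH);  // y spans the rows of C
    matMultParallelTiled<<<grid, block>>>(d_A, d_B, d_C, n, m, o);
}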
Code Example #19
__global__ void kernelCountParticles(PBox pb,
                                     uint64_cu* gCounter,
                                     Filter filter,
                                     Mapping mapper)
{

    typedef typename PBox::FrameType FRAME;
    const uint32_t Dim = Mapping::Dim;

    __shared__ FRAME *frame;
    __shared__ bool isValid;
    __shared__ int counter;
    __shared__ lcellId_t particlesInSuperCell;


    __syncthreads(); /*wait until all shared memory is initialised*/

    typedef typename Mapping::SuperCellSize SuperCellSize;

    const DataSpace<Dim > threadIndex(threadIdx);
    const int linearThreadIdx = DataSpaceOperations<Dim>::template map<SuperCellSize > (threadIndex);
    const DataSpace<Dim> superCellIdx(mapper.getSuperCellIndex(DataSpace<Dim > (blockIdx)));

    if (linearThreadIdx == 0)
    {
        frame = &(pb.getLastFrame(superCellIdx, isValid));
        particlesInSuperCell = pb.getSuperCell(superCellIdx).getSizeLastFrame();
        counter = 0;
    }
    __syncthreads();
    if (!isValid)
        return; //end kernel if we have no frames
    filter.setSuperCellPosition((superCellIdx - mapper.getGuardingSuperCells()) * mapper.getSuperCellSize());
    while (isValid)
    {
        if (linearThreadIdx < particlesInSuperCell)
        {
            if (filter(*frame, linearThreadIdx))
                atomicAdd(&counter, 1);
        }
        __syncthreads();
        if (linearThreadIdx == 0)
        {
            frame = &(pb.getPreviousFrame(*frame, isValid));
            particlesInSuperCell = math::CT::volume<SuperCellSize>::type::value;
        }
        __syncthreads();
    }

    __syncthreads();
    if (linearThreadIdx == 0)
    {
        atomicAdd(gCounter, (uint64_cu) counter);
    }
}
Code Example #20
__global__
void kernelCollideVoxelMapsDebug(Voxel* voxelmap, const uint32_t voxelmap_size, OtherVoxel* other_map,
                                 Collider collider, uint16_t* results)
{
//#define DISABLE_STORING_OF_COLLISIONS
    __shared__ uint16_t cache[cMAX_NR_OF_THREADS_PER_BLOCK];
    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
    uint32_t cache_index = threadIdx.x;
    cache[cache_index] = 0;

    while (i < voxelmap_size)
    {
        // todo / note: at the moment collision check is only used for DYNAMIC and SWEPT VOLUME type, static is used for debugging
        const bool collision = collider.collide(voxelmap[i], other_map[i]);
        if (collision) // store collision info
        {
//#ifndef DISABLE_STORING_OF_COLLISIONS
//      other_map[i].occupancy = 255;
//      other_map[i].voxeltype = eVT_COLLISION;
//#endif
            cache[cache_index] += 1;
        }
        i += blockDim.x * gridDim.x;
    }

    // debug: print collision coordinates

//  if (temp)
//  {
//    Vector3ui col_coord = mapToVoxels(voxelmap, dimensions, &(voxelmap[i]));
//    printf("Collision at voxel (%u) = (%u, %u, %u). Memory addresses are %p and %p.\n",
//           i, col_coord.x, col_coord.y, col_coord.z, (void*)&(voxelmap[i]), (void*)&(other_map[i]));
//  }
    __syncthreads();

    uint32_t j = blockDim.x / 2;

    while (j != 0)
    {
        if (cache_index < j)
        {
            cache[cache_index] = cache[cache_index] + cache[cache_index + j];
        }
        __syncthreads();
        j /= 2;
    }

    // copy results from this block to global memory
    if (cache_index == 0)
    {
        results[blockIdx.x] = cache[0];
    }
#undef DISABLE_STORING_OF_COLLISIONS
}
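A host-side follow-up sketch for the collision kernel above: it leaves one partial collision count per block in results, so the total is the sum over results[0..numBlocks-1]. The in-block tree reduction also requires blockDim.x to be a power of two no larger than cMAX_NR_OF_THREADS_PER_BLOCK; sumCollisions is an illustrative name.

#include <cstdint>
#include <vector>
#include <cuda_runtime.h>

uint32_t sumCollisions(const uint16_t* d_results, uint32_t numBlocks)
{
    std::vector<uint16_t> partials(numBlocks);
    cudaMemcpy(partials.data(), d_results, numBlocks * sizeof(uint16_t),
               cudaMemcpyDeviceToHost);

    uint32_t total = 0;
    for (uint16_t v : partials) total += v;   // sum the per-block partial counts
    return total;
}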
Code Example #21
__global__ void kAggShortRows(const float* mat, float* matSum, const uint width, const uint height, Agg agg, UnaryOp uop, BinaryOp bop) {
    const uint shmemX = THREADS_X + 1;
    __shared__ float shmem[AGG_SHORT_ROWS_THREADS_Y*shmemX];

    const uint tidx = hipThreadIdx_y * THREADS_X + hipThreadIdx_x;
    const uint ty = LOOPS_X == 1 ? tidx / width : hipThreadIdx_y; // when loops==1, width is gonna be smaller than block x dim
    const uint tx = LOOPS_X == 1 ? tidx % width : hipThreadIdx_x;
    const uint bidx = hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x;
    const uint blockRowIdx = bidx * AGG_SHORT_ROWS_LOOPS_Y * AGG_SHORT_ROWS_THREADS_Y;
    float* shmemWrite = shmem + MUL24(ty, shmemX) + tx;
    matSum += blockRowIdx + tidx;
//    shmem[MUL24(hipThreadIdx_y, shmemX) + hipThreadIdx_x] = 0;
    mat += width * blockRowIdx + MUL24(ty, width) + tx;
    float* shmemWriteZeros = &shmem[MUL24(hipThreadIdx_y,shmemX) + hipThreadIdx_x];

    bool doAgg = tidx < AGG_SHORT_ROWS_THREADS_Y ;

    if (blockRowIdx < height) {
#pragma unroll
        for (uint y = 0; y < AGG_SHORT_ROWS_LOOPS_Y*AGG_SHORT_ROWS_THREADS_Y; y += AGG_SHORT_ROWS_THREADS_Y) {
            doAgg &= tidx + y + blockRowIdx < height;
            const bool heightIdxOK = ty < AGG_SHORT_ROWS_THREADS_Y && ty + y + blockRowIdx < height;

            shmemWriteZeros[0] = agg.getBaseValue();
            __syncthreads();
#pragma unroll
            for(uint x = 0; x < LOOPS_X * THREADS_X; x+= THREADS_X) {
//                __syncthreads();
                if (heightIdxOK && x + tx < width) {
                    shmemWrite[0] = agg(uop(mat[x]), shmemWrite[0]);
                }
            }
            __syncthreads();
            if (doAgg) {
                /*
                 * I tried doing this final sum as a 4-step reduction, with 8 threads
                 * per warp participating. It was slightly slower.
                 */
                float accum = agg.getBaseValue();
                float* shmemRead = shmem + MUL24(tidx, shmemX);
                // this loops too much if the rows are really short :(
#pragma unroll
                for (uint i = 0; i < THREADS_X; i++) {
                    accum = agg(accum, shmemRead[0]);
                    shmemRead++;
                }
                matSum[0] = bop(matSum[0], accum);
                matSum += AGG_SHORT_ROWS_THREADS_Y;
            }
            __syncthreads();
            mat += width * AGG_SHORT_ROWS_THREADS_Y;
        }
    }
}
Code Example #22
File: old_asps.hpp Project: T-R0D/Past-Courses
/**
 * Matrix multiplication on the device: C = A * B (column-major)
 * wA is A's and B's width
 * Each block uses shared memory of (nIt * 2 * 16 * 16 * 4) = 2048 bytes (BLOCK_SIZE=16, sizeof(WORD)=4)
 * nIt is at most BLOCK_DIM/2 but does not affect the amount of shared memory used 
 * each multiprocessor can execute at most 8 blocks simultaneously (due to shared memory constraints) 
 */
__global__ void
matrixMul( float * C, float * A, float * B, int wA, int sCx, int sCy, int sAx, int sAy, int sBx, int sBy, int add)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Remember... column-major
    int sa = sAx * WA + sAy;
    int sb = sBx * WA + sBy;
    int sc = sCx * WA + sCy;
    
    int ba = BLOCK_SIZE * by;			// y-offset
    int bb = WA * BLOCK_SIZE * bx;		// x-offset
    
    float min = FLOATINF;
    
    int nIt = wA / BLOCK_SIZE;	// number of blocks in one dimension
    
    // Do block multiplication to update the C(i,j) block
    // Using A(i,1) * A(1,j) + A(i,2) * A(2,j) + ... + A(i,n) * A(n,j)
    for(int m = 0; m < nIt; ++m)
    {
        __shared__ float As[BLOCK_SIZE * BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE * BLOCK_SIZE];
    
        //load one element each
        As[tx*BLOCK_SIZE + ty] = A[sa + ba + m * BLOCK_SIZE * WA + tx *WA + ty];
        Bs[tx*BLOCK_SIZE + ty] = B[sb + bb + m * BLOCK_SIZE  + tx *WA + ty];
        __syncthreads();
    
    
        for(int k = 0; k < BLOCK_SIZE; ++k)
        {
            float a = As[k * BLOCK_SIZE + tx];	// (tx)th row
            float b = Bs[ty * BLOCK_SIZE + k];	// (ty)th column

            min = fminf(a+b, min);
        }
        __syncthreads();
    }
    // Write the block sub-matrix to device memory;
    // each thread writes one element

    if(add)
        C[sc + ba + bb + ty * WA + tx] = fminf(C[sc + ba + bb + ty * WA + tx], min);
    else
        C[sc + ba + bb + ty * WA + tx] = min;		// (tx,ty)th element
}
Code Example #23
File: gravitation_accjerk.hpp Project: eford/swarm
	/*
	 * Run the complete algorithm for computing acceleration and
	 * jerk on all bodies. This is tightly coupled with the
	 * BPPT integrators. ij, b and c are calculated from thread id.
	 *
	 * If you need to calculate only acceleration use \ref acc function
	 * instead.
	 *
	 * @ij The pair number for this thread.
	 * @b  The planet number for this thread.
	 * @c  coordinate number x:0,y:1,z:2
	 * @pos position for this planet's coordinate
	 * @vel velocity for this planet's coordinate
	 * @acc output variable to hold acceleration
	 * @jerk output variable to hold jerk.
	 *
	 */
	GPUAPI void operator() (int ij,int b,int c,double& pos,double& vel,double& acc,double& jerk) const{
		// Write positions to shared (global) memory
		if(b < nbod && c < 3)
			sys[b][c].pos() = pos , sys[b][c].vel() = vel;
		__syncthreads();
		if(ij < pair_count)
			calc_pair(ij);
		__syncthreads();
		if(b < nbod && c < 3){
			sum(b,c,acc,jerk);
		}
	}
Code Example #24
File: Reduce.hpp Project: Sanjay-Kamalapuri/picongpu
                __global__ void reduce(
                                       Src src, const uint32_t src_count,
                                       Dest dest,
                                       Functor func, Functor2 func2)
                {

                    const uint32_t l_tid = threadIdx.x;
                    const uint32_t tid = blockIdx.x * blockDim.x + l_tid;
                    const uint32_t globalThreadCount = gridDim.x * blockDim.x;
                    
                    /* CUDA cannot handle extern shared memory where the type is
                     * defined by a template
                     * - therefore we use type int for the definition (dirty but OK) */
                    extern __shared__ int s_mem_extern[];
                    /* create a pointer with the right type*/
                    Type* s_mem=(Type*)s_mem_extern;

                    if (tid >= src_count) return; /*end not needed threads*/
                    
                    __syncthreads(); /*wait until all shared memory is initialized*/

                    /*fill shared mem*/
                    Type r_value = src[tid];
                    /*fold the remaining global-memory elements into the register value*/
                    uint32_t i = tid + globalThreadCount;
                    while (i < src_count)
                    {
                        func(r_value, src[i]);
                        i += globalThreadCount;
                    }
                    s_mem[l_tid] = r_value;
                    __syncthreads();
                    /*now reduce shared memory*/
                    uint32_t chunk_count = blockDim.x;
                    uint32_t active_threads;

                    while (chunk_count != 1)
                    {
                        const float half_threads = (float) chunk_count / 2.0f;
                        active_threads = float2uint(half_threads);
                        if (threadIdx.x != 0 && l_tid >= active_threads) return; /*end not needed threads*/


                        chunk_count = ceilf(half_threads);
                        func(s_mem[l_tid], s_mem[l_tid + chunk_count]);

                        __syncthreads();
                    }

                    func2(dest[blockIdx.x], s_mem[0]);

                }
Code Example #25
File: CudaStopWatch.hpp Project: kempehzdr/DSP
	__device__ void stop() {
		if(intervall >= 0 && intervall < numberOfIntervalls) {
			unsigned long long diff = clock64() - currentTime;
			__syncthreads();
			atomicAdd(&counter[intervall].avg, diff);
			atomicMin(&counter[intervall].min, diff);
			atomicMax(&counter[intervall].max, diff);
			atomicAdd(&counter[intervall].iter, 1);
			intervall++;
			__syncthreads();
			currentTime = clock64();
		}
	}
Code Example #26
__device__
inline unsigned int
computeNumSmallerEigenvalsLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
                                const NumericT x,
                                const unsigned int tid,
                                const unsigned int num_intervals_active,
                                NumericT *s_d, NumericT *s_s,
                                unsigned int converged
                               )
{
    NumericT  delta = 1.0f;
    unsigned int count = 0;

    unsigned int rem = n;

    // do until whole diagonal and superdiagonal has been loaded and processed
    for (unsigned int i = 0; i < n; i += blockDim.x)
    {

        __syncthreads();

        // read new chunk of data into shared memory
        if ((i + threadIdx.x) < n)
        {

            s_d[threadIdx.x] = *(g_d + i + threadIdx.x);
            s_s[threadIdx.x] = *(g_s + i + threadIdx.x - 1);
        }

        __syncthreads();


        if (tid < num_intervals_active)
        {

            // perform (optimized) Gaussian elimination to determine the number
            // of eigenvalues that are smaller than n
            for (unsigned int k = 0; k < min(rem,blockDim.x); ++k)
            {
                delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
                // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta;
                count += (delta < 0) ? 1 : 0;
            }

        }  // end if thread currently processing an interval

        rem -= blockDim.x;
    }

    return count;
}
Code Example #27
    /*! \pre This thread block shall be converged.
     */
    __device__
    void construct_owner()
    {
      __shared__ int owner;

      __syncthreads();
      if(threadIdx.x == 0)
      {
        m_owner = &owner;
      }
      __syncthreads();

      disown_and_synchronize();
    }
Code Example #28
File: vec_distance.hpp Project: Achraf33/opencv
        template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)
        {
            if (glob_tid < len)
                smem[glob_tid] = vec1[glob_tid];
            __syncthreads();

            U* vec1ValsPtr = vec1Vals;

            #pragma unroll
            for (int i = tid; i < MAX_LEN; i += THREAD_DIM)
                *vec1ValsPtr++ = smem[i];

            __syncthreads();
        }
Code Example #29
__global__ void sequence_hamming_weight_kernel( device_sequence_space< IntType > * seqs, basic_data_space< unsigned int > * res, clotho::utility::algo_version< 5 > * v ) {
    assert( blockDim.x == 32 );
    assert( blockDim.y <= 32 );

    __shared__ unsigned int buffer[ 32 ];

    if( threadIdx.y == 0 ) {
        buffer[ threadIdx.x ] = 0;
    }
    __syncthreads();

    const unsigned int WIDTH = seqs->seq_width;

    IntType * seq_ptr = seqs->sequences;

    unsigned int N = 0;
    unsigned int seq_idx = blockIdx.y * gridDim.x  + blockIdx.x;
    unsigned int seq_begin = seq_idx * WIDTH + threadIdx.y;
    unsigned int seq_end = (seq_idx + 1) * WIDTH;
    while( seq_begin < seq_end ) {
        IntType x = seq_ptr[ seq_begin ];
        N += (( x >> threadIdx.x) & 1);
        seq_begin += blockDim.y;    
    }
    __syncthreads();

    for( unsigned int i = 1; i < 32; i <<= 1 ) {
        unsigned int t = __shfl_up( N, i );
        N += ((unsigned int) (threadIdx.x >= i) * t);
    }

    
    if( threadIdx.x == 31 ) {
        buffer[ threadIdx.y ] = N;
    }
    __syncthreads();

    N = buffer[ threadIdx.x ];
    __syncthreads();

    for( unsigned int i = 1; i < 32; i <<= 1 ) {
        unsigned int t = __shfl_up( N, i );
        N += ((unsigned int) (threadIdx.x >= i) * t);
    }

    if( threadIdx.y == 0 && threadIdx.x == 31 ) {
        res->data[ seq_idx ] = N;           
    }
    __syncthreads();
}
Code Example #30
                __global__ void reduce(
                                       Src src, const uint32_t src_count,
                                       Dest dest,
                                       Functor func, Functor2 func2)
                {
                    const uint32_t localId = threadIdx.x;
                    const uint32_t tid = blockIdx.x * blockDim.x + localId;
                    const uint32_t globalThreadCount = gridDim.x * blockDim.x;

                    /* CUDA cannot handle extern shared memory where the type is
                     * defined by a template
                     * - therefore we use type int for the definition (dirty but OK) */
                    extern __shared__ int s_mem_extern[];
                    /* create a pointer with the right type*/
                    Type* s_mem=(Type*)s_mem_extern;

                    if (tid >= src_count)
                        return; /*end not needed threads*/

                    /*fill shared mem*/
                    Type r_value = src[tid];
                    /*fold the remaining global-memory elements into the register value*/
                    uint32_t i = tid + globalThreadCount;
                    while (i < src_count)
                    {
                        func(r_value, src[i]);
                        i += globalThreadCount;
                    }
                    s_mem[localId] = r_value;
                    __syncthreads();
                    /*now reduce shared memory*/
                    uint32_t chunk_count = blockDim.x;

                    while (chunk_count != 1)
                    {
                        /* Half number of chunks (rounded down) */
                        uint32_t active_threads = chunk_count / 2;
                        if (localId >= active_threads)
                            return; /*end not needed threads*/

                        /* New chunks is half number of chunks rounded up for uneven counts
                         * --> local_tid=0 will reduce the single element for an odd number of values at the end */
                        chunk_count = (chunk_count + 1) / 2;
                        func(s_mem[localId], s_mem[localId + chunk_count]);

                        __syncthreads();
                    }

                    func2(dest[blockIdx.x], s_mem[0]);
                }