Пример #1
0
 __device__ __forceinline__ void
 pack_tsdf (float tsdf, int weight, short2& value)
 {
   // Convert the TSDF sample to fixed point: scale by DIVISOR and truncate
   // toward zero (round-to-zero intrinsic).
   const int scaled = __float2int_rz (tsdf * DIVISOR);

   // Saturate the fixed-point value to the closed range [-DIVISOR, DIVISOR].
   const int fixedp = max (-DIVISOR, min (DIVISOR, scaled));

   // Pack the fixed-point TSDF into .x and the weight into .y of the output.
   value = make_short2 (fixedp, weight);
 }
Пример #2
0
/**
 * @brief Computes inverse wavelet transform 53.
 *
 * The kernel works on a block-local tile held in shared memory and runs the
 * following stages in order, with a __syncthreads() barrier after each of the
 * first five:
 *   1. read_data_new()      - receives idata and the shared tile.
 *   2. iprocess_53_new()    - first pass, labelled "Process columns" below.
 *   3. save_to_shared_new() - receives the per-thread results and the tile
 *                             (labelled "save results and rotate" below).
 *   4. iprocess_53_new()    - second pass, labelled "Process rows" below.
 *   5. save_to_shared_new() - receives the per-thread results and the tile.
 *   6. save_data_new()      - receives the tile and odata.
 * The helpers are defined outside this block; what they do internally is
 * inferred here from their names and arguments only - TODO confirm.
 *
 * Block origins are derived from blockIdx.x/y scaled by BLOCKSIZEX/BLOCKSIZEY,
 * and per-thread work is indexed by threadIdx.x/y.
 *
 * NOTE(review): the shared tile and all helper templates are instantiated with
 * int, while idata/odata are float pointers; any conversion happens inside the
 * helpers, which are not visible here.
 *
 * @param idata Input data.
 * @param odata Output data
 * @param img_size Struct with input image width and height.
 * @param step Only step.x is read by this kernel; it is forwarded to
 *             read_data_new() and save_data_new() (presumably a row
 *             stride - TODO confirm against those helpers).
 */
__global__
void iwt53_new(const float *idata, float *odata, const int2 img_size, const int2 step)
{
	// shared memory for part of the signal
	// (second dimension is MEMSIZE + 1, i.e. one extra element per row)
	__shared__ int shared[MEMSIZE][MEMSIZE + 1];

	// LL subband dimensions - ceil of input image dimensions
	// ((n + 1) >> 1 is the integer form of ceil(n / 2); the float version is kept below for reference)
//	const int2 ll_sub = make_int2((int) ceilf(img_size.x / 2.0), (int) ceilf(img_size.y / 2.0));
	const int2 ll_sub = make_int2((img_size.x + 1) >> 1, (img_size.y + 1) >> 1);

	// Input x, y block dimension
	// Width
	// bidx.x - left block
	// bidx.y - right block (same offset as the left block, shifted by ll_sub.x)
	const int2 bidx = make_int2(blockIdx.x * BLOCKSIZEX, ll_sub.x + blockIdx.x * BLOCKSIZEX);
	// Height
	// bidy.x - top block
	// bidy.y - bottom block (same offset as the top block, shifted by ll_sub.y)
	const int2 bidy = make_int2(blockIdx.y * BLOCKSIZEY, ll_sub.y + blockIdx.y * BLOCKSIZEY);

	// Even thread id
	const short tidx2 = threadIdx.x * 2;

	// thread id
	short2 tid = make_short2(threadIdx.x, threadIdx.y);

	// Patch size
	/* Compute patch offset and size */
	// Each extent is clamped to the block size so that edge blocks cover only the remaining samples.
	// p_size_x.x - left part block x size
	// p_size_x.y - right part block x size
	const short2 p_size_x = make_short2(ll_sub.x - bidx.x < BLOCKSIZEX ? ll_sub.x - bidx.x : BLOCKSIZEX,
			img_size.x - bidx.y < BLOCKSIZEX ? img_size.x - bidx.y : BLOCKSIZEX);

	// p_size_y.x - top part block y size
	// p_size_y.y - bottom part block y size
	const short2 p_size_y = make_short2(ll_sub.y - bidy.x < BLOCKSIZEY ? ll_sub.y - bidy.x : BLOCKSIZEY,
			img_size.y - bidy.y < BLOCKSIZEY ? img_size.y - bidy.y : BLOCKSIZEY);

	// summary size
	const short2 p_size_sum = make_short2(p_size_x.x + p_size_x.y, p_size_y.x + p_size_y.y); /* .x = left + right x size, .y = top + bottom y size */

	// Threads offset to read margins
	short p_offset_y_t;
	// Allocate registers in order to compute even and odd pixels.
	int pix_neighborhood[6];
	// Minimize registers usage. Right | bottom offset. Odd | even result pixels.
	int results[6];

	// Stage 1: hand idata and the shared tile to the read helper.
	// NOTE(review): tid is re-initialised right after the barrier, presumably because
	// the helper may modify it - confirm against read_data_new's signature.
	read_data_new<int, MEMSIZE + 1>(1, tid, bidx, bidy, p_size_x, p_size_y, ll_sub, img_size, step.x, idata, shared, OFFSET_53/2);

	// Barrier before the first processing pass uses the shared tile.
	__syncthreads();

	// thread x id
	tid.x = threadIdx.x;
	// thread y id
	tid.y = threadIdx.y;

	// Row number
	p_offset_y_t = 0;

	// Process columns
	// (second size argument is the combined x size plus 2 * OFFSET_53)
	iprocess_53_new<MEMSIZE + 1>(tidx2, tid.y, p_offset_y_t, p_size_sum.y, p_size_sum.x + 2 * OFFSET_53, pix_neighborhood, shared, results);

	// Barrier before the shared tile is handed to save_to_shared_new.
	__syncthreads();

	// Reset the per-thread indices and the result counter before the save loop.
	tid.x = threadIdx.x;
	tid.y = threadIdx.y;
	p_offset_y_t = 0;

	// save results and rotate
	// Each iteration advances tid.y by BLOCKSIZEY and p_offset_y_t by one.
	while (tid.y < p_size_sum.x + 2 * OFFSET_53 && 2 * tid.x < p_size_sum.y)
	{
		// Can not dynamically index registers, avoid local memory usage.
		//		shared[tid.x][tid.y] = k2 * results[0 + p_offset_y * 2];
		//		if(tid.x + BLOCKSIZEX < p_size_sum.y)
		//			shared[tid.x + BLOCKSIZEX][tid.y] = k1 * results[1 + p_offset_y * 2];
		// Coordinates passed: (2 * tid.x, tid.y) and (2 * tid.x + 1, tid.y).
		save_to_shared_new<int, MEMSIZE + 1, (MEMSIZE + (BLOCKSIZEY - 1)) / BLOCKSIZEY> (1, make_short2(2 * tid.x, tid.y), make_short2(2 * tid.x + 1, tid.y), 2
				* tid.x + 1, p_offset_y_t, p_size_sum.y, results, shared);

		p_offset_y_t++;
		tid.y += BLOCKSIZEY;
	}
	// Barrier before the second processing pass uses the shared tile.
	__syncthreads();

	tid.x = threadIdx.x;
	tid.y = threadIdx.y;

	// Row number
	p_offset_y_t = 0;

	// Process rows
	// (size arguments are swapped relative to the first pass and carry no OFFSET_53 margin)
	iprocess_53_new<MEMSIZE + 1>(tidx2, tid.y, p_offset_y_t, p_size_sum.x, p_size_sum.y, pix_neighborhood, shared, results);

	// Barrier before the shared tile is handed to save_to_shared_new.
	__syncthreads();

	tid.x = threadIdx.x;
	tid.y = threadIdx.y;

	// Row number
	p_offset_y_t = 0;

	// Save results
	// Each iteration advances tid.y by BLOCKSIZEY and p_offset_y_t by one.
	while (2 * tid.x < p_size_sum.x && tid.y < p_size_sum.y)
	{
		// Can not dynamically index registers, avoid local memory usage.
		//		shared[tid.x][tid.y] = k2 * results[0 + p_offset_y * 2];
		//		if(tid.x + BLOCKSIZEX < p_size_sum.y)
		//			shared[tid.x + BLOCKSIZEX][tid.y] = k1 * results[1 + p_offset_y * 2];
		// Coordinates passed: (tid.y, 2 * tid.x) and (tid.y, 2 * tid.x + 1) - component order is swapped relative to the first save loop.
		save_to_shared_new<int, MEMSIZE + 1, (MEMSIZE + (BLOCKSIZEY - 1)) / BLOCKSIZEY> (1, make_short2(tid.y, 2 * tid.x), make_short2(tid.y, 2 * tid.x + 1), 2
				* tid.x + 1, p_offset_y_t, p_size_sum.x, results, shared);

		p_offset_y_t++;
		tid.y += BLOCKSIZEY;
	}
	// Barrier before the shared tile is handed to save_data_new.
	__syncthreads();

	tid.x = threadIdx.x;
	tid.y = threadIdx.y;

	// Save to GM
	// (passes the left/top block origins bidx.x, bidy.x together with step.x and odata)
	save_data_new<int, MEMSIZE + 1>(tid, p_size_sum, bidx.x, bidy.x, img_size, step.x, odata, shared);
}
Пример #3
0
__device__ __host__ short2 operator + (const short2 s2O1_, const short2 s2O2_){
	// Component-wise addition of two short2 vectors; callable from host and device.
	// Each sum is evaluated as int and then narrowed back to short.
	short2 s2Sum;
	s2Sum.x = static_cast<short>(s2O1_.x + s2O2_.x);
	s2Sum.y = static_cast<short>(s2O1_.y + s2O2_.y);
	return s2Sum;
}
Пример #4
0
__device__  short2 convert2s2(const float2 f2O1_){ // device-only: uses the __float2int_rn intrinsic
	// Round each float component to an int with the round-to-nearest intrinsic.
	const int nRoundedX = __float2int_rn(f2O1_.x);
	const int nRoundedY = __float2int_rn(f2O1_.y);
	// Pack into a short2; the int values are narrowed to short by make_short2.
	return make_short2(nRoundedX, nRoundedY);
}
Пример #5
0
__device__ short2 operator * (const short sO1_, const short2 s2O2_){
	// Scale both components of a short2 by a scalar (device code only).
	// Each product is evaluated as int and then narrowed back to short.
	short2 s2Scaled;
	s2Scaled.x = static_cast<short>(sO1_ * s2O2_.x);
	s2Scaled.y = static_cast<short>(sO1_ * s2O2_.y);
	return s2Scaled;
}