/**
 * Packs a TSDF sample and its weight into a single short2.
 *
 * The TSDF value is multiplied by DIVISOR, converted to int with
 * __float2int_rz (round toward zero) and clamped to [-DIVISOR, DIVISOR];
 * the result goes to value.x. The weight goes to value.y.
 *
 * @param tsdf    TSDF value to pack.
 * @param weight  Weight to pack; narrowed from int to short by make_short2.
 * @param value   Output: receives (fixed-point tsdf, weight).
 */
__device__ __forceinline__ void
pack_tsdf (float tsdf, int weight, short2& value)
{
  // Clamp the scaled value so it stays inside [-DIVISOR, DIVISOR].
  // NOTE(review): presumably DIVISOR fits in a short - confirm against its definition.
  int fixedp = max (-DIVISOR, min (DIVISOR, __float2int_rz (tsdf * DIVISOR)));

  // Unclamped variant, kept for reference only:
  // int fixedp = __float2int_rz(tsdf * DIVISOR);

  // This statement must live on its own line: placed after the line comment
  // above it would be commented out and `value` would never be written.
  value = make_short2 (fixedp, weight);
}
/**
 * @brief Computes inverse wavelet transform 53.
 *
 * The kernel stages a patch of the input in shared memory (read_data_new),
 * runs iprocess_53_new once for columns, stores the results back to shared
 * memory with swapped coordinates, runs iprocess_53_new again for rows,
 * stores those results, and finally writes the patch out (save_data_new).
 * A __syncthreads() barrier separates every stage.
 *
 * NOTE(review): the indexing uses blockIdx * BLOCKSIZEX / BLOCKSIZEY and
 * strides tid.y by BLOCKSIZEY, so the kernel presumably expects a
 * BLOCKSIZEX x BLOCKSIZEY thread block - confirm against the launch site.
 *
 * @param idata    Input data.
 * @param odata    Output data.
 * @param img_size Struct with input image width and height.
 * @param step     Struct with output image width and height
 *                 (only step.x is used here; it is forwarded to
 *                 read_data_new and save_data_new).
 */
__global__ void iwt53_new(const float *idata, float *odata, const int2 img_size, const int2 step)
{
    // Shared memory for part of the signal (inner dimension padded by one).
    __shared__ int shared[MEMSIZE][MEMSIZE + 1];

    // LL subband dimensions - ceil of half the input image dimensions.
    // const int2 ll_sub = make_int2((int) ceilf(img_size.x / 2.0), (int) ceilf(img_size.y / 2.0));
    const int2 ll_sub = make_int2((img_size.x + 1) >> 1, (img_size.y + 1) >> 1);

    // Input x, y block offsets.
    // Width:
    //   bidx.x - left block
    //   bidx.y - right block
    const int2 bidx = make_int2(blockIdx.x * BLOCKSIZEX, ll_sub.x + blockIdx.x * BLOCKSIZEX);
    // Height:
    //   bidy.x - top block
    //   bidy.y - bottom block
    const int2 bidy = make_int2(blockIdx.y * BLOCKSIZEY, ll_sub.y + blockIdx.y * BLOCKSIZEY);

    // Even thread id (2 * threadIdx.x).
    const short tidx2 = threadIdx.x * 2;
    // Thread id; tid.y is advanced inside the save loops below and reset
    // from threadIdx before each stage.
    short2 tid = make_short2(threadIdx.x, threadIdx.y);

    /* Compute patch offset and size */
    // p_size_x.x - left part block x size
    // p_size_x.y - right part block x size
    // Each size is capped at BLOCKSIZEX and shrinks for blocks at the edge.
    const short2 p_size_x = make_short2(ll_sub.x - bidx.x < BLOCKSIZEX ? ll_sub.x - bidx.x : BLOCKSIZEX,
                                        img_size.x - bidx.y < BLOCKSIZEX ? img_size.x - bidx.y : BLOCKSIZEX);
    // p_size_y.x - top part block y size
    // p_size_y.y - bottom part block y size
    // Each size is capped at BLOCKSIZEY and shrinks for blocks at the edge.
    const short2 p_size_y = make_short2(ll_sub.y - bidy.x < BLOCKSIZEY ? ll_sub.y - bidy.x : BLOCKSIZEY,
                                        img_size.y - bidy.y < BLOCKSIZEY ? img_size.y - bidy.y : BLOCKSIZEY);
    // Summed size of both parts in each direction.
    const short2 p_size_sum = make_short2(p_size_x.x + p_size_x.y, p_size_y.x + p_size_y.y);

    // Per-thread row counter handed to the processing / saving helpers.
    short p_offset_y_t;

    // Allocate registers in order to compute even and odd pixels.
    int pix_neighborhood[6];
    // Minimize registers usage. Right | bottom offset. Odd | even result pixels.
    int results[6];

    // Stage the input patch in shared memory.
    read_data_new<int, MEMSIZE + 1>(1, tid, bidx, bidy, p_size_x, p_size_y, ll_sub, img_size, step.x, idata, shared, OFFSET_53/2);
    __syncthreads();

    // thread x id
    tid.x = threadIdx.x;
    // thread y id
    tid.y = threadIdx.y;
    // Row number
    p_offset_y_t = 0;

    // Process columns
    iprocess_53_new<MEMSIZE + 1>(tidx2, tid.y, p_offset_y_t, p_size_sum.y, p_size_sum.x + 2 * OFFSET_53, pix_neighborhood, shared, results);
    __syncthreads();

    tid.x = threadIdx.x;
    tid.y = threadIdx.y;
    p_offset_y_t = 0;

    // Save results and rotate
    while (tid.y < p_size_sum.x + 2 * OFFSET_53 && 2 * tid.x < p_size_sum.y)
    {
        // Can not dynamically index registers, avoid local memory usage.
        // shared[tid.x][tid.y] = k2 * results[0 + p_offset_y * 2];
        // if(tid.x + BLOCKSIZEX < p_size_sum.y)
        //     shared[tid.x + BLOCKSIZEX][tid.y] = k1 * results[1 + p_offset_y * 2];
        save_to_shared_new<int, MEMSIZE + 1, (MEMSIZE + (BLOCKSIZEY - 1)) / BLOCKSIZEY>
            (1, make_short2(2 * tid.x, tid.y), make_short2(2 * tid.x + 1, tid.y), 2 * tid.x + 1, p_offset_y_t, p_size_sum.y, results, shared);

        p_offset_y_t++;
        tid.y += BLOCKSIZEY;
    }
    __syncthreads();

    tid.x = threadIdx.x;
    tid.y = threadIdx.y;
    // Row number
    p_offset_y_t = 0;

    // Process rows
    iprocess_53_new<MEMSIZE + 1>(tidx2, tid.y, p_offset_y_t, p_size_sum.x, p_size_sum.y, pix_neighborhood, shared, results);
    __syncthreads();

    tid.x = threadIdx.x;
    tid.y = threadIdx.y;
    // Row number
    p_offset_y_t = 0;

    // Save results
    while (2 * tid.x < p_size_sum.x && tid.y < p_size_sum.y)
    {
        // Can not dynamically index registers, avoid local memory usage.
        // shared[tid.x][tid.y] = k2 * results[0 + p_offset_y * 2];
        // if(tid.x + BLOCKSIZEX < p_size_sum.y)
        //     shared[tid.x + BLOCKSIZEX][tid.y] = k1 * results[1 + p_offset_y * 2];
        save_to_shared_new<int, MEMSIZE + 1, (MEMSIZE + (BLOCKSIZEY - 1)) / BLOCKSIZEY>
            (1, make_short2(tid.y, 2 * tid.x), make_short2(tid.y, 2 * tid.x + 1), 2 * tid.x + 1, p_offset_y_t, p_size_sum.x, results, shared);

        p_offset_y_t++;
        tid.y += BLOCKSIZEY;
    }
    __syncthreads();

    tid.x = threadIdx.x;
    tid.y = threadIdx.y;

    // Save to GM
    save_data_new<int, MEMSIZE + 1>(tid, p_size_sum, bidx.x, bidy.x, img_size, step.x, odata, shared);
}
/**
 * Component-wise sum of two short2 values.
 *
 * Each component is added in int arithmetic (integral promotion) and then
 * narrowed back to short.
 */
__device__ __host__ short2 operator + (const short2 s2O1_, const short2 s2O2_){
	short2 sum;
	sum.x = static_cast<short>(s2O1_.x + s2O2_.x);
	sum.y = static_cast<short>(s2O1_.y + s2O2_.y);
	return sum;
}
/**
 * Converts a float2 to a short2, component by component.
 *
 * Each component is converted to int with __float2int_rn
 * (round-to-nearest-even) and then narrowed to short by make_short2; no
 * clamping to the short range is performed here.
 *
 * Device-only: the function is declared __device__ and relies on the
 * __float2int_rn intrinsic, so it cannot be called from host code.
 */
__device__ short2 convert2s2(const float2 f2O1_){
	// The return statement must sit on its own line: on the same line as a
	// `//` comment it would be commented out together with the closing brace.
	return make_short2(__float2int_rn(f2O1_.x), __float2int_rn(f2O1_.y));
}
/**
 * Scales both components of a short2 by a scalar short.
 *
 * Each product is formed in int arithmetic (integral promotion) and then
 * narrowed back to short.
 */
__device__ short2 operator * (const short sO1_, const short2 s2O2_){
	short2 scaled;
	scaled.x = static_cast<short>(sO1_ * s2O2_.x);
	scaled.y = static_cast<short>(sO1_ * s2O2_.y);
	return scaled;
}