// Affine transform kernel using nearest-pixel lookup: every thread produces
// one output pixel for one (image, transform) pair.
//
// Thread layout (derived from the index math below): the x extent of the
// launch spans out.dims[0] * nimages and the y extent spans
// out.dims[1] * ntransforms. Threads outside that range exit immediately.
//
// NOTE(review): T and `inverse` are not declared inside this block; they are
// presumably template parameters declared directly above it -- confirm.
// c_tmat and calc_affine_inverse are likewise defined outside this block.
//
// out         - destination; the slice written is offset by
//               (t_idx * nimages + i_idx) * out.strides[2]
// in          - source; the slice read is offset by i_idx * in.strides[2]
// nimages     - number of image slices tiled along the x thread axis
// ntransforms - number of transforms tiled along the y thread axis
__global__ static void transform_kernel(Param<T> out, CParam<T> in,
                                        const dim_type nimages,
                                        const dim_type ntransforms)
{
    // Global thread coordinates across all tiled images / transforms.
    const dim_type xx = blockIdx.x * blockDim.x + threadIdx.x;
    const dim_type yy = blockIdx.y * blockDim.y + threadIdx.y;

    if (xx >= out.dims[0] * nimages || yy >= out.dims[1] * ntransforms)
        return;

    // Which image slice and which transform this thread belongs to.
    const dim_type i_idx = xx / out.dims[0];
    const dim_type t_idx = yy / out.dims[1];

    // Pixel coordinates inside the output slice. These are remainders of the
    // divisions above, so xido < out.dims[0] and yido < out.dims[1] always
    // hold here. The original code re-tested
    // `xido >= out.dims[0] && yido >= out.dims[1]` further down; that guard
    // could never fire (and used && where a bounds check needs ||), so it has
    // been dropped.
    const dim_type xido = xx - i_idx * out.dims[0];
    const dim_type yido = yy - t_idx * out.dims[1];

    // Slice base pointers: transform offset plus image offset for the output,
    // image offset only for the input.
    T *optr = out.ptr + t_idx * nimages * out.strides[2]
                      + i_idx * out.strides[2];
    const T *iptr = in.ptr + i_idx * in.strides[2];

    // Six coefficients per transform are read from c_tmat.
    const float *tmat_ptr = c_tmat + t_idx * 6;
    float tmat[6];

    // The coefficients are used as an output->input mapping. When `inverse`
    // is set they are taken as supplied; otherwise the supplied matrix is
    // passed through calc_affine_inverse first.
    if (inverse) {
        #pragma unroll
        for (int i = 0; i < 6; i++)
            tmat[i] = tmat_ptr[i];
    } else {
        calc_affine_inverse(tmat, tmat_ptr);
    }

    // Map the output pixel to the (rounded) input pixel.
    const dim_type xidi = round(xido * tmat[0] + yido * tmat[1] + tmat[2]);
    const dim_type yidi = round(xido * tmat[3] + yido * tmat[4] + tmat[5]);

    // Linear offsets inside the input / output slices.
    dim_type loci = (yidi * in.strides[1] + xidi);
    dim_type loco = (yido * out.strides[1] + xido);

    // Pixels that map outside the input are written as zero.
    T val = 0;
    if (xidi < in.dims[0] && yidi < in.dims[1] && xidi >= 0 && yidi >= 0)
        val = iptr[loci];

    optr[loco] = val;
}
// Affine transform kernel that dispatches to an interpolation helper
// (transform_n / transform_b / transform_l) selected by `method`.
//
// Thread layout (derived from the index math below): blockIdx.x is split into
// a set index (blockIdx.x / blocksXPerImage) and a block index within one
// image row; the resulting x coordinate spans out.dims[0]. The y extent spans
// out.dims[1] * ntransforms. Threads outside that range exit immediately.
//
// NOTE(review): T, `inverse` and `method` are not declared inside this block;
// they are presumably template parameters declared directly above it --
// confirm. c_tmat, calc_affine_inverse and the transform_* helpers are
// likewise defined outside this block.
//
// out             - destination; base pointer is offset by
//                   (t_idx * nimages + setId * nimages) * out.strides[2]
// in              - source; base pointer is offset by
//                   setId * nimages * in.strides[2]
// nimages         - number of images per set
// ntransforms     - number of transforms tiled along the y thread axis
// blocksXPerImage - number of x blocks that cover one image row
__global__ static void transform_kernel(Param<T> out, CParam<T> in,
                                        const int nimages, const int ntransforms,
                                        const int blocksXPerImage)
{
    // Which image set this block works on, and the block's x index inside it.
    const int setId      = blockIdx.x / blocksXPerImage;
    const int blockIdx_x = blockIdx.x - setId * blocksXPerImage;

    // Thread coordinates: x within one image, y across all tiled transforms.
    const int xx = blockIdx_x * blockDim.x + threadIdx.x;
    const int yy = blockIdx.y * blockDim.y + threadIdx.y;

    // Images handled for this set: whatever is left along out.dims[2] after
    // the preceding sets, capped at nimages.
    const int limages = min(out.dims[2] - setId * nimages, nimages);

    if (xx >= out.dims[0] || yy >= out.dims[1] * ntransforms)
        return;

    // Which transform this thread belongs to.
    const int t_idx = yy / out.dims[1];

    // Pixel coordinates inside the output slice. xido equals xx, which passed
    // the guard above, and yido is a remainder of the division by
    // out.dims[1], so both are always in range here. The original code
    // re-tested `xido >= out.dims[0] && yido >= out.dims[1]` before the
    // dispatch; that guard could never fire (and used && where a bounds check
    // needs ||), so it has been dropped.
    const int xido = xx;
    const int yido = yy - t_idx * out.dims[1];

    // Base pointers: transform offset plus set offset for the output, set
    // offset only for the input.
    T *optr = out.ptr + t_idx * nimages * out.strides[2]
                      + setId * nimages * out.strides[2];
    const T *iptr = in.ptr + setId * nimages * in.strides[2];

    // Six coefficients per transform are read from c_tmat.
    const float *tmat_ptr = c_tmat + t_idx * 6;
    float tmat[6];

    // The coefficients are handed to the helpers as an output->input mapping.
    // When `inverse` is set they are taken as supplied; otherwise the
    // supplied matrix is passed through calc_affine_inverse first.
    if (inverse) {
        #pragma unroll
        for (int i = 0; i < 6; i++)
            tmat[i] = tmat_ptr[i];
    } else {
        calc_affine_inverse(tmat, tmat_ptr);
    }

    // Hand off to the helper matching the requested interpolation method;
    // any other method value writes nothing.
    switch (method) {
        case AF_INTERP_NEAREST:
            transform_n(optr, out, iptr, in, tmat, xido, yido, limages);
            break;
        case AF_INTERP_BILINEAR:
            transform_b(optr, out, iptr, in, tmat, xido, yido, limages);
            break;
        case AF_INTERP_LOWER:
            transform_l(optr, out, iptr, in, tmat, xido, yido, limages);
            break;
        default:
            break;
    }
}