/* "Do what I mean": fill in any unspecified (zero) input/output strides
 * of tensor t, walking its dimensions from innermost to outermost.  An
 * unset stride is derived from the previous (inner) dimension's stride
 * times its transformed size; *last_iodim carries the innermost dimension
 * across successive calls and is updated to the outermost dimension
 * processed here.  dt marks the dimension whose size is subject to the
 * nti (input) / nto (output) size transforms. */
static bench_tensor *dwim(bench_tensor *t, bench_iodim **last_iodim,
                          n_transform nti, n_transform nto,
                          bench_iodim *dt)
{
     bench_iodim *prev, *cur;
     int k;

     /* nothing to do for infinite-rank or rank-0 tensors */
     if (!FINITE_RNK(t->rnk) || t->rnk < 1)
          return t;

     prev = *last_iodim;
     for (k = t->rnk - 1; k >= 0; --k) {
          cur = t->dims + k;
          if (!cur->is)
               cur->is = prev->is * transform_n(prev->n, prev == dt ? nti : SAME);
          if (!cur->os)
               cur->os = prev->os * transform_n(prev->n, prev == dt ? nto : SAME);
          prev = cur;
     }

     *last_iodim = prev;
     return t;
}
// Apply one of `ntransforms` affine transforms (stored in constant memory
// c_tmat, 6 floats per matrix) to a batch of 2D images.
//
// Launch layout: gridDim.x provides blocksXPerImage blocks per image set
// along x (setId selects the image set); gridDim.y tiles
// out.dims[1] * ntransforms, so yy encodes both the transform index and
// the output row.  `inverse` (whether c_tmat already holds the inverse
// matrix) and `method` (interpolation type) come from the enclosing
// template — presumably compile-time constants; confirm against the
// instantiation site.
__global__ static void transform_kernel(Param<T> out, CParam<T> in,
                                        const int nimages,
                                        const int ntransforms,
                                        const int blocksXPerImage)
{
    // Which image set this block works on, and the block's x index
    // within that set.
    const int setId      = blockIdx.x / blocksXPerImage;
    const int blockIdx_x = blockIdx.x - setId * blocksXPerImage;

    // Global thread coordinates.
    const int xx = blockIdx_x * blockDim.x + threadIdx.x;
    const int yy = blockIdx.y * blockDim.y + threadIdx.y;

    // Number of images remaining in this set (the last set may be
    // partially filled).
    const int limages = min(out.dims[2] - setId * nimages, nimages);

    if(xx >= out.dims[0] || yy >= out.dims[1] * ntransforms)
        return;

    // Transform index, and output coordinates local to that transform.
    const int t_idx = yy / out.dims[1];
    const int xido = xx;
    const int yido = yy - t_idx * out.dims[1];

    // Global offsets: transform-channel offset + image-set offset along
    // dimension 2.
    T *optr = out.ptr + t_idx * nimages * out.strides[2]
                      + setId * nimages * out.strides[2];
    const T *iptr = in.ptr + setId * nimages * in.strides[2];

    // Transform matrices live in constant memory.
    const float *tmat_ptr = c_tmat + t_idx * 6;
    float tmat[6];

    // An inverse transform matrix is expected by default; for a forward
    // transform we must compute its inverse first.
    if(inverse) {
        #pragma unroll
        for(int i = 0; i < 6; i++)
            tmat[i] = tmat_ptr[i];
    } else {
        calc_affine_inverse(tmat, tmat_ptr);
    }

    // FIX: this guard used `&&`, so it could never reject a thread that
    // is out of range in only one coordinate.  A bounds guard must use
    // `||`.  (The earlier guard already makes this unreachable, so the
    // fix cannot change observable behavior — it only makes the safety
    // check meaningful.)
    if (xido >= out.dims[0] || yido >= out.dims[1])
        return;

    switch(method) {
        case AF_INTERP_NEAREST:
            transform_n(optr, out, iptr, in, tmat, xido, yido, limages);
            break;
        case AF_INTERP_BILINEAR:
            transform_b(optr, out, iptr, in, tmat, xido, yido, limages);
            break;
        case AF_INTERP_LOWER:
            transform_l(optr, out, iptr, in, tmat, xido, yido, limages);
            break;
        default:
            break;
    }
}
// Rotate batches of 2D images: each block handles one tile of one image
// set within one batch, applying the matrix t.tmat with the selected
// interpolation routine (`method` comes from the enclosing template).
__global__ static void rotate_kernel(Param<T> out, CParam<T> in,
                                     const tmat_t t, const int nimages,
                                     const int nbatches,
                                     const int blocksXPerImage,
                                     const int blocksYPerImage)
{
    // Decompose the grid: blockIdx.x carries (image set, tile-x) and
    // blockIdx.y carries (batch, tile-y).
    const int imgSet  = blockIdx.x / blocksXPerImage;
    const int tileX   = blockIdx.x - imgSet * blocksXPerImage;
    const int batchId = blockIdx.y / blocksYPerImage;
    const int tileY   = blockIdx.y - batchId * blocksYPerImage;

    // Output pixel coordinates handled by this thread.
    const int px = tileX * blockDim.x + threadIdx.x;
    const int py = tileY * blockDim.y + threadIdx.y;

    if (px >= out.dims[0] || py >= out.dims[1])
        return;

    // The final image set may contain fewer than nimages images.
    const int limages = min(out.dims[2] - imgSet * nimages, nimages);

    // Base pointers: image-set offset along dimension 2 plus batch
    // offset along dimension 3.
    T *dst = out.ptr + imgSet * nimages * out.strides[2]
                     + batchId * out.strides[3];
    const T *src = in.ptr + imgSet * nimages * in.strides[2]
                          + batchId * in.strides[3];

    switch (method) {
        case AF_INTERP_NEAREST:
            transform_n(dst, out, src, in, t.tmat, px, py, limages);
            break;
        case AF_INTERP_BILINEAR:
            transform_b(dst, out, src, in, t.tmat, px, py, limages);
            break;
        case AF_INTERP_LOWER:
            transform_l(dst, out, src, in, t.tmat, px, py, limages);
            break;
        default:
            break;
    }
}