/**
 * Second-order forward difference of `in` along dimension `dim`.
 *
 * For every coordinate p of `out` this stores
 *     in[p + 2e] + in[p] - in[p + e] - in[p + e]
 * (i.e. in[p + 2e] - 2 * in[p + e] + in[p]), where e is one step along `dim`.
 *
 * @param out  destination; its dims drive all four loops
 * @param in   source; it is read up to two elements past the output extent
 *             along `dim` (assumes the caller sized `out` accordingly -
 *             TODO confirm)
 * @param dim  dimension to difference along (0-3); for any other value no
 *             neighbour offset is applied
 */
void diff2(Param<T> out, CParam<T> in, int const dim) {
    af::dim4 const dims     = out.dims();
    af::dim4 const istrides = in.strides();
    af::dim4 const ostrides = out.strides();

    // Exactly one of these is true for a valid `dim`; used as a 0/1 step.
    bool const is_dim0 = dim == 0;
    bool const is_dim1 = dim == 1;
    bool const is_dim2 = dim == 2;
    bool const is_dim3 = dim == 3;

    T const *const inPtr = in.get();
    T *outPtr            = out.get();

    // TODO: Improve this
    for (dim_t l = 0; l < dims[3]; l++) {
        for (dim_t k = 0; k < dims[2]; k++) {
            for (dim_t j = 0; j < dims[1]; j++) {
                for (dim_t i = 0; i < dims[0]; i++) {
                    // Linear offsets are kept in dim_t (as in the other
                    // kernels here) so they are not narrowed to int.
                    dim_t const idx = getIdx(istrides, i, j, k, l);
                    dim_t const jdx =
                        getIdx(istrides, i + is_dim0, j + is_dim1,
                               k + is_dim2, l + is_dim3);
                    dim_t const kdx =
                        getIdx(istrides, i + 2 * is_dim0, j + 2 * is_dim1,
                               k + 2 * is_dim2, l + 2 * is_dim3);
                    dim_t const odx = getIdx(ostrides, i, j, k, l);

                    // out = in[+2] - 2 * in[+1] + in[0]
                    outPtr[odx] =
                        inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx];
                }
            }
        }
    }
}
/**
 * Out-of-place transpose of the first two dimensions of `input`.
 *
 * Output element (i, j, k, l) receives input element (j, i, k, l); the
 * third and fourth coordinates are carried through unchanged. When
 * `conjugate` (declared outside this block) is true, each value is passed
 * through getConjugate() first.
 *
 * @param output destination; its dims drive the loops
 * @param input  source
 */
void transpose(Param<T> output, CParam<T> input) {
    const dim4 outDims    = output.dims();
    const dim4 outStrides = output.strides();
    const dim4 inStrides  = input.strides();

    T *dst             = output.get();
    T const *const src = input.get();

    // The two outer loops walk the batch coordinates; they run once each
    // when the data has no extent along those dimensions.
    for (dim_t w = 0; w < outDims[3]; ++w) {
        for (dim_t z = 0; z < outDims[2]; ++z) {
            for (dim_t c = 0; c < outDims[1]; ++c) {
                for (dim_t r = 0; r < outDims[0]; ++r) {
                    // getIdx folds the batch coordinates into the offset,
                    // so the base pointers never need to be advanced.
                    const dim_t srcIdx = getIdx(inStrides, c, r, z, w);
                    const dim_t dstIdx = getIdx(outStrides, r, c, z, w);

                    if (!conjugate) {
                        dst[dstIdx] = src[srcIdx];
                    } else {
                        dst[dstIdx] = getConjugate(src[srcIdx]);
                    }
                }
            }
        }
    }
}
/**
 * In-place transpose of the first two dimensions of `input`.
 *
 * Each pair of mirrored off-diagonal elements is swapped. When `conjugate`
 * (declared outside this block) is true, every element - including the
 * diagonal - is also passed through getConjugate(), so the result is the
 * conjugate transpose.
 *
 * Assumes idims[0] == idims[1]; the loops do not check this - TODO confirm
 * that callers guarantee it.
 *
 * @param input array transposed in place
 */
void transpose_inplace(Param<T> input) {
    const dim4 idims    = input.dims();
    const dim4 istrides = input.strides();

    T *in = input.get();

    // The two outer loops walk the batch coordinates; they run once each
    // when the data has no extent along those dimensions.
    for (dim_t l = 0; l < idims[3]; ++l) {
        for (dim_t k = 0; k < idims[2]; ++k) {
            for (dim_t j = 0; j < idims[1]; ++j) {
                // The swap loop below starts at j + 1 and therefore never
                // visits (j, j); a conjugate transpose must still conjugate
                // the diagonal entry.
                if (conjugate && j < idims[0]) {
                    const dim_t dIdx = getIdx(istrides, j, j, k, l);
                    in[dIdx]         = getConjugate(in[dIdx]);
                }

                // Visit one triangle only; std::swap handles the mirror.
                for (dim_t i = j + 1; i < idims[0]; ++i) {
                    const dim_t iIdx = getIdx(istrides, j, i, k, l);
                    const dim_t oIdx = getIdx(istrides, i, j, k, l);

                    if (conjugate) {
                        in[iIdx] = getConjugate(in[iIdx]);
                        in[oIdx] = getConjugate(in[oIdx]);
                    }
                    std::swap(in[iIdx], in[oIdx]);
                }
            }
        }
    }
}
void wrap_dim(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy, const dim_t sx, const dim_t sy, const dim_t px, const dim_t py) { const T* inPtr = in.get(); T* outPtr = out.get(); af::dim4 idims = in.dims(); af::dim4 odims = out.dims(); af::dim4 istrides = in.strides(); af::dim4 ostrides = out.strides(); dim_t nx = (odims[0] + 2 * px - wx) / sx + 1; for (dim_t w = 0; w < idims[3]; w++) { for (dim_t z = 0; z < idims[2]; z++) { dim_t cIn = w * istrides[3] + z * istrides[2]; dim_t cOut = w * ostrides[3] + z * ostrides[2]; const T* iptr_ = inPtr + cIn; T* optr = outPtr + cOut; for (dim_t col = 0; col < idims[d]; col++) { // Offset output ptr const T* iptr = iptr_ + col * istrides[d]; // Calculate input window index dim_t winy = (col / nx); dim_t winx = (col % nx); dim_t startx = winx * sx; dim_t starty = winy * sy; dim_t spx = startx - px; dim_t spy = starty - py; // Short cut condition ensuring all values within input // dimensions bool cond = (spx >= 0 && spx + wx < odims[0] && spy >= 0 && spy + wy < odims[1]); for (dim_t y = 0; y < wy; y++) { for (dim_t x = 0; x < wx; x++) { dim_t xpad = spx + x; dim_t ypad = spy + y; dim_t iloc = (y * wx + x); if (d == 0) iloc *= istrides[1]; if (cond || (xpad >= 0 && xpad < odims[0] && ypad >= 0 && ypad < odims[1])) { dim_t oloc = (ypad * ostrides[1] + xpad * ostrides[0]); // FIXME: When using threads, atomize this optr[oloc] += iptr[iloc]; } } } } } } }
void bilateral(Param<OutT> out, CParam<InT> in, float const s_sigma, float const c_sigma) { af::dim4 const dims = in.dims(); af::dim4 const istrides = in.strides(); af::dim4 const ostrides = out.strides(); // clamp spatical and chromatic sigma's float space_ = std::min(11.5f, std::max(s_sigma, 0.f)); float color_ = std::max(c_sigma, 0.f); dim_t const radius = std::max((dim_t)(space_ * 1.5f), (dim_t)1); float const svar = space_*space_; float const cvar = color_*color_; for(dim_t b3=0; b3<dims[3]; ++b3) { OutT *outData = out.get() + b3 * ostrides[3]; InT const * inData = in.get() + b3 * istrides[3]; // b3 for loop handles following batch configurations // - gfor // - input based batch // - when input is 4d array for color images for(dim_t b2=0; b2<dims[2]; ++b2) { // b2 for loop handles following batch configurations // - channels // - input based batch // - when input is 3d array for grayscale images for(dim_t j=0; j<dims[1]; ++j) { // j steps along 2nd dimension for(dim_t i=0; i<dims[0]; ++i) { // i steps along 1st dimension OutT norm = 0.0; OutT res = 0.0; OutT const center = (OutT)inData[getIdx(istrides, i, j)]; for(dim_t wj=-radius; wj<=radius; ++wj) { // clamps offsets dim_t tj = clamp(j+wj, dim_t(0), dims[1]-1); for(dim_t wi=-radius; wi<=radius; ++wi) { // clamps offsets dim_t ti = clamp(i+wi, dim_t(0), dims[0]-1); // proceed OutT const val= (OutT)inData[getIdx(istrides, ti, tj)]; OutT const gauss_space = (wi*wi+wj*wj)/(-2.0*svar); OutT const gauss_range = ((center-val)*(center-val))/(-2.0*cvar); OutT const weight = std::exp(gauss_space+gauss_range); norm += weight; res += val*weight; } } // filter loop ends here outData[getIdx(ostrides, i, j)] = res/norm; } //1st dimension loop ends here } //2nd dimension loop ends here outData += ostrides[2]; inData += istrides[2]; } } }
void select(Param<T> out, CParam<char> cond, CParam<T> a, CParam<T> b) { af::dim4 adims = a.dims(); af::dim4 astrides = a.strides(); af::dim4 bdims = b.dims(); af::dim4 bstrides = b.strides(); af::dim4 cdims = cond.dims(); af::dim4 cstrides = cond.strides(); af::dim4 odims = out.dims(); af::dim4 ostrides = out.strides(); bool is_a_same[] = {adims[0] == odims[0], adims[1] == odims[1], adims[2] == odims[2], adims[3] == odims[3]}; bool is_b_same[] = {bdims[0] == odims[0], bdims[1] == odims[1], bdims[2] == odims[2], bdims[3] == odims[3]}; bool is_c_same[] = {cdims[0] == odims[0], cdims[1] == odims[1], cdims[2] == odims[2], cdims[3] == odims[3]}; const T *aptr = a.get(); const T *bptr = b.get(); T *optr = out.get(); const char *cptr = cond.get(); for (int l = 0; l < odims[3]; l++) { int o_off3 = ostrides[3] * l; int a_off3 = astrides[3] * is_a_same[3] * l; int b_off3 = bstrides[3] * is_b_same[3] * l; int c_off3 = cstrides[3] * is_c_same[3] * l; for (int k = 0; k < odims[2]; k++) { int o_off2 = ostrides[2] * k + o_off3; int a_off2 = astrides[2] * is_a_same[2] * k + a_off3; int b_off2 = bstrides[2] * is_b_same[2] * k + b_off3; int c_off2 = cstrides[2] * is_c_same[2] * k + c_off3; for (int j = 0; j < odims[1]; j++) { int o_off1 = ostrides[1] * j + o_off2; int a_off1 = astrides[1] * is_a_same[1] * j + a_off2; int b_off1 = bstrides[1] * is_b_same[1] * j + b_off2; int c_off1 = cstrides[1] * is_c_same[1] * j + c_off2; for (int i = 0; i < odims[0]; i++) { bool cval = is_c_same[0] ? cptr[c_off1 + i] : cptr[c_off1]; T aval = is_a_same[0] ? aptr[a_off1 + i] : aptr[a_off1]; T bval = is_b_same[0] ? bptr[b_off1 + i] : bptr[b_off1]; T oval = cval ? aval : bval; optr[o_off1 + i] = oval; } } } } }
/**
 * Copies one diagonal of every 2-D slice of `in` into `out`.
 *
 * @param out  destination; odims[0] elements are written per slice,
 *             contiguously from the slice start
 * @param in   source
 * @param num  diagonal selector: a positive value shifts the start by `num`
 *             steps of in.strides(1), otherwise by -num elements
 *
 * Positions where i is not below both idims[0] and idims[1] are filled
 * with scalar<T>(0).
 */
void diagExtract(Param<T> out, CParam<T> in, int const num) {
    af::dim4 const outDims = out.dims();
    af::dim4 const inDims  = in.dims();

    // Offset of the first diagonal element inside a slice
    int const diagStart = (num > 0) ? (num * in.strides(1)) : (-num);

    int const nBatch3 = (int)outDims[3];
    int const nBatch2 = (int)outDims[2];
    int const diagLen = (int)outDims[0];

    for (int w = 0; w < nBatch3; w++) {
        for (int z = 0; z < nBatch2; z++) {
            const T *src =
                in.get() + w * in.strides(3) + z * in.strides(2) + diagStart;
            T *dst = out.get() + w * out.strides(3) + z * out.strides(2);

            for (int e = 0; e < diagLen; e++) {
                bool const inside = (e < inDims[0] && e < inDims[1]);

                T value = scalar<T>(0);
                // One step down the diagonal is one step in each of the
                // first two dimensions.
                if (inside) { value = src[e * in.strides(1) + e]; }

                dst[e] = value;
            }
        }
    }
}
void approx2(Param<InT> output, CParam<InT> input, CParam<LocT> xposition, CParam<LocT> yposition, float const offGrid, af_interp_type method) { InT * out = output.get(); const LocT *xpos = xposition.get(); const LocT *ypos = yposition.get(); af::dim4 const odims = output.dims(); af::dim4 const idims = input.dims(); af::dim4 const xdims = xposition.dims(); af::dim4 const ostrides = output.strides(); af::dim4 const istrides = input.strides(); af::dim4 const xstrides = xposition.strides(); af::dim4 const ystrides = yposition.strides(); Interp2<InT, LocT, order> interp; bool batch = !(xdims[2] == 1 && xdims[3] == 1); for(dim_t idw = 0; idw < odims[3]; idw++) { for(dim_t idz = 0; idz < odims[2]; idz++) { dim_t xoffzw = idw * xstrides[3] + idz * xstrides[2]; dim_t yoffzw = idw * ystrides[3] + idz * ystrides[2]; dim_t ooffzw = idw * ostrides[3] + idz * ostrides[2]; dim_t ioffzw = idw * istrides[3] + idz * istrides[2]; for(dim_t idy = 0; idy < odims[1]; idy++) { dim_t xoff = xoffzw * batch + idy * xstrides[1]; dim_t yoff = yoffzw * batch + idy * ystrides[1]; dim_t ooff = ooffzw + idy * ostrides[1]; for(dim_t idx = 0; idx < odims[0]; idx++) { const LocT x = xpos[xoff + idx]; const LocT y = ypos[yoff + idx]; // FIXME: Only cubic interpolation is doing clamping // We need to make it consistent across all methods // Not changing the behavior because tests will fail bool clamp = order == 3; if (x < 0 || idims[0] < x + 1 || y < 0 || idims[1] < y + 1 ) { out[ooff + idx] = scalar<InT>(offGrid); } else { interp(output, ooff + idx, input, ioffzw, x, y, method, 1, clamp); } } } } } }
/**
 * Builds a square matrix per batch entry with the values of `in` placed on
 * one diagonal and scalar<T>(0) everywhere else.
 *
 * @param out  destination; out.dims(0) is used as both matrix extents
 * @param in   source; in.dims(1) is the batch count and each batch entry
 *             starts in.strides(1) elements after the previous one
 * @param num  diagonal selector: element (i, j) is filled when
 *             i == j - num
 */
void diagCreate(Param<T> out, CParam<T> in, int const num) {
    int const batch = in.dims(1);
    int const size  = out.dims(0);

    T const *src = in.get();
    T *dst       = out.get();

    for (int b = 0; b < batch; b++) {
        for (int c = 0; c < size; c++) {
            for (int r = 0; r < size; r++) {
                T value = scalar<T>(0);

                bool const onDiagonal = (r == c - num);
                if (onDiagonal) {
                    // The source index follows whichever coordinate starts
                    // at zero on the selected diagonal.
                    value = src[(num > 0) ? r : c];
                }

                dst[r + c * out.strides(1)] = value;
            }
        }

        // Advance to the next batch entry
        dst += out.strides(2);
        src += in.strides(1);
    }
}
void copyElemwise(Param<OutT> dst, CParam<InT> src, OutT default_value, double factor) { af::dim4 src_dims = src.dims(); af::dim4 dst_dims = dst.dims(); af::dim4 src_strides = src.strides(); af::dim4 dst_strides = dst.strides(); InT const * const src_ptr = src.get(); OutT * dst_ptr = dst.get(); dim_t trgt_l = std::min(dst_dims[3], src_dims[3]); dim_t trgt_k = std::min(dst_dims[2], src_dims[2]); dim_t trgt_j = std::min(dst_dims[1], src_dims[1]); dim_t trgt_i = std::min(dst_dims[0], src_dims[0]); for(dim_t l=0; l<dst_dims[3]; ++l) { dim_t src_loff = l*src_strides[3]; dim_t dst_loff = l*dst_strides[3]; bool isLvalid = l<trgt_l; for(dim_t k=0; k<dst_dims[2]; ++k) { dim_t src_koff = k*src_strides[2]; dim_t dst_koff = k*dst_strides[2]; bool isKvalid = k<trgt_k; for(dim_t j=0; j<dst_dims[1]; ++j) { dim_t src_joff = j*src_strides[1]; dim_t dst_joff = j*dst_strides[1]; bool isJvalid = j<trgt_j; for(dim_t i=0; i<dst_dims[0]; ++i) { OutT temp = default_value; if (isLvalid && isKvalid && isJvalid && i<trgt_i) { dim_t src_idx = i*src_strides[0] + src_joff + src_koff + src_loff; temp = OutT(src_ptr[src_idx])*OutT(factor); } dim_t dst_idx = i*dst_strides[0] + dst_joff + dst_koff + dst_loff; dst_ptr[dst_idx] = temp; } } } } }
void join(const int dim, Param<T> out, const std::vector<CParam<T>> inputs) { af::dim4 zero(0,0,0,0); af::dim4 d = zero; switch(dim) { case 0: join_append<T, T, 0>(out.get(), inputs[0].get(), zero, out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); for(int i = 1; i < n_arrays; i++) { d += inputs[i - 1].dims(); join_append<T, T, 0>(out.get(), inputs[i].get(), calcOffset<0>(d), out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); } break; case 1: join_append<T, T, 1>(out.get(), inputs[0].get(), zero, out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); for(int i = 1; i < n_arrays; i++) { d += inputs[i - 1].dims(); join_append<T, T, 1>(out.get(), inputs[i].get(), calcOffset<1>(d), out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); } break; case 2: join_append<T, T, 2>(out.get(), inputs[0].get(), zero, out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); for(int i = 1; i < n_arrays; i++) { d += inputs[i - 1].dims(); join_append<T, T, 2>(out.get(), inputs[i].get(), calcOffset<2>(d), out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); } break; case 3: join_append<T, T, 3>(out.get(), inputs[0].get(), zero, out.dims(), inputs[0].dims(), out.strides(), inputs[0].strides()); for(int i = 1; i < n_arrays; i++) { d += inputs[i - 1].dims(); join_append<T, T, 3>(out.get(), inputs[i].get(), calcOffset<3>(d), out.dims(), inputs[i].dims(), out.strides(), inputs[i].strides()); } break; } }
void select_scalar(Param<T> out, CParam<char> cond, CParam<T> a, const double b) { af::dim4 astrides = a.strides(); af::dim4 adims = a.dims(); af::dim4 cstrides = cond.strides(); af::dim4 cdims = cond.dims(); af::dim4 odims = out.dims(); af::dim4 ostrides = out.strides(); const T *aptr = a.get(); T *optr = out.get(); const char *cptr = cond.get(); bool is_a_same[] = {adims[0] == odims[0], adims[1] == odims[1], adims[2] == odims[2], adims[3] == odims[3]}; bool is_c_same[] = {cdims[0] == odims[0], cdims[1] == odims[1], cdims[2] == odims[2], cdims[3] == odims[3]}; for (int l = 0; l < odims[3]; l++) { int o_off3 = ostrides[3] * l; int a_off3 = astrides[3] * is_a_same[3] * l; int c_off3 = cstrides[3] * is_c_same[3] * l; for (int k = 0; k < odims[2]; k++) { int o_off2 = ostrides[2] * k + o_off3; int a_off2 = astrides[2] * is_a_same[2] * k + a_off3; int c_off2 = cstrides[2] * is_c_same[2] * k + c_off3; for (int j = 0; j < odims[1]; j++) { int o_off1 = ostrides[1] * j + o_off2; int a_off1 = astrides[1] * is_a_same[1] * j + a_off2; int c_off1 = cstrides[1] * is_c_same[1] * j + c_off2; for (int i = 0; i < odims[0]; i++) { bool cval = is_c_same[0] ? cptr[c_off1 + i] : cptr[c_off1]; T aval = is_a_same[0] ? aptr[a_off1 + i] : aptr[a_off1]; optr[o_off1 + i] = (flip ^ cval) ? aval : b; } } } } }
void lookup(Param<InT> out, CParam<InT> input, CParam<IndexT> indices, unsigned const dim) { const af::dim4 iDims = input.dims(); const af::dim4 oDims = out.dims(); const af::dim4 iStrides = input.strides(); const af::dim4 oStrides = out.strides(); const InT *inPtr = input.get(); const IndexT *idxPtr = indices.get(); InT *outPtr = out.get(); for (dim_t l = 0; l < oDims[3]; ++l) { dim_t iLOff = iStrides[3] * (dim == 3 ? trimIndex((dim_t)idxPtr[l], iDims[3]) : l); dim_t oLOff = l * oStrides[3]; for (dim_t k = 0; k < oDims[2]; ++k) { dim_t iKOff = iStrides[2] * (dim == 2 ? trimIndex((dim_t)idxPtr[k], iDims[2]) : k); dim_t oKOff = k * oStrides[2]; for (dim_t j = 0; j < oDims[1]; ++j) { dim_t iJOff = iStrides[1] * (dim == 1 ? trimIndex((dim_t)idxPtr[j], iDims[1]) : j); dim_t oJOff = j * oStrides[1]; for (dim_t i = 0; i < oDims[0]; ++i) { dim_t iIOff = iStrides[0] * (dim == 0 ? trimIndex((dim_t)idxPtr[i], iDims[0]) : i); dim_t oIOff = i * oStrides[0]; outPtr[oLOff + oKOff + oJOff + oIOff] = inPtr[iLOff + iKOff + iJOff + iIOff]; } } } } }
void join(Param<Tx> out, const int dim, CParam<Tx> first, CParam<Ty> second) { Tx* outPtr = out.get(); const Tx* fptr = first.get(); const Ty* sptr = second.get(); af::dim4 zero(0,0,0,0); const af::dim4 odims = out.dims(); const af::dim4 fdims = first.dims(); const af::dim4 sdims = second.dims(); switch(dim) { case 0: join_append<Tx, Tx, 0>(outPtr, fptr, zero, odims, fdims, out.strides(), first.strides()); join_append<Tx, Ty, 0>(outPtr, sptr, calcOffset<0>(fdims), odims, sdims, out.strides(), second.strides()); break; case 1: join_append<Tx, Tx, 1>(outPtr, fptr, zero, odims, fdims, out.strides(), first.strides()); join_append<Tx, Ty, 1>(outPtr, sptr, calcOffset<1>(fdims), odims, sdims, out.strides(), second.strides()); break; case 2: join_append<Tx, Tx, 2>(outPtr, fptr, zero, odims, fdims, out.strides(), first.strides()); join_append<Tx, Ty, 2>(outPtr, sptr, calcOffset<2>(fdims), odims, sdims, out.strides(), second.strides()); break; case 3: join_append<Tx, Tx, 3>(outPtr, fptr, zero, odims, fdims, out.strides(), first.strides()); join_append<Tx, Ty, 3>(outPtr, sptr, calcOffset<3>(fdims), odims, sdims, out.strides(), second.strides()); break; } }
/**
 * Complex-to-real FFT of `in` into `out` through fftw_real_transform.
 *
 * A plan is created for a `rank`-dimensional transform (`rank` is declared
 * outside this block), executed once and destroyed. All dimensions from
 * `rank` upwards are folded into the batch count.
 *
 * @param out        destination (real)
 * @param oDataDims  dims passed to computeDims for the output embedding
 * @param in         source (complex)
 * @param iDataDims  dims passed to computeDims for the input embedding
 * @param odims      transform dims; also supplies the batch count
 */
void fft_c2r(Param<Tr> out, const af::dim4 oDataDims, CParam<Tc> in, const af::dim4 iDataDims, const af::dim4 odims) {
    using transform_t = fftw_real_transform<Tr, Tc>;
    using ctype_t     = typename transform_t::ctype_t;
    using plan_t      = typename transform_t::plan_t;

    int fftDims[rank];
    int inEmbed[rank];
    int outEmbed[rank];

    computeDims<rank>(fftDims, odims);
    computeDims<rank>(inEmbed, iDataDims);
    computeDims<rank>(outEmbed, oDataDims);

    const af::dim4 inStrides  = in.strides();
    const af::dim4 outStrides = out.strides();

    transform_t transform;

    // Everything beyond the transformed dimensions is a batch
    int batch = 1;
    for (int n = rank; n < 4; n++) { batch *= odims[n]; }

    plan_t plan = transform.create(
        rank, fftDims, (int)batch,
        (ctype_t *)in.get(), inEmbed,
        (int)inStrides[0], (int)inStrides[rank],
        (Tr *)out.get(), outEmbed,
        (int)outStrides[0], (int)outStrides[rank],
        FFTW_ESTIMATE);

    transform.execute(plan);
    transform.destroy(plan);
}
/**
 * Copies `src` into `dst` via copy_go.
 *
 * Before delegating, it counts how many leading dimensions are laid out
 * linearly in both arrays, i.e. their stride equals the product of the
 * source extents before them. That count is handed to copy_go together
 * with the starting dimension 3.
 *
 * @param dst  destination
 * @param src  source
 */
static void copy(Param<T> dst, CParam<T> src) {
    af::dim4 srcDims    = src.dims();
    af::dim4 dstDims    = dst.dims();
    af::dim4 srcStrides = src.strides();
    af::dim4 dstStrides = dst.strides();

    T const *from = src.get();
    T *to         = dst.get();

    // Find the major-most dimension that is still linear in both arrays
    int linearEnd    = 0;
    dim_t expected   = 1;
    for (; linearEnd < 4; ++linearEnd) {
        const bool linearInBoth = (expected == srcStrides[linearEnd]) &&
                                  (expected == dstStrides[linearEnd]);
        if (!linearInBoth) { break; }
        expected *= srcDims[linearEnd];
    }

    // Traverse using strides only as far as necessary
    copy_go(to, dstStrides, dstDims, from, srcStrides, srcDims, 3, linearEnd);
}
/**
 * In-place complex FFT of `in` through fftw_transform.
 *
 * A plan is created for a `rank`-dimensional transform with the same
 * buffer as input and output, executed once and destroyed. `rank` and
 * `direction` are declared outside this block; a true `direction` selects
 * FFTW_FORWARD, otherwise FFTW_BACKWARD. All dimensions from `rank`
 * upwards are folded into the batch count.
 *
 * @param in         array transformed in place
 * @param iDataDims  dims passed to computeDims for the embedding
 */
void fft_inplace(Param<T> in, const af::dim4 iDataDims) {
    using transform_t = fftw_transform<T>;
    using ctype_t     = typename transform_t::ctype_t;
    using plan_t      = typename transform_t::plan_t;

    int fftDims[rank];
    int embed[rank];

    const af::dim4 dims = in.dims();

    computeDims<rank>(fftDims, dims);
    computeDims<rank>(embed, iDataDims);

    const af::dim4 strides = in.strides();

    transform_t transform;

    // Everything beyond the transformed dimensions is a batch
    int batch = 1;
    for (int n = rank; n < 4; n++) { batch *= dims[n]; }

    plan_t plan = transform.create(
        rank, fftDims, (int)batch,
        (ctype_t *)in.get(), embed,
        (int)strides[0], (int)strides[rank],
        (ctype_t *)in.get(), embed,
        (int)strides[0], (int)strides[rank],
        direction ? FFTW_FORWARD : FFTW_BACKWARD, FFTW_ESTIMATE);

    transform.execute(plan);
    transform.destroy(plan);
}
void padBorders(Param<T> out, CParam<T> in, const dim4 lBoundPadSize, const dim4 uBoundPadSize, const af::borderType btype) { const dim4& oDims = out.dims(); const dim4& oStrs = out.strides(); const dim4& iDims = in.dims(); const dim4& iStrs = in.strides(); T const* const src = in.get(); T* dst = out.get(); const dim4 validRegEnds( oDims[0] - uBoundPadSize[0], oDims[1] - uBoundPadSize[1], oDims[2] - uBoundPadSize[2], oDims[3] - uBoundPadSize[3]); const bool isInputLinear = iStrs[0] == 1; /* * VALID REGION COPYING DOES * NOT NEED ANY BOUND CHECKS * */ for (dim_t l = lBoundPadSize[3]; l < validRegEnds[3]; ++l) { dim_t oLOff = oStrs[3] * l; dim_t iLOff = iStrs[3] * (l - lBoundPadSize[3]); for (dim_t k = lBoundPadSize[2]; k < validRegEnds[2]; ++k) { dim_t oKOff = oStrs[2] * k; dim_t iKOff = iStrs[2] * (k - lBoundPadSize[2]); for (dim_t j = lBoundPadSize[1]; j < validRegEnds[1]; ++j) { dim_t oJOff = oStrs[1] * j; dim_t iJOff = iStrs[1] * (j - lBoundPadSize[1]); if (isInputLinear) { T const* const sptr = src + iLOff + iKOff + iJOff; T* dptr = dst + oLOff + oKOff + oJOff + lBoundPadSize[0]; std::copy(sptr, sptr + iDims[0], dptr); } else { for (dim_t i = lBoundPadSize[0]; i < validRegEnds[0]; ++i) { dim_t oIOff = oStrs[0] * i; dim_t iIOff = iStrs[0] * (i - lBoundPadSize[0]); dst[oLOff + oKOff + oJOff + oIOff] = src[iLOff + iKOff + iJOff + iIOff]; } } } // second dimension loop } // third dimension loop } // fourth dimension loop // If we have to do zero padding, // just return as the output is filled with // zeros during allocation if (btype == AF_PAD_ZERO) return; /* * PADDED REGIONS NEED BOUND * CHECKS; FOLLOWING NESTED * LOOPS SHALL ONLY PROCESS * PADDED REGIONS AND SKIP REST * */ for (dim_t l = 0; l < oDims[3]; ++l) { bool skipL = (l >= lBoundPadSize[3] && l < validRegEnds[3]); dim_t oLOff = oStrs[3] * l; dim_t iLOff = iStrs[3] * idxByndEdge(l, lBoundPadSize[3], iDims[3], btype); for (dim_t k = 0; k < oDims[2]; ++k) { bool skipK = (k >= lBoundPadSize[2] && k < 
validRegEnds[2]); dim_t oKOff = oStrs[2] * k; dim_t iKOff = iStrs[2] * idxByndEdge(k, lBoundPadSize[2], iDims[2], btype); for (dim_t j = 0; j < oDims[1]; ++j) { bool skipJ = (j >= lBoundPadSize[1] && j < validRegEnds[1]); dim_t oJOff = oStrs[1] * j; dim_t iJOff = iStrs[1] * idxByndEdge(j, lBoundPadSize[1], iDims[1], btype); for (dim_t i = 0; i < oDims[0]; ++i) { bool skipI = (i >= lBoundPadSize[0] && i < validRegEnds[0]); if (skipI && skipJ && skipK && skipL) continue; dim_t oIOff = oStrs[0] * i; dim_t iIOff = iStrs[0] * idxByndEdge(i, lBoundPadSize[0], iDims[0], btype); dst[oLOff + oKOff + oJOff + oIOff] = src[iLOff + iKOff + iJOff + iIOff]; } // first dimension loop } // second dimension loop } // third dimension loop } // fourth dimension loop }
void transform(Param<T> output, CParam<T> input, CParam<float> transform, const bool inverse, const bool perspective, af_interp_type method) { typedef typename af::dtype_traits<T>::base_type BT; typedef wtype_t<BT> WT; const af::dim4 idims = input.dims(); const af::dim4 odims = output.dims(); const af::dim4 tdims = transform.dims(); const af::dim4 tstrides = transform.strides(); const af::dim4 istrides = input.strides(); const af::dim4 ostrides = output.strides(); T *out = output.get(); const float *tf = transform.get(); int batch_size = 1; if (idims[2] != tdims[2]) batch_size = idims[2]; Interp2<T, WT, order> interp; for (int idw = 0; idw < (int)odims[3]; idw++) { dim_t out_offw = idw * ostrides[3]; dim_t in_offw = (idims[3] > 1) * idw * istrides[3]; dim_t tf_offw = (tdims[3] > 1) * idw * tstrides[3]; for (int idz = 0; idz < (int)odims[2]; idz += batch_size) { dim_t out_offzw = out_offw + idz * ostrides[2]; dim_t in_offzw = in_offw + (idims[2] > 1) * idz * istrides[2]; dim_t tf_offzw = tf_offw + (tdims[2] > 1) * idz * tstrides[2]; const float *tptr = tf + tf_offzw; float tmat[9]; calc_transform_inverse(tmat, tptr, inverse, perspective, perspective ? 9 : 6); for (int idy = 0; idy < (int)odims[1]; idy++) { for (int idx = 0; idx < (int)odims[0]; idx++) { WT xidi = idx * tmat[0] + idy * tmat[1] + tmat[2]; WT yidi = idx * tmat[3] + idy * tmat[4] + tmat[5]; if (perspective) { WT W = idx * tmat[6] + idy * tmat[7] + tmat[8]; xidi /= W; yidi /= W; } // FIXME: Nearest and lower do not do clamping, but other // methods do Make it consistent bool clamp = order != 1; bool condX = xidi >= -0.0001 && xidi < idims[0]; bool condY = yidi >= -0.0001 && yidi < idims[1]; int ooff = out_offzw + idy * ostrides[1] + idx; if (condX && condY) { interp(output, ooff, input, in_offzw, xidi, yidi, method, batch_size, clamp); } else { for (int n = 0; n < batch_size; n++) { out[ooff + n * ostrides[2]] = scalar<T>(0); } } } } } } }