Array<T> resize(const Array<T> &in, const dim_type odim0, const dim_type odim1, const af_interp_type method)
{
    // Double-precision types require explicit device support on OpenCL.
    const bool needsDouble = std::is_same<T, double>::value ||
                             std::is_same<T, cdouble>::value;
    if (needsDouble && !isDoubleSupported(getActiveDeviceId())) {
        OPENCL_NOT_SUPPORTED();
    }

    // Output keeps the input's 3rd/4th extents; only the first two change.
    const af::dim4 inDims = in.dims();
    af::dim4 outDims(odim0, odim1, inDims[2], inDims[3]);

    // Reject degenerate input or output shapes up front.
    if (outDims.elements() == 0 || inDims.elements() == 0) {
        throw std::runtime_error("Elements is 0");
    }

    Array<T> out = createEmptyArray<T>(outDims);

    // Dispatch to the kernel specialized for the requested interpolation;
    // unsupported methods leave `out` unfilled (same as the default case).
    if (method == AF_INTERP_NEAREST) {
        kernel::resize<T, AF_INTERP_NEAREST>(out, in);
    } else if (method == AF_INTERP_BILINEAR) {
        kernel::resize<T, AF_INTERP_BILINEAR>(out, in);
    }

    return out;
}
// Gathers elements of `input` along dimension `dim` at the positions given by
// `indices`, returning a newly allocated array (caller owns the pointer).
// Indices are clamped into range via trimIndex rather than rejected.
Array<in_t>* arrayIndex(const Array<in_t> &input, const Array<idx_t> &indices, const unsigned dim)
{
    const dim4 iDims    = input.dims();
    const dim4 iStrides = input.strides();
    const in_t *inPtr   = input.get();
    const idx_t *idxPtr = indices.get();

    // Output shape matches the input except along `dim`, which takes the
    // number of lookup indices.
    dim4 oDims(1);
    for (dim_type d=0; d<4; ++d)
        oDims[d] = (d==int(dim) ? indices.elements() : iDims[d]);

    Array<in_t>* out = createEmptyArray<in_t>(oDims);
    dim4 oStrides = out->strides();
    in_t *outPtr  = out->get();

    // Walk every output element across all four dimensions. For the indexed
    // dimension the source offset is looked up through idxPtr (clamped by
    // trimIndex to stay inside the input extent); for every other dimension
    // the source and destination positions coincide. Offsets accumulate one
    // stride term per loop level.
    for (dim_type l=0; l<oDims[3]; ++l) {
        dim_type iLOff = iStrides[3]*(dim==3 ? trimIndex((dim_type)idxPtr[l], iDims[3]): l);
        dim_type oLOff = l*oStrides[3];
        for (dim_type k=0; k<oDims[2]; ++k) {
            dim_type iKOff = iStrides[2]*(dim==2 ? trimIndex((dim_type)idxPtr[k], iDims[2]): k);
            dim_type oKOff = k*oStrides[2];
            for (dim_type j=0; j<oDims[1]; ++j) {
                dim_type iJOff = iStrides[1]*(dim==1 ? trimIndex((dim_type)idxPtr[j], iDims[1]): j);
                dim_type oJOff = j*oStrides[1];
                for (dim_type i=0; i<oDims[0]; ++i) {
                    dim_type iIOff = iStrides[0]*(dim==0 ? trimIndex((dim_type)idxPtr[i], iDims[0]): i);
                    dim_type oIOff = i*oStrides[0];
                    // Copy one element from the (possibly clamped) source
                    // position to the corresponding output position.
                    outPtr[oLOff+oKOff+oJOff+oIOff] = inPtr[iLOff+iKOff+iJOff+iIOff];
                }
            }
        }
    }

    return out;
}
Array<T> *reorder(const Array<T> &in, const af::dim4 &rdims)
{
    // Double-precision types require explicit device support on OpenCL.
    if ((std::is_same<T, double>::value || std::is_same<T, cdouble>::value) &&
        !isDoubleSupported(getActiveDeviceId())) {
        OPENCL_NOT_SUPPORTED();
    }

    // Permute the input extents by rdims to obtain the output shape:
    // output dimension d gets the input extent at position rdims[d].
    const af::dim4 inDims = in.dims();
    af::dim4 outDims(0);
    for (int d = 0; d < 4; d++) {
        outDims[d] = inDims[rdims[d]];
    }

    // Allocate the permuted array and let the kernel do the data movement.
    Array<T> *out = createEmptyArray<T>(outDims);
    kernel::reorder<T>(*out, in, rdims.get());
    return out;
}
Array<T> resize(const Array<T> &in, const dim_t odim0, const dim_t odim1, const af_interp_type method)
{
    // Only the first two dimensions are resized; batch dims carry over.
    const af::dim4 inDims = in.dims();
    af::dim4 outDims(odim0, odim1, inDims[2], inDims[3]);

    Array<T> out = createEmptyArray<T>(outDims);

    // Dispatch to the kernel specialized for the interpolation method;
    // any other method leaves `out` unfilled (mirrors the default case).
    if (method == AF_INTERP_NEAREST) {
        kernel::resize<T, AF_INTERP_NEAREST>(out, in);
    } else if (method == AF_INTERP_BILINEAR) {
        kernel::resize<T, AF_INTERP_BILINEAR>(out, in);
    } else if (method == AF_INTERP_LOWER) {
        kernel::resize<T, AF_INTERP_LOWER>(out, in);
    }

    return out;
}
Array<in_t> lookup(const Array<in_t> &input, const Array<idx_t> &indices, const unsigned dim)
{
    // Output shape equals the input shape except along `dim`, where the
    // extent becomes the number of lookup indices.
    const dim4 inDims = input.dims();
    dim4 outDims(1);
    for (int d = 0; d < 4; ++d) {
        outDims[d] = (d == int(dim)) ? indices.elements() : inDims[d];
    }

    Array<in_t> out = createEmptyArray<in_t>(outDims);
    dim_t nDims = inDims.ndims();

    // The lookup dimension is a compile-time template argument of the
    // kernel, so a switch over the runtime value is required here.
    switch (dim) {
        case 0: kernel::lookup<in_t, idx_t, 0>(out, input, indices, nDims); break;
        case 1: kernel::lookup<in_t, idx_t, 1>(out, input, indices, nDims); break;
        case 2: kernel::lookup<in_t, idx_t, 2>(out, input, indices, nDims); break;
        case 3: kernel::lookup<in_t, idx_t, 3>(out, input, indices, nDims); break;
    }

    return out;
}
// FFT-based convolution of `signal` with `filter` over the first `baseDim`
// dimensions using FFTW (double- or single-precision plans selected by
// `isDouble`). `expand` controls whether the output takes the full
// convolution size (sd+fd-1) or the signal size; `kind` selects the
// signal/filter batching mode. `baseDim`, `convT`, `isDouble` and `roundOut`
// are presumably template parameters declared outside this chunk — confirm.
Array<T> fftconvolve(Array<T> const& signal, Array<T> const& filter, const bool expand, ConvolveBatchKind kind)
{
    const af::dim4 sd = signal.dims();
    const af::dim4 fd = filter.dims();
    dim_t fftScale = 1;

    af::dim4 packed_dims;
    int fft_dims[baseDim];
    af::dim4 sig_tmp_dims, sig_tmp_strides;
    af::dim4 filter_tmp_dims, filter_tmp_strides;

    // Pack both signal and filter into the same memory array so a single
    // batched FFT plan can transform both at once.
    // NOTE(review): the original comment mentioned cuFFT, but this path uses
    // FFTW — likely copied from the CUDA backend; confirm.
    for (dim_t k = 0; k < 4; k++) {
        // Transformed dims are padded to the next power of 2 of the full
        // convolution size; the first batch dim holds signal+filter batches.
        if (k < baseDim)
            packed_dims[k] = nextpow2((unsigned)(sd[k] + fd[k] - 1));
        else if (k == baseDim)
            packed_dims[k] = sd[k] + fd[k];
        else
            packed_dims[k] = 1;

        // FFTW wants dims in row-major (reversed) order; dim 0 is halved
        // because two real values are packed per complex element there.
        if (k < baseDim) {
            fft_dims[baseDim-k-1] = (k == 0) ? packed_dims[k] / 2 : packed_dims[k];
            fftScale *= fft_dims[baseDim-k-1];
        }
    }

    Array<convT> packed = createEmptyArray<convT>(packed_dims);
    convT *packed_ptr = packed.get();
    const af::dim4 packed_strides = packed.strides();

    // Signal and filter sub-arrays share the packed buffer's transformed
    // extents but keep their own batch extents; strides are rebuilt densely.
    sig_tmp_dims[0] = filter_tmp_dims[0] = packed_dims[0];
    sig_tmp_strides[0] = filter_tmp_strides[0] = 1;

    for (dim_t k = 1; k < 4; k++) {
        if (k < baseDim) {
            sig_tmp_dims[k] = packed_dims[k];
            filter_tmp_dims[k] = packed_dims[k];
        } else {
            sig_tmp_dims[k] = sd[k];
            filter_tmp_dims[k] = fd[k];
        }

        sig_tmp_strides[k] = sig_tmp_strides[k - 1] * sig_tmp_dims[k - 1];
        filter_tmp_strides[k] = filter_tmp_strides[k - 1] * filter_tmp_dims[k - 1];
    }

    // Calculate memory offsets for packed signal and filter: the signal
    // occupies the front of the buffer, the filter starts right after it.
    convT *sig_tmp_ptr = packed_ptr;
    convT *filter_tmp_ptr = packed_ptr + sig_tmp_strides[3] * sig_tmp_dims[3];

    // Number of packed complex elements in dimension 0.
    dim_t sig_half_d0 = divup(sd[0], 2);

    // Pack signal into a complex matrix whose first dimension is half the
    // input (allows faster FFT computation), zero-padded to a power of 2.
    packData<convT, T>(sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides, signal);

    // Pad filter array with 0s.
    padArray<convT, T>(filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides, filter);

    // Compute forward FFT in place over every batch slice (signal + filter
    // batches together, hence packed_dims[baseDim] transforms per plan).
    if (isDouble) {
        fftw_plan plan = fftw_plan_many_dft(baseDim, fft_dims,
                                            packed_dims[baseDim],
                                            (fftw_complex*)packed.get(), NULL,
                                            packed_strides[0],
                                            packed_strides[baseDim] / 2,
                                            (fftw_complex*)packed.get(), NULL,
                                            packed_strides[0],
                                            packed_strides[baseDim] / 2,
                                            FFTW_FORWARD, FFTW_ESTIMATE);
        fftw_execute(plan);
        fftw_destroy_plan(plan);
    } else {
        fftwf_plan plan = fftwf_plan_many_dft(baseDim, fft_dims,
                                              packed_dims[baseDim],
                                              (fftwf_complex*)packed.get(), NULL,
                                              packed_strides[0],
                                              packed_strides[baseDim] / 2,
                                              (fftwf_complex*)packed.get(), NULL,
                                              packed_strides[0],
                                              packed_strides[baseDim] / 2,
                                              FFTW_FORWARD, FFTW_ESTIMATE);
        fftwf_execute(plan);
        fftwf_destroy_plan(plan);
    }

    // Multiply filter and signal FFT arrays element-wise, in place. For
    // ONE2MANY the product is written over the filter slot (the filter side
    // carries the batches); otherwise over the signal slot.
    if (kind == ONE2MANY)
        complexMultiply<convT>(filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides,
                               sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides,
                               filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides,
                               kind);
    else
        complexMultiply<convT>(sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides,
                               sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides,
                               filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides,
                               kind);

    // Compute inverse FFT (unnormalized — reorderOutput divides by fftScale).
    if (isDouble) {
        fftw_plan plan = fftw_plan_many_dft(baseDim, fft_dims,
                                            packed_dims[baseDim],
                                            (fftw_complex*)packed.get(), NULL,
                                            packed_strides[0],
                                            packed_strides[baseDim] / 2,
                                            (fftw_complex*)packed.get(), NULL,
                                            packed_strides[0],
                                            packed_strides[baseDim] / 2,
                                            FFTW_BACKWARD, FFTW_ESTIMATE);
        fftw_execute(plan);
        fftw_destroy_plan(plan);
    } else {
        fftwf_plan plan = fftwf_plan_many_dft(baseDim, fft_dims,
                                              packed_dims[baseDim],
                                              (fftwf_complex*)packed.get(), NULL,
                                              packed_strides[0],
                                              packed_strides[baseDim] / 2,
                                              (fftwf_complex*)packed.get(), NULL,
                                              packed_strides[0],
                                              packed_strides[baseDim] / 2,
                                              FFTW_BACKWARD, FFTW_ESTIMATE);
        fftwf_execute(plan);
        fftwf_destroy_plan(plan);
    }

    // Compute output dimensions: full convolution size when expanding,
    // otherwise the signal size (with filter batch dims for ONE2MANY).
    dim4 oDims(1);
    if (expand) {
        for(dim_t d=0; d<4; ++d) {
            if (kind==ONE2ONE || kind==ONE2MANY) {
                oDims[d] = sd[d]+fd[d]-1;
            } else {
                oDims[d] = (d<baseDim ? sd[d]+fd[d]-1 : sd[d]);
            }
        }
    } else {
        oDims = sd;
        if (kind==ONE2MANY) {
            for (dim_t i=baseDim; i<4; ++i)
                oDims[i] = fd[i];
        }
    }

    Array<T> out = createEmptyArray<T>(oDims);
    T* out_ptr = out.get();
    const af::dim4 out_dims = out.dims();
    const af::dim4 out_strides = out.strides();

    const af::dim4 filter_dims = filter.dims();

    // Unpack/scale the product back into the real output array, reading from
    // whichever packed slot holds the result (see complexMultiply above).
    if (kind == ONE2MANY) {
        reorderOutput<T, convT, roundOut>
            (out_ptr, out_dims, out_strides,
             filter_tmp_ptr, filter_tmp_dims, filter_tmp_strides,
             filter_dims, sig_half_d0, baseDim, fftScale, expand);
    } else {
        reorderOutput<T, convT, roundOut>
            (out_ptr, out_dims, out_strides,
             sig_tmp_ptr, sig_tmp_dims, sig_tmp_strides,
             filter_dims, sig_half_d0, baseDim, fftScale, expand);
    }

    return out;
}