// Host-side wrapper for the device interpolation kernel.
//
// Stages 'indices', 'window' and 'in' onto the device, runs the device
// 'interpolate' overload, and syncs 'out' back to the host when the
// Device_memory wrapper for it goes out of scope.
//
// Preconditions: all four blocks share the same dimension ordering, and
// 'in' and 'out' have the same number of columns.
void interpolate(
  const_Matrix<IT, Block1>         indices, // n x m
  Tensor<T, Block2>                window,  // n x m x I
  const_Matrix<complex<T>, Block3> in,      // n x m
  Matrix<complex<T>, Block4>       out,     // nx x m
  length_type depth,
  length_type padded_depth)
{
  // Every block involved must use the same dimension ordering.
  typedef typename Block_layout<Block1>::order_type indices_order;
  typedef typename Block_layout<Block2>::order_type window_order;
  typedef typename Block_layout<Block3>::order_type in_order;
  typedef typename Block_layout<Block4>::order_type out_order;
  assert(indices_order::impl_dim0 == window_order::impl_dim0);
  assert(indices_order::impl_dim0 == in_order::impl_dim0);
  assert(indices_order::impl_dim0 == out_order::impl_dim0);
  assert(indices_order::impl_dim1 == window_order::impl_dim1);
  assert(indices_order::impl_dim1 == in_order::impl_dim1);
  assert(indices_order::impl_dim1 == out_order::impl_dim1);

  // SYNC_IN copies host data to the device now; SYNC_OUT copies the
  // result back when dev_out is destroyed at the end of this scope.
  Device_memory<Block1> dev_indices(indices.block(), impl::SYNC_IN);
  Device_memory<Block2> dev_window(window.block(), impl::SYNC_IN);
  Device_memory<Block3> dev_in(in.block(), impl::SYNC_IN);
  Device_memory<Block4> dev_out(out.block(), impl::SYNC_OUT);

  size_t const input_rows  = in.size(0);
  size_t const output_rows = out.size(0);
  size_t const num_cols    = in.size(1);
  assert(num_cols == out.size(1));

  // Dispatch to the device-pointer overload.
  interpolate(
    dev_indices.data(),
    dev_window.data(),
    reinterpret_cast<cuComplex const*>(dev_in.data()),
    reinterpret_cast<cuComplex*>(dev_out.data()),
    depth,
    padded_depth,
    input_rows,
    output_rows,
    num_cols);
}
// Fast convolution via cuFFT: out = ifft(fft(in) .* K), computed row-wise.
//
// 'in' and 'out' are dense, row-major, rows x columns complex matrices on
// the host.  For D == 1 the kernel is a single row applied to every row of
// the input; for D == 2 it is a full rows x columns matrix.  When
// 'transform_kernel' is true the kernel is given in the time domain and is
// forward-transformed here; otherwise it is already in the frequency domain.
void Fastconv_base<D, T, ComplexFmt>::fconv(
  T const* in,
  T const* kernel,
  T*       out,
  length_type rows,
  length_type columns,
  bool     transform_kernel)
{
  // A 1-D kernel holds a single row; a 2-D kernel is a full dense matrix.
  size_t kernel_size = (D == 1) ? columns : rows * columns;

  // Allocate device memory for the output, kernel and input.
  Device_storage<T> dev_out(rows * columns);
  Device_storage<T> dev_kernel(kernel_size);
  Device_storage<T> dev_in(rows * columns);

  // The kernel is assumed row-major and dense, so it can be copied as one
  // contiguous chunk.
  cudaMemcpy(
    dev_kernel.data(),
    kernel,
    kernel_size * sizeof(T),
    cudaMemcpyHostToDevice);
  ASSERT_CUDA_OK();

  // Transfer the input (row major, dense).
  cudaMemcpy(
    dev_in.data(),
    in,
    rows * columns * sizeof(T),
    cudaMemcpyHostToDevice);
  ASSERT_CUDA_OK();

  // Convert pointers to types the CUFFT library accepts.
  typedef cufftComplex ctype;
  ctype* d_out    = reinterpret_cast<ctype*>(dev_out.data());
  ctype* d_kernel = reinterpret_cast<ctype*>(dev_kernel.data());
  ctype* d_in     = reinterpret_cast<ctype*>(dev_in.data());

  cufftHandle plan;
  cufftResult status;
  if (transform_kernel)
  {
    // Forward-transform the kernel in place.  A matrix kernel (D == 2)
    // must have every row transformed, so batch over all rows; a batch
    // count of 1 would transform only the first row.
    status = cufftPlan1d(&plan, columns, CUFFT_C2C, (D == 1) ? 1 : rows);
    assert(status == CUFFT_SUCCESS);
    status = cufftExecC2C(plan, d_kernel, d_kernel, CUFFT_FORWARD);
    assert(status == CUFFT_SUCCESS);
    cufftDestroy(plan);
  }

  // Create a batched plan (an "FFTM"): one length-'columns' FFT per row,
  // reused for both the forward and the inverse transform below.
  status = cufftPlan1d(&plan, columns, CUFFT_C2C, rows);
  assert(status == CUFFT_SUCCESS);

  // Forward-transform the data in place.
  status = cufftExecC2C(plan, d_in, d_in, CUFFT_FORWARD);
  assert(status == CUFFT_SUCCESS);

  // Multiply with the kernel element-wise, folding in the 1/N scaling
  // cuFFT omits, so the inverse transform below is an exact inverse.
  typedef typename impl::Scalar_of<T>::type scalar_type;
  scalar_type scale = 1 / static_cast<scalar_type>(columns);
  if (D == 1)
    vmmuls_row_cc(d_kernel, d_in, d_out, scale, rows, columns);
  else
    mmmuls_cc(d_kernel, d_in, d_out, scale, rows, columns);

  // Inverse-transform the product back to the time domain.
  status = cufftExecC2C(plan, d_out, d_out, CUFFT_INVERSE);
  assert(status == CUFFT_SUCCESS);
  cufftDestroy(plan);
  (void)status; // silence unused-variable warning when NDEBUG disables assert

  // Move the result back to the host from the output buffer.
  cudaMemcpy(
    out,
    dev_out.data(),
    rows * columns * sizeof(T),
    cudaMemcpyDeviceToHost);
  ASSERT_CUDA_OK();
}