Example #1
template <typename IT,
          typename T,
          typename Block1,
          typename Block2,
          typename Block3,
          typename Block4>
void
interpolate(
  const_Matrix<IT, Block1>         indices,  // n x m
  Tensor<T, Block2>                window,   // n x m x I
  const_Matrix<complex<T>, Block3> in,       // n x m
  Matrix<complex<T>, Block4>       out,      // nx x m
  length_type                      depth,
  length_type                      padded_depth)
{
  // All blocks must have the same dimension ordering
  typedef typename Block_layout<Block1>::order_type order1_type;
  typedef typename Block_layout<Block2>::order_type order2_type;
  typedef typename Block_layout<Block3>::order_type order3_type;
  typedef typename Block_layout<Block4>::order_type order4_type;
  assert(order1_type::impl_dim0 == order2_type::impl_dim0);
  assert(order1_type::impl_dim0 == order3_type::impl_dim0);
  assert(order1_type::impl_dim0 == order4_type::impl_dim0);
  assert(order1_type::impl_dim1 == order2_type::impl_dim1);
  assert(order1_type::impl_dim1 == order3_type::impl_dim1);
  assert(order1_type::impl_dim1 == order4_type::impl_dim1);

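  // Wrap each block in RAII device storage: SYNC_IN copies the host
  // block to the device up front; SYNC_OUT copies the device buffer
  // back to the host when the wrapper is destroyed.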
  Device_memory<Block1> dev_indices(indices.block(), impl::SYNC_IN);
  Device_memory<Block2> dev_window(window.block(), impl::SYNC_IN);
  Device_memory<Block3> dev_in(in.block(), impl::SYNC_IN);
  Device_memory<Block4> dev_out(out.block(), impl::SYNC_OUT);

  size_t rows_in = in.size(0);
  size_t rows_out = out.size(0);
  size_t cols = in.size(1);
  assert(cols == out.size(1));

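  // Hand the raw device pointers to the low-level CUDA entry point;
  // overload resolution selects the non-view form of interpolate().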
  interpolate(
    dev_indices.data(),
    dev_window.data(),
    reinterpret_cast<cuComplex const*>(dev_in.data()),
    reinterpret_cast<cuComplex*>(dev_out.data()),
    depth,
    padded_depth,
    rows_in,
    rows_out,
    cols);
}
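
For context, a call site for this wrapper might look like the sketch
below.  The sizes (n, m, nx, I), the int/float value types, and the
depth arguments are illustrative assumptions, not taken from the
original code.

  // Hypothetical caller (sketch); all names here are placeholders.
  Matrix<int>             indices(n, m);
  Tensor<float>           window(n, m, I);
  Matrix<complex<float> > in(n, m);
  Matrix<complex<float> > out(nx, m);
  // ... fill indices, window, and in ...
  interpolate(indices, window, in, out, depth, padded_depth);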

Example #2

template <dimension_type D, typename T, typename ComplexFmt>
void
Fastconv_base<D, T, ComplexFmt>::fconv(
  T const*    in,
  T const*    kernel,
  T*          out,
  length_type rows,
  length_type columns,
  bool        transform_kernel)
{
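  // For 1-D fast convolution the kernel is a single row of 'columns'
  // coefficients; for 2-D it is a full rows x columns matrix.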
  size_t kernel_size = (D == 1) ? columns : rows * columns;

  // allocate device memory and copy input and kernel over from host
  Device_storage<T> dev_out(rows * columns);
  Device_storage<T> dev_kernel(kernel_size);
  Device_storage<T> dev_in(rows * columns);

  // If the kernel is a matrix, it is assumed to be row-major and dense.
  // As a result, it can be copied as one contiguous chunk.
  cudaMemcpy(
    dev_kernel.data(),
    kernel,
    kernel_size * sizeof(T), 
    cudaMemcpyHostToDevice);
  ASSERT_CUDA_OK();

  // Transfer the input (row major, dense)
  cudaMemcpy(
    dev_in.data(),
    in,
    rows * columns * sizeof(T), 
    cudaMemcpyHostToDevice);
  ASSERT_CUDA_OK();

  // convert pointers to the complex type the CUFFT library accepts
  typedef cufftComplex ctype;
  ctype* d_out = reinterpret_cast<ctype*>(dev_out.data());
  ctype* d_kernel = reinterpret_cast<ctype*>(dev_kernel.data());
  ctype* d_in = reinterpret_cast<ctype*>(dev_in.data());

  cufftHandle plan;
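  // When transform_kernel is set, the caller supplied a time-domain
  // kernel that must be transformed in place first; otherwise the
  // kernel is assumed to be in the frequency domain already.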
  if (transform_kernel)
  {
    // Create a 1D FFT plan and transform the kernel
    cufftPlan1d(&plan, columns, CUFFT_C2C, 1);
    cufftExecC2C(plan, d_kernel, d_kernel, CUFFT_FORWARD);
    cufftDestroy(plan);
  }

  // Create an FFTM plan: a batch of 'rows' 1-D FFTs, each of length 'columns'
  cufftPlan1d(&plan, columns, CUFFT_C2C, rows);

  // transform the data
  cufftExecC2C(plan, d_in, d_in, CUFFT_FORWARD);

  // Convolve with the kernel.  CUFFT's inverse transform is unnormalized,
  // so the 1/N scale factor the inverse FFT would otherwise need is folded
  // into the element-wise multiply.
  typedef typename impl::Scalar_of<T>::type scalar_type;
  scalar_type scale = 1 / static_cast<scalar_type>(columns);
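
  // vmmuls_row_cc multiplies each row of d_in element-wise by the kernel
  // row vector; mmmuls_cc does a full element-wise matrix multiply; both
  // fold in 'scale'.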
  if (D == 1)
    vmmuls_row_cc(d_kernel, d_in, d_out, scale, rows, columns);
  else
    mmmuls_cc(d_kernel, d_in, d_out, scale, rows, columns);

  // inverse transform the signal
  cufftExecC2C(plan, d_out, d_out, CUFFT_INVERSE);
  cufftDestroy(plan);

  // Move data back to the host from the output buffer
  cudaMemcpy(
    out,
    dev_out.data(),
    rows * columns * sizeof(T), 
    cudaMemcpyDeviceToHost);
  ASSERT_CUDA_OK();
}
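
Note that the CUFFT calls above are not checked for errors.  A hardened
version might wrap each call along the lines of the sketch below;
cufftResult and CUFFT_SUCCESS come from cufft.h, while CHECK_CUFFT is a
hypothetical helper, not part of the original code.

  #include <cufft.h>
  #include <cstdio>
  #include <cstdlib>

  #define CHECK_CUFFT(call)                                 \
    do                                                      \
    {                                                       \
      cufftResult r = (call);                               \
      if (r != CUFFT_SUCCESS)                               \
      {                                                     \
        std::fprintf(stderr, "CUFFT error %d at %s:%d\n",   \
                     int(r), __FILE__, __LINE__);           \
        std::abort();                                       \
      }                                                     \
    } while (0)

  // e.g.: CHECK_CUFFT(cufftPlan1d(&plan, columns, CUFFT_C2C, rows));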