/// Convolves `_image` with `_kernel` by multiplying their Fourier transforms
/// and stores the rescaled inverse transform back into `_image`.
///
/// Both arrays must have identical extents in all three dimensions; no padding
/// is applied here, so the result is a circular convolution over that shape.
/// The Fourier-space buffers are allocated separately with fftwf_malloc
/// ("out of place") and released before returning.
///
/// @param _image   input volume; overwritten with the convolution result.
/// @param _kernel  kernel volume of the same shape. Taken by non-const
///                 reference because the FFTW planner API expects a mutable
///                 `float*`.
///
/// On a shape mismatch or an allocation failure a message is written to
/// std::cerr and `_image` is left untouched.
void convolute_3d_out_of_place(MatrixT& _image, MatrixT& _kernel) {

  // Compare every extent: checking only size() is not enough, a kernel that
  // agrees on one axis but is smaller on another would be read out of bounds
  // by the forward transform below.
  if (_image.shape()[0] != _kernel.shape()[0] ||
      _image.shape()[1] != _kernel.shape()[1] ||
      _image.shape()[2] != _kernel.shape()[2]) {
    std::cerr << "received image and kernel of mismatching size!\n";
    return;
  }

  // Element counts are kept in std::size_t so that large volumes do not wrap
  // around the range of `unsigned`.
  const std::size_t M = _image.shape()[0];
  const std::size_t N = _image.shape()[1];
  const std::size_t K = _image.shape()[2];

  // A real-to-complex transform only stores K/2+1 complex values along the
  // last dimension.
  const std::size_t fft_size = M * N * (K / 2 + 1);

  // setup fourier space arrays
  fftwf_complex* image_fourier = static_cast<fftwf_complex*>(
      fftwf_malloc(sizeof(fftwf_complex) * fft_size));
  fftwf_complex* kernel_fourier = static_cast<fftwf_complex*>(
      fftwf_malloc(sizeof(fftwf_complex) * fft_size));
  if (!image_fourier || !kernel_fourier) {
    std::cerr << "failed to allocate fourier space buffers!\n";
    if (image_fourier) fftwf_free(image_fourier);
    if (kernel_fourier) fftwf_free(kernel_fourier);
    return;
  }

  // FFTW transforms are unnormalized: forward followed by backward scales the
  // data by the number of real elements, which is undone with this factor.
  const float scale = 1.0 / (M * N * K);

  const int n0 = static_cast<int>(M);
  const int n1 = static_cast<int>(N);
  const int n2 = static_cast<int>(K);

  // define+run forward plans
  fftwf_plan image_fwd_plan = fftwf_plan_dft_r2c_3d(
      n0, n1, n2, _image.data(), image_fourier, FFTW_ESTIMATE);
  fftwf_execute(image_fwd_plan);

  fftwf_plan kernel_fwd_plan = fftwf_plan_dft_r2c_3d(
      n0, n1, n2, _kernel.data(), kernel_fourier, FFTW_ESTIMATE);
  fftwf_execute(kernel_fwd_plan);

  // multiply: element-wise complex product, accumulated into image_fourier
  for (std::size_t index = 0; index < fft_size; ++index) {
    const float real = image_fourier[index][0] * kernel_fourier[index][0] -
                       image_fourier[index][1] * kernel_fourier[index][1];
    const float imag = image_fourier[index][0] * kernel_fourier[index][1] +
                       image_fourier[index][1] * kernel_fourier[index][0];
    image_fourier[index][0] = real;
    image_fourier[index][1] = imag;
  }

  fftwf_destroy_plan(kernel_fwd_plan);
  fftwf_destroy_plan(image_fwd_plan);

  // transform the product back into the image buffer
  fftwf_plan image_rev_plan = fftwf_plan_dft_c2r_3d(
      n0, n1, n2, image_fourier, _image.data(), FFTW_ESTIMATE);
  fftwf_execute(image_rev_plan);

  // undo the FFTW normalization
  const std::size_t image_num_elements = _image.num_elements();
  float* image_data = _image.data();
  for (std::size_t index = 0; index < image_num_elements; ++index) {
    image_data[index] *= scale;
  }

  fftwf_destroy_plan(image_rev_plan);
  fftwf_free(image_fourier);
  fftwf_free(kernel_fourier);
}
/// Prints every voxel at which `_other` deviates from `_reference`.
///
/// A voxel is reported when the absolute difference exceeds 0.1% of the
/// reference magnitude and at least one of the two values has a magnitude
/// above 1e-4 (so pairs that are both numerically zero stay silent).
///
/// @param _reference  array that defines the iteration extents and the
///                    expected values.
/// @param _other      array to compare; it is indexed with the extents of
///                    `_reference`, so it must be at least that large in
///                    every dimension.
void print_mismatching_items(MatrixT& _reference, MatrixT& _other) {
  const long extent_x = static_cast<long>(_reference.shape()[0]);
  const long extent_y = static_cast<long>(_reference.shape()[1]);
  const long extent_z = static_cast<long>(_reference.shape()[2]);

  for (long x = 0; x < extent_x; ++x)
    for (long y = 0; y < extent_y; ++y)
      for (long z = 0; z < extent_z; ++z) {
        const float reference = _reference[x][y][z];
        const float to_compare = _other[x][y][z];

        // The tolerance must scale with the magnitude of the reference: with
        // a signed reference the threshold would turn negative and every
        // negative voxel would be flagged, even when both values are equal.
        const bool beyond_tolerance = std::fabs(reference - to_compare) >
                                      (1e-3 * std::fabs(reference));
        const bool non_negligible =
            std::fabs(reference) > 1e-4 || std::fabs(to_compare) > 1e-4;

        if (beyond_tolerance && non_negligible) {
          std::cout << "[" << x << "][" << y << "][" << z
                    << "] mismatch, ref: " << reference
                    << " != to_compare: " << to_compare << "\n";
        }
      }
}
/// Convolves `_image` with `_kernel` using FFTW real<->complex transforms that
/// run in place on zero-padded working copies, then copies the region that
/// corresponds to the original image back into `_image`.
///
/// Steps: (1) compute the padded extents, (2) embed the image into a padded
/// volume, (3) embed the kernel into a padded volume with its centre wrapped
/// around to index 0 of every axis, (4) enlarge both volumes so that FFTW can
/// store the complex spectrum in the same memory, (5) forward transform,
/// multiply, inverse transform and rescale, (6) cut the result out again.
///
/// @param _image    3D input volume; overwritten with the result (same shape).
/// @param _kernel   3D convolution kernel.
/// @param _verbose  when true, the inputs, the padded volumes and the results
///                  are streamed to std::cout.
///
/// On rejected input a message is written to std::cerr and `_image` is left
/// unchanged.
void convolute_3d_in_place(MatrixT& _image, const MatrixT& _kernel,
                           const bool& _verbose = false) {

  // NOTE(review): if MatrixT is a boost::multi_array, size() is only the
  // extent of the first dimension, so this rejects any kernel whose first
  // extent equals the image's -- confirm that this is the intended condition.
  if (_image.size() == _kernel.size()) {
    std::cerr << "received image and kernel of matching size, this makes "
                 "preparing the kernel impossible!\nExiting.\n";
    return;
  }

  if (MatrixT::dimensionality != 3) {
    std::cerr << "received image and kernel of dimension "
              << MatrixT::dimensionality
              << " that cannot be processed by convolute_3d_in_place!\n";
    return;
  }

  // Remember the shapes of both inputs.
  std::vector<unsigned> origin_image_extents(MatrixT::dimensionality);
  std::copy(_image.shape(), _image.shape() + MatrixT::dimensionality,
            origin_image_extents.begin());

  std::vector<unsigned> origin_kernel_extents(MatrixT::dimensionality);
  std::copy(_kernel.shape(), _kernel.shape() + MatrixT::dimensionality,
            origin_kernel_extents.begin());

  if (_verbose) {
    std::cout << "[convolute_3d_in_place]\timage:\n" << _image << "\n";
    std::cout << "[convolute_3d_in_place]\tkernel:\n" << _kernel << "\n";
  }
  ///////////////////////////////////////////////////////////////////////////
  // CALCULATE PADDING EXTENT
  // Presumably image + kernel - 1 per dimension (add_minus_1 is defined
  // elsewhere -- verify).
  std::vector<unsigned> common_extents(MatrixT::dimensionality);
  std::transform(origin_image_extents.begin(), origin_image_extents.end(),
                 origin_kernel_extents.begin(), common_extents.begin(),
                 add_minus_1<unsigned>());

  // Offset of the image inside the padded volume; presumably (kernel - 1) / 2
  // per dimension (minus_1_div_2 is defined elsewhere -- verify).
  std::vector<unsigned> common_offsets(MatrixT::dimensionality);
  std::transform(origin_kernel_extents.begin(), origin_kernel_extents.end(),
                 common_offsets.begin(), minus_1_div_2<unsigned>());

  ///////////////////////////////////////////////////////////////////////////
  // PADD IMAGE
  image_stack padded_image(common_extents, _image.storage_order());
  image_stack_view subview_padded_image = padded_image
      [boost::indices[range(common_offsets[0],
                            common_offsets[0] + origin_image_extents[0])]
                     [range(common_offsets[1],
                            common_offsets[1] + origin_image_extents[1])]
                     [range(common_offsets[2],
                            common_offsets[2] + origin_image_extents[2])]];
  subview_padded_image = _image;

  // Number of real samples that take part in the transform. It has to be
  // captured before the volume is enlarged for the in-place layout below.
  unsigned long size_of_transform = padded_image.num_elements();

  ///////////////////////////////////////////////////////////////////////////
  // PADD KERNEL
  // Every kernel element is shifted by half the kernel extent; indices that
  // become negative are wrapped around to the far end of the padded volume.
  image_stack padded_kernel(common_extents, _kernel.storage_order());

  const long kernel_extent_x = static_cast<long>(origin_kernel_extents[0]);
  const long kernel_extent_y = static_cast<long>(origin_kernel_extents[1]);
  const long kernel_extent_z = static_cast<long>(origin_kernel_extents[2]);

  const long kernel_half_x = static_cast<long>(origin_kernel_extents[0] / 2);
  const long kernel_half_y = static_cast<long>(origin_kernel_extents[1] / 2);
  const long kernel_half_z = static_cast<long>(origin_kernel_extents[2] / 2);

  const long padded_extent_x = static_cast<long>(common_extents[0]);
  const long padded_extent_y = static_cast<long>(common_extents[1]);
  const long padded_extent_z = static_cast<long>(common_extents[2]);

  for (long z = 0; z < kernel_extent_z; ++z)
    for (long y = 0; y < kernel_extent_y; ++y)
      for (long x = 0; x < kernel_extent_x; ++x) {
        long intermediate_x = x - kernel_half_x;
        long intermediate_y = y - kernel_half_y;
        long intermediate_z = z - kernel_half_z;

        if (intermediate_x < 0) intermediate_x += padded_extent_x;
        if (intermediate_y < 0) intermediate_y += padded_extent_y;
        if (intermediate_z < 0) intermediate_z += padded_extent_z;

        padded_kernel[intermediate_x][intermediate_y][intermediate_z] =
            _kernel[x][y][z];
      }

  ///////////////////////////////////////////////////////////////////////////
  // RESIZE ALL TO ALLOW FFTW INPLACE TRANSFORM
  // An in-place real-to-complex transform needs room for the complex output
  // in the same buffer. adapt_extents_for_fftw_inplace is defined elsewhere;
  // presumably it enlarges the fastest-varying extent accordingly -- verify.
  // This step assumes resize() keeps existing elements at their indices.
  std::vector<unsigned> inplace_extents(3);
  adapt_extents_for_fftw_inplace(common_extents, inplace_extents,
                                 _image.storage_order());
  padded_image.resize(boost::extents[inplace_extents[0]][inplace_extents[1]]
                                    [inplace_extents[2]]);
  padded_kernel.resize(boost::extents[inplace_extents[0]][inplace_extents[1]]
                                     [inplace_extents[2]]);
  if (_verbose) {
    std::cout << "[convolute_3d_in_place]\t padded image:\n" << padded_image
              << "\n";
    std::cout << "[convolute_3d_in_place]\t padded kernel:\n" << padded_kernel
              << "\n";
  }

  // FFTW transforms are unnormalized; this factor undoes the scaling
  // introduced by the forward/backward round trip.
  float scale = 1.0 / (size_of_transform);

  // The complex spectra alias the memory of the padded real volumes.
  fftwf_complex* complex_image_fourier =
      reinterpret_cast<fftwf_complex*>(padded_image.data());
  fftwf_complex* complex_kernel_fourier =
      reinterpret_cast<fftwf_complex*>(padded_kernel.data());

  // define+run forward plans
  // The logical (unenlarged) extents are passed to the planner.
  fftwf_plan image_fwd_plan = fftwf_plan_dft_r2c_3d(
      common_extents[0], common_extents[1], common_extents[2],
      padded_image.data(), complex_image_fourier, FFTW_ESTIMATE);
  fftwf_execute(image_fwd_plan);

  fftwf_plan kernel_fwd_plan = fftwf_plan_dft_r2c_3d(
      common_extents[0], common_extents[1], common_extents[2],
      padded_kernel.data(), complex_kernel_fourier, FFTW_ESTIMATE);
  fftwf_execute(kernel_fwd_plan);

  fftwf_destroy_plan(kernel_fwd_plan);
  fftwf_destroy_plan(image_fwd_plan);

  // multiply
  // Two floats of the enlarged buffer form one complex value. Counts are
  // held in std::size_t so that volumes beyond the range of `unsigned`
  // neither truncate the count nor keep the loops from terminating.
  const std::size_t padded_num_elements = padded_image.num_elements();
  const std::size_t fourier_num_elements = padded_num_elements / 2;
  for (std::size_t index = 0; index < fourier_num_elements; ++index) {
    float real =
        complex_image_fourier[index][0] * complex_kernel_fourier[index][0] -
        complex_image_fourier[index][1] * complex_kernel_fourier[index][1];
    float imag =
        complex_image_fourier[index][0] * complex_kernel_fourier[index][1] +
        complex_image_fourier[index][1] * complex_kernel_fourier[index][0];
    complex_image_fourier[index][0] = real;
    complex_image_fourier[index][1] = imag;
  }

  fftwf_plan image_rev_plan = fftwf_plan_dft_c2r_3d(
      common_extents[0], common_extents[1], common_extents[2],
      complex_image_fourier, padded_image.data(), FFTW_ESTIMATE);
  fftwf_execute(image_rev_plan);

  // undo the FFTW normalization
  float* padded_image_data = padded_image.data();
  for (std::size_t index = 0; index < padded_num_elements; ++index) {
    padded_image_data[index] *= scale;
  }

  fftwf_destroy_plan(image_rev_plan);

  // Cut the region occupied by the original image out of the padded result.
  _image = padded_image
      [boost::indices[range(common_offsets[0],
                            common_offsets[0] + origin_image_extents[0])]
                     [range(common_offsets[1],
                            common_offsets[1] + origin_image_extents[1])]
                     [range(common_offsets[2],
                            common_offsets[2] + origin_image_extents[2])]];
  if (_verbose) {
    std::cout << "[convolute_3d_in_place]\t padded result:\n" << padded_image
              << "\n";
    std::cout << "[convolute_3d_in_place]\t result:\n" << _image << "\n";
  }
}