int main(int argc, char **argv)
{
  int np[2];
  ptrdiff_t n[3];
  ptrdiff_t alloc_local;
  ptrdiff_t local_ni[3], local_i_start[3];
  ptrdiff_t local_no[3], local_o_start[3];
  double err;
  double *planned_in, *executed_in;
  pfft_complex *planned_out, *executed_out;
  pfft_plan plan_forw=NULL, plan_back=NULL;
  MPI_Comm comm_cart_2d;
  
  /* Set size of FFT and process mesh */
  n[0] = 29; n[1] = 27; n[2] = 31;
  np[0] = 2; np[1] = 2;
  
  /* Initialize MPI and PFFT */
  MPI_Init(&argc, &argv);
  pfft_init();

  /* Create two-dimensional process grid of size np[0] x np[1], if possible */
  if( pfft_create_procmesh_2d(MPI_COMM_WORLD, np[0], np[1], &comm_cart_2d) ){
    pfft_fprintf(MPI_COMM_WORLD, stderr, "Error: This test file only works with %d processes.\n", np[0]*np[1]);
    MPI_Finalize();
    return 1;
  }
  
  /* Get parameters of data distribution */
  alloc_local = pfft_local_size_dft_r2c_3d(n, comm_cart_2d, PFFT_TRANSPOSED_OUT| PFFT_PADDED_R2C,
      local_ni, local_i_start, local_no, local_o_start);

  /* Allocate memory for planning */
  planned_in  = pfft_alloc_real (2 * alloc_local);
  planned_out = pfft_alloc_complex(alloc_local);

  /* Plan parallel forward FFT */
  plan_forw = pfft_plan_dft_r2c_3d(
      n, planned_in, planned_out, comm_cart_2d, PFFT_FORWARD, PFFT_TRANSPOSED_OUT| PFFT_MEASURE| PFFT_DESTROY_INPUT| PFFT_PADDED_R2C);
  
  /* Plan parallel backward FFT */
  plan_back = pfft_plan_dft_c2r_3d(
      n, planned_out, planned_in, comm_cart_2d, PFFT_BACKWARD, PFFT_TRANSPOSED_IN| PFFT_MEASURE| PFFT_DESTROY_INPUT| PFFT_PADDED_C2R);

  /* Free planning arrays since we use other arrays for execution */
  pfft_free(planned_in); pfft_free(planned_out);

  /* Allocate memory for execution */
  executed_in  = pfft_alloc_real(2 * alloc_local);
  executed_out = pfft_alloc_complex(alloc_local);

  /* Initialize input with random numbers */
  pfft_init_input_real(3, n, local_ni, local_i_start,
      executed_in);
  
  /* execute parallel forward FFT */
  pfft_execute_dft_r2c(plan_forw, executed_in, executed_out);

  /* clear the old input */
  pfft_clear_input_real(3, n, local_ni, local_i_start,
      executed_in);

  /* execute parallel backward FFT */
  pfft_execute_dft_c2r(plan_back, executed_out, executed_in);
  
  /* Scale data */
  for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1] * local_ni[2]; l++)
    executed_in[l] /= (n[0]*n[1]*n[2]);

  /* Print error of back transformed data */
  err = pfft_check_output_real(3, n, local_ni, local_i_start, executed_in, comm_cart_2d);
  pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td, %td):\n", n[0], n[1], n[2]); 
  pfft_printf(comm_cart_2d, "maxerror = %6.2e;\n", err);
  
  /* free mem and finalize */
  pfft_destroy_plan(plan_forw);
  pfft_destroy_plan(plan_back);
  MPI_Comm_free(&comm_cart_2d);
  pfft_free(executed_in); pfft_free(executed_out);
  MPI_Finalize();
  return 0;
}
int main(int argc, char **argv)
{
  int np[2];
  ptrdiff_t n[3], ni[3], howmany;
  double err;

  ptrdiff_t alloc_local_c;
  ptrdiff_t local_ni_c[3], local_i_start_c[3];
  ptrdiff_t local_no_c[3], local_o_start_c[3];
  pfft_complex *in_c, *out_c;
  pfft_plan plan_forw_c=NULL, plan_back_c=NULL;

  ptrdiff_t alloc_local_r, alloc_local_forw, alloc_local_back;
  ptrdiff_t local_ni_r[3], local_i_start_r[3];
  ptrdiff_t local_no_r[3], local_o_start_r[3];
  pfft_complex *in_r;
  double *out_r;
  pfft_plan plan_forw_r=NULL, plan_back_r=NULL;

  MPI_Comm comm_cart_2d;

  /* Set size of FFT and process mesh */
  ni[0] = 4; ni[1] = 4; ni[2] = 4;
  n[0] = 6; n[1] = 6; n[2] = 6;
  np[0] = 2; np[1] = 2;
  howmany = 1;

  /* Initialize MPI and PFFT */
  MPI_Init(&argc, &argv);
  pfft_init();

  /* Create two-dimensional process grid of size np[0] x np[1], if possible */
  if( pfft_create_procmesh_2d(MPI_COMM_WORLD, np[0], np[1], &comm_cart_2d) ){
    pfft_fprintf(MPI_COMM_WORLD, stderr, "Error: This test file only works with %d processes.\n", np[0]*np[1]);
    MPI_Finalize();
    return 1;
  }

  /* Get parameters of data distribution */
  alloc_local_forw = pfft_local_size_many_dft(3, n, ni, n, howmany, PFFT_DEFAULT_BLOCKS, PFFT_DEFAULT_BLOCKS, comm_cart_2d, PFFT_TRANSPOSED_NONE| PFFT_SHIFTED_IN | PFFT_SHIFTED_OUT,
      local_ni_c, local_i_start_c, local_no_c, local_o_start_c);
  alloc_local_back = pfft_local_size_many_dft(3, n, n, ni, howmany, PFFT_DEFAULT_BLOCKS, PFFT_DEFAULT_BLOCKS, comm_cart_2d, PFFT_TRANSPOSED_NONE| PFFT_SHIFTED_IN | PFFT_SHIFTED_OUT,
      local_no_c, local_o_start_c, local_ni_c, local_i_start_c);

  alloc_local_c = (alloc_local_forw > alloc_local_back) ? alloc_local_forw : alloc_local_back;

  alloc_local_forw = pfft_local_size_many_dft_c2r(3, n, ni, n, howmany, PFFT_DEFAULT_BLOCKS, PFFT_DEFAULT_BLOCKS, comm_cart_2d, PFFT_TRANSPOSED_NONE| PFFT_SHIFTED_IN | PFFT_SHIFTED_OUT,
      local_ni_r, local_i_start_r, local_no_r, local_o_start_r);
  alloc_local_back = pfft_local_size_many_dft_r2c(3, n, n, ni, howmany, PFFT_DEFAULT_BLOCKS, PFFT_DEFAULT_BLOCKS, comm_cart_2d, PFFT_TRANSPOSED_NONE| PFFT_SHIFTED_IN | PFFT_SHIFTED_OUT,
      local_no_r, local_o_start_r, local_ni_r, local_i_start_r);

  alloc_local_r = (alloc_local_forw > alloc_local_back) ? alloc_local_forw : alloc_local_back;


  /* Allocate memory */
  in_c  = pfft_alloc_complex(alloc_local_c);
  out_c = pfft_alloc_complex(alloc_local_c);
  in_r  = pfft_alloc_complex(alloc_local_r);
  out_r = pfft_alloc_real(2*alloc_local_r);


  /* Plan parallel forward FFT */
  plan_forw_c = pfft_plan_many_dft(3,
      n, ni, n, howmany, PFFT_DEFAULT_BLOCKS, PFFT_DEFAULT_BLOCKS, in_c, out_c, comm_cart_2d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT| PFFT_SHIFTED_IN | PFFT_SHIFTED_OUT);
  plan_forw_r = pfft_plan_many_dft_c2r(3,
      n, ni, n, howmany, PFFT_DEFAULT_BLOCKS, PFFT_DEFAULT_BLOCKS, in_r, out_r, comm_cart_2d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT| PFFT_SHIFTED_IN | PFFT_SHIFTED_OUT);

  /* Plan parallel backward FFT */
  plan_back_c = pfft_plan_many_dft(3,
      n, n, ni, howmany, PFFT_DEFAULT_BLOCKS, PFFT_DEFAULT_BLOCKS, out_c, in_c, comm_cart_2d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT| PFFT_SHIFTED_IN | PFFT_SHIFTED_OUT);
  plan_back_r = pfft_plan_many_dft_r2c(3,
      n, n, ni, howmany, PFFT_DEFAULT_BLOCKS, PFFT_DEFAULT_BLOCKS, out_r, in_r, comm_cart_2d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT| PFFT_SHIFTED_IN | PFFT_SHIFTED_OUT);

  /* Initialize input with random numbers */
  init_input(ni, local_ni_c, local_i_start_c, in_c);
  init_input(ni, local_ni_r, local_i_start_r, in_r);

//   pfft_apr_complex_3d(in_c, local_ni_c, local_i_start_c, "c2c input:\n", comm_cart_2d);
//   pfft_apr_complex_3d(in_r, local_ni_r, local_i_start_r, "c2r input:\n", comm_cart_2d);

  /* execute parallel forward FFT */
  pfft_execute(plan_forw_c);

  /* clear the old input */

  pfft_execute(plan_forw_r);

//   pfft_apr_complex_3d(out_c, local_no_c, local_o_start_c, "c2c output:\n", comm_cart_2d);
//   pfft_apr_real_3d(out_r, local_no_r, local_o_start_r, "c2r output:\n", comm_cart_2d);

  /* execute parallel backward FFT */
  pfft_execute(plan_back_c);
  pfft_execute(plan_back_r);

//   pfft_apr_complex_3d(in_c, local_ni_c, local_i_start_c, "c2c^ output:\n", comm_cart_2d);
//   pfft_apr_complex_3d(in_r, local_ni_r, local_i_start_r, "c2r^ output:\n", comm_cart_2d);

  /* Scale data */
  for(ptrdiff_t l=0; l < local_ni_c[0] * local_ni_c[1] * local_ni_c[2]; l++)
    in_c[l] /= (n[0]*n[1]*n[2]);
  for(ptrdiff_t l=0; l < local_ni_r[0] * local_ni_r[1] * local_ni_r[2]; l++)
    in_r[l] /= (n[0]*n[1]*n[2]);

  /* Print error of back transformed data */
  err = compare_c2c_c2r(local_ni_c, local_ni_r, in_c, in_r, comm_cart_2d);

  pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td, %td):\n", n[0], n[1], n[2]);
  pfft_printf(comm_cart_2d, "maxerror = %6.2e;\n", err);
  
  /* free mem and finalize */
  pfft_destroy_plan(plan_forw_c);
  pfft_destroy_plan(plan_back_c);
  pfft_destroy_plan(plan_forw_r);
  pfft_destroy_plan(plan_back_r);
  MPI_Comm_free(&comm_cart_2d);
  pfft_free(in_c); pfft_free(out_c);
  pfft_free(in_r); pfft_free(out_r);
  MPI_Finalize();
  return 0;
}
int main(int argc, char **argv){
  int np[2];
  ptrdiff_t n[3], ni[3], no[3];
  ptrdiff_t alloc_local_forw, alloc_local_back, alloc_local, howmany;
  ptrdiff_t local_ni[3], local_i_start[3];
  ptrdiff_t local_n[3], local_start[3];
  ptrdiff_t local_no[3], local_o_start[3];
  double err, *in;
  pfft_complex *out;
  pfft_plan plan_forw=NULL, plan_back=NULL;
  MPI_Comm comm_cart_2d;
  
  /* Set size of FFT and process mesh */
  ni[0] = ni[1] = ni[2] = 16;
  n[0] = 29; n[1] = 27; n[2] = 31;
  for(int t=0; t<3; t++)
    no[t] = ni[t];
  np[0] = 2; np[1] = 2;
  howmany = 1;

  /* Initialize MPI and PFFT */
  MPI_Init(&argc, &argv);
  pfft_init();

  /* Create two-dimensional process grid of size np[0] x np[1], if possible */
  if( pfft_create_procmesh_2d(MPI_COMM_WORLD, np[0], np[1], &comm_cart_2d) ){
    pfft_fprintf(MPI_COMM_WORLD, stderr, "Error: This test file only works with %d processes.\n", np[0]*np[1]);
    MPI_Finalize();
    return 1;
  }

  /* Get parameters of data distribution */
  alloc_local_forw = pfft_local_size_many_dft_r2c(3, n, ni, n, howmany,
      PFFT_DEFAULT_BLOCKS, PFFT_DEFAULT_BLOCKS,
      comm_cart_2d, PFFT_TRANSPOSED_NONE | PFFT_PADDED_R2C,
      local_ni, local_i_start, local_n, local_start);

  alloc_local_back = pfft_local_size_many_dft_c2r(3, n, n, no, howmany,
      PFFT_DEFAULT_BLOCKS, PFFT_DEFAULT_BLOCKS,
      comm_cart_2d, PFFT_TRANSPOSED_NONE | PFFT_PADDED_R2C,
      local_n, local_start, local_no, local_o_start);

  /* Allocate enough memory for both trafos */
  alloc_local = (alloc_local_forw > alloc_local_back) ?
    alloc_local_forw : alloc_local_back;
  in  = pfft_alloc_real(2 * alloc_local);
  out = pfft_alloc_complex(alloc_local);

  /* Plan parallel forward FFT */
  plan_forw = pfft_plan_many_dft_r2c(
      3, n, ni, n, howmany, PFFT_DEFAULT_BLOCKS, PFFT_DEFAULT_BLOCKS,
      in, out, comm_cart_2d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT| PFFT_PADDED_R2C);

  /* Plan parallel backward FFT */
  plan_back = pfft_plan_many_dft_c2r(
      3, n, n, no, howmany, PFFT_DEFAULT_BLOCKS, PFFT_DEFAULT_BLOCKS,
      out, in, comm_cart_2d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT| PFFT_PADDED_R2C);

  /* Initialize input with random numbers */
  pfft_init_input_real(3, ni, local_ni, local_i_start,
      in);

  /* Execute parallel forward FFT */
  pfft_execute(plan_forw);

  /* clear the old input */
  pfft_clear_input_real(3, ni, local_ni, local_i_start,
      in);
 
  /* execute parallel backward FFT */
  pfft_execute(plan_back);
  
  /* Scale data */
  for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1] * local_ni[2]; l++)
    in[l] /= (n[0]*n[1]*n[2]);

  /* Print error of back transformed data */
  MPI_Barrier(MPI_COMM_WORLD);
  err = pfft_check_output_real(3, ni, local_ni, local_i_start, in, comm_cart_2d);
  pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td, %td):\n", n[0], n[1], n[2]); 
  pfft_printf(comm_cart_2d, "maxerror = %6.2e;\n", err);
  
  /* free mem and finalize MPI */
  pfft_destroy_plan(plan_forw);
  pfft_destroy_plan(plan_back);
  MPI_Comm_free(&comm_cart_2d);
  pfft_free(in); pfft_free(out);
  MPI_Finalize();
  return 0;
}
int main(int argc, char **argv)
{
  int np[3];
  ptrdiff_t n[4], N[4];
  ptrdiff_t alloc_local;
  ptrdiff_t local_ni[4], local_i_start[4];
  ptrdiff_t local_no[4], local_o_start[4];
  double err, *in, *out;
  pfft_plan plan_forw=NULL, plan_back=NULL;
  MPI_Comm comm_cart_3d;
  pfft_r2r_kind kinds_forw[4], kinds_back[4];
  
  /* Set size of FFT and process mesh */
  n[0] = 13; n[1] = 14; n[2] = 19; n[3] = 17;
  np[0] = 2; np[1] = 2; np[2] = 2;
  
  /* Set FFTW kinds of 1d R2R trafos */
  kinds_forw[0] = PFFT_REDFT00; kinds_back[0] = PFFT_REDFT00;
  kinds_forw[1] = PFFT_REDFT01; kinds_back[1] = PFFT_REDFT10;
  kinds_forw[2] = PFFT_RODFT00; kinds_back[2] = PFFT_RODFT00;
  kinds_forw[3] = PFFT_RODFT10; kinds_back[3] = PFFT_RODFT01;

  /* Set logical DFT sizes corresponding to FFTW manual:
   * for REDFT00 N=2*(n-1), for RODFT00 N=2*(n+1), otherwise N=2*n */
  N[0] = 2*(n[0]-1);
  N[1] = 2*n[1];
  N[2] = 2*(n[2]+1); 
  N[3] = 2*n[3];

  /* Initialize MPI and PFFT */
  MPI_Init(&argc, &argv);
  pfft_init();

  /* Create three-dimensional process grid of size np[0] x np[1] x np[2], if possible */
  if( pfft_create_procmesh(3, MPI_COMM_WORLD, np, &comm_cart_3d) ){
    pfft_fprintf(MPI_COMM_WORLD, stderr, "Error: This test file only works with %d processes.\n", np[0]*np[1]*np[2]);
    MPI_Finalize();
    return 1;
  }
  
  /* Get parameters of data distribution */
  alloc_local = pfft_local_size_r2r(4, n, comm_cart_3d, PFFT_TRANSPOSED_NONE,
      local_ni, local_i_start, local_no, local_o_start);

  /* Allocate memory */
  in  = pfft_alloc_real(alloc_local);
  out = pfft_alloc_real(alloc_local);

  /* Plan parallel forward FFT */
  plan_forw = pfft_plan_r2r(
      4, n, in, out, comm_cart_3d, kinds_forw, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT);
  
  /* Plan parallel backward FFT */
  plan_back = pfft_plan_r2r(
      4, n, out, in, comm_cart_3d, kinds_back, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT);

  /* Initialize input with random numbers */
  pfft_init_input_real(4, n, local_ni, local_i_start,
      in);

  /* execute parallel forward FFT */
  pfft_execute(plan_forw);

  /* clear the old input */
  pfft_clear_input_real(4, n, local_ni, local_i_start,
      in);
  
  /* execute parallel backward FFT */
  pfft_execute(plan_back);
  
  /* Scale data */
  for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1] * local_ni[2] * local_ni[3]; l++)
    in[l] /= (N[0]*N[1]*N[2]*N[3]);
  
  /* Print error of back transformed data */
  MPI_Barrier(MPI_COMM_WORLD);
  err = pfft_check_output_real(4, n, local_ni, local_i_start, in, comm_cart_3d);
  pfft_printf(comm_cart_3d, "Error after one forward and backward trafo of size n=(%td, %td, %td, %td):\n", n[0], n[1], n[2], n[3]); 
  pfft_printf(comm_cart_3d, "maxerror = %6.2e;\n", err);
  
  /* free mem and finalize */
  pfft_destroy_plan(plan_forw);
  pfft_destroy_plan(plan_back);
  MPI_Comm_free(&comm_cart_3d);
  pfft_free(in); pfft_free(out);
  MPI_Finalize();
  return 0;
}
int main(int argc, char **argv)
{
  int np[2];
  ptrdiff_t n[4];
  ptrdiff_t alloc_local;
  ptrdiff_t local_ni[4], local_i_start[4];
  ptrdiff_t local_no[4], local_o_start[4];
  double err, *in;
  pfft_complex *out;
  pfft_plan plan_forw=NULL, plan_back=NULL;
  MPI_Comm comm_cart_2d;
  
  /* Set size of FFT and process mesh */
  n[0] = 13; n[1] = 14; n[2] = 19; n[3] = 17;
  np[0] = 2; np[1] = 2;
  
  /* Initialize MPI and PFFT */
  MPI_Init(&argc, &argv);
  pfft_init();

  /* Create two-dimensional process grid of size np[0] x np[1], if possible */
  if( pfft_create_procmesh_2d(MPI_COMM_WORLD, np[0], np[1], &comm_cart_2d) ){
    pfft_fprintf(MPI_COMM_WORLD, stderr, "Error: This test file only works with %d processes.\n", np[0]*np[1]);
    MPI_Finalize();
    return 1;
  }
  
  /* Get parameters of data distribution */
  alloc_local = pfft_local_size_dft_r2c(4, n, comm_cart_2d, PFFT_TRANSPOSED_NONE,
      local_ni, local_i_start, local_no, local_o_start);

  /* Allocate memory */
  in  = pfft_alloc_real(2 * alloc_local);
  out = pfft_alloc_complex(alloc_local);

  /* Plan parallel forward FFT */
  plan_forw = pfft_plan_dft_r2c(
      4, n, in, out, comm_cart_2d, PFFT_FORWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT);
  
  /* Plan parallel backward FFT */
  plan_back = pfft_plan_dft_c2r(
      4, n, out, in, comm_cart_2d, PFFT_BACKWARD, PFFT_TRANSPOSED_NONE| PFFT_MEASURE| PFFT_DESTROY_INPUT);

  /* Initialize input with random numbers */
  pfft_init_input_real(4, n, local_ni, local_i_start,
      in);

  /* execute parallel forward FFT */
  pfft_execute(plan_forw);
  
  /* execute parallel backward FFT */
  pfft_execute(plan_back);
  
  /* Scale data */
  for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1] * local_ni[2] * local_ni[3]; l++)
    in[l] /= (n[0]*n[1]*n[2]*n[3]);
  
  /* Print error of back transformed data */
  MPI_Barrier(MPI_COMM_WORLD);
  err = pfft_check_output_real(4, n, local_ni, local_i_start, in, comm_cart_2d);
  pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td, %td, %td):\n", n[0], n[1], n[2], n[3]); 
  pfft_printf(comm_cart_2d, "maxerror = %6.2e;\n", err);
  
  /* free mem and finalize */
  pfft_destroy_plan(plan_forw);
  pfft_destroy_plan(plan_back);
  MPI_Comm_free(&comm_cart_2d);
  pfft_free(in); pfft_free(out);
  MPI_Finalize();
  return 0;
}
int main(int argc, char **argv)
{
  int np[2];
  ptrdiff_t n[3], N[3];
  ptrdiff_t alloc_local;
  ptrdiff_t local_ni[3], local_i_start[3];
  ptrdiff_t local_no[3], local_o_start[3];
  double err, *in, *out;
  pfft_plan plan_forw=NULL, plan_back=NULL;
  MPI_Comm comm_cart_2d;
  pfft_r2r_kind kinds_forw[3], kinds_back[3];
  
  /* Set size of FFT and process mesh */
  n[0] = 29; n[1] = 27; n[2] = 31;
  np[0] = 2; np[1] = 2;
  
  /* Set FFTW kinds of 1d R2R trafos */
  kinds_forw[0] = PFFT_REDFT00; kinds_back[0] = PFFT_REDFT00;
  kinds_forw[1] = PFFT_REDFT01; kinds_back[1] = PFFT_REDFT10;
  kinds_forw[2] = PFFT_RODFT00; kinds_back[2] = PFFT_RODFT00;

  /* Set logical DFT sizes corresponding to FFTW manual:
   * for REDFT00 N=2*(n-1), for RODFT00 N=2*(n+1), otherwise N=2*n */
  N[0] = 2*(n[0]-1);
  N[1] = 2*n[1];
  N[2] = 2*(n[2]+1); 

  /* Initialize MPI and PFFT */
  MPI_Init(&argc, &argv);
  pfft_init();

  /* Create two-dimensional process grid of size np[0] x np[1], if possible */
  if( pfft_create_procmesh_2d(MPI_COMM_WORLD, np[0], np[1], &comm_cart_2d) ){
    pfft_fprintf(MPI_COMM_WORLD, stderr, "Error: This test file only works with %d processes.\n", np[0]*np[1]);
    MPI_Finalize();
    return 1;
  }
  
  /* Get parameters of data distribution */
  alloc_local = pfft_local_size_r2r_3d(n, comm_cart_2d, PFFT_TRANSPOSED_OUT,
      local_ni, local_i_start, local_no, local_o_start);

  /* Allocate memory */
  in  = pfft_alloc_real(alloc_local);
  out = pfft_alloc_real(alloc_local);

  /* Plan parallel forward FFT */
  plan_forw = pfft_plan_r2r_3d(
      n, in, out, comm_cart_2d, kinds_forw, PFFT_TRANSPOSED_OUT| PFFT_MEASURE| PFFT_DESTROY_INPUT);
 
  /* Plan parallel backward FFT */
  plan_back = pfft_plan_r2r_3d(
      n, out, in, comm_cart_2d, kinds_back, PFFT_TRANSPOSED_IN| PFFT_MEASURE| PFFT_DESTROY_INPUT);

  /* Initialize input with random numbers */
  pfft_init_input_real_3d(n, local_ni, local_i_start,
      in);

  int myrank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

//  for(int t=0; t<size; t++){
//    if(t == myrank){  
//      int m=0;
//      for(int k0=0; k0<local_ni[0]; k0++)
//        for(int k1=0; k1<local_ni[1]; k1++){
//          for(int k2=0; k2<local_ni[2]; k2++, m++)
//            printf("in[%d, %d, %d] = %.2f\t", k0+local_i_start[0], k1+local_i_start[1], k2+local_i_start[2], in[m]);
//          printf("\n");
//        }
//    }
//    fflush(stderr);
//    MPI_Barrier(MPI_COMM_WORLD);
//  }

  /* execute parallel forward FFT */
  pfft_execute(plan_forw);
  

//  for(int t=0; t<size; t++){
//    if(t == myrank){  
//      int m=0;
//      for(int k1=0; k1<local_no[1]; k1++)
//        for(int k2=0; k2<local_no[2]; k2++){
//          for(int k0=0; k0<local_no[0]; k0++, m++)
//            printf("out[%d, %d, %d] = %.2f\t", k0+local_o_start[0], k1+local_o_start[1], k2+local_o_start[2], out[m]);
//          printf("\n");
//        }
//    }
//    fflush(stderr);
//    MPI_Barrier(MPI_COMM_WORLD);
//  }

  /* execute parallel backward FFT */
  pfft_execute(plan_back);
  
  /* Scale data */
  for(ptrdiff_t l=0; l < local_ni[0] * local_ni[1] * local_ni[2]; l++)
    in[l] /= (N[0]*N[1]*N[2]);

  /* Print error of back transformed data */
  MPI_Barrier(MPI_COMM_WORLD);
  err = pfft_check_output_real_3d(n, local_ni, local_i_start, in, comm_cart_2d);
  pfft_printf(comm_cart_2d, "Error after one forward and backward trafo of size n=(%td, %td, %td):\n", n[0], n[1], n[2]); 
  pfft_printf(comm_cart_2d, "maxerror = %6.2e;\n", err);

  /* free mem and finalize */
  pfft_destroy_plan(plan_forw);
  pfft_destroy_plan(plan_back);
  MPI_Comm_free(&comm_cart_2d);
  pfft_free(in); pfft_free(out);
  MPI_Finalize();
  return 0;
}