Beispiel #1
0
/**
 * Execute C2C plan. This function is blocking and only returns after the transform is completed.
 * @note For inplace transforms, data_out should point to the same memory address as data, AND
 * the plan must have been created as inplace.
 * @param plan FFT plan created by \ref accfft_plan_dft_3d_r2c.
 * @param data Input data in frequency domain.
 * @param data_out Output data in frequency domain.
 * @param timer See \ref timer for more details.
 * @param XYZ a bit set field that determines which directions FFT should be executed
 */
void accfft_execute_c2c_gpu(accfft_plan_gpu* plan, int direction,Complex * data_d, Complex * data_out_d, double * timer,std::bitset<3> xyz){

  if(!plan->c2c_plan_baked){
    if(plan->procid==0) std::cout<<"Error. r2c plan has not been made correctly. Please first create the plan before calling execute functions."<<std::endl;
    return;
  }

  if(data_d==NULL)
    data_d=plan->data_c;
  if(data_out_d==NULL)
    data_out_d=plan->data_out_c;
  int * coords=plan->coord;
  int procid=plan->procid;
  double fft_time=0;
  double timings[5]={0};

  int NY=plan->N[1];
  cudaEvent_t memcpy_startEvent, memcpy_stopEvent;
  cudaEvent_t fft_startEvent, fft_stopEvent;
  checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) );
  checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_startEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_stopEvent) );
  cufftResult_t cufft_error;
  float dummy_time=0;

  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;

  if(direction==-1){
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/
    // FFT in Z direction
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft(cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;

    if(!plan->oneD){
      plan->T_plan_1->execute_gpu(plan->T_plan_1,(double*)data_out_d,timings,2,osize_0[0],coords[0]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1[0];++i){
        checkCuda_accfft(cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_out_d[i*osize_1[1]*osize_1[2]], (cufftDoubleComplex*)&data_out_d[i*osize_1[1]*osize_1[2]],CUFFT_FORWARD));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }

    MPI_Barrier(plan->c_comm);


    if(plan->oneD){
      plan->T_plan_2->execute_gpu(plan->T_plan_2,(double*)data_out_d,timings,2);
    }
    else{
      plan->T_plan_2->execute_gpu(plan->T_plan_2,(double*)data_out_d,timings,2,1,coords[1]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0 x N1/P0 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft(cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }

  }
  else if (direction==1){
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_d,CUFFT_INVERSE));
      checkCuda_accfft (cudaDeviceSynchronize());
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
      MPI_Barrier(plan->c_comm);
    }


    if(plan->oneD){
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(double*)data_d,timings,1);
    }
    else{
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(double*)data_d,timings,1,1,coords[1]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1i[0];++i){
        checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_d[i*NY*osize_1i[2]], (cufftDoubleComplex*)&data_d[i*NY*osize_1i[2]],CUFFT_INVERSE));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
      MPI_Barrier(plan->c_comm);
    }

    if(!plan->oneD){
      plan->T_plan_1i->execute_gpu(plan->T_plan_1i,(double*)data_d,timings,1,osize_1i[0],coords[0]);
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/

    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleComplex*)data_out_d,CUFFT_INVERSE));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;

  }

  timings[4]+=fft_time;
  if(timer==NULL){
    //delete [] timings;
  }
  else{
    timer[0]+=timings[0];
    timer[1]+=timings[1];
    timer[2]+=timings[2];
    timer[3]+=timings[3];
    timer[4]+=timings[4];
  }
  MPI_Barrier(plan->c_comm);

  return;
}// end accfft_execute_c2c_gpu
Beispiel #2
0
void accfft_execute_gpu(accfft_plan_gpu* plan, int direction,double * data_d, double * data_out_d, double * timer,std::bitset<3> xyz){

  if(data_d==NULL)
    data_d=plan->data;
  if(data_out_d==NULL)
    data_out_d=plan->data_out;

  int * coords=plan->coord;
  int procid=plan->procid;
  double fft_time=0;
  double timings[5]={0};

  cudaEvent_t memcpy_startEvent, memcpy_stopEvent;
  cudaEvent_t fft_startEvent, fft_stopEvent;
  checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) );
  checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_startEvent) );
  checkCuda_accfft( cudaEventCreate(&fft_stopEvent) );
  int NY=plan->N[1];
  float dummy_time=0;


  int *osize_0 =plan->osize_0;// *ostart_0 =plan->ostart_0;
  int *osize_1 =plan->osize_1;// *ostart_1 =plan->ostart_1;
  //int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
  int *osize_1i=plan->osize_1i;//*ostart_1i=plan->ostart_1i;
  //int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;

  if(direction==-1){
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/
    // FFT in Z direction
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecD2Z(plan->fplan_0,(cufftDoubleReal*)data_d, (cufftDoubleComplex*)data_out_d));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;
    // Perform N0/P0 transpose


    if(!plan->oneD){
      plan->T_plan_1->execute_gpu(plan->T_plan_1,data_out_d,timings,2,osize_0[0],coords[0]);
    }
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1[0];++i){
        checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]], (cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]],CUFFT_FORWARD));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }

    if(plan->oneD){
      plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2);
    }
    else{
      plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2,1,coords[1]);
    }
    /**************************************************************/
    /*******************  N0 x N1/P0 x N2/P1 **********************/
    /**************************************************************/
    MPI_Barrier(plan->c_comm);
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
  }
  else if (direction==1){
    if(xyz[0]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_d,CUFFT_INVERSE));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }

    if(plan->oneD){
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,data_d,timings,1);
    }
    else{
      plan->T_plan_2i->execute_gpu(plan->T_plan_2i,data_d,timings,1,1,coords[1]);
    }
    MPI_Barrier(plan->c_comm);
    /**************************************************************/
    /*******************  N0/P0 x N1 x N2/P1 **********************/
    /**************************************************************/
    if(xyz[1]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      for (int i=0;i<osize_1i[0];++i){
        checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]], (cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]],CUFFT_INVERSE));
      }
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }



    if(!plan->oneD){
      plan->T_plan_1i->execute_gpu(plan->T_plan_1i,data_d,timings,1,osize_1i[0],coords[0]);
    }
    /**************************************************************/
    /*******************  N0/P0 x N1/P1 x N2 **********************/
    /**************************************************************/

    // IFFT in Z direction
    if(xyz[2]){
      checkCuda_accfft( cudaEventRecord(fft_startEvent,0) );
      checkCuda_accfft (cufftExecZ2D(plan->iplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleReal*)data_out_d));
      checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) );
      checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed
      checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) );
      fft_time+=dummy_time/1000;
    }
    else
      data_out_d=data_d;

  }

  timings[4]+=fft_time;
  if(timer==NULL){
    //delete [] timings;
  }
  else{
    timer[0]+=timings[0];
    timer[1]+=timings[1];
    timer[2]+=timings[2];
    timer[3]+=timings[3];
    timer[4]+=timings[4];
  }
  checkCuda_accfft (cudaDeviceSynchronize());
  MPI_Barrier(plan->c_comm);

  return;
}// end accfft_execute_gpu
Beispiel #3
0
/**
 * Creates a 3D C2C parallel FFT plan. If data_out point to the same location as the input
 * data, then an inplace plan will be created. Otherwise the plan would be outplace.
 * @param n Integer array of size 3, corresponding to the global data size
 * @param data Input data in spatial domain
 * @param data_out Output data in frequency domain
 * @param c_comm Cartesian communicator returned by \ref accfft_create_comm
 * @param flags AccFFT flags, See \ref flags for more details.
 * @return
 */
accfft_plan_gpu*  accfft_plan_dft_3d_c2c_gpu(int * n, Complex * data_d, Complex * data_out_d, MPI_Comm c_comm,unsigned flags){
  accfft_plan_gpu *plan=new accfft_plan_gpu;
  int nprocs, procid;
  MPI_Comm_rank(c_comm, &procid);
  plan->procid=procid;
  MPI_Cart_get(c_comm,2,plan->np,plan->periods,plan->coord);
  plan->c_comm=c_comm;
  int *coord=plan->coord;
  MPI_Comm_split(c_comm,coord[0],coord[1],&plan->row_comm);
  MPI_Comm_split(c_comm,coord[1],coord[0],&plan->col_comm);
  plan->N[0]=n[0];plan->N[1]=n[1];plan->N[2]=n[2];
  int NX=n[0], NY=n[1], NZ=n[2];
  cufftResult_t cufft_error;

  plan->data_c=data_d;
  plan->data_out_c=data_out_d;
  if(data_out_d==data_d){
    plan->inplace=true;}
  else{plan->inplace=false;}

  if(plan->np[1]==1)
    plan->oneD=true;
  else
    plan->oneD=false;



  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;

  int alloc_local;
  int alloc_max=0,n_tuples=n[2]*2;

  //int isize[3],osize[3],istart[3],ostart[3];
  alloc_max=accfft_local_size_dft_c2c_gpu(n,plan->isize,plan->istart,plan->osize,plan->ostart,c_comm);
  plan->alloc_max=alloc_max;

  dfft_get_local_size_gpu(n[0],n[1],n[2],osize_0,ostart_0,c_comm);
  dfft_get_local_size_gpu(n[0],n[2],n[1],osize_1,ostart_1,c_comm);
  dfft_get_local_size_gpu(n[1],n[2],n[0],osize_2,ostart_2,c_comm);


  std::swap(osize_1[1],osize_1[2]);
  std::swap(ostart_1[1],ostart_1[2]);

  std::swap(ostart_2[1],ostart_2[2]);
  std::swap(ostart_2[0],ostart_2[1]);
  std::swap(osize_2[1],osize_2[2]);
  std::swap(osize_2[0],osize_2[1]);

  for(int i=0;i<3;i++){
    osize_1i[i]=osize_1[i];
    osize_2i[i]=osize_2[i];
    ostart_1i[i]=ostart_1[i];
    ostart_2i[i]=ostart_2[i];
  }


  // fplan_0
  {
    int f_inembed[1]={NZ};
    int f_onembed[1]={NZ};
    int idist=(NZ);
    int odist=(NZ);
    int istride=1;
    int ostride=1;
    int batch=osize_0[0]*osize_0[1];//NX;

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_0, 1, &n[2],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_Z2Z, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_0 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }
  // fplan_1
  {
    int f_inembed[1]={NY};
    int f_onembed[1]={NY};
    int idist=1;
    int odist=1;
    int istride=osize_1[2];
    int ostride=osize_1[2];
    int batch=osize_1[2];

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[1],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_Z2Z, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_1 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }
  // fplan_2
  {
    int f_inembed[1]={NX};
    int f_onembed[1]={NX};
    int idist=1;
    int odist=1;
    int istride=osize_2[1]*osize_2[2];
    int ostride=osize_2[1]*osize_2[2];
    int batch=osize_2[1]*osize_2[2];;

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_2, 1, &n[0],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_Z2Z, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_2 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }

  // 1D Decomposition
  if (plan->oneD){
    int NX=n[0],NY=n[1],NZ=n[2];


    plan->alloc_max=alloc_max;

    plan->Mem_mgr= new Mem_Mgr_gpu <double>(NX,NY,(NZ)*2,c_comm);
    plan->T_plan_2= new T_Plan_gpu <double>(NX,NY,(NZ)*2, plan->Mem_mgr,c_comm);
    plan->T_plan_2i= new T_Plan_gpu<double>(NY,NX,NZ*2, plan->Mem_mgr,c_comm);

    plan->T_plan_2->alloc_local=alloc_max;
    plan->T_plan_2i->alloc_local=alloc_max;
    plan->T_plan_1=NULL;
    plan->T_plan_1i=NULL;




    if(flags==ACCFFT_MEASURE){
      plan->T_plan_2->which_fast_method_gpu(plan->T_plan_2,(double*)data_out_d);

    }
    else{
      plan->T_plan_2->method=2;
      plan->T_plan_2->kway=2;
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    plan->T_plan_2i->method=-plan->T_plan_2->method;
    plan->T_plan_2i->kway=plan->T_plan_2->kway;
    plan->T_plan_2i->kway_async=plan->T_plan_2->kway_async;


  }// end 1d c2c

  // 2D Decomposition
  if (!plan->oneD){
    // the reaseon for n_tuples/2 is to avoid splitting of imag and real parts of complex numbers
    plan->Mem_mgr=  new Mem_Mgr_gpu<double>(n[1],n[2],2,plan->row_comm,osize_0[0],alloc_max);
    plan->T_plan_1= new  T_Plan_gpu<double>(n[1],n[2],2, plan->Mem_mgr, plan->row_comm,osize_0[0]);
    plan->T_plan_2= new  T_Plan_gpu<double>(n[0],n[1],2*osize_2[2], plan->Mem_mgr, plan->col_comm);
    plan->T_plan_2i= new T_Plan_gpu<double>(n[1],n[0],2*osize_2i[2], plan->Mem_mgr, plan->col_comm);
    plan->T_plan_1i= new T_Plan_gpu<double>(n[2],n[1],2, plan->Mem_mgr, plan->row_comm,osize_1i[0]);

    plan->T_plan_1->alloc_local=plan->alloc_max;
    plan->T_plan_2->alloc_local=plan->alloc_max;
    plan->T_plan_2i->alloc_local=plan->alloc_max;
    plan->T_plan_1i->alloc_local=plan->alloc_max;


    plan->iplan_0=NULL;
    plan->iplan_1=NULL;
    plan->iplan_2=NULL;

    int coords[2],np[2],periods[2];
    MPI_Cart_get(c_comm,2,np,periods,coords);

    if(flags==ACCFFT_MEASURE){
      if(coords[0]==0){
        plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,(double*)data_out_d,osize_0[0]);
      }
    }
    else{
      plan->T_plan_1->method=2;
      plan->T_plan_1->kway=2;
    }

    MPI_Bcast(&plan->T_plan_1->method,1, MPI_INT,0, c_comm );
    MPI_Bcast(&plan->T_plan_1->kway,1, MPI_INT,0, c_comm );
    MPI_Bcast(&plan->T_plan_1->kway_async,1, MPI::BOOL,0, c_comm );
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);


    plan->T_plan_1->method =plan->T_plan_1->method;
    plan->T_plan_2->method =plan->T_plan_1->method;
    plan->T_plan_2i->method=-plan->T_plan_1->method;
    plan->T_plan_1i->method=-plan->T_plan_1->method;

    plan->T_plan_1->kway =plan->T_plan_1->kway;
    plan->T_plan_2->kway =plan->T_plan_1->kway;
    plan->T_plan_2i->kway=plan->T_plan_1->kway;
    plan->T_plan_1i->kway=plan->T_plan_1->kway;

    plan->T_plan_1->kway_async =plan->T_plan_1->kway_async;
    plan->T_plan_2->kway_async =plan->T_plan_1->kway_async;
    plan->T_plan_2i->kway_async=plan->T_plan_1->kway_async;
    plan->T_plan_1i->kway_async=plan->T_plan_1->kway_async;


  }// end 2d c2c

  plan->c2c_plan_baked=true;
  return plan;
}// end accfft_plan_dft_c2c_gpu
/**
 * Creates a 3D single precision R2C parallel FFT plan. If data_out point to the same location as the input
 * data, then an inplace plan will be created. Otherwise the plan would be outplace.
 * @param n Integer array of size 3, corresponding to the global data size
 * @param data Input data in spatial domain
 * @param data_out Output data in frequency domain
 * @param c_comm Cartesian communicator returned by \ref accfft_create_comm
 * @param flags AccFFT flags, See \ref flags for more details.
 * @return
 */
accfft_plan_gpuf*  accfft_plan_dft_3d_r2c_gpuf(int * n, float * data_d,float * data_out_d, MPI_Comm c_comm,unsigned flags){
  accfft_plan_gpuf *plan=new accfft_plan_gpuf;
  int procid;
  MPI_Comm_rank(c_comm, &procid);
  plan->procid=procid;
  MPI_Cart_get(c_comm,2,plan->np,plan->periods,plan->coord);
  plan->c_comm=c_comm;
  int *coord=plan->coord;
  MPI_Comm_split(c_comm,coord[0],coord[1],&plan->row_comm);
  MPI_Comm_split(c_comm,coord[1],coord[0],&plan->col_comm);
  plan->N[0]=n[0];plan->N[1]=n[1];plan->N[2]=n[2];
  plan->data=data_d;
  plan->data_out=data_out_d;

  if(plan->np[1]==1)
    plan->oneD=true;
  else
    plan->oneD=false;


  if(data_out_d==data_d){
    plan->inplace=true;}
  else{plan->inplace=false;}

  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;

  int alloc_max=0;
  int n_tuples_i, n_tuples_o;
  //plan->inplace==true ? n_tuples=(n[2]/2+1)*2: n_tuples=n[2]*2;
  plan->inplace==true ? n_tuples_i=(n[2]/2+1)*2:  n_tuples_i=n[2];
  n_tuples_o=(n[2]/2+1)*2;

  int isize[3],osize[3],istart[3],ostart[3];
  alloc_max=accfft_local_size_dft_r2c_gpuf(n,isize,istart,osize,ostart,c_comm,plan->inplace);
  plan->alloc_max=alloc_max;

  dfft_get_local_size_gpuf(n[0],n[1],n_tuples_o,osize_0,ostart_0,c_comm);
  dfft_get_local_size_gpuf(n[0],n_tuples_o/2,n[1],osize_1,ostart_1,c_comm);
  dfft_get_local_size_gpuf(n[1],n_tuples_o/2,n[0],osize_2,ostart_2,c_comm);

  std::swap(osize_1[1],osize_1[2]);
  std::swap(ostart_1[1],ostart_1[2]);

  std::swap(ostart_2[1],ostart_2[2]);
  std::swap(ostart_2[0],ostart_2[1]);
  std::swap(osize_2[1],osize_2[2]);
  std::swap(osize_2[0],osize_2[1]);

  for(int i=0;i<3;i++){
    osize_1i[i]=osize_1[i];
    osize_2i[i]=osize_2[i];
    ostart_1i[i]=ostart_1[i];
    ostart_2i[i]=ostart_2[i];
  }

  // fplan_0
  int NX=n[0], NY=n[1], NZ=n[2];
  cufftResult_t cufft_error;
  {
    int f_inembed[1]={n_tuples_i};
    int f_onembed[1]={n_tuples_o/2};
    int idist=(n_tuples_i);
    int odist=n_tuples_o/2;
    int istride=1;
    int ostride=1;
    int batch=osize_0[0]*osize_0[1];//NX;

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_0, 1, &n[2],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_R2C, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_0 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->iplan_0, 1, &n[2],
          f_onembed, ostride, odist, // *onembed, ostride, odist
          f_inembed, istride, idist, // *inembed, istride, idist
          CUFFT_C2R, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: iplan_0 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }
  // fplan_1
  {
    int f_inembed[1]={NY};
    int f_onembed[1]={NY};
    int idist=1;
    int odist=1;
    int istride=osize_1[2];
    int ostride=osize_1[2];
    int batch=osize_1[2];

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[1],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_C2C, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_1 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }
  // fplan_2
  {
    int f_inembed[1]={NX};
    int f_onembed[1]={NX};
    int idist=1;
    int odist=1;
    int istride=osize_2[1]*osize_2[2];
    int ostride=osize_2[1]*osize_2[2];
    int batch=osize_2[1]*osize_2[2];;

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_2, 1, &n[0],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_C2C, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_2 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }


  // 1D Decomposition
  if(plan->oneD){
    int N0=n[0], N1=n[1], N2=n[2];

    plan->Mem_mgr  = new Mem_Mgr_gpu<float>(N0,N1,n_tuples_o,c_comm);
    plan->T_plan_2 = new T_Plan_gpu <float>(N0,N1,n_tuples_o, plan->Mem_mgr, c_comm);
    plan->T_plan_2i= new T_Plan_gpu <float>(N1,N0,n_tuples_o,plan->Mem_mgr, c_comm);
    plan->T_plan_1=NULL;
    plan->T_plan_1i=NULL;

    plan->alloc_max=alloc_max;
    plan->T_plan_2->alloc_local=alloc_max;
    plan->T_plan_2i->alloc_local=alloc_max;


    if(flags==ACCFFT_MEASURE){
      plan->T_plan_2->which_fast_method_gpu(plan->T_plan_2,data_out_d);
    }
    else{
      plan->T_plan_2->method=2;
      plan->T_plan_2->kway=2;
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    plan->T_plan_2->method =plan->T_plan_2->method;
    plan->T_plan_2i->method=plan->T_plan_2->method;

    plan->T_plan_2->kway =plan->T_plan_2->kway;
    plan->T_plan_2i->kway=plan->T_plan_2->kway;


  }

  // 2D Decomposition
  if (!plan->oneD){
    // the reaseon for n_tuples/2 is to avoid splitting of imag and real parts of complex numbers
    plan->Mem_mgr  =new Mem_Mgr_gpu<float>(n[1],n_tuples_o/2,2,plan->row_comm,osize_0[0],alloc_max);
    plan->T_plan_1 = new T_Plan_gpu<float>(n[1],n_tuples_o/2,2, plan->Mem_mgr, plan->row_comm,osize_0[0]);
    plan->T_plan_2 = new T_Plan_gpu<float>(n[0],n[1],osize_2[2]*2,plan->Mem_mgr, plan->col_comm);
    plan->T_plan_2i= new T_Plan_gpu<float>(n[1],n[0],osize_2i[2]*2, plan->Mem_mgr, plan->col_comm);
    plan->T_plan_1i= new T_Plan_gpu<float>(n_tuples_o/2,n[1],2, plan->Mem_mgr, plan->row_comm,osize_1i[0]);


    plan->T_plan_1->alloc_local=plan->alloc_max;
    plan->T_plan_2->alloc_local=plan->alloc_max;
    plan->T_plan_2i->alloc_local=plan->alloc_max;
    plan->T_plan_1i->alloc_local=plan->alloc_max;



    if(flags==ACCFFT_MEASURE){
      if(coord[0]==0){
        plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,data_out_d,osize_0[0]);
      }
    }
    else{
      plan->T_plan_1->method=2;
      plan->T_plan_1->kway=2;
    }

    MPI_Bcast(&plan->T_plan_1->method,1, MPI_INT,0, c_comm );
    MPI_Bcast(&plan->T_plan_1->kway,1, MPI_INT,0, c_comm );

    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);
    plan->T_plan_1->method =plan->T_plan_1->method;
    plan->T_plan_2->method =plan->T_plan_1->method;
    plan->T_plan_2i->method=plan->T_plan_1->method;
    plan->T_plan_1i->method=plan->T_plan_1->method;
    plan->T_plan_1->kway =plan->T_plan_1->kway;
    plan->T_plan_2->kway =plan->T_plan_1->kway;
    plan->T_plan_2i->kway=plan->T_plan_1->kway;
    plan->T_plan_1i->kway=plan->T_plan_1->kway;

    plan->iplan_1=-1;
    plan->iplan_2=-1;

  }

  plan->r2c_plan_baked=true;
  return plan;

}