Esempio n. 1
0
/*
 * Class:     jcuda_jcufft_JCufft
 * Method:    cufftPlanManyNative
 * Signature: (Ljcuda/jcufft/cufftHandle;I[I[III[IIIII)I
 */
JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftPlanManyNative
  (JNIEnv *env, jclass cla, jobject handle, jint rank, jintArray n, jintArray inembed, jint istride, jint idist, jintArray onembed, jint ostride, jint odist, jint type, jint batch)
{
    if (handle == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftPlanMany");
        return JCUFFT_INTERNAL_ERROR;
    }
    if (n == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'n' is null for cufftPlanMany");
        return JCUFFT_INTERNAL_ERROR;
    }

    Logger::log(LOG_TRACE, "Executing cufftPlanMany\n");

    cufftHandle plan = env->GetIntField(handle, cufftHandle_plan);
    int *nativeN = getArrayContents(env, n);
    int *nativeInembed = getArrayContents(env, inembed);
    int *nativeOnembed = getArrayContents(env, onembed);

    cufftResult result = cufftPlanMany(&plan, rank, nativeN, nativeInembed, (int)istride, (int)idist, nativeOnembed, (int)ostride, (int)odist, getCufftType(type), (int)batch);

    delete[] nativeN;
    delete[] nativeInembed;
    delete[] nativeOnembed;
    env->SetIntField(handle, cufftHandle_plan, plan);
    return result;

}
Esempio n. 2
0
    static void instantiate(const callable_type_data *DYND_UNUSED(self), const callable_type *DYND_UNUSED(self_tp),
                            kernel_builder *ckb, intptr_t ckb_offset, const ndt::type &DYND_UNUSED(dst_tp),
                            const char *dst_arrmeta, intptr_t DYND_UNUSED(nsrc), const ndt::type *src_tp,
                            const char *const *src_arrmeta, kernel_request_t kernreq,
                            const eval::eval_context *DYND_UNUSED(ectx), const nd::array &kwds,
                            const std::map<std::string, ndt::type> &DYND_UNUSED(tp_vars))
    {
      const size_stride_t *dst_size_stride = reinterpret_cast<const size_stride_t *>(dst_arrmeta);
      const size_stride_t *src_size_stride = reinterpret_cast<const size_stride_t *>(src_arrmeta[0]);

      array axes = kwds.p("axes");
      array shape = kwds.p("shape");

      int ndim = static_cast<int>(src_tp[0].get_ndim());

      int rank = static_cast<int>(axes.is_missing() ? ndim : (ndim - 1));
      int istride = static_cast<int>(src_size_stride[ndim - 1].stride / sizeof(src_type));
      int idist = static_cast<int>(src_size_stride[0].stride / sizeof(src_type));
      int ostride = static_cast<int>(dst_size_stride[ndim - 1].stride / sizeof(dst_type));
      int odist = static_cast<int>(dst_size_stride[0].stride / sizeof(dst_type));

      std::vector<int> n(rank), inembed(rank), onembed(rank);
      for (int i = 0, j = axes.is_missing() ? 0 : 1; j < ndim; ++i, ++j) {
        n[i] = static_cast<int>(src_size_stride[j].dim_size);
        inembed[i] = static_cast<int>(src_size_stride[j].dim_size);
        onembed[i] = static_cast<int>(dst_size_stride[j].dim_size);
      }

      int batch = static_cast<int>(axes.is_missing() ? 1 : src_size_stride[0].dim_size);

      self_type *self = self_type::create(ckb, kernreq, ckb_offset);
      cufftPlanMany(&self->plan, rank, n.data(), inembed.data(), istride, idist, onembed.data(), ostride, odist,
                    CUFFT_Z2Z, batch);
    }
Esempio n. 3
0
/* Create a n-d CUFFT plan
 *
 * sign = 0 (forward) or 1 (inverse)
 */
int dfft_cuda_create_nd_plan(
    cuda_plan_t *plan,
    int ndim,
    int *dim,
    int howmany,
    int *iembed,
    int istride,
    int idist,
    int *oembed,
    int ostride,
    int odist,
    int dir)
    {
    cufftResult res;
    res = cufftPlanMany(plan, ndim, dim, iembed, istride, idist, oembed,
        ostride, odist, CUFFT_C2C, howmany);
    if (res != CUFFT_SUCCESS)
        {
        printf("CUFFT Error: %d\n", res);
        return 1;
        }
    return 0;
    }
Esempio n. 4
0
/* Create a 1d CUFFT plan
 *
 * sign = 0 (forward) or 1 (inverse)
 */
int dfft_cuda_create_1d_plan(
    cuda_plan_t *plan,
    int dim,
    int howmany,
    int istride,
    int idist,
    int ostride,
    int odist,
    int dir)
    {
    int dims[1];
    dims[0] = dim;

    cufftResult res;
    res = cufftPlanMany(plan, 1, dims, dims, istride, idist, dims,
        ostride, odist, CUFFT_C2C, howmany);
    if (res != CUFFT_SUCCESS)
        {
        printf("CUFFT Error: %d\n", res);
        return 1;
        }
    return 0;
    }
Esempio n. 5
0
/**
 * Creates a 3D C2C parallel FFT plan. If data_out point to the same location as the input
 * data, then an inplace plan will be created. Otherwise the plan would be outplace.
 * @param n Integer array of size 3, corresponding to the global data size
 * @param data Input data in spatial domain
 * @param data_out Output data in frequency domain
 * @param c_comm Cartesian communicator returned by \ref accfft_create_comm
 * @param flags AccFFT flags, See \ref flags for more details.
 * @return
 */
accfft_plan_gpu*  accfft_plan_dft_3d_c2c_gpu(int * n, Complex * data_d, Complex * data_out_d, MPI_Comm c_comm,unsigned flags){
  accfft_plan_gpu *plan=new accfft_plan_gpu;
  int nprocs, procid;
  MPI_Comm_rank(c_comm, &procid);
  plan->procid=procid;
  MPI_Cart_get(c_comm,2,plan->np,plan->periods,plan->coord);
  plan->c_comm=c_comm;
  int *coord=plan->coord;
  MPI_Comm_split(c_comm,coord[0],coord[1],&plan->row_comm);
  MPI_Comm_split(c_comm,coord[1],coord[0],&plan->col_comm);
  plan->N[0]=n[0];plan->N[1]=n[1];plan->N[2]=n[2];
  int NX=n[0], NY=n[1], NZ=n[2];
  cufftResult_t cufft_error;

  plan->data_c=data_d;
  plan->data_out_c=data_out_d;
  if(data_out_d==data_d){
    plan->inplace=true;}
  else{plan->inplace=false;}

  if(plan->np[1]==1)
    plan->oneD=true;
  else
    plan->oneD=false;



  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;

  int alloc_local;
  int alloc_max=0,n_tuples=n[2]*2;

  //int isize[3],osize[3],istart[3],ostart[3];
  alloc_max=accfft_local_size_dft_c2c_gpu(n,plan->isize,plan->istart,plan->osize,plan->ostart,c_comm);
  plan->alloc_max=alloc_max;

  dfft_get_local_size_gpu(n[0],n[1],n[2],osize_0,ostart_0,c_comm);
  dfft_get_local_size_gpu(n[0],n[2],n[1],osize_1,ostart_1,c_comm);
  dfft_get_local_size_gpu(n[1],n[2],n[0],osize_2,ostart_2,c_comm);


  std::swap(osize_1[1],osize_1[2]);
  std::swap(ostart_1[1],ostart_1[2]);

  std::swap(ostart_2[1],ostart_2[2]);
  std::swap(ostart_2[0],ostart_2[1]);
  std::swap(osize_2[1],osize_2[2]);
  std::swap(osize_2[0],osize_2[1]);

  for(int i=0;i<3;i++){
    osize_1i[i]=osize_1[i];
    osize_2i[i]=osize_2[i];
    ostart_1i[i]=ostart_1[i];
    ostart_2i[i]=ostart_2[i];
  }


  // fplan_0
  {
    int f_inembed[1]={NZ};
    int f_onembed[1]={NZ};
    int idist=(NZ);
    int odist=(NZ);
    int istride=1;
    int ostride=1;
    int batch=osize_0[0]*osize_0[1];//NX;

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_0, 1, &n[2],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_Z2Z, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_0 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }
  // fplan_1
  {
    int f_inembed[1]={NY};
    int f_onembed[1]={NY};
    int idist=1;
    int odist=1;
    int istride=osize_1[2];
    int ostride=osize_1[2];
    int batch=osize_1[2];

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[1],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_Z2Z, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_1 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }
  // fplan_2
  {
    int f_inembed[1]={NX};
    int f_onembed[1]={NX};
    int idist=1;
    int odist=1;
    int istride=osize_2[1]*osize_2[2];
    int ostride=osize_2[1]*osize_2[2];
    int batch=osize_2[1]*osize_2[2];;

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_2, 1, &n[0],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_Z2Z, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_2 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }

  // 1D Decomposition
  if (plan->oneD){
    int NX=n[0],NY=n[1],NZ=n[2];


    plan->alloc_max=alloc_max;

    plan->Mem_mgr= new Mem_Mgr_gpu <double>(NX,NY,(NZ)*2,c_comm);
    plan->T_plan_2= new T_Plan_gpu <double>(NX,NY,(NZ)*2, plan->Mem_mgr,c_comm);
    plan->T_plan_2i= new T_Plan_gpu<double>(NY,NX,NZ*2, plan->Mem_mgr,c_comm);

    plan->T_plan_2->alloc_local=alloc_max;
    plan->T_plan_2i->alloc_local=alloc_max;
    plan->T_plan_1=NULL;
    plan->T_plan_1i=NULL;




    if(flags==ACCFFT_MEASURE){
      plan->T_plan_2->which_fast_method_gpu(plan->T_plan_2,(double*)data_out_d);

    }
    else{
      plan->T_plan_2->method=2;
      plan->T_plan_2->kway=2;
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    plan->T_plan_2i->method=-plan->T_plan_2->method;
    plan->T_plan_2i->kway=plan->T_plan_2->kway;
    plan->T_plan_2i->kway_async=plan->T_plan_2->kway_async;


  }// end 1d c2c

  // 2D Decomposition
  if (!plan->oneD){
    // the reaseon for n_tuples/2 is to avoid splitting of imag and real parts of complex numbers
    plan->Mem_mgr=  new Mem_Mgr_gpu<double>(n[1],n[2],2,plan->row_comm,osize_0[0],alloc_max);
    plan->T_plan_1= new  T_Plan_gpu<double>(n[1],n[2],2, plan->Mem_mgr, plan->row_comm,osize_0[0]);
    plan->T_plan_2= new  T_Plan_gpu<double>(n[0],n[1],2*osize_2[2], plan->Mem_mgr, plan->col_comm);
    plan->T_plan_2i= new T_Plan_gpu<double>(n[1],n[0],2*osize_2i[2], plan->Mem_mgr, plan->col_comm);
    plan->T_plan_1i= new T_Plan_gpu<double>(n[2],n[1],2, plan->Mem_mgr, plan->row_comm,osize_1i[0]);

    plan->T_plan_1->alloc_local=plan->alloc_max;
    plan->T_plan_2->alloc_local=plan->alloc_max;
    plan->T_plan_2i->alloc_local=plan->alloc_max;
    plan->T_plan_1i->alloc_local=plan->alloc_max;


    plan->iplan_0=NULL;
    plan->iplan_1=NULL;
    plan->iplan_2=NULL;

    int coords[2],np[2],periods[2];
    MPI_Cart_get(c_comm,2,np,periods,coords);

    if(flags==ACCFFT_MEASURE){
      if(coords[0]==0){
        plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,(double*)data_out_d,osize_0[0]);
      }
    }
    else{
      plan->T_plan_1->method=2;
      plan->T_plan_1->kway=2;
    }

    MPI_Bcast(&plan->T_plan_1->method,1, MPI_INT,0, c_comm );
    MPI_Bcast(&plan->T_plan_1->kway,1, MPI_INT,0, c_comm );
    MPI_Bcast(&plan->T_plan_1->kway_async,1, MPI::BOOL,0, c_comm );
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);


    plan->T_plan_1->method =plan->T_plan_1->method;
    plan->T_plan_2->method =plan->T_plan_1->method;
    plan->T_plan_2i->method=-plan->T_plan_1->method;
    plan->T_plan_1i->method=-plan->T_plan_1->method;

    plan->T_plan_1->kway =plan->T_plan_1->kway;
    plan->T_plan_2->kway =plan->T_plan_1->kway;
    plan->T_plan_2i->kway=plan->T_plan_1->kway;
    plan->T_plan_1i->kway=plan->T_plan_1->kway;

    plan->T_plan_1->kway_async =plan->T_plan_1->kway_async;
    plan->T_plan_2->kway_async =plan->T_plan_1->kway_async;
    plan->T_plan_2i->kway_async=plan->T_plan_1->kway_async;
    plan->T_plan_1i->kway_async=plan->T_plan_1->kway_async;


  }// end 2d c2c

  plan->c2c_plan_baked=true;
  return plan;
}// end accfft_plan_dft_c2c_gpu
Esempio n. 6
0
cufftResult WINAPI wine_cufftPlanMany(cufftHandle *plan, int rank, int *n, int *inembed, int istride, int idist, int *onembed, int ostride, int odist, cufftType type, int batch){
	WINE_TRACE("\n");
	return cufftPlanMany( plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch);
}
Esempio n. 7
0
/**
 * Creates a 3D single precision R2C parallel FFT plan. If data_out point to the same location as the input
 * data, then an inplace plan will be created. Otherwise the plan would be outplace.
 * @param n Integer array of size 3, corresponding to the global data size
 * @param data Input data in spatial domain
 * @param data_out Output data in frequency domain
 * @param c_comm Cartesian communicator returned by \ref accfft_create_comm
 * @param flags AccFFT flags, See \ref flags for more details.
 * @return
 */
accfft_plan_gpuf*  accfft_plan_dft_3d_r2c_gpuf(int * n, float * data_d,float * data_out_d, MPI_Comm c_comm,unsigned flags){
  accfft_plan_gpuf *plan=new accfft_plan_gpuf;
  int procid;
  MPI_Comm_rank(c_comm, &procid);
  plan->procid=procid;
  MPI_Cart_get(c_comm,2,plan->np,plan->periods,plan->coord);
  plan->c_comm=c_comm;
  int *coord=plan->coord;
  MPI_Comm_split(c_comm,coord[0],coord[1],&plan->row_comm);
  MPI_Comm_split(c_comm,coord[1],coord[0],&plan->col_comm);
  plan->N[0]=n[0];plan->N[1]=n[1];plan->N[2]=n[2];
  plan->data=data_d;
  plan->data_out=data_out_d;

  if(plan->np[1]==1)
    plan->oneD=true;
  else
    plan->oneD=false;


  if(data_out_d==data_d){
    plan->inplace=true;}
  else{plan->inplace=false;}

  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;

  int alloc_max=0;
  int n_tuples_i, n_tuples_o;
  //plan->inplace==true ? n_tuples=(n[2]/2+1)*2: n_tuples=n[2]*2;
  plan->inplace==true ? n_tuples_i=(n[2]/2+1)*2:  n_tuples_i=n[2];
  n_tuples_o=(n[2]/2+1)*2;

  int isize[3],osize[3],istart[3],ostart[3];
  alloc_max=accfft_local_size_dft_r2c_gpuf(n,isize,istart,osize,ostart,c_comm,plan->inplace);
  plan->alloc_max=alloc_max;

  dfft_get_local_size_gpuf(n[0],n[1],n_tuples_o,osize_0,ostart_0,c_comm);
  dfft_get_local_size_gpuf(n[0],n_tuples_o/2,n[1],osize_1,ostart_1,c_comm);
  dfft_get_local_size_gpuf(n[1],n_tuples_o/2,n[0],osize_2,ostart_2,c_comm);

  std::swap(osize_1[1],osize_1[2]);
  std::swap(ostart_1[1],ostart_1[2]);

  std::swap(ostart_2[1],ostart_2[2]);
  std::swap(ostart_2[0],ostart_2[1]);
  std::swap(osize_2[1],osize_2[2]);
  std::swap(osize_2[0],osize_2[1]);

  for(int i=0;i<3;i++){
    osize_1i[i]=osize_1[i];
    osize_2i[i]=osize_2[i];
    ostart_1i[i]=ostart_1[i];
    ostart_2i[i]=ostart_2[i];
  }

  // fplan_0
  int NX=n[0], NY=n[1], NZ=n[2];
  cufftResult_t cufft_error;
  {
    int f_inembed[1]={n_tuples_i};
    int f_onembed[1]={n_tuples_o/2};
    int idist=(n_tuples_i);
    int odist=n_tuples_o/2;
    int istride=1;
    int ostride=1;
    int batch=osize_0[0]*osize_0[1];//NX;

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_0, 1, &n[2],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_R2C, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_0 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->iplan_0, 1, &n[2],
          f_onembed, ostride, odist, // *onembed, ostride, odist
          f_inembed, istride, idist, // *inembed, istride, idist
          CUFFT_C2R, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: iplan_0 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }
  // fplan_1
  {
    int f_inembed[1]={NY};
    int f_onembed[1]={NY};
    int idist=1;
    int odist=1;
    int istride=osize_1[2];
    int ostride=osize_1[2];
    int batch=osize_1[2];

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[1],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_C2C, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_1 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }
  // fplan_2
  {
    int f_inembed[1]={NX};
    int f_onembed[1]={NX};
    int idist=1;
    int odist=1;
    int istride=osize_2[1]*osize_2[2];
    int ostride=osize_2[1]*osize_2[2];
    int batch=osize_2[1]*osize_2[2];;

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_2, 1, &n[0],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_C2C, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_2 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }


  // 1D Decomposition
  if(plan->oneD){
    int N0=n[0], N1=n[1], N2=n[2];

    plan->Mem_mgr  = new Mem_Mgr_gpu<float>(N0,N1,n_tuples_o,c_comm);
    plan->T_plan_2 = new T_Plan_gpu <float>(N0,N1,n_tuples_o, plan->Mem_mgr, c_comm);
    plan->T_plan_2i= new T_Plan_gpu <float>(N1,N0,n_tuples_o,plan->Mem_mgr, c_comm);
    plan->T_plan_1=NULL;
    plan->T_plan_1i=NULL;

    plan->alloc_max=alloc_max;
    plan->T_plan_2->alloc_local=alloc_max;
    plan->T_plan_2i->alloc_local=alloc_max;


    if(flags==ACCFFT_MEASURE){
      plan->T_plan_2->which_fast_method_gpu(plan->T_plan_2,data_out_d);
    }
    else{
      plan->T_plan_2->method=2;
      plan->T_plan_2->kway=2;
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    plan->T_plan_2->method =plan->T_plan_2->method;
    plan->T_plan_2i->method=plan->T_plan_2->method;

    plan->T_plan_2->kway =plan->T_plan_2->kway;
    plan->T_plan_2i->kway=plan->T_plan_2->kway;


  }

  // 2D Decomposition
  if (!plan->oneD){
    // the reaseon for n_tuples/2 is to avoid splitting of imag and real parts of complex numbers
    plan->Mem_mgr  =new Mem_Mgr_gpu<float>(n[1],n_tuples_o/2,2,plan->row_comm,osize_0[0],alloc_max);
    plan->T_plan_1 = new T_Plan_gpu<float>(n[1],n_tuples_o/2,2, plan->Mem_mgr, plan->row_comm,osize_0[0]);
    plan->T_plan_2 = new T_Plan_gpu<float>(n[0],n[1],osize_2[2]*2,plan->Mem_mgr, plan->col_comm);
    plan->T_plan_2i= new T_Plan_gpu<float>(n[1],n[0],osize_2i[2]*2, plan->Mem_mgr, plan->col_comm);
    plan->T_plan_1i= new T_Plan_gpu<float>(n_tuples_o/2,n[1],2, plan->Mem_mgr, plan->row_comm,osize_1i[0]);


    plan->T_plan_1->alloc_local=plan->alloc_max;
    plan->T_plan_2->alloc_local=plan->alloc_max;
    plan->T_plan_2i->alloc_local=plan->alloc_max;
    plan->T_plan_1i->alloc_local=plan->alloc_max;



    if(flags==ACCFFT_MEASURE){
      if(coord[0]==0){
        plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,data_out_d,osize_0[0]);
      }
    }
    else{
      plan->T_plan_1->method=2;
      plan->T_plan_1->kway=2;
    }

    MPI_Bcast(&plan->T_plan_1->method,1, MPI_INT,0, c_comm );
    MPI_Bcast(&plan->T_plan_1->kway,1, MPI_INT,0, c_comm );

    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);
    plan->T_plan_1->method =plan->T_plan_1->method;
    plan->T_plan_2->method =plan->T_plan_1->method;
    plan->T_plan_2i->method=plan->T_plan_1->method;
    plan->T_plan_1i->method=plan->T_plan_1->method;
    plan->T_plan_1->kway =plan->T_plan_1->kway;
    plan->T_plan_2->kway =plan->T_plan_1->kway;
    plan->T_plan_2i->kway=plan->T_plan_1->kway;
    plan->T_plan_1i->kway=plan->T_plan_1->kway;

    plan->iplan_1=-1;
    plan->iplan_2=-1;

  }

  plan->r2c_plan_baked=true;
  return plan;

}