void initialize(double *a, int *n, MPI_Comm c_comm) {
  double pi = M_PI;
  int istart[3], isize[3], osize[3], ostart[3];
  accfft_local_size_dft_r2c_gpu(n, isize, istart, osize, ostart, c_comm);

#pragma omp parallel
  {
    double X, Y, Z;
    long int ptr;
#pragma omp for
    for (int i = 0; i < isize[0]; i++) {
      for (int j = 0; j < isize[1]; j++) {
        for (int k = 0; k < isize[2]; k++) {
          X = 2 * pi / n[0] * (i + istart[0]);
          Y = 2 * pi / n[1] * (j + istart[1]);
          Z = 2 * pi / n[2] * k;
          ptr = i * isize[1] * n[2] + j * n[2] + k;
          a[ptr] = testcase(X, Y, Z);
        }
      }
    }
  }
  return;
} // end initialize
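/* Note on indexing: each rank fills only its local pencil of
 * isize[0] x isize[1] x isize[2] real values, offset by istart within the
 * global n[0] x n[1] x n[2] array. The flat index is row-major over the local
 * pencil (the last dimension is not distributed, so isize[2] == n[2] here);
 * for example, with isize = {4, 8, 16} the element (i, j, k) lives at
 * ptr = i*8*16 + j*16 + k. */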
void check_err(double *a, int *n, MPI_Comm c_comm) {
  int nprocs, procid;
  MPI_Comm_rank(c_comm, &procid);
  MPI_Comm_size(c_comm, &nprocs);

  long long int size = n[0];
  size *= n[1];
  size *= n[2];
  double pi = 4 * atan(1.0);

  int istart[3], isize[3], osize[3], ostart[3];
  accfft_local_size_dft_r2c_gpu(n, isize, istart, osize, ostart, c_comm);

  double err = 0, norm = 0;
  double X, Y, Z, numerical_r;
  long int ptr;
  for (int i = 0; i < isize[0]; i++) {
    for (int j = 0; j < isize[1]; j++) {
      for (int k = 0; k < isize[2]; k++) {
        X = 2 * pi / n[0] * (i + istart[0]);
        Y = 2 * pi / n[1] * (j + istart[1]);
        Z = 2 * pi / n[2] * k;
        ptr = i * isize[1] * n[2] + j * n[2] + k;
        numerical_r = a[ptr] / size; // undo the n[0]*n[1]*n[2] scaling of iFFT(FFT(a))
        if (numerical_r != numerical_r) // guard against NaN
          numerical_r = 0;
        err += std::abs(numerical_r - testcase(X, Y, Z));
        norm += std::abs(testcase(X, Y, Z));
      }
    }
  }

  double g_err = 0, g_norm = 0;
  MPI_Reduce(&err, &g_err, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&norm, &g_norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

  PCOUT << "\nL1 Error of iFFT(FFT(a)) - a: " << g_err << std::endl;
  PCOUT << "Relative L1 Error of iFFT(FFT(a)) - a: " << g_err / g_norm << std::endl;
  if (g_err / g_norm < 1e-10)
    PCOUT << "\nResults are CORRECT!\n\n";
  else
    PCOUT << "\nResults are NOT CORRECT!\n\n";
  return;
} // end check_err
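/* check_err expects the raw (unnormalized) result of iFFT(FFT(a)) copied back
 * to the host; it divides by n[0]*n[1]*n[2] internally before comparing against
 * testcase(). A minimal call sequence, mirroring step1_gpu below (buffer names
 * as used there):
 *
 *   accfft_execute_r2c_gpu(plan, data, data_hat);
 *   accfft_execute_c2r_gpu(plan, data_hat, data2);
 *   cudaMemcpy(data2_cpu, data2,
 *              isize[0] * isize[1] * isize[2] * sizeof(double),
 *              cudaMemcpyDeviceToHost);
 *   check_err(data2_cpu, n, c_comm);
 */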
void grad(int *n) {
  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2] = {0};
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm);

  double setup_time = 0;
  int alloc_max = 0;
  int isize[3], osize[3], istart[3], ostart[3];

  /* Get the local pencil size and the allocation size */
  alloc_max = accfft_local_size_dft_r2c_gpu(n, isize, istart, osize, ostart, c_comm);

  double *data_cpu = (double*) accfft_alloc(alloc_max);
  double *data;
  Complex *data_hat;
  cudaMalloc((void**) &data, alloc_max);
  cudaMalloc((void**) &data_hat, alloc_max);

  accfft_init();

  /* Create FFT plan */
  setup_time = -MPI_Wtime();
  accfft_plan_gpu *plan = accfft_plan_dft_3d_r2c_gpu(n, data, (double*) data_hat, c_comm, ACCFFT_MEASURE);
  setup_time += MPI_Wtime();

  /* Initialize data */
  initialize(data_cpu, n, c_comm);
  cudaMemcpy(data, data_cpu, alloc_max, cudaMemcpyHostToDevice);
  MPI_Barrier(c_comm);

  double *gradx, *grady, *gradz;
  cudaMalloc((void**) &gradx, alloc_max);
  cudaMalloc((void**) &grady, alloc_max);
  cudaMalloc((void**) &gradz, alloc_max);
  double timings[5] = {0};

  std::bitset<3> XYZ = 0;
  XYZ[0] = 1;
  XYZ[1] = 1;
  XYZ[2] = 1;
  double exec_time = -MPI_Wtime();
  accfft_grad_gpu(gradx, grady, gradz, data, plan, XYZ, timings);
  exec_time += MPI_Wtime();

  /* Check error */
  PCOUT << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << std::endl;
  PCOUT << ">>>>>>>>Checking Gradx>>>>>>>>" << std::endl;
  cudaMemcpy(data_cpu, gradx, alloc_max, cudaMemcpyDeviceToHost);
  check_err_grad(data_cpu, n, c_comm, 0);
  PCOUT << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
  PCOUT << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n" << std::endl;

  PCOUT << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << std::endl;
  PCOUT << ">>>>>>>>Checking Grady>>>>>>>>" << std::endl;
  cudaMemcpy(data_cpu, grady, alloc_max, cudaMemcpyDeviceToHost);
  check_err_grad(data_cpu, n, c_comm, 1);
  PCOUT << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
  PCOUT << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n" << std::endl;

  PCOUT << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << std::endl;
  PCOUT << ">>>>>>>>Checking Gradz>>>>>>>>" << std::endl;
  cudaMemcpy(data_cpu, gradz, alloc_max, cudaMemcpyDeviceToHost);
  check_err_grad(data_cpu, n, c_comm, 2);
  PCOUT << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
  PCOUT << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n" << std::endl;

  /* Compute some timing statistics */
  double g_setup_time, g_timings[5], g_exec_time;
  MPI_Reduce(timings, g_timings, 5, MPI_DOUBLE, MPI_MAX, 0, c_comm);
  MPI_Reduce(&setup_time, &g_setup_time, 1, MPI_DOUBLE, MPI_MAX, 0, c_comm);
  MPI_Reduce(&exec_time, &g_exec_time, 1, MPI_DOUBLE, MPI_MAX, 0, c_comm);

  PCOUT << "Timing for Grad Computation for size " << n[0] << "*" << n[1] << "*" << n[2] << std::endl;
  PCOUT << "Setup \t\t" << g_setup_time << std::endl;
  PCOUT << "Evaluation \t" << g_exec_time << std::endl;

  accfft_free(data_cpu);
  cudaFree(data);
  cudaFree(data_hat);
  MPI_Barrier(c_comm);
  cudaFree(gradx);
  cudaFree(grady);
  cudaFree(gradz);
  accfft_destroy_plan(plan);
  accfft_cleanup_gpu();
  MPI_Comm_free(&c_comm);
  PCOUT << "-------------------------------------------------------" << std::endl;
  PCOUT << "-------------------------------------------------------" << std::endl;
  PCOUT << "-------------------------------------------------------\n" << std::endl;
  return;
} // end grad
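/* The std::bitset<3> argument of accfft_grad_gpu selects which derivative
 * components are evaluated; grad() above sets all three bits to request
 * d/dx, d/dy and d/dz in a single call:
 *
 *   std::bitset<3> XYZ = 0;
 *   XYZ[0] = 1; XYZ[1] = 1; XYZ[2] = 1;   // x, y and z components
 *   accfft_grad_gpu(gradx, grady, gradz, data, plan, XYZ, timings);
 */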
void step1_gpu(int *n) {
  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2] = { 0 };
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm);

  double *data, *data_cpu;
  Complex *data_hat;
  double f_time = 0 * MPI_Wtime(), i_time = 0, setup_time = 0;
  int alloc_max = 0;

  int isize[3], osize[3], istart[3], ostart[3];
  /* Get the local pencil size and the allocation size */
  alloc_max = accfft_local_size_dft_r2c_gpu(n, isize, istart, osize, ostart, c_comm);

  data_cpu = (double*) malloc(isize[0] * isize[1] * isize[2] * sizeof(double));
  //data_hat=(Complex*)accfft_alloc(alloc_max);
  cudaMalloc((void**) &data, isize[0] * isize[1] * isize[2] * sizeof(double));
  cudaMalloc((void**) &data_hat, alloc_max);

  //accfft_init(nthreads);

  /* Create FFT plan */
  setup_time = -MPI_Wtime();
  accfft_plan_gpu *plan = accfft_plan_dft_3d_r2c_gpu(n, data, (double*) data_hat, c_comm, ACCFFT_MEASURE);
  setup_time += MPI_Wtime();

  /* Warm up */
  accfft_execute_r2c_gpu(plan, data, data_hat);
  accfft_execute_r2c_gpu(plan, data, data_hat);

  /* Initialize data */
  initialize(data_cpu, n, c_comm);
  cudaMemcpy(data, data_cpu, isize[0] * isize[1] * isize[2] * sizeof(double), cudaMemcpyHostToDevice);
  // initialize_gpu(data,n,isize,istart); // GPU version of initialize function
  MPI_Barrier(c_comm);

  /* Perform forward FFT */
  f_time -= MPI_Wtime();
  accfft_execute_r2c_gpu(plan, data, data_hat);
  f_time += MPI_Wtime();
  MPI_Barrier(c_comm);

  double *data2_cpu, *data2;
  cudaMalloc((void**) &data2, isize[0] * isize[1] * isize[2] * sizeof(double));
  data2_cpu = (double*) malloc(isize[0] * isize[1] * isize[2] * sizeof(double));

  /* Perform backward FFT */
  i_time -= MPI_Wtime();
  accfft_execute_c2r_gpu(plan, data_hat, data2);
  i_time += MPI_Wtime();

  /* Copy the result back to the CPU */
  cudaMemcpy(data2_cpu, data2, isize[0] * isize[1] * isize[2] * sizeof(double), cudaMemcpyDeviceToHost);

  /* Check error */
  double err = 0, g_err = 0;
  double norm = 0, g_norm = 0;
  for (int i = 0; i < isize[0] * isize[1] * isize[2]; ++i) {
    err += std::abs(data2_cpu[i] / n[0] / n[1] / n[2] - data_cpu[i]);
    norm += std::abs(data_cpu[i]);
  }
  MPI_Reduce(&err, &g_err, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&norm, &g_norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

  PCOUT << "\nL1 Error is " << g_err << std::endl;
  PCOUT << "Relative L1 Error is " << g_err / g_norm << std::endl;
  if (g_err / g_norm < 1e-10)
    PCOUT << "\nResults are CORRECT!\n\n";
  else
    PCOUT << "\nResults are NOT CORRECT!\n\n";

  check_err(data2_cpu, n, c_comm);

  /* Compute some timing statistics */
  double g_f_time, g_i_time, g_setup_time;
  MPI_Reduce(&f_time, &g_f_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
  MPI_Reduce(&i_time, &g_i_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
  MPI_Reduce(&setup_time, &g_setup_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

  PCOUT << "GPU Timing for FFT of size " << n[0] << "*" << n[1] << "*" << n[2] << std::endl;
  PCOUT << "Setup \t" << g_setup_time << std::endl;
  PCOUT << "FFT \t" << g_f_time << std::endl;
  PCOUT << "IFFT \t" << g_i_time << std::endl;

  free(data_cpu);
  free(data2_cpu);
  cudaFree(data);
  cudaFree(data_hat);
  cudaFree(data2);
  accfft_destroy_plan_gpu(plan);
  accfft_cleanup_gpu();
  MPI_Comm_free(&c_comm);
  return;
} // end step1_gpu
/**
 * Creates a 3D R2C parallel FFT plan. If data_out_d points to the same location as the
 * input data, an in-place plan is created; otherwise the plan is out-of-place.
 * @param n Integer array of size 3, corresponding to the global data size.
 * @param data_d Input data in the spatial domain.
 * @param data_out_d Output data in the frequency domain.
 * @param c_comm Cartesian communicator returned by \ref accfft_create_comm.
 * @param flags AccFFT flags, see \ref flags for more details.
 * @return The newly created plan, or NULL if a cuFFT plan could not be created.
 */
accfft_plan_gpu* accfft_plan_dft_3d_r2c_gpu(int *n, double *data_d, double *data_out_d,
    MPI_Comm c_comm, unsigned flags) {
  accfft_plan_gpu *plan = new accfft_plan_gpu;
  int procid;
  MPI_Comm_rank(c_comm, &procid);
  plan->procid = procid;
  MPI_Cart_get(c_comm, 2, plan->np, plan->periods, plan->coord);
  plan->c_comm = c_comm;
  int *coord = plan->coord;
  MPI_Comm_split(c_comm, coord[0], coord[1], &plan->row_comm);
  MPI_Comm_split(c_comm, coord[1], coord[0], &plan->col_comm);
  plan->N[0] = n[0];
  plan->N[1] = n[1];
  plan->N[2] = n[2];

  plan->data = data_d;
  plan->data_out = data_out_d;
  if (plan->np[1] == 1)
    plan->oneD = true;
  else
    plan->oneD = false;

  if (data_out_d == data_d) {
    plan->inplace = true;
  } else {
    plan->inplace = false;
  }

  int *osize_0 = plan->osize_0, *ostart_0 = plan->ostart_0;
  int *osize_1 = plan->osize_1, *ostart_1 = plan->ostart_1;
  int *osize_2 = plan->osize_2, *ostart_2 = plan->ostart_2;
  int *osize_1i = plan->osize_1i, *ostart_1i = plan->ostart_1i;
  int *osize_2i = plan->osize_2i, *ostart_2i = plan->ostart_2i;

  int alloc_max = 0;
  int n_tuples_i, n_tuples_o;
  // An in-place transform pads the last dimension to 2*(n[2]/2+1) reals
  // (see the padding note after this function).
  if (plan->inplace)
    n_tuples_i = (n[2] / 2 + 1) * 2;
  else
    n_tuples_i = n[2];
  n_tuples_o = (n[2] / 2 + 1) * 2;

  alloc_max = accfft_local_size_dft_r2c_gpu(n, plan->isize, plan->istart, plan->osize,
      plan->ostart, c_comm, plan->inplace);
  plan->alloc_max = alloc_max;

  dfft_get_local_size_gpu(n[0], n[1], n_tuples_o, osize_0, ostart_0, c_comm);
  dfft_get_local_size_gpu(n[0], n_tuples_o / 2, n[1], osize_1, ostart_1, c_comm);
  dfft_get_local_size_gpu(n[1], n_tuples_o / 2, n[0], osize_2, ostart_2, c_comm);

  std::swap(osize_1[1], osize_1[2]);
  std::swap(ostart_1[1], ostart_1[2]);

  std::swap(ostart_2[1], ostart_2[2]);
  std::swap(ostart_2[0], ostart_2[1]);

  std::swap(osize_2[1], osize_2[2]);
  std::swap(osize_2[0], osize_2[1]);

  for (int i = 0; i < 3; i++) {
    osize_1i[i] = osize_1[i];
    osize_2i[i] = osize_2[i];
    ostart_1i[i] = ostart_1[i];
    ostart_2i[i] = ostart_2[i];
  }

  // fplan_0
  int NX = n[0], NY = n[1], NZ = n[2];
  cufftResult_t cufft_error;
  {
    int f_inembed[1] = {n_tuples_i};
    int f_onembed[1] = {n_tuples_o / 2};
    int idist = n_tuples_i;
    int odist = n_tuples_o / 2;
    int istride = 1;
    int ostride = 1;
    int batch = osize_0[0] * osize_0[1];

    if (batch != 0) {
      cufft_error = cufftPlanMany(&plan->fplan_0, 1, &n[2],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_D2Z, batch);
      if (cufft_error != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: fplan_0 creation failed %d \n", cufft_error);
        return NULL;
      }
      if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "Cuda error: Failed after fplan_0 creation\n");
        return NULL;
      }
    }
    if (batch != 0) {
      cufft_error = cufftPlanMany(&plan->iplan_0, 1, &n[2],
          f_onembed, ostride, odist, // *onembed, ostride, odist
          f_inembed, istride, idist, // *inembed, istride, idist
          CUFFT_Z2D, batch);
      if (cufft_error != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: iplan_0 creation failed %d \n", cufft_error);
        return NULL;
      }
      if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "Cuda error: Failed after iplan_0 creation\n");
        return NULL;
      }
    }
  }

  // fplan_1
  {
    int f_inembed[1] = {NY};
    int f_onembed[1] = {NY};
    int idist = 1;
    int odist = 1;
    int istride = osize_1[2];
    int ostride = osize_1[2];
    int batch = osize_1[2];

    if (batch != 0) {
      cufft_error = cufftPlanMany(&plan->fplan_1, 1, &n[1],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_Z2Z, batch);
      if (cufft_error != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: fplan_1 creation failed %d \n", cufft_error);
        return NULL;
      }
      if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "Cuda error: Failed after fplan_1 creation\n");
        return NULL;
      }
    }
  }

  // fplan_2
  {
    int f_inembed[1] = {NX};
    int f_onembed[1] = {NX};
    int idist = 1;
    int odist = 1;
    int istride = osize_2[1] * osize_2[2];
    int ostride = osize_2[1] * osize_2[2];
    int batch = osize_2[1] * osize_2[2];

    if (batch != 0) {
      cufft_error = cufftPlanMany(&plan->fplan_2, 1, &n[0],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_Z2Z, batch);
      if (cufft_error != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: fplan_2 creation failed %d \n", cufft_error);
        return NULL;
      }
      if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "Cuda error: Failed after fplan_2 creation\n");
        return NULL;
      }
    }
  }

  // 1D decomposition
  if (plan->oneD) {
    int N0 = n[0], N1 = n[1], N2 = n[2];
    plan->Mem_mgr = new Mem_Mgr_gpu<double>(N0, N1, n_tuples_o, c_comm);
    plan->T_plan_2 = new T_Plan_gpu<double>(N0, N1, n_tuples_o, plan->Mem_mgr, c_comm);
    plan->T_plan_2i = new T_Plan_gpu<double>(N1, N0, n_tuples_o, plan->Mem_mgr, c_comm);
    plan->T_plan_1 = NULL;
    plan->T_plan_1i = NULL;

    plan->alloc_max = alloc_max;
    plan->T_plan_2->alloc_local = alloc_max;
    plan->T_plan_2i->alloc_local = alloc_max;

    if (flags == ACCFFT_MEASURE) {
      plan->T_plan_2->which_fast_method_gpu(plan->T_plan_2, data_out_d);
    } else {
      plan->T_plan_2->method = 2;
      plan->T_plan_2->kway = 2;
    }
    checkCuda_accfft(cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    plan->T_plan_2i->method = -plan->T_plan_2->method;
    plan->T_plan_2i->kway = plan->T_plan_2->kway;
    plan->T_plan_2i->kway_async = plan->T_plan_2->kway_async;
  } // end 1d r2c

  // 2D decomposition
  if (!plan->oneD) {
    // the reason for n_tuples/2 is to avoid splitting the real and imaginary parts of complex numbers
    plan->Mem_mgr = new Mem_Mgr_gpu<double>(n[1], n_tuples_o / 2, 2, plan->row_comm, osize_0[0], alloc_max);
    plan->T_plan_1 = new T_Plan_gpu<double>(n[1], n_tuples_o / 2, 2, plan->Mem_mgr, plan->row_comm, osize_0[0]);
    plan->T_plan_2 = new T_Plan_gpu<double>(n[0], n[1], osize_2[2] * 2, plan->Mem_mgr, plan->col_comm);
    plan->T_plan_2i = new T_Plan_gpu<double>(n[1], n[0], osize_2i[2] * 2, plan->Mem_mgr, plan->col_comm);
    plan->T_plan_1i = new T_Plan_gpu<double>(n_tuples_o / 2, n[1], 2, plan->Mem_mgr, plan->row_comm, osize_1i[0]);

    plan->T_plan_1->alloc_local = plan->alloc_max;
    plan->T_plan_2->alloc_local = plan->alloc_max;
    plan->T_plan_2i->alloc_local = plan->alloc_max;
    plan->T_plan_1i->alloc_local = plan->alloc_max;

    if (flags == ACCFFT_MEASURE) {
      if (coord[0] == 0) {
        plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1, data_out_d, osize_0[0]);
      }
    } else {
      plan->T_plan_1->method = 2;
      plan->T_plan_1->kway = 2;
    }
    MPI_Bcast(&plan->T_plan_1->method, 1, MPI_INT, 0, c_comm);
    MPI_Bcast(&plan->T_plan_1->kway, 1, MPI_INT, 0, c_comm);
    MPI_Bcast(&plan->T_plan_1->kway_async, 1, MPI::BOOL, 0, c_comm);
    checkCuda_accfft(cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    // Propagate the method selected for T_plan_1 to the other transpose plans
    // (the negative sign marks the inverse direction).
    plan->T_plan_2->method = plan->T_plan_1->method;
    plan->T_plan_2i->method = -plan->T_plan_1->method;
    plan->T_plan_1i->method = -plan->T_plan_1->method;

    plan->T_plan_2->kway = plan->T_plan_1->kway;
    plan->T_plan_2i->kway = plan->T_plan_1->kway;
    plan->T_plan_1i->kway = plan->T_plan_1->kway;

    plan->T_plan_2->kway_async = plan->T_plan_1->kway_async;
    plan->T_plan_2i->kway_async = plan->T_plan_1->kway_async;
    plan->T_plan_1i->kway_async = plan->T_plan_1->kway_async;

    plan->iplan_1 = -1;
    plan->iplan_2 = -1;
  } // end 2d r2c

  plan->r2c_plan_baked = true;
  return plan;
} // end accfft_plan_dft_3d_r2c_gpu
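/* Padding note: the complex output of a real-to-complex transform has
 * n[2]/2 + 1 entries along the last dimension, i.e. 2*(n[2]/2+1) doubles.
 * An in-place plan therefore pads the real input to n_tuples_i = 2*(n[2]/2+1)
 * reals so the complex result fits over it, while an out-of-place plan keeps
 * n_tuples_i = n[2]. For example, n[2] = 8 gives 10 padded reals in place
 * versus 8 reals (and 5 complex outputs) out of place. */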
int accfft_local_size_dft_r2c_gpu_t(int *n, int *isize, int *istart, int *osize, int *ostart,
    MPI_Comm c_comm) {
  return accfft_local_size_dft_r2c_gpu(n, isize, istart, osize, ostart, c_comm);
}
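/* A minimal allocation sketch using the local-size query above (this mirrors
 * step1_gpu; the buffer names are illustrative):
 *
 *   int isize[3], istart[3], osize[3], ostart[3];
 *   int alloc_max = accfft_local_size_dft_r2c_gpu(n, isize, istart, osize, ostart, c_comm);
 *   double *data;       // real input, local pencil
 *   Complex *data_hat;  // complex output, needs alloc_max bytes
 *   cudaMalloc((void**) &data, isize[0] * isize[1] * isize[2] * sizeof(double));
 *   cudaMalloc((void**) &data_hat, alloc_max);
 */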