C++ (Cpp) accfft_create_comm Beispiele

Beispiel #1

0

Datei anzeigen

Datei: step5f.cpp Projekt: jeffhammond/accfft

void grad(int *n, int nthreads) {
  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2]={0};
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm);

  float *data;
  Complexf *data_hat;
  double f_time=0*MPI_Wtime(),i_time=0, setup_time=0;
  int alloc_max=0;

  int isize[3],osize[3],istart[3],ostart[3];
  /* Get the local pencil size and the allocation size */
  alloc_max=accfft_local_size_dft_r2cf(n,isize,istart,osize,ostart,c_comm);

  //data=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float));
  data=(float*)accfft_alloc(alloc_max);
  data_hat=(Complexf*)accfft_alloc(alloc_max);

  accfft_init(nthreads);

  /* Create FFT plan */
  setup_time=-MPI_Wtime();
  accfft_planf * plan=accfft_plan_dft_3d_r2cf(n,data,(float*)data_hat,c_comm,ACCFFT_MEASURE);
  setup_time+=MPI_Wtime();


  /*  Initialize data */
  initialize(data,n,c_comm);
  MPI_Barrier(c_comm);

  float * gradx=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float));
  float * grady=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float));
  float * gradz=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float));
  double timings[5]={0};

  std::bitset<3> XYZ=0;
  XYZ[0]=1;
  XYZ[1]=1;
  XYZ[2]=1;
  double exec_time=-MPI_Wtime();
  accfft_gradf(gradx,grady,gradz,data,plan,&XYZ,timings);
  exec_time+=MPI_Wtime();
  /* Check err*/
  PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
  PCOUT<<">>>>>>>>Checking Gradx>>>>>>>>"<<std::endl;
  check_err_grad(gradx,n,c_comm,0);
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;

  PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
  PCOUT<<">>>>>>>>Checking Grady>>>>>>>>"<<std::endl;
  check_err_grad(grady,n,c_comm,1);
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;

  PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
  PCOUT<<">>>>>>>>Checking Gradz>>>>>>>>"<<std::endl;
  check_err_grad(gradz,n,c_comm,2);
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;

  /* Compute some timings statistics */
  double g_setup_time,g_timings[5],g_exec_time;

  MPI_Reduce(timings,g_timings,5, MPI_DOUBLE, MPI_MAX,0, c_comm);
  MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);
  MPI_Reduce(&exec_time,&g_exec_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);

  PCOUT<<"Timing for Grad Computation for size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
  PCOUT<<"Setup \t\t"<<g_setup_time<<std::endl;
  PCOUT<<"Evaluation \t"<<g_exec_time<<std::endl;

  accfft_free(data);
  accfft_free(data_hat);
  MPI_Barrier(c_comm);
  accfft_free(gradx);
  accfft_free(grady);
  accfft_free(gradz);
  accfft_destroy_plan(plan);
  accfft_cleanup();
  MPI_Comm_free(&c_comm);
  PCOUT<<"-------------------------------------------------------"<<std::endl;
  PCOUT<<"-------------------------------------------------------"<<std::endl;
  PCOUT<<"-------------------------------------------------------\n"<<std::endl;
  return ;

} // end grad

Beispiel #2

0

Datei anzeigen

Datei: step2f.cpp Projekt: spiralgen/accfft-scratch

void step2(int *n, int nthreads) {
    int nprocs, procid;
    MPI_Comm_rank(MPI_COMM_WORLD, &procid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    /* Create Cartesian Communicator */
    int c_dims[2];
    MPI_Comm c_comm;
    accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm);

    float *data;
    double f_time=0*MPI_Wtime(),i_time=0, setup_time=0;
    int alloc_max=0;

    int isize[3],osize[3],istart[3],ostart[3];
    /* Get the local pencil size and the allocation size */
    alloc_max=accfft_local_size_dft_r2cf(n,isize,istart,osize,ostart,c_comm);

    data=(float*)accfft_alloc(alloc_max);

    accfft_init(nthreads);

    setup_time=-MPI_Wtime();
    /* Create FFT plan */
    accfft_planf * plan=accfft_plan_dft_3d_r2cf(n,data,data,c_comm,ACCFFT_MEASURE); // note that in and out are both data -> inplace plan
    setup_time+=MPI_Wtime();

    /* Warm Up */
    accfft_execute_r2cf(plan,data,(Complexf*)data);
    accfft_execute_r2cf(plan,data,(Complexf*)data);

    /*  Initialize data */
    initialize(data,n,c_comm); // special initialize plan for inplace transform -> difference in padding
    MPI_Barrier(c_comm);

    /* Perform forward FFT */
    f_time-=MPI_Wtime();
    accfft_execute_r2cf(plan,data,(Complexf*)data);
    f_time+=MPI_Wtime();

    MPI_Barrier(c_comm);

    /* Perform backward FFT */
    i_time-=MPI_Wtime();
    accfft_execute_c2rf(plan,(Complexf*)data,data);
    i_time+=MPI_Wtime();

    /* Check Error */
    check_err(data,n,c_comm);

    /* Compute some timings statistics */
    double g_f_time, g_i_time, g_setup_time;
    MPI_Reduce(&f_time,&g_f_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);
    MPI_Reduce(&i_time,&g_i_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);
    MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);

    PCOUT<<"Timing for FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
    PCOUT<<"Setup \t"<<g_setup_time<<std::endl;
    PCOUT<<"FFT \t"<<g_f_time<<std::endl;
    PCOUT<<"IFFT \t"<<g_i_time<<std::endl;

    accfft_free(data);
    accfft_destroy_plan(plan);
    accfft_cleanup();
    MPI_Comm_free(&c_comm);
    return ;

} // end step2

Beispiel #3

0

Datei anzeigen

Datei: step5_gpu.cpp Projekt: spiralgen/accfft-scratch

void divergence(int *n) {
  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2]={0};
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm);

  double f_time=0*MPI_Wtime(),i_time=0, setup_time=0;
  int alloc_max=0;

  int isize[3],osize[3],istart[3],ostart[3];
  /* Get the local pencil size and the allocation size */
  alloc_max=accfft_local_size_dft_r2c_gpu(n,isize,istart,osize,ostart,c_comm);

  //data=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
  double * data_cpu=(double*)accfft_alloc(alloc_max);
  double* data;
  Complex* data_hat;
  cudaMalloc((void**) &data    , alloc_max);
  cudaMalloc((void**) &data_hat, alloc_max);


  accfft_init();

  /* Create FFT plan */
  setup_time=-MPI_Wtime();
  accfft_plan_gpu * plan=accfft_plan_dft_3d_r2c_gpu(n,data,(double*)data_hat,c_comm,ACCFFT_MEASURE);
  setup_time+=MPI_Wtime();


  /*  Initialize data */
  initialize(data_cpu,n,c_comm);
  cudaMemcpy(data, data_cpu,alloc_max, cudaMemcpyHostToDevice);

  MPI_Barrier(c_comm);

  double * gradx,*grady, *gradz, *divergence;
  cudaMalloc((void**) &gradx     , alloc_max);
  cudaMalloc((void**) &grady     , alloc_max);
  cudaMalloc((void**) &gradz     , alloc_max);
  cudaMalloc((void**) &divergence, alloc_max);
  double timings[5]={0};


  std::bitset<3> XYZ=0;
  XYZ[0]=1;
  XYZ[1]=1;
  XYZ[2]=1;
  double exec_time=-MPI_Wtime();
  accfft_grad_gpu(gradx,grady,gradz,data,plan,XYZ,timings);
  accfft_divergence_gpu(divergence,gradx,grady,gradz,plan,timings);
  exec_time+=MPI_Wtime();

  PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
  PCOUT<<">>>>>Checking Divergence>>>>>>"<<std::endl;
  cudaMemcpy(data_cpu, divergence, alloc_max, cudaMemcpyDeviceToHost);
  check_err_laplace(data_cpu,n,c_comm);
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;


  /* Compute some timings statistics */
  double g_setup_time,g_timings[5],g_exec_time;

  MPI_Reduce(timings,g_timings,5, MPI_DOUBLE, MPI_MAX,0, c_comm);
  MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);
  MPI_Reduce(&exec_time,&g_exec_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);

  PCOUT<<"Timing for Grad Computation for size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
  PCOUT<<"Setup \t\t"<<g_setup_time<<std::endl;
  PCOUT<<"Evaluation \t"<<g_exec_time<<std::endl;

  accfft_free(data_cpu);
  cudaFree(data);
  cudaFree(data_hat);
  MPI_Barrier(c_comm);
  cudaFree(gradx);
  cudaFree(grady);
  cudaFree(gradz);
  accfft_destroy_plan(plan);
  accfft_cleanup_gpu();
  MPI_Comm_free(&c_comm);
  PCOUT<<"-------------------------------------------------------"<<std::endl;
  PCOUT<<"-------------------------------------------------------"<<std::endl;
  PCOUT<<"-------------------------------------------------------\n"<<std::endl;
  return ;

} // end divergence

Beispiel #4

0

Datei anzeigen

Datei: step1_gpu.cpp Projekt: amirgholami/accfft

void step1_gpu(int *n) {

	int nprocs, procid;
	MPI_Comm_rank(MPI_COMM_WORLD, &procid);
	MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

	/* Create Cartesian Communicator */
	int c_dims[2] = { 0 };
	MPI_Comm c_comm;
	accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm);

	double *data, *data_cpu;
	Complex *data_hat;
	double f_time = 0 * MPI_Wtime(), i_time = 0, setup_time = 0;
	int alloc_max = 0;

	int isize[3], osize[3], istart[3], ostart[3];
	/* Get the local pencil size and the allocation size */
	alloc_max = accfft_local_size_dft_r2c_gpu(n, isize, istart, osize, ostart,
			c_comm);

	data_cpu = (double*) malloc(
			isize[0] * isize[1] * isize[2] * sizeof(double));
	//data_hat=(Complex*)accfft_alloc(alloc_max);
	cudaMalloc((void**) &data, isize[0] * isize[1] * isize[2] * sizeof(double));
	cudaMalloc((void**) &data_hat, alloc_max);

	//accfft_init(nthreads);

	/* Create FFT plan */
	setup_time = -MPI_Wtime();
	accfft_plan_gpu * plan = accfft_plan_dft_3d_r2c_gpu(n, data,
			(double*) data_hat, c_comm, ACCFFT_MEASURE);
	setup_time += MPI_Wtime();

	/* Warm Up */
	accfft_execute_r2c_gpu(plan, data, data_hat);
	accfft_execute_r2c_gpu(plan, data, data_hat);

	/*  Initialize data */
	initialize(data_cpu, n, c_comm);
	cudaMemcpy(data, data_cpu, isize[0] * isize[1] * isize[2] * sizeof(double),
			cudaMemcpyHostToDevice);
	// initialize_gpu(data,n,isize,istart); // GPU version of initialize function

	MPI_Barrier(c_comm);

	/* Perform forward FFT */
	f_time -= MPI_Wtime();
	accfft_execute_r2c_gpu(plan, data, data_hat);
	f_time += MPI_Wtime();

	MPI_Barrier(c_comm);

	double *data2_cpu, *data2;
	cudaMalloc((void**) &data2,
			isize[0] * isize[1] * isize[2] * sizeof(double));
	data2_cpu = (double*) malloc(
			isize[0] * isize[1] * isize[2] * sizeof(double));

	/* Perform backward FFT */
	i_time -= MPI_Wtime();
	accfft_execute_c2r_gpu(plan, data_hat, data2);
	i_time += MPI_Wtime();

	/* copy back results on CPU */
	cudaMemcpy(data2_cpu, data2,
			isize[0] * isize[1] * isize[2] * sizeof(double),
			cudaMemcpyDeviceToHost);

	/* Check Error */
	double err = 0, g_err = 0;
	double norm = 0, g_norm = 0;
	for (int i = 0; i < isize[0] * isize[1] * isize[2]; ++i) {
		err += std::abs(data2_cpu[i] / n[0] / n[1] / n[2] - data_cpu[i]);
		norm += std::abs(data_cpu[i]);
	}
	MPI_Reduce(&err, &g_err, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
	MPI_Reduce(&norm, &g_norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

	PCOUT << "\nL1 Error is " << g_err << std::endl;
	PCOUT << "Relative L1 Error is " << g_err / g_norm << std::endl;
	if (g_err / g_norm < 1e-10)
		PCOUT << "\nResults are CORRECT!\n\n";
	else
		PCOUT << "\nResults are NOT CORRECT!\n\n";

	check_err(data2_cpu, n, c_comm);

	/* Compute some timings statistics */
	double g_f_time, g_i_time, g_setup_time;
	MPI_Reduce(&f_time, &g_f_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	MPI_Reduce(&i_time, &g_i_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	MPI_Reduce(&setup_time, &g_setup_time, 1, MPI_DOUBLE, MPI_MAX, 0,
			MPI_COMM_WORLD);

	PCOUT << "GPU Timing for FFT of size " << n[0] << "*" << n[1] << "*" << n[2]
			<< std::endl;
	PCOUT << "Setup \t" << g_setup_time << std::endl;
	PCOUT << "FFT \t" << g_f_time << std::endl;
	PCOUT << "IFFT \t" << g_i_time << std::endl;

	free(data_cpu);
	free(data2_cpu);
	cudaFree(data);
	cudaFree(data_hat);
	cudaFree(data2);
	accfft_destroy_plan_gpu(plan);
	accfft_cleanup_gpu();
	MPI_Comm_free(&c_comm);
	return;

} // end step1_gpu

Beispiel #5

0

Datei anzeigen

Datei: step3f.cpp Projekt: jeffhammond/accfft

void step3(int *n, int nthreads) {
  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2]={0};
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm);

#ifdef INPLACE
  Complexf *data;
#else
  Complexf *data;
  Complexf *data_hat;
#endif
  double f_time=0*MPI_Wtime(),i_time=0, setup_time=0;
  int alloc_max=0;

  int isize[3],osize[3],istart[3],ostart[3];
  /* Get the local pencil size and the allocation size */
  alloc_max=accfft_local_size_dft_c2cf(n,isize,istart,osize,ostart,c_comm);

#ifdef INPLACE
  data=(Complexf*)accfft_alloc(alloc_max);
#else
  data=(Complexf*)accfft_alloc(isize[0]*isize[1]*isize[2]*2*sizeof(float));
  data_hat=(Complexf*)accfft_alloc(alloc_max);
#endif

  accfft_init(nthreads);

  /* Create FFT plan */
  setup_time=-MPI_Wtime();
#ifdef INPLACE
  accfft_planf * plan=accfft_plan_dft_3d_c2cf(n,data,data,c_comm,ACCFFT_MEASURE);
#else
  accfft_planf * plan=accfft_plan_dft_3d_c2cf(n,data,data_hat,c_comm,ACCFFT_MEASURE);
#endif
  setup_time+=MPI_Wtime();

  /*  Initialize data */
  initialize(data,n,c_comm);
  MPI_Barrier(c_comm);

  /* Perform forward FFT */
  f_time-=MPI_Wtime();
#ifdef INPLACE
  accfft_execute_c2cf(plan,ACCFFT_FORWARD,data,data);
#else
  accfft_execute_c2cf(plan,ACCFFT_FORWARD,data,data_hat);
#endif
  f_time+=MPI_Wtime();

  MPI_Barrier(c_comm);


  /* Perform backward FFT */
#ifdef INPLACE
  i_time-=MPI_Wtime();
  accfft_execute_c2cf(plan,ACCFFT_BACKWARD,data,data);
  i_time+=MPI_Wtime();
#else
  Complexf * data2=(Complexf*)accfft_alloc(isize[0]*isize[1]*isize[2]*2*sizeof(float));
  i_time-=MPI_Wtime();
  accfft_execute_c2cf(plan,ACCFFT_BACKWARD,data_hat,data2);
  i_time+=MPI_Wtime();
#endif

  /* Check Error */
#ifdef INPLACE
  check_err(data,n,c_comm);
#else
  check_err(data2,n,c_comm);
#endif

  /* Compute some timings statistics */
  double g_f_time, g_i_time, g_setup_time;
  MPI_Reduce(&f_time,&g_f_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);
  MPI_Reduce(&i_time,&g_i_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);
  MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);

#ifdef INPLACE
  PCOUT<<"Timing for Inplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
#else
  PCOUT<<"Timing for Outplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
#endif
  PCOUT<<"Setup \t"<<g_setup_time<<std::endl;
  PCOUT<<"FFT \t"<<g_f_time<<std::endl;
  PCOUT<<"IFFT \t"<<g_i_time<<std::endl;

  accfft_free(data);
#ifndef INPLACE
  accfft_free(data_hat);
  accfft_free(data2);
#endif
  accfft_destroy_plan(plan);
  accfft_cleanup();
  MPI_Comm_free(&c_comm);
  return ;

} // end step3

Beispiel #6

0

Datei anzeigen

Datei: step2_gpuf.cpp Projekt: amirgholami/accfft

void step2_gpu(int *n) {

	int nprocs, procid;
	MPI_Comm_rank(MPI_COMM_WORLD, &procid);
	MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

	/* Create Cartesian Communicator */
	int c_dims[2];
	MPI_Comm c_comm;
	accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm);

	float *data, *data_cpu;
	Complexf *data_hat;
	double f_time = 0 * MPI_Wtime(), i_time = 0, setup_time = 0;
	int alloc_max = 0;

	int isize[3], osize[3], istart[3], ostart[3];
	/* Get the local pencil size and the allocation size */
	alloc_max = accfft_local_size_dft_r2c_gpuf(n, isize, istart, osize, ostart,
			c_comm);

	/* Note that both need to be allocated by alloc_max because of inplace transform*/
	data_cpu = (float*) malloc(alloc_max);
	cudaMalloc((void**) &data, alloc_max);

	//accfft_init(nthreads);

	/* Create FFT plan */
	setup_time = -MPI_Wtime();
	accfft_plan_gpuf * plan = accfft_plan_dft_3d_r2c_gpuf(n, data, data, c_comm,
			ACCFFT_MEASURE);
	setup_time += MPI_Wtime();

	/* Warm Up */
	accfft_execute_r2c_gpuf(plan, data, (Complexf*) data);
	accfft_execute_r2c_gpuf(plan, data, (Complexf*) data);

	/* Initialize data */
	initialize_gpu(data, n, isize, istart);
	MPI_Barrier(c_comm);

	/* Perform forward FFT */
	f_time -= MPI_Wtime();
	accfft_execute_r2c_gpuf(plan, data, (Complexf*) data);
	f_time += MPI_Wtime();
	MPI_Barrier(c_comm);

	/* Perform backward FFT */
	i_time -= MPI_Wtime();
	accfft_execute_c2r_gpuf(plan, (Complexf*) data, data);
	i_time += MPI_Wtime();

	/* copy back results on CPU */
	cudaMemcpy(data_cpu, data, alloc_max, cudaMemcpyDeviceToHost);

	/* Check Error */
	check_err(data_cpu, n, c_comm);

	/* Compute some timings statistics */
	double g_f_time, g_i_time, g_setup_time;
	MPI_Reduce(&f_time, &g_f_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	MPI_Reduce(&i_time, &g_i_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	MPI_Reduce(&setup_time, &g_setup_time, 1, MPI_DOUBLE, MPI_MAX, 0,
			MPI_COMM_WORLD);

	PCOUT << "GPU Timing for FFT of size " << n[0] << "*" << n[1] << "*" << n[2]
			<< std::endl;
	PCOUT << "Setup \t" << g_setup_time << std::endl;
	PCOUT << "FFT \t" << g_f_time << std::endl;
	PCOUT << "IFFT \t" << g_i_time << std::endl;

	free(data_cpu);
	cudaFree(data);
	accfft_destroy_plan_gpu(plan);
	accfft_cleanup_gpuf();
	MPI_Comm_free(&c_comm);
	return;

} // end step2_gpu

Beispiel #7

0

Datei anzeigen

Datei: step3_gpuf.cpp Projekt: jeffhammond/accfft

void step3_gpu(int *n) {

  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2]={0};
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm);

  Complexf *data, *data_cpu;
  Complexf *data_hat;
  double f_time=0*MPI_Wtime(),i_time=0, setup_time=0;
  int alloc_max=0;

  int isize[3],osize[3],istart[3],ostart[3];
  /* Get the local pencil size and the allocation size */
  alloc_max=accfft_local_size_dft_c2c_gpuf(n,isize,istart,osize,ostart,c_comm);

#ifdef INPLACE
  data_cpu=(Complexf*)malloc(alloc_max);
  cudaMalloc((void**) &data, alloc_max);
#else
  data_cpu=(Complexf*)malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float));
  cudaMalloc((void**) &data,isize[0]*isize[1]*isize[2]*2*sizeof(float));
  cudaMalloc((void**) &data_hat, alloc_max);
#endif

  //accfft_init(nthreads);
  setup_time=-MPI_Wtime();

  /* Create FFT plan */
#ifdef INPLACE
  accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data,c_comm,ACCFFT_MEASURE);
#else
  accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data_hat,c_comm,ACCFFT_MEASURE);
#endif
  setup_time+=MPI_Wtime();

  /* Warmup Runs */
#ifdef INPLACE
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data);
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data);
#else
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat);
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat);
#endif

  /*  Initialize data */
  initialize(data_cpu,n,c_comm);
#ifdef INPLACE
  cudaMemcpy(data, data_cpu,alloc_max, cudaMemcpyHostToDevice);
#else
  cudaMemcpy(data, data_cpu,isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyHostToDevice);
#endif

  MPI_Barrier(c_comm);


  /* Perform forward FFT */
  f_time-=MPI_Wtime();
#ifdef INPLACE
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data);
#else
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat);
#endif
  f_time+=MPI_Wtime();

  MPI_Barrier(c_comm);

#ifndef INPLACE
  Complexf *data2_cpu, *data2;
  cudaMalloc((void**) &data2, isize[0]*isize[1]*isize[2]*2*sizeof(float));
  data2_cpu=(Complexf*) malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float));
#endif

  /* Perform backward FFT */
  i_time-=MPI_Wtime();
#ifdef INPLACE
  accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data,data);
#else
  accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data_hat,data2);
#endif
  i_time+=MPI_Wtime();

  /* copy back results on CPU and check error*/
#ifdef INPLACE
  cudaMemcpy(data_cpu, data, alloc_max, cudaMemcpyDeviceToHost);
  check_err(data_cpu,n,c_comm);
#else
  cudaMemcpy(data2_cpu, data2, isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyDeviceToHost);
  check_err(data2_cpu,n,c_comm);
#endif


  /* Compute some timings statistics */
  double g_f_time, g_i_time, g_setup_time;
  MPI_Reduce(&f_time,&g_f_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);
  MPI_Reduce(&i_time,&g_i_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);
  MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);

#ifdef INPLACE
  PCOUT<<"GPU Timing for Inplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
#else
  PCOUT<<"GPU Timing for Outplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
#endif
  PCOUT<<"Setup \t"<<g_setup_time<<std::endl;
  PCOUT<<"FFT \t"<<g_f_time<<std::endl;
  PCOUT<<"IFFT \t"<<g_i_time<<std::endl;

  MPI_Barrier(c_comm);
  cudaDeviceSynchronize();
  free(data_cpu);
  cudaFree(data);
#ifndef INPLACE
  cudaFree(data_hat);
  free(data2_cpu);
  cudaFree(data2);
#endif
  accfft_destroy_plan_gpu(plan);
  accfft_cleanup_gpuf();
  MPI_Comm_free(&c_comm);
  return ;

} // end step3_gpu

Beispiel #8

0

Datei anzeigen

Datei: step5.cpp Projekt: spiralgen/accfft-scratch

void laplace(int *n, int nthreads) {
  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2]={0};
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm);

  double *data;
  Complex *data_hat;
  double f_time=0*MPI_Wtime(),i_time=0, setup_time=0;
  int alloc_max=0;

  int isize[3],osize[3],istart[3],ostart[3];
  /* Get the local pencil size and the allocation size */
  alloc_max=accfft_local_size_dft_r2c(n,isize,istart,osize,ostart,c_comm);

  //data=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
  data=(double*)accfft_alloc(alloc_max);
  data_hat=(Complex*)accfft_alloc(alloc_max);

  accfft_init(nthreads);

  /* Create FFT plan */
  setup_time=-MPI_Wtime();
  accfft_plan * plan=accfft_plan_dft_3d_r2c(n,data,(double*)data_hat,c_comm,ACCFFT_MEASURE);
  setup_time+=MPI_Wtime();


  /*  Initialize data */
  initialize(data,n,c_comm);
  MPI_Barrier(c_comm);

  double * laplace=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
  double timings[5]={0};

  double exec_time=-MPI_Wtime();
  accfft_laplace(laplace,data,plan,timings);
  exec_time+=MPI_Wtime();
  /* Check err*/
  PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
  PCOUT<<">>>>>>>Checking Laplace>>>>>>>"<<std::endl;
  check_err_laplace(laplace,n,c_comm);
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;

  /* Compute some timings statistics */
  double g_setup_time,g_timings[5],g_exec_time;

  MPI_Reduce(timings,g_timings,5, MPI_DOUBLE, MPI_MAX,0, c_comm);
  MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);
  MPI_Reduce(&exec_time,&g_exec_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);

  PCOUT<<"Timing for Laplace Computation for size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
  PCOUT<<"Setup \t\t"<<g_setup_time<<std::endl;
  PCOUT<<"Evaluation \t"<<g_exec_time<<std::endl;

  accfft_free(data);
  accfft_free(data_hat);
  MPI_Barrier(c_comm);
  accfft_free(laplace);
  accfft_destroy_plan(plan);
  accfft_cleanup();
  MPI_Comm_free(&c_comm);
  return ;

} // end laplace