Exemple #1
0
void divergence(int *n, int nthreads) {
  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2]={0};
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm);

  float *data;
  Complexf *data_hat;
  double f_time=0*MPI_Wtime(),i_time=0, setup_time=0;
  int alloc_max=0;

  int isize[3],osize[3],istart[3],ostart[3];
  /* Get the local pencil size and the allocation size */
  alloc_max=accfft_local_size_dft_r2cf(n,isize,istart,osize,ostart,c_comm);

  //data=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float));
  data=(float*)accfft_alloc(alloc_max);
  data_hat=(Complexf*)accfft_alloc(alloc_max);

  accfft_init(nthreads);

  /* Create FFT plan */
  setup_time=-MPI_Wtime();
  accfft_planf * plan=accfft_plan_dft_3d_r2cf(n,data,(float*)data_hat,c_comm,ACCFFT_MEASURE);
  setup_time+=MPI_Wtime();


  /*  Initialize data */
  initialize(data,n,c_comm);
  MPI_Barrier(c_comm);

  float * gradx=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float));
  float * grady=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float));
  float * gradz=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float));
  float * divergence=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float));
  double timings[5]={0};

  std::bitset<3> XYZ=0;
  XYZ[0]=1;
  XYZ[1]=1;
  XYZ[2]=1;
  double exec_time=-MPI_Wtime();
  accfft_gradf(gradx,grady,gradz,data,plan,&XYZ,timings);
  accfft_divergencef(divergence,gradx,grady,gradz,plan,timings);
  exec_time+=MPI_Wtime();

  PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
  PCOUT<<">>>>>Checking Divergence>>>>>>"<<std::endl;
  check_err_laplace(divergence,n,c_comm);
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;


  /* Compute some timings statistics */
  double g_setup_time,g_timings[5],g_exec_time;

  MPI_Reduce(timings,g_timings,5, MPI_DOUBLE, MPI_MAX,0, c_comm);
  MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);
  MPI_Reduce(&exec_time,&g_exec_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);

  PCOUT<<"Timing for Divergence Computation for size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
  PCOUT<<"Setup \t\t"<<g_setup_time<<std::endl;
  PCOUT<<"Evaluation \t"<<g_exec_time<<std::endl;

  accfft_free(data);
  accfft_free(data_hat);
  MPI_Barrier(c_comm);
  accfft_free(gradx);
  accfft_free(grady);
  accfft_free(gradz);
  accfft_destroy_plan(plan);
  accfft_cleanup();
  MPI_Comm_free(&c_comm);
  PCOUT<<"-------------------------------------------------------"<<std::endl;
  PCOUT<<"-------------------------------------------------------"<<std::endl;
  PCOUT<<"-------------------------------------------------------\n"<<std::endl;
  return ;

} // end divergence
void divergence(int *n) {
  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2]={0};
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm);

  double f_time=0*MPI_Wtime(),i_time=0, setup_time=0;
  int alloc_max=0;

  int isize[3],osize[3],istart[3],ostart[3];
  /* Get the local pencil size and the allocation size */
  alloc_max=accfft_local_size_dft_r2c_gpu(n,isize,istart,osize,ostart,c_comm);

  //data=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
  double * data_cpu=(double*)accfft_alloc(alloc_max);
  double* data;
  Complex* data_hat;
  cudaMalloc((void**) &data    , alloc_max);
  cudaMalloc((void**) &data_hat, alloc_max);


  accfft_init();

  /* Create FFT plan */
  setup_time=-MPI_Wtime();
  accfft_plan_gpu * plan=accfft_plan_dft_3d_r2c_gpu(n,data,(double*)data_hat,c_comm,ACCFFT_MEASURE);
  setup_time+=MPI_Wtime();


  /*  Initialize data */
  initialize(data_cpu,n,c_comm);
  cudaMemcpy(data, data_cpu,alloc_max, cudaMemcpyHostToDevice);

  MPI_Barrier(c_comm);

  double * gradx,*grady, *gradz, *divergence;
  cudaMalloc((void**) &gradx     , alloc_max);
  cudaMalloc((void**) &grady     , alloc_max);
  cudaMalloc((void**) &gradz     , alloc_max);
  cudaMalloc((void**) &divergence, alloc_max);
  double timings[5]={0};


  std::bitset<3> XYZ=0;
  XYZ[0]=1;
  XYZ[1]=1;
  XYZ[2]=1;
  double exec_time=-MPI_Wtime();
  accfft_grad_gpu(gradx,grady,gradz,data,plan,XYZ,timings);
  accfft_divergence_gpu(divergence,gradx,grady,gradz,plan,timings);
  exec_time+=MPI_Wtime();

  PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
  PCOUT<<">>>>>Checking Divergence>>>>>>"<<std::endl;
  cudaMemcpy(data_cpu, divergence, alloc_max, cudaMemcpyDeviceToHost);
  check_err_laplace(data_cpu,n,c_comm);
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;


  /* Compute some timings statistics */
  double g_setup_time,g_timings[5],g_exec_time;

  MPI_Reduce(timings,g_timings,5, MPI_DOUBLE, MPI_MAX,0, c_comm);
  MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);
  MPI_Reduce(&exec_time,&g_exec_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);

  PCOUT<<"Timing for Grad Computation for size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
  PCOUT<<"Setup \t\t"<<g_setup_time<<std::endl;
  PCOUT<<"Evaluation \t"<<g_exec_time<<std::endl;

  accfft_free(data_cpu);
  cudaFree(data);
  cudaFree(data_hat);
  MPI_Barrier(c_comm);
  cudaFree(gradx);
  cudaFree(grady);
  cudaFree(gradz);
  accfft_destroy_plan(plan);
  accfft_cleanup_gpu();
  MPI_Comm_free(&c_comm);
  PCOUT<<"-------------------------------------------------------"<<std::endl;
  PCOUT<<"-------------------------------------------------------"<<std::endl;
  PCOUT<<"-------------------------------------------------------\n"<<std::endl;
  return ;

} // end divergence