void divergence(int *n, int nthreads) { int nprocs, procid; MPI_Comm_rank(MPI_COMM_WORLD, &procid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* Create Cartesian Communicator */ int c_dims[2]={0}; MPI_Comm c_comm; accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm); float *data; Complexf *data_hat; double f_time=0*MPI_Wtime(),i_time=0, setup_time=0; int alloc_max=0; int isize[3],osize[3],istart[3],ostart[3]; /* Get the local pencil size and the allocation size */ alloc_max=accfft_local_size_dft_r2cf(n,isize,istart,osize,ostart,c_comm); //data=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float)); data=(float*)accfft_alloc(alloc_max); data_hat=(Complexf*)accfft_alloc(alloc_max); accfft_init(nthreads); /* Create FFT plan */ setup_time=-MPI_Wtime(); accfft_planf * plan=accfft_plan_dft_3d_r2cf(n,data,(float*)data_hat,c_comm,ACCFFT_MEASURE); setup_time+=MPI_Wtime(); /* Initialize data */ initialize(data,n,c_comm); MPI_Barrier(c_comm); float * gradx=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float)); float * grady=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float)); float * gradz=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float)); float * divergence=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float)); double timings[5]={0}; std::bitset<3> XYZ=0; XYZ[0]=1; XYZ[1]=1; XYZ[2]=1; double exec_time=-MPI_Wtime(); accfft_gradf(gradx,grady,gradz,data,plan,&XYZ,timings); accfft_divergencef(divergence,gradx,grady,gradz,plan,timings); exec_time+=MPI_Wtime(); PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl; PCOUT<<">>>>>Checking Divergence>>>>>>"<<std::endl; check_err_laplace(divergence,n,c_comm); PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl; PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl; /* Compute some timings statistics */ double g_setup_time,g_timings[5],g_exec_time; MPI_Reduce(timings,g_timings,5, MPI_DOUBLE, MPI_MAX,0, c_comm); MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm); MPI_Reduce(&exec_time,&g_exec_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm); PCOUT<<"Timing for Divergence Computation for size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl; PCOUT<<"Setup \t\t"<<g_setup_time<<std::endl; PCOUT<<"Evaluation \t"<<g_exec_time<<std::endl; accfft_free(data); accfft_free(data_hat); MPI_Barrier(c_comm); accfft_free(gradx); accfft_free(grady); accfft_free(gradz); accfft_destroy_plan(plan); accfft_cleanup(); MPI_Comm_free(&c_comm); PCOUT<<"-------------------------------------------------------"<<std::endl; PCOUT<<"-------------------------------------------------------"<<std::endl; PCOUT<<"-------------------------------------------------------\n"<<std::endl; return ; } // end divergence
void divergence(int *n) { int nprocs, procid; MPI_Comm_rank(MPI_COMM_WORLD, &procid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* Create Cartesian Communicator */ int c_dims[2]={0}; MPI_Comm c_comm; accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm); double f_time=0*MPI_Wtime(),i_time=0, setup_time=0; int alloc_max=0; int isize[3],osize[3],istart[3],ostart[3]; /* Get the local pencil size and the allocation size */ alloc_max=accfft_local_size_dft_r2c_gpu(n,isize,istart,osize,ostart,c_comm); //data=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double)); double * data_cpu=(double*)accfft_alloc(alloc_max); double* data; Complex* data_hat; cudaMalloc((void**) &data , alloc_max); cudaMalloc((void**) &data_hat, alloc_max); accfft_init(); /* Create FFT plan */ setup_time=-MPI_Wtime(); accfft_plan_gpu * plan=accfft_plan_dft_3d_r2c_gpu(n,data,(double*)data_hat,c_comm,ACCFFT_MEASURE); setup_time+=MPI_Wtime(); /* Initialize data */ initialize(data_cpu,n,c_comm); cudaMemcpy(data, data_cpu,alloc_max, cudaMemcpyHostToDevice); MPI_Barrier(c_comm); double * gradx,*grady, *gradz, *divergence; cudaMalloc((void**) &gradx , alloc_max); cudaMalloc((void**) &grady , alloc_max); cudaMalloc((void**) &gradz , alloc_max); cudaMalloc((void**) &divergence, alloc_max); double timings[5]={0}; std::bitset<3> XYZ=0; XYZ[0]=1; XYZ[1]=1; XYZ[2]=1; double exec_time=-MPI_Wtime(); accfft_grad_gpu(gradx,grady,gradz,data,plan,XYZ,timings); accfft_divergence_gpu(divergence,gradx,grady,gradz,plan,timings); exec_time+=MPI_Wtime(); PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl; PCOUT<<">>>>>Checking Divergence>>>>>>"<<std::endl; cudaMemcpy(data_cpu, divergence, alloc_max, cudaMemcpyDeviceToHost); check_err_laplace(data_cpu,n,c_comm); PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl; PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl; /* Compute some timings statistics */ double g_setup_time,g_timings[5],g_exec_time; MPI_Reduce(timings,g_timings,5, MPI_DOUBLE, MPI_MAX,0, c_comm); MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm); MPI_Reduce(&exec_time,&g_exec_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm); PCOUT<<"Timing for Grad Computation for size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl; PCOUT<<"Setup \t\t"<<g_setup_time<<std::endl; PCOUT<<"Evaluation \t"<<g_exec_time<<std::endl; accfft_free(data_cpu); cudaFree(data); cudaFree(data_hat); MPI_Barrier(c_comm); cudaFree(gradx); cudaFree(grady); cudaFree(gradz); accfft_destroy_plan(plan); accfft_cleanup_gpu(); MPI_Comm_free(&c_comm); PCOUT<<"-------------------------------------------------------"<<std::endl; PCOUT<<"-------------------------------------------------------"<<std::endl; PCOUT<<"-------------------------------------------------------\n"<<std::endl; return ; } // end divergence