void grad(int *n, int nthreads) { int nprocs, procid; MPI_Comm_rank(MPI_COMM_WORLD, &procid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* Create Cartesian Communicator */ int c_dims[2]={0}; MPI_Comm c_comm; accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm); float *data; Complexf *data_hat; double f_time=0*MPI_Wtime(),i_time=0, setup_time=0; int alloc_max=0; int isize[3],osize[3],istart[3],ostart[3]; /* Get the local pencil size and the allocation size */ alloc_max=accfft_local_size_dft_r2cf(n,isize,istart,osize,ostart,c_comm); //data=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float)); data=(float*)accfft_alloc(alloc_max); data_hat=(Complexf*)accfft_alloc(alloc_max); accfft_init(nthreads); /* Create FFT plan */ setup_time=-MPI_Wtime(); accfft_planf * plan=accfft_plan_dft_3d_r2cf(n,data,(float*)data_hat,c_comm,ACCFFT_MEASURE); setup_time+=MPI_Wtime(); /* Initialize data */ initialize(data,n,c_comm); MPI_Barrier(c_comm); float * gradx=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float)); float * grady=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float)); float * gradz=(float*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(float)); double timings[5]={0}; std::bitset<3> XYZ=0; XYZ[0]=1; XYZ[1]=1; XYZ[2]=1; double exec_time=-MPI_Wtime(); accfft_gradf(gradx,grady,gradz,data,plan,&XYZ,timings); exec_time+=MPI_Wtime(); /* Check err*/ PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl; PCOUT<<">>>>>>>>Checking Gradx>>>>>>>>"<<std::endl; check_err_grad(gradx,n,c_comm,0); PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl; PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl; PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl; PCOUT<<">>>>>>>>Checking Grady>>>>>>>>"<<std::endl; check_err_grad(grady,n,c_comm,1); PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl; PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl; PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl; PCOUT<<">>>>>>>>Checking Gradz>>>>>>>>"<<std::endl; check_err_grad(gradz,n,c_comm,2); PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl; PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl; /* Compute some timings statistics */ double g_setup_time,g_timings[5],g_exec_time; MPI_Reduce(timings,g_timings,5, MPI_DOUBLE, MPI_MAX,0, c_comm); MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm); MPI_Reduce(&exec_time,&g_exec_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm); PCOUT<<"Timing for Grad Computation for size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl; PCOUT<<"Setup \t\t"<<g_setup_time<<std::endl; PCOUT<<"Evaluation \t"<<g_exec_time<<std::endl; accfft_free(data); accfft_free(data_hat); MPI_Barrier(c_comm); accfft_free(gradx); accfft_free(grady); accfft_free(gradz); accfft_destroy_plan(plan); accfft_cleanup(); MPI_Comm_free(&c_comm); PCOUT<<"-------------------------------------------------------"<<std::endl; PCOUT<<"-------------------------------------------------------"<<std::endl; PCOUT<<"-------------------------------------------------------\n"<<std::endl; return ; } // end grad
void step2(int *n, int nthreads) { int nprocs, procid; MPI_Comm_rank(MPI_COMM_WORLD, &procid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* Create Cartesian Communicator */ int c_dims[2]; MPI_Comm c_comm; accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm); float *data; double f_time=0*MPI_Wtime(),i_time=0, setup_time=0; int alloc_max=0; int isize[3],osize[3],istart[3],ostart[3]; /* Get the local pencil size and the allocation size */ alloc_max=accfft_local_size_dft_r2cf(n,isize,istart,osize,ostart,c_comm); data=(float*)accfft_alloc(alloc_max); accfft_init(nthreads); setup_time=-MPI_Wtime(); /* Create FFT plan */ accfft_planf * plan=accfft_plan_dft_3d_r2cf(n,data,data,c_comm,ACCFFT_MEASURE); // note that in and out are both data -> inplace plan setup_time+=MPI_Wtime(); /* Warm Up */ accfft_execute_r2cf(plan,data,(Complexf*)data); accfft_execute_r2cf(plan,data,(Complexf*)data); /* Initialize data */ initialize(data,n,c_comm); // special initialize plan for inplace transform -> difference in padding MPI_Barrier(c_comm); /* Perform forward FFT */ f_time-=MPI_Wtime(); accfft_execute_r2cf(plan,data,(Complexf*)data); f_time+=MPI_Wtime(); MPI_Barrier(c_comm); /* Perform backward FFT */ i_time-=MPI_Wtime(); accfft_execute_c2rf(plan,(Complexf*)data,data); i_time+=MPI_Wtime(); /* Check Error */ check_err(data,n,c_comm); /* Compute some timings statistics */ double g_f_time, g_i_time, g_setup_time; MPI_Reduce(&f_time,&g_f_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD); MPI_Reduce(&i_time,&g_i_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD); MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD); PCOUT<<"Timing for FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl; PCOUT<<"Setup \t"<<g_setup_time<<std::endl; PCOUT<<"FFT \t"<<g_f_time<<std::endl; PCOUT<<"IFFT \t"<<g_i_time<<std::endl; accfft_free(data); accfft_destroy_plan(plan); accfft_cleanup(); MPI_Comm_free(&c_comm); return ; } // end step2
void divergence(int *n) { int nprocs, procid; MPI_Comm_rank(MPI_COMM_WORLD, &procid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* Create Cartesian Communicator */ int c_dims[2]={0}; MPI_Comm c_comm; accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm); double f_time=0*MPI_Wtime(),i_time=0, setup_time=0; int alloc_max=0; int isize[3],osize[3],istart[3],ostart[3]; /* Get the local pencil size and the allocation size */ alloc_max=accfft_local_size_dft_r2c_gpu(n,isize,istart,osize,ostart,c_comm); //data=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double)); double * data_cpu=(double*)accfft_alloc(alloc_max); double* data; Complex* data_hat; cudaMalloc((void**) &data , alloc_max); cudaMalloc((void**) &data_hat, alloc_max); accfft_init(); /* Create FFT plan */ setup_time=-MPI_Wtime(); accfft_plan_gpu * plan=accfft_plan_dft_3d_r2c_gpu(n,data,(double*)data_hat,c_comm,ACCFFT_MEASURE); setup_time+=MPI_Wtime(); /* Initialize data */ initialize(data_cpu,n,c_comm); cudaMemcpy(data, data_cpu,alloc_max, cudaMemcpyHostToDevice); MPI_Barrier(c_comm); double * gradx,*grady, *gradz, *divergence; cudaMalloc((void**) &gradx , alloc_max); cudaMalloc((void**) &grady , alloc_max); cudaMalloc((void**) &gradz , alloc_max); cudaMalloc((void**) &divergence, alloc_max); double timings[5]={0}; std::bitset<3> XYZ=0; XYZ[0]=1; XYZ[1]=1; XYZ[2]=1; double exec_time=-MPI_Wtime(); accfft_grad_gpu(gradx,grady,gradz,data,plan,XYZ,timings); accfft_divergence_gpu(divergence,gradx,grady,gradz,plan,timings); exec_time+=MPI_Wtime(); PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl; PCOUT<<">>>>>Checking Divergence>>>>>>"<<std::endl; cudaMemcpy(data_cpu, divergence, alloc_max, cudaMemcpyDeviceToHost); check_err_laplace(data_cpu,n,c_comm); PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl; PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl; /* Compute some timings statistics */ double g_setup_time,g_timings[5],g_exec_time; MPI_Reduce(timings,g_timings,5, MPI_DOUBLE, MPI_MAX,0, c_comm); MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm); MPI_Reduce(&exec_time,&g_exec_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm); PCOUT<<"Timing for Grad Computation for size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl; PCOUT<<"Setup \t\t"<<g_setup_time<<std::endl; PCOUT<<"Evaluation \t"<<g_exec_time<<std::endl; accfft_free(data_cpu); cudaFree(data); cudaFree(data_hat); MPI_Barrier(c_comm); cudaFree(gradx); cudaFree(grady); cudaFree(gradz); accfft_destroy_plan(plan); accfft_cleanup_gpu(); MPI_Comm_free(&c_comm); PCOUT<<"-------------------------------------------------------"<<std::endl; PCOUT<<"-------------------------------------------------------"<<std::endl; PCOUT<<"-------------------------------------------------------\n"<<std::endl; return ; } // end divergence
void step1_gpu(int *n) { int nprocs, procid; MPI_Comm_rank(MPI_COMM_WORLD, &procid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* Create Cartesian Communicator */ int c_dims[2] = { 0 }; MPI_Comm c_comm; accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm); double *data, *data_cpu; Complex *data_hat; double f_time = 0 * MPI_Wtime(), i_time = 0, setup_time = 0; int alloc_max = 0; int isize[3], osize[3], istart[3], ostart[3]; /* Get the local pencil size and the allocation size */ alloc_max = accfft_local_size_dft_r2c_gpu(n, isize, istart, osize, ostart, c_comm); data_cpu = (double*) malloc( isize[0] * isize[1] * isize[2] * sizeof(double)); //data_hat=(Complex*)accfft_alloc(alloc_max); cudaMalloc((void**) &data, isize[0] * isize[1] * isize[2] * sizeof(double)); cudaMalloc((void**) &data_hat, alloc_max); //accfft_init(nthreads); /* Create FFT plan */ setup_time = -MPI_Wtime(); accfft_plan_gpu * plan = accfft_plan_dft_3d_r2c_gpu(n, data, (double*) data_hat, c_comm, ACCFFT_MEASURE); setup_time += MPI_Wtime(); /* Warm Up */ accfft_execute_r2c_gpu(plan, data, data_hat); accfft_execute_r2c_gpu(plan, data, data_hat); /* Initialize data */ initialize(data_cpu, n, c_comm); cudaMemcpy(data, data_cpu, isize[0] * isize[1] * isize[2] * sizeof(double), cudaMemcpyHostToDevice); // initialize_gpu(data,n,isize,istart); // GPU version of initialize function MPI_Barrier(c_comm); /* Perform forward FFT */ f_time -= MPI_Wtime(); accfft_execute_r2c_gpu(plan, data, data_hat); f_time += MPI_Wtime(); MPI_Barrier(c_comm); double *data2_cpu, *data2; cudaMalloc((void**) &data2, isize[0] * isize[1] * isize[2] * sizeof(double)); data2_cpu = (double*) malloc( isize[0] * isize[1] * isize[2] * sizeof(double)); /* Perform backward FFT */ i_time -= MPI_Wtime(); accfft_execute_c2r_gpu(plan, data_hat, data2); i_time += MPI_Wtime(); /* copy back results on CPU */ cudaMemcpy(data2_cpu, data2, isize[0] * isize[1] * isize[2] * sizeof(double), cudaMemcpyDeviceToHost); /* Check Error */ double err = 0, g_err = 0; double norm = 0, g_norm = 0; for (int i = 0; i < isize[0] * isize[1] * isize[2]; ++i) { err += std::abs(data2_cpu[i] / n[0] / n[1] / n[2] - data_cpu[i]); norm += std::abs(data_cpu[i]); } MPI_Reduce(&err, &g_err, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&norm, &g_norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); PCOUT << "\nL1 Error is " << g_err << std::endl; PCOUT << "Relative L1 Error is " << g_err / g_norm << std::endl; if (g_err / g_norm < 1e-10) PCOUT << "\nResults are CORRECT!\n\n"; else PCOUT << "\nResults are NOT CORRECT!\n\n"; check_err(data2_cpu, n, c_comm); /* Compute some timings statistics */ double g_f_time, g_i_time, g_setup_time; MPI_Reduce(&f_time, &g_f_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&i_time, &g_i_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&setup_time, &g_setup_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); PCOUT << "GPU Timing for FFT of size " << n[0] << "*" << n[1] << "*" << n[2] << std::endl; PCOUT << "Setup \t" << g_setup_time << std::endl; PCOUT << "FFT \t" << g_f_time << std::endl; PCOUT << "IFFT \t" << g_i_time << std::endl; free(data_cpu); free(data2_cpu); cudaFree(data); cudaFree(data_hat); cudaFree(data2); accfft_destroy_plan_gpu(plan); accfft_cleanup_gpu(); MPI_Comm_free(&c_comm); return; } // end step1_gpu
void step3(int *n, int nthreads) { int nprocs, procid; MPI_Comm_rank(MPI_COMM_WORLD, &procid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* Create Cartesian Communicator */ int c_dims[2]={0}; MPI_Comm c_comm; accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm); #ifdef INPLACE Complexf *data; #else Complexf *data; Complexf *data_hat; #endif double f_time=0*MPI_Wtime(),i_time=0, setup_time=0; int alloc_max=0; int isize[3],osize[3],istart[3],ostart[3]; /* Get the local pencil size and the allocation size */ alloc_max=accfft_local_size_dft_c2cf(n,isize,istart,osize,ostart,c_comm); #ifdef INPLACE data=(Complexf*)accfft_alloc(alloc_max); #else data=(Complexf*)accfft_alloc(isize[0]*isize[1]*isize[2]*2*sizeof(float)); data_hat=(Complexf*)accfft_alloc(alloc_max); #endif accfft_init(nthreads); /* Create FFT plan */ setup_time=-MPI_Wtime(); #ifdef INPLACE accfft_planf * plan=accfft_plan_dft_3d_c2cf(n,data,data,c_comm,ACCFFT_MEASURE); #else accfft_planf * plan=accfft_plan_dft_3d_c2cf(n,data,data_hat,c_comm,ACCFFT_MEASURE); #endif setup_time+=MPI_Wtime(); /* Initialize data */ initialize(data,n,c_comm); MPI_Barrier(c_comm); /* Perform forward FFT */ f_time-=MPI_Wtime(); #ifdef INPLACE accfft_execute_c2cf(plan,ACCFFT_FORWARD,data,data); #else accfft_execute_c2cf(plan,ACCFFT_FORWARD,data,data_hat); #endif f_time+=MPI_Wtime(); MPI_Barrier(c_comm); /* Perform backward FFT */ #ifdef INPLACE i_time-=MPI_Wtime(); accfft_execute_c2cf(plan,ACCFFT_BACKWARD,data,data); i_time+=MPI_Wtime(); #else Complexf * data2=(Complexf*)accfft_alloc(isize[0]*isize[1]*isize[2]*2*sizeof(float)); i_time-=MPI_Wtime(); accfft_execute_c2cf(plan,ACCFFT_BACKWARD,data_hat,data2); i_time+=MPI_Wtime(); #endif /* Check Error */ #ifdef INPLACE check_err(data,n,c_comm); #else check_err(data2,n,c_comm); #endif /* Compute some timings statistics */ double g_f_time, g_i_time, g_setup_time; MPI_Reduce(&f_time,&g_f_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD); MPI_Reduce(&i_time,&g_i_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD); MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD); #ifdef INPLACE PCOUT<<"Timing for Inplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl; #else PCOUT<<"Timing for Outplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl; #endif PCOUT<<"Setup \t"<<g_setup_time<<std::endl; PCOUT<<"FFT \t"<<g_f_time<<std::endl; PCOUT<<"IFFT \t"<<g_i_time<<std::endl; accfft_free(data); #ifndef INPLACE accfft_free(data_hat); accfft_free(data2); #endif accfft_destroy_plan(plan); accfft_cleanup(); MPI_Comm_free(&c_comm); return ; } // end step3
void step2_gpu(int *n) { int nprocs, procid; MPI_Comm_rank(MPI_COMM_WORLD, &procid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* Create Cartesian Communicator */ int c_dims[2]; MPI_Comm c_comm; accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm); float *data, *data_cpu; Complexf *data_hat; double f_time = 0 * MPI_Wtime(), i_time = 0, setup_time = 0; int alloc_max = 0; int isize[3], osize[3], istart[3], ostart[3]; /* Get the local pencil size and the allocation size */ alloc_max = accfft_local_size_dft_r2c_gpuf(n, isize, istart, osize, ostart, c_comm); /* Note that both need to be allocated by alloc_max because of inplace transform*/ data_cpu = (float*) malloc(alloc_max); cudaMalloc((void**) &data, alloc_max); //accfft_init(nthreads); /* Create FFT plan */ setup_time = -MPI_Wtime(); accfft_plan_gpuf * plan = accfft_plan_dft_3d_r2c_gpuf(n, data, data, c_comm, ACCFFT_MEASURE); setup_time += MPI_Wtime(); /* Warm Up */ accfft_execute_r2c_gpuf(plan, data, (Complexf*) data); accfft_execute_r2c_gpuf(plan, data, (Complexf*) data); /* Initialize data */ initialize_gpu(data, n, isize, istart); MPI_Barrier(c_comm); /* Perform forward FFT */ f_time -= MPI_Wtime(); accfft_execute_r2c_gpuf(plan, data, (Complexf*) data); f_time += MPI_Wtime(); MPI_Barrier(c_comm); /* Perform backward FFT */ i_time -= MPI_Wtime(); accfft_execute_c2r_gpuf(plan, (Complexf*) data, data); i_time += MPI_Wtime(); /* copy back results on CPU */ cudaMemcpy(data_cpu, data, alloc_max, cudaMemcpyDeviceToHost); /* Check Error */ check_err(data_cpu, n, c_comm); /* Compute some timings statistics */ double g_f_time, g_i_time, g_setup_time; MPI_Reduce(&f_time, &g_f_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&i_time, &g_i_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&setup_time, &g_setup_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); PCOUT << "GPU Timing for FFT of size " << n[0] << "*" << n[1] << "*" << n[2] << std::endl; PCOUT << "Setup \t" << g_setup_time << std::endl; PCOUT << "FFT \t" << g_f_time << std::endl; PCOUT << "IFFT \t" << g_i_time << std::endl; free(data_cpu); cudaFree(data); accfft_destroy_plan_gpu(plan); accfft_cleanup_gpuf(); MPI_Comm_free(&c_comm); return; } // end step2_gpu
void step3_gpu(int *n) { int nprocs, procid; MPI_Comm_rank(MPI_COMM_WORLD, &procid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* Create Cartesian Communicator */ int c_dims[2]={0}; MPI_Comm c_comm; accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm); Complexf *data, *data_cpu; Complexf *data_hat; double f_time=0*MPI_Wtime(),i_time=0, setup_time=0; int alloc_max=0; int isize[3],osize[3],istart[3],ostart[3]; /* Get the local pencil size and the allocation size */ alloc_max=accfft_local_size_dft_c2c_gpuf(n,isize,istart,osize,ostart,c_comm); #ifdef INPLACE data_cpu=(Complexf*)malloc(alloc_max); cudaMalloc((void**) &data, alloc_max); #else data_cpu=(Complexf*)malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float)); cudaMalloc((void**) &data,isize[0]*isize[1]*isize[2]*2*sizeof(float)); cudaMalloc((void**) &data_hat, alloc_max); #endif //accfft_init(nthreads); setup_time=-MPI_Wtime(); /* Create FFT plan */ #ifdef INPLACE accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data,c_comm,ACCFFT_MEASURE); #else accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data_hat,c_comm,ACCFFT_MEASURE); #endif setup_time+=MPI_Wtime(); /* Warmup Runs */ #ifdef INPLACE accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data); accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data); #else accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat); accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat); #endif /* Initialize data */ initialize(data_cpu,n,c_comm); #ifdef INPLACE cudaMemcpy(data, data_cpu,alloc_max, cudaMemcpyHostToDevice); #else cudaMemcpy(data, data_cpu,isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyHostToDevice); #endif MPI_Barrier(c_comm); /* Perform forward FFT */ f_time-=MPI_Wtime(); #ifdef INPLACE accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data); #else accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat); #endif f_time+=MPI_Wtime(); MPI_Barrier(c_comm); #ifndef INPLACE Complexf *data2_cpu, *data2; cudaMalloc((void**) &data2, isize[0]*isize[1]*isize[2]*2*sizeof(float)); data2_cpu=(Complexf*) malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float)); #endif /* Perform backward FFT */ i_time-=MPI_Wtime(); #ifdef INPLACE accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data,data); #else accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data_hat,data2); #endif i_time+=MPI_Wtime(); /* copy back results on CPU and check error*/ #ifdef INPLACE cudaMemcpy(data_cpu, data, alloc_max, cudaMemcpyDeviceToHost); check_err(data_cpu,n,c_comm); #else cudaMemcpy(data2_cpu, data2, isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyDeviceToHost); check_err(data2_cpu,n,c_comm); #endif /* Compute some timings statistics */ double g_f_time, g_i_time, g_setup_time; MPI_Reduce(&f_time,&g_f_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD); MPI_Reduce(&i_time,&g_i_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD); MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD); #ifdef INPLACE PCOUT<<"GPU Timing for Inplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl; #else PCOUT<<"GPU Timing for Outplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl; #endif PCOUT<<"Setup \t"<<g_setup_time<<std::endl; PCOUT<<"FFT \t"<<g_f_time<<std::endl; PCOUT<<"IFFT \t"<<g_i_time<<std::endl; MPI_Barrier(c_comm); cudaDeviceSynchronize(); free(data_cpu); cudaFree(data); #ifndef INPLACE cudaFree(data_hat); free(data2_cpu); cudaFree(data2); #endif accfft_destroy_plan_gpu(plan); accfft_cleanup_gpuf(); MPI_Comm_free(&c_comm); return ; } // end step3_gpu
void laplace(int *n, int nthreads) { int nprocs, procid; MPI_Comm_rank(MPI_COMM_WORLD, &procid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* Create Cartesian Communicator */ int c_dims[2]={0}; MPI_Comm c_comm; accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm); double *data; Complex *data_hat; double f_time=0*MPI_Wtime(),i_time=0, setup_time=0; int alloc_max=0; int isize[3],osize[3],istart[3],ostart[3]; /* Get the local pencil size and the allocation size */ alloc_max=accfft_local_size_dft_r2c(n,isize,istart,osize,ostart,c_comm); //data=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double)); data=(double*)accfft_alloc(alloc_max); data_hat=(Complex*)accfft_alloc(alloc_max); accfft_init(nthreads); /* Create FFT plan */ setup_time=-MPI_Wtime(); accfft_plan * plan=accfft_plan_dft_3d_r2c(n,data,(double*)data_hat,c_comm,ACCFFT_MEASURE); setup_time+=MPI_Wtime(); /* Initialize data */ initialize(data,n,c_comm); MPI_Barrier(c_comm); double * laplace=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double)); double timings[5]={0}; double exec_time=-MPI_Wtime(); accfft_laplace(laplace,data,plan,timings); exec_time+=MPI_Wtime(); /* Check err*/ PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl; PCOUT<<">>>>>>>Checking Laplace>>>>>>>"<<std::endl; check_err_laplace(laplace,n,c_comm); PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl; PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl; /* Compute some timings statistics */ double g_setup_time,g_timings[5],g_exec_time; MPI_Reduce(timings,g_timings,5, MPI_DOUBLE, MPI_MAX,0, c_comm); MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm); MPI_Reduce(&exec_time,&g_exec_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm); PCOUT<<"Timing for Laplace Computation for size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl; PCOUT<<"Setup \t\t"<<g_setup_time<<std::endl; PCOUT<<"Evaluation \t"<<g_exec_time<<std::endl; accfft_free(data); accfft_free(data_hat); MPI_Barrier(c_comm); accfft_free(laplace); accfft_destroy_plan(plan); accfft_cleanup(); MPI_Comm_free(&c_comm); return ; } // end laplace