void initialize(double *a, int *n, MPI_Comm c_comm) {
  double pi = M_PI;
  int istart[3], isize[3], osize[3], ostart[3];
  accfft_local_size_dft_r2c_gpu(n, isize, istart, osize, ostart, c_comm);

#pragma omp parallel
  {
    double X, Y, Z;
    long int ptr;
#pragma omp for
    for (int i = 0; i < isize[0]; i++) {
      for (int j = 0; j < isize[1]; j++) {
        for (int k = 0; k < isize[2]; k++) {
          X = 2 * pi / n[0] * (i + istart[0]);
          Y = 2 * pi / n[1] * (j + istart[1]);
          Z = 2 * pi / n[2] * k;
          ptr = i * isize[1] * n[2] + j * n[2] + k;
          a[ptr] = testcase(X, Y, Z);
        }
      }
    }
  }
  return;
} // end initialize
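/* Note on indexing: each rank fills only its local pencil of
 * isize[0] x isize[1] x isize[2] real values, offset by istart within the
 * global n[0] x n[1] x n[2] array. The flat index is row-major over the local
 * pencil (the last dimension is not distributed, so isize[2] == n[2] here);
 * for example, with isize = {4, 8, 16} the element (i, j, k) lives at
 * ptr = i*8*16 + j*16 + k. */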
void check_err(double *a, int *n, MPI_Comm c_comm) {
  int nprocs, procid;
  MPI_Comm_rank(c_comm, &procid);
  MPI_Comm_size(c_comm, &nprocs);

  long long int size = n[0];
  size *= n[1];
  size *= n[2];
  double pi = 4 * atan(1.0);

  int istart[3], isize[3], osize[3], ostart[3];
  accfft_local_size_dft_r2c_gpu(n, isize, istart, osize, ostart, c_comm);

  double err = 0, norm = 0;
  double X, Y, Z, numerical_r;
  long int ptr;
  for (int i = 0; i < isize[0]; i++) {
    for (int j = 0; j < isize[1]; j++) {
      for (int k = 0; k < isize[2]; k++) {
        X = 2 * pi / n[0] * (i + istart[0]);
        Y = 2 * pi / n[1] * (j + istart[1]);
        Z = 2 * pi / n[2] * k;
        ptr = i * isize[1] * n[2] + j * n[2] + k;
        numerical_r = a[ptr] / size; // undo the n[0]*n[1]*n[2] scaling of iFFT(FFT(a))
        if (numerical_r != numerical_r) // guard against NaN
          numerical_r = 0;
        err += std::abs(numerical_r - testcase(X, Y, Z));
        norm += std::abs(testcase(X, Y, Z));
      }
    }
  }

  double g_err = 0, g_norm = 0;
  MPI_Reduce(&err, &g_err, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&norm, &g_norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

  PCOUT << "\nL1 Error of iFFT(FFT(a)) - a: " << g_err << std::endl;
  PCOUT << "Relative L1 Error of iFFT(FFT(a)) - a: " << g_err / g_norm << std::endl;
  if (g_err / g_norm < 1e-10)
    PCOUT << "\nResults are CORRECT!\n\n";
  else
    PCOUT << "\nResults are NOT CORRECT!\n\n";
  return;
} // end check_err
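/* check_err expects the raw (unnormalized) result of iFFT(FFT(a)) copied back
 * to the host; it divides by n[0]*n[1]*n[2] internally before comparing against
 * testcase(). A minimal call sequence, mirroring step1_gpu below (buffer names
 * as used there):
 *
 *   accfft_execute_r2c_gpu(plan, data, data_hat);
 *   accfft_execute_c2r_gpu(plan, data_hat, data2);
 *   cudaMemcpy(data2_cpu, data2,
 *              isize[0] * isize[1] * isize[2] * sizeof(double),
 *              cudaMemcpyDeviceToHost);
 *   check_err(data2_cpu, n, c_comm);
 */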
void grad(int *n) {
  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2] = {0};
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm);

  double setup_time = 0;
  int alloc_max = 0;
  int isize[3], osize[3], istart[3], ostart[3];

  /* Get the local pencil size and the allocation size */
  alloc_max = accfft_local_size_dft_r2c_gpu(n, isize, istart, osize, ostart, c_comm);

  double *data_cpu = (double*) accfft_alloc(alloc_max);
  double *data;
  Complex *data_hat;
  cudaMalloc((void**) &data, alloc_max);
  cudaMalloc((void**) &data_hat, alloc_max);

  accfft_init();

  /* Create FFT plan */
  setup_time = -MPI_Wtime();
  accfft_plan_gpu *plan = accfft_plan_dft_3d_r2c_gpu(n, data, (double*) data_hat, c_comm, ACCFFT_MEASURE);
  setup_time += MPI_Wtime();

  /* Initialize data */
  initialize(data_cpu, n, c_comm);
  cudaMemcpy(data, data_cpu, alloc_max, cudaMemcpyHostToDevice);
  MPI_Barrier(c_comm);

  double *gradx, *grady, *gradz;
  cudaMalloc((void**) &gradx, alloc_max);
  cudaMalloc((void**) &grady, alloc_max);
  cudaMalloc((void**) &gradz, alloc_max);
  double timings[5] = {0};

  std::bitset<3> XYZ = 0;
  XYZ[0] = 1;
  XYZ[1] = 1;
  XYZ[2] = 1;
  double exec_time = -MPI_Wtime();
  accfft_grad_gpu(gradx, grady, gradz, data, plan, XYZ, timings);
  exec_time += MPI_Wtime();

  /* Check error */
  PCOUT << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << std::endl;
  PCOUT << ">>>>>>>>Checking Gradx>>>>>>>>" << std::endl;
  cudaMemcpy(data_cpu, gradx, alloc_max, cudaMemcpyDeviceToHost);
  check_err_grad(data_cpu, n, c_comm, 0);
  PCOUT << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
  PCOUT << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n" << std::endl;

  PCOUT << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << std::endl;
  PCOUT << ">>>>>>>>Checking Grady>>>>>>>>" << std::endl;
  cudaMemcpy(data_cpu, grady, alloc_max, cudaMemcpyDeviceToHost);
  check_err_grad(data_cpu, n, c_comm, 1);
  PCOUT << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
  PCOUT << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n" << std::endl;

  PCOUT << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << std::endl;
  PCOUT << ">>>>>>>>Checking Gradz>>>>>>>>" << std::endl;
  cudaMemcpy(data_cpu, gradz, alloc_max, cudaMemcpyDeviceToHost);
  check_err_grad(data_cpu, n, c_comm, 2);
  PCOUT << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
  PCOUT << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n" << std::endl;

  /* Compute some timing statistics */
  double g_setup_time, g_timings[5], g_exec_time;
  MPI_Reduce(timings, g_timings, 5, MPI_DOUBLE, MPI_MAX, 0, c_comm);
  MPI_Reduce(&setup_time, &g_setup_time, 1, MPI_DOUBLE, MPI_MAX, 0, c_comm);
  MPI_Reduce(&exec_time, &g_exec_time, 1, MPI_DOUBLE, MPI_MAX, 0, c_comm);

  PCOUT << "Timing for Grad Computation for size " << n[0] << "*" << n[1] << "*" << n[2] << std::endl;
  PCOUT << "Setup \t\t" << g_setup_time << std::endl;
  PCOUT << "Evaluation \t" << g_exec_time << std::endl;

  accfft_free(data_cpu);
  cudaFree(data);
  cudaFree(data_hat);
  MPI_Barrier(c_comm);
  cudaFree(gradx);
  cudaFree(grady);
  cudaFree(gradz);
  accfft_destroy_plan(plan);
  accfft_cleanup_gpu();
  MPI_Comm_free(&c_comm);
  PCOUT << "-------------------------------------------------------" << std::endl;
  PCOUT << "-------------------------------------------------------" << std::endl;
  PCOUT << "-------------------------------------------------------\n" << std::endl;
  return;
} // end grad
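/* The std::bitset<3> argument of accfft_grad_gpu selects which derivative
 * components are evaluated; grad() above sets all three bits to request
 * d/dx, d/dy and d/dz in a single call:
 *
 *   std::bitset<3> XYZ = 0;
 *   XYZ[0] = 1; XYZ[1] = 1; XYZ[2] = 1;   // x, y and z components
 *   accfft_grad_gpu(gradx, grady, gradz, data, plan, XYZ, timings);
 */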
void step1_gpu(int *n) {
  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2] = { 0 };
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm);

  double *data, *data_cpu;
  Complex *data_hat;
  double f_time = 0 * MPI_Wtime(), i_time = 0, setup_time = 0;
  int alloc_max = 0;

  int isize[3], osize[3], istart[3], ostart[3];
  /* Get the local pencil size and the allocation size */
  alloc_max = accfft_local_size_dft_r2c_gpu(n, isize, istart, osize, ostart, c_comm);

  data_cpu = (double*) malloc(isize[0] * isize[1] * isize[2] * sizeof(double));
  //data_hat=(Complex*)accfft_alloc(alloc_max);
  cudaMalloc((void**) &data, isize[0] * isize[1] * isize[2] * sizeof(double));
  cudaMalloc((void**) &data_hat, alloc_max);

  //accfft_init(nthreads);

  /* Create FFT plan */
  setup_time = -MPI_Wtime();
  accfft_plan_gpu *plan = accfft_plan_dft_3d_r2c_gpu(n, data, (double*) data_hat, c_comm, ACCFFT_MEASURE);
  setup_time += MPI_Wtime();

  /* Warm up */
  accfft_execute_r2c_gpu(plan, data, data_hat);
  accfft_execute_r2c_gpu(plan, data, data_hat);

  /* Initialize data */
  initialize(data_cpu, n, c_comm);
  cudaMemcpy(data, data_cpu, isize[0] * isize[1] * isize[2] * sizeof(double), cudaMemcpyHostToDevice);
  // initialize_gpu(data,n,isize,istart); // GPU version of initialize function
  MPI_Barrier(c_comm);

  /* Perform forward FFT */
  f_time -= MPI_Wtime();
  accfft_execute_r2c_gpu(plan, data, data_hat);
  f_time += MPI_Wtime();
  MPI_Barrier(c_comm);

  double *data2_cpu, *data2;
  cudaMalloc((void**) &data2, isize[0] * isize[1] * isize[2] * sizeof(double));
  data2_cpu = (double*) malloc(isize[0] * isize[1] * isize[2] * sizeof(double));

  /* Perform backward FFT */
  i_time -= MPI_Wtime();
  accfft_execute_c2r_gpu(plan, data_hat, data2);
  i_time += MPI_Wtime();

  /* Copy the result back to the CPU */
  cudaMemcpy(data2_cpu, data2, isize[0] * isize[1] * isize[2] * sizeof(double), cudaMemcpyDeviceToHost);

  /* Check error */
  double err = 0, g_err = 0;
  double norm = 0, g_norm = 0;
  for (int i = 0; i < isize[0] * isize[1] * isize[2]; ++i) {
    err += std::abs(data2_cpu[i] / n[0] / n[1] / n[2] - data_cpu[i]);
    norm += std::abs(data_cpu[i]);
  }
  MPI_Reduce(&err, &g_err, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&norm, &g_norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

  PCOUT << "\nL1 Error is " << g_err << std::endl;
  PCOUT << "Relative L1 Error is " << g_err / g_norm << std::endl;
  if (g_err / g_norm < 1e-10)
    PCOUT << "\nResults are CORRECT!\n\n";
  else
    PCOUT << "\nResults are NOT CORRECT!\n\n";

  check_err(data2_cpu, n, c_comm);

  /* Compute some timing statistics */
  double g_f_time, g_i_time, g_setup_time;
  MPI_Reduce(&f_time, &g_f_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
  MPI_Reduce(&i_time, &g_i_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
  MPI_Reduce(&setup_time, &g_setup_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

  PCOUT << "GPU Timing for FFT of size " << n[0] << "*" << n[1] << "*" << n[2] << std::endl;
  PCOUT << "Setup \t" << g_setup_time << std::endl;
  PCOUT << "FFT \t" << g_f_time << std::endl;
  PCOUT << "IFFT \t" << g_i_time << std::endl;

  free(data_cpu);
  free(data2_cpu);
  cudaFree(data);
  cudaFree(data_hat);
  cudaFree(data2);
  accfft_destroy_plan_gpu(plan);
  accfft_cleanup_gpu();
  MPI_Comm_free(&c_comm);
  return;
} // end step1_gpu
/**
 * Creates a 3D R2C parallel FFT plan. If data_out_d points to the same location as the
 * input data, an in-place plan is created; otherwise the plan is out-of-place.
 * @param n Integer array of size 3, corresponding to the global data size.
 * @param data_d Input data in the spatial domain.
 * @param data_out_d Output data in the frequency domain.
 * @param c_comm Cartesian communicator returned by \ref accfft_create_comm.
 * @param flags AccFFT flags, see \ref flags for more details.
 * @return The newly created plan, or NULL if a cuFFT plan could not be created.
 */
accfft_plan_gpu* accfft_plan_dft_3d_r2c_gpu(int *n, double *data_d, double *data_out_d,
    MPI_Comm c_comm, unsigned flags) {
  accfft_plan_gpu *plan = new accfft_plan_gpu;
  int procid;
  MPI_Comm_rank(c_comm, &procid);
  plan->procid = procid;
  MPI_Cart_get(c_comm, 2, plan->np, plan->periods, plan->coord);
  plan->c_comm = c_comm;
  int *coord = plan->coord;
  MPI_Comm_split(c_comm, coord[0], coord[1], &plan->row_comm);
  MPI_Comm_split(c_comm, coord[1], coord[0], &plan->col_comm);
  plan->N[0] = n[0];
  plan->N[1] = n[1];
  plan->N[2] = n[2];

  plan->data = data_d;
  plan->data_out = data_out_d;
  if (plan->np[1] == 1)
    plan->oneD = true;
  else
    plan->oneD = false;

  if (data_out_d == data_d) {
    plan->inplace = true;
  } else {
    plan->inplace = false;
  }

  int *osize_0 = plan->osize_0, *ostart_0 = plan->ostart_0;
  int *osize_1 = plan->osize_1, *ostart_1 = plan->ostart_1;
  int *osize_2 = plan->osize_2, *ostart_2 = plan->ostart_2;
  int *osize_1i = plan->osize_1i, *ostart_1i = plan->ostart_1i;
  int *osize_2i = plan->osize_2i, *ostart_2i = plan->ostart_2i;

  int alloc_max = 0;
  int n_tuples_i, n_tuples_o;
  // An in-place transform pads the last dimension to 2*(n[2]/2+1) reals
  // (see the padding note after this function).
  if (plan->inplace)
    n_tuples_i = (n[2] / 2 + 1) * 2;
  else
    n_tuples_i = n[2];
  n_tuples_o = (n[2] / 2 + 1) * 2;

  alloc_max = accfft_local_size_dft_r2c_gpu(n, plan->isize, plan->istart, plan->osize,
      plan->ostart, c_comm, plan->inplace);
  plan->alloc_max = alloc_max;

  dfft_get_local_size_gpu(n[0], n[1], n_tuples_o, osize_0, ostart_0, c_comm);
  dfft_get_local_size_gpu(n[0], n_tuples_o / 2, n[1], osize_1, ostart_1, c_comm);
  dfft_get_local_size_gpu(n[1], n_tuples_o / 2, n[0], osize_2, ostart_2, c_comm);

  std::swap(osize_1[1], osize_1[2]);
  std::swap(ostart_1[1], ostart_1[2]);

  std::swap(ostart_2[1], ostart_2[2]);
  std::swap(ostart_2[0], ostart_2[1]);

  std::swap(osize_2[1], osize_2[2]);
  std::swap(osize_2[0], osize_2[1]);

  for (int i = 0; i < 3; i++) {
    osize_1i[i] = osize_1[i];
    osize_2i[i] = osize_2[i];
    ostart_1i[i] = ostart_1[i];
    ostart_2i[i] = ostart_2[i];
  }

  // fplan_0
  int NX = n[0], NY = n[1], NZ = n[2];
  cufftResult_t cufft_error;
  {
    int f_inembed[1] = {n_tuples_i};
    int f_onembed[1] = {n_tuples_o / 2};
    int idist = n_tuples_i;
    int odist = n_tuples_o / 2;
    int istride = 1;
    int ostride = 1;
    int batch = osize_0[0] * osize_0[1];

    if (batch != 0) {
      cufft_error = cufftPlanMany(&plan->fplan_0, 1, &n[2],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_D2Z, batch);
      if (cufft_error != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: fplan_0 creation failed %d \n", cufft_error);
        return NULL;
      }
      if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "Cuda error: Failed after fplan_0 creation\n");
        return NULL;
      }
    }
    if (batch != 0) {
      cufft_error = cufftPlanMany(&plan->iplan_0, 1, &n[2],
          f_onembed, ostride, odist, // *onembed, ostride, odist
          f_inembed, istride, idist, // *inembed, istride, idist
          CUFFT_Z2D, batch);
      if (cufft_error != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: iplan_0 creation failed %d \n", cufft_error);
        return NULL;
      }
      if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "Cuda error: Failed after iplan_0 creation\n");
        return NULL;
      }
    }
  }

  // fplan_1
  {
    int f_inembed[1] = {NY};
    int f_onembed[1] = {NY};
    int idist = 1;
    int odist = 1;
    int istride = osize_1[2];
    int ostride = osize_1[2];
    int batch = osize_1[2];

    if (batch != 0) {
      cufft_error = cufftPlanMany(&plan->fplan_1, 1, &n[1],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_Z2Z, batch);
      if (cufft_error != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: fplan_1 creation failed %d \n", cufft_error);
        return NULL;
      }
      if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "Cuda error: Failed after fplan_1 creation\n");
        return NULL;
      }
    }
  }

  // fplan_2
  {
    int f_inembed[1] = {NX};
    int f_onembed[1] = {NX};
    int idist = 1;
    int odist = 1;
    int istride = osize_2[1] * osize_2[2];
    int ostride = osize_2[1] * osize_2[2];
    int batch = osize_2[1] * osize_2[2];

    if (batch != 0) {
      cufft_error = cufftPlanMany(&plan->fplan_2, 1, &n[0],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_Z2Z, batch);
      if (cufft_error != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: fplan_2 creation failed %d \n", cufft_error);
        return NULL;
      }
      if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "Cuda error: Failed after fplan_2 creation\n");
        return NULL;
      }
    }
  }

  // 1D decomposition
  if (plan->oneD) {
    int N0 = n[0], N1 = n[1], N2 = n[2];
    plan->Mem_mgr = new Mem_Mgr_gpu<double>(N0, N1, n_tuples_o, c_comm);
    plan->T_plan_2 = new T_Plan_gpu<double>(N0, N1, n_tuples_o, plan->Mem_mgr, c_comm);
    plan->T_plan_2i = new T_Plan_gpu<double>(N1, N0, n_tuples_o, plan->Mem_mgr, c_comm);
    plan->T_plan_1 = NULL;
    plan->T_plan_1i = NULL;

    plan->alloc_max = alloc_max;
    plan->T_plan_2->alloc_local = alloc_max;
    plan->T_plan_2i->alloc_local = alloc_max;

    if (flags == ACCFFT_MEASURE) {
      plan->T_plan_2->which_fast_method_gpu(plan->T_plan_2, data_out_d);
    } else {
      plan->T_plan_2->method = 2;
      plan->T_plan_2->kway = 2;
    }
    checkCuda_accfft(cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    plan->T_plan_2i->method = -plan->T_plan_2->method;
    plan->T_plan_2i->kway = plan->T_plan_2->kway;
    plan->T_plan_2i->kway_async = plan->T_plan_2->kway_async;
  } // end 1d r2c

  // 2D decomposition
  if (!plan->oneD) {
    // the reason for n_tuples/2 is to avoid splitting the real and imaginary parts of complex numbers
    plan->Mem_mgr = new Mem_Mgr_gpu<double>(n[1], n_tuples_o / 2, 2, plan->row_comm, osize_0[0], alloc_max);
    plan->T_plan_1 = new T_Plan_gpu<double>(n[1], n_tuples_o / 2, 2, plan->Mem_mgr, plan->row_comm, osize_0[0]);
    plan->T_plan_2 = new T_Plan_gpu<double>(n[0], n[1], osize_2[2] * 2, plan->Mem_mgr, plan->col_comm);
    plan->T_plan_2i = new T_Plan_gpu<double>(n[1], n[0], osize_2i[2] * 2, plan->Mem_mgr, plan->col_comm);
    plan->T_plan_1i = new T_Plan_gpu<double>(n_tuples_o / 2, n[1], 2, plan->Mem_mgr, plan->row_comm, osize_1i[0]);

    plan->T_plan_1->alloc_local = plan->alloc_max;
    plan->T_plan_2->alloc_local = plan->alloc_max;
    plan->T_plan_2i->alloc_local = plan->alloc_max;
    plan->T_plan_1i->alloc_local = plan->alloc_max;

    if (flags == ACCFFT_MEASURE) {
      if (coord[0] == 0) {
        plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1, data_out_d, osize_0[0]);
      }
    } else {
      plan->T_plan_1->method = 2;
      plan->T_plan_1->kway = 2;
    }
    MPI_Bcast(&plan->T_plan_1->method, 1, MPI_INT, 0, c_comm);
    MPI_Bcast(&plan->T_plan_1->kway, 1, MPI_INT, 0, c_comm);
    MPI_Bcast(&plan->T_plan_1->kway_async, 1, MPI::BOOL, 0, c_comm);
    checkCuda_accfft(cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    // Propagate the method selected for T_plan_1 to the other transpose plans
    // (the negative sign marks the inverse direction).
    plan->T_plan_2->method = plan->T_plan_1->method;
    plan->T_plan_2i->method = -plan->T_plan_1->method;
    plan->T_plan_1i->method = -plan->T_plan_1->method;

    plan->T_plan_2->kway = plan->T_plan_1->kway;
    plan->T_plan_2i->kway = plan->T_plan_1->kway;
    plan->T_plan_1i->kway = plan->T_plan_1->kway;

    plan->T_plan_2->kway_async = plan->T_plan_1->kway_async;
    plan->T_plan_2i->kway_async = plan->T_plan_1->kway_async;
    plan->T_plan_1i->kway_async = plan->T_plan_1->kway_async;

    plan->iplan_1 = -1;
    plan->iplan_2 = -1;
  } // end 2d r2c

  plan->r2c_plan_baked = true;
  return plan;
} // end accfft_plan_dft_3d_r2c_gpu
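/* Padding note: the complex output of a real-to-complex transform has
 * n[2]/2 + 1 entries along the last dimension, i.e. 2*(n[2]/2+1) doubles.
 * An in-place plan therefore pads the real input to n_tuples_i = 2*(n[2]/2+1)
 * reals so the complex result fits over it, while an out-of-place plan keeps
 * n_tuples_i = n[2]. For example, n[2] = 8 gives 10 padded reals in place
 * versus 8 reals (and 5 complex outputs) out of place. */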
int accfft_local_size_dft_r2c_gpu_t(int *n, int *isize, int *istart, int *osize, int *ostart,
    MPI_Comm c_comm) {
  return accfft_local_size_dft_r2c_gpu(n, isize, istart, osize, ostart, c_comm);
}
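/* A minimal allocation sketch using the local-size query above (this mirrors
 * step1_gpu; the buffer names are illustrative):
 *
 *   int isize[3], istart[3], osize[3], ostart[3];
 *   int alloc_max = accfft_local_size_dft_r2c_gpu(n, isize, istart, osize, ostart, c_comm);
 *   double *data;       // real input, local pencil
 *   Complex *data_hat;  // complex output, needs alloc_max bytes
 *   cudaMalloc((void**) &data, isize[0] * isize[1] * isize[2] * sizeof(double));
 *   cudaMalloc((void**) &data_hat, alloc_max);
 */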