/** * Execute C2C plan. This function is blocking and only returns after the transform is completed. * @note For inplace transforms, data_out should point to the same memory address as data, AND * the plan must have been created as inplace. * @param plan FFT plan created by \ref accfft_plan_dft_3d_r2c. * @param data Input data in frequency domain. * @param data_out Output data in frequency domain. * @param timer See \ref timer for more details. * @param XYZ a bit set field that determines which directions FFT should be executed */ void accfft_execute_c2c_gpu(accfft_plan_gpu* plan, int direction,Complex * data_d, Complex * data_out_d, double * timer,std::bitset<3> xyz){ if(!plan->c2c_plan_baked){ if(plan->procid==0) std::cout<<"Error. r2c plan has not been made correctly. Please first create the plan before calling execute functions."<<std::endl; return; } if(data_d==NULL) data_d=plan->data_c; if(data_out_d==NULL) data_out_d=plan->data_out_c; int * coords=plan->coord; int procid=plan->procid; double fft_time=0; double timings[5]={0}; int NY=plan->N[1]; cudaEvent_t memcpy_startEvent, memcpy_stopEvent; cudaEvent_t fft_startEvent, fft_stopEvent; checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) ); checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) ); checkCuda_accfft( cudaEventCreate(&fft_startEvent) ); checkCuda_accfft( cudaEventCreate(&fft_stopEvent) ); cufftResult_t cufft_error; float dummy_time=0; int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0; int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1; int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2; int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i; int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i; if(direction==-1){ /**************************************************************/ /******************* N0/P0 x N1/P1 x N2 **********************/ /**************************************************************/ // FFT in Z direction if(xyz[2]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft(cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } else data_out_d=data_d; if(!plan->oneD){ plan->T_plan_1->execute_gpu(plan->T_plan_1,(double*)data_out_d,timings,2,osize_0[0],coords[0]); } checkCuda_accfft (cudaDeviceSynchronize()); MPI_Barrier(plan->c_comm); /**************************************************************/ /******************* N0/P0 x N1 x N2/P1 **********************/ /**************************************************************/ if(xyz[1]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); for (int i=0;i<osize_1[0];++i){ checkCuda_accfft(cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_out_d[i*osize_1[1]*osize_1[2]], (cufftDoubleComplex*)&data_out_d[i*osize_1[1]*osize_1[2]],CUFFT_FORWARD)); } checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } MPI_Barrier(plan->c_comm); if(plan->oneD){ plan->T_plan_2->execute_gpu(plan->T_plan_2,(double*)data_out_d,timings,2); } else{ plan->T_plan_2->execute_gpu(plan->T_plan_2,(double*)data_out_d,timings,2,1,coords[1]); } checkCuda_accfft (cudaDeviceSynchronize()); MPI_Barrier(plan->c_comm); /**************************************************************/ /******************* N0 x N1/P0 x N2/P1 **********************/ /**************************************************************/ if(xyz[0]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft(cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } } else if (direction==1){ if(xyz[0]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_d,CUFFT_INVERSE)); checkCuda_accfft (cudaDeviceSynchronize()); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; MPI_Barrier(plan->c_comm); } if(plan->oneD){ plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(double*)data_d,timings,1); } else{ plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(double*)data_d,timings,1,1,coords[1]); } checkCuda_accfft (cudaDeviceSynchronize()); MPI_Barrier(plan->c_comm); /**************************************************************/ /******************* N0/P0 x N1 x N2/P1 **********************/ /**************************************************************/ if(xyz[1]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); for (int i=0;i<osize_1i[0];++i){ checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_d[i*NY*osize_1i[2]], (cufftDoubleComplex*)&data_d[i*NY*osize_1i[2]],CUFFT_INVERSE)); } checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; MPI_Barrier(plan->c_comm); } if(!plan->oneD){ plan->T_plan_1i->execute_gpu(plan->T_plan_1i,(double*)data_d,timings,1,osize_1i[0],coords[0]); } checkCuda_accfft (cudaDeviceSynchronize()); MPI_Barrier(plan->c_comm); /**************************************************************/ /******************* N0/P0 x N1/P1 x N2 **********************/ /**************************************************************/ if(xyz[2]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleComplex*)data_out_d,CUFFT_INVERSE)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } else data_out_d=data_d; } timings[4]+=fft_time; if(timer==NULL){ //delete [] timings; } else{ timer[0]+=timings[0]; timer[1]+=timings[1]; timer[2]+=timings[2]; timer[3]+=timings[3]; timer[4]+=timings[4]; } MPI_Barrier(plan->c_comm); return; }// end accfft_execute_c2c_gpu
void accfft_execute_gpu(accfft_plan_gpu* plan, int direction,double * data_d, double * data_out_d, double * timer,std::bitset<3> xyz){ if(data_d==NULL) data_d=plan->data; if(data_out_d==NULL) data_out_d=plan->data_out; int * coords=plan->coord; int procid=plan->procid; double fft_time=0; double timings[5]={0}; cudaEvent_t memcpy_startEvent, memcpy_stopEvent; cudaEvent_t fft_startEvent, fft_stopEvent; checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) ); checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) ); checkCuda_accfft( cudaEventCreate(&fft_startEvent) ); checkCuda_accfft( cudaEventCreate(&fft_stopEvent) ); int NY=plan->N[1]; float dummy_time=0; int *osize_0 =plan->osize_0;// *ostart_0 =plan->ostart_0; int *osize_1 =plan->osize_1;// *ostart_1 =plan->ostart_1; //int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2; int *osize_1i=plan->osize_1i;//*ostart_1i=plan->ostart_1i; //int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i; if(direction==-1){ /**************************************************************/ /******************* N0/P0 x N1/P1 x N2 **********************/ /**************************************************************/ // FFT in Z direction if(xyz[2]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecD2Z(plan->fplan_0,(cufftDoubleReal*)data_d, (cufftDoubleComplex*)data_out_d)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } else data_out_d=data_d; // Perform N0/P0 transpose if(!plan->oneD){ plan->T_plan_1->execute_gpu(plan->T_plan_1,data_out_d,timings,2,osize_0[0],coords[0]); } /**************************************************************/ /******************* N0/P0 x N1 x N2/P1 **********************/ /**************************************************************/ if(xyz[1]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); for (int i=0;i<osize_1[0];++i){ checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]], (cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]],CUFFT_FORWARD)); } checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } if(plan->oneD){ plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2); } else{ plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2,1,coords[1]); } /**************************************************************/ /******************* N0 x N1/P0 x N2/P1 **********************/ /**************************************************************/ MPI_Barrier(plan->c_comm); if(xyz[0]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } } else if (direction==1){ if(xyz[0]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_d,CUFFT_INVERSE)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } if(plan->oneD){ plan->T_plan_2i->execute_gpu(plan->T_plan_2i,data_d,timings,1); } else{ plan->T_plan_2i->execute_gpu(plan->T_plan_2i,data_d,timings,1,1,coords[1]); } MPI_Barrier(plan->c_comm); /**************************************************************/ /******************* N0/P0 x N1 x N2/P1 **********************/ /**************************************************************/ if(xyz[1]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); for (int i=0;i<osize_1i[0];++i){ checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]], (cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]],CUFFT_INVERSE)); } checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } if(!plan->oneD){ plan->T_plan_1i->execute_gpu(plan->T_plan_1i,data_d,timings,1,osize_1i[0],coords[0]); } /**************************************************************/ /******************* N0/P0 x N1/P1 x N2 **********************/ /**************************************************************/ // IFFT in Z direction if(xyz[2]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecZ2D(plan->iplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleReal*)data_out_d)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } else data_out_d=data_d; } timings[4]+=fft_time; if(timer==NULL){ //delete [] timings; } else{ timer[0]+=timings[0]; timer[1]+=timings[1]; timer[2]+=timings[2]; timer[3]+=timings[3]; timer[4]+=timings[4]; } checkCuda_accfft (cudaDeviceSynchronize()); MPI_Barrier(plan->c_comm); return; }// end accfft_execute_gpu
/** * Creates a 3D C2C parallel FFT plan. If data_out point to the same location as the input * data, then an inplace plan will be created. Otherwise the plan would be outplace. * @param n Integer array of size 3, corresponding to the global data size * @param data Input data in spatial domain * @param data_out Output data in frequency domain * @param c_comm Cartesian communicator returned by \ref accfft_create_comm * @param flags AccFFT flags, See \ref flags for more details. * @return */ accfft_plan_gpu* accfft_plan_dft_3d_c2c_gpu(int * n, Complex * data_d, Complex * data_out_d, MPI_Comm c_comm,unsigned flags){ accfft_plan_gpu *plan=new accfft_plan_gpu; int nprocs, procid; MPI_Comm_rank(c_comm, &procid); plan->procid=procid; MPI_Cart_get(c_comm,2,plan->np,plan->periods,plan->coord); plan->c_comm=c_comm; int *coord=plan->coord; MPI_Comm_split(c_comm,coord[0],coord[1],&plan->row_comm); MPI_Comm_split(c_comm,coord[1],coord[0],&plan->col_comm); plan->N[0]=n[0];plan->N[1]=n[1];plan->N[2]=n[2]; int NX=n[0], NY=n[1], NZ=n[2]; cufftResult_t cufft_error; plan->data_c=data_d; plan->data_out_c=data_out_d; if(data_out_d==data_d){ plan->inplace=true;} else{plan->inplace=false;} if(plan->np[1]==1) plan->oneD=true; else plan->oneD=false; int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0; int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1; int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2; int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i; int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i; int alloc_local; int alloc_max=0,n_tuples=n[2]*2; //int isize[3],osize[3],istart[3],ostart[3]; alloc_max=accfft_local_size_dft_c2c_gpu(n,plan->isize,plan->istart,plan->osize,plan->ostart,c_comm); plan->alloc_max=alloc_max; dfft_get_local_size_gpu(n[0],n[1],n[2],osize_0,ostart_0,c_comm); dfft_get_local_size_gpu(n[0],n[2],n[1],osize_1,ostart_1,c_comm); dfft_get_local_size_gpu(n[1],n[2],n[0],osize_2,ostart_2,c_comm); std::swap(osize_1[1],osize_1[2]); std::swap(ostart_1[1],ostart_1[2]); std::swap(ostart_2[1],ostart_2[2]); std::swap(ostart_2[0],ostart_2[1]); std::swap(osize_2[1],osize_2[2]); std::swap(osize_2[0],osize_2[1]); for(int i=0;i<3;i++){ osize_1i[i]=osize_1[i]; osize_2i[i]=osize_2[i]; ostart_1i[i]=ostart_1[i]; ostart_2i[i]=ostart_2[i]; } // fplan_0 { int f_inembed[1]={NZ}; int f_onembed[1]={NZ}; int idist=(NZ); int odist=(NZ); int istride=1; int ostride=1; int batch=osize_0[0]*osize_0[1];//NX; if(batch!=0) { cufft_error=cufftPlanMany(&plan->fplan_0, 1, &n[2], f_inembed, istride, idist, // *inembed, istride, idist f_onembed, ostride, odist, // *onembed, ostride, odist CUFFT_Z2Z, batch); if(cufft_error!= CUFFT_SUCCESS) { fprintf(stderr, "CUFFT error: fplan_0 creation failed %d \n",cufft_error); return NULL; } //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;} } } // fplan_1 { int f_inembed[1]={NY}; int f_onembed[1]={NY}; int idist=1; int odist=1; int istride=osize_1[2]; int ostride=osize_1[2]; int batch=osize_1[2]; if(batch!=0) { cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[1], f_inembed, istride, idist, // *inembed, istride, idist f_onembed, ostride, odist, // *onembed, ostride, odist CUFFT_Z2Z, batch); if(cufft_error!= CUFFT_SUCCESS) { fprintf(stderr, "CUFFT error: fplan_1 creation failed %d \n",cufft_error); return NULL; } //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;} } } // fplan_2 { int f_inembed[1]={NX}; int f_onembed[1]={NX}; int idist=1; int odist=1; int istride=osize_2[1]*osize_2[2]; int ostride=osize_2[1]*osize_2[2]; int batch=osize_2[1]*osize_2[2];; if(batch!=0) { cufft_error=cufftPlanMany(&plan->fplan_2, 1, &n[0], f_inembed, istride, idist, // *inembed, istride, idist f_onembed, ostride, odist, // *onembed, ostride, odist CUFFT_Z2Z, batch); if(cufft_error!= CUFFT_SUCCESS) { fprintf(stderr, "CUFFT error: fplan_2 creation failed %d \n",cufft_error); return NULL; } //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;} } } // 1D Decomposition if (plan->oneD){ int NX=n[0],NY=n[1],NZ=n[2]; plan->alloc_max=alloc_max; plan->Mem_mgr= new Mem_Mgr_gpu <double>(NX,NY,(NZ)*2,c_comm); plan->T_plan_2= new T_Plan_gpu <double>(NX,NY,(NZ)*2, plan->Mem_mgr,c_comm); plan->T_plan_2i= new T_Plan_gpu<double>(NY,NX,NZ*2, plan->Mem_mgr,c_comm); plan->T_plan_2->alloc_local=alloc_max; plan->T_plan_2i->alloc_local=alloc_max; plan->T_plan_1=NULL; plan->T_plan_1i=NULL; if(flags==ACCFFT_MEASURE){ plan->T_plan_2->which_fast_method_gpu(plan->T_plan_2,(double*)data_out_d); } else{ plan->T_plan_2->method=2; plan->T_plan_2->kway=2; } checkCuda_accfft (cudaDeviceSynchronize()); MPI_Barrier(plan->c_comm); plan->T_plan_2i->method=-plan->T_plan_2->method; plan->T_plan_2i->kway=plan->T_plan_2->kway; plan->T_plan_2i->kway_async=plan->T_plan_2->kway_async; }// end 1d c2c // 2D Decomposition if (!plan->oneD){ // the reaseon for n_tuples/2 is to avoid splitting of imag and real parts of complex numbers plan->Mem_mgr= new Mem_Mgr_gpu<double>(n[1],n[2],2,plan->row_comm,osize_0[0],alloc_max); plan->T_plan_1= new T_Plan_gpu<double>(n[1],n[2],2, plan->Mem_mgr, plan->row_comm,osize_0[0]); plan->T_plan_2= new T_Plan_gpu<double>(n[0],n[1],2*osize_2[2], plan->Mem_mgr, plan->col_comm); plan->T_plan_2i= new T_Plan_gpu<double>(n[1],n[0],2*osize_2i[2], plan->Mem_mgr, plan->col_comm); plan->T_plan_1i= new T_Plan_gpu<double>(n[2],n[1],2, plan->Mem_mgr, plan->row_comm,osize_1i[0]); plan->T_plan_1->alloc_local=plan->alloc_max; plan->T_plan_2->alloc_local=plan->alloc_max; plan->T_plan_2i->alloc_local=plan->alloc_max; plan->T_plan_1i->alloc_local=plan->alloc_max; plan->iplan_0=NULL; plan->iplan_1=NULL; plan->iplan_2=NULL; int coords[2],np[2],periods[2]; MPI_Cart_get(c_comm,2,np,periods,coords); if(flags==ACCFFT_MEASURE){ if(coords[0]==0){ plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,(double*)data_out_d,osize_0[0]); } } else{ plan->T_plan_1->method=2; plan->T_plan_1->kway=2; } MPI_Bcast(&plan->T_plan_1->method,1, MPI_INT,0, c_comm ); MPI_Bcast(&plan->T_plan_1->kway,1, MPI_INT,0, c_comm ); MPI_Bcast(&plan->T_plan_1->kway_async,1, MPI::BOOL,0, c_comm ); checkCuda_accfft (cudaDeviceSynchronize()); MPI_Barrier(plan->c_comm); plan->T_plan_1->method =plan->T_plan_1->method; plan->T_plan_2->method =plan->T_plan_1->method; plan->T_plan_2i->method=-plan->T_plan_1->method; plan->T_plan_1i->method=-plan->T_plan_1->method; plan->T_plan_1->kway =plan->T_plan_1->kway; plan->T_plan_2->kway =plan->T_plan_1->kway; plan->T_plan_2i->kway=plan->T_plan_1->kway; plan->T_plan_1i->kway=plan->T_plan_1->kway; plan->T_plan_1->kway_async =plan->T_plan_1->kway_async; plan->T_plan_2->kway_async =plan->T_plan_1->kway_async; plan->T_plan_2i->kway_async=plan->T_plan_1->kway_async; plan->T_plan_1i->kway_async=plan->T_plan_1->kway_async; }// end 2d c2c plan->c2c_plan_baked=true; return plan; }// end accfft_plan_dft_c2c_gpu
/** * Creates a 3D single precision R2C parallel FFT plan. If data_out point to the same location as the input * data, then an inplace plan will be created. Otherwise the plan would be outplace. * @param n Integer array of size 3, corresponding to the global data size * @param data Input data in spatial domain * @param data_out Output data in frequency domain * @param c_comm Cartesian communicator returned by \ref accfft_create_comm * @param flags AccFFT flags, See \ref flags for more details. * @return */ accfft_plan_gpuf* accfft_plan_dft_3d_r2c_gpuf(int * n, float * data_d,float * data_out_d, MPI_Comm c_comm,unsigned flags){ accfft_plan_gpuf *plan=new accfft_plan_gpuf; int procid; MPI_Comm_rank(c_comm, &procid); plan->procid=procid; MPI_Cart_get(c_comm,2,plan->np,plan->periods,plan->coord); plan->c_comm=c_comm; int *coord=plan->coord; MPI_Comm_split(c_comm,coord[0],coord[1],&plan->row_comm); MPI_Comm_split(c_comm,coord[1],coord[0],&plan->col_comm); plan->N[0]=n[0];plan->N[1]=n[1];plan->N[2]=n[2]; plan->data=data_d; plan->data_out=data_out_d; if(plan->np[1]==1) plan->oneD=true; else plan->oneD=false; if(data_out_d==data_d){ plan->inplace=true;} else{plan->inplace=false;} int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0; int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1; int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2; int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i; int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i; int alloc_max=0; int n_tuples_i, n_tuples_o; //plan->inplace==true ? n_tuples=(n[2]/2+1)*2: n_tuples=n[2]*2; plan->inplace==true ? n_tuples_i=(n[2]/2+1)*2: n_tuples_i=n[2]; n_tuples_o=(n[2]/2+1)*2; int isize[3],osize[3],istart[3],ostart[3]; alloc_max=accfft_local_size_dft_r2c_gpuf(n,isize,istart,osize,ostart,c_comm,plan->inplace); plan->alloc_max=alloc_max; dfft_get_local_size_gpuf(n[0],n[1],n_tuples_o,osize_0,ostart_0,c_comm); dfft_get_local_size_gpuf(n[0],n_tuples_o/2,n[1],osize_1,ostart_1,c_comm); dfft_get_local_size_gpuf(n[1],n_tuples_o/2,n[0],osize_2,ostart_2,c_comm); std::swap(osize_1[1],osize_1[2]); std::swap(ostart_1[1],ostart_1[2]); std::swap(ostart_2[1],ostart_2[2]); std::swap(ostart_2[0],ostart_2[1]); std::swap(osize_2[1],osize_2[2]); std::swap(osize_2[0],osize_2[1]); for(int i=0;i<3;i++){ osize_1i[i]=osize_1[i]; osize_2i[i]=osize_2[i]; ostart_1i[i]=ostart_1[i]; ostart_2i[i]=ostart_2[i]; } // fplan_0 int NX=n[0], NY=n[1], NZ=n[2]; cufftResult_t cufft_error; { int f_inembed[1]={n_tuples_i}; int f_onembed[1]={n_tuples_o/2}; int idist=(n_tuples_i); int odist=n_tuples_o/2; int istride=1; int ostride=1; int batch=osize_0[0]*osize_0[1];//NX; if(batch!=0) { cufft_error=cufftPlanMany(&plan->fplan_0, 1, &n[2], f_inembed, istride, idist, // *inembed, istride, idist f_onembed, ostride, odist, // *onembed, ostride, odist CUFFT_R2C, batch); if(cufft_error!= CUFFT_SUCCESS) { fprintf(stderr, "CUFFT error: fplan_0 creation failed %d \n",cufft_error); return NULL; } //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;} } if(batch!=0) { cufft_error=cufftPlanMany(&plan->iplan_0, 1, &n[2], f_onembed, ostride, odist, // *onembed, ostride, odist f_inembed, istride, idist, // *inembed, istride, idist CUFFT_C2R, batch); if(cufft_error!= CUFFT_SUCCESS) { fprintf(stderr, "CUFFT error: iplan_0 creation failed %d \n",cufft_error); return NULL; } //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;} } } // fplan_1 { int f_inembed[1]={NY}; int f_onembed[1]={NY}; int idist=1; int odist=1; int istride=osize_1[2]; int ostride=osize_1[2]; int batch=osize_1[2]; if(batch!=0) { cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[1], f_inembed, istride, idist, // *inembed, istride, idist f_onembed, ostride, odist, // *onembed, ostride, odist CUFFT_C2C, batch); if(cufft_error!= CUFFT_SUCCESS) { fprintf(stderr, "CUFFT error: fplan_1 creation failed %d \n",cufft_error); return NULL; } //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;} } } // fplan_2 { int f_inembed[1]={NX}; int f_onembed[1]={NX}; int idist=1; int odist=1; int istride=osize_2[1]*osize_2[2]; int ostride=osize_2[1]*osize_2[2]; int batch=osize_2[1]*osize_2[2];; if(batch!=0) { cufft_error=cufftPlanMany(&plan->fplan_2, 1, &n[0], f_inembed, istride, idist, // *inembed, istride, idist f_onembed, ostride, odist, // *onembed, ostride, odist CUFFT_C2C, batch); if(cufft_error!= CUFFT_SUCCESS) { fprintf(stderr, "CUFFT error: fplan_2 creation failed %d \n",cufft_error); return NULL; } //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;} } } // 1D Decomposition if(plan->oneD){ int N0=n[0], N1=n[1], N2=n[2]; plan->Mem_mgr = new Mem_Mgr_gpu<float>(N0,N1,n_tuples_o,c_comm); plan->T_plan_2 = new T_Plan_gpu <float>(N0,N1,n_tuples_o, plan->Mem_mgr, c_comm); plan->T_plan_2i= new T_Plan_gpu <float>(N1,N0,n_tuples_o,plan->Mem_mgr, c_comm); plan->T_plan_1=NULL; plan->T_plan_1i=NULL; plan->alloc_max=alloc_max; plan->T_plan_2->alloc_local=alloc_max; plan->T_plan_2i->alloc_local=alloc_max; if(flags==ACCFFT_MEASURE){ plan->T_plan_2->which_fast_method_gpu(plan->T_plan_2,data_out_d); } else{ plan->T_plan_2->method=2; plan->T_plan_2->kway=2; } checkCuda_accfft (cudaDeviceSynchronize()); MPI_Barrier(plan->c_comm); plan->T_plan_2->method =plan->T_plan_2->method; plan->T_plan_2i->method=plan->T_plan_2->method; plan->T_plan_2->kway =plan->T_plan_2->kway; plan->T_plan_2i->kway=plan->T_plan_2->kway; } // 2D Decomposition if (!plan->oneD){ // the reaseon for n_tuples/2 is to avoid splitting of imag and real parts of complex numbers plan->Mem_mgr =new Mem_Mgr_gpu<float>(n[1],n_tuples_o/2,2,plan->row_comm,osize_0[0],alloc_max); plan->T_plan_1 = new T_Plan_gpu<float>(n[1],n_tuples_o/2,2, plan->Mem_mgr, plan->row_comm,osize_0[0]); plan->T_plan_2 = new T_Plan_gpu<float>(n[0],n[1],osize_2[2]*2,plan->Mem_mgr, plan->col_comm); plan->T_plan_2i= new T_Plan_gpu<float>(n[1],n[0],osize_2i[2]*2, plan->Mem_mgr, plan->col_comm); plan->T_plan_1i= new T_Plan_gpu<float>(n_tuples_o/2,n[1],2, plan->Mem_mgr, plan->row_comm,osize_1i[0]); plan->T_plan_1->alloc_local=plan->alloc_max; plan->T_plan_2->alloc_local=plan->alloc_max; plan->T_plan_2i->alloc_local=plan->alloc_max; plan->T_plan_1i->alloc_local=plan->alloc_max; if(flags==ACCFFT_MEASURE){ if(coord[0]==0){ plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,data_out_d,osize_0[0]); } } else{ plan->T_plan_1->method=2; plan->T_plan_1->kway=2; } MPI_Bcast(&plan->T_plan_1->method,1, MPI_INT,0, c_comm ); MPI_Bcast(&plan->T_plan_1->kway,1, MPI_INT,0, c_comm ); checkCuda_accfft (cudaDeviceSynchronize()); MPI_Barrier(plan->c_comm); plan->T_plan_1->method =plan->T_plan_1->method; plan->T_plan_2->method =plan->T_plan_1->method; plan->T_plan_2i->method=plan->T_plan_1->method; plan->T_plan_1i->method=plan->T_plan_1->method; plan->T_plan_1->kway =plan->T_plan_1->kway; plan->T_plan_2->kway =plan->T_plan_1->kway; plan->T_plan_2i->kway=plan->T_plan_1->kway; plan->T_plan_1i->kway=plan->T_plan_1->kway; plan->iplan_1=-1; plan->iplan_2=-1; } plan->r2c_plan_baked=true; return plan; }