void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] ) { if (nrhs!=5) mexErrMsgTxt("Wrong number of arguments"); cufftHandle plan = (cufftHandle) mxGetScalar(prhs[0]); int nx = (int) mxGetScalar(prhs[1]); int ny = (int) mxGetScalar(prhs[2]); int nz = (int) mxGetScalar(prhs[3]); cufftType_t type = (cufftType_t) ((int)mxGetScalar(prhs[4])); cufftResult status = cufftPlan3d(&plan, nx, ny, nz, type); plhs[0] = mxCreateDoubleScalar(status); if (nlhs>1) plhs[1] = mxCreateDoubleScalar(plan); }
void fft3dGPU(T1* d_data, int nx, int ny, int nz, void* stream) { cufftHandle plan; cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL); if (cufftPlan3d(&plan, nz, ny, nx, CUFFT_C2C)!=CUFFT_SUCCESS) { fprintf(stderr, "CUFFT error: Plan creation failed"); } cufftSetStream(plan, (cudaStream_t) stream); cufftExecC2C(plan, (cufftComplex*) d_data, (cufftComplex*) d_data, CUFFT_FORWARD); cufftDestroy(plan); }
sararfftnd_plan sararfft3d_create_plan( int nx, int ny, int nz, sarafft_direction dir ) { #ifdef USE_GPUS sararfftnd_plan plan; cufftResult result = cufftPlan3d( &plan, nx, ny, nz, dir ); if( CUFFT_SUCCESS != result ) exit(64); // TODO better error handling (but to do that, the caller must be rewritten) return plan; #else // #ifndef USE_GPUS return rfftw3d_create_plan( nx, ny, nz, dir, FFTW_MEASURE | FFTW_IN_PLACE ); #endif }
/* * Class: jcuda_jcufft_JCufft * Method: cufftPlan3dNative * Signature: (Ljcuda/jcufft/JCufftHandle;IIII)I */ JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftPlan3dNative (JNIEnv *env, jclass cla, jobject handle, jint nx, jint ny, jint nz, jint type) { if (handle == NULL) { ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftPlan3d"); return JCUFFT_INTERNAL_ERROR; } Logger::log(LOG_TRACE, "Creating 3D plan for (%d, %d, %d) elements of type %d\n", nx, ny, nz, type); cufftHandle plan = env->GetIntField(handle, cufftHandle_plan); cufftResult result = cufftPlan3d(&plan, nx, ny, nz, getCufftType(type)); env->SetIntField(handle, cufftHandle_plan, plan); return result; }
sararfftnd_plan sararfft3d_create_plan( int nx, int ny, int nz, sarafft_direction dir ) { sararfftnd_plan plan; printf( "cufftPlan3d() about to start!\n" ); fflush ( stdout ); cufftResult result = cufftPlan3d( &plan, nx, ny, nz, dir ); if ( CUFFT_SUCCESS != result ) { printf( "cufftPlan3d() failed with code %d for dir=%d\n", result, dir ); fflush ( stdout ); exit( 84 ); // TODO better error handling (but to do that, the caller must be rewritten) } printf( "cufftPlan3d() succeeded!\n" ); fflush ( stdout ); setPlanSize ( plan, sizeof( sarafft_real ) * nx * ny * ( nz + 2 ) ); return plan; }
void fft3dGPU(T1* d_data, int nx, int ny, int nz, void* stream) { //printf("Running 3d forward xform \n"); cufftHandle plan; cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL); if (cufftPlan3d(&plan, nz, ny, nx, CUFFT_Z2Z)!=CUFFT_SUCCESS) { printf("CUFFT error: Plan creation failed\n"); } //printf("Built plan \n"); cufftSetStream(plan, (cudaStream_t) stream); if (cufftExecZ2Z(plan, (cufftDoubleComplex*) d_data, (cufftDoubleComplex*) d_data, CUFFT_FORWARD)!=CUFFT_SUCCESS) { printf("CUFFT error: Plan execution failed\n"); }; cufftDestroy(plan); }
void gpuNUFFT::GpuNUFFTOperator::initDeviceMemory(int n_coils) { if (gpuMemAllocated) return; gi_host = initAndCopyGpuNUFFTInfo();// int data_count = (int)this->kSpaceTraj.count(); IndType imdata_count = this->imgDims.count(); int sector_count = (int)this->gridSectorDims.count(); if (DEBUG) printf("allocate and copy data indices of size %d...\n",dataIndices.count()); allocateAndCopyToDeviceMem<IndType>(&data_indices_d,dataIndices.data,dataIndices.count()); if (DEBUG) printf("allocate and copy data of size %d...\n",data_count); allocateDeviceMem<DType2>(&data_sorted_d,data_count); if (DEBUG) printf("allocate and copy gdata of size %d...\n",gi_host->grid_width_dim); allocateDeviceMem<CufftType>(&gdata_d,gi_host->grid_width_dim); if (DEBUG) printf("allocate and copy coords of size %d...\n",getImageDimensionCount()*data_count); allocateAndCopyToDeviceMem<DType>(&crds_d,this->kSpaceTraj.data,getImageDimensionCount()*data_count); if (DEBUG) printf("allocate and copy kernel in const memory of size %d...\n",this->kernel.count()); initLookupTable(); //allocateAndCopyToDeviceMem<DType>(&kernel_d,kernel,kernel_count); if (DEBUG) printf("allocate and copy sectors of size %d...\n",sector_count+1); allocateAndCopyToDeviceMem<IndType>(§ors_d,this->sectorDataCount.data,sector_count+1); if (DEBUG) printf("allocate and copy sector_centers of size %d...\n",getImageDimensionCount()*sector_count); allocateAndCopyToDeviceMem<IndType>(§or_centers_d,(IndType*)this->getSectorCentersData(),getImageDimensionCount()*sector_count); if (this->applyDensComp()) { if (DEBUG) printf("allocate and copy density compensation of size %d...\n",data_count); allocateAndCopyToDeviceMem<DType>(&density_comp_d,this->dens.data,data_count); } if (this->applySensData()) { if (DEBUG) printf("allocate sens data of size %d...\n",imdata_count); allocateDeviceMem<DType2>(&sens_d,imdata_count); } if (n_coils > 1) { if (DEBUG) printf("allocate precompute deapofunction of size %d...\n",imdata_count); allocateDeviceMem<DType>(&deapo_d,imdata_count); precomputeDeapodization(deapo_d,gi_host); } if (DEBUG) printf("sector pad width: %d\n",gi_host->sector_pad_width); //Inverse fft plan and execution if (DEBUG) printf("creating cufft plan with %d,%d,%d dimensions\n",DEFAULT_VALUE(gi_host->gridDims.z),gi_host->gridDims.y,gi_host->gridDims.x); cufftResult res = cufftPlan3d(&fft_plan, (int)DEFAULT_VALUE(gi_host->gridDims.z),(int)gi_host->gridDims.y,(int)gi_host->gridDims.x, CufftTransformType) ; if (res != CUFFT_SUCCESS) fprintf(stderr,"error on CUFFT Plan creation!!! %d\n",res); gpuMemAllocated = true; }
struct fft_plan_3d *fft_3d_create_plan_cuda( MPI_Comm comm, int nfast, int nmid, int nslow, int in_ilo, int in_ihi, int in_jlo, int in_jhi, int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, int scaled, int permute, int *nbuf,bool ainit) { #ifdef FFT_CUFFT struct fft_plan_3d *plan; int me,nprocs; int i,num,flag,remapflag,fftflag; int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi; int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi; int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi; int out_size,first_size,second_size,third_size,copy_size,scratch_size; int np1,np2,ip1,ip2; int list[50]; // system specific variables // query MPI info MPI_Comm_rank(comm,&me); MPI_Comm_size(comm,&nprocs); #ifndef FFT_CUFFT error->all(FLERR,"ERROR: Trying to use cuda fft without FFT_CUFFT set. Recompile with make option 'cufft=1'."); #endif // compute division of procs in 2 dimensions not on-processor bifactor_cuda(nprocs,&np1,&np2); ip1 = me % np1; ip2 = me/np1; // in case of CUDA FFT every proc does the full FFT in order to avoid data transfers (the problem is other wise heavily bandwidth limited) int ip1out = ip1; int ip2out = ip2; int np1out = np1; int np2out = np2; ip1 = 0; ip2 = 0; np1 = 1; np2 = 1; // allocate memory for plan data struct plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d)); if (plan == NULL) return NULL; plan->init=ainit; // remap from initial distribution to layout needed for 1st set of 1d FFTs // not needed if all procs own entire fast axis initially // first indices = distribution after 1st set of FFTs if (in_ilo == 0 && in_ihi == nfast-1) flag = 0; else flag = 1; if(nprocs>1)flag=1; MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); if (remapflag == 0) { first_ilo = in_ilo; first_ihi = in_ihi; first_jlo = in_jlo; first_jhi = in_jhi; first_klo = in_klo; first_khi = in_khi; plan->pre_plan = NULL; } else { first_ilo = 0; first_ihi = nfast - 1; first_jlo = ip1*nmid/np1; first_jhi = (ip1+1)*nmid/np1 - 1; first_klo = ip2*nslow/np2; first_khi = (ip2+1)*nslow/np2 - 1; int members=2; if(plan->init) members=1; plan->pre_plan = remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, first_ilo,first_ihi,first_jlo,first_jhi, first_klo,first_khi, members,0,0,2,0); if (plan->pre_plan == NULL) return NULL; } // 1d FFTs along fast axis plan->length1 = nfast; plan->total1 = nfast * nmid * nslow; // remap from 1st to 2nd FFT // choose which axis is split over np1 vs np2 to minimize communication // second indices = distribution after 2nd set of FFTs second_ilo = ip1*nfast/np1; second_ihi = (ip1+1)*nfast/np1 - 1; second_jlo = 0; second_jhi = nmid - 1; second_klo = ip2*nslow/np2; second_khi = (ip2+1)*nslow/np2 - 1; plan->mid1_plan = remap_3d_create_plan(comm, first_ilo,first_ihi,first_jlo,first_jhi, first_klo,first_khi, second_ilo,second_ihi,second_jlo,second_jhi, second_klo,second_khi, 2,1,0,2,0); if (plan->mid1_plan == NULL) return NULL; // 1d FFTs along mid axis plan->length2 = nmid; plan->total2 = nfast * nmid * nslow; // remap from 2nd to 3rd FFT // if final distribution is permute=2 with all procs owning entire slow axis // then this remapping goes directly to final distribution // third indices = distribution after 3rd set of FFTs flag=1; MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); if (remapflag == 0) { third_ilo = out_ilo; third_ihi = out_ihi; third_jlo = out_jlo; third_jhi = out_jhi; third_klo = out_klo; third_khi = out_khi; } else { third_ilo = ip1*nfast/np1; third_ihi = (ip1+1)*nfast/np1 - 1; third_jlo = ip2*nmid/np2; third_jhi = (ip2+1)*nmid/np2 - 1; third_klo = 0; third_khi = nslow - 1; } plan->mid2_plan = remap_3d_create_plan(comm, second_jlo,second_jhi,second_klo,second_khi, second_ilo,second_ihi, third_jlo,third_jhi,third_klo,third_khi, third_ilo,third_ihi, 2,1,0,2,0); if (plan->mid2_plan == NULL) return NULL; // 1d FFTs along slow axis plan->length3 = nslow; plan->total3 = nfast * nmid * nslow; // remap from 3rd FFT to final distribution // not needed if permute = 2 and third indices = out indices on all procs flag=1; MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); if (remapflag == 0) plan->post_plan = NULL; else { plan->post_plan = remap_3d_create_plan(comm, third_klo,third_khi,third_ilo,third_ihi, third_jlo,third_jhi, out_klo,out_khi,out_ilo,out_ihi, out_jlo,out_jhi, 2,(permute+1)%3,0,2,0); if (plan->post_plan == NULL) return NULL; } // configure plan memory pointers and allocate work space // out_size = amount of memory given to FFT by user // first/second/third_size = amount of memory needed after pre,mid1,mid2 remaps // copy_size = amount needed internally for extra copy of data // scratch_size = amount needed internally for remap scratch space // for each remap: // out space used for result if big enough, else require copy buffer // accumulate largest required remap scratch space out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1); first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) * (first_khi-first_klo+1); second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) * (second_khi-second_klo+1); third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * (third_khi-third_klo+1); plan->ihi_out=out_ihi; plan->ilo_out=out_ilo; plan->jhi_out=out_jhi; plan->jlo_out=out_jlo; plan->khi_out=out_khi; plan->klo_out=out_klo; copy_size = 0; scratch_size = 0; if (plan->pre_plan) { if (first_size <= out_size) plan->pre_target = 0; else { plan->pre_target = 1; copy_size = MAX(copy_size,first_size); } scratch_size = MAX(scratch_size,first_size); } if (plan->mid1_plan) { if (second_size <= out_size) plan->mid1_target = 0; else { plan->mid1_target = 1; copy_size = MAX(copy_size,second_size); } scratch_size = MAX(scratch_size,second_size); } if (plan->mid2_plan) { if (third_size <= out_size) plan->mid2_target = 0; else { plan->mid2_target = 1; copy_size = MAX(copy_size,third_size); } scratch_size = MAX(scratch_size,third_size); } if (plan->post_plan) scratch_size = MAX(scratch_size,out_size); *nbuf = copy_size + scratch_size; if (copy_size) { plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA)); if (plan->copy == NULL) return NULL; } else plan->copy = NULL; if (scratch_size) { plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA)); if (plan->scratch == NULL) return NULL; } else plan->scratch = NULL; // system specific pre-computation of 1d FFT coeffs // and scaling normalization cufftResult retvalc; int nfft = (in_ihi-in_ilo+1) * (in_jhi-in_jlo+1) * (in_khi-in_klo+1); int nfft_brick = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1); int nfft_both = MAX(nfft,nfft_brick); nfft_both=nfast*nmid*nslow; plan->cudatasize=nfft_both*sizeof(FFT_DATA); //retvalc=cufftPlan1d(&(plan->plan_fast), nfast, CUFFT_PLAN,plan->total1/nfast); //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT1: %i\n",retvalc); plan->nfast=nfast; //retvalc=cufftPlan1d(&(plan->plan_mid), nmid, CUFFT_PLAN,plan->total2/nmid); //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT2: %i\n",retvalc); plan->nmid=nmid; //retvalc=cufftPlan1d(&(plan->plan_slow), nslow, CUFFT_PLAN,plan->total3/nslow); //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT3: %i\n",retvalc); plan->nslow=nslow; retvalc=cufftPlan3d(&(plan->plan_3d), nslow,nmid,nfast, CUFFT_PLAN); if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT3: %i\n",retvalc); plan->nprocs=nprocs; plan->me=me; if (scaled == 0) plan->scaled = 0; else { plan->scaled = 1; plan->norm = 1.0/(nfast*nmid*nslow); plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1); } plan->coretime=0; plan->iterate=0; plan->ffttime=0; return plan; #endif }
cufftResult WINAPI wine_cufftPlan3d(cufftHandle *plan, int nx, int ny, int nz, cufftType type){ WINE_TRACE("\n"); return cufftPlan3d( plan, nx, ny, nz, type ); }
/* 3D FFT for field generation */ void make_field_3dfft(int _fielddim) { std::chrono::time_point<std::chrono::high_resolution_clock> time1, time2, time3; time1 = std::chrono::system_clock::now(); double PLindex; // int nr_3Dfields, fielddim, index, mod_fielddim; int neg_k,neg_i,neg_j; cufftDoubleComplex *host_array; cufftDoubleComplex *device_array; cufftHandle back_plan; PLindex = atof(test_fftw[1]); int fielddim = _fielddim;//atoi(test_fftw[2]); int nr_3Dfields = atoi(test_fftw[3]); int mod_fielddim = fielddim+1; size_t nr_points = (fielddim+1)*(fielddim+1)*(fielddim+1); size_t num_bytes = nr_points * sizeof(cufftDoubleComplex); // mem allocation host_array = (cufftDoubleComplex*) malloc(num_bytes); //array_in = (fftw_complex*) fftw_malloc(nr_points * sizeof(fftw_complex)); checkCudaErrors(cudaMalloc((void **)&device_array, num_bytes)); //back_plan = fftw_plan_dft_3d(mod_fielddim, mod_fielddim, mod_fielddim,array_in, array_in, FFTW_BACKWARD , FFTW_MEASURE); cufftPlan3d(&back_plan, mod_fielddim, mod_fielddim, mod_fielddim, CUFFT_Z2Z); //array_in, array_in, FFTW_BACKWARD , FFTW_MEASURE); cout << endl <<"fiel_dim " << mod_fielddim << endl; for(int m=0; m<nr_3Dfields; m++){ cout << endl <<"field #" << m << endl; for(int i=-fielddim/2;i<=fielddim/2;i++){ //fill the data that goes into the FFT for(int j=-fielddim/2;j<=fielddim/2;j++){ for(int k=0;k<=fielddim/2;k++){ //positive k parts double abs_k = sqrt((double)(i*i+j*j+k*k)); double A = 1000.0*pow(abs_k,(PLindex*0.5)); double phase=(2.0*rand_01()-1.0)*PI; int index = (i+fielddim/2)*mod_fielddim*mod_fielddim+(j+fielddim/2)*mod_fielddim+k+fielddim/2; //array_in[index]=A*cos(phase)+I*A*sin(phase); host_array[index].x = A*cos(phase); host_array[index].y = A*sin(phase); //negative k parts //phase=-1.0*phase; neg_i =- 1*i; neg_j =- 1*j; neg_k =- 1*k; index = (neg_i+fielddim/2)*mod_fielddim*mod_fielddim+(neg_j+fielddim/2)*mod_fielddim+neg_k+fielddim/2; //array_in[index]=A*cos(phase)+I*A*sin(-1.0*phase); host_array[index].x = A*cos(phase); host_array[index].y = A*sin(-1.0*phase); } } } host_array[(fielddim/2)*mod_fielddim*mod_fielddim+(fielddim/2)*mod_fielddim+fielddim/2].x = 1.0; host_array[(fielddim/2)*mod_fielddim*mod_fielddim+(fielddim/2)*mod_fielddim+fielddim/2].y = 0.0; //Rearange array for use in FFTW library for(int j=0;j<=nr_points/2;j++){ cufftDoubleComplex temp = host_array[j]; host_array[j] = host_array[j+nr_points/2]; host_array[j+nr_points/2] = temp; } time2 = std::chrono::system_clock::now(); // move data to the GPU cudaMemcpy(device_array, host_array, num_bytes, cudaMemcpyDeviceToHost); /// fftw_execute(back_plan); cufftExecZ2Z(back_plan, device_array, device_array, FFTW_BACKWARD); // move back cudaMemcpy(host_array, device_array, num_bytes, cudaMemcpyHostToDevice); time3 = std::chrono::system_clock::now(); auto s1 = std::chrono::duration_cast<std::chrono::milliseconds>(time2-time1).count(); auto s2 = std::chrono::duration_cast<std::chrono::milliseconds>(time3-time2).count(); auto time_total = s1 + s2; cout << endl << "--- 3D FFT make_field ---" << endl << "1. " << s1 << " ms" << endl << "2. " << s2 << " ms" << endl << "---" << endl << "total time " << time_total << " ms" << endl << "---" << endl; //for(i=0;i<nr_points;i++) printf("%g + i*%g\n",creal(array_in[i]),cimag(array_in[i])); /* for(int i=0;i<mod_fielddim;i++){ for(int j=0;j<mod_fielddim;j++){ for(int k=0;k<mod_fielddim;k++){ printf("%.15lf\n", host_array[i*mod_fielddim*mod_fielddim+j*mod_fielddim+k].x); } } } */ } // print some elemntes for debug double *print_array = (double*) host_array; print_double(print_array, 100); // first 100 elements print_double(print_array+(2*nr_points-100), 100); // last 100 elements /* fftw_free(array_in); fftw_destroy_plan(back_plan); */ cufftDestroy(back_plan); cudaFree(device_array); free(host_array); }