/* * Class: jcuda_jcufft_JCufft * Method: cufftExecZ2ZNative * Signature: (Ljcuda/jcufft/cufftHandle;Ljcuda/Pointer;Ljcuda/Pointer;I)I */ JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftExecZ2ZNative (JNIEnv *env, jclass cla, jobject handle, jobject cIdata, jobject cOdata, jint direction) { if (handle == NULL) { ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftExecZ2Z"); return JCUFFT_INTERNAL_ERROR; } if (cIdata == NULL) { ThrowByName(env, "java/lang/NullPointerException", "Parameter 'cIdata' is null for cufftExecZ2Z"); return JCUFFT_INTERNAL_ERROR; } if (cOdata == NULL) { ThrowByName(env, "java/lang/NullPointerException", "Parameter 'cOdata' is null for cufftExecZ2Z"); return JCUFFT_INTERNAL_ERROR; } Logger::log(LOG_TRACE, "Executing cufftExecZ2Z\n"); cufftHandle nativePlan = env->GetIntField(handle, cufftHandle_plan); cufftDoubleComplex* nativeCIData = (cufftDoubleComplex*)getPointer(env, cIdata); cufftDoubleComplex* nativeCOData = (cufftDoubleComplex*)getPointer(env, cOdata); cufftResult result = cufftExecZ2Z(nativePlan, nativeCIData, nativeCOData, direction); return result; }
extern "C" void cuda_fft(double *d_data, int Lx, int Ny, void *stream) { cufftHandle plan; cufftPlan1d(&plan, Lx, CUFFT_Z2Z, Ny); cufftSetStream(plan, (cudaStream_t)stream); cufftExecZ2Z(plan, (cufftDoubleComplex*)d_data, (cufftDoubleComplex*)d_data,CUFFT_FORWARD); cufftDestroy(plan); }
/* * Function to be called in thread managing host operations and invoking kernels */ void* host_thread(void* passing_ptr) { DataArray* data_arr_ptr = (DataArray*) passing_ptr; alloc_data_host(data_arr_ptr); printf("data allocated by host thread\n"); //printf("data filling by host thread\n"); for (uint64_t ii = 0; ii < data_arr_ptr->size; ii++) { (*(data_arr_ptr->data_r))[ii] = ii; (*(data_arr_ptr->data_k))[ii] = data_arr_ptr->size-ii; } printf("data filled by host thread\n"); // synchronize after allocating memory - streams should be created, mem on device ready for copying pthread_barrier_wait (&barrier); printf("1st barier host thread - allocating mem on cpu\n"); // here we can make cufft plan, for example cufftHandle plan_forward; cufftPlan1d(&plan_forward, N, CUFFT_Z2Z, 1); // synchornize after ... - data should be copyied on device pthread_barrier_wait (&barrier); printf("2nd barier host thread - \n"); // run some computations cufftExecZ2Z(plan_forward, *(data_arr_ptr->data_r_dev), *(data_arr_ptr->data_k_dev), CUFFT_FORWARD); printf("cufft done\n"); // synchornize after computations - cudaDeviceSynchronize(); // should be used on pthread_barrier_wait (&barrier); printf("3rd barier host thread - \n"); // synchornize after computations - pthread_barrier_wait (&barrier); printf("4th barier host thread - \n"); printf("data visible in host thread:\n"); /*for (uint64_t ii = 0; ii < (data_arr_ptr->size <= 32) ? data_arr_ptr->size : 32 ; ii++) { printf("%lu.\t",ii); printf("%lf + %lfj\t", creal( (*(data_arr_ptr->data_r))[ii] ), cimag( (*(data_arr_ptr->data_r))[ii] )); printf("%lf + %lfj\n", creal( (*(data_arr_ptr->data_k))[ii] ), cimag( (*(data_arr_ptr->data_k))[ii] )); }*/ printf("closing host thread\n"); pthread_exit(NULL); }
void oskar_fft_exec(oskar_FFT* h, oskar_Mem* data, int* status) { oskar_Mem *data_copy = 0, *data_ptr = data; if (oskar_mem_location(data) != h->location) { data_copy = oskar_mem_create_copy(data, h->location, status); data_ptr = data_copy; } if (h->location == OSKAR_CPU) { if (h->num_dim == 1) { *status = OSKAR_ERR_FUNCTION_NOT_AVAILABLE; } else if (h->num_dim == 2) { if (h->precision == OSKAR_DOUBLE) oskar_fftpack_cfft2f(h->dim_size, h->dim_size, h->dim_size, oskar_mem_double(data_ptr, status), oskar_mem_double(h->fftpack_wsave, status), oskar_mem_double(h->fftpack_work, status)); else oskar_fftpack_cfft2f_f(h->dim_size, h->dim_size, h->dim_size, oskar_mem_float(data_ptr, status), oskar_mem_float(h->fftpack_wsave, status), oskar_mem_float(h->fftpack_work, status)); /* This step not needed for W-kernel generation, so turn it off. */ if (h->ensure_consistent_norm) oskar_mem_scale_real(data_ptr, (double)h->num_cells_total, 0, h->num_cells_total, status); } } else if (h->location == OSKAR_GPU) { #ifdef OSKAR_HAVE_CUDA if (h->precision == OSKAR_DOUBLE) cufftExecZ2Z(h->cufft_plan, (cufftDoubleComplex*) oskar_mem_void(data_ptr), (cufftDoubleComplex*) oskar_mem_void(data_ptr), CUFFT_FORWARD); else cufftExecC2C(h->cufft_plan, (cufftComplex*) oskar_mem_void(data_ptr), (cufftComplex*) oskar_mem_void(data_ptr), CUFFT_FORWARD); #endif } else *status = OSKAR_ERR_BAD_LOCATION; if (oskar_mem_location(data) != h->location) oskar_mem_copy(data, data_ptr, status); oskar_mem_free(data_copy, status); }
static cufftResult_t centerdifft(const std::complex<double>* in_mat, std::complex<double>* out_mat, cufftHandle* fftplan) { cufftResult_t cufftResult; cufftResult = cufftExecZ2Z(*fftplan, (cufftDoubleComplex*)in_mat, (cufftDoubleComplex*)out_mat, CUFFT_INVERSE); AGILE_ASSERT(result == CUFFT_SUCCESS, StandardException::ExceptionMessage( "Error during FFT procedure")); return cufftResult; }
void fft3dGPU(T1* d_data, int nx, int ny, int nz, void* stream) { //printf("Running 3d forward xform \n"); cufftHandle plan; cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL); if (cufftPlan3d(&plan, nz, ny, nx, CUFFT_Z2Z)!=CUFFT_SUCCESS) { printf("CUFFT error: Plan creation failed\n"); } //printf("Built plan \n"); cufftSetStream(plan, (cudaStream_t) stream); if (cufftExecZ2Z(plan, (cufftDoubleComplex*) d_data, (cufftDoubleComplex*) d_data, CUFFT_FORWARD)!=CUFFT_SUCCESS) { printf("CUFFT error: Plan execution failed\n"); }; cufftDestroy(plan); }
void single(char *dst, char *const *src) { cufftExecZ2Z(plan, reinterpret_cast<src_type *>(src[0]), reinterpret_cast<dst_type *>(dst), sign); }
void oskar_imager_finalise_plane(oskar_Imager* h, oskar_Mem* plane, double plane_norm, int* status) { int size, num_cells; DeviceData* d; if (*status) return; /* Apply normalisation. */ if (plane_norm > 0.0 || plane_norm < 0.0) oskar_mem_scale_real(plane, 1.0 / plane_norm, status); if (h->algorithm == OSKAR_ALGORITHM_DFT_2D || h->algorithm == OSKAR_ALGORITHM_DFT_3D) return; /* Check plane is complex type, as plane must be gridded visibilities. */ if (!oskar_mem_is_complex(plane)) { *status = OSKAR_ERR_TYPE_MISMATCH; return; } /* Make image using FFT and apply grid correction. */ size = h->grid_size; num_cells = size * size; d = &h->d[0]; if (oskar_mem_precision(plane) == OSKAR_DOUBLE) { oskar_fftphase_cd(size, size, oskar_mem_double(plane, status)); if (h->fft_on_gpu) { #ifdef OSKAR_HAVE_CUDA oskar_device_set(h->cuda_device_ids[0], status); oskar_mem_copy(d->plane_gpu, plane, status); cufftExecZ2Z(h->cufft_plan, oskar_mem_void(d->plane_gpu), oskar_mem_void(d->plane_gpu), CUFFT_FORWARD); oskar_mem_copy(plane, d->plane_gpu, status); #else *status = OSKAR_ERR_CUDA_NOT_AVAILABLE; #endif } else { oskar_fftpack_cfft2f(size, size, size, oskar_mem_double(plane, status), oskar_mem_double(h->fftpack_wsave, status), oskar_mem_double(h->fftpack_work, status)); oskar_mem_scale_real(plane, (double)num_cells, status); } oskar_fftphase_cd(size, size, oskar_mem_double(plane, status)); oskar_grid_correction_d(size, oskar_mem_double(h->corr_func, status), oskar_mem_double(plane, status)); } else { oskar_fftphase_cf(size, size, oskar_mem_float(plane, status)); if (h->fft_on_gpu) { #ifdef OSKAR_HAVE_CUDA oskar_device_set(h->cuda_device_ids[0], status); oskar_mem_copy(d->plane_gpu, plane, status); cufftExecC2C(h->cufft_plan, oskar_mem_void(d->plane_gpu), oskar_mem_void(d->plane_gpu), CUFFT_FORWARD); oskar_mem_copy(plane, d->plane_gpu, status); #else *status = OSKAR_ERR_CUDA_NOT_AVAILABLE; #endif } else { oskar_fftpack_cfft2f_f(size, size, size, oskar_mem_float(plane, status), oskar_mem_float(h->fftpack_wsave, status), oskar_mem_float(h->fftpack_work, status)); oskar_mem_scale_real(plane, (double)num_cells, status); } oskar_fftphase_cf(size, size, oskar_mem_float(plane, status)); oskar_grid_correction_f(size, oskar_mem_double(h->corr_func, status), oskar_mem_float(plane, status)); } }
/** * Execute C2C plan. This function is blocking and only returns after the transform is completed. * @note For inplace transforms, data_out should point to the same memory address as data, AND * the plan must have been created as inplace. * @param plan FFT plan created by \ref accfft_plan_dft_3d_r2c. * @param data Input data in frequency domain. * @param data_out Output data in frequency domain. * @param timer See \ref timer for more details. * @param XYZ a bit set field that determines which directions FFT should be executed */ void accfft_execute_c2c_gpu(accfft_plan_gpu* plan, int direction,Complex * data_d, Complex * data_out_d, double * timer,std::bitset<3> xyz){ if(!plan->c2c_plan_baked){ if(plan->procid==0) std::cout<<"Error. r2c plan has not been made correctly. Please first create the plan before calling execute functions."<<std::endl; return; } if(data_d==NULL) data_d=plan->data_c; if(data_out_d==NULL) data_out_d=plan->data_out_c; int * coords=plan->coord; int procid=plan->procid; double fft_time=0; double timings[5]={0}; int NY=plan->N[1]; cudaEvent_t memcpy_startEvent, memcpy_stopEvent; cudaEvent_t fft_startEvent, fft_stopEvent; checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) ); checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) ); checkCuda_accfft( cudaEventCreate(&fft_startEvent) ); checkCuda_accfft( cudaEventCreate(&fft_stopEvent) ); cufftResult_t cufft_error; float dummy_time=0; int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0; int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1; int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2; int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i; int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i; if(direction==-1){ /**************************************************************/ /******************* N0/P0 x N1/P1 x N2 **********************/ /**************************************************************/ // FFT in Z direction if(xyz[2]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft(cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } else data_out_d=data_d; if(!plan->oneD){ plan->T_plan_1->execute_gpu(plan->T_plan_1,(double*)data_out_d,timings,2,osize_0[0],coords[0]); } checkCuda_accfft (cudaDeviceSynchronize()); MPI_Barrier(plan->c_comm); /**************************************************************/ /******************* N0/P0 x N1 x N2/P1 **********************/ /**************************************************************/ if(xyz[1]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); for (int i=0;i<osize_1[0];++i){ checkCuda_accfft(cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_out_d[i*osize_1[1]*osize_1[2]], (cufftDoubleComplex*)&data_out_d[i*osize_1[1]*osize_1[2]],CUFFT_FORWARD)); } checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } MPI_Barrier(plan->c_comm); if(plan->oneD){ plan->T_plan_2->execute_gpu(plan->T_plan_2,(double*)data_out_d,timings,2); } else{ plan->T_plan_2->execute_gpu(plan->T_plan_2,(double*)data_out_d,timings,2,1,coords[1]); } checkCuda_accfft (cudaDeviceSynchronize()); MPI_Barrier(plan->c_comm); /**************************************************************/ /******************* N0 x N1/P0 x N2/P1 **********************/ /**************************************************************/ if(xyz[0]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft(cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } } else if (direction==1){ if(xyz[0]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_d,CUFFT_INVERSE)); checkCuda_accfft (cudaDeviceSynchronize()); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; MPI_Barrier(plan->c_comm); } if(plan->oneD){ plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(double*)data_d,timings,1); } else{ plan->T_plan_2i->execute_gpu(plan->T_plan_2i,(double*)data_d,timings,1,1,coords[1]); } checkCuda_accfft (cudaDeviceSynchronize()); MPI_Barrier(plan->c_comm); /**************************************************************/ /******************* N0/P0 x N1 x N2/P1 **********************/ /**************************************************************/ if(xyz[1]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); for (int i=0;i<osize_1i[0];++i){ checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_d[i*NY*osize_1i[2]], (cufftDoubleComplex*)&data_d[i*NY*osize_1i[2]],CUFFT_INVERSE)); } checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; MPI_Barrier(plan->c_comm); } if(!plan->oneD){ plan->T_plan_1i->execute_gpu(plan->T_plan_1i,(double*)data_d,timings,1,osize_1i[0],coords[0]); } checkCuda_accfft (cudaDeviceSynchronize()); MPI_Barrier(plan->c_comm); /**************************************************************/ /******************* N0/P0 x N1/P1 x N2 **********************/ /**************************************************************/ if(xyz[2]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecZ2Z(plan->fplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleComplex*)data_out_d,CUFFT_INVERSE)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } else data_out_d=data_d; } timings[4]+=fft_time; if(timer==NULL){ //delete [] timings; } else{ timer[0]+=timings[0]; timer[1]+=timings[1]; timer[2]+=timings[2]; timer[3]+=timings[3]; timer[4]+=timings[4]; } MPI_Barrier(plan->c_comm); return; }// end accfft_execute_c2c_gpu
void accfft_execute_gpu(accfft_plan_gpu* plan, int direction,double * data_d, double * data_out_d, double * timer,std::bitset<3> xyz){ if(data_d==NULL) data_d=plan->data; if(data_out_d==NULL) data_out_d=plan->data_out; int * coords=plan->coord; int procid=plan->procid; double fft_time=0; double timings[5]={0}; cudaEvent_t memcpy_startEvent, memcpy_stopEvent; cudaEvent_t fft_startEvent, fft_stopEvent; checkCuda_accfft( cudaEventCreate(&memcpy_startEvent) ); checkCuda_accfft( cudaEventCreate(&memcpy_stopEvent) ); checkCuda_accfft( cudaEventCreate(&fft_startEvent) ); checkCuda_accfft( cudaEventCreate(&fft_stopEvent) ); int NY=plan->N[1]; float dummy_time=0; int *osize_0 =plan->osize_0;// *ostart_0 =plan->ostart_0; int *osize_1 =plan->osize_1;// *ostart_1 =plan->ostart_1; //int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2; int *osize_1i=plan->osize_1i;//*ostart_1i=plan->ostart_1i; //int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i; if(direction==-1){ /**************************************************************/ /******************* N0/P0 x N1/P1 x N2 **********************/ /**************************************************************/ // FFT in Z direction if(xyz[2]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecD2Z(plan->fplan_0,(cufftDoubleReal*)data_d, (cufftDoubleComplex*)data_out_d)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } else data_out_d=data_d; // Perform N0/P0 transpose if(!plan->oneD){ plan->T_plan_1->execute_gpu(plan->T_plan_1,data_out_d,timings,2,osize_0[0],coords[0]); } /**************************************************************/ /******************* N0/P0 x N1 x N2/P1 **********************/ /**************************************************************/ if(xyz[1]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); for (int i=0;i<osize_1[0];++i){ checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]], (cufftDoubleComplex*)&data_out_d[2*i*osize_1[1]*osize_1[2]],CUFFT_FORWARD)); } checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } if(plan->oneD){ plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2); } else{ plan->T_plan_2->execute_gpu(plan->T_plan_2,data_out_d,timings,2,1,coords[1]); } /**************************************************************/ /******************* N0 x N1/P0 x N2/P1 **********************/ /**************************************************************/ MPI_Barrier(plan->c_comm); if(xyz[0]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_out_d, (cufftDoubleComplex*)data_out_d,CUFFT_FORWARD)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } } else if (direction==1){ if(xyz[0]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecZ2Z(plan->fplan_2,(cufftDoubleComplex*)data_d, (cufftDoubleComplex*)data_d,CUFFT_INVERSE)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } if(plan->oneD){ plan->T_plan_2i->execute_gpu(plan->T_plan_2i,data_d,timings,1); } else{ plan->T_plan_2i->execute_gpu(plan->T_plan_2i,data_d,timings,1,1,coords[1]); } MPI_Barrier(plan->c_comm); /**************************************************************/ /******************* N0/P0 x N1 x N2/P1 **********************/ /**************************************************************/ if(xyz[1]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); for (int i=0;i<osize_1i[0];++i){ checkCuda_accfft (cufftExecZ2Z(plan->fplan_1,(cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]], (cufftDoubleComplex*)&data_d[2*i*NY*osize_1i[2]],CUFFT_INVERSE)); } checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } if(!plan->oneD){ plan->T_plan_1i->execute_gpu(plan->T_plan_1i,data_d,timings,1,osize_1i[0],coords[0]); } /**************************************************************/ /******************* N0/P0 x N1/P1 x N2 **********************/ /**************************************************************/ // IFFT in Z direction if(xyz[2]){ checkCuda_accfft( cudaEventRecord(fft_startEvent,0) ); checkCuda_accfft (cufftExecZ2D(plan->iplan_0,(cufftDoubleComplex*)data_d,(cufftDoubleReal*)data_out_d)); checkCuda_accfft( cudaEventRecord(fft_stopEvent,0) ); checkCuda_accfft( cudaEventSynchronize(fft_stopEvent) ); // wait until fft is executed checkCuda_accfft( cudaEventElapsedTime(&dummy_time, fft_startEvent, fft_stopEvent) ); fft_time+=dummy_time/1000; } else data_out_d=data_d; } timings[4]+=fft_time; if(timer==NULL){ //delete [] timings; } else{ timer[0]+=timings[0]; timer[1]+=timings[1]; timer[2]+=timings[2]; timer[3]+=timings[3]; timer[4]+=timings[4]; } checkCuda_accfft (cudaDeviceSynchronize()); MPI_Barrier(plan->c_comm); return; }// end accfft_execute_gpu
cufftResult WINAPI wine_cufftExecZ2Z(cufftHandle plan, cufftDoubleComplex *idata, cufftDoubleComplex *odata, int direction){ WINE_TRACE("\n"); return cufftExecZ2Z( plan, idata, odata, direction ); }
/* 3D FFT for field generation */ void make_field_3dfft(int _fielddim) { std::chrono::time_point<std::chrono::high_resolution_clock> time1, time2, time3; time1 = std::chrono::system_clock::now(); double PLindex; // int nr_3Dfields, fielddim, index, mod_fielddim; int neg_k,neg_i,neg_j; cufftDoubleComplex *host_array; cufftDoubleComplex *device_array; cufftHandle back_plan; PLindex = atof(test_fftw[1]); int fielddim = _fielddim;//atoi(test_fftw[2]); int nr_3Dfields = atoi(test_fftw[3]); int mod_fielddim = fielddim+1; size_t nr_points = (fielddim+1)*(fielddim+1)*(fielddim+1); size_t num_bytes = nr_points * sizeof(cufftDoubleComplex); // mem allocation host_array = (cufftDoubleComplex*) malloc(num_bytes); //array_in = (fftw_complex*) fftw_malloc(nr_points * sizeof(fftw_complex)); checkCudaErrors(cudaMalloc((void **)&device_array, num_bytes)); //back_plan = fftw_plan_dft_3d(mod_fielddim, mod_fielddim, mod_fielddim,array_in, array_in, FFTW_BACKWARD , FFTW_MEASURE); cufftPlan3d(&back_plan, mod_fielddim, mod_fielddim, mod_fielddim, CUFFT_Z2Z); //array_in, array_in, FFTW_BACKWARD , FFTW_MEASURE); cout << endl <<"fiel_dim " << mod_fielddim << endl; for(int m=0; m<nr_3Dfields; m++){ cout << endl <<"field #" << m << endl; for(int i=-fielddim/2;i<=fielddim/2;i++){ //fill the data that goes into the FFT for(int j=-fielddim/2;j<=fielddim/2;j++){ for(int k=0;k<=fielddim/2;k++){ //positive k parts double abs_k = sqrt((double)(i*i+j*j+k*k)); double A = 1000.0*pow(abs_k,(PLindex*0.5)); double phase=(2.0*rand_01()-1.0)*PI; int index = (i+fielddim/2)*mod_fielddim*mod_fielddim+(j+fielddim/2)*mod_fielddim+k+fielddim/2; //array_in[index]=A*cos(phase)+I*A*sin(phase); host_array[index].x = A*cos(phase); host_array[index].y = A*sin(phase); //negative k parts //phase=-1.0*phase; neg_i =- 1*i; neg_j =- 1*j; neg_k =- 1*k; index = (neg_i+fielddim/2)*mod_fielddim*mod_fielddim+(neg_j+fielddim/2)*mod_fielddim+neg_k+fielddim/2; //array_in[index]=A*cos(phase)+I*A*sin(-1.0*phase); host_array[index].x = A*cos(phase); host_array[index].y = A*sin(-1.0*phase); } } } host_array[(fielddim/2)*mod_fielddim*mod_fielddim+(fielddim/2)*mod_fielddim+fielddim/2].x = 1.0; host_array[(fielddim/2)*mod_fielddim*mod_fielddim+(fielddim/2)*mod_fielddim+fielddim/2].y = 0.0; //Rearange array for use in FFTW library for(int j=0;j<=nr_points/2;j++){ cufftDoubleComplex temp = host_array[j]; host_array[j] = host_array[j+nr_points/2]; host_array[j+nr_points/2] = temp; } time2 = std::chrono::system_clock::now(); // move data to the GPU cudaMemcpy(device_array, host_array, num_bytes, cudaMemcpyDeviceToHost); /// fftw_execute(back_plan); cufftExecZ2Z(back_plan, device_array, device_array, FFTW_BACKWARD); // move back cudaMemcpy(host_array, device_array, num_bytes, cudaMemcpyHostToDevice); time3 = std::chrono::system_clock::now(); auto s1 = std::chrono::duration_cast<std::chrono::milliseconds>(time2-time1).count(); auto s2 = std::chrono::duration_cast<std::chrono::milliseconds>(time3-time2).count(); auto time_total = s1 + s2; cout << endl << "--- 3D FFT make_field ---" << endl << "1. " << s1 << " ms" << endl << "2. " << s2 << " ms" << endl << "---" << endl << "total time " << time_total << " ms" << endl << "---" << endl; //for(i=0;i<nr_points;i++) printf("%g + i*%g\n",creal(array_in[i]),cimag(array_in[i])); /* for(int i=0;i<mod_fielddim;i++){ for(int j=0;j<mod_fielddim;j++){ for(int k=0;k<mod_fielddim;k++){ printf("%.15lf\n", host_array[i*mod_fielddim*mod_fielddim+j*mod_fielddim+k].x); } } } */ } // print some elemntes for debug double *print_array = (double*) host_array; print_double(print_array, 100); // first 100 elements print_double(print_array+(2*nr_points-100), 100); // last 100 elements /* fftw_free(array_in); fftw_destroy_plan(back_plan); */ cufftDestroy(back_plan); cudaFree(device_array); free(host_array); }