void cleanup(void) { cudaGraphicsUnregisterResource(cuda_vbo_resource); unbindTexture(); deleteTexture(); // Free all host and device resources free(hvfield); free(particles); cudaFree(dvfield); cudaFree(vxfield); cudaFree(vyfield); cufftDestroy(planr2c); cufftDestroy(planc2r); glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); glDeleteBuffersARB(1, &vbo); sdkDeleteTimer(&timer); if (g_bExitESC) { checkCudaErrors(cudaDeviceReset()); } }
/*! computes the N-point DFT of signal X and stores in Y using CUDA's FFT library */ bool cuda_dft(cuComplex *Y, cuComplex *X, float scale, int N) { size_t bytes = (size_t)N * sizeof(cuComplex); cuComplex *Y_gpu, *X_gpu; cudaMalloc((void **)&Y_gpu, bytes); cudaMalloc((void **)&X_gpu, bytes); cudaMemcpy(Y_gpu, Y, bytes, cudaMemcpyHostToDevice); cudaMemcpy(X_gpu, X, bytes, cudaMemcpyHostToDevice); cufftHandle plan; cufftPlan1d(&plan, N, CUFFT_C2C, 1); cufftExecC2C(plan, X_gpu, Y_gpu, CUFFT_FORWARD); cufftDestroy(plan); cudaMemcpy(Y, Y_gpu, bytes, cudaMemcpyDeviceToHost); cudaFree(Y_gpu); cudaFree(X_gpu); for (int n = 0; n < N; n++) { Y[n].x *= scale; Y[n].y *= scale; } return true; }
void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory(int n_coils) { if (!gpuMemAllocated) return; cufftDestroy(fft_plan); // Destroy the cuFFT plan. if (DEBUG && (cudaThreadSynchronize() != cudaSuccess)) printf("error at thread synchronization 9: %s\n",cudaGetErrorString(cudaGetLastError())); freeLookupTable(); freeTotalDeviceMemory(data_indices_d,data_sorted_d,crds_d,gdata_d,sectors_d,sector_centers_d,NULL);//NULL as stop if (n_coils > 1 && deapo_d != NULL) cudaFree(deapo_d); if (this->applySensData()) cudaFree(sens_d); if (this->applyDensComp()) cudaFree(density_comp_d); showMemoryInfo(); gpuMemAllocated = false; }
/** * Destroys a previously created plan. * The CUDA destructor returns a result code, while the fftw2 destructor is * a void function. For now, the result code in the CUDA destructor is * ignored. */ void sararfftnd_destroy_plan( sararfftnd_plan plan ) { #ifdef USE_GPUS cufftDestroy( plan ); #else // #ifndef USE_GPUS rfftwnd_destroy_plan( plan ); #endif }
extern "C" void cuda_fft(double *d_data, int Lx, int Ny, void *stream) { cufftHandle plan; cufftPlan1d(&plan, Lx, CUFFT_Z2Z, Ny); cufftSetStream(plan, (cudaStream_t)stream); cufftExecZ2Z(plan, (cufftDoubleComplex*)d_data, (cufftDoubleComplex*)d_data,CUFFT_FORWARD); cufftDestroy(plan); }
extern "C" void cuda_fft(float *d_data, int Lx, int Ny, void *stream) { cufftHandle plan; cufftPlan1d(&plan, Lx, CUFFT_C2C, Ny); cufftSetStream(plan, (cudaStream_t)stream); cufftExecC2C(plan, (cufftComplex*)d_data, (cufftComplex*)d_data,CUFFT_FORWARD); cufftDestroy(plan); }
void g_fwdfft(QSP_ARG_DECL Data_Obj *dst_dp, Data_Obj *src1_dp) { //Variable declarations int NX = 256; //int BATCH = 10; int BATCH = 1; cufftResult_t status; //Declare plan for FFT cufftHandle plan; //cufftComplex *data; //cufftComplex *result; void *data; void *result; cudaError_t drv_err; //Allocate RAM //cutilSafeCall(cudaMalloc(&data, sizeof(cufftComplex)*NX*BATCH)); //cutilSafeCall(cudaMalloc(&result, sizeof(cufftComplex)*NX*BATCH)); drv_err = cudaMalloc(&data, sizeof(cufftComplex)*NX*BATCH); if( drv_err != cudaSuccess ){ WARN("error allocating cuda data buffer for fft!?"); return; } drv_err = cudaMalloc(&result, sizeof(cufftComplex)*NX*BATCH); if( drv_err != cudaSuccess ){ WARN("error allocating cuda result buffer for fft!?"); // BUG clean up previous malloc... return; } //Create plan for FFT status = cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH); if (status != CUFFT_SUCCESS) { sprintf(ERROR_STRING, "Error in cufftPlan1d: %s\n", getCUFFTError(status)); NWARN(ERROR_STRING); } //Run forward fft on data status = cufftExecC2C(plan, (cufftComplex *)data, (cufftComplex *)result, CUFFT_FORWARD); if (status != CUFFT_SUCCESS) { sprintf(ERROR_STRING, "Error in cufftExecC2C: %s\n", getCUFFTError(status)); NWARN(ERROR_STRING); } //Run inverse fft on data /*status = cufftExecC2C(plan, data, result, CUFFT_INVERSE); if (status != CUFFT_SUCCESS) { sprintf(ERROR_STRING, "Error in cufftExecC2C: %s\n", getCUFFTError(status)); NWARN(ERROR_STRING); }*/ //Free resources cufftDestroy(plan); cudaFree(data); }
/* Destroy a 1d plan */ int dfft_cuda_destroy_local_plan(cuda_plan_t *p) { cufftResult res = cufftDestroy(*p); if (res != CUFFT_SUCCESS) { printf("cufftDestroy error: %d\n", res); return res; } return 0; }
GLFluids::~GLFluids(){ cudaGraphicsUnregisterResource(cuda_vbo_resource); unbind_texture(); delete_texture(); // Free all host and device resources free(hvfield); free(particles); cudaFree(dvfield); cudaFree(vxfield); cudaFree(vyfield); cufftDestroy(planr2c); cufftDestroy(planc2r); glBindBuffer(GL_ARRAY_BUFFER, 0); glDeleteBuffers(1, &vbo); }
void oskar_fft_free(oskar_FFT* h) { int status = 0; if (!h) return; oskar_mem_free(h->fftpack_work, &status); oskar_mem_free(h->fftpack_wsave, &status); #ifdef OSKAR_HAVE_CUDA if (h->location == OSKAR_GPU) cufftDestroy(h->cufft_plan); #endif free(h); }
void fft3dGPU(T1* d_data, int nx, int ny, int nz, void* stream) { cufftHandle plan; cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL); if (cufftPlan3d(&plan, nz, ny, nx, CUFFT_C2C)!=CUFFT_SUCCESS) { fprintf(stderr, "CUFFT error: Plan creation failed"); } cufftSetStream(plan, (cudaStream_t) stream); cufftExecC2C(plan, (cufftComplex*) d_data, (cufftComplex*) d_data, CUFFT_FORWARD); cufftDestroy(plan); }
void Fastconv_base<D, T, C>::fconv (T const* in, T const* kernel, T* out, length_type rows, length_type columns, bool transform_kernel) { // convert pointers to types the CUFFT library accepts typedef cufftComplex ctype; ctype* d_out = reinterpret_cast<ctype*>(out); ctype* d_kernel = const_cast<ctype*>(reinterpret_cast<ctype const*>(kernel)); ctype* d_in = const_cast<ctype*>(reinterpret_cast<ctype const*>(in)); cufftHandle plan; if (transform_kernel) { // Create a 1D FFT plan and transform the kernel cufftPlan1d(&plan, columns, CUFFT_C2C, 1); cufftExecC2C(plan, d_kernel, d_kernel, CUFFT_FORWARD); cufftDestroy(plan); } // Create a FFTM plan cufftPlan1d(&plan, columns, CUFFT_C2C, rows); // transform the data cufftExecC2C(plan, d_in, d_in, CUFFT_FORWARD); // convolve with kernel, combine with scaling needed for inverse FFT typedef typename impl::scalar_of<T>::type scalar_type; scalar_type scale = 1 / static_cast<scalar_type>(columns); if (D == 1) vmmuls_row(kernel, in, out, scale, rows, columns); else mmmuls(kernel, in, out, scale, rows, columns); // inverse transform the signal cufftExecC2C(plan, d_out, d_out, CUFFT_INVERSE); cufftDestroy(plan); }
// Main execution loop. // Capture from camera, filter image data, save filered data. // Note*: wait conditions are used to ensure that the previous image // has been rendered while allowing for continued operation. void WorkerThread::run() { createInitialFilter(); while( 1 ) { // Throttling while( !g_throttleTimerFlag ){} g_throttleTimerFlag = 0; // Capture image data _camera->capture(); float *data = _camera->frameData(); // Need a new filter? _new_filter_mutex.lock(); if( _should_create_new_filter ) { createNewFilter(); } _new_filter_mutex.unlock(); // Gabor filter gaborFilter( data ); // Make sure the previous image has been rendered before overwriting it _image_mutex.lock(); if( !_is_image_processed ) { _image_processed.wait(&_image_mutex); } // Copy image data so the thread continues capturing and filtering memcpy( _full_image, data, sizeof(float) * _original_image_width * _original_image_height * 2 ); _is_image_processed = false; emit filterComplete(); _image_mutex.unlock(); if( _should_terminate ) { break; } } // Free GPU resources cudaFree(_gabor_data); cudaFree(_gpu_image_0); cudaFree(_gpu_image_1); cufftDestroy(_fft_plan); }
void cleanup(void) { cudaGraphicsUnregisterResource(cuda_vbo_resource); unbindTexture(); deleteTexture(); // Free all host and device resources free(hvfield); free(particles); #ifdef BROADCAST free(packets); #endif cudaFree(dvfield); cudaFree(vxfield); cudaFree(vyfield); cufftDestroy(planr2c); cufftDestroy(planc2r); glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); glDeleteBuffersARB(1, &vbo); sdkDeleteTimer(&timer); }
/* * Class: jcuda_jcufft_JCufft * Method: cufftDestroyNative * Signature: (Ljcuda/jcufft/JCufftHandle;)I */ JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftDestroyNative (JNIEnv *env, jclass cla, jobject handle) { if (handle == NULL) { ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftDestroy"); return JCUFFT_INTERNAL_ERROR; } Logger::log(LOG_TRACE, "Destroying plan\n"); cufftHandle plan = env->GetIntField(handle, cufftHandle_plan); cufftResult result = cufftDestroy(plan); return result; }
void cleanup() { // DEPRECATED: cutilSafeCall(cudaGLUnregisterBufferObject(heightVertexBuffer)); cutilSafeCall(cudaGraphicsUnregisterResource(cuda_heightVB_resource)); // DEPRECATED: cutilSafeCall(cudaGLUnregisterBufferObject(slopeVertexBuffer)); cutilSafeCall(cudaGraphicsUnregisterResource(cuda_slopeVB_resource)); deleteVBO(&posVertexBuffer); deleteVBO(&heightVertexBuffer); deleteVBO(&slopeVertexBuffer); cutilSafeCall( cudaFree(d_h0) ); cutilSafeCall( cudaFree(d_slope) ); free(h_h0); cufftDestroy(fftPlan); }
void fft3dGPU(T1* d_data, int nx, int ny, int nz, void* stream) { //printf("Running 3d forward xform \n"); cufftHandle plan; cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL); if (cufftPlan3d(&plan, nz, ny, nx, CUFFT_Z2Z)!=CUFFT_SUCCESS) { printf("CUFFT error: Plan creation failed\n"); } //printf("Built plan \n"); cufftSetStream(plan, (cudaStream_t) stream); if (cufftExecZ2Z(plan, (cufftDoubleComplex*) d_data, (cufftDoubleComplex*) d_data, CUFFT_FORWARD)!=CUFFT_SUCCESS) { printf("CUFFT error: Plan execution failed\n"); }; cufftDestroy(plan); }
void fft_3d_destroy_plan_cuda(struct fft_plan_3d *plan) { #ifdef FFT_CUFFT if (plan->pre_plan) remap_3d_destroy_plan(plan->pre_plan); if (plan->mid1_plan) remap_3d_destroy_plan(plan->mid1_plan); if (plan->mid2_plan) remap_3d_destroy_plan(plan->mid2_plan); if (plan->post_plan) remap_3d_destroy_plan(plan->post_plan); if (plan->copy) free(plan->copy); if (plan->scratch) free(plan->scratch); //cufftDestroy(plan->plan_fast); //cufftDestroy(plan->plan_mid); //cufftDestroy(plan->plan_slow); cufftDestroy(plan->plan_3d); free(plan); #endif }
/** * Destroy AccFFT GPU plan. * @param plan Input plan to be destroyed. */ void accfft_destroy_plan_gpu(accfft_plan_gpu * plan){ if(plan->T_plan_1!=NULL)delete(plan->T_plan_1); if(plan->T_plan_1i!=NULL)delete(plan->T_plan_1i); if(plan->T_plan_2!=NULL)delete(plan->T_plan_2); if(plan->T_plan_2i!=NULL)delete(plan->T_plan_2i); if(plan->Mem_mgr!=NULL)delete(plan->Mem_mgr); if(plan->fplan_0!=-1)cufftDestroy(plan->fplan_0); if(plan->fplan_1!=-1)cufftDestroy(plan->fplan_1); if(plan->fplan_2!=-1)cufftDestroy(plan->fplan_2); if(plan->iplan_0!=-1)cufftDestroy(plan->iplan_0); if(plan->iplan_1!=-1)cufftDestroy(plan->iplan_1); if(plan->iplan_2!=-1)cufftDestroy(plan->iplan_2); MPI_Comm_free(&plan->row_comm); MPI_Comm_free(&plan->col_comm); return; }//end accfft_destroy_plan_gpu
int main2(int sockfd) { cufftHandle plan; cufftComplex *devPtr; cufftReal indata[NX*BATCH]; cufftComplex data[NX*BATCH]; int i,timer,j,k; char fname[15]; FILE *f; #define BUFSIZE (21*4096*sizeof(int)) int buffer[BUFSIZE]; int p,nread; f = fopen("21-4096","rb"); nread=fread(buffer,BUFSIZE,1,f); printf("nread=%i\n",nread); fclose(f); i=0; for (j=0;j<BATCH;j++) { for (k=0;k<NX;k++) { data[j*NX+k].x = buffer[j*NX+k]; data[j*NX+k].y = 0; } } //f=fopen("y.txt","r"); /* source data creation */ //int sockfd = myconnect(); //printf("connected\n"); /* WORKING!!!!!!!! i=0; for (j=0;j<BATCH;j++) { sprintf(fname,"%i.txt",j); printf("%s\n",fname); f = fopen(fname,"r"); for (k=0;k<NX;k++) { fscanf(f,"%i\n",&p); data[j*NX+k].x = p; data[j*NX+k].y = 0; } fclose(f); */ /* for(i= 0 ; i < NX*BATCH ; i++){ //fscanf(f,"%i\n",&p); //data[i].x= p; data[i].x= 1.0f; //printf("%f\n",data[i].x); data[i].y = 0.0f; } //fclose(f) */ //} /* creates 1D FFT plan */ cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH); /* cutCreateTimer(&timer); cutResetTimer(timer); cutStartTimer(timer); */ /* GPU memory allocation */ cudaMalloc((void**)&devPtr, sizeof(cufftComplex)*NX*BATCH); /* transfer to GPU memory */ cudaMemcpy(devPtr, data, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyHostToDevice); /* executes FFT processes */ cufftExecC2C(plan, devPtr, devPtr, CUFFT_FORWARD); /* executes FFT processes (inverse transformation) */ //cufftExecC2C(plan, devPtr, devPtr, CUFFT_INVERSE); /* transfer results from GPU memory */ cudaMemcpy(data, devPtr, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyDeviceToHost); /* deletes CUFFT plan */ cufftDestroy(plan); /* frees GPU memory */ cudaFree(devPtr); /* cudaThreadSynchronize(); cutStopTimer(timer); printf("%f\n",cutGetTimerValue(timer)/(float)1000); cutDeleteTimer(timer); */ /* float mag; for(i = 0 ; i < NX*BATCH ; i++){ //printf("data[%d] %f %f\n", i, data[i].x, data[i].y); //printf("%f\n", data[i].x); mag = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX; printf("%f\n",mag); } */ /* // save as text file float mag; i=0; for (j=0;j<BATCH;j++) { sprintf(fname,"%i-mag.txt",j); printf("%s\n",fname); f = fopen(fname,"w"); for (k=0;k<NX;k++) { //fscanf(f,"%i\n",&p); if (k>50) continue; i = j*NX+k; mag = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX; fprintf(f,"%f\n",mag); } fclose(f); } */ float mag; i=0; float mags[NX]; int magsint[NX*BATCH]; memset(magsint,0,sizeof(int)*NX*BATCH); int u = 0; printf("%f %f %f %f\n",data[0].x,data[1].x,data[2].x,data[3].x); //printf("%i %i %i %i\n",magsint[0],magsint[1],magsint[2],magsint[3]); // f = fopen("ffts.bin","wb"); for (j=0;j<BATCH;j++) { // sprintf(fname,"%i-bin.dat",j); // printf("%s\n",fname); for (k=0;k<NX;k++) { //fscanf(f,"%i\n",&p); if (k>50) continue; i = j*NX+k; mags[k] = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX; magsint[u]=mags[k] ; u++; //fprintf(f,"%f\n",mag); } //f = fopen(fname,"wb"); // fwrite(magsint,sizeof(int)*50,1,f); } int n; n = write(sockfd,magsint,sizeof(int)*BATCH*50); printf("%i %i %i %i\n",magsint[0],magsint[1],magsint[2],magsint[3]); printf("send ok, size: %i\n",n); //fclose(f); return 0; }
//! \brief Destructor. virtual ~FFT() { cufftDestroy(fftplan_); }
cufftResult WINAPI wine_cufftDestroy(cufftHandle plan){ WINE_TRACE("\n"); return cufftDestroy( plan ); }
int main(int argc, char *argv[]) { int i; struct timeval begin, end; int size; size_t bytes; int n = 0, m = 0; STARPUFFT(plan) plan; #ifdef STARPU_HAVE_FFTW _FFTW(plan) fftw_plan; #endif #ifdef STARPU_USE_CUDA cufftHandle cuda_plan; cudaError_t cures; #endif double timing; if (argc < 2 || argc > 3) { fprintf(stderr,"need one or two size of vector\n"); exit(EXIT_FAILURE); } starpu_init(NULL); if (argc == 2) { n = atoi(argv[1]); /* 1D */ size = n; } else if (argc == 3) { n = atoi(argv[1]); m = atoi(argv[2]); /* 2D */ size = n * m; } else { assert(0); } bytes = size * sizeof(STARPUFFT(complex)); STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in)); starpu_srand48(0); for (i = 0; i < size; i++) in[i] = starpu_drand48() + I * starpu_drand48(); STARPUFFT(complex) *out = STARPUFFT(malloc)(size * sizeof(*out)); #ifdef STARPU_HAVE_FFTW STARPUFFT(complex) *out_fftw = STARPUFFT(malloc)(size * sizeof(*out_fftw)); #endif #ifdef STARPU_USE_CUDA STARPUFFT(complex) *out_cuda = malloc(size * sizeof(*out_cuda)); #endif if (argc == 2) { plan = STARPUFFT(plan_dft_1d)(n, SIGN, 0); #ifdef STARPU_HAVE_FFTW fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE); #endif #ifdef STARPU_USE_CUDA if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS) printf("erf\n"); #endif } else if (argc == 3) { plan = STARPUFFT(plan_dft_2d)(n, m, SIGN, 0); #ifdef STARPU_HAVE_FFTW fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE); #endif #ifdef STARPU_USE_CUDA STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS); #endif } else { assert(0); } #ifdef STARPU_HAVE_FFTW gettimeofday(&begin, NULL); _FFTW(execute)(fftw_plan); gettimeofday(&end, NULL); _FFTW(destroy_plan)(fftw_plan); timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec)); printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing); #endif #ifdef STARPU_USE_CUDA gettimeofday(&begin, NULL); if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS) printf("erf2\n"); if ((cures = cudaThreadSynchronize()) != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures); gettimeofday(&end, NULL); cufftDestroy(cuda_plan); timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec)); printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing); #endif STARPUFFT(execute)(plan, in, out); STARPUFFT(showstats)(stdout); STARPUFFT(destroy_plan)(plan); printf("\n"); #if 0 for (i = 0; i < 16; i++) printf("(%f,%f) ", cimag(in[i]), creal(in[i])); printf("\n\n"); for (i = 0; i < 16; i++) printf("(%f,%f) ", cimag(out[i]), creal(out[i])); printf("\n\n"); #ifdef STARPU_HAVE_FFTW for (i = 0; i < 16; i++) printf("(%f,%f) ", cimag(out_fftw[i]), creal(out_fftw[i])); printf("\n\n"); #endif #endif #ifdef STARPU_HAVE_FFTW { double max = 0., tot = 0., norm = 0., normdiff = 0.; for (i = 0; i < size; i++) { double diff = cabs(out[i]-out_fftw[i]); double diff2 = diff * diff; double size = cabs(out_fftw[i]); double size2 = size * size; if (diff > max) max = diff; tot += diff; normdiff += diff2; norm += size2; } fprintf(stderr, "\nmaximum difference %g\n", max); fprintf(stderr, "average difference %g\n", tot / size); fprintf(stderr, "difference norm %g\n", sqrt(normdiff)); double relmaxdiff = max / sqrt(norm); fprintf(stderr, "relative maximum difference %g\n", relmaxdiff); double relavgdiff = (tot / size) / sqrt(norm); fprintf(stderr, "relative average difference %g\n", relavgdiff); if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8)) return EXIT_FAILURE; if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16)) return EXIT_FAILURE; } #endif #ifdef STARPU_USE_CUDA { double max = 0., tot = 0., norm = 0., normdiff = 0.; for (i = 0; i < size; i++) { double diff = cabs(out_cuda[i]-out_fftw[i]); double diff2 = diff * diff; double size = cabs(out_fftw[i]); double size2 = size * size; if (diff > max) max = diff; tot += diff; normdiff += diff2; norm += size2; } fprintf(stderr, "\nmaximum difference %g\n", max); fprintf(stderr, "average difference %g\n", tot / size); fprintf(stderr, "difference norm %g\n", sqrt(normdiff)); double relmaxdiff = max / sqrt(norm); fprintf(stderr, "relative maximum difference %g\n", relmaxdiff); double relavgdiff = (tot / size) / sqrt(norm); fprintf(stderr, "relative average difference %g\n", relavgdiff); if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8)) return EXIT_FAILURE; if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16)) return EXIT_FAILURE; } #endif STARPUFFT(free)(in); STARPUFFT(free)(out); #ifdef STARPU_HAVE_FFTW STARPUFFT(free)(out_fftw); #endif #ifdef STARPU_USE_CUDA free(out_cuda); #endif starpu_shutdown(); return EXIT_SUCCESS; }
/** Destructor. The destructor destroys the CUFFT plan. */ inline ~Plan() { CUFFT_CHECK(cufftDestroy(plan)); }
bool test0(void) { float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU; float *d_Data, *d_PaddedData, *d_Kernel, *d_PaddedKernel; fComplex *d_DataSpectrum, *d_KernelSpectrum; cufftHandle fftPlanFwd, fftPlanInv; bool bRetVal; StopWatchInterface *hTimer = NULL; sdkCreateTimer(&hTimer); printf("Testing built-in R2C / C2R FFT-based convolution\n"); const int kernelH = 3; const int kernelW = 3; const int kernelY = 1; const int kernelX = 1; const int dataH = 10; const int dataW = 10; const int fftH = snapTransformSize(dataH + kernelH - 1); const int fftW = snapTransformSize(dataW + kernelW - 1); printf("...allocating memory\n"); h_Data = (float *)malloc(dataH * dataW * sizeof(float)); h_Kernel = (float *)malloc(kernelH * kernelW * sizeof(float)); h_ResultCPU = (float *)malloc(dataH * dataW * sizeof(float)); h_ResultGPU = (float *)malloc(fftH * fftW * sizeof(float)); checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex))); checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex))); printf("...generating random input data\n"); srand(2010); for (int i = 0; i < dataH * dataW; i++) { //h_Data[i] = getRand(); h_Data[i] = i + 1; } for (int i = 0; i < kernelH * kernelW; i++) { //h_Kernel[i] = getRand(); h_Kernel[i] = i + 1; } FILE* fp2 = fopen("input_kernel.txt", "w+"); FILE* fp3 = fopen("input_data.txt", "w+"); for (int i = 0; i < dataH * dataW; i++) fprintf(fp3, "%f\n", h_Data[i]); for (int i = 0; i < kernelH * kernelW; i++) fprintf(fp2, "%f\n", h_Kernel[i]); fclose(fp2); fclose(fp3); printf("...creating R2C & C2R FFT plans for %i x %i\n", fftH, fftW); checkCudaErrors(cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C)); checkCudaErrors(cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R)); printf("...uploading to GPU and padding convolution kernel and input data\n"); checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float))); padKernel( d_PaddedKernel, d_Kernel, fftH, fftW, kernelH, kernelW, kernelY, kernelX ); padDataClampToBorder( d_PaddedData, d_Data, fftH, fftW, dataH, dataW, kernelH, kernelW, kernelY, kernelX ); //Not including kernel transformation into time measurement, //since convolution kernel is not changed very frequently printf("...transforming convolution kernel\n"); checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum)); printf("...running GPU FFT convolution: "); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData, (cufftComplex *)d_DataSpectrum)); modulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftH, fftW, 1); checkCudaErrors(cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, (cufftReal *)d_PaddedData)); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double gpuTime = sdkGetTimerValue(&hTimer); printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime); printf("...reading back GPU convolution results\n"); checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost)); printf("...running reference CPU convolution\n"); convolutionClampToBorderCPU( h_ResultCPU, h_Data, h_Kernel, dataH, dataW, kernelH, kernelW, kernelY, kernelX ); printf("...comparing the results: "); double sum_delta2 = 0; double sum_ref2 = 0; double max_delta_ref = 0; for (int y = 0; y < dataH; y++) for (int x = 0; x < dataW; x++) { double rCPU = (double)h_ResultCPU[y * dataW + x]; double rGPU = (double)h_ResultGPU[y * fftW + x]; double delta = (rCPU - rGPU) * (rCPU - rGPU); double ref = rCPU * rCPU + rCPU * rCPU; if ((delta / ref) > max_delta_ref) { max_delta_ref = delta / ref; } sum_delta2 += delta; sum_ref2 += ref; } double L2norm = sqrt(sum_delta2 / sum_ref2); printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref)); bRetVal = (L2norm < 1e-6) ? true : false; printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n"); printf("...shutting down\n"); sdkStartTimer(&hTimer); checkCudaErrors(cufftDestroy(fftPlanInv)); checkCudaErrors(cufftDestroy(fftPlanFwd)); checkCudaErrors(cudaFree(d_DataSpectrum)); checkCudaErrors(cudaFree(d_KernelSpectrum)); checkCudaErrors(cudaFree(d_PaddedData)); checkCudaErrors(cudaFree(d_PaddedKernel)); checkCudaErrors(cudaFree(d_Data)); checkCudaErrors(cudaFree(d_Kernel)); FILE* fp = fopen("result_gpu.txt", "w+"); FILE* fp1 = fopen("result_cpu.txt", "w+"); for (int i = 0; i < dataH * dataW; i++) { fprintf(fp, "%f\n", h_ResultGPU[i]); fprintf(fp1, "%f\n", h_ResultCPU[i]); } fclose(fp); fclose(fp1); free(h_ResultGPU); free(h_ResultCPU); free(h_Data); free(h_Kernel); return bRetVal; }
void sararfftnd_destroy_plan( sararfftnd_plan plan ) { cufftDestroy( plan ); destroyPlanSize( plan ); }
void WorkerThread::createNewFilter() { // Free GPU memory from current filter and CUFFT cudaFree(_gabor_data); cudaFree(_gpu_image_0); cudaFree(_gpu_image_1); cufftDestroy(_fft_plan); float* gaussian_data; cudaMalloc((void**)&gaussian_data, sizeof(float) * _filter_pixels); int2 gaussian_size; gaussian_size.x = _filter_size; gaussian_size.y = _filter_size; int2 gaussian_center; gaussian_center.x = _filter_size / 2; gaussian_center.y = _filter_size / 2; gaussian(gaussian_data, _new_theta, _new_sigma, 1.0, gaussian_center, gaussian_size); float* harmonic_data; cudaMalloc((void**)&harmonic_data, sizeof(float) * _filter_pixels * 2); int2 harmonic_size; harmonic_size.x = _filter_size; harmonic_size.y = _filter_size; int2 harmonic_center; harmonic_center.x = _filter_size / 2; harmonic_center.y = _filter_size / 2; harmonic(harmonic_data, _new_theta, _new_lambda, _new_psi, harmonic_center, harmonic_size); float* host_harmonic = new float[_filter_size * _filter_size * 2]; cudaMalloc((void**)&_gabor_data, sizeof(float) * _filter_pixels * 2); int2 gabor_size; gabor_size.x = _filter_size; gabor_size.y = _filter_size; int2 gabor_center; gabor_center.x = _filter_size / 2; gabor_center.y = _filter_size / 2; multiplyRealComplex(gaussian_data, harmonic_data, _gabor_data, _filter_size * _filter_size); float* host_gabor_data = new float[_filter_pixels * 2]; cudaMemcpy(host_gabor_data, _gabor_data, sizeof(float) * _filter_pixels * 2, cudaMemcpyDeviceToHost); //pad the filter { float* data = host_gabor_data; float* target = _filter_image; memset(target, 0, sizeof(float) * _padded_pixels * 2); int padded_stride = 2 * _padded_size; int target_stride = 2 * _target_size; for (int i = 0; i < _target_size; ++i) { memcpy(target, data, sizeof(float) * target_stride); target += padded_stride; data += target_stride; } } // Copy gabor data into member for texture creation _filter_image_mutex.lock(); memcpy(_host_gabor_data, host_gabor_data, sizeof(float) * _filter_pixels * 2); _filter_image_mutex.unlock(); cudaFree(_gabor_data); cudaMalloc((void**)&_gabor_data, sizeof(float) * _padded_pixels * 2); cudaMalloc((void**)&_gpu_image_0, sizeof(float) * _padded_pixels * 2); cudaMalloc((void**)&_gpu_image_1, sizeof(float) * _padded_pixels * 2); cudaMemcpy(_gabor_data, _filter_image, sizeof(float) * _padded_pixels * 2, cudaMemcpyHostToDevice); cufftPlan2d(&_fft_plan, _padded_size, _padded_size, CUFFT_C2C); cufftExecC2C(_fft_plan, (cufftComplex*)(_gabor_data), (cufftComplex*)(_gabor_data), CUFFT_FORWARD); cudaMemcpy(_filter_image, _gabor_data, sizeof(float) * _padded_pixels * 2, cudaMemcpyDeviceToHost); // Free temporary GPU memory used for creation of filter cudaFree(gaussian_data); cudaFree(harmonic_data); delete host_harmonic; delete host_gabor_data; _should_create_new_filter = false; emit newFilterImage(); }
void Fastconv_base<D, T, ComplexFmt>::fconv (T const* in, T const* kernel, T* out, length_type rows, length_type columns, bool transform_kernel) { size_t kernel_size = (D == 1) ? columns : rows * columns; // allocate device memory and copy input and kernel over from host Device_storage<T> dev_out(rows * columns); Device_storage<T> dev_kernel(kernel_size); Device_storage<T> dev_in(rows * columns); // If the kernel is a matrix, it is assumed to be row-major and dense. // As a result, it can be copied as one contiguous chunk. cudaMemcpy( dev_kernel.data(), kernel, kernel_size * sizeof(T), cudaMemcpyHostToDevice); ASSERT_CUDA_OK(); // Transfer the input (row major, dense) cudaMemcpy( dev_in.data(), in, rows * columns * sizeof(T), cudaMemcpyHostToDevice); ASSERT_CUDA_OK(); // convert pointers to types the CUFFT library accepts typedef cufftComplex ctype; ctype* d_out = reinterpret_cast<ctype*>(dev_out.data()); ctype* d_kernel = reinterpret_cast<ctype*>(dev_kernel.data()); ctype* d_in = reinterpret_cast<ctype*>(dev_in.data()); cufftHandle plan; if (transform_kernel) { // Create a 1D FFT plan and transform the kernel cufftPlan1d(&plan, columns, CUFFT_C2C, 1); cufftExecC2C(plan, d_kernel, d_kernel, CUFFT_FORWARD); cufftDestroy(plan); } // Create a FFTM plan cufftPlan1d(&plan, columns, CUFFT_C2C, rows); // transform the data cufftExecC2C(plan, d_in, d_in, CUFFT_FORWARD); // convolve with kernel, combine with scaling needed for inverse FFT typedef typename impl::Scalar_of<T>::type scalar_type; scalar_type scale = 1 / static_cast<scalar_type>(columns); if (D == 1) vmmuls_row_cc(d_kernel, d_in, d_out, scale, rows, columns); else mmmuls_cc(d_kernel, d_in, d_out, scale, rows, columns); // inverse transform the signal cufftExecC2C(plan, d_out, d_out, CUFFT_INVERSE); cufftDestroy(plan); // Move data back to the host from the output buffer cudaMemcpy( out, dev_out.data(), rows * columns * sizeof(T), cudaMemcpyDeviceToHost); ASSERT_CUDA_OK(); }
bool test2(void) { float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU; float *d_Data, *d_Kernel, *d_PaddedData, *d_PaddedKernel; fComplex *d_DataSpectrum0, *d_KernelSpectrum0; cufftHandle fftPlan; bool bRetVal; StopWatchInterface *hTimer = NULL; sdkCreateTimer(&hTimer); printf("Testing updated custom R2C / C2R FFT-based convolution\n"); const int kernelH = 7; const int kernelW = 6; const int kernelY = 3; const int kernelX = 4; const int dataH = 2000; const int dataW = 2000; const int fftH = snapTransformSize(dataH + kernelH - 1); const int fftW = snapTransformSize(dataW + kernelW - 1); printf("...allocating memory\n"); h_Data = (float *)malloc(dataH * dataW * sizeof(float)); h_Kernel = (float *)malloc(kernelH * kernelW * sizeof(float)); h_ResultCPU = (float *)malloc(dataH * dataW * sizeof(float)); h_ResultGPU = (float *)malloc(fftH * fftW * sizeof(float)); checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum0, fftH * (fftW / 2) * sizeof(fComplex))); checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum0, fftH * (fftW / 2) * sizeof(fComplex))); printf("...generating random input data\n"); srand(2010); for (int i = 0; i < dataH * dataW; i++) { h_Data[i] = getRand(); } for (int i = 0; i < kernelH * kernelW; i++) { h_Kernel[i] = getRand(); } printf("...creating C2C FFT plan for %i x %i\n", fftH, fftW / 2); checkCudaErrors(cufftPlan2d(&fftPlan, fftH, fftW / 2, CUFFT_C2C)); printf("...uploading to GPU and padding convolution kernel and input data\n"); checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float))); padDataClampToBorder( d_PaddedData, d_Data, fftH, fftW, dataH, dataW, kernelH, kernelW, kernelY, kernelX ); padKernel( d_PaddedKernel, d_Kernel, fftH, fftW, kernelH, kernelW, kernelY, kernelX ); //CUFFT_INVERSE works just as well... const int FFT_DIR = CUFFT_FORWARD; //Not including kernel transformation into time measurement, //since convolution kernel is not changed very frequently printf("...transforming convolution kernel\n"); checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum0, FFT_DIR)); printf("...running GPU FFT convolution: "); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedData, (cufftComplex *)d_DataSpectrum0, FFT_DIR)); spProcess2D(d_DataSpectrum0, d_DataSpectrum0, d_KernelSpectrum0, fftH, fftW / 2, FFT_DIR); checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_DataSpectrum0, (cufftComplex *)d_PaddedData, -FFT_DIR)); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double gpuTime = sdkGetTimerValue(&hTimer); printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime); printf("...reading back GPU FFT results\n"); checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost)); printf("...running reference CPU convolution\n"); convolutionClampToBorderCPU( h_ResultCPU, h_Data, h_Kernel, dataH, dataW, kernelH, kernelW, kernelY, kernelX ); printf("...comparing the results: "); double sum_delta2 = 0; double sum_ref2 = 0; double max_delta_ref = 0; for (int y = 0; y < dataH; y++) { for (int x = 0; x < dataW; x++) { double rCPU = (double)h_ResultCPU[y * dataW + x]; double rGPU = (double)h_ResultGPU[y * fftW + x]; double delta = (rCPU - rGPU) * (rCPU - rGPU); double ref = rCPU * rCPU + rCPU * rCPU; if ((delta / ref) > max_delta_ref) { max_delta_ref = delta / ref; } sum_delta2 += delta; sum_ref2 += ref; } } double L2norm = sqrt(sum_delta2 / sum_ref2); printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref)); bRetVal = (L2norm < 1e-6) ? true : false; printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n"); printf("...shutting down\n"); sdkStartTimer(&hTimer); checkCudaErrors(cufftDestroy(fftPlan)); checkCudaErrors(cudaFree(d_KernelSpectrum0)); checkCudaErrors(cudaFree(d_DataSpectrum0)); checkCudaErrors(cudaFree(d_PaddedKernel)); checkCudaErrors(cudaFree(d_PaddedData)); checkCudaErrors(cudaFree(d_Kernel)); checkCudaErrors(cudaFree(d_Data)); free(h_ResultGPU); free(h_ResultCPU); free(h_Kernel); free(h_Data); return bRetVal; }
~FFT2D() { cufftDestroy(m_plan); }