Пример #1
0
void cleanup(void)
{
    cudaGraphicsUnregisterResource(cuda_vbo_resource);

    unbindTexture();
    deleteTexture();

    // Free all host and device resources
    free(hvfield);
    free(particles);
    cudaFree(dvfield);
    cudaFree(vxfield);
    cudaFree(vyfield);
    cufftDestroy(planr2c);
    cufftDestroy(planc2r);

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
    glDeleteBuffersARB(1, &vbo);

    sdkDeleteTimer(&timer);

    if (g_bExitESC)
    {
        checkCudaErrors(cudaDeviceReset());
    }
}
Пример #2
0
/*!
	computes the N-point DFT of signal X and stores in Y using CUDA's FFT library
*/
bool cuda_dft(cuComplex *Y, cuComplex *X, float scale, int N) {
	size_t bytes = (size_t)N * sizeof(cuComplex);
	cuComplex *Y_gpu, *X_gpu;
	
	cudaMalloc((void **)&Y_gpu, bytes);
	cudaMalloc((void **)&X_gpu, bytes);

	cudaMemcpy(Y_gpu, Y, bytes, cudaMemcpyHostToDevice);
	cudaMemcpy(X_gpu, X, bytes, cudaMemcpyHostToDevice);

	cufftHandle plan;
	cufftPlan1d(&plan, N, CUFFT_C2C, 1);

	cufftExecC2C(plan, X_gpu, Y_gpu, CUFFT_FORWARD);

	cufftDestroy(plan);

	cudaMemcpy(Y, Y_gpu, bytes, cudaMemcpyDeviceToHost);

	cudaFree(Y_gpu);
	cudaFree(X_gpu);

	for (int n = 0; n < N; n++) {
		Y[n].x *= scale;
		Y[n].y *= scale;
	}

	return true;
}
Пример #3
0
void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory(int n_coils)
{
  if (!gpuMemAllocated)
    return;

  cufftDestroy(fft_plan);
  // Destroy the cuFFT plan.
  if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
    printf("error at thread synchronization 9: %s\n",cudaGetErrorString(cudaGetLastError()));
  freeLookupTable();
  
  freeTotalDeviceMemory(data_indices_d,data_sorted_d,crds_d,gdata_d,sectors_d,sector_centers_d,NULL);//NULL as stop
  
  if (n_coils > 1 && deapo_d != NULL)
    cudaFree(deapo_d);
  
  if (this->applySensData())
    cudaFree(sens_d);
  
  if (this->applyDensComp())
    cudaFree(density_comp_d);

  showMemoryInfo();
  gpuMemAllocated = false;
}
Пример #4
0
/**
 * Destroys a previously created plan.
 * The CUDA destructor returns a result code, while the fftw2 destructor is
 * a void function. For now, the result code in the CUDA destructor is
 * ignored.
 */
void sararfftnd_destroy_plan( sararfftnd_plan plan ) {
#ifdef USE_GPUS
  cufftDestroy( plan );
#else // #ifndef USE_GPUS
  rfftwnd_destroy_plan( plan );
#endif
}
Пример #5
0
extern "C" void cuda_fft(double *d_data, int Lx, int Ny, void *stream) 
{ 
	cufftHandle plan; 
	cufftPlan1d(&plan, Lx, CUFFT_Z2Z, Ny); 
	cufftSetStream(plan, (cudaStream_t)stream); 
	cufftExecZ2Z(plan, (cufftDoubleComplex*)d_data, (cufftDoubleComplex*)d_data,CUFFT_FORWARD); 
	cufftDestroy(plan); 
} 
Пример #6
0
extern "C" void cuda_fft(float *d_data, int Lx, int Ny, void *stream)
{
	cufftHandle plan;
	cufftPlan1d(&plan, Lx, CUFFT_C2C, Ny);
	cufftSetStream(plan, (cudaStream_t)stream);
	cufftExecC2C(plan, (cufftComplex*)d_data, (cufftComplex*)d_data,CUFFT_FORWARD);
	cufftDestroy(plan);
}
Пример #7
0
Файл: cu2.c Проект: E-LLP/QuIP
void g_fwdfft(QSP_ARG_DECL  Data_Obj *dst_dp, Data_Obj *src1_dp)
{
	//Variable declarations
	int NX = 256;
	//int BATCH = 10;
	int BATCH = 1;
	cufftResult_t status;

	//Declare plan for FFT
	cufftHandle plan;
	//cufftComplex *data;
	//cufftComplex *result;
	void *data;
	void *result;
	cudaError_t drv_err;

	//Allocate RAM
	//cutilSafeCall(cudaMalloc(&data, sizeof(cufftComplex)*NX*BATCH));	
	//cutilSafeCall(cudaMalloc(&result, sizeof(cufftComplex)*NX*BATCH));
	drv_err = cudaMalloc(&data, sizeof(cufftComplex)*NX*BATCH);
	if( drv_err != cudaSuccess ){
		WARN("error allocating cuda data buffer for fft!?");
		return;
	}
	drv_err = cudaMalloc(&result, sizeof(cufftComplex)*NX*BATCH);
	if( drv_err != cudaSuccess ){
		WARN("error allocating cuda result buffer for fft!?");
		// BUG clean up previous malloc...
		return;
	}

	//Create plan for FFT
	status = cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);
	if (status != CUFFT_SUCCESS) {
		sprintf(ERROR_STRING, "Error in cufftPlan1d: %s\n", getCUFFTError(status));
		NWARN(ERROR_STRING);
	}

	//Run forward fft on data
	status = cufftExecC2C(plan, (cufftComplex *)data,
			(cufftComplex *)result, CUFFT_FORWARD);
	if (status != CUFFT_SUCCESS) {
		sprintf(ERROR_STRING, "Error in cufftExecC2C: %s\n", getCUFFTError(status));
		NWARN(ERROR_STRING);
	}

	//Run inverse fft on data
	/*status = cufftExecC2C(plan, data, result, CUFFT_INVERSE);
	if (status != CUFFT_SUCCESS)
	{
		sprintf(ERROR_STRING, "Error in cufftExecC2C: %s\n", getCUFFTError(status));
		NWARN(ERROR_STRING);
	}*/

	//Free resources
	cufftDestroy(plan);
	cudaFree(data);
}
Пример #8
0
/* Destroy a 1d plan */
int dfft_cuda_destroy_local_plan(cuda_plan_t *p)
    {
    cufftResult res = cufftDestroy(*p);
    if (res != CUFFT_SUCCESS)
        {
        printf("cufftDestroy error: %d\n", res);
        return res;
        }
    return 0;
    }
Пример #9
0
GLFluids::~GLFluids(){

    cudaGraphicsUnregisterResource(cuda_vbo_resource);

    unbind_texture();
    delete_texture();

    // Free all host and device resources
    free(hvfield);
    free(particles);
    cudaFree(dvfield);
    cudaFree(vxfield);
    cudaFree(vyfield);
    cufftDestroy(planr2c);
    cufftDestroy(planc2r);

    glBindBuffer(GL_ARRAY_BUFFER, 0);
    glDeleteBuffers(1, &vbo);

}
Пример #10
0
void oskar_fft_free(oskar_FFT* h)
{
    int status = 0;
    if (!h) return;
    oskar_mem_free(h->fftpack_work, &status);
    oskar_mem_free(h->fftpack_wsave, &status);
#ifdef OSKAR_HAVE_CUDA
    if (h->location == OSKAR_GPU)
        cufftDestroy(h->cufft_plan);
#endif
    free(h);
}
Пример #11
0
void fft3dGPU(T1* d_data, int nx, int ny, int nz, void* stream)
{
	cufftHandle plan;
	cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL);

	if (cufftPlan3d(&plan, nz, ny, nx, CUFFT_C2C)!=CUFFT_SUCCESS) {
		fprintf(stderr, "CUFFT error: Plan creation failed");
	}

	cufftSetStream(plan, (cudaStream_t) stream);
	cufftExecC2C(plan, (cufftComplex*) d_data, (cufftComplex*) d_data, CUFFT_FORWARD);
	cufftDestroy(plan);
}
Пример #12
0
void
Fastconv_base<D, T, C>::fconv
  (T const* in, T const* kernel, T* out, length_type rows, length_type columns, bool transform_kernel)
{
  // convert pointers to types the CUFFT library accepts
  typedef cufftComplex ctype;
  ctype* d_out = reinterpret_cast<ctype*>(out);
  ctype* d_kernel = const_cast<ctype*>(reinterpret_cast<ctype const*>(kernel));
  ctype* d_in = const_cast<ctype*>(reinterpret_cast<ctype const*>(in));

  cufftHandle plan;
  if (transform_kernel)
  {
    // Create a 1D FFT plan and transform the kernel
    cufftPlan1d(&plan, columns, CUFFT_C2C, 1);
    cufftExecC2C(plan, d_kernel, d_kernel, CUFFT_FORWARD);
    cufftDestroy(plan);
  }

  // Create a FFTM plan
  cufftPlan1d(&plan, columns, CUFFT_C2C, rows);

  // transform the data
  cufftExecC2C(plan, d_in, d_in, CUFFT_FORWARD);

  // convolve with kernel, combine with scaling needed for inverse FFT
  typedef typename impl::scalar_of<T>::type scalar_type;
  scalar_type scale = 1 / static_cast<scalar_type>(columns);
  if (D == 1)
    vmmuls_row(kernel, in, out, scale, rows, columns);
  else
    mmmuls(kernel, in, out, scale, rows, columns);

  // inverse transform the signal
  cufftExecC2C(plan, d_out, d_out, CUFFT_INVERSE);
  cufftDestroy(plan);
}
Пример #13
0
// Main execution loop.
// Capture from camera, filter image data, save filered data.
// Note*: wait conditions are used to ensure that the previous image
// has been rendered while allowing for continued operation.
void WorkerThread::run()
{
	createInitialFilter();
	while( 1 )
	{
		// Throttling
		while( !g_throttleTimerFlag ){}
		g_throttleTimerFlag = 0;

		// Capture image data
		_camera->capture();
		float *data = _camera->frameData();

		// Need a new filter?
		_new_filter_mutex.lock();	
		if( _should_create_new_filter )
		{
			createNewFilter();
		}
		_new_filter_mutex.unlock();	

		// Gabor filter
		gaborFilter( data );	

		// Make sure the previous image has been rendered before overwriting it
		_image_mutex.lock();
		if( !_is_image_processed )
		{
			_image_processed.wait(&_image_mutex);	
		}

		// Copy image data so the thread continues capturing and filtering
		memcpy( _full_image, data, sizeof(float) * _original_image_width * _original_image_height * 2 );
		_is_image_processed = false;
		emit filterComplete();
		_image_mutex.unlock();

		if( _should_terminate )
		{
			break;
		}
	}

	// Free GPU resources
	cudaFree(_gabor_data);
	cudaFree(_gpu_image_0);
	cudaFree(_gpu_image_1);
	cufftDestroy(_fft_plan);
}
Пример #14
0
void cleanup(void)
{
    cudaGraphicsUnregisterResource(cuda_vbo_resource);

    unbindTexture();
    deleteTexture();

    // Free all host and device resources
    free(hvfield);
    free(particles);
#ifdef BROADCAST
	free(packets);
#endif
    cudaFree(dvfield);
    cudaFree(vxfield);
    cudaFree(vyfield);
    cufftDestroy(planr2c);
    cufftDestroy(planc2r);

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
    glDeleteBuffersARB(1, &vbo);

    sdkDeleteTimer(&timer);
}
Пример #15
0
/*
 * Class:     jcuda_jcufft_JCufft
 * Method:    cufftDestroyNative
 * Signature: (Ljcuda/jcufft/JCufftHandle;)I
 */
JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftDestroyNative
  (JNIEnv *env, jclass cla, jobject handle)
{
    if (handle == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftDestroy");
        return JCUFFT_INTERNAL_ERROR;
    }

    Logger::log(LOG_TRACE, "Destroying plan\n");

    cufftHandle plan = env->GetIntField(handle, cufftHandle_plan);
    cufftResult result = cufftDestroy(plan);
    return result;
}
Пример #16
0
void cleanup()
{
    // DEPRECATED: cutilSafeCall(cudaGLUnregisterBufferObject(heightVertexBuffer));
    cutilSafeCall(cudaGraphicsUnregisterResource(cuda_heightVB_resource));
    // DEPRECATED: cutilSafeCall(cudaGLUnregisterBufferObject(slopeVertexBuffer));
    cutilSafeCall(cudaGraphicsUnregisterResource(cuda_slopeVB_resource));

	deleteVBO(&posVertexBuffer);
    deleteVBO(&heightVertexBuffer);
    deleteVBO(&slopeVertexBuffer);

    cutilSafeCall( cudaFree(d_h0) );
    cutilSafeCall( cudaFree(d_slope) );
    free(h_h0);
    cufftDestroy(fftPlan);
}
Пример #17
0
void fft3dGPU(T1* d_data, int nx, int ny, int nz, void* stream)
{   //printf("Running 3d forward xform \n");
	cufftHandle plan;
	cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL);

	if (cufftPlan3d(&plan, nz, ny, nx, CUFFT_Z2Z)!=CUFFT_SUCCESS) {
		printf("CUFFT error: Plan creation failed\n");
	}
	//printf("Built plan \n");

	cufftSetStream(plan, (cudaStream_t) stream);

	if (cufftExecZ2Z(plan, (cufftDoubleComplex*) d_data, (cufftDoubleComplex*) d_data, CUFFT_FORWARD)!=CUFFT_SUCCESS) {
		printf("CUFFT error: Plan execution failed\n");
	};
	cufftDestroy(plan);
}
Пример #18
0
void fft_3d_destroy_plan_cuda(struct fft_plan_3d *plan)
{
#ifdef FFT_CUFFT
  if (plan->pre_plan) remap_3d_destroy_plan(plan->pre_plan);
  if (plan->mid1_plan) remap_3d_destroy_plan(plan->mid1_plan);
  if (plan->mid2_plan) remap_3d_destroy_plan(plan->mid2_plan);
  if (plan->post_plan) remap_3d_destroy_plan(plan->post_plan);

  if (plan->copy) free(plan->copy);
  if (plan->scratch) free(plan->scratch);


  //cufftDestroy(plan->plan_fast);
  //cufftDestroy(plan->plan_mid);
  //cufftDestroy(plan->plan_slow);
  cufftDestroy(plan->plan_3d);
  free(plan);
#endif
}
Пример #19
0
/**
 * Destroy AccFFT GPU plan.
 * @param plan Input plan to be destroyed.
 */
void accfft_destroy_plan_gpu(accfft_plan_gpu * plan){

  if(plan->T_plan_1!=NULL)delete(plan->T_plan_1);
  if(plan->T_plan_1i!=NULL)delete(plan->T_plan_1i);
  if(plan->T_plan_2!=NULL)delete(plan->T_plan_2);
  if(plan->T_plan_2i!=NULL)delete(plan->T_plan_2i);
  if(plan->Mem_mgr!=NULL)delete(plan->Mem_mgr);

  if(plan->fplan_0!=-1)cufftDestroy(plan->fplan_0);
  if(plan->fplan_1!=-1)cufftDestroy(plan->fplan_1);
  if(plan->fplan_2!=-1)cufftDestroy(plan->fplan_2);

  if(plan->iplan_0!=-1)cufftDestroy(plan->iplan_0);
  if(plan->iplan_1!=-1)cufftDestroy(plan->iplan_1);
  if(plan->iplan_2!=-1)cufftDestroy(plan->iplan_2);

  MPI_Comm_free(&plan->row_comm);
  MPI_Comm_free(&plan->col_comm);
  return;
}//end accfft_destroy_plan_gpu
Пример #20
0
int main2(int sockfd)
{
        cufftHandle plan;
        cufftComplex *devPtr;
        cufftReal indata[NX*BATCH];
        cufftComplex data[NX*BATCH];
        int i,timer,j,k;
        char fname[15];
        FILE *f;
	#define BUFSIZE (21*4096*sizeof(int))
	int buffer[BUFSIZE];

        int p,nread;


	f = fopen("21-4096","rb");
	nread=fread(buffer,BUFSIZE,1,f);
	printf("nread=%i\n",nread);
	fclose(f);

        i=0;
        for (j=0;j<BATCH;j++) {
            for (k=0;k<NX;k++) {
                data[j*NX+k].x = buffer[j*NX+k];
                data[j*NX+k].y = 0;
            }
	}


        //f=fopen("y.txt","r");
    /* source data creation */

        //int sockfd = myconnect();
        //printf("connected\n");
	
		

        /* WORKING!!!!!!!!
        i=0;
        for (j=0;j<BATCH;j++) {
            sprintf(fname,"%i.txt",j);
            printf("%s\n",fname);
            f = fopen(fname,"r");
            for (k=0;k<NX;k++) {
                fscanf(f,"%i\n",&p);
                data[j*NX+k].x = p;
                data[j*NX+k].y = 0;
            }
            fclose(f);
	*/
/*
        for(i=  0 ; i < NX*BATCH ; i++){
                //fscanf(f,"%i\n",&p);
                //data[i].x= p;
                data[i].x= 1.0f;
                //printf("%f\n",data[i].x);
                data[i].y = 0.0f;
        }
        //fclose(f)
        */
        //}


        /* creates 1D FFT plan */
        cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);


        /*
        cutCreateTimer(&timer);
        cutResetTimer(timer);
        cutStartTimer(timer);
        */
        
    /* GPU memory allocation */
        cudaMalloc((void**)&devPtr, sizeof(cufftComplex)*NX*BATCH);

    /* transfer to GPU memory */
        cudaMemcpy(devPtr, data, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyHostToDevice);


        /* executes FFT processes */
        cufftExecC2C(plan, devPtr, devPtr, CUFFT_FORWARD);

        /* executes FFT processes (inverse transformation) */
       //cufftExecC2C(plan, devPtr, devPtr, CUFFT_INVERSE);

    /* transfer results from GPU memory */
        cudaMemcpy(data, devPtr, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyDeviceToHost);

        /* deletes CUFFT plan */
        cufftDestroy(plan);

    /* frees GPU memory */
        cudaFree(devPtr);

        /*
        cudaThreadSynchronize();
        cutStopTimer(timer);
        printf("%f\n",cutGetTimerValue(timer)/(float)1000);
        cutDeleteTimer(timer);
        */

        /*
        float mag;
        for(i = 0 ; i < NX*BATCH ; i++){
                //printf("data[%d] %f %f\n", i, data[i].x, data[i].y);
                //printf("%f\n", data[i].x);
                mag = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX;
                printf("%f\n",mag);

        }
        */

/*
        // save as text file
        float mag;
        i=0;
        for (j=0;j<BATCH;j++) {
            sprintf(fname,"%i-mag.txt",j);
            printf("%s\n",fname);
            f = fopen(fname,"w");
            for (k=0;k<NX;k++) {
                //fscanf(f,"%i\n",&p);
                if (k>50)
                    continue;
                i = j*NX+k;
                mag = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX;
                fprintf(f,"%f\n",mag);
            }
            fclose(f);
        }
*/


        float mag;
        i=0;
        float mags[NX];
        int magsint[NX*BATCH];
        memset(magsint,0,sizeof(int)*NX*BATCH);
        int u = 0;

        printf("%f %f %f %f\n",data[0].x,data[1].x,data[2].x,data[3].x);

        //printf("%i %i %i %i\n",magsint[0],magsint[1],magsint[2],magsint[3]);

//        f = fopen("ffts.bin","wb");
        for (j=0;j<BATCH;j++) {
//            sprintf(fname,"%i-bin.dat",j);
//            printf("%s\n",fname);

            for (k=0;k<NX;k++) {
                //fscanf(f,"%i\n",&p);
                if (k>50)
                    continue;
                i = j*NX+k;
                mags[k] = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX;
                magsint[u]=mags[k]    ;
                u++;
                //fprintf(f,"%f\n",mag);
                
            }

            //f = fopen(fname,"wb");
  //          fwrite(magsint,sizeof(int)*50,1,f);
        }
        int n;
        n = write(sockfd,magsint,sizeof(int)*BATCH*50);
        printf("%i %i %i %i\n",magsint[0],magsint[1],magsint[2],magsint[3]);
        printf("send ok, size: %i\n",n);
        //fclose(f);
        
        
        return 0;
}
Пример #21
0
 //! \brief Destructor.
 virtual ~FFT()
 {
   cufftDestroy(fftplan_);
 }
Пример #22
0
cufftResult WINAPI wine_cufftDestroy(cufftHandle plan){
	WINE_TRACE("\n");
	return cufftDestroy( plan );
}
Пример #23
0
int main(int argc, char *argv[]) {
	int i;
	struct timeval begin, end;
	int size;
	size_t bytes;
	int n = 0, m = 0;
	STARPUFFT(plan) plan;
#ifdef STARPU_HAVE_FFTW
	_FFTW(plan) fftw_plan;
#endif
#ifdef STARPU_USE_CUDA
	cufftHandle cuda_plan;
	cudaError_t cures;
#endif
	double timing;

	if (argc < 2 || argc > 3) {
		fprintf(stderr,"need one or two size of vector\n");
		exit(EXIT_FAILURE);
	}

	starpu_init(NULL);

	if (argc == 2) {
		n = atoi(argv[1]);

		/* 1D */
		size = n;
	} else if (argc == 3) {
		n = atoi(argv[1]);
		m = atoi(argv[2]);

		/* 2D */
		size = n * m;
	} else {
		assert(0);
	}

	bytes = size * sizeof(STARPUFFT(complex));

	STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in));
	starpu_srand48(0);
	for (i = 0; i < size; i++)
		in[i] = starpu_drand48() + I * starpu_drand48();

	STARPUFFT(complex) *out = STARPUFFT(malloc)(size * sizeof(*out));

#ifdef STARPU_HAVE_FFTW
	STARPUFFT(complex) *out_fftw = STARPUFFT(malloc)(size * sizeof(*out_fftw));
#endif

#ifdef STARPU_USE_CUDA
	STARPUFFT(complex) *out_cuda = malloc(size * sizeof(*out_cuda));
#endif

	if (argc == 2) {
		plan = STARPUFFT(plan_dft_1d)(n, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS)
			printf("erf\n");
#endif

	} else if (argc == 3) {
		plan = STARPUFFT(plan_dft_2d)(n, m, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS);
#endif
	} else {
		assert(0);
	}

#ifdef STARPU_HAVE_FFTW
	gettimeofday(&begin, NULL);
	_FFTW(execute)(fftw_plan);
	gettimeofday(&end, NULL);
	_FFTW(destroy_plan)(fftw_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif
#ifdef STARPU_USE_CUDA
	gettimeofday(&begin, NULL);
	if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS)
		printf("erf2\n");
	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
		STARPU_CUDA_REPORT_ERROR(cures);
	gettimeofday(&end, NULL);
	cufftDestroy(cuda_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif

	STARPUFFT(execute)(plan, in, out);

	STARPUFFT(showstats)(stdout);
	STARPUFFT(destroy_plan)(plan);

	printf("\n");
#if 0
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(in[i]), creal(in[i]));
	printf("\n\n");
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out[i]), creal(out[i]));
	printf("\n\n");
#ifdef STARPU_HAVE_FFTW
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out_fftw[i]), creal(out_fftw[i]));
	printf("\n\n");
#endif
#endif

#ifdef STARPU_HAVE_FFTW
{
	double max = 0., tot = 0., norm = 0., normdiff = 0.;
	for (i = 0; i < size; i++) {
		double diff = cabs(out[i]-out_fftw[i]);
		double diff2 = diff * diff;
		double size = cabs(out_fftw[i]);
		double size2 = size * size;
		if (diff > max)
			max = diff;
		tot += diff;
		normdiff += diff2;
		norm += size2;
	}
	fprintf(stderr, "\nmaximum difference %g\n", max);
	fprintf(stderr, "average difference %g\n", tot / size);
	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
	double relmaxdiff = max / sqrt(norm);
	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
	double relavgdiff = (tot / size) / sqrt(norm);
	fprintf(stderr, "relative average difference %g\n", relavgdiff);
	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
		return EXIT_FAILURE;
	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
		return EXIT_FAILURE;
}
#endif

#ifdef STARPU_USE_CUDA
{
	double max = 0., tot = 0., norm = 0., normdiff = 0.;
	for (i = 0; i < size; i++) {
		double diff = cabs(out_cuda[i]-out_fftw[i]);
		double diff2 = diff * diff;
		double size = cabs(out_fftw[i]);
		double size2 = size * size;
		if (diff > max)
			max = diff;
		tot += diff;
		normdiff += diff2;
		norm += size2;
	}
	fprintf(stderr, "\nmaximum difference %g\n", max);
	fprintf(stderr, "average difference %g\n", tot / size);
	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
	double relmaxdiff = max / sqrt(norm);
	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
	double relavgdiff = (tot / size) / sqrt(norm);
	fprintf(stderr, "relative average difference %g\n", relavgdiff);
	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
		return EXIT_FAILURE;
	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
		return EXIT_FAILURE;
}
#endif

	STARPUFFT(free)(in);
	STARPUFFT(free)(out);

#ifdef STARPU_HAVE_FFTW
	STARPUFFT(free)(out_fftw);
#endif

#ifdef STARPU_USE_CUDA
	free(out_cuda);
#endif

	starpu_shutdown();

	return EXIT_SUCCESS;
}
 /**
    Destructor.
    The destructor destroys the CUFFT plan.
 */
 inline ~Plan()
 {
   CUFFT_CHECK(cufftDestroy(plan));
 }
Пример #25
0
bool test0(void)
{
    float
    *h_Data,
    *h_Kernel,
    *h_ResultCPU,
    *h_ResultGPU;

    float
    *d_Data,
    *d_PaddedData,
    *d_Kernel,
    *d_PaddedKernel;

    fComplex
    *d_DataSpectrum,
    *d_KernelSpectrum;

    cufftHandle
    fftPlanFwd,
    fftPlanInv;

    bool bRetVal;
    StopWatchInterface *hTimer = NULL;
    sdkCreateTimer(&hTimer);

    printf("Testing built-in R2C / C2R FFT-based convolution\n");
    const int kernelH = 3;
    const int kernelW = 3;
    const int kernelY = 1;
    const int kernelX = 1;
    const int   dataH = 10;
    const int   dataW = 10;
    const int    fftH = snapTransformSize(dataH + kernelH - 1);
    const int    fftW = snapTransformSize(dataW + kernelW - 1);

    printf("...allocating memory\n");
    h_Data      = (float *)malloc(dataH   * dataW * sizeof(float));
    h_Kernel    = (float *)malloc(kernelH * kernelW * sizeof(float));
    h_ResultCPU = (float *)malloc(dataH   * dataW * sizeof(float));
    h_ResultGPU = (float *)malloc(fftH    * fftW * sizeof(float));

    checkCudaErrors(cudaMalloc((void **)&d_Data,   dataH   * dataW   * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float)));

    checkCudaErrors(cudaMalloc((void **)&d_PaddedData,   fftH * fftW * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float)));

    checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum,   fftH * (fftW / 2 + 1) * sizeof(fComplex)));
    checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex)));

    printf("...generating random input data\n");
    srand(2010);

    for (int i = 0; i < dataH * dataW; i++)
    {
        //h_Data[i] = getRand();
		h_Data[i] = i + 1;
    }

    for (int i = 0; i < kernelH * kernelW; i++)
    {
        //h_Kernel[i] = getRand();
		h_Kernel[i] = i + 1;
    }

	FILE* fp2 = fopen("input_kernel.txt", "w+");
	FILE* fp3 = fopen("input_data.txt", "w+");
	for (int i = 0; i < dataH * dataW; i++)
		fprintf(fp3, "%f\n", h_Data[i]);
	for (int i = 0; i < kernelH * kernelW; i++)
		fprintf(fp2, "%f\n", h_Kernel[i]);
	fclose(fp2);
	fclose(fp3);

    printf("...creating R2C & C2R FFT plans for %i x %i\n", fftH, fftW);
    checkCudaErrors(cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C));
    checkCudaErrors(cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R));

    printf("...uploading to GPU and padding convolution kernel and input data\n");
    checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_Data,   h_Data,   dataH   * dataW *   sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float)));
    checkCudaErrors(cudaMemset(d_PaddedData,   0, fftH * fftW * sizeof(float)));

    padKernel(
        d_PaddedKernel,
        d_Kernel,
        fftH,
        fftW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    padDataClampToBorder(
        d_PaddedData,
        d_Data,
        fftH,
        fftW,
        dataH,
        dataW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    //Not including kernel transformation into time measurement,
    //since convolution kernel is not changed very frequently
    printf("...transforming convolution kernel\n");
    checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum));

    printf("...running GPU FFT convolution: ");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData, (cufftComplex *)d_DataSpectrum));
    modulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftH, fftW, 1);
    checkCudaErrors(cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, (cufftReal *)d_PaddedData));

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    double gpuTime = sdkGetTimerValue(&hTimer);
    printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime);

    printf("...reading back GPU convolution results\n");
    checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost));

    printf("...running reference CPU convolution\n");
    convolutionClampToBorderCPU(
        h_ResultCPU,
        h_Data,
        h_Kernel,
        dataH,
        dataW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    printf("...comparing the results: ");
    double sum_delta2 = 0;
    double sum_ref2   = 0;
    double max_delta_ref = 0;

    for (int y = 0; y < dataH; y++)
        for (int x = 0; x < dataW; x++)
        {
            double  rCPU = (double)h_ResultCPU[y * dataW + x];
            double  rGPU = (double)h_ResultGPU[y * fftW  + x];
            double delta = (rCPU - rGPU) * (rCPU - rGPU);
            double   ref = rCPU * rCPU + rCPU * rCPU;

            if ((delta / ref) > max_delta_ref)
            {
                max_delta_ref = delta / ref;
            }

            sum_delta2 += delta;
            sum_ref2   += ref;
        }

    double L2norm = sqrt(sum_delta2 / sum_ref2);
    printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref));
    bRetVal = (L2norm < 1e-6) ? true : false;
    printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n");

    printf("...shutting down\n");
    sdkStartTimer(&hTimer);

    checkCudaErrors(cufftDestroy(fftPlanInv));
    checkCudaErrors(cufftDestroy(fftPlanFwd));

    checkCudaErrors(cudaFree(d_DataSpectrum));
    checkCudaErrors(cudaFree(d_KernelSpectrum));
    checkCudaErrors(cudaFree(d_PaddedData));
    checkCudaErrors(cudaFree(d_PaddedKernel));
    checkCudaErrors(cudaFree(d_Data));
    checkCudaErrors(cudaFree(d_Kernel));

	FILE* fp = fopen("result_gpu.txt", "w+");
	FILE* fp1 = fopen("result_cpu.txt", "w+");
	for (int i = 0; i < dataH * dataW; i++)
	{
		fprintf(fp, "%f\n", h_ResultGPU[i]);
		fprintf(fp1, "%f\n", h_ResultCPU[i]);
	}
	fclose(fp);
	fclose(fp1);

    free(h_ResultGPU);
    free(h_ResultCPU);
    free(h_Data);
    free(h_Kernel);

    return bRetVal;
}
Пример #26
0
void sararfftnd_destroy_plan(
  sararfftnd_plan plan
) {
  cufftDestroy( plan );
  destroyPlanSize( plan );
}
Пример #27
0
void WorkerThread::createNewFilter()
{
	// Free GPU memory from current filter and CUFFT
	cudaFree(_gabor_data);
	cudaFree(_gpu_image_0);
	cudaFree(_gpu_image_1);
	cufftDestroy(_fft_plan);

	float* gaussian_data;
	cudaMalloc((void**)&gaussian_data, sizeof(float) * _filter_pixels);
	int2 gaussian_size;
	gaussian_size.x = _filter_size;
	gaussian_size.y = _filter_size;
	int2 gaussian_center;
	gaussian_center.x = _filter_size / 2;
	gaussian_center.y = _filter_size / 2;
	gaussian(gaussian_data, _new_theta, _new_sigma, 1.0, gaussian_center, gaussian_size);
	
	float* harmonic_data;
	cudaMalloc((void**)&harmonic_data, sizeof(float) * _filter_pixels * 2);
	int2 harmonic_size;
	harmonic_size.x = _filter_size;
	harmonic_size.y = _filter_size;
	int2 harmonic_center;
	harmonic_center.x = _filter_size / 2;
	harmonic_center.y = _filter_size / 2;
	harmonic(harmonic_data, _new_theta, _new_lambda, _new_psi, harmonic_center, harmonic_size);
	float* host_harmonic = new float[_filter_size * _filter_size * 2];
	
	cudaMalloc((void**)&_gabor_data, sizeof(float) * _filter_pixels * 2);
	int2 gabor_size;
	gabor_size.x = _filter_size;
	gabor_size.y = _filter_size;
	int2 gabor_center;
	gabor_center.x = _filter_size / 2;
	gabor_center.y = _filter_size / 2;
	multiplyRealComplex(gaussian_data, harmonic_data, _gabor_data, _filter_size * _filter_size);
	float* host_gabor_data = new float[_filter_pixels * 2];
	cudaMemcpy(host_gabor_data,
		_gabor_data,
		sizeof(float) * _filter_pixels * 2,
		cudaMemcpyDeviceToHost);

	//pad the filter
	{
		float* data = host_gabor_data;
		float* target = _filter_image;
		memset(target, 0, sizeof(float) * _padded_pixels * 2);
		int padded_stride = 2 * _padded_size;
		int target_stride = 2 * _target_size;
		for (int i = 0; i < _target_size; ++i)
		{
			memcpy(target, data, sizeof(float) * target_stride);
			target += padded_stride;
			data += target_stride;
		}
	}

	// Copy gabor data into member for texture creation
	_filter_image_mutex.lock();
	memcpy(_host_gabor_data, host_gabor_data, sizeof(float) * _filter_pixels * 2);
	_filter_image_mutex.unlock();
	
	cudaFree(_gabor_data);
	
	cudaMalloc((void**)&_gabor_data, sizeof(float) * _padded_pixels * 2);
	cudaMalloc((void**)&_gpu_image_0, sizeof(float) * _padded_pixels * 2);
	cudaMalloc((void**)&_gpu_image_1, sizeof(float) * _padded_pixels * 2);
	cudaMemcpy(_gabor_data,
		_filter_image,
		sizeof(float) * _padded_pixels * 2,
		cudaMemcpyHostToDevice);

	cufftPlan2d(&_fft_plan, _padded_size, _padded_size, CUFFT_C2C);
	cufftExecC2C(_fft_plan,
		(cufftComplex*)(_gabor_data),
		(cufftComplex*)(_gabor_data),
		CUFFT_FORWARD);
	cudaMemcpy(_filter_image,
		_gabor_data,
		sizeof(float) * _padded_pixels * 2,
		cudaMemcpyDeviceToHost);

	// Free temporary GPU memory used for creation of filter
	cudaFree(gaussian_data);
	cudaFree(harmonic_data);

	delete host_harmonic;
	delete host_gabor_data;

	_should_create_new_filter = false;

	emit newFilterImage();
}
Пример #28
0
void
Fastconv_base<D, T, ComplexFmt>::fconv
  (T const* in, T const* kernel, T* out, length_type rows, length_type columns, bool transform_kernel)
{
  size_t kernel_size = (D == 1) ? columns : rows * columns;

  // allocate device memory and copy input and kernel over from host
  Device_storage<T> dev_out(rows * columns);
  Device_storage<T> dev_kernel(kernel_size);
  Device_storage<T> dev_in(rows * columns);

  // If the kernel is a matrix, it is assumed to be row-major and dense.
  // As a result, it can be copied as one contiguous chunk.
  cudaMemcpy(
    dev_kernel.data(),
    kernel,
    kernel_size * sizeof(T), 
    cudaMemcpyHostToDevice);
  ASSERT_CUDA_OK();

  // Transfer the input (row major, dense)
  cudaMemcpy(
    dev_in.data(),
    in,
    rows * columns * sizeof(T), 
    cudaMemcpyHostToDevice);
  ASSERT_CUDA_OK();
 

  // convert pointers to types the CUFFT library accepts
  typedef cufftComplex ctype;
  ctype* d_out = reinterpret_cast<ctype*>(dev_out.data());
  ctype* d_kernel = reinterpret_cast<ctype*>(dev_kernel.data());
  ctype* d_in = reinterpret_cast<ctype*>(dev_in.data());

  cufftHandle plan;
  if (transform_kernel)
  {
    // Create a 1D FFT plan and transform the kernel
    cufftPlan1d(&plan, columns, CUFFT_C2C, 1);
    cufftExecC2C(plan, d_kernel, d_kernel, CUFFT_FORWARD);
    cufftDestroy(plan);
  }

  // Create a FFTM plan
  cufftPlan1d(&plan, columns, CUFFT_C2C, rows);

  // transform the data
  cufftExecC2C(plan, d_in, d_in, CUFFT_FORWARD);

  // convolve with kernel, combine with scaling needed for inverse FFT
  typedef typename impl::Scalar_of<T>::type scalar_type;
  scalar_type scale = 1 / static_cast<scalar_type>(columns);
  if (D == 1)
    vmmuls_row_cc(d_kernel, d_in, d_out, scale, rows, columns);
  else
    mmmuls_cc(d_kernel, d_in, d_out, scale, rows, columns);

  // inverse transform the signal
  cufftExecC2C(plan, d_out, d_out, CUFFT_INVERSE);
  cufftDestroy(plan);

  // Move data back to the host from the output buffer
  cudaMemcpy(
    out,
    dev_out.data(),
    rows * columns * sizeof(T), 
    cudaMemcpyDeviceToHost);
  ASSERT_CUDA_OK();
}
Пример #29
0
bool test2(void)
{
    float
    *h_Data,
    *h_Kernel,
    *h_ResultCPU,
    *h_ResultGPU;

    float
    *d_Data,
    *d_Kernel,
    *d_PaddedData,
    *d_PaddedKernel;

    fComplex
    *d_DataSpectrum0,
    *d_KernelSpectrum0;

    cufftHandle
    fftPlan;

    bool bRetVal;
    StopWatchInterface *hTimer = NULL;
    sdkCreateTimer(&hTimer);

    printf("Testing updated custom R2C / C2R FFT-based convolution\n");
    const int kernelH = 7;
    const int kernelW = 6;
    const int kernelY = 3;
    const int kernelX = 4;
    const int dataH = 2000;
    const int dataW = 2000;
    const int fftH = snapTransformSize(dataH + kernelH - 1);
    const int fftW = snapTransformSize(dataW + kernelW - 1);

    printf("...allocating memory\n");
    h_Data      = (float *)malloc(dataH   * dataW * sizeof(float));
    h_Kernel    = (float *)malloc(kernelH * kernelW * sizeof(float));
    h_ResultCPU = (float *)malloc(dataH   * dataW * sizeof(float));
    h_ResultGPU = (float *)malloc(fftH    * fftW * sizeof(float));

    checkCudaErrors(cudaMalloc((void **)&d_Data,   dataH   * dataW   * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float)));

    checkCudaErrors(cudaMalloc((void **)&d_PaddedData,   fftH * fftW * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float)));

    checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum0,   fftH * (fftW / 2) * sizeof(fComplex)));
    checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum0, fftH * (fftW / 2) * sizeof(fComplex)));

    printf("...generating random input data\n");
    srand(2010);

    for (int i = 0; i < dataH * dataW; i++)
    {
        h_Data[i] = getRand();
    }

    for (int i = 0; i < kernelH * kernelW; i++)
    {
        h_Kernel[i] = getRand();
    }

    printf("...creating C2C FFT plan for %i x %i\n", fftH, fftW / 2);
    checkCudaErrors(cufftPlan2d(&fftPlan, fftH, fftW / 2, CUFFT_C2C));

    printf("...uploading to GPU and padding convolution kernel and input data\n");
    checkCudaErrors(cudaMemcpy(d_Data,   h_Data,   dataH   * dataW *   sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemset(d_PaddedData,   0, fftH * fftW * sizeof(float)));
    checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float)));

    padDataClampToBorder(
        d_PaddedData,
        d_Data,
        fftH,
        fftW,
        dataH,
        dataW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    padKernel(
        d_PaddedKernel,
        d_Kernel,
        fftH,
        fftW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    //CUFFT_INVERSE works just as well...
    const int FFT_DIR = CUFFT_FORWARD;

    //Not including kernel transformation into time measurement,
    //since convolution kernel is not changed very frequently
    printf("...transforming convolution kernel\n");
    checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum0, FFT_DIR));

    printf("...running GPU FFT convolution: ");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedData, (cufftComplex *)d_DataSpectrum0, FFT_DIR));
    spProcess2D(d_DataSpectrum0, d_DataSpectrum0, d_KernelSpectrum0, fftH, fftW / 2, FFT_DIR);
    checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_DataSpectrum0, (cufftComplex *)d_PaddedData, -FFT_DIR));

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    double gpuTime = sdkGetTimerValue(&hTimer);
    printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime);

    printf("...reading back GPU FFT results\n");
    checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost));

    printf("...running reference CPU convolution\n");
    convolutionClampToBorderCPU(
        h_ResultCPU,
        h_Data,
        h_Kernel,
        dataH,
        dataW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    printf("...comparing the results: ");
    double sum_delta2 = 0;
    double sum_ref2   = 0;
    double max_delta_ref = 0;

    for (int y = 0; y < dataH; y++)
    {
        for (int x = 0; x < dataW; x++)
        {
            double  rCPU = (double)h_ResultCPU[y * dataW + x];
            double  rGPU = (double)h_ResultGPU[y * fftW  + x];
            double delta = (rCPU - rGPU) * (rCPU - rGPU);
            double   ref = rCPU * rCPU + rCPU * rCPU;

            if ((delta / ref) > max_delta_ref)
            {
                max_delta_ref = delta / ref;
            }

            sum_delta2 += delta;
            sum_ref2   += ref;
        }
    }

    double L2norm = sqrt(sum_delta2 / sum_ref2);
    printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref));
    bRetVal = (L2norm < 1e-6) ? true : false;
    printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n");

    printf("...shutting down\n");
    sdkStartTimer(&hTimer);
    checkCudaErrors(cufftDestroy(fftPlan));

    checkCudaErrors(cudaFree(d_KernelSpectrum0));
    checkCudaErrors(cudaFree(d_DataSpectrum0));
    checkCudaErrors(cudaFree(d_PaddedKernel));
    checkCudaErrors(cudaFree(d_PaddedData));
    checkCudaErrors(cudaFree(d_Kernel));
    checkCudaErrors(cudaFree(d_Data));

    free(h_ResultGPU);
    free(h_ResultCPU);
    free(h_Kernel);
    free(h_Data);

    return bRetVal;
}
Пример #30
0
 ~FFT2D()
 {
   cufftDestroy(m_plan);
 }