Beispiel #1
0
void
transformStack(const FreeImageStack & rImageStack, FourierImageStack & rFourierStack)
{
    unsigned int nMaxSlices = rImageStack.slices();
    if (nMaxSlices > rFourierStack.slices())
        nMaxSlices = rFourierStack.slices();

    NppiSize oSizeROI = {rImageStack.width(), rImageStack.height()};
            // create plan for the FFT
    cufftHandle oPlanCUFFT;
    NPP_CHECK_CUFFT(cufftPlan2d(&oPlanCUFFT, oSizeROI.width, oSizeROI.height, CUFFT_R2C));
            // allocate 32-bit float intermediate image
            // for this image to work with cuFFT, we must have tightly packed pixels.
    npp::ImageNPP<Npp32f, 1, FrugalAllocator_32f_C1> oSource_32f_C1(oSizeROI.width, oSizeROI.height);
    NPP_DEBUG_ASSERT(oSource_32f_C1.width() * sizeof(Npp32f) == oSource_32f_C1.pitch());
            // allocate 8-bit image 
    npp::ImageNPP_8u_C1 oSource_8u_C1;
    for (unsigned int iSlice = 0; iSlice < nMaxSlices; ++iSlice)
    {
                // load slice
        rImageStack.loadImage(iSlice, oSource_8u_C1);        
                // upconvert 8-bit image to 32-bit float image
        NPP_CHECK_NPP(nppiConvert_8u32f_C1R(oSource_8u_C1.data(),  oSource_8u_C1.pitch(), 
                                            oSource_32f_C1.data(), oSource_32f_C1.pitch(),
                                            oSizeROI));
        NPP_CHECK_CUFFT(cufftExecR2C(oPlanCUFFT, oSource_32f_C1.data(), reinterpret_cast<cufftComplex *>(rFourierStack.data(iSlice))));
    }
}
Beispiel #2
0
GLFluids::GLFluids(QWidget *parent)
    : QGLWidget(parent),
      QGLFunctions()
{
    vbo = 0;

    wWidth = qMax(512, DIM);
    wHeight = qMax(512, DIM);

    hvfield = (float2 *)malloc(sizeof(float2) * DS);
    memset(hvfield, 0, sizeof(float2) * DS);

    // Allocate and initialize device data
    cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(float2)*DIM, DIM);

    cudaMemcpy(dvfield, hvfield, sizeof(float2) * DS,
               cudaMemcpyHostToDevice);
    // Temporary complex velocity field data
    cudaMalloc((void **)&vxfield, sizeof(float2) * PDS);
    cudaMalloc((void **)&vyfield, sizeof(float2) * PDS);

    setup_texture(DIM, DIM);
    bind_texture();

    // Create particle array
    particles = (float2 *)malloc(sizeof(float2) * DS);
    memset(particles, 0, sizeof(float2) * DS);
    initParticles(particles, DIM, DIM);

    // Create CUFFT transform plan configuration
    cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C);
    cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R);

    // TODO: update kernels to use the new unpadded memory layout for perf
    // rather than the old FFTW-compatible layout
    cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING);
    cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING);

    QTimer *timer = new QTimer(this);
    connect(timer, &QTimer::timeout, [&](){
        simulateFluids();
        updateGL();
    });
    timer->start(0);
}
Beispiel #3
0
 void createPlan(unsigned nx, unsigned ny)
 {
   if (nx != m_nx || ny != m_ny)
   {
     m_nx = nx;
     m_ny = ny;
     cufftResult result = cufftPlan2d(&m_plan, m_nx, m_ny, CUFFT_C2C);
     AGILE_ASSERT(result == CUFFT_SUCCESS,
                   StandardException::ExceptionMessage(
                     "Could not create FFT plan"));
   }
 }
Beispiel #4
0
void fft2dGPU(T1* d_data, int nx, int ny, void* stream)
{
	cufftHandle plan;
	cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL);

	if (cufftPlan2d(&plan, nx, ny, CUFFT_C2C)!=CUFFT_SUCCESS) {
		fprintf(stderr, "CUFFT error: Plan creation failed");
	}

	cufftSetStream(plan, (cudaStream_t) stream);
	cufftExecC2C(plan, (cufftComplex*) d_data, (cufftComplex*) d_data, CUFFT_FORWARD);
	cufftDestroy(plan);
}
Beispiel #5
0
/*
 * Class:     jcuda_jcufft_JCufft
 * Method:    cufftPlan2dNative
 * Signature: (Ljcuda/jcufft/JCufftHandle;III)I
 */
JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftPlan2dNative
  (JNIEnv *env, jclass cla, jobject handle, jint nx, jint ny, jint type)
{
    if (handle == NULL)
    {
        ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftPlan2d");
        return JCUFFT_INTERNAL_ERROR;
    }

    Logger::log(LOG_TRACE, "Creating 2D plan for (%d, %d) elements of type %d\n", nx, ny, type);

    cufftHandle plan = env->GetIntField(handle, cufftHandle_plan);
    cufftResult result = cufftPlan2d(&plan, nx, ny, getCufftType(type));
    env->SetIntField(handle, cufftHandle_plan, plan);
    return result;
}
Beispiel #6
0
////////////////////////////////////////////////////////////////////////////////
//! Run test
////////////////////////////////////////////////////////////////////////////////
void runAutoTest(int argc, char** argv)
{
    printf("[%s]\n", sSDKsample);

    // Cuda init
	int dev = cutilChooseCudaDevice(argc, argv);

    cudaDeviceProp deviceProp;
    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Compute capability %d.%d\n", deviceProp.major, deviceProp.minor);
    int version = deviceProp.major*10 + deviceProp.minor;
    g_hasDouble = (version >= 13);
    if (inEmulationMode()) {
        // workaround since SM13 kernel doesn't produce correct output in emulation mode
        g_hasDouble = false;
    }

    // create FFT plan
    CUFFT_SAFE_CALL(cufftPlan2d(&fftPlan, meshW, meshH, CUFFT_C2R) );

    // allocate memory
    fftInputW = (meshW / 2)+1;
    fftInputH = meshH;
    fftInputSize = (fftInputW*fftInputH)*sizeof(float2);

    cutilSafeCall(cudaMalloc((void **)&d_h0, fftInputSize) );
    cutilSafeCall(cudaMalloc((void **)&d_ht, fftInputSize) );
    h_h0 = (float2 *) malloc(fftInputSize);
    generate_h0();
    cutilSafeCall(cudaMemcpy(d_h0, h_h0, fftInputSize, cudaMemcpyHostToDevice) );

    cutilSafeCall(cudaMalloc((void **)&d_slope, meshW*meshH*sizeof(float2)) );

    cutCreateTimer(&timer);
    cutStartTimer(timer);
    prevTime = cutGetTimerValue(timer);

    // Creating the Auto-Validation Code
    g_CheckRender = new CheckBackBuffer(windowH, windowH, 4, false);
    g_CheckRender->setPixelFormat(GL_RGBA);
    g_CheckRender->setExecPath(argv[0]);
    g_CheckRender->EnableQAReadback(true);

    runCudaTest(g_hasDouble);
    cudaThreadExit();
}
Beispiel #7
0
void ifft2dGPU(T1* d_data, int nx, int ny, void* stream)
{
	//printf("Running 2d inverse xform \n");
	cufftHandle plan;

	cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL);

	if (cufftPlan2d(&plan, ny, nx, CUFFT_Z2Z)!=CUFFT_SUCCESS) {
		printf( "CUFFT error: Plan creation failed\n");
	}
	//printf("Built plan \n");
	cufftSetStream(plan, (cudaStream_t) stream);
	if (cufftExecZ2Z(plan, (cufftDoubleComplex*) d_data, (cufftDoubleComplex*) d_data, CUFFT_INVERSE)!=CUFFT_SUCCESS) {
		printf("CUFFT error: Plan execution failed\n");
	};
	cufftDestroy(plan);
}
Beispiel #8
0
void WorkerThread::createInitialFilter()
{
	float* gaussian_data;
	cudaMalloc((void**)&gaussian_data, sizeof(float) * _filter_pixels);
	int2 gaussian_size;
	gaussian_size.x = _filter_size;
	gaussian_size.y = _filter_size;
	int2 gaussian_center;
	gaussian_center.x = _filter_size / 2;
	gaussian_center.y = _filter_size / 2;
	gaussian(gaussian_data, 0.0, _sigma, 1.0, gaussian_center, gaussian_size);
	
	float* harmonic_data;
	cudaMalloc((void**)&harmonic_data, sizeof(float) * _filter_pixels * 2);
	int2 harmonic_size;
	harmonic_size.x = _filter_size;
	harmonic_size.y = _filter_size;
	int2 harmonic_center;
	harmonic_center.x = _filter_size / 2;
	harmonic_center.y = _filter_size / 2;
	harmonic(harmonic_data, 0, _lambda, 0.0, harmonic_center, harmonic_size);
	float* host_harmonic = new float[_filter_size * _filter_size * 2];
	
	cudaMalloc((void**)&_gabor_data, sizeof(float) * _filter_pixels * 2);
	int2 gabor_size;
	gabor_size.x = _filter_size;
	gabor_size.y = _filter_size;
	int2 gabor_center;
	gabor_center.x = _filter_size / 2;
	gabor_center.y = _filter_size / 2;
	multiplyRealComplex(gaussian_data, harmonic_data, _gabor_data, _filter_size * _filter_size);
	float* host_gabor_data = new float[_filter_pixels * 2];
	cudaMemcpy(host_gabor_data,
		_gabor_data,
		sizeof(float) * _filter_pixels * 2,
		cudaMemcpyDeviceToHost);

	//pad the filter
	{
		float* data = host_gabor_data;
		float* target = _filter_image;
		memset(target, 0, sizeof(float) * _padded_pixels * 2);
		int padded_stride = 2 * _padded_size;
		int target_stride = 2 * _target_size;
		for (int i = 0; i < _target_size; ++i)
		{
			memcpy(target, data, sizeof(float) * target_stride);
			target += padded_stride;
			data += target_stride;
		}
	}

	// Copy gabor data into member for texture creation
	_filter_image_mutex.lock();
	memcpy(_host_gabor_data, host_gabor_data, sizeof(float) * _filter_pixels * 2);
	_filter_image_mutex.unlock();
	
	cudaFree(_gabor_data);
	
	cudaMalloc((void**)&_gabor_data, sizeof(float) * _padded_pixels * 2);
	cudaMalloc((void**)&_gpu_image_0, sizeof(float) * _padded_pixels * 2);
	cudaMalloc((void**)&_gpu_image_1, sizeof(float) * _padded_pixels * 2);
	cudaMemcpy(_gabor_data,
		_filter_image,
		sizeof(float) * _padded_pixels * 2,
		cudaMemcpyHostToDevice);

	cufftPlan2d(&_fft_plan, _padded_size, _padded_size, CUFFT_C2C);
	cufftExecC2C(_fft_plan,
		(cufftComplex*)(_gabor_data),
		(cufftComplex*)(_gabor_data),
		CUFFT_FORWARD);
	cudaMemcpy(_filter_image,
		_gabor_data,
		sizeof(float) * _padded_pixels * 2,
		cudaMemcpyDeviceToHost);

	emit newFilterImage();
}
Beispiel #9
0
////////////////////////////////////////////////////////////////////////////////
//! Run test
////////////////////////////////////////////////////////////////////////////////
void runGraphicsTest(int argc, char** argv)
{
    printf("[%s] ", sSDKsample);
    if (g_bOpenGLQA)   printf("[OpenGL Readback Comparisons] ");
    printf("\n");

    if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device") ) { 
		printf("[%s]\n", argv[0]);
		printf("   Does not explicitly support -device=n in OpenGL mode\n");
		printf("   To use -device=n, the sample must be running w/o OpenGL\n\n");
		printf(" > %s -device=n -qatest\n", argv[0]);
		printf("exiting...\n");
		exit(0);
    }

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    if(CUTFalse == initGL( &argc, argv )) {
        cudaThreadExit();
        return;
    }

    cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );

    // create FFT plan
    CUFFT_SAFE_CALL(cufftPlan2d(&fftPlan, meshW, meshH, CUFFT_C2R) );

    // allocate memory
    fftInputW = (meshW / 2)+1;
    fftInputH = meshH;
    fftInputSize = (fftInputW*fftInputH)*sizeof(float2);

    cutilSafeCall(cudaMalloc((void **)&d_h0, fftInputSize) );
    cutilSafeCall(cudaMalloc((void **)&d_ht, fftInputSize) );
    h_h0 = (float2 *) malloc(fftInputSize);
    generate_h0();
    cutilSafeCall(cudaMemcpy(d_h0, h_h0, fftInputSize, cudaMemcpyHostToDevice) );

    cutilSafeCall(cudaMalloc((void **)&d_slope, meshW*meshH*sizeof(float2)) );

    cutCreateTimer(&timer);
    cutStartTimer(timer);
    prevTime = cutGetTimerValue(timer);

    // create vertex buffers and register with CUDA
    createVBO(&heightVertexBuffer, meshW*meshH*sizeof(float));
    // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(heightVertexBuffer));
	cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_heightVB_resource, heightVertexBuffer, cudaGraphicsMapFlagsWriteDiscard));

    createVBO(&slopeVertexBuffer, meshW*meshH*sizeof(float2));
    // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(slopeVertexBuffer));
	cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_slopeVB_resource, slopeVertexBuffer, cudaGraphicsMapFlagsWriteDiscard));

    // create vertex and index buffer for mesh
    createMeshPositionVBO(&posVertexBuffer, meshW, meshH);
    createMeshIndexBuffer(&indexBuffer, meshW, meshH);

    // Creating the Auto-Validation Code
    if (g_bOpenGLQA) {
        g_CheckRender = new CheckBackBuffer(windowH, windowH, 4);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);
    }

    runCuda();

    // register callbacks
    glutDisplayFunc(display);
    glutKeyboardFunc(keyboard);
    glutMouseFunc(mouse);
    glutMotionFunc(motion);
    glutReshapeFunc(reshape);
    glutIdleFunc(idle);

    // start rendering mainloop
    glutMainLoop();
    cudaThreadExit();
}
Beispiel #10
0
int main(int argc, char **argv)
{
    int devID;
    cudaDeviceProp deviceProps;
    printf("%s Starting...\n\n", sSDKname);
    printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname);

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    if (false == initGL(&argc, argv))
    {
        exit(EXIT_SUCCESS);
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
#ifndef OPTIMUS
    devID = findCudaGLDevice(argc, (const char **)argv);
#else
    devID = gpuGetMaxGflopsDeviceId();
#endif

    // get number of SMs on this GPU
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors\n",
           deviceProps.name, deviceProps.multiProcessorCount);

    // automated build testing harness
    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
    {
        getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

    // Allocate and initialize host data
    GLint bsize;

    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    hvfield = (cData *)malloc(sizeof(cData) * DS);
    memset(hvfield, 0, sizeof(cData) * DS);

    // Allocate and initialize device data
    cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM);

    cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS,
               cudaMemcpyHostToDevice);
    // Temporary complex velocity field data
    cudaMalloc((void **)&vxfield, sizeof(cData) * PDS);
    cudaMalloc((void **)&vyfield, sizeof(cData) * PDS);

    setupTexture(DIM, DIM);
    bindTexture();

    // Create particle array in host memory
    particles = (cData *)malloc(sizeof(cData) * DS);
    memset(particles, 0, sizeof(cData) * DS);

#ifdef BROADCAST
	int step = 1;

	// Broadcasted visualization stepping.
	if (argc > 3)
		step = atoi(argv[3]);

	// Create additional space to store particle packets
	// for broadcasting.
	wstep = step; hstep = step;
	int npackets = sizeof(float) * (DIM / wstep) * (DIM / hstep) / UdpBroadcastServer::PacketSize;
	if (sizeof(float) * (DIM / wstep) * (DIM / hstep) % UdpBroadcastServer::PacketSize)
		npackets++;
	packets = (char*)malloc(npackets *
		(UdpBroadcastServer::PacketSize + sizeof(unsigned int)));
#endif

    initParticles(particles, DIM, DIM);

#if defined(OPTIMUS) || defined(BROADCAST)
    // Create particle array in device memory
    cudaMalloc((void **)&particles_gpu, sizeof(cData) * DS);
    cudaMemcpy(particles_gpu, particles, sizeof(cData) * DS, cudaMemcpyHostToDevice);
#endif

    // Create CUFFT transform plan configuration
    cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C);
    cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R);
    // TODO: update kernels to use the new unpadded memory layout for perf
    // rather than the old FFTW-compatible layout
    cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING);
    cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING);

    glGenBuffersARB(1, &vbo);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
    glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS,
                    particles, GL_DYNAMIC_DRAW_ARB);

    glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize);

    if (bsize != (sizeof(cData) * DS))
        goto EXTERR;

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);

#ifndef OPTIMUS
    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone));
    getLastCudaError("cudaGraphicsGLRegisterBuffer failed");
#endif

    if (ref_file)
    {
        autoTest(argv);
        cleanup();

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors);
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);

    }
    else
    {
#ifdef BROADCAST
		const char *sv_addr = "127.0.0:9097";
		const char *bc_addr = "127.255.255.2:9097";

		// Server address
		if (argc > 2)
			sv_addr = argv[2];

		// Broadcast address
		if (argc > 1)
			bc_addr = argv[1];

		server.reset(new UdpBroadcastServer(sv_addr, bc_addr));

		// Listen to clients' feedbacks in a separate thread.
		{
			pthread_t tid;
			pthread_create(&tid, NULL, &feedback_listener, &step);
		}

		// Broadcast the particles state in a separate thread.
		{
			pthread_t tid;
			pthread_create(&tid, NULL, &broadcaster, &step);
		}
#endif
#if defined (__APPLE__) || defined(MACOSX)
        atexit(cleanup);
#else
        glutCloseFunc(cleanup);
#endif
        glutMainLoop();
    }

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (!ref_file)
    {
        exit(EXIT_SUCCESS);
    }

    return 0;

EXTERR:
    printf("Failed to initialize GL extensions.\n");

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    exit(EXIT_FAILURE);
}
Beispiel #11
0
int main(int argc, char *argv[]) {
	int i;
	struct timeval begin, end;
	int size;
	size_t bytes;
	int n = 0, m = 0;
	STARPUFFT(plan) plan;
#ifdef STARPU_HAVE_FFTW
	_FFTW(plan) fftw_plan;
#endif
#ifdef STARPU_USE_CUDA
	cufftHandle cuda_plan;
	cudaError_t cures;
#endif
	double timing;

	if (argc < 2 || argc > 3) {
		fprintf(stderr,"need one or two size of vector\n");
		exit(EXIT_FAILURE);
	}

	starpu_init(NULL);

	if (argc == 2) {
		n = atoi(argv[1]);

		/* 1D */
		size = n;
	} else if (argc == 3) {
		n = atoi(argv[1]);
		m = atoi(argv[2]);

		/* 2D */
		size = n * m;
	} else {
		assert(0);
	}

	bytes = size * sizeof(STARPUFFT(complex));

	STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in));
	starpu_srand48(0);
	for (i = 0; i < size; i++)
		in[i] = starpu_drand48() + I * starpu_drand48();

	STARPUFFT(complex) *out = STARPUFFT(malloc)(size * sizeof(*out));

#ifdef STARPU_HAVE_FFTW
	STARPUFFT(complex) *out_fftw = STARPUFFT(malloc)(size * sizeof(*out_fftw));
#endif

#ifdef STARPU_USE_CUDA
	STARPUFFT(complex) *out_cuda = malloc(size * sizeof(*out_cuda));
#endif

	if (argc == 2) {
		plan = STARPUFFT(plan_dft_1d)(n, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS)
			printf("erf\n");
#endif

	} else if (argc == 3) {
		plan = STARPUFFT(plan_dft_2d)(n, m, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS);
#endif
	} else {
		assert(0);
	}

#ifdef STARPU_HAVE_FFTW
	gettimeofday(&begin, NULL);
	_FFTW(execute)(fftw_plan);
	gettimeofday(&end, NULL);
	_FFTW(destroy_plan)(fftw_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif
#ifdef STARPU_USE_CUDA
	gettimeofday(&begin, NULL);
	if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS)
		printf("erf2\n");
	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
		STARPU_CUDA_REPORT_ERROR(cures);
	gettimeofday(&end, NULL);
	cufftDestroy(cuda_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif

	STARPUFFT(execute)(plan, in, out);

	STARPUFFT(showstats)(stdout);
	STARPUFFT(destroy_plan)(plan);

	printf("\n");
#if 0
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(in[i]), creal(in[i]));
	printf("\n\n");
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out[i]), creal(out[i]));
	printf("\n\n");
#ifdef STARPU_HAVE_FFTW
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out_fftw[i]), creal(out_fftw[i]));
	printf("\n\n");
#endif
#endif

#ifdef STARPU_HAVE_FFTW
{
	double max = 0., tot = 0., norm = 0., normdiff = 0.;
	for (i = 0; i < size; i++) {
		double diff = cabs(out[i]-out_fftw[i]);
		double diff2 = diff * diff;
		double size = cabs(out_fftw[i]);
		double size2 = size * size;
		if (diff > max)
			max = diff;
		tot += diff;
		normdiff += diff2;
		norm += size2;
	}
	fprintf(stderr, "\nmaximum difference %g\n", max);
	fprintf(stderr, "average difference %g\n", tot / size);
	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
	double relmaxdiff = max / sqrt(norm);
	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
	double relavgdiff = (tot / size) / sqrt(norm);
	fprintf(stderr, "relative average difference %g\n", relavgdiff);
	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
		return EXIT_FAILURE;
	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
		return EXIT_FAILURE;
}
#endif

#ifdef STARPU_USE_CUDA
{
	double max = 0., tot = 0., norm = 0., normdiff = 0.;
	for (i = 0; i < size; i++) {
		double diff = cabs(out_cuda[i]-out_fftw[i]);
		double diff2 = diff * diff;
		double size = cabs(out_fftw[i]);
		double size2 = size * size;
		if (diff > max)
			max = diff;
		tot += diff;
		normdiff += diff2;
		norm += size2;
	}
	fprintf(stderr, "\nmaximum difference %g\n", max);
	fprintf(stderr, "average difference %g\n", tot / size);
	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
	double relmaxdiff = max / sqrt(norm);
	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
	double relavgdiff = (tot / size) / sqrt(norm);
	fprintf(stderr, "relative average difference %g\n", relavgdiff);
	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
		return EXIT_FAILURE;
	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
		return EXIT_FAILURE;
}
#endif

	STARPUFFT(free)(in);
	STARPUFFT(free)(out);

#ifdef STARPU_HAVE_FFTW
	STARPUFFT(free)(out_fftw);
#endif

#ifdef STARPU_USE_CUDA
	free(out_cuda);
#endif

	starpu_shutdown();

	return EXIT_SUCCESS;
}
Beispiel #12
0
bool test0(void)
{
    float
    *h_Data,
    *h_Kernel,
    *h_ResultCPU,
    *h_ResultGPU;

    float
    *d_Data,
    *d_PaddedData,
    *d_Kernel,
    *d_PaddedKernel;

    fComplex
    *d_DataSpectrum,
    *d_KernelSpectrum;

    cufftHandle
    fftPlanFwd,
    fftPlanInv;

    bool bRetVal;
    StopWatchInterface *hTimer = NULL;
    sdkCreateTimer(&hTimer);

    printf("Testing built-in R2C / C2R FFT-based convolution\n");
    const int kernelH = 3;
    const int kernelW = 3;
    const int kernelY = 1;
    const int kernelX = 1;
    const int   dataH = 10;
    const int   dataW = 10;
    const int    fftH = snapTransformSize(dataH + kernelH - 1);
    const int    fftW = snapTransformSize(dataW + kernelW - 1);

    printf("...allocating memory\n");
    h_Data      = (float *)malloc(dataH   * dataW * sizeof(float));
    h_Kernel    = (float *)malloc(kernelH * kernelW * sizeof(float));
    h_ResultCPU = (float *)malloc(dataH   * dataW * sizeof(float));
    h_ResultGPU = (float *)malloc(fftH    * fftW * sizeof(float));

    checkCudaErrors(cudaMalloc((void **)&d_Data,   dataH   * dataW   * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float)));

    checkCudaErrors(cudaMalloc((void **)&d_PaddedData,   fftH * fftW * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float)));

    checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum,   fftH * (fftW / 2 + 1) * sizeof(fComplex)));
    checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex)));

    printf("...generating random input data\n");
    srand(2010);

    for (int i = 0; i < dataH * dataW; i++)
    {
        //h_Data[i] = getRand();
		h_Data[i] = i + 1;
    }

    for (int i = 0; i < kernelH * kernelW; i++)
    {
        //h_Kernel[i] = getRand();
		h_Kernel[i] = i + 1;
    }

	FILE* fp2 = fopen("input_kernel.txt", "w+");
	FILE* fp3 = fopen("input_data.txt", "w+");
	for (int i = 0; i < dataH * dataW; i++)
		fprintf(fp3, "%f\n", h_Data[i]);
	for (int i = 0; i < kernelH * kernelW; i++)
		fprintf(fp2, "%f\n", h_Kernel[i]);
	fclose(fp2);
	fclose(fp3);

    printf("...creating R2C & C2R FFT plans for %i x %i\n", fftH, fftW);
    checkCudaErrors(cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C));
    checkCudaErrors(cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R));

    printf("...uploading to GPU and padding convolution kernel and input data\n");
    checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_Data,   h_Data,   dataH   * dataW *   sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float)));
    checkCudaErrors(cudaMemset(d_PaddedData,   0, fftH * fftW * sizeof(float)));

    padKernel(
        d_PaddedKernel,
        d_Kernel,
        fftH,
        fftW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    padDataClampToBorder(
        d_PaddedData,
        d_Data,
        fftH,
        fftW,
        dataH,
        dataW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    //Not including kernel transformation into time measurement,
    //since convolution kernel is not changed very frequently
    printf("...transforming convolution kernel\n");
    checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum));

    printf("...running GPU FFT convolution: ");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData, (cufftComplex *)d_DataSpectrum));
    modulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftH, fftW, 1);
    checkCudaErrors(cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, (cufftReal *)d_PaddedData));

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    double gpuTime = sdkGetTimerValue(&hTimer);
    printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime);

    printf("...reading back GPU convolution results\n");
    checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost));

    printf("...running reference CPU convolution\n");
    convolutionClampToBorderCPU(
        h_ResultCPU,
        h_Data,
        h_Kernel,
        dataH,
        dataW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    printf("...comparing the results: ");
    double sum_delta2 = 0;
    double sum_ref2   = 0;
    double max_delta_ref = 0;

    for (int y = 0; y < dataH; y++)
        for (int x = 0; x < dataW; x++)
        {
            double  rCPU = (double)h_ResultCPU[y * dataW + x];
            double  rGPU = (double)h_ResultGPU[y * fftW  + x];
            double delta = (rCPU - rGPU) * (rCPU - rGPU);
            double   ref = rCPU * rCPU + rCPU * rCPU;

            if ((delta / ref) > max_delta_ref)
            {
                max_delta_ref = delta / ref;
            }

            sum_delta2 += delta;
            sum_ref2   += ref;
        }

    double L2norm = sqrt(sum_delta2 / sum_ref2);
    printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref));
    bRetVal = (L2norm < 1e-6) ? true : false;
    printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n");

    printf("...shutting down\n");
    sdkStartTimer(&hTimer);

    checkCudaErrors(cufftDestroy(fftPlanInv));
    checkCudaErrors(cufftDestroy(fftPlanFwd));

    checkCudaErrors(cudaFree(d_DataSpectrum));
    checkCudaErrors(cudaFree(d_KernelSpectrum));
    checkCudaErrors(cudaFree(d_PaddedData));
    checkCudaErrors(cudaFree(d_PaddedKernel));
    checkCudaErrors(cudaFree(d_Data));
    checkCudaErrors(cudaFree(d_Kernel));

	FILE* fp = fopen("result_gpu.txt", "w+");
	FILE* fp1 = fopen("result_cpu.txt", "w+");
	for (int i = 0; i < dataH * dataW; i++)
	{
		fprintf(fp, "%f\n", h_ResultGPU[i]);
		fprintf(fp1, "%f\n", h_ResultCPU[i]);
	}
	fclose(fp);
	fclose(fp1);

    free(h_ResultGPU);
    free(h_ResultCPU);
    free(h_Data);
    free(h_Kernel);

    return bRetVal;
}
Beispiel #13
0
cufftResult WINAPI wine_cufftPlan2d(cufftHandle *plan, int nx, int ny, cufftType type){
	WINE_TRACE("\n");
	return cufftPlan2d( plan, nx, ny, type );
}
Beispiel #14
0
oskar_FFT* oskar_fft_create(int precision, int location, int num_dim,
        int dim_size, int batch_size_1d, int* status)
{
    int i;
    oskar_FFT* h = (oskar_FFT*) calloc(1, sizeof(oskar_FFT));
#ifndef OSKAR_HAVE_CUDA
    if (location == OSKAR_GPU) location = OSKAR_CPU;
#endif
#ifndef OSKAR_HAVE_OPENCL
    if (location & OSKAR_CL) location = OSKAR_CPU;
#endif
    h->precision = precision;
    h->location = location;
    h->num_dim = num_dim;
    h->dim_size = dim_size;
    h->ensure_consistent_norm = 1;
    h->num_cells_total = (size_t) dim_size;
    for (i = 1; i < num_dim; ++i) h->num_cells_total *= (size_t) dim_size;
    if (location == OSKAR_CPU)
    {
        int len = 4 * dim_size +
                2 * (int)(log((double)dim_size) / log(2.0)) + 8;
        h->fftpack_wsave = oskar_mem_create(precision, location, len, status);
        if (num_dim == 1)
        {
            (void) batch_size_1d;
            *status = OSKAR_ERR_FUNCTION_NOT_AVAILABLE;
        }
        else if (num_dim == 2)
        {
            if (precision == OSKAR_DOUBLE)
                oskar_fftpack_cfft2i(dim_size, dim_size,
                        oskar_mem_double(h->fftpack_wsave, status));
            else
                oskar_fftpack_cfft2i_f(dim_size, dim_size,
                        oskar_mem_float(h->fftpack_wsave, status));
        }
        else
            *status = OSKAR_ERR_INVALID_ARGUMENT;
        h->fftpack_work = oskar_mem_create(precision, location,
                2 * h->num_cells_total, status);
    }
    else if (location == OSKAR_GPU)
    {
#ifdef OSKAR_HAVE_CUDA
        if (num_dim == 1)
            cufftPlan1d(&h->cufft_plan, dim_size,
                    ((precision == OSKAR_DOUBLE) ? CUFFT_Z2Z : CUFFT_C2C),
                    batch_size_1d);
        else if (num_dim == 2)
            cufftPlan2d(&h->cufft_plan, dim_size, dim_size,
                    ((precision == OSKAR_DOUBLE) ? CUFFT_Z2Z : CUFFT_C2C));
        else
            *status = OSKAR_ERR_INVALID_ARGUMENT;
#endif
    }
    else if (location & OSKAR_CL)
    {
#ifdef OSKAR_HAVE_OPENCL
        *status = OSKAR_ERR_FUNCTION_NOT_AVAILABLE;
#endif
    }
    else
        *status = OSKAR_ERR_BAD_LOCATION;
    return h;
}
Beispiel #15
0
 static void setfftpl(unsigned num_rows, unsigned num_columns, cufftHandle* fftplan)
 {
     cufftPlan2d(fftplan, num_rows, num_columns, CUFFT_Z2Z);
 }
Beispiel #16
0
void WorkerThread::createNewFilter()
{
	// Free GPU memory from current filter and CUFFT
	cudaFree(_gabor_data);
	cudaFree(_gpu_image_0);
	cudaFree(_gpu_image_1);
	cufftDestroy(_fft_plan);

	float* gaussian_data;
	cudaMalloc((void**)&gaussian_data, sizeof(float) * _filter_pixels);
	int2 gaussian_size;
	gaussian_size.x = _filter_size;
	gaussian_size.y = _filter_size;
	int2 gaussian_center;
	gaussian_center.x = _filter_size / 2;
	gaussian_center.y = _filter_size / 2;
	gaussian(gaussian_data, _new_theta, _new_sigma, 1.0, gaussian_center, gaussian_size);
	
	float* harmonic_data;
	cudaMalloc((void**)&harmonic_data, sizeof(float) * _filter_pixels * 2);
	int2 harmonic_size;
	harmonic_size.x = _filter_size;
	harmonic_size.y = _filter_size;
	int2 harmonic_center;
	harmonic_center.x = _filter_size / 2;
	harmonic_center.y = _filter_size / 2;
	harmonic(harmonic_data, _new_theta, _new_lambda, _new_psi, harmonic_center, harmonic_size);
	float* host_harmonic = new float[_filter_size * _filter_size * 2];
	
	cudaMalloc((void**)&_gabor_data, sizeof(float) * _filter_pixels * 2);
	int2 gabor_size;
	gabor_size.x = _filter_size;
	gabor_size.y = _filter_size;
	int2 gabor_center;
	gabor_center.x = _filter_size / 2;
	gabor_center.y = _filter_size / 2;
	multiplyRealComplex(gaussian_data, harmonic_data, _gabor_data, _filter_size * _filter_size);
	float* host_gabor_data = new float[_filter_pixels * 2];
	cudaMemcpy(host_gabor_data,
		_gabor_data,
		sizeof(float) * _filter_pixels * 2,
		cudaMemcpyDeviceToHost);

	//pad the filter
	{
		float* data = host_gabor_data;
		float* target = _filter_image;
		memset(target, 0, sizeof(float) * _padded_pixels * 2);
		int padded_stride = 2 * _padded_size;
		int target_stride = 2 * _target_size;
		for (int i = 0; i < _target_size; ++i)
		{
			memcpy(target, data, sizeof(float) * target_stride);
			target += padded_stride;
			data += target_stride;
		}
	}

	// Copy gabor data into member for texture creation
	_filter_image_mutex.lock();
	memcpy(_host_gabor_data, host_gabor_data, sizeof(float) * _filter_pixels * 2);
	_filter_image_mutex.unlock();
	
	cudaFree(_gabor_data);
	
	cudaMalloc((void**)&_gabor_data, sizeof(float) * _padded_pixels * 2);
	cudaMalloc((void**)&_gpu_image_0, sizeof(float) * _padded_pixels * 2);
	cudaMalloc((void**)&_gpu_image_1, sizeof(float) * _padded_pixels * 2);
	cudaMemcpy(_gabor_data,
		_filter_image,
		sizeof(float) * _padded_pixels * 2,
		cudaMemcpyHostToDevice);

	cufftPlan2d(&_fft_plan, _padded_size, _padded_size, CUFFT_C2C);
	cufftExecC2C(_fft_plan,
		(cufftComplex*)(_gabor_data),
		(cufftComplex*)(_gabor_data),
		CUFFT_FORWARD);
	cudaMemcpy(_filter_image,
		_gabor_data,
		sizeof(float) * _padded_pixels * 2,
		cudaMemcpyDeviceToHost);

	// Free temporary GPU memory used for creation of filter
	cudaFree(gaussian_data);
	cudaFree(harmonic_data);

	delete host_harmonic;
	delete host_gabor_data;

	_should_create_new_filter = false;

	emit newFilterImage();
}
Beispiel #17
0
int main(int argc, char **argv)
{
    int devID;
    cudaDeviceProp deviceProps;
    printf("%s Starting...\n\n", sSDKname);
    printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname);

    // First initialize OpenGL context, so we can properly set the GL for CUDA.
    // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
    if (false == initGL(&argc, argv))
    {
        exit(EXIT_SUCCESS);
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    devID = findCudaGLDevice(argc, (const char **)argv);

    // get number of SMs on this GPU
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors\n",
           deviceProps.name, deviceProps.multiProcessorCount);

    // automated build testing harness
    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
    {
        getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

    // Allocate and initialize host data
    GLint bsize;

    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    hvfield = (cData *)malloc(sizeof(cData) * DS);
    memset(hvfield, 0, sizeof(cData) * DS);

    // Allocate and initialize device data
    cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM);

    cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS,
               cudaMemcpyHostToDevice);
    // Temporary complex velocity field data
    cudaMalloc((void **)&vxfield, sizeof(cData) * PDS);
    cudaMalloc((void **)&vyfield, sizeof(cData) * PDS);

    setupTexture(DIM, DIM);
    bindTexture();

    // Create particle array
    particles = (cData *)malloc(sizeof(cData) * DS);
    memset(particles, 0, sizeof(cData) * DS);

    initParticles(particles, DIM, DIM);

    // Create CUFFT transform plan configuration
    cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C);
    cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R);
    // TODO: update kernels to use the new unpadded memory layout for perf
    // rather than the old FFTW-compatible layout
    cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING);
    cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING);

    glGenBuffersARB(1, &vbo);
    glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo);
    glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS,
                    particles, GL_DYNAMIC_DRAW_ARB);

    glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize);

    if (bsize != (sizeof(cData) * DS))
        goto EXTERR;

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);

    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone));
    getLastCudaError("cudaGraphicsGLRegisterBuffer failed");

    if (ref_file)
    {
        autoTest(argv);
        cleanup();
        cudaDeviceReset();
        printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors);
        exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);

    }
    else
    {
        atexit(cleanup);
        glutMainLoop();
    }

    cudaDeviceReset();

    if (!ref_file)
    {
        exit(EXIT_SUCCESS);
    }

    return 0;

EXTERR:
    printf("Failed to initialize GL extensions.\n");

    cudaDeviceReset();
    exit(EXIT_FAILURE);
}
Beispiel #18
0
bool test2(void)
{
    float
    *h_Data,
    *h_Kernel,
    *h_ResultCPU,
    *h_ResultGPU;

    float
    *d_Data,
    *d_Kernel,
    *d_PaddedData,
    *d_PaddedKernel;

    fComplex
    *d_DataSpectrum0,
    *d_KernelSpectrum0;

    cufftHandle
    fftPlan;

    bool bRetVal;
    StopWatchInterface *hTimer = NULL;
    sdkCreateTimer(&hTimer);

    printf("Testing updated custom R2C / C2R FFT-based convolution\n");
    const int kernelH = 7;
    const int kernelW = 6;
    const int kernelY = 3;
    const int kernelX = 4;
    const int dataH = 2000;
    const int dataW = 2000;
    const int fftH = snapTransformSize(dataH + kernelH - 1);
    const int fftW = snapTransformSize(dataW + kernelW - 1);

    printf("...allocating memory\n");
    h_Data      = (float *)malloc(dataH   * dataW * sizeof(float));
    h_Kernel    = (float *)malloc(kernelH * kernelW * sizeof(float));
    h_ResultCPU = (float *)malloc(dataH   * dataW * sizeof(float));
    h_ResultGPU = (float *)malloc(fftH    * fftW * sizeof(float));

    checkCudaErrors(cudaMalloc((void **)&d_Data,   dataH   * dataW   * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float)));

    checkCudaErrors(cudaMalloc((void **)&d_PaddedData,   fftH * fftW * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float)));

    checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum0,   fftH * (fftW / 2) * sizeof(fComplex)));
    checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum0, fftH * (fftW / 2) * sizeof(fComplex)));

    printf("...generating random input data\n");
    srand(2010);

    for (int i = 0; i < dataH * dataW; i++)
    {
        h_Data[i] = getRand();
    }

    for (int i = 0; i < kernelH * kernelW; i++)
    {
        h_Kernel[i] = getRand();
    }

    printf("...creating C2C FFT plan for %i x %i\n", fftH, fftW / 2);
    checkCudaErrors(cufftPlan2d(&fftPlan, fftH, fftW / 2, CUFFT_C2C));

    printf("...uploading to GPU and padding convolution kernel and input data\n");
    checkCudaErrors(cudaMemcpy(d_Data,   h_Data,   dataH   * dataW *   sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemset(d_PaddedData,   0, fftH * fftW * sizeof(float)));
    checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float)));

    padDataClampToBorder(
        d_PaddedData,
        d_Data,
        fftH,
        fftW,
        dataH,
        dataW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    padKernel(
        d_PaddedKernel,
        d_Kernel,
        fftH,
        fftW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    //CUFFT_INVERSE works just as well...
    const int FFT_DIR = CUFFT_FORWARD;

    //Not including kernel transformation into time measurement,
    //since convolution kernel is not changed very frequently
    printf("...transforming convolution kernel\n");
    checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum0, FFT_DIR));

    printf("...running GPU FFT convolution: ");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedData, (cufftComplex *)d_DataSpectrum0, FFT_DIR));
    spProcess2D(d_DataSpectrum0, d_DataSpectrum0, d_KernelSpectrum0, fftH, fftW / 2, FFT_DIR);
    checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_DataSpectrum0, (cufftComplex *)d_PaddedData, -FFT_DIR));

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    double gpuTime = sdkGetTimerValue(&hTimer);
    printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime);

    printf("...reading back GPU FFT results\n");
    checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost));

    printf("...running reference CPU convolution\n");
    convolutionClampToBorderCPU(
        h_ResultCPU,
        h_Data,
        h_Kernel,
        dataH,
        dataW,
        kernelH,
        kernelW,
        kernelY,
        kernelX
    );

    printf("...comparing the results: ");
    double sum_delta2 = 0;
    double sum_ref2   = 0;
    double max_delta_ref = 0;

    for (int y = 0; y < dataH; y++)
    {
        for (int x = 0; x < dataW; x++)
        {
            double  rCPU = (double)h_ResultCPU[y * dataW + x];
            double  rGPU = (double)h_ResultGPU[y * fftW  + x];
            double delta = (rCPU - rGPU) * (rCPU - rGPU);
            double   ref = rCPU * rCPU + rCPU * rCPU;

            if ((delta / ref) > max_delta_ref)
            {
                max_delta_ref = delta / ref;
            }

            sum_delta2 += delta;
            sum_ref2   += ref;
        }
    }

    double L2norm = sqrt(sum_delta2 / sum_ref2);
    printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref));
    bRetVal = (L2norm < 1e-6) ? true : false;
    printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n");

    printf("...shutting down\n");
    sdkStartTimer(&hTimer);
    checkCudaErrors(cufftDestroy(fftPlan));

    checkCudaErrors(cudaFree(d_KernelSpectrum0));
    checkCudaErrors(cudaFree(d_DataSpectrum0));
    checkCudaErrors(cudaFree(d_PaddedKernel));
    checkCudaErrors(cudaFree(d_PaddedData));
    checkCudaErrors(cudaFree(d_Kernel));
    checkCudaErrors(cudaFree(d_Data));

    free(h_ResultGPU);
    free(h_ResultCPU);
    free(h_Kernel);
    free(h_Data);

    return bRetVal;
}