void transformStack(const FreeImageStack & rImageStack, FourierImageStack & rFourierStack) { unsigned int nMaxSlices = rImageStack.slices(); if (nMaxSlices > rFourierStack.slices()) nMaxSlices = rFourierStack.slices(); NppiSize oSizeROI = {rImageStack.width(), rImageStack.height()}; // create plan for the FFT cufftHandle oPlanCUFFT; NPP_CHECK_CUFFT(cufftPlan2d(&oPlanCUFFT, oSizeROI.width, oSizeROI.height, CUFFT_R2C)); // allocate 32-bit float intermediate image // for this image to work with cuFFT, we must have tightly packed pixels. npp::ImageNPP<Npp32f, 1, FrugalAllocator_32f_C1> oSource_32f_C1(oSizeROI.width, oSizeROI.height); NPP_DEBUG_ASSERT(oSource_32f_C1.width() * sizeof(Npp32f) == oSource_32f_C1.pitch()); // allocate 8-bit image npp::ImageNPP_8u_C1 oSource_8u_C1; for (unsigned int iSlice = 0; iSlice < nMaxSlices; ++iSlice) { // load slice rImageStack.loadImage(iSlice, oSource_8u_C1); // upconvert 8-bit image to 32-bit float image NPP_CHECK_NPP(nppiConvert_8u32f_C1R(oSource_8u_C1.data(), oSource_8u_C1.pitch(), oSource_32f_C1.data(), oSource_32f_C1.pitch(), oSizeROI)); NPP_CHECK_CUFFT(cufftExecR2C(oPlanCUFFT, oSource_32f_C1.data(), reinterpret_cast<cufftComplex *>(rFourierStack.data(iSlice)))); } }
GLFluids::GLFluids(QWidget *parent) : QGLWidget(parent), QGLFunctions() { vbo = 0; wWidth = qMax(512, DIM); wHeight = qMax(512, DIM); hvfield = (float2 *)malloc(sizeof(float2) * DS); memset(hvfield, 0, sizeof(float2) * DS); // Allocate and initialize device data cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(float2)*DIM, DIM); cudaMemcpy(dvfield, hvfield, sizeof(float2) * DS, cudaMemcpyHostToDevice); // Temporary complex velocity field data cudaMalloc((void **)&vxfield, sizeof(float2) * PDS); cudaMalloc((void **)&vyfield, sizeof(float2) * PDS); setup_texture(DIM, DIM); bind_texture(); // Create particle array particles = (float2 *)malloc(sizeof(float2) * DS); memset(particles, 0, sizeof(float2) * DS); initParticles(particles, DIM, DIM); // Create CUFFT transform plan configuration cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C); cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R); // TODO: update kernels to use the new unpadded memory layout for perf // rather than the old FFTW-compatible layout cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING); cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING); QTimer *timer = new QTimer(this); connect(timer, &QTimer::timeout, [&](){ simulateFluids(); updateGL(); }); timer->start(0); }
void createPlan(unsigned nx, unsigned ny) { if (nx != m_nx || ny != m_ny) { m_nx = nx; m_ny = ny; cufftResult result = cufftPlan2d(&m_plan, m_nx, m_ny, CUFFT_C2C); AGILE_ASSERT(result == CUFFT_SUCCESS, StandardException::ExceptionMessage( "Could not create FFT plan")); } }
void fft2dGPU(T1* d_data, int nx, int ny, void* stream) { cufftHandle plan; cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL); if (cufftPlan2d(&plan, nx, ny, CUFFT_C2C)!=CUFFT_SUCCESS) { fprintf(stderr, "CUFFT error: Plan creation failed"); } cufftSetStream(plan, (cudaStream_t) stream); cufftExecC2C(plan, (cufftComplex*) d_data, (cufftComplex*) d_data, CUFFT_FORWARD); cufftDestroy(plan); }
/* * Class: jcuda_jcufft_JCufft * Method: cufftPlan2dNative * Signature: (Ljcuda/jcufft/JCufftHandle;III)I */ JNIEXPORT jint JNICALL Java_jcuda_jcufft_JCufft_cufftPlan2dNative (JNIEnv *env, jclass cla, jobject handle, jint nx, jint ny, jint type) { if (handle == NULL) { ThrowByName(env, "java/lang/NullPointerException", "Parameter 'handle' is null for cufftPlan2d"); return JCUFFT_INTERNAL_ERROR; } Logger::log(LOG_TRACE, "Creating 2D plan for (%d, %d) elements of type %d\n", nx, ny, type); cufftHandle plan = env->GetIntField(handle, cufftHandle_plan); cufftResult result = cufftPlan2d(&plan, nx, ny, getCufftType(type)); env->SetIntField(handle, cufftHandle_plan, plan); return result; }
//////////////////////////////////////////////////////////////////////////////// //! Run test //////////////////////////////////////////////////////////////////////////////// void runAutoTest(int argc, char** argv) { printf("[%s]\n", sSDKsample); // Cuda init int dev = cutilChooseCudaDevice(argc, argv); cudaDeviceProp deviceProp; cutilSafeCall(cudaGetDeviceProperties(&deviceProp, dev)); printf("Compute capability %d.%d\n", deviceProp.major, deviceProp.minor); int version = deviceProp.major*10 + deviceProp.minor; g_hasDouble = (version >= 13); if (inEmulationMode()) { // workaround since SM13 kernel doesn't produce correct output in emulation mode g_hasDouble = false; } // create FFT plan CUFFT_SAFE_CALL(cufftPlan2d(&fftPlan, meshW, meshH, CUFFT_C2R) ); // allocate memory fftInputW = (meshW / 2)+1; fftInputH = meshH; fftInputSize = (fftInputW*fftInputH)*sizeof(float2); cutilSafeCall(cudaMalloc((void **)&d_h0, fftInputSize) ); cutilSafeCall(cudaMalloc((void **)&d_ht, fftInputSize) ); h_h0 = (float2 *) malloc(fftInputSize); generate_h0(); cutilSafeCall(cudaMemcpy(d_h0, h_h0, fftInputSize, cudaMemcpyHostToDevice) ); cutilSafeCall(cudaMalloc((void **)&d_slope, meshW*meshH*sizeof(float2)) ); cutCreateTimer(&timer); cutStartTimer(timer); prevTime = cutGetTimerValue(timer); // Creating the Auto-Validation Code g_CheckRender = new CheckBackBuffer(windowH, windowH, 4, false); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); runCudaTest(g_hasDouble); cudaThreadExit(); }
void ifft2dGPU(T1* d_data, int nx, int ny, void* stream) { //printf("Running 2d inverse xform \n"); cufftHandle plan; cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL); if (cufftPlan2d(&plan, ny, nx, CUFFT_Z2Z)!=CUFFT_SUCCESS) { printf( "CUFFT error: Plan creation failed\n"); } //printf("Built plan \n"); cufftSetStream(plan, (cudaStream_t) stream); if (cufftExecZ2Z(plan, (cufftDoubleComplex*) d_data, (cufftDoubleComplex*) d_data, CUFFT_INVERSE)!=CUFFT_SUCCESS) { printf("CUFFT error: Plan execution failed\n"); }; cufftDestroy(plan); }
void WorkerThread::createInitialFilter() { float* gaussian_data; cudaMalloc((void**)&gaussian_data, sizeof(float) * _filter_pixels); int2 gaussian_size; gaussian_size.x = _filter_size; gaussian_size.y = _filter_size; int2 gaussian_center; gaussian_center.x = _filter_size / 2; gaussian_center.y = _filter_size / 2; gaussian(gaussian_data, 0.0, _sigma, 1.0, gaussian_center, gaussian_size); float* harmonic_data; cudaMalloc((void**)&harmonic_data, sizeof(float) * _filter_pixels * 2); int2 harmonic_size; harmonic_size.x = _filter_size; harmonic_size.y = _filter_size; int2 harmonic_center; harmonic_center.x = _filter_size / 2; harmonic_center.y = _filter_size / 2; harmonic(harmonic_data, 0, _lambda, 0.0, harmonic_center, harmonic_size); float* host_harmonic = new float[_filter_size * _filter_size * 2]; cudaMalloc((void**)&_gabor_data, sizeof(float) * _filter_pixels * 2); int2 gabor_size; gabor_size.x = _filter_size; gabor_size.y = _filter_size; int2 gabor_center; gabor_center.x = _filter_size / 2; gabor_center.y = _filter_size / 2; multiplyRealComplex(gaussian_data, harmonic_data, _gabor_data, _filter_size * _filter_size); float* host_gabor_data = new float[_filter_pixels * 2]; cudaMemcpy(host_gabor_data, _gabor_data, sizeof(float) * _filter_pixels * 2, cudaMemcpyDeviceToHost); //pad the filter { float* data = host_gabor_data; float* target = _filter_image; memset(target, 0, sizeof(float) * _padded_pixels * 2); int padded_stride = 2 * _padded_size; int target_stride = 2 * _target_size; for (int i = 0; i < _target_size; ++i) { memcpy(target, data, sizeof(float) * target_stride); target += padded_stride; data += target_stride; } } // Copy gabor data into member for texture creation _filter_image_mutex.lock(); memcpy(_host_gabor_data, host_gabor_data, sizeof(float) * _filter_pixels * 2); _filter_image_mutex.unlock(); cudaFree(_gabor_data); cudaMalloc((void**)&_gabor_data, sizeof(float) * _padded_pixels * 2); cudaMalloc((void**)&_gpu_image_0, sizeof(float) * _padded_pixels * 2); cudaMalloc((void**)&_gpu_image_1, sizeof(float) * _padded_pixels * 2); cudaMemcpy(_gabor_data, _filter_image, sizeof(float) * _padded_pixels * 2, cudaMemcpyHostToDevice); cufftPlan2d(&_fft_plan, _padded_size, _padded_size, CUFFT_C2C); cufftExecC2C(_fft_plan, (cufftComplex*)(_gabor_data), (cufftComplex*)(_gabor_data), CUFFT_FORWARD); cudaMemcpy(_filter_image, _gabor_data, sizeof(float) * _padded_pixels * 2, cudaMemcpyDeviceToHost); emit newFilterImage(); }
//////////////////////////////////////////////////////////////////////////////// //! Run test //////////////////////////////////////////////////////////////////////////////// void runGraphicsTest(int argc, char** argv) { printf("[%s] ", sSDKsample); if (g_bOpenGLQA) printf("[OpenGL Readback Comparisons] "); printf("\n"); if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device") ) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(0); } // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if(CUTFalse == initGL( &argc, argv )) { cudaThreadExit(); return; } cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); // create FFT plan CUFFT_SAFE_CALL(cufftPlan2d(&fftPlan, meshW, meshH, CUFFT_C2R) ); // allocate memory fftInputW = (meshW / 2)+1; fftInputH = meshH; fftInputSize = (fftInputW*fftInputH)*sizeof(float2); cutilSafeCall(cudaMalloc((void **)&d_h0, fftInputSize) ); cutilSafeCall(cudaMalloc((void **)&d_ht, fftInputSize) ); h_h0 = (float2 *) malloc(fftInputSize); generate_h0(); cutilSafeCall(cudaMemcpy(d_h0, h_h0, fftInputSize, cudaMemcpyHostToDevice) ); cutilSafeCall(cudaMalloc((void **)&d_slope, meshW*meshH*sizeof(float2)) ); cutCreateTimer(&timer); cutStartTimer(timer); prevTime = cutGetTimerValue(timer); // create vertex buffers and register with CUDA createVBO(&heightVertexBuffer, meshW*meshH*sizeof(float)); // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(heightVertexBuffer)); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_heightVB_resource, heightVertexBuffer, cudaGraphicsMapFlagsWriteDiscard)); createVBO(&slopeVertexBuffer, meshW*meshH*sizeof(float2)); // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(slopeVertexBuffer)); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_slopeVB_resource, slopeVertexBuffer, cudaGraphicsMapFlagsWriteDiscard)); // create vertex and index buffer for mesh createMeshPositionVBO(&posVertexBuffer, meshW, meshH); createMeshIndexBuffer(&indexBuffer, meshW, meshH); // Creating the Auto-Validation Code if (g_bOpenGLQA) { g_CheckRender = new CheckBackBuffer(windowH, windowH, 4); g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(true); } runCuda(); // register callbacks glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutIdleFunc(idle); // start rendering mainloop glutMainLoop(); cudaThreadExit(); }
int main(int argc, char **argv) { int devID; cudaDeviceProp deviceProps; printf("%s Starting...\n\n", sSDKname); printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if (false == initGL(&argc, argv)) { exit(EXIT_SUCCESS); } // use command-line specified CUDA device, otherwise use device with highest Gflops/s #ifndef OPTIMUS devID = findCudaGLDevice(argc, (const char **)argv); #else devID = gpuGetMaxGflopsDeviceId(); #endif // get number of SMs on this GPU checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); // automated build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } // Allocate and initialize host data GLint bsize; sdkCreateTimer(&timer); sdkResetTimer(&timer); hvfield = (cData *)malloc(sizeof(cData) * DS); memset(hvfield, 0, sizeof(cData) * DS); // Allocate and initialize device data cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM); cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); // Temporary complex velocity field data cudaMalloc((void **)&vxfield, sizeof(cData) * PDS); cudaMalloc((void **)&vyfield, sizeof(cData) * PDS); setupTexture(DIM, DIM); bindTexture(); // Create particle array in host memory particles = (cData *)malloc(sizeof(cData) * DS); memset(particles, 0, sizeof(cData) * DS); #ifdef BROADCAST int step = 1; // Broadcasted visualization stepping. if (argc > 3) step = atoi(argv[3]); // Create additional space to store particle packets // for broadcasting. wstep = step; hstep = step; int npackets = sizeof(float) * (DIM / wstep) * (DIM / hstep) / UdpBroadcastServer::PacketSize; if (sizeof(float) * (DIM / wstep) * (DIM / hstep) % UdpBroadcastServer::PacketSize) npackets++; packets = (char*)malloc(npackets * (UdpBroadcastServer::PacketSize + sizeof(unsigned int))); #endif initParticles(particles, DIM, DIM); #if defined(OPTIMUS) || defined(BROADCAST) // Create particle array in device memory cudaMalloc((void **)&particles_gpu, sizeof(cData) * DS); cudaMemcpy(particles_gpu, particles, sizeof(cData) * DS, cudaMemcpyHostToDevice); #endif // Create CUFFT transform plan configuration cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C); cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R); // TODO: update kernels to use the new unpadded memory layout for perf // rather than the old FFTW-compatible layout cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING); cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING); glGenBuffersARB(1, &vbo); glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo); glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB); glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize); if (bsize != (sizeof(cData) * DS)) goto EXTERR; glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); #ifndef OPTIMUS checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)); getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); #endif if (ref_file) { autoTest(argv); cleanup(); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else { #ifdef BROADCAST const char *sv_addr = "127.0.0:9097"; const char *bc_addr = "127.255.255.2:9097"; // Server address if (argc > 2) sv_addr = argv[2]; // Broadcast address if (argc > 1) bc_addr = argv[1]; server.reset(new UdpBroadcastServer(sv_addr, bc_addr)); // Listen to clients' feedbacks in a separate thread. { pthread_t tid; pthread_create(&tid, NULL, &feedback_listener, &step); } // Broadcast the particles state in a separate thread. { pthread_t tid; pthread_create(&tid, NULL, &broadcaster, &step); } #endif #if defined (__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); #endif glutMainLoop(); } // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); if (!ref_file) { exit(EXIT_SUCCESS); } return 0; EXTERR: printf("Failed to initialize GL extensions.\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(EXIT_FAILURE); }
int main(int argc, char *argv[]) { int i; struct timeval begin, end; int size; size_t bytes; int n = 0, m = 0; STARPUFFT(plan) plan; #ifdef STARPU_HAVE_FFTW _FFTW(plan) fftw_plan; #endif #ifdef STARPU_USE_CUDA cufftHandle cuda_plan; cudaError_t cures; #endif double timing; if (argc < 2 || argc > 3) { fprintf(stderr,"need one or two size of vector\n"); exit(EXIT_FAILURE); } starpu_init(NULL); if (argc == 2) { n = atoi(argv[1]); /* 1D */ size = n; } else if (argc == 3) { n = atoi(argv[1]); m = atoi(argv[2]); /* 2D */ size = n * m; } else { assert(0); } bytes = size * sizeof(STARPUFFT(complex)); STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in)); starpu_srand48(0); for (i = 0; i < size; i++) in[i] = starpu_drand48() + I * starpu_drand48(); STARPUFFT(complex) *out = STARPUFFT(malloc)(size * sizeof(*out)); #ifdef STARPU_HAVE_FFTW STARPUFFT(complex) *out_fftw = STARPUFFT(malloc)(size * sizeof(*out_fftw)); #endif #ifdef STARPU_USE_CUDA STARPUFFT(complex) *out_cuda = malloc(size * sizeof(*out_cuda)); #endif if (argc == 2) { plan = STARPUFFT(plan_dft_1d)(n, SIGN, 0); #ifdef STARPU_HAVE_FFTW fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE); #endif #ifdef STARPU_USE_CUDA if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS) printf("erf\n"); #endif } else if (argc == 3) { plan = STARPUFFT(plan_dft_2d)(n, m, SIGN, 0); #ifdef STARPU_HAVE_FFTW fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE); #endif #ifdef STARPU_USE_CUDA STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS); #endif } else { assert(0); } #ifdef STARPU_HAVE_FFTW gettimeofday(&begin, NULL); _FFTW(execute)(fftw_plan); gettimeofday(&end, NULL); _FFTW(destroy_plan)(fftw_plan); timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec)); printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing); #endif #ifdef STARPU_USE_CUDA gettimeofday(&begin, NULL); if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS) printf("erf2\n"); if ((cures = cudaThreadSynchronize()) != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures); gettimeofday(&end, NULL); cufftDestroy(cuda_plan); timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec)); printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing); #endif STARPUFFT(execute)(plan, in, out); STARPUFFT(showstats)(stdout); STARPUFFT(destroy_plan)(plan); printf("\n"); #if 0 for (i = 0; i < 16; i++) printf("(%f,%f) ", cimag(in[i]), creal(in[i])); printf("\n\n"); for (i = 0; i < 16; i++) printf("(%f,%f) ", cimag(out[i]), creal(out[i])); printf("\n\n"); #ifdef STARPU_HAVE_FFTW for (i = 0; i < 16; i++) printf("(%f,%f) ", cimag(out_fftw[i]), creal(out_fftw[i])); printf("\n\n"); #endif #endif #ifdef STARPU_HAVE_FFTW { double max = 0., tot = 0., norm = 0., normdiff = 0.; for (i = 0; i < size; i++) { double diff = cabs(out[i]-out_fftw[i]); double diff2 = diff * diff; double size = cabs(out_fftw[i]); double size2 = size * size; if (diff > max) max = diff; tot += diff; normdiff += diff2; norm += size2; } fprintf(stderr, "\nmaximum difference %g\n", max); fprintf(stderr, "average difference %g\n", tot / size); fprintf(stderr, "difference norm %g\n", sqrt(normdiff)); double relmaxdiff = max / sqrt(norm); fprintf(stderr, "relative maximum difference %g\n", relmaxdiff); double relavgdiff = (tot / size) / sqrt(norm); fprintf(stderr, "relative average difference %g\n", relavgdiff); if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8)) return EXIT_FAILURE; if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16)) return EXIT_FAILURE; } #endif #ifdef STARPU_USE_CUDA { double max = 0., tot = 0., norm = 0., normdiff = 0.; for (i = 0; i < size; i++) { double diff = cabs(out_cuda[i]-out_fftw[i]); double diff2 = diff * diff; double size = cabs(out_fftw[i]); double size2 = size * size; if (diff > max) max = diff; tot += diff; normdiff += diff2; norm += size2; } fprintf(stderr, "\nmaximum difference %g\n", max); fprintf(stderr, "average difference %g\n", tot / size); fprintf(stderr, "difference norm %g\n", sqrt(normdiff)); double relmaxdiff = max / sqrt(norm); fprintf(stderr, "relative maximum difference %g\n", relmaxdiff); double relavgdiff = (tot / size) / sqrt(norm); fprintf(stderr, "relative average difference %g\n", relavgdiff); if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8)) return EXIT_FAILURE; if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16)) return EXIT_FAILURE; } #endif STARPUFFT(free)(in); STARPUFFT(free)(out); #ifdef STARPU_HAVE_FFTW STARPUFFT(free)(out_fftw); #endif #ifdef STARPU_USE_CUDA free(out_cuda); #endif starpu_shutdown(); return EXIT_SUCCESS; }
bool test0(void) { float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU; float *d_Data, *d_PaddedData, *d_Kernel, *d_PaddedKernel; fComplex *d_DataSpectrum, *d_KernelSpectrum; cufftHandle fftPlanFwd, fftPlanInv; bool bRetVal; StopWatchInterface *hTimer = NULL; sdkCreateTimer(&hTimer); printf("Testing built-in R2C / C2R FFT-based convolution\n"); const int kernelH = 3; const int kernelW = 3; const int kernelY = 1; const int kernelX = 1; const int dataH = 10; const int dataW = 10; const int fftH = snapTransformSize(dataH + kernelH - 1); const int fftW = snapTransformSize(dataW + kernelW - 1); printf("...allocating memory\n"); h_Data = (float *)malloc(dataH * dataW * sizeof(float)); h_Kernel = (float *)malloc(kernelH * kernelW * sizeof(float)); h_ResultCPU = (float *)malloc(dataH * dataW * sizeof(float)); h_ResultGPU = (float *)malloc(fftH * fftW * sizeof(float)); checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex))); checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex))); printf("...generating random input data\n"); srand(2010); for (int i = 0; i < dataH * dataW; i++) { //h_Data[i] = getRand(); h_Data[i] = i + 1; } for (int i = 0; i < kernelH * kernelW; i++) { //h_Kernel[i] = getRand(); h_Kernel[i] = i + 1; } FILE* fp2 = fopen("input_kernel.txt", "w+"); FILE* fp3 = fopen("input_data.txt", "w+"); for (int i = 0; i < dataH * dataW; i++) fprintf(fp3, "%f\n", h_Data[i]); for (int i = 0; i < kernelH * kernelW; i++) fprintf(fp2, "%f\n", h_Kernel[i]); fclose(fp2); fclose(fp3); printf("...creating R2C & C2R FFT plans for %i x %i\n", fftH, fftW); checkCudaErrors(cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C)); checkCudaErrors(cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R)); printf("...uploading to GPU and padding convolution kernel and input data\n"); checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float))); padKernel( d_PaddedKernel, d_Kernel, fftH, fftW, kernelH, kernelW, kernelY, kernelX ); padDataClampToBorder( d_PaddedData, d_Data, fftH, fftW, dataH, dataW, kernelH, kernelW, kernelY, kernelX ); //Not including kernel transformation into time measurement, //since convolution kernel is not changed very frequently printf("...transforming convolution kernel\n"); checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum)); printf("...running GPU FFT convolution: "); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData, (cufftComplex *)d_DataSpectrum)); modulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftH, fftW, 1); checkCudaErrors(cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, (cufftReal *)d_PaddedData)); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double gpuTime = sdkGetTimerValue(&hTimer); printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime); printf("...reading back GPU convolution results\n"); checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost)); printf("...running reference CPU convolution\n"); convolutionClampToBorderCPU( h_ResultCPU, h_Data, h_Kernel, dataH, dataW, kernelH, kernelW, kernelY, kernelX ); printf("...comparing the results: "); double sum_delta2 = 0; double sum_ref2 = 0; double max_delta_ref = 0; for (int y = 0; y < dataH; y++) for (int x = 0; x < dataW; x++) { double rCPU = (double)h_ResultCPU[y * dataW + x]; double rGPU = (double)h_ResultGPU[y * fftW + x]; double delta = (rCPU - rGPU) * (rCPU - rGPU); double ref = rCPU * rCPU + rCPU * rCPU; if ((delta / ref) > max_delta_ref) { max_delta_ref = delta / ref; } sum_delta2 += delta; sum_ref2 += ref; } double L2norm = sqrt(sum_delta2 / sum_ref2); printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref)); bRetVal = (L2norm < 1e-6) ? true : false; printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n"); printf("...shutting down\n"); sdkStartTimer(&hTimer); checkCudaErrors(cufftDestroy(fftPlanInv)); checkCudaErrors(cufftDestroy(fftPlanFwd)); checkCudaErrors(cudaFree(d_DataSpectrum)); checkCudaErrors(cudaFree(d_KernelSpectrum)); checkCudaErrors(cudaFree(d_PaddedData)); checkCudaErrors(cudaFree(d_PaddedKernel)); checkCudaErrors(cudaFree(d_Data)); checkCudaErrors(cudaFree(d_Kernel)); FILE* fp = fopen("result_gpu.txt", "w+"); FILE* fp1 = fopen("result_cpu.txt", "w+"); for (int i = 0; i < dataH * dataW; i++) { fprintf(fp, "%f\n", h_ResultGPU[i]); fprintf(fp1, "%f\n", h_ResultCPU[i]); } fclose(fp); fclose(fp1); free(h_ResultGPU); free(h_ResultCPU); free(h_Data); free(h_Kernel); return bRetVal; }
cufftResult WINAPI wine_cufftPlan2d(cufftHandle *plan, int nx, int ny, cufftType type){ WINE_TRACE("\n"); return cufftPlan2d( plan, nx, ny, type ); }
oskar_FFT* oskar_fft_create(int precision, int location, int num_dim, int dim_size, int batch_size_1d, int* status) { int i; oskar_FFT* h = (oskar_FFT*) calloc(1, sizeof(oskar_FFT)); #ifndef OSKAR_HAVE_CUDA if (location == OSKAR_GPU) location = OSKAR_CPU; #endif #ifndef OSKAR_HAVE_OPENCL if (location & OSKAR_CL) location = OSKAR_CPU; #endif h->precision = precision; h->location = location; h->num_dim = num_dim; h->dim_size = dim_size; h->ensure_consistent_norm = 1; h->num_cells_total = (size_t) dim_size; for (i = 1; i < num_dim; ++i) h->num_cells_total *= (size_t) dim_size; if (location == OSKAR_CPU) { int len = 4 * dim_size + 2 * (int)(log((double)dim_size) / log(2.0)) + 8; h->fftpack_wsave = oskar_mem_create(precision, location, len, status); if (num_dim == 1) { (void) batch_size_1d; *status = OSKAR_ERR_FUNCTION_NOT_AVAILABLE; } else if (num_dim == 2) { if (precision == OSKAR_DOUBLE) oskar_fftpack_cfft2i(dim_size, dim_size, oskar_mem_double(h->fftpack_wsave, status)); else oskar_fftpack_cfft2i_f(dim_size, dim_size, oskar_mem_float(h->fftpack_wsave, status)); } else *status = OSKAR_ERR_INVALID_ARGUMENT; h->fftpack_work = oskar_mem_create(precision, location, 2 * h->num_cells_total, status); } else if (location == OSKAR_GPU) { #ifdef OSKAR_HAVE_CUDA if (num_dim == 1) cufftPlan1d(&h->cufft_plan, dim_size, ((precision == OSKAR_DOUBLE) ? CUFFT_Z2Z : CUFFT_C2C), batch_size_1d); else if (num_dim == 2) cufftPlan2d(&h->cufft_plan, dim_size, dim_size, ((precision == OSKAR_DOUBLE) ? CUFFT_Z2Z : CUFFT_C2C)); else *status = OSKAR_ERR_INVALID_ARGUMENT; #endif } else if (location & OSKAR_CL) { #ifdef OSKAR_HAVE_OPENCL *status = OSKAR_ERR_FUNCTION_NOT_AVAILABLE; #endif } else *status = OSKAR_ERR_BAD_LOCATION; return h; }
static void setfftpl(unsigned num_rows, unsigned num_columns, cufftHandle* fftplan) { cufftPlan2d(fftplan, num_rows, num_columns, CUFFT_Z2Z); }
void WorkerThread::createNewFilter() { // Free GPU memory from current filter and CUFFT cudaFree(_gabor_data); cudaFree(_gpu_image_0); cudaFree(_gpu_image_1); cufftDestroy(_fft_plan); float* gaussian_data; cudaMalloc((void**)&gaussian_data, sizeof(float) * _filter_pixels); int2 gaussian_size; gaussian_size.x = _filter_size; gaussian_size.y = _filter_size; int2 gaussian_center; gaussian_center.x = _filter_size / 2; gaussian_center.y = _filter_size / 2; gaussian(gaussian_data, _new_theta, _new_sigma, 1.0, gaussian_center, gaussian_size); float* harmonic_data; cudaMalloc((void**)&harmonic_data, sizeof(float) * _filter_pixels * 2); int2 harmonic_size; harmonic_size.x = _filter_size; harmonic_size.y = _filter_size; int2 harmonic_center; harmonic_center.x = _filter_size / 2; harmonic_center.y = _filter_size / 2; harmonic(harmonic_data, _new_theta, _new_lambda, _new_psi, harmonic_center, harmonic_size); float* host_harmonic = new float[_filter_size * _filter_size * 2]; cudaMalloc((void**)&_gabor_data, sizeof(float) * _filter_pixels * 2); int2 gabor_size; gabor_size.x = _filter_size; gabor_size.y = _filter_size; int2 gabor_center; gabor_center.x = _filter_size / 2; gabor_center.y = _filter_size / 2; multiplyRealComplex(gaussian_data, harmonic_data, _gabor_data, _filter_size * _filter_size); float* host_gabor_data = new float[_filter_pixels * 2]; cudaMemcpy(host_gabor_data, _gabor_data, sizeof(float) * _filter_pixels * 2, cudaMemcpyDeviceToHost); //pad the filter { float* data = host_gabor_data; float* target = _filter_image; memset(target, 0, sizeof(float) * _padded_pixels * 2); int padded_stride = 2 * _padded_size; int target_stride = 2 * _target_size; for (int i = 0; i < _target_size; ++i) { memcpy(target, data, sizeof(float) * target_stride); target += padded_stride; data += target_stride; } } // Copy gabor data into member for texture creation _filter_image_mutex.lock(); memcpy(_host_gabor_data, host_gabor_data, sizeof(float) * _filter_pixels * 2); _filter_image_mutex.unlock(); cudaFree(_gabor_data); cudaMalloc((void**)&_gabor_data, sizeof(float) * _padded_pixels * 2); cudaMalloc((void**)&_gpu_image_0, sizeof(float) * _padded_pixels * 2); cudaMalloc((void**)&_gpu_image_1, sizeof(float) * _padded_pixels * 2); cudaMemcpy(_gabor_data, _filter_image, sizeof(float) * _padded_pixels * 2, cudaMemcpyHostToDevice); cufftPlan2d(&_fft_plan, _padded_size, _padded_size, CUFFT_C2C); cufftExecC2C(_fft_plan, (cufftComplex*)(_gabor_data), (cufftComplex*)(_gabor_data), CUFFT_FORWARD); cudaMemcpy(_filter_image, _gabor_data, sizeof(float) * _padded_pixels * 2, cudaMemcpyDeviceToHost); // Free temporary GPU memory used for creation of filter cudaFree(gaussian_data); cudaFree(harmonic_data); delete host_harmonic; delete host_gabor_data; _should_create_new_filter = false; emit newFilterImage(); }
int main(int argc, char **argv) { int devID; cudaDeviceProp deviceProps; printf("%s Starting...\n\n", sSDKname); printf("[%s] - [OpenGL/CUDA simulation] starting...\n", sSDKname); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. if (false == initGL(&argc, argv)) { exit(EXIT_SUCCESS); } // use command-line specified CUDA device, otherwise use device with highest Gflops/s devID = findCudaGLDevice(argc, (const char **)argv); // get number of SMs on this GPU checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); // automated build testing harness if (checkCmdLineFlag(argc, (const char **)argv, "file")) { getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); } // Allocate and initialize host data GLint bsize; sdkCreateTimer(&timer); sdkResetTimer(&timer); hvfield = (cData *)malloc(sizeof(cData) * DS); memset(hvfield, 0, sizeof(cData) * DS); // Allocate and initialize device data cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(cData)*DIM, DIM); cudaMemcpy(dvfield, hvfield, sizeof(cData) * DS, cudaMemcpyHostToDevice); // Temporary complex velocity field data cudaMalloc((void **)&vxfield, sizeof(cData) * PDS); cudaMalloc((void **)&vyfield, sizeof(cData) * PDS); setupTexture(DIM, DIM); bindTexture(); // Create particle array particles = (cData *)malloc(sizeof(cData) * DS); memset(particles, 0, sizeof(cData) * DS); initParticles(particles, DIM, DIM); // Create CUFFT transform plan configuration cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C); cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R); // TODO: update kernels to use the new unpadded memory layout for perf // rather than the old FFTW-compatible layout cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING); cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING); glGenBuffersARB(1, &vbo); glBindBufferARB(GL_ARRAY_BUFFER_ARB, vbo); glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(cData) * DS, particles, GL_DYNAMIC_DRAW_ARB); glGetBufferParameterivARB(GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &bsize); if (bsize != (sizeof(cData) * DS)) goto EXTERR; glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)); getLastCudaError("cudaGraphicsGLRegisterBuffer failed"); if (ref_file) { autoTest(argv); cleanup(); cudaDeviceReset(); printf("[fluidsGL] - Test Results: %d Failures\n", g_TotalErrors); exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); } else { atexit(cleanup); glutMainLoop(); } cudaDeviceReset(); if (!ref_file) { exit(EXIT_SUCCESS); } return 0; EXTERR: printf("Failed to initialize GL extensions.\n"); cudaDeviceReset(); exit(EXIT_FAILURE); }
bool test2(void) { float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU; float *d_Data, *d_Kernel, *d_PaddedData, *d_PaddedKernel; fComplex *d_DataSpectrum0, *d_KernelSpectrum0; cufftHandle fftPlan; bool bRetVal; StopWatchInterface *hTimer = NULL; sdkCreateTimer(&hTimer); printf("Testing updated custom R2C / C2R FFT-based convolution\n"); const int kernelH = 7; const int kernelW = 6; const int kernelY = 3; const int kernelX = 4; const int dataH = 2000; const int dataW = 2000; const int fftH = snapTransformSize(dataH + kernelH - 1); const int fftW = snapTransformSize(dataW + kernelW - 1); printf("...allocating memory\n"); h_Data = (float *)malloc(dataH * dataW * sizeof(float)); h_Kernel = (float *)malloc(kernelH * kernelW * sizeof(float)); h_ResultCPU = (float *)malloc(dataH * dataW * sizeof(float)); h_ResultGPU = (float *)malloc(fftH * fftW * sizeof(float)); checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum0, fftH * (fftW / 2) * sizeof(fComplex))); checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum0, fftH * (fftW / 2) * sizeof(fComplex))); printf("...generating random input data\n"); srand(2010); for (int i = 0; i < dataH * dataW; i++) { h_Data[i] = getRand(); } for (int i = 0; i < kernelH * kernelW; i++) { h_Kernel[i] = getRand(); } printf("...creating C2C FFT plan for %i x %i\n", fftH, fftW / 2); checkCudaErrors(cufftPlan2d(&fftPlan, fftH, fftW / 2, CUFFT_C2C)); printf("...uploading to GPU and padding convolution kernel and input data\n"); checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float))); padDataClampToBorder( d_PaddedData, d_Data, fftH, fftW, dataH, dataW, kernelH, kernelW, kernelY, kernelX ); padKernel( d_PaddedKernel, d_Kernel, fftH, fftW, kernelH, kernelW, kernelY, kernelX ); //CUFFT_INVERSE works just as well... const int FFT_DIR = CUFFT_FORWARD; //Not including kernel transformation into time measurement, //since convolution kernel is not changed very frequently printf("...transforming convolution kernel\n"); checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum0, FFT_DIR)); printf("...running GPU FFT convolution: "); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedData, (cufftComplex *)d_DataSpectrum0, FFT_DIR)); spProcess2D(d_DataSpectrum0, d_DataSpectrum0, d_KernelSpectrum0, fftH, fftW / 2, FFT_DIR); checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_DataSpectrum0, (cufftComplex *)d_PaddedData, -FFT_DIR)); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double gpuTime = sdkGetTimerValue(&hTimer); printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime); printf("...reading back GPU FFT results\n"); checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost)); printf("...running reference CPU convolution\n"); convolutionClampToBorderCPU( h_ResultCPU, h_Data, h_Kernel, dataH, dataW, kernelH, kernelW, kernelY, kernelX ); printf("...comparing the results: "); double sum_delta2 = 0; double sum_ref2 = 0; double max_delta_ref = 0; for (int y = 0; y < dataH; y++) { for (int x = 0; x < dataW; x++) { double rCPU = (double)h_ResultCPU[y * dataW + x]; double rGPU = (double)h_ResultGPU[y * fftW + x]; double delta = (rCPU - rGPU) * (rCPU - rGPU); double ref = rCPU * rCPU + rCPU * rCPU; if ((delta / ref) > max_delta_ref) { max_delta_ref = delta / ref; } sum_delta2 += delta; sum_ref2 += ref; } } double L2norm = sqrt(sum_delta2 / sum_ref2); printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref)); bRetVal = (L2norm < 1e-6) ? true : false; printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n"); printf("...shutting down\n"); sdkStartTimer(&hTimer); checkCudaErrors(cufftDestroy(fftPlan)); checkCudaErrors(cudaFree(d_KernelSpectrum0)); checkCudaErrors(cudaFree(d_DataSpectrum0)); checkCudaErrors(cudaFree(d_PaddedKernel)); checkCudaErrors(cudaFree(d_PaddedData)); checkCudaErrors(cudaFree(d_Kernel)); checkCudaErrors(cudaFree(d_Data)); free(h_ResultGPU); free(h_ResultCPU); free(h_Kernel); free(h_Data); return bRetVal; }