void benchmark(int iterations) { // allocate memory for result unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); checkCudaErrors(cudaMalloc((void **) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); checkCudaErrors(cudaDeviceSynchronize()); sdkStartTimer(&timer); // execute the kernel for (int i = 0; i < iterations; i++) { gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&timer); // check if kernel execution generated an error getLastCudaError("Kernel execution failed"); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("%.2f Mpixels/sec\n", (width*height*iterations / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); checkCudaErrors(cudaFree(d_result)); }
void copy_image(PPM_IMG img_in) { StopWatchInterface *timer=NULL; PPM_IMG host_img; PPM_IMG device_img; int size = img_in.w * img_in.h * sizeof(unsigned char); host_img.w = img_in.w; host_img.h = img_in.h; host_img.img_r = (unsigned char *)malloc(size); host_img.img_g = (unsigned char *)malloc(size); host_img.img_b = (unsigned char *)malloc(size); device_img.w = img_in.w; device_img.h = img_in.h; cudaMalloc((void **)&(device_img.img_r), size); cudaMalloc((void **)&(device_img.img_g), size); cudaMalloc((void **)&(device_img.img_b), size); launchEmptyKernel(); // lauch an empty kernel printf("Starting copy image...\n"); // CPU to GPU sdkCreateTimer(&timer); sdkStartTimer(&timer); cudaMemcpy(device_img.img_r, img_in.img_r, size, cudaMemcpyHostToDevice); cudaMemcpy(device_img.img_g, img_in.img_g, size, cudaMemcpyHostToDevice); cudaMemcpy(device_img.img_b, img_in.img_b, size, cudaMemcpyHostToDevice); sdkStopTimer(&timer); printf("Time of copy image from CPU to GPU: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); // GPU to CPU sdkCreateTimer(&timer); sdkStartTimer(&timer); cudaMemcpy(host_img.img_r, device_img.img_r, size, cudaMemcpyDeviceToHost); cudaMemcpy(host_img.img_g, device_img.img_g, size, cudaMemcpyDeviceToHost); cudaMemcpy(host_img.img_b, device_img.img_b, size, cudaMemcpyDeviceToHost); sdkStopTimer(&timer); printf("Time of copy image from GPU to CPU: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); cudaFree(device_img.img_r); cudaFree(device_img.img_g); cudaFree(device_img.img_b); free(host_img.img_r); free(host_img.img_g); free(host_img.img_b); }
void runBenchmark(int iterations, char *exec_path) { printf("Run %u particles simulation for %d iterations...\n\n", numParticles, iterations); cudaDeviceSynchronize(); sdkStartTimer(&timer); for (int i = 0; i < iterations; ++i) { psystem->update(timestep); } cudaDeviceSynchronize(); sdkStopTimer(&timer); float fAvgSeconds = ((float)1.0e-3 * (float)sdkGetTimerValue(&timer)/(float)iterations); printf("particles, Throughput = %.4f KParticles/s, Time = %.5f s, Size = %u particles, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-3 * numParticles)/fAvgSeconds, fAvgSeconds, numParticles, 1, 0); if (g_refFile) { printf("\nChecking result...\n\n"); float *hPos = (float *)malloc(sizeof(float)*4*psystem->getNumParticles()); copyArrayFromDevice(hPos, psystem->getCudaPosVBO(), 0, sizeof(float)*4*psystem->getNumParticles()); sdkDumpBin((void *)hPos, sizeof(float)*4*psystem->getNumParticles(), "particles.bin"); if (!sdkCompareBin2BinFloat("particles.bin", g_refFile, sizeof(float)*4*psystem->getNumParticles(), MAX_EPSILON_ERROR, THRESHOLD, exec_path)) { g_TotalErrors++; } } }
void runBenchmark(int iterations) { printf("[%s] (Benchmark Mode)\n", sSDKsample); sdkCreateTimer(&timer); uchar4 *d_output; checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource)); sdkStartTimer(&timer); for (int i = 0; i < iterations; ++i) { render(imageWidth, imageHeight, tx, ty, scale, cx, cy, blockSize, gridSize, g_FilterMode, d_output); } cudaDeviceSynchronize(); sdkStopTimer(&timer); float time = sdkGetTimerValue(&timer) / (float) iterations; checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); printf("time: %0.3f ms, %f Mpixels/sec\n", time, (width*height / (time * 0.001f)) / 1e6); }
void displayFunc(void) { sdkStartTimer(&timer); TColor *d_dst = NULL; size_t num_bytes; if (frameCounter++ == 0) { sdkResetTimer(&timer); } // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_dst, gl_PBO)); checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); getLastCudaError("cudaGraphicsMapResources failed"); checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_dst, &num_bytes, cuda_pbo_resource)); getLastCudaError("cudaGraphicsResourceGetMappedPointer failed"); checkCudaErrors(CUDA_Bind2TextureArray()); runImageFilters(d_dst); checkCudaErrors(CUDA_UnbindTexture()); // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(gl_PBO)); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); // Common display code path { glClear(GL_COLOR_BUFFER_BIT); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imageW, imageH, GL_RGBA, GL_UNSIGNED_BYTE, BUFFER_DATA(0)); glBegin(GL_TRIANGLES); glTexCoord2f(0, 0); glVertex2f(-1, -1); glTexCoord2f(2, 0); glVertex2f(+3, -1); glTexCoord2f(0, 2); glVertex2f(-1, +3); glEnd(); glFinish(); } if (frameCounter == frameN) { frameCounter = 0; if (g_FPS) { printf("FPS: %3.1f\n", frameN / (sdkGetTimerValue(&timer) * 0.001)); g_FPS = false; } } glutSwapBuffers(); glutReportErrors(); sdkStopTimer(&timer); computeFPS(); }
void EyeDescriptor::mainHough(cv::Mat& dst) { StopWatchInterface *timer = NULL; sdkCreateTimer(&timer); sdkStartTimer(&timer); //pixels which should be considered std::vector<std::pair<int, int>> edgesIdx; for (int y = 0; y < height; y++) for (int x = 0; x < width; x++) if (dst.at<uchar>(y, x) == 255) { edgesIdx.push_back( std::pair<int, int>(x, y) ); } int NPixelsEdges = (int) edgesIdx.size(); for (int ipixel = 0; ipixel < NPixelsEdges; ++ipixel) { int x = edgesIdx[ipixel].first; int y = edgesIdx[ipixel].second; //gradient angle? int q_angle = localGradient_angles[x + y * width]; //we check for each radius // from rmin to rmax for (double r = rmin; r < rmax; r += rdelta) { int eps = 40; if (std::abs(r) < eps) continue; int ri = int(((r - rmin) / (rmax - rmin))*rstepnumb); if (ri == rstepnumb) ri = rstepnumb - 1; //small chance for drawing rmax and getting out of bounds int x0 = int(x - r*ci[q_angle] + 0.5); int y0 = int(y - r*si[q_angle] + 0.5); if (!(x0>=0 && x0 < width && y0>=0 && y0 < height)) continue; int tmp = ++accummulator[x0 + y0*width + ri*height*width]; if (tmp > houghmaxval){ houghmaxval = tmp; x_maxval = x0; y_maxval = y0; r_maxval = int(std::abs(r)+0.5); } } } sdkStopTimer(&timer); float execution_time = sdkGetTimerValue(&timer); std::cout << "Main loop, TIME: " << execution_time << "[ms]" << std::endl; }
bool runSingleTest(const char *ref_file, const char *exec_path) { // allocate memory for result int nTotalErrors = 0; unsigned int *d_result; unsigned int size = width * height * sizeof(unsigned int); checkCudaErrors(cudaMalloc((void **) &d_result, size)); // warm-up gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); checkCudaErrors(cudaDeviceSynchronize()); sdkStartTimer(&timer); gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); checkCudaErrors(cudaDeviceSynchronize()); getLastCudaError("Kernel execution failed"); sdkStopTimer(&timer); unsigned char *h_result = (unsigned char *)malloc(width*height*4); checkCudaErrors(cudaMemcpy(h_result, d_result, width*height*4, cudaMemcpyDeviceToHost)); char dump_file[1024]; sprintf(dump_file, "lena_%02d.ppm", (int)sigma); sdkSavePPM4ub(dump_file, h_result, width, height); if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, THRESHOLD, false)) { nTotalErrors++; } printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("%.2f Mpixels/sec\n", (width*height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); checkCudaErrors(cudaFree(d_result)); free(h_result); printf("Summary: %d errors!\n", nTotalErrors); printf(nTotalErrors == 0 ? "Test passed\n": "Test failed!\n"); return (nTotalErrors == 0); }
void run_cpu_color_test(PPM_IMG img_in) { StopWatchInterface *timer=NULL; printf("Starting CPU processing...\n"); sdkCreateTimer(&timer); sdkStartTimer(&timer); img_obuf_yuv_cpu = rgb2yuv(img_in); //Start RGB 2 YUV sdkStopTimer(&timer); printf("RGB to YUV conversion time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); sdkCreateTimer(&timer); sdkStartTimer(&timer); img_obuf_rgb_cpu = yuv2rgb(img_obuf_yuv_cpu); //Start YUV 2 RGB sdkStopTimer(&timer); printf("YUV to RGB conversion time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); write_yuv(img_obuf_yuv_cpu, "out_yuv.yuv"); write_ppm(img_obuf_rgb_cpu, "out_rgb.ppm"); }
void run_gpu_color_test(PPM_IMG img_in) { StopWatchInterface *timer=NULL; launchEmptyKernel(); // lauch an empty kernel printf("Starting GPU processing...\n"); sdkCreateTimer(&timer); sdkStartTimer(&timer); img_obuf_yuv_gpu = rgb2yuvGPU(img_in); //Start RGB 2 YUV sdkStopTimer(&timer); printf("RGB to YUV conversion time(GPU): %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); sdkCreateTimer(&timer); sdkStartTimer(&timer); img_obuf_rgb_gpu = yuv2rgbGPU(img_obuf_yuv_gpu); //Start YUV 2 RGB sdkStopTimer(&timer); printf("YUV to RGB conversion time(GPU): %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); write_ppm(img_obuf_rgb_gpu, "out_rgb.ppm"); write_yuv(img_obuf_yuv_gpu, "out_yuv.yuv"); }
void filter() { if (filterAnimation) { filterFactor = cosf(sdkGetTimerValue(&animationTimer) * filterTimeScale); } FilterKernel_update(filterFactor); Volume *volumeRender = VolumeFilter_runFilter(&volumeOriginal,&volumeFilter0,&volumeFilter1, filterIterations, 3*3*3,filterWeights,filterBias); VolumeRender_setVolume(volumeRender); }
void _runBenchmark(int iterations) { // once without timing to prime the device if (!useCpu) { m_nbody->update(activeParams.m_timestep); } if (useCpu) { sdkCreateTimer(&timer); sdkStartTimer(&timer); } else { checkCudaErrors(cudaEventRecord(startEvent, 0)); } for (int i = 0; i < iterations; ++i) { m_nbody->update(activeParams.m_timestep); } float milliseconds = 0; if (useCpu) { sdkStopTimer(&timer); milliseconds = sdkGetTimerValue(&timer); sdkStartTimer(&timer); } else { checkCudaErrors(cudaEventRecord(stopEvent, 0)); checkCudaErrors(cudaEventSynchronize(stopEvent)); checkCudaErrors(cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); } double interactionsPerSecond = 0; double gflops = 0; computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations); printf("%d bodies, total time for %d iterations: %.3f ms, mean %f\n", numBodies, iterations, milliseconds, milliseconds/iterations); printf("= %.3f billion interactions per second\n", interactionsPerSecond); printf("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops, (sizeof(T) > 4) ? "double" : "single", flopsPerInteraction); }
void runBenchmark(int iterations, char *exec_path) { printf("Run %u particles simulation for %d iterations...\n\n", numParticles, iterations); cudaDeviceSynchronize(); sdkStartTimer(&timer); for (int i = 0; i < iterations; ++i) { psystem->update(timestep); } cudaDeviceSynchronize(); sdkStopTimer(&timer); float fAvgSeconds = ((float)1.0e-3 * (float)sdkGetTimerValue(&timer)/(float)iterations); printf("particles, Throughput = %.4f KParticles/s, Time = %.5f s, Size = %u particles, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-3 * numParticles)/fAvgSeconds, fAvgSeconds, numParticles, 1, 0); }
void runBenchmark(int iterations, char *exec_path) { int file_count=0, iterationsPerFrame = (int)(1.0/(30.0*timestep)); printf("Run %u particles simulation for %d iterations...\n\n", numParticles, iterations); //abb58: 1. what are you trying to sync??? cudaDeviceSynchronize(); sdkStartTimer(&timer); for (int i = 0; i < iterations; ++i) { psystem->update(timestep); if (i % iterationsPerFrame == 0) { psystem->writeParticles(fpout, 0, numParticles, file_count); //psystem->dumpParticles(0, numParticles, file_count); file_count++; } } cudaDeviceSynchronize(); sdkStopTimer(&timer); float fAvgSeconds = ((float)1.0e-3 * (float)sdkGetTimerValue(&timer)/(float)iterations); }
//////////////////////////////////////////////////////////////////////////////// // Test driver //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal; uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal; StopWatchInterface *hTimer = NULL; const uint N = 4 * 1048576; const uint DIR = 1; const uint numValues = 65536; printf("%s Starting...\n\n", argv[0]); int dev = findCudaDevice(argc, (const char **) argv); if (dev == -1) { return EXIT_FAILURE; } printf("Allocating and initializing host arrays...\n\n"); sdkCreateTimer(&hTimer); h_SrcKey = (uint *)malloc(N * sizeof(uint)); h_SrcVal = (uint *)malloc(N * sizeof(uint)); h_DstKey = (uint *)malloc(N * sizeof(uint)); h_DstVal = (uint *)malloc(N * sizeof(uint)); srand(2009); for (uint i = 0; i < N; i++) { h_SrcKey[i] = rand() % numValues; } fillValues(h_SrcVal, N); printf("Allocating and initializing CUDA arrays...\n\n"); checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint))); checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice)); printf("Initializing GPU merge sort...\n"); initMergeSort(); printf("Running GPU merge sort...\n"); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); mergeSort( d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR ); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); printf("Time: %f ms\n", sdkGetTimerValue(&hTimer)); printf("Reading back GPU merge sort results...\n"); checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost)); printf("Inspecting the results...\n"); uint keysFlag = validateSortedKeys( h_DstKey, h_SrcKey, 1, N, numValues, DIR ); uint valuesFlag = validateSortedValues( h_DstKey, h_DstVal, h_SrcKey, 1, N ); printf("Shutting down...\n"); closeMergeSort(); sdkDeleteTimer(&hTimer); checkCudaErrors(cudaFree(d_SrcVal)); checkCudaErrors(cudaFree(d_SrcKey)); checkCudaErrors(cudaFree(d_BufVal)); checkCudaErrors(cudaFree(d_BufKey)); checkCudaErrors(cudaFree(d_DstVal)); checkCudaErrors(cudaFree(d_DstKey)); free(h_DstVal); free(h_DstKey); free(h_SrcVal); free(h_SrcKey); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE); }
int main(int argc, char **argv) { // Start logs printf("%s Starting...\n\n", argv[0]); unsigned int useDoublePrecision; char *precisionChoice; getCmdLineArgumentString(argc, (const char **)argv, "type", &precisionChoice); if (precisionChoice == NULL) { useDoublePrecision = 0; } else { if (!STRCASECMP(precisionChoice, "double")) { useDoublePrecision = 1; } else { useDoublePrecision = 0; } } unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; float *h_OutputGPU, *d_Output; int dim, pos; double delta, ref, sumDelta, sumRef, L1norm, gpuTime; StopWatchInterface *hTimer = NULL; if (sizeof(INT64) != 8) { printf("sizeof(INT64) != 8\n"); return 0; } // use command-line specified CUDA device, otherwise use device with highest Gflops/s int dev = findCudaDevice(argc, (const char **)argv); sdkCreateTimer(&hTimer); int deviceIndex; checkCudaErrors(cudaGetDevice(&deviceIndex)); cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, deviceIndex)); int version = deviceProp.major * 10 + deviceProp.minor; if (useDoublePrecision && version < 13) { printf("Double precision not supported.\n"); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); return 0; } printf("Allocating GPU memory...\n"); checkCudaErrors(cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float))); printf("Allocating CPU memory...\n"); h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float)); printf("Initializing QRNG tables...\n\n"); initQuasirandomGenerator(tableCPU); if (useDoublePrecision) { initTable_SM13(tableCPU); } else { initTable_SM10(tableCPU); } printf("Testing QRNG...\n\n"); checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); int numIterations = 20; for (int i = -1; i < numIterations; i++) { if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); } if (useDoublePrecision) { quasirandomGenerator_SM13(d_Output, 0, N); } else { quasirandomGenerator_SM10(d_Output, 0, N); } } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3; printf("quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS); printf("\nReading GPU results...\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost)); printf("Comparing to the CPU results...\n\n"); sumDelta = 0; sumRef = 0; for (dim = 0; dim < QRNG_DIMENSIONS; dim++) for (pos = 0; pos < N; pos++) { ref = getQuasirandomValue63(pos, dim); delta = (double)h_OutputGPU[dim * N + pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } printf("L1 norm: %E\n", sumDelta / sumRef); printf("\nTesting inverseCNDgpu()...\n\n"); checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); for (int i = -1; i < numIterations; i++) { if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); } if (useDoublePrecision) { inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N); } else { inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N); } } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3; printf("quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128); printf("Reading GPU results...\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost)); printf("\nComparing to the CPU results...\n"); sumDelta = 0; sumRef = 0; unsigned int distance = ((unsigned int)-1) / (QRNG_DIMENSIONS * N + 1); for (pos = 0; pos < QRNG_DIMENSIONS * N; pos++) { unsigned int d = (pos + 1) * distance; ref = MoroInvCNDcpu(d); delta = (double)h_OutputGPU[pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } printf("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); printf("Shutting down...\n"); sdkDeleteTimer(&hTimer); free(h_OutputGPU); checkCudaErrors(cudaFree(d_Output)); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE); }
int main(int argc, char **argv) { // Start logs printf("%s Starting...\n\n", argv[0]); unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; float *h_OutputGPU, *d_Output; int dim, pos; double delta, ref, sumDelta, sumRef, L1norm, gpuTime; StopWatchInterface *hTimer = NULL; if (sizeof(INT64) != 8) { printf("sizeof(INT64) != 8\n"); return 0; } cudaDeviceProp deviceProp; int dev = findCudaDevice(argc, (const char **)argv); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); if (((deviceProp.major << 4) + deviceProp.minor) < 0x20) { fprintf(stderr, "quasirandomGenerator requires Compute Capability of SM 2.0 or higher to run.\n"); cudaDeviceReset(); exit(EXIT_WAIVED); } sdkCreateTimer(&hTimer); printf("Allocating GPU memory...\n"); checkCudaErrors(cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float))); printf("Allocating CPU memory...\n"); h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float)); printf("Initializing QRNG tables...\n\n"); initQuasirandomGenerator(tableCPU); initTableGPU(tableCPU); printf("Testing QRNG...\n\n"); checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); int numIterations = 20; for (int i = -1; i < numIterations; i++) { if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); } quasirandomGeneratorGPU(d_Output, 0, N); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3; printf("quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS); printf("\nReading GPU results...\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost)); printf("Comparing to the CPU results...\n\n"); sumDelta = 0; sumRef = 0; for (dim = 0; dim < QRNG_DIMENSIONS; dim++) for (pos = 0; pos < N; pos++) { ref = getQuasirandomValue63(pos, dim); delta = (double)h_OutputGPU[dim * N + pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } printf("L1 norm: %E\n", sumDelta / sumRef); printf("\nTesting inverseCNDgpu()...\n\n"); checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); for (int i = -1; i < numIterations; i++) { if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); } inverseCNDgpu(d_Output, NULL, QRNG_DIMENSIONS * N); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3; printf("quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128); printf("Reading GPU results...\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost)); printf("\nComparing to the CPU results...\n"); sumDelta = 0; sumRef = 0; unsigned int distance = ((unsigned int)-1) / (QRNG_DIMENSIONS * N + 1); for (pos = 0; pos < QRNG_DIMENSIONS * N; pos++) { unsigned int d = (pos + 1) * distance; ref = MoroInvCNDcpu(d); delta = (double)h_OutputGPU[pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } printf("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); printf("Shutting down...\n"); sdkDeleteTimer(&hTimer); free(h_OutputGPU); checkCudaErrors(cudaFree(d_Output)); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE); }
//////////////////////////////////////////////////////////////////////////////// // Test driver //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { cudaError_t error; printf("%s Starting...\n\n", argv[0]); printf("Starting up CUDA context...\n"); int dev = findCudaDevice(argc, (const char **)argv); uint *h_InputKey, *h_InputVal, *h_OutputKeyGPU, *h_OutputValGPU; uint *d_InputKey, *d_InputVal, *d_OutputKey, *d_OutputVal; StopWatchInterface *hTimer = NULL; const uint N = 1048576; const uint DIR = 0; const uint numValues = 65536; const uint numIterations = 1; printf("Allocating and initializing host arrays...\n\n"); sdkCreateTimer(&hTimer); h_InputKey = (uint *)malloc(N * sizeof(uint)); h_InputVal = (uint *)malloc(N * sizeof(uint)); h_OutputKeyGPU = (uint *)malloc(N * sizeof(uint)); h_OutputValGPU = (uint *)malloc(N * sizeof(uint)); srand(2001); for (uint i = 0; i < N; i++) { h_InputKey[i] = rand() % numValues; h_InputVal[i] = i; } printf("Allocating and initializing CUDA arrays...\n\n"); error = cudaMalloc((void **)&d_InputKey, N * sizeof(uint)); checkCudaErrors(error); error = cudaMalloc((void **)&d_InputVal, N * sizeof(uint)); checkCudaErrors(error); error = cudaMalloc((void **)&d_OutputKey, N * sizeof(uint)); checkCudaErrors(error); error = cudaMalloc((void **)&d_OutputVal, N * sizeof(uint)); checkCudaErrors(error); error = cudaMemcpy(d_InputKey, h_InputKey, N * sizeof(uint), cudaMemcpyHostToDevice); checkCudaErrors(error); error = cudaMemcpy(d_InputVal, h_InputVal, N * sizeof(uint), cudaMemcpyHostToDevice); checkCudaErrors(error); int flag = 1; printf("Running GPU bitonic sort (%u identical iterations)...\n\n", numIterations); for (uint arrayLength = 64; arrayLength <= N; arrayLength *= 2) { printf("Testing array length %u (%u arrays per batch)...\n", arrayLength, N / arrayLength); error = cudaDeviceSynchronize(); checkCudaErrors(error); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); uint threadCount = 0; for (uint i = 0; i < numIterations; i++) threadCount = bitonicSort( d_OutputKey, d_OutputVal, d_InputKey, d_InputVal, N / arrayLength, arrayLength, DIR ); error = cudaDeviceSynchronize(); checkCudaErrors(error); sdkStopTimer(&hTimer); printf("Average time: %f ms\n\n", sdkGetTimerValue(&hTimer) / numIterations); if (arrayLength == N) { double dTimeSecs = 1.0e-3 * sdkGetTimerValue(&hTimer) / numIterations; printf("sortingNetworks-bitonic, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/dTimeSecs), dTimeSecs, arrayLength, 1, threadCount); } printf("\nValidating the results...\n"); printf("...reading back GPU results\n"); error = cudaMemcpy(h_OutputKeyGPU, d_OutputKey, N * sizeof(uint), cudaMemcpyDeviceToHost); checkCudaErrors(error); error = cudaMemcpy(h_OutputValGPU, d_OutputVal, N * sizeof(uint), cudaMemcpyDeviceToHost); checkCudaErrors(error); int keysFlag = validateSortedKeys(h_OutputKeyGPU, h_InputKey, N / arrayLength, arrayLength, numValues, DIR); int valuesFlag = validateValues(h_OutputKeyGPU, h_OutputValGPU, h_InputKey, N / arrayLength, arrayLength); flag = flag && keysFlag && valuesFlag; printf("\n"); } printf("Shutting down...\n"); sdkDeleteTimer(&hTimer); cudaFree(d_OutputVal); cudaFree(d_OutputKey); cudaFree(d_InputVal); cudaFree(d_InputKey); free(h_OutputValGPU); free(h_OutputKeyGPU); free(h_InputVal); free(h_InputKey); cudaDeviceReset(); exit(flag ? EXIT_SUCCESS : EXIT_FAILURE); }
void computeFPS(HWND hWnd, bool bUseInterop) { sdkStopTimer(&frame_timer); if (g_bRunning) { g_fpsCount++; if (!g_pFrameQueue->isEndOfDecode()) { g_FrameCount++; } } char sFPS[256]; std::string sDecodeStatus; if (g_bDeviceLost) { sDecodeStatus = "DeviceLost!\0"; sprintf(sFPS, "%s [%s] - [%s %d]", sAppName, sDecodeStatus.c_str(), (g_bIsProgressive ? "Frame" : "Field"), g_DecodeFrameCount); if (bUseInterop && (!g_bQAReadback)) { SetWindowText(hWnd, sFPS); UpdateWindow(hWnd); } sdkResetTimer(&frame_timer); g_fpsCount = 0; return; } if (g_pFrameQueue->isEndOfDecode()) { sDecodeStatus = "STOP (End of File)\0"; // we only want to record this once if (total_time == 0.0f) { total_time = sdkGetTimerValue(&global_timer); } sdkStopTimer(&global_timer); if (g_bAutoQuit) { g_bRunning = false; g_bDone = true; } } else { if (!g_bRunning) { sDecodeStatus = "PAUSE\0"; sprintf(sFPS, "%s [%s] - [%s %d] - Video Display %s / Vsync %s", sAppName, sDecodeStatus.c_str(), (g_bIsProgressive ? "Frame" : "Field"), g_DecodeFrameCount, g_bUseDisplay ? "ON" : "OFF", g_bUseVsync ? "ON" : "OFF"); if (bUseInterop && (!g_bQAReadback)) { SetWindowText(hWnd, sFPS); UpdateWindow(hWnd); } } else { if (g_bFrameStep) { sDecodeStatus = "STEP\0"; } else { sDecodeStatus = "PLAY\0"; } } if (g_fpsCount == g_fpsLimit) { float ifps = 1.f / (sdkGetAverageTimerValue(&frame_timer) / 1000.f); sprintf(sFPS, "[%s] [%s] - [%3.1f fps, %s %d] - Video Display %s / Vsync %s", sAppName, sDecodeStatus.c_str(), ifps, (g_bIsProgressive ? "Frame" : "Field"), g_DecodeFrameCount, g_bUseDisplay ? "ON" : "OFF", g_bUseVsync ? "ON" : "OFF"); if (bUseInterop && (!g_bQAReadback)) { SetWindowText(hWnd, sFPS); UpdateWindow(hWnd); } printf("[%s] - [%s: %04d, %04.1f fps, time: %04.2f (ms) ]\n", sSDKname, (g_bIsProgressive ? "Frame" : "Field"), g_FrameCount, ifps, 1000.f/ifps); sdkResetTimer(&frame_timer); g_fpsCount = 0; } } sdkStartTimer(&frame_timer); }
//////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU; cudaArray *a_Src; cudaChannelFormatDesc floatTex = cudaCreateChannelDesc<float>(); float *d_Output; float gpuTime; StopWatchInterface *hTimer = NULL; const int imageW = 3072; const int imageH = 3072 / 2; const unsigned int iterations = 10; printf("[%s] - Starting...\n", argv[0]); // use command-line specified CUDA device, otherwise use device with highest Gflops/s findCudaDevice(argc, (const char **)argv); sdkCreateTimer(&hTimer); printf("Initializing data...\n"); h_Kernel = (float *)malloc(KERNEL_LENGTH * sizeof(float)); h_Input = (float *)malloc(imageW * imageH * sizeof(float)); h_Buffer = (float *)malloc(imageW * imageH * sizeof(float)); h_OutputCPU = (float *)malloc(imageW * imageH * sizeof(float)); h_OutputGPU = (float *)malloc(imageW * imageH * sizeof(float)); checkCudaErrors(cudaMallocArray(&a_Src, &floatTex, imageW, imageH)); checkCudaErrors(cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float))); srand(2009); for (unsigned int i = 0; i < KERNEL_LENGTH; i++) { h_Kernel[i] = (float)(rand() % 16); } for (unsigned int i = 0; i < imageW * imageH; i++) { h_Input[i] = (float)(rand() % 16); } setConvolutionKernel(h_Kernel); checkCudaErrors(cudaMemcpyToArray(a_Src, 0, 0, h_Input, imageW * imageH * sizeof(float), cudaMemcpyHostToDevice)); printf("Running GPU rows convolution (%u identical iterations)...\n", iterations); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); for (unsigned int i = 0; i < iterations; i++) { convolutionRowsGPU( d_Output, a_Src, imageW, imageH ); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer) / (float)iterations; printf("Average convolutionRowsGPU() time: %f msecs; //%f Mpix/s\n", gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime)); //While CUDA kernels can't write to textures directly, this copy is inevitable printf("Copying convolutionRowGPU() output back to the texture...\n"); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); checkCudaErrors(cudaMemcpyToArray(a_Src, 0, 0, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToDevice)); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer); printf("cudaMemcpyToArray() time: %f msecs; //%f Mpix/s\n", gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime)); printf("Running GPU columns convolution (%i iterations)\n", iterations); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); for (int i = 0; i < iterations; i++) { convolutionColumnsGPU( d_Output, a_Src, imageW, imageH ); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer) / (float)iterations; printf("Average convolutionColumnsGPU() time: %f msecs; //%f Mpix/s\n", gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime)); printf("Reading back GPU results...\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToHost)); printf("Checking the results...\n"); printf("...running convolutionRowsCPU()\n"); convolutionRowsCPU( h_Buffer, h_Input, h_Kernel, imageW, imageH, KERNEL_RADIUS ); printf("...running convolutionColumnsCPU()\n"); convolutionColumnsCPU( h_OutputCPU, h_Buffer, h_Kernel, imageW, imageH, KERNEL_RADIUS ); double delta = 0; double sum = 0; for (unsigned int i = 0; i < imageW * imageH; i++) { sum += h_OutputCPU[i] * h_OutputCPU[i]; delta += (h_OutputGPU[i] - h_OutputCPU[i]) * (h_OutputGPU[i] - h_OutputCPU[i]); } double L2norm = sqrt(delta / sum); printf("Relative L2 norm: %E\n", L2norm); printf("Shutting down...\n"); checkCudaErrors(cudaFree(d_Output)); checkCudaErrors(cudaFreeArray(a_Src)); free(h_OutputGPU); free(h_Buffer); free(h_Input); free(h_Kernel); sdkDeleteTimer(&hTimer); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); if (L2norm > 1e-6) { printf("Test failed!\n"); exit(EXIT_FAILURE); } printf("Test passed\n"); exit(EXIT_SUCCESS); }
bool test2(void) { float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU; float *d_Data, *d_Kernel, *d_PaddedData, *d_PaddedKernel; fComplex *d_DataSpectrum0, *d_KernelSpectrum0; cufftHandle fftPlan; bool bRetVal; StopWatchInterface *hTimer = NULL; sdkCreateTimer(&hTimer); printf("Testing updated custom R2C / C2R FFT-based convolution\n"); const int kernelH = 7; const int kernelW = 6; const int kernelY = 3; const int kernelX = 4; const int dataH = 2000; const int dataW = 2000; const int fftH = snapTransformSize(dataH + kernelH - 1); const int fftW = snapTransformSize(dataW + kernelW - 1); printf("...allocating memory\n"); h_Data = (float *)malloc(dataH * dataW * sizeof(float)); h_Kernel = (float *)malloc(kernelH * kernelW * sizeof(float)); h_ResultCPU = (float *)malloc(dataH * dataW * sizeof(float)); h_ResultGPU = (float *)malloc(fftH * fftW * sizeof(float)); checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum0, fftH * (fftW / 2) * sizeof(fComplex))); checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum0, fftH * (fftW / 2) * sizeof(fComplex))); printf("...generating random input data\n"); srand(2010); for (int i = 0; i < dataH * dataW; i++) { h_Data[i] = getRand(); } for (int i = 0; i < kernelH * kernelW; i++) { h_Kernel[i] = getRand(); } printf("...creating C2C FFT plan for %i x %i\n", fftH, fftW / 2); checkCudaErrors(cufftPlan2d(&fftPlan, fftH, fftW / 2, CUFFT_C2C)); printf("...uploading to GPU and padding convolution kernel and input data\n"); checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float))); padDataClampToBorder( d_PaddedData, d_Data, fftH, fftW, dataH, dataW, kernelH, kernelW, kernelY, kernelX ); padKernel( d_PaddedKernel, d_Kernel, fftH, fftW, kernelH, kernelW, kernelY, kernelX ); //CUFFT_INVERSE works just as well... const int FFT_DIR = CUFFT_FORWARD; //Not including kernel transformation into time measurement, //since convolution kernel is not changed very frequently printf("...transforming convolution kernel\n"); checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum0, FFT_DIR)); printf("...running GPU FFT convolution: "); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_PaddedData, (cufftComplex *)d_DataSpectrum0, FFT_DIR)); spProcess2D(d_DataSpectrum0, d_DataSpectrum0, d_KernelSpectrum0, fftH, fftW / 2, FFT_DIR); checkCudaErrors(cufftExecC2C(fftPlan, (cufftComplex *)d_DataSpectrum0, (cufftComplex *)d_PaddedData, -FFT_DIR)); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double gpuTime = sdkGetTimerValue(&hTimer); printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime); printf("...reading back GPU FFT results\n"); checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost)); printf("...running reference CPU convolution\n"); convolutionClampToBorderCPU( h_ResultCPU, h_Data, h_Kernel, dataH, dataW, kernelH, kernelW, kernelY, kernelX ); printf("...comparing the results: "); double sum_delta2 = 0; double sum_ref2 = 0; double max_delta_ref = 0; for (int y = 0; y < dataH; y++) { for (int x = 0; x < dataW; x++) { double rCPU = (double)h_ResultCPU[y * dataW + x]; double rGPU = (double)h_ResultGPU[y * fftW + x]; double delta = (rCPU - rGPU) * (rCPU - rGPU); double ref = rCPU * rCPU + rCPU * rCPU; if ((delta / ref) > max_delta_ref) { max_delta_ref = delta / ref; } sum_delta2 += delta; sum_ref2 += ref; } } double L2norm = sqrt(sum_delta2 / sum_ref2); printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref)); bRetVal = (L2norm < 1e-6) ? true : false; printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n"); printf("...shutting down\n"); sdkStartTimer(&hTimer); checkCudaErrors(cufftDestroy(fftPlan)); checkCudaErrors(cudaFree(d_KernelSpectrum0)); checkCudaErrors(cudaFree(d_DataSpectrum0)); checkCudaErrors(cudaFree(d_PaddedKernel)); checkCudaErrors(cudaFree(d_PaddedData)); checkCudaErrors(cudaFree(d_Kernel)); checkCudaErrors(cudaFree(d_Data)); free(h_ResultGPU); free(h_ResultCPU); free(h_Kernel); free(h_Data); return bRetVal; }
////////////////////////////////////////////////////////////////////////// // AUTOMATIC TESTING void runSingleTest(const char *ref_file, const char *exec_path) { uint *d_output; checkCudaErrors(cudaMalloc((void **)&d_output, width*height*sizeof(uint))); checkCudaErrors(cudaMemset(d_output, 0, width*height*sizeof(uint))); float modelView[16] = { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 4.0f, 1.0f }; invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12]; invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13]; invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14]; // call CUDA kernel, writing results to PBO VolumeRender_copyInvViewMatrix(invViewMatrix, sizeof(float4)*3); filterAnimation = false; // Start timer 0 and process n loops on the GPU int nIter = 10; float scale = 2.0f/float(nIter-1); for (int i = -1; i < nIter; i++) { if (i == 0) { cudaDeviceSynchronize(); sdkStartTimer(&timer); } filterFactor = (float(i) * scale) - 1.0f; filterFactor = -filterFactor; filter(); VolumeRender_render(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale); } cudaDeviceSynchronize(); sdkStopTimer(&timer); // Get elapsed time and throughput, then log to sample and master logs double dAvgTime = sdkGetTimerValue(&timer)/(nIter * 1000.0); printf("volumeFiltering, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); getLastCudaError("Error: kernel execution FAILED"); checkCudaErrors(cudaDeviceSynchronize()); unsigned char *h_output = (unsigned char *)malloc(width*height*4); checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*4, cudaMemcpyDeviceToHost)); sdkSavePPM4ub("volumefilter.ppm", h_output, width, height); bool bTestResult = sdkComparePPM("volumefilter.ppm", sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, THRESHOLD, true); checkCudaErrors(cudaFree(d_output)); free(h_output); cleanup(); exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); }
int main(int argc, char **argv) { char *multiMethodChoice = NULL; char *scalingChoice = NULL; bool use_threads = true; bool bqatest = false; bool strongScaling = false; pArgc = &argc; pArgv = argv; printf("%s Starting...\n\n", argv[0]); if (checkCmdLineFlag(argc, (const char **)argv, "qatest")) { bqatest = true; } getCmdLineArgumentString(argc, (const char **)argv, "method", &multiMethodChoice); getCmdLineArgumentString(argc, (const char **)argv, "scaling", &scalingChoice); if (checkCmdLineFlag(argc, (const char **)argv, "h") || checkCmdLineFlag(argc, (const char **)argv, "help")) { usage(); exit(EXIT_SUCCESS); } if (multiMethodChoice == NULL) { use_threads = true; } else { if (!strcasecmp(multiMethodChoice, "threaded")) { use_threads = true; } else { use_threads = false; } } if (use_threads == false) { printf("Using single CPU thread for multiple GPUs\n"); } if (scalingChoice == NULL) { strongScaling = false; } else { if (!strcasecmp(scalingChoice, "strong")) { strongScaling = true; } else { strongScaling = false; } } //GPU number present in the system int GPU_N; checkCudaErrors(cudaGetDeviceCount(&GPU_N)); int nOptions = 256; nOptions = adjustProblemSize(GPU_N, nOptions); // select problem size int scale = (strongScaling) ? 1 : GPU_N; int OPT_N = nOptions * scale; int PATH_N = 262144; const unsigned long long SEED = 777; // initialize the timers hTimer = new StopWatchInterface*[GPU_N]; for (int i=0; i<GPU_N; i++) { sdkCreateTimer(&hTimer[i]); sdkResetTimer(&hTimer[i]); } //Input data array TOptionData *optionData = new TOptionData[OPT_N]; //Final GPU MC results TOptionValue *callValueGPU = new TOptionValue[OPT_N]; //"Theoretical" call values by Black-Scholes formula float *callValueBS = new float[OPT_N]; //Solver config TOptionPlan *optionSolver = new TOptionPlan[GPU_N]; //OS thread ID CUTThread *threadID = new CUTThread[GPU_N]; int gpuBase, gpuIndex; int i; float time; double delta, ref, sumDelta, sumRef, sumReserve; printf("MonteCarloMultiGPU\n"); printf("==================\n"); printf("Parallelization method = %s\n", use_threads ? "threaded" : "streamed"); printf("Problem scaling = %s\n", strongScaling? "strong" : "weak"); printf("Number of GPUs = %d\n", GPU_N); printf("Total number of options = %d\n", OPT_N); printf("Number of paths = %d\n", PATH_N); printf("main(): generating input data...\n"); srand(123); for (i=0; i < OPT_N; i++) { optionData[i].S = randFloat(5.0f, 50.0f); optionData[i].X = randFloat(10.0f, 25.0f); optionData[i].T = randFloat(1.0f, 5.0f); optionData[i].R = 0.06f; optionData[i].V = 0.10f; callValueGPU[i].Expected = -1.0f; callValueGPU[i].Confidence = -1.0f; } printf("main(): starting %i host threads...\n", GPU_N); //Get option count for each GPU for (i = 0; i < GPU_N; i++) { optionSolver[i].optionCount = OPT_N / GPU_N; } //Take into account cases with "odd" option counts for (i = 0; i < (OPT_N % GPU_N); i++) { optionSolver[i].optionCount++; } //Assign GPU option ranges gpuBase = 0; for (i = 0; i < GPU_N; i++) { optionSolver[i].device = i; optionSolver[i].optionData = optionData + gpuBase; optionSolver[i].callValue = callValueGPU + gpuBase; // all devices use the same global seed, but start // the sequence at a different offset optionSolver[i].seed = SEED; optionSolver[i].pathN = PATH_N; gpuBase += optionSolver[i].optionCount; } if (use_threads || bqatest) { //Start CPU thread for each GPU for (gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++) { threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]); } printf("main(): waiting for GPU results...\n"); cutWaitForThreads(threadID, GPU_N); printf("main(): GPU statistics, threaded\n"); for (i = 0; i < GPU_N; i++) { cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device)); printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name); printf("Options : %i\n", optionSolver[i].optionCount); printf("Simulation paths: %i\n", optionSolver[i].pathN); time = sdkGetTimerValue(&hTimer[i]); printf("Total time (ms.): %f\n", time); printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); } printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); sumDelta = 0; sumRef = 0; sumReserve = 0; for (i = 0; i < OPT_N; i++) { BlackScholesCall(callValueBS[i], optionData[i]); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); ref = callValueBS[i]; sumDelta += delta; sumRef += fabs(ref); if (delta > 1e-6) { sumReserve += callValueGPU[i].Confidence / delta; } #ifdef PRINT_RESULTS printf("BS: %f; delta: %E\n", callValueBS[i], delta); #endif } sumReserve /= OPT_N; } if (!use_threads || bqatest) { multiSolver(optionSolver, GPU_N); printf("main(): GPU statistics, streamed\n"); for (i = 0; i < GPU_N; i++) { cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, optionSolver[i].device)); printf("GPU Device #%i: %s\n", optionSolver[i].device, deviceProp.name); printf("Options : %i\n", optionSolver[i].optionCount); printf("Simulation paths: %i\n", optionSolver[i].pathN); } time = sdkGetTimerValue(&hTimer[0]); printf("\nTotal time (ms.): %f\n", time); printf("\tNote: This is elapsed time for all to compute.\n"); printf("Options per sec.: %f\n", OPT_N / (time * 0.001)); printf("main(): comparing Monte Carlo and Black-Scholes results...\n"); sumDelta = 0; sumRef = 0; sumReserve = 0; for (i = 0; i < OPT_N; i++) { BlackScholesCall(callValueBS[i], optionData[i]); delta = fabs(callValueBS[i] - callValueGPU[i].Expected); ref = callValueBS[i]; sumDelta += delta; sumRef += fabs(ref); if (delta > 1e-6) { sumReserve += callValueGPU[i].Confidence / delta; } #ifdef PRINT_RESULTS printf("BS: %f; delta: %E\n", callValueBS[i], delta); #endif } sumReserve /= OPT_N; } #ifdef DO_CPU printf("main(): running CPU MonteCarlo...\n"); TOptionValue callValueCPU; sumDelta = 0; sumRef = 0; for (i = 0; i < OPT_N; i++) { MonteCarloCPU( callValueCPU, optionData[i], NULL, PATH_N ); delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected); ref = callValueCPU.Expected; sumDelta += delta; sumRef += fabs(ref); printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected); printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence); } printf("L1 norm: %E\n", sumDelta / sumRef); #endif printf("Shutting down...\n"); for (int i=0; i<GPU_N; i++) { sdkStartTimer(&hTimer[i]); checkCudaErrors(cudaSetDevice(i)); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); } delete[] optionSolver; delete[] callValueBS; delete[] callValueGPU; delete[] optionData; delete[] threadID; delete[] hTimer; printf("Test Summary...\n"); printf("L1 norm : %E\n", sumDelta / sumRef); printf("Average reserve: %f\n", sumReserve); printf(sumReserve > 1.0f ? "Test passed\n" : "Test failed!\n"); exit(sumReserve > 1.0f ? EXIT_SUCCESS : EXIT_FAILURE); }
int main(int argc, char **argv) { // Start logs printf("[%s] - Starting...\n", argv[0]); //'h_' prefix - CPU (host) memory space float //Results calculated by CPU for reference *h_CallResultCPU, *h_PutResultCPU, //CPU copy of GPU results *h_CallResultGPU, *h_PutResultGPU, //CPU instance of input data *h_StockPrice, *h_OptionStrike, *h_OptionYears; //'d_' prefix - GPU (device) memory space CUdeviceptr //Results calculated by GPU d_CallResult, d_PutResult, //GPU instance of input data d_StockPrice, d_OptionStrike, d_OptionYears; double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime; StopWatchInterface *hTimer = NULL; int i; sdkCreateTimer(&hTimer); printf("Initializing data...\n"); printf("...allocating CPU memory for options.\n"); h_CallResultCPU = (float *)malloc(OPT_SZ); h_PutResultCPU = (float *)malloc(OPT_SZ); h_CallResultGPU = (float *)malloc(OPT_SZ); h_PutResultGPU = (float *)malloc(OPT_SZ); h_StockPrice = (float *)malloc(OPT_SZ); h_OptionStrike = (float *)malloc(OPT_SZ); h_OptionYears = (float *)malloc(OPT_SZ); char *ptx, *kernel_file; size_t ptxSize; kernel_file = sdkFindFilePath("BlackScholes_kernel.cuh", argv[0]); // Set a Compiler Option to have maximum register to be used by each thread. char *compile_options[1]; compile_options[0] = (char *) malloc(sizeof(char)*(strlen("--maxrregcount=16"))); strcpy((char *)compile_options[0],"--maxrregcount=16"); // Compile the kernel BlackScholes_kernel. compileFileToPTX(kernel_file, 1, (const char **)compile_options, &ptx, &ptxSize); CUmodule module = loadPTX(ptx, argc, argv); CUfunction kernel_addr; checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "BlackScholesGPU")); printf("...allocating GPU memory for options.\n"); checkCudaErrors(cuMemAlloc(&d_CallResult, OPT_SZ)); checkCudaErrors(cuMemAlloc(&d_PutResult, OPT_SZ)); checkCudaErrors(cuMemAlloc(&d_StockPrice, OPT_SZ)); checkCudaErrors(cuMemAlloc(&d_OptionStrike,OPT_SZ)); checkCudaErrors(cuMemAlloc(&d_OptionYears, OPT_SZ)); printf("...generating input data in CPU mem.\n"); srand(5347); //Generate options set for (i = 0; i < OPT_N; i++) { h_CallResultCPU[i] = 0.0f; h_PutResultCPU[i] = -1.0f; h_StockPrice[i] = RandFloat(5.0f, 30.0f); h_OptionStrike[i] = RandFloat(1.0f, 100.0f); h_OptionYears[i] = RandFloat(0.25f, 10.0f); } printf("...copying input data to GPU mem.\n"); //Copy options data to GPU memory for further processing checkCudaErrors(cuMemcpyHtoD(d_StockPrice, h_StockPrice, OPT_SZ)); checkCudaErrors(cuMemcpyHtoD(d_OptionStrike, h_OptionStrike, OPT_SZ)); checkCudaErrors(cuMemcpyHtoD(d_OptionYears, h_OptionYears, OPT_SZ)); printf("Data init done.\n\n"); printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", NUM_ITERATIONS); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); dim3 cudaBlockSize( 128, 1, 1); dim3 cudaGridSize(DIV_UP(OPT_N/2, 128),1,1); float risk = RISKFREE; float volatility = VOLATILITY; int optval = OPT_N; void *arr[] = { (void *)&d_CallResult, (void *)&d_PutResult, (void *)&d_StockPrice, (void *)&d_OptionStrike, (void *)&d_OptionYears, (void *)&risk, (void *)&volatility, (void *)&optval }; for (i = 0; i < NUM_ITERATIONS; i++) { checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, /* grid dim */ cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */ 0,0, /* shared mem, stream */ &arr[0], /* arguments */ 0)); } checkCudaErrors(cuCtxSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS; //Both call and put is calculated printf("Options count : %i \n", 2 * OPT_N); printf("BlackScholesGPU() time : %f msec\n", gpuTime); printf("Effective memory bandwidth: %f GB/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3)); printf("Gigaoptions per second : %f \n\n", ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3)); printf("BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %u, Workgroup = %u\n", (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime*1e-3, (2 * OPT_N), 1, 128); printf("\nReading back GPU results...\n"); //Read back GPU results to compare them to CPU results checkCudaErrors(cuMemcpyDtoH(h_CallResultGPU, d_CallResult, OPT_SZ)); checkCudaErrors(cuMemcpyDtoH(h_PutResultGPU, d_PutResult, OPT_SZ)); printf("Checking the results...\n"); printf("...running CPU calculations.\n\n"); //Calculate options values on CPU BlackScholesCPU( h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike, h_OptionYears, RISKFREE, VOLATILITY, OPT_N ); printf("Comparing the results...\n"); //Calculate max absolute difference and L1 distance //between CPU and GPU results sum_delta = 0; sum_ref = 0; max_delta = 0; for (i = 0; i < OPT_N; i++) { ref = h_CallResultCPU[i]; delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]); if (delta > max_delta) { max_delta = delta; } sum_delta += delta; sum_ref += fabs(ref); } L1norm = sum_delta / sum_ref; printf("L1 norm: %E\n", L1norm); printf("Max absolute error: %E\n\n", max_delta); printf("Shutting down...\n"); printf("...releasing GPU memory.\n"); checkCudaErrors(cuMemFree(d_OptionYears)); checkCudaErrors(cuMemFree(d_OptionStrike)); checkCudaErrors(cuMemFree(d_StockPrice)); checkCudaErrors(cuMemFree(d_PutResult)); checkCudaErrors(cuMemFree(d_CallResult)); printf("...releasing CPU memory.\n"); free(h_OptionYears); free(h_OptionStrike); free(h_StockPrice); free(h_PutResultGPU); free(h_CallResultGPU); free(h_PutResultCPU); free(h_CallResultCPU); sdkDeleteTimer(&hTimer); printf("Shutdown done.\n"); printf("\n[%s] - Test Summary\n", argv[0]); cuProfilerStop(); if (L1norm > 1e-6) { printf("Test failed!\n"); exit(EXIT_FAILURE); } printf("Test passed\n"); exit(EXIT_SUCCESS); }
// render Mandelbrot image using CUDA or CPU void renderImage(bool bUseOpenGL, bool fp64, int mode) { #if RUN_TIMING pass = 0; #endif if (pass < 128) { if (g_runCPU) { int startPass = pass; float xs, ys; sdkResetTimer(&hTimer); if (bUseOpenGL) { // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_dst, gl_PBO)); checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_dst, &num_bytes, cuda_pbo_resource)); } // Get the anti-alias sub-pixel sample location GetSample(pass & 127, xs, ys); // Get the pixel scale and offset double s = scale / (double)imageW; double x = (xs - (double)imageW * 0.5f) * s + xOff; double y = (ys - (double)imageH * 0.5f) * s + yOff; // Run the mandelbrot generator if (pass && !startPass) // Use the adaptive sampling version when animating. { if (precisionMode) RunMandelbrotDSGold1(h_Src, imageW, imageH, crunch, x, y, xJParam, yJParam, s, colors, pass++, animationFrame, g_isJuliaSet); else RunMandelbrotGold1(h_Src, imageW, imageH, crunch, (float)x, (float)y, (float)xJParam, (float)yJParam, (float)s, colors, pass++, animationFrame, g_isJuliaSet); } else { if (precisionMode) RunMandelbrotDSGold0(h_Src, imageW, imageH, crunch, x, y, xJParam, yJParam, s, colors, pass++, animationFrame, g_isJuliaSet); else RunMandelbrotGold0(h_Src, imageW, imageH, crunch, (float)x, (float)y, (float)xJParam, (float)yJParam, (float)s, colors, pass++, animationFrame, g_isJuliaSet); } checkCudaErrors(cudaMemcpy(d_dst, h_Src, imageW * imageH * sizeof(uchar4), cudaMemcpyHostToDevice)); if (bUseOpenGL) { // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(gl_PBO)); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); } #if RUN_TIMING printf("CPU = %5.8f\n", 0.001f * sdkGetTimerValue(&hTimer); #endif } else { float timeEstimate; int startPass = pass; sdkResetTimer(&hTimer); if (bUseOpenGL) { // DEPRECATED: checkCudaErrors(cudaGLMapBufferObject((void**)&d_dst, gl_PBO)); checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_dst, &num_bytes, cuda_pbo_resource)); } // Render anti-aliasing passes until we run out time (60fps approximately) do { float xs, ys; // Get the anti-alias sub-pixel sample location GetSample(pass & 127, xs, ys); // Get the pixel scale and offset double s = scale / (float)imageW; double x = (xs - (double)imageW * 0.5f) * s + xOff; double y = (ys - (double)imageH * 0.5f) * s + yOff; // Run the mandelbrot generator if (pass && !startPass) // Use the adaptive sampling version when animating. RunMandelbrot1(d_dst, imageW, imageH, crunch, x, y, xJParam, yJParam, s, colors, pass++, animationFrame, precisionMode, numSMs, g_isJuliaSet, version); else RunMandelbrot0(d_dst, imageW, imageH, crunch, x, y, xJParam, yJParam, s, colors, pass++, animationFrame, precisionMode, numSMs, g_isJuliaSet, version); cudaDeviceSynchronize(); // Estimate the total time of the frame if one more pass is rendered timeEstimate = 0.001f * sdkGetTimerValue(&hTimer) * ((float)(pass + 1 - startPass) / (float)(pass - startPass)); } while ((pass < 128) && (timeEstimate < 1.0f / 60.0f) && !RUN_TIMING); if (bUseOpenGL) { // DEPRECATED: checkCudaErrors(cudaGLUnmapBufferObject(gl_PBO)); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); } #if RUN_TIMING printf("GPU = %5.8f\n", 0.001f * sdkGetTimerValue(&hTimer); #endif } }
int main(int argc, char **argv) { printf("%s Starting...\n\n", argv[0]); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s findCudaDevice(argc, (const char **)argv); uint *d_Input, *d_Output; uint *h_Input, *h_OutputCPU, *h_OutputGPU; StopWatchInterface *hTimer = NULL; const uint N = 13 * 1048576 / 2; printf("Allocating and initializing host arrays...\n"); sdkCreateTimer(&hTimer); h_Input = (uint *)malloc(N * sizeof(uint)); h_OutputCPU = (uint *)malloc(N * sizeof(uint)); h_OutputGPU = (uint *)malloc(N * sizeof(uint)); srand(2009); for (uint i = 0; i < N; i++) { h_Input[i] = rand(); } printf("Allocating and initializing CUDA arrays...\n"); checkCudaErrors(cudaMalloc((void **)&d_Input, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint))); checkCudaErrors(cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice)); printf("Initializing CUDA-C scan...\n\n"); initScan(); int globalFlag = 1; size_t szWorkgroup; const int iCycles = 100; printf("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles); for (uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1) { printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); for (int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles; printf("Validating the results...\n"); printf("...reading back GPU results\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost)); printf(" ...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test printf(" ...comparing the results\n"); int localFlag = 1; for (uint i = 0; i < N; i++) { if (h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_SHORT_ARRAY_SIZE) { printf("\n"); printf("scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, (unsigned int)arrayLength, 1, (unsigned int)szWorkgroup); printf("\n"); } } printf("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles); for (uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1) { printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); for (int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles; printf("Validating the results...\n"); printf("...reading back GPU results\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost)); printf("...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test printf(" ...comparing the results\n"); int localFlag = 1; for (uint i = 0; i < N; i++) { if (h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_LARGE_ARRAY_SIZE) { printf("\n"); printf("scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, (unsigned int)arrayLength, 1, (unsigned int)szWorkgroup); printf("\n"); } } printf("Shutting down...\n"); closeScan(); checkCudaErrors(cudaFree(d_Output)); checkCudaErrors(cudaFree(d_Input)); sdkDeleteTimer(&hTimer); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); // pass or fail (cumulative... all tests in the loop) exit(globalFlag ? EXIT_SUCCESS : EXIT_FAILURE); }
int main(int argc, char * argv[] ){ //Read nifti data /* FSLIO *fslio; void *buffer; short x, y, z, v; fslio = FslInit(); buffer = FslReadAllVolumes(fslio,"/home/pteodors/openfmri/ds105/sub001/BOLD/task001_run001/bold.nii.gz"); if (buffer == NULL) { fprintf(stderr, "\nError opening and reading\n"); exit(1); } signed short *bf = (signed short *) buffer; FslGetDim(fslio, &x, &y, &z, &v); int nvol = x*y*z*v; int m = x*y*z; // x*y*z int n = v; // = v int mn = m*n; int i, j; if (argc==3){ m = atoi(argv[1]); n = atoi(argv[2]); } else if(argc == 2){ m = atoi(argv[1]); } // some sample data for testing /* double sample_data [4][5] = {{1.0, 0.0, 0.0, 0.0, 2.0}, {0.0, 0.0, 3.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0, 0.0}, {0.0, 4.0, 0.0, 0.0, 0.0}}; m = 5; n = 4; int sample_size = m*n; // testing version nifti_data_type *data = (nifti_data_type*) malloc(sizeof(nifti_data_type)*sample_size); int j; for(i=0; i < n; i++){ for(j=0; j < m; j++){ data[i*m + j] = sample_data[i][j]; } } */ if (argc < 3){ printf("You must specify name values of m and n\n"); return 0; } int m = atoi(argv[1]); int n = atoi(argv[2]); int mn = m*n; char filename[80]; sprintf(filename, "/home/pteodors/matlab_pca/%dx%d.bin", m, n); float* A = (float*) malloc(sizeof(float)*mn); FILE * pA; const unsigned int num_bytes = mn; pA = fopen(filename, "rb"); int i; unsigned int read_bytes = 0; while(num_bytes - read_bytes){ read_bytes = fread((void*)&A[read_bytes], sizeof(float), num_bytes-read_bytes, pA); } fclose(pA); nifti_data_type *data = (nifti_data_type*) malloc(sizeof(nifti_data_type)*mn); // prepare data format for(i=0; i < mn; i++){ //data[i] = (nifti_data_type) bf[i]; data[i] = (nifti_data_type) A[i]; } free(A); //FslClose(fslio); int ncomponents = 20; nifti_data_type* coeff = (nifti_data_type*) malloc(m*ncomponents*sizeof(nifti_data_type)); int interations = 20; int i; StopWatchInterface *timer = NULL; sdkCreateTimer(&timer); sdkStartTimer(&timer); for(i=0; i<interations; i++){ runPCA(data, m, n, ncomponents, coeff); //x*y*z, v } sdkStopTimer(&timer); printf("Processing time: %f ms\n", sdkGetTimerValue(&timer)/(float)interations); sdkDeleteTimer(&timer); free(data); free(coeff); return 0; }
bool test0(void) { float *h_Data, *h_Kernel, *h_ResultCPU, *h_ResultGPU; float *d_Data, *d_PaddedData, *d_Kernel, *d_PaddedKernel; fComplex *d_DataSpectrum, *d_KernelSpectrum; cufftHandle fftPlanFwd, fftPlanInv; bool bRetVal; StopWatchInterface *hTimer = NULL; sdkCreateTimer(&hTimer); printf("Testing built-in R2C / C2R FFT-based convolution\n"); const int kernelH = 3; const int kernelW = 3; const int kernelY = 1; const int kernelX = 1; const int dataH = 10; const int dataW = 10; const int fftH = snapTransformSize(dataH + kernelH - 1); const int fftW = snapTransformSize(dataW + kernelW - 1); printf("...allocating memory\n"); h_Data = (float *)malloc(dataH * dataW * sizeof(float)); h_Kernel = (float *)malloc(kernelH * kernelW * sizeof(float)); h_ResultCPU = (float *)malloc(dataH * dataW * sizeof(float)); h_ResultGPU = (float *)malloc(fftH * fftW * sizeof(float)); checkCudaErrors(cudaMalloc((void **)&d_Data, dataH * dataW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_PaddedData, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex))); checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex))); printf("...generating random input data\n"); srand(2010); for (int i = 0; i < dataH * dataW; i++) { //h_Data[i] = getRand(); h_Data[i] = i + 1; } for (int i = 0; i < kernelH * kernelW; i++) { //h_Kernel[i] = getRand(); h_Kernel[i] = i + 1; } FILE* fp2 = fopen("input_kernel.txt", "w+"); FILE* fp3 = fopen("input_data.txt", "w+"); for (int i = 0; i < dataH * dataW; i++) fprintf(fp3, "%f\n", h_Data[i]); for (int i = 0; i < kernelH * kernelW; i++) fprintf(fp2, "%f\n", h_Kernel[i]); fclose(fp2); fclose(fp3); printf("...creating R2C & C2R FFT plans for %i x %i\n", fftH, fftW); checkCudaErrors(cufftPlan2d(&fftPlanFwd, fftH, fftW, CUFFT_R2C)); checkCudaErrors(cufftPlan2d(&fftPlanInv, fftH, fftW, CUFFT_C2R)); printf("...uploading to GPU and padding convolution kernel and input data\n"); checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_Data, h_Data, dataH * dataW * sizeof(float), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float))); checkCudaErrors(cudaMemset(d_PaddedData, 0, fftH * fftW * sizeof(float))); padKernel( d_PaddedKernel, d_Kernel, fftH, fftW, kernelH, kernelW, kernelY, kernelX ); padDataClampToBorder( d_PaddedData, d_Data, fftH, fftW, dataH, dataW, kernelH, kernelW, kernelY, kernelX ); //Not including kernel transformation into time measurement, //since convolution kernel is not changed very frequently printf("...transforming convolution kernel\n"); checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum)); printf("...running GPU FFT convolution: "); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData, (cufftComplex *)d_DataSpectrum)); modulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftH, fftW, 1); checkCudaErrors(cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, (cufftReal *)d_PaddedData)); checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double gpuTime = sdkGetTimerValue(&hTimer); printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime); printf("...reading back GPU convolution results\n"); checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost)); printf("...running reference CPU convolution\n"); convolutionClampToBorderCPU( h_ResultCPU, h_Data, h_Kernel, dataH, dataW, kernelH, kernelW, kernelY, kernelX ); printf("...comparing the results: "); double sum_delta2 = 0; double sum_ref2 = 0; double max_delta_ref = 0; for (int y = 0; y < dataH; y++) for (int x = 0; x < dataW; x++) { double rCPU = (double)h_ResultCPU[y * dataW + x]; double rGPU = (double)h_ResultGPU[y * fftW + x]; double delta = (rCPU - rGPU) * (rCPU - rGPU); double ref = rCPU * rCPU + rCPU * rCPU; if ((delta / ref) > max_delta_ref) { max_delta_ref = delta / ref; } sum_delta2 += delta; sum_ref2 += ref; } double L2norm = sqrt(sum_delta2 / sum_ref2); printf("rel L2 = %E (max delta = %E)\n", L2norm, sqrt(max_delta_ref)); bRetVal = (L2norm < 1e-6) ? true : false; printf(bRetVal ? "L2norm Error OK\n" : "L2norm Error too high!\n"); printf("...shutting down\n"); sdkStartTimer(&hTimer); checkCudaErrors(cufftDestroy(fftPlanInv)); checkCudaErrors(cufftDestroy(fftPlanFwd)); checkCudaErrors(cudaFree(d_DataSpectrum)); checkCudaErrors(cudaFree(d_KernelSpectrum)); checkCudaErrors(cudaFree(d_PaddedData)); checkCudaErrors(cudaFree(d_PaddedKernel)); checkCudaErrors(cudaFree(d_Data)); checkCudaErrors(cudaFree(d_Kernel)); FILE* fp = fopen("result_gpu.txt", "w+"); FILE* fp1 = fopen("result_cpu.txt", "w+"); for (int i = 0; i < dataH * dataW; i++) { fprintf(fp, "%f\n", h_ResultGPU[i]); fprintf(fp1, "%f\n", h_ResultCPU[i]); } fclose(fp); fclose(fp1); free(h_ResultGPU); free(h_ResultCPU); free(h_Data); free(h_Kernel); return bRetVal; }
/////////////////////////////////////////////////////////////////////////////// // Main program /////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { // Start logs shrQAStart(argc, argv); // initialize the GPU, either identified by --device // or by picking the device with highest flop rate. int devID = findCudaDevice(argc, (const char **)argv); // parsing the number of random numbers to generate int rand_n = DEFAULT_RAND_N; if( checkCmdLineFlag(argc, (const char**) argv, "count") ) { rand_n = getCmdLineArgumentInt(argc, (const char**) argv, "count"); } printf("Allocating data for %i samples...\n", rand_n); // parsing the seed int seed = DEFAULT_SEED; if( checkCmdLineFlag(argc, (const char**) argv, "seed") ) { seed = getCmdLineArgumentInt(argc, (const char**) argv, "seed"); } printf("Seeding with %i ...\n", seed); float *d_Rand; checkCudaErrors( cudaMalloc((void **)&d_Rand, rand_n * sizeof(float)) ); curandGenerator_t prngGPU; checkCurandErrors( curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32) ); checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngGPU, seed) ); curandGenerator_t prngCPU; checkCurandErrors( curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32) ); checkCurandErrors( curandSetPseudoRandomGeneratorSeed(prngCPU, seed) ); // // Example 1: Compare random numbers generated on GPU and CPU float *h_RandGPU = (float *)malloc(rand_n * sizeof(float)); printf("Generating random numbers on GPU...\n\n"); checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) ); printf("\nReading back the results...\n"); checkCudaErrors( cudaMemcpy(h_RandGPU, d_Rand, rand_n * sizeof(float), cudaMemcpyDeviceToHost) ); float *h_RandCPU = (float *)malloc(rand_n * sizeof(float)); printf("Generating random numbers on CPU...\n\n"); checkCurandErrors( curandGenerateUniform(prngCPU, (float*) h_RandCPU, rand_n) ); printf("Comparing CPU/GPU random numbers...\n\n"); float L1norm = compareResults(rand_n, h_RandGPU, h_RandCPU); // // Example 2: Timing of random number generation on GPU const int numIterations = 10; int i; StopWatchInterface *hTimer; checkCudaErrors( cudaDeviceSynchronize() ); sdkCreateTimer(&hTimer); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); for (i = 0; i < numIterations; i++) { checkCurandErrors( curandGenerateUniform(prngGPU, (float*) d_Rand, rand_n) ); } checkCudaErrors( cudaDeviceSynchronize() ); sdkStopTimer(&hTimer); double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer)/(double)numIterations; printf("MersenneTwister, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers\n", 1.0e-9 * rand_n / gpuTime, gpuTime, rand_n); printf("Shutting down...\n"); checkCurandErrors( curandDestroyGenerator(prngGPU) ); checkCurandErrors( curandDestroyGenerator(prngCPU) ); checkCudaErrors( cudaFree(d_Rand) ); sdkDeleteTimer( &hTimer); free(h_RandGPU); free(h_RandCPU); cudaDeviceReset(); shrQAFinishExit(argc, (const char**)argv, (L1norm < 1e-6) ? QA_PASSED : QA_FAILED); }
void display() { static double gflops = 0; static double ifps = 0; static double interactionsPerSecond = 0; // update the simulation if (!bPause) { if (cycleDemo && (sdkGetTimerValue(&demoTimer) > demoTime)) { activeDemo = (activeDemo + 1) % numDemos; selectDemo(activeDemo); } updateSimulation(); if (!useCpu) { cudaEventRecord(hostMemSyncEvent, 0); // insert an event to wait on before rendering } } glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); if (displayEnabled) { // view transform { glMatrixMode(GL_MODELVIEW); glLoadIdentity(); for (int c = 0; c < 3; ++c) { camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia; camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia; } glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]); glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0); glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0); } displayNBodySystem(); // display user interface if (bShowSliders) { glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color glEnable(GL_BLEND); paramlist->Render(0, 0); glDisable(GL_BLEND); } if (bFullscreen) { beginWinCoords(); char msg0[256], msg1[256], msg2[256]; if (bDispInteractions) { sprintf(msg1, "%0.2f billion interactions per second", interactionsPerSecond); } else { sprintf(msg1, "%0.2f GFLOP/s", gflops); } sprintf(msg0, "%s", deviceName); sprintf(msg2, "%0.2f FPS [%s | %d bodies]", ifps, fp64 ? "double precision" : "single precision", numBodies); glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color glEnable(GL_BLEND); glColor3f(0.46f, 0.73f, 0.0f); glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 122, msg0, GLUT_BITMAP_TIMES_ROMAN_24); glColor3f(1.0f, 1.0f, 1.0f); glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 96, msg2, GLUT_BITMAP_TIMES_ROMAN_24); glColor3f(1.0f, 1.0f, 1.0f); glPrint(80, glutGet(GLUT_WINDOW_HEIGHT) - 70, msg1, GLUT_BITMAP_TIMES_ROMAN_24); glDisable(GL_BLEND); endWinCoords(); } glutSwapBuffers(); } fpsCount++; // this displays the frame rate updated every second (independent of frame rate) if (fpsCount >= fpsLimit) { char fps[256]; float milliseconds = 1; // stop timer if (useCpu) { milliseconds = sdkGetTimerValue(&timer); sdkResetTimer(&timer); } else { checkCudaErrors(cudaEventRecord(stopEvent, 0)); checkCudaErrors(cudaEventSynchronize(stopEvent)); checkCudaErrors(cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); } milliseconds /= (float)fpsCount; computePerfStats(interactionsPerSecond, gflops, milliseconds, 1); ifps = 1.f / (milliseconds / 1000.f); sprintf(fps, "CUDA N-Body (%d bodies): " "%0.1f fps | %0.1f BIPS | %0.1f GFLOP/s | %s", numBodies, ifps, interactionsPerSecond, gflops, fp64 ? "double precision" : "single precision"); glutSetWindowTitle(fps); fpsCount = 0; fpsLimit = (ifps > 1.f) ? (int)ifps : 1; if (bPause) { fpsLimit = 0; } // restart timer if (!useCpu) { checkCudaErrors(cudaEventRecord(startEvent, 0)); } } glutReportErrors(); }
//////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { // start logs printf("[%s] - Starting...\n", argv[0]); float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU; float *d_Input, *d_Output, *d_Buffer; const int imageW = 3072; const int imageH = 3072; const int iterations = 16; StopWatchInterface *hTimer = NULL; //Use command-line specified CUDA device, otherwise use device with highest Gflops/s findCudaDevice(argc, (const char **)argv); sdkCreateTimer(&hTimer); printf("Image Width x Height = %i x %i\n\n", imageW, imageH); printf("Allocating and initializing host arrays...\n"); h_Kernel = (float *)malloc(KERNEL_LENGTH * sizeof(float)); h_Input = (float *)malloc(imageW * imageH * sizeof(float)); h_Buffer = (float *)malloc(imageW * imageH * sizeof(float)); h_OutputCPU = (float *)malloc(imageW * imageH * sizeof(float)); h_OutputGPU = (float *)malloc(imageW * imageH * sizeof(float)); srand(200); for (unsigned int i = 0; i < KERNEL_LENGTH; i++) { h_Kernel[i] = (float)(rand() % 16); } for (unsigned i = 0; i < imageW * imageH; i++) { h_Input[i] = (float)(rand() % 16); } printf("Allocating and initializing CUDA arrays...\n"); checkCudaErrors(cudaMalloc((void **)&d_Input, imageW * imageH * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_Buffer , imageW * imageH * sizeof(float))); setConvolutionKernel(h_Kernel); checkCudaErrors(cudaMemcpy(d_Input, h_Input, imageW * imageH * sizeof(float), cudaMemcpyHostToDevice)); printf("Running GPU convolution (%u identical iterations)...\n\n", iterations); for (int i = -1; i < iterations; i++) { //i == -1 -- warmup iteration if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); } convolutionRowsGPU( d_Buffer, d_Input, imageW, imageH ); convolutionColumnsGPU( d_Output, d_Buffer, imageW, imageH ); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double gpuTime = 0.001 * sdkGetTimerValue(&hTimer) / (double)iterations; printf("convolutionSeparable, Throughput = %.4f MPixels/sec, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0); printf("\nReading back GPU results...\n\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToHost)); printf("Checking the results...\n"); printf(" ...running convolutionRowCPU()\n"); convolutionRowCPU( h_Buffer, h_Input, h_Kernel, imageW, imageH, KERNEL_RADIUS ); printf(" ...running convolutionColumnCPU()\n"); convolutionColumnCPU( h_OutputCPU, h_Buffer, h_Kernel, imageW, imageH, KERNEL_RADIUS ); printf(" ...comparing the results\n"); double sum = 0, delta = 0; for (unsigned i = 0; i < imageW * imageH; i++) { delta += (h_OutputGPU[i] - h_OutputCPU[i]) * (h_OutputGPU[i] - h_OutputCPU[i]); sum += h_OutputCPU[i] * h_OutputCPU[i]; } double L2norm = sqrt(delta / sum); printf(" ...Relative L2 norm: %E\n\n", L2norm); printf("Shutting down...\n"); checkCudaErrors(cudaFree(d_Buffer)); checkCudaErrors(cudaFree(d_Output)); checkCudaErrors(cudaFree(d_Input)); free(h_OutputGPU); free(h_OutputCPU); free(h_Buffer); free(h_Input); free(h_Kernel); sdkDeleteTimer(&hTimer); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); if (L2norm > 1e-6) { printf("Test failed!\n"); exit(EXIT_FAILURE); } printf("Test passed\n"); exit(EXIT_SUCCESS); }