void cleanup()
{
    sdkDeleteTimer(&timer);
    sdkDeleteTimer(&kernel_timer);

    if (h_img)
    {
        free(h_img);
        h_img = NULL;
    }

    if (d_img)
    {
        cudaFree(d_img);
        d_img = NULL;
    }

    if (d_temp)
    {
        cudaFree(d_temp);
        d_temp = NULL;
    }

    // Refer to boxFilter_kernel.cu for implementation
    freeTextures();

    cudaGraphicsUnregisterResource(cuda_pbo_resource);

    glDeleteBuffersARB(1, &pbo);
    glDeleteTextures(1, &texid);
    glDeleteProgramsARB(1, &shader);
}
void cleanup()
{
    sdkDeleteTimer(&timer);
    sdkDeleteTimer(&kernel_timer);

    if (hImage)
    {
        free(hImage);
    }

    freeTextures();

    // DEPRECATED: checkCudaErrors(cudaGLUnregisterBufferObject(pbo));
    cudaGraphicsUnregisterResource(cuda_pbo_resource);

    glDeleteBuffersARB(1, &pbo);
    glDeleteTextures(1, &texid);
    glDeleteProgramsARB(1, &shader);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits.
    cudaDeviceReset();
}
void copy_image(PPM_IMG img_in)
{
    StopWatchInterface *timer = NULL;
    PPM_IMG host_img;
    PPM_IMG device_img;
    int size = img_in.w * img_in.h * sizeof(unsigned char);

    host_img.w = img_in.w;
    host_img.h = img_in.h;
    host_img.img_r = (unsigned char *)malloc(size);
    host_img.img_g = (unsigned char *)malloc(size);
    host_img.img_b = (unsigned char *)malloc(size);

    device_img.w = img_in.w;
    device_img.h = img_in.h;
    cudaMalloc((void **)&(device_img.img_r), size);
    cudaMalloc((void **)&(device_img.img_g), size);
    cudaMalloc((void **)&(device_img.img_b), size);

    launchEmptyKernel(); // launch an empty kernel to warm up the device before timing

    printf("Starting copy image...\n");

    // CPU to GPU
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    cudaMemcpy(device_img.img_r, img_in.img_r, size, cudaMemcpyHostToDevice);
    cudaMemcpy(device_img.img_g, img_in.img_g, size, cudaMemcpyHostToDevice);
    cudaMemcpy(device_img.img_b, img_in.img_b, size, cudaMemcpyHostToDevice);
    sdkStopTimer(&timer);
    printf("Time of copy image from CPU to GPU: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // GPU to CPU
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    cudaMemcpy(host_img.img_r, device_img.img_r, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(host_img.img_g, device_img.img_g, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(host_img.img_b, device_img.img_b, size, cudaMemcpyDeviceToHost);
    sdkStopTimer(&timer);
    printf("Time of copy image from GPU to CPU: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    cudaFree(device_img.img_r);
    cudaFree(device_img.img_g);
    cudaFree(device_img.img_b);
    free(host_img.img_r);
    free(host_img.img_g);
    free(host_img.img_b);
}
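// launchEmptyKernel() is called above (and in run_gpu_color_test below) but is
// not defined in this section. A minimal sketch of such a warm-up helper
// follows; the kernel body is an assumption, not the original implementation.
// The point of the warm-up is to pay the one-time context/module
// initialization cost before the timed transfers start.
__global__ void emptyKernel() {}

void launchEmptyKernel()
{
    emptyKernel<<<1, 1>>>();  // do-nothing launch triggers lazy CUDA initialization
    cudaDeviceSynchronize();  // make sure initialization has finished before timing begins
}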
void runAutoTest(const char *ref_file, char *exec_path)
{
    checkCudaErrors(cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4));

    // render the volume data
    render_kernel(gridSize, blockSize, d_output, width, height, w);

    checkCudaErrors(cudaDeviceSynchronize());
    getLastCudaError("render_kernel failed");

    void *h_output = malloc(width * height * sizeof(GLubyte) * 4);
    checkCudaErrors(cudaMemcpy(h_output, d_output, width * height * sizeof(GLubyte) * 4, cudaMemcpyDeviceToHost));
    sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4, "simpleTexture3D.bin");

    bool bTestResult = sdkCompareBin2BinFloat("simpleTexture3D.bin",
                                              sdkFindFilePath(ref_file, exec_path),
                                              width * height,
                                              MAX_EPSILON_ERROR, THRESHOLD, exec_path);

    checkCudaErrors(cudaFree(d_output));
    free(h_output);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits.
    cudaDeviceReset();

    sdkStopTimer(&timer);
    sdkDeleteTimer(&timer);

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
template <typename T>
void siTest(T *d_ptclA, T *d_ptclA_new, T *d_wghtA, unsigned int size, int stateDim)
{
    int blocks, threads;
    float elapsedTimeInMs = 0.0f;

    threads = BLOCK_SIZE;
    blocks = (size + threads - 1) / threads;

#ifdef NVS
    // If the grid is too large, halve the block count and double the thread
    // count until the grid fits within GRID_LIMIT.
    while (blocks > GRID_LIMIT)
    {
        blocks >>= 1;
        threads <<= 1;
    }
#endif

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);

    for (int i = 0; i < TEST_ITERATIONS; i++)
    {
        cudaDeviceSynchronize();
        sdkStartTimer(&timer);
        SI<T>(blocks, threads, d_ptclA, d_ptclA_new, d_wghtA, size, stateDim);
        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&timer);
    }

    elapsedTimeInMs = sdkGetAverageTimerValue(&timer);
    printf("%f\t", elapsedTimeInMs);
    printf("size=%u, stateDim=%d, blocks=%d, threads=%d\n", size, stateDim, blocks, threads);
    sdkDeleteTimer(&timer);
}
~NBodyDemo()
{
    if (m_nbodyCpu)
    {
        delete m_nbodyCpu;
    }

    if (m_nbodyCuda)
    {
        delete m_nbodyCuda;
    }

    if (m_hPos)
    {
        delete [] m_hPos;
    }

    if (m_hVel)
    {
        delete [] m_hVel;
    }

    if (m_hColor)
    {
        delete [] m_hColor;
    }

    sdkDeleteTimer(&demoTimer);

    if (!benchmark && !compareToCPU)
    {
        delete m_renderer;
    }
}
void cleanup()
{
    sdkDeleteTimer(&timer);

    checkCudaErrors(cudaFree(d_img));
    checkCudaErrors(cudaFree(d_temp));

    if (!runBenchmark)
    {
        if (pbo)
        {
            // Note: cudaGLUnregisterBufferObject is deprecated; the sketch
            // after this function shows the cudaGraphicsUnregisterResource
            // replacement used by the other snippets in this section.
            checkCudaErrors(cudaGLUnregisterBufferObject(pbo));
            glDeleteBuffersARB(1, &pbo);
        }

        if (texid)
        {
            glDeleteTextures(1, &texid);
        }
    }

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits.
    cudaDeviceReset();
}
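// The snippet above still unregisters the PBO through the legacy cudaGL* API.
// A minimal sketch of the modern graphics-interop pairing follows, assuming
// globals `GLuint pbo` and `cudaGraphicsResource *cuda_pbo_resource` as used
// elsewhere in this section; the helper names are illustrative, not part of
// the original sample.
void registerPBO()
{
    // Register once, after the GL buffer object is created.
    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo,
                                                 cudaGraphicsMapFlagsWriteDiscard));
}

void unregisterPBO()
{
    // Unregister during cleanup, before deleting the GL buffer object.
    checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource));
    glDeleteBuffersARB(1, &pbo);
}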
void cleanup(void)
{
    cudaGraphicsUnregisterResource(cuda_vbo_resource);

    unbindTexture();
    deleteTexture();

    // Free all host and device resources
    free(hvfield);
    free(particles);

    cudaFree(dvfield);
    cudaFree(vxfield);
    cudaFree(vyfield);
    cufftDestroy(planr2c);
    cufftDestroy(planc2r);

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
    glDeleteBuffersARB(1, &vbo);

    sdkDeleteTimer(&timer);

    if (g_bExitESC)
    {
        checkCudaErrors(cudaDeviceReset());
    }
}
void cleanup()
{
    sdkDeleteTimer(&timer);
    sdkDeleteTimer(&animationTimer);

    Volume_deinit(&volumeOriginal);
    Volume_deinit(&volumeFilter0);
    Volume_deinit(&volumeFilter1);
    VolumeRender_deinit();

    if (pbo)
    {
        cudaGraphicsUnregisterResource(cuda_pbo_resource);
        glDeleteBuffersARB(1, &pbo);
        glDeleteTextures(1, &volumeTex);
    }
}
void cleanup()
{
    sdkDeleteTimer(&timer);

    if (vbo)
    {
        deleteVBO(&vbo, cuda_vbo_resource);
    }
}
void cleanup()
{
    sdkDeleteTimer(&timer);

    if (psystem)
    {
        delete psystem;
    }
}
void cleanup(void)
{
    cudaGraphicsUnregisterResource(cuda_pbo_resource);

    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
    glDeleteBuffers(1, &pbo_buffer);
    glDeleteTextures(1, &texid);

    deleteTexture();

    sdkDeleteTimer(&timer);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    // parse arguments
    char *filename;

    printf("Starting bicubicTexture\n");

    if (checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        printHelp();
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "mode"))
    {
        g_FilterMode = (eFilterMode)getCmdLineArgumentInt(argc, (const char **)argv, "mode");

        // reject modes outside the supported range [MODE_NEAREST, MODE_CATMULL_ROM]
        if (g_FilterMode < MODE_NEAREST || g_FilterMode > MODE_CATMULL_ROM)
        {
            printf("Invalid Mode setting %d\n", g_FilterMode);
            exit(EXIT_FAILURE);
        }
    }

    if (getCmdLineArgumentString(argc, (const char **)argv, "file", &filename))
    {
        dumpFilename = filename;
        fpsLimit = frameCheckNumber;

        // Running CUDA kernel (bicubicFiltering) without visualization (QA Testing/Verification)
        runAutoTest(argc, argv, (const char *)dumpFilename, g_FilterMode);
    }
    else
    {
        // This runs the CUDA kernel (bicubicFiltering) + OpenGL visualization
        initialize(argc, argv);
        glutMainLoop();
        sdkDeleteTimer(&timer);

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice. It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits.
        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }

    exit(EXIT_SUCCESS);
}
void cleanup()
{
    sdkDeleteTimer(&timer);
    sdkDeleteTimer(&kernel_timer);

    if (h_img)
    {
        free(h_img);
        h_img = NULL;
    }

    if (d_img)
    {
        cudaFree(d_img);
        d_img = NULL;
    }

    if (d_temp)
    {
        cudaFree(d_temp);
        d_temp = NULL;
    }

    // Refer to boxFilter_kernel.cu for implementation
    freeTextures();

    cudaGraphicsUnregisterResource(cuda_pbo_resource);

    glDeleteBuffersARB(1, &pbo);
    glDeleteTextures(1, &texid);
    glDeleteProgramsARB(1, &shader);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits.
    cudaDeviceReset();
}
void run_cpu_color_test(PPM_IMG img_in)
{
    StopWatchInterface *timer = NULL;

    printf("Starting CPU processing...\n");

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_yuv_cpu = rgb2yuv(img_in); // RGB to YUV
    sdkStopTimer(&timer);
    printf("RGB to YUV conversion time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_rgb_cpu = yuv2rgb(img_obuf_yuv_cpu); // YUV to RGB
    sdkStopTimer(&timer);
    printf("YUV to RGB conversion time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    write_yuv(img_obuf_yuv_cpu, "out_yuv.yuv");
    write_ppm(img_obuf_rgb_cpu, "out_rgb.ppm");
}
void run_gpu_color_test(PPM_IMG img_in)
{
    StopWatchInterface *timer = NULL;

    launchEmptyKernel(); // launch an empty kernel to warm up the device before timing

    printf("Starting GPU processing...\n");

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_yuv_gpu = rgb2yuvGPU(img_in); // RGB to YUV
    sdkStopTimer(&timer);
    printf("RGB to YUV conversion time(GPU): %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    img_obuf_rgb_gpu = yuv2rgbGPU(img_obuf_yuv_gpu); // YUV to RGB
    sdkStopTimer(&timer);
    printf("YUV to RGB conversion time(GPU): %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    write_ppm(img_obuf_rgb_gpu, "out_rgb.ppm");
    write_yuv(img_obuf_yuv_gpu, "out_yuv.yuv");
}
void cleanup()
{
    sdkDeleteTimer(&timer);
    sdkDeleteTimer(&animationTimer);

    Volume_deinit(&volumeOriginal);
    Volume_deinit(&volumeFilter0);
    Volume_deinit(&volumeFilter1);
    VolumeRender_deinit();

    if (pbo)
    {
        cudaGraphicsUnregisterResource(cuda_pbo_resource);
        glDeleteBuffersARB(1, &pbo);
        glDeleteTextures(1, &volumeTex);
    }

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits.
    cudaDeviceReset();
}
void cleanup()
{
    sdkDeleteTimer(&timer);

    if (psystem)
    {
        delete psystem;
    }

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits.
    cudaDeviceReset();
}
void cleanup()
{
    free(h_Src);

    checkCudaErrors(CUDA_FreeArray());
    checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource));

    glDeleteProgramsARB(1, &shader);

    sdkDeleteTimer(&timer);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits.
    cudaDeviceReset();
}
void cleanup(void)
{
    cudaGraphicsUnregisterResource(cuda_pbo_resource);

    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
    glDeleteBuffers(1, &pbo_buffer);
    glDeleteTextures(1, &texid);

    deleteTexture();

    sdkDeleteTimer(&timer);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits.
    cudaDeviceReset();
}
void cleanup()
{
    sdkDeleteTimer(&timer);

    // add extra check to unmap the resource before unregistering it
    if (g_GraphicsMapFlag)
    {
        cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);
        g_GraphicsMapFlag--;
    }

    // unregister this buffer object from CUDA C
    cudaGraphicsUnregisterResource(cuda_pbo_resource);
    glDeleteBuffersARB(1, &pbo);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits.
    cudaDeviceReset();
}
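// The g_GraphicsMapFlag bookkeeping above guards against unregistering a
// resource that is still mapped. A minimal sketch of the per-frame counterpart
// that would increment the flag follows; the display function name and the
// kernel launch are assumptions for illustration, not the original sample.
void displayFrame()
{
    uchar4 *d_dst = NULL;
    size_t num_bytes = 0;

    // Map the PBO for CUDA access and record that it is mapped.
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    g_GraphicsMapFlag++;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_dst, &num_bytes,
                                                         cuda_pbo_resource));

    renderKernel(d_dst); // hypothetical kernel launch writing into the PBO

    // Unmap before OpenGL consumes the buffer, and balance the flag.
    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
    g_GraphicsMapFlag--;
}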
void cleanup()
{
    freeTexture();
    checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource));

    glDeleteBuffersARB(1, &pbo);

#if USE_BUFFER_TEX
    glDeleteTextures(1, &bufferTex);
    glDeleteProgramsARB(1, &fprog);
#else
    glDeleteTextures(1, &displayTex);
#endif

    sdkDeleteTimer(&timer);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits.
    cudaDeviceReset();
}
void cleanup(void)
{
    cudaGraphicsUnregisterResource(cuda_vbo_resource);

    unbindTexture();
    deleteTexture();

    // Free all host and device resources
    free(hvfield);
    free(particles);

#ifdef BROADCAST
    free(packets);
#endif

    cudaFree(dvfield);
    cudaFree(vxfield);
    cudaFree(vyfield);
    cufftDestroy(planr2c);
    cufftDestroy(planc2r);

    glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
    glDeleteBuffersARB(1, &vbo);

    sdkDeleteTimer(&timer);
}
int main(int argc, char **argv) { printf("%s Starting...\n\n", argv[0]); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s findCudaDevice(argc, (const char **)argv); uint *d_Input, *d_Output; uint *h_Input, *h_OutputCPU, *h_OutputGPU; StopWatchInterface *hTimer = NULL; const uint N = 13 * 1048576 / 2; printf("Allocating and initializing host arrays...\n"); sdkCreateTimer(&hTimer); h_Input = (uint *)malloc(N * sizeof(uint)); h_OutputCPU = (uint *)malloc(N * sizeof(uint)); h_OutputGPU = (uint *)malloc(N * sizeof(uint)); srand(2009); for (uint i = 0; i < N; i++) { h_Input[i] = rand(); } printf("Allocating and initializing CUDA arrays...\n"); checkCudaErrors(cudaMalloc((void **)&d_Input, N * sizeof(uint))); checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint))); checkCudaErrors(cudaMemcpy(d_Input, h_Input, N * sizeof(uint), cudaMemcpyHostToDevice)); printf("Initializing CUDA-C scan...\n\n"); initScan(); int globalFlag = 1; size_t szWorkgroup; const int iCycles = 100; printf("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles); for (uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength <<= 1) { printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); for (int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveShort(d_Output, d_Input, N / arrayLength, arrayLength); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles; printf("Validating the results...\n"); printf("...reading back GPU results\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost)); printf(" ...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test printf(" ...comparing the results\n"); int localFlag = 1; for (uint i = 0; i < N; i++) { if (h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag printf(" ...Results %s\n\n", (localFlag == 1) ? 
"Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_SHORT_ARRAY_SIZE) { printf("\n"); printf("scan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, (unsigned int)arrayLength, 1, (unsigned int)szWorkgroup); printf("\n"); } } printf("***Running GPU scan for large arrays (%u identical iterations)...\n\n", iCycles); for (uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength <<= 1) { printf("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); for (int i = 0; i < iCycles; i++) { szWorkgroup = scanExclusiveLarge(d_Output, d_Input, N / arrayLength, arrayLength); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); double timerValue = 1.0e-3 * sdkGetTimerValue(&hTimer) / iCycles; printf("Validating the results...\n"); printf("...reading back GPU results\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, N * sizeof(uint), cudaMemcpyDeviceToHost)); printf("...scanExclusiveHost()\n"); scanExclusiveHost(h_OutputCPU, h_Input, N / arrayLength, arrayLength); // Compare GPU results with CPU results and accumulate error for this test printf(" ...comparing the results\n"); int localFlag = 1; for (uint i = 0; i < N; i++) { if (h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag printf(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; // Data log if (arrayLength == MAX_LARGE_ARRAY_SIZE) { printf("\n"); printf("scan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, (unsigned int)arrayLength, 1, (unsigned int)szWorkgroup); printf("\n"); } } printf("Shutting down...\n"); closeScan(); checkCudaErrors(cudaFree(d_Output)); checkCudaErrors(cudaFree(d_Input)); sdkDeleteTimer(&hTimer); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); // pass or fail (cumulative... all tests in the loop) exit(globalFlag ? EXIT_SUCCESS : EXIT_FAILURE); }
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
    uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
    StopWatchInterface *hTimer = NULL;

    const uint N = 4 * 1048576;
    const uint DIR = 1;
    const uint numValues = 65536;

    printf("%s Starting...\n\n", argv[0]);

    int dev = findCudaDevice(argc, (const char **)argv);

    if (dev == -1)
    {
        return EXIT_FAILURE;
    }

    printf("Allocating and initializing host arrays...\n\n");
    sdkCreateTimer(&hTimer);
    h_SrcKey = (uint *)malloc(N * sizeof(uint));
    h_SrcVal = (uint *)malloc(N * sizeof(uint));
    h_DstKey = (uint *)malloc(N * sizeof(uint));
    h_DstVal = (uint *)malloc(N * sizeof(uint));

    srand(2009);

    for (uint i = 0; i < N; i++)
    {
        h_SrcKey[i] = rand() % numValues;
    }

    fillValues(h_SrcVal, N);

    printf("Allocating and initializing CUDA arrays...\n\n");
    checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
    checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));

    printf("Initializing GPU merge sort...\n");
    initMergeSort();

    printf("Running GPU merge sort...\n");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));

    printf("Reading back GPU merge sort results...\n");
    checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));

    printf("Inspecting the results...\n");
    uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);
    uint valuesFlag = validateSortedValues(h_DstKey, h_DstVal, h_SrcKey, 1, N);

    printf("Shutting down...\n");
    closeMergeSort();
    sdkDeleteTimer(&hTimer);
    checkCudaErrors(cudaFree(d_SrcVal));
    checkCudaErrors(cudaFree(d_SrcKey));
    checkCudaErrors(cudaFree(d_BufVal));
    checkCudaErrors(cudaFree(d_BufKey));
    checkCudaErrors(cudaFree(d_DstVal));
    checkCudaErrors(cudaFree(d_DstKey));
    free(h_DstVal);
    free(h_DstKey);
    free(h_SrcVal);
    free(h_SrcKey);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits.
    cudaDeviceReset();

    exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE);
}
/*
void initCellSystem(uint3 gridSize)
{
    csystem = new CellSystem(gridSize);
    //psystem->reset(ParticleSystem::CONFIG_GRID);
}
*/

void cleanup()
{
    sdkDeleteTimer(&timer);
}
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU;
    cudaArray *a_Src;
    cudaChannelFormatDesc floatTex = cudaCreateChannelDesc<float>();
    float *d_Output;
    float gpuTime;
    StopWatchInterface *hTimer = NULL;

    const int imageW = 3072;
    const int imageH = 3072 / 2;
    const unsigned int iterations = 10;

    printf("[%s] - Starting...\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    findCudaDevice(argc, (const char **)argv);

    sdkCreateTimer(&hTimer);

    printf("Initializing data...\n");
    h_Kernel    = (float *)malloc(KERNEL_LENGTH * sizeof(float));
    h_Input     = (float *)malloc(imageW * imageH * sizeof(float));
    h_Buffer    = (float *)malloc(imageW * imageH * sizeof(float));
    h_OutputCPU = (float *)malloc(imageW * imageH * sizeof(float));
    h_OutputGPU = (float *)malloc(imageW * imageH * sizeof(float));
    checkCudaErrors(cudaMallocArray(&a_Src, &floatTex, imageW, imageH));
    checkCudaErrors(cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float)));

    srand(2009);

    for (unsigned int i = 0; i < KERNEL_LENGTH; i++)
    {
        h_Kernel[i] = (float)(rand() % 16);
    }

    for (unsigned int i = 0; i < imageW * imageH; i++)
    {
        h_Input[i] = (float)(rand() % 16);
    }

    setConvolutionKernel(h_Kernel);
    checkCudaErrors(cudaMemcpyToArray(a_Src, 0, 0, h_Input, imageW * imageH * sizeof(float), cudaMemcpyHostToDevice));

    printf("Running GPU rows convolution (%u identical iterations)...\n", iterations);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    for (unsigned int i = 0; i < iterations; i++)
    {
        convolutionRowsGPU(d_Output, a_Src, imageW, imageH);
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer) / (float)iterations;
    printf("Average convolutionRowsGPU() time: %f msecs; //%f Mpix/s\n",
           gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime));

    // While CUDA kernels can't write to textures directly, this copy is inevitable.
    // (Note: cudaMemcpyToArray is deprecated in recent CUDA releases;
    // cudaMemcpy2DToArray is the suggested replacement.)
    printf("Copying convolutionRowGPU() output back to the texture...\n");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    checkCudaErrors(cudaMemcpyToArray(a_Src, 0, 0, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToDevice));
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer);
    printf("cudaMemcpyToArray() time: %f msecs; //%f Mpix/s\n",
           gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime));

    printf("Running GPU columns convolution (%u identical iterations)...\n", iterations);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    for (unsigned int i = 0; i < iterations; i++)
    {
        convolutionColumnsGPU(d_Output, a_Src, imageW, imageH);
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer) / (float)iterations;
    printf("Average convolutionColumnsGPU() time: %f msecs; //%f Mpix/s\n",
           gpuTime, imageW * imageH * 1e-6 / (0.001 * gpuTime));

    printf("Reading back GPU results...\n");
    checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToHost));

    printf("Checking the results...\n");
    printf("...running convolutionRowsCPU()\n");
    convolutionRowsCPU(h_Buffer, h_Input, h_Kernel, imageW, imageH, KERNEL_RADIUS);

    printf("...running convolutionColumnsCPU()\n");
    convolutionColumnsCPU(h_OutputCPU, h_Buffer, h_Kernel, imageW, imageH, KERNEL_RADIUS);

    double delta = 0;
    double sum = 0;

    for (unsigned int i = 0; i < imageW * imageH; i++)
    {
        sum   += h_OutputCPU[i] * h_OutputCPU[i];
        delta += (h_OutputGPU[i] - h_OutputCPU[i]) * (h_OutputGPU[i] - h_OutputCPU[i]);
    }

    double L2norm = sqrt(delta / sum);
    printf("Relative L2 norm: %E\n", L2norm);

    printf("Shutting down...\n");
    checkCudaErrors(cudaFree(d_Output));
    checkCudaErrors(cudaFreeArray(a_Src));
    free(h_OutputGPU);
    free(h_Buffer);
    free(h_Input);
    free(h_Kernel);
    sdkDeleteTimer(&hTimer);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits.
    cudaDeviceReset();

    if (L2norm > 1e-6)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed\n");
    exit(EXIT_SUCCESS);
}
template <class T>
bool runTest(int argc, char **argv, ReduceType datatype)
{
    int size = 1 << 24;    // number of elements to reduce
    int maxThreads = 256;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    if (checkCmdLineFlag(argc, (const char **)argv, "n"))
    {
        size = getCmdLineArgumentInt(argc, (const char **)argv, "n");
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "threads"))
    {
        maxThreads = getCmdLineArgumentInt(argc, (const char **)argv, "threads");
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "kernel"))
    {
        whichKernel = getCmdLineArgumentInt(argc, (const char **)argv, "kernel");
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "maxblocks"))
    {
        maxBlocks = getCmdLineArgumentInt(argc, (const char **)argv, "maxblocks");
    }

    printf("%d elements\n", size);
    printf("%d threads (max)\n", maxThreads);

    cpuFinalReduction = checkCmdLineFlag(argc, (const char **)argv, "cpufinal");

    if (checkCmdLineFlag(argc, (const char **)argv, "cputhresh"))
    {
        cpuFinalThreshold = getCmdLineArgumentInt(argc, (const char **)argv, "cputhresh");
    }

    bool runShmoo = checkCmdLineFlag(argc, (const char **)argv, "shmoo");

    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
    }
    else
    {
        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);
        T *h_idata = (T *)malloc(bytes);

        for (int i = 0; i < size; i++)
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
            {
                h_idata[i] = (T)(rand() & 0xFF);
            }
            else
            {
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
            }
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);

        if (numBlocks == 1)
        {
            cpuFinalThreshold = 1;
        }

        // allocate mem for the result on host side
        T *h_odata = (T *)malloc(numBlocks * sizeof(T));

        printf("%d blocks\n\n", numBlocks);

        // allocate device memory and data
        T *d_idata = NULL;
        T *d_odata = NULL;
        checkCudaErrors(cudaMalloc((void **)&d_idata, bytes));
        checkCudaErrors(cudaMalloc((void **)&d_odata, numBlocks * sizeof(T)));

        // copy data directly to device memory
        checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks * sizeof(T), cudaMemcpyHostToDevice));

        // warm-up
        reduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);

        int testIterations = 100;
        StopWatchInterface *timer = 0;
        sdkCreateTimer(&timer);

        T gpu_result = 0;
        gpu_result = benchmarkReduce<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                        whichKernel, testIterations, cpuFinalReduction,
                                        cpuFinalThreshold, timer, h_odata, d_idata, d_odata);

        double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3;
        printf("Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %d Elements, NumDevsUsed = %d, Workgroup = %d\n",
               1.0e-9 * ((double)bytes) / reduceTime, reduceTime, size, 1, numThreads);

        // compute reference solution
        T cpu_result = reduceCPU<T>(h_idata, size);

        int precision = 0;
        double threshold = 0;
        double diff = 0;

        if (datatype == REDUCE_INT)
        {
            printf("\nGPU result = %d\n", (int)gpu_result);
            printf("CPU result = %d\n\n", (int)cpu_result);
        }
        else
        {
            if (datatype == REDUCE_FLOAT)
            {
                precision = 8;
                threshold = 1e-8 * size;
            }
            else
            {
                precision = 12;
                threshold = 1e-12 * size;
            }

            printf("\nGPU result = %.*f\n", precision, (double)gpu_result);
            printf("CPU result = %.*f\n\n", precision, (double)cpu_result);

            diff = fabs((double)gpu_result - (double)cpu_result);
        }

        // cleanup
        sdkDeleteTimer(&timer);
        free(h_idata);
        free(h_odata);
        checkCudaErrors(cudaFree(d_idata));
        checkCudaErrors(cudaFree(d_odata));

        if (datatype == REDUCE_INT)
        {
            return (gpu_result == cpu_result);
        }
        else
        {
            return (diff < threshold);
        }
    }

    return true;
}
template <class T>
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
{
    // create random input data on CPU
    unsigned int bytes = maxN * sizeof(T);
    T *h_idata = (T *)malloc(bytes);

    for (int i = 0; i < maxN; i++)
    {
        // Keep the numbers small so we don't get truncation error in the sum
        if (datatype == REDUCE_INT)
        {
            h_idata[i] = (T)(rand() & 0xFF);
        }
        else
        {
            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
        }
    }

    int maxNumBlocks = MIN(maxN / maxThreads, MAX_BLOCK_DIM_SIZE);

    // allocate mem for the result on host side
    T *h_odata = (T *)malloc(maxNumBlocks * sizeof(T));

    // allocate device memory and data
    T *d_idata = NULL;
    T *d_odata = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_idata, bytes));
    checkCudaErrors(cudaMalloc((void **)&d_odata, maxNumBlocks * sizeof(T)));

    // copy data directly to device memory
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_odata, h_idata, maxNumBlocks * sizeof(T), cudaMemcpyHostToDevice));

    // warm-up
    for (int kernel = 0; kernel < 7; kernel++)
    {
        reduce<T>(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata);
    }

    int testIterations = 100;
    StopWatchInterface *timer = 0;
    sdkCreateTimer(&timer);

    // print headers
    printf("Time in milliseconds for various numbers of elements for each kernel\n\n\n");
    printf("Kernel");

    for (int i = minN; i <= maxN; i *= 2)
    {
        printf(", %d", i);
    }

    for (int kernel = 0; kernel < 7; kernel++)
    {
        printf("\n%d", kernel);

        for (int i = minN; i <= maxN; i *= 2)
        {
            sdkResetTimer(&timer);
            int numBlocks = 0;
            int numThreads = 0;
            getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);

            float reduceTime;

            if (numBlocks <= MAX_BLOCK_DIM_SIZE)
            {
                benchmarkReduce(i, numThreads, numBlocks, maxThreads, maxBlocks, kernel,
                                testIterations, false, 1, timer, h_odata, d_idata, d_odata);
                reduceTime = sdkGetAverageTimerValue(&timer);
            }
            else
            {
                reduceTime = -1.0;
            }

            printf(", %.5f", reduceTime);
        }
    }

    // cleanup
    sdkDeleteTimer(&timer);
    free(h_idata);
    free(h_odata);
    checkCudaErrors(cudaFree(d_idata));
    checkCudaErrors(cudaFree(d_odata));
}
int main(int argc, char **argv)
{
    // Start logs
    printf("[%s] - Starting...\n", argv[0]);

    //'h_' prefix - CPU (host) memory space
    float
        // Results calculated by CPU for reference
        *h_CallResultCPU, *h_PutResultCPU,
        // CPU copy of GPU results
        *h_CallResultGPU, *h_PutResultGPU,
        // CPU instance of input data
        *h_StockPrice, *h_OptionStrike, *h_OptionYears;

    //'d_' prefix - GPU (device) memory space
    CUdeviceptr
        // Results calculated by GPU
        d_CallResult, d_PutResult,
        // GPU instance of input data
        d_StockPrice, d_OptionStrike, d_OptionYears;

    double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime;

    StopWatchInterface *hTimer = NULL;
    int i;

    sdkCreateTimer(&hTimer);

    printf("Initializing data...\n");
    printf("...allocating CPU memory for options.\n");
    h_CallResultCPU = (float *)malloc(OPT_SZ);
    h_PutResultCPU  = (float *)malloc(OPT_SZ);
    h_CallResultGPU = (float *)malloc(OPT_SZ);
    h_PutResultGPU  = (float *)malloc(OPT_SZ);
    h_StockPrice    = (float *)malloc(OPT_SZ);
    h_OptionStrike  = (float *)malloc(OPT_SZ);
    h_OptionYears   = (float *)malloc(OPT_SZ);

    char *ptx, *kernel_file;
    size_t ptxSize;
    kernel_file = sdkFindFilePath("BlackScholes_kernel.cuh", argv[0]);

    // Set a compiler option to cap the number of registers used by each thread.
    char *compile_options[1];
    compile_options[0] = (char *)malloc(strlen("--maxrregcount=16") + 1); // +1 for the terminating '\0'
    strcpy(compile_options[0], "--maxrregcount=16");

    // Compile the kernel BlackScholes_kernel.
    compileFileToPTX(kernel_file, 1, (const char **)compile_options, &ptx, &ptxSize);
    CUmodule module = loadPTX(ptx, argc, argv);

    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "BlackScholesGPU"));

    printf("...allocating GPU memory for options.\n");
    checkCudaErrors(cuMemAlloc(&d_CallResult, OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_PutResult, OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_StockPrice, OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_OptionStrike, OPT_SZ));
    checkCudaErrors(cuMemAlloc(&d_OptionYears, OPT_SZ));

    printf("...generating input data in CPU mem.\n");
    srand(5347);

    // Generate options set
    for (i = 0; i < OPT_N; i++)
    {
        h_CallResultCPU[i] = 0.0f;
        h_PutResultCPU[i]  = -1.0f;
        h_StockPrice[i]    = RandFloat(5.0f, 30.0f);
        h_OptionStrike[i]  = RandFloat(1.0f, 100.0f);
        h_OptionYears[i]   = RandFloat(0.25f, 10.0f);
    }

    printf("...copying input data to GPU mem.\n");
    // Copy options data to GPU memory for further processing
    checkCudaErrors(cuMemcpyHtoD(d_StockPrice, h_StockPrice, OPT_SZ));
    checkCudaErrors(cuMemcpyHtoD(d_OptionStrike, h_OptionStrike, OPT_SZ));
    checkCudaErrors(cuMemcpyHtoD(d_OptionYears, h_OptionYears, OPT_SZ));
    printf("Data init done.\n\n");

    printf("Executing Black-Scholes GPU kernel (%i iterations)...\n", NUM_ITERATIONS);
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    dim3 cudaBlockSize(128, 1, 1);
    dim3 cudaGridSize(DIV_UP(OPT_N / 2, 128), 1, 1);

    float risk = RISKFREE;
    float volatility = VOLATILITY;
    int optval = OPT_N;

    void *arr[] = { (void *)&d_CallResult, (void *)&d_PutResult, (void *)&d_StockPrice,
                    (void *)&d_OptionStrike, (void *)&d_OptionYears, (void *)&risk,
                    (void *)&volatility, (void *)&optval };

    for (i = 0; i < NUM_ITERATIONS; i++)
    {
        checkCudaErrors(cuLaunchKernel(kernel_addr,
                                       cudaGridSize.x, cudaGridSize.y, cudaGridSize.z,    /* grid dim */
                                       cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
                                       0, 0,    /* shared mem, stream */
                                       &arr[0], /* arguments */
                                       0));
    }

    checkCudaErrors(cuCtxSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;

    // Both call and put are calculated
    printf("Options count             : %i     \n", 2 * OPT_N);
    printf("BlackScholesGPU() time    : %f msec\n", gpuTime);
    printf("Effective memory bandwidth: %f GB/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3));
    printf("Gigaoptions per second    : %f     \n\n", ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3));

    printf("BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %u, Workgroup = %u\n",
           (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime * 1e-3, (2 * OPT_N), 1, 128);

    printf("\nReading back GPU results...\n");
    // Read back GPU results to compare them to CPU results
    checkCudaErrors(cuMemcpyDtoH(h_CallResultGPU, d_CallResult, OPT_SZ));
    checkCudaErrors(cuMemcpyDtoH(h_PutResultGPU, d_PutResult, OPT_SZ));

    printf("Checking the results...\n");
    printf("...running CPU calculations.\n\n");
    // Calculate options values on CPU
    BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike,
                    h_OptionYears, RISKFREE, VOLATILITY, OPT_N);

    printf("Comparing the results...\n");
    // Calculate max absolute difference and L1 distance
    // between CPU and GPU results
    sum_delta = 0;
    sum_ref = 0;
    max_delta = 0;

    for (i = 0; i < OPT_N; i++)
    {
        ref = h_CallResultCPU[i];
        delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]);

        if (delta > max_delta)
        {
            max_delta = delta;
        }

        sum_delta += delta;
        sum_ref += fabs(ref);
    }

    L1norm = sum_delta / sum_ref;
    printf("L1 norm: %E\n", L1norm);
    printf("Max absolute error: %E\n\n", max_delta);

    printf("Shutting down...\n");
    printf("...releasing GPU memory.\n");
    checkCudaErrors(cuMemFree(d_OptionYears));
    checkCudaErrors(cuMemFree(d_OptionStrike));
    checkCudaErrors(cuMemFree(d_StockPrice));
    checkCudaErrors(cuMemFree(d_PutResult));
    checkCudaErrors(cuMemFree(d_CallResult));

    printf("...releasing CPU memory.\n");
    free(h_OptionYears);
    free(h_OptionStrike);
    free(h_StockPrice);
    free(h_PutResultGPU);
    free(h_CallResultGPU);
    free(h_PutResultCPU);
    free(h_CallResultCPU);
    sdkDeleteTimer(&hTimer);
    printf("Shutdown done.\n");

    printf("\n[%s] - Test Summary\n", argv[0]);

    cuProfilerStop();

    if (L1norm > 1e-6)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed\n");
    exit(EXIT_SUCCESS);
}