void runAutoTest(int argc, char **argv) { printf("[%s] (automated testing w/ readback)\n", sSDKsample); if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { int device = cutilDeviceInit(argc, argv); if (device < 0) { printf("No CUDA Capable devices found, exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } checkDeviceMeetComputeSpec( argc, argv ); } else { int dev = findCapableDevice(argc, argv); if( dev != -1 ) cudaSetDevice( dev ); else { cutilDeviceReset(); shrQAFinishExit2(g_bQAReadback, *pArgc, (const char **)pArgv, QA_PASSED); } } loadDefaultImage( argc, argv ); if (argc > 1) { char *filename; if (cutGetCmdLineArgumentstr(argc, (const char **)argv, "file", &filename)) { initializeData(filename, argc, argv); } } else { loadDefaultImage( argc, argv ); } g_CheckRender = new CheckBackBuffer(imWidth, imHeight, sizeof(Pixel), false); g_CheckRender->setExecPath(argv[0]); Pixel *d_result; cutilSafeCall( cudaMalloc( (void **)&d_result, imWidth*imHeight*sizeof(Pixel)) ); while (g_SobelDisplayMode <= 2) { printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]); sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp ); cutilSafeCall( cutilDeviceSynchronize() ); cudaMemcpy(g_CheckRender->imageData(), d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost); g_CheckRender->savePGM(sOriginal[g_Index], false, NULL); if (!g_CheckRender->PGMvsPGM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, 0.15f)) { g_TotalErrors++; } g_Index++; g_SobelDisplayMode = (SobelDisplayMode)g_Index; } cutilSafeCall( cudaFree( d_result ) ); delete g_CheckRender; shrQAFinishExit(argc, (const char **)argv, (!g_TotalErrors ? QA_PASSED : QA_FAILED) ); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { CUdevice dev; int major = 0, minor = 0; int deviceCount = 0; char deviceName[256]; shrQAStart(argc, argv); // note your project will need to link with cuda.lib files on windows printf("CUDA Device Query (Driver API) statically linked version \n"); CUresult error_id = cuInit(0); if (error_id != CUDA_SUCCESS) { printf("cuInit(0) returned %d\n-> %s\n", error_id, getCudaDrvErrorString(error_id)); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } error_id = cuDeviceGetCount(&deviceCount); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceGetCount returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) printf("There are no available device(s) that support CUDA\n"); else if (deviceCount == 1) printf("There is 1 device supporting CUDA\n"); else printf("There are %d devices supporting CUDA\n", deviceCount); for (dev = 0; dev < deviceCount; ++dev) { error_id = cuDeviceComputeCapability(&major, &minor, dev); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceComputeCapability returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } error_id = cuDeviceGetName(deviceName, 256, dev); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceGetName returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } printf("\nDevice %d: \"%s\"\n", dev, deviceName); #if CUDA_VERSION >= 2020 int driverVersion = 0; cuDriverGetVersion(&driverVersion); printf(" CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10); #endif shrLog(" CUDA Capability Major/Minor version number: %d.%d\n", major, minor); size_t totalGlobalMem; error_id = cuDeviceTotalMem(&totalGlobalMem, dev); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceTotalMem returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } char msg[256]; sprintf(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)totalGlobalMem/1048576.0f, (unsigned long long) totalGlobalMem); shrLog(msg); #if CUDA_VERSION >= 2000 int multiProcessorCount; getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); shrLog(" (%2d) Multiprocessors x (%3d) CUDA Cores/MP: %d CUDA Cores\n", multiProcessorCount, ConvertSMVer2Cores(major, minor), ConvertSMVer2Cores(major, minor) * multiProcessorCount); #endif int clockRate; getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); printf(" GPU Clock rate: %.0f MHz (%0.2f GHz)\n", clockRate * 1e-3f, clockRate * 1e-6f); #if CUDA_VERSION >= 4000 int memoryClock; getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); shrLog(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f); int memBusWidth; getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev ); shrLog(" Memory Bus Width: %d-bit\n", memBusWidth); int L2CacheSize; getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev ); if (L2CacheSize) { shrLog(" L2 Cache Size: %d bytes\n", L2CacheSize); } int maxTex1D, maxTex2D[2], maxTex3D[3]; 
getCudaAttribute<int>( &maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev ); getCudaAttribute<int>( &maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev ); getCudaAttribute<int>( &maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev ); getCudaAttribute<int>( &maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev ); getCudaAttribute<int>( &maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev ); getCudaAttribute<int>( &maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev ); shrLog(" Max Texture Dimension Sizes 1D=(%d) 2D=(%d,%d) 3D=(%d,%d,%d)\n", maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1], maxTex3D[2]); int maxTex2DLayered[3]; getCudaAttribute<int>( &maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev ); getCudaAttribute<int>( &maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev ); getCudaAttribute<int>( &maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev ); shrLog(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", maxTex2DLayered[0], maxTex2DLayered[2], maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]); #endif int totalConstantMemory; getCudaAttribute<int>( &totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev ); printf(" Total amount of constant memory: %u bytes\n", totalConstantMemory); int sharedMemPerBlock; getCudaAttribute<int>( &sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev ); printf(" Total amount of shared memory per block: %u bytes\n", sharedMemPerBlock); int regsPerBlock; getCudaAttribute<int>( &regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev ); printf(" Total number of registers available per block: %d\n", regsPerBlock); int warpSize; getCudaAttribute<int>( &warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev ); printf(" Warp size: %d\n", warpSize); int maxThreadsPerMultiProcessor; getCudaAttribute<int>( &maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev ); printf(" Maximum number of threads per multiprocessor: %d\n", maxThreadsPerMultiProcessor); int maxThreadsPerBlock; getCudaAttribute<int>( &maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev ); printf(" Maximum number of threads per block: %d\n", maxThreadsPerBlock); int blockDim[3]; getCudaAttribute<int>( &blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev ); getCudaAttribute<int>( &blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev ); getCudaAttribute<int>( &blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev ); printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", blockDim[0], blockDim[1], blockDim[2]); int gridDim[3]; getCudaAttribute<int>( &gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev ); getCudaAttribute<int>( &gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev ); getCudaAttribute<int>( &gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev ); printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", gridDim[0], gridDim[1], gridDim[2]); int textureAlign; getCudaAttribute<int>( &textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev ); printf(" Texture alignment: %u bytes\n", textureAlign); int memPitch; getCudaAttribute<int>( &memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev ); printf(" Maximum memory pitch: %u bytes\n", memPitch); #if CUDA_VERSION >= 2000 int gpuOverlap; getCudaAttribute<int>( &gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev ); #endif #if CUDA_VERSION >= 4000 int asyncEngineCount;
getCudaAttribute<int>( &asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev ); printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (gpuOverlap ? "Yes" : "No"), asyncEngineCount); #else printf(" Concurrent copy and execution: %s\n",gpuOverlap ? "Yes" : "No"); #endif #if CUDA_VERSION >= 2020 int kernelExecTimeoutEnabled; getCudaAttribute<int>( &kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev ); printf(" Run time limit on kernels: %s\n", kernelExecTimeoutEnabled ? "Yes" : "No"); int integrated; getCudaAttribute<int>( &integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev ); printf(" Integrated GPU sharing Host Memory: %s\n", integrated ? "Yes" : "No"); int canMapHostMemory; getCudaAttribute<int>( &canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev ); printf(" Support host page-locked memory mapping: %s\n", canMapHostMemory ? "Yes" : "No"); #endif #if CUDA_VERSION >= 3000 int concurrentKernels; getCudaAttribute<int>( &concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev ); printf(" Concurrent kernel execution: %s\n", concurrentKernels ? "Yes" : "No"); int surfaceAlignment; getCudaAttribute<int>( &surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev ); printf(" Alignment requirement for Surfaces: %s\n", surfaceAlignment ? "Yes" : "No"); int eccEnabled; getCudaAttribute<int>( &eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev ); printf(" Device has ECC support enabled: %s\n", eccEnabled ? "Yes" : "No"); #endif #if CUDA_VERSION >= 3020 int tccDriver ; getCudaAttribute<int>( &tccDriver , CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev ); printf(" Device is using TCC driver mode: %s\n", tccDriver ? "Yes" : "No"); #endif #if CUDA_VERSION >= 4000 int unifiedAddressing; getCudaAttribute<int>( &unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev ); printf(" Device supports Unified Addressing (UVA): %s\n", unifiedAddressing ? "Yes" : "No"); int pciBusID, pciDeviceID; getCudaAttribute<int>( &pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev ); getCudaAttribute<int>( &pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev ); printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", pciBusID, pciDeviceID ); const char *sComputeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", "Prohibited (no host thread can use ::cudaSetDevice() with this device)", "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", "Unknown", NULL }; int computeMode; getCudaAttribute<int>( &computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev ); printf(" Compute Mode:\n"); printf(" < %s >\n", sComputeMode[computeMode]); #endif } shrQAFinishExit(argc, (const char **)argv, QA_PASSED); }
//////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { cl_platform_id cpPlatform; //OpenCL platform cl_device_id cdDevice; //OpenCL device cl_context cxGPUContext; //OpenCL context cl_command_queue cqCommandQueue; //OpenCL command que cl_mem d_Input, d_Output; //OpenCL memory buffer objects cl_int ciErrNum; float *h_Input, *h_OutputCPU, *h_OutputGPU; const uint imageW = 2048, imageH = 2048, stride = 2048; const int dir = DCT_FORWARD; shrQAStart(argc, argv); int use_gpu = 0; for(int i = 0; i < argc && argv; i++) { if(!argv[i]) continue; if(strstr(argv[i], "cpu")) use_gpu = 0; else if(strstr(argv[i], "gpu")) use_gpu = 1; } // set logfile name and start logs shrSetLogFileName ("oclDCT8x8.txt"); shrLog("%s Starting...\n\n", argv[0]); shrLog("Allocating and initializing host memory...\n"); h_Input = (float *)malloc(imageH * stride * sizeof(float)); h_OutputCPU = (float *)malloc(imageH * stride * sizeof(float)); h_OutputGPU = (float *)malloc(imageH * stride * sizeof(float)); srand(2009); for(uint i = 0; i < imageH; i++) for(uint j = 0; j < imageW; j++) h_Input[i * stride + j] = (float)rand() / (float)RAND_MAX; shrLog("Initializing OpenCL...\n"); //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); //Get a GPU device ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, 1, &cdDevice, NULL); oclCheckError(ciErrNum, CL_SUCCESS); //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); //Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Initializing OpenCL DCT 8x8...\n"); initDCT8x8(cxGPUContext, cqCommandQueue, (const char **)argv); shrLog("Creating OpenCL memory objects...\n"); d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageH * stride * sizeof(cl_float), h_Input, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageH * stride * sizeof(cl_float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Performing DCT8x8 of %u x %u image...\n\n", imageH, imageW); //Just a single iteration or a warmup iteration DCT8x8( cqCommandQueue, d_Output, d_Input, stride, imageH, imageW, dir ); #ifdef GPU_PROFILING const int numIterations = 16; cl_event startMark, endMark; ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); shrDeltaT(0); for(int iter = 0; iter < numIterations; iter++) DCT8x8( NULL, d_Output, d_Input, stride, imageH, imageW, dir ); ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); //Calculate performance metrics by wallclock time double gpuTime = shrDeltaT(0) / (double)numIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclDCT8x8, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0); //Get profiler time cl_ulong startTime = 0, endTime = 0; ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL); ciErrNum |= clGetEventProfilingInfo(endMark, 
CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations); #endif shrLog("Reading back OpenCL results...\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageH * stride * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Comparing against Host/C++ computation...\n"); DCT8x8CPU(h_OutputCPU, h_Input, stride, imageH, imageW, dir); double sum = 0, delta = 0; double L2norm; for(uint i = 0; i < imageH; i++) for(uint j = 0; j < imageW; j++){ sum += h_OutputCPU[i * stride + j] * h_OutputCPU[i * stride + j]; delta += (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]) * (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]); } L2norm = sqrt(delta / sum); shrLog("Relative L2 norm: %.3e\n\n", L2norm); shrLog("Shutting down...\n"); //Release kernels and program closeDCT8x8(); //Release other OpenCL objects ciErrNum = clReleaseMemObject(d_Output); ciErrNum |= clReleaseMemObject(d_Input); ciErrNum |= clReleaseCommandQueue(cqCommandQueue); ciErrNum |= clReleaseContext(cxGPUContext); oclCheckError(ciErrNum, CL_SUCCESS); //Release host buffers free(h_OutputGPU); free(h_OutputCPU); free(h_Input); //Finish shrQAFinishExit(argc, (const char **)argv, (L2norm < 1E-6) ? QA_PASSED : QA_FAILED); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); shrSetLogFileName ("deviceQuery.txt"); shrLog("%s Starting...\n\n", argv[0]); shrLog(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n"); int deviceCount = 0; cudaError_t error_id = cudaGetDeviceCount(&deviceCount); if (error_id != cudaSuccess) { shrLog( "cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id) ); shrQAFinishExit(*pArgc, (const char **)pArgv, QA_FAILED); } // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) shrLog("There is no device supporting CUDA\n"); else shrLog("Found %d CUDA Capable device(s)\n", deviceCount); int dev, driverVersion = 0, runtimeVersion = 0; for (dev = 0; dev < deviceCount; ++dev) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); shrLog("\nDevice %d: \"%s\"\n", dev, deviceProp.name); #if CUDART_VERSION >= 2020 // Console log cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); shrLog(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10); #endif shrLog(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor); char msg[256]; sprintf(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem); shrLog(msg); #if CUDART_VERSION >= 2000 shrLog(" (%2d) Multiprocessors x (%3d) CUDA Cores/MP: %d CUDA Cores\n", deviceProp.multiProcessorCount, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); #endif shrLog(" GPU Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f); #if CUDART_VERSION >= 4000 // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output int memoryClock; getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); shrLog(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f); int memBusWidth; getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev ); shrLog(" Memory Bus Width: %d-bit\n", memBusWidth); int L2CacheSize; getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev ); if (L2CacheSize) { shrLog(" L2 Cache Size: %d bytes\n", L2CacheSize); } shrLog(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]); shrLog(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]); #endif shrLog(" Total amount of constant memory: %u bytes\n", deviceProp.totalConstMem); shrLog(" Total amount of shared memory per block: %u bytes\n", deviceProp.sharedMemPerBlock); shrLog(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock); shrLog(" Warp size: %d\n", deviceProp.warpSize); 
shrLog(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor); shrLog(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); shrLog(" Maximum sizes of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); shrLog(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); shrLog(" Maximum memory pitch: %u bytes\n", deviceProp.memPitch); shrLog(" Texture alignment: %u bytes\n", deviceProp.textureAlignment); #if CUDART_VERSION >= 4000 shrLog(" Concurrent copy and execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount); #else shrLog(" Concurrent copy and execution: %s\n", deviceProp.deviceOverlap ? "Yes" : "No"); #endif #if CUDART_VERSION >= 2020 shrLog(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); shrLog(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No"); shrLog(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3000 shrLog(" Concurrent kernel execution: %s\n", deviceProp.concurrentKernels ? "Yes" : "No"); shrLog(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3010 shrLog(" Device has ECC support enabled: %s\n", deviceProp.ECCEnabled ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3020 shrLog(" Device is using TCC driver mode: %s\n", deviceProp.tccDriver ? "Yes" : "No"); #endif #if CUDART_VERSION >= 4000 shrLog(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No"); shrLog(" Device PCI Bus ID / PCI location ID: %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID ); #endif #if CUDART_VERSION >= 2020 const char *sComputeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", "Prohibited (no host thread can use ::cudaSetDevice() with this device)", "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", "Unknown", NULL }; shrLog(" Compute Mode:\n"); shrLog(" < %s >\n", sComputeMode[deviceProp.computeMode]); #endif } // csv masterlog info // ***************************** // exe and CUDA driver name shrLog("\n"); std::string sProfileString = "deviceQuery, CUDA Driver = CUDART"; char cTemp[10]; // driver version sProfileString += ", CUDA Driver Version = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #else sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #endif sProfileString += cTemp; // Runtime version sProfileString += ", CUDA Runtime Version = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #else sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #endif sProfileString += cTemp; // Device count sProfileString += ", NumDevs = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d", deviceCount); #else sprintf(cTemp, "%d", deviceCount); #endif sProfileString += cTemp; // First 2 device names, if any for (dev = 0; dev < ((deviceCount > 2) ? 
2 : deviceCount); ++dev) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); sProfileString += ", Device = "; sProfileString += deviceProp.name; } sProfileString += "\n"; shrLogEx(LOGBOTH | MASTER, 0, sProfileString.c_str()); // finish shrQAFinishExit(argc, (const char **)argv, QA_PASSED); }
////////////////////////////////////////////////////////////////////////////// // Program main ////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { bool bTestResults = true; shrQAStart(argc, argv); if( cutCheckCmdLineFlag(argc, (const char**)argv, "help") ) { showHelp(); return 0; } shrLog("Run \"nbody -benchmark [-n=<numBodies>]\" to measure performance.\n"); shrLog("\t-fullscreen (run n-body simulation in fullscreen mode)\n"); shrLog("\t-fp64 (use double precision floating point values for simulation)\n"); shrLog("\t-numdevices=N (use first N CUDA devices for simulation)\n"); // shrLog("\t-hostmem (stores simulation data in host memory)\n"); // shrLog("\t-cpu (performs simulation on the host)\n"); shrLog("\n"); bFullscreen = (cutCheckCmdLineFlag(argc, (const char**) argv, "fullscreen") != 0); if (bFullscreen) bShowSliders = false; benchmark = (cutCheckCmdLineFlag(argc, (const char**) argv, "benchmark") != 0); compareToCPU = ((cutCheckCmdLineFlag(argc, (const char**) argv, "compare") != 0) || (cutCheckCmdLineFlag(argc, (const char**) argv, "qatest") != 0)); QATest = (cutCheckCmdLineFlag(argc, (const char**) argv, "qatest") != 0); useHostMem = (cutCheckCmdLineFlag(argc, (const char**) argv, "hostmem") != 0); fp64 = (cutCheckCmdLineFlag(argc, (const char**) argv, "fp64") != 0); flopsPerInteraction = fp64 ? 30 : 20; useCpu = (cutCheckCmdLineFlag(argc, (const char**) argv, "cpu") != 0); cutGetCmdLineArgumenti(argc, (const char**) argv, "numdevices", &numDevsRequested); // for multi-device we currently require using host memory -- the devices share // data via the host if (numDevsRequested > 1) useHostMem = true; int numDevsAvailable = 0; bool customGPU = false; cudaGetDeviceCount(&numDevsAvailable); if (numDevsAvailable < numDevsRequested) { shrLog("Error: only %d Devices available, %d requested. Exiting.\n", numDevsAvailable, numDevsRequested); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } shrLog("> %s mode\n", bFullscreen ? "Fullscreen" : "Windowed"); shrLog("> Simulation data stored in %s memory\n", useHostMem ? "system" : "video" ); shrLog("> %s precision floating point simulation\n", fp64 ? "Double" : "Single"); shrLog("> %d Devices used for simulation\n", numDevsRequested); int devID; cudaDeviceProp props; // Initialize GL and GLUT if necessary if (!benchmark && !compareToCPU) { initGL(&argc, argv); initParameters(); } if (useCpu) { useHostMem = true; compareToCPU = false; bSupportDouble = true; #ifdef OPENMP shrLog("> Simulation with CPU using OpenMP\n"); #else shrLog("> Simulation with CPU\n"); #endif } else { // Now choose the CUDA Device // Either without GL interop: if (benchmark || compareToCPU || useHostMem) { // Note if we are using host memory for the body system, we // don't use CUDA-GL interop.
if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { devID = cutilDeviceInit(argc, argv); if (devID < 0) { printf("exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } customGPU = true; } else { devID = cutGetMaxGflopsDeviceId(); cudaSetDevice( devID ); } } else // or with GL interop: { if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilGLDeviceInit(argc, argv); customGPU = true; } else { devID = cutGetMaxGflopsDeviceId(); cudaGLSetGLDevice( devID ); } } cutilSafeCall(cudaGetDevice(&devID)); cutilSafeCall(cudaGetDeviceProperties(&props, devID)); bSupportDouble = true; #if CUDART_VERSION < 4000 if (numDevsRequested > 1) { shrLog("MultiGPU n-body requires CUDA 4.0 or later\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char**)argv, QA_PASSED); } #endif // Initialize devices if(numDevsRequested > 1 && customGPU) { printf("You can't use --numdevices and --device at the same time.\n"); shrQAFinishExit(argc, (const char**)argv, QA_PASSED); } if(customGPU) { cudaDeviceProp props; cutilSafeCall(cudaGetDeviceProperties(&props, devID)); shrLog("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); } else { for (int i = 0; i < numDevsRequested; i++) { cudaDeviceProp props; cutilSafeCall(cudaGetDeviceProperties(&props, i)); shrLog("> Compute %d.%d CUDA device: [%s]\n", props.major, props.minor, props.name); if (useHostMem) { #if CUDART_VERSION >= 2020 if(!props.canMapHostMemory) { fprintf(stderr, "Device %d cannot map host memory!\n", devID); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } if (numDevsRequested > 1) cutilSafeCall(cudaSetDevice(i)); cutilSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost)); #else fprintf(stderr, "This CUDART version does not support <cudaDeviceProp.canMapHostMemory> field\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); #endif } } // CC 1.2 and earlier do not support double precision if (props.major*10 + props.minor <= 12) bSupportDouble = false; } //if(numDevsRequested > 1) // cutilSafeCall(cudaSetDevice(devID)); if (fp64 && !bSupportDouble) { fprintf(stderr, "One or more of the requested devices does not support double precision floating-point\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } } numIterations = 0; p = 0; q = 1; cutGetCmdLineArgumenti(argc, (const char**) argv, "i", &numIterations); cutGetCmdLineArgumenti(argc, (const char**) argv, "p", &p); cutGetCmdLineArgumenti(argc, (const char**) argv, "q", &q); if (p == 0) // p not set on command line { p = 256; if (q * p > 256) { p = 256 / q; shrLog("Setting p=%d, q=%d to maintain %d threads per block\n", p, q, 256); } } // default number of bodies is #SMs * 4 * CTA size if (useCpu) #ifdef OPENMP numBodies = 8192; #else numBodies = 4096; #endif else if (numDevsRequested == 1)
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { shrQAStart( argc, argv ); shrSetLogFileName ("reduction.txt"); char *reduceMethod; cutGetCmdLineArgumentstr( argc, (const char**) argv, "method", &reduceMethod); char *typeChoice; cutGetCmdLineArgumentstr( argc, (const char**) argv, "type", &typeChoice); if (0 == typeChoice) { typeChoice = (char*)malloc(4 * sizeof(char)); strcpy(typeChoice, "int"); } ReduceType datatype = REDUCE_INT; if (!strcasecmp(typeChoice, "float")) datatype = REDUCE_FLOAT; else if (!strcasecmp(typeChoice, "double")) datatype = REDUCE_DOUBLE; else datatype = REDUCE_INT; cudaDeviceProp deviceProp; deviceProp.major = 1; deviceProp.minor = 0; int minimumComputeVersion = 10; if (datatype == REDUCE_DOUBLE) { deviceProp.minor = 3; minimumComputeVersion = 13; } int dev; if(!cutCheckCmdLineFlag(argc, (const char**)argv, "method") ) { fprintf(stderr, "MISSING --method FLAG.\nYou must provide --method={ SUM | MIN | MAX }.\n"); exit(1); } if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) { cutilDeviceInit(argc, argv); cutilSafeCallNoSync(cudaGetDevice(&dev)); } else { cutilSafeCallNoSync(cudaChooseDevice(&dev, &deviceProp)); } cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev)); if((deviceProp.major * 10 + deviceProp.minor) >= minimumComputeVersion) { shrLog("Using Device %d: %s\n\n", dev, deviceProp.name); cutilSafeCallNoSync(cudaSetDevice(dev)); } else { shrLog("Error: the selected device does not support the minimum compute capability of %d.%d.\n\n", minimumComputeVersion / 10, minimumComputeVersion % 10); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } shrLog("Reducing array of type %s\n\n", typeChoice); bool bResult = false; switch (datatype) { default: case REDUCE_INT: if (strcmp("SUM", reduceMethod) == 0) { bResult = runTestSum<int>( argc, argv, datatype); } else if ( strcmp("MAX", reduceMethod) == 0 ) { bResult = runTestMax<int>( argc, argv, datatype); } else if ( strcmp("MIN", reduceMethod) == 0 ) { bResult = runTestMin<int>( argc, argv, datatype); } else { fprintf(stderr, "No --method specified!\n"); exit(1); } break; case REDUCE_FLOAT: if (strcmp("SUM", reduceMethod) == 0) { bResult = runTestSum<float>( argc, argv, datatype); } else if ( strcmp("MAX", reduceMethod) == 0 ) { bResult = runTestMax<float>( argc, argv, datatype); } else if ( strcmp("MIN", reduceMethod) == 0 ) { bResult = runTestMin<float>( argc, argv, datatype); } else { fprintf(stderr, "No --method specified!\n"); exit(1); } break; case REDUCE_DOUBLE: if (strcmp("SUM", reduceMethod) == 0) { bResult = runTestSum<double>( argc, argv, datatype); } else if ( strcmp("MAX", reduceMethod) == 0 ) { bResult = runTestMax<double>( argc, argv, datatype); } else if ( strcmp("MIN", reduceMethod) == 0 ) { bResult = runTestMin<double>( argc, argv, datatype); } else { fprintf(stderr, "No --method specified!\n"); exit(1); } break; } cutilDeviceReset(); shrQAFinishExit(argc, (const char**)argv, (bResult ? QA_PASSED : QA_FAILED)); }
int main( int argc, char **argv ) { uchar *h_Data; uint *h_HistogramCPU, *h_HistogramGPU; uchar *d_Data; uint *d_Histogram; uint hTimer; int PassFailFlag = 1; uint byteCount = 64 * 1048576; uint uiSizeMult = 1; cudaDeviceProp deviceProp; deviceProp.major = 0; deviceProp.minor = 0; int dev; shrQAStart(argc, argv); // set logfile name and start logs shrSetLogFileName ("histogram.txt"); //Use command-line specified CUDA device, otherwise use device with highest Gflops/s if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) { dev = cutilDeviceInit(argc, argv); if (dev < 0) { printf("No CUDA Capable Devices found, exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } } else { cudaSetDevice( dev = cutGetMaxGflopsDeviceId() ); cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) ); } cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) ); printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); int version = deviceProp.major * 0x10 + deviceProp.minor; if(version < 0x11) { printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n"); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_WAIVED); } cutilCheckError(cutCreateTimer(&hTimer)); // Optional Command-line multiplier to increase size of array to histogram if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult)) { uiSizeMult = CLAMP(uiSizeMult, 1, 10); byteCount *= uiSizeMult; } shrLog("Initializing data...\n"); shrLog("...allocating CPU memory.\n"); h_Data = (uchar *)malloc(byteCount); h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint)); shrLog("...generating input data\n"); srand(2009); for(uint i = 0; i < byteCount; i++) h_Data[i] = rand() % 256; shrLog("...allocating GPU memory and copying input data\n\n"); cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount ) ); cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint) ) ); cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) ); //----- // 64 bin histogram //------ { shrLog("Starting up 64-bin histogram...\n\n"); initHistogram64(); shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram64(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram64CPU()\n"); histogram64CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results...\n"); for(uint i = 0; i < 
HISTOGRAM64_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 64-bin histogram...\n\n\n"); closeHistogram64(); } //----- // Histogram 256 //----- { shrLog("Initializing 256-bin histogram...\n"); initHistogram256(); shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++){ //iter == -1 -- warmup iteration if(iter == 0){ cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogram256(d_Histogram, d_Data, byteCount); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram256CPU()\n"); histogram256CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results\n"); for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 256-bin histogram...\n\n\n"); closeHistogram256(); } //----- // Histogram Trish //----- { shrLog("Initializing 256-bin TRISH histogram...\n"); initTrish256(); shrLog("Running 256-bin GPU TRISH histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns); for(int iter = -1; iter < numRuns; iter++) { //iter == -1 -- warmup iteration if (iter == 0) { cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } histogramTrish256( d_Histogram, d_Data, byteCount ); } cutilSafeCall( cutilDeviceSynchronize() ); cutilCheckError( cutStopTimer(hTimer)); double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns; shrLog("histogramTRISH() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs); shrLogEx(LOGBOTH | MASTER, 0, "histogramTRISH, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); shrLog("\nValidating GPU results...\n"); shrLog(" ...reading back GPU results\n"); cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) ); shrLog(" ...histogram256CPU()\n"); histogram256CPU( h_HistogramCPU, h_Data, byteCount ); shrLog(" ...comparing the results\n"); for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++) if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0; shrLog(PassFailFlag ? 
" ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" ); shrLog("Shutting down 256-bin TRISH histogram...\n\n\n"); closeTrish256(); } shrLog("Shutting down...\n"); cutilCheckError(cutDeleteTimer(hTimer)); cutilSafeCall( cudaFree(d_Histogram) ); cutilSafeCall( cudaFree(d_Data) ); free(h_HistogramGPU); free(h_HistogramCPU); free(h_Data); cutilDeviceReset(); shrLog("%s - Test Summary\n", sSDKsample ); // pass or fail (for both 64 bit and 256 bit histograms) shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED)); }
// Main function // ********************************************************************* int main(int argc, char **argv) { shrQAStart(argc, argv); int NUM_BLOCKS = 10; shrSetLogFileName ("Barrier_Centralized.txt"); while(NUM_BLOCKS<=120) { int iNumElements = NUM_BLOCKS* NUM_THREADS; // total num of threads // BARRIER GOAL int goal_val = NUM_BLOCKS; // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); // start logs cExecutableName = argv[0]; shrSetLogFileName ("Barrier.txt"); shrLog("%s Starting...\n\n# of THREADS \t= %i\n", argv[0], iNumElements); // set and log Global and Local work size dimensions szLocalWorkSize = NUM_THREADS ; szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); //Get an OpenCL platform ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL); shrLog("clGetPlatformID...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Get the devices ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); shrLog("clGetDeviceIDs...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1); shrLog("clCreateContext...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErr1); shrLog("clCreateCommandQueue...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); // Create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1); shrLog("clCreateProgramWithSource...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif ciErr1 = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL); shrLog("clBuildProgram...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Create the kernel ckKernel = clCreateKernel(cpProgram, "Barrier", &ciErr1); shrLog("clCreateKernel (Barrier)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Allocate and initialize host arrays shrLog( "Allocate and Init Host Mem...\n"); input = (int *)malloc(sizeof(int) * NUM_BLOCKS); for(int i = 0; i < NUM_BLOCKS; i++) { input[i]=0;
} // Allocate the OpenCL buffer memory objects for source and result on the device GMEM array_in = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1); array_out = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Set the Argument values ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_int), (void*)&goal_val); ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&array_in); ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&array_out); // ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_int), (void*)&iNumElements); shrLog("clSetKernelArg 0 - 2...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // -------------------------------------------------------- // Start Core sequence... copy input data to GPU, compute, copy results back ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, array_in, CL_FALSE, 0, sizeof(int) * NUM_BLOCKS,(void*) input, 0, NULL, NULL); shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Launch kernel ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent); shrLog("clEnqueueNDRangeKernel (Barrier)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } /*ciErr1 = clEnqueueReadBuffer(cqCommandQueue, global_mutex, CL_TRUE, 0, sizeof(cl_int), &original_goal, 0, NULL, NULL); shrLog("clEnqueueReadBuffer (Dst)...%d \n\n", original_goal); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); }*/ //GPU_PROFILING ciErr1=clWaitForEvents(1, &ceEvent); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clWaitForEvents, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } cl_ulong start, end; ciErr1 = clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); ciErr1 |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetEventProfilingInfo, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } double dSeconds = 1.0e-9 * (double)(end - start); shrLog("Done! time taken %llu ns\n", (unsigned long long)(end - start) ); // shrLog("Done! Kernel execution time: %.5f s\n\n", dSeconds); // Release event clReleaseEvent(ceEvent); ceEvent = 0; Cleanup (argc, argv, EXIT_SUCCESS); NUM_BLOCKS = NUM_BLOCKS+10; } shrQAFinishExit(argc, (const char **)argv, QA_PASSED); }
int main(int argc, char **argv) { shrQAStart(argc, argv); if (argc > 1) { if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") || cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { g_bQAReadback = true; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) { g_bOpenGLQA = true; g_bFBODisplay = false; fpsLimit = frameCheckNumber; } if (cutCheckCmdLineFlag(argc, (const char **)argv, "fbo")) { g_bFBODisplay = true; fpsLimit = frameCheckNumber; } } if (g_bQAReadback) { runAutoTest(argc, argv); } else { printf("[%s] ", sSDKsample); if (g_bFBODisplay) printf("[FBO Display] "); if (g_bOpenGLQA) printf("[OpenGL Readback Comparisons] "); printf("\n"); // use command-line specified CUDA device, otherwise use device with highest Gflops/s if ( cutCheckCmdLineFlag(argc, (const char **)argv, "device")) { printf("[%s]\n", argv[0]); printf(" Does not explicitly support -device=n in OpenGL mode\n"); printf(" To use -device=n, the sample must be running w/o OpenGL\n\n"); printf(" > %s -device=n -qatest\n", argv[0]); printf("exiting...\n"); exit(0); } // First load the image, so we know what the size of the image (imageW and imageH) printf("Allocating host and CUDA memory and loading image file...\n"); const char *image_path = cutFindFilePath("portrait_noise.bmp", argv[0]); if (image_path == NULL) { printf( "imageDenoisingGL was unable to find and load image file <portrait_noise.bmp>.\nExiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } LoadBMPFile(&h_Src, &imageW, &imageH, image_path); printf("Data init done.\n"); // First initialize OpenGL context, so we can properly set the GL for CUDA. // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. initGL( &argc, argv ); cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); cutilSafeCall( CUDA_MallocArray(&h_Src, imageW, imageH) ); initOpenGLBuffers(); // Creating the Auto-Validation Code if (g_bOpenGLQA) { if (g_bFBODisplay) { g_CheckRender = new CheckFBO(imageW, imageH, 4); } else { g_CheckRender = new CheckBackBuffer(imageW, imageH, 4); } g_CheckRender->setPixelFormat(GL_RGBA); g_CheckRender->setExecPath(argv[0]); g_CheckRender->EnableQAReadback(g_bOpenGLQA); } } printf("Starting GLUT main loop...\n"); printf("Press [1] to view noisy image\n"); printf("Press [2] to view image restored with knn filter\n"); printf("Press [3] to view image restored with nlm filter\n"); printf("Press [4] to view image restored with modified nlm filter\n"); printf("Press [ ] to view smooth/edgy areas [RED/BLUE] Ct's\n"); printf("Press [f] to print frame rate\n"); printf("Press [?] to print Noise and Lerp Ct's\n"); printf("Press [q] to exit\n"); glutDisplayFunc(displayFunc); glutKeyboardFunc(shutDown); cutilCheckError( cutCreateTimer(&hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); glutTimerFunc(REFRESH_DELAY, timerEvent,0); glutMainLoop(); cutilDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); }
int main(int argc, char **argv) { int M = 0, N = 0, nz = 0, *I = NULL, *J = NULL; float *val = NULL; const float tol = 1e-5f; const int max_iter = 10000; float *x; float *rhs; float a, b, na, r0, r1; int *d_col, *d_row; float *d_val, *d_x, dot; float *d_r, *d_p, *d_Ax; int k; float alpha, beta, alpham1; shrQAStart(argc, argv); // This will pick the best possible CUDA capable device cudaDeviceProp deviceProp; int devID = findCudaDevice(argc, (const char **)argv); if (devID < 0) { printf("exiting...\n"); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); exit(0); } checkCudaErrors( cudaGetDeviceProperties(&deviceProp, devID) ); // Statistics about the GPU device printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); int version = (deviceProp.major * 0x10 + deviceProp.minor); if(version < 0x11) { printf("%s: requires a minimum CUDA compute 1.1 capability\n", sSDKname); cudaDeviceReset(); shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } /* Generate a random tridiagonal symmetric matrix in CSR format */ M = N = 1048576; nz = (N-2)*3 + 4; I = (int*)malloc(sizeof(int)*(N+1)); J = (int*)malloc(sizeof(int)*nz); val = (float*)malloc(sizeof(float)*nz); genTridiag(I, J, val, N, nz); x = (float*)malloc(sizeof(float)*N); rhs = (float*)malloc(sizeof(float)*N); for (int i = 0; i < N; i++) { rhs[i] = 1.0; x[i] = 0.0; } /* Get handle to the CUBLAS context */ cublasHandle_t cublasHandle = 0; cublasStatus_t cublasStatus; cublasStatus = cublasCreate(&cublasHandle); if ( checkCublasStatus (cublasStatus, "!!!! CUBLAS initialization error\n") ) return EXIT_FAILURE; /* Get handle to the CUSPARSE context */ cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); if ( checkCusparseStatus (cusparseStatus, "!!!! CUSPARSE initialization error\n") ) return EXIT_FAILURE; cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); if ( checkCusparseStatus (cusparseStatus, "!!!! 
CUSPARSE cusparseCreateMatDescr error\n") ) return EXIT_FAILURE; cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); checkCudaErrors( cudaMalloc((void**)&d_col, nz*sizeof(int)) ); checkCudaErrors( cudaMalloc((void**)&d_row, (N+1)*sizeof(int)) ); checkCudaErrors( cudaMalloc((void**)&d_val, nz*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&d_x, N*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&d_r, N*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&d_p, N*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&d_Ax, N*sizeof(float)) ); cudaMemcpy(d_col, J, nz*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_row, I, (N+1)*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_val, val, nz*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_r, rhs, N*sizeof(float), cudaMemcpyHostToDevice); alpha = 1.0; alpham1 = -1.0; beta = 0.0; r0 = 0.; cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, d_val, d_row, d_col, d_x, &beta, d_Ax); cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1); cublasStatus = cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1); k = 1; while (r1 > tol*tol && k <= max_iter) { if (k > 1) { b = r1 / r0; cublasStatus = cublasSscal(cublasHandle, N, &b, d_p, 1); cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1); } else { cublasStatus = cublasScopy(cublasHandle, N, d_r, 1, d_p, 1); } cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax); cublasStatus = cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, &dot); a = r1 / dot; cublasStatus = cublasSaxpy(cublasHandle, N, &a, d_p, 1, d_x, 1); na = -a; cublasStatus = cublasSaxpy(cublasHandle, N, &na, d_Ax, 1, d_r, 1); r0 = r1; cublasStatus = cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1); cudaThreadSynchronize(); printf("iteration = %3d, residual = %e\n", k, sqrt(r1)); k++; } cudaMemcpy(x, d_x, N*sizeof(float), cudaMemcpyDeviceToHost); float rsum, diff, err = 0.0; for (int i = 0; i < N; i++) { rsum = 0.0; for (int j = I[i]; j < I[i+1]; j++) { rsum += val[j]*x[J[j]]; } diff = fabs(rsum - rhs[i]); if (diff > err) err = diff; } cusparseDestroy(cusparseHandle); cublasDestroy(cublasHandle); free(I); free(J); free(val); free(x); free(rhs); cudaFree(d_col); cudaFree(d_row); cudaFree(d_val); cudaFree(d_x); cudaFree(d_r); cudaFree(d_p); cudaFree(d_Ax); cudaDeviceReset(); printf("Test Summary: Error amount = %f\n", err); shrQAFinishExit(argc, (const char **)argv, (k <= max_iter) ? QA_PASSED : QA_FAILED ); }
int main(int argc, char* argv[]) { shrQAStart(argc, argv); try { std::string sFilename; char *filePath = findFilePath("Lena.pgm", argv[0]); if (filePath) { sFilename = filePath; } else { printf("Error unable to find Lena.pgm\n"); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } // Parse the command line arguments for proper configuration parseCommandLineArguments(argc, argv); printfNPPinfo(argc, argv); if (g_bQATest == false && (g_nDevice == -1) && argc > 1) { sFilename = argv[1]; } // if we specify the filename at the command line, then we only test sFilename. int file_errors = 0; std::ifstream infile(sFilename.data(), std::ifstream::in); if (infile.good()) { std::cout << "histEqualizationNPP opened: <" << sFilename.data() << "> successfully!" << std::endl; file_errors = 0; infile.close(); } else { std::cout << "histEqualizationNPP unable to open: <" << sFilename.data() << ">" << std::endl; file_errors++; infile.close(); } if (file_errors > 0) { shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } std::string dstFileName = sFilename; std::string::size_type dot = dstFileName.rfind('.'); if (dot != std::string::npos) dstFileName = dstFileName.substr(0, dot); dstFileName += "_histEqualization.pgm"; if (argc >= 3 && !g_bQATest) dstFileName = argv[2]; npp::ImageCPU_8u_C1 oHostSrc; npp::loadImage(sFilename, oHostSrc); npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc); // // allocate arrays for histogram and levels // const int binCount = 256; const int levelCount = binCount + 1; // levels array has one more element Npp32s * histDevice = 0; Npp32s * levelsDevice = 0; NPP_CHECK_CUDA(cudaMalloc((void **)&histDevice, binCount * sizeof(Npp32s))); NPP_CHECK_CUDA(cudaMalloc((void **)&levelsDevice, levelCount * sizeof(Npp32s))); // // compute histogram // NppiSize oSizeROI = {oDeviceSrc.width(), oDeviceSrc.height()}; // full image // create device scratch buffer for nppiHistogram int nDeviceBufferSize; nppiHistogramEvenGetBufferSize_8u_C1R(oSizeROI, levelCount ,&nDeviceBufferSize); Npp8u * pDeviceBuffer; NPP_CHECK_CUDA(cudaMalloc((void **)&pDeviceBuffer, nDeviceBufferSize)); // compute levels values on host Npp32s levelsHost[levelCount]; NPP_CHECK_NPP(nppiEvenLevelsHost_32s(levelsHost, levelCount, 0, binCount)); // compute the histogram NPP_CHECK_NPP(nppiHistogramEven_8u_C1R(oDeviceSrc.data(), oDeviceSrc.pitch(), oSizeROI, histDevice, levelCount, 0, binCount, pDeviceBuffer)); // copy histogram and levels to host memory Npp32s histHost[binCount]; NPP_CHECK_CUDA(cudaMemcpy(histHost, histDevice, binCount * sizeof(Npp32s), cudaMemcpyDeviceToHost)); Npp32s lutHost[binCount + 1]; // fill LUT { Npp32s * pHostHistogram = histHost; Npp32s totalSum = 0; for (; pHostHistogram < histHost + binCount; ++pHostHistogram) totalSum += *pHostHistogram; NPP_ASSERT(totalSum == oSizeROI.width * oSizeROI.height); if (totalSum == 0) totalSum = 1; float multiplier = 1.0f / float(totalSum) * 0xFF; Npp32s runningSum = 0; Npp32s * pLookupTable = lutHost; for (pHostHistogram = histHost; pHostHistogram < histHost + binCount; ++pHostHistogram) { *pLookupTable = (Npp32s)(runningSum * multiplier + 0.5f); pLookupTable++; runningSum += *pHostHistogram; } lutHost[binCount] = 0xFF; // last element is always 1 } // // apply LUT transformation to the image // // Create a device image for the result. 
npp::ImageNPP_8u_C1 oDeviceDst(oDeviceSrc.size()); NPP_CHECK_NPP(nppiLUT_Linear_8u_C1R(oDeviceSrc.data(), oDeviceSrc.pitch(), oDeviceDst.data(), oDeviceDst.pitch(), oSizeROI, lutHost, // value and level arrays are in host memory levelsHost, binCount+1)); // copy the result image back into the storage that contained the // input image npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size()); oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch()); // save the result npp::saveImage(dstFileName.c_str(), oHostDst); std::cout << "Saved image file " << dstFileName << std::endl; shrQAFinishExit(argc, (const char **)argv, QA_PASSED); } catch (npp::Exception & rException) { std::cerr << "Program error! The following exception occurred: \n"; std::cerr << rException << std::endl; std::cerr << "Aborting." << std::endl; shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } catch (...) { std::cerr << "Program error! An unknown type of exception occurred. \n"; std::cerr << "Aborting." << std::endl; shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } return 0; }
// Main function // ********************************************************************* int main( int argc, const char** argv) { shrQAStart(argc, (char **)argv); // start logs shrSetLogFileName ("oclReduction.txt"); shrLog("%s Starting...\n\n", argv[0]); char *typeChoice; shrGetCmdLineArgumentstr(argc, argv, "type", &typeChoice); // determine type of array from command line args if (0 == typeChoice) { typeChoice = (char*)malloc(7 * sizeof(char)); #ifdef WIN32 strcpy_s(typeChoice, 7 * sizeof(char), "int"); #else strcpy(typeChoice, "int"); #endif } ReduceType datatype = REDUCE_INT; #ifdef WIN32 if (!_strcmpi(typeChoice, "float")) datatype = REDUCE_FLOAT; else if (!_strcmpi(typeChoice, "double")) datatype = REDUCE_DOUBLE; else datatype = REDUCE_INT; #else if (!strcmp(typeChoice, "float")) datatype = REDUCE_FLOAT; else if (!strcmp(typeChoice, "double")) datatype = REDUCE_DOUBLE; else datatype = REDUCE_INT; #endif shrLog("Reducing array of type %s.\n", typeChoice); //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); //Get the devices ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices); oclCheckError(ciErrNum, CL_SUCCESS); cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL); oclCheckError(ciErrNum, CL_SUCCESS); //Create the context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // get and log the device info if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) { int device_nr = 0; shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &device_nr); if( device_nr < uiNumDevices ) { device = oclGetDev(cxGPUContext, device_nr); } else { shrLog("Invalid Device %d Requested.\n", device_nr); shrExitEX(argc, argv, EXIT_FAILURE); } } else { device = oclGetMaxFlopsDev(cxGPUContext); } oclPrintDevName(LOGBOTH, device); shrLog("\n"); // create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); source_path = shrFindFilePath("oclReduction_kernel.cl", argv[0]); bool bSuccess = false; switch (datatype) { default: case REDUCE_INT: bSuccess = runTest<int>( argc, argv, datatype); break; case REDUCE_FLOAT: bSuccess = runTest<float>( argc, argv, datatype); break; } // finish shrQAFinishExit(argc, (const char **)argv, bSuccess ? QA_PASSED : QA_FAILED); }