// Helper to trigger reset of fps vars at transition //***************************************************************************** void TriggerFPSUpdate() { iFrameCount = 0; iFramesPerSec = 1; iFrameTrigger = 2; shrDeltaT(1); shrDeltaT(0); dProcessingTime = 0.0; }
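// NOTE: shrDeltaT comes from NVIDIA's shrUtils and its source is not included in
// this collection. The snippets here all rely on the same behavior: each call to
// shrDeltaT(id) returns the wall-clock seconds elapsed since the previous call
// with the same timer id, so a lone call "resets" the timer and the next call
// reads it. A minimal behavioral sketch (an assumption, not the shrUtils code):
#include <chrono>

static double myDeltaT(int iTimerID)   // hypothetical stand-in for shrDeltaT (ids 0..15)
{
    static std::chrono::steady_clock::time_point tLast[16];   // one slot per timer id
    std::chrono::steady_clock::time_point tNow = std::chrono::steady_clock::now();
    double dElapsed = std::chrono::duration<double>(tNow - tLast[iTimerID]).count();
    tLast[iTimerID] = tNow;   // the next call with this id measures from here
    return dElapsed;
}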
///////////////////////////////////////////////////////////////////////////////
// test the bandwidth of a device to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
double testDeviceToDeviceTransfer(unsigned int memSize)
{
    double elapsedTimeInSec = 0.0;
    double bandwidthInMBs = 0.0;
    unsigned char* h_idata = NULL;
    cl_int ciErrNum = CL_SUCCESS;

    // allocate host memory
    h_idata = (unsigned char *)malloc( memSize );

    // initialize the memory
    for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
    {
        h_idata[i] = (unsigned char) (i & 0xff);
    }

    // allocate device input and output memory and initialize the device input memory
    cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, memSize, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, memSize, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, d_idata, CL_TRUE, 0, memSize, h_idata, 0, NULL, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    // Sync queue to host, start timer 0, and copy data from one GPU buffer to another GPU buffer
    clFinish(cqCommandQueue);
    shrDeltaT(0);
    for(unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
    {
        ciErrNum = clEnqueueCopyBuffer(cqCommandQueue, d_idata, d_odata, 0, 0, memSize, 0, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);
    }

    // Sync with GPU
    clFinish(cqCommandQueue);

    // get the elapsed time in seconds
    elapsedTimeInSec = shrDeltaT(0);

    // Calculate bandwidth in MB/s
    // This is for kernels that read and write GMEM simultaneously
    // Obtained Throughput for unidirectional block copies will be 1/2 of this #
    bandwidthInMBs = 2.0 * ((double)memSize * (double)MEMCOPY_ITERATIONS)/(elapsedTimeInSec * (double)(1 << 20));

    // clean up memory on host and device
    free(h_idata);
    clReleaseMemObject(d_idata);
    clReleaseMemObject(d_odata);

    return bandwidthInMBs;
}
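// The 2.0 factor in the bandwidth formula above counts both the read and the
// write that each buffer-to-buffer copy performs, and (1 << 20) converts bytes
// to MiB. The same arithmetic as a standalone helper (illustrative only; the
// helper name is not part of the sample):
static double copyBandwidthMBs(size_t memSize, unsigned int iterations, double elapsedSec)
{
    return 2.0 * ((double)memSize * (double)iterations) / (elapsedSec * (double)(1 << 20));
}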
// Helper to trigger reset of fps vars at transition //***************************************************************************** void TriggerFPSUpdate() { iFrameCount = 0; shrDeltaT(1); iFramesPerSec = 1; iFrameTrigger = 2; }
// Set args and launch the postprocess kernel
//*****************************************************************************
int executeKernel(cl_int radius)
{
    // set global and local work item dimensions
    szLocalWorkSize[0] = 16;
    szLocalWorkSize[1] = 16;
    szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], image_width);
    szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], image_height);

    // set the args values
    cl_int tilew = (cl_int)szLocalWorkSize[0]+(2*radius);
    ciErrNum = clSetKernelArg(ckKernel, 4, sizeof(tilew), &tilew);
    ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(radius), &radius);
    cl_float threshold = 0.8f;
    ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(threshold), &threshold);
    cl_float highlight = 4.0f;
    ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(highlight), &highlight);

    // Local memory, sized for the maximum supported radius of 16 (radius is assumed <= 16)
    ciErrNum |= clSetKernelArg(ckKernel, 8, (szLocalWorkSize[0]+(2*16))*(szLocalWorkSize[1]+(2*16))*sizeof(int), NULL);

    // launch computation kernel
#ifdef GPU_PROFILING
    int nIter = 30;
    for( int i = -1; i < nIter; ++i)
    {
        if( i == 0 ) shrDeltaT(0);   // skip the first (warmup) iteration, then start timer 0
#endif

        ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL,
                                           szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);

#ifdef GPU_PROFILING
    }
    clFinish(cqCommandQueue);
    double dSeconds = shrDeltaT(0)/(double)nIter;
    double dNumTexels = (double)image_width * (double)image_height;
    double mtexps = 1.0e-6 * dNumTexels/dSeconds;
    shrLogEx(LOGBOTH | MASTER, 0, "oclPostprocessGL, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %.0f Texels, NumDevsUsed = %u, Workgroup = %u\n",
             mtexps, dSeconds, dNumTexels, uiNumDevsUsed, szLocalWorkSize[0] * szLocalWorkSize[1]);
#endif

    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    return 0;
}
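// shrRoundUp (shrUtils, source not shown here) pads the global NDRange up to a
// whole multiple of the workgroup size, so the 2D launch above covers the image
// even when image_width/image_height are not multiples of 16. A sketch of the
// assumed behavior:
static int myRoundUp(int iGroupSize, int iGlobalSize)   // hypothetical stand-in for shrRoundUp
{
    int iRem = iGlobalSize % iGroupSize;
    return (iRem == 0) ? iGlobalSize : iGlobalSize + iGroupSize - iRem;
}
// e.g. myRoundUp(16, 1000) == 1008; the kernel must bounds-check the padded work items.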
// Run a test sequence without any GL //***************************************************************************** void TestNoGL() { // Warmup call to assure OpenCL driver is awake // note this function has a finish for all queues at its end, so no further host sync is needed SobelFilterGPU (uiInput, uiOutput); // Start timer 0 and process n loops on the GPU const int iCycles = 150; dProcessingTime = 0.0; shrLog("\nRunning SobelFilterGPU for %d cycles...\n\n", iCycles); shrDeltaT(2); for (int i = 0; i < iCycles; i++) { // note this function has a finish for all queues at its end, so no further host sync is needed dProcessingTime += SobelFilterGPU (uiInput, uiOutput); } // Get round-trip and average computation time double dRoundtripTime = shrDeltaT(2)/(double)iCycles; dProcessingTime /= (double)iCycles; // log throughput, timing and config info to sample and master logs shrLogEx(LOGBOTH | MASTER, 0, "oclSobelFilter, Throughput = %.4f M RGB Pixels/s, Time = %.5f s, Size = %u RGB Pixels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * uiImageWidth * uiImageHeight)/dProcessingTime, dProcessingTime, (uiImageWidth * uiImageHeight), GpuDevMngr->uiUsefulDevCt, szLocalWorkSize[0] * szLocalWorkSize[1]); shrLog("\nRoundTrip Time = %.5f s, Equivalent FPS = %.1f\n\n", dRoundtripTime, 1.0/dRoundtripTime); // Compute on host cl_uint* uiGolden = (cl_uint*)malloc(szBuffBytes); SobelFilterHost(uiInput, uiGolden, uiImageWidth, uiImageHeight, fThresh); // Compare GPU and Host results: Allow variance of 1 GV in up to 0.01% of pixels shrLog("Comparing GPU Result to CPU Result...\n"); shrBOOL bMatch = shrCompareuit(uiGolden, uiOutput, (uiImageWidth * uiImageHeight), 1.0f, 0.0001f); shrLog("\nGPU Result %s CPU Result within tolerance...\n", (bMatch == shrTRUE) ? "matches" : "DOESN'T match"); // Cleanup and exit free(uiGolden); Cleanup((bMatch == shrTRUE) ? EXIT_SUCCESS : EXIT_FAILURE); }
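// shrCompareuit is external to this collection; per the comment above ("variance
// of 1 GV in up to 0.01% of pixels"), a sketch of a comparable tolerance check,
// assuming epsilon is the max per-value difference and threshold the allowed
// fraction of mismatching values (hypothetical helper, not the shrUtils code):
static shrBOOL myCompareUint(const cl_uint* ref, const cl_uint* data, unsigned int uiLen,
                             float fEpsilon, float fThreshold)
{
    unsigned int uiErrCount = 0;
    for (unsigned int i = 0; i < uiLen; i++)
    {
        unsigned int uiDiff = (ref[i] > data[i]) ? (ref[i] - data[i]) : (data[i] - ref[i]);
        if ((float)uiDiff > fEpsilon) uiErrCount++;
    }
    return ((float)uiErrCount / (float)uiLen <= fThreshold) ? shrTRUE : shrFALSE;
}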
int genTimer(int timerID) { #ifdef NV shrDeltaT(timerID); return timerID; #else int timer = sampleCommon->createTimer(); sampleCommon->resetTimer(timer); sampleCommon->startTimer(timer); return timer; #endif }
// QATest sequence without any GL calls //***************************************************************************** void TestNoGL() { // Warmup call to assure OpenCL driver is awake psystem->update(timestep); // Start timer 0 and process n loops on the GPU const int iCycles = 20; shrDeltaT(0); for (int i = 0; i < iCycles; i++) { psystem->update(timestep); } // Get elapsed time and throughput, then log to sample and master logs double dAvgTime = shrDeltaT(0)/(double)iCycles; shrLogEx(LOGBOTH | MASTER, 0, "oclParticles, Throughput = %.4f KParticles/s, Time = %.5f s, Size = %u particles, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-3 * numParticles)/dAvgTime, dAvgTime, numParticles, 1, 0); // Cleanup and exit shrQAFinish2(true, *pArgc, (const char **)pArgv, QA_PASSED); Cleanup (EXIT_SUCCESS); }
void RunProfiling(int iterations, unsigned int uiWorkgroup) { // once without timing to prime the GPU nbody->update(activeParams.m_timestep); nbody->synchronizeThreads(); // Start timer 0 and process n loops on the GPU shrDeltaT(FUNCTIME); for (int i = 0; i < iterations; ++i) { nbody->update(activeParams.m_timestep); } nbody->synchronizeThreads(); // Get elapsed time and throughput, then log to sample and master logs double dSeconds = shrDeltaT(FUNCTIME); double dGigaInteractionsPerSecond = 0.0; double dGigaFlops = 0.0; ComputePerfStats(dGigaInteractionsPerSecond, dGigaFlops, dSeconds, iterations); shrLogEx(LOGBOTH | MASTER, 0, "oclNBody-%s, Throughput = %.4f GFLOP/s, Time = %.5f s, Size = %u bodies, NumDevsUsed = %u, Workgroup = %u\n", (bDouble ? "DP" : "SP"), dGigaFlops, dSeconds/(double)iterations, numBodies, uiNumDevsUsed, uiWorkgroup); }
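// ComputePerfStats is not shown in this collection. A plausible sketch, based on
// the flopsPerInteraction = (bDouble ? 30 : 20) setting in main() below: an
// all-pairs N-body step performs numBodies^2 interactions per iteration
// (numBodies and flopsPerInteraction are the sample's globals):
static void myComputePerfStats(double &dGInteractionsPerSec, double &dGFlops,
                               double dSeconds, int iIterations)
{
    dGInteractionsPerSec = 1.0e-9 * (double)numBodies * (double)numBodies
                         * (double)iIterations / dSeconds;   // giga-interactions per second
    dGFlops = dGInteractionsPerSec * (double)flopsPerInteraction;
}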
//*****************************************************************************
void SelectDemo(int index)
{
    oclCheckErrorEX((index < numDemos), shrTRUE, pCleanup);

    activeParams = demoParams[index];
    camera_trans[0] = camera_trans_lag[0] = activeParams.m_x;
    camera_trans[1] = camera_trans_lag[1] = activeParams.m_y;
    camera_trans[2] = camera_trans_lag[2] = activeParams.m_z;
    ResetSim(nbody, numBodies, NBODY_CONFIG_SHELL, true);

    // Reset the demo timer
    shrDeltaT(DEMOTIME);
}
double getTimerNoSync(int timer) { //clFinish(cqCommandQueue); #ifdef NV double elapsedTimeInSec = shrDeltaT(timer); return elapsedTimeInSec; #else sampleCommon->stopTimer(timer); double writeTime = (double)(sampleCommon->readTimer(timer)); sampleCommon->resetTimer(timer); sampleCommon->startTimer(timer); return writeTime; #endif }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple benchmark test for CUDA //////////////////////////////////////////////////////////////////////////////// void runBenchmark( int argc, char **argv ) { int devID = 0; shrLog("[runBenchmark]: [%s]\n", sSDKsample); devID = cutilChooseCudaDevice(argc, argv); loadImageData(argc, argv); initCuda(); g_CheckRender = new CheckBackBuffer(width, height, 4, false); g_CheckRender->setExecPath(argv[0]); unsigned int *d_result; cutilSafeCall( cudaMalloc( (void **)&d_result, width*height*sizeof(unsigned int)) ); // warm-up boxFilterRGBA(d_img, d_temp, d_temp, width, height, filter_radius, iterations, nthreads); cutilSafeCall( cutilDeviceSynchronize() ); // Start round-trip timer and process iCycles loops on the GPU iterations = 1; // standard 1-pass filtering const int iCycles = 150; double dProcessingTime = 0.0; shrLog("\nRunning BoxFilterGPU for %d cycles...\n\n", iCycles); shrDeltaT(2); for (int i = 0; i < iCycles; i++) { dProcessingTime += boxFilterRGBA(d_img, d_temp, d_img, width, height, filter_radius, iterations, nthreads); } // check if kernel execution generated an error and sync host cutilCheckMsg("Error: boxFilterRGBA Kernel execution FAILED"); cutilSafeCall(cutilDeviceSynchronize()); // Get average computation time dProcessingTime /= (double)iCycles; // log testname, throughput, timing and config info to sample and master logs shrLogEx(LOGBOTH | MASTER, 0, "boxFilter-texture, Throughput = %.4f M RGBA Pixels/s, Time = %.5f s, Size = %u RGBA Pixels, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * width * height)/dProcessingTime, dProcessingTime, (width * height), 1, nthreads); shrLog("\n"); }
double getTimer(int timer) { /*With Kernel Scheduler*/ // cl_int err1=clFinish(CommandQueue[0]); // cl_int err2=clFinish(CommandQueue[1]); // printf("getTimer calls finish commandqueue first"); #ifdef NV double elapsedTimeInSec = shrDeltaT(timer); return elapsedTimeInSec; #else sampleCommon->stopTimer(timer); double writeTime = (double)(sampleCommon->readTimer(timer)); sampleCommon->resetTimer(timer); sampleCommon->startTimer(timer); return writeTime; #endif }
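// Typical use of the genTimer/getTimer(NoSync) wrappers above: the caller times a
// region identically whether the build uses shrDeltaT (NV) or the sampleCommon
// timer objects. A usage sketch (doWork() stands in for any timed region):
void timedRegionExample()
{
    int timer = genTimer(0);            // create/reset/start; returns the id to read later
    doWork();                           // hypothetical workload
    double seconds = getTimer(timer);   // read elapsed time; also restarts the non-NV timer
    shrLog("Region took %.5f s\n", seconds);
}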
int main(int argc, const char **argv)
{
    cl_platform_id cpPlatform;                      // OpenCL platform
    cl_uint nDevice;                                // OpenCL device count
    cl_device_id* cdDevices;                        // OpenCL device list
    cl_context cxGPUContext;                        // OpenCL context
    cl_command_queue cqCommandQueue[MAX_GPU_COUNT]; // OpenCL command queue
    cl_int ciErrNum;

    shrSetLogFileName ("oclRadixSort.txt");
    shrLog("%s starting...\n\n", argv[0]);

    shrLog("clGetPlatformID...\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clGetDeviceIDs...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    cdDevices = (cl_device_id *)malloc(nDevice * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, nDevice, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clCreateContext...\n");
    cxGPUContext = clCreateContext(0, nDevice, cdDevices, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("Create command queue...\n\n");
    int id_device;
    if(shrGetCmdLineArgumenti(argc, argv, "device", &id_device))
    {
        // Set up a single command queue for the GPU specified on the command line
        cl_device_id cdDevice = cdDevices[id_device];
        cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        oclPrintDevInfo(LOGBOTH, cdDevice);
        nDevice = 1;
    }
    else
    {
        // create command queues for all available devices
        for (cl_uint i = 0; i < nDevice; i++)
        {
            cqCommandQueue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], 0, &ciErrNum);
            oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        }
        for (cl_uint i = 0; i < nDevice; i++) oclPrintDevInfo(LOGBOTH, cdDevices[i]);
    }

    int ctaSize;
    if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &ctaSize))
    {
        ctaSize = 128;
    }

    shrLog("Running Radix Sort on %d GPU(s) ...\n\n", nDevice);

    unsigned int numElements = 1048576;   // 1M elements (1 << 20)

    // Alloc and init some data on the host, then alloc and init GPU buffer
    unsigned int **h_keys       = (unsigned int**)malloc(nDevice * sizeof(unsigned int*));
    unsigned int **h_keysSorted = (unsigned int**)malloc(nDevice * sizeof(unsigned int*));
    cl_mem *d_keys              = (cl_mem* )malloc(nDevice * sizeof(cl_mem));
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        h_keys[iDevice]       = (unsigned int*)malloc(numElements * sizeof(unsigned int));
        h_keysSorted[iDevice] = (unsigned int*)malloc(numElements * sizeof(unsigned int));
        makeRandomUintVector(h_keys[iDevice], numElements, keybits);

        d_keys[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE,
                                         sizeof(unsigned int) * numElements, NULL, &ciErrNum);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0,
                                         sizeof(unsigned int) * numElements, h_keys[iDevice], 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    }

    // instantiate RadixSort objects
    RadixSort **radixSort = (RadixSort**)malloc(nDevice * sizeof(RadixSort*));
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        radixSort[iDevice] = new RadixSort(cxGPUContext, cqCommandQueue[iDevice], numElements,
                                           argv[0], ctaSize, true);
    }

#ifdef GPU_PROFILING
    int numIterations = 30;
    for (int i = -1; i < numIterations; i++)
    {
        // sync all queues and start timer 1 after the warmup (i == -1) pass
        if (i == 0)
        {
            for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
            {
                clFinish(cqCommandQueue[iDevice]);
            }
            shrDeltaT(1);
        }
#endif
        for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
        {
            radixSort[iDevice]->sort(d_keys[iDevice], 0, numElements, keybits);
        }
#ifdef GPU_PROFILING
    }
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clFinish(cqCommandQueue[iDevice]);
    }
    double gpuTime = shrDeltaT(1)/(double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclRadixSort, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements, NumDevsUsed = %d, Workgroup = %d\n",
             (1.0e-6 * (double)(nDevice * numElements)/gpuTime), gpuTime, nDevice * numElements, nDevice, ctaSize);
#endif

    // copy sorted keys to CPU
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clEnqueueReadBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0,
                            sizeof(unsigned int) * numElements, h_keysSorted[iDevice], 0, NULL, NULL);
    }

    // Check results
    bool passed = true;
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        passed &= verifySortUint(h_keysSorted[iDevice], NULL, h_keys[iDevice], numElements);
    }
    shrLog("\n%s\n\n", passed ? "PASSED" : "FAILED");

    // cleanup allocs
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clReleaseMemObject(d_keys[iDevice]);
        free(h_keys[iDevice]);
        free(h_keysSorted[iDevice]);
        delete radixSort[iDevice];
    }
    free(radixSort);
    free(h_keys);
    free(h_keysSorted);

    // remaining cleanup and exit
    free(cdDevices);
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clReleaseCommandQueue(cqCommandQueue[iDevice]);
    }
    clReleaseContext(cxGPUContext);

    shrEXIT(argc, argv);
}
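// verifySortUint is external to this collection; a minimal sketch of the check it
// must perform, assuming the NULL second argument above means "no values array,
// verify keys only" (a keys-only nondecreasing-order test):
static bool myVerifySortUint(const unsigned int *keysSorted, const void * /*values*/,
                             const unsigned int * /*keysUnsorted*/, unsigned int numElements)
{
    for (unsigned int i = 1; i < numElements; i++)
    {
        if (keysSorted[i - 1] > keysSorted[i]) return false;   // out of order
    }
    return true;
}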
// Main program
//*****************************************************************************
int main(int argc, char** argv)
{
    // Locals used with command line args
    int p = 256;   // workgroup X dimension
    int q = 1;     // workgroup Y dimension

    pArgc = &argc;
    pArgv = argv;

    shrQAStart(argc, argv);

    // latch the executable path for other funcs to use
    cExecutablePath = argv[0];

    // start logs and show command line help
    shrSetLogFileName ("oclNbody.txt");
    shrLog("%s Starting...\n\n", cExecutablePath);
    shrLog("Command line switches:\n");
    shrLog("  --qatest\t\tCheck correctness of GPU execution and measure performance\n");
    shrLog("  --noprompt\t\tQuit simulation automatically after a brief period\n");
    shrLog("  --n=<numbodies>\tSpecify # of bodies to simulate (default = %d)\n", numBodies);
    shrLog("  --double\t\tUse double precision floating point values for simulation\n");
    shrLog("  --p=<workgroup X dim>\tSpecify X dimension of workgroup (default = %d)\n", p);
    shrLog("  --q=<workgroup Y dim>\tSpecify Y dimension of workgroup (default = %d)\n\n", q);

    // Get command line arguments if there are any and set vars accordingly
    if (argc > 1)
    {
        shrGetCmdLineArgumenti(argc, (const char**)argv, "p", &p);
        shrGetCmdLineArgumenti(argc, (const char**)argv, "q", &q);
        shrGetCmdLineArgumenti(argc, (const char**)argv, "n", &numBodies);
        bDouble = (shrTRUE == shrCheckCmdLineFlag(argc, (const char**)argv, "double"));
        bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
        bQATest = shrCheckCmdLineFlag(argc, (const char**)argv, "qatest");
    }

    //Get the NVIDIA platform
    cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clGetPlatformID...\n\n");

    if (bDouble)
    {
        shrLog("Double precision execution...\n\n");
    }
    else
    {
        shrLog("Single precision execution...\n\n");
    }

    flopsPerInteraction = bDouble ? 30 : 20;

    //Get all the devices
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Set target device and Query number of compute units on uiTargetDevice
    shrLog("  # of Devices Available = %u\n", uiNumDevices);
    if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice) == shrTRUE)
    {
        uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
    }
    shrLog("  Using Device %u, ", uiTargetDevice);
    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
    cl_uint uiNumComputeUnits;
    clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
    shrLog("  # of Compute Units = %u\n", uiNumComputeUnits);

    //Create the context
    shrLog("clCreateContext...\n");
    cxContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create a command-queue
    shrLog("clCreateCommandQueue...\n\n");
    cqCommandQueue = clCreateCommandQueue(cxContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Log and config for number of bodies
    shrLog("Number of Bodies = %d\n", numBodies);
    switch (numBodies)
    {
        case 1024:
            activeParams.m_clusterScale = 1.52f;
            activeParams.m_velocityScale = 2.f;
            break;
        case 2048:
            activeParams.m_clusterScale = 1.56f;
            activeParams.m_velocityScale = 2.64f;
            break;
        case 4096:
            activeParams.m_clusterScale = 1.68f;
            activeParams.m_velocityScale = 2.98f;
            break;
        case 7680:
        case 8192:
            activeParams.m_clusterScale = 1.98f;
            activeParams.m_velocityScale = 2.9f;
            break;
        default:
        case 15360:
        case 16384:
            activeParams.m_clusterScale = 1.54f;
            activeParams.m_velocityScale = 8.f;
            break;
        case 30720:
        case 32768:
            activeParams.m_clusterScale = 1.44f;
            activeParams.m_velocityScale = 11.f;
            break;
    }

    if ((q * p) > 256)
    {
        p = 256 / q;
        shrLog("Setting p=%d to maintain %d threads per block\n", p, 256);
    }
    if ((q == 1) && (numBodies < p))
    {
        p = numBodies;
        shrLog("Setting p=%d because # of bodies < p\n", p);
    }
    shrLog("Workgroup Dims = (%d x %d)\n\n", p, q);

    // Initialize OpenGL items if using GL
    if (bQATest == shrFALSE)
    {
        assert(0);   // GL path disabled in this build
        /*
        shrLog("Calling InitGL...\n");
        InitGL(&argc, argv);
        */
    }
    else
    {
        shrLog("Skipping InitGL...\n");
    }

    // CL/GL interop disabled
    bUsePBO = (false && (bQATest == shrFALSE));
    InitNbody(cdDevices[uiTargetDevice], cxContext, cqCommandQueue, numBodies, p, q, bUsePBO, bDouble);
    ResetSim(nbody, numBodies, NBODY_CONFIG_SHELL, bUsePBO);

    // init timers
    shrDeltaT(DEMOTIME); // timer 0 is for timing demo periods
    shrDeltaT(FUNCTIME); // timer 1 is for logging function delta t's
    shrDeltaT(FPSTIME);  // timer 2 is for fps measurement

    // Standard simulation
    if (bQATest == shrFALSE)
    {
        assert(0);   // GL path disabled in this build
        /*
        shrLog("Running standard oclNbody simulation...\n\n");
        glutDisplayFunc(DisplayGL);
        glutReshapeFunc(ReshapeGL);
        glutMouseFunc(MouseGL);
        glutMotionFunc(MotionGL);
        glutKeyboardFunc(KeyboardGL);
        glutSpecialFunc(SpecialGL);
        glutIdleFunc(IdleGL);
        glutMainLoop();
        */
    }

    // Compare to host, profile and write out file for regression analysis
    if (bQATest == shrTRUE)
    {
        bool bTestResults = false;
        shrLog("Running oclNbody Results Comparison...\n\n");
        bTestResults = CompareResults(numBodies);

        //shrLog("Profiling oclNbody...\n\n");
        //RunProfiling(100, (unsigned int)(p * q));  // 100 iterations
        shrQAFinish(argc, (const char **)argv, bTestResults ? QA_PASSED : QA_FAILED);
    }
    else
    {
        // Cleanup/exit
        bNoPrompt = shrTRUE;
        shrQAFinish2(false, *pArgc, (const char **)pArgv, QA_PASSED);
    }

    Cleanup(EXIT_SUCCESS);
}
// Primary GLUT callback loop function
//*****************************************************************************
void DisplayGL()
{
    // update the simulation, unless paused
    double dProcessingTime = 0.0;
    if (!bPause)
    {
        // start timer FUNCTIME if it's update time
        if (iFrameCount >= iFrameTrigger)
        {
            shrDeltaT(FUNCTIME);
        }

        // Run the simulation computations
        nbody->update(activeParams.m_timestep);
        nbody->getArray(BodySystem::BODYSYSTEM_POSITION);

        // Make graphics work with or without CL/GL interop
        if (bUsePBO)
        {
            renderer->setPBO((unsigned int)nbody->getCurrentReadBuffer(), nbody->getNumBodies());
        }
        else
        {
            renderer->setPositions((float*)nbody->getCurrentReadBuffer(), nbody->getNumBodies());
        }

        // get processing time from timer FUNCTIME, if it's update time
        if (iFrameCount >= iFrameTrigger)
        {
            dProcessingTime = shrDeltaT(FUNCTIME);
        }
    }

    // Redraw main graphics display, if enabled
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    if (displayEnabled)
    {
        // view transform
        glMatrixMode(GL_MODELVIEW);
        glLoadIdentity();
        for (int c = 0; c < 3; ++c)
        {
            camera_trans_lag[c] += (camera_trans[c] - camera_trans_lag[c]) * inertia;
            camera_rot_lag[c] += (camera_rot[c] - camera_rot_lag[c]) * inertia;
        }
        glTranslatef(camera_trans_lag[0], camera_trans_lag[1], camera_trans_lag[2]);
        glRotatef(camera_rot_lag[0], 1.0, 0.0, 0.0);
        glRotatef(camera_rot_lag[1], 0.0, 1.0, 0.0);
        renderer->setSpriteSize(activeParams.m_pointSize);
        renderer->display(displayMode);
    }

    // Display user interface if enabled
    if (bShowSliders)
    {
        glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color
        glEnable(GL_BLEND);
        paramlist->Render(0, 0);
        glDisable(GL_BLEND);
    }

    // Flip backbuffer to screen
    glutSwapBuffers();

    // If frame count has triggered, increment the frame counter, and do fps stuff
    if (iFrameCount++ > iFrameTrigger)
    {
        // If tour mode is enabled & interval has timed out, switch to next tour/demo mode
        dElapsedTime += shrDeltaT(DEMOTIME);
        if (bTour && (dElapsedTime > demoTime))
        {
            dElapsedTime = 0.0;
            activeDemo = (activeDemo + 1) % numDemos;
            SelectDemo(activeDemo);
        }

        // get the perf and fps stats
        iFramesPerSec = (int)((double)iFrameCount/ shrDeltaT(FPSTIME));
        double dGigaInteractionsPerSecond = 0.0;
        double dGigaFlops = 0.0;
        ComputePerfStats(dGigaInteractionsPerSecond, dGigaFlops, dProcessingTime, 1);

        // If not paused, set the display window title, reset trigger and log info
        char cTitle[256];
        if(!bPause)
        {
#ifdef GPU_PROFILING
    #ifdef _WIN32
            sprintf_s(cTitle, "OpenCL for GPU Nbody Demo (%d bodies): %i fps | %0.4f BIPS | %0.4f GFLOP/s",
                      numBodies, iFramesPerSec, dGigaInteractionsPerSecond, dGigaFlops);
    #else
            sprintf(cTitle, "OpenCL for GPU Nbody Demo (%d bodies): %i fps | %0.4f BIPS | %0.4f GFLOP/s",
                    numBodies, iFramesPerSec, dGigaInteractionsPerSecond, dGigaFlops);
    #endif
#else
    #ifdef _WIN32
            sprintf_s(cTitle, "OpenCL for GPU Nbody Demo (%d bodies)", numBodies);
    #else
            sprintf(cTitle, "OpenCL for GPU Nbody Demo (%d bodies)", numBodies);
    #endif
#endif
#ifndef __EMSCRIPTEN__
            glutSetWindowTitle(cTitle);
#else
            printf("%s\n", cTitle);
#endif

            // Log fps and processing info to console and file
            shrLog("%s\n", cTitle);

            // if doing quick test, exit
            if ((bNoPrompt) && (!--iTestSets))
            {
                // Cleanup up and quit
                shrQAFinish2(false, *pArgc, (const char **)pArgv, QA_PASSED);
                Cleanup(EXIT_SUCCESS);
            }

            // reset the frame counter and adjust trigger
            iFrameCount = 0;
            iFrameTrigger = (iFramesPerSec > 1) ? iFramesPerSec * 2 : 1;
        }
    }

#ifndef __EMSCRIPTEN__
    glutReportErrors();
#endif
}
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    cl_platform_id cpPlatform;
    cl_device_id cdDevice;
    cl_context cxGPUContext;                      //OpenCL context
    cl_command_queue cqCommandQueue;              //OpenCL command queue
    cl_mem c_Kernel, d_Input, d_Buffer, d_Output; //OpenCL memory buffer objects

    cl_float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU;

    cl_int ciErrNum;

    const unsigned int imageW = 3072;
    const unsigned int imageH = 3072;

    shrQAStart(argc, argv);

    // set logfile name and start logs
    shrSetLogFileName ("oclConvolutionSeparable.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    shrLog("Allocating and initializing host memory...\n");
    h_Kernel    = (cl_float *)malloc(KERNEL_LENGTH * sizeof(cl_float));
    h_Input     = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));
    h_Buffer    = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));
    h_OutputCPU = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));
    h_OutputGPU = (cl_float *)malloc(imageW * imageH * sizeof(cl_float));
    srand(2009);
    for(unsigned int i = 0; i < KERNEL_LENGTH; i++)
        h_Kernel[i] = (cl_float)(rand() % 16);
    for(unsigned int i = 0; i < imageW * imageH; i++)
        h_Input[i] = (cl_float)(rand() % 16);

    shrLog("Initializing OpenCL...\n");
    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckError(ciErrNum, CL_SUCCESS);

    //Get the devices
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);

    //Create the context
    cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    //Create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Initializing OpenCL separable convolution...\n");
    initConvolutionSeparable(cxGPUContext, cqCommandQueue, (const char **)argv);

    shrLog("Creating OpenCL memory objects...\n");
    c_Kernel = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, KERNEL_LENGTH * sizeof(cl_float), h_Kernel, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageW * imageH * sizeof(cl_float), h_Input, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    d_Buffer = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, imageW * imageH * sizeof(cl_float), NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageW * imageH * sizeof(cl_float), NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Applying separable convolution to %u x %u image...\n\n", imageW, imageH);
    //Just a single run or a warmup iteration
    convolutionRows(NULL, d_Buffer, d_Input, c_Kernel, imageW, imageH);
    convolutionColumns(NULL, d_Output, d_Buffer, c_Kernel, imageW, imageH);

#ifdef GPU_PROFILING
    const int numIterations = 16;
    cl_event startMark, endMark;
    ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrDeltaT(0);

    for(int iter = 0; iter < numIterations; iter++){
        convolutionRows(cqCommandQueue, d_Buffer, d_Input, c_Kernel, imageW, imageH);
        convolutionColumns(cqCommandQueue, d_Output, d_Buffer, c_Kernel, imageW, imageH);
    }
    ciErrNum  = clEnqueueMarker(cqCommandQueue, &endMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Calculate performance metrics by wallclock time
    double gpuTime = shrDeltaT(0) / (double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclConvolutionSeparable, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n",
             (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0);

    //Get OpenCL profiler info
    cl_ulong startTime = 0, endTime = 0;
    ciErrNum  = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
    ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime)/ (double)numIterations);
#endif

    shrLog("Reading back OpenCL results...\n\n");
    ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageW * imageH * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Comparing against Host/C++ computation...\n");
    convolutionRowHost(h_Buffer, h_Input, h_Kernel, imageW, imageH, KERNEL_RADIUS);
    convolutionColumnHost(h_OutputCPU, h_Buffer, h_Kernel, imageW, imageH, KERNEL_RADIUS);
    double sum = 0, delta = 0;
    double L2norm;
    for(unsigned int i = 0; i < imageW * imageH; i++){
        delta += (h_OutputCPU[i] - h_OutputGPU[i]) * (h_OutputCPU[i] - h_OutputGPU[i]);
        sum   += h_OutputCPU[i] * h_OutputCPU[i];
    }
    L2norm = sqrt(delta / sum);
    shrLog("Relative L2 norm: %.3e\n\n", L2norm);

    // cleanup
    closeConvolutionSeparable();
    ciErrNum  = clReleaseMemObject(d_Output);
    ciErrNum |= clReleaseMemObject(d_Buffer);
    ciErrNum |= clReleaseMemObject(d_Input);
    ciErrNum |= clReleaseMemObject(c_Kernel);
    ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
    ciErrNum |= clReleaseContext(cxGPUContext);
    oclCheckError(ciErrNum, CL_SUCCESS);

    free(h_OutputGPU);
    free(h_OutputCPU);
    free(h_Buffer);
    free(h_Input);
    free(h_Kernel);

    // finish
    shrQAFinishExit(argc, (const char **)argv, (L2norm < 1e-6) ? QA_PASSED : QA_FAILED);
}
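// The GPU_PROFILING block above brackets the whole batch between two markers; the
// same profiling counters can also time a single kernel via its event (the queue
// is already created with CL_QUEUE_PROFILING_ENABLE). A sketch, where ckSomeKernel,
// szGlobal and szLocal are placeholders:
cl_event evKernel;
ciErrNum  = clEnqueueNDRangeKernel(cqCommandQueue, ckSomeKernel, 2, NULL, szGlobal, szLocal, 0, NULL, &evKernel);
ciErrNum |= clWaitForEvents(1, &evKernel);
cl_ulong ulStart = 0, ulEnd = 0;
ciErrNum |= clGetEventProfilingInfo(evKernel, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ulStart, NULL);
ciErrNum |= clGetEventProfilingInfo(evKernel, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &ulEnd,   NULL);
double dKernelSec = 1.0e-9 * (double)(ulEnd - ulStart);   // profiling counters are in nanoseconds
clReleaseEvent(evKernel);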
// main function
//*****************************************************************************
int main(int argc, const char **argv)
{
    cl_platform_id cpPlatform;                    // OpenCL platform
    cl_uint nDevice;                              // OpenCL device count
    cl_device_id* cdDevices;                      // OpenCL device list
    cl_context cxGPUContext;                      // OpenCL context
    cl_command_queue cqCommandQue[MAX_GPU_COUNT]; // OpenCL command queue
    cl_int ciErrNum = 1;                          // Error code var

    shrQAStart(argc, (char **)argv);

    // set logfile name and start logs
    shrSetLogFileName ("oclHiddenMarkovModel.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    // Get the NVIDIA platform
    shrLog("Get platform...\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    // Get all the devices (queried once; the original queried the platform and devices twice and leaked the first device list)
    cl_uint uiTargetDevice = 0; // Default Device to compute on
    cl_uint uiNumComputeUnits;  // Number of compute units (SM's on NV GPU)
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    cdDevices = (cl_device_id *)malloc(nDevice * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, nDevice, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    // Get command line device options and config accordingly
    shrLog("  # of Devices Available = %u\n", nDevice);
    if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice) == shrTRUE)
    {
        uiTargetDevice = CLAMP(uiTargetDevice, 0, (nDevice - 1));
    }
    shrLog("  Using Device %u: ", uiTargetDevice);
    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
    ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("\n  # of Compute Units = %u\n", uiNumComputeUnits);

    shrLog("clCreateContext\n");
    cxGPUContext = clCreateContext(0, nDevice, cdDevices, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clCreateCommandQueue\n");
    int id_device;
    if(shrGetCmdLineArgumenti(argc, argv, "device", &id_device))
    {
        // Set up a single command queue for the GPU specified on the command line
        cqCommandQue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[id_device], 0, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        oclPrintDevInfo(LOGBOTH, cdDevices[id_device]);
        nDevice = 1;
    }
    else
    {
        // create command queues for all available devices
        for (cl_uint i = 0; i < nDevice; i++)
        {
            cqCommandQue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        }
        for (cl_uint i = 0; i < nDevice; i++) oclPrintDevInfo(LOGBOTH, cdDevices[i]);
    }

    shrLog("\nUsing %d GPU(s)...\n\n", nDevice);

    int wgSize;
    if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &wgSize))
    {
        wgSize = 256;
    }

    shrLog("Init Hidden Markov Model parameters\n");
    int nState = 256*16; // number of states, must be a multiple of 256
    int nEmit  = 128;    // number of possible observations

    float *initProb = (float*)malloc(sizeof(float)*nState);        // initial probability
    float *mtState  = (float*)malloc(sizeof(float)*nState*nState); // state transition matrix
    float *mtEmit   = (float*)malloc(sizeof(float)*nEmit*nState);  // emission matrix
    initHMM(initProb, mtState, mtEmit, nState, nEmit);

    // define observational sequence
    int nObs = 100; // size of observational sequence
    int **obs             = (int**)malloc(nDevice*sizeof(int*));
    int **viterbiPathCPU  = (int**)malloc(nDevice*sizeof(int*));
    int **viterbiPathGPU  = (int**)malloc(nDevice*sizeof(int*));
    float *viterbiProbCPU = (float*)malloc(nDevice*sizeof(float));
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        obs[iDevice] = (int*)malloc(sizeof(int)*nObs);
        for (int i = 0; i < nObs; i++)
            obs[iDevice][i] = i % 15;
        viterbiPathCPU[iDevice] = (int*)malloc(sizeof(int)*nObs);
        viterbiPathGPU[iDevice] = (int*)malloc(sizeof(int)*nObs);
    }
    shrLog("# of states = %d\n# of possible observations = %d \nSize of observational sequence = %d\n\n", nState, nEmit, nObs);

    shrLog("Compute Viterbi path on GPU\n\n");
    HMM **oHmm = (HMM**)malloc(nDevice*sizeof(HMM*));
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        oHmm[iDevice] = new HMM(cxGPUContext, cqCommandQue[iDevice], initProb, mtState, mtEmit, nState, nEmit, nObs, argv[0], wgSize);
    }

    cl_mem *vProb = (cl_mem*)malloc(sizeof(cl_mem)*nDevice);
    cl_mem *vPath = (cl_mem*)malloc(sizeof(cl_mem)*nDevice);
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        vProb[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(float), NULL, &ciErrNum);
        vPath[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)*nObs, NULL, &ciErrNum);
    }

#ifdef GPU_PROFILING
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clFinish(cqCommandQue[iDevice]);
    }
    shrDeltaT(1);
#endif

    size_t szWorkGroup;
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        szWorkGroup = oHmm[iDevice]->ViterbiSearch(vProb[iDevice], vPath[iDevice], obs[iDevice]);
    }
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clFinish(cqCommandQue[iDevice]);
    }

#ifdef GPU_PROFILING
    double dElapsedTime = shrDeltaT(1);
    shrLogEx(LOGBOTH | MASTER, 0, "oclHiddenMarkovModel, Throughput = %.4f GB/s, Time = %.5f s, Size = %u items, NumDevsUsed = %u, Workgroup = %u\n",
             (1.0e-9 * 2.0 * sizeof(float) * nDevice * nState * nState * (nObs-1))/dElapsedTime, dElapsedTime, (nDevice * nState * nObs), nDevice, szWorkGroup);
#endif

    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        ciErrNum = clEnqueueReadBuffer(cqCommandQue[iDevice], vPath[iDevice], CL_TRUE, 0, sizeof(int)*nObs, viterbiPathGPU[iDevice], 0, NULL, NULL);
    }

    shrLog("\nCompute Viterbi path on CPU\n");
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        ciErrNum = ViterbiCPU(viterbiProbCPU[iDevice], viterbiPathCPU[iDevice], obs[iDevice], nObs, initProb, mtState, nState, mtEmit);
    }
    if (!ciErrNum)
    {
        shrEXIT(argc, argv);
    }

    // Compare GPU and CPU Viterbi paths
    bool pass = true;
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        for (int i = 0; i < nObs; i++)
        {
            if (viterbiPathCPU[iDevice][i] != viterbiPathGPU[iDevice][i])
            {
                pass = false;
                break;
            }
        }
    }

    // NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
    shrLog("Release CPU buffers and OpenCL objects...\n");
    free(initProb);
    free(mtState);
    free(mtEmit);
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        free(obs[iDevice]);
        free(viterbiPathCPU[iDevice]);
        free(viterbiPathGPU[iDevice]);
        delete oHmm[iDevice];
        clReleaseCommandQueue(cqCommandQue[iDevice]);
    }
    free(obs);
    free(viterbiPathCPU);
    free(viterbiPathGPU);
    free(viterbiProbCPU);
    free(cdDevices);
    free(oHmm);
    clReleaseContext(cxGPUContext);

    // finish
    shrQAFinishExit(argc, (const char **)argv, pass ? QA_PASSED : QA_FAILED);
}
//-----------------------------------------------------------------------------
// Program main
//-----------------------------------------------------------------------------
int main(int argc, char** argv)
{
    pArgc = &argc;
    pArgv = argv;

    // start logs
    shrQAStart(argc, argv);
    shrSetLogFileName ("oclSimpleD3D9Texture.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    // process command line arguments
    if (argc > 1)
    {
        bQATest   = shrCheckCmdLineFlag(argc, (const char **)argv, "qatest");
        bNoPrompt = shrCheckCmdLineFlag(argc, (const char **)argv, "noprompt");
    }

    //
    // create window
    //
    // Register the window class
    WNDCLASSEX wc = { sizeof(WNDCLASSEX), CS_CLASSDC, MsgProc, 0L, 0L,
                      GetModuleHandle(NULL), NULL, NULL, NULL, NULL,
                      "OpenCL/D3D9 Texture InterOP", NULL };
    RegisterClassEx( &wc );

    int xBorder = ::GetSystemMetrics(SM_CXSIZEFRAME);
    int yMenu   = ::GetSystemMetrics(SM_CYMENU);
    int yBorder = ::GetSystemMetrics(SM_CYSIZEFRAME);

    // Create the application's window (padded by window borders for uniform backbuffer sizes across OSs)
    HWND hWnd = CreateWindow( wc.lpszClassName, "OpenCL/D3D9 Texture InterOP",
                              WS_OVERLAPPEDWINDOW, 0, 0,
                              g_WindowWidth + 2*xBorder,
                              g_WindowHeight + 2*yBorder + yMenu,
                              NULL, NULL, wc.hInstance, NULL );

    ShowWindow(hWnd, SW_SHOWDEFAULT);
    UpdateWindow(hWnd);

    // init fps timer
    shrDeltaT (1);

    // Initialize Direct3D
    if( SUCCEEDED( InitD3D9(hWnd) ) &&
        SUCCEEDED( InitCL(argc, (const char **)argv) ) &&
        SUCCEEDED( InitTextures() ) )
    {
        if (!g_bDeviceLost)
        {
            RegisterD3D9ResourceWithCL();
        }
    }

    //
    // the main loop
    //
    while(false == g_bDone)
    {
        RunCL();
        DrawScene();

        //
        // handle I/O
        //
        MSG msg;
        ZeroMemory( &msg, sizeof(msg) );
        while( msg.message != WM_QUIT )
        {
            if( PeekMessage( &msg, NULL, 0U, 0U, PM_REMOVE ) )
            {
                TranslateMessage( &msg );
                DispatchMessage( &msg );
            }
            else
            {
                RunCL();
                DrawScene();

                if(bQATest)
                {
                    for(int count = 0; count < g_iFrameToCompare; count++)
                    {
                        RunCL();
                        DrawScene();
                    }

                    const char *ref_image_path = "ref_oclSimpleD3D9Texture.ppm";
                    const char *cur_image_path = "oclSimpleD3D9Texture.ppm";

                    // Save a reference of our current test run image
                    CheckRenderD3D9::BackbufferToPPM(g_pD3DDevice, cur_image_path);

                    // compare to official reference image, printing PASS or FAIL.
                    g_bPassed = CheckRenderD3D9::PPMvsPPM(cur_image_path, ref_image_path, argv[0], MAX_EPSILON, 0.15f);

                    PostQuitMessage(0);
                    g_bDone = true;
                }
            }
        }
    }

    // Unregister windows class
    UnregisterClass( wc.lpszClassName, wc.hInstance );

    // Cleanup and leave
    Cleanup (g_bPassed ? EXIT_SUCCESS : EXIT_FAILURE);
}
// Primary GLUT callback loop function //***************************************************************************** void DisplayGL() { // update the simulation, if not paused double dProcessingTime = 0.0; if (!bPause) { // start timer 0 if it's update time if (iFrameCount >= iFrameTrigger) { shrDeltaT(0); } // do the processing psystem->update(timestep); renderer->setVertexBuffer(psystem->getCurrentReadBuffer(), psystem->getNumParticles()); // get processing time from timer 0, if it's update time if (iFrameCount >= iFrameTrigger) { dProcessingTime = shrDeltaT(0); } } // Clear last frame glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // Add cube glColor3f(1.0, 1.0, 1.0); glutWireCube(2.0); // Add collider glPushMatrix(); float3 p = psystem->getColliderPos(); glTranslatef(p.x, p.y, p.z); glColor3f(0.75, 0.0, 0.5); glutWireSphere(psystem->getColliderRadius(), 30, 15); glPopMatrix(); // Render if (displayEnabled) { renderer->display(displayMode); } // Display user interface if enabled if (displaySliders) { glDisable(GL_DEPTH_TEST); glBlendFunc(GL_ONE_MINUS_DST_COLOR, GL_ZERO); // invert color glEnable(GL_BLEND); params->Render(0, 0); glDisable(GL_BLEND); glEnable(GL_DEPTH_TEST); } // Flip backbuffer to screen glutSwapBuffers(); // Increment the frame counter, and do fps stuff if it's time if (iFrameCount++ > iFrameTrigger) { // Set the display window title char cTitle[256]; iFramesPerSec = (int)((double)iFrameCount/ shrDeltaT(1)); #ifdef GPU_PROFILING if(!bPause) { #ifdef _WIN32 sprintf_s(cTitle, 256, "%s Particles Simulation (%u particles) | %i fps | Proc. t = %.4f s", cProcessor[iProcFlag], numParticles, iFramesPerSec, dProcessingTime); #else sprintf(cTitle, "%s OpenCL Particles Simulation (%u particles) | %i fps | Proc. t = %.4f s", cProcessor[iProcFlag], numParticles, iFramesPerSec, dProcessingTime); #endif } else { #ifdef _WIN32 sprintf_s(cTitle, 256, "%s Particles Simulation (%u particles) (Paused) | %i fps", cProcessor[iProcFlag], numParticles, iFramesPerSec); #else sprintf(cTitle, "%s OpenCL Particles Simulation (%u particles) (Paused) | %i fps", cProcessor[iProcFlag], numParticles, iFramesPerSec); #endif } #else if(!bPause) { #ifdef _WIN32 sprintf_s(cTitle, 256, "%s Particles Simulation (%u particles)", cProcessor[iProcFlag], numParticles); #else sprintf(cTitle, "%s OpenCL Particles Simulation (%u particles)", cProcessor[iProcFlag], numParticles); #endif } else { #ifdef _WIN32 sprintf_s(cTitle, 256, "%s Particles Simulation (%u particles) (Paused)", cProcessor[iProcFlag], numParticles); #else sprintf(cTitle, "%s OpenCL Particles Simulation (%u particles) (Paused)", cProcessor[iProcFlag], numParticles); #endif } #endif glutSetWindowTitle(cTitle); // Log fps and processing info to console and file shrLog("%s\n", cTitle); // Set based options: QuickTest or cycle demo if (iSetCount++ == iTestSets) { if (bNoPrompt) { shrQAFinish2(false, *pArgc, (const char **)pArgv, QA_PASSED); Cleanup(EXIT_SUCCESS); } if (bTour) { static int iOption = 1; ResetSim(++iOption); if (iOption > 3)iOption = 0; } iSetCount = 0; } // reset the frame count and trigger iFrameCount = 0; iFrameTrigger = (iFramesPerSec > 1) ? iFramesPerSec * 2 : 1; } }
// Display callback //***************************************************************************** void DisplayGL() { // Render the 3D teapot with GL renderScene(); // start timer 0 if it's update time double dProcessingTime = 0.0; if (iFrameCount >= iFrameTrigger) { shrDeltaT(0); } // process processImage(); // get processing time from timer 0, if it's update time if (iFrameCount >= iFrameTrigger) { dProcessingTime = shrDeltaT(0); } // flip backbuffer to screen displayImage(); glutSwapBuffers(); glutPostRedisplay(); // Increment the frame counter, and do fps and Q/A stuff if it's time if (iFrameCount++ > iFrameTrigger) { // set GLUT Window Title char cFPS[256]; iFramesPerSec = (int)((double)iFrameCount/shrDeltaT(1)); #ifdef GPU_PROFILING if( bPostprocess ) { #ifdef _WIN32 sprintf_s(cFPS, 256, "%s Postprocess ON %ix%i | %i fps | Proc.t = %.3f s", cProcessor[iProcFlag], iGraphicsWinWidth, iGraphicsWinHeight, iFramesPerSec, dProcessingTime); #else sprintf(cFPS, "%s Postprocess ON %ix%i | %i fps | Proc.t = %.3f s", cProcessor[iProcFlag], iGraphicsWinWidth, iGraphicsWinHeight, iFramesPerSec, dProcessingTime); #endif } else { #ifdef _WIN32 sprintf_s(cFPS, 256, "Postprocess OFF %ix%i | %i fps", iGraphicsWinWidth, iGraphicsWinHeight, iFramesPerSec); #else sprintf(cFPS, "Postprocess OFF %ix%i | %i fps", iGraphicsWinWidth, iGraphicsWinHeight, iFramesPerSec); #endif } #else if(bPostprocess) { #ifdef _WIN32 sprintf_s(cFPS, 256, "%s Postprocess ON %ix%i", cProcessor[iProcFlag], iGraphicsWinWidth, iGraphicsWinHeight); #else sprintf(cFPS, "%s Postprocess ON %ix%i", cProcessor[iProcFlag], iGraphicsWinWidth, iGraphicsWinHeight); #endif } else { #ifdef _WIN32 sprintf_s(cFPS, 256, "%s Postprocess OFF %ix%i", cProcessor[iProcFlag], iGraphicsWinWidth, iGraphicsWinHeight); #else sprintf(cFPS, "%s Postprocess OFF %ix%i", cProcessor[iProcFlag], iGraphicsWinWidth, iGraphicsWinHeight); #endif } #endif glutSetWindowTitle(cFPS); // Log fps and processing info to console and file shrLog(" %s\n", cFPS); // if doing quick test, exit if ((bNoPrompt) && (!--iTestSets)) { // Cleanup up and quit Cleanup(EXIT_SUCCESS); } // reset framecount, trigger and timer iFrameCount = 0; iFrameTrigger = (iFramesPerSec > 1) ? iFramesPerSec * 2 : 1; } }
// Display callback for GLUT main loop //***************************************************************************** void DisplayGL() { glClear(GL_COLOR_BUFFER_BIT); // Run OpenCL kernel to filter image (if toggled on), then render to backbuffer if (bFilter) { // process on the GPU or the Host, depending on user toggle/flag if (iProcFlag == 0) { dProcessingTime += SobelFilterGPU (uiInput, uiOutput); } else { dProcessingTime += SobelFilterHost (uiInput, uiOutput, uiImageWidth, uiImageHeight, fThresh); } // Draw processed image glDrawPixels(uiImageWidth, uiImageHeight, GL_RGBA, GL_UNSIGNED_BYTE, uiOutput); } else { // Skip processing and draw the raw input image glDrawPixels(uiImageWidth, uiImageHeight, GL_RGBA, GL_UNSIGNED_BYTE, uiInput); } // Flip backbuffer to screen glutSwapBuffers(); // Increment the frame counter, and do fps stuff if it's time if (iFrameCount++ > iFrameTrigger) { // Buffer for display window title char cTitle[256]; // Get average fps and average computation time iFramesPerSec = (int)((double)iFrameCount / shrDeltaT(1)); dProcessingTime /= (double)iFrameCount; #ifdef GPU_PROFILING if (bFilter) { #ifdef _WIN32 sprintf_s(cTitle, 256, "%s RGB Sobel Filter ON | W: %u , H: %u | Thresh. = %.1f | # GPUs = %u | %i fps | Proc. t = %.5f s | %.1f Mpix/s", cProcessor[iProcFlag], uiImageWidth, uiImageHeight, fThresh, GpuDevMngr->uiUsefulDevCt, iFramesPerSec, dProcessingTime, (1.0e-6 * uiImageWidth * uiImageHeight)/dProcessingTime); #else sprintf(cTitle, "%s RGB Sobel Filter ON | W: %u , H: %u | Thresh. = %.1f | # GPUs = %u | %i fps | Proc. t = %.5f s | %.1f Mpix/s", cProcessor[iProcFlag], uiImageWidth, uiImageHeight, fThresh, GpuDevMngr->uiUsefulDevCt, iFramesPerSec, dProcessingTime, (1.0e-6 * uiImageWidth * uiImageHeight)/dProcessingTime); #endif } else { #ifdef _WIN32 sprintf_s(cTitle, 256, "RGB Sobel Filter OFF | W: %u , H: %u | %i fps", uiImageWidth, uiImageHeight, iFramesPerSec); #else sprintf(cTitle, "RGB Sobel Filter OFF | W: %u , H: %u | %i fps", uiImageWidth, uiImageHeight, iFramesPerSec); #endif } #else if (bFilter) { #ifdef _WIN32 sprintf_s(cTitle, 256, "%s RGB Sobel Filter ON | W: %u , H: %u | Thresh. = %.1f", cProcessor[iProcFlag], uiImageWidth, uiImageHeight, fThresh); #else sprintf(cTitle, "%s RGB Sobel Filter ON | W: %u , H: %u | Thresh. = %.1f", cProcessor[iProcFlag], uiImageWidth, uiImageHeight, fThresh); #endif } else { #ifdef _WIN32 sprintf_s(cTitle, 256, "RGB Sobel Filter OFF | W: %u , H: %u", uiImageWidth, uiImageHeight); #else sprintf(cTitle, "RGB Sobel Filter OFF | W: %u , H: %u", uiImageWidth, uiImageHeight); #endif } #endif glutSetWindowTitle(cTitle); // Log fps and processing info to console and file shrLog("%s\n", cTitle); // if doing quick test, exit if ((bNoPrompt) && (!--iTestSets)) { // Cleanup up and quit Cleanup(EXIT_SUCCESS); } // reset the frame counter and processing timer val and adjust trigger iFrameCount = 0; dProcessingTime = 0.0; iFrameTrigger = (iFramesPerSec > 1) ? iFramesPerSec * 2 : 1; } }
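// The frame-trigger idiom shared by the DisplayGL loops above, distilled: stats
// are recomputed only every iFrameTrigger frames, and the trigger is then set to
// roughly two seconds' worth of frames, so title/fps updates stay time-based
// regardless of frame rate (shrDeltaT(1) is the fps timer in these samples):
if (iFrameCount++ > iFrameTrigger)
{
    iFramesPerSec = (int)((double)iFrameCount / shrDeltaT(1));   // average fps since last update
    // ... set window title, log stats ...
    iFrameCount   = 0;
    iFrameTrigger = (iFramesPerSec > 1) ? iFramesPerSec * 2 : 1; // next update in ~2 s
}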
// OpenCL computation function for 1 or more GPUs
// Copies input data from the pinned host buffer to the device(s), runs the kernel(s), copies output back to the pinned output host buffer
//*****************************************************************************
double SobelFilterGPU(cl_uint* uiInputImage, cl_uint* uiOutputImage)
{
    // If this is a video application, fresh data in the pinned host buffer is needed beyond here.
    // This line could be a sync point assuring that an asynchronous acquisition is complete.
    // That asynchronous acquisition would do a map, update and unmap for the pinned input buffer.
    //
    // Otherwise a synchronous acquisition call ('get next frame') could be placed here, but that would be less optimal.

    // For each device: copy fresh input H2D
    ciErrNum = CL_SUCCESS;
    for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++)
    {
        // Nonblocking Write of input image data from host to device
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[i], cmDevBufIn[i], CL_FALSE, 0, szAllocDevBytes[i],
                                         (void*)&uiInputImage[uiInHostPixOffsets[i]], 0, NULL, NULL);
    }

    // Sync all queues to host and start computation timer on host to get computation elapsed wall clock time
    // (Only for timing... can be omitted in a production app)
    for (cl_uint j = 0; j < GpuDevMngr->uiUsefulDevCt; j++)
    {
        ciErrNum |= clFinish(cqCommandQueue[j]);
    }
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // For each device: Process
    shrDeltaT(0);
    for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++)
    {
        // Determine configuration bytes, offsets and launch config, based on position of device region vertically in image
        if (GpuDevMngr->uiUsefulDevCt == 1)
        {
            // One device processes the whole image with no offset tricks needed
            szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]);
        }
        else if (i == 0)
        {
            // Multiple devices, top boundary tile:
            // Process whole device allocation, including extra row
            // No offset, but don't return the last row (dark/garbage row) D2H
            szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]);
        }
        else if (i < (GpuDevMngr->uiUsefulDevCt - 1))
        {
            // Multiple devices, middle tile:
            // Process whole device allocation, including extra 2 rows
            // Offset down by 1 row, and don't return the first and last rows (dark/garbage rows) D2H
            szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]);
        }
        else
        {
            // Multiple devices, last boundary tile:
            // Process whole device allocation, including extra row
            // Offset down by 1 row, and don't return the first row (dark/garbage row) D2H
            szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]);
        }

        // Pass in dev image height (# of rows worked on) for this device
        ciErrNum |= clSetKernelArg(ckSobel[i], 5, sizeof(cl_uint), (void*)&uiDevImageHeight[i]);

        // Launch Sobel kernel(s) into queue(s) and push to device(s)
        ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue[i], ckSobel[i], 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);

        // Push to device(s) so subsequent clFinish in queue 0 doesn't block driver from issuing enqueue command for higher queues
        ciErrNum |= clFlush(cqCommandQueue[i]);
    }

    // Sync all queues to host and get elapsed wall clock time for computation in all queues
    // (Only for timing... can be omitted in a production app)
    for (cl_uint j = 0; j < GpuDevMngr->uiUsefulDevCt; j++)
    {
        ciErrNum |= clFinish(cqCommandQueue[j]);
    }
    double dKernelTime = shrDeltaT(0); // Time from launch of first compute kernel to end of all compute kernels
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // For each device: copy fresh output D2H
    for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++)
    {
        // Determine configuration bytes and offsets based on position of device region vertically in image
        size_t szReturnBytes;
        cl_uint uiOutDevByteOffset;
        if (GpuDevMngr->uiUsefulDevCt == 1)
        {
            // One device processes the whole image with no offset tricks needed
            szReturnBytes = szBuffBytes;
            uiOutDevByteOffset = 0;
        }
        else if (i == 0)
        {
            // Multiple devices, top boundary tile:
            // Process whole device allocation, including extra row
            // No offset, but don't return the last row (dark/garbage row) D2H
            szReturnBytes = szAllocDevBytes[i] - (uiImageWidth * sizeof(cl_uint));
            uiOutDevByteOffset = 0;
        }
        else if (i < (GpuDevMngr->uiUsefulDevCt - 1))
        {
            // Multiple devices, middle tile:
            // Process whole device allocation, including extra 2 rows
            // Offset down by 1 row, and don't return the first and last rows (dark/garbage rows) D2H
            szReturnBytes = szAllocDevBytes[i] - ((uiImageWidth * sizeof(cl_uint)) * 2);
            uiOutDevByteOffset = uiImageWidth * sizeof(cl_uint);
        }
        else
        {
            // Multiple devices, last boundary tile:
            // Process whole device allocation, including extra row
            // Offset down by 1 row, and don't return the first row (dark/garbage row) D2H
            szReturnBytes = szAllocDevBytes[i] - (uiImageWidth * sizeof(cl_uint));
            uiOutDevByteOffset = uiImageWidth * sizeof(cl_uint);
        }

        // Non-blocking Read of output image data from device to host
        ciErrNum |= clEnqueueReadBuffer(cqCommandQueue[i], cmDevBufOut[i], CL_FALSE, uiOutDevByteOffset, szReturnBytes,
                                        (void*)&uiOutputImage[uiOutHostPixOffsets[i]], 0, NULL, NULL);
    }

    // Finish all queues and check for errors before returning
    // The block here assures valid output data for subsequent host processing
    for (cl_uint j = 0; j < GpuDevMngr->uiUsefulDevCt; j++)
    {
        ciErrNum |= clFinish(cqCommandQueue[j]);
    }
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    return dKernelTime;
}
// Main program //***************************************************************************** int main(int argc, char** argv) { pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); // Start logs cExecutableName = argv[0]; shrSetLogFileName ("oclSobelFilter.txt"); shrLog("%s Starting (Using %s)...\n\n", argv[0], clSourcefile); // Get command line args for quick test or QA test, if provided bNoPrompt = (bool)shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); bQATest = (bool)shrCheckCmdLineFlag(argc, (const char**)argv, "qatest"); // Menu items if (!(bQATest)) { ShowMenuItems(); } // Find the path from the exe to the image file cPathAndName = shrFindFilePath(cImageFile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); shrLog("Image File\t = %s\nImage Dimensions = %u w x %u h x %u bpp\n\n", cPathAndName, uiImageWidth, uiImageHeight, sizeof(unsigned int)<<3); // Initialize OpenGL items (if not No-GL QA test) shrLog("%sInitGL...\n\n", bQATest ? "Skipping " : "Calling "); if (!(bQATest)) { InitGL(&argc, argv); } //Get the NVIDIA platform if available, otherwise use default char cBuffer[1024]; bool bNV = false; shrLog("Get Platform ID... "); ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("%s\n\n", cBuffer); bNV = (strstr(cBuffer, "NVIDIA") != NULL); //Get the devices shrLog("Get Device Info...\n"); cl_uint uiNumAllDevs = 0; GpuDevMngr = new DeviceManager(cpPlatform, &uiNumAllDevs, pCleanup); // Get selected device if specified, otherwise examine available ones and choose by perf cl_int iSelectedDevice = 0; if((shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &iSelectedDevice)) || (uiNumAllDevs == 1)) { // Use 1 selected device GpuDevMngr->uiUsefulDevCt = 1; iSelectedDevice = CLAMP((cl_uint)iSelectedDevice, 0, (uiNumAllDevs - 1)); GpuDevMngr->uiUsefulDevs[0] = iSelectedDevice; GpuDevMngr->fLoadProportions[0] = 1.0f; shrLog(" Using 1 Selected Device for Sobel Filter Computation...\n"); } else { // Use available useful devices and compute the device load proportions ciErrNum = GpuDevMngr->GetDevLoadProportions(bNV); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); if (GpuDevMngr->uiUsefulDevCt == 1) { iSelectedDevice = GpuDevMngr->uiUsefulDevs[0]; } shrLog(" Using %u Device(s) for Sobel Filter Computation\n", GpuDevMngr->uiUsefulDevCt); } //Create the context shrLog("\nclCreateContext...\n\n"); cxGPUContext = clCreateContext(0, uiNumAllDevs, GpuDevMngr->cdDevices, NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Allocate per-device OpenCL objects for useful devices cqCommandQueue = new cl_command_queue[GpuDevMngr->uiUsefulDevCt]; ckSobel = new cl_kernel[GpuDevMngr->uiUsefulDevCt]; cmDevBufIn = new cl_mem[GpuDevMngr->uiUsefulDevCt]; cmDevBufOut = new cl_mem[GpuDevMngr->uiUsefulDevCt]; szAllocDevBytes = new size_t[GpuDevMngr->uiUsefulDevCt]; uiInHostPixOffsets = new cl_uint[GpuDevMngr->uiUsefulDevCt]; uiOutHostPixOffsets = new cl_uint[GpuDevMngr->uiUsefulDevCt]; uiDevImageHeight = new cl_uint[GpuDevMngr->uiUsefulDevCt]; // Create command queue(s) for device(s) shrLog("clCreateCommandQueue...\n"); for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++) { cqCommandQueue[i] = clCreateCommandQueue(cxGPUContext, GpuDevMngr->cdDevices[GpuDevMngr->uiUsefulDevs[i]], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog(" CommandQueue %u, Device %u, Device Load Proportion = %.2f, ", i, GpuDevMngr->uiUsefulDevs[i], GpuDevMngr->fLoadProportions[i]); oclPrintDevName(LOGBOTH, GpuDevMngr->cdDevices[GpuDevMngr->uiUsefulDevs[i]]); shrLog("\n"); }
// Allocate pinned input and output host image buffers: mem copy operations to/from pinned memory are much faster than paged memory szBuffBytes = uiImageWidth * uiImageHeight * sizeof (unsigned int); cmPinnedBufIn = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmPinnedBufOut = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("\nclCreateBuffer (Input and Output Pinned Host buffers)...\n"); // Get mapped pointers for writing to pinned input and output host image pointers uiInput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufIn, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); uiOutput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufOut, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clEnqueueMapBuffer (Pointer to Input and Output pinned host buffers)...\n"); // Load image data from file to pinned input host buffer ciErrNum = shrLoadPPM4ub(cPathAndName, (unsigned char **)&uiInput, &uiImageWidth, &uiImageHeight); oclCheckErrorEX(ciErrNum, shrTRUE, pCleanup); shrLog("Load Input Image to Input pinned host buffer...\n"); // Read the kernel in from file free(cPathAndName); cPathAndName = shrFindFilePath(clSourcefile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); cSourceCL = oclLoadProgSource(cPathAndName, "// My comment\n", &szKernelLength); oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup); shrLog("Load OpenCL Prog Source from File...\n"); // Create the program object cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clCreateProgramWithSource...\n"); // Build the program with 'mad' Optimization option #ifdef MAC char *flags = "-cl-fast-relaxed-math -DMAC"; #else char *flags = "-cl-fast-relaxed-math"; #endif ciErrNum = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // On error: write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSobelFilter.ptx"); Cleanup(EXIT_FAILURE); } shrLog("clBuildProgram...\n\n"); // Determine the size/shape of the image portions for each dev and create the device buffers unsigned uiSumHeight = 0; for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++) { // Create kernel instance ckSobel[i] = clCreateKernel(cpProgram, "ckSobel", &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clCreateKernel (ckSobel), Device %u...\n", i); // Allocations and offsets for the portion of the image worked on by each device if (GpuDevMngr->uiUsefulDevCt == 1) { // One device processes the whole image with no offset uiDevImageHeight[i] = uiImageHeight; uiInHostPixOffsets[i] = 0; uiOutHostPixOffsets[i] = 0; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); } else if (i == 0) { // Multiple devices, top stripe zone including topmost row of image: // Over-allocate on device by 1 row // Set offset and size to copy extra 1 padding row H2D (below bottom of stripe) // Won't return the last row (dark/garbage row) D2H uiInHostPixOffsets[i] = 0; uiOutHostPixOffsets[i] = 0; uiDevImageHeight[i] = (cl_uint)(GpuDevMngr->fLoadProportions[GpuDevMngr->uiUsefulDevs[i]] * (float)uiImageHeight); // height is proportional to dev perf uiSumHeight += uiDevImageHeight[i]; uiDevImageHeight[i] += 1; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); }
else if (i < (GpuDevMngr->uiUsefulDevCt - 1)) { // Multiple devices, middle stripe zone: // Over-allocate on device by 2 rows // Set offset and size to copy extra 2 padding rows H2D (above top and below bottom of stripe) // Won't return the first and last rows (dark/garbage rows) D2H uiInHostPixOffsets[i] = (uiSumHeight - 1) * uiImageWidth; uiOutHostPixOffsets[i] = uiInHostPixOffsets[i] + uiImageWidth; uiDevImageHeight[i] = (cl_uint)(GpuDevMngr->fLoadProportions[GpuDevMngr->uiUsefulDevs[i]] * (float)uiImageHeight); // height is proportional to dev perf uiSumHeight += uiDevImageHeight[i]; uiDevImageHeight[i] += 2; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); } else { // Multiple devices, last boundary tile: // Over-allocate on device by 1 row // Set offset and size to copy extra 1 padding row H2D (above top of stripe) // Won't return the first row (dark/garbage row) D2H uiInHostPixOffsets[i] = (uiSumHeight - 1) * uiImageWidth; uiOutHostPixOffsets[i] = uiInHostPixOffsets[i] + uiImageWidth; uiDevImageHeight[i] = uiImageHeight - uiSumHeight; // "leftover" rows uiSumHeight += uiDevImageHeight[i]; uiDevImageHeight[i] += 1; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); } shrLog("Image Height (rows) for Device %u = %u...\n", i, uiDevImageHeight[i]); // Create the device buffers in GMEM on each device cmDevBufIn[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szAllocDevBytes[i], NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmDevBufOut[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szAllocDevBytes[i], NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clCreateBuffer (Input and Output GMEM buffers, Device %u)...\n", i); // Set the common argument values for the Sobel kernel instance for each device int iLocalPixPitch = iBlockDimX + 2; ciErrNum = clSetKernelArg(ckSobel[i], 0, sizeof(cl_mem), (void*)&cmDevBufIn[i]); ciErrNum |= clSetKernelArg(ckSobel[i], 1, sizeof(cl_mem), (void*)&cmDevBufOut[i]); ciErrNum |= clSetKernelArg(ckSobel[i], 2, (iLocalPixPitch * (iBlockDimY + 2) * sizeof(cl_uchar4)), NULL); ciErrNum |= clSetKernelArg(ckSobel[i], 3, sizeof(cl_int), (void*)&iLocalPixPitch); ciErrNum |= clSetKernelArg(ckSobel[i], 4, sizeof(cl_uint), (void*)&uiImageWidth); ciErrNum |= clSetKernelArg(ckSobel[i], 6, sizeof(cl_float), (void*)&fThresh); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clSetKernelArg (0-4, 6), Device %u...\n\n", i); } // Set common global and local work sizes for Sobel kernel szLocalWorkSize[0] = iBlockDimX; szLocalWorkSize[1] = iBlockDimY; szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], uiImageWidth); // init running timers shrDeltaT(0); // timer 0 used for computation timing shrDeltaT(1); // timer 1 used for fps computation // Start main GLUT rendering loop for processing and rendering, // or otherwise run No-GL Q/A test sequence if (!(bQATest)) { glutMainLoop(); } else { TestNoGL(); } Cleanup(EXIT_SUCCESS); }
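// Below is a self-contained sketch of the pinned-host-buffer pattern used above (CL_MEM_ALLOC_HOST_PTR plus a blocking
// clEnqueueMapBuffer). The CHECK macro and the 1 MB size are illustrative, and error handling is reduced to the minimum.
//*****************************************************************************
#include <stdio.h>
#include <CL/cl.h>

#define CHECK(e) do { cl_int _e = (e); if (_e != CL_SUCCESS) { fprintf(stderr, "CL error %d at line %d\n", (int)_e, __LINE__); return 1; } } while (0)

int main(void)
{
    cl_platform_id plat; cl_device_id dev; cl_int err;
    CHECK(clGetPlatformIDs(1, &plat, NULL));
    CHECK(clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL));
    cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, &err); CHECK(err);
    cl_command_queue q = clCreateCommandQueue(ctx, dev, 0, &err); CHECK(err);

    // Pinned (page-locked) staging buffer owned by the OpenCL runtime
    const size_t bytes = 1 << 20;
    cl_mem pinned = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bytes, NULL, &err); CHECK(err);

    // Map once and keep the host pointer around, as the sample does with uiInput/uiOutput
    unsigned int* host = (unsigned int*)clEnqueueMapBuffer(q, pinned, CL_TRUE, CL_MAP_WRITE, 0, bytes, 0, NULL, NULL, &err); CHECK(err);
    host[0] = 0xCAFEu; // fill as if loading image data
    printf("pinned buffer mapped at %p\n", (void*)host);

    CHECK(clEnqueueUnmapMemObject(q, pinned, host, 0, NULL, NULL));
    CHECK(clFinish(q));
    clReleaseMemObject(pinned); clReleaseCommandQueue(q); clReleaseContext(ctx);
    return 0;
}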
int main(int argc, char* argv[]) { const int n = 4; const size_t sizes[n] = { 3 << 10, 15 << 10, 15 << 20, 100 << 20 }; const int iterations[n] = { 60000, 60000, 300, 30 }; printf( "device,platform name,device name,size (B),memory,access,direction,time (s),bandwidth (MB/s)\n"); int g = 0; cl_int error; cl_uint num_platforms; checkOclErrors(clGetPlatformIDs(0, NULL, &num_platforms)); cl_platform_id* platforms = (cl_platform_id*) malloc( sizeof(cl_platform_id) * num_platforms); checkOclErrors(clGetPlatformIDs(num_platforms, platforms, NULL)); for (cl_uint p = 0; p < num_platforms; ++p) { cl_platform_id platform = platforms[p]; char platform_name[256]; checkOclErrors( clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL)); // if (strcmp(platform_name, "NVIDIA CUDA")) continue; cl_uint num_devices; checkOclErrors( clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices)); cl_device_id* devices = (cl_device_id*) malloc( sizeof(cl_device_id) * num_devices); checkOclErrors( clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL)); for (cl_uint d = 0; d < num_devices; ++d, ++g) { cl_device_id device = devices[d]; char device_name[256]; checkOclErrors( clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL)); cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &error); checkOclErrors(error); cl_command_queue command_queue = clCreateCommandQueue(context, device, 0/*CL_QUEUE_PROFILING_ENABLE*/, &error); checkOclErrors(error); const size_t size = 100000 * sizeof(float); // note: only this single fixed size is exercised below; the sizes/iterations tables above are unused in this variant const double bandwidth_unit = (double) size / (1 << 20); void* h_p = NULL; void* d_p = NULL; cl_mem h_b; cl_mem d_b; double time; double bandwidth; d_b = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &error); checkOclErrors(error); // allocate pinned h_b h_b = clCreateBuffer(context, /*CL_MEM_READ_WRITE | */ CL_MEM_ALLOC_HOST_PTR, size, NULL, &error); checkOclErrors(error); // --memory=pinned --access=mapped --htod shrDeltaT(); h_p = clEnqueueMapBuffer(command_queue, h_b, CL_TRUE, CL_MAP_READ, 0, size, 0, NULL, NULL, &error); checkOclErrors(error); d_p = clEnqueueMapBuffer(command_queue, d_b, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, size, 0, NULL, NULL, &error); checkOclErrors(error); memcpy(d_p, h_p, size); checkOclErrors( clEnqueueUnmapMemObject(command_queue, d_b, d_p, 0, NULL, NULL)); checkOclErrors( clEnqueueUnmapMemObject(command_queue, h_b, h_p, 0, NULL, NULL)); checkOclErrors(clFinish(command_queue)); time = shrDeltaT(); bandwidth = bandwidth_unit / time; printf("%d,%s,%s,%zu,%s,%s,%s,%.3f,%.0f\n", g, platform_name, device_name, size, "pinned", "mapped", "HtoD", time, bandwidth); // --memory=pinned --access=mapped --dtoh shrDeltaT(); h_p = clEnqueueMapBuffer(command_queue, h_b, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, size, 0, NULL, NULL, &error); checkOclErrors(error); d_p = clEnqueueMapBuffer(command_queue, d_b, CL_TRUE, CL_MAP_READ, 0, size, 0, NULL, NULL, &error); checkOclErrors(error); memcpy(h_p, d_p, size); checkOclErrors( clEnqueueUnmapMemObject(command_queue, d_b, d_p, 0, NULL, NULL)); checkOclErrors( clEnqueueUnmapMemObject(command_queue, h_b, h_p, 0, NULL, NULL)); checkOclErrors(clFinish(command_queue)); time = shrDeltaT(); bandwidth = bandwidth_unit / time; printf("%d,%s,%s,%zu,%s,%s,%s,%.3f,%.0f\n", g, platform_name, device_name, size, "pinned", "mapped", "DtoH", time, bandwidth); // deallocate pinned h_b checkOclErrors(clReleaseMemObject(h_b)); checkOclErrors(clReleaseMemObject(d_b));
checkOclErrors(clReleaseCommandQueue(command_queue)); checkOclErrors(clReleaseContext(context)); } free(devices); } free(platforms); }
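// The benchmark above times transfers with wall-clock deltas (shrDeltaT). Below is a minimal POSIX stand-in for that
// timer with the same MB/s arithmetic, assuming clock_gettime/CLOCK_MONOTONIC is available; the busy loop merely
// stands in for the memcpy being timed.
//*****************************************************************************
#include <stdio.h>
#include <time.h>

// Seconds elapsed since the previous call; the first call measures from an arbitrary origin
static double delta_t(void)
{
    static struct timespec prev;
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);
    double dt = (double)(now.tv_sec - prev.tv_sec) + 1e-9 * (double)(now.tv_nsec - prev.tv_nsec);
    prev = now;
    return dt;
}

int main(void)
{
    const size_t size = 100000 * sizeof(float); // same transfer size as above
    delta_t();                                  // start the interval
    volatile double sink = 0.0;
    for (long i = 0; i < 10000000L; ++i) sink += (double)i; // stand-in for the timed work
    double t = delta_t();                       // end the interval
    printf("time %.3f s, bandwidth %.0f MB/s\n", t, ((double)size / (1 << 20)) / t);
    return 0;
}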
// Main program //***************************************************************************** int main(int argc, char** argv) { numParticles = NUM_PARTICLES; uint gridDim = GRID_SIZE; pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); // Start logs and timers cExecutableName = argv[0]; shrSetLogFileName ("oclParticles.txt"); shrLog("%s Starting...\n\n", argv[0]); // check command line flags and parameters if (argc > 1) { shrGetCmdLineArgumenti(argc, (const char**)argv, "n", (int*)&numParticles); shrGetCmdLineArgumenti(argc, (const char**)argv, "grid", (int*)&gridDim); bQATest = shrCheckCmdLineFlag(argc, (const char**)argv, "qatest"); bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); } // Set and log grid size and particle count, after checking optional command-line inputs gridSize.x = gridSize.y = gridSize.z = gridDim; shrLog(" grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x * gridSize.y * gridSize.z); shrLog(" particles: %d\n\n", numParticles); // initialize GLUT and GLEW if(!bQATest) { InitGL(&argc, argv); } // initialize OpenCL startupOpenCL(argc,(const char**)argv); // init simulation parameters and objects initParticleSystem(numParticles, gridSize); initParams(); // Init timers shrDeltaT(0); // timer 0 is for processing time measurements shrDeltaT(1); // timer 1 is for fps measurement // Start main GLUT rendering loop for processing and rendering, // or otherwise run No-GL Q/A test sequence if(!bQATest) { glutMainLoop(); } else { TestNoGL(); } // Normally unused return path shrQAFinish(argc, (const char **)argv, QA_PASSED); Cleanup(EXIT_SUCCESS); }
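// The samples read numeric flags such as n and grid via shrGetCmdLineArgumenti. Below is a minimal stand-in,
// assuming the --name=value flag form; the helper name and the defaults are illustrative, not from shrUtils.
//*****************************************************************************
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Scans for "--name=value" and writes the integer value if present; returns 1 if found, 0 otherwise
static int get_cmdline_int(int argc, const char** argv, const char* name, int* out)
{
    const size_t len = strlen(name);
    for (int i = 1; i < argc; ++i) {
        const char* a = argv[i];
        if (strncmp(a, "--", 2) == 0 && strncmp(a + 2, name, len) == 0 && a[2 + len] == '=') {
            *out = atoi(a + 3 + len);
            return 1;
        }
    }
    return 0;
}

int main(int argc, const char** argv)
{
    int numParticles = 16384, gridDim = 64; // illustrative defaults
    get_cmdline_int(argc, argv, "n", &numParticles);
    get_cmdline_int(argc, argv, "grid", &gridDim);
    printf("particles=%d, grid: %d x %d x %d = %d cells\n", numParticles, gridDim, gridDim, gridDim, gridDim * gridDim * gridDim);
    return 0;
}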
// Main function // ********************************************************************* int main(int argc, char** argv) { shrQAStart(argc, argv); int use_gpu = 0; for(int i = 0; i < argc && argv; i++) { if(!argv[i]) continue; if(strstr(argv[i], "cpu")) use_gpu = 0; else if(strstr(argv[i], "gpu")) use_gpu = 1; } // start logs shrSetLogFileName ("oclDXTCompression.txt"); shrLog("%s Starting...\n\n", argv[0]); cl_platform_id cpPlatform = NULL; cl_uint uiNumDevices = 0; cl_device_id *cdDevices = NULL; cl_context cxGPUContext; cl_command_queue cqCommandQueue; cl_program cpProgram; cl_kernel ckKernel; cl_mem cmMemObjs[3]; cl_mem cmAlphaTable4, cmProds4; cl_mem cmAlphaTable3, cmProds3; size_t szGlobalWorkSize[1]; size_t szLocalWorkSize[1]; cl_int ciErrNum; // Get the image filename from the command line, if provided char *filename; if (shrGetCmdLineArgumentstr(argc, (const char **)argv, "image", &filename)) { image_filename = filename; } // load image const char* image_path = shrFindFilePath(image_filename, argv[0]); oclCheckError(image_path != NULL, shrTRUE); shrLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); oclCheckError(h_img != NULL, shrTRUE); shrLog("Loaded '%s', %d x %d pixels\n\n", image_path, width, height); // Convert linear image to block linear. const uint memSize = width * height * sizeof(cl_uint); uint* block_image = (uint*)malloc(memSize); for(uint by = 0; by < height/4; by++) { for(uint bx = 0; bx < width/4; bx++) { for (int i = 0; i < 16; i++) { const int x = i & 3; const int y = i / 4; block_image[(by * width/4 + bx) * 16 + i] = ((uint *)h_img)[(by * 4 + y) * 4 * (width/4) + bx * 4 + x]; } } } // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); // Get the requested device type (GPU or CPU) from the platform ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, 0, NULL, &uiNumDevices); oclCheckError(ciErrNum, CL_SUCCESS); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, uiNumDevices, cdDevices, NULL); oclCheckError(ciErrNum, CL_SUCCESS); // Create the context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // get and log device cl_device_id device; if( shrCheckCmdLineFlag(argc, (const char **)argv, "device") ) { int device_nr = 0; shrGetCmdLineArgumenti(argc, (const char **)argv, "device", &device_nr); device = oclGetDev(cxGPUContext, device_nr); if( device == (cl_device_id)-1 ) { shrLog(" Invalid GPU Device: devID=%d. 
%d valid GPU devices detected\n\n", device_nr, uiNumDevices); shrLog(" exiting...\n"); return -1; } } else { device = oclGetMaxFlopsDev(cxGPUContext); } oclPrintDevName(LOGBOTH, device); shrLog("\n"); // create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Memory Setup // Constants cmAlphaTable4 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_float), (void*)&alphaTable4[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); cmProds4 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_int), (void*)&prods4[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); cmAlphaTable3 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_float), (void*)&alphaTable3[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); cmProds3 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_int), (void*)&prods3[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Compute permutations. cl_uint permutations[1024]; computePermutations(permutations); // Upload permutations. cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * 1024, permutations, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Image cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, memSize, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Result const uint compressedSize = (width / 4) * (height / 4) * 8; cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, compressedSize, NULL , &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); unsigned int * h_result = (uint*)malloc(compressedSize); // Program Setup size_t program_length; const char* source_path = shrFindFilePath("DXTCompression.cl", argv[0]); oclCheckError(source_path != NULL, shrTRUE); char *source = oclLoadProgSource(source_path, "", &program_length); oclCheckError(source != NULL, shrTRUE); // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **) &source, &program_length, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDXTCompression.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } // create the kernel ckKernel = clCreateKernel(cpProgram, "compress", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // set the args values ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &cmMemObjs[0]); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &cmMemObjs[1]); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void *) &cmMemObjs[2]); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&cmAlphaTable4); ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&cmProds4); ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&cmAlphaTable3); ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(cl_mem), (void*)&cmProds3); oclCheckError(ciErrNum, CL_SUCCESS); // Copy input data host to device clEnqueueWriteBuffer(cqCommandQueue, cmMemObjs[1], CL_FALSE, 0, sizeof(cl_uint) * width * height, block_image, 0,0,0); // Determine launch configuration and run timed 
computation numIterations times int blocks = ((width + 3) / 4) * ((height + 3) / 4); // rounds up by 1 block in each dim if %4 != 0 // Restrict the number of blocks to launch on low end GPUs to avoid kernel timeout cl_uint compute_units; clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL); int blocksPerLaunch = MIN(blocks, 768 * (int)compute_units); // set work-item dimensions szGlobalWorkSize[0] = blocksPerLaunch * NUM_THREADS; szLocalWorkSize[0]= NUM_THREADS; #ifdef GPU_PROFILING shrLog("\nRunning DXT Compression on %u x %u image...\n", width, height); shrLog("\n%u Workgroups, %u Work Items per Workgroup, %u Work Items in NDRange...\n\n", blocks, NUM_THREADS, blocks * NUM_THREADS); int numIterations = 50; for (int i = -1; i < numIterations; ++i) { if (i == 0) { // start timing only after the first warmup iteration clFinish(cqCommandQueue); // sync queue to host before starting the timer shrDeltaT(0); // start timer } #endif // execute kernel for( int j=0; j<blocks; j+= blocksPerLaunch ) { clSetKernelArg(ckKernel, 7, sizeof(int), &j); szGlobalWorkSize[0] = MIN( blocksPerLaunch, blocks-j ) * NUM_THREADS; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); } #ifdef GPU_PROFILING } clFinish(cqCommandQueue); double dAvgTime = shrDeltaT(0) / (double)numIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclDXTCompression, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %d\n", (1.0e-6 * (double)(width * height)/ dAvgTime), dAvgTime, (width * height), 1, (int)szLocalWorkSize[0]); #endif // blocking read output ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmMemObjs[2], CL_TRUE, 0, compressedSize, h_result, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); // Write DDS file. 
FILE* fp = NULL; char output_filename[1024]; #ifdef WIN32 strcpy_s(output_filename, 1024, image_path); strcpy_s(output_filename + strlen(image_path) - 3, 1024 - strlen(image_path) + 3, "dds"); fopen_s(&fp, output_filename, "wb"); #else strcpy(output_filename, image_path); strcpy(output_filename + strlen(image_path) - 3, "dds"); fp = fopen(output_filename, "wb"); #endif oclCheckError(fp != NULL, shrTRUE); DDSHeader header; header.fourcc = FOURCC_DDS; header.size = 124; header.flags = (DDSD_WIDTH|DDSD_HEIGHT|DDSD_CAPS|DDSD_PIXELFORMAT|DDSD_LINEARSIZE); header.height = height; header.width = width; header.pitch = compressedSize; header.depth = 0; header.mipmapcount = 0; memset(header.reserved, 0, sizeof(header.reserved)); header.pf.size = 32; header.pf.flags = DDPF_FOURCC; header.pf.fourcc = FOURCC_DXT1; header.pf.bitcount = 0; header.pf.rmask = 0; header.pf.gmask = 0; header.pf.bmask = 0; header.pf.amask = 0; header.caps.caps1 = DDSCAPS_TEXTURE; header.caps.caps2 = 0; header.caps.caps3 = 0; header.caps.caps4 = 0; header.notused = 0; fwrite(&header, sizeof(DDSHeader), 1, fp); fwrite(h_result, compressedSize, 1, fp); fclose(fp); // Make sure the generated image matches the reference image (regression check) shrLog("\nComparing against Host/C++ computation...\n"); const char* reference_image_path = shrFindFilePath(refimage_filename, argv[0]); oclCheckError(reference_image_path != NULL, shrTRUE); // read in the reference image from file #ifdef WIN32 fopen_s(&fp, reference_image_path, "rb"); #else fp = fopen(reference_image_path, "rb"); #endif oclCheckError(fp != NULL, shrTRUE); fseek(fp, sizeof(DDSHeader), SEEK_SET); uint referenceSize = (width / 4) * (height / 4) * 8; uint * reference = (uint *)malloc(referenceSize); fread(reference, referenceSize, 1, fp); fclose(fp); // compare the reference image data to the sample/generated image float rms = 0; for (uint y = 0; y < height; y += 4) { for (uint x = 0; x < width; x += 4) { // binary comparison of data uint referenceBlockIdx = ((y/4) * (width/4) + (x/4)); uint resultBlockIdx = ((y/4) * (width/4) + (x/4)); int cmp = compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); // log deviations, if any if (cmp != 0) { shrLog("Deviation at (%d, %d):\t%f rms\n", x/4, y/4, float(cmp)/16/3); } rms += cmp; } } rms /= width * height * 3; shrLog("RMS(reference, result) = %f\n\n", rms); // Free OpenCL resources oclDeleteMemObjs(cmMemObjs, 3); clReleaseMemObject(cmAlphaTable4); clReleaseMemObject(cmProds4); clReleaseMemObject(cmAlphaTable3); clReleaseMemObject(cmProds3); clReleaseKernel(ckKernel); clReleaseProgram(cpProgram); clReleaseCommandQueue(cqCommandQueue); clReleaseContext(cxGPUContext); // Free host memory free(source); free(h_img); // finish shrQAFinishExit(argc, (const char **)argv, (rms <= ERROR_THRESHOLD) ? QA_PASSED : QA_FAILED); }
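// The linear-to-block-linear swizzle performed before compression above, isolated into a testable function
// (a minimal sketch; W and H must be multiples of 4, as in the sample).
//*****************************************************************************
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Reorder a linear W x H image into contiguous 4x4 blocks
static void to_block_linear(const uint32_t* linear, uint32_t* blocked, unsigned w, unsigned h)
{
    for (unsigned by = 0; by < h / 4; by++)
        for (unsigned bx = 0; bx < w / 4; bx++)
            for (unsigned i = 0; i < 16; i++) {
                unsigned x = i & 3, y = i / 4;
                blocked[(by * (w / 4) + bx) * 16 + i] = linear[(by * 4 + y) * w + bx * 4 + x];
            }
}

int main(void)
{
    enum { W = 8, H = 4 };
    uint32_t img[W * H], blk[W * H];
    for (unsigned i = 0; i < W * H; i++) img[i] = i;
    to_block_linear(img, blk, W, H);
    // First block holds the top-left 4x4 tile of the image
    assert(blk[0] == 0 && blk[4] == W && blk[15] == 3 * W + 3);
    printf("block-linear swizzle OK\n");
    return 0;
}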
bool fdtdGPU(float *output, const float *input, const float *coeff, const int dimx, const int dimy, const int dimz, const int radius, const int timesteps, const int argc, const char **argv) { bool ok = true; const int outerDimx = dimx + 2 * radius; const int outerDimy = dimy + 2 * radius; const int outerDimz = dimz + 2 * radius; const size_t volumeSize = outerDimx * outerDimy * outerDimz; cl_context context = 0; cl_platform_id platform = 0; cl_device_id *devices = 0; cl_command_queue commandQueue = 0; cl_mem bufferOut = 0; cl_mem bufferIn = 0; cl_mem bufferCoeff = 0; cl_program program = 0; cl_kernel kernel = 0; cl_event *kernelEvents = 0; #ifdef GPU_PROFILING cl_ulong kernelEventStart; cl_ulong kernelEventEnd; #endif double hostElapsedTimeS; char *cPathAndName = 0; char *cSourceCL = 0; size_t szKernelLength; size_t globalWorkSize[2]; size_t localWorkSize[2]; cl_uint deviceCount = 0; cl_uint targetDevice = 0; cl_int errnum = 0; char buildOptions[128]; // Ensure that the inner data starts on a 128B boundary const int padding = (128 / sizeof(float)) - radius; const size_t paddedVolumeSize = volumeSize + padding; #ifdef GPU_PROFILING const int profileTimesteps = timesteps - 1; if (ok) { if (profileTimesteps < 1) { shrLog(" cannot profile with fewer than two timesteps (timesteps=%d), profiling is disabled.\n", timesteps); } } #endif // Get the NVIDIA platform if (ok) { shrLog(" oclGetPlatformID...\n"); errnum = oclGetPlatformID(&platform); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("oclGetPlatformID (returned %d).\n", errnum); ok = false; } } // Get the list of GPU devices associated with the platform if (ok) { shrLog(" clGetDeviceIDs"); errnum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &deviceCount); devices = (cl_device_id *)malloc(deviceCount * sizeof(cl_device_id) ); errnum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, deviceCount, devices, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clGetDeviceIDs (returned %d).\n", errnum); ok = false; } } // Create the OpenCL context if (ok) { shrLog(" clCreateContext...\n"); context = clCreateContext(0, deviceCount, devices, NULL, NULL, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateContext (returned %d).\n", errnum); ok = false; } } // Select target device (device 0 by default) if (ok) { char *device = 0; if (shrGetCmdLineArgumentstr(argc, argv, "device", &device)) { targetDevice = (cl_uint)atoi(device); if (targetDevice >= deviceCount) { shrLogEx(LOGBOTH | ERRORMSG, -2001, STDERROR); shrLog("invalid target device specified on command line (device %d does not exist).\n", targetDevice); ok = false; } } else { targetDevice = 0; } if (device) { free(device); } } // Create a command-queue if (ok) { shrLog(" clCreateCommandQueue\n"); commandQueue = clCreateCommandQueue(context, devices[targetDevice], CL_QUEUE_PROFILING_ENABLE, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateCommandQueue (returned %d).\n", errnum); ok = false; } } // Create memory buffer objects if (ok) { shrLog(" clCreateBuffer bufferOut\n"); bufferOut = clCreateBuffer(context, CL_MEM_READ_WRITE, paddedVolumeSize * sizeof(float), NULL, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateBuffer (returned %d).\n", errnum); ok = false; } } if (ok) { shrLog(" clCreateBuffer bufferIn\n"); bufferIn = clCreateBuffer(context, 
CL_MEM_READ_WRITE, paddedVolumeSize * sizeof(float), NULL, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateBuffer (returned %d).\n", errnum); ok = false; } } if (ok) { shrLog(" clCreateBuffer bufferCoeff\n"); bufferCoeff = clCreateBuffer(context, CL_MEM_READ_ONLY, (radius + 1) * sizeof(float), NULL, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateBuffer (returned %d).\n", errnum); ok = false; } } // Load the kernel from file if (ok) { shrLog(" shrFindFilePath\n"); cPathAndName = shrFindFilePath(clSourceFile, argv[0]); if (cPathAndName == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -2002, STDERROR); shrLog("shrFindFilePath returned null.\n"); ok = false; } } if (ok) { shrLog(" oclLoadProgSource\n"); cSourceCL = oclLoadProgSource(cPathAndName, "// Preamble\n", &szKernelLength); if (cSourceCL == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -2003, STDERROR); shrLog("oclLoadProgSource returned null.\n"); ok = false; } } // Create the program if (ok) { shrLog(" clCreateProgramWithSource\n"); program = clCreateProgramWithSource(context, 1, (const char **)&cSourceCL, &szKernelLength, &errnum); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateProgramWithSource (returned %d).\n", errnum); ok = false; } } // Check for a command-line specified work group size size_t userWorkSize; int localWorkMaxY; if (ok) { int userWorkSizeInt; if (shrGetCmdLineArgumenti(argc, argv, "work-group-size", &userWorkSizeInt)) { // We can't clamp to CL_KERNEL_WORK_GROUP_SIZE yet since that is // dependent on the build. if (userWorkSizeInt < k_localWorkMin || userWorkSizeInt > k_localWorkMax) { shrLogEx(LOGBOTH | ERRORMSG, -2004, STDERROR); shrLog("invalid work group size specified on command line (must be between %d and %d).\n", k_localWorkMin, k_localWorkMax); ok = false; } // Constrain to a multiple of k_localWorkX userWorkSize = (userWorkSizeInt / k_localWorkX * k_localWorkX); } else { userWorkSize = k_localWorkY * k_localWorkX; } // Divide by k_localWorkX (integer division to clamp) localWorkMaxY = userWorkSize / k_localWorkX; } // Build the program if (ok) { #ifdef WIN32 if (sprintf_s(buildOptions, sizeof(buildOptions), "-DRADIUS=%d -DMAXWORKX=%d -DMAXWORKY=%d -cl-fast-relaxed-math", radius, k_localWorkX, localWorkMaxY) < 0) { shrLogEx(LOGBOTH | ERRORMSG, -2005, STDERROR); shrLog("sprintf_s (failed).\n"); ok = false; } #else if (snprintf(buildOptions, sizeof(buildOptions), "-DRADIUS=%d -DMAXWORKX=%d -DMAXWORKY=%d -cl-fast-relaxed-math", radius, k_localWorkX, localWorkMaxY) < 0) { shrLogEx(LOGBOTH | ERRORMSG, -2005, STDERROR); shrLog("snprintf (failed).\n"); ok = false; } #endif } if (ok) { shrLog(" clBuildProgram (%s)\n", buildOptions); errnum = clBuildProgram(program, 0, NULL, buildOptions, NULL, NULL); if (errnum != CL_SUCCESS) { char buildLog[10240]; clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL); shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clBuildProgram (returned %d).\n", errnum); shrLog("Log:\n%s\n", buildLog); ok = false; } } // Create the kernel if (ok) { shrLog(" clCreateKernel\n"); kernel = clCreateKernel(program, "FiniteDifferences", &errnum); if (kernel == (cl_kernel)NULL || errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clCreateKernel (returned %d).\n", errnum); ok = false; } } // Get the maximum work group size size_t maxWorkSize; if (ok) { shrLog(" 
clGetKernelWorkGroupInfo\n"); errnum = clGetKernelWorkGroupInfo(kernel, devices[targetDevice], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkSize, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clGetKernelWorkGroupInfo (returned %d).\n", errnum); ok = false; } } // Set the work group size if (ok) { userWorkSize = CLAMP(userWorkSize, k_localWorkMin, maxWorkSize); localWorkSize[0] = k_localWorkX; localWorkSize[1] = userWorkSize / k_localWorkX; globalWorkSize[0] = localWorkSize[0] * (unsigned int)ceil((float)dimx / localWorkSize[0]); globalWorkSize[1] = localWorkSize[1] * (unsigned int)ceil((float)dimy / localWorkSize[1]); shrLog(" set local work group size to %dx%d\n", localWorkSize[0], localWorkSize[1]); shrLog(" set total work size to %dx%d\n", globalWorkSize[0], globalWorkSize[1]); } // Copy the input to the device input buffer if (ok) { shrLog(" clEnqueueWriteBuffer bufferIn\n"); errnum = clEnqueueWriteBuffer(commandQueue, bufferIn, CL_TRUE, padding * sizeof(float), volumeSize * sizeof(float), input, 0, NULL, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clEnqueueWriteBuffer bufferIn (returned %d).\n", errnum); ok = false; } } // Copy the input to the device output buffer (actually only need the halo) if (ok) { shrLog(" clEnqueueWriteBuffer bufferOut\n"); errnum = clEnqueueWriteBuffer(commandQueue, bufferOut, CL_TRUE, padding * sizeof(float), volumeSize * sizeof(float), input, 0, NULL, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clEnqueueWriteBuffer bufferOut (returned %d).\n", errnum); ok = false; } } // Copy the coefficients to the device coefficient buffer if (ok) { shrLog(" clEnqueueWriteBuffer bufferCoeff\n"); errnum = clEnqueueWriteBuffer(commandQueue, bufferCoeff, CL_TRUE, 0, (radius + 1) * sizeof(float), coeff, 0, NULL, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clEnqueueWriteBuffer bufferCoeff (returned %d).\n", errnum); ok = false; } } // Allocate the events if (ok) { shrLog(" calloc events\n"); if ((kernelEvents = (cl_event *)calloc(timesteps, sizeof(cl_event))) == NULL) { shrLogEx(LOGBOTH | ERRORMSG, -2006, STDERROR); shrLog("Insufficient memory for events calloc, please try a smaller volume (use --help for syntax).\n"); ok = false; } } // Start the clock shrDeltaT(0); // Set the constant arguments if (ok) { shrLog(" clSetKernelArg 2-6\n"); errnum = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&bufferCoeff); errnum |= clSetKernelArg(kernel, 3, sizeof(int), &dimx); errnum |= clSetKernelArg(kernel, 4, sizeof(int), &dimy); errnum |= clSetKernelArg(kernel, 5, sizeof(int), &dimz); errnum |= clSetKernelArg(kernel, 6, sizeof(int), &padding); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clSetKernelArg 2-6 (returned %d).\n", errnum); ok = false; } } // Execute the FDTD cl_mem bufferSrc = bufferIn; cl_mem bufferDst = bufferOut; if (ok) { shrLog(" GPU FDTD loop\n"); } for (int it = 0 ; ok && it < timesteps ; it++) { shrLog("\tt = %d ", it); // Set the dynamic arguments if (ok) { shrLog(" clSetKernelArg 0-1,"); errnum = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&bufferDst); errnum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&bufferSrc); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clSetKernelArg 0-1 (returned %d).\n", errnum); ok = false; } } // Launch the kernel if (ok) { shrLog(" 
clEnqueueNDRangeKernel\n"); errnum = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &kernelEvents[it]); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clEnqueueNDRangeKernel (returned %d).\n", errnum); ok = false; } } // Toggle the buffers cl_mem tmp = bufferSrc; bufferSrc = bufferDst; bufferDst = tmp; } if (ok) shrLog("\n"); // Wait for the kernel to complete if (ok) { shrLog(" clWaitForEvents\n"); errnum = clWaitForEvents(1, &kernelEvents[timesteps-1]); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clWaitForEvents (returned %d).\n", errnum); ok = false; } } // Stop the clock hostElapsedTimeS = shrDeltaT(0); // Read the result back, result is in bufferSrc (after final toggle) if (ok) { shrLog(" clEnqueueReadBuffer\n"); errnum = clEnqueueReadBuffer(commandQueue, bufferSrc, CL_TRUE, padding * sizeof(float), volumeSize * sizeof(float), output, 0, NULL, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clEnqueueReadBuffer bufferSrc (returned %d).\n", errnum); ok = false; } } // Report time #ifdef GPU_PROFILING double elapsedTime = 0.0; if (ok && profileTimesteps > 0) shrLog(" Collect profile information\n"); for (int it = 1 ; ok && it <= profileTimesteps ; it++) { shrLog("\tt = %d ", it); shrLog(" clGetEventProfilingInfo,"); errnum = clGetEventProfilingInfo(kernelEvents[it], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernelEventStart, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clGetEventProfilingInfo (returned %d).\n", errnum); ok = false; } shrLog(" clGetEventProfilingInfo\n"); errnum = clGetEventProfilingInfo(kernelEvents[it], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEventEnd, NULL); if (errnum != CL_SUCCESS) { shrLogEx(LOGBOTH | ERRORMSG, errnum, STDERROR); shrLog("clGetEventProfilingInfo (returned %d).\n", errnum); ok = false; } elapsedTime += (double)kernelEventEnd - (double)kernelEventStart; } if (ok && profileTimesteps > 0) { shrLog("\n"); // Convert nanoseconds to seconds elapsedTime *= 1.0e-9; double avgElapsedTime = elapsedTime / (double)profileTimesteps; // Determine number of computations per timestep size_t pointsComputed = dimx * dimy * dimz; // Determine throughput double throughputM = 1.0e-6 * (double)pointsComputed / avgElapsedTime; shrLogEx(LOGBOTH | MASTER, 0, "oclFDTD3d, Throughput = %.4f MPoints/s, Time = %.5f s, Size = %u Points, NumDevsUsed = %i, Workgroup = %u\n", throughputM, avgElapsedTime, (unsigned int)pointsComputed, 1, (unsigned int)(localWorkSize[0] * localWorkSize[1])); } #endif // Cleanup if (kernelEvents) { for (int it = 0 ; it < timesteps ; it++) { if (kernelEvents[it]) clReleaseEvent(kernelEvents[it]); } free(kernelEvents); } if (kernel) clReleaseKernel(kernel); if (program) clReleaseProgram(program); if (cSourceCL) free(cSourceCL); if (cPathAndName) free(cPathAndName); if (bufferCoeff) clReleaseMemObject(bufferCoeff); if (bufferIn) clReleaseMemObject(bufferIn); if (bufferOut) clReleaseMemObject(bufferOut); if (commandQueue) clReleaseCommandQueue(commandQueue); if (devices) free(devices); if (context) clReleaseContext(context); return ok; }
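// The src/dst buffer toggling in the FDTD loop above, reduced to a plain-C 1D stencil so the
// "result ends up in bufferSrc after the final toggle" behavior is easy to see (illustrative coefficients).
//*****************************************************************************
#include <stdio.h>
#include <string.h>

#define N 16

int main(void)
{
    float a[N], b[N];
    for (int i = 0; i < N; i++) a[i] = (i == N / 2) ? 1.0f : 0.0f; // impulse input
    memcpy(b, a, sizeof(a)); // dst starts as a copy so the boundary cells stay valid

    float *src = a, *dst = b;
    for (int t = 0; t < 8; t++) {
        for (int i = 1; i < N - 1; i++)
            dst[i] = 0.25f * src[i - 1] + 0.5f * src[i] + 0.25f * src[i + 1];
        float* tmp = src; src = dst; dst = tmp; // toggle, as the GPU loop does
    }
    // After the final toggle the newest data is in src, matching the
    // clEnqueueReadBuffer(bufferSrc, ...) above.
    for (int i = 0; i < N; i++) printf("%.4f ", src[i]);
    printf("\n");
    return 0;
}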
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { shrQAStart(argc, argv); // start logs shrSetLogFileName ("oclSimpleMultiGPU.txt"); shrLog("%s Starting, Array = %u float values...\n\n", argv[0], DATA_N); // OpenCL cl_platform_id cpPlatform; cl_uint ciDeviceCount; cl_device_id* cdDevices; cl_context cxGPUContext; cl_device_id cdDevice; // GPU device int deviceNr[MAX_GPU_COUNT]; cl_command_queue commandQueue[MAX_GPU_COUNT]; cl_mem d_Data[MAX_GPU_COUNT]; cl_mem d_Result[MAX_GPU_COUNT]; cl_program cpProgram; cl_kernel reduceKernel[MAX_GPU_COUNT]; cl_event GPUDone[MAX_GPU_COUNT]; cl_event GPUExecution[MAX_GPU_COUNT]; size_t programLength; cl_int ciErrNum; char cDeviceName [256]; cl_mem h_DataBuffer; // Vars for reduction results float h_SumGPU[MAX_GPU_COUNT * ACCUM_N]; float *h_Data; double sumGPU; double sumCPU, dRelError; // allocate and init host buffer with some randomly generated input data h_Data = (float *)malloc(DATA_N * sizeof(float)); shrFillArray(h_Data, DATA_N); // start timer & logs shrLog("Setting up OpenCL on the Host...\n\n"); shrDeltaT(1); // Annotate profiling state #ifdef GPU_PROFILING shrLog("OpenCL Profiling is enabled...\n\n"); #endif //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clGetPlatformID...\n"); //Get the devices ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount); oclCheckError(ciErrNum, CL_SUCCESS); cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clGetDeviceIDs...\n"); //Create the context cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateContext...\n"); // Set up command queue(s) for GPUs specified on the command line, or all GPUs if(shrCheckCmdLineFlag(argc, (const char **)argv, "device")) { // User specified GPUs int ciMaxDeviceID = ciDeviceCount-1; ciDeviceCount = 0; char* deviceList; char* deviceStr; char* next_token; shrGetCmdLineArgumentstr(argc, (const char **)argv, "device", &deviceList); #ifdef WIN32 deviceStr = strtok_s (deviceList," ,.-", &next_token); #else deviceStr = strtok (deviceList," ,.-"); #endif // Create command queues for all requested GPUs while(deviceStr != NULL) { // get & log device index # and name deviceNr[ciDeviceCount] = atoi(deviceStr); if( deviceNr[ciDeviceCount] > ciMaxDeviceID ) { shrLog(" Invalid user specified device ID: %d\n", deviceNr[ciDeviceCount]); return 1; } cdDevice = oclGetDev(cxGPUContext, deviceNr[ciDeviceCount]); ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog(" Device %i: %s\n\n", deviceNr[ciDeviceCount], cDeviceName); // create a command queue commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateCommandQueue\n"); ++ciDeviceCount; #ifdef WIN32 deviceStr = strtok_s (NULL," ,.-", &next_token); #else deviceStr = strtok (NULL," ,.-"); #endif } free(deviceList); } else { // Find out how many GPUs are available and compute on all of them size_t nDeviceBytes; ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes); oclCheckError(ciErrNum, CL_SUCCESS); ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id); for(unsigned int i = 0; i < ciDeviceCount; ++i ) { // get & log device index # and name deviceNr[i] = i; cdDevice = oclGetDev(cxGPUContext, i); ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog(" Device %i: %s\n", i, cDeviceName); // create a command queue commandQueue[i] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateCommandQueue\n\n"); } }
// Load the OpenCL source code from the .cl file const char* source_path = shrFindFilePath("simpleMultiGPU.cl", argv[0]); char *source = oclLoadProgSource(source_path, "", &programLength); oclCheckError(source != NULL, shrTRUE); shrLog("oclLoadProgSource\n"); // Create the program for all GPUs in the context cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &programLength, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateProgramWithSource\n"); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSimpleMultiGPU.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } shrLog("clBuildProgram\n"); // Create host buffer with page-locked memory h_DataBuffer = clCreateBuffer(cxGPUContext, CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR, DATA_N * sizeof(float), h_Data, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateBuffer (Page-locked Host)\n\n"); // Create buffers for each GPU, with data divided evenly among GPUs int sizePerGPU = DATA_N / ciDeviceCount; int workOffset[MAX_GPU_COUNT]; int workSize[MAX_GPU_COUNT]; workOffset[0] = 0; for(unsigned int i = 0; i < ciDeviceCount; ++i ) { workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (DATA_N - workOffset[i]); // Input buffer d_Data[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateBuffer (Input)\t\tDev %i\n", i); // Copy data from host to device ciErrNum = clEnqueueCopyBuffer(commandQueue[i], h_DataBuffer, d_Data[i], workOffset[i] * sizeof(float), 0, workSize[i] * sizeof(float), 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clEnqueueCopyBuffer (Input)\tDev %i\n", i); // Output buffer d_Result[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, ACCUM_N * sizeof(float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateBuffer (Output)\t\tDev %i\n", i); // Create kernel reduceKernel[i] = clCreateKernel(cpProgram, "reduce", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateKernel\t\t\tDev %i\n", i); // Set the args values and check for errors ciErrNum |= clSetKernelArg(reduceKernel[i], 0, sizeof(cl_mem), &d_Result[i]); ciErrNum |= clSetKernelArg(reduceKernel[i], 1, sizeof(cl_mem), &d_Data[i]); ciErrNum |= clSetKernelArg(reduceKernel[i], 2, sizeof(int), &workSize[i]); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clSetKernelArg\t\t\tDev %i\n\n", i); if (i + 1 < ciDeviceCount) workOffset[i + 1] = workOffset[i] + workSize[i]; // guard the last index to avoid writing past workOffset[] }
// Set # of work items in work group and total in 1 dimensional range size_t localWorkSize[] = {THREAD_N}; size_t globalWorkSize[] = {ACCUM_N}; // Start timer and launch reduction kernel on each GPU, with data split between them shrLog("Launching Kernels on GPU(s)...\n\n"); for(unsigned int i = 0; i < ciDeviceCount; i++) { ciErrNum = clEnqueueNDRangeKernel(commandQueue[i], reduceKernel[i], 1, 0, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]); oclCheckError(ciErrNum, CL_SUCCESS); } // Copy result from device to host for each device for(unsigned int i = 0; i < ciDeviceCount; i++) { ciErrNum = clEnqueueReadBuffer(commandQueue[i], d_Result[i], CL_FALSE, 0, ACCUM_N * sizeof(float), h_SumGPU + i * ACCUM_N, 0, NULL, &GPUDone[i]); oclCheckError(ciErrNum, CL_SUCCESS); } // Synchronize with the GPUs and do accumulated error check clWaitForEvents(ciDeviceCount, GPUDone); shrLog("clWaitForEvents complete...\n\n"); // Aggregate results for multiple GPUs and stop/log processing time sumGPU = 0; for(unsigned int i = 0; i < ciDeviceCount * ACCUM_N; i++) { sumGPU += h_SumGPU[i]; } // Print Execution Times for each GPU #ifdef GPU_PROFILING shrLog("Profiling Information for GPU Processing:\n\n"); for(unsigned int i = 0; i < ciDeviceCount; i++) { cdDevice = oclGetDev(cxGPUContext, deviceNr[i]); clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); shrLog("Device %i : %s\n", deviceNr[i], cDeviceName); shrLog(" Reduce Kernel : %.5f s\n", executionTime(GPUExecution[i])); shrLog(" Copy Device->Host : %.5f s\n\n\n", executionTime(GPUDone[i])); } #endif // Run the computation on the Host CPU and log processing time shrLog("Launching Host/CPU C++ Computation...\n\n"); sumCPU = 0; for(unsigned int i = 0; i < DATA_N; i++) { sumCPU += h_Data[i]; } // Check GPU result against CPU result dRelError = 100.0 * fabs(sumCPU - sumGPU) / fabs(sumCPU); shrLog("Comparing against Host/C++ computation...\n"); shrLog(" GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU); shrLog(" Relative Error (100.0 * Error / Golden) = %f \n\n", dRelError); // cleanup free(source); free(h_Data); for(unsigned int i = 0; i < ciDeviceCount; ++i ) { clReleaseKernel(reduceKernel[i]); clReleaseCommandQueue(commandQueue[i]); } clReleaseProgram(cpProgram); 
clReleaseContext(cxGPUContext); // finish shrQAFinishExit(argc, (const char **)argv, (dRelError < 1e-4) ? QA_PASSED : QA_FAILED); }
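// A plain-C sketch of the work partitioning and the relative-error check used above: an even split with the
// remainder assigned to the last device, and the same percent-error formula (example sizes are illustrative).
//*****************************************************************************
#include <math.h>
#include <stdio.h>

int main(void)
{
    const int DATA_N = 1048577; // deliberately not divisible by the device count
    const int devCt = 3;
    int workOffset[3], workSize[3];
    workOffset[0] = 0;
    for (int i = 0; i < devCt; i++) {
        int sizePerGPU = DATA_N / devCt;
        workSize[i] = (i != devCt - 1) ? sizePerGPU : (DATA_N - workOffset[i]); // last device takes the remainder
        if (i + 1 < devCt) workOffset[i + 1] = workOffset[i] + workSize[i];
        printf("dev %d: offset %d, size %d\n", i, workOffset[i], workSize[i]);
    }
    // Relative error check in the same form as the sample
    double sumCPU = 1.000001, sumGPU = 1.0;
    double relErr = 100.0 * fabs(sumCPU - sumGPU) / fabs(sumCPU);
    printf("relative error = %f%% -> %s\n", relErr, relErr < 1e-4 ? "PASSED" : "FAILED");
    return 0;
}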
void matrixMulGPU(cl_uint ciDeviceCount, cl_mem h_A, float* h_B_data, unsigned int mem_size_B, float* h_C ) { cl_mem d_A[MAX_GPU_COUNT]; cl_mem d_C[MAX_GPU_COUNT]; cl_mem d_B[MAX_GPU_COUNT]; cl_event GPUDone[MAX_GPU_COUNT]; cl_event GPUExecution[MAX_GPU_COUNT]; // Start the computation on each available GPU // Create buffers for each GPU // Each GPU will compute sizePerGPU rows of the result int sizePerGPU = uiHA / ciDeviceCount; int workOffset[MAX_GPU_COUNT]; int workSize[MAX_GPU_COUNT]; workOffset[0] = 0; for(unsigned int i=0; i < ciDeviceCount; ++i) { // Input buffer workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (uiHA - workOffset[i]); d_A[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float) * uiWA, NULL,NULL); // Copy only assigned rows from host to device clEnqueueCopyBuffer(commandQueue[i], h_A, d_A[i], workOffset[i] * sizeof(float) * uiWA, 0, workSize[i] * sizeof(float) * uiWA, 0, NULL, NULL); // create an OpenCL buffer on the device that will be initialized from the host memory on first use d_B[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B_data, NULL); // Output buffer d_C[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, workSize[i] * uiWC * sizeof(float), NULL,NULL); // set the args values clSetKernelArg(multiplicationKernel[i], 0, sizeof(cl_mem), (void *) &d_C[i]); clSetKernelArg(multiplicationKernel[i], 1, sizeof(cl_mem), (void *) &d_A[i]); clSetKernelArg(multiplicationKernel[i], 2, sizeof(cl_mem), (void *) &d_B[i]); clSetKernelArg(multiplicationKernel[i], 3, sizeof(float) * BLOCK_SIZE *BLOCK_SIZE, 0 ); clSetKernelArg(multiplicationKernel[i], 4, sizeof(float) * BLOCK_SIZE *BLOCK_SIZE, 0 ); clSetKernelArg(multiplicationKernel[i], 5, sizeof(cl_int), (void *) &uiWA); clSetKernelArg(multiplicationKernel[i], 6, sizeof(cl_int), (void *) &uiWB); if(i+1 < ciDeviceCount) workOffset[i + 1] = workOffset[i] + workSize[i]; } // Execute Multiplication on all GPUs in parallel size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE}; size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE, uiWC), shrRoundUp(BLOCK_SIZE, workSize[0])}; // Launch kernels on devices #ifdef GPU_PROFILING int nIter = 30; for (int j = -1; j < nIter; j++) { // Sync all queues to host and start timer first time through loop if(j == 0){ for(unsigned int i = 0; i < ciDeviceCount; i++) { clFinish(commandQueue[i]); } shrDeltaT(0); } #endif for(unsigned int i = 0; i < ciDeviceCount; i++) { // Multiplication - non-blocking execution: launch and push to device(s) globalWorkSize[1] = shrRoundUp(BLOCK_SIZE, workSize[i]); clEnqueueNDRangeKernel(commandQueue[i], multiplicationKernel[i], 2, 0, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]); clFlush(commandQueue[i]); } #ifdef GPU_PROFILING } #endif // sync all queues to host for(unsigned int i = 0; i < ciDeviceCount; i++) { clFinish(commandQueue[i]); } #ifdef GPU_PROFILING // stop and log timer double dSeconds = shrDeltaT(0)/(double)nIter; double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB; double gflops = 1.0e-9 * dNumOps/dSeconds; shrLogEx(LOGBOTH | MASTER, 0, "oclMatrixMul, Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Workgroup = %u\n", gflops, dSeconds, dNumOps, ciDeviceCount, localWorkSize[0] * localWorkSize[1]); // Print kernel timing per GPU shrLog("\n"); for(unsigned int i = 0; i < ciDeviceCount; i++) { shrLog(" Kernel execution time on GPU %d \t: %.5f s\n", i, executionTime(GPUExecution[i])); } shrLog("\n"); #endif for(unsigned int i = 0; i < 
ciDeviceCount; i++) { // Non-blocking copy of result from device to host clEnqueueReadBuffer(commandQueue[i], d_C[i], CL_FALSE, 0, uiWC * sizeof(float) * workSize[i], h_C + workOffset[i] * uiWC, 0, NULL, &GPUDone[i]); } // CPU sync with GPU clWaitForEvents(ciDeviceCount, GPUDone); // Release mem and event objects for(unsigned int i = 0; i < ciDeviceCount; i++) { clReleaseMemObject(d_A[i]); clReleaseMemObject(d_C[i]); clReleaseMemObject(d_B[i]); clReleaseEvent(GPUExecution[i]); clReleaseEvent(GPUDone[i]); } }
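// shrRoundUp is assumed to round n up to the next multiple of the work-group size; the sketch below shows how
// the per-device global NDRange sizes above follow from that (the sizes are illustrative).
//*****************************************************************************
#include <stdio.h>

// Assumed behavior of shrRoundUp: smallest multiple of group >= n
static size_t round_up(size_t group, size_t n)
{
    return ((n + group - 1) / group) * group;
}

int main(void)
{
    const size_t BLOCK = 16, uiWC = 400;
    const size_t rowsPerDev[2] = { 300, 301 }; // uneven row split across 2 GPUs
    for (int d = 0; d < 2; d++) {
        size_t global[2] = { round_up(BLOCK, uiWC), round_up(BLOCK, rowsPerDev[d]) };
        printf("dev %d: global = %zu x %zu (local %zu x %zu)\n", d, global[0], global[1], BLOCK, BLOCK);
    }
    return 0;
}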
//////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { cl_platform_id cpPlatform; //OpenCL platform cl_device_id cdDevice; //OpenCL device cl_context cxGPUContext; //OpenCL context cl_command_queue cqCommandQueue; //OpenCL command queue cl_mem d_Input, d_Output; //OpenCL memory buffer objects cl_int ciErrNum; float *h_Input, *h_OutputCPU, *h_OutputGPU; const uint imageW = 2048, imageH = 2048, stride = 2048; const int dir = DCT_FORWARD; shrQAStart(argc, argv); int use_gpu = 0; for(int i = 0; i < argc && argv; i++) { if(!argv[i]) continue; if(strstr(argv[i], "cpu")) use_gpu = 0; else if(strstr(argv[i], "gpu")) use_gpu = 1; } // set logfile name and start logs shrSetLogFileName ("oclDCT8x8.txt"); shrLog("%s Starting...\n\n", argv[0]); shrLog("Allocating and initializing host memory...\n"); h_Input = (float *)malloc(imageH * stride * sizeof(float)); h_OutputCPU = (float *)malloc(imageH * stride * sizeof(float)); h_OutputGPU = (float *)malloc(imageH * stride * sizeof(float)); srand(2009); for(uint i = 0; i < imageH; i++) for(uint j = 0; j < imageW; j++) h_Input[i * stride + j] = (float)rand() / (float)RAND_MAX; shrLog("Initializing OpenCL...\n"); //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); //Get a GPU device ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, 1, &cdDevice, NULL); oclCheckError(ciErrNum, CL_SUCCESS); //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); //Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Initializing OpenCL DCT 8x8...\n"); initDCT8x8(cxGPUContext, cqCommandQueue, (const char **)argv); shrLog("Creating OpenCL memory objects...\n"); d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageH * stride * sizeof(cl_float), h_Input, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageH * stride * sizeof(cl_float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Performing DCT8x8 of %u x %u image...\n\n", imageH, imageW); // A single iteration; also serves as the warmup iteration when profiling DCT8x8( cqCommandQueue, d_Output, d_Input, stride, imageH, imageW, dir ); #ifdef GPU_PROFILING const int numIterations = 16; cl_event startMark, endMark; ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); shrDeltaT(0); for(int iter = 0; iter < numIterations; iter++) DCT8x8( NULL, d_Output, d_Input, stride, imageH, imageW, dir ); ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); //Calculate performance metrics by wallclock time double gpuTime = shrDeltaT(0) / (double)numIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclDCT8x8, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0); //Get profiler time cl_ulong startTime = 0, endTime = 0; ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL); ciErrNum |= clGetEventProfilingInfo(endMark, 
CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations); #endif shrLog("Reading back OpenCL results...\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageH * stride * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Comparing against Host/C++ computation...\n"); DCT8x8CPU(h_OutputCPU, h_Input, stride, imageH, imageW, dir); double sum = 0, delta = 0; double L2norm; for(uint i = 0; i < imageH; i++) for(uint j = 0; j < imageW; j++){ sum += h_OutputCPU[i * stride + j] * h_OutputCPU[i * stride + j]; delta += (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]) * (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]); } L2norm = sqrt(delta / sum); shrLog("Relative L2 norm: %.3e\n\n", L2norm); shrLog("Shutting down...\n"); //Release kernels and program closeDCT8x8(); //Release other OpenCL objects ciErrNum = clReleaseMemObject(d_Output); ciErrNum |= clReleaseMemObject(d_Input); ciErrNum |= clReleaseCommandQueue(cqCommandQueue); ciErrNum |= clReleaseContext(cxGPUContext); oclCheckError(ciErrNum, CL_SUCCESS); //Release host buffers free(h_OutputGPU); free(h_OutputCPU); free(h_Input); //Finish shrQAFinishExit(argc, (const char **)argv, (L2norm < 1E-6) ? QA_PASSED : QA_FAILED); }
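// The pass/fail metric above, isolated: the relative L2 norm between the CPU and GPU outputs,
// sqrt(sum((test-ref)^2) / sum(ref^2)) (a minimal sketch with illustrative data).
//*****************************************************************************
#include <math.h>
#include <stdio.h>

static double rel_l2(const float* ref, const float* test, size_t n)
{
    double sum = 0.0, delta = 0.0;
    for (size_t i = 0; i < n; i++) {
        sum   += (double)ref[i] * ref[i];
        delta += ((double)test[i] - ref[i]) * ((double)test[i] - ref[i]);
    }
    return sqrt(delta / sum);
}

int main(void)
{
    float ref[4]  = { 1.0f, 2.0f, 3.0f, 4.0f };
    float test[4] = { 1.0f, 2.0f, 3.0f, 4.0000005f };
    printf("relative L2 norm: %.3e (PASS threshold in the sample: 1e-6)\n", rel_l2(ref, test, 4));
    return 0;
}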