// Init OpenCL //***************************************************************************** int initCL(int argc, const char** argv) { cl_platform_id cpPlatform; cl_uint uiDevCount; cl_device_id *cdDevices; //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Get the number of GPU devices available to the platform ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiDevCount); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create the device list cdDevices = new cl_device_id [uiDevCount]; ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiDevCount, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Get device requested on command line, if any unsigned int uiDeviceUsed = 0; unsigned int uiEndDev = uiDevCount - 1; if(shrGetCmdLineArgumentu(argc, argv, "device", &uiDeviceUsed)) { uiDeviceUsed = CLAMP(uiDeviceUsed, 0, uiEndDev); uiEndDev = uiDeviceUsed; } // Check if the requested device (or any of the devices if none requested) supports context sharing with OpenGL if(bGLinterop && !bQATest) { bool bSharingSupported = false; for(unsigned int i = uiDeviceUsed; (!bSharingSupported && (i <= uiEndDev)); ++i) { size_t extensionSize; ciErrNum = clGetDeviceInfo(cdDevices[i], CL_DEVICE_EXTENSIONS, 0, NULL, &extensionSize ); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); if(extensionSize > 0) { char* extensions = (char*)malloc(extensionSize); ciErrNum = clGetDeviceInfo(cdDevices[i], CL_DEVICE_EXTENSIONS, extensionSize, extensions, &extensionSize); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); std::string stdDevString(extensions); free(extensions); size_t szOldPos = 0; size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited while (szSpacePos != stdDevString.npos) { if( strcmp(GL_SHARING_EXTENSION, stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 ) { // Device supports context sharing with OpenGL uiDeviceUsed = i; bSharingSupported = true; break; } do { szOldPos = szSpacePos + 1; szSpacePos = stdDevString.find(' ', szOldPos); } while (szSpacePos == szOldPos); } } } shrLog("%s...\n\n", bSharingSupported ? "Using CL-GL Interop" : "No device found that supports CL/GL context sharing"); oclCheckErrorEX(bSharingSupported, true, pCleanup); // Define OS-specific context properties and create the OpenCL context #if defined (__APPLE__) || defined (MACOSX) CGLContextObj kCGLContext = CGLGetCurrentContext(); CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext); cl_context_properties props[] = { CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup, 0 }; cxGPUContext = clCreateContext(props, 0,0, NULL, NULL, &ciErrNum); #else #ifdef UNIX cl_context_properties props[] = { CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(), CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(), CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0 }; cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum); #else // Win32 cl_context_properties props[] = { CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0 }; cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum); #endif #endif } else { // No GL interop cl_context_properties props[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0}; cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum); bGLinterop = shrFALSE; } shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Log device used shrLog("Device # %u, ", uiDeviceUsed); oclPrintDevName(LOGBOTH, cdDevices[uiDeviceUsed]); shrLog("\n"); // create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiDeviceUsed], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Memory Setup if( bGLinterop ) { cl_pbos[0] = clCreateFromGLBuffer(cxGPUContext, CL_MEM_READ_ONLY, pbo_source, &ciErrNum); cl_pbos[1] = clCreateFromGLBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, pbo_dest, &ciErrNum); } else { cl_pbos[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, 4 * image_width * image_height, NULL, &ciErrNum); cl_pbos[1] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, 4 * image_width * image_height, NULL, &ciErrNum); } oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Program Setup size_t program_length; const char* source_path = shrFindFilePath(clSourcefile, argv[0]); char *source = oclLoadProgSource(source_path, "", &program_length); oclCheckErrorEX(source != NULL, shrTRUE, pCleanup); // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source, &program_length, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); free(source); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclPostProcessGL.ptx"); Cleanup(EXIT_FAILURE); } // create the kernel ckKernel = clCreateKernel(cpProgram, "postprocess", &ciErrNum); // set the args values ciErrNum |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &(cl_pbos[0])); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &(cl_pbos[1])); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(image_width), &image_width); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(image_width), &image_height); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); return 0; }
// main function //***************************************************************************** int main(int argc, const char **argv) { cl_context cxGPUContext; // OpenCL context cl_command_queue cqCommandQue[MAX_GPU_COUNT]; // OpenCL command que cl_device_id* cdDevices; // OpenCL device list cl_int err = 1; // Error code var shrSetLogFileName ("oclHiddenMarkovModel.txt"); shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]); shrLog(LOGBOTH, 0, "Create context\n"); cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &err); shrCheckErrorEX(err, CL_SUCCESS, NULL); shrLog(LOGBOTH, 0, "Get device info...\n"); int nDevice = 0; size_t nDeviceBytes; err |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes); cdDevices = (cl_device_id*)malloc(nDeviceBytes); err |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, nDeviceBytes, cdDevices, NULL); nDevice = (int)(nDeviceBytes/sizeof(cl_device_id)); shrLog(LOGBOTH, 0, "clCreateCommandQueue\n"); int id_device; if(shrGetCmdLineArgumenti(argc, argv, "device", &id_device)) // Set up command queue(s) for GPU specified on the command line { // get & log device index # and name cl_device_id cdDevice = cdDevices[id_device]; // create a command que cqCommandQue[0] = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &err); shrCheckErrorEX(err, CL_SUCCESS, NULL); oclPrintDevInfo(LOGBOTH, cdDevice); nDevice = 1; } else { // create command queues for all available devices for (int i = 0; i < nDevice; i++) { cqCommandQue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], 0, &err); shrCheckErrorEX(err, CL_SUCCESS, NULL); } for (int i = 0; i < nDevice; i++) oclPrintDevInfo(LOGBOTH, cdDevices[i]); } shrLog(LOGBOTH, 0, "\nUsing %d GPU(s)...\n\n", nDevice); int wgSize; if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &wgSize)) { wgSize = 256; } shrLog(LOGBOTH, 0, "Init Hidden Markov Model parameters\n"); int nState = 256*16; // number of states int nEmit = 128; // number of possible observations float *initProb = (float*)malloc(sizeof(float)*nState); // initial probability float *mtState = (float*)malloc(sizeof(float)*nState*nState); // state transition matrix float *mtEmit = (float*)malloc(sizeof(float)*nEmit*nState); // emission matrix initHMM(initProb, mtState, mtEmit, nState, nEmit); // define observational sequence int nObs = 500; // size of observational sequence int **obs = (int**)malloc(nDevice*sizeof(int*)); int **viterbiPathCPU = (int**)malloc(nDevice*sizeof(int*)); int **viterbiPathGPU = (int**)malloc(nDevice*sizeof(int*)); float *viterbiProbCPU = (float*)malloc(nDevice*sizeof(float)); for (int iDevice = 0; iDevice < nDevice; iDevice++) { obs[iDevice] = (int*)malloc(sizeof(int)*nObs); for (int i = 0; i < nObs; i++) obs[iDevice][i] = i % 15; viterbiPathCPU[iDevice] = (int*)malloc(sizeof(int)*nObs); viterbiPathGPU[iDevice] = (int*)malloc(sizeof(int)*nObs); } shrLog(LOGBOTH, 0, "# of states = %d\n# of possible observations = %d \nSize of observational sequence = %d\n\n", nState, nEmit, nObs); shrLog(LOGBOTH, 0, "Compute Viterbi path on GPU\n\n"); HMM **oHmm = (HMM**)malloc(nDevice*sizeof(HMM*)); for (int iDevice = 0; iDevice < nDevice; iDevice++) { oHmm[iDevice] = new HMM(cxGPUContext, cqCommandQue[iDevice], initProb, mtState, mtEmit, nState, nEmit, nObs, argv[0], wgSize); } cl_mem *vProb = (cl_mem*)malloc(sizeof(cl_mem)*nDevice); cl_mem *vPath = (cl_mem*)malloc(sizeof(cl_mem)*nDevice); for (int iDevice = 0; iDevice < nDevice; iDevice++) { vProb[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(float), NULL, &err); vPath[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)*nObs, NULL, &err); } #ifdef GPU_PROFILING for (int iDevice = 0; iDevice < nDevice; iDevice++) { clFinish(cqCommandQue[iDevice]);; } shrDeltaT(1); #endif size_t szWorkGroup; for (int iDevice = 0; iDevice < nDevice; iDevice++) { szWorkGroup = oHmm[iDevice]->ViterbiSearch(vProb[iDevice], vPath[iDevice], obs[iDevice]); } for (int iDevice = 0; iDevice < nDevice; iDevice++) { clFinish(cqCommandQue[iDevice]); } #ifdef GPU_PROFILING double dElapsedTime = shrDeltaT(1); shrLog(LOGBOTH | MASTER, 0, "oclHiddenMarkovModel, Throughput = %.4f, Time = %.5f, Size = %u, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * nDevice * nState * nObs)/dElapsedTime, dElapsedTime, (nDevice * nState * nObs), nDevice, szWorkGroup); #endif for (int iDevice = 0; iDevice < nDevice; iDevice++) { err = clEnqueueReadBuffer(cqCommandQue[iDevice], vPath[iDevice], CL_TRUE, 0, sizeof(int)*nObs, viterbiPathGPU[iDevice], 0, NULL, NULL); } shrLog(LOGBOTH, 0, "\nCompute Viterbi path on CPU\n"); for (int iDevice = 0; iDevice < nDevice; iDevice++) { err = ViterbiCPU(viterbiProbCPU[iDevice], viterbiPathCPU[iDevice], obs[iDevice], nObs, initProb, mtState, nState, mtEmit); } if (!err) { shrEXIT(argc, argv); } bool pass = true; for (int iDevice = 0; iDevice < nDevice; iDevice++) { for (int i = 0; i < nObs; i++) { if (viterbiPathCPU[iDevice][i] != viterbiPathGPU[iDevice][i]) { pass = false; break; } } } shrLog(LOGBOTH, 0, "\nTEST %s\n\n", (pass) ? "PASSED" : "FAILED !!!"); // NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity. shrLog(LOGBOTH, 0, "Release CPU buffers and OpenCL objects...\n"); free(initProb); free(mtState); free(mtEmit); for (int iDevice = 0; iDevice < nDevice; iDevice++) { free(obs[iDevice]); free(viterbiPathCPU[iDevice]); free(viterbiPathGPU[iDevice]); delete oHmm[iDevice]; clReleaseCommandQueue(cqCommandQue[iDevice]); } free(obs); free(viterbiPathCPU); free(viterbiPathGPU); free(viterbiProbCPU); free(cdDevices); free(oHmm); clReleaseContext(cxGPUContext); shrEXIT(argc, argv); }