void ofxClScheduler::printDeviceInfo(int device_nr) { // get and log device info if (device_nr != -1) device = oclGetDev(context, device_nr); else device = oclGetMaxFlopsDev(context); oclPrintDevInfo(0, device); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, const char** argv) { const char *my_name = "[oclAvcDisc]"; int bPassed = 1; char filename[500], cBuffer[1024], sProfileString[2048]; FILE *log_ofs=NULL; time_t g_the_time; /* OpenCL variables */ cl_int ciErrNum; cl_platform_id clSelectedPlatformID = NULL; cl_uint ciDeviceCount; cl_device_id *devices; sprintf(filename, "oclAvcDisc.txt"); if( (log_ofs=fopen(filename, "a"))== NULL ) { fprintf(stderr, "[oclAvcDisc] Error, could not open file %s\n", filename); exit(1); } g_the_time = time(NULL); _write_log(log_ofs, "%s oclDeviceQuery.exe Starting...\n", my_name); /* Get OpenCL platform ID for NVIDIA if avaiable, otherwise default */ _write_log(log_ofs, "%s OpenCL SW Info:\n", my_name); ciErrNum = oclGetPlatformID (&clSelectedPlatformID); oclCheckError(ciErrNum, CL_SUCCESS); /* Get OpenCL platform name and version */ ciErrNum = clGetPlatformInfo (clSelectedPlatformID, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL); if (ciErrNum == CL_SUCCESS) { _write_log(log_ofs, "%s CL_PLATFORM_NAME: \t%s\n", my_name, cBuffer); } else { _write_log(log_ofs, "%s Error %i in clGetPlatformInfo Call !!!\n\n", my_name, ciErrNum); bPassed = 0; } ciErrNum = clGetPlatformInfo (clSelectedPlatformID, CL_PLATFORM_VERSION, sizeof(cBuffer), cBuffer, NULL); if (ciErrNum == CL_SUCCESS) { _write_log(log_ofs, "%s CL_PLATFORM_VERSION: \t%s\n", my_name, cBuffer); } else { _write_log(" Error %i in clGetPlatformInfo Call !!!\n\n", ciErrNum); bPassed = 0; } // Get and log OpenCL device info _write_log(log_ofs, "%s OpenCL Device Info:\n\n", my_name); ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, 0, NULL, &ciDeviceCount); // check for 0 devices found or errors... if (ciDeviceCount == 0) { _write_log(log_ofs, "%s No devices found supporting OpenCL (return code %i)\n\n", my_name, ciErrNum); bPassed = false; } else if (ciErrNum != CL_SUCCESS) { _write_log(log_ofs, "%s Error %i in clGetDeviceIDs call !!!\n\n", my_name, ciErrNum); bPassed = false; } else { // Get and log the OpenCL device ID's char cTemp[2]; _write_log(log_ofs, "%s %u devices found supporting OpenCL:\n\n", my_name , ciDeviceCount); sprintf(cTemp, "%u", ciDeviceCount); if ((devices = (cl_device_id*)malloc(sizeof(cl_device_id) * ciDeviceCount)) == NULL) { _write_log(log_ofs, "%s Failed to allocate memory for devices !!!\n\n", my_name); bPassed = false; } ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, ciDeviceCount, devices, &ciDeviceCount); if (ciErrNum == CL_SUCCESS) { //Create a context for the devices cl_context cxGPUContext = clCreateContext(0, ciDeviceCount, devices, NULL, NULL, &ciErrNum); if (ciErrNum != CL_SUCCESS) { _write_log(log_ofs, "%s Error %i in clCreateContext call !!!\n\n", my_name, ciErrNum); bPassed = false; } else { // show info for each device in the context for(unsigned int i = 0; i < ciDeviceCount; ++i ) { _write_log(log_ofs, "%s ---------------------------------\n", my_name); clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL); _write_log(log_ofs, "%s Device %s\n", my_name, cBuffer); _write_log(log_ofs, "%s ---------------------------------\n", my_name); oclPrintDevInfo(LOGBOTH, devices[i]); } // Determine and show image format support cl_uint uiNumSupportedFormats = 0; // 2D clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, NULL, NULL, &uiNumSupportedFormats); cl_image_format *ImageFormats = NULL; ImageFormats = (cl_image_format*)malloc(uiNumSupportedFormats*sizeof(cl_image_format)); if(ImageFormats==NULL) { _write_log(log_ofs, "%s Error, could not alloc ImageFormats\n", my_name); exit(2); } clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, uiNumSupportedFormats, ImageFormats, NULL); _write_log(log_ofs, "%s ---------------------------------\n", my_name); _write_log(log_ofs, "%s 2D Image Formats Supported (%u)\n", my_name, uiNumSupportedFormats); _write_log(log_ofs, "%s ---------------------------------\n", my_name); _write_log(log_ofs, "%s %-6s%-16s%-22s\n\n", my_name, "#", "Channel Order", "Channel Type"); for(unsigned int i = 0; i < uiNumSupportedFormats; i++) { _write_log(log_ofs, "%s %-6u%-16s%-22s\n", my_name, (i + 1), oclImageFormatString(ImageFormats[i].image_channel_order), oclImageFormatString(ImageFormats[i].image_channel_data_type)); } _write_log(log_ofs, "%s\n", my_name); free(ImageFormats); ImageFormats = NULL; // 3D clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, NULL, NULL, &uiNumSupportedFormats); ImageFormats = (cl_image_format*)malloc(uiNumSupportedFormats*sizeof(cl_image_format)); if(ImageFormats==NULL) { _write_log(log_ofs, "%s Error, could not alloc ImageFormats\n", my_name); exit(3); } clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, uiNumSupportedFormats, ImageFormats, NULL); _write_log(log_ofs, "%s ---------------------------------\n", my_name); _write_log(log_ofs, "%s 3D Image Formats Supported (%u)\n", my_name, uiNumSupportedFormats); _write_log(log_ofs, "%s ---------------------------------\n", my_name); _write_log(log_ofs, "%s %-6s%-16s%-22s\n\n", my_name, "#", "Channel Order", "Channel Type"); for(unsigned int i = 0; i < uiNumSupportedFormats; i++) { _write_log(log_ofs, "%s %-6u%-16s%-22s\n", my_name, (i + 1), oclImageFormatString(ImageFormats[i].image_channel_order), oclImageFormatString(ImageFormats[i].image_channel_data_type)); } write_log(log_ofs, "%s\n", my_name); free(ImageFormats); ImageFormats=NULL; } } else { write_log(log_ofs, "%s Error %i in clGetDeviceIDs call !!!\n\n", my_name, ciErrNum); bPassed = 0; } } // finish _write_log(log_ofs, "%s %s\n\n", my_name, bPassed==1 ? "PASSED" : "FAILED"); fflush(log_ofs); fclose(log_ofs); return(0); }
int main(int argc, const char **argv) { cl_platform_id cpPlatform; // OpenCL platform cl_uint nDevice; // OpenCL device count cl_device_id* cdDevices; // OpenCL device list cl_context cxGPUContext; // OpenCL context cl_command_queue cqCommandQueue[MAX_GPU_COUNT]; // OpenCL command que cl_int ciErrNum; shrSetLogFileName ("oclRadixSort.txt"); shrLog("%s starting...\n\n", argv[0]); shrLog("clGetPlatformID...\n"); ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clGetDeviceIDs...\n"); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); cdDevices = (cl_device_id *)malloc(nDevice * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, nDevice, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clCreateContext...\n"); cxGPUContext = clCreateContext(0, nDevice, cdDevices, NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("Create command queue...\n\n"); int id_device; if(shrGetCmdLineArgumenti(argc, argv, "device", &id_device)) // Set up command queue(s) for GPU specified on the command line { // get & log device index # and name cl_device_id cdDevice = cdDevices[id_device]; // create a command que cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); oclPrintDevInfo(LOGBOTH, cdDevice); nDevice = 1; } else { // create command queues for all available devices for (cl_uint i = 0; i < nDevice; i++) { cqCommandQueue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); } for (cl_uint i = 0; i < nDevice; i++) oclPrintDevInfo(LOGBOTH, cdDevices[i]); } int ctaSize; if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &ctaSize)) { ctaSize = 128; } shrLog("Running Radix Sort on %d GPU(s) ...\n\n", nDevice); unsigned int numElements = 1048576;//128*128*128*2; // Alloc and init some data on the host, then alloc and init GPU buffer unsigned int **h_keys = (unsigned int**)malloc(nDevice * sizeof(unsigned int*)); unsigned int **h_keysSorted = (unsigned int**)malloc(nDevice * sizeof(unsigned int*)); cl_mem *d_keys = (cl_mem* )malloc(nDevice * sizeof(cl_mem)); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { h_keys[iDevice] = (unsigned int*)malloc(numElements * sizeof(unsigned int)); h_keysSorted[iDevice] = (unsigned int*)malloc(numElements * sizeof(unsigned int)); makeRandomUintVector(h_keys[iDevice], numElements, keybits); d_keys[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(unsigned int) * numElements, NULL, &ciErrNum); ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0, sizeof(unsigned int) * numElements, h_keys[iDevice], 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); } // instantiate RadixSort objects RadixSort **radixSort = (RadixSort**)malloc(nDevice * sizeof(RadixSort*)); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { radixSort[iDevice] = new RadixSort(cxGPUContext, cqCommandQueue[iDevice], numElements, argv[0], ctaSize, true); } #ifdef GPU_PROFILING int numIterations = 30; for (int i = -1; i < numIterations; i++) { if (i == 0) { for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clFinish(cqCommandQueue[iDevice]); } shrDeltaT(1); } #endif for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { radixSort[iDevice]->sort(d_keys[iDevice], 0, numElements, keybits); } #ifdef GPU_PROFILING } for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clFinish(cqCommandQueue[iDevice]); } double gpuTime = shrDeltaT(1)/(double)numIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclRadixSort, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements, NumDevsUsed = %d, Workgroup = %d\n", (1.0e-6 * (double)(nDevice * numElements)/gpuTime), gpuTime, nDevice * numElements, nDevice, ctaSize); #endif // copy sorted keys to CPU for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clEnqueueReadBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0, sizeof(unsigned int) * numElements, h_keysSorted[iDevice], 0, NULL, NULL); } // Check results bool passed = true; for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { passed &= verifySortUint(h_keysSorted[iDevice], NULL, h_keys[iDevice], numElements); } shrLog("\n%s\n\n", passed ? "PASSED" : "FAILED"); // cleanup allocs for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clReleaseMemObject(d_keys[iDevice]); free(h_keys[iDevice]); free(h_keysSorted[iDevice]); delete radixSort[iDevice]; } free(radixSort); free(h_keys); free(h_keysSorted); // remaining cleanup and exit free(cdDevices); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clReleaseCommandQueue(cqCommandQueue[iDevice]); } clReleaseContext(cxGPUContext); shrEXIT(argc, argv); }
// main function //***************************************************************************** int main(int argc, const char **argv) { cl_platform_id cpPlatform; // OpenCL platform cl_uint nDevice; // OpenCL device count cl_device_id* cdDevices; // OpenCL device list cl_context cxGPUContext; // OpenCL context cl_command_queue cqCommandQue[MAX_GPU_COUNT]; // OpenCL command que cl_int ciErrNum = 1; // Error code var shrQAStart(argc, (char **)argv); // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clGetPlatformID...\n"); //Get all the devices cl_uint uiNumDevices = 0; // Number of devices available cl_uint uiTargetDevice = 0; // Default Device to compute on cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU) shrLog("Get the Device info and select Device...\n"); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); // Get command line device options and config accordingly shrLog(" # of Devices Available = %u\n", uiNumDevices); if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE) { uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1)); } shrLog(" Using Device %u: ", uiTargetDevice); oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]); ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits); shrSetLogFileName ("oclHiddenMarkovModel.txt"); shrLog("%s Starting...\n\n", argv[0]); shrLog("Get platform...\n"); ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("Get devices...\n"); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); cdDevices = (cl_device_id *)malloc(nDevice * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, nDevice, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clCreateContext\n"); cxGPUContext = clCreateContext(0, nDevice, cdDevices, NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clCreateCommandQueue\n"); int id_device; if(shrGetCmdLineArgumenti(argc, argv, "device", &id_device)) // Set up command queue(s) for GPU specified on the command line { // create a command que cqCommandQue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[id_device], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); oclPrintDevInfo(LOGBOTH, cdDevices[id_device]); nDevice = 1; } else { // create command queues for all available devices for (cl_uint i = 0; i < nDevice; i++) { cqCommandQue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); } for (cl_uint i = 0; i < nDevice; i++) oclPrintDevInfo(LOGBOTH, cdDevices[i]); } shrLog("\nUsing %d GPU(s)...\n\n", nDevice); int wgSize; if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &wgSize)) { wgSize = 256; } shrLog("Init Hidden Markov Model parameters\n"); int nState = 256*16; // number of states, must be a multiple of 256 int nEmit = 128; // number of possible observations float *initProb = (float*)malloc(sizeof(float)*nState); // initial probability float *mtState = (float*)malloc(sizeof(float)*nState*nState); // state transition matrix float *mtEmit = (float*)malloc(sizeof(float)*nEmit*nState); // emission matrix initHMM(initProb, mtState, mtEmit, nState, nEmit); // define observational sequence int nObs = 100; // size of observational sequence int **obs = (int**)malloc(nDevice*sizeof(int*)); int **viterbiPathCPU = (int**)malloc(nDevice*sizeof(int*)); int **viterbiPathGPU = (int**)malloc(nDevice*sizeof(int*)); float *viterbiProbCPU = (float*)malloc(nDevice*sizeof(float)); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { obs[iDevice] = (int*)malloc(sizeof(int)*nObs); for (int i = 0; i < nObs; i++) obs[iDevice][i] = i % 15; viterbiPathCPU[iDevice] = (int*)malloc(sizeof(int)*nObs); viterbiPathGPU[iDevice] = (int*)malloc(sizeof(int)*nObs); } shrLog("# of states = %d\n# of possible observations = %d \nSize of observational sequence = %d\n\n", nState, nEmit, nObs); shrLog("Compute Viterbi path on GPU\n\n"); HMM **oHmm = (HMM**)malloc(nDevice*sizeof(HMM*)); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { oHmm[iDevice] = new HMM(cxGPUContext, cqCommandQue[iDevice], initProb, mtState, mtEmit, nState, nEmit, nObs, argv[0], wgSize); } cl_mem *vProb = (cl_mem*)malloc(sizeof(cl_mem)*nDevice); cl_mem *vPath = (cl_mem*)malloc(sizeof(cl_mem)*nDevice); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { vProb[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(float), NULL, &ciErrNum); vPath[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)*nObs, NULL, &ciErrNum); } #ifdef GPU_PROFILING for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clFinish(cqCommandQue[iDevice]);; } shrDeltaT(1); #endif size_t szWorkGroup; for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { szWorkGroup = oHmm[iDevice]->ViterbiSearch(vProb[iDevice], vPath[iDevice], obs[iDevice]); } for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clFinish(cqCommandQue[iDevice]); } #ifdef GPU_PROFILING double dElapsedTime = shrDeltaT(1); shrLogEx(LOGBOTH | MASTER, 0, "oclHiddenMarkovModel, Throughput = %.4f GB/s, Time = %.5f s, Size = %u items, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-9 * 2.0 * sizeof(float) * nDevice * nState * nState * (nObs-1))/dElapsedTime, dElapsedTime, (nDevice * nState * nObs), nDevice, szWorkGroup); #endif for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { ciErrNum = clEnqueueReadBuffer(cqCommandQue[iDevice], vPath[iDevice], CL_TRUE, 0, sizeof(int)*nObs, viterbiPathGPU[iDevice], 0, NULL, NULL); } shrLog("\nCompute Viterbi path on CPU\n"); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { ciErrNum = ViterbiCPU(viterbiProbCPU[iDevice], viterbiPathCPU[iDevice], obs[iDevice], nObs, initProb, mtState, nState, mtEmit); } if (!ciErrNum) { shrEXIT(argc, argv); } bool pass = true; for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { for (int i = 0; i < nObs; i++) { if (viterbiPathCPU[iDevice][i] != viterbiPathGPU[iDevice][i]) { pass = false; break; } } } // NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity. shrLog("Release CPU buffers and OpenCL objects...\n"); free(initProb); free(mtState); free(mtEmit); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { free(obs[iDevice]); free(viterbiPathCPU[iDevice]); free(viterbiPathGPU[iDevice]); delete oHmm[iDevice]; clReleaseCommandQueue(cqCommandQue[iDevice]); } free(obs); free(viterbiPathCPU); free(viterbiPathGPU); free(viterbiProbCPU); free(cdDevices); free(oHmm); clReleaseContext(cxGPUContext); // finish shrQAFinishExit(argc, (const char **)argv, pass ? QA_PASSED : QA_FAILED); shrEXIT(argc, argv); }
int InitOpenCLContext() { // start logs shrSetLogFileName ("oclVolumeRender.txt"); // get command line arg for quick test, if provided // process command line arguments // First initialize OpenGL context, so we can properly setup the OpenGL / OpenCL interop. // glewInit(); // GLboolean bGLEW = glewIsSupported("GL_VERSION_2_0 GL_ARB_pixel_buffer_object"); // oclCheckErrorEX(bGLEW, shrTRUE, pCleanup); g_glInterop = true; // Create OpenCL context, get device info, select device, select options for image/texture and CL-GL interop createCLContext(); // Print device info clGetDeviceInfo(cdDevices[uiDeviceUsed], CL_DEVICE_IMAGE_SUPPORT, sizeof(g_bImageSupport), &g_bImageSupport, NULL); //shrLog("%s...\n\n", g_bImageSupport ? "Using Image (Texture)" : "No Image (Texuture) Support"); // shrLog("Detailed Device info:\n\n"); oclPrintDevInfo(LOGBOTH, cdDevices[uiDeviceUsed]); // create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiDeviceUsed], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Program Setup size_t program_length; cPathAndName = shrFindFilePath("Transform.cl", "."); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); cSourceCL = oclLoadProgSource(cPathAndName, "", &program_length); oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup); // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &program_length, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // build the program std::string buildOpts = "-cl-single-precision_constant"; // buildOpts += g_bImageSupport ? " -DIMAGE_SUPPORT" : ""; // ciErrNum = clBuildProgram(cpProgram, 1, &cdDevices[uiDeviceUsed],"-cl-fast-relaxed-math", NULL, NULL); ciErrNum = clBuildProgram(cpProgram, 1, &cdDevices[uiDeviceUsed],NULL, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and return error shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclVolumeRender.ptx"); Cleanup(EXIT_FAILURE); } // create the kernel ScalseKernel = clCreateKernel(cpProgram, "d_render", &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); TransformKernel = clCreateKernel(cpProgram, "angle", &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); LongToShortKernel = clCreateKernel(cpProgram, "transfer", &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); return TRUE; }
// main function //***************************************************************************** int main(int argc, const char **argv) { cl_context cxGPUContext; // OpenCL context cl_command_queue cqCommandQue[MAX_GPU_COUNT]; // OpenCL command que cl_device_id* cdDevices; // OpenCL device list cl_int err = 1; // Error code var shrSetLogFileName ("oclHiddenMarkovModel.txt"); shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]); shrLog(LOGBOTH, 0, "Create context\n"); cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &err); shrCheckErrorEX(err, CL_SUCCESS, NULL); shrLog(LOGBOTH, 0, "Get device info...\n"); int nDevice = 0; size_t nDeviceBytes; err |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes); cdDevices = (cl_device_id*)malloc(nDeviceBytes); err |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, nDeviceBytes, cdDevices, NULL); nDevice = (int)(nDeviceBytes/sizeof(cl_device_id)); shrLog(LOGBOTH, 0, "clCreateCommandQueue\n"); int id_device; if(shrGetCmdLineArgumenti(argc, argv, "device", &id_device)) // Set up command queue(s) for GPU specified on the command line { // get & log device index # and name cl_device_id cdDevice = cdDevices[id_device]; // create a command que cqCommandQue[0] = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &err); shrCheckErrorEX(err, CL_SUCCESS, NULL); oclPrintDevInfo(LOGBOTH, cdDevice); nDevice = 1; } else { // create command queues for all available devices for (int i = 0; i < nDevice; i++) { cqCommandQue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], 0, &err); shrCheckErrorEX(err, CL_SUCCESS, NULL); } for (int i = 0; i < nDevice; i++) oclPrintDevInfo(LOGBOTH, cdDevices[i]); } shrLog(LOGBOTH, 0, "\nUsing %d GPU(s)...\n\n", nDevice); int wgSize; if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &wgSize)) { wgSize = 256; } shrLog(LOGBOTH, 0, "Init Hidden Markov Model parameters\n"); int nState = 256*16; // number of states int nEmit = 128; // number of possible observations float *initProb = (float*)malloc(sizeof(float)*nState); // initial probability float *mtState = (float*)malloc(sizeof(float)*nState*nState); // state transition matrix float *mtEmit = (float*)malloc(sizeof(float)*nEmit*nState); // emission matrix initHMM(initProb, mtState, mtEmit, nState, nEmit); // define observational sequence int nObs = 500; // size of observational sequence int **obs = (int**)malloc(nDevice*sizeof(int*)); int **viterbiPathCPU = (int**)malloc(nDevice*sizeof(int*)); int **viterbiPathGPU = (int**)malloc(nDevice*sizeof(int*)); float *viterbiProbCPU = (float*)malloc(nDevice*sizeof(float)); for (int iDevice = 0; iDevice < nDevice; iDevice++) { obs[iDevice] = (int*)malloc(sizeof(int)*nObs); for (int i = 0; i < nObs; i++) obs[iDevice][i] = i % 15; viterbiPathCPU[iDevice] = (int*)malloc(sizeof(int)*nObs); viterbiPathGPU[iDevice] = (int*)malloc(sizeof(int)*nObs); } shrLog(LOGBOTH, 0, "# of states = %d\n# of possible observations = %d \nSize of observational sequence = %d\n\n", nState, nEmit, nObs); shrLog(LOGBOTH, 0, "Compute Viterbi path on GPU\n\n"); HMM **oHmm = (HMM**)malloc(nDevice*sizeof(HMM*)); for (int iDevice = 0; iDevice < nDevice; iDevice++) { oHmm[iDevice] = new HMM(cxGPUContext, cqCommandQue[iDevice], initProb, mtState, mtEmit, nState, nEmit, nObs, argv[0], wgSize); } cl_mem *vProb = (cl_mem*)malloc(sizeof(cl_mem)*nDevice); cl_mem *vPath = (cl_mem*)malloc(sizeof(cl_mem)*nDevice); for (int iDevice = 0; iDevice < nDevice; iDevice++) { vProb[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(float), NULL, &err); vPath[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)*nObs, NULL, &err); } #ifdef GPU_PROFILING for (int iDevice = 0; iDevice < nDevice; iDevice++) { clFinish(cqCommandQue[iDevice]);; } shrDeltaT(1); #endif size_t szWorkGroup; for (int iDevice = 0; iDevice < nDevice; iDevice++) { szWorkGroup = oHmm[iDevice]->ViterbiSearch(vProb[iDevice], vPath[iDevice], obs[iDevice]); } for (int iDevice = 0; iDevice < nDevice; iDevice++) { clFinish(cqCommandQue[iDevice]); } #ifdef GPU_PROFILING double dElapsedTime = shrDeltaT(1); shrLog(LOGBOTH | MASTER, 0, "oclHiddenMarkovModel, Throughput = %.4f, Time = %.5f, Size = %u, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * nDevice * nState * nObs)/dElapsedTime, dElapsedTime, (nDevice * nState * nObs), nDevice, szWorkGroup); #endif for (int iDevice = 0; iDevice < nDevice; iDevice++) { err = clEnqueueReadBuffer(cqCommandQue[iDevice], vPath[iDevice], CL_TRUE, 0, sizeof(int)*nObs, viterbiPathGPU[iDevice], 0, NULL, NULL); } shrLog(LOGBOTH, 0, "\nCompute Viterbi path on CPU\n"); for (int iDevice = 0; iDevice < nDevice; iDevice++) { err = ViterbiCPU(viterbiProbCPU[iDevice], viterbiPathCPU[iDevice], obs[iDevice], nObs, initProb, mtState, nState, mtEmit); } if (!err) { shrEXIT(argc, argv); } bool pass = true; for (int iDevice = 0; iDevice < nDevice; iDevice++) { for (int i = 0; i < nObs; i++) { if (viterbiPathCPU[iDevice][i] != viterbiPathGPU[iDevice][i]) { pass = false; break; } } } shrLog(LOGBOTH, 0, "\nTEST %s\n\n", (pass) ? "PASSED" : "FAILED !!!"); // NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity. shrLog(LOGBOTH, 0, "Release CPU buffers and OpenCL objects...\n"); free(initProb); free(mtState); free(mtEmit); for (int iDevice = 0; iDevice < nDevice; iDevice++) { free(obs[iDevice]); free(viterbiPathCPU[iDevice]); free(viterbiPathGPU[iDevice]); delete oHmm[iDevice]; clReleaseCommandQueue(cqCommandQue[iDevice]); } free(obs); free(viterbiPathCPU); free(viterbiPathGPU); free(viterbiProbCPU); free(cdDevices); free(oHmm); clReleaseContext(cxGPUContext); shrEXIT(argc, argv); }
// Main function // ********************************************************************* int main(const int argc, const char** argv) { // start logs shrSetLogFileName ("oclDXTCompression.txt"); shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]); cl_context cxGPUContext; cl_command_queue cqCommandQueue; cl_program cpProgram; cl_kernel ckKernel; cl_mem cmMemObjs[3]; size_t szGlobalWorkSize[1]; size_t szLocalWorkSize[1]; cl_int ciErrNum; // Get the path of the filename char *filename; if (shrGetCmdLineArgumentstr(argc, argv, "image", &filename)) { image_filename = filename; } // load image const char* image_path = shrFindFilePath(image_filename, argv[0]); shrCheckError(image_path != NULL, shrTRUE); shrLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); shrCheckError(h_img != NULL, shrTRUE); shrLog(LOGBOTH, 0, "Loaded '%s', %d x %d pixels\n", image_path, width, height); // Convert linear image to block linear. uint * block_image = (uint *) malloc(width * height * 4); // Convert linear image to block linear. for(uint by = 0; by < height/4; by++) { for(uint bx = 0; bx < width/4; bx++) { for (int i = 0; i < 16; i++) { const int x = i & 3; const int y = i / 4; block_image[(by * width/4 + bx) * 16 + i] = ((uint *)h_img)[(by * 4 + y) * 4 * (width/4) + bx * 4 + x]; } } } // create the OpenCL context on a GPU device cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // get and log device cl_device_id device; if( shrCheckCmdLineFlag(argc, argv, "device") ) { int device_nr = 0; shrGetCmdLineArgumenti(argc, argv, "device", &device_nr); device = oclGetDev(cxGPUContext, device_nr); } else { device = oclGetMaxFlopsDev(cxGPUContext); } oclPrintDevInfo(LOGBOTH, device); // create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // Memory Setup // Compute permutations. cl_uint permutations[1024]; computePermutations(permutations); // Upload permutations. cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * 1024, permutations, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // Image cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY , sizeof(cl_uint) * width * height, NULL, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // Result const uint compressedSize = (width / 4) * (height / 4) * 8; cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, compressedSize, NULL , &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); unsigned int * h_result = (uint *)malloc(compressedSize); // Program Setup size_t program_length; const char* source_path = shrFindFilePath("DXTCompression.cl", argv[0]); shrCheckError(source_path != NULL, shrTRUE); char *source = oclLoadProgSource(source_path, "", &program_length); shrCheckError(source != NULL, shrTRUE); // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **) &source, &program_length, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-mad-enable", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLog(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDXTCompression.ptx"); shrCheckError(ciErrNum, CL_SUCCESS); } // create the kernel ckKernel = clCreateKernel(cpProgram, "compress", &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // set the args values ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &cmMemObjs[0]); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &cmMemObjs[1]); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void *) &cmMemObjs[2]); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(float) * 4 * 16, NULL); ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(float) * 4 * 16, NULL); ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(int) * 64, NULL); ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(float) * 16 * 6, NULL); ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(unsigned int) * 160, NULL); ciErrNum |= clSetKernelArg(ckKernel, 8, sizeof(int) * 16, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog(LOGBOTH, 0, "Running DXT Compression on %u x %u image...\n\n", width, height); // Upload the image clEnqueueWriteBuffer(cqCommandQueue, cmMemObjs[1], CL_FALSE, 0, sizeof(cl_uint) * width * height, block_image, 0,0,0); // set work-item dimensions szGlobalWorkSize[0] = width * height * (NUM_THREADS/16); szLocalWorkSize[0]= NUM_THREADS; #ifdef GPU_PROFILING int numIterations = 100; for (int i = -1; i < numIterations; ++i) { if (i == 0) { // start timing only after the first warmup iteration clFinish(cqCommandQueue); // flush command queue shrDeltaT(0); // start timer } #endif // execute kernel ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); shrCheckError(ciErrNum, CL_SUCCESS); #ifdef GPU_PROFILING } clFinish(cqCommandQueue); double dAvgTime = shrDeltaT(0) / (double)numIterations; shrLog(LOGBOTH | MASTER, 0, "oclDXTCompression, Throughput = %.4f, Time = %.5f, Size = %u, NumDevsUsed = %i\n", (1.0e-6 * (double)(width * height)/ dAvgTime), dAvgTime, (width * height), 1); #endif // blocking read output ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmMemObjs[2], CL_TRUE, 0, compressedSize, h_result, 0, NULL, NULL); shrCheckError(ciErrNum, CL_SUCCESS); // Write DDS file. FILE* fp = NULL; char output_filename[1024]; #ifdef WIN32 strcpy_s(output_filename, 1024, image_path); strcpy_s(output_filename + strlen(image_path) - 3, 1024 - strlen(image_path) + 3, "dds"); fopen_s(&fp, output_filename, "wb"); #else strcpy(output_filename, image_path); strcpy(output_filename + strlen(image_path) - 3, "dds"); fp = fopen(output_filename, "wb"); #endif shrCheckError(fp != NULL, shrTRUE); DDSHeader header; header.fourcc = FOURCC_DDS; header.size = 124; header.flags = (DDSD_WIDTH|DDSD_HEIGHT|DDSD_CAPS|DDSD_PIXELFORMAT|DDSD_LINEARSIZE); header.height = height; header.width = width; header.pitch = compressedSize; header.depth = 0; header.mipmapcount = 0; memset(header.reserved, 0, sizeof(header.reserved)); header.pf.size = 32; header.pf.flags = DDPF_FOURCC; header.pf.fourcc = FOURCC_DXT1; header.pf.bitcount = 0; header.pf.rmask = 0; header.pf.gmask = 0; header.pf.bmask = 0; header.pf.amask = 0; header.caps.caps1 = DDSCAPS_TEXTURE; header.caps.caps2 = 0; header.caps.caps3 = 0; header.caps.caps4 = 0; header.notused = 0; fwrite(&header, sizeof(DDSHeader), 1, fp); fwrite(h_result, compressedSize, 1, fp); fclose(fp); // Make sure the generated image matches the reference image (regression check) shrLog(LOGBOTH, 0, "\nComparing against Host/C++ computation...\n"); const char* reference_image_path = shrFindFilePath(refimage_filename, argv[0]); shrCheckError(reference_image_path != NULL, shrTRUE); // read in the reference image from file #ifdef WIN32 fopen_s(&fp, reference_image_path, "rb"); #else fp = fopen(reference_image_path, "rb"); #endif shrCheckError(fp != NULL, shrTRUE); fseek(fp, sizeof(DDSHeader), SEEK_SET); uint referenceSize = (width / 4) * (height / 4) * 8; uint * reference = (uint *)malloc(referenceSize); fread(reference, referenceSize, 1, fp); fclose(fp); // compare the reference image data to the sample/generated image float rms = 0; for (uint y = 0; y < height; y += 4) { for (uint x = 0; x < width; x += 4) { // binary comparison of data uint referenceBlockIdx = ((y/4) * (width/4) + (x/4)); uint resultBlockIdx = ((y/4) * (width/4) + (x/4)); int cmp = compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); // log deviations, if any if (cmp != 0.0f) { compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); shrLog(LOGBOTH, 0, "Deviation at (%d, %d):\t%f rms\n", x/4, y/4, float(cmp)/16/3); } rms += cmp; } } rms /= width * height * 3; shrLog(LOGBOTH, 0, "RMS(reference, result) = %f\n\n", rms); shrLog(LOGBOTH, 0, "TEST %s\n\n", (rms <= ERROR_THRESHOLD) ? "PASSED" : "FAILED !!!"); // Free OpenCL resources oclDeleteMemObjs(cmMemObjs, 3); clReleaseKernel(ckKernel); clReleaseProgram(cpProgram); clReleaseCommandQueue(cqCommandQueue); clReleaseContext(cxGPUContext); // Free host memory free(source); free(h_img); // finish shrEXIT(argc, argv); }