// Main program //***************************************************************************** int main(int argc, char** argv) { pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); // Start logs cExecutableName = argv[0]; shrSetLogFileName ("oclSobelFilter.txt"); shrLog("%s Starting (Using %s)...\n\n", argv[0], clSourcefile); // Get command line args for quick test or QA test, if provided bNoPrompt = (bool)shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); bQATest = (bool)shrCheckCmdLineFlag(argc, (const char**)argv, "qatest"); // Menu items if (!(bQATest)) { ShowMenuItems(); } // Find the path from the exe to the image file cPathAndName = shrFindFilePath(cImageFile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); shrLog("Image File\t = %s\nImage Dimensions = %u w x %u h x %u bpp\n\n", cPathAndName, uiImageWidth, uiImageHeight, sizeof(unsigned int)<<3); // Initialize OpenGL items (if not No-GL QA test) shrLog("%sInitGL...\n\n", bQATest ? "Skipping " : "Calling "); if (!(bQATest)) { InitGL(&argc, argv); } //Get the NVIDIA platform if available, otherwise use default char cBuffer[1024]; bool bNV = false; shrLog("Get Platform ID... "); ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("%s\n\n", cBuffer); bNV = (strstr(cBuffer, "NVIDIA") != NULL); //Get the devices shrLog("Get Device Info...\n"); cl_uint uiNumAllDevs = 0; GpuDevMngr = new DeviceManager(cpPlatform, &uiNumAllDevs, pCleanup); // Get selected device if specified, otherwise examine avaiable ones and choose by perf cl_int iSelectedDevice = 0; if((shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &iSelectedDevice)) || (uiNumAllDevs == 1)) { // Use 1 selected device GpuDevMngr->uiUsefulDevCt = 1; iSelectedDevice = CLAMP((cl_uint)iSelectedDevice, 0, (uiNumAllDevs - 1)); GpuDevMngr->uiUsefulDevs[0] = iSelectedDevice; GpuDevMngr->fLoadProportions[0] = 1.0f; shrLog(" Using 1 Selected Device for Sobel Filter Computation...\n"); } else { // Use available useful devices and Compute the device load proportions ciErrNum = GpuDevMngr->GetDevLoadProportions(bNV); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); if (GpuDevMngr->uiUsefulDevCt == 1) { iSelectedDevice = GpuDevMngr->uiUsefulDevs[0]; } shrLog(" Using %u Device(s) for Sobel Filter Computation\n", GpuDevMngr->uiUsefulDevCt); } //Create the context shrLog("\nclCreateContext...\n\n"); cxGPUContext = clCreateContext(0, uiNumAllDevs, GpuDevMngr->cdDevices, NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Allocate per-device OpenCL objects for useful devices cqCommandQueue = new cl_command_queue[GpuDevMngr->uiUsefulDevCt]; ckSobel = new cl_kernel[GpuDevMngr->uiUsefulDevCt]; cmDevBufIn = new cl_mem[GpuDevMngr->uiUsefulDevCt]; cmDevBufOut = new cl_mem[GpuDevMngr->uiUsefulDevCt]; szAllocDevBytes = new size_t[GpuDevMngr->uiUsefulDevCt]; uiInHostPixOffsets = new cl_uint[GpuDevMngr->uiUsefulDevCt]; uiOutHostPixOffsets = new cl_uint[GpuDevMngr->uiUsefulDevCt]; uiDevImageHeight = new cl_uint[GpuDevMngr->uiUsefulDevCt]; // Create command queue(s) for device(s) shrLog("clCreateCommandQueue...\n"); for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++) { cqCommandQueue[i] = clCreateCommandQueue(cxGPUContext, GpuDevMngr->cdDevices[GpuDevMngr->uiUsefulDevs[i]], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog(" CommandQueue %u, Device %u, Device Load Proportion = %.2f, ", i, GpuDevMngr->uiUsefulDevs[i], GpuDevMngr->fLoadProportions[i]); oclPrintDevName(LOGBOTH, GpuDevMngr->cdDevices[GpuDevMngr->uiUsefulDevs[i]]); shrLog("\n"); } // Allocate pinned input and output host image buffers: mem copy operations to/from pinned memory is much faster than paged memory szBuffBytes = uiImageWidth * uiImageHeight * sizeof (unsigned int); cmPinnedBufIn = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmPinnedBufOut = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("\nclCreateBuffer (Input and Output Pinned Host buffers)...\n"); // Get mapped pointers for writing to pinned input and output host image pointers uiInput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufIn, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); uiOutput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufOut, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clEnqueueMapBuffer (Pointer to Input and Output pinned host buffers)...\n"); // Load image data from file to pinned input host buffer ciErrNum = shrLoadPPM4ub(cPathAndName, (unsigned char **)&uiInput, &uiImageWidth, &uiImageHeight); oclCheckErrorEX(ciErrNum, shrTRUE, pCleanup); shrLog("Load Input Image to Input pinned host buffer...\n"); // Read the kernel in from file free(cPathAndName); cPathAndName = shrFindFilePath(clSourcefile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); cSourceCL = oclLoadProgSource(cPathAndName, "// My comment\n", &szKernelLength); oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup); shrLog("Load OpenCL Prog Source from File...\n"); // Create the program object cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clCreateProgramWithSource...\n"); // Build the program with 'mad' Optimization option #ifdef MAC char *flags = "-cl-fast-relaxed-math -DMAC"; #else char *flags = "-cl-fast-relaxed-math"; #endif ciErrNum = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // On error: write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSobelFilter.ptx"); Cleanup(EXIT_FAILURE); } shrLog("clBuildProgram...\n\n"); // Determine, the size/shape of the image portions for each dev and create the device buffers unsigned uiSumHeight = 0; for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++) { // Create kernel instance ckSobel[i] = clCreateKernel(cpProgram, "ckSobel", &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clCreateKernel (ckSobel), Device %u...\n", i); // Allocations and offsets for the portion of the image worked on by each device if (GpuDevMngr->uiUsefulDevCt == 1) { // One device processes the whole image with no offset uiDevImageHeight[i] = uiImageHeight; uiInHostPixOffsets[i] = 0; uiOutHostPixOffsets[i] = 0; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); } else if (i == 0) { // Multiple devices, top stripe zone including topmost row of image: // Over-allocate on device by 1 row // Set offset and size to copy extra 1 padding row H2D (below bottom of stripe) // Won't return the last row (dark/garbage row) D2H uiInHostPixOffsets[i] = 0; uiOutHostPixOffsets[i] = 0; uiDevImageHeight[i] = (cl_uint)(GpuDevMngr->fLoadProportions[GpuDevMngr->uiUsefulDevs[i]] * (float)uiImageHeight); // height is proportional to dev perf uiSumHeight += uiDevImageHeight[i]; uiDevImageHeight[i] += 1; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); } else if (i < (GpuDevMngr->uiUsefulDevCt - 1)) { // Multiple devices, middle stripe zone: // Over-allocate on device by 2 rows // Set offset and size to copy extra 2 padding rows H2D (above top and below bottom of stripe) // Won't return the first and last rows (dark/garbage rows) D2H uiInHostPixOffsets[i] = (uiSumHeight - 1) * uiImageWidth; uiOutHostPixOffsets[i] = uiInHostPixOffsets[i] + uiImageWidth; uiDevImageHeight[i] = (cl_uint)(GpuDevMngr->fLoadProportions[GpuDevMngr->uiUsefulDevs[i]] * (float)uiImageHeight); // height is proportional to dev perf uiSumHeight += uiDevImageHeight[i]; uiDevImageHeight[i] += 2; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); } else { // Multiple devices, last boundary tile: // Over-allocate on device by 1 row // Set offset and size to copy extra 1 padding row H2D (above top of stripe) // Won't return the first row (dark/garbage rows D2H uiInHostPixOffsets[i] = (uiSumHeight - 1) * uiImageWidth; uiOutHostPixOffsets[i] = uiInHostPixOffsets[i] + uiImageWidth; uiDevImageHeight[i] = uiImageHeight - uiSumHeight; // "leftover" rows uiSumHeight += uiDevImageHeight[i]; uiDevImageHeight[i] += 1; szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint); } shrLog("Image Height (rows) for Device %u = %u...\n", i, uiDevImageHeight[i]); // Create the device buffers in GMEM on each device cmDevBufIn[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szAllocDevBytes[i], NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmDevBufOut[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szAllocDevBytes[i], NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clCreateBuffer (Input and Output GMEM buffers, Device %u)...\n", i); // Set the common argument values for the Median kernel instance for each device int iLocalPixPitch = iBlockDimX + 2; ciErrNum = clSetKernelArg(ckSobel[i], 0, sizeof(cl_mem), (void*)&cmDevBufIn[i]); ciErrNum |= clSetKernelArg(ckSobel[i], 1, sizeof(cl_mem), (void*)&cmDevBufOut[i]); ciErrNum |= clSetKernelArg(ckSobel[i], 2, (iLocalPixPitch * (iBlockDimY + 2) * sizeof(cl_uchar4)), NULL); ciErrNum |= clSetKernelArg(ckSobel[i], 3, sizeof(cl_int), (void*)&iLocalPixPitch); ciErrNum |= clSetKernelArg(ckSobel[i], 4, sizeof(cl_uint), (void*)&uiImageWidth); ciErrNum |= clSetKernelArg(ckSobel[i], 6, sizeof(cl_float), (void*)&fThresh); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clSetKernelArg (0-4), Device %u...\n\n", i); } // Set common global and local work sizes for Median kernel szLocalWorkSize[0] = iBlockDimX; szLocalWorkSize[1] = iBlockDimY; szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], uiImageWidth); // init running timers shrDeltaT(0); // timer 0 used for computation timing shrDeltaT(1); // timer 1 used for fps computation // Start main GLUT rendering loop for processing and rendering, // or otherwise run No-GL Q/A test sequence if (!(bQATest)) { glutMainLoop(); } else { TestNoGL(); } Cleanup(EXIT_SUCCESS); }
// Main program //***************************************************************************** int main(int argc, char** argv) { numParticles = NUM_PARTICLES; uint gridDim = GRID_SIZE; pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); // Start logs and timers cExecutableName = argv[0]; shrSetLogFileName ("oclParticles.txt"); shrLog("%s Starting...\n\n", argv[0]); // check command line flags and parameters if (argc > 1) { shrGetCmdLineArgumenti(argc, (const char**)argv, "n", (int*)&numParticles); shrGetCmdLineArgumenti(argc, (const char**)argv, "grid", (int*)&gridDim); bQATest = shrCheckCmdLineFlag(argc, (const char**)argv, "qatest"); bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); } // Set and log grid size and particle count, after checking optional command-line inputs gridSize.x = gridSize.y = gridSize.z = gridDim; shrLog(" grid: %d x %d x %d = %d cells\n", gridSize.x, gridSize.y, gridSize.z, gridSize.x * gridSize.y * gridSize.z); shrLog(" particles: %d\n\n", numParticles); // initialize GLUT and GLEW if(!bQATest) { InitGL(&argc, argv); } // initialize OpenCL startupOpenCL(argc,(const char**)argv); // init simulation parameters and objects initParticleSystem(numParticles, gridSize); initParams(); // Init timers shrDeltaT(0); // timer 0 is for processing time measurements shrDeltaT(1); // timer 1 is for fps measurement // Start main GLUT rendering loop for processing and rendering, // or otherwise run No-GL Q/A test sequence if(!bQATest) { glutMainLoop(); } else { TestNoGL(); } // Normally unused return path shrQAFinish(argc, (const char **)argv, QA_PASSED); Cleanup(EXIT_SUCCESS); }
// Main Program //***************************************************************************** int main(int argc, char** argv) { pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); // start logs cExecutableName = argv[0]; shrSetLogFileName ("oclPostProcessGL.txt"); shrLog("%s Starting...\n\n", argv[0]); // process command line arguments if (argc > 1) { bQATest = shrCheckCmdLineFlag(argc, (const char**)argv, "qatest"); bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); } shrLog(" Image Width = %d, Image Height = %d, Blur Radius = %d\n\n", image_width, image_height, blur_radius); // init GL if(!bQATest) { InitGL(&argc, argv); // create pbo createPBO(&pbo_source); createPBO(&pbo_dest); // create texture for blitting onto the screen createTexture(&tex_screen, image_width, image_height); bGLinterop = shrTRUE; } // init CL if( initCL(argc, (const char**)argv) != 0 ) { return -1; } // init fps timer shrDeltaT (1); // Create buffers and textures, // and then start main GLUT rendering loop for processing and rendering, // or otherwise run No-GL Q/A test sequence shrLog("\n%s...\n", bQATest ? "No-GL test sequence" : "Standard GL Loop"); printf("\n" "\tControls\n" "\t(right click mouse button for Menu)\n" "\t[ ] : Toggle Post-Processing (blur filter) ON/OFF\n" "\t[ p ] : Toggle Processing (between GPU or CPU)\n" "\t[ a ] : Toggle OpenGL Animation (rotation) ON/OFF\n" "\t[+/=] : Increase Blur Radius\n" "\t[-/_] : Decrease Blur Radius\n" "\t[Esc] - Quit\n\n" ); if(!bQATest) { glutMainLoop(); } else { TestNoGL(); } Cleanup(EXIT_SUCCESS); }