// Initialize GL //***************************************************************************** bool InitGL(int* argc, char **argv ) { // init GLUT and GLUT window glutInit(argc, argv); glutInitDisplayMode(GLUT_RGBA | GLUT_ALPHA | GLUT_DOUBLE | GLUT_DEPTH); glutInitWindowPosition (glutGet(GLUT_SCREEN_WIDTH)/2 - iGraphicsWinWidth/2, glutGet(GLUT_SCREEN_HEIGHT)/2 - iGraphicsWinHeight/2); glutInitWindowSize(iGraphicsWinWidth, iGraphicsWinHeight); iGLUTWindowHandle = glutCreateWindow("OpenCL/OpenGL post-processing"); #if !(defined (__APPLE__) || defined(MACOSX)) glutSetOption(GLUT_ACTION_ON_WINDOW_CLOSE, GLUT_ACTION_GLUTMAINLOOP_RETURNS); #endif // register GLUT callbacks glutDisplayFunc(DisplayGL); glutKeyboardFunc(KeyboardGL); glutReshapeFunc(Reshape); glutTimerFunc(REFRESH_DELAY, timerEvent, 0); // create GLUT menu iGLUTMenuHandle = glutCreateMenu(mainMenu); glutAddMenuEntry("Toggle Post-processing (Blur filter) ON/OFF <spacebar>", ' '); glutAddMenuEntry("Toggle Processor between GPU and CPU [p]", 'p'); glutAddMenuEntry("Toggle GL animation (rotation) ON/OFF [a]", 'a'); glutAddMenuEntry("Increment blur radius [+ or =]", '='); glutAddMenuEntry("Decrement blur radius [- or _]", '-'); glutAddMenuEntry("Quit <esc>", '\033'); glutAttachMenu(GLUT_RIGHT_BUTTON); // init GLEW glewInit(); GLboolean bGLEW = glewIsSupported("GL_VERSION_2_0 GL_ARB_pixel_buffer_object"); oclCheckErrorEX(bGLEW, shrTRUE, pCleanup); // default initialization glClearColor(0.5, 0.5, 0.5, 1.0); glDisable(GL_DEPTH_TEST); // viewport glViewport(0, 0, iGraphicsWinWidth, iGraphicsWinHeight); // projection glMatrixMode(GL_PROJECTION); glLoadIdentity(); gluPerspective(60.0, (GLfloat)iGraphicsWinWidth / (GLfloat) iGraphicsWinHeight, 0.1, 10.0); glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); glEnable(GL_LIGHT0); float red[] = { 1.0, 0.1, 0.1, 1.0 }; float white[] = { 1.0, 1.0, 1.0, 1.0 }; glMaterialfv(GL_FRONT_AND_BACK, GL_DIFFUSE, red); glMaterialfv(GL_FRONT_AND_BACK, GL_SPECULAR, white); glMaterialf(GL_FRONT_AND_BACK, GL_SHININESS, 60.0); return true; }
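// DisplayGL, KeyboardGL, Reshape and timerEvent are registered above but defined
// elsewhere in the sample. For reference, the usual GLUT idiom for the timer
// callback is sketched below (hypothetical name; GLUT timers are one-shot, so
// the callback must re-arm itself with REFRESH_DELAY):
void TimerEventSketch(int value)
{
    glutPostRedisplay();                               // request a redraw
    glutTimerFunc(REFRESH_DELAY, TimerEventSketch, 0); // re-register the one-shot timer
}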
//----------------------------------------------------------------------------- // Name: CreateKernelProgram() // Desc: Creates OpenCL program and kernel instances //----------------------------------------------------------------------------- HRESULT CreateKernelProgram( const char *exepath, const char *clName, const char *clPtx, const char *kernelEntryPoint, cl_program &cpProgram, cl_kernel &ckKernel ) { // Program Setup size_t program_length; const char* source_path = shrFindFilePath(clName, exepath); char *source = oclLoadProgSource(source_path, "", &program_length); oclCheckErrorEX(source != NULL, shrTRUE, pCleanup); // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source, &program_length, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); free(source); // build the program #ifdef USE_STAGING_BUFFER static char *opts = "-cl-fast-relaxed-math -DUSE_STAGING_BUFFER"; #else static char *opts = "-cl-fast-relaxed-math"; #endif ciErrNum = clBuildProgram(cpProgram, 0, NULL, opts, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), clPtx); Cleanup(EXIT_FAILURE); } // create the kernel ckKernel = clCreateKernel(cpProgram, kernelEntryPoint, &ciErrNum); if (!ckKernel) { Cleanup(EXIT_FAILURE); } // set the args values return ciErrNum ? E_FAIL : S_OK; }
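// oclLogBuildInfo and oclLogPtx above are SDK helpers. For reference, a minimal
// build-log dump built directly on the standard clGetProgramBuildInfo entry
// point might look like this (hypothetical helper, error checks omitted):
#include <stdlib.h>
void PrintBuildLogSketch(cl_program cpProg, cl_device_id cdDev)
{
    size_t logSize = 0;
    clGetProgramBuildInfo(cpProg, cdDev, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char* log = (char*)malloc(logSize + 1);
    clGetProgramBuildInfo(cpProg, cdDev, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
    log[logSize] = '\0';                 // the returned log is not always terminated
    shrLog("Build log:\n%s\n", log);
    free(log);
}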
//*****************************************************************************
void SelectDemo(int index)
{
    oclCheckErrorEX((index < numDemos), shrTRUE, pCleanup);

    activeParams = demoParams[index];
    camera_trans[0] = camera_trans_lag[0] = activeParams.m_x;
    camera_trans[1] = camera_trans_lag[1] = activeParams.m_y;
    camera_trans[2] = camera_trans_lag[2] = activeParams.m_z;
    ResetSim(nbody, numBodies, NBODY_CONFIG_SHELL, true);

    // Reset the demo timer
    shrDeltaT(DEMOTIME);
}
//-----------------------------------------------------------------------------
// Name: ReleaseTexturesFromOpenCL()
// Desc: Release Textures from OpenCL
//-----------------------------------------------------------------------------
void ReleaseTexturesFromOpenCL()
{
    cl_event event;
    cl_mem memToAcquire[6+1+1];
    memToAcquire[0] = g_texture_2d.clTexture;
    memToAcquire[1] = g_texture_vol.clTexture;
    memToAcquire[2] = g_texture_cube.clTexture[0];
    memToAcquire[3] = g_texture_cube.clTexture[1];
    memToAcquire[4] = g_texture_cube.clTexture[2];
    memToAcquire[5] = g_texture_cube.clTexture[3];
    memToAcquire[6] = g_texture_cube.clTexture[4];
    memToAcquire[7] = g_texture_cube.clTexture[5];

    // do the release
    ciErrNum = clEnqueueReleaseD3D9ObjectsNV(
        cqCommandQueue,
        6 + 1 + 1, // cube map + tex2d + volume texture
        memToAcquire,
        0,
        NULL,
        &event);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // make sure the event type is correct
    cl_uint eventType = 0;
    ciErrNum = clGetEventInfo(
        event,
        CL_EVENT_COMMAND_TYPE,
        sizeof(eventType),
        &eventType,
        NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    if(eventType != CL_COMMAND_RELEASE_D3D9_OBJECTS_NV)
    {
        shrLog("event type is not CL_COMMAND_RELEASE_D3D9_OBJECTS_NV !\n");
    }
    ciErrNum = clReleaseEvent(event);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
}
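// The release above pairs with an acquire that must be enqueued before any CL
// kernel touches these D3D9 objects. The sample's acquire routine isn't shown
// in this listing; a minimal sketch of the pairing (hypothetical helper, same
// object list; the event type, if checked, would be
// CL_COMMAND_ACQUIRE_D3D9_OBJECTS_NV):
void AcquireTexturesForOpenCLSketch(cl_mem* memObjects)
{
    cl_event event;
    ciErrNum = clEnqueueAcquireD3D9ObjectsNV(cqCommandQueue,
                                             6 + 1 + 1, // cube map + tex2d + volume texture
                                             memObjects,
                                             0, NULL, &event);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    ciErrNum = clReleaseEvent(event);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
}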
// Kernel function //***************************************************************************** int executeKernel(cl_int radius) { // set global and local work item dimensions szLocalWorkSize[0] = 16; szLocalWorkSize[1] = 16; szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], image_width); szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], image_height); // set the args values cl_int tilew = (cl_int)szLocalWorkSize[0]+(2*radius); ciErrNum = clSetKernelArg(ckKernel, 4, sizeof(tilew), &tilew); ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(radius), &radius); cl_float threshold = 0.8f; ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(threshold), &threshold); cl_float highlight = 4.0f; ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(highlight), &highlight); // Local memory ciErrNum |= clSetKernelArg(ckKernel, 8, (szLocalWorkSize[0]+(2*16))*(szLocalWorkSize[1]+(2*16))*sizeof(int), NULL); // launch computation kernel #ifdef GPU_PROFILING int nIter = 30; for( int i=-1; i< nIter; ++i) { if( i ==0 ) shrDeltaT(0); #endif ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); #ifdef GPU_PROFILING } clFinish(cqCommandQueue); double dSeconds = shrDeltaT(0)/(double)nIter; double dNumTexels = (double)image_width * (double)image_height; double mtexps = 1.0e-6 * dNumTexels/dSeconds; shrLogEx(LOGBOTH | MASTER, 0, "oclPostprocessGL, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %.0f Texels, NumDevsUsed = %u, Workgroup = %u\n", mtexps, dSeconds, dNumTexels, uiNumDevsUsed, szLocalWorkSize[0] * szLocalWorkSize[1]); #endif oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); return 0; }
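// Two details above are easy to miss: (1) in the GPU_PROFILING loop the i = -1
// iteration is an untimed warmup (the timer restarts at i == 0), and (2) the
// local-memory argument is sized with a fixed 16-pixel apron (apparently the
// maximum blur radius) so one allocation covers every radius, while the tilew
// argument uses the current radius. shrRoundUp, used to pad the global work
// size, behaves like this sketch (hypothetical name, same semantics assumed):
size_t RoundUpSketch(int groupSize, int globalSize)
{
    int r = globalSize % groupSize;
    return (size_t)((r == 0) ? globalSize : globalSize + groupSize - r);
}
// e.g. RoundUpSketch(16, 100) == 112, so a 100-pixel-wide image is covered by
// seven 16-wide workgroups; the kernel must bounds-check the padded items.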
int main(int argc, char **argv) { bool bTestResult = true; shrQAStart(argc, argv); // Start the log shrSetLogFileName(shrLogFile); shrLog("%s Starting...\n\n", argv[0]); // Check help flag if (shrCheckCmdLineFlag(argc, (const char **)argv, "help")) { shrLog("Displaying help on console\n"); showHelp(argc, (const char **)argv); } else { // Execute bTestResult = runTest(argc, (const char **)argv); oclCheckErrorEX(bTestResult, true, NULL); } // Finish shrQAFinishExit( argc, (const char **)argv, (bTestResult ? QA_PASSED : QA_FAILED) ); }
//----------------------------------------------------------------------------- // Name: InitTextures() // Desc: Initializes Direct3D Textures (allocation and initialization) //----------------------------------------------------------------------------- HRESULT InitTextures() { // // create the D3D resources we'll be using // // 2D texture g_texture_2d.width = 256; g_texture_2d.height = 256; g_texture_2d.pitch = 256; if (FAILED(g_pD3DDevice->CreateTexture(g_texture_2d.width, g_texture_2d.height, 1, D3DUSAGE_DYNAMIC, D3DFMT_A8R8G8B8/*D3DFMT_A32B32G32R32F*/, D3DPOOL_DEFAULT, &g_texture_2d.pTexture, NULL) )) { return E_FAIL; } D3DLOCKED_RECT r; HRESULT hr = g_texture_2d.pTexture->LockRect(0, &r, NULL, 0); unsigned long *data = (unsigned long *)r.pBits; for(int i=0; i< 256*256; i++) { *data = 0xFF00FFFF; data++; } g_texture_2d.pTexture->UnlockRect(0); // Create the OpenCL part g_texture_2d.clTexture = clCreateFromD3D9TextureNV( cxGPUContext, 0, g_texture_2d.pTexture, 0,//miplevel &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // // Optional Check... // IDirect3DResource9* clResource = NULL; ciErrNum = clGetMemObjectInfo( g_texture_2d.clTexture, CL_MEM_D3D9_RESOURCE_NV, sizeof(clResource), &clResource, NULL); assert(clResource == g_texture_2d.pTexture); #ifdef USE_STAGING_BUFFER // Memory Setup : allocate 4 bytes (RGBA) pixels // Create the intermediate buffers in which OpenCL will do the rendering // then we will blit the result back to the texture that we will have mapped to OpenCL area g_texture_2d.clMem = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, 4 * g_texture_2d.width * g_texture_2d.height, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); #endif // cube texture g_texture_cube.size = 64; g_texture_cube.pitch = 64; if (FAILED(g_pD3DDevice->CreateCubeTexture(g_texture_cube.size, 1, D3DUSAGE_DYNAMIC, D3DFMT_A8R8G8B8, D3DPOOL_DEFAULT, &g_texture_cube.pTexture, NULL) )) { return E_FAIL; } // Create the OpenCL part for(int i=0; i<6; i++) { g_texture_cube.clTexture[i] = clCreateFromD3D9CubeTextureNV( cxGPUContext, 0, g_texture_cube.pTexture, (D3DCUBEMAP_FACES)i, // face 0, // miplevel &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); #ifdef USE_STAGING_BUFFER g_texture_cube.clMem[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, 4 * g_texture_cube.size * g_texture_cube.size, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); #endif } // 3D texture g_texture_vol.width = 16; g_texture_vol.height = 16; g_texture_vol.depth = 8; g_texture_vol.pitch = 16; g_texture_vol.pitchslice = g_texture_vol.pitch * g_texture_vol.height; if (FAILED(g_pD3DDevice->CreateVolumeTexture( g_texture_vol.width, g_texture_vol.height, g_texture_vol.depth, 1, D3DUSAGE_DYNAMIC, D3DFMT_A8R8G8B8, D3DPOOL_DEFAULT, &g_texture_vol.pTexture, NULL) )) { return E_FAIL; } g_texture_vol.clTexture = clCreateFromD3D9VolumeTextureNV( cxGPUContext, 0, g_texture_vol.pTexture, 0, //Miplevel &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); g_texture_vol.clMem = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, 4 * g_texture_vol.width * g_texture_vol.height * g_texture_vol.depth, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); return S_OK; }
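// D3DFMT_A8R8G8B8 packs one pixel per 32-bit word as 0xAARRGGBB, so the fill
// value written above (0xFF00FFFF) is opaque cyan: A=FF, R=00, G=FF, B=FF.
// A small helper for building such values (hypothetical; matches the
// unsigned long pixel type used in the lock/fill loop above):
static unsigned long PackARGB(unsigned int a, unsigned int r, unsigned int g, unsigned int b)
{
    return ((unsigned long)a << 24) | ((unsigned long)r << 16) | ((unsigned long)g << 8) | (unsigned long)b;
}
// e.g. PackARGB(0xFF, 0x00, 0xFF, 0xFF) == 0xFF00FFFF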
// Keyboard event handler callback //***************************************************************************** void KeyboardGL(unsigned char key, int /*x*/, int /*y*/) { switch(key) { case 'P': // P toggles Processing between CPU and GPU case 'p': // p toggles Processing between CPU and GPU if (iProcFlag == 0) { iProcFlag = 1; } else { iProcFlag = 0; } shrLog("\n%s Processing...\n", cProcessor[iProcFlag]); break; case 'F': // F toggles main graphics display full screen case 'f': // f toggles main graphics display full screen bFullScreen = !bFullScreen; if (bFullScreen) { iGraphicsWinPosX = glutGet(GLUT_WINDOW_X) - 8; iGraphicsWinPosY = glutGet(GLUT_WINDOW_Y) - 30; iGraphicsWinWidth = min(glutGet(GLUT_WINDOW_WIDTH) , glutGet(GLUT_SCREEN_WIDTH) - 2*iGraphicsWinPosX ); iGraphicsWinHeight = min(glutGet(GLUT_WINDOW_HEIGHT), glutGet(GLUT_SCREEN_HEIGHT)- 2*iGraphicsWinPosY ); printf("(x,y)=(%d,%d), (w,h)=(%d,%d)\n", iGraphicsWinPosX, iGraphicsWinPosY, iGraphicsWinWidth, iGraphicsWinHeight); glutFullScreen(); } else { glutPositionWindow(iGraphicsWinPosX, iGraphicsWinPosY); glutReshapeWindow(iGraphicsWinWidth, iGraphicsWinHeight); } shrLog("\nMain Graphics %s...\n", bFullScreen ? "FullScreen" : "Windowed"); break; case ' ': // space bar toggles filter on and off bFilter = !bFilter; shrLog("\nSobel Filter Toggled %s...\n", bFilter ? "ON" : "OFF"); break; case '+': // + sign increases threshold case '=': // = sign increases threshold case '-': // - sign decreases threshold case '_': // _ decreases threshold if(key == '+' || key == '=') { fThresh += 10.0f; } else { fThresh -= 10.0f; } // Clamp and reset the associated kernel arg, and log value fThresh = CLAMP(fThresh, 0.0f, 255.0f); for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++) { ciErrNum = clSetKernelArg(ckSobel[i], 6, sizeof(cl_float), (void*)&fThresh); } oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("\nThreshold changed to %.1f...\n", fThresh); break; case '\033':// Escape quits case '\015':// Enter quits case 'Q': // Q quits case 'q': // q quits // Cleanup up and quit bNoPrompt = shrTRUE; Cleanup(EXIT_SUCCESS); break; } // Trigger fps update and call for refresh TriggerFPSUpdate(); }
// Init OpenCL //***************************************************************************** int initCL(int argc, const char** argv) { cl_platform_id cpPlatform; cl_uint uiDevCount; cl_device_id *cdDevices; //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Get the number of GPU devices available to the platform ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiDevCount); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create the device list cdDevices = new cl_device_id [uiDevCount]; ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiDevCount, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Get device requested on command line, if any unsigned int uiDeviceUsed = 0; unsigned int uiEndDev = uiDevCount - 1; if(shrGetCmdLineArgumentu(argc, argv, "device", &uiDeviceUsed)) { uiDeviceUsed = CLAMP(uiDeviceUsed, 0, uiEndDev); uiEndDev = uiDeviceUsed; } // Check if the requested device (or any of the devices if none requested) supports context sharing with OpenGL if(bGLinterop && !bQATest) { bool bSharingSupported = false; for(unsigned int i = uiDeviceUsed; (!bSharingSupported && (i <= uiEndDev)); ++i) { size_t extensionSize; ciErrNum = clGetDeviceInfo(cdDevices[i], CL_DEVICE_EXTENSIONS, 0, NULL, &extensionSize ); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); if(extensionSize > 0) { char* extensions = (char*)malloc(extensionSize); ciErrNum = clGetDeviceInfo(cdDevices[i], CL_DEVICE_EXTENSIONS, extensionSize, extensions, &extensionSize); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); std::string stdDevString(extensions); free(extensions); size_t szOldPos = 0; size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited while (szSpacePos != stdDevString.npos) { if( strcmp(GL_SHARING_EXTENSION, stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 ) { // Device supports context sharing with OpenGL uiDeviceUsed = i; bSharingSupported = true; break; } do { szOldPos = szSpacePos + 1; szSpacePos = stdDevString.find(' ', szOldPos); } while (szSpacePos == szOldPos); } } } shrLog("%s...\n\n", bSharingSupported ? 
"Using CL-GL Interop" : "No device found that supports CL/GL context sharing"); oclCheckErrorEX(bSharingSupported, true, pCleanup); // Define OS-specific context properties and create the OpenCL context #if defined (__APPLE__) || defined (MACOSX) CGLContextObj kCGLContext = CGLGetCurrentContext(); CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext); cl_context_properties props[] = { CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup, 0 }; cxGPUContext = clCreateContext(props, 0,0, NULL, NULL, &ciErrNum); #else #ifdef UNIX cl_context_properties props[] = { CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(), CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(), CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0 }; cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum); #else // Win32 cl_context_properties props[] = { CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0 }; cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum); #endif #endif } else { // No GL interop cl_context_properties props[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0}; cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum); bGLinterop = shrFALSE; } shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Log device used shrLog("Device # %u, ", uiDeviceUsed); oclPrintDevName(LOGBOTH, cdDevices[uiDeviceUsed]); shrLog("\n"); // create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiDeviceUsed], 0, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Memory Setup if( bGLinterop ) { cl_pbos[0] = clCreateFromGLBuffer(cxGPUContext, CL_MEM_READ_ONLY, pbo_source, &ciErrNum); cl_pbos[1] = clCreateFromGLBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, pbo_dest, &ciErrNum); } else { cl_pbos[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, 4 * image_width * image_height, NULL, &ciErrNum); cl_pbos[1] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, 4 * image_width * image_height, NULL, &ciErrNum); } oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Program Setup size_t program_length; const char* source_path = shrFindFilePath(clSourcefile, argv[0]); char *source = oclLoadProgSource(source_path, "", &program_length); oclCheckErrorEX(source != NULL, shrTRUE, pCleanup); // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source, &program_length, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); free(source); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclPostProcessGL.ptx"); Cleanup(EXIT_FAILURE); } // create the kernel ckKernel = clCreateKernel(cpProgram, "postprocess", &ciErrNum); // set the args values ciErrNum |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &(cl_pbos[0])); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &(cl_pbos[1])); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(image_width), &image_width); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(image_width), 
&image_height); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); return 0; }
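// pbo_source and pbo_dest must already exist as GL buffer objects before
// clCreateFromGLBuffer can wrap them. The sample creates them elsewhere; a
// sketch of the typical creation path (hypothetical helper, GLEW assumed
// initialized; the binding target is irrelevant for plain allocation):
GLuint CreatePBOSketch(GLsizeiptr numBytes)
{
    GLuint pbo = 0;
    glGenBuffers(1, &pbo);
    glBindBuffer(GL_ARRAY_BUFFER, pbo);
    glBufferData(GL_ARRAY_BUFFER, numBytes, NULL, GL_DYNAMIC_DRAW); // allocate, no initial data
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    return pbo;
}
// e.g. pbo_source = CreatePBOSketch(4 * image_width * image_height);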
//-----------------------------------------------------------------------------
// Name: InitCL()
// Desc: Get platform and devices and create context and queues
//-----------------------------------------------------------------------------
HRESULT InitCL(int argc, const char** argv)
{
    cl_platform_id cpPlatform;

    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    //
    // Initialize extension functions for D3D9
    //
    INITPFN(clGetDeviceIDsFromD3D9NV);
    INITPFN(clCreateFromD3D9VertexBufferNV);
    INITPFN(clCreateFromD3D9IndexBufferNV);
    INITPFN(clCreateFromD3D9SurfaceNV);
    INITPFN(clCreateFromD3D9TextureNV);
    INITPFN(clCreateFromD3D9CubeTextureNV);
    INITPFN(clCreateFromD3D9VolumeTextureNV);
    INITPFN(clEnqueueAcquireD3D9ObjectsNV);
    INITPFN(clEnqueueReleaseD3D9ObjectsNV);

    // Query the OpenCL device that is suitable for the current D3D device:
    // we need the one that is on the same graphics card.
    // Get the device ids for the adapter
    cl_device_id cdDevice;
    cl_uint num_devices = 0;
    ciErrNum = clGetDeviceIDsFromD3D9NV(
        cpPlatform,
        CL_D3D9_DEVICE_NV, //CL_D3D9_ADAPTER_NAME_NV,
        g_pD3DDevice, //adapterName,
        CL_PREFERRED_DEVICES_FOR_D3D9_NV, //CL_ALL_DEVICES_FOR_D3D9_NV,
        1,
        &cdDevice,
        &num_devices);
    if (ciErrNum == CL_DEVICE_NOT_FOUND) // == -1
    {
        shrLog("No OpenCL device available that supports D3D9, exiting...\n");
        Cleanup (EXIT_SUCCESS);
    }
    else
    {
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    }

    cl_context_properties props[] =
    {
        CL_CONTEXT_D3D9_DEVICE_NV, (cl_context_properties)g_pD3DDevice,
        CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
        0
    };
    cxGPUContext = clCreateContext(props, 1, &cdDevice, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Log device used
    shrLog("Device: ");
    oclPrintDevName(LOGBOTH, cdDevice);
    shrLog("\n");

    // create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    CreateKernelProgram(argv[0], "texture_2d.cl", "texture_2d.ptx", "cl_kernel_texture_2d", cpProgram_tex2d, ckKernel_tex2d);
    CreateKernelProgram(argv[0], "texture_cube.cl", "texture_cube.ptx", "cl_kernel_texture_cube", cpProgram_texcube, ckKernel_texcube);
    CreateKernelProgram(argv[0], "texture_volume.cl", "texture_volume.ptx", "cl_kernel_texture_volume", cpProgram_texvolume, ckKernel_texvolume);

    return S_OK;
}
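// INITPFN above is an SDK macro that resolves the NV D3D9 extension entry
// points at runtime, since they are not exported directly by the OpenCL
// library. Its exact definition isn't shown in this listing; a sketch of what
// such a macro typically expands to is below. The "_fn" typedef suffix and the
// error handling are assumptions for illustration
// (clGetExtensionFunctionAddress is the standard pre-OpenCL-1.2 lookup API):
#define INITPFN_SKETCH(x) \
    x = (x##_fn)clGetExtensionFunctionAddress(#x); \
    if (!(x)) { shrLog("Failed to resolve %s\n", #x); Cleanup(EXIT_FAILURE); }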
// Main program
//*****************************************************************************
int main(int argc, char** argv)
{
    // Locals used with command line args
    int p = 256;    // workgroup X dimension
    int q = 1;      // workgroup Y dimension

    pArgc = &argc;
    pArgv = argv;

    shrQAStart(argc, argv);

    // latch the executable path for other funcs to use
    cExecutablePath = argv[0];

    // start logs and show command line help
    shrSetLogFileName ("oclNbody.txt");
    shrLog("%s Starting...\n\n", cExecutablePath);
    shrLog("Command line switches:\n");
    shrLog("  --qatest\t\tCheck correctness of GPU execution and measure performance\n");
    shrLog("  --noprompt\t\tQuit simulation automatically after a brief period\n");
    shrLog("  --n=<numbodies>\tSpecify # of bodies to simulate (default = %d)\n", numBodies);
    shrLog("  --double\t\tUse double precision floating point values for simulation\n");
    shrLog("  --p=<workgroup X dim>\tSpecify X dimension of workgroup (default = %d)\n", p);
    shrLog("  --q=<workgroup Y dim>\tSpecify Y dimension of workgroup (default = %d)\n\n", q);

    // Get command line arguments if there are any and set vars accordingly
    if (argc > 0)
    {
        shrGetCmdLineArgumenti(argc, (const char**)argv, "p", &p);
        shrGetCmdLineArgumenti(argc, (const char**)argv, "q", &q);
        shrGetCmdLineArgumenti(argc, (const char**)argv, "n", &numBodies);
        bDouble = (shrTRUE == shrCheckCmdLineFlag(argc, (const char**)argv, "double"));
        bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
        bQATest = shrCheckCmdLineFlag(argc, (const char**)argv, "qatest");
    }

    //Get the NVIDIA platform
    cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clGetPlatformID...\n\n");

    if (bDouble)
    {
        shrLog("Double precision execution...\n\n");
    }
    else
    {
        shrLog("Single precision execution...\n\n");
    }

    flopsPerInteraction = bDouble ?
30 : 20; //Get all the devices shrLog("Get the Device info and select Device...\n"); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Set target device and Query number of compute units on uiTargetDevice shrLog(" # of Devices Available = %u\n", uiNumDevices); if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE) { uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1)); } shrLog(" Using Device %u, ", uiTargetDevice); oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]); cl_uint uiNumComputeUnits; clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL); shrLog(" # of Compute Units = %u\n", uiNumComputeUnits); //Create the context shrLog("clCreateContext...\n"); cxContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create a command-queue shrLog("clCreateCommandQueue...\n\n"); cqCommandQueue = clCreateCommandQueue(cxContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Log and config for number of bodies shrLog("Number of Bodies = %d\n", numBodies); switch (numBodies) { case 1024: activeParams.m_clusterScale = 1.52f; activeParams.m_velocityScale = 2.f; break; case 2048: activeParams.m_clusterScale = 1.56f; activeParams.m_velocityScale = 2.64f; break; case 4096: activeParams.m_clusterScale = 1.68f; activeParams.m_velocityScale = 2.98f; break; case 7680: case 8192: activeParams.m_clusterScale = 1.98f; activeParams.m_velocityScale = 2.9f; break; default: case 15360: case 16384: activeParams.m_clusterScale = 1.54f; activeParams.m_velocityScale = 8.f; break; case 30720: case 32768: activeParams.m_clusterScale = 1.44f; activeParams.m_velocityScale = 11.f; break; } if ((q * p) > 256) { p = 256 / q; shrLog("Setting p=%d to maintain %d threads per block\n", p, 256); } if ((q == 1) && (numBodies < p)) { p = numBodies; shrLog("Setting p=%d because # of bodies < p\n", p); } shrLog("Workgroup Dims = (%d x %d)\n\n", p, q); // Initialize OpenGL items if using GL if (bQATest == shrFALSE) { assert(0); /* shrLog("Calling InitGL...\n"); InitGL(&argc, argv); */ } else { shrLog("Skipping InitGL...\n"); } // CL/GL interop disabled bUsePBO = (false && (bQATest == shrFALSE)); InitNbody(cdDevices[uiTargetDevice], cxContext, cqCommandQueue, numBodies, p, q, bUsePBO, bDouble); ResetSim(nbody, numBodies, NBODY_CONFIG_SHELL, bUsePBO); // init timers shrDeltaT(DEMOTIME); // timer 0 is for timing demo periods shrDeltaT(FUNCTIME); // timer 1 is for logging function delta t's shrDeltaT(FPSTIME); // timer 2 is for fps measurement // Standard simulation if (bQATest == shrFALSE) { assert(0); /* shrLog("Running standard oclNbody simulation...\n\n"); glutDisplayFunc(DisplayGL); glutReshapeFunc(ReshapeGL); glutMouseFunc(MouseGL); glutMotionFunc(MotionGL); glutKeyboardFunc(KeyboardGL); glutSpecialFunc(SpecialGL); glutIdleFunc(IdleGL); glutMainLoop(); */ } // Compare to host, profile and write out file for regression analysis if (bQATest == shrTRUE) { bool bTestResults = false; shrLog("Running oclNbody Results Comparison...\n\n"); bTestResults = 
CompareResults(numBodies); //shrLog("Profiling oclNbody...\n\n"); //RunProfiling(100, (unsigned int)(p * q)); // 100 iterations shrQAFinish(argc, (const char **)argv, bTestResults ? QA_PASSED : QA_FAILED); } else { // Cleanup/exit bNoPrompt = shrTRUE; shrQAFinish2(false, *pArgc, (const char **)pArgv, QA_PASSED); } Cleanup(EXIT_SUCCESS); }
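// flopsPerInteraction (20 flops single precision, 30 double) feeds the usual
// N-body throughput metric: each simulation step evaluates all N*N pairwise
// body-body interactions. A sketch of the computation (hypothetical helper;
// the sample's own reporting lives in the profiling code referenced above):
double ComputePerfGFLOPSSketch(int numBodies, int flopsPerInteraction, double dSecondsPerStep)
{
    double interactionsPerStep = (double)numBodies * (double)numBodies;
    return 1.0e-9 * interactionsPerStep * (double)flopsPerInteraction / dSecondsPerStep;
}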
// main function
//*****************************************************************************
int main(int argc, const char **argv)
{
    cl_platform_id cpPlatform;                    // OpenCL platform
    cl_uint nDevice;                              // OpenCL device count
    cl_device_id* cdDevices;                      // OpenCL device list
    cl_context cxGPUContext;                      // OpenCL context
    cl_command_queue cqCommandQue[MAX_GPU_COUNT]; // OpenCL command queue
    cl_int ciErrNum = 1;                          // Error code var

    shrQAStart(argc, (char **)argv);

    // Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("clGetPlatformID...\n");

    //Get all the devices
    cl_uint uiNumDevices = 0;       // Number of devices available
    cl_uint uiTargetDevice = 0;     // Default Device to compute on
    cl_uint uiNumComputeUnits;      // Number of compute units (SM's on NV GPU)
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id));
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    // Get command line device options and config accordingly
    shrLog("  # of Devices Available = %u\n", uiNumDevices);
    if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice) == shrTRUE)
    {
        uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
    }
    shrLog("  Using Device %u: ", uiTargetDevice);
    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
    ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("\n  # of Compute Units = %u\n", uiNumComputeUnits);

    shrSetLogFileName ("oclHiddenMarkovModel.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    // Reuse the platform and device list queried above (no need to query twice)
    nDevice = uiNumDevices;

    shrLog("clCreateContext\n");
    cxGPUContext = clCreateContext(0, nDevice, cdDevices, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clCreateCommandQueue\n");
    int id_device;
    if(shrGetCmdLineArgumenti(argc, argv, "device", &id_device)) // Set up command queue(s) for GPU specified on the command line
    {
        // create a command queue
        cqCommandQue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[id_device], 0, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        oclPrintDevInfo(LOGBOTH, cdDevices[id_device]);
        nDevice = 1;
    }
    else
    {
        // create command queues for all available devices
        for (cl_uint i = 0; i < nDevice; i++)
        {
            cqCommandQue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        }
        for (cl_uint i = 0; i < nDevice; i++)
            oclPrintDevInfo(LOGBOTH, cdDevices[i]);
    }
    shrLog("\nUsing %d GPU(s)...\n\n", nDevice);

    int wgSize;
    if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &wgSize))
    {
        wgSize = 256;
    }

    shrLog("Init Hidden Markov Model parameters\n");
    int nState = 256*16;    // number of states, must be a multiple of 256
    int nEmit  = 128;       // number of possible
observations float *initProb = (float*)malloc(sizeof(float)*nState); // initial probability float *mtState = (float*)malloc(sizeof(float)*nState*nState); // state transition matrix float *mtEmit = (float*)malloc(sizeof(float)*nEmit*nState); // emission matrix initHMM(initProb, mtState, mtEmit, nState, nEmit); // define observational sequence int nObs = 100; // size of observational sequence int **obs = (int**)malloc(nDevice*sizeof(int*)); int **viterbiPathCPU = (int**)malloc(nDevice*sizeof(int*)); int **viterbiPathGPU = (int**)malloc(nDevice*sizeof(int*)); float *viterbiProbCPU = (float*)malloc(nDevice*sizeof(float)); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { obs[iDevice] = (int*)malloc(sizeof(int)*nObs); for (int i = 0; i < nObs; i++) obs[iDevice][i] = i % 15; viterbiPathCPU[iDevice] = (int*)malloc(sizeof(int)*nObs); viterbiPathGPU[iDevice] = (int*)malloc(sizeof(int)*nObs); } shrLog("# of states = %d\n# of possible observations = %d \nSize of observational sequence = %d\n\n", nState, nEmit, nObs); shrLog("Compute Viterbi path on GPU\n\n"); HMM **oHmm = (HMM**)malloc(nDevice*sizeof(HMM*)); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { oHmm[iDevice] = new HMM(cxGPUContext, cqCommandQue[iDevice], initProb, mtState, mtEmit, nState, nEmit, nObs, argv[0], wgSize); } cl_mem *vProb = (cl_mem*)malloc(sizeof(cl_mem)*nDevice); cl_mem *vPath = (cl_mem*)malloc(sizeof(cl_mem)*nDevice); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { vProb[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(float), NULL, &ciErrNum); vPath[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)*nObs, NULL, &ciErrNum); } #ifdef GPU_PROFILING for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clFinish(cqCommandQue[iDevice]);; } shrDeltaT(1); #endif size_t szWorkGroup; for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { szWorkGroup = oHmm[iDevice]->ViterbiSearch(vProb[iDevice], vPath[iDevice], obs[iDevice]); } for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { clFinish(cqCommandQue[iDevice]); } #ifdef GPU_PROFILING double dElapsedTime = shrDeltaT(1); shrLogEx(LOGBOTH | MASTER, 0, "oclHiddenMarkovModel, Throughput = %.4f GB/s, Time = %.5f s, Size = %u items, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-9 * 2.0 * sizeof(float) * nDevice * nState * nState * (nObs-1))/dElapsedTime, dElapsedTime, (nDevice * nState * nObs), nDevice, szWorkGroup); #endif for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { ciErrNum = clEnqueueReadBuffer(cqCommandQue[iDevice], vPath[iDevice], CL_TRUE, 0, sizeof(int)*nObs, viterbiPathGPU[iDevice], 0, NULL, NULL); } shrLog("\nCompute Viterbi path on CPU\n"); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { ciErrNum = ViterbiCPU(viterbiProbCPU[iDevice], viterbiPathCPU[iDevice], obs[iDevice], nObs, initProb, mtState, nState, mtEmit); } if (!ciErrNum) { shrEXIT(argc, argv); } bool pass = true; for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { for (int i = 0; i < nObs; i++) { if (viterbiPathCPU[iDevice][i] != viterbiPathGPU[iDevice][i]) { pass = false; break; } } } // NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity. 
shrLog("Release CPU buffers and OpenCL objects...\n"); free(initProb); free(mtState); free(mtEmit); for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++) { free(obs[iDevice]); free(viterbiPathCPU[iDevice]); free(viterbiPathGPU[iDevice]); delete oHmm[iDevice]; clReleaseCommandQueue(cqCommandQue[iDevice]); } free(obs); free(viterbiPathCPU); free(viterbiPathGPU); free(viterbiProbCPU); free(cdDevices); free(oHmm); clReleaseContext(cxGPUContext); // finish shrQAFinishExit(argc, (const char **)argv, pass ? QA_PASSED : QA_FAILED); shrEXIT(argc, argv); }
// Main function
// *********************************************************************
int main(int argc, char **argv)
{
    gp_argc = &argc;
    gp_argv = &argv;

    shrQAStart(argc, argv);

    // Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("clGetPlatformID...\n");

    //Get all the devices
    cl_uint uiNumDevices = 0;       // Number of devices available
    cl_uint uiTargetDevice = 0;     // Default Device to compute on
    cl_uint uiNumComputeUnits;      // Number of compute units (SM's on NV GPU)
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id));
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    // Get command line device options and config accordingly
    shrLog("  # of Devices Available = %u\n", uiNumDevices);
    if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice) == shrTRUE)
    {
        uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
    }
    shrLog("  Using Device %u: ", uiTargetDevice);
    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
    ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("\n  # of Compute Units = %u\n", uiNumComputeUnits);

    // get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");

    // start logs
    cExecutableName = argv[0];
    shrSetLogFileName ("oclDotProduct.txt");
    shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);

    // set and log Global and Local work size dimensions
    szLocalWorkSize = 256;
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize
    shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
           szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));

    // Allocate and initialize host arrays
    shrLog( "Allocate and Init Host Mem...\n");
    srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
    srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
    dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
    Golden = (void *)malloc(sizeof(cl_float) * iNumElements);
    shrFillArray((float*)srcA, 4 * iNumElements);
    shrFillArray((float*)srcB, 4 * iNumElements);

    // Create the context on the selected device (platform and device list were queried above)
    cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create a command-queue
    shrLog("clCreateCommandQueue...\n");
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n"); cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup); // Create the program shrLog("clCreateProgramWithSource...\n"); cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum); // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif shrLog("clBuildProgram...\n"); ciErrNum = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx"); Cleanup(EXIT_FAILURE); } // Create the kernel shrLog("clCreateKernel (DotProduct)...\n"); ckKernel = clCreateKernel(cpProgram, "DotProduct", &ciErrNum); // Set the Argument values shrLog("clSetKernelArg 0 - 3...\n\n"); ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // -------------------------------------------------------- // Core sequence... 
copy input data to GPU, compute, copy results back // Asynchronous write of data to GPU device shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL); ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Launch kernel shrLog("clEnqueueNDRangeKernel (DotProduct)...\n"); ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Read back results and check accumulated errors shrLog("clEnqueueReadBuffer (Dst)...\n\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Compute and compare results for golden-host and report errors and pass/fail shrLog("Comparing against Host/C++ computation...\n\n"); DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements); shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0); // Cleanup and leave Cleanup (EXIT_SUCCESS); }
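// DotProductHost above is the sample's CPU reference. The "DotProduct" kernel
// presumably treats srcA and srcB as arrays of float4 and writes one scalar
// per element; a plausible host equivalent is sketched below (hypothetical
// name and indexing; treat as illustration, not the SDK's exact source):
void DotProductHostRef(const float* pfSrcA, const float* pfSrcB, float* pfDst, int iNumElem)
{
    for (int i = 0; i < iNumElem; i++)
    {
        // 4-component dot product of the i-th float4 pair
        pfDst[i] = pfSrcA[4*i+0] * pfSrcB[4*i+0]
                 + pfSrcA[4*i+1] * pfSrcB[4*i+1]
                 + pfSrcA[4*i+2] * pfSrcB[4*i+2]
                 + pfSrcA[4*i+3] * pfSrcB[4*i+3];
    }
}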
// Setup function for GLUT parameters and loop
//*****************************************************************************
void InitGL(int* argc, char **argv)
{
    // init GLUT
    glutInit(argc, argv);
    glutInitDisplayMode(GLUT_RGB | GLUT_DEPTH | GLUT_DOUBLE);
    glutInitWindowPosition (glutGet(GLUT_SCREEN_WIDTH)/2 - iGraphicsWinWidth/2,
                            glutGet(GLUT_SCREEN_HEIGHT)/2 - iGraphicsWinHeight/2);
    glutInitWindowSize(iGraphicsWinWidth, iGraphicsWinHeight);
    iGLUTWindowHandle = glutCreateWindow("OpenCL for GPU Nbody Demo");
#if !(defined (__APPLE__) || defined(MACOSX) || defined(__EMSCRIPTEN__))
    glutSetOption(GLUT_ACTION_ON_WINDOW_CLOSE, GLUT_ACTION_GLUTMAINLOOP_RETURNS);
#endif

    // init GLEW
#ifndef __EMSCRIPTEN__
    glewInit();
    GLboolean bGlew = glewIsSupported("GL_VERSION_2_0 "
                                      "GL_VERSION_1_5 "
                                      "GL_ARB_multitexture "
                                      "GL_ARB_vertex_buffer_object");
    oclCheckErrorEX(bGlew, shrTRUE, pCleanup);
#endif

    glEnable(GL_DEPTH_TEST);
    glClearColor(0.0, 0.0, 0.0, 1.0);

    renderer = new ParticleRenderer();

    // check GL errors
    GLenum error;
    while ((error = glGetError()) != GL_NO_ERROR)
    {
#ifdef __EMSCRIPTEN__
        shrLog("InitGL: error - %d\n", error);
#else
        shrLog("InitGL: error - %s\n", (char *)gluErrorString(error));
#endif
    }

    // Disable vertical sync, if supported
#ifdef _WIN32
    if (wglewIsSupported("WGL_EXT_swap_control"))
    {
        iVsyncState = wglGetSwapIntervalEXT();
        wglSwapIntervalEXT(0);
    }
#else
#if defined (__APPLE__) || defined(MACOSX)
    GLint VBL = 0;
    CGLGetParameter(CGLGetCurrentContext(), kCGLCPSwapInterval, &iVsyncState);
    CGLSetParameter(CGLGetCurrentContext(), kCGLCPSwapInterval, &VBL);
#elif __EMSCRIPTEN__
#else
    if(glxewIsSupported("GLX_SGI_swap_control"))
    {
        glXSwapIntervalSGI(0);
    }
#endif
#endif

    // create a new parameter list
    paramlist = new ParamListGL("sliders");
    paramlist->bar_col_outer[0] = 0.8f;
    paramlist->bar_col_outer[1] = 0.8f;
    paramlist->bar_col_outer[2] = 0.0f;
    paramlist->bar_col_inner[0] = 0.8f;
    paramlist->bar_col_inner[1] = 0.8f;
    paramlist->bar_col_inner[2] = 0.0f;

    // add parameters to the list
    // Point Size
    paramlist->AddParam(new Param<float>("Point Size", activeParams.m_pointSize,
                        0.0f, 10.0f, 0.01f, &activeParams.m_pointSize));
    // Velocity Damping
    paramlist->AddParam(new Param<float>("Velocity Damping", activeParams.m_damping,
                        0.5f, 1.0f, .0001f, &(activeParams.m_damping)));
    // Softening Factor
    paramlist->AddParam(new Param<float>("Softening Factor", activeParams.m_softening,
                        0.001f, 1.0f, .0001f, &(activeParams.m_softening)));
    // Time step size
    paramlist->AddParam(new Param<float>("Time Step", activeParams.m_timestep,
                        0.0f, 1.0f, .0001f, &(activeParams.m_timestep)));
    // Cluster scale (only affects starting configuration)
    paramlist->AddParam(new Param<float>("Cluster Scale", activeParams.m_clusterScale,
                        0.0f, 10.0f, 0.01f, &(activeParams.m_clusterScale)));
    // Velocity scale (only affects starting configuration)
    paramlist->AddParam(new Param<float>("Velocity Scale", activeParams.m_velocityScale,
                        0.0f, 1000.0f, 0.1f, &activeParams.m_velocityScale));
}
// Initialize OpenCL
//*****************************************************************************
void createCLContext()
{
    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Get the number of GPU devices available to the platform
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiDevCount);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create the device list
    cdDevices = new cl_device_id [uiDevCount];
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiDevCount, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Get device requested on command line, if any
    uiDeviceUsed = 0;
    unsigned int uiEndDev = uiDevCount - 1;

    // Check if the requested device (or any of the devices if none requested)
    // supports context sharing with OpenGL.
    // Note: the if(0) below permanently disables the interop path in this
    // build, so the no-interop branch is always taken.
    if(0)
    {
        bool bSharingSupported = false;
        for(unsigned int i = uiDeviceUsed; (!bSharingSupported && (i <= uiEndDev)); ++i)
        {
            size_t extensionSize;
            ciErrNum = clGetDeviceInfo(cdDevices[i], CL_DEVICE_EXTENSIONS, 0, NULL, &extensionSize );
            oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
            if(extensionSize > 0)
            {
                char* extensions = (char*)malloc(extensionSize);
                ciErrNum = clGetDeviceInfo(cdDevices[i], CL_DEVICE_EXTENSIONS, extensionSize, extensions, &extensionSize);
                oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
                std::string stdDevString(extensions);
                free(extensions);
                size_t szOldPos = 0;
                size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
                while (szSpacePos != stdDevString.npos)
                {
                    if( strcmp(GL_SHARING_EXTENSION, stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
                    {
                        // Device supports context sharing with OpenGL
                        uiDeviceUsed = i;
                        bSharingSupported = true;
                        break;
                    }
                    do
                    {
                        szOldPos = szSpacePos + 1;
                        szSpacePos = stdDevString.find(' ', szOldPos);
                    }
                    while (szSpacePos == szOldPos);
                }
            }
        }

        // Log CL-GL interop support and quit if not available (sample needs it)
        // shrLog("%s...\n", bSharingSupported ? "Using CL-GL Interop" : "No device found that supports CL/GL context sharing");
        oclCheckErrorEX(bSharingSupported, true, pCleanup);

        // Define OS-specific context properties and create the OpenCL context
#if defined (__APPLE__)
        CGLContextObj kCGLContext = CGLGetCurrentContext();
        CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);
        cl_context_properties props[] =
        {
            CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup,
            0
        };
        cxGPUContext = clCreateContext(props, 0, 0, NULL, NULL, &ciErrNum);
#else
#ifdef UNIX
        cl_context_properties props[] =
        {
            CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(),
            CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(),
            CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
            0
        };
        cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum);
#else // Win32
        cl_context_properties props[] =
        {
            CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
            CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(),
            CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
            0
        };
        cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum);
#endif
#endif
    }
    else
    {
        // No GL interop
        cl_context_properties props[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0};
        cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum);
        g_glInterop = false;
    }
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
}
//----------------------------------------------------------------------------- //! Run the CL part of the computation //----------------------------------------------------------------------------- void RunKernels() { static float t = 0.0f; // ---------------------------------------------------------------- // populate the 2d texture { // set global and local work item dimensions szLocalWorkSize[0] = 16; szLocalWorkSize[1] = 16; szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], g_texture_2d.width); szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], g_texture_2d.height); // set the args values #ifdef USE_STAGING_BUFFER ciErrNum |= clSetKernelArg(ckKernel_tex2d, 0, sizeof(g_texture_2d.clMem), (void *) &(g_texture_2d.clMem)); #else ciErrNum |= clSetKernelArg(ckKernel_tex2d, 0, sizeof(g_texture_2d.clTexture), (void *) &(g_texture_2d.clTexture)); #endif ciErrNum |= clSetKernelArg(ckKernel_tex2d, 1, sizeof(g_texture_2d.clTexture), (void *) &(g_texture_2d.clTexture)); ciErrNum |= clSetKernelArg(ckKernel_tex2d, 2, sizeof(g_texture_2d.width), &g_texture_2d.width); ciErrNum |= clSetKernelArg(ckKernel_tex2d, 3, sizeof(g_texture_2d.height), &g_texture_2d.height); ciErrNum |= clSetKernelArg(ckKernel_tex2d, 4, sizeof(g_texture_2d.pitch), &g_texture_2d.pitch); ciErrNum |= clSetKernelArg(ckKernel_tex2d, 5, sizeof(t), &t); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // launch computation kernel ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_tex2d, 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); #ifdef USE_STAGING_BUFFER size_t dst[3] = { 0, 0, 0}; size_t region[3] = { g_texture_2d.width, g_texture_2d.height, 1}; ciErrNum |= clEnqueueCopyBufferToImage(cqCommandQueue, g_texture_2d.clMem /* src_buffer */, g_texture_2d.clTexture /* dst_image */, 0 /* src_offset */, dst /* dst_origin[3] */, region /* region[3] */, 0 /* num_events_in_wait_list */, NULL /* event_wait_list */, NULL /* event */); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); #endif } // ---------------------------------------------------------------- // populate the volume texture { // set global and local work item dimensions szLocalWorkSize[0] = 16; szLocalWorkSize[1] = 16; szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], g_texture_vol.width); szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], g_texture_vol.height); // set the args values ciErrNum |= clSetKernelArg(ckKernel_texvolume, 0, sizeof(g_texture_vol.clMem), (void *) &(g_texture_vol.clMem)); ciErrNum |= clSetKernelArg(ckKernel_texvolume, 1, sizeof(g_texture_vol.width), &g_texture_vol.width); ciErrNum |= clSetKernelArg(ckKernel_texvolume, 2, sizeof(g_texture_vol.height), &g_texture_vol.height); ciErrNum |= clSetKernelArg(ckKernel_texvolume, 3, sizeof(g_texture_vol.depth), &g_texture_vol.depth); ciErrNum |= clSetKernelArg(ckKernel_texvolume, 4, sizeof(g_texture_vol.pitch), &g_texture_vol.pitch); ciErrNum |= clSetKernelArg(ckKernel_texvolume, 5, sizeof(g_texture_vol.pitchslice), &g_texture_vol.pitchslice); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // launch computation kernel ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_texvolume, 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // ONLY staging buffer works, for volume texture // do the copy here size_t dst[3] = { 0, 0, 0}; size_t region[3] = { g_texture_vol.width, g_texture_vol.height, g_texture_vol.depth}; ciErrNum |= 
clEnqueueCopyBufferToImage(cqCommandQueue, g_texture_vol.clMem /* src_buffer */, g_texture_vol.clTexture /* dst_image */, 0 /* src_offset */, dst /* dst_origin[3] */, region /* region[3] */, 0 /* num_events_in_wait_list */, NULL /* event_wait_list */, NULL /* event */); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); } // ---------------------------------------------------------------- // populate the faces of the cube map for (int face = 0; face < 6; ++face) { // set global and local work item dimensions szLocalWorkSize[0] = 16; szLocalWorkSize[1] = 16; szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], g_texture_cube.size); szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], g_texture_cube.size); // set the args values #ifdef USE_STAGING_BUFFER ciErrNum |= clSetKernelArg(ckKernel_texcube, 0, sizeof(g_texture_cube.clMem[face]), (void *) &(g_texture_cube.clMem[face])); #else ciErrNum |= clSetKernelArg(ckKernel_texcube, 0, sizeof(g_texture_cube.clTexture[face]), (void *) &(g_texture_cube.clTexture[face])); #endif ciErrNum |= clSetKernelArg(ckKernel_texcube, 1, sizeof(g_texture_cube.size), &g_texture_cube.size); ciErrNum |= clSetKernelArg(ckKernel_texcube, 2, sizeof(g_texture_cube.pitch), &g_texture_cube.pitch); ciErrNum |= clSetKernelArg(ckKernel_texcube, 3, sizeof(int), &face); ciErrNum |= clSetKernelArg(ckKernel_texcube, 4, sizeof(t), &t); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // launch computation kernel ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue, ckKernel_texcube, 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); #ifdef USE_STAGING_BUFFER size_t dst[3] = { 0, 0, 0}; size_t region[3] = { g_texture_cube.size, g_texture_cube.size, 1}; ciErrNum |= clEnqueueCopyBufferToImage(cqCommandQueue, g_texture_cube.clMem[face]/* src_buffer */, g_texture_cube.clTexture[face]/* dst_image */, 0 /* src_offset */, dst /* dst_origin[3] */, region /* region[3] */, 0 /* num_events_in_wait_list */, NULL /* event_wait_list */, NULL /* event */); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); #endif } t += 0.1f; }
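// The argument lists set in RunKernels imply kernel signatures like the
// following sketch (OpenCL C, belonging in a .cl file such as texture_2d.cl).
// Only the parameter layout follows the host code above; the animated pattern
// is invented for illustration, and this is not the SDK's actual kernel source.
__kernel void cl_kernel_texture_2d_sketch(__global uint* dst,        // staging buffer (arg 0)
                                          __read_only image2d_t tex, // arg 1 (unused in staging path)
                                          int width, int height,     // args 2, 3
                                          int pitch, float t)        // args 4, 5 (pitch is in pixels)
{
    int x = get_global_id(0), y = get_global_id(1);
    if (x >= width || y >= height) return;   // guard the rounded-up launch grid
    uchar v = (uchar)(128.0f + 127.0f * sin(x * 0.1f + t) * cos(y * 0.1f + t));
    dst[y * pitch + x] = ((uint)0xFF << 24) | ((uint)v << 16) | ((uint)v << 8) | v; // ARGB
}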
// OpenCL computation function for 1 or more GPUs
// Copies input data from pinned host buf to the device, runs kernel, copies output data back to pinned output host buf
//*****************************************************************************
double SobelFilterGPU(cl_uint* uiInputImage, cl_uint* uiOutputImage)
{
    // If this is a video application, fresh data in the pinned host buffer is needed beyond this point.
    // This line could be a sync point assuring that an asynchronous acquisition is complete.
    // That asynchronous acquisition would do a map, update and unmap for the pinned input buffer.
    //
    // Otherwise a synchronous acquisition call ('get next frame') could be placed here, but that would be less optimal.

    // For each device: copy fresh input H2D
    ciErrNum = CL_SUCCESS;
    for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++)
    {
        // Nonblocking Write of input image data from host to device
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[i], cmDevBufIn[i], CL_FALSE, 0, szAllocDevBytes[i],
                                         (void*)&uiInputImage[uiInHostPixOffsets[i]], 0, NULL, NULL);
    }

    // Sync all queues to host and start computation timer on host to get computation elapsed wall clock time
    // (Only for timing... can be omitted in a production app)
    for (cl_uint j = 0; j < GpuDevMngr->uiUsefulDevCt; j++)
    {
        ciErrNum |= clFinish(cqCommandQueue[j]);
    }
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // For each device: Process
    shrDeltaT(0);
    for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++)
    {
        // Determine configuration bytes, offsets and launch config, based on position of device region vertically in image
        if (GpuDevMngr->uiUsefulDevCt == 1)
        {
            // One device processes the whole image with no offset tricks needed
            szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]);
        }
        else if (i == 0)
        {
            // Multiple devices, top boundary tile:
            // Process whole device allocation, including extra row
            // No offset, but don't return the last row (dark/garbage row) D2H
            szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]);
        }
        else if (i < (GpuDevMngr->uiUsefulDevCt - 1))
        {
            // Multiple devices, middle tile:
            // Process whole device allocation, including extra 2 rows
            // Offset down by 1 row, and don't return the first and last rows (dark/garbage rows) D2H
            szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]);
        }
        else
        {
            // Multiple devices, last boundary tile:
            // Process whole device allocation, including extra row
            // Offset down by 1 row, and don't return the first row (dark/garbage row) D2H
            szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], (int)uiDevImageHeight[i]);
        }

        // Pass in dev image height (# of rows worked on) for this device
        ciErrNum |= clSetKernelArg(ckSobel[i], 5, sizeof(cl_uint), (void*)&uiDevImageHeight[i]);

        // Launch Sobel kernel(s) into queue(s) and push to device(s)
        ciErrNum |= clEnqueueNDRangeKernel(cqCommandQueue[i], ckSobel[i], 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);

        // Push to device(s) so subsequent clFinish in queue 0 doesn't block driver from issuing enqueue command for higher queues
        ciErrNum |= clFlush(cqCommandQueue[i]);
    }

    // Sync all queues to host and get elapsed wall clock time for computation in all queues
    // (Only for timing...
can be omitted in a production app) for (cl_uint j = 0; j < GpuDevMngr->uiUsefulDevCt; j++) { ciErrNum |= clFinish(cqCommandQueue[j]); } double dKernelTime = shrDeltaT(0); // Time from launch of first compute kernel to end of all compute kernels oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // For each device: copy fresh output D2H for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++) { // Determine configuration bytes and offsets based on position of device region vertically in image size_t szReturnBytes; cl_uint uiOutDevByteOffset; if (GpuDevMngr->uiUsefulDevCt == 1) { // One device processes the whole image with no offset tricks needed szReturnBytes = szBuffBytes; uiOutDevByteOffset = 0; } else if (i == 0) { // Multiple devices, top boundary tile: // Process whole device allocation, including extra row // No offset, but don't return the last row (dark/garbage row) D2H szReturnBytes = szAllocDevBytes[i] - (uiImageWidth * sizeof(cl_uint)); uiOutDevByteOffset = 0; } else if (i < (GpuDevMngr->uiUsefulDevCt - 1)) { // Multiple devices, middle tile: // Process whole device allocation, including extra 2 rows // Offset down by 1 row, and don't return the first and last rows (dark/garbage rows) D2H szReturnBytes = szAllocDevBytes[i] - ((uiImageWidth * sizeof(cl_uint)) * 2); uiOutDevByteOffset = uiImageWidth * sizeof(cl_uint); } else { // Multiple devices, last boundary tile: // Process whole device allocation, including extra row // Offset down by 1 row, and don't return the first row (dark/garbage row) D2H szReturnBytes = szAllocDevBytes[i] - (uiImageWidth * sizeof(cl_uint)); uiOutDevByteOffset = uiImageWidth * sizeof(cl_uint); } // Non Blocking Read of output image data from device to host ciErrNum |= clEnqueueReadBuffer(cqCommandQueue[i], cmDevBufOut[i], CL_FALSE, uiOutDevByteOffset, szReturnBytes, (void*)&uiOutputImage[uiOutHostPixOffsets[i]], 0, NULL, NULL); } // Finish all queues and check for errors before returning // The block here assures valid output data for subsequent host processing for (cl_uint j = 0; j < GpuDevMngr->uiUsefulDevCt; j++) { ciErrNum |= clFinish(cqCommandQueue[j]); } oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); return dKernelTime; }
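// How the row split relates to fLoadProportions: each device gets a share of
// the image height proportional to its measured capability, with the last
// device absorbing any rounding remainder. A sketch only (hypothetical helper;
// the sample's actual partitioning, including the shared apron rows described
// in the comments above, is done in main()):
void PartitionRowsSketch(unsigned int uiImageHeight, cl_uint uiDevCt,
                         const float* fLoadProportions, cl_uint* uiRowsPerDevice)
{
    cl_uint uiAssigned = 0;
    for (cl_uint i = 0; i < uiDevCt; i++)
    {
        uiRowsPerDevice[i] = (i == uiDevCt - 1) ?
            (uiImageHeight - uiAssigned) :                          // last device takes the remainder
            (cl_uint)(fLoadProportions[i] * (float)uiImageHeight);  // proportional share
        uiAssigned += uiRowsPerDevice[i];
    }
}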
// Main program
//*****************************************************************************
int main(int argc, char** argv)
{
    pArgc = &argc;
    pArgv = argv;

    shrQAStart(argc, argv);

    // Start logs
    cExecutableName = argv[0];
    shrSetLogFileName("oclSobelFilter.txt");
    shrLog("%s Starting (Using %s)...\n\n", argv[0], clSourcefile);

    // Get command line args for quick test or QA test, if provided
    bNoPrompt = (bool)shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
    bQATest = (bool)shrCheckCmdLineFlag(argc, (const char**)argv, "qatest");

    // Menu items
    if (!(bQATest))
    {
        ShowMenuItems();
    }

    // Find the path from the exe to the image file
    cPathAndName = shrFindFilePath(cImageFile, argv[0]);
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    shrLog("Image File\t = %s\nImage Dimensions = %u w x %u h x %u bpp\n\n",
           cPathAndName, uiImageWidth, uiImageHeight, sizeof(unsigned int)<<3);

    // Initialize OpenGL items (if not a No-GL QA test)
    shrLog("%sInitGL...\n\n", bQATest ? "Skipping " : "Calling ");
    if (!(bQATest))
    {
        InitGL(&argc, argv);
    }

    // Get the NVIDIA platform if available, otherwise use the default
    char cBuffer[1024];
    bool bNV = false;
    shrLog("Get Platform ID... ");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    ciErrNum = clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("%s\n\n", cBuffer);
    bNV = (strstr(cBuffer, "NVIDIA") != NULL);

    // Get the devices
    shrLog("Get Device Info...\n");
    cl_uint uiNumAllDevs = 0;
    GpuDevMngr = new DeviceManager(cpPlatform, &uiNumAllDevs, pCleanup);

    // Get the selected device if specified, otherwise examine the available ones and choose by perf
    cl_int iSelectedDevice = 0;
    if ((shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &iSelectedDevice)) || (uiNumAllDevs == 1))
    {
        // Use 1 selected device
        GpuDevMngr->uiUsefulDevCt = 1;
        iSelectedDevice = CLAMP((cl_uint)iSelectedDevice, 0, (uiNumAllDevs - 1));
        GpuDevMngr->uiUsefulDevs[0] = iSelectedDevice;
        GpuDevMngr->fLoadProportions[0] = 1.0f;
        shrLog(" Using 1 Selected Device for Sobel Filter Computation...\n");
    }
    else
    {
        // Use all available useful devices and compute the device load proportions
        ciErrNum = GpuDevMngr->GetDevLoadProportions(bNV);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        if (GpuDevMngr->uiUsefulDevCt == 1)
        {
            iSelectedDevice = GpuDevMngr->uiUsefulDevs[0];
        }
        shrLog(" Using %u Device(s) for Sobel Filter Computation\n", GpuDevMngr->uiUsefulDevCt);
    }

    // Create the context
    shrLog("\nclCreateContext...\n\n");
    cxGPUContext = clCreateContext(0, uiNumAllDevs, GpuDevMngr->cdDevices, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Allocate per-device OpenCL objects for the useful devices
    cqCommandQueue      = new cl_command_queue[GpuDevMngr->uiUsefulDevCt];
    ckSobel             = new cl_kernel[GpuDevMngr->uiUsefulDevCt];
    cmDevBufIn          = new cl_mem[GpuDevMngr->uiUsefulDevCt];
    cmDevBufOut         = new cl_mem[GpuDevMngr->uiUsefulDevCt];
    szAllocDevBytes     = new size_t[GpuDevMngr->uiUsefulDevCt];
    uiInHostPixOffsets  = new cl_uint[GpuDevMngr->uiUsefulDevCt];
    uiOutHostPixOffsets = new cl_uint[GpuDevMngr->uiUsefulDevCt];
    uiDevImageHeight    = new cl_uint[GpuDevMngr->uiUsefulDevCt];

    // Create command queue(s) for device(s)
    shrLog("clCreateCommandQueue...\n");
    for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++)
    {
        cqCommandQueue[i] = clCreateCommandQueue(cxGPUContext, GpuDevMngr->cdDevices[GpuDevMngr->uiUsefulDevs[i]], 0, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        shrLog(" CommandQueue %u, Device %u, Device Load Proportion = %.2f, ",
               i, GpuDevMngr->uiUsefulDevs[i], GpuDevMngr->fLoadProportions[i]);
        oclPrintDevName(LOGBOTH, GpuDevMngr->cdDevices[GpuDevMngr->uiUsefulDevs[i]]);
        shrLog("\n");
    }

    // Allocate pinned input and output host image buffers:
    // mem copy operations to/from pinned memory are much faster than with paged memory
    szBuffBytes = uiImageWidth * uiImageHeight * sizeof(unsigned int);
    cmPinnedBufIn = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmPinnedBufOut = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("\nclCreateBuffer (Input and Output Pinned Host buffers)...\n");

    // Get mapped pointers for writing to the pinned input and output host image buffers
    uiInput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufIn, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    uiOutput = (cl_uint*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedBufOut, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clEnqueueMapBuffer (Pointers to Input and Output pinned host buffers)...\n");

    // Load image data from file to the pinned input host buffer
    ciErrNum = shrLoadPPM4ub(cPathAndName, (unsigned char**)&uiInput, &uiImageWidth, &uiImageHeight);
    oclCheckErrorEX(ciErrNum, shrTRUE, pCleanup);
    shrLog("Load Input Image to Input pinned host buffer...\n");

    // Read the kernel in from file
    free(cPathAndName);
    cPathAndName = shrFindFilePath(clSourcefile, argv[0]);
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    cSourceCL = oclLoadProgSource(cPathAndName, "// My comment\n", &szKernelLength);
    oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
    shrLog("Load OpenCL Prog Source from File...\n");

    // Create the program object
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char**)&cSourceCL, &szKernelLength, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clCreateProgramWithSource...\n");

    // Build the program with the 'mad' optimization option
#ifdef MAC
    const char* flags = "-cl-fast-relaxed-math -DMAC";
#else
    const char* flags = "-cl-fast-relaxed-math";
#endif
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // On error: write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSobelFilter.ptx");
        Cleanup(EXIT_FAILURE);
    }
    shrLog("clBuildProgram...\n\n");

    // Determine the size/shape of the image portion for each device and create the device buffers
    // (a condensed sketch of this bookkeeping follows the end of main)
    unsigned uiSumHeight = 0;
    for (cl_uint i = 0; i < GpuDevMngr->uiUsefulDevCt; i++)
    {
        // Create kernel instance
        ckSobel[i] = clCreateKernel(cpProgram, "ckSobel", &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        shrLog("clCreateKernel (ckSobel), Device %u...\n", i);

        // Allocations and offsets for the portion of the image worked on by each device
        if (GpuDevMngr->uiUsefulDevCt == 1)
        {
            // One device processes the whole image with no offset
            uiDevImageHeight[i] = uiImageHeight;
            uiInHostPixOffsets[i] = 0;
            uiOutHostPixOffsets[i] = 0;
            szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint);
        }
        else if (i == 0)
        {
            // Multiple devices, top stripe zone including the topmost row of the image:
            // over-allocate on the device by 1 row,
            // set offset and size to copy the extra padding row H2D (below the bottom of the stripe),
            // and don't return the last row (dark/garbage row) D2H
            uiInHostPixOffsets[i] = 0;
            uiOutHostPixOffsets[i] = 0;
            uiDevImageHeight[i] = (cl_uint)(GpuDevMngr->fLoadProportions[GpuDevMngr->uiUsefulDevs[i]] * (float)uiImageHeight);   // height is proportional to dev perf
            uiSumHeight += uiDevImageHeight[i];
            uiDevImageHeight[i] += 1;
            szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint);
        }
        else if (i < (GpuDevMngr->uiUsefulDevCt - 1))
        {
            // Multiple devices, middle stripe zone:
            // over-allocate on the device by 2 rows,
            // set offset and size to copy the extra 2 padding rows H2D (above the top and below the bottom of the stripe),
            // and don't return the first and last rows (dark/garbage rows) D2H
            uiInHostPixOffsets[i] = (uiSumHeight - 1) * uiImageWidth;
            uiOutHostPixOffsets[i] = uiInHostPixOffsets[i] + uiImageWidth;
            uiDevImageHeight[i] = (cl_uint)(GpuDevMngr->fLoadProportions[GpuDevMngr->uiUsefulDevs[i]] * (float)uiImageHeight);   // height is proportional to dev perf
            uiSumHeight += uiDevImageHeight[i];
            uiDevImageHeight[i] += 2;
            szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint);
        }
        else
        {
            // Multiple devices, bottom boundary tile:
            // over-allocate on the device by 1 row,
            // set offset and size to copy the extra padding row H2D (above the top of the stripe),
            // and don't return the first row (dark/garbage row) D2H
            uiInHostPixOffsets[i] = (uiSumHeight - 1) * uiImageWidth;
            uiOutHostPixOffsets[i] = uiInHostPixOffsets[i] + uiImageWidth;
            uiDevImageHeight[i] = uiImageHeight - uiSumHeight;   // "leftover" rows
            uiSumHeight += uiDevImageHeight[i];
            uiDevImageHeight[i] += 1;
            szAllocDevBytes[i] = uiDevImageHeight[i] * uiImageWidth * sizeof(cl_uint);
        }
        shrLog("Image Height (rows) for Device %u = %u...\n", i, uiDevImageHeight[i]);

        // Create the device buffers in GMEM on each device
        cmDevBufIn[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szAllocDevBytes[i], NULL, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        cmDevBufOut[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szAllocDevBytes[i], NULL, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        shrLog("clCreateBuffer (Input and Output GMEM buffers, Device %u)...\n", i);

        // Set the common argument values for the Sobel kernel instance on each device
        // (arg 5, the per-device image height, is set per launch in SobelFilterGPU)
        int iLocalPixPitch = iBlockDimX + 2;
        ciErrNum  = clSetKernelArg(ckSobel[i], 0, sizeof(cl_mem), (void*)&cmDevBufIn[i]);
        ciErrNum |= clSetKernelArg(ckSobel[i], 1, sizeof(cl_mem), (void*)&cmDevBufOut[i]);
        ciErrNum |= clSetKernelArg(ckSobel[i], 2, (iLocalPixPitch * (iBlockDimY + 2) * sizeof(cl_uchar4)), NULL);
        ciErrNum |= clSetKernelArg(ckSobel[i], 3, sizeof(cl_int), (void*)&iLocalPixPitch);
        ciErrNum |= clSetKernelArg(ckSobel[i], 4, sizeof(cl_uint), (void*)&uiImageWidth);
        ciErrNum |= clSetKernelArg(ckSobel[i], 6, sizeof(cl_float), (void*)&fThresh);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        shrLog("clSetKernelArg (0-4 and 6), Device %u...\n\n", i);
    }

    // Set common global and local work sizes for the Sobel kernel
    szLocalWorkSize[0] = iBlockDimX;
    szLocalWorkSize[1] = iBlockDimY;
    szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], uiImageWidth);

    // Init running timers
    shrDeltaT(0);   // timer 0 used for computation timing
    shrDeltaT(1);   // timer 1 used for fps computation

    // Start the main GLUT rendering loop for processing and rendering,
    // or otherwise run the No-GL Q/A test sequence
    if (!(bQATest))
    {
        glutMainLoop();
    }
    else
    {
        TestNoGL();
    }

    Cleanup(EXIT_SUCCESS);
}
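// A condensed sketch of the stripe bookkeeping above, for the simpler case of an
// equal split across devices (illustrative function and parameter names, not part of
// the SDK; the proportional split by device performance is omitted). Each interior
// boundary gets one halo row so the 3x3 Sobel window has valid neighbors:
void ComputeEqualStripes(cl_uint devCt, cl_uint imgW, cl_uint imgH,
                         cl_uint* devRows, cl_uint* inPixOff, cl_uint* outPixOff)
{
    cl_uint sum = 0;   // rows assigned so far
    for (cl_uint i = 0; i < devCt; i++)
    {
        // The last device takes the leftover rows so the stripes cover the whole image
        cl_uint rows = (i == (devCt - 1)) ? (imgH - sum) : (imgH / devCt);
        if (devCt == 1)
        {
            devRows[i] = rows;                   // whole image, no halo rows
            inPixOff[i] = outPixOff[i] = 0;
        }
        else if (i == 0)
        {
            devRows[i] = rows + 1;               // +1 halo row below the stripe
            inPixOff[i] = outPixOff[i] = 0;
        }
        else if (i < (devCt - 1))
        {
            devRows[i] = rows + 2;               // +1 halo row above and below
            inPixOff[i]  = (sum - 1) * imgW;     // start the H2D copy one row early
            outPixOff[i] = inPixOff[i] + imgW;   // but return results from the stripe's own first row
        }
        else
        {
            devRows[i] = rows + 1;               // +1 halo row above
            inPixOff[i]  = (sum - 1) * imgW;
            outPixOff[i] = inPixOff[i] + imgW;
        }
        sum += rows;
    }
}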
// Main program
//*****************************************************************************
int main(int argc, const char **argv)
{
    cl_platform_id cpPlatform;                        // OpenCL platform
    cl_uint nDevice;                                  // OpenCL device count
    cl_device_id* cdDevices;                          // OpenCL device list
    cl_context cxGPUContext;                          // OpenCL context
    cl_command_queue cqCommandQueue[MAX_GPU_COUNT];   // OpenCL command queues
    cl_int ciErrNum;

    shrSetLogFileName("oclRadixSort.txt");
    shrLog("%s starting...\n\n", argv[0]);

    shrLog("clGetPlatformID...\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clGetDeviceIDs...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    cdDevices = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id));
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, nDevice, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("clCreateContext...\n");
    cxGPUContext = clCreateContext(0, nDevice, cdDevices, NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    shrLog("Create command queue...\n\n");
    int id_device;
    if (shrGetCmdLineArgumenti(argc, argv, "device", &id_device))
    {
        // Set up a command queue for the GPU specified on the command line
        cl_device_id cdDevice = cdDevices[id_device];
        cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        oclPrintDevInfo(LOGBOTH, cdDevice);
        nDevice = 1;
    }
    else
    {
        // Create command queues for all available devices
        for (cl_uint i = 0; i < nDevice; i++)
        {
            cqCommandQueue[i] = clCreateCommandQueue(cxGPUContext, cdDevices[i], 0, &ciErrNum);
            oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
        }
        for (cl_uint i = 0; i < nDevice; i++)
        {
            oclPrintDevInfo(LOGBOTH, cdDevices[i]);
        }
    }

    int ctaSize;   // work-group size
    if (!shrGetCmdLineArgumenti(argc, argv, "work-group-size", &ctaSize))
    {
        ctaSize = 128;
    }

    shrLog("Running Radix Sort on %d GPU(s) ...\n\n", nDevice);

    unsigned int numElements = 1048576;   // 128*128*128*2

    // Alloc and init some data on the host, then alloc and init GPU buffers
    unsigned int **h_keys       = (unsigned int**)malloc(nDevice * sizeof(unsigned int*));
    unsigned int **h_keysSorted = (unsigned int**)malloc(nDevice * sizeof(unsigned int*));
    cl_mem *d_keys              = (cl_mem*)malloc(nDevice * sizeof(cl_mem));
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        h_keys[iDevice]       = (unsigned int*)malloc(numElements * sizeof(unsigned int));
        h_keysSorted[iDevice] = (unsigned int*)malloc(numElements * sizeof(unsigned int));
        makeRandomUintVector(h_keys[iDevice], numElements, keybits);

        d_keys[iDevice] = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE,
                                         sizeof(unsigned int) * numElements, NULL, &ciErrNum);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0,
                                         sizeof(unsigned int) * numElements, h_keys[iDevice], 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    }

    // Instantiate RadixSort objects
    RadixSort **radixSort = (RadixSort**)malloc(nDevice * sizeof(RadixSort*));
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        radixSort[iDevice] = new RadixSort(cxGPUContext, cqCommandQueue[iDevice], numElements,
                                           argv[0], ctaSize, true);
    }

#ifdef GPU_PROFILING
    int numIterations = 30;
    for (int i = -1; i < numIterations; i++)   // i == -1 is an untimed warm-up iteration
    {
        if (i == 0)
        {
            // Warm-up done: sync all devices, then start the timer
            for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
            {
                clFinish(cqCommandQueue[iDevice]);
            }
            shrDeltaT(1);
        }
#endif
        for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
        {
            radixSort[iDevice]->sort(d_keys[iDevice], 0, numElements, keybits);
        }
#ifdef GPU_PROFILING
    }
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clFinish(cqCommandQueue[iDevice]);
    }
    double gpuTime = shrDeltaT(1) / (double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclRadixSort, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u elements, NumDevsUsed = %d, Workgroup = %d\n",
             (1.0e-6 * (double)(nDevice * numElements) / gpuTime), gpuTime, nDevice * numElements, nDevice, ctaSize);
#endif

    // Copy sorted keys to CPU
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clEnqueueReadBuffer(cqCommandQueue[iDevice], d_keys[iDevice], CL_TRUE, 0,
                            sizeof(unsigned int) * numElements, h_keysSorted[iDevice], 0, NULL, NULL);
    }

    // Check results
    bool passed = true;
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        passed &= verifySortUint(h_keysSorted[iDevice], NULL, h_keys[iDevice], numElements);
    }
    shrLog("\n%s\n\n", passed ? "PASSED" : "FAILED");

    // Cleanup allocs
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clReleaseMemObject(d_keys[iDevice]);
        free(h_keys[iDevice]);
        free(h_keysSorted[iDevice]);
        delete radixSort[iDevice];
    }
    free(radixSort);
    free(h_keys);
    free(h_keysSorted);

    // Remaining cleanup and exit
    free(cdDevices);
    for (cl_uint iDevice = 0; iDevice < nDevice; iDevice++)
    {
        clReleaseCommandQueue(cqCommandQueue[iDevice]);
    }
    clReleaseContext(cxGPUContext);
    shrEXIT(argc, argv);
}
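// verifySortUint() above is an SDK helper; a minimal stand-in for the ordering half
// of that check might look like this (a sketch only -- the real helper presumably
// also verifies the output is a permutation of the input keys):
static bool IsSortedUint(const unsigned int* keys, unsigned int n)
{
    for (unsigned int i = 1; i < n; i++)
    {
        if (keys[i - 1] > keys[i])
        {
            return false;   // found an out-of-order pair
        }
    }
    return true;
}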
// Main function
// *********************************************************************
int main(int argc, char** argv)
{
    shrQAStart(argc, argv);

    // Get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char **)argv, "noprompt");

    // Start logs
    cExecutableName = argv[0];
    shrSetLogFileName("oclMatVecMul.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    // Calculate matrix height given available GPU memory
    shrLog("Determining Matrix height from available GPU mem...\n");
    memsize_t memsize;
    getTargetDeviceGlobalMemSize(&memsize, argc, (const char **)argv);
    height = memsize / width / 16;
    if (height > MAX_HEIGHT)
        height = MAX_HEIGHT;
    shrLog(" Matrix width\t= %u\n Matrix height\t= %u\n\n", width, height);

    // Allocate and initialize host arrays
    shrLog("Allocate and Init Host Mem...\n\n");
    unsigned int size = width * height;
    unsigned int mem_size_M = size * sizeof(float);
    M = (float*)malloc(mem_size_M);
    unsigned int mem_size_V = width * sizeof(float);
    V = (float*)malloc(mem_size_V);
    unsigned int mem_size_W = height * sizeof(float);
    W = (float*)malloc(mem_size_W);
    shrFillArray(M, size);
    shrFillArray(V, width);
    Golden = (float*)malloc(mem_size_W);
    MatVecMulHost(M, V, width, height, Golden);

    // Get the NVIDIA platform
    shrLog("Get the Platform ID...\n\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Get all the devices
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cdDevices = (cl_device_id*)malloc(uiNumDevices * sizeof(cl_device_id));
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Set the target device and query the number of compute units on it
    shrLog(" # of Devices Available = %u\n", uiNumDevices);
    if (shrGetCmdLineArgumentu(argc, (const char **)argv, "device", &targetDevice) == shrTRUE)
    {
        targetDevice = CLAMP(targetDevice, 0, (uiNumDevices - 1));
    }
    shrLog(" Using Device %u: ", targetDevice);
    oclPrintDevName(LOGBOTH, cdDevices[targetDevice]);
    cl_uint num_compute_units;
    clGetDeviceInfo(cdDevices[targetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_compute_units), &num_compute_units, NULL);
    shrLog("\n # of Compute Units = %u\n\n", num_compute_units);

    // Create the context
    shrLog("clCreateContext...\n");
    cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[targetDevice], NULL, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create a command-queue (with profiling enabled, for the timing block below)
    shrLog("clCreateCommandQueue...\n");
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[targetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    shrLog("clCreateBuffer (M, V and W in device global memory, mem_size_M = %u)...\n", mem_size_M);
    cmM = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_M, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmV = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_size_V, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmW = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, mem_size_W, NULL, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Read the OpenCL kernel in from source file
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
    oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);

    // Create the program
    shrLog("clCreateProgramWithSource...\n");
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);

    // Build the program
    shrLog("clBuildProgram...\n");
    ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[targetDevice], "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // Write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatVecMul.ptx");
        shrQAFinish(argc, (const char **)argv, QA_FAILED);
        Cleanup(EXIT_FAILURE);
    }

    // --------------------------------------------------------
    // Core sequence... copy input data to GPU, compute, copy results back

    // Asynchronous write of data to GPU device
    shrLog("clEnqueueWriteBuffer (M and V)...\n\n");
    ciErrNum  = clEnqueueWriteBuffer(cqCommandQueue, cmM, CL_FALSE, 0, mem_size_M, M, 0, NULL, NULL);
    ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmV, CL_FALSE, 0, mem_size_V, V, 0, NULL, NULL);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Kernels
    const char* kernels[] = {
        "MatVecMulUncoalesced0",
        "MatVecMulUncoalesced1",
        "MatVecMulCoalesced0",
        "MatVecMulCoalesced1",
        "MatVecMulCoalesced2",
        "MatVecMulCoalesced3" };

    for (int k = 0; k < (int)(sizeof(kernels)/sizeof(char*)); ++k)
    {
        shrLog("Running with Kernel %s...\n\n", kernels[k]);

        // Clear result
        shrLog(" Clear result with clEnqueueWriteBuffer (W)...\n");
        memset(W, 0, mem_size_W);
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmW, CL_FALSE, 0, mem_size_W, W, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Create the kernel
        shrLog(" clCreateKernel...\n");
        if (ckKernel)
        {
            clReleaseKernel(ckKernel);
            ckKernel = 0;
        }
        ckKernel = clCreateKernel(cpProgram, kernels[k], &ciErrNum);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Set and log Global and Local work size dimensions
        szLocalWorkSize = 256;
        if (k == 0)
        {
            // Rounded up to the nearest multiple of the LocalWorkSize
            szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, height);
        }
        else
        {
            // Some experiments should be done here to determine the best global work size
            // for a given device; we assume here that we can run 2 work-groups per compute unit
            szGlobalWorkSize = 2 * num_compute_units * szLocalWorkSize;
        }
        shrLog(" Global Work Size \t\t= %u\n Local Work Size \t\t= %u\n # of Work Groups \t\t= %u\n",
               szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize / szLocalWorkSize));

        // Set the Argument values
        shrLog(" clSetKernelArg...\n\n");
        int n = 0;
        ciErrNum  = clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmM);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmV);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&width);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_int), (void*)&height);
        ciErrNum |= clSetKernelArg(ckKernel, n++, sizeof(cl_mem), (void*)&cmW);
        if (k > 1)
        {
            // The coalesced kernels take an extra local-memory scratch buffer argument
            ciErrNum |= clSetKernelArg(ckKernel, n++, szLocalWorkSize * sizeof(float), 0);
        }
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Launch kernel
        shrLog(" clEnqueueNDRangeKernel (%s)...\n", kernels[k]);
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Read back results and check accumulated errors
        shrLog(" clEnqueueReadBuffer (W)...\n");
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmW, CL_TRUE, 0, mem_size_W, W, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

#ifdef GPU_PROFILING
        // Execution time
        ciErrNum = clWaitForEvents(1, &ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        cl_ulong start, end;
        ciErrNum  = clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
        ciErrNum |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        double dSeconds = 1.0e-9 * (double)(end - start);
        shrLog(" Kernel execution time: %.5f s\n\n", dSeconds);
#endif

        // Compare results against the golden host reference and report errors and pass/fail
        shrLog(" Comparing against Host/C++ computation...\n\n");
        shrBOOL res = shrCompareL2fe(Golden, W, height, 1e-6f);
        shrLog(" GPU Result %s CPU Result within allowable tolerance\n\n", (res == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
        bPassFlag &= (res == shrTRUE);

        // Release event
        ciErrNum = clReleaseEvent(ceEvent);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        ceEvent = 0;
    }

    // Master status Pass/Fail (all tests)
    shrQAFinish(argc, (const char **)argv, (bPassFlag ? QA_PASSED : QA_FAILED));

    // Cleanup and leave
    Cleanup(EXIT_SUCCESS);
}
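// The GPU_PROFILING block above reads the COMMAND_START/END timestamps off the
// kernel's event. The same pattern, wrapped as a helper using only standard OpenCL
// 1.x calls (the helper name is illustrative; the queue must be created with
// CL_QUEUE_PROFILING_ENABLE, as it is above, or the queries return no useful data):
static double EventElapsedSeconds(cl_event ev)
{
    cl_ulong start = 0, end = 0;
    clWaitForEvents(1, &ev);   // ensure the command has completed before reading timestamps
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &end,   NULL);
    return 1.0e-9 * (double)(end - start);   // device timestamps are in nanoseconds
}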
// Initialize OpenCL context, program and kernels
//*****************************************************************************
int InitOpenCLContext()
{
    // Start logs
    shrSetLogFileName("oclVolumeRender.txt");

    // Get command line arg for quick test, if provided; process command line arguments

    // First initialize the OpenGL context, so we can properly set up the OpenGL / OpenCL interop.
    // glewInit();
    // GLboolean bGLEW = glewIsSupported("GL_VERSION_2_0 GL_ARB_pixel_buffer_object");
    // oclCheckErrorEX(bGLEW, shrTRUE, pCleanup);
    g_glInterop = true;

    // Create OpenCL context, get device info, select device,
    // select options for image/texture and CL-GL interop
    createCLContext();

    // Print device info
    clGetDeviceInfo(cdDevices[uiDeviceUsed], CL_DEVICE_IMAGE_SUPPORT, sizeof(g_bImageSupport), &g_bImageSupport, NULL);
    // shrLog("%s...\n\n", g_bImageSupport ? "Using Image (Texture)" : "No Image (Texture) Support");
    // shrLog("Detailed Device info:\n\n");
    oclPrintDevInfo(LOGBOTH, cdDevices[uiDeviceUsed]);

    // Create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiDeviceUsed], 0, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Program Setup
    size_t program_length;
    cPathAndName = shrFindFilePath("Transform.cl", ".");
    oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &program_length);
    oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);

    // Create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &program_length, &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Build the program (note: buildOpts is currently unused; the build below runs with default options)
    std::string buildOpts = "-cl-single-precision-constant";
    // buildOpts += g_bImageSupport ? " -DIMAGE_SUPPORT" : "";
    // ciErrNum = clBuildProgram(cpProgram, 1, &cdDevices[uiDeviceUsed], "-cl-fast-relaxed-math", NULL, NULL);
    ciErrNum = clBuildProgram(cpProgram, 1, &cdDevices[uiDeviceUsed], NULL, NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // Write out standard error, Build Log and PTX, then cleanup and return error
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclVolumeRender.ptx");
        Cleanup(EXIT_FAILURE);
    }

    // Create the kernels
    ScalseKernel = clCreateKernel(cpProgram, "d_render", &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    TransformKernel = clCreateKernel(cpProgram, "angle", &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    LongToShortKernel = clCreateKernel(cpProgram, "transfer", &ciErrNum);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    return TRUE;
}
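// oclLogBuildInfo() above is an SDK convenience wrapper. Fetching the build log
// directly is a standard two-call pattern with clGetProgramBuildInfo (a minimal
// sketch; requires <stdio.h> and <stdlib.h>):
static void PrintBuildLog(cl_program prog, cl_device_id dev)
{
    size_t logSize = 0;
    clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);   // query the log size first
    char* log = (char*)malloc(logSize + 1);
    clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
    log[logSize] = '\0';
    fprintf(stderr, "Build log:\n%s\n", log);
    free(log);
}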