//-------------------------------------------------------------------------------------- // Name: Run() // Desc: Create the framework, initialize the application, and render frames. //-------------------------------------------------------------------------------------- BOOL CFrmAppContainer::Run(cl_device_type deviceType) { // Use the executable's directory so that relative media paths work correctly { char processDir[PATH_MAX]; char currentDir[PATH_MAX]; struct stat dirInfo; pid_t pid = getpid(); sprintf(processDir, "/proc/%d/exe", (int)pid); if (readlink(processDir, currentDir, PATH_MAX) != -1) { strcpy(currentDir, dirname(currentDir)); chdir(currentDir); } else { printf("Error finding executable directory.\n"); return FALSE; } } // Create the Application m_pApplication = FrmCreateComputeApplicationInstance(); m_pApplication->m_bRunTests = TRUE; if( NULL == m_pApplication ) return FALSE; if( FALSE == m_pApplication->CreateOpenCLContext( deviceType ) ) { return FALSE; } if( FALSE == m_pApplication->Initialize() ) { printf( "Application failed initialization!\nPlease debug accordingly."); return FALSE; } // Run the computation BOOL result = m_pApplication->Compute(); if ( m_pApplication->RunTests() ) { char msg[256]; FrmSprintf( msg, sizeof(msg), "RunTests: %s\n", result ? "PASSED" : "FAILED" ); FrmLogMessage( msg ); } // Display the message log const CHAR* strErrorMessage = FrmGetMessageLog(); printf("%s", strErrorMessage); return TRUE; }
//-------------------------------------------------------------------------------------- // Name: UpdateNormalsCL() // Desc: Update the normal/tangent/binormal using OpenCL //-------------------------------------------------------------------------------------- BOOL CClothSimCL::UpdateNormalsCL() { // Acquire from OpenGL VBOs as OpenCL memory objects cl_int errNum = 0; cl_mem memObjs[7] = { m_vboMem[CUR_POSITION], m_vboMem[NORMAL], m_vboMem[TANGENT], m_vboMem[BITANGENT], m_vboMem[BACKNORMAL], m_vboMem[BACKTANGENT], m_vboMem[BACKBITANGENT] }; // Set the kernel arguments for the UpdateNormals kernel cl_kernel kernel = m_kernels[UPDATE_NORMALS]; errNum |= clSetKernelArg( kernel, 0, sizeof(cl_mem), &m_vboMem[CUR_POSITION] ); errNum |= clSetKernelArg( kernel, 1, sizeof(cl_mem), &m_vboMem[NORMAL] ); errNum |= clSetKernelArg( kernel, 2, sizeof(cl_mem), &m_vboMem[TANGENT] ); errNum |= clSetKernelArg( kernel, 3, sizeof(cl_mem), &m_vboMem[BITANGENT] ); errNum |= clSetKernelArg( kernel, 4, sizeof(cl_mem), &m_vboMem[BACKNORMAL] ); errNum |= clSetKernelArg( kernel, 5, sizeof(cl_mem), &m_vboMem[BACKTANGENT] ); errNum |= clSetKernelArg( kernel, 6, sizeof(cl_mem), &m_vboMem[BACKBITANGENT] ); if( errNum != CL_SUCCESS ) { char str[256]; FrmSprintf( str, sizeof(str), "Error setting kernel arguments (%d).", errNum ); FrmLogMessage( str ); return FALSE; } // Finally queue the kernel for execution size_t globalWorkSize[2] = { CLOTH_POINTS_WIDTH - 1, CLOTH_POINTS_HEIGHT - 1 }; errNum = clEnqueueNDRangeKernel( m_commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL ); if( errNum != CL_SUCCESS ) { FrmLogMessage( "Error queuing kernel for execution." ); return FALSE; } return TRUE; }
//-------------------------------------------------------------------------------------- // Name: VerletIntegrationCL() // Desc: Perform verlet integration using OpenCL //-------------------------------------------------------------------------------------- BOOL CClothSimCL::VerletIntegrationCL( float fltElapsed ) { cl_int errNum = 0; // Set the kernel arguments for the VerletIntegration kernel FRMVECTOR4 VelDamping = FRMVECTOR4( m_VelDamping, 0.0f ); FRMVECTOR4 CurrentWind = FRMVECTOR4( m_CurrentWind, 0.0f ); FRMVECTOR4 WindDir = FRMVECTOR4( m_WindDir, 0.0f ); FRMVECTOR4 Acceleration = FRMVECTOR4( m_Gravity, 0.0f ); Acceleration *= 0.25f; cl_float ElapsedSquared = fltElapsed * fltElapsed; cl_kernel kernel = m_kernels[VERLET_INTEGRATION]; errNum |= clSetKernelArg( kernel, 0, sizeof(cl_float), &ElapsedSquared ); errNum |= clSetKernelArg( kernel, 1, sizeof(cl_float4), &Acceleration ); errNum |= clSetKernelArg( kernel, 2, sizeof(cl_float4), &VelDamping ); errNum |= clSetKernelArg( kernel, 3, sizeof(cl_float4), &WindDir ); errNum |= clSetKernelArg( kernel, 4, sizeof(cl_float4), &CurrentWind ); errNum |= clSetKernelArg( kernel, 5, sizeof(cl_mem), &m_vboMem[CUR_POSITION] ); errNum |= clSetKernelArg( kernel, 6, sizeof(cl_mem), &m_vboMem[PREV_POSITION] ); errNum |= clSetKernelArg( kernel, 7, sizeof(cl_mem), &m_vboMem[NORMAL] ); if( errNum != CL_SUCCESS ) { char str[256]; FrmSprintf( str, sizeof(str), "Error setting kernel arguments (%d).", errNum ); FrmLogMessage( str ); return FALSE; } // Finally queue the kernel for execution size_t globalWorkSize[1] = { m_uiNumVerts - CLOTH_POINTS_WIDTH }; errNum = clEnqueueNDRangeKernel( m_commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL ); if( errNum != CL_SUCCESS ) { FrmLogMessage( "Error queuing kernel for execution." ); return FALSE; } return TRUE; }
bool SimpleConvolution::Initialize( cl_context context, cl_device_id device, cl_command_queue commandQueue ){ m_context = context; m_device = device; m_commandQueue = commandQueue; cl_program m_program; cl_int errNum = 0; if( false == FrmGetOrBuildComputeProgramFromFile(&simpleConvolution, &m_program, m_context, &m_device, 1) ){ return false; } m_kernel = clCreateKernel(m_program, "simpleConvolution", &errNum); if(errNum != CL_SUCCESS){ char str[256]; FrmSprintf( str, sizeof(str), "ERROR: Failed to create kernel simpleConvolution.\n" ); FrmLogMessage( str ); return false; } clReleaseProgram(m_program); return true; }
//-------------------------------------------------------------------------------------- // Name: InitKernels() // Desc: Initialize the kernels //-------------------------------------------------------------------------------------- BOOL CClothSimCL::InitKernels( cl_context context, cl_device_id device ) { cl_int errNum; // Create the command queue m_commandQueue = clCreateCommandQueue( context, device, CL_QUEUE_PROFILING_ENABLE, &errNum ); if ( errNum != CL_SUCCESS ) { FrmLogMessage( "Failed to create command queue" ); return FALSE; } if( FALSE == FrmBuildComputeProgramFromFile( "Samples/Kernels/ClothSimCLGLES.cl", &m_program, context, &device, 1, "-cl-fast-relaxed-math" ) ) { return FALSE; } const CHAR* pKernelNames[] = { "SatisfyConstraints", "VerletIntegration", "UpdateNormals" }; for( INT32 i = 0; i < NUM_KERNELS; i++ ) { m_kernels[i] = clCreateKernel( m_program, pKernelNames[i] , &errNum ); if ( errNum != CL_SUCCESS ) { CHAR str[256]; FrmSprintf( str, sizeof(str), "ERROR: Failed to create kernel '%s'.\n", pKernelNames[i] ); FrmLogMessage( str ); return FALSE; } } return TRUE; }
//-------------------------------------------------------------------------------------- // Name: SatisfyConstraintsCL() // Desc: Perform satisfy spring constraint using OpeNCL //-------------------------------------------------------------------------------------- BOOL CClothSimCL::SatisfyConstraintsCL() { cl_int errNum = 0; // Copy the positions to another buffer for double buffering errNum = clEnqueueCopyBuffer( m_commandQueue, m_vboMem[CUR_POSITION], m_vertsCopyMem, 0, 0, sizeof(cl_float4) * m_uiNumVerts, 0, NULL, NULL); if( errNum != CL_SUCCESS ) { FrmLogMessage( "Error copying positions VBO.\n" ); return FALSE; } // Set kernel arguments cl_kernel kernel = m_kernels[SATISFY_CONSTRAINTS]; errNum |= clSetKernelArg( kernel, 2, sizeof(cl_mem), &m_baseDistMem ); errNum |= clSetKernelArg( kernel, 3, sizeof(cl_mem), &m_baseDistNeighborIndexMem ); errNum |= clSetKernelArg( kernel, 4, sizeof(cl_mem), &m_baseDistStartIndexMem ); errNum |= clSetKernelArg( kernel, 5, sizeof(cl_mem), &m_baseDistEndIndexMem ); if( errNum != CL_SUCCESS ) { char str[256]; FrmSprintf( str, sizeof(str), "Error setting kernel arguments (%d).", errNum ); FrmLogMessage( str ); return FALSE; } // Number of NxN grid chunks to perform separately, largest chunk should be 16x16 int numChunks = (CLOTH_POINTS_WIDTH * CLOTH_POINTS_HEIGHT) / (16 * 16); size_t globalWorkSize[1] = { (CLOTH_POINTS_WIDTH * CLOTH_POINTS_HEIGHT) / numChunks }; char str[1024]; for( UINT32 i = 0; i < m_ConstraintIters; i++ ) { bool even = ((i % 2) == 0); // Swap input/output errNum |= clSetKernelArg( kernel, 0, sizeof(cl_mem), even ? (&m_vboMem[CUR_POSITION]) : (&m_vertsCopyMem) ); errNum |= clSetKernelArg( kernel, 1, sizeof(cl_mem), even ? (&m_vertsCopyMem) : (&m_vboMem[CUR_POSITION]) ); if( errNum != CL_SUCCESS ) { FrmSprintf( str, sizeof(str), "Error setting kernel arguments (%d).", errNum ); FrmLogMessage( str ); return FALSE; } for( int chunk = 0; chunk < numChunks; chunk++ ) { size_t globalWorkOffset[1] = { chunk * globalWorkSize[0] }; errNum = clEnqueueNDRangeKernel( m_commandQueue, kernel, 1, globalWorkOffset, globalWorkSize, NULL, 0, NULL, NULL ); if( errNum != CL_SUCCESS ) { FrmLogMessage( "Error queuing kernel for execution." ); return FALSE; } } } return TRUE; }
//-------------------------------------------------------------------------------------- // Name: Compute() // Desc: //-------------------------------------------------------------------------------------- BOOL CSample::Compute() { m_Timer.Reset(); m_Timer.Start(); char str[256]; // Set the kernel arguments cl_int errNum = 0; errNum |= clSetKernelArg( m_kernel, 0, sizeof(cl_mem), &m_srcA ); errNum |= clSetKernelArg( m_kernel, 1, sizeof(cl_mem), &m_srcB ); errNum |= clSetKernelArg( m_kernel, 2, sizeof(cl_mem), &m_result ); if( errNum != CL_SUCCESS ) { FrmLogMessage( "Error setting kernel arguments" ); return FALSE; } size_t globalWorkSize[1] = { m_nNumVectors }; size_t localWorkSize[1] = { 1 }; cl_event kernel_event; cl_ulong t_queued=0, t_submit=0, t_start=0, t_end=0; // Queue the kernel for execution errNum = clEnqueueNDRangeKernel( m_commandQueue, m_kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &kernel_event ); if( errNum != CL_SUCCESS ) { FrmLogMessage( "Error queueing kernel for execution." ); return FALSE; } clWaitForEvents(1 , &kernel_event); // Query timestamp for kernel profiling // Queued time is when the command is queued to host. // Submit time is when the command is submitted from host to device. // Start time is when the command starts the execution. // End time is when the command finishes the execution. // The delta between start and end, marks the total elapsed time to execute a kernel in device. errNum = clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &t_queued, NULL); if( errNum != CL_SUCCESS ) FrmLogMessage( "Error getting queued timestamp." ); errNum = clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &t_submit, NULL); if( errNum != CL_SUCCESS ) FrmLogMessage( "Error getting submit timestamp." ); errNum = clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &t_start, NULL); if( errNum != CL_SUCCESS ) FrmLogMessage( "Error getting start timestamp." ); errNum = clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t_end, NULL); if( errNum != CL_SUCCESS ) FrmLogMessage( "Error getting end timestamp." ); FrmLogMessage("Kernel event profiling....(nano sec)\n"); FrmSprintf(str, sizeof(str), " -> Queued time: %lu\n", t_queued); FrmLogMessage( str ); FrmSprintf(str, sizeof(str), " -> Submit time: %lu\n", t_submit); FrmLogMessage( str ); FrmSprintf(str, sizeof(str), " -> Start time: %lu\n", t_start); FrmLogMessage( str ); FrmSprintf(str, sizeof(str), " -> End time: %lu\n", t_end); FrmLogMessage( str ); clReleaseEvent(kernel_event); // Read the result back to host memory FRMVECTOR4* pResult; pResult = (FRMVECTOR4*) clEnqueueMapBuffer( m_commandQueue, m_result, CL_TRUE, CL_MAP_READ, 0, sizeof(FRMVECTOR4) * m_nNumVectors, 0, NULL, NULL, &errNum ); if( errNum != CL_SUCCESS ) { FrmLogMessage( "Error enqueuing buffer map." ); return FALSE; } m_Timer.Stop(); FrmSprintf( str, sizeof(str), "Results: '%d' vector additions in '%.6f' seconds.\n", m_nNumVectors, m_Timer.GetTime() ); FrmLogMessage( str ); // Test results again CPU reference BOOL result = TRUE; if ( RunTests() ) { const FLOAT32 epsilon = 0.000001f; for( size_t i = 0; i < m_nNumVectors; i++ ) { for ( size_t j = 0; j < 4; j++ ) { FLOAT32 refVal = m_pRefResults[ i ].v[ j ]; FLOAT32 val = pResult[ i ].v[ j ]; if( FrmAbs( refVal - val ) > epsilon ) { FrmSprintf( str, sizeof(str), "Reference test failure, ref = (%f), result = (%f) Diff = (%f).\n", refVal, val, FrmAbs(refVal - val)); FrmLogMessage( str ); result = FALSE; } } } } // Unmap buffer errNum = clEnqueueUnmapMemObject( m_commandQueue, m_result, pResult, 0, NULL, NULL ); if( errNum != CL_SUCCESS ) { FrmLogMessage( "ERROR: Unmapping result buffer." ); return FALSE; } return result; }
//-------------------------------------------------------------------------------------- // Name: CreateOpenCLContext() // Desc: Create the OpenCL context based on the command-line options for creation //-------------------------------------------------------------------------------------- BOOL CFrmComputeApplication::CreateOpenCLContext(cl_device_type deviceType) { cl_int errNum; cl_uint numPlatforms = 0; cl_platform_id platformId; // Get the first platform ID errNum = clGetPlatformIDs( 1, &platformId, &numPlatforms ); if (errNum != CL_SUCCESS || numPlatforms <= 0) { FrmLogMessage("No OpenCL platforms found."); return FALSE; } m_platform = platformId; // Get the number of devices for the requested device type (CPU, GPU, ALL) cl_uint numDevices = 0; errNum = clGetDeviceIDs( platformId, deviceType, 0, NULL, &numDevices ); if (errNum != CL_SUCCESS || numDevices <= 0) { FrmLogMessage("No matching OpenCL devices found."); return FALSE; } char platformInfo[1024]; char logMessage[2048]; errNum = clGetPlatformInfo( platformId, CL_PLATFORM_VENDOR, sizeof(platformInfo), platformInfo, NULL ); if (errNum != CL_SUCCESS) { FrmLogMessage("ERROR: getting platform info."); return FALSE; } FrmSprintf( logMessage, sizeof(logMessage), "OpenCL Platform: %s\n", platformInfo ); FrmLogMessage( logMessage ); // Get the devices m_devices = new cl_device_id[numDevices]; m_deviceCount = numDevices; errNum = clGetDeviceIDs( platformId, deviceType, numDevices, m_devices, NULL ); if (errNum != CL_SUCCESS) { FrmLogMessage("Erorr getting OpenCL device(s)."); return FALSE; } switch (deviceType) { case CL_DEVICE_TYPE_GPU: FrmLogMessage("Selected device: GPU\n"); break; case CL_DEVICE_TYPE_CPU: FrmLogMessage("Selected device: CPU\n"); break; case CL_DEVICE_TYPE_DEFAULT: default: FrmLogMessage("Selected device: DEFAULT\n"); break; } for (int i = 0; i < m_deviceCount; i++) { char deviceInfo[1024]; errNum = clGetDeviceInfo( m_devices[i], CL_DEVICE_NAME, sizeof(deviceInfo), deviceInfo, NULL ); if (errNum == CL_SUCCESS ) { FrmSprintf( logMessage, sizeof(logMessage), "OpenCL Device Name (%d) : %s\n", i , deviceInfo ); FrmLogMessage( logMessage ); } } // Finally, create the context cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformId, 0 }; m_context = clCreateContext( contextProperties, m_deviceCount, m_devices, NULL, NULL, &errNum ); if (errNum != CL_SUCCESS) { FrmLogMessage("Could not create OpenCL context."); return FALSE; } return TRUE; }
//-------------------------------------------------------------------------------------- // Name: Select2DWGSize() // Desc: Select 2D local workgroup size based on user input. // Final selected dimension is AxB where minWorkGroupSize <= AxB <= maxWorkGroupSize //.......The dimension of A and B is based on m_WG2DWHRatio where A/B = m_WG2DWHRatio / (100-m_WG2DWHRatio) //-------------------------------------------------------------------------------------- void CFrmComputeApplication::Select2DWGSize(size_t maxWorkGroupSize, size_t defGlobalWG_X, size_t defGlobalWG_Y, size_t minWorkGroupSize) { char msg[512]; m_globalWorkSize[0] = defGlobalWG_X; m_globalWorkSize[1] = defGlobalWG_Y; // m_WGSize is the percentage of maxWorkGroupSize float selectedWgSize = (float)m_WGSize / 100.0f * (float)maxWorkGroupSize; if ((int)selectedWgSize < minWorkGroupSize) selectedWgSize = (float)minWorkGroupSize; FrmSprintf( msg, sizeof(msg), "Max WG Size: %d. Selected WG Size: %d\n", maxWorkGroupSize, (int)selectedWgSize); FrmLogMessage( msg ); FrmSprintf( msg, sizeof(msg), "Global WG Size: %dx%d\n", m_globalWorkSize[0], m_globalWorkSize[1]); FrmLogMessage( msg ); // if ratio = 0, leave local size calculation to the driver if (m_WG2DWHRatio == 0) return; // Ratio is out of scale 100 // Adjust local dimension based on width:height ratio float whRatio = (float)m_WG2DWHRatio / (float)(100-m_WG2DWHRatio); m_localWorkSize[0] = (size_t) FrmSqrt( (FLOAT32) selectedWgSize * whRatio); // make sure the size is at lease 1 if (m_localWorkSize[0] < 1) m_localWorkSize[0] = 1; // 50 means 1:1 if (m_WG2DWHRatio == 50) { m_localWorkSize[1] = m_localWorkSize[0]; } else { m_localWorkSize[1] = (size_t) selectedWgSize / m_localWorkSize[0]; if (m_localWorkSize[1] < 1) m_localWorkSize[1] = 1; } // Compute the next global size that is a multiple of the local size size_t remndr = defGlobalWG_X % m_localWorkSize[0]; if( remndr ) { m_globalWorkSize[0] = defGlobalWG_X + m_localWorkSize[0] - remndr; } remndr = defGlobalWG_Y % m_localWorkSize[1]; if( remndr ) { m_globalWorkSize[1] = defGlobalWG_Y + m_localWorkSize[1] - remndr; } for (int i = 0; i < 2; i ++) { remndr = (m_globalWorkSize[i] % m_localWorkSize[i])? 1:0; m_numWorkgroup[i] = m_globalWorkSize[i] / m_localWorkSize[i] + remndr; } FrmSprintf( msg, sizeof(msg), "Local WG Size: %dx%d\n", m_localWorkSize[0], m_localWorkSize[1]); FrmLogMessage( msg ); FrmSprintf( msg, sizeof(msg), "Num of local WG: (%d x %d)\n", m_numWorkgroup[0], m_numWorkgroup[1] ); FrmLogMessage( msg ); }