////////////////////////////////////////////////////////////////////////////////
// Copy a body array from an OpenCL device buffer back to host memory.
//
// cqCommandQueue - queue to enqueue the (blocking) read on
// host           - destination array, numBodies*4 floats
// device         - source device buffer
// pboCL          - optional GL-shared buffer; if non-NULL it is acquired
//                  before and released after the transfer
// numBodies      - number of 4-component bodies to copy
// bDouble        - if true the device buffer holds doubles, narrowed to
//                  float on the host via a staging buffer
////////////////////////////////////////////////////////////////////////////////
void CopyArrayFromDevice(cl_command_queue cqCommandQueue, float *host, cl_mem device, cl_mem pboCL, int numBodies, bool bDouble)
{
    cl_int ciErrNum;
    unsigned int size;

    if (pboCL) {
        // Buffer is shared with OpenGL: OpenCL must own it during the read.
        ciErrNum = clEnqueueAcquireGLObjects(cqCommandQueue, 1, &pboCL, 0, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);
    }

    if (bDouble) {
        // Stage through a temporary double buffer, then narrow to float.
        size = numBodies * 4 * sizeof(double);
        double *dHost = (double *)malloc(size);
        oclCheckError(dHost != NULL, shrTRUE);   // FIX: malloc result was unchecked
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, device, CL_TRUE, 0, size, dHost, 0, NULL, NULL);
        for (int i = 0; i < numBodies * 4; i++) {
            host[i] = (float)(dHost[i]);
        }
        free(dHost);
    } else {
        size = numBodies * 4 * sizeof(float);
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, device, CL_TRUE, 0, size, host, 0, NULL, NULL);
    }
    oclCheckError(ciErrNum, CL_SUCCESS);

    if (pboCL) {
        ciErrNum = clEnqueueReleaseGLObjects(cqCommandQueue, 1, &pboCL, 0, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Exclusive scan launcher for "short" arrays (single local-scan pass).
// Validates the request, then dispatches to scanExclusiveLocal1.
// Returns the local work size used by the launch.
////////////////////////////////////////////////////////////////////////////////
extern "C" size_t scanExclusiveShort(
    cl_command_queue cqCommandQueue,
    cl_mem d_Dst,
    cl_mem d_Src,
    uint batchSize,
    uint arrayLength
)
{
    // arrayLength must be a power of two
    uint log2L;
    const uint remainder = factorRadix2(log2L, arrayLength);
    oclCheckError(remainder == 1, shrTRUE);

    // ...within the supported "short" size range
    oclCheckError((arrayLength >= MIN_SHORT_ARRAY_SIZE) && (arrayLength <= MAX_SHORT_ARRAY_SIZE), shrTRUE);

    // ...within the total batch element limit
    oclCheckError((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS, shrTRUE);

    // ...and fully packing each work-group (4 elements per work-item)
    oclCheckError((batchSize * arrayLength) % (4 * WORKGROUP_SIZE) == 0, shrTRUE);

    return scanExclusiveLocal1(cqCommandQueue, d_Dst, d_Src, batchSize, arrayLength);
}
////////////////////////////////////////////////////////////////////////////////
// Enqueue the column-convolution kernel over an imageW x imageH image.
// Falls back to the module's default queue when cqCommandQueue is NULL.
////////////////////////////////////////////////////////////////////////////////
extern "C" void convolutionColumns(
    cl_command_queue cqCommandQueue,
    cl_mem d_Dst,
    cl_mem d_Src,
    cl_mem c_Kernel,
    cl_uint imageW,
    cl_uint imageH
){
    cl_int err;

    // Geometry preconditions required by the kernel's tiling scheme.
    oclCheckError(COLUMNS_BLOCKDIM_Y * COLUMNS_HALO_STEPS >= KERNEL_RADIUS, shrTRUE);
    oclCheckError(imageW % COLUMNS_BLOCKDIM_X == 0, shrTRUE);
    oclCheckError(imageH % (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y) == 0, shrTRUE);

    if (!cqCommandQueue)
        cqCommandQueue = cqDefaultCommandQueue;

    // Bind kernel arguments. Arg 5 repeats imageW — presumably the row
    // pitch, which equals the width for this tightly-packed image.
    err  = clSetKernelArg(ckConvolutionColumns, 0, sizeof(cl_mem), (void*)&d_Dst);
    err |= clSetKernelArg(ckConvolutionColumns, 1, sizeof(cl_mem), (void*)&d_Src);
    err |= clSetKernelArg(ckConvolutionColumns, 2, sizeof(cl_mem), (void*)&c_Kernel);
    err |= clSetKernelArg(ckConvolutionColumns, 3, sizeof(unsigned int), (void*)&imageW);
    err |= clSetKernelArg(ckConvolutionColumns, 4, sizeof(unsigned int), (void*)&imageH);
    err |= clSetKernelArg(ckConvolutionColumns, 5, sizeof(unsigned int), (void*)&imageW);
    oclCheckError(err, CL_SUCCESS);

    // Each work-item produces COLUMNS_RESULT_STEPS output rows, so the
    // global Y size shrinks by that factor.
    size_t lws[2] = { COLUMNS_BLOCKDIM_X, COLUMNS_BLOCKDIM_Y };
    size_t gws[2] = { imageW, imageH / COLUMNS_RESULT_STEPS };
    err = clEnqueueNDRangeKernel(cqCommandQueue, ckConvolutionColumns, 2, NULL, gws, lws, 0, NULL, NULL);
    oclCheckError(err, CL_SUCCESS);
}
//////////////////////////////////////////////////////////////////////////////// // Large scan launcher //////////////////////////////////////////////////////////////////////////////// static void scanExclusiveLocal2( cl_command_queue cqCommandQueue, cl_mem d_Buffer, cl_mem d_Dst, cl_mem d_Src, uint n, uint size ) { cl_int ciErrNum; size_t localWorkSize, globalWorkSize; uint elements = n * size; ciErrNum = clSetKernelArg(ckScanExclusiveLocal2, 0, sizeof(cl_mem), (void *)&d_Buffer); ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 1, sizeof(cl_mem), (void *)&d_Dst); ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 2, sizeof(cl_mem), (void *)&d_Src); ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 3, 2 * WORKGROUP_SIZE * sizeof(uint), NULL); ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 4, sizeof(uint), (void *)&elements); ciErrNum |= clSetKernelArg(ckScanExclusiveLocal2, 5, sizeof(uint), (void *)&size); oclCheckError(ciErrNum, CL_SUCCESS); localWorkSize = WORKGROUP_SIZE; globalWorkSize = iSnapUp(elements, WORKGROUP_SIZE); ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckScanExclusiveLocal2, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); }
void integrateSystem(cl_command_queue cqCommandQueue, cl_kernel k, cl_mem newPositions, cl_mem newVelocities, cl_mem newEdges, cl_mem oldPositions, cl_mem oldVelocities, cl_mem oldEdges, cl_mem oldForces, float deltaTime, float damping, int numBodies, int p, int q, bool bDouble) { int sharedMemSize; //for double precision if (bDouble) { sharedMemSize = p * q * sizeof(cl_double4); // 4 doubles for pos } else { sharedMemSize = p * q * sizeof(cl_float4); // 4 floats for pos } size_t global_work_size[2]; size_t local_work_size[2]; cl_int ciErrNum = CL_SUCCESS; cl_kernel kernel; kernel = k; ciErrNum |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&newPositions); ciErrNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&newVelocities); ciErrNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&newEdges); ciErrNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&oldPositions); ciErrNum |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&oldVelocities); ciErrNum |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&oldEdges); ciErrNum |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&oldForces); ciErrNum |= clSetKernelArg(kernel, 7, sizeof(cl_float), (void *)&deltaTime); ciErrNum |= clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&damping); oclCheckError(ciErrNum, CL_SUCCESS); // set work-item dimensions local_work_size[0] = p; local_work_size[1] = q; global_work_size[0]= numBodies; global_work_size[1]= q; // execute the kernel: ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); }
/////////////////////////////////////////////////////////////////////////////// // test the bandwidth of a device to host memcopy of a specific size /////////////////////////////////////////////////////////////////////////////// double testDeviceToDeviceTransfer(unsigned int memSize) { double elapsedTimeInSec = 0.0; double bandwidthInMBs = 0.0; unsigned char* h_idata = NULL; cl_int ciErrNum = CL_SUCCESS; //allocate host memory h_idata = (unsigned char *)malloc( memSize ); //initialize the memory for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++) { h_idata[i] = (unsigned char) (i & 0xff); } // allocate device input and output memory and initialize the device input memory cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, memSize, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, memSize, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, d_idata, CL_TRUE, 0, memSize, h_idata, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); // Sync queue to host, start timer 0, and copy data from one GPU buffer to another GPU bufffer clFinish(cqCommandQueue); shrDeltaT(0); for(unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { ciErrNum = clEnqueueCopyBuffer(cqCommandQueue, d_idata, d_odata, 0, 0, memSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); } // Sync with GPU clFinish(cqCommandQueue); //get the the elapsed time in seconds elapsedTimeInSec = shrDeltaT(0); // Calculate bandwidth in MB/s // This is for kernels that read and write GMEM simultaneously // Obtained Throughput for unidirectional block copies will be 1/2 of this # bandwidthInMBs = 2.0 * ((double)memSize * (double)MEMCOPY_ITERATIONS)/(elapsedTimeInSec * (double)(1 << 20)); //clean up memory on host and device free(h_idata); clReleaseMemObject(d_idata); clReleaseMemObject(d_odata); return bandwidthInMBs; }
// Advance the CPU body system by one step of deltaTime, then swap the
// read/write buffer indices so the freshly written state becomes current.
// Requires _initialize() to have been called first.
void BodySystemCPU::update(float deltaTime)
{
    oclCheckError(m_bInitialized, shrTRUE);
    _integrateNBodySystem(deltaTime);
    std::swap(m_currentRead, m_currentWrite);
}
void BodySystemCPU::_initialize(int numBodies, int numEdges) { oclCheckError(m_bInitialized, shrFALSE); m_numBodies = numBodies; m_numEdges = numEdges; m_pos[0] = new float[m_numBodies*4]; m_pos[1] = new float[m_numBodies*4]; m_vel[0] = new float[m_numBodies*4]; m_vel[1] = new float[m_numBodies*4]; m_edge[0] = new float[m_numBodies*4]; m_edge[1] = new float[m_numBodies*4]; m_force = new float[m_numBodies*4]; memset(m_pos[0], 0, m_numBodies*4*sizeof(float)); memset(m_pos[1], 0, m_numBodies*4*sizeof(float)); memset(m_vel[0], 0, m_numBodies*4*sizeof(float)); memset(m_vel[1], 0, m_numBodies*4*sizeof(float)); memset(m_edge[0], 0, m_numEdges*4*sizeof(float)); memset(m_edge[1], 0, m_numEdges*4*sizeof(float)); memset(m_force, 0, m_numBodies*4*sizeof(float)); m_bInitialized = true; }
void CopyArrayToDevice(int __size, cl_command_queue cqCommandQueue, cl_mem device, const float* host, int numBodies, bool bDouble) { cl_int ciErrNum; unsigned int size; if (bDouble) { size = numBodies * 4 * sizeof(double); double *cdHost = (double *)malloc(size); for (int i = 0; i < numBodies * 4; i++) { cdHost[i] = (double)host[i]; } ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, device, CL_TRUE, 0, size, cdHost, 0, NULL, NULL); free(cdHost); } else { size = numBodies*4*sizeof(float); ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, device, CL_TRUE, 0, size, host, 0, NULL, NULL); } oclCheckError(ciErrNum, CL_SUCCESS); }
// Enqueue the spring-force kernel over all edges.
// Binds the force/edge/position buffers plus numEdges, then launches a
// 2D range of numEdges x q work-items.
void computeSpringsForces(cl_command_queue cqCommandQueue, cl_kernel k, cl_mem newForces, cl_mem newEdges, cl_mem oldPositions, cl_mem oldEdges, int numEdges, int p, int q, bool bDouble)
{
    // NOTE(review): sharedMemSize is computed here but never passed to the
    // kernel (no clSetKernelArg uses it) — appears to be dead code; confirm
    // against the kernel source. `p` is likewise unused since the local size
    // change noted below.
    int sharedMemSize;
    //for double precision
    if (bDouble) {
        sharedMemSize = p * q * sizeof(cl_double4); // 4 doubles for pos
    } else {
        sharedMemSize = p * q * sizeof(cl_float4); // 4 floats for pos
    }
    size_t global_work_size[2];
    size_t local_work_size[2];
    cl_int ciErrNum = CL_SUCCESS;
    cl_kernel kernel;
    kernel = k;
    ciErrNum |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&newForces);
    ciErrNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&newEdges);
    ciErrNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&oldPositions);
    ciErrNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&oldEdges);
    ciErrNum |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&numEdges);
    oclCheckError(ciErrNum, CL_SUCCESS);
    // set work-item dimensions
    // NOTE(review): local X was changed from p to numEdges (see original
    // "todo IMPORTANT" marker). With global==local in X this launches a
    // single work-group per row, and will fail with
    // CL_INVALID_WORK_GROUP_SIZE once numEdges*q exceeds the device's
    // maximum work-group size — verify before relying on this.
    local_work_size[0] = numEdges; // todo IMPORTANT!! this is XXX there was = p;
    local_work_size[1] = q;
    global_work_size[0]= numEdges;
    global_work_size[1]= q;
    // execute the kernel:
    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);
}
////////////////////////////////////////////////////////////////////////////// //! Initializes the global context and command queue ////////////////////////////////////////////////////////////////////////////// void oclInit( ) { cl_platform_id cpPlatform; cl_device_id cdDevice; cl_int ciErrNum; ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); oclCheckError(ciErrNum, CL_SUCCESS); g_clDeviceContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); g_clDeviceQueue = clCreateCommandQueue(g_clDeviceContext, cdDevice, 0, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); }
extern "C" void closeScan(void) { cl_int ciErrNum; ciErrNum = clReleaseMemObject(d_Buffer); ciErrNum |= clReleaseKernel(ckUniformUpdate); ciErrNum |= clReleaseKernel(ckScanExclusiveLocal2); ciErrNum |= clReleaseKernel(ckScanExclusiveLocal1); ciErrNum |= clReleaseProgram(cpProgram); oclCheckError(ciErrNum, CL_SUCCESS); }
// Build the scan program and create its three kernels, verify the device
// supports the required work-group size, and allocate the internal
// per-block-sums buffer (d_Buffer). Populates the module globals
// cpProgram, ckScanExclusiveLocal1/2, ckUniformUpdate, d_Buffer.
// Aborts (via oclCheckError / exit) on any failure.
extern "C" void initScan(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv)
{
    cl_int ciErrNum;
    size_t kernelLength;

    shrLog(" ...loading Scan.cl\n");
    char *cScan = oclLoadProgSource(shrFindFilePath("Scan.cl", argv[0]), "// My comment\n", &kernelLength);
    oclCheckError(cScan != NULL, shrTRUE);

    shrLog(" ...creating scan program\n");
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cScan, &kernelLength, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog(" ...building scan program\n");
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, compileOptions, NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclScan.ptx");
        oclCheckError(ciErrNum, CL_SUCCESS);
    }

    shrLog(" ...creating scan kernels\n");
    ckScanExclusiveLocal1 = clCreateKernel(cpProgram, "scanExclusiveLocal1", &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    ckScanExclusiveLocal2 = clCreateKernel(cpProgram, "scanExclusiveLocal2", &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    ckUniformUpdate = clCreateKernel(cpProgram, "uniformUpdate", &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog( " ...checking minimum supported workgroup size\n");
    //Check for work group size
    // The device is taken from the supplied queue so the check matches
    // where the kernels will actually run.
    cl_device_id device;
    size_t szScanExclusiveLocal1, szScanExclusiveLocal2, szUniformUpdate;
    ciErrNum = clGetCommandQueueInfo(cqParamCommandQue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL);
    ciErrNum |= clGetKernelWorkGroupInfo(ckScanExclusiveLocal1, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szScanExclusiveLocal1, NULL);
    ciErrNum |= clGetKernelWorkGroupInfo(ckScanExclusiveLocal2, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szScanExclusiveLocal2, NULL);
    ciErrNum |= clGetKernelWorkGroupInfo(ckUniformUpdate, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &szUniformUpdate, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    if( (szScanExclusiveLocal1 < WORKGROUP_SIZE) || (szScanExclusiveLocal2 < WORKGROUP_SIZE) || (szUniformUpdate < WORKGROUP_SIZE) )
    {
        shrLog("ERROR: Minimum work-group size %u required by this application is not supported on this device.\n", WORKGROUP_SIZE);
        exit(0);
    }

    shrLog(" ...allocating internal buffers\n");
    // One uint of scratch per 4*WORKGROUP_SIZE-element block of the
    // largest supported batch.
    d_Buffer = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, (MAX_BATCH_ELEMENTS / (4 * WORKGROUP_SIZE)) * sizeof(uint), NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    //Discard temp storage
    free(cScan);
}
////////////////////////////////////////////////////////////////////////////////
// Exclusive scan launcher for "large" arrays: three phases —
// per-block local scans, a scan of the block sums, and a uniform add-back.
// Returns the local work size of the final pass.
////////////////////////////////////////////////////////////////////////////////
extern "C" size_t scanExclusiveLarge(
    cl_command_queue cqCommandQueue,
    cl_mem d_Dst,
    cl_mem d_Src,
    uint batchSize,
    uint arrayLength
)
{
    // Validate: power-of-two length, supported range, batch limit.
    uint log2L;
    oclCheckError(factorRadix2(log2L, arrayLength) == 1, shrTRUE);
    oclCheckError((arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE), shrTRUE);
    oclCheckError((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS, shrTRUE);

    const uint numBlocks = (batchSize * arrayLength) / (4 * WORKGROUP_SIZE);

    // Phase 1: independent exclusive scan of each 4*WORKGROUP_SIZE block.
    scanExclusiveLocal1(cqCommandQueue, d_Dst, d_Src, numBlocks, 4 * WORKGROUP_SIZE);

    // Phase 2: scan the per-block sums into d_Buffer.
    scanExclusiveLocal2(cqCommandQueue, d_Buffer, d_Dst, d_Src, batchSize, arrayLength / (4 * WORKGROUP_SIZE));

    // Phase 3: add each scanned block sum back into its block.
    return uniformUpdate(cqCommandQueue, d_Dst, d_Buffer, numBlocks);
}
// Final pass of the large scan: uniformly add the scanned per-block sums
// (d_Buffer) into every element of the corresponding block of d_Dst.
// One work-group is launched per block; returns the local work size used.
static size_t uniformUpdate(
    cl_command_queue cqCommandQueue,
    cl_mem d_Dst,
    cl_mem d_Buffer,
    uint n
)
{
    cl_int err;

    err  = clSetKernelArg(ckUniformUpdate, 0, sizeof(cl_mem), (void *)&d_Dst);
    err |= clSetKernelArg(ckUniformUpdate, 1, sizeof(cl_mem), (void *)&d_Buffer);
    oclCheckError(err, CL_SUCCESS);

    size_t lws = WORKGROUP_SIZE;
    size_t gws = n * WORKGROUP_SIZE;   // n work-groups of WORKGROUP_SIZE
    err = clEnqueueNDRangeKernel(cqCommandQueue, ckUniformUpdate, 1, NULL, &gws, &lws, 0, NULL, NULL);
    oclCheckError(err, CL_SUCCESS);

    return lws;
}
// Free every array allocated by _initialize().
// FIX: previously left m_bInitialized set and the member pointers dangling,
// so a second _finalize() (or a later setArray/getArray/update) would pass
// the initialization check and touch freed memory. Pointers are nulled
// (delete[] on null is a no-op) and the flag cleared so re-initialization
// is safe.
void BodySystemCPU::_finalize()
{
    oclCheckError(m_bInitialized, shrTRUE);

    delete [] m_pos[0];  m_pos[0]  = 0;
    delete [] m_pos[1];  m_pos[1]  = 0;
    delete [] m_vel[0];  m_vel[0]  = 0;
    delete [] m_vel[1];  m_vel[1]  = 0;
    delete [] m_edge[0]; m_edge[0] = 0;
    delete [] m_edge[1]; m_edge[1] = 0;
    delete [] m_force;   m_force   = 0;

    m_bInitialized = false;
}
// Load, build, and create the separable-convolution kernels.
// The tiling geometry (block dims, halo/result steps, kernel radius) is
// baked into the program via -D compile options so the kernels can unroll
// their loops. Populates the module globals cpConvolutionSeparable,
// ckConvolutionRows, ckConvolutionColumns and records cqParamCommandQueue
// as the default queue for later launches.
extern "C" void initConvolutionSeparable(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
    cl_int ciErrNum;
    size_t kernelLength;

    shrLog("Loading ConvolutionSeparable.cl...\n");
    char *cPathAndName = shrFindFilePath("ConvolutionSeparable.cl", argv[0]);
    oclCheckError(cPathAndName != NULL, shrTRUE);
    char *cConvolutionSeparable = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength);
    oclCheckError(cConvolutionSeparable != NULL, shrTRUE);

    shrLog("Creating convolutionSeparable program...\n");
    cpConvolutionSeparable = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cConvolutionSeparable, &kernelLength, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Building convolutionSeparable program...\n");
    char compileOptions[2048];
    // sprintf_s on Windows, sprintf elsewhere; the format string is
    // identical in both branches.
#ifdef _WIN32
    sprintf_s(compileOptions, 2048, "\
 -cl-fast-relaxed-math \
 -D KERNEL_RADIUS=%u\
 -D ROWS_BLOCKDIM_X=%u -D COLUMNS_BLOCKDIM_X=%u\
 -D ROWS_BLOCKDIM_Y=%u -D COLUMNS_BLOCKDIM_Y=%u\
 -D ROWS_RESULT_STEPS=%u -D COLUMNS_RESULT_STEPS=%u\
 -D ROWS_HALO_STEPS=%u -D COLUMNS_HALO_STEPS=%u\
 ", KERNEL_RADIUS, ROWS_BLOCKDIM_X, COLUMNS_BLOCKDIM_X, ROWS_BLOCKDIM_Y, COLUMNS_BLOCKDIM_Y, ROWS_RESULT_STEPS, COLUMNS_RESULT_STEPS, ROWS_HALO_STEPS, COLUMNS_HALO_STEPS );
#else
    sprintf(compileOptions, "\
 -cl-fast-relaxed-math \
 -D KERNEL_RADIUS=%u\
 -D ROWS_BLOCKDIM_X=%u -D COLUMNS_BLOCKDIM_X=%u\
 -D ROWS_BLOCKDIM_Y=%u -D COLUMNS_BLOCKDIM_Y=%u\
 -D ROWS_RESULT_STEPS=%u -D COLUMNS_RESULT_STEPS=%u\
 -D ROWS_HALO_STEPS=%u -D COLUMNS_HALO_STEPS=%u\
 ", KERNEL_RADIUS, ROWS_BLOCKDIM_X, COLUMNS_BLOCKDIM_X, ROWS_BLOCKDIM_Y, COLUMNS_BLOCKDIM_Y, ROWS_RESULT_STEPS, COLUMNS_RESULT_STEPS, ROWS_HALO_STEPS, COLUMNS_HALO_STEPS );
#endif
    ciErrNum = clBuildProgram(cpConvolutionSeparable, 0, NULL, compileOptions, NULL, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    ckConvolutionRows = clCreateKernel(cpConvolutionSeparable, "convolutionRows", &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    ckConvolutionColumns = clCreateKernel(cpConvolutionSeparable, "convolutionColumns", &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    cqDefaultCommandQueue = cqParamCommandQueue;

    // NOTE(review): cPathAndName is not freed here — small one-time leak,
    // consistent with the rest of this file's init code.
    free(cConvolutionSeparable);
}
//////////////////////////////////////////////////////////////////////////////// // Short scan launcher //////////////////////////////////////////////////////////////////////////////// static size_t scanExclusiveLocal1( cl_command_queue cqCommandQueue, cl_mem d_Dst, cl_mem d_Src, uint n, uint size ) { cl_int ciErrNum; size_t localWorkSize, globalWorkSize; ciErrNum = clSetKernelArg(ckScanExclusiveLocal1, 0, sizeof(cl_mem), (void *)&d_Dst); ciErrNum |= clSetKernelArg(ckScanExclusiveLocal1, 1, sizeof(cl_mem), (void *)&d_Src); ciErrNum |= clSetKernelArg(ckScanExclusiveLocal1, 2, 2 * WORKGROUP_SIZE * sizeof(uint), NULL); ciErrNum |= clSetKernelArg(ckScanExclusiveLocal1, 3, sizeof(uint), (void *)&size); oclCheckError(ciErrNum, CL_SUCCESS); localWorkSize = WORKGROUP_SIZE; globalWorkSize = (n * size) / 4; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckScanExclusiveLocal1, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); return localWorkSize; }
// Copy `data` into the currently-read host array selected by `array`.
// Unrecognized enum values fall through to the position array.
// NOTE(review): the copy size is always m_numBodies*4 floats, including for
// BODYSYSTEM_EDGE whose storage is logically m_numEdges*4 long elsewhere in
// this file — confirm against callers whether edge data should use
// m_numEdges here instead.
void BodySystemCPU::setArray(BodyArray array, const float* data)
{
    oclCheckError(m_bInitialized, shrTRUE);
    float* target = 0;
    switch (array)
    {
    default:
    case BODYSYSTEM_POSITION:
        target = m_pos[m_currentRead];
        break;
    case BODYSYSTEM_VELOCITY:
        target = m_vel[m_currentRead];
        break;
    case BODYSYSTEM_EDGE:
        target = m_edge[m_currentRead];
        break;
    }
    memcpy(target, data, m_numBodies*4*sizeof(float));
}
// Return the currently-read host array for the requested attribute.
// Unrecognized enum values fall through to the position array, matching
// the behavior of setArray.
float* BodySystemCPU::getArray(BodyArray array)
{
    oclCheckError(m_bInitialized, shrTRUE);

    switch (array)
    {
    case BODYSYSTEM_VELOCITY:
        return m_vel[m_currentRead];
    case BODYSYSTEM_EDGE:
        return m_edge[m_currentRead];
    case BODYSYSTEM_POSITION:
    default:
        return m_pos[m_currentRead];
    }
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for
////////////////////////////////////////////////////////////////////////////////
// Sets up OpenCL (platform, devices, context, one queue per device), builds
// the matrixMul program, runs the multiplication on 1..N GPUs, and compares
// against a CPU reference. Returns CL_SUCCESS on match, a negative code or
// the failing CL error otherwise.
int runTest(int argc, const char** argv)
{
    cl_platform_id cpPlatform = NULL;
    cl_uint ciDeviceCount = 0;
    cl_device_id *cdDevices = NULL;
    cl_int ciErrNum = CL_SUCCESS;

    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    //Get the devices
    // NOTE(review): the return of this first (count-only) query is not
    // checked, and cdDevices is never freed on any path — confirm whether
    // that is acceptable for this sample.
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);
    cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    //Create the context
    cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
    {
        // User specified GPUs: parse the comma/space separated index list
        // and create one command queue per listed device.
        char* deviceList;
        char* deviceStr;
        char* next_token;
        shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList);
#ifdef WIN32
        deviceStr = strtok_s (deviceList," ,.-", &next_token);
#else
        deviceStr = strtok (deviceList," ,.-");
#endif
        ciDeviceCount = 0;
        while(deviceStr != NULL)
        {
            // get and print the device for this queue
            cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr));
            if( device == (cl_device_id) -1 )
            {
                shrLog(" Device %s does not exist!\n", deviceStr);
                return -1;
            }
            shrLog("Device %s: ", deviceStr);
            oclPrintDevName(LOGBOTH, device);
            shrLog("\n");
            // create command queue
            commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            if (ciErrNum != CL_SUCCESS)
            {
                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
                return ciErrNum;
            }
            ++ciDeviceCount;
#ifdef WIN32
            deviceStr = strtok_s (NULL," ,.-", &next_token);
#else
            deviceStr = strtok (NULL," ,.-");
#endif
        }
        free(deviceList);
    }
    else
    {
        // Find out how many GPU's to compute on all available GPUs
        size_t nDeviceBytes;
        ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
        ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);
        if (ciErrNum != CL_SUCCESS)
        {
            shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
            return ciErrNum;
        }
        else if (ciDeviceCount == 0)
        {
            shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum);
            return -1;
        }
        // create command-queues
        for(unsigned int i = 0; i < ciDeviceCount; ++i)
        {
            // get and print the device for this queue
            cl_device_id device = oclGetDev(cxGPUContext, i);
            shrLog("Device %d: ", i);
            oclPrintDevName(LOGBOTH, device);
            shrLog("\n");
            // create command queue
            commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            if (ciErrNum != CL_SUCCESS)
            {
                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
                return ciErrNum;
            }
        }
    }

    // Optional Command-line multiplier for matrix sizes
    shrGetCmdLineArgumenti(argc, (const char**)argv, "sizemult", &iSizeMultiple);
    iSizeMultiple = CLAMP(iSizeMultiple, 1, 10);
    uiWA = WA * iSizeMultiple;
    uiHA = HA * iSizeMultiple;
    uiWB = WB * iSizeMultiple;
    uiHB = HB * iSizeMultiple;
    uiWC = WC * iSizeMultiple;
    uiHC = HC * iSizeMultiple;
    shrLog("\nUsing Matrix Sizes: A(%u x %u), B(%u x %u), C(%u x %u)\n", uiWA, uiHA, uiWB, uiHB, uiWC, uiHC);

    // allocate host memory for matrices A and B
    unsigned int size_A = uiWA * uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A_data = (float*)malloc(mem_size_A);
    unsigned int size_B = uiWB * uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B_data = (float*)malloc(mem_size_B);

    // initialize host memory with a fixed seed so runs are reproducible
    srand(2006);
    shrFillArray(h_A_data, size_A);
    shrFillArray(h_B_data, size_B);

    // allocate host memory for result
    unsigned int size_C = uiWC * uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*) malloc(mem_size_C);

    // create OpenCL buffer pointing to the host memory (zero-copy of A)
    cl_mem h_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, mem_size_A, h_A_data, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: clCreateBuffer\n");
        return ciErrNum;
    }

    // Program Setup: the header is prepended to the kernel source
    size_t program_length;
    const char* header_path = shrFindFilePath("matrixMul.h", argv[0]);
    oclCheckError(header_path != NULL, shrTRUE);
    char* header = oclLoadProgSource(header_path, "", &program_length);
    if(!header)
    {
        shrLog("Error: Failed to load the header %s!\n", header_path);
        return -1000;
    }
    const char* source_path = shrFindFilePath("matrixMul.cl", argv[0]);
    oclCheckError(source_path != NULL, shrTRUE);
    char *source = oclLoadProgSource(source_path, header, &program_length);
    if(!source)
    {
        shrLog("Error: Failed to load compute program %s!\n", source_path);
        return -2000;
    }

    // create the program
    cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &program_length, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create program\n");
        return ciErrNum;
    }
    free(header);
    free(source);

    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then return error
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx");
        return ciErrNum;
    }

    // write out PTX if requested on the command line
    if(shrCheckCmdLineFlag(argc, argv, "dump-ptx") )
    {
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx");
    }

    // Create one kernel instance per device
    for(unsigned int i = 0; i < ciDeviceCount; ++i)
    {
        multiplicationKernel[i] = clCreateKernel(cpProgram, "matrixMul", &ciErrNum);
        if (ciErrNum != CL_SUCCESS)
        {
            shrLog("Error: Failed to create kernel\n");
            return ciErrNum;
        }
    }

    // Run multiplication on 1..deviceCount GPUs to compare improvement
    shrLog("\nRunning Computations on 1 - %d GPU's...\n\n", ciDeviceCount);
    for(unsigned int k = 1; k <= ciDeviceCount; ++k)
    {
        matrixMulGPU(k, h_A, h_B_data, mem_size_B, h_C);
    }

    // compute reference solution
    shrLog("Comparing results with CPU computation... \n\n");
    float* reference = (float*) malloc(mem_size_C);
    computeGold(reference, h_A_data, h_B_data, uiHA, uiWA, uiWB);

    // check result against an L2 error tolerance; dump diffs on mismatch
    shrBOOL res = shrCompareL2fe(reference, h_C, size_C, 1.0e-6f);
    if (res != shrTRUE)
    {
        printDiff(reference, h_C, uiWC, uiHC, 100, 1.0e-5f);
    }

    // clean up OCL resources
    ciErrNum = clReleaseMemObject(h_A);
    for(unsigned int k = 0; k < ciDeviceCount; ++k)
    {
        ciErrNum |= clReleaseKernel( multiplicationKernel[k] );
        ciErrNum |= clReleaseCommandQueue( commandQueue[k] );
    }
    ciErrNum |= clReleaseProgram(cpProgram);
    ciErrNum |= clReleaseContext(cxGPUContext);
    if(ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failure releasing OpenCL resources: %d\n", ciErrNum);
        return ciErrNum;
    }

    // clean up memory
    free(h_A_data);
    free(h_B_data);
    free(h_C);
    free(reference);

    return ((shrTRUE == res) ? CL_SUCCESS : -3000);
}
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, const char** argv) { const char *my_name = "[oclAvcDisc]"; int bPassed = 1; char filename[500], cBuffer[1024], sProfileString[2048]; FILE *log_ofs=NULL; time_t g_the_time; /* OpenCL variables */ cl_int ciErrNum; cl_platform_id clSelectedPlatformID = NULL; cl_uint ciDeviceCount; cl_device_id *devices; sprintf(filename, "oclAvcDisc.txt"); if( (log_ofs=fopen(filename, "a"))== NULL ) { fprintf(stderr, "[oclAvcDisc] Error, could not open file %s\n", filename); exit(1); } g_the_time = time(NULL); _write_log(log_ofs, "%s oclDeviceQuery.exe Starting...\n", my_name); /* Get OpenCL platform ID for NVIDIA if avaiable, otherwise default */ _write_log(log_ofs, "%s OpenCL SW Info:\n", my_name); ciErrNum = oclGetPlatformID (&clSelectedPlatformID); oclCheckError(ciErrNum, CL_SUCCESS); /* Get OpenCL platform name and version */ ciErrNum = clGetPlatformInfo (clSelectedPlatformID, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL); if (ciErrNum == CL_SUCCESS) { _write_log(log_ofs, "%s CL_PLATFORM_NAME: \t%s\n", my_name, cBuffer); } else { _write_log(log_ofs, "%s Error %i in clGetPlatformInfo Call !!!\n\n", my_name, ciErrNum); bPassed = 0; } ciErrNum = clGetPlatformInfo (clSelectedPlatformID, CL_PLATFORM_VERSION, sizeof(cBuffer), cBuffer, NULL); if (ciErrNum == CL_SUCCESS) { _write_log(log_ofs, "%s CL_PLATFORM_VERSION: \t%s\n", my_name, cBuffer); } else { _write_log(" Error %i in clGetPlatformInfo Call !!!\n\n", ciErrNum); bPassed = 0; } // Get and log OpenCL device info _write_log(log_ofs, "%s OpenCL Device Info:\n\n", my_name); ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, 0, NULL, &ciDeviceCount); // check for 0 devices found or errors... 
if (ciDeviceCount == 0) { _write_log(log_ofs, "%s No devices found supporting OpenCL (return code %i)\n\n", my_name, ciErrNum); bPassed = false; } else if (ciErrNum != CL_SUCCESS) { _write_log(log_ofs, "%s Error %i in clGetDeviceIDs call !!!\n\n", my_name, ciErrNum); bPassed = false; } else { // Get and log the OpenCL device ID's char cTemp[2]; _write_log(log_ofs, "%s %u devices found supporting OpenCL:\n\n", my_name , ciDeviceCount); sprintf(cTemp, "%u", ciDeviceCount); if ((devices = (cl_device_id*)malloc(sizeof(cl_device_id) * ciDeviceCount)) == NULL) { _write_log(log_ofs, "%s Failed to allocate memory for devices !!!\n\n", my_name); bPassed = false; } ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, ciDeviceCount, devices, &ciDeviceCount); if (ciErrNum == CL_SUCCESS) { //Create a context for the devices cl_context cxGPUContext = clCreateContext(0, ciDeviceCount, devices, NULL, NULL, &ciErrNum); if (ciErrNum != CL_SUCCESS) { _write_log(log_ofs, "%s Error %i in clCreateContext call !!!\n\n", my_name, ciErrNum); bPassed = false; } else { // show info for each device in the context for(unsigned int i = 0; i < ciDeviceCount; ++i ) { _write_log(log_ofs, "%s ---------------------------------\n", my_name); clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL); _write_log(log_ofs, "%s Device %s\n", my_name, cBuffer); _write_log(log_ofs, "%s ---------------------------------\n", my_name); oclPrintDevInfo(LOGBOTH, devices[i]); } // Determine and show image format support cl_uint uiNumSupportedFormats = 0; // 2D clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, NULL, NULL, &uiNumSupportedFormats); cl_image_format *ImageFormats = NULL; ImageFormats = (cl_image_format*)malloc(uiNumSupportedFormats*sizeof(cl_image_format)); if(ImageFormats==NULL) { _write_log(log_ofs, "%s Error, could not alloc ImageFormats\n", my_name); exit(2); } clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, 
CL_MEM_OBJECT_IMAGE2D, uiNumSupportedFormats, ImageFormats, NULL); _write_log(log_ofs, "%s ---------------------------------\n", my_name); _write_log(log_ofs, "%s 2D Image Formats Supported (%u)\n", my_name, uiNumSupportedFormats); _write_log(log_ofs, "%s ---------------------------------\n", my_name); _write_log(log_ofs, "%s %-6s%-16s%-22s\n\n", my_name, "#", "Channel Order", "Channel Type"); for(unsigned int i = 0; i < uiNumSupportedFormats; i++) { _write_log(log_ofs, "%s %-6u%-16s%-22s\n", my_name, (i + 1), oclImageFormatString(ImageFormats[i].image_channel_order), oclImageFormatString(ImageFormats[i].image_channel_data_type)); } _write_log(log_ofs, "%s\n", my_name); free(ImageFormats); ImageFormats = NULL; // 3D clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, NULL, NULL, &uiNumSupportedFormats); ImageFormats = (cl_image_format*)malloc(uiNumSupportedFormats*sizeof(cl_image_format)); if(ImageFormats==NULL) { _write_log(log_ofs, "%s Error, could not alloc ImageFormats\n", my_name); exit(3); } clGetSupportedImageFormats(cxGPUContext, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, uiNumSupportedFormats, ImageFormats, NULL); _write_log(log_ofs, "%s ---------------------------------\n", my_name); _write_log(log_ofs, "%s 3D Image Formats Supported (%u)\n", my_name, uiNumSupportedFormats); _write_log(log_ofs, "%s ---------------------------------\n", my_name); _write_log(log_ofs, "%s %-6s%-16s%-22s\n\n", my_name, "#", "Channel Order", "Channel Type"); for(unsigned int i = 0; i < uiNumSupportedFormats; i++) { _write_log(log_ofs, "%s %-6u%-16s%-22s\n", my_name, (i + 1), oclImageFormatString(ImageFormats[i].image_channel_order), oclImageFormatString(ImageFormats[i].image_channel_data_type)); } write_log(log_ofs, "%s\n", my_name); free(ImageFormats); ImageFormats=NULL; } } else { write_log(log_ofs, "%s Error %i in clGetDeviceIDs call !!!\n\n", my_name, ciErrNum); bPassed = 0; } } // finish _write_log(log_ofs, "%s %s\n\n", my_name, bPassed==1 ? 
"PASSED" : "FAILED"); fflush(log_ofs); fclose(log_ofs); return(0); }
//////////////////////////////////////////////////////////////////////////////// // Test driver //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { cl_platform_id cpPlatform; cl_device_id cdDevice; cl_context cxGPUContext; //OpenCL context cl_command_queue cqCommandQueue; //OpenCL command queue cl_mem c_Kernel, d_Input, d_Buffer, d_Output; //OpenCL memory buffer objects cl_float *h_Kernel, *h_Input, *h_Buffer, *h_OutputCPU, *h_OutputGPU; cl_int ciErrNum; const unsigned int imageW = 3072; const unsigned int imageH = 3072; shrQAStart(argc, argv); // set logfile name and start logs shrSetLogFileName ("oclConvolutionSeparable.txt"); shrLog("%s Starting...\n\n", argv[0]); shrLog("Allocating and initializing host memory...\n"); h_Kernel = (cl_float *)malloc(KERNEL_LENGTH * sizeof(cl_float)); h_Input = (cl_float *)malloc(imageW * imageH * sizeof(cl_float)); h_Buffer = (cl_float *)malloc(imageW * imageH * sizeof(cl_float)); h_OutputCPU = (cl_float *)malloc(imageW * imageH * sizeof(cl_float)); h_OutputGPU = (cl_float *)malloc(imageW * imageH * sizeof(cl_float)); srand(2009); for(unsigned int i = 0; i < KERNEL_LENGTH; i++) h_Kernel[i] = (cl_float)(rand() % 16); for(unsigned int i = 0; i < imageW * imageH; i++) h_Input[i] = (cl_float)(rand() % 16); shrLog("Initializing OpenCL...\n"); //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); //Get the devices ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); //Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Initializing OpenCL separable convolution...\n"); initConvolutionSeparable(cxGPUContext, cqCommandQueue, (const char **)argv); 
shrLog("Creating OpenCL memory objects...\n"); c_Kernel = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, KERNEL_LENGTH * sizeof(cl_float), h_Kernel, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageW * imageH * sizeof(cl_float), h_Input, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_Buffer = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, imageW * imageH * sizeof(cl_float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageW * imageH * sizeof(cl_float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Applying separable convolution to %u x %u image...\n\n", imageW, imageH); //Just a single run or a warmup iteration convolutionRows( NULL, d_Buffer, d_Input, c_Kernel, imageW, imageH ); convolutionColumns( NULL, d_Output, d_Buffer, c_Kernel, imageW, imageH ); #ifdef GPU_PROFILING const int numIterations = 16; cl_event startMark, endMark; ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); shrDeltaT(0); for(int iter = 0; iter < numIterations; iter++){ convolutionRows( cqCommandQueue, d_Buffer, d_Input, c_Kernel, imageW, imageH ); convolutionColumns( cqCommandQueue, d_Output, d_Buffer, c_Kernel, imageW, imageH ); } ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); //Calculate performance metrics by wallclock time double gpuTime = shrDeltaT(0) / (double)numIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclConvolutionSeparable, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0); //Get OpenCL profiler info cl_ulong startTime = 0, endTime = 0; ciErrNum = clGetEventProfilingInfo(startMark, 
CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL); ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime)/ (double)numIterations); #endif shrLog("Reading back OpenCL results...\n\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageW * imageH * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Comparing against Host/C++ computation...\n"); convolutionRowHost(h_Buffer, h_Input, h_Kernel, imageW, imageH, KERNEL_RADIUS); convolutionColumnHost(h_OutputCPU, h_Buffer, h_Kernel, imageW, imageH, KERNEL_RADIUS); double sum = 0, delta = 0; double L2norm; for(unsigned int i = 0; i < imageW * imageH; i++){ delta += (h_OutputCPU[i] - h_OutputGPU[i]) * (h_OutputCPU[i] - h_OutputGPU[i]); sum += h_OutputCPU[i] * h_OutputCPU[i]; } L2norm = sqrt(delta / sum); shrLog("Relative L2 norm: %.3e\n\n", L2norm); // cleanup closeConvolutionSeparable(); ciErrNum = clReleaseMemObject(d_Output); ciErrNum |= clReleaseMemObject(d_Buffer); ciErrNum |= clReleaseMemObject(d_Input); ciErrNum |= clReleaseMemObject(c_Kernel); ciErrNum |= clReleaseCommandQueue(cqCommandQueue); ciErrNum |= clReleaseContext(cxGPUContext); oclCheckError(ciErrNum, CL_SUCCESS); free(h_OutputGPU); free(h_OutputCPU); free(h_Buffer); free(h_Input); free(h_Kernel); // finish shrQAFinishExit(argc, (const char **)argv, (L2norm < 1e-6) ? QA_PASSED : QA_FAILED); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { shrQAStart(argc, argv); // start logs shrSetLogFileName ("oclSimpleMultiGPU.txt"); shrLog("%s Starting, Array = %u float values...\n\n", argv[0], DATA_N); // OpenCL cl_platform_id cpPlatform; cl_uint ciDeviceCount; cl_device_id* cdDevices; cl_context cxGPUContext; cl_device_id cdDevice; // GPU device int deviceNr[MAX_GPU_COUNT]; cl_command_queue commandQueue[MAX_GPU_COUNT]; cl_mem d_Data[MAX_GPU_COUNT]; cl_mem d_Result[MAX_GPU_COUNT]; cl_program cpProgram; cl_kernel reduceKernel[MAX_GPU_COUNT]; cl_event GPUDone[MAX_GPU_COUNT]; cl_event GPUExecution[MAX_GPU_COUNT]; size_t programLength; cl_int ciErrNum; char cDeviceName [256]; cl_mem h_DataBuffer; // Vars for reduction results float h_SumGPU[MAX_GPU_COUNT * ACCUM_N]; float *h_Data; double sumGPU; double sumCPU, dRelError; // allocate and init host buffer with with some random generated input data h_Data = (float *)malloc(DATA_N * sizeof(float)); shrFillArray(h_Data, DATA_N); // start timer & logs shrLog("Setting up OpenCL on the Host...\n\n"); shrDeltaT(1); // Annotate profiling state #ifdef GPU_PROFILING shrLog("OpenCL Profiling is enabled...\n\n"); #endif //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clGetPlatformID...\n"); //Get the devices ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount); oclCheckError(ciErrNum, CL_SUCCESS); cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clGetDeviceIDs...\n"); //Create the context cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); 
shrLog("clCreateContext...\n"); // Set up command queue(s) for GPU's specified on the command line or all GPU's if(shrCheckCmdLineFlag(argc, (const char **)argv, "device")) { // User specified GPUs int ciMaxDeviceID = ciDeviceCount-1; ciDeviceCount = 0; char* deviceList; char* deviceStr; char* next_token; shrGetCmdLineArgumentstr(argc, (const char **)argv, "device", &deviceList); #ifdef WIN32 deviceStr = strtok_s (deviceList," ,.-", &next_token); #else deviceStr = strtok (deviceList," ,.-"); #endif // Create command queues for all Requested GPU's while(deviceStr != NULL) { // get & log device index # and name deviceNr[ciDeviceCount] = atoi(deviceStr); if( deviceNr[ciDeviceCount] > ciMaxDeviceID ) { shrLog(" Invalid user specified device ID: %d\n", deviceNr[ciDeviceCount]); return 1; } cdDevice = oclGetDev(cxGPUContext, deviceNr[ciDeviceCount]); ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog(" Device %i: %s\n\n", deviceNr[ciDeviceCount], cDeviceName); // create a command que commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateCommandQueue\n"); ++ciDeviceCount; #ifdef WIN32 deviceStr = strtok_s (NULL," ,.-", &next_token); #else deviceStr = strtok (NULL," ,.-"); #endif } free(deviceList); } else { // Find out how many GPU's to compute on all available GPUs size_t nDeviceBytes; ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes); oclCheckError(ciErrNum, CL_SUCCESS); ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id); for(unsigned int i = 0; i < ciDeviceCount; ++i ) { // get & log device index # and name deviceNr[i] = i; cdDevice = oclGetDev(cxGPUContext, i); ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog(" Device %i: %s\n", i, 
cDeviceName); // create a command que commandQueue[i] = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateCommandQueue\n\n"); } } // Load the OpenCL source code from the .cl file const char* source_path = shrFindFilePath("simpleMultiGPU.cl", argv[0]); char *source = oclLoadProgSource(source_path, "", &programLength); oclCheckError(source != NULL, shrTRUE); shrLog("oclLoadProgSource\n"); // Create the program for all GPUs in the context cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &programLength, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateProgramWithSource\n"); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclSimpleMultiGPU.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } shrLog("clBuildProgram\n"); // Create host buffer with page-locked memory h_DataBuffer = clCreateBuffer(cxGPUContext, CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR, DATA_N * sizeof(float), h_Data, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateBuffer (Page-locked Host)\n\n"); // Create buffers for each GPU, with data divided evenly among GPU's int sizePerGPU = DATA_N / ciDeviceCount; int workOffset[MAX_GPU_COUNT]; int workSize[MAX_GPU_COUNT]; workOffset[0] = 0; for(unsigned int i = 0; i < ciDeviceCount; ++i ) { workSize[i] = (i != (ciDeviceCount - 1)) ? 
sizePerGPU : (DATA_N - workOffset[i]); // Input buffer d_Data[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateBuffer (Input)\t\tDev %i\n", i); // Copy data from host to device ciErrNum = clEnqueueCopyBuffer(commandQueue[i], h_DataBuffer, d_Data[i], workOffset[i] * sizeof(float), 0, workSize[i] * sizeof(float), 0, NULL, NULL); shrLog("clEnqueueCopyBuffer (Input)\tDev %i\n", i); // Output buffer d_Result[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, ACCUM_N * sizeof(float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateBuffer (Output)\t\tDev %i\n", i); // Create kernel reduceKernel[i] = clCreateKernel(cpProgram, "reduce", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateKernel\t\t\tDev %i\n", i); // Set the args values and check for errors ciErrNum |= clSetKernelArg(reduceKernel[i], 0, sizeof(cl_mem), &d_Result[i]); ciErrNum |= clSetKernelArg(reduceKernel[i], 1, sizeof(cl_mem), &d_Data[i]); ciErrNum |= clSetKernelArg(reduceKernel[i], 2, sizeof(int), &workSize[i]); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clSetKernelArg\t\t\tDev %i\n\n", i); workOffset[i + 1] = workOffset[i] + workSize[i]; } // Set # of work items in work group and total in 1 dimensional range size_t localWorkSize[] = {THREAD_N}; size_t globalWorkSize[] = {ACCUM_N}; // Start timer and launch reduction kernel on each GPU, with data split between them shrLog("Launching Kernels on GPU(s)...\n\n"); for(unsigned int i = 0; i < ciDeviceCount; i++) { ciErrNum = clEnqueueNDRangeKernel(commandQueue[i], reduceKernel[i], 1, 0, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]); oclCheckError(ciErrNum, CL_SUCCESS); } // Copy result from device to host for each device for(unsigned int i = 0; i < ciDeviceCount; i++) { ciErrNum = clEnqueueReadBuffer(commandQueue[i], d_Result[i], CL_FALSE, 0, ACCUM_N * sizeof(float), h_SumGPU + i * ACCUM_N, 0, NULL, 
&GPUDone[i]); oclCheckError(ciErrNum, CL_SUCCESS); } // Synchronize with the GPUs and do accumulated error check clWaitForEvents(ciDeviceCount, GPUDone); shrLog("clWaitForEvents complete...\n\n"); // Aggregate results for multiple GPU's and stop/log processing time sumGPU = 0; for(unsigned int i = 0; i < ciDeviceCount * ACCUM_N; i++) { sumGPU += h_SumGPU[i]; } // Print Execution Times for each GPU #ifdef GPU_PROFILING shrLog("Profiling Information for GPU Processing:\n\n"); for(unsigned int i = 0; i < ciDeviceCount; i++) { cdDevice = oclGetDev(cxGPUContext, deviceNr[i]); clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); shrLog("Device %i : %s\n", deviceNr[i], cDeviceName); shrLog(" Reduce Kernel : %.5f s\n", executionTime(GPUExecution[i])); shrLog(" Copy Device->Host : %.5f s\n\n\n", executionTime(GPUDone[i])); } #endif // Run the computation on the Host CPU and log processing time shrLog("Launching Host/CPU C++ Computation...\n\n"); sumCPU = 0; for(unsigned int i = 0; i < DATA_N; i++) { sumCPU += h_Data[i]; } // Check GPU result against CPU result dRelError = 100.0 * fabs(sumCPU - sumGPU) / fabs(sumCPU); shrLog("Comparing against Host/C++ computation...\n"); shrLog(" GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU); shrLog(" Relative Error (100.0 * Error / Golden) = %f \n\n", dRelError); // cleanup free(source); free(h_Data); for(unsigned int i = 0; i < ciDeviceCount; ++i ) { clReleaseKernel(reduceKernel[i]); clReleaseCommandQueue(commandQueue[i]); } clReleaseProgram(cpProgram); clReleaseContext(cxGPUContext); // finish shrQAFinishExit(argc, (const char **)argv, (dRelError < 1e-4) ? QA_PASSED : QA_FAILED); }
// Main function // ********************************************************************* int main(int argc, char** argv) { shrQAStart(argc, argv); int use_gpu = 0; for(int i = 0; i < argc && argv; i++) { if(!argv[i]) continue; if(strstr(argv[i], "cpu")) use_gpu = 0; else if(strstr(argv[i], "gpu")) use_gpu = 1; } // start logs shrSetLogFileName ("oclDXTCompression.txt"); shrLog("%s Starting...\n\n", argv[0]); cl_platform_id cpPlatform = NULL; cl_uint uiNumDevices = 0; cl_device_id *cdDevices = NULL; cl_context cxGPUContext; cl_command_queue cqCommandQueue; cl_program cpProgram; cl_kernel ckKernel; cl_mem cmMemObjs[3]; cl_mem cmAlphaTable4, cmProds4; cl_mem cmAlphaTable3, cmProds3; size_t szGlobalWorkSize[1]; size_t szLocalWorkSize[1]; cl_int ciErrNum; // Get the path of the filename char *filename; if (shrGetCmdLineArgumentstr(argc, (const char **)argv, "image", &filename)) { image_filename = filename; } // load image const char* image_path = shrFindFilePath(image_filename, argv[0]); oclCheckError(image_path != NULL, shrTRUE); shrLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); oclCheckError(h_img != NULL, shrTRUE); shrLog("Loaded '%s', %d x %d pixels\n\n", image_path, width, height); // Convert linear image to block linear. const uint memSize = width * height * sizeof(cl_uint); uint* block_image = (uint*)malloc(memSize); // Convert linear image to block linear. 
for(uint by = 0; by < height/4; by++) { for(uint bx = 0; bx < width/4; bx++) { for (int i = 0; i < 16; i++) { const int x = i & 3; const int y = i / 4; block_image[(by * width/4 + bx) * 16 + i] = ((uint *)h_img)[(by * 4 + y) * 4 * (width/4) + bx * 4 + x]; } } } // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); // Get the platform's GPU devices ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, 0, NULL, &uiNumDevices); oclCheckError(ciErrNum, CL_SUCCESS); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, uiNumDevices, cdDevices, NULL); oclCheckError(ciErrNum, CL_SUCCESS); // Create the context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // get and log device cl_device_id device; if( shrCheckCmdLineFlag(argc, (const char **)argv, "device") ) { int device_nr = 0; shrGetCmdLineArgumenti(argc, (const char **)argv, "device", &device_nr); device = oclGetDev(cxGPUContext, device_nr); if( device == (cl_device_id)-1 ) { shrLog(" Invalid GPU Device: devID=%d. 
%d valid GPU devices detected\n\n", device_nr, uiNumDevices); shrLog(" exiting...\n"); return -1; } } else { device = oclGetMaxFlopsDev(cxGPUContext); } oclPrintDevName(LOGBOTH, device); shrLog("\n"); // create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Memory Setup // Constants cmAlphaTable4 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_float), (void*)&alphaTable4[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); cmProds4 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_int), (void*)&prods4[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); cmAlphaTable3 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_float), (void*)&alphaTable3[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); cmProds3 = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 4 * sizeof(cl_int), (void*)&prods3[0], &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Compute permutations. cl_uint permutations[1024]; computePermutations(permutations); // Upload permutations. 
cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * 1024, permutations, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Image cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, memSize, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Result const uint compressedSize = (width / 4) * (height / 4) * 8; cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, compressedSize, NULL , &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); unsigned int * h_result = (uint*)malloc(compressedSize); // Program Setup size_t program_length; const char* source_path = shrFindFilePath("DXTCompression.cl", argv[0]); oclCheckError(source_path != NULL, shrTRUE); char *source = oclLoadProgSource(source_path, "", &program_length); oclCheckError(source != NULL, shrTRUE); // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **) &source, &program_length, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDXTCompression.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } // create the kernel ckKernel = clCreateKernel(cpProgram, "compress", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // set the args values ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &cmMemObjs[0]); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &cmMemObjs[1]); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void *) &cmMemObjs[2]); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&cmAlphaTable4); ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&cmProds4); ciErrNum |= 
clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&cmAlphaTable3); ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(cl_mem), (void*)&cmProds3); oclCheckError(ciErrNum, CL_SUCCESS); // Copy input data host to device clEnqueueWriteBuffer(cqCommandQueue, cmMemObjs[1], CL_FALSE, 0, sizeof(cl_uint) * width * height, block_image, 0,0,0); // Determine launch configuration and run timed computation numIterations times int blocks = ((width + 3) / 4) * ((height + 3) / 4); // rounds up by 1 block in each dim if %4 != 0 // Restrict the numbers of blocks to launch on low end GPUs to avoid kernel timeout cl_uint compute_units; clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL); int blocksPerLaunch = MIN(blocks, 768 * (int)compute_units); // set work-item dimensions szGlobalWorkSize[0] = blocksPerLaunch * NUM_THREADS; szLocalWorkSize[0]= NUM_THREADS; #ifdef GPU_PROFILING shrLog("\nRunning DXT Compression on %u x %u image...\n", width, height); shrLog("\n%u Workgroups, %u Work Items per Workgroup, %u Work Items in NDRange...\n\n", blocks, NUM_THREADS, blocks * NUM_THREADS); int numIterations = 50; for (int i = -1; i < numIterations; ++i) { if (i == 0) { // start timing only after the first warmup iteration clFinish(cqCommandQueue); // flush command queue shrDeltaT(0); // start timer } #endif // execute kernel for( int j=0; j<blocks; j+= blocksPerLaunch ) { clSetKernelArg(ckKernel, 7, sizeof(int), &j); szGlobalWorkSize[0] = MIN( blocksPerLaunch, blocks-j ) * NUM_THREADS; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); } #ifdef GPU_PROFILING } clFinish(cqCommandQueue); double dAvgTime = shrDeltaT(0) / (double)numIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclDXTCompression, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %d\n", (1.0e-6 * (double)(width * height)/ dAvgTime), dAvgTime, 
(width * height), 1, szLocalWorkSize[0]); #endif // blocking read output ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmMemObjs[2], CL_TRUE, 0, compressedSize, h_result, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); // Write DDS file. FILE* fp = NULL; char output_filename[1024]; #ifdef WIN32 strcpy_s(output_filename, 1024, image_path); strcpy_s(output_filename + strlen(image_path) - 3, 1024 - strlen(image_path) + 3, "dds"); fopen_s(&fp, output_filename, "wb"); #else strcpy(output_filename, image_path); strcpy(output_filename + strlen(image_path) - 3, "dds"); fp = fopen(output_filename, "wb"); #endif oclCheckError(fp != NULL, shrTRUE); DDSHeader header; header.fourcc = FOURCC_DDS; header.size = 124; header.flags = (DDSD_WIDTH|DDSD_HEIGHT|DDSD_CAPS|DDSD_PIXELFORMAT|DDSD_LINEARSIZE); header.height = height; header.width = width; header.pitch = compressedSize; header.depth = 0; header.mipmapcount = 0; memset(header.reserved, 0, sizeof(header.reserved)); header.pf.size = 32; header.pf.flags = DDPF_FOURCC; header.pf.fourcc = FOURCC_DXT1; header.pf.bitcount = 0; header.pf.rmask = 0; header.pf.gmask = 0; header.pf.bmask = 0; header.pf.amask = 0; header.caps.caps1 = DDSCAPS_TEXTURE; header.caps.caps2 = 0; header.caps.caps3 = 0; header.caps.caps4 = 0; header.notused = 0; fwrite(&header, sizeof(DDSHeader), 1, fp); fwrite(h_result, compressedSize, 1, fp); fclose(fp); // Make sure the generated image matches the reference image (regression check) shrLog("\nComparing against Host/C++ computation...\n"); const char* reference_image_path = shrFindFilePath(refimage_filename, argv[0]); oclCheckError(reference_image_path != NULL, shrTRUE); // read in the reference image from file #ifdef WIN32 fopen_s(&fp, reference_image_path, "rb"); #else fp = fopen(reference_image_path, "rb"); #endif oclCheckError(fp != NULL, shrTRUE); fseek(fp, sizeof(DDSHeader), SEEK_SET); uint referenceSize = (width / 4) * (height / 4) * 8; uint * reference = (uint *)malloc(referenceSize); 
fread(reference, referenceSize, 1, fp); fclose(fp); // compare the reference image data to the sample/generated image float rms = 0; for (uint y = 0; y < height; y += 4) { for (uint x = 0; x < width; x += 4) { // binary comparison of data uint referenceBlockIdx = ((y/4) * (width/4) + (x/4)); uint resultBlockIdx = ((y/4) * (width/4) + (x/4)); int cmp = compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); // log deviations, if any if (cmp != 0.0f) { compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); shrLog("Deviation at (%d, %d):\t%f rms\n", x/4, y/4, float(cmp)/16/3); } rms += cmp; } } rms /= width * height * 3; shrLog("RMS(reference, result) = %f\n\n", rms); // Free OpenCL resources oclDeleteMemObjs(cmMemObjs, 3); clReleaseMemObject(cmAlphaTable4); clReleaseMemObject(cmProds4); clReleaseMemObject(cmAlphaTable3); clReleaseMemObject(cmProds3); clReleaseKernel(ckKernel); clReleaseProgram(cpProgram); clReleaseCommandQueue(cqCommandQueue); clReleaseContext(cxGPUContext); // Free host memory free(source); free(h_img); // finish shrQAFinishExit(argc, (const char **)argv, (rms <= ERROR_THRESHOLD) ? QA_PASSED : QA_FAILED); }
// Function to read in kernel from uncompiled source, create the OCL program and build the OCL program // ************************************************************************************************** int CreateProgramAndKernel(cl_context cxGPUContext, cl_device_id* cdDevices, const char *kernel_name, cl_kernel *kernel, bool bDouble) { cl_program cpProgram; size_t szSourceLen; cl_int ciErrNum = CL_SUCCESS; // Read the kernel in from file shrLog("\nLoading Uncompiled kernel from .cl file, using %s\n", clSourcefile); char* cPathAndFile = shrFindFilePath(clSourcefile, cExecutablePath); oclCheckError(cPathAndFile != NULL, shrTRUE); char* pcSource = oclLoadProgSource(cPathAndFile, "", &szSourceLen); oclCheckError(pcSource != NULL, shrTRUE); // Check OpenCL version -> vec3 types are supported only from version 1.1 and above char cOCLVersion[32]; clGetDeviceInfo(cdDevices[0], CL_DEVICE_VERSION, sizeof(cOCLVersion), &cOCLVersion, 0); int iVec3Length = 3; if( strncmp("OpenCL 1.0", cOCLVersion, 10) == 0 ) { iVec3Length = 4; } //for double precision char *pcSourceForDouble; std::stringstream header; if (bDouble) { header << "#define REAL double"; header << std::endl; header << "#define REAL4 double4"; header << std::endl; header << "#define REAL3 double" << iVec3Length; header << std::endl; header << "#define ZERO3 {0.0, 0.0, 0.0" << ((iVec3Length == 4) ? ", 0.0}" : "}"); header << std::endl; } else { header << "#define REAL float"; header << std::endl; header << "#define REAL4 float4"; header << std::endl; header << "#define REAL3 float" << iVec3Length; header << std::endl; header << "#define ZERO3 {0.0f, 0.0f, 0.0f" << ((iVec3Length == 4) ? 
", 0.0f}" : "}"); header << std::endl; } header << pcSource; pcSourceForDouble = (char *)malloc(header.str().size() + 1); szSourceLen = header.str().size(); #ifdef WIN32 strcpy_s(pcSourceForDouble, szSourceLen + 1, header.str().c_str()); #else strcpy(pcSourceForDouble, header.str().c_str()); #endif // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&pcSourceForDouble, &szSourceLen, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateProgramWithSource\n"); // Build the program with 'mad' Optimization option #ifdef MAC char *flags = "-cl-fast-relaxed-math -DMAC"; #else char *flags = "-cl-fast-relaxed-math"; #endif ciErrNum = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclNbody.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } shrLog("clBuildProgram\n"); // create the kernel *kernel = clCreateKernel(cpProgram, kernel_name, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateKernel\n"); size_t wgSize; ciErrNum = clGetKernelWorkGroupInfo(*kernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL); if (wgSize == 64) { shrLog( "ERROR: Minimum work-group size 256 required by this application is not supported on this device.\n"); exit(0); } free(pcSourceForDouble); return 0; }
//////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { cl_platform_id cpPlatform; //OpenCL platform cl_device_id cdDevice; //OpenCL device cl_context cxGPUContext; //OpenCL context cl_command_queue cqCommandQueue; //OpenCL command que cl_mem d_Input, d_Output; //OpenCL memory buffer objects cl_int ciErrNum; float *h_Input, *h_OutputCPU, *h_OutputGPU; const uint imageW = 2048, imageH = 2048, stride = 2048; const int dir = DCT_FORWARD; shrQAStart(argc, argv); int use_gpu = 0; for(int i = 0; i < argc && argv; i++) { if(!argv[i]) continue; if(strstr(argv[i], "cpu")) use_gpu = 0; else if(strstr(argv[i], "gpu")) use_gpu = 1; } // set logfile name and start logs shrSetLogFileName ("oclDCT8x8.txt"); shrLog("%s Starting...\n\n", argv[0]); shrLog("Allocating and initializing host memory...\n"); h_Input = (float *)malloc(imageH * stride * sizeof(float)); h_OutputCPU = (float *)malloc(imageH * stride * sizeof(float)); h_OutputGPU = (float *)malloc(imageH * stride * sizeof(float)); srand(2009); for(uint i = 0; i < imageH; i++) for(uint j = 0; j < imageW; j++) h_Input[i * stride + j] = (float)rand() / (float)RAND_MAX; shrLog("Initializing OpenCL...\n"); //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); //Get a GPU device ciErrNum = clGetDeviceIDs(cpPlatform, use_gpu?CL_DEVICE_TYPE_GPU:CL_DEVICE_TYPE_CPU, 1, &cdDevice, NULL); oclCheckError(ciErrNum, CL_SUCCESS); //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); //Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Initializing OpenCL DCT 8x8...\n"); initDCT8x8(cxGPUContext, cqCommandQueue, (const char **)argv); shrLog("Creating 
OpenCL memory objects...\n"); d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageH * stride * sizeof(cl_float), h_Input, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageH * stride * sizeof(cl_float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Performing DCT8x8 of %u x %u image...\n\n", imageH, imageW); //Just a single iteration or a warmup iteration DCT8x8( cqCommandQueue, d_Output, d_Input, stride, imageH, imageW, dir ); #ifdef GPU_PROFILING const int numIterations = 16; cl_event startMark, endMark; ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); shrDeltaT(0); for(int iter = 0; iter < numIterations; iter++) DCT8x8( NULL, d_Output, d_Input, stride, imageH, imageW, dir ); ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); //Calculate performance metrics by wallclock time double gpuTime = shrDeltaT(0) / (double)numIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclDCT8x8, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0); //Get profiler time cl_ulong startTime = 0, endTime = 0; ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL); ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations); #endif shrLog("Reading back OpenCL results...\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageH * stride * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Comparing 
against Host/C++ computation...\n"); DCT8x8CPU(h_OutputCPU, h_Input, stride, imageH, imageW, dir); double sum = 0, delta = 0; double L2norm; for(uint i = 0; i < imageH; i++) for(uint j = 0; j < imageW; j++){ sum += h_OutputCPU[i * stride + j] * h_OutputCPU[i * stride + j]; delta += (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]) * (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]); } L2norm = sqrt(delta / sum); shrLog("Relative L2 norm: %.3e\n\n", L2norm); shrLog("Shutting down...\n"); //Release kernels and program closeDCT8x8(); //Release other OpenCL objects ciErrNum = clReleaseMemObject(d_Output); ciErrNum |= clReleaseMemObject(d_Input); ciErrNum |= clReleaseCommandQueue(cqCommandQueue); ciErrNum |= clReleaseContext(cxGPUContext); oclCheckError(ciErrNum, CL_SUCCESS); //Release host buffers free(h_OutputGPU); free(h_OutputCPU); free(h_Input); //Finish shrQAFinishExit(argc, (const char **)argv, (L2norm < 1E-6) ? QA_PASSED : QA_FAILED); }
/**
 * Helper that loads an OpenCL source program, builds it, logs the per-device
 * build status/log, and returns a handle to the built program.
 * @param sourcePath - full path to the source file
 * @param options - build options, e.g. define flags("-D name=value")
 * @return the built cl_program handle (also recorded in this->programs);
 *         throws a boost-wrapped runtime_error on load/create/build failure
 */
cl_program CLState::compileOCLProgram(const char* sourcePath, const std::string& options)
{
    cl_int errNum;
    size_t program_length;

    oclCheckError(sourcePath != NULL, shrTRUE);

    // Load the raw .cl source text
    char *source = oclLoadProgSource(sourcePath, "", &program_length);
    if (!source)
    {
        shrLog("Error: Failed to load compute program %s!\n", sourcePath);
        BOOST_THROW_EXCEPTION(
            runtime_error() << error_message(
                (std::string("Error: Failed to load cl program source from ") + sourcePath).c_str()));
    }

    // create the simple increment OpenCL program
    cl_program program = clCreateProgramWithSource(context, 1,
            (const char **) &source, &program_length, &errNum);
    // FIX: free before the error check so the throw path below no longer leaks the source buffer
    free(source);
    if (errNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create program\n");
        BOOST_THROW_EXCEPTION(runtime_error() << cl_error_code(errNum));
    }
    else
    {
        // FIX: size_t is not a valid %d argument on LP64; pass a plain int
        shrLog("clCreateProgramWithSource <%s> succeeded, program_length=%d\n",
               sourcePath, (int)program_length);
    }

    // build the program
    // FIX: a separating space is required before the user options; the original
    // concatenation glued them directly onto "-cl-nv-verbose"
    cl_build_status build_status = CL_SUCCESS;
    errNum = clBuildProgram(program, 0, NULL,
            std::string("-cl-fast-relaxed-math -cl-nv-verbose " + options).c_str(),
            NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then return error
        shrLogEx(LOGBOTH | ERRORMSG, errNum, STDERROR);
        oclLogBuildInfo(program, oclGetFirstDev(context));
        oclLogPtx(program, oclGetFirstDev(context), "build_error.ptx");
        clReleaseProgram(program); // FIX: don't leak the program object on the throw path
        BOOST_THROW_EXCEPTION(runtime_error() << cl_error_code(errNum));
    }
    else
    {
        shrLog("clBuildProgram <%s> succeeded\n", sourcePath);
        if (this->execDevices != NULL)
        {
            // Report build status and build log for every execution device;
            // stop early if any query or build status reports a failure
            for (uint iDevice = 0;
                 iDevice < this->execDeviceCount && build_status == CL_SUCCESS && errNum == CL_SUCCESS;
                 iDevice++)
            {
                cl_device_id device = this->execDevices[iDevice];
                errNum = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
                        sizeof(cl_build_status), &build_status, NULL);
                shrLog("clGetProgramBuildInfo returned: ");
                if (build_status == CL_SUCCESS)
                {
                    shrLog("CL_SUCCESS\n");
                }
                else
                {
                    shrLog("CLErrorNumber = %d\n", errNum);
                }

                // print out the build log, note in the case where there is nothing shown, some OpenCL PTX->SASS caching has happened
                {
                    char *build_log;
                    size_t ret_val_size;
                    errNum = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                            0, NULL, &ret_val_size);
                    if (errNum != CL_SUCCESS)
                    {
                        // FIX: pass the device *index* — a cl_device_id handle is not a valid %d argument
                        shrLog("clGetProgramBuildInfo device %d, failed to get the log size at line %d\n",
                               (int)iDevice, __LINE__);
                    }
                    build_log = (char *) malloc(ret_val_size + 1);
                    errNum = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                            ret_val_size, build_log, NULL);
                    if (errNum != CL_SUCCESS)
                    {
                        shrLog("clGetProgramBuildInfo device %d, failed to get the build log at line %d\n",
                               (int)iDevice, __LINE__);
                    }
                    // to be careful, terminate with \0
                    // there's no information in the reference whether the string is 0 terminated or not
                    build_log[ret_val_size] = '\0';
                    shrLog("%s\n", build_log);
                    free(build_log); // FIX: the log buffer was leaked on every iteration
                }
            }
        }
    }

    this->programs.push_back(program);
    return program;
}
////////////////////////////////////////////////////////////////////////////// //! Releases the global context and command queue //! //! @param clContext the context to release //! @param clQueue the command queue to release ////////////////////////////////////////////////////////////////////////////// void oclShutdown(cl_context clContext, cl_command_queue clQueue) { cl_int ciErrNum; ciErrNum = clReleaseCommandQueue(clQueue); ciErrNum |= clReleaseContext(clContext); oclCheckError(ciErrNum, CL_SUCCESS); }
void IntegrateNbodySystem(cl_command_queue cqCommandQueue, cl_kernel MT_kernel, cl_kernel noMT_kernel, cl_mem newPos, cl_mem newVel, cl_mem oldPos, cl_mem oldVel, cl_mem pboCLOldPos, cl_mem pboCLNewPos, float deltaTime, float damping, float softSq, int numBodies, int p, int q, int bUsePBO, bool bDouble) { int sharedMemSize; //for double precision if (bDouble) { sharedMemSize = p * q * sizeof(cl_double4); // 4 doubles for pos } else { sharedMemSize = p * q * sizeof(cl_float4); // 4 floats for pos } size_t global_work_size[2]; size_t local_work_size[2]; cl_int ciErrNum = CL_SUCCESS; cl_kernel kernel; // When the numBodies / thread block size is < # multiprocessors // (16 on G80), the GPU is underutilized. For example, with 256 threads per // block and 1024 bodies, there will only be 4 thread blocks, so the // GPU will only be 25% utilized. To improve this, we use multiple threads // per body. We still can use blocks of 256 threads, but they are arranged // in q rows of p threads each. Each thread processes 1/q of the forces // that affect each body, and then 1/q of the threads (those with // threadIdx.y==0) add up the partial sums from the other threads for that // body. To enable this, use the "--p=" and "--q=" command line options to // this example. e.g.: "nbody.exe --n=1024 --p=64 --q=4" will use 4 // threads per body and 256 threads per block. There will be n/p = 16 // blocks, so a G80 GPU will be 100% utilized. 
if (q == 1) { kernel = MT_kernel; } else { kernel = noMT_kernel; } if (bUsePBO) { ciErrNum = clEnqueueAcquireGLObjects(cqCommandQueue, 1, &pboCLOldPos, 0, NULL, NULL); ciErrNum |= clEnqueueAcquireGLObjects(cqCommandQueue, 1, &pboCLNewPos, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); } ciErrNum |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&newPos); ciErrNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&newVel); ciErrNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&oldPos); ciErrNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&oldVel); if (bDouble) { double ddeltaTime = (double)deltaTime; double ddamping = (double)damping; double dsoftSq = (double)softSq; ciErrNum |= clSetKernelArg(kernel, 4, sizeof(cl_double), (void *)&ddeltaTime); ciErrNum |= clSetKernelArg(kernel, 5, sizeof(cl_double), (void *)&ddamping); ciErrNum |= clSetKernelArg(kernel, 6, sizeof(cl_double), (void *)&dsoftSq); } else { ciErrNum |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void *)&deltaTime); ciErrNum |= clSetKernelArg(kernel, 5, sizeof(cl_float), (void *)&damping); ciErrNum |= clSetKernelArg(kernel, 6, sizeof(cl_float), (void *)&softSq); } ciErrNum |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&numBodies); ciErrNum |= clSetKernelArg(kernel, 8, sharedMemSize, NULL); oclCheckError(ciErrNum, CL_SUCCESS); // set work-item dimensions local_work_size[0] = p; local_work_size[1] = q; global_work_size[0]= numBodies; global_work_size[1]= q; // execute the kernel: ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); if (bUsePBO) { ciErrNum = clEnqueueReleaseGLObjects(cqCommandQueue, 1, &pboCLNewPos, 0, NULL, NULL); ciErrNum |= clEnqueueReleaseGLObjects(cqCommandQueue, 1, &pboCLOldPos, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); } }