void getIntensitySumFromOpenCLImage(OpenCLDevice::pointer device, cl::Image2D image, DataType type, float* sum) { // Get power of two size unsigned int powerOfTwoSize = getPowerOfTwoSize(std::max(image.getImageInfo<CL_IMAGE_WIDTH>(), image.getImageInfo<CL_IMAGE_HEIGHT>())); // Create image levels unsigned int size = powerOfTwoSize; size /= 2; std::vector<cl::Image2D> levels; while(size >= 4) { cl::Image2D level = cl::Image2D(device->getContext(), CL_MEM_READ_WRITE, getOpenCLImageFormat(device, CL_MEM_OBJECT_IMAGE2D, TYPE_FLOAT, 1), size, size); levels.push_back(level); size /= 2; } // Compile OpenCL code std::string buildOptions = ""; switch(type) { case TYPE_FLOAT: buildOptions = "-DTYPE_FLOAT"; break; case TYPE_UINT8: buildOptions = "-DTYPE_UINT8"; break; case TYPE_INT8: buildOptions = "-DTYPE_INT8"; break; case TYPE_UINT16: buildOptions = "-DTYPE_UINT16"; break; case TYPE_INT16: buildOptions = "-DTYPE_INT16"; break; } std::string sourceFilename = std::string(FAST_SOURCE_DIR) + "/ImageSum.cl"; std::string programName = sourceFilename + buildOptions; // Only create program if it doesn't exist for this device from before if(!device->hasProgram(programName)) device->createProgramFromSourceWithName(programName, sourceFilename, buildOptions); cl::Program program = device->getProgram(programName); cl::CommandQueue queue = device->getCommandQueue(); // Fill first level size = powerOfTwoSize/2; cl::Kernel firstLevel(program, "createFirstSumImage2DLevel"); firstLevel.setArg(0, image); firstLevel.setArg(1, levels[0]); queue.enqueueNDRangeKernel( firstLevel, cl::NullRange, cl::NDRange(size,size), cl::NullRange ); // Fill all other levels cl::Kernel createLevel(program, "createSumImage2DLevel"); int i = 0; size /= 2; while(size >= 4) { createLevel.setArg(0, levels[i]); createLevel.setArg(1, levels[i+1]); queue.enqueueNDRangeKernel( createLevel, cl::NullRange, cl::NDRange(size,size), cl::NullRange ); i++; size /= 2; } // Get result from the last level unsigned int nrOfElements = 4*4; unsigned int nrOfComponents = getOpenCLImageFormat(device, CL_MEM_OBJECT_IMAGE2D, TYPE_FLOAT, 1).image_channel_order == CL_RGBA ? 4 : 1; float* result = (float*)allocateDataArray(nrOfElements,TYPE_FLOAT,nrOfComponents); queue.enqueueReadImage(levels[levels.size()-1],CL_TRUE,createOrigoRegion(),createRegion(4,4,1),0,0,result); *sum = getSumFromOpenCLImageResult<float>(result, nrOfElements, nrOfComponents); delete[] result; }
void getMaxAndMinFromOpenCLBuffer(OpenCLDevice::pointer device, cl::Buffer buffer, unsigned int size, DataType type, float* min, float* max) { // Compile OpenCL code std::string buildOptions = ""; switch(type) { case TYPE_FLOAT: buildOptions = "-DTYPE_FLOAT"; break; case TYPE_UINT8: buildOptions = "-DTYPE_UINT8"; break; case TYPE_INT8: buildOptions = "-DTYPE_INT8"; break; case TYPE_UINT16: buildOptions = "-DTYPE_UINT16"; break; case TYPE_INT16: buildOptions = "-DTYPE_INT16"; break; } std::string sourceFilename = std::string(FAST_SOURCE_DIR) + "/ImageMinMax.cl"; std::string programName = sourceFilename + buildOptions; // Only create program if it doesn't exist for this device from before if(!device->hasProgram(programName)) device->createProgramFromSourceWithName(programName, sourceFilename, buildOptions); cl::Program program = device->getProgram(programName); cl::CommandQueue queue = device->getCommandQueue(); // Nr of work groups must be set so that work-group size does not exceed max work-group size (256 on AMD) int length = size; cl::Kernel reduce(program, "reduce"); cl::Buffer current = buffer; cl::Buffer clResult; int workGroupSize = 256; int workGroups = 256; int X = ceil((float)length / (workGroups*workGroupSize)); clResult = cl::Buffer(device->getContext(), CL_MEM_READ_WRITE, getSizeOfDataType(type,1)*workGroups*2); reduce.setArg(0, current); reduce.setArg(1, workGroupSize * getSizeOfDataType(type,1), NULL); reduce.setArg(2, workGroupSize * getSizeOfDataType(type,1), NULL); reduce.setArg(3, size); reduce.setArg(4, X); reduce.setArg(5, clResult); queue.enqueueNDRangeKernel( reduce, cl::NullRange, cl::NDRange(workGroups*workGroupSize), cl::NDRange(workGroupSize) ); length = workGroups; void* result = allocateDataArray(length, type, 2); unsigned int nrOfElements = length; queue.enqueueReadBuffer(clResult,CL_TRUE,0,getSizeOfDataType(type,1)*workGroups*2,result); switch(type) { case TYPE_FLOAT: getMaxAndMinFromOpenCLImageResult<float>(result, nrOfElements, 2, min, max); break; case TYPE_INT8: getMaxAndMinFromOpenCLImageResult<char>(result, nrOfElements, 2, min, max); break; case TYPE_UINT8: getMaxAndMinFromOpenCLImageResult<uchar>(result, nrOfElements, 2, min, max); break; case TYPE_INT16: getMaxAndMinFromOpenCLImageResult<short>(result, nrOfElements, 2, min, max); break; case TYPE_UINT16: getMaxAndMinFromOpenCLImageResult<ushort>(result, nrOfElements, 2, min, max); break; } deleteArray(result, type); }