template <typename T> cl_int DWTKernel<T>::run(T* in, int sizeX, int sizeY, int levels){ if (!in) return CL_INVALID_VALUE; cl_int error_code; cl_context context = NULL; // Obtain the OpenCL context from the command-queue properties error_code = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, NULL); if (CL_SUCCESS != error_code) { LogError("Error: clGetCommandQueueInfo (CL_QUEUE_CONTEXT) returned %s.\n", TranslateOpenCLError(error_code)); return error_code; } // allocate memory on device srcMem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeX * sizeY * sizeof(T), in, &error_code); if (CL_SUCCESS != error_code) { LogError("Error: clCreateBuffer (in) returned %s.\n", TranslateOpenCLError(error_code)); return error_code; } dstMem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeX * sizeY * sizeof(T), NULL, &error_code); if (CL_SUCCESS != error_code) { LogError("Error: clCreateBuffer (out) returned %s.\n", TranslateOpenCLError(error_code)); return error_code; } ownsMemory = true; run(srcMem, dstMem, sizeX, sizeY, levels); return CL_SUCCESS; }
/* * Set kernel arguments */ cl_uint SetKernelArguments(ocl_args_d_t *ocl) { cl_int err = CL_SUCCESS; err = clSetKernelArg(ocl->kernel, 0, sizeof(cl_mem), (void *)&ocl->srcA); if (CL_SUCCESS != err) { LogError("error: Failed to set argument srcA, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl->kernel, 1, sizeof(cl_mem), (void *)&ocl->srcB); if (CL_SUCCESS != err) { LogError("Error: Failed to set argument srcB, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl->kernel, 2, sizeof(cl_mem), (void *)&ocl->dstMem); if (CL_SUCCESS != err) { LogError("Error: Failed to set argument dstMem, returned %s\n", TranslateOpenCLError(err)); return err; } return err; }
/* * Execute the kernel */ cl_uint ExecuteAddKernel(ocl_args_d_t *ocl, cl_uint width, cl_uint height) { cl_int err = CL_SUCCESS; // Define global iteration space for clEnqueueNDRangeKernel. size_t globalWorkSize[2] = { width, height }; // execute kernel err = clEnqueueNDRangeKernel(ocl->commandQueue, ocl->kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (CL_SUCCESS != err) { LogError("Error: Failed to run kernel, return %s\n", TranslateOpenCLError(err)); return err; } // Wait until the queued kernel is completed by the device err = clFinish(ocl->commandQueue); if (CL_SUCCESS != err) { LogError("Error: clFinish return %s\n", TranslateOpenCLError(err)); return err; } return CL_SUCCESS; }
/* * Create and build OpenCL program from its source code */ int CreateAndBuildProgram(ocl_args_d_t *ocl) { cl_int err = CL_SUCCESS; // Upload the OpenCL C source code from the input file to source // The size of the C program is returned in sourceSize char* source = NULL; size_t src_size = 0; err = ReadSourceFromFile("Template.cl", &source, &src_size); if (CL_SUCCESS != err) { LogError("Error: ReadSourceFromFile returned %s.\n", TranslateOpenCLError(err)); goto Finish; } // And now after you obtained a regular C string call clCreateProgramWithSource to create OpenCL program object. ocl->program = clCreateProgramWithSource(ocl->context, 1, (const char**)&source, &src_size, &err); if (CL_SUCCESS != err) { LogError("Error: clCreateProgramWithSource returned %s.\n", TranslateOpenCLError(err)); goto Finish; } // Build the program // During creation a program is not built. You need to explicitly call build function. // Here you just use create-build sequence, // but there are also other possibilities when program consist of several parts, // some of which are libraries, and you may want to consider using clCompileProgram and clLinkProgram as // alternatives. err = clBuildProgram(ocl->program, 1, &ocl->device, "", NULL, NULL); if (CL_SUCCESS != err) { LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); // In case of error print the build log to the standard output // First check the size of the log // Then allocate the memory and obtain the log from the program if (err == CL_BUILD_PROGRAM_FAILURE) { size_t log_size = 0; clGetProgramBuildInfo(ocl->program, ocl->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); std::vector<char> build_log(log_size); clGetProgramBuildInfo(ocl->program, ocl->device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL); LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]); } } Finish: if (source) { delete[] source; source = NULL; } return err; }
cl_uint CreateBufferArguments() { cl_int err = CL_SUCCESS; // Create new OpenCL buffer objects // As these buffer are used only for read by the kernel, you are recommended to create it with flag CL_MEM_READ_ONLY. // Always set minimal read/write flags for buffers, it may lead to better performance because it allows runtime // to better organize data copying. // You use CL_MEM_COPY_HOST_PTR here, because the buffers should be populated with bytes at inputA and inputB. ocl.Lights = clCreateBuffer(ocl.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(RectangleLight) * masterSet.LightCount, masterSet.m_rectLight, &err); if (CL_SUCCESS != err) { printf("Error: clCreateBuffer for Lights returned %s\n", TranslateOpenCLError(err)); return err; } ocl.LightCount = masterSet.LightCount; ocl.Shapes = clCreateBuffer(ocl.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(Plane) * masterSet.PlaneCount, masterSet.m_plane, &err); if (CL_SUCCESS != err) { printf("Error: clCreateBuffer for Shapes returned %s\n", TranslateOpenCLError(err)); return err; } ocl.ShapeCount = masterSet.PlaneCount; ocl.sampleCount = SampleCount; ocl.cam = clCreateBuffer(ocl.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(Camera), &cam, &err); if (CL_SUCCESS != err) { printf("Error: clCreateBuffer for cam returned %s\n", TranslateOpenCLError(err)); return err; } ocl.width = Width; ocl.height = Height; /* ocl.Pixels = clCreateBuffer(ocl.context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uint) * WorkAmount, NULL, &err); if (CL_SUCCESS != err) { printf("Error: clCreateBuffer for Pixels returned %s\n", TranslateOpenCLError(err)); return err; } ocl.Seeds = clCreateBuffer(ocl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint) * WorkAmount * 2, Seeds, &err); if (CL_SUCCESS != err) { printf("Error: clCreateBuffer for Seeds returned %s\n", TranslateOpenCLError(err)); return err; } */ return CL_SUCCESS; }
template <typename T> cl_int DWTKernel<T>::setWindowKernelArgs(int WIN_SX, int WIN_SY) { cl_int error_code = clSetKernelArg(myKernel, 0, sizeof(int), &WIN_SX); if (CL_SUCCESS != error_code) { LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code)); return error_code; } error_code = clSetKernelArg(myKernel, 1, sizeof(int), &WIN_SY); if (CL_SUCCESS != error_code) { LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code)); return error_code; } return CL_SUCCESS; }
template<typename T> tDeviceRC OCLDWT<T>::setKernelArgs(OCLKernel* myKernel,unsigned int width, unsigned int height,unsigned int steps, unsigned int level, unsigned int levels) { numKernelArgs = 0; cl_kernel targetKernel = myKernel->getKernel(); cl_int error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(cl_mem), memoryManager->getDwtIn(level)); if (DeviceSuccess != error_code) { LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code)); return error_code; } error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(cl_mem), (level < levels-1) ? memoryManager->getDwtIn(level+1) : memoryManager->getDWTOut() ); if (DeviceSuccess != error_code) { LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code)); return error_code; } error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(cl_mem), memoryManager->getDWTOut()); if (DeviceSuccess != error_code) { LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code)); return error_code; } error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(width), &width); if (DeviceSuccess != error_code) { LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code)); return error_code; } error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(height), &height); if (DeviceSuccess != error_code) { LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code)); return error_code; } error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(steps), &steps); if (DeviceSuccess != error_code) { LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code)); return error_code; } error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(level), &level); if (DeviceSuccess != error_code) { LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code)); return error_code; } error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(levels), &levels); if (DeviceSuccess != error_code) { LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code)); return error_code; } return DeviceSuccess; }
int OCLDeviceManager::init(eDeviceType type) { bool isCpu = type == CPU; ocl_args_d_t** oclArgs; if (isCpu) { if (ocl_cpu) return 0; ocl_cpu = new ocl_args_d_t(); oclArgs = &ocl_cpu; } else { if (ocl_gpu) return 0; ocl_gpu = new ocl_args_d_t(); oclArgs = &ocl_gpu; } data_args_d_t args; args.preferGpu = !isCpu; args.preferCpu = isCpu; args.vendorName = NULL; int error_code; error_code = InitOpenCL(*oclArgs, &args); if (CL_SUCCESS != error_code) { LogError("InitOpenCL returned %s.", TranslateOpenCLError(error_code)); delete *oclArgs; *oclArgs = NULL;; } return error_code; }
template <typename T> cl_int DWTKernel<T>::setImageSizeKernelArgs(int sx, int sy) { cl_int error_code = clSetKernelArg(myKernel, 5, sizeof(int), &sx); if (CL_SUCCESS != error_code) { LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code)); return error_code; } error_code = clSetKernelArg(myKernel, 6, sizeof(int), &sy); if (CL_SUCCESS != error_code) { LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code)); return error_code; } return CL_SUCCESS; }
template <typename T> tDeviceRC DWTKernel<T>::copyLLBandToSrc(int LLSizeX, int LLSizeY){ // copy forward or reverse transformed LL band from output back into the input size_t bufferOffset[] = { 0, 0, 0}; cl_int err = CL_SUCCESS; // The region size must be given in bytes size_t region[] = {LLSizeX * sizeof(T), LLSizeY, 1 }; err = clEnqueueCopyBufferRect ( queue, //copy command will be queued dstMem, srcMem, bufferOffset, //offset associated with src_buffer bufferOffset, //offset associated with src_buffer region, //(width, height, depth) in bytes of the 2D or 3D rectangle being copied region[0], //length of each row in bytes 0, //length of each 2D slice in bytes region[0] , //length of each row in bytes 0, //length of each 2D slice in bytes 0, NULL, NULL); if (CL_SUCCESS != err) { LogError("Error: clEnqueueCopyBufferRect (srcMem) returned %s.\n", TranslateOpenCLError(err)); } return err; }
cl_uint CreateAndBuildProgram() { cl_int err = CL_SUCCESS; // Upload the OpenCL C source code from the input file to source // The size of the C program is returned in sourceSize char* source = NULL; size_t src_size = 0; err = ReadSourceFromFile("ray_algorithm.cl", &source, &src_size); if (CL_SUCCESS != err) { printf("Error: ReadSourceFromFile returned %s.\n", TranslateOpenCLError(err)); goto Finish; } // And now after you obtained a regular C string call clCreateProgramWithSource to create OpenCL program object. ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err); if (CL_SUCCESS != err) { printf("Error: clCreateProgramWithSource returned %s.\n", TranslateOpenCLError(err)); goto Finish; } // Build the program // During creation a program is not built. You need to explicitly call build function. // Here you just use create-build sequence, // but there are also other possibilities when program consist of several parts, // some of which are libraries, and you may want to consider using clCompileProgram and clLinkProgram as // alternatives. err = clBuildProgram(ocl.program, 2, ocl.device, "", NULL, NULL); if (CL_SUCCESS != err) { printf("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err)); } Finish: if (source) { delete[] source; source = NULL; } return err; }
/* * "Read" the result buffer (mapping the buffer to the host memory address) */ bool ReadAndVerify(ocl_args_d_t *ocl, cl_uint width, cl_uint height, cl_int *inputA, cl_int *inputB) { cl_int err = CL_SUCCESS; bool result = true; // Enqueue a command to map the buffer object (ocl->dstMem) into the host address space and returns a pointer to it // The map operation is blocking cl_int *resultPtr = (cl_int *)clEnqueueMapBuffer(ocl->commandQueue, ocl->dstMem, true, CL_MAP_READ, 0, sizeof(cl_uint) * width * height, 0, NULL, NULL, &err); if (CL_SUCCESS != err) { LogError("Error: clEnqueueMapBuffer returned %s\n", TranslateOpenCLError(err)); return false; } // Call clFinish to guarantee that output region is updated err = clFinish(ocl->commandQueue); if (CL_SUCCESS != err) { LogError("Error: clFinish returned %s\n", TranslateOpenCLError(err)); } // We mapped dstMem to resultPtr, so resultPtr is ready and includes the kernel output !!! // Verify the results unsigned int size = width * height; for (unsigned int k = 0; k < size; ++k) { if (resultPtr[k] != inputA[k] + inputB[k]) { LogError("Verification failed at %d: (%d + %d = %d)\n", k, inputA[k], inputB[k], resultPtr[k]); result = false; } } // Unmapped the output buffer before releasing it err = clEnqueueUnmapMemObject(ocl->commandQueue, ocl->dstMem, resultPtr, 0, NULL, NULL); if (CL_SUCCESS != err) { LogError("Error: clEnqueueUnmapMemObject returned %s\n", TranslateOpenCLError(err)); } return result; }
template <typename T> T* DWTKernel<T>::mapOutputBufferToHost(){ cl_int error_code = CL_SUCCESS; void* hostPtr = clEnqueueMapBuffer(queue, dstMem, true, CL_MAP_READ, 0, dimX * dimY * sizeof(T), 0, NULL, NULL, &error_code); if (CL_SUCCESS != error_code) { LogError("Error: clEnqueueMapBuffer return %s.\n", TranslateOpenCLError(error_code)); } return (T*)hostPtr; }
template <typename T> cl_int DWTKernel<T>::run(cl_mem in, cl_mem out, int sizeX, int sizeY, int levels){ srcMem = in; dstMem = out; dimX = sizeX; dimY = sizeY; cl_int error_code = clSetKernelArg(myKernel, 3, sizeof(cl_mem), &srcMem); if (CL_SUCCESS != error_code) { LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code)); return error_code; } error_code = clSetKernelArg(myKernel, 4, sizeof(cl_mem), &dstMem); if (CL_SUCCESS != error_code) { LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code)); return error_code; } dwt(sizeX, sizeY, levels); return CL_SUCCESS; }
/* * Create OpenCL buffers from host memory * These buffers will be used later by the OpenCL kernel */ int CreateBufferArguments(ocl_args_d_t *ocl, cl_int* inputA, cl_int* inputB, cl_int* outputC, cl_uint arrayWidth, cl_uint arrayHeight) { cl_int err = CL_SUCCESS; // Create new OpenCL buffer objects // As these buffer are used only for read by the kernel, you are recommended to create it with flag CL_MEM_READ_ONLY. // Always set minimal read/write flags for buffers, it may lead to better performance because it allows runtime // to better organize data copying. // You use CL_MEM_COPY_HOST_PTR here, because the buffers should be populated with bytes at inputA and inputB. ocl->srcA = clCreateBuffer(ocl->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_uint) * arrayWidth * arrayHeight, inputA, &err); if (CL_SUCCESS != err) { LogError("Error: clCreateBuffer for srcA returned %s\n", TranslateOpenCLError(err)); return err; } ocl->srcB = clCreateBuffer(ocl->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_uint) * arrayWidth * arrayHeight, inputB, &err); if (CL_SUCCESS != err) { LogError("Error: clCreateBuffer for srcB returned %s\n", TranslateOpenCLError(err)); return err; } // If the output buffer is created directly on top of output buffer using CL_MEM_USE_HOST_PTR, // then, depending on the OpenCL runtime implementation and hardware capabilities, // it may save you not necessary data copying. // As it is known that output buffer will be write only, you explicitly declare it using CL_MEM_WRITE_ONLY. ocl->dstMem = clCreateBuffer(ocl->context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_uint) * arrayWidth * arrayHeight, outputC, &err); if (CL_SUCCESS != err) { LogError("Error: clCreateBuffer for dstMem returned %s\n", TranslateOpenCLError(err)); return err; } return CL_SUCCESS; }
template <typename T> DWTKernel<T>::~DWTKernel(void) { cl_int error_code = CL_SUCCESS; if (ownsMemory) { // free memory on device if (srcMem) { error_code = clReleaseMemObject(srcMem); if (CL_SUCCESS != error_code) { LogError("Error: clReleaseMemObject (input) returned %s.\n", TranslateOpenCLError(error_code)); } } if (dstMem) { error_code = clReleaseMemObject(dstMem); if (CL_SUCCESS != error_code) { LogError("Error: clReleaseMemObject (output) returned %s.\n", TranslateOpenCLError(error_code)); } } } }
/// Only computes optimal number of sliding window steps, /// number of threadblocks and then lanches the 5/3 FDWT kernel. /// @param WIN_SX width of sliding window /// @param WIN_SY height of sliding window /// @param in input image /// @param out output buffer /// @param sx width of the input image /// @param sy height of the input image template <typename T> void DWTKernel<T>::enqueue (int WIN_SX, int WIN_SY, const int sx, const int sy) { if (setWindowKernelArgs(WIN_SX, WIN_SY) != CL_SUCCESS) return; cl_int error_code = setImageSizeKernelArgs(sx, sy); if (CL_SUCCESS != error_code) { LogError("Error: setImageSizeKernelArgs returned %s.\n", TranslateOpenCLError(error_code)); return; } // allocate local data size_t localMemSize = calcTransformDataBufferSize(WIN_SX,WIN_SY) * sizeof(T); // Dynamically allocate local memory (allocated per workgroup) error_code = clSetKernelArg(myKernel, 2, localMemSize, NULL); if (CL_SUCCESS != error_code) { LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code)); return; } // compute optimal number of steps of each sliding window const int steps = divRndUp(sy, 15 * WIN_SY); error_code = clSetKernelArg(myKernel, 7, sizeof(T), &steps); if (CL_SUCCESS != error_code) { LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(error_code)); return; } size_t global_work_size[3] = {divRndUp(sx, WIN_SX) * WIN_SX, divRndUp(sy, WIN_SY * steps),1}; size_t local_work_size[3] = {WIN_SX,1,1}; DeviceKernel::enqueue(2,global_work_size, local_work_size); }
/** A note about resolution levels: For a transform with N resolution levels, resolution levels run from 0 up to N-1. **/ template<typename T> tDeviceRC OCLDWT<T>::setKernelArgsQuant(OCLKernel* myKernel, float quantLL, float quantLH, float quantHH) { cl_kernel targetKernel = myKernel->getKernel(); cl_int error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(quantLL), &quantLL); if (DeviceSuccess != error_code) { LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code)); return error_code; } error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(quantLH), &quantLH); if (DeviceSuccess != error_code) { LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code)); return error_code; } error_code = clSetKernelArg(targetKernel, numKernelArgs++, sizeof(quantHH), &quantHH); if (DeviceSuccess != error_code) { LogError("setKernelArgs returned %s.", TranslateOpenCLError(error_code)); return error_code; } return DeviceSuccess; }
cl_uint SetKernelArguments() { cl_int err = CL_SUCCESS; err = clSetKernelArg(ocl.kernel, 0, sizeof(cl_mem), (void *)&ocl.Lights); if (CL_SUCCESS != err) { printf("error: Failed to set argument Lights, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl.kernel, 1, sizeof(cl_uint), (void *)&ocl.LightCount); if (CL_SUCCESS != err) { printf("Error: Failed to set argument LightCount, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl.kernel, 2, sizeof(cl_mem), (void *)&ocl.Shapes); if (CL_SUCCESS != err) { printf("error: Failed to set argument Shapes, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl.kernel, 3, sizeof(cl_uint), (void *)&ocl.ShapeCount); if (CL_SUCCESS != err) { printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl.kernel, 4, sizeof(cl_uint), (void *)&ocl.sampleCount); if (CL_SUCCESS != err) { printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl.kernel, 5, sizeof(cl_uint), (void *)&ocl.width); if (CL_SUCCESS != err) { printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl.kernel, 6, sizeof(cl_uint), (void *)&ocl.height); if (CL_SUCCESS != err) { printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl.kernel, 7, sizeof(cl_mem), (void *)&ocl.cam); if (CL_SUCCESS != err) { printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err)); return err; } return err; }
/* * destructor - called only once * Release all OpenCL objects * This is a regular sequence of calls to deallocate all created OpenCL resources in bootstrapOpenCL. * * You may want to call these deallocation procedures in the middle of your application execution * (not at the end) if you don't further need OpenCL runtime. * You may want to do that in order to free some memory, for example, * or recreate OpenCL objects with different parameters. * */ ocl_args_d_t::~ocl_args_d_t() { cl_int err = CL_SUCCESS; if (kernel) { err = clReleaseKernel(kernel); if (CL_SUCCESS != err) { LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(err)); } } if (program) { err = clReleaseProgram(program); if (CL_SUCCESS != err) { LogError("Error: clReleaseProgram returned '%s'.\n", TranslateOpenCLError(err)); } } if (srcA) { err = clReleaseMemObject(srcA); if (CL_SUCCESS != err) { LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err)); } } if (srcB) { err = clReleaseMemObject(srcB); if (CL_SUCCESS != err) { LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err)); } } if (dstMem) { err = clReleaseMemObject(dstMem); if (CL_SUCCESS != err) { LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err)); } } if (commandQueue) { err = clReleaseCommandQueue(commandQueue); if (CL_SUCCESS != err) { LogError("Error: clReleaseCommandQueue returned '%s'.\n", TranslateOpenCLError(err)); } } if (device) { err = clReleaseDevice(device); if (CL_SUCCESS != err) { LogError("Error: clReleaseDevice returned '%s'.\n", TranslateOpenCLError(err)); } } if (context) { err = clReleaseContext(context); if (CL_SUCCESS != err) { LogError("Error: clReleaseContext returned '%s'.\n", TranslateOpenCLError(err)); } } /* * Note there is no procedure to deallocate platform * because it was not created at the startup, * but just queried from OpenCL runtime. */ }
/* * Check whether an OpenCL platform is the required platform * (based on the platform's name) */ bool CheckPreferredPlatformMatch(cl_platform_id platform, const char* preferredPlatform) { size_t stringLength = 0; cl_int err = CL_SUCCESS; bool match = false; // In order to read the platform's name, we first read the platform's name string length (param_value is NULL). // The value returned in stringLength err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, NULL, &stringLength); if (CL_SUCCESS != err) { LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_NAME length returned '%s'.\n", TranslateOpenCLError(err)); return false; } // Now, that we know the platform's name string length, we can allocate enough space before read it std::vector<char> platformName(stringLength); // Read the platform's name string // The read value returned in platformName err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, stringLength, &platformName[0], NULL); if (CL_SUCCESS != err) { LogError("Error: clGetplatform_ids() to get CL_PLATFORM_NAME returned %s.\n", TranslateOpenCLError(err)); return false; } // Now check if the platform's name is the required one if (strstr(&platformName[0], preferredPlatform) != 0) { // The checked platform is the one we're looking for match = true; } return match; }
/* * This function picks/creates necessary OpenCL objects which are needed. * The objects are: * OpenCL platform, device, context, and command queue. * * All these steps are needed to be performed once in a regular OpenCL application. * This happens before actual compute kernels calls are performed. * * For convenience, in this application you store all those basic OpenCL objects in structure ocl_args_d_t, * so this function populates fields of this structure, which is passed as parameter ocl. * Please, consider reviewing the fields before going further. * The structure definition is right in the beginning of this file. */ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType) { // The following variable stores return codes for all OpenCL calls. cl_int err = CL_SUCCESS; // Query for all available OpenCL platforms on the system // Here you enumerate all platforms and pick one which name has preferredPlatform as a sub-string cl_platform_id platformId = FindOpenCLPlatform("Intel", deviceType); if (NULL == platformId) { LogError("Error: Failed to find OpenCL platform.\n"); return CL_INVALID_VALUE; } // Create context with device of specified type. // Required device type is passed as function argument deviceType. // So you may use this function to create context for any CPU or GPU OpenCL device. // The creation is synchronized (pfn_notify is NULL) and NULL user_data cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformId, 0 }; ocl->context = clCreateContextFromType(contextProperties, deviceType, NULL, NULL, &err); if ((CL_SUCCESS != err) || (NULL == ocl->context)) { LogError("Couldn't create a context, clCreateContextFromType() returned '%s'.\n", TranslateOpenCLError(err)); return err; } // Query for OpenCL device which was used for context creation err = clGetContextInfo(ocl->context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &ocl->device, NULL); if (CL_SUCCESS != err) { LogError("Error: clGetContextInfo() to get list of devices returned %s.\n", TranslateOpenCLError(err)); return err; } // Read the OpenCL platform's version and the device OpenCL and OpenCL C versions GetPlatformAndDeviceVersion(platformId, ocl); // Create command queue. // OpenCL kernels are enqueued for execution to a particular device through special objects called command queues. // Command queue guarantees some ordering between calls and other OpenCL commands. // Here you create a simple in-order OpenCL command queue that doesn't allow execution of two kernels in parallel on a target device. #ifdef CL_VERSION_2_0 if (OPENCL_VERSION_2_0 == ocl->deviceVersion) { const cl_command_queue_properties properties[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 }; ocl->commandQueue = clCreateCommandQueueWithProperties(ocl->context, ocl->device, properties, &err); } else { // default behavior: OpenCL 1.2 cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err); } #else // default behavior: OpenCL 1.2 cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err); #endif if (CL_SUCCESS != err) { LogError("Error: clCreateCommandQueue() returned %s.\n", TranslateOpenCLError(err)); return err; } return CL_SUCCESS; }
/* * This function read the OpenCL platdorm and device versions * (using clGetxxxInfo API) and stores it in the ocl structure. * Later it will enable us to support both OpenCL 1.2 and 2.0 platforms and devices * in the same program. */ int GetPlatformAndDeviceVersion(cl_platform_id platformId, ocl_args_d_t *ocl) { cl_int err = CL_SUCCESS; // Read the platform's version string length (param_value is NULL). // The value returned in stringLength size_t stringLength = 0; err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, 0, NULL, &stringLength); if (CL_SUCCESS != err) { LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_VERSION length returned '%s'.\n", TranslateOpenCLError(err)); return err; } // Now, that we know the platform's version string length, we can allocate enough space before read it std::vector<char> platformVersion(stringLength); // Read the platform's version string // The read value returned in platformVersion err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, stringLength, &platformVersion[0], NULL); if (CL_SUCCESS != err) { LogError("Error: clGetplatform_ids() to get CL_PLATFORM_VERSION returned %s.\n", TranslateOpenCLError(err)); return err; } if (strstr(&platformVersion[0], "OpenCL 2.0") != NULL) { ocl->platformVersion = OPENCL_VERSION_2_0; } // Read the device's version string length (param_value is NULL). err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, 0, NULL, &stringLength); if (CL_SUCCESS != err) { LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION length returned '%s'.\n", TranslateOpenCLError(err)); return err; } // Now, that we know the device's version string length, we can allocate enough space before read it std::vector<char> deviceVersion(stringLength); // Read the device's version string // The read value returned in deviceVersion err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, stringLength, &deviceVersion[0], NULL); if (CL_SUCCESS != err) { LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION returned %s.\n", TranslateOpenCLError(err)); return err; } if (strstr(&deviceVersion[0], "OpenCL 2.0") != NULL) { ocl->deviceVersion = OPENCL_VERSION_2_0; } // Read the device's OpenCL C version string length (param_value is NULL). err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &stringLength); if (CL_SUCCESS != err) { LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION length returned '%s'.\n", TranslateOpenCLError(err)); return err; } // Now, that we know the device's OpenCL C version string length, we can allocate enough space before read it std::vector<char> compilerVersion(stringLength); // Read the device's OpenCL C version string // The read value returned in compilerVersion err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, stringLength, &compilerVersion[0], NULL); if (CL_SUCCESS != err) { LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION returned %s.\n", TranslateOpenCLError(err)); return err; } else if (strstr(&compilerVersion[0], "OpenCL C 2.0") != NULL) { ocl->compilerVersion = OPENCL_VERSION_2_0; } return err; }
/* * Find and return the preferred OpenCL platform * In case that preferredPlatform is NULL, the ID of the first discovered platform will be returned */ cl_platform_id FindOpenCLPlatform(const char* preferredPlatform, cl_device_type deviceType) { cl_uint numPlatforms = 0; cl_int err = CL_SUCCESS; // Get (in numPlatforms) the number of OpenCL platforms available // No platform ID will be return, since platforms is NULL err = clGetPlatformIDs(0, NULL, &numPlatforms); if (CL_SUCCESS != err) { LogError("Error: clGetplatform_ids() to get num platforms returned %s.\n", TranslateOpenCLError(err)); return NULL; } LogInfo("Number of available platforms: %u\n", numPlatforms); if (0 == numPlatforms) { LogError("Error: No platforms found!\n"); return NULL; } std::vector<cl_platform_id> platforms(numPlatforms); // Now, obtains a list of numPlatforms OpenCL platforms available // The list of platforms available will be returned in platforms err = clGetPlatformIDs(numPlatforms, &platforms[0], NULL); if (CL_SUCCESS != err) { LogError("Error: clGetplatform_ids() to get platforms returned %s.\n", TranslateOpenCLError(err)); return NULL; } // Check if one of the available platform matches the preferred requirements for (cl_uint i = 0; i < numPlatforms; i++) { bool match = true; cl_uint numDevices = 0; // If the preferredPlatform is not NULL then check if platforms[i] is the required one // Otherwise, continue the check with platforms[i] if ((NULL != preferredPlatform) && (strlen(preferredPlatform) > 0)) { // In case we're looking for a specific platform match = CheckPreferredPlatformMatch(platforms[i], preferredPlatform); } // match is true if the platform's name is the required one or don't care (NULL) if (match) { // Obtains the number of deviceType devices available on platform // When the function failed we expect numDevices to be zero. // We ignore the function return value since a non-zero error code // could happen if this platform doesn't support the specified device type. err = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &numDevices); if (CL_SUCCESS != err) { LogError("clGetDeviceIDs() returned %s.\n", TranslateOpenCLError(err)); } if (0 != numDevices) { // There is at list one device that answer the requirements return platforms[i]; } } } return NULL; }
/* * main execution routine * Basically it consists of three parts: * - generating the inputs * - running OpenCL kernel * - reading results of processing */ int _tmain(int argc, TCHAR* argv[]) { cl_int err; ocl_args_d_t ocl; cl_device_type deviceType = CL_DEVICE_TYPE_GPU; LARGE_INTEGER perfFrequency; LARGE_INTEGER performanceCountNDRangeStart; LARGE_INTEGER performanceCountNDRangeStop; cl_uint arrayWidth = 1024; cl_uint arrayHeight = 1024; //initialize Open CL objects (context, queue, etc.) if (CL_SUCCESS != SetupOpenCL(&ocl, deviceType)) { return -1; } // allocate working buffers. // the buffer should be aligned with 4K page and size should fit 64-byte cached line cl_uint optimizedSize = ((sizeof(cl_int) * arrayWidth * arrayHeight - 1) / 64 + 1) * 64; cl_int* inputA = (cl_int*)_aligned_malloc(optimizedSize, 4096); cl_int* inputB = (cl_int*)_aligned_malloc(optimizedSize, 4096); cl_int* outputC = (cl_int*)_aligned_malloc(optimizedSize, 4096); if (NULL == inputA || NULL == inputB || NULL == outputC) { LogError("Error: _aligned_malloc failed to allocate buffers.\n"); return -1; } //random input generateInput(inputA, arrayWidth, arrayHeight); generateInput(inputB, arrayWidth, arrayHeight); // Create OpenCL buffers from host memory // These buffers will be used later by the OpenCL kernel if (CL_SUCCESS != CreateBufferArguments(&ocl, inputA, inputB, outputC, arrayWidth, arrayHeight)) { return -1; } // Create and build the OpenCL program if (CL_SUCCESS != CreateAndBuildProgram(&ocl)) { return -1; } // Program consists of kernels. // Each kernel can be called (enqueued) from the host part of OpenCL application. // To call the kernel, you need to create it from existing program. ocl.kernel = clCreateKernel(ocl.program, "Add", &err); if (CL_SUCCESS != err) { LogError("Error: clCreateKernel returned %s\n", TranslateOpenCLError(err)); return -1; } // Passing arguments into OpenCL kernel. if (CL_SUCCESS != SetKernelArguments(&ocl)) { return -1; } // Regularly you wish to use OpenCL in your application to achieve greater performance results // that are hard to achieve in other ways. // To understand those performance benefits you may want to measure time your application spent in OpenCL kernel execution. // The recommended way to obtain this time is to measure interval between two moments: // - just before clEnqueueNDRangeKernel is called, and // - just after clFinish is called // clFinish is necessary to measure entire time spending in the kernel, measuring just clEnqueueNDRangeKernel is not enough, // because this call doesn't guarantees that kernel is finished. // clEnqueueNDRangeKernel is just enqueue new command in OpenCL command queue and doesn't wait until it ends. // clFinish waits until all commands in command queue are finished, that suits your need to measure time. bool queueProfilingEnable = true; if (queueProfilingEnable) QueryPerformanceCounter(&performanceCountNDRangeStart); // Execute (enqueue) the kernel if (CL_SUCCESS != ExecuteAddKernel(&ocl, arrayWidth, arrayHeight)) { return -1; } if (queueProfilingEnable) QueryPerformanceCounter(&performanceCountNDRangeStop); // The last part of this function: getting processed results back. // use map-unmap sequence to update original memory area with output buffer. ReadAndVerify(&ocl, arrayWidth, arrayHeight, inputA, inputB); // retrieve performance counter frequency if (queueProfilingEnable) { QueryPerformanceFrequency(&perfFrequency); LogInfo("NDRange performance counter time %f ms.\n", 1000.0f*(float)(performanceCountNDRangeStop.QuadPart - performanceCountNDRangeStart.QuadPart) / (float)perfFrequency.QuadPart); } _aligned_free(inputA); _aligned_free(inputB); _aligned_free(outputC); #if defined(_DEBUG) getchar(); #endif return 0; }
void imgdiff(size_t N, size_t width, size_t height, double* diff_matrix, unsigned char* images) { //// we need to fill in //// cl_platform_id *platform; cl_device_type dev_type = CL_DEVICE_TYPE_GPU; cl_device_id *devs; cl_context context; cl_command_queue *cmd_queues; cl_program program; cl_kernel *kernels; cl_uint num_platforms; cl_uint num_devs; cl_mem* m_image1; cl_mem* m_image2; cl_mem* m_result; cl_event* ev_kernels; int err = CL_SUCCESS; int i, j, k; // modify version err = clGetPlatformIDs(0, NULL, &num_platforms); if(err != CL_SUCCESS) { printf("Error: platform error\n"); return 0; } if(num_platforms == 0) { printf("Error: platform no count\n"); return 0; } platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms); err = clGetPlatformIDs(num_platforms, platform, NULL); if(err != CL_SUCCESS) { printf("Error: clGetPlatformIDs error\n"); return 0; } for(i = 0; i<num_platforms; i++) { err = clGetDeviceIDs(platform[i], dev_type, 0, NULL, &num_devs); if(err != CL_SUCCESS) { printf("Error: clGetDevice\n"); return 0; } if(num_devs >= 1) { devs = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devs); clGetDeviceIDs(platform[i], dev_type, num_devs, devs, NULL); break; } } context = clCreateContext(NULL, num_devs, devs, NULL, NULL, &err); if(err != CL_SUCCESS) { printf("Error: clCreateContext error\n"); return 0; } char* source = NULL; size_t src_size = 0; err = ReadSourceFromFile("./imgdiff_cal.cl", &source, &src_size); if (CL_SUCCESS != err) { printf("Error: ReadSourceFromFile returned %s.\n", err); free(source); return 0; } program = clCreateProgramWithSource(context, 1, (const char**)&source, &src_size, &err); if(err != CL_SUCCESS) { printf("Error: clCreateProgram error\n"); return 0; } free(source); printf("Create Program Success\n"); #if DBG // Measure clBuildProgram -@henry added gettimeofday(&start_m, NULL ); #endif err = clBuildProgram(program, num_devs, devs, "", NULL, NULL); #if DBG gettimeofday(&end_m, NULL ); double time = (end_m.tv_usec - start_m.tv_usec)*1e-6 + (end_m.tv_sec - start_m.tv_sec); printf("[Debug] Elapsed Time of clBuildProgram() : %lf s\n",time); #endif if(err != CL_SUCCESS) { printf("Error: clBuildProgram\n"); return 0; } printf("Build Program Success\n"); kernels = (cl_kernel*)malloc(sizeof(cl_kernel)*num_devs); for(i = 0; i<num_devs; i++) { kernels[i] = clCreateKernel(program, "imgdiff_cal", NULL); } printf("Create Kernel Success\n"); cmd_queues = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_devs); for(i=0; i<num_devs; i++) { cmd_queues[i] = clCreateCommandQueue(context, devs[i], 0, &err); if(err != CL_SUCCESS) { printf("Error: clCreateCommandQueue error\n"); return 0; } } printf("Create commandQueue Success\n"); int LOCAL_WIDTH = 16; int LOCAL_HEIGHT = 16; int WORK_WIDTH = ceil((double)width / LOCAL_WIDTH)*LOCAL_WIDTH; int WORK_HEIGHT = ceil((double)height/LOCAL_HEIGHT) * LOCAL_HEIGHT; int WORK_AMOUNT = width * height; int WORK_GROUP_COUNT = ceil(((double)WORK_WIDTH * WORK_HEIGHT) / (LOCAL_WIDTH * LOCAL_HEIGHT)); int WORK_GROUP_WIDTH = width; int WORK_GROUP_HEIGHT = height; int SAMPLE_COUNT = 16; int WORK_COUNT[num_devs]; double tmp_result_data[WORK_GROUP_COUNT*SAMPLE_COUNT]; printf("WORK_WIDTH %d\tWORK_HEIGHT %d\t WORK_AMOUNT %d\t WORK_GROUP_COUNT %d\n", WORK_WIDTH, WORK_HEIGHT, WORK_AMOUNT, WORK_GROUP_COUNT); m_image1 = (cl_mem*)malloc(sizeof(cl_mem)* num_devs); m_image2 = (cl_mem*)malloc(sizeof(cl_mem)* num_devs); m_result = (cl_mem*)malloc(sizeof(cl_mem)* num_devs); for(i=0; i<num_devs; i++) { m_image1[i] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned char) * WORK_AMOUNT * 3, NULL, NULL); m_image2[i] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned char) * WORK_AMOUNT*SAMPLE_COUNT * 3, NULL, NULL); m_result[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double) * WORK_GROUP_COUNT * SAMPLE_COUNT, NULL, NULL); clSetKernelArg(kernels[i], 0, sizeof(cl_mem), (void*)&m_image1[i]); clSetKernelArg(kernels[i], 1, sizeof(cl_mem), (void*)&m_image2[i]); clSetKernelArg(kernels[i], 2, sizeof(cl_mem), (void*)&m_result[i]); clSetKernelArg(kernels[i], 3, sizeof(cl_int), &WORK_GROUP_WIDTH); clSetKernelArg(kernels[i], 4, sizeof(cl_int), &WORK_GROUP_HEIGHT); } ev_kernels = (cl_event*)malloc(sizeof(cl_event)*num_devs); int row, col; row = 0; col = 1; for(row = 0; row < N; row++) { if( (N-row-1) < (SAMPLE_COUNT*4) && SAMPLE_COUNT > 1) SAMPLE_COUNT = SAMPLE_COUNT / 2; int remain_count = N - (row + 1); for(i=0; i<num_devs; i++) { clEnqueueWriteBuffer(cmd_queues[i], m_image1[i], CL_FALSE, 0, sizeof(unsigned char) * WORK_AMOUNT * 3, (void*)(images + (row * width*height)*3), 0, NULL, NULL); } diff_matrix[row*N + row] = 0; col = row + 1; while( col< N) { size_t lws[2] = { LOCAL_WIDTH, LOCAL_HEIGHT }; size_t gws[2] = { WORK_WIDTH, WORK_HEIGHT}; for(i=0; i<num_devs; i++) { if((remain_count - SAMPLE_COUNT) < 0) { WORK_COUNT[i] = remain_count; remain_count = 0; } else { WORK_COUNT[i] = SAMPLE_COUNT; remain_count = remain_count - SAMPLE_COUNT; } if(WORK_COUNT[i] != 0) { clSetKernelArg(kernels[i], 5, sizeof(cl_int), &WORK_COUNT[i]); int offset = 0; for(j=0; j<i; j++) offset += WORK_COUNT[j]; err = clEnqueueWriteBuffer(cmd_queues[i], m_image2[i], CL_FALSE, 0, sizeof(unsigned char)*WORK_AMOUNT*WORK_COUNT[i]*3, (void*)(images +((col * width*height) + (WORK_AMOUNT * offset))*3), 0, NULL, NULL); } } for( i=0; i < num_devs; i++ ) { if(WORK_COUNT[i] != 0) { err = clEnqueueNDRangeKernel(cmd_queues[i], kernels[i], 2, NULL, gws, lws, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("Error: clEnqueueNDRangeKernel %d error\n", i); printf("%s\n", TranslateOpenCLError(err)); return 0; } } } double tmp_sum = 0; i = 0; for( i = num_devs -1; i >= 0; i-- ) { if(WORK_COUNT[i] != 0) { err = clEnqueueReadBuffer( cmd_queues[i], m_result[i], CL_TRUE, 0, sizeof(double) * WORK_GROUP_COUNT * WORK_COUNT[i], tmp_result_data, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("Error: clEnqueueReadBuffer%d error\n", i); return 0; } //printf("receive......"); for(j = 0; j<WORK_COUNT[i]; j++) { tmp_sum = 0; for(k = 0; k<WORK_GROUP_COUNT; k++) { tmp_sum += tmp_result_data[k + j*WORK_GROUP_COUNT]; //printf("%lf\t", tmp_result_data[k+j*WORK_GROUP_COUNT]); } //printf("%lf %lf\n", tmp_sum, tmp_result_data[j*WORK_GROUP_COUNT]); int offset = 0; for(k=0; k<i; k++) offset += WORK_COUNT[k]; diff_matrix[row*N+col+j+offset] = diff_matrix[(col+j+offset)*N+row] = tmp_sum; } } } for( i = 0; i < num_devs; i++ ) { col += WORK_COUNT[i]; } } } }