// main() for simple buffer and sub-buffer example // int main(int argc, char** argv) { std::cout << "Simple Image Processing Example" << std::endl; // First, select an OpenCL platform to run on. errNum = clGetPlatformIDs(0, NULL, &numPlatforms); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); platformIDs = (cl_platform_id *)alloca(sizeof(cl_platform_id) * numPlatforms); std::cout << "Number of platforms: \t" << numPlatforms << std::endl; errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); std::ifstream srcFile("gaussian_filter.cl"); checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading simple.cl"); std::string srcProg( std::istreambuf_iterator<char>(srcFile), (std::istreambuf_iterator<char>())); const char * src = srcProg.c_str(); size_t length = srcProg.length(); deviceIDs = NULL; DisplayPlatformInfo( platformIDs[PLATFORM_INDEX], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); errNum = clGetDeviceIDs( platformIDs[PLATFORM_INDEX], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND){ checkErr(errNum, "clGetDeviceIDs"); } deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices); errNum = clGetDeviceIDs( platformIDs[PLATFORM_INDEX], CL_DEVICE_TYPE_ALL, numDevices, &deviceIDs[0], NULL); checkErr(errNum, "clGetDeviceIDs"); cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformIDs[PLATFORM_INDEX], 0 }; context = clCreateContext( contextProperties, numDevices, deviceIDs, NULL, NULL, &errNum); checkErr(errNum, "clCreateContext"); // Create program from source program = clCreateProgramWithSource( context, 1, &src, &length, &errNum); checkErr(errNum, "clCreateProgramWithSource"); // Build program errNum = clBuildProgram( program, numDevices, deviceIDs, "-I.", NULL, NULL); if (errNum != CL_SUCCESS){ // Determine the reason for the error char buildLog[16384]; clGetProgramBuildInfo( program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL); std::cerr << "Error in OpenCL C source: " << std::endl; std::cerr << buildLog; checkErr(errNum, "clBuildProgram"); } // Create a command commands // if(!(commands = clCreateCommandQueue(context, deviceIDs[0], 0, &errNum))) { std::cout << "Failed to create a command commands!" << std::endl; cleanKill(EXIT_FAILURE); } cl_kernel kernel = clCreateKernel(program, "gaussian_filter", &errNum); checkErr(errNum, "clCreateKernel(gaussian_filter)"); if(!doesGPUSupportImageObjects){ cleanKill(EXIT_FAILURE); } inputImage = LoadImage(context, (char*)"rgba.png", width, height); cl_image_format format; format.image_channel_order = CL_RGBA; format.image_channel_data_type = CL_UNORM_INT8; outputImage = clCreateImage2D(context, CL_MEM_WRITE_ONLY, &format, width, height, 0, NULL, &errNum); if(there_was_an_error(errNum)){ std::cout << "Output Image Buffer creation error!" << std::endl; cleanKill(EXIT_FAILURE); } if (!inputImage || !outputImage ){ std::cout << "Failed to allocate device memory!" << std::endl; cleanKill(EXIT_FAILURE); } char *buffer = new char [width * height * 4]; size_t origin[3] = { 0, 0, 0 }; size_t region[3] = { width, height, 1}; sampler = clCreateSampler(context, CL_FALSE, // Non-normalized coordinates CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &errNum); if(there_was_an_error(errNum)){ std::cout << "Error creating CL sampler object." << std::endl; cleanKill(EXIT_FAILURE); } // Set the kernel arguments errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); errNum |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler); errNum |= clSetKernelArg(kernel, 3, sizeof(cl_int), &width); errNum |= clSetKernelArg(kernel, 4, sizeof(cl_int), &height); if (errNum != CL_SUCCESS) { std::cerr << "Error setting kernel arguments." << std::endl; std::cerr << print_cl_errstring(errNum) << std::endl; cleanKill(EXIT_FAILURE); } //errNum = clGetKernelWorkGroupInfo(kernel, deviceIDs, CL_KERNEL_WORK_GROUP_SIZE, sizeof(unsigned short)* height*width*4, &local, NULL); // if (errNum != CL_SUCCESS) // { // cout << print_cl_errstring(err) << endl; // if(err == CL_INVALID_VALUE){ // cout << "if param_name is not valid, or if size in bytes specified by param_value_size " // << "is less than the size of return type as described in the table above and " // << "param_value is not NULL." << endl; // } // cout << "Error: Failed to retrieve kernel work group info!" << err << endl; // cleanKill(EXIT_FAILURE); // } std::cout << "Max work group size is " << CL_DEVICE_MAX_WORK_GROUP_SIZE << std::endl; std::cout << "Max work item size is " << CL_DEVICE_MAX_WORK_ITEM_SIZES << std::endl; size_t localWorkSize[2]; size_t globalWorkSize[2]; localWorkSize[0] = 1; localWorkSize[1] = localWorkSize[0]; globalWorkSize[0] = width*height; globalWorkSize[1] = globalWorkSize[0]; //CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and number of work-items specified by global_work_size is not evenly divisable by size of work-group given by local_work_size //size_t globalWorkSize[2] = { RoundUp(localWorkSize[0], width), RoundUp(localWorkSize[1], height)}; // size_t globalWorkSize[1] = {sizeof(unsigned short)* height * width}; // size_t localWorkSize[1] = {64}; // Queue the kernel up for execution errNum = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if (errNum != CL_SUCCESS){ std::cerr << "Error queuing kernel for execution." << std::endl; std::cerr << print_cl_errstring(errNum) << std::endl; cleanKill(EXIT_FAILURE); } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back computed data errNum = clEnqueueReadImage(commands, outputImage, CL_TRUE, origin, region, 0, 0, buffer, 0, NULL, NULL); SaveImage((char*)"outRGBA.png", (char*)buffer, width, height); std::cout << "Program completed successfully" << std::endl; return 0; }
void WriteBufferOperation::executeOpenCLRegion(OpenCLDevice *device, rcti *rect, unsigned int chunkNumber, MemoryBuffer **inputMemoryBuffers, MemoryBuffer *outputBuffer) { float *outputFloatBuffer = outputBuffer->getBuffer(); cl_int error; /* * 1. create cl_mem from outputbuffer * 2. call NodeOperation (input) executeOpenCLChunk(.....) * 3. schedule readback from opencl to main device (outputbuffer) * 4. schedule native callback * * note: list of cl_mem will be filled by 2, and needs to be cleaned up by 4 */ // STEP 1 const unsigned int outputBufferWidth = outputBuffer->getWidth(); const unsigned int outputBufferHeight = outputBuffer->getHeight(); const cl_image_format imageFormat = { CL_RGBA, CL_FLOAT }; cl_mem clOutputBuffer = clCreateImage2D(device->getContext(), CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &imageFormat, outputBufferWidth, outputBufferHeight, 0, outputFloatBuffer, &error); if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } // STEP 2 list<cl_mem> *clMemToCleanUp = new list<cl_mem>(); clMemToCleanUp->push_back(clOutputBuffer); list<cl_kernel> *clKernelsToCleanUp = new list<cl_kernel>(); this->m_input->executeOpenCL(device, outputBuffer, clOutputBuffer, inputMemoryBuffers, clMemToCleanUp, clKernelsToCleanUp); // STEP 3 size_t origin[3] = {0, 0, 0}; size_t region[3] = {outputBufferWidth, outputBufferHeight, 1}; // clFlush(queue); // clFinish(queue); error = clEnqueueBarrier(device->getQueue()); if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } error = clEnqueueReadImage(device->getQueue(), clOutputBuffer, CL_TRUE, origin, region, 0, 0, outputFloatBuffer, 0, NULL, NULL); if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } this->getMemoryProxy()->getBuffer()->copyContentFrom(outputBuffer); // STEP 4 while (!clMemToCleanUp->empty()) { cl_mem mem = clMemToCleanUp->front(); error = clReleaseMemObject(mem); if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } clMemToCleanUp->pop_front(); } while (!clKernelsToCleanUp->empty()) { cl_kernel kernel = clKernelsToCleanUp->front(); error = clReleaseKernel(kernel); if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } clKernelsToCleanUp->pop_front(); } delete clKernelsToCleanUp; }
/*------------------------------------------------------------------------- * ソフトフォーカス処理をおこなう(OpenCLを使った実装, イメージオブジェクトを使う) */ static void softfocusWithOpenCLImage(unsigned char* srcData, unsigned char* dstData, const unsigned int width, const unsigned int height) { ClHelper clHelper; clHelper.preloadProgram("../../calc.cl"); // カーネルプログラムの読み込み cl_context context = clHelper.getContext(); // コンテキストの取得 cl_command_queue queue = clHelper.getCommandQueue(); // コマンドキューの取得 cl_program program = clHelper.getProgram(); // プログラムの取得 if (program == (cl_program)0) { throw MyError("program bug, program object is not loaded", __FUNCTION__); } cl_int status; // イメージフォーマットの定義 cl_image_format format; format.image_channel_data_type = CL_UNORM_INT8; // 0-255を[0-1]に正規化 format.image_channel_order = CL_BGRA; // BGRAの順に並んでいる // 元のイメージデータを保持するイメージオブジェクトの作成 cl_mem src_mem = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &format, width, height, 0, srcData, &status); if (status != CL_SUCCESS) { ClHelper::printError(status); throw MyError("failed to create image memory object 1", __FUNCTION__); } // 結果を保持するイメージオブジェクトの作成 cl_mem dst_mem = clCreateImage2D(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &format, width, height, 0, dstData, &status); if (status != CL_SUCCESS) { ClHelper::printError(status); throw MyError("failed to create image memory object 2", __FUNCTION__); } // カーネルの作成 cl_kernel kernel; kernel = clCreateKernel(program, "softFocus", &status); if (kernel == (cl_kernel)0) { ClHelper::printError(status); throw MyError("failed to create kernel", __FUNCTION__); } // カーネル関数引数のセット status += clSetKernelArg(kernel, 0, sizeof(src_mem), (void *)&src_mem); status += clSetKernelArg(kernel, 1, sizeof(dst_mem), (void *)&dst_mem); if (status != 0) { printf("clSetKernelArg failed\n"); throw MyError("failed to set kernel arguments.", __FUNCTION__); } // カーネル実行のリクエスト cl_uint work_dim = 2; // x, y size_t global_work_size[] = {width - 2, height - 2}; status = clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, 0, 0, NULL, NULL); if (status != CL_SUCCESS) { ClHelper::printError(status); throw MyError("clEnqueueNDRangeKernel failed.", __FUNCTION__); } // 画像データの取得 const size_t origin[] = {0, 0, 0}; const size_t region[] = {width, height, 1}; status = clEnqueueReadImage(queue, dst_mem, CL_TRUE, origin, region, 0, 0, dstData, NULL, 0, NULL); if (status != CL_SUCCESS) { ClHelper::printError(status); throw MyError("failed to read image buffer", __FUNCTION__); } // リソースの解放 clReleaseMemObject(dst_mem); clReleaseMemObject(src_mem); }
/*! \brief Calls an OpenCL kernel from OpenVX Graph. * Steps: * \arg Find the target * \arg Get the vxcl context * \arg Find the kernel (to get cl kernel information) * \arg for each input parameter that is an object, enqueue write * \arg wait for finish * \arg for each parameter, SetKernelArg * \arg call kernel * \arg wait for finish * \arg for each output parameter that is an object, enqueue read * \arg wait for finish * \note This implementation will attempt to use the External API as much as possible, * but will cast to internal representation when needed (due to lack of API or * need for secret information). This is not an optimal OpenCL invocation. */ vx_status vxclCallOpenCLKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) { vx_status status = VX_FAILURE; vx_context context = node->base.context; vx_target target = (vx_target_t *)&node->base.context->targets[node->affinity]; vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); vx_uint32 pidx, pln, didx, plidx, argidx; cl_int err = 0; size_t off_dim[3] = {0,0,0}; size_t work_dim[3]; //size_t local_dim[3]; cl_event writeEvents[VX_INT_MAX_PARAMS]; cl_event readEvents[VX_INT_MAX_PARAMS]; cl_int we = 0, re = 0; vxSemWait(&target->base.lock); // determine which platform to use plidx = 0; // determine which device to use didx = 0; /* for each input/bi data object, enqueue it and set the kernel parameters */ for (argidx = 0, pidx = 0; pidx < num; pidx++) { vx_reference ref = node->parameters[pidx]; vx_enum dir = node->kernel->signature.directions[pidx]; vx_enum type = node->kernel->signature.types[pidx]; vx_memory_t *memory = NULL; switch (type) { case VX_TYPE_ARRAY: memory = &((vx_array)ref)->memory; break; case VX_TYPE_CONVOLUTION: memory = &((vx_convolution)ref)->base.memory; break; case VX_TYPE_DISTRIBUTION: memory = &((vx_distribution)ref)->memory; break; case VX_TYPE_IMAGE: memory = &((vx_image)ref)->memory; break; case VX_TYPE_LUT: memory = &((vx_lut_t*)ref)->memory; break; case VX_TYPE_MATRIX: memory = &((vx_matrix)ref)->memory; break; //case VX_TYPE_PYRAMID: // break; case VX_TYPE_REMAP: memory = &((vx_remap)ref)->memory; break; //case VX_TYPE_SCALAR: //case VX_TYPE_THRESHOLD: // break; } if (memory) { for (pln = 0; pln < memory->nptrs; pln++) { if (memory->cl_type == CL_MEM_OBJECT_BUFFER) { if (type == VX_TYPE_IMAGE) { /* set the work dimensions */ work_dim[0] = memory->dims[pln][VX_DIM_X]; work_dim[1] = memory->dims[pln][VX_DIM_Y]; // width, height, stride_x, stride_y err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->dims[pln][VX_DIM_X]); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->dims[pln][VX_DIM_Y]); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 4 parameters\n"); } else if (type == VX_TYPE_ARRAY || type == VX_TYPE_LUT) { vx_array arr = (vx_array)ref; // sizeof item, active count, capacity err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->item_size); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->num_items); // this is output? err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->capacity); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &arr->memory.strides[VX_DIM_X]); VX_PRINT(VX_ZONE_INFO, "Setting vx_buffer as Buffer with 4 parameters\n"); } else if (type == VX_TYPE_MATRIX) { vx_matrix mat = (vx_matrix)ref; // columns, rows err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&mat->columns); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&mat->rows); VX_PRINT(VX_ZONE_INFO, "Setting vx_matrix as Buffer with 2 parameters\n"); } else if (type == VX_TYPE_DISTRIBUTION) { vx_distribution dist = (vx_distribution)ref; // num, range, offset, winsize vx_uint32 range = dist->memory.dims[0][VX_DIM_X] * dist->window_x; err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->memory.dims[VX_DIM_X]); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&range); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->offset_x); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->window_x); } else if (type == VX_TYPE_CONVOLUTION) { vx_convolution conv = (vx_convolution)ref; // columns, rows, scale err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->base.columns); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->base.rows); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->scale); } err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(cl_mem), &memory->hdls[pln]); CL_ERROR_MSG(err, "clSetKernelArg"); if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) { err = clEnqueueWriteBuffer(context->queues[plidx][didx], memory->hdls[pln], CL_TRUE, 0, vxComputeMemorySize(memory, pln), memory->ptrs[pln], 0, NULL, &ref->event); } } else if (memory->cl_type == CL_MEM_OBJECT_IMAGE2D) { vx_rectangle_t rect = {0}; vx_image image = (vx_image)ref; vxGetValidRegionImage(image, &rect); size_t origin[3] = {rect.start_x, rect.start_y, 0}; size_t region[3] = {rect.end_x-rect.start_x, rect.end_y-rect.start_y, 1}; /* set the work dimensions */ work_dim[0] = rect.end_x-rect.start_x; work_dim[1] = rect.end_y-rect.start_y; VX_PRINT(VX_ZONE_INFO, "Setting vx_image as image2d_t wd={%zu,%zu} arg:%u\n",work_dim[0], work_dim[1], argidx); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(cl_mem), &memory->hdls[pln]); CL_ERROR_MSG(err, "clSetKernelArg"); if (err != CL_SUCCESS) { VX_PRINT(VX_ZONE_ERROR, "Error Calling Kernel %s, parameter %u\n", node->kernel->name, pidx); } if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) { err = clEnqueueWriteImage(context->queues[plidx][didx], memory->hdls[pln], CL_TRUE, origin, region, memory->strides[pln][VX_DIM_Y], 0, memory->ptrs[pln], 0, NULL, &ref->event); CL_ERROR_MSG(err, "clEnqueueWriteImage"); } } } } else { if (type == VX_TYPE_SCALAR) { vx_value_t value; // largest platform atomic vx_size size = 0ul; vx_scalar sc = (vx_scalar)ref; vx_enum stype = VX_TYPE_INVALID; vxReadScalarValue(sc, &value); vxQueryScalar(sc, VX_SCALAR_ATTRIBUTE_TYPE, &stype, sizeof(stype)); size = vxSizeOfType(stype); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, size, &value); } else if (type == VX_TYPE_THRESHOLD) { vx_enum ttype = 0; vx_threshold th = (vx_threshold)ref; vxQueryThreshold(th, VX_THRESHOLD_ATTRIBUTE_TYPE, &ttype, sizeof(ttype)); if (ttype == VX_THRESHOLD_TYPE_BINARY) { err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->value); } else if (ttype == VX_THRESHOLD_TYPE_RANGE) { err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->lower); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->upper); } } } } we = 0; for (pidx = 0; pidx < num; pidx++) { vx_reference ref = node->parameters[pidx]; vx_enum dir = node->kernel->signature.directions[pidx]; if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) { memcpy(&writeEvents[we++],&ref->event, sizeof(cl_event)); } } //local_dim[0] = 1; //local_dim[1] = 1; err = clEnqueueNDRangeKernel(context->queues[plidx][didx], vxclk->kernels[plidx], 2, off_dim, work_dim, NULL, we, writeEvents, &node->base.event); CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); /* enqueue a read on all output data */ for (pidx = 0; pidx < num; pidx++) { vx_reference ref = node->parameters[pidx]; vx_enum dir = node->kernel->signature.directions[pidx]; vx_enum type = node->kernel->signature.types[pidx]; if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) { vx_memory_t *memory = NULL; switch (type) { case VX_TYPE_ARRAY: memory = &((vx_array)ref)->memory; break; case VX_TYPE_CONVOLUTION: memory = &((vx_convolution)ref)->base.memory; break; case VX_TYPE_DISTRIBUTION: memory = &((vx_distribution)ref)->memory; break; case VX_TYPE_IMAGE: memory = &((vx_image)ref)->memory; break; case VX_TYPE_LUT: memory = &((vx_lut_t*)ref)->memory; break; case VX_TYPE_MATRIX: memory = &((vx_matrix)ref)->memory; break; //case VX_TYPE_PYRAMID: // break; case VX_TYPE_REMAP: memory = &((vx_remap)ref)->memory; break; //case VX_TYPE_SCALAR: //case VX_TYPE_THRESHOLD: // break; } if (memory) { for (pln = 0; pln < memory->nptrs; pln++) { if (memory->cl_type == CL_MEM_OBJECT_BUFFER) { err = clEnqueueReadBuffer(context->queues[plidx][didx], memory->hdls[pln], CL_TRUE, 0, vxComputeMemorySize(memory, pln), memory->ptrs[pln], 1, &node->base.event, &ref->event); CL_ERROR_MSG(err, "clEnqueueReadBuffer"); } else if (memory->cl_type == CL_MEM_OBJECT_IMAGE2D) { vx_rectangle_t rect = {0}; vx_image image = (vx_image)ref; vxGetValidRegionImage(image, &rect); size_t origin[3] = {rect.start_x, rect.start_y, 0}; size_t region[3] = {rect.end_x-rect.start_x, rect.end_y-rect.start_y, 1}; /* set the work dimensions */ work_dim[0] = rect.end_x-rect.start_x; work_dim[1] = rect.end_y-rect.start_y; err = clEnqueueReadImage(context->queues[plidx][didx], memory->hdls[pln], CL_TRUE, origin, region, memory->strides[pln][VX_DIM_Y], 0, memory->ptrs[pln], 1, &node->base.event, &ref->event); CL_ERROR_MSG(err, "clEnqueueReadImage"); VX_PRINT(VX_ZONE_INFO, "Reading Image wd={%zu,%zu}\n", work_dim[0], work_dim[1]); } } } } } re = 0; for (pidx = 0; pidx < num; pidx++) { vx_reference ref = node->parameters[pidx]; vx_enum dir = node->kernel->signature.directions[pidx]; if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) { memcpy(&readEvents[re++],&ref->event, sizeof(cl_event)); } } err = clFlush(context->queues[plidx][didx]); CL_ERROR_MSG(err, "Flush"); VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); clWaitForEvents(re, readEvents); if (err == CL_SUCCESS) status = VX_SUCCESS; //exit: VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); vxSemPost(&target->base.lock); return status; }
int main() { int i,j,k; // nb of operations: const int dsize = 512; int nthreads = 1; int nbOfAverages = 1e2; int opsMAC = 2; // operations per MAC cl_short4 *in, *out; cl_half *ck; double tops; //total ops #define NQUEUES 1 cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queues[NQUEUES]; cl_mem bufin, bufck, bufout; cl_event event = NULL; cl_program program; cl_kernel kernel; size_t global[2], local[2]; size_t param[5]; char version[300]; // allocate matrices in = (cl_short4 *) calloc(dsize*dsize, sizeof(*in)); out = (cl_short4 *) calloc(dsize*dsize, sizeof(*out)); ck = (cl_half *) calloc(9*9, sizeof(*ck)); in[0].x = 0x3c00; in[1].x = 0x4000; in[dsize].x = 0x4100; ck[0] = 0x3c00; ck[1] = 0x4000; ck[9] = 0x3000; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL ); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); for(i = 0; i < NQUEUES; i++) queues[i] = clCreateCommandQueue( ctx, device, 0, &err ); // Print some info about the system clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL); printf("CL_DEVICE_VERSION=%s\n", version); clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(version), version, NULL); printf("CL_DRIVER_VERSION=%s\n", version); program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err); clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_LOCAL_MEM_SIZE=%d\n", (int)param[0]); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE=%d\n", (int)param[0]); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS=%d\n", (int)param[0]); j = param[0]; clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(param[0])*j, param, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_SIZES="); for(i = 0; i < j; i++) printf("%d ", (int)param[i]); printf("\n"); clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE=%d\n", (int)param[0]); program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err); if(!program) { printf("Error creating program\n"); return -1; } err = clBuildProgram(program, 0, 0, 0, 0, 0); if(err != CL_SUCCESS) { char buffer[20000]; size_t len; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); puts(buffer); return -1; } kernel = clCreateKernel(program, "conv9x9", &err); if(!kernel || err != CL_SUCCESS) { printf("Error creating kernel\n"); return -1; } /* Prepare OpenCL memory objects and place matrices inside them. */ cl_image_format fmt = {CL_RGBA, CL_HALF_FLOAT}; cl_int rc; bufin = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &fmt, dsize, dsize, 0, 0, &rc); bufout = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, dsize, dsize, 0, 0, &rc); bufck = clCreateBuffer( ctx, CL_MEM_READ_ONLY, 9 * 9 * sizeof(*ck), NULL, &err ); size_t origin[3] = {0,0,0}; size_t region[3] = {dsize, dsize, 1}; err = clEnqueueWriteImage(queues[0], bufin, CL_TRUE, origin, region, dsize * sizeof(*in), 0, in, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queues[0], bufck, CL_TRUE, 0, 9 * 9 * sizeof( *ck ), ck, 0, NULL, NULL ); clSetKernelArg(kernel, 0, sizeof(int), &dsize); clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufin); clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufck); clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufout); local[0] = 8; local[1] = 8; global[0] = global[1] = dsize-32; usleep(100000); struct timeval start,end; gettimeofday(&start, NULL); for (k=0; k<nthreads; k++) { //printf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads()); for(i=0;i<nbOfAverages;i++) { // do the 2D convolution err = clEnqueueNDRangeKernel(queues[0], kernel, 2, NULL, global, local, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("clEnqueueNDRangeKernel error %d\n", err); return -1; } } } clFinish(queues[0]); gettimeofday(&end, NULL); double t = ((double) (end.tv_sec - start.tv_sec)) + ((double) (end.tv_usec - start.tv_usec)) / 1e6; //reports time in [s] - verified! /* Wait for calculations to be finished. */ /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadImage(queues[0], bufout, CL_TRUE, origin, region, dsize * sizeof(*out), 0, out, 0, NULL, NULL ); clFinish(queues[0]); printf("%x %x %x %x\n", out[0].x, out[1].x, out[dsize].x, out[dsize+1].x); /* Release OpenCL memory objects. */ clReleaseMemObject( bufin ); clReleaseMemObject( bufck ); clReleaseMemObject( bufout ); /* Release OpenCL working objects. */ for(i = 0; i < NQUEUES; i++) clReleaseCommandQueue( queues[i] ); clReleaseContext( ctx ); // report performance: tops = 4 * nthreads * opsMAC * (dsize-32)*(dsize-32)*9*9; // total ops printf("Total M ops = %.0lf, # of threads = %d", nbOfAverages*tops*1e-6, nthreads); printf("\nTime in s: %lf:", t); printf("\nTest performance [G OP/s] %lf:", tops*nbOfAverages/t*1e-9); printf("\n"); return(0); }
void clInvert3D(CL* cl, VglImage* img){ cl_int err; cl_image_desc desc = getDesc(img->shape[0], img->shape[1], 3, img->shape[2]); cl_image_desc descOut = getDesc(img->shape[0], img->shape[1], 3, img->shape[2]); cl_image_format src; cl_image_format out; switch(img->nChannels){ case 1: src.image_channel_order = CL_A; out.image_channel_order = CL_A; break; case 3: rgb2rgba(NULL, img); src.image_channel_order = CL_RGBA; out.image_channel_order = CL_RGBA; break; case 4: src.image_channel_order = CL_RGBA; out.image_channel_order = CL_RGBA; break; default: printf("Numero de canais não suportado\n"); exit; } src.image_channel_data_type = CL_UNORM_INT8; out.image_channel_data_type = CL_UNORM_INT8; cl_mem src_mem = clCreateImage(cl->context, CL_MEM_READ_ONLY, &src, &desc, NULL, &err); printf("IMAGE STATUS SOURCE\t"); cl_error(err); cl_mem out_mem = clCreateImage(cl->context, CL_MEM_WRITE_ONLY, &out, &descOut, NULL, &err); printf("IMAGE STATUS OUT\t"); cl_error(err); clGetMemObjectInfo(src_mem, CL_MEM_TYPE, sizeof(cl_int), &err, NULL); if(err == CL_MEM_OBJECT_IMAGE3D) printf("IMAGE TYPE:\t\tCL_MEM_OBJECT_IMAGE3D\n"); size_t *src_origin=(size_t*)malloc(sizeof(size_t)*3); src_origin[0] = 0; src_origin[1] = 0; src_origin[2] = 0; size_t *src_region=(size_t*)malloc(sizeof(size_t)*3); src_region[0] = img->shape[0]; src_region[1] = img->shape[1]; src_region[2] = img->shape[2]; err = clEnqueueWriteImage(cl->queue, src_mem, CL_TRUE, src_origin, src_region, 0, 0, (void*)img->ndarray, 0, 0, NULL); printf("ENQUEUE IMAGE STATUS "); cl_error(err); cl_program program; cl_kernel kernel; const char* k = "./CLdemos/CL/Invert3D_RGBA.cl"; const char* k2 = "./CLdemos/CL/Invert3D_A.cl"; char** fonte; if(img->nChannels==1) fonte = (char**)getKernelPtr(k2); if(img->nChannels==4) fonte = (char**)getKernelPtr(k); program = clCreateProgramWithSource(cl->context, 1, (const char**)fonte, NULL, &err); printf("CREATE PROGRAM STATUS: "); cl_error(err); clBuildProgram(program, 0, NULL, NULL, NULL, &err); printf("BUILD PROGRAM STATUS: "); cl_error(err); kernel = clCreateKernel(program, "invert", &err); printf("KERNEL STATUS "); cl_error(err); err = clSetKernelArg( kernel, 0, sizeof( cl_mem ), (void *) &src_mem); printf("SET 1 KERNEL ARG "); cl_error(err); err = clSetKernelArg( kernel, 1, sizeof( cl_mem ), (void *) &out_mem); printf("SET 2 KERNEL ARG "); cl_error(err); size_t worksize[] = { img->shape[0], img->shape[1], img->shape[2]}; err = clEnqueueNDRangeKernel(cl->queue, kernel, 2, NULL, worksize, 0, 0, 0, 0); printf("ENQUEUE ND KERNEL STATUS "); cl_error(err); clFinish(cl->queue); char* auxout = (char*)malloc(img->shape[0]*img->shape[1]*img->shape[2]*img->nChannels); err = clEnqueueReadImage(cl->queue, out_mem, CL_TRUE, src_origin, src_region, 0, 0, auxout, 0, NULL, NULL); printf("READ NEW IMAGE STATUS "); cl_error(err); for(int i=0; i<(img->shape[0]*img->nChannels*img->shape[1]*img->shape[2]); i++) img->ndarray[i] = auxout[i]; free(auxout); clReleaseKernel(kernel); clReleaseProgram(program); }
END_TEST START_TEST (test_read_write_image) { cl_device_id device; cl_context ctx; cl_command_queue queue; cl_mem image2d, part2d; cl_int result; cl_platform_id platform = 0; cl_uint num_platforms = 0; clGetPlatformIDs(1, &platform, &num_platforms); unsigned char image2d_data_24bpp[3*3*4] = { 255, 0, 0, 0, 0, 255, 0, 0, 128, 128, 128, 0, 0, 0, 255, 0, 255, 255, 0, 0, 0, 128, 0, 0, 255, 128, 0, 0, 128, 0, 255, 0, 0, 0, 0, 0 }; unsigned char image2d_part_24bpp[2*2*4] = { 255, 0, 0, 0, 0, 255, 0, 0, 0, 0, 255, 0, 255, 255, 0, 0 }; unsigned char image2d_buffer[3*3*4]; unsigned char image2d_part[2*2*4]; cl_image_format fmt; fmt.image_channel_data_type = CL_UNORM_INT8; fmt.image_channel_order = CL_RGBA; size_t origin[3] = {0, 0, 0}; size_t region[3] = {3, 3, 1}; result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0); fail_if( result != CL_SUCCESS, "unable to get the default device" ); ctx = clCreateContext(0, 1, &device, 0, 0, &result); fail_if( result != CL_SUCCESS || ctx == 0, "unable to create a valid context" ); queue = clCreateCommandQueue(ctx, device, 0, &result); fail_if( result != CL_SUCCESS || queue == 0, "cannot create a command queue" ); image2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt, 3, 3, 0, image2d_buffer, &result); fail_if( result != CL_SUCCESS || image2d == 0, "cannot create a valid 3x3 image2D" ); part2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt, 2, 2, 0, image2d_part, &result); fail_if( result != CL_SUCCESS || image2d == 0, "cannot create a valid 2x2 image2D" ); // Write data in buffer result = clEnqueueWriteImage(queue, image2d, 1, origin, region, 0, 0, image2d_data_24bpp, 0, 0, 0); fail_if( result != CL_SUCCESS, "cannot enqueue a blocking write image event" ); // Read it back region[0] = 2; region[1] = 2; result = clEnqueueReadImage(queue, image2d, 1, origin, region, 0, 0, image2d_part, 0, 0, 0); fail_if( result != CL_SUCCESS, "cannot enqueue a blocking read image event" ); // Compare #if 0 // images not supported fail_if( std::memcmp(image2d_part, image2d_part_24bpp, sizeof(image2d_part)) != 0, "reading and writing images doesn't produce the correct result" ); #endif // Read it back using a buffer cl_event event; std::memset(image2d_part, 0, sizeof(image2d_part)); result = clEnqueueCopyImage(queue, image2d, part2d, origin, origin, region, 0, 0, &event); fail_if( result != CL_SUCCESS, "unable to enqueue a copy image event" ); result = clWaitForEvents(1, &event); fail_if( result != CL_SUCCESS, "unable to wait for events" ); // Compare #if 0 // images not supported fail_if( std::memcmp(image2d_part, image2d_part_24bpp, sizeof(image2d_part)) != 0, "copying images doesn't produce the correct result" ); #endif clReleaseEvent(event); clReleaseMemObject(part2d); clReleaseMemObject(image2d); clReleaseCommandQueue(queue); clReleaseContext(ctx); }