int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv) { cl_int status; if (os->xscale != xscale || os->width < width) { cl_float *xweights = hb_bicubic_weights(xscale, width); CL_FREE(os->bicubic_x_weights); CREATEBUF(os->bicubic_x_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * width * 4); OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_x_weights, CL_TRUE, 0, sizeof(cl_float) * width * 4, xweights, 0, NULL, NULL ); os->width = width; os->xscale = xscale; free(xweights); } if ((os->yscale != yscale) || (os->height < height)) { cl_float *yweights = hb_bicubic_weights(yscale, height); CL_FREE(os->bicubic_y_weights); CREATEBUF(os->bicubic_y_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * height * 4); OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_y_weights, CL_TRUE, 0, sizeof(cl_float) * height * 4, yweights, 0, NULL, NULL ); os->height = height; os->yscale = yscale; free(yweights); } return 0; }
static int to3d_ocl_fillHole(void **userdata, KernelEnv *kenv) { cl_mem view = (cl_mem)userdata[0]; cl_mem hole = (cl_mem)userdata[1]; cl_mem holeU = (cl_mem)userdata[2]; cl_mem holeV = (cl_mem)userdata[3]; int w = (int)userdata[4]; int h = (int)userdata[5]; int base = (int)userdata[6]; cl_kernel kernel = (cl_kernel)userdata[7]; int flag = (int)userdata[8]; int lr = (int)userdata[9]; int status = 0; OCLCHECK( clSetKernelArg, kernel, 0, sizeof(cl_mem), &view ); OCLCHECK( clSetKernelArg, kernel, 1, sizeof(cl_mem), &hole ); OCLCHECK( clSetKernelArg, kernel, 2, sizeof(cl_mem), &holeU ); OCLCHECK( clSetKernelArg, kernel, 3, sizeof(cl_mem), &holeV ); OCLCHECK( clSetKernelArg, kernel, 4, sizeof(int), &w ); OCLCHECK( clSetKernelArg, kernel, 5, sizeof(int), &h ); OCLCHECK( clSetKernelArg, kernel, 6, sizeof(int), &base ); //NDRangekernel kenv->kernel = kernel; size_t globalThreads[2], offset[2]; if(flag == 0) { if(lr == 0)//left { globalThreads[0] = w - (tx / 2 * F / Znear); globalThreads[1] = h; offset[0] = tx / 2 * F / Znear; offset[1] = 0; } else { globalThreads[0] = w - (tx / 2 * F / Znear); globalThreads[1] = h; offset[0] = 0; offset[1] = 0; } } else { if(lr == 0) { globalThreads[0] = tx / 2 * F / Znear; globalThreads[1] = h; offset[0] = 0; offset[1] = 0; } else { globalThreads[0] = tx / 2 * F / Znear; globalThreads[1] = h; offset[0] = w - (tx / 2 * F / Znear); offset[1] = 0; } } OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, offset,globalThreads, NULL, 0, NULL, NULL ); return TO3D_SUCCESS; }
/** * executive scale using opencl * get filter args * create output buffer * create horizontal filter buffer * create vertical filter buffer * create kernels */ int hb_ocl_scale_func( void **data, KernelEnv *kenv ) { cl_int status; cl_mem in_buf = data[0]; cl_mem out_buf = data[1]; int crop_top = data[2]; int crop_bottom = data[3]; int crop_left = data[4]; int crop_right = data[5]; cl_int in_frame_w = (int)data[6]; cl_int in_frame_h = (int)data[7]; cl_int out_frame_w = (int)data[8]; cl_int out_frame_h = (int)data[9]; hb_oclscale_t *os = data[10]; hb_buffer_t *in = data[11]; hb_buffer_t *out = data[12]; if (os->initialized == 0) { hb_log( "Scaling With OpenCL" ); if (kenv->isAMD != 0) hb_log( "Using Zero Copy"); // create the block kernel cl_int status; os->m_kernel = clCreateKernel( kenv->program, "frame_scale", &status ); os->initialized = 1; } { // Use the new kernel cl_event events[5]; int eventCount = 0; if (kenv->isAMD == 0) { status = clEnqueueUnmapMemObject(kenv->command_queue, in->cl.buffer, in->data, 0, NULL, &events[eventCount++]); status = clEnqueueUnmapMemObject(kenv->command_queue, out->cl.buffer, out->data, 0, NULL, &events[eventCount++]); } cl_int srcPlaneOffset0 = in->plane[0].data - in->data; cl_int srcPlaneOffset1 = in->plane[1].data - in->data; cl_int srcPlaneOffset2 = in->plane[2].data - in->data; cl_int srcRowWords0 = in->plane[0].stride; cl_int srcRowWords1 = in->plane[1].stride; cl_int srcRowWords2 = in->plane[2].stride; cl_int dstPlaneOffset0 = out->plane[0].data - out->data; cl_int dstPlaneOffset1 = out->plane[1].data - out->data; cl_int dstPlaneOffset2 = out->plane[2].data - out->data; cl_int dstRowWords0 = out->plane[0].stride; cl_int dstRowWords1 = out->plane[1].stride; cl_int dstRowWords2 = out->plane[2].stride; if (crop_top != 0 || crop_bottom != 0 || crop_left != 0 || crop_right != 0) { srcPlaneOffset0 += crop_left + crop_top * srcRowWords0; srcPlaneOffset1 += crop_left / 2 + (crop_top / 2) * srcRowWords1; srcPlaneOffset2 += crop_left / 2 + (crop_top / 2) * srcRowWords2; in_frame_w = in_frame_w - crop_right - crop_left; in_frame_h = in_frame_h - crop_bottom - crop_top; } cl_float xscale = (out_frame_w * 1.0f) / in_frame_w; cl_float yscale = (out_frame_h * 1.0f) / in_frame_h; setupScaleWeights(xscale, yscale, out_frame_w, out_frame_h, os, kenv); OCLCHECK( clSetKernelArg, os->m_kernel, 0, sizeof(cl_mem), &out_buf ); OCLCHECK( clSetKernelArg, os->m_kernel, 1, sizeof(cl_mem), &in_buf ); OCLCHECK( clSetKernelArg, os->m_kernel, 2, sizeof(cl_float), &xscale ); OCLCHECK( clSetKernelArg, os->m_kernel, 3, sizeof(cl_float), &yscale ); OCLCHECK( clSetKernelArg, os->m_kernel, 4, sizeof(cl_int), &srcPlaneOffset0 ); OCLCHECK( clSetKernelArg, os->m_kernel, 5, sizeof(cl_int), &srcPlaneOffset1 ); OCLCHECK( clSetKernelArg, os->m_kernel, 6, sizeof(cl_int), &srcPlaneOffset2 ); OCLCHECK( clSetKernelArg, os->m_kernel, 7, sizeof(cl_int), &dstPlaneOffset0 ); OCLCHECK( clSetKernelArg, os->m_kernel, 8, sizeof(cl_int), &dstPlaneOffset1 ); OCLCHECK( clSetKernelArg, os->m_kernel, 9, sizeof(cl_int), &dstPlaneOffset2 ); OCLCHECK( clSetKernelArg, os->m_kernel, 10, sizeof(cl_int), &srcRowWords0 ); OCLCHECK( clSetKernelArg, os->m_kernel, 11, sizeof(cl_int), &srcRowWords1 ); OCLCHECK( clSetKernelArg, os->m_kernel, 12, sizeof(cl_int), &srcRowWords2 ); OCLCHECK( clSetKernelArg, os->m_kernel, 13, sizeof(cl_int), &dstRowWords0 ); OCLCHECK( clSetKernelArg, os->m_kernel, 14, sizeof(cl_int), &dstRowWords1 ); OCLCHECK( clSetKernelArg, os->m_kernel, 15, sizeof(cl_int), &dstRowWords2 ); OCLCHECK( clSetKernelArg, os->m_kernel, 16, sizeof(cl_int), &in_frame_w ); OCLCHECK( clSetKernelArg, os->m_kernel, 17, sizeof(cl_int), &in_frame_h ); OCLCHECK( clSetKernelArg, os->m_kernel, 18, sizeof(cl_int), &out_frame_w ); OCLCHECK( clSetKernelArg, os->m_kernel, 19, sizeof(cl_int), &out_frame_h ); OCLCHECK( clSetKernelArg, os->m_kernel, 20, sizeof(cl_mem), &os->bicubic_x_weights ); OCLCHECK( clSetKernelArg, os->m_kernel, 21, sizeof(cl_mem), &os->bicubic_y_weights ); size_t workOffset[] = { 0, 0, 0 }; size_t globalWorkSize[] = { 1, 1, 1 }; size_t localWorkSize[] = { 1, 1, 1 }; int xgroups = (out_frame_w + 63) / 64; int ygroups = (out_frame_h + 15) / 16; localWorkSize[0] = 64; localWorkSize[1] = 1; localWorkSize[2] = 1; globalWorkSize[0] = xgroups * 64; globalWorkSize[1] = ygroups; globalWorkSize[2] = 3; OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, os->m_kernel, 3, workOffset, globalWorkSize, localWorkSize, eventCount, (eventCount == 0) ? NULL : &events[0], &events[eventCount] ); ++eventCount; if (kenv->isAMD == 0) { in->data = clEnqueueMapBuffer(kenv->command_queue, in->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, in->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount], &status); out->data = clEnqueueMapBuffer(kenv->command_queue, out->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, out->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount + 1], &status); eventCount += 2; } clFlush(kenv->command_queue); clWaitForEvents(eventCount, &events[0]); int i; for (i = 0; i < eventCount; ++i) clReleaseEvent(events[i]); } return 1; }