// Fills a (strided) vector on the device with a single constant value by
// launching the "FillVector" kernel: one work-item per element, fixed
// work-group size of 64.
void FillVector(Queue &queue, const Device &device, const Program &program,
                const Databases &, EventPointer event,
                const std::vector<Event> &waitForEvents,
                const size_t n, const size_t inc, const size_t offset,
                const Buffer<T> &dest, const T constant_value) {
  auto fill_kernel = Kernel(program, "FillVector");

  // Scalar sizes are passed to the device as 32-bit integers
  fill_kernel.SetArgument(0, static_cast<int>(n));
  fill_kernel.SetArgument(1, static_cast<int>(inc));
  fill_kernel.SetArgument(2, static_cast<int>(offset));
  fill_kernel.SetArgument(3, dest());
  fill_kernel.SetArgument(4, GetRealArg(constant_value));

  // Global size is the element count rounded up to a multiple of the group size
  const auto group_size = size_t{64};
  auto local = std::vector<size_t>{group_size};
  auto global = std::vector<size_t>{Ceil(n, group_size)};
  RunKernel(fill_kernel, queue, device, global, local, event, waitForEvents);
}
// Performs the im2col transformation on the device: copies image patches of
// 'im_buffer' into columns of 'col_buffer' according to the kernel size,
// padding, stride and dilation parameters. Throws BLASError on invalid
// (zero-sized) dimensions.
void Xim2col<T>::DoIm2col(const size_t channels, const size_t height, const size_t width,
                          const size_t kernel_h, const size_t kernel_w,
                          const size_t pad_h, const size_t pad_w,
                          const size_t stride_h, const size_t stride_w,
                          const size_t dilation_h, const size_t dilation_w,
                          const Buffer<T> &im_buffer, const size_t im_offset,
                          const Buffer<T> &col_buffer, const size_t col_offset) {

  // All dimensions must be non-zero
  if ((channels == 0) || (height == 0) || (width == 0)) {
    throw BLASError(StatusCode::kInvalidDimension);
  }

  // Computes the size of one output dimension of the convolution (clamped to
  // a minimum of 1 when the dilated kernel is larger than the padded input)
  const auto convolution_output_size = [](const size_t in, const size_t pad,
                                          const size_t kern, const size_t stride,
                                          const size_t dilation) -> size_t {
    const auto padded_size = in + 2 * pad;
    const auto effective_kernel = dilation * (kern - 1) + 1;
    return (padded_size >= effective_kernel)
               ? (padded_size - effective_kernel) / stride + 1
               : 1;
  };
  const auto output_h = convolution_output_size(height, pad_h, kernel_h, stride_h, dilation_h);
  const auto output_w = convolution_output_size(width, pad_w, kernel_w, stride_w, dilation_w);

  // Retrieves the im2col kernel from the compiled binary
  auto kernel = Kernel(program_, "im2col");

  // Sets the 13 leading integer arguments, then the buffers and their offsets
  const int integer_args[] = {
      static_cast<int>(height),     static_cast<int>(width),
      static_cast<int>(channels),   static_cast<int>(output_h),
      static_cast<int>(output_w),   static_cast<int>(kernel_h),
      static_cast<int>(kernel_w),   static_cast<int>(pad_h),
      static_cast<int>(pad_w),      static_cast<int>(stride_h),
      static_cast<int>(stride_w),   static_cast<int>(dilation_h),
      static_cast<int>(dilation_w)};
  for (auto index = size_t{0}; index < 13; ++index) {
    kernel.SetArgument(index, integer_args[index]);
  }
  kernel.SetArgument(13, im_buffer());
  kernel.SetArgument(14, static_cast<int>(im_offset));
  kernel.SetArgument(15, col_buffer());
  kernel.SetArgument(16, static_cast<int>(col_offset));

  // Launches the kernel: one 2D grid over the output, with the channels
  // folded into the second dimension
  const auto global = std::vector<size_t>{
      Ceil(output_w, db_["COPY_DIMX"]),
      Ceil(output_h, db_["COPY_DIMY"]) * channels};
  const auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
  RunKernel(kernel, queue_, device_, global, local, event_);
}
// Per-frame callback: updates the simulation delta-time, draws every body as
// a GL point, presents the frame, advances the simulation kernel (unless
// paused), and prints an FPS count once per second.
// 'time' is presumably a millisecond timestamp -- TODO confirm against the
// backend that invokes OnRender.
virtual void OnRender(ulong time) {
    // Update the delta-time used by the simulation. NOTE(review): with
    // unsigned arithmetic, 'time - m_last_time_dt > 0' is false only when the
    // timestamps are equal. m_time_shift divides the first delta after being
    // set elsewhere; both branches reset it to 1, so it affects one frame only.
    if (time - m_last_time_dt > 0) {
        // Elapsed ms, scaled by the shift factor, converted to seconds
        m_dt = ((time - m_last_time_dt) / (float)m_time_shift) / 1000.0f;
        m_last_time_dt = time;
        m_time_shift = 1;
    } else {
        m_time_shift = 1;
    }

    glClear(GL_COLOR_BUFFER_BIT);

    // Attribute 0: two floats at offset 0 of each Body (position).
    // Attribute 1: one float at byte offset 8 -- assumes Body begins with two
    // floats followed by this attribute; confirm against Body's declaration.
    glEnableVertexAttribArray(0);
    glEnableVertexAttribArray(1);
    glBindBuffer(GL_ARRAY_BUFFER, m_VB);
    glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, sizeof(Body), 0);
    glVertexAttribPointer(1, 1, GL_FLOAT, GL_FALSE, sizeof(Body), (const GLvoid*)8);
    glDrawArrays(GL_POINTS, 0, m_bodies.size());
    glDisableVertexAttribArray(0);
    glDisableVertexAttribArray(1);

    // Present this frame, then compute the next state (skipped while paused)
    m_backend->SwapBuffers();
    if (!m_pause) RunKernel();

    // FPS counter: report and reset once per second
    if (time - m_last_time > 1000) {
        printf("FPS: %d\n", m_frame_count);
        m_frame_count = 0;
        m_last_time = time;
    } else {
        m_frame_count++;
    }
}
void PadCopyTransposeMatrix(Queue &queue, const Device &device, const Databases &db, EventPointer event, const std::vector<Event> &waitForEvents, const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, const Buffer<T> &src, const size_t dest_one, const size_t dest_two, const size_t dest_ld, const size_t dest_offset, const Buffer<T> &dest, const T alpha, const Program &program, const bool do_pad, const bool do_transpose, const bool do_conjugate, const bool upper = false, const bool lower = false, const bool diagonal_imag_zero = false) { // Determines whether or not the fast-version could potentially be used auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) && (upper == false) && (lower == false) && (diagonal_imag_zero == false); // Determines the right kernel auto kernel_name = std::string{}; if (do_transpose) { if (use_fast_kernel && IsMultiple(src_ld, db["TRA_WPT"]) && IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) && IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) { kernel_name = "TransposeMatrixFast"; } else { use_fast_kernel = false; kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix"; } } else { if (use_fast_kernel && IsMultiple(src_ld, db["COPY_VW"]) && IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) && IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"])) { kernel_name = "CopyMatrixFast"; } else { use_fast_kernel = false; kernel_name = (do_pad) ? 
"CopyPadMatrix" : "CopyMatrix"; } } // Retrieves the kernel from the compiled binary auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { kernel.SetArgument(0, static_cast<int>(src_ld)); kernel.SetArgument(1, src()); kernel.SetArgument(2, dest()); kernel.SetArgument(3, GetRealArg(alpha)); } else { kernel.SetArgument(0, static_cast<int>(src_one)); kernel.SetArgument(1, static_cast<int>(src_two)); kernel.SetArgument(2, static_cast<int>(src_ld)); kernel.SetArgument(3, static_cast<int>(src_offset)); kernel.SetArgument(4, src()); kernel.SetArgument(5, static_cast<int>(dest_one)); kernel.SetArgument(6, static_cast<int>(dest_two)); kernel.SetArgument(7, static_cast<int>(dest_ld)); kernel.SetArgument(8, static_cast<int>(dest_offset)); kernel.SetArgument(9, dest()); kernel.SetArgument(10, GetRealArg(alpha)); if (do_pad) { kernel.SetArgument(11, static_cast<int>(do_conjugate)); } else { kernel.SetArgument(11, static_cast<int>(upper)); kernel.SetArgument(12, static_cast<int>(lower)); kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero)); } } // Launches the kernel and returns the error code. Uses global and local thread sizes based on // parameters in the database. 
if (do_transpose) { if (use_fast_kernel) { const auto global = std::vector<size_t>{ dest_one / db["TRA_WPT"], dest_two / db["TRA_WPT"] }; const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } else { const auto global = std::vector<size_t>{ Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]) }; const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } } else { if (use_fast_kernel) { const auto global = std::vector<size_t>{ dest_one / db["COPY_VW"], dest_two / db["COPY_WPT"] }; const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } else { const auto global = std::vector<size_t>{ Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]) }; const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } } }
void PadCopyTransposeMatrixBatched(Queue &queue, const Device &device, const Databases &db, EventPointer event, const std::vector<Event> &waitForEvents, const size_t src_one, const size_t src_two, const size_t src_ld, const Buffer<int> &src_offsets, const Buffer<T> &src, const size_t dest_one, const size_t dest_two, const size_t dest_ld, const Buffer<int> &dest_offsets, const Buffer<T> &dest, const Program &program, const bool do_pad, const bool do_transpose, const bool do_conjugate, const size_t batch_count) { // Determines the right kernel auto kernel_name = std::string{}; if (do_transpose) { kernel_name = (do_pad) ? "TransposePadMatrixBatched" : "TransposeMatrixBatched"; } else { kernel_name = (do_pad) ? "CopyPadMatrixBatched" : "CopyMatrixBatched"; } // Retrieves the kernel from the compiled binary auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(src_one)); kernel.SetArgument(1, static_cast<int>(src_two)); kernel.SetArgument(2, static_cast<int>(src_ld)); kernel.SetArgument(3, src_offsets()); kernel.SetArgument(4, src()); kernel.SetArgument(5, static_cast<int>(dest_one)); kernel.SetArgument(6, static_cast<int>(dest_two)); kernel.SetArgument(7, static_cast<int>(dest_ld)); kernel.SetArgument(8, dest_offsets()); kernel.SetArgument(9, dest()); if (do_pad) { kernel.SetArgument(10, static_cast<int>(do_conjugate)); } // Launches the kernel and returns the error code. Uses global and local thread sizes based on // parameters in the database. 
if (do_transpose) { const auto global = std::vector<size_t>{ Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]), batch_count }; const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"], 1}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } else { const auto global = std::vector<size_t>{ Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]), batch_count }; const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"], 1}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } }
// Launches the Xher2 kernel: a rank-2 update of the (triangular, optionally
// packed) matrix A using vectors x and y, scaled by alpha. Returns a status
// code instead of throwing: kInvalidDimension for n == 0, the result of the
// argument tests on invalid buffers, or kInvalidKernel if kernel retrieval
// or setup throws.
StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
                            const size_t n, const T alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                            const bool packed) {

  // The dimension must be larger than zero
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Which triangle holds the data, accounting for row-major storage flipping it
  const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                         (triangle == Triangle::kLower && layout == Layout::kRowMajor));
  const auto is_rowmajor = (layout == Layout::kRowMajor);

  // Validates the matrix (packed or regular) and the two vectors
  auto status = packed ? TestMatrixAP(n, a_buffer, a_offset)
                       : TestMatrixA(n, n, a_buffer, a_offset, a_ld);
  if (ErrorIn(status)) { return status; }
  status = TestVectorX(n, x_buffer, x_offset, x_inc);
  if (ErrorIn(status)) { return status; }
  status = TestVectorY(n, y_buffer, y_offset, y_inc);
  if (ErrorIn(status)) { return status; }

  // Uploads alpha as a single-element constant buffer (needed for half-precision)
  auto alpha_buffer = Buffer<T>(context_, 1);
  alpha_buffer.Write(queue_, 1, &alpha);

  try {
    // Retrieves the kernel from the compiled binary
    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
    auto kernel = Kernel(program, "Xher2");

    // Sets the kernel arguments
    kernel.SetArgument(0, static_cast<int>(n));
    kernel.SetArgument(1, alpha_buffer());
    kernel.SetArgument(2, x_buffer());
    kernel.SetArgument(3, static_cast<int>(x_offset));
    kernel.SetArgument(4, static_cast<int>(x_inc));
    kernel.SetArgument(5, y_buffer());
    kernel.SetArgument(6, static_cast<int>(y_offset));
    kernel.SetArgument(7, static_cast<int>(y_inc));
    kernel.SetArgument(8, a_buffer());
    kernel.SetArgument(9, static_cast<int>(a_offset));
    kernel.SetArgument(10, static_cast<int>(a_ld));
    kernel.SetArgument(11, static_cast<int>(is_upper));
    kernel.SetArgument(12, static_cast<int>(is_rowmajor));

    // 2D launch: each dimension covers n elements at WPT work per thread,
    // rounded up to the respective work-group size
    auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]),
                                      Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"])};
    auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
    status = RunKernel(kernel, queue_, device_, global, local, event_);
    if (ErrorIn(status)) { return status; }

    // Successfully finished the computation
    return StatusCode::kSuccess;
  } catch (...) {
    return StatusCode::kInvalidKernel;
  }
}
// the process code that the host sees static OfxStatus render( OfxImageEffectHandle instance, OfxPropertySetHandle inArgs, OfxPropertySetHandle outArgs) { // get the render window and the time from the inArgs OfxTime time; OfxRectI renderWindow; OfxStatus status = kOfxStatOK; gPropHost->propGetDouble(inArgs, kOfxPropTime, 0, &time); gPropHost->propGetIntN(inArgs, kOfxImageEffectPropRenderWindow, 4, &renderWindow.x1); // Retrieve instance data associated with this effect MyInstanceData *myData = getMyInstanceData(instance); // property handles and members of each image OfxPropertySetHandle sourceImg = NULL, outputImg = NULL; int srcRowBytes, srcBitDepth, dstRowBytes, dstBitDepth; bool srcIsAlpha, dstIsAlpha; OfxRectI dstRect, srcRect; void *src, *dst; DPRINT(("Render: window = [%d, %d - %d, %d]\n", renderWindow.x1, renderWindow.y1, renderWindow.x2, renderWindow.y2)); int isOpenCLEnabled = 0; if (gHostSupportsOpenCL) { gPropHost->propGetInt(inArgs, kOfxImageEffectPropOpenCLEnabled, 0, &isOpenCLEnabled); DPRINT(("render: OpenCL rendering %s\n", isOpenCLEnabled ? 
"enabled" : "DISABLED")); } cl_context clContext = NULL; cl_command_queue cmdQ = NULL; cl_device_id deviceId = NULL; if (isOpenCLEnabled) { void* voidPtrCmdQ; gPropHost->propGetPointer(inArgs, kOfxImageEffectPropOpenCLCommandQueue, 0, &voidPtrCmdQ); cmdQ = reinterpret_cast<cl_command_queue>(voidPtrCmdQ); clGetCommandQueueInfo(cmdQ, CL_QUEUE_CONTEXT, sizeof(cl_context), &clContext, NULL); clGetCommandQueueInfo(cmdQ, CL_QUEUE_DEVICE, sizeof(cl_device_id), &deviceId, NULL); } else { clContext = GetContext(deviceId); cmdQ = clCreateCommandQueue(clContext, deviceId, 0, NULL); } char deviceName[128]; clGetDeviceInfo(deviceId, CL_DEVICE_NAME, 128, deviceName, NULL); DPRINT(("Using %s for plugin\n", deviceName)); cl_kernel kernel = GetKernel(clContext); // get the source image sourceImg = ofxuGetImage(myData->sourceClip, time, srcRowBytes, srcBitDepth, srcIsAlpha, srcRect, src); // get the output image outputImg = ofxuGetImage(myData->outputClip, time, dstRowBytes, dstBitDepth, dstIsAlpha, dstRect, dst); // get the scale parameter double rGain = 1, gGain = 1, bGain = 1; gParamHost->paramGetValueAtTime(myData->rGainParam, time, &rGain); gParamHost->paramGetValueAtTime(myData->gGainParam, time, &gGain); gParamHost->paramGetValueAtTime(myData->bGainParam, time, &bGain); DPRINT(("Gain(%f %f %f)\n", rGain, gGain, bGain)); float w = (renderWindow.x2 - renderWindow.x1); float h = (renderWindow.y2 - renderWindow.y1); const size_t rowSize = w * 4 * sizeof(float); if (isOpenCLEnabled) { DPRINT(("Using OpenCL transfers (same device)\n")); RunKernel(cmdQ, deviceId, kernel, w, h, rGain, gGain, bGain, (cl_mem)src, (cl_mem)dst); } else { DPRINT(("Using CPU transfers\n")); const size_t bufferSize = w * h * 4 * sizeof(float); // Allocate the temporary buffers on the plugin device cl_mem inBuffer = clCreateBuffer(clContext, CL_MEM_READ_ONLY, bufferSize, NULL, NULL); cl_mem outBuffer = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, bufferSize, NULL, NULL); // Copy the buffer from the CPU to 
the plugin device clEnqueueWriteBuffer(cmdQ, inBuffer, CL_TRUE, 0, bufferSize, src, 0, NULL, NULL); RunKernel(cmdQ, deviceId, kernel, w, h, rGain, gGain, bGain, inBuffer, outBuffer); // Copy the buffer from the plugin device to the CPU clEnqueueReadBuffer(cmdQ, outBuffer, CL_TRUE, 0, bufferSize, dst, 0, NULL, NULL); clFinish(cmdQ); // Free the temporary buffers on the plugin device clReleaseMemObject(inBuffer); clReleaseMemObject(outBuffer); } if (sourceImg) { gEffectHost->clipReleaseImage(sourceImg); } if (outputImg) { gEffectHost->clipReleaseImage(outputImg); } return status; }
// Launches a GEMM kernel ('kernelName') from the pre-compiled program with a
// caller-specified launch configuration. 'argumentOrder' is a comma-separated
// list naming the kernel arguments in order; recognized names are "m", "n",
// "k" (sizes, passed as 32-bit ints) and "a", "b", "c" (the matrix buffers).
// An unrecognized name still consumes an argument index and is otherwise
// ignored (pre-existing behavior, kept intentionally).
// Throws BLASError(kInvalidDimension) when any of m, n, k is zero.
void Xgemm<T>::DoGemm(const std::vector<size_t>& global, const std::vector<size_t>& local,
                      const std::string& kernelName, std::string argumentOrder,
                      const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                      const size_t m, const size_t n, const size_t k,
                      const T alpha,
                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                      const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
                      const T beta,
                      const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Makes sure all dimensions are larger than zero
  if ((m == 0) || (n == 0) || (k == 0)) { throw BLASError(StatusCode::kInvalidDimension); }

  // NOTE(review): the layout/transpose analysis (which of A/B/C is rotated in
  // memory, and which transposes are conjugated) previously computed here was
  // dead code -- it only fed a commented-out kernel-name selection, and the
  // caller now passes 'kernelName' directly. It has been removed; the
  // 'layout' and '*_transpose' parameters remain for interface compatibility.

  // Retrieves the requested kernel from the compiled binary
  auto kernel = Kernel(*program_, kernelName);

  // Sets one kernel argument by (trimmed) name; every token consumes an index
  size_t argument_index = 0;
  auto setArg = [&](std::string arg) {
    arg = trim(arg);
    if (arg == "m") { kernel.SetArgument(argument_index, static_cast<int>(m)); }
    else if (arg == "n") { kernel.SetArgument(argument_index, static_cast<int>(n)); }
    else if (arg == "k") { kernel.SetArgument(argument_index, static_cast<int>(k)); }
    else if (arg == "a") { kernel.SetArgument(argument_index, a_buffer()); }
    else if (arg == "b") { kernel.SetArgument(argument_index, b_buffer()); }
    else if (arg == "c") { kernel.SetArgument(argument_index, c_buffer()); }
    ++argument_index;
  };

  // Parses the comma-separated argument order and sets each kernel argument
  size_t pos = 0;
  while ((pos = argumentOrder.find(',')) != std::string::npos) {
    setArg(argumentOrder.substr(0, pos));
    argumentOrder.erase(0, pos + 1);
  }
  setArg(argumentOrder);  // the last (or only) argument

  // Launches the kernel
  RunKernel(kernel, queue_, device_, global, local, event_);
}