void TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset, const size_t inc) { if (inc == 0) { throw BLASError(StatusCode::kInvalidIncrementY); } try { const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T); if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryY); } } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorY, e.what()); } }
void TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer, const size_t offset, const size_t ld) { if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimC); } try { const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryC); } } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixC, e.what()); } }
// Sets up and launches the "im2col" kernel.
//   channels, height, width   input image dimensions; each must be non-zero
//   kernel_h, kernel_w        convolution kernel size
//   pad_h, pad_w              padding, counted twice per dimension (2 * pad)
//   stride_h, stride_w        convolution stride; each must be non-zero
//   dilation_h, dilation_w    kernel dilation
//   im_buffer, im_offset      input image buffer and the offset passed to the kernel with it
//   col_buffer, col_offset    output column buffer and the offset passed to the kernel with it
// Throws BLASError(kInvalidDimension) when channels, height, width, stride_h or stride_w is zero.
void Xim2col<T>::DoIm2col(const size_t channels, const size_t height, const size_t width,
                          const size_t kernel_h, const size_t kernel_w,
                          const size_t pad_h, const size_t pad_w,
                          const size_t stride_h, const size_t stride_w,
                          const size_t dilation_h, const size_t dilation_w,
                          const Buffer<T> &im_buffer, const size_t im_offset,
                          const Buffer<T> &col_buffer, const size_t col_offset) {

  // Makes sure all dimensions are larger than zero
  if ((channels == 0) || (height == 0) || (width == 0)) { throw BLASError(StatusCode::kInvalidDimension); }

  // The strides are used as divisors below: a zero stride would be an integer division by zero
  if ((stride_h == 0) || (stride_w == 0)) { throw BLASError(StatusCode::kInvalidDimension); }

  // Sets the output height and width. When the padded image is smaller than the dilated kernel
  // extent the output size falls back to 1.
  // NOTE(review): a kernel_h or kernel_w of zero wraps around in 'kernel - 1' (unsigned
  // arithmetic) - confirm that callers never pass zero here.
  const auto size_h = height + 2 * pad_h;
  const auto padding_h = dilation_h * (kernel_h - 1) + 1;
  const auto output_h = (size_h >= padding_h) ? (size_h - padding_h) / stride_h + 1 : 1;
  const auto size_w = width + 2 * pad_w;
  const auto padding_w = dilation_w * (kernel_w - 1) + 1;
  const auto output_w = (size_w >= padding_w) ? (size_w - padding_w) / stride_w + 1 : 1;

  // Retrieves the im2col kernel from the compiled binary
  auto kernel = Kernel(program_, "im2col");

  // Sets the kernel arguments (all sizes and offsets are passed as 'int')
  kernel.SetArgument(0, static_cast<int>(height));
  kernel.SetArgument(1, static_cast<int>(width));
  kernel.SetArgument(2, static_cast<int>(channels));
  kernel.SetArgument(3, static_cast<int>(output_h));
  kernel.SetArgument(4, static_cast<int>(output_w));
  kernel.SetArgument(5, static_cast<int>(kernel_h));
  kernel.SetArgument(6, static_cast<int>(kernel_w));
  kernel.SetArgument(7, static_cast<int>(pad_h));
  kernel.SetArgument(8, static_cast<int>(pad_w));
  kernel.SetArgument(9, static_cast<int>(stride_h));
  kernel.SetArgument(10, static_cast<int>(stride_w));
  kernel.SetArgument(11, static_cast<int>(dilation_h));
  kernel.SetArgument(12, static_cast<int>(dilation_w));
  kernel.SetArgument(13, im_buffer());
  kernel.SetArgument(14, static_cast<int>(im_offset));
  kernel.SetArgument(15, col_buffer());
  kernel.SetArgument(16, static_cast<int>(col_offset));

  // Launches the kernel: the output width and height are passed through Ceil() together with the
  // COPY_DIMX / COPY_DIMY database values, and the second global dimension is scaled by the
  // number of channels
  const auto w_ceiled = Ceil(output_w, db_["COPY_DIMX"]);
  const auto h_ceiled = Ceil(output_h, db_["COPY_DIMY"]);
  const auto global = std::vector<size_t>{w_ceiled, h_ceiled * channels};
  const auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
  RunKernel(kernel, queue_, device_, global, local, event_);
}
void TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) { try { const auto required_size = (n + offset) * sizeof(T); if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryScalar); } } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidVectorScalar, e.what()); } }
void TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) { try { const auto required_size = (((n * (n + 1)) / 2) + offset) * sizeof(T); if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryA); } } catch (const Error<std::runtime_error> &e) { throw BLASError(StatusCode::kInvalidMatrixA, e.what()); } }
// Launches a caller-selected GEMM kernel with a caller-defined kernel-argument order.
//   global, local   work sizes forwarded unchanged to RunKernel
//   kernelName      name of the kernel to retrieve from the compiled program
//   argumentOrder   comma-separated list of argument names; the position of a name in the list
//                   is the kernel-argument index it is bound to. Recognised names (after
//                   trimming): "m", "n", "k" (passed as 'int') and "a", "b", "c" (the buffers).
//                   Any other name sets nothing but still consumes its index.
//   m, n, k         matrix dimensions; each must be non-zero
//   a_buffer, b_buffer, c_buffer   buffers bound for the names "a", "b" and "c"
// The remaining parameters (layout, a_transpose, b_transpose, alpha, beta, the offsets and the
// leading dimensions) are accepted but not forwarded to the kernel by this routine.
// Throws BLASError(kInvalidDimension) when m, n or k is zero.
void Xgemm<T>::DoGemm(const std::vector<size_t>& global, const std::vector<size_t>& local,
                      const std::string& kernelName, std::string argumentOrder,
                      const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                      const size_t m, const size_t n, const size_t k,
                      const T alpha,
                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                      const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
                      const T beta,
                      const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Makes sure all dimensions are larger than zero
  if ((m == 0) || (n == 0) || (k == 0)) { throw BLASError(StatusCode::kInvalidDimension); }

  // Retrieves the requested kernel from the compiled binary
  auto kernel = Kernel(*program_, kernelName);

  // Binds one named argument to the next kernel-argument index. The index advances for every
  // name, including unrecognised ones.
  // NOTE(review): an unrecognised name leaves its kernel argument unset - confirm whether that
  // is intended or should be reported as an error.
  size_t arg_index = 0;
  auto setArg = [&](std::string arg) {
    arg = trim(arg);
    if (arg == "m") { kernel.SetArgument(arg_index, static_cast<int>(m)); }
    else if (arg == "n") { kernel.SetArgument(arg_index, static_cast<int>(n)); }
    else if (arg == "k") { kernel.SetArgument(arg_index, static_cast<int>(k)); }
    else if (arg == "a") { kernel.SetArgument(arg_index, a_buffer()); }
    else if (arg == "b") { kernel.SetArgument(arg_index, b_buffer()); }
    else if (arg == "c") { kernel.SetArgument(arg_index, c_buffer()); }
    ++arg_index;
  };

  // Parses the comma-separated argument order and sets the kernel arguments. The text after the
  // last comma (or the whole string when there is no comma) is always processed as the final
  // argument, even when it is empty.
  size_t token_start = 0;
  for (auto comma = argumentOrder.find(',', token_start); comma != std::string::npos;
       comma = argumentOrder.find(',', token_start)) {
    setArg(argumentOrder.substr(token_start, comma - token_start));
    token_start = comma + 1;
  }
  setArg(argumentOrder.substr(token_start));

  // Launches the kernel
  RunKernel(kernel, queue_, device_, global, local, event_);
}