// Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &tuner, const size_t id) { if (V==2 || V==3) { auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}); } if (V==3) { auto LargerOrEqual = [] (std::vector<size_t> v) { return v[0] >= v[1]; }; tuner.AddConstraint(id, LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}); } }
// Sets the constraints static void SetConstraints(cltune::Tuner &tuner, const size_t id) { auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); }; auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; // Requirement for unrolling the WGD loop tuner.AddConstraint(id, MultipleOfX, {"WGD", "KWID"}); // Required for integer MWID and NWID tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"}); tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"}); // Required for integer MWIAD and NWIBD tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}); tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}); // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...) tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}); tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}); // Extra constraints for variation 1 to limit the set of options significantly if (V==1) { auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; }; tuner.AddConstraint(id, IsEqual, {"MDIMCD", "MDIMAD"}); tuner.AddConstraint(id, IsEqual, {"NDIMCD", "NDIMBD"}); } }
// Prints one ';'-separated row to stdout: first the values of the arguments named in 'options_'
// (all integer-like ones, then alpha/beta as strings), followed per entry of 'timings' by the
// measured time and the GFLOPS and GB/s numbers derived from it. The row ends with a newline.
void Client<T,U>::PrintTableRow(const Arguments<U>& args,
                                const std::vector<std::pair<std::string, double>>& timings) {

  // Collects the integer-like arguments in the order in which 'options_' lists them
  auto integers = std::vector<size_t>{};
  const auto add = [&integers](const size_t value) { integers.push_back(value); };
  for (const auto &option: options_) {
    if      (option == kArgM)          { add(args.m); }
    else if (option == kArgN)          { add(args.n); }
    else if (option == kArgK)          { add(args.k); }
    else if (option == kArgKU)         { add(args.ku); }
    else if (option == kArgKL)         { add(args.kl); }
    else if (option == kArgLayout)     { add(static_cast<size_t>(args.layout)); }
    else if (option == kArgSide)       { add(static_cast<size_t>(args.side)); }
    else if (option == kArgTriangle)   { add(static_cast<size_t>(args.triangle)); }
    else if (option == kArgATransp)    { add(static_cast<size_t>(args.a_transpose)); }
    else if (option == kArgBTransp)    { add(static_cast<size_t>(args.b_transpose)); }
    else if (option == kArgDiagonal)   { add(static_cast<size_t>(args.diagonal)); }
    else if (option == kArgXInc)       { add(args.x_inc); }
    else if (option == kArgYInc)       { add(args.y_inc); }
    else if (option == kArgXOffset)    { add(args.x_offset); }
    else if (option == kArgYOffset)    { add(args.y_offset); }
    else if (option == kArgALeadDim)   { add(args.a_ld); }
    else if (option == kArgBLeadDim)   { add(args.b_ld); }
    else if (option == kArgCLeadDim)   { add(args.c_ld); }
    else if (option == kArgAOffset)    { add(args.a_offset); }
    else if (option == kArgBOffset)    { add(args.b_offset); }
    else if (option == kArgCOffset)    { add(args.c_offset); }
    else if (option == kArgAPOffset)   { add(args.ap_offset); }
    else if (option == kArgDotOffset)  { add(args.dot_offset); }
    else if (option == kArgNrm2Offset) { add(args.nrm2_offset); }
    else if (option == kArgAsumOffset) { add(args.asum_offset); }
    else if (option == kArgImaxOffset) { add(args.imax_offset); }
  }

  // Collects the scalar arguments, converted to text
  auto strings = std::vector<std::string>{};
  for (const auto &option: options_) {
    if (option == kArgAlpha) {
      strings.push_back(ToString(args.alpha));
    }
    else if (option == kArgBeta) {
      strings.push_back(ToString(args.beta));
    }
  }

  // Prints the integers. Unless abbreviation is disabled, exact multiples of 1024*1024 and of
  // 1024 are shortened with an 'M' or 'K' suffix; every column stays 9 characters wide.
  constexpr auto kKilo = 1024;
  constexpr auto kMega = 1024*1024;
  const auto abbreviate = !args.no_abbrv;
  for (const auto value: integers) {
    if (abbreviate && value >= kMega && IsMultiple(value, kMega)) {
      fprintf(stdout, "%8zuM;", value/kMega);
    }
    else if (abbreviate && value >= kKilo && IsMultiple(value, kKilo)) {
      fprintf(stdout, "%8zuK;", value/kKilo);
    }
    else {
      fprintf(stdout, "%9zu;", value);
    }
  }

  // Prints the scalars
  for (const auto &text: strings) {
    fprintf(stdout, "%9s;", text.c_str());
  }

  // Prints the results of every tested library
  for (const auto& timing : timings) {
    const auto &library = timing.first;
    const auto elapsed = timing.second;

    // Derives the GFLOPS and GB/s metrics; both are reported as zero for a zero time
    // NOTE(review): the 1e-6 scaling presumably assumes a time in milliseconds - confirm
    const auto flops = get_flops_(args);
    const auto bytes = get_bytes_(args);
    auto gflops = 0.0;
    auto gbs = 0.0;
    if (elapsed != 0.0) {
      gflops = (flops*1e-6)/elapsed;
      gbs = (bytes*1e-6)/elapsed;
    }

    // Every library other than CLBlast is preceded by a separator
    if (library != "CLBlast") {
      fprintf(stdout, ";");
    }
    fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf", elapsed, gflops, gbs);
  }
  fprintf(stdout, "\n");
}
void PadCopyTransposeMatrix(Queue &queue, const Device &device, const Databases &db, EventPointer event, const std::vector<Event> &waitForEvents, const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, const Buffer<T> &src, const size_t dest_one, const size_t dest_two, const size_t dest_ld, const size_t dest_offset, const Buffer<T> &dest, const T alpha, const Program &program, const bool do_pad, const bool do_transpose, const bool do_conjugate, const bool upper = false, const bool lower = false, const bool diagonal_imag_zero = false) { // Determines whether or not the fast-version could potentially be used auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) && (upper == false) && (lower == false) && (diagonal_imag_zero == false); // Determines the right kernel auto kernel_name = std::string{}; if (do_transpose) { if (use_fast_kernel && IsMultiple(src_ld, db["TRA_WPT"]) && IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) && IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) { kernel_name = "TransposeMatrixFast"; } else { use_fast_kernel = false; kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix"; } } else { if (use_fast_kernel && IsMultiple(src_ld, db["COPY_VW"]) && IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) && IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"])) { kernel_name = "CopyMatrixFast"; } else { use_fast_kernel = false; kernel_name = (do_pad) ? 
"CopyPadMatrix" : "CopyMatrix"; } } // Retrieves the kernel from the compiled binary auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { kernel.SetArgument(0, static_cast<int>(src_ld)); kernel.SetArgument(1, src()); kernel.SetArgument(2, dest()); kernel.SetArgument(3, GetRealArg(alpha)); } else { kernel.SetArgument(0, static_cast<int>(src_one)); kernel.SetArgument(1, static_cast<int>(src_two)); kernel.SetArgument(2, static_cast<int>(src_ld)); kernel.SetArgument(3, static_cast<int>(src_offset)); kernel.SetArgument(4, src()); kernel.SetArgument(5, static_cast<int>(dest_one)); kernel.SetArgument(6, static_cast<int>(dest_two)); kernel.SetArgument(7, static_cast<int>(dest_ld)); kernel.SetArgument(8, static_cast<int>(dest_offset)); kernel.SetArgument(9, dest()); kernel.SetArgument(10, GetRealArg(alpha)); if (do_pad) { kernel.SetArgument(11, static_cast<int>(do_conjugate)); } else { kernel.SetArgument(11, static_cast<int>(upper)); kernel.SetArgument(12, static_cast<int>(lower)); kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero)); } } // Launches the kernel and returns the error code. Uses global and local thread sizes based on // parameters in the database. 
if (do_transpose) { if (use_fast_kernel) { const auto global = std::vector<size_t>{ dest_one / db["TRA_WPT"], dest_two / db["TRA_WPT"] }; const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } else { const auto global = std::vector<size_t>{ Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]) }; const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } } else { if (use_fast_kernel) { const auto global = std::vector<size_t>{ dest_one / db["COPY_VW"], dest_two / db["COPY_WPT"] }; const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } else { const auto global = std::vector<size_t>{ Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]) }; const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } } }