// Registers the local memory usage (in bytes) as a function of the tuning parameters.
//   V==1 or V==2:  WGS<V> * GetBytes(precision)
//   otherwise:     (WGS<V>*WPT<V> + WPT<V>) * GetBytes(precision)
static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
  const auto precision = args.precision;
  const auto suffix = std::to_string(V);

  // Variants other than 1 and 2 depend on both the work-group size and the work-per-thread
  if (V != 1 && V != 2) {
    auto BytesFromWgsAndWpt = [precision] (std::vector<size_t> v) {
      return (v[0]*v[1] + v[1])*GetBytes(precision);
    };
    tuner.SetLocalMemoryUsage(id, BytesFromWgsAndWpt, {"WGS"+suffix, "WPT"+suffix});
    return;
  }

  // Variants 1 and 2 depend on the work-group size only
  auto BytesFromWgs = [precision] (std::vector<size_t> v) {
    return v[0]*GetBytes(precision);
  };
  tuner.SetLocalMemoryUsage(id, BytesFromWgs, {"WGS"+suffix});
}
// Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &tuner, const size_t id) { if (V==2 || V==3) { auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}); } if (V==3) { auto LargerOrEqual = [] (std::vector<size_t> v) { return v[0] >= v[1]; }; tuner.AddConstraint(id, LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}); } }
// Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, std::vector<T> &, std::vector<T> &, std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &, std::vector<T> &) { auto alpha_buffer = std::vector<T>{args.alpha}; tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentOutput(b_mat); tuner.AddArgumentInput(alpha_buffer); }
// Sets the constraints static void SetConstraints(cltune::Tuner &tuner, const size_t id) { auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); }; auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; // Requirement for unrolling the WGD loop tuner.AddConstraint(id, MultipleOfX, {"WGD", "KWID"}); // Required for integer MWID and NWID tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"}); tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"}); // Required for integer MWIAD and NWIBD tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}); tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}); // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...) tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}); tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}); // Extra constraints for variation 1 to limit the set of options significantly if (V==1) { auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; }; tuner.AddConstraint(id, IsEqual, {"MDIMCD", "MDIMAD"}); tuner.AddConstraint(id, IsEqual, {"NDIMCD", "NDIMBD"}); } }
// Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { if (V==1) { // limited subset of tuning parameters - but explorable exhaustively tuner.AddParameter(id, "WGD", {8, 16, 32});//64,128 tuner.AddParameter(id, "MDIMCD", {4, 8, 16}); tuner.AddParameter(id, "NDIMCD", {4, 8, 16}); tuner.AddParameter(id, "MDIMAD", {4, 8, 16}); tuner.AddParameter(id, "NDIMBD", {4, 8, 16}); tuner.AddParameter(id, "KWID", {1, 2, 4}); //1,4 tuner.AddParameter(id, "VWMD", {1, 2, 4}); tuner.AddParameter(id, "VWND", {1, 2, 4}); tuner.AddParameter(id, "PADA", {1});//0 tuner.AddParameter(id, "PADB", {1});//0 } // a lot more tuning parameters - has to be sampled randomly, too much to test all else { tuner.AddParameter(id, "WGD", {8, 16, 32, 64, 128}); tuner.AddParameter(id, "MDIMCD", {8, 16, 32}); tuner.AddParameter(id, "NDIMCD", {8, 16, 32}); tuner.AddParameter(id, "MDIMAD", {8, 16, 32}); tuner.AddParameter(id, "NDIMBD", {8, 16, 32}); tuner.AddParameter(id, "KWID", {2, 8, 16}); tuner.AddParameter(id, "VWMD", {1, 2, 4, 8}); tuner.AddParameter(id, "VWND", {1, 2, 4, 8}); tuner.AddParameter(id, "PADA", {0, 1}); tuner.AddParameter(id, "PADB", {0, 1}); } }
// Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, std::vector<T> &, std::vector<T> &, std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat, std::vector<T> &) { tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentScalar(static_cast<int>(args.n)); tuner.AddArgumentScalar(static_cast<int>(args.k)); tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentScalar(GetRealArg(args.beta)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentScalar(0); // a_offset tuner.AddArgumentScalar(static_cast<int>(args.k)); // a_ld tuner.AddArgumentInput(b_mat); tuner.AddArgumentScalar(0); // b_offset tuner.AddArgumentScalar(static_cast<int>(args.n)); // b_ld tuner.AddArgumentOutput(c_mat); tuner.AddArgumentScalar(0); // c_offset tuner.AddArgumentScalar(static_cast<int>(args.n)); // c_ld tuner.AddArgumentScalar(1); // c_do_transpose tuner.AddArgumentScalar(0); // a_conjugate tuner.AddArgumentScalar(0); // b_conjugate }
// Sets the local memory size static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) { auto LocalMemorySize = [args] (std::vector<size_t> v) { return ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2]))*GetBytes(args.precision)); }; tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "PADA", "PADB"}); }
// Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, std::vector<T> &x_vec, std::vector<T> &y_vec, std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &, std::vector<T> &) { auto a_rotated = (V==3) ? 1 : 0; tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentScalar(static_cast<int>(args.n)); tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentScalar(GetRealArg(args.beta)); tuner.AddArgumentScalar(static_cast<int>(a_rotated)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentScalar(0); tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentInput(x_vec); tuner.AddArgumentScalar(0); tuner.AddArgumentScalar(1); tuner.AddArgumentOutput(y_vec); tuner.AddArgumentScalar(0); tuner.AddArgumentScalar(1); tuner.AddArgumentScalar(0); // Conjugate transpose tuner.AddArgumentScalar(0); // Additional parameter tuner.AddArgumentScalar(0); // Banded 'kl' tuner.AddArgumentScalar(0); // Banded 'ku' }
// Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64}); tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16}); tuner.AddParameter(id, "TRA_PAD", {0, 1}); tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1}); }