// Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, std::vector<T> &, std::vector<T> &, std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat, std::vector<T> &) { tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentScalar(static_cast<int>(args.n)); tuner.AddArgumentScalar(static_cast<int>(args.k)); tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentScalar(GetRealArg(args.beta)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentScalar(0); // a_offset tuner.AddArgumentScalar(static_cast<int>(args.k)); // a_ld tuner.AddArgumentInput(b_mat); tuner.AddArgumentScalar(0); // b_offset tuner.AddArgumentScalar(static_cast<int>(args.n)); // b_ld tuner.AddArgumentOutput(c_mat); tuner.AddArgumentScalar(0); // c_offset tuner.AddArgumentScalar(static_cast<int>(args.n)); // c_ld tuner.AddArgumentScalar(1); // c_do_transpose tuner.AddArgumentScalar(0); // a_conjugate tuner.AddArgumentScalar(0); // b_conjugate }
// Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, std::vector<T> &x_vec, std::vector<T> &y_vec, std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &, std::vector<T> &) { auto a_rotated = (V==3) ? 1 : 0; tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentScalar(static_cast<int>(args.n)); tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentScalar(GetRealArg(args.beta)); tuner.AddArgumentScalar(static_cast<int>(a_rotated)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentScalar(0); tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentInput(x_vec); tuner.AddArgumentScalar(0); tuner.AddArgumentScalar(1); tuner.AddArgumentOutput(y_vec); tuner.AddArgumentScalar(0); tuner.AddArgumentScalar(1); tuner.AddArgumentScalar(0); // Conjugate transpose tuner.AddArgumentScalar(0); // Additional parameter tuner.AddArgumentScalar(0); // Banded 'kl' tuner.AddArgumentScalar(0); // Banded 'ku' }
// Runs the "FillVector" kernel from the given program on the destination buffer. The kernel
// receives 'n', 'inc' and 'offset' as integers, the buffer itself and the constant value. The
// 'Databases' parameter is unused.
void FillVector(Queue &queue, const Device &device,
                const Program &program, const Databases &,
                EventPointer event, const std::vector<Event> &waitForEvents,
                const size_t n, const size_t inc, const size_t offset,
                const Buffer<T> &dest,
                const T constant_value) {

  // Fixed local size; the same value is handed to Ceil() to derive the global size from 'n'
  // NOTE(review): Ceil() presumably rounds 'n' up to a multiple of this value - confirm
  constexpr size_t kLocalSize = 64;

  // Retrieves the kernel and sets its arguments
  Kernel kernel(program, "FillVector");
  kernel.SetArgument(0, static_cast<int>(n));
  kernel.SetArgument(1, static_cast<int>(inc));
  kernel.SetArgument(2, static_cast<int>(offset));
  kernel.SetArgument(3, dest());
  kernel.SetArgument(4, GetRealArg(constant_value));

  // Launches the kernel with one-dimensional thread sizes
  auto global = std::vector<size_t>{Ceil(n, kLocalSize)};
  auto local = std::vector<size_t>{kLocalSize};
  RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
/*
 * Fetches argument number 'element' via GetpMatrix() and returns it as a Double.
 *
 * The argument is accepted only when it is defined and its type is IS_LONG or
 * IS_DOUBLE; it is then converted with GetRealArg(). Any other argument (undefined,
 * or of another type) is reported through ErrMsgTxt() with the message
 * "Expecting a scalar argument.".
 *
 * Returns the converted value, or 0.0 if ErrMsgTxt() returns to the caller.
 *
 * The error is reported through ErrMsgTxt() only: the process must not be aborted
 * here, otherwise the message is never delivered and a bad script argument takes
 * down the whole host process.
 */
Double GetRealScalar(structlpsolvecaller *lpsolvecaller, int element)
{
        zval arg;
        Double a = 0.0;

        ZVAL_UNDEF(&arg);
        arg = GetpMatrix(lpsolvecaller, element);

        /* An argument that is neither an integer nor a double counts as missing. */
        if ((!Z_ISUNDEF(arg)) && (Z_TYPE(arg) != IS_LONG) && (Z_TYPE(arg) != IS_DOUBLE)) {
                ZVAL_UNDEF(&arg);
        }

        if (Z_ISUNDEF(arg)) {
                ErrMsgTxt(lpsolvecaller, "Expecting a scalar argument.");
        }
        else {
                a = GetRealArg(lpsolvecaller, arg);
        }

        return(a);
}
void PadCopyTransposeMatrix(Queue &queue, const Device &device, const Databases &db, EventPointer event, const std::vector<Event> &waitForEvents, const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, const Buffer<T> &src, const size_t dest_one, const size_t dest_two, const size_t dest_ld, const size_t dest_offset, const Buffer<T> &dest, const T alpha, const Program &program, const bool do_pad, const bool do_transpose, const bool do_conjugate, const bool upper = false, const bool lower = false, const bool diagonal_imag_zero = false) { // Determines whether or not the fast-version could potentially be used auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) && (upper == false) && (lower == false) && (diagonal_imag_zero == false); // Determines the right kernel auto kernel_name = std::string{}; if (do_transpose) { if (use_fast_kernel && IsMultiple(src_ld, db["TRA_WPT"]) && IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) && IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) { kernel_name = "TransposeMatrixFast"; } else { use_fast_kernel = false; kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix"; } } else { if (use_fast_kernel && IsMultiple(src_ld, db["COPY_VW"]) && IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) && IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"])) { kernel_name = "CopyMatrixFast"; } else { use_fast_kernel = false; kernel_name = (do_pad) ? 
"CopyPadMatrix" : "CopyMatrix"; } } // Retrieves the kernel from the compiled binary auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { kernel.SetArgument(0, static_cast<int>(src_ld)); kernel.SetArgument(1, src()); kernel.SetArgument(2, dest()); kernel.SetArgument(3, GetRealArg(alpha)); } else { kernel.SetArgument(0, static_cast<int>(src_one)); kernel.SetArgument(1, static_cast<int>(src_two)); kernel.SetArgument(2, static_cast<int>(src_ld)); kernel.SetArgument(3, static_cast<int>(src_offset)); kernel.SetArgument(4, src()); kernel.SetArgument(5, static_cast<int>(dest_one)); kernel.SetArgument(6, static_cast<int>(dest_two)); kernel.SetArgument(7, static_cast<int>(dest_ld)); kernel.SetArgument(8, static_cast<int>(dest_offset)); kernel.SetArgument(9, dest()); kernel.SetArgument(10, GetRealArg(alpha)); if (do_pad) { kernel.SetArgument(11, static_cast<int>(do_conjugate)); } else { kernel.SetArgument(11, static_cast<int>(upper)); kernel.SetArgument(12, static_cast<int>(lower)); kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero)); } } // Launches the kernel and returns the error code. Uses global and local thread sizes based on // parameters in the database. 
if (do_transpose) { if (use_fast_kernel) { const auto global = std::vector<size_t>{ dest_one / db["TRA_WPT"], dest_two / db["TRA_WPT"] }; const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } else { const auto global = std::vector<size_t>{ Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]) }; const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } } else { if (use_fast_kernel) { const auto global = std::vector<size_t>{ dest_one / db["COPY_VW"], dest_two / db["COPY_WPT"] }; const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } else { const auto global = std::vector<size_t>{ Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]) }; const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } } }
int GetRealSparseVector(structlpsolvecaller *lpsolvecaller, int element, Double *vec, int *index, int start, int len, int col) { int m, n, count = 0; zval pm = GetpMatrix(lpsolvecaller, element); zval *data; HashTable *arr_hash; Double a; zend_string *key; ulong i; if ((Z_ISUNDEF(pm)) || (Z_TYPE(pm) != IS_ARRAY)) { ErrMsgTxt(lpsolvecaller, "invalid vector."); } #if 1 m = GetM(lpsolvecaller, pm); n = GetN(lpsolvecaller, pm); #else m = zend_hash_num_elements(Z_ARRVAL_P(pm)); n = 1; #endif if ( ((col == 0) && (((m != 1) && (n != 1)) || ((m == 1) && (n > len)) || ((n == 1) && (m > len)))) || ((col != 0) && ((m > len) || (col > n))) /* || !IsNumeric(pm) || IsComplex(pm) */ ) { /* Printf("1: m=%d, n=%d, col=%d, len=%d, IsNumeric=%d, IsComplex=%d\n", m,n,col,len,IsNumeric(pm),IsComplex(pm)); */ ErrMsgTxt(lpsolvecaller, "invalid vector."); } if ((((n == 1) || (col != 0)) && (m > len)) || ((col == 0) && (m == 1) && (n > len))) { /* Printf("2: m=%d, n=%d, col=%d, len=%d\n", m,n,col,len); */ ErrMsgTxt(lpsolvecaller, "invalid vector."); } arr_hash = Z_ARRVAL(pm); ZEND_HASH_FOREACH_KEY_VAL(arr_hash, i, key, data) { if (key) { ErrMsgTxt(lpsolvecaller, "invalid vector."); } else { zval pm = *data; a = 0; if (Z_TYPE(pm) == IS_ARRAY) { zval *data; HashTable *arr_hash; zend_string *key; ulong i; arr_hash = Z_ARRVAL(pm); ZEND_HASH_FOREACH_KEY_VAL(arr_hash, i, key, data) { if (key) { ErrMsgTxt(lpsolvecaller, "invalid vector."); } else if (i + 1 == col) { a = GetRealArg(lpsolvecaller, *data); break; } } ZEND_HASH_FOREACH_END(); } else { a = GetRealArg(lpsolvecaller, pm); } if (a) { *(vec++) = a; *(index++) = start + i; count++; } }