/// Stores a beacon key pair through SetArgument().
///
/// The public key is stored under "publickey" + cpid + GetNetSuffix() and the
/// private key under "privatekey" + cpid + GetNetSuffix().
///
/// @param cpid    Identifier embedded in both argument names.
/// @param pubKey  Value stored under the public key name.
/// @param privKey Value stored under the private key name.
void ActivateBeaconKeys(
    const std::string &cpid,
    const std::string &pubKey,
    const std::string &privKey)
{
    // Each name is built immediately before it is stored, public key first.
    const std::string publicKeyName = "publickey" + cpid + GetNetSuffix();
    SetArgument(publicKeyName, pubKey);

    const std::string privateKeyName = "privatekey" + cpid + GetNetSuffix();
    SetArgument(privateKeyName, privKey);
}
// Launches the "FillVector" kernel from 'program'. The kernel receives, in this order: n, inc
// and offset (each cast to int), the destination buffer and the constant value (passed through
// GetRealArg). The unnamed Databases argument is not used.
void FillVector(Queue &queue, const Device &device, const Program &program, const Databases &,
                EventPointer event, const std::vector<Event> &waitForEvents,
                const size_t n, const size_t inc, const size_t offset,
                const Buffer<T> &dest, const T constant_value) {

  // One-dimensional launch: work-groups of 64, global size is n passed through Ceil(n, 64)
  const std::vector<size_t> local_size{64};
  const std::vector<size_t> global_size{Ceil(n, 64)};

  // Retrieves the kernel and sets its arguments
  Kernel fill_kernel(program, "FillVector");
  fill_kernel.SetArgument(0, static_cast<int>(n));
  fill_kernel.SetArgument(1, static_cast<int>(inc));
  fill_kernel.SetArgument(2, static_cast<int>(offset));
  fill_kernel.SetArgument(3, dest());
  fill_kernel.SetArgument(4, GetRealArg(constant_value));

  RunKernel(fill_kernel, queue, device, global_size, local_size, event, waitForEvents);
}
// Takes over the arguments held by 'src'.
//
// Nothing happens unless both formulas report the same number of arguments.
// Otherwise every non-null mapped value in src->m_arguments is handed to
// SetArgument() on this formula, and src's argument container is then cleared.
void FORMULA_Formula::TryCaptureArguments(FORMULA_Formula* src)
{
    if (GetArgumentsNum() == src->GetArgumentsNum())
    {
        for (FORMULA_Arguments::iterator entry = src->m_arguments.begin();
             entry != src->m_arguments.end();
             ++entry)
        {
            // Null entries are skipped.
            if (entry->second)
            {
                SetArgument(entry->second);
            }
        }

        src->m_arguments.clear();
    }
}
void PadCopyTransposeMatrix(Queue &queue, const Device &device, const Databases &db, EventPointer event, const std::vector<Event> &waitForEvents, const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, const Buffer<T> &src, const size_t dest_one, const size_t dest_two, const size_t dest_ld, const size_t dest_offset, const Buffer<T> &dest, const T alpha, const Program &program, const bool do_pad, const bool do_transpose, const bool do_conjugate, const bool upper = false, const bool lower = false, const bool diagonal_imag_zero = false) { // Determines whether or not the fast-version could potentially be used auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) && (upper == false) && (lower == false) && (diagonal_imag_zero == false); // Determines the right kernel auto kernel_name = std::string{}; if (do_transpose) { if (use_fast_kernel && IsMultiple(src_ld, db["TRA_WPT"]) && IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) && IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) { kernel_name = "TransposeMatrixFast"; } else { use_fast_kernel = false; kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix"; } } else { if (use_fast_kernel && IsMultiple(src_ld, db["COPY_VW"]) && IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) && IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"])) { kernel_name = "CopyMatrixFast"; } else { use_fast_kernel = false; kernel_name = (do_pad) ? 
"CopyPadMatrix" : "CopyMatrix"; } } // Retrieves the kernel from the compiled binary auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { kernel.SetArgument(0, static_cast<int>(src_ld)); kernel.SetArgument(1, src()); kernel.SetArgument(2, dest()); kernel.SetArgument(3, GetRealArg(alpha)); } else { kernel.SetArgument(0, static_cast<int>(src_one)); kernel.SetArgument(1, static_cast<int>(src_two)); kernel.SetArgument(2, static_cast<int>(src_ld)); kernel.SetArgument(3, static_cast<int>(src_offset)); kernel.SetArgument(4, src()); kernel.SetArgument(5, static_cast<int>(dest_one)); kernel.SetArgument(6, static_cast<int>(dest_two)); kernel.SetArgument(7, static_cast<int>(dest_ld)); kernel.SetArgument(8, static_cast<int>(dest_offset)); kernel.SetArgument(9, dest()); kernel.SetArgument(10, GetRealArg(alpha)); if (do_pad) { kernel.SetArgument(11, static_cast<int>(do_conjugate)); } else { kernel.SetArgument(11, static_cast<int>(upper)); kernel.SetArgument(12, static_cast<int>(lower)); kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero)); } } // Launches the kernel and returns the error code. Uses global and local thread sizes based on // parameters in the database. 
if (do_transpose) { if (use_fast_kernel) { const auto global = std::vector<size_t>{ dest_one / db["TRA_WPT"], dest_two / db["TRA_WPT"] }; const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } else { const auto global = std::vector<size_t>{ Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]) }; const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } } else { if (use_fast_kernel) { const auto global = std::vector<size_t>{ dest_one / db["COPY_VW"], dest_two / db["COPY_WPT"] }; const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } else { const auto global = std::vector<size_t>{ Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]) }; const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } } }
void PadCopyTransposeMatrixBatched(Queue &queue, const Device &device, const Databases &db, EventPointer event, const std::vector<Event> &waitForEvents, const size_t src_one, const size_t src_two, const size_t src_ld, const Buffer<int> &src_offsets, const Buffer<T> &src, const size_t dest_one, const size_t dest_two, const size_t dest_ld, const Buffer<int> &dest_offsets, const Buffer<T> &dest, const Program &program, const bool do_pad, const bool do_transpose, const bool do_conjugate, const size_t batch_count) { // Determines the right kernel auto kernel_name = std::string{}; if (do_transpose) { kernel_name = (do_pad) ? "TransposePadMatrixBatched" : "TransposeMatrixBatched"; } else { kernel_name = (do_pad) ? "CopyPadMatrixBatched" : "CopyMatrixBatched"; } // Retrieves the kernel from the compiled binary auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(src_one)); kernel.SetArgument(1, static_cast<int>(src_two)); kernel.SetArgument(2, static_cast<int>(src_ld)); kernel.SetArgument(3, src_offsets()); kernel.SetArgument(4, src()); kernel.SetArgument(5, static_cast<int>(dest_one)); kernel.SetArgument(6, static_cast<int>(dest_two)); kernel.SetArgument(7, static_cast<int>(dest_ld)); kernel.SetArgument(8, dest_offsets()); kernel.SetArgument(9, dest()); if (do_pad) { kernel.SetArgument(10, static_cast<int>(do_conjugate)); } // Launches the kernel and returns the error code. Uses global and local thread sizes based on // parameters in the database. 
if (do_transpose) { const auto global = std::vector<size_t>{ Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]), batch_count }; const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"], 1}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } else { const auto global = std::vector<size_t>{ Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]), batch_count }; const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"], 1}; RunKernel(kernel, queue, device, global, local, event, waitForEvents); } }
// Parses the command line into 'set'.
//
// Returns:
//   USR_PTR_PASS_FAIL  if 'argv' or 'set' is NULL
//   USR_HELP           if no arguments were given or help was requested (help is printed)
//   USR_MEM_ERROR      if the content path table could not be allocated
//   the value returned by SetArgument() when it is below 1 (an error message is printed)
//   the non-zero value returned by CheckArgumentCombination() or SetKeys()
//   0                  on success
int ParseArgs(int argc, char *argv[], user_settings *set)
{
	if (argv == NULL || set == NULL)
		return USR_PTR_PASS_FAIL;

	if (argc < 2) {
		DisplayHelp(argv[0]);
		return USR_HELP;
	}

	// Help requests are detected before anything else is parsed
	for (int i = 1; i < argc; i++) {
		if (strcmp(argv[i], "-help") == 0) {
			DisplayHelp(argv[0]);
			return USR_HELP;
		}
		if (strcmp(argv[i], "-exthelp") == 0) {
			DisplayExtendedHelp(argv[0]);
			return USR_HELP;
		}
	}

	// Allocate the (zero-initialised) table of content path pointers
	set->common.contentPath = calloc(CIA_MAX_CONTENT, sizeof(char*));
	if (set->common.contentPath == NULL) {
		fprintf(stderr, "[SETTING ERROR] Not Enough Memory\n");
		return USR_MEM_ERROR;
	}

	// Initialise keys, then apply default settings
	InitKeys(&set->common.keys);
	SetDefaults(set);

	// Parse the arguments; SetArgument() reports how far to advance in argv
	int arg_index = 1;
	while (arg_index < argc) {
		int advance = SetArgument(argc, arg_index, argv, set);
		if (advance < 1) {
			fprintf(stderr, "[RESULT] Invalid arguments, see '%s -help'\n", argv[0]);
			return advance;
		}
		arg_index += advance;
	}

	// Check the argument combination
	int status = CheckArgumentCombination(set);
	if (status != 0)
		return status;

	// Set the keys
	status = SetKeys(&set->common.keys);
	if (status != 0)
		return status;

	// An explicit output file name needs no further work
	if (set->common.outFileName)
		return 0;

	// Otherwise derive the output path from the relevant input path
	char *source_path = NULL;
	if (set->ncch.buildNcch0) {
		source_path = set->common.rsfPath;
	}
	else if (set->common.workingFileType == infile_ncsd
	         || set->common.workingFileType == infile_cia
	         || set->common.workingFileType == infile_srl) {
		source_path = set->common.workingFilePath;
	}
	else {
		source_path = set->common.contentPath[0];
	}

	set->common.outFileName_mallocd = true;
	set->common.outFileName = replace_filextention(source_path, GetOutputExtention(set->common.outFormat));

	return 0;
}
// Validates the arguments, uploads 'alpha' to a one-element device buffer and launches the
// "Xher2" kernel over a two-dimensional range.
//
// Returns:
//   kInvalidDimension  if n is zero
//   the failing status of TestMatrixAP/TestMatrixA, TestVectorX or TestVectorY
//   the failing status of RunKernel
//   kInvalidKernel     if anything between program retrieval and kernel launch throws
//   kSuccess           otherwise
StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
                            const size_t n,
                            const T alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                            const bool packed) {

  // The dimension has to be larger than zero
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Row-major storage swaps which triangle holds the data
  const bool is_rowmajor = (layout == Layout::kRowMajor);
  const bool is_upper = (is_rowmajor) ? (triangle == Triangle::kLower)
                                      : (triangle == Triangle::kUpper);

  // Validates the matrix (packed or regular), then both vectors
  StatusCode status = (packed) ? TestMatrixAP(n, a_buffer, a_offset)
                               : TestMatrixA(n, n, a_buffer, a_offset, a_ld);
  if (ErrorIn(status)) { return status; }
  status = TestVectorX(n, x_buffer, x_offset, x_inc);
  if (ErrorIn(status)) { return status; }
  status = TestVectorY(n, y_buffer, y_offset, y_inc);
  if (ErrorIn(status)) { return status; }

  // The scalar is passed as a constant device buffer (needed for half-precision)
  Buffer<T> alpha_buffer(context_, 1);
  alpha_buffer.Write(queue_, 1, &alpha);

  try {
    // Retrieves the kernel from the compiled binary
    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
    Kernel kernel(program, "Xher2");

    // Sets the kernel arguments
    kernel.SetArgument(0, static_cast<int>(n));
    kernel.SetArgument(1, alpha_buffer());
    kernel.SetArgument(2, x_buffer());
    kernel.SetArgument(3, static_cast<int>(x_offset));
    kernel.SetArgument(4, static_cast<int>(x_inc));
    kernel.SetArgument(5, y_buffer());
    kernel.SetArgument(6, static_cast<int>(y_offset));
    kernel.SetArgument(7, static_cast<int>(y_inc));
    kernel.SetArgument(8, a_buffer());
    kernel.SetArgument(9, static_cast<int>(a_offset));
    kernel.SetArgument(10, static_cast<int>(a_ld));
    kernel.SetArgument(11, static_cast<int>(is_upper));
    kernel.SetArgument(12, static_cast<int>(is_rowmajor));

    // Thread sizes follow from the "WPT", "WGS1" and "WGS2" database parameters
    const std::vector<size_t> global{Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]),
                                     Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"])};
    const std::vector<size_t> local{db_["WGS1"], db_["WGS2"]};

    // Launches the kernel
    status = RunKernel(kernel, queue_, device_, global, local, event_);
    if (ErrorIn(status)) { return status; }

    // Successfully finished the computation
    return StatusCode::kSuccess;
  } catch (...) { return StatusCode::kInvalidKernel; }
}
// Computes the im2col output size from the image, kernel, padding, stride and dilation
// parameters, then launches the "im2col" kernel from 'program_' with all of them as arguments.
//
// Throws BLASError(StatusCode::kInvalidDimension) when 'channels', 'height' or 'width' is zero,
// or when 'stride_h' or 'stride_w' is zero (the output-size computation divides by the stride).
void Xim2col<T>::DoIm2col(const size_t channels, const size_t height, const size_t width,
                          const size_t kernel_h, const size_t kernel_w,
                          const size_t pad_h, const size_t pad_w,
                          const size_t stride_h, const size_t stride_w,
                          const size_t dilation_h, const size_t dilation_w,
                          const Buffer<T> &im_buffer, const size_t im_offset,
                          const Buffer<T> &col_buffer, const size_t col_offset) {

  // Makes sure all dimensions are larger than zero
  if ((channels == 0) || (height == 0) || (width == 0)) { throw BLASError(StatusCode::kInvalidDimension); }

  // A zero stride would cause an integer division by zero in the output-size computation below
  if ((stride_h == 0) || (stride_w == 0)) { throw BLASError(StatusCode::kInvalidDimension); }

  // Sets the output height and width. When the dilated kernel does not fit in the padded image
  // the output size falls back to 1.
  const auto size_h = height + 2 * pad_h;
  const auto padding_h = dilation_h * (kernel_h - 1) + 1;
  const auto output_h = (size_h >= padding_h) ? (size_h - padding_h) / stride_h + 1 : 1;
  const auto size_w = width + 2 * pad_w;
  const auto padding_w = dilation_w * (kernel_w - 1) + 1;
  const auto output_w = (size_w >= padding_w) ? (size_w - padding_w) / stride_w + 1 : 1;

  // Retrieves the im2col kernel from the compiled binary
  auto kernel = Kernel(program_, "im2col");

  // Sets the kernel arguments
  kernel.SetArgument(0, static_cast<int>(height));
  kernel.SetArgument(1, static_cast<int>(width));
  kernel.SetArgument(2, static_cast<int>(channels));
  kernel.SetArgument(3, static_cast<int>(output_h));
  kernel.SetArgument(4, static_cast<int>(output_w));
  kernel.SetArgument(5, static_cast<int>(kernel_h));
  kernel.SetArgument(6, static_cast<int>(kernel_w));
  kernel.SetArgument(7, static_cast<int>(pad_h));
  kernel.SetArgument(8, static_cast<int>(pad_w));
  kernel.SetArgument(9, static_cast<int>(stride_h));
  kernel.SetArgument(10, static_cast<int>(stride_w));
  kernel.SetArgument(11, static_cast<int>(dilation_h));
  kernel.SetArgument(12, static_cast<int>(dilation_w));
  kernel.SetArgument(13, im_buffer());
  kernel.SetArgument(14, static_cast<int>(im_offset));
  kernel.SetArgument(15, col_buffer());
  kernel.SetArgument(16, static_cast<int>(col_offset));

  // Launches the kernel: the first dimension covers the output width, the second covers the
  // output height for every channel, both rounded with Ceil to the work-group size
  const auto w_ceiled = Ceil(output_w, db_["COPY_DIMX"]);
  const auto h_ceiled = Ceil(output_h, db_["COPY_DIMY"]);
  const auto global = std::vector<size_t>{w_ceiled, h_ceiled * channels};
  const auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
  RunKernel(kernel, queue_, device_, global, local, event_);
}
// Launches the kernel named 'kernelName' from 'program_' over the caller-supplied 'global' and
// 'local' thread sizes.
//
// 'argumentOrder' is a comma-separated list describing the kernel's argument slots in order.
// Each token is passed through trim() and then mapped as follows:
//   "m", "n", "k"  -> the corresponding size, cast to int
//   "a", "b", "c"  -> the corresponding buffer
// Any other token (including an empty one) sets nothing but still consumes its slot index.
//
// The layout, transpose, alpha/beta, offset and leading-dimension parameters are accepted for
// interface compatibility but are not used by this function.
//
// Throws BLASError(StatusCode::kInvalidDimension) when m, n or k is zero.
void Xgemm<T>::DoGemm(const std::vector<size_t>& global, const std::vector<size_t>& local,
                      const std::string& kernelName, std::string argumentOrder,
                      const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                      const size_t m, const size_t n, const size_t k,
                      const T alpha,
                      const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                      const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
                      const T beta,
                      const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {

  // Makes sure all dimensions are larger than zero
  if ((m == 0) || (n == 0) || (k == 0)) { throw BLASError(StatusCode::kInvalidDimension); }

  // Retrieves the requested kernel from the compiled binary
  auto kernel = Kernel(*program_, kernelName);

  // Sets one kernel argument per token; the slot index advances for every token, known or not
  size_t i = 0;
  auto setArg = [&](std::string arg) {
    arg = trim(arg);
    if (arg == "m") { kernel.SetArgument(i, static_cast<int>(m)); }
    else if (arg == "n") { kernel.SetArgument(i, static_cast<int>(n)); }
    else if (arg == "k") { kernel.SetArgument(i, static_cast<int>(k)); }
    else if (arg == "a") { kernel.SetArgument(i, a_buffer()); }
    else if (arg == "b") { kernel.SetArgument(i, b_buffer()); }
    else if (arg == "c") { kernel.SetArgument(i, c_buffer()); }
    i = i + 1;
  };

  // Splits the argument order on commas, scanning forward instead of erasing consumed text
  const std::string delimiter(",");
  size_t token_begin = 0;
  size_t delimiter_pos = 0;
  while ((delimiter_pos = argumentOrder.find(delimiter, token_begin)) != std::string::npos) {
    setArg(argumentOrder.substr(token_begin, delimiter_pos - token_begin));
    token_begin = delimiter_pos + delimiter.length();
  }
  setArg(argumentOrder.substr(token_begin)); // the last token has no trailing delimiter

  // Launches the kernel
  RunKernel(kernel, queue_, device_, global, local, event_);
}