// Sets up the four index arrays (rscat, cscat, rbs, cbs) on first use and then
// hands the operation to matrify_and_run<Mat>.
//
// The arrays are carved out of a single pool allocation of 2*m + 2*n
// stride_type elements, where m and n are length(0) and length(1) of the
// operand selected by Mat (A for MAT_A, B for MAT_B, otherwise C):
//   rscat : m entries, cscat : n entries, rbs : m entries, cbs : n entries.
//
// Setup is skipped whenever rscat is already non-null on entry.
void operator()(const communicator& comm, const config& cfg, T alpha, MatrixA& A, MatrixB& B, T beta, MatrixC& C)
{
    using namespace matrix_constants;

    // Extents of whichever operand Mat designates.
    len_type rows;
    len_type cols;
    if (Mat == MAT_A)
    {
        rows = A.length(0);
        cols = A.length(1);
    }
    else if (Mat == MAT_B)
    {
        rows = B.length(0);
        cols = B.length(1);
    }
    else
    {
        rows = C.length(0);
        cols = C.length(1);
    }

    if (rscat == nullptr)
    {
        // Only the master performs the allocation; the resulting pointer is
        // then passed through comm.broadcast (presumably so that every member
        // of the communicator ends up with the master's value).
        if (comm.master())
        {
            const len_type total_entries = 2*rows + 2*cols;
            scat_buffer = Pool.allocate<stride_type>(total_entries);
            rscat = scat_buffer.get<stride_type>();
        }

        comm.broadcast(rscat);

        // Walk a cursor through the allocation to place the remaining arrays
        // back to back after rscat.
        auto cursor = rscat;
        cursor += rows;
        cscat = cursor;
        cursor += cols;
        rbs = cursor;
        cursor += rows;
        cbs = cursor;
    }

    matrify_and_run<Mat>(*this, comm, cfg, alpha, A, B, beta, C);
}
void operator()(const communicator& comm, const config& cfg, T alpha, MatrixA& A, MatrixB& B, T beta, MatrixC& C) { using namespace matrix_constants; const len_type MR = (Mat == MAT_B ? cfg.gemm_kr.def<T>() : cfg.gemm_mr.def<T>()); const len_type NR = (Mat == MAT_A ? cfg.gemm_kr.def<T>() : cfg.gemm_nr.def<T>()); len_type m = (Mat == MAT_A ? A.length(0) : Mat == MAT_B ? B.length(0) : C.length(0)); len_type n = (Mat == MAT_A ? A.length(1) : Mat == MAT_B ? B.length(1) : C.length(1)); m = round_up(m, MR); n = round_up(n, NR); auto& pack_buffer = child.pack_buffer; auto& pack_ptr = child.pack_ptr; if (!pack_ptr) { if (comm.master()) { len_type scatter_size = size_as_type<stride_type,T>(2*m + 2*n); pack_buffer = Pool.allocate<T>(m*n + std::max(m,n)*TBLIS_MAX_UNROLL + scatter_size); pack_ptr = pack_buffer.get(); } comm.broadcast(pack_ptr); rscat = convert_and_align<T,stride_type>(static_cast<T*>(pack_ptr) + m*n); cscat = rscat+m; rbs = cscat+n; cbs = rbs+m; } Sib::operator()(comm, cfg, alpha, A, B, beta, C); }