void L1DistanceMatrixTU(El::UpperOrLower uplo, direction_t dirA, direction_t dirB, T alpha, const El::DistMatrix<T, El::STAR, El::MC> &A, const El::DistMatrix<T, El::STAR, El::MR> &B, T beta, El::DistMatrix<T> &C) { // TODO verify sizes const T *a = A.LockedBuffer(); El::Int ldA = A.LDim(); const T *b = B.LockedBuffer(); El::Int ldB = B.LDim(); T *c = C.Buffer(); El::Int ldC = C.LDim(); El::Int d = A.Height(); /* Not the most efficient way... but mimicking BLAS is too much work! */ if (dirA == base::COLUMNS && dirB == base::COLUMNS) { El::Int n = C.LocalWidth(); El::Int m = C.LocalHeight(); for (El::Int j = 0; j < n; j++) for(El::Int i = ((uplo == El::UPPER) ? 0 : C.LocalRowOffset(A.GlobalCol(j))); i < ((uplo == El::UPPER) ? C.LocalRowOffset(A.GlobalCol(j) + 1) : m); i++) { T v = 0.0; for (El::Int k = 0; k < d; k++) v += std::abs(b[j * ldB + k] - a[i * ldA + k]); c[j * ldC + i] = beta * c[j * ldC + i] + alpha * v; } } // TODO the rest of the cases. }
inline void outer_panel_mixed_gemm_impl_tn( const double alpha, const SpParMat<index_type, value_type, SpDCCols<index_type, value_type> > &A, const El::DistMatrix<value_type, col_d, El::STAR> &S, const double beta, El::DistMatrix<value_type, El::STAR, El::STAR> &C) { El::DistMatrix<value_type, El::STAR, El::STAR> tmp_C(C.Height(), C.Width()); El::Zero(tmp_C); utility::combblas_slab_view_t<index_type, value_type> cbview(A, false); //FIXME: factor size_t slab_size = 2 * S.Grid().Height(); for(size_t cur_row_idx = 0; cur_row_idx < cbview.ncols(); cur_row_idx += slab_size) { size_t cur_slab_size = std::min(slab_size, cbview.ncols() - cur_row_idx); // get the next slab_size columns of B El::DistMatrix<value_type, El::STAR, El::STAR> A_row(cur_slab_size, S.Height()); // transpose is column //cbview.extract_elemental_column_slab_view(A_row, cur_slab_size); cbview.extract_full_slab_view(cur_slab_size); // matrix mult (FIXME only iter nz) for(size_t l_row_idx = 0; l_row_idx < A_row.LocalHeight(); ++l_row_idx) { size_t g_row_idx = l_row_idx * A_row.ColStride() + A_row.ColShift() + cur_row_idx; for(size_t l_col_idx = 0; l_col_idx < A_row.LocalWidth(); l_col_idx++) { //XXX: should be the same as l_col_idx size_t g_col_idx = l_col_idx * A_row.RowStride() + A_row.RowShift(); // continue if we don't own values in S in this row if(!S.IsLocalRow(g_col_idx)) continue; //get transposed value value_type val = alpha * cbview(g_col_idx, g_row_idx); for(size_t s_col_idx = 0; s_col_idx < S.LocalWidth(); s_col_idx++) { tmp_C.UpdateLocal(g_row_idx, s_col_idx, val * S.GetLocal(S.LocalRow(g_col_idx), s_col_idx)); } } } } //FIXME: scaling if(A.getcommgrid()->GetRank() == 0) { for(size_t col_idx = 0; col_idx < C.Width(); col_idx++) for(size_t row_idx = 0; row_idx < C.Height(); row_idx++) tmp_C.UpdateLocal(row_idx, col_idx, beta * C.GetLocal(row_idx, col_idx)); } //FIXME: Use utility getter boost::mpi::communicator world( A.getcommgrid()->GetWorld(), boost::mpi::comm_duplicate); boost::mpi::all_reduce (world, tmp_C.LockedBuffer(), C.Height() * C.Width(), C.Buffer(), std::plus<value_type>()); }