void compare_result(size_t rank, DistMatrixType &expected_A,
    El::DistMatrix<double, El::STAR, El::STAR> &result) {

    col_t &data = expected_A.seq();
    const size_t my_row_offset = skylark::utility::cb_my_row_offset(expected_A);
    const size_t my_col_offset = skylark::utility::cb_my_col_offset(expected_A);

    for(typename col_t::SpColIter col = data.begcol();
        col != data.endcol(); col++) {
        for(typename col_t::SpColIter::NzIter nz = data.begnz(col);
            nz != data.endnz(col); nz++) {

            const size_t rowid = nz.rowid()  + my_row_offset;
            const size_t colid = col.colid() + my_col_offset;
            const double value = nz.value();

            if(value != result.GetLocal(rowid, colid)) {
                std::ostringstream os;
                os << rank << ": " << rowid << ", " << colid << ": "
                   << value << " != " << result.GetLocal(rowid, colid)
                   << std::endl;
                std::cout << os.str() << std::flush;
                BOOST_FAIL("Result application not as expected");
            }
        }
    }
}
void SymmetricEuclideanDistanceMatrix(El::UpperOrLower uplo, direction_t dir,
    T alpha, const El::ElementalMatrix<T> &A,
    T beta, El::ElementalMatrix<T> &C) {

    T *c = C.Buffer();
    int ldC = C.LDim();

    if (dir == base::COLUMNS) {
        El::Herk(uplo, El::ADJOINT, -2.0 * alpha, A, beta, C);
        //El::Gemm(El::ADJOINT, El::NORMAL, T(-2.0) * alpha, A, A, beta, C);

        El::DistMatrix<El::Base<T>, El::STAR, El::STAR> N;
        ColumnNrm2(A, N);
        El::Base<T> *nn = N.Buffer();

        El::Int n = C.LocalWidth();
        El::Int m = C.LocalHeight();
        for(El::Int j = 0; j < n; j++)
            for(El::Int i =
                    ((uplo == El::UPPER) ? 0 : C.LocalRowOffset(A.GlobalCol(j)));
                i < ((uplo == El::UPPER) ?
                     C.LocalRowOffset(A.GlobalCol(j) + 1) : m); i++) {

                El::Base<T> a = nn[C.GlobalRow(i)];
                El::Base<T> b = nn[C.GlobalCol(j)];
                c[j * ldC + i] += alpha * (a * a + b * b);
            }
    }

    // TODO the rest of the cases.
}
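// The routine above uses the identity ||a_i - a_j||^2 = ||a_i||^2 + ||a_j||^2
// - 2 a_i^H a_j: the Herk call contributes the -2 * alpha * a_i^H a_j term and
// the loop adds alpha * (||a_i||^2 + ||a_j||^2). A minimal local, real-valued
// reference sketch of the same quantity; the name LocalSqDistRef and the use
// of El::Matrix are illustrative only, not part of the routine above.
template <typename T>
void LocalSqDistRef(T alpha, const El::Matrix<T> &A, El::Matrix<T> &D) {
    const El::Int d = A.Height(), n = A.Width();
    El::Zeros(D, n, n);
    for (El::Int j = 0; j < n; ++j)
        for (El::Int i = 0; i < n; ++i) {
            T v = 0;
            for (El::Int k = 0; k < d; ++k) {
                const T diff = A.Get(k, i) - A.Get(k, j);
                v += diff * diff;          // squared distance between columns i and j
            }
            D.Set(i, j, alpha * v);
        }
}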
inline void mixed_gemm_local_part_nn(
        const double alpha,
        const SpParMat<index_type, value_type, SpDCCols<index_type, value_type> > &A,
        const El::DistMatrix<value_type, El::STAR, El::STAR> &S,
        const double beta,
        std::vector<value_type> &local_matrix) {

    typedef SpDCCols< index_type, value_type > col_t;
    typedef SpParMat< index_type, value_type, col_t > matrix_type;
    matrix_type &_A = const_cast<matrix_type&>(A);
    col_t &data = _A.seq();

    //FIXME
    local_matrix.resize(S.Width() * data.getnrow(), 0);
    size_t cb_col_offset = utility::cb_my_col_offset(A);

    for(typename col_t::SpColIter col = data.begcol();
        col != data.endcol(); col++) {
        for(typename col_t::SpColIter::NzIter nz = data.begnz(col);
            nz != data.endnz(col); nz++) {

            // we want the local index here to fill the local dense matrix
            index_type rowid = nz.rowid();
            // the column needs to be a global index
            index_type colid = col.colid() + cb_col_offset;

            // compute the application of S to yield a partial row in the result
            for(size_t bcol = 0; bcol < S.Width(); ++bcol) {
                local_matrix[rowid * S.Width() + bcol] +=
                    alpha * S.Get(colid, bcol) * nz.value();
            }
        }
    }
}
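// mixed_gemm_local_part_nn above accumulates the local part of alpha * A * S
// into a dense row-major buffer: entry (local_row, col) of the partial result
// lives at local_matrix[local_row * S.Width() + col]. A hedged accessor that
// only documents this layout; the name local_entry is illustrative and not
// part of the library.
inline value_type local_entry(const std::vector<value_type> &local_matrix,
        size_t local_row, size_t col, size_t result_width) {
    return local_matrix[local_row * result_width + col];
}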
inline void Gemv(El::Orientation oA, T alpha,
    const El::DistMatrix<T, El::VC, El::STAR>& A,
    const El::DistMatrix<T, El::STAR, El::STAR>& x,
    El::DistMatrix<T, El::VC, El::STAR>& y) {

    int y_height = (oA == El::NORMAL ? A.Height() : A.Width());
    El::Zeros(y, y_height, 1);
    base::Gemv(oA, alpha, A, x, T(0), y);
}
inline void Gemv(El::Orientation oA, T alpha,
    const El::DistMatrix<T, El::VC, El::STAR>& A,
    const El::DistMatrix<T, El::STAR, El::STAR>& x,
    T beta, El::DistMatrix<T, El::VC, El::STAR>& y) {

    // TODO verify sizes etc.

    if (oA == El::NORMAL) {
        El::Gemv(El::NORMAL, alpha, A.LockedMatrix(), x.LockedMatrix(),
            beta, y.Matrix());
    } else {
        SKYLARK_THROW_EXCEPTION(base::unsupported_base_operation());
    }
}
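// A minimal usage sketch for the two Gemv overloads above, assuming they are
// visible as skylark::base::Gemv (the enclosing namespace is not shown here)
// and that T = double. The sizes and the helper name are illustrative only.
void example_gemv(const El::Grid &grid) {
    El::DistMatrix<double, El::VC, El::STAR> A(grid), y(grid);
    El::DistMatrix<double, El::STAR, El::STAR> x(grid);
    El::Uniform(A, 1000, 50);
    El::Uniform(x, 50, 1);
    skylark::base::Gemv(El::NORMAL, 1.0, A, x, y);       // y := 1.0 * A * x
    skylark::base::Gemv(El::NORMAL, 2.0, A, x, 1.0, y);  // y := 2.0 * A * x + y
}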
inline void outer_panel_mixed_gemm_impl_nn(
        const double alpha,
        const SpParMat<index_type, value_type, SpDCCols<index_type, value_type> > &A,
        const El::DistMatrix<value_type, El::STAR, El::STAR> &S,
        const double beta,
        El::DistMatrix<value_type, col_d, El::STAR> &C) {

    utility::combblas_slab_view_t<index_type, value_type> cbview(A, false);

    //FIXME: factor
    size_t slab_size = 2 * C.Grid().Height();

    for(size_t cur_row_idx = 0; cur_row_idx < cbview.nrows();
        cur_row_idx += slab_size) {

        size_t cur_slab_size =
            std::min(slab_size, cbview.nrows() - cur_row_idx);

        // get the next slab_size columns of B
        El::DistMatrix<value_type, col_d, El::STAR>
            A_row(cur_slab_size, S.Height());

        cbview.extract_elemental_row_slab_view(A_row, cur_slab_size);

        // assemble the distributed column vector
        for(size_t l_col_idx = 0; l_col_idx < A_row.LocalWidth();
            l_col_idx++) {

            size_t g_col_idx =
                l_col_idx * A_row.RowStride() + A_row.RowShift();

            for(size_t l_row_idx = 0; l_row_idx < A_row.LocalHeight();
                ++l_row_idx) {

                size_t g_row_idx =
                    l_row_idx * A_row.ColStride() + A_row.ColShift() +
                    cur_row_idx;

                A_row.SetLocal(l_row_idx, l_col_idx,
                    cbview(g_row_idx, g_col_idx));
            }
        }

        El::DistMatrix<value_type, col_d, El::STAR>
            C_slice(cur_slab_size, C.Width());
        El::View(C_slice, C, cur_row_idx, 0, cur_slab_size, C.Width());
        El::LocalGemm(El::NORMAL, El::NORMAL, alpha, A_row, S, beta, C_slice);
    }
}
inline void Gemv(El::Orientation oA, T alpha,
    const El::DistMatrix<T, El::VC, El::STAR>& A,
    const El::DistMatrix<T, El::VC, El::STAR>& x,
    T beta, El::DistMatrix<T, El::STAR, El::STAR>& y) {

    // TODO verify sizes etc.
    // TODO verify matching grids.

    if (oA == El::TRANSPOSE) {
        boost::mpi::communicator comm(y.Grid().Comm().comm,
            boost::mpi::comm_attach);
        El::Matrix<T> ylocal(y.Matrix());
        // beta is scaled by 1/comm.size() so that the subsequent sum-reduction
        // over all ranks contributes beta * y exactly once.
        El::Gemv(El::TRANSPOSE, alpha,
            A.LockedMatrix(), x.LockedMatrix(),
            beta / T(comm.size()), ylocal);
        boost::mpi::all_reduce(comm,
            ylocal.Buffer(), ylocal.MemorySize(), y.Buffer(),
            std::plus<T>());
    } else {
        SKYLARK_THROW_EXCEPTION(base::unsupported_base_operation());
    }
}
/*
 * Convert an Elemental [STAR, STAR] distributed vector to a std::vector
 */
int elstar2vec(const El::DistMatrix<El::Complex<double>, El::STAR, El::STAR> &Y,
        std::vector<double> &vec) {

    const El::Complex<double> *Y_ptr = Y.LockedBuffer();
    int sz = vec.size() / 2;

    #pragma omp parallel for
    for(int i = 0; i < sz; i++) {
        vec[2*i]   = El::RealPart(Y_ptr[i]);
        vec[2*i+1] = El::ImagPart(Y_ptr[i]);
    }

    return 0;
}
/*
 * Convert a std::vector to an Elemental [STAR, STAR] distributed vector
 */
int vec2elstar(const std::vector<double> &vec,
        El::DistMatrix<El::Complex<double>, El::STAR, El::STAR> &Y) {

    El::Complex<double> *Y_ptr = Y.Buffer();
    int sz = vec.size() / 2;

    #pragma omp parallel for
    for(int i = 0; i < sz; i++) {
        double real = vec[2*i];
        double imag = vec[2*i+1];
        El::SetRealPart(Y_ptr[i], real);
        El::SetImagPart(Y_ptr[i], imag);
    }

    return 0;
}
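// A minimal round-trip sketch for elstar2vec / vec2elstar above, assuming the
// interleaved real/imag convention they expect: vec must already be sized to
// 2 * Y.Height() * Y.Width(). The helper name and sizes are illustrative only.
void example_elstar_roundtrip(const El::Grid &grid) {
    El::DistMatrix<El::Complex<double>, El::STAR, El::STAR> Y(grid);
    El::Ones(Y, 16, 1);                           // replicated length-16 vector
    std::vector<double> vec(2 * Y.Height() * Y.Width());
    elstar2vec(Y, vec);                           // complex -> interleaved doubles
    vec2elstar(vec, Y);                           // interleaved doubles -> complex
}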
int elemental2vec(const El::DistMatrix<El::Complex<double>, El::VC, El::STAR> &Y,
        std::vector<double> &vec) {

    // Y is distributed [VC, STAR]: column distribution VC, row distribution STAR.
    assert((Y.DistData().colDist == El::VC) and
           (Y.DistData().rowDist == El::STAR));

    int data_dof = 2;
    int SCAL_EXP = 1;

    //double *pt_array,*pt_perm_array;
    int r, q, ll, rq;  // el vec info
    int nbigs;         // number of large recvs (i.e. recv 1 extra data point)
    int pstart;        // p_id of nstart
    int rank = El::mpi::WorldRank(); // p_id
    int recv_size;     // base recv size
    bool print = (rank == -1);

    // Get el vec info
    ll = Y.Height();
    const El::Grid* g = &(Y.Grid());
    r = g->Height();
    q = g->Width();
    MPI_Comm comm = (g->Comm()).comm;

    int cheb_deg = InvMedTree<FMM_Mat_t>::cheb_deg;
    int omp_p = omp_get_max_threads();
    size_t n_coeff3 = (cheb_deg+1)*(cheb_deg+2)*(cheb_deg+3)/6;

    // Get petsc vec params
    //VecGetLocalSize(pt_vec,&nlocal);
    int nlocal = (vec.size()) / data_dof;
    if(print) std::cout << "m: " << std::endl;
    int nstart = 0;
    //VecGetArray(pt_vec,&pt_array);
    //VecGetOwnershipRange(pt_vec,&nstart,NULL);
    MPI_Exscan(&nlocal, &nstart, 1, MPI_INT, MPI_SUM, comm);

    // Determine who owns the first element we want
    rq = r * q;
    pstart = nstart % rq;
    nbigs = nlocal % rq;
    recv_size = nlocal / rq;

    if(print){
        std::cout << "r: " << r << " q: " << q << std::endl;
        std::cout << "nstart: " << nstart << std::endl;
        std::cout << "ps: " << pstart << std::endl;
        std::cout << "nbigs: " << nbigs << std::endl;
        std::cout << "recv_size: " << recv_size << std::endl;
    }

    // Make recv sizes
    std::vector<int> recv_lengths(rq);
    std::fill(recv_lengths.begin(), recv_lengths.end(), recv_size);
    if(nbigs > 0){
        for(int i = 0; i < nbigs; i++){
            recv_lengths[(pstart + i) % rq] += 1;
        }
    }

    // Make recv disps
    std::vector<int> recv_disps = exscan(recv_lengths);

    // All2all to get send sizes
    std::vector<int> send_lengths(rq);
    MPI_Alltoall(&recv_lengths[0], 1, MPI_INT,
                 &send_lengths[0], 1, MPI_INT, comm);

    // Scan to get send_disps
    std::vector<int> send_disps = exscan(send_lengths);

    // Do all2allv to get data on correct processor
    std::vector<El::Complex<double>> recv_data(nlocal);
    std::vector<El::Complex<double>> recv_data_ordered(nlocal);
    //MPI_Alltoallv(el_vec.Buffer(),&send_lengths[0],&send_disps[0],MPI_DOUBLE,
    //    &recv_data[0],&recv_lengths[0],&recv_disps[0],MPI_DOUBLE,comm);
    El::mpi::AllToAll(Y.LockedBuffer(), &send_lengths[0], &send_disps[0],
                      &recv_data[0], &recv_lengths[0], &recv_disps[0], comm);

    if(print){
        //std::cout << "Send data: " << std::endl << *el_vec.Buffer() << std::endl;
        std::cout << "Send lengths: " << std::endl << send_lengths << std::endl;
        std::cout << "Send disps: "   << std::endl << send_disps   << std::endl;
        std::cout << "Recv data: "    << std::endl << recv_data    << std::endl;
        std::cout << "Recv lengths: " << std::endl << recv_lengths << std::endl;
        std::cout << "Recv disps: "   << std::endl << recv_disps   << std::endl;
    }

    // Reorder the data so that it is in the right order for the fmm tree
    for(int p = 0; p < rq; p++){
        int base_idx = (p - pstart + rq) % rq;
        int offset = recv_disps[p];
        for(int i = 0; i < recv_lengths[p]; i++){
            recv_data_ordered[base_idx + rq*i] = recv_data[offset + i];
        }
    }

    // loop through and put the data into the vector
    #pragma omp parallel for
    for(int i = 0; i < nlocal; i++){
        vec[2*i]   = El::RealPart(recv_data_ordered[i]);
        vec[2*i+1] = El::ImagPart(recv_data_ordered[i]);
    }

    if(print){ std::cout << "here?" << std::endl; }

    return 0;
}
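// exscan is used throughout these converters but is not defined in this file.
// The sketch below documents the behavior the callers appear to rely on: an
// exclusive prefix sum turning per-rank lengths into starting displacements.
// This is an assumption about the helper, not the project's own definition,
// hence the _sketch suffix.
std::vector<int> exscan_sketch(const std::vector<int> &lengths) {
    std::vector<int> disps(lengths.size(), 0);
    for (size_t i = 1; i < lengths.size(); ++i)
        disps[i] = disps[i - 1] + lengths[i - 1];
    return disps;
}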
int Width(const El::DistMatrix<T, U, V>& A) { return A.Width(); }
static type build_compatible(int m, int n, const El::DistMatrix<F, U, V>& A) { return type(m, n, A.Grid(), A.Root()); }
index_type owner(const El::DistMatrix<value_type, El::VR, El::STAR> &A,
        const index_type row_idx, const index_type col_idx) {
    return (row_idx / A.Grid().Width()) % A.Grid().Height() +
           (row_idx % A.Grid().Width()) * A.Grid().Height();
}
int vec2elemental(const std::vector<double> &vec,
        El::DistMatrix<El::Complex<double>, El::VC, El::STAR> &Y) {

    int data_dof = 2;
    int SCAL_EXP = 1;

    int nlocal, gsize;  // local elements, global size
    double *pt_array;   // will hold local array
    int r, q, rq;       // grid sizes
    int nbigs;          // number of large sends (i.e. send 1 extra data point)
    int pstart;         // p_id of nstart
    int rank = El::mpi::WorldRank(); // p_id
    int send_size;      // base send size
    bool print = rank == -1;

    // Get Grid and associated params
    const El::Grid* g = &(Y.Grid());
    r = g->Height();
    q = g->Width();
    MPI_Comm comm = (g->Comm()).comm;

    // Get sizes, array in petsc
    nlocal = vec.size() / data_dof;
    int nstart = 0;
    MPI_Exscan(&nlocal, &nstart, 1, MPI_INT, MPI_SUM, comm);
    //VecGetOwnershipRange(pt_vec,&nstart,NULL);

    // Find the processor that nstart belongs to, and the number of larger sends
    rq = r * q;
    pstart = nstart % rq;  // int div
    nbigs = nlocal % rq;
    send_size = nlocal / rq;

    if(print){
        std::cout << "r: " << r << " q: " << q << std::endl;
        std::cout << "nstart: " << nstart << std::endl;
        std::cout << "ps: " << pstart << std::endl;
        std::cout << "nbigs: " << nbigs << std::endl;
        std::cout << "send_size: " << send_size << std::endl;
    }

    // Make send_lengths
    std::vector<int> send_lengths(rq);
    std::fill(send_lengths.begin(), send_lengths.end(), send_size);
    if(nbigs > 0){
        for(int j = 0; j < nbigs; j++){
            send_lengths[(pstart + j) % rq] += 1;
        }
    }

    // Make send_disps
    std::vector<int> send_disps = exscan(send_lengths);

    // Copy the data into a local vector of complex values for sending
    std::vector<El::Complex<double>> indata(nlocal);
    #pragma omp parallel for
    for(int i = 0; i < nlocal; i++){
        El::Complex<double> val;
        El::SetRealPart(val, vec[2*i+0]);
        El::SetImagPart(val, vec[2*i+1]);
        indata[i] = val;
    }

    // Make send_data, i.e. reorder the data
    std::vector<El::Complex<double>> send_data(nlocal);
    for(int proc = 0; proc < rq; proc++){
        int offset = send_disps[proc];
        int base_idx = (proc - pstart + rq) % rq;
        for(int j = 0; j < send_lengths[proc]; j++){
            int idx = base_idx + (j * rq);
            send_data[offset + j] = indata[idx];
        }
    }

    // Do all2all to get recv_lengths
    std::vector<int> recv_lengths(rq);
    MPI_Alltoall(&send_lengths[0], 1, MPI_INT,
                 &recv_lengths[0], 1, MPI_INT, comm);

    // Scan to get recv_disps
    std::vector<int> recv_disps = exscan(recv_lengths);

    // Do all2allv to get data on the correct processor
    El::Complex<double> *recv_data = Y.Buffer();
    //MPI_Alltoallv(&send_data[0],&send_lengths[0],&send_disps[0],MPI_DOUBLE,
    //    &recv_data[0],&recv_lengths[0],&recv_disps[0],MPI_DOUBLE,comm);
    El::mpi::AllToAll(&send_data[0], &send_lengths[0], &send_disps[0],
                      recv_data, &recv_lengths[0], &recv_disps[0], comm);

    if(print){
        std::cout << "Send data: "    << std::endl << send_data    << std::endl;
        std::cout << "Send lengths: " << std::endl << send_lengths << std::endl;
        std::cout << "Send disps: "   << std::endl << send_disps   << std::endl;
        std::cout << "Recv data: "    << std::endl << recv_data    << std::endl;
        std::cout << "Recv lengths: " << std::endl << recv_lengths << std::endl;
        std::cout << "Recv disps: "   << std::endl << recv_disps   << std::endl;
    }

    return 0;
}
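// A minimal round-trip sketch for vec2elemental / elemental2vec above,
// assuming every rank holds nlocal complex values stored as interleaved
// real/imag doubles (so vec has 2 * nlocal entries) and Y is distributed
// [VC, STAR] over the same communicator. Names and sizes are illustrative.
void example_vc_roundtrip(const El::Grid &grid, int nlocal) {
    El::DistMatrix<El::Complex<double>, El::VC, El::STAR> Y(grid);
    El::Uniform(Y, nlocal * grid.Size(), 1);      // one distributed column
    std::vector<double> vec(2 * nlocal);
    elemental2vec(Y, vec);                        // Elemental -> interleaved local data
    vec2elemental(vec, Y);                        // interleaved local data -> Elemental
}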
int tree2elemental(InvMedTree<FMM_Mat_t> *tree,
        El::DistMatrix<T, El::VC, El::STAR> &Y) {

    int data_dof = 2;
    int SCAL_EXP = 1;

    int nlocal, gsize;  // local elements, global size
    double *pt_array;   // will hold local array
    int r, q, rq;       // grid sizes
    int nbigs;          // number of large sends (i.e. send 1 extra data point)
    int pstart;         // p_id of nstart
    int rank = El::mpi::WorldRank(); // p_id
    int send_size;      // base send size
    bool print = rank == -1;

    // Get Grid and associated params
    const El::Grid* g = &(Y.Grid());
    r = g->Height();
    q = g->Width();
    MPI_Comm comm = (g->Comm()).comm;

    std::vector<FMMNode_t*> nlist = tree->GetNGLNodes();

    int cheb_deg = InvMedTree<FMM_Mat_t>::cheb_deg;
    int omp_p = omp_get_max_threads();
    size_t n_coeff3 = (cheb_deg+1)*(cheb_deg+2)*(cheb_deg+3)/6;

    // Get sizes, array in petsc
    //VecGetSize(pt_vec,&gsize);
    gsize = tree->M / data_dof;
    nlocal = tree->m / data_dof;
    //VecGetLocalSize(pt_vec,&nlocal);
    //VecGetArray(pt_vec,&pt_array);
    int nstart = 0;
    MPI_Exscan(&nlocal, &nstart, 1, MPI_INT, MPI_SUM, comm);
    //VecGetOwnershipRange(pt_vec,&nstart,NULL);

    // Find the processor that nstart belongs to, and the number of larger sends
    rq = r * q;
    pstart = nstart % rq;  // int div
    nbigs = nlocal % rq;
    send_size = nlocal / rq;

    if(print){
        std::cout << "r: " << r << " q: " << q << std::endl;
        std::cout << "nstart: " << nstart << std::endl;
        std::cout << "ps: " << pstart << std::endl;
        std::cout << "nbigs: " << nbigs << std::endl;
        std::cout << "send_size: " << send_size << std::endl;
    }

    // Make send_lengths
    std::vector<int> send_lengths(rq);
    std::fill(send_lengths.begin(), send_lengths.end(), send_size);
    if(nbigs > 0){
        for(int j = 0; j < nbigs; j++){
            send_lengths[(pstart + j) % rq] += 1;
        }
    }

    // Make send_disps
    std::vector<int> send_disps = exscan(send_lengths);

    // Copy the data from the FMM tree into a local vector of complex values for sending
    std::vector<El::Complex<double>> indata(nlocal);
    #pragma omp parallel for
    for(size_t tid = 0; tid < omp_p; tid++){
        size_t i_start = (nlist.size() *  tid   ) / omp_p;
        size_t i_end   = (nlist.size() * (tid+1)) / omp_p;
        for(size_t i = i_start; i < i_end; i++){
            pvfmm::Vector<double>& coeff_vec = nlist[i]->ChebData();
            double s = std::pow(0.5, COORD_DIM * nlist[i]->Depth() * 0.5 * SCAL_EXP);

            size_t offset = i * n_coeff3;
            for(size_t j = 0; j < n_coeff3; j++){
                double real = coeff_vec[j] * s;            // local indices as in the pvfmm trees
                double imag = coeff_vec[j + n_coeff3] * s;
                El::Complex<double> coeff;
                El::SetRealPart(coeff, real);
                El::SetImagPart(coeff, imag);
                indata[offset + j] = coeff;
            }
        }
    }

    // Make send_data
    std::vector<El::Complex<double>> send_data(nlocal);
    for(int proc = 0; proc < rq; proc++){
        int offset = send_disps[proc];
        int base_idx = (proc - pstart + rq) % rq;
        for(int j = 0; j < send_lengths[proc]; j++){
            int idx = base_idx + (j * rq);
            send_data[offset + j] = indata[idx];
        }
    }

    // Do all2all to get recv_lengths
    std::vector<int> recv_lengths(rq);
    MPI_Alltoall(&send_lengths[0], 1, MPI_INT,
                 &recv_lengths[0], 1, MPI_INT, comm);

    // Scan to get recv_disps
    std::vector<int> recv_disps = exscan(recv_lengths);

    // Do all2allv to get data on the correct processor
    El::Complex<double> *recv_data = Y.Buffer();
    //MPI_Alltoallv(&send_data[0],&send_lengths[0],&send_disps[0],MPI_DOUBLE,
    //    &recv_data[0],&recv_lengths[0],&recv_disps[0],MPI_DOUBLE,comm);
    El::mpi::AllToAll(&send_data[0], &send_lengths[0], &send_disps[0],
                      recv_data, &recv_lengths[0], &recv_disps[0], comm);

    if(print){
        std::cout << "Send data: "    << std::endl << send_data    << std::endl;
        std::cout << "Send lengths: " << std::endl << send_lengths << std::endl;
        std::cout << "Send disps: "   << std::endl << send_disps   << std::endl;
        std::cout << "Recv data: "    << std::endl << recv_data    << std::endl;
        std::cout << "Recv lengths: " << std::endl << recv_lengths << std::endl;
        std::cout << "Recv disps: "   << std::endl << recv_disps   << std::endl;
    }

    return 0;
}
void El2Petsc_vec(El::DistMatrix<double, VC, STAR>& el_vec, Vec& pt_vec) {

    PetscInt nlocal, nstart;  // petsc vec info
    PetscScalar *pt_array, *pt_perm_array;

    int r, q, ll, rq;  // el vec info
    int nbigs;         // number of large recvs (i.e. recv 1 extra data point)
    int pstart;        // p_id of nstart
    int p = El::mpi::WorldRank(); // p_id
    int recv_size;     // base recv size
    bool print = p == -1;

    // Get el vec info
    ll = el_vec.Height();
    const El::Grid* g = &(el_vec.Grid());
    r = g->Height();
    q = g->Width();
    MPI_Comm comm = (g->Comm()).comm;

    // Get petsc vec params
    VecGetLocalSize(pt_vec, &nlocal);
    VecGetArray(pt_vec, &pt_array);
    VecGetOwnershipRange(pt_vec, &nstart, NULL);

    // Determine who owns the first element we want
    rq = r * q;
    pstart = nstart % rq;
    nbigs = nlocal % rq;
    recv_size = nlocal / rq;

    if(print){
        std::cout << "r: " << r << " q: " << q << std::endl;
        std::cout << "nstart: " << nstart << std::endl;
        std::cout << "ps: " << pstart << std::endl;
        std::cout << "nbigs: " << nbigs << std::endl;
        std::cout << "recv_size: " << recv_size << std::endl;
    }

    // Make recv sizes
    std::vector<int> recv_lengths(rq);
    std::fill(recv_lengths.begin(), recv_lengths.end(), recv_size);
    if(nbigs > 0){
        for(int i = 0; i < nbigs; i++){
            recv_lengths[(pstart + i) % rq] += 1;
        }
    }

    // Make recv disps
    std::vector<int> recv_disps = exscan(recv_lengths);

    // All2all to get send sizes
    std::vector<int> send_lengths(rq);
    MPI_Alltoall(&recv_lengths[0], 1, MPI_INT,
                 &send_lengths[0], 1, MPI_INT, comm);

    // Scan to get send_disps
    std::vector<int> send_disps = exscan(send_lengths);

    // Do all2allv to get data on correct processor
    std::vector<double> recv_data(nlocal);
    MPI_Alltoallv(el_vec.Buffer(), &send_lengths[0], &send_disps[0], MPI_DOUBLE,
                  &recv_data[0], &recv_lengths[0], &recv_disps[0], MPI_DOUBLE, comm);

    if(print){
        //std::cout << "Send data: " << std::endl << *el_vec.Buffer() << std::endl;
        std::cout << "Send lengths: " << std::endl << send_lengths << std::endl;
        std::cout << "Send disps: "   << std::endl << send_disps   << std::endl;
        std::cout << "Recv data: "    << std::endl << recv_data    << std::endl;
        std::cout << "Recv lengths: " << std::endl << recv_lengths << std::endl;
        std::cout << "Recv disps: "   << std::endl << recv_disps   << std::endl;
    }

    // Reorder for petsc
    for(int p = 0; p < rq; p++){
        int base_idx = (p - pstart + rq) % rq;
        int offset = recv_disps[p];
        for(int i = 0; i < recv_lengths[p]; i++){
            pt_array[base_idx + rq*i] = recv_data[offset + i];
        }
    }

    // Give the array back to petsc
    VecRestoreArray(pt_vec, &pt_array);
}
void Petsc2El_vec(Vec& pt_vec, El::DistMatrix<double, VC, STAR>& el_vec) {

    PetscInt nlocal, nstart, gsize;  // local elements, start p_id, global size
    PetscScalar *pt_array;           // will hold local array

    int r, q, rq;  // grid sizes
    int nbigs;     // number of large sends (i.e. send 1 extra data point)
    int pstart;    // p_id of nstart
    int p = El::mpi::WorldRank(); // p_id
    int send_size; // base send size
    bool print = p == -1;

    // Get Grid and associated params
    const El::Grid* g = &(el_vec.Grid());
    r = g->Height();
    q = g->Width();
    MPI_Comm comm = (g->Comm()).comm;

    // Get sizes, array in petsc
    VecGetSize(pt_vec, &gsize);
    VecGetLocalSize(pt_vec, &nlocal);
    VecGetArray(pt_vec, &pt_array);
    VecGetOwnershipRange(pt_vec, &nstart, NULL);

    // Find the processor that nstart belongs to, and the number of larger sends
    rq = r * q;
    pstart = nstart % rq;  // int div
    nbigs = nlocal % rq;
    send_size = nlocal / rq;

    if(print){
        std::cout << "r: " << r << " q: " << q << std::endl;
        std::cout << "nstart: " << nstart << std::endl;
        std::cout << "ps: " << pstart << std::endl;
        std::cout << "nbigs: " << nbigs << std::endl;
        std::cout << "send_size: " << send_size << std::endl;
    }

    // Make send_lengths
    std::vector<int> send_lengths(rq);
    std::fill(send_lengths.begin(), send_lengths.end(), send_size);
    if(nbigs > 0){
        for(int j = 0; j < nbigs; j++){
            send_lengths[(pstart + j) % rq] += 1;
        }
    }

    // Make send_disps
    std::vector<int> send_disps = exscan(send_lengths);

    // Make send_data
    std::vector<double> send_data(nlocal);
    for(int proc = 0; proc < rq; proc++){
        int offset = send_disps[proc];
        int base_idx = (proc - pstart + rq) % rq;
        for(int j = 0; j < send_lengths[proc]; j++){
            int idx = base_idx + (j * rq);
            send_data[offset + j] = pt_array[idx];
        }
    }

    // Do all2all to get recv_lengths
    std::vector<int> recv_lengths(rq);
    MPI_Alltoall(&send_lengths[0], 1, MPI_INT,
                 &recv_lengths[0], 1, MPI_INT, comm);

    // Scan to get recv_disps
    std::vector<int> recv_disps = exscan(recv_lengths);

    // Do all2allv to get data on the correct processor
    double *recv_data = el_vec.Buffer();
    MPI_Alltoallv(&send_data[0], &send_lengths[0], &send_disps[0], MPI_DOUBLE,
                  recv_data, &recv_lengths[0], &recv_disps[0], MPI_DOUBLE, comm);

    if(print){
        std::cout << "Send data: "    << std::endl << send_data    << std::endl;
        std::cout << "Send lengths: " << std::endl << send_lengths << std::endl;
        std::cout << "Send disps: "   << std::endl << send_disps   << std::endl;
        std::cout << "Recv data: "    << std::endl << recv_data    << std::endl;
        std::cout << "Recv lengths: " << std::endl << recv_lengths << std::endl;
        std::cout << "Recv disps: "   << std::endl << recv_disps   << std::endl;
    }

    // Give the array back to petsc
    VecRestoreArray(pt_vec, &pt_array);
}
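// A minimal usage sketch for the PETSc <-> Elemental converters above,
// assuming pt_vec and el_vec have the same global length, PetscScalar is a
// real double, and both objects live on the same MPI communicator. The length
// 1000 and the helper name are illustrative only.
void example_petsc_el_roundtrip(const El::Grid &grid, MPI_Comm comm) {
    Vec pt_vec;
    VecCreateMPI(comm, PETSC_DECIDE, 1000, &pt_vec);
    VecSet(pt_vec, 1.0);

    El::DistMatrix<double, El::VC, El::STAR> el_vec(grid);
    El::Zeros(el_vec, 1000, 1);

    Petsc2El_vec(pt_vec, el_vec);   // PETSc ownership ranges -> [VC,STAR] layout
    El2Petsc_vec(el_vec, pt_vec);   // and back

    VecDestroy(&pt_vec);
}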
inline void outer_panel_mixed_gemm_impl_tn(
        const double alpha,
        const SpParMat<index_type, value_type, SpDCCols<index_type, value_type> > &A,
        const El::DistMatrix<value_type, col_d, El::STAR> &S,
        const double beta,
        El::DistMatrix<value_type, El::STAR, El::STAR> &C) {

    El::DistMatrix<value_type, El::STAR, El::STAR>
        tmp_C(C.Height(), C.Width());
    El::Zero(tmp_C);

    utility::combblas_slab_view_t<index_type, value_type> cbview(A, false);

    //FIXME: factor
    size_t slab_size = 2 * S.Grid().Height();

    for(size_t cur_row_idx = 0; cur_row_idx < cbview.ncols();
        cur_row_idx += slab_size) {

        size_t cur_slab_size =
            std::min(slab_size, cbview.ncols() - cur_row_idx);

        // get the next slab_size columns of B
        El::DistMatrix<value_type, El::STAR, El::STAR>
            A_row(cur_slab_size, S.Height());

        // transpose is column
        //cbview.extract_elemental_column_slab_view(A_row, cur_slab_size);
        cbview.extract_full_slab_view(cur_slab_size);

        // matrix mult (FIXME only iter nz)
        for(size_t l_row_idx = 0; l_row_idx < A_row.LocalHeight();
            ++l_row_idx) {

            size_t g_row_idx =
                l_row_idx * A_row.ColStride() + A_row.ColShift() +
                cur_row_idx;

            for(size_t l_col_idx = 0; l_col_idx < A_row.LocalWidth();
                l_col_idx++) {

                //XXX: should be the same as l_col_idx
                size_t g_col_idx =
                    l_col_idx * A_row.RowStride() + A_row.RowShift();

                // continue if we don't own values in S in this row
                if(!S.IsLocalRow(g_col_idx))
                    continue;

                // get the transposed value
                value_type val = alpha * cbview(g_col_idx, g_row_idx);

                for(size_t s_col_idx = 0; s_col_idx < S.LocalWidth();
                    s_col_idx++) {

                    tmp_C.UpdateLocal(g_row_idx, s_col_idx,
                        val * S.GetLocal(S.LocalRow(g_col_idx), s_col_idx));
                }
            }
        }
    }

    //FIXME: scaling
    if(A.getcommgrid()->GetRank() == 0) {
        for(size_t col_idx = 0; col_idx < C.Width(); col_idx++)
            for(size_t row_idx = 0; row_idx < C.Height(); row_idx++)
                tmp_C.UpdateLocal(row_idx, col_idx,
                    beta * C.GetLocal(row_idx, col_idx));
    }

    //FIXME: Use utility getter
    boost::mpi::communicator world(
        A.getcommgrid()->GetWorld(), boost::mpi::comm_duplicate);
    boost::mpi::all_reduce(world,
        tmp_C.LockedBuffer(), C.Height() * C.Width(), C.Buffer(),
        std::plus<value_type>());
}
void L1DistanceMatrixTU(El::UpperOrLower uplo,
    direction_t dirA, direction_t dirB, T alpha,
    const El::DistMatrix<T, El::STAR, El::MC> &A,
    const El::DistMatrix<T, El::STAR, El::MR> &B,
    T beta, El::DistMatrix<T> &C) {

    // TODO verify sizes

    const T *a = A.LockedBuffer();
    El::Int ldA = A.LDim();

    const T *b = B.LockedBuffer();
    El::Int ldB = B.LDim();

    T *c = C.Buffer();
    El::Int ldC = C.LDim();

    El::Int d = A.Height();

    /* Not the most efficient way... but mimicking BLAS is too much work! */
    if (dirA == base::COLUMNS && dirB == base::COLUMNS) {
        El::Int n = C.LocalWidth();
        El::Int m = C.LocalHeight();
        for (El::Int j = 0; j < n; j++)
            for(El::Int i =
                    ((uplo == El::UPPER) ? 0 : C.LocalRowOffset(A.GlobalCol(j)));
                i < ((uplo == El::UPPER) ?
                     C.LocalRowOffset(A.GlobalCol(j) + 1) : m); i++) {

                T v = 0.0;
                for (El::Int k = 0; k < d; k++)
                    v += std::abs(b[j * ldB + k] - a[i * ldA + k]);

                c[j * ldC + i] = beta * c[j * ldC + i] + alpha * v;
            }
    }

    // TODO the rest of the cases.
}
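// As with the Euclidean variant earlier, a minimal local reference for what
// this kernel computes per entry: c_ij = beta * c_ij + alpha * sum_k |b_kj - a_ki|,
// i.e. the L1 distance between column i of A and column j of B. The name
// L1DistanceRef and the use of El::Matrix are illustrative only.
template <typename T>
void L1DistanceRef(T alpha, const El::Matrix<T> &A, const El::Matrix<T> &B,
        T beta, El::Matrix<T> &C) {
    const El::Int d = A.Height();
    for (El::Int j = 0; j < B.Width(); ++j)
        for (El::Int i = 0; i < A.Width(); ++i) {
            T v = 0;
            for (El::Int k = 0; k < d; ++k)
                v += std::abs(B.Get(k, j) - A.Get(k, i));
            C.Set(i, j, beta * C.Get(i, j) + alpha * v);
        }
}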
inline void inner_panel_mixed_gemm_impl_nn(
        const double alpha,
        const SpParMat<index_type, value_type, SpDCCols<index_type, value_type> > &A,
        const El::DistMatrix<value_type, El::STAR, El::STAR> &S,
        const double beta,
        El::DistMatrix<value_type, col_d, El::STAR> &C) {

    int n_proc_side   = A.getcommgrid()->GetGridRows();
    int output_width  = S.Width();
    int output_height = A.getnrow();

    size_t rank = A.getcommgrid()->GetRank();
    size_t cb_row_offset = utility::cb_my_row_offset(A);

    typedef SpDCCols< index_type, value_type > col_t;
    typedef SpParMat< index_type, value_type, col_t > matrix_type;
    matrix_type &_A = const_cast<matrix_type&>(A);
    col_t &data = _A.seq();

    // 1) compute the local values still using the CombBLAS distribution (2D
    //    processor grid). We assume the result is dense.
    std::vector<double> local_matrix;
    mixed_gemm_local_part_nn(alpha, A, S, 0.0, local_matrix);

    // 2) reduce first along rows so that each processor owns the values in
    //    the output row of the result (col_d/*) matrix and values for
    //    processors in the same processor column.
    boost::mpi::communicator my_row_comm(
        A.getcommgrid()->GetRowWorld(), boost::mpi::comm_duplicate);

    // storage for other procs in same row communicator: rank -> (row, values)
    typedef std::vector<std::pair<int, std::vector<double> > > for_rank_t;
    std::vector<for_rank_t> for_rank(n_proc_side);

    for(size_t local_row = 0; local_row < data.getnrow(); ++local_row) {

        size_t row = local_row + cb_row_offset;

        // the owner for VR/* and VC/* matrices is independent of the column
        size_t target_proc = utility::owner(C, row, static_cast<size_t>(0));

        // if the target processor is not in the current row communicator, get
        // the value in the processor grid sharing the same row.
        if(!A.getcommgrid()->OnSameProcRow(target_proc))
            target_proc = static_cast<int>(rank / n_proc_side) * n_proc_side +
                target_proc % n_proc_side;

        size_t target_row_rank =
            A.getcommgrid()->GetRankInProcRow(target_proc);

        // reduce partial row (FIXME: if the resulting matrix is still
        // expected to be sparse, change this to communicate only nnz).
        // Working on local_width columns concurrently per column processing
        // group.
        size_t local_width = S.Width();
        const value_type* buffer = &local_matrix[local_row * local_width];
        std::vector<value_type> new_values(local_width);
        boost::mpi::reduce(my_row_comm, buffer, local_width,
            &new_values[0], std::plus<value_type>(), target_row_rank);

        // the processor stores the result directly if it is the owning rank
        // of that row; otherwise it saves it for subsequent communication
        // along rows
        if(rank == utility::owner(C, row, static_cast<size_t>(0))) {
            int elem_lrow = C.LocalRow(row);
            for(size_t idx = 0; idx < local_width; ++idx) {
                int elem_lcol = C.LocalCol(idx);
                C.SetLocal(elem_lrow, elem_lcol,
                    new_values[idx] + beta * C.GetLocal(elem_lrow, elem_lcol));
            }
        } else if (rank == target_proc) {
            // store for later comm across rows
            for_rank[utility::owner(C, row, static_cast<size_t>(0)) /
                n_proc_side].push_back(std::make_pair(row, new_values));
        }
    }

    // 3) gather the remaining values along rows: we exchange all the values
    //    with the other processors in the same communicator row and then add
    //    them to our local part.
    boost::mpi::communicator my_col_comm(
        A.getcommgrid()->GetColWorld(), boost::mpi::comm_duplicate);

    std::vector<for_rank_t> new_values;
    for(int i = 0; i < n_proc_side; ++i)
        boost::mpi::gather(my_col_comm, for_rank[i], new_values, i);

    // insert the new values
    for(size_t proc = 0; proc < new_values.size(); ++proc) {
        const for_rank_t &cur = new_values[proc];
        for(size_t i = 0; i < cur.size(); ++i) {
            int elem_lrow = C.LocalRow(cur[i].first);
            for(size_t j = 0; j < cur[i].second.size(); ++j) {
                size_t elem_lcol = C.LocalCol(j);
                C.SetLocal(elem_lrow, elem_lcol,
                    cur[i].second[j] +
                    beta * C.GetLocal(elem_lrow, elem_lcol));
            }
        }
    }
}
int Height(const El::DistMatrix<T, U, V>& A) { return A.Height(); }
index_type owner(const El::DistMatrix<value_type, El::VC, El::STAR> &A,
        const index_type row_idx, const index_type col_idx) {
    return row_idx % (A.Grid().Height() * A.Grid().Width());
}
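// A small hedged sketch of how the two owner helpers above might be used,
// assuming index_type/value_type are int/double here and that the returned
// index is compared against the rank ordering used by the caller (as in
// inner_panel_mixed_gemm_impl_nn above). Names and sizes are illustrative.
void example_owner(const El::Grid &grid) {
    El::DistMatrix<double, El::VC, El::STAR> A(grid);
    El::Zeros(A, 10, 1);
    for (int row = 0; row < 10; ++row)
        std::cout << "global row " << row << " -> owner "
                  << owner(A, row, 0) << std::endl;
}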
mpi::communicator get_communicator(const El::DistMatrix<T, U, V>& A,
        mpi::comm_create_kind kind = mpi::comm_attach) {
    return mpi::communicator(A.DistComm().comm, kind);
}
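// A minimal usage sketch for get_communicator above, assuming mpi is an alias
// for boost::mpi (as the comm_attach / comm_create_kind arguments suggest).
void example_get_communicator(const El::DistMatrix<double> &A) {
    boost::mpi::communicator comm = get_communicator(A);
    if (comm.rank() == 0)
        std::cout << "distribution communicator size: " << comm.size() << std::endl;
}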