/** \brief
 * In many situations a rank computes a number of local DOFs. Then all
 * ranks want to know the number of global DOFs and the starting
 * displacement number of the DOF numbering in each rank.
 *
 * \param[in]  mpiComm       The MPI communicator.
 * \param[in]  nRankDofs     The number of local DOFs.
 * \param[out] rStartDofs    Displacement of the DOF numbering. On rank n
 *                           this is the sum of all local DOF numbers in
 *                           ranks 0 to n - 1.
 * \param[out] nOverallDofs  Global sum of nRankDofs. Is equal on all
 *                           ranks.
 */
inline void getDofNumbering(MPI::Intracomm& mpiComm,
                            int nRankDofs, int& rStartDofs, int& nOverallDofs)
{
    rStartDofs = 0;
    nOverallDofs = 0;

    // Inclusive prefix sum of the local DOF counts ...
    mpiComm.Scan(&nRankDofs, &rStartDofs, 1, MPI_INT, MPI_SUM);
    // ... turned into an exclusive one by removing this rank's contribution.
    rStartDofs -= nRankDofs;

    // Global number of DOFs, identical on all ranks.
    mpiComm.Allreduce(&nRankDofs, &nOverallDofs, 1, MPI_INT, MPI_SUM);
}
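/* Usage sketch (illustrative, not part of the original source): a toy driver
 * showing the exclusive-scan semantics of getDofNumbering(). With 4 ranks
 * each owning 10 local DOFs, rank 2 obtains rStartDofs == 20 and
 * nOverallDofs == 40. The function name and the constant below are
 * hypothetical; only getDofNumbering() itself comes from the code above.
 */
inline void exampleDofNumbering()
{
    int nRankDofs = 10;     // pretend every rank owns 10 local DOFs
    int rStartDofs = 0;
    int nOverallDofs = 0;
    getDofNumbering(MPI::COMM_WORLD, nRankDofs, rStartDofs, nOverallDofs);
    // The global indices owned by this rank are
    // [rStartDofs, rStartDofs + nRankDofs).
}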
void globalAdd(MPI::Intracomm& mpiComm, int& value)
{
    // MPI forbids aliasing the send and receive buffers of Allreduce,
    // hence the local copy.
    int valCopy = value;
    mpiComm.Allreduce(&valCopy, &value, 1, MPI_INT, MPI_SUM);
}
void globalAdd(MPI::Intracomm& mpiComm, double& value)
{
    double valCopy = value;
    mpiComm.Allreduce(&valCopy, &value, 1, MPI_DOUBLE, MPI_SUM);
}
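/* Usage sketch (illustrative): summing a locally computed residual over all
 * ranks with the overloads above. An in-place reduction via MPI::IN_PLACE
 * would avoid the copy made inside globalAdd(), but the explicit copy works
 * with any conforming MPI implementation. The function below is hypothetical.
 */
inline double exampleGlobalResidual(MPI::Intracomm& mpiComm, double localResidual)
{
    globalAdd(mpiComm, localResidual);  // localResidual now holds the global sum
    return localResidual;
}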
template <class IT, class NT>
FullyDistVec<IT,NT> FullyDistSpVec<IT,NT>::operator()(const FullyDistVec<IT,IT> & ri) const
{
    MPI::Intracomm World = commGrid->GetWorld();
    // FullyDistVec(shared_ptr<CommGrid> grid, IT globallen, NT initval, NT id);
    FullyDistVec<IT,NT> Indexed(ri.commGrid, ri.glen, zero, zero);
    int nprocs = World.Get_size();
    unordered_map<IT, IT> revr_map;     // inverted index that maps indices of *this to indices of output
    vector< vector<IT> > data_req(nprocs);
    IT locnnz = ri.LocArrSize();

    // ABAB: Input sanity check
    int local = 1;
    int whole = 1;
    for(IT i=0; i < locnnz; ++i)
    {
        if(ri.arr[i] >= glen || ri.arr[i] < 0)
            local = 0;
    }
    World.Allreduce(&local, &whole, 1, MPI::INT, MPI::BAND);
    if(whole == 0)
        throw outofrangeexception();

    for(IT i=0; i < locnnz; ++i)
    {
        IT locind;
        int owner = Owner(ri.arr[i], locind);   // numerical values in ri are 0-based
        data_req[owner].push_back(locind);
        revr_map.insert(typename unordered_map<IT, IT>::value_type(locind, i));
    }
    IT * sendbuf = new IT[locnnz];
    int * sendcnt = new int[nprocs];
    int * sdispls = new int[nprocs];
    for(int i=0; i<nprocs; ++i)
        sendcnt[i] = data_req[i].size();

    int * rdispls = new int[nprocs];
    int * recvcnt = new int[nprocs];
    World.Alltoall(sendcnt, 1, MPI::INT, recvcnt, 1, MPI::INT);   // share the request counts

    sdispls[0] = 0;
    rdispls[0] = 0;
    for(int i=0; i<nprocs-1; ++i)
    {
        sdispls[i+1] = sdispls[i] + sendcnt[i];
        rdispls[i+1] = rdispls[i] + recvcnt[i];
    }
    IT totrecv = accumulate(recvcnt, recvcnt+nprocs, 0);
    IT * recvbuf = new IT[totrecv];
    for(int i=0; i<nprocs; ++i)
    {
        copy(data_req[i].begin(), data_req[i].end(), sendbuf+sdispls[i]);
        vector<IT>().swap(data_req[i]);
    }
    World.Alltoallv(sendbuf, sendcnt, sdispls, MPIType<IT>(), recvbuf, recvcnt, rdispls, MPIType<IT>());   // request data

    // We will return the requested data;
    // our return can be at most as big as the request,
    // and smaller if we are missing some elements.
    IT * indsback = new IT[totrecv];
    NT * databack = new NT[totrecv];

    int * ddispls = new int[nprocs];
    copy(rdispls, rdispls+nprocs, ddispls);
    for(int i=0; i<nprocs; ++i)
    {
        // this is not the most efficient method because it scans ind vector nprocs = sqrt(p) times
        IT * it = set_intersection(recvbuf+rdispls[i], recvbuf+rdispls[i]+recvcnt[i],
                                   ind.begin(), ind.end(), indsback+rdispls[i]);
        recvcnt[i] = (it - (indsback+rdispls[i]));   // update with size of the intersection

        IT vi = 0;
        for(int j = rdispls[i]; j < rdispls[i] + recvcnt[i]; ++j)   // fetch the numerical values
        {
            // indsback is a subset of ind
            while(indsback[j] > ind[vi])
                ++vi;
            databack[j] = num[vi++];
        }
    }

    DeleteAll(recvbuf, ddispls);
    NT * databuf = new NT[ri.LocArrSize()];

    World.Alltoall(recvcnt, 1, MPI::INT, sendcnt, 1, MPI::INT);   // share the response counts, overriding request counts
    World.Alltoallv(indsback, recvcnt, rdispls, MPIType<IT>(), sendbuf, sendcnt, sdispls, MPIType<IT>());   // send indices
    World.Alltoallv(databack, recvcnt, rdispls, MPIType<NT>(), databuf, sendcnt, sdispls, MPIType<NT>());   // send data
    DeleteAll(rdispls, recvcnt, indsback, databack);

    // Now create the output from databuf (holds numerical values) and sendbuf (holds indices);
    // arr is already resized during its construction.
    for(int i=0; i<nprocs; ++i)
    {
        // data will come globally sorted from processors,
        // i.e. ind owned by proc_i is always smaller than ind owned by proc_j for j > i
        for(int j = sdispls[i]; j < sdispls[i]+sendcnt[i]; ++j)
        {
            typename unordered_map<IT,IT>::iterator it = revr_map.find(sendbuf[j]);
            Indexed.arr[it->second] = databuf[j];
        }
    }
    DeleteAll(sdispls, sendcnt, sendbuf, databuf);
    return Indexed;
}
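/* The operator above follows a common two-phase exchange pattern: (1) every
 * rank sends the indices it needs to their owners (Alltoall for the counts,
 * Alltoallv for the payload), then (2) the owners answer through further
 * Alltoallv calls in which the roles of the count/displacement arrays are
 * swapped. The sketch below isolates phase (1) for int indices; it is a
 * self-contained illustration with assumed names, not part of FullyDistSpVec,
 * and it relies on the same conventions as the code above (MPI C++ bindings,
 * an in-scope `using namespace std`). The caller owns the returned buffer.
 */
inline int * exchangeRequests(MPI::Intracomm & comm,
                              const vector< vector<int> > & requests, // requests[p]: indices wanted from rank p
                              int * recvcnt,    // out, length nprocs: per-rank counts received
                              int * rdispls)    // out, length nprocs: offset of each rank's block
{
    int nprocs = comm.Get_size();
    int * sendcnt = new int[nprocs];
    int * sdispls = new int[nprocs];
    for(int i=0; i<nprocs; ++i)
        sendcnt[i] = (int) requests[i].size();

    // Exchange the counts first, so every rank can size its receive buffer.
    comm.Alltoall(sendcnt, 1, MPI::INT, recvcnt, 1, MPI::INT);

    sdispls[0] = 0;
    rdispls[0] = 0;
    for(int i=0; i<nprocs-1; ++i)
    {
        sdispls[i+1] = sdispls[i] + sendcnt[i];
        rdispls[i+1] = rdispls[i] + recvcnt[i];
    }
    int totsend = sdispls[nprocs-1] + sendcnt[nprocs-1];
    int totrecv = rdispls[nprocs-1] + recvcnt[nprocs-1];

    int * sendbuf = new int[totsend];
    for(int i=0; i<nprocs; ++i)
        copy(requests[i].begin(), requests[i].end(), sendbuf + sdispls[i]);

    int * recvbuf = new int[totrecv];   // the indices other ranks request from us
    comm.Alltoallv(sendbuf, sendcnt, sdispls, MPI::INT,
                   recvbuf, recvcnt, rdispls, MPI::INT);
    delete [] sendbuf;
    delete [] sendcnt;
    delete [] sdispls;
    return recvbuf;
}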