template <class T> inline
void MPIContainerComm<T>::allToAll(const Array<T>& outgoing,
  Array<Array<T> >& incoming,
  const MPIComm& comm)
{
  int numProcs = comm.getNProc();

  // catch degenerate case
  if (numProcs==1)
  {
    incoming.resize(1);
    incoming[0] = outgoing;
    return;
  }

  T* sendBuf = new T[numProcs * outgoing.length()];
  TEUCHOS_TEST_FOR_EXCEPTION(sendBuf==0, std::runtime_error,
    "Comm::allToAll failed to allocate sendBuf");

  T* recvBuf = new T[numProcs * outgoing.length()];
  TEUCHOS_TEST_FOR_EXCEPTION(recvBuf==0, std::runtime_error,
    "Comm::allToAll failed to allocate recvBuf");

  /* every processor receives a copy of the same outgoing array, so
   * replicate it numProcs times in the send buffer */
  int i;
  for (i=0; i<numProcs; i++)
  {
    for (int j=0; j<outgoing.length(); j++)
    {
      sendBuf[i*outgoing.length() + j] = outgoing[j];
    }
  }

  comm.allToAll(sendBuf, outgoing.length(), MPITraits<T>::type(),
    recvBuf, outgoing.length(), MPITraits<T>::type());

  /* unpack the received data, one block per sender */
  incoming.resize(numProcs);
  for (i=0; i<numProcs; i++)
  {
    incoming[i].resize(outgoing.length());
    for (int j=0; j<outgoing.length(); j++)
    {
      incoming[i][j] = recvBuf[i*outgoing.length() + j];
    }
  }

  delete [] sendBuf;
  delete [] recvBuf;
}
int main(int argc, char** argv)
{
  try
  {
    /* Initialization */
    Sundance::init(&argc, &argv);

    /* ---- BEGIN CODE BODY --- */

    /* The main simulation code goes here. In this example, all we do
     * is print some information about the processor ranks. */

    MPIComm comm = MPIComm::world();

    /* Print a header from the root processor only. Although this executes on
     * all processors, anything written to the output stream Out::root()
     * is ignored on all non-root processors (rank != 0).
     * After writing, synchronize to keep this message from getting jumbled
     * together with the subsequent messages. */
    Out::root() << "Example: getting started" << endl;
    comm.synchronize();

    /* Every processor now speaks up and identifies itself */
    int myRank = comm.getRank();
    int nProc = comm.getNProc();

    Out::os() << "Processor " << myRank
              << " of " << nProc << " checking in" << endl;

    /* ---- END CODE BODY --- */

    /* Test success or failure. Most examples you'll see will do this
     * as part of the Trilinos regression testing system.
     * If you write a simulation code that won't become part of Trilinos,
     * you can often bypass this step.
     *
     * Here the test is a trivial one: every processor's rank must be
     * smaller than the total number of processors. If this fails,
     * your MPI installation is probably broken! */
    Sundance::passFailTest(myRank < nProc);
  }
  catch(std::exception& e) /* exception handling */
  {
    cerr << "exception!" << endl;
    Sundance::handleException(e);
  }

  /* Finalization */
  Sundance::finalize();
  return Sundance::testStatus();
}
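A hedged sketch of what this example might print follows. The program name and the four-processor run are assumptions, the ordering of the per-processor lines after the synchronized header is not deterministic, and Sundance's pass/fail summary (whatever passFailTest emits on your installation) would follow.

// Possible output from a four-processor run (e.g. "mpirun -np 4 ./GettingStarted",
// program name assumed); line order after the header may vary:
//
//   Example: getting started
//   Processor 0 of 4 checking in
//   Processor 2 of 4 checking in
//   Processor 1 of 4 checking in
//   Processor 3 of 4 checking in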
template <class T> inline
void MPIContainerComm<T>::bcast(Array<T>& x, int src, const MPIComm& comm)
{
  /* first broadcast the length so receivers can size their storage */
  int len = x.length();
  MPIContainerComm<int>::bcast(len, src, comm);

  if (comm.getRank() != src)
  {
    x.resize(len);
  }
  if (len==0) return;

  /* then broadcast the contents */
  comm.bcast((void*) &(x[0]), (int) len, MPITraits<T>::type(), src);
}
inline void MPIContainerComm<std::string>::allGather(const std::string& outgoing,
  Array<std::string>& incoming,
  const MPIComm& comm)
{
  int nProc = comm.getNProc();
  int sendCount = outgoing.length();

  incoming.resize(nProc);

  int* recvCounts = new int[nProc];
  int* recvDisplacements = new int[nProc];

  /* share lengths with all procs */
  comm.allGather((void*) &sendCount, 1, MPIDataType::intType(),
    (void*) recvCounts, 1, MPIDataType::intType());

  int recvSize = 0;
  recvDisplacements[0] = 0;
  for (int i=0; i<nProc; i++)
  {
    recvSize += recvCounts[i];
    if (i < nProc-1)
    {
      recvDisplacements[i+1] = recvDisplacements[i] + recvCounts[i];
    }
  }

  char* recvBuf = new char[recvSize];

  comm.allGatherv((void*) outgoing.c_str(), sendCount, MPIDataType::charType(),
    recvBuf, recvCounts, recvDisplacements, MPIDataType::charType());

  for (int j=0; j<nProc; j++)
  {
    char* start = recvBuf + recvDisplacements[j];
    char* tmp = new char[recvCounts[j]+1];
    std::memcpy(tmp, start, recvCounts[j]);
    tmp[recvCounts[j]] = '\0';
    incoming[j] = std::string(tmp);
    delete [] tmp;
  }

  delete [] recvCounts;
  delete [] recvDisplacements;
  delete [] recvBuf;
}
void reportFailure(const MPIComm& comm)
{
  int myBad = 1;
  int anyBad = 0;
  comm.allReduce((void*) &myBad, (void*) &anyBad, 1, MPIComm::INT,
    MPIComm::SUM);
}
template <class T> inline
void MPIContainerComm<T>::accumulate(const T& localValue, Array<T>& sums,
  T& total, const MPIComm& comm)
{
  Array<T> contributions;
  allGather(localValue, contributions, comm);
  sums.resize(comm.getNProc());
  sums[0] = 0;
  total = contributions[0];

  /* sums[p] holds the exclusive prefix sum of the contributions from
   * processors 0..p-1, and total accumulates the grand sum */
  for (int i=0; i<comm.getNProc()-1; i++)
  {
    total += contributions[i+1];
    sums[i+1] = sums[i] + contributions[i];
  }
}
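A small worked example may pin down the semantics of accumulate(): it behaves like an exclusive prefix sum plus a grand total. This is a minimal sketch, assuming MPI has already been initialized (e.g. via Sundance::init as in the main() above); the header name, the function showAccumulate, and the contributed values are illustrative assumptions, not part of the library.

/* namespace qualifications and using-declarations are assumed to match
 * the surrounding library */
#include <iostream>
#include "Teuchos_MPIContainerComm.hpp"   // assumed header name

void showAccumulate(const MPIComm& comm)
{
  /* illustrative local contribution: rank r contributes r+1 */
  int localCount = comm.getRank() + 1;

  Array<int> offsets;
  int total = 0;

  /* exclusive prefix sum plus grand total over all processors */
  MPIContainerComm<int>::accumulate(localCount, offsets, total, comm);

  /* on 3 processors the contributions are 1, 2, 3, so every rank sees
   * offsets = {0, 1, 3} and total = 6 */
  std::cout << "rank " << comm.getRank()
            << ": offset=" << offsets[comm.getRank()]
            << ", total=" << total << std::endl;
}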
template <class T> inline
void MPIContainerComm<T>::allGather(const T& outgoing, Array<T>& incoming,
  const MPIComm& comm)
{
  int nProc = comm.getNProc();
  incoming.resize(nProc);
  if (nProc==1)
  {
    incoming[0] = outgoing;
  }
  else
  {
    comm.allGather((void*) &outgoing, 1, MPITraits<T>::type(),
      (void*) &(incoming[0]), 1, MPITraits<T>::type());
  }
}
bool checkForFailures(const MPIComm& comm)
{
  int myBad = 0;
  int anyBad = 0;
  comm.allReduce((void*) &myBad, (void*) &anyBad, 1, MPIComm::INT,
    MPIComm::SUM);
  return anyBad > 0;
}
inline void MPIContainerComm<std::string>::bcast(std::string& x, int src,
  const MPIComm& comm)
{
  /* broadcast the length first so receivers can size their storage */
  int len = x.length();
  MPIContainerComm<int>::bcast(len, src, comm);

  x.resize(len);
  if (len==0) return;

  /* then broadcast the characters */
  comm.bcast((void*) &(x[0]), len, MPITraits<char>::type(), src);
}
template <class T> inline
void MPIContainerComm<T>::bcast(Array<Array<T> >& x, int src,
  const MPIComm& comm)
{
  Array<T> bigArray;
  Array<int> offsets;

  /* on the source processor, pack the array-of-arrays into a single
   * contiguous array plus an offset table */
  if (src==comm.getRank())
  {
    getBigArray(x, bigArray, offsets);
  }

  /* broadcast the packed data and the offsets */
  bcast(bigArray, src, comm);
  MPIContainerComm<int>::bcast(offsets, src, comm);

  /* on the receiving processors, unpack into the array-of-arrays */
  if (src != comm.getRank())
  {
    getSmallArrays(bigArray, offsets, x);
  }
}
void ErrorPolling::reportFailure(const MPIComm& comm)
{
  if (isActive())
  {
    int myBad = 1;
    int anyBad = 0;
    comm.allReduce((void*) &myBad, (void*) &anyBad, 1,
      MPIDataType::intType(), MPIOp::sumOp());
  }
}
void ErrorPolling::reportFailure(const MPIComm& comm)
{
  if (isActive())
  {
    int myBad = 1;
    int anyBad = 0;
    comm.allReduce((void*) &myBad, (void*) &anyBad, 1, MPIComm::INT,
      MPIComm::SUM);
  }
}
MPIComm::MPIComm(const MPIComm& parent, const MPIGroup& group)
  :
#ifdef HAVE_MPI
  comm_(MPI_COMM_WORLD),
#endif
  nProc_(0), myRank_(0)
{
#ifdef HAVE_MPI
  if (group.getNProc()==0)
  {
    myRank_ = -1;
    nProc_ = 0;
  }
  else if (parent.containsMe())
  {
    MPI_Comm parentComm = parent.comm_;
    MPI_Group newGroup = group.group_;

    errCheck(MPI_Comm_create(parentComm, newGroup, &comm_),
      "Comm_create");

    if (group.containsProc(parent.getRank()))
    {
      errCheck(MPI_Comm_rank(comm_, &myRank_), "Comm_rank");
      errCheck(MPI_Comm_size(comm_, &nProc_), "Comm_size");
    }
    else
    {
      myRank_ = -1;
      nProc_ = -1;
      return;
    }
  }
  else
  {
    myRank_ = -1;
    nProc_ = -1;
  }
#endif
}
void PerformanceMonitorUtils::synchNames(const MPIComm& comm,
  const Array<std::string>& localNames,
  Array<std::string>& allNames)
{
  if (comm.getNProc() > 1)
  {
    /* gather names of counters from all processors */
    int root = 0;
    std::set<std::string> nameSet;
    Array<Array<std::string> > namesForAllProcs;
    MPIContainerComm<std::string>::gatherv(localNames, namesForAllProcs,
      root, comm);

    /* on the root processor, compile the set union of all names */
    if (comm.getRank()==root)
    {
      for (Array<Array<std::string> >::size_type p=0;
           p<namesForAllProcs.size(); p++)
      {
        for (Array<std::string>::size_type i=0;
             i<namesForAllProcs[p].size(); i++)
        {
          nameSet.insert(namesForAllProcs[p][i]);
        }
      }
    }

    /* convert the set to an array so we can send it out by MPI */
    allNames.resize(0);
    for (std::set<std::string>::const_iterator i=nameSet.begin();
         i!=nameSet.end(); i++)
    {
      allNames.append(*i);
    }

    /* broadcast the union of all names to all processors */
    MPIContainerComm<std::string>::bcast(allNames, root, comm);
  }
  else
  {
    allNames = localNames;
  }
}
inline void MPIContainerComm<std::string>::bcast(Array<std::string>& x,
  int src, const MPIComm& comm)
{
  /* begin by packing all the data into a big char array. This will
   * take a little time, but will be cheaper than multiple MPI calls */
  Array<char> bigArray;
  Array<int> offsets;
  if (comm.getRank()==src)
  {
    getBigArray(x, bigArray, offsets);
  }

  /* now broadcast the big array and the offsets */
  MPIContainerComm<char>::bcast(bigArray, src, comm);
  MPIContainerComm<int>::bcast(offsets, src, comm);

  /* finally, reassemble the array of strings */
  if (comm.getRank() != src)
  {
    getStrings(bigArray, offsets, x);
  }
}
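Since the pack/broadcast/unpack dance above is hidden behind a single call, using it takes one line per processor. A minimal usage sketch, assuming MPI is already initialized; the header name, the function broadcastFileList, and the file names are assumptions made for illustration.

#include <string>
#include "Teuchos_MPIContainerComm.hpp"   // assumed header name

void broadcastFileList(const MPIComm& comm)
{
  Array<std::string> files;

  /* only the root processor knows the list initially; the file names
   * are made up for illustration */
  if (comm.getRank()==0)
  {
    files.append("mesh.exo");
    files.append("params.xml");
  }

  /* after the broadcast every processor holds the same two strings */
  MPIContainerComm<std::string>::bcast(files, 0, comm);
}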
void PerformanceMonitorUtils::reduce(const MPIComm& comm,
  const EMetricReduction& reductionType,
  const Array<double>& localVals,
  Array<double>& reducedVals)
{
  /* if we're asking for local values, do nothing but copy the local array
   * to the reduced array. */
  if (comm.getNProc()==1 || reductionType==ELocal)
  {
    reducedVals = localVals;
    return;
  }

  /* If we're to this point we must do a reduction */
  reducedVals.resize(localVals.size());

  int op = MPIComm::SUM;
  if (reductionType==EMax) op = MPIComm::MAX;
  if (reductionType==EMin) op = MPIComm::MIN;

  int sendCount = localVals.size();
  if (sendCount==0) return;

  double* sendBuf = const_cast<double*>(&localVals[0]);
  double* recvBuf = const_cast<double*>(&reducedVals[0]);

  comm.allReduce((void*) sendBuf, (void*) recvBuf, sendCount,
    MPIComm::DOUBLE, op);

  /* an average is computed as a sum followed by division by the
   * number of processors */
  if (reductionType==EAvg)
  {
    for (Array<double>::size_type i=0; i<reducedVals.size(); i++)
    {
      reducedVals[i] /= ((double) comm.getNProc());
    }
  }
}
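As a usage illustration, the reduction above can turn a per-processor timing into maximum and average values across the machine. This is a minimal sketch, assuming the EMax/EAvg enumerators shown above; the header name, the function reportAssemblyTime, and the timing argument are assumptions made for illustration.

#include <iostream>
#include "Teuchos_PerformanceMonitorUtils.hpp"   // assumed header name

void reportAssemblyTime(const MPIComm& comm, double myAssemblyTime)
{
  /* myAssemblyTime is a stand-in for a locally measured timing */
  Array<double> local(1);
  local[0] = myAssemblyTime;

  Array<double> maxVals;
  Array<double> avgVals;

  /* slowest processor and mean over all processors */
  PerformanceMonitorUtils::reduce(comm, EMax, local, maxVals);
  PerformanceMonitorUtils::reduce(comm, EAvg, local, avgVals);

  if (comm.getRank()==0)
  {
    std::cout << "assembly time: max=" << maxVals[0]
              << ", avg=" << avgVals[0] << std::endl;
  }
}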
bool ErrorPolling::pollForFailures(const MPIComm& comm)
{
  /* bypass if inactive, reporting no failure */
  if (!isActive()) return false;

  int myBad = 0;
  int anyBad = 0;
  try
  {
    comm.allReduce((void*) &myBad, (void*) &anyBad, 1, MPIComm::INT,
      MPIComm::SUM);
  }
  catch(const std::exception&)
  {
    /* if the reduction itself fails, treat that as a failure */
    return true;
  }
  return anyBad > 0;
}
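The point of the report/poll pair is that every processor makes the same number of allReduce calls even when only one of them hits an error, so the collective calls stay matched across the communicator. A minimal sketch of that pattern, assuming the ErrorPolling interface shown above; doLocalWork and guardedStep are hypothetical stand-ins introduced only for illustration.

#include <exception>
#include "Teuchos_ErrorPolling.hpp"   // assumed header name

/* hypothetical local computation, declared only for illustration */
void doLocalWork();

void guardedStep(const MPIComm& comm)
{
  try
  {
    doLocalWork();
  }
  catch (const std::exception&)
  {
    /* the failing processor announces the failure (contributing 1 to the
     * reduction) and lets the exception propagate */
    ErrorPolling::reportFailure(comm);
    throw;
  }

  /* healthy processors make the matching allReduce call (contributing 0)
   * and learn whether anyone reported a failure */
  if (ErrorPolling::pollForFailures(comm))
  {
    /* bail out of this step consistently on all processors */
  }
}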
template <class Scalar>
VectorSpace<Scalar> buildPartitionedSpace(
  int nTotalDofs,
  int lowestLocalDof,
  int nLocalDofs,
  const Array<int>& isBCIndex,
  const VectorType<Scalar>& internalType,
  const VectorType<Scalar>& bcType,
  const MPIComm& comm
  )
{
  /* count the locally owned boundary-condition (BC) DOFs */
  int nBCDofs = 0;
  for (int i=0; i<nLocalDofs; i++)
  {
    if (isBCIndex[i]) nBCDofs++;
  }

  /* sum number of BC Dofs over all processors */
  int nTotalBCDofs = nBCDofs;
  comm.allReduce(&nBCDofs, &nTotalBCDofs, 1, MPIComm::INT, MPIComm::SUM);
  int nTotalInteriorDofs = nTotalDofs - nTotalBCDofs;

  /* split the locally owned DOFs into interior and BC index lists */
  Array<int> interiorDofs(nLocalDofs - nBCDofs);
  Array<int> bcDofs(nBCDofs);
  int iBC = 0;
  int iIn = 0;
  for (int i=0; i<nLocalDofs; i++)
  {
    if (isBCIndex[i]) bcDofs[iBC++] = lowestLocalDof+i;
    else interiorDofs[iIn++] = lowestLocalDof+i;
  }

  VectorSpace<Scalar> bcSpace = bcType.createSpace(nTotalBCDofs, nBCDofs,
    &(bcDofs[0]), comm);
  VectorSpace<Scalar> interiorSpace = internalType.createSpace(
    nTotalInteriorDofs, nLocalDofs-nBCDofs, &(interiorDofs[0]), comm);

  return productSpace<Scalar>(interiorSpace, bcSpace);
}
RCP<const VectorSpaceBase<double> >
EpetraVectorType::createSpace(int /*dimension*/,
  int nLocal,
  const int* localIndices,
  const MPIComm& comm) const
{
#ifdef HAVE_MPI
  Epetra_MpiComm epComm(comm.getComm());
#else
  Epetra_SerialComm epComm;
#endif

  TEUCHOS_TEST_FOR_EXCEPTION(nLocal < 0, std::runtime_error,
    "negative vector size n=" << nLocal);

  /* build an Epetra map from the locally owned global indices; the -1 tells
   * Epetra to compute the global dimension by summing nLocal over processors */
  RCP<Epetra_Map> map = rcp(new Epetra_Map(-1, nLocal,
    (int*) localIndices, 0, epComm));

  return rcp(new EpetraVectorSpace(map));
}
template <class T> inline
void MPIContainerComm<T>::allToAll(const Array<Array<T> >& outgoing,
  Array<Array<T> >& incoming,
  const MPIComm& comm)
{
  int numProcs = comm.getNProc();

  // catch degenerate case
  if (numProcs==1)
  {
    incoming = outgoing;
    return;
  }

  int* sendMesgLength = new int[numProcs];
  TEUCHOS_TEST_FOR_EXCEPTION(sendMesgLength==0, std::runtime_error,
    "failed to allocate sendMesgLength");
  int* recvMesgLength = new int[numProcs];
  TEUCHOS_TEST_FOR_EXCEPTION(recvMesgLength==0, std::runtime_error,
    "failed to allocate recvMesgLength");

  int p = 0;
  for (p=0; p<numProcs; p++)
  {
    sendMesgLength[p] = outgoing[p].length();
  }

  /* first exchange message lengths so each processor knows how much
   * data to expect from every other processor */
  comm.allToAll(sendMesgLength, 1, MPIDataType::intType(),
    recvMesgLength, 1, MPIDataType::intType());

  int totalSendLength = 0;
  int totalRecvLength = 0;
  for (p=0; p<numProcs; p++)
  {
    totalSendLength += sendMesgLength[p];
    totalRecvLength += recvMesgLength[p];
  }

  T* sendBuf = new T[totalSendLength];
  TEUCHOS_TEST_FOR_EXCEPTION(sendBuf==0, std::runtime_error,
    "failed to allocate sendBuf");
  T* recvBuf = new T[totalRecvLength];
  TEUCHOS_TEST_FOR_EXCEPTION(recvBuf==0, std::runtime_error,
    "failed to allocate recvBuf");

  int* sendDisp = new int[numProcs];
  TEUCHOS_TEST_FOR_EXCEPTION(sendDisp==0, std::runtime_error,
    "failed to allocate sendDisp");
  int* recvDisp = new int[numProcs];
  TEUCHOS_TEST_FOR_EXCEPTION(recvDisp==0, std::runtime_error,
    "failed to allocate recvDisp");

  /* pack the outgoing data contiguously and compute the displacements */
  int count = 0;
  sendDisp[0] = 0;
  recvDisp[0] = 0;

  for (p=0; p<numProcs; p++)
  {
    for (int i=0; i<outgoing[p].length(); i++)
    {
      sendBuf[count] = outgoing[p][i];
      count++;
    }
    if (p>0)
    {
      sendDisp[p] = sendDisp[p-1] + sendMesgLength[p-1];
      recvDisp[p] = recvDisp[p-1] + recvMesgLength[p-1];
    }
  }

  /* then do the variable-length data exchange */
  comm.allToAllv(sendBuf, sendMesgLength, sendDisp, MPITraits<T>::type(),
    recvBuf, recvMesgLength, recvDisp, MPITraits<T>::type());

  /* unpack the received data, one block per sender */
  incoming.resize(numProcs);
  for (p=0; p<numProcs; p++)
  {
    incoming[p].resize(recvMesgLength[p]);
    for (int i=0; i<recvMesgLength[p]; i++)
    {
      incoming[p][i] = recvBuf[recvDisp[p] + i];
    }
  }

  delete [] sendBuf;
  delete [] sendMesgLength;
  delete [] sendDisp;
  delete [] recvBuf;
  delete [] recvMesgLength;
  delete [] recvDisp;
}
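A small worked example may make the ragged all-to-all concrete. This is a minimal sketch, assuming the static MPIContainerComm interface shown above; the header name, the function demoRaggedAllToAll, and the payload (each processor sends its own rank repeated p+1 times to processor p) are assumptions invented purely for illustration.

#include "Teuchos_MPIContainerComm.hpp"   // assumed header name

void demoRaggedAllToAll(const MPIComm& comm)
{
  int nProc = comm.getNProc();
  int myRank = comm.getRank();

  /* outgoing[p] is the message destined for processor p; the messages
   * deliberately have different lengths to exercise the allToAllv path */
  Array<Array<int> > outgoing(nProc);
  for (int p=0; p<nProc; p++)
  {
    outgoing[p].resize(p+1);
    for (int i=0; i<=p; i++) outgoing[p][i] = myRank;
  }

  /* after the call, incoming[q] is the message that processor q sent to
   * this processor: the value q repeated (myRank+1) times */
  Array<Array<int> > incoming;
  MPIContainerComm<int>::allToAll(outgoing, incoming, comm);
}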
template <class T> inline
void MPIContainerComm<T>::bcast(T& x, int src, const MPIComm& comm)
{
  comm.bcast((void*) &x, 1, MPITraits<T>::type(), src);
}
inline void MPIContainerComm<std::string>::gatherv(const Array<std::string>& outgoing,
  Array<Array<std::string> >& incoming,
  int root,
  const MPIComm& comm)
{
  int nProc = comm.getNProc();

  Array<char> packedLocalArray;
  pack(outgoing, packedLocalArray);

  int sendCount = packedLocalArray.size();

  /* gather the message sizes from all procs */
  Array<int> recvCounts(nProc);
  Array<int> recvDisplacements(nProc);

  comm.gather((void*) &sendCount, 1, MPIDataType::intType(),
    (void*) &(recvCounts[0]), 1, MPIDataType::intType(), root);

  /* compute the displacements */
  int recvSize = 0;
  if (root == comm.getRank())
  {
    recvDisplacements[0] = 0;
    for (int i=0; i<nProc; i++)
    {
      recvSize += recvCounts[i];
      if (i < nProc-1)
      {
        recvDisplacements[i+1] = recvDisplacements[i]+recvCounts[i];
      }
    }
  }

  /* set the size to 1 on non-root procs */
  Array<char> recvBuf(std::max(1, recvSize));

  void* sendBuf = (void*) &(packedLocalArray[0]);
  void* inBuf = (void*) &(recvBuf[0]);
  int* inCounts = &(recvCounts[0]);
  int* inDisps = &(recvDisplacements[0]);

  /* gather the packed data */
  comm.gatherv(sendBuf, sendCount, MPIDataType::charType(),
    inBuf, inCounts, inDisps, MPIDataType::charType(), root);

  /* on the root, unpack the data */
  if (comm.getRank()==root)
  {
    incoming.resize(nProc);
    for (int j=0; j<nProc; j++)
    {
      char* start = &(recvBuf[0]) + recvDisplacements[j];
      Array<char> tmp(recvCounts[j]+1);
      std::memcpy(&(tmp[0]), start, recvCounts[j]);
      tmp[recvCounts[j]] = '\0';
      unpack(tmp, incoming[j]);
    }
  }
}