/*
 * intra_Reduce - binomial-tree reduction within an intra-communicator.
 *
 * Relative to the root, look at the bit pattern in my rank.  Starting from
 * the right (lsb), if the bit is 1, send to the node with that bit zero and
 * exit; if the bit is 0, receive from the node with that bit set and combine
 * (as long as that node is within the group).  Receiving with explicit
 * source selection guarantees the same combining order on every run, so
 * results are reproducible (roundoff, over/underflow).  Because of the way
 * these are ordered, a root of 0 is correct for both commutative and
 * non-commutative operations.  For a non-root non-commutative reduction we
 * reduce to rank 0 and then forward the result to the real root:
 *     mask = 1: (ab)(cd)(ef)(gh)            (odds send to evens)
 *     mask = 2: ((ab)(cd))((ef)(gh))        (3,6 send to 0,4)
 *     mask = 4: (((ab)(cd))((ef)(gh)))      (4 sends to 0)
 *
 * Buffering: if the datatype is not contiguous we still need to pass
 * contiguous data to the user routine.  We cannot use MPI_PACK (alignment is
 * vague and the packed data may not be reusable); what is really needed is a
 * "squeeze" operation that removes the skips.  For now a temporary buffer of
 * m_extent*count bytes is used, shifted by -lb so datatypes with a nonzero
 * lower bound land correctly.
 *
 * Fix (review): the MPI_Recv/MPI_Send failure paths inside the locked
 * region previously returned immediately, leaking the temporary buffer,
 * leaking the non-root temporary recvbuf, and leaving the collective lock
 * held.  They now route through a cleanup path that releases all three.
 *
 * Returns MPI_SUCCESS or an MPI error code; only the root is guaranteed to
 * see an error detected by a predefined operation (MPIR_Op_errno).
 */
static int intra_Reduce(void *sendbuf, void *recvbuf, int count,
                        struct MPIR_DATATYPE *datatype, MPI_Op op, int root,
                        struct MPIR_COMMUNICATOR *comm)
{
    MPI_Status status;
    int size, rank;
    int mask, relrank, source, lroot;
    int mpi_errno = MPI_SUCCESS;
    MPI_User_function *uop;
    MPI_Aint lb, ub, m_extent;          /* Extent in memory */
    void *buffer;
    struct MPIR_OP *op_ptr;
    static char myname[] = "MPI_REDUCE";
    MPIR_ERROR_DECL;

    mpi_comm_err_ret = 0;

    /* Is root within the communicator? */
    MPIR_Comm_size(comm, &size);
#ifndef MPIR_NO_ERROR_CHECKING
    if (root >= size)
        mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_ROOT_TOOBIG, myname,
                                    (char *) 0, (char *) 0, root, size);
    if (root < 0)
        mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_DEFAULT, myname,
                                    (char *) 0, (char *) 0, root);
    if (mpi_errno)
        return MPIR_ERROR(comm, mpi_errno, myname);
#endif

    /* See the overview in Collection Operations for why this is ok */
    if (count == 0)
        return MPI_SUCCESS;

    /* If the operation is predefined, we could check that the datatype's
       type signature is compatible with the operation. */
#ifdef MPID_Reduce
    /* Eventually, this could apply the MPID_Reduce routine in a loop for
       counts > 1 */
    if (comm->ADIReduce && count == 1) {
        /* Call a routine to sort through the datatypes and operations...
           This allows us to provide partial support (e.g., only SUM_DOUBLE) */
        if (MPIR_ADIReduce(comm->ADIctx, comm, sendbuf, recvbuf, count,
                           datatype->self, op, root) == MPI_SUCCESS)
            return MPI_SUCCESS;
    }
#endif

    /* Get my rank and switch communicators to the hidden collective */
    MPIR_Comm_rank(comm, &rank);
    comm = comm->comm_coll;
    op_ptr = MPIR_GET_OP_PTR(op);
    MPIR_TEST_MPI_OP(op, op_ptr, comm, myname);
    uop = op_ptr->op;

    /* Make a temporary buffer for incoming partial results */
    MPIR_Type_get_limits(datatype, &lb, &ub);
    m_extent = ub - lb;
    MPIR_ALLOC(buffer, (void *) MALLOC(m_extent * count), comm,
               MPI_ERR_EXHAUSTED, "MPI_REDUCE");
    buffer = (void *) ((char *) buffer - lb);

    /* If I'm not the root, then my recvbuf may not be valid, therefore
       I have to allocate a temporary one.
       NOTE(review): if this MPIR_ALLOC fails it presumably returns from
       inside the macro, leaking 'buffer' — cannot be fixed here without
       changing the macro; confirm against the MPIR_ALLOC definition. */
    if (rank != root) {
        MPIR_ALLOC(recvbuf, (void *) MALLOC(m_extent * count), comm,
                   MPI_ERR_EXHAUSTED, "MPI_REDUCE");
        recvbuf = (void *) ((char *) recvbuf - lb);
    }

    /* This code isn't correct if the source is a more complex datatype */
    memcpy(recvbuf, sendbuf, m_extent * count);

    mask = 0x1;
    if (op_ptr->commute)
        lroot = root;
    else
        lroot = 0;              /* non-commutative: reduce to 0, forward later */
    relrank = (rank - lroot + size) % size;

    /* Lock for collective operation */
    MPID_THREAD_LOCK(comm->ADIctx, comm);

    MPIR_Op_errno = MPI_SUCCESS;

    while ( /*(mask & relrank) == 0 && */ mask < size) {
        /* Receive */
        if ((mask & relrank) == 0) {
            source = (relrank | mask);
            if (source < size) {
                source = (source + lroot) % size;
                mpi_errno = MPI_Recv(buffer, count, datatype->self, source,
                                     MPIR_REDUCE_TAG, comm->self, &status);
                if (mpi_errno)
                    goto fn_fail;
                /* The sender is above us, so the received buffer must be
                   the second argument (in the non-commutative case). */
                /* error pop/push would allow errors found by predefined
                   routines to be visible; we need a better way to do this:
                   MPIR_ERROR_POP(comm); ... MPIR_ERROR_PUSH(comm); */
                if (op_ptr->commute)
                    (*uop) (buffer, recvbuf, &count, &datatype->self);
                else {
                    (*uop) (recvbuf, buffer, &count, &datatype->self);
                    /* short term hack to keep recvbuf up-to-date */
                    memcpy(recvbuf, buffer, m_extent * count);
                }
            }
        }
        else {
            /* I've received all that I'm going to.  Send my result to
               my parent. */
            source = ((relrank & (~mask)) + lroot) % size;
            mpi_errno = MPI_Send(recvbuf, count, datatype->self, source,
                                 MPIR_REDUCE_TAG, comm->self);
            if (mpi_errno)
                goto fn_fail;
            break;
        }
        mask <<= 1;
    }
    FREE((char *) buffer + lb);

    /* Non-commutative reduction went to rank 0; forward it to the root */
    if (!op_ptr->commute && root != 0) {
        if (rank == 0) {
            mpi_errno = MPI_Send(recvbuf, count, datatype->self, root,
                                 MPIR_REDUCE_TAG, comm->self);
        }
        else if (rank == root) {
            mpi_errno = MPI_Recv(recvbuf, count, datatype->self, 0, /*size-1, */
                                 MPIR_REDUCE_TAG, comm->self, &status);
        }
    }

    /* Free the temporarily allocated recvbuf */
    if (rank != root)
        FREE((char *) recvbuf + lb);

    /* If the predefined operation detected an error, report it here.
       Note that only the root gets this result, so this can cause programs
       to hang, particularly if this is used to implement MPI_Allreduce.
       Use care with this. */
    if (mpi_errno == MPI_SUCCESS && MPIR_Op_errno) {
        mpi_errno = MPIR_Op_errno;
    }

    /* Unlock for collective operation */
    MPID_THREAD_UNLOCK(comm->ADIctx, comm);
    return (mpi_errno);

  fn_fail:
    /* Communication failed inside the locked region: release the
       temporaries and the collective lock before reporting, so a failed
       reduction neither leaks memory nor deadlocks later collectives. */
    FREE((char *) buffer + lb);
    if (rank != root)
        FREE((char *) recvbuf + lb);
    MPID_THREAD_UNLOCK(comm->ADIctx, comm);
    return MPIR_ERROR(comm, mpi_errno, myname);
}
/*
 * intra_Reduce_scatter - reduce-scatter implemented as a full reduction to
 * rank 0 followed by a scatterv of the per-rank pieces.
 *
 * recvcnts[i] is the number of elements rank i receives; the scatter
 * displacements are the running prefix sums of recvcnts.  A negative count
 * is rejected with MPI_ERR_COUNT; a total count of zero is a no-op.
 */
static int intra_Reduce_scatter(void *sendbuf, void *recvbuf, int *recvcnts,
                                struct MPIR_DATATYPE *datatype, MPI_Op op,
                                struct MPIR_COMMUNICATOR *comm)
{
    static char myname[] = "MPI_REDUCE_SCATTER";
    MPI_Aint lb, ub, m_extent;  /* extent of one element in memory */
    int np, myrank, idx;
    int total = 0;              /* sum of all receive counts */
    int *offsets;               /* scatter displacement for each rank */
    void *tmpbuf;               /* holds the full reduction result */
    int mpi_errno = MPI_SUCCESS, rc;

    MPIR_Type_get_limits(datatype, &lb, &ub);
    m_extent = ub - lb;         /* MPI_Type_extent (datatype, &extent); */
    MPIR_Comm_size(comm, &np);
    MPIR_Comm_rank(comm, &myrank);

    /* Build the displacement table while accumulating the total count;
       validate each count as we go. */
    MPIR_ALLOC(offsets, (int *) MALLOC(np * sizeof(int)), comm,
               MPI_ERR_EXHAUSTED, myname);
    for (idx = 0; idx < np; idx++) {
        if (recvcnts[idx] < 0) {
            FREE(offsets);
            mpi_errno = MPIR_Err_setmsg(MPI_ERR_COUNT,
                                        MPIR_ERR_COUNT_ARRAY_NEG, myname,
                                        (char *) 0, (char *) 0,
                                        idx, recvcnts[idx]);
            return mpi_errno;
        }
        offsets[idx] = total;
        total += recvcnts[idx];
    }

    /* Nothing to reduce or scatter */
    if (total == 0) {
        FREE(offsets);
        return MPI_SUCCESS;
    }

    /* Temporary buffer for the reduction result, shifted by -lb so that
       datatypes with a nonzero lower bound land correctly. */
    MPIR_ALLOC(tmpbuf, (void *) MALLOC(m_extent * total), comm,
               MPI_ERR_EXHAUSTED, myname);
    tmpbuf = (void *) ((char *) tmpbuf - lb);

    /* Reduce everything to rank 0, then hand each process its slice.
       A "partial support" failure of the reduce still attempts the
       scatter, matching the MPIR_ERR_OP_NOT_DEFINED convention. */
    mpi_errno = MPI_Reduce(sendbuf, tmpbuf, total, datatype->self, op, 0,
                           comm->self);
    if (mpi_errno == MPI_SUCCESS || mpi_errno == MPIR_ERR_OP_NOT_DEFINED) {
        rc = MPI_Scatterv(tmpbuf, recvcnts, offsets, datatype->self,
                          recvbuf, recvcnts[myrank], datatype->self, 0,
                          comm->self);
        if (rc)
            mpi_errno = rc;
    }

    /* Free the temporary buffers */
    FREE((char *) tmpbuf + lb);
    FREE(offsets);
    return (mpi_errno);
}
/*
 * intra_Scan - sequential (O(size)) inclusive scan: each rank receives the
 * running result from rank-1, combines in its own contribution, and forwards
 * the result to rank+1.  A modification of the algorithm in reduce.c could
 * make this O(log(size)).
 *
 * Commutative case needs no extra buffering: the incoming partial result is
 * received straight into recvbuf and combined with sendbuf.  The
 * non-commutative case must present the operands in rank order, so the
 * incoming result goes into a temporary buffer which is combined into
 * recvbuf (which already holds this rank's own value).
 *
 * Fix (review): error returns after MPID_THREAD_LOCK previously left the
 * collective lock held, and a failing MPI_Recv in the non-commutative
 * branch leaked tmpbuf.  tmpbuf is lifted to function scope (NULL when
 * unallocated) and all post-lock errors route through a cleanup path.
 *
 * Returns MPI_SUCCESS or an MPI error code; MPIR_Op_errno reports errors
 * detected by predefined operations.
 */
static int intra_Scan(void *sendbuf, void *recvbuf, int count,
                      struct MPIR_DATATYPE *datatype, MPI_Op op,
                      struct MPIR_COMMUNICATOR *comm)
{
    MPI_Status status;
    int rank, size;
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint lb, ub, m_extent;  /* Extent in memory */
    MPI_User_function *uop;
    struct MPIR_OP *op_ptr;
    void *tmpbuf = 0;           /* function scope so cleanup can free it */
    MPIR_ERROR_DECL;

    mpi_comm_err_ret = 0;

    /* See the overview in Collection Operations for why this is ok */
    if (count == 0)
        return MPI_SUCCESS;

    /* Get my rank & size and switch communicators to the hidden collective */
    MPIR_Comm_size(comm, &size);
    MPIR_Comm_rank(comm, &rank);
    MPIR_Type_get_limits(datatype, &lb, &ub);
    m_extent = ub - lb;
    comm = comm->comm_coll;
    op_ptr = MPIR_GET_OP_PTR(op);
    MPIR_TEST_MPI_OP(op, op_ptr, comm, "MPI_SCAN");
    uop = op_ptr->op;

    /* Lock for collective operation */
    MPID_THREAD_LOCK(comm->ADIctx, comm);

    MPIR_Op_errno = MPI_SUCCESS;

    if (op_ptr->commute) {
        /* Commutative: combine directly into recvbuf */
        if (rank > 0) {
            mpi_errno = MPI_Recv(recvbuf, count, datatype->self, rank - 1,
                                 MPIR_SCAN_TAG, comm->self, &status);
            if (mpi_errno)
                goto fn_fail;
            /* See reduce for why pop/push */
            MPIR_ERROR_POP(comm);
            (*uop) (sendbuf, recvbuf, &count, &datatype->self);
            MPIR_ERROR_PUSH(comm);
        }
        else {
            MPIR_COPYSELF(sendbuf, count, datatype->self, recvbuf,
                          MPIR_SCAN_TAG, rank, comm->self);
            if (mpi_errno)
                goto fn_fail;
        }
    }
    else {
        /* Non-commutative: receive the upstream result into a temporary
           so the operands can be given to uop in rank order */
        if (rank > 0) {
            MPIR_ALLOC(tmpbuf, (void *) MALLOC(m_extent * count), comm,
                       MPI_ERR_EXHAUSTED, "MPI_SCAN");
            tmpbuf = (void *) ((char *) tmpbuf - lb);
            MPIR_COPYSELF(sendbuf, count, datatype->self, recvbuf,
                          MPIR_SCAN_TAG, rank, comm->self);
            if (mpi_errno)
                goto fn_fail;
            mpi_errno = MPI_Recv(tmpbuf, count, datatype->self, rank - 1,
                                 MPIR_SCAN_TAG, comm->self, &status);
            if (mpi_errno)
                goto fn_fail;
            (*uop) (tmpbuf, recvbuf, &count, &datatype->self);
            FREE((char *) tmpbuf + lb);
            tmpbuf = 0;
        }
        else {
            MPIR_COPYSELF(sendbuf, count, datatype->self, recvbuf,
                          MPIR_SCAN_TAG, rank, comm->self);
            if (mpi_errno)
                goto fn_fail;
        }
    }

    /* send the running result to the next rank */
    if (rank < (size - 1))
        mpi_errno = MPI_Send(recvbuf, count, datatype->self, rank + 1,
                             MPIR_SCAN_TAG, comm->self);

    /* If the predefined operation detected an error, report it here */
    if (mpi_errno == MPI_SUCCESS && MPIR_Op_errno)
        mpi_errno = MPIR_Op_errno;

    /* Unlock for collective operation */
    MPID_THREAD_UNLOCK(comm->ADIctx, comm);
    return (mpi_errno);

  fn_fail:
    /* Post-lock failure: free the temporary (if allocated) and release
       the collective lock before returning the error. */
    if (tmpbuf)
        FREE((char *) tmpbuf + lb);
    MPID_THREAD_UNLOCK(comm->ADIctx, comm);
    return mpi_errno;
}
/*
 * MPIR_intra_Scan - improved O(log(size)) inclusive scan (Nov. 98).
 *
 * Phase 1 (partial scans): while my "round rank" rr is odd I receive from
 * rank-dd and fold it in, doubling dd each round; the stated loop invariant
 * is that recvbuf holds the scan of ranks (rank-dd)+1 .. rank.  On exit rr
 * is even and recvbuf holds the scan of rr*dd .. rank.
 * Phase 2: forward the partial scan to rank+dd, then receive the completed
 * prefix from rank-dd so recvbuf holds the scan of 0 .. rank.
 * Phase 3: fan the finished result forward to rank+dd for successively
 * halved dd.
 *
 * Fix (review): every error return inside the region protected by
 * MPID_THREAD_DS_LOCK previously left the lock held and (for rank > 0)
 * leaked tmpbuf; the initial MPIR_COPYSELF result was also unchecked,
 * unlike every other call site in this file.  All post-lock errors now
 * route through a cleanup path.
 *
 * Returns MPI_SUCCESS or an MPI error code; MPIR_Op_errno reports errors
 * detected by predefined operations.
 */
int MPIR_intra_Scan(void *sendbuf, void *recvbuf, int count,
                    struct MPIR_DATATYPE *datatype, MPI_Op op,
                    struct MPIR_COMMUNICATOR *comm)
{
    MPI_Status status;
    int rank, size;
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint lb, ub, m_extent;  /* Extent in memory */
    MPI_User_function *uop;
    struct MPIR_OP *op_ptr;
    MPIR_ERROR_DECL;
    int dd;                     /* displacement, no of hops to send (power of 2) */
    int rr;                     /* "round rank" */
    void *tmpbuf = 0;           /* NULL until allocated (rank > 0 only) */

    mpi_comm_err_ret = 0;

    /* See the overview in Collection Operations for why this is ok */
    if (count == 0)
        return MPI_SUCCESS;

    /* Get my rank & size and switch communicators to the hidden collective */
    MPIR_Comm_size(comm, &size);
    MPIR_Comm_rank(comm, &rank);
    MPIR_Type_get_limits(datatype, &lb, &ub);
    m_extent = ub - lb;
    comm = comm->comm_coll;
    op_ptr = MPIR_GET_OP_PTR(op);
    MPIR_TEST_MPI_OP(op, op_ptr, comm, "MPI_SCAN");
    uop = op_ptr->op;

    /* Lock for collective operation */
    MPID_THREAD_DS_LOCK(comm);

    MPIR_Op_errno = MPI_SUCCESS;

    if (rank > 0) {
        /* allocate temporary receive buffer (needed both in commutative
           and noncommutative case) */
        MPIR_ALLOC(tmpbuf, (void *) MALLOC(m_extent * count), comm,
                   MPI_ERR_EXHAUSTED, "Out of space in MPI_SCAN");
        tmpbuf = (void *) ((char *) tmpbuf - lb);
    }

    /* seed recvbuf with my own contribution */
    MPIR_COPYSELF(sendbuf, count, datatype->self, recvbuf, MPIR_SCAN_TAG,
                  rank, comm->self);
    if (mpi_errno)
        goto fn_fail;

    /* compute partial scans */
    rr = rank;
    dd = 1;
    while ((rr & 1) == 1) {
        /* odd "round rank"s receive */
        mpi_errno = MPI_Recv(tmpbuf, count, datatype->self, rank - dd,
                             MPIR_SCAN_TAG, comm->self, &status);
        if (mpi_errno)
            goto fn_fail;
#ifdef WIN32
        if (op_ptr->stdcall)
            op_ptr->op_s(tmpbuf, recvbuf, &count, &datatype->self);
        else
#endif
            (*uop) (tmpbuf, recvbuf, &count, &datatype->self);
        dd <<= 1;               /* dd*2 */
        rr >>= 1;               /* rr/2 */
        /* Invariant: recvbuf contains the scan of
           (rank-dd)+1, (rank-dd)+2, ..., rank */
    }
    /* rr even, rank==rr*dd+dd-1, recvbuf contains the scan of
       rr*dd, rr*dd+1, ..., rank */

    /* send partial scan forwards */
    if (rank + dd < size) {
        mpi_errno = MPI_Send(recvbuf, count, datatype->self, rank + dd,
                             MPIR_SCAN_TAG, comm->self);
        if (mpi_errno)
            goto fn_fail;
    }

    if (rank - dd >= 0) {
        mpi_errno = MPI_Recv(tmpbuf, count, datatype->self, rank - dd,
                             MPIR_SCAN_TAG, comm->self, &status);
        if (mpi_errno)
            goto fn_fail;
#ifdef WIN32
        if (op_ptr->stdcall)
            op_ptr->op_s(tmpbuf, recvbuf, &count, &datatype->self);
        else
#endif
            (*uop) (tmpbuf, recvbuf, &count, &datatype->self);
        /* recvbuf contains the scan of 0, ..., rank */
    }

    /* send result forwards */
    do {
        dd >>= 1;               /* dd/2 */
    } while (rank + dd >= size);
    while (dd > 0) {
        mpi_errno = MPI_Send(recvbuf, count, datatype->self, rank + dd,
                             MPIR_SCAN_TAG, comm->self);
        if (mpi_errno)
            goto fn_fail;
        dd >>= 1;               /* dd/2 */
    }

    if (rank > 0) {
        /* free temporary receive buffer */
        FREE((char *) tmpbuf + lb);
    }

    /* If the predefined operation detected an error, report it here */
    if (mpi_errno == MPI_SUCCESS && MPIR_Op_errno)
        mpi_errno = MPIR_Op_errno;

    /* Unlock for collective operation */
    MPID_THREAD_DS_UNLOCK(comm);
    return (mpi_errno);

  fn_fail:
    /* Post-lock failure: free the temporary (allocated only for rank > 0)
       and release the collective lock before returning the error. */
    if (rank > 0)
        FREE((char *) tmpbuf + lb);
    MPID_THREAD_DS_UNLOCK(comm);
    return mpi_errno;
}