int MPID_Abort(MPIR_Comm * comm, int mpi_errno, int exit_code, const char *error_msg) { char sys_str[MPI_MAX_ERROR_STRING + 5] = ""; char comm_str[MPI_MAX_ERROR_STRING] = ""; char world_str[MPI_MAX_ERROR_STRING] = ""; char error_str[2 * MPI_MAX_ERROR_STRING + 128]; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_ABORT); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_ABORT); if (MPIR_Process.comm_world) { int rank = MPIR_Process.comm_world->rank; snprintf(world_str, sizeof(world_str), " on node %d", rank); } if (comm) { int rank = comm->rank; int context_id = comm->context_id; snprintf(comm_str, sizeof(comm_str), " (rank %d in comm %d)", rank, context_id); } if (!error_msg) error_msg = "Internal error"; if (mpi_errno != MPI_SUCCESS) { char msg[MPI_MAX_ERROR_STRING] = ""; MPIR_Err_get_string(mpi_errno, msg, MPI_MAX_ERROR_STRING, NULL); snprintf(sys_str, sizeof(msg), " (%s)", msg); } MPL_snprintf(error_str, sizeof(error_str), "Abort(%d)%s%s: %s%s\n", exit_code, world_str, comm_str, error_msg, sys_str); MPL_error_printf("%s", error_str); MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_ABORT); fflush(stderr); fflush(stdout); if (NULL == comm || (MPIR_Comm_size(comm) == 1 && comm->comm_kind == MPIR_COMM_KIND__INTRACOMM)) MPL_exit(exit_code); if (comm != MPIR_Process.comm_world) { MPIDIG_comm_abort(comm, exit_code); } else { #ifdef USE_PMIX_API PMIx_Abort(exit_code, error_msg, NULL, 0); #elif defined(USE_PMI2_API) PMI2_Abort(TRUE, error_msg); #else PMI_Abort(exit_code, error_msg); #endif } return 0; }
int MPIDU_bc_allgather(MPIR_Comm * comm, int *nodemap, void *bc, int bc_len, int same_len, void **bc_table, size_t ** bc_indices) { int mpi_errno = MPI_SUCCESS; int local_rank = -1, local_leader = -1; int rank = MPIR_Comm_rank(comm), size = MPIR_Comm_size(comm); mpi_errno = MPIDU_shm_barrier(barrier, local_size); if (mpi_errno) MPIR_ERR_POP(mpi_errno); if (!same_len) { bc_len *= 2; *bc_indices = indices; } MPIR_NODEMAP_get_local_info(rank, size, nodemap, &local_size, &local_rank, &local_leader); if (rank != local_leader) { size_t start = local_leader - nodemap[comm->rank] + (local_rank - 1); memcpy(&segment[start * bc_len], bc, bc_len); } mpi_errno = MPIDU_shm_barrier(barrier, local_size); if (mpi_errno) MPIR_ERR_POP(mpi_errno); if (rank == local_leader) { MPIR_Errflag_t errflag = MPIR_ERR_NONE; MPIR_Comm *allgather_comm = comm->node_roots_comm ? comm->node_roots_comm : comm; MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, segment, (local_size - 1) * bc_len, MPI_BYTE, allgather_comm, &errflag); } mpi_errno = MPIDU_shm_barrier(barrier, local_size); if (mpi_errno) MPIR_ERR_POP(mpi_errno); *bc_table = segment; fn_exit: return mpi_errno; fn_fail: goto fn_exit; }
/*@ MPI_Graph_map - Maps process to graph topology information Input Parameters: + comm - input communicator (handle) . nnodes - number of graph nodes (integer) . index - integer array specifying the graph structure, see 'MPI_GRAPH_CREATE' - edges - integer array specifying the graph structure Output Parameter: . newrank - reordered rank of the calling process; 'MPI_UNDEFINED' if the calling process does not belong to graph (integer) .N fortran .N Errors .N MPI_SUCCESS .N MPI_ERR_TOPOLOGY .N MPI_ERR_COMM .N MPI_ERR_ARG @*/ EXPORT_MPI_API int MPI_Graph_map ( MPI_Comm comm_old, int nnodes, int *index, int *edges, int *newrank ) { int rank, size; int mpi_errno = MPI_SUCCESS; struct MPIR_COMMUNICATOR *comm_old_ptr; static char myname[] = "MPI_GRAPH_MAP"; TR_PUSH(myname); comm_old_ptr = MPIR_GET_COMM_PTR(comm_old); #ifndef MPIR_NO_ERROR_CHECKING MPIR_TEST_MPI_COMM(comm_old,comm_old_ptr,comm_old_ptr,myname); if (nnodes < 1) mpi_errno = MPI_ERR_ARG; MPIR_TEST_ARG(newrank); MPIR_TEST_ARG(index); MPIR_TEST_ARG(edges); if (mpi_errno) return MPIR_ERROR(comm_old_ptr, mpi_errno, myname ); #endif /* Test that the communicator is large enough */ MPIR_Comm_size( comm_old_ptr, &size ); if (size < nnodes) { return MPIR_ERROR( comm_old_ptr, MPI_ERR_ARG, myname ); } /* Am I in this topology? */ MPIR_Comm_rank ( comm_old_ptr, &rank ); if ( rank < nnodes ) (*newrank) = rank; else (*newrank) = MPI_UNDEFINED; TR_POP; return (mpi_errno); }
static int intra_Scatter(void *sendbuf, int sendcnt, struct MPIR_DATATYPE *sendtype, void *recvbuf, int recvcnt, struct MPIR_DATATYPE *recvtype, int root, struct MPIR_COMMUNICATOR *comm) { MPI_Status status; MPI_Aint extent; int rank, size, i; int mpi_errno = MPI_SUCCESS; static char myname[] = "MPI_SCATTER"; /* Get size and rank */ MPIR_Comm_size(comm, &size); MPIR_Comm_rank(comm, &rank); /* Check for invalid arguments */ #ifndef MPIR_NO_ERROR_CHECKING if (root >= size) mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_ROOT_TOOBIG, myname, (char *) 0, (char *) 0, root, size); if (root < 0) mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_DEFAULT, myname, (char *) 0, (char *) 0, root); if (mpi_errno) return MPIR_ERROR(comm, mpi_errno, myname); #endif /* Switch communicators to the hidden collective */ comm = comm->comm_coll; /* Get the size of the send type */ MPI_Type_extent(sendtype->self, &extent); /* Lock for collective operation */ MPID_THREAD_LOCK(comm->ADIctx, comm); /* If I'm the root, send messages to the rest of 'em */ if (rank == root) { for (i = 0; i < root; i++) { mpi_errno = MPI_Send((void *) ((char *) sendbuf + i * sendcnt * extent), sendcnt, sendtype->self, i, MPIR_SCATTER_TAG, comm->self); if (mpi_errno) return mpi_errno; } mpi_errno = MPI_Sendrecv((void *) ((char *) sendbuf + rank * sendcnt * extent), sendcnt, sendtype->self, rank, MPIR_SCATTER_TAG, recvbuf, recvcnt, recvtype->self, rank, MPIR_SCATTER_TAG, comm->self, &status); if (mpi_errno) return mpi_errno; for (i = root + 1; i < size; i++) { mpi_errno = MPI_Send((void *) ((char *) sendbuf + i * sendcnt * extent), sendcnt, sendtype->self, i, MPIR_SCATTER_TAG, comm->self); if (mpi_errno) return mpi_errno; } } else mpi_errno = MPI_Recv(recvbuf, recvcnt, recvtype->self, root, MPIR_SCATTER_TAG, comm->self, &status); /* Unlock for collective operation */ MPID_THREAD_UNLOCK(comm->ADIctx, comm); return (mpi_errno); }
static int intra_Gatherv(void *sendbuf, int sendcnt, struct MPIR_DATATYPE *sendtype, void *recvbuf, int *recvcnts, int *displs, struct MPIR_DATATYPE *recvtype, int root, struct MPIR_COMMUNICATOR *comm) { int size, rank; int mpi_errno = MPI_SUCCESS; static char myname[] = "MPI_GATHERV"; /* Is root within the communicator? */ MPIR_Comm_size(comm, &size); #ifndef MPIR_NO_ERROR_CHECKING if (root >= size) mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_ROOT_TOOBIG, myname, (char *) 0, (char *) 0, root, size); if (root < 0) mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_DEFAULT, myname, (char *) 0, (char *) 0, root); if (mpi_errno) return MPIR_ERROR(comm, mpi_errno, myname); #endif /* Get my rank and switch communicators to the hidden collective */ MPIR_Comm_rank(comm, &rank); comm = comm->comm_coll; /* Lock for collective operation */ MPID_THREAD_LOCK(comm->ADIctx, comm); /* If rank == root, then I recv lots, otherwise I send */ if (rank == root) { MPI_Aint extent; int i; MPI_Request req; MPI_Status status; mpi_errno = MPI_Isend(sendbuf, sendcnt, sendtype->self, root, MPIR_GATHERV_TAG, comm->self, &req); if (mpi_errno) return mpi_errno; MPI_Type_extent(recvtype->self, &extent); for (i = 0; i < size; i++) { mpi_errno = MPI_Recv((void *) ((char *) recvbuf + displs[i] * extent), recvcnts[i], recvtype->self, i, MPIR_GATHERV_TAG, comm->self, &status); if (mpi_errno) return mpi_errno; } mpi_errno = MPI_Wait(&req, &status); } else mpi_errno = MPI_Send(sendbuf, sendcnt, sendtype->self, root, MPIR_GATHERV_TAG, comm->self); /* Unlock for collective operation */ MPID_THREAD_UNLOCK(comm->ADIctx, comm); return (mpi_errno); }
static int intra_Gather(void *sendbuf, int sendcnt, struct MPIR_DATATYPE *sendtype, void *recvbuf, int recvcount, struct MPIR_DATATYPE *recvtype, int root, struct MPIR_COMMUNICATOR *comm) { int size, rank; int mpi_errno = MPI_SUCCESS; MPI_Aint extent; /* Datatype extent */ static char myname[] = "MPI_GATHER"; /* Is root within the communicator? */ MPIR_Comm_size(comm, &size); #ifndef MPIR_NO_ERROR_CHECKING if (root >= size) mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_ROOT_TOOBIG, myname, (char *) 0, (char *) 0, root, size); if (root < 0) mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_DEFAULT, myname, (char *) 0, (char *) 0, root); if (mpi_errno) return MPIR_ERROR(comm, mpi_errno, myname); #endif /* Get my rank and switch communicators to the hidden collective */ MPIR_Comm_rank(comm, &rank); comm = comm->comm_coll; /* Lock for collective operation */ MPID_THREAD_LOCK(comm->ADIctx, comm); /* If rank == root, then I recv lots, otherwise I send */ /* This should use the same mechanism used in reduce; the intermediate nodes will need to allocate space. */ if (rank == root) { int i; MPI_Request req; MPI_Status status; /* This should really be COPYSELF.... , with the for look skipping root. */ mpi_errno = MPI_Isend(sendbuf, sendcnt, sendtype->self, root, MPIR_GATHER_TAG, comm->self, &req); if (mpi_errno) return mpi_errno; MPI_Type_extent(recvtype->self, &extent); for (i = 0; i < size; i++) { mpi_errno = MPI_Recv((void *) (((char *) recvbuf) + i * extent * recvcount), recvcount, recvtype->self, i, MPIR_GATHER_TAG, comm->self, &status); if (mpi_errno) return mpi_errno; } mpi_errno = MPI_Wait(&req, &status); } else mpi_errno = MPI_Send(sendbuf, sendcnt, sendtype->self, root, MPIR_GATHER_TAG, comm->self); /* Unlock for collective operation */ MPID_THREAD_UNLOCK(comm->ADIctx, comm); return (mpi_errno); }
static int intra_Bcast(void *buffer, int count, struct MPIR_DATATYPE *datatype, int root, struct MPIR_COMMUNICATOR *comm) { MPI_Status status; int rank, size, src, dst; int relative_rank, mask; int mpi_errno = MPI_SUCCESS; static char myname[] = "MPI_BCAST"; /* See the overview in Collection Operations for why this is ok */ if (count == 0) return MPI_SUCCESS; /* Is root within the comm and more than 1 processes involved? */ MPIR_Comm_size(comm, &size); #ifndef MPIR_NO_ERROR_CHECKING if (root >= size) mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_ROOT_TOOBIG, myname, (char *) 0, (char *) 0, root, size); if (mpi_errno) return MPIR_ERROR(comm, mpi_errno, myname); #endif /* If there is only one process */ if (size == 1) return (mpi_errno); /* Get my rank and switch communicators to the hidden collective */ MPIR_Comm_rank(comm, &rank); comm = comm->comm_coll; /* Algorithm: This uses a fairly basic recursive subdivision algorithm. The root sends to the process size/2 away; the receiver becomes a root for a subtree and applies the same process. So that the new root can easily identify the size of its subtree, the (subtree) roots are all powers of two (relative to the root) If m = the first power of 2 such that 2^m >= the size of the communicator, then the subtree at root at 2^(m-k) has size 2^k (with special handling for subtrees that aren't a power of two in size). Optimizations: The original code attempted to switch to a linear broadcast when the subtree size became too small. As a further variation, the subtree broadcast sent data to the center of the block, rather than to one end. However, the original code did not properly compute the communications, resulting in extraneous (though harmless) communication. For very small messages, using a linear algorithm (process 0 sends to process 1, who sends to 2, etc.) can be better, since no one process takes more than 1 send/recv time, and successive bcasts using the same root can overlap. Another important technique for long messages is pipelining---sending the messages in blocks so that the message can be pipelined through the network without waiting for the subtree roots to receive the entire message before forwarding it to other processors. This is hard to do if the datatype/count are not the same on each processor (note that this is allowed - only the signatures must match). Of course, this can be accomplished at the byte transfer level, but it is awkward from the MPI point-to-point routines. Nonblocking operations can be used to achieve some "horizontal" pipelining (on some systems) by allowing multiple send/receives to begin on the same processor. */ relative_rank = (rank >= root) ? rank - root : rank - root + size; /* Lock for collective operation */ MPID_THREAD_LOCK(comm->ADIctx, comm); /* Do subdivision. There are two phases: 1. Wait for arrival of data. Because of the power of two nature of the subtree roots, the source of this message is alwyas the process whose relative rank has the least significant bit CLEARED. That is, process 4 (100) receives from process 0, process 7 (111) from process 6 (110), etc. 2. Forward to my subtree Note that the process that is the tree root is handled automatically by this code, since it has no bits set. */ mask = 0x1; while (mask < size) { if (relative_rank & mask) { src = rank - mask; if (src < 0) src += size; mpi_errno = MPI_Recv(buffer, count, datatype->self, src, MPIR_BCAST_TAG, comm->self, &status); if (mpi_errno) return mpi_errno; break; } mask <<= 1; } /* This process is responsible for all processes that have bits set from the LSB upto (but not including) mask. Because of the "not including", we start by shifting mask back down one. We can easily change to a different algorithm at any power of two by changing the test (mask > 1) to (mask > block_size) One such version would use non-blocking operations for the last 2-4 steps (this also bounds the number of MPI_Requests that would be needed). */ mask >>= 1; while (mask > 0) { if (relative_rank + mask < size) { dst = rank + mask; if (dst >= size) dst -= size; mpi_errno = MPI_Send(buffer, count, datatype->self, dst, MPIR_BCAST_TAG, comm->self); if (mpi_errno) return mpi_errno; } mask >>= 1; } /* Unlock for collective operation */ MPID_THREAD_UNLOCK(comm->ADIctx, comm); return (mpi_errno); }
static int intra_Reduce(void *sendbuf, void *recvbuf, int count, struct MPIR_DATATYPE *datatype, MPI_Op op, int root, struct MPIR_COMMUNICATOR *comm) { MPI_Status status; int size, rank; int mask, relrank, source, lroot; int mpi_errno = MPI_SUCCESS; MPI_User_function *uop; MPI_Aint lb, ub, m_extent; /* Extent in memory */ void *buffer; struct MPIR_OP *op_ptr; static char myname[] = "MPI_REDUCE"; MPIR_ERROR_DECL; mpi_comm_err_ret = 0; /* Is root within the communicator? */ MPIR_Comm_size(comm, &size); #ifndef MPIR_NO_ERROR_CHECKING if (root >= size) mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_ROOT_TOOBIG, myname, (char *) 0, (char *) 0, root, size); if (root < 0) mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_DEFAULT, myname, (char *) 0, (char *) 0, root); if (mpi_errno) return MPIR_ERROR(comm, mpi_errno, myname); #endif /* See the overview in Collection Operations for why this is ok */ if (count == 0) return MPI_SUCCESS; /* If the operation is predefined, we could check that the datatype's type signature is compatible with the operation. */ #ifdef MPID_Reduce /* Eventually, this could apply the MPID_Reduce routine in a loop for counts > 1 */ if (comm->ADIReduce && count == 1) { /* Call a routine to sort through the datatypes and operations ... This allows us to provide partial support (e.g., only SUM_DOUBLE) */ if (MPIR_ADIReduce(comm->ADIctx, comm, sendbuf, recvbuf, count, datatype->self, op, root) == MPI_SUCCESS) return MPI_SUCCESS; } #endif /* Get my rank and switch communicators to the hidden collective */ MPIR_Comm_rank(comm, &rank); comm = comm->comm_coll; op_ptr = MPIR_GET_OP_PTR(op); MPIR_TEST_MPI_OP(op, op_ptr, comm, myname); uop = op_ptr->op; /* Here's the algorithm. Relative to the root, look at the bit pattern in my rank. Starting from the right (lsb), if the bit is 1, send to the node with that bit zero and exit; if the bit is 0, receive from the node with that bit set and combine (as long as that node is within the group) Note that by receiving with source selection, we guarentee that we get the same bits with the same input. If we allowed the parent to receive the children in any order, then timing differences could cause different results (roundoff error, over/underflows in some cases, etc). Because of the way these are ordered, if root is 0, then this is correct for both commutative and non-commutitive operations. If root is not 0, then for non-commutitive, we use a root of zero and then send the result to the root. To see this, note that the ordering is mask = 1: (ab)(cd)(ef)(gh) (odds send to evens) mask = 2: ((ab)(cd))((ef)(gh)) (3,6 send to 0,4) mask = 4: (((ab)(cd))((ef)(gh))) (4 sends to 0) Comments on buffering. If the datatype is not contiguous, we still need to pass contiguous data to the user routine. In this case, we should make a copy of the data in some format, and send/operate on that. In general, we can't use MPI_PACK, because the alignment of that is rather vague, and the data may not be re-usable. What we actually need is a "squeeze" operation that removes the skips. */ /* Make a temporary buffer */ MPIR_Type_get_limits(datatype, &lb, &ub); m_extent = ub - lb; /* MPI_Type_extent ( datatype, &extent ); */ MPIR_ALLOC(buffer, (void *) MALLOC(m_extent * count), comm, MPI_ERR_EXHAUSTED, "MPI_REDUCE"); buffer = (void *) ((char *) buffer - lb); /* If I'm not the root, then my recvbuf may not be valid, therefore I have to allocate a temporary one */ if (rank != root) { MPIR_ALLOC(recvbuf, (void *) MALLOC(m_extent * count), comm, MPI_ERR_EXHAUSTED, "MPI_REDUCE"); recvbuf = (void *) ((char *) recvbuf - lb); } /* This code isn't correct if the source is a more complex datatype */ memcpy(recvbuf, sendbuf, m_extent * count); mask = 0x1; if (op_ptr->commute) lroot = root; else lroot = 0; relrank = (rank - lroot + size) % size; /* Lock for collective operation */ MPID_THREAD_LOCK(comm->ADIctx, comm); MPIR_Op_errno = MPI_SUCCESS; while ( /*(mask & relrank) == 0 && */ mask < size) { /* Receive */ if ((mask & relrank) == 0) { source = (relrank | mask); if (source < size) { source = (source + lroot) % size; mpi_errno = MPI_Recv(buffer, count, datatype->self, source, MPIR_REDUCE_TAG, comm->self, &status); if (mpi_errno) return MPIR_ERROR(comm, mpi_errno, myname); /* The sender is above us, so the received buffer must be the second argument (in the noncommutitive case). */ /* error pop/push allows errors found by predefined routines to be visible. We need a better way to do this */ /* MPIR_ERROR_POP(comm); */ if (op_ptr->commute) (*uop) (buffer, recvbuf, &count, &datatype->self); else { (*uop) (recvbuf, buffer, &count, &datatype->self); /* short term hack to keep recvbuf up-to-date */ memcpy(recvbuf, buffer, m_extent * count); } /* MPIR_ERROR_PUSH(comm); */ } } else { /* I've received all that I'm going to. Send my result to my parent */ source = ((relrank & (~mask)) + lroot) % size; mpi_errno = MPI_Send(recvbuf, count, datatype->self, source, MPIR_REDUCE_TAG, comm->self); if (mpi_errno) return MPIR_ERROR(comm, mpi_errno, myname); break; } mask <<= 1; } FREE((char *) buffer + lb); if (!op_ptr->commute && root != 0) { if (rank == 0) { mpi_errno = MPI_Send(recvbuf, count, datatype->self, root, MPIR_REDUCE_TAG, comm->self); } else if (rank == root) { mpi_errno = MPI_Recv(recvbuf, count, datatype->self, 0, /*size-1, */ MPIR_REDUCE_TAG, comm->self, &status); } } /* Free the temporarily allocated recvbuf */ if (rank != root) FREE((char *) recvbuf + lb); /* If the predefined operation detected an error, report it here */ /* Note that only the root gets this result, so this can cause programs to hang, particularly if this is used to implement MPI_Allreduce. Use care with this. */ if (mpi_errno == MPI_SUCCESS && MPIR_Op_errno) { /* PRINTF( "Error in performing MPI_Op in reduce\n" ); */ mpi_errno = MPIR_Op_errno; } /* Unlock for collective operation */ MPID_THREAD_UNLOCK(comm->ADIctx, comm); return (mpi_errno); }
static int intra_Alltoall(void *sendbuf, int sendcount, struct MPIR_DATATYPE *sendtype, void *recvbuf, int recvcnt, struct MPIR_DATATYPE *recvtype, struct MPIR_COMMUNICATOR *comm) { int size, i, j; int me; MPI_Aint send_extent, recv_extent; int mpi_errno = MPI_SUCCESS; MPI_Status *starray; MPI_Request *reqarray; static char myname[] = "MPI_ALLTOALL"; /* Get size and switch to collective communicator */ MPIR_Comm_size(comm, &size); MPIR_Comm_rank(comm, &me); comm = comm->comm_coll; /* Get extent of send and recv types */ MPI_Type_extent(sendtype->self, &send_extent); MPI_Type_extent(recvtype->self, &recv_extent); /* Lock for collective operation */ MPID_THREAD_LOCK(comm->ADIctx, comm); /* */ /* 1st, get some storage from the heap to hold handles, etc. */ MPIR_ALLOC(starray, (MPI_Status *) MALLOC(2 * size * sizeof(MPI_Status)), comm, MPI_ERR_EXHAUSTED, myname); MPIR_ALLOC(reqarray, (MPI_Request *) MALLOC(2 * size * sizeof(MPI_Request)), comm, MPI_ERR_EXHAUSTED, myname); /* do the communication -- post *all* sends and receives: */ /* ServerNet Optimization. Post all receives then synchronously cycle through all of the sends, */ for (i = 0; i < size; i++) { /* We'd like to avoid sending and receiving to ourselves; however, this is complicated by the presence of different sendtype and recvtypes. */ if ((mpi_errno = MPI_Irecv((void *) ((char *) recvbuf + (((i + me) % size) * recvcnt * recv_extent)), recvcnt, recvtype->self, ((i + me) % size), MPIR_ALLTOALL_TAG, comm->self, &reqarray[i])) ) break; } for (i = 0; i < size; i++) { MPI_Barrier(comm->self); if ((mpi_errno = MPI_Send((void *) ((char *) sendbuf + ((i + me) % size) * sendcount * send_extent), sendcount, sendtype->self, ((i + me) % size), MPIR_ALLTOALL_TAG, comm->self)) ) break; } if (mpi_errno) return mpi_errno; /* ... then wait for *all* of them to finish: */ mpi_errno = MPI_Waitall(size, reqarray, starray); if (mpi_errno == MPI_ERR_IN_STATUS) { for (j = 0; j < size; j++) { if (starray[j].MPI_ERROR != MPI_SUCCESS) mpi_errno = starray[j].MPI_ERROR; } } /* clean up */ FREE(starray); FREE(reqarray); /* Unlock for collective operation */ MPID_THREAD_UNLOCK(comm->ADIctx, comm); return (mpi_errno); }
static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model, MPIR_Info * info, MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr) { int mpi_errno = MPI_SUCCESS; int i; MPIR_Comm *win_comm_ptr; int win_target_pool_size; MPIR_CHKPMEM_DECL(5); MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_WIN_INIT); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_WIN_INIT); MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); if (initRMAoptions) { MPIDI_CH3_RMA_Init_sync_pvars(); MPIDI_CH3_RMA_Init_pkthandler_pvars(); initRMAoptions = 0; } MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); *win_ptr = (MPIR_Win *) MPIR_Handle_obj_alloc(&MPIR_Win_mem); MPIR_ERR_CHKANDJUMP1(!(*win_ptr), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPIR_Win_mem"); mpi_errno = MPIR_Comm_dup_impl(comm_ptr, &win_comm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_Object_set_ref(*win_ptr, 1); /* (*win_ptr)->errhandler is set by upper level; */ /* (*win_ptr)->base is set by caller; */ (*win_ptr)->size = size; (*win_ptr)->disp_unit = disp_unit; (*win_ptr)->create_flavor = create_flavor; (*win_ptr)->model = model; (*win_ptr)->attributes = NULL; (*win_ptr)->comm_ptr = win_comm_ptr; (*win_ptr)->at_completion_counter = 0; (*win_ptr)->shm_base_addrs = NULL; /* (*win_ptr)->basic_info_table[] is set by caller; */ (*win_ptr)->current_lock_type = MPID_LOCK_NONE; (*win_ptr)->shared_lock_ref_cnt = 0; (*win_ptr)->target_lock_queue_head = NULL; (*win_ptr)->shm_allocated = FALSE; (*win_ptr)->states.access_state = MPIDI_RMA_NONE; (*win_ptr)->states.exposure_state = MPIDI_RMA_NONE; (*win_ptr)->num_targets_with_pending_net_ops = 0; (*win_ptr)->start_ranks_in_win_grp = NULL; (*win_ptr)->start_grp_size = 0; (*win_ptr)->lock_all_assert = 0; (*win_ptr)->lock_epoch_count = 0; (*win_ptr)->outstanding_locks = 0; (*win_ptr)->current_target_lock_data_bytes = 0; (*win_ptr)->sync_request_cnt = 0; (*win_ptr)->active = FALSE; (*win_ptr)->next = NULL; (*win_ptr)->prev = NULL; (*win_ptr)->outstanding_acks = 0; /* Initialize the info flags */ (*win_ptr)->info_args.no_locks = 0; (*win_ptr)->info_args.accumulate_ordering = MPIDI_ACC_ORDER_RAR | MPIDI_ACC_ORDER_RAW | MPIDI_ACC_ORDER_WAR | MPIDI_ACC_ORDER_WAW; (*win_ptr)->info_args.accumulate_ops = MPIDI_ACC_OPS_SAME_OP_NO_OP; (*win_ptr)->info_args.same_size = 0; (*win_ptr)->info_args.same_disp_unit = FALSE; (*win_ptr)->info_args.alloc_shared_noncontig = 0; (*win_ptr)->info_args.alloc_shm = FALSE; if ((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE || (*win_ptr)->create_flavor == MPI_WIN_FLAVOR_SHARED) { (*win_ptr)->info_args.alloc_shm = TRUE; } /* Set info_args on window based on info provided by user */ mpi_errno = MPID_Win_set_info((*win_ptr), info); if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno); MPIR_CHKPMEM_MALLOC((*win_ptr)->op_pool_start, MPIDI_RMA_Op_t *, sizeof(MPIDI_RMA_Op_t) * MPIR_CVAR_CH3_RMA_OP_WIN_POOL_SIZE, mpi_errno, "RMA op pool", MPL_MEM_RMA); (*win_ptr)->op_pool_head = NULL; for (i = 0; i < MPIR_CVAR_CH3_RMA_OP_WIN_POOL_SIZE; i++) { (*win_ptr)->op_pool_start[i].pool_type = MPIDI_RMA_POOL_WIN; DL_APPEND((*win_ptr)->op_pool_head, &((*win_ptr)->op_pool_start[i])); } win_target_pool_size = MPL_MIN(MPIR_CVAR_CH3_RMA_TARGET_WIN_POOL_SIZE, MPIR_Comm_size(win_comm_ptr)); MPIR_CHKPMEM_MALLOC((*win_ptr)->target_pool_start, MPIDI_RMA_Target_t *, sizeof(MPIDI_RMA_Target_t) * win_target_pool_size, mpi_errno, "RMA target pool", MPL_MEM_RMA); (*win_ptr)->target_pool_head = NULL; for (i = 0; i < win_target_pool_size; i++) { (*win_ptr)->target_pool_start[i].pool_type = MPIDI_RMA_POOL_WIN; DL_APPEND((*win_ptr)->target_pool_head, &((*win_ptr)->target_pool_start[i])); } (*win_ptr)->num_slots = MPL_MIN(MPIR_CVAR_CH3_RMA_SLOTS_SIZE, MPIR_Comm_size(win_comm_ptr)); MPIR_CHKPMEM_MALLOC((*win_ptr)->slots, MPIDI_RMA_Slot_t *, sizeof(MPIDI_RMA_Slot_t) * (*win_ptr)->num_slots, mpi_errno, "RMA slots", MPL_MEM_RMA); for (i = 0; i < (*win_ptr)->num_slots; i++) { (*win_ptr)->slots[i].target_list_head = NULL; } MPIR_CHKPMEM_MALLOC((*win_ptr)->target_lock_entry_pool_start, MPIDI_RMA_Target_lock_entry_t *, sizeof(MPIDI_RMA_Target_lock_entry_t) * MPIR_CVAR_CH3_RMA_TARGET_LOCK_ENTRY_WIN_POOL_SIZE, mpi_errno, "RMA lock entry pool", MPL_MEM_RMA); (*win_ptr)->target_lock_entry_pool_head = NULL; for (i = 0; i < MPIR_CVAR_CH3_RMA_TARGET_LOCK_ENTRY_WIN_POOL_SIZE; i++) { DL_APPEND((*win_ptr)->target_lock_entry_pool_head, &((*win_ptr)->target_lock_entry_pool_start[i])); } if (MPIDI_RMA_Win_inactive_list_head == NULL && MPIDI_RMA_Win_active_list_head == NULL) { /* this is the first window, register RMA progress hook */ mpi_errno = MPID_Progress_register_hook(MPIDI_CH3I_RMA_Make_progress_global, &MPIDI_CH3I_RMA_Progress_hook_id); if (mpi_errno) { MPIR_ERR_POP(mpi_errno); } } DL_APPEND(MPIDI_RMA_Win_inactive_list_head, (*win_ptr)); if (MPIDI_CH3U_Win_hooks.win_init != NULL) { mpi_errno = MPIDI_CH3U_Win_hooks.win_init(size, disp_unit, create_flavor, model, info, comm_ptr, win_ptr); if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno); } fn_exit: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_WIN_INIT); return mpi_errno; fn_fail: MPIR_CHKPMEM_REAP(); goto fn_exit; }
int MPIR_intra_Scan ( void *sendbuf, void *recvbuf, int count, struct MPIR_DATATYPE *datatype, MPI_Op op, struct MPIR_COMMUNICATOR *comm ) { MPI_Status status; int rank, size; int mpi_errno = MPI_SUCCESS; MPI_Aint lb, ub, m_extent; /* Extent in memory */ MPI_User_function *uop; struct MPIR_OP *op_ptr; MPIR_ERROR_DECL; int dd; /* displacement, no of hops to send (power of 2) */ int rr; /* "round rank" */ void *tmpbuf; mpi_comm_err_ret = 0; /* Nov. 98: Improved O(log(size)) algorithm */ /* See the overview in Collection Operations for why this is ok */ if (count == 0) return MPI_SUCCESS; /* Get my rank & size and switch communicators to the hidden collective */ MPIR_Comm_size ( comm, &size ); MPIR_Comm_rank ( comm, &rank ); MPIR_Type_get_limits( datatype, &lb, &ub ); m_extent = ub - lb; comm = comm->comm_coll; op_ptr = MPIR_GET_OP_PTR(op); MPIR_TEST_MPI_OP(op,op_ptr,comm,"MPI_SCAN"); uop = op_ptr->op; /* Lock for collective operation */ MPID_THREAD_DS_LOCK(comm); MPIR_Op_errno = MPI_SUCCESS; if (rank>0) { /* allocate temporary receive buffer (needed both in commutative and noncommutative case) */ MPIR_ALLOC(tmpbuf,(void *)MALLOC(m_extent * count), comm, MPI_ERR_EXHAUSTED, "Out of space in MPI_SCAN" ); tmpbuf = (void *)((char*)tmpbuf-lb); } MPIR_COPYSELF( sendbuf, count, datatype->self, recvbuf, MPIR_SCAN_TAG, rank, comm->self ); /* compute partial scans */ rr = rank; dd = 1; while ((rr&1)==1) { /* odd "round rank"s receive */ mpi_errno = MPI_Recv(tmpbuf,count,datatype->self,rank-dd, MPIR_SCAN_TAG,comm->self,&status); if (mpi_errno) return mpi_errno; #ifdef WIN32 if(op_ptr->stdcall) op_ptr->op_s(tmpbuf, recvbuf, &count, &datatype->self); else #endif (*uop)(tmpbuf, recvbuf, &count, &datatype->self); dd <<= 1; /* dd*2 */ rr >>= 1; /* rr/2 */ /* Invariant: recvbuf contains the scan of (rank-dd)+1, (rank-dd)+2,..., rank */ } /* rr even, rank==rr*dd+dd-1, recvbuf contains the scan of rr*dd, rr*dd+1,..., rank */ /* send partial scan forwards */ if (rank+dd<size) { mpi_errno = MPI_Send(recvbuf,count,datatype->self,rank+dd,MPIR_SCAN_TAG, comm->self); if (mpi_errno) return mpi_errno; } if (rank-dd>=0) { mpi_errno = MPI_Recv(tmpbuf,count,datatype->self,rank-dd, MPIR_SCAN_TAG,comm->self,&status); if (mpi_errno) return mpi_errno; #ifdef WIN32 if(op_ptr->stdcall) op_ptr->op_s(tmpbuf, recvbuf, &count, &datatype->self); else #endif (*uop)(tmpbuf, recvbuf, &count, &datatype->self); /* recvbuf contains the scan of 0,..., rank */ } /* send result forwards */ do { dd >>= 1; /* dd/2 */ } while (rank+dd>=size); while (dd>0) { mpi_errno = MPI_Send(recvbuf,count,datatype->self,rank+dd,MPIR_SCAN_TAG, comm->self); if (mpi_errno) return mpi_errno; dd >>= 1; /* dd/2 */ } if (rank>0) { /* free temporary receive buffer */ FREE((char*)tmpbuf+lb); } /* If the predefined operation detected an error, report it here */ if (mpi_errno == MPI_SUCCESS && MPIR_Op_errno) mpi_errno = MPIR_Op_errno; /* Unlock for collective operation */ MPID_THREAD_DS_UNLOCK(comm); return(mpi_errno); }
int MPIR_intra_Scan ( void *sendbuf, void *recvbuf, int count, struct MPIR_DATATYPE *datatype, MPI_Op op, struct MPIR_COMMUNICATOR *comm ) { MPI_Status status; int rank, size; int mpi_errno = MPI_SUCCESS; MPI_User_function *uop; struct MPIR_OP *op_ptr; int mask, dst; MPI_Aint extent, lb; void *partial_scan, *tmp_buf; static char myname[] = "MPI_SCAN"; if (count == 0) return MPI_SUCCESS; MPIR_Comm_size(comm, &size); MPIR_Comm_rank(comm, &rank); /* Switch communicators to the hidden collective */ comm = comm->comm_coll; /* Lock for collective operation */ MPID_THREAD_LOCK(comm->ADIctx,comm); op_ptr = MPIR_GET_OP_PTR(op); MPIR_TEST_MPI_OP(op,op_ptr,comm,myname); uop = op_ptr->op; /* need to allocate temporary buffer to store partial scan*/ MPI_Type_extent(datatype->self, &extent); MPIR_ALLOC(partial_scan,(void *)MALLOC(count*extent), comm, MPI_ERR_EXHAUSTED, myname); /* adjust for potential negative lower bound in datatype */ MPI_Type_lb( datatype->self, &lb ); partial_scan = (void *)((char*)partial_scan - lb); /* need to allocate temporary buffer to store incoming data*/ MPIR_ALLOC(tmp_buf,(void *)MALLOC(count*extent), comm, MPI_ERR_EXHAUSTED, myname); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - lb); /* Since this is an inclusive scan, copy local contribution into recvbuf. */ mpi_errno = MPI_Sendrecv ( sendbuf, count, datatype->self, rank, MPIR_SCAN_TAG, recvbuf, count, datatype->self, rank, MPIR_SCAN_TAG, comm->self, &status ); if (mpi_errno) return mpi_errno; mpi_errno = MPI_Sendrecv ( sendbuf, count, datatype->self, rank, MPIR_SCAN_TAG, partial_scan, count, datatype->self, rank, MPIR_SCAN_TAG, comm->self, &status ); if (mpi_errno) return mpi_errno; mask = 0x1; while (mask < size) { dst = rank ^ mask; if (dst < size) { /* Send partial_scan to dst. Recv into tmp_buf */ mpi_errno = MPI_Sendrecv(partial_scan, count, datatype->self, dst, MPIR_SCAN_TAG, tmp_buf, count, datatype->self, dst, MPIR_SCAN_TAG, comm->self, &status); if (mpi_errno) return mpi_errno; if (rank > dst) { (*uop)(tmp_buf, partial_scan, &count, &datatype->self); (*uop)(tmp_buf, recvbuf, &count, &datatype->self); } else { if (op_ptr->commute) (*uop)(tmp_buf, partial_scan, &count, &datatype->self); else { (*uop)(partial_scan, tmp_buf, &count, &datatype->self); mpi_errno = MPI_Sendrecv(tmp_buf, count, datatype->self, rank, MPIR_SCAN_TAG, partial_scan, count, datatype->self, rank, MPIR_SCAN_TAG, comm->self, &status); if (mpi_errno) return mpi_errno; } } } mask <<= 1; } FREE((char *)partial_scan+lb); FREE((char *)tmp_buf+lb); /* Unlock for collective operation */ MPID_THREAD_UNLOCK(comm->ADIctx,comm); return (mpi_errno); }
/*@ MPI_Comm_size - Determines the size of the group associated with a communicator Input Parameters: . comm - communicator (handle) Output Parameters: . size - number of processes in the group of 'comm' (integer) Notes: .N NULL .N SignalSafe .N Fortran .N Errors .N MPI_SUCCESS .N MPI_ERR_COMM .N MPI_ERR_ARG @*/ int MPI_Comm_size( MPI_Comm comm, int *size ) { int mpi_errno = MPI_SUCCESS; MPID_Comm *comm_ptr = 0; MPID_MPI_STATE_DECL(MPID_STATE_MPI_COMM_SIZE); MPIR_ERRTEST_INITIALIZED_ORDIE(); MPID_MPI_FUNC_ENTER(MPID_STATE_MPI_COMM_SIZE); /* Validate parameters, especially handles needing to be converted */ # ifdef HAVE_ERROR_CHECKING { MPID_BEGIN_ERROR_CHECKS; { MPIR_ERRTEST_COMM(comm, mpi_errno); } MPID_END_ERROR_CHECKS; } # endif /* HAVE_ERROR_CHECKING */ /* Convert MPI object handles to object pointers */ MPID_Comm_get_ptr( comm, comm_ptr ); /* Validate parameters and objects (post conversion) */ # ifdef HAVE_ERROR_CHECKING { MPID_BEGIN_ERROR_CHECKS; { MPIR_ERRTEST_ARGNULL(size,"size",mpi_errno); /* Validate comm_ptr */ MPID_Comm_valid_ptr( comm_ptr, mpi_errno ); /* If comm_ptr is not valid, it will be reset to null */ if (mpi_errno) goto fn_fail; } MPID_END_ERROR_CHECKS; } # endif /* HAVE_ERROR_CHECKING */ /* ... body of routine ... */ *size = MPIR_Comm_size(comm_ptr); /* ... end of body of routine ... */ #ifdef HAVE_ERROR_CHECKING fn_exit: #endif MPID_MPI_FUNC_EXIT(MPID_STATE_MPI_COMM_SIZE); return mpi_errno; /* --BEGIN ERROR HANDLING-- */ # ifdef HAVE_ERROR_CHECKING fn_fail: { mpi_errno = MPIR_Err_create_code( mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**mpi_comm_size", "**mpi_comm_size %C %p", comm, size); } mpi_errno = MPIR_Err_return_comm( comm_ptr, FCNAME, mpi_errno ); goto fn_exit; # endif /* --END ERROR HANDLING-- */ }
/* This function produces topology aware trees for reduction and broadcasts, with different * K values. This is a heavy-weight function as it allocates shared memory, generates topology * information, builds a package-level tree (for package leaders), and a per-package tree. * These are combined in shared memory for other ranks to read out from. * */ int MPIDI_SHM_topology_tree_init(MPIR_Comm * comm_ptr, int root, int bcast_k, MPIR_Treealgo_tree_t * bcast_tree, int *bcast_topotree_fail, int reduce_k, MPIR_Treealgo_tree_t * reduce_tree, int *reduce_topotree_fail, MPIR_Errflag_t * errflag) { int *shared_region; MPL_shm_hnd_t fd; int num_ranks, rank; int mpi_errno = MPI_SUCCESS, mpi_errno_ret = MPI_SUCCESS; size_t shm_size; int **bind_map = NULL; int *max_entries_per_level = NULL; int **ranks_per_package = NULL; int *package_ctr = NULL; size_t topo_depth = 0; int package_level = 0, i, max_ranks_per_package = 0; bool mapfail_flag = false; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_TOPOLOGY_TREE_INIT); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_TOPOLOGY_TREE_INIT); num_ranks = MPIR_Comm_size(comm_ptr); rank = MPIR_Comm_rank(comm_ptr); /* Calculate the size of shared memory that would be needed */ shm_size = sizeof(int) * 5 * num_ranks + num_ranks * sizeof(cpu_set_t); /* STEP 1. Create shared memory region for exchanging topology information (root only) */ mpi_errno = MPIDIU_allocate_shm_segment(comm_ptr, shm_size, &fd, (void **) &shared_region, &mapfail_flag); if (mpi_errno || mapfail_flag) { /* for communication errors, just record the error but continue */ *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } /* STEP 2. Collect cpu_sets for each rank at the root */ cpu_set_t my_cpu_set; CPU_ZERO(&my_cpu_set); sched_getaffinity(0, sizeof(my_cpu_set), &my_cpu_set); ((cpu_set_t *) (shared_region))[rank] = my_cpu_set; mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } /* STEP 3. Root has all the cpu_set information, now build tree */ if (rank == root) { topo_depth = hwloc_topology_get_depth(MPIR_Process.hwloc_topology); bind_map = (int **) MPL_malloc(num_ranks * sizeof(int *), MPL_MEM_OTHER); MPIR_ERR_CHKANDJUMP(!bind_map, mpi_errno, MPI_ERR_OTHER, "**nomem"); for (i = 0; i < num_ranks; ++i) { bind_map[i] = (int *) MPL_calloc(topo_depth, sizeof(int), MPL_MEM_OTHER); MPIR_ERR_CHKANDJUMP(!bind_map[i], mpi_errno, MPI_ERR_OTHER, "**nomem"); } MPIDI_SHM_hwloc_init_bindmap(num_ranks, topo_depth, shared_region, bind_map); /* Done building the topology information */ /* STEP 3.1. Count the maximum entries at each level - used for breaking the tree into * intra/inter socket */ max_entries_per_level = (int *) MPL_calloc(topo_depth, sizeof(size_t), MPL_MEM_OTHER); MPIR_ERR_CHKANDJUMP(!max_entries_per_level, mpi_errno, MPI_ERR_OTHER, "**nomem"); package_level = MPIDI_SHM_topotree_get_package_level(topo_depth, max_entries_per_level, num_ranks, bind_map); if (MPIDI_SHM_TOPOTREE_DEBUG) fprintf(stderr, "Breaking topology at :: %d (default= %d)\n", package_level, MPIDI_SHM_TOPOTREE_CUTOFF); /* STEP 3.2. allocate space for the entries that go in each package based on hwloc info */ ranks_per_package = (int **) MPL_malloc(max_entries_per_level[package_level] * sizeof(int *), MPL_MEM_OTHER); MPIR_ERR_CHKANDJUMP(!ranks_per_package, mpi_errno, MPI_ERR_OTHER, "**nomem"); package_ctr = (int *) MPL_calloc(max_entries_per_level[package_level], sizeof(int), MPL_MEM_OTHER); MPIR_ERR_CHKANDJUMP(!package_ctr, mpi_errno, MPI_ERR_OTHER, "**nomem"); for (i = 0; i < max_entries_per_level[package_level]; ++i) { package_ctr[i] = 0; ranks_per_package[i] = (int *) MPL_calloc(num_ranks, sizeof(int), MPL_MEM_OTHER); MPIR_ERR_CHKANDJUMP(!ranks_per_package[i], mpi_errno, MPI_ERR_OTHER, "**nomem"); } /* sort the ranks into packages based on the binding information */ for (i = 0; i < num_ranks; ++i) { int package = bind_map[i][package_level]; ranks_per_package[package][package_ctr[package]++] = i; } max_ranks_per_package = 0; for (i = 0; i < max_entries_per_level[package_level]; ++i) { max_ranks_per_package = MPL_MAX(max_ranks_per_package, package_ctr[i]); } /* At this point we have done the common work in extracting topology information * and restructuring it to our needs. Now we generate the tree. */ /* For Bcast, package leaders are added before the package local ranks, and the per_package * tree is left_skewed */ mpi_errno = MPIDI_SHM_gen_tree(bcast_k, shared_region, max_entries_per_level, ranks_per_package, max_ranks_per_package, package_ctr, package_level, num_ranks, 1 /*package_leaders_first */ , 0 /*left_skewed */ , errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } } mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } /* Every rank copies their tree out from shared memory */ MPIDI_SHM_copy_tree(shared_region, num_ranks, rank, bcast_tree, bcast_topotree_fail); if (MPIDI_SHM_TOPOTREE_DEBUG) MPIDI_SHM_print_topotree_file("BCAST", comm_ptr->context_id, rank, bcast_tree); /* Wait until shared memory is available */ mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } /* Generate the reduce tree */ /* For Reduce, package leaders are added after the package local ranks, and the per_package * tree is right_skewed (children are added in the reverse order */ if (rank == root) { memset(shared_region, 0, shm_size); mpi_errno = MPIDI_SHM_gen_tree(reduce_k, shared_region, max_entries_per_level, ranks_per_package, max_ranks_per_package, package_ctr, package_level, num_ranks, 0 /*package_leaders_last */ , 1 /*right_skewed */ , errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } } mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } /* each rank copy the reduce tree out */ MPIDI_SHM_copy_tree(shared_region, num_ranks, rank, reduce_tree, reduce_topotree_fail); if (MPIDI_SHM_TOPOTREE_DEBUG) MPIDI_SHM_print_topotree_file("REDUCE", comm_ptr->context_id, rank, reduce_tree); /* Wait for all ranks to copy out the tree */ mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } /* Cleanup */ if (rank == root) { for (i = 0; i < max_entries_per_level[package_level]; ++i) { MPL_free(ranks_per_package[i]); } MPL_free(ranks_per_package); MPL_free(package_ctr); if (MPIDI_SHM_TOPOTREE_DEBUG) for (i = 0; i < topo_depth; ++i) { fprintf(stderr, "Level :: %d, Max :: %d\n", i, max_entries_per_level[i]); } for (i = 0; i < num_ranks; ++i) { MPL_free(bind_map[i]); } MPL_free(max_entries_per_level); MPL_free(bind_map); } MPIDIU_destroy_shm_segment(shm_size, &fd, (void **) &shared_region); fn_exit: if (rank == root && MPIDI_SHM_TOPOTREE_DEBUG) fprintf(stderr, "Done creating tree for %d\n", num_ranks); MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_TOPOLOGY_TREE_INIT); return mpi_errno; fn_fail: goto fn_exit; }
/*@ MPI_Cart_create - Makes a new communicator to which topology information has been attached Input Parameters: + comm_old - input communicator (handle) . ndims - number of dimensions of cartesian grid (integer) . dims - integer array of size ndims specifying the number of processes in each dimension . periods - logical array of size ndims specifying whether the grid is periodic (true) or not (false) in each dimension - reorder - ranking may be reordered (true) or not (false) (logical) Output Parameter: . comm_cart - communicator with new cartesian topology (handle) Algorithm: We ignore 'reorder' info currently. .N fortran .N Errors .N MPI_SUCCESS .N MPI_ERR_TOPOLOGY .N MPI_ERR_DIMS .N MPI_ERR_ARG @*/ int MPI_Cart_create ( MPI_Comm comm_old, int ndims, int *dims, int *periods, int reorder, MPI_Comm *comm_cart ) { int range[1][3]; MPI_Group group_old, group; int i, rank, num_ranks = 1; int mpi_errno = MPI_SUCCESS; int flag, size; MPIR_TOPOLOGY *topo; struct MPIR_COMMUNICATOR *comm_old_ptr; static char myname[] = "MPI_CART_CREATE"; TR_PUSH(myname); comm_old_ptr = MPIR_GET_COMM_PTR(comm_old); /* Check validity of arguments */ #ifndef MPIR_NO_ERROR_CHECKING MPIR_TEST_MPI_COMM(comm_old,comm_old_ptr,comm_old_ptr,myname); MPIR_TEST_ARG(comm_cart); MPIR_TEST_ARG(periods); if (ndims < 1 || dims == (int *)0) mpi_errno = MPI_ERR_DIMS; if (mpi_errno) return MPIR_ERROR(comm_old_ptr, mpi_errno, myname ); /* Check for Intra-communicator */ MPI_Comm_test_inter ( comm_old, &flag ); if (flag) return MPIR_ERROR(comm_old_ptr, MPIR_ERRCLASS_TO_CODE(MPI_ERR_COMM,MPIR_ERR_COMM_INTER), myname ); #endif /* Determine number of ranks in topology */ for ( i=0; i<ndims; i++ ) num_ranks *= (dims[i]>0)?dims[i]:-dims[i]; if ( num_ranks < 1 ) { (*comm_cart) = MPI_COMM_NULL; return MPIR_ERROR( comm_old_ptr, MPI_ERR_TOPOLOGY, myname ); } /* Is the old communicator big enough? */ MPIR_Comm_size (comm_old_ptr, &size); if (num_ranks > size) { mpi_errno = MPIR_Err_setmsg( MPI_ERR_TOPOLOGY, MPIR_ERR_TOPO_TOO_LARGE, myname, "Topology size is larger than size of communicator", "Topology size %d is greater than communicator size %d", num_ranks, size ); return MPIR_ERROR(comm_old_ptr, mpi_errno, myname ); } /* Make new comm */ range[0][0] = 0; range[0][1] = num_ranks - 1; range[0][2] = 1; MPI_Comm_group ( comm_old, &group_old ); MPI_Group_range_incl ( group_old, 1, range, &group ); MPI_Comm_create ( comm_old, group, comm_cart ); MPI_Group_free( &group ); MPI_Group_free( &group_old ); /* Store topology information in new communicator */ if ( (*comm_cart) != MPI_COMM_NULL ) { MPIR_ALLOC(topo,(MPIR_TOPOLOGY *) MPIR_SBalloc ( MPIR_topo_els ), comm_old_ptr,MPI_ERR_EXHAUSTED,myname); MPIR_SET_COOKIE(&topo->cart,MPIR_CART_TOPOL_COOKIE) topo->cart.type = MPI_CART; topo->cart.nnodes = num_ranks; topo->cart.ndims = ndims; MPIR_ALLOC(topo->cart.dims,(int *)MALLOC( sizeof(int) * 3 * ndims ), comm_old_ptr,MPI_ERR_EXHAUSTED,myname); topo->cart.periods = topo->cart.dims + ndims; topo->cart.position = topo->cart.periods + ndims; for ( i=0; i<ndims; i++ ) { topo->cart.dims[i] = dims[i]; topo->cart.periods[i] = periods[i]; } /* Compute my position */ MPI_Comm_rank ( (*comm_cart), &rank ); for ( i=0; i < ndims; i++ ) { num_ranks = num_ranks / dims[i]; topo->cart.position[i] = rank / num_ranks; rank = rank % num_ranks; } /* cache topology information */ MPI_Attr_put ( (*comm_cart), MPIR_TOPOLOGY_KEYVAL, (void *)topo ); } TR_POP; return (mpi_errno); }
static int intra_Scatterv(void *sendbuf, int *sendcnts, int *displs, struct MPIR_DATATYPE *sendtype, void *recvbuf, int recvcnt, struct MPIR_DATATYPE *recvtype, int root, struct MPIR_COMMUNICATOR *comm) { MPI_Status status; int rank, size; int mpi_errno = MPI_SUCCESS; static char myname[] = "MPI_SCATTERV"; /* Get size and rank */ MPIR_Comm_size(comm, &size); MPIR_Comm_rank(comm, &rank); /* Check for invalid arguments */ #ifndef MPIR_NO_ERROR_CHECKING if (root >= size) mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_ROOT_TOOBIG, myname, (char *) 0, (char *) 0, root, size); if (root < 0) mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_DEFAULT, myname, (char *) 0, (char *) 0, root); if (mpi_errno) return MPIR_ERROR(comm, mpi_errno, myname); #endif /* Switch communicators to the hidden collective */ comm = comm->comm_coll; /* Lock for collective operation */ MPID_THREAD_LOCK(comm->ADIctx, comm); /* If I'm the root, then scatter */ if (rank == root) { MPI_Aint extent; int i; MPI_Type_extent(sendtype->self, &extent); /* We could use Isend here, but since the receivers need to execute a simple Recv, it may not make much difference in performance, and using the blocking version is simpler */ for (i = 0; i < root; i++) { mpi_errno = MPI_Send((void *) ((char *) sendbuf + displs[i] * extent), sendcnts[i], sendtype->self, i, MPIR_SCATTERV_TAG, comm->self); if (mpi_errno) return mpi_errno; } mpi_errno = MPI_Sendrecv((void *) ((char *) sendbuf + displs[rank] * extent), sendcnts[rank], sendtype->self, rank, MPIR_SCATTERV_TAG, recvbuf, recvcnt, recvtype->self, rank, MPIR_SCATTERV_TAG, comm->self, &status); if (mpi_errno) return mpi_errno; for (i = root + 1; i < size; i++) { mpi_errno = MPI_Send((void *) ((char *) sendbuf + displs[i] * extent), sendcnts[i], sendtype->self, i, MPIR_SCATTERV_TAG, comm->self); if (mpi_errno) return mpi_errno; } } else mpi_errno = MPI_Recv(recvbuf, recvcnt, recvtype->self, root, MPIR_SCATTERV_TAG, comm->self, &status); /* Unlock for collective operation */ MPID_THREAD_UNLOCK(comm->ADIctx, comm); return (mpi_errno); }
static int intra_Allgatherv(void *sendbuf, int sendcount, struct MPIR_DATATYPE *sendtype, void *recvbuf, int *recvcounts, int *displs, struct MPIR_DATATYPE *recvtype, struct MPIR_COMMUNICATOR *comm) { int size, rank; int mpi_errno = MPI_SUCCESS; MPI_Status status; MPI_Aint recv_extent; int j, jnext, i, right, left; /* Get the size of the communicator */ MPIR_Comm_size(comm, &size); MPIR_Comm_rank(comm, &rank); /* Switch communicators to the hidden collective */ comm = comm->comm_coll; /* Do a gather for each process in the communicator This is the "circular" algorithm for allgatherv - each process sends to its right and receives from its left. This is faster than simply doing size Gathervs. */ MPI_Type_extent(recvtype->self, &recv_extent); /* First, load the "local" version in the recvbuf. */ mpi_errno = MPI_Sendrecv(sendbuf, sendcount, sendtype->self, rank, MPIR_ALLGATHERV_TAG, (void *) ((char *) recvbuf + displs[rank] * recv_extent), recvcounts[rank], recvtype->self, rank, MPIR_ALLGATHERV_TAG, comm->self, &status); if (mpi_errno) { return mpi_errno; } left = (size + rank - 1) % size; right = (rank + 1) % size; j = rank; jnext = left; for (i = 1; i < size; i++) { mpi_errno = MPI_Sendrecv((void *) ((char *) recvbuf + displs[j] * recv_extent), recvcounts[j], recvtype->self, right, MPIR_ALLGATHERV_TAG, (void *) ((char *) recvbuf + displs[jnext] * recv_extent), recvcounts[jnext], recvtype->self, left, MPIR_ALLGATHERV_TAG, comm->self, &status); if (mpi_errno) break; j = jnext; jnext = (size + jnext - 1) % size; } return (mpi_errno); }
static int intra_Reduce_scatter(void *sendbuf, void *recvbuf, int *recvcnts, struct MPIR_DATATYPE *datatype, MPI_Op op, struct MPIR_COMMUNICATOR *comm) { int rank, size, i, count = 0; MPI_Aint lb, ub, m_extent; /* Extent in memory */ int *displs; void *buffer; int mpi_errno = MPI_SUCCESS, rc; static char myname[] = "MPI_REDUCE_SCATTER"; /* Determine the "count" of items to reduce and set the displacements */ MPIR_Type_get_limits(datatype, &lb, &ub); m_extent = ub - lb; /* MPI_Type_extent (datatype, &extent); */ MPIR_Comm_size(comm, &size); MPIR_Comm_rank(comm, &rank); /* Allocate the displacements and initialize them */ MPIR_ALLOC(displs, (int *) MALLOC(size * sizeof(int)), comm, MPI_ERR_EXHAUSTED, myname); for (i = 0; i < size; i++) { displs[i] = count; count += recvcnts[i]; if (recvcnts[i] < 0) { FREE(displs); mpi_errno = MPIR_Err_setmsg(MPI_ERR_COUNT, MPIR_ERR_COUNT_ARRAY_NEG, myname, (char *) 0, (char *) 0, i, recvcnts[i]); return mpi_errno; } } /* Allocate a temporary buffer */ if (count == 0) { FREE(displs); return MPI_SUCCESS; } MPIR_ALLOC(buffer, (void *) MALLOC(m_extent * count), comm, MPI_ERR_EXHAUSTED, myname); buffer = (void *) ((char *) buffer - lb); /* Reduce to 0, then scatter */ mpi_errno = MPI_Reduce(sendbuf, buffer, count, datatype->self, op, 0, comm->self); if (mpi_errno == MPI_SUCCESS || mpi_errno == MPIR_ERR_OP_NOT_DEFINED) { rc = MPI_Scatterv(buffer, recvcnts, displs, datatype->self, recvbuf, recvcnts[rank], datatype->self, 0, comm->self); if (rc) mpi_errno = rc; } /* Free the temporary buffers */ FREE((char *) buffer + lb); FREE(displs); return (mpi_errno); }
static int intra_Alltoallv(void *sendbuf, int *sendcnts, int *sdispls, struct MPIR_DATATYPE *sendtype, void *recvbuf, int *recvcnts, int *rdispls, struct MPIR_DATATYPE *recvtype, struct MPIR_COMMUNICATOR *comm) { int size, i, j, rcnt; MPI_Aint send_extent, recv_extent; int mpi_errno = MPI_SUCCESS; MPI_Status *starray; MPI_Request *reqarray; /* Get size and switch to collective communicator */ MPIR_Comm_size(comm, &size); comm = comm->comm_coll; /* Get extent of send and recv types */ MPI_Type_extent(sendtype->self, &send_extent); MPI_Type_extent(recvtype->self, &recv_extent); /* Lock for collective operation */ MPID_THREAD_LOCK(comm->ADIctx, comm); /* 1st, get some storage from the heap to hold handles, etc. */ MPIR_ALLOC(starray, (MPI_Status *) MALLOC(2 * size * sizeof(MPI_Status)), comm, MPI_ERR_EXHAUSTED, "MPI_ALLTOALLV"); MPIR_ALLOC(reqarray, (MPI_Request *) MALLOC(2 * size * sizeof(MPI_Request)), comm, MPI_ERR_EXHAUSTED, "MPI_ALLTOALLV"); /* do the communication -- post *all* sends and receives: */ rcnt = 0; for (i = 0; i < size; i++) { reqarray[2 * i] = MPI_REQUEST_NULL; if ((mpi_errno = MPI_Irecv((void *) ((char *) recvbuf + rdispls[i] * recv_extent), recvcnts[i], recvtype->self, i, MPIR_ALLTOALLV_TAG, comm->self, &reqarray[2 * i + 1])) ) break; rcnt++; if ((mpi_errno = MPI_Isend((void *) ((char *) sendbuf + sdispls[i] * send_extent), sendcnts[i], sendtype->self, i, MPIR_ALLTOALLV_TAG, comm->self, &reqarray[2 * i])) ) break; rcnt++; } /* ... then wait for *all* of them to finish: */ if (mpi_errno) { /* We should really cancel all of the active requests */ for (j = 0; j < rcnt; j++) { MPI_Cancel(&reqarray[j]); } } else { mpi_errno = MPI_Waitall(2 * size, reqarray, starray); if (mpi_errno == MPI_ERR_IN_STATUS) { for (j = 0; j < 2 * size; j++) { if (starray[j].MPI_ERROR != MPI_SUCCESS) mpi_errno = starray[j].MPI_ERROR; } } } /* clean up */ FREE(reqarray); FREE(starray); /* Unlock for collective operation */ MPID_THREAD_UNLOCK(comm->ADIctx, comm); return (mpi_errno); }
static int intra_Scan(void *sendbuf, void *recvbuf, int count, struct MPIR_DATATYPE *datatype, MPI_Op op, struct MPIR_COMMUNICATOR *comm) { MPI_Status status; int rank, size; int mpi_errno = MPI_SUCCESS; MPI_Aint lb, ub, m_extent; /* Extent in memory */ MPI_User_function *uop; struct MPIR_OP *op_ptr; MPIR_ERROR_DECL; mpi_comm_err_ret = 0; /* See the overview in Collection Operations for why this is ok */ if (count == 0) return MPI_SUCCESS; /* Get my rank & size and switch communicators to the hidden collective */ MPIR_Comm_size(comm, &size); MPIR_Comm_rank(comm, &rank); MPIR_Type_get_limits(datatype, &lb, &ub); m_extent = ub - lb; comm = comm->comm_coll; op_ptr = MPIR_GET_OP_PTR(op); MPIR_TEST_MPI_OP(op, op_ptr, comm, "MPI_SCAN"); uop = op_ptr->op; /* Lock for collective operation */ MPID_THREAD_LOCK(comm->ADIctx, comm); /* This is an O(size) algorithm. A modification of the algorithm in reduce.c can be used to make this O(log(size)) */ /* commutative case requires no extra buffering */ MPIR_Op_errno = MPI_SUCCESS; if (op_ptr->commute) { /* Do the scan operation */ if (rank > 0) { mpi_errno = MPI_Recv(recvbuf, count, datatype->self, rank - 1, MPIR_SCAN_TAG, comm->self, &status); if (mpi_errno) return mpi_errno; /* See reduce for why pop/push */ MPIR_ERROR_POP(comm); (*uop) (sendbuf, recvbuf, &count, &datatype->self); MPIR_ERROR_PUSH(comm); } else { MPIR_COPYSELF(sendbuf, count, datatype->self, recvbuf, MPIR_SCAN_TAG, rank, comm->self); if (mpi_errno) return mpi_errno; } } /* non-commutative case requires extra buffering */ else { /* Do the scan operation */ if (rank > 0) { void *tmpbuf; MPIR_ALLOC(tmpbuf, (void *) MALLOC(m_extent * count), comm, MPI_ERR_EXHAUSTED, "MPI_SCAN"); tmpbuf = (void *) ((char *) tmpbuf - lb); MPIR_COPYSELF(sendbuf, count, datatype->self, recvbuf, MPIR_SCAN_TAG, rank, comm->self); if (mpi_errno) return mpi_errno; mpi_errno = MPI_Recv(tmpbuf, count, datatype->self, rank - 1, MPIR_SCAN_TAG, comm->self, &status); if (mpi_errno) return mpi_errno; (*uop) (tmpbuf, recvbuf, &count, &datatype->self); FREE((char *) tmpbuf + lb); } else { MPIR_COPYSELF(sendbuf, count, datatype->self, recvbuf, MPIR_SCAN_TAG, rank, comm->self); if (mpi_errno) return mpi_errno; } } /* send the letter to destination */ if (rank < (size - 1)) mpi_errno = MPI_Send(recvbuf, count, datatype->self, rank + 1, MPIR_SCAN_TAG, comm->self); /* If the predefined operation detected an error, report it here */ if (mpi_errno == MPI_SUCCESS && MPIR_Op_errno) mpi_errno = MPIR_Op_errno; /* Unlock for collective operation */ MPID_THREAD_UNLOCK(comm->ADIctx, comm); return (mpi_errno); }
/* Now the functions */ static int intra_Barrier(struct MPIR_COMMUNICATOR *comm) { int rank, size, N2_prev, surfeit; int d, dst, src; MPI_Status status; /* Intialize communicator size */ (void) MPIR_Comm_size(comm, &size); #ifdef MPID_Barrier if (comm->ADIBarrier) { MPID_Barrier(comm->ADIctx, comm); return MPI_SUCCESS; } #endif /* If there's only one member, this is trivial */ if (size > 1) { /* Initialize collective communicator */ comm = comm->comm_coll; (void) MPIR_Comm_rank(comm, &rank); (void) MPIR_Comm_N2_prev(comm, &N2_prev); surfeit = size - N2_prev; /* Lock for collective operation */ MPID_THREAD_LOCK(comm->ADIctx, comm); /* Perform a combine-like operation */ if (rank < N2_prev) { if (rank < surfeit) { /* get the fanin letter from the upper "half" process: */ dst = N2_prev + rank; MPI_Recv((void *) 0, 0, MPI_INT, dst, MPIR_BARRIER_TAG, comm->self, &status); } /* combine on embedded N2_prev power-of-two processes */ for (d = 1; d < N2_prev; d <<= 1) { dst = (rank ^ d); MPI_Sendrecv((void *) 0, 0, MPI_INT, dst, MPIR_BARRIER_TAG, (void *) 0, 0, MPI_INT, dst, MPIR_BARRIER_TAG, comm->self, &status); } /* fanout data to nodes above N2_prev... */ if (rank < surfeit) { dst = N2_prev + rank; MPI_Send((void *) 0, 0, MPI_INT, dst, MPIR_BARRIER_TAG, comm->self); } } else { /* fanin data to power of 2 subset */ src = rank - N2_prev; MPI_Sendrecv((void *) 0, 0, MPI_INT, src, MPIR_BARRIER_TAG, (void *) 0, 0, MPI_INT, src, MPIR_BARRIER_TAG, comm->self, &status); } /* Unlock for collective operation */ MPID_THREAD_UNLOCK(comm->ADIctx, comm); } return (MPI_SUCCESS); }
/*@ MPI_Intercomm_create - Creates an intercommuncator from two intracommunicators Input Paramters: + local_comm - Local (intra)communicator . local_leader - Rank in local_comm of leader (often 0) . peer_comm - Remote communicator . remote_leader - Rank in peer_comm of remote leader (often 0) - tag - Message tag to use in constructing intercommunicator; if multiple 'MPI_Intercomm_creates' are being made, they should use different tags (more precisely, ensure that the local and remote leaders are using different tags for each 'MPI_intercomm_create'). Output Parameter: . comm_out - Created intercommunicator Notes: The MPI 1.1 Standard contains two mutually exclusive comments on the input intracommunicators. One says that their repective groups must be disjoint; the other that the leaders can be the same process. After some discussion by the MPI Forum, it has been decided that the groups must be disjoint. Note that the `reason` given for this in the standard is `not` the reason for this choice; rather, the `other` operations on intercommunicators (like 'MPI_Intercomm_merge') do not make sense if the groups are not disjoint. .N fortran Algorithm: + 1) Allocate a send context, an inter-coll context, and an intra-coll context . 2) Send "send_context" and lrank_to_grank list from local comm group if I''m the local_leader. . 3) If I''m the local leader, then wait on the posted sends and receives to complete. Post the receive for the remote group information and wait for it to complete. . 4) Broadcast information received from the remote leader. . 5) Create the inter_communicator from the information we now have. - An inter-communicator ends up with three levels of communicators. The inter-communicator returned to the user, a "collective" inter-communicator that can be used for safe communications between local & remote groups, and a collective intra-communicator that can be used to allocate new contexts during the merge and dup operations. For the resulting inter-communicator, 'comm_out' .vb comm_out = inter-communicator comm_out->comm_coll = "collective" inter-communicator comm_out->comm_coll->comm_coll = safe collective intra-communicator .ve .N Errors .N MPI_SUCCESS .N MPI_ERR_COMM .N MPI_ERR_TAG .N MPI_ERR_EXHAUSTED .N MPI_ERR_RANK .seealso: MPI_Intercomm_merge, MPI_Comm_free, MPI_Comm_remote_group, MPI_Comm_remote_size @*/ EXPORT_MPI_API int MPI_Intercomm_create ( MPI_Comm local_comm, int local_leader, MPI_Comm peer_comm, int remote_leader, int tag, MPI_Comm *comm_out ) { int local_size, local_rank, peer_size, peer_rank; int remote_size; int mpi_errno = MPI_SUCCESS; MPIR_CONTEXT context, send_context; struct MPIR_GROUP *remote_group_ptr; struct MPIR_COMMUNICATOR *new_comm, *local_comm_ptr, *peer_comm_ptr; MPI_Request req[6]; MPI_Status status[6]; MPIR_ERROR_DECL; static char myname[]="MPI_INTERCOMM_CREATE"; TR_PUSH(myname); local_comm_ptr = MPIR_GET_COMM_PTR(local_comm); #ifndef MPIR_NO_ERROR_CHECKING /* Check for valid arguments to function */ MPIR_TEST_MPI_COMM(local_comm,local_comm_ptr,local_comm_ptr,myname); MPIR_TEST_SEND_TAG(tag); if (mpi_errno) return MPIR_ERROR(local_comm_ptr, mpi_errno, myname ); #endif if (local_comm == MPI_COMM_NULL) { mpi_errno = MPIR_Err_setmsg( MPI_ERR_COMM, MPIR_ERR_LOCAL_COMM, myname, "Local communicator must not be MPI_COMM_NULL", (char *)0 ); return MPIR_ERROR( local_comm_ptr, mpi_errno, myname ); } (void) MPIR_Comm_size ( local_comm_ptr, &local_size ); (void) MPIR_Comm_rank ( local_comm_ptr, &local_rank ); if ( local_leader == local_rank ) { /* Peer_comm need be valid only at local_leader */ peer_comm_ptr = MPIR_GET_COMM_PTR(peer_comm); if ((MPIR_TEST_COMM_NOTOK(peer_comm,peer_comm_ptr) || (peer_comm == MPI_COMM_NULL))) { mpi_errno = MPIR_Err_setmsg( MPI_ERR_COMM, MPIR_ERR_PEER_COMM, myname, "Peer communicator is not valid", (char *)0 ); return MPIR_ERROR( local_comm_ptr, mpi_errno, myname ); } (void) MPIR_Comm_size ( peer_comm_ptr, &peer_size ); (void) MPIR_Comm_rank ( peer_comm_ptr, &peer_rank ); if (((peer_rank == MPI_UNDEFINED) && (mpi_errno = MPI_ERR_RANK))) return MPIR_ERROR( local_comm_ptr, mpi_errno, myname ); if (((remote_leader >= peer_size) && (mpi_errno = MPI_ERR_RANK)) || ((remote_leader < 0) && (mpi_errno = MPI_ERR_RANK))) { mpi_errno = MPIR_Err_setmsg( MPI_ERR_RANK, MPIR_ERR_REMOTE_RANK, myname, "Error specifying remote_leader", "Error specifying remote_leader; value %d not between 0 and %d", remote_leader, peer_size ); return MPIR_ERROR( local_comm_ptr, mpi_errno, myname ); } } if (((local_leader >= local_size) && (mpi_errno = MPI_ERR_RANK)) || ((local_leader < 0) && (mpi_errno = MPI_ERR_RANK))) { mpi_errno = MPIR_Err_setmsg( MPI_ERR_RANK, MPIR_ERR_LOCAL_RANK, myname, "Error specifying local_leader", "Error specifying local_leader; value %d not in between 0 and %d", local_leader, local_size ); return MPIR_ERROR( local_comm_ptr, mpi_errno, myname ); } /* Allocate send context, inter-coll context and intra-coll context */ MPIR_Context_alloc ( local_comm_ptr, 3, &context ); /* If I'm the local leader, then exchange information */ if (local_rank == local_leader) { MPIR_ERROR_PUSH(peer_comm_ptr); /* Post the receives for the information from the remote_leader */ /* We don't post a receive for the remote group yet, because we */ /* don't know how big it is yet. */ MPIR_CALL_POP(MPI_Irecv (&remote_size, 1, MPI_INT, remote_leader, tag, peer_comm, &(req[2])),peer_comm_ptr,myname); MPIR_CALL_POP(MPI_Irecv (&send_context, 1, MPIR_CONTEXT_TYPE, remote_leader,tag, peer_comm, &(req[3])), peer_comm_ptr,myname); /* Send the lrank_to_grank table of the local_comm and an allocated */ /* context. Currently I use multiple messages to send this info. */ /* Eventually, this will change(?) */ MPIR_CALL_POP(MPI_Isend (&local_size, 1, MPI_INT, remote_leader, tag, peer_comm, &(req[0])),peer_comm_ptr,myname); MPIR_CALL_POP(MPI_Isend (&context, 1, MPIR_CONTEXT_TYPE, remote_leader, tag, peer_comm, &(req[1])),peer_comm_ptr,myname); /* Wait on the communication requests to finish */ MPIR_CALL_POP(MPI_Waitall ( 4, req, status ),peer_comm_ptr,myname); /* We now know how big the remote group is, so create it */ remote_group_ptr = MPIR_CreateGroup ( remote_size ); remote_group_ptr->self = (MPI_Group) MPIR_FromPointer( remote_group_ptr ); /* Post the receive for the group information */ MPIR_CALL_POP(MPI_Irecv (remote_group_ptr->lrank_to_grank, remote_size, MPI_INT, remote_leader, tag, peer_comm, &(req[5])),peer_comm_ptr,myname); /* Send the local group info to the remote group */ MPIR_CALL_POP(MPI_Isend (local_comm_ptr->group->lrank_to_grank, local_size, MPI_INT, remote_leader, tag, peer_comm, &(req[4])),peer_comm_ptr,myname); /* wait on the send and the receive for the group information */ MPIR_CALL_POP(MPI_Waitall ( 2, &(req[4]), &(status[4]) ),peer_comm_ptr, myname); MPIR_ERROR_POP(peer_comm_ptr); /* Now we can broadcast the group information to the other local comm */ /* members. */ MPIR_ERROR_PUSH(local_comm_ptr); MPIR_CALL_POP(MPI_Bcast(&remote_size,1,MPI_INT,local_rank,local_comm), local_comm_ptr,myname); MPIR_CALL_POP(MPI_Bcast(remote_group_ptr->lrank_to_grank, remote_size, MPI_INT, local_rank, local_comm),local_comm_ptr, myname); MPIR_ERROR_POP(local_comm_ptr); } /* Else I'm just an ordinary comm member, so receive the bcast'd */ /* info about the remote group */ else { MPIR_ERROR_PUSH(local_comm_ptr); MPIR_CALL_POP(MPI_Bcast(&remote_size, 1, MPI_INT, local_leader, local_comm),local_comm_ptr,myname); /* We now know how big the remote group is, so create it */ remote_group_ptr = MPIR_CreateGroup ( remote_size ); remote_group_ptr->self = (MPI_Group) MPIR_FromPointer( remote_group_ptr ); /* Receive the group info */ MPIR_CALL_POP(MPI_Bcast(remote_group_ptr->lrank_to_grank, remote_size, MPI_INT, local_leader, local_comm), local_comm_ptr,myname ); MPIR_ERROR_POP(local_comm_ptr); } MPIR_ERROR_PUSH(local_comm_ptr); /* Broadcast the send context */ MPIR_CALL_POP(MPI_Bcast(&send_context, 1, MPIR_CONTEXT_TYPE, local_leader, local_comm),local_comm_ptr,myname); MPIR_ERROR_POP(local_comm_ptr); /* We all now have all the information necessary, start building the */ /* inter-communicator */ MPIR_ALLOC(new_comm,NEW(struct MPIR_COMMUNICATOR),local_comm_ptr, MPI_ERR_EXHAUSTED,myname ); MPIR_Comm_init( new_comm, local_comm_ptr, MPIR_INTER ); *comm_out = new_comm->self; new_comm->group = remote_group_ptr; MPIR_Group_dup( local_comm_ptr->group, &(new_comm->local_group) ); new_comm->local_rank = new_comm->local_group->local_rank; new_comm->lrank_to_grank = new_comm->group->lrank_to_grank; new_comm->np = new_comm->group->np; new_comm->send_context = send_context; new_comm->recv_context = context; new_comm->comm_name = 0; if ((mpi_errno = MPID_CommInit( local_comm_ptr, new_comm )) ) return mpi_errno; (void) MPIR_Attr_create_tree ( new_comm ); /* Build the collective inter-communicator */ MPIR_Comm_make_coll( new_comm, MPIR_INTER ); MPIR_Comm_make_onesided( new_comm, MPIR_INTER ); /* Build the collective intra-communicator. Note that we require an intra-communicator for the "coll_comm" so that MPI_COMM_DUP can use it for some collective operations (do we need this for MPI-2 with intercommunicator collective?) Note that this really isn't the right thing to do; we need to replace *all* of the Mississippi state collective code. */ MPIR_Comm_make_coll( new_comm->comm_coll, MPIR_INTRA ); #if 0 MPIR_Comm_make_coll( new_comm->comm_onesided, MPIR_INTRA ); #endif /* Remember it for the debugger */ MPIR_Comm_remember ( new_comm ); TR_POP; return (mpi_errno); }