int MPIR_Get_intercomm_contextid(MPID_Comm * comm_ptr, MPIU_Context_id_t * context_id, MPIU_Context_id_t * recvcontext_id) { MPIU_Context_id_t mycontext_id, remote_context_id; int mpi_errno = MPI_SUCCESS; int tag = 31567; /* FIXME - we need an internal tag or * communication channel. Can we use a different * context instead?. Or can we use the tag * provided in the intercomm routine? (not on a dup, * but in that case it can use the collective context) */ MPIR_Errflag_t errflag = MPIR_ERR_NONE; MPID_MPI_STATE_DECL(MPID_STATE_MPIR_GET_INTERCOMM_CONTEXTID); MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_GET_INTERCOMM_CONTEXTID); if (!comm_ptr->local_comm) { /* Manufacture the local communicator */ mpi_errno = MPIR_Setup_intercomm_localcomm(comm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } mpi_errno = MPIR_Get_contextid_sparse(comm_ptr->local_comm, &mycontext_id, FALSE); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIU_Assert(mycontext_id != 0); /* MPIC routine uses an internal context id. The local leads (process 0) * exchange data */ remote_context_id = -1; if (comm_ptr->rank == 0) { mpi_errno = MPIC_Sendrecv(&mycontext_id, 1, MPIU_CONTEXT_ID_T_DATATYPE, 0, tag, &remote_context_id, 1, MPIU_CONTEXT_ID_T_DATATYPE, 0, tag, comm_ptr, MPI_STATUS_IGNORE, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } /* Make sure that all of the local processes now have this * id */ mpi_errno = MPIR_Bcast_impl(&remote_context_id, 1, MPIU_CONTEXT_ID_T_DATATYPE, 0, comm_ptr->local_comm, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); /* The recvcontext_id must be the one that was allocated out of the local * group, not the remote group. Otherwise we could end up posting two * MPI_ANY_SOURCE,MPI_ANY_TAG recvs on the same context IDs even though we * are attempting to post them for two separate communicators. */ *context_id = remote_context_id; *recvcontext_id = mycontext_id; fn_fail: MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_GET_INTERCOMM_CONTEXTID); return mpi_errno; }
static int barrier_smp_intra(MPID_Comm *comm_ptr, mpir_errflag_t *errflag) { int mpi_errno=MPI_SUCCESS; int mpi_errno_ret = MPI_SUCCESS; MPIU_Assert(MPIR_CVAR_ENABLE_SMP_COLLECTIVES && MPIR_CVAR_ENABLE_SMP_BARRIER && MPIR_Comm_is_node_aware(comm_ptr)); /* do the intranode barrier on all nodes */ if (comm_ptr->node_comm != NULL) { mpi_errno = MPIR_Barrier_impl(comm_ptr->node_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIU_ERR_SET(mpi_errno, *errflag, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } /* do the barrier across roots of all nodes */ if (comm_ptr->node_roots_comm != NULL) { mpi_errno = MPIR_Barrier_impl(comm_ptr->node_roots_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIU_ERR_SET(mpi_errno, *errflag, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } /* release the local processes on each node with a 1-byte broadcast (0-byte broadcast just returns without doing anything) */ if (comm_ptr->node_comm != NULL) { int i=0; mpi_errno = MPIR_Bcast_impl(&i, 1, MPI_BYTE, 0, comm_ptr->node_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIU_ERR_SET(mpi_errno, *errflag, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } fn_exit: if (mpi_errno_ret) mpi_errno = mpi_errno_ret; else if (*errflag != MPIR_ERR_NONE) MPIU_ERR_SET(mpi_errno, *errflag, "**coll_fail"); return mpi_errno; fn_fail: goto fn_exit; }
int MPIR_Bcast(void *buffer, int count, MPI_Datatype datatype, int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag) { int mpi_errno = MPI_SUCCESS; if (MPIR_CVAR_BCAST_DEVICE_COLLECTIVE && MPIR_CVAR_DEVICE_COLLECTIVES) { mpi_errno = MPID_Bcast(buffer, count, datatype, root, comm_ptr, errflag); } else { mpi_errno = MPIR_Bcast_impl(buffer, count, datatype, root, comm_ptr, errflag); } return mpi_errno; }
int MPIR_Allreduce_inter ( void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, int *errflag ) { /* Intercommunicator Allreduce. We first do an intercommunicator reduce to rank 0 on left group, then an intercommunicator reduce to rank 0 on right group, followed by local intracommunicator broadcasts in each group. We don't do local reduces first and then intercommunicator broadcasts because it would require allocation of a temporary buffer. */ int rank, mpi_errno, root; int mpi_errno_ret = MPI_SUCCESS; MPID_Comm *newcomm_ptr = NULL; rank = comm_ptr->rank; /* first do a reduce from right group to rank 0 in left group, then from left group to rank 0 in right group*/ if (comm_ptr->is_low_group) { /* reduce from right group to rank 0*/ root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL; mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } /* reduce to rank 0 of right group */ root = 0; mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } else { /* reduce to rank 0 of left group */ root = 0; mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } /* reduce from right group to rank 0 */ root = (rank == 0) ? MPI_ROOT : MPI_PROC_NULL; mpi_errno = MPIR_Reduce_inter(sendbuf, recvbuf, count, datatype, op, root, comm_ptr, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } /* Get the local intracommunicator */ if (!comm_ptr->local_comm) MPIR_Setup_intercomm_localcomm( comm_ptr ); newcomm_ptr = comm_ptr->local_comm; mpi_errno = MPIR_Bcast_impl(recvbuf, count, datatype, 0, newcomm_ptr, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } fn_exit: if (mpi_errno_ret) mpi_errno = mpi_errno_ret; else if (*errflag) MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**coll_fail"); return mpi_errno; fn_fail: goto fn_exit; }
int MPIR_Allreduce_intra ( void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, int *errflag ) { int is_homogeneous; #ifdef MPID_HAS_HETERO int rc; #endif int comm_size, rank, type_size; int mpi_errno = MPI_SUCCESS; int mpi_errno_ret = MPI_SUCCESS; int mask, dst, is_commutative, pof2, newrank, rem, newdst, i, send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps; MPI_Aint true_extent, true_lb, extent; void *tmp_buf; MPI_Comm comm; MPIU_CHKLMEM_DECL(3); /* check if multiple threads are calling this collective function */ MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr ); if (count == 0) goto fn_exit; comm = comm_ptr->handle; is_commutative = MPIR_Op_is_commutative(op); #if defined(USE_SMP_COLLECTIVES) /* is the op commutative? We do SMP optimizations only if it is. */ if (MPIR_Comm_is_node_aware(comm_ptr) && is_commutative) { /* on each node, do a reduce to the local root */ if (comm_ptr->node_comm != NULL) { /* take care of the MPI_IN_PLACE case. For reduce, MPI_IN_PLACE is specified only on the root; for allreduce it is specified on all processes. */ if ((sendbuf == MPI_IN_PLACE) && (comm_ptr->node_comm->rank != 0)) { /* IN_PLACE and not root of reduce. Data supplied to this allreduce is in recvbuf. Pass that as the sendbuf to reduce. */ mpi_errno = MPIR_Reduce_impl(recvbuf, NULL, count, datatype, op, 0, comm_ptr->node_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } else { mpi_errno = MPIR_Reduce_impl(sendbuf, recvbuf, count, datatype, op, 0, comm_ptr->node_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } } else { /* only one process on the node. copy sendbuf to recvbuf */ if (sendbuf != MPI_IN_PLACE) { mpi_errno = MPIR_Localcopy(sendbuf, count, datatype, recvbuf, count, datatype); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } } /* now do an IN_PLACE allreduce among the local roots of all nodes */ if (comm_ptr->node_roots_comm != NULL) { mpi_errno = allreduce_intra_or_coll_fn(MPI_IN_PLACE, recvbuf, count, datatype, op, comm_ptr->node_roots_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } /* now broadcast the result among local processes */ if (comm_ptr->node_comm != NULL) { mpi_errno = MPIR_Bcast_impl(recvbuf, count, datatype, 0, comm_ptr->node_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } goto fn_exit; } #endif is_homogeneous = 1; #ifdef MPID_HAS_HETERO if (comm_ptr->is_hetero) is_homogeneous = 0; #endif #ifdef MPID_HAS_HETERO if (!is_homogeneous) { /* heterogeneous. To get the same result on all processes, we do a reduce to 0 and then broadcast. */ mpi_errno = MPIR_Reduce_impl ( sendbuf, recvbuf, count, datatype, op, 0, comm_ptr, errflag ); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } mpi_errno = MPIR_Bcast_impl( recvbuf, count, datatype, 0, comm_ptr, errflag ); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } else #endif /* MPID_HAS_HETERO */ { /* homogeneous */ comm_size = comm_ptr->local_size; rank = comm_ptr->rank; is_commutative = MPIR_Op_is_commutative(op); /* need to allocate temporary buffer to store incoming data*/ MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent); MPID_Datatype_get_extent_macro(datatype, extent); MPID_Ensure_Aint_fits_in_pointer(count * MPIR_MAX(extent, true_extent)); MPIU_CHKLMEM_MALLOC(tmp_buf, void *, count*(MPIR_MAX(extent,true_extent)), mpi_errno, "temporary buffer"); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - true_lb); /* copy local data into recvbuf */ if (sendbuf != MPI_IN_PLACE) { mpi_errno = MPIR_Localcopy(sendbuf, count, datatype, recvbuf, count, datatype); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } MPID_Datatype_get_size_macro(datatype, type_size); /* find nearest power-of-two less than or equal to comm_size */ pof2 = 1; while (pof2 <= comm_size) pof2 <<= 1; pof2 >>=1; rem = comm_size - pof2; /* In the non-power-of-two case, all even-numbered processes of rank < 2*rem send their data to (rank+1). These even-numbered processes no longer participate in the algorithm until the very end. The remaining processes form a nice power-of-two. */ if (rank < 2*rem) { if (rank % 2 == 0) { /* even */ mpi_errno = MPIC_Send_ft(recvbuf, count, datatype, rank+1, MPIR_ALLREDUCE_TAG, comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } /* temporarily set the rank to -1 so that this process does not pariticipate in recursive doubling */ newrank = -1; } else { /* odd */ mpi_errno = MPIC_Recv_ft(tmp_buf, count, datatype, rank-1, MPIR_ALLREDUCE_TAG, comm, MPI_STATUS_IGNORE, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } /* do the reduction on received data. since the ordering is right, it doesn't matter whether the operation is commutative or not. */ mpi_errno = MPIR_Reduce_local_impl(tmp_buf, recvbuf, count, datatype, op); if (mpi_errno) MPIU_ERR_POP(mpi_errno); /* change the rank */ newrank = rank / 2; } } else /* rank >= 2*rem */ newrank = rank - rem; /* If op is user-defined or count is less than pof2, use recursive doubling algorithm. Otherwise do a reduce-scatter followed by allgather. (If op is user-defined, derived datatypes are allowed and the user could pass basic datatypes on one process and derived on another as long as the type maps are the same. Breaking up derived datatypes to do the reduce-scatter is tricky, therefore using recursive doubling in that case.) */ if (newrank != -1) { if ((count*type_size <= MPIR_PARAM_ALLREDUCE_SHORT_MSG_SIZE) || (HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN) || (count < pof2)) { /* use recursive doubling */ mask = 0x1; while (mask < pof2) { newdst = newrank ^ mask; /* find real rank of dest */ dst = (newdst < rem) ? newdst*2 + 1 : newdst + rem; /* Send the most current data, which is in recvbuf. Recv into tmp_buf */ mpi_errno = MPIC_Sendrecv_ft(recvbuf, count, datatype, dst, MPIR_ALLREDUCE_TAG, tmp_buf, count, datatype, dst, MPIR_ALLREDUCE_TAG, comm, MPI_STATUS_IGNORE, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } /* tmp_buf contains data received in this step. recvbuf contains data accumulated so far */ if (is_commutative || (dst < rank)) { /* op is commutative OR the order is already right */ mpi_errno = MPIR_Reduce_local_impl(tmp_buf, recvbuf, count, datatype, op); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } else { /* op is noncommutative and the order is not right */ mpi_errno = MPIR_Reduce_local_impl(recvbuf, tmp_buf, count, datatype, op); if (mpi_errno) MPIU_ERR_POP(mpi_errno); /* copy result back into recvbuf */ mpi_errno = MPIR_Localcopy(tmp_buf, count, datatype, recvbuf, count, datatype); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } mask <<= 1; } } else { /* do a reduce-scatter followed by allgather */ /* for the reduce-scatter, calculate the count that each process receives and the displacement within the buffer */ MPIU_CHKLMEM_MALLOC(cnts, int *, pof2*sizeof(int), mpi_errno, "counts"); MPIU_CHKLMEM_MALLOC(disps, int *, pof2*sizeof(int), mpi_errno, "displacements"); for (i=0; i<(pof2-1); i++) cnts[i] = count/pof2; cnts[pof2-1] = count - (count/pof2)*(pof2-1); disps[0] = 0; for (i=1; i<pof2; i++) disps[i] = disps[i-1] + cnts[i-1]; mask = 0x1; send_idx = recv_idx = 0; last_idx = pof2; while (mask < pof2) { newdst = newrank ^ mask; /* find real rank of dest */ dst = (newdst < rem) ? newdst*2 + 1 : newdst + rem; send_cnt = recv_cnt = 0; if (newrank < newdst) { send_idx = recv_idx + pof2/(mask*2); for (i=send_idx; i<last_idx; i++) send_cnt += cnts[i]; for (i=recv_idx; i<send_idx; i++) recv_cnt += cnts[i]; } else { recv_idx = send_idx + pof2/(mask*2); for (i=send_idx; i<recv_idx; i++) send_cnt += cnts[i]; for (i=recv_idx; i<last_idx; i++) recv_cnt += cnts[i]; } /* printf("Rank %d, send_idx %d, recv_idx %d, send_cnt %d, recv_cnt %d, last_idx %d\n", newrank, send_idx, recv_idx, send_cnt, recv_cnt, last_idx); */ /* Send data from recvbuf. Recv into tmp_buf */ mpi_errno = MPIC_Sendrecv_ft((char *) recvbuf + disps[send_idx]*extent, send_cnt, datatype, dst, MPIR_ALLREDUCE_TAG, (char *) tmp_buf + disps[recv_idx]*extent, recv_cnt, datatype, dst, MPIR_ALLREDUCE_TAG, comm, MPI_STATUS_IGNORE, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } /* tmp_buf contains data received in this step. recvbuf contains data accumulated so far */ /* This algorithm is used only for predefined ops and predefined ops are always commutative. */ mpi_errno = MPIR_Reduce_local_impl(((char *) tmp_buf + disps[recv_idx]*extent), ((char *) recvbuf + disps[recv_idx]*extent), recv_cnt, datatype, op); if (mpi_errno) MPIU_ERR_POP(mpi_errno); /* update send_idx for next iteration */ send_idx = recv_idx; mask <<= 1; /* update last_idx, but not in last iteration because the value is needed in the allgather step below. */ if (mask < pof2) last_idx = recv_idx + pof2/mask; } /* now do the allgather */ mask >>= 1; while (mask > 0) { newdst = newrank ^ mask; /* find real rank of dest */ dst = (newdst < rem) ? newdst*2 + 1 : newdst + rem; send_cnt = recv_cnt = 0; if (newrank < newdst) { /* update last_idx except on first iteration */ if (mask != pof2/2) last_idx = last_idx + pof2/(mask*2); recv_idx = send_idx + pof2/(mask*2); for (i=send_idx; i<recv_idx; i++) send_cnt += cnts[i]; for (i=recv_idx; i<last_idx; i++) recv_cnt += cnts[i]; } else { recv_idx = send_idx - pof2/(mask*2); for (i=send_idx; i<last_idx; i++) send_cnt += cnts[i]; for (i=recv_idx; i<send_idx; i++) recv_cnt += cnts[i]; } mpi_errno = MPIC_Sendrecv_ft((char *) recvbuf + disps[send_idx]*extent, send_cnt, datatype, dst, MPIR_ALLREDUCE_TAG, (char *) recvbuf + disps[recv_idx]*extent, recv_cnt, datatype, dst, MPIR_ALLREDUCE_TAG, comm, MPI_STATUS_IGNORE, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } if (newrank > newdst) send_idx = recv_idx; mask >>= 1; } } } /* In the non-power-of-two case, all odd-numbered processes of rank < 2*rem send the result to (rank-1), the ranks who didn't participate above. */ if (rank < 2*rem) { if (rank % 2) /* odd */ mpi_errno = MPIC_Send_ft(recvbuf, count, datatype, rank-1, MPIR_ALLREDUCE_TAG, comm, errflag); else /* even */ mpi_errno = MPIC_Recv_ft(recvbuf, count, datatype, rank+1, MPIR_ALLREDUCE_TAG, comm, MPI_STATUS_IGNORE, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } } fn_exit: /* check if multiple threads are calling this collective function */ MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr ); MPIU_CHKLMEM_FREEALL(); if (mpi_errno_ret) mpi_errno = mpi_errno_ret; return (mpi_errno); fn_fail: goto fn_exit; }
int MPIDI_Comm_spawn_multiple(int count, char **commands, char ***argvs, const int *maxprocs, MPIR_Info **info_ptrs, int root, MPIR_Comm *comm_ptr, MPIR_Comm **intercomm, int *errcodes) { char port_name[MPI_MAX_PORT_NAME]; int *info_keyval_sizes=0, i, mpi_errno=MPI_SUCCESS; PMI_keyval_t **info_keyval_vectors=0, preput_keyval_vector; int *pmi_errcodes = 0, pmi_errno; int total_num_processes, should_accept = 1; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_COMM_SPAWN_MULTIPLE); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_COMM_SPAWN_MULTIPLE); if (comm_ptr->rank == root) { /* create an array for the pmi error codes */ total_num_processes = 0; for (i=0; i<count; i++) { total_num_processes += maxprocs[i]; } pmi_errcodes = (int*)MPL_malloc(sizeof(int) * total_num_processes); if (pmi_errcodes == NULL) { MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomem"); } /* initialize them to 0 */ for (i=0; i<total_num_processes; i++) pmi_errcodes[i] = 0; /* Open a port for the spawned processes to connect to */ /* FIXME: info may be needed for port name */ mpi_errno = MPID_Open_port(NULL, port_name); /* --BEGIN ERROR HANDLING-- */ if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* --END ERROR HANDLING-- */ /* Spawn the processes */ #ifdef USE_PMI2_API MPIR_Assert(count > 0); { int *argcs = MPL_malloc(count*sizeof(int)); struct MPIR_Info preput; struct MPIR_Info *preput_p[1] = { &preput }; MPIR_Assert(argcs); /* info_keyval_sizes = MPL_malloc(count * sizeof(int)); */ /* FIXME cheating on constness */ preput.key = (char *)PARENT_PORT_KVSKEY; preput.value = port_name; preput.next = NULL; /* compute argcs array */ for (i = 0; i < count; ++i) { argcs[i] = 0; if (argvs != NULL && argvs[i] != NULL) { while (argvs[i][argcs[i]]) { ++argcs[i]; } } /* a fib for now */ /* info_keyval_sizes[i] = 0; */ } /* XXX DJG don't need this, PMI API is thread-safe? */ /*MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_PMI_MUTEX);*/ /* release the global CS for spawn PMI calls */ MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); pmi_errno = PMI2_Job_Spawn(count, (const char **)commands, argcs, (const char ***)argvs, maxprocs, info_keyval_sizes, (const MPIR_Info **)info_ptrs, 1, (const struct MPIR_Info **)preput_p, NULL, 0, /*jobId, jobIdSize,*/ /* XXX DJG job stuff? */ pmi_errcodes); MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); /*MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_PMI_MUTEX);*/ MPL_free(argcs); if (pmi_errno != PMI2_SUCCESS) { MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**pmi_spawn_multiple", "**pmi_spawn_multiple %d", pmi_errno); } } #else /* FIXME: This is *really* awkward. We should either Fix on MPI-style info data structures for PMI (avoid unnecessary duplication) or add an MPIU_Info_getall(...) that creates the necessary arrays of key/value pairs */ /* convert the infos into PMI keyvals */ info_keyval_sizes = (int *) MPL_malloc(count * sizeof(int)); info_keyval_vectors = (PMI_keyval_t**) MPL_malloc(count * sizeof(PMI_keyval_t*)); if (!info_keyval_sizes || !info_keyval_vectors) { MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomem"); } if (!info_ptrs) { for (i=0; i<count; i++) { info_keyval_vectors[i] = 0; info_keyval_sizes[i] = 0; } } else { for (i=0; i<count; i++) { mpi_errno = mpi_to_pmi_keyvals( info_ptrs[i], &info_keyval_vectors[i], &info_keyval_sizes[i] ); if (mpi_errno) { MPIR_ERR_POP(mpi_errno); } } } preput_keyval_vector.key = PARENT_PORT_KVSKEY; preput_keyval_vector.val = port_name; MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_PMI_MUTEX); pmi_errno = PMI_Spawn_multiple(count, (const char **) commands, (const char ***) argvs, maxprocs, info_keyval_sizes, (const PMI_keyval_t **) info_keyval_vectors, 1, &preput_keyval_vector, pmi_errcodes); MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_PMI_MUTEX); if (pmi_errno != PMI_SUCCESS) { MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**pmi_spawn_multiple", "**pmi_spawn_multiple %d", pmi_errno); } #endif if (errcodes != MPI_ERRCODES_IGNORE) { for (i=0; i<total_num_processes; i++) { /* FIXME: translate the pmi error codes here */ errcodes[i] = pmi_errcodes[i]; /* We want to accept if any of the spawns succeeded. Alternatively, this is the same as we want to NOT accept if all of them failed. should_accept = NAND(e_0, ..., e_n) Remember, success equals false (0). */ should_accept = should_accept && errcodes[i]; } should_accept = !should_accept; /* the `N' in NAND */ } } if (errcodes != MPI_ERRCODES_IGNORE) { MPIR_Errflag_t errflag = MPIR_ERR_NONE; mpi_errno = MPIR_Bcast_impl(&should_accept, 1, MPI_INT, root, comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Bcast_impl(&total_num_processes, 1, MPI_INT, root, comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Bcast_impl(errcodes, total_num_processes, MPI_INT, root, comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); } if (should_accept) { mpi_errno = MPID_Comm_accept(port_name, NULL, root, comm_ptr, intercomm); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else { MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**pmi_spawn_multiple"); } if (comm_ptr->rank == root) { /* Close the port opened for the spawned processes to connect to */ mpi_errno = MPID_Close_port(port_name); /* --BEGIN ERROR HANDLING-- */ if (mpi_errno != MPI_SUCCESS) { MPIR_ERR_POP(mpi_errno); } /* --END ERROR HANDLING-- */ } fn_exit: if (info_keyval_vectors) { free_pmi_keyvals(info_keyval_vectors, count, info_keyval_sizes); MPL_free(info_keyval_sizes); MPL_free(info_keyval_vectors); } if (pmi_errcodes) { MPL_free(pmi_errcodes); } MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_COMM_SPAWN_MULTIPLE); return mpi_errno; fn_fail: goto fn_exit; }
static int barrier_smp_intra(MPID_Comm *comm_ptr, MPIR_Errflag_t *errflag) { int mpi_errno=MPI_SUCCESS; int mpi_errno_ret = MPI_SUCCESS; MPIU_Assert(MPIR_CVAR_ENABLE_SMP_COLLECTIVES && MPIR_CVAR_ENABLE_SMP_BARRIER && MPIR_Comm_is_node_aware(comm_ptr)); #if defined(FINEGRAIN_MPI) int colocated_size = -1; int colocated_sense = -1; /* do barrier on osproc_colocated_comm */ if (comm_ptr->osproc_colocated_comm != NULL) { colocated_size = comm_ptr->osproc_colocated_comm->local_size; MPIU_Assert( (comm_ptr->osproc_colocated_comm->co_shared_vars != NULL) && (comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars != NULL) ); MPIU_Assert(colocated_size > 1 ); colocated_sense = comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_signal; if( comm_ptr->osproc_colocated_comm->rank != 0 ) { /* non-leader */ (comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_counter)++; if (comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_counter == (colocated_size-1)){ /* excluding the leader */ comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->leader_signal = 1; } while(comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_signal == colocated_sense) { FG_Yield(); } } else { /* leader */ while(comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->leader_signal == 0) { FG_Yield(); } } #if 0 /* Non-optimized version */ mpi_errno = MPIR_Barrier_impl(comm_ptr->osproc_colocated_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIU_ERR_SET(mpi_errno, *errflag, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } #endif } #endif /* do the intranode barrier on all nodes */ if (comm_ptr->node_comm != NULL) { mpi_errno = MPIR_Barrier_impl(comm_ptr->node_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } } /* do the barrier across roots of all nodes */ if (comm_ptr->node_roots_comm != NULL) { mpi_errno = MPIR_Barrier_impl(comm_ptr->node_roots_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } } /* release the local processes on each node with a 1-byte broadcast (0-byte broadcast just returns without doing anything) */ if (comm_ptr->node_comm != NULL) { int i=0; mpi_errno = MPIR_Bcast_impl(&i, 1, MPI_BYTE, 0, comm_ptr->node_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } } #if defined(FINEGRAIN_MPI) if (comm_ptr->osproc_colocated_comm != NULL) { if (comm_ptr->osproc_colocated_comm->rank == 0) { /* leader */ comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->leader_signal = 0; comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_counter = 0; comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_signal = 1 - comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_signal; } #if 0 /* Non-optimized version */ /* release the colocated processes in each OS-process with a 1-byte broadcast (0-byte broadcast just returns without doing anything) */ int i=0; mpi_errno = MPIR_Bcast_impl(&i, 1, MPI_BYTE, 0, comm_ptr->osproc_colocated_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIU_ERR_SET(mpi_errno, *errflag, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } #endif } #endif fn_exit: if (mpi_errno_ret) mpi_errno = mpi_errno_ret; else if (*errflag != MPIR_ERR_NONE) MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail"); return mpi_errno; fn_fail: goto fn_exit; }
int MPIR_Intercomm_merge_impl(MPID_Comm *comm_ptr, int high, MPID_Comm **new_intracomm_ptr) { int mpi_errno = MPI_SUCCESS; int local_high, remote_high, new_size; MPIU_Context_id_t new_context_id; MPIR_Errflag_t errflag = MPIR_ERR_NONE; MPID_MPI_STATE_DECL(MPID_STATE_MPIR_INTERCOMM_MERGE_IMPL); MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_INTERCOMM_MERGE_IMPL); /* Make sure that we have a local intercommunicator */ if (!comm_ptr->local_comm) { /* Manufacture the local communicator */ mpi_errno = MPIR_Setup_intercomm_localcomm( comm_ptr ); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } /* Find the "high" value of the other group of processes. This will be used to determine which group is ordered first in the generated communicator. high is logical */ local_high = high; if (comm_ptr->rank == 0) { /* This routine allows use to use the collective communication context rather than the point-to-point context. */ mpi_errno = MPIC_Sendrecv( &local_high, 1, MPI_INT, 0, 0, &remote_high, 1, MPI_INT, 0, 0, comm_ptr, MPI_STATUS_IGNORE, &errflag ); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* If local_high and remote_high are the same, then order is arbitrary. we use the gpids of the rank 0 member of the local and remote groups to choose an order in this case. */ if (local_high == remote_high) { MPID_Gpid ingpid, outgpid; mpi_errno = MPID_GPID_Get( comm_ptr, 0, &ingpid ); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIC_Sendrecv( &ingpid, sizeof(MPID_Gpid), MPI_BYTE, 0, 1, &outgpid, sizeof(MPID_Gpid), MPI_BYTE, 0, 1, comm_ptr, MPI_STATUS_IGNORE, &errflag ); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Note that the gpids cannot be the same because we are starting from a valid intercomm */ int rc = memcmp(&ingpid,&outgpid,sizeof(MPID_Gpid)); if(rc < 0) local_high = 1; else if(rc > 0) local_high = 0; else { /* req#3930: The merge algorithm will deadlock if the gpids are inadvertently the same due to implementation bugs in the MPID_GPID_Get() function */ MPIU_Assert(rc != 0); } } } /* All processes in the local group now need to get the value of local_high, which may have changed if both groups of processes had the same value for high */ mpi_errno = MPIR_Bcast_impl( &local_high, 1, MPI_INT, 0, comm_ptr->local_comm, &errflag ); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); mpi_errno = MPIR_Comm_create( new_intracomm_ptr ); if (mpi_errno) MPIR_ERR_POP(mpi_errno); new_size = comm_ptr->local_size + comm_ptr->remote_size; /* FIXME: For the intracomm, we need a consistent context id. That means that one of the two groups needs to use the recvcontext_id and the other must use the context_id */ if (local_high) { (*new_intracomm_ptr)->context_id = comm_ptr->recvcontext_id + 2; /* See below */ } else { (*new_intracomm_ptr)->context_id = comm_ptr->context_id + 2; /* See below */ } (*new_intracomm_ptr)->recvcontext_id = (*new_intracomm_ptr)->context_id; (*new_intracomm_ptr)->remote_size = (*new_intracomm_ptr)->local_size = new_size; (*new_intracomm_ptr)->rank = -1; (*new_intracomm_ptr)->comm_kind = MPID_INTRACOMM; /* Now we know which group comes first. Build the new mapping from the existing comm */ mpi_errno = create_and_map(comm_ptr, local_high, (*new_intracomm_ptr)); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* We've setup a temporary context id, based on the context id used by the intercomm. This allows us to perform the allreduce operations within the context id algorithm, since we already have a valid (almost - see comm_create_hook) communicator. */ mpi_errno = MPIR_Comm_commit((*new_intracomm_ptr)); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* printf( "About to get context id \n" ); fflush( stdout ); */ /* In the multi-threaded case, MPIR_Get_contextid_sparse assumes that the calling routine already holds the single criticial section */ new_context_id = 0; mpi_errno = MPIR_Get_contextid_sparse( (*new_intracomm_ptr), &new_context_id, FALSE ); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIU_Assert(new_context_id != 0); /* We release this communicator that was involved just to * get valid context id and create true one */ mpi_errno = MPIR_Comm_release(*new_intracomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Comm_create( new_intracomm_ptr ); if (mpi_errno) MPIR_ERR_POP(mpi_errno); (*new_intracomm_ptr)->remote_size = (*new_intracomm_ptr)->local_size = new_size; (*new_intracomm_ptr)->rank = -1; (*new_intracomm_ptr)->comm_kind = MPID_INTRACOMM; (*new_intracomm_ptr)->context_id = new_context_id; (*new_intracomm_ptr)->recvcontext_id = new_context_id; mpi_errno = create_and_map(comm_ptr, local_high, (*new_intracomm_ptr)); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Comm_commit((*new_intracomm_ptr)); if (mpi_errno) MPIR_ERR_POP(mpi_errno); fn_exit: MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_INTERCOMM_MERGE_IMPL); return mpi_errno; fn_fail: goto fn_exit; }
int MPIR_Barrier_impl(MPID_Comm *comm_ptr, int *errflag) { int mpi_errno = MPI_SUCCESS; int mpi_errno_ret = MPI_SUCCESS; if (comm_ptr->coll_fns != NULL && comm_ptr->coll_fns->Barrier != NULL) { mpi_errno = comm_ptr->coll_fns->Barrier(comm_ptr, errflag); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } else { if (comm_ptr->comm_kind == MPID_INTRACOMM) { #if defined(USE_SMP_COLLECTIVES) if (MPIR_Comm_is_node_aware(comm_ptr)) { /* do the intranode barrier on all nodes */ if (comm_ptr->node_comm != NULL) { mpi_errno = MPIR_Barrier_or_coll_fn(comm_ptr->node_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } /* do the barrier across roots of all nodes */ if (comm_ptr->node_roots_comm != NULL) { mpi_errno = MPIR_Barrier_or_coll_fn(comm_ptr->node_roots_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } /* release the local processes on each node with a 1-byte broadcast (0-byte broadcast just returns without doing anything) */ if (comm_ptr->node_comm != NULL) { int i=0; mpi_errno = MPIR_Bcast_impl(&i, 1, MPI_BYTE, 0, comm_ptr->node_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } } else { mpi_errno = MPIR_Barrier_intra( comm_ptr, errflag ); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } #else mpi_errno = MPIR_Barrier_intra( comm_ptr, errflag ); if (mpi_errno) MPIU_ERR_POP(mpi_errno); #endif } else { /* intercommunicator */ mpi_errno = MPIR_Barrier_inter( comm_ptr, errflag ); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } } fn_exit: if (mpi_errno_ret) mpi_errno = mpi_errno_ret; else if (*errflag) MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**coll_fail"); return mpi_errno; fn_fail: goto fn_exit; }
int create_2level_comm (MPI_Comm comm, int size, int my_rank) { static const char FCNAME[] = "create_2level_comm"; int mpi_errno = MPI_SUCCESS; MPID_Comm* comm_ptr; MPID_Comm* comm_world_ptr; MPI_Group subgroup1, comm_group; MPID_Group *group_ptr=NULL; int leader_comm_size, my_local_size, my_local_id, input_flag =0, output_flag=0; int errflag = FALSE; int leader_group_size=0; MPIU_THREADPRIV_DECL; MPIU_THREADPRIV_GET; MPID_Comm_get_ptr( comm, comm_ptr ); MPID_Comm_get_ptr( MPI_COMM_WORLD, comm_world_ptr ); int* shmem_group = MPIU_Malloc(sizeof(int) * size); if (NULL == shmem_group){ printf("Couldn't malloc shmem_group\n"); ibv_error_abort (GEN_EXIT_ERR, "create_2level_com"); } /* Creating local shmem group */ int i = 0; int local_rank = 0; int grp_index = 0; comm_ptr->ch.leader_comm=MPI_COMM_NULL; comm_ptr->ch.shmem_comm=MPI_COMM_NULL; MPIDI_VC_t* vc = NULL; for (; i < size ; ++i){ MPIDI_Comm_get_vc(comm_ptr, i, &vc); if (my_rank == i || vc->smp.local_rank >= 0){ shmem_group[grp_index] = i; if (my_rank == i){ local_rank = grp_index; } ++grp_index; } } /* Creating leader group */ int leader = 0; leader = shmem_group[0]; /* Gives the mapping to any process's leader in comm */ comm_ptr->ch.leader_map = MPIU_Malloc(sizeof(int) * size); if (NULL == comm_ptr->ch.leader_map){ printf("Couldn't malloc group\n"); ibv_error_abort (GEN_EXIT_ERR, "create_2level_com"); } mpi_errno = MPIR_Allgather_impl (&leader, 1, MPI_INT , comm_ptr->ch.leader_map, 1, MPI_INT, comm_ptr, &errflag); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } int* leader_group = MPIU_Malloc(sizeof(int) * size); if (NULL == leader_group){ printf("Couldn't malloc leader_group\n"); ibv_error_abort (GEN_EXIT_ERR, "create_2level_com"); } /* Gives the mapping from leader's rank in comm to * leader's rank in leader_comm */ comm_ptr->ch.leader_rank = MPIU_Malloc(sizeof(int) * size); if (NULL == comm_ptr->ch.leader_rank){ printf("Couldn't malloc marker\n"); ibv_error_abort (GEN_EXIT_ERR, "create_2level_com"); } for (i=0; i < size ; ++i){ comm_ptr->ch.leader_rank[i] = -1; } int* group = comm_ptr->ch.leader_map; grp_index = 0; for (i=0; i < size ; ++i){ if (comm_ptr->ch.leader_rank[(group[i])] == -1){ comm_ptr->ch.leader_rank[(group[i])] = grp_index; leader_group[grp_index++] = group[i]; } } leader_group_size = grp_index; comm_ptr->ch.leader_group_size = leader_group_size; mpi_errno = PMPI_Comm_group(comm, &comm_group); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } mpi_errno = PMPI_Group_incl(comm_group, leader_group_size, leader_group, &subgroup1); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } mpi_errno = PMPI_Comm_create(comm, subgroup1, &(comm_ptr->ch.leader_comm)); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } MPID_Comm *leader_ptr; MPID_Comm_get_ptr( comm_ptr->ch.leader_comm, leader_ptr ); MPIU_Free(leader_group); MPID_Group_get_ptr( subgroup1, group_ptr ); if(group_ptr != NULL) { mpi_errno = PMPI_Group_free(&subgroup1); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } } mpi_errno = PMPI_Comm_split(comm, leader, local_rank, &(comm_ptr->ch.shmem_comm)); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } MPID_Comm *shmem_ptr; MPID_Comm_get_ptr(comm_ptr->ch.shmem_comm, shmem_ptr); mpi_errno = PMPI_Comm_rank(comm_ptr->ch.shmem_comm, &my_local_id); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } mpi_errno = PMPI_Comm_size(comm_ptr->ch.shmem_comm, &my_local_size); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } if(my_local_id == 0) { int array_index=0; mpi_errno = PMPI_Comm_size(comm_ptr->ch.leader_comm, &leader_comm_size); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } comm_ptr->ch.node_sizes = MPIU_Malloc(sizeof(int)*leader_comm_size); mpi_errno = PMPI_Allgather(&my_local_size, 1, MPI_INT, comm_ptr->ch.node_sizes, 1, MPI_INT, comm_ptr->ch.leader_comm); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } comm_ptr->ch.is_uniform = 1; for(array_index=0; array_index < leader_comm_size; array_index++) { if(comm_ptr->ch.node_sizes[0] != comm_ptr->ch.node_sizes[array_index]) { comm_ptr->ch.is_uniform = 0; break; } } } comm_ptr->ch.is_global_block = 0; /* We need to check to see if the ranks are block or not. Each node leader * gets the global ranks of all of its children processes. It scans through * this array to see if the ranks are in block order. The node-leaders then * do an allreduce to see if all the other nodes are also in block order. * This is followed by an intra-node bcast to let the children processes * know of the result of this step */ if(my_local_id == 0) { int is_local_block = 1; int index = 1; while( index < my_local_size) { if( (shmem_group[index] - 1) != shmem_group[index - 1]) { is_local_block = 0; break; } index++; } comm_ptr->ch.shmem_coll_ok = 0;/* To prevent Allreduce taking shmem route*/ mpi_errno = MPIR_Allreduce_impl(&(is_local_block), &(comm_ptr->ch.is_global_block), 1, MPI_INT, MPI_LAND, leader_ptr, &errflag); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } mpi_errno = MPIR_Bcast_impl(&(comm_ptr->ch.is_global_block),1, MPI_INT, 0, shmem_ptr, &errflag); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } } else { mpi_errno = MPIR_Bcast_impl(&(comm_ptr->ch.is_global_block),1, MPI_INT, 0, shmem_ptr, &errflag); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } } if (my_local_id == 0){ lock_shmem_region(); increment_shmem_comm_count(); shmem_comm_count = get_shmem_comm_count(); unlock_shmem_region(); } shmem_ptr->ch.shmem_coll_ok = 0; /* To prevent Bcast taking the knomial_2level_bcast route */ mpi_errno = MPIR_Bcast_impl (&shmem_comm_count, 1, MPI_INT, 0, shmem_ptr, &errflag); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } if (shmem_comm_count <= g_shmem_coll_blocks){ shmem_ptr->ch.shmem_comm_rank = shmem_comm_count-1; input_flag = 1; } else{ input_flag = 0; } comm_ptr->ch.shmem_coll_ok = 0;/* To prevent Allreduce taking shmem route*/ mpi_errno = MPIR_Allreduce_impl(&input_flag, &output_flag, 1, MPI_INT, MPI_LAND, comm_ptr, &errflag); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } comm_ptr->ch.allgather_comm_ok = 0; if (allgather_ranking){ int is_contig =1, check_leader =1, check_size=1, is_local_ok=0,is_block=0; int PPN; int shmem_grp_size = my_local_size; int leader_rank; MPI_Group allgather_group; comm_ptr->ch.allgather_comm=MPI_COMM_NULL; comm_ptr->ch.allgather_new_ranks=NULL; if(comm_ptr->ch.leader_comm != MPI_COMM_NULL) { PMPI_Comm_rank(comm_ptr->ch.leader_comm, &leader_rank); } mpi_errno=MPIR_Bcast_impl(&leader_rank, 1, MPI_INT, 0, shmem_ptr, &errflag); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } for (i=1; i < shmem_grp_size; i++ ){ if (shmem_group[i] != shmem_group[i-1]+1){ is_contig =0; break; } } if (leader != (shmem_grp_size*leader_rank)){ check_leader=0; } if (shmem_grp_size != (size/leader_group_size)){ check_size=0; } is_local_ok = is_contig && check_leader && check_size; mpi_errno = MPIR_Allreduce_impl(&is_local_ok, &is_block, 1, MPI_INT, MPI_LAND, comm_ptr, &errflag); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } if (is_block){ int counter=0,j; comm_ptr->ch.allgather_new_ranks = MPIU_Malloc(sizeof(int)*size); if (NULL == comm_ptr->ch.allgather_new_ranks){ mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0 ); return mpi_errno; } PPN = shmem_grp_size; for (j=0; j < PPN; j++){ for (i=0; i < leader_group_size; i++){ comm_ptr->ch.allgather_new_ranks[counter] = j + i*PPN; counter++; } } mpi_errno = PMPI_Group_incl(comm_group, size, comm_ptr->ch.allgather_new_ranks, &allgather_group); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } mpi_errno = PMPI_Comm_create(comm_ptr->handle, allgather_group, &(comm_ptr->ch.allgather_comm)); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } comm_ptr->ch.allgather_comm_ok = 1; mpi_errno=PMPI_Group_free(&allgather_group); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } } } mpi_errno=PMPI_Group_free(&comm_group); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } if (output_flag == 1){ comm_ptr->ch.shmem_coll_ok = 1; comm_registry[comm_registered++] = comm_ptr->context_id; } else{ comm_ptr->ch.shmem_coll_ok = 0; MPID_Group_get_ptr( subgroup1, group_ptr ); if(group_ptr != NULL) { mpi_errno = PMPI_Group_free(&subgroup1); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } } MPID_Group_get_ptr( comm_group, group_ptr ); if(group_ptr != NULL) { mpi_errno = PMPI_Group_free(&comm_group); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } } free_2level_comm(comm_ptr); comm_ptr->ch.shmem_comm = MPI_COMM_NULL; comm_ptr->ch.leader_comm = MPI_COMM_NULL; } ++comm_count; MPIU_Free(shmem_group); fn_fail: MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr ); return (mpi_errno); }
/* comm create impl for intercommunicators, assumes that the standard error * checking has already taken place in the calling function */ PMPI_LOCAL int MPIR_Comm_create_inter(MPID_Comm *comm_ptr, MPID_Group *group_ptr, MPID_Comm **newcomm_ptr) { int mpi_errno = MPI_SUCCESS; MPIU_Context_id_t new_context_id; int *mapping = NULL; int *remote_mapping = NULL; MPID_Comm *mapping_comm = NULL; int remote_size = -1; int rinfo[2]; MPIR_Errflag_t errflag = MPIR_ERR_NONE; MPIU_CHKLMEM_DECL(1); MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_CREATE_INTER); MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_CREATE_INTER); MPIU_Assert(comm_ptr->comm_kind == MPID_INTERCOMM); /* Create a new communicator from the specified group members */ /* If there is a context id cache in oldcomm, use it here. Otherwise, use the appropriate algorithm to get a new context id. Creating the context id is collective over the *input* communicator, so it must be created before we decide if this process is a member of the group */ /* In the multi-threaded case, MPIR_Get_contextid_sparse assumes that the calling routine already holds the single criticial section */ if (!comm_ptr->local_comm) { MPIR_Setup_intercomm_localcomm( comm_ptr ); } mpi_errno = MPIR_Get_contextid_sparse( comm_ptr->local_comm, &new_context_id, FALSE ); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIU_Assert(new_context_id != 0); MPIU_Assert(new_context_id != comm_ptr->recvcontext_id); mpi_errno = MPIR_Comm_create_calculate_mapping(group_ptr, comm_ptr, &mapping, &mapping_comm); if (mpi_errno) MPIR_ERR_POP(mpi_errno); *newcomm_ptr = NULL; if (group_ptr->rank != MPI_UNDEFINED) { /* Get the new communicator structure and context id */ mpi_errno = MPIR_Comm_create( newcomm_ptr ); if (mpi_errno) goto fn_fail; (*newcomm_ptr)->recvcontext_id = new_context_id; (*newcomm_ptr)->rank = group_ptr->rank; (*newcomm_ptr)->comm_kind = comm_ptr->comm_kind; /* Since the group has been provided, let the new communicator know about the group */ (*newcomm_ptr)->local_comm = 0; (*newcomm_ptr)->local_group = group_ptr; MPIR_Group_add_ref( group_ptr ); (*newcomm_ptr)->local_size = group_ptr->size; (*newcomm_ptr)->remote_group = 0; (*newcomm_ptr)->is_low_group = comm_ptr->is_low_group; } /* There is an additional step. We must communicate the information on the local context id and the group members, given by the ranks so that the remote process can construct the appropriate network address mapping. First we exchange group sizes and context ids. Then the ranks in the remote group, from which the remote network address mapping can be constructed. We need to use the "collective" context in the original intercommunicator */ if (comm_ptr->rank == 0) { int info[2]; info[0] = new_context_id; info[1] = group_ptr->size; mpi_errno = MPIC_Sendrecv(info, 2, MPI_INT, 0, 0, rinfo, 2, MPI_INT, 0, 0, comm_ptr, MPI_STATUS_IGNORE, &errflag ); if (mpi_errno) { MPIR_ERR_POP( mpi_errno ); } if (*newcomm_ptr != NULL) { (*newcomm_ptr)->context_id = rinfo[0]; } remote_size = rinfo[1]; MPIU_CHKLMEM_MALLOC(remote_mapping,int*, remote_size*sizeof(int), mpi_errno,"remote_mapping"); /* Populate and exchange the ranks */ mpi_errno = MPIC_Sendrecv( mapping, group_ptr->size, MPI_INT, 0, 0, remote_mapping, remote_size, MPI_INT, 0, 0, comm_ptr, MPI_STATUS_IGNORE, &errflag ); if (mpi_errno) { MPIR_ERR_POP( mpi_errno ); } /* Broadcast to the other members of the local group */ mpi_errno = MPIR_Bcast_impl( rinfo, 2, MPI_INT, 0, comm_ptr->local_comm, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Bcast_impl( remote_mapping, remote_size, MPI_INT, 0, comm_ptr->local_comm, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); }
static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPIR_Info * info, MPIR_Comm * comm_ptr, void *base_ptr, MPIR_Win ** win_ptr) { int mpi_errno = MPI_SUCCESS; void **base_pp = (void **) base_ptr; int i, node_size, node_rank; MPIR_Comm *node_comm_ptr; MPI_Aint *node_sizes; MPIR_Errflag_t errflag = MPIR_ERR_NONE; int noncontig = FALSE; MPIR_CHKPMEM_DECL(1); MPIR_CHKLMEM_DECL(1); MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM); MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM); if ((*win_ptr)->comm_ptr->node_comm == NULL) { mpi_errno = MPIDI_CH3U_Win_allocate_no_shm(size, disp_unit, info, comm_ptr, base_ptr, win_ptr); goto fn_exit; } /* see if we can allocate all windows contiguously */ noncontig = (*win_ptr)->info_args.alloc_shared_noncontig; (*win_ptr)->shm_allocated = TRUE; /* When allocating shared memory region segment, we need comm of processes * that are on the same node as this process (node_comm). * If node_comm == NULL, this process is the only one on this node, therefore * we use comm_self as node comm. */ node_comm_ptr = (*win_ptr)->comm_ptr->node_comm; MPIR_Assert(node_comm_ptr != NULL); node_size = node_comm_ptr->local_size; node_rank = node_comm_ptr->rank; MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather); /* allocate memory for the base addresses, disp_units, and * completion counters of all processes */ MPIR_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **, node_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs"); /* get the sizes of the windows and window objectsof * all processes. allocate temp. buffer for communication */ MPIR_CHKLMEM_MALLOC(node_sizes, MPI_Aint *, node_size * sizeof(MPI_Aint), mpi_errno, "node_sizes"); /* FIXME: This needs to be fixed for heterogeneous systems */ node_sizes[node_rank] = (MPI_Aint) size; mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, node_sizes, sizeof(MPI_Aint), MPI_BYTE, node_comm_ptr, &errflag); MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); (*win_ptr)->shm_segment_len = 0; for (i = 0; i < node_size; i++) { if (noncontig) /* Round up to next page size */ (*win_ptr)->shm_segment_len += MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[i]); else (*win_ptr)->shm_segment_len += node_sizes[i]; } if ((*win_ptr)->shm_segment_len == 0) { (*win_ptr)->base = NULL; } else { mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->shm_segment_handle); if (mpi_errno) MPIR_ERR_POP(mpi_errno); if (node_rank == 0) { char *serialized_hnd_ptr = NULL; /* create shared memory region for all processes in win and map */ mpi_errno = MPL_shm_seg_create_and_attach((*win_ptr)->shm_segment_handle, (*win_ptr)->shm_segment_len, (char **) &(*win_ptr)->shm_base_addr, 0); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* serialize handle and broadcast it to the other processes in win */ mpi_errno = MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->shm_segment_handle, &serialized_hnd_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); /* wait for other processes to attach to win */ mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); /* unlink shared memory region so it gets deleted when all processes exit */ mpi_errno = MPL_shm_seg_remove((*win_ptr)->shm_segment_handle); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else { char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 }; /* get serialized handle from rank 0 and deserialize it */ mpi_errno = MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); mpi_errno = MPL_shm_hnd_deserialize((*win_ptr)->shm_segment_handle, serialized_hnd, strlen(serialized_hnd)); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* attach to shared memory region created by rank 0 */ mpi_errno = MPL_shm_seg_attach((*win_ptr)->shm_segment_handle, (*win_ptr)->shm_segment_len, (char **) &(*win_ptr)->shm_base_addr, 0); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); } /* Allocated the interprocess mutex segment. */ mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->shm_mutex_segment_handle); if (mpi_errno) MPIR_ERR_POP(mpi_errno); if (node_rank == 0) { char *serialized_hnd_ptr = NULL; /* create shared memory region for all processes in win and map */ mpi_errno = MPL_shm_seg_create_and_attach((*win_ptr)->shm_mutex_segment_handle, sizeof(MPIDI_CH3I_SHM_MUTEX), (char **) &(*win_ptr)->shm_mutex, 0); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIDI_CH3I_SHM_MUTEX_INIT(*win_ptr); /* serialize handle and broadcast it to the other processes in win */ mpi_errno = MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->shm_mutex_segment_handle, &serialized_hnd_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); /* wait for other processes to attach to win */ mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); /* unlink shared memory region so it gets deleted when all processes exit */ mpi_errno = MPL_shm_seg_remove((*win_ptr)->shm_mutex_segment_handle); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else { char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 }; /* get serialized handle from rank 0 and deserialize it */ mpi_errno = MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); mpi_errno = MPL_shm_hnd_deserialize((*win_ptr)->shm_mutex_segment_handle, serialized_hnd, strlen(serialized_hnd)); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* attach to shared memory region created by rank 0 */ mpi_errno = MPL_shm_seg_attach((*win_ptr)->shm_mutex_segment_handle, sizeof(MPIDI_CH3I_SHM_MUTEX), (char **) &(*win_ptr)->shm_mutex, 0); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); } /* compute the base addresses of each process within the shared memory segment */ { char *cur_base; int cur_rank; cur_base = (*win_ptr)->shm_base_addr; cur_rank = 0; ((*win_ptr)->shm_base_addrs)[0] = (*win_ptr)->shm_base_addr; for (i = 1; i < node_size; ++i) { if (node_sizes[i]) { /* For the base addresses, we track the previous * process that has allocated non-zero bytes of shared * memory. We can not simply use "i-1" for the * previous process because rank "i-1" might not have * allocated any memory. */ if (noncontig) { ((*win_ptr)->shm_base_addrs)[i] = cur_base + MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[cur_rank]); } else { ((*win_ptr)->shm_base_addrs)[i] = cur_base + node_sizes[cur_rank]; } cur_base = ((*win_ptr)->shm_base_addrs)[i]; cur_rank = i; } else { ((*win_ptr)->shm_base_addrs)[i] = NULL; } } } (*win_ptr)->base = (*win_ptr)->shm_base_addrs[node_rank]; } *base_pp = (*win_ptr)->base; /* gather window information among processes via shared memory region. */ mpi_errno = MPIDI_CH3I_Win_gather_info((*base_pp), size, disp_unit, info, comm_ptr, win_ptr); if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno); /* Cache SHM windows */ MPIDI_CH3I_SHM_Wins_append(&shm_wins_list, (*win_ptr)); fn_exit: MPIR_CHKLMEM_FREEALL(); MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM); return mpi_errno; /* --BEGIN ERROR HANDLING-- */ fn_fail: MPIR_CHKPMEM_REAP(); goto fn_exit; /* --END ERROR HANDLING-- */ }
static int MPIDI_CH3I_Win_gather_info(void *base, MPI_Aint size, int disp_unit, MPIR_Info * info, MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr) { MPIR_Comm *node_comm_ptr = NULL; int node_rank; int comm_rank, comm_size; MPI_Aint *tmp_buf = NULL; int i, k; MPIR_Errflag_t errflag = MPIR_ERR_NONE; int mpi_errno = MPI_SUCCESS; MPIR_CHKLMEM_DECL(1); MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO); MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO); if ((*win_ptr)->comm_ptr->node_comm == NULL) { mpi_errno = MPIDI_CH3U_Win_gather_info(base, size, disp_unit, info, comm_ptr, win_ptr); goto fn_exit; } comm_size = (*win_ptr)->comm_ptr->local_size; comm_rank = (*win_ptr)->comm_ptr->rank; node_comm_ptr = (*win_ptr)->comm_ptr->node_comm; MPIR_Assert(node_comm_ptr != NULL); node_rank = node_comm_ptr->rank; (*win_ptr)->info_shm_segment_len = comm_size * sizeof(MPIDI_Win_basic_info_t); mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->info_shm_segment_handle); if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno); if (node_rank == 0) { char *serialized_hnd_ptr = NULL; /* create shared memory region for all processes in win and map. */ mpi_errno = MPL_shm_seg_create_and_attach((*win_ptr)->info_shm_segment_handle, (*win_ptr)->info_shm_segment_len, (char **) &(*win_ptr)->info_shm_base_addr, 0); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* serialize handle and broadcast it to the other processes in win */ mpi_errno = MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->info_shm_segment_handle, &serialized_hnd_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); /* wait for other processes to attach to win */ mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); /* unlink shared memory region so it gets deleted when all processes exit */ mpi_errno = MPL_shm_seg_remove((*win_ptr)->info_shm_segment_handle); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else { char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 }; /* get serialized handle from rank 0 and deserialize it */ mpi_errno = MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); mpi_errno = MPL_shm_hnd_deserialize((*win_ptr)->info_shm_segment_handle, serialized_hnd, strlen(serialized_hnd)); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* attach to shared memory region created by rank 0 */ mpi_errno = MPL_shm_seg_attach((*win_ptr)->info_shm_segment_handle, (*win_ptr)->info_shm_segment_len, (char **) &(*win_ptr)->info_shm_base_addr, 0); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); } (*win_ptr)->basic_info_table = (MPIDI_Win_basic_info_t *) ((*win_ptr)->info_shm_base_addr); MPIR_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * comm_size * sizeof(MPI_Aint), mpi_errno, "tmp_buf"); tmp_buf[4 * comm_rank] = MPIR_Ptr_to_aint(base); tmp_buf[4 * comm_rank + 1] = size; tmp_buf[4 * comm_rank + 2] = (MPI_Aint) disp_unit; tmp_buf[4 * comm_rank + 3] = (MPI_Aint) (*win_ptr)->handle; mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, tmp_buf, 4, MPI_AINT, (*win_ptr)->comm_ptr, &errflag); if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno); if (node_rank == 0) { /* only node_rank == 0 writes results to basic_info_table on shared memory region. */ k = 0; for (i = 0; i < comm_size; i++) { (*win_ptr)->basic_info_table[i].base_addr = MPIR_Aint_to_ptr(tmp_buf[k++]); (*win_ptr)->basic_info_table[i].size = tmp_buf[k++]; (*win_ptr)->basic_info_table[i].disp_unit = (int) tmp_buf[k++]; (*win_ptr)->basic_info_table[i].win_handle = (MPI_Win) tmp_buf[k++]; } } /* Make sure that all local processes see the results written by node_rank == 0 */ mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag); if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno); fn_exit: MPIR_CHKLMEM_FREEALL(); MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO); return mpi_errno; /* --BEGIN ERROR HANDLING-- */ fn_fail: goto fn_exit; /* --END ERROR HANDLING-- */ }