int MPIR_Barrier_intra_smp(MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag) { int mpi_errno = MPI_SUCCESS; int mpi_errno_ret = MPI_SUCCESS; MPIR_Assert(MPIR_CVAR_ENABLE_SMP_COLLECTIVES && MPIR_CVAR_ENABLE_SMP_BARRIER && MPIR_Comm_is_node_aware(comm_ptr)); /* do the intranode barrier on all nodes */ if (comm_ptr->node_comm != NULL) { mpi_errno = MPIR_Barrier(comm_ptr->node_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } } /* do the barrier across roots of all nodes */ if (comm_ptr->node_roots_comm != NULL) { mpi_errno = MPIR_Barrier(comm_ptr->node_roots_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } } /* release the local processes on each node with a 1-byte * broadcast (0-byte broadcast just returns without doing * anything) */ if (comm_ptr->node_comm != NULL) { int i = 0; mpi_errno = MPIR_Bcast(&i, 1, MPI_BYTE, 0, comm_ptr->node_comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } } if (mpi_errno_ret) mpi_errno = mpi_errno_ret; else if (*errflag != MPIR_ERR_NONE) MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail"); return mpi_errno; }
int MPID_Comm_spawn_multiple(int count, char *commands[], char **argvs[], const int maxprocs[], MPIR_Info * info_ptrs[], int root, MPIR_Comm * comm_ptr, MPIR_Comm ** intercomm, int errcodes[]) { char port_name[MPI_MAX_PORT_NAME]; int *info_keyval_sizes = 0, i, mpi_errno = MPI_SUCCESS; PMI_keyval_t **info_keyval_vectors = 0, preput_keyval_vector; int *pmi_errcodes = 0, pmi_errno = 0; int total_num_processes, should_accept = 1; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_COMM_SPAWN_MULTIPLE); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_COMM_SPAWN_MULTIPLE); memset(port_name, 0, sizeof(port_name)); if (comm_ptr->rank == root) { total_num_processes = 0; for (i = 0; i < count; i++) total_num_processes += maxprocs[i]; pmi_errcodes = (int *) MPL_malloc(sizeof(int) * total_num_processes, MPL_MEM_BUFFER); MPIR_ERR_CHKANDJUMP(!pmi_errcodes, mpi_errno, MPI_ERR_OTHER, "**nomem"); for (i = 0; i < total_num_processes; i++) pmi_errcodes[i] = 0; mpi_errno = MPID_Open_port(NULL, port_name); if (mpi_errno) MPIR_ERR_POP(mpi_errno); info_keyval_sizes = (int *) MPL_malloc(count * sizeof(int), MPL_MEM_BUFFER); MPIR_ERR_CHKANDJUMP(!info_keyval_sizes, mpi_errno, MPI_ERR_OTHER, "**nomem"); info_keyval_vectors = (PMI_keyval_t **) MPL_malloc(count * sizeof(PMI_keyval_t *), MPL_MEM_BUFFER); MPIR_ERR_CHKANDJUMP(!info_keyval_vectors, mpi_errno, MPI_ERR_OTHER, "**nomem"); if (!info_ptrs) for (i = 0; i < count; i++) { info_keyval_vectors[i] = 0; info_keyval_sizes[i] = 0; } else for (i = 0; i < count; i++) { mpi_errno = mpi_to_pmi_keyvals(info_ptrs[i], &info_keyval_vectors[i], &info_keyval_sizes[i]); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } preput_keyval_vector.key = MPIDI_PARENT_PORT_KVSKEY; preput_keyval_vector.val = port_name; pmi_errno = PMI_Spawn_multiple(count, (const char **) commands, (const char ***) argvs, maxprocs, info_keyval_sizes, (const PMI_keyval_t **) info_keyval_vectors, 1, &preput_keyval_vector, pmi_errcodes); if (pmi_errno != PMI_SUCCESS) MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**pmi_spawn_multiple", "**pmi_spawn_multiple %d", pmi_errno); if (errcodes != MPI_ERRCODES_IGNORE) { for (i = 0; i < total_num_processes; i++) { errcodes[i] = pmi_errcodes[0]; should_accept = should_accept && errcodes[i]; } should_accept = !should_accept; } } if (errcodes != MPI_ERRCODES_IGNORE) { MPIR_Errflag_t errflag = MPIR_ERR_NONE; mpi_errno = MPIR_Bcast(&should_accept, 1, MPI_INT, root, comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Bcast(&pmi_errno, 1, MPI_INT, root, comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Bcast(&total_num_processes, 1, MPI_INT, root, comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Bcast(errcodes, total_num_processes, MPI_INT, root, comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } if (should_accept) { mpi_errno = MPID_Comm_accept(port_name, NULL, root, comm_ptr, intercomm); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else { if ((pmi_errno == PMI_SUCCESS) && (errcodes[0] != 0)) { mpi_errno = MPIR_Comm_create(intercomm); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } } if (comm_ptr->rank == root) { mpi_errno = MPID_Close_port(port_name); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } fn_exit: if (info_keyval_vectors) { free_pmi_keyvals(info_keyval_vectors, count, info_keyval_sizes); MPL_free(info_keyval_vectors); } MPL_free(info_keyval_sizes); MPL_free(pmi_errcodes); MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_COMM_SPAWN_MULTIPLE); return mpi_errno; fn_fail: goto fn_exit; }
/*@ MPI_Bcast - Broadcasts a message from the process with rank "root" to all other processes of the communicator Input/Output Parameters: . buffer - starting address of buffer (choice) Input Parameters: + count - number of entries in buffer (integer) . datatype - data type of buffer (handle) . root - rank of broadcast root (integer) - comm - communicator (handle) .N ThreadSafe .N Fortran .N Errors .N MPI_SUCCESS .N MPI_ERR_COMM .N MPI_ERR_COUNT .N MPI_ERR_TYPE .N MPI_ERR_BUFFER .N MPI_ERR_ROOT @*/ int MPI_Bcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm) { int mpi_errno = MPI_SUCCESS; MPIR_Comm *comm_ptr = NULL; MPIR_Errflag_t errflag = MPIR_ERR_NONE; MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPI_BCAST); MPIR_ERRTEST_INITIALIZED_ORDIE(); MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); MPIR_FUNC_TERSE_COLL_ENTER(MPID_STATE_MPI_BCAST); /* Validate parameters, especially handles needing to be converted */ #ifdef HAVE_ERROR_CHECKING { MPID_BEGIN_ERROR_CHECKS; { MPIR_ERRTEST_COMM(comm, mpi_errno); } MPID_END_ERROR_CHECKS; } #endif /* HAVE_ERROR_CHECKING */ /* Convert MPI object handles to object pointers */ MPIR_Comm_get_ptr(comm, comm_ptr); /* Validate parameters and objects (post conversion) */ #ifdef HAVE_ERROR_CHECKING { MPID_BEGIN_ERROR_CHECKS; { MPIR_Datatype *datatype_ptr = NULL; MPIR_Comm_valid_ptr(comm_ptr, mpi_errno, FALSE); if (mpi_errno != MPI_SUCCESS) goto fn_fail; MPIR_ERRTEST_COUNT(count, mpi_errno); MPIR_ERRTEST_DATATYPE(datatype, "datatype", mpi_errno); if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) { MPIR_ERRTEST_INTRA_ROOT(comm_ptr, root, mpi_errno); } else { MPIR_ERRTEST_INTER_ROOT(comm_ptr, root, mpi_errno); } if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) { MPIR_Datatype_get_ptr(datatype, datatype_ptr); MPIR_Datatype_valid_ptr(datatype_ptr, mpi_errno); if (mpi_errno != MPI_SUCCESS) goto fn_fail; MPIR_Datatype_committed_ptr(datatype_ptr, mpi_errno); if (mpi_errno != MPI_SUCCESS) goto fn_fail; } MPIR_ERRTEST_BUF_INPLACE(buffer, count, mpi_errno); MPIR_ERRTEST_USERBUFFER(buffer, count, datatype, mpi_errno); } MPID_END_ERROR_CHECKS; } #endif /* HAVE_ERROR_CHECKING */ /* ... body of routine ... */ mpi_errno = MPIR_Bcast(buffer, count, datatype, root, comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* ... end of body of routine ... */ fn_exit: MPIR_FUNC_TERSE_COLL_EXIT(MPID_STATE_MPI_BCAST); MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); return mpi_errno; fn_fail: /* --BEGIN ERROR HANDLING-- */ #ifdef HAVE_ERROR_CHECKING { mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__, MPI_ERR_OTHER, "**mpi_bcast", "**mpi_bcast %p %d %D %d %C", buffer, count, datatype, root, comm); } #endif mpi_errno = MPIR_Err_return_comm(comm_ptr, __func__, mpi_errno); goto fn_exit; /* --END ERROR HANDLING-- */ }
int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_intracomm_ptr) { int mpi_errno = MPI_SUCCESS; int local_high, remote_high, new_size; MPIR_Context_id_t new_context_id; MPIR_Errflag_t errflag = MPIR_ERR_NONE; MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPIR_COMM_KIND__INTERCOMM_MERGE_IMPL); MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPIR_COMM_KIND__INTERCOMM_MERGE_IMPL); /* Make sure that we have a local intercommunicator */ if (!comm_ptr->local_comm) { /* Manufacture the local communicator */ mpi_errno = MPII_Setup_intercomm_localcomm(comm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } /* Find the "high" value of the other group of processes. This * will be used to determine which group is ordered first in * the generated communicator. high is logical */ local_high = high; if (comm_ptr->rank == 0) { /* This routine allows use to use the collective communication * context rather than the point-to-point context. */ mpi_errno = MPIC_Sendrecv(&local_high, 1, MPI_INT, 0, 0, &remote_high, 1, MPI_INT, 0, 0, comm_ptr, MPI_STATUS_IGNORE, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* If local_high and remote_high are the same, then order is arbitrary. * we use the is_low_group in the intercomm in this case. */ MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST, "local_high=%d remote_high=%d is_low_group=%d", local_high, remote_high, comm_ptr->is_low_group)); if (local_high == remote_high) { local_high = !(comm_ptr->is_low_group); } } /* * All processes in the local group now need to get the * value of local_high, which may have changed if both groups * of processes had the same value for high */ mpi_errno = MPIR_Bcast(&local_high, 1, MPI_INT, 0, comm_ptr->local_comm, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); /* * For the intracomm, we need a consistent context id. * That means that one of the two groups needs to use * the recvcontext_id and the other must use the context_id * The recvcontext_id is unique on each process, but another * communicator may use the context_id. Therefore, we do a small hack. * We set both flags indicating a sub-communicator (intra-node and * inter-node) to one. This is normally not possible (a communicator * is either intra- or inter-node) - which makes this context_id unique. * */ new_size = comm_ptr->local_size + comm_ptr->remote_size; mpi_errno = MPIR_Comm_create(new_intracomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); if (local_high) { (*new_intracomm_ptr)->context_id = MPIR_CONTEXT_SET_FIELD(SUBCOMM, comm_ptr->recvcontext_id, 3); } else { (*new_intracomm_ptr)->context_id = MPIR_CONTEXT_SET_FIELD(SUBCOMM, comm_ptr->context_id, 3); } (*new_intracomm_ptr)->recvcontext_id = (*new_intracomm_ptr)->context_id; (*new_intracomm_ptr)->remote_size = (*new_intracomm_ptr)->local_size = new_size; (*new_intracomm_ptr)->pof2 = MPL_pof2(new_size); (*new_intracomm_ptr)->rank = -1; (*new_intracomm_ptr)->comm_kind = MPIR_COMM_KIND__INTRACOMM; /* Now we know which group comes first. Build the new mapping * from the existing comm */ mpi_errno = create_and_map(comm_ptr, local_high, (*new_intracomm_ptr)); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* We've setup a temporary context id, based on the context id * used by the intercomm. This allows us to perform the allreduce * operations within the context id algorithm, since we already * have a valid (almost - see comm_create_hook) communicator. */ mpi_errno = MPIR_Comm_commit((*new_intracomm_ptr)); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* printf("About to get context id \n"); fflush(stdout); */ /* In the multi-threaded case, MPIR_Get_contextid_sparse assumes that the * calling routine already holds the single criticial section */ new_context_id = 0; mpi_errno = MPIR_Get_contextid_sparse((*new_intracomm_ptr), &new_context_id, FALSE); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_Assert(new_context_id != 0); /* We release this communicator that was involved just to * get valid context id and create true one */ mpi_errno = MPIR_Comm_release(*new_intracomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Comm_create(new_intracomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); (*new_intracomm_ptr)->remote_size = (*new_intracomm_ptr)->local_size = new_size; (*new_intracomm_ptr)->pof2 = MPL_pof2(new_size); (*new_intracomm_ptr)->rank = -1; (*new_intracomm_ptr)->comm_kind = MPIR_COMM_KIND__INTRACOMM; (*new_intracomm_ptr)->context_id = new_context_id; (*new_intracomm_ptr)->recvcontext_id = new_context_id; mpi_errno = create_and_map(comm_ptr, local_high, (*new_intracomm_ptr)); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Comm_commit((*new_intracomm_ptr)); if (mpi_errno) MPIR_ERR_POP(mpi_errno); fn_exit: MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPIR_COMM_KIND__INTERCOMM_MERGE_IMPL); return mpi_errno; fn_fail: goto fn_exit; }
/* comm create impl for intercommunicators, assumes that the standard error * checking has already taken place in the calling function */ PMPI_LOCAL int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Comm ** newcomm_ptr) { int mpi_errno = MPI_SUCCESS; MPIR_Context_id_t new_context_id; int *mapping = NULL; int *remote_mapping = NULL; MPIR_Comm *mapping_comm = NULL; int remote_size = -1; int rinfo[2]; MPIR_Errflag_t errflag = MPIR_ERR_NONE; MPIR_CHKLMEM_DECL(1); MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPIR_COMM_CREATE_INTER); MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPIR_COMM_CREATE_INTER); MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM); /* Create a new communicator from the specified group members */ /* If there is a context id cache in oldcomm, use it here. Otherwise, * use the appropriate algorithm to get a new context id. * Creating the context id is collective over the *input* communicator, * so it must be created before we decide if this process is a * member of the group */ /* In the multi-threaded case, MPIR_Get_contextid_sparse assumes that the * calling routine already holds the single criticial section */ if (!comm_ptr->local_comm) { MPII_Setup_intercomm_localcomm(comm_ptr); } mpi_errno = MPIR_Get_contextid_sparse(comm_ptr->local_comm, &new_context_id, FALSE); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_Assert(new_context_id != 0); MPIR_Assert(new_context_id != comm_ptr->recvcontext_id); mpi_errno = MPII_Comm_create_calculate_mapping(group_ptr, comm_ptr, &mapping, &mapping_comm); if (mpi_errno) MPIR_ERR_POP(mpi_errno); *newcomm_ptr = NULL; if (group_ptr->rank != MPI_UNDEFINED) { /* Get the new communicator structure and context id */ mpi_errno = MPIR_Comm_create(newcomm_ptr); if (mpi_errno) goto fn_fail; (*newcomm_ptr)->recvcontext_id = new_context_id; (*newcomm_ptr)->rank = group_ptr->rank; (*newcomm_ptr)->comm_kind = comm_ptr->comm_kind; /* Since the group has been provided, let the new communicator know * about the group */ (*newcomm_ptr)->local_comm = 0; (*newcomm_ptr)->local_group = group_ptr; MPIR_Group_add_ref(group_ptr); (*newcomm_ptr)->local_size = group_ptr->size; (*newcomm_ptr)->pof2 = MPL_pof2((*newcomm_ptr)->local_size); (*newcomm_ptr)->remote_group = 0; (*newcomm_ptr)->is_low_group = comm_ptr->is_low_group; } /* There is an additional step. We must communicate the * information on the local context id and the group members, * given by the ranks so that the remote process can construct the * appropriate network address mapping. * First we exchange group sizes and context ids. Then the ranks * in the remote group, from which the remote network address * mapping can be constructed. We need to use the "collective" * context in the original intercommunicator */ if (comm_ptr->rank == 0) { int info[2]; info[0] = new_context_id; info[1] = group_ptr->size; mpi_errno = MPIC_Sendrecv(info, 2, MPI_INT, 0, 0, rinfo, 2, MPI_INT, 0, 0, comm_ptr, MPI_STATUS_IGNORE, &errflag); if (mpi_errno) { MPIR_ERR_POP(mpi_errno); } if (*newcomm_ptr != NULL) { (*newcomm_ptr)->context_id = rinfo[0]; } remote_size = rinfo[1]; MPIR_CHKLMEM_MALLOC(remote_mapping, int *, remote_size * sizeof(int), mpi_errno, "remote_mapping", MPL_MEM_ADDRESS); /* Populate and exchange the ranks */ mpi_errno = MPIC_Sendrecv(mapping, group_ptr->size, MPI_INT, 0, 0, remote_mapping, remote_size, MPI_INT, 0, 0, comm_ptr, MPI_STATUS_IGNORE, &errflag); if (mpi_errno) { MPIR_ERR_POP(mpi_errno); } /* Broadcast to the other members of the local group */ mpi_errno = MPIR_Bcast(rinfo, 2, MPI_INT, 0, comm_ptr->local_comm, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Bcast(remote_mapping, remote_size, MPI_INT, 0, comm_ptr->local_comm, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail"); } else {
*/ static int MPID_PSP_Bcast(void *buffer, int count, MPI_Datatype datatype, int root, MPIR_Comm *comm_ptr, MPIR_Errflag_t *errflag) { int mpi_errno; D(printf("%s(buffer:%p, count:%u, root:%u, comm:%p(%s, rank:%2u, id:%x, rid:%x, size:%u))\n", __func__, buffer, count, root, comm_ptr, comm_ptr->name, comm_ptr->rank, comm_ptr->context_id, comm_ptr->recvcontext_id, comm_ptr->local_size); ) if (!comm_ptr->group) { /* Fallback to MPIch default Bcast */ mpi_errno = MPIR_Bcast(buffer, count, datatype, root, comm_ptr, errflag); return mpi_errno; } if (root == comm_ptr->rank) { /* I am the root */ return MPID_PSP_Bcast_send(buffer, count, datatype, root, comm_ptr); } else { return MPID_PSP_Bcast_recv(buffer, count, datatype, root, comm_ptr); } } void MPID_PSP_group_init(MPIR_Comm *comm_ptr) { unsigned comm_size = comm_ptr->local_size; unsigned rank;