int MPIR_Comm_commit(MPID_Comm * comm) { int mpi_errno = MPI_SUCCESS; int num_local = -1, num_external = -1; int local_rank = -1, external_rank = -1; int *local_procs = NULL, *external_procs = NULL; MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_COMMIT); MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_COMMIT); /* It's OK to relax these assertions, but we should do so very * intentionally. For now this function is the only place that we create * our hierarchy of communicators */ MPIU_Assert(comm->node_comm == NULL); MPIU_Assert(comm->node_roots_comm == NULL); mpi_errno = set_collops(comm); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Notify device of communicator creation */ mpi_errno = MPID_Dev_comm_create_hook(comm); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_Comm_map_free(comm); if (comm->comm_kind == MPID_INTRACOMM) { mpi_errno = MPIU_Find_local_and_external(comm, &num_local, &local_rank, &local_procs, &num_external, &external_rank, &external_procs, &comm->intranode_table, &comm->internode_table); /* --BEGIN ERROR HANDLING-- */ if (mpi_errno) { if (MPIR_Err_is_fatal(mpi_errno)) MPIR_ERR_POP(mpi_errno); /* Non-fatal errors simply mean that this communicator will not have * any node awareness. Node-aware collectives are an optimization. */ MPIU_DBG_MSG_P(COMM, VERBOSE, "MPIU_Find_local_and_external failed for comm_ptr=%p", comm); if (comm->intranode_table) MPIU_Free(comm->intranode_table); if (comm->internode_table) MPIU_Free(comm->internode_table); mpi_errno = MPI_SUCCESS; goto fn_exit; } /* --END ERROR HANDLING-- */ /* defensive checks */ MPIU_Assert(num_local > 0); MPIU_Assert(num_local > 1 || external_rank >= 0); MPIU_Assert(external_rank < 0 || external_procs != NULL); /* if the node_roots_comm and comm would be the same size, then creating * the second communicator is useless and wasteful. */ if (num_external == comm->remote_size) { MPIU_Assert(num_local == 1); goto fn_exit; } /* we don't need a local comm if this process is the only one on this node */ if (num_local > 1) { mpi_errno = MPIR_Comm_create(&comm->node_comm); if (mpi_errno) MPIR_ERR_POP(mpi_errno); comm->node_comm->context_id = comm->context_id + MPID_CONTEXT_INTRANODE_OFFSET; comm->node_comm->recvcontext_id = comm->node_comm->context_id; comm->node_comm->rank = local_rank; comm->node_comm->comm_kind = MPID_INTRACOMM; comm->node_comm->hierarchy_kind = MPID_HIERARCHY_NODE; comm->node_comm->local_comm = NULL; MPIU_DBG_MSG_D(CH3_OTHER, VERBOSE, "Create node_comm=%p\n", comm->node_comm); comm->node_comm->local_size = num_local; comm->node_comm->remote_size = num_local; MPIR_Comm_map_irregular(comm->node_comm, comm, local_procs, num_local, MPIR_COMM_MAP_DIR_L2L, NULL); mpi_errno = set_collops(comm->node_comm); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Notify device of communicator creation */ mpi_errno = MPID_Dev_comm_create_hook(comm->node_comm); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* don't call MPIR_Comm_commit here */ MPIR_Comm_map_free(comm->node_comm); } /* this process may not be a member of the node_roots_comm */ if (local_rank == 0) { mpi_errno = MPIR_Comm_create(&comm->node_roots_comm); if (mpi_errno) MPIR_ERR_POP(mpi_errno); comm->node_roots_comm->context_id = comm->context_id + MPID_CONTEXT_INTERNODE_OFFSET; comm->node_roots_comm->recvcontext_id = comm->node_roots_comm->context_id; comm->node_roots_comm->rank = external_rank; comm->node_roots_comm->comm_kind = MPID_INTRACOMM; comm->node_roots_comm->hierarchy_kind = MPID_HIERARCHY_NODE_ROOTS; comm->node_roots_comm->local_comm = NULL; comm->node_roots_comm->local_size = num_external; comm->node_roots_comm->remote_size = num_external; MPIR_Comm_map_irregular(comm->node_roots_comm, comm, external_procs, num_external, MPIR_COMM_MAP_DIR_L2L, NULL); mpi_errno = set_collops(comm->node_roots_comm); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Notify device of communicator creation */ mpi_errno = MPID_Dev_comm_create_hook(comm->node_roots_comm); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* don't call MPIR_Comm_commit here */ MPIR_Comm_map_free(comm->node_roots_comm); } comm->hierarchy_kind = MPID_HIERARCHY_PARENT; } fn_exit: if (external_procs != NULL) MPIU_Free(external_procs); if (local_procs != NULL) MPIU_Free(local_procs); MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_COMM_COMMIT); return mpi_errno; fn_fail: goto fn_exit; }
static int sched_cb_gcn_allocate_cid(MPIR_Comm * comm, int tag, void *state) { int mpi_errno = MPI_SUCCESS; struct gcn_state *st = state, *tmp; MPIR_Context_id_t newctxid; MPIR_Errflag_t errflag = MPIR_ERR_NONE; if (st->own_eager_mask) { newctxid = find_and_allocate_context_id(st->local_mask); if (st->ctx0) *st->ctx0 = newctxid; if (st->ctx1) *st->ctx1 = newctxid; st->own_eager_mask = 0; eager_in_use = 0; } else if (st->own_mask) { newctxid = find_and_allocate_context_id(st->local_mask); if (st->ctx0) *st->ctx0 = newctxid; if (st->ctx1) *st->ctx1 = newctxid; /* reset flag for the next try */ mask_in_use = 0; /* If we found a ctx, remove element form list */ if (newctxid > 0) { if (next_gcn == st) { next_gcn = st->next; } else { for (tmp = next_gcn; tmp->next != st; tmp = tmp->next); tmp->next = st->next; } } } if (*st->ctx0 == 0) { if (st->local_mask[ALL_OWN_MASK_FLAG] == 1) { /* --BEGIN ERROR HANDLING-- */ int nfree = 0; int ntotal = 0; int minfree; context_mask_stats(&nfree, &ntotal); minfree = nfree; MPID_Allreduce(MPI_IN_PLACE, &minfree, 1, MPI_INT, MPI_MIN, st->comm_ptr, &errflag); if (minfree > 0) { MPIR_ERR_SETANDJUMP3(mpi_errno, MPI_ERR_OTHER, "**toomanycommfrag", "**toomanycommfrag %d %d %d", nfree, ntotal, minfree); } else { MPIR_ERR_SETANDJUMP3(mpi_errno, MPI_ERR_OTHER, "**toomanycomm", "**toomanycomm %d %d %d", nfree, ntotal, minfree); } /* --END ERROR HANDLING-- */ } else { /* do not own mask, try again */ if (st->first_iter == 1) { st->first_iter = 0; /* Set the Tag for the idup-operations. We have two problems here: * 1.) The tag should not be used by another (blocking) context_id allocation. * Therefore, we set tag_up as lower bound for the operation. tag_ub is used by * most of the other blocking operations, but tag is always >0, so this * should be fine. * 2.) We need odering between multiple idup operations on the same communicator. * The problem here is that the iallreduce operations of the first iteration * are not necessarily completed in the same order as they are issued, also on the * same communicator. To avoid deadlocks, we cannot add the elements to the * list bevfore the first iallreduce is completed. The "tag" is created for the * scheduling - by calling MPIR_Sched_next_tag(comm_ptr, &tag) - and the same * for a idup operation on all processes. So we use it here. */ /* FIXME I'm not sure if there can be an overflows for this tag */ st->tag = (uint64_t) tag + MPIR_Process.attrs.tag_ub; add_gcn_to_list(st); } mpi_errno = MPIR_Sched_cb(&sched_cb_gcn_copy_mask, st, st->s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(st->s); } } else { /* Successfully allocated a context id */ mpi_errno = MPIR_Sched_cb(&sched_cb_gcn_bcast, st, st->s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(st->s); } fn_exit: return mpi_errno; fn_fail: /* make sure that the pending allocations are scheduled */ if (!st->first_iter) { if (next_gcn == st) { next_gcn = st->next; } else { for (tmp = next_gcn; tmp && tmp->next != st; tmp = tmp->next); tmp->next = st->next; } } /* In the case of failure, the new communicator was half created. * So we need to clean the memory allocated for it. */ MPIR_Comm_map_free(st->new_comm); MPIR_Handle_obj_free(&MPIR_Comm_mem, st->new_comm); MPL_free(st); goto fn_exit; }