/*
 * Activate a newly created communicator.
 *
 * This routine serves two purposes:
 *  - the allreduce acts as a kind of barrier, which prevents incoming
 *    fragments on the new communicator before everybody has set up
 *    the comm structure.
 *  - some components (e.g. the collective MagPIe component) might want
 *    to generate new communicators and communicate using the new comm.
 *    Thus, it can just be called after the 'barrier'.
 *
 * The reason that this routine is in comm_cid and not in comm.c is
 * that this file contains the allreduce implementations which are
 * required, and thus we avoid having duplicate code...
 *
 * Parameters:
 *   newcomm        in/out: communicator being activated.  On failure
 *                  it is released and *newcomm is set to MPI_COMM_NULL.
 *   comm           communicator the allreduce is run on; its CID is
 *                  also compared against the CID of *newcomm.
 *   bridgecomm, local_leader, remote_leader, send_first
 *                  forwarded unchanged to the selected allreduce.
 *   mode           one of the OMPI_COMM_CID_* constants handled below;
 *                  selects the allreduce implementation.
 *
 * Returns OMPI_SUCCESS on success, MPI_UNDEFINED for an unknown mode,
 * and otherwise the error code of the step that failed.
 */
int ompi_comm_activate ( ompi_communicator_t** newcomm,
                         ompi_communicator_t* comm,
                         ompi_communicator_t* bridgecomm,
                         void* local_leader,
                         void* remote_leader,
                         int mode,
                         int send_first )
{
    int ret = 0;
    int ok=0, gok=0;
    ompi_comm_cid_allredfct* allredfnct;

    /* Step 1: the barrier, after which it is allowed to
     * send messages over the new communicator
     */
    switch (mode)
    {
    case OMPI_COMM_CID_INTRA:
        allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_intra;
        break;
    case OMPI_COMM_CID_INTER:
        allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_inter;
        break;
    case OMPI_COMM_CID_INTRA_BRIDGE:
        allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_intra_bridge;
        break;
    case OMPI_COMM_CID_INTRA_OOB:
        allredfnct=(ompi_comm_cid_allredfct*)ompi_comm_allreduce_intra_oob;
        break;
    default:
        return MPI_UNDEFINED;
    }

    if (MPI_UNDEFINED != (*newcomm)->c_local_group->grp_my_rank) {
        /* Initialize the PML stuff in the newcomm  */
        if ( OMPI_SUCCESS != (ret = MCA_PML_CALL(add_comm(*newcomm))) ) {
            goto bail_on_error;
        }
        OMPI_COMM_SET_PML_ADDED(*newcomm);
    }

    /* The reduced value itself is irrelevant; only the synchronization
     * matters.  The status is checked so that a failed synchronization
     * is not silently treated as success.
     * NOTE(review): assumes the allreduce callbacks return an OMPI
     * status code -- confirm against the ompi_comm_cid_allredfct typedef. */
    ret = (allredfnct)(&ok, &gok, 1, MPI_MIN, comm, bridgecomm,
                       local_leader, remote_leader, send_first );
    if ( OMPI_SUCCESS != ret ) {
        goto bail_on_error;
    }

    /**
     * Check to see if this process is in the new communicator.
     *
     * Specifically, this function is invoked by all processes in the
     * old communicator, regardless of whether they are in the new
     * communicator or not.  This is because it is far simpler to use
     * MPI collective functions on the old communicator to determine
     * some data for the new communicator (e.g., remote_leader) than
     * to kludge up our own pseudo-collective routines over just the
     * processes in the new communicator.  Hence, *all* processes in
     * the old communicator need to invoke this function.
     *
     * That being said, only processes in the new communicator need to
     * select a coll module for the new communicator.  More
     * specifically, processes who are not in the new communicator
     * should *not* select a coll module -- for example,
     * ompi_comm_rank(newcomm) returns MPI_UNDEFINED for processes who
     * are not in the new communicator.  This can cause errors in the
     * selection / initialization of a coll module.  Plus, it's
     * wasteful -- such processes will end up freeing the new
     * communicator anyway, so we might as well leave the coll
     * selection as NULL (the coll base comm unselect code handles
     * that case properly).
     */
    if (MPI_UNDEFINED == (*newcomm)->c_local_group->grp_my_rank) {
        return OMPI_SUCCESS;
    }

    /* Let the collectives components fight over who will do
       collective on this new comm.  */
    if (OMPI_SUCCESS != (ret = mca_coll_base_comm_select(*newcomm))) {
        goto bail_on_error;
    }

    /* For an inter communicator, we have to deal with the potential
     * problem of what is happening if the local_comm that we created
     * has a lower CID than the parent comm. This is not a problem
     * as long as the user calls MPI_Comm_free on the inter communicator.
     * However, if the communicators are not freed by the user but released
     * by Open MPI in MPI_Finalize, we walk through the list of still available
     * communicators and free them one by one. Thus, local_comm is freed before
     * the actual inter-communicator. However, the local_comm pointer in the
     * inter communicator will still contain the 'previous' address of the
     * local_comm and thus this will lead to a segmentation violation. In order
     * to prevent that from happening, we increase the reference counter
     * by one if the CID is lower than the parent's. We cannot increase the
     * reference counter if the CID is larger, since a regular MPI_Comm_free
     * would in that case leave the communicator hanging around and thus we
     * would not recycle CIDs properly, which was the reason and the cause
     * for this trouble.
     */
    if ( OMPI_COMM_IS_INTER(*newcomm)) {
        if ( OMPI_COMM_CID_IS_LOWER(*newcomm, comm)) {
            OMPI_COMM_SET_EXTRA_RETAIN (*newcomm);
            OBJ_RETAIN (*newcomm);
        }
    }

    return OMPI_SUCCESS;

 bail_on_error:
    /* Drop our reference and make sure the caller cannot use the
     * half-initialized communicator. */
    OBJ_RELEASE(*newcomm);
    *newcomm = MPI_COMM_NULL;
    return ret;
}
/*
 * Return the local-leader communicator to be used for the given root.
 *
 * The offset of 'root' within the color array is computed first.  If an
 * llead element with that offset is already stored on the module it is
 * reused; otherwise a new element is allocated, filled in by
 * mca_coll_hierarch_get_all_lleaders(), a new sub-communicator is split
 * off hier_comm, and the element is stored on the module.
 *
 * Parameters:
 *   root             rank of the root in hierarch_module->hier_comm.
 *   hierarch_module  module holding hier_comm, the color array and the
 *                    array of llead elements.
 *   llroot           out: rank of 'root' translated into the group of
 *                    the returned communicator, or MPI_UNDEFINED when
 *                    the returned communicator is MPI_COMM_NULL.
 *   lroot            out: the my_lleader entry of the llead element.
 *
 * Returns the llead element's communicator, or NULL if the allocation,
 * the communicator split or the rank translation failed.  The output
 * arguments should not be relied upon when NULL is returned.
 */
struct ompi_communicator_t*  mca_coll_hierarch_get_llcomm (int root,
                                                           mca_coll_hierarch_module_t *hierarch_module,
                                                           int* llroot,
                                                           int* lroot)
{
    struct ompi_communicator_t *llcomm=NULL;
    struct ompi_group_t *llgroup=NULL;
    struct ompi_group_t *group=NULL;
    struct mca_coll_hierarch_llead_t *llead=NULL;
    int found, i, rc, num_llead, offset;
    int rank = ompi_comm_rank (hierarch_module->hier_comm);
    int color;

    /* determine what our offset of root is in the colorarr */
    offset = mca_coll_hierarch_get_offset ( root,
                                            hierarch_module->hier_num_colorarr,
                                            hierarch_module->hier_colorarr );

    /* look for an already existing llead element with this offset */
    num_llead = opal_pointer_array_get_size ( &(hierarch_module->hier_llead) );
    for ( found=0, i=0; i < num_llead; i++ ) {
        llead = (struct mca_coll_hierarch_llead_t *) opal_pointer_array_get_item (
            &(hierarch_module->hier_llead), i );
        if ( NULL == llead ) {
            continue;
        }

        if (llead->offset == offset ) {
            found = 1;
            break;
        }
        /* TODO: if the offset of root equals the maxoffset of this color
         * and the offset on llead is larger than the offset of root,
         * this llead structure could be reused as well. */
    }

    if ( !found ) {
        /* allocate a new llead element */
        llead = malloc ( sizeof *llead );
        if ( NULL == llead ) {
            return NULL;
        }

        /* generate the list of lleaders with this offset */
        mca_coll_hierarch_get_all_lleaders ( rank, hierarch_module, llead, offset );

        /* only local leaders take part in the new communicator */
        color = MPI_UNDEFINED;
        if ( llead->am_lleader ) {
            color = 1;
        }

        /* create new lleader subcommunicator */
        rc = ompi_comm_split ( hierarch_module->hier_comm, color, root, &llcomm, 0);
        if ( OMPI_SUCCESS != rc ) {
            /* The element has not been stored on the module yet, so
             * nobody else would ever release it.
             * NOTE(review): if mca_coll_hierarch_get_all_lleaders()
             * allocates memory owned by llead, that memory is not
             * released here -- confirm. */
            free ( llead );
            return NULL;
        }
        if ( OMPI_COMM_CID_IS_LOWER ( llcomm, hierarch_module->hier_comm ) ) {
            /* Mark the communicator as 'extra retain' and increase the
               reference count by one more. See ompi_comm_activate
               for detailed explanation. */
            OMPI_COMM_SET_EXTRA_RETAIN (llcomm);
            OBJ_RETAIN(llcomm);
        }

        llead->llcomm = llcomm;

        /* Store the new element on the hierarch_module struct */
        opal_pointer_array_add ( &(hierarch_module->hier_llead), llead);
    }

    llcomm = llead->llcomm;
    *lroot  = llead->my_lleader;
    *llroot = MPI_UNDEFINED;

    /* Translate root into the lleader communicator only if this
     * process actually holds such a communicator. */
    if ( MPI_COMM_NULL != llcomm ) {
        group   = hierarch_module->hier_comm->c_local_group;
        llgroup = llcomm->c_local_group;

        rc = ompi_group_translate_ranks ( group, 1, &root, llgroup, llroot);
        if ( OMPI_SUCCESS != rc ) {
            return NULL;
        }
    }

    return llcomm;
}
/*
 * Final step of the non-blocking communicator activation.
 *
 * Reads the activation context attached to 'request', selects a coll
 * module for the new communicator if this process is a member of it,
 * and applies the 'extra retain' rule for inter-communicators.
 *
 * Returns OMPI_SUCCESS, or the error code of the coll selection, in
 * which case the new communicator is released and the location pointed
 * to by the context's newcommp is set to MPI_COMM_NULL.
 */
static int ompi_comm_activate_nb_complete (ompi_comm_request_t *request)
{
    ompi_comm_cid_context_t *ctx = (ompi_comm_cid_context_t *) request->context;
    int rc;

    /*
     * Every process of the old communicator ends up here, whether or
     * not it belongs to the new communicator: it is much simpler to
     * run collectives on the old communicator to compute data for the
     * new one (e.g., remote_leader) than to build pseudo-collectives
     * over just the members of the new communicator.
     *
     * Only members of the new communicator need a coll module, though.
     * Non-members must *not* select one: ompi_comm_rank(newcomm) is
     * MPI_UNDEFINED for them, which can cause errors in the selection /
     * initialization of a coll module.  It would also be wasteful, as
     * the communicator ends up being freed anyway, so the coll
     * selection is simply left as NULL (the coll base comm unselect
     * code handles that case properly).
     */
    if (MPI_UNDEFINED == ctx->newcomm->c_local_group->grp_my_rank) {
        return OMPI_SUCCESS;
    }

    /* Let the collective components compete for this communicator. */
    rc = mca_coll_base_comm_select(ctx->newcomm);
    if (OMPI_SUCCESS != rc) {
        OBJ_RELEASE(ctx->newcomm);
        *ctx->newcommp = MPI_COMM_NULL;
        return rc;
    }

    /*
     * Inter-communicator special case: a local_comm whose CID is lower
     * than the parent's is not a problem as long as the user calls
     * MPI_Comm_free on the inter-communicator.  If instead Open MPI
     * releases the remaining communicators in MPI_Finalize, it walks
     * the list one by one, so local_comm is freed before the
     * inter-communicator, whose local_comm pointer then still holds the
     * stale address -- a segmentation violation.  Taking one extra
     * reference when the CID is lower prevents that.  The extra
     * reference must not be taken when the CID is larger, because a
     * regular MPI_Comm_free would then leave the communicator hanging
     * around and CIDs would not be recycled properly.
     */
    if (OMPI_COMM_IS_INTER(ctx->newcomm) &&
        OMPI_COMM_CID_IS_LOWER(ctx->newcomm, ctx->comm)) {
        OMPI_COMM_SET_EXTRA_RETAIN (ctx->newcomm);
        OBJ_RETAIN (ctx->newcomm);
    }

    return OMPI_SUCCESS;
}
/*
 * Init module on the communicator.
 *
 * Splits 'comm' into a sub-communicator per color (taken from the
 * module's color array), allocates the request array, builds the first
 * llead element together with its local-leader communicator, and
 * stores everything on the module.
 *
 * Parameters:
 *   module  the coll module; treated as a mca_coll_hierarch_module_t.
 *   comm    communicator the module is being enabled on.
 *
 * Returns OMPI_SUCCESS on success and OMPI_ERROR on any failure.  On
 * failure the resources acquired by this call (sub-communicator,
 * request array, llead element) are released again and the
 * corresponding module fields are reset.
 */
int mca_coll_hierarch_module_enable (mca_coll_base_module_t *module,
                                     struct ompi_communicator_t *comm)
{
    int color;
    int size, rank, ret=OMPI_SUCCESS;

    struct ompi_communicator_t *lcomm=NULL;
    struct ompi_communicator_t *llcomm=NULL;
    struct mca_coll_hierarch_llead_t *llead=NULL;
    ompi_request_t **reqs=NULL;
    mca_coll_hierarch_module_t *hierarch_module = (mca_coll_hierarch_module_t *) module;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    color = hierarch_module->hier_colorarr[rank];

    /* Generate the subcommunicator based on the color returned by
       the previous function. */
    ret = ompi_comm_split ( comm, color, rank, &lcomm, 0 );
    if ( OMPI_SUCCESS != ret ) {
        goto exit;
    }
    if ( OMPI_COMM_CID_IS_LOWER ( lcomm, comm ) ) {
        /* Mark the communicator as 'extra retain' and increase the
           reference count by one more. See ompi_comm_activate
           for detailed comments
        */
        OMPI_COMM_SET_EXTRA_RETAIN (lcomm);
        OBJ_RETAIN(lcomm);
    }

    hierarch_module->hier_comm     = comm;
    hierarch_module->hier_lcomm    = lcomm;
    hierarch_module->hier_num_reqs = 2 * size;

    /* hier_reqs is an array of request *pointers*, so size it by the
     * pointer type rather than by the request structure itself. */
    reqs = malloc ( 2 * (size_t) size * sizeof(ompi_request_t *) );
    hierarch_module->hier_reqs = reqs;
    if ( NULL == reqs ) {
        /* ret is still OMPI_SUCCESS here; flag the failure explicitly */
        ret = OMPI_ERROR;
        goto exit;
    }

    /* allocate a certain number of the hierarch_llead structures, which store
       information about local leader and the according subcommunicators
    */
    llead = malloc ( sizeof *llead );
    if ( NULL == llead ) {
        ret = OMPI_ERROR;
        goto exit;
    }

    /* These two routines set all relevant entries in the mca_coll_base_comm_t
     * structure. The first one makes all entries which are independent of the
     * offset (and have to be done only once per module). The second one is
     * depending on the offset, and has to be called therefore every time we
     * need a new llcomm
     */
    mca_coll_hierarch_get_llr ( hierarch_module );
    mca_coll_hierarch_get_all_lleaders ( rank, hierarch_module, llead, 1 );

    /* Generate the lleader communicator assuming that all lleaders are the
       first process in the list of processes with the same color. A function
       generating other lleader-comms will follow soon. */
    color = MPI_UNDEFINED;
    if ( llead->am_lleader ) {
        color = 1;
    }
    ret = ompi_comm_split ( comm, color, rank, &llcomm, 0);
    if ( OMPI_SUCCESS != ret ) {
        goto exit;
    }
    if ( OMPI_COMM_CID_IS_LOWER ( llcomm, comm ) ) {
        /* Mark the communicator as 'extra retain' and increase the
           reference count by one more. See ompi_comm_activate
           for detailed explanation.
        */
        OMPI_COMM_SET_EXTRA_RETAIN (llcomm);
        OBJ_RETAIN(llcomm);
    }

    llead->llcomm = llcomm;

    /* Store it now on the data structure */
    OBJ_CONSTRUCT(&(hierarch_module->hier_llead), opal_pointer_array_t);
    opal_pointer_array_add ( &(hierarch_module->hier_llead), llead);

    if ( mca_coll_hierarch_verbose_param ) {
        mca_coll_hierarch_dump_struct (hierarch_module);
    }

 exit:
    if ( OMPI_SUCCESS != ret ) {
        /* NOTE(review): if mca_coll_hierarch_get_all_lleaders()
         * allocates memory owned by llead, that memory is not released
         * here -- confirm. */
        free(llead);

        /* Only free a communicator that was actually created; lcomm is
         * still NULL when the very first split failed. */
        if ( NULL != lcomm && MPI_COMM_NULL != lcomm ) {
            if ( hierarch_module->hier_lcomm == lcomm ) {
                /* do not leave a pointer to a freed communicator behind */
                hierarch_module->hier_lcomm = MPI_COMM_NULL;
            }
            ompi_comm_free ( &lcomm );
        }

        /* Release the request array only if this call allocated it. */
        if ( NULL != reqs ) {
            free(reqs);
            hierarch_module->hier_reqs     = NULL;
            hierarch_module->hier_num_reqs = 0;
        }
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}