int MPI_Comm_compare(MPI_Comm comm1, MPI_Comm comm2, int *result) { /* local variables */ ompi_communicator_t *comp1, *comp2; ompi_group_t *group1, *group2; int size1, size2, rsize1, rsize2; int lresult, rresult=MPI_CONGRUENT; int sameranks=1; int sameorder=1; int i, j; int found = 0; if ( MPI_PARAM_CHECK ) { OMPI_ERR_INIT_FINALIZE(FUNC_NAME); if (ompi_comm_invalid(comm1) || ompi_comm_invalid(comm2)) { return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_COMM, FUNC_NAME); } if ( NULL == result ) { return OMPI_ERRHANDLER_INVOKE(comm1, MPI_ERR_ARG, FUNC_NAME); } } comp1 = (ompi_communicator_t *) comm1; comp2 = (ompi_communicator_t *) comm2; if ( comp1->c_contextid == comp2->c_contextid ) { *result = MPI_IDENT; return MPI_SUCCESS; } if ( MPI_COMM_NULL == comm1 || MPI_COMM_NULL == comm2 ) { *result = MPI_UNEQUAL; return MPI_SUCCESS; } /* compare sizes of local and remote groups */ size1 = ompi_comm_size (comp1); size2 = ompi_comm_size (comp2); rsize1 = ompi_comm_remote_size (comp1); rsize2 = ompi_comm_remote_size (comp2); if ( size1 != size2 || rsize1 != rsize2 ) { *result = MPI_UNEQUAL; return MPI_SUCCESS; } /* Compare local groups */ /* we need to check whether the communicators contain the same processes and in the same order */ group1 = (ompi_group_t *)comp1->c_local_group; group2 = (ompi_group_t *)comp2->c_local_group; for ( i = 0; i < size1; i++ ) { if ( group1->grp_proc_pointers[i] != group2->grp_proc_pointers[i]) { sameorder = 0; break; } } for ( i = 0; i < size1; i++ ) { found = 0; for ( j = 0; j < size2; j++ ) { if ( group1->grp_proc_pointers[i] == group2->grp_proc_pointers[j]) { found = 1; break; } } if ( !found ) { sameranks = 0; break; } } if ( sameranks && sameorder ) lresult = MPI_CONGRUENT; else if ( sameranks && !sameorder ) lresult = MPI_SIMILAR; else lresult = MPI_UNEQUAL; if ( rsize1 > 0 ) { /* Compare remote groups for inter-communicators */ /* we need to check whether the communicators contain the same processes and in the same order */ sameranks 
= sameorder = 1; group1 = (ompi_group_t *)comp1->c_remote_group; group2 = (ompi_group_t *)comp2->c_remote_group; for ( i = 0; i < rsize1; i++ ) { if ( group1->grp_proc_pointers[i] != group2->grp_proc_pointers[i]) { sameorder = 0; break; } } for ( i = 0; i < rsize1; i++ ) { found = 0; for ( j = 0; j < rsize2; j++ ) { if ( group1->grp_proc_pointers[i] == group2->grp_proc_pointers[j]) { found = 1; break; } } if ( !found ) { sameranks = 0; break; } } if ( sameranks && sameorder ) rresult = MPI_CONGRUENT; else if ( sameranks && !sameorder ) rresult = MPI_SIMILAR; else rresult = MPI_UNEQUAL; } /* determine final results */ if ( MPI_CONGRUENT == rresult ) { *result = lresult; } else if ( MPI_SIMILAR == rresult ) { if ( MPI_SIMILAR == lresult || MPI_CONGRUENT == lresult ) { *result = MPI_SIMILAR; } else *result = MPI_UNEQUAL; } else if ( MPI_UNEQUAL == rresult ) *result = MPI_UNEQUAL; return MPI_SUCCESS; }
/*
 * Simple linear Alltoallv for inter-communicators.
 *
 * Builds a schedule containing, for every rank of the remote group, one send
 * (when sendcounts[peer] is non-zero) and one receive (when recvcounts[peer]
 * is non-zero), commits it and turns it into a request.  Displacements are
 * scaled by the extent of the corresponding datatype.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when the schedule cannot be
 * allocated, or the error code of the first failing call.  The schedule is
 * released on every failure after it has been created.
 */
static int nbc_alltoallv_inter_init (const void* sendbuf, const int *sendcounts, const int *sdispls,
                                     MPI_Datatype sendtype, void* recvbuf, const int *recvcounts, const int *rdispls,
                                     MPI_Datatype recvtype, struct ompi_communicator_t *comm, ompi_request_t ** request,
                                     struct mca_coll_base_module_2_3_0_t *module, bool persistent)
{
    MPI_Aint send_extent, recv_extent;
    NBC_Schedule *sched;
    int remote_size;
    int rc;

    rc = ompi_datatype_type_extent(sendtype, &send_extent);
    if (MPI_SUCCESS != rc) {
        NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", rc);
        return rc;
    }

    rc = ompi_datatype_type_extent(recvtype, &recv_extent);
    if (MPI_SUCCESS != rc) {
        NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", rc);
        return rc;
    }

    remote_size = ompi_comm_remote_size (comm);

    sched = OBJ_NEW(NBC_Schedule);
    if (OPAL_UNLIKELY(NULL == sched)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for (int peer = 0; peer < remote_size; ++peer) {
        /* outgoing block for this remote rank, skipped when empty */
        if (0 != sendcounts[peer]) {
            char *src = (char *) sendbuf + sdispls[peer] * send_extent;

            rc = NBC_Sched_send (src, false, sendcounts[peer], sendtype, peer, sched, false);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                goto release_schedule;
            }
        }

        /* incoming block from this remote rank, skipped when empty */
        if (0 != recvcounts[peer]) {
            char *dst = (char *) recvbuf + rdispls[peer] * recv_extent;

            rc = NBC_Sched_recv (dst, false, recvcounts[peer], recvtype, peer, sched, false);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                goto release_schedule;
            }
        }
    }

    rc = NBC_Sched_commit(sched);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        goto release_schedule;
    }

    rc = NBC_Schedule_request(sched, comm, (ompi_coll_libnbc_module_t*) module,
                              persistent, request, NULL);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        goto release_schedule;
    }

    return OMPI_SUCCESS;

release_schedule:
    /* single exit for every failure that happens after the schedule exists */
    OBJ_RELEASE(sched);
    return rc;
}
/* * Invoked when there's a new communicator that has been created. * Look at the communicator and decide which set of functions and * priority we want to return. */ mca_coll_base_module_t * mca_coll_basic_comm_query(struct ompi_communicator_t *comm, int *priority) { int size; mca_coll_basic_module_t *basic_module; basic_module = OBJ_NEW(mca_coll_basic_module_t); if (NULL == basic_module) return NULL; *priority = mca_coll_basic_priority; /* Allocate the data that hangs off the communicator */ if (OMPI_COMM_IS_INTER(comm)) { size = ompi_comm_remote_size(comm); } else { size = ompi_comm_size(comm); } basic_module->mccb_num_reqs = size * 2; basic_module->mccb_reqs = (ompi_request_t**) malloc(sizeof(ompi_request_t *) * basic_module->mccb_num_reqs); /* Choose whether to use [intra|inter], and [linear|log]-based * algorithms. */ basic_module->super.coll_module_enable = mca_coll_basic_module_enable; basic_module->super.ft_event = mca_coll_basic_ft_event; if (OMPI_COMM_IS_INTER(comm)) { basic_module->super.coll_allgather = mca_coll_basic_allgather_inter; basic_module->super.coll_allgatherv = mca_coll_basic_allgatherv_inter; basic_module->super.coll_allreduce = mca_coll_basic_allreduce_inter; basic_module->super.coll_alltoall = mca_coll_basic_alltoall_inter; basic_module->super.coll_alltoallv = mca_coll_basic_alltoallv_inter; basic_module->super.coll_alltoallw = mca_coll_basic_alltoallw_inter; basic_module->super.coll_barrier = mca_coll_basic_barrier_inter_lin; basic_module->super.coll_bcast = mca_coll_basic_bcast_lin_inter; basic_module->super.coll_exscan = NULL; basic_module->super.coll_gather = mca_coll_basic_gather_inter; basic_module->super.coll_gatherv = mca_coll_basic_gatherv_inter; basic_module->super.coll_reduce = mca_coll_basic_reduce_lin_inter; basic_module->super.coll_reduce_scatter = mca_coll_basic_reduce_scatter_inter; basic_module->super.coll_scan = NULL; basic_module->super.coll_scatter = mca_coll_basic_scatter_inter; basic_module->super.coll_scatterv = 
mca_coll_basic_scatterv_inter; } else if (ompi_comm_size(comm) <= mca_coll_basic_crossover) { basic_module->super.coll_allgather = mca_coll_basic_allgather_intra; basic_module->super.coll_allgatherv = mca_coll_basic_allgatherv_intra; basic_module->super.coll_allreduce = mca_coll_basic_allreduce_intra; basic_module->super.coll_alltoall = mca_coll_basic_alltoall_intra; basic_module->super.coll_alltoallv = mca_coll_basic_alltoallv_intra; basic_module->super.coll_alltoallw = mca_coll_basic_alltoallw_intra; basic_module->super.coll_barrier = mca_coll_basic_barrier_intra_lin; basic_module->super.coll_bcast = mca_coll_basic_bcast_lin_intra; basic_module->super.coll_exscan = mca_coll_basic_exscan_intra; basic_module->super.coll_gather = mca_coll_basic_gather_intra; basic_module->super.coll_gatherv = mca_coll_basic_gatherv_intra; basic_module->super.coll_reduce = mca_coll_basic_reduce_lin_intra; basic_module->super.coll_reduce_scatter = mca_coll_basic_reduce_scatter_intra; basic_module->super.coll_scan = mca_coll_basic_scan_intra; basic_module->super.coll_scatter = mca_coll_basic_scatter_intra; basic_module->super.coll_scatterv = mca_coll_basic_scatterv_intra; } else { basic_module->super.coll_allgather = mca_coll_basic_allgather_intra; basic_module->super.coll_allgatherv = mca_coll_basic_allgatherv_intra; basic_module->super.coll_allreduce = mca_coll_basic_allreduce_intra; basic_module->super.coll_alltoall = mca_coll_basic_alltoall_intra; basic_module->super.coll_alltoallv = mca_coll_basic_alltoallv_intra; basic_module->super.coll_alltoallw = mca_coll_basic_alltoallw_intra; basic_module->super.coll_barrier = mca_coll_basic_barrier_intra_log; basic_module->super.coll_bcast = mca_coll_basic_bcast_log_intra; basic_module->super.coll_exscan = mca_coll_basic_exscan_intra; basic_module->super.coll_gather = mca_coll_basic_gather_intra; basic_module->super.coll_gatherv = mca_coll_basic_gatherv_intra; basic_module->super.coll_reduce = mca_coll_basic_reduce_log_intra; 
basic_module->super.coll_reduce_scatter = mca_coll_basic_reduce_scatter_intra; basic_module->super.coll_scan = mca_coll_basic_scan_intra; basic_module->super.coll_scatter = mca_coll_basic_scatter_intra; basic_module->super.coll_scatterv = mca_coll_basic_scatterv_intra; } return &(basic_module->super); }
/*
 *	alltoallw_inter
 *
 *	Function:	- MPI_Alltoallw
 *	Accepts:	- same as MPI_Alltoallw()
 *	Returns:	- MPI_SUCCESS or an MPI error code
 *
 * Creates one persistent receive and one persistent send per rank of the
 * remote group in the module's request array (receives in slots
 * [0, size), sends in slots [size, 2*size)), starts them all, waits for
 * completion and frees them.  Displacements are applied as byte offsets.
 */
int
mca_coll_basic_alltoallw_inter(void *sbuf, int *scounts, int *sdisps,
                               struct ompi_datatype_t **sdtypes,
                               void *rbuf, int *rcounts, int *rdisps,
                               struct ompi_datatype_t **rdtypes,
                               struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    int i;
    int size;
    int err;
    char *psnd;
    char *prcv;
    int nreqs;
    MPI_Request *preq;
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;

    /* Initialize. */
    size = ompi_comm_remote_size(comm);

    /* Initiate all send/recv to/from others. */
    nreqs = size * 2;
    preq = basic_module->mccb_reqs;

    /* Post all receives first -- a simple optimization */
    for (i = 0; i < size; ++i) {
        prcv = ((char *) rbuf) + rdisps[i];
        err = MCA_PML_CALL(irecv_init(prcv, rcounts[i], rdtypes[i],
                                      i, MCA_COLL_BASE_TAG_ALLTOALLW,
                                      comm, preq++));
        if (OMPI_SUCCESS != err) {
            /* Only the first i slots were filled by this call; the
               failing slot and everything after it are not assumed to
               hold a request that may be freed. */
            mca_coll_basic_free_reqs(basic_module->mccb_reqs, i);
            return err;
        }
    }

    /* Now post all sends */
    for (i = 0; i < size; ++i) {
        psnd = ((char *) sbuf) + sdisps[i];
        err = MCA_PML_CALL(isend_init(psnd, scounts[i], sdtypes[i],
                                      i, MCA_COLL_BASE_TAG_ALLTOALLW,
                                      MCA_PML_BASE_SEND_STANDARD, comm,
                                      preq++));
        if (OMPI_SUCCESS != err) {
            /* All `size` receives plus the first i sends exist. */
            mca_coll_basic_free_reqs(basic_module->mccb_reqs, size + i);
            return err;
        }
    }

    /* Start your engines.  This will never return an error. */
    MCA_PML_CALL(start(nreqs, basic_module->mccb_reqs));

    /* Wait for them all.  If there's an error, note that we don't care
     * what the error was -- just that there *was* an error.  The PML
     * will finish all requests, even if one or more of them fail.
     * i.e., by the end of this call, all the requests are free-able.
     * So free them anyway -- even if there was an error, and return the
     * error after we free everything. */
    err = ompi_request_wait_all(nreqs, basic_module->mccb_reqs,
                                MPI_STATUSES_IGNORE);

    /* Free the requests. */
    mca_coll_basic_free_reqs(basic_module->mccb_reqs, nreqs);

    /* All done */
    return err;
}
/* * reduce_lin_inter * * Function: - reduction using O(N) algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_reduce_lin_inter(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm) { int i, rank, err, size; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *pml_buffer = NULL; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_remote_size(comm); if (MPI_PROC_NULL == root) { /* do nothing */ err = OMPI_SUCCESS; } else if (MPI_ROOT != root) { /* If not root, send data to the root. */ err = MCA_PML_CALL(send(sbuf, count, dtype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); } else { /* Root receives and reduces messages */ ompi_ddt_get_extent(dtype, &lb, &extent); ompi_ddt_get_true_extent(dtype, &true_lb, &true_extent); free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - lb; /* Initialize the receive buffer. */ err = MCA_PML_CALL(recv(rbuf, count, dtype, 0, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Loop receiving and calling reduction function (C or Fortran). */ for (i = 1; i < size; i++) { err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Perform the reduction */ ompi_op_reduce(op, pml_buffer, rbuf, count, dtype); } if (NULL != free_buffer) { free(free_buffer); } } /* All done */ return err; }
/*
 * Init module on the communicator
 *
 * Allocates the per-communicator tuned data, optionally installs the
 * dynamic-decision entry points, clears all cached topologies and attaches
 * the data to the module.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR when the data cannot be allocated.
 */
static int
tuned_module_enable( mca_coll_base_module_t *module,
                     struct ompi_communicator_t *comm )
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t *) module;
    mca_coll_tuned_comm_t *data = NULL;
    size_t nbytes;
    int preallocate;
    int size;

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init called."));

    /* Number of peers: remote group for inter-communicators, local group
       otherwise. */
    size = OMPI_COMM_IS_INTER(comm) ? ompi_comm_remote_size(comm)
                                    : ompi_comm_size(comm);

    /*
     * The data block is always allocated because the TUNED modules use it;
     * any special info is placed after the default data.
     *
     * On very large systems the request array might be too expensive, so an
     * MCA parameter (whose default is set very high) decides whether the
     * array is appended to the block: it is only preallocated while the
     * communicator size stays within that limit.
     */
    preallocate = (size <= ompi_coll_tuned_preallocate_memory_comm_size_limit);

    nbytes = sizeof(struct mca_coll_tuned_comm_t);
    if (preallocate) {
        nbytes += (sizeof(ompi_request_t *) * size * 2);
    }

    data = (mca_coll_tuned_comm_t*)malloc(nbytes);
    if (NULL == data) {
        return OMPI_ERROR;
    }

    if (preallocate) {
        /* the request array lives directly behind the struct */
        data->mcct_reqs = (ompi_request_t **) (data + 1);
        data->mcct_num_reqs = size * 2;
    } else {
        data->mcct_reqs = (ompi_request_t **) NULL;
        data->mcct_num_reqs = 0;
    }

    if (ompi_coll_tuned_use_dynamic_rules) {
        OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init MCW & Dynamic"));

        /**
         * Reset it to 0, it will be enabled again if we discover any need
         * for dynamic decisions.
         */
        ompi_coll_tuned_use_dynamic_rules = false;

        /**
         * next dynamic state, recheck all forced rules as well
         * warning, we should check to make sure this is really an INTRA
         * comm here...
         */
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLGATHER,
                                      tuned_module->super.coll_allgather = ompi_coll_tuned_allgather_intra_dec_dynamic);
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLGATHERV,
                                      tuned_module->super.coll_allgatherv = ompi_coll_tuned_allgatherv_intra_dec_dynamic);
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLREDUCE,
                                      tuned_module->super.coll_allreduce = ompi_coll_tuned_allreduce_intra_dec_dynamic);
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLTOALL,
                                      tuned_module->super.coll_alltoall = ompi_coll_tuned_alltoall_intra_dec_dynamic);
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLTOALLV,
                                      tuned_module->super.coll_alltoallv = ompi_coll_tuned_alltoallv_intra_dec_dynamic);
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLTOALLW,
                                      tuned_module->super.coll_alltoallw = NULL);
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, BARRIER,
                                      tuned_module->super.coll_barrier = ompi_coll_tuned_barrier_intra_dec_dynamic);
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, BCAST,
                                      tuned_module->super.coll_bcast = ompi_coll_tuned_bcast_intra_dec_dynamic);
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, EXSCAN,
                                      tuned_module->super.coll_exscan = NULL);
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, GATHER,
                                      tuned_module->super.coll_gather = ompi_coll_tuned_gather_intra_dec_dynamic);
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, GATHERV,
                                      tuned_module->super.coll_gatherv = NULL);
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, REDUCE,
                                      tuned_module->super.coll_reduce = ompi_coll_tuned_reduce_intra_dec_dynamic);
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, REDUCESCATTER,
                                      tuned_module->super.coll_reduce_scatter = ompi_coll_tuned_reduce_scatter_intra_dec_dynamic);
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, SCAN,
                                      tuned_module->super.coll_scan = NULL);
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, SCATTER,
                                      tuned_module->super.coll_scatter = ompi_coll_tuned_scatter_intra_dec_dynamic);
        COLL_TUNED_EXECUTE_IF_DYNAMIC(data, SCATTERV,
                                      tuned_module->super.coll_scatterv = NULL);

        if (!ompi_coll_tuned_use_dynamic_rules) {
            /* no real need for dynamic decisions */
            OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:module_enable switch back to fixed"
                         " decision by lack of dynamic rules"));
        }
    }

    /* No topology is cached yet. */
    data->cached_ntree            = NULL;  /* general n fan out tree */
    data->cached_bintree          = NULL;  /* binary tree */
    data->cached_bmtree           = NULL;  /* binomial tree */
    data->cached_in_order_bmtree  = NULL;  /* in-order binomial tree */
    data->cached_chain            = NULL;  /* chains (fanout followed by pipelines) */
    data->cached_pipeline         = NULL;  /* standard pipeline */
    data->cached_in_order_bintree = NULL;  /* in-order binary tree */

    /* All done */
    tuned_module->tuned_data = data;

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Tuned is in use"));
    return OMPI_SUCCESS;
}
int ompi_mpi_abort(struct ompi_communicator_t* comm, int errcode, bool kill_remote_of_intercomm) { int count = 0, i; char *msg, *host, hostname[MAXHOSTNAMELEN]; pid_t pid = 0; orte_process_name_t *abort_procs; orte_std_cntr_t nabort_procs; /* Protection for recursive invocation */ if (have_been_invoked) { return OMPI_SUCCESS; } have_been_invoked = true; /* If ORTE is initialized, use its nodename. Otherwise, call gethostname. */ if (orte_initialized) { host = orte_process_info.nodename; } else { gethostname(hostname, sizeof(hostname)); host = hostname; } pid = getpid(); /* Should we print a stack trace? */ if (ompi_mpi_abort_print_stack) { char **messages; int len, i; if (OMPI_SUCCESS == opal_backtrace_buffer(&messages, &len)) { for (i = 0; i < len; ++i) { fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid, i, messages[i]); fflush(stderr); } free(messages); } else { /* This will print an message if it's unable to print the backtrace, so we don't need an additional "else" clause if opal_backtrace_print() is not supported. */ opal_backtrace_print(stderr); } } /* Notify the debugger that we're about to abort */ if (errcode < 0 || asprintf(&msg, "[%s:%d] aborting with MPI error %s%s", host, (int) pid, ompi_mpi_errnum_get_string(errcode), ompi_mpi_abort_print_stack ? " (stack trace available on stderr)" : "") < 0) { msg = NULL; } ompi_debugger_notify_abort(msg); if (NULL != msg) { free(msg); } /* Should we wait for a while before aborting? */ if (0 != ompi_mpi_abort_delay) { if (ompi_mpi_abort_delay < 0) { fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n", host, (int) pid); fflush(stderr); while (1) { sleep(5); } } else { fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n", host, (int) pid, ompi_mpi_abort_delay); do { sleep(1); } while (--ompi_mpi_abort_delay > 0); } } /* If OMPI isn't setup yet/any more, then don't even try killing everyone. 
Ditto for ORTE (e.g., ORTE may be initialized before MPI_INIT is over, but ompi_initialized will be false because communicators are not setup yet). Sorry, Charlie... */ if (!orte_initialized || !ompi_mpi_initialized || ompi_mpi_finalized) { fprintf(stderr, "[%s:%d] Abort %s completed successfully; not able to guarantee that all other processes were killed!\n", host, (int) pid, ompi_mpi_finalized ? "after MPI_FINALIZE" : "before MPI_INIT"); exit(errcode); } /* abort local procs in the communicator. If the communicator is an intercommunicator AND the abort has explicitly requested that we abort the remote procs, then do that as well. */ nabort_procs = ompi_comm_size(comm); if (kill_remote_of_intercomm) { /* ompi_comm_remote_size() returns 0 if not an intercomm, so this is cool */ nabort_procs += ompi_comm_remote_size(comm); } abort_procs = (orte_process_name_t*)malloc(sizeof(orte_process_name_t) * nabort_procs); if (NULL == abort_procs) { /* quick clean orte and get out */ orte_errmgr.abort(errcode, "Abort unable to malloc memory to kill procs"); } /* put all the local procs in the abort list */ for (i = 0 ; i < ompi_comm_size(comm) ; ++i) { if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &comm->c_local_group->grp_proc_pointers[i]->proc_name, ORTE_PROC_MY_NAME)) { assert(count <= nabort_procs); abort_procs[count++] = comm->c_local_group->grp_proc_pointers[i]->proc_name; } else { /* don't terminate me just yet */ nabort_procs--; } } /* if requested, kill off remote procs too */ if (kill_remote_of_intercomm) { for (i = 0 ; i < ompi_comm_remote_size(comm) ; ++i) { if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &comm->c_remote_group->grp_proc_pointers[i]->proc_name, ORTE_PROC_MY_NAME)) { assert(count <= nabort_procs); abort_procs[count++] = comm->c_remote_group->grp_proc_pointers[i]->proc_name; } else { /* don't terminate me just yet */ nabort_procs--; } } } if (nabort_procs > 0) { #if 0 int ret = 
orte_errmgr.abort_procs_request(abort_procs, nabort_procs); if (OMPI_SUCCESS != ret) { orte_errmgr.abort(ret, "Open MPI failed to abort procs as requested (%d). Exiting.", ret); } #endif } /* now that we've aborted everyone else, gracefully die. */ orte_errmgr.abort(errcode, NULL); return OMPI_SUCCESS; }
/*
 * Build the schedule for a barrier on an inter-communicator.
 *
 * Every rank exchanges an empty message with remote rank 0.  Local rank 0
 * additionally receives an empty message from every other remote rank
 * first and, after a schedule barrier, sends one back to each of them.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when the schedule cannot
 * be allocated, or the error code of the first failing call; the schedule
 * is released on every failure after it has been created.
 */
static int nbc_barrier_inter_init(struct ompi_communicator_t *comm, ompi_request_t ** request,
                                  struct mca_coll_base_module_2_3_0_t *module, bool persistent)
{
    const int my_rank = ompi_comm_rank (comm);
    const int remote_size = ompi_comm_remote_size (comm);
    const int is_local_root = (0 == my_rank);
    NBC_Schedule *sched;
    int rc;

    sched = OBJ_NEW(NBC_Schedule);
    if (OPAL_UNLIKELY(NULL == sched)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if (is_local_root) {
        /* collect an empty message from every non-root remote rank */
        for (int peer = 1 ; peer < remote_size ; ++peer) {
            rc = NBC_Sched_recv (NULL, false, 0, MPI_BYTE, peer, sched, false);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                goto release_schedule;
            }
        }
    }

    /* synchronize with the remote root */
    rc = NBC_Sched_recv (NULL, false, 0, MPI_BYTE, 0, sched, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        goto release_schedule;
    }

    rc = NBC_Sched_send (NULL, false, 0, MPI_BYTE, 0, sched, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        goto release_schedule;
    }

    if (is_local_root) {
        /* wait for the remote root */
        rc = NBC_Sched_barrier (sched);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            goto release_schedule;
        }

        /* inform remote peers that all local peers have entered the barrier */
        for (int peer = 1; peer < remote_size ; ++peer) {
            rc = NBC_Sched_send (NULL, false, 0, MPI_BYTE, peer, sched, false);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                goto release_schedule;
            }
        }
    }

    rc = NBC_Sched_commit (sched);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        goto release_schedule;
    }

    rc = NBC_Schedule_request(sched, comm, (ompi_coll_libnbc_module_t*) module,
                              persistent, request, NULL);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        goto release_schedule;
    }

    return OMPI_SUCCESS;

release_schedule:
    /* shared failure exit: drop the schedule and report the error */
    OBJ_RELEASE(sched);
    return rc;
}
/*
 * Non-blocking allreduce on an inter-communicator.
 *
 * Obtains a handle, attaches a temporary buffer of extent * count bytes and
 * a new schedule to it, fills the schedule through allred_sched_linear(),
 * commits and starts it, and finally hands the handle back through
 * *request.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when the temporary buffer
 * or the schedule cannot be allocated, or the error code of the first
 * failing call.  Once the handle exists, every failure returns it via
 * NBC_Return_handle().
 */
int ompi_coll_libnbc_iallreduce_inter(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
                                      struct ompi_communicator_t *comm, ompi_request_t ** request,
                                      struct mca_coll_base_module_2_1_0_t *module)
{
    NBC_Handle *handle;
    NBC_Schedule *sched;
    MPI_Aint extent;
    int type_size;
    int my_rank, remote_size;
    int rc;

    my_rank = ompi_comm_rank (comm);
    remote_size = ompi_comm_remote_size (comm);

    rc = MPI_Type_extent(datatype, &extent);
    if (MPI_SUCCESS != rc) {
        NBC_Error("MPI Error in MPI_Type_extent() (%i)", rc);
        return rc;
    }

    rc = MPI_Type_size(datatype, &type_size);
    if (MPI_SUCCESS != rc) {
        NBC_Error("MPI Error in MPI_Type_size() (%i)", rc);
        return rc;
    }

    rc = NBC_Init_handle (comm, &handle, (ompi_coll_libnbc_module_t*) module);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    handle->tmpbuf = malloc (extent * count);
    if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto return_handle;
    }

    sched = OBJ_NEW(NBC_Schedule);
    if (OPAL_UNLIKELY(NULL == sched)) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto return_handle;
    }

    /* ensure the schedule is released with the handle on error */
    handle->schedule = sched;

    rc = allred_sched_linear (my_rank, remote_size, sendbuf, recvbuf, count,
                              datatype, op, extent, type_size, sched, handle);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        goto return_handle;
    }

    rc = NBC_Sched_commit(sched);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        goto return_handle;
    }

    rc = NBC_Start(handle, sched);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        goto return_handle;
    }

    *request = (ompi_request_t *) handle;

    /* tmpbuf is freed with the handle */
    return OMPI_SUCCESS;

return_handle:
    /* shared failure exit for everything after the handle was obtained */
    NBC_Return_handle (handle);
    return rc;
}
/*
 *	scatterv_inter
 *
 *	Function:	- scatterv operation
 *	Accepts:	- same arguments as MPI_Scatterv()
 *	Returns:	- MPI_SUCCESS or error code
 *
 * Behaviour by value of `root`:
 *   - MPI_PROC_NULL: nothing is done and OMPI_SUCCESS is returned.
 *   - MPI_ROOT:      this process sends the counts array and then the data
 *                    (described by an indexed datatype built from scounts
 *                    and disps) to remote rank 0.
 *   - a rank number: local rank 0 receives the counts and the packed data
 *                    from that remote rank; then all local ranks take part
 *                    in a scatterv on the local communicator rooted at 0.
 *
 * All temporary buffers and the temporary datatype are released on every
 * exit path.  OMPI_ERR_OUT_OF_RESOURCE is returned when a temporary buffer
 * cannot be allocated.
 */
int
mca_coll_inter_scatterv_inter(const void *sbuf, const int *scounts,
                              const int *disps, struct ompi_datatype_t *sdtype,
                              void *rbuf, int rcount,
                              struct ompi_datatype_t *rdtype, int root,
                              struct ompi_communicator_t *comm,
                              mca_coll_base_module_t *module)
{
    int i, rank, size, err, total, size_local;
    int *counts = NULL, *displace = NULL;
    char *ptmp = NULL;
    MPI_Aint incr;
    MPI_Aint extent;
    MPI_Aint lb;
    ompi_datatype_t *ndtype;

    /* Initialize */
    rank = ompi_comm_rank(comm);
    size = ompi_comm_remote_size(comm);
    size_local = ompi_comm_size(comm);

    if (MPI_PROC_NULL == root) {
        /* do nothing */
        return OMPI_SUCCESS;
    }

    if (MPI_ROOT == root) {
        /* first the counts, one per remote rank */
        err = MCA_PML_CALL(send(scounts, size, MPI_INT, 0,
                                MCA_COLL_BASE_TAG_SCATTERV,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (OMPI_SUCCESS != err) {
            return err;
        }

        /* then the data, picked out of sbuf by an indexed datatype */
        err = ompi_datatype_create_indexed(size, scounts, disps, sdtype, &ndtype);
        if (OMPI_SUCCESS != err) {
            return err;
        }
        err = ompi_datatype_commit(&ndtype);
        if (OMPI_SUCCESS != err) {
            ompi_datatype_destroy(&ndtype);
            return err;
        }

        err = MCA_PML_CALL(send(sbuf, 1, ndtype, 0,
                                MCA_COLL_BASE_TAG_SCATTERV,
                                MCA_PML_BASE_SEND_STANDARD, comm));

        /* the temporary datatype is destroyed whether or not the send
           succeeded */
        ompi_datatype_destroy(&ndtype);
        return err;
    }

    if (0 == rank) {
        /* local root receives the counts from the root */
        counts = (int *) malloc(sizeof(int) * size_local);
        if (NULL == counts) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        err = MCA_PML_CALL(recv(counts, size_local, MPI_INT,
                                root, MCA_COLL_BASE_TAG_SCATTERV,
                                comm, MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            goto cleanup;
        }

        /* calculate the whole buffer size and receive it from root */
        err = ompi_datatype_get_extent(rdtype, &lb, &extent);
        if (OMPI_SUCCESS != err) {
            err = OMPI_ERROR;
            goto cleanup;
        }
        incr = 0;
        total = 0;
        for (i = 0; i < size_local; i++) {
            incr = incr + extent * counts[i];
            total = total + counts[i];
        }
        if (incr > 0) {
            ptmp = (char *) malloc(incr);
            if (NULL == ptmp) {
                err = OMPI_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
        }
        err = MCA_PML_CALL(recv(ptmp, total, rdtype,
                                root, MCA_COLL_BASE_TAG_SCATTERV,
                                comm, MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            goto cleanup;
        }

        /* set the local displacement i.e. no displacements here: the
           received data is contiguous, so each block starts where the
           previous one ended */
        displace = (int *) malloc(sizeof(int) * size_local);
        if (NULL == displace) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        displace[0] = 0;
        for (i = 1; i < size_local; i++) {
            displace[i] = displace[i-1] + counts[i-1];
        }
    }

    /* perform the scatterv locally; ranks other than 0 pass NULL for the
       send-side arguments */
    err = comm->c_local_comm->c_coll.coll_scatterv(ptmp, counts, displace,
                                                   rdtype, rbuf, rcount,
                                                   rdtype, 0, comm->c_local_comm,
                                                   comm->c_local_comm->c_coll.coll_scatterv_module);

cleanup:
    /* free(NULL) is a no-op, so this is safe from every jump above */
    free(ptmp);
    free(displace);
    free(counts);

    /* All done */
    return err;
}
/*
 * MPI_Gatherv - public entry point.
 *
 * When MPI_PARAM_CHECK is enabled the arguments are validated, then the call
 * is forwarded to the coll component selected on the communicator
 * (comm->c_coll.coll_gatherv).  Any error is reported through the
 * communicator's error handler.
 *
 * Validation performed:
 *   - comm must be valid; MPI_IN_PLACE is rejected as sendbuf on non-root
 *     ranks and as recvbuf on the root.
 *   - intra-communicator: root must be in [0, size); send datatype/count are
 *     checked unless sendbuf is MPI_IN_PLACE; on the root, displs and
 *     recvcounts must be non-NULL, every recvcounts[i] non-negative and
 *     recvtype not MPI_DATATYPE_NULL.
 *   - inter-communicator: root must be a valid remote rank, MPI_ROOT or
 *     MPI_PROC_NULL; senders have their datatype/count checked; the process
 *     passing MPI_ROOT has displs/recvcounts/recvtype checked.  The root
 *     receives from the *remote* group, so recvcounts is validated over
 *     ompi_comm_remote_size() entries.
 */
int MPI_Gatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
                void *recvbuf, int *recvcounts, int *displs,
                MPI_Datatype recvtype, int root, MPI_Comm comm)
{
    int i, size, err;

    if (MPI_PARAM_CHECK) {
        err = MPI_SUCCESS;
        OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
        if (ompi_comm_invalid(comm)) {
            return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_COMM,
                                          FUNC_NAME);
        } else if ((ompi_comm_rank(comm) != root && MPI_IN_PLACE == sendbuf) ||
                   (ompi_comm_rank(comm) == root && MPI_IN_PLACE == recvbuf)) {
            return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_ARG, FUNC_NAME);
        }

        /* Errors for intracommunicators */

        if (OMPI_COMM_IS_INTRA(comm)) {

            /* Errors for all ranks */

            if ((root >= ompi_comm_size(comm)) || (root < 0)) {
                err = MPI_ERR_ROOT;
            } else if (MPI_IN_PLACE != sendbuf) {
                OMPI_CHECK_DATATYPE_FOR_SEND(err, sendtype, sendcount);
            }
            OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME);

            /* Errors for the root.  Some of these could have been
               combined into compound if statements above, but since
               this whole section can be compiled out (or turned off at
               run time) for efficiency, it's more clear to separate
               them out into individual tests. */

            if (ompi_comm_rank(comm) == root) {
                if (NULL == displs) {
                    return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_ARG, FUNC_NAME);
                }

                if (NULL == recvcounts) {
                    return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_COUNT, FUNC_NAME);
                }

                size = ompi_comm_size(comm);
                for (i = 0; i < size; ++i) {
                    if (recvcounts[i] < 0) {
                        return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_COUNT, FUNC_NAME);
                    } else if (MPI_DATATYPE_NULL == recvtype) {
                        return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_TYPE, FUNC_NAME);
                    }
                }
            }
        }

        /* Errors for intercommunicators */

        else {
            if (! ((root >= 0 && root < ompi_comm_remote_size(comm)) ||
                   MPI_ROOT == root || MPI_PROC_NULL == root)) {
                return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_ROOT, FUNC_NAME);
            }

            /* Errors for the senders */

            if (MPI_ROOT != root && MPI_PROC_NULL != root) {
                OMPI_CHECK_DATATYPE_FOR_SEND(err, sendtype, sendcount);
                OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME);
            }

            /* Errors for the root.  Ditto on the comment above -- these
               error checks could have been combined above, but let's
               make the code easier to read. */

            else if (MPI_ROOT == root) {
                if (NULL == displs) {
                    return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_ARG, FUNC_NAME);
                }

                if (NULL == recvcounts) {
                    return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_COUNT, FUNC_NAME);
                }

                /* The root gathers from the remote group, so the count
                   array has one entry per remote process -- not per
                   local process. */
                size = ompi_comm_remote_size(comm);
                for (i = 0; i < size; ++i) {
                    if (recvcounts[i] < 0) {
                        return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_COUNT, FUNC_NAME);
                    } else if (MPI_DATATYPE_NULL == recvtype) {
                        return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_TYPE, FUNC_NAME);
                    }
                }
            }
        }
    }

    /* Invoke the coll component to perform the back-end operation */

    err = comm->c_coll.coll_gatherv(sendbuf, sendcount, sendtype, recvbuf,
                                    recvcounts, displs,
                                    recvtype, root, comm);
    OMPI_ERRHANDLER_RETURN(err, comm, err, FUNC_NAME);
}
/*
 *	allgatherv_inter
 *
 *	Function:	- allgatherv using other MPI collectives
 *	Accepts:	- same as MPI_Allgatherv()
 *	Returns:	- MPI_SUCCESS or error code
 *
 *	Steps:
 *	  1. gather every local scount to local rank 0;
 *	  2. gatherv the local contributions into a temporary buffer on rank 0;
 *	  3. rank 0 exchanges that buffer with rank 0 of the remote group,
 *	     receiving straight into rbuf through an indexed datatype built
 *	     from rcounts/disps;
 *	  4. rbuf is broadcast over the local communicator.
 *
 *	All temporaries are released at the common exit label.
 */
int
mca_coll_inter_allgatherv_inter(const void *sbuf, int scount,
                                struct ompi_datatype_t *sdtype,
                                void *rbuf, const int *rcounts, const int *disps,
                                struct ompi_datatype_t *rdtype,
                                struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module)
{
    int i, rank, size, size_local, total = 0, err;
    int *count = NULL, *displace = NULL;
    char *ptmp_free = NULL, *ptmp = NULL;
    ompi_datatype_t *ndtype = NULL;

    rank = ompi_comm_rank(comm);
    size_local = ompi_comm_size(comm->c_local_comm);
    size = ompi_comm_remote_size(comm);

    if (0 == rank) {
        count = (int *) malloc(sizeof(int) * size_local);
        displace = (int *) malloc(sizeof(int) * size_local);
        if ((NULL == count) || (NULL == displace)) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

    /* Local gather to get the scount of each process */
    err = comm->c_local_comm->c_coll->coll_gather(&scount, 1, MPI_INT,
                                                  count, 1, MPI_INT,
                                                  0, comm->c_local_comm,
                                                  comm->c_local_comm->c_coll->coll_gather_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    if (0 == rank) {
        displace[0] = 0;
        for (i = 1; i < size_local; i++) {
            displace[i] = displace[i - 1] + count[i - 1];
        }
        total = 0;
        for (i = 0; i < size_local; i++) {
            total = total + count[i];
        }
        /* ptmp stays NULL when the local group contributes no data */
        if (total > 0) {
            ptrdiff_t gap, span;
            span = opal_datatype_span(&sdtype->super, total, &gap);
            ptmp_free = (char *) malloc(span);
            if (NULL == ptmp_free) {
                err = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            ptmp = ptmp_free - gap;
        }
    }

    err = comm->c_local_comm->c_coll->coll_gatherv(sbuf, scount, sdtype,
                                                   ptmp, count, displace,
                                                   sdtype, 0, comm->c_local_comm,
                                                   comm->c_local_comm->c_coll->coll_gatherv_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    /* Describe the layout of the remote contributions inside rbuf.  The
     * datatype is used for both the root exchange and the local bcast, so
     * do not continue if it could not be built. */
    err = ompi_datatype_create_indexed(size, rcounts, disps, rdtype, &ndtype);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }
    err = ompi_datatype_commit(&ndtype);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    if (0 == rank) {
        /* Exchange data between roots */
        err = ompi_coll_base_sendrecv_actual(ptmp, total, sdtype, 0,
                                             MCA_COLL_BASE_TAG_ALLGATHERV,
                                             rbuf, 1, ndtype, 0,
                                             MCA_COLL_BASE_TAG_ALLGATHERV,
                                             comm, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }

    /* bcast the message to all the local processes */
    err = comm->c_local_comm->c_coll->coll_bcast(rbuf, 1, ndtype,
                                                 0, comm->c_local_comm,
                                                 comm->c_local_comm->c_coll->coll_bcast_module);

 exit:
    if (NULL != ndtype) {
        ompi_datatype_destroy(&ndtype);
    }
    if (NULL != ptmp_free) {
        free(ptmp_free);
    }
    if (NULL != displace) {
        free(displace);
    }
    if (NULL != count) {
        free(count);
    }

    return err;
}
/*
 * Simple linear Alltoallw for inter-communicators.
 *
 * Builds a schedule with one send and one receive per remote rank (peers
 * whose count is zero are skipped), commits it, attaches it to a fresh
 * handle and starts it.  On success the handle is returned through
 * *request.  Displacements are applied as raw byte offsets to the buffers.
 */
int ompi_coll_libnbc_ialltoallw_inter (void* sendbuf, int *sendcounts, int *sdispls,
                                       MPI_Datatype sendtypes[], void* recvbuf, int *recvcounts, int *rdispls,
                                       MPI_Datatype recvtypes[], struct ompi_communicator_t *comm, ompi_request_t ** request,
                                       struct mca_coll_base_module_2_1_0_t *module)
{
  ompi_coll_libnbc_module_t *nbc_module = (ompi_coll_libnbc_module_t*) module;
  const int remote_size = ompi_comm_remote_size (comm);
  NBC_Schedule *sched;
  NBC_Handle *nbc_handle;
  int rc;

  sched = OBJ_NEW(NBC_Schedule);
  if (OPAL_UNLIKELY(NULL == sched)) {
    return OMPI_ERR_OUT_OF_RESOURCE;
  }

  for (int peer = 0 ; peer < remote_size ; ++peer) {
    /* outgoing message for this peer */
    if (0 != sendcounts[peer]) {
      char *src = (char *) sendbuf + sdispls[peer];

      rc = NBC_Sched_send (src, false, sendcounts[peer], sendtypes[peer], peer, sched, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        goto drop_schedule;
      }
    }

    /* incoming message from this peer */
    if (0 != recvcounts[peer]) {
      char *dst = (char *) recvbuf + rdispls[peer];

      rc = NBC_Sched_recv (dst, false, recvcounts[peer], recvtypes[peer], peer, sched, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        goto drop_schedule;
      }
    }
  }

  rc = NBC_Sched_commit (sched);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
    goto drop_schedule;
  }

  rc = NBC_Init_handle (comm, &nbc_handle, nbc_module);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
    goto drop_schedule;
  }

  rc = NBC_Start (nbc_handle, sched);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
    /* once a handle exists, only the handle is returned here */
    NBC_Return_handle (nbc_handle);
    return rc;
  }

  *request = (ompi_request_t *) nbc_handle;

  return OMPI_SUCCESS;

drop_schedule:
  /* failure before the schedule was started: release it */
  OBJ_RELEASE(sched);
  return rc;
}
/*
 *	allreduce_inter
 *
 *	Function:	- allreduce using other MPI collectives
 *	Accepts:	- same as MPI_Allreduce()
 *	Returns:	- MPI_SUCCESS or error code
 *
 *	The local group first reduces into a scratch buffer on local rank 0.
 *	The two rank-0 processes then swap their partial results with a
 *	non-blocking receive/send pair, and finally rbuf is broadcast over
 *	the local communicator.
 */
int
mca_coll_inter_allreduce_inter(void *sbuf, void *rbuf, int count,
                               struct ompi_datatype_t *dtype,
                               struct ompi_op_t *op,
                               struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    const int root = 0;
    int rc, my_rank, rsize;
    ptrdiff_t lb, extent;
    char *scratch_base = NULL;
    char *scratch = NULL;
    ompi_request_t *xchg[2];
    struct ompi_communicator_t *local;

    my_rank = ompi_comm_rank(comm);
    rsize = ompi_comm_remote_size(comm);

    /* Scratch space for the local reduction result */
    rc = ompi_ddt_get_extent(dtype, &lb, &extent);
    if (OMPI_SUCCESS != rc) {
        return OMPI_ERROR;
    }

    scratch_base = (char *) malloc(count * extent);
    if (NULL == scratch_base) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    /* shift by the lower bound so the datatype maps onto the allocation */
    scratch = scratch_base - lb;

    /* Reduce inside the local group */
    local = comm->c_local_comm;
    rc = local->c_coll.coll_reduce(sbuf, scratch, count, dtype, op, root,
                                   local, local->c_coll.coll_reduce_module);
    if (OMPI_SUCCESS != rc) {
        goto cleanup;
    }

    if (root == my_rank) {
        /* Swap results between the two roots; the receive is posted
         * before the send to avoid deadlock */
        rc = MCA_PML_CALL(irecv(rbuf, count, dtype, 0,
                                MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                &(xchg[0])));
        if (OMPI_SUCCESS != rc) {
            goto cleanup;
        }

        rc = MCA_PML_CALL(isend(scratch, count, dtype, 0,
                                MCA_COLL_BASE_TAG_ALLREDUCE,
                                MCA_PML_BASE_SEND_STANDARD,
                                comm, &(xchg[1])));
        if (OMPI_SUCCESS != rc) {
            goto cleanup;
        }

        rc = ompi_request_wait_all(2, xchg, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != rc) {
            goto cleanup;
        }
    }

    /* Hand the remote group's result to every local process */
    local = comm->c_local_comm;
    rc = local->c_coll.coll_bcast(rbuf, count, dtype, root, local,
                                  local->c_coll.coll_bcast_module);

 cleanup:
    free(scratch_base);

    return rc;
}
/*
 *	alltoallv_inter
 *
 *	Function:	- MPI_Alltoallv
 *	Accepts:	- same as MPI_Alltoallv()
 *	Returns:	- MPI_SUCCESS or an MPI error code
 *
 *	Posts one non-blocking receive and one non-blocking send per remote
 *	rank, using the request array cached on the module, then waits for
 *	all of them.  Zero-length messages are not posted; their request
 *	slot is set to MPI_REQUEST_NULL instead.
 */
int
mca_coll_basic_alltoallv_inter(const void *sbuf, const int *scounts, const int *sdisps,
                               struct ompi_datatype_t *sdtype, void *rbuf,
                               const int *rcounts, const int *rdisps,
                               struct ompi_datatype_t *rdtype,
                               struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
    ompi_request_t **recv_reqs = basic_module->mccb_reqs;
    ompi_request_t **send_reqs;
    MPI_Aint send_ext;
    MPI_Aint recv_ext;
    size_t total_reqs;
    int peer, remote, rc;

    /* Initialize. */
    remote = ompi_comm_remote_size(comm);
    ompi_datatype_type_extent(sdtype, &send_ext);
    ompi_datatype_type_extent(rdtype, &recv_ext);

    /* receives occupy the first half of the array, sends the second */
    total_reqs = remote * 2;
    send_reqs = recv_reqs + remote;

    /* Post all receives first */
    for (peer = 0; peer < remote; ++peer) {
        char *dst = ((char *) rbuf) + (rdisps[peer] * recv_ext);

        if (rcounts[peer] <= 0) {
            /* nothing to receive from this peer */
            recv_reqs[peer] = MPI_REQUEST_NULL;
            continue;
        }

        rc = MCA_PML_CALL(irecv(dst, rcounts[peer], rdtype,
                                peer, MCA_COLL_BASE_TAG_ALLTOALLV, comm,
                                &recv_reqs[peer]));
        if (MPI_SUCCESS != rc) {
            return rc;
        }
    }

    /* Now post all sends */
    for (peer = 0; peer < remote; ++peer) {
        char *src = ((char *) sbuf) + (sdisps[peer] * send_ext);

        if (scounts[peer] <= 0) {
            /* nothing to send to this peer */
            send_reqs[peer] = MPI_REQUEST_NULL;
            continue;
        }

        rc = MCA_PML_CALL(isend(src, scounts[peer], sdtype,
                                peer, MCA_COLL_BASE_TAG_ALLTOALLV,
                                MCA_PML_BASE_SEND_STANDARD, comm,
                                &send_reqs[peer]));
        if (MPI_SUCCESS != rc) {
            return rc;
        }
    }

    /* Wait for every posted operation; the result is returned as is */
    rc = ompi_request_wait_all(total_reqs, recv_reqs, MPI_STATUSES_IGNORE);

    /* All done */
    return rc;
}
/*
 *	alltoall_inter
 *
 *	Function:	- MPI_Alltoall
 *	Accepts:	- same as MPI_Alltoall()
 *	Returns:	- MPI_SUCCESS or an MPI error code
 *
 *	One non-blocking receive and one non-blocking send are posted for each
 *	remote rank.  The request array cached on the module holds the
 *	receive requests in its first half and the send requests in its
 *	second half.
 */
int
mca_coll_basic_alltoall_inter(const void *sbuf, int scount,
                              struct ompi_datatype_t *sdtype,
                              void *rbuf, int rcount,
                              struct ompi_datatype_t *rdtype,
                              struct ompi_communicator_t *comm,
                              mca_coll_base_module_t *module)
{
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
    ompi_request_t **all_reqs;
    MPI_Aint lb;
    MPI_Aint send_stride;
    MPI_Aint recv_stride;
    char *src_base;
    char *dst_base;
    int peer, remote, total_reqs, rc;

    /* Initialize. */
    remote = ompi_comm_remote_size(comm);

    /* per-peer stride in the send buffer: extent of one element * scount */
    rc = ompi_datatype_get_extent(sdtype, &lb, &send_stride);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }
    send_stride *= scount;

    /* per-peer stride in the receive buffer: extent of one element * rcount */
    rc = ompi_datatype_get_extent(rdtype, &lb, &recv_stride);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }
    recv_stride *= rcount;

    /* Initiate all send/recv to/from others. */
    total_reqs = remote * 2;
    all_reqs = basic_module->mccb_reqs;

    dst_base = (char *) rbuf;
    src_base = (char *) sbuf;

    /* Post all receives first */
    for (peer = 0; peer < remote; peer++) {
        rc = MCA_PML_CALL(irecv(dst_base + (peer * recv_stride), rcount, rdtype, peer,
                                MCA_COLL_BASE_TAG_ALLTOALL, comm,
                                &all_reqs[peer]));
        if (OMPI_SUCCESS != rc) {
            return rc;
        }
    }

    /* Now post all sends */
    for (peer = 0; peer < remote; peer++) {
        rc = MCA_PML_CALL(isend(src_base + (peer * send_stride), scount, sdtype, peer,
                                MCA_COLL_BASE_TAG_ALLTOALL,
                                MCA_PML_BASE_SEND_STANDARD, comm,
                                &all_reqs[remote + peer]));
        if (OMPI_SUCCESS != rc) {
            return rc;
        }
    }

    /* Wait for all of them.  Whatever ompi_request_wait_all reports is
     * handed back to the caller unchanged. */
    rc = ompi_request_wait_all(total_reqs, all_reqs, MPI_STATUSES_IGNORE);

    /* All done */
    return rc;
}
/*
 * Init module on the communicator
 *
 * Allocates the per-communicator data block used by the tuned component,
 * optionally pre-allocates a request array, loads/references the dynamic
 * decision rules when they are enabled, builds a first guess of the cached
 * topologies (all rooted at rank 0), and attaches the block to
 * comm->c_coll_selected_data.
 *
 * Returns NULL if the data block cannot be allocated; otherwise returns
 * `to_use`.
 * NOTE(review): `to_use` is not declared inside this function -- presumably
 * a file-scope module pointer; confirm against the rest of the file.
 */
const struct mca_coll_base_module_1_0_0_t *
ompi_coll_tuned_module_init(struct ompi_communicator_t *comm)
{
    int size, rank;
    struct mca_coll_base_comm_t *data;
    /* fanout parameters */
    int rc=0;
    int i;

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init called."));

    /* This routine will become more complex and might have to be */
    /* broken into more sections/function calls */

    /* Order of operations:
     * alloc memory for nb reqs (in case we fall through)
     * add decision rules if using dynamic rules
     *     compact rules using communicator size info etc
     * build first guess cached topologies (might depend on the rules from above)
     *
     * then attach all to the communicator and return base module funct ptrs
     */

    /* Allocate the data that hangs off the communicator */

    /* For an inter-communicator the request array is sized from the
     * remote group; otherwise from the communicator itself. */
    if (OMPI_COMM_IS_INTER(comm)) {
        size = ompi_comm_remote_size(comm);
    } else {
        size = ompi_comm_size(comm);
    }

    /*
     * we still malloc data as it is used by the TUNED modules
     * if we don't allocate it and fall back to a BASIC module routine then
     * it confuses debuggers
     * we place any special info after the default data
     *
     * BUT on very large systems we might not be able to allocate all this
     * memory so we check an MCA parameter to see if we should allocate
     * this memory
     *
     * The default is set very high
     *
     */

    /* if we are within the memory/size limit, allow preallocated data */
    if (size<=ompi_coll_tuned_preallocate_memory_comm_size_limit) {
        /* one allocation: the struct followed by 2*size request pointers */
        data = (mca_coll_base_comm_t*)malloc(sizeof(struct mca_coll_base_comm_t) +
                      (sizeof(ompi_request_t *) * size * 2));

        if (NULL == data) {
            return NULL;
        }
        /* the request array starts right behind the struct */
        data->mcct_reqs = (ompi_request_t **) (data + 1);
        data->mcct_num_reqs = size * 2;
    }
    else {
        /* over the limit: allocate the struct only, no request array */
        data = (mca_coll_base_comm_t*)malloc(sizeof(struct mca_coll_base_comm_t));

        if (NULL == data) {
            return NULL;
        }
        data->mcct_reqs = (ompi_request_t **) NULL;
        data->mcct_num_reqs = 0;
    }

    /*
     * If using dynamic and you are MPI_COMM_WORLD and you want to use a
     * parameter file.. then this affects how much storage space you need
     * (This is a basic version of what will go into V2)
     *
     */

    /* Note: from here on `size` is the result of ompi_comm_size(), even
     * for inter-communicators; it overwrites the value computed above. */
    size = ompi_comm_size(comm);  /* find size so we can (A) decide if to access the file directly */
                                  /* (B) so we can get our very own customised ompi_coll_com_rule_t ptr */
                                  /* which only has rules in it for our com size */

    /* `rank` is stored here but not read again in this function. */
    rank = ompi_comm_rank(comm);  /* find rank as only MCW:0 opens any tuned conf files */
                                  /* actually if they are below a threshold, they all open it */
                                  /* have to build a collective in here.. but just for MCW.. */
                                  /* but we have to make sure we have the same rules everywhere :( */

    /* if using dynamic rules make sure all overrides are NULL before we */
    /* start, so that we do not override anything accidentally */
    if (ompi_coll_tuned_use_dynamic_rules) {
        /* base rules */
        data->all_base_rules = (ompi_coll_alg_rule_t*) NULL;

        /* each collective rule for my com size */
        for (i=0;i<COLLCOUNT;i++) {
            data->com_rules[i] = (ompi_coll_com_rule_t*) NULL;
        }
    }

    /* next dynamic state, recheck all forced rules as well */
    /* warning, we should check to make sure this is really an INTRA comm here... */
    if (ompi_coll_tuned_use_dynamic_rules) {
        ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLREDUCE], &(data->user_forced[ALLREDUCE]));
        ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLTOALL], &(data->user_forced[ALLTOALL]));
        ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLGATHER], &(data->user_forced[ALLGATHER]));
        /* ALLTOALLV is currently disabled:
         * ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLTOALLV], &(data->user_forced[ALLTOALLV])); */
        /* BARRIER uses its own getvalues variant */
        ompi_coll_tuned_forced_getvalues_barrier (ompi_coll_tuned_forced_params[BARRIER], &(data->user_forced[BARRIER]));
        ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[BCAST], &(data->user_forced[BCAST]));
        ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[REDUCE], &(data->user_forced[REDUCE]));
    }

    /* MPI_COMM_WORLD with dynamic rules: read the rules file, if any */
    if (&ompi_mpi_comm_world==comm) {

        if (ompi_coll_tuned_use_dynamic_rules) {

            OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init MCW & Dynamic"));

            if (ompi_coll_tuned_dynamic_rules_filename) {

                OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Opening [%s]",
                             ompi_coll_tuned_dynamic_rules_filename));

                rc = ompi_coll_tuned_read_rules_config_file (ompi_coll_tuned_dynamic_rules_filename,
                                                             &(data->all_base_rules), COLLCOUNT);

                /* a non-negative rc is logged as the number of valid rules read */
                if (rc>=0) {
                    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Read %d valid rules\n", rc));

                    /* at this point we all have a base set of rules */
                    /* now we can get our customized communicator sized rule set, for each collective */
                    for (i=0;i<COLLCOUNT;i++) {
                        data->com_rules[i] = ompi_coll_tuned_get_com_rule_ptr (data->all_base_rules, i, size);
                    }
                }
                else { /* failed to read config file, thus make sure it's a NULL... */
                    data->all_base_rules = (ompi_coll_alg_rule_t*) NULL;
                }

            } /* end if a config filename exists */

        } /* end if dynamic_rules */

    } /* end if MCW */

    /* ok, if using dynamic rules, not MCW and we are just any rank and a base set of rules exist.. ref them */
    /* order of eval is important here: the && chain short-circuits, so for */
    /* MCW (whose c_coll_selected_data is still NULL) the dereference in the */
    /* last operand is never evaluated. */
    /* NOTE(review): for any other communicator this dereferences */
    /* ompi_mpi_comm_world.c_coll_selected_data without a NULL check -- */
    /* confirm MCW is always initialised by this component first. */
    if ((ompi_coll_tuned_use_dynamic_rules)&&(!(&ompi_mpi_comm_world==comm))&&
        ((ompi_mpi_comm_world.c_coll_selected_data)->all_base_rules)) {

        OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init NOT MCW & Dynamic"));

        /* this will, erm, fail if MCW doesn't exist, which it should! */
        data->all_base_rules = (ompi_mpi_comm_world.c_coll_selected_data)->all_base_rules;

        /* at this point we all have a base set of rules if they exist at all */
        /* now we can get our customized communicator sized rule set, for each collective */
        for (i=0;i<COLLCOUNT;i++) {
            data->com_rules[i] = ompi_coll_tuned_get_com_rule_ptr (data->all_base_rules, i, size);
        }
    }

    /*
     * now for the cached topo functions
     * guess the initial topologies to use rank 0 as root
     */

    /* general n fan out tree */
    data->cached_ntree = ompi_coll_tuned_topo_build_tree (ompi_coll_tuned_init_tree_fanout, comm, 0);
    data->cached_ntree_root = 0;
    data->cached_ntree_fanout = ompi_coll_tuned_init_tree_fanout;

    /* binary tree */
    data->cached_bintree = ompi_coll_tuned_topo_build_tree (2, comm, 0);
    data->cached_bintree_root = 0;

    /* binomial tree */
    data->cached_bmtree = ompi_coll_tuned_topo_build_bmtree (comm, 0);
    data->cached_bmtree_root = 0;

    /*
     * chains (fanout followed by pipelines)
     * are more difficult as the fan out really really depends on message size [sometimes]..
     * as size gets larger fan-out gets smaller [usually]
     *
     * will probably change how we cache this later, for now a midsize
     * GEF
     */
    data->cached_chain = ompi_coll_tuned_topo_build_chain (ompi_coll_tuned_init_chain_fanout, comm, 0);
    data->cached_chain_root = 0;
    data->cached_chain_fanout = ompi_coll_tuned_init_chain_fanout;

    /* standard pipeline: a chain built with fanout 1 */
    data->cached_pipeline = ompi_coll_tuned_topo_build_chain (1, comm, 0);
    data->cached_pipeline_root = 0;

    /* All done: hang the data off the communicator */
    comm->c_coll_selected_data = data;

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Tuned is in use"));
    return to_use;
}
/*
 *	allgather_inter
 *
 *	Function:	- allgather using other MPI collectives
 *	Accepts:	- same as MPI_Allgather()
 *	Returns:	- MPI_SUCCESS or error code
 *
 *	On failure the source line of the failing call is logged, any
 *	outstanding requests obtained from the module are released, and the
 *	temporary buffer is freed before the error code is returned.
 */
int
mca_coll_basic_allgather_inter(const void *sbuf, int scount,
                               struct ompi_datatype_t *sdtype,
                               void *rbuf, int rcount,
                               struct ompi_datatype_t *rdtype,
                               struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    int rank, root = 0, size, rsize, err, i, line = 0;
    char *tmpbuf_free = NULL, *tmpbuf = NULL, *ptmp;
    ptrdiff_t rlb, rextent, incr;
    ptrdiff_t gap, span;
    ompi_request_t *req;
    ompi_request_t **reqs = NULL;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    rsize = ompi_comm_remote_size(comm);

    /* Algorithm:
     * - a gather to the root in remote group (simultaneously executed,
     *   that's why we cannot use coll_gather).
     * - exchange the temp-results between two roots
     * - inter-bcast (again simultaneous).
     */

    /* Step one: gather operations: */
    if (rank != root) {
        /* send your data to root */
        err = MCA_PML_CALL(send(sbuf, scount, sdtype, root,
                                MCA_COLL_BASE_TAG_ALLGATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
    } else {
        /* receive a msg. from all other procs. */
        err = ompi_datatype_get_extent(rdtype, &rlb, &rextent);
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }

        /* Get a requests arrays of the right size */
        reqs = ompi_coll_base_comm_get_reqs(module->base_data, rsize + 1);
        if( NULL == reqs ) { line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto exit; }

        /* Do a send-recv between the two root procs. to avoid deadlock */
        err = MCA_PML_CALL(isend(sbuf, scount, sdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 comm, &reqs[rsize]));
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }

        err = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                 &reqs[0]));
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }

        /* remaining remote ranks land back to back behind rank 0's block */
        incr = rextent * rcount;
        ptmp = (char *) rbuf + incr;
        for (i = 1; i < rsize; ++i, ptmp += incr) {
            err = MCA_PML_CALL(irecv(ptmp, rcount, rdtype, i,
                                     MCA_COLL_BASE_TAG_ALLGATHER,
                                     comm, &reqs[i]));
            if (MPI_SUCCESS != err) { line = __LINE__; goto exit; }
        }

        err = ompi_request_wait_all(rsize + 1, reqs, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }

        /* Step 2: exchange the results between the root processes */
        span = opal_datatype_span(&sdtype->super, (int64_t)scount * (int64_t)size, &gap);
        tmpbuf_free = (char *) malloc(span);
        if (NULL == tmpbuf_free) { line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto exit; }
        tmpbuf = tmpbuf_free - gap;

        err = MCA_PML_CALL(isend(rbuf, rsize * rcount, rdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER,
                                 MCA_PML_BASE_SEND_STANDARD, comm, &req));
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }

        err = MCA_PML_CALL(recv(tmpbuf, size * scount, sdtype, 0,
                                MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }

        err = ompi_request_wait( &req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
    }


    /* Step 3: bcast the data to the remote group. This
     * happens in both groups simultaneously, thus we can
     * not use coll_bcast (this would deadlock).
     */
    if (rank != root) {
        /* post the recv */
        err = MCA_PML_CALL(recv(rbuf, rsize * rcount, rdtype, 0,
                                MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
    } else {
        /* Send the data to every other process in the remote group
         * except to rank zero. which has it already. */
        for (i = 1; i < rsize; i++) {
            err = MCA_PML_CALL(isend(tmpbuf, size * scount, sdtype, i,
                                     MCA_COLL_BASE_TAG_ALLGATHER,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     comm, &reqs[i - 1]));
            if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
        }

        err = ompi_request_wait_all(rsize - 1, reqs, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
    }

  exit:
    if( MPI_SUCCESS != err ) {
        OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
                      __FILE__, line, err, rank) );
        (void)line;  /* silence compiler warning */
        /* A block comment is used above on purpose: a line comment would
         * swallow the cleanup below if these statements share a line. */
        if( NULL != reqs ) ompi_coll_base_free_reqs(reqs, rsize+1);
    }
    if (NULL != tmpbuf_free) {
        free(tmpbuf_free);
    }

    return err;
}
/*
 * Non-blocking barrier for inter-communicators.
 *
 * Schedule layout:
 *   - rank 0 first posts zero-byte receives from remote ranks 1..rsize-1;
 *   - every rank posts a zero-byte receive from, then a zero-byte send to,
 *     remote rank 0;
 *   - rank 0 then adds a schedule barrier followed by zero-byte sends to
 *     remote ranks 1..rsize-1.
 *
 * Returns the NBC_Init_handle() error if the handle cannot be created.
 * Every later failure returns the handle and is reported as
 * OMPI_ERR_OUT_OF_RESOURCE.  On success the handle is stored in *request.
 */
int ompi_coll_libnbc_ibarrier_inter(struct ompi_communicator_t *comm, ompi_request_t ** request,
                                    struct mca_coll_base_module_2_1_0_t *module)
{
  ompi_coll_libnbc_module_t *nbc_module = (ompi_coll_libnbc_module_t*) module;
  NBC_Schedule *sched;
  NBC_Handle *nbc_handle;
  int my_rank, remote_size, rc;

  my_rank = ompi_comm_rank (comm);
  remote_size = ompi_comm_remote_size (comm);

  rc = NBC_Init_handle(comm, &nbc_handle, nbc_module);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
    return rc;
  }

  sched = OBJ_NEW(NBC_Schedule);
  if (OPAL_UNLIKELY(NULL == sched)) {
    goto fail;
  }

  /* ensure the schedule is released with the handle on error */
  nbc_handle->schedule = sched;

  if (0 == my_rank) {
    /* collect arrival notifications from the non-root remote peers */
    int peer = 1;

    while (peer < remote_size) {
      rc = NBC_Sched_recv (NULL, false, 0, MPI_BYTE, peer, sched, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        goto fail;
      }
      ++peer;
    }
  }

  /* synchronize with the remote root */
  rc = NBC_Sched_recv (NULL, false, 0, MPI_BYTE, 0, sched, false);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
    goto fail;
  }

  rc = NBC_Sched_send (NULL, false, 0, MPI_BYTE, 0, sched, false);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
    goto fail;
  }

  if (0 == my_rank) {
    int peer = 1;

    /* wait for the remote root */
    rc = NBC_Sched_barrier (sched);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
      goto fail;
    }

    /* inform remote peers that all local peers have entered the barrier */
    while (peer < remote_size) {
      rc = NBC_Sched_send (NULL, false, 0, MPI_BYTE, peer, sched, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        goto fail;
      }
      ++peer;
    }
  }

  rc = NBC_Sched_commit (sched);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
    goto fail;
  }

  rc = NBC_Start (nbc_handle, sched);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
    goto fail;
  }

  *request = (ompi_request_t *) nbc_handle;

  return OMPI_SUCCESS;

fail:
  /* common failure exit once the handle exists */
  NBC_Return_handle (nbc_handle);
  return OMPI_ERR_OUT_OF_RESOURCE;
}
/* * allreduce_inter * * Function: - allreduce using other MPI collectives * Accepts: - same as MPI_Allreduce() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_allreduce_inter(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm) { int err, i; int rank; int root = 0; int rsize; ptrdiff_t lb, extent; char *tmpbuf = NULL, *pml_buffer = NULL; ompi_request_t *req[2]; ompi_request_t **reqs = comm->c_coll_basic_data->mccb_reqs; rank = ompi_comm_rank(comm); rsize = ompi_comm_remote_size(comm); /* determine result of the remote group, you cannot * use coll_reduce for inter-communicators, since than * you would need to determine an order between the * two groups (e.g. which group is providing the data * and which one enters coll_reduce with providing * MPI_PROC_NULL as root argument etc.) Here, * we execute the data exchange for both groups * simultaniously. */ /*****************************************************************/ if (rank == root) { err = ompi_ddt_get_extent(dtype, &lb, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } tmpbuf = (char *) malloc(count * extent); if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = tmpbuf - lb; /* Do a send-recv between the two root procs. to avoid deadlock */ err = MCA_PML_CALL(irecv(rbuf, count, dtype, 0, MCA_COLL_BASE_TAG_ALLREDUCE, comm, &(req[0]))); if (OMPI_SUCCESS != err) { goto exit; } err = MCA_PML_CALL(isend(sbuf, count, dtype, 0, MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm, &(req[1]))); if (OMPI_SUCCESS != err) { goto exit; } err = ompi_request_wait_all(2, req, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != err) { goto exit; } /* Loop receiving and calling reduction function (C or Fortran). 
*/ for (i = 1; i < rsize; i++) { err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i, MCA_COLL_BASE_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { goto exit; } /* Perform the reduction */ ompi_op_reduce(op, pml_buffer, rbuf, count, dtype); } } else { /* If not root, send data to the root. */ err = MCA_PML_CALL(send(sbuf, count, dtype, root, MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { goto exit; } } /* now we have on one process the result of the remote group. To distribute * the data to all processes in the local group, we exchange the data between * the two root processes. They then send it to every other process in the * remote group. */ /***************************************************************************/ if (rank == root) { /* sendrecv between the two roots */ err = MCA_PML_CALL(irecv(pml_buffer, count, dtype, 0, MCA_COLL_BASE_TAG_ALLREDUCE, comm, &(req[1]))); if (OMPI_SUCCESS != err) { goto exit; } err = MCA_PML_CALL(isend(rbuf, count, dtype, 0, MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm, &(req[0]))); if (OMPI_SUCCESS != err) { goto exit; } err = ompi_request_wait_all(2, req, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != err) { goto exit; } /* distribute the data to other processes in remote group. * Note that we start from 1 (not from zero), since zero * has already the correct data AND we avoid a potential * deadlock here. */ if (rsize > 1) { for (i = 1; i < rsize; i++) { err = MCA_PML_CALL(isend(pml_buffer, count, dtype, i, MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm, &reqs[i - 1])); if (OMPI_SUCCESS != err) { goto exit; } } err = ompi_request_wait_all(rsize - 1, reqs, MPI_STATUSES_IGNORE); if (OMPI_SUCCESS != err) { goto exit; } } } else { err = MCA_PML_CALL(recv(rbuf, count, dtype, root, MCA_COLL_BASE_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE)); } exit: if (NULL != tmpbuf) { free(tmpbuf); } return err; }
/*
 * Non-blocking allgatherv on an inter-communicator.
 *
 * Builds an NBC schedule that receives recvcounts[r] elements from every
 * remote rank r (placed at displs[r] * extent(recvtype) inside recvbuf;
 * ranks with a zero count are skipped) and, when sendcount is non-zero,
 * sends sendbuf to every remote rank.  The schedule is then committed and
 * started, and the resulting handle is returned through *request.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if the schedule object
 * cannot be created, or the error code of the failing call.
 */
int ompi_coll_libnbc_iallgatherv_inter(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf,
                                       const int *recvcounts, const int *displs, MPI_Datatype recvtype,
                                       struct ompi_communicator_t *comm, ompi_request_t ** request,
                                       struct mca_coll_base_module_2_1_0_t *module)
{
  ompi_coll_libnbc_module_t *nbc_module = (ompi_coll_libnbc_module_t*) module;
  NBC_Schedule *sched;
  NBC_Handle *handle;
  MPI_Aint recv_extent;
  int rc;
  const int remote_size = ompi_comm_remote_size (comm);

  rc = ompi_datatype_type_extent(recvtype, &recv_extent);
  if (OPAL_UNLIKELY(MPI_SUCCESS != rc)) {
    NBC_Error ("MPI Error in ompi_datatype_type_extent() (%i)", rc);
    return rc;
  }

  sched = OBJ_NEW(NBC_Schedule);
  if (NULL == sched) {
    return OMPI_ERR_OUT_OF_RESOURCE;
  }

  /* one receive per remote rank that actually contributes data */
  for (int peer = 0 ; peer < remote_size ; ++peer) {
    char *dest = (char *) recvbuf + displs[peer] * recv_extent;

    if (0 == recvcounts[peer]) {
      continue;
    }

    rc = NBC_Sched_recv (dest, false, recvcounts[peer], recvtype, peer, sched, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
      goto release_schedule;
    }
  }

  /* one send per remote rank, unless there is nothing to send */
  if (0 != sendcount) {
    for (int peer = 0 ; peer < remote_size ; ++peer) {
      rc = NBC_Sched_send (sendbuf, false, sendcount, sendtype, peer, sched, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        goto release_schedule;
      }
    }
  }

  rc = NBC_Sched_commit (sched);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
    goto release_schedule;
  }

  rc = NBC_Init_handle (comm, &handle, nbc_module);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
    goto release_schedule;
  }

  rc = NBC_Start (handle, sched);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
    /* past this point only the handle is returned, not the schedule */
    NBC_Return_handle (handle);
    return rc;
  }

  *request = (ompi_request_t *) handle;

  return OMPI_SUCCESS;

release_schedule:
  /* shared failure path for everything before NBC_Start() */
  OBJ_RELEASE(sched);
  return rc;
}
/*
 *	reduce_scatter_block_inter
 *
 *	Function:	- reduce/scatter operation
 *	Accepts:	- same arguments as MPI_Reduce_scatter_block()
 *	Returns:	- MPI_SUCCESS or error code (OMPI_ERR_OUT_OF_RESOURCE
 *			  if a temporary buffer cannot be allocated, otherwise
 *			  the code of the failing PML / request / scatter call)
 */
int
mca_coll_basic_reduce_scatter_block_inter(const void *sbuf, void *rbuf, int rcount,
                                          struct ompi_datatype_t *dtype,
                                          struct ompi_op_t *op,
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module)
{
    int err, i, rank, root = 0, rsize, lsize;
    int totalcounts;
    ptrdiff_t gap, span;
    char *tmpbuf = NULL, *tmpbuf2 = NULL;
    char *lbuf = NULL, *buf;
    ompi_request_t *req;

    rank = ompi_comm_rank(comm);
    rsize = ompi_comm_remote_size(comm);
    lsize = ompi_comm_size(comm);

    totalcounts = lsize * rcount;

    /*
     * The following code basically does an interreduce followed by a
     * intrascatter.  This is implemented by having the roots of each
     * group exchange their sbuf.  Then, the roots receive the data
     * from each of the remote ranks and execute the reduce.  When
     * this is complete, they have the reduced data available to them
     * for doing the scatter.  They do this on the local communicator
     * associated with the intercommunicator.
     *
     * Note: There are other ways to implement MPI_Reduce_scatter_block on
     * intercommunicators.  For example, one could do a MPI_Reduce locally,
     * then send the results to the other root which could scatter it.
     */
    if (rank == root) {
        span = opal_datatype_span(&dtype->super, totalcounts, &gap);

        tmpbuf = (char *) malloc(span);
        tmpbuf2 = (char *) malloc(span);
        if (NULL == tmpbuf || NULL == tmpbuf2) {
            /* Go through the common cleanup: one of the two allocations
             * may have succeeded and must not be leaked. */
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        /* shift by the gap reported by opal_datatype_span() */
        lbuf = tmpbuf - gap;
        buf = tmpbuf2 - gap;

        /* Do a send-recv between the two root procs. to avoid deadlock */
        err = MCA_PML_CALL(isend(sbuf, totalcounts, dtype, 0,
                                 MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                 MCA_PML_BASE_SEND_STANDARD, comm, &req));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = MCA_PML_CALL(recv(lbuf, totalcounts, dtype, 0,
                                MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = ompi_request_wait( &req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        /* Loop receiving and calling reduction function (C or Fortran).
         * Each round reduces lbuf into buf and then swaps the two
         * pointers, so the running result is always reachable through
         * lbuf (which of the two allocations that is depends on the
         * number of rounds). */
        for (i = 1; i < rsize; i++) {
            char *tbuf;
            err = MCA_PML_CALL(recv(buf, totalcounts, dtype, i,
                                    MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
                                    MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) {
                goto exit;
            }

            /* Perform the reduction */
            ompi_op_reduce(op, lbuf, buf, totalcounts, dtype);
            /* swap the buffers */
            tbuf = lbuf; lbuf = buf; buf = tbuf;
        }
    } else {
        /* If not root, send data to the root. */
        err = MCA_PML_CALL(send(sbuf, totalcounts, dtype, root,
                                MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }

    /* Now do a scatter on the local communicator.  lbuf is only set on
     * the root; every other rank passes NULL as the send buffer. */
    err = comm->c_local_comm->c_coll->coll_scatter(lbuf, rcount, dtype,
                                                   rbuf, rcount, dtype, 0,
                                                   comm->c_local_comm,
                                                   comm->c_local_comm->c_coll->coll_scatter_module);

  exit:
    /* free(NULL) is a no-op, so no guards are needed */
    free(tmpbuf);
    free(tmpbuf2);

    return err;
}
/*
 *	allgather_inter
 *
 *	Function:	- allgather using other MPI collections
 *	Accepts:	- same as MPI_Allgather()
 *	Returns:	- MPI_SUCCESS or error code (OMPI_ERR_OUT_OF_RESOURCE
 *			  if the exchange buffer cannot be allocated, otherwise
 *			  the code of the failing datatype / PML / request call)
 */
int
mca_coll_basic_allgather_inter(void *sbuf, int scount,
                               struct ompi_datatype_t *sdtype,
                               void *rbuf, int rcount,
                               struct ompi_datatype_t *rdtype,
                               struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    int rank, root = 0, size, rsize, err, i;
    char *tmpbuf = NULL, *ptmp;
    ptrdiff_t rlb, slb, rextent, sextent, incr;
    ompi_request_t *req;
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
    ompi_request_t **reqs = basic_module->mccb_reqs;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);
    rsize = ompi_comm_remote_size(comm);

    /* Algorithm:
     * - a gather to the root in remote group (simultaneously executed,
     *   that's why we cannot use coll_gather).
     * - exchange the temp-results between two roots
     * - inter-bcast (again simultaneous).
     */

    /* Step one: gather operations: */
    if (rank != root) {
        /* send your data to root */
        err = MCA_PML_CALL(send(sbuf, scount, sdtype, root,
                                MCA_COLL_BASE_TAG_ALLGATHER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (OMPI_SUCCESS != err) {
            return err;
        }
    } else {
        /* receive a msg. from all other procs. */
        err = ompi_datatype_get_extent(rdtype, &rlb, &rextent);
        if (OMPI_SUCCESS != err) {
            return err;
        }
        err = ompi_datatype_get_extent(sdtype, &slb, &sextent);
        if (OMPI_SUCCESS != err) {
            return err;
        }

        /* Do a send-recv between the two root procs. to avoid deadlock.
         * The send uses slot rsize; slots 0..rsize-1 hold the receives. */
        err = MCA_PML_CALL(isend(sbuf, scount, sdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 comm, &reqs[rsize]));
        if (OMPI_SUCCESS != err) {
            return err;
        }

        err = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                 &reqs[0]));
        if (OMPI_SUCCESS != err) {
            return err;
        }

        /* remote rank i lands at offset i * rcount * extent(rdtype) */
        incr = rextent * rcount;
        ptmp = (char *) rbuf + incr;
        for (i = 1; i < rsize; ++i, ptmp += incr) {
            err = MCA_PML_CALL(irecv(ptmp, rcount, rdtype, i,
                                     MCA_COLL_BASE_TAG_ALLGATHER,
                                     comm, &reqs[i]));
            if (MPI_SUCCESS != err) {
                return err;
            }
        }

        err = ompi_request_wait_all(rsize + 1, reqs, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            return err;
        }

        /* Step 2: exchange the results between the root processes */
        tmpbuf = (char *) malloc(scount * size * sextent);
        if (NULL == tmpbuf) {
            /* err still holds OMPI_SUCCESS from the wait above, so it
             * must not be returned here: report the allocation failure
             * explicitly. */
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        err = MCA_PML_CALL(isend(rbuf, rsize * rcount, rdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHER,
                                 MCA_PML_BASE_SEND_STANDARD, comm, &req));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = MCA_PML_CALL(recv(tmpbuf, size * scount, sdtype, 0,
                                MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = ompi_request_wait( &req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }


    /* Step 3: bcast the data to the remote group. This
     * happens in both groups simultaneously, thus we can
     * not use coll_bcast (this would deadlock).
     */
    if (rank != root) {
        /* post the recv */
        err = MCA_PML_CALL(recv(rbuf, rsize * rcount, rdtype, 0,
                                MCA_COLL_BASE_TAG_ALLGATHER, comm,
                                MPI_STATUS_IGNORE));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

    } else {
        /* Send the data to every other process in the remote group
         * except to rank zero. which has it already. */
        for (i = 1; i < rsize; i++) {
            err = MCA_PML_CALL(isend(tmpbuf, size * scount, sdtype, i,
                                     MCA_COLL_BASE_TAG_ALLGATHER,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     comm, &reqs[i - 1]));
            if (OMPI_SUCCESS != err) {
                goto exit;
            }

        }

        err = ompi_request_wait_all(rsize - 1, reqs, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }

  exit:
    /* tmpbuf is only allocated on the root; free(NULL) is a no-op */
    free(tmpbuf);

    return err;
}
/*
 * Allreduce of an int vector across BOTH groups of an inter-communicator.
 *
 * Only MPI_SUM, MPI_PROD, MPI_MAX and MPI_MIN are accepted (MPI_ERR_OP
 * otherwise), and intercomm must be an inter-communicator (MPI_ERR_COMM
 * otherwise).
 *
 * Arguments not used in this implementation:
 *  - bridgecomm
 *  - local_leader
 *  - remote_leader
 *  - send_first
 *
 * Returns OMPI_ERR_OUT_OF_RESOURCE if a temporary array cannot be
 * allocated, otherwise the return code of the last collective /
 * point-to-point call that was executed.
 */
static int ompi_comm_allreduce_inter ( int *inbuf, int *outbuf,
                                       int count, struct ompi_op_t *op,
                                       ompi_communicator_t *intercomm,
                                       ompi_communicator_t *bridgecomm,
                                       void* local_leader,
                                       void* remote_leader,
                                       int send_first )
{
    int rc, k;
    int my_rank, remote_size;
    int send_count = 0;
    int *partial = NULL;
    int *recv_counts = NULL;
    int *recv_displs = NULL;

    /* reject every operator that is not one of the four handled below */
    if ( !( &ompi_mpi_op_sum.op  == op || &ompi_mpi_op_prod.op == op ||
            &ompi_mpi_op_max.op  == op || &ompi_mpi_op_min.op  == op ) ) {
        return MPI_ERR_OP;
    }

    if ( !OMPI_COMM_IS_INTER (intercomm) ) {
        return MPI_ERR_COMM;
    }

    /* Allocate temporary arrays */
    remote_size = ompi_comm_remote_size (intercomm);
    my_rank     = ompi_comm_rank ( intercomm );

    partial     = (int *) malloc ( count * sizeof(int));
    recv_displs = (int *) calloc ( remote_size, sizeof(int));
    recv_counts = (int *) calloc ( remote_size, sizeof(int) );
    if ( OPAL_UNLIKELY (NULL == partial || NULL == recv_displs || NULL == recv_counts)) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    /* Execute the inter-allreduce: the result of our group will
       be in the buffer of the remote group */
    rc = intercomm->c_coll.coll_allreduce ( inbuf, partial, count, MPI_INT,
                                            op, intercomm,
                                            intercomm->c_coll.coll_allreduce_module);
    if ( OMPI_SUCCESS != rc ) {
        goto exit;
    }

    if ( 0 == my_rank ) {
        MPI_Request req;

        /* only the leader contributes data to the allgatherv below */
        send_count = count;

        /* the two leaders swap their partial results ... */
        rc = MCA_PML_CALL(irecv (outbuf, count, MPI_INT, 0,
                                 OMPI_COMM_ALLREDUCE_TAG,
                                 intercomm, &req));
        if ( OMPI_SUCCESS != rc ) {
            goto exit;
        }
        rc = MCA_PML_CALL(send (partial, count, MPI_INT, 0,
                                OMPI_COMM_ALLREDUCE_TAG,
                                MCA_PML_BASE_SEND_STANDARD,
                                intercomm));
        if ( OMPI_SUCCESS != rc ) {
            goto exit;
        }
        rc = ompi_request_wait ( &req, MPI_STATUS_IGNORE );
        if ( OMPI_SUCCESS != rc ) {
            goto exit;
        }

        /* ... and combine them element-wise into outbuf */
        if ( op == &ompi_mpi_op_max.op ) {
            for ( k = 0; k < count; k++ ) {
                if ( outbuf[k] < partial[k] ) {
                    outbuf[k] = partial[k];
                }
            }
        }
        else if ( op == &ompi_mpi_op_min.op ) {
            for ( k = 0; k < count; k++ ) {
                if ( outbuf[k] > partial[k] ) {
                    outbuf[k] = partial[k];
                }
            }
        }
        else if ( op == &ompi_mpi_op_sum.op ) {
            for ( k = 0; k < count; k++ ) {
                outbuf[k] += partial[k];
            }
        }
        else if ( op == &ompi_mpi_op_prod.op ) {
            for ( k = 0; k < count; k++ ) {
                outbuf[k] *= partial[k];
            }
        }
    }

    /* distribute the overall result to all processes in the other group.
       Instead of using bcast, we are using here allgatherv, to avoid the
       possible deadlock. Else, we need an algorithm to determine,
       which group sends first in the inter-bcast and which receives
       the result first.
       Only remote rank 0 contributes (count elements); all other
       entries of recv_counts / recv_displs stay zero from calloc().
    */
    recv_counts[0] = count;
    rc = intercomm->c_coll.coll_allgatherv (outbuf, send_count, MPI_INT, outbuf,
                                            recv_counts, recv_displs, MPI_INT,
                                            intercomm,
                                            intercomm->c_coll.coll_allgatherv_module);

 exit:
    /* free(NULL) is a no-op, so partially failed allocations are fine */
    free ( partial );
    free ( recv_counts );
    free ( recv_displs );

    return (rc);
}
/*
 * Non-blocking allgather on an inter-communicator.
 *
 * Builds an NBC schedule with one receive and one send per remote rank
 * (rsize rounds in total): the block from remote rank r is placed at
 * r * recvcount * extent(recvtype) inside recvbuf, and sendbuf is sent to
 * every remote rank.  The schedule is then committed and started, and the
 * resulting handle is returned through *request.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if the schedule object
 * cannot be created, or the error code of the failing call.
 */
int ompi_coll_libnbc_iallgather_inter(void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf,
                                      int recvcount, MPI_Datatype recvtype, struct ompi_communicator_t *comm,
                                      ompi_request_t ** request, struct mca_coll_base_module_2_1_0_t *module)
{
  ompi_coll_libnbc_module_t *nbc_module = (ompi_coll_libnbc_module_t*) module;
  NBC_Schedule *sched;
  NBC_Handle *handle;
  MPI_Aint recv_extent;
  int rc, remote_size;

  rc = MPI_Type_extent(recvtype, &recv_extent);
  if (MPI_SUCCESS != rc) {
    NBC_Error ("MPI Error in MPI_Type_extent() (%i)", rc);
    return rc;
  }

  remote_size = ompi_comm_remote_size (comm);

  /* set up schedule */
  sched = OBJ_NEW(NBC_Schedule);
  if (OPAL_UNLIKELY(NULL == sched)) {
    return OMPI_ERR_OUT_OF_RESOURCE;
  }

  /* one round per remote rank */
  for (int peer = 0 ; peer < remote_size ; ++peer) {
    /* where the block coming from this peer has to be stored */
    char *dest = (char *) recvbuf + peer * recvcount * recv_extent;

    rc = NBC_Sched_recv (dest, false, recvcount, recvtype, peer, sched, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
      goto release_schedule;
    }

    rc = NBC_Sched_send (sendbuf, false, sendcount, sendtype, peer, sched, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
      goto release_schedule;
    }
  }

  rc = NBC_Sched_commit (sched);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
    goto release_schedule;
  }

  rc = NBC_Init_handle (comm, &handle, nbc_module);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
    goto release_schedule;
  }

  rc = NBC_Start (handle, sched);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
    /* past this point only the request is returned, not the schedule */
    OMPI_COLL_LIBNBC_REQUEST_RETURN(handle);
    return rc;
  }

  *request = (ompi_request_t *) handle;

  return OMPI_SUCCESS;

release_schedule:
  /* shared failure path for everything before NBC_Start() */
  OBJ_RELEASE(sched);
  return rc;
}
/*
 *	gatherv_inter
 *
 *	Function:	- gatherv operation using a local gather on c_local_comm
 *	Accepts:	- same arguments as MPI_Gatherv()
 *	Returns:	- MPI_SUCCESS or error code
 *
 *	Roles:
 *	  root == MPI_PROC_NULL : nothing to do
 *	  root == MPI_ROOT      : receive the packed data from remote rank 0
 *	  otherwise             : take part in a local gather/gatherv towards
 *	                          local rank 0, which forwards everything to
 *	                          the root
 */
int
mca_coll_inter_gatherv_inter(const void *sbuf, int scount,
                             struct ompi_datatype_t *sdtype,
                             void *rbuf, const int *rcounts, const int *disps,
                             struct ompi_datatype_t *rdtype, int root,
                             struct ompi_communicator_t *comm,
                             mca_coll_base_module_t *module)
{
    int p, rank, remote_size, local_size, err;
    int packed_elems = 0;
    int *counts = NULL, *offsets = NULL;
    char *packed = NULL;
    MPI_Aint packed_bytes, extent, lb;
    ompi_datatype_t *ndtype;
    struct ompi_communicator_t *lcomm;

    if (MPI_PROC_NULL == root) {
        /* do nothing */
        return OMPI_SUCCESS;
    }

    remote_size = ompi_comm_remote_size(comm);
    rank = ompi_comm_rank(comm);
    local_size = ompi_comm_size(comm);

    if (MPI_ROOT == root) {
        /* I am the root: a single receive from remote rank 0, using an
         * indexed type built from rcounts/disps to place the pieces. */
        ompi_datatype_create_indexed(remote_size, rcounts, disps, rdtype, &ndtype);
        ompi_datatype_commit(&ndtype);

        err = MCA_PML_CALL(recv(rbuf, 1, ndtype, 0,
                                MCA_COLL_BASE_TAG_GATHERV,
                                comm, MPI_STATUS_IGNORE));
        ompi_datatype_destroy(&ndtype);
        return err;
    }

    lcomm = comm->c_local_comm;

    if (0 == rank) {
        counts = (int *) malloc(sizeof(int) * local_size);
        offsets = (int *) malloc(sizeof(int) * local_size);
        if (NULL == counts || NULL == offsets) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

    /* local rank 0 learns the send count of every local process */
    err = lcomm->c_coll.coll_gather(&scount, 1, MPI_INT,
                                    counts, 1, MPI_INT,
                                    0, lcomm,
                                    lcomm->c_coll.coll_gather_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    if (0 == rank) {
        /* contributions are packed back to back */
        offsets[0] = 0;
        for (p = 1; p < local_size; p++) {
            offsets[p] = offsets[p - 1] + counts[p - 1];
        }

        /* Perform the gatherv locally with the first process as root */
        err = ompi_datatype_get_extent(sdtype, &lb, &extent);
        if (OMPI_SUCCESS != err) {
            err = OMPI_ERROR;
            goto exit;
        }

        packed_bytes = 0;
        for (p = 0; p < local_size; p++) {
            packed_bytes = packed_bytes + extent * counts[p];
        }

        /* nothing is allocated when no local process sends any data */
        if (packed_bytes > 0) {
            packed = (char *) malloc(packed_bytes);
            if (NULL == packed) {
                err = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }
    }

    err = lcomm->c_coll.coll_gatherv(sbuf, scount, sdtype,
                                     packed, counts, offsets, sdtype, 0,
                                     lcomm,
                                     lcomm->c_coll.coll_gatherv_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    if (0 == rank) {
        for (p = 0; p < local_size; p++) {
            packed_elems = packed_elems + counts[p];
        }

        /* First process sends data to the root */
        err = MCA_PML_CALL(send(packed, packed_elems, sdtype, root,
                                MCA_COLL_BASE_TAG_GATHERV,
                                MCA_PML_BASE_SEND_STANDARD, comm));
    }

  exit:
    /* free(NULL) is a no-op, so ranks that allocated nothing are fine */
    free(packed);
    free(offsets);
    free(counts);

    /* All done */
    return err;
}
/*
 *	allgatherv_inter
 *
 *	Function:	- allgatherv using other MPI collectives
 *	Accepts:	- same as MPI_Allgatherv()
 *	Returns:	- MPI_SUCCESS or error code (OMPI_ERR_OUT_OF_RESOURCE
 *			  if a temporary buffer cannot be allocated, OMPI_ERROR
 *			  if the datatype extent cannot be obtained, otherwise
 *			  the code of the failing call)
 *
 *	Steps:
 *	  1. local gather of every process' scount to local rank 0
 *	  2. local gatherv of the data to local rank 0
 *	  3. the two rank-0 processes exchange their packed data
 *	  4. local bcast of the received data
 *
 *	All temporaries (count, displace, ptmp and the indexed datatype)
 *	are released on every exit path through the common cleanup label.
 */
int
mca_coll_inter_allgatherv_inter(void *sbuf, int scount,
                                struct ompi_datatype_t *sdtype,
                                void *rbuf, int *rcounts, int *disps,
                                struct ompi_datatype_t *rdtype,
                                struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module)
{
    int i, rank, size, size_local, total=0, err;
    int *count=NULL,*displace=NULL;
    char *ptmp=NULL;
    MPI_Aint incr;
    MPI_Aint extent;
    MPI_Aint lb;
    ompi_datatype_t *ndtype = NULL;   /* NULL == not created yet */
    ompi_request_t *req[2];

    rank = ompi_comm_rank(comm);
    size_local = ompi_comm_size(comm->c_local_comm);
    size = ompi_comm_remote_size(comm);

    if (0 == rank) {
        count = (int *)malloc(sizeof(int) * size_local);
        displace = (int *)malloc(sizeof(int) * size_local);
        if ((NULL == count) || (NULL == displace)) {
            /* one of the two may have succeeded: clean up at exit */
            err = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

    /* Local gather to get the scount of each process */
    err = comm->c_local_comm->c_coll.coll_gather(&scount, 1, MPI_INT,
                                                 count, 1, MPI_INT,
                                                 0, comm->c_local_comm,
                                                 comm->c_local_comm->c_coll.coll_gather_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    if (0 == rank) {
        /* contributions are packed back to back */
        displace[0] = 0;
        for (i = 1; i < size_local; i++) {
            displace[i] = displace[i-1] + count[i-1];
        }

        /* Perform the gatherv locally with the first process as root */
        err = ompi_ddt_get_extent(sdtype, &lb, &extent);
        if (OMPI_SUCCESS != err) {
            err = OMPI_ERROR;
            goto exit;
        }

        incr = 0;
        for (i = 0; i < size_local; i++) {
            incr = incr + extent*count[i];
        }

        /* Skip the allocation when there is nothing to pack: malloc(0)
         * is allowed to return NULL, which would otherwise be reported
         * as an out-of-resource error. */
        if (incr > 0) {
            ptmp = (char*)malloc(incr);
            if (NULL == ptmp) {
                err = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }
    }

    err = comm->c_local_comm->c_coll.coll_gatherv(sbuf, scount, sdtype,
                                                  ptmp, count, displace,
                                                  sdtype,0, comm->c_local_comm,
                                                  comm->c_local_comm->c_coll.coll_gatherv_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    /* indexed type describing where each remote block lands in rbuf */
    err = ompi_ddt_create_indexed(size,rcounts,disps,rdtype,&ndtype);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }
    err = ompi_ddt_commit(&ndtype);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    if (0 == rank) {
        for (i = 0; i < size_local; i++) {
            total = total + count[i];
        }

        /* Exchange data between roots */
        err = MCA_PML_CALL(irecv(rbuf, 1, ndtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHERV, comm,
                                 &(req[0])));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = MCA_PML_CALL(isend(ptmp, total, sdtype, 0,
                                 MCA_COLL_BASE_TAG_ALLGATHERV,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 comm, &(req[1])));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = ompi_request_wait_all(2, req, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }

    /* bcast the message to all the local processes */
    err = comm->c_local_comm->c_coll.coll_bcast(rbuf, 1, ndtype,
                                                0, comm->c_local_comm,
                                                comm->c_local_comm->c_coll.coll_bcast_module);

  exit:
    /* Common cleanup for success and failure alike. */
    if (NULL != ndtype) {
        ompi_ddt_destroy(&ndtype);
    }
    free(ptmp);
    free(displace);
    free(count);

    return err;
}