static inline int ompi_osc_rdma_gacc_self (void *source, int source_count, ompi_datatype_t *source_datatype, void *result, int result_count, ompi_datatype_t *result_datatype, OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_module_t *module, ompi_osc_rdma_request_t *request) { void *target = (unsigned char*) module->baseptr + ((unsigned long) target_disp * module->disp_unit); int ret; /* if we are in active target mode wait until all post messages arrive */ if (module->sc_group && !module->active_eager_send_active) { OPAL_THREAD_LOCK(&module->lock); while (0 != module->num_post_msgs) { opal_condition_wait(&module->cond, &module->lock); } OPAL_THREAD_UNLOCK(&module->lock); } if (!(module->passive_target_access_epoch || module->active_eager_send_active)) { return OMPI_ERR_RMA_SYNC; } ompi_osc_rdma_accumulate_lock (module); do { ret = ompi_datatype_sndrcv (target, target_count, target_datatype, result, result_count, result_datatype); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_gacc_self: failed copying to the target buffer. ret = %d", ret)); break; } if (&ompi_mpi_op_no_op.op != op) { if (&ompi_mpi_op_replace.op != op) { ret = ompi_osc_base_sndrcv_op (source, source_count, source_datatype, target, target_count, target_datatype, op); } else { ret = ompi_datatype_sndrcv (source, source_count, source_datatype, target, target_count, target_datatype); } } if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_gacc_self: failed performing accumulate operation. ret = %d", ret)); break; } } while (0); ompi_osc_rdma_accumulate_unlock (module); if (request) { /* NTH: is it ok to use an ompi error code here? */ ompi_osc_rdma_request_complete (request, ret); } return OMPI_SUCCESS; }
int ompi_osc_sm_rget_accumulate(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, void *result_addr, int result_count, struct ompi_datatype_t *result_dt, int target, MPI_Aint target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_op_t *op, struct ompi_win_t *win, struct ompi_request_t **ompi_req) { int ret; ompi_osc_sm_request_t *request; ompi_osc_sm_module_t *module = (ompi_osc_sm_module_t*) win->w_osc_module; void *remote_address; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "rget_accumulate: 0x%lx, %d, %s, %d, %d, %d, %s, %s, 0x%lx", (unsigned long) origin_addr, origin_count, origin_dt->name, target, (int) target_disp, target_count, target_dt->name, op->o_name, (unsigned long) win)); OMPI_OSC_SM_REQUEST_ALLOC(win, request); if (NULL == request) return OMPI_ERR_OUT_OF_RESOURCE; *ompi_req = &request->super; remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; opal_atomic_lock(&module->node_states[target].accumulate_lock); ret = ompi_datatype_sndrcv(remote_address, target_count, target_dt, result_addr, result_count, result_dt); if (OMPI_SUCCESS != ret || op == &ompi_mpi_op_no_op.op) goto done; if (op == &ompi_mpi_op_replace.op) { ret = ompi_datatype_sndrcv(origin_addr, origin_count, origin_dt, remote_address, target_count, target_dt); } else { ret = ompi_osc_base_sndrcv_op(origin_addr, origin_count, origin_dt, remote_address, target_count, target_dt, op); } done: opal_atomic_unlock(&module->node_states[target].accumulate_lock); OMPI_OSC_SM_REQUEST_COMPLETE(request); return ret; }
static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count, ompi_datatype_t *source_datatype, void *result_buffer, int result_count, ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_module_t *module, ompi_osc_rdma_request_t *request) { int ret = OMPI_SUCCESS; do { if (!ompi_osc_rdma_peer_is_exclusive (peer)) { (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); } if (NULL != result_buffer) { /* get accumulate */ ret = ompi_datatype_sndrcv ((void *) (intptr_t) target_address, target_count, target_datatype, result_buffer, result_count, result_datatype); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { break; } } if (&ompi_mpi_op_no_op.op != op) { if (&ompi_mpi_op_replace.op != op) { ret = ompi_osc_base_sndrcv_op (source_buffer, source_count, source_datatype, (void *) (intptr_t) target_address, target_count, target_datatype, op); } else { ret = ompi_datatype_sndrcv (source_buffer, source_count, source_datatype, (void *) (intptr_t) target_address, target_count, target_datatype); } } if (!ompi_osc_rdma_peer_is_exclusive (peer)) { (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); } } while (0); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_gacc_self: failed performing accumulate operation. ret = %d", ret)); return ret; } if (request) { /* NTH: is it ok to use an ompi error code here? */ ompi_osc_rdma_request_complete (request, ret); } return ret; }
static inline int ompi_osc_pt2pt_gacc_self (ompi_osc_pt2pt_sync_t *pt2pt_sync, const void *source, int source_count, ompi_datatype_t *source_datatype, void *result, int result_count, ompi_datatype_t *result_datatype, OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_request_t *request) { void *target = (unsigned char*) module->baseptr + ((unsigned long) target_disp * module->disp_unit); int ret; OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_gacc_self: starting local " "get accumulate")); ompi_osc_pt2pt_accumulate_lock (module); do { ret = ompi_datatype_sndrcv (target, target_count, target_datatype, result, result_count, result_datatype); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_gacc_self: failed copying to the target buffer. ret = %d", ret)); break; } if (&ompi_mpi_op_no_op.op != op) { if (&ompi_mpi_op_replace.op != op) { ret = ompi_osc_base_sndrcv_op (source, source_count, source_datatype, target, target_count, target_datatype, op); } else { ret = ompi_datatype_sndrcv ((void *)source, source_count, source_datatype, target, target_count, target_datatype); } } if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_gacc_self: failed performing accumulate operation. ret = %d", ret)); break; } } while (0); ompi_osc_pt2pt_accumulate_unlock (module); OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_gacc_self: local get " "accumulate complete")); if (request) { /* NTH: is it ok to use an ompi error code here? */ ompi_osc_pt2pt_request_complete (request, ret); } return OMPI_SUCCESS; }
int ompi_osc_sm_get(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_win_t *win) { int ret; ompi_osc_sm_module_t *module = (ompi_osc_sm_module_t*) win->w_osc_module; void *remote_address; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "get: 0x%lx, %d, %s, %d, %d, %d, %s, 0x%lx", (unsigned long) origin_addr, origin_count, origin_dt->name, target, (int) target_disp, target_count, target_dt->name, (unsigned long) win)); remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; ret = ompi_datatype_sndrcv(remote_address, target_count, target_dt, origin_addr, origin_count, origin_dt); return ret; }
/* * alltoallv_intra * * Function: - MPI_Alltoallv * Accepts: - same as MPI_Alltoallv() * Returns: - MPI_SUCCESS or an MPI error code */ int mca_coll_self_alltoallv_intra(const void *sbuf, const int *scounts, const int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, const int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int err; ptrdiff_t lb, rextent, sextent; if (MPI_IN_PLACE == sbuf) { return MPI_SUCCESS; } err = ompi_datatype_get_extent(sdtype, &lb, &sextent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } err = ompi_datatype_get_extent(rdtype, &lb, &rextent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } return ompi_datatype_sndrcv(((char *) sbuf) + sdisps[0] * sextent, scounts[0], sdtype, ((char *) rbuf) + rdisps[0] * rextent, rcounts[0], rdtype); }
static inline int ompi_osc_pt2pt_acc_self (ompi_osc_pt2pt_sync_t *pt2pt_sync, const void *source, int source_count, ompi_datatype_t *source_datatype, OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_request_t *request) { void *target = (unsigned char*) module->baseptr + ((unsigned long) target_disp * module->disp_unit); int ret; /* if we are in active target mode wait until all post messages arrive */ ompi_osc_pt2pt_sync_wait_expected (pt2pt_sync); ompi_osc_pt2pt_accumulate_lock (module); if (&ompi_mpi_op_replace.op != op) { ret = ompi_osc_base_sndrcv_op (source, source_count, source_datatype, target, target_count, target_datatype, op); } else { ret = ompi_datatype_sndrcv ((void *)source, source_count, source_datatype, target, target_count, target_datatype); } ompi_osc_pt2pt_accumulate_unlock (module); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_acc_self: failed performing accumulate operation. ret = %d", ret)); return ret; } if (request) { ompi_osc_pt2pt_request_complete (request, MPI_SUCCESS); } return OMPI_SUCCESS; }
/* self communication optimizations */ static inline int ompi_osc_rdma_put_self (void *source, int source_count, ompi_datatype_t *source_datatype, OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_osc_rdma_module_t *module, ompi_osc_rdma_request_t *request) { void *target = (unsigned char*) module->baseptr + ((unsigned long) target_disp * module->disp_unit); int ret; /* if we are in active target mode wait until all post messages arrive */ if (module->sc_group && !module->active_eager_send_active) { OPAL_THREAD_LOCK(&module->lock); while (0 != module->num_post_msgs) { opal_condition_wait(&module->cond, &module->lock); } OPAL_THREAD_UNLOCK(&module->lock); } if (!(module->passive_target_access_epoch || module->active_eager_send_active)) { return OMPI_ERR_RMA_SYNC; } ret = ompi_datatype_sndrcv (source, source_count, source_datatype, target, target_count, target_datatype); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } if (request) { ompi_osc_rdma_request_complete (request, MPI_SUCCESS); } return OMPI_SUCCESS; }
int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int line = -1, err = 0, rank, remote; void * tmpsend, *tmprecv; ptrdiff_t sext, rext, lb; if (MPI_IN_PLACE == sbuf) { return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_two_procs rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &lb, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } err = ompi_datatype_get_extent (rdtype, &lb, &rext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* exchange data */ remote = rank ^ 1; tmpsend = (char*)sbuf + (ptrdiff_t)remote * sext * (ptrdiff_t)scount; tmprecv = (char*)rbuf + (ptrdiff_t)remote * rext * (ptrdiff_t)rcount; /* send and receive */ err = ompi_coll_tuned_sendrecv ( tmpsend, scount, sdtype, remote, MCA_COLL_BASE_TAG_ALLTOALL, tmprecv, rcount, rdtype, remote, MCA_COLL_BASE_TAG_ALLTOALL, comm, MPI_STATUS_IGNORE, rank ); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* ddt sendrecv your own data */ err = ompi_datatype_sndrcv((char*) sbuf + (ptrdiff_t)rank * sext * (ptrdiff_t)scount, (int32_t) scount, sdtype, (char*) rbuf + (ptrdiff_t)rank * rext * (ptrdiff_t)rcount, (int32_t) rcount, rdtype); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* done */ return MPI_SUCCESS; err_hndl: OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; }
/* * gather_intra * * Function: - basic gather operation * Accepts: - same arguments as MPI_Gather() * Returns: - MPI_SUCCESS or error code */ int ompi_coll_base_gather_intra_basic_linear(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, err, rank, size; char *ptmp; MPI_Aint incr, extent, lb; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); /* Everyone but root sends data and returns. */ OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_basic_linear rank %d", rank)); if (rank != root) { return MCA_PML_CALL(send(sbuf, scount, sdtype, root, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); } /* I am the root, loop receiving the data. */ ompi_datatype_get_extent(rdtype, &lb, &extent); incr = extent * (ptrdiff_t)rcount; for (i = 0, ptmp = (char *) rbuf; i < size; ++i, ptmp += incr) { if (i == rank) { if (MPI_IN_PLACE != sbuf) { err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, ptmp, rcount, rdtype); } else { err = MPI_SUCCESS; } } else { err = MCA_PML_CALL(recv(ptmp, rcount, rdtype, i, MCA_COLL_BASE_TAG_GATHER, comm, MPI_STATUS_IGNORE)); } if (MPI_SUCCESS != err) { return err; } } /* All done */ return MPI_SUCCESS; }
/* * allgather_intra * * Function: - allgather * Accepts: - same as MPI_Allgather() * Returns: - MPI_SUCCESS, or error code */ int mca_coll_self_allgather_intra(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { if (MPI_IN_PLACE == sbuf) { return MPI_SUCCESS; } else { return ompi_datatype_sndrcv(sbuf, scount, sdtype, rbuf, rcount, rdtype); } }
/* completion of an accumulate get operation */ static void ompi_osc_rdma_acc_get_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, mca_btl_base_registration_handle_t *local_handle, void *context, void *data, int status) { ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) context; intptr_t source = (intptr_t) local_address + request->offset; ompi_osc_rdma_sync_t *sync = request->sync; ompi_osc_rdma_module_t *module = sync->module; assert (OMPI_SUCCESS == status); if (OMPI_SUCCESS == status && OMPI_OSC_RDMA_TYPE_GET_ACC == request->type) { if (NULL == request->result_addr) { /* result buffer is not necessarily contiguous. use the opal datatype engine to * copy the data over in this case */ struct iovec iov = {.iov_base = (void *) source, request->len}; uint32_t iov_count = 1; size_t size = request->len; opal_convertor_unpack (&request->convertor, &iov, &iov_count, &size); opal_convertor_cleanup (&request->convertor); } else { /* copy contiguous data to the result buffer */ ompi_datatype_sndrcv ((void *) source, request->len, MPI_BYTE, request->result_addr, request->result_count, request->result_dt); } if (&ompi_mpi_op_no_op.op == request->op) { /* this is a no-op. nothing more to do except release resources and the accumulate lock */ ompi_osc_rdma_acc_put_complete (btl, endpoint, local_address, local_handle, context, data, status); return; } } /* accumulate the data */ if (&ompi_mpi_op_replace.op != request->op) { ompi_op_reduce (request->op, request->origin_addr, (void *) source, request->origin_count, request->origin_dt); } /* initiate the put of the accumulated data */ status = module->selected_btl->btl_put (module->selected_btl, endpoint, (void *) source, request->target_address, local_handle, (mca_btl_base_registration_handle_t *) request->ctx, request->len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_acc_put_complete, request, NULL); /* TODO -- we can do better. probably should queue up the next step and handle it in progress */ assert (OPAL_SUCCESS == status); }
/* * allgatherv_intra * * Function: - allgather * Accepts: - same as MPI_Allgatherv() * Returns: - MPI_SUCCESS or error code */ int mca_coll_self_allgatherv_intra(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void * rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { if (MPI_IN_PLACE == sbuf) { return MPI_SUCCESS; } else { int err; ptrdiff_t lb, extent; err = ompi_datatype_get_extent(rdtype, &lb, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } return ompi_datatype_sndrcv(sbuf, scount, sdtype, ((char *) rbuf) + disps[0] * extent, rcounts[0], rdtype); } }
int ompi_osc_sm_rget(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_win_t *win, struct ompi_request_t **ompi_req) { int ret; ompi_osc_sm_request_t *request; ompi_osc_sm_module_t *module = (ompi_osc_sm_module_t*) win->w_osc_module; void *remote_address; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "rget: 0x%lx, %d, %s, %d, %d, %d, %s, 0x%lx", (unsigned long) origin_addr, origin_count, origin_dt->name, target, (int) target_disp, target_count, target_dt->name, (unsigned long) win)); OMPI_OSC_SM_REQUEST_ALLOC(win, request); if (NULL == request) return OMPI_ERR_OUT_OF_RESOURCE; *ompi_req = &request->super; remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; ret = ompi_datatype_sndrcv(remote_address, target_count, target_dt, origin_addr, origin_count, origin_dt); if (OMPI_SUCCESS != ret) { OMPI_OSC_SM_REQUEST_RETURN(request); return ret; } OMPI_OSC_SM_REQUEST_COMPLETE(request); return OMPI_SUCCESS; }
int ompi_osc_sm_accumulate(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_op_t *op, struct ompi_win_t *win) { int ret; ompi_osc_sm_module_t *module = (ompi_osc_sm_module_t*) win->w_osc_module; void *remote_address; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "accumulate: 0x%lx, %d, %s, %d, %d, %d, %s, %s, 0x%lx", (unsigned long) origin_addr, origin_count, origin_dt->name, target, (int) target_disp, target_count, target_dt->name, op->o_name, (unsigned long) win)); remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; opal_atomic_lock(&module->node_states[target].accumulate_lock); if (op == &ompi_mpi_op_replace.op) { ret = ompi_datatype_sndrcv(origin_addr, origin_count, origin_dt, remote_address, target_count, target_dt); } else { ret = ompi_osc_base_sndrcv_op(origin_addr, origin_count, origin_dt, remote_address, target_count, target_dt, op); } opal_atomic_unlock(&module->node_states[target].accumulate_lock); return ret; }
static inline int ompi_osc_pt2pt_get_self (ompi_osc_pt2pt_sync_t *pt2pt_sync, void *target, int target_count, ompi_datatype_t *target_datatype, OPAL_PTRDIFF_TYPE source_disp, int source_count, ompi_datatype_t *source_datatype, ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_request_t *request) { void *source = (unsigned char*) module->baseptr + ((unsigned long) source_disp * module->disp_unit); int ret; /* if we are in active target mode wait until all post messages arrive */ ompi_osc_pt2pt_sync_wait_expected (pt2pt_sync); ret = ompi_datatype_sndrcv (source, source_count, source_datatype, target, target_count, target_datatype); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } if (request) { ompi_osc_pt2pt_request_complete (request, MPI_SUCCESS); } return OMPI_SUCCESS; }
/* * gather_intra_linear_sync * * Function: - synchronized gather operation with * Accepts: - same arguments as MPI_Gather(), first segment size * Returns: - MPI_SUCCESS or error code */ int ompi_coll_base_gather_intra_linear_sync(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, int first_segment_size) { int i, ret, line, rank, size, first_segment_count; ompi_request_t **reqs = NULL; MPI_Aint extent, lb; size_t typelng; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size)); if (rank != root) { /* Non-root processes: - receive zero byte message from the root, - send the first segment of the data synchronously, - send the second segment of the data. */ ompi_datatype_type_size(sdtype, &typelng); ompi_datatype_get_extent(sdtype, &lb, &extent); first_segment_count = scount; COLL_BASE_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng, first_segment_count ); ret = MCA_PML_CALL(recv(sbuf, 0, MPI_BYTE, root, MCA_COLL_BASE_TAG_GATHER, comm, MPI_STATUS_IGNORE)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } ret = MCA_PML_CALL(send(sbuf, first_segment_count, sdtype, root, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } ret = MCA_PML_CALL(send((char*)sbuf + extent * first_segment_count, (scount - first_segment_count), sdtype, root, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } else { /* Root process, - For every non-root node: - post irecv for the first segment of the message - send zero byte message to signal node to send the message - post irecv for the second segment of the message - wait for the first segment to complete - Copy local data if necessary - Waitall for all the second segments to complete. */ char *ptmp; ompi_request_t *first_segment_req; reqs = (ompi_request_t**) calloc(size, sizeof(ompi_request_t*)); if (NULL == reqs) { ret = -1; line = __LINE__; goto error_hndl; } ompi_datatype_type_size(rdtype, &typelng); ompi_datatype_get_extent(rdtype, &lb, &extent); first_segment_count = rcount; COLL_BASE_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng, first_segment_count ); ptmp = (char *) rbuf; for (i = 0; i < size; ++i) { if (i == rank) { /* skip myself */ reqs[i] = MPI_REQUEST_NULL; continue; } /* irecv for the first segment from i */ ptmp = (char*)rbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * extent; ret = MCA_PML_CALL(irecv(ptmp, first_segment_count, rdtype, i, MCA_COLL_BASE_TAG_GATHER, comm, &first_segment_req)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } /* send sync message */ ret = MCA_PML_CALL(send(rbuf, 0, MPI_BYTE, i, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } /* irecv for the second segment */ ptmp = (char*)rbuf + ((ptrdiff_t)i * (ptrdiff_t)rcount + first_segment_count) * extent; ret = MCA_PML_CALL(irecv(ptmp, (rcount - first_segment_count), rdtype, i, MCA_COLL_BASE_TAG_GATHER, comm, &reqs[i])); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } /* wait on the first segment to complete */ ret = ompi_request_wait(&first_segment_req, MPI_STATUS_IGNORE); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* copy local data if necessary */ if (MPI_IN_PLACE != sbuf) { ret = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent, rcount, rdtype); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* wait all second segments to complete */ ret = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } free(reqs); } /* All done */ return MPI_SUCCESS; error_hndl: if (NULL != reqs) { free(reqs); } OPAL_OUTPUT (( ompi_coll_base_framework.framework_output, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret )); return ret; }
/* * reduce_scatter * * Function: - reduce then scatter * Accepts: - same as MPI_Reduce_scatter() * Returns: - MPI_SUCCESS or error code * * Algorithm: * Cummutative, reasonable sized messages * recursive halving algorithm * Others: * reduce and scatterv (needs to be cleaned * up at some point) * * NOTE: that the recursive halving algorithm should be faster than * the reduce/scatter for all message sizes. However, the memory * usage for the recusive halving is msg_size + 2 * comm_size greater * for the recursive halving, so I've limited where the recursive * halving is used to be nice to the app memory wise. There are much * better algorithms for large messages with cummutative operations, * so this should be investigated further. * * NOTE: We default to a simple reduce/scatterv if one of the rcounts * is zero. This is because the existing algorithms do not currently * support a count of zero in the array. */ int mca_coll_basic_reduce_scatter_intra(void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, size, count, err = OMPI_SUCCESS; ptrdiff_t true_lb, true_extent, lb, extent, buf_size; int *disps = NULL; char *recv_buf = NULL, *recv_buf_free = NULL; char *result_buf = NULL, *result_buf_free = NULL; bool zerocounts = false; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* Find displacements and the like */ disps = (int*) malloc(sizeof(int) * size); if (NULL == disps) return OMPI_ERR_OUT_OF_RESOURCE; disps[0] = 0; for (i = 0; i < (size - 1); ++i) { disps[i + 1] = disps[i] + rcounts[i]; if (0 == rcounts[i]) { zerocounts = true; } } count = disps[size - 1] + rcounts[size - 1]; if (0 == rcounts[size - 1]) { zerocounts = true; } /* short cut the trivial case */ if (0 == count) { free(disps); return OMPI_SUCCESS; } /* get datatype information */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); buf_size = true_extent + (count - 1) * extent; /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; } if ((op->o_flags & OMPI_OP_FLAGS_COMMUTE) && (buf_size < COMMUTATIVE_LONG_MSG) && (!zerocounts)) { int tmp_size, remain = 0, tmp_rank; /* temporary receive buffer. See coll_basic_reduce.c for details on sizing */ recv_buf_free = (char*) malloc(buf_size); recv_buf = recv_buf_free - lb; if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } /* allocate temporary buffer for results */ result_buf_free = (char*) malloc(buf_size); result_buf = result_buf_free - lb; /* copy local buffer into the temporary results */ err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype); if (OMPI_SUCCESS != err) goto cleanup; /* figure out power of two mapping: grow until larger than comm size, then go back one, to get the largest power of two less than comm size */ tmp_size = opal_next_poweroftwo(size); tmp_size >>= 1; remain = size - tmp_size; /* If comm size is not a power of two, have the first "remain" procs with an even rank send to rank + 1, leaving a power of two procs to do the rest of the algorithm */ if (rank < 2 * remain) { if ((rank & 1) == 0) { err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1, MCA_COLL_BASE_TAG_REDUCE_SCATTER, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) goto cleanup; /* we don't participate from here on out */ tmp_rank = -1; } else { err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) goto cleanup; /* integrate their results into our temp results */ ompi_op_reduce(op, recv_buf, result_buf, count, dtype); /* adjust rank to be the bottom "remain" ranks */ tmp_rank = rank / 2; } } else { /* just need to adjust rank to show that the bottom "even remain" ranks dropped out */ tmp_rank = rank - remain; } /* For ranks not kicked out by the above code, perform the recursive halving */ if (tmp_rank >= 0) { int *tmp_disps = NULL, *tmp_rcounts = NULL; int mask, send_index, recv_index, last_index; /* recalculate disps and rcounts to account for the special "remainder" processes that are no longer doing anything */ tmp_rcounts = (int*) malloc(tmp_size * sizeof(int)); if (NULL == tmp_rcounts) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } tmp_disps = (int*) malloc(tmp_size * sizeof(int)); if (NULL == tmp_disps) { free(tmp_rcounts); err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; } for (i = 0 ; i < tmp_size ; ++i) { if (i < remain) { /* need to include old neighbor as well */ tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2]; } else { tmp_rcounts[i] = rcounts[i + remain]; } } tmp_disps[0] = 0; for (i = 0; i < tmp_size - 1; ++i) { tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i]; } /* do the recursive halving communication. Don't use the dimension information on the communicator because I think the information is invalidated by our "shrinking" of the communicator */ mask = tmp_size >> 1; send_index = recv_index = 0; last_index = tmp_size; while (mask > 0) { int tmp_peer, peer, send_count, recv_count; struct ompi_request_t *request; tmp_peer = tmp_rank ^ mask; peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain; /* figure out if we're sending, receiving, or both */ send_count = recv_count = 0; if (tmp_rank < tmp_peer) { send_index = recv_index + mask; for (i = send_index ; i < last_index ; ++i) { send_count += tmp_rcounts[i]; } for (i = recv_index ; i < send_index ; ++i) { recv_count += tmp_rcounts[i]; } } else { recv_index = send_index + mask; for (i = send_index ; i < recv_index ; ++i) { send_count += tmp_rcounts[i]; } for (i = recv_index ; i < last_index ; ++i) { recv_count += tmp_rcounts[i]; } } /* actual data transfer. Send from result_buf, receive into recv_buf */ if (send_count > 0 && recv_count != 0) { err = MCA_PML_CALL(irecv(recv_buf + tmp_disps[recv_index] * extent, recv_count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, &request)); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } if (recv_count > 0 && send_count != 0) { err = MCA_PML_CALL(send(result_buf + tmp_disps[send_index] * extent, send_count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE_SCATTER, MCA_PML_BASE_SEND_STANDARD, comm)); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } if (send_count > 0 && recv_count != 0) { err = ompi_request_wait(&request, MPI_STATUS_IGNORE); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } /* if we received something on this step, push it into the results buffer */ if (recv_count > 0) { ompi_op_reduce(op, recv_buf + tmp_disps[recv_index] * extent, result_buf + tmp_disps[recv_index] * extent, recv_count, dtype); } /* update for next iteration */ send_index = recv_index; last_index = recv_index + mask; mask >>= 1; } /* copy local results from results buffer into real receive buffer */ if (0 != rcounts[rank]) { err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent, rcounts[rank], dtype, rbuf, rcounts[rank], dtype); if (OMPI_SUCCESS != err) { free(tmp_rcounts); free(tmp_disps); goto cleanup; } } free(tmp_rcounts); free(tmp_disps); }
/* * Linear functions are copied from the basic coll module. For * some small number of nodes and/or small data sizes they are just as * fast as tuned/tree based segmenting operations and as such may be * selected by the decision functions. These are copied into this module * due to the way we select modules in V1. i.e. in V2 we will handle this * differently and so will not have to duplicate code. * GEF Oct05 after asking Jeff. */ int ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, size, rank, err, nreqs; char *psnd, *prcv; ptrdiff_t sext, rext; MPI_Request *preq; mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; if (MPI_IN_PLACE == sbuf) { return mca_coll_tuned_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps, rdtype, comm, module); } size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:alltoallv_intra_basic_linear rank %d", rank)); ompi_datatype_type_extent(sdtype, &sext); ompi_datatype_type_extent(rdtype, &rext); /* Simple optimization - handle send to self first */ psnd = ((char *) sbuf) + (ptrdiff_t)sdisps[rank] * sext; prcv = ((char *) rbuf) + (ptrdiff_t)rdisps[rank] * rext; if (0 != scounts[rank]) { err = ompi_datatype_sndrcv(psnd, scounts[rank], sdtype, prcv, rcounts[rank], rdtype); if (MPI_SUCCESS != err) { return err; } } /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Now, initiate all send/recv to/from others. */ nreqs = 0; preq = data->mcct_reqs; /* Post all receives first */ for (i = 0; i < size; ++i) { if (i == rank || 0 == rcounts[i]) { continue; } prcv = ((char *) rbuf) + (ptrdiff_t)rdisps[i] * rext; err = MCA_PML_CALL(irecv_init(prcv, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++)); ++nreqs; if (MPI_SUCCESS != err) { ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs); return err; } } /* Now post all sends */ for (i = 0; i < size; ++i) { if (i == rank || 0 == scounts[i]) { continue; } psnd = ((char *) sbuf) + (ptrdiff_t)sdisps[i] * sext; err = MCA_PML_CALL(isend_init(psnd, scounts[i], sdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); ++nreqs; if (MPI_SUCCESS != err) { ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs); return err; } } /* Start your engines. This will never return an error. */ MCA_PML_CALL(start(nreqs, data->mcct_reqs)); /* Wait for them all. If there's an error, note that we don't care * what the error was -- just that there *was* an error. The PML * will finish all requests, even if one or more of them fail. * i.e., by the end of this call, all the requests are free-able. * So free them anyway -- even if there was an error, and return the * error after we free everything. */ err = ompi_request_wait_all(nreqs, data->mcct_reqs, MPI_STATUSES_IGNORE); /* Free the requests. */ ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs); return err; }
/* * scatterv_intra * * Function: - scatterv operation * Accepts: - same arguments as MPI_Scatterv() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_scatterv_intra(const void *sbuf, const int *scounts, const int *disps, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, size, err; char *ptmp; ptrdiff_t lb, extent; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* If not root, receive data. */ if (rank != root) { /* Only receive if there is something to receive */ if (rcount > 0) { return MCA_PML_CALL(recv(rbuf, rcount, rdtype, root, MCA_COLL_BASE_TAG_SCATTERV, comm, MPI_STATUS_IGNORE)); } return MPI_SUCCESS; } /* I am the root, loop sending data. */ err = ompi_datatype_get_extent(sdtype, &lb, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } for (i = 0; i < size; ++i) { ptmp = ((char *) sbuf) + (extent * disps[i]); /* simple optimization */ if (i == rank) { /* simple optimization or a local operation */ if (scounts[i] > 0 && MPI_IN_PLACE != rbuf) { err = ompi_datatype_sndrcv(ptmp, scounts[i], sdtype, rbuf, rcount, rdtype); } } else { /* Only send if there is something to send */ if (scounts[i] > 0) { err = MCA_PML_CALL(send(ptmp, scounts[i], sdtype, i, MCA_COLL_BASE_TAG_SCATTERV, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { return err; } } } } /* All done */ return MPI_SUCCESS; }
/* * gatherv_intra * * Function: - basic gatherv operation * Accepts: - same arguments as MPI_Gatherv() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_gatherv_intra(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, const int *rcounts, const int *disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, size, err; char *ptmp; ptrdiff_t lb, extent; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); /* Everyone but root sends data and returns. Don't send anything for sendcounts of 0 (even though MPI_Gatherv has a guard for 0 counts, this routine is used elsewhere, like the implementation of allgatherv, so it's possible to get here with a scount of 0) */ if (rank != root) { if (scount > 0) { return MCA_PML_CALL(send(sbuf, scount, sdtype, root, MCA_COLL_BASE_TAG_GATHERV, MCA_PML_BASE_SEND_STANDARD, comm)); } return MPI_SUCCESS; } /* I am the root, loop receiving data. */ err = ompi_datatype_get_extent(rdtype, &lb, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } for (i = 0; i < size; ++i) { ptmp = ((char *) rbuf) + (extent * disps[i]); if (i == rank) { /* simple optimization */ if (MPI_IN_PLACE != sbuf && (0 < scount) && (0 < rcounts[i])) { err = ompi_datatype_sndrcv(sbuf, scount, sdtype, ptmp, rcounts[i], rdtype); } } else { /* Only receive if there is something to receive */ if (rcounts[i] > 0) { err = MCA_PML_CALL(recv(ptmp, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_GATHERV, comm, MPI_STATUS_IGNORE)); } } if (MPI_SUCCESS != err) { return err; } } /* All done */ return MPI_SUCCESS; }
int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, size, err, nreqs; char *psnd, *prcv; MPI_Aint lb, sndinc, rcvinc; ompi_request_t **req, **sreq, **rreq; mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; if (MPI_IN_PLACE == sbuf) { return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_basic_linear rank %d", rank)); err = ompi_datatype_get_extent(sdtype, &lb, &sndinc); if (OMPI_SUCCESS != err) { return err; } sndinc *= scount; err = ompi_datatype_get_extent(rdtype, &lb, &rcvinc); if (OMPI_SUCCESS != err) { return err; } rcvinc *= rcount; /* simple optimization */ psnd = ((char *) sbuf) + (ptrdiff_t)rank * sndinc; prcv = ((char *) rbuf) + (ptrdiff_t)rank * rcvinc; err = ompi_datatype_sndrcv(psnd, scount, sdtype, prcv, rcount, rdtype); if (MPI_SUCCESS != err) { return err; } /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Initiate all send/recv to/from others. */ req = rreq = data->mcct_reqs; sreq = rreq + size - 1; prcv = (char *) rbuf; psnd = (char *) sbuf; /* Post all receives first -- a simple optimization */ for (nreqs = 0, i = (rank + 1) % size; i != rank; i = (i + 1) % size, ++rreq, ++nreqs) { err = MCA_PML_CALL(irecv_init (prcv + (ptrdiff_t)i * rcvinc, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq)); if (MPI_SUCCESS != err) { ompi_coll_tuned_free_reqs(req, rreq - req); return err; } } /* Now post all sends in reverse order - We would like to minimize the search time through message queue when messages actually arrive in the order in which they were posted. */ for (nreqs = 0, i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size, ++sreq, ++nreqs) { err = MCA_PML_CALL(isend_init (psnd + (ptrdiff_t)i * sndinc, scount, sdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, sreq)); if (MPI_SUCCESS != err) { ompi_coll_tuned_free_reqs(req, sreq - req); return err; } } nreqs = (size - 1) * 2; /* Start your engines. This will never return an error. */ MCA_PML_CALL(start(nreqs, req)); /* Wait for them all. If there's an error, note that we don't * care what the error was -- just that there *was* an error. The * PML will finish all requests, even if one or more of them fail. * i.e., by the end of this call, all the requests are free-able. * So free them anyway -- even if there was an error, and return * the error after we free everything. */ err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE); /* Free the reqs */ ompi_coll_tuned_free_reqs(req, nreqs); /* All done */ return err; }
/* * alltoall_intra * * Function: - MPI_Alltoall * Accepts: - same as MPI_Alltoall() * Returns: - MPI_SUCCESS or an MPI error code */ int mca_coll_basic_alltoall_intra(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i; int rank; int size; int err; int nreqs; char *psnd; char *prcv; MPI_Aint lb; MPI_Aint sndinc; MPI_Aint rcvinc; ompi_request_t **req; ompi_request_t **sreq; ompi_request_t **rreq; mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module; /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); err = ompi_datatype_get_extent(sdtype, &lb, &sndinc); if (OMPI_SUCCESS != err) { return err; } sndinc *= scount; err = ompi_datatype_get_extent(rdtype, &lb, &rcvinc); if (OMPI_SUCCESS != err) { return err; } rcvinc *= rcount; /* simple optimization */ psnd = ((char *) sbuf) + (rank * sndinc); prcv = ((char *) rbuf) + (rank * rcvinc); err = ompi_datatype_sndrcv(psnd, scount, sdtype, prcv, rcount, rdtype); if (MPI_SUCCESS != err) { return err; } /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Initiate all send/recv to/from others. */ req = rreq = basic_module->mccb_reqs; sreq = rreq + size - 1; prcv = (char *) rbuf; psnd = (char *) sbuf; /* Post all receives first -- a simple optimization */ for (nreqs = 0, i = (rank + 1) % size; i != rank; i = (i + 1) % size, ++rreq, ++nreqs) { err = MCA_PML_CALL(irecv_init (prcv + (i * rcvinc), rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq)); if (MPI_SUCCESS != err) { mca_coll_basic_free_reqs(req, nreqs); return err; } } /* Now post all sends */ for (nreqs = 0, i = (rank + 1) % size; i != rank; i = (i + 1) % size, ++sreq, ++nreqs) { err = MCA_PML_CALL(isend_init (psnd + (i * sndinc), scount, sdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, sreq)); if (MPI_SUCCESS != err) { mca_coll_basic_free_reqs(req, nreqs); return err; } } nreqs = (size - 1) * 2; /* Start your engines. This will never return an error. */ MCA_PML_CALL(start(nreqs, req)); /* Wait for them all. If there's an error, note that we don't * care what the error was -- just that there *was* an error. The * PML will finish all requests, even if one or more of them fail. * i.e., by the end of this call, all the requests are free-able. * So free them anyway -- even if there was an error, and return * the error after we free everything. */ err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE); /* Free the reqs */ mca_coll_basic_free_reqs(req, nreqs); /* All done */ return err; }
/* * alltoall_intra_linear_sync * * Function: Linear implementation of alltoall with limited number * of outstanding requests. * Accepts: Same as MPI_Alltoall(), and the maximum number of * outstanding requests (actual number is 2 * max, since * we count receive and send requests separately). * Returns: MPI_SUCCESS or error code * * Description: Algorithm is the following: * 1) post K irecvs, K <= N * 2) post K isends, K <= N * 3) while not done * - wait for any request to complete * - replace that request by the new one of the same type. */ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module, int max_outstanding_reqs) { int line, error, ri, si, rank, size, nreqs, nrreqs, nsreqs, total_reqs; char *psnd, *prcv; ptrdiff_t slb, sext, rlb, rext; ompi_request_t **reqs = NULL; if (MPI_IN_PLACE == sbuf) { return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_linear_sync rank %d", rank)); error = ompi_datatype_get_extent(sdtype, &slb, &sext); if (OMPI_SUCCESS != error) { return error; } sext *= scount; error = ompi_datatype_get_extent(rdtype, &rlb, &rext); if (OMPI_SUCCESS != error) { return error; } rext *= rcount; /* simple optimization */ psnd = ((char *) sbuf) + (ptrdiff_t)rank * sext; prcv = ((char *) rbuf) + (ptrdiff_t)rank * rext; error = ompi_datatype_sndrcv(psnd, scount, sdtype, prcv, rcount, rdtype); if (MPI_SUCCESS != error) { return error; } /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Initiate send/recv to/from others. */ total_reqs = (((max_outstanding_reqs > (size - 1)) || (max_outstanding_reqs <= 0)) ? (size - 1) : (max_outstanding_reqs)); reqs = (ompi_request_t**) malloc( 2 * total_reqs * sizeof(ompi_request_t*)); if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; } prcv = (char *) rbuf; psnd = (char *) sbuf; /* Post first batch or ireceive and isend requests */ for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs; ri = (ri + 1) % size, ++nreqs, ++nrreqs) { error = MCA_PML_CALL(irecv (prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri, MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[nreqs])); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } } for ( nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs; si = (si + size - 1) % size, ++nreqs, ++nsreqs) { error = MCA_PML_CALL(isend (psnd + (ptrdiff_t)si * sext, scount, sdtype, si, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, &reqs[nreqs])); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } } /* Wait for requests to complete */ if (nreqs == 2 * (size - 1)) { /* Optimization for the case when all requests have been posted */ error = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } } else { /* As requests complete, replace them with corresponding requests: - wait for any request to complete, mark the request as MPI_REQUEST_NULL - If it was a receive request, replace it with new irecv request (if any) - if it was a send request, replace it with new isend request (if any) */ int ncreqs = 0; while (ncreqs < 2 * (size - 1)) { int completed; error = ompi_request_wait_any(2 * total_reqs, reqs, &completed, MPI_STATUS_IGNORE); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } reqs[completed] = MPI_REQUEST_NULL; ncreqs++; if (completed < total_reqs) { if (nrreqs < (size - 1)) { error = MCA_PML_CALL(irecv (prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri, MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[completed])); if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } ++nrreqs; ri = (ri + 1) % size; } } else { if (nsreqs < (size - 1)) { error = MCA_PML_CALL(isend (psnd + (ptrdiff_t)si * sext, scount, sdtype, si, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, &reqs[completed])); ++nsreqs; si = (si + size - 1) % size; } } } } /* Free the reqs */ free(reqs); /* All done */ return MPI_SUCCESS; error_hndl: OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error, rank)); if (NULL != reqs) free(reqs); return error; }
/* * alltoallw_intra * * Function: - MPI_Alltoallw * Accepts: - same as MPI_Alltoallw() * Returns: - MPI_SUCCESS or an MPI error code */ int mca_coll_basic_alltoallw_intra(void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t **sdtypes, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t **rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i; int size; int rank; int err; char *psnd; char *prcv; int nreqs; MPI_Request *preq; mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module; /* Initialize. */ if (MPI_IN_PLACE == sbuf) { return mca_coll_basic_alltoallw_intra_inplace (rbuf, rcounts, rdisps, rdtypes, comm, module); } size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); /* simple optimization */ psnd = ((char *) sbuf) + sdisps[rank]; prcv = ((char *) rbuf) + rdisps[rank]; if (0 != scounts[rank]) { err = ompi_datatype_sndrcv(psnd, scounts[rank], sdtypes[rank], prcv, rcounts[rank], rdtypes[rank]); if (MPI_SUCCESS != err) { return err; } } /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Initiate all send/recv to/from others. */ nreqs = 0; preq = basic_module->mccb_reqs; /* Post all receives first -- a simple optimization */ for (i = 0; i < size; ++i) { if (i == rank || 0 == rcounts[i]) continue; prcv = ((char *) rbuf) + rdisps[i]; err = MCA_PML_CALL(irecv_init(prcv, rcounts[i], rdtypes[i], i, MCA_COLL_BASE_TAG_ALLTOALLW, comm, preq++)); ++nreqs; if (MPI_SUCCESS != err) { mca_coll_basic_free_reqs(basic_module->mccb_reqs, nreqs); return err; } } /* Now post all sends */ for (i = 0; i < size; ++i) { if (i == rank || 0 == scounts[i]) continue; psnd = ((char *) sbuf) + sdisps[i]; err = MCA_PML_CALL(isend_init(psnd, scounts[i], sdtypes[i], i, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); ++nreqs; if (MPI_SUCCESS != err) { mca_coll_basic_free_reqs(basic_module->mccb_reqs, nreqs); return err; } } /* Start your engines. This will never return an error. */ MCA_PML_CALL(start(nreqs, basic_module->mccb_reqs)); /* Wait for them all. If there's an error, note that we don't care * what the error was -- just that there *was* an error. The PML * will finish all requests, even if one or more of them fail. * i.e., by the end of this call, all the requests are free-able. * So free them anyway -- even if there was an error, and return the * error after we free everything. */ err = ompi_request_wait_all(nreqs, basic_module->mccb_reqs, MPI_STATUSES_IGNORE); /* Free the requests. */ mca_coll_basic_free_reqs(basic_module->mccb_reqs, nreqs); /* All done */ return err; }
int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, size, err, line; int nreqs = 0; char *psnd, *prcv; MPI_Aint lb, sndinc, rcvinc; ompi_request_t **req, **sreq, **rreq; mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; mca_coll_base_comm_t *data = base_module->base_data; if (MPI_IN_PLACE == sbuf) { return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_alltoall_intra_basic_linear rank %d", rank)); err = ompi_datatype_get_extent(sdtype, &lb, &sndinc); if (OMPI_SUCCESS != err) { return err; } sndinc *= scount; err = ompi_datatype_get_extent(rdtype, &lb, &rcvinc); if (OMPI_SUCCESS != err) { return err; } rcvinc *= rcount; /* simple optimization */ psnd = ((char *) sbuf) + (ptrdiff_t)rank * sndinc; prcv = ((char *) rbuf) + (ptrdiff_t)rank * rcvinc; err = ompi_datatype_sndrcv(psnd, scount, sdtype, prcv, rcount, rdtype); if (MPI_SUCCESS != err) { return err; } /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Initiate all send/recv to/from others. */ req = rreq = coll_base_comm_get_reqs(data, (size - 1) * 2); if (NULL == req) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } prcv = (char *) rbuf; psnd = (char *) sbuf; /* Post all receives first -- a simple optimization */ for (nreqs = 0, i = (rank + 1) % size; i != rank; i = (i + 1) % size, ++rreq) { nreqs++; err = MCA_PML_CALL(irecv_init (prcv + (ptrdiff_t)i * rcvinc, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } /* Now post all sends in reverse order - We would like to minimize the search time through message queue when messages actually arrive in the order in which they were posted. */ sreq = rreq; for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size, ++sreq) { nreqs++; err = MCA_PML_CALL(isend_init (psnd + (ptrdiff_t)i * sndinc, scount, sdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, sreq)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } /* Start your engines. This will never return an error. */ MCA_PML_CALL(start(nreqs, req)); /* Wait for them all. If there's an error, note that we don't * care what the error was -- just that there *was* an error. The * PML will finish all requests, even if one or more of them fail. * i.e., by the end of this call, all the requests are free-able. * So free them anyway -- even if there was an error, and return * the error after we free everything. */ err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } err_hndl: if( MPI_SUCCESS != err ) { OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank) ); (void)line; // silence compiler warning } /* Free the reqs in all cases as they are persistent requests */ ompi_coll_base_free_reqs(req, nreqs); /* All done */ return err; }
/* Todo: gather_intra_generic, gather_intra_binary, gather_intra_chain, * gather_intra_pipeline, segmentation? */ int ompi_coll_base_gather_intra_binomial(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int line = -1, i, rank, vrank, size, total_recv = 0, err; char *ptmp = NULL, *tempbuf = NULL; ompi_coll_tree_t* bmtree; MPI_Status status; MPI_Aint sextent, slb, strue_lb, strue_extent; MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent; mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; mca_coll_base_comm_t *data = base_module->base_data; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_binomial rank %d", rank)); /* create the binomial tree */ COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root ); bmtree = data->cached_in_order_bmtree; ompi_datatype_get_extent(sdtype, &slb, &sextent); ompi_datatype_get_true_extent(sdtype, &strue_lb, &strue_extent); vrank = (rank - root + size) % size; if (rank == root) { ompi_datatype_get_extent(rdtype, &rlb, &rextent); ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent); if (0 == root){ /* root on 0, just use the recv buffer */ ptmp = (char *) rbuf; if (sbuf != MPI_IN_PLACE) { err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, ptmp, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } } else { /* root is not on 0, allocate temp buffer for recv, * rotate data at the end */ tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } ptmp = tempbuf - rtrue_lb; if (sbuf != MPI_IN_PLACE) { /* copy from sbuf to temp buffer */ err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, ptmp, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } else { /* copy from rbuf to temp buffer */ err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp, (char *)rbuf + (ptrdiff_t)rank * rextent * (ptrdiff_t)rcount); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } } total_recv = rcount; } else if (!(vrank % 2)) { /* other non-leaf nodes, allocate temp buffer for data received from * children, the most we need is half of the total data elements due * to the property of binimoal tree */ tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } ptmp = tempbuf - strue_lb; /* local copy to tempbuf */ err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, ptmp, scount, sdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } /* use sdtype,scount as rdtype,rdcount since they are ignored on * non-root procs */ rdtype = sdtype; rcount = scount; rextent = sextent; total_recv = rcount; } else { /* leaf nodes, no temp buffer needed, use sdtype,scount as * rdtype,rdcount since they are ignored on non-root procs */ ptmp = (char *) sbuf; total_recv = scount; } if (!(vrank % 2)) { /* all non-leaf nodes recv from children */ for (i = 0; i < bmtree->tree_nextsize; i++) { int mycount = 0, vkid; /* figure out how much data I have to send to this child */ vkid = (bmtree->tree_next[i] - root + size) % size; mycount = vkid - vrank; if (mycount > (size - vkid)) mycount = size - vkid; mycount *= rcount; OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_binomial rank %d recv %d mycount = %d", rank, bmtree->tree_next[i], mycount)); err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, (ptrdiff_t)rcount * size - total_recv, rdtype, bmtree->tree_next[i], MCA_COLL_BASE_TAG_GATHER, comm, &status)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } total_recv += mycount; } } if (rank != root) { /* all nodes except root send to parents */ OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_binomial rank %d send %d count %d\n", rank, bmtree->tree_prev, total_recv)); err = MCA_PML_CALL(send(ptmp, total_recv, sdtype, bmtree->tree_prev, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } if (rank == root) { if (root != 0) { /* rotate received data on root if root != 0 */ err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)(size - root), (char *)rbuf + rextent * (ptrdiff_t)root * (ptrdiff_t)rcount, ptmp); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)root, (char *) rbuf, ptmp + rextent * (ptrdiff_t)rcount * (ptrdiff_t)(size-root)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } free(tempbuf); } } else if (!(vrank % 2)) { /* other non-leaf nodes */ free(tempbuf); } return MPI_SUCCESS; err_hndl: if (NULL != tempbuf) free(tempbuf); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; }