int
ompi_osc_portals4_put(void *origin_addr,
                      int origin_count,
                      struct ompi_datatype_t *origin_dt,
                      int target,
                      OPAL_PTRDIFF_TYPE target_disp,
                      int target_count,
                      struct ompi_datatype_t *target_dt,
                      struct ompi_win_t *win)
{
    int ret;
    ompi_osc_portals4_module_t *module =
        (ompi_osc_portals4_module_t*) win->w_osc_module;
    ptl_process_t peer = ompi_osc_portals4_get_peer(module, target);
    size_t length;
    size_t offset;
    ptl_handle_md_t md_h;
    void *md_base;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "put: 0x%lx, %d, %s, %d, %d, %d, %s, 0x%lx",
                         (unsigned long) origin_addr, origin_count,
                         origin_dt->name, target, (int) target_disp,
                         target_count, target_dt->name,
                         (unsigned long) win));

    offset = get_displacement(module, target) * target_disp;

    if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) ||
        !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) {
        opal_output(ompi_osc_base_framework.framework_output,
                    "MPI_Put: transfer of non-contiguous memory is not currently supported.\n");
        return OMPI_ERR_NOT_SUPPORTED;
    } else {
        (void)opal_atomic_add_64(&module->opcount, 1);

        ret = ompi_datatype_type_size(origin_dt, &length);
        if (OMPI_SUCCESS != ret) {
            return ret;
        }
        length *= origin_count;

        ompi_osc_portals4_get_md(origin_addr, module->md_h, &md_h, &md_base);
        ret = PtlPut(md_h,
                     (ptl_size_t) ((char*) origin_addr - (char*) md_base),
                     length,
                     PTL_ACK_REQ,
                     peer,
                     module->pt_idx,
                     module->match_bits,
                     offset,
                     NULL,
                     0);
        if (OMPI_SUCCESS != ret) {
            return ret;
        }
    }

    return OMPI_SUCCESS;
}
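/*
 * Editor's note: the PtlPut above addresses the origin buffer as a byte
 * offset from the base of a pre-registered memory descriptor.  The
 * self-contained sketch below illustrates that lookup-and-offset pattern
 * in isolation; md_entry_t and find_md() are hypothetical stand-ins for
 * what ompi_osc_portals4_get_md() does, not part of the Portals4 component.
 */
#include <stddef.h>
#include <stdint.h>

typedef struct {
    void  *base;   /* start of a registered region */
    size_t len;    /* length of the region in bytes */
} md_entry_t;

/* Return the region covering addr (or NULL) and the byte offset into it. */
static const md_entry_t *find_md(const md_entry_t *tab, size_t n,
                                 const void *addr, size_t *offset)
{
    for (size_t i = 0; i < n; i++) {
        uintptr_t base = (uintptr_t) tab[i].base;
        uintptr_t a = (uintptr_t) addr;
        if (a >= base && a < base + tab[i].len) {
            /* same arithmetic as (char*)origin_addr - (char*)md_base above */
            *offset = (size_t) (a - base);
            return &tab[i];
        }
    }
    return NULL;
}

int main(void)
{
    char region_a[64], region_b[128];
    md_entry_t tab[2] = { { region_a, sizeof(region_a) },
                          { region_b, sizeof(region_b) } };
    size_t off;
    const md_entry_t *md = find_md(tab, 2, region_b + 16, &off);
    /* expect md == &tab[1] and off == 16 */
    return (md == &tab[1] && off == 16) ? 0 : 1;
}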
int ompi_osc_ucx_get(void *origin_addr, int origin_count,
                     struct ompi_datatype_t *origin_dt,
                     int target, ptrdiff_t target_disp, int target_count,
                     struct ompi_datatype_t *target_dt,
                     struct ompi_win_t *win)
{
    ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
    ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
    uint64_t remote_addr = (module->win_info_array[target]).addr
        + target_disp * OSC_UCX_GET_DISP(module, target);
    ucp_rkey_h rkey;
    ptrdiff_t origin_lb, origin_extent, target_lb, target_extent;
    bool is_origin_contig = false, is_target_contig = false;
    ucs_status_t status;
    int ret = OMPI_SUCCESS;

    ret = check_sync_state(module, target, false);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) {
        status = get_dynamic_win_info(remote_addr, module, ep, target);
        if (status != UCS_OK) {
            return OMPI_ERROR;
        }
    }

    rkey = (module->win_info_array[target]).rkey;

    ompi_datatype_get_true_extent(origin_dt, &origin_lb, &origin_extent);
    ompi_datatype_get_true_extent(target_dt, &target_lb, &target_extent);

    is_origin_contig = ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count);
    is_target_contig = ompi_datatype_is_contiguous_memory_layout(target_dt, target_count);

    if (is_origin_contig && is_target_contig) {
        /* fast path */
        size_t origin_len;

        ompi_datatype_type_size(origin_dt, &origin_len);
        origin_len *= origin_count;

        status = ucp_get_nbi(ep, (void *)((intptr_t)origin_addr + origin_lb),
                             origin_len, remote_addr + target_lb, rkey);
        if (status != UCS_OK && status != UCS_INPROGRESS) {
            opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                "%s:%d: ucp_get_nbi failed: %d\n",
                                __FILE__, __LINE__, status);
            return OMPI_ERROR;
        }

        return incr_and_check_ops_num(module, target, ep);
    } else {
        return ddt_put_get(module, origin_addr, origin_count, origin_dt,
                           is_origin_contig, origin_lb, target, ep,
                           remote_addr, rkey, target_count, target_dt,
                           is_target_contig, target_lb, true);
    }
}
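/*
 * Editor's note: a minimal, self-contained caller that exercises the
 * contiguous fast path above.  Both sides use MPI_DOUBLE, so the
 * contiguity checks succeed for origin and target and the transfer
 * maps onto a single ucp_get_nbi().
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Win win;
    double local[4] = {0}, exposed[4] = {1, 2, 3, 4};

    MPI_Init(&argc, &argv);
    MPI_Win_create(exposed, sizeof(exposed), sizeof(double),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    MPI_Win_fence(0, win);
    /* read 4 doubles from rank 0's window, starting at displacement 0 */
    MPI_Get(local, 4, MPI_DOUBLE, 0 /* target */, 0 /* disp */,
            4, MPI_DOUBLE, win);
    MPI_Win_fence(0, win);

    printf("local[3] = %g\n", local[3]);
    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}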
void ADIOI_Datatype_iscontig(MPI_Datatype datatype, int *flag)
{
    /*
     * Open MPI's contiguity check returns true for datatypes with
     * gaps at the beginning and at the end.  We have to provide a
     * count of 2 in order to get these gaps taken into account.
     * In addition, if the data is contiguous but true_lb differs
     * from zero, ROMIO will ignore the displacement.  Thus, lie!
     */
    *flag = ompi_datatype_is_contiguous_memory_layout(datatype, 2);
    if (*flag) {
        MPI_Aint true_extent, true_lb;

        ompi_datatype_get_true_extent(datatype, &true_lb, &true_extent);

        if (true_lb > 0)
            *flag = 0;
    }
}
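/*
 * Editor's note: a standalone illustration of the true_lb case handled
 * above.  An indexed type whose single block starts one int into the
 * type is contiguous in memory, but its true lower bound is nonzero
 * (sizeof(int), i.e. 4 on common ABIs), so ADIOI_Datatype_iscontig()
 * must report it as non-contiguous or ROMIO would drop the displacement.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Datatype t;
    MPI_Aint true_lb, true_extent;
    int blocklen = 2, disp = 1;   /* two ints, starting at element 1 */

    MPI_Init(&argc, &argv);
    MPI_Type_indexed(1, &blocklen, &disp, MPI_INT, &t);
    MPI_Type_commit(&t);

    MPI_Type_get_true_extent(t, &true_lb, &true_extent);
    printf("true_lb = %ld, true_extent = %ld\n",
           (long) true_lb, (long) true_extent);   /* e.g. 4 and 8 */

    MPI_Type_free(&t);
    MPI_Finalize();
    return 0;
}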
int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count,
                                struct ompi_datatype_t *origin_dt,
                                void *result_addr, int result_count,
                                struct ompi_datatype_t *result_dt,
                                int target, ptrdiff_t target_disp,
                                int target_count, struct ompi_datatype_t *target_dt,
                                struct ompi_op_t *op, struct ompi_win_t *win)
{
    ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
    ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
    int ret = OMPI_SUCCESS;

    ret = check_sync_state(module, target, false);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    ret = start_atomicity(module, ep, target);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    ret = ompi_osc_ucx_get(result_addr, result_count, result_dt,
                           target, target_disp, target_count, target_dt, win);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    if (op != &ompi_mpi_op_no_op.op) {
        if (op == &ompi_mpi_op_replace.op) {
            ret = ompi_osc_ucx_put(origin_addr, origin_count, origin_dt, target,
                                   target_disp, target_count, target_dt, win);
            if (ret != OMPI_SUCCESS) {
                return ret;
            }
        } else {
            void *temp_addr_holder = NULL; /* keeps the buffer base: temp_addr is
                                            * advanced while reducing, but the final
                                            * put and free need the original pointer */
            void *temp_addr = NULL;
            uint32_t temp_count;
            ompi_datatype_t *temp_dt;
            ptrdiff_t temp_lb, temp_extent;
            ucs_status_t status;
            bool is_origin_contig =
                ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count);

            if (ompi_datatype_is_predefined(target_dt)) {
                temp_dt = target_dt;
                temp_count = target_count;
            } else {
                ret = ompi_osc_base_get_primitive_type_info(target_dt, &temp_dt, &temp_count);
                if (ret != OMPI_SUCCESS) {
                    return ret;
                }
            }
            ompi_datatype_get_true_extent(temp_dt, &temp_lb, &temp_extent);
            temp_addr = temp_addr_holder = malloc(temp_extent * temp_count);
            if (temp_addr == NULL) {
                return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
            }

            ret = ompi_osc_ucx_get(temp_addr, (int)temp_count, temp_dt,
                                   target, target_disp, target_count, target_dt, win);
            if (ret != OMPI_SUCCESS) {
                return ret;
            }

            status = ucp_ep_flush(ep);
            if (status != UCS_OK) {
                opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                    "%s:%d: ucp_ep_flush failed: %d\n",
                                    __FILE__, __LINE__, status);
                return OMPI_ERROR;
            }

            if (ompi_datatype_is_predefined(origin_dt) || is_origin_contig) {
                ompi_op_reduce(op, (void *)origin_addr, temp_addr, (int)temp_count, temp_dt);
            } else {
                ucx_iovec_t *origin_ucx_iov = NULL;
                uint32_t origin_ucx_iov_count = 0;
                uint32_t origin_ucx_iov_idx = 0;

                ret = create_iov_list(origin_addr, origin_count, origin_dt,
                                      &origin_ucx_iov, &origin_ucx_iov_count);
                if (ret != OMPI_SUCCESS) {
                    return ret;
                }

                if ((op != &ompi_mpi_op_maxloc.op && op != &ompi_mpi_op_minloc.op) ||
                    ompi_datatype_is_contiguous_memory_layout(temp_dt, temp_count)) {
                    size_t temp_size;
                    ompi_datatype_type_size(temp_dt, &temp_size);
                    while (origin_ucx_iov_idx < origin_ucx_iov_count) {
                        int curr_count = origin_ucx_iov[origin_ucx_iov_idx].len / temp_size;
                        ompi_op_reduce(op, origin_ucx_iov[origin_ucx_iov_idx].addr,
                                       temp_addr, curr_count, temp_dt);
                        temp_addr = (void *)((char *)temp_addr + curr_count * temp_size);
                        origin_ucx_iov_idx++;
                    }
                } else {
                    int i;
                    void *curr_origin_addr = origin_ucx_iov[origin_ucx_iov_idx].addr;
                    for (i = 0; i < (int)temp_count; i++) {
                        ompi_op_reduce(op, curr_origin_addr,
                                       (void *)((char *)temp_addr + i * temp_extent),
                                       1, temp_dt);
                        curr_origin_addr = (void *)((char *)curr_origin_addr + temp_extent);
                        /* advance to the next iovec entry once the current
                         * one is exhausted */
                        if (curr_origin_addr >= (void *)((char *)origin_ucx_iov[origin_ucx_iov_idx].addr +
                                                         origin_ucx_iov[origin_ucx_iov_idx].len)) {
                            origin_ucx_iov_idx++;
                            curr_origin_addr = origin_ucx_iov[origin_ucx_iov_idx].addr;
                        }
                    }
                }
                free(origin_ucx_iov);
            }

            ret = ompi_osc_ucx_put(temp_addr_holder, (int)temp_count, temp_dt, target,
                                   target_disp, target_count, target_dt, win);
            if (ret != OMPI_SUCCESS) {
                return ret;
            }

            status = ucp_ep_flush(ep);
            if (status != UCS_OK) {
                opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                    "%s:%d: ucp_ep_flush failed: %d\n",
                                    __FILE__, __LINE__, status);
                return OMPI_ERROR;
            }

            free(temp_addr_holder);
        }
    }

    ret = end_atomicity(module, ep, target);

    return ret;
}
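/*
 * Editor's note: a minimal, self-contained caller for the routine above --
 * fetch-and-add on a single-int window.  Each rank atomically adds 1 to
 * rank 0's counter and receives the value it held beforehand.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Win win;
    int counter = 0, one = 1, prev = -1;

    MPI_Init(&argc, &argv);
    MPI_Win_create(&counter, sizeof(int), sizeof(int),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);
    /* prev <- counter@rank0; counter@rank0 += one, atomically */
    MPI_Get_accumulate(&one, 1, MPI_INT, &prev, 1, MPI_INT,
                       0 /* target */, 0 /* disp */, 1, MPI_INT,
                       MPI_SUM, win);
    MPI_Win_unlock(0, win);

    printf("previous counter value: %d\n", prev);
    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}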
static inline __opal_attribute_always_inline__
int mca_coll_ml_allgather_start (const void *sbuf, int scount,
                                 struct ompi_datatype_t *sdtype,
                                 void* rbuf, int rcount,
                                 struct ompi_datatype_t *rdtype,
                                 struct ompi_communicator_t *comm,
                                 mca_coll_base_module_t *module,
                                 ompi_request_t **req)
{
    size_t pack_len, sdt_size;
    int ret, n_fragments = 1, comm_size;

    mca_coll_ml_topology_t *topo_info;
    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;

    mca_coll_ml_component_t *cm = &mca_coll_ml_component;

    mca_coll_ml_collective_operation_progress_t *coll_op;
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;

    ptrdiff_t lb, extent;
    bool scontig, rcontig, in_place = false;

    /* check for in place setting */
    if (MPI_IN_PLACE == sbuf) {
        in_place = true;
        sdtype = rdtype;
        scount = rcount;
    }

    /* scontig could be != to rcontig */
    scontig = ompi_datatype_is_contiguous_memory_layout(sdtype, scount);
    rcontig = ompi_datatype_is_contiguous_memory_layout(rdtype, rcount);

    comm_size = ompi_comm_size(comm);

    ML_VERBOSE(10, ("Starting allgather"));

    assert(NULL != sdtype);

    /* Calculate size of the data;
     * at this stage, only contiguous data is supported */

    /* this is valid for allgather */
    ompi_datatype_type_size(sdtype, &sdt_size);
    pack_len = scount * sdt_size;

    if (in_place) {
        sbuf = (char *) rbuf + ompi_comm_rank(comm) * pack_len;
    }

    /* Allocate collective schedule and pack message */
    /* this is the total ending message size that will need to fit in the ml-buffer */
    if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]) {
        /* The len of the message can not be larger than ML buffer size */
        ML_VERBOSE(10, ("Single frag %d %d %d", pack_len, comm_size,
                        ml_module->payload_block->size_buffer));
        assert(pack_len * comm_size <= ml_module->payload_block->size_buffer);

        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        /* change 1 */
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                          ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER],
                          sbuf, rbuf, pack_len, 0 /* offset for first pack */);

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);

        coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER;
        /* task setup callback function */
        coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;

        /* change 2 */
        if (!scontig) {
            coll_op->full_message.n_bytes_scheduled =
                mca_coll_ml_convertor_prepare(sdtype, scount, sbuf,
                                              &coll_op->full_message.send_convertor,
                                              MCA_COLL_ML_NET_STREAM_SEND);

            mca_coll_ml_convertor_pack(
                (void *) ((uintptr_t) src_buffer_desc->data_addr + pack_len *
                          (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                           coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
                pack_len, &coll_op->full_message.send_convertor);
        } else {
            /* change 3 */
            memcpy((void *)((uintptr_t) src_buffer_desc->data_addr + pack_len *
                            (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                             coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
                   sbuf, pack_len);

            coll_op->full_message.n_bytes_scheduled = pack_len;
        }

        if (!rcontig) {
            mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf,
                                          &coll_op->full_message.recv_convertor,
                                          MCA_COLL_ML_NET_STREAM_RECV);
        }

        if (coll_op->coll_schedule->topo_info->ranks_contiguous) {
            coll_op->process_fn = mca_coll_ml_allgather_small_unpack_data;
        } else {
            coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data;
        }

        /* whole ml-buffer is used to send AND receive */
        coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        /* we can set the initial offset here */
        coll_op->variable_fn_params.sbuf_offset = 0;
        coll_op->variable_fn_params.rbuf_offset = 0;

        coll_op->variable_fn_params.count = scount;
        coll_op->fragment_data.fragment_size =
            coll_op->full_message.n_bytes_scheduled;

        /* For small CINCO, we may use the native datatype */
        coll_op->variable_fn_params.dtype = sdtype;
        coll_op->variable_fn_params.buffer_size = pack_len;
        coll_op->variable_fn_params.root = 0;
    } else if (cm->enable_fragmentation || pack_len * comm_size < (1 << 20)) {
        /* calculate the number of fragments and the size of each frag */
        size_t n_dts_per_frag, frag_len;
        int pipeline_depth = mca_coll_ml_component.pipeline_depth;

        /* Calculate the number of fragments required for this message
         * (careful: watch the integer division!) */
        frag_len = (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER] ?
                    pack_len : (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]);

        n_dts_per_frag = frag_len / sdt_size;
        n_fragments = (pack_len + sdt_size * n_dts_per_frag - 1) / (sdt_size * n_dts_per_frag);
        pipeline_depth = (n_fragments < pipeline_depth ? n_fragments : pipeline_depth);

        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        /* change 4 */
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                          ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER],
                          sbuf, rbuf, pack_len, 0 /* offset for first pack */);

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);

        topo_info = coll_op->coll_schedule->topo_info;

        /* task setup callback function */
        coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;

        if (!scontig) {
            coll_op->full_message.send_converter_bytes_packed =
                mca_coll_ml_convertor_prepare(sdtype, scount, NULL,
                                              &coll_op->full_message.dummy_convertor,
                                              MCA_COLL_ML_NET_STREAM_SEND);

            coll_op->full_message.dummy_conv_position = 0;
            mca_coll_ml_convertor_get_send_frag_size(ml_module, &frag_len,
                                                     &coll_op->full_message);

            /* change 5 */
            mca_coll_ml_convertor_prepare(sdtype, scount, sbuf,
                                          &coll_op->full_message.send_convertor,
                                          MCA_COLL_ML_NET_STREAM_SEND);

            mca_coll_ml_convertor_pack(
                (void *) ((uintptr_t) src_buffer_desc->data_addr + frag_len *
                          (topo_info->hier_layout_info[0].offset +
                           topo_info->hier_layout_info[0].level_one_index)),
                frag_len, &coll_op->full_message.send_convertor);
        } else {
            /* change 6 */
            memcpy((void *)((uintptr_t)src_buffer_desc->data_addr + frag_len *
                            (topo_info->hier_layout_info[0].offset +
                             topo_info->hier_layout_info[0].level_one_index)),
                   sbuf, frag_len);
        }

        if (!rcontig) {
            mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf,
                                          &coll_op->full_message.recv_convertor,
                                          MCA_COLL_ML_NET_STREAM_RECV);
        }

        coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data;

        /* hopefully this doesn't royally screw things up -- the idea behind
         * this is that the whole ml-buffer is used to send and receive */
        coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        /* we can set the initial offset here */
        coll_op->variable_fn_params.sbuf_offset = 0;
        coll_op->variable_fn_params.rbuf_offset = 0;

        coll_op->fragment_data.buffer_desc = src_buffer_desc;
        coll_op->fragment_data.fragment_size = frag_len;
        coll_op->fragment_data.message_descriptor->n_active = 1;

        coll_op->full_message.n_bytes_scheduled = frag_len;
        coll_op->full_message.fragment_launcher = mca_coll_ml_allgather_frag_progress;
        coll_op->full_message.pipeline_depth = pipeline_depth;

        coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER;

        /* remember this is different for frags!  Caused data corruption when
         * not properly set.  Need to be sure you have consistent units.
         */
        coll_op->variable_fn_params.count = frag_len;
        coll_op->variable_fn_params.dtype = MPI_BYTE; /* for fragmented data, we work in
                                                       * units of bytes. This means that
                                                       * all of our arithmetic is done
                                                       * in terms of bytes */

        coll_op->variable_fn_params.root = 0;
        coll_op->variable_fn_params.frag_size = frag_len;
        coll_op->variable_fn_params.buffer_size = frag_len;
    } else {
        /* change 7 */
        ML_VERBOSE(10, ("ML_ALLGATHER_LARGE_DATA_KNOWN case."));
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                          ml_module->coll_ml_allgather_functions[ML_LARGE_DATA_ALLGATHER],
                          sbuf, rbuf, pack_len, 0 /* offset for first pack */);
        topo_info = coll_op->coll_schedule->topo_info;
        if (MCA_BCOL_BASE_NO_ML_BUFFER_FOR_LARGE_MSG & topo_info->all_bcols_mode) {
            MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, MCA_COLL_ML_NO_BUFFER, NULL);
        } else {
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
            while (NULL == src_buffer_desc) {
                opal_progress();
                src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
            }

            MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
                                                  src_buffer_desc->buffer_index, src_buffer_desc);
        }

        /* not sure if I really need this here */
        coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;
        coll_op->process_fn = NULL;
        /* probably the most important piece */
        coll_op->variable_fn_params.sbuf = sbuf;
        coll_op->variable_fn_params.rbuf = rbuf;
        coll_op->variable_fn_params.sbuf_offset = 0;
        coll_op->variable_fn_params.rbuf_offset = 0;
        coll_op->variable_fn_params.count = scount;
        coll_op->variable_fn_params.dtype = sdtype; /* for zero copy, we want the
                                                     * native datatype and actual count */
        coll_op->variable_fn_params.root = 0;

        /* you still need to copy your own data into the rbuf */
        /* don't need to do this if you have in place data */
        if (!in_place) {
            memcpy((char *) rbuf + ompi_comm_rank(comm) * pack_len, sbuf, pack_len);
        }
    }

    coll_op->full_message.send_count = scount;
    coll_op->full_message.recv_count = rcount;

    coll_op->full_message.send_data_continguous = scontig;
    coll_op->full_message.recv_data_continguous = rcontig;

    ompi_datatype_get_extent(sdtype, &lb, &extent);
    coll_op->full_message.send_extent = (size_t) extent;

    ompi_datatype_get_extent(rdtype, &lb, &extent);
    coll_op->full_message.recv_extent = (size_t) extent;

    /* Fill in the function arguments */
    coll_op->variable_fn_params.sequence_num =
        OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1);
    coll_op->variable_fn_params.hier_factor = comm_size;

    MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments);

    ret = mca_coll_ml_launch_sequential_collective (coll_op);
    if (OMPI_SUCCESS != ret) {
        ML_VERBOSE(10, ("Failed to launch"));
        return ret;
    }

    *req = &coll_op->full_message.super;

    return OMPI_SUCCESS;
}
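/*
 * Editor's note: the fragmentation branch above sizes each fragment in
 * whole datatypes and rounds the fragment count up.  A standalone sketch
 * of that arithmetic (function and variable names are illustrative only):
 */
#include <stdio.h>

/* ceil(pack_len / (dt_size * n_dts_per_frag)) without floating point */
static size_t ml_n_fragments(size_t pack_len, size_t dt_size, size_t frag_cap)
{
    size_t n_dts_per_frag = frag_cap / dt_size;      /* datatypes per fragment */
    size_t frag_bytes = dt_size * n_dts_per_frag;    /* usable bytes per fragment */
    return (pack_len + frag_bytes - 1) / frag_bytes; /* round up */
}

int main(void)
{
    /* e.g. 1000 doubles against a 256-byte fragment cap:
     * 32 doubles/frag -> ceil(1000/32) = 32 fragments */
    printf("%zu\n", ml_n_fragments(1000 * 8, 8, 256));
    return 0;
}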
int
ompi_osc_portals4_get_accumulate(const void *origin_addr,
                                 int origin_count,
                                 struct ompi_datatype_t *origin_dt,
                                 void *result_addr,
                                 int result_count,
                                 struct ompi_datatype_t *result_dt,
                                 int target,
                                 MPI_Aint target_disp,
                                 int target_count,
                                 struct ompi_datatype_t *target_dt,
                                 struct ompi_op_t *op,
                                 struct ompi_win_t *win)
{
    int ret;
    ompi_osc_portals4_module_t *module =
        (ompi_osc_portals4_module_t*) win->w_osc_module;
    ptl_process_t peer = ompi_osc_portals4_get_peer(module, target);
    size_t length, sent;
    size_t offset;
    ptl_op_t ptl_op;
    ptl_datatype_t ptl_dt;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "get_accumulate: 0x%lx, %d, %s, 0x%lx, %d, %s, %d, %d, %d, %s, %s, 0x%lx",
                         (unsigned long) origin_addr, origin_count,
                         origin_dt->name, (unsigned long) result_addr,
                         result_count, result_dt->name,
                         target, (int) target_disp,
                         target_count, target_dt->name,
                         op->o_name,
                         (unsigned long) win));

    offset = get_displacement(module, target) * target_disp;

    /* we don't support non-contiguous buffers.  but if the count is 0,
       we don't care if the buffer is non-contiguous. */
    if ((origin_count > 0 && !ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count)) ||
        (result_count > 0 && !ompi_datatype_is_contiguous_memory_layout(result_dt, result_count)) ||
        (target_count > 0 && !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count))) {
        opal_output(ompi_osc_base_framework.framework_output,
                    "MPI_Get_accumulate: transfer of non-contiguous memory is not currently supported.\n");
        return OMPI_ERR_NOT_SUPPORTED;
    } else {
        sent = 0;

        if (MPI_REPLACE == op) {
            ptl_size_t result_md_offset, origin_md_offset;

            ret = ompi_datatype_type_size(origin_dt, &length);
            if (OMPI_SUCCESS != ret) {
                return ret;
            }
            ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt);
            if (OMPI_SUCCESS != ret) {
                return ret;
            }
            length *= origin_count;

            result_md_offset = (ptl_size_t) result_addr;
            origin_md_offset = (ptl_size_t) origin_addr;

            do {
                size_t msg_length = MIN(module->fetch_atomic_max, length - sent);

                (void)opal_atomic_add_64(&module->opcount, 1);

                ret = PtlSwap(module->md_h,
                              result_md_offset + sent,
                              module->md_h,
                              origin_md_offset + sent,
                              msg_length,
                              peer,
                              module->pt_idx,
                              module->match_bits,
                              offset + sent,
                              NULL,
                              0,
                              NULL,
                              PTL_SWAP,
                              ptl_dt);
                sent += msg_length;
            } while (sent < length);
        } else if (MPI_NO_OP == op) {
            ptl_size_t md_offset;

            ret = ompi_datatype_type_size(target_dt, &length);
            if (OMPI_SUCCESS != ret) {
                return ret;
            }
            length *= target_count;

            md_offset = (ptl_size_t) result_addr;

            do {
                size_t msg_length = MIN(module->fetch_atomic_max, length - sent);

                (void)opal_atomic_add_64(&module->opcount, 1);

                ret = PtlGet(module->md_h,
                             md_offset + sent,
                             msg_length,
                             peer,
                             module->pt_idx,
                             module->match_bits,
                             offset + sent,
                             NULL);
                sent += msg_length;
            } while (sent < length);
        } else {
            ptl_size_t result_md_offset, origin_md_offset;

            ret = ompi_datatype_type_size(origin_dt, &length);
            if (OMPI_SUCCESS != ret) {
                return ret;
            }
            length *= origin_count;

            result_md_offset = (ptl_size_t) result_addr;
            origin_md_offset = (ptl_size_t) origin_addr;

            ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt);
            if (OMPI_SUCCESS != ret) return ret;
            ret = ompi_osc_portals4_get_op(op, &ptl_op);
            if (OMPI_SUCCESS != ret) return ret;

            do {
                size_t msg_length = MIN(module->fetch_atomic_max, length - sent);

                (void)opal_atomic_add_64(&module->opcount, 1);

                ret = PtlFetchAtomic(module->md_h,
                                     result_md_offset + sent,
                                     module->md_h,
                                     origin_md_offset + sent,
                                     msg_length,
                                     peer,
                                     module->pt_idx,
                                     module->match_bits,
                                     offset + sent,
                                     NULL,
                                     0,
                                     ptl_op,
                                     ptl_dt);
                sent += msg_length;
            } while (sent < length);
        }

        if (OMPI_SUCCESS != ret) {
            return ret;
        }
    }

    return OMPI_SUCCESS;
}
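/*
 * Editor's note: the three branches above all split a transfer into
 * chunks no larger than the NIC's fetch_atomic_max limit.  A
 * self-contained sketch of that chunking loop follows; emit_chunk() is
 * a hypothetical stand-in for PtlSwap/PtlGet/PtlFetchAtomic, and the
 * MIN macro mirrors the one the component uses.
 */
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

static void emit_chunk(size_t offset, size_t len)
{
    printf("chunk at %zu, %zu bytes\n", offset, len);
}

/* Issue a length-byte operation in atomic_max-sized pieces; each piece
 * advances the local MD offset and the remote offset by the same
 * amount, exactly as the "+ sent" terms do above. */
static void chunked_op(size_t length, size_t atomic_max)
{
    size_t sent = 0;
    do {
        size_t msg_length = MIN(atomic_max, length - sent);
        emit_chunk(sent, msg_length);
        sent += msg_length;
    } while (sent < length);
}

int main(void)
{
    chunked_op(10000, 4096); /* -> 4096 + 4096 + 1808 */
    return 0;
}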
int
ompi_osc_portals4_accumulate(const void *origin_addr,
                             int origin_count,
                             struct ompi_datatype_t *origin_dt,
                             int target,
                             OPAL_PTRDIFF_TYPE target_disp,
                             int target_count,
                             struct ompi_datatype_t *target_dt,
                             struct ompi_op_t *op,
                             struct ompi_win_t *win)
{
    int ret;
    ompi_osc_portals4_module_t *module =
        (ompi_osc_portals4_module_t*) win->w_osc_module;
    ptl_process_t peer = ompi_osc_portals4_get_peer(module, target);
    size_t length, sent;
    size_t offset;
    ptl_op_t ptl_op;
    ptl_datatype_t ptl_dt;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "accumulate: 0x%lx, %d, %s, %d, %d, %d, %s, %s, 0x%lx",
                         (unsigned long) origin_addr, origin_count,
                         origin_dt->name, target, (int) target_disp,
                         target_count, target_dt->name,
                         op->o_name,
                         (unsigned long) win));

    offset = get_displacement(module, target) * target_disp;

    if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) ||
        !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) {
        opal_output(ompi_osc_base_framework.framework_output,
                    "MPI_Accumulate: transfer of non-contiguous memory is not currently supported.\n");
        return OMPI_ERR_NOT_SUPPORTED;
    } else {
        ptl_size_t md_offset;

        ret = ompi_datatype_type_size(origin_dt, &length);
        if (OMPI_SUCCESS != ret) {
            return ret;
        }
        length *= origin_count;
        sent = 0;

        md_offset = (ptl_size_t) origin_addr;

        do {
            size_t msg_length = MIN(module->atomic_max, length - sent);

            (void)opal_atomic_add_64(&module->opcount, 1);

            if (MPI_REPLACE == op) {
                ret = PtlPut(module->md_h,
                             md_offset + sent,
                             msg_length,
                             PTL_ACK_REQ,
                             peer,
                             module->pt_idx,
                             module->match_bits,
                             offset + sent,
                             NULL,
                             0);
            } else {
                ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt);
                if (OMPI_SUCCESS != ret) return ret;
                ret = ompi_osc_portals4_get_op(op, &ptl_op);
                if (OMPI_SUCCESS != ret) return ret;

                ret = PtlAtomic(module->md_h,
                                md_offset + sent,
                                msg_length,
                                PTL_ACK_REQ,
                                peer,
                                module->pt_idx,
                                module->match_bits,
                                offset + sent,
                                NULL,
                                0,
                                ptl_op,
                                ptl_dt);
            }
            if (OMPI_SUCCESS != ret) {
                return ret;
            }
            sent += msg_length;
        } while (sent < length);
    }

    return OMPI_SUCCESS;
}
int
ompi_osc_portals4_rget_accumulate(const void *origin_addr,
                                  int origin_count,
                                  struct ompi_datatype_t *origin_dt,
                                  void *result_addr,
                                  int result_count,
                                  struct ompi_datatype_t *result_dt,
                                  int target,
                                  MPI_Aint target_disp,
                                  int target_count,
                                  struct ompi_datatype_t *target_dt,
                                  struct ompi_op_t *op,
                                  struct ompi_win_t *win,
                                  struct ompi_request_t **ompi_req)
{
    int ret;
    ompi_osc_portals4_request_t *request;
    ompi_osc_portals4_module_t *module =
        (ompi_osc_portals4_module_t*) win->w_osc_module;
    ptl_process_t peer = ompi_osc_portals4_get_peer(module, target);
    size_t length, sent;
    size_t offset;
    ptl_op_t ptl_op;
    ptl_datatype_t ptl_dt;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "rget_accumulate: 0x%lx, %d, %s, 0x%lx, %d, %s, %d, %d, %d, %s, %s, 0x%lx",
                         (unsigned long) origin_addr, origin_count,
                         origin_dt->name, (unsigned long) result_addr,
                         result_count, result_dt->name,
                         target, (int) target_disp,
                         target_count, target_dt->name,
                         op->o_name,
                         (unsigned long) win));

    OMPI_OSC_PORTALS4_REQUEST_ALLOC(win, request);
    if (NULL == request) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    *ompi_req = &request->super;

    offset = get_displacement(module, target) * target_disp;

    if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) ||
        !ompi_datatype_is_contiguous_memory_layout(result_dt, result_count) ||
        !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) {
        OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
        opal_output(ompi_osc_base_framework.framework_output,
                    "MPI_Rget_accumulate: transfer of non-contiguous memory is not currently supported.\n");
        return OMPI_ERR_NOT_SUPPORTED;
    } else {
        sent = 0;

        if (MPI_REPLACE == op) {
            ptl_size_t result_md_offset, origin_md_offset;

            ret = ompi_datatype_type_size(origin_dt, &length);
            if (OMPI_SUCCESS != ret) {
                OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
                return ret;
            }
            ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt);
            if (OMPI_SUCCESS != ret) {
                OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
                return ret;
            }
            length *= origin_count;

            result_md_offset = (ptl_size_t) result_addr;
            origin_md_offset = (ptl_size_t) origin_addr;

            do {
                size_t msg_length = MIN(module->fetch_atomic_max, length - sent);

                (void)opal_atomic_add_64(&module->opcount, 1);
                request->ops_expected++;

                ret = PtlSwap(module->req_md_h,
                              result_md_offset + sent,
                              module->md_h,
                              origin_md_offset + sent,
                              msg_length,
                              peer,
                              module->pt_idx,
                              module->match_bits,
                              offset + sent,
                              request,
                              0,
                              NULL,
                              PTL_SWAP,
                              ptl_dt);
                sent += msg_length;
            } while (sent < length);
        } else if (MPI_NO_OP == op) {
            ptl_size_t md_offset;

            ret = ompi_datatype_type_size(target_dt, &length);
            if (OMPI_SUCCESS != ret) {
                OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
                return ret;
            }
            length *= target_count;

            md_offset = (ptl_size_t) result_addr;

            do {
                size_t msg_length = MIN(module->fetch_atomic_max, length - sent);

                (void)opal_atomic_add_64(&module->opcount, 1);
                request->ops_expected++;

                ret = PtlGet(module->req_md_h,
                             md_offset + sent,
                             msg_length,
                             peer,
                             module->pt_idx,
                             module->match_bits,
                             offset + sent,
                             request);
                sent += msg_length;
            } while (sent < length);
        } else {
            ptl_size_t result_md_offset, origin_md_offset;

            ret = ompi_datatype_type_size(origin_dt, &length);
            if (OMPI_SUCCESS != ret) {
                OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
                return ret;
            }
            length *= origin_count;

            result_md_offset = (ptl_size_t) result_addr;
            origin_md_offset = (ptl_size_t) origin_addr;

            ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt);
            if (OMPI_SUCCESS != ret) {
                OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
                return ret;
            }
            ret = ompi_osc_portals4_get_op(op, &ptl_op);
            if (OMPI_SUCCESS != ret) {
                OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
                return ret;
            }

            do {
                size_t msg_length = MIN(module->fetch_atomic_max, length - sent);

                (void)opal_atomic_add_64(&module->opcount, 1);
                request->ops_expected++;

                ret = PtlFetchAtomic(module->req_md_h,
                                     result_md_offset + sent,
                                     module->md_h,
                                     origin_md_offset + sent,
                                     msg_length,
                                     peer,
                                     module->pt_idx,
                                     module->match_bits,
                                     offset + sent,
                                     request,
                                     0,
                                     ptl_op,
                                     ptl_dt);
                sent += msg_length;
            } while (sent < length);
        }

        if (OMPI_SUCCESS != ret) {
            OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
            return ret;
        }
    }

    return OMPI_SUCCESS;
}
int
ompi_osc_portals4_rget(void *origin_addr,
                       int origin_count,
                       struct ompi_datatype_t *origin_dt,
                       int target,
                       OPAL_PTRDIFF_TYPE target_disp,
                       int target_count,
                       struct ompi_datatype_t *target_dt,
                       struct ompi_win_t *win,
                       struct ompi_request_t **ompi_req)
{
    int ret;
    ompi_osc_portals4_request_t *request;
    ompi_osc_portals4_module_t *module =
        (ompi_osc_portals4_module_t*) win->w_osc_module;
    ptl_process_t peer = ompi_osc_portals4_get_peer(module, target);
    size_t length;
    size_t offset;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "rget: 0x%lx, %d, %s, %d, %d, %d, %s, 0x%lx",
                         (unsigned long) origin_addr, origin_count,
                         origin_dt->name, target, (int) target_disp,
                         target_count, target_dt->name,
                         (unsigned long) win));

    OMPI_OSC_PORTALS4_REQUEST_ALLOC(win, request);
    if (NULL == request) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    *ompi_req = &request->super;

    offset = get_displacement(module, target) * target_disp;

    if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) ||
        !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) {
        OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
        opal_output(ompi_osc_base_framework.framework_output,
                    "MPI_Rget: transfer of non-contiguous memory is not currently supported.\n");
        return OMPI_ERR_NOT_SUPPORTED;
    } else {
        (void)opal_atomic_add_64(&module->opcount, 1);
        request->ops_expected = 1;

        ret = ompi_datatype_type_size(origin_dt, &length);
        if (OMPI_SUCCESS != ret) {
            OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
            return ret;
        }
        length *= origin_count;

        ret = PtlGet(module->req_md_h,
                     (ptl_size_t) origin_addr,
                     length,
                     peer,
                     module->pt_idx,
                     module->match_bits,
                     offset,
                     request);
        if (OMPI_SUCCESS != ret) {
            OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
            return ret;
        }
    }

    return OMPI_SUCCESS;
}
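/*
 * Editor's note: a minimal, self-contained caller for the request-based
 * path above.  MPI_Rget returns a request that is completed with
 * MPI_Wait, independently of the surrounding access epoch.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Win win;
    MPI_Request req;
    int local = 0, exposed = 42;

    MPI_Init(&argc, &argv);
    MPI_Win_create(&exposed, sizeof(int), sizeof(int),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);
    MPI_Rget(&local, 1, MPI_INT, 0 /* target */, 0 /* disp */,
             1, MPI_INT, win, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);   /* local is now usable */
    MPI_Win_unlock(0, win);

    printf("fetched %d\n", local);
    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}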
static inline __opal_attribute_always_inline__
int parallel_reduce_start (void *sbuf, void *rbuf, int count,
                           struct ompi_datatype_t *dtype, struct ompi_op_t *op,
                           int root,
                           struct ompi_communicator_t *comm,
                           mca_coll_ml_module_t *ml_module,
                           ompi_request_t **req,
                           int small_data_reduce,
                           int large_data_reduce)
{
    ptrdiff_t lb, extent;
    size_t pack_len, dt_size;
    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL;
    mca_coll_ml_collective_operation_progress_t *coll_op = NULL;
    bool contiguous = ompi_datatype_is_contiguous_memory_layout(dtype, count);
    mca_coll_ml_component_t *cm = &mca_coll_ml_component;
    int ret, n_fragments = 1, frag_len,
        pipeline_depth, n_dts_per_frag, rank;

    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
    }

    ret = ompi_datatype_get_extent(dtype, &lb, &extent);
    if (ret < 0) {
        return OMPI_ERROR;
    }

    rank = ompi_comm_rank (comm);

    dt_size = (size_t) extent;
    pack_len = count * dt_size;

    /* We use separate receive and send buffers, so only half the buffer
     * is usable. */
    if (pack_len < (size_t) ml_module->small_message_thresholds[BCOL_REDUCE] / 4) {
        /* The len of the message can not be larger than ML buffer size */
        assert(pack_len <= ml_module->payload_block->size_buffer);

        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);

        ML_VERBOSE(10,("Using small data reduce (threshold = %d)",
                       REDUCE_SMALL_MESSAGE_THRESHOLD));
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                          ml_module->coll_ml_reduce_functions[small_data_reduce],
                          sbuf, rbuf, pack_len, 0);

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);

        coll_op->variable_fn_params.rbuf = src_buffer_desc->data_addr;
        coll_op->variable_fn_params.sbuf = src_buffer_desc->data_addr;
        coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index;
        coll_op->variable_fn_params.src_desc = src_buffer_desc;
        coll_op->variable_fn_params.count = count;

        ret = ompi_datatype_copy_content_same_ddt(dtype, count,
                                                  (void *) (uintptr_t) src_buffer_desc->data_addr,
                                                  (char *) sbuf);
        if (ret < 0) {
            return OMPI_ERROR;
        }
    } else if (cm->enable_fragmentation || !contiguous) {
        ML_VERBOSE(1,("Using Fragmented Reduce "));

        /* fragment the data */
        /* check for pathological datatype sizes */
        if (dt_size > (size_t) ml_module->small_message_thresholds[BCOL_REDUCE] / 4) {
            ML_ERROR(("Sorry, but we don't support datatypes that large"));
            return OMPI_ERROR;
        }

        /* calculate the number of data types that can fit per ml-buffer */
        n_dts_per_frag = ml_module->small_message_thresholds[BCOL_REDUCE] / (4 * dt_size);

        /* calculate the number of fragments */
        n_fragments = (count + n_dts_per_frag - 1) / n_dts_per_frag;  /* round up */

        /* calculate the actual pipeline depth */
        pipeline_depth = n_fragments < cm->pipeline_depth ?
            n_fragments : cm->pipeline_depth;

        /* calculate the fragment size */
        frag_len = n_dts_per_frag * dt_size;

        /* allocate an ml buffer */
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                          ml_module->coll_ml_reduce_functions[small_data_reduce],
                          sbuf, rbuf, pack_len, 0 /* offset for first pack */);

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);

        coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        coll_op->fragment_data.message_descriptor->n_active = 1;
        coll_op->full_message.n_bytes_scheduled = frag_len;
        coll_op->full_message.fragment_launcher = mca_coll_ml_reduce_frag_progress;
        coll_op->full_message.pipeline_depth = pipeline_depth;
        coll_op->fragment_data.current_coll_op = small_data_reduce;
        coll_op->fragment_data.fragment_size = frag_len;

        coll_op->variable_fn_params.count = n_dts_per_frag;  /* seems fishy */
        coll_op->variable_fn_params.buffer_size = frag_len;
        coll_op->variable_fn_params.src_desc = src_buffer_desc;
        /* copy into the ml-buffer */
        ret = ompi_datatype_copy_content_same_ddt(dtype, n_dts_per_frag,
                                                  (char *) src_buffer_desc->data_addr,
                                                  (char *) sbuf);
        if (ret < 0) {
            return OMPI_ERROR;
        }
    } else {
        ML_VERBOSE(1,("Using zero-copy ptp reduce"));
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                          ml_module->coll_ml_reduce_functions[large_data_reduce],
                          sbuf, rbuf, pack_len, 0);

        coll_op->variable_fn_params.userbuf =
            coll_op->variable_fn_params.sbuf = sbuf;

        coll_op->variable_fn_params.rbuf = rbuf;

        /* The ML buffer is used for testing.  Later, when we switch to
         * using knem/mmap/portals, this should be replaced appropriately.
         */
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index;
        coll_op->variable_fn_params.src_desc = src_buffer_desc;
        coll_op->variable_fn_params.count = count;
    }

    coll_op->process_fn = (rank != root) ? NULL : mca_coll_ml_reduce_unpack;

    /* Set common parts */
    coll_op->fragment_data.buffer_desc = src_buffer_desc;
    coll_op->variable_fn_params.dtype = dtype;
    coll_op->variable_fn_params.op = op;

    /* NTH: the root, root route, and root flag are set in the task setup */

    /* Fill in the function arguments */
    coll_op->variable_fn_params.sbuf_offset = 0;
    coll_op->variable_fn_params.rbuf_offset =
        (ml_module->payload_block->size_buffer - ml_module->data_offset) / 2;

    /* Keep track of the global root of this operation */
    coll_op->global_root = root;

    coll_op->variable_fn_params.sequence_num =
        OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1);
    coll_op->sequential_routine.current_active_bcol_fn = 0;
    /* set the task setup callback */
    coll_op->sequential_routine.seq_task_setup = mca_coll_ml_reduce_task_setup;

    /* Reduce requires the schedule to be fixed.  If we use another
     * (changing) schedule, the operation might produce different results. */
    coll_op->coll_schedule->component_functions =
        coll_op->coll_schedule->comp_fn_arr[coll_op->coll_schedule->topo_info->route_vector[root].level];

    /* Launch the collective */
    ret = mca_coll_ml_launch_sequential_collective (coll_op);
    if (OMPI_SUCCESS != ret) {
        ML_VERBOSE(10, ("Failed to launch reduce collective"));
        return ret;
    }

    *req = &coll_op->full_message.super;

    return OMPI_SUCCESS;
}
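/*
 * Editor's note: parallel_reduce_start() places send data in the lower
 * half of the ML payload buffer and receive data in the upper half (the
 * source comment notes only half the buffer is usable, and the code
 * further divides the small-data threshold by four).  A standalone
 * sketch of the rbuf_offset arithmetic, with illustrative values only:
 */
#include <stdio.h>

int main(void)
{
    size_t size_buffer = 8192;  /* total ML buffer size */
    size_t data_offset = 64;    /* bytes reserved ahead of the payload */

    /* same formula as coll_op->variable_fn_params.rbuf_offset above */
    size_t rbuf_offset = (size_buffer - data_offset) / 2;

    printf("sbuf_offset = 0, rbuf_offset = %zu\n", rbuf_offset);
    return 0;
}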