Example #1
0
/*
 * Issue a one-sided put of a contiguous origin buffer to a target window.
 *
 * origin_addr / origin_count / origin_dt describe the local source data;
 * target / target_disp / target_count / target_dt describe the destination
 * within the window `win`.  The target displacement is scaled by the value
 * returned from get_displacement() for that target.
 *
 * Returns OMPI_SUCCESS once PtlPut() has accepted the operation,
 * OMPI_ERR_NOT_SUPPORTED if either datatype layout is non-contiguous, or the
 * failing call's return code otherwise.
 */
int
ompi_osc_portals4_put(void *origin_addr,
                      int origin_count,
                      struct ompi_datatype_t *origin_dt,
                      int target,
                      OPAL_PTRDIFF_TYPE target_disp,
                      int target_count,
                      struct ompi_datatype_t *target_dt,
                      struct ompi_win_t *win)
{
    int ret;
    ompi_osc_portals4_module_t *module =
        (ompi_osc_portals4_module_t*) win->w_osc_module;
    ptl_process_t peer = ompi_osc_portals4_get_peer(module, target);
    size_t length;
    size_t offset;
    ptl_handle_md_t md_h;
    void *md_base;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "put: 0x%lx, %d, %s, %d, %d, %d, %s, 0x%lx",
                         (unsigned long) origin_addr, origin_count,
                         origin_dt->name, target, (int) target_disp,
                         target_count, target_dt->name,
                         (unsigned long) win));

    offset = get_displacement(module, target) * target_disp;

    if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) ||
        !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) {
        opal_output(ompi_osc_base_framework.framework_output,
                    "MPI_Put: transfer of non-contiguous memory is not currently supported.\n");
        return OMPI_ERR_NOT_SUPPORTED;
    }

    /* Work out the transfer size BEFORE touching opcount: if this fails we
     * return without having issued anything, so the pending-operation
     * counter must not have been bumped. */
    ret = ompi_datatype_type_size(origin_dt, &length);
    if (OMPI_SUCCESS != ret) {
        return ret;
    }
    length *= origin_count;

    /* One more operation is about to be handed to Portals. */
    (void)opal_atomic_add_64(&module->opcount, 1);

    /* PtlPut takes an offset relative to the memory descriptor, so resolve
     * the descriptor covering origin_addr and its base address. */
    ompi_osc_portals4_get_md(origin_addr, module->md_h, &md_h, &md_base);
    ret = PtlPut(md_h,
                 (ptl_size_t) ((char*) origin_addr - (char*) md_base),
                 length,
                 PTL_ACK_REQ,
                 peer,
                 module->pt_idx,
                 module->match_bits,
                 offset,
                 NULL,
                 0);
    /* NOTE(review): opcount stays incremented if PtlPut() itself fails;
     * confirm whether the completion accounting expects a rollback here. */
    if (OMPI_SUCCESS != ret) {
        return ret;
    }

    return OMPI_SUCCESS;
}
Example #2
0
/*
 * One-sided get: fetch data described by (target_disp, target_count,
 * target_dt) on rank `target` into the local buffer described by
 * (origin_addr, origin_count, origin_dt).
 *
 * When both layouts are contiguous a single ucp_get_nbi() is posted;
 * otherwise the transfer is delegated to ddt_put_get().  Returns the result
 * of the sync-state check, OMPI_ERROR on a UCX failure, or whatever the
 * final bookkeeping / ddt helper returns.
 */
int ompi_osc_ucx_get(void *origin_addr, int origin_count,
                     struct ompi_datatype_t *origin_dt,
                     int target, ptrdiff_t target_disp, int target_count,
                     struct ompi_datatype_t *target_dt, struct ompi_win_t *win) {
    ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
    ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
    uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target);
    ucp_rkey_h remote_key;
    ptrdiff_t src_lb, src_extent, dst_lb, dst_extent;
    bool src_contig = false, dst_contig = false;
    ucs_status_t ucs_rc;
    size_t nbytes;
    int rc = OMPI_SUCCESS;

    /* Refuse the operation unless the window is in a usable epoch. */
    rc = check_sync_state(module, target, false);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    /* For dynamic windows, look up the remote registration info first. */
    if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
        ucs_rc = get_dynamic_win_info(remote_addr, module, ep, target);
        if (UCS_OK != ucs_rc) {
            return OMPI_ERROR;
        }
    }

    remote_key = (module->win_info_array[target]).rkey;

    ompi_datatype_get_true_extent(origin_dt, &src_lb, &src_extent);
    ompi_datatype_get_true_extent(target_dt, &dst_lb, &dst_extent);

    src_contig = ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count);
    dst_contig = ompi_datatype_is_contiguous_memory_layout(target_dt, target_count);

    /* Slow path: at least one side needs datatype-driven scatter/gather. */
    if (!(src_contig && dst_contig)) {
        return ddt_put_get(module, origin_addr, origin_count, origin_dt, src_contig,
                           src_lb, target, ep, remote_addr, remote_key, target_count, target_dt,
                           dst_contig, dst_lb, true);
    }

    /* Fast path: one contiguous transfer of origin_count elements. */
    ompi_datatype_type_size(origin_dt, &nbytes);
    nbytes *= origin_count;

    ucs_rc = ucp_get_nbi(ep, (void *)((intptr_t)origin_addr + src_lb), nbytes,
                         remote_addr + dst_lb, remote_key);
    if (!(UCS_OK == ucs_rc || UCS_INPROGRESS == ucs_rc)) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "%s:%d: ucp_get_nbi failed: %d\n",
                            __FILE__, __LINE__, ucs_rc);
        return OMPI_ERROR;
    }

    return incr_and_check_ops_num(module, target, ep);
}
Example #3
0
/*
 * Report through *flag whether `datatype` may be treated as contiguous.
 *
 * The Open MPI contiguity query is made with a count of 2 so that gaps at
 * the beginning or end of the datatype are taken into account.  Even when
 * that query succeeds, a datatype whose true lower bound is positive is
 * reported as non-contiguous, because ROMIO would otherwise ignore that
 * displacement.
 */
void ADIOI_Datatype_iscontig(MPI_Datatype datatype, int *flag)
{
    int contig = ompi_datatype_is_contiguous_memory_layout(datatype, 2);

    if (contig) {
        MPI_Aint lower_bound, span;

        ompi_datatype_get_true_extent(datatype, &lower_bound, &span);

        /* A positive leading displacement forces the non-contiguous answer. */
        if (lower_bound > 0) {
            contig = 0;
        }
    }

    *flag = contig;
}
Example #4
0
int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count,
                                struct ompi_datatype_t *origin_dt,
                                void *result_addr, int result_count,
                                struct ompi_datatype_t *result_dt,
                                int target, ptrdiff_t target_disp,
                                int target_count, struct ompi_datatype_t *target_dt,
                                struct ompi_op_t *op, struct ompi_win_t *win) {
    ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
    ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
    int ret = OMPI_SUCCESS;

    ret = check_sync_state(module, target, false);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    ret = start_atomicity(module, ep, target);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    ret = ompi_osc_ucx_get(result_addr, result_count, result_dt, target,
                           target_disp, target_count, target_dt, win);
    if (ret != OMPI_SUCCESS) {
        return ret;
    }

    if (op != &ompi_mpi_op_no_op.op) {
        if (op == &ompi_mpi_op_replace.op) {
            ret = ompi_osc_ucx_put(origin_addr, origin_count, origin_dt,
                                   target, target_disp, target_count,
                                   target_dt, win);
            if (ret != OMPI_SUCCESS) {
                return ret;
            }
        } else {
            void *temp_addr = NULL;
            uint32_t temp_count;
            ompi_datatype_t *temp_dt;
            ptrdiff_t temp_lb, temp_extent;
            ucs_status_t status;
            bool is_origin_contig = ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count);

            if (ompi_datatype_is_predefined(target_dt)) {
                temp_dt = target_dt;
                temp_count = target_count;
            } else {
                ret = ompi_osc_base_get_primitive_type_info(target_dt, &temp_dt, &temp_count);
                if (ret != OMPI_SUCCESS) {
                    return ret;
                }
            }
            ompi_datatype_get_true_extent(temp_dt, &temp_lb, &temp_extent);
            temp_addr = malloc(temp_extent * temp_count);
            if (temp_addr == NULL) {
                return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
            }

            ret = ompi_osc_ucx_get(temp_addr, (int)temp_count, temp_dt,
                                   target, target_disp, target_count, target_dt, win);
            if (ret != OMPI_SUCCESS) {
                return ret;
            }

            status = ucp_ep_flush(ep);
            if (status != UCS_OK) {
                opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                    "%s:%d: ucp_ep_flush failed: %d\n",
                                    __FILE__, __LINE__, status);
                return OMPI_ERROR;
            }

            if (ompi_datatype_is_predefined(origin_dt) || is_origin_contig) {
                ompi_op_reduce(op, (void *)origin_addr, temp_addr, (int)temp_count, temp_dt);
            } else {
                ucx_iovec_t *origin_ucx_iov = NULL;
                uint32_t origin_ucx_iov_count = 0;
                uint32_t origin_ucx_iov_idx = 0;

                ret = create_iov_list(origin_addr, origin_count, origin_dt,
                                      &origin_ucx_iov, &origin_ucx_iov_count);
                if (ret != OMPI_SUCCESS) {
                    return ret;
                }

                if ((op != &ompi_mpi_op_maxloc.op && op != &ompi_mpi_op_minloc.op) ||
                    ompi_datatype_is_contiguous_memory_layout(temp_dt, temp_count)) {
                    size_t temp_size;
                    ompi_datatype_type_size(temp_dt, &temp_size);
                    while (origin_ucx_iov_idx < origin_ucx_iov_count) {
                        int curr_count = origin_ucx_iov[origin_ucx_iov_idx].len / temp_size;
                        ompi_op_reduce(op, origin_ucx_iov[origin_ucx_iov_idx].addr,
                                       temp_addr, curr_count, temp_dt);
                        temp_addr = (void *)((char *)temp_addr + curr_count * temp_size);
                        origin_ucx_iov_idx++;
                    }
                } else {
                    int i;
                    void *curr_origin_addr = origin_ucx_iov[origin_ucx_iov_idx].addr;
                    for (i = 0; i < (int)temp_count; i++) {
                        ompi_op_reduce(op, curr_origin_addr,
                                       (void *)((char *)temp_addr + i * temp_extent),
                                       1, temp_dt);
                        curr_origin_addr = (void *)((char *)curr_origin_addr + temp_extent);
                        origin_ucx_iov_idx++;
                        if (curr_origin_addr >= (void *)((char *)origin_ucx_iov[origin_ucx_iov_idx].addr + origin_ucx_iov[origin_ucx_iov_idx].len)) {
                            origin_ucx_iov_idx++;
                            curr_origin_addr = origin_ucx_iov[origin_ucx_iov_idx].addr;
                        }
                    }
                }
                free(origin_ucx_iov);
            }

            ret = ompi_osc_ucx_put(temp_addr, (int)temp_count, temp_dt, target, target_disp,
                                   target_count, target_dt, win);
            if (ret != OMPI_SUCCESS) {
                return ret;
            }

            status = ucp_ep_flush(ep);
            if (status != UCS_OK) {
                opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                                    "%s:%d: ucp_ep_flush failed: %d\n",
                                    __FILE__, __LINE__, status);
                return OMPI_ERROR;
            }

            free(temp_addr);
        }
    }

    ret = end_atomicity(module, ep, target);

    return ret;
}
Example #5
0
/*
 * Build and launch an ML allgather collective operation.
 *
 * sbuf/scount/sdtype describe this rank's contribution (or MPI_IN_PLACE, in
 * which case the receive description is used and the contribution is read
 * from this rank's slot of rbuf); rbuf/rcount/rdtype describe the receive
 * side.  One of three setups is chosen from the per-rank packed size
 * (pack_len = scount * size of sdtype):
 *   1. pack_len <= small-message threshold: single fragment staged through
 *      an ML payload buffer;
 *   2. fragmentation enabled, or pack_len * comm_size < 1 MiB: first
 *      fragment staged through an ML payload buffer, remaining fragments
 *      driven by mca_coll_ml_allgather_frag_progress;
 *   3. otherwise: large-message path operating directly on sbuf/rbuf.
 *
 * On success *req is set to the request embedded in the collective operation
 * and OMPI_SUCCESS is returned; if the launch fails its return code is
 * returned and *req is left untouched.
 */
static inline __opal_attribute_always_inline__
int mca_coll_ml_allgather_start (const void *sbuf, int scount,
                                 struct ompi_datatype_t *sdtype,
                                 void* rbuf, int rcount,
                                 struct ompi_datatype_t *rdtype,
                                 struct ompi_communicator_t *comm,
                                 mca_coll_base_module_t *module,
                                 ompi_request_t **req)
{
    size_t pack_len, sdt_size;
    int ret, n_fragments = 1, comm_size;

    mca_coll_ml_topology_t *topo_info;
    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;

    mca_coll_ml_component_t *cm = &mca_coll_ml_component;

    mca_coll_ml_collective_operation_progress_t *coll_op;
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *) module;

    ptrdiff_t lb, extent;
    bool scontig, rcontig, in_place = false;

    /* check for in place setting: the send description mirrors the receive one */
    if (MPI_IN_PLACE == sbuf) {
        in_place = true;
        sdtype = rdtype;
        scount = rcount;
    }

    /* scontig could be != to rcontig */
    scontig = ompi_datatype_is_contiguous_memory_layout(sdtype, scount);
    rcontig = ompi_datatype_is_contiguous_memory_layout(rdtype, rcount);

    comm_size = ompi_comm_size(comm);

    ML_VERBOSE(10, ("Starting allgather"));

    /* NOTE(review): sdtype has already been handed to the contiguity check
     * above, so this assert comes after the first use. */
    assert(NULL != sdtype);
    /* Calculate size of the data,
     * at this stage, only contiguous data is supported */

    /* this is valid for allgather */
    ompi_datatype_type_size(sdtype, &sdt_size);
    pack_len = scount * sdt_size;

    /* In place: this rank's contribution lives in its own slot of rbuf. */
    if (in_place) {
        sbuf = (char *) rbuf + ompi_comm_rank(comm) * pack_len;
    }

    /* Allocate collective schedule and pack message */
    /* this is the total ending message size that will need to fit in the ml-buffer */
    if (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]) {
        /* The len of the message can not be larger than ML buffer size */
        /* NOTE(review): pack_len is a size_t but is formatted with %d here. */
        ML_VERBOSE(10, ("Single frag %d %d %d", pack_len, comm_size, ml_module->payload_block->size_buffer));
        assert(pack_len * comm_size <= ml_module->payload_block->size_buffer);

        /* Spin on progress until an ML payload buffer becomes available. */
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        /* change 1 */
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER],
                  sbuf, rbuf, pack_len, 0 /* offset for first pack */);

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);

        coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER;
        /* task setup callback function */
        coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;

        /* change 2 */
        /* Stage this rank's data into the ML buffer at
         * pack_len * (hier_layout_info[0].offset + level_one_index):
         * packed through a convertor when the send layout is non-contiguous,
         * copied directly otherwise. */
        if (!scontig) {
            coll_op->full_message.n_bytes_scheduled =
                mca_coll_ml_convertor_prepare(sdtype, scount, sbuf,
                                              &coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND);

            mca_coll_ml_convertor_pack(
                (void *) ((uintptr_t) src_buffer_desc->data_addr + pack_len *
                          (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                           coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
                pack_len, &coll_op->full_message.send_convertor);
        } else {
            /* change 3 */
            memcpy((void *)((uintptr_t) src_buffer_desc->data_addr + pack_len *
                            (coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                             coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index)),
                   sbuf, pack_len);

            coll_op->full_message.n_bytes_scheduled = pack_len;
        }

        /* Non-contiguous receive layout: prepare a convertor over the whole rbuf. */
        if (!rcontig) {
            mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf,
                                          &coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV);
        }

        /* Pick the unpack routine according to the topology's rank layout. */
        if (coll_op->coll_schedule->topo_info->ranks_contiguous) {
            coll_op->process_fn = mca_coll_ml_allgather_small_unpack_data;
        } else {
            coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data;
        }

        /* whole ml-buffer is used to send AND receive */
        coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        /* we can set the initial offset here */
        coll_op->variable_fn_params.sbuf_offset = 0;
        coll_op->variable_fn_params.rbuf_offset = 0;

        coll_op->variable_fn_params.count = scount;
        coll_op->fragment_data.fragment_size =
            coll_op->full_message.n_bytes_scheduled;

        /* For small CINCO, we may use the native datatype */
        coll_op->variable_fn_params.dtype = sdtype;
        coll_op->variable_fn_params.buffer_size = pack_len;
        coll_op->variable_fn_params.root = 0;
    } else if (cm->enable_fragmentation || pack_len * comm_size < (1 << 20)) {
        /* calculate the number of fragments and the size of each frag */
        size_t n_dts_per_frag, frag_len;
        int pipeline_depth = mca_coll_ml_component.pipeline_depth;

        /* Calculate the number of fragments required for this message careful watch the integer division !*/
        /* pack_len exceeds the threshold on this branch (the first test
         * failed), so frag_len ends up equal to the threshold. */
        frag_len = (pack_len <= (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER] ?
                    pack_len : (size_t) ml_module->small_message_thresholds[BCOL_ALLGATHER]);

        /* Whole datatypes per fragment, then a ceiling division for the
         * fragment count; the pipeline depth is capped at that count. */
        n_dts_per_frag = frag_len / sdt_size;
        n_fragments = (pack_len + sdt_size * n_dts_per_frag - 1) / (sdt_size * n_dts_per_frag);
        pipeline_depth = (n_fragments < pipeline_depth ? n_fragments : pipeline_depth);

        /* Spin on progress until an ML payload buffer becomes available. */
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        /* change 4 */
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER],
                  sbuf, rbuf, pack_len,
                  0 /* offset for first pack */);

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);
        topo_info = coll_op->coll_schedule->topo_info;

        /* task setup callback function */
        coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;

        if (!scontig) {
            /* A convertor prepared without a buffer is used to let
             * mca_coll_ml_convertor_get_send_frag_size() update frag_len. */
            coll_op->full_message.send_converter_bytes_packed =
                mca_coll_ml_convertor_prepare(
                    sdtype, scount, NULL,
                    &coll_op->full_message.dummy_convertor,
                    MCA_COLL_ML_NET_STREAM_SEND);

            coll_op->full_message.dummy_conv_position = 0;
            mca_coll_ml_convertor_get_send_frag_size(
                ml_module, &frag_len,
                &coll_op->full_message);

            /* change 5 */
            mca_coll_ml_convertor_prepare(sdtype, scount, sbuf,
                                          &coll_op->full_message.send_convertor, MCA_COLL_ML_NET_STREAM_SEND);

            /* Pack the first fragment into this rank's slot of the ML buffer. */
            mca_coll_ml_convertor_pack(
                (void *) ((uintptr_t) src_buffer_desc->data_addr + frag_len *
                          (topo_info->hier_layout_info[0].offset +
                           topo_info->hier_layout_info[0].level_one_index)),
                frag_len, &coll_op->full_message.send_convertor);
        } else {
            /* change 6 */
            /* Contiguous send data: copy the first fragment directly. */
            memcpy((void *)((uintptr_t)src_buffer_desc->data_addr + frag_len *
                            (topo_info->hier_layout_info[0].offset +
                             topo_info->hier_layout_info[0].level_one_index)),
                   sbuf, frag_len);
        }

        /* Non-contiguous receive layout: prepare a convertor over the whole rbuf. */
        if (!rcontig) {
            mca_coll_ml_convertor_prepare(rdtype, rcount * comm_size, rbuf,
                                          &coll_op->full_message.recv_convertor, MCA_COLL_ML_NET_STREAM_RECV);
        }

        coll_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data;

        /* hopefully this doesn't royally screw things up; the idea behind this is that
         * the whole ml-buffer is used to send and receive
         */
        coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        /* we can set the initial offset here */
        coll_op->variable_fn_params.sbuf_offset = 0;
        coll_op->variable_fn_params.rbuf_offset = 0;

        coll_op->fragment_data.buffer_desc = src_buffer_desc;

        coll_op->fragment_data.fragment_size = frag_len;
        coll_op->fragment_data.message_descriptor->n_active = 1;

        coll_op->full_message.n_bytes_scheduled = frag_len;
        coll_op->full_message.fragment_launcher = mca_coll_ml_allgather_frag_progress;

        coll_op->full_message.pipeline_depth = pipeline_depth;
        coll_op->fragment_data.current_coll_op = ML_SMALL_DATA_ALLGATHER;

        /* remember this is different for frags !! Caused data corruption when
         * not properly set. Need to be sure you have consistent units.
         */
        coll_op->variable_fn_params.count = frag_len;
        coll_op->variable_fn_params.dtype = MPI_BYTE; /* for fragmented data, we work in
                                                       * units of bytes. This means that
                                                       * all of our arithmetic is done
                                                       * in terms of bytes
                                                       */

        coll_op->variable_fn_params.root = 0;
        coll_op->variable_fn_params.frag_size = frag_len;
        coll_op->variable_fn_params.buffer_size = frag_len;
    } else {
        /* change 7 */
        ML_VERBOSE(10, ("ML_ALLGATHER_LARGE_DATA_KNOWN case."));
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_allgather_functions[ML_LARGE_DATA_ALLGATHER],
                  sbuf, rbuf, pack_len, 0 /* offset for first pack */);
        topo_info = coll_op->coll_schedule->topo_info;
        /* An ML buffer is attached only when the topology's bcol mode does
         * not carry the NO_ML_BUFFER_FOR_LARGE_MSG flag. */
        if (MCA_BCOL_BASE_NO_ML_BUFFER_FOR_LARGE_MSG & topo_info->all_bcols_mode) {
            MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, MCA_COLL_ML_NO_BUFFER, NULL);
        } else {
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
            while (NULL == src_buffer_desc) {
                opal_progress();
                src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
            }

            MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc);
        }

        /* not sure if I really need this here */
        coll_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;
        coll_op->process_fn = NULL;
        /* probably the most important piece: operate on the user buffers directly */
        coll_op->variable_fn_params.sbuf = sbuf;
        coll_op->variable_fn_params.rbuf = rbuf;
        coll_op->variable_fn_params.sbuf_offset = 0;
        coll_op->variable_fn_params.rbuf_offset = 0;
        coll_op->variable_fn_params.count = scount;
        coll_op->variable_fn_params.dtype = sdtype;/* for zero copy, we want the
                                                    * native datatype and actual count
                                                    */
        coll_op->variable_fn_params.root = 0;

        /* you still need to copy in your own data into the rbuf */
        /* don't need to do this if you have in place data */
        if (!in_place) {
            memcpy((char *) rbuf + ompi_comm_rank(comm) * pack_len, sbuf, pack_len);
        }
    }

    /* Settings common to all three paths. */
    coll_op->full_message.send_count = scount;
    coll_op->full_message.recv_count = rcount;

    coll_op->full_message.send_data_continguous = scontig;
    coll_op->full_message.recv_data_continguous = rcontig;

    ompi_datatype_get_extent(sdtype, &lb, &extent);
    coll_op->full_message.send_extent = (size_t) extent;

    ompi_datatype_get_extent(rdtype, &lb, &extent);
    coll_op->full_message.recv_extent = (size_t) extent;


    /* Fill in the function arguments */
    coll_op->variable_fn_params.sequence_num =
        OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1);
    coll_op->variable_fn_params.hier_factor = comm_size;

    MCA_COLL_ML_SET_ORDER_INFO(coll_op, n_fragments);


    /* Hand the operation to the sequential launcher; *req is only
     * published once the launch has succeeded. */
    ret = mca_coll_ml_launch_sequential_collective (coll_op);
    if (OMPI_SUCCESS != ret) {
        ML_VERBOSE(10, ("Failed to launch"));
        return ret;
    }

    *req = &coll_op->full_message.super;

    return OMPI_SUCCESS;
}
Example #6
0
/*
 * Get-accumulate for the Portals4 one-sided component.
 *
 * Only contiguous origin/result/target layouts are handled (a buffer whose
 * count is 0 is exempt from the contiguity check).  The transfer is split
 * into chunks of at most module->fetch_atomic_max bytes and issued as:
 *   - MPI_REPLACE: PtlSwap (PTL_SWAP) of origin data against the target,
 *                  previous target contents returned in the result buffer;
 *   - MPI_NO_OP:   PtlGet of the target contents into the result buffer;
 *   - other ops:   PtlFetchAtomic with the translated op and datatype.
 * module->opcount is incremented once per chunk before it is issued.
 *
 * Returns OMPI_SUCCESS when every chunk was accepted,
 * OMPI_ERR_NOT_SUPPORTED for non-contiguous layouts, or the return code of
 * the first failing call.  Issuing stops at the first chunk that fails, as
 * ompi_osc_portals4_accumulate() does.
 */
int
ompi_osc_portals4_get_accumulate(const void *origin_addr,
                                 int origin_count,
                                 struct ompi_datatype_t *origin_dt,
                                 void *result_addr,
                                 int result_count,
                                 struct ompi_datatype_t *result_dt,
                                 int target,
                                 MPI_Aint target_disp,
                                 int target_count,
                                 struct ompi_datatype_t *target_dt,
                                 struct ompi_op_t *op,
                                 struct ompi_win_t *win)
{
    int ret;
    ompi_osc_portals4_module_t *module =
        (ompi_osc_portals4_module_t*) win->w_osc_module;
    ptl_process_t peer = ompi_osc_portals4_get_peer(module, target);
    size_t length, sent;
    size_t offset;
    ptl_op_t ptl_op;
    ptl_datatype_t ptl_dt;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "get_accumulate: 0x%lx, %d, %s, 0x%lx, %d, %s, %d, %d, %d, %s, %s, 0x%lx",
                         (unsigned long) origin_addr, origin_count,
                         origin_dt->name, (unsigned long) result_addr,
                         result_count, result_dt->name,
                         target, (int) target_disp,
                         target_count, target_dt->name,
                         op->o_name,
                         (unsigned long) win));

    offset = get_displacement(module, target) * target_disp;

    /* we don't support non-contiguous buffers.  but if the count is 0, we don't care if buffer is non-contiguous. */
    if ((origin_count > 0 && !ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count)) ||
        (result_count > 0 && !ompi_datatype_is_contiguous_memory_layout(result_dt, result_count)) ||
        (target_count > 0 && !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count))) {
        opal_output(ompi_osc_base_framework.framework_output,
                    "MPI_Get_accumulate: transfer of non-contiguous memory is not currently supported.\n");
        return OMPI_ERR_NOT_SUPPORTED;
    }

    sent = 0;
    if (MPI_REPLACE == op) {
        ptl_size_t result_md_offset, origin_md_offset;

        ret = ompi_datatype_type_size(origin_dt, &length);
        if (OMPI_SUCCESS != ret) {
            return ret;
        }
        ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt);
        if (OMPI_SUCCESS != ret) {
            return ret;
        }
        length *= origin_count;

        /* The buffer addresses are passed as offsets into module->md_h. */
        result_md_offset = (ptl_size_t) result_addr;
        origin_md_offset = (ptl_size_t) origin_addr;

        do {
            size_t msg_length = MIN(module->fetch_atomic_max, length - sent);

            (void)opal_atomic_add_64(&module->opcount, 1);

            ret = PtlSwap(module->md_h,
                          result_md_offset + sent,
                          module->md_h,
                          origin_md_offset + sent,
                          msg_length,
                          peer,
                          module->pt_idx,
                          module->match_bits,
                          offset + sent,
                          NULL,
                          0,
                          NULL,
                          PTL_SWAP,
                          ptl_dt);
            /* Check every chunk, not just the last one. */
            if (OMPI_SUCCESS != ret) {
                return ret;
            }
            sent += msg_length;
        } while (sent < length);
    } else if (MPI_NO_OP == op) {
        ptl_size_t md_offset;

        /* A pure fetch is sized by the target description. */
        ret = ompi_datatype_type_size(target_dt, &length);
        if (OMPI_SUCCESS != ret) {
            return ret;
        }
        length *= target_count;

        md_offset = (ptl_size_t) result_addr;

        do {
            size_t msg_length = MIN(module->fetch_atomic_max, length - sent);

            (void)opal_atomic_add_64(&module->opcount, 1);

            ret = PtlGet(module->md_h,
                         md_offset + sent,
                         msg_length,
                         peer,
                         module->pt_idx,
                         module->match_bits,
                         offset + sent,
                         NULL);
            /* Check every chunk, not just the last one. */
            if (OMPI_SUCCESS != ret) {
                return ret;
            }
            sent += msg_length;
        } while (sent < length);
    } else {
        ptl_size_t result_md_offset, origin_md_offset;

        ret = ompi_datatype_type_size(origin_dt, &length);
        if (OMPI_SUCCESS != ret) {
            return ret;
        }
        length *= origin_count;

        result_md_offset = (ptl_size_t) result_addr;
        origin_md_offset = (ptl_size_t) origin_addr;

        /* Translate the MPI datatype and op into their Portals forms. */
        ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt);
        if (OMPI_SUCCESS != ret) {
            return ret;
        }

        ret = ompi_osc_portals4_get_op(op, &ptl_op);
        if (OMPI_SUCCESS != ret) {
            return ret;
        }

        do {
            size_t msg_length = MIN(module->fetch_atomic_max, length - sent);

            (void)opal_atomic_add_64(&module->opcount, 1);

            ret = PtlFetchAtomic(module->md_h,
                                 result_md_offset + sent,
                                 module->md_h,
                                 origin_md_offset + sent,
                                 msg_length,
                                 peer,
                                 module->pt_idx,
                                 module->match_bits,
                                 offset + sent,
                                 NULL,
                                 0,
                                 ptl_op,
                                 ptl_dt);
            /* Check every chunk, not just the last one. */
            if (OMPI_SUCCESS != ret) {
                return ret;
            }
            sent += msg_length;
        } while (sent < length);
    }

    return OMPI_SUCCESS;
}
Example #7
0
/*
 * Accumulate for the Portals4 one-sided component.
 *
 * Only contiguous origin/target layouts are handled.  The origin data is
 * sent in chunks of at most module->atomic_max bytes: as PtlPut for
 * MPI_REPLACE, otherwise as PtlAtomic with the translated op and datatype.
 * module->opcount is incremented once per chunk before it is issued.
 *
 * Returns OMPI_SUCCESS when every chunk was accepted,
 * OMPI_ERR_NOT_SUPPORTED for non-contiguous layouts, or the return code of
 * the first failing call.
 */
int
ompi_osc_portals4_accumulate(const void *origin_addr,
                             int origin_count,
                             struct ompi_datatype_t *origin_dt,
                             int target,
                             OPAL_PTRDIFF_TYPE target_disp,
                             int target_count,
                             struct ompi_datatype_t *target_dt,
                             struct ompi_op_t *op,
                             struct ompi_win_t *win)
{
    int ret;
    ompi_osc_portals4_module_t *module =
        (ompi_osc_portals4_module_t*) win->w_osc_module;
    ptl_process_t peer = ompi_osc_portals4_get_peer(module, target);
    size_t length, sent;
    size_t offset;
    ptl_size_t md_offset;
    ptl_op_t ptl_op;
    ptl_datatype_t ptl_dt;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "accumulate: 0x%lx, %d, %s, %d, %d, %d, %s, %s, 0x%lx",
                         (unsigned long) origin_addr, origin_count,
                         origin_dt->name, target, (int) target_disp,
                         target_count, target_dt->name,
                         op->o_name,
                         (unsigned long) win));

    offset = get_displacement(module, target) * target_disp;

    if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) ||
        !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) {
        opal_output(ompi_osc_base_framework.framework_output,
                    "MPI_Accumulate: transfer of non-contiguous memory is not currently supported.\n");
        return OMPI_ERR_NOT_SUPPORTED;
    }

    ret = ompi_datatype_type_size(origin_dt, &length);
    if (OMPI_SUCCESS != ret) {
        return ret;
    }
    length *= origin_count;
    sent = 0;

    /* The origin address is passed as an offset into module->md_h. */
    md_offset = (ptl_size_t) origin_addr;

    /* The Portals op/datatype do not change between chunks, so translate
     * them once, and do it before opcount is touched: a translation failure
     * must not leave a pending-operation count behind for an operation that
     * was never issued. */
    if (MPI_REPLACE != op) {
        ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt);
        if (OMPI_SUCCESS != ret) {
            return ret;
        }

        ret = ompi_osc_portals4_get_op(op, &ptl_op);
        if (OMPI_SUCCESS != ret) {
            return ret;
        }
    }

    do {
        size_t msg_length = MIN(module->atomic_max, length - sent);
        (void)opal_atomic_add_64(&module->opcount, 1);

        if (MPI_REPLACE == op) {
            ret = PtlPut(module->md_h,
                         md_offset + sent,
                         msg_length,
                         PTL_ACK_REQ,
                         peer,
                         module->pt_idx,
                         module->match_bits,
                         offset + sent,
                         NULL,
                         0);
        } else {
            ret = PtlAtomic(module->md_h,
                            md_offset + sent,
                            msg_length,
                            PTL_ACK_REQ,
                            peer,
                            module->pt_idx,
                            module->match_bits,
                            offset + sent,
                            NULL,
                            0,
                            ptl_op,
                            ptl_dt);
        }
        if (OMPI_SUCCESS != ret) {
            return ret;
        }
        sent += msg_length;
    } while (sent < length);

    return OMPI_SUCCESS;
}
Example #8
0
/*
 * Request-based fetch-and-accumulate on contiguous data.
 *
 * A request is allocated for the window and published through *ompi_req.
 * The transfer is cut into segments of at most module->fetch_atomic_max
 * bytes; each segment increments module->opcount and request->ops_expected
 * and is issued with the request as its user pointer:
 *   - MPI_REPLACE -> PtlSwap with PTL_SWAP (length taken from origin_dt)
 *   - MPI_NO_OP   -> PtlGet into the result buffer (length from target_dt)
 *   - otherwise   -> PtlFetchAtomic with the translated op/datatype
 *
 * Issuing stops at the first segment that fails, so the error is not
 * overwritten by the status of a later segment.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_TEMP_OUT_OF_RESOURCE when no request can
 * be allocated, OMPI_ERR_NOT_SUPPORTED for non-contiguous datatypes, or the
 * error from the size lookup, op/datatype translation, or Portals call.
 * On every error path after allocation the request is handed back with
 * OMPI_OSC_PORTALS4_REQUEST_RETURN; *ompi_req has already been written by
 * then, so the caller should not use it when an error is returned.
 */
int
ompi_osc_portals4_rget_accumulate(const void *origin_addr,
                                  int origin_count,
                                  struct ompi_datatype_t *origin_dt,
                                  void *result_addr,
                                  int result_count,
                                  struct ompi_datatype_t *result_dt,
                                  int target,
                                  MPI_Aint target_disp,
                                  int target_count,
                                  struct ompi_datatype_t *target_dt,
                                  struct ompi_op_t *op,
                                  struct ompi_win_t *win,
                                  struct ompi_request_t **ompi_req)
{
    int ret;
    ompi_osc_portals4_request_t *request;
    ompi_osc_portals4_module_t *module =
        (ompi_osc_portals4_module_t*) win->w_osc_module;
    ptl_process_t peer = ompi_osc_portals4_get_peer(module, target);
    size_t length, sent;
    size_t offset;
    ptl_op_t ptl_op;
    ptl_datatype_t ptl_dt;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "rget_accumulate: 0x%lx, %d, %s, 0x%lx, %d, %s, %d, %d, %d, %s, %s, 0x%lx",
                         (unsigned long) origin_addr, origin_count,
                         origin_dt->name, (unsigned long) result_addr,
                         result_count, result_dt->name,
                         target, (int) target_disp,
                         target_count, target_dt->name,
                         op->o_name,
                         (unsigned long) win));

    OMPI_OSC_PORTALS4_REQUEST_ALLOC(win, request);
    if (NULL == request) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    *ompi_req = &request->super;

    offset = get_displacement(module, target) * target_disp;

    if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) ||
        !ompi_datatype_is_contiguous_memory_layout(result_dt, result_count) ||
        !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) {
        OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
        opal_output(ompi_osc_base_framework.framework_output,
                    "MPI_Rget_accumulate: transfer of non-contiguous memory is not currently supported.\n");
        return OMPI_ERR_NOT_SUPPORTED;
    } else {
        sent = 0;

        if (MPI_REPLACE == op) {
            ptl_size_t result_md_offset, origin_md_offset;

            ret = ompi_datatype_type_size(origin_dt, &length);
            if (OMPI_SUCCESS != ret) {
                OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
                return ret;
            }
            ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt);
            if (OMPI_SUCCESS != ret) {
                OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
                return ret;
            }
            length *= origin_count;

            /* Buffer addresses are used directly as MD offsets. */
            result_md_offset = (ptl_size_t) result_addr;
            origin_md_offset = (ptl_size_t) origin_addr;

            do {
                size_t msg_length = MIN(module->fetch_atomic_max, length - sent);

                (void)opal_atomic_add_64(&module->opcount, 1);
                request->ops_expected++;

                ret = PtlSwap(module->req_md_h,
                              result_md_offset + sent,
                              module->md_h,
                              origin_md_offset + sent,
                              msg_length,
                              peer,
                              module->pt_idx,
                              module->match_bits,
                              offset + sent,
                              request,
                              0,
                              NULL,
                              PTL_SWAP,
                              ptl_dt);
                if (OMPI_SUCCESS != ret) {
                    /* keep the first error; handled after the branch */
                    break;
                }
                sent += msg_length;
            } while (sent < length);
        } else if (MPI_NO_OP == op) {
            ptl_size_t md_offset;

            /* Pure fetch: the amount of data is defined by the target side. */
            ret = ompi_datatype_type_size(target_dt, &length);
            if (OMPI_SUCCESS != ret) {
                OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
                return ret;
            }
            length *= target_count;

            md_offset = (ptl_size_t) result_addr;

            do {
                size_t msg_length = MIN(module->fetch_atomic_max, length - sent);

                (void)opal_atomic_add_64(&module->opcount, 1);
                request->ops_expected++;

                ret = PtlGet(module->req_md_h,
                             md_offset + sent,
                             msg_length,
                             peer,
                             module->pt_idx,
                             module->match_bits,
                             offset + sent,
                             request);
                if (OMPI_SUCCESS != ret) {
                    /* keep the first error; handled after the branch */
                    break;
                }
                sent += msg_length;
            } while (sent < length);
        } else {
            ptl_size_t result_md_offset, origin_md_offset;

            ret = ompi_datatype_type_size(origin_dt, &length);
            if (OMPI_SUCCESS != ret) {
                OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
                return ret;
            }
            length *= origin_count;

            result_md_offset = (ptl_size_t) result_addr;
            origin_md_offset = (ptl_size_t) origin_addr;

            /* A failed translation must give the request back, exactly like
             * the other error paths in this function. */
            ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt);
            if (OMPI_SUCCESS != ret) {
                OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
                return ret;
            }

            ret = ompi_osc_portals4_get_op(op, &ptl_op);
            if (OMPI_SUCCESS != ret) {
                OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
                return ret;
            }

            do {
                size_t msg_length = MIN(module->fetch_atomic_max, length - sent);

                (void)opal_atomic_add_64(&module->opcount, 1);
                request->ops_expected++;

                ret = PtlFetchAtomic(module->req_md_h,
                                     result_md_offset + sent,
                                     module->md_h,
                                     origin_md_offset + sent,
                                     msg_length,
                                     peer,
                                     module->pt_idx,
                                     module->match_bits,
                                     offset + sent,
                                     request,
                                     0,
                                     ptl_op,
                                     ptl_dt);
                if (OMPI_SUCCESS != ret) {
                    /* keep the first error; handled after the branch */
                    break;
                }
                sent += msg_length;
            } while (sent < length);
        }

        /* Reached with an error only when a segment failed to issue. */
        if (OMPI_SUCCESS != ret) {
            OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
            return ret;
        }
    }

    return OMPI_SUCCESS;
}
Example #9
0
/*
 * Request-based get of contiguous data from the target window.
 *
 * A request is allocated for the window and published through *ompi_req.
 * The whole transfer is issued as a single PtlGet on module->req_md_h with
 * the request as its user pointer; module->opcount is incremented and
 * request->ops_expected is set to 1 for that one operation.
 *
 * target_count/target_dt are only used for the contiguity check and the
 * debug trace; the transfer length comes from origin_dt * origin_count.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_TEMP_OUT_OF_RESOURCE when no request can
 * be allocated, OMPI_ERR_NOT_SUPPORTED for non-contiguous datatypes, or the
 * error from the size lookup or PtlGet.  On every error path after
 * allocation the request is handed back with
 * OMPI_OSC_PORTALS4_REQUEST_RETURN; *ompi_req has already been written by
 * then, so the caller should not use it when an error is returned.
 */
int
ompi_osc_portals4_rget(void *origin_addr,
                       int origin_count,
                       struct ompi_datatype_t *origin_dt,
                       int target,
                       OPAL_PTRDIFF_TYPE target_disp,
                       int target_count,
                       struct ompi_datatype_t *target_dt,
                       struct ompi_win_t *win,
                       struct ompi_request_t **ompi_req)
{
    int ret;
    ompi_osc_portals4_request_t *request;
    ompi_osc_portals4_module_t *module =
        (ompi_osc_portals4_module_t*) win->w_osc_module;
    ptl_process_t peer = ompi_osc_portals4_get_peer(module, target);
    size_t length;
    size_t offset;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "rget: 0x%lx, %d, %s, %d, %d, %d, %s, 0x%lx",
                         (unsigned long) origin_addr, origin_count,
                         origin_dt->name, target, (int) target_disp,
                         target_count, target_dt->name,
                         (unsigned long) win));

    OMPI_OSC_PORTALS4_REQUEST_ALLOC(win, request);
    if (NULL == request) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    *ompi_req = &request->super;

    offset = get_displacement(module, target) * target_disp;

    if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) ||
        !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) {
        OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
        opal_output(ompi_osc_base_framework.framework_output,
                    "MPI_Rget: transfer of non-contiguous memory is not currently supported.\n");
        return OMPI_ERR_NOT_SUPPORTED;
    }

    /* Resolve the transfer size before any bookkeeping so that a failed
     * lookup does not leave an operation counted that was never issued. */
    ret = ompi_datatype_type_size(origin_dt, &length);
    if (OMPI_SUCCESS != ret) {
        OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
        return ret;
    }
    length *= origin_count;

    (void)opal_atomic_add_64(&module->opcount, 1);
    request->ops_expected = 1;

    /* The origin address is used directly as the offset into req_md_h. */
    ret = PtlGet(module->req_md_h,
                 (ptl_size_t) origin_addr,
                 length,
                 peer,
                 module->pt_idx,
                 module->match_bits,
                 offset,
                 request);
    if (OMPI_SUCCESS != ret) {
        OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
        return ret;
    }

    return OMPI_SUCCESS;
}
Example #10
0
/*
 * Set up and launch an ML reduce as a sequential collective.
 *
 * One of three paths is chosen:
 *   1. small message: pack_len is below small_message_thresholds[BCOL_REDUCE]/4;
 *      the whole send buffer is copied into one ML buffer.
 *   2. fragmented: fragmentation is enabled in the component or the datatype
 *      is not contiguous; the first fragment is copied into an ML buffer and
 *      mca_coll_ml_reduce_frag_progress is installed as fragment launcher.
 *   3. zero-copy: the user buffers are passed through directly (an ML buffer
 *      is still allocated, see the comment in that branch).
 * Every path spins on opal_progress() until an ML buffer can be allocated.
 *
 * sbuf, rbuf          user send/receive buffers; MPI_IN_PLACE as sbuf means
 *                     rbuf is used as the source
 * count, dtype, op    element count, datatype and reduction operation
 * root                rank of the root; only the root gets the unpack callback
 * comm                communicator (used here only to obtain the local rank)
 * ml_module           ML module providing buffers, thresholds and schedules
 * req                 on success, receives the request of the launched
 *                     collective
 * small_data_reduce   index into ml_module->coll_ml_reduce_functions used by
 *                     the small-message and fragmented paths
 * large_data_reduce   index into ml_module->coll_ml_reduce_functions used by
 *                     the zero-copy path
 *
 * Returns OMPI_SUCCESS, OMPI_ERROR when the extent lookup or a buffer copy
 * fails or the datatype is too large to fragment, or the error returned by
 * mca_coll_ml_launch_sequential_collective().
 */
static inline __opal_attribute_always_inline__
int parallel_reduce_start (void *sbuf, void *rbuf, int count,
                           struct ompi_datatype_t *dtype, struct ompi_op_t *op,
                           int root,
                           struct ompi_communicator_t *comm,
                           mca_coll_ml_module_t *ml_module,
                           ompi_request_t **req,
                           int small_data_reduce,
                           int large_data_reduce) {
    ptrdiff_t lb, extent;
    size_t pack_len, dt_size;
    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL;
    mca_coll_ml_collective_operation_progress_t * coll_op = NULL;
    bool contiguous = ompi_datatype_is_contiguous_memory_layout(dtype, count);
    mca_coll_ml_component_t *cm = &mca_coll_ml_component;
    int ret, n_fragments = 1, frag_len,
             pipeline_depth, n_dts_per_frag, rank;

    /* In-place reduce: the receive buffer doubles as the source. */
    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
    }

    /* Only the extent is used below; lb is fetched but not needed. */
    ret = ompi_datatype_get_extent(dtype, &lb, &extent);
    if (ret < 0) {
        return OMPI_ERROR;
    }

    rank = ompi_comm_rank (comm);

    /* Message size is computed from the datatype extent, not its packed size. */
    dt_size = (size_t) extent;
    pack_len = count * dt_size;

    /* We use a separate receive and send buffer so only half the buffer is usable.
     * NOTE(review): the code divides the threshold by 4, not 2 -- confirm
     * where the remaining factor of two comes from. */
    if (pack_len < (size_t) ml_module->small_message_thresholds[BCOL_REDUCE] / 4) {
        /* The len of the message can not be larger than ML buffer size */
        assert(pack_len <= ml_module->payload_block->size_buffer);

        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);

        /* NOTE(review): the value logged here is REDUCE_SMALL_MESSAGE_THRESHOLD,
         * while the branch condition uses small_message_thresholds[BCOL_REDUCE] / 4
         * -- confirm the two agree. */
        ML_VERBOSE(10,("Using small data reduce (threshold = %d)",
                       REDUCE_SMALL_MESSAGE_THRESHOLD));
        /* Busy-wait, driving progress, until an ML buffer becomes available. */
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_reduce_functions[small_data_reduce],
                  sbuf, rbuf, pack_len, 0);

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);

        /* Both send and receive sides operate on the ML buffer. */
        coll_op->variable_fn_params.rbuf = src_buffer_desc->data_addr;
        coll_op->variable_fn_params.sbuf = src_buffer_desc->data_addr;
        coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index;
        coll_op->variable_fn_params.src_desc = src_buffer_desc;
        coll_op->variable_fn_params.count = count;

        /* Stage the complete user message in the ML buffer. */
        ret = ompi_datatype_copy_content_same_ddt(dtype, count,
                (void *) (uintptr_t) src_buffer_desc->data_addr, (char *) sbuf);
        if (ret < 0) {
            /* NOTE(review): src_buffer_desc and coll_op are not released on
             * this path -- confirm whether they are reclaimed elsewhere. */
            return OMPI_ERROR;
        }

    } else if (cm->enable_fragmentation || !contiguous) {
        ML_VERBOSE(1,("Using Fragmented Reduce "));

        /* fragment the data */
        /* reject datatypes whose single element does not fit in a fragment */
        if (dt_size > (size_t) ml_module->small_message_thresholds[BCOL_REDUCE] / 4) {
            ML_ERROR(("Sorry, but we don't support datatypes that large"));
            return OMPI_ERROR;
        }

        /* calculate the number of data types that can fit per ml-buffer */
        n_dts_per_frag = ml_module->small_message_thresholds[BCOL_REDUCE] / (4 * dt_size);

        /* calculate the number of fragments */
        n_fragments = (count + n_dts_per_frag - 1) / n_dts_per_frag; /* round up */

        /* calculate the actual pipeline depth */
        pipeline_depth = n_fragments < cm->pipeline_depth ? n_fragments : cm->pipeline_depth;

        /* calculate the fragment size */
        frag_len = n_dts_per_frag * dt_size;

        /* allocate an ml buffer */
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        /* The fragmented path reuses the small-data reduce function. */
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_reduce_functions[small_data_reduce],
                  sbuf,rbuf,
                  pack_len,
                  0 /* offset for first pack */);

        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);


        coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        /* Bookkeeping for the first fragment; later fragments are started
         * through the fragment_launcher callback. */
        coll_op->fragment_data.message_descriptor->n_active = 1;
        coll_op->full_message.n_bytes_scheduled = frag_len;
        coll_op->full_message.fragment_launcher = mca_coll_ml_reduce_frag_progress;
        coll_op->full_message.pipeline_depth = pipeline_depth;
        coll_op->fragment_data.current_coll_op = small_data_reduce;
        coll_op->fragment_data.fragment_size = frag_len;

        coll_op->variable_fn_params.count = n_dts_per_frag;  /* seems fishy */
        coll_op->variable_fn_params.buffer_size = frag_len;
        coll_op->variable_fn_params.src_desc = src_buffer_desc;
        /* copy into the ml-buffer */
        ret = ompi_datatype_copy_content_same_ddt(dtype, n_dts_per_frag,
                (char *) src_buffer_desc->data_addr, (char *) sbuf);
        if (ret < 0) {
            /* NOTE(review): src_buffer_desc and coll_op are not released on
             * this path -- confirm whether they are reclaimed elsewhere. */
            return OMPI_ERROR;
        }
    } else {
        ML_VERBOSE(1,("Using zero-copy ptp reduce"));
        coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_reduce_functions[large_data_reduce],
                  sbuf, rbuf, pack_len, 0);

        /* The user buffers are handed to the reduce functions unchanged. */
        coll_op->variable_fn_params.userbuf =
            coll_op->variable_fn_params.sbuf = sbuf;

        coll_op->variable_fn_params.rbuf = rbuf;

        /* The ML buffer is used for testing. Later, when we
         * switch to use knem/mmap/portals this should be replaced
         * appropriately
         */
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        while (NULL == src_buffer_desc) {
            opal_progress();
            src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        }

        coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index;
        coll_op->variable_fn_params.src_desc = src_buffer_desc;
        coll_op->variable_fn_params.count = count;
    }

    /* Only the root installs the unpack callback; other ranks get none. */
    coll_op->process_fn = (rank != root) ? NULL : mca_coll_ml_reduce_unpack;

    /* Set common parts */
    coll_op->fragment_data.buffer_desc = src_buffer_desc;
    coll_op->variable_fn_params.dtype = dtype;
    coll_op->variable_fn_params.op = op;

    /* NTH: the root, root route, and root flag are set in the task setup */

    /* Fill in the function arguments */
    /* Send data starts at offset 0; the receive offset is half of the
     * buffer space left after data_offset. */
    coll_op->variable_fn_params.sbuf_offset = 0;
    coll_op->variable_fn_params.rbuf_offset = (ml_module->payload_block->size_buffer -
            ml_module->data_offset)/2;

    /* Keep track of the global root of this operation */
    coll_op->global_root = root;

    coll_op->variable_fn_params.sequence_num =
        OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1);
    coll_op->sequential_routine.current_active_bcol_fn = 0;
    /* set the task setup callback  */
    coll_op->sequential_routine.seq_task_setup = mca_coll_ml_reduce_task_setup;

    /* Reduce requires the schedule to be fixed. If we use other (changing) schedule,
       the operation might result in different result. */
    /* The schedule is selected by the hierarchy level recorded for the root
     * in the topology's route vector. */
    coll_op->coll_schedule->component_functions = coll_op->coll_schedule->
            comp_fn_arr[coll_op->coll_schedule->topo_info->route_vector[root].level];

    /* Launch the collective */
    ret = mca_coll_ml_launch_sequential_collective (coll_op);
    if (OMPI_SUCCESS != ret) {
        ML_VERBOSE(10, ("Failed to launch reduce collective"));
        return ret;
    }

    /* Hand the request of the launched collective back to the caller. */
    *req = &coll_op->full_message.super;

    return OMPI_SUCCESS;
}