Example #1
static int mca_coll_ml_allgather_small_unpack_data(mca_coll_ml_collective_operation_progress_t *coll_op)
{
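    /* Unpack a completed small-data allgather: copy the gathered payload
     * for all ranks from the ML buffer into the user's receive buffer,
     * or drive the receive convertor when the recv datatype is
     * non-contiguous. */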
    bool rcontig = coll_op->full_message.recv_data_continguous;
    int n_ranks_in_comm = ompi_comm_size(OP_ML_MODULE(coll_op)->comm);

    void *dest = (void *)((uintptr_t)coll_op->full_message.dest_user_addr +
                          (uintptr_t)coll_op->full_message.n_bytes_delivered);
    void *src = (void *)((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr +
                         (size_t)coll_op->variable_fn_params.rbuf_offset);

    if (rcontig) {
        memcpy(dest, src, n_ranks_in_comm * coll_op->full_message.n_bytes_scheduled);
    } else {
        mca_coll_ml_convertor_unpack(src, n_ranks_in_comm * coll_op->full_message.n_bytes_scheduled,
                                     &coll_op->fragment_data.message_descriptor->recv_convertor);
    }

    return OMPI_SUCCESS;
}
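The non-contiguous branch above defers to a thin wrapper around OPAL's datatype engine instead of a flat memcpy. That wrapper is not shown on this page; the following is only a sketch of what such a wrapper plausibly looks like, assuming the standard opal_convertor_unpack API (a single-entry iovec pointed at the ML buffer). It is not the verbatim source of mca_coll_ml_convertor_unpack:

#include <stdint.h>
#include <sys/uio.h>                          /* struct iovec */
#include "opal/datatype/opal_convertor.h"     /* opal_convertor_unpack */

/* Sketch: scatter buff_size packed bytes at data_addr into the user's
 * (possibly non-contiguous) datatype layout via the receive convertor. */
static int convertor_unpack_sketch(void *data_addr, size_t buff_size,
                                   opal_convertor_t *convertor)
{
    struct iovec iov;
    uint32_t iov_count = 1;   /* one contiguous source segment */
    size_t max_data = 0;      /* set to the number of bytes unpacked */

    iov.iov_base = data_addr;
    iov.iov_len  = buff_size;
    opal_convertor_unpack(convertor, &iov, &iov_count, &max_data);

    return (int) max_data;
}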
Example #2
static int mca_coll_ml_allgather_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op)
{
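    /* Allgather fragment progress: launch additional fragments of a
     * pending message, packing each into a freshly allocated ML buffer,
     * until the pipeline is full or the whole message is scheduled. */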
    /* local variables */
    int ret;
    size_t frag_len, dt_size;

    const void *buf;
    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;
    mca_coll_ml_collective_operation_progress_t *new_op;

    mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op);
    bool scontig = coll_op->fragment_data.message_descriptor->send_data_continguous;

    ompi_datatype_type_size(coll_op->variable_fn_params.dtype, &dt_size);
    /* Keep the pipeline filled with fragments */
    while (coll_op->fragment_data.message_descriptor->n_active <
            coll_op->fragment_data.message_descriptor->pipeline_depth) {
        /* If an active fragment happens to have completed the collective
         * during a hop into the progress engine, don't launch a new
         * fragment; break and return instead.
         */
        if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled
                == coll_op->fragment_data.message_descriptor->n_bytes_total) {
            break;
        }
        /* Get an ml buffer */
        src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module);
        if (NULL == src_buffer_desc) {
            /* No buffers are available. If there are outstanding
             * fragments, break out and let an active fragment
             * handle this later.
             */
            if (0 < coll_op->fragment_data.message_descriptor->n_active) {
                return OMPI_SUCCESS;
            } else {
                /* The fragment is already on the list and we still
                 * have no ML resources.
                 * Return busy */
                if (coll_op->pending & REQ_OUT_OF_MEMORY) {
                    ML_VERBOSE(10,("Out of resources %p", coll_op));
                    return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
                }

                coll_op->pending |= REQ_OUT_OF_MEMORY;
                opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list),
                                 (opal_list_item_t *)coll_op);
                ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op));
                return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
            }
        }

        /* Get a new collective descriptor and initialize it */
        new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_allgather_functions[ML_SMALL_DATA_ALLGATHER],
                  coll_op->fragment_data.message_descriptor->src_user_addr,
                  coll_op->fragment_data.message_descriptor->dest_user_addr,
                  coll_op->fragment_data.message_descriptor->n_bytes_total,
                  coll_op->fragment_data.message_descriptor->n_bytes_scheduled);

        new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op;
        new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor;

        /* set the task setup callback  */
        new_op->sequential_routine.seq_task_setup = mca_coll_ml_allgather_task_setup;

        /*
        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op,
                src_buffer_desc->buffer_index, src_buffer_desc);
        */

        /* We need this address for pointer arithmetic in memcpy */
        buf = coll_op->fragment_data.message_descriptor->src_user_addr;

        if (!scontig) {
            frag_len = ml_module->small_message_thresholds[BCOL_ALLGATHER];
            mca_coll_ml_convertor_get_send_frag_size(
                ml_module, &frag_len,
                coll_op->fragment_data.message_descriptor);

            mca_coll_ml_convertor_pack(
                (void *) ((uintptr_t) src_buffer_desc->data_addr +
                          frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                          frag_len * coll_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index),
                frag_len, &coll_op->fragment_data.message_descriptor->send_convertor);
        } else {
            /* calculate new frag length, there are some issues here */
            frag_len = (coll_op->fragment_data.message_descriptor->n_bytes_total -
                        coll_op->fragment_data.message_descriptor->n_bytes_scheduled <
                        coll_op->fragment_data.fragment_size ?
                        coll_op->fragment_data.message_descriptor->n_bytes_total -
                        coll_op->fragment_data.message_descriptor->n_bytes_scheduled :
                        coll_op->fragment_data.fragment_size);

            /* everybody copies in, based on the new values */
            memcpy((void *) ((uintptr_t)src_buffer_desc->data_addr +
                             frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].offset +
                             frag_len * new_op->coll_schedule->topo_info->hier_layout_info[0].level_one_index),
                   (void *) ((uintptr_t) buf + (uintptr_t)
                             coll_op->fragment_data.message_descriptor->n_bytes_scheduled), frag_len);
        }

        new_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr;
        new_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr;

        /* update the number of bytes scheduled */
        new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len;
        /* everyone needs an unpack function */
        new_op->process_fn = mca_coll_ml_allgather_noncontiguous_unpack_data;

        new_op->fragment_data.fragment_size = frag_len;
        new_op->fragment_data.buffer_desc = src_buffer_desc;

        /* Setup fragment specific data */
        ++(new_op->fragment_data.message_descriptor->n_active);

        ML_VERBOSE(10, ("Start more, My index %d ",
                        new_op->fragment_data.buffer_desc->buffer_index));

        /* this is a bit buggy */
        ML_SET_VARIABLE_PARAMS_BCAST(
            new_op,
            OP_ML_MODULE(new_op),
            frag_len /* yes, we have consistent units, so this makes sense */,
            MPI_BYTE /* we fragment according to buffer size;
                      * since we don't reduce the data we needn't
                      * keep "whole" datatypes and may freely
                      * fragment without regard for multiples
                      * of any specific datatype
                      */,
            src_buffer_desc,
            0,
            0,
            frag_len,
            src_buffer_desc->data_addr);
        /* initialize first coll */
        ret = new_op->sequential_routine.seq_task_setup(new_op);
        if (OMPI_SUCCESS != ret) {
            ML_VERBOSE(3, ("Fragment failed to initialize itself"));
            return ret;
        }

        new_op->variable_fn_params.buffer_size = frag_len;
        new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor;
        new_op->variable_fn_params.root = 0;

        MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op);

        /* append this collective !! */
        OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
        opal_list_append(&mca_coll_ml_component.sequential_collectives,
                         (opal_list_item_t *)new_op);
        OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
    }

    return OMPI_SUCCESS;
}
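The ternary in the contiguous branch above simply clamps the next fragment to whatever is left of the message. Factored into a helper (the name ml_frag_len is hypothetical, introduced here only for illustration), the intent is easier to see:

#include <stddef.h>

/* min(remaining, fragment_size): never schedule past the end of the
 * message and never exceed the configured fragment size. */
static inline size_t ml_frag_len(size_t n_bytes_total,
                                 size_t n_bytes_scheduled,
                                 size_t fragment_size)
{
    size_t remaining = n_bytes_total - n_bytes_scheduled;
    return (remaining < fragment_size) ? remaining : fragment_size;
}

With such a helper the branch collapses to a single assignment, frag_len = ml_frag_len(total, scheduled, fragment_size), and the same value then bounds the memcpy that follows it.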
Example #3
static int mca_coll_ml_reduce_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op)
{
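    /* Reduce fragment progress: launch additional fragments of a pending
     * message, copying whole datatype elements into each ML buffer so the
     * reduction always operates on complete values. */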
    /* local variables */
    void *buf;

    size_t dt_size;
    int ret, frag_len, count;

    ptrdiff_t lb, extent;

    mca_bcol_base_payload_buffer_desc_t *src_buffer_desc;
    mca_coll_ml_collective_operation_progress_t *new_op;

    mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op);

    ret = ompi_datatype_get_extent(coll_op->variable_fn_params.dtype, &lb, &extent);
    if (ret < 0) {
        return OMPI_ERROR;
    }

    dt_size = (size_t) extent;

    /* Keep the pipeline filled with fragments */
    while (coll_op->fragment_data.message_descriptor->n_active <
            coll_op->fragment_data.message_descriptor->pipeline_depth) {
        /* If an active fragment happens to have completed the collective
         * during a hop into the progress engine, don't launch a new
         * fragment; break and return instead.
         */
        if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled
                == coll_op->fragment_data.message_descriptor->n_bytes_total) {
            break;
        }

        /* Get an ml buffer */
        src_buffer_desc = mca_coll_ml_alloc_buffer(OP_ML_MODULE(coll_op));
        if (NULL == src_buffer_desc) {
            /* No buffers are available. If there are outstanding
             * fragments, break out and let an active fragment
             * handle this later.
             */
            if (0 < coll_op->fragment_data.message_descriptor->n_active) {
                return OMPI_SUCCESS;
            } else {
                /* It is useless to call progress from here: ML progress
                 * cannot be executed, so the ML memsync call will not
                 * complete and no memory will be recycled. We put the
                 * element on the list and progress it later, once
                 * memsync recycles some memory. */

                /* The fragment is already on the list and we still
                 * have no ML resources.
                 * Return busy */
                if (coll_op->pending & REQ_OUT_OF_MEMORY) {
                    ML_VERBOSE(10,("Out of resources %p", coll_op));
                    return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
                }

                coll_op->pending |= REQ_OUT_OF_MEMORY;
                opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list),
                                 (opal_list_item_t *)coll_op);
                ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op));
                return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
            }
        }

        /* Get a new collective descriptor and initialize it */
        new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
                  ml_module->coll_ml_reduce_functions[ML_SMALL_DATA_REDUCE],
                  coll_op->fragment_data.message_descriptor->src_user_addr,
                  coll_op->fragment_data.message_descriptor->dest_user_addr,
                  coll_op->fragment_data.message_descriptor->n_bytes_total,
                  coll_op->fragment_data.message_descriptor->n_bytes_scheduled);

        ML_VERBOSE(1,(" In Reduce fragment progress %d %d ",
                      coll_op->fragment_data.message_descriptor->n_bytes_total,
                      coll_op->fragment_data.message_descriptor->n_bytes_scheduled));
        MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op,
                                              src_buffer_desc->buffer_index, src_buffer_desc);

        new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op;
        new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor;

        /* set the task setup callback  */
        new_op->sequential_routine.seq_task_setup = mca_coll_ml_reduce_task_setup;
        /* We need this address for pointer arithmetic in memcpy */
        buf = (void*)coll_op->fragment_data.message_descriptor->src_user_addr;
        /* calculate the number of whole datatype elements in this fragment:
         * send only the remaining tail when it is smaller than a quarter of
         * the small-message threshold, otherwise a full fragment's count */
        count = (coll_op->fragment_data.message_descriptor->n_bytes_total -
                 coll_op->fragment_data.message_descriptor->n_bytes_scheduled <
                 ((size_t) OP_ML_MODULE(coll_op)->small_message_thresholds[BCOL_REDUCE] / 4) ?
                 (coll_op->fragment_data.message_descriptor->n_bytes_total -
                  coll_op->fragment_data.message_descriptor->n_bytes_scheduled) / dt_size :
                 (size_t) coll_op->variable_fn_params.count);

        /* calculate the fragment length */
        frag_len = count * dt_size;

        ret = ompi_datatype_copy_content_same_ddt(coll_op->variable_fn_params.dtype, count,
                (char *) src_buffer_desc->data_addr, (char *) ((uintptr_t) buf + (uintptr_t)
                        coll_op->fragment_data.message_descriptor->n_bytes_scheduled));
        if (ret < 0) {
            return OMPI_ERROR;
        }

        /* if we are the root, unpack the data */
        if (ompi_comm_rank(ml_module->comm) == coll_op->global_root) {
            new_op->process_fn = mca_coll_ml_reduce_unpack;
            new_op->variable_fn_params.root_flag = true;
        } else {
            new_op->process_fn = NULL;
            new_op->variable_fn_params.root_flag = false;
        }

        new_op->variable_fn_params.root_route = coll_op->variable_fn_params.root_route;

        /* Setup fragment specific data */
        new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len;
        new_op->fragment_data.buffer_desc = src_buffer_desc;
        new_op->fragment_data.fragment_size = frag_len;
        (new_op->fragment_data.message_descriptor->n_active)++;

        /* Set in Reduce Buffer arguments */
        ML_SET_VARIABLE_PARAMS_BCAST(new_op, OP_ML_MODULE(new_op), count,
                                     coll_op->variable_fn_params.dtype, src_buffer_desc,
                                     0, (ml_module->payload_block->size_buffer -
                                         ml_module->data_offset)/2, frag_len,
                                     src_buffer_desc->data_addr);

        new_op->variable_fn_params.buffer_size = frag_len;
        new_op->variable_fn_params.sbuf = src_buffer_desc->data_addr;
        new_op->variable_fn_params.rbuf = src_buffer_desc->data_addr;
        new_op->variable_fn_params.root = coll_op->variable_fn_params.root;
        new_op->global_root = coll_op->global_root;
        new_op->variable_fn_params.op = coll_op->variable_fn_params.op;
        new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor;
        new_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING;
        MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op);

        ML_VERBOSE(10,("FFFF Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d %d\n",
                       new_op->variable_fn_params.buffer_size,
                       new_op->fragment_data.fragment_size,
                       new_op->fragment_data.message_descriptor->n_bytes_scheduled));
        /* initialize first coll; check the result, as the allgather
         * path above does */
        ret = new_op->sequential_routine.seq_task_setup(new_op);
        if (OMPI_SUCCESS != ret) {
            ML_VERBOSE(3, ("Fragment failed to initialize itself"));
            return ret;
        }

        /* append this collective !! */
        OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex));
        opal_list_append(&mca_coll_ml_component.sequential_collectives,
                         (opal_list_item_t *)new_op);
        OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex));

    }

    return OMPI_SUCCESS;
}
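The count calculation above is the reduce-specific variant of the same clamping idea: because the reduction operator consumes whole datatype elements, each fragment boundary must fall on a multiple of dt_size (the allgather path, by contrast, fragments freely in MPI_BYTE units). Below is a sketch of that element-aligned computation, using a hypothetical helper name and the threshold/4 budget taken from the code above:

#include <stddef.h>

/* Sketch: choose how many whole datatype elements go into the next
 * reduce fragment. If the remaining tail of the message is smaller
 * than the per-fragment budget, send exactly the elements that are
 * left; otherwise send a full fragment's count. The resulting
 * count * dt_size is always element-aligned, so the reduction never
 * operates on a partial value. */
static inline size_t reduce_frag_count(size_t n_bytes_total,
                                       size_t n_bytes_scheduled,
                                       size_t frag_budget,   /* e.g. threshold / 4 */
                                       size_t full_count,
                                       size_t dt_size)
{
    size_t remaining = n_bytes_total - n_bytes_scheduled;
    return (remaining < frag_budget) ? remaining / dt_size : full_count;
}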