Exemplo n.º 1
0
/* self communication optimizations */
static inline int ompi_osc_rdma_put_self (void *source, int source_count, ompi_datatype_t *source_datatype,
                                          OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype,
                                          ompi_osc_rdma_module_t *module, ompi_osc_rdma_request_t *request)
{
    void *target = (unsigned char*) module->baseptr +
        ((unsigned long) target_disp * module->disp_unit);
    int ret;

    /* if we are in active target mode wait until all post messages arrive */
    if (module->sc_group && !module->active_eager_send_active) {
        OPAL_THREAD_LOCK(&module->lock);
        while (0 != module->num_post_msgs) {
            opal_condition_wait(&module->cond, &module->lock);
        }
        OPAL_THREAD_UNLOCK(&module->lock);
    }

    if (!(module->passive_target_access_epoch || module->active_eager_send_active)) {
        return OMPI_ERR_RMA_SYNC;
    }

    ret = ompi_datatype_sndrcv (source, source_count, source_datatype,
                                target, target_count, target_datatype);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    if (request) {
        ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
    }

    return OMPI_SUCCESS;
}
Exemplo n.º 2
0
static inline int ompi_osc_rdma_gacc_self (void *source, int source_count, ompi_datatype_t *source_datatype,
                                           void *result, int result_count, ompi_datatype_t *result_datatype,
                                           OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype,
                                           ompi_op_t *op, ompi_osc_rdma_module_t *module, ompi_osc_rdma_request_t *request)
{
    void *target = (unsigned char*) module->baseptr +
        ((unsigned long) target_disp * module->disp_unit);
    int ret;

    /* if we are in active target mode wait until all post messages arrive */
    if (module->sc_group && !module->active_eager_send_active) {
        OPAL_THREAD_LOCK(&module->lock);
        while (0 != module->num_post_msgs) {
            opal_condition_wait(&module->cond, &module->lock);
        }
        OPAL_THREAD_UNLOCK(&module->lock);
    }

    if (!(module->passive_target_access_epoch || module->active_eager_send_active)) {
        return OMPI_ERR_RMA_SYNC;
    }

    ompi_osc_rdma_accumulate_lock (module);

    do {
        ret = ompi_datatype_sndrcv (target, target_count, target_datatype,
                                    result, result_count, result_datatype);

        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output,
                                 "ompi_osc_rdma_gacc_self: failed copying to the target buffer. ret = %d", ret));
            break;
        }

        if (&ompi_mpi_op_no_op.op != op) {
            if (&ompi_mpi_op_replace.op != op) {
                ret = ompi_osc_base_sndrcv_op (source, source_count, source_datatype, target, target_count, target_datatype, op);
            } else {
                ret = ompi_datatype_sndrcv (source, source_count, source_datatype, target, target_count, target_datatype);
            }
        }

        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output,
                                 "ompi_osc_rdma_gacc_self: failed performing accumulate operation. ret = %d", ret));
            break;
        }
    } while (0);

    ompi_osc_rdma_accumulate_unlock (module);

    if (request) {
        /* NTH: is it ok to use an ompi error code here? */
        ompi_osc_rdma_request_complete (request, ret);
    }

    return OMPI_SUCCESS;
}
Exemplo n.º 3
0
static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count, ompi_datatype_t *source_datatype,
                                     void *result_buffer, int result_count, ompi_datatype_t *result_datatype,
                                     ompi_osc_rdma_peer_t *peer, uint64_t target_address,
                                     mca_btl_base_registration_handle_t *target_handle, int target_count,
                                     ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_module_t *module,
                                     ompi_osc_rdma_request_t *request)
{
    int ret = OMPI_SUCCESS;

    do {
        if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
            (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
        }

        if (NULL != result_buffer) {
            /* get accumulate */

            ret = ompi_datatype_sndrcv ((void *) (intptr_t) target_address, target_count, target_datatype,
                                        result_buffer, result_count, result_datatype);

            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                break;
            }
        }

        if (&ompi_mpi_op_no_op.op != op) {
            if (&ompi_mpi_op_replace.op != op) {
                ret = ompi_osc_base_sndrcv_op (source_buffer, source_count, source_datatype, (void *) (intptr_t) target_address,
                                               target_count, target_datatype, op);
            } else {
                ret = ompi_datatype_sndrcv (source_buffer, source_count, source_datatype, (void *) (intptr_t) target_address,
                                            target_count, target_datatype);
            }
        }

        if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
            (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
        }
    } while (0);

    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_gacc_self: failed performing accumulate operation. ret = %d", ret));
        return ret;
    }

    if (request) {
        /* NTH: is it ok to use an ompi error code here? */
        ompi_osc_rdma_request_complete (request, ret);
    }

    return ret;
}
Exemplo n.º 4
0
/**
 * ompi_osc_rdma_cas_get_complete:
 * Note: This function will not work as is in a heterogeneous environment.
 */
static void ompi_osc_rdma_cas_get_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                            void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                            void *context, void *data, int status)
{
    ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) context;
    ompi_osc_rdma_sync_t *sync = request->sync;
    ompi_osc_rdma_module_t *module = sync->module;
    intptr_t source = (intptr_t) local_address + request->offset;
    ompi_osc_rdma_frag_t *frag = request->frag;
    ompi_osc_rdma_peer_t *peer = request->peer;
    int ret;

    if (OMPI_SUCCESS == status) {
        /* copy data to the user buffer (for gacc) */
        memcpy (request->result_addr, (void *) source, request->len);
        memcpy ((void *) source, request->origin_addr, request->len);

        if (0 == memcmp ((void *) source, request->compare_addr, request->len)) {
            /* the target and compare buffers match so write the source to the target */
            ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, local_address,
                                                 request->target_address, local_handle,
                                                 (mca_btl_base_registration_handle_t *) request->ctx,
                                                 request->len, 0, MCA_BTL_NO_ORDER,
                                                 ompi_osc_rdma_acc_put_complete, request, NULL);
            if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
                OPAL_OUTPUT_VERBOSE((1, ompi_osc_base_framework.framework_output, "could not start put to complete accumulate "
                                     "operation. opal return code: %d", ret));
            }

            /* TODO -- we can do better. probably should queue up the next step and handle it in progress */
            assert (OPAL_SUCCESS == ret);
        } else {
            /* this is a no-op. nothing more to do except release the accumulate lock */
            ompi_osc_rdma_frag_complete (frag);

            if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
                (void) ompi_osc_rdma_lock_release_exclusive (module, request->peer,
                                                             offsetof (ompi_osc_rdma_state_t, accumulate_lock));
            }

            /* the request is now complete and the outstanding rdma operation is complete */
            ompi_osc_rdma_request_complete (request, status);

            ompi_osc_rdma_sync_rdma_dec (sync);
            peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;
        }
    }
}
Exemplo n.º 5
0
/* completion of an accumulate put */
static void ompi_osc_rdma_acc_put_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                            void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                            void *context, void *data, int status)
{
    ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) context;
    ompi_osc_rdma_sync_t *sync = request->sync;
    ompi_osc_rdma_peer_t *peer = request->peer;

    ompi_osc_rdma_frag_complete (request->frag);
    ompi_osc_rdma_request_complete (request, status);

    if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
        (void) ompi_osc_rdma_lock_release_exclusive (sync->module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
    }

    ompi_osc_rdma_sync_rdma_dec (sync);
    peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;
}
Exemplo n.º 6
0
int ompi_osc_rdma_raccumulate(void *origin_addr, int origin_count,
                              struct ompi_datatype_t *origin_dt, int target,
                              OPAL_PTRDIFF_TYPE target_disp, int target_count,
                              struct ompi_datatype_t *target_dt, struct ompi_op_t *op,
                              struct ompi_win_t *win, struct ompi_request_t **request)
{
    ompi_osc_rdma_request_t *rdma_request;
    int ret;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "raccumulate: 0x%lx, %d, %s, %d, %d, %d, %s, %s, %s",
                         (unsigned long) origin_addr, origin_count,
                         origin_dt->name, target, (int) target_disp,
                         target_count, target_dt->name, op->o_name,
                         win->w_name));

    OMPI_OSC_RDMA_REQUEST_ALLOC(win, rdma_request);
    if (NULL == rdma_request) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* short-circuit case */
    if (0 == origin_count || 0 == target_count) {
        ompi_osc_rdma_request_complete (rdma_request, MPI_SUCCESS);
        *request = (ompi_request_t *) rdma_request;
        return OMPI_SUCCESS;
    }

    rdma_request->type = OMPI_OSC_RDMA_HDR_TYPE_ACC;

    ret = ompi_osc_rdma_accumulate_w_req (origin_addr, origin_count, origin_dt, target,
                                          target_disp, target_count, target_dt, op, win,
                                          rdma_request);
    if (OMPI_SUCCESS != ret) {
        OMPI_OSC_RDMA_REQUEST_RETURN(rdma_request);
        return ret;
    }

    *request = (ompi_request_t *) rdma_request;

    return OMPI_SUCCESS;
}
Exemplo n.º 7
0
static inline
int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const void *origin_addr, int origin_count,
                                            struct ompi_datatype_t *origin_datatype, void *result_addr, int result_count,
                                            struct ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer,
                                            int target_rank, MPI_Aint target_disp, int target_count,
                                            struct ompi_datatype_t *target_datatype, struct ompi_op_t *op,
                                            ompi_osc_rdma_request_t *request)
{
    ompi_osc_rdma_module_t *module = sync->module;
    mca_btl_base_registration_handle_t *target_handle;
    uint64_t target_address;
    int ret;

    /* short-circuit case. note that origin_count may be 0 if op is MPI_NO_OP */
    if ((result_addr && 0 == result_count) || 0 == target_count) {
        if (request) {
            ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
        }

        return OMPI_SUCCESS;
    }

    ret = osc_rdma_get_remote_segment (module, peer, target_disp, target_datatype->super.size * target_count,
                                       &target_address, &target_handle);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    if (ompi_osc_rdma_peer_local_base (peer)) {
        /* local/self optimization */
        return ompi_osc_rdma_gacc_local (origin_addr, origin_count, origin_datatype, result_addr, result_count,
                                         result_datatype, peer, target_address, target_handle, target_count,
                                         target_datatype, op, module, request);
    }

    return ompi_osc_rdma_gacc_master (sync, origin_addr, origin_count, origin_datatype, result_addr, result_count,
                                      result_datatype, peer, target_address, target_handle, target_count,
                                      target_datatype, op, request);
}
Exemplo n.º 8
0
/* progress an OSC request */
static int ompi_osc_rdma_req_comm_complete (ompi_request_t *request)
{
    ompi_osc_rdma_request_t *rdma_request = (ompi_osc_rdma_request_t *) request->req_complete_cb_data;
    ompi_osc_rdma_module_t *module = rdma_request->module;

    OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_req_comm_complete called tag = %d",
                         request->req_status.MPI_TAG));

    mark_outgoing_completion (module);

    OPAL_THREAD_LOCK(&ompi_request_lock);
    if (0 == --rdma_request->outstanding_requests) {
        ompi_osc_rdma_request_complete (rdma_request, request->req_status.MPI_ERROR);
    }
    OPAL_THREAD_UNLOCK(&ompi_request_lock);

    /* put this request on the garbage colletion list */
    osc_rdma_gc_add_request (request);

    return OMPI_SUCCESS;
}
Exemplo n.º 9
0
static inline int ompi_osc_rdma_rget_internal (void *origin_addr, int origin_count,
                                               struct ompi_datatype_t *origin_dt,
                                               int target,
                                               OPAL_PTRDIFF_TYPE target_disp,
                                               int target_count,
                                               struct ompi_datatype_t *target_dt,
                                               struct ompi_win_t *win, bool release_req,
                                               struct ompi_request_t **request)
{
    int ret, tag;
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    bool is_long_datatype = false;
    ompi_osc_rdma_frag_t *frag;
    ompi_osc_rdma_header_get_t *header;
    size_t ddt_len, frag_len;
    char *ptr;
    const void *packed_ddt;
    ompi_osc_rdma_request_t *rdma_request;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "get: 0x%lx, %d, %s, %d, %d, %d, %s, %s",
                         (unsigned long) origin_addr, origin_count,
                         origin_dt->name, target, (int) target_disp,
                         target_count, target_dt->name, win->w_name));

    if (!ompi_osc_rdma_check_access_epoch (module, target)) {
        return OMPI_ERR_RMA_SYNC;
    }

    /* gets are always request based, so that we know where to land the data */
    OMPI_OSC_RDMA_REQUEST_ALLOC(win, rdma_request);
    if (NULL == rdma_request) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    rdma_request->internal = release_req;

    /* short-circuit case */
    if (0 == origin_count || 0 == target_count) {
        ompi_osc_rdma_request_complete (rdma_request, MPI_SUCCESS);
        *request = &rdma_request->super;
        return OMPI_SUCCESS;
    }

    /* optimize self communication. TODO: optimize local communication */
    if (ompi_comm_rank (module->comm) == target) {
        *request = &rdma_request->super;
        return ompi_osc_rdma_get_self (origin_addr, origin_count, origin_dt,
                                       target_disp, target_count, target_dt,
                                       module, rdma_request);
    }

    rdma_request->type = OMPI_OSC_RDMA_HDR_TYPE_GET;
    rdma_request->origin_addr = origin_addr;
    rdma_request->origin_count = origin_count;
    OBJ_RETAIN(origin_dt);
    rdma_request->origin_dt = origin_dt;

    /* Compute datatype length.  Note that the datatype description
     * must fit in a single frag */
    ddt_len = ompi_datatype_pack_description_length(target_dt);

    OPAL_THREAD_LOCK(&module->lock);

    frag_len = sizeof(ompi_osc_rdma_header_get_t) + ddt_len;
    ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr);
    if (OMPI_SUCCESS != ret) {
        /* allocate space for the header plus space to store ddt_len */
        frag_len = sizeof(ompi_osc_rdma_header_put_t) + 8;
        ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            OPAL_THREAD_UNLOCK(&module->lock);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        is_long_datatype = true;
    }

    tag = get_tag (module);

    /* for bookkeeping the get is "outgoing" */
    ompi_osc_signal_outgoing (module, target, 1);

    /* flush will be called at the end of this function. make sure the post message has
     * arrived. */
    if (!release_req && module->sc_group) {
        while (0 != module->num_post_msgs) {
            OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                                 "waiting for post messages. num_post_msgs = %d", module->num_post_msgs));
            opal_condition_wait(&module->cond, &module->lock);
        }
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    header = (ompi_osc_rdma_header_get_t*) ptr;
    header->base.type = OMPI_OSC_RDMA_HDR_TYPE_GET;
    header->base.flags = 0;
    header->len = frag_len;
    header->count = target_count;
    header->displacement = target_disp;
    header->tag = tag;
    ptr += sizeof(ompi_osc_rdma_header_get_t);

    do {
        ret = ompi_datatype_get_pack_description(target_dt, &packed_ddt);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            break;
        }

        if (is_long_datatype) {
            /* the datatype does not fit in an eager message. send it seperately */
            header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_LARGE_DATATYPE;

            OBJ_RETAIN(target_dt);

            ret = ompi_osc_rdma_isend_w_cb ((void *) packed_ddt, ddt_len, MPI_BYTE, target,
                                            tag, module->comm, ompi_osc_rdma_dt_send_complete,
                                            target_dt);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                break;
            }

            *((uint64_t *) ptr) = ddt_len;
            ptr += 8;
        } else {
            memcpy((unsigned char*) ptr, packed_ddt, ddt_len);
            ptr += ddt_len;
        }

        /* TODO -- store the request somewhere so we can cancel it on error */
        rdma_request->outstanding_requests = 1;
        ret = ompi_osc_rdma_irecv_w_cb (origin_addr, origin_count, origin_dt, target, tag,
                                        module->comm, NULL, ompi_osc_rdma_req_comm_complete, rdma_request);
    } while (0);

    if (OMPI_SUCCESS == ret) {
        header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_VALID;
        *request = &rdma_request->super;
    }

    OPAL_THREAD_LOCK(&module->lock);
    ret = ompi_osc_rdma_frag_finish(module, frag);

    if (!release_req) {
        /* need to flush now in case the caller decides to wait on the request */
        ompi_osc_rdma_frag_flush_target (module, target);
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
Exemplo n.º 10
0
static int
ompi_osc_rdma_accumulate_w_req (void *origin_addr, int origin_count,
                                struct ompi_datatype_t *origin_dt,
                                int target, OPAL_PTRDIFF_TYPE target_disp,
                                int target_count,
                                struct ompi_datatype_t *target_dt,
                                struct ompi_op_t *op, ompi_win_t *win,
                                ompi_osc_rdma_request_t *request)
{
    int ret;
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, target);
    bool is_long_datatype = false;
    bool is_long_msg = false;
    ompi_osc_rdma_frag_t *frag;
    ompi_osc_rdma_header_acc_t *header;
    size_t ddt_len, payload_len, frag_len;
    char *ptr;
    const void *packed_ddt;
    int tag = -1;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "acc: 0x%lx, %d, %s, %d, %d, %d, %s, %s, %s",
                         (unsigned long) origin_addr, origin_count,
                         origin_dt->name, target, (int) target_disp,
                         target_count, target_dt->name, op->o_name,
                         win->w_name));

    if (!ompi_osc_rdma_check_access_epoch (module, target)) {
        return OMPI_ERR_RMA_SYNC;
    }

    /* short-circuit case */
    if (0 == origin_count || 0 == target_count) {
        if (request) {
            ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
        }

        return OMPI_SUCCESS;
    }

    /* optimize the self case. TODO: optimize the local case */
    if (ompi_comm_rank (module->comm) == target) {
        return ompi_osc_rdma_acc_self (origin_addr, origin_count, origin_dt,
                                       target_disp, target_count, target_dt,
                                       op, module, request);
    }

    /* Compute datatype and payload lengths.  Note that the datatype description
     * must fit in a single frag */
    ddt_len = ompi_datatype_pack_description_length(target_dt);
    payload_len = origin_dt->super.size * origin_count;

    OPAL_THREAD_LOCK(&module->lock);

    frag_len = sizeof(*header) + ddt_len + payload_len;
    ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr);
    if (OMPI_SUCCESS != ret) {
        frag_len = sizeof(*header) + ddt_len;
        ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr);
        if (OMPI_SUCCESS != ret) {
            /* allocate space for the header plus space to store ddt_len */
            frag_len = sizeof(*header) + 8;
            ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                OPAL_THREAD_UNLOCK(&module->lock);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }

            is_long_datatype = true;
         }

        is_long_msg = true;
        tag = get_tag (module);
    }

    /* flush will be called at the end of this function. make sure the post message has
     * arrived. */
    if ((is_long_msg || request) && module->sc_group) {
        while (0 != module->num_post_msgs) {
            OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                                 "waiting for post messages. num_post_msgs = %d", module->num_post_msgs));
            opal_condition_wait(&module->cond, &module->lock);
        }
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    header = (ompi_osc_rdma_header_acc_t*) ptr;
    header->base.flags = 0;
    header->len = frag_len;
    header->count = target_count;
    header->displacement = target_disp;
    header->op = op->o_f_to_c_index;
    ptr += sizeof (*header);

    do {
        ret = ompi_datatype_get_pack_description(target_dt, &packed_ddt);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            break;
        }

        if (is_long_datatype) {
            /* the datatype does not fit in an eager message. send it seperately */
            header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_LARGE_DATATYPE;

            OBJ_RETAIN(target_dt);

            ret = ompi_osc_rdma_isend_w_cb ((void *) packed_ddt, ddt_len, MPI_BYTE, target,
                                            tag, module->comm, ompi_osc_rdma_dt_send_complete,
                                            target_dt);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                break;
            }

            *((uint64_t *) ptr) = ddt_len;
            ptr += 8;
        } else {
            memcpy((unsigned char*) ptr, packed_ddt, ddt_len);
            ptr += ddt_len;
        }

        if (!is_long_msg) {
            header->base.type = OMPI_OSC_RDMA_HDR_TYPE_ACC;

            osc_rdma_copy_for_send (ptr, payload_len, origin_addr, proc,
                                    origin_count, origin_dt);

            /* the user's buffer is no longer needed so mark the request as
             * complete. */
            if (request) {
                ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
            }
        } else {
            header->base.type = OMPI_OSC_RDMA_HDR_TYPE_ACC_LONG;

            header->tag = tag;

            OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                                 "acc: starting long accumulate with tag %d", tag));

            /* increment the outgoing send count */
            ompi_osc_signal_outgoing (module, target, 1);

            if (request) {
                request->outstanding_requests = 1;
                ret = ompi_osc_rdma_isend_w_cb (origin_addr, origin_count, origin_dt,
                                                target, tag, module->comm, ompi_osc_rdma_req_comm_complete,
                                                request);
            } else {
                ret = ompi_osc_rdma_component_isend (module, origin_addr, origin_count, origin_dt, target, tag,
                                                     module->comm);
            }
        }
    } while (0);

    if (OMPI_SUCCESS != ret) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "acc: failed with eror %d", ret));
    } else {
        /* mark the fragment as valid */
        header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_VALID;
    }

    OPAL_THREAD_LOCK(&module->lock);

    ret = ompi_osc_rdma_frag_finish(module, frag);

    if (is_long_msg || request) {
        /* need to flush now in case the caller decides to wait on the request */
        ompi_osc_rdma_frag_flush_target (module, target);
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
Exemplo n.º 11
0
static inline
int ompi_osc_rdma_rget_accumulate_internal (void *origin_addr, int origin_count,
                                            struct ompi_datatype_t *origin_datatype,
                                            void *result_addr, int result_count,
                                            struct ompi_datatype_t *result_datatype,
                                            int target_rank, MPI_Aint target_disp,
                                            int target_count, struct ompi_datatype_t *target_datatype,
                                            struct ompi_op_t *op, struct ompi_win_t *win,
                                            bool release_req, struct ompi_request_t **request)
{
    int ret;
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, target_rank);
    bool is_long_datatype = false;
    bool is_long_msg = false;
    ompi_osc_rdma_frag_t *frag;
    ompi_osc_rdma_header_acc_t *header;
    size_t ddt_len, payload_len, frag_len;
    char *ptr;
    const void *packed_ddt;
    int tag;
    ompi_osc_rdma_request_t *rdma_request;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "rget_acc: 0x%lx, %d, %s, 0x%lx, %d, %s, 0x%x, %d, %d, %s, %s, %s",
                         (unsigned long) origin_addr, origin_count, origin_datatype->name,
                         (unsigned long) result_addr, result_count, result_datatype->name,
                         target_rank, (int) target_disp, target_count, target_datatype->name,
                         op->o_name, win->w_name));

    if (!ompi_osc_rdma_check_access_epoch (module, target_rank)) {
        return OMPI_ERR_RMA_SYNC;
    }

    /* get_accumulates are always request based, so that we know where to land the data */
    OMPI_OSC_RDMA_REQUEST_ALLOC(win, rdma_request);
    if (OPAL_UNLIKELY(NULL == rdma_request)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    rdma_request->internal = release_req;

    /* short-circuit case. note that origin_count may be 0 if op is MPI_NO_OP */
    if (0 == result_count || 0 == target_count) {
        ompi_osc_rdma_request_complete (rdma_request, MPI_SUCCESS);
        *request = &rdma_request->super;
        return OMPI_SUCCESS;
    }

    /* optimize the self case. TODO: optimize the local case */
    if (ompi_comm_rank (module->comm) == target_rank) {
        *request = &rdma_request->super;
        return ompi_osc_rdma_gacc_self (origin_addr, origin_count, origin_datatype,
                                        result_addr, result_count, result_datatype,
                                        target_disp, target_count, target_datatype,
                                        op, module, rdma_request);
    }

    rdma_request->type = OMPI_OSC_RDMA_HDR_TYPE_GET_ACC;
    rdma_request->origin_addr = origin_addr;
    rdma_request->origin_count = origin_count;
    OBJ_RETAIN(origin_datatype);
    rdma_request->origin_dt = origin_datatype;

    /* Compute datatype and payload lengths.  Note that the datatype description
     * must fit in a single frag */
    ddt_len = ompi_datatype_pack_description_length(target_datatype);

    if (&ompi_mpi_op_no_op.op != op) {
        payload_len = origin_datatype->super.size * origin_count;
    } else {
        payload_len = 0;
    }

    OPAL_THREAD_LOCK(&module->lock);

    frag_len = sizeof(*header) + ddt_len + payload_len;
    ret = ompi_osc_rdma_frag_alloc(module, target_rank, frag_len, &frag, &ptr);
    if (OMPI_SUCCESS != ret) {
        frag_len = sizeof(*header) + ddt_len;
        ret = ompi_osc_rdma_frag_alloc(module, target_rank, frag_len, &frag, &ptr);
        if (OMPI_SUCCESS != ret) {
            /* allocate space for the header plus space to store ddt_len */
            frag_len = sizeof(*header) + 8;
            ret = ompi_osc_rdma_frag_alloc(module, target_rank, frag_len, &frag, &ptr);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                OPAL_THREAD_UNLOCK(&module->lock);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }

            is_long_datatype = true;
        }

        is_long_msg = true;
    }

    tag = get_tag (module);

    /* If this is a long message then we need two completions before the
     * request is complete (1 for the send, 1 for the receive) */
    rdma_request->outstanding_requests = 1 + is_long_msg;

    /* increment the number of outgoing fragments */
    ompi_osc_signal_outgoing (module, target_rank, rdma_request->outstanding_requests);

    /* flush will be called at the end of this function. make sure the post message has
     * arrived. */
    if (!release_req && module->sc_group) {
        while (0 != module->num_post_msgs) {
            OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                                 "waiting for post messages. num_post_msgs = %d", module->num_post_msgs));
            opal_condition_wait(&module->cond, &module->lock);
        }
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    header = (ompi_osc_rdma_header_acc_t *) ptr;
    header->base.flags = 0;
    header->len = frag_len;
    header->count = target_count;
    header->displacement = target_disp;
    header->op = op->o_f_to_c_index;
    header->tag = tag;
    ptr = (char *)(header + 1);

    do {
        ret = ompi_datatype_get_pack_description(target_datatype, &packed_ddt);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            break;
        }

        if (is_long_datatype) {
            /* the datatype does not fit in an eager message. send it seperately */
            header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_LARGE_DATATYPE;

            OBJ_RETAIN(target_datatype);

            ret = ompi_osc_rdma_isend_w_cb ((void *) packed_ddt, ddt_len, MPI_BYTE, target_rank,
                                            tag, module->comm, ompi_osc_rdma_dt_send_complete,
                                            target_datatype);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                break;
            }

            *((uint64_t *) ptr) = ddt_len;
            ptr += 8;
        } else {
            memcpy((unsigned char*) ptr, packed_ddt, ddt_len);
            ptr += ddt_len;
        }

        ret = ompi_osc_rdma_irecv_w_cb (result_addr, result_count, result_datatype, target_rank, tag,
                                        module->comm, NULL, ompi_osc_rdma_req_comm_complete, rdma_request);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            break;
        }

        if (!is_long_msg) {
            header->base.type = OMPI_OSC_RDMA_HDR_TYPE_GET_ACC;

            if (&ompi_mpi_op_no_op.op != op) {
                osc_rdma_copy_for_send (ptr, payload_len, origin_addr, proc, origin_count,
                                        origin_datatype);
            }
        } else {
            header->base.type = OMPI_OSC_RDMA_HDR_TYPE_GET_ACC_LONG;

            ret = ompi_osc_rdma_isend_w_cb (origin_addr, origin_count, origin_datatype, target_rank,
                                            tag, module->comm, ompi_osc_rdma_req_comm_complete, rdma_request);
        }
    } while (0);

    if (OMPI_SUCCESS == ret) {
        header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_VALID;
        *request = (ompi_request_t *) rdma_request;
    }

    OPAL_THREAD_LOCK(&module->lock);
    ret = ompi_osc_rdma_frag_finish(module, frag);

    if (!release_req) {
        /* need to flush now in case the caller decides to wait on the request */
        ompi_osc_rdma_frag_flush_target (module, target_rank);
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}