示例#1
0
/**
 * Blocking receive of an already-matched message.
 *
 * Builds a stack-allocated MXM receive request for @p buf, hands it to
 * mxm_message_recv() together with the MXM message stored in
 * (*message)->req_ptr, releases the message handle and then waits for the
 * request to finish.
 *
 * @param buf       destination buffer
 * @param count     number of @p datatype elements to receive
 * @param datatype  datatype of the elements
 * @param message   message handle; released via PML_YALLA_MESSAGE_RELEASE
 *                  once the receive has been posted
 * @param status    receive status, filled by PML_YALLA_SET_RECV_STATUS
 *
 * @return OMPI_SUCCESS, or OMPI_ERROR when mxm_message_recv() fails (the
 *         message handle is not released in that case).
 */
int mca_pml_yalla_mrecv(void *buf, size_t count, ompi_datatype_t *datatype,
                        struct ompi_message_t **message,
                        ompi_status_public_t* status)
{
    mxm_recv_req_t rreq;

    /* source and tag are passed as -1 and 0 for a matched receive */
    PML_YALLA_INIT_MXM_RECV_REQ(&rreq, buf, count, datatype, -1, 0, (*message)->comm, recv);
    PML_YALLA_INIT_BLOCKING_MXM_RECV_REQ(&rreq);

    PML_YALLA_VERBOSE(8, "receive message *%p=%p dtype %s count %zu", (void *)message,
                      (void *)*message, datatype->name, count);

    if (mxm_message_recv(&rreq, (*message)->req_ptr) != MXM_OK) {
        return OMPI_ERROR;
    }

    /* the handle is no longer needed once the receive is posted */
    PML_YALLA_MESSAGE_RELEASE(message);

    PML_YALLA_WAIT_MXM_REQ(&rreq.base);
    PML_YALLA_VERBOSE(8, "receive completed with status %s source %d rtag %d(%d/0x%x) len %zu",
                      mxm_error_string(rreq.base.error),
                      rreq.completion.sender_imm, rreq.completion.sender_tag,
                      rreq.tag, rreq.tag_mask,
                      rreq.completion.actual_len);
    PML_YALLA_SET_RECV_STATUS(&rreq, rreq.completion.actual_len, status);

    return OMPI_SUCCESS;
}
示例#2
0
/**
 * Fetch the address of the MTL's MXM endpoint into a newly allocated buffer.
 *
 * The address length is discovered first by calling mxm_ep_get_address()
 * with a NULL buffer, which is expected to fail with
 * MXM_ERR_BUFFER_TOO_SMALL and report the required size. A buffer of that
 * size is then allocated and filled by a second call.
 *
 * @param address_p      out: on success, a malloc'ed buffer holding the
 *                       address; the caller owns it and must free() it.
 *                       Set to NULL on every failure path.
 * @param address_len_p  out: length of the address in bytes
 *
 * @return OMPI_SUCCESS on success, OMPI_ERR_OUT_OF_RESOURCE when the buffer
 *         cannot be allocated, OMPI_ERROR when MXM reports a failure.
 */
static int ompi_mtl_mxm_get_ep_address(void **address_p, size_t *address_len_p)
{
    mxm_error_t err;

    /* never hand an indeterminate pointer back to the caller */
    *address_p     = NULL;
    *address_len_p = 0;

    /* probe for the required length; "too small" is the expected outcome */
    err = mxm_ep_get_address(ompi_mtl_mxm.ep, NULL, address_len_p);
    if (err != MXM_ERR_BUFFER_TOO_SMALL) {
        MXM_ERROR("Failed to get ep address length");
        return OMPI_ERROR;
    }

    *address_p = malloc(*address_len_p);
    if (*address_p == NULL) {
        MXM_ERROR("Failed to allocate ep address buffer");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    err = mxm_ep_get_address(ompi_mtl_mxm.ep, *address_p, address_len_p);
    if (MXM_OK != err) {
        opal_show_help("help-mtl-mxm.txt", "unable to extract endpoint address",
                       true, mxm_error_string(err));
        /* the caller does not free on error, so release the buffer here */
        free(*address_p);
        *address_p = NULL;
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
示例#3
0
static void mca_pml_yalla_send_completion_cb(void *context)
{
    mca_pml_yalla_send_request_t* sreq = context;

    switch (sreq->mxm.base.error) {
    case MXM_OK:
        sreq->super.ompi.req_status.MPI_ERROR  = OMPI_SUCCESS;
        break;
    case MXM_ERR_CANCELED:
        sreq->super.ompi.req_status._cancelled = true;
        break;
    default:
        sreq->super.ompi.req_status.MPI_ERROR  = MPI_ERR_INTERN;
        break;
    }

    PML_YALLA_VERBOSE(8, "send request %p completed with status %s", (void *)sreq,
                   mxm_error_string(sreq->mxm.base.error));

    ompi_request_complete(&sreq->super.ompi, true);
    if (sreq->super.flags & MCA_PML_YALLA_REQUEST_FLAG_FREE_CALLED) {
        PML_YALLA_VERBOSE(7, "release request %p because free was already called", (void *)sreq);
        mca_pml_yalla_request_release(&sreq->super, &ompi_pml_yalla.send_reqs);
    }
}
示例#4
0
/**
 * Blocking receive for the yalla PML.
 *
 * Initializes a stack-allocated MXM receive request, posts it with
 * mxm_req_recv(), waits for completion, fills @p status and finally runs
 * PML_YALLA_FREE_BLOCKING_MXM_REQ on the request.
 *
 * @param buf       destination buffer
 * @param count     number of @p datatype elements
 * @param datatype  datatype of the elements
 * @param src       source rank to match
 * @param tag       tag to match
 * @param comm      communicator
 * @param status    receive status, filled by PML_YALLA_SET_RECV_STATUS
 *
 * @return OMPI_SUCCESS, or OMPI_ERROR when the receive cannot be posted.
 */
int mca_pml_yalla_recv(void *buf, size_t count, ompi_datatype_t *datatype, int src,
                       int tag, struct ompi_communicator_t* comm,
                       ompi_status_public_t* status)
{
    mxm_recv_req_t rreq;

    PML_YALLA_INIT_MXM_RECV_REQ(&rreq, buf, count, datatype, src, tag, comm, recv);
    PML_YALLA_INIT_BLOCKING_MXM_RECV_REQ(&rreq);

    PML_YALLA_VERBOSE(8, "receive from %d tag %d dtype %s count %zu", src, tag,
                      datatype->name, count);

    if (mxm_req_recv(&rreq) != MXM_OK) {
        return OMPI_ERROR;
    }

    /* block until MXM reports completion of the request */
    PML_YALLA_WAIT_MXM_REQ(&rreq.base);
    PML_YALLA_VERBOSE(8, "receive completed with status %s source %d rtag %d(%d/0x%x) len %zu",
                      mxm_error_string(rreq.base.error),
                      rreq.completion.sender_imm, rreq.completion.sender_tag,
                      rreq.tag, rreq.tag_mask,
                      rreq.completion.actual_len);

    PML_YALLA_SET_RECV_STATUS(&rreq, rreq.completion.actual_len, status);
    PML_YALLA_FREE_BLOCKING_MXM_REQ(&rreq.base);
    return OMPI_SUCCESS;
}
示例#5
0
/**
 * Post a non-blocking receive through MXM.
 *
 * The MXM receive request lives inside the MTL request. Its envelope is set
 * from (@p comm, @p src, @p tag), the remaining fields are prepared by
 * ompi_mtl_mxm_recv_init(), and the request is posted with mxm_req_recv().
 *
 * @param mtl          MTL module (not used here)
 * @param comm         communicator
 * @param src          source rank to match
 * @param tag          tag to match
 * @param convertor    convertor describing the receive buffer
 * @param mtl_request  MTL request; actually a mca_mtl_mxm_request_t
 *
 * @return OMPI_SUCCESS, the error from ompi_mtl_mxm_recv_init(), or
 *         OMPI_ERROR when posting fails.
 */
int ompi_mtl_mxm_irecv(struct mca_mtl_base_module_t* mtl,
                       struct ompi_communicator_t *comm, int src, int tag,
                       struct opal_convertor_t *convertor,
                       struct mca_mtl_request_t *mtl_request)
{
    mca_mtl_mxm_request_t *mxm_request = (mca_mtl_mxm_request_t *) mtl_request;
    mxm_recv_req_t *recv_req = &mxm_request->mxm.recv;
    mxm_error_t post_err;
    int init_rc;

    ompi_mtl_mxm_set_recv_envelope(recv_req, comm, src, tag);

    /* fill in the receive request embedded in the MTL request */
    init_rc = ompi_mtl_mxm_recv_init(mxm_request, convertor, recv_req);
    if (OPAL_UNLIKELY(init_rc != OMPI_SUCCESS)) {
        return init_rc;
    }

    /* hand the request over to MXM */
    post_err = mxm_req_recv(recv_req);
    if (OPAL_UNLIKELY(post_err != MXM_OK)) {
        orte_show_help("help-mtl-mxm.txt", "error posting receive", true,
                       mxm_error_string(post_err), mxm_request->buf, mxm_request->length);
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
示例#6
0
/**
 * Blocking get of @p size bytes.
 *
 * A zero-length request succeeds immediately. Otherwise
 * mca_spml_ikrit_get_shm() is tried first; if it does not succeed, a send
 * request is prepared by mca_spml_ikrit_get_helper(), posted, and waited
 * on. Any failure after the shared-memory attempt calls
 * oshmem_shmem_abort().
 *
 * @param ctx       context handle (not used here)
 * @param src_addr  passed through to the helpers as the source address
 * @param size      number of bytes
 * @param dst_addr  passed through to the helpers as the destination address
 * @param src       passed through to the helpers as the peer
 *
 * @return OSHMEM_SUCCESS, or OSHMEM_ERROR after an abort was requested.
 */
int mca_spml_ikrit_get(shmem_ctx_t ctx, void *src_addr, size_t size, void *dst_addr, int src)
{
    mxm_send_req_t sreq;

    /* size is unsigned, so "nothing to do" means exactly zero */
    if (0 == size) {
        return OSHMEM_SUCCESS;
    }

    if (mca_spml_ikrit_get_shm(src_addr, size, dst_addr, src) == OSHMEM_SUCCESS) {
        return OSHMEM_SUCCESS;
    }

    if (mca_spml_ikrit_get_helper(&sreq, src_addr, size, dst_addr, src)
            != OSHMEM_SUCCESS) {
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }

    /* no completion callback: the request is waited on below */
    sreq.base.completed_cb = NULL;
    sreq.flags = 0;

    SPML_IKRIT_MXM_POST_SEND(sreq);

    mca_spml_irkit_req_wait(&sreq.base);
    if (sreq.base.error != MXM_OK) {
        SPML_ERROR("get request failed: %s - aborting",
                   mxm_error_string(sreq.base.error));
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }

    return OSHMEM_SUCCESS;
}
示例#7
0
/**
 * Progress callback for the MXM MTL.
 *
 * Drives mxm_progress() on the MTL's MXM context. Any result other than
 * MXM_OK or MXM_ERR_NO_PROGRESS is reported through opal_show_help().
 *
 * @return always 1
 */
int ompi_mtl_mxm_progress(void)
{
    const mxm_error_t progress_rc = mxm_progress(ompi_mtl_mxm.mxm_context);

    switch (progress_rc) {
    case MXM_OK:
    case MXM_ERR_NO_PROGRESS:
        /* neither outcome is reported */
        break;
    default:
        opal_show_help("help-mtl-mxm.txt", "errors during mxm_progress", true, mxm_error_string(progress_rc));
        break;
    }

    return 1;
}
示例#8
0
static void mca_pml_yalla_bsend_completion_cb(void *context)
{
    mca_pml_yalla_bsend_request_t *bsreq = context;

    PML_YALLA_VERBOSE(8, "bsend request %p completed with status %s", (void *)bsreq,
                      mxm_error_string(bsreq->mxm.base.error));

    mca_pml_base_bsend_request_free(bsreq->mxm.base.data.buffer.ptr);
    PML_YALLA_FREELIST_RETURN(&ompi_pml_yalla.bsend_reqs, &bsreq->super);
}
/**
 * Post a non-blocking receive through MXM (variant with MXM_API version
 * switches).
 *
 * Obtains the receive buffer from the convertor, fills in the MXM receive
 * request embedded in the MTL request field by field, and posts it.
 *
 * @param mtl          MTL module (not used here)
 * @param comm         communicator
 * @param src          source rank to match
 * @param tag          tag to match
 * @param convertor    convertor describing the receive buffer
 * @param mtl_request  MTL request; actually a mca_mtl_mxm_request_t
 *
 * @return OMPI_SUCCESS, the error from ompi_mtl_datatype_recv_buf(), or
 *         OMPI_ERROR when posting fails.
 */
int ompi_mtl_mxm_irecv(struct mca_mtl_base_module_t* mtl,
                       struct ompi_communicator_t *comm, int src, int tag,
                       struct opal_convertor_t *convertor,
                       struct mca_mtl_request_t *mtl_request)
{
    mca_mtl_mxm_request_t *mxm_request = (mca_mtl_mxm_request_t *) mtl_request;
    mxm_recv_req_t *recv_req;
    mxm_error_t post_err;
    int buf_rc;

    mxm_request->convertor = convertor;
    buf_rc = ompi_mtl_datatype_recv_buf(mxm_request->convertor,
                                        &mxm_request->buf,
                                        &mxm_request->length,
                                        &mxm_request->free_after);
    if (buf_rc != OMPI_SUCCESS) {
        return buf_rc;
    }

    /* the MXM receive request is embedded in the MTL request */
    recv_req = &mxm_request->mxm.recv;
#if MXM_API >= MXM_VERSION(2,0)
    mxm_request->is_send = 0;
#endif

    recv_req->base.state = MXM_REQ_NEW;
    ompi_mtl_mxm_set_recv_envelope(recv_req, comm, src, tag);

#if MXM_API < MXM_VERSION(2,0)
    recv_req->base.flags = 0;
#endif
    recv_req->base.data_type          = MXM_REQ_DATA_BUFFER;
    recv_req->base.data.buffer.ptr    = mxm_request->buf;
    recv_req->base.data.buffer.length = mxm_request->length;
#if MXM_API < MXM_VERSION(1,5)
    recv_req->base.data.buffer.mkey   = MXM_MKEY_NONE;
#else
    recv_req->base.data.buffer.memh   = MXM_INVALID_MEM_HANDLE;
#endif
    /* the completion callback gets the MTL request back as its context */
    recv_req->base.context            = mxm_request;
    recv_req->base.completed_cb       = ompi_mtl_mxm_recv_completion_cb;

    /* hand the request over to MXM */
    post_err = mxm_req_recv(recv_req);
    if (post_err != MXM_OK) {
        orte_show_help("help-mtl-mxm.txt", "error posting receive", true,
                       mxm_error_string(post_err), mxm_request->buf, mxm_request->length);
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
示例#10
0
/**
 * Post a non-blocking send through MXM.
 *
 * Packs the data described by @p convertor, fills in the MXM send request
 * embedded in the MTL request and posts it with mxm_req_send().
 *
 * @param mtl          MTL module; asserted to be &ompi_mtl_mxm.super
 * @param comm         communicator
 * @param dest         destination rank
 * @param tag          message tag
 * @param convertor    convertor describing the send buffer
 * @param mode         send mode; MCA_PML_BASE_SEND_SYNCHRONOUS adds
 *                     MXM_REQ_FLAG_SEND_SYNC
 * @param blocking     not used here
 * @param mtl_request  MTL request; actually a mca_mtl_mxm_request_t
 *
 * @return OMPI_SUCCESS, the error from ompi_mtl_datatype_pack(), or
 *         OMPI_ERROR when posting fails.
 *
 * NOTE(review): when mxm_req_send() fails, a buffer that the pack step may
 * have allocated (free_after) is not released here -- confirm who owns it
 * on that path.
 */
int ompi_mtl_mxm_isend(struct mca_mtl_base_module_t* mtl,
                       struct ompi_communicator_t* comm, int dest, int tag,
                       struct opal_convertor_t *convertor,
                       mca_pml_base_send_mode_t mode, bool blocking,
                       mca_mtl_request_t * mtl_request)
{
    mca_mtl_mxm_request_t *mxm_request = (mca_mtl_mxm_request_t *)mtl_request;
    mxm_send_req_t *send_req = &mxm_request->mxm.send;
    mxm_error_t post_err;
    int pack_rc;

    assert(mtl == &ompi_mtl_mxm.super);

    mxm_request->convertor = convertor;
    pack_rc = ompi_mtl_datatype_pack(mxm_request->convertor,
                                     &mxm_request->buf,
                                     &mxm_request->length,
                                     &mxm_request->free_after);
    if (pack_rc != OMPI_SUCCESS) {
        return pack_rc;
    }

    /* fill in the send request embedded in the MTL request */
    send_req->base.state              = MXM_REQ_NEW;
    send_req->base.mq                 = ompi_mtl_mxm_mq_lookup(comm);
    send_req->base.conn               = ompi_mtl_mxm_conn_lookup(comm, dest);
    send_req->base.flags              = 0;
    send_req->base.data_type          = MXM_REQ_DATA_BUFFER;
    send_req->base.data.buffer.ptr    = mxm_request->buf;
    send_req->base.data.buffer.length = mxm_request->length;
    send_req->base.data.buffer.mkey   = MXM_MKEY_NONE;
    send_req->base.context            = mxm_request;
    send_req->base.completed_cb       = ompi_mtl_mxm_send_completion_cb;
    if (MCA_PML_BASE_SEND_SYNCHRONOUS == mode) {
        send_req->base.flags |= MXM_REQ_FLAG_SEND_SYNC;
    }

    /* the sender's rank in comm travels as immediate data */
    send_req->opcode           = MXM_REQ_OP_SEND;
    send_req->op.send.tag      = tag;
    send_req->op.send.imm_data = ompi_comm_rank(comm);

    /* hand the request over to MXM */
    post_err = mxm_req_send(send_req);
    if (post_err != MXM_OK) {
        orte_show_help("help-mtl-mxm.txt", "error posting send", true, 1, mxm_error_string(post_err));
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
示例#11
0
/**
 * Progress callback for the ikrit SPML.
 *
 * Drives mxm_progress() on the SPML's MXM context and reports any result
 * other than MXM_OK / MXM_ERR_NO_PROGRESS through orte_show_help().
 *
 * @return always 1
 */
int spml_ikrit_progress(void)
{
    const mxm_error_t progress_rc = mxm_progress(mca_spml_ikrit.mxm_context);
    const int quiet = (MXM_OK == progress_rc) || (MXM_ERR_NO_PROGRESS == progress_rc);

    if (!quiet) {
        orte_show_help("help-oshmem-spml-ikrit.txt",
                       "errors during mxm_progress",
                       true,
                       mxm_error_string(progress_rc));
    }

    return 1;
}
示例#12
0
/**
 * Blocking send through MXM.
 *
 * Builds a stack-allocated MXM send request flagged MXM_REQ_FLAG_BLOCKING,
 * lets ompi_mtl_mxm_choose_send_datatype() set up its data description,
 * posts it and then waits with mxm_wait() until the request reaches
 * MXM_REQ_COMPLETED.
 *
 * @param mtl        MTL module (not used here)
 * @param comm       communicator
 * @param dest       destination rank
 * @param tag        message tag
 * @param convertor  convertor describing the send buffer
 * @param mode       send mode; MCA_PML_BASE_SEND_SYNCHRONOUS adds
 *                   MXM_REQ_FLAG_SEND_SYNC
 *
 * @return OMPI_SUCCESS, the error from ompi_mtl_mxm_choose_send_datatype(),
 *         or OMPI_ERROR when posting fails.
 */
int ompi_mtl_mxm_send(struct mca_mtl_base_module_t* mtl,
                      struct ompi_communicator_t* comm, int dest, int tag,
                      struct opal_convertor_t *convertor,
                      mca_pml_base_send_mode_t mode)
{
    mxm_send_req_t send_req;
    mxm_wait_t completion_wait;
    mxm_error_t post_err;
    int dt_rc;

    /* request header */
    send_req.base.state   = MXM_REQ_NEW;
    send_req.base.mq      = ompi_mtl_mxm_mq_lookup(comm);
    send_req.base.conn    = ompi_mtl_mxm_conn_lookup(comm, dest);
    send_req.base.flags   = MXM_REQ_FLAG_BLOCKING;
    send_req.base.context = NULL;

    /* data description is chosen from the convertor */
    dt_rc = ompi_mtl_mxm_choose_send_datatype(&send_req, convertor);
    if (OPAL_UNLIKELY(dt_rc != OMPI_SUCCESS)) {
        return dt_rc;
    }

    send_req.base.data.buffer.mkey = MXM_MKEY_NONE;
    send_req.base.completed_cb     = NULL;
    if (MCA_PML_BASE_SEND_SYNCHRONOUS == mode) {
        send_req.base.flags |= MXM_REQ_FLAG_SEND_SYNC;
    }

    /* the sender's rank in comm travels as immediate data */
    send_req.opcode           = MXM_REQ_OP_SEND;
    send_req.op.send.tag      = tag;
    send_req.op.send.imm_data = ompi_comm_rank(comm);

    /* hand the request over to MXM */
    post_err = mxm_req_send(&send_req);
    if (post_err != MXM_OK) {
        orte_show_help("help-mtl-mxm.txt", "error posting send", true, 0, mxm_error_string(post_err));
        return OMPI_ERROR;
    }

    /* no completion callback was set, so block here until completed */
    completion_wait.req          = &send_req.base;
    completion_wait.state        = MXM_REQ_COMPLETED;
    completion_wait.progress_cb  = ompi_mtl_mxm_send_progress_cb;
    completion_wait.progress_arg = NULL;
    mxm_wait(&completion_wait);

    return OMPI_SUCCESS;
}
示例#13
0
/**
 * Cancel hook for yalla receive requests.
 *
 * Asks MXM to cancel the underlying receive. MXM_OK and
 * MXM_ERR_NO_PROGRESS are both treated as success.
 *
 * @param request  OMPI request; actually a mca_pml_yalla_recv_request_t
 * @param flag     not used here
 *
 * @return OMPI_SUCCESS, or OMPI_ERROR when MXM reports any other error.
 */
static int mca_pml_yalla_recv_request_cancel(ompi_request_t *request, int flag)
{
    mca_pml_yalla_recv_request_t *recv_req = (mca_pml_yalla_recv_request_t *)request;
    const mxm_error_t cancel_rc = mxm_req_cancel_recv(&recv_req->mxm);

    if (MXM_OK == cancel_rc || MXM_ERR_NO_PROGRESS == cancel_rc) {
        PML_YALLA_VERBOSE(9, "canceled receive request %p", (void *)request);
        return OMPI_SUCCESS;
    }

    PML_YALLA_ERROR("failed to cancel receive request %p: %s", (void *)request,
                    mxm_error_string(cancel_rc));
    return OMPI_ERROR;
}
示例#14
0
int ompi_mtl_mxm_irecv(struct mca_mtl_base_module_t* mtl,
                       struct ompi_communicator_t *comm, int src, int tag,
                       struct opal_convertor_t *convertor,
                       struct mca_mtl_request_t *mtl_request)
{
    mca_mtl_mxm_request_t * mtl_mxm_request;
    mxm_error_t err;
    mxm_recv_req_t *mxm_recv_req;
    int ret;

    mtl_mxm_request = (mca_mtl_mxm_request_t*) mtl_request;

    mtl_mxm_request->convertor 	= convertor;
    ret = ompi_mtl_datatype_recv_buf(mtl_mxm_request->convertor,
                                     &mtl_mxm_request->buf,
                                     &mtl_mxm_request->length,
                                     &mtl_mxm_request->free_after);
    if (OMPI_SUCCESS != ret) {
        return ret;
    }

    /* prepare a receive request embedded in the MTL request */
    mxm_recv_req = &mtl_mxm_request->mxm.recv;

    mxm_recv_req->base.state               = MXM_REQ_NEW;
    mxm_recv_req->base.mq                  = ompi_mtl_mxm_mq_lookup(comm);
    mxm_recv_req->base.conn                = (src == MPI_ANY_SOURCE) ? NULL :
                                             ompi_mtl_mxm_conn_lookup(comm, src);
    mxm_recv_req->base.flags               = 0;
    mxm_recv_req->base.data_type           = MXM_REQ_DATA_BUFFER;
    mxm_recv_req->base.data.buffer.ptr     = mtl_mxm_request->buf;
    mxm_recv_req->base.data.buffer.length  = mtl_mxm_request->length;
    mxm_recv_req->base.data.buffer.mkey    = MXM_MKEY_NONE;
    mxm_recv_req->base.context             = mtl_mxm_request;
    mxm_recv_req->base.completed_cb        = ompi_mtl_mxm_recv_completion_cb;
    mxm_recv_req->tag                      = tag;
    mxm_recv_req->tag_mask                 = (tag == MPI_ANY_TAG) ? 0 : 0xffffffffU;

    /* post-recv */
    err = mxm_req_recv(mxm_recv_req);
    if (MXM_OK != err) {
        orte_show_help("help-mtl-mxm.txt", "error posting receive", true,
                       mxm_error_string(err), mtl_mxm_request->buf, mtl_mxm_request->length);
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
示例#15
0
/**
 * Store the address of one PTL of the MTL's MXM endpoint in @p ep_info.
 *
 * @param ep_info  connection info; ptl_addr[ptlid] receives the address
 * @param ptlid    PTL whose address is requested
 *
 * @return OMPI_SUCCESS, or OMPI_ERROR when mxm_ep_address() fails.
 */
static int ompi_mtl_mxm_get_ep_address(ompi_mtl_mxm_ep_conn_info_t *ep_info, mxm_ptl_id_t ptlid)
{
    /* capacity of the slot on input to mxm_ep_address() */
    size_t slot_len = sizeof(ep_info->ptl_addr[ptlid]);
    mxm_error_t addr_err;

    addr_err = mxm_ep_address(ompi_mtl_mxm.ep, ptlid,
                              (struct sockaddr *) &ep_info->ptl_addr[ptlid], &slot_len);
    if (addr_err == MXM_OK) {
        return OMPI_SUCCESS;
    }

    opal_show_help("help-mtl-mxm.txt", "unable to extract endpoint ptl address",
                   true, (int)ptlid, mxm_error_string(addr_err));
    return OMPI_ERROR;
}
示例#16
0
/**
 * Post a non-blocking receive for an already-matched message.
 *
 * Only implemented when MXM_API >= 1.5; otherwise returns
 * OMPI_ERR_NOT_IMPLEMENTED.
 *
 * The receive request embedded in the MTL request is initialized by
 * ompi_mtl_mxm_recv_init(), its envelope is copied from the stored MXM
 * message descriptor, and it is posted with mxm_message_recv(). On success
 * the descriptor goes back to its free list and *message is returned and
 * reset to MPI_MESSAGE_NULL.
 *
 * @param mtl          MTL module (not used here)
 * @param convertor    convertor describing the receive buffer
 * @param message      matched message handle; consumed on success
 * @param mtl_request  MTL request; actually a mca_mtl_mxm_request_t
 *
 * @return OMPI_SUCCESS, the error from ompi_mtl_mxm_recv_init(),
 *         OMPI_ERROR when posting fails, or OMPI_ERR_NOT_IMPLEMENTED.
 */
int ompi_mtl_mxm_imrecv(struct mca_mtl_base_module_t* mtl,
                        struct opal_convertor_t *convertor,
                        struct ompi_message_t **message,
                        struct mca_mtl_request_t *mtl_request)
{
#if MXM_API >= MXM_VERSION(1,5)
    ompi_mtl_mxm_message_t *msgp = (ompi_mtl_mxm_message_t *) (*message)->req_ptr;
    mca_mtl_mxm_request_t *mxm_request = (mca_mtl_mxm_request_t *) mtl_request;
    mxm_recv_req_t *recv_req = &mxm_request->mxm.recv;
    mxm_error_t post_err;
    int init_rc;

    /* fill in the receive request embedded in the MTL request */
    init_rc = ompi_mtl_mxm_recv_init(mxm_request, convertor, recv_req);
    if (OPAL_UNLIKELY(init_rc != OMPI_SUCCESS)) {
        return init_rc;
    }

    /* the envelope comes from the stored message descriptor */
    recv_req->tag       = msgp->tag;
    recv_req->tag_mask  = msgp->tag_mask;
    recv_req->base.mq   = msgp->mq;
    recv_req->base.conn = msgp->conn;

    post_err = mxm_message_recv(recv_req, msgp->mxm_msg);
    if (OPAL_UNLIKELY(post_err != MXM_OK)) {
        orte_show_help("help-mtl-mxm.txt", "error posting message receive", true,
                       mxm_error_string(post_err), mxm_request->buf, mxm_request->length);
        return OMPI_ERROR;
    }

    /* recycle the descriptor, then invalidate the caller's handle */
    OMPI_FREE_LIST_RETURN(&mca_mtl_mxm_component.mxm_messages,
                         (ompi_free_list_item_t *) msgp);

    ompi_message_return(*message);
    (*message) = MPI_MESSAGE_NULL;

    return OMPI_SUCCESS;
#else
    return OMPI_ERR_NOT_IMPLEMENTED;
#endif
}
示例#17
0
/* extension. used 4 fence implementation b4 fence was added to mxm */
int mca_spml_ikrit_get_async(void *src_addr,
                             size_t size,
                             void *dst_addr,
                             int src)
{
    mca_spml_ikrit_get_request_t *get_req;

    if (OSHMEM_SUCCESS == mca_spml_ikrit_get_shm(src_addr, size, dst_addr, src))
        return OSHMEM_SUCCESS;

    get_req = alloc_get_req();
    if (NULL == get_req) {
        SPML_ERROR("out of get requests - aborting");
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }

    if (OSHMEM_SUCCESS
            != mca_spml_ikrit_get_helper(&get_req->mxm_req,
                                         src_addr,
                                         size,
                                         dst_addr,
                                         src)) {
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }

#if MXM_API < MXM_VERSION(2,0)
    get_req->mxm_req.base.flags = 0;
#else
    get_req->mxm_req.flags = 0;
#endif
    get_req->mxm_req.base.completed_cb = get_completion_cb;
    get_req->mxm_req.base.context = get_req;
    OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_gets, 1);

    mxm_req_send(&get_req->mxm_req);

    if (MXM_OK != get_req->mxm_req.base.error) {
        SPML_ERROR("get request failed: %s - aborting",
                   mxm_error_string(get_req->mxm_req.base.error));
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }
    return OSHMEM_SUCCESS;
}
示例#18
0
/**
 * Create the MXM matching queue for a communicator.
 *
 * Creates an MXM mq keyed by the communicator's context id and stores the
 * handle in comm->c_pml_comm.
 *
 * @param mtl   MTL module; asserted to be &ompi_mtl_mxm.super
 * @param comm  communicator being added
 *
 * @return OMPI_SUCCESS, or OMPI_ERROR when mxm_mq_create() fails.
 */
int ompi_mtl_mxm_add_comm(struct mca_mtl_base_module_t *mtl,
                          struct ompi_communicator_t *comm)
{
    mxm_mq_h comm_mq;
    mxm_error_t create_err;

    assert(mtl == &ompi_mtl_mxm.super);
    assert(NULL != ompi_mtl_mxm.mxm_context);

    create_err = mxm_mq_create(ompi_mtl_mxm.mxm_context, comm->c_contextid, &comm_mq);
    if (create_err == MXM_OK) {
        /* the mq handle is kept in the communicator's c_pml_comm slot */
        comm->c_pml_comm = (void*)comm_mq;
        return OMPI_SUCCESS;
    }

    opal_show_help("help-mtl-mxm.txt", "mxm mq create", true, mxm_error_string(create_err));
    return OMPI_ERROR;
}
示例#19
0
static void mca_pml_yalla_recv_completion_cb(void *context)
{
    mca_pml_yalla_recv_request_t* rreq = context;

    PML_YALLA_SET_RECV_STATUS(&rreq->mxm, rreq->mxm.completion.actual_len,
                              &rreq->super.ompi.req_status);

    PML_YALLA_VERBOSE(8, "receive request %p completed with status %s source %d rtag %d(%d/0x%x) len %zu",
                      (void *)rreq, mxm_error_string(rreq->mxm.base.error),
                      rreq->mxm.completion.sender_imm, rreq->mxm.completion.sender_tag,
                      rreq->mxm.tag, rreq->mxm.tag_mask,
                      rreq->mxm.completion.actual_len);

    ompi_request_complete(&rreq->super.ompi, true);
    if (rreq->super.flags & MCA_PML_YALLA_REQUEST_FLAG_FREE_CALLED) {
        PML_YALLA_VERBOSE(7, "release request %p because free was already called", (void *)rreq);
        mca_pml_yalla_request_release(&rreq->super, &ompi_pml_yalla.recv_reqs);
    }
}
示例#20
0
/**
 * Store the address of one PTL of the SPML's MXM endpoint in @p ep_info.
 *
 * @param ep_info  connection info; addr.ptl_addr[ptlid] receives the address
 * @param ptlid    PTL whose address is requested
 *
 * @return OSHMEM_SUCCESS, or OSHMEM_ERROR when mxm_ep_address() fails.
 */
static int spml_ikrit_get_ep_address(spml_ikrit_mxm_ep_conn_info_t *ep_info,
                                     mxm_ptl_id_t ptlid)
{
    /* capacity of the slot on input to mxm_ep_address() */
    size_t slot_len = sizeof(ep_info->addr.ptl_addr[ptlid]);
    mxm_error_t addr_err;

    addr_err = mxm_ep_address(mca_spml_ikrit.mxm_ep,
                              ptlid,
                              (struct sockaddr *) &ep_info->addr.ptl_addr[ptlid],
                              &slot_len);
    if (addr_err == MXM_OK) {
        return OSHMEM_SUCCESS;
    }

    orte_show_help("help-oshmem-spml-ikrit.txt",
                   "unable to get endpoint address",
                   true,
                   mxm_error_string(addr_err));
    return OSHMEM_ERROR;
}
示例#21
0
int mca_spml_ikrit_get(void *src_addr, size_t size, void *dst_addr, int src)
{
    mxm_send_req_t sreq;

    if (0 >= size) {
        return OSHMEM_SUCCESS;
    }

    if (OSHMEM_SUCCESS == mca_spml_ikrit_get_shm(src_addr, size, dst_addr, src))
        return OSHMEM_SUCCESS;

    if (OSHMEM_SUCCESS
            != mca_spml_ikrit_get_helper(&sreq,
                                         src_addr,
                                         size,
                                         dst_addr,
                                         src)) {
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }

#if MXM_API < MXM_VERSION(2,0)
    sreq.base.flags = MXM_REQ_FLAG_BLOCKING;
#else
    sreq.flags = MXM_REQ_SEND_FLAG_BLOCKING;
#endif
    sreq.base.completed_cb = NULL;

    mxm_req_send(&sreq);
    opal_progress();
    mca_spml_irkit_req_wait(&sreq.base);

    if (MXM_OK != sreq.base.error) {
        SPML_ERROR("get request failed: %s - aborting",
                   mxm_error_string(sreq.base.error));
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }
    return OSHMEM_SUCCESS;
}
示例#22
0
/**
 * Post an MXM send request and optionally wait for it.
 *
 * A failure from mxm_req_send() is reported through gasneti_fatalerror().
 * When @p block is non-zero, gasnetc_ReqRepWait() is called for the
 * request afterwards.
 *
 * @param mxm_sreq    send request to post
 * @param is_request  forwarded to gasnetc_ReqRepWait()
 * @param block       non-zero to wait after posting
 * @param msg_num     forwarded to gasnetc_ReqRepWait()
 */
static inline void gasnetc_PostSend(mxm_send_req_t * mxm_sreq,
                                    uint8_t is_request,
                                    uint8_t block,
                                    uint8_t msg_num)
{
    mxm_error_t post_status;

#if GASNET_DEBUG_AM
    MXM_LOG("[pid %d] calling mxm_req_send()...\n", getpid());
#endif

    post_status = mxm_req_send(mxm_sreq);

#if GASNET_DEBUG_AM
    MXM_LOG("[pid %d] mxm_req_send() returned %d\n", getpid(), (int)post_status);
#endif

    if_pf (post_status != MXM_OK) {
        gasneti_fatalerror("Error posting send request - %s\n",
                           mxm_error_string(post_status));
    }

    if_pt (block) {
        gasnetc_ReqRepWait(mxm_sreq, is_request, msg_num);
    }
}
示例#23
0
/**
 * Cancel hook for yalla send requests.
 *
 * A request that is already complete is left alone and reported as
 * success. Otherwise MXM is asked to cancel the send; MXM_OK and
 * MXM_ERR_NO_PROGRESS are both treated as success.
 *
 * @param request  OMPI request; actually a mca_pml_yalla_send_request_t
 * @param flag     not used here
 *
 * @return OMPI_SUCCESS, or OMPI_ERROR when MXM reports any other error.
 */
static int mca_pml_yalla_send_request_cancel(ompi_request_t *request, int flag)
{
    mca_pml_yalla_send_request_t *send_req = (mca_pml_yalla_send_request_t *)request;
    mxm_error_t cancel_rc;

    if (REQUEST_COMPLETE(request)) {
        /*
         * Possibly a buffered send that has already completed; it can no
         * longer be cancelled, so just report success.
         */
        PML_YALLA_VERBOSE(7, "not canceling a completed send request %p", (void *)request);
        return OMPI_SUCCESS;
    }

    cancel_rc = mxm_req_cancel_send(&send_req->mxm);
    if (MXM_OK == cancel_rc || MXM_ERR_NO_PROGRESS == cancel_rc) {
        PML_YALLA_VERBOSE(9, "canceled send request %p", (void *)request);
        return OMPI_SUCCESS;
    }

    PML_YALLA_ERROR("failed to cancel send request %p: %s", (void *)request,
                    mxm_error_string(cancel_rc));
    return OMPI_ERROR;
}
示例#24
0
/**
 * Exchange MXM endpoint addresses with all peers and connect to them.
 *
 * Steps:
 *  1. allocate the per-peer address tables and mca_spml_ikrit.mxm_peers;
 *  2. obtain the local endpoint address(es) and allgather them (a second
 *     exchange is done for the HW RDMA endpoint when hw_rdma_channel is
 *     set);
 *  3. register spml_ikrit_progress with opal_progress;
 *  4. connect to every peer, starting from this rank so that each rank
 *     walks the peers in a different order;
 *  5. optionally barrier + mxm_ep_wireup() when bulk_connect is set;
 *  6. mark peers of the same job on the local node as MXM_PTL_SHM.
 *
 * @param procs   array of @p nprocs process descriptors
 * @param nprocs  number of processes
 *
 * @return OSHMEM_SUCCESS on success, OSHMEM_ERR_OUT_OF_RESOURCE when an
 *         allocation fails, OSHMEM_ERROR on any MXM failure.
 *
 * NOTE(review): on failure mca_spml_ikrit.mxm_peers and the constructed
 * active_peers list are left as they are, as in the original code.
 */
int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs)
{
    spml_ikrit_mxm_ep_conn_info_t *ep_info = NULL;
    spml_ikrit_mxm_ep_conn_info_t *ep_hw_rdma_info = NULL;
    spml_ikrit_mxm_ep_conn_info_t my_ep_info;
    size_t mxm_addr_len;
    mxm_error_t err;
    size_t i, n;
    int rc = OSHMEM_ERROR;
    ompi_proc_t *proc_self;
    int my_rank = oshmem_my_proc_id();

    OBJ_CONSTRUCT(&mca_spml_ikrit.active_peers, opal_list_t);

    /* Allocate the tables that receive every peer's endpoint address */
    ep_info = calloc(nprocs, sizeof(*ep_info));
    if (NULL == ep_info) {
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto bail;
    }

    if (mca_spml_ikrit.hw_rdma_channel) {
        ep_hw_rdma_info = calloc(nprocs, sizeof(*ep_hw_rdma_info));
        if (NULL == ep_hw_rdma_info) {
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
            goto bail;
        }
    }

    mca_spml_ikrit.mxm_peers = (mxm_peer_t *) calloc(nprocs , sizeof(mxm_peer_t));
    if (NULL == mca_spml_ikrit.mxm_peers) {
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto bail;
    }

    memset(&my_ep_info, 0, sizeof(my_ep_info));

    if (mca_spml_ikrit.hw_rdma_channel) {
        mxm_addr_len = MXM_MAX_ADDR_LEN;
        err = mxm_ep_get_address(mca_spml_ikrit.mxm_hw_rdma_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len);
        if (MXM_OK != err) {
            orte_show_help("help-oshmem-spml-ikrit.txt", "unable to get endpoint address", true,
                    mxm_error_string(err));
            rc = OSHMEM_ERROR;
            goto bail;
        }
        oshmem_shmem_allgather(&my_ep_info, ep_hw_rdma_info,
                sizeof(spml_ikrit_mxm_ep_conn_info_t));
    }

    /*
     * The length argument is in/out: the call above may have shrunk it to
     * the HW RDMA address length. Restore the full capacity so the main
     * endpoint's address is not rejected as too large.
     */
    mxm_addr_len = MXM_MAX_ADDR_LEN;
    err = mxm_ep_get_address(mca_spml_ikrit.mxm_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len);
    if (MXM_OK != err) {
        orte_show_help("help-oshmem-spml-ikrit.txt", "unable to get endpoint address", true,
                mxm_error_string(err));
        rc = OSHMEM_ERROR;
        goto bail;
    }

    oshmem_shmem_allgather(&my_ep_info, ep_info,
                           sizeof(spml_ikrit_mxm_ep_conn_info_t));

    opal_progress_register(spml_ikrit_progress);

    /* Connect to every peer using the gathered addresses */
    for (n = 0; n < nprocs; ++n) {

        /* mxm 2.0 keeps its connections on a list. Make sure
         * that list has a different order on every rank */
        i = (my_rank + n) % nprocs;
        mxm_peer_construct(&mca_spml_ikrit.mxm_peers[i]);

        err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i].mxm_conn);
        if (MXM_OK != err) {
            SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
            rc = OSHMEM_ERROR;
            goto bail;
        }
        mxm_conn_ctx_set(mca_spml_ikrit.mxm_peers[i].mxm_conn, &mca_spml_ikrit.mxm_peers[i]);
        if (mca_spml_ikrit.hw_rdma_channel) {
            err = mxm_ep_connect(mca_spml_ikrit.mxm_hw_rdma_ep, ep_hw_rdma_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn);
            if (MXM_OK != err) {
                SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
                rc = OSHMEM_ERROR;
                goto bail;
            }
        } else {
            /* without a dedicated channel the regular connection is reused */
            mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn = mca_spml_ikrit.mxm_peers[i].mxm_conn;
        }
    }

    /* the address tables are only needed for connecting; free(NULL) is a no-op */
    free(ep_info);
    free(ep_hw_rdma_info);

    if (mca_spml_ikrit.bulk_connect) {
        /* Need a barrier to ensure remote peers already created connection */
        oshmem_shmem_barrier();
        mxm_ep_wireup(mca_spml_ikrit.mxm_ep);
    }

    proc_self = oshmem_proc_group_find(oshmem_group_all, my_rank);
    /* identify local processes and change transport to SHM */
    for (i = 0; i < nprocs; i++) {
        if (procs[i]->super.proc_name.jobid != proc_self->super.proc_name.jobid ||
            !OPAL_PROC_ON_LOCAL_NODE(procs[i]->super.proc_flags)) {
            continue;
        }
        if (procs[i] == proc_self)
            continue;

        /* use zcopy for put/get via sysv shared memory with fallback to RDMA */
        mca_spml_ikrit.mxm_peers[i].ptl_id = MXM_PTL_SHM;
    }

    SPML_VERBOSE(50, "*** ADDED PROCS ***");
    return OSHMEM_SUCCESS;

bail:
    free(ep_info);
    free(ep_hw_rdma_info);
    SPML_ERROR("add procs FAILED rc=%d", rc);

    return rc;

}
示例#25
0
/**
 * Initialize the MXM MTL module.
 *
 * Generates the job id, checks that enough processes take part, creates the
 * MXM endpoint, publishes the endpoint address to the other ranks and
 * registers the progress callback (plus the memory-release hook on
 * MXM >= 2.0 when memory hooks are in use).
 *
 * @return OMPI_SUCCESS on success; OMPI_ERR_NOT_SUPPORTED when fewer
 *         processes than mtl_mxm_np take part; OMPI_ERROR or the error of
 *         the failing helper otherwise.
 *
 * NOTE(review): the endpoint created here is not destroyed on the later
 * failure paths, as in the original code.
 */
int ompi_mtl_mxm_module_init(void)
{
#if MXM_API < MXM_VERSION(2,0)
    ompi_mtl_mxm_ep_conn_info_t ep_info;
#endif
    void *ep_address;
    size_t ep_address_len;
    mxm_error_t err;
    uint32_t jobid;
    uint64_t mxlr;
    ompi_proc_t **procs;
    unsigned ptl_bitmap;
    size_t totps, proc;
    int lr, nlps;
    int rc;

    mxlr = 0;
    lr = -1;

    jobid = ompi_mtl_mxm_get_job_id();
    if (0 == jobid) {
        MXM_ERROR("Failed to generate jobid");
        return OMPI_ERROR;
    }

    if (NULL == (procs = ompi_proc_world(&totps))) {
        MXM_ERROR("Unable to obtain process list");
        return OMPI_ERROR;
    }

    if (totps < (size_t)ompi_mtl_mxm.mxm_np) {
        MXM_VERBOSE(1, "MXM support will be disabled because of total number "
                    "of processes (%zu) is less than the minimum set by the "
                    "mtl_mxm_np MCA parameter (%u)", totps, ompi_mtl_mxm.mxm_np);
        free(procs);
        return OMPI_ERR_NOT_SUPPORTED;
    }
    MXM_VERBOSE(1, "MXM support enabled");

    if (ORTE_NODE_RANK_INVALID == (lr = ompi_process_info.my_node_rank)) {
        MXM_ERROR("Unable to obtain local node rank");
        free(procs);
        return OMPI_ERROR;
    }
    nlps = ompi_process_info.num_local_peers + 1;

    /* highest vpid among the processes on this node */
    for (proc = 0; proc < totps; proc++) {
        if (OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
            mxlr = max(mxlr, procs[proc]->proc_name.vpid);
        }
    }

    /* the process array is owned by this function and no longer needed */
    free(procs);
    procs = NULL;

    /* Setup the endpoint options and local addresses to bind to. */
#if MXM_API < MXM_VERSION(2,0)
    ptl_bitmap = ompi_mtl_mxm.mxm_ctx_opts->ptl_bitmap;
#else
    ptl_bitmap = 0;
#endif

    /* Open MXM endpoint */
    err = ompi_mtl_mxm_create_ep(ompi_mtl_mxm.mxm_context, &ompi_mtl_mxm.ep,
                                 ptl_bitmap, lr, jobid, mxlr, nlps);
    if (MXM_OK != err) {
        opal_show_help("help-mtl-mxm.txt", "unable to create endpoint", true,
                       mxm_error_string(err));
        return OMPI_ERROR;
    }

    /*
     * Get address for each PTL on this endpoint, and share it with other ranks.
     */
#if MXM_API < MXM_VERSION(2,0)
    if ((ptl_bitmap & MXM_BIT(MXM_PTL_SELF)) &&
            OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_SELF)) {
        return OMPI_ERROR;
    }
    if ((ptl_bitmap & MXM_BIT(MXM_PTL_RDMA)) &&
            OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_RDMA)) {
        return OMPI_ERROR;
    }
    if ((ptl_bitmap & MXM_BIT(MXM_PTL_SHM)) &&
            OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_SHM)) {
        return OMPI_ERROR;
    }

    /* the address lives on the stack here, so there is nothing to free */
    ep_address = &ep_info;
    ep_address_len = sizeof(ep_info);
#else
    rc = ompi_mtl_mxm_get_ep_address(&ep_address, &ep_address_len);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }
#endif

    rc = ompi_mtl_mxm_send_ep_address(ep_address, ep_address_len);

#if MXM_API >= MXM_VERSION(2,0)
    /* heap buffer from get_ep_address: release it whether or not the
     * modex send succeeded */
    free(ep_address);
#endif

    if (OMPI_SUCCESS != rc) {
        MXM_ERROR("Modex session failed.");
        return rc;
    }

    /* Register the MXM progress function */
    opal_progress_register(ompi_mtl_mxm_progress);

#if MXM_API >= MXM_VERSION(2,0)
    if (ompi_mtl_mxm.using_mem_hooks) {
        opal_mem_hooks_register_release(ompi_mtl_mxm_mem_release_cb, NULL);
    }
#endif
    return OMPI_SUCCESS;
}
示例#26
0
/**
 * Connect the ikrit SPML to all processes.
 *
 * Allocates the per-process peer table, publishes this process' MXM endpoint
 * address(es) with oshmem_shmem_allgather(), connects to every process and
 * finally selects the SHM + RDMA transports for peers of the same job that
 * are located on the local node.
 *
 * @param procs   array of nprocs process descriptors
 * @param nprocs  number of entries in procs
 *
 * @return OSHMEM_SUCCESS on success, otherwise an OSHMEM error code. On
 *         failure the temporary address/request buffers are released.
 */
int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs)
{
    spml_ikrit_mxm_ep_conn_info_t *ep_info = NULL;
    spml_ikrit_mxm_ep_conn_info_t *ep_hw_rdma_info = NULL;
    spml_ikrit_mxm_ep_conn_info_t my_ep_info = {{0}};
#if MXM_API < MXM_VERSION(2,0)
    mxm_conn_req_t *conn_reqs;
    int timeout;
#else
    size_t mxm_addr_len = MXM_MAX_ADDR_LEN;
#endif
    mxm_error_t err;
    size_t i, n;
    int rc = OSHMEM_ERROR;
    ompi_proc_t *proc_self;
    int my_rank = oshmem_my_proc_id();

    OBJ_CONSTRUCT(&mca_spml_ikrit.active_peers, opal_list_t);
    /* Allocate connection requests */
#if MXM_API < MXM_VERSION(2,0)
    /* calloc zeroes ALL nprocs requests (the previous memset only cleared
     * the first element, leaving error/conn of the others uninitialized). */
    conn_reqs = calloc(nprocs, sizeof(*conn_reqs));
    if (NULL == conn_reqs) {
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto bail;
    }
#endif
    ep_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
    if (NULL == ep_info) {
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto bail;
    }

    if (mca_spml_ikrit.hw_rdma_channel) {
        ep_hw_rdma_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
        if (NULL == ep_hw_rdma_info) {
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
            goto bail;
        }
    }

    mca_spml_ikrit.mxm_peers = (mxm_peer_t **) malloc(nprocs
            * sizeof(*(mca_spml_ikrit.mxm_peers)));
    if (NULL == mca_spml_ikrit.mxm_peers) {
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto bail;
    }

#if MXM_API < MXM_VERSION(2,0)
    if (OSHMEM_SUCCESS
            != spml_ikrit_get_ep_address(&my_ep_info, MXM_PTL_SELF)) {
        rc = OSHMEM_ERROR;
        goto bail;
    }
    if (OSHMEM_SUCCESS
            != spml_ikrit_get_ep_address(&my_ep_info, MXM_PTL_RDMA)) {
        rc = OSHMEM_ERROR;
        goto bail;
    }
#else
    if (mca_spml_ikrit.hw_rdma_channel) {
        err = mxm_ep_get_address(mca_spml_ikrit.mxm_hw_rdma_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len);
        if (MXM_OK != err) {
            orte_show_help("help-oshmem-spml-ikrit.txt", "unable to get endpoint address", true,
                    mxm_error_string(err));
            rc = OSHMEM_ERROR;
            goto bail;
        }
        /* The hw-rdma address is exchanged first; my_ep_info is then reused
         * for the regular endpoint address below. */
        oshmem_shmem_allgather(&my_ep_info, ep_hw_rdma_info,
                sizeof(spml_ikrit_mxm_ep_conn_info_t));
    }
    /* The length argument is in/out: a previous call may have shrunk it to
     * the size of the hw-rdma address, so restore the full buffer capacity
     * before asking for the next address. */
    mxm_addr_len = MXM_MAX_ADDR_LEN;
    err = mxm_ep_get_address(mca_spml_ikrit.mxm_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len);
    if (MXM_OK != err) {
        orte_show_help("help-oshmem-spml-ikrit.txt", "unable to get endpoint address", true,
                mxm_error_string(err));
        rc = OSHMEM_ERROR;
        goto bail;
    }
#endif
    oshmem_shmem_allgather(&my_ep_info, ep_info,
                           sizeof(spml_ikrit_mxm_ep_conn_info_t));

    opal_progress_register(spml_ikrit_progress);

    /* Get the EP connection requests for all the processes from modex */
    for (n = 0; n < nprocs; ++n) {

        /* mxm 2.0 keeps its connections on a list. Make sure
         * that list have different order on every rank */
        i = (my_rank + n) % nprocs;
        mca_spml_ikrit.mxm_peers[i] = OBJ_NEW(mxm_peer_t);
        if (NULL == mca_spml_ikrit.mxm_peers[i]) {
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
            goto bail;
        }
        mca_spml_ikrit.mxm_peers[i]->pe = i;

#if MXM_API < MXM_VERSION(2,0)
        conn_reqs[i].ptl_addr[MXM_PTL_SELF] =
                (struct sockaddr *) &ep_info[i].addr.ptl_addr[MXM_PTL_SELF];
        conn_reqs[i].ptl_addr[MXM_PTL_SHM] = NULL;
        conn_reqs[i].ptl_addr[MXM_PTL_RDMA] =
                (struct sockaddr *) &ep_info[i].addr.ptl_addr[MXM_PTL_RDMA];
#else
        err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i]->mxm_conn);
        if (MXM_OK != err) {
            SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
            goto bail;
        }
        if (OSHMEM_SUCCESS != create_ptl_idx(i)) {
            goto bail;
        }
        mxm_conn_ctx_set(mca_spml_ikrit.mxm_peers[i]->mxm_conn, mca_spml_ikrit.mxm_peers[i]);
        if (mca_spml_ikrit.hw_rdma_channel) {
            err = mxm_ep_connect(mca_spml_ikrit.mxm_hw_rdma_ep, ep_hw_rdma_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn);
            if (MXM_OK != err) {
                SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
                goto bail;
            }
        } else {
            /* No dedicated hw-rdma endpoint: share the regular connection. */
            mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn = mca_spml_ikrit.mxm_peers[i]->mxm_conn;
        }
#endif
    }

#if MXM_API < MXM_VERSION(2,0)
    /* Connect to remote peers */
    if (mxm_get_version() < MXM_VERSION(1,5)) {
        timeout = 1000;
    } else {
        timeout = -1;
    }
    err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, conn_reqs, nprocs, timeout);
    if (MXM_OK != err) {
        SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
        for (i = 0; i < nprocs; ++i) {
            if (MXM_OK != conn_reqs[i].error) {
                SPML_ERROR("MXM EP connect to %s error: %s\n",
                           procs[i]->proc_hostname, mxm_error_string(conn_reqs[i].error));
            }
        }
        rc = OSHMEM_ERR_CONNECTION_FAILED;
        goto bail;
    }

    /* Save returned connections */
    for (i = 0; i < nprocs; ++i) {
        mca_spml_ikrit.mxm_peers[i]->mxm_conn = conn_reqs[i].conn;
        if (OSHMEM_SUCCESS != create_ptl_idx(i)) {
            rc = OSHMEM_ERR_CONNECTION_FAILED;
            goto bail;
        }

        mxm_conn_ctx_set(conn_reqs[i].conn, mca_spml_ikrit.mxm_peers[i]);
    }

    free(conn_reqs);
#endif
    /* The gathered addresses are only needed while connecting. */
    free(ep_info);
    free(ep_hw_rdma_info);

#if MXM_API >= MXM_VERSION(2,0)
    if (mca_spml_ikrit.bulk_connect) {
        /* Need a barrier to ensure remote peers already created connection */
        oshmem_shmem_barrier();
        mxm_ep_wireup(mca_spml_ikrit.mxm_ep);
    }
#endif

    proc_self = oshmem_proc_group_find(oshmem_group_all, my_rank);
    /* identify local processes and change transport to SHM */
    for (i = 0; i < nprocs; i++) {
        if (procs[i]->super.proc_name.jobid != proc_self->super.proc_name.jobid ||
            !OPAL_PROC_ON_LOCAL_NODE(procs[i]->super.proc_flags)) {
            continue;
        }
        if (procs[i] == proc_self) {
            continue;
        }

        /* use zcopy for put/get via sysv shared memory */
        OSHMEM_PROC_DATA(procs[i])->transport_ids[0] = MXM_PTL_SHM;
        OSHMEM_PROC_DATA(procs[i])->transport_ids[1] = MXM_PTL_RDMA;
        OSHMEM_PROC_DATA(procs[i])->num_transports = 2;
    }

    SPML_VERBOSE(50, "*** ADDED PROCS ***");
    return OSHMEM_SUCCESS;

bail:
    /* NOTE(review): the peer table (mca_spml_ikrit.mxm_peers and the peers
     * already created) and the progress callback are not rolled back here;
     * confirm whether the caller's teardown path is expected to do that. */
#if MXM_API < MXM_VERSION(2,0)
    free(conn_reqs);
#endif
    free(ep_info);
    free(ep_hw_rdma_info);
    SPML_ERROR("add procs FAILED rc=%d", rc);

    return rc;

}
示例#27
0
extern void gasnetc_barrier_fence(void)
{
    gasnet_mxm_send_req_t * sreqs;
    mxm_send_req_t * mxm_sreq;
    mxm_error_t mxm_res;
    int dest;

    sreqs = (gasnet_mxm_send_req_t *)
            alloca(sizeof(gasnet_mxm_send_req_t) * gasneti_nodes);

    for (dest = 0; dest < gasneti_nodes; dest++) {
        if (!gasnet_mxm_module.need_fence[dest])
            continue;
        if_pf (dest == gasneti_mynode)
        continue;
#if GASNET_PSHM
        if_pf (gasneti_pshm_in_supernode(dest))
        continue;
#endif
        mxm_sreq = &sreqs[dest].mxm_sreq;
#if MXM_API < MXM_VERSION(2,0)
        mxm_sreq->opcode = MXM_REQ_OP_FENCE;
        mxm_sreq->base.flags = MXM_REQ_FLAG_SEND_SYNC;
#else
        mxm_sreq->opcode = MXM_REQ_OP_PUT_SYNC;
        mxm_sreq->flags  = MXM_REQ_SEND_FLAG_FENCE;
        mxm_sreq->op.mem.remote_vaddr = 0;
        mxm_sreq->op.mem.remote_mkey  = &mxm_empty_mem_key;
#endif

        mxm_sreq->base.state = MXM_REQ_NEW;
        mxm_sreq->base.conn = gasnet_mxm_module.connections[dest];
        mxm_sreq->base.mq = gasnet_mxm_module.mxm_mq;

        mxm_sreq->base.data.buffer.ptr = NULL;
        mxm_sreq->base.data.buffer.length = 0;

#if MXM_API < MXM_VERSION(1,5)
        mxm_sreq->base.data.buffer.mkey = MXM_MKEY_NONE;
#else
        mxm_sreq->base.data.buffer.memh = NULL;
#endif
        mxm_sreq->base.data_type = MXM_REQ_DATA_BUFFER;

        mxm_sreq->base.completed_cb = NULL;
        mxm_sreq->base.context = NULL;

        mxm_res = mxm_req_send(mxm_sreq);
        if_pt (MXM_OK != mxm_res)
        gasneti_fatalerror("Error posting send request - %s\n",
                           mxm_error_string(mxm_res));
    }

    for (dest = 0; dest < gasneti_nodes; dest++) {
        if (!gasnet_mxm_module.need_fence[dest])
            continue;
        if_pf (dest == gasneti_mynode)
        continue;
#if GASNET_PSHM
        if_pf (gasneti_pshm_in_supernode(dest))
        continue;
#endif
        mxm_sreq = &sreqs[dest].mxm_sreq;

        /* we are waiting for real completion, not just for SENT state */
        while (!mxm_req_test(&mxm_sreq->base))
            gasnetc_AMPoll();

        gasnet_mxm_module.need_fence[dest] = 0;
    }
}
示例#28
0
int ompi_mtl_mxm_add_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs,
                           struct ompi_proc_t** procs)
{
#if MXM_API < MXM_VERSION(2,0)
    ompi_mtl_mxm_ep_conn_info_t *ep_info;
    mxm_conn_req_t *conn_reqs;
    size_t ep_index = 0;
#endif
    void *ep_address;
    size_t ep_address_len;
    mxm_error_t err;
    size_t i;
    int rc;
    mca_mtl_mxm_endpoint_t *endpoint;

    assert(mtl == &ompi_mtl_mxm.super);

#if MXM_API < MXM_VERSION(2,0)
    /* Allocate connection requests */
    conn_reqs = calloc(nprocs, sizeof(mxm_conn_req_t));
    ep_info   = calloc(nprocs, sizeof(ompi_mtl_mxm_ep_conn_info_t));
    if (NULL == conn_reqs || NULL == ep_info) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto bail;
    }
#endif

    /* Get the EP connection requests for all the processes from modex */
    for (i = 0; i < nprocs; ++i) {
        if (NULL != procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]) {
            continue; /* already connected to this endpoint */
        }
        rc = ompi_mtl_mxm_recv_ep_address(procs[i], &ep_address, &ep_address_len);
        if (rc != OMPI_SUCCESS) {
            goto bail;
        }

#if MXM_API < MXM_VERSION(2,0)
        if (ep_address_len != sizeof(ep_info[i])) {
            MXM_ERROR("Invalid endpoint address length");
            rc = OMPI_ERROR;
            goto bail;
        }

        memcpy(&ep_info[i], ep_address, ep_address_len);
        conn_reqs[ep_index].ptl_addr[MXM_PTL_SELF] = (struct sockaddr *)&(ep_info[i].ptl_addr[MXM_PTL_SELF]);
        conn_reqs[ep_index].ptl_addr[MXM_PTL_SHM]  = (struct sockaddr *)&(ep_info[i].ptl_addr[MXM_PTL_SHM]);
        conn_reqs[ep_index].ptl_addr[MXM_PTL_RDMA] = (struct sockaddr *)&(ep_info[i].ptl_addr[MXM_PTL_RDMA]);
        ep_index++;

#else
        endpoint = OBJ_NEW(mca_mtl_mxm_endpoint_t);
        endpoint->mtl_mxm_module = &ompi_mtl_mxm;
        err = mxm_ep_connect(ompi_mtl_mxm.ep, ep_address, &endpoint->mxm_conn);
        if (err != MXM_OK) {
            MXM_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
            rc = OMPI_ERROR;
            goto bail;
        }
        procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint;
#endif
        free(ep_address);
    }

#if MXM_API < MXM_VERSION(2,0)
    /* Connect to remote peers */
    err = mxm_ep_connect(ompi_mtl_mxm.ep, conn_reqs, ep_index, -1);
    if (MXM_OK != err) {
        MXM_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
        for (i = 0; i < ep_index; ++i) {
            if (MXM_OK != conn_reqs[i].error) {
                MXM_ERROR("MXM EP connect to %s error: %s\n",
                          (NULL == procs[i]->proc_hostname) ?
                          "unknown" : procs[i]->proc_hostname,
                          mxm_error_string(conn_reqs[i].error));
            }
        }
        rc = OMPI_ERROR;
        goto bail;
    }

    /* Save returned connections */
    for (i = 0; i < ep_index; ++i) {
        endpoint = OBJ_NEW(mca_mtl_mxm_endpoint_t);
        endpoint->mtl_mxm_module = &ompi_mtl_mxm;
        endpoint->mxm_conn = conn_reqs[i].conn;
        procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint;
    }

#endif

    rc = OMPI_SUCCESS;

bail:
#if MXM_API < MXM_VERSION(2,0)
    free(conn_reqs);
    free(ep_info);
#endif
    return rc;
}
示例#29
0
int ompi_mtl_mxm_add_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs,
                           struct ompi_proc_t** procs, /*const*/
                           struct mca_mtl_base_endpoint_t **mtl_peer_data)
{
#if MXM_API < MXM_VERSION(2,0)
    ompi_mtl_mxm_ep_conn_info_t *ep_info;
    mxm_conn_req_t *conn_reqs;
    int timeout;
#endif
    void *ep_address;
    size_t ep_address_len;
    mxm_error_t err;
    size_t i;
    int rc;

    assert(mtl == &ompi_mtl_mxm.super);

#if MXM_API < MXM_VERSION(2,0)
    /* Allocate connection requests */
    conn_reqs = calloc(nprocs, sizeof(mxm_conn_req_t));
    ep_info   = calloc(nprocs, sizeof(ompi_mtl_mxm_ep_conn_info_t));
    if (NULL == conn_reqs || NULL == ep_info) {
        rc = OMPI_ERR_OUT_OF_RESOURCE;
        goto bail;
    }
#endif

    /* Get the EP connection requests for all the processes from modex */
    for (i = 0; i < nprocs; ++i) {
        rc = ompi_mtl_mxm_recv_ep_address(procs[i], &ep_address, &ep_address_len);
        if (rc != OMPI_SUCCESS) {
            goto bail;
        }

#if MXM_API < MXM_VERSION(2,0)
        if (ep_address_len != sizeof(ep_info[i])) {
            MXM_ERROR("Invalid endpoint address length");
            rc = OMPI_ERROR;
            goto bail;
        }

        memcpy(&ep_info[i], ep_address, ep_address_len);
        conn_reqs[i].ptl_addr[MXM_PTL_SELF] = (struct sockaddr *)&(ep_info[i].ptl_addr[MXM_PTL_SELF]);
        conn_reqs[i].ptl_addr[MXM_PTL_SHM]  = (struct sockaddr *)&(ep_info[i].ptl_addr[MXM_PTL_SHM]);
        conn_reqs[i].ptl_addr[MXM_PTL_RDMA] = (struct sockaddr *)&(ep_info[i].ptl_addr[MXM_PTL_RDMA]);
#else
        mtl_peer_data[i] = (mca_mtl_mxm_endpoint_t *) OBJ_NEW(mca_mtl_mxm_endpoint_t);
        mtl_peer_data[i]->mtl_mxm_module = &ompi_mtl_mxm;
        err = mxm_ep_connect(ompi_mtl_mxm.ep, ep_address, &mtl_peer_data[i]->mxm_conn);
        if (err != MXM_OK) {
            MXM_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
            rc = OMPI_ERROR;
            goto bail;
        }
#endif
        free(ep_address);
    }

#if MXM_API < MXM_VERSION(2,0)
    /* Connect to remote peers */
    timeout = (mxm_get_version() < MXM_VERSION(1,5)) ? 1000 : -1;
    err = mxm_ep_connect(ompi_mtl_mxm.ep, conn_reqs, nprocs, timeout);
    if (MXM_OK != err) {
        MXM_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
        for (i = 0; i < nprocs; ++i) {
            if (MXM_OK != conn_reqs[i].error) {
                MXM_ERROR("MXM EP connect to %s error: %s\n", procs[i]->proc_hostname,
                          mxm_error_string(conn_reqs[i].error));
            }
        }
        rc = OMPI_ERROR;
        goto bail;
    }

    /* Save returned connections */
    for (i = 0; i < nprocs; ++i) {
        mtl_peer_data[i] = (mca_mtl_mxm_endpoint_t *) OBJ_NEW(mca_mtl_mxm_endpoint_t);
        mtl_peer_data[i]->mtl_mxm_module = &ompi_mtl_mxm;
        mtl_peer_data[i]->mxm_conn = conn_reqs[i].conn;
    }
#endif
    rc = OMPI_SUCCESS;

bail:
#if MXM_API < MXM_VERSION(2,0)
    free(conn_reqs);
    free(ep_info);
#endif
    return rc;
}
示例#30
0
/**
 * Build the per-transport memory keys for a memory region.
 *
 * One sshmem_mkey_t is produced for each of the MXM_PTL_LAST transports:
 *  - MXM_PTL_SHM:  carries shmid as the key when it is valid, otherwise the
 *                  region's address as va_base with a zero length.
 *  - MXM_PTL_RDMA: unless ud_only is set, the region is mapped with
 *                  mxm_mem_map() and an MXM memory key is obtained for it.
 * Every key is handed to mca_spml_ikrit_cache_mkeys().
 *
 * @param addr   start of the region
 * @param size   length of the region in bytes
 * @param shmid  shared-memory id, or MAP_SEGMENT_SHM_INVALID
 * @param count  out: MXM_PTL_LAST on success, 0 on failure
 *
 * @return array of MXM_PTL_LAST keys, or NULL on failure (after passing the
 *         partially built array to mca_spml_ikrit_deregister()).
 */
sshmem_mkey_t *mca_spml_ikrit_register(void* addr,
                                         size_t size,
                                         uint64_t shmid,
                                         int *count)
{
    int ptl;
    sshmem_mkey_t *keys;
    sshmem_mkey_t *cur;
    mxm_error_t status;
    mxm_mem_key_t *rdma_key;
    int self_pe = oshmem_my_proc_id();

    *count = 0;
    keys = (sshmem_mkey_t *) calloc(MXM_PTL_LAST, sizeof(*keys));
    if (NULL == keys) {
        return NULL;
    }

    for (ptl = 0; ptl < MXM_PTL_LAST; ptl++) {
        cur = &keys[ptl];
        cur->u.key = MAP_SEGMENT_SHM_INVALID;

        if (MXM_PTL_SHM == ptl) {
            if (MAP_SEGMENT_SHM_INVALID == (int)shmid) {
                /* no shared segment: publish the plain address instead */
                cur->len = 0;
                cur->va_base = addr;
            } else {
                cur->u.key = shmid;
                cur->va_base = 0;
            }
            cur->spml_context = 0;
        } else if (MXM_PTL_RDMA == ptl) {
            cur->va_base = addr;
            cur->spml_context = 0;

            if (mca_spml_ikrit.ud_only) {
                /* UD only: nothing is mapped, the key stays empty */
                cur->len = 0;
            } else {
                status = mxm_mem_map(mca_spml_ikrit.mxm_context, &addr, &size, 0, 0, 0);
                if (MXM_OK != status) {
                    SPML_ERROR("Failed to register memory: %s", mxm_error_string(status));
                    goto error_out;
                }
                /* remember the mapped length in the key's context slot */
                cur->spml_context = (void *)(unsigned long)size;

                rdma_key = malloc(sizeof(*rdma_key));
                if (NULL == rdma_key) {
                    SPML_ERROR("Failed to allocate m_key memory");
                    goto error_out;
                }
                cur->len = sizeof(*rdma_key);
                cur->u.data = rdma_key;

                status = mxm_mem_get_key(mca_spml_ikrit.mxm_context, addr, rdma_key);
                if (MXM_OK != status) {
                    SPML_ERROR("Failed to get memory key: %s", mxm_error_string(status));
                    goto error_out;
                }
            }
        } else {
            SPML_ERROR("unsupported PTL: %d", ptl);
            goto error_out;
        }

        SPML_VERBOSE(5,
                     "rank %d ptl %d addr %p size %llu %s",
                     self_pe, ptl, addr, (unsigned long long)size,
                     mca_spml_base_mkey2str(cur));

        mca_spml_ikrit_cache_mkeys(cur, memheap_find_segnum(addr), self_pe, ptl);
    }

    *count = MXM_PTL_LAST;
    return keys;

error_out:
    mca_spml_ikrit_deregister(keys);

    return NULL;
}