Example #1
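/* Post an FMA descriptor on the endpoint's uGNI handle. On failure the
 * descriptor is returned to its memory pool; on success it is counted as
 * outstanding on the flush group and the interface, and ok_status is
 * returned. */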
static inline ssize_t uct_ugni_post_fma(uct_ugni_rdma_iface_t *iface,
                                        uct_ugni_ep_t *ep,
                                        uct_ugni_base_desc_t *fma,
                                        ssize_t ok_status)
{
    gni_return_t ugni_rc;

    if (ucs_unlikely(!uct_ugni_ep_can_send(ep))) {
        ucs_mpool_put(fma);
        return UCS_ERR_NO_RESOURCE;
    }
    uct_ugni_device_lock(&iface->super.cdm);
    ugni_rc = GNI_PostFma(ep->ep, &fma->desc);
    uct_ugni_device_unlock(&iface->super.cdm);
    if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) {
        ucs_mpool_put(fma);
        if (GNI_RC_ERROR_RESOURCE == ugni_rc || GNI_RC_ERROR_NOMEM == ugni_rc) {
            ucs_debug("GNI_PostFma failed, Error status: %s %d",
                      gni_err_str[ugni_rc], ugni_rc);
            return UCS_ERR_NO_RESOURCE;
        } else {
            ucs_error("GNI_PostFma failed, Error status: %s %d",
                      gni_err_str[ugni_rc], ugni_rc);
            return UCS_ERR_IO_ERROR;
        }
    }

    ++fma->flush_group->flush_comp.count;
    ++iface->super.outstanding;

    return ok_status;
}
Example #2
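/* Poll the UD receive CQ and hand every completed packet, minus its GRH,
 * to the UD protocol layer, then replenish the receive queue. */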
static inline ucs_status_t uct_ud_verbs_iface_poll_rx(uct_ud_verbs_iface_t *iface)
{
    uct_ib_iface_recv_desc_t *desc;
    struct ibv_wc wc[UCT_IB_MAX_WC];
    int i, ret;
    char *packet;


    ret = ibv_poll_cq(iface->super.super.recv_cq, UCT_IB_MAX_WC, wc);
    if (ret == 0) {
        return UCS_ERR_NO_PROGRESS;
    } 
    if (ucs_unlikely(ret < 0)) {
        ucs_fatal("Failed to poll receive CQ");
    }

    for (i = 0; i < ret; ++i) {
        if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
            ucs_fatal("Receive completion with error: %s", ibv_wc_status_str(wc[i].status));
        }

        desc = (void*)wc[i].wr_id;
        ucs_trace_data("pkt rcvd: buf=%p len=%d", desc, wc[i].byte_len);
        packet = uct_ib_iface_recv_desc_hdr(&iface->super.super, desc);
        VALGRIND_MAKE_MEM_DEFINED(packet, wc[i].byte_len);

        uct_ud_ep_process_rx(&iface->super, 
                             (uct_ud_neth_t *)(packet + UCT_IB_GRH_LEN),
                             wc[i].byte_len - UCT_IB_GRH_LEN,
                             (uct_ud_recv_skb_t *)desc); 
    }
    iface->super.rx.available += ret;
    uct_ud_verbs_iface_post_recv(iface);
    return UCS_OK;
}
Example #3
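/* Post an RDMA descriptor on the endpoint. Resource exhaustion is reported
 * as UCS_ERR_NO_RESOURCE so the operation can be retried; other uGNI
 * failures are fatal I/O errors. */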
static inline ucs_status_t uct_ugni_post_rdma(uct_ugni_rdma_iface_t *iface,
                                              uct_ugni_ep_t *ep,
                                              uct_ugni_base_desc_t *rdma)
{
    gni_return_t ugni_rc;

    if (ucs_unlikely(!uct_ugni_can_send(ep))) {
        ucs_mpool_put(rdma);
        return UCS_ERR_NO_RESOURCE;
    }

    ugni_rc = GNI_PostRdma(ep->ep, &rdma->desc);
    if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) {
        ucs_mpool_put(rdma);
        if (GNI_RC_ERROR_RESOURCE == ugni_rc || GNI_RC_ERROR_NOMEM == ugni_rc) {
            ucs_debug("GNI_PostRdma failed, Error status: %s %d",
                      gni_err_str[ugni_rc], ugni_rc);
            return UCS_ERR_NO_RESOURCE;
        } else {
            ucs_error("GNI_PostRdma failed, Error status: %s %d",
                      gni_err_str[ugni_rc], ugni_rc);
            return UCS_ERR_IO_ERROR;
        }
    }

    ++ep->outstanding;
    ++iface->super.outstanding;

    return UCS_INPROGRESS;
}
Example #4
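/* Device thread: blocks in GNI_PostdataProbeWaitById() until a datagram
 * arrives, then marks events as ready and wakes the async context through
 * the event pipe. A magic cancel cookie posted at teardown ends the loop. */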
static void *uct_ugni_udt_device_thread(void *arg)
{
    uct_ugni_udt_iface_t *iface = (uct_ugni_udt_iface_t *)arg;
    gni_return_t ugni_rc;
    uint64_t id;

    while (1) {
        pthread_mutex_lock(&iface->device_lock);
        while (iface->events_ready) {
            pthread_cond_wait(&iface->device_condition, &iface->device_lock);
        }
        pthread_mutex_unlock(&iface->device_lock);
        ugni_rc = GNI_PostdataProbeWaitById(uct_ugni_udt_iface_nic_handle(iface), -1, &id);
        if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) {
            ucs_error("GNI_PostDataProbeWaitById, Error status: %s %d\n",
                      gni_err_str[ugni_rc], ugni_rc);
            continue;
        }
        if (ucs_unlikely(UCT_UGNI_UDT_CANCEL == id)) {
            /* When the iface is torn down, it will post and cancel a datagram with a
             * magic cookie as its id that tells us to shut down.
             */
            break;
        }
        iface->events_ready = 1;
        ucs_trace("Recieved a new datagram");
        ucs_async_pipe_push(&iface->event_pipe);
    }

    return NULL;
}
Example #5
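/* Poll the RC receive CQ and invoke the active-message handler for each
 * completion; when the CQ is empty, repost receive buffers instead. */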
static UCS_F_ALWAYS_INLINE ucs_status_t 
uct_rc_verbs_iface_poll_rx(uct_rc_verbs_iface_t *iface)
{
    uct_ib_iface_recv_desc_t *desc;
    uct_rc_hdr_t *hdr;
    struct ibv_wc wc[UCT_IB_MAX_WC];
    int i, ret;

    ret = ibv_poll_cq(iface->super.super.recv_cq, UCT_IB_MAX_WC, wc);
    if (ret > 0) {
        for (i = 0; i < ret; ++i) {
            if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
                ucs_fatal("Receive completion with error: %s", ibv_wc_status_str(wc[i].status));
            }

            UCS_STATS_UPDATE_COUNTER(iface->super.stats, UCT_RC_IFACE_STAT_RX_COMPLETION, 1);

            desc = (void*)wc[i].wr_id;
            uct_ib_iface_desc_received(&iface->super.super, desc, wc[i].byte_len, 1);

            hdr = uct_ib_iface_recv_desc_hdr(&iface->super.super, desc);
            uct_ib_log_recv_completion(IBV_QPT_RC, &wc[i], hdr, uct_rc_ep_am_packet_dump);

            uct_rc_iface_invoke_am(&iface->super, hdr, wc[i].byte_len, desc);
        }

        iface->super.rx.available += ret;
        return UCS_OK;
    } else if (ret == 0) {
        uct_rc_verbs_iface_post_recv(iface, 0);
        return UCS_ERR_NO_PROGRESS;
    } else {
        ucs_fatal("Failed to poll receive CQ");
    }
}
Example #6
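/* Complete a posted datagram identified by id, either the wildcard datagram
 * of the iface or one posted on a specific endpoint. Returns UCS_INPROGRESS
 * when the datagram carries a payload that still has to be processed,
 * UCS_OK for a plain ack. */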
static ucs_status_t recieve_datagram(uct_ugni_udt_iface_t *iface, uint64_t id, uct_ugni_udt_ep_t **ep_out)
{
    uint32_t rem_addr, rem_id;
    gni_post_state_t post_state;
    gni_return_t ugni_rc;
    uct_ugni_udt_ep_t *ep;
    gni_ep_handle_t gni_ep;
    uct_ugni_udt_desc_t *desc;
    uct_ugni_udt_header_t *header;

    ucs_trace_func("iface=%p, id=%lx", iface, id);

    if (UCT_UGNI_UDT_ANY == id) {
        ep = NULL;
        gni_ep = iface->ep_any;
        desc = iface->desc_any;
    } else {
        ep = ucs_derived_of(uct_ugni_iface_lookup_ep(&iface->super, id),
                            uct_ugni_udt_ep_t);
        gni_ep = ep->super.ep;
        desc = ep->posted_desc;
    }

    *ep_out = ep;
    uct_ugni_device_lock(&iface->super.cdm);
    ugni_rc = GNI_EpPostDataWaitById(gni_ep, id, -1, &post_state, &rem_addr, &rem_id);
    uct_ugni_device_unlock(&iface->super.cdm);
    if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) {
        ucs_error("GNI_EpPostDataWaitById, id=%lu Error status: %s %d",
                  id, gni_err_str[ugni_rc], ugni_rc);
        return UCS_ERR_IO_ERROR;
    }
    if (GNI_POST_TERMINATED == post_state) {
        return UCS_ERR_CANCELED;
    }

    if (GNI_POST_COMPLETED != post_state) {
        ucs_error("GNI_EpPostDataWaitById gave unexpected response: %u", post_state);
        return UCS_ERR_IO_ERROR;
    }

    if (UCT_UGNI_UDT_ANY != id) {
        --iface->super.outstanding;
    }

    header = uct_ugni_udt_get_rheader(desc, iface);

    ucs_trace("Got datagram id: %lu type: %i len: %i am_id: %i", id, header->type, header->length, header->am_id);

    if (UCT_UGNI_UDT_PAYLOAD != header->type) {
        /* ack message, no data */
        ucs_assert_always(NULL != ep);
        ucs_mpool_put(ep->posted_desc);
        uct_ugni_check_flush(ep->desc_flush_group);
        ep->posted_desc = NULL;
        return UCS_OK;
    }

    return UCS_INPROGRESS;
}
Example #7
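/* Post up to max receive work requests to the shared receive queue,
 * taking buffers from the interface receive memory pool. */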
unsigned uct_rc_verbs_iface_post_recv_always(uct_rc_iface_t *iface, unsigned max)
{
    struct ibv_recv_wr *bad_wr;
    uct_ib_recv_wr_t *wrs;
    unsigned count;
    int ret;

    wrs  = ucs_alloca(sizeof *wrs  * max);

    count = uct_ib_iface_prepare_rx_wrs(&iface->super, &iface->rx.mp,
                                        wrs, max);
    if (ucs_unlikely(count == 0)) {
        return 0;
    }

    UCT_IB_INSTRUMENT_RECORD_RECV_WR_LEN("uct_rc_iface_post_recv_always",
                                         &wrs[0].ibwr);
    ret = ibv_post_srq_recv(iface->rx.srq, &wrs[0].ibwr, &bad_wr);
    if (ret != 0) {
        ucs_fatal("ibv_post_srq_recv() returned %d: %m", ret);
    }
    iface->rx.available -= count;

    return count;
}
Example #8
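/* Translate a generic fetch-and-op atomic into the device opcode and
 * operands, then post it on the DC endpoint. */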
static ucs_status_t UCS_F_ALWAYS_INLINE
uct_dc_mlx5_ep_atomic_fop_post(uct_ep_h tl_ep, unsigned opcode, unsigned size,
                               uint64_t value, void *result,
                               uint64_t remote_addr, uct_rkey_t rkey,
                               uct_completion_t *comp)
{
    uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t);
    int op;
    uint64_t compare_mask;
    uint64_t compare;
    uint64_t swap_mask;
    uint64_t swap;
    int      ext;
    ucs_status_t status;

    UCT_RC_MLX5_CHECK_ATOMIC_OPS(opcode, size, UCT_RC_MLX5_ATOMIC_FOPS);

    status = uct_rc_mlx5_iface_common_atomic_data(opcode, size, value, &op, &compare_mask,
                                                  &compare, &swap_mask, &swap, &ext);
    if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) {
        return status;
    }

    return uct_dc_mlx5_ep_atomic_fop(ep, op, result, ext, size, remote_addr, rkey,
                                     compare_mask, compare, swap_mask, swap, comp);
}
Example #9
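/* Flush the RC endpoint. A cancel flush purges all outstanding work;
 * otherwise a zero-length put is used to force a completion when there
 * are unsignaled sends in flight. */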
ucs_status_t uct_rc_verbs_ep_flush(uct_ep_h tl_ep, unsigned flags,
                                   uct_completion_t *comp)
{
    uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_verbs_iface_t);
    uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t);
    ucs_status_t status;

    if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) {
        uct_ep_pending_purge(&ep->super.super.super, NULL, 0);
        uct_rc_verbs_ep_handle_failure(ep, UCS_ERR_CANCELED);
        return UCS_OK;
    }

    status = uct_rc_ep_flush(&ep->super, iface->config.tx_max_wr, flags);
    if (status != UCS_INPROGRESS) {
        return status;
    }

    if (uct_rc_txqp_unsignaled(&ep->super.txqp) != 0) {
        status = uct_rc_verbs_ep_put_short(tl_ep, NULL, 0, 0, 0);
        if (status != UCS_OK) {
            return status;
        }
    }

    return uct_rc_txqp_add_flush_comp(&iface->super, &ep->super.super,
                                      &ep->super.txqp, comp, ep->txcnt.pi);
}
Example #10
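/* Translate a generic atomic operation into the device opcode and operands
 * and post it using a descriptor from the atomic descriptor pool. */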
static ucs_status_t UCS_F_ALWAYS_INLINE
uct_dc_mlx5_ep_atomic_op_post(uct_ep_h tl_ep, unsigned opcode, unsigned size,
                              uint64_t value, uint64_t remote_addr, uct_rkey_t rkey)
{
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t);
    uct_dc_mlx5_ep_t *ep       = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t);
    uct_rc_iface_send_desc_t *desc;
    int op;
    uint64_t compare_mask;
    uint64_t compare;
    uint64_t swap_mask;
    uint64_t swap;
    int      ext; /* not used here */
    ucs_status_t status;

    UCT_DC_MLX5_CHECK_RES(iface, ep);
    UCT_RC_MLX5_CHECK_ATOMIC_OPS(opcode, size, UCT_RC_MLX5_ATOMIC_OPS);

    status = uct_rc_mlx5_iface_common_atomic_data(opcode, size, value, &op, &compare_mask,
                                                  &compare, &swap_mask, &swap, &ext);
    if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) {
        return status;
    }

    UCT_RC_IFACE_GET_TX_ATOMIC_DESC(&iface->super.super, &iface->super.tx.atomic_desc_mp, desc);
    uct_dc_mlx5_iface_atomic_post(iface, ep, op, desc, size, remote_addr, rkey,
                                  compare_mask, compare, swap_mask, swap);
    return UCS_OK;
}
Example #11
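/* Stage a short message in device memory (DM) and post it as a bcopy
 * operation on the DC endpoint. */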
static ucs_status_t UCS_F_ALWAYS_INLINE
uct_dc_mlx5_ep_short_dm(uct_dc_mlx5_ep_t *ep, uct_rc_mlx5_dm_copy_data_t *cache,
                        size_t hdr_len, const void *payload, unsigned length,
                        unsigned opcode, uint8_t fm_ce_se,
                        uint64_t rdma_raddr, uct_rkey_t rdma_rkey)
{
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_dc_mlx5_iface_t);
    uct_rc_iface_send_desc_t *desc;
    void *buffer;
    ucs_status_t status;
    uct_ib_log_sge_t log_sge;

    status = uct_rc_mlx5_common_dm_make_data(&iface->super, cache, hdr_len,
                                             payload, length, &desc,
                                             &buffer, &log_sge);
    if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) {
        return status;
    }

    uct_dc_mlx5_iface_bcopy_post(iface, ep, opcode,
                                 hdr_len + length,
                                 rdma_raddr, rdma_rkey,
                                 desc, fm_ce_se, 0, buffer,
                                 log_sge.num_sge ? &log_sge : NULL);
    return UCS_OK;
}
Example #12
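/* Fill up to rx_max_batch receive WQEs with buffers from the memory pool
 * and publish them by updating the doorbell record. */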
static UCS_F_NOINLINE void
uct_ud_mlx5_iface_post_recv(uct_ud_mlx5_iface_t *iface)
{
    unsigned batch = iface->super.config.rx_max_batch;
    struct mlx5_wqe_data_seg *rx_wqes;
    uint16_t pi, next_pi, count;
    uct_ib_iface_recv_desc_t *desc;

    rx_wqes = iface->rx.wq.wqes;
    pi      = iface->rx.wq.rq_wqe_counter & iface->rx.wq.mask;

    for (count = 0; count < batch; count ++) {
        next_pi = (pi + 1) &  iface->rx.wq.mask;
        ucs_prefetch(rx_wqes + next_pi);
        UCT_TL_IFACE_GET_RX_DESC(&iface->super.super.super, &iface->super.rx.mp,
                                 desc, break);
        rx_wqes[pi].lkey = htonl(desc->lkey);
        rx_wqes[pi].addr = htonll((uintptr_t)uct_ib_iface_recv_desc_hdr(&iface->super.super, desc));
        pi = next_pi;
    }
    if (ucs_unlikely(count == 0)) {
        ucs_error("iface(%p) failed to post receive wqes", iface);
        return;
    }
    pi = iface->rx.wq.rq_wqe_counter + count;
    iface->rx.wq.rq_wqe_counter = pi;
    iface->super.rx.available -= count;
    ucs_memory_cpu_fence();
    *iface->rx.wq.dbrec = htonl(pi);
}
Example #13
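/* Drain the interface pending-operations queue, sending either a queued
 * skb or a freshly built control header for each pending endpoint. */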
static void uct_ud_verbs_iface_progress_pending(uct_ud_verbs_iface_t *iface)
{
    uct_ud_ep_t *ep;
    ucs_status_t status;
    uct_ud_neth_t neth;
    uct_ud_send_skb_t *skb;

    while (!ucs_queue_is_empty(&iface->super.tx.pending_ops)) {
        status = uct_ud_iface_get_next_pending(&iface->super, &ep, &neth, &skb);
        if (status == UCS_ERR_NO_RESOURCE) {
            return;
        }
        if (status == UCS_INPROGRESS) {
            continue;
        }

        if (ucs_unlikely(skb != NULL)) {
            /* TODO: not every skb is inline */
            iface->tx.sge[0].addr   = (uintptr_t) (skb->neth);
            iface->tx.sge[0].length = skb->len;
            uct_ud_verbs_iface_tx_ctl(iface, ucs_derived_of(ep, uct_ud_verbs_ep_t));
            uct_ud_ep_log_tx_tag("PENDING_TX: (skb)", ep, skb->neth, skb->len);
        } else {
            iface->tx.sge[0].addr   = (uintptr_t)&neth;
            iface->tx.sge[0].length = sizeof(neth);
            UCT_UD_EP_HOOK_CALL_TX(ep, &neth);
            uct_ud_verbs_iface_tx_ctl(iface, ucs_derived_of(ep, uct_ud_verbs_ep_t));
            uct_ud_ep_log_tx_tag("PENDING_TX: (neth)", ep, &neth, sizeof(neth));
        }
    }
}
Example #14
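/* Send an active message over GNI SMSG. On any failure the descriptor is
 * released back to its pool and UCS_ERR_NO_RESOURCE is returned; on success
 * the descriptor is tracked in the smsg hash until its completion arrives. */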
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_ugni_smsg_ep_am_common_send(uct_ugni_smsg_ep_t *ep, uct_ugni_smsg_iface_t *iface,
                                uint8_t am_id, unsigned header_length, void *header,
                                unsigned payload_length, void *payload, uct_ugni_smsg_desc_t *desc)
{
    gni_return_t gni_rc;

    if (ucs_unlikely(!uct_ugni_ep_can_send(&ep->super))) {
        goto exit_no_res;
    }

    desc->msg_id = iface->smsg_id++;
    desc->flush_group = ep->super.flush_group;
    uct_ugni_cdm_lock(&iface->super.cdm);
    gni_rc = GNI_SmsgSendWTag(ep->super.ep, header, header_length, 
                              payload, payload_length, desc->msg_id, am_id);
    uct_ugni_cdm_unlock(&iface->super.cdm);
    if (GNI_RC_SUCCESS != gni_rc) {
        goto exit_no_res;
    }

    ++desc->flush_group->flush_comp.count;
    ++iface->super.outstanding;

    sglib_hashed_uct_ugni_smsg_desc_t_add(iface->smsg_list, desc);

    return UCS_OK;

exit_no_res:
    ucs_trace("Smsg send failed.");
    ucs_mpool_put(desc);
    UCS_STATS_UPDATE_COUNTER(ep->super.super.stats, UCT_EP_STAT_NO_RES, 1);
    return UCS_ERR_NO_RESOURCE;
}
Example #15
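/* Allocate a new endpoint with an empty lane configuration and insert it
 * into the worker's endpoint hash, keyed by the destination UUID. */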
ucs_status_t ucp_ep_new(ucp_worker_h worker, uint64_t dest_uuid,
                        const char *peer_name, const char *message,
                        ucp_ep_h *ep_p)
{
    ucs_status_t status;
    ucp_ep_config_key_t key;
    ucp_ep_h ep;
    khiter_t hash_it;
    int hash_extra_status = 0;

    ep = ucs_calloc(1, sizeof(*ep), "ucp ep");
    if (ep == NULL) {
        ucs_error("Failed to allocate ep");
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    /* EP configuration without any lanes */
    memset(&key, 0, sizeof(key));
    key.rma_lane_map     = 0;
    key.amo_lane_map     = 0;
    key.reachable_md_map = 0;
    key.am_lane          = UCP_NULL_RESOURCE;
    key.rndv_lane        = UCP_NULL_RESOURCE;
    key.wireup_msg_lane  = UCP_NULL_LANE;
    key.num_lanes        = 0;
    memset(key.amo_lanes, UCP_NULL_LANE, sizeof(key.amo_lanes));

    ep->worker           = worker;
    ep->dest_uuid        = dest_uuid;
    ep->cfg_index        = ucp_worker_get_ep_config(worker, &key);
    ep->am_lane          = UCP_NULL_LANE;
    ep->flags            = 0;
#if ENABLE_DEBUG_DATA
    ucs_snprintf_zero(ep->peer_name, UCP_WORKER_NAME_MAX, "%s", peer_name);
#endif

    hash_it = kh_put(ucp_worker_ep_hash, &worker->ep_hash, dest_uuid,
                     &hash_extra_status);
    if (ucs_unlikely(hash_it == kh_end(&worker->ep_hash))) {
        ucs_error("Hash failed with ep %p to %s 0x%"PRIx64"->0x%"PRIx64" %s "
                  "with status %d", ep, peer_name, worker->uuid, ep->dest_uuid,
                  message, hash_extra_status);
        status = UCS_ERR_NO_RESOURCE;
        goto err_free_ep;
    }
    kh_value(&worker->ep_hash, hash_it) = ep;

    *ep_p = ep;
    ucs_debug("created ep %p to %s 0x%"PRIx64"->0x%"PRIx64" %s", ep, peer_name,
              worker->uuid, ep->dest_uuid, message);
    return UCS_OK;

err_free_ep:
    ucs_free(ep);
err:
    return status;
}
Example #16
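/* Ring the doorbell for a prepared WQE: write the doorbell record, copy
 * the WQE to the BlueFlame register, and advance the software producer
 * index, handling queue wrap-around. */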
static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_post_send(uct_rc_mlx5_ep_t *ep, struct mlx5_wqe_ctrl_seg *ctrl,
                      uint8_t opcode, uint8_t opmod, unsigned sig_flag, unsigned wqe_size)
{
    unsigned n, num_seg, num_bb;
    void *src, *dst;
    uint16_t sw_pi;

    num_seg = ucs_div_round_up(wqe_size, UCT_IB_MLX5_WQE_SEG_SIZE);
    num_bb  = ucs_div_round_up(wqe_size, MLX5_SEND_WQE_BB);
    sw_pi   = ep->tx.sw_pi;

    uct_rc_mlx5_set_ctrl_seg(ctrl, sw_pi, opcode, opmod, ep->qp_num, sig_flag,
                             num_seg);
    uct_ib_mlx5_log_tx(IBV_QPT_RC, ctrl, ep->tx.qstart, ep->tx.qend,
                       (opcode == MLX5_OPCODE_SEND) ? uct_rc_ep_am_packet_dump : NULL);

    /* TODO Put memory store fence here too, to prevent WC being flushed after DBrec */
    ucs_memory_cpu_store_fence();

    /* Write doorbell record */
    ep->tx.prev_sw_pi = sw_pi;
    *ep->tx.dbrec = htonl(sw_pi += num_bb);

    /* Make sure that doorbell record is written before ringing the doorbell */
    ucs_memory_bus_store_fence();

    /* Set up copy pointers */
    dst = ep->tx.bf_reg;
    src = ctrl;

    /* BF copy */
    /* TODO support DB without BF */
    ucs_assert(wqe_size <= ep->tx.bf_size);
    ucs_assert(num_bb <= UCT_RC_MLX5_MAX_BB);
    for (n = 0; n < num_bb; ++n) {
        uct_rc_mlx5_bf_copy_bb(dst, src);
        dst += MLX5_SEND_WQE_BB;
        src += MLX5_SEND_WQE_BB;
        if (ucs_unlikely(src == ep->tx.qend)) {
            src = ep->tx.qstart;
        }
    }

    /* We don't want the compiler to reorder instructions and hurt latency */
    ucs_compiler_fence();

    /* Advance queue pointer */
    ucs_assert(ctrl == ep->tx.seg);
    ep->tx.seg   = src;
    ep->tx.sw_pi = sw_pi;

    /* Flip BF register */
    ep->tx.bf_reg = (void*) ((uintptr_t) ep->tx.bf_reg ^ ep->tx.bf_size);

    uct_rc_ep_tx_posted(&ep->super, sig_flag & MLX5_WQE_CTRL_CQ_UPDATE);
}
Example #17
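/* Poll the shared-memory FIFO for incoming elements, making sure a receive
 * descriptor is available before reading. */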
static inline void uct_mm_iface_poll_fifo(uct_mm_iface_t *iface)
{
    uint64_t read_index_loc, read_index;
    uct_mm_fifo_element_t* read_index_elem;
    ucs_status_t status;

    /* check the memory pool to make sure that there is a new descriptor available */
    if (ucs_unlikely(iface->last_recv_desc == NULL)) {
        UCT_TL_IFACE_GET_RX_DESC(&iface->super, &iface->recv_desc_mp,
                                 iface->last_recv_desc, return);
    }

    /* ... the rest of the FIFO polling logic is truncated in this listing ... */
}
Example #18
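/* Choose the progress protocol for a tagged send based on the short, zcopy
 * and rendezvous thresholds, start the request, and return either an
 * immediate status or the request handle. */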
static UCS_F_ALWAYS_INLINE ucs_status_ptr_t
ucp_tag_send_req(ucp_request_t *req, size_t count,
                 const ucp_ep_msg_config_t* msg_config,
                 size_t rndv_rma_thresh, size_t rndv_am_thresh,
                 ucp_send_callback_t cb, const ucp_proto_t *proto)
{
    size_t seg_size     = (msg_config->max_bcopy - proto->only_hdr_size);
    size_t rndv_thresh  = ucp_tag_get_rndv_threshold(req, count,
                                                     msg_config->max_iov,
                                                     rndv_rma_thresh,
                                                     rndv_am_thresh, seg_size);
    size_t zcopy_thresh = ucp_proto_get_zcopy_threshold(req, msg_config, count,
                                                        rndv_thresh);
    ssize_t max_short   = ucp_proto_get_short_max(req, msg_config);
    ucs_status_t status;

    ucs_trace_req("select tag request(%p) progress algorithm datatype=%lx "
                  "buffer=%p length=%zu max_short=%zd rndv_thresh=%zu "
                  "zcopy_thresh=%zu",
                  req, req->send.datatype, req->send.buffer, req->send.length,
                  max_short, rndv_thresh, zcopy_thresh);

    status = ucp_request_send_start(req, max_short, zcopy_thresh, seg_size,
                                    rndv_thresh, proto);
    if (ucs_unlikely(status != UCS_OK)) {
        if (status == UCS_ERR_NO_PROGRESS) {
            ucs_assert(req->send.length >= rndv_thresh);
            /* RMA/AM rendezvous */
            status = ucp_tag_send_start_rndv(req);
        }
        if (status != UCS_OK) {
            return UCS_STATUS_PTR(status);
        }
    }

    ucp_request_send_tag_stat(req);

    /*
     * Start the request.
     * If it is completed immediately, release the request and return the status.
     * Otherwise, return the request.
     */
    status = ucp_request_send(req);
    if (req->flags & UCP_REQUEST_FLAG_COMPLETED) {
        ucs_trace_req("releasing send request %p, returning status %s", req,
                      ucs_status_string(status));
        ucp_request_put(req);
        return UCS_STATUS_PTR(status);
    }

    ucp_request_set_callback(req, send.cb, cb);
    ucs_trace_req("returning send request %p", req);
    return req + 1;
}
Example #19
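/* Post a zero-copy active message: a bounce buffer from the short-desc
 * pool carries the AM id and user header, while the payload is referenced
 * directly by a second scatter-gather entry. */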
ucs_status_t uct_rc_verbs_ep_am_zcopy(uct_ep_h tl_ep, uint8_t id, const void *header,
                                      unsigned header_length, const void *payload,
                                      size_t length, uct_mem_h memh,
                                      uct_completion_t *comp)
{
    uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_verbs_iface_t);
    uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t);
    struct ibv_mr *mr = memh;
    uct_rc_iface_send_desc_t *desc;
    struct ibv_send_wr wr;
    struct ibv_sge sge[2];
    uct_rc_hdr_t *rch;
    int send_flags;

    UCT_CHECK_AM_ID(id);
    UCT_CHECK_LENGTH(sizeof(*rch) + header_length, iface->config.short_desc_size,
                     "am_zcopy header");
    UCT_CHECK_LENGTH(header_length + length, iface->super.super.config.seg_size,
                     "am_zcopy payload");
    UCT_RC_VERBS_CHECK_RES(iface, ep);
    UCT_RC_IFACE_GET_TX_DESC(&iface->super, iface->short_desc_mp, desc);

    if (comp == NULL) {
        desc->super.handler   = (uct_rc_send_handler_t)ucs_mpool_put;
        send_flags            = 0;
    } else {
        desc->super.handler   = uct_rc_verbs_ep_am_zcopy_handler;
        desc->super.user_comp = comp;
        send_flags            = IBV_SEND_SIGNALED;
    }

    /* Header buffer: active message ID + user header */
    rch = (void*)(desc + 1);
    rch->am_id = id;
    memcpy(rch + 1, header, header_length);

    wr.sg_list    = sge;
    wr.opcode     = IBV_WR_SEND;
    sge[0].length = sizeof(*rch) + header_length;

    if (ucs_unlikely(length == 0)) {
        wr.num_sge    = 1;
    } else {
        wr.num_sge    = 2;
        sge[1].addr   = (uintptr_t)payload;
        sge[1].length = length;
        sge[1].lkey   = (mr == UCT_INVALID_MEM_HANDLE) ? 0 : mr->lkey;
    }

    UCT_TL_EP_STAT_OP(&ep->super.super, AM, ZCOPY, header_length + length);
    uct_rc_verbs_ep_post_send_desc(ep, &wr, desc, send_flags);
    return UCS_INPROGRESS;
}
Example #20
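/* Poll the RC send CQ; each completion carries the size of its moderation
 * batch in wr_id, so credits are restored in bulk and all outstanding
 * operations up to the completed serial number are retired. */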
static UCS_F_ALWAYS_INLINE void 
uct_rc_verbs_iface_poll_tx(uct_rc_verbs_iface_t *iface)
{
    struct ibv_wc wc[UCT_IB_MAX_WC];
    uct_rc_verbs_ep_t *ep;
    uct_rc_iface_send_op_t *op;
    unsigned count;
    uint16_t sn;
    int i, ret;

    ret = ibv_poll_cq(iface->super.super.send_cq, UCT_IB_MAX_WC, wc);
    if (ucs_unlikely(ret <= 0)) {
        if (ucs_unlikely(ret < 0)) {
            ucs_fatal("Failed to poll send CQ");
        }
        return;
    }

    for (i = 0; i < ret; ++i) {
        if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
            ucs_fatal("Send completion with error: %s", ibv_wc_status_str(wc[i].status));
        }

        UCS_STATS_UPDATE_COUNTER(iface->super.stats, UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

        ep = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super, wc[i].qp_num), uct_rc_verbs_ep_t);
        ucs_assert(ep != NULL);

        count = wc[i].wr_id + 1; /* number of sends completed by this signaled WC */
        ep->super.available         += count;
        ep->tx.completion_count     += count;
        ++iface->super.tx.cq_available;

        sn = ep->tx.completion_count;
        ucs_queue_for_each_extract(op, &ep->super.outstanding, queue,
                                   UCS_CIRCULAR_COMPARE16(op->sn, <=, sn)) {
            op->handler(op);
        }
    }
}
Example #21
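/* Non-blocking get: issue bcopy fragments of at most max_get_bcopy bytes;
 * on resource exhaustion, queue a pending request to resume later. */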
ucs_status_t ucp_get_nbi(ucp_ep_h ep, void *buffer, size_t length,
                         uint64_t remote_addr, ucp_rkey_h rkey)
{
    ucp_ep_rma_config_t *rma_config;
    ucs_status_t status;
    uct_rkey_t uct_rkey;
    size_t frag_length;
    uct_ep_h uct_ep;

    UCP_RMA_CHECK_PARAMS(buffer, length);

    for (;;) {
        UCP_EP_RESOLVE_RKEY_RMA(ep, rkey, uct_ep, uct_rkey, rma_config);
        frag_length = ucs_min(rma_config->max_get_bcopy, length);
        status = uct_ep_get_bcopy(uct_ep,
                                  (uct_unpack_callback_t)memcpy,
                                  (void*)buffer,
                                  frag_length,
                                  remote_addr,
                                  uct_rkey,
                                  NULL);
        if (ucs_likely(status == UCS_OK || status == UCS_INPROGRESS)) {
            /* Get was initiated */
            length -= frag_length;
            buffer += frag_length;
            remote_addr += frag_length;
            if (length == 0) {
                break;
            }
        } else if (ucs_unlikely(status == UCS_ERR_NO_RESOURCE)) {
            /* Out of resources - adding request for later schedule */
            ucp_request_t *req;
            req = ucs_mpool_get_inline(&ep->worker->req_mp);
            if (req == NULL) {
                /* can't allocate memory for request - abort */
                status = UCS_ERR_NO_MEMORY;
                break;
            }
            ucp_add_pending_rma(req, ep, uct_ep, buffer, length, remote_addr,
                                rkey, ucp_progress_get_nbi);

            /* Mark it as in progress */
            status = UCS_INPROGRESS;
            break;
        } else {
            /* Error */
            break;
        }
    }

    return status;
}
Example #22
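/* Poll a single send completion on the UD send CQ and restore the
 * moderated amount of tx credits. */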
static inline void uct_ud_verbs_iface_poll_tx(uct_ud_verbs_iface_t *iface)
{
    struct ibv_wc wc;
    int ret;

    ret = ibv_poll_cq(iface->super.super.send_cq, 1, &wc);
    if (ucs_unlikely(ret < 0)) {
        ucs_fatal("Failed to poll send CQ");
        return;
    }

    if (ret == 0) {
        return;
    }

    if (ucs_unlikely(wc.status != IBV_WC_SUCCESS)) {
        ucs_fatal("Send completion (wr_id=0x%0X with error: %s ", (unsigned)wc.wr_id, ibv_wc_status_str(wc.status));
        return;
    }

    iface->super.tx.available += UCT_UD_TX_MODERATION + 1;
}
Example #23
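/* Flush a DC endpoint: handle the cancel case, the case where no DCI is
 * assigned, and otherwise register a flush completion on the DCI txqp. */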
ucs_status_t uct_dc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp)
{
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t);
    uct_dc_mlx5_ep_t    *ep    = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t);
    ucs_status_t        status;
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) {
        if (ep->dci != UCT_DC_MLX5_EP_NO_DCI) {
            uct_rc_txqp_purge_outstanding(&iface->tx.dcis[ep->dci].txqp,
                                          UCS_ERR_CANCELED, 0);
#if ENABLE_ASSERT
            iface->tx.dcis[ep->dci].flags |= UCT_DC_DCI_FLAG_EP_CANCELED;
#endif
        }

        uct_ep_pending_purge(tl_ep, NULL, 0);
        return UCS_OK;
    }

    if (!uct_rc_iface_has_tx_resources(&iface->super.super)) {
        return UCS_ERR_NO_RESOURCE;
    }

    if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) {
        if (!uct_dc_mlx5_iface_dci_can_alloc(iface)) {
            return UCS_ERR_NO_RESOURCE; /* waiting for dci */
        } else {
            UCT_TL_EP_STAT_FLUSH(&ep->super); /* no sends */
            return UCS_OK;
        }
    }

    if (!uct_dc_mlx5_iface_dci_ep_can_send(ep)) {
        return UCS_ERR_NO_RESOURCE; /* cannot send */
    }

    status = uct_dc_mlx5_iface_flush_dci(iface, ep->dci);
    if (status == UCS_OK) {
        UCT_TL_EP_STAT_FLUSH(&ep->super);
        return UCS_OK; /* all sends completed */
    }

    ucs_assert(status == UCS_INPROGRESS);
    ucs_assert(ep->dci != UCT_DC_MLX5_EP_NO_DCI);

    UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq);

    return uct_rc_txqp_add_flush_comp(&iface->super.super, &ep->super, txqp,
                                      comp, txwq->sig_pi);
}
Example #24
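/* Copy matched receive data into the user buffer, failing with
 * UCS_ERR_MESSAGE_TRUNCATED if the buffer is too small. */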
static UCS_F_ALWAYS_INLINE ucs_status_t
ucp_tag_matched(void *buffer, size_t buffer_length, ucp_tag_t recv_tag,
                void *recv_data, size_t recv_length, ucp_tag_recv_completion_t *comp)
{
    ucs_debug("matched tag 0x%"PRIx64" ", (uint64_t)recv_tag);
    if (ucs_unlikely(recv_length > buffer_length)) {
        return UCS_ERR_MESSAGE_TRUNCATED;
    }

    memcpy(buffer, recv_data, recv_length);
    comp->rcvd_len   = recv_length;
    comp->sender_tag = recv_tag;
    return UCS_OK;
}
Example #25
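/* Arm a timer-wheel entry to expire delta time from now; a delta below
 * one wheel resolution is a fatal configuration error. */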
void __ucs_wtimer_add(ucs_twheel_t *t, ucs_wtimer_t *timer, ucs_time_t delta)
{
    uint64_t slot;

    timer->is_active = 1;
    slot = delta >> t->res_order;
    if (ucs_unlikely(slot == 0)) {
        /* Nothing is really wrong with adding a timer to the current slot.
         * However, we want to guard against spending too much time in
         * high-resolution timer processing. */
        ucs_fatal("Timer resolution is too low. Min resolution %lf usec, wanted %lf usec",
                  ucs_time_to_usec(t->res), ucs_time_to_usec(delta));
    }
    ucs_assert(slot > 0);

    if (ucs_unlikely(slot >= t->num_slots)) {
        slot = t->num_slots - 1;
    }

    slot = (t->current + slot) % t->num_slots;
    ucs_assert(slot != t->current);

    ucs_list_add_tail(&t->wheel[slot], &timer->list);
}
Example #26
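/* Zero-copy get. A misaligned local buffer (on Gemini devices), remote
 * address, or length falls back to the composed protocol; fully aligned
 * requests are posted directly as RDMA gets. */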
ucs_status_t uct_ugni_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt,
                                   uint64_t remote_addr, uct_rkey_t rkey,
                                   uct_completion_t *comp)
{
    uct_ugni_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_ep_t);
    uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ugni_rdma_iface_t);
    uct_ugni_base_desc_t *rdma;

    UCT_CHECK_PARAM_IOV(iov, iovcnt, buffer, length, memh);
    UCT_SKIP_ZERO_LENGTH(length);
    UCT_CHECK_LENGTH(ucs_align_up_pow2(length, UGNI_GET_ALIGN), 0,
                     iface->config.rdma_max_size, "get_zcopy");

    /* Special flow for unaligned data */
    if (ucs_unlikely((GNI_DEVICE_GEMINI == iface->super.dev->type && 
                      ucs_check_if_align_pow2((uintptr_t)buffer, UGNI_GET_ALIGN)) ||
                      ucs_check_if_align_pow2(remote_addr, UGNI_GET_ALIGN)        ||
                      ucs_check_if_align_pow2(length, UGNI_GET_ALIGN))) {
        return uct_ugni_ep_get_composed(tl_ep, buffer, length, memh,
                                        remote_addr, rkey, comp);
    }

    /* Everything is perfectly aligned */
    UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc, rdma,
                             return UCS_ERR_NO_RESOURCE);

    /* Setup Callback */
    uct_ugni_format_rdma(rdma, GNI_POST_RDMA_GET, buffer, remote_addr, memh, rkey,
                         ucs_align_up_pow2(length, UGNI_GET_ALIGN), ep, iface->super.local_cq, comp);

    ucs_trace_data("Posting GET ZCOPY, GNI_PostRdma of size %"PRIx64" (%lu) "
                   "from %p to %p, with [%"PRIx64" %"PRIx64"]",
                   rdma->desc.length, length,
                   (void *)rdma->desc.local_addr,
                   (void *)rdma->desc.remote_addr,
                   rdma->desc.remote_mem_hndl.qword1,
                   rdma->desc.remote_mem_hndl.qword2);
    UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, ZCOPY, length);
    return uct_ugni_post_rdma(iface, ep, rdma);
}
Example #27
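/* Advance the timer wheel up to current_time, firing every timer in the
 * slots that were passed over. */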
void __ucs_twheel_sweep(ucs_twheel_t *t, ucs_time_t current_time)
{
    ucs_wtimer_t *timer;
    uint64_t slot;

    slot   = (current_time - t->now) >> t->res_order;
    t->now = current_time;

    if (ucs_unlikely(slot >= t->num_slots)) {
        slot = t->num_slots - 1;
    }

    slot = (t->current + slot) % t->num_slots;

    for (; t->current != slot; t->current = (t->current+1) % t->num_slots) {
        while (!ucs_list_is_empty(&t->wheel[t->current])) {
            timer = ucs_list_extract_head(&t->wheel[t->current], ucs_wtimer_t, list);
            timer->is_active = 0;
            ucs_invoke_callback(&timer->cb);
        }
    }
}
Example #28
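/* Issue an asynchronous CUDA copy and record an event behind it; the
 * event is queued on the interface and later used to signal completion. */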
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_cuda_copy_post_cuda_async_copy(uct_ep_h tl_ep, void *dst, void *src, size_t length,
                                   int direction, cudaStream_t stream,
                                   ucs_queue_head_t *outstanding_queue,
                                   uct_completion_t *comp)
{
    uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cuda_copy_iface_t);
    uct_cuda_copy_event_desc_t *cuda_event;
    ucs_status_t status;

    if (!length) {
        return UCS_OK;
    }

    cuda_event = ucs_mpool_get(&iface->cuda_event_desc);
    if (ucs_unlikely(cuda_event == NULL)) {
        ucs_error("Failed to allocate cuda event object");
        return UCS_ERR_NO_MEMORY;
    }

    status = UCT_CUDA_FUNC(cudaMemcpyAsync(dst, src, length, direction, stream));
    if (UCS_OK != status) {
        ucs_mpool_put(cuda_event); /* return the event descriptor on error */
        return UCS_ERR_IO_ERROR;
    }

    status = UCT_CUDA_FUNC(cudaEventRecord(cuda_event->event, stream));
    if (UCS_OK != status) {
        ucs_mpool_put(cuda_event);
        return UCS_ERR_IO_ERROR;
    }
    ucs_queue_push(outstanding_queue, &cuda_event->queue);
    cuda_event->comp = comp;

    ucs_trace("cuda async issued :%p dst:%p, src:%p  len:%ld",
             cuda_event, dst, src, length);
    return UCS_INPROGRESS;
}
Example #29
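/* Common tagged-receive path: fast path for a single-fragment eager
 * message, otherwise initialize the request and either enqueue it as
 * expected or process the unexpected rendezvous / multi-fragment eager
 * descriptor. */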
static UCS_F_ALWAYS_INLINE void
ucp_tag_recv_common(ucp_worker_h worker, void *buffer, size_t count,
                    uintptr_t datatype, ucp_tag_t tag, ucp_tag_t tag_mask,
                    ucp_request_t *req, uint32_t req_flags, ucp_tag_recv_callback_t cb,
                    ucp_recv_desc_t *rdesc, const char *debug_name)
{
    unsigned common_flags = UCP_REQUEST_FLAG_RECV | UCP_REQUEST_FLAG_EXPECTED;
    ucp_eager_first_hdr_t *eagerf_hdr;
    ucp_request_queue_t *req_queue;
    uct_memory_type_t mem_type;
    size_t hdr_len, recv_len;
    ucs_status_t status;
    uint64_t msg_id;

    ucp_trace_req(req, "%s buffer %p dt 0x%lx count %zu tag %"PRIx64"/%"PRIx64,
                  debug_name, buffer, datatype, count, tag, tag_mask);

    /* First, check the fast path case - a single fragment.
     * In this case avoid initializing most of the request fields. */
    if (ucs_likely((rdesc != NULL) && (rdesc->flags & UCP_RECV_DESC_FLAG_EAGER_ONLY))) {
        UCS_PROFILE_REQUEST_EVENT(req, "eager_only_match", 0);
        UCP_WORKER_STAT_EAGER_MSG(worker, rdesc->flags);
        UCP_WORKER_STAT_EAGER_CHUNK(worker, UNEXP);

        if (ucs_unlikely(rdesc->flags & UCP_RECV_DESC_FLAG_EAGER_SYNC)) {
            ucp_tag_eager_sync_send_ack(worker, rdesc + 1, rdesc->flags);
        }

        req->flags                    = UCP_REQUEST_FLAG_RECV | req_flags;
        hdr_len                       = rdesc->payload_offset;
        recv_len                      = rdesc->length - hdr_len;
        req->recv.tag.info.sender_tag = ucp_rdesc_get_tag(rdesc);
        req->recv.tag.info.length     = recv_len;

        ucp_memory_type_detect_mds(worker->context, buffer, recv_len, &mem_type);

        status = ucp_dt_unpack_only(worker, buffer, count, datatype, mem_type,
                                    (void*)(rdesc + 1) + hdr_len, recv_len, 1);
        ucp_recv_desc_release(rdesc);

        if (req_flags & UCP_REQUEST_FLAG_CALLBACK) {
            cb(req + 1, status, &req->recv.tag.info);
        }
        ucp_tag_recv_request_completed(req, status, &req->recv.tag.info,
                                       debug_name);
        return;
    }

    /* Initialize receive request */
    req->status             = UCS_OK;
    req->recv.worker        = worker;
    req->recv.buffer        = buffer;
    req->recv.datatype      = datatype;

    ucp_dt_recv_state_init(&req->recv.state, buffer, datatype, count);

    if (!UCP_DT_IS_CONTIG(datatype)) {
        common_flags       |= UCP_REQUEST_FLAG_BLOCK_OFFLOAD;
    }

    req->flags              = common_flags | req_flags;
    req->recv.length        = ucp_dt_length(datatype, count, buffer,
                                            &req->recv.state);

    ucp_memory_type_detect_mds(worker->context, buffer, req->recv.length, &mem_type);

    req->recv.mem_type      = mem_type;
    req->recv.tag.tag       = tag;
    req->recv.tag.tag_mask  = tag_mask;
    req->recv.tag.cb        = cb;
    if (ucs_log_is_enabled(UCS_LOG_LEVEL_TRACE_REQ)) {
        req->recv.tag.info.sender_tag = 0;
    }

    if (ucs_unlikely(rdesc == NULL)) {
        /* If not found in the unexpected queue, wait until it arrives.
         * If it was found but the receive request is needed for later
         * completion, save it */
        req_queue = ucp_tag_exp_get_queue(&worker->tm, tag, tag_mask);

        /* If offload supported, post this tag to transport as well.
         * TODO: need to distinguish the cases when posting is not needed. */
        ucp_tag_offload_try_post(worker, req, req_queue);

        ucp_tag_exp_push(&worker->tm, req_queue, req);

        ucs_trace_req("%s returning expected request %p (%p)", debug_name, req,
                      req + 1);
        return;
    }

    /* Check rendezvous case */
    if (ucs_unlikely(rdesc->flags & UCP_RECV_DESC_FLAG_RNDV)) {
        ucp_rndv_matched(worker, req, (void*)(rdesc + 1));
        UCP_WORKER_STAT_RNDV(worker, UNEXP);
        ucp_recv_desc_release(rdesc);
        return;
    }

    if (ucs_unlikely(rdesc->flags & UCP_RECV_DESC_FLAG_EAGER_SYNC)) {
        ucp_tag_eager_sync_send_ack(worker, rdesc + 1, rdesc->flags);
    }

    UCP_WORKER_STAT_EAGER_MSG(worker, rdesc->flags);
    ucs_assert(rdesc->flags & UCP_RECV_DESC_FLAG_EAGER);
    eagerf_hdr                    = (void*)(rdesc + 1);
    req->recv.tag.info.sender_tag = ucp_rdesc_get_tag(rdesc);
    req->recv.tag.info.length     =
    req->recv.tag.remaining       = eagerf_hdr->total_len;

    /* process first fragment */
    UCP_WORKER_STAT_EAGER_CHUNK(worker, UNEXP);
    msg_id = eagerf_hdr->msg_id;
    status = ucp_tag_recv_request_process_rdesc(req, rdesc, 0);
    ucs_assert(status == UCS_INPROGRESS);

    /* process additional fragments */
    ucp_tag_frag_list_process_queue(&worker->tm, req, msg_id
                                    UCS_STATS_ARG(UCP_WORKER_STAT_TAG_RX_EAGER_CHUNK_UNEXP));
}
Example #30
/*
 * Generic data-pointer posting function.
 * Parameters which are not relevant to the opcode are ignored.
 *
 *            +--------+-----+-------+--------+-------+
 * SEND       | CTRL   | INL | am_id | am_hdr | DPSEG |
 *            +--------+-----+---+---+----+----+------+
 * RDMA_WRITE | CTRL   | RADDR   | DPSEG  |
 *            +--------+---------+--------+-------+
 * ATOMIC     | CTRL   | RADDR   | ATOMIC | DPSEG |
 *            +--------+---------+--------+-------+
 */
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_mlx5_ep_dptr_post(uct_rc_mlx5_ep_t *ep, unsigned opcode_flags,
                         const void *buffer, unsigned length, uint32_t *lkey_p,
                         /* SEND */ uint8_t am_id, const void *am_hdr, unsigned am_hdr_len,
                         /* RDMA/ATOMIC */ uint64_t remote_addr, uct_rkey_t rkey,
                         /* ATOMIC */ uint64_t compare_mask, uint64_t compare, uint64_t swap_add,
                         int signal)
{
    struct mlx5_wqe_ctrl_seg                     *ctrl;
    struct mlx5_wqe_raddr_seg                    *raddr;
    struct mlx5_wqe_atomic_seg                   *atomic;
    struct mlx5_wqe_data_seg                     *dptr;
    struct mlx5_wqe_inl_data_seg                 *inl;
    struct uct_ib_mlx5_atomic_masked_cswap32_seg *masked_cswap32;
    struct uct_ib_mlx5_atomic_masked_fadd32_seg  *masked_fadd32;
    struct uct_ib_mlx5_atomic_masked_cswap64_seg *masked_cswap64;

    uct_rc_mlx5_iface_t *iface;
    uct_rc_hdr_t        *rch;
    unsigned            wqe_size, inl_seg_size;
    uint8_t             opmod;

    iface = ucs_derived_of(ep->super.super.super.iface, uct_rc_mlx5_iface_t);
    if (!signal) {
        signal = uct_rc_iface_tx_moderation(&iface->super, &ep->super,
                                            MLX5_WQE_CTRL_CQ_UPDATE);
    } else {
        ucs_assert(signal == MLX5_WQE_CTRL_CQ_UPDATE);
    }

    opmod = 0;
    ctrl = ep->tx.seg;
    switch (opcode_flags) {
    case MLX5_OPCODE_SEND:
        UCT_CHECK_LENGTH(length + sizeof(*rch) + am_hdr_len,
                         iface->super.super.config.seg_size, "am_zcopy payload");

        inl_seg_size     = ucs_align_up_pow2(sizeof(*inl) + sizeof(*rch) + am_hdr_len,
                                             UCT_IB_MLX5_WQE_SEG_SIZE);
        UCT_CHECK_LENGTH(sizeof(*ctrl) + inl_seg_size + sizeof(*dptr),
                         UCT_RC_MLX5_MAX_BB * MLX5_SEND_WQE_BB, "am_zcopy header");

        /* Inline segment with AM ID and header */
        inl              = (void*)(ctrl + 1);
        inl->byte_count  = htonl((sizeof(*rch) + am_hdr_len) | MLX5_INLINE_SEG);
        rch              = (void*)(inl + 1);
        rch->am_id       = am_id;

        uct_rc_mlx5_inline_copy(rch + 1, am_hdr, am_hdr_len, ep);

        /* Data segment with payload */
        if (length == 0) {
            wqe_size     = sizeof(*ctrl) + inl_seg_size;
        } else {
            wqe_size     = sizeof(*ctrl) + inl_seg_size + sizeof(*dptr);
            dptr         = (void*)(ctrl + 1) + inl_seg_size;
            if (ucs_unlikely((void*)dptr >= ep->tx.qend)) {
                dptr = (void*)dptr - (ep->tx.qend - ep->tx.qstart);
            }

            ucs_assert((void*)dptr       >= ep->tx.qstart);
            ucs_assert((void*)(dptr + 1) <= ep->tx.qend);
            uct_rc_mlx5_ep_set_dptr_seg(dptr, buffer, length, *lkey_p);
        }
        break;

    case MLX5_OPCODE_SEND|UCT_RC_MLX5_OPCODE_FLAG_RAW:
        /* Data segment only */
        UCT_CHECK_LENGTH(length, iface->super.super.config.seg_size,
                         "send");
        ucs_assert(length < (2ul << 30));

        wqe_size         = sizeof(*ctrl) + sizeof(*dptr);
        uct_rc_mlx5_ep_set_dptr_seg((void*)(ctrl + 1), buffer, length, *lkey_p);
        break;

    case MLX5_OPCODE_RDMA_READ:
    case MLX5_OPCODE_RDMA_WRITE:
        /* Set RDMA segment */
        UCT_CHECK_LENGTH(length, UCT_IB_MAX_MESSAGE_SIZE, "put/get");

        raddr            = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        /* Data segment */
        if (length == 0) {
            wqe_size     = sizeof(*ctrl) + sizeof(*raddr);
        } else {
            wqe_size     = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*dptr);
            uct_rc_mlx5_ep_set_dptr_seg((void*)(raddr + 1), buffer, length, *lkey_p);
        }
        break;

    case MLX5_OPCODE_ATOMIC_FA:
    case MLX5_OPCODE_ATOMIC_CS:
        ucs_assert(length == sizeof(uint64_t));
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        atomic            = (void*)(raddr + 1);
        if (opcode_flags == MLX5_OPCODE_ATOMIC_CS) {
            atomic->compare = compare;
        }
        atomic->swap_add  = swap_add;

        uct_rc_mlx5_ep_set_dptr_seg((void*)(atomic + 1), buffer, length, *lkey_p);
        wqe_size          = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*atomic) +
                            sizeof(*dptr);
        break;

    case MLX5_OPCODE_ATOMIC_MASKED_CS:
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        switch (length) {
        case sizeof(uint32_t):
            opmod                        = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(2);
            masked_cswap32 = (void*)(raddr + 1);
            masked_cswap32->swap         = swap_add;
            masked_cswap32->compare      = compare;
            masked_cswap32->swap_mask    = (uint32_t)-1;
            masked_cswap32->compare_mask = compare_mask;
            dptr                         = (void*)(masked_cswap32 + 1);
            wqe_size                     = sizeof(*ctrl) + sizeof(*raddr) +
                                           sizeof(*masked_cswap32) + sizeof(*dptr);
            break;
        case sizeof(uint64_t):
            opmod                        = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(3); /* Ext. atomic, size 2**3 */
            masked_cswap64 = (void*)(raddr + 1);
            masked_cswap64->swap         = swap_add;
            masked_cswap64->compare      = compare;
            masked_cswap64->swap_mask    = (uint64_t)-1;
            masked_cswap64->compare_mask = compare_mask;
            dptr                         = (void*)(masked_cswap64 + 1);
            wqe_size                     = sizeof(*ctrl) + sizeof(*raddr) +
                                           sizeof(*masked_cswap64) + sizeof(*dptr);

            /* Handle QP wrap-around. It cannot happen in the middle of
             * masked-cswap segment, because it's still in the first BB.
             */
            ucs_assert((void*)dptr <= ep->tx.qend);
            if (dptr == ep->tx.qend) {
                dptr = ep->tx.qstart;
            } else {
                ucs_assert((void*)masked_cswap64 < ep->tx.qend);
            }
            break;
        default:
            ucs_assert(0);
        }

        uct_rc_mlx5_ep_set_dptr_seg(dptr, buffer, length, *lkey_p);
        break;

    case MLX5_OPCODE_ATOMIC_MASKED_FA:
        ucs_assert(length == sizeof(uint32_t));
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        opmod                         = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(2);
        masked_fadd32                 = (void*)(raddr + 1);
        masked_fadd32->add            = swap_add;
        masked_fadd32->filed_boundary = 0;

        uct_rc_mlx5_ep_set_dptr_seg((void*)(masked_fadd32 + 1), buffer, length,
                                    *lkey_p);
        wqe_size                      = sizeof(*ctrl) + sizeof(*raddr) +
                                        sizeof(*masked_fadd32) + sizeof(*dptr);
        break;

    default:
        return UCS_ERR_INVALID_PARAM;
    }

    uct_rc_mlx5_post_send(ep, ctrl, (opcode_flags & UCT_RC_MLX5_OPCODE_MASK),
                          opmod, signal, wqe_size);
    return UCS_OK;
}