static inline ssize_t uct_ugni_post_fma(uct_ugni_rdma_iface_t *iface,
                                        uct_ugni_ep_t *ep,
                                        uct_ugni_base_desc_t *fma,
                                        ssize_t ok_status)
{
    gni_return_t ugni_rc;

    if (ucs_unlikely(!uct_ugni_ep_can_send(ep))) {
        ucs_mpool_put(fma);
        return UCS_ERR_NO_RESOURCE;
    }

    uct_ugni_device_lock(&iface->super.cdm);
    ugni_rc = GNI_PostFma(ep->ep, &fma->desc);
    uct_ugni_device_unlock(&iface->super.cdm);

    if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) {
        ucs_mpool_put(fma);
        if (GNI_RC_ERROR_RESOURCE == ugni_rc || GNI_RC_ERROR_NOMEM == ugni_rc) {
            ucs_debug("GNI_PostFma failed, Error status: %s %d",
                      gni_err_str[ugni_rc], ugni_rc);
            return UCS_ERR_NO_RESOURCE;
        } else {
            ucs_error("GNI_PostFma failed, Error status: %s %d",
                      gni_err_str[ugni_rc], ugni_rc);
            return UCS_ERR_IO_ERROR;
        }
    }

    ++fma->flush_group->flush_comp.count;
    ++iface->super.outstanding;

    return ok_status;
}
static inline ucs_status_t uct_ud_verbs_iface_poll_rx(uct_ud_verbs_iface_t *iface)
{
    uct_ib_iface_recv_desc_t *desc;
    struct ibv_wc wc[UCT_IB_MAX_WC];
    int i, ret;
    char *packet;

    ret = ibv_poll_cq(iface->super.super.recv_cq, UCT_IB_MAX_WC, wc);
    if (ret == 0) {
        return UCS_ERR_NO_PROGRESS;
    }
    if (ucs_unlikely(ret < 0)) {
        ucs_fatal("Failed to poll receive CQ");
    }

    for (i = 0; i < ret; ++i) {
        if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
            ucs_fatal("Receive completion with error: %s",
                      ibv_wc_status_str(wc[i].status));
        }

        desc = (void*)wc[i].wr_id;
        ucs_trace_data("pkt rcvd: buf=%p len=%d", desc, wc[i].byte_len);
        packet = uct_ib_iface_recv_desc_hdr(&iface->super.super, desc);
        VALGRIND_MAKE_MEM_DEFINED(packet, wc[i].byte_len);

        uct_ud_ep_process_rx(&iface->super,
                             (uct_ud_neth_t *)(packet + UCT_IB_GRH_LEN),
                             wc[i].byte_len - UCT_IB_GRH_LEN,
                             (uct_ud_recv_skb_t *)desc);
    }

    iface->super.rx.available += ret;
    uct_ud_verbs_iface_post_recv(iface);
    return UCS_OK;
}
static inline ucs_status_t uct_ugni_post_rdma(uct_ugni_rdma_iface_t *iface,
                                              uct_ugni_ep_t *ep,
                                              uct_ugni_base_desc_t *rdma)
{
    gni_return_t ugni_rc;

    if (ucs_unlikely(!uct_ugni_can_send(ep))) {
        ucs_mpool_put(rdma);
        return UCS_ERR_NO_RESOURCE;
    }

    ugni_rc = GNI_PostRdma(ep->ep, &rdma->desc);
    if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) {
        ucs_mpool_put(rdma);
        if (GNI_RC_ERROR_RESOURCE == ugni_rc || GNI_RC_ERROR_NOMEM == ugni_rc) {
            ucs_debug("GNI_PostRdma failed, Error status: %s %d",
                      gni_err_str[ugni_rc], ugni_rc);
            return UCS_ERR_NO_RESOURCE;
        } else {
            ucs_error("GNI_PostRdma failed, Error status: %s %d",
                      gni_err_str[ugni_rc], ugni_rc);
            return UCS_ERR_IO_ERROR;
        }
    }

    ++ep->outstanding;
    ++iface->super.outstanding;

    return UCS_INPROGRESS;
}
static void *uct_ugni_udt_device_thread(void *arg)
{
    uct_ugni_udt_iface_t *iface = (uct_ugni_udt_iface_t *)arg;
    gni_return_t ugni_rc;
    uint64_t id;

    while (1) {
        pthread_mutex_lock(&iface->device_lock);
        while (iface->events_ready) {
            pthread_cond_wait(&iface->device_condition, &iface->device_lock);
        }
        pthread_mutex_unlock(&iface->device_lock);

        ugni_rc = GNI_PostdataProbeWaitById(uct_ugni_udt_iface_nic_handle(iface),
                                            -1, &id);
        if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) {
            ucs_error("GNI_PostDataProbeWaitById, Error status: %s %d",
                      gni_err_str[ugni_rc], ugni_rc);
            continue;
        }

        if (ucs_unlikely(UCT_UGNI_UDT_CANCEL == id)) {
            /* When the iface is torn down, it posts and then cancels a datagram
             * with a magic cookie as its id, which tells this thread to shut
             * down. */
            break;
        }

        iface->events_ready = 1;
        ucs_trace("Received a new datagram");
        ucs_async_pipe_push(&iface->event_pipe);
    }

    return NULL;
}
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_verbs_iface_poll_rx(uct_rc_verbs_iface_t *iface)
{
    uct_ib_iface_recv_desc_t *desc;
    uct_rc_hdr_t *hdr;
    struct ibv_wc wc[UCT_IB_MAX_WC];
    int i, ret;

    ret = ibv_poll_cq(iface->super.super.recv_cq, UCT_IB_MAX_WC, wc);
    if (ret > 0) {
        for (i = 0; i < ret; ++i) {
            if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
                ucs_fatal("Receive completion with error: %s",
                          ibv_wc_status_str(wc[i].status));
            }

            UCS_STATS_UPDATE_COUNTER(iface->super.stats,
                                     UCT_RC_IFACE_STAT_RX_COMPLETION, 1);

            desc = (void*)wc[i].wr_id;
            uct_ib_iface_desc_received(&iface->super.super, desc, wc[i].byte_len, 1);

            hdr = uct_ib_iface_recv_desc_hdr(&iface->super.super, desc);
            uct_ib_log_recv_completion(IBV_QPT_RC, &wc[i], hdr,
                                       uct_rc_ep_am_packet_dump);
            uct_rc_iface_invoke_am(&iface->super, hdr, wc[i].byte_len, desc);
        }
        iface->super.rx.available += ret;
        return UCS_OK;
    } else if (ret == 0) {
        uct_rc_verbs_iface_post_recv(iface, 0);
        return UCS_ERR_NO_PROGRESS;
    } else {
        ucs_fatal("Failed to poll receive CQ");
    }
}
static ucs_status_t recieve_datagram(uct_ugni_udt_iface_t *iface, uint64_t id,
                                     uct_ugni_udt_ep_t **ep_out)
{
    uint32_t rem_addr, rem_id;
    gni_post_state_t post_state;
    gni_return_t ugni_rc;
    uct_ugni_udt_ep_t *ep;
    gni_ep_handle_t gni_ep;
    uct_ugni_udt_desc_t *desc;
    uct_ugni_udt_header_t *header;

    ucs_trace_func("iface=%p, id=%lx", iface, id);

    if (UCT_UGNI_UDT_ANY == id) {
        ep     = NULL;
        gni_ep = iface->ep_any;
        desc   = iface->desc_any;
    } else {
        ep     = ucs_derived_of(uct_ugni_iface_lookup_ep(&iface->super, id),
                                uct_ugni_udt_ep_t);
        gni_ep = ep->super.ep;
        desc   = ep->posted_desc;
    }

    *ep_out = ep;

    uct_ugni_device_lock(&iface->super.cdm);
    ugni_rc = GNI_EpPostDataWaitById(gni_ep, id, -1, &post_state, &rem_addr,
                                     &rem_id);
    uct_ugni_device_unlock(&iface->super.cdm);

    if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) {
        ucs_error("GNI_EpPostDataWaitById, id=%lu Error status: %s %d",
                  id, gni_err_str[ugni_rc], ugni_rc);
        return UCS_ERR_IO_ERROR;
    }

    if (GNI_POST_TERMINATED == post_state) {
        return UCS_ERR_CANCELED;
    }

    if (GNI_POST_COMPLETED != post_state) {
        ucs_error("GNI_EpPostDataWaitById gave unexpected response: %u",
                  post_state);
        return UCS_ERR_IO_ERROR;
    }

    if (UCT_UGNI_UDT_ANY != id) {
        --iface->super.outstanding;
    }

    header = uct_ugni_udt_get_rheader(desc, iface);
    ucs_trace("Got datagram id: %lu type: %i len: %i am_id: %i",
              id, header->type, header->length, header->am_id);

    if (UCT_UGNI_UDT_PAYLOAD != header->type) {
        /* ack message, no data */
        ucs_assert_always(NULL != ep);
        ucs_mpool_put(ep->posted_desc);
        uct_ugni_check_flush(ep->desc_flush_group);
        ep->posted_desc = NULL;
        return UCS_OK;
    }

    return UCS_INPROGRESS;
}
unsigned uct_rc_verbs_iface_post_recv_always(uct_rc_iface_t *iface, unsigned max)
{
    struct ibv_recv_wr *bad_wr;
    uct_ib_recv_wr_t *wrs;
    unsigned count;
    int ret;

    wrs = ucs_alloca(sizeof(*wrs) * max);

    count = uct_ib_iface_prepare_rx_wrs(&iface->super, &iface->rx.mp, wrs, max);
    if (ucs_unlikely(count == 0)) {
        return 0;
    }

    UCT_IB_INSTRUMENT_RECORD_RECV_WR_LEN("uct_rc_iface_post_recv_always",
                                         &wrs[0].ibwr);
    ret = ibv_post_srq_recv(iface->rx.srq, &wrs[0].ibwr, &bad_wr);
    if (ret != 0) {
        ucs_fatal("ibv_post_srq_recv() returned %d: %m", ret);
    }

    iface->rx.available -= count;
    return count;
}
static ucs_status_t UCS_F_ALWAYS_INLINE
uct_dc_mlx5_ep_atomic_fop_post(uct_ep_h tl_ep, unsigned opcode, unsigned size,
                               uint64_t value, void *result,
                               uint64_t remote_addr, uct_rkey_t rkey,
                               uct_completion_t *comp)
{
    uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t);
    int op;
    uint64_t compare_mask;
    uint64_t compare;
    uint64_t swap_mask;
    uint64_t swap;
    int ext;
    ucs_status_t status;

    UCT_RC_MLX5_CHECK_ATOMIC_OPS(opcode, size, UCT_RC_MLX5_ATOMIC_FOPS);

    status = uct_rc_mlx5_iface_common_atomic_data(opcode, size, value, &op,
                                                  &compare_mask, &compare,
                                                  &swap_mask, &swap, &ext);
    if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) {
        return status;
    }

    return uct_dc_mlx5_ep_atomic_fop(ep, op, result, ext, size, remote_addr,
                                     rkey, compare_mask, compare, swap_mask,
                                     swap, comp);
}
ucs_status_t uct_rc_verbs_ep_flush(uct_ep_h tl_ep, unsigned flags,
                                   uct_completion_t *comp)
{
    uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                 uct_rc_verbs_iface_t);
    uct_rc_verbs_ep_t *ep       = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t);
    ucs_status_t status;

    if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) {
        uct_ep_pending_purge(&ep->super.super.super, NULL, 0);
        uct_rc_verbs_ep_handle_failure(ep, UCS_ERR_CANCELED);
        return UCS_OK;
    }

    status = uct_rc_ep_flush(&ep->super, iface->config.tx_max_wr, flags);
    if (status != UCS_INPROGRESS) {
        return status;
    }

    if (uct_rc_txqp_unsignaled(&ep->super.txqp) != 0) {
        status = uct_rc_verbs_ep_put_short(tl_ep, NULL, 0, 0, 0);
        if (status != UCS_OK) {
            return status;
        }
    }

    return uct_rc_txqp_add_flush_comp(&iface->super, &ep->super.super,
                                      &ep->super.txqp, comp, ep->txcnt.pi);
}
static ucs_status_t UCS_F_ALWAYS_INLINE
uct_dc_mlx5_ep_atomic_op_post(uct_ep_h tl_ep, unsigned opcode, unsigned size,
                              uint64_t value, uint64_t remote_addr,
                              uct_rkey_t rkey)
{
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t);
    uct_dc_mlx5_ep_t *ep       = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t);
    uct_rc_iface_send_desc_t *desc;
    int op;
    uint64_t compare_mask;
    uint64_t compare;
    uint64_t swap_mask;
    uint64_t swap;
    int ext; /* not used here */
    ucs_status_t status;

    UCT_DC_MLX5_CHECK_RES(iface, ep);
    UCT_RC_MLX5_CHECK_ATOMIC_OPS(opcode, size, UCT_RC_MLX5_ATOMIC_OPS);

    status = uct_rc_mlx5_iface_common_atomic_data(opcode, size, value, &op,
                                                  &compare_mask, &compare,
                                                  &swap_mask, &swap, &ext);
    if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) {
        return status;
    }

    UCT_RC_IFACE_GET_TX_ATOMIC_DESC(&iface->super.super,
                                    &iface->super.tx.atomic_desc_mp, desc);
    uct_dc_mlx5_iface_atomic_post(iface, ep, op, desc, size, remote_addr, rkey,
                                  compare_mask, compare, swap_mask, swap);
    return UCS_OK;
}
static ucs_status_t UCS_F_ALWAYS_INLINE
uct_dc_mlx5_ep_short_dm(uct_dc_mlx5_ep_t *ep, uct_rc_mlx5_dm_copy_data_t *cache,
                        size_t hdr_len, const void *payload, unsigned length,
                        unsigned opcode, uint8_t fm_ce_se,
                        uint64_t rdma_raddr, uct_rkey_t rdma_rkey)
{
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.iface,
                                                uct_dc_mlx5_iface_t);
    uct_rc_iface_send_desc_t *desc;
    void *buffer;
    ucs_status_t status;
    uct_ib_log_sge_t log_sge;

    status = uct_rc_mlx5_common_dm_make_data(&iface->super, cache, hdr_len,
                                             payload, length, &desc,
                                             &buffer, &log_sge);
    if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) {
        return status;
    }

    uct_dc_mlx5_iface_bcopy_post(iface, ep, opcode, hdr_len + length,
                                 rdma_raddr, rdma_rkey, desc, fm_ce_se, 0,
                                 buffer, log_sge.num_sge ? &log_sge : NULL);
    return UCS_OK;
}
static UCS_F_NOINLINE void
uct_ud_mlx5_iface_post_recv(uct_ud_mlx5_iface_t *iface)
{
    unsigned batch = iface->super.config.rx_max_batch;
    struct mlx5_wqe_data_seg *rx_wqes;
    uint16_t pi, next_pi, count;
    uct_ib_iface_recv_desc_t *desc;

    rx_wqes = iface->rx.wq.wqes;
    pi      = iface->rx.wq.rq_wqe_counter & iface->rx.wq.mask;

    for (count = 0; count < batch; count++) {
        next_pi = (pi + 1) & iface->rx.wq.mask;
        ucs_prefetch(rx_wqes + next_pi);
        UCT_TL_IFACE_GET_RX_DESC(&iface->super.super.super, &iface->super.rx.mp,
                                 desc, break);
        rx_wqes[pi].lkey = htonl(desc->lkey);
        rx_wqes[pi].addr = htonll((uintptr_t)
                                  uct_ib_iface_recv_desc_hdr(&iface->super.super,
                                                             desc));
        pi = next_pi;
    }
    if (ucs_unlikely(count == 0)) {
        ucs_error("iface(%p) failed to post receive wqes", iface);
        return;
    }

    pi                          = iface->rx.wq.rq_wqe_counter + count;
    iface->rx.wq.rq_wqe_counter = pi;
    iface->super.rx.available  -= count;
    ucs_memory_cpu_fence();
    *iface->rx.wq.dbrec = htonl(pi);
}
static void uct_ud_verbs_iface_progress_pending(uct_ud_verbs_iface_t *iface)
{
    uct_ud_ep_t *ep;
    ucs_status_t status;
    uct_ud_neth_t neth;
    uct_ud_send_skb_t *skb;

    while (!ucs_queue_is_empty(&iface->super.tx.pending_ops)) {
        status = uct_ud_iface_get_next_pending(&iface->super, &ep, &neth, &skb);
        if (status == UCS_ERR_NO_RESOURCE) {
            return;
        }
        if (status == UCS_INPROGRESS) {
            continue;
        }

        if (ucs_unlikely(skb != NULL)) {
            /* TODO: not every skb is inline */
            iface->tx.sge[0].addr   = (uintptr_t)(skb->neth);
            iface->tx.sge[0].length = skb->len;
            uct_ud_verbs_iface_tx_ctl(iface,
                                      ucs_derived_of(ep, uct_ud_verbs_ep_t));
            uct_ud_ep_log_tx_tag("PENDING_TX: (skb)", ep, skb->neth, skb->len);
        } else {
            iface->tx.sge[0].addr   = (uintptr_t)&neth;
            iface->tx.sge[0].length = sizeof(neth);
            UCT_UD_EP_HOOK_CALL_TX(ep, &neth);
            uct_ud_verbs_iface_tx_ctl(iface,
                                      ucs_derived_of(ep, uct_ud_verbs_ep_t));
            uct_ud_ep_log_tx_tag("PENDING_TX: (neth)", ep, &neth, sizeof(neth));
        }
    }
}
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_ugni_smsg_ep_am_common_send(uct_ugni_smsg_ep_t *ep,
                                uct_ugni_smsg_iface_t *iface, uint8_t am_id,
                                unsigned header_length, void *header,
                                unsigned payload_length, void *payload,
                                uct_ugni_smsg_desc_t *desc)
{
    gni_return_t gni_rc;

    if (ucs_unlikely(!uct_ugni_ep_can_send(&ep->super))) {
        goto exit_no_res;
    }

    desc->msg_id      = iface->smsg_id++;
    desc->flush_group = ep->super.flush_group;

    uct_ugni_cdm_lock(&iface->super.cdm);
    gni_rc = GNI_SmsgSendWTag(ep->super.ep, header, header_length, payload,
                              payload_length, desc->msg_id, am_id);
    uct_ugni_cdm_unlock(&iface->super.cdm);

    if (GNI_RC_SUCCESS != gni_rc) {
        goto exit_no_res;
    }

    ++desc->flush_group->flush_comp.count;
    ++iface->super.outstanding;
    sglib_hashed_uct_ugni_smsg_desc_t_add(iface->smsg_list, desc);
    return UCS_OK;

exit_no_res:
    ucs_trace("Smsg send failed.");
    ucs_mpool_put(desc);
    UCS_STATS_UPDATE_COUNTER(ep->super.super.stats, UCT_EP_STAT_NO_RES, 1);
    return UCS_ERR_NO_RESOURCE;
}
ucs_status_t ucp_ep_new(ucp_worker_h worker, uint64_t dest_uuid,
                        const char *peer_name, const char *message,
                        ucp_ep_h *ep_p)
{
    ucs_status_t status;
    ucp_ep_config_key_t key;
    ucp_ep_h ep;
    khiter_t hash_it;
    int hash_extra_status = 0;

    ep = ucs_calloc(1, sizeof(*ep), "ucp ep");
    if (ep == NULL) {
        ucs_error("Failed to allocate ep");
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    /* EP configuration without any lanes */
    memset(&key, 0, sizeof(key));
    key.rma_lane_map     = 0;
    key.amo_lane_map     = 0;
    key.reachable_md_map = 0;
    key.am_lane          = UCP_NULL_RESOURCE;
    key.rndv_lane        = UCP_NULL_RESOURCE;
    key.wireup_msg_lane  = UCP_NULL_LANE;
    key.num_lanes        = 0;
    memset(key.amo_lanes, UCP_NULL_LANE, sizeof(key.amo_lanes));

    ep->worker    = worker;
    ep->dest_uuid = dest_uuid;
    ep->cfg_index = ucp_worker_get_ep_config(worker, &key);
    ep->am_lane   = UCP_NULL_LANE;
    ep->flags     = 0;
#if ENABLE_DEBUG_DATA
    ucs_snprintf_zero(ep->peer_name, UCP_WORKER_NAME_MAX, "%s", peer_name);
#endif

    hash_it = kh_put(ucp_worker_ep_hash, &worker->ep_hash, dest_uuid,
                     &hash_extra_status);
    if (ucs_unlikely(hash_it == kh_end(&worker->ep_hash))) {
        ucs_error("Hash failed with ep %p to %s 0x%"PRIx64"->0x%"PRIx64" %s "
                  "with status %d", ep, peer_name, worker->uuid, ep->dest_uuid,
                  message, hash_extra_status);
        status = UCS_ERR_NO_RESOURCE;
        goto err_free_ep;
    }
    kh_value(&worker->ep_hash, hash_it) = ep;

    *ep_p = ep;
    ucs_debug("created ep %p to %s 0x%"PRIx64"->0x%"PRIx64" %s", ep, peer_name,
              worker->uuid, ep->dest_uuid, message);
    return UCS_OK;

err_free_ep:
    ucs_free(ep);
err:
    return status;
}
static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_post_send(uct_rc_mlx5_ep_t *ep, struct mlx5_wqe_ctrl_seg *ctrl,
                      uint8_t opcode, uint8_t opmod, unsigned sig_flag,
                      unsigned wqe_size)
{
    unsigned n, num_seg, num_bb;
    void *src, *dst;
    uint16_t sw_pi;

    num_seg = ucs_div_round_up(wqe_size, UCT_IB_MLX5_WQE_SEG_SIZE);
    num_bb  = ucs_div_round_up(wqe_size, MLX5_SEND_WQE_BB);
    sw_pi   = ep->tx.sw_pi;

    uct_rc_mlx5_set_ctrl_seg(ctrl, sw_pi, opcode, opmod, ep->qp_num, sig_flag,
                             num_seg);

    uct_ib_mlx5_log_tx(IBV_QPT_RC, ctrl, ep->tx.qstart, ep->tx.qend,
                       (opcode == MLX5_OPCODE_SEND) ? uct_rc_ep_am_packet_dump :
                                                      NULL);

    /* TODO Put memory store fence here too, to prevent WC being flushed after
     * DBrec */
    ucs_memory_cpu_store_fence();

    /* Write doorbell record */
    ep->tx.prev_sw_pi = sw_pi;
    *ep->tx.dbrec = htonl(sw_pi += num_bb);

    /* Make sure that doorbell record is written before ringing the doorbell */
    ucs_memory_bus_store_fence();

    /* Set up copy pointers */
    dst = ep->tx.bf_reg;
    src = ctrl;

    /* BF copy */
    /* TODO support DB without BF */
    ucs_assert(wqe_size <= ep->tx.bf_size);
    ucs_assert(num_bb <= UCT_RC_MLX5_MAX_BB);
    for (n = 0; n < num_bb; ++n) {
        uct_rc_mlx5_bf_copy_bb(dst, src);
        dst += MLX5_SEND_WQE_BB;
        src += MLX5_SEND_WQE_BB;
        if (ucs_unlikely(src == ep->tx.qend)) {
            src = ep->tx.qstart;
        }
    }

    /* We don't want the compiler to reorder instructions and hurt latency */
    ucs_compiler_fence();

    /* Advance queue pointer */
    ucs_assert(ctrl == ep->tx.seg);
    ep->tx.seg   = src;
    ep->tx.sw_pi = sw_pi;

    /* Flip BF register */
    ep->tx.bf_reg = (void*)((uintptr_t)ep->tx.bf_reg ^ ep->tx.bf_size);

    uct_rc_ep_tx_posted(&ep->super, sig_flag & MLX5_WQE_CTRL_CQ_UPDATE);
}
static inline void uct_mm_iface_poll_fifo(uct_mm_iface_t *iface)
{
    uint64_t read_index_loc, read_index;
    uct_mm_fifo_element_t* read_index_elem;
    ucs_status_t status;

    /* check the memory pool to make sure that there is a new descriptor
     * available */
    if (ucs_unlikely(iface->last_recv_desc == NULL)) {
        UCT_TL_IFACE_GET_RX_DESC(&iface->super, &iface->recv_desc_mp,
                                 iface->last_recv_desc, return);
    }
static UCS_F_ALWAYS_INLINE ucs_status_ptr_t
ucp_tag_send_req(ucp_request_t *req, size_t count,
                 const ucp_ep_msg_config_t* msg_config,
                 size_t rndv_rma_thresh, size_t rndv_am_thresh,
                 ucp_send_callback_t cb, const ucp_proto_t *proto)
{
    size_t seg_size     = (msg_config->max_bcopy - proto->only_hdr_size);
    size_t rndv_thresh  = ucp_tag_get_rndv_threshold(req, count,
                                                     msg_config->max_iov,
                                                     rndv_rma_thresh,
                                                     rndv_am_thresh, seg_size);
    size_t zcopy_thresh = ucp_proto_get_zcopy_threshold(req, msg_config, count,
                                                        rndv_thresh);
    ssize_t max_short   = ucp_proto_get_short_max(req, msg_config);
    ucs_status_t status;

    ucs_trace_req("select tag request(%p) progress algorithm datatype=%lx "
                  "buffer=%p length=%zu max_short=%zd rndv_thresh=%zu "
                  "zcopy_thresh=%zu",
                  req, req->send.datatype, req->send.buffer, req->send.length,
                  max_short, rndv_thresh, zcopy_thresh);

    status = ucp_request_send_start(req, max_short, zcopy_thresh, seg_size,
                                    rndv_thresh, proto);
    if (ucs_unlikely(status != UCS_OK)) {
        if (status == UCS_ERR_NO_PROGRESS) {
            ucs_assert(req->send.length >= rndv_thresh);
            /* RMA/AM rendezvous */
            status = ucp_tag_send_start_rndv(req);
        }
        if (status != UCS_OK) {
            return UCS_STATUS_PTR(status);
        }
    }

    ucp_request_send_tag_stat(req);

    /*
     * Start the request.
     * If it is completed immediately, release the request and return the status.
     * Otherwise, return the request.
     */
    status = ucp_request_send(req);
    if (req->flags & UCP_REQUEST_FLAG_COMPLETED) {
        ucs_trace_req("releasing send request %p, returning status %s", req,
                      ucs_status_string(status));
        ucp_request_put(req);
        return UCS_STATUS_PTR(status);
    }

    ucp_request_set_callback(req, send.cb, cb);
    ucs_trace_req("returning send request %p", req);
    return req + 1;
}
ucs_status_t uct_rc_verbs_ep_am_zcopy(uct_ep_h tl_ep, uint8_t id,
                                      const void *header,
                                      unsigned header_length,
                                      const void *payload, size_t length,
                                      uct_mem_h memh, uct_completion_t *comp)
{
    uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                 uct_rc_verbs_iface_t);
    uct_rc_verbs_ep_t *ep       = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t);
    struct ibv_mr *mr           = memh;
    uct_rc_iface_send_desc_t *desc;
    struct ibv_send_wr wr;
    struct ibv_sge sge[2];
    uct_rc_hdr_t *rch;
    int send_flags;

    UCT_CHECK_AM_ID(id);
    UCT_CHECK_LENGTH(sizeof(*rch) + header_length, iface->config.short_desc_size,
                     "am_zcopy header");
    UCT_CHECK_LENGTH(header_length + length, iface->super.super.config.seg_size,
                     "am_zcopy payload");

    UCT_RC_VERBS_CHECK_RES(iface, ep);
    UCT_RC_IFACE_GET_TX_DESC(&iface->super, iface->short_desc_mp, desc);
    if (comp == NULL) {
        desc->super.handler = (uct_rc_send_handler_t)ucs_mpool_put;
        send_flags          = 0;
    } else {
        desc->super.handler   = uct_rc_verbs_ep_am_zcopy_handler;
        desc->super.user_comp = comp;
        send_flags            = IBV_SEND_SIGNALED;
    }

    /* Header buffer: active message ID + user header */
    rch = (void*)(desc + 1);
    rch->am_id = id;
    memcpy(rch + 1, header, header_length);

    wr.sg_list    = sge;
    wr.opcode     = IBV_WR_SEND;
    sge[0].length = sizeof(*rch) + header_length;

    if (ucs_unlikely(length == 0)) {
        wr.num_sge = 1;
    } else {
        wr.num_sge    = 2;
        sge[1].addr   = (uintptr_t)payload;
        sge[1].length = length;
        sge[1].lkey   = (mr == UCT_INVALID_MEM_HANDLE) ? 0 : mr->lkey;
    }

    UCT_TL_EP_STAT_OP(&ep->super.super, AM, ZCOPY, header_length + length);
    uct_rc_verbs_ep_post_send_desc(ep, &wr, desc, send_flags);
    return UCS_INPROGRESS;
}
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_iface_poll_tx(uct_rc_verbs_iface_t *iface)
{
    struct ibv_wc wc[UCT_IB_MAX_WC];
    uct_rc_verbs_ep_t *ep;
    uct_rc_iface_send_op_t *op;
    unsigned count;
    uint16_t sn;
    int i, ret;

    ret = ibv_poll_cq(iface->super.super.send_cq, UCT_IB_MAX_WC, wc);
    if (ucs_unlikely(ret <= 0)) {
        if (ucs_unlikely(ret < 0)) {
            ucs_fatal("Failed to poll send CQ");
        }
        return;
    }

    for (i = 0; i < ret; ++i) {
        if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
            ucs_fatal("Send completion with error: %s",
                      ibv_wc_status_str(wc[i].status));
        }

        UCS_STATS_UPDATE_COUNTER(iface->super.stats,
                                 UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

        ep = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super, wc[i].qp_num),
                            uct_rc_verbs_ep_t);
        ucs_assert(ep != NULL);

        count = wc[i].wr_id + 1; /* Number of sends with WC completes in batch */
        ep->super.available     += count;
        ep->tx.completion_count += count;
        ++iface->super.tx.cq_available;

        sn = ep->tx.completion_count;
        ucs_queue_for_each_extract(op, &ep->super.outstanding, queue,
                                   UCS_CIRCULAR_COMPARE16(op->sn, <=, sn)) {
            op->handler(op);
        }
    }
}
ucs_status_t ucp_get_nbi(ucp_ep_h ep, void *buffer, size_t length,
                         uint64_t remote_addr, ucp_rkey_h rkey)
{
    ucp_ep_rma_config_t *rma_config;
    ucs_status_t status;
    uct_rkey_t uct_rkey;
    size_t frag_length;
    uct_ep_h uct_ep;

    UCP_RMA_CHECK_PARAMS(buffer, length);

    for (;;) {
        UCP_EP_RESOLVE_RKEY_RMA(ep, rkey, uct_ep, uct_rkey, rma_config);
        frag_length = ucs_min(rma_config->max_get_bcopy, length);
        status = uct_ep_get_bcopy(uct_ep, (uct_unpack_callback_t)memcpy,
                                  (void*)buffer, frag_length, remote_addr,
                                  uct_rkey, NULL);
        if (ucs_likely(status == UCS_OK || status == UCS_INPROGRESS)) {
            /* Get was initiated */
            length      -= frag_length;
            buffer      += frag_length;
            remote_addr += frag_length;
            if (length == 0) {
                break;
            }
        } else if (ucs_unlikely(status == UCS_ERR_NO_RESOURCE)) {
            /* Out of resources - adding request for later schedule */
            ucp_request_t *req;

            req = ucs_mpool_get_inline(&ep->worker->req_mp);
            if (req == NULL) {
                /* can't allocate memory for request - abort */
                status = UCS_ERR_NO_MEMORY;
                break;
            }
            ucp_add_pending_rma(req, ep, uct_ep, buffer, length, remote_addr,
                                rkey, ucp_progress_get_nbi);

            /* Mark it as in progress */
            status = UCS_INPROGRESS;
            break;
        } else {
            /* Error */
            break;
        }
    }

    return status;
}
static inline void uct_ud_verbs_iface_poll_tx(uct_ud_verbs_iface_t *iface)
{
    struct ibv_wc wc;
    int ret;

    ret = ibv_poll_cq(iface->super.super.send_cq, 1, &wc);
    if (ucs_unlikely(ret < 0)) {
        ucs_fatal("Failed to poll send CQ");
        return;
    }

    if (ret == 0) {
        return;
    }

    if (ucs_unlikely(wc.status != IBV_WC_SUCCESS)) {
        ucs_fatal("Send completion (wr_id=0x%0X) with error: %s",
                  (unsigned)wc.wr_id, ibv_wc_status_str(wc.status));
        return;
    }

    iface->super.tx.available += UCT_UD_TX_MODERATION + 1;
}
ucs_status_t uct_dc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags,
                                  uct_completion_t *comp)
{
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t);
    uct_dc_mlx5_ep_t *ep       = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t);
    ucs_status_t status;
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) {
        if (ep->dci != UCT_DC_MLX5_EP_NO_DCI) {
            uct_rc_txqp_purge_outstanding(&iface->tx.dcis[ep->dci].txqp,
                                          UCS_ERR_CANCELED, 0);
#if ENABLE_ASSERT
            iface->tx.dcis[ep->dci].flags |= UCT_DC_DCI_FLAG_EP_CANCELED;
#endif
        }
        uct_ep_pending_purge(tl_ep, NULL, 0);
        return UCS_OK;
    }

    if (!uct_rc_iface_has_tx_resources(&iface->super.super)) {
        return UCS_ERR_NO_RESOURCE;
    }

    if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) {
        if (!uct_dc_mlx5_iface_dci_can_alloc(iface)) {
            return UCS_ERR_NO_RESOURCE; /* waiting for dci */
        } else {
            UCT_TL_EP_STAT_FLUSH(&ep->super); /* no sends */
            return UCS_OK;
        }
    }

    if (!uct_dc_mlx5_iface_dci_ep_can_send(ep)) {
        return UCS_ERR_NO_RESOURCE; /* cannot send */
    }

    status = uct_dc_mlx5_iface_flush_dci(iface, ep->dci);
    if (status == UCS_OK) {
        UCT_TL_EP_STAT_FLUSH(&ep->super);
        return UCS_OK; /* all sends completed */
    }

    ucs_assert(status == UCS_INPROGRESS);
    ucs_assert(ep->dci != UCT_DC_MLX5_EP_NO_DCI);
    UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq);
    return uct_rc_txqp_add_flush_comp(&iface->super.super, &ep->super, txqp,
                                      comp, txwq->sig_pi);
}
static UCS_F_ALWAYS_INLINE ucs_status_t
ucp_tag_matched(void *buffer, size_t buffer_length, ucp_tag_t recv_tag,
                void *recv_data, size_t recv_length,
                ucp_tag_recv_completion_t *comp)
{
    ucs_debug("matched tag 0x%"PRIx64, (uint64_t)recv_tag);

    if (ucs_unlikely(recv_length > buffer_length)) {
        return UCS_ERR_MESSAGE_TRUNCATED;
    }

    memcpy(buffer, recv_data, recv_length);
    comp->rcvd_len   = recv_length;
    comp->sender_tag = recv_tag;
    return UCS_OK;
}
void __ucs_wtimer_add(ucs_twheel_t *t, ucs_wtimer_t *timer, ucs_time_t delta)
{
    uint64_t slot;

    timer->is_active = 1;
    slot = delta >> t->res_order;
    if (ucs_unlikely(slot == 0)) {
        /* Nothing is really wrong with adding the timer to the current slot.
         * However, we want to guard against the case where we spend too much
         * time in high-resolution timer processing. */
        ucs_fatal("Timer resolution is too low. Min resolution %lf usec, wanted %lf usec",
                  ucs_time_to_usec(t->res), ucs_time_to_usec(delta));
    }
    ucs_assert(slot > 0);

    if (ucs_unlikely(slot >= t->num_slots)) {
        slot = t->num_slots - 1;
    }

    slot = (t->current + slot) % t->num_slots;

    ucs_assert(slot != t->current);
    ucs_list_add_tail(&t->wheel[slot], &timer->list);
}
ucs_status_t uct_ugni_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov,
                                   size_t iovcnt, uint64_t remote_addr,
                                   uct_rkey_t rkey, uct_completion_t *comp)
{
    uct_ugni_ep_t *ep            = ucs_derived_of(tl_ep, uct_ugni_ep_t);
    uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                  uct_ugni_rdma_iface_t);
    uct_ugni_base_desc_t *rdma;

    UCT_CHECK_PARAM_IOV(iov, iovcnt, buffer, length, memh);
    UCT_SKIP_ZERO_LENGTH(length);
    UCT_CHECK_LENGTH(ucs_align_up_pow2(length, UGNI_GET_ALIGN), 0,
                     iface->config.rdma_max_size, "get_zcopy");

    /* Special flow for unaligned data */
    if (ucs_unlikely((GNI_DEVICE_GEMINI == iface->super.dev->type &&
                      ucs_check_if_align_pow2((uintptr_t)buffer, UGNI_GET_ALIGN)) ||
                     ucs_check_if_align_pow2(remote_addr, UGNI_GET_ALIGN) ||
                     ucs_check_if_align_pow2(length, UGNI_GET_ALIGN))) {
        return uct_ugni_ep_get_composed(tl_ep, buffer, length, memh, remote_addr,
                                        rkey, comp);
    }

    /* Everything is perfectly aligned */
    UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc, rdma,
                             return UCS_ERR_NO_RESOURCE);

    /* Setup Callback */
    uct_ugni_format_rdma(rdma, GNI_POST_RDMA_GET, buffer, remote_addr, memh,
                         rkey, ucs_align_up_pow2(length, UGNI_GET_ALIGN), ep,
                         iface->super.local_cq, comp);

    ucs_trace_data("Posting GET ZCOPY, GNI_PostRdma of size %"PRIx64" (%lu) "
                   "from %p to %p, with [%"PRIx64" %"PRIx64"]",
                   rdma->desc.length, length,
                   (void *)rdma->desc.local_addr,
                   (void *)rdma->desc.remote_addr,
                   rdma->desc.remote_mem_hndl.qword1,
                   rdma->desc.remote_mem_hndl.qword2);
    UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, ZCOPY, length);
    return uct_ugni_post_rdma(iface, ep, rdma);
}
void __ucs_twheel_sweep(ucs_twheel_t *t, ucs_time_t current_time)
{
    ucs_wtimer_t *timer;
    uint64_t slot;

    slot   = (current_time - t->now) >> t->res_order;
    t->now = current_time;

    if (ucs_unlikely(slot >= t->num_slots)) {
        slot = t->num_slots - 1;
    }

    slot = (t->current + slot) % t->num_slots;

    for (; t->current != slot; t->current = (t->current + 1) % t->num_slots) {
        while (!ucs_list_is_empty(&t->wheel[t->current])) {
            timer = ucs_list_extract_head(&t->wheel[t->current], ucs_wtimer_t,
                                          list);
            timer->is_active = 0;
            ucs_invoke_callback(&timer->cb);
        }
    }
}
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_cuda_copy_post_cuda_async_copy(uct_ep_h tl_ep, void *dst, void *src,
                                   size_t length, int direction,
                                   cudaStream_t stream,
                                   ucs_queue_head_t *outstanding_queue,
                                   uct_completion_t *comp)
{
    uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                  uct_cuda_copy_iface_t);
    uct_cuda_copy_event_desc_t *cuda_event;
    ucs_status_t status;

    if (!length) {
        return UCS_OK;
    }

    cuda_event = ucs_mpool_get(&iface->cuda_event_desc);
    if (ucs_unlikely(cuda_event == NULL)) {
        ucs_error("Failed to allocate cuda event object");
        return UCS_ERR_NO_MEMORY;
    }

    status = UCT_CUDA_FUNC(cudaMemcpyAsync(dst, src, length, direction, stream));
    if (UCS_OK != status) {
        /* return the unused event descriptor to the pool */
        ucs_mpool_put(cuda_event);
        return UCS_ERR_IO_ERROR;
    }
    status = UCT_CUDA_FUNC(cudaEventRecord(cuda_event->event, stream));
    if (UCS_OK != status) {
        /* return the unused event descriptor to the pool */
        ucs_mpool_put(cuda_event);
        return UCS_ERR_IO_ERROR;
    }

    ucs_queue_push(outstanding_queue, &cuda_event->queue);
    cuda_event->comp = comp;

    ucs_trace("cuda async issued :%p dst:%p, src:%p len:%ld",
              cuda_event, dst, src, length);
    return UCS_INPROGRESS;
}
static UCS_F_ALWAYS_INLINE void
ucp_tag_recv_common(ucp_worker_h worker, void *buffer, size_t count,
                    uintptr_t datatype, ucp_tag_t tag, ucp_tag_t tag_mask,
                    ucp_request_t *req, uint32_t req_flags,
                    ucp_tag_recv_callback_t cb, ucp_recv_desc_t *rdesc,
                    const char *debug_name)
{
    unsigned common_flags = UCP_REQUEST_FLAG_RECV | UCP_REQUEST_FLAG_EXPECTED;
    ucp_eager_first_hdr_t *eagerf_hdr;
    ucp_request_queue_t *req_queue;
    uct_memory_type_t mem_type;
    size_t hdr_len, recv_len;
    ucs_status_t status;
    uint64_t msg_id;

    ucp_trace_req(req, "%s buffer %p dt 0x%lx count %zu tag %"PRIx64"/%"PRIx64,
                  debug_name, buffer, datatype, count, tag, tag_mask);

    /* First, check the fast path case - single fragment
     * in this case avoid initializing most of request fields
     * */
    if (ucs_likely((rdesc != NULL) &&
                   (rdesc->flags & UCP_RECV_DESC_FLAG_EAGER_ONLY))) {
        UCS_PROFILE_REQUEST_EVENT(req, "eager_only_match", 0);
        UCP_WORKER_STAT_EAGER_MSG(worker, rdesc->flags);
        UCP_WORKER_STAT_EAGER_CHUNK(worker, UNEXP);

        if (ucs_unlikely(rdesc->flags & UCP_RECV_DESC_FLAG_EAGER_SYNC)) {
            ucp_tag_eager_sync_send_ack(worker, rdesc + 1, rdesc->flags);
        }

        req->flags = UCP_REQUEST_FLAG_RECV | req_flags;
        hdr_len    = rdesc->payload_offset;
        recv_len   = rdesc->length - hdr_len;

        req->recv.tag.info.sender_tag = ucp_rdesc_get_tag(rdesc);
        req->recv.tag.info.length     = recv_len;

        ucp_memory_type_detect_mds(worker->context, buffer, recv_len, &mem_type);

        status = ucp_dt_unpack_only(worker, buffer, count, datatype, mem_type,
                                    (void*)(rdesc + 1) + hdr_len, recv_len, 1);
        ucp_recv_desc_release(rdesc);

        if (req_flags & UCP_REQUEST_FLAG_CALLBACK) {
            cb(req + 1, status, &req->recv.tag.info);
        }
        ucp_tag_recv_request_completed(req, status, &req->recv.tag.info,
                                       debug_name);
        return;
    }

    /* Initialize receive request */
    req->status        = UCS_OK;
    req->recv.worker   = worker;
    req->recv.buffer   = buffer;
    req->recv.datatype = datatype;

    ucp_dt_recv_state_init(&req->recv.state, buffer, datatype, count);

    if (!UCP_DT_IS_CONTIG(datatype)) {
        common_flags |= UCP_REQUEST_FLAG_BLOCK_OFFLOAD;
    }

    req->flags       = common_flags | req_flags;
    req->recv.length = ucp_dt_length(datatype, count, buffer, &req->recv.state);

    ucp_memory_type_detect_mds(worker->context, buffer, req->recv.length,
                               &mem_type);

    req->recv.mem_type     = mem_type;
    req->recv.tag.tag      = tag;
    req->recv.tag.tag_mask = tag_mask;
    req->recv.tag.cb       = cb;

    if (ucs_log_is_enabled(UCS_LOG_LEVEL_TRACE_REQ)) {
        req->recv.tag.info.sender_tag = 0;
    }

    if (ucs_unlikely(rdesc == NULL)) {
        /* If not found on unexpected, wait until it arrives.
         * If was found but need this receive request for later completion,
         * save it */
        req_queue = ucp_tag_exp_get_queue(&worker->tm, tag, tag_mask);

        /* If offload supported, post this tag to transport as well.
         * TODO: need to distinguish the cases when posting is not needed. */
        ucp_tag_offload_try_post(worker, req, req_queue);

        ucp_tag_exp_push(&worker->tm, req_queue, req);

        ucs_trace_req("%s returning expected request %p (%p)", debug_name, req,
                      req + 1);
        return;
    }

    /* Check rendezvous case */
    if (ucs_unlikely(rdesc->flags & UCP_RECV_DESC_FLAG_RNDV)) {
        ucp_rndv_matched(worker, req, (void*)(rdesc + 1));
        UCP_WORKER_STAT_RNDV(worker, UNEXP);
        ucp_recv_desc_release(rdesc);
        return;
    }

    if (ucs_unlikely(rdesc->flags & UCP_RECV_DESC_FLAG_EAGER_SYNC)) {
        ucp_tag_eager_sync_send_ack(worker, rdesc + 1, rdesc->flags);
    }

    UCP_WORKER_STAT_EAGER_MSG(worker, rdesc->flags);
    ucs_assert(rdesc->flags & UCP_RECV_DESC_FLAG_EAGER);
    eagerf_hdr                    = (void*)(rdesc + 1);
    req->recv.tag.info.sender_tag = ucp_rdesc_get_tag(rdesc);
    req->recv.tag.info.length     =
    req->recv.tag.remaining       = eagerf_hdr->total_len;

    /* process first fragment */
    UCP_WORKER_STAT_EAGER_CHUNK(worker, UNEXP);
    msg_id = eagerf_hdr->msg_id;
    status = ucp_tag_recv_request_process_rdesc(req, rdesc, 0);
    ucs_assert(status == UCS_INPROGRESS);

    /* process additional fragments */
    ucp_tag_frag_list_process_queue(&worker->tm, req, msg_id
                                    UCS_STATS_ARG(UCP_WORKER_STAT_TAG_RX_EAGER_CHUNK_UNEXP));
}
/*
 * Generic data-pointer posting function.
 * Parameters which are not relevant to the opcode are ignored.
 *
 *            +------+-------+-------+--------+-------+
 * SEND       | CTRL |  INL  | am_id | am_hdr | DPSEG |
 *            +------+-------+-------+--------+-------+
 * RDMA_WRITE | CTRL | RADDR | DPSEG |
 *            +------+-------+-------+-------+
 * ATOMIC     | CTRL | RADDR | ATOMIC| DPSEG |
 *            +------+-------+-------+-------+
 */
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_mlx5_ep_dptr_post(uct_rc_mlx5_ep_t *ep, unsigned opcode_flags,
                         const void *buffer, unsigned length, uint32_t *lkey_p,
                         /* SEND */ uint8_t am_id, const void *am_hdr,
                         unsigned am_hdr_len,
                         /* RDMA/ATOMIC */ uint64_t remote_addr, uct_rkey_t rkey,
                         /* ATOMIC */ uint64_t compare_mask, uint64_t compare,
                         uint64_t swap_add, int signal)
{
    struct mlx5_wqe_ctrl_seg                     *ctrl;
    struct mlx5_wqe_raddr_seg                    *raddr;
    struct mlx5_wqe_atomic_seg                   *atomic;
    struct mlx5_wqe_data_seg                     *dptr;
    struct mlx5_wqe_inl_data_seg                 *inl;
    struct uct_ib_mlx5_atomic_masked_cswap32_seg *masked_cswap32;
    struct uct_ib_mlx5_atomic_masked_fadd32_seg  *masked_fadd32;
    struct uct_ib_mlx5_atomic_masked_cswap64_seg *masked_cswap64;
    uct_rc_mlx5_iface_t *iface;
    uct_rc_hdr_t *rch;
    unsigned wqe_size, inl_seg_size;
    uint8_t opmod;

    iface = ucs_derived_of(ep->super.super.super.iface, uct_rc_mlx5_iface_t);
    if (!signal) {
        signal = uct_rc_iface_tx_moderation(&iface->super, &ep->super,
                                            MLX5_WQE_CTRL_CQ_UPDATE);
    } else {
        ucs_assert(signal == MLX5_WQE_CTRL_CQ_UPDATE);
    }

    opmod = 0;
    ctrl  = ep->tx.seg;
    switch (opcode_flags) {
    case MLX5_OPCODE_SEND:
        UCT_CHECK_LENGTH(length + sizeof(*rch) + am_hdr_len,
                         iface->super.super.config.seg_size, "am_zcopy payload");
        inl_seg_size = ucs_align_up_pow2(sizeof(*inl) + sizeof(*rch) + am_hdr_len,
                                         UCT_IB_MLX5_WQE_SEG_SIZE);
        UCT_CHECK_LENGTH(sizeof(*ctrl) + inl_seg_size + sizeof(*dptr),
                         UCT_RC_MLX5_MAX_BB * MLX5_SEND_WQE_BB, "am_zcopy header");

        /* Inline segment with AM ID and header */
        inl             = (void*)(ctrl + 1);
        inl->byte_count = htonl((sizeof(*rch) + am_hdr_len) | MLX5_INLINE_SEG);
        rch             = (void*)(inl + 1);
        rch->am_id      = am_id;

        uct_rc_mlx5_inline_copy(rch + 1, am_hdr, am_hdr_len, ep);

        /* Data segment with payload */
        if (length == 0) {
            wqe_size = sizeof(*ctrl) + inl_seg_size;
        } else {
            wqe_size = sizeof(*ctrl) + inl_seg_size + sizeof(*dptr);
            dptr     = (void*)(ctrl + 1) + inl_seg_size;
            if (ucs_unlikely((void*)dptr >= ep->tx.qend)) {
                dptr = (void*)dptr - (ep->tx.qend - ep->tx.qstart);
            }

            ucs_assert((void*)dptr       >= ep->tx.qstart);
            ucs_assert((void*)(dptr + 1) <= ep->tx.qend);
            uct_rc_mlx5_ep_set_dptr_seg(dptr, buffer, length, *lkey_p);
        }
        break;

    case MLX5_OPCODE_SEND|UCT_RC_MLX5_OPCODE_FLAG_RAW:
        /* Data segment only */
        UCT_CHECK_LENGTH(length, iface->super.super.config.seg_size, "send");
        ucs_assert(length < (2ul << 30));
        wqe_size = sizeof(*ctrl) + sizeof(*dptr);
        uct_rc_mlx5_ep_set_dptr_seg((void*)(ctrl + 1), buffer, length, *lkey_p);
        break;

    case MLX5_OPCODE_RDMA_READ:
    case MLX5_OPCODE_RDMA_WRITE:
        /* Set RDMA segment */
        UCT_CHECK_LENGTH(length, UCT_IB_MAX_MESSAGE_SIZE, "put/get");

        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        /* Data segment */
        if (length == 0) {
            wqe_size = sizeof(*ctrl) + sizeof(*raddr);
        } else {
            wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*dptr);
            uct_rc_mlx5_ep_set_dptr_seg((void*)(raddr + 1), buffer, length,
                                        *lkey_p);
        }
        break;

    case MLX5_OPCODE_ATOMIC_FA:
    case MLX5_OPCODE_ATOMIC_CS:
        ucs_assert(length == sizeof(uint64_t));
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        atomic = (void*)(raddr + 1);
        if (opcode_flags == MLX5_OPCODE_ATOMIC_CS) {
            atomic->compare = compare;
        }
        atomic->swap_add = swap_add;

        uct_rc_mlx5_ep_set_dptr_seg((void*)(atomic + 1), buffer, length, *lkey_p);
        wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*atomic) +
                   sizeof(*dptr);
        break;

    case MLX5_OPCODE_ATOMIC_MASKED_CS:
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        switch (length) {
        case sizeof(uint32_t):
            opmod                        = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(2);
            masked_cswap32               = (void*)(raddr + 1);
            masked_cswap32->swap         = swap_add;
            masked_cswap32->compare      = compare;
            masked_cswap32->swap_mask    = (uint32_t)-1;
            masked_cswap32->compare_mask = compare_mask;
            dptr                         = (void*)(masked_cswap32 + 1);
            wqe_size                     = sizeof(*ctrl) + sizeof(*raddr) +
                                           sizeof(*masked_cswap32) + sizeof(*dptr);
            break;
        case sizeof(uint64_t):
            opmod                        = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(3); /* Ext. atomic, size 2**3 */
            masked_cswap64               = (void*)(raddr + 1);
            masked_cswap64->swap         = swap_add;
            masked_cswap64->compare      = compare;
            masked_cswap64->swap_mask    = (uint64_t)-1;
            masked_cswap64->compare_mask = compare_mask;
            dptr                         = (void*)(masked_cswap64 + 1);
            wqe_size                     = sizeof(*ctrl) + sizeof(*raddr) +
                                           sizeof(*masked_cswap64) + sizeof(*dptr);

            /* Handle QP wrap-around. It cannot happen in the middle of
             * masked-cswap segment, because it's still in the first BB.
             */
            ucs_assert((void*)dptr <= ep->tx.qend);
            if (dptr == ep->tx.qend) {
                dptr = ep->tx.qstart;
            } else {
                ucs_assert((void*)masked_cswap64 < ep->tx.qend);
            }
            break;
        default:
            ucs_assert(0);
        }

        uct_rc_mlx5_ep_set_dptr_seg(dptr, buffer, length, *lkey_p);
        break;

    case MLX5_OPCODE_ATOMIC_MASKED_FA:
        ucs_assert(length == sizeof(uint32_t));
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        opmod                         = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(2);
        masked_fadd32                 = (void*)(raddr + 1);
        masked_fadd32->add            = swap_add;
        masked_fadd32->filed_boundary = 0;

        uct_rc_mlx5_ep_set_dptr_seg((void*)(masked_fadd32 + 1), buffer, length,
                                    *lkey_p);
        wqe_size                      = sizeof(*ctrl) + sizeof(*raddr) +
                                        sizeof(*masked_fadd32) + sizeof(*dptr);
        break;

    default:
        return UCS_ERR_INVALID_PARAM;
    }

    uct_rc_mlx5_post_send(ep, ctrl, (opcode_flags & UCT_RC_MLX5_OPCODE_MASK),
                          opmod, signal, wqe_size);
    return UCS_OK;
}