/**
 * Hand one received FIFO element to the active-message handler.
 *
 * Inline (short) elements carry their payload right after the element header;
 * otherwise the payload is read from the receive descriptor the element
 * points to (chunk base address + offset).
 *
 * @param iface  Interface on which the element was received.
 * @param elem   FIFO element to process.
 *
 * @return The status returned by uct_mm_iface_invoke_am(). In the descriptor
 *         path, any status other than UCS_OK causes a new receive descriptor
 *         to be assigned to the element before returning.
 */
static inline ucs_status_t uct_mm_iface_process_recv(uct_mm_iface_t *iface,
                                                     uct_mm_fifo_element_t* elem)
{
    ucs_status_t status;
    void *data;

    if (ucs_likely(elem->flags & UCT_MM_FIFO_ELEM_FLAG_INLINE)) {
        /* short message: the payload is stored inside the FIFO element */
        uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_RECV,
                           elem->am_id, elem + 1, elem->length,
                           "RX: AM_SHORT");
        return uct_mm_iface_invoke_am(iface, elem->am_id, elem + 1,
                                      elem->length, 0);
    }

    /* bcopy message: the payload lives in a receive descriptor */
    data = elem->desc_chunk_base_addr + elem->desc_offset;
    VALGRIND_MAKE_MEM_DEFINED(data, elem->length);

    uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_RECV,
                       elem->am_id, data, elem->length, "RX: AM_BCOPY");

    status = uct_mm_iface_invoke_am(iface, elem->am_id, data, elem->length,
                                    UCT_CB_FLAG_DESC);
    if (status != UCS_OK) {
        /* give this FIFO element a fresh receive descriptor */
        uct_mm_assign_desc_to_fifo_elem(iface, elem, 0);
    }

    return status;
}
/**
 * Send a buffered-copy active message over a TCP endpoint.
 *
 * The message (header followed by the packed payload) is built directly in
 * the endpoint buffer and then handed to uct_tcp_ep_send(). If data remains
 * in the buffer after that call, EPOLLOUT is added to the endpoint events.
 *
 * @param uct_ep   Endpoint to send on.
 * @param am_id    Active message id stored in the header.
 * @param pack_cb  Callback which writes the payload and returns its size.
 * @param arg      Opaque argument forwarded to pack_cb.
 * @param flags    Not used by this function.
 *
 * @return Size of the packed payload, or UCS_ERR_NO_RESOURCE when
 *         uct_tcp_ep_can_send() reports the endpoint cannot send.
 */
ssize_t uct_tcp_ep_am_bcopy(uct_ep_h uct_ep, uint8_t am_id,
                            uct_pack_callback_t pack_cb, void *arg,
                            unsigned flags)
{
    uct_tcp_iface_t *iface = ucs_derived_of(uct_ep->iface, uct_tcp_iface_t);
    uct_tcp_ep_t *ep       = ucs_derived_of(uct_ep, uct_tcp_ep_t);
    uct_tcp_am_hdr_t *hdr;
    size_t packed_length;

    if (!uct_tcp_ep_can_send(ep)) {
        return UCS_ERR_NO_RESOURCE;
    }

    /* build header + payload in place inside the endpoint buffer */
    hdr           = ep->buf;
    hdr->am_id    = am_id;
    packed_length = pack_cb(hdr + 1, arg);
    hdr->length   = packed_length;
    ep->length    = sizeof(*hdr) + packed_length;

    UCT_TL_EP_STAT_OP(&ep->super, AM, BCOPY, hdr->length);
    uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_SEND, hdr->am_id,
                       hdr + 1, hdr->length, "SEND fd %d", ep->fd);

    iface->outstanding += ep->length;
    uct_tcp_ep_send(ep);

    if (ep->length > 0) {
        /* something is still buffered: ask to be notified on EPOLLOUT */
        uct_tcp_ep_mod_events(ep, EPOLLOUT, 0);
    }

    return packed_length;
}
/**
 * Send a buffered-copy active message through a uGNI SMSG endpoint.
 *
 * A TX descriptor is taken from the interface pool; the SMSG header is placed
 * right after the descriptor and the packed payload right after the header.
 *
 * @param tl_ep    Endpoint to send on.
 * @param id       Active message id.
 * @param pack_cb  Callback which writes the payload and returns its size.
 * @param arg      Opaque argument forwarded to pack_cb.
 *
 * @return Packed payload size when the common send routine returns UCS_OK,
 *         otherwise the status it returned. UCS_ERR_NO_RESOURCE if no TX
 *         descriptor could be obtained.
 */
ssize_t uct_ugni_smsg_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id,
                                  uct_pack_callback_t pack_cb, void *arg)
{
    uct_ugni_smsg_ep_t *ep       = ucs_derived_of(tl_ep, uct_ugni_smsg_ep_t);
    uct_ugni_smsg_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                  uct_ugni_smsg_iface_t);
    uct_ugni_smsg_header_t *hdr;
    uct_ugni_smsg_desc_t *desc;
    ucs_status_t send_status;
    ssize_t packed_len;
    void *body;

    UCT_CHECK_AM_ID(id);
    UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc, desc,
                             return UCS_ERR_NO_RESOURCE);
    ucs_trace_data("AM_BCOPY [%p] am_id: %d buf=%p", iface, id, arg );

    /* memory layout: [descriptor][smsg header][payload] */
    hdr  = (uct_ugni_smsg_header_t *)(desc + 1);
    body = (void *)(hdr + 1);

    packed_len  = pack_cb(body, arg);
    hdr->length = packed_len;

    uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_SEND, id, body,
                       packed_len, "TX: AM_BCOPY");

    send_status = uct_ugni_smsg_ep_am_common_send(ep, iface, id,
                                                  sizeof(uct_ugni_smsg_header_t),
                                                  hdr, packed_len, body, desc);
    if (send_status != UCS_OK) {
        return send_status;
    }

    return packed_len;
}
/**
 * Send a short active message through a uGNI SMSG endpoint.
 *
 * A TX descriptor is taken from the interface pool and filled as
 * [descriptor][smsg header][64-bit user header][payload].
 *
 * @param tl_ep    Endpoint to send on.
 * @param id       Active message id.
 * @param header   64-bit user header placed in front of the payload.
 * @param payload  Payload bytes to copy after the header.
 * @param length   Number of payload bytes.
 *
 * @return Status returned by uct_ugni_smsg_ep_am_common_send(), or
 *         UCS_ERR_NO_RESOURCE if no TX descriptor could be obtained.
 */
ucs_status_t uct_ugni_smsg_ep_am_short(uct_ep_h tl_ep, uint8_t id,
                                       uint64_t header, const void *payload,
                                       unsigned length)
{
    uct_ugni_smsg_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                  uct_ugni_smsg_iface_t);
    uct_ugni_smsg_ep_t *ep       = ucs_derived_of(tl_ep, uct_ugni_smsg_ep_t);
    uct_ugni_smsg_header_t *smsg_header;
    uint64_t *header_data;
    uct_ugni_smsg_desc_t *desc;

    UCT_CHECK_AM_ID(id);
    /* Reserve room for the SMSG header structure itself (not for a pointer
     * to it) plus the 64-bit user header. */
    UCT_CHECK_LENGTH(length,
                     iface->config.smsg_seg_size -
                     (sizeof(uct_ugni_smsg_header_t) + sizeof(header)),
                     "am_short");
    UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc, desc,
                             return UCS_ERR_NO_RESOURCE);
    ucs_trace_data("AM_SHORT [%p] am_id: %d buf=%p length=%u", iface, id,
                   payload, length);

    smsg_header         = (uct_ugni_smsg_header_t *)(desc+1);
    smsg_header->length = length + sizeof(header);

    header_data  = (uint64_t*)(smsg_header+1);
    *header_data = header;
    memcpy((void*)(header_data+1), payload, length);

    /* The traced buffer starts at the user header, so its size is the user
     * header plus the payload, i.e. smsg_header->length. */
    uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_SEND, id,
                       header_data, smsg_header->length, "TX: AM_SHORT");

    return uct_ugni_smsg_ep_am_common_send(ep, iface, id,
                                           sizeof(uct_ugni_smsg_header_t),
                                           smsg_header, smsg_header->length,
                                           (void*)header_data, desc);
}
/**
 * Drain the SMSG mailbox of an endpoint, invoking the active-message handler
 * for every message found.
 *
 * uct_ugni_global_lock is held around all GNI calls and is dropped only while
 * the active-message handler runs. The function returns (with the lock
 * released) when the mailbox reports GNI_RC_NOT_DONE, or after logging an
 * error from GNI_SmsgGetNextWTag / GNI_SmsgRelease or a NULL data pointer.
 *
 * @param iface  Interface owning the endpoint.
 * @param ep     Endpoint whose mailbox is processed.
 */
static void process_mbox(uct_ugni_smsg_iface_t *iface, uct_ugni_smsg_ep_t *ep)
{
    uct_ugni_smsg_header_t *header;
    gni_return_t ugni_rc;
    ucs_status_t status;
    void *data_ptr;
    void *user_data;
    void *user_desc;
    uint8_t tag;

    pthread_mutex_lock(&uct_ugni_global_lock);

    for (;;) {
        tag     = GNI_SMSG_ANY_TAG;
        ugni_rc = GNI_SmsgGetNextWTag(ep->super.ep, (void **)&data_ptr, &tag);

        /* Yes, GNI_RC_NOT_DONE means that you're done with the smsg mailbox */
        if (GNI_RC_NOT_DONE == ugni_rc) {
            break;
        }

        if (GNI_RC_SUCCESS != ugni_rc) {
            ucs_error("Unhandled smsg error: %s %d", gni_err_str[ugni_rc], ugni_rc);
            break;
        }

        if (NULL == data_ptr) {
            ucs_error("Empty data pointer in smsg.");
            break;
        }

        header    = (uct_ugni_smsg_header_t *)data_ptr;
        user_data = (void *)(header + 1);
        /* NOTE(review): iface->user_desc is set to NULL below when no
         * replacement descriptor is available; this expression does not
         * check for that - confirm it cannot be reached in that state. */
        user_desc = iface->user_desc+1;

        uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_RECV, tag,
                           user_data, header->length, "RX: AM");

        /* the handler is invoked without holding the global lock */
        pthread_mutex_unlock(&uct_ugni_global_lock);
        status = uct_iface_invoke_am(&iface->super.super, tag, user_data,
                                     header->length, user_desc);
        pthread_mutex_lock(&uct_ugni_global_lock);

        if (status != UCS_OK) {
            /* record the owning iface in the descriptor, then fetch a
             * replacement user descriptor from the pool */
            uct_recv_desc_iface(user_desc) = &iface->super.super.super;
            UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc,
                                     iface->user_desc, iface->user_desc = NULL);
        }

        ugni_rc = GNI_SmsgRelease(ep->super.ep);
        if (GNI_RC_SUCCESS != ugni_rc) {
            ucs_error("Unhandled smsg error in GNI_SmsgRelease: %s %d", gni_err_str[ugni_rc], ugni_rc);
            break;
        }
    }

    pthread_mutex_unlock(&uct_ugni_global_lock);
}
/**
 * Receive available data on a TCP endpoint and dispatch every complete
 * active message found in the endpoint buffer.
 *
 * New bytes are appended at ep->buf + ep->length. Complete messages
 * (header + hdr->length bytes) are consumed from ep->offset onwards; messages
 * whose am_id is not below UCT_AM_ID_MAX are logged and skipped. Whatever is
 * left (a partial message) is moved to the start of the buffer.
 *
 * @param ep  Endpoint to progress.
 *
 * @return 1 if any bytes were received, 0 otherwise (including every case in
 *         which uct_tcp_recv() does not return UCS_OK). When the status is
 *         UCS_ERR_CANCELED the endpoint is destroyed before returning.
 */
unsigned uct_tcp_ep_progress_rx(uct_tcp_ep_t *ep)
{
    uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface,
                                            uct_tcp_iface_t);
    uct_tcp_am_hdr_t *hdr;
    ucs_status_t status;
    size_t recv_length;
    size_t msg_size;
    ssize_t remainder;

    ucs_trace_func("ep=%p", ep);

    /* Receive next chunk of data into the free tail of the buffer */
    recv_length = iface->config.buf_size - ep->length;
    status      = uct_tcp_recv(ep->fd, ep->buf + ep->length, &recv_length);
    if (status != UCS_OK) {
        if (status == UCS_ERR_CANCELED) {
            ucs_debug("tcp_ep %p: remote disconnected", ep);
            uct_tcp_ep_mod_events(ep, 0, EPOLLIN);
            uct_tcp_ep_destroy(&ep->super.super);
        }
        return 0;
    }

    ep->length += recv_length;
    ucs_trace_data("tcp_ep %p: recvd %zu bytes", ep, recv_length);

    /* Parse received active messages */
    for (;;) {
        remainder = ep->length - ep->offset;
        if (remainder < sizeof(*hdr)) {
            break; /* not even a full header is buffered */
        }

        hdr      = ep->buf + ep->offset;
        msg_size = sizeof(*hdr) + hdr->length;
        if (remainder < msg_size) {
            break; /* payload is still incomplete */
        }

        /* Full message was received */
        ep->offset += msg_size;

        if (hdr->am_id < UCT_AM_ID_MAX) {
            uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_RECV,
                               hdr->am_id, hdr + 1, hdr->length,
                               "RECV fd %d", ep->fd);
            uct_iface_invoke_am(&iface->super, hdr->am_id, hdr + 1,
                                hdr->length, 0);
        } else {
            ucs_error("invalid am id: %d", hdr->am_id);
        }
    }

    /* Move the remaining data to the beginning of the buffer
     * TODO avoid extra copy on partial receive */
    ucs_assert(remainder >= 0);
    memmove(ep->buf, ep->buf + ep->offset, remainder);
    ep->offset = 0;
    ep->length = remainder;

    return recv_length > 0;
}
/**
 * Trace a received datagram and pass it to the active-message handler.
 *
 * The receive header and receive payload are looked up from the descriptor;
 * the handler is invoked with the header's am_id and length and with
 * UCT_CB_FLAG_DESC as its last argument.
 *
 * @param iface  Interface on which the datagram arrived.
 * @param desc   Descriptor holding the received datagram.
 *
 * @return The status returned by uct_iface_invoke_am().
 */
static ucs_status_t processs_datagram(uct_ugni_udt_iface_t *iface,
                                      uct_ugni_udt_desc_t *desc)
{
    uct_ugni_udt_header_t *rx_hdr = uct_ugni_udt_get_rheader(desc, iface);
    void *rx_data                 = uct_ugni_udt_get_rpayload(desc, iface);

    uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_RECV,
                       rx_hdr->am_id, rx_data, rx_hdr->length, "RX: AM");

    return uct_iface_invoke_am(&iface->super.super, rx_hdr->am_id, rx_data,
                               rx_hdr->length, UCT_CB_FLAG_DESC);
}
/**
 * Send a buffered-copy active message through a uGNI SMSG endpoint.
 *
 * A TX descriptor is taken from the interface pool; the SMSG header is placed
 * right after the descriptor and the packed payload right after the header.
 * The packed size is validated after packing, and a BCOPY statistic is
 * recorded through UCT_TL_EP_STAT_OP_IF_SUCCESS.
 *
 * @param tl_ep    Endpoint to send on.
 * @param id       Active message id.
 * @param pack_cb  Callback which writes the payload and returns its size.
 * @param arg      Opaque argument forwarded to pack_cb.
 * @param flags    Not used by this function.
 *
 * @return Packed payload size when the common send routine returns UCS_OK,
 *         otherwise the status it returned. UCS_ERR_NO_RESOURCE if no TX
 *         descriptor could be obtained.
 */
ssize_t uct_ugni_smsg_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id,
                                  uct_pack_callback_t pack_cb, void *arg,
                                  unsigned flags)
{
    uct_ugni_smsg_ep_t *ep       = ucs_derived_of(tl_ep, uct_ugni_smsg_ep_t);
    uct_ugni_smsg_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                  uct_ugni_smsg_iface_t);
    uct_ugni_smsg_header_t *hdr;
    uct_ugni_smsg_desc_t *desc;
    ucs_status_t send_status;
    ssize_t packed;
    void *body;

    UCT_CHECK_AM_ID(id);
    UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc, desc,
                             return UCS_ERR_NO_RESOURCE);
    ucs_trace_data("AM_BCOPY [%p] am_id: %d send request %p", iface, id, arg);

    /* memory layout: [descriptor][smsg header][payload] */
    hdr  = (uct_ugni_smsg_header_t *)(desc + 1);
    body = (void *)(hdr + 1);

    packed      = pack_cb(body, arg);
    hdr->length = packed;

    /* NOTE(review): this check runs after the descriptor was taken and the
     * payload was packed; if the macro returns on failure the descriptor
     * does not appear to be given back - confirm against the macro. */
    UCT_CHECK_LENGTH(packed, 0, iface->config.smsg_seg_size - 0, "am_bcopy");

    uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_SEND, id, body,
                       packed, "TX: AM_BCOPY");

    send_status = uct_ugni_smsg_ep_am_common_send(ep, iface, id,
                                                  sizeof(uct_ugni_smsg_header_t),
                                                  hdr, packed, body, desc);

    UCT_TL_EP_STAT_OP_IF_SUCCESS(send_status, ucs_derived_of(ep, uct_base_ep_t),
                                 AM, BCOPY, packed);

    if (send_status != UCS_OK) {
        return send_status;
    }

    return packed;
}
/* A common mm active message sending function.
 * The first parameter indicates the origin of the call.
 * is_short = 1 - perform AM short sending
 * is_short = 0 - perform AM bcopy sending
 *
 * Short sends write the 64-bit header followed by the payload directly after
 * the remote FIFO element; bcopy sends pack into the remote descriptor the
 * element refers to. The element's owner flag is updated last, after a store
 * fence, because the reader checks it to detect a completed write.
 *
 * Returns UCS_OK for a short send and the packed length for a bcopy send.
 * Returns UCS_ERR_NO_RESOURCE when the remote FIFO has no room, or the
 * status of uct_mm_ep_get_remote_elem() when it is not UCS_OK.
 */
static UCS_F_ALWAYS_INLINE ssize_t
uct_mm_ep_am_common_send(const unsigned is_short, uct_mm_ep_t *ep,
                         uct_mm_iface_t *iface, uint8_t am_id, size_t length,
                         uint64_t header, const void *payload,
                         uct_pack_callback_t pack_cb, void *arg)
{
    uct_mm_fifo_element_t *elem;
    ucs_status_t status;
    void *base_address;
    size_t inline_length;
    uint64_t head;

    UCT_CHECK_AM_ID(am_id);

    head = ep->fifo_ctl->head;

    /* check if there is room in the remote process's receive FIFO to write */
    if (!UCT_MM_EP_IS_ABLE_TO_SEND(head, ep->cached_tail, iface->config.fifo_size)) {
        if (!ucs_arbiter_group_is_empty(&ep->arb_group)) {
            /* pending isn't empty. don't send now to prevent out-of-order
             * sending */
            UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1);
            return UCS_ERR_NO_RESOURCE;
        }

        /* pending is empty: refresh the local copy of the tail from its
         * actual value on the remote peer and check again */
        uct_mm_ep_update_cached_tail(ep);
        if (!UCT_MM_EP_IS_ABLE_TO_SEND(head, ep->cached_tail, iface->config.fifo_size)) {
            UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1);
            return UCS_ERR_NO_RESOURCE;
        }
    }

    status = uct_mm_ep_get_remote_elem(ep, head, &elem);
    if (status != UCS_OK) {
        ucs_trace_poll("couldn't get an available FIFO element");
        UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1);
        return status;
    }

    if (is_short) {
        /* AM_SHORT: write header + payload into the remote FIFO element */
        inline_length = length + sizeof(header);

        *(uint64_t*) (elem + 1) = header;
        memcpy((void*) (elem + 1) + sizeof(header), payload, length);

        elem->flags |= UCT_MM_FIFO_ELEM_FLAG_INLINE;
        elem->length = inline_length;

        uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_SEND, am_id,
                           elem + 1, inline_length, "TX: AM_SHORT");
        UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, inline_length);
    } else {
        /* AM_BCOPY: write to the remote descriptor.
         * base_address is the local pointer to the remote memory chunk,
         * obtained by attaching to it */
        base_address = uct_mm_ep_attach_remote_seg(ep, iface, elem);
        length       = pack_cb(base_address + elem->desc_offset, arg);

        elem->flags &= ~UCT_MM_FIFO_ELEM_FLAG_INLINE;
        elem->length = length;

        uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_SEND, am_id,
                           base_address + elem->desc_offset, length,
                           "TX: AM_BCOPY");
        UCT_TL_EP_STAT_OP(&ep->super, AM, BCOPY, length);
    }

    elem->am_id = am_id;

    /* memory barrier - make sure that the memory is flushed before setting the
     * 'writing is complete' flag which the reader checks */
    ucs_memory_cpu_store_fence();

    /* change the owner bit to indicate that the writing is complete.
     * the owner bit flips after every FIFO wraparound */
    if (!(head & iface->config.fifo_size)) {
        elem->flags &= ~UCT_MM_FIFO_ELEM_FLAG_OWNER;
    } else {
        elem->flags |= UCT_MM_FIFO_ELEM_FLAG_OWNER;
    }

    if (is_short) {
        return UCS_OK;
    }

    return length;
}
static void uct_ugni_udt_progress(void *arg) { uint32_t rem_addr, rem_id; uint64_t id; void *payload; void *user_desc; ucs_status_t status; uct_ugni_udt_desc_t *desc; uct_ugni_udt_header_t *header; uct_ugni_udt_iface_t * iface = (uct_ugni_udt_iface_t *)arg; uct_ugni_udt_ep_t *ep; gni_ep_handle_t ugni_ep; gni_post_state_t post_state; gni_return_t ugni_rc; pthread_mutex_lock(&uct_ugni_global_lock); ugni_rc = GNI_PostDataProbeById(iface->super.nic_handle, &id); if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) { if (GNI_RC_NO_MATCH != ugni_rc) { ucs_error("GNI_PostDataProbeById , Error status: %s %d", gni_err_str[ugni_rc], ugni_rc); } goto exit; } if (UCT_UGNI_UDT_ANY == id) { /* New incomming message */ ep = NULL; ugni_ep = iface->ep_any; desc = iface->desc_any; } else { /* Ack message */ ep = ucs_derived_of(uct_ugni_iface_lookup_ep(&iface->super, id), uct_ugni_udt_ep_t); if (ucs_unlikely(NULL == ep)) { ucs_error("Can not lookup ep with id %"PRIx64,id); goto exit; } ugni_ep = ep->super.ep; desc = ep->posted_desc; } ugni_rc = GNI_EpPostDataWaitById(ugni_ep, id, -1, &post_state, &rem_addr, &rem_id); if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) { ucs_error("GNI_EpPostDataWaitById, Error status: %s %d", gni_err_str[ugni_rc], ugni_rc); goto exit; } header = uct_ugni_udt_get_rheader(desc, iface); payload = uct_ugni_udt_get_rpayload(desc, iface); user_desc = uct_ugni_udt_get_user_desc(desc, iface); if (UCT_UGNI_UDT_ANY == id) { /* New incomming message */ ucs_assert_always(header->type == UCT_UGNI_UDT_PAYLOAD); uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_RECV, header->am_id, payload, header->length, "RX: AM"); status = uct_iface_invoke_am(&iface->super.super, header->am_id, payload, header->length, user_desc); if (UCS_OK != status) { uct_ugni_udt_desc_t *new_desc; /* set iface for a later release call */ uct_recv_desc_iface(user_desc) = &iface->super.super.super; /* Allocate a new element */ UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc, new_desc, 
goto exit); /* set the new desc */ iface->desc_any = new_desc; }
/**
 * Common uGNI UDT active-message send routine for short and bcopy sends.
 *
 * Only one datagram may be posted per endpoint at a time: if the endpoint
 * already has a posted descriptor the call fails with UCS_ERR_NO_RESOURCE.
 * Otherwise a descriptor is taken from the interface pool, its send side is
 * filled with the message and it is posted with GNI_EpPostDataWId() under
 * uct_ugni_global_lock.
 *
 * @param is_short  Non-zero: send @a header followed by @a length bytes of
 *                  @a payload. Zero: pack the payload with @a pack_cb.
 * @param ep        Endpoint to send on.
 * @param iface     Interface owning the endpoint.
 * @param am_id     Active message id.
 * @param length    Payload length; used only for short sends.
 * @param header    64-bit user header; used only for short sends.
 * @param payload   Payload buffer; used only for short sends.
 * @param pack_cb   Pack callback; used only for bcopy sends.
 * @param arg       Argument for @a pack_cb.
 *
 * @return UCS_OK for a short send, the packed length for a bcopy send,
 *         UCS_ERR_NO_RESOURCE if a datagram is already posted or no
 *         descriptor is available, or whatever UCT_UGNI_UDT_CHECK_RC returns
 *         for a failed post.
 */
static UCS_F_ALWAYS_INLINE ssize_t
uct_ugni_udt_ep_am_common_send(const unsigned is_short, uct_ugni_udt_ep_t *ep,
                               uct_ugni_udt_iface_t *iface, uint8_t am_id,
                               unsigned length, uint64_t header,
                               const void *payload,
                               uct_pack_callback_t pack_cb, void *arg)
{
    gni_return_t ugni_rc;
    uint16_t msg_length;
    uct_ugni_udt_desc_t *desc;
    uct_ugni_udt_header_t *sheader, *rheader;
    ssize_t packed_length = 0;

    UCT_CHECK_AM_ID(am_id);

    if (ucs_unlikely(NULL != ep->posted_desc)) {
        /* a datagram is already in flight on this endpoint */
        UCT_TL_IFACE_STAT_TX_NO_DESC(&iface->super.super);
        return UCS_ERR_NO_RESOURCE;
    }

    UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc, desc,
                             return UCS_ERR_NO_RESOURCE);

    rheader       = uct_ugni_udt_get_rheader(desc, iface);
    rheader->type = UCT_UGNI_UDT_EMPTY;

    sheader = uct_ugni_udt_get_sheader(desc, iface);

    if (is_short) {
        uint64_t *hdr = (uint64_t *)uct_ugni_udt_get_spayload(desc, iface);

        *hdr = header;
        memcpy((void*)(hdr + 1), payload, length);
        sheader->length = length + sizeof(header);
        UCT_TL_EP_STAT_OP(ucs_derived_of(ep, uct_base_ep_t), AM, SHORT,
                          sizeof(header) + length);
    } else {
        packed_length   = pack_cb((void *)uct_ugni_udt_get_spayload(desc, iface),
                                  arg);
        sheader->length = packed_length;
        UCT_TL_EP_STAT_OP(ucs_derived_of(ep, uct_base_ep_t), AM, BCOPY,
                          packed_length);
    }

    /* size of the posted datagram: send header followed by the payload */
    msg_length = sheader->length + sizeof(*sheader);

    /* Trace the number of bytes actually stored in the send payload:
     * header + payload for short sends, the packed size for bcopy sends.
     * The 'length' argument is not meaningful in the bcopy case. */
    uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_SEND, am_id,
                       uct_ugni_udt_get_spayload(desc, iface), sheader->length,
                       is_short ? "TX: AM_SHORT" : "TX: AM_BCOPY");

    sheader->am_id = am_id;
    sheader->type  = UCT_UGNI_UDT_PAYLOAD;

    ucs_assert_always(sheader->length <= GNI_DATAGRAM_MAXSIZE);

    pthread_mutex_lock(&uct_ugni_global_lock);
    ugni_rc = GNI_EpPostDataWId(ep->super.ep,
                                sheader, msg_length,
                                rheader, (uint16_t)iface->config.udt_seg_size,
                                ep->super.hash_key);
    pthread_mutex_unlock(&uct_ugni_global_lock);

    /* NOTE(review): if this macro returns on failure, 'desc' does not appear
     * to be given back to the pool - confirm against the macro definition. */
    UCT_UGNI_UDT_CHECK_RC(ugni_rc);

    ep->posted_desc = desc;
    ++ep->super.outstanding;
    ++iface->super.outstanding;

    return is_short ? UCS_OK : packed_length;
}