static ssize_t ofi_nd_ep_readv(struct fid_ep *pep, const struct iovec *iov,
                               void **desc, size_t count, fi_addr_t src_addr,
                               uint64_t addr, uint64_t key, void *context)
{
    struct fi_rma_iov rma_iov = {
        .addr = addr,
        .len = iov[0].iov_len,
        .key = key
    };

    struct fi_msg_rma msg = {
        .msg_iov = iov,
        .desc = desc,
        .iov_count = count,
        .addr = src_addr,
        .rma_iov = &rma_iov,
        .rma_iov_count = 1,
        .context = context,
        .data = 0
    };

    assert(pep->fid.fclass == FI_CLASS_EP);
    if (pep->fid.fclass != FI_CLASS_EP)
        return -FI_EINVAL;

    struct nd_ep *ep = container_of(pep, struct nd_ep, fid);

    return ofi_nd_ep_readmsg(pep, &msg, ep->info->rx_attr->op_flags);
}
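/*
 * Read path: ofi_nd_ep_readmsg() splits the request into at most
 * ND_MSG_INTERNAL_IOV_LIMIT contiguous fragments, registers each local
 * fragment with the adapter and posts one IND2QueuePair::Read per fragment.
 * A "main" CQ entry aggregates the fragment completions (wait_completion);
 * every fragment entry points back to it through aux_entry.
 */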
static ssize_t ofi_nd_ep_readmsg(struct fid_ep *pep, const struct fi_msg_rma *msg,
                                 uint64_t flags)
{
    assert(pep->fid.fclass == FI_CLASS_EP);
    assert(msg);

    if (pep->fid.fclass != FI_CLASS_EP)
        return -FI_EINVAL;

    size_t msg_len = 0, rma_len = 0, i;
    HRESULT hr;

    struct nd_ep *ep = container_of(pep, struct nd_ep, fid);

    if (!ep->qp)
        return -FI_EOPBADSTATE;

    for (i = 0; i < msg->iov_count; i++) {
        if (msg->msg_iov[i].iov_len && !msg->msg_iov[i].iov_base)
            return -FI_EINVAL;
        msg_len += msg->msg_iov[i].iov_len;
    }

    for (i = 0; i < msg->rma_iov_count; i++) {
        if (msg->rma_iov[i].len && !msg->rma_iov[i].addr)
            return -FI_EINVAL;
        rma_len += msg->rma_iov[i].len;
    }

    /* Check the following: */
    if ((msg_len != rma_len) ||    /* - msg and rma lengths match */
        /* - iov counts are within the supported limit */
        (msg->iov_count > ND_MSG_IOV_LIMIT ||
         msg->rma_iov_count > ND_MSG_IOV_LIMIT) ||
        /* - the transmitted length does not exceed the maximum message size */
        (msg_len > ep->domain->info->ep_attr->max_msg_size))
        return -FI_EINVAL;

    struct nd_cq_entry *main_entry = ofi_nd_buf_alloc_nd_cq_entry();
    if (!main_entry)
        return -FI_ENOMEM;
    memset(main_entry, 0, sizeof(*main_entry));
    main_entry->data = msg->data;
    main_entry->flags = flags;
    main_entry->domain = ep->domain;
    main_entry->context = msg->context;
    main_entry->seq = InterlockedAdd64(&ep->domain->msg_cnt, 1);

    /* since the read operation can't be canceled, clear the first
     * byte of the context's internal data */
    if (msg->context)
        ND_FI_CONTEXT(msg->context) = 0;

    struct fi_rma_iov rma_iovecs[ND_MSG_INTERNAL_IOV_LIMIT];
    size_t rma_count = 0;
    size_t from_split_map[ND_MSG_INTERNAL_IOV_LIMIT];
    size_t to_split_map[ND_MSG_INTERNAL_IOV_LIMIT];
    uint64_t remote_addr[ND_MSG_INTERNAL_IOV_LIMIT];

    ofi_nd_split_msg_iov_2_rma_iov(msg->rma_iov, msg->rma_iov_count,
                                   msg->msg_iov, msg->iov_count,
                                   rma_iovecs, &rma_count,
                                   from_split_map, to_split_map, remote_addr);
    assert(rma_count <= ND_MSG_INTERNAL_IOV_LIMIT);

    main_entry->wait_completion.comp_count = 0;
    main_entry->wait_completion.total_count = rma_count;
    InitializeCriticalSection(&main_entry->wait_completion.comp_lock);

    struct nd_cq_entry *entries[ND_MSG_IOV_LIMIT];
    for (i = 0; i < rma_count; i++) {
        entries[i] = ofi_nd_buf_alloc_nd_cq_entry();
        if (!entries[i])
            goto fn_fail;
        memset(entries[i], 0, sizeof(*entries[i]));

        entries[i]->data = msg->data;
        entries[i]->flags = flags;
        entries[i]->domain = ep->domain;
        entries[i]->context = msg->context;
        entries[i]->seq = main_entry->seq;
        entries[i]->aux_entry = main_entry;

        hr = ep->domain->adapter->lpVtbl->CreateMemoryRegion(
            ep->domain->adapter, &IID_IND2MemoryRegion,
            ep->domain->adapter_file, (void**)&entries[i]->mr[0]);
        if (FAILED(hr))
            goto fn_fail;
        entries[i]->mr_count = 1;

        hr = ofi_nd_util_register_mr(
            entries[i]->mr[0], (const void *)remote_addr[i],
            rma_iovecs[i].len,
            ND_MR_FLAG_ALLOW_LOCAL_WRITE |
            ND_MR_FLAG_ALLOW_REMOTE_READ |
            ND_MR_FLAG_ALLOW_REMOTE_WRITE);
        if (FAILED(hr))
            goto fn_fail;

        ND2_SGE sge = {
            .Buffer = (void *)remote_addr[i],
            .BufferLength = (ULONG)rma_iovecs[i].len,
            .MemoryRegionToken = (UINT32)(uintptr_t)msg->desc[to_split_map[i]]
        };

        hr = ep->qp->lpVtbl->Read(ep->qp, entries[i], &sge, 1,
                                  (UINT64)rma_iovecs[i].addr,
                                  (UINT32)rma_iovecs[i].key, 0);
        if (FAILED(hr))
            goto fn_fail;
    }

    return FI_SUCCESS;

fn_fail:
    while (i-- > 0)
        ofi_nd_free_cq_entry(entries[i]);
    ND_LOG_WARN(FI_LOG_EP_DATA, ofi_nd_strerror((DWORD)hr, NULL));
    return H2F(hr);
}

static ssize_t ofi_nd_ep_write(struct fid_ep *ep, const void *buf, size_t len,
                               void *desc, fi_addr_t dest_addr, uint64_t addr,
                               uint64_t key, void *context)
{
    struct iovec iov = {
        .iov_base = (void*)buf,
        .iov_len = len
    };

    return ofi_nd_ep_writev(ep, &iov, &desc, 1, dest_addr, addr, key, context);
}

static ssize_t ofi_nd_ep_writev(struct fid_ep *pep, const struct iovec *iov,
                                void **desc, size_t count, fi_addr_t dest_addr,
                                uint64_t addr, uint64_t key, void *context)
{
    struct fi_rma_iov rma_iov = {
        .addr = addr,
        .len = iov[0].iov_len,
        .key = key
    };

    struct fi_msg_rma msg = {
        .msg_iov = iov,
        .desc = desc,
        .iov_count = count,
        .addr = dest_addr,
        .rma_iov = &rma_iov,
        .rma_iov_count = 1,
        .context = context,
        .data = 0
    };

    assert(pep->fid.fclass == FI_CLASS_EP);
    if (pep->fid.fclass != FI_CLASS_EP)
        return -FI_EINVAL;

    struct nd_ep *ep = container_of(pep, struct nd_ep, fid);

    return ofi_nd_ep_writemsg(pep, &msg, ep->info->tx_attr->op_flags);
}
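/*
 * Write path: payloads larger than gl_data.inline_thr are split into
 * fragments and written directly from the caller's buffers using the
 * caller-provided memory descriptors (msg->desc). Smaller payloads are first
 * copied into a provider-owned inline buffer, so the caller's buffers can be
 * reused as soon as the call returns (required for FI_INJECT).
 */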
static ssize_t ofi_nd_ep_writemsg(struct fid_ep *pep, const struct fi_msg_rma *msg,
                                  uint64_t flags)
{
    assert(pep->fid.fclass == FI_CLASS_EP);
    assert(msg);

    if (pep->fid.fclass != FI_CLASS_EP)
        return -FI_EINVAL;

    size_t msg_len = 0, rma_len = 0, i;
    HRESULT hr;
    struct nd_cq_entry *entries[ND_MSG_IOV_LIMIT];

    struct nd_ep *ep = container_of(pep, struct nd_ep, fid);

    if (!ep->qp)
        return -FI_EOPBADSTATE;

    for (i = 0; i < msg->iov_count; i++) {
        if (msg->msg_iov[i].iov_len && !msg->msg_iov[i].iov_base)
            return -FI_EINVAL;
        msg_len += msg->msg_iov[i].iov_len;
    }

    if ((msg_len > ep->domain->info->ep_attr->max_msg_size) &&
        (flags & FI_INJECT))
        return -FI_EINVAL;

    for (i = 0; i < msg->rma_iov_count; i++) {
        if (msg->rma_iov[i].len && !msg->rma_iov[i].addr)
            return -FI_EINVAL;
        rma_len += msg->rma_iov[i].len;
    }

    /* Check the following: */
    if ((msg_len != rma_len) ||    /* - msg and rma lengths match */
        /* - iov counts are within the supported limit */
        (msg->iov_count > ND_MSG_IOV_LIMIT ||
         msg->rma_iov_count > ND_MSG_IOV_LIMIT) ||
        /* - the transmitted length does not exceed the maximum message size */
        (msg_len > ep->domain->info->ep_attr->max_msg_size) ||
        /* - with FI_INJECT, the data must fit into the inject buffer */
        ((flags & FI_INJECT) &&
         (msg_len > ep->domain->info->tx_attr->inject_size)))
        return -FI_EINVAL;

    struct nd_cq_entry *main_entry = ofi_nd_buf_alloc_nd_cq_entry();
    if (!main_entry)
        return -FI_ENOMEM;
    memset(main_entry, 0, sizeof(*main_entry));
    main_entry->data = msg->data;
    main_entry->flags = flags;
    main_entry->domain = ep->domain;
    main_entry->context = msg->context;
    main_entry->seq = InterlockedAdd64(&ep->domain->msg_cnt, 1);

    /* since the write operation can't be canceled, clear the first
     * byte of the context's internal data */
    if (msg->context)
        ND_FI_CONTEXT(msg->context) = 0;

    /* TODO */
    if (msg_len > (size_t)gl_data.inline_thr) {
        struct fi_rma_iov rma_iovecs[ND_MSG_INTERNAL_IOV_LIMIT];
        size_t rma_count = 0;
        size_t from_split_map[ND_MSG_INTERNAL_IOV_LIMIT];
        size_t to_split_map[ND_MSG_INTERNAL_IOV_LIMIT];
        uint64_t remote_addr[ND_MSG_INTERNAL_IOV_LIMIT];

        ofi_nd_split_msg_iov_2_rma_iov(msg->rma_iov, msg->rma_iov_count,
                                       msg->msg_iov, msg->iov_count,
                                       rma_iovecs, &rma_count,
                                       from_split_map, to_split_map,
                                       remote_addr);
        assert(rma_count <= ND_MSG_INTERNAL_IOV_LIMIT);

        main_entry->wait_completion.comp_count = 0;
        main_entry->wait_completion.total_count = rma_count;
        InitializeCriticalSection(&main_entry->wait_completion.comp_lock);

        for (i = 0; i < rma_count; i++) {
            entries[i] = ofi_nd_buf_alloc_nd_cq_entry();
            if (!entries[i])
                goto fn_fail;
            memset(entries[i], 0, sizeof(*entries[i]));

            entries[i]->data = msg->data;
            entries[i]->flags = flags;
            entries[i]->domain = ep->domain;
            entries[i]->context = msg->context;
            entries[i]->seq = main_entry->seq;
            entries[i]->aux_entry = main_entry;

            ND2_SGE sge = {
                .Buffer = (void *)remote_addr[i],
                .BufferLength = (ULONG)rma_iovecs[i].len,
                .MemoryRegionToken = (UINT32)(uintptr_t)msg->desc[to_split_map[i]]
            };

            hr = ep->qp->lpVtbl->Write(ep->qp, entries[i], &sge, 1,
                                       (UINT64)rma_iovecs[i].addr,
                                       (UINT32)rma_iovecs[i].key, 0);
            if (FAILED(hr))
                goto fn_fail;
        }

        return FI_SUCCESS;
    } else {
        if (msg_len) {
            main_entry->inline_buf =
                __ofi_nd_buf_alloc_nd_inlinebuf(&ep->domain->inlinebuf);
            if (!main_entry->inline_buf)
                return -FI_ENOMEM;

            char *buf = (char*)main_entry->inline_buf->buffer;
            for (i = 0; i < msg->iov_count; i++) {
                memcpy(buf, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len);
                buf += msg->msg_iov[i].iov_len;
            }
        }

        for (i = 0; i < msg->rma_iov_count; i++) {
            char *buf = (char *)main_entry->inline_buf->buffer;

            entries[i] = ofi_nd_buf_alloc_nd_cq_entry();
            if (!entries[i])
                goto fn_fail;
            memset(entries[i], 0, sizeof(*entries[i]));

            entries[i]->data = msg->data;
            entries[i]->flags = flags;
            entries[i]->domain = ep->domain;
            entries[i]->context = msg->context;
            entries[i]->seq = main_entry->seq;
            entries[i]->aux_entry = main_entry;

            ND2_SGE sge = {
                .Buffer = (void *)(buf + msg->rma_iov[i].len),
                .BufferLength = (ULONG)msg->rma_iov[i].len,
                .MemoryRegionToken = main_entry->inline_buf->token
            };

            hr = ep->qp->lpVtbl->Write(ep->qp, entries[i], &sge, 1,
                                       (UINT64)msg->rma_iov[i].addr,
                                       (UINT32)msg->rma_iov[i].key, 0);
            if (FAILED(hr))
                goto fn_fail;
        }

        return FI_SUCCESS;
    }

fn_fail:
    while (i-- > 0)
        ofi_nd_free_cq_entry(entries[i]);
    ND_LOG_WARN(FI_LOG_EP_DATA, ofi_nd_strerror((DWORD)hr, NULL));
    return H2F(hr);
}

static ssize_t ofi_nd_ep_inject(struct fid_ep *pep, const void *buf, size_t len,
                                fi_addr_t dest_addr, uint64_t addr, uint64_t key)
{
    struct iovec iov = {
        .iov_base = (void*)buf,
        .iov_len = len
    };

    struct fi_rma_iov rma_iov = {
        .addr = addr,
        .len = len,
        .key = key
    };

    struct fi_msg_rma msg = {
        .msg_iov = &iov,
        .desc = 0,
        .iov_count = 1,
        .addr = dest_addr,
        .rma_iov = &rma_iov,
        .rma_iov_count = 1,
        .context = 0,
        .data = 0
    };

    return ofi_nd_ep_writemsg(pep, &msg, FI_INJECT);
}

static ssize_t ofi_nd_ep_writedata(struct fid_ep *pep, const void *buf,
                                   size_t len, void *desc, uint64_t data,
                                   fi_addr_t dest_addr, uint64_t addr,
                                   uint64_t key, void *context)
{
    struct iovec iov = {
        .iov_base = (void*)buf,
        .iov_len = len
    };

    struct fi_rma_iov rma_iov = {
        .addr = addr,
        .len = len,
        .key = key
    };

    struct fi_msg_rma msg = {
        .msg_iov = &iov,
        .desc = &desc,
        .iov_count = 1,
        .addr = dest_addr,
        .rma_iov = &rma_iov,
        .rma_iov_count = 1,
        .context = context,
        .data = data
    };

    assert(pep->fid.fclass == FI_CLASS_EP);
    if (pep->fid.fclass != FI_CLASS_EP)
        return -FI_EINVAL;

    struct nd_ep *ep = container_of(pep, struct nd_ep, fid);

    return ofi_nd_ep_writemsg(pep, &msg,
                              ep->info->tx_attr->op_flags | FI_REMOTE_CQ_DATA);
}

static ssize_t ofi_nd_ep_writeinjectdata(struct fid_ep *ep, const void *buf,
                                         size_t len, uint64_t data,
                                         fi_addr_t dest_addr, uint64_t addr,
                                         uint64_t key)
{
    struct iovec iov = {
        .iov_base = (void*)buf,
        .iov_len = len
    };

    struct fi_rma_iov rma_iov = {
        .addr = addr,
        .len = len,
        .key = key
    };

    struct fi_msg_rma msg = {
        .msg_iov = &iov,
        .desc = 0,
        .iov_count = 1,
        .addr = dest_addr,
        .rma_iov = &rma_iov,
        .rma_iov_count = 1,
        .context = 0,
        .data = data
    };

    return ofi_nd_ep_writemsg(ep, &msg, FI_INJECT | FI_REMOTE_CQ_DATA);
}
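/*
 * Completion handlers. Each ND2_RESULT carries the per-fragment CQ entry in
 * RequestContext. For split operations the aggregate entry (aux_entry) counts
 * the fragment completions; only the last fragment produces the user-visible
 * completion, earlier fragments merely release their own entry.
 */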
void ofi_nd_read_event(ND2_RESULT *result)
{
    assert(result);
    assert(result->RequestType == Nd2RequestTypeRead);

    nd_cq_entry *entry = (nd_cq_entry*)result->RequestContext;
    assert(entry);

    ND_LOG_EVENT_INFO(entry);

    /* Check whether the operation is complex, i.e. the read operation
     * may consist of several read subtasks */
    if (entry->aux_entry) {
        EnterCriticalSection(&entry->aux_entry->wait_completion.comp_lock);
        entry->aux_entry->wait_completion.comp_count++;
        ND_LOG_DEBUG(FI_LOG_EP_DATA, "READ Event comp_count = %d, total_count = %d\n",
                     entry->aux_entry->wait_completion.comp_count,
                     entry->aux_entry->wait_completion.total_count);

        if (entry->aux_entry->wait_completion.comp_count <
            entry->aux_entry->wait_completion.total_count) {
            /* Still waiting for the remaining completion events
             * of the read operation */
            LeaveCriticalSection(&entry->aux_entry->wait_completion.comp_lock);
            entry->aux_entry = NULL;
            ofi_nd_free_cq_entry(entry);
            return;
        }
        LeaveCriticalSection(&entry->aux_entry->wait_completion.comp_lock);
    }

    /* TODO: handle the erroneous case "result->Status != S_OK" */
    ofi_nd_dispatch_cq_event(entry->state == LARGE_MSG_RECV_REQ ?
                             LARGE_MSG_REQ : NORMAL_EVENT, entry, result);
}

void ofi_nd_write_event(ND2_RESULT *result)
{
    assert(result);
    assert(result->RequestType == Nd2RequestTypeWrite);

    nd_cq_entry *entry = (nd_cq_entry*)result->RequestContext;
    assert(entry);

    struct nd_ep *ep = (struct nd_ep*)result->QueuePairContext;
    assert(ep);
    assert(ep->fid.fid.fclass == FI_CLASS_EP);

    ND_LOG_EVENT_INFO(entry);

    /* Check whether the operation is complex, i.e. the write operation
     * may consist of several write subtasks */
    if (entry->aux_entry) {
        EnterCriticalSection(&entry->aux_entry->wait_completion.comp_lock);
        entry->aux_entry->wait_completion.comp_count++;

        if (entry->aux_entry->wait_completion.comp_count <
            entry->aux_entry->wait_completion.total_count) {
            /* Still waiting for the remaining completion events
             * of the write operation */
            LeaveCriticalSection(&entry->aux_entry->wait_completion.comp_lock);
            entry->aux_entry = NULL;
            ofi_nd_free_cq_entry(entry);
            return;
        }
        LeaveCriticalSection(&entry->aux_entry->wait_completion.comp_lock);
    }

    if (!entry->context) {
        /* This write was an internal operation, just release the entry */
        ofi_nd_free_cq_entry(entry);
        return;
    }

    if (entry->flags & FI_REMOTE_CQ_DATA) {
        if (ofi_nd_ep_injectdata(&ep->fid, 0, 0, entry->data,
                                 FI_ADDR_UNSPEC) != FI_SUCCESS)
            ND_LOG_WARN(FI_LOG_CQ, "failed to write-inject");
    }

    if (ep->cntr_write) {
        if (result->Status != S_OK) {
            InterlockedIncrement64(&ep->cntr_write->err);
        }
        InterlockedIncrement64(&ep->cntr_write->counter);
        WakeByAddressAll((void*)&ep->cntr_write->counter);
    }

    int notify = ofi_nd_util_completion_blackmagic(
        ep->info->tx_attr->op_flags, ep->send_flags, entry->flags) ||
        result->Status != S_OK;

    if (notify) {
        PostQueuedCompletionStatus(
            entry->result.Status == S_OK ? ep->cq_send->iocp : ep->cq_send->err,
            0, 0, &entry->base.ov);
        InterlockedIncrement(&ep->cq_send->count);
        WakeByAddressAll((void*)&ep->cq_send->count);
    } else {
        /* if notification is not requested, just free the entry */
        ofi_nd_free_cq_entry(entry);
    }
}
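/*
 * Splits the remote (RMA) iovecs against the local (MSG) iovecs so that each
 * resulting fragment is contiguous on both sides. The remote iovecs are first
 * converted to struct iovec so that ofi_nd_repack_iovecs() (defined elsewhere,
 * assumed here to do the actual splitting) can be reused; its output is then
 * converted back to fi_rma_iov, inheriting the key of the originating remote
 * iovec via from_split_map, while to_split_map[i] selects the local descriptor
 * of fragment i. Note: despite its name, remote_addr[] holds the *local* base
 * address of each fragment on return (the callers use it as the SGE buffer),
 * while res_iovecs[].addr holds the remote address.
 */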
void ofi_nd_split_msg_iov_2_rma_iov(const struct fi_rma_iov *rma_iovecs,
                                    const size_t rma_count,
                                    const struct iovec *msg_iovecs,
                                    const size_t msg_count,
                                    struct fi_rma_iov res_iovecs[ND_MSG_INTERNAL_IOV_LIMIT],
                                    size_t *res_count,
                                    size_t from_split_map[ND_MSG_INTERNAL_IOV_LIMIT],
                                    size_t to_split_map[ND_MSG_INTERNAL_IOV_LIMIT],
                                    uint64_t remote_addr[ND_MSG_INTERNAL_IOV_LIMIT])
{
    size_t i;

    struct iovec from_rma_iovecs[ND_MSG_IOV_LIMIT];
    size_t from_rma_count = rma_count;

    struct iovec res_msg_iovecs[ND_MSG_IOV_LIMIT];
    size_t res_msg_count = 0;

    /* Convert the RMA iovecs to MSG iovecs so that they can be
     * reused in @ofi_nd_repack_iovecs */
    for (i = 0; i < rma_count; i++) {
        from_rma_iovecs[i].iov_base = (void *)rma_iovecs[i].addr;
        from_rma_iovecs[i].iov_len = rma_iovecs[i].len;
    }

    ofi_nd_repack_iovecs(from_rma_iovecs, from_rma_count,
                         msg_iovecs, msg_count,
                         res_msg_iovecs, &res_msg_count,
                         from_split_map, to_split_map, remote_addr);

    /* Convert the resulting MSG iovecs back to RMA iovecs and return them */
    for (i = 0; i < res_msg_count; i++) {
        res_iovecs[i].addr = remote_addr[i];
        res_iovecs[i].len = res_msg_iovecs[i].iov_len;
        res_iovecs[i].key = rma_iovecs[from_split_map[i]].key;

        remote_addr[i] = (uint64_t)res_msg_iovecs[i].iov_base;
    }

    *res_count = res_msg_count;
}
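/*
 * Illustrative example (assuming ofi_nd_repack_iovecs() splits on the
 * boundaries of both lists): reading one 8 KiB remote region into two 4 KiB
 * local buffers yields two fragments. Both inherit the key of the single
 * remote iovec (from_split_map[] == {0, 0}), to_split_map[] == {0, 1} selects
 * the matching local descriptors, remote_addr[] ends up holding the two local
 * buffer addresses, and res_iovecs[].addr the remote addresses of the first
 * and second half of the region.
 */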
static ssize_t ofi_nd_ep_sendmsg(struct fid_ep *pep, const struct fi_msg *msg,
                                 uint64_t flags)
{
    assert(pep->fid.fclass == FI_CLASS_EP);
    assert(msg);

    if (pep->fid.fclass != FI_CLASS_EP)
        return -FI_EINVAL;

    HRESULT hr;
    size_t i;
    size_t len = 0;
    ssize_t res = FI_SUCCESS;
    struct nd_cq_entry *wait_ack_entry;
    nd_send_entry *send_entry;

    struct nd_ep *ep = container_of(pep, struct nd_ep, fid);

    if (!ep->qp)
        return -FI_EOPBADSTATE;

    for (i = 0; i < msg->iov_count; i++) {
        if (msg->msg_iov[i].iov_len && !msg->msg_iov[i].iov_base)
            return -FI_EINVAL;
        len += msg->msg_iov[i].iov_len;
    }

    if ((msg->iov_count > min(ep->domain->ainfo.MaxReceiveSge,
                              ND_MSG_IOV_LIMIT) - 1) ||
        (len > ep->domain->info->ep_attr->max_msg_size))
        return -FI_EINVAL;

    struct nd_cq_entry *entry = ofi_nd_buf_alloc_nd_cq_entry();
    if (!entry)
        return -FI_ENOMEM;
    memset(entry, 0, sizeof(*entry));

    entry->buf = (msg->iov_count == 1) ? msg->msg_iov[0].iov_base : 0;
    entry->len = len;
    entry->data = msg->data;
    entry->flags = flags | FI_MSG | FI_SEND;
    entry->domain = ep->domain;
    entry->context = msg->context;
    entry->seq = InterlockedAdd64(&ep->domain->msg_cnt, 1);

    /* since the send operation can't be canceled, clear the first
     * byte of the context's internal data */
    if (msg->context)
        ND_FI_CONTEXT(msg->context) = 0;

    entry->prefix = __ofi_nd_buf_alloc_nd_msgprefix(&ep->domain->msgfooter);
    if (!entry->prefix) {
        res = -FI_ENOMEM;
        goto fn_fail_int;
    }

    if (entry->len <= (size_t)gl_data.inline_thr) {
        nd_flow_cntrl_flags flow_control_flags = {
            .req_ack = 0,
            .ack = 0,
            .empty = 0
        };

        struct nd_msgheader header_def = {
            .data = entry->data,
            .event = NORMAL_EVENT,
            .flags = flow_control_flags,
            .location_cnt = 0
        };
        entry->prefix->header = header_def;
        entry->event = NORMAL_EVENT;
        entry->flow_cntrl_flags = flow_control_flags;

        if (len) {
            entry->inline_buf =
                __ofi_nd_buf_alloc_nd_inlinebuf(&ep->domain->inlinebuf);
            if (!entry->inline_buf) {
                res = -FI_ENOMEM;
                goto fn_fail_int;
            }

            char *buf = (char*)entry->inline_buf->buffer;
            for (i = 0; i < msg->iov_count; i++) {
                memcpy(buf, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len);
                buf += msg->msg_iov[i].iov_len;
            }
        }

        ND2_SGE sge[2] = {
            {
                .Buffer = &entry->prefix->header,
                .BufferLength = (ULONG)sizeof(entry->prefix->header),
                .MemoryRegionToken = entry->prefix->token
            },
            {
                .Buffer = len ? entry->inline_buf->buffer : 0,
                .BufferLength = len,
                .MemoryRegionToken = len ? entry->inline_buf->token : 0
            }
        };

        nd_sge *sge_entry = ofi_nd_buf_alloc_nd_sge();
        if (!sge_entry) {
            ND_LOG_WARN(FI_LOG_EP_DATA, "SGE entry buffer can't be allocated");
            res = -FI_ENOMEM;
            goto fn_fail_int;
        }
        memset(sge_entry, 0, sizeof(*sge_entry));

        sge_entry->count = len ? 2 : 1;
        for (i = 0; i < sge_entry->count; i++)
            sge_entry->entries[i] = sge[i];

        nd_send_entry *send_entry = ofi_nd_buf_alloc_nd_send_entry();
        if (!send_entry) {
            ND_LOG_WARN(FI_LOG_EP_DATA, "Send entry buffer can't be allocated");
            res = -FI_ENOMEM;
            goto fn_fail_int;
        }
        memset(send_entry, 0, sizeof(*send_entry));

        send_entry->cq_entry = entry;
        send_entry->sge = sge_entry;
        send_entry->ep = ep;

        /* Push the user's transmission request into
         * the send queue for further handling */
        ofi_nd_queue_push(&ep->send_queue, &send_entry->queue_item);
    } else {
        if (flags & FI_INJECT)