static int smr_fetch_result(struct smr_ep *ep, struct smr_region *peer_smr, struct iovec *iov, size_t iov_count, const struct fi_rma_ioc *rma_ioc, size_t rma_count, enum fi_datatype datatype, size_t total_len) { int ret, i; struct iovec rma_iov[SMR_IOV_LIMIT]; for (i = 0; i < rma_count; i++) { rma_iov[i].iov_base = (void *) rma_ioc[i].addr; rma_iov[i].iov_len = rma_ioc[i].count * ofi_datatype_size(datatype); } ret = process_vm_readv(peer_smr->pid, iov, iov_count, rma_iov, rma_count, 0); if (ret != total_len) { if (ret < 0) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "CMA write error\n"); return -errno; } else { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "partial read occurred\n"); return -FI_EIO; } } return 0; }
int psmx_query_atomic(struct fid_domain *doamin, enum fi_datatype datatype, enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags) { int ret; size_t count; if (flags & FI_TAGGED) return -FI_EOPNOTSUPP; if (flags & FI_COMPARE_ATOMIC) { if (flags & FI_FETCH_ATOMIC) return -FI_EINVAL; ret = psmx_atomic_compwritevalid(NULL, datatype, op, &count); } else if (flags & FI_FETCH_ATOMIC) { ret = psmx_atomic_readwritevalid(NULL, datatype, op, &count); } else { ret = psmx_atomic_writevalid(NULL, datatype, op, &count); } if (attr && !ret) { attr->size = ofi_datatype_size(datatype); attr->count = count; } return ret; }
static int psmx_atomic_readwritevalid(struct fid_ep *ep, enum fi_datatype datatype, enum fi_op op, size_t *count) { int chunk_size; if (datatype >= FI_DATATYPE_LAST) return -FI_EOPNOTSUPP; switch (op) { case FI_MIN: case FI_MAX: case FI_SUM: case FI_PROD: case FI_LOR: case FI_LAND: case FI_BOR: case FI_BAND: case FI_LXOR: case FI_BXOR: case FI_ATOMIC_READ: case FI_ATOMIC_WRITE: break; default: return -FI_EOPNOTSUPP; } if (count) { chunk_size = MIN(PSMX_AM_CHUNK_SIZE, psmx_am_param.max_request_short); *count = chunk_size / ofi_datatype_size(datatype); } return 0; }
static int psmx_atomic_compwritevalid(struct fid_ep *ep, enum fi_datatype datatype, enum fi_op op, size_t *count) { int chunk_size; if (datatype >= FI_DATATYPE_LAST) return -FI_EOPNOTSUPP; switch (op) { case FI_CSWAP: case FI_CSWAP_NE: break; case FI_CSWAP_LE: case FI_CSWAP_LT: case FI_CSWAP_GE: case FI_CSWAP_GT: if (datatype == FI_FLOAT_COMPLEX || datatype == FI_DOUBLE_COMPLEX || datatype == FI_LONG_DOUBLE_COMPLEX) return -FI_EOPNOTSUPP; break; case FI_MSWAP: if (datatype == FI_FLOAT_COMPLEX || datatype == FI_DOUBLE_COMPLEX || datatype == FI_LONG_DOUBLE_COMPLEX || datatype == FI_FLOAT || datatype == FI_DOUBLE || datatype == FI_LONG_DOUBLE) return -FI_EOPNOTSUPP; break; default: return -FI_EOPNOTSUPP; } if (count) { chunk_size = MIN(PSMX_AM_CHUNK_SIZE, psmx_am_param.max_request_short); *count = chunk_size / (2 * ofi_datatype_size(datatype)); } return 0; }
int smr_query_atomic(struct fid_domain *domain, enum fi_datatype datatype, enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags) { int ret; size_t total_size; if (flags & FI_TAGGED) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "tagged atomic op not supported\n"); return -FI_EINVAL; } ret = ofi_atomic_valid(&smr_prov, datatype, op, flags); if (ret || !attr) return ret; attr->size = ofi_datatype_size(datatype); total_size = (flags & FI_COMPARE_ATOMIC) ? SMR_COMP_INJECT_SIZE : SMR_INJECT_SIZE; attr->count = total_size / attr->size; return ret; }
/*
 * Inject a non-fetching atomic write to a shared-memory peer.  No local
 * completion is generated.  Consumes two command slots on the peer's queue:
 * one for the atomic command (inline or via an inject buffer) and one for
 * the target RMA descriptor.
 *
 * Returns 0 on success, -FI_EAGAIN when the peer queue is full, -FI_EINVAL
 * when the payload exceeds the inject limit, or a peer-verification error.
 *
 * BUGFIX: previously, when total_len exceeded SMR_INJECT_SIZE neither
 * formatting branch ran, yet the code still committed the (unformatted)
 * command to the peer queue.  Now rejected with -FI_EINVAL, matching
 * smr_generic_atomic's handling.
 */
static ssize_t smr_atomic_inject(struct fid_ep *ep_fid, const void *buf,
				 size_t count, fi_addr_t dest_addr,
				 uint64_t addr, uint64_t key,
				 enum fi_datatype datatype, enum fi_op op)
{
	struct smr_ep *ep;
	struct smr_region *peer_smr;
	struct smr_inject_buf *tx_buf;
	struct smr_cmd *cmd;
	struct iovec iov;
	struct fi_rma_ioc rma_ioc;
	int peer_id;
	ssize_t ret = 0;
	size_t total_len;

	assert(count <= SMR_INJECT_SIZE);

	ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid);

	peer_id = (int) dest_addr;
	ret = smr_verify_peer(ep, peer_id);
	if (ret)
		return ret;

	peer_smr = smr_peer_region(ep->region, peer_id);
	fastlock_acquire(&peer_smr->lock);
	/* Need two free slots: atomic cmd + rma_ioc descriptor. */
	if (peer_smr->cmd_cnt < 2) {
		ret = -FI_EAGAIN;
		goto unlock_region;
	}

	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
	total_len = count * ofi_datatype_size(datatype);

	iov.iov_base = (void *) buf;
	iov.iov_len = total_len;

	rma_ioc.addr = addr;
	rma_ioc.count = count;
	rma_ioc.key = key;

	if (total_len <= SMR_MSG_DATA_LEN) {
		/* Payload fits directly in the command's data area. */
		smr_format_inline_atomic(cmd,
					 smr_peer_addr(ep->region)[peer_id].addr,
					 &iov, 1, NULL, 0, ofi_op_atomic,
					 datatype, op);
	} else if (total_len <= SMR_INJECT_SIZE) {
		/* Stage the payload in a peer-owned inject buffer. */
		tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr));
		smr_format_inject_atomic(cmd,
					 smr_peer_addr(ep->region)[peer_id].addr,
					 &iov, 1, NULL, 0, NULL, 0,
					 ofi_op_atomic, datatype, op,
					 peer_smr, tx_buf);
	} else {
		FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "message too large\n");
		ret = -FI_EINVAL;
		goto unlock_region;
	}

	ofi_cirque_commit(smr_cmd_queue(peer_smr));
	peer_smr->cmd_cnt--;

	/* Second command carries the target RMA descriptor. */
	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
	smr_format_rma_ioc(cmd, &rma_ioc, 1);
	ofi_cirque_commit(smr_cmd_queue(peer_smr));
	peer_smr->cmd_cnt--;

unlock_region:
	fastlock_release(&peer_smr->lock);
	return ret;
}
/*
 * Common send path for all shared-memory atomic operations (write, fetch,
 * compare).  Formats the atomic command plus a second RMA-descriptor
 * command on the peer's queue (two slots total, under the peer region
 * lock), and delivers the local tx completion.
 *
 * op is the internal opcode (ofi_op_atomic / ofi_op_atomic_fetch /
 * ofi_op_atomic_compare); atomic_op is the fi_op the peer applies.
 * For fetch/compare without fast_rma, SMR_RMA_REQ defers result delivery
 * (and the tx completion) to the responder path.
 */
static ssize_t smr_generic_atomic(struct fid_ep *ep_fid,
			const struct fi_ioc *ioc, void **desc, size_t count,
			const struct fi_ioc *compare_ioc, void **compare_desc,
			size_t compare_count, struct fi_ioc *result_ioc,
			void **result_desc, size_t result_count,
			fi_addr_t addr, const struct fi_rma_ioc *rma_ioc,
			size_t rma_count, enum fi_datatype datatype,
			enum fi_op atomic_op, void *context, uint32_t op)
{
	struct smr_ep *ep;
	struct smr_domain *domain;
	struct smr_region *peer_smr;
	struct smr_inject_buf *tx_buf;
	struct smr_cmd *cmd;
	struct iovec iov[SMR_IOV_LIMIT];
	struct iovec compare_iov[SMR_IOV_LIMIT];
	struct iovec result_iov[SMR_IOV_LIMIT];
	int peer_id, err = 0;
	uint16_t flags = 0;
	ssize_t ret = 0;
	size_t msg_len, total_len;

	assert(count <= SMR_IOV_LIMIT);
	assert(result_count <= SMR_IOV_LIMIT);
	assert(compare_count <= SMR_IOV_LIMIT);
	assert(rma_count <= SMR_IOV_LIMIT);

	ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid);
	domain = container_of(ep->util_ep.domain, struct smr_domain,
			      util_domain);

	peer_id = (int) addr;
	ret = smr_verify_peer(ep, peer_id);
	if (ret)
		return ret;

	peer_smr = smr_peer_region(ep->region, peer_id);
	fastlock_acquire(&peer_smr->lock);
	/* Need two peer slots: atomic cmd + rma_ioc descriptor. */
	if (peer_smr->cmd_cnt < 2) {
		ret = -FI_EAGAIN;
		goto unlock_region;
	}

	/* Also reserve local cq space before committing anything remote. */
	fastlock_acquire(&ep->util_ep.tx_cq->cq_lock);
	if (ofi_cirque_isfull(ep->util_ep.tx_cq->cirq)) {
		ret = -FI_EAGAIN;
		goto unlock_cq;
	}

	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
	msg_len = total_len = ofi_datatype_size(datatype) *
			      ofi_total_ioc_cnt(ioc, count);

	switch (op) {
	case ofi_op_atomic_compare:
		assert(compare_ioc);
		ofi_ioc_to_iov(compare_ioc, compare_iov, compare_count,
			       ofi_datatype_size(datatype));
		/* compare atomics carry operand + compare value */
		total_len *= 2;
		/* fall through */
	case ofi_op_atomic_fetch:
		assert(result_ioc);
		ofi_ioc_to_iov(result_ioc, result_iov, result_count,
			       ofi_datatype_size(datatype));
		/* without fast_rma, ask the peer to deliver the result */
		if (!domain->fast_rma)
			flags |= SMR_RMA_REQ;
		/* fall through */
	case ofi_op_atomic:
		if (atomic_op != FI_ATOMIC_READ) {
			assert(ioc);
			ofi_ioc_to_iov(ioc, iov, count,
				       ofi_datatype_size(datatype));
		} else {
			/* pure read sends no operand data */
			count = 0;
		}
		break;
	default:
		break;
	}

	if (total_len <= SMR_MSG_DATA_LEN && !(flags & SMR_RMA_REQ)) {
		smr_format_inline_atomic(cmd,
				smr_peer_addr(ep->region)[peer_id].addr,
				iov, count, compare_iov, compare_count,
				op, datatype, atomic_op);
	} else if (total_len <= SMR_INJECT_SIZE) {
		tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr));
		smr_format_inject_atomic(cmd,
				smr_peer_addr(ep->region)[peer_id].addr,
				iov, count, result_iov, result_count,
				compare_iov, compare_count, op, datatype,
				atomic_op, peer_smr, tx_buf);
	} else {
		FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "message too large\n");
		ret = -FI_EINVAL;
		goto unlock_cq;
	}
	cmd->msg.hdr.op_flags |= flags;

	ofi_cirque_commit(smr_cmd_queue(peer_smr));
	peer_smr->cmd_cnt--;

	if (op != ofi_op_atomic) {
		if (flags & SMR_RMA_REQ) {
			/* Peer writes the result back; completion is
			 * generated later by the response handler. */
			smr_post_fetch_resp(ep, cmd,
					(const struct iovec *) result_iov,
					result_count);
			goto format_rma;
		}
		/* fast path: pull results directly via CMA */
		err = smr_fetch_result(ep, peer_smr, result_iov,
				       result_count, rma_ioc, rma_count,
				       datatype, msg_len);
		if (err)
			FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
				"unable to fetch results");
	}

	ret = ep->tx_comp(ep, context, ofi_tx_cq_flags(op), err);
	if (ret) {
		FI_WARN(&smr_prov, FI_LOG_EP_CTRL,
			"unable to process tx completion\n");
	}

format_rma:
	/* Second command carries the target RMA descriptors. */
	cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr));
	smr_format_rma_ioc(cmd, rma_ioc, rma_count);
	ofi_cirque_commit(smr_cmd_queue(peer_smr));
	peer_smr->cmd_cnt--;
unlock_cq:
	fastlock_release(&ep->util_ep.tx_cq->cq_lock);
unlock_region:
	fastlock_release(&peer_smr->lock);
	return ret;
}
/*
 * Issue a fetching atomic (read-write) operation over PSM active messages.
 *
 * Handles four paths: (1) FI_TRIGGER queues the request on a counter
 * trigger for deferred execution; (2) loopback targets go through
 * psmx_atomic_self(); (3) FI_INJECT copies the operand after the request
 * struct so the caller's buffer may be reused immediately; (4) otherwise
 * the caller's buffer is referenced directly.  The request pointer rides
 * in args[1] and is matched up in the AM reply handler, which frees it.
 *
 * Returns 0 on successful submission (completion reported via CQ), or a
 * negative fi_errno on validation/allocation failure.
 */
ssize_t _psmx_atomic_readwrite(struct fid_ep *ep,
			       const void *buf,
			       size_t count, void *desc,
			       void *result, void *result_desc,
			       fi_addr_t dest_addr,
			       uint64_t addr, uint64_t key,
			       enum fi_datatype datatype,
			       enum fi_op op, void *context,
			       uint64_t flags)
{
	struct psmx_fid_ep *ep_priv;
	struct psmx_fid_av *av;
	struct psmx_epaddr_context *epaddr_context;
	struct psmx_am_request *req;
	psm_amarg_t args[8];
	int am_flags = PSM_AM_FLAG_ASYNC;
	int chunk_size, len;
	size_t idx;

	ep_priv = container_of(ep, struct psmx_fid_ep, ep);

	if (flags & FI_TRIGGER) {
		/* Defer the whole call until the trigger counter reaches
		 * its threshold; all arguments are captured by value. */
		struct psmx_trigger *trigger;
		struct fi_triggered_context *ctxt = context;

		trigger = calloc(1, sizeof(*trigger));
		if (!trigger)
			return -FI_ENOMEM;

		trigger->op = PSMX_TRIGGERED_ATOMIC_READWRITE;
		trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
					     struct psmx_fid_cntr, cntr);
		trigger->threshold = ctxt->trigger.threshold.threshold;
		trigger->atomic_readwrite.ep = ep;
		trigger->atomic_readwrite.buf = buf;
		trigger->atomic_readwrite.count = count;
		trigger->atomic_readwrite.desc = desc;
		trigger->atomic_readwrite.result = result;
		trigger->atomic_readwrite.result_desc = result_desc;
		trigger->atomic_readwrite.dest_addr = dest_addr;
		trigger->atomic_readwrite.addr = addr;
		trigger->atomic_readwrite.key = key;
		trigger->atomic_readwrite.datatype = datatype;
		trigger->atomic_readwrite.atomic_op = op;
		trigger->atomic_readwrite.context = context;
		trigger->atomic_readwrite.flags = flags & ~FI_TRIGGER;

		psmx_cntr_add_trigger(trigger->cntr, trigger);
		return 0;
	}

	/* FI_ATOMIC_READ is the only op that sends no operand buffer. */
	if (!buf && op != FI_ATOMIC_READ)
		return -FI_EINVAL;

	if (datatype >= FI_DATATYPE_LAST)
		return -FI_EINVAL;

	if (op >= FI_ATOMIC_OP_LAST)
		return -FI_EINVAL;

	/* FI_AV_TABLE addresses are indices; resolve to psm epaddrs. */
	av = ep_priv->av;
	if (av && av->type == FI_AV_TABLE) {
		idx = dest_addr;
		if (idx >= av->last)
			return -FI_EINVAL;

		dest_addr = (fi_addr_t) av->psm_epaddrs[idx];
	} else if (!dest_addr) {
		return -FI_EINVAL;
	}

	/* Loopback: execute the atomic locally, no AM round trip. */
	epaddr_context = psm_epaddr_getctxt((void *)dest_addr);
	if (epaddr_context->epid == ep_priv->domain->psm_epid)
		return psmx_atomic_self(PSMX_AM_REQ_ATOMIC_READWRITE,
					ep_priv, buf, count, desc,
					NULL, NULL, result, result_desc,
					addr, key, datatype, op,
					context, flags);

	chunk_size = MIN(PSMX_AM_CHUNK_SIZE, psmx_am_param.max_request_short);
	len = ofi_datatype_size(datatype) * count;
	if (len > chunk_size)
		return -FI_EMSGSIZE;

	if ((flags & FI_INJECT) && op != FI_ATOMIC_READ) {
		/* Copy the operand after the request so the caller may
		 * reuse its buffer as soon as this call returns. */
		req = malloc(sizeof(*req) + len);
		if (!req)
			return -FI_ENOMEM;
		memset(req, 0, sizeof(*req));
		memcpy((uint8_t *)req+sizeof(*req), (void *)buf, len);
		buf = (uint8_t *)req + sizeof(*req);
	} else {
		req = calloc(1, sizeof(*req));
		if (!req)
			return -FI_ENOMEM;
	}

	req->no_event = (flags & PSMX_NO_COMPLETION) ||
			(ep_priv->send_selective_completion &&
			 !(flags & FI_COMPLETION));

	req->op = PSMX_AM_REQ_ATOMIC_READWRITE;
	req->atomic.buf = (void *)buf;
	req->atomic.len = len;
	req->atomic.addr = addr;
	req->atomic.key = key;
	req->atomic.context = context;
	req->atomic.result = result;
	req->ep = ep_priv;
	if (op == FI_ATOMIC_READ)
		req->cq_flags = FI_READ | FI_ATOMIC;
	else
		req->cq_flags = FI_WRITE | FI_ATOMIC;

	/* args[1] carries the request pointer; the reply handler uses it
	 * to find and free this request. */
	args[0].u32w0 = PSMX_AM_REQ_ATOMIC_READWRITE;
	args[0].u32w1 = count;
	args[1].u64 = (uint64_t)(uintptr_t)req;
	args[2].u64 = addr;
	args[3].u64 = key;
	args[4].u32w0 = datatype;
	args[4].u32w1 = op;
	psm_am_request_short((psm_epaddr_t) dest_addr,
			     PSMX_AM_ATOMIC_HANDLER, args, 5,
			     (void *)buf, (buf?len:0), am_flags, NULL, NULL);

	return 0;
}
/*
 * Execute an atomic operation locally when the target is this process
 * (loopback).  Validates the target MR, applies the operation in place,
 * bumps the remote-side counters of the domain's atomics endpoint, then
 * generates the initiator-side CQ event and counter update.
 *
 * When the result buffer aliases the operand (or compare) buffer, the
 * result is staged through a temporary allocation so the operation reads
 * consistent inputs.
 *
 * Returns 0 on success or a negative fi_errno; MR validation failures are
 * reported through the CQ event's op_error rather than the return value.
 */
static int psmx_atomic_self(int am_cmd,
			    struct psmx_fid_ep *ep,
			    const void *buf,
			    size_t count, void *desc,
			    const void *compare, void *compare_desc,
			    void *result, void *result_desc,
			    uint64_t addr, uint64_t key,
			    enum fi_datatype datatype,
			    enum fi_op op, void *context,
			    uint64_t flags)
{
	struct psmx_fid_mr *mr;
	struct psmx_cq_event *event;
	struct psmx_fid_ep *target_ep;
	struct psmx_fid_cntr *cntr = NULL;
	struct psmx_fid_cntr *mr_cntr = NULL;
	void *tmp_buf;
	size_t len;
	int no_event;
	int err = 0;
	int op_error;
	int access;
	uint64_t cq_flags = 0;

	/* Fetching/compare ops also read the target memory. */
	if (am_cmd == PSMX_AM_REQ_ATOMIC_WRITE)
		access = FI_REMOTE_WRITE;
	else
		access = FI_REMOTE_READ | FI_REMOTE_WRITE;

	len = ofi_datatype_size(datatype) * count;
	mr = psmx_mr_get(psmx_active_fabric->active_domain, key);
	op_error = mr ? psmx_mr_validate(mr, addr, len, access) : -FI_EINVAL;

	if (op_error)
		goto gen_local_event;

	addr += mr->offset;

	switch (am_cmd) {
	case PSMX_AM_REQ_ATOMIC_WRITE:
		err = psmx_atomic_do_write((void *)addr, (void *)buf,
					   (int)datatype, (int)op,
					   (int)count);
		cq_flags = FI_WRITE | FI_ATOMIC;
		break;

	case PSMX_AM_REQ_ATOMIC_READWRITE:
		if (result != buf) {
			err = psmx_atomic_do_readwrite((void *)addr,
						       (void *)buf,
						       (void *)result,
						       (int)datatype,
						       (int)op, (int)count);
		} else {
			/* result aliases the operand: stage via tmp_buf */
			tmp_buf = malloc(len);
			if (tmp_buf) {
				memcpy(tmp_buf, result, len);
				err = psmx_atomic_do_readwrite((void *)addr,
							(void *)buf, tmp_buf,
							(int)datatype,
							(int)op, (int)count);
				memcpy(result, tmp_buf, len);
				free(tmp_buf);
			} else {
				err = -FI_ENOMEM;
			}
		}
		if (op == FI_ATOMIC_READ)
			cq_flags = FI_READ | FI_ATOMIC;
		else
			cq_flags = FI_WRITE | FI_ATOMIC;
		break;

	case PSMX_AM_REQ_ATOMIC_COMPWRITE:
		if (result != buf && result != compare) {
			err = psmx_atomic_do_compwrite((void *)addr,
						       (void *)buf,
						       (void *)compare,
						       (void *)result,
						       (int)datatype,
						       (int)op, (int)count);
		} else {
			/* result aliases an input: stage via tmp_buf */
			tmp_buf = malloc(len);
			if (tmp_buf) {
				memcpy(tmp_buf, result, len);
				err = psmx_atomic_do_compwrite((void *)addr,
							(void *)buf,
							(void *)compare,
							tmp_buf,
							(int)datatype,
							(int)op, (int)count);
				memcpy(result, tmp_buf, len);
				free(tmp_buf);
			} else {
				err = -FI_ENOMEM;
			}
		}
		cq_flags = FI_WRITE | FI_ATOMIC;
		break;
	}

	/* Target-side (remote) counters on the domain's atomics endpoint. */
	target_ep = mr->domain->atomics_ep;
	if (op == FI_ATOMIC_READ) {
		cntr = target_ep->remote_read_cntr;
	} else {
		cntr = target_ep->remote_write_cntr;
		mr_cntr = mr->cntr;
	}

	if (cntr)
		psmx_cntr_inc(cntr);

	if (mr_cntr && mr_cntr != cntr)
		psmx_cntr_inc(mr_cntr);

gen_local_event:
	/* Initiator-side completion: suppressed unless selected, but
	 * errors are always reported. */
	no_event = ((flags & PSMX_NO_COMPLETION) ||
		    (ep->send_selective_completion &&
		     !(flags & FI_COMPLETION)));
	if (ep->send_cq && (!no_event || op_error)) {
		event = psmx_cq_create_event(
				ep->send_cq,
				context,
				(void *)buf,
				cq_flags,
				len,
				0, /* data */
				0, /* tag */
				0, /* olen */
				op_error);
		if (event)
			psmx_cq_enqueue_event(ep->send_cq, event);
		else
			err = -FI_ENOMEM;
	}

	switch (am_cmd) {
	case PSMX_AM_REQ_ATOMIC_WRITE:
		if (ep->write_cntr)
			psmx_cntr_inc(ep->write_cntr);
		break;

	case PSMX_AM_REQ_ATOMIC_READWRITE:
	case PSMX_AM_REQ_ATOMIC_COMPWRITE:
		if (ep->read_cntr)
			psmx_cntr_inc(ep->read_cntr);
		break;
	}

	return err;
}
/*
 * PSM active-message handler for atomic traffic.  Dispatches on the opcode
 * packed into args[0].u32w0: REQ_* cases run on the target (apply the
 * operation against the registered MR and reply), REP_* cases run back on
 * the initiator (deliver results, post CQ events/counters, free the
 * request captured in args[1]).
 *
 * For fetching/compare requests the old target data is returned through a
 * malloc'd tmp_buf handed to psm_am_reply_short(); it is released later by
 * psmx_am_atomic_completion.  NOTE(review): COMPWRITE replies reuse the
 * PSMX_AM_REP_ATOMIC_READWRITE opcode — the REP case below handles both,
 * so this appears intentional; confirm against the protocol definition.
 */
int psmx_am_atomic_handler(psm_am_token_t token, psm_epaddr_t epaddr,
			   psm_amarg_t *args, int nargs, void *src,
			   uint32_t len)
{
	psm_amarg_t rep_args[8];
	int count;
	uint8_t *addr;
	uint64_t key;
	int datatype, op;
	int err = 0;
	int op_error = 0;
	struct psmx_am_request *req;
	struct psmx_cq_event *event;
	struct psmx_fid_mr *mr;
	struct psmx_fid_ep *target_ep;
	struct psmx_fid_cntr *cntr = NULL;
	struct psmx_fid_cntr *mr_cntr = NULL;
	void *tmp_buf;

	switch (args[0].u32w0 & PSMX_AM_OP_MASK) {
	case PSMX_AM_REQ_ATOMIC_WRITE:
		/* target side: apply a non-fetching atomic write */
		count = args[0].u32w1;
		addr = (uint8_t *)(uintptr_t)args[2].u64;
		key = args[3].u64;
		datatype = args[4].u32w0;
		op = args[4].u32w1;
		assert(len == ofi_datatype_size(datatype) * count);

		mr = psmx_mr_get(psmx_active_fabric->active_domain, key);
		op_error = mr ? psmx_mr_validate(mr, (uint64_t)addr, len,
						 FI_REMOTE_WRITE) : -FI_EINVAL;

		if (!op_error) {
			addr += mr->offset;
			psmx_atomic_do_write(addr, src, datatype, op, count);

			target_ep = mr->domain->atomics_ep;
			cntr = target_ep->remote_write_cntr;
			mr_cntr = mr->cntr;

			if (cntr)
				psmx_cntr_inc(cntr);

			if (mr_cntr && mr_cntr != cntr)
				psmx_cntr_inc(mr_cntr);
		}

		rep_args[0].u32w0 = PSMX_AM_REP_ATOMIC_WRITE;
		rep_args[0].u32w1 = op_error;
		rep_args[1].u64 = args[1].u64;
		err = psm_am_reply_short(token, PSMX_AM_ATOMIC_HANDLER,
					 rep_args, 2, NULL, 0, 0,
					 NULL, NULL );
		break;

	case PSMX_AM_REQ_ATOMIC_READWRITE:
		/* target side: apply op and send old data back */
		count = args[0].u32w1;
		addr = (uint8_t *)(uintptr_t)args[2].u64;
		key = args[3].u64;
		datatype = args[4].u32w0;
		op = args[4].u32w1;

		/* FI_ATOMIC_READ carries no payload, so derive len */
		if (op == FI_ATOMIC_READ)
			len = ofi_datatype_size(datatype) * count;

		assert(len == ofi_datatype_size(datatype) * count);

		mr = psmx_mr_get(psmx_active_fabric->active_domain, key);
		op_error = mr ? psmx_mr_validate(mr, (uint64_t)addr, len,
						 FI_REMOTE_READ |
						 FI_REMOTE_WRITE) : -FI_EINVAL;

		if (!op_error) {
			addr += mr->offset;
			tmp_buf = malloc(len);
			if (tmp_buf)
				psmx_atomic_do_readwrite(addr, src, tmp_buf,
							 datatype, op, count);
			else
				op_error = -FI_ENOMEM;

			target_ep = mr->domain->atomics_ep;
			if (op == FI_ATOMIC_READ) {
				cntr = target_ep->remote_read_cntr;
			} else {
				cntr = target_ep->remote_write_cntr;
				mr_cntr = mr->cntr;
			}

			if (cntr)
				psmx_cntr_inc(cntr);

			if (mr_cntr && mr_cntr != cntr)
				psmx_cntr_inc(mr_cntr);
		} else {
			tmp_buf = NULL;
		}

		rep_args[0].u32w0 = PSMX_AM_REP_ATOMIC_READWRITE;
		rep_args[0].u32w1 = op_error;
		rep_args[1].u64 = args[1].u64;
		/* tmp_buf freed by psmx_am_atomic_completion */
		err = psm_am_reply_short(token, PSMX_AM_ATOMIC_HANDLER,
					 rep_args, 2, tmp_buf,
					 (tmp_buf?len:0), 0,
					 psmx_am_atomic_completion,
					 tmp_buf );
		break;

	case PSMX_AM_REQ_ATOMIC_COMPWRITE:
		/* target side: payload is operand followed by compare */
		count = args[0].u32w1;
		addr = (uint8_t *)(uintptr_t)args[2].u64;
		key = args[3].u64;
		datatype = args[4].u32w0;
		op = args[4].u32w1;
		len /= 2;
		assert(len == ofi_datatype_size(datatype) * count);

		mr = psmx_mr_get(psmx_active_fabric->active_domain, key);
		op_error = mr ? psmx_mr_validate(mr, (uint64_t)addr, len,
						 FI_REMOTE_READ |
						 FI_REMOTE_WRITE) : -FI_EINVAL;

		if (!op_error) {
			addr += mr->offset;
			tmp_buf = malloc(len);
			if (tmp_buf)
				psmx_atomic_do_compwrite(addr, src,
							 (uint8_t *)src + len,
							 tmp_buf, datatype,
							 op, count);
			else
				op_error = -FI_ENOMEM;

			target_ep = mr->domain->atomics_ep;
			cntr = target_ep->remote_write_cntr;
			mr_cntr = mr->cntr;

			if (cntr)
				psmx_cntr_inc(cntr);

			if (mr_cntr && mr_cntr != cntr)
				psmx_cntr_inc(mr_cntr);
		} else {
			tmp_buf = NULL;
		}

		rep_args[0].u32w0 = PSMX_AM_REP_ATOMIC_READWRITE;
		rep_args[0].u32w1 = op_error;
		rep_args[1].u64 = args[1].u64;
		err = psm_am_reply_short(token, PSMX_AM_ATOMIC_HANDLER,
					 rep_args, 2, tmp_buf,
					 (tmp_buf?len:0), 0,
					 psmx_am_atomic_completion,
					 tmp_buf );
		break;

	case PSMX_AM_REP_ATOMIC_WRITE:
		/* initiator side: write acknowledged */
		req = (struct psmx_am_request *)(uintptr_t)args[1].u64;
		op_error = (int)args[0].u32w1;
		assert(req->op == PSMX_AM_REQ_ATOMIC_WRITE);
		if (req->ep->send_cq && (!req->no_event || op_error)) {
			event = psmx_cq_create_event(
					req->ep->send_cq,
					req->atomic.context,
					req->atomic.buf,
					req->cq_flags,
					req->atomic.len,
					0, /* data */
					0, /* tag */
					0, /* olen */
					op_error);
			if (event)
				psmx_cq_enqueue_event(req->ep->send_cq,
						      event);
			else
				err = -FI_ENOMEM;
		}

		if (req->ep->write_cntr)
			psmx_cntr_inc(req->ep->write_cntr);

		free(req);
		break;

	case PSMX_AM_REP_ATOMIC_READWRITE:
	case PSMX_AM_REP_ATOMIC_COMPWRITE:
		/* initiator side: copy fetched data into the result buf */
		req = (struct psmx_am_request *)(uintptr_t)args[1].u64;
		op_error = (int)args[0].u32w1;
		assert(op_error || req->atomic.len == len);

		if (!op_error)
			memcpy(req->atomic.result, src, len);

		if (req->ep->send_cq && (!req->no_event || op_error)) {
			event = psmx_cq_create_event(
					req->ep->send_cq,
					req->atomic.context,
					req->atomic.buf,
					req->cq_flags,
					req->atomic.len,
					0, /* data */
					0, /* tag */
					0, /* olen */
					op_error);
			if (event)
				psmx_cq_enqueue_event(req->ep->send_cq,
						      event);
			else
				err = -FI_ENOMEM;
		}

		if (req->ep->read_cntr)
			psmx_cntr_inc(req->ep->read_cntr);

		free(req);
		break;

	default:
		err = -FI_EINVAL;
	}
	return err;
}
ssize_t _gnix_atomic(struct gnix_fid_ep *ep, enum gnix_fab_req_type fr_type, const struct fi_msg_atomic *msg, const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags) { struct gnix_vc *vc; struct gnix_fab_req *req; struct gnix_fid_mem_desc *md = NULL; int rc, len; struct fid_mr *auto_mr = NULL; void *mdesc = NULL; uint64_t compare_operand = 0; void *loc_addr = NULL; int dt_len, dt_align; int connected; if (!(flags & FI_INJECT) && !ep->send_cq && (((fr_type == GNIX_FAB_RQ_AMO || fr_type == GNIX_FAB_RQ_NAMO_AX || fr_type == GNIX_FAB_RQ_NAMO_AX_S) && !ep->write_cntr) || ((fr_type == GNIX_FAB_RQ_FAMO || fr_type == GNIX_FAB_RQ_CAMO || fr_type == GNIX_FAB_RQ_NAMO_FAX || fr_type == GNIX_FAB_RQ_NAMO_FAX_S) && !ep->read_cntr))) { return -FI_ENOCQ; } if (!ep || !msg || !msg->msg_iov || msg->msg_iov[0].count != 1 || msg->iov_count != GNIX_MAX_ATOMIC_IOV_LIMIT || !msg->rma_iov) return -FI_EINVAL; /* * see fi_atomic man page */ if ((msg->op != FI_ATOMIC_READ) && !msg->msg_iov[0].addr) return -FI_EINVAL; if (flags & FI_TRIGGER) { struct fi_triggered_context *trigger_context = (struct fi_triggered_context *)msg->context; if ((trigger_context->event_type != FI_TRIGGER_THRESHOLD) || (flags & FI_INJECT)) { return -FI_EINVAL; } } if (fr_type == GNIX_FAB_RQ_CAMO) { if (!comparev || !comparev[0].addr || compare_count != 1) return -FI_EINVAL; compare_operand = *(uint64_t *)comparev[0].addr; } dt_len = ofi_datatype_size(msg->datatype); dt_align = dt_len - 1; len = dt_len * msg->msg_iov->count; if (msg->rma_iov->addr & dt_align) { GNIX_INFO(FI_LOG_EP_DATA, "Invalid target alignment: %d (mask 0x%x)\n", msg->rma_iov->addr, dt_align); return -FI_EINVAL; } /* need a memory descriptor for all fetching and comparison AMOs */ if (fr_type == GNIX_FAB_RQ_FAMO || fr_type == GNIX_FAB_RQ_CAMO || fr_type == GNIX_FAB_RQ_NAMO_FAX || fr_type == GNIX_FAB_RQ_NAMO_FAX_S) { if (!resultv || !resultv[0].addr || 
result_count != 1) return -FI_EINVAL; loc_addr = resultv[0].addr; if ((uint64_t)loc_addr & dt_align) { GNIX_INFO(FI_LOG_EP_DATA, "Invalid source alignment: %d (mask 0x%x)\n", loc_addr, dt_align); return -FI_EINVAL; } if (!result_desc || !result_desc[0]) { rc = _gnix_mr_reg(&ep->domain->domain_fid.fid, loc_addr, len, FI_READ | FI_WRITE, 0, 0, 0, &auto_mr, NULL, ep->auth_key, GNIX_PROV_REG); if (rc != FI_SUCCESS) { GNIX_INFO(FI_LOG_EP_DATA, "Failed to auto-register local buffer: %d\n", rc); return rc; } flags |= FI_LOCAL_MR; mdesc = (void *)auto_mr; GNIX_INFO(FI_LOG_EP_DATA, "auto-reg MR: %p\n", auto_mr); } else { mdesc = result_desc[0]; } } /* setup fabric request */ req = _gnix_fr_alloc(ep); if (!req) { GNIX_INFO(FI_LOG_EP_DATA, "_gnix_fr_alloc() failed\n"); rc = -FI_ENOSPC; goto err_fr_alloc; } req->type = fr_type; req->gnix_ep = ep; req->user_context = msg->context; req->work_fn = _gnix_amo_post_req; if (mdesc) { md = container_of(mdesc, struct gnix_fid_mem_desc, mr_fid); } req->amo.loc_md = (void *)md; req->amo.loc_addr = (uint64_t)loc_addr; if ((fr_type == GNIX_FAB_RQ_NAMO_AX) || (fr_type == GNIX_FAB_RQ_NAMO_FAX) || (fr_type == GNIX_FAB_RQ_NAMO_AX_S) || (fr_type == GNIX_FAB_RQ_NAMO_FAX_S)) { req->amo.first_operand = *(uint64_t *)msg->msg_iov[0].addr; req->amo.second_operand = *((uint64_t *)(msg->msg_iov[0].addr) + 1); } else if (msg->op == FI_ATOMIC_READ) { req->amo.first_operand = 0xFFFFFFFFFFFFFFFF; /* operand to FAND */ } else if (msg->op == FI_CSWAP) { req->amo.first_operand = compare_operand; req->amo.second_operand = *(uint64_t *)msg->msg_iov[0].addr; } else if (msg->op == FI_MSWAP) { req->amo.first_operand = ~compare_operand; req->amo.second_operand = *(uint64_t *)msg->msg_iov[0].addr; req->amo.second_operand &= compare_operand; } else { req->amo.first_operand = *(uint64_t *)msg->msg_iov[0].addr; } req->amo.rem_addr = msg->rma_iov->addr; req->amo.rem_mr_key = msg->rma_iov->key; req->amo.len = len; req->amo.imm = msg->data; req->amo.datatype = 
msg->datatype; req->amo.op = msg->op; req->flags = flags; /* Inject interfaces always suppress completions. If * SELECTIVE_COMPLETION is set, honor any setting. Otherwise, always * deliver a completion. */ if ((flags & GNIX_SUPPRESS_COMPLETION) || (ep->send_selective_completion && !(flags & FI_COMPLETION))) { req->flags &= ~FI_COMPLETION; } else { req->flags |= FI_COMPLETION; } COND_ACQUIRE(ep->requires_lock, &ep->vc_lock); /* find VC for target */ rc = _gnix_vc_ep_get_vc(ep, msg->addr, &vc); if (rc) { GNIX_INFO(FI_LOG_EP_DATA, "_gnix_vc_ep_get_vc() failed, addr: %lx, rc:\n", msg->addr, rc); goto err_get_vc; } req->vc = vc; rc = _gnix_vc_queue_tx_req(req); connected = (vc->conn_state == GNIX_VC_CONNECTED); COND_RELEASE(ep->requires_lock, &ep->vc_lock); /* *If a new VC was allocated, progress CM before returning. * If the VC is connected and there's a backlog, poke * the nic progress engine befure returning. */ if (!connected) { _gnix_cm_nic_progress(ep->cm_nic); } else if (!dlist_empty(&vc->tx_queue)) { _gnix_nic_progress(vc->ep->nic); } return rc; err_get_vc: COND_RELEASE(ep->requires_lock, &ep->vc_lock); err_fr_alloc: if (auto_mr) { fi_close(&auto_mr->fid); } return rc; }
/*
 * Common transmit path for rxm atomic operations.  Packs the operand data
 * (and, for compare ops, the compare data immediately after it) into a
 * tx buffer from the atomic pool behind an rxm atomic header, records
 * where results should land, and sends the request to the peer.
 *
 * Everything — header, operands, compare values — must fit within the
 * eager limit; larger operations are rejected with -FI_EINVAL.
 * FI_REMOTE_CQ_DATA is not supported for atomics.
 */
static ssize_t rxm_ep_atomic_common(struct rxm_ep *rxm_ep,
				    struct rxm_conn *rxm_conn,
				    const struct fi_msg_atomic *msg,
				    const struct fi_ioc *comparev,
				    void **compare_desc,
				    size_t compare_iov_count,
				    struct fi_ioc *resultv,
				    void **result_desc,
				    size_t result_iov_count,
				    uint32_t op, uint64_t flags)
{
	struct rxm_tx_atomic_buf *tx_buf;
	struct rxm_atomic_hdr *atomic_hdr;
	struct iovec buf_iov[RXM_IOV_LIMIT];
	struct iovec cmp_iov[RXM_IOV_LIMIT];
	size_t datatype_sz = ofi_datatype_size(msg->datatype);
	size_t buf_len = 0;
	size_t cmp_len = 0;
	size_t tot_len;
	ssize_t ret;

	assert(msg->iov_count <= RXM_IOV_LIMIT &&
	       msg->rma_iov_count <= RXM_IOV_LIMIT);

	if (flags & FI_REMOTE_CQ_DATA) {
		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
			"atomic with remote CQ data not supported\n");
		return -FI_EINVAL;
	}

	/* FI_ATOMIC_READ sends no operand payload. */
	if (msg->op != FI_ATOMIC_READ) {
		assert(msg->msg_iov);
		ofi_ioc_to_iov(msg->msg_iov, buf_iov, msg->iov_count,
			       datatype_sz);
		buf_len = ofi_total_iov_len(buf_iov, msg->iov_count);
	}

	if (op == ofi_op_atomic_compare) {
		assert(comparev);
		ofi_ioc_to_iov(comparev, cmp_iov, compare_iov_count,
			       datatype_sz);
		cmp_len = ofi_total_iov_len(cmp_iov, compare_iov_count);
		assert(buf_len == cmp_len);
	}

	tot_len = buf_len + cmp_len + sizeof(struct rxm_atomic_hdr) +
		  sizeof(struct rxm_pkt);

	if (tot_len > rxm_eager_limit) {
		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
			"atomic data too large %zu\n", tot_len);
		return -FI_EINVAL;
	}

	ofi_ep_lock_acquire(&rxm_ep->util_ep);
	tx_buf = (struct rxm_tx_atomic_buf *)
		 rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_ATOMIC);
	if (OFI_UNLIKELY(!tx_buf)) {
		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
			"Ran out of buffers from Atomic buffer pool\n");
		ret = -FI_EAGAIN;
		goto unlock;
	}

	rxm_ep_format_atomic_pkt_hdr(rxm_conn, tx_buf, tot_len, op,
				     msg->datatype, msg->op, flags,
				     msg->data, msg->rma_iov,
				     msg->rma_iov_count);
	/* msg_id doubles as the buffer index for response matching */
	tx_buf->pkt.ctrl_hdr.msg_id = ofi_buf_index(tx_buf);
	tx_buf->app_context = msg->context;

	atomic_hdr = (struct rxm_atomic_hdr *) tx_buf->pkt.data;

	/* operands first, compare values packed right after */
	ofi_copy_from_iov(atomic_hdr->data, buf_len, buf_iov,
			  msg->iov_count, 0);
	if (cmp_len)
		ofi_copy_from_iov(atomic_hdr->data + buf_len, cmp_len,
				  cmp_iov, compare_iov_count, 0);

	/* remember where fetched results should be delivered */
	tx_buf->result_iov_count = result_iov_count;
	if (resultv)
		ofi_ioc_to_iov(resultv, tx_buf->result_iov,
			       result_iov_count, datatype_sz);

	ret = rxm_ep_send_atomic_req(rxm_ep, rxm_conn, tx_buf, tot_len);
	if (ret)
		ofi_buf_free(tx_buf);
unlock:
	ofi_ep_lock_release(&rxm_ep->util_ep);
	return ret;
}
static ssize_t rxm_ep_atomic_writev(struct fid_ep *ep_fid, const struct fi_ioc *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); struct fi_rma_ioc rma_iov = { .addr = addr, .count = ofi_total_ioc_cnt(iov, count), .key = key, }; struct fi_msg_atomic msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .datatype = datatype, .op = op, .context = context, .data = 0, }; return rxm_ep_generic_atomic_writemsg(rxm_ep, &msg, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_atomic_write(struct fid_ep *ep_fid, const void *buf, size_t count, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { const struct fi_ioc iov = { .addr = (void *) buf, .count = count, }; return rxm_ep_atomic_writev(ep_fid, &iov, &desc, 1, dest_addr, addr, key, datatype, op, context); } static ssize_t rxm_ep_atomic_inject(struct fid_ep *ep_fid, const void *buf, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); struct fi_ioc msg_iov = { .addr = (void *) buf, .count = count, }; struct fi_rma_ioc rma_iov = { .addr = addr, .count = count, .key = key, }; struct fi_msg_atomic msg = { .msg_iov = &msg_iov, .desc = NULL, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .datatype = datatype, .op = op, .context = NULL, .data = 0, }; return rxm_ep_generic_atomic_writemsg(rxm_ep, &msg, FI_INJECT); } static ssize_t rxm_ep_generic_atomic_readwritemsg(struct rxm_ep *rxm_ep, const struct fi_msg_atomic *msg, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags) { int ret; struct rxm_conn *rxm_conn; ret = rxm_ep_prepare_tx(rxm_ep, 
msg->addr, &rxm_conn); if (OFI_UNLIKELY(ret)) return ret; return rxm_ep_atomic_common(rxm_ep, rxm_conn, msg, NULL, NULL, 0, resultv, result_desc, result_count, ofi_op_atomic_fetch, flags); } static ssize_t rxm_ep_atomic_readwritemsg(struct fid_ep *ep_fid, const struct fi_msg_atomic *msg, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_generic_atomic_readwritemsg(rxm_ep, msg, resultv, result_desc, result_count, flags | rxm_ep->util_ep.tx_msg_flags); } static ssize_t rxm_ep_atomic_readwritev(struct fid_ep *ep_fid, const struct fi_ioc *iov, void **desc, size_t count, struct fi_ioc *resultv, void **result_desc, size_t result_count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); struct fi_rma_ioc rma_iov = { .addr = addr, .count = ofi_total_ioc_cnt(iov, count), .key = key, }; struct fi_msg_atomic msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .datatype = datatype, .op = op, .context = context, .data = 0, }; return rxm_ep_generic_atomic_readwritemsg(rxm_ep, &msg, resultv, result_desc, result_count, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_atomic_readwrite(struct fid_ep *ep_fid, const void *buf, size_t count, void *desc, void *result, void *result_desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { struct fi_ioc iov = { .addr = (op == FI_ATOMIC_READ) ? 
NULL : (void *) buf, .count = count, }; struct fi_ioc result_iov = { .addr = result, .count = count, }; if (!buf && op != FI_ATOMIC_READ) return -FI_EINVAL; return rxm_ep_atomic_readwritev(ep_fid, &iov, &desc, 1, &result_iov, &result_desc, 1, dest_addr, addr, key, datatype, op, context); } static ssize_t rxm_ep_generic_atomic_compwritemsg(struct rxm_ep *rxm_ep, const struct fi_msg_atomic *msg, const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags) { int ret; struct rxm_conn *rxm_conn; ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn); if (OFI_UNLIKELY(ret)) return ret; return rxm_ep_atomic_common(rxm_ep, rxm_conn, msg, comparev, compare_desc, compare_count, resultv, result_desc, result_count, ofi_op_atomic_compare, flags); } static ssize_t rxm_ep_atomic_compwritemsg(struct fid_ep *ep_fid, const struct fi_msg_atomic *msg, const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_generic_atomic_compwritemsg(rxm_ep, msg, comparev, compare_desc, compare_count, resultv, result_desc, result_count, flags | rxm_ep->util_ep.tx_msg_flags); } static ssize_t rxm_ep_atomic_compwritev(struct fid_ep *ep_fid, const struct fi_ioc *iov, void **desc, size_t count, const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, struct fi_ioc *resultv, void **result_desc, size_t result_count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); struct fi_rma_ioc rma_iov = { .addr = addr, .count = ofi_total_ioc_cnt(iov, count), .key = key, }; struct fi_msg_atomic msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = dest_addr, .rma_iov = 
&rma_iov, .rma_iov_count = 1, .datatype = datatype, .op = op, .context = context, .data = 0, }; return rxm_ep_generic_atomic_compwritemsg(rxm_ep, &msg, comparev, compare_desc, compare_count, resultv, result_desc, result_count, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_atomic_compwrite(struct fid_ep *ep_fid, const void *buf, size_t count, void *desc, const void *compare, void *compare_desc, void *result, void *result_desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { struct fi_ioc iov = { .addr = (void *) buf, .count = count, }; struct fi_ioc resultv = { .addr = result, .count = count, }; struct fi_ioc comparev = { .addr = (void *) compare, .count = count, }; return rxm_ep_atomic_compwritev(ep_fid, &iov, &desc, 1, &comparev, &compare_desc, 1, &resultv, &result_desc, 1, dest_addr, addr, key, datatype, op, context); } int rxm_ep_query_atomic(struct fid_domain *domain, enum fi_datatype datatype, enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags) { struct rxm_domain *rxm_domain = container_of(domain, struct rxm_domain, util_domain.domain_fid); size_t tot_size; int ret; if (flags & FI_TAGGED) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "tagged atomic op not supported\n"); return -FI_EINVAL; } ret = ofi_atomic_valid(&rxm_prov, datatype, op, flags); if (ret || !attr) return ret; tot_size = flags & FI_COMPARE_ATOMIC ? 
rxm_domain->max_atomic_size / 2 : rxm_domain->max_atomic_size; attr->size = ofi_datatype_size(datatype); attr->count = tot_size / attr->size; return FI_SUCCESS; } static int rxm_ep_atomic_valid(struct fid_ep *ep_fid, enum fi_datatype datatype, enum fi_op op, size_t *count) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid); struct fi_atomic_attr attr; int ret; ret = rxm_ep_query_atomic(&rxm_ep->util_ep.domain->domain_fid, datatype, op, &attr, 0); if (!ret) *count = attr.count; return ret; } static int rxm_ep_atomic_fetch_valid(struct fid_ep *ep_fid, enum fi_datatype datatype, enum fi_op op, size_t *count) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid); struct fi_atomic_attr attr; int ret; ret = rxm_ep_query_atomic(&rxm_ep->util_ep.domain->domain_fid, datatype, op, &attr, FI_FETCH_ATOMIC); if (!ret) *count = attr.count; return ret; } static int rxm_ep_atomic_cswap_valid(struct fid_ep *ep_fid, enum fi_datatype datatype, enum fi_op op, size_t *count) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid); struct fi_atomic_attr attr; int ret; ret = rxm_ep_query_atomic(&rxm_ep->util_ep.domain->domain_fid, datatype, op, &attr, FI_COMPARE_ATOMIC); if (!ret) *count = attr.count; return ret; } struct fi_ops_atomic rxm_ops_atomic = { .size = sizeof(struct fi_ops_atomic), .write = rxm_ep_atomic_write, .writev = rxm_ep_atomic_writev, .writemsg = rxm_ep_atomic_writemsg, .inject = rxm_ep_atomic_inject, .readwrite = rxm_ep_atomic_readwrite, .readwritev = rxm_ep_atomic_readwritev, .readwritemsg = rxm_ep_atomic_readwritemsg, .compwrite = rxm_ep_atomic_compwrite, .compwritev = rxm_ep_atomic_compwritev, .compwritemsg = rxm_ep_atomic_compwritemsg, .writevalid = rxm_ep_atomic_valid, .readwritevalid = rxm_ep_atomic_fetch_valid, .compwritevalid = rxm_ep_atomic_cswap_valid, };