/*
 * Probe the PSM matched queue for a tagged message without consuming it.
 *
 * Only the tag/ignore pair participates in matching on PSM1: buf, len,
 * desc and src_addr are accepted for interface compatibility but not used.
 * On a match, a completion describing the pending message is queued on the
 * endpoint's receive CQ (if any); the message itself stays in the MQ.
 *
 * Returns 0 on match, -FI_ENOMSG when nothing matches, -FI_EOPNOTSUPP for
 * FI_CLAIM/FI_DISCARD (needs improbe, unavailable here), or a translated
 * PSM error.
 */
ssize_t _psmx_tagged_peek(struct fid_ep *ep, void *buf, size_t len,
			  void *desc, fi_addr_t src_addr, uint64_t tag,
			  uint64_t ignore, void *context, uint64_t flags)
{
	struct psmx_fid_ep *ep_priv;
	psm_mq_status_t status;
	uint64_t mq_tag, mq_tagsel;
	struct psmx_cq_event *cq_event;
	int ret;

	ep_priv = container_of(ep, struct psmx_fid_ep, ep);

	if (tag & ep_priv->domain->reserved_tag_bits)
		FI_WARN(&psmx_prov, FI_LOG_EP_DATA, "using reserved tag bits."
			"tag=%lx. reserved_bits=%lx.\n", tag,
			ep_priv->domain->reserved_tag_bits);

	/* Reserved bits are masked out of the tag and forced to "must match
	 * zero" in the selector so provider-internal traffic never matches. */
	mq_tag = tag & ~ep_priv->domain->reserved_tag_bits;
	mq_tagsel = ~ignore | ep_priv->domain->reserved_tag_bits;

	/* FI_CLAIM/FI_DISCARD would require psm_mq_improbe support. */
	if (flags & (FI_CLAIM | FI_DISCARD))
		return -FI_EOPNOTSUPP;

	ret = psm_mq_iprobe(ep_priv->domain->psm_mq, mq_tag, mq_tagsel,
			    &status);
	if (ret == PSM_MQ_NO_COMPLETIONS)
		return -FI_ENOMSG;
	if (ret != PSM_OK)
		return psmx_errno(ret);

	/* Match found: report it, leaving the message unreceived, so len is
	 * reported both as length and overflow length (nothing was copied). */
	if (ep_priv->recv_cq) {
		cq_event = psmx_cq_create_event(
				ep_priv->recv_cq,
				context,			/* op_context */
				NULL,				/* buf */
				flags|FI_RECV|FI_TAGGED,	/* flags */
				status.msg_length,		/* len */
				0,				/* data */
				status.msg_tag,			/* tag */
				status.msg_length,		/* olen */
				0);				/* err */
		if (!cq_event)
			return -FI_ENOMEM;

		/* TODO: set message source to FI_ADDR_NOTAVAIL? */
		psmx_cq_enqueue_event(ep_priv->recv_cq, cq_event);
	}

	return 0;
}
/*
 * Cancel a posted receive identified by its fi_context.
 *
 * Only receive-side operations can be cancelled (tagged, untagged, or
 * multi-recv contexts); anything else gets -FI_EOPNOTSUPP.  On a
 * successful cancel the request is reaped with psm_mq_test and a
 * completion carrying -FI_ECANCELED is queued on the receive CQ.
 *
 * Returns the translated result of the last PSM call.
 */
static ssize_t psmx_ep_cancel(fid_t fid, void *context)
{
	struct psmx_fid_ep *ep_priv;
	psm_mq_status_t status;
	struct fi_context *fi_context = context;
	struct psmx_cq_event *event;
	uint64_t flags;
	int ctxt_type;
	int err;

	ep_priv = container_of(fid, struct psmx_fid_ep, ep.fid);
	if (!ep_priv->domain)
		return -FI_EBADF;
	if (!fi_context)
		return -FI_EINVAL;

	/* Map the context type to the completion flags to report. */
	ctxt_type = PSMX_CTXT_TYPE(fi_context);
	if (ctxt_type == PSMX_TRECV_CONTEXT)
		flags = FI_RECV | FI_TAGGED;
	else if (ctxt_type == PSMX_RECV_CONTEXT ||
		 ctxt_type == PSMX_MULTI_RECV_CONTEXT)
		flags = FI_RECV | FI_MSG;
	else
		return -FI_EOPNOTSUPP;

	err = psm_mq_cancel((psm_mq_req_t *)&PSMX_CTXT_REQ(fi_context));
	if (err == PSM_OK) {
		/* A cancelled request must still be tested to release it. */
		err = psm_mq_test((psm_mq_req_t *)&PSMX_CTXT_REQ(fi_context),
				  &status);
		if (err == PSM_OK && ep_priv->recv_cq) {
			event = psmx_cq_create_event(
					ep_priv->recv_cq,
					status.context,
					NULL,	/* buf */
					flags,
					0,	/* len */
					0,	/* data */
					0,	/* tag */
					0,	/* olen */
					-FI_ECANCELED);
			if (!event)
				return -FI_ENOMEM;
			psmx_cq_enqueue_event(ep_priv->recv_cq, event);
		}
	}

	return psmx_errno(err);
}
/*
 * Active-message handler for the RMA protocol.
 *
 * Dispatches on the opcode in args[0].u32w0:
 *   PSMX_AM_REQ_WRITE       - target side, short write: copy the chunk
 *                             payload in place; ack on last chunk or error.
 *   PSMX_AM_REQ_WRITE_LONG  - target side, rendezvous write: validate and
 *                             queue a request; data moves via MQ later.
 *   PSMX_AM_REQ_READ        - target side, short read: reply with the data.
 *   PSMX_AM_REQ_READ_LONG   - target side, rendezvous read: validate and
 *                             queue a request.
 *   PSMX_AM_REP_WRITE/READ  - initiator side: complete the original request.
 *
 * args[1].u64 always round-trips the initiator's request pointer.
 * Returns 0 or a negative errno.
 *
 * Fixes vs. previous revision:
 *   - WRITE_LONG validated only `len` (the AM header payload) instead of
 *     the full `rma_len` that the rendezvous transfer will write.
 *   - READ_LONG validated `len` with FI_REMOTE_WRITE although it is a read
 *     of `rma_len` bytes; now checks rma_len with FI_REMOTE_READ, matching
 *     the short READ case.
 */
int psmx_am_rma_handler(psm_am_token_t token, psm_epaddr_t epaddr,
			psm_amarg_t *args, int nargs, void *src, uint32_t len)
{
	psm_amarg_t rep_args[8];
	void *rma_addr;
	ssize_t rma_len;
	uint64_t key;
	int err = 0;
	int op_error = 0;
	int cmd, eom, has_data;
	struct psmx_am_request *req;
	struct psmx_cq_event *event;
	int chunk_size;
	uint64_t offset;
	struct psmx_fid_mr *mr;

	cmd = args[0].u32w0 & PSMX_AM_OP_MASK;
	eom = args[0].u32w0 & PSMX_AM_EOM;	/* last chunk of the op */
	has_data = args[0].u32w0 & PSMX_AM_DATA;	/* remote CQ data present */

	switch (cmd) {
	case PSMX_AM_REQ_WRITE:
		rma_len = args[0].u32w1;
		rma_addr = (void *)(uintptr_t)args[2].u64;
		key = args[3].u64;
		mr = psmx_mr_hash_get(key);
		/* Each chunk carries its own target address, so only this
		 * chunk's payload (len) needs validating here. */
		op_error = mr ?
			psmx_mr_validate(mr, (uint64_t)rma_addr, len,
					 FI_REMOTE_WRITE) : -EINVAL;
		if (!op_error) {
			rma_addr += mr->offset;
			memcpy(rma_addr, src, len);
			if (eom) {
				if (mr->cq) {
					/* TODO: report the addr/len of the whole write */
					event = psmx_cq_create_event(
							mr->cq,
							0, /* context */
							rma_addr,
							0, /* flags */
							rma_len,
							has_data ? args[4].u64 : 0,
							0, /* tag */
							0, /* olen */
							0); /* err */
					if (event)
						psmx_cq_enqueue_event(mr->cq, event);
					else
						err = -ENOMEM;
				}
				if (mr->cntr)
					psmx_cntr_inc(mr->cntr);
				if (mr->domain->rma_ep->remote_write_cntr)
					psmx_cntr_inc(mr->domain->rma_ep->remote_write_cntr);
			}
		}
		/* Ack only on completion or failure to bound reply traffic. */
		if (eom || op_error) {
			rep_args[0].u32w0 = PSMX_AM_REP_WRITE | eom;
			rep_args[0].u32w1 = op_error;
			rep_args[1].u64 = args[1].u64;
			err = psm_am_reply_short(token, PSMX_AM_RMA_HANDLER,
						 rep_args, 2, NULL, 0, 0,
						 NULL, NULL);
		}
		break;

	case PSMX_AM_REQ_WRITE_LONG:
		rma_len = args[0].u32w1;
		rma_addr = (void *)(uintptr_t)args[2].u64;
		key = args[3].u64;
		mr = psmx_mr_hash_get(key);
		/* The rendezvous transfer will write rma_len bytes, not just
		 * the AM payload (len), so the whole range must be valid. */
		op_error = mr ?
			psmx_mr_validate(mr, (uint64_t)rma_addr, rma_len,
					 FI_REMOTE_WRITE) : -EINVAL;
		if (op_error) {
			rep_args[0].u32w0 = PSMX_AM_REP_WRITE | eom;
			rep_args[0].u32w1 = op_error;
			rep_args[1].u64 = args[1].u64;
			err = psm_am_reply_short(token, PSMX_AM_RMA_HANDLER,
						 rep_args, 2, NULL, 0, 0,
						 NULL, NULL);
			break;
		}

		rma_addr += mr->offset;

		/* Queue the request; the actual data arrives via the MQ. */
		req = calloc(1, sizeof(*req));
		if (!req) {
			err = -ENOMEM;
		} else {
			req->op = args[0].u32w0;
			req->write.addr = (uint64_t)rma_addr;
			req->write.len = rma_len;
			req->write.key = key;
			req->write.context = (void *)args[4].u64;
			req->write.data = has_data ? args[5].u64 : 0;
			PSMX_CTXT_TYPE(&req->fi_context) = PSMX_REMOTE_WRITE_CONTEXT;
			PSMX_CTXT_USER(&req->fi_context) = mr;
			psmx_am_enqueue_rma(mr->domain, req);
		}
		break;

	case PSMX_AM_REQ_READ:
		rma_len = args[0].u32w1;
		rma_addr = (void *)(uintptr_t)args[2].u64;
		key = args[3].u64;
		offset = args[4].u64;
		mr = psmx_mr_hash_get(key);
		op_error = mr ?
			psmx_mr_validate(mr, (uint64_t)rma_addr, rma_len,
					 FI_REMOTE_READ) : -EINVAL;
		if (!op_error) {
			rma_addr += mr->offset;
		} else {
			/* Still reply so the initiator can complete. */
			rma_addr = NULL;
			rma_len = 0;
		}

		chunk_size = MIN(PSMX_AM_CHUNK_SIZE,
				 psmx_am_param.max_reply_short);
		assert(rma_len <= chunk_size);

		rep_args[0].u32w0 = PSMX_AM_REP_READ | eom;
		rep_args[0].u32w1 = op_error;
		rep_args[1].u64 = args[1].u64;
		rep_args[2].u64 = offset;
		err = psm_am_reply_short(token, PSMX_AM_RMA_HANDLER,
					 rep_args, 3, rma_addr, rma_len, 0,
					 NULL, NULL);

		if (eom && !op_error) {
			if (mr->domain->rma_ep->remote_read_cntr)
				psmx_cntr_inc(mr->domain->rma_ep->remote_read_cntr);
		}
		break;

	case PSMX_AM_REQ_READ_LONG:
		rma_len = args[0].u32w1;
		rma_addr = (void *)(uintptr_t)args[2].u64;
		key = args[3].u64;
		mr = psmx_mr_hash_get(key);
		/* This is a read of rma_len bytes: check FI_REMOTE_READ over
		 * the full range (was FI_REMOTE_WRITE over len). */
		op_error = mr ?
			psmx_mr_validate(mr, (uint64_t)rma_addr, rma_len,
					 FI_REMOTE_READ) : -EINVAL;
		if (op_error) {
			rep_args[0].u32w0 = PSMX_AM_REP_READ | eom;
			rep_args[0].u32w1 = op_error;
			rep_args[1].u64 = args[1].u64;
			rep_args[2].u64 = 0;
			err = psm_am_reply_short(token, PSMX_AM_RMA_HANDLER,
						 rep_args, 3, NULL, 0, 0,
						 NULL, NULL);
			break;
		}

		rma_addr += mr->offset;

		req = calloc(1, sizeof(*req));
		if (!req) {
			err = -ENOMEM;
		} else {
			req->op = args[0].u32w0;
			req->read.addr = (uint64_t)rma_addr;
			req->read.len = rma_len;
			req->read.key = key;
			req->read.context = (void *)args[4].u64;
			req->read.peer_addr = (void *)epaddr;
			PSMX_CTXT_TYPE(&req->fi_context) = PSMX_REMOTE_READ_CONTEXT;
			PSMX_CTXT_USER(&req->fi_context) = mr;
			psmx_am_enqueue_rma(mr->domain, req);
		}
		break;

	case PSMX_AM_REP_WRITE:
		req = (struct psmx_am_request *)(uintptr_t)args[1].u64;
		assert(req->op == PSMX_AM_REQ_WRITE);
		op_error = (int)args[0].u32w1;
		if (!req->error)
			req->error = op_error;	/* keep the first error */
		if (eom) {
			if (req->ep->send_cq && !req->no_event) {
				event = psmx_cq_create_event(
						req->ep->send_cq,
						req->write.context,
						req->write.buf,
						0, /* flags */
						req->write.len,
						0, /* data */
						0, /* tag */
						0, /* olen */
						req->error);
				if (event)
					psmx_cq_enqueue_event(req->ep->send_cq, event);
				else
					err = -ENOMEM;
			}
			if (req->ep->write_cntr)
				psmx_cntr_inc(req->ep->write_cntr);
			free(req);
		}
		break;

	case PSMX_AM_REP_READ:
		req = (struct psmx_am_request *)(uintptr_t)args[1].u64;
		assert(req->op == PSMX_AM_REQ_READ);
		op_error = (int)args[0].u32w1;
		offset = args[2].u64;
		if (!req->error)
			req->error = op_error;
		if (!op_error) {
			memcpy(req->read.buf + offset, src, len);
			req->read.len_read += len;
		}
		if (eom) {
			if (req->ep->send_cq && !req->no_event) {
				event = psmx_cq_create_event(
						req->ep->send_cq,
						req->read.context,
						req->read.buf,
						0, /* flags */
						req->read.len_read,
						0, /* data */
						0, /* tag */
						req->read.len - req->read.len_read,
						req->error);
				if (event)
					psmx_cq_enqueue_event(req->ep->send_cq, event);
				else
					err = -ENOMEM;
			}
			if (req->ep->read_cntr)
				psmx_cntr_inc(req->ep->read_cntr);
			free(req);
		}
		break;

	default:
		err = -EINVAL;
	}

	return err;
}
/*
 * psmx_rma_self - loopback fast path for RMA to the local endpoint.
 *
 * Performs the write (PSMX_AM_REQ_WRITE) or read (PSMX_AM_REQ_READ) with a
 * plain memcpy, then generates the same completions and counter updates the
 * remote AM path would: target-side CQ/counters for writes, plus the
 * initiator-side send CQ event and read/write counters.
 *
 * Returns 0 on success or a negative errno (access failures are reported
 * through the CQ event's error field, not the return value).
 */
static ssize_t psmx_rma_self(int am_cmd, struct psmx_fid_ep *ep,
			     void *buf, size_t len, void *desc,
			     uint64_t addr, uint64_t key,
			     void *context, uint64_t flags, uint64_t data)
{
	struct psmx_fid_mr *mr;
	struct psmx_cq_event *event;
	struct psmx_fid_cntr *cntr;
	int no_event;
	int err = 0;
	int op_error = 0;
	int access;
	void *dst, *src;

	switch (am_cmd) {
	case PSMX_AM_REQ_WRITE:
		access = FI_REMOTE_WRITE;
		break;
	case PSMX_AM_REQ_READ:
		access = FI_REMOTE_READ;
		break;
	default:
		return -EINVAL;
	}

	mr = psmx_mr_hash_get(key);
	op_error = mr ? psmx_mr_validate(mr, addr, len, access) : -EINVAL;

	if (!op_error) {
		/* MR may be registered with a target-side offset. */
		addr += mr->offset;
		if (am_cmd == PSMX_AM_REQ_WRITE) {
			dst = (void *)addr;
			src = buf;
			cntr = mr->domain->rma_ep->remote_write_cntr;
		} else {
			dst = buf;
			src = (void *)addr;
			cntr = mr->domain->rma_ep->remote_read_cntr;
		}

		memcpy(dst, src, len);

		/* Target-side completion only for writes. */
		if (mr->cq && am_cmd == PSMX_AM_REQ_WRITE) {
			event = psmx_cq_create_event(
					mr->cq,
					0, /* context */
					(void *)addr,
					0, /* flags */
					len,
					flags & FI_REMOTE_CQ_DATA ? data : 0,
					0, /* tag */
					0, /* olen */
					0 /* err */);
			if (event)
				psmx_cq_enqueue_event(mr->cq, event);
			else
				err = -ENOMEM;
		}
		if (mr->cntr && am_cmd == PSMX_AM_REQ_WRITE)
			psmx_cntr_inc(mr->cntr);
		if (cntr)
			psmx_cntr_inc(cntr);
	}

	/* Initiator-side completion, suppressed for injected ops or when
	 * the endpoint only reports explicitly requested events. */
	no_event = (flags & FI_INJECT) ||
		   (ep->send_cq_event_flag && !(flags & FI_EVENT));

	if (ep->send_cq && !no_event) {
		event = psmx_cq_create_event(
				ep->send_cq,
				context,
				(void *)buf,
				0, /* flags */
				len,
				0, /* data */
				0, /* tag */
				0, /* olen */
				op_error);
		if (event)
			psmx_cq_enqueue_event(ep->send_cq, event);
		else
			err = -ENOMEM;
	}

	switch (am_cmd) {
	case PSMX_AM_REQ_WRITE:
		if (ep->write_cntr)
			psmx_cntr_inc(ep->write_cntr);
		break;
	case PSMX_AM_REQ_READ:
		if (ep->read_cntr)
			psmx_cntr_inc(ep->read_cntr);
		break;
	}

	return err;
}
/*
 * psmx_atomic_self - loopback path for atomic ops targeting the local node.
 *
 * Validates the target MR, performs the atomic in place via the
 * psmx_atomic_do_* helpers, then generates target-side counter updates and
 * the initiator-side send CQ event / counters, mirroring the AM path.
 *
 * Access check: plain writes need FI_REMOTE_WRITE; fetching variants
 * (readwrite/compwrite) need both FI_REMOTE_READ and FI_REMOTE_WRITE.
 *
 * Returns 0 or a negative FI errno; MR validation failures are delivered
 * through the local CQ event's error field (op_error).
 */
static int psmx_atomic_self(int am_cmd,
			    struct psmx_fid_ep *ep,
			    const void *buf,
			    size_t count, void *desc,
			    const void *compare, void *compare_desc,
			    void *result, void *result_desc,
			    uint64_t addr, uint64_t key,
			    enum fi_datatype datatype,
			    enum fi_op op, void *context, uint64_t flags)
{
	struct psmx_fid_mr *mr;
	struct psmx_cq_event *event;
	struct psmx_fid_ep *target_ep;
	struct psmx_fid_cntr *cntr = NULL;
	struct psmx_fid_cntr *mr_cntr = NULL;
	void *tmp_buf;
	size_t len;
	int no_event;
	int err = 0;
	int op_error;
	int access;
	uint64_t cq_flags = 0;

	if (am_cmd == PSMX_AM_REQ_ATOMIC_WRITE)
		access = FI_REMOTE_WRITE;
	else
		access = FI_REMOTE_READ | FI_REMOTE_WRITE;

	len = fi_datatype_size(datatype) * count;
	mr = psmx_mr_get(psmx_active_fabric->active_domain, key);
	op_error = mr ? psmx_mr_validate(mr, addr, len, access) : -FI_EINVAL;

	if (op_error)
		goto gen_local_event;

	addr += mr->offset;

	switch (am_cmd) {
	case PSMX_AM_REQ_ATOMIC_WRITE:
		err = psmx_atomic_do_write((void *)addr, (void *)buf,
					   (int)datatype, (int)op, (int)count);
		cq_flags = FI_WRITE | FI_ATOMIC;
		break;

	case PSMX_AM_REQ_ATOMIC_READWRITE:
		if (result != buf) {
			err = psmx_atomic_do_readwrite((void *)addr, (void *)buf,
						       (void *)result, (int)datatype,
						       (int)op, (int)count);
		} else {
			/* result aliases the operand buffer: run the op on a
			 * scratch copy so source bytes aren't clobbered
			 * mid-operation. */
			tmp_buf = malloc(len);
			if (tmp_buf) {
				memcpy(tmp_buf, result, len);
				err = psmx_atomic_do_readwrite((void *)addr, (void *)buf,
							       tmp_buf, (int)datatype,
							       (int)op, (int)count);
				memcpy(result, tmp_buf, len);
				free(tmp_buf);
			} else {
				err = -FI_ENOMEM;
			}
		}
		if (op == FI_ATOMIC_READ)
			cq_flags = FI_READ | FI_ATOMIC;
		else
			cq_flags = FI_WRITE | FI_ATOMIC;
		break;

	case PSMX_AM_REQ_ATOMIC_COMPWRITE:
		if (result != buf && result != compare) {
			err = psmx_atomic_do_compwrite((void *)addr, (void *)buf,
						       (void *)compare, (void *)result,
						       (int)datatype, (int)op, (int)count);
		} else {
			/* same aliasing guard as above */
			tmp_buf = malloc(len);
			if (tmp_buf) {
				memcpy(tmp_buf, result, len);
				err = psmx_atomic_do_compwrite((void *)addr, (void *)buf,
							       (void *)compare, tmp_buf,
							       (int)datatype, (int)op, (int)count);
				memcpy(result, tmp_buf, len);
				free(tmp_buf);
			} else {
				err = -FI_ENOMEM;
			}
		}
		cq_flags = FI_WRITE | FI_ATOMIC;
		break;
	}

	/* Target-side counters: read counter for pure reads, otherwise the
	 * write counter plus the MR's own counter (if distinct). */
	target_ep = mr->domain->atomics_ep;
	if (op == FI_ATOMIC_READ) {
		cntr = target_ep->remote_read_cntr;
	} else {
		cntr = target_ep->remote_write_cntr;
		mr_cntr = mr->cntr;
	}

	if (cntr)
		psmx_cntr_inc(cntr);

	if (mr_cntr && mr_cntr != cntr)
		psmx_cntr_inc(mr_cntr);

gen_local_event:
	no_event = ((flags & PSMX_NO_COMPLETION) ||
		    (ep->send_selective_completion && !(flags & FI_COMPLETION)));
	if (ep->send_cq && !no_event) {
		event = psmx_cq_create_event(
				ep->send_cq,
				context,
				(void *)buf,
				cq_flags,
				len,
				0, /* data */
				0, /* tag */
				0, /* olen */
				op_error);
		if (event)
			psmx_cq_enqueue_event(ep->send_cq, event);
		else
			err = -FI_ENOMEM;
	}

	switch (am_cmd) {
	case PSMX_AM_REQ_ATOMIC_WRITE:
		if (ep->write_cntr)
			psmx_cntr_inc(ep->write_cntr);
		break;

	case PSMX_AM_REQ_ATOMIC_READWRITE:
	case PSMX_AM_REQ_ATOMIC_COMPWRITE:
		if (ep->read_cntr)
			psmx_cntr_inc(ep->read_cntr);
		break;
	}

	return err;
}
/*
 * Active-message handler for the atomics protocol.
 *
 * Target side (REQ_*): validate the MR, run the atomic in place, and reply
 * with the status (and fetched data for readwrite/compwrite).  Initiator
 * side (REP_*): copy any fetched data back and complete the request.
 *
 * The visible signature is the PSM1 (#else) variant; the PSM2 variant,
 * without the epaddr parameter, is selected by an #if outside this view
 * (hence the bare #endif below and the local epaddr re-declaration for
 * PSM_VERNO_MAJOR >= 2).
 */
int psmx_am_atomic_handler(psm_am_token_t token, psm_epaddr_t epaddr,
			   psm_amarg_t *args, int nargs, void *src,
			   uint32_t len)
#endif
{
	psm_amarg_t rep_args[8];
	int count;
	void *addr;
	uint64_t key;
	int datatype, op;
	int err = 0;
	int op_error = 0;
	struct psmx_am_request *req;
	struct psmx_cq_event *event;
	struct psmx_fid_mr *mr;
	struct psmx_fid_ep *target_ep;
	struct psmx_fid_cntr *cntr = NULL;
	struct psmx_fid_cntr *mr_cntr = NULL;
	void *tmp_buf;

#if (PSM_VERNO_MAJOR >= 2)
	/* PSM2 does not pass the source address; query it from the token. */
	psm_epaddr_t epaddr;
	psm_am_get_source(token, &epaddr);
#endif

	switch (args[0].u32w0 & PSMX_AM_OP_MASK) {
	case PSMX_AM_REQ_ATOMIC_WRITE:
		count = args[0].u32w1;
		addr = (void *)(uintptr_t)args[2].u64;
		key = args[3].u64;
		datatype = args[4].u32w0;
		op = args[4].u32w1;
		assert(len == fi_datatype_size(datatype) * count);

		mr = psmx_mr_get(psmx_active_fabric->active_domain, key);
		op_error = mr ?
			psmx_mr_validate(mr, (uint64_t)addr, len, FI_REMOTE_WRITE) :
			-FI_EINVAL;

		if (!op_error) {
			addr += mr->offset;
			psmx_atomic_do_write(addr, src, datatype, op, count);

			target_ep = mr->domain->atomics_ep;
			cntr = target_ep->remote_write_cntr;
			mr_cntr = mr->cntr;

			if (cntr)
				psmx_cntr_inc(cntr);

			if (mr_cntr && mr_cntr != cntr)
				psmx_cntr_inc(mr_cntr);
		}

		rep_args[0].u32w0 = PSMX_AM_REP_ATOMIC_WRITE;
		rep_args[0].u32w1 = op_error;
		rep_args[1].u64 = args[1].u64;
		err = psm_am_reply_short(token, PSMX_AM_ATOMIC_HANDLER,
					 rep_args, 2, NULL, 0, 0,
					 NULL, NULL );
		break;

	case PSMX_AM_REQ_ATOMIC_READWRITE:
		count = args[0].u32w1;
		addr = (void *)(uintptr_t)args[2].u64;
		key = args[3].u64;
		datatype = args[4].u32w0;
		op = args[4].u32w1;

		/* FI_ATOMIC_READ sends no payload; derive len locally. */
		if (op == FI_ATOMIC_READ)
			len = fi_datatype_size(datatype) * count;

		assert(len == fi_datatype_size(datatype) * count);

		mr = psmx_mr_get(psmx_active_fabric->active_domain, key);
		op_error = mr ?
			psmx_mr_validate(mr, (uint64_t)addr, len,
					 FI_REMOTE_READ|FI_REMOTE_WRITE) :
			-FI_EINVAL;

		if (!op_error) {
			addr += mr->offset;
			/* tmp_buf holds the fetched old value; ownership is
			 * handed to the reply completion callback. */
			tmp_buf = malloc(len);
			if (tmp_buf)
				psmx_atomic_do_readwrite(addr, src, tmp_buf,
							 datatype, op, count);
			else
				op_error = -FI_ENOMEM;

			target_ep = mr->domain->atomics_ep;
			if (op == FI_ATOMIC_READ) {
				cntr = target_ep->remote_read_cntr;
			} else {
				cntr = target_ep->remote_write_cntr;
				mr_cntr = mr->cntr;
			}

			if (cntr)
				psmx_cntr_inc(cntr);

			if (mr_cntr && mr_cntr != cntr)
				psmx_cntr_inc(mr_cntr);
		} else {
			tmp_buf = NULL;
		}

		rep_args[0].u32w0 = PSMX_AM_REP_ATOMIC_READWRITE;
		rep_args[0].u32w1 = op_error;
		rep_args[1].u64 = args[1].u64;
		err = psm_am_reply_short(token, PSMX_AM_ATOMIC_HANDLER,
					 rep_args, 2, tmp_buf,
					 (tmp_buf?len:0), 0,
					 psmx_am_atomic_completion, tmp_buf );
		break;

	case PSMX_AM_REQ_ATOMIC_COMPWRITE:
		count = args[0].u32w1;
		addr = (void *)(uintptr_t)args[2].u64;
		key = args[3].u64;
		datatype = args[4].u32w0;
		op = args[4].u32w1;
		/* Payload is operand followed by compare: each is len/2. */
		len /= 2;
		assert(len == fi_datatype_size(datatype) * count);

		mr = psmx_mr_get(psmx_active_fabric->active_domain, key);
		op_error = mr ?
			psmx_mr_validate(mr, (uint64_t)addr, len,
					 FI_REMOTE_READ|FI_REMOTE_WRITE) :
			-FI_EINVAL;

		if (!op_error) {
			addr += mr->offset;
			tmp_buf = malloc(len);
			if (tmp_buf)
				psmx_atomic_do_compwrite(addr, src, src + len,
							 tmp_buf, datatype,
							 op, count);
			else
				op_error = -FI_ENOMEM;

			target_ep = mr->domain->atomics_ep;
			cntr = target_ep->remote_write_cntr;
			mr_cntr = mr->cntr;

			if (cntr)
				psmx_cntr_inc(cntr);

			if (mr_cntr && mr_cntr != cntr)
				psmx_cntr_inc(mr_cntr);
		} else {
			tmp_buf = NULL;
		}

		/* Reply reuses the READWRITE opcode; the REP handler below
		 * treats READWRITE and COMPWRITE replies identically. */
		rep_args[0].u32w0 = PSMX_AM_REP_ATOMIC_READWRITE;
		rep_args[0].u32w1 = op_error;
		rep_args[1].u64 = args[1].u64;
		err = psm_am_reply_short(token, PSMX_AM_ATOMIC_HANDLER,
					 rep_args, 2, tmp_buf,
					 (tmp_buf?len:0), 0,
					 psmx_am_atomic_completion, tmp_buf );
		break;

	case PSMX_AM_REP_ATOMIC_WRITE:
		req = (struct psmx_am_request *)(uintptr_t)args[1].u64;
		op_error = (int)args[0].u32w1;
		assert(req->op == PSMX_AM_REQ_ATOMIC_WRITE);
		if (req->ep->send_cq && !req->no_event) {
			event = psmx_cq_create_event(
					req->ep->send_cq,
					req->atomic.context,
					req->atomic.buf,
					req->cq_flags,
					req->atomic.len,
					0, /* data */
					0, /* tag */
					0, /* olen */
					op_error);
			if (event)
				psmx_cq_enqueue_event(req->ep->send_cq, event);
			else
				err = -FI_ENOMEM;
		}

		if (req->ep->write_cntr)
			psmx_cntr_inc(req->ep->write_cntr);

		free(req);
		break;

	case PSMX_AM_REP_ATOMIC_READWRITE:
	case PSMX_AM_REP_ATOMIC_COMPWRITE:
		req = (struct psmx_am_request *)(uintptr_t)args[1].u64;
		op_error = (int)args[0].u32w1;
		assert(op_error || req->atomic.len == len);

		if (!op_error)
			memcpy(req->atomic.result, src, len);

		if (req->ep->send_cq && !req->no_event) {
			event = psmx_cq_create_event(
					req->ep->send_cq,
					req->atomic.context,
					req->atomic.buf,
					req->cq_flags,
					req->atomic.len,
					0, /* data */
					0, /* tag */
					0, /* olen */
					op_error);
			if (event)
				psmx_cq_enqueue_event(req->ep->send_cq, event);
			else
				err = -FI_ENOMEM;
		}

		if (req->ep->read_cntr)
			psmx_cntr_inc(req->ep->read_cntr);

		free(req);
		break;

	default:
		err = -FI_EINVAL;
	}

	return err;
}
/*
 * Common tagged-send implementation.
 *
 * Handles triggered operations (queued until a counter threshold),
 * FI_INJECT (blocking psm_mq_send, buffer reusable on return, with an
 * immediate CQ event), and the normal non-blocking isend path.
 *
 * The visible signature is a non-PSM2 variant selected by an #if outside
 * this view (hence the bare #endif below); the `data` identifier used in
 * the PSM2-only branches comes from the alternate signature.
 */
ssize_t _psmx_tagged_send(struct fid_ep *ep, const void *buf, size_t len,
			  void *desc, fi_addr_t dest_addr, uint64_t tag,
			  void *context, uint64_t flags)
#endif
{
	struct psmx_fid_ep *ep_priv;
	struct psmx_fid_av *av;
	psm_epaddr_t psm_epaddr;
	psm_mq_req_t psm_req;
	uint64_t psm_tag;
#if (PSM_VERNO_MAJOR >= 2)
	psm_mq_tag_t psm_tag2;
#endif
	struct fi_context *fi_context;
	int err;
	size_t idx;
	int no_completion = 0;
	struct psmx_cq_event *event;

	ep_priv = container_of(ep, struct psmx_fid_ep, ep);

	if (flags & FI_TRIGGER) {
		/* Defer the send until the trigger counter reaches its
		 * threshold; all parameters are captured in the trigger. */
		struct psmx_trigger *trigger;
		struct fi_triggered_context *ctxt = context;

		trigger = calloc(1, sizeof(*trigger));
		if (!trigger)
			return -FI_ENOMEM;

		trigger->op = PSMX_TRIGGERED_TSEND;
		trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
					     struct psmx_fid_cntr, cntr);
		trigger->threshold = ctxt->trigger.threshold.threshold;
		trigger->tsend.ep = ep;
		trigger->tsend.buf = buf;
		trigger->tsend.len = len;
		trigger->tsend.desc = desc;
		trigger->tsend.dest_addr = dest_addr;
		trigger->tsend.tag = tag;
		trigger->tsend.context = context;
		trigger->tsend.flags = flags & ~FI_TRIGGER;
#if (PSM_VERNO_MAJOR >= 2)
		trigger->tsend.data = data;
#endif
		psmx_cntr_add_trigger(trigger->cntr, trigger);
		return 0;
	}

	if (tag & ep_priv->domain->reserved_tag_bits) {
		FI_WARN(&psmx_prov, FI_LOG_EP_DATA, "using reserved tag bits."
			"tag=%lx. reserved_bits=%lx.\n", tag,
			ep_priv->domain->reserved_tag_bits);
	}

	/* Resolve dest_addr: AV table index or a raw psm_epaddr_t. */
	av = ep_priv->av;
	if (av && av->type == FI_AV_TABLE) {
		idx = (size_t)dest_addr;
		if (idx >= av->last)
			return -FI_EINVAL;

		psm_epaddr = av->psm_epaddrs[idx];
	} else {
		psm_epaddr = (psm_epaddr_t) dest_addr;
	}

	psm_tag = tag & (~ep_priv->domain->reserved_tag_bits);
#if (PSM_VERNO_MAJOR >= 2)
	PSMX_SET_TAG(psm_tag2, psm_tag, data);
#endif

	if ((flags & PSMX_NO_COMPLETION) ||
	    (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)))
		no_completion = 1;

	if (flags & FI_INJECT) {
		/* Inject: bounded size, blocking send, buffer reusable on
		 * return, completion (if any) generated right here. */
		if (len > PSMX_INJECT_SIZE)
			return -FI_EMSGSIZE;

#if (PSM_VERNO_MAJOR >= 2)
		err = psm_mq_send2(ep_priv->domain->psm_mq, psm_epaddr, 0,
				   &psm_tag2, buf, len);
#else
		err = psm_mq_send(ep_priv->domain->psm_mq, psm_epaddr, 0,
				  psm_tag, buf, len);
#endif
		if (err != PSM_OK)
			return psmx_errno(err);

		if (ep_priv->send_cntr)
			psmx_cntr_inc(ep_priv->send_cntr);

		if (ep_priv->send_cq && !no_completion) {
			event = psmx_cq_create_event(
					ep_priv->send_cq,
					context, (void *)buf, flags, len,
#if (PSM_VERNO_MAJOR >= 2)
					(uint64_t) data,
					psm_tag,
#else
					0 /* data */,
					psm_tag,
#endif
					0 /* olen */,
					0 /* err */);
			if (event)
				psmx_cq_enqueue_event(ep_priv->send_cq, event);
			else
				return -FI_ENOMEM;
		}

		return 0;
	}

	/* Non-blocking path: suppressed completions with no user context
	 * share the endpoint's scratch context. */
	if (no_completion && !context) {
		fi_context = &ep_priv->nocomp_send_context;
	} else {
		if (!context)
			return -FI_EINVAL;

		fi_context = context;
		PSMX_CTXT_TYPE(fi_context) = PSMX_TSEND_CONTEXT;
		PSMX_CTXT_USER(fi_context) = (void *)buf;
		PSMX_CTXT_EP(fi_context) = ep_priv;
	}

#if (PSM_VERNO_MAJOR >= 2)
	err = psm_mq_isend2(ep_priv->domain->psm_mq, psm_epaddr, 0,
			    &psm_tag2, buf, len, (void*)fi_context, &psm_req);
#else
	err = psm_mq_isend(ep_priv->domain->psm_mq, psm_epaddr, 0,
			   psm_tag, buf, len, (void*)fi_context, &psm_req);
#endif
	if (err != PSM_OK)
		return psmx_errno(err);

	/* Only user-supplied contexts track the PSM request for cancel. */
	if (fi_context == context)
		PSMX_CTXT_REQ(fi_context) = psm_req;

	return 0;
}
/*
 * Tagged peek, PSM2-capable variant.
 *
 * With PSM2, supports source-address matching and FI_CLAIM/FI_DISCARD via
 * psm_mq_improbe2 (the matched request is stashed in the caller's context
 * for a later claimed receive).  On PSM1, falls back to a tag-only iprobe
 * and rejects FI_CLAIM/FI_DISCARD.
 *
 * Returns 0 on a match (posting a CQ event with the message's tag, length
 * and source), -FI_ENOMSG on no match, or a translated PSM error.
 */
ssize_t _psmx_tagged_peek(struct fid_ep *ep, void *buf, size_t len,
			  void *desc, fi_addr_t src_addr, uint64_t tag,
			  uint64_t ignore, void *context, uint64_t flags)
{
	struct psmx_fid_ep *ep_priv;
#if (PSM_VERNO_MAJOR >= 2)
	psm_mq_status2_t psm_status2;
	psm_mq_tag_t psm_tag2, psm_tagsel2;
	psm_mq_req_t req;
	struct psmx_fid_av *av;
	size_t idx;
	psm_epaddr_t psm_src_addr;
#else
	psm_mq_status_t psm_status;
#endif
	uint64_t psm_tag, psm_tagsel;
	struct psmx_cq_event *event;
	int err;

	ep_priv = container_of(ep, struct psmx_fid_ep, ep);

	if (tag & ep_priv->domain->reserved_tag_bits) {
		FI_WARN(&psmx_prov, FI_LOG_EP_DATA, "using reserved tag bits."
			"tag=%lx. reserved_bits=%lx.\n", tag,
			ep_priv->domain->reserved_tag_bits);
	}

	/* Mask reserved bits from the tag; force them to match 0. */
	psm_tag = tag & (~ep_priv->domain->reserved_tag_bits);
	psm_tagsel = (~ignore) | ep_priv->domain->reserved_tag_bits;

#if (PSM_VERNO_MAJOR >= 2)
	/* Resolve the source filter: AV index, raw epaddr, or any source. */
	if (src_addr != FI_ADDR_UNSPEC) {
		av = ep_priv->av;
		if (av && av->type == FI_AV_TABLE) {
			idx = (size_t)src_addr;
			if (idx >= av->last)
				return -FI_EINVAL;
			psm_src_addr = av->psm_epaddrs[idx];
		} else {
			psm_src_addr = (psm_epaddr_t)src_addr;
		}
	} else {
		psm_src_addr = NULL;
	}

	PSMX_SET_TAG(psm_tag2, psm_tag, 0);
	PSMX_SET_TAG(psm_tagsel2, psm_tagsel, 0);

	/* improbe removes the match from the MQ so it can be claimed. */
	if (flags & (FI_CLAIM | FI_DISCARD))
		err = psm_mq_improbe2(ep_priv->domain->psm_mq,
				      psm_src_addr, &psm_tag2,
				      &psm_tagsel2, &req, &psm_status2);
	else
		err = psm_mq_iprobe2(ep_priv->domain->psm_mq,
				     psm_src_addr, &psm_tag2, &psm_tagsel2,
				     &psm_status2);
#else
	if (flags & (FI_CLAIM | FI_DISCARD))
		return -FI_EOPNOTSUPP;

	err = psm_mq_iprobe(ep_priv->domain->psm_mq, psm_tag, psm_tagsel,
			    &psm_status);
#endif
	switch (err) {
	case PSM_OK:
		if (ep_priv->recv_cq) {
#if (PSM_VERNO_MAJOR >= 2)
			/* Stash the matched request for the claimed recv. */
			if ((flags & FI_CLAIM) && context)
				PSMX_CTXT_REQ((struct fi_context *)context) = req;

			tag = psm_status2.msg_tag.tag0 |
			      (((uint64_t)psm_status2.msg_tag.tag1) << 32);
			len = psm_status2.msg_length;
			src_addr = (fi_addr_t)psm_status2.msg_peer;
#else
			/* PSM1 status has no source information. */
			tag = psm_status.msg_tag;
			len = psm_status.msg_length;
			src_addr = 0;
#endif
			event = psmx_cq_create_event(
					ep_priv->recv_cq,
					context,		/* op_context */
					NULL,			/* buf */
					flags|FI_RECV|FI_TAGGED,/* flags */
					len,			/* len */
					0,			/* data */
					tag,			/* tag */
					len,			/* olen */
					0);			/* err */
			if (!event)
				return -FI_ENOMEM;

			event->source = src_addr;
			psmx_cq_enqueue_event(ep_priv->recv_cq, event);
		}
		return 0;

	case PSM_MQ_NO_COMPLETIONS:
		return -FI_ENOMSG;

	default:
		return psmx_errno(err);
	}
}
/*
 * psmx_atomic_self - loopback path for atomics (earlier revision: no
 * aliasing guard between result and operand buffers, and counter updates
 * keyed on fi_op rather than precomputed CQ flags).
 *
 * Validates the target MR, performs the atomic in place, then generates
 * target-side CQ/counter updates and the initiator-side completion.
 * MR validation failure is reported via op_error in the local CQ event.
 */
static int psmx_atomic_self(int am_cmd,
			    struct psmx_fid_ep *ep,
			    const void *buf,
			    size_t count, void *desc,
			    const void *compare, void *compare_desc,
			    void *result, void *result_desc,
			    uint64_t addr, uint64_t key,
			    enum fi_datatype datatype,
			    enum fi_op op, void *context, uint64_t flags)
{
	struct psmx_fid_mr *mr;
	struct psmx_cq_event *event;
	struct psmx_fid_ep *target_ep;
	size_t len;
	int no_event;
	int err = 0;
	int op_error;
	int access;

	/* Fetching variants need read as well as write access. */
	if (am_cmd == PSMX_AM_REQ_ATOMIC_WRITE)
		access = FI_REMOTE_WRITE;
	else
		access = FI_REMOTE_READ | FI_REMOTE_WRITE;

	len = fi_datatype_size(datatype) * count;
	mr = psmx_mr_hash_get(key);
	op_error = mr ? psmx_mr_validate(mr, addr, len, access) : -EINVAL;

	if (op_error)
		goto gen_local_event;

	addr += mr->offset;

	switch (am_cmd) {
	case PSMX_AM_REQ_ATOMIC_WRITE:
		err = psmx_atomic_do_write((void *)addr, (void *)buf,
					   (int)datatype, (int)op, (int)count);
		break;

	case PSMX_AM_REQ_ATOMIC_READWRITE:
		err = psmx_atomic_do_readwrite((void *)addr, (void *)buf,
					       (void *)result, (int)datatype,
					       (int)op, (int)count);
		break;

	case PSMX_AM_REQ_ATOMIC_COMPWRITE:
		err = psmx_atomic_do_compwrite((void *)addr, (void *)buf,
					       (void *)compare, (void *)result,
					       (int)datatype, (int)op, (int)count);
		break;
	}

	/* Target-side completion only when memory was modified. */
	if (op != FI_ATOMIC_READ) {
		if (mr->cq) {
			event = psmx_cq_create_event(
					mr->cq,
					0, /* context */
					(void *)addr,
					0, /* flags */
					len,
					0, /* data */
					0, /* tag */
					0, /* olen */
					0 /* err */);
			if (event)
				psmx_cq_enqueue_event(mr->cq, event);
			else
				err = -ENOMEM;
		}
		if (mr->cntr)
			psmx_cntr_inc(mr->cntr);
	}

	/* Remote counters: pure write, pure read, or read-modify-write
	 * (which may need to bump both if they are distinct counters). */
	target_ep = mr->domain->atomics_ep;
	if (op == FI_ATOMIC_WRITE) {
		if (target_ep->remote_write_cntr)
			psmx_cntr_inc(target_ep->remote_write_cntr);
	} else if (op == FI_ATOMIC_READ) {
		if (target_ep->remote_read_cntr)
			psmx_cntr_inc(target_ep->remote_read_cntr);
	} else {
		if (target_ep->remote_write_cntr)
			psmx_cntr_inc(target_ep->remote_write_cntr);
		if (am_cmd != PSMX_AM_REQ_ATOMIC_WRITE &&
		    target_ep->remote_read_cntr &&
		    target_ep->remote_read_cntr != target_ep->remote_write_cntr)
			psmx_cntr_inc(target_ep->remote_read_cntr);
	}

gen_local_event:
	no_event = ((flags & FI_INJECT) ||
		    (ep->send_cq_event_flag && !(flags & FI_EVENT)));
	if (ep->send_cq && !no_event) {
		event = psmx_cq_create_event(
				ep->send_cq,
				context,
				(void *)buf,
				0, /* flags */
				len,
				0, /* data */
				0, /* tag */
				0, /* olen */
				op_error);
		if (event)
			psmx_cq_enqueue_event(ep->send_cq, event);
		else
			err = -ENOMEM;
	}

	switch (am_cmd) {
	case PSMX_AM_REQ_ATOMIC_WRITE:
		if (ep->write_cntr)
			psmx_cntr_inc(ep->write_cntr);
		break;

	case PSMX_AM_REQ_ATOMIC_READWRITE:
	case PSMX_AM_REQ_ATOMIC_COMPWRITE:
		if (ep->read_cntr)
			psmx_cntr_inc(ep->read_cntr);
		break;
	}

	return err;
}
/*
 * psmx_rma_self - loopback fast path for RMA (later revision: proper
 * FI_RMA completion flags, FI_REMOTE_CQ_DATA delivered to the target's
 * receive CQ, and PSMX_NO_COMPLETION/selective-completion semantics).
 *
 * Performs the write or read with memcpy and generates both target-side
 * and initiator-side completions/counters, mirroring the AM path.
 */
static ssize_t psmx_rma_self(int am_cmd,
			     struct psmx_fid_ep *ep,
			     void *buf, size_t len, void *desc,
			     uint64_t addr, uint64_t key,
			     void *context, uint64_t flags, uint64_t data)
{
	struct psmx_fid_mr *mr;
	struct psmx_cq_event *event;
	struct psmx_fid_cntr *cntr;
	struct psmx_fid_cntr *mr_cntr = NULL;
	struct psmx_fid_cq *cq = NULL;
	int no_event;
	int err = 0;
	int op_error = 0;
	int access;
	void *dst, *src;
	uint64_t cq_flags;

	switch (am_cmd) {
	case PSMX_AM_REQ_WRITE:
		access = FI_REMOTE_WRITE;
		cq_flags = FI_WRITE | FI_RMA;
		break;
	case PSMX_AM_REQ_READ:
		access = FI_REMOTE_READ;
		cq_flags = FI_READ | FI_RMA;
		break;
	default:
		return -FI_EINVAL;
	}

	mr = psmx_mr_get(psmx_active_fabric->active_domain, key);
	op_error = mr ? psmx_mr_validate(mr, addr, len, access) : -FI_EINVAL;

	if (!op_error) {
		addr += mr->offset;
		if (am_cmd == PSMX_AM_REQ_WRITE) {
			dst = (void *)addr;
			src = buf;
			cntr = mr->domain->rma_ep->remote_write_cntr;
			/* Immediate data makes the write visible on the
			 * target's receive CQ. */
			if (flags & FI_REMOTE_CQ_DATA)
				cq = mr->domain->rma_ep->recv_cq;
			if (mr->cntr != cntr)
				mr_cntr = mr->cntr;
		} else {
			dst = buf;
			src = (void *)addr;
			cntr = mr->domain->rma_ep->remote_read_cntr;
		}

		memcpy(dst, src, len);

		if (cq) {
			event = psmx_cq_create_event(
					cq,
					0, /* context */
					(void *)addr,
					FI_REMOTE_WRITE | FI_RMA | FI_REMOTE_CQ_DATA,
					len,
					data,
					0, /* tag */
					0, /* olen */
					0 /* err */);
			if (event)
				psmx_cq_enqueue_event(cq, event);
			else
				err = -FI_ENOMEM;
		}
		if (cntr)
			psmx_cntr_inc(cntr);
		if (mr_cntr)
			psmx_cntr_inc(mr_cntr);
	}

	no_event = (flags & PSMX_NO_COMPLETION) ||
		   (ep->send_selective_completion && !(flags & FI_COMPLETION));

	if (ep->send_cq && !no_event) {
		event = psmx_cq_create_event(
				ep->send_cq,
				context,
				(void *)buf,
				cq_flags,
				len,
				0, /* data */
				0, /* tag */
				0, /* olen */
				op_error);
		if (event)
			psmx_cq_enqueue_event(ep->send_cq, event);
		else
			err = -FI_ENOMEM;
	}

	switch (am_cmd) {
	case PSMX_AM_REQ_WRITE:
		if (ep->write_cntr)
			psmx_cntr_inc(ep->write_cntr);
		break;
	case PSMX_AM_REQ_READ:
		if (ep->read_cntr)
			psmx_cntr_inc(ep->read_cntr);
		break;
	}

	return err;
}
/*
 * _psmx_recv2 - post an untagged receive over the AM-based msg protocol.
 *
 * First checks the unexpected-message queue: if a matching message already
 * arrived, copy what fits, and if the sender is still streaming chunks,
 * send it our request pointer (or 0 when our buffer is already full) so it
 * can continue or stop.  Otherwise the request is queued until data
 * arrives.  A completed receive generates the CQ event/counter here.
 */
static ssize_t _psmx_recv2(struct fid_ep *ep, void *buf, size_t len,
			   void *desc, fi_addr_t src_addr,
			   void *context, uint64_t flags)
{
	psm_amarg_t args[8];
	struct psmx_fid_ep *ep_priv;
	struct psmx_fid_av *av;
	struct psmx_am_request *req;
	struct psmx_unexp *unexp;
	struct psmx_cq_event *event;
	int recv_done;
	int err = 0;
	size_t idx;

	ep_priv = container_of(ep, struct psmx_fid_ep, ep);

	/* Directed receive: resolve the source filter; otherwise match any. */
	if ((ep_priv->caps & FI_DIRECTED_RECV) && src_addr != FI_ADDR_UNSPEC) {
		av = ep_priv->av;
		if (av && av->type == FI_AV_TABLE) {
			idx = (size_t)src_addr;
			if (idx >= av->last)
				return -FI_EINVAL;

			src_addr = (fi_addr_t)av->psm_epaddrs[idx];
		}
	} else {
		src_addr = 0;
	}

	req = calloc(1, sizeof(*req));
	if (!req)
		return -FI_ENOMEM;

	req->op = PSMX_AM_REQ_SEND;
	req->recv.buf = (void *)buf;
	req->recv.len = len;
	req->recv.context = context;
	req->recv.src_addr = (void *)src_addr;
	req->ep = ep_priv;
	req->cq_flags = FI_RECV | FI_MSG;

	if (ep_priv->recv_selective_completion && !(flags & FI_COMPLETION))
		req->no_event = 1;

	unexp = psmx_am_search_and_dequeue_unexp(ep_priv->domain,
						 (const void *)src_addr);
	if (!unexp) {
		/* No data yet: park the request for the AM handler. */
		psmx_am_enqueue_recv(ep_priv->domain, req);
		return 0;
	}

	req->recv.len_received = MIN(req->recv.len, unexp->len_received);
	memcpy(req->recv.buf, unexp->buf, req->recv.len_received);

	recv_done = (req->recv.len_received >= req->recv.len);

	if (unexp->done) {
		recv_done = 1;
	} else {
		/* Sender has more chunks: hand it our request pointer, or 0
		 * if our buffer is already full so it stops sending. */
		args[0].u32w0 = PSMX_AM_REP_SEND;
		args[0].u32w1 = 0;
		args[1].u64 = unexp->sender_context;
		args[2].u64 = recv_done ? 0 : (uint64_t)(uintptr_t)req;
		err = psm_am_request_short(unexp->sender_addr,
					   PSMX_AM_MSG_HANDLER,
					   args, 3, NULL, 0, 0,
					   NULL, NULL );
	}

	free(unexp);

	if (recv_done) {
		if (req->ep->recv_cq && !req->no_event) {
			event = psmx_cq_create_event(
					req->ep->recv_cq,
					req->recv.context,
					req->recv.buf,
					req->cq_flags,
					req->recv.len_received,
					0, /* data */
					0, /* tag */
					req->recv.len - req->recv.len_received,
					0 /* err */);
			if (event)
				psmx_cq_enqueue_event(req->ep->recv_cq, event);
			else
				err = -FI_ENOMEM;
		}

		if (req->ep->recv_cntr)
			psmx_cntr_inc(req->ep->recv_cntr);

		free(req);
	}

	return err;
}
/*
 * Active-message handler for the untagged msg protocol.
 *
 * PSMX_AM_REQ_SEND (receiver side): first chunk (offset 0) either matches
 * a queued receive or is stashed on the unexpected queue; follow-up chunks
 * carry our request pointer in args[2].  The final chunk (EOM) completes
 * the receive and acks the sender.  PSMX_AM_REP_SEND (sender side): either
 * queues the next chunk batch or completes the send.
 *
 * The visible signature is the PSM1 (#else) variant; the PSM2 variant is
 * selected by an #if outside this view (hence the bare #endif and the
 * local epaddr re-declaration for PSM_VERNO_MAJOR >= 2).
 */
int psmx_am_msg_handler(psm_am_token_t token, psm_epaddr_t epaddr,
			psm_amarg_t *args, int nargs, void *src, uint32_t len)
#endif
{
	psm_amarg_t rep_args[8];
	struct psmx_am_request *req;
	struct psmx_cq_event *event;
	struct psmx_epaddr_context *epaddr_context;
	struct psmx_fid_domain *domain;
	int copy_len;
	uint64_t offset;
	int cmd, eom;
	int err = 0;
	int op_error = 0;
	struct psmx_unexp *unexp;

#if (PSM_VERNO_MAJOR >= 2)
	/* PSM2 does not pass the source address; query it from the token. */
	psm_epaddr_t epaddr;
	psm_am_get_source(token, &epaddr);
#endif

	epaddr_context = psm_epaddr_getctxt(epaddr);
	if (!epaddr_context) {
		FI_WARN(&psmx_prov, FI_LOG_EP_DATA,
			"NULL context for epaddr %p\n", epaddr);
		return -FI_EIO;
	}

	domain = epaddr_context->domain;
	cmd = args[0].u32w0 & PSMX_AM_OP_MASK;
	eom = args[0].u32w0 & PSMX_AM_EOM;

	switch (cmd) {
	case PSMX_AM_REQ_SEND:
		assert(len == args[0].u32w1);
		offset = args[3].u64;
		if (offset == 0) {
			/* this is the first packet */
			req = psmx_am_search_and_dequeue_recv(domain,
							      (const void *)epaddr);
			if (req) {
				copy_len = MIN(len, req->recv.len);
				/* NOTE(review): copies `len` bytes although
				 * only `copy_len` fit the posted buffer —
				 * looks like a possible overrun when
				 * len > recv.len; confirm and use copy_len. */
				memcpy(req->recv.buf, src, len);
				req->recv.len_received += copy_len;
			} else {
				/* No receive posted: stash on unexpected q. */
				unexp = malloc(sizeof(*unexp) + len);
				if (!unexp) {
					op_error = -FI_ENOSPC;
				} else {
					memcpy(unexp->buf, src, len);
					unexp->sender_addr = epaddr;
					unexp->sender_context = args[1].u64;
					unexp->len_received = len;
					unexp->done = !!eom;
					unexp->list_entry.next = NULL;
					psmx_am_enqueue_unexp(domain, unexp);

					if (!eom) {
						/* stop here. will reply when recv is posted */
						break;
					}
				}
			}

			if (!op_error && !eom) {
				/* reply w/ recv req to be used for following packets */
				rep_args[0].u32w0 = PSMX_AM_REP_SEND;
				rep_args[0].u32w1 = 0;
				rep_args[1].u64 = args[1].u64;
				rep_args[2].u64 = (uint64_t)(uintptr_t)req;
				err = psm_am_reply_short(token,
							 PSMX_AM_MSG_HANDLER,
							 rep_args, 3, NULL, 0, 0,
							 NULL, NULL );
			}
		} else {
			req = (struct psmx_am_request *)(uintptr_t)args[2].u64;
			if (req) {
				/* NOTE(review): bound looks suspicious —
				 * remaining space is recv.len - offset, yet
				 * this computes recv.len + offset; verify
				 * against the sender's chunking logic. */
				copy_len = MIN(req->recv.len + offset, len);
				memcpy(req->recv.buf + offset, src, copy_len);
				req->recv.len_received += copy_len;
			} else {
				FI_WARN(&psmx_prov, FI_LOG_EP_DATA,
					"NULL recv_req in follow-up packets.\n");
				op_error = -FI_ENOMSG;
			}
		}

		if (eom && req) {
			if (req->ep->recv_cq && !req->no_event) {
				event = psmx_cq_create_event(
						req->ep->recv_cq,
						req->recv.context,
						req->recv.buf,
						req->cq_flags,
						req->recv.len_received,
						0, /* data */
						0, /* tag */
						req->recv.len - req->recv.len_received,
						0 /* err */);
				if (event)
					psmx_cq_enqueue_event(req->ep->recv_cq, event);
				else
					err = -FI_ENOMEM;
			}

			if (req->ep->recv_cntr)
				psmx_cntr_inc(req->ep->recv_cntr);

			free(req);
		}

		if (eom || op_error) {
			rep_args[0].u32w0 = PSMX_AM_REP_SEND;
			rep_args[0].u32w1 = op_error;
			rep_args[1].u64 = args[1].u64;
			rep_args[2].u64 = 0; /* done */
			err = psm_am_reply_short(token, PSMX_AM_MSG_HANDLER,
						 rep_args, 3, NULL, 0, 0,
						 NULL, NULL );
		}
		break;

	case PSMX_AM_REP_SEND:
		req = (struct psmx_am_request *)(uintptr_t)args[1].u64;
		op_error = (int)args[0].u32w1;
		assert(req->op == PSMX_AM_REQ_SEND);
		if (args[2].u64) { /* more to send */
			req->send.peer_context = (void *)(uintptr_t)args[2].u64;
			/* psm_am_request_short() can't be called inside the handler.
			 * put the request into a queue and process it later. */
			psmx_am_enqueue_send(req->ep->domain, req);
		} else { /* done */
			if (req->ep->send_cq && !req->no_event) {
				event = psmx_cq_create_event(
						req->ep->send_cq,
						req->send.context,
						req->send.buf,
						req->cq_flags,
						req->send.len,
						0, /* data */
						0, /* tag */
						0, /* olen */
						op_error);
				if (event)
					psmx_cq_enqueue_event(req->ep->send_cq, event);
				else
					err = -FI_ENOMEM;
			}

			if (req->ep->send_cntr)
				psmx_cntr_inc(req->ep->send_cntr);

			free(req);
		}
		break;

	default:
		err = -FI_EINVAL;
	}

	return err;
}
/*
 * Active-message handler for PSMX_AM_MSG_HANDLER.
 *
 * Dispatches on the command encoded in args[0].u32w0:
 *   PSMX_AM_REQ_SEND - incoming (possibly multi-packet) send. The first
 *       packet (offset == 0) is matched against a posted recv or stashed as
 *       an unexpected message; follow-up packets carry the recv request
 *       pointer in args[2]. On EOM a completion event/counter is generated
 *       and a short reply is sent back to the sender.
 *   PSMX_AM_REP_SEND - reply for an outgoing send: either re-arms the next
 *       chunk (more to send) or completes the send request.
 *
 * args layout (REQ_SEND): [0] cmd|EOM + msg_len, [1] sender's request
 * cookie (echoed back in replies), [2] receiver's request pointer for
 * follow-up packets, [3] byte offset of this packet.
 *
 * Returns 0 on success or a negative errno-style code.
 */
int psmx_am_msg_handler(psm_am_token_t token, psm_epaddr_t epaddr,
			psm_amarg_t *args, int nargs, void *src, uint32_t len)
{
	psm_amarg_t rep_args[8];
	struct psmx_am_request *req;
	struct psmx_cq_event *event;
	struct psmx_epaddr_context *epaddr_context;
	struct psmx_fid_domain *domain;
	int msg_len;
	int copy_len;
	uint64_t offset;
	int cmd, eom;
	int err = 0;
	int op_error = 0;
	struct psmx_unexp *unexp;

	epaddr_context = psm_epaddr_getctxt(epaddr);
	if (!epaddr_context) {
		fprintf(stderr, "%s: NULL context for epaddr %p\n", __func__, epaddr);
		return -EIO;
	}
	domain = epaddr_context->domain;

	cmd = args[0].u32w0 & PSMX_AM_OP_MASK;
	eom = args[0].u32w0 & PSMX_AM_EOM;

	switch (cmd) {
	case PSMX_AM_REQ_SEND:
		msg_len = args[0].u32w1;
		offset = args[3].u64;
		assert(len == msg_len);
		if (offset == 0) {
			/* This is the first packet: try to match a posted recv. */
			req = psmx_am_search_and_dequeue_recv(domain, (const void *)epaddr);
			if (req) {
				copy_len = MIN(len, req->recv.len);
				/* BUG FIX: copy only what fits in the posted buffer.
				 * Previously copied "len" bytes, overrunning
				 * recv.buf when the message is larger than the
				 * buffer. */
				memcpy(req->recv.buf, src, copy_len);
				req->recv.len_received += copy_len;
			} else {
				/* No matching recv: stash as unexpected message. */
				unexp = malloc(sizeof(*unexp) + len);
				if (!unexp) {
					op_error = -ENOBUFS;
				} else {
					memcpy(unexp->buf, src, len);
					unexp->sender_addr = epaddr;
					unexp->sender_context = args[1].u64;
					unexp->len_received = len;
					unexp->done = !!eom;
					unexp->next = NULL;
					psmx_unexp_enqueue(unexp);
					if (!eom) {
						/* Stop here; the reply carrying the
						 * recv request will be sent when a
						 * recv is posted. */
						break;
					}
				}
			}
			if (!op_error && !eom) {
				/* Reply with the recv request pointer to be used
				 * by the sender for the following packets. */
				rep_args[0].u32w0 = PSMX_AM_REP_SEND;
				rep_args[0].u32w1 = 0;
				rep_args[1].u64 = args[1].u64;
				rep_args[2].u64 = (uint64_t)(uintptr_t)req;
				err = psm_am_reply_short(token, PSMX_AM_MSG_HANDLER,
							 rep_args, 3, NULL, 0, 0,
							 NULL, NULL);
			}
		} else {
			/* Follow-up packet: recv request pointer comes in args[2]. */
			req = (struct psmx_am_request *)(uintptr_t)args[2].u64;
			if (req) {
				/* BUG FIX: remaining buffer space is
				 * recv.len - offset, not recv.len + offset;
				 * the "+" let memcpy write past the end of
				 * recv.buf for oversized messages. */
				copy_len = MIN(req->recv.len - offset, len);
				memcpy(req->recv.buf + offset, src, copy_len);
				req->recv.len_received += copy_len;
			} else {
				fprintf(stderr, "%s: NULL recv_req in follow-up packets.\n", __func__);
				op_error = -EBADMSG;
			}
		}

		if (eom && req) {
			/* Last packet of a matched recv: generate completion. */
			if (req->ep->recv_cq && !req->no_event) {
				event = psmx_cq_create_event(
						req->ep->recv_cq,
						req->recv.context,
						req->recv.buf,
						0, /* flags */
						req->recv.len_received,
						0, /* data */
						0, /* tag */
						req->recv.len - req->recv.len_received,
						0 /* err */);
				if (event)
					psmx_cq_enqueue_event(req->ep->recv_cq, event);
				else
					err = -ENOMEM;
			}
			if (req->ep->recv_cntr)
				psmx_cntr_inc(req->ep->recv_cntr);
			free(req);
		}

		if (eom || op_error) {
			/* Final reply: args[2] == 0 tells the sender we're done. */
			rep_args[0].u32w0 = PSMX_AM_REP_SEND;
			rep_args[0].u32w1 = op_error;
			rep_args[1].u64 = args[1].u64;
			rep_args[2].u64 = 0; /* done */
			err = psm_am_reply_short(token, PSMX_AM_MSG_HANDLER,
						 rep_args, 3, NULL, 0, 0,
						 NULL, NULL);
		}
		break;

	case PSMX_AM_REP_SEND:
		req = (struct psmx_am_request *)(uintptr_t)args[1].u64;
		op_error = (int)args[0].u32w1;
		assert(req->op == PSMX_AM_REQ_SEND);
		if (args[2].u64) {
			/* More to send; args[2] is the peer's recv request. */
			req->send.peer_context = (void *)(uintptr_t)args[2].u64;
#if PSMX_AM_USE_SEND_QUEUE
			/* psm_am_request_short() can't be called inside the
			 * handler. Put the request into a queue and process
			 * it later. */
			psmx_am_enqueue_send(req->ep->domain, req);
			if (req->ep->domain->progress_thread)
				pthread_cond_signal(&req->ep->domain->progress_cond);
#else
			req->send.peer_ready = 1;
#endif
		} else {
			/* Done: generate send completion. */
			if (req->ep->send_cq && !req->no_event) {
				event = psmx_cq_create_event(
						req->ep->send_cq,
						req->send.context,
						req->send.buf,
						0, /* flags */
						req->send.len,
						0, /* data */
						0, /* tag */
						0, /* olen */
						op_error);
				if (event)
					psmx_cq_enqueue_event(req->ep->send_cq, event);
				else
					err = -ENOMEM;
			}
			if (req->ep->send_cntr)
				psmx_cntr_inc(req->ep->send_cntr);
			/* A queued request is owned by the send queue; mark it
			 * done instead of freeing it here. */
			if (req->state == PSMX_AM_STATE_QUEUED)
				req->state = PSMX_AM_STATE_DONE;
			else
				free(req);
		}
		break;

	default:
		err = -EINVAL;
	}

	return err;
}