void psmx2_trx_ctxt_free(struct psmx2_trx_ctxt *trx_ctxt) { int err; if (!trx_ctxt) return; if (trx_ctxt->am_initialized) psmx2_am_fini(trx_ctxt); #if 0 /* AM messages could arrive after MQ is finalized, causing segfault * when trying to dereference the MQ pointer. There is no mechanism * to properly shutdown AM. The workaround is to keep MQ valid. */ psm2_mq_finalize(trx_ctxt->psm2_mq); #endif /* workaround for: * Assertion failure at psm2_ep.c:1059: ep->mctxt_master == ep */ sleep(psmx2_env.delay); if (psmx2_env.timeout) err = psm2_ep_close(trx_ctxt->psm2_ep, PSM2_EP_CLOSE_GRACEFUL, (int64_t) psmx2_env.timeout * 1000000000LL); else err = PSM2_EP_CLOSE_TIMEOUT; if (err != PSM2_OK) psm2_ep_close(trx_ctxt->psm2_ep, PSM2_EP_CLOSE_FORCE, 0); fastlock_destroy(&trx_ctxt->poll_lock); free(trx_ctxt); }
int ompi_mtl_psm2_finalize(struct mca_mtl_base_module_t* mtl) { psm2_error_t err; opal_progress_unregister(ompi_mtl_psm2_progress); /* free resources */ err = psm2_mq_finalize(ompi_mtl_psm2.mq); if (err) { opal_output(0, "Error in psm2_mq_finalize (error %s)\n", psm2_error_get_string(err)); return OMPI_ERROR; } err = psm2_ep_close(ompi_mtl_psm2.ep, PSM2_EP_CLOSE_GRACEFUL, 1*1e9); if (err) { opal_output(0, "Error in psm2_ep_close (error %s)\n", psm2_error_get_string(err)); return OMPI_ERROR; } err = psm2_finalize(); if (err) { opal_output(0, "Error in psm2_finalize (error %s)\n", psm2_error_get_string(err)); return OMPI_ERROR; } return OMPI_SUCCESS; }
void psmx2_domain_release(struct psmx2_fid_domain *domain) { int err; FI_INFO(&psmx2_prov, FI_LOG_DOMAIN, "refcnt=%d\n", domain->refcnt); if (--domain->refcnt > 0) return; if (domain->progress_thread_enabled) psmx2_domain_stop_progress(domain); psmx2_am_fini(domain); fastlock_destroy(&domain->poll_lock); fastlock_destroy(&domain->vl_lock); rbtDelete(domain->mr_map); fastlock_destroy(&domain->mr_lock); #if 0 /* AM messages could arrive after MQ is finalized, causing segfault * when trying to dereference the MQ pointer. There is no mechanism * to properly shutdown AM. The workaround is to keep MQ valid. */ psm2_mq_finalize(domain->psm2_mq); #endif /* workaround for: * Assertion failure at psm2_ep.c:1059: ep->mctxt_master == ep */ sleep(psmx2_env.delay); if (psmx2_env.timeout) err = psm2_ep_close(domain->psm2_ep, PSM2_EP_CLOSE_GRACEFUL, (int64_t) psmx2_env.timeout * 1000000000LL); else err = PSM2_EP_CLOSE_TIMEOUT; if (err != PSM2_OK) psm2_ep_close(domain->psm2_ep, PSM2_EP_CLOSE_FORCE, 0); domain->fabric->active_domain = NULL; psmx2_fabric_release(domain->fabric); free(domain); }
struct psmx2_trx_ctxt *psmx2_trx_ctxt_alloc(struct psmx2_fid_domain *domain, struct psmx2_src_name *src_addr, int sep_ctxt_idx) { struct psmx2_trx_ctxt *trx_ctxt; struct psm2_ep_open_opts opts; int should_retry = 0; int err; trx_ctxt = calloc(1, sizeof(*trx_ctxt)); if (!trx_ctxt) { FI_WARN(&psmx2_prov, FI_LOG_CORE, "failed to allocate trx_ctxt.\n"); return NULL; } psm2_ep_open_opts_get_defaults(&opts); FI_INFO(&psmx2_prov, FI_LOG_CORE, "uuid: %s\n", psmx2_uuid_to_string(domain->fabric->uuid)); if (src_addr) { opts.unit = src_addr->unit; opts.port = src_addr->port; FI_INFO(&psmx2_prov, FI_LOG_CORE, "ep_open_opts: unit=%d port=%u\n", opts.unit, opts.port); } if (opts.unit < 0 && sep_ctxt_idx >= 0) { should_retry = 1; opts.unit = sep_ctxt_idx % psmx2_env.num_devunits; FI_INFO(&psmx2_prov, FI_LOG_CORE, "sep %d: ep_open_opts: unit=%d\n", sep_ctxt_idx, opts.unit); } err = psm2_ep_open(domain->fabric->uuid, &opts, &trx_ctxt->psm2_ep, &trx_ctxt->psm2_epid); if (err != PSM2_OK) { FI_WARN(&psmx2_prov, FI_LOG_CORE, "psm2_ep_open returns %d, errno=%d\n", err, errno); if (!should_retry) { err = psmx2_errno(err); goto err_out; } /* When round-robin fails, retry w/o explicit assignment */ opts.unit = -1; err = psm2_ep_open(domain->fabric->uuid, &opts, &trx_ctxt->psm2_ep, &trx_ctxt->psm2_epid); if (err != PSM2_OK) { FI_WARN(&psmx2_prov, FI_LOG_CORE, "psm2_ep_open returns %d, errno=%d\n", err, errno); err = psmx2_errno(err); goto err_out; } } FI_INFO(&psmx2_prov, FI_LOG_CORE, "epid: 0x%016lx\n", trx_ctxt->psm2_epid); err = psm2_mq_init(trx_ctxt->psm2_ep, PSM2_MQ_ORDERMASK_ALL, NULL, 0, &trx_ctxt->psm2_mq); if (err != PSM2_OK) { FI_WARN(&psmx2_prov, FI_LOG_CORE, "psm2_mq_init returns %d, errno=%d\n", err, errno); err = psmx2_errno(err); goto err_out_close_ep; } fastlock_init(&trx_ctxt->poll_lock); fastlock_init(&trx_ctxt->rma_queue.lock); fastlock_init(&trx_ctxt->trigger_queue.lock); slist_init(&trx_ctxt->rma_queue.list); slist_init(&trx_ctxt->trigger_queue.list); return trx_ctxt; err_out_close_ep: if (psm2_ep_close(trx_ctxt->psm2_ep, PSM2_EP_CLOSE_GRACEFUL, (int64_t) psmx2_env.timeout * 1000000000LL) != PSM2_OK) psm2_ep_close(trx_ctxt->psm2_ep, PSM2_EP_CLOSE_FORCE, 0); err_out: free(trx_ctxt); return NULL; }
int psmx2_domain_open(struct fid_fabric *fabric, struct fi_info *info, struct fid_domain **domain, void *context) { struct psmx2_fid_fabric *fabric_priv; struct psmx2_fid_domain *domain_priv; struct psm2_ep_open_opts opts; int err; FI_INFO(&psmx2_prov, FI_LOG_DOMAIN, "\n"); fabric_priv = container_of(fabric, struct psmx2_fid_fabric, fabric); psmx2_fabric_acquire(fabric_priv); if (fabric_priv->active_domain) { psmx2_domain_acquire(fabric_priv->active_domain); *domain = &fabric_priv->active_domain->domain; return 0; } if (!info->domain_attr->name || strcmp(info->domain_attr->name, PSMX2_DOMAIN_NAME)) { err = -FI_EINVAL; goto err_out; } domain_priv = (struct psmx2_fid_domain *) calloc(1, sizeof *domain_priv); if (!domain_priv) { err = -FI_ENOMEM; goto err_out; } domain_priv->domain.fid.fclass = FI_CLASS_DOMAIN; domain_priv->domain.fid.context = context; domain_priv->domain.fid.ops = &psmx2_fi_ops; domain_priv->domain.ops = &psmx2_domain_ops; domain_priv->domain.mr = &psmx2_mr_ops; domain_priv->mr_mode = info->domain_attr->mr_mode; domain_priv->mode = info->mode; domain_priv->caps = info->caps; domain_priv->fabric = fabric_priv; domain_priv->progress_thread_enabled = (info->domain_attr->data_progress == FI_PROGRESS_AUTO); psm2_ep_open_opts_get_defaults(&opts); FI_INFO(&psmx2_prov, FI_LOG_CORE, "uuid: %s\n", psmx2_uuid_to_string(fabric_priv->uuid)); err = psm2_ep_open(fabric_priv->uuid, &opts, &domain_priv->psm2_ep, &domain_priv->psm2_epid); if (err != PSM2_OK) { FI_WARN(&psmx2_prov, FI_LOG_CORE, "psm2_ep_open returns %d, errno=%d\n", err, errno); err = psmx2_errno(err); goto err_out_free_domain; } FI_INFO(&psmx2_prov, FI_LOG_CORE, "epid: 0x%016lx\n", domain_priv->psm2_epid); err = psm2_mq_init(domain_priv->psm2_ep, PSM2_MQ_ORDERMASK_ALL, NULL, 0, &domain_priv->psm2_mq); if (err != PSM2_OK) { FI_WARN(&psmx2_prov, FI_LOG_CORE, "psm2_mq_init returns %d, errno=%d\n", err, errno); err = psmx2_errno(err); goto err_out_close_ep; } err = fastlock_init(&domain_priv->mr_lock); if (err) { FI_WARN(&psmx2_prov, FI_LOG_CORE, "fastlock_init(mr_lock) returns %d\n", err); goto err_out_finalize_mq; } domain_priv->mr_map = rbtNew(&psmx2_key_compare); if (!domain_priv->mr_map) { FI_WARN(&psmx2_prov, FI_LOG_CORE, "rbtNew failed\n"); goto err_out_destroy_mr_lock; } domain_priv->mr_reserved_key = 1; err = fastlock_init(&domain_priv->vl_lock); if (err) { FI_WARN(&psmx2_prov, FI_LOG_CORE, "fastlock_init(vl_lock) returns %d\n", err); goto err_out_delete_mr_map; } memset(domain_priv->vl_map, 0, sizeof(domain_priv->vl_map)); domain_priv->vl_alloc = 0; err = fastlock_init(&domain_priv->poll_lock); if (err) { FI_WARN(&psmx2_prov, FI_LOG_CORE, "fastlock_init(poll_lock) returns %d\n", err); goto err_out_destroy_vl_lock; } /* Set active domain before psmx2_domain_enable_ep() installs the * AM handlers to ensure that psmx2_active_fabric->active_domain * is always non-NULL inside the handlers. Notice that the vlaue * active_domain becomes NULL again only when the domain is closed. * At that time the AM handlers are gone with the PSM endpoint. */ fabric_priv->active_domain = domain_priv; if (psmx2_domain_enable_ep(domain_priv, NULL) < 0) goto err_out_reset_active_domain; if (domain_priv->progress_thread_enabled) psmx2_domain_start_progress(domain_priv); domain_priv->refcnt = 1; *domain = &domain_priv->domain; return 0; err_out_reset_active_domain: fabric_priv->active_domain = NULL; fastlock_destroy(&domain_priv->poll_lock); err_out_destroy_vl_lock: fastlock_destroy(&domain_priv->vl_lock); err_out_delete_mr_map: rbtDelete(domain_priv->mr_map); err_out_destroy_mr_lock: fastlock_destroy(&domain_priv->mr_lock); err_out_finalize_mq: psm2_mq_finalize(domain_priv->psm2_mq); err_out_close_ep: if (psm2_ep_close(domain_priv->psm2_ep, PSM2_EP_CLOSE_GRACEFUL, (int64_t) psmx2_env.timeout * 1000000000LL) != PSM2_OK) psm2_ep_close(domain_priv->psm2_ep, PSM2_EP_CLOSE_FORCE, 0); err_out_free_domain: free(domain_priv); err_out: psmx2_fabric_release(fabric_priv); return err; }