/* The main idea of resizing SRQ algorithm - We create a SRQ with size = rd_num, but for efficient usage of resources the number of WQEs that we post = rd_curr_num < rd_num and this value is increased (by needs) in IBV_EVENT_SRQ_LIMIT_REACHED event handler (i.e. in this function), the event will thrown by device if number of WQEs in SRQ will be less than srq_limit */ static int btl_openib_async_srq_limit_event(struct ibv_srq* srq) { int qp, rc = OPAL_SUCCESS; mca_btl_openib_module_t *openib_btl = NULL; opal_mutex_t *lock = &mca_btl_openib_component.srq_manager.lock; opal_hash_table_t *srq_addr_table = &mca_btl_openib_component.srq_manager.srq_addr_table; opal_mutex_lock(lock); if (OPAL_SUCCESS != opal_hash_table_get_value_ptr(srq_addr_table, &srq, sizeof(struct ibv_srq*), (void*) &openib_btl)) { /* If there isn't any element with the key in the table => we assume that SRQ was destroyed and don't serve the event */ goto srq_limit_event_exit; } for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { if (!BTL_OPENIB_QP_TYPE_PP(qp)) { if(openib_btl->qps[qp].u.srq_qp.srq == srq) { break; } } } if(qp >= mca_btl_openib_component.num_qps) { BTL_ERROR(("Open MPI tried to access a shared receive queue (SRQ) on the device %s that was not found. This should not happen, and is a fatal error. 
Your MPI job will now abort.\n", ibv_get_device_name(openib_btl->device->ib_dev))); rc = OPAL_ERROR; goto srq_limit_event_exit; } /* dynamically re-size the SRQ to be larger */ openib_btl->qps[qp].u.srq_qp.rd_curr_num <<= 1; if(openib_btl->qps[qp].u.srq_qp.rd_curr_num >= mca_btl_openib_component.qp_infos[qp].rd_num) { openib_btl->qps[qp].u.srq_qp.rd_curr_num = mca_btl_openib_component.qp_infos[qp].rd_num; openib_btl->qps[qp].u.srq_qp.rd_low_local = mca_btl_openib_component.qp_infos[qp].rd_low; openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false; goto srq_limit_event_exit; } openib_btl->qps[qp].u.srq_qp.rd_low_local <<= 1; openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = true; srq_limit_event_exit: opal_mutex_unlock(lock); return rc; }
/* Present a new value for `key` to every subscriber registered on `object`.
 * Each subscriber callback may accept the value, ignore it, or override it
 * with its own value; since multiple subscribers may set values, only the
 * last setting is kept as the returned value.
 *
 * NOTE(review): this function appears truncated in this chunk — the three
 * closing braces below close the FOREACH loop, `if (list)` and `if (table)`,
 * leaving the function body open (no `return updated_value;` and no closing
 * brace are visible before the next function starts). Confirm against the
 * full file. */
static char* opal_infosubscribe_inform_subscribers(opal_infosubscriber_t *object,
                                                   char *key, char *new_value)
{
    opal_hash_table_t *table = &object->s_subscriber_table;
    opal_list_t *list = NULL;
    opal_callback_list_item_t *item;
    char *updated_value = NULL;
    /*
     * Present the new value to each subscriber. They can decide to accept it,
     * ignore it, or over-ride it with their own value (like ignore, but they
     * specify what value they want it to have).
     *
     * Since multiple subscribers could set values, only the last setting is
     * kept as the returned value.
     */
    if (table) {
        /* Look up the subscriber list for this key; `list` stays NULL when
           no subscriber has registered for it. */
        opal_hash_table_get_value_ptr(table, key, strlen(key), (void**) &list);
        if (list) {
            /* Start from the caller's value and let each callback replace it. */
            updated_value = new_value;
            OPAL_LIST_FOREACH(item, list, opal_callback_list_item_t) {
                updated_value = item->callback(object, key, updated_value);
            }
        }
    }
/* This func. opens XRC domain */ int mca_btl_openib_open_xrc_domain(struct mca_btl_openib_device_t *device) { int len; char *xrc_file_name; const char *dev_name; #if OPAL_HAVE_CONNECTX_XRC_DOMAINS struct ibv_xrcd_init_attr xrcd_attr; #endif dev_name = ibv_get_device_name(device->ib_dev); len = asprintf(&xrc_file_name, "%s"OPAL_PATH_SEP"openib_xrc_domain_%s", opal_process_info.job_session_dir, dev_name); if (0 > len) { BTL_ERROR(("Failed to allocate memomry for XRC file name: %s\n", strerror(errno))); return OPAL_ERROR; } device->xrc_fd = open(xrc_file_name, O_CREAT, S_IWUSR|S_IRUSR); if (0 > device->xrc_fd) { BTL_ERROR(("Failed to open XRC domain file %s, errno says %s\n", xrc_file_name,strerror(errno))); free(xrc_file_name); return OPAL_ERROR; } #if OPAL_HAVE_CONNECTX_XRC_DOMAINS memset(&xrcd_attr, 0, sizeof xrcd_attr); xrcd_attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS; xrcd_attr.fd = device->xrc_fd; xrcd_attr.oflags = O_CREAT; device->xrcd = ibv_open_xrcd(device->ib_dev_context, &xrcd_attr); if (NULL == device->xrcd) { #else device->xrc_domain = ibv_open_xrc_domain(device->ib_dev_context, device->xrc_fd, O_CREAT); if (NULL == device->xrc_domain) { #endif BTL_ERROR(("Failed to open XRC domain\n")); close(device->xrc_fd); free(xrc_file_name); return OPAL_ERROR; } return OPAL_SUCCESS; } /* This func. 
closes XRC domain */ int mca_btl_openib_close_xrc_domain(struct mca_btl_openib_device_t *device) { #if OPAL_HAVE_CONNECTX_XRC_DOMAINS if (NULL == device->xrcd) { #else if (NULL == device->xrc_domain) { #endif /* No XRC domain, just exit */ return OPAL_SUCCESS; } #if OPAL_HAVE_CONNECTX_XRC_DOMAINS if (ibv_close_xrcd(device->xrcd)) { #else if (ibv_close_xrc_domain(device->xrc_domain)) { #endif BTL_ERROR(("Failed to close XRC domain, errno %d says %s\n", device->xrc_fd, strerror(errno))); return OPAL_ERROR; } /* do we need to check exit status */ if (close(device->xrc_fd)) { BTL_ERROR(("Failed to close XRC file descriptor, errno %d says %s\n", device->xrc_fd, strerror(errno))); return OPAL_ERROR; } return OPAL_SUCCESS; } static void ib_address_constructor(ib_address_t *ib_addr) { ib_addr->key = NULL; ib_addr->subnet_id = 0; ib_addr->lid = 0; ib_addr->status = MCA_BTL_IB_ADDR_CLOSED; ib_addr->qp = NULL; OBJ_CONSTRUCT(&ib_addr->addr_lock, opal_mutex_t); OBJ_CONSTRUCT(&ib_addr->pending_ep, opal_list_t); } static void ib_address_destructor(ib_address_t *ib_addr) { if (NULL != ib_addr->key) { free(ib_addr->key); } OBJ_DESTRUCT(&ib_addr->addr_lock); OBJ_DESTRUCT(&ib_addr->pending_ep); } static int ib_address_init(ib_address_t *ib_addr, uint16_t lid, uint64_t s_id, opal_jobid_t ep_jobid) { ib_addr->key = malloc(SIZE_OF3(s_id, lid, ep_jobid)); if (NULL == ib_addr->key) { BTL_ERROR(("Failed to allocate memory for key\n")); return OPAL_ERROR; } memset(ib_addr->key, 0, SIZE_OF3(s_id, lid, ep_jobid)); /* creating the key = lid + s_id + ep_jobid */ memcpy(ib_addr->key, &lid, sizeof(lid)); memcpy((void*)((char*)ib_addr->key + sizeof(lid)), &s_id, sizeof(s_id)); memcpy((void*)((char*)ib_addr->key + sizeof(lid) + sizeof(s_id)), &ep_jobid, sizeof(ep_jobid)); /* caching lid and subnet id */ ib_addr->subnet_id = s_id; ib_addr->lid = lid; return OPAL_SUCCESS; } /* Create new entry in hash table for subnet_id and lid, * update the endpoint pointer. 
* Before call to this function you need to protect with */ int mca_btl_openib_ib_address_add_new (uint16_t lid, uint64_t s_id, opal_jobid_t ep_jobid, mca_btl_openib_endpoint_t *ep) { void *tmp; int ret = OPAL_SUCCESS; struct ib_address_t *ib_addr = OBJ_NEW(ib_address_t); ret = ib_address_init(ib_addr, lid, s_id, ep_jobid); if (OPAL_SUCCESS != ret ) { BTL_ERROR(("XRC Internal error. Failed to init ib_addr\n")); OBJ_DESTRUCT(ib_addr); return ret; } /* is it already in the table ?*/ OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock); if (OPAL_SUCCESS != opal_hash_table_get_value_ptr(&mca_btl_openib_component.ib_addr_table, ib_addr->key, SIZE_OF3(s_id, lid, ep_jobid), &tmp)) { /* It is new one, lets put it on the table */ ret = opal_hash_table_set_value_ptr(&mca_btl_openib_component.ib_addr_table, ib_addr->key, SIZE_OF3(s_id, lid, ep_jobid), (void*)ib_addr); if (OPAL_SUCCESS != ret) { BTL_ERROR(("XRC Internal error." " Failed to add element to mca_btl_openib_component.ib_addr_table\n")); OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock); OBJ_DESTRUCT(ib_addr); return ret; } /* update the endpoint with pointer to ib address */ ep->ib_addr = ib_addr; } else { /* so we have this one in the table, just add the pointer to the endpoint */ ep->ib_addr = (ib_address_t *)tmp; assert(lid == ep->ib_addr->lid && s_id == ep->ib_addr->subnet_id); OBJ_DESTRUCT(ib_addr); } OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock); return ret; }