コード例 #1
0
/**
 * Handle one asynchronous event for the device whose async fd is ready.
 *
 * The device is located by comparing the async_fd of every BTL's device
 * context with devices_poll->async_pollfd[index].fd.  A single event is then
 * fetched with ibv_get_async_event(), dispatched on its type and acknowledged
 * with ibv_ack_async_event().
 *
 * @param devices_poll        poll set that holds the async file descriptors
 * @param index               slot of devices_poll->async_pollfd to service
 * @param ignore_qp_err_list  list of mca_btl_openib_qp_list items; an
 *                            IBV_EVENT_QP_FATAL for a QP found in this list
 *                            is not reported
 *
 * @return OPAL_SUCCESS if the event was handled, or if no event was pending
 *         (errno == EWOULDBLOCK); OPAL_ERROR if no device owns the fd, the
 *         event could not be fetched, or the SRQ-limit handler failed.
 */
static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_poll, int index,
                                    opal_list_t *ignore_qp_err_list)
{
    int j;
    int rc = OPAL_SUCCESS;
    mca_btl_openib_device_t *device = NULL;
    struct ibv_async_event event;
    bool xrc_event = false;
    int event_type;

    /* Find the device that owns the file descriptor being serviced */
    for (j = 0; j < mca_btl_openib_component.ib_num_btls; j++) {
        if (mca_btl_openib_component.openib_btls[j]->device->ib_dev_context->async_fd ==
                devices_poll->async_pollfd[index].fd) {
            device = mca_btl_openib_component.openib_btls[j]->device;
            break;
        }
    }

    if (NULL == device) {
        /* Failed to locate the device!  This should never happen... */
        BTL_ERROR(("Failed to find device with FD %d.  "
                   "Fatal error, stoping asynch event thread",
                   devices_poll->async_pollfd[index].fd));
        return OPAL_ERROR;
    }

    if (ibv_get_async_event((struct ibv_context *)device->ib_dev_context, &event) < 0) {
        if (EWOULDBLOCK == errno) {
            /* No event found: it was already consumed by somebody else */
            return OPAL_SUCCESS;
        }
        BTL_ERROR(("Failed to get async event"));
        return OPAL_ERROR;
    }

    event_type = event.event_type;
#if HAVE_XRC
    /* Is it an XRC event? */
    if (IBV_XRC_QP_EVENT_FLAG & event.event_type) {
        xrc_event = true;
        /* Clear the XRC bit and handle the event like a regular one */
        event_type ^= IBV_XRC_QP_EVENT_FLAG;
    }
#endif

    switch (event_type) {
        case IBV_EVENT_PATH_MIG:
            BTL_ERROR(("Alternative path migration event reported"));
            if (APM_ENABLED) {
                BTL_ERROR(("Trying to find additional path..."));
                if (!xrc_event) {
                    mca_btl_openib_load_apm(event.element.qp,
                            qp2endpoint(event.element.qp, device));
                }
#if HAVE_XRC
                else {
                    mca_btl_openib_load_apm_xrc_rcv(event.element.xrc_qp_num,
                            xrc_qp2endpoint(event.element.xrc_qp_num, device));
                }
#endif
            }
            break;

        case IBV_EVENT_DEVICE_FATAL:
            /* Set the flag to fatal */
            device->got_fatal_event = true;
            /* It is not critical to protect the counter */
            OPAL_THREAD_ADD32(&mca_btl_openib_component.error_counter, 1);
            /* fallthrough */
        case IBV_EVENT_CQ_ERR:
        case IBV_EVENT_QP_FATAL:
            if (IBV_EVENT_QP_FATAL == event_type) {
                opal_list_item_t *item;
                bool in_ignore_list = false;

                BTL_VERBOSE(("QP is in err state %p", (void *)event.element.qp));

                /* Look through the ignore list */
                for (item = opal_list_get_first(ignore_qp_err_list);
                     item != opal_list_get_end(ignore_qp_err_list);
                     item = opal_list_get_next(item)) {
                    mca_btl_openib_qp_list *qp_item = (mca_btl_openib_qp_list *)item;

                    if (qp_item->qp == event.element.qp) {
                        BTL_VERBOSE(("QP %p is in error ignore list",
                                     (void *)event.element.qp));
                        in_ignore_list = true;
                        break;
                    }
                }
                if (in_ignore_list) {
                    /* Leave the switch without reporting; the event is
                     * still acknowledged below. */
                    break;
                }
            }
            /* fallthrough */
        case IBV_EVENT_QP_REQ_ERR:
        case IBV_EVENT_QP_ACCESS_ERR:
        case IBV_EVENT_PATH_MIG_ERR:
        case IBV_EVENT_SRQ_ERR:
            opal_show_help("help-mpi-btl-openib.txt", "of error event",
                true, opal_proc_local_get()->proc_hostname, (int)getpid(),
                event_type,
                openib_event_to_str((enum ibv_event_type)event_type),
                xrc_event ? "true" : "false");
            break;

        case IBV_EVENT_PORT_ERR:
            opal_show_help("help-mpi-btl-openib.txt", "of error event",
                true, opal_proc_local_get()->proc_hostname, (int)getpid(),
                event_type,
                openib_event_to_str((enum ibv_event_type)event_type),
                xrc_event ? "true" : "false");
            /* Set the flag to indicate port error */
            device->got_port_event = true;
            OPAL_THREAD_ADD32(&mca_btl_openib_component.error_counter, 1);
            break;

        case IBV_EVENT_COMM_EST:
        case IBV_EVENT_PORT_ACTIVE:
        case IBV_EVENT_SQ_DRAINED:
        case IBV_EVENT_LID_CHANGE:
        case IBV_EVENT_PKEY_CHANGE:
        case IBV_EVENT_SM_CHANGE:
        case IBV_EVENT_QP_LAST_WQE_REACHED:
#if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER
        case IBV_EVENT_CLIENT_REREGISTER:
#endif
            /* Nothing to do for these events */
            break;

        /* The event is signaled when the number of preposted receive WQEs
         * drops under the predefined threshold - srq_limit */
        case IBV_EVENT_SRQ_LIMIT_REACHED:
            if (OPAL_SUCCESS !=
                    btl_openib_async_srq_limit_event(event.element.srq)) {
                /* Do not return here: the event fetched above must still
                 * be acknowledged before we report the failure. */
                rc = OPAL_ERROR;
            }
            break;

        default:
            opal_show_help("help-mpi-btl-openib.txt", "of unknown event",
                    true, opal_proc_local_get()->proc_hostname, (int)getpid(),
                    event_type, xrc_event ? "true" : "false");
            break;
    }

    /* Every event returned by ibv_get_async_event() has to be acknowledged */
    ibv_ack_async_event(&event);

    return rc;
}
コード例 #2
0
/**
 * Create the XRC receive QP of an endpoint and bring it to the RTR state.
 *
 * The QP is created with ibv_create_xrc_rcv_qp() in the device's XRC domain
 * (its number is written to endpoint->xrc_recv_qp_num) and then moved to
 * IBV_QPS_INIT and IBV_QPS_RTR with ibv_modify_xrc_rcv_qp().
 *
 * @param endpoint  endpoint that receives the new XRC receive QP number
 * @param rem_info  remote side information (MTU, LID, QP number and PSN)
 *                  used to program the RTR transition
 *
 * @return OMPI_SUCCESS on success, OMPI_ERROR if the QP cannot be created or
 *         modified (or, with ENABLE_DYNAMIC_SL, if the path record service
 *         level lookup fails).
 *
 * NOTE(review): the QP created here is not released on the later failure
 * paths - confirm whether the caller is responsible for the cleanup.
 */
static int xoob_recv_qp_create(mca_btl_openib_endpoint_t *endpoint, mca_btl_openib_rem_info_t *rem_info)
{
    struct ibv_qp_init_attr qp_init_attr;
    struct ibv_qp_attr attr;
    int ret;

    mca_btl_openib_module_t* openib_btl =
        (mca_btl_openib_module_t*)endpoint->endpoint_btl;

    BTL_VERBOSE(("Connecting Recv QP\n"));

    memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr));
    /* Only xrc_domain is required, all other fields are ignored */
    qp_init_attr.xrc_domain = openib_btl->device->xrc_domain;
    ret = ibv_create_xrc_rcv_qp(&qp_init_attr, &endpoint->xrc_recv_qp_num);
    if (ret) {
        BTL_ERROR(("Error creating XRC recv QP[%x], errno says: %s [%d]",
                    endpoint->xrc_recv_qp_num, strerror(ret), ret));
        return OMPI_ERROR;
    }

    /* RESET -> INIT */
    memset(&attr, 0, sizeof(struct ibv_qp_attr));
    attr.qp_state = IBV_QPS_INIT;
    attr.pkey_index = openib_btl->pkey_index;
    attr.port_num = openib_btl->port_num;
    attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
    ret = ibv_modify_xrc_rcv_qp(openib_btl->device->xrc_domain,
            endpoint->xrc_recv_qp_num,
            &attr,
            IBV_QP_STATE|
            IBV_QP_PKEY_INDEX|
            IBV_QP_PORT|
            IBV_QP_ACCESS_FLAGS);
    if (ret) {
        BTL_ERROR(("Error modifying XRC recv QP[%x] to IBV_QPS_INIT, errno says: %s [%d]",
                     endpoint->xrc_recv_qp_num, strerror(ret), ret));
        /* Report the failure to the caller instead of spinning forever */
        return OMPI_ERROR;
    }

    /* INIT -> RTR */
    memset(&attr, 0, sizeof(struct ibv_qp_attr));
    attr.qp_state           = IBV_QPS_RTR;
    /* Use the smaller of the local and the remote MTU; both sides of the
     * comparison must come from the rem_info handed to this function. */
    attr.path_mtu = (openib_btl->device->mtu < rem_info->rem_mtu) ?
        openib_btl->device->mtu : rem_info->rem_mtu;
    attr.dest_qp_num        = rem_info->rem_qps->rem_qp_num;
    attr.rq_psn             = rem_info->rem_qps->rem_psn;
    attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
    attr.min_rnr_timer  = mca_btl_openib_component.ib_min_rnr_timer;
    attr.ah_attr.is_global     = 0;
    attr.ah_attr.dlid          = rem_info->rem_lid;
    attr.ah_attr.src_path_bits = openib_btl->src_path_bits;
    attr.ah_attr.port_num      = openib_btl->port_num;
    attr.ah_attr.static_rate   = 0;
    attr.ah_attr.sl            = mca_btl_openib_component.ib_service_level;

#if (ENABLE_DYNAMIC_SL)
    /* if user enabled dynamic SL, get it from PathRecord */
    if (0 != mca_btl_openib_component.ib_path_record_service_level) {
        int rc = btl_openib_connect_get_pathrecord_sl(
                                openib_btl->device->xrc_domain->context,
                                attr.ah_attr.port_num,
                                openib_btl->lid,
                                attr.ah_attr.dlid);
        if (OMPI_ERROR == rc) {
            return OMPI_ERROR;
        }
        attr.ah_attr.sl = rc;
    }
#endif

    ret = ibv_modify_xrc_rcv_qp(openib_btl->device->xrc_domain,
            endpoint->xrc_recv_qp_num,
            &attr,
            IBV_QP_STATE|
            IBV_QP_AV|
            IBV_QP_PATH_MTU|
            IBV_QP_DEST_QPN|
            IBV_QP_RQ_PSN|
            IBV_QP_MAX_DEST_RD_ATOMIC|
            IBV_QP_MIN_RNR_TIMER);
    if (ret) {
        BTL_ERROR(("Error modifying XRC recv QP[%x] to IBV_QPS_RTR, errno says: %s [%d]",
                    endpoint->xrc_recv_qp_num, strerror(ret), ret));
        return OMPI_ERROR;
    }
#if OPAL_HAVE_THREADS
    /* Load the alternative path for this QP when APM is enabled */
    if (APM_ENABLED) {
        mca_btl_openib_load_apm_xrc_rcv(endpoint->xrc_recv_qp_num, endpoint);
    }
#endif

    return OMPI_SUCCESS;
}
コード例 #3
0
ファイル: btl_openib_async.c プロジェクト: jcybha/NeMoHEDS
/* Function handle async device events */
static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_poll, int index)
{
    int j;
    mca_btl_openib_device_t *device = NULL;
    struct ibv_async_event event;
    bool xrc_event = false;
    int event_type;

    /* We need to find correct device and process this event */
    for (j=0; j < mca_btl_openib_component.ib_num_btls; j++) {
        if (mca_btl_openib_component.openib_btls[j]->device->ib_dev_context->async_fd ==
                devices_poll->async_pollfd[index].fd ) {
            device = mca_btl_openib_component.openib_btls[j]->device;
            break;
        }
    }
    if (NULL != device) {
        if (ibv_get_async_event((struct ibv_context *)device->ib_dev_context,&event) < 0) {
            if (EWOULDBLOCK == errno) {
                /* No event found ?
                 * It was handled by somebody other */
                return OMPI_SUCCESS;
            } else {
                BTL_ERROR(("Failed to get async event"));
                return OMPI_ERROR;
            }
        }

        event_type = event.event_type;
#if HAVE_XRC
        /* is it XRC event ?*/
        if (IBV_XRC_QP_EVENT_FLAG & event.event_type) {
            xrc_event = true;
            /* Clean the bitnd handel as usual */
            event_type ^= IBV_XRC_QP_EVENT_FLAG;
        }
#endif
        switch(event_type) {
            case IBV_EVENT_PATH_MIG:
                BTL_ERROR(("Alternative path migration event reported"));
                if (APM_ENABLED) {
                    BTL_ERROR(("Trying to find additional path..."));
                    if (!xrc_event) 
                        mca_btl_openib_load_apm(event.element.qp,
                                qp2endpoint(event.element.qp, device));
#if HAVE_XRC
                    else
                        mca_btl_openib_load_apm_xrc_rcv(event.element.xrc_qp_num,
                                xrc_qp2endpoint(event.element.xrc_qp_num, device));
#endif
                }
                break;
            case IBV_EVENT_DEVICE_FATAL:
                /* Set the flag to fatal */
                device->got_fatal_event = true;
                /* It is not critical to protect the counter */
                OPAL_THREAD_ADD32(&mca_btl_openib_component.fatal_counter, 1);
            case IBV_EVENT_CQ_ERR:
            case IBV_EVENT_QP_FATAL:
            case IBV_EVENT_QP_REQ_ERR:
            case IBV_EVENT_QP_ACCESS_ERR:
            case IBV_EVENT_PATH_MIG_ERR:
            case IBV_EVENT_SRQ_ERR:
            case IBV_EVENT_PORT_ERR:
                orte_show_help("help-mpi-btl-openib.txt", "of error event",
                    true,orte_process_info.nodename, orte_process_info.pid,
                    event.event_type, openib_event_to_str(event.event_type),
                    xrc_event ? "true" : "false");
                break;
            case IBV_EVENT_COMM_EST:
            case IBV_EVENT_PORT_ACTIVE:
            case IBV_EVENT_SQ_DRAINED:
            case IBV_EVENT_LID_CHANGE:
            case IBV_EVENT_PKEY_CHANGE:
            case IBV_EVENT_SM_CHANGE:
            case IBV_EVENT_QP_LAST_WQE_REACHED:
#if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER
            case IBV_EVENT_CLIENT_REREGISTER:
#endif
            case IBV_EVENT_SRQ_LIMIT_REACHED:
                break;
            default:
                orte_show_help("help-mpi-btl-openib.txt", "of unknown event",
                        true,orte_process_info.nodename, orte_process_info.pid,
                        event.event_type, xrc_event ? "true" : "false");
        }
        ibv_ack_async_event(&event);
    } else {
        /* if (device == NULL), then failed to locate the device!
           This should never happen... */
        BTL_ERROR(("Failed to find device with FD %d.  "
                   "Fatal error, stoping asynch event thread",
                   devices_poll->async_pollfd[index].fd));
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}