예제 #1
0
/**
 * Poll the interface's remote completion queue once and service the mailbox
 * of the endpoint that the dequeued event refers to.
 *
 * @param iface  SMSG interface whose remote_cq is polled.
 *
 * @return UCS_OK            when the queue had no event (GNI_RC_NOT_DONE), or
 *                           when an overrun was detected and handed to
 *                           uct_ugni_smsg_handle_remote_overflow().
 *         UCS_INPROGRESS    when one event was dequeued successfully (its
 *                           endpoint's mailbox was processed, or the event was
 *                           skipped because no endpoint matched it).
 *         UCS_ERR_IO_ERROR  when GNI_CqGetEvent reported any other failure.
 */
ucs_status_t progress_remote_cq(uct_ugni_smsg_iface_t *iface)
{
    gni_return_t ugni_rc;
    gni_cq_entry_t event_data;
    uct_ugni_ep_t tl_ep;
    uct_ugni_ep_t *ugni_ep;
    uct_ugni_smsg_ep_t *ep;

    ugni_rc = GNI_CqGetEvent(iface->remote_cq, &event_data);

    /* Nothing pending on the remote CQ. */
    if (GNI_RC_NOT_DONE == ugni_rc) {
        return UCS_OK;
    }

    if (GNI_RC_SUCCESS != ugni_rc || !GNI_CQ_STATUS_OK(event_data) || GNI_CQ_OVERRUN(event_data)) {
        /* An overrun is reported either through the return code or through
         * the overrun bit of an otherwise successful event. */
        if (GNI_RC_ERROR_RESOURCE == ugni_rc || (GNI_RC_SUCCESS == ugni_rc && GNI_CQ_OVERRUN(event_data))) {
            ucs_debug("Detected remote CQ overrun. ungi_rc = %d [%s]", ugni_rc, gni_err_str[ugni_rc]);
            uct_ugni_smsg_handle_remote_overflow(iface);
            return UCS_OK;
        }
        ucs_error("GNI_CqGetEvent falied with unhandled error. Error status %s %d ",
                  gni_err_str[ugni_rc], ugni_rc);
        return UCS_ERR_IO_ERROR;
    }

    /* Only hash_key is filled in: tl_ep is a lookup key, not a real endpoint. */
    tl_ep.hash_key = GNI_CQ_GET_INST_ID(event_data);
    ugni_ep = sglib_hashed_uct_ugni_ep_t_find_member(iface->super.eps, &tl_ep);
    if (NULL == ugni_ep) {
        /* No endpoint is registered under this instance id. Skip the event
         * rather than hand a NULL endpoint to process_mbox().
         * NOTE(review): presumably this happens when the endpoint was removed
         * while the event was in flight - confirm that skipping is the
         * desired policy. */
        ucs_debug("Remote CQ event for unknown endpoint (inst_id %lu), ignoring",
                  (unsigned long)GNI_CQ_GET_INST_ID(event_data));
        return UCS_INPROGRESS;
    }

    ep = ucs_derived_of(ugni_ep, uct_ugni_smsg_ep_t);

    process_mbox(iface, ep);
    return UCS_INPROGRESS;
}
예제 #2
0
/**
 * Recover from an overrun of the interface's remote completion queue.
 *
 * Discards every event still queued on iface->remote_cq, then calls
 * process_mbox() on each endpoint stored in iface->super.eps.
 *
 * @param iface  SMSG interface whose remote CQ overflowed.
 */
static void uct_ugni_smsg_handle_remote_overflow(uct_ugni_smsg_iface_t *iface)
{
    struct sglib_hashed_uct_ugni_ep_t_iterator it;
    gni_cq_entry_t discarded;
    uct_ugni_ep_t *cur;

    /* The lost completion cannot be attributed to a particular endpoint, so
     * throw away whatever is still queued; the event contents are ignored. */
    while (GNI_RC_NOT_DONE != GNI_CqGetEvent(iface->remote_cq, &discarded)) {
        /* keep draining */
    }

    /* ...and then service the mailbox of every endpoint on the interface. */
    for (cur = sglib_hashed_uct_ugni_ep_t_it_init(&it, iface->super.eps);
         NULL != cur;
         cur = sglib_hashed_uct_ugni_ep_t_it_next(&it)) {
        process_mbox(iface, ucs_derived_of(cur, uct_ugni_smsg_ep_t));
    }
}