Esempio n. 1
0
/**
 * Hotel eviction callback for a tracked PMIx server request.
 *
 * Each eviction charges one interval (orte_pmix_server_globals.timeout)
 * against the request's remaining timeout.  While time remains - or when
 * the request carries no positive timeout at all - the request is checked
 * back into orte_pmix_server_globals.reqs and the function returns.  Once
 * the timeout is used up, or if the re-checkin fails, the first non-NULL
 * completion callback stored on the request is invoked with
 * OPAL_ERR_TIMEOUT and the request is released.
 *
 * @param hotel     hotel reporting the eviction (not used here)
 * @param room_num  room the occupant was evicted from (not used here)
 * @param occupant  the evicted occupant; cast to pmix_server_req_t
 */
static void eviction_cbfunc(struct opal_hotel_t *hotel,
                            int room_num, void *occupant)
{
    pmix_server_req_t *req = (pmix_server_req_t*)occupant;
    bool timeout = false;
    int rc;

    /* decrement the request timeout exactly once per eviction.  The
     * interval used to be subtracted twice (once unconditionally, once
     * inside the guard), so a request that reached zero on the first
     * subtraction was never flagged as timed out and was re-checked in
     * forever.  A request whose timeout is not positive is left alone
     * and therefore never expires here. */
    if (req->timeout > 0) {
        req->timeout -= orte_pmix_server_globals.timeout;
        if (0 >= req->timeout) {
            timeout = true;
        }
    }
    if (!timeout) {
        /* not done yet - check us back in */
        if (OPAL_SUCCESS == (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
            return;
        }
        ORTE_ERROR_LOG(rc);
        /* fall thru and return an error so the caller doesn't hang */
    }
    /* don't let the caller hang - only the first callback that is set
     * gets invoked */
    if (NULL != req->opcbfunc) {
        req->opcbfunc(OPAL_ERR_TIMEOUT, req->cbdata);
    } else if (NULL != req->mdxcbfunc) {
        req->mdxcbfunc(OPAL_ERR_TIMEOUT, NULL, 0, req->cbdata, NULL, NULL);
    } else if (NULL != req->spcbfunc) {
        req->spcbfunc(OPAL_ERR_TIMEOUT, ORTE_JOBID_INVALID, req->cbdata);
    } else if (NULL != req->lkcbfunc) {
        req->lkcbfunc(OPAL_ERR_TIMEOUT, NULL, req->cbdata);
    }
    OBJ_RELEASE(req);
}
Esempio n. 2
0
/*
 * Assign the next sequence number of an ACK QoS channel to an outgoing
 * message and check the message into the channel's hotel of outstanding
 * messages.
 *
 * qos_channel - the channel, cast to orte_qos_ack_channel_t
 * msg         - the message being sent; its seq_num field is written
 *
 * Returns ORTE_SUCCESS once the message has a room, or
 * ORTE_ERR_QOS_ACK_WINDOW_FULL when the hotel check-in does not succeed.
 * The channel's sequence/window state is advanced before the check-in
 * is attempted.
 */
static int ack_send ( void *qos_channel,  orte_rml_send_t *msg) {
    orte_qos_ack_channel_t *chan = (orte_qos_ack_channel_t*) qos_channel;
    int32_t room;

    if (chan->window_first_seq_num - 1 != chan->out_msg_seq_num) {
        /* somewhere inside the window: just take the next number */
        chan->out_msg_seq_num++;
    } else {
        /* first message of a window */
        chan->out_msg_seq_num = chan->window_first_seq_num;
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "%s ack_send msg = %p to peer = %s\n begining window at seq_num = %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (void*)msg, ORTE_NAME_PRINT(&msg->dst), chan->out_msg_seq_num));
        chan->state = orte_qos_ack_channel_state_filling_window;
    }

    /* last slot of the window: mark it complete and remember where the
     * next window starts */
    if (chan->window - 1 == chan->out_msg_seq_num - chan->window_first_seq_num) {
        chan->state = orte_qos_ack_channel_state_window_completed;
        chan->window_first_seq_num = chan->out_msg_seq_num + 1;
    }

    msg->seq_num = chan->out_msg_seq_num;

    /* park the message in the hotel until it is acknowledged */
    if (OPAL_SUCCESS != opal_hotel_checkin(&chan->outstanding_msgs, msg, &room)) {
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "%s ack_send msg = %p to peer = %s returned with error %d\n",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (void*)msg, ORTE_NAME_PRINT(&msg->dst),
                             ORTE_ERR_QOS_ACK_WINDOW_FULL));
        return ORTE_ERR_QOS_ACK_WINDOW_FULL;
    }

    /* remember which room holds this sequence number */
    orte_qos_ack_channel_set_msg_room(chan, msg->seq_num, room);
    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s ack_send msg = %p to peer = %s\n",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (void*)msg, ORTE_NAME_PRINT(&msg->dst)));
    return ORTE_SUCCESS;
}
Esempio n. 3
0
/**
 * Event callback that forwards a PMIx server request to its target.
 *
 * The request is checked into orte_pmix_server_globals.reqs, its room
 * number plus the payload held in req->msg are packed into a fresh
 * buffer, and that buffer is sent to req->target on
 * ORTE_RML_TAG_DATA_SERVER.  When the send is accepted the function
 * returns and the request stays in the hotel.  On any failure the
 * request's opcbfunc (or, failing that, lkcbfunc) is called with the
 * error code so the client does not hang, the request is checked out
 * again if it had been checked in, and it is released.
 *
 * @param sd      not used here
 * @param args    not used here
 * @param cbdata  the pmix_server_req_t to execute
 */
static void execute(int sd, short args, void *cbdata)
{
    pmix_server_req_t *req = (pmix_server_req_t*)cbdata;
    int rc;
    opal_buffer_t *xfer;
    bool checked_in = false;

    /* add this request to our tracker hotel */
    if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
        ORTE_ERROR_LOG(rc);
        goto callback;
    }
    checked_in = true;

    /* setup the xfer */
    xfer = OBJ_NEW(opal_buffer_t);
    /* pack the room number */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(xfer, &req->room_num, 1, OPAL_INT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(xfer);
        goto callback;
    }
    /* append the request payload - a failure here would otherwise send
     * a truncated message */
    if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(xfer, &req->msg))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(xfer);
        goto callback;
    }

    /* send the request to the target */
    rc = orte_rml.send_buffer_nb(&req->target, xfer,
                                 ORTE_RML_TAG_DATA_SERVER,
                                 orte_rml_send_callback, NULL);
    if (ORTE_SUCCESS == rc) {
        return;
    }
    /* the send was refused, so the send callback will not run for this
     * buffer - release it here instead of leaking it.
     * NOTE(review): relies on send_buffer_nb leaving ownership with the
     * caller on error - confirm against the RML API */
    OBJ_RELEASE(xfer);

  callback:
    /* execute the callback to avoid having the client hang */
    if (NULL != req->opcbfunc) {
        req->opcbfunc(rc, req->cbdata);
    } else if (NULL != req->lkcbfunc) {
        req->lkcbfunc(rc, NULL, req->cbdata);
    }
    /* only give the room back if we actually got one - after a failed
     * checkin req->room_num was never assigned by the hotel */
    if (checked_in) {
        opal_hotel_checkout(&orte_pmix_server_globals.reqs, req->room_num);
    }
    OBJ_RELEASE(req);
}
Esempio n. 4
0
/**
 * Receive callback that processes an ACK message for an ack QoS channel.
 *
 * The buffer is unpacked as: channel number, ack type, number of acked
 * messages, then one sequence number per acked message (all OPAL_UINT32).
 *
 * - For any ack type other than ACK_OUT_OF_ORDER, every acked message
 *   still held in the channel's hotel is checked out, marked
 *   ORTE_SUCCESS and completed.
 * - For ACK_OUT_OF_ORDER, messages are completed as long as their
 *   sequence numbers are consecutive.  At the first gap the acked
 *   message is checked back in, and every message in the gap is marked
 *   ORTE_ERR_LOST_MSG_IN_WINDOW, checked back in and re-sent.
 *
 * Malformed input (failed unpack, unknown channel, more acked messages
 * than the channel window) is reported through verbose output and the
 * ACK is dropped.
 *
 * @param status  not used here
 * @param sender  not used here
 * @param buffer  packed ACK message
 * @param tag     not used here
 * @param cbdata  not used here
 */
void orte_qos_ack_channel_process_ack (int status, orte_process_name_t* sender,
                                       opal_buffer_t *buffer,
                                       orte_rml_tag_t tag, void *cbdata)
{
    /*  process ack received for the msg */
    uint32_t num_msgs_acked, channel_num, i;
    int32_t num_values, room_num;
    orte_rml_send_t *msg, *missed_msg;
    void *occupant = NULL;
    orte_rml_channel_t *channel;
    orte_qos_ack_channel_t *ack_chan;
    uint32_t *seq_num_array;
    uint32_t ack_type;
    uint32_t missed_msg_seq_num = 0;

    /* unpack channel number first */
    num_values = 1;
    if (ORTE_SUCCESS != opal_dss.unpack(buffer, (void*) &channel_num, &num_values, OPAL_UINT32)) {
        OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                             "orte_qos_ack_channel_process_ack could not unpack channel number"));
        return;
    }
    OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                         "orte_qos_ack_channel_process_ack recieved ack on channel = %d",
                         channel_num));
    channel = orte_rml_base_get_channel (channel_num);
    /* both pointers must be valid; the previous "||" test dereferenced
     * a NULL channel */
    if ((NULL == channel) || (NULL == channel->qos_channel_ptr)) {
        OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                             "orte_qos_ack_channel_msg_ack_recv_callback recieved ack on non existent channel = %d",
                             channel_num));
        return;
    }
    ack_chan = (orte_qos_ack_channel_t *) (channel->qos_channel_ptr);

    /* unpack ack type */
    num_values = 1;
    if (ORTE_SUCCESS != opal_dss.unpack(buffer, (void*) &ack_type, &num_values, OPAL_UINT32)) {
        OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                             "orte_qos_ack_channel_process_ack could not unpack ack type on channel = %d",
                             channel_num));
        return;
    }
    /* unpack num messages acked */
    num_values = 1;
    if (ORTE_SUCCESS != opal_dss.unpack(buffer, (void*) &num_msgs_acked, &num_values, OPAL_UINT32)) {
        OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                             "orte_qos_ack_channel_process_ack could not unpack number of acked msgs on channel = %d",
                             channel_num));
        return;
    }
    OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                         "orte_qos_ack_channel_process_ack recieved ack type %d for %d msgs on channel = %d",
                         ack_type, num_msgs_acked, channel_num));

    /* the scratch array below holds one entry per window slot, and the
     * count comes straight off the wire - refuse anything larger */
    if (num_msgs_acked > ack_chan->window) {
        OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                             "orte_qos_ack_channel_process_ack ack for %d msgs exceeds window on channel = %d",
                             num_msgs_acked, channel_num));
        return;
    }
    seq_num_array = malloc (sizeof(uint32_t) * ack_chan->window);
    if (NULL == seq_num_array) {
        OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                             "orte_qos_ack_channel_process_ack out of memory on channel = %d",
                             channel_num));
        return;
    }

    if (ACK_OUT_OF_ORDER != ack_type)   {
        //handle normal ACK
        for (i = 0; i < num_msgs_acked; i++) {
            num_values = 1;
            if (ORTE_SUCCESS != opal_dss.unpack(buffer, (void*) &seq_num_array[i], &num_values, OPAL_UINT32)) {
                OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                                     "orte_qos_ack_channel_process_ack could not unpack seq_num %d of %d",
                                     i, num_msgs_acked));
                break;
            }
            room_num = orte_qos_ack_channel_get_msg_room (ack_chan, seq_num_array[i]);
            /* -1 is the "no room" marker stored below; it is not a room
             * the hotel knows about, so never hand it to the hotel */
            occupant = NULL;
            if (-1 != room_num) {
                opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant);
            }
            orte_qos_ack_channel_set_msg_room(ack_chan, seq_num_array[i], -1);
            if (NULL != occupant) {
                msg = (orte_rml_send_t*) occupant;
                OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                     "Releasing sent message with tag %d and seq_num %d after receiving Ack from dest ",
                                     msg->tag, msg->seq_num ));
                msg->status = ORTE_SUCCESS;
                ORTE_RML_SEND_COMPLETE(msg);
            } else {
                OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                     "OOPS received an ACK for already completed seq_num =%d ",
                                     seq_num_array[i] ));
            }
        }
    } else {
        // handle out of order ACK - complete msgs received in order, retry the lost msg.
        for (i = 0; i < num_msgs_acked; i++) {
            num_values = 1;
            if (ORTE_SUCCESS != opal_dss.unpack(buffer, (void*) &seq_num_array[i], &num_values, OPAL_UINT32)) {
                OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                                     "orte_qos_ack_channel_process_ack could not unpack seq_num %d of %d",
                                     i, num_msgs_acked));
                break;
            }
            room_num = orte_qos_ack_channel_get_msg_room (ack_chan, seq_num_array[i]);
            occupant = NULL;
            if (-1 != room_num) {
                opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant);
            }
            orte_qos_ack_channel_set_msg_room(ack_chan, seq_num_array[i], -1);
            if (NULL == occupant) {
                OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                     "OOPS received an ACK for already completed seq_num =%d ",
                                     seq_num_array[i] ));
                continue;
            }
            if ((i == 0) || (seq_num_array[i] == seq_num_array[i-1] + 1)) {
                /* still in sequence - this message is done */
                msg = (orte_rml_send_t*) occupant;
                msg->status = ORTE_SUCCESS;
                ORTE_RML_SEND_COMPLETE(msg);
                continue;
            }

            /* gap between entry i-1 and entry i (i > 0 is guaranteed
             * here); the gap is expected to sit in front of the last
             * acked entry */
            // num_missed_msgs = (seq_num_array[i] - seq_num_array [i-1] - 1);
            assert( i == num_msgs_acked -1);
            /* recheck the ith msg; if the hotel refuses it, record "no
             * room" rather than a stale room number */
            if (ORTE_SUCCESS != opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)occupant, &room_num)) {
                room_num = -1;
            }
            orte_qos_ack_channel_set_msg_room (ack_chan, seq_num_array[i], room_num);
            /* resend and recheck all the missed msgs*/
            missed_msg_seq_num = seq_num_array[i-1] + 1;
            for (; missed_msg_seq_num < seq_num_array[i]; missed_msg_seq_num++) {
                room_num = orte_qos_ack_channel_get_msg_room (ack_chan, missed_msg_seq_num);
                occupant = NULL;
                if (-1 != room_num) {
                    opal_hotel_checkout_and_return_occupant (&ack_chan->outstanding_msgs, room_num, &occupant);
                }
                assert ( NULL != occupant);
                if (NULL == occupant) {
                    /* asserts may be compiled out - do not dereference
                     * a message that is no longer in the hotel */
                    OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                         "OOPS missed msg with seq_num =%d is not in the hotel ",
                                         missed_msg_seq_num ));
                    continue;
                }
                missed_msg = (orte_rml_send_t*) occupant;
                missed_msg->status = ORTE_ERR_LOST_MSG_IN_WINDOW;
                if (ORTE_SUCCESS != opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)missed_msg, &room_num)) {
                    room_num = -1;
                }
                orte_qos_ack_channel_set_msg_room (ack_chan, missed_msg_seq_num, room_num);
                /* send this out on wire directly */
                ORTE_OOB_SEND (missed_msg);
            } //end for
        } // end for
    }//end out of order ack processing
    free(seq_num_array);
}
Esempio n. 5
0
/**
 * Handle a message that arrived out of sequence on an ack QoS channel.
 *
 * A message whose sequence number already has a room is reported as a
 * duplicate.  Otherwise the message is checked into the channel's hotel
 * and its room recorded.  Then:
 *  - if ack_msg_seq_num <= in_msg_seq_num, an ACK_OUT_OF_ORDER ack is
 *    sent for this message and the window ack timer is removed;
 *  - else, if this message's sequence number is below ack_msg_seq_num
 *    and every sequence number from in_msg_seq_num + 1 through
 *    ack_msg_seq_num now has a room, the held messages are checked out
 *    and completed in order until a sequence number with no room is
 *    reached, and an ACK_RECV_MISSED_MSG ack is sent.
 *
 * @param ack_chan  channel the message arrived on
 * @param msg       the received message
 * @return ORTE_ERR_DUPLICATE_MSG for a duplicate, the hotel's error code
 *         if the check-in fails, ORTE_ERR_OUT_OF_ORDER_MSG otherwise
 */
static inline int process_out_of_order_msg ( orte_qos_ack_channel_t *ack_chan,
        orte_rml_recv_t *msg)
{
    int32_t rc, room_num, first_lost_msg_seq_num, num_lost_msgs, i;
    orte_rml_recv_t *out_msg;
    void *occupant = NULL;
    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s process_out_of_order_msg msg %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         msg->seq_num));
    /* if this msg is a duplicate - then do nothing */
    if ((orte_qos_ack_channel_get_msg_room(ack_chan, msg->seq_num)) != -1) {
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "%s process_out_of_order_msg msg %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             msg->seq_num));
        rc = ORTE_ERR_DUPLICATE_MSG;
    }
    else {
        if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)msg, &room_num))) {
            return rc;
        }
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "process_out_of_order_msg checked in msg %d in room %d\n",
                              msg->seq_num, room_num));
        orte_qos_ack_channel_set_msg_room (ack_chan, msg->seq_num, room_num);
        rc = ORTE_ERR_OUT_OF_ORDER_MSG;
        /*  check if we need to send an ACK */
        if (ack_chan->ack_msg_seq_num <= ack_chan->in_msg_seq_num) {
            OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                                 "%s process_out_of_order_msg sending ack last seq_num = %d\n",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 msg->seq_num));
            /* send ACK. */
            send_ack (ack_chan, msg->channel_num, ACK_OUT_OF_ORDER, msg->seq_num);
            /* stop window ack timer */
            opal_event_evtimer_del (&ack_chan->msg_ack_timer_event);
        }
        else {
            /* if we got a lost msg - any seq num between in_msg_seq_num and ack_seq_num*/
            if (ack_chan->ack_msg_seq_num > msg->seq_num) {
                /* check if we have got all lost msgs */
                first_lost_msg_seq_num = ack_chan->in_msg_seq_num + 1;
                num_lost_msgs = ack_chan->ack_msg_seq_num - ack_chan->in_msg_seq_num;
                OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                                     "%s process_out_of_order_msg msg %d first_lost_msg =%d num_lost_msgs =%d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     msg->seq_num, first_lost_msg_seq_num, num_lost_msgs));
                for (i =0 ; i < num_lost_msgs; i++) {
                    if ((orte_qos_ack_channel_get_msg_room(ack_chan, first_lost_msg_seq_num +i)) == -1) {
                        break;
                    }
                }
                if (i == num_lost_msgs) {

                    /* we got all the lost msgs so we can complete all the msgs in the hotel now */
                    /* reset ack_seq_num */
                    ack_chan->ack_msg_seq_num = first_lost_msg_seq_num -1;
                    room_num = 0;
                    /* walk forward until the first sequence number that
                     * has no room (-1) */
                    for ( i = 0; room_num != -1; i++) {
                        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                                             "%s process_out_of_order_msg got all lost msgs  completing outstanding msgs %d",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                             (first_lost_msg_seq_num + i)));
                        /* evict msg and complete it */
                        room_num = orte_qos_ack_channel_get_msg_room (ack_chan, first_lost_msg_seq_num +i);
                        /* the terminating iteration always sees -1; that
                         * is a "no room" marker and must not be passed
                         * to the hotel as a room index.  Reset occupant
                         * so a previous iteration's message is never
                         * picked up again. */
                        occupant = NULL;
                        if (-1 != room_num) {
                            opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant);
                        }
                        orte_qos_ack_channel_set_msg_room(ack_chan, first_lost_msg_seq_num +i, -1);
                        out_msg = (orte_rml_recv_t *) occupant;
                        if (NULL != out_msg) {
                            // set in seq num */
                            ack_chan->in_msg_seq_num = out_msg->seq_num;
                            orte_rml_base_complete_recv_msg(&out_msg);
                            /* completing recv msg to rml */
                            OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                                                 "process_out_of_order_msg completed recv msg %d",
                                                 (first_lost_msg_seq_num + i)));
                        } else {
                            OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                                                 "%s process_out_of_order_msg lost msg %d not in hotel",
                                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                 (first_lost_msg_seq_num + i)));
                        }
                    } //end for
                    /* send ACK */
                    send_ack (ack_chan, ack_chan->channel_num, ACK_RECV_MISSED_MSG,
                                ack_chan->in_msg_seq_num);
                } //end if (i== num_lost_msgs)
            } // if (ack_chan->ack_msg_seq_num > msg->seq_num)
        } //end else
    } // end duplicate else
    return rc;
}
Esempio n. 6
0
static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
                                  opal_buffer_t *buffer,
                                  orte_rml_tag_t tg, void *cbdata)
{
    int rc, room_num;
    int32_t cnt;
    opal_process_name_t idreq;
    orte_process_name_t name;
    orte_job_t *jdata;
    orte_proc_t *proc;
    pmix_server_req_t *req;


    /* unpack the id of the proc whose data is being requested */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &idreq, &cnt, OPAL_NAME))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    opal_output_verbose(2, orte_pmix_server_globals.output,
                        "%s dmdx:recv request from proc %s for proc %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(sender),
                        ORTE_NAME_PRINT(&idreq));
    /* and the remote daemon's tracking room number */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &room_num, &cnt, OPAL_INT))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    /* is this proc one of mine? */
    memcpy((char*)&name, (char*)&idreq, sizeof(orte_process_name_t));
    if (NULL == (jdata = orte_get_job_data_object(name.jobid))) {
        /* not having the jdata means that we haven't unpacked the
         * the launch message for this job yet - this is a race
         * condition, so just log the request and we will fill
         * it later */
        req = OBJ_NEW(pmix_server_req_t);
        req->proxy = *sender;
        req->target = idreq;
        req->remote_room_num = room_num;
        if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
            OBJ_RELEASE(req);
            send_error(rc, &idreq, sender);
        }
        return;
    }
    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, name.vpid))) {
        /* this is truly an error, so notify the sender */
        send_error(ORTE_ERR_NOT_FOUND, &idreq, sender);
        return;
    }
    if (!ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_LOCAL)) {
        /* send back an error - they obviously have made a mistake */
        send_error(ORTE_ERR_NOT_FOUND, &idreq, sender);
        return;
    }
    /* track the request since the call down to the PMIx server
     * is asynchronous */
    req = OBJ_NEW(pmix_server_req_t);
    req->proxy = *sender;
    req->target = idreq;
    req->remote_room_num = room_num;
    if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
        OBJ_RELEASE(req);
        send_error(rc, &idreq, sender);
        return;
    }

    /* ask our local pmix server for the data */
    if (OPAL_SUCCESS != (rc = opal_pmix.server_dmodex_request(&idreq, modex_resp, req))) {
        ORTE_ERROR_LOG(rc);
        opal_hotel_checkout(&orte_pmix_server_globals.reqs, req->room_num);
        OBJ_RELEASE(req);
        send_error(rc, &idreq, sender);
        return;
    }
    return;
}