Exemplo n.º 1
0
void orte_qos_ack_channel_process_ack (int status, orte_process_name_t* sender,
                                       opal_buffer_t *buffer,
                                       orte_rml_tag_t tag, void *cbdata)
{
    /*  process ack received for the msg */
    uint32_t num_msgs_acked, channel_num, i;
    int32_t num_values, room_num;
    orte_rml_send_t *msg, *missed_msg;
    void *occupant = NULL;
    orte_rml_channel_t *channel;
    orte_qos_ack_channel_t *ack_chan;
    uint32_t *seq_num_array;
    uint32_t ack_type;
    uint32_t missed_msg_seq_num = 0;
    num_values = 1;
    /* unpack channel number first */
    opal_dss.unpack(buffer, (void*) &channel_num, &num_values, OPAL_UINT32);
    OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                         "orte_qos_ack_channel_process_ack recieved ack on channel = %d",
                         channel_num));
    channel = orte_rml_base_get_channel (channel_num);
    if ((NULL != channel) || (NULL != channel->qos_channel_ptr)) {
        ack_chan = (orte_qos_ack_channel_t *) (channel->qos_channel_ptr);
        seq_num_array = malloc (sizeof(uint32_t) * ack_chan->window);
        num_values = 1;
        /* unpack ack type */
        opal_dss.unpack(buffer, (void*) &ack_type, &num_values, OPAL_UINT32);
        num_values = 1;
        /* unpack num messages acked */
        opal_dss.unpack(buffer, (void*) &num_msgs_acked, &num_values, OPAL_UINT32);
        OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                             "orte_qos_ack_channel_process_ack recieved ack type %d for %d msgs on channel = %d",
                             ack_type, num_msgs_acked, channel_num));
        if (ACK_OUT_OF_ORDER != ack_type)   {
            //handle normal ACK
            for (i = 0; i < num_msgs_acked; i++)
                {
                    opal_dss.unpack(buffer, (void*) &seq_num_array[i], &num_values, OPAL_UINT32);
                    room_num = orte_qos_ack_channel_get_msg_room (ack_chan, seq_num_array[i]);
                    opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant);
                    orte_qos_ack_channel_set_msg_room(ack_chan, seq_num_array[i], -1);
                    if((occupant != NULL) && (room_num != -1)) {
                        msg = (orte_rml_send_t*) occupant;
                        OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                             "Releasing sent message with tag %d and seq_num %d after receiving Ack from dest ",
                                             msg->tag, msg->seq_num ));
                        msg->status = ORTE_SUCCESS;
                        ORTE_RML_SEND_COMPLETE(msg);
                    } else {
                        OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                             "OOPS received an ACK for already completed seq_num =%d ",
                                             seq_num_array[i] ));
                    }
                }
        } else {
            // handle out of order ACK - complete msgs received in order, retry the lost msg.
            for (i = 0; i < num_msgs_acked; i++)
                {
                    opal_dss.unpack(buffer, (void*) &seq_num_array[i], &num_values, OPAL_UINT32);
                    room_num = orte_qos_ack_channel_get_msg_room (ack_chan, seq_num_array[i]);
                    opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant);
                    orte_qos_ack_channel_set_msg_room(ack_chan, seq_num_array[i], -1);
                    if ((NULL != occupant) && ((i == 0 )|| (seq_num_array[i] == seq_num_array[i-1] +1 ))) {
                        msg = (orte_rml_send_t*) occupant;
                        msg->status = ORTE_SUCCESS;
                        ORTE_RML_SEND_COMPLETE(msg);
                    } else {
                        if (NULL != occupant) {
                            // num_missed_msgs = (seq_num_array[i] - seq_num_array [i-1] - 1);
                            assert( i == num_msgs_acked -1);
                            /* recheck the ith msg */
                            opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)occupant, &room_num);
                            orte_qos_ack_channel_set_msg_room (ack_chan, seq_num_array[i], room_num);
                            /* resend and recheck all the missed msgs*/
                            missed_msg_seq_num = seq_num_array[i-1] + 1;
                            for (; missed_msg_seq_num < seq_num_array[i]; missed_msg_seq_num++) {
                                room_num = orte_qos_ack_channel_get_msg_room (ack_chan, missed_msg_seq_num);
                                opal_hotel_checkout_and_return_occupant (&ack_chan->outstanding_msgs, room_num, &occupant);
                                assert ( NULL != occupant);
                                missed_msg = (orte_rml_send_t*) occupant;
                                missed_msg->status = ORTE_ERR_LOST_MSG_IN_WINDOW;
                                opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)missed_msg, &room_num);
                                orte_qos_ack_channel_set_msg_room (ack_chan, missed_msg_seq_num, room_num);
                                /* send this out on wire directly */
                                ORTE_OOB_SEND (missed_msg);
                            } //end for
                        } else {
                            OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                                 "OOPS received an ACK for already completed seq_num =%d ",
                                                 seq_num_array[i] ));
                        }//end  if (NULL != occupant)
                    } //end else
                } // end for
        }//end out of order ack processing
        free(seq_num_array);
    }else {
        OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                             "orte_qos_ack_channel_msg_ack_recv_callback recieved ack on non existent channel = %d",
                             channel_num));
    }
}
Exemplo n.º 2
0
/*
 * RML receive callback: handle the data server's reply to a request tracked
 * in orte_pmix_server_globals.reqs.
 *
 * The buffer is unpacked as: room number of the request tracker (INT),
 * returned status (INT) and, when the status is ORTE_SUCCESS, zero or more
 * (source name, value) pairs that are collected into a list of
 * opal_pmix_pdata_t.
 *
 * The tracker is checked out of the hotel and the result is handed to its
 * opcbfunc if set, otherwise to its lkcbfunc together with the collected
 * data.  The local list and everything on it is released on every path,
 * including when no tracker is found.
 *
 * status, sender, tg and cbdata are not used by this function.
 */
void pmix_server_keyval_client(int status, orte_process_name_t* sender,
                               opal_buffer_t *buffer,
                               orte_rml_tag_t tg, void *cbdata)
{
    int rc, ret, room_num = -1;
    int32_t cnt;
    pmix_server_req_t *req=NULL;
    opal_list_t info;
    opal_value_t *iptr;
    opal_pmix_pdata_t *pdata;
    opal_process_name_t source;

    opal_output_verbose(1, orte_pmix_server_globals.output,
                        "%s recvd lookup data return",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    OBJ_CONSTRUCT(&info, opal_list_t);
    /* unpack the room number of the request tracker */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &room_num, &cnt, OPAL_INT))) {
        ORTE_ERROR_LOG(rc);
        /* without a room number there is no tracker to notify, but the
         * constructed list must still be torn down */
        OPAL_LIST_DESTRUCT(&info);
        return;
    }

    /* unpack the status */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &ret, &cnt, OPAL_INT))) {
        ORTE_ERROR_LOG(rc);
        ret = rc;
        goto release;
    }

    opal_output_verbose(5, orte_pmix_server_globals.output,
                        "%s recvd lookup returned status %d",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ret);

    if (ORTE_SUCCESS == ret) {
        /* see if any data was included - not an error if the answer is no */
        cnt = 1;
        while (OPAL_SUCCESS == opal_dss.unpack(buffer, &source, &cnt, OPAL_NAME)) {
            pdata = OBJ_NEW(opal_pmix_pdata_t);
            pdata->proc = source;
            cnt = 1;
            if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &iptr, &cnt, OPAL_VALUE))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(pdata);
                cnt = 1;
                continue;
            }
            opal_output_verbose(5, orte_pmix_server_globals.output,
                                "%s recvd lookup returned data %s of type %d from source %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key, iptr->type,
                                ORTE_NAME_PRINT(&source));
            rc = opal_value_xfer(&pdata->value, iptr);
            /* the unpacked value is only a staging copy in either case */
            OBJ_RELEASE(iptr);
            /* request exactly one element on the next unpack */
            cnt = 1;
            if (OPAL_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(pdata);
                continue;
            }
            opal_list_append(&info, &pdata->super);
        }
    }

  release:
    /* the room number came off the wire: only use it as a hotel index when
     * it lies inside the hotel */
    if (0 <= room_num && room_num < orte_pmix_server_globals.reqs.num_rooms) {
        /* retrieve the tracker */
        opal_hotel_checkout_and_return_occupant(&orte_pmix_server_globals.reqs, room_num, (void**)&req);
    }

    if (NULL != req) {
        /* pass down the response */
        if (NULL != req->opcbfunc) {
            req->opcbfunc(ret, req->cbdata);
        } else if (NULL != req->lkcbfunc) {
            req->lkcbfunc(ret, &info, req->cbdata);
        } else {
            /* should not happen */
            ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
        }
        OBJ_RELEASE(req);
    }

    /* cleanup - done whether or not a tracker was found so collected
     * pdata items are never leaked */
    OPAL_LIST_DESTRUCT(&info);
}
Exemplo n.º 3
0
/*
 * Handle a received message whose sequence number is not the next one
 * expected on an ACK-based QoS channel.
 *
 * Returns:
 *   ORTE_ERR_DUPLICATE_MSG     - a room is already recorded for this
 *                                sequence number; the message is ignored.
 *   the opal_hotel_checkin rc  - the message could not be checked in.
 *   ORTE_ERR_OUT_OF_ORDER_MSG  - the message was checked into the
 *                                outstanding-message hotel.
 *
 * After check-in:
 *   - if ack_msg_seq_num <= in_msg_seq_num, an ACK_OUT_OF_ORDER ack is sent
 *     for this message and the window ack timer is removed;
 *   - otherwise, if this message's sequence number is below ack_msg_seq_num
 *     and every sequence number in (in_msg_seq_num, ack_msg_seq_num] now has
 *     a room, the stored messages are completed in sequence order until the
 *     first sequence number with no room, and an ACK_RECV_MISSED_MSG ack is
 *     sent for the last completed sequence number.
 */
static inline int process_out_of_order_msg ( orte_qos_ack_channel_t *ack_chan,
        orte_rml_recv_t *msg)
{
    int32_t rc, room_num, first_lost_msg_seq_num, num_lost_msgs, i;
    orte_rml_recv_t *out_msg;
    void *occupant = NULL;

    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s process_out_of_order_msg msg %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         msg->seq_num));

    /* if this msg is a duplicate - then do nothing */
    if (-1 != orte_qos_ack_channel_get_msg_room(ack_chan, msg->seq_num)) {
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "%s process_out_of_order_msg msg %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             msg->seq_num));
        return ORTE_ERR_DUPLICATE_MSG;
    }

    /* park the message until the ones before it arrive */
    if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)msg, &room_num))) {
        return rc;
    }
    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "process_out_of_order_msg checked in msg %d in room %d\n",
                          msg->seq_num, room_num));
    orte_qos_ack_channel_set_msg_room (ack_chan, msg->seq_num, room_num);
    rc = ORTE_ERR_OUT_OF_ORDER_MSG;

    /*  check if we need to send an ACK */
    if (ack_chan->ack_msg_seq_num <= ack_chan->in_msg_seq_num) {
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "%s process_out_of_order_msg sending ack last seq_num = %d\n",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             msg->seq_num));
        /* send ACK. */
        send_ack (ack_chan, msg->channel_num, ACK_OUT_OF_ORDER, msg->seq_num);
        /* stop window ack timer */
        opal_event_evtimer_del (&ack_chan->msg_ack_timer_event);
        return rc;
    }

    /* only a lost msg - any seq num between in_msg_seq_num and ack_seq_num -
     * can close the gap */
    if (!(ack_chan->ack_msg_seq_num > msg->seq_num)) {
        return rc;
    }

    /* check if we have got all lost msgs */
    first_lost_msg_seq_num = ack_chan->in_msg_seq_num + 1;
    num_lost_msgs = ack_chan->ack_msg_seq_num - ack_chan->in_msg_seq_num;
    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s process_out_of_order_msg msg %d first_lost_msg =%d num_lost_msgs =%d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         msg->seq_num, first_lost_msg_seq_num, num_lost_msgs));
    for (i = 0; i < num_lost_msgs; i++) {
        if (-1 == orte_qos_ack_channel_get_msg_room(ack_chan, first_lost_msg_seq_num + i)) {
            break;
        }
    }
    if (i != num_lost_msgs) {
        /* still waiting for at least one lost msg */
        return rc;
    }

    /* we got all the lost msgs so we can complete all the msgs in the hotel now */
    /* reset ack_seq_num */
    ack_chan->ack_msg_seq_num = first_lost_msg_seq_num - 1;
    for (i = 0; ; i++) {
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "%s process_out_of_order_msg got all lost msgs  completing outstanding msgs %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (first_lost_msg_seq_num + i)));
        /* evict msg and complete it */
        room_num = orte_qos_ack_channel_get_msg_room (ack_chan, first_lost_msg_seq_num + i);
        occupant = NULL;
        if (-1 != room_num) {
            /* -1 means "no room recorded" and is not a valid hotel room
             * index, so the hotel is only consulted for real rooms */
            opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant);
        }
        orte_qos_ack_channel_set_msg_room(ack_chan, first_lost_msg_seq_num + i, -1);
        out_msg = (orte_rml_recv_t *) occupant;
        if (NULL != out_msg) {
            /* set in seq num */
            ack_chan->in_msg_seq_num = out_msg->seq_num;
            orte_rml_base_complete_recv_msg(&out_msg);
            /* completing recv msg to rml */
            OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                                 "process_out_of_order_msg completed recv msg %d",
                                 (first_lost_msg_seq_num + i)));
        } else {
            OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                                 "%s process_out_of_order_msg lost msg %d not in hotel",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (first_lost_msg_seq_num + i)));
        }
        if (-1 == room_num) {
            /* first sequence number with no stored msg ends the run */
            break;
        }
    }
    /* send ACK */
    send_ack (ack_chan, ack_chan->channel_num, ACK_RECV_MISSED_MSG,
                ack_chan->in_msg_seq_num);
    return rc;
}
Exemplo n.º 4
0
/*
 * RML receive callback: handle a direct-modex response from another daemon.
 *
 * The buffer is unpacked as: returned status (INT), the name of the target
 * process the data belongs to (NAME) and the room number of our request
 * tracker (INT).  The remainder of the buffer is unloaded into a
 * datacaddy_t.
 *
 * The tracker in that room is checked out and, if it has an mdxcbfunc, the
 * data is handed to it.  Every other tracker still in the hotel whose
 * target matches (jobid and vpid) is served with the same data and checked
 * out as well.
 *
 * The caddy is retained once per callback invocation and passed along with
 * relcbfunc; this function drops its own reference at the end.
 *
 * status, tg and cbdata are not used by this function.
 */
static void pmix_server_dmdx_resp(int status, orte_process_name_t* sender,
                                  opal_buffer_t *buffer,
                                  orte_rml_tag_t tg, void *cbdata)
{
    int rc, ret, room_num, rnum;
    int32_t cnt;
    opal_process_name_t target;
    pmix_server_req_t *req = NULL;
    datacaddy_t *d;

    opal_output_verbose(2, orte_pmix_server_globals.output,
                        "%s dmdx:recv response from proc %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(sender));

    /* unpack the status */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &ret, &cnt, OPAL_INT))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* unpack the id of the target whose info we just received */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &target, &cnt, OPAL_NAME))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* unpack our tracking room number */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &room_num, &cnt, OPAL_INT))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* unload the remainder of the buffer */
    d = OBJ_NEW(datacaddy_t);
    if (OPAL_SUCCESS != (rc = opal_dss.unload(buffer, (void**)&d->data, &d->ndata))) {
        ORTE_ERROR_LOG(rc);
        /* nobody else holds the caddy yet - drop it instead of leaking it */
        OBJ_RELEASE(d);
        return;
    }

    /* check the request out of the tracking hotel; the room number came off
     * the wire, so only use it as an index when it lies inside the hotel */
    if (0 <= room_num && room_num < orte_pmix_server_globals.reqs.num_rooms) {
        opal_hotel_checkout_and_return_occupant(&orte_pmix_server_globals.reqs, room_num, (void**)&req);
    }
    /* return the returned data to the requestor */
    if (NULL != req) {
        if (NULL != req->mdxcbfunc) {
            OBJ_RETAIN(d);
            req->mdxcbfunc(ret, d->data, d->ndata, req->cbdata, relcbfunc, d);
        }
        OBJ_RELEASE(req);
    }

    /* now see if anyone else was waiting for data from this target */
    for (rnum=0; rnum < orte_pmix_server_globals.reqs.num_rooms; rnum++) {
        opal_hotel_knock(&orte_pmix_server_globals.reqs, rnum, (void**)&req);
        if (NULL == req) {
            continue;
        }
        if (req->target.jobid == target.jobid &&
            req->target.vpid == target.vpid) {
            if (NULL != req->mdxcbfunc) {
                OBJ_RETAIN(d);
                req->mdxcbfunc(ret, d->data, d->ndata, req->cbdata, relcbfunc, d);
            }
            opal_hotel_checkout(&orte_pmix_server_globals.reqs, rnum);
            OBJ_RELEASE(req);
        }
    }
    OBJ_RELEASE(d);  // maintain accounting
}