/*
 * Receive callback for a "close channel" request.
 *
 * Unpacks one channel number from the incoming buffer, looks the channel up
 * and, if it exists, closes its QoS side, clears its slot in
 * orte_rml_base.open_channels and releases the channel object.
 *
 * status, tag and cbdata are not used.  Failures (unpack error, unknown
 * channel) are reported through ORTE_ERROR_LOG only; nothing is returned.
 */
void orte_rml_close_channel_recv_callback (int status, orte_process_name_t* peer,
                                           struct opal_buffer_t* buffer,
                                           orte_rml_tag_t tag, void* cbdata)
{
    orte_rml_channel_num_t chan_id = 5;
    orte_rml_channel_t *chan;
    int32_t n_items = 1;
    int32_t ret;

    OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output,
                         "%s rml_close_channel_recv_callback from peer %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(peer)));

    /* the payload is a single channel number */
    ret = opal_dss.unpack(buffer, &chan_id, &n_items, OPAL_UINT32);
    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
        return;
    }

    chan = orte_rml_base_get_channel(chan_id);
    OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output,
                         "%s rml_close_channel_recv_callback for channel num =%d channel=%p",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         chan_id, (void*)chan));

    /* nothing to close if the channel is not known locally */
    if (NULL == chan) {
        ORTE_ERROR_LOG(OPAL_ERR_BAD_PARAM);
        return;
    }

    /* tear down the QoS side first, then drop the channel from the table */
    orte_qos_close_channel (chan->qos, chan->qos_channel_ptr);
    opal_pointer_array_set_item (&orte_rml_base.open_channels,
                                 chan->channel_num, NULL);
    OBJ_RELEASE(chan);
}
void orte_qos_ack_recv_msg_timeout_callback (struct opal_hotel_t *hotel, int room_num, void *occupant) { #if OPAL_ENABLE_DEBUG orte_rml_recv_t *msg = (orte_rml_recv_t *) occupant; #endif #if 0 orte_qos_ack_channel_t *ack_chan; orte_rml_channel_t *channel; channel = orte_rml_base_get_channel(msg->channel_num); ack_chan = (orte_qos_ack_channel_t*) channel->qos_channel_ptr; #endif OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output, "%s OOPS received msg = %p seq num =%d timed out on ACK Queue\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void*)msg, msg->seq_num)); /* Need to determine correct action here as the sender hasn't responded yet to a lost msg event */ /* This is highly unlikely - lets assert to enable debug*/ assert(0); /* // set room num to -1 for the msg's seq number ack_chan->seq_num_to_room_num[msg->seq_num % QOS_ACK_MAX_OUTSTANDING_MSGS] = -1; // complete the msg ORTE_RML_REACTIVATE_MESSAGE(msg);*/ }
void orte_qos_ack_channel_process_ack (int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata) { /* process ack received for the msg */ uint32_t num_msgs_acked, channel_num, i; int32_t num_values, room_num; orte_rml_send_t *msg, *missed_msg; void *occupant = NULL; orte_rml_channel_t *channel; orte_qos_ack_channel_t *ack_chan; uint32_t *seq_num_array; uint32_t ack_type; uint32_t missed_msg_seq_num = 0; num_values = 1; /* unpack channel number first */ opal_dss.unpack(buffer, (void*) &channel_num, &num_values, OPAL_UINT32); OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output, "orte_qos_ack_channel_process_ack recieved ack on channel = %d", channel_num)); channel = orte_rml_base_get_channel (channel_num); if ((NULL != channel) || (NULL != channel->qos_channel_ptr)) { ack_chan = (orte_qos_ack_channel_t *) (channel->qos_channel_ptr); seq_num_array = malloc (sizeof(uint32_t) * ack_chan->window); num_values = 1; /* unpack ack type */ opal_dss.unpack(buffer, (void*) &ack_type, &num_values, OPAL_UINT32); num_values = 1; /* unpack num messages acked */ opal_dss.unpack(buffer, (void*) &num_msgs_acked, &num_values, OPAL_UINT32); OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output, "orte_qos_ack_channel_process_ack recieved ack type %d for %d msgs on channel = %d", ack_type, num_msgs_acked, channel_num)); if (ACK_OUT_OF_ORDER != ack_type) { //handle normal ACK for (i = 0; i < num_msgs_acked; i++) { opal_dss.unpack(buffer, (void*) &seq_num_array[i], &num_values, OPAL_UINT32); room_num = orte_qos_ack_channel_get_msg_room (ack_chan, seq_num_array[i]); opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant); orte_qos_ack_channel_set_msg_room(ack_chan, seq_num_array[i], -1); if((occupant != NULL) && (room_num != -1)) { msg = (orte_rml_send_t*) occupant; OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output, "Releasing sent message with tag %d and seq_num %d after 
receiving Ack from dest ", msg->tag, msg->seq_num )); msg->status = ORTE_SUCCESS; ORTE_RML_SEND_COMPLETE(msg); } else { OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output, "OOPS received an ACK for already completed seq_num =%d ", seq_num_array[i] )); } } } else { // handle out of order ACK - complete msgs received in order, retry the lost msg. for (i = 0; i < num_msgs_acked; i++) { opal_dss.unpack(buffer, (void*) &seq_num_array[i], &num_values, OPAL_UINT32); room_num = orte_qos_ack_channel_get_msg_room (ack_chan, seq_num_array[i]); opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant); orte_qos_ack_channel_set_msg_room(ack_chan, seq_num_array[i], -1); if ((NULL != occupant) && ((i == 0 )|| (seq_num_array[i] == seq_num_array[i-1] +1 ))) { msg = (orte_rml_send_t*) occupant; msg->status = ORTE_SUCCESS; ORTE_RML_SEND_COMPLETE(msg); } else { if (NULL != occupant) { // num_missed_msgs = (seq_num_array[i] - seq_num_array [i-1] - 1); assert( i == num_msgs_acked -1); /* recheck the ith msg */ opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)occupant, &room_num); orte_qos_ack_channel_set_msg_room (ack_chan, seq_num_array[i], room_num); /* resend and recheck all the missed msgs*/ missed_msg_seq_num = seq_num_array[i-1] + 1; for (; missed_msg_seq_num < seq_num_array[i]; missed_msg_seq_num++) { room_num = orte_qos_ack_channel_get_msg_room (ack_chan, missed_msg_seq_num); opal_hotel_checkout_and_return_occupant (&ack_chan->outstanding_msgs, room_num, &occupant); assert ( NULL != occupant); missed_msg = (orte_rml_send_t*) occupant; missed_msg->status = ORTE_ERR_LOST_MSG_IN_WINDOW; opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)missed_msg, &room_num); orte_qos_ack_channel_set_msg_room (ack_chan, missed_msg_seq_num, room_num); /* send this out on wire directly */ ORTE_OOB_SEND (missed_msg); } //end for } else { OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output, "OOPS received an ACK for already 
completed seq_num =%d ", seq_num_array[i] )); }//end if (NULL != occupant) } //end else } // end for }//end out of order ack processing free(seq_num_array); }else { OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output, "orte_qos_ack_channel_msg_ack_recv_callback recieved ack on non existent channel = %d", channel_num)); } }
/*
 * Build and send an ACK for the messages received on a channel.
 *
 * The ACK covers in_msg_seq_num - ack_msg_seq_num + 1 sequence numbers,
 * counted up from ack_msg_seq_num + 1, with the final entry forced to
 * last_msg_seq_num.  Packed layout: peer channel number, ack type, count,
 * then the sequence numbers (all OPAL_UINT32).
 *
 * ack_chan          QoS ack state of the channel; ack_msg_seq_num is advanced
 *                   to last_msg_seq_num when the send is accepted
 * channel_num       local channel number used to look up the RML channel
 * ack_type          ack type value placed in the message
 * last_msg_seq_num  sequence number of the last message being acknowledged
 *
 * Returns the result of orte_rml.send_buffer_nb, OPAL_ERR_BAD_PARAM when the
 * channel cannot be found, or ORTE_ERR_TEMP_OUT_OF_RESOURCE when the
 * temporary sequence-number array cannot be allocated.
 */
static inline int send_ack (orte_qos_ack_channel_t * ack_chan,
                            orte_rml_channel_num_t channel_num,
                            uint32_t ack_type, uint32_t last_msg_seq_num)
{
    int rc;
    orte_rml_channel_t *rml_channel;
    opal_buffer_t *buffer;
    uint32_t num_msgs_to_ack = 0;
    uint32_t *ack_seq_num_array;
    uint32_t i;

    rml_channel = orte_rml_base_get_channel (channel_num);
    if (NULL == rml_channel) {
        /* the peer and peer_channel fields are read below: bail out early */
        ORTE_ERROR_LOG(OPAL_ERR_BAD_PARAM);
        return OPAL_ERR_BAD_PARAM;
    }

    num_msgs_to_ack = ack_chan->in_msg_seq_num - ack_chan->ack_msg_seq_num + 1;
    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s sending ack type = %d \n",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ack_type));

    ack_seq_num_array = malloc (sizeof(uint32_t) * num_msgs_to_ack);
    if (NULL == ack_seq_num_array) {
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "%s ack_recv cannot allocate ack array to send ack to peer = %s\n",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&rml_channel->peer)));
        return ORTE_ERR_TEMP_OUT_OF_RESOURCE;
    }

    for (i = 1; i <= num_msgs_to_ack ; i++) {
        ack_seq_num_array[i-1] = ack_chan->ack_msg_seq_num + i;
        OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                             "%s ack_recv acking msg %d to peer = %s\n",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ack_seq_num_array[i-1],
                             ORTE_NAME_PRINT(&rml_channel->peer)));
    }
    /* the final entry always carries the caller-supplied last sequence number */
    ack_seq_num_array[num_msgs_to_ack - 1] = last_msg_seq_num;
    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s ack_recv acking last msg %d to peer = %s\n",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ack_seq_num_array[num_msgs_to_ack - 1],
                         ORTE_NAME_PRINT(&rml_channel->peer)));

    buffer = OBJ_NEW (opal_buffer_t);
    /* pack channel number */
    opal_dss.pack (buffer, &rml_channel->peer_channel, 1, OPAL_UINT32);
    /* pack ack type */
    opal_dss.pack (buffer, &ack_type, 1, OPAL_UINT32);
    /* pack num messages */
    opal_dss.pack (buffer, &num_msgs_to_ack, 1, OPAL_UINT32);
    /* pack seq number array */
    for (i = 0; i < num_msgs_to_ack; i++) {
        opal_dss.pack (buffer, &ack_seq_num_array[i], 1 , OPAL_UINT32);
    }
    /* the values now live in the buffer; the scratch array is no longer needed */
    free (ack_seq_num_array);
    ack_seq_num_array = NULL;

    rc = orte_rml.send_buffer_nb (&rml_channel->peer, buffer, ORTE_RML_TAG_MSG_ACK,
                                  orte_qos_ack_msg_send_callback, rml_channel);
    if (ORTE_SUCCESS == rc) {
        /* update last acked msg */
        ack_chan->ack_msg_seq_num = last_msg_seq_num;
    } else {
        /* TODO: failure handling is still unimplemented.
         * NOTE(review): buffer is not released on this path - confirm who
         * owns it when send_buffer_nb fails. */
    }
    return rc;
}