void orte_qos_ack_msg_ack_timeout_callback (struct opal_hotel_t *hotel, int room_num, void *occupant) { orte_rml_send_t *msg; orte_qos_ack_channel_t *ack_chan; msg = (orte_rml_send_t *) occupant; ack_chan = (orte_qos_ack_channel_t*) msg->channel->qos_channel_ptr; OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output, "%s orte_qos_ack_msg_ack_timeout_callback for msg = %p seq num =%d\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void*)msg, msg->seq_num)); /* for now complete only the msg that timed out TO DO : handle the completion of all messages in the window */ msg->status = ORTE_ERR_ACK_TIMEOUT_SENDER; // set room num to -1 for the msg's seq number orte_qos_ack_channel_set_msg_room (ack_chan, msg->seq_num , -1); // complete the msg ORTE_RML_SEND_COMPLETE(msg); }
/*
 * Callback invoked when an ACK message arrives for a QoS ack channel.
 * Unpacks the channel number, ack type and acked sequence numbers, then
 * completes the corresponding outstanding sends.  For an out-of-order ACK
 * the in-order prefix is completed and the missed messages in the window
 * are re-sent directly on the wire.
 *
 * @param status  delivery status of the incoming ack message (unused here)
 * @param sender  process that sent the ack (unused here)
 * @param buffer  packed ack payload (channel num, ack type, seq nums)
 * @param tag     RML tag the ack arrived on (unused here)
 * @param cbdata  opaque callback data (unused here)
 */
void orte_qos_ack_channel_process_ack (int status, orte_process_name_t* sender,
                                       opal_buffer_t *buffer, orte_rml_tag_t tag,
                                       void *cbdata)
{
    /* process ack received for the msg */
    uint32_t num_msgs_acked, channel_num, i;
    int32_t num_values, room_num;
    orte_rml_send_t *msg, *missed_msg;
    void *occupant = NULL;
    orte_rml_channel_t *channel;
    orte_qos_ack_channel_t *ack_chan;
    uint32_t *seq_num_array;
    uint32_t ack_type;
    uint32_t missed_msg_seq_num = 0;

    num_values = 1;
    /* unpack channel number first */
    opal_dss.unpack(buffer, (void*) &channel_num, &num_values, OPAL_UINT32);
    OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                         "orte_qos_ack_channel_process_ack recieved ack on channel = %d",
                         channel_num));
    channel = orte_rml_base_get_channel (channel_num);
    /* BUG FIX: this test previously used '||', which dereferenced
     * channel->qos_channel_ptr when channel itself was NULL (the second
     * operand is evaluated exactly when the first is false).  Both
     * pointers must be valid to proceed, so use '&&'. */
    if ((NULL != channel) && (NULL != channel->qos_channel_ptr)) {
        ack_chan = (orte_qos_ack_channel_t *) (channel->qos_channel_ptr);
        seq_num_array = malloc (sizeof(uint32_t) * ack_chan->window);
        /* BUG FIX: malloc result was previously used unchecked */
        if (NULL == seq_num_array) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return;
        }
        num_values = 1;
        /* unpack ack type */
        opal_dss.unpack(buffer, (void*) &ack_type, &num_values, OPAL_UINT32);
        num_values = 1;
        /* unpack num messages acked */
        opal_dss.unpack(buffer, (void*) &num_msgs_acked, &num_values, OPAL_UINT32);
        OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                             "orte_qos_ack_channel_process_ack recieved ack type %d for %d msgs on channel = %d",
                             ack_type, num_msgs_acked, channel_num));
        if (ACK_OUT_OF_ORDER != ack_type) {
            /* handle normal ACK: every listed seq num is complete */
            for (i = 0; i < num_msgs_acked; i++) {
                opal_dss.unpack(buffer, (void*) &seq_num_array[i], &num_values, OPAL_UINT32);
                room_num = orte_qos_ack_channel_get_msg_room (ack_chan, seq_num_array[i]);
                opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant);
                orte_qos_ack_channel_set_msg_room(ack_chan, seq_num_array[i], -1);
                if((occupant != NULL) && (room_num != -1)) {
                    msg = (orte_rml_send_t*) occupant;
                    OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                         "Releasing sent message with tag %d and seq_num %d after receiving Ack from dest ",
                                         msg->tag, msg->seq_num ));
                    msg->status = ORTE_SUCCESS;
                    ORTE_RML_SEND_COMPLETE(msg);
                } else {
                    /* seq num already completed (e.g. timed out earlier) */
                    OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                         "OOPS received an ACK for already completed seq_num =%d ",
                                         seq_num_array[i] ));
                }
            }
        } else {
            /* handle out of order ACK - complete msgs received in order,
             * retry the lost msg. */
            for (i = 0; i < num_msgs_acked; i++) {
                opal_dss.unpack(buffer, (void*) &seq_num_array[i], &num_values, OPAL_UINT32);
                room_num = orte_qos_ack_channel_get_msg_room (ack_chan, seq_num_array[i]);
                opal_hotel_checkout_and_return_occupant(&ack_chan->outstanding_msgs, room_num, &occupant);
                orte_qos_ack_channel_set_msg_room(ack_chan, seq_num_array[i], -1);
                /* a seq num is completable if it extends the in-order prefix */
                if ((NULL != occupant) && ((i == 0 )|| (seq_num_array[i] == seq_num_array[i-1] +1 ))) {
                    msg = (orte_rml_send_t*) occupant;
                    msg->status = ORTE_SUCCESS;
                    ORTE_RML_SEND_COMPLETE(msg);
                } else {
                    if (NULL != occupant) {
                        /* gap detected: the receiver acked seq_num_array[i] but
                         * missed everything between it and the previous entry.
                         * num_missed_msgs = (seq_num_array[i] - seq_num_array[i-1] - 1) */
                        assert( i == num_msgs_acked -1);
                        /* recheck the ith msg */
                        opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)occupant, &room_num);
                        orte_qos_ack_channel_set_msg_room (ack_chan, seq_num_array[i], room_num);
                        /* resend and recheck all the missed msgs*/
                        missed_msg_seq_num = seq_num_array[i-1] + 1;
                        for (; missed_msg_seq_num < seq_num_array[i]; missed_msg_seq_num++) {
                            room_num = orte_qos_ack_channel_get_msg_room (ack_chan, missed_msg_seq_num);
                            opal_hotel_checkout_and_return_occupant (&ack_chan->outstanding_msgs, room_num, &occupant);
                            assert ( NULL != occupant);
                            missed_msg = (orte_rml_send_t*) occupant;
                            missed_msg->status = ORTE_ERR_LOST_MSG_IN_WINDOW;
                            opal_hotel_checkin(&ack_chan->outstanding_msgs, (void*)missed_msg, &room_num);
                            orte_qos_ack_channel_set_msg_room (ack_chan, missed_msg_seq_num, room_num);
                            /* send this out on wire directly */
                            ORTE_OOB_SEND (missed_msg);
                        } /* end for */
                    } else {
                        OPAL_OUTPUT_VERBOSE((10, orte_rml_base_framework.framework_output,
                                             "OOPS received an ACK for already completed seq_num =%d ",
                                             seq_num_array[i] ));
                    } /* end if (NULL != occupant) */
                } /* end else */
            } /* end for */
        } /* end out of order ack processing */
        free(seq_num_array);
    } else {
        OPAL_OUTPUT_VERBOSE((5, orte_qos_base_framework.framework_output,
                             "orte_qos_ack_channel_msg_ack_recv_callback recieved ack on non existent channel = %d",
                             channel_num));
    }
}
/*
 * A file descriptor is available/ready for send. Check the state
 * of the socket and take the appropriate action.
 *
 * libevent callback signature: sd/flags are supplied by the event
 * library but are unused here; all state comes from cbdata (the peer).
 * Progresses at most one message per invocation in most paths so the
 * event loop can service pending recvs between sends.
 */
void mca_oob_tcp_send_handler(int sd, short flags, void *cbdata)
{
    mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)cbdata;
    /* the message currently "on deck" for this peer, if any */
    mca_oob_tcp_send_t* msg = peer->send_msg;
    int rc;

    opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                        "%s tcp:send_handler called to send to peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&peer->name));
    switch (peer->state) {
    case MCA_OOB_TCP_CONNECTING:
    case MCA_OOB_TCP_CLOSED:
        opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s tcp:send_handler %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            mca_oob_tcp_state_print(peer->state));
        mca_oob_tcp_peer_complete_connect(peer);
        /* de-activate the send event until the connection
         * handshake completes */
        if (peer->send_ev_active) {
            opal_event_del(&peer->send_event);
            peer->send_ev_active = false;
        }
        break;
    case MCA_OOB_TCP_CONNECTED:
        opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s tcp:send_handler SENDING TO %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            (NULL == peer->send_msg) ? "NULL" : ORTE_NAME_PRINT(&peer->name));
        if (NULL != msg) {
            /* if the header hasn't been completely sent, send it */
            if (!msg->hdr_sent) {
                OPAL_TIMING_EVENT((&tm,"Send header to %s", ORTE_NAME_PRINT(&peer->name)));
                if (ORTE_SUCCESS == (rc = send_bytes(peer))) {
                    /* header is completely sent */
                    msg->hdr_sent = true;
                    /* setup to send the data: pick the payload source by
                     * inspecting which of the (mutually exclusive) fields
                     * is populated */
                    if (NULL != msg->data) {
                        /* relay msg - send that data */
                        msg->sdptr = msg->data;
                        /* hdr.nbytes is in network byte order on the wire */
                        msg->sdbytes = (int)ntohl(msg->hdr.nbytes);
                    } else if (NULL == msg->msg) {
                        /* this was a zero-byte relay - nothing more to do */
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                        goto next;
                    } else if (NULL != msg->msg->buffer) {
                        /* send the buffer data as a single block */
                        msg->sdptr = msg->msg->buffer->base_ptr;
                        msg->sdbytes = msg->msg->buffer->bytes_used;
                    } else if (NULL != msg->msg->iov) {
                        /* start with the first iovec */
                        msg->sdptr = msg->msg->iov[0].iov_base;
                        msg->sdbytes = msg->msg->iov[0].iov_len;
                        msg->iovnum = 0;
                    } else {
                        /* just send the data */
                        msg->sdptr = msg->msg->data;
                        msg->sdbytes = msg->msg->count;
                    }
                    /* fall thru and let the send progress */
                } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                           ORTE_ERR_WOULD_BLOCK == rc) {
                    /* exit this event and let the event lib progress */
                    return;
                } else {
                    // report the error
                    opal_output(0, "%s-%s mca_oob_tcp_peer_send_handler: unable to send header",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&(peer->name)));
                    /* NOTE(review): send_ev_active is not cleared here even
                     * though the event is deleted — confirm this is intended */
                    opal_event_del(&peer->send_event);
                    msg->msg->status = rc;
                    ORTE_RML_SEND_COMPLETE(msg->msg);
                    OBJ_RELEASE(msg);
                    peer->send_msg = NULL;
                    goto next;
                }
            }
            /* progress the data transmission */
            if (msg->hdr_sent) {
                OPAL_TIMING_EVENT((&tm,"Send msg to %s", ORTE_NAME_PRINT(&peer->name)));
                if (ORTE_SUCCESS == (rc = send_bytes(peer))) {
                    /* this block is complete */
                    if (NULL != msg->data || NULL == msg->msg) {
                        /* the relay is complete - release the data */
                        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                            "%s MESSAGE RELAY COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&(peer->name)),
                                            (int)ntohl(msg->hdr.nbytes), peer->sd);
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                    } else if (NULL != msg->msg->buffer) {
                        /* we are done - notify the RML */
                        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                            "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&(peer->name)),
                                            (int)ntohl(msg->hdr.nbytes), peer->sd);
                        msg->msg->status = ORTE_SUCCESS;
                        ORTE_RML_SEND_COMPLETE(msg->msg);
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                    } else if (NULL != msg->msg->data) {
                        /* this was a relay we have now completed - no need to
                         * notify the RML as the local proc didn't initiate
                         * the send */
                        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                            "%s MESSAGE RELAY COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&(peer->name)),
                                            (int)ntohl(msg->hdr.nbytes), peer->sd);
                        msg->msg->status = ORTE_SUCCESS;
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                    } else {
                        /* rotate to the next iovec */
                        msg->iovnum++;
                        if (msg->iovnum < msg->msg->count) {
                            msg->sdptr = msg->msg->iov[msg->iovnum].iov_base;
                            msg->sdbytes = msg->msg->iov[msg->iovnum].iov_len;
                            /* exit this event to give the event lib
                             * a chance to progress any other pending
                             * actions */
                            return;
                        } else {
                            /* this message is complete - notify the RML */
                            opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                                "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                ORTE_NAME_PRINT(&(peer->name)),
                                                (int)ntohl(msg->hdr.nbytes), peer->sd);
                            msg->msg->status = ORTE_SUCCESS;
                            ORTE_RML_SEND_COMPLETE(msg->msg);
                            OBJ_RELEASE(msg);
                            peer->send_msg = NULL;
                        }
                    }
                    /* fall thru to queue the next message */
                } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                           ORTE_ERR_WOULD_BLOCK == rc) {
                    /* exit this event and let the event lib progress */
                    return;
                } else {
                    // report the error
                    opal_output(0, "%s-%s mca_oob_tcp_peer_send_handler: unable to send message ON SOCKET %d",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&(peer->name)), peer->sd);
                    opal_event_del(&peer->send_event);
                    msg->msg->status = rc;
                    ORTE_RML_SEND_COMPLETE(msg->msg);
                    OBJ_RELEASE(msg);
                    peer->send_msg = NULL;
                    /* a failed data send on a connected socket is fatal */
                    ORTE_FORCED_TERMINATE(1);
                    return;
                }
            }
        next:
            /* if current message completed - progress any pending sends by
             * moving the next in the queue into the "on-deck" position. Note
             * that this doesn't mean we send the message right now - we will
             * wait for another send_event to fire before doing so. This gives
             * us a chance to service any pending recvs.
             */
            peer->send_msg = (mca_oob_tcp_send_t*)
                opal_list_remove_first(&peer->send_queue);
        }
        /* if nothing else to do unregister for send event notifications */
        if (NULL == peer->send_msg && peer->send_ev_active) {
            opal_event_del(&peer->send_event);
            peer->send_ev_active = false;
        }
        break;
    default:
        opal_output(0, "%s-%s mca_oob_tcp_peer_send_handler: invalid connection state (%d) on socket %d",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&(peer->name)),
                    peer->state, peer->sd);
        if (peer->send_ev_active) {
            opal_event_del(&peer->send_event);
            peer->send_ev_active = false;
        }
        break;
    }
}
/*
 * Loopback send: deliver a message addressed to this very process by
 * copying it straight into a locally-allocated recv request, bypassing
 * the wire.  Queues the recv completion and completes the send.
 *
 * @param msg  the RML send descriptor (either iov-based or buffer-based)
 * @return     the message size on success, else an ORTE error code
 */
static int mca_oob_ud_send_self (orte_rml_send_t *msg)
{
    unsigned int srco, dsto;
    mca_oob_ud_req_t *req;
    int srci, dsti;
    int rc, size;

    MCA_OOB_UD_IOV_SIZE(msg, size);

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s mca_oob_ud_send_self: sending %d bytes to myself",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), size);

    rc = mca_oob_ud_get_recv_req (*ORTE_PROC_MY_NAME, msg->tag, &req, (msg->iov != NULL) ? true : false);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    req->req_rem_data_len = size;
    req->req_is_eager     = true;

    rc = mca_oob_ud_recv_alloc (req);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
            free (req->req_data.iov.uiov);
        }
        OBJ_RELEASE(req);
        return rc;
    }

    srci = dsti = 0;
    srco = dsto = 0;

    if (msg->iov != NULL) {
        /* iov path: scatter-gather copy from msg->iov[] (source) into
         * req->req_data.iov.uiov[] (destination), advancing whichever
         * side fills/drains first.  Hoisted the data-type assignment out
         * of the loop - it is invariant. */
        req->req_data_type = MCA_OOB_UD_REQ_IOV;
        do {
            size_t copy = min(msg->iov[srci].iov_len - srco,
                              req->req_data.iov.uiov[dsti].iov_len - dsto);
            memmove ((unsigned char *) req->req_data.iov.uiov[dsti].iov_base + dsto,
                     (unsigned char *) msg->iov[srci].iov_base + srco, copy);
            srco += copy;
            if (srco == msg->iov[srci].iov_len) {
                srci++;
                srco = 0;
            }
            dsto += copy;
            if (dsto == req->req_data.iov.uiov[dsti].iov_len) {
                dsti++;
                dsto = 0;
            }
            /* BUG FIX: the bounds were swapped (srci was checked against the
             * destination count and dsti against the source count), allowing
             * an out-of-bounds index when the two iovec counts differ.
             * srci walks msg->iov (msg->count entries); dsti walks
             * req->req_data.iov.uiov (req->req_data.iov.count entries). */
        } while (srci < msg->count && dsti < req->req_data.iov.count);
    } else {
        /* buffer path: clone the payload so the recv side owns its copy */
        req->req_data_type = MCA_OOB_UD_REQ_BUF;
        opal_buffer_t *buffer;

        buffer = OBJ_NEW(opal_buffer_t);

        if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(buffer, msg->buffer))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buffer);
            return rc;
        }

        if (OPAL_SUCCESS != (rc = opal_dss.unload(buffer, (void **)&req->req_data.buf.p,
                                                  &req->req_data.buf.size))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buffer);
            /* NOTE(review): assumes unload leaves buf.p free()-able on
             * failure — confirm against opal_dss.unload semantics */
            free(req->req_data.buf.p);
            return rc;
        }
        OBJ_RELEASE(buffer);
    }

    req->state = MCA_OOB_UD_REQ_COMPLETE;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s mca_oob_ud_send_self: complete. calling callbacks",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* queue up recv callback */
    mca_oob_ud_event_queue_completed (req);

    req->rml_msg->status = ORTE_SUCCESS;
    ORTE_RML_SEND_COMPLETE(req->rml_msg);
    return size;
}
/*
 * Finalize a UD request (send or recv) with the given status.
 *
 * For sends: propagate the status to the RML message and complete it via
 * the plain RML path or the QoS path, depending on whether the message
 * rides a QoS channel.  For recvs: either deliver locally to the RML
 * (when we are the final target) or repackage the data into a new
 * orte_rml_send_t and push it back into the OOB for routing onward.
 * In all cases the request object is returned to its free list.
 *
 * @param req  the completed UD request (ownership stays with caller's
 *             free-list via mca_oob_ud_req_return at the end)
 * @param rc   completion status to report
 */
void mca_oob_ud_req_complete (mca_oob_ud_req_t *req, int rc)
{
    int i;
    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s oob:ud:req_complete %s request %p completed with status %d",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (req->type == MCA_OOB_UD_REQ_SEND) ? "SEND":"RECV", (void *) req, rc);

    /* release the data qp back to its pool, if one was attached */
    if (NULL != req->req_qp) {
        (void) mca_oob_ud_qp_data_release (req->req_qp);
        req->req_qp = NULL;
    }

    /* deregister memory *before* handing it to the callback */
    MCA_OOB_UD_REQ_DEREG_MR(req);

    switch (req->type) {
    case MCA_OOB_UD_REQ_SEND:
        if (req->req_data_type != MCA_OOB_UD_REQ_TR) {
            req->rml_msg->status = rc;
            /* messages on a QoS channel complete through the QoS layer */
            if( NULL == req->rml_msg->channel) {
                ORTE_RML_SEND_COMPLETE(req->rml_msg);
            } else {
                ORTE_QOS_SEND_COMPLETE(req->rml_msg);
            }
        }
        break;
    case MCA_OOB_UD_REQ_RECV:
        if ((req->req_target.jobid == ORTE_PROC_MY_NAME->jobid) &&
            (req->req_target.vpid == ORTE_PROC_MY_NAME->vpid)) {
            /* we are the final destination - hand the payload to the RML */
            opal_output_verbose(1, orte_oob_base_framework.framework_output,
                                "%s DELIVERING TO RML",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
                /* flatten the iovecs into one contiguous buffer.
                 * NOTE(review): the allocation is count * sizeof(struct iovec)
                 * bytes, which presumably bounds the summed iov_len values —
                 * TODO confirm; if any iov_len can exceed 16 bytes on average
                 * this overruns */
                char *data = (char *)calloc(req->req_data.iov.count, sizeof(struct iovec));
                int datalen = 0;
                for (i = 0 ; i < req->req_data.iov.count; ++i) {
                    memcpy (&data[datalen], req->req_data.iov.uiov[i].iov_base,
                            req->req_data.iov.uiov[i].iov_len);
                    datalen += req->req_data.iov.uiov[i].iov_len;
                }
                ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, req->req_channel,
                                      req->req_seq_num, data, datalen, 0);
                free(data);
            } else {
                /* buffer payload can be posted directly */
                ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, req->req_channel,
                                      req->req_seq_num, req->req_data.buf.p,
                                      req->req_data.buf.size, 0);
            }
        } else {
            /* not for us - rewrap as a send and let the OOB route it on */
            opal_output_verbose(1, orte_oob_base_framework.framework_output,
                                "%s UD PROMOTING ROUTED MESSAGE FOR %s TO OOB",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&req->req_target));
            orte_rml_send_t *snd = OBJ_NEW(orte_rml_send_t);
            snd->dst = req->req_target;
            snd->origin = req->req_origin;
            snd->tag = req->req_tag;
            snd->dst_channel = req->req_channel;
            snd->seq_num = req->req_seq_num;
            if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
                /* same flatten-into-one-buffer pattern (and the same
                 * calloc-sizing assumption) as the local-delivery path */
                char *data = (char *)calloc(req->req_data.iov.count, sizeof(struct iovec));
                int datalen = 0;
                for (i = 0 ; i < req->req_data.iov.count; ++i) {
                    memcpy (&data[datalen], req->req_data.iov.uiov[i].iov_base,
                            req->req_data.iov.uiov[i].iov_len);
                    datalen += req->req_data.iov.uiov[i].iov_len;
                }
                snd->data = data;
                snd->count = datalen;
            } else {
                /* copy the buffer so the new send owns its payload */
                char *data = (char *)calloc(req->req_data.buf.size, sizeof(char));
                memcpy (data, req->req_data.buf.p, req->req_data.buf.size);
                snd->data = data;
                snd->count = req->req_data.buf.size;
            }
            snd->cbfunc.iov = NULL;
            snd->cbdata = NULL;
            /* activate the OOB send state */
            ORTE_OOB_SEND(snd);
        }
        break;
    default:
        break;
    }

    mca_oob_ud_req_return (req);
}