/*
 * Packet has been fully processed; update the receive window to
 * indicate that it, and any contiguous sequence numbers that follow
 * it, have been received.
 *
 * endpoint:     endpoint whose receive window is being updated
 * window_index: slot in endpoint->endpoint_rcvd_segs[] that belongs to
 *               the segment that was just processed
 *
 * Effects visible in this function:
 *  - registers the endpoint as needing an ACK if it is not already
 *    flagged (endpoint_ack_needed);
 *  - arms endpoint_acktime if it is currently 0;
 *  - marks window_index as received, then slides endpoint_rfstart and
 *    endpoint_next_contig_seq_to_recv past every contiguous received
 *    slot, clearing those slots as it goes.
 */
static inline void
ompi_btl_usnic_update_window(
    ompi_btl_usnic_endpoint_t *endpoint,
    uint32_t window_index)
{
    uint32_t i;

#if MSGDEBUG1
    /* %p requires a void* argument, hence the cast */
    opal_output(0, "ep: %p, ack_needed = %s\n", (void*) endpoint,
                endpoint->endpoint_ack_needed ? "true" : "false");
#endif

    /* Enable ACK reply if not enabled */
    if (!endpoint->endpoint_ack_needed) {
        ompi_btl_usnic_add_to_endpoints_needing_ack(endpoint);
    }

    /* Give this process a chance to send something (which could carry
       the ACK piggy-backed) before a standalone ACK goes out.  Only
       armed when no deadline is pending. */
    if (0 == endpoint->endpoint_acktime) {
        endpoint->endpoint_acktime = get_nsec() + 50000;    /* 50 usec */
    }

    /* Save this incoming segment in the received segments array on
       the endpoint. */
    /* JMS Another optimization: make rcvd_segs be a bitmask (i.e.,
       more cache friendly) */
    endpoint->endpoint_rcvd_segs[window_index] = true;

    /* See if the leftmost segment in the receiver window is occupied.
       If so, advance the window.  Repeat until we hit an unoccupied
       position in the window.  Every visited slot is cleared before
       moving on, so a slot is never counted twice. */
    i = endpoint->endpoint_rfstart;
    while (endpoint->endpoint_rcvd_segs[i]) {
        endpoint->endpoint_rcvd_segs[i] = false;
        endpoint->endpoint_next_contig_seq_to_recv++;
        i = WINDOW_SIZE_MOD(i + 1);

#if MSGDEBUG
        /* i is a uint32_t: print it as unsigned.  The value printed
           after it is the receive-side left edge, not a send-side
           sequence number. */
        opal_output(0, "Advance window to %u; next contig seq to recv %" UDSEQ,
                    (unsigned int) i,
                    endpoint->endpoint_next_contig_seq_to_recv);
#endif
    }
    endpoint->endpoint_rfstart = i;
}
/*
 * Force a retransmission of the segment that follows ack_seq.
 *
 * endpoint: endpoint whose sent-segment window is consulted
 * ack_seq:  sequence number carried by the ACK; the segment looked up
 *           is the one stored at slot WINDOW_SIZE_MOD(ack_seq + 1)
 *
 * Does nothing when that slot is empty or when the segment is not
 * holding a hotel room (ss_hotel_room == -1).  Otherwise the segment
 * is checked out of the endpoint's hotel, appended to the module's
 * pending_resend_segs list, and stats.num_fast_retrans is bumped.
 */
static void
opal_btl_usnic_force_retrans(
    opal_btl_usnic_endpoint_t *endpoint,
    opal_btl_usnic_seq_t ack_seq)
{
    int slot = WINDOW_SIZE_MOD(ack_seq + 1);
    opal_btl_usnic_send_segment_t *lost = endpoint->endpoint_sent_segs[slot];

    /* No segment recorded for that sequence number */
    if (NULL == lost) {
        return;
    }

    /* Not in the hotel, so there is no retransmit timer to cancel */
    if (-1 == lost->ss_hotel_room) {
        return;
    }

    /* Cancel the retransmit timer by giving up the hotel room */
    opal_hotel_checkout(&endpoint->endpoint_hotel, lost->ss_hotel_room);
    lost->ss_hotel_room = -1;

    /* Hand the segment to the module's resend queue */
    opal_list_append(&(endpoint->endpoint_module->pending_resend_segs),
                     &(lost->ss_base.us_list.super));

    ++endpoint->endpoint_module->stats.num_fast_retrans;
}
/*
 * We have received an ACK for a given sequence number (either standalone
 * or via piggy-back on a regular send).
 *
 * endpoint: endpoint whose sent-segment window the ACK applies to
 * ack_seq:  sequence number carried by the ACK
 *
 * Three cases are handled:
 *  - SEQ_LT(ack_seq, endpoint_ack_seq_rcvd): an old ACK.  It is counted
 *    in stats.num_old_dup_acks and otherwise ignored.
 *  - ack_seq == endpoint_ack_seq_rcvd: a duplicate ACK.  It is counted
 *    in stats.num_dup_acks and passed to opal_btl_usnic_force_retrans().
 *  - otherwise: every sequence number starting at
 *    endpoint_ack_seq_rcvd + 1 for which SEQ_LE(is, ack_seq) holds is
 *    retired from endpoint_sent_segs[]; then endpoint_ack_seq_rcvd is
 *    set to ack_seq and opal_btl_usnic_check_rts() is called.
 *
 * The order of the per-segment steps inside the loop matters; see the
 * comments there before rearranging anything.
 */
void
opal_btl_usnic_handle_ack(
    opal_btl_usnic_endpoint_t *endpoint,
    opal_btl_usnic_seq_t ack_seq)
{
    opal_btl_usnic_seq_t is;
    opal_btl_usnic_send_segment_t *sseg;
    opal_btl_usnic_send_frag_t *frag;
    opal_btl_usnic_module_t *module;
    uint32_t bytes_acked;

    module = endpoint->endpoint_module;

    /* ignore if this is an old ACK */
    if (SEQ_LT(ack_seq, endpoint->endpoint_ack_seq_rcvd)) {
#if MSGDEBUG1
        opal_output(0, "Got OLD DUP ACK seq %"UDSEQ" < %"UDSEQ"\n",
                ack_seq, endpoint->endpoint_ack_seq_rcvd);
#endif
        ++module->stats.num_old_dup_acks;
        return;

    /* A duplicate ACK means the next segment was lost; ask for it to
       be resent right away */
    } else if (ack_seq == endpoint->endpoint_ack_seq_rcvd) {
        ++module->stats.num_dup_acks;

        opal_btl_usnic_force_retrans(endpoint, ack_seq);
        return;
    }

    /* This ACK carries a sequence number we have not seen before:
       retire every segment it newly covers. */
    for (is = endpoint->endpoint_ack_seq_rcvd + 1; SEQ_LE(is, ack_seq); ++is) {
        sseg = endpoint->endpoint_sent_segs[WINDOW_SIZE_MOD(is)];

#if MSGDEBUG1
        /* seg_room prints as -2 when the slot is empty */
        opal_output(0, " Checking ACK/sent_segs window %p, index %lu, seq %lu, occupied=%p, seg_room=%d",
            (void*) endpoint->endpoint_sent_segs,
            WINDOW_SIZE_MOD(is), is, (void*)sseg, (sseg?sseg->ss_hotel_room:-2));
#endif

        /* These checks exist only when asserts are compiled in; sseg
           is dereferenced unconditionally below. */
        assert(sseg != NULL);
        assert(sseg->ss_base.us_btl_header->pkt_seq == is);
#if MSGDEBUG1
        if (sseg->ss_hotel_room == -1) {
            opal_output(0, "=== ACKed frag in sent_frags array is not in hotel/enqueued, module %p, endpoint %p, seg %p, seq %" UDSEQ ", slot %lu",
                        (void*) module, (void*) endpoint,
                        (void*) sseg, is, WINDOW_SIZE_MOD(is));
        }
#endif

        /* Check the sending segment out from the hotel.  NOTE: The
           segment might not actually be in a hotel room if it has
           already been evicted and queued for resend.  If it's not
           in the hotel, don't check it out! */
        if (OPAL_LIKELY(sseg->ss_hotel_room != -1)) {

            opal_hotel_checkout(&endpoint->endpoint_hotel, sseg->ss_hotel_room);
            sseg->ss_hotel_room = -1;

        } else {
            /* hotel_room == -1 means the segment is queued for
               resend; it has now been ACKed, so take it off that
               queue */
            opal_list_remove_item((&module->pending_resend_segs),
                    &sseg->ss_base.us_list.super);
        }

        /* update the owning fragment */
        bytes_acked = sseg->ss_base.us_btl_header->payload_len;
        frag = sseg->ss_parent_frag;

#if MSGDEBUG1
        opal_output(0, " ACKED seg %p frag %p ack_bytes=%"PRIu32" left=%zd dst_seg[0].seg_addr=%p des_flags=0x%x\n",
                (void*)sseg, (void*)frag, bytes_acked,
                frag->sf_ack_bytes_left - bytes_acked,
                frag->sf_base.uf_local_seg[0].seg_addr.pval,
                frag->sf_base.uf_base.des_flags);
#endif

        /* If all ACKs received, and this is a put or a regular send
         * that needs a callback, perform the callback now.
         *
         * NOTE on sf_ack_bytes_left - here we check for
         *      sf_ack_bytes_left == bytes_acked
         * as opposed to adjusting sf_ack_bytes_left and checking for 0,
         * because if we adjusted first, the callback function may call
         * usnic_free() and free the fragment out from under us, which
         * we do not want.  sf_ack_bytes_left is adjusted further down,
         * after the callback and the segment release, and
         * opal_btl_usnic_send_frag_return_cond() is then given the
         * chance to return the fragment.
         */
        if (frag->sf_ack_bytes_left == bytes_acked &&
            ((frag->sf_base.uf_remote_seg[0].seg_addr.pval != NULL) ||
             (frag->sf_base.uf_base.des_flags &
              MCA_BTL_DES_SEND_ALWAYS_CALLBACK))) {
            OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, frag, "send completion");
        }

        /* The segment is no longer waiting for an ACK.  It is released
           here only when ss_send_posted is 0; otherwise it is left
           alone by this function. */
        sseg->ss_ack_pending = false;
        if (sseg->ss_send_posted == 0) {
            opal_btl_usnic_release_send_segment(module, frag, sseg);
        }

        /* when no bytes left to ACK, fragment send is truly done */
        /* (done here rather than before the callback; see the NOTE on
           sf_ack_bytes_left in the comment preceding the callback) */
        frag->sf_ack_bytes_left -= bytes_acked;

        /* OK to return this fragment? */
        opal_btl_usnic_send_frag_return_cond(module, frag);

        /* indicate this segment has been ACKed */
        endpoint->endpoint_sent_segs[WINDOW_SIZE_MOD(is)] = NULL;
    }

    /* update ACK received */
    endpoint->endpoint_ack_seq_rcvd = ack_seq;

    /* send window may have opened, possibly make endpoint ready-to-send */
    opal_btl_usnic_check_rts(endpoint);
}
/*
 * Create an endpoint for (module, proc) and claim the modex slot that
 * match_modex() selected for the pair.
 *
 * module:     local module the endpoint belongs to
 * proc:       peer process; its modex array supplies the remote address
 * endpoint_o: out; receives the new endpoint on success and is not
 *             written on failure
 *
 * Returns OPAL_SUCCESS on success, the error code from match_modex()
 * if no modex entry matched, or OPAL_ERR_OUT_OF_RESOURCE if the
 * endpoint object could not be allocated.
 */
int
opal_btl_usnic_create_endpoint(opal_btl_usnic_module_t *module,
                               opal_btl_usnic_proc_t *proc,
                               opal_btl_usnic_endpoint_t **endpoint_o)
{
    opal_btl_usnic_endpoint_t *ep;
    int modex_index;
    int rc;

    /* Find the peer's modex entry that pairs with this module */
    rc = match_modex(module, proc, &modex_index);
    if (rc != OPAL_SUCCESS) {
        opal_output_verbose(5, USNIC_OUT,
                            "btl:usnic:create_endpoint: did not match usnic modex info for peer %s",
                            OPAL_NAME_PRINT(proc->proc_opal->proc_name));
        return rc;
    }

    ep = OBJ_NEW(opal_btl_usnic_endpoint_t);
    if (ep == NULL) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Basic identity: owning module and the peer's address info */
    ep->endpoint_module = module;
    assert(modex_index >= 0 && modex_index < (int)proc->proc_modex_count);
    ep->endpoint_remote_addr = proc->proc_modex[modex_index];

    /* The ibv_ah is deliberately not created here.  ibv_create_ah()
       may trigger ARP resolution, so it is better to batch the calls
       for all endpoints together and get some parallelism. */
    ep->endpoint_remote_ah = NULL;

    /* Send side: start from our own initial sequence number, with the
       last-ACKed sequence number sitting one below it */
    ep->endpoint_next_seq_to_send = module->local_addr.isn;
    ep->endpoint_ack_seq_rcvd = ep->endpoint_next_seq_to_send - 1;

    /* Receive side: start from the peer's initial sequence number and
       line the window start slot up with it */
    ep->endpoint_next_contig_seq_to_recv = ep->endpoint_remote_addr.isn;
    ep->endpoint_highest_seq_rcvd = ep->endpoint_next_contig_seq_to_recv - 1;
    ep->endpoint_rfstart =
        WINDOW_SIZE_MOD(ep->endpoint_next_contig_seq_to_recv);

    /* Mark the modex slot as taken */
    proc->proc_modex_claimed[modex_index] = true;
    MSGDEBUG1_OUT("create_endpoint: module=%p claimed endpoint=%p on proc=%p (hash=0x%" PRIx64 ")\n",
                  (void *)module, (void *)ep, (void *)proc,
                  proc->proc_opal->proc_name);

    /* Link endpoint and proc to each other: the endpoint goes into the
       next free entry of the proc's endpoint array and remembers both
       its index there and the proc itself.  The proc is retained
       because the endpoint now holds a pointer to it. */
    ep->endpoint_proc = proc;
    ep->endpoint_proc_index = proc->proc_endpoint_count;
    proc->proc_endpoints[proc->proc_endpoint_count] = ep;
    ++proc->proc_endpoint_count;
    OBJ_RETAIN(proc);

    /* Also put the endpoint on the module's list of all endpoints,
       under the list's lock.  This happens here rather than in the
       endpoint constructor because the module cannot be passed as a
       constructor argument. */
    opal_mutex_lock(&module->all_endpoints_lock);
    opal_list_append(&(module->all_endpoints),
                     &(ep->endpoint_endpoint_li));
    ep->endpoint_on_all_endpoints = true;
    opal_mutex_unlock(&module->all_endpoints_lock);

    *endpoint_o = ep;
    return OPAL_SUCCESS;
}
/*
 * Decide whether an incoming segment should be accepted into the
 * endpoint's receive window.
 *
 * A piggy-backed ACK in the segment header (ack_seq != 0) is processed
 * first, whether or not the segment itself ends up being accepted.
 *
 * endpoint:     endpoint the segment arrived on
 * seg:          the received segment
 * window_index: out; on acceptance receives the segment's slot in
 *               endpoint->endpoint_rcvd_segs[].  Not written when the
 *               segment is rejected.
 *
 * Returns true when the segment lies inside the receive window and has
 * not been received before.  Returns false when it lies outside the
 * window or is a duplicate; in both of those cases the endpoint is
 * registered as needing an ACK if it is not already flagged, and the
 * matching counter on the module (num_oow_low_recvs,
 * num_oow_high_recvs or num_dup_recvs) is incremented.
 */
static inline int
ompi_btl_usnic_check_rx_seq(
    ompi_btl_usnic_endpoint_t *endpoint,
    ompi_btl_usnic_recv_segment_t *seg,
    uint32_t *window_index)
{
    uint32_t i;
    ompi_btl_usnic_seq_t seq;

    /*
     * Handle piggy-backed ACK if present
     */
    if (seg->rs_base.us_btl_header->ack_seq != 0) {
#if MSGDEBUG1
        /* Sequence numbers are printed with UDSEQ everywhere else in
           this function; "%d" did not match the argument's type. */
        opal_output(0, "Handle piggy-packed ACK seq %" UDSEQ "\n",
                    (ompi_btl_usnic_seq_t) seg->rs_base.us_btl_header->ack_seq);
#endif
        ompi_btl_usnic_handle_ack(endpoint,
                                  seg->rs_base.us_btl_header->ack_seq);
    }

    /* Do we have room in the endpoint's receiver window?

       Receiver window:

                   |-------- WINDOW_SIZE ----------|
                  +---------------------------------+
                  |         highest_seq_rcvd        |
                  |     somewhere in this range     |
                  +^--------------------------------+
                   |
                   +-- next_contig_seq_to_recv: the window left edge

       The good condition is

         next_contig_seq_to_recv <= seq <
             next_contig_seq_to_recv + WINDOW_SIZE

       And the bad condition is

         seq < next_contig_seq_to_recv
           or
         seq >= next_contig_seq_to_recv + WINDOW_SIZE

       NOTE(review): these are plain integer comparisons, so they
       assume the sequence space does not wrap around during the
       lifetime of the endpoint -- confirm against the width of
       ompi_btl_usnic_seq_t.
    */
    seq = seg->rs_base.us_btl_header->seq;
    if (seq < endpoint->endpoint_next_contig_seq_to_recv ||
        seq >= endpoint->endpoint_next_contig_seq_to_recv + WINDOW_SIZE) {
#if MSGDEBUG
        opal_output(0, "<-- Received FRAG/CHUNK ep %p, seq %" UDSEQ " outside of window (%" UDSEQ " - %" UDSEQ "), %p, module %p -- DROPPED\n",
                    (void*)endpoint, seg->rs_base.us_btl_header->seq,
                    endpoint->endpoint_next_contig_seq_to_recv,
                    (endpoint->endpoint_next_contig_seq_to_recv +
                     WINDOW_SIZE - 1),
                    (void*) seg,
                    (void*) endpoint->endpoint_module);
#endif

        /* Stats */
        if (seq < endpoint->endpoint_next_contig_seq_to_recv) {
            ++endpoint->endpoint_module->num_oow_low_recvs;
        } else {
            ++endpoint->endpoint_module->num_oow_high_recvs;
        }
        goto dup_needs_ack;
    }

    /* Ok, this segment is within the receiver window.  Have we
       already received it?  It's possible that the sender has
       re-sent a segment that we've already received (but not yet
       ACKed).

       Received-but-not-yet-contiguous segments are tracked in an
       array on the endpoint that is the same length as the receiver's
       window (i.e., WINDOW_SIZE).  The incoming sequence number
       selects a position in that array.  Because the left edge of the
       window keeps moving, the array is addressed relative to
       rfstart, the slot that currently corresponds to
       next_contig_seq_to_recv:

         rel_posn_in_recv_win = seq - next_contig_seq_to_recv
         array_posn = (rel_posn_in_recv_win + rfstart) % WINDOW_SIZE
    */
    i = seq - endpoint->endpoint_next_contig_seq_to_recv;
    i = WINDOW_SIZE_MOD(i + endpoint->endpoint_rfstart);
    if (endpoint->endpoint_rcvd_segs[i]) {
#if MSGDEBUG
        /* Only names that exist in this function are used here; the
           previous message referenced bseg, src_mac and dest_mac,
           none of which are declared in this scope. */
        opal_output(0, "<-- Received FRAG/CHUNK ep %p, seq %" UDSEQ ", seg %p: duplicate -- DROPPED\n",
                    (void*) endpoint, seg->rs_base.us_btl_header->seq,
                    (void*) seg);
#endif
        /* highest_seq_rcvd is for debug stats only; it's not used in
           any window calculations */
        assert(seq <= endpoint->endpoint_highest_seq_rcvd);
        /* next_contig_seq_to_recv-1 is the ack number we'll send, so a
           duplicate must be at or beyond the left edge.  Written as
           ">=" rather than "> left_edge - 1" so the check does not
           depend on the subtraction not wrapping. */
        assert(seq >= endpoint->endpoint_next_contig_seq_to_recv);

        /* Stats */
        ++endpoint->endpoint_module->num_dup_recvs;
        goto dup_needs_ack;
    }

    /* Stats: is this the highest sequence number we've received? */
    if (seq > endpoint->endpoint_highest_seq_rcvd) {
        endpoint->endpoint_highest_seq_rcvd = seq;
    }

    *window_index = i;
    return true;

dup_needs_ack:
    /* The segment was rejected, but the endpoint is still flagged so
       that an ACK goes back to the sender. */
    if (!endpoint->endpoint_ack_needed) {
        ompi_btl_usnic_add_to_endpoints_needing_ack(endpoint);
    }
    return false;
}