Пример #1
0
/*
 * Packet has been fully processed: record it in the endpoint's receive
 * window and slide the window past every contiguous sequence number
 * that has now been received.
 *
 * endpoint     - endpoint the packet arrived on
 * window_index - slot of the packet in endpoint->endpoint_rcvd_segs;
 *                it is not range-checked here, so the caller must pass
 *                a valid index
 *
 * Side effects visible in this function: the endpoint is registered as
 * needing an ACK (if it was not already), endpoint_acktime is armed if
 * it was 0, and endpoint_rfstart / endpoint_next_contig_seq_to_recv
 * advance together, one step per occupied slot.
 */
static inline void
ompi_btl_usnic_update_window(
    ompi_btl_usnic_endpoint_t *endpoint,
    uint32_t window_index)
{
    uint32_t i;

    /* Enable ACK reply if not enabled */
#if MSGDEBUG1
    /* %p requires a void *, so the endpoint pointer is cast explicitly */
    opal_output(0, "ep: %p, ack_needed = %s\n", (void*) endpoint, endpoint->endpoint_ack_needed?"true":"false");
#endif
    if (!endpoint->endpoint_ack_needed) {
        ompi_btl_usnic_add_to_endpoints_needing_ack(endpoint);
    }

    /* give this process a chance to send something before ACKing;
       the deadline is only armed when none is pending (acktime == 0) */
    if (0 == endpoint->endpoint_acktime) {
        endpoint->endpoint_acktime = get_nsec() + 50000;    /* 50 usec */
    }

    /* Save this incoming segment in the received segments array on the
       endpoint. */
    /* JMS Another optimization: make rcvd_segs be a bitmask (i.e.,
       more cache friendly) */
    endpoint->endpoint_rcvd_segs[window_index] = true;

    /* See if the leftmost segment in the receiver window is
       occupied.  If so, advance the window.  Repeat until we hit
       an unoccupied position in the window.  Each visited slot is
       cleared as it is passed, so the loop stops after at most one
       full trip around the array. */
    i = endpoint->endpoint_rfstart;
    while (endpoint->endpoint_rcvd_segs[i]) {
        endpoint->endpoint_rcvd_segs[i] = false;
        endpoint->endpoint_next_contig_seq_to_recv++;
        i = WINDOW_SIZE_MOD(i + 1);

#if MSGDEBUG
        /* i is unsigned, and the value printed is the next sequence
           number expected from the peer (receive side) */
        opal_output(0, "Advance window to %u; next seq to recv %" UDSEQ,
                    (unsigned int) i,
                    endpoint->endpoint_next_contig_seq_to_recv);
#endif
    }
    endpoint->endpoint_rfstart = i;
}
Пример #2
0
/*
 * Fast-retransmit helper: take the segment that follows ack_seq out of
 * the retransmission hotel and put it on the module's resend queue.
 *
 * Nothing happens when that send-window slot is empty or when the
 * segment currently has no hotel room (ss_hotel_room == -1).
 */
static void
opal_btl_usnic_force_retrans(
    opal_btl_usnic_endpoint_t *endpoint,
    opal_btl_usnic_seq_t ack_seq)
{
    int slot = WINDOW_SIZE_MOD(ack_seq+1);
    opal_btl_usnic_send_segment_t *lost = endpoint->endpoint_sent_segs[slot];

    /* No segment is recorded for the sequence number after ack_seq */
    if (NULL == lost) {
        return;
    }

    /* Segment is not checked into the hotel, so there is no timer to
       cancel and nothing to enqueue here */
    if (-1 == lost->ss_hotel_room) {
        return;
    }

    /* Stop the retransmission timer and mark the segment as roomless */
    opal_hotel_checkout(&endpoint->endpoint_hotel, lost->ss_hotel_room);
    lost->ss_hotel_room = -1;

    /* Hand the segment to the module's pending-resend list */
    opal_list_append(&(endpoint->endpoint_module->pending_resend_segs),
                     &(lost->ss_base.us_list.super));

    /* Stats */
    endpoint->endpoint_module->stats.num_fast_retrans += 1;
}
Пример #3
0
/*
 * We have received an ACK for a given sequence number (either standalone
 * or via piggy-back on a regular send).
 *
 * endpoint - endpoint whose sent segments are being acknowledged
 * ack_seq  - sequence number carried by the ACK; every sequence number
 *            from endpoint_ack_seq_rcvd + 1 through ack_seq (inclusive)
 *            is processed as acknowledged
 *
 * Three cases are handled:
 *   - ack_seq is older than the last ACK recorded: counted in
 *     num_old_dup_acks and otherwise ignored.
 *   - ack_seq equals the last ACK recorded: counted in num_dup_acks and
 *     opal_btl_usnic_force_retrans() is called for it.
 *   - ack_seq is newer: each newly ACKed segment is retired in sequence
 *     order, endpoint_ack_seq_rcvd is advanced to ack_seq, and the
 *     endpoint is re-checked for ready-to-send.
 */
void
opal_btl_usnic_handle_ack(
    opal_btl_usnic_endpoint_t *endpoint,
    opal_btl_usnic_seq_t ack_seq)
{
    opal_btl_usnic_seq_t is;
    opal_btl_usnic_send_segment_t *sseg;
    opal_btl_usnic_send_frag_t *frag;
    opal_btl_usnic_module_t *module;
    uint32_t bytes_acked;

    module = endpoint->endpoint_module;

    /* ignore if this is an old ACK */
    if (SEQ_LT(ack_seq, endpoint->endpoint_ack_seq_rcvd)) {
#if MSGDEBUG1
        opal_output(0, "Got OLD DUP ACK seq %"UDSEQ" < %"UDSEQ"\n",
                ack_seq, endpoint->endpoint_ack_seq_rcvd);
#endif
        ++module->stats.num_old_dup_acks;
        return;

    /* A duplicate ACK means next seg was lost */
    } else if (ack_seq == endpoint->endpoint_ack_seq_rcvd) {
        ++module->stats.num_dup_acks;

        opal_btl_usnic_force_retrans(endpoint, ack_seq);
        return;
    }

    /* Does this ACK have a new sequence number that we haven't
       seen before?  Walk every sequence number between the previous
       ACK (exclusive) and this one (inclusive). */
    for (is = endpoint->endpoint_ack_seq_rcvd + 1; SEQ_LE(is, ack_seq); ++is) {
        sseg = endpoint->endpoint_sent_segs[WINDOW_SIZE_MOD(is)];

#if MSGDEBUG1
        opal_output(0, "  Checking ACK/sent_segs window %p, index %lu, seq %lu, occupied=%p, seg_room=%d",
            (void*) endpoint->endpoint_sent_segs,
            WINDOW_SIZE_MOD(is), is, (void*)sseg, (sseg?sseg->ss_hotel_room:-2));
#endif

        /* Every sequence number in the ACKed range is expected to still
           have its segment in the sent-segments array.  These are
           assertions only: when they are compiled out, sseg is
           dereferenced below without a NULL check. */
        assert(sseg != NULL);
        assert(sseg->ss_base.us_btl_header->pkt_seq == is);
#if MSGDEBUG1
        if (sseg->ss_hotel_room == -1) {
            opal_output(0, "=== ACKed frag in sent_frags array is not in hotel/enqueued, module %p, endpoint %p, seg %p, seq %" UDSEQ ", slot %lu",
                        (void*) module, (void*) endpoint,
                        (void*) sseg, is, WINDOW_SIZE_MOD(is));
        }
#endif

        /* Check the sending segment out from the hotel.  NOTE: The
           segment might not actually be in a hotel room if it has
           already been evicted and queued for resend.
           If it's not in the hotel, don't check it out! */
        if (OPAL_LIKELY(sseg->ss_hotel_room != -1)) {

            opal_hotel_checkout(&endpoint->endpoint_hotel, sseg->ss_hotel_room);
            sseg->ss_hotel_room = -1;

        /* hotel_room == -1 means queued for resend, remove it */
        } else {
            opal_list_remove_item((&module->pending_resend_segs),
                    &sseg->ss_base.us_list.super);
        }

        /* update the owning fragment */
        bytes_acked = sseg->ss_base.us_btl_header->payload_len;
        frag = sseg->ss_parent_frag;

#if MSGDEBUG1
        opal_output(0, "   ACKED seg %p frag %p ack_bytes=%"PRIu32" left=%zd dst_seg[0].seg_addr=%p des_flags=0x%x\n",
                (void*)sseg, (void*)frag, bytes_acked,
                frag->sf_ack_bytes_left - bytes_acked,
                frag->sf_base.uf_local_seg[0].seg_addr.pval,
                frag->sf_base.uf_base.des_flags);
#endif

        /* If all ACKs received, and this is a put or a regular send
         * that needs a callback, perform the callback now
         *
         * NOTE on sf_ack_bytes_left - here we check for
         *      sf_ack_bytes_left == bytes_acked
         * as opposed to adjusting sf_ack_bytes_left and checking for 0 because
         * if we don't, the callback function may call usnic_free() and free
         * the fragment out from under us which we do not want.  If the
         * fragment really needs to be freed, we'll take care of it in a few
         * lines below.
         *
         * In the condition below, a non-NULL remote segment address is
         * what identifies the "put" case.
         */
        if (frag->sf_ack_bytes_left == bytes_acked &&
            ((frag->sf_base.uf_remote_seg[0].seg_addr.pval != NULL) ||
             (frag->sf_base.uf_base.des_flags &
              MCA_BTL_DES_SEND_ALWAYS_CALLBACK))) {
            OPAL_BTL_USNIC_DO_SEND_FRAG_CB(module, frag, "send completion");
        }

        /* free this segment */
        /* NOTE(review): presumably ss_send_posted counts sends of this
           segment that have not completed yet, so the segment is only
           released once none remain -- confirm against the send path */
        sseg->ss_ack_pending = false;
        if (sseg->ss_send_posted == 0) {
            opal_btl_usnic_release_send_segment(module, frag, sseg);
        }

        /* when no bytes left to ACK, fragment send is truly done */
        /* see note above on why this is done here as opposed to earlier */
        frag->sf_ack_bytes_left -= bytes_acked;

        /* OK to return this fragment? */
        opal_btl_usnic_send_frag_return_cond(module, frag);

        /* indicate this segment has been ACKed */
        endpoint->endpoint_sent_segs[WINDOW_SIZE_MOD(is)] = NULL;
    }

    /* update ACK received */
    endpoint->endpoint_ack_seq_rcvd = ack_seq;

    /* send window may have opened, possibly make endpoint ready-to-send */
    opal_btl_usnic_check_rts(endpoint);
}
Пример #4
0
/*
 * Create an endpoint for "proc" on "module" and claim the modex slot
 * that match_modex() selects for it.
 *
 * module     - module the new endpoint belongs to; the endpoint is also
 *              appended to module->all_endpoints
 * proc       - peer process; it is retained and gains a new entry in
 *              proc_endpoints
 * endpoint_o - receives the new endpoint; written only on success
 *
 * Returns OPAL_SUCCESS, the error reported by match_modex(), or
 * OPAL_ERR_OUT_OF_RESOURCE when the endpoint object cannot be allocated.
 */
int
opal_btl_usnic_create_endpoint(opal_btl_usnic_module_t *module,
                opal_btl_usnic_proc_t *proc,
                opal_btl_usnic_endpoint_t **endpoint_o)
{
    opal_btl_usnic_endpoint_t *ep;
    int modex_index;
    int rc;

    /* Find the peer's modex entry that pairs with this module */
    rc = match_modex(module, proc, &modex_index);
    if (rc != OPAL_SUCCESS) {
        opal_output_verbose(5, USNIC_OUT,
                            "btl:usnic:create_endpoint: did not match usnic modex info for peer %s",
                            OPAL_NAME_PRINT(proc->proc_opal->proc_name));
        return rc;
    }

    ep = OBJ_NEW(opal_btl_usnic_endpoint_t);
    if (ep == NULL) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Bind the endpoint to its module and to the matched remote address */
    ep->endpoint_module = module;
    assert(modex_index >= 0 && modex_index < (int)proc->proc_modex_count);
    ep->endpoint_remote_addr = proc->proc_modex[modex_index];

    /* Sequence numbers: the send side starts from the local ISN, the
       receive side from the remote ISN; the "last seen" values start
       one below their respective starting points */
    ep->endpoint_next_seq_to_send = module->local_addr.isn;
    ep->endpoint_ack_seq_rcvd = ep->endpoint_next_seq_to_send - 1;
    ep->endpoint_next_contig_seq_to_recv = ep->endpoint_remote_addr.isn;
    ep->endpoint_highest_seq_rcvd = ep->endpoint_next_contig_seq_to_recv - 1;
    ep->endpoint_rfstart =
        WINDOW_SIZE_MOD(ep->endpoint_next_contig_seq_to_recv);

    /* The ibv_ah is created later on purpose: ibv_create_ah() may
       trigger ARP resolution, so batching the calls for all endpoints
       gives some parallelism. */
    ep->endpoint_remote_ah = NULL;

    /* Mark the modex slot as taken */
    proc->proc_modex_claimed[modex_index] = true;
    MSGDEBUG1_OUT("create_endpoint: module=%p claimed endpoint=%p on proc=%p (hash=0x%" PRIx64 ")\n",
                  (void *)module, (void *)ep, (void *)proc,
                  proc->proc_opal->proc_name);

    /* Record the endpoint in the next free entry of the proc's endpoint
       array and take a reference on the proc */
    ep->endpoint_proc = proc;
    ep->endpoint_proc_index = proc->proc_endpoint_count;
    proc->proc_endpoints[proc->proc_endpoint_count++] = ep;
    OBJ_RETAIN(proc);

    /* Register the endpoint on the module's list as well.  This is done
       here rather than in the endpoint constructor because the module
       cannot be passed as a constructor argument. */
    opal_mutex_lock(&module->all_endpoints_lock);
    opal_list_append(&(module->all_endpoints),
            &(ep->endpoint_endpoint_li));
    ep->endpoint_on_all_endpoints = true;
    opal_mutex_unlock(&module->all_endpoints_lock);

    *endpoint_o = ep;
    return OPAL_SUCCESS;
}
Пример #5
0
/*
 * Decide whether an incoming segment should be accepted into the
 * endpoint's receive window.
 *
 * endpoint     - endpoint the segment arrived on
 * seg          - received segment; its BTL header supplies seq and ack_seq
 * window_index - on acceptance, receives the segment's slot in
 *                endpoint->endpoint_rcvd_segs; left untouched otherwise
 *
 * Returns true when the segment lies inside the window and has not been
 * received before.  Returns false when it is outside the window or is a
 * duplicate; in that case the endpoint is registered as needing an ACK
 * (if it was not already).
 *
 * A non-zero ack_seq in the header is passed to
 * ompi_btl_usnic_handle_ack() before the window check.
 */
static inline int
ompi_btl_usnic_check_rx_seq(
    ompi_btl_usnic_endpoint_t *endpoint,
    ompi_btl_usnic_recv_segment_t *seg,
    uint32_t *window_index)
{
    uint32_t i;
    ompi_btl_usnic_seq_t seq;

    /*
     * Handle piggy-backed ACK if present (ack_seq == 0 means "none")
     */
    if (seg->rs_base.us_btl_header->ack_seq != 0) {
#if MSGDEBUG1
        /* sequence numbers are printed with UDSEQ, not %d */
        opal_output(0, "Handle piggy-backed ACK seq %" UDSEQ "\n",
                    (ompi_btl_usnic_seq_t) seg->rs_base.us_btl_header->ack_seq);
#endif
        ompi_btl_usnic_handle_ack(endpoint,
                seg->rs_base.us_btl_header->ack_seq);
    }

    /* Do we have room in the endpoint's receiver window?

       Receiver window:

                   |-------- WINDOW_SIZE ----------|
                  +---------------------------------+
                  |         highest_seq_rcvd        |
                  |     somewhere in this range     |
                  +^--------------------------------+
                   |
                   +-- next_contig_seq_to_recv: the window left edge

       The good condition is

         next_contig_seq_to_recv <= seq < next_contig_seq_to_recv + WINDOW_SIZE

       And the bad condition is

         seq < next_contig_seq_to_recv
           or
         seq >= next_contig_seq_to_recv + WINDOW_SIZE
    */
    seq = seg->rs_base.us_btl_header->seq;
    if (seq < endpoint->endpoint_next_contig_seq_to_recv ||
        seq >= endpoint->endpoint_next_contig_seq_to_recv + WINDOW_SIZE) {
#if MSGDEBUG
        opal_output(0, "<-- Received FRAG/CHUNK ep %p, seq %" UDSEQ " outside of window (%" UDSEQ " - %" UDSEQ "), %p, module %p -- DROPPED\n",
                    (void*)endpoint, seg->rs_base.us_btl_header->seq,
                    endpoint->endpoint_next_contig_seq_to_recv,
                    (endpoint->endpoint_next_contig_seq_to_recv +
                     WINDOW_SIZE - 1),
                    (void*) seg,
                    (void*) endpoint->endpoint_module);
#endif

        /* Stats: distinguish "below the window" from "above the window" */
        if (seq < endpoint->endpoint_next_contig_seq_to_recv) {
            ++endpoint->endpoint_module->num_oow_low_recvs;
        } else {
            ++endpoint->endpoint_module->num_oow_high_recvs;
        }
        goto dup_needs_ack;
    }

    /* Ok, this segment is within the receiver window.  Have we
       already received it?  It's possible that the sender has
       re-sent a segment that we've already received (but not yet
       ACKed).

       We have saved all un-ACKed segments in an array on the
       endpoint that is the same length as the receiver's window
       (i.e., WINDOW_SIZE).  We can use the incoming segment sequence
       number to find its position in the array.  It's a little
       tricky because the left edge of the receiver window keeps
       moving, so we use a starting reference point in the array
       (rfstart) that moves along with the left edge of the
       receiver's window.

       So this segment's index into the endpoint array is:

           rel_posn_in_recv_win = seq - next_contig_seq_to_recv
           array_posn = (rel_posn_in_recv_win + rfstart) % WINDOW_SIZE
    */
    i = seq - endpoint->endpoint_next_contig_seq_to_recv;
    i = WINDOW_SIZE_MOD(i + endpoint->endpoint_rfstart);
    if (endpoint->endpoint_rcvd_segs[i]) {
#if MSGDEBUG
        /* Only names that exist in this function are used here, so the
           message still builds when MSGDEBUG is enabled */
        opal_output(0, "<-- Received FRAG/CHUNK ep %p, seq %" UDSEQ ", seg %p: duplicate -- DROPPED\n",
            (void*) endpoint, seq, (void*) seg);
#endif
        /* highest_seq_rcvd is for debug stats only; it's not used
           in any window calculations */
        assert(seq <= endpoint->endpoint_highest_seq_rcvd);
        /* next_contig_seq_to_recv-1 is the ack number we'll
           send */
        assert (seq > endpoint->endpoint_next_contig_seq_to_recv - 1);

        /* Stats */
        ++endpoint->endpoint_module->num_dup_recvs;
        goto dup_needs_ack;
    }

    /* Stats: is this the highest sequence number we've received? */
    if (seq > endpoint->endpoint_highest_seq_rcvd) {
        endpoint->endpoint_highest_seq_rcvd = seq;
    }

    *window_index = i;
    return true;

dup_needs_ack:
    /* The segment is being dropped; make sure the endpoint is registered
       as needing an ACK */
    if (!endpoint->endpoint_ack_needed) {
        ompi_btl_usnic_add_to_endpoints_needing_ack(endpoint);
    }
    return false;
}