/*
 * Walk the receiving module's list of endpoints and return the first
 * one whose proc's RTE process name hashes to the given value.
 *
 * Returns the matching endpoint, or NULL when no endpoint matches.
 */
ompi_btl_usnic_endpoint_t *
ompi_btl_usnic_proc_lookup_endpoint(ompi_btl_usnic_module_t *receiver,
                                    uint64_t sender_hashed_rte_name)
{
    ompi_btl_usnic_endpoint_t *candidate;
    ompi_btl_usnic_proc_t *owner;
    opal_list_item_t *cursor;

    MSGDEBUG1_OUT("lookup_endpoint: recvmodule=%p sendhash=0x%" PRIx64,
                  (void *)receiver, sender_hashed_rte_name);

    cursor = opal_list_get_first(&receiver->all_endpoints);
    while (cursor != opal_list_get_end(&receiver->all_endpoints)) {
        /* Recover the endpoint that embeds this list item */
        candidate = container_of(cursor, ompi_btl_usnic_endpoint_t,
                                 endpoint_endpoint_li);
        owner = candidate->endpoint_proc;

        if (sender_hashed_rte_name ==
            ompi_rte_hash_name(&owner->proc_ompi->proc_name)) {
            MSGDEBUG1_OUT("lookup_endpoint: matched endpoint=%p",
                          (void *)candidate);
            return candidate;
        }

        cursor = opal_list_get_next(cursor);
    }

    /* Ran off the end of the list without a match */
    return NULL;
}
/*
 * Look for an existing usnic endpoint on the receiving module whose
 * proc has the given process name.
 *
 * The module's all_endpoints list is walked while holding
 * all_endpoints_lock; the lock is always released before returning.
 *
 * receiver:         module whose endpoint list is searched
 * sender_proc_name: process name of the sender to look for
 *
 * Returns the matching endpoint, or NULL if none was found.
 */
opal_btl_usnic_endpoint_t *
opal_btl_usnic_proc_lookup_endpoint(opal_btl_usnic_module_t *receiver,
                                    uint64_t sender_proc_name)
{
    opal_btl_usnic_proc_t *proc;
    opal_btl_usnic_endpoint_t *endpoint;
    opal_btl_usnic_endpoint_t *match = NULL;
    opal_list_item_t *item;

    /* Print the name we were actually handed: the parameter of this
       function is sender_proc_name (there is no hashed name here). */
    MSGDEBUG1_OUT("lookup_endpoint: recvmodule=%p sendhash=0x%" PRIx64,
                  (void *)receiver, sender_proc_name);

    opal_mutex_lock(&receiver->all_endpoints_lock);
    for (item = opal_list_get_first(&receiver->all_endpoints);
         item != opal_list_get_end(&receiver->all_endpoints);
         item = opal_list_get_next(item)) {
        endpoint = container_of(item, opal_btl_usnic_endpoint_t,
                                endpoint_endpoint_li);
        proc = endpoint->endpoint_proc;

        /* Note that this works today because
           opal_proc_t->proc_name is unique across the universe.
           George is potentially working to give handles instead of
           proc names, and then have a function pointer to perform
           comparisons.  This would be bad here in the critical path,
           though... */
        if (proc->proc_opal->proc_name == sender_proc_name) {
            MSGDEBUG1_OUT("lookup_endpoint: matched endpoint=%p",
                          (void *)endpoint);
            match = endpoint;
            break;
        }
    }
    /* Single unlock point covers both the found and not-found cases */
    opal_mutex_unlock(&receiver->all_endpoints_lock);

    /* NULL here means we didn't find it */
    return match;
}
/*
 * Build a new endpoint for the given (module, proc) pair and mark the
 * modex entry it was matched against as claimed.
 *
 * On success, the new endpoint is stored in *endpoint_o and
 * OPAL_SUCCESS is returned.  If no modex entry matches, the error
 * from match_modex() is returned; if the endpoint object cannot be
 * allocated, OPAL_ERR_OUT_OF_RESOURCE is returned.  *endpoint_o is
 * not written on failure.
 */
int
opal_btl_usnic_create_endpoint(opal_btl_usnic_module_t *module,
                               opal_btl_usnic_proc_t *proc,
                               opal_btl_usnic_endpoint_t **endpoint_o)
{
    opal_btl_usnic_endpoint_t *ep;
    int slot;
    int rc;

    /* Find which of the peer's modex entries pairs with this module */
    rc = match_modex(module, proc, &slot);
    if (OPAL_SUCCESS != rc) {
        opal_output_verbose(5, USNIC_OUT,
                            "btl:usnic:create_endpoint: did not match usnic modex info for peer %s",
                            OPAL_NAME_PRINT(proc->proc_opal->proc_name));
        return rc;
    }

    ep = OBJ_NEW(opal_btl_usnic_endpoint_t);
    if (NULL == ep) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Initialize the endpoint */
    ep->endpoint_module = module;
    assert(slot >= 0 && slot < (int)proc->proc_modex_count);
    ep->endpoint_remote_addr = proc->proc_modex[slot];

    /* Send side of the sequence space starts at our own ISN ... */
    ep->endpoint_next_seq_to_send = module->local_addr.isn;
    ep->endpoint_ack_seq_rcvd = ep->endpoint_next_seq_to_send - 1;

    /* ... and the receive side starts at the peer's ISN */
    ep->endpoint_next_contig_seq_to_recv = ep->endpoint_remote_addr.isn;
    ep->endpoint_highest_seq_rcvd =
        ep->endpoint_next_contig_seq_to_recv - 1;
    ep->endpoint_rfstart =
        WINDOW_SIZE_MOD(ep->endpoint_next_contig_seq_to_recv);

    /* The ibv_ah is intentionally not created here.  ibv_create_ah()
       may trigger ARP resolution, so it's better to batch all the
       endpoints' calls to ibv_create_ah() together to get some
       parallelism. */
    ep->endpoint_remote_ah = NULL;

    /* The modex slot now belongs to this endpoint */
    proc->proc_modex_claimed[slot] = true;
    MSGDEBUG1_OUT("create_endpoint: module=%p claimed endpoint=%p on proc=%p (hash=0x%" PRIx64 ")\n",
                  (void *)module, (void *)ep, (void *)proc,
                  proc->proc_opal->proc_name);

    /* Record the endpoint in the next free entry of the proc's
       endpoint array, and have the endpoint remember where it lives */
    ep->endpoint_proc = proc;
    ep->endpoint_proc_index = proc->proc_endpoint_count;
    proc->proc_endpoints[proc->proc_endpoint_count++] = ep;
    OBJ_RETAIN(proc);

    /* Also put the endpoint on the module's list of endpoints.  This
       happens here rather than in the endpoint constructor because
       the module cannot be passed as a constructor argument. */
    opal_mutex_lock(&module->all_endpoints_lock);
    opal_list_append(&module->all_endpoints, &ep->endpoint_endpoint_li);
    ep->endpoint_on_all_endpoints = true;
    opal_mutex_unlock(&module->all_endpoints_lock);

    *endpoint_o = ep;
    return OPAL_SUCCESS;
}
/* Responsible for handling "large" frags (reserve + *size > max_frag_payload) * in the same manner as btl_prepare_src. Must return a smaller amount than * requested if the given convertor cannot process the entire (*size). */ static opal_btl_usnic_send_frag_t * prepare_src_large( struct opal_btl_usnic_module_t* module, struct mca_btl_base_endpoint_t* endpoint, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, size_t* size, uint32_t flags) { opal_btl_usnic_send_frag_t *frag; opal_btl_usnic_large_send_frag_t *lfrag; int rc; /* Get holder for the msg */ lfrag = opal_btl_usnic_large_send_frag_alloc(module); if (OPAL_UNLIKELY(NULL == lfrag)) { return NULL; } frag = &lfrag->lsf_base; /* The header location goes in SG[0], payload in SG[1]. If we are using a * convertor then SG[1].seg_len is accurate but seg_addr is NULL. */ frag->sf_base.uf_base.USNIC_SEND_LOCAL_COUNT = 2; /* stash header location, PML will write here */ frag->sf_base.uf_local_seg[0].seg_addr.pval = &lfrag->lsf_ompi_header; frag->sf_base.uf_local_seg[0].seg_len = reserve; /* make sure upper header small enough */ assert(reserve <= sizeof(lfrag->lsf_ompi_header)); if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) { /* threshold == -1 means always pack eagerly */ if (mca_btl_usnic_component.pack_lazy_threshold >= 0 && *size >= (size_t)mca_btl_usnic_component.pack_lazy_threshold) { MSGDEBUG1_OUT("packing frag %p on the fly", (void *)frag); lfrag->lsf_pack_on_the_fly = true; /* tell the PML we will absorb as much as possible while still * respecting indivisible element boundaries in the convertor */ *size = opal_btl_usnic_convertor_pack_peek(convertor, *size); /* Clone the convertor b/c we (the BTL) don't own it and the PML * might mutate it after we return from this function. 
*/ rc = opal_convertor_clone(convertor, &frag->sf_convertor, /*copy_stack=*/true); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { BTL_ERROR(("unexpected convertor clone error")); abort(); /* XXX */ } } else { /* pack everything in the convertor into a chain of segments now, * leaving space for the PML header in the first segment */ lfrag->lsf_base.sf_base.uf_local_seg[0].seg_addr.pval = pack_chunk_seg_chain_with_reserve(module, lfrag, reserve, convertor, *size, size); } /* We set SG[1] to {NULL,bytes_packed} so that various calculations * by both PML and this BTL will be correct. For example, the PML adds * up the bytes in the descriptor segments to determine if an MPI-level * request is complete or not. */ frag->sf_base.uf_local_seg[1].seg_addr.pval = NULL; frag->sf_base.uf_local_seg[1].seg_len = *size; } else { /* convertor not needed, just save the payload pointer in SG[1] */ lfrag->lsf_pack_on_the_fly = true; opal_convertor_get_current_pointer(convertor, &frag->sf_base.uf_local_seg[1].seg_addr.pval); frag->sf_base.uf_local_seg[1].seg_len = *size; } frag->sf_base.uf_base.des_flags = flags; frag->sf_endpoint = endpoint; return frag; }