/**
 * Count how many rdma-capable btls would be selected for the pipeline
 * protocol on this endpoint.  Mirrors the selection logic of
 * mca_pml_ob1_rdma_pipeline_btls() without filling an output array.
 *
 * @param bml_endpoint (IN)  BML endpoint to inspect
 * @return number of eligible rdma btls, bounded by
 *         mca_pml_ob1.max_rdma_per_request
 */
size_t mca_pml_ob1_rdma_pipeline_btls_count (mca_bml_base_endpoint_t* bml_endpoint)
{
    int num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma);
    int num_eager_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager);
    int rdma_count = 0;

    for (int i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; ++i) {
        mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_next (&bml_endpoint->btl_rdma);

        /* NTH: go ahead and use an rdma btl if is the only one */
        bool ignore = !mca_pml_ob1.use_all_rdma;

        /* only accept rdma btls that also appear in the eager array
         * (fixed: inner index renamed to j — it previously shadowed the
         * outer loop variable i) */
        for (int j = 0; j < num_eager_btls && ignore; ++j) {
            mca_bml_base_btl_t *eager_btl =
                mca_bml_base_btl_array_get_index (&bml_endpoint->btl_eager, j);
            if (eager_btl->btl_endpoint == bml_btl->btl_endpoint) {
                ignore = false;
                break;
            }
        }

        if (!ignore) {
            ++rdma_count;
        }
    }

    return rdma_count;
}
/*
 * Return the bml btl used to reach pe "dst", storing the yoda transport id
 * in *btl_id.  Returns NULL when the destination proc or its BML endpoint
 * cannot be found.  Falls back from the rdma btl array to the eager array
 * (send-based emulation) when no rdma btl is available.
 */
static inline mca_bml_base_btl_t *get_next_btl(int dst, int *btl_id)
{
    mca_bml_base_endpoint_t* endpoint;
    mca_bml_base_btl_t* bml_btl;
    oshmem_proc_t *proc;
    mca_bml_base_btl_array_t *btl_array = 0;
    int size = 0;
    int shmem_index = 0;

    /* get endpoint and btl */
    proc = oshmem_proc_group_all(dst);
    if (!proc) {
        SPML_ERROR("Can not find destination proc for pe=%d", dst);
        return NULL ;
    }

    endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
    if (!endpoint) {
        SPML_ERROR("pe=%d proc has no endpoint", dst);
        return NULL ;
    }

    /* At the moment always return first transport */
    size = mca_bml_base_btl_array_get_size(btl_array = &endpoint->btl_rdma);

    if (0 >= size) {
        /* Possibly this is SM BTL with KNEM disabled? Then we should use send
           based get/put */
        /*
           This hack is necessary for the case when KNEM is not available.
           In this case we still want to use send/recv of SM BTL for put and
           get but SM BTL is not in the rdma list anymore
         */
        size = mca_bml_base_btl_array_get_size(btl_array = &endpoint->btl_eager);
        if (0 < size) {
            /* Chose SHMEM capable btl from eager array. Not filter now: take
               the first (but could appear on demand). */
            for (shmem_index = 0; shmem_index < size; shmem_index++) {
                bml_btl = mca_bml_base_btl_array_get_index(btl_array, shmem_index);
                /* NOTE(review): the return value of _find_btl_id() is
                 * discarded here — presumably only called for its lookup side
                 * effect, if any; confirm against create_btl_idx(). */
                _find_btl_id(bml_btl);
                size = 1;
                break;
            }
        }
    }

    bml_btl = mca_bml_base_btl_array_get_index(btl_array, shmem_index);
    /* NOTE(review): the transport id is always read from slot 0 even when a
     * non-zero shmem_index could have been selected above ("always return
     * first transport" per the comment) — verify this is intended. */
    *btl_id = proc->transport_ids[0];
#if SPML_YODA_DEBUG == 1
    assert(*btl_id >= 0 && *btl_id < YODA_BTL_MAX);
    SPML_VERBOSE(100, "pe=%d reachable via btl %s %d", dst,
                 bml_btl->btl->btl_component->btl_version.mca_component_name, *btl_id);
#endif
    return bml_btl;
}
int mca_pml_ob1_progress(void) { int i, queue_length = opal_list_get_size(&mca_pml_ob1.send_pending); int j, completed_requests = 0; bool send_succedded; #if OPAL_CUDA_SUPPORT mca_pml_ob1_process_pending_cuda_async_copies(); #endif /* OPAL_CUDA_SUPPORT */ if( OPAL_LIKELY(0 == queue_length) ) return 0; for( i = 0; i < queue_length; i++ ) { mca_pml_ob1_send_pending_t pending_type = MCA_PML_OB1_SEND_PENDING_NONE; mca_pml_ob1_send_request_t* sendreq; mca_bml_base_endpoint_t* endpoint; sendreq = get_request_from_send_pending(&pending_type); if(OPAL_UNLIKELY(NULL == sendreq)) break; switch(pending_type) { case MCA_PML_OB1_SEND_PENDING_NONE: assert(0); return 0; case MCA_PML_OB1_SEND_PENDING_SCHEDULE: if( mca_pml_ob1_send_request_schedule_exclusive(sendreq) == OMPI_ERR_OUT_OF_RESOURCE ) { return 0; } completed_requests++; break; case MCA_PML_OB1_SEND_PENDING_START: MCA_PML_OB1_SEND_REQUEST_RESET(sendreq); endpoint = sendreq->req_endpoint; send_succedded = false; for(j = 0; j < (int)mca_bml_base_btl_array_get_size(&endpoint->btl_eager); j++) { mca_bml_base_btl_t* bml_btl; int rc; /* select a btl */ bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); rc = mca_pml_ob1_send_request_start_btl(sendreq, bml_btl); if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) { send_succedded = true; completed_requests++; break; } } if( false == send_succedded ) { add_request_to_send_pending(sendreq, MCA_PML_OB1_SEND_PENDING_START, true); } } } return completed_requests; }
/**
 * Select the rdma btls usable for a one-sided (leave-pinned / registered)
 * protocol over [base, base+size).  Registrations are looked up (or created
 * when leave_pinned is enabled) via each btl's mpool; btls without an mpool
 * use the dummy registration.
 *
 * Fixed: both mpool calls contained a mis-encoded "&reg" argument
 * (mojibake '®'), which did not compile.
 *
 * @param bml_endpoint (IN)  endpoint to select btls from
 * @param base         (IN)  start of the user buffer
 * @param size         (IN)  length of the user buffer
 * @param rdma_btls    (OUT) selected btl/registration pairs
 * @return number of entries written to rdma_btls (0 => fall back to pipeline)
 */
size_t mca_pml_bfo_rdma_btls( mca_bml_base_endpoint_t* bml_endpoint,
                              unsigned char* base,
                              size_t size,
                              mca_pml_bfo_com_btl_t* rdma_btls)
{
    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
    double weight_total = 0;
    int num_btls_used = 0, n;

    /* shortcut when there are no rdma capable btls */
    if (num_btls == 0) {
        return 0;
    }

    /* check to see if memory is registered */
    for (n = 0; n < num_btls && num_btls_used < mca_pml_bfo.max_rdma_per_request; n++) {
        /* round-robin start point so successive requests spread the load */
        mca_bml_base_btl_t* bml_btl =
            mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
                                             (bml_endpoint->btl_rdma_index + n) % num_btls);
        mca_mpool_base_registration_t* reg = &pml_bfo_dummy_reg;
        mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;

        if (NULL != btl_mpool) {
            if (!mca_pml_bfo.leave_pinned) {
                /* look through existing registrations */
                btl_mpool->mpool_find(btl_mpool, base, size, &reg);
            } else {
                /* register the memory */
                btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
            }

            /* registration required but unavailable: skip this btl */
            if (NULL == reg)
                continue;
        }

        rdma_btls[num_btls_used].bml_btl = bml_btl;
        rdma_btls[num_btls_used].btl_reg = reg;
        weight_total += bml_btl->btl_weight;
        num_btls_used++;
    }

    /* if we don't use leave_pinned and all BTLs that already have this memory
     * registered amount to less then half of available bandwidth - fall back to
     * pipeline protocol */
    if (0 == num_btls_used || (!mca_pml_bfo.leave_pinned && weight_total < 0.5))
        return 0;

    mca_pml_bfo_calc_weighted_length(rdma_btls, num_btls_used, size, weight_total);

    bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
    return num_btls_used;
}
static int mca_bml_r2_del_procs(size_t nprocs, struct ompi_proc_t** procs) { size_t p; int rc; struct ompi_proc_t** del_procs = (struct ompi_proc_t**) malloc(nprocs * sizeof(struct ompi_proc_t*)); size_t n_del_procs = 0; if (NULL == del_procs) { return OMPI_ERR_OUT_OF_RESOURCE; } for(p = 0; p < nprocs; p++) { ompi_proc_t *proc = procs[p]; if(((opal_object_t*)proc)->obj_reference_count == 1) { del_procs[n_del_procs++] = proc; } } for(p = 0; p < n_del_procs; p++) { ompi_proc_t *proc = del_procs[p]; mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; size_t f_index, f_size; /* notify each btl that the proc is going away */ f_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); for(f_index = 0; f_index < f_size; f_index++) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, f_index); mca_btl_base_module_t* btl = bml_btl->btl; rc = btl->btl_del_procs(btl, 1, &proc, &bml_btl->btl_endpoint); if(OMPI_SUCCESS != rc) { free(del_procs); return rc; } /* The reference stored in btl_eager and btl_rdma will automatically * dissapear once the btl_array destructor is called. Thus, there is * no need for extra cleaning here. */ } OBJ_RELEASE(proc); /* do any required cleanup */ OBJ_RELEASE(bml_endpoint); proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL; } free(del_procs); return OMPI_SUCCESS; }
/**
 * Select rdma btls with an existing (or, under leave_pinned, newly created)
 * registration covering [base, base+size).
 *
 * Fixed: both mpool calls contained a mis-encoded "&reg" argument
 * (mojibake '®'), which did not compile.
 *
 * @param bml_endpoint (IN)  endpoint to select btls from
 * @param base         (IN)  start of the user buffer
 * @param size         (IN)  length of the user buffer
 * @param rdma_btls    (OUT) selected btl/registration pairs
 * @return number of entries written to rdma_btls
 */
size_t mca_pml_ob1_rdma_btls( mca_bml_base_endpoint_t* bml_endpoint,
                              unsigned char* base,
                              size_t size,
                              mca_pml_ob1_rdma_btl_t* rdma_btls)
{
    size_t num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
    size_t num_btls_used = 0;
    size_t n;

    /* shortcut when there are no rdma capable btls */
    if (num_btls == 0) {
        return 0;
    }

    /* check to see if memory is registered */
    for (n = 0; n < num_btls && num_btls_used < MCA_PML_OB1_MAX_RDMA_PER_REQUEST; n++) {
        /* round-robin start point so successive requests spread the load */
        mca_bml_base_btl_t* bml_btl =
            mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
                                             (bml_endpoint->btl_rdma_index + n) % num_btls);
        mca_mpool_base_registration_t* reg = NULL;
        mca_mpool_base_module_t *btl_mpool = bml_btl->btl_mpool;

        /* btl is rdma capable and registration is not required */
        if (NULL == btl_mpool) {
            reg = NULL;
        } else {
            if (!mca_pml_ob1.leave_pinned) {
                /* look through existing registrations */
                btl_mpool->mpool_find(btl_mpool, base, size, &reg);
            } else {
                /* register the memory */
                btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
            }

            if (NULL == reg)
                bml_btl = NULL; /* skip it */
        }

        if (bml_btl != NULL) {
            rdma_btls[num_btls_used].bml_btl = bml_btl;
            rdma_btls[num_btls_used].btl_reg = reg;
            num_btls_used++;
        }
    }

    bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
    return num_btls_used;
}
/**
 * Select the rdma btls to use for the pipeline protocol and assign each a
 * weighted share of the message of length "size".
 *
 * @param bml_endpoint (IN)  endpoint to select btls from
 * @param size         (IN)  message length to split across btls
 * @param rdma_btls    (OUT) selected btls (btl_reg left NULL for pipeline)
 * @return number of entries written to rdma_btls
 */
size_t mca_pml_ob1_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
                                       size_t size,
                                       mca_pml_ob1_com_btl_t* rdma_btls )
{
    int num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma);
    int num_eager_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager);
    double weight_total = 0;
    int rdma_count = 0;

    for (int i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
        mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);

        /* NTH: go ahead and use an rdma btl if is the only one */
        bool ignore = !mca_pml_ob1.use_all_rdma;

        /* only accept rdma btls that also appear in the eager array
         * (fixed: inner index renamed to j — it previously shadowed the
         * outer loop variable i) */
        for (int j = 0; j < num_eager_btls && ignore; ++j) {
            mca_bml_base_btl_t *eager_btl =
                mca_bml_base_btl_array_get_index (&bml_endpoint->btl_eager, j);
            if (eager_btl->btl_endpoint == bml_btl->btl_endpoint) {
                ignore = false;
                break;
            }
        }

        if (ignore) {
            continue;
        }

        rdma_btls[rdma_count].bml_btl = bml_btl;
        rdma_btls[rdma_count++].btl_reg = NULL;

        weight_total += bml_btl->btl_weight;
    }

    mca_pml_ob1_calc_weighted_length (rdma_btls, rdma_count, size, weight_total);

    return rdma_count;
}
/**
 * Select up to max_rdma_per_request rdma btls for the pipeline protocol and
 * assign each a weighted share of the message of length "size".
 *
 * @param bml_endpoint (IN)  endpoint to select btls from
 * @param size         (IN)  message length to split across btls
 * @param rdma_btls    (OUT) selected btl/registration pairs
 * @return number of entries written to rdma_btls
 */
size_t mca_pml_bfo_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
                                       size_t size,
                                       mca_pml_bfo_com_btl_t* rdma_btls )
{
    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
    double weight_total = 0;
    int count;

    for (count = 0;
         count < num_btls && count < mca_pml_bfo.max_rdma_per_request;
         count++) {
        mca_bml_base_btl_t *bml_btl =
            mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);

        rdma_btls[count].bml_btl = bml_btl;
        /* btls with an mpool register on the fly (reg filled in later);
         * btls without one fall back to the dummy registration */
        rdma_btls[count].btl_reg =
            (NULL != bml_btl->btl->btl_mpool) ? NULL : &pml_bfo_dummy_reg;

        weight_total += bml_btl->btl_weight;
    }

    mca_pml_bfo_calc_weighted_length(rdma_btls, count, size, weight_total);

    return count;
}
/* for each proc create transport ids which are indexes into global * btl list&map */ static int create_btl_idx(int dst_pe) { oshmem_proc_t *proc; int btl_id; mca_bml_base_endpoint_t* endpoint; mca_bml_base_btl_t* bml_btl = 0; int i, size; mca_bml_base_btl_array_t *btl_array; int shmem_index = -1; proc = oshmem_proc_group_find(oshmem_group_all, dst_pe); endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; assert(endpoint); size = mca_bml_base_btl_array_get_size(btl_array = &endpoint->btl_rdma); if (0 >= size) { /* Possibly this is SM BTL with KNEM disabled? Then we should use send based get/put */ /* This hack is necessary for the case when KNEM is not available. In this case we still want to use send/recv of SM BTL for put and get but SM BTL is not in the rdma list anymore */ size = mca_bml_base_btl_array_get_size(btl_array = &endpoint->btl_eager); if (0 < size) { /*Chose SHMEM capable btl from eager array. Not filter now: take the first (but could appear on demand).*/ shmem_index = 0; size = 1; } else { SPML_ERROR("no SHMEM capable transport for dest pe=%d", dst_pe); return OSHMEM_ERROR; } } proc->transport_ids = (char *) malloc(size * sizeof(char)); if (!proc->transport_ids) return OSHMEM_ERROR; proc->num_transports = size; for (i = 0; i < size; i++) { bml_btl = mca_bml_base_btl_array_get_index(btl_array, (shmem_index >= 0) ? (shmem_index) : (i)); btl_id = _find_btl_id(bml_btl); SPML_VERBOSE(50, "dst_pe(%d) use btl (%s) btl_id=%d", dst_pe, bml_btl->btl->btl_component->btl_version.mca_component_name, btl_id); if (0 > btl_id) { SPML_ERROR("unknown btl: dst_pe(%d) use btl (%s) btl_id=%d", dst_pe, bml_btl->btl->btl_component->btl_version.mca_component_name, btl_id); return OSHMEM_ERROR; } proc->transport_ids[i] = btl_id; mca_spml_yoda.btl_type_map[btl_id].bml_btl = bml_btl; mca_spml_yoda.btl_type_map[btl_id].use_cnt++; } return OSHMEM_SUCCESS; }
/*
 * Remove the given btl module from all three btl arrays (eager, send, rdma)
 * of the proc's BML endpoint and recompute the endpoint aggregates that
 * depended on the removed btl: per-btl bandwidth weights, the minimum
 * max_send_size, and the rdma pipeline limits.
 */
static int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl)
{
    mca_bml_base_endpoint_t* ep =
        (mca_bml_base_endpoint_t*)proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
    mca_bml_base_btl_t* bml_btl;
    mca_btl_base_module_t* ep_btl;
    double total_bandwidth = 0;
    size_t b;

    /* nothing to do if this proc has no BML endpoint */
    if (NULL == ep)
        return OMPI_SUCCESS;

    /* remove btl from eager list */
    mca_bml_base_btl_array_remove(&ep->btl_eager, btl);

    /* remove btl from send list */
    if (mca_bml_base_btl_array_remove(&ep->btl_send, btl)) {

        /* compute total_bandwidth and
           reset max_send_size to the min of all btl's */
        total_bandwidth = 0;
        /* -1 wraps to the maximum value so the min-scan below starts fresh */
        ep->btl_max_send_size = -1;
        for (b = 0; b < mca_bml_base_btl_array_get_size(&ep->btl_send); b++) {
            bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_send, b);
            ep_btl = bml_btl->btl;

            total_bandwidth += ep_btl->btl_bandwidth;
            if (ep->btl_max_send_size > ep_btl->btl_max_send_size) {
                ep->btl_max_send_size = ep_btl->btl_max_send_size;
            }
        }

        /* compute weighting factor for this btl */
        for (b = 0; b < mca_bml_base_btl_array_get_size(&ep->btl_send); b++) {
            bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_send, b);
            ep_btl = bml_btl->btl;

            if (ep_btl->btl_bandwidth > 0) {
                /* weight is this btl's share of the remaining bandwidth */
                bml_btl->btl_weight = (float)(ep_btl->btl_bandwidth / total_bandwidth);
            } else {
                /* no bandwidth info: share evenly */
                bml_btl->btl_weight = (float)(1.0 / mca_bml_base_btl_array_get_size(&ep->btl_send));
            }
        }
    }

    /* remove btl from RDMA list */
    if (mca_bml_base_btl_array_remove(&ep->btl_rdma, btl)) {

        /* compute total bandwidth */
        total_bandwidth = 0;
        ep->btl_pipeline_send_length = 0;
        ep->btl_send_limit = 0;
        for (b = 0; b < mca_bml_base_btl_array_get_size(&ep->btl_rdma); b++) {
            bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_rdma, b);
            ep_btl = bml_btl->btl;

            /* update aggregate endpoint info */
            total_bandwidth += ep_btl->btl_bandwidth;
            if (ep->btl_pipeline_send_length < ep_btl->btl_rdma_pipeline_send_length) {
                ep->btl_pipeline_send_length = ep_btl->btl_rdma_pipeline_send_length;
            }
            if (ep->btl_send_limit < ep_btl->btl_min_rdma_pipeline_size) {
                ep->btl_send_limit = ep_btl->btl_min_rdma_pipeline_size;
            }
        }

        /* compute weighting factor for this btl */
        for (b = 0; b < mca_bml_base_btl_array_get_size(&ep->btl_rdma); b++) {
            bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_rdma, b);
            ep_btl = bml_btl->btl;

            if (ep_btl->btl_bandwidth > 0) {
                bml_btl->btl_weight = (float)(ep_btl->btl_bandwidth / total_bandwidth);
            } else {
                bml_btl->btl_weight = (float)(1.0 / mca_bml_base_btl_array_get_size(&ep->btl_rdma));
            }
        }
    }

    return OMPI_SUCCESS;
}
static int mca_bml_r2_del_procs(size_t nprocs, struct ompi_proc_t** procs) { size_t p; int rc; struct ompi_proc_t** del_procs = (struct ompi_proc_t**) malloc(nprocs * sizeof(struct ompi_proc_t*)); size_t n_del_procs = 0; if (NULL == del_procs) { return OMPI_ERR_OUT_OF_RESOURCE; } for(p =0; p < nprocs; p++) { ompi_proc_t *proc = procs[p]; if(((opal_object_t*)proc)->obj_reference_count == 1) { del_procs[n_del_procs++] = proc; } } for(p = 0; p < n_del_procs; p++) { ompi_proc_t *proc = del_procs[p]; mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; size_t f_index, f_size; size_t n_index, n_size; /* notify each btl that the proc is going away */ f_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_eager); for(f_index = 0; f_index < f_size; f_index++) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_eager, f_index); mca_btl_base_module_t* btl = bml_btl->btl; rc = btl->btl_del_procs(btl,1,&proc,&bml_btl->btl_endpoint); if(OMPI_SUCCESS != rc) { return rc; } /* remove this from next array so that we dont call it twice w/ * the same address pointer */ n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_eager); for(n_index = 0; n_index < n_size; n_index++) { mca_bml_base_btl_t* search_bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); if(search_bml_btl->btl == btl) { memset(search_bml_btl, 0, sizeof(mca_bml_base_btl_t)); break; } } } /* notify each r2 that was not in the array of r2s for first fragments */ n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); for(n_index = 0; n_index < n_size; n_index++) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_eager, n_index); mca_btl_base_module_t* btl = bml_btl->btl; if (btl != 0) { rc = btl->btl_del_procs(btl,1,&proc,&bml_btl->btl_endpoint); if(OMPI_SUCCESS != rc) { return rc; } } } OBJ_RELEASE(proc); /* do any required cleanup */ 
OBJ_RELEASE(bml_endpoint); } return OMPI_SUCCESS; }
static int mca_bml_r2_add_procs( size_t nprocs, struct ompi_proc_t** procs, struct opal_bitmap_t* reachable ) { size_t p, p_index, n_new_procs = 0; struct mca_btl_base_endpoint_t ** btl_endpoints = NULL; struct ompi_proc_t** new_procs = NULL; struct ompi_proc_t *unreach_proc = NULL; int rc, ret = OMPI_SUCCESS; if(0 == nprocs) { return OMPI_SUCCESS; } if(OMPI_SUCCESS != (rc = mca_bml_r2_add_btls()) ) { return rc; } /* Select only the procs that don't yet have the BML proc struct. This prevent * us from calling btl->add_procs several this on the same destination proc. */ for(p_index = 0; p_index < nprocs; p_index++) { struct ompi_proc_t* proc = procs[p_index]; OBJ_RETAIN(proc); if(NULL != proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { continue; /* go to the next proc */ } /* Allocate the new_procs on demand */ if( NULL == new_procs ) { new_procs = (struct ompi_proc_t **)malloc(nprocs * sizeof(struct ompi_proc_t *)); if( NULL == new_procs ) { return OMPI_ERR_OUT_OF_RESOURCE; } } new_procs[n_new_procs++] = proc; } if ( 0 == n_new_procs ) { return OMPI_SUCCESS; } /* Starting from here we only work on the unregistered procs */ procs = new_procs; nprocs = n_new_procs; /* attempt to add all procs to each r2 */ btl_endpoints = (struct mca_btl_base_endpoint_t **) malloc(nprocs * sizeof(struct mca_btl_base_endpoint_t*)); if (NULL == btl_endpoints) { free(new_procs); return OMPI_ERR_OUT_OF_RESOURCE; } for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) { mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index]; int btl_inuse = 0; /* if the r2 can reach the destination proc it sets the * corresponding bit (proc index) in the reachable bitmap * and can return addressing information for each proc * that is passed back to the r2 on data transfer calls */ opal_bitmap_clear_all_bits(reachable); memset(btl_endpoints, 0, nprocs *sizeof(struct mca_btl_base_endpoint_t*)); rc = btl->btl_add_procs(btl, n_new_procs, new_procs, btl_endpoints, reachable); 
if(OMPI_SUCCESS != rc) { /* This BTL has troubles adding the nodes. Let's continue maybe some other BTL * can take care of this task. */ continue; } /* for each proc that is reachable */ for( p = 0; p < n_new_procs; p++ ) { if(opal_bitmap_is_set_bit(reachable, p)) { ompi_proc_t *proc = new_procs[p]; mca_bml_base_endpoint_t * bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; mca_bml_base_btl_t* bml_btl; size_t size; if(NULL == bml_endpoint) { /* allocate bml specific proc data */ bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t); if (NULL == bml_endpoint) { opal_output(0, "mca_bml_r2_add_procs: unable to allocate resources"); free(btl_endpoints); free(new_procs); return OMPI_ERR_OUT_OF_RESOURCE; } /* preallocate space in array for max number of r2s */ mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules); mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send, mca_bml_r2.num_btl_modules); mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma, mca_bml_r2.num_btl_modules); bml_endpoint->btl_max_send_size = -1; bml_endpoint->btl_proc = proc; proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = bml_endpoint; bml_endpoint->btl_flags_or = 0; } /* dont allow an additional BTL with a lower exclusivity ranking */ size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); if(size > 0) { bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1); /* skip this btl if the exclusivity is less than the previous */ if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity) { btl->btl_del_procs(btl, 1, &proc, &btl_endpoints[p]); continue; } } /* cache the endpoint on the proc */ bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send); bml_btl->btl = btl; bml_btl->btl_endpoint = btl_endpoints[p]; bml_btl->btl_weight = 0; bml_btl->btl_flags = btl->btl_flags; if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) { opal_output(0, "mca_bml_r2_add_procs: The PUT flag is 
specified for" " the %s BTL without any PUT function attached. Disard the flag !", bml_btl->btl->btl_component->btl_version.mca_component_name); bml_btl->btl_flags ^= MCA_BTL_FLAGS_PUT; } if( (bml_btl->btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) { opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for" " the %s BTL without any GET function attached. Discard the flag !", bml_btl->btl->btl_component->btl_version.mca_component_name); bml_btl->btl_flags ^= MCA_BTL_FLAGS_GET; } if( (bml_btl->btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) { /** * If no protocol specified, we have 2 choices: we ignore the BTL * as we don't know which protocl to use, or we suppose that all * BTLs support the send protocol. */ bml_btl->btl_flags |= MCA_BTL_FLAGS_SEND; } /** * calculate the bitwise OR of the btl flags */ bml_endpoint->btl_flags_or |= bml_btl->btl_flags; /* This BTL is in use, allow the progress registration */ btl_inuse++; } } if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) { size_t p; bool found = false; for( p = 0; p < mca_bml_r2.num_btl_progress; p++ ) { if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) { found = true; break; } } if(found == false) { mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress] = btl->btl_component->btl_progress; mca_bml_r2.num_btl_progress++; opal_progress_register( btl->btl_component->btl_progress ); } } } free(btl_endpoints); /* iterate back through procs and compute metrics for registered r2s */ for(p=0; p<n_new_procs; p++) { ompi_proc_t *proc = new_procs[p]; mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; double total_bandwidth = 0; uint32_t latency = 0xffffffff; size_t n_index; size_t n_size; /* skip over procs w/ no btl's registered */ if(NULL == bml_endpoint) { continue; } /* (1) determine the total bandwidth available across all btls * note that we need to do this here, as we may 
already have btls configured * (2) determine the highest priority ranking for latency * (3) compute the maximum amount of bytes that can be send without any * weighting. Once the left over is smaller than this number we will * start using the weight to compute the correct amount. */ n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); /* sort BTLs in descending order according to bandwidth value */ qsort(bml_endpoint->btl_send.bml_btls, n_size, sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); bml_endpoint->btl_rdma_index = 0; for(n_index = 0; n_index < n_size; n_index++) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); mca_btl_base_module_t* btl = bml_btl->btl; total_bandwidth += bml_btl->btl->btl_bandwidth; if(btl->btl_latency < latency) { latency = btl->btl_latency; } } /* (1) set the weight of each btl as a percentage of overall bandwidth * (2) copy all btl instances at the highest priority ranking into the * list of btls used for first fragments */ for(n_index = 0; n_index < n_size; n_index++) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); mca_btl_base_module_t *btl = bml_btl->btl; /* compute weighting factor for this r2 */ if(btl->btl_bandwidth > 0) { bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth); } else { bml_btl->btl_weight = (float)(1.0 / n_size); } /* check to see if this r2 is already in the array of r2s * used for first fragments - if not add it. 
*/ if(btl->btl_latency == latency) { mca_bml_base_btl_t* bml_btl_new = mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager); *bml_btl_new = *bml_btl; } /* set endpoint max send size as min of available btls */ if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size) bml_endpoint->btl_max_send_size = btl->btl_max_send_size; /* check flags - is rdma prefered */ if ((btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET)) && !((proc->proc_arch != ompi_proc_local_proc->proc_arch) && (0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) { mca_bml_base_btl_t* bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma); mca_btl_base_module_t* btl_rdma = bml_btl->btl; *bml_btl_rdma = *bml_btl; if(bml_endpoint->btl_pipeline_send_length < btl_rdma->btl_rdma_pipeline_send_length) { bml_endpoint->btl_pipeline_send_length = btl_rdma->btl_rdma_pipeline_send_length; } if(bml_endpoint->btl_send_limit < btl_rdma->btl_min_rdma_pipeline_size) { bml_endpoint->btl_send_limit = btl_rdma->btl_min_rdma_pipeline_size; } } } } /* see if we have a connection to everyone else */ for(p=0; p<n_new_procs; p++) { ompi_proc_t *proc = new_procs[p]; if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { if (NULL == unreach_proc) { unreach_proc = proc; } ret = OMPI_ERR_UNREACH; } } if (mca_bml_r2.show_unreach_errors && OMPI_ERR_UNREACH == ret) { opal_show_help("help-mca-bml-r2.txt", "unreachable proc", true, OMPI_NAME_PRINT(&(ompi_proc_local_proc->proc_name)), (NULL != ompi_proc_local_proc->proc_hostname ? ompi_proc_local_proc->proc_hostname : "unknown!"), OMPI_NAME_PRINT(&(unreach_proc->proc_name)), (NULL != ompi_proc_local_proc->proc_hostname ? ompi_proc_local_proc->proc_hostname : "unknown!"), btl_names); } free(new_procs); return ret; }
/*
 * Select the rdma btls usable for a registered (one-sided) protocol over
 * [base, base+size): btls must appear in the eager array (unless
 * use_all_rdma is set) and, when they require registration, the
 * registration must succeed.  Returns 0 to signal fallback to the
 * pipeline protocol.
 */
size_t mca_pml_ob1_rdma_btls( mca_bml_base_endpoint_t* bml_endpoint,
                              unsigned char* base,
                              size_t size,
                              mca_pml_ob1_com_btl_t* rdma_btls)
{
    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
    int num_eager_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager);
    double weight_total = 0;
    int num_btls_used = 0;

    /* shortcut when there are no rdma capable btls */
    if (num_btls == 0) {
        return 0;
    }

    /* check to see if memory is registered */
    for (int n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; n++) {
        /* round-robin start point (btl_rdma_index) so successive requests
         * spread the load across the rdma btls */
        mca_bml_base_btl_t* bml_btl =
            mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
                                             (bml_endpoint->btl_rdma_index + n) % num_btls);
        mca_btl_base_registration_handle_t *reg_handle = NULL;
        mca_btl_base_module_t *btl = bml_btl->btl;
        /* NTH: go ahead and use an rdma btl if is the only one */
        bool ignore = !mca_pml_ob1.use_all_rdma;

        /* do not use rdma btls that are not in the eager list. this is necessary to avoid using
         * btls that exist on the endpoint only to support RMA. */
        for (int i = 0 ; i < num_eager_btls && ignore ; ++i) {
            mca_bml_base_btl_t *eager_btl =
                mca_bml_base_btl_array_get_index (&bml_endpoint->btl_eager, i);
            if (eager_btl->btl_endpoint == bml_btl->btl_endpoint) {
                ignore = false;
                break;
            }
        }

        if (ignore) {
            continue;
        }

        if (btl->btl_register_mem) {
            /* do not use the RDMA protocol with this btl if 1) leave pinned is disabled,
             * 2) the btl supports put, and 3) the fragment is larger than the minimum
             * pipeline size specified by the BTL */
            if (!opal_leave_pinned && (btl->btl_flags & MCA_BTL_FLAGS_PUT) &&
                size > btl->btl_min_rdma_pipeline_size) {
                continue;
            }

            /* try to register the memory region with the btl */
            reg_handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base,
                                                size, MCA_BTL_REG_FLAG_REMOTE_READ);
            if (NULL == reg_handle) {
                /* btl requires registration but the registration failed */
                continue;
            }
        } /* else no registration is needed with this btl */

        rdma_btls[num_btls_used].bml_btl = bml_btl;
        rdma_btls[num_btls_used].btl_reg = reg_handle;
        weight_total += bml_btl->btl_weight;
        num_btls_used++;
    }

    /* if we don't use leave_pinned and all BTLs that already have this memory
     * registered amount to less then half of available bandwidth - fall back to
     * pipeline protocol */
    if (0 == num_btls_used || (!opal_leave_pinned && weight_total < 0.5))
        return 0;

    mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size, weight_total);

    bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
    return num_btls_used;
}
/**
 * Remove the given btl module from the proc's eager, send and rdma arrays
 * and recompute the endpoint aggregates (bandwidth weights, minimum
 * max_send_size, rdma offset).
 *
 * Fixed: ep->btl_max_send_size is now reset to the maximum value before the
 * min-scan over the remaining send btls.  Previously the stale minimum was
 * kept, so the endpoint limit could only ever shrink and never recover after
 * removing the most restrictive btl; the comparison also read the cached
 * bml_btl field while the assignment read the module field — both now use
 * the module's btl_max_send_size.
 *
 * @param proc (IN) proc whose endpoint is updated
 * @param btl  (IN) btl module being removed
 * @return OMPI_SUCCESS
 */
int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl)
{
    mca_bml_base_endpoint_t* ep = (mca_bml_base_endpoint_t*)proc->proc_bml;
    double total_bandwidth = 0;
    size_t b;

    /* remove btl from eager list */
    mca_bml_base_btl_array_remove(&ep->btl_eager, btl);

    /* remove btl from send list */
    if (mca_bml_base_btl_array_remove(&ep->btl_send, btl)) {

        /* compute total_bandwidth and
           reset max_send_size to the min of all btl's */
        total_bandwidth = 0;
        /* -1 wraps to the maximum value so the min-scan starts fresh */
        ep->btl_max_send_size = -1;
        for (b = 0; b < mca_bml_base_btl_array_get_size(&ep->btl_send); b++) {
            mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_send, b);
            total_bandwidth += bml_btl->btl->btl_bandwidth;
            if (bml_btl->btl->btl_max_send_size < ep->btl_max_send_size) {
                ep->btl_max_send_size = bml_btl->btl->btl_max_send_size;
            }
        }

        /* compute weighting factor for this btl */
        for (b = 0; b < mca_bml_base_btl_array_get_size(&ep->btl_send); b++) {
            mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_send, b);
            if (bml_btl->btl->btl_bandwidth > 0) {
                bml_btl->btl_weight = bml_btl->btl->btl_bandwidth / total_bandwidth;
            } else {
                bml_btl->btl_weight = 1.0 / mca_bml_base_btl_array_get_size(&ep->btl_send);
            }
        }
    }

    /* remove btl from RDMA list */
    if (mca_bml_base_btl_array_remove(&ep->btl_rdma, btl)) {

        /* compute total bandwidth */
        total_bandwidth = 0;
        for (b = 0; b < mca_bml_base_btl_array_get_size(&ep->btl_rdma); b++) {
            mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_rdma, b);

            /* update aggregate endpoint info */
            total_bandwidth += bml_btl->btl->btl_bandwidth;
            if (ep->btl_rdma_offset < bml_btl->btl_min_rdma_size) {
                ep->btl_rdma_offset = bml_btl->btl_min_rdma_size;
            }
        }

        /* compute weighting factor for this btl */
        for (b = 0; b < mca_bml_base_btl_array_get_size(&ep->btl_rdma); b++) {
            mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_rdma, b);
            if (bml_btl->btl->btl_bandwidth > 0) {
                bml_btl->btl_weight = bml_btl->btl->btl_bandwidth / total_bandwidth;
            } else {
                bml_btl->btl_weight = 1.0 / mca_bml_base_btl_array_get_size(&ep->btl_rdma);
            }
        }
    }

    return OMPI_SUCCESS;
}
int mca_bml_r2_add_procs( size_t nprocs, struct ompi_proc_t** procs, struct mca_bml_base_endpoint_t** bml_endpoints, struct ompi_bitmap_t* reachable ) { size_t p; int rc; size_t p_index; struct mca_btl_base_endpoint_t ** btl_endpoints = NULL; struct ompi_proc_t** new_procs = NULL; size_t n_new_procs = 0; int ret = OMPI_SUCCESS; struct ompi_proc_t *unreach_proc = NULL; if(0 == nprocs) { return OMPI_SUCCESS; } if(OMPI_SUCCESS != (rc = mca_bml_r2_add_btls()) ) { return rc; } new_procs = (struct ompi_proc_t **) malloc(nprocs * sizeof(struct ompi_proc_t *)); if (NULL == new_procs ) { return OMPI_ERR_OUT_OF_RESOURCE; } memset(bml_endpoints, 0, nprocs * sizeof(struct mca_bml_base_endpoint_t*)); for(p_index = 0; p_index < nprocs; p_index++) { struct ompi_proc_t* proc; proc = procs[p_index]; OBJ_RETAIN(proc); if(NULL != proc->proc_bml) { bml_endpoints[p_index] = (mca_bml_base_endpoint_t*) proc->proc_bml; } else { new_procs[n_new_procs++] = proc; } } if ( 0 == n_new_procs ) { return OMPI_SUCCESS; } procs = new_procs; nprocs = n_new_procs; /* attempt to add all procs to each r2 */ btl_endpoints = (struct mca_btl_base_endpoint_t **) malloc(nprocs * sizeof(struct mca_btl_base_endpoint_t*)); if (NULL == btl_endpoints) { return OMPI_ERR_OUT_OF_RESOURCE; } for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) { mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index]; int btl_inuse = 0; /* if the r2 can reach the destination proc it sets the * corresponding bit (proc index) in the reachable bitmap * and can return addressing information for each proc * that is passed back to the r2 on data transfer calls */ ompi_bitmap_clear_all_bits(reachable); memset(btl_endpoints, 0, nprocs *sizeof(struct mca_btl_base_endpoint_t*)); rc = btl->btl_add_procs(btl, n_new_procs, new_procs, btl_endpoints, reachable); if(OMPI_SUCCESS != rc) { free(btl_endpoints); return rc; } /* for each proc that is reachable - add the endpoint to the bml_endpoints array(s) */ for(p=0; p<n_new_procs; 
p++) { if(ompi_bitmap_is_set_bit(reachable, p)) { ompi_proc_t *proc = new_procs[p]; mca_bml_base_endpoint_t * bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; mca_bml_base_btl_t* bml_btl; size_t size; btl_inuse++; if(NULL == bml_endpoint) { /* allocate bml specific proc data */ bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t); if (NULL == bml_endpoint) { opal_output(0, "mca_bml_r2_add_procs: unable to allocate resources"); free(btl_endpoints); return OMPI_ERR_OUT_OF_RESOURCE; } /* preallocate space in array for max number of r2s */ mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules); mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send, mca_bml_r2.num_btl_modules); mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma, mca_bml_r2.num_btl_modules); bml_endpoint->btl_max_send_size = -1; bml_endpoint->btl_proc = proc; proc->proc_bml = bml_endpoint; bml_endpoint->btl_flags_and = 0; bml_endpoint->btl_flags_or = 0; } bml_endpoints[p] =(mca_bml_base_endpoint_t*) proc->proc_bml; /* dont allow an additional BTL with a lower exclusivity ranking */ size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); if(size > 0) { bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1); /* skip this btl if the exclusivity is less than the previous */ if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity) { if(btl_endpoints[p] != NULL) { btl->btl_del_procs(btl, 1, &proc, &btl_endpoints[p]); } btl_inuse--; continue; } } /* cache the endpoint on the proc */ bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send); bml_btl->btl = btl; bml_btl->btl_eager_limit = btl->btl_eager_limit; bml_btl->btl_min_send_size = btl->btl_min_send_size; bml_btl->btl_max_send_size = btl->btl_max_send_size; bml_btl->btl_min_rdma_size = btl->btl_min_rdma_size; bml_btl->btl_max_rdma_size = btl->btl_max_rdma_size; bml_btl->btl_cache = NULL; bml_btl->btl_endpoint = btl_endpoints[p]; bml_btl->btl_weight = 0; bml_btl->btl_alloc = 
btl->btl_alloc; bml_btl->btl_free = btl->btl_free; bml_btl->btl_prepare_src = btl->btl_prepare_src; bml_btl->btl_prepare_dst = btl->btl_prepare_dst; bml_btl->btl_send = btl->btl_send; bml_btl->btl_flags = btl->btl_flags; bml_btl->btl_put = btl->btl_put; if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == bml_btl->btl_put) ) { opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for" " the %s BTL without any PUT function attached. Disard the flag !", bml_btl->btl->btl_component->btl_version.mca_component_name); bml_btl->btl_flags ^= MCA_BTL_FLAGS_PUT; } bml_btl->btl_get = btl->btl_get; if( (bml_btl->btl_flags & MCA_BTL_FLAGS_GET) && (NULL == bml_btl->btl_get) ) { opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for" " the %s BTL without any GET function attached. Disard the flag !", bml_btl->btl->btl_component->btl_version.mca_component_name); bml_btl->btl_flags ^= MCA_BTL_FLAGS_GET; } bml_btl->btl_mpool = btl->btl_mpool; if( (bml_btl->btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) { /** * If no protocol specified, we have 2 choices: we ignore the BTL * as we don't know which protocl to use, or we suppose that all * BTLs support the send protocol. 
*/ bml_btl->btl_flags |= MCA_BTL_FLAGS_SEND; } /** * calculate the bitwise OR and AND of the btl flags */ bml_endpoint->btl_flags_or |= bml_btl->btl_flags; bml_endpoint->btl_flags_and &= bml_btl->btl_flags; } } if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) { size_t p; bool found = false; for(p=0; p<mca_bml_r2.num_btl_progress; p++) { if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) { found = true; break; } } if(found == false) { mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress] = btl->btl_component->btl_progress; mca_bml_r2.num_btl_progress++; } } } free(btl_endpoints); /* iterate back through procs and compute metrics for registered r2s */ for(p=0; p<n_new_procs; p++) { ompi_proc_t *proc = new_procs[p]; mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; double total_bandwidth = 0; uint32_t latency = 0xffffffff; size_t n_index; size_t n_size; /* skip over procs w/ no btl's registered */ if(NULL == bml_endpoint) { continue; } /* (1) determine the total bandwidth available across all btls * note that we need to do this here, as we may already have btls configured * (2) determine the highest priority ranking for latency * (3) compute the maximum amount of bytes that can be send without any * weighting. Once the left over is smaller than this number we will * start using the weight to compute the correct amount. 
*/ n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); bml_endpoint->bml_max_send_length = 0; bml_endpoint->bml_max_rdma_length = 0; bml_endpoint->btl_rdma_index = 0; for(n_index = 0; n_index < n_size; n_index++) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); mca_btl_base_module_t* btl = bml_btl->btl; total_bandwidth += bml_btl->btl->btl_bandwidth; if(btl->btl_latency < latency) { latency = btl->btl_latency; } bml_endpoint->bml_max_send_length += bml_btl->btl->btl_bandwidth; } /* (1) set the weight of each btl as a percentage of overall bandwidth * (2) copy all btl instances at the highest priority ranking into the * list of btls used for first fragments */ for(n_index = 0; n_index < n_size; n_index++) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); mca_btl_base_module_t *btl = bml_btl->btl; /* compute weighting factor for this r2 */ if(btl->btl_bandwidth > 0) { bml_btl->btl_weight = btl->btl_bandwidth / total_bandwidth; } else { bml_btl->btl_weight = 1.0 / n_size; } /* check to see if this r2 is already in the array of r2s * used for first fragments - if not add it. 
*/ if(btl->btl_latency == latency) { mca_bml_base_btl_t* bml_btl_new = mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager); *bml_btl_new = *bml_btl; } /* set endpoint max send size as min of available btls */ if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size) bml_endpoint->btl_max_send_size = btl->btl_max_send_size; /* check flags - is rdma prefered */ if(btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET) && proc->proc_arch == ompi_proc_local_proc->proc_arch) { mca_bml_base_btl_t* bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma); *bml_btl_rdma = *bml_btl; if(bml_endpoint->btl_rdma_offset < bml_btl_rdma->btl_min_rdma_size) { bml_endpoint->btl_rdma_offset = bml_btl_rdma->btl_min_rdma_size; } } } } /* see if we have a connection to everyone else */ for(p=0; p<n_new_procs; p++) { ompi_proc_t *proc = new_procs[p]; if (NULL == proc->proc_bml) { if (NULL == unreach_proc) { unreach_proc = proc; } ret = OMPI_ERR_UNREACH; } } if (mca_bml_r2.show_unreach_errors && OMPI_ERR_UNREACH == ret) { char *local, *remote; orte_ns.get_proc_name_string(&local, &(ompi_proc_local_proc->proc_name)); orte_ns.get_proc_name_string(&remote, &(unreach_proc->proc_name)); opal_show_help("help-mca-bml-r2", "unreachable proc", true, local, remote, NULL); free(local); free(remote); } free(new_procs); return ret; }