/*
 * Test helper: query a bit through opal_bitmap_is_set_bit() and cross-check
 * the answer against the raw bitmap storage.
 *
 * bm   bitmap under test
 * bit  bit index to query (may be negative or out of range on purpose)
 *
 * Returns 0 when the API answer and the storage agree; otherwise prints a
 * diagnostic to error_out and returns ERR_CODE.
 */
int is_set_bit(opal_bitmap_t *bm, int bit)
{
    bool result = opal_bitmap_is_set_bit(bm, bit);

    if (result) {
        /* a negative index must never be reported as set */
        if (bit < 0) {
            fprintf(error_out, "ERROR: is_set_bit for bit = %d \n\n",bit);
            return ERR_CODE;
        }

        /* reported set, so the backing storage must have the bit on */
        if (!(bm->bitmap[bit/SIZE_OF_CHAR] & (1 << bit % SIZE_OF_CHAR))) {
            fprintf(error_out, "ERROR: is_set_bit for bit = %d \n\n",bit);
            return ERR_CODE;
        }

        return 0;
    }

    /* Reported clear: for an index that lies inside the storage, the stored
     * bit must be clear as well.  It is an error only when the storage says
     * the bit is ON.  Indices outside the storage cannot be verified and
     * are accepted. */
    if (0 <= bit && bit < (bm->array_size * SIZE_OF_CHAR) &&
        (bm->bitmap[bit/SIZE_OF_CHAR] & (1 << bit % SIZE_OF_CHAR))) {
        fprintf(error_out, "ERROR: is_set_bit for bit = %d \n\n",bit);
        return ERR_CODE;
    }

    return 0;
}
/*
 * Test helper: set a bit and confirm that it reads back as set.
 *
 * Returns 0 on success.  If the set call reports a non-zero status, or the
 * bit does not read back as set, a diagnostic is printed to error_out and
 * ERR_CODE is returned.
 */
int set_bit(opal_bitmap_t *bm, int bit)
{
    /* the read-back is only attempted when the set call itself succeeded */
    if (0 == opal_bitmap_set_bit(bm, bit) && opal_bitmap_is_set_bit(bm, bit)) {
        return 0;
    }

    fprintf(error_out, "ERROR: set_bit for bit = %d\n\n", bit);
    return ERR_CODE;
}
int main(int argc, char* argv[]) { int i, j; int found; opal_list_t children; opal_list_item_t *item; int num_children; int num_procs; orte_routed_tree_t *child; opal_bitmap_t *relations; if (2 != argc) { printf("usage: binom x, where x=number of procs\n"); exit(1); } orte_init(&argc, &argv, ORTE_PROC_TOOL); num_procs = atoi(argv[1]); for (i=0; i < num_procs; i++) { OBJ_CONSTRUCT(&children, opal_list_t); num_children = 0; printf("i am %d:", i); found = down_search(0, 0, i, num_procs, &num_children, &children, NULL); printf("\tparent %d num_children %d\n", found, num_children); while (NULL != (item = opal_list_remove_first(&children))) { child = (orte_routed_tree_t*)item; printf("\tchild %d\n", child->vpid); for (j=0; j < num_procs; j++) { if (opal_bitmap_is_set_bit(&child->relatives, j)) { printf("\t\trelation %d\n", j); } } OBJ_RELEASE(item); } OBJ_DESTRUCT(&children); } orte_finalize(); }
static void update_routing_plan(void) { orte_routed_tree_t *child; int j; opal_list_item_t *item; /* if I am anything other than a daemon or the HNP, this * is a meaningless command as I am not allowed to route */ if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) { return; } /* clear the list of children if any are already present */ while (NULL != (item = opal_list_remove_first(&my_children))) { OBJ_RELEASE(item); } num_children = 0; /* compute my direct children and the bitmap that shows which vpids * lie underneath their branch */ ORTE_PROC_MY_PARENT->vpid = binomial_tree(0, 0, ORTE_PROC_MY_NAME->vpid, orte_process_info.max_procs, &num_children, &my_children, NULL, true); if (0 < opal_output_get_verbosity(orte_routed_base_framework.framework_output)) { opal_output(0, "%s: parent %d num_children %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_PROC_MY_PARENT->vpid, num_children); for (item = opal_list_get_first(&my_children); item != opal_list_get_end(&my_children); item = opal_list_get_next(item)) { child = (orte_routed_tree_t*)item; opal_output(0, "%s: \tchild %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), child->vpid); for (j=0; j < (int)orte_process_info.max_procs; j++) { if (opal_bitmap_is_set_bit(&child->relatives, j)) { opal_output(0, "%s: \t\trelation %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j); } } } } }
/*
 * Render a bitmap as a printable string with one character per bit:
 * 'X' for a set bit and '_' for a clear bit, in increasing bit order.
 *
 * bitmap  bitmap to render (may be NULL)
 *
 * Returns a NUL-terminated, heap-allocated string that the caller must
 * free().  Returns NULL when bitmap is NULL, when the bitmap holds no bits,
 * or when memory cannot be allocated.
 */
char * opal_bitmap_get_string(opal_bitmap_t *bitmap)
{
    int i;
    int num_bits;
    char *bitmap_str = NULL;

    if( NULL == bitmap) {
        return NULL;
    }

    num_bits = bitmap->array_size * SIZE_OF_CHAR;
    if( num_bits <= 0 ) {
        /* nothing to render; an empty bitmap has always produced NULL */
        return NULL;
    }

    /* the final length is known up front, so allocate once and fill in
     * place instead of re-allocating and copying the string for every bit */
    bitmap_str = (char*) malloc((size_t)num_bits + 1);
    if( NULL == bitmap_str ) {
        return NULL;
    }

    for( i = 0; i < num_bits; ++i) {
        bitmap_str[i] = opal_bitmap_is_set_bit(bitmap, i) ? 'X' : '_';
    }
    bitmap_str[num_bits] = '\0';

    return bitmap_str;
}
static void update_routing_plan(void) { orte_routed_tree_t *child; int j; opal_list_item_t *item; int Level,Sum,NInLevel,Ii; int NInPrevLevel; /* if I am anything other than a daemon or the HNP, this * is a meaningless command as I am not allowed to route */ if (!ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_HNP) { return; } /* clear the list of children if any are already present */ while (NULL != (item = opal_list_remove_first(&my_children))) { OBJ_RELEASE(item); } num_children = 0; /* compute my parent */ Ii = ORTE_PROC_MY_NAME->vpid; Level=0; Sum=1; NInLevel=1; while ( Sum < (Ii+1) ) { Level++; NInLevel *= mca_routed_radix_component.radix; Sum += NInLevel; } Sum -= NInLevel; NInPrevLevel = NInLevel/mca_routed_radix_component.radix; if( 0 == Ii ) { ORTE_PROC_MY_PARENT->vpid = -1; } else { ORTE_PROC_MY_PARENT->vpid = (Ii-Sum) % NInPrevLevel; ORTE_PROC_MY_PARENT->vpid += (Sum - NInPrevLevel); } /* compute my direct children and the bitmap that shows which vpids * lie underneath their branch */ radix_tree(Ii, &num_children, &my_children, NULL); if (0 < opal_output_get_verbosity(orte_routed_base_framework.framework_output)) { opal_output(0, "%s: parent %d num_children %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_PROC_MY_PARENT->vpid, num_children); for (item = opal_list_get_first(&my_children); item != opal_list_get_end(&my_children); item = opal_list_get_next(item)) { child = (orte_routed_tree_t*)item; opal_output(0, "%s: \tchild %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), child->vpid); for (j=0; j < (int)orte_process_info.num_procs; j++) { if (opal_bitmap_is_set_bit(&child->relatives, j)) { opal_output(0, "%s: \t\trelation %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j); } } } } }
/*
 * Decide the next hop for a message addressed to 'target' (radix routing).
 *
 * The returned name is a copy: the target itself for a direct send, the
 * name of an intermediate process to relay through, or the value of
 * ORTE_NAME_INVALID when no route is known.
 */
static orte_process_name_t get_route(orte_process_name_t *target)
{
    orte_process_name_t *ret, daemon;
    opal_list_item_t *cursor;
    orte_routed_tree_t *branch;
    orte_routed_jobfam_t *entry;
    uint16_t wanted_family;
    int idx;

    /* with routing switched off everything goes direct */
    if (!orte_routing_is_enabled) {
        ret = target;
        goto found;
    }

    /* start from my own daemon's name */
    daemon.jobid = ORTE_PROC_MY_DAEMON->jobid;
    daemon.vpid = ORTE_PROC_MY_DAEMON->vpid;

    /* an invalid target has no route */
    if (ORTE_JOBID_INVALID == target->jobid ||
        ORTE_VPID_INVALID == target->vpid) {
        ret = ORTE_NAME_INVALID;
        goto found;
    }

    /* sending to myself is trivially direct */
    if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) {
        ret = target;
        goto found;
    }

    /* application processes always hand off to their local daemon */
    if (ORTE_PROC_IS_APP) {
        ret = ORTE_PROC_MY_DAEMON;
        goto found;
    }

    /* tools go direct inside their own job family, and to the target's
     * HNP otherwise */
    if (ORTE_PROC_IS_TOOL) {
        if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
            ORTE_HNP_NAME_FROM_JOB(&daemon, target->jobid);
            ret = &daemon;
        } else {
            ret = target;
        }
        goto found;
    }

    /****** HNP AND DAEMONS ONLY ******/

    /* target lives in a different job family */
    if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
        /* daemons relay such traffic through the HNP */
        if (ORTE_PROC_IS_DAEMON) {
            ret = ORTE_PROC_MY_HNP;
            goto found;
        }

        /* otherwise look the family up among the stored routes; if it is
         * not there we have no route */
        ret = ORTE_NAME_INVALID;
        wanted_family = ORTE_JOB_FAMILY(target->jobid);
        for (idx = 0; idx < orte_routed_jobfams.size; idx++) {
            entry = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, idx);
            if (NULL != entry && entry->job_family == wanted_family) {
                OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, "%s routed_binomial: route to %s found", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOB_FAMILY_PRINT(target->jobid)));
                ret = &entry->route;
                break;
            }
        }
        goto found;
    }

    /* target is in our own job family from here on */

    /* traffic for the HNP goes direct only when a direct path is known and
     * static ports are not in use; otherwise it climbs the tree */
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
        if (hnp_direct && !orte_static_ports) {
            OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, "%s routing direct to the HNP", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ret = ORTE_PROC_MY_HNP;
        } else {
            OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, "%s routing to the HNP through my parent %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
            ret = ORTE_PROC_MY_PARENT;
        }
        goto found;
    }

    /* identify the daemon hosting the target */
    daemon.jobid = ORTE_PROC_MY_NAME->jobid;
    daemon.vpid = orte_get_proc_daemon_vpid(target);
    if (ORTE_VPID_INVALID == daemon.vpid) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ret = ORTE_NAME_INVALID;
        goto found;
    }

    /* I host the target myself: deliver directly */
    if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
        ret = target;
        goto found;
    }

    /* small jobs talk straight to the hosting daemon */
    if (orte_process_info.num_procs < mca_routed_radix_component.max_connections) {
        ret = &daemon;
        goto found;
    }

    /* otherwise find the child branch that leads to the hosting daemon */
    cursor = opal_list_get_first(&my_children);
    while (cursor != opal_list_get_end(&my_children)) {
        branch = (orte_routed_tree_t*)cursor;

        /* the child itself hosts the target */
        if (branch->vpid == daemon.vpid) {
            ret = &daemon;
            goto found;
        }

        /* the hosting daemon sits somewhere below this child */
        if (opal_bitmap_is_set_bit(&branch->relatives, daemon.vpid)) {
            daemon.vpid = branch->vpid;
            ret = &daemon;
            goto found;
        }

        cursor = opal_list_get_next(cursor);
    }

    /* not under any child, so the only way is up through my parent */
    daemon.vpid = ORTE_PROC_MY_PARENT->vpid;
    ret = &daemon;

 found:
    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s routed_radix_get(%s) --> %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(target), ORTE_NAME_PRINT(ret)));

    return *ret;
}
static int mca_bml_r2_add_procs( size_t nprocs, struct ompi_proc_t** procs, struct opal_bitmap_t* reachable ) { size_t p, p_index, n_new_procs = 0; struct mca_btl_base_endpoint_t ** btl_endpoints = NULL; struct ompi_proc_t** new_procs = NULL; struct ompi_proc_t *unreach_proc = NULL; int rc, ret = OMPI_SUCCESS; if(0 == nprocs) { return OMPI_SUCCESS; } if(OMPI_SUCCESS != (rc = mca_bml_r2_add_btls()) ) { return rc; } /* Select only the procs that don't yet have the BML proc struct. This prevent * us from calling btl->add_procs several this on the same destination proc. */ for(p_index = 0; p_index < nprocs; p_index++) { struct ompi_proc_t* proc = procs[p_index]; OBJ_RETAIN(proc); if(NULL != proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { continue; /* go to the next proc */ } /* Allocate the new_procs on demand */ if( NULL == new_procs ) { new_procs = (struct ompi_proc_t **)malloc(nprocs * sizeof(struct ompi_proc_t *)); if( NULL == new_procs ) { return OMPI_ERR_OUT_OF_RESOURCE; } } new_procs[n_new_procs++] = proc; } if ( 0 == n_new_procs ) { return OMPI_SUCCESS; } /* Starting from here we only work on the unregistered procs */ procs = new_procs; nprocs = n_new_procs; /* attempt to add all procs to each r2 */ btl_endpoints = (struct mca_btl_base_endpoint_t **) malloc(nprocs * sizeof(struct mca_btl_base_endpoint_t*)); if (NULL == btl_endpoints) { free(new_procs); return OMPI_ERR_OUT_OF_RESOURCE; } for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) { mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index]; int btl_inuse = 0; /* if the r2 can reach the destination proc it sets the * corresponding bit (proc index) in the reachable bitmap * and can return addressing information for each proc * that is passed back to the r2 on data transfer calls */ opal_bitmap_clear_all_bits(reachable); memset(btl_endpoints, 0, nprocs *sizeof(struct mca_btl_base_endpoint_t*)); rc = btl->btl_add_procs(btl, n_new_procs, new_procs, btl_endpoints, reachable); 
if(OMPI_SUCCESS != rc) { /* This BTL has troubles adding the nodes. Let's continue maybe some other BTL * can take care of this task. */ continue; } /* for each proc that is reachable */ for( p = 0; p < n_new_procs; p++ ) { if(opal_bitmap_is_set_bit(reachable, p)) { ompi_proc_t *proc = new_procs[p]; mca_bml_base_endpoint_t * bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; mca_bml_base_btl_t* bml_btl; size_t size; if(NULL == bml_endpoint) { /* allocate bml specific proc data */ bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t); if (NULL == bml_endpoint) { opal_output(0, "mca_bml_r2_add_procs: unable to allocate resources"); free(btl_endpoints); free(new_procs); return OMPI_ERR_OUT_OF_RESOURCE; } /* preallocate space in array for max number of r2s */ mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules); mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send, mca_bml_r2.num_btl_modules); mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma, mca_bml_r2.num_btl_modules); bml_endpoint->btl_max_send_size = -1; bml_endpoint->btl_proc = proc; proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = bml_endpoint; bml_endpoint->btl_flags_or = 0; } /* dont allow an additional BTL with a lower exclusivity ranking */ size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); if(size > 0) { bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1); /* skip this btl if the exclusivity is less than the previous */ if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity) { btl->btl_del_procs(btl, 1, &proc, &btl_endpoints[p]); continue; } } /* cache the endpoint on the proc */ bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send); bml_btl->btl = btl; bml_btl->btl_endpoint = btl_endpoints[p]; bml_btl->btl_weight = 0; bml_btl->btl_flags = btl->btl_flags; if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) { opal_output(0, "mca_bml_r2_add_procs: The PUT flag is 
specified for" " the %s BTL without any PUT function attached. Disard the flag !", bml_btl->btl->btl_component->btl_version.mca_component_name); bml_btl->btl_flags ^= MCA_BTL_FLAGS_PUT; } if( (bml_btl->btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) { opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for" " the %s BTL without any GET function attached. Discard the flag !", bml_btl->btl->btl_component->btl_version.mca_component_name); bml_btl->btl_flags ^= MCA_BTL_FLAGS_GET; } if( (bml_btl->btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) { /** * If no protocol specified, we have 2 choices: we ignore the BTL * as we don't know which protocl to use, or we suppose that all * BTLs support the send protocol. */ bml_btl->btl_flags |= MCA_BTL_FLAGS_SEND; } /** * calculate the bitwise OR of the btl flags */ bml_endpoint->btl_flags_or |= bml_btl->btl_flags; /* This BTL is in use, allow the progress registration */ btl_inuse++; } } if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) { size_t p; bool found = false; for( p = 0; p < mca_bml_r2.num_btl_progress; p++ ) { if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) { found = true; break; } } if(found == false) { mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress] = btl->btl_component->btl_progress; mca_bml_r2.num_btl_progress++; opal_progress_register( btl->btl_component->btl_progress ); } } } free(btl_endpoints); /* iterate back through procs and compute metrics for registered r2s */ for(p=0; p<n_new_procs; p++) { ompi_proc_t *proc = new_procs[p]; mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; double total_bandwidth = 0; uint32_t latency = 0xffffffff; size_t n_index; size_t n_size; /* skip over procs w/ no btl's registered */ if(NULL == bml_endpoint) { continue; } /* (1) determine the total bandwidth available across all btls * note that we need to do this here, as we may 
already have btls configured * (2) determine the highest priority ranking for latency * (3) compute the maximum amount of bytes that can be send without any * weighting. Once the left over is smaller than this number we will * start using the weight to compute the correct amount. */ n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); /* sort BTLs in descending order according to bandwidth value */ qsort(bml_endpoint->btl_send.bml_btls, n_size, sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); bml_endpoint->btl_rdma_index = 0; for(n_index = 0; n_index < n_size; n_index++) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); mca_btl_base_module_t* btl = bml_btl->btl; total_bandwidth += bml_btl->btl->btl_bandwidth; if(btl->btl_latency < latency) { latency = btl->btl_latency; } } /* (1) set the weight of each btl as a percentage of overall bandwidth * (2) copy all btl instances at the highest priority ranking into the * list of btls used for first fragments */ for(n_index = 0; n_index < n_size; n_index++) { mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index); mca_btl_base_module_t *btl = bml_btl->btl; /* compute weighting factor for this r2 */ if(btl->btl_bandwidth > 0) { bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth); } else { bml_btl->btl_weight = (float)(1.0 / n_size); } /* check to see if this r2 is already in the array of r2s * used for first fragments - if not add it. 
*/ if(btl->btl_latency == latency) { mca_bml_base_btl_t* bml_btl_new = mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager); *bml_btl_new = *bml_btl; } /* set endpoint max send size as min of available btls */ if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size) bml_endpoint->btl_max_send_size = btl->btl_max_send_size; /* check flags - is rdma prefered */ if ((btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET)) && !((proc->proc_arch != ompi_proc_local_proc->proc_arch) && (0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) { mca_bml_base_btl_t* bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma); mca_btl_base_module_t* btl_rdma = bml_btl->btl; *bml_btl_rdma = *bml_btl; if(bml_endpoint->btl_pipeline_send_length < btl_rdma->btl_rdma_pipeline_send_length) { bml_endpoint->btl_pipeline_send_length = btl_rdma->btl_rdma_pipeline_send_length; } if(bml_endpoint->btl_send_limit < btl_rdma->btl_min_rdma_pipeline_size) { bml_endpoint->btl_send_limit = btl_rdma->btl_min_rdma_pipeline_size; } } } } /* see if we have a connection to everyone else */ for(p=0; p<n_new_procs; p++) { ompi_proc_t *proc = new_procs[p]; if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { if (NULL == unreach_proc) { unreach_proc = proc; } ret = OMPI_ERR_UNREACH; } } if (mca_bml_r2.show_unreach_errors && OMPI_ERR_UNREACH == ret) { opal_show_help("help-mca-bml-r2.txt", "unreachable proc", true, OMPI_NAME_PRINT(&(ompi_proc_local_proc->proc_name)), (NULL != ompi_proc_local_proc->proc_hostname ? ompi_proc_local_proc->proc_hostname : "unknown!"), OMPI_NAME_PRINT(&(unreach_proc->proc_name)), (NULL != ompi_proc_local_proc->proc_hostname ? ompi_proc_local_proc->proc_hostname : "unknown!"), btl_names); } free(new_procs); return ret; }
/*
 * Decide the next hop for a message addressed to 'target' (radix routing).
 *
 * The returned name is a copy: the target itself for a direct send, the
 * name of an intermediate process to relay through, or the value of
 * ORTE_NAME_INVALID when no route is known.
 */
static orte_process_name_t get_route(orte_process_name_t *target)
{
    orte_process_name_t *ret, daemon;
    opal_list_item_t *cursor;
    orte_routed_tree_t *branch;
    int lookup_rc;

    /* an invalid target has no route */
    if (ORTE_JOBID_INVALID == target->jobid ||
        ORTE_VPID_INVALID == target->vpid) {
        ret = ORTE_NAME_INVALID;
        goto found;
    }

    /* sending to myself is trivially direct */
    if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) {
        ret = target;
        goto found;
    }

    /* application processes always hand off to their local daemon */
    if (ORTE_PROC_IS_APP) {
        ret = ORTE_PROC_MY_DAEMON;
        goto found;
    }

    /****** HNP AND DAEMONS ONLY ******/

    /* job family zero denotes a local slave, which is reached directly */
    if (0 == ORTE_JOB_FAMILY(target->jobid)) {
        ret = target;
        goto found;
    }

    /* target lives in a different job family */
    if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
        /* daemons relay such traffic through the HNP */
        if (ORTE_PROC_IS_DAEMON) {
            ret = ORTE_PROC_MY_HNP;
            goto found;
        }

        /* otherwise consult the stored per-family routes; a failed lookup
         * means we have no route */
        lookup_rc = opal_hash_table_get_value_uint32(&jobfam_list,
                                                     ORTE_JOB_FAMILY(target->jobid),
                                                     (void**)&ret);
        if (ORTE_SUCCESS != lookup_rc) {
            ret = ORTE_NAME_INVALID;
        }
        goto found;
    }

    /* target is in our own job family from here on */

    /* without static ports, traffic for the HNP goes direct */
    if (!orte_static_ports &&
        ORTE_PROC_MY_HNP->jobid == target->jobid &&
        ORTE_PROC_MY_HNP->vpid == target->vpid) {
        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, "%s routing not enabled - going direct", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        ret = target;
        goto found;
    }

    /* identify the daemon hosting the target */
    daemon.jobid = ORTE_PROC_MY_NAME->jobid;
    daemon.vpid = orte_ess.proc_get_daemon(target);
    if (ORTE_VPID_INVALID == daemon.vpid) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ret = ORTE_NAME_INVALID;
        goto found;
    }

    /* I host the target myself: deliver directly */
    if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
        ret = target;
        goto found;
    }

    /* find the child branch that leads to the hosting daemon */
    cursor = opal_list_get_first(&my_children);
    while (cursor != opal_list_get_end(&my_children)) {
        branch = (orte_routed_tree_t*)cursor;

        /* the child itself hosts the target */
        if (branch->vpid == daemon.vpid) {
            ret = &daemon;
            goto found;
        }

        /* the hosting daemon sits somewhere below this child */
        if (opal_bitmap_is_set_bit(&branch->relatives, daemon.vpid)) {
            daemon.vpid = branch->vpid;
            ret = &daemon;
            goto found;
        }

        cursor = opal_list_get_next(cursor);
    }

    /* not under any child, so the only way is up through my parent */
    daemon.vpid = my_parent.vpid;
    ret = &daemon;

 found:
    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, "%s routed_radix_get(%s) --> %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(target), ORTE_NAME_PRINT(ret)));

    return *ret;
}
/*
 * Handle one contribution to a daemon-level collective.
 *
 * The message in 'data' carries, in order: the jobid, the RML tag to deliver
 * the result on, the number of contributors folded into this message, and
 * the payload.  The payload is accumulated in the job's collection bucket.
 * Once contributions from every expected participant have arrived, a
 * non-HNP daemon forwards the combined bucket to its parent in the routing
 * tree, while the HNP xcasts the result back to the job.
 *
 * sender  name of the process the contribution came from (used for
 *         diagnostics only)
 * data    buffer holding the contribution
 *
 * Errors are reported through ORTE_ERROR_LOG; nothing is returned.
 */
void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender, opal_buffer_t *data)
{
    orte_jobid_t jobid;
    orte_odls_job_t *jobdat;
    orte_routed_tree_t *child;
    orte_std_cntr_t n;
    opal_list_t daemon_tree;
    opal_list_item_t *item, *next;
    int32_t num_contributors;
    opal_buffer_t buf;
    orte_process_name_t my_parent, proc;
    orte_vpid_t daemonvpid;
    int rc;
    int32_t numc;
    orte_rml_tag_t rmltag;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:base:daemon_coll: daemon collective called",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* unpack the jobid using this collective */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobid, &n, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* lookup the job record for it - jobdat must stay NULL unless an entry
     * with a matching jobid is actually found, otherwise we would silently
     * operate on whichever job happens to be last in the list */
    jobdat = NULL;
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        orte_odls_job_t *candidate = (orte_odls_job_t*)item;

        /* is this the specified job? */
        if (candidate->jobid == jobid) {
            jobdat = candidate;
            break;
        }
    }
    if (NULL == jobdat) {
        /* race condition - someone sent us a collective before we could
         * parse the add_local_procs cmd. Just add the jobdat object
         * and continue
         */
        jobdat = OBJ_NEW(orte_odls_job_t);
        jobdat->jobid = jobid;
        opal_list_append(&orte_local_jobdata, &jobdat->super);
    }

    /* it may be possible to get here prior to having actually finished processing our
     * local launch msg due to the race condition between different nodes and when
     * they start their individual procs. Hence, we have to first ensure that we
     * -have- finished processing the launch msg, or else we won't know whether
     * or not to wait before sending this on
     */
    OPAL_THREAD_LOCK(&jobdat->lock);
    while (!jobdat->launch_msg_processed) {
        opal_condition_wait(&jobdat->cond, &jobdat->lock);
    }
    OPAL_THREAD_UNLOCK(&jobdat->lock);

    /* unpack the tag for this collective */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &rmltag, &n, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* unpack the number of contributors in this data bucket */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &num_contributors, &n, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    jobdat->num_contributors += num_contributors;

    /* xfer the data */
    opal_dss.copy_payload(&jobdat->collection_bucket, data);

    /* count the number of participants collected */
    jobdat->num_collected++;

    /* if we haven't already done so, figure out how many participants we
     * should be expecting
     */
    if (jobdat->num_participating < 0) {
        if (0 < jobdat->num_local_procs) {
            /* we have children, so account for our own participation */
            jobdat->num_participating = 1;
        } else {
            jobdat->num_participating = 0;
        }
        /* now see if anyone else will be sending us something */
        /* NOTE(review): daemon_tree is never destructed and the entries
         * removed from it below are not released here - confirm who owns
         * the items returned by get_routing_tree() */
        OBJ_CONSTRUCT(&daemon_tree, opal_list_t);
        orte_routed.get_routing_tree(&daemon_tree);
        /* unfortunately, there is no simple way to determine which of our "child"
         * daemons in the routing tree will be sending us something. All we can do
         * is brute force a search, though we attempt to keep it as short as possible
         */
        proc.jobid = jobid;
        proc.vpid = 0;
        while (proc.vpid < jobdat->num_procs && 0 < opal_list_get_size(&daemon_tree)) {
            ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc));

            /* get the daemon that hosts this proc */
            daemonvpid = orte_ess.proc_get_daemon(&proc);
            /* is this daemon one of our children, or at least its contribution
             * will pass through one of our children
             */
            item = opal_list_get_first(&daemon_tree);
            while (item != opal_list_get_end(&daemon_tree)) {
                /* grab the successor first: item may be unlinked below */
                next = opal_list_get_next(item);
                child = (orte_routed_tree_t*)item;
                if (child->vpid == daemonvpid || opal_bitmap_is_set_bit(&child->relatives, daemonvpid)) {
                    /* it does - add to num_participating */
                    jobdat->num_participating++;
                    /* remove this from the list so we don't double count it */
                    opal_list_remove_item(&daemon_tree, item);
                    /* done with search */
                    break;
                }
                item = next;
            }
            proc.vpid++;
        }
    }

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:base:daemon_coll: daemon collective for job %s from %s type %ld"
                         " num_collected %d num_participating %d num_contributors %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jobid),
                         ORTE_NAME_PRINT(sender),
                         (long)jobdat->collective_type, jobdat->num_collected,
                         jobdat->num_participating, jobdat->num_contributors));

    if (jobdat->num_collected == jobdat->num_participating) {
        /* if I am the HNP, go process the results */
        if (ORTE_PROC_IS_HNP) {
            goto hnp_process;
        }

        /* if I am not the HNP, send to my parent */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        /* pack the jobid */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* pack the target tag */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &rmltag, 1, ORTE_RML_TAG))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* pack the number of contributors */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobdat->num_contributors, 1, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* xfer the payload*/
        opal_dss.copy_payload(&buf, &jobdat->collection_bucket);
        /* reset everything for next collective */
        jobdat->num_contributors = 0;
        jobdat->num_collected = 0;
        OBJ_DESTRUCT(&jobdat->collection_bucket);
        OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t);
        /* send it */
        my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
        my_parent.vpid = orte_routed.get_routing_tree(NULL);
        ORTE_EPOCH_SET(my_parent.epoch,orte_ess.proc_get_epoch(&my_parent));

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:base:daemon_coll: daemon collective not the HNP - sending to parent %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&my_parent)));
        if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        OBJ_DESTRUCT(&buf);
    }
    return;

 hnp_process:
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:base:daemon_coll: daemon collective HNP - xcasting to job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jobid)));
    /* setup a buffer to send the results back to the job members */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    /* add any collected data */
    numc = jobdat->num_contributors;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &numc, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, &jobdat->collection_bucket))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* reset everything for next collective */
    jobdat->num_contributors = 0;
    jobdat->num_collected = 0;
    OBJ_DESTRUCT(&jobdat->collection_bucket);
    OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t);
    /* send the buffer */
    if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(jobid, &buf, rmltag))) {
        ORTE_ERROR_LOG(rc);
    }

 cleanup:
    /* shared exit for every path that has constructed buf */
    OBJ_DESTRUCT(&buf);

    return;
}
/*
 * Group Difference has to use the dense format since we don't support
 * two parent groups in the group structure and maintain functions
 *
 * Builds a new group from the members of group1 whose rank is not flagged
 * in the overlap bitmap filled in by ompi_group_dense_overlap().
 *
 * group1     group whose members are candidates for the result
 * group2     group whose members are to be excluded
 * new_group  receives the resulting group (MPI_GROUP_EMPTY, retained, when
 *            nothing is left)
 *
 * Returns OMPI_SUCCESS / MPI_SUCCESS on success, the error from
 * opal_bitmap_init(), a negative value from ompi_group_dense_overlap(),
 * or MPI_ERR_GROUP when the new group cannot be allocated.
 */
int ompi_group_difference(ompi_group_t* group1, ompi_group_t* group2,
                          ompi_group_t **new_group)
{
    /* local variables */
    int new_group_size, overlap_count, rc;
    ompi_group_t *new_group_pointer;
    ompi_proc_t *proc1_pointer;
    opal_bitmap_t bitmap;

    /*
     * form difference
     */

    /* get new group size */
    OBJ_CONSTRUCT(&bitmap, opal_bitmap_t);
    rc = opal_bitmap_init (&bitmap, 32);
    if (OPAL_SUCCESS != rc) {
        /* the bitmap object was constructed above, so it has to be
         * destructed on this exit path just like on all the others */
        OBJ_DESTRUCT(&bitmap);
        return rc;
    }

    /* determine which group1 members also appear in group2; presumably the
     * bitmap is indexed by group1 rank, as that is how it is read below */
    overlap_count = ompi_group_dense_overlap (group2, group1, &bitmap);
    if (0 > overlap_count) {
        OBJ_DESTRUCT(&bitmap);
        return overlap_count;
    }

    new_group_size = group1->grp_proc_count - overlap_count;
    if ( 0 == new_group_size ) {
        *new_group = MPI_GROUP_EMPTY;
        OBJ_RETAIN(MPI_GROUP_EMPTY);
        OBJ_DESTRUCT(&bitmap);
        return MPI_SUCCESS;
    }

    /* allocate a new ompi_group_t structure */
    new_group_pointer = ompi_group_allocate(new_group_size);
    if( NULL == new_group_pointer ) {
        OBJ_DESTRUCT(&bitmap);
        return MPI_ERR_GROUP;
    }

    /* fill in group list */
    /* loop over group1 members, skipping those flagged as overlapping */
    for (int proc1 = 0, cnt = 0 ; proc1 < group1->grp_proc_count ; ++proc1) {
        if (opal_bitmap_is_set_bit (&bitmap, proc1)) {
            continue;
        }

        proc1_pointer = ompi_group_get_proc_ptr_raw (group1, proc1);
        new_group_pointer->grp_proc_pointers[cnt++] = proc1_pointer;
    }  /* end proc loop */

    OBJ_DESTRUCT(&bitmap);

    /* increment proc reference counters */
    ompi_group_increment_proc_count(new_group_pointer);

    /* find my rank: undefined unless I am in group1 and not in group2 */
    if (MPI_UNDEFINED == group1->grp_my_rank || MPI_UNDEFINED != group2->grp_my_rank) {
        new_group_pointer->grp_my_rank = MPI_UNDEFINED;
    } else {
        ompi_set_group_rank(new_group_pointer, ompi_proc_local_proc);
    }

    *new_group = (MPI_Group)new_group_pointer;

    return OMPI_SUCCESS;
}
/*
 * Decide the next hop for a message addressed to 'target' (binomial
 * routing).
 *
 * The returned name is a copy: the target itself for a direct send, the
 * name of an intermediate process to relay through, or the value of
 * ORTE_NAME_INVALID when no route is known.
 */
static orte_process_name_t get_route(orte_process_name_t *target)
{
    orte_process_name_t *ret, daemon;
    opal_list_item_t *cursor;
    orte_routed_tree_t *branch;

    /* with routing switched off everything goes direct */
    if (!orte_routing_is_enabled) {
        ret = target;
        goto found;
    }

    /* start from my own daemon's name */
    daemon.jobid = ORTE_PROC_MY_DAEMON->jobid;
    daemon.vpid = ORTE_PROC_MY_DAEMON->vpid;

    /* an invalid target has no route */
    if (ORTE_JOBID_INVALID == target->jobid ||
        ORTE_VPID_INVALID == target->vpid) {
        ret = ORTE_NAME_INVALID;
        goto found;
    }

    /* sending to myself is trivially direct */
    if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) {
        ret = target;
        goto found;
    }

    /* application processes always hand off to their local daemon */
    if (ORTE_PROC_IS_APP) {
        ret = ORTE_PROC_MY_DAEMON;
        goto found;
    }

    /* tools go direct inside their own job family, and to the target's
     * HNP otherwise */
    if (ORTE_PROC_IS_TOOL) {
        if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
            ORTE_HNP_NAME_FROM_JOB(&daemon, target->jobid);
            ret = &daemon;
        } else {
            ret = target;
        }
        goto found;
    }

    /****** HNP AND DAEMONS ONLY ******/

    /* traffic for the HNP goes direct only when a direct path is known and
     * static ports are not in use; otherwise it climbs the tree */
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
        if (hnp_direct && !orte_static_ports) {
            OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, "%s routing direct to the HNP", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ret = ORTE_PROC_MY_HNP;
        } else {
            OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, "%s routing to the HNP through my parent %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
            ret = ORTE_PROC_MY_PARENT;
        }
        goto found;
    }

    /* identify the daemon hosting the target; an unknown host yields the
     * invalid name and no error is logged here */
    daemon.jobid = ORTE_PROC_MY_NAME->jobid;
    daemon.vpid = orte_get_proc_daemon_vpid(target);
    if (ORTE_VPID_INVALID == daemon.vpid) {
        ret = ORTE_NAME_INVALID;
        goto found;
    }

    /* I host the target myself: deliver directly */
    if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
        ret = target;
        goto found;
    }

    /* find the child branch that leads to the hosting daemon */
    cursor = opal_list_get_first(&my_children);
    while (cursor != opal_list_get_end(&my_children)) {
        branch = (orte_routed_tree_t*)cursor;

        /* the child itself hosts the target */
        if (branch->vpid == daemon.vpid) {
            ret = &daemon;
            goto found;
        }

        /* the hosting daemon sits somewhere below this child */
        if (opal_bitmap_is_set_bit(&branch->relatives, daemon.vpid)) {
            daemon.vpid = branch->vpid;
            ret = &daemon;
            goto found;
        }

        cursor = opal_list_get_next(cursor);
    }

    /* not under any child, so the only way is up through my parent */
    daemon.vpid = ORTE_PROC_MY_PARENT->vpid;
    ret = &daemon;

 found:
    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s routed_binomial_get(%s) --> %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(target), ORTE_NAME_PRINT(ret)));

    return *ret;
}