void chpl_comm_ofi_oob_allgather(const void* mine, void* all, size_t size) { DBG_PRINTF(DBG_OOB, "OOB allGather: %zd", size); // // PMI doesn't provide an ordered allGather, so we build one here // by concatenating the node index and the payload and using that // to scatter the unordered PMI_Allgather() results. // typedef struct { int nodeID; uint64_t info[]; } gather_t; const size_t g_size = offsetof(gather_t, info) + size; gather_t* g_mine; CHK_SYS_CALLOC_SZ(g_mine, 1, g_size); g_mine->nodeID = chpl_nodeID; memcpy(&g_mine->info, mine, size); gather_t* g_all; CHK_SYS_CALLOC_SZ(g_all, chpl_numNodes, g_size); PMI_CHK(PMI_Allgather(g_mine, g_all, g_size)); for (int g_i = 0; g_i < chpl_numNodes; g_i++) { char* g_pa = (char*) g_all + g_i * g_size; int i; memcpy(&i, g_pa + offsetof(gather_t, nodeID), sizeof(i)); char* p_a = (char*) all + i * size; memcpy(p_a, g_pa + offsetof(gather_t, info), size); } sys_free(g_all); sys_free(g_mine); }
/*
 * Fence (collective synchronization + modex exchange) for the Cray PMI
 * pmix component: publishes this process's cached modex data to all
 * peers, collects theirs, stores every received key/value into the
 * local datastore, and then computes/stores the hwloc locality of each
 * local rank.
 *
 * NOTE(review): 'procs' and 'collect_data' are never referenced in this
 * body — presumably kept for the fence API signature; confirm.
 *
 * Returns an OPAL status code; see NOTE(review) at fn_exit about the
 * value of rc on the nominal success path.
 */
static int cray_fence(opal_list_t *procs, int collect_data)
{
    int rc, cnt;
    int32_t i;
    int *all_lens = NULL;
    opal_value_t *kp, kvn;
    opal_buffer_t *send_buffer = NULL;
    opal_buffer_t *buf = NULL;
    void *sbuf_ptr;
    char *cptr, *rcv_buff = NULL;
    opal_process_name_t id;
    /* Per-rank header gathered first so the unordered PMI_Allgather
     * results can be matched back to ranks and sized correctly. */
    typedef struct {
        uint32_t pmix_rank;
        opal_process_name_t name;
        int32_t nbytes;
    } bytes_and_rank_t;
    int32_t rcv_nbytes_tot;
    bytes_and_rank_t s_bytes_and_rank;
    bytes_and_rank_t *r_bytes_and_ranks = NULL;
    opal_hwloc_locality_t locality;
    opal_list_t vals;
    char *cpuset = NULL;
    opal_process_name_t pname;

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray executing fence cache_global %p cache_local %p",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                        (void *)mca_pmix_cray_component.cache_global,
                        (void *)mca_pmix_cray_component.cache_local);

    /* get the modex data from each local process and set the
     * localities to avoid having the MPI layer fetch data
     * for every process in the job */
    pname.jobid = OPAL_PROC_MY_NAME.jobid;
    /* NOTE(review): pname.vpid is never assigned anywhere in this
     * function, yet pname is used as the store key near the end of the
     * locality loop — see note there; verify against the intended key. */

    /*
     * "unload" the cache_local/cache_global buffers, first copy
     * it so we can continue to use the local buffers if further
     * calls to put can be made
     */
    send_buffer = OBJ_NEW(opal_buffer_t);
    if (NULL == send_buffer) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    opal_dss.copy_payload(send_buffer, mca_pmix_cray_component.cache_global);
    /* unload() hands us ownership of the raw bytes and their length. */
    opal_dss.unload(send_buffer, &sbuf_ptr, &s_bytes_and_rank.nbytes);
    s_bytes_and_rank.pmix_rank = pmix_rank;
    s_bytes_and_rank.name = OPAL_PROC_MY_NAME;

    r_bytes_and_ranks = (bytes_and_rank_t *)malloc(pmix_size * sizeof(bytes_and_rank_t));
    if (NULL == r_bytes_and_ranks) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }

    /*
     * gather up all the buffer sizes and rank order.
     * doing this step below since the cray pmi PMI_Allgather doesn't deliver
     * the gathered data necessarily in PMI rank order, although the order stays
     * the same for the duration of a job - assuming no node failures.
     */
    if (PMI_SUCCESS != (rc = PMI_Allgather(&s_bytes_and_rank,r_bytes_and_ranks,sizeof(bytes_and_rank_t)))) {
        OPAL_PMI_ERROR(rc,"PMI_Allgather");
        rc = OPAL_ERR_COMM_FAILURE;
        goto fn_exit;
    }

    /* Total payload size across all ranks, for the receive buffer. */
    for (rcv_nbytes_tot=0,i=0; i < pmix_size; i++) {
        rcv_nbytes_tot += r_bytes_and_ranks[i].nbytes;
    }

    opal_output_verbose(20, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray total number of bytes to receive %d",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), rcv_nbytes_tot);

    rcv_buff = (char *) malloc(rcv_nbytes_tot * sizeof(char));
    if (NULL == rcv_buff) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }

    all_lens = (int *)malloc(sizeof(int) * pmix_size);
    if (NULL == all_lens) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }
    /* Index the per-rank lengths by PMI rank (the allgather order is
     * not rank order, but the embedded pmix_rank tells us who is who). */
    for (i=0; i< pmix_size; i++) {
        all_lens[r_bytes_and_ranks[i].pmix_rank] = r_bytes_and_ranks[i].nbytes;
    }

    if (PMI_SUCCESS != (rc = PMI_Allgatherv(sbuf_ptr,s_bytes_and_rank.nbytes,rcv_buff,all_lens))) {
        OPAL_PMI_ERROR(rc,"PMI_Allgatherv");
        rc = OPAL_ERR_COMM_FAILURE;
        goto fn_exit;
    }

    OBJ_RELEASE(send_buffer);
    send_buffer = NULL;

    buf = OBJ_NEW(opal_buffer_t);
    if (buf == NULL) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }

    /* Walk the concatenated per-rank payloads, load each into 'buf'
     * without copying, and unpack every opal_value_t into the dstore
     * under the sender's process name. */
    for (cptr = rcv_buff, i=0; i < pmix_size; i++) {

        id = r_bytes_and_ranks[i].name;

        /* Clear base_ptr so load() doesn't try to free the previous
         * iteration's region, which is owned by rcv_buff. */
        buf->base_ptr = NULL;  /* TODO: ugh */
        if (OPAL_SUCCESS != (rc = opal_dss.load(buf, (void *)cptr, r_bytes_and_ranks[i].nbytes))) {
            OPAL_PMI_ERROR(rc,"pmix:cray opal_dss.load failed");
            goto fn_exit;
        }

        /* unpack and stuff in to the dstore */
        /* NOTE(review): the loop exits when unpack() returns non-success
         * (normally "read past end of buffer"); that rc is intentionally
         * not treated as an error, and is overwritten later. */
        cnt = 1;
        while (OPAL_SUCCESS == (rc = opal_dss.unpack(buf, &kp, &cnt, OPAL_VALUE))) {
            opal_output_verbose(20, opal_pmix_base_framework.framework_output,
                                "%s pmix:cray unpacked kp with key %s type(%d) for id %s",
                                OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kp->key, kp->type, OPAL_NAME_PRINT(id));
            if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&id, kp))) {
                OPAL_ERROR_LOG(rc);
                goto fn_exit;
            }
            OBJ_RELEASE(kp);
            cnt = 1;
        }
        cptr += r_bytes_and_ranks[i].nbytes;
    }

    buf->base_ptr = NULL;  /* TODO: ugh */
    OBJ_RELEASE(buf);

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray kvs_fence complete",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

#if OPAL_HAVE_HWLOC
    /* fetch my cpuset */
    OBJ_CONSTRUCT(&vals, opal_list_t);
    if (OPAL_SUCCESS == (rc = opal_pmix_base_fetch(&pmix_pname,
                                                   OPAL_PMIX_CPUSET, &vals))) {
        kp = (opal_value_t*)opal_list_get_first(&vals);
        cpuset = strdup(kp->data.string);
    } else {
        cpuset = NULL;
    }
    OPAL_LIST_DESTRUCT(&vals);
#endif

    /* we only need to set locality for each local rank as "not found"
     * equates to "non-local" */
    for (i=0; i < pmix_nlranks; i++) {
        id.vpid = pmix_lranks[i];
        id.jobid = pmix_jobid;
        opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                            "%s checking out if %s is local to me",
                            OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                            OPAL_NAME_PRINT(id));
        /* fetch cpuset for this vpid */
#if OPAL_HAVE_HWLOC
        OBJ_CONSTRUCT(&vals, opal_list_t);
        if (OPAL_SUCCESS != (rc = opal_pmix_base_fetch(&id,
                                                       OPAL_PMIX_CPUSET, &vals))) {
            opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                "%s cpuset for local proc %s not found",
                                OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                OPAL_NAME_PRINT(id));
            OPAL_LIST_DESTRUCT(&vals);
            /* even though the cpuset wasn't found, we at least know it is
             * on the same node with us */
            locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
        } else {
            kp = (opal_value_t*)opal_list_get_first(&vals);
            if (NULL == kp->data.string) {
                /* if we share a node, but we don't know anything more, then
                 * mark us as on the node as this is all we know */
                locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                /* NOTE(review): 'cpuset' may still be NULL here if our own
                 * cpuset fetch above failed — verify that
                 * opal_hwloc_base_get_relative_locality tolerates NULL. */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 cpuset,
                                                                 kp->data.string);
            }
            OPAL_LIST_DESTRUCT(&vals);
        }
#else
        /* all we know is we share a node */
        locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
#endif
        OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
                             "%s pmix:cray proc %s locality %s",
                             OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                             OPAL_NAME_PRINT(id),
                             opal_hwloc_base_print_locality(locality)));

        OBJ_CONSTRUCT(&kvn, opal_value_t);
        kvn.key = strdup(OPAL_PMIX_LOCALITY);
        kvn.type = OPAL_UINT16;
        kvn.data.uint16 = locality;
        /* NOTE(review): the locality computed for 'id' is stored under
         * 'pname', whose vpid was never set in this function — looks like
         * it should be keyed by 'id' (or pname.vpid set per iteration);
         * confirm against the upstream component. */
        opal_pmix_base_store(&pname, &kvn);
        OBJ_DESTRUCT(&kvn);
    }

fn_exit:
    /* NOTE(review): on the nominal success path rc holds the result of
     * the last opal_pmix_base_fetch above, which may be a "not found"
     * error even though the fence itself succeeded — verify callers'
     * expectations. */
#if OPAL_HAVE_HWLOC
    if (NULL != cpuset) {
        free(cpuset);
    }
#endif
    if (all_lens != NULL) {
        free(all_lens);
    }
    if (rcv_buff != NULL) {
        free(rcv_buff);
    }
    if (r_bytes_and_ranks != NULL) {
        free(r_bytes_and_ranks);
    }
    return rc;
}
    /* NOTE(review): the lines down to the first '}' below are the tail of
     * a function whose beginning lies before this chunk — do not judge
     * the preprocessor nesting from this view alone. */
    counter++;
#endif
#else
    PMI_Barrier();
#endif
}

#if HAVE_PMI_ALLGATHER
/* Rank-order permutation learned from an initial PMI_Allgather;
 * allocated lazily by gasnetc_pmi_allgather_init() and used to sort
 * subsequent unordered allgather results.  NULL until initialized. */
static gasnet_node_t *gasnetc_pmi_allgather_order = NULL;

/* One-time setup: gather each node's own index so we know the (stable
 * but arbitrary) order in which PMI_Allgather delivers contributions. */
GASNETI_INLINE(gasnetc_pmi_allgather_init)
void gasnetc_pmi_allgather_init(void) {
    /* perform (just once) an Allgather of node number to establish the order */
    /* if_pf: "predict false" — the common case is already-initialized. */
    if_pf (!gasnetc_pmi_allgather_order) {
        int rc;
        gasnetc_pmi_allgather_order = gasneti_malloc(gasneti_nodes * sizeof(gasnet_node_t));
        rc = PMI_Allgather(&gasneti_mynode, gasnetc_pmi_allgather_order, sizeof(gasnet_node_t));
        gasneti_assert(PMI_SUCCESS == rc);
    }
}
#endif

/* gasneti_bootstrapExchange */
/* NOTE(review): this function continues past the end of this chunk; only
 * its opening is visible here. */
void gasneti_bootstrapExchange_pmi(void *src, size_t len, void *dest) {
#if HAVE_PMI_ALLGATHER
    /* Staging buffer for the unordered allgather results. */
    uint8_t *unsorted = gasneti_malloc(len * gasneti_nodes); /* TODO: use alloca()? */
    gasnet_node_t i;
    int rc;

    /* Allgather the callers data to a temporary array */
    gasnetc_pmi_allgather_init();