/**
 * Consume the message stashed for round @p distance, if one is waiting.
 *
 * When coll->buffers[distance] holds a buffer, the slot is cleared and the
 * buffer is processed: the sender's contribution count is unpacked, the
 * remaining payload is appended to coll->bucket, the count is added to
 * coll->nreported and the distance is marked as received.
 *
 * @param coll      collective tracker being progressed
 * @param distance  round index whose stashed message should be consumed
 *
 * @return 1 when a stashed message was consumed, 0 when nothing was waiting,
 *         otherwise the non-success code returned by unpack/copy_payload
 *         (the caller in this file tests for rc < 0). On failure the
 *         collective has already been handed to brks_finalize_coll() with
 *         that code.
 */
static int brks_allgather_process_buffered (orte_grpcomm_coll_t *coll, uint32_t distance)
{
    opal_buffer_t *buffer;
    size_t nreceived;
    int32_t cnt = 1;
    int rc;

    /* check whether data for next distance is available */
    if (NULL == coll->buffers || NULL == coll->buffers[distance]) {
        return 0;
    }

    /* the slot is emptied here, so this function is now the only holder of
     * the buffer and must release it on every exit path */
    buffer = coll->buffers[distance];
    coll->buffers[distance] = NULL;

    OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:coll:brks %u distance data found",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance));

    /* number of contributions carried by this message */
    rc = opal_dss.unpack (buffer, &nreceived, &cnt, OPAL_SIZE);
    if (OPAL_SUCCESS != rc) {
        OBJ_RELEASE(buffer);
        ORTE_ERROR_LOG(rc);
        brks_finalize_coll(coll, rc);
        return rc;
    }

    /* append whatever is left in the message to the collected data */
    if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, buffer))) {
        OBJ_RELEASE(buffer);
        ORTE_ERROR_LOG(rc);
        brks_finalize_coll(coll, rc);
        return rc;
    }

    coll->nreported += nreceived;
    orte_grpcomm_base_mark_distance_recv (coll, distance);
    OBJ_RELEASE(buffer);

    return 1;
}
/**
 * Start the allgather for this collective.
 *
 * Determines this daemon's position in coll->dmns, records the local
 * contribution, seeds coll->bucket with @p sendbuf and starts processing at
 * distance 0.
 *
 * @param coll     collective tracker; my_rank, nreported, distance_mask_recv
 *                 and bucket are written here
 * @param sendbuf  local contribution; its payload is copied into coll->bucket
 *
 * @return ORTE_SUCCESS once processing has been started,
 *         ORTE_ERR_NOT_FOUND when this daemon is not listed in coll->dmns,
 *         or the code returned by copy_payload when seeding the bucket
 *         fails. In both failure cases the collective has already been
 *         passed to brks_finalize_coll() with that code.
 */
static int allgather(orte_grpcomm_coll_t *coll, opal_buffer_t *sendbuf)
{
    int rc;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:coll:bruck algo employed for %d processes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)coll->ndmns));

    /* get my own rank */
    coll->my_rank = ORTE_VPID_INVALID;
    for (orte_vpid_t nv = 0; nv < coll->ndmns; nv++) {
        if (coll->dmns[nv] == ORTE_PROC_MY_NAME->vpid) {
            coll->my_rank = nv;
            break;
        }
    }

    /* check for bozo case */
    if (ORTE_VPID_INVALID == coll->my_rank) {
        OPAL_OUTPUT((orte_grpcomm_base_framework.framework_output,
                     "Peer not found"));
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        brks_finalize_coll(coll, ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }

    /* record that we contributed */
    coll->nreported = 1;

    /* size the received-distance bitmap: floor(log2(ndmns)) + 1 bits */
    if (coll->ndmns > 1) {
        opal_bitmap_init (&coll->distance_mask_recv, (uint32_t) log2 (coll->ndmns) + 1);
    }

    /* start by seeding the collection with our own data; a failure here
     * would otherwise leave the bucket without the local contribution */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, sendbuf))) {
        ORTE_ERROR_LOG(rc);
        brks_finalize_coll(coll, rc);
        return rc;
    }

    /* process data */
    brks_allgather_process_data (coll, 0);

    return ORTE_SUCCESS;
}
/**
 * Receive handler for one allgather exchange message.
 *
 * The message is expected to carry, in order: the collective signature, the
 * sender's distance, the number of processes the sender reports, and the
 * payload. The payload is appended to the tracker's bucket and the reported
 * count is added to coll->nreported. The distance is then doubled: while it
 * is still below coll->ndmns another send is issued, otherwise the
 * collective is finalized with ORTE_SUCCESS.
 *
 * @param status  not used here
 * @param sender  not used here
 * @param buffer  incoming message
 * @param tag     not used here
 * @param cbdata  not used here
 */
static void brks_allgather_recv_dist(int status, orte_process_name_t* sender,
                                     opal_buffer_t* buffer, orte_rml_tag_t tag,
                                     void* cbdata)
{
    orte_grpcomm_signature_t *sig;
    orte_grpcomm_coll_t *coll;
    orte_vpid_t distance;
    orte_vpid_t new_distance;
    int32_t cnt;
    int32_t num_remote;
    int rc;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:coll:recdub received data",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* the signature comes first; without it nothing can be attributed */
    cnt = 1;
    rc = opal_dss.unpack(buffer, &sig, &cnt, ORTE_SIGNATURE);
    if (OPAL_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* look the tracker up, asking for it to be created when missing */
    coll = orte_grpcomm_base_get_tracker(sig, true);
    if (NULL == coll) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OBJ_RELEASE(sig);
        return;
    }

    /* distance the sender was working at */
    distance = 1;
    rc = opal_dss.unpack(buffer, &distance, &cnt, OPAL_INT32);
    if (OPAL_SUCCESS != rc) {
        OBJ_RELEASE(sig);
        ORTE_ERROR_LOG(rc);
        brks_finalize_coll(coll, rc);
        return;
    }

    /* how many processes the sender has already collected */
    num_remote = 0;
    rc = opal_dss.unpack(buffer, &num_remote, &cnt, OPAL_INT32);
    if (OPAL_SUCCESS != rc) {
        OBJ_RELEASE(sig);
        ORTE_ERROR_LOG(rc);
        brks_finalize_coll(coll, rc);
        return;
    }
    coll->nreported += num_remote;

    /* whatever remains in the message is contribution data */
    rc = opal_dss.copy_payload(&coll->bucket, buffer);
    if (OPAL_SUCCESS != rc) {
        OBJ_RELEASE(sig);
        ORTE_ERROR_LOG(rc);
        brks_finalize_coll(coll, rc);
        return;
    }

    /* double the distance, then either keep exchanging or wrap up */
    distance <<= 1;
    new_distance = distance;
    if (new_distance >= coll->ndmns) {
        brks_finalize_coll(coll, ORTE_SUCCESS);
    } else {
        brks_allgather_send_dist(coll, new_distance);
    }

    OBJ_RELEASE(sig);
}
/**
 * Receive handler for one round of the allgather.
 *
 * The message carries the collective signature followed by the round
 * ("distance") it belongs to. If the local contribution is already recorded
 * (coll->nreported != 0) and the previous round has been received (or this
 * is round 0), the message is consumed immediately: the contribution count
 * is unpacked, the payload is appended to coll->bucket, the round is marked
 * as received and processing continues with distance + 1. Otherwise the
 * remaining payload is copied into coll->buffers[distance] so it can be
 * consumed later.
 *
 * Any failure after the tracker has been obtained finalizes the collective
 * with the failing code.
 *
 * @param status  not used here
 * @param sender  name of the sending process (only used for logging)
 * @param buffer  incoming message
 * @param tag     not used here
 * @param cbdata  not used here
 */
static void brks_allgather_recv_dist(int status, orte_process_name_t* sender,
                                     opal_buffer_t* buffer, orte_rml_tag_t tag,
                                     void* cbdata)
{
    int32_t cnt;
    int rc;
    orte_grpcomm_signature_t *sig;
    orte_grpcomm_coll_t *coll;
    uint32_t distance;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:coll:brks RECEIVING FROM %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* unpack the signature */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &sig, &cnt, ORTE_SIGNATURE))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* check for the tracker and create it if not found */
    if (NULL == (coll = orte_grpcomm_base_get_tracker(sig, true))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OBJ_RELEASE(sig);
        return;
    }

    /* unpack the distance */
    distance = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &distance, &cnt, OPAL_INT32))) {
        OBJ_RELEASE(sig);
        ORTE_ERROR_LOG(rc);
        brks_finalize_coll(coll, rc);
        return;
    }

    /* a given round must not be delivered twice */
    assert(0 == orte_grpcomm_base_check_distance_recv(coll, distance));

    /* Check whether we can process next distance */
    if (coll->nreported && (!distance || orte_grpcomm_base_check_distance_recv(coll, distance - 1))) {
        size_t nreceived;

        OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:coll:brks data from %d distance received, "
                             "Process the next distance.",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance));

        /* capture any provided content */
        rc = opal_dss.unpack (buffer, &nreceived, &cnt, OPAL_SIZE);
        if (OPAL_SUCCESS != rc) {
            OBJ_RELEASE(sig);
            ORTE_ERROR_LOG(rc);
            brks_finalize_coll(coll, rc);
            return;
        }
        if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, buffer))) {
            OBJ_RELEASE(sig);
            ORTE_ERROR_LOG(rc);
            brks_finalize_coll(coll, rc);
            return;
        }
        coll->nreported += nreceived;
        orte_grpcomm_base_mark_distance_recv(coll, distance);
        brks_allgather_process_data(coll, distance + 1);
    } else {
        /* the stash array has one slot per round: floor(log2(ndmns)) + 1 */
        uint32_t nslots = (uint32_t) log2 (coll->ndmns) + 1;

        OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:coll:brks data from %d distance received, "
                             "still waiting for data.",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance));

        /* the distance comes off the wire; refuse one that has no stash slot
         * instead of writing past the end of coll->buffers */
        if (distance >= nslots) {
            rc = ORTE_ERR_NOT_FOUND;
            OBJ_RELEASE(sig);
            ORTE_ERROR_LOG(rc);
            brks_finalize_coll(coll, rc);
            return;
        }

        /* the stash array is allocated on first use */
        if (NULL == coll->buffers) {
            if (NULL == (coll->buffers = (opal_buffer_t **) calloc (nslots, sizeof(opal_buffer_t *)))) {
                rc = OPAL_ERR_OUT_OF_RESOURCE;
                OBJ_RELEASE(sig);
                ORTE_ERROR_LOG(rc);
                brks_finalize_coll(coll, rc);
                return;
            }
        }

        if (NULL == (coll->buffers[distance] = OBJ_NEW(opal_buffer_t))) {
            rc = OPAL_ERR_OUT_OF_RESOURCE;
            OBJ_RELEASE(sig);
            ORTE_ERROR_LOG(rc);
            brks_finalize_coll(coll, rc);
            return;
        }

        if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(coll->buffers[distance], buffer))) {
            /* do not leave a half-filled stash buffer behind in the slot */
            OBJ_RELEASE(coll->buffers[distance]);
            coll->buffers[distance] = NULL;
            OBJ_RELEASE(sig);
            ORTE_ERROR_LOG(rc);
            brks_finalize_coll(coll, rc);
            return;
        }
    }

    OBJ_RELEASE(sig);
}
/**
 * Drive the allgather forward starting at round @p distance.
 *
 * For every round below floor(log2(ndmns)) the data collected so far is sent
 * to the peer (1 << distance) ranks below this one, then any message already
 * stashed for that round is consumed; the loop stops as soon as a round's
 * message has not arrived yet. When the daemon count is not a power of two,
 * one additional "final" message is sent with distance floor(log2(ndmns)),
 * in the round computed as last_round below. The collective is finalized
 * with ORTE_SUCCESS once coll->nreported reaches coll->ndmns.
 *
 * @param coll      collective tracker being progressed
 * @param distance  first round to process
 */
static void brks_allgather_process_data(orte_grpcomm_coll_t *coll, uint32_t distance)
{
    /* Communication step:
     At every step i, rank r:
     - doubles the distance
     - sends message containing all data collected so far to rank r - distance
     - receives message containing all data collected so far from rank (r + distance)
     */
    uint32_t log2ndmns = (uint32_t) log2 (coll->ndmns);
    /* sentinel that never equals a real round index */
    const uint32_t no_final_round = ~(uint32_t) 0;
    uint32_t last_round;
    double leftover;
    orte_process_name_t peer;
    orte_vpid_t nv;
    int rc;

    /* NTH: calculate in which round we should send the final data. this is the first
     * round in which we have data from at least (coll->ndmns - (1 << log2ndmns))
     * daemons. alternatively we could just send when distance reaches log2ndmns but
     * that could end up sending more data than needed */
    leftover = (double) (coll->ndmns - (1 << log2ndmns));
    if (leftover > 0.0) {
        last_round = (uint32_t) ceil (log2 (leftover));
    } else {
        /* ndmns is an exact power of two: nothing is left over, so there is
         * no final round. log2(0) is -inf and converting that to uint32_t is
         * undefined behaviour, so it must not be evaluated here. */
        last_round = no_final_round;
    }

    peer.jobid = ORTE_PROC_MY_NAME->jobid;

    while (distance < log2ndmns) {
        OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:coll:brks process distance %u)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance));

        /* first send my current contents */
        nv = (coll->ndmns + coll->my_rank - (1 << distance)) % coll->ndmns;
        peer.vpid = coll->dmns[nv];

        brks_allgather_send_dist(coll, &peer, distance);

        if (distance == last_round) {
            /* have enough data to send the final round now */
            nv = (coll->ndmns + coll->my_rank - (1 << log2ndmns)) % coll->ndmns;
            peer.vpid = coll->dmns[nv];
            brks_allgather_send_dist(coll, &peer, log2ndmns);
        }

        /* 0: this round's message has not arrived yet, wait for it;
         * < 0: failure, the collective has already been finalized */
        rc = brks_allgather_process_buffered (coll, distance);
        if (!rc) {
            break;
        } else if (rc < 0) {
            return;
        }

        ++distance;
    }

    if (distance == log2ndmns) {
        if (distance == last_round) {
            /* need to send the final round now */
            nv = (coll->ndmns + coll->my_rank - (1 << log2ndmns)) % coll->ndmns;
            peer.vpid = coll->dmns[nv];
            brks_allgather_send_dist(coll, &peer, log2ndmns);
        }

        /* check if the final message is already queued */
        rc = brks_allgather_process_buffered (coll, distance);
        if (rc < 0) {
            return;
        }
    }

    OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:coll:brks reported %lu process from %lu",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (unsigned long)coll->nreported,
                         (unsigned long)coll->ndmns));

    /* if we are done, then complete things. we may get data from more daemons than expected */
    if (coll->nreported >= coll->ndmns) {
        brks_finalize_coll(coll, ORTE_SUCCESS);
    }
}