/**
 * Callback for an incoming collective "release" message.
 *
 * Unpacks the collective signature and the packed return status from
 * the buffer, then looks up the tracker for that signature (without
 * asking for one to be created).  If a tracker is found, its completion
 * callback (when set) is invoked with the status, the buffer and the
 * tracker's callback data; the tracker is then removed from the list of
 * ongoing collectives and released.
 *
 * @param status  not used by this function
 * @param sender  not used by this function
 * @param buffer  incoming message; the unpack calls consume from it
 * @param tag     not used by this function
 * @param cbdata  not used by this function
 */
static void barrier_release(int status, orte_process_name_t* sender,
                            opal_buffer_t* buffer,
                            orte_rml_tag_t tag, void* cbdata)
{
    int32_t cnt;
    int rc, ret;
    orte_grpcomm_signature_t *sig;
    orte_grpcomm_coll_t *coll;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:direct: barrier release called with %d bytes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)buffer->bytes_used));

    /* unpack the signature */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &sig, &cnt, ORTE_SIGNATURE))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* unpack the return status */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &ret, &cnt, OPAL_INT))) {
        ORTE_ERROR_LOG(rc);
        /* the signature was already unpacked above, so drop it here
         * rather than leaking it on this error path */
        OBJ_RELEASE(sig);
        return;
    }

    /* check for the tracker - it is not an error if not
     * found as that just means we were not involved
     * in the collective */
    if (NULL == (coll = orte_grpcomm_base_get_tracker(sig, false))) {
        OBJ_RELEASE(sig);
        return;
    }

    /* execute the callback */
    if (NULL != coll->cbfunc) {
        coll->cbfunc(ret, buffer, coll->cbdata);
    }

    /* the collective is finished: drop it from the ongoing list */
    opal_list_remove_item(&orte_grpcomm_base.ongoing, &coll->super);
    OBJ_RELEASE(coll);
    OBJ_RELEASE(sig);
}
/**
 * Callback for an incoming allgather exchange message.
 *
 * The message is unpacked in order: collective signature, the distance
 * the data was sent at, and the number of processes whose data the
 * sender is reporting.  The reported count is added to the tracker's
 * nreported and the rest of the buffer is appended to the tracker's
 * bucket.  The distance is then doubled: if the doubled value is still
 * below the tracker's ndmns the next exchange is started via
 * brks_allgather_send_dist(), otherwise the collective is finalized
 * with ORTE_SUCCESS.
 *
 * Once a tracker has been obtained, any unpack or copy failure is
 * logged and the collective is finalized with the failing code.
 *
 * @param status  not used by this function
 * @param sender  not used by this function
 * @param buffer  incoming message; the unpack calls consume from it
 * @param tag     not used by this function
 * @param cbdata  not used by this function
 */
static void brks_allgather_recv_dist(int status, orte_process_name_t* sender,
                                     opal_buffer_t* buffer,
                                     orte_rml_tag_t tag, void* cbdata)
{
    int32_t cnt, num_remote;
    int rc;
    orte_grpcomm_signature_t *sig;
    orte_grpcomm_coll_t *coll;
    orte_vpid_t distance, new_distance;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:coll:recdub received data",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* unpack the signature */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &sig, &cnt, ORTE_SIGNATURE))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* check for the tracker and create it if not found */
    if (NULL == (coll = orte_grpcomm_base_get_tracker(sig, true))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OBJ_RELEASE(sig);
        return;
    }

    /* unpack the distance; cnt is passed by address to unpack, so it
     * is re-armed before every call instead of relying on the value
     * left behind by the previous one */
    distance = 1;
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &distance, &cnt, OPAL_INT32))) {
        OBJ_RELEASE(sig);
        ORTE_ERROR_LOG(rc);
        brks_finalize_coll(coll, rc);
        return;
    }

    /* unpack number of reported processes */
    num_remote = 0;
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &num_remote, &cnt, OPAL_INT32))) {
        OBJ_RELEASE(sig);
        ORTE_ERROR_LOG(rc);
        brks_finalize_coll(coll, rc);
        return;
    }
    coll->nreported += num_remote;

    /* capture any provided content */
    if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, buffer))) {
        OBJ_RELEASE(sig);
        ORTE_ERROR_LOG(rc);
        brks_finalize_coll(coll, rc);
        return;
    }

    /* double the distance and either continue the exchange or finish */
    new_distance = distance << 1;
    if (new_distance < coll->ndmns) {
        brks_allgather_send_dist(coll, new_distance);
    } else {
        brks_finalize_coll(coll, ORTE_SUCCESS);
    }

    OBJ_RELEASE(sig);
    return;
}
/**
 * Callback for an incoming direct-allgather contribution.
 *
 * Unpacks the collective signature, obtains the tracker for it, appends
 * the rest of the buffer to the tracker's bucket and counts the
 * contribution.  When nreported reaches nexpected:
 *   - on the HNP, a reply carrying the signature, an ORTE_SUCCESS
 *     status and the collected bucket is xcast with
 *     ORTE_RML_TAG_COLL_RELEASE;
 *   - elsewhere, a reply carrying the signature and the collected
 *     bucket is sent to ORTE_PROC_MY_PARENT with
 *     ORTE_RML_TAG_ALLGATHER_DIRECT.
 *
 * Every pack/copy/send failure is logged; the reply buffer and the
 * signature are released on those paths.
 *
 * @param status  not used by this function
 * @param sender  originator of the message (only used for logging here)
 * @param buffer  incoming message; the unpack call consumes from it
 * @param tag     not used by this function
 * @param cbdata  not used by this function
 */
static void allgather_recv(int status, orte_process_name_t* sender,
                           opal_buffer_t* buffer,
                           orte_rml_tag_t tag, void* cbdata)
{
    int32_t cnt;
    int rc, ret;
    orte_grpcomm_signature_t *sig;
    opal_buffer_t *reply;
    orte_grpcomm_coll_t *coll;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:direct allgather recvd from %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* unpack the signature */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &sig, &cnt, ORTE_SIGNATURE))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* check for the tracker and create it if not found */
    if (NULL == (coll = orte_grpcomm_base_get_tracker(sig, true))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OBJ_RELEASE(sig);
        return;
    }

    /* capture any provided content first, so that a contribution whose
     * payload could not be stored is not counted as reported */
    if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, buffer))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(sig);
        return;
    }

    /* increment nprocs reported for collective */
    coll->nreported++;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:direct allgather recv nexpected %d nrep %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (int)coll->nexpected, (int)coll->nreported));

    /* see if everyone has reported */
    if (coll->nreported == coll->nexpected) {
        if (ORTE_PROC_IS_HNP) {
            OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:direct allgather HNP reports complete",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

            /* the allgather is complete - send the xcast */
            reply = OBJ_NEW(opal_buffer_t);

            /* pack the signature */
            if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &sig, 1, ORTE_SIGNATURE))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(reply);
                OBJ_RELEASE(sig);
                return;
            }

            /* pack the status - success since the allgather completed. This
             * would be an error if we timeout instead */
            ret = ORTE_SUCCESS;
            if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &ret, 1, OPAL_INT))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(reply);
                OBJ_RELEASE(sig);
                return;
            }

            /* transfer the collected bucket */
            if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(reply, &coll->bucket))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(reply);
                OBJ_RELEASE(sig);
                return;
            }

            /* send the release via xcast; its result is deliberately
             * not inspected, and the reply is released right after */
            (void)orte_grpcomm.xcast(sig, ORTE_RML_TAG_COLL_RELEASE, reply);
            OBJ_RELEASE(reply);
        } else {
            OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:direct allgather rollup complete - sending to %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));

            /* relay the bucket upward */
            reply = OBJ_NEW(opal_buffer_t);

            /* pack the signature */
            if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &sig, 1, ORTE_SIGNATURE))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(reply);
                OBJ_RELEASE(sig);
                return;
            }

            /* transfer the collected bucket */
            if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(reply, &coll->bucket))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(reply);
                OBJ_RELEASE(sig);
                return;
            }

            /* send the info to our parent.
             * NOTE(review): on success the reply is presumably released
             * by orte_rml_send_callback - confirm.  A negative return is
             * treated as "message was not queued", in which case nobody
             * else will release the reply, so do it here. */
            rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_PARENT, reply,
                                         ORTE_RML_TAG_ALLGATHER_DIRECT,
                                         orte_rml_send_callback, NULL);
            if (0 > rc) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(reply);
            }
        }
    }

    OBJ_RELEASE(sig);
}
/**
 * Callback for an incoming allgather exchange message (brks variant).
 *
 * Unpacks the collective signature and the distance value the data was
 * sent for, then takes one of two paths:
 *
 *  - If the tracker already has a non-zero nreported and either the
 *    distance is 0 or the previous distance (distance - 1) is marked as
 *    received, the message is consumed now: the count of contained
 *    contributions is unpacked, the remaining payload is appended to
 *    the tracker's bucket, nreported is increased, this distance is
 *    marked as received and processing continues with distance + 1.
 *
 *  - Otherwise the remaining payload is stashed in
 *    coll->buffers[distance] (the array is allocated on first use).
 *
 * Once a tracker has been obtained, any failure is logged and the
 * collective is finalized with the failing code.
 *
 * @param status  not used by this function
 * @param sender  originator of the message (only used for logging here)
 * @param buffer  incoming message; the unpack calls consume from it
 * @param tag     not used by this function
 * @param cbdata  not used by this function
 */
static void brks_allgather_recv_dist(int status, orte_process_name_t* sender,
                                     opal_buffer_t* buffer,
                                     orte_rml_tag_t tag, void* cbdata)
{
    int32_t cnt;
    int rc;
    orte_grpcomm_signature_t *sig;
    orte_grpcomm_coll_t *coll;
    uint32_t distance;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:coll:brks RECEIVING FROM %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* unpack the signature */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &sig, &cnt, ORTE_SIGNATURE))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* check for the tracker and create it if not found */
    if (NULL == (coll = orte_grpcomm_base_get_tracker(sig, true))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OBJ_RELEASE(sig);
        return;
    }

    /* unpack the distance; cnt is passed by address to unpack, so it
     * is re-armed before every call instead of relying on the value
     * left behind by the previous one */
    distance = 1;
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &distance, &cnt, OPAL_INT32))) {
        OBJ_RELEASE(sig);
        ORTE_ERROR_LOG(rc);
        brks_finalize_coll(coll, rc);
        return;
    }

    /* data for a given distance is expected only once (debug builds) */
    assert(0 == orte_grpcomm_base_check_distance_recv(coll, distance));

    /* Check whether we can process next distance */
    if (coll->nreported && (!distance || orte_grpcomm_base_check_distance_recv(coll, distance - 1))) {
        size_t nreceived;

        OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:coll:brks data from %d distance received, "
                             "Process the next distance.",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance));

        /* capture any provided content */
        cnt = 1;
        rc = opal_dss.unpack (buffer, &nreceived, &cnt, OPAL_SIZE);
        if (OPAL_SUCCESS != rc) {
            OBJ_RELEASE(sig);
            ORTE_ERROR_LOG(rc);
            brks_finalize_coll(coll, rc);
            return;
        }
        if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, buffer))) {
            OBJ_RELEASE(sig);
            ORTE_ERROR_LOG(rc);
            brks_finalize_coll(coll, rc);
            return;
        }
        coll->nreported += nreceived;
        orte_grpcomm_base_mark_distance_recv(coll, distance);
        brks_allgather_process_data(coll, distance + 1);
    } else {
        OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:coll:brks data from %d distance received, "
                             "still waiting for data.",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance));

        /* lazily allocate one stash slot per distance */
        if (NULL == coll->buffers) {
            if (NULL == (coll->buffers = (opal_buffer_t **) calloc ((uint32_t) log2 (coll->ndmns) + 1, sizeof(opal_buffer_t *)))) {
                rc = OPAL_ERR_OUT_OF_RESOURCE;
                OBJ_RELEASE(sig);
                ORTE_ERROR_LOG(rc);
                brks_finalize_coll(coll, rc);
                return;
            }
        }

        /* NOTE(review): distance comes straight from the message and is
         * used as an index without a range check against the slot count
         * allocated above - confirm callers can never exceed it. */
        if (NULL == (coll->buffers[distance] = OBJ_NEW(opal_buffer_t))) {
            rc = OPAL_ERR_OUT_OF_RESOURCE;
            OBJ_RELEASE(sig);
            ORTE_ERROR_LOG(rc);
            brks_finalize_coll(coll, rc);
            return;
        }
        if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(coll->buffers[distance], buffer))) {
            OBJ_RELEASE(sig);
            ORTE_ERROR_LOG(rc);
            brks_finalize_coll(coll, rc);
            return;
        }
    }

    OBJ_RELEASE(sig);
}
/**
 * Callback for an incoming direct-allgather contribution.
 *
 * Unpacks the collective signature, obtains the tracker for it, appends
 * the rest of the buffer to the tracker's bucket and counts the
 * contribution.  Once nreported equals ndmns, a reply carrying the
 * signature, an ORTE_SUCCESS status and the collected bucket is xcast
 * with ORTE_RML_TAG_COLL_RELEASE.
 *
 * Every unpack/pack/copy failure is logged; the reply buffer (when one
 * exists) and the signature are released on those paths.
 *
 * @param status  not used by this function
 * @param sender  originator of the message (only used for logging here)
 * @param buffer  incoming message; the unpack call consumes from it
 * @param tag     not used by this function
 * @param cbdata  not used by this function
 */
static void allgather_recv(int status, orte_process_name_t* sender,
                           opal_buffer_t* buffer,
                           orte_rml_tag_t tag, void* cbdata)
{
    int32_t cnt;
    int rc, ret;
    orte_grpcomm_signature_t *sig;
    opal_buffer_t *reply;
    orte_grpcomm_coll_t *coll;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:direct allgather recvd from %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* unpack the signature */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &sig, &cnt, ORTE_SIGNATURE))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* check for the tracker and create it if not found */
    if (NULL == (coll = orte_grpcomm_base_get_tracker(sig, true))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OBJ_RELEASE(sig);
        return;
    }

    /* capture any provided content first, so that a contribution whose
     * payload could not be stored is not counted as reported */
    if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, buffer))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(sig);
        return;
    }

    /* increment nprocs reported for collective */
    coll->nreported++;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:direct allgather recv ndmns %d nrep %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (int)coll->ndmns, (int)coll->nreported));

    /* if all participating daemons have reported */
    if (coll->ndmns == coll->nreported) {
        reply = OBJ_NEW(opal_buffer_t);

        /* pack the signature */
        if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &sig, 1, ORTE_SIGNATURE))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(reply);
            OBJ_RELEASE(sig);
            return;
        }

        /* pack the status - success since the allgather completed. This
         * would be an error if we timeout instead */
        ret = ORTE_SUCCESS;
        if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &ret, 1, OPAL_INT))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(reply);
            OBJ_RELEASE(sig);
            return;
        }

        /* transfer the collected bucket */
        if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(reply, &coll->bucket))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(reply);
            OBJ_RELEASE(sig);
            return;
        }

        /* send the release via xcast; its result is deliberately not
         * inspected, and the reply is released right after */
        (void)orte_grpcomm.xcast(sig, ORTE_RML_TAG_COLL_RELEASE, reply);
        OBJ_RELEASE(reply);
    }

    OBJ_RELEASE(sig);
}