int orte_ess_alps_sync_complete(void) { int ret = ORTE_SUCCESS; int lli_ret = 0; int alps_status = 0; size_t alps_count; lli_ret = alps_app_lli_lock(); if (0 != ret) { OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, "%s ess:alps: alps_app_lli_lock returned %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); ret = ORTE_ERR_FILE_WRITE_FAILURE; goto fn_exit; } lli_ret = alps_app_lli_put_request(ALPS_APP_LLI_ALPS_REQ_EXITING, NULL, 0); if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) { OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, "%s ess:alps: alps_app_lli_put_request returned %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); ret = ORTE_ERR_FILE_WRITE_FAILURE; goto fn_exit_w_lock; } lli_ret = alps_app_lli_get_response (&alps_status, &alps_count); if (ALPS_APP_LLI_ALPS_STAT_OK != alps_status) { OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, "%s ess:alps: alps_app_lli_get_response returned %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), alps_status)); ret = ORTE_ERR_FILE_READ_FAILURE; goto fn_exit_w_lock; } fn_exit_w_lock: lli_ret = alps_app_lli_unlock(); if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) { OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, "%s ess:alps: alps_app_lli_unlock returned %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); ret = ORTE_ERR_FILE_WRITE_FAILURE; } fn_exit: return ret; }
int orte_ess_alps_get_first_rank_on_node(int *first_rank) { int alps_status = 0; uint64_t apid; size_t alps_count; int ret = ORTE_SUCCESS; int lli_ret = 0, place_ret; alpsAppLayout_t orted_layout; if (first_rank == NULL) { ret = ORTE_ERR_BAD_PARAM; goto fn_exit; } /* * First get our apid */ lli_ret = alps_app_lli_lock(); if (0 != ret) { OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, "%s ess:alps: alps_app_lli_lock returned %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); ret = ORTE_ERR_FILE_WRITE_FAILURE; goto fn_exit; } lli_ret = alps_app_lli_put_request(ALPS_APP_LLI_ALPS_REQ_APID, NULL, 0); if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) { OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, "%s ess:alps: alps_app_lli_put_request - APID returned %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); ret = ORTE_ERR_FILE_WRITE_FAILURE; goto fn_exit_w_lock; } lli_ret = alps_app_lli_get_response (&alps_status, &alps_count); if (ALPS_APP_LLI_ALPS_STAT_OK != alps_status) { OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, "%s ess:alps: alps_app_lli_get_response returned %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), alps_status)); ret = ORTE_ERR_FILE_READ_FAILURE; goto fn_exit_w_lock; } lli_ret = alps_app_lli_get_response_bytes (&apid, sizeof(apid)); if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) { OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, "%s ess:alps: alps_app_lli_get_response_bytes returned %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); ret = ORTE_ERR_FILE_READ_FAILURE; goto fn_exit_w_lock; } place_ret = alps_get_placement_info(apid, &orted_layout, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); if (1 != place_ret) { OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, "%s ess:alps: alps_get_placement_info returned %d (%s)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), place_ret, strerror(errno))); ret = ORTE_ERROR; goto fn_exit; } OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output, "%s ess:alps: alps_get_placement_info returned %d first pe on node is %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), place_ret, orted_layout.firstPe)); *first_rank = orted_layout.firstPe; fn_exit_w_lock: lli_ret = alps_app_lli_unlock(); if (ALPS_APP_LLI_ALPS_STAT_OK != lli_ret) { OPAL_OUTPUT_VERBOSE((20, orte_ess_base_framework.framework_output, "%s ess:alps: alps_app_lli_unlock returned %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lli_ret)); ret = ORTE_ERR_FILE_WRITE_FAILURE; } fn_exit: return ret; }
static int __gnix_alps_init(void) { int ret = FI_SUCCESS; int alps_status = 0; size_t alps_count; alpsAppLLIGni_t *rdmacred_rsp = NULL; alpsAppGni_t *rdmacred_buf = NULL; /* lli_lock doesn't return anything useful */ ret = alps_app_lli_lock(); if (alps_init) { /* alps lli lock protects alps_init for now */ alps_app_lli_unlock(); return ret; } /* * First get our apid */ ret = alps_app_lli_put_request(ALPS_APP_LLI_ALPS_REQ_APID, NULL, 0); if (ret != ALPS_APP_LLI_ALPS_STAT_OK) { GNIX_WARN(FI_LOG_FABRIC, "lli put failed, ret=%d(%s)\n", ret, strerror(errno)); ret = -FI_EIO; goto err; } ret = alps_app_lli_get_response(&alps_status, &alps_count); if (alps_status != ALPS_APP_LLI_ALPS_STAT_OK) { GNIX_WARN(FI_LOG_FABRIC, "lli get response failed, " "alps_status=%d(%s)\n", alps_status, strerror(errno)); ret = -FI_EIO; goto err; } ret = alps_app_lli_get_response_bytes(&gnix_apid, sizeof(gnix_apid)); if (ret != ALPS_APP_LLI_ALPS_STAT_OK) { GNIX_WARN(FI_LOG_FABRIC, "lli get response failed, ret=%d(%s)\n", ret, strerror(errno)); ret = -FI_EIO; goto err; } /* * now get the GNI rdma credentials info */ ret = alps_app_lli_put_request(ALPS_APP_LLI_ALPS_REQ_GNI, NULL, 0); if (ret != ALPS_APP_LLI_ALPS_STAT_OK) { GNIX_WARN(FI_LOG_FABRIC, "lli put failed, ret=%d(%s)\n", ret, strerror(errno)); ret = -FI_EIO; goto err; } ret = alps_app_lli_get_response(&alps_status, &alps_count); if (alps_status != ALPS_APP_LLI_ALPS_STAT_OK) { GNIX_WARN(FI_LOG_FABRIC, "lli get response failed, alps_status=%d(%s)\n", alps_status, strerror(errno)); ret = -FI_EIO; goto err; } rdmacred_rsp = malloc(alps_count); if (rdmacred_rsp == NULL) { ret = -FI_ENOMEM; goto err; } memset(rdmacred_rsp, 0, alps_count); ret = alps_app_lli_get_response_bytes(rdmacred_rsp, alps_count); if (ret != ALPS_APP_LLI_ALPS_STAT_OK) { GNIX_WARN(FI_LOG_FABRIC, "lli get response failed, ret=%d(%s)\n", ret, strerror(errno)); ret = -FI_EIO; goto err; } rdmacred_buf = (alpsAppGni_t *) rdmacred_rsp->u.buf; /* * just use the first ptag/cookie for now */ gnix_app_ptag = rdmacred_buf[0].ptag; gnix_app_cookie = rdmacred_buf[0].cookie; /* * alps_get_placement_info(uint64_t apid, alpsAppLayout_t *appLayout, * int **placementList, int **targetNids, int **targetPes, * int **targetLen, struct in_addr **targetIps, int **startPe, * int **totalPes, int **nodePes, int **peCpus); */ ret = alps_get_placement_info(gnix_apid, &gnix_appLayout, &gnix_app_placementList, &gnix_app_targetNids, &gnix_app_targetPes, &gnix_app_targetLen, &gnix_app_targetIps, &gnix_app_startPe, &gnix_app_totalPes, &gnix_app_nodePes, &gnix_app_peCpus); if (ret != 1) { GNIX_WARN(FI_LOG_FABRIC, "alps_get_placement_info failed, ret=%d(%s)\n", ret, strerror(errno)); ret = -FI_EIO; goto err; } gnix_pes_on_node = gnix_appLayout.numPesHere; alps_init = true; ret = 0; err: alps_app_lli_unlock(); if (rdmacred_rsp != NULL) { free(rdmacred_rsp); } return ret; }