/** * Initialize the module */ static int init(void) { int max_length, rc; #if WANT_PMI2_SUPPORT /* TODO -- is this ok */ max_length = 1024; #else if (PMI_SUCCESS != (rc = PMI_KVS_Get_name_length_max(&max_length))) { OPAL_PMI_ERROR(rc, "PMI_KVS_Get_name_length_max"); return ORTE_ERROR; } #endif pmi_kvs_name = (char*)malloc(max_length); if (NULL == pmi_kvs_name) { return ORTE_ERR_OUT_OF_RESOURCE; } #if WANT_PMI2_SUPPORT rc = PMI2_Job_GetId(pmi_kvs_name, max_length); #else rc = PMI_KVS_Get_my_name(pmi_kvs_name,max_length); #endif if (PMI_SUCCESS != rc) { OPAL_PMI_ERROR(rc, "PMI_KVS_Get_my_name"); return ORTE_ERROR; } return ORTE_SUCCESS; }
static int setup_pmi(void) { int max_length, rc; #if WANT_PMI2_SUPPORT pmi_vallen_max = PMI2_MAX_VALLEN; #else rc = PMI_KVS_Get_value_length_max(&pmi_vallen_max); if (PMI_SUCCESS != rc) { OPAL_PMI_ERROR(rc, "PMI_Get_value_length_max"); return OPAL_ERROR; } #endif #if WANT_PMI2_SUPPORT /* TODO -- is this ok */ max_length = 1024; #else if (PMI_SUCCESS != (rc = PMI_KVS_Get_name_length_max(&max_length))) { OPAL_PMI_ERROR(rc, "PMI_KVS_Get_name_length_max"); return OPAL_ERROR; } #endif pmi_kvs_name = (char*)malloc(max_length); if (NULL == pmi_kvs_name) { return OPAL_ERR_OUT_OF_RESOURCE; } #if WANT_PMI2_SUPPORT rc = PMI2_Job_GetId(pmi_kvs_name, max_length); #else rc = PMI_KVS_Get_my_name(pmi_kvs_name,max_length); #endif if (PMI_SUCCESS != rc) { OPAL_PMI_ERROR(rc, "PMI_KVS_Get_my_name"); return OPAL_ERROR; } #if WANT_PMI2_SUPPORT pmi_keylen_max = PMI2_MAX_KEYLEN; #else if (PMI_SUCCESS != (rc = PMI_KVS_Get_key_length_max(&pmi_keylen_max))) { OPAL_PMI_ERROR(rc, "PMI_KVS_Get_key_length_max"); return OPAL_ERROR; } #endif return OPAL_SUCCESS; }
/*
 * Pack a key/value pair into the component's global cache buffer.
 * The scope argument is currently ignored -- everything goes to the
 * global cache.  Returns the dss pack status (OPAL_SUCCESS on
 * success), or OPAL_ERROR if PMIx was never initialized.
 */
static int cray_put(opal_pmix_scope_t scope, opal_value_t *kv)
{
    int rc;

    opal_output_verbose(10, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray cray_put key %s scope %d\n",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kv->key, scope);

    if (!pmix_init_count) {
        return OPAL_ERROR;
    }

    /* lazily create the single (global) cache buffer on first use */
    if (NULL == mca_pmix_cray_component.cache_global) {
        mca_pmix_cray_component.cache_global = OBJ_NEW(opal_buffer_t);
    }

    opal_output_verbose(20, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray put global data for key %s type %d",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kv->key, kv->type);

    rc = opal_dss.pack(mca_pmix_cray_component.cache_global, &kv, 1, OPAL_VALUE);
    if (OPAL_SUCCESS != rc) {
        OPAL_PMI_ERROR(rc, "pmix:cray opal_dss.pack returned error");
        OPAL_ERROR_LOG(rc);
    }

    return rc;
}
static int cray_local_info(int vpid, int **ranks_ret, int *procs_ret, char **error) { int *ranks; int procs = -1; int rc; char *pmapping = (char*)malloc(PMI2_MAX_VALLEN); if( pmapping == NULL ){ *error = "mca_common_pmix_local_info: could not get memory for PMIv2 process mapping"; return OPAL_ERR_OUT_OF_RESOURCE; } int found; int my_node; rc = PMI2_Info_GetJobAttr("PMI_process_mapping", pmapping, PMI2_MAX_VALLEN, &found); if( !found || PMI_SUCCESS != rc ) { /* can't check PMI_SUCCESS as some folks (i.e., Cray) don't define it */ OPAL_PMI_ERROR(rc,"PMI2_Info_GetJobAttr"); *error = "mca_common_pmix_local_info: could not get PMI_process_mapping"; return OPAL_ERROR; } ranks = pmix_cray_parse_pmap(pmapping, vpid, &my_node, &procs); if (NULL == ranks) { *error = "mca_common_pmix_local_info: could not get memory for PMIv2 local ranks"; return OPAL_ERR_OUT_OF_RESOURCE; } free(pmapping); *ranks_ret = ranks; *procs_ret = procs; return OPAL_SUCCESS; }
/*
 * Execute a job-wide barrier via PMI, then complete the collective by
 * clearing its active flag and invoking any registered callback.  A
 * single-process job skips the PMI call entirely.
 */
static int pmi_barrier(orte_grpcomm_collective_t *coll)
{
    int rc;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:pmi entering barrier",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (1 == orte_process_info.num_procs) {
        /* alone in the job -- nobody to synchronize with */
        OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:pmi:barrier only one proc",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    } else {
#if WANT_PMI2_SUPPORT
        /* PMI2 doesn't provide a barrier, so use the Fence function here */
        if (PMI_SUCCESS != (rc = PMI2_KVS_Fence())) {
            OPAL_PMI_ERROR(rc, "PMI2_KVS_Fence");
            return ORTE_ERROR;
        }
#else
        /* use the PMI barrier function */
        if (PMI_SUCCESS != (rc = PMI_Barrier())) {
            OPAL_PMI_ERROR(rc, "PMI_Barrier");
            return ORTE_ERROR;
        }
#endif
        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:pmi barrier complete",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }

    /* shared completion path: mark done and run the user callback */
    coll->active = false;
    if (NULL != coll->cbfunc) {
        coll->cbfunc(NULL, coll->cbdata);
    }
    return ORTE_SUCCESS;
}
/*
 * Fetch the value stored under `key` from the PMI1 KVS into the
 * caller-supplied buffer (at most maxvalue bytes).
 */
static int kvs_get(const char key[], char value [], int maxvalue)
{
    int status = PMI_KVS_Get(pmix_kvs_name, key, value, maxvalue);

    if (PMI_SUCCESS != status) {
        OPAL_PMI_ERROR(status, "PMI_KVS_Get");
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
int mca_common_pmi_publish(const char *service_name, const char *port_name) { #if WANT_PMI2_SUPPORT if( mca_common_pmi_version == 2){ if (PMI2_SUCCESS != (rc = PMI2_Nameserv_publish(service_name, NULL, port_name))) { OPAL_PMI_ERROR(rc, "PMI2_Nameserv_publish"); return OPAL_ERROR; } } else #endif { if (PMI_SUCCESS != (rc = PMI_Publish_name(service_name, port_name))) { OPAL_PMI_ERROR(rc, "PMI_Publish_name"); return OPAL_ERROR; } } return OPAL_SUCCESS; }
/*
 * Disconnect from the PMI2 job identified by jobId, mapping any PMI2
 * failure to OPAL_ERROR.
 */
static int s2_job_disconnect(const char jobId[])
{
    int status = PMI2_Job_Disconnect(jobId);

    if (PMI2_SUCCESS != status) {
        OPAL_PMI_ERROR(status, "PMI2_Job_Disconnect");
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
static int kvs_put(const char key[], const char value[]) { int rc; rc = PMI_KVS_Put(pmix_kvs_name, key, value); if( PMI_SUCCESS != rc ){ OPAL_PMI_ERROR(rc, "PMI_KVS_Put"); return OPAL_ERROR; } return rc; }
/*
 * Remove a name previously published through the PMI2 name service.
 * The info list is accepted for interface compatibility but unused.
 *
 * Fix: removed a stray double semicolon after the success return.
 */
static int cray_unpublish(const char service_name[], opal_list_t *info)
{
    int rc;

    if (PMI_SUCCESS != (rc = PMI2_Nameserv_unpublish(service_name, NULL))) {
        OPAL_PMI_ERROR(rc, "PMI2_Nameserv_unpublish");
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
/*
 * Commit locally-stored KVS values and synchronize with the other
 * processes.  Under PMI v1 an explicit commit is required before the
 * barrier; PMI v2 has no commit -- its fence (performed inside
 * mca_common_pmi_barrier) covers both steps.
 *
 * Fix: `rc` was used without ever being declared, which does not
 * compile.
 */
int mca_common_pmi_commit(char *kvs_name)
{
    int rc;

    if( mca_common_pmi_version == 1 ){
        if (PMI_SUCCESS != (rc = PMI_KVS_Commit(kvs_name))) {
            OPAL_PMI_ERROR(rc, "PMI_KVS_Commit");
            return OPAL_ERROR;
        }
    }
    return mca_common_pmi_barrier();
}
/*
 * Fetch the value stored under `key` from the PMI2 KVS into the
 * caller's buffer.  A negative returned length is also treated as a
 * failure.
 */
static int kvs_get(const char key[], char value [], int maxvalue)
{
    int outlen;
    int status = PMI2_KVS_Get(pmix_kvs_name, PMI2_ID_NULL, key,
                              value, maxvalue, &outlen);

    if (PMI2_SUCCESS != status || outlen < 0) {
        OPAL_PMI_ERROR(status, "PMI2_KVS_Get");
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
/*
 * Publish service_name -> port via the PMI1 name service; the info
 * list is accepted for interface compatibility but unused.
 */
static int s1_publish(const char service_name[],
                      opal_list_t *info,
                      const char port[])
{
    int status = PMI_Publish_name(service_name, port);

    if (PMI_SUCCESS != status) {
        OPAL_PMI_ERROR(status, "PMI_Publish_name");
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
/*
 * Connect to the PMI2 job identified by jobId.  The returned
 * communication handle is currently discarded.
 */
static int s2_job_connect(const char jobId[])
{
    PMI2_Connect_comm_t conn;
    int status;

    /*FIXME should change function prototype to add void* conn */
    status = PMI2_Job_Connect(jobId, &conn);
    if (PMI2_SUCCESS != status) {
        OPAL_PMI_ERROR(status, "PMI2_Job_Connect");
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
/*
 * Unpublish is not yet implemented for the s2 component.  The
 * disabled code below sketches what an implementation would look
 * like once the key -> service-name mapping is decided.
 */
static int s2_unpublish(char **keys, opal_list_t *info)
{
#if 0
    int rc;

    if (PMI2_SUCCESS != (rc = PMI2_Nameserv_unpublish(service_name, NULL))) {
        OPAL_PMI_ERROR(rc, "PMI2_Nameserv_unpublish");
        return OPAL_ERROR;
    }
#endif
    return OPAL_ERR_NOT_IMPLEMENTED;
}
/*
 * Publish service_name -> port via the PMI2 name service; the info
 * list is accepted for interface compatibility but unused.
 */
static int s2_publish(const char service_name[],
                      opal_list_t *info,
                      const char port[])
{
    int status = PMI2_Nameserv_publish(service_name, NULL, port);

    if (PMI2_SUCCESS != status) {
        OPAL_PMI_ERROR(status, "PMI2_Nameserv_publish");
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
/*
 * Resolve a published service name via the PMI2 name service, writing
 * at most portLen bytes into the caller's port buffer.  The info list
 * is accepted for interface compatibility but unused.
 */
static int cray_lookup(const char service_name[], opal_list_t *info,
                       char port[], int portLen)
{
    int status = PMI2_Nameserv_lookup(service_name, NULL, port, portLen);

    if (PMI_SUCCESS != status) {
        OPAL_PMI_ERROR(status, "PMI2_Nameserv_lookup");
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
/*
 * Lookup is not yet implemented for the s2 component.  The disabled
 * code below sketches the eventual PMI2 name-service call.
 */
static int s2_lookup(opal_list_t *data, opal_list_t *info)
{
#if 0
    int rc;

    if (PMI2_SUCCESS != (rc = PMI2_Nameserv_lookup(service_name, NULL, port, portLen))) {
        OPAL_PMI_ERROR(rc, "PMI2_Nameserv_lookup");
        return OPAL_ERROR;
    }
#endif
    return OPAL_ERR_NOT_IMPLEMENTED;
}
int mca_common_pmi_lookup(const char *service_name, char **port_ret) { // FIXME: // 1. Why don't we malloc memory for the port for PMI v1? // 2. Maybe error handling is needed in pbusub? // 3. Is it legal to call OPAL_PMI_ERROR for PMIv2 rc? char *port = NULL; *port_ret = port; int rc; #if WANT_PMI2_SUPPORT if( mca_common_pmi_version == 2 ){ port = (char*)malloc(1024*sizeof(char)); /* arbitrary size */ if( port == NULL ){ return OPAL_ERR_OUT_OF_RESOURCE; } if (PMI_SUCCESS != (rc = PMI2_Nameserv_lookup(service_name, NULL, port, 1024))) { OPAL_PMI_ERROR(rc, "PMI2_Nameserv_lookup"); free(port); return OPAL_ERROR; } } else #endif { // Allocate mem for port here? Otherwise we won't get success! // SLURM PMIv1 doesn't implement this function if (PMI_SUCCESS != (rc = PMI_Lookup_name(service_name, port))) { OPAL_PMI_ERROR(rc, "PMI_Lookup_name"); return OPAL_ERROR; } } *port_ret = port; return OPAL_SUCCESS; }
bool mca_common_pmi_rank(int *rank) { #if !WANT_PMI2_SUPPORT { int ret; if (PMI_SUCCESS != (ret = PMI_Get_rank(&mca_common_pmi_init_rank))) { OPAL_PMI_ERROR(ret, "PMI_Get_rank"); return false; } } #endif *rank = mca_common_pmi_init_rank; return true; }
bool mca_common_pmi_size(int *size) { #if !WANT_PMI2_SUPPORT { int ret; if (PMI_SUCCESS != (ret = PMI_Get_universe_size(&mca_common_pmi_init_size))) { OPAL_PMI_ERROR(ret, "PMI_Get_universe_size"); return false; } } #endif *size = mca_common_pmi_init_size; return true; }
static int s1_commit(void) { int rc; /* check if there is partially filled meta key and put them */ opal_pmix_base_commit_packed (&pmix_packed_data, &pmix_packed_data_offset, &pmix_packed_encoded_data, &pmix_packed_encoded_data_offset, pmix_vallen_max, &pmix_pack_key, kvs_put); if (PMI_SUCCESS != (rc = PMI_KVS_Commit(pmix_kvs_name))) { OPAL_PMI_ERROR(rc, "PMI_KVS_Commit"); return OPAL_ERROR; } return OPAL_SUCCESS; }
/*
 * Store a key/value pair via PMI2 (the KVS is implicit under PMI2, so
 * no KVS name argument is needed).
 */
static int kvs_put(const char key[], const char value[])
{
    int status;

    opal_output_verbose(10, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray kvs_put key %s value %s",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), key, value);

    status = PMI2_KVS_Put(key, value);
    if (PMI_SUCCESS != status) {
        OPAL_PMI_ERROR(status, "PMI2_KVS_Put");
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
/*
 * Look up a published service name via PMI1 and write the result into
 * the caller's port buffer.  portLen is unused because PMI_Lookup_name
 * takes no buffer-length argument.
 *
 * NOTE(review): SLURM's PMIv1 doesn't implement this call, so it may
 * always fail under SLURM -- confirm against the target launcher.
 */
static int s1_lookup(const char service_name[], opal_list_t *info,
                     char port[], int portLen)
{
    int status = PMI_Lookup_name(service_name, port);

    if (PMI_SUCCESS != status) {
        OPAL_PMI_ERROR(status, "PMI_Lookup_name");
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
/*
 * Flush any partially-filled meta key to the KVS, then execute a
 * PMI2 fence to synchronize with the other processes.  The procs /
 * nprocs arguments are currently unused -- the fence is job-wide.
 */
static int cray_fence(opal_process_name_t *procs, size_t nprocs)
{
    int status;

    /* check if there is partially filled meta key and put them */
    if (0 != pmix_packed_data_offset && NULL != pmix_packed_data) {
        opal_pmix_base_commit_packed(pmix_packed_data, pmix_packed_data_offset,
                                     pmix_vallen_max, &pmix_pack_key, kvs_put);
        pmix_packed_data_offset = 0;
        free(pmix_packed_data);
        pmix_packed_data = NULL;
    }

    status = PMI2_KVS_Fence();
    if (PMI_SUCCESS != status) {
        OPAL_PMI_ERROR(status, "PMI2_KVS_Fence");
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
int mca_common_pmi_barrier() { #if WANT_PMI2_SUPPORT if( mca_common_pmi_version == 2 ){ /* PMI2 doesn't provide a barrier, so use the Fence function here */ if (PMI2_SUCCESS != (rc = PMI2_KVS_Fence())) { // FIX ME: OPAL_PMI2_ERROR(rc, "PMI2_KVS_Fence"); return OPAL_ERROR; } } else #endif { /* use the PMI barrier function */ if (PMI_SUCCESS != (rc = PMI_Barrier())) { OPAL_PMI_ERROR(rc, "PMI_Barrier"); return OPAL_ERROR; } } return OPAL_SUCCESS; }
/*
 * Disconnect from the job of the single process named in `procs`.
 * Only a one-element list is supported.
 *
 * Fix: the asprintf return value was ignored; on failure `jobid` is
 * indeterminate and must not be passed to PMI2_Job_Disconnect or
 * free().
 */
static int s2_job_disconnect(opal_list_t *procs)
{
    int rc;
    opal_namelist_t *nm;
    char *jobid;

    if (NULL == procs || 1 < opal_list_get_size(procs)) {
        return OPAL_ERR_NOT_SUPPORTED;
    }
    nm = (opal_namelist_t*)opal_list_get_first(procs);

    if (0 > asprintf(&jobid, "%s", OPAL_JOBID_PRINT(nm->name.jobid))) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    rc = PMI2_Job_Disconnect(jobid);
    free(jobid);
    if( PMI2_SUCCESS != rc ){
        OPAL_PMI_ERROR(rc, "PMI2_Job_Disconnect");
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
int mca_common_pmi_put(const char *kvs_name, const char *key, const char *value) { int rc; #if WANT_PMI2_SUPPORT if( mca_common_pmi_version == 2 ){ if( PMI2_SUCCESS != PMI2_KVS_Put(key, value) ){ // FIXME: OPAL_PMI2_ERROR(rc, "PMI2_KVS_Put"); return OPAL_ERROR; } } else #endif { rc = PMI_KVS_Put(kvs_name, key, value); if( PMI_SUCCESS != rc ){ OPAL_PMI_ERROR(rc, "PMI_KVS_Put"); return OPAL_ERROR; } } return OPAL_SUCCESS; }
/*
 * Connect to the job of the single process named in `procs`.  Only a
 * one-element list is supported; the returned communication handle is
 * currently discarded.
 *
 * Fix: the asprintf return value was ignored; on failure `jobid` is
 * indeterminate and must not be passed to PMI2_Job_Connect or free().
 */
static int s2_job_connect(opal_list_t *procs)
{
    int rc;
    PMI2_Connect_comm_t conn;
    opal_namelist_t *nm;
    char *jobid;

    if (NULL == procs || 1 < opal_list_get_size(procs)) {
        return OPAL_ERR_NOT_SUPPORTED;
    }
    nm = (opal_namelist_t*)opal_list_get_first(procs);

    if (0 > asprintf(&jobid, "%s", OPAL_JOBID_PRINT(nm->name.jobid))) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /*FIXME should change function prototype to add void* conn */
    rc = PMI2_Job_Connect(jobid, &conn);
    free(jobid);
    if( PMI2_SUCCESS != rc ){
        OPAL_PMI_ERROR(rc, "PMI2_Job_Connect");
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
int mca_common_pmi_get(const char *kvs_name, const char *key, char *value, int valuelen) { int rc; #if WANT_PMI2_SUPPORT if( mca_common_pmi_version == 2 ){ int len; rc = PMI2_KVS_Get(kvs_name, PMI2_ID_NULL, key, value, valuelen, &len); if( PMI2_SUCCESS != rc ){ // OPAL_PMI2_ERROR(rc, "PMI_KVS_Put"); return OPAL_ERROR; } } else #endif { rc = PMI_KVS_Get(kvs_name, key, value, valuelen); if( PMI_SUCCESS != rc ){ OPAL_PMI_ERROR(rc, "PMI_KVS_Put"); return OPAL_ERROR; } } return OPAL_SUCCESS; }