/*
 * Publish the NIC selection for a job or step into its environment.
 *
 * env_ptr      - environment array to update in place
 * gres_ptr     - opaque GRES state, handed through to common_gres_set_env()
 * node_inx     - node index, handed through to common_gres_set_env()
 * usable_gres  - bitmap handed through to common_gres_set_env()
 * already_seen - in: true if an earlier call already populated the
 *                environment, in which case the current values are used as
 *                the starting lists; out: set to true once the OpenMPI
 *                include list has been written
 * local_inx    - handed through to common_gres_set_env()
 * reset        - handed through to common_gres_set_env()
 * is_job       - selects SLURM_JOB_NICS (true) or SLURM_STEP_NICS (false)
 */
static void _set_env(char ***env_ptr, void *gres_ptr, int node_inx,
		     bitstr_t *usable_gres, bool *already_seen,
		     int *local_inx, bool reset, bool is_job)
{
	char *ompi_var = "OMPI_MCA_btl_openib_if_include";
	char *nics_var = is_job ? "SLURM_JOB_NICS" : "SLURM_STEP_NICS";
	char *global_nics = NULL;
	char *local_nics = NULL;

	/* Start from the values a previous invocation left behind. */
	if (*already_seen) {
		global_nics = xstrdup(getenvp(*env_ptr, nics_var));
		local_nics = xstrdup(getenvp(*env_ptr, ompi_var));
	}

	/* "mlx4_" is passed as the prefix argument of the common helper. */
	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
			    usable_gres, "mlx4_", local_inx,
			    &local_nics, &global_nics, reset, is_job);

	if (global_nics) {
		env_array_overwrite(env_ptr, nics_var, global_nics);
		xfree(global_nics);
	}

	if (local_nics) {
		env_array_overwrite(env_ptr, ompi_var, local_nics);
		xfree(local_nics);
		*already_seen = true;
	}
}
/*
 * Publish the MPS selection for a job or step into its environment.
 *
 * Sets SLURM_JOB_GRES or SLURM_STEP_GRES (depending on is_job),
 * CUDA_MPS_ACTIVE_THREAD_PERCENTAGE, and, when a local device list was
 * produced, CUDA_VISIBLE_DEVICES and GPU_DEVICE_ORDINAL.
 *
 * env_ptr      - environment array to update in place
 * gres_ptr     - opaque GRES state, handed through to common_gres_set_env()
 * node_inx     - node index, handed through to common_gres_set_env()
 * usable_gres  - bitmap handed through to common_gres_set_env()
 * already_seen - in: true if an earlier call already populated the
 *                environment, in which case the current values are reused;
 *                out: set to true once the device variables are written
 * local_inx    - handed through to common_gres_set_env()
 * reset        - handed through to common_gres_set_env()
 * is_job       - selects SLURM_JOB_GRES (true) or SLURM_STEP_GRES (false)
 */
static void _set_env(char ***env_ptr, void *gres_ptr, int node_inx,
		     bitstr_t *usable_gres, bool *already_seen,
		     int *local_inx, bool reset, bool is_job)
{
	char *global_list = NULL, *local_list = NULL, *perc_env = NULL;
	char perc_str[64], *slurm_env_var = NULL;
	uint64_t count_on_dev, gres_per_node = 0, percentage;
	int global_id = -1;

	if (is_job)
		slurm_env_var = "SLURM_JOB_GRES";
	else
		slurm_env_var = "SLURM_STEP_GRES";

	/* Start from the values a previous invocation left behind. */
	if (*already_seen) {
		global_list = xstrdup(getenvp(*env_ptr, slurm_env_var));
		local_list = xstrdup(getenvp(*env_ptr,
					     "CUDA_VISIBLE_DEVICES"));
		perc_env = xstrdup(getenvp(*env_ptr,
					   "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"));
	}

	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
			    usable_gres, "", local_inx, &gres_per_node,
			    &local_list, &global_list, reset, is_job,
			    &global_id);

	if (perc_env) {
		/*
		 * Carry the earlier percentage forward. It must be written
		 * back under the same name it was read from.
		 */
		env_array_overwrite(env_ptr,
				    "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE",
				    perc_env);
		xfree(perc_env);
	} else if (gres_per_node && mps_info) {
		/*
		 * Express the request as a percentage of the count reported
		 * for the selected device, never rounding a non-zero request
		 * down to 0. A device count of 0 yields 0 rather than a
		 * division by zero.
		 */
		count_on_dev = _get_dev_count(global_id);
		if (count_on_dev > 0) {
			percentage = (gres_per_node * 100) / count_on_dev;
			percentage = MAX(percentage, 1);
		} else
			percentage = 0;
		snprintf(perc_str, sizeof(perc_str), "%"PRIu64, percentage);
		env_array_overwrite(env_ptr,
				    "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE",
				    perc_str);
	} else if (gres_per_node) {
		/* No device table available: export the raw count instead. */
		error("%s: mps_info list is NULL", __func__);
		snprintf(perc_str, sizeof(perc_str), "%"PRIu64, gres_per_node);
		env_array_overwrite(env_ptr,
				    "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE",
				    perc_str);
	}

	if (global_list) {
		env_array_overwrite(env_ptr, slurm_env_var, global_list);
		xfree(global_list);
	}

	if (local_list) {
		/*
		 * CUDA_VISIBLE_DEVICES is relative to the MPS server.
		 * With only one GPU under the control of MPS, the device
		 * number will always be "0".
		 */
		env_array_overwrite(env_ptr, "CUDA_VISIBLE_DEVICES", "0");
		env_array_overwrite(env_ptr, "GPU_DEVICE_ORDINAL", "0");
		xfree(local_list);
		*already_seen = true;
	}
}