Exemple #1
0
static void _set_env(char ***env_ptr, void *gres_ptr, int node_inx,
		     bitstr_t *usable_gres,
		     bool *already_seen, int *local_inx,
		     bool reset, bool is_job)
{
	char *global_list = NULL, *local_list = NULL;
	char *slurm_env_var = NULL;

	if (is_job)
			slurm_env_var = "SLURM_JOB_NICS";
	else
			slurm_env_var = "SLURM_STEP_NICS";

	if (*already_seen) {
		global_list = xstrdup(getenvp(*env_ptr, slurm_env_var));
		local_list = xstrdup(getenvp(*env_ptr,
					     "OMPI_MCA_btl_openib_if_include"));
	}

	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
			    usable_gres, "mlx4_", local_inx,
			    &local_list, &global_list,
			    reset, is_job);

	if (global_list) {
		env_array_overwrite(env_ptr, slurm_env_var, global_list);
		xfree(global_list);
	}

	if (local_list) {
		env_array_overwrite(
			env_ptr, "OMPI_MCA_btl_openib_if_include", local_list);
		xfree(local_list);
		*already_seen = true;
	}
}
Exemple #2
0
static void _set_env(char ***env_ptr, void *gres_ptr, int node_inx,
		     bitstr_t *usable_gres,
		     bool *already_seen, int *local_inx,
		     bool reset, bool is_job)
{
	char *global_list = NULL, *local_list = NULL, *perc_env = NULL;
	char perc_str[64], *slurm_env_var = NULL;
	uint64_t count_on_dev, gres_per_node = 0, percentage;
	int global_id = -1;

	if (is_job)
		slurm_env_var = "SLURM_JOB_GRES";
	else
		slurm_env_var = "SLURM_STEP_GRES";

	if (*already_seen) {
		global_list = xstrdup(getenvp(*env_ptr, slurm_env_var));
		local_list = xstrdup(getenvp(*env_ptr,
					     "CUDA_VISIBLE_DEVICES"));
		perc_env = xstrdup(getenvp(*env_ptr,
					  "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"));
	}

	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
			    usable_gres, "", local_inx,
			    &gres_per_node, &local_list, &global_list,
			    reset, is_job, &global_id);

	if (perc_env) {
		env_array_overwrite(env_ptr,
				    "CUDA_MPS_ACTIVE_THREAD_perc_str",
				    perc_env);
		xfree(perc_env);
	} else if (gres_per_node && mps_info) {
		count_on_dev = _get_dev_count(global_id);
		if (count_on_dev > 0) {
			percentage = (gres_per_node * 100) / count_on_dev;
			percentage = MAX(percentage, 1);
		} else
			percentage = 0;
		snprintf(perc_str, sizeof(perc_str), "%"PRIu64, percentage);
		env_array_overwrite(env_ptr,
				    "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE",
				    perc_str);
	} else if (gres_per_node) {
		error("%s: mps_info list is NULL", __func__);
		snprintf(perc_str, sizeof(perc_str), "%"PRIu64, gres_per_node);
		env_array_overwrite(env_ptr,
				    "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE",
				    perc_str);
	}

	if (global_list) {
		env_array_overwrite(env_ptr, slurm_env_var, global_list);
		xfree(global_list);
	}

	if (local_list) {
		/*
		 * CUDA_VISIBLE_DEVICES is relative to the MPS server.
		 * With only one GPU under the control of MPS, the device
		 * number will always be "0".
		 */
		env_array_overwrite(env_ptr, "CUDA_VISIBLE_DEVICES", "0");
		env_array_overwrite(env_ptr, "GPU_DEVICE_ORDINAL", "0");
		xfree(local_list);
		*already_seen = true;
	}
}