/*
 * Set environment variables as appropriate for a job (i.e. all tasks) based
 * upon the job step's GRES state.
 */
extern void step_set_env(char ***job_env_ptr, void *gres_ptr)
{
	int i, len;
	char *dev_list = NULL;
	gres_step_state_t *gres_step_ptr = (gres_step_state_t *) gres_ptr;

	if ((gres_step_ptr != NULL) &&
	    (gres_step_ptr->node_cnt == 1) &&
	    (gres_step_ptr->gres_bit_alloc != NULL) &&
	    (gres_step_ptr->gres_bit_alloc[0] != NULL)) {
		len = bit_size(gres_step_ptr->gres_bit_alloc[0]);
		for (i = 0; i < len; i++) {
			if (!bit_test(gres_step_ptr->gres_bit_alloc[0], i))
				continue;
			if (!dev_list)
				dev_list = xmalloc(128);
			else
				xstrcat(dev_list, ",");
			xstrfmtcat(dev_list, "%d", i);
		}
	}
	if (dev_list) {
		env_array_overwrite(job_env_ptr, "CUDA_VISIBLE_DEVICES",
				    dev_list);
		xfree(dev_list);
	} else {
		/* The gres.conf file must identify specific device files
		 * in order to set the CUDA_VISIBLE_DEVICES env var */
		env_array_overwrite(job_env_ptr, "CUDA_VISIBLE_DEVICES",
				    "NoDevFiles");
	}
}
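/*
 * Illustrative sketch (not SLURM code): how a comma-separated device list
 * such as "0,2,3" is derived from an allocation bitmap, as in the plugin
 * above. Plain C stands in for bit_test()/xstrfmtcat(), and the bool array
 * is a hypothetical stand-in for the bitstr_t bitmap.
 */
#include <stdio.h>
#include <string.h>
#include <stdbool.h>

int main(void)
{
	bool alloc[4] = { true, false, true, true };	/* devices 0, 2, 3 */
	char dev_list[128] = "";
	char entry[16];

	for (int i = 0; i < 4; i++) {
		if (!alloc[i])			/* stand-in for bit_test() */
			continue;
		if (dev_list[0] != '\0')
			strcat(dev_list, ",");
		snprintf(entry, sizeof(entry), "%d", i);
		strcat(dev_list, entry);
	}
	/* Prints CUDA_VISIBLE_DEVICES=0,2,3 */
	printf("CUDA_VISIBLE_DEVICES=%s\n",
	       dev_list[0] ? dev_list : "NoDevFiles");
	return 0;
}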
static stepd_step_rec_t *
_step_setup(slurm_addr_t *cli, slurm_addr_t *self, slurm_msg_t *msg)
{
	stepd_step_rec_t *job = NULL;

	switch (msg->msg_type) {
	case REQUEST_BATCH_JOB_LAUNCH:
		debug2("setup for a batch_job");
		job = mgr_launch_batch_job_setup(msg->data, cli);
		break;
	case REQUEST_LAUNCH_TASKS:
		debug2("setup for a launch_task");
		job = mgr_launch_tasks_setup(msg->data, cli, self,
					     msg->protocol_version);
		break;
	default:
		fatal("handle_launch_message: Unrecognized launch RPC");
		break;
	}

	if (!job) {
		error("_step_setup: no job returned");
		return NULL;
	}

	job->jmgr_pid = getpid();
	job->jobacct = jobacctinfo_create(NULL);

	/* Establish GRES environment variables */
	if (conf->debug_flags & DEBUG_FLAG_GRES) {
		gres_plugin_job_state_log(job->job_gres_list, job->jobid);
		gres_plugin_step_state_log(job->step_gres_list, job->jobid,
					   job->stepid);
	}
	if (msg->msg_type == REQUEST_BATCH_JOB_LAUNCH)
		gres_plugin_job_set_env(&job->env, job->job_gres_list, 0);
	else if (msg->msg_type == REQUEST_LAUNCH_TASKS)
		gres_plugin_step_set_env(&job->env, job->step_gres_list, 0);

	/*
	 * Add slurmd node topology information to the job env array
	 */
	env_array_overwrite(&job->env, "SLURM_TOPOLOGY_ADDR",
			    conf->node_topo_addr);
	env_array_overwrite(&job->env, "SLURM_TOPOLOGY_ADDR_PATTERN",
			    conf->node_topo_pattern);

	set_msg_node_id(job);

	return job;
}
/*
 * task_p_pre_launch() is called prior to exec of application task.
 * It is followed by TaskProlog program (from slurm.conf) and
 * --task-prolog (from srun command line).
 */
extern int task_p_pre_launch (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	int rc;
	uint64_t apid;
	DEF_TIMERS;

	START_TIMER;
	apid = SLURM_ID_HASH(job->jobid, job->stepid);
	debug2("task_p_pre_launch: %u.%u, apid %"PRIu64", task %d",
	       job->jobid, job->stepid, apid, job->envtp->procid);

	/*
	 * Send the rank to the application's PMI layer via an environment
	 * variable.
	 */
	rc = env_array_overwrite_fmt(&job->env, ALPS_APP_PE_ENV,
				     "%d", job->envtp->procid);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s", ALPS_APP_PE_ENV);
		return SLURM_ERROR;
	}

	/*
	 * Set the PMI_NO_FORK environment variable.
	 */
	rc = env_array_overwrite(&job->env, PMI_NO_FORK_ENV, "1");
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s", PMI_NO_FORK_ENV);
		return SLURM_ERROR;
	}

	/*
	 * Notify the task which offset to use
	 */
	rc = env_array_overwrite_fmt(&job->env, LLI_STATUS_OFFS_ENV,
				     "%d", job->envtp->localid + 1);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s", LLI_STATUS_OFFS_ENV);
		return SLURM_ERROR;
	}

	/*
	 * Set the ALPS_APP_ID environment variable for use by
	 * Cray tools.
	 */
	rc = env_array_overwrite_fmt(&job->env, ALPS_APP_ID_ENV,
				     "%"PRIu64, apid);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s", ALPS_APP_ID_ENV);
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
/*
 * Reset environment variables as appropriate for a job (i.e. this one task)
 * based upon the job step's GRES state and assigned CPUs.
 */
extern void step_reset_env(char ***job_env_ptr, void *gres_ptr,
			   bitstr_t *usable_gres)
{
	int i, len, first_match = -1;
	char *dev_list = NULL;
	gres_step_state_t *gres_step_ptr = (gres_step_state_t *) gres_ptr;

	if ((gres_step_ptr != NULL) &&
	    (gres_step_ptr->node_cnt == 1) &&
	    (gres_step_ptr->gres_bit_alloc != NULL) &&
	    (gres_step_ptr->gres_bit_alloc[0] != NULL) &&
	    (usable_gres != NULL)) {
		len = MIN(bit_size(gres_step_ptr->gres_bit_alloc[0]),
			  bit_size(usable_gres));
		for (i = 0; i < len; i++) {
			if (!bit_test(gres_step_ptr->gres_bit_alloc[0], i))
				continue;
			if (first_match == -1)
				first_match = i;
			if (!bit_test(usable_gres, i))
				continue;
			if (!dev_list)
				dev_list = xmalloc(128);
			else
				xstrcat(dev_list, ",");
			if (nic_devices && (i < nb_available_files) &&
			    (nic_devices[i] >= 0)) {
				xstrfmtcat(dev_list, "mlx4_%d",
					   nic_devices[i]);
			} else {
				xstrfmtcat(dev_list, "mlx4_%d", i);
			}
		}
		if (!dev_list && (first_match != -1)) {
			i = first_match;
			dev_list = xmalloc(128);
			if (nic_devices && (i < nb_available_files) &&
			    (nic_devices[i] >= 0)) {
				xstrfmtcat(dev_list, "mlx4_%d",
					   nic_devices[i]);
			} else {
				xstrfmtcat(dev_list, "mlx4_%d", i);
			}
		}
	}
	if (dev_list) {
		/* We assume Mellanox cards and Open MPI programs */
		env_array_overwrite(job_env_ptr,
				    "OMPI_MCA_btl_openib_if_include",
				    dev_list);
		xfree(dev_list);
	}
}
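/*
 * Illustrative sketch (not SLURM code): the selection rule above
 * intersects the step's allocated devices with the set usable by this
 * task, and falls back to the first allocated device if the intersection
 * is empty. Plain bool arrays stand in for the bitstr_t bitmaps.
 */
#include <stdio.h>
#include <string.h>
#include <stdbool.h>

int main(void)
{
	bool allocated[4] = { false, true, false, true };  /* devices 1, 3 */
	bool usable[4]    = { true,  false, true,  true }; /* devices 0, 2, 3 */
	char dev_list[128] = "";
	char entry[16];
	int first_match = -1;

	for (int i = 0; i < 4; i++) {
		if (!allocated[i])
			continue;
		if (first_match == -1)
			first_match = i;	/* remember the fallback */
		if (!usable[i])
			continue;		/* keep allocated AND usable */
		if (dev_list[0])
			strcat(dev_list, ",");
		snprintf(entry, sizeof(entry), "mlx4_%d", i);
		strcat(dev_list, entry);
	}
	if (!dev_list[0] && first_match != -1)	/* empty intersection */
		snprintf(dev_list, sizeof(dev_list), "mlx4_%d", first_match);

	/* Prints OMPI_MCA_btl_openib_if_include=mlx4_3 */
	printf("OMPI_MCA_btl_openib_if_include=%s\n", dev_list);
	return 0;
}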
static void _set_env(char ***env_ptr, void *gres_ptr, int node_inx,
		     bitstr_t *usable_gres,
		     bool *already_seen, int *local_inx,
		     bool reset, bool is_job)
{
	char *global_list = NULL, *local_list = NULL;
	char *slurm_env_var = NULL;

	if (is_job)
		slurm_env_var = "SLURM_JOB_NICS";
	else
		slurm_env_var = "SLURM_STEP_NICS";

	if (*already_seen) {
		global_list = xstrdup(getenvp(*env_ptr, slurm_env_var));
		local_list = xstrdup(getenvp(*env_ptr,
					     "OMPI_MCA_btl_openib_if_include"));
	}

	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
			    usable_gres, "mlx4_", local_inx,
			    &local_list, &global_list, reset, is_job);

	if (global_list) {
		env_array_overwrite(env_ptr, slurm_env_var, global_list);
		xfree(global_list);
	}

	if (local_list) {
		env_array_overwrite(
			env_ptr, "OMPI_MCA_btl_openib_if_include", local_list);
		xfree(local_list);
		*already_seen = true;
	}
}
/*
 * Merge all of the environment variables in src_array into the
 * array dest_array. Any variables already found in dest_array
 * will be overwritten with the value from src_array.
 */
void env_array_merge(char ***dest_array, const char **src_array)
{
	char **ptr;
	char name[256], *value;

	if (src_array == NULL)
		return;

	value = xmalloc(ENV_BUFSIZE);
	for (ptr = (char **)src_array; *ptr != NULL; ptr++) {
		if (_env_array_entry_splitter(*ptr, name, sizeof(name),
					      value, ENV_BUFSIZE))
			env_array_overwrite(dest_array, name, value);
	}
	xfree(value);
}
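/*
 * Illustrative sketch (not SLURM code): merging one environ-style array
 * into the process environment. Each "NAME=value" entry is split at the
 * first '=' and applied with setenv(3), which overwrites like
 * env_array_overwrite(); _env_array_entry_splitter() plays the same
 * splitting role in the real code.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char *src[] = { "FOO=bar", "GREETING=hello world", NULL };
	char name[256];

	for (const char **p = src; *p != NULL; p++) {
		const char *eq = strchr(*p, '=');
		if (!eq || (size_t)(eq - *p) >= sizeof(name))
			continue;	/* skip malformed entries */
		memcpy(name, *p, eq - *p);
		name[eq - *p] = '\0';
		setenv(name, eq + 1, 1);	/* 1 = overwrite */
	}
	printf("FOO=%s\n", getenv("FOO"));	/* FOO=bar */
	return 0;
}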
/*
 * Append a single environment variable to an environment variable array.
 * If a variable by the same name already exists in the array, it is
 * overwritten with the new value.
 *
 * "value_fmt" supports printf-style formatting.
 *
 * Return 1 on success, and 0 on error.
 */
int env_array_overwrite_fmt(char ***array_ptr, const char *name,
			    const char *value_fmt, ...)
{
	int rc;
	char *value;
	va_list ap;

	value = xmalloc(ENV_BUFSIZE);
	va_start(ap, value_fmt);
	vsnprintf(value, ENV_BUFSIZE, value_fmt, ap);
	va_end(ap);
	rc = env_array_overwrite(array_ptr, name, value);
	xfree(value);

	return rc;
}
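/*
 * Illustrative sketch (not SLURM code): the same pattern as
 * env_array_overwrite_fmt() using only the standard library -- format the
 * value into a bounded buffer with vsnprintf(), then hand it to a plain
 * "set" primitive (setenv(3) here). setenv_fmt() is a hypothetical name;
 * it returns 1 on success and 0 on error to match the convention above.
 */
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static int setenv_fmt(const char *name, const char *value_fmt, ...)
{
	char value[1024];
	va_list ap;

	va_start(ap, value_fmt);
	vsnprintf(value, sizeof(value), value_fmt, ap);
	va_end(ap);
	return (setenv(name, value, 1) == 0) ? 1 : 0;
}

int main(void)
{
	if (!setenv_fmt("ALPS_APP_PE", "%d", 7))
		return 1;
	printf("ALPS_APP_PE=%s\n", getenv("ALPS_APP_PE"));	/* 7 */
	return 0;
}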
/*
 * Set environment variables as appropriate for a job (i.e. all tasks) based
 * upon the job step's GRES state.
 */
extern void step_set_env(char ***job_env_ptr, void *gres_ptr)
{
	int i, len, local_inx = 0;
	char *dev_list = NULL;
	gres_step_state_t *gres_step_ptr = (gres_step_state_t *) gres_ptr;
	bool use_local_dev_index = _use_local_device_index();

	if ((gres_step_ptr != NULL) &&
	    (gres_step_ptr->node_cnt == 1) &&
	    (gres_step_ptr->gres_bit_alloc != NULL) &&
	    (gres_step_ptr->gres_bit_alloc[0] != NULL)) {
		len = bit_size(gres_step_ptr->gres_bit_alloc[0]);
		for (i = 0; i < len; i++) {
			if (!bit_test(gres_step_ptr->gres_bit_alloc[0], i))
				continue;
			if (!dev_list)
				dev_list = xmalloc(128);
			else
				xstrcat(dev_list, ",");
			if (use_local_dev_index) {
				xstrfmtcat(dev_list, "mlx4_%d", local_inx++);
			} else if (nic_devices && (i < nb_available_files) &&
				   (nic_devices[i] >= 0)) {
				xstrfmtcat(dev_list, "mlx4_%d",
					   nic_devices[i]);
			} else {
				xstrfmtcat(dev_list, "mlx4_%d", i);
			}
		}
	} else if (gres_step_ptr && (gres_step_ptr->gres_cnt_alloc > 0)) {
		/* The gres.conf file must identify specific device files
		 * in order to set the OMPI_MCA_btl_openib_if_include env var */
		error("gres/nic unable to set OMPI_MCA_btl_openib_if_include, "
		      "no device files configured");
	} else {
		xstrcat(dev_list, "NoDevFiles");
	}

	if (dev_list) {
		/* We assume Mellanox cards and Open MPI programs */
		env_array_overwrite(job_env_ptr,
				    "OMPI_MCA_btl_openib_if_include",
				    dev_list);
		xfree(dev_list);
	}
}
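/*
 * Illustrative sketch (not SLURM code): the difference between global and
 * local device indexing above. With devices 1 and 3 allocated, global
 * naming keeps the system index ("mlx4_1,mlx4_3"), while local naming
 * renumbers from zero in allocation order ("mlx4_0,mlx4_1").
 */
#include <stdio.h>
#include <string.h>
#include <stdbool.h>

static void build_list(const bool *alloc, int len, bool use_local,
		       char *out, size_t out_len)
{
	char entry[16];
	int local_inx = 0;

	out[0] = '\0';
	for (int i = 0; i < len; i++) {
		if (!alloc[i])
			continue;
		if (out[0])
			strncat(out, ",", out_len - strlen(out) - 1);
		snprintf(entry, sizeof(entry), "mlx4_%d",
			 use_local ? local_inx++ : i);
		strncat(out, entry, out_len - strlen(out) - 1);
	}
}

int main(void)
{
	bool alloc[4] = { false, true, false, true };
	char list[128];

	build_list(alloc, 4, false, list, sizeof(list));
	printf("global: %s\n", list);	/* mlx4_1,mlx4_3 */
	build_list(alloc, 4, true, list, sizeof(list));
	printf("local:  %s\n", list);	/* mlx4_0,mlx4_1 */
	return 0;
}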
/*
 * task_p_pre_launch() is called prior to exec of application task.
 * It is followed by TaskProlog program (from slurm.conf) and
 * --task-prolog (from srun command line).
 */
extern int task_p_pre_launch (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	int rc;

	debug("task_p_pre_launch: %u.%u, task %d",
	      job->jobid, job->stepid, job->envtp->procid);

	/*
	 * Send the rank to the application's PMI layer via an environment
	 * variable.
	 */
	rc = env_array_overwrite_fmt(&job->env, ALPS_APP_PE_ENV,
				     "%d", job->envtp->procid);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s", ALPS_APP_PE_ENV);
		return SLURM_ERROR;
	}

	/*
	 * Set the PMI_NO_FORK environment variable.
	 */
	rc = env_array_overwrite(&job->env, PMI_NO_FORK_ENV, "1");
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s", PMI_NO_FORK_ENV);
		return SLURM_ERROR;
	}

	/*
	 * Notify the task which offset to use
	 */
	rc = env_array_overwrite_fmt(&job->env, LLI_STATUS_OFFS_ENV,
				     "%d", job->envtp->localid + 1);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s", LLI_STATUS_OFFS_ENV);
		return SLURM_ERROR;
	}
#endif
	return SLURM_SUCCESS;
}
/*
 * Set environment variables as appropriate for a job (i.e. all tasks) based
 * upon the job's GRES state.
 */
extern void job_set_env(char ***job_env_ptr, void *gres_ptr)
{
	int i, len;
	char *dev_list = NULL;
	gres_job_state_t *gres_job_ptr = (gres_job_state_t *) gres_ptr;

	if ((gres_job_ptr != NULL) &&
	    (gres_job_ptr->node_cnt == 1) &&
	    (gres_job_ptr->gres_bit_alloc != NULL) &&
	    (gres_job_ptr->gres_bit_alloc[0] != NULL)) {
		len = bit_size(gres_job_ptr->gres_bit_alloc[0]);
		for (i = 0; i < len; i++) {
			if (!bit_test(gres_job_ptr->gres_bit_alloc[0], i))
				continue;
			if (!dev_list)
				dev_list = xmalloc(128);
			else
				xstrcat(dev_list, ",");
			if (gpu_devices && (i < nb_available_files) &&
			    (gpu_devices[i] >= 0))
				xstrfmtcat(dev_list, "%d", gpu_devices[i]);
			else
				xstrfmtcat(dev_list, "%d", i);
		}
	} else if (gres_job_ptr && (gres_job_ptr->gres_cnt_alloc > 0)) {
		/* The gres.conf file must identify specific device files
		 * in order to set the CUDA_VISIBLE_DEVICES env var */
		error("gres/gpu unable to set CUDA_VISIBLE_DEVICES, "
		      "no device files configured");
	} else {
		xstrcat(dev_list, "NoDevFiles");
	}

	if (dev_list) {
		env_array_overwrite(job_env_ptr, "CUDA_VISIBLE_DEVICES",
				    dev_list);
		xfree(dev_list);
	}
}
/*
 * Write the IAA file and set the filename in the job's environment
 */
int write_iaa_file(stepd_step_rec_t *job, slurm_cray_jobinfo_t *sw_job,
		   int *ptags, int num_ptags, alpsc_peInfo_t *alpsc_pe_info)
{
	char *fname = xstrdup_printf(CRAY_IAA_FILE, sw_job->apid);
	int rc, ret = SLURM_ERROR;
	char *err_msg = NULL;

	do {
		// Write the file
		rc = alpsc_write_iaa_info(&err_msg, fname,
					  sw_job->num_cookies,
					  (const char **)sw_job->cookies,
					  num_ptags, ptags, alpsc_pe_info);
		ALPSC_CN_DEBUG("alpsc_write_iaa_info");
		if (rc != 1) {
			break;
		}

		// chown the file to the job user
		rc = chown(fname, job->uid, job->gid);
		if (rc == -1) {
			CRAY_ERR("chown(%s, %d, %d) failed: %m",
				 fname, (int)job->uid, (int)job->gid);
			break;
		}

		// Write the environment variable
		rc = env_array_overwrite(&job->env, CRAY_IAA_INFO_FILE_ENV,
					 fname);
		if (rc == 0) {
			CRAY_ERR("Failed to set env variable %s",
				 CRAY_IAA_INFO_FILE_ENV);
			break;
		}
		ret = SLURM_SUCCESS;
	} while (0);

	xfree(fname);
	return ret;
}
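/*
 * Illustrative sketch (not SLURM code): the do { ... } while (0) pattern
 * used above gives a single exit point so that cleanup (xfree(fname) in
 * the real code) runs on every path; a break anywhere inside the block
 * jumps straight to the shared cleanup.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int write_greeting(const char *path)
{
	char *fname = strdup(path);	/* resource needing cleanup */
	FILE *fp = NULL;
	int ret = -1;

	do {
		fp = fopen(fname, "w");
		if (!fp)
			break;		/* error: skip to cleanup */
		if (fputs("hello\n", fp) == EOF)
			break;		/* error: skip to cleanup */
		ret = 0;		/* every step succeeded */
	} while (0);

	if (fp)
		fclose(fp);
	free(fname);			/* runs on success and failure */
	return ret;
}

int main(void)
{
	return write_greeting("/tmp/iaa_demo.txt") ? EXIT_FAILURE
						   : EXIT_SUCCESS;
}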
/*
 * Set an environment variable, with printf-style formatting of the value.
 * The variable is written to the env array if one is supplied, otherwise
 * to the process environment via putenv(). Unlike env_array_overwrite(),
 * this returns 0 on success and non-zero on error.
 */
int setenvf(char ***envp, const char *name, const char *fmt, ...)
{
	char *str = NULL, *value;
	va_list ap;
	int rc;

	value = xmalloc(ENV_BUFSIZE);
	va_start(ap, fmt);
	vsnprintf(value, ENV_BUFSIZE, fmt, ap);
	va_end(ap);

	if (envp && *envp) {
		if (env_array_overwrite(envp, name, value) == 1)
			rc = 0;
		else
			rc = 1;
	} else {
		xstrfmtcat(str, "%s=%s", name, value);
		rc = putenv(str);
	}
	xfree(value);

	return rc;
}
static void _set_env(char ***env_ptr, void *gres_ptr, int node_inx,
		     bitstr_t *usable_gres,
		     bool *already_seen, int *local_inx,
		     bool reset, bool is_job)
{
	char *global_list = NULL, *local_list = NULL, *perc_env = NULL;
	char perc_str[64], *slurm_env_var = NULL;
	uint64_t count_on_dev, gres_per_node = 0, percentage;
	int global_id = -1;

	if (is_job)
		slurm_env_var = "SLURM_JOB_GRES";
	else
		slurm_env_var = "SLURM_STEP_GRES";

	if (*already_seen) {
		global_list = xstrdup(getenvp(*env_ptr, slurm_env_var));
		local_list = xstrdup(getenvp(*env_ptr,
					     "CUDA_VISIBLE_DEVICES"));
		perc_env = xstrdup(getenvp(*env_ptr,
					   "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"));
	}

	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
			    usable_gres, "", local_inx, &gres_per_node,
			    &local_list, &global_list, reset, is_job,
			    &global_id);

	if (perc_env) {
		env_array_overwrite(env_ptr,
				    "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE",
				    perc_env);
		xfree(perc_env);
	} else if (gres_per_node && mps_info) {
		count_on_dev = _get_dev_count(global_id);
		if (count_on_dev > 0) {
			percentage = (gres_per_node * 100) / count_on_dev;
			percentage = MAX(percentage, 1);
		} else
			percentage = 0;
		snprintf(perc_str, sizeof(perc_str), "%"PRIu64, percentage);
		env_array_overwrite(env_ptr,
				    "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE",
				    perc_str);
	} else if (gres_per_node) {
		error("%s: mps_info list is NULL", __func__);
		snprintf(perc_str, sizeof(perc_str), "%"PRIu64, gres_per_node);
		env_array_overwrite(env_ptr,
				    "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE",
				    perc_str);
	}

	if (global_list) {
		env_array_overwrite(env_ptr, slurm_env_var, global_list);
		xfree(global_list);
	}

	if (local_list) {
		/*
		 * CUDA_VISIBLE_DEVICES is relative to the MPS server.
		 * With only one GPU under the control of MPS, the device
		 * number will always be "0".
		 */
		env_array_overwrite(env_ptr, "CUDA_VISIBLE_DEVICES", "0");
		env_array_overwrite(env_ptr, "GPU_DEVICE_ORDINAL", "0");
		xfree(local_list);
		*already_seen = true;
	}
}
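/*
 * Illustrative sketch (not SLURM code): the thread-percentage arithmetic
 * above. A step allocated gres_per_node MPS shares out of the
 * count_on_dev shares configured on the device gets the matching share of
 * the GPU, clamped so a tiny allocation still gets at least 1%.
 */
#include <stdio.h>
#include <inttypes.h>

#define MAX(a, b) (((a) > (b)) ? (a) : (b))

int main(void)
{
	uint64_t count_on_dev = 100;	/* shares configured on the device */
	uint64_t gres_per_node = 25;	/* shares allocated to this step */
	uint64_t percentage;

	if (count_on_dev > 0) {
		percentage = (gres_per_node * 100) / count_on_dev;
		percentage = MAX(percentage, 1);	/* never round to 0% */
	} else {
		percentage = 0;
	}
	/* Prints CUDA_MPS_ACTIVE_THREAD_PERCENTAGE=25 */
	printf("CUDA_MPS_ACTIVE_THREAD_PERCENTAGE=%" PRIu64 "\n", percentage);
	return 0;
}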