/* Print the process IDs of every step of the given job running on this node */
static void _list_pids_all_steps(const char *node_name, uint32_t jobid)
{
	List steps;
	ListIterator itr;
	step_loc_t *stepd;
	int count = 0;

	steps = stepd_available(NULL, node_name);
	if (!steps || list_count(steps) == 0) {
		fprintf(stderr, "Job %u does not exist on this node.\n",
			jobid);
		if (steps)
			list_destroy(steps);
		exit_code = 1;
		return;
	}

	itr = list_iterator_create(steps);
	while ((stepd = list_next(itr))) {
		if (jobid == stepd->jobid) {
			_list_pids_one_step(stepd->nodename, stepd->jobid,
					    stepd->stepid);
			count++;
		}
	}
	list_iterator_destroy(itr);
	list_destroy(steps);

	/* Steps exist on the node, but none belong to the requested job */
	if (count == 0) {
		fprintf(stderr, "Job %u does not exist on this node.\n",
			jobid);
		exit_code = 1;
	}
}
/* Print the process IDs of every job step running on this node */
static void _list_pids_all_jobs(const char *node_name)
{
	List steps;
	ListIterator itr;
	step_loc_t *stepd;

	steps = stepd_available(NULL, node_name);
	if (!steps || list_count(steps) == 0) {
		fprintf(stderr, "No job steps exist on this node.\n");
		FREE_NULL_LIST(steps);
		exit_code = 1;
		return;
	}

	itr = list_iterator_create(steps);
	while ((stepd = list_next(itr))) {
		_list_pids_one_step(stepd->nodename, stepd->jobid,
				    stepd->stepid);
	}
	list_iterator_destroy(itr);
	FREE_NULL_LIST(steps);
}
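/*
 * Illustrative sketch, not part of the original source: one plausible way a
 * caller (e.g. an scontrol "listpids"-style command) could dispatch to the
 * two helpers above. The function name _sketch_list_pids is hypothetical,
 * and the "jobid[.stepid]" argument format is an assumption; only
 * _list_pids_one_step(), _list_pids_all_steps() and _list_pids_all_jobs()
 * come from this file.
 */
static void _sketch_list_pids(const char *jobid_str, const char *node_name)
{
	uint32_t jobid, stepid;
	char *end = NULL;

	if (jobid_str == NULL) {
		/* No job specified: list every step on the node */
		_list_pids_all_jobs(node_name);
		return;
	}

	/* Accept either "jobid" or "jobid.stepid" */
	jobid = (uint32_t) strtoul(jobid_str, &end, 10);
	if (end && (*end == '.')) {
		stepid = (uint32_t) strtoul(end + 1, NULL, 10);
		_list_pids_one_step(node_name, jobid, stepid);
	} else {
		_list_pids_all_steps(node_name, jobid);
	}
}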
/* Fill in a node registration message with this node's hardware
 * configuration, load and energy readings, and the set of job steps
 * still running locally */
static void _fill_registration_msg(slurm_node_registration_status_msg_t *msg)
{
	List steps;
	ListIterator i;
	step_loc_t *stepd;
	int n;
	char *arch, *os;
	struct utsname buf;
	static bool first_msg = true;
	static time_t slurmd_start_time = 0;
	Buf gres_info;

	msg->node_name   = xstrdup(conf->node_name);
	msg->cpus        = conf->cpus;
	msg->boards      = conf->boards;
	msg->sockets     = conf->sockets;
	msg->cores       = conf->cores;
	msg->threads     = conf->threads;
	msg->real_memory = conf->real_memory_size;
	msg->tmp_disk    = conf->tmp_disk_space;
	msg->hash_val    = slurm_get_hash_val();
	get_cpu_load(&msg->cpu_load);

	gres_info = init_buf(1024);
	if (gres_plugin_node_config_pack(gres_info) != SLURM_SUCCESS)
		error("error packing gres configuration");
	else
		msg->gres_info = gres_info;

	get_up_time(&conf->up_time);
	msg->up_time = conf->up_time;
	if (slurmd_start_time == 0)
		slurmd_start_time = time(NULL);
	msg->slurmd_start_time = slurmd_start_time;

	/* Log the configuration at info level only for the first
	 * registration after startup */
	if (first_msg) {
		first_msg = false;
		info("CPUs=%u Boards=%u Sockets=%u Cores=%u Threads=%u "
		     "Memory=%u TmpDisk=%u Uptime=%u",
		     msg->cpus, msg->boards, msg->sockets, msg->cores,
		     msg->threads, msg->real_memory, msg->tmp_disk,
		     msg->up_time);
	} else {
		debug3("CPUs=%u Boards=%u Sockets=%u Cores=%u Threads=%u "
		       "Memory=%u TmpDisk=%u Uptime=%u",
		       msg->cpus, msg->boards, msg->sockets, msg->cores,
		       msg->threads, msg->real_memory, msg->tmp_disk,
		       msg->up_time);
	}

	uname(&buf);
	if ((arch = getenv("SLURM_ARCH")))
		msg->arch = xstrdup(arch);
	else
		msg->arch = xstrdup(buf.machine);
	if ((os = getenv("SLURM_OS")))
		msg->os = xstrdup(os);
	else
		msg->os = xstrdup(buf.sysname);

	if (msg->startup) {
		if (switch_g_alloc_node_info(&msg->switch_nodeinfo))
			error("switch_g_alloc_node_info: %m");
		if (switch_g_build_node_info(msg->switch_nodeinfo))
			error("switch_g_build_node_info: %m");
	}

	steps = stepd_available(conf->spooldir, conf->node_name);
	msg->job_count = list_count(steps);
	msg->job_id    = xmalloc(msg->job_count * sizeof(*msg->job_id));
	/* Note: Running batch jobs will have step_id == NO_VAL */
	msg->step_id   = xmalloc(msg->job_count * sizeof(*msg->step_id));

	/* Probe each stepd's domain socket; skip sockets whose stepd is
	 * gone or no longer running, decrementing the job count to match */
	i = list_iterator_create(steps);
	n = 0;
	while ((stepd = list_next(i))) {
		int fd;
		fd = stepd_connect(stepd->directory, stepd->nodename,
				   stepd->jobid, stepd->stepid);
		if (fd == -1) {
			--(msg->job_count);
			continue;
		}
		if (stepd_state(fd) == SLURMSTEPD_NOT_RUNNING) {
			debug("stale domain socket for stepd %u.%u ",
			      stepd->jobid, stepd->stepid);
			--(msg->job_count);
			close(fd);
			continue;
		}
		close(fd);
		if (stepd->stepid == NO_VAL)
			debug("found apparently running job %u",
			      stepd->jobid);
		else
			debug("found apparently running step %u.%u",
			      stepd->jobid, stepd->stepid);
		msg->job_id[n]  = stepd->jobid;
		msg->step_id[n] = stepd->stepid;
		n++;
	}
	list_iterator_destroy(i);
	list_destroy(steps);

	if (!msg->energy)
		msg->energy = acct_gather_energy_alloc();
	acct_gather_energy_g_get_data(ENERGY_DATA_STRUCT, msg->energy);

	msg->timestamp = time(NULL);
	return;
}
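/*
 * Illustrative sketch, an assumption rather than the original caller: how a
 * registration message filled in above might be wrapped and sent to the
 * controller. slurm_msg_t_init(), slurm_send_only_controller_msg() and
 * slurm_free_node_registration_status_msg() are existing protocol helpers;
 * the surrounding flow and the _sketch_ name are hypothetical.
 */
static int _sketch_send_registration(uint32_t status, bool startup)
{
	int ret_val = SLURM_SUCCESS;
	slurm_msg_t req;
	slurm_node_registration_status_msg_t *msg =
		xmalloc(sizeof(slurm_node_registration_status_msg_t));

	slurm_msg_t_init(&req);
	msg->startup = (uint16_t) startup;	/* triggers switch info above */
	_fill_registration_msg(msg);
	msg->status = status;

	req.msg_type = MESSAGE_NODE_REGISTRATION_STATUS;
	req.data = msg;

	if (slurm_send_only_controller_msg(&req) < 0) {
		error("Unable to register: %m");
		ret_val = SLURM_ERROR;
	}
	slurm_free_node_registration_status_msg(msg);
	return ret_val;
}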
/* Re-read the configuration and propagate the changes to plugins and to
 * every running slurmstepd; re-register with the controller if the GRES
 * configuration changed */
static void _reconfigure(void)
{
	List steps;
	ListIterator i;
	slurm_ctl_conf_t *cf;
	step_loc_t *stepd;
	bool did_change;

	_reconfig = 0;
	slurm_conf_reinit(conf->conffile);
	_read_config();

	/*
	 * Rebuild topology information and refresh slurmd topo infos
	 */
	slurm_topo_build_config();
	_set_topo_info();

	/*
	 * In case the administrator changed the cpu frequency set
	 * capabilities on this node, rebuild the cpu frequency table
	 * information
	 */
	cpu_freq_init(conf);

	_print_conf();

	/*
	 * Make best effort at changing to new public key
	 */
	slurm_cred_ctx_key_update(conf->vctx, conf->pubkey);

	/*
	 * Reinitialize the groups cache
	 */
	cf = slurm_conf_lock();
	if (cf->group_info & GROUP_CACHE)
		init_gids_cache(1);
	else
		init_gids_cache(0);
	slurm_conf_unlock();

	/* send reconfig to each stepd so they can refresh their log
	 * file handle */
	steps = stepd_available(conf->spooldir, conf->node_name);
	i = list_iterator_create(steps);
	while ((stepd = list_next(i))) {
		int fd;
		fd = stepd_connect(stepd->directory, stepd->nodename,
				   stepd->jobid, stepd->stepid);
		if (fd == -1)
			continue;
		if (stepd_reconfig(fd) != SLURM_SUCCESS)
			debug("Reconfig jobid=%u.%u failed: %m",
			      stepd->jobid, stepd->stepid);
		close(fd);
	}
	list_iterator_destroy(i);
	list_destroy(steps);

	gres_plugin_reconfig(&did_change);
	(void) switch_g_reconfig();
	container_g_reconfig();
	if (did_change) {
		uint32_t cpu_cnt = MAX(conf->conf_cpus,
				       conf->block_map_size);
		(void) gres_plugin_node_config_load(cpu_cnt);
		send_registration_msg(SLURM_SUCCESS, false);
	}

	/* reconfigure energy */
	acct_gather_energy_g_set_data(ENERGY_DATA_RECONFIG, NULL);

	/*
	 * XXX: reopen slurmd port?
	 */
}
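/*
 * Illustrative sketch (an assumption about the surrounding daemon, not code
 * from this file): _reconfigure() clears _reconfig on entry, which suggests
 * it is driven by a flag set from a signal handler rather than being called
 * directly from one, since most of the work above is not async-signal-safe.
 * A minimal shape of that pattern:
 */
static void _sketch_hup_handler(int signum)
{
	if (signum == SIGHUP)
		_reconfig = 1;	/* serviced later from the main loop */
}

/* ... and in the daemon's main service loop, something like:
 *
 *	while (!_shutdown) {
 *		if (_reconfig)
 *			_reconfigure();
 *		... accept and service RPCs ...
 *	}
 */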
/* Parse arguments, etc then get my socket address/port information. Attempt
 * to adopt this process into a job in the following order:
 *	1) If the user has only one job on the node, pick that one
 *	2) Send RPC to source IP of socket. If there is a slurmd at the IP
 *	   address, ask it which job I belong to. On success, pick that one
 *	3) Pick a job semi-randomly (default) or skip the adoption (if
 *	   configured)
 */
PAM_EXTERN int pam_sm_acct_mgmt(pam_handle_t *pamh, int flags
				__attribute__((unused)), int argc,
				const char **argv)
{
	int retval = PAM_IGNORE, rc, slurmrc, bufsize, user_jobs;
	char *user_name;
	List steps = NULL;
	step_loc_t *stepd = NULL;
	struct passwd pwd, *pwd_result;
	char *buf = NULL;

	_init_opts();
	_parse_opts(pamh, argc, argv);
	_log_init(opts.log_level);

	switch (opts.action_generic_failure) {
	case CALLERID_ACTION_DENY:
		rc = PAM_PERM_DENIED;
		break;
	case CALLERID_ACTION_ALLOW:
		rc = PAM_SUCCESS;
		break;
	case CALLERID_ACTION_IGNORE:
		rc = PAM_IGNORE;
		break;
	/* Newer gcc versions warn if enum cases are missing */
	default:
		error("The code is broken!!!!");
	}

	retval = pam_get_item(pamh, PAM_USER, (void *) &user_name);
	if (user_name == NULL || retval != PAM_SUCCESS) {
		pam_syslog(pamh, LOG_ERR, "No username in PAM_USER? Fail!");
		return PAM_SESSION_ERR;
	}

	/* Check for an unsafe config that might lock out root. This is a
	 * very basic check that shouldn't be 100% relied on */
	if (!opts.ignore_root &&
	    (opts.action_unknown == CALLERID_ACTION_DENY ||
	     opts.action_no_jobs != CALLERID_ACTION_ALLOW ||
	     opts.action_adopt_failure != CALLERID_ACTION_ALLOW ||
	     opts.action_generic_failure != CALLERID_ACTION_ALLOW)) {
		/* Let's get verbose */
		info("===============================");
		info("Danger!!!");
		info("A crazy admin set ignore_root=0 and some unsafe actions");
		info("You might lock out root!");
		info("If this is desirable, modify the source code");
		info("Setting ignore_root=1 and continuing");
		opts.ignore_root = 1;
	}

	/* Ignoring root is probably best but the admin can allow it */
	if (!strcmp(user_name, "root")) {
		if (opts.ignore_root) {
			info("Ignoring root user");
			return PAM_IGNORE;
		} else {
			/* This administrator is crazy */
			info("Danger!!! This is a connection attempt by root and ignore_root=0 is set! Hope for the best!");
		}
	}

	/* Calculate buffer size for getpwnam_r */
	bufsize = sysconf(_SC_GETPW_R_SIZE_MAX);
	if (bufsize == -1)
		bufsize = 16384; /* take a large guess */

	buf = xmalloc(bufsize);
	retval = getpwnam_r(user_name, &pwd, buf, bufsize, &pwd_result);
	if (pwd_result == NULL) {
		if (retval == 0) {
			error("getpwnam_r could not locate %s", user_name);
		} else {
			errno = retval;
			error("getpwnam_r: %m");
		}
		xfree(buf);
		return PAM_SESSION_ERR;
	}

	if (_load_cgroup_config() != SLURM_SUCCESS) {
		xfree(buf);	/* don't leak the getpwnam_r buffer */
		return rc;
	}

	/* Check if there are any steps on the node from any user. A failure
	 * here likely means failures everywhere so exit on failure or if no
	 * local jobs exist. */
	steps = stepd_available(NULL, opts.node_name);
	if (!steps) {
		error("Error obtaining local step information.");
		goto cleanup;
	}

	/* Check to see if this user has only one job on the node. If so,
	 * choose that job and adopt this process into it (unless configured
	 * not to) */
	user_jobs = _user_job_count(steps, pwd.pw_uid, &stepd);
	if (user_jobs == 0) {
		if (opts.action_no_jobs == CALLERID_ACTION_DENY) {
			send_user_msg(pamh,
				      "Access denied by " PAM_MODULE_NAME
				      ": you have no active jobs on this node");
			rc = PAM_PERM_DENIED;
		} else {
			debug("uid %u owns no jobs but action_no_jobs=ignore",
			      pwd.pw_uid);
			rc = PAM_IGNORE;
		}
		goto cleanup;
	} else if (user_jobs == 1) {
		if (opts.single_job_skip_rpc) {
			info("Connection by user %s: user has only one job %u",
			     user_name, stepd->jobid);
			slurmrc = _adopt_process(getpid(), stepd);
			/* If adoption into the only job fails, it is time to
			 * exit. Return code is based on the
			 * action_adopt_failure setting */
			if (slurmrc == SLURM_SUCCESS ||
			    (opts.action_adopt_failure ==
			     CALLERID_ACTION_ALLOW))
				rc = PAM_SUCCESS;
			else
				rc = PAM_PERM_DENIED;
			goto cleanup;
		}
	} else {
		debug("uid %u has %d jobs", pwd.pw_uid, user_jobs);
	}

	/* Single job check turned up nothing (or we skipped it). Make RPC
	 * call to slurmd at source IP. If it can tell us the job, the
	 * function calls _adopt_process */
	rc = _try_rpc(&pwd);
	if (rc == PAM_SUCCESS)
		goto cleanup;

	/* The source of the connection either didn't reply or couldn't
	 * determine the job ID at the source. Proceed to action_unknown */
	rc = _action_unknown(pamh, &pwd, steps);

cleanup:
	FREE_NULL_LIST(steps);
	xfree(buf);
	xfree(slurm_cgroup_conf);
	xfree(opts.node_name);
	return rc;
}
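/*
 * Usage sketch (an assumption about deployment, not code from this file):
 * this account-management hook is typically enabled from the PAM account
 * stack of the login service, e.g. a line like the following in
 * /etc/pam.d/sshd:
 *
 *	account    required    pam_slurm_adopt.so
 *
 * The exact module option spellings are configuration assumptions, but the
 * behaviors they would tune correspond to the opts.* fields consulted in
 * pam_sm_acct_mgmt() above: action_no_jobs, action_unknown,
 * action_adopt_failure, action_generic_failure, ignore_root and
 * single_job_skip_rpc.
 */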