/*
 * Ask the slurmstepd serving the given step which uid owns it.
 *
 * IN stepd - location of the step to query (directory, nodename, jobid and
 *            stepid are read from it)
 * RET the value returned by stepd_get_uid(), or (uid_t)-1 if the stepd could
 *     not be reached.
 *
 * Failures are likely to occur if a step exits; this is not a problem, so
 * they are only logged at debug3.
 */
static uid_t _get_job_uid(step_loc_t *stepd)
{
	uid_t uid = -1;
	int fd;
	uint16_t protocol_version;

	fd = stepd_connect(stepd->directory, stepd->nodename,
			   stepd->jobid, stepd->stepid, &protocol_version);
	if (fd < 0) {
		/* It's normal for a step to exit */
		debug3("unable to connect to step %u.%u on %s: %m",
		       stepd->jobid, stepd->stepid, stepd->nodename);
		return -1;
	}

	/*
	 * Talk to the stepd using the version obtained for this very
	 * connection (the out-argument of stepd_connect() above).
	 * stepd->protocol_version is never written by this function, so it
	 * cannot be relied upon to describe the socket we just opened.
	 */
	uid = stepd_get_uid(fd, protocol_version);
	close(fd);

	/* The step may have exited. Not a big concern. */
	if ((int32_t)uid == -1)
		debug3("unable to determine uid of step %u.%u on %s",
		       stepd->jobid, stepd->stepid, stepd->nodename);

	return uid;
}
/*
 * Ask the slurmstepd serving the given step to adopt a process.
 *
 * IN pid   - process to be adopted
 * IN stepd - location of the step that should adopt it; may be NULL
 * RET -1 if stepd is NULL or the stepd could not be reached, otherwise the
 *     return code of stepd_add_extern_pid() (compared against PAM_SUCCESS
 *     for logging purposes).
 *
 * NOTE: no policy for tolerating a failed adoption (for example
 * opts.action_adopt_failure) is applied in this function; presumably the
 * caller is responsible for that - verify against the call sites.
 */
static int _adopt_process(pid_t pid, step_loc_t *stepd)
{
	int fd;
	uint16_t protocol_version;
	int rc;

	if (!stepd)
		return -1;

	debug("_adopt_process: trying to get %u.%u to adopt %d",
	      stepd->jobid, stepd->stepid, pid);

	fd = stepd_connect(stepd->directory, stepd->nodename,
			   stepd->jobid, stepd->stepid, &protocol_version);
	if (fd < 0) {
		/* It's normal for a step to exit */
		debug3("unable to connect to step %u.%u on %s: %m",
		       stepd->jobid, stepd->stepid, stepd->nodename);
		return -1;
	}

	/*
	 * Use the version obtained for this very connection (the
	 * out-argument of stepd_connect() above) rather than
	 * stepd->protocol_version, which this function never sets.
	 */
	rc = stepd_add_extern_pid(fd, protocol_version, pid);
	close(fd);

	if (rc == PAM_SUCCESS)
		info("Process %d adopted into job %u", pid, stepd->jobid);
	else
		info("Process %d adoption FAILED for job %u",
		     pid, stepd->jobid);

	return rc;
}
static void _list_pids_one_step(const char *node_name, uint32_t jobid, uint32_t stepid) { int fd; slurmstepd_task_info_t *task_info; uint32_t *pids; uint32_t count = 0; uint32_t tcount = 0; int i; fd = stepd_connect(NULL, node_name, jobid, stepid); if (fd == -1) { exit_code = 1; if (errno == ENOENT) { fprintf(stderr, "Job step %u.%u does not exist on this node.\n", jobid, stepid); exit_code = 1; } else { perror("Unable to connect to slurmstepd"); } return; } stepd_task_info(fd, &task_info, &tcount); for (i = 0; i < (int)tcount; i++) { if (!task_info[i].exited) { if (stepid == NO_VAL) printf("%-8d %-8u %-6s %-7d %-8d\n", task_info[i].pid, jobid, "batch", task_info[i].id, task_info[i].gtid); else printf("%-8d %-8u %-6u %-7d %-8d\n", task_info[i].pid, jobid, stepid, task_info[i].id, task_info[i].gtid); } } stepd_list_pids(fd, &pids, &count); for (i = 0; i < count; i++) { if (!_in_task_array((pid_t)pids[i], task_info, tcount)) { if (stepid == NO_VAL) printf("%-8d %-8u %-6s %-7s %-8s\n", pids[i], jobid, "batch", "-", "-"); else printf("%-8d %-8u %-6u %-7s %-8s\n", pids[i], jobid, stepid, "-", "-"); } } if (count > 0) xfree(pids); if (tcount > 0) xfree(task_info); close(fd); }
static void _reconfigure(void) { List steps; ListIterator i; slurm_ctl_conf_t *cf; step_loc_t *stepd; bool did_change; _reconfig = 0; slurm_conf_reinit(conf->conffile); _read_config(); /* * Rebuild topology information and refresh slurmd topo infos */ slurm_topo_build_config(); _set_topo_info(); /* * In case the administrator changed the cpu frequency set capabilities * on this node, rebuild the cpu frequency table information */ cpu_freq_init(conf); _print_conf(); /* * Make best effort at changing to new public key */ slurm_cred_ctx_key_update(conf->vctx, conf->pubkey); /* * Reinitialize the groups cache */ cf = slurm_conf_lock(); if (cf->group_info & GROUP_CACHE) init_gids_cache(1); else init_gids_cache(0); slurm_conf_unlock(); /* send reconfig to each stepd so they can refresh their log * file handle */ steps = stepd_available(conf->spooldir, conf->node_name); i = list_iterator_create(steps); while ((stepd = list_next(i))) { int fd; fd = stepd_connect(stepd->directory, stepd->nodename, stepd->jobid, stepd->stepid); if (fd == -1) continue; if (stepd_reconfig(fd) != SLURM_SUCCESS) debug("Reconfig jobid=%u.%u failed: %m", stepd->jobid, stepd->stepid); close(fd); } list_iterator_destroy(i); list_destroy(steps); gres_plugin_reconfig(&did_change); (void) switch_g_reconfig(); container_g_reconfig(); if (did_change) { uint32_t cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size); (void) gres_plugin_node_config_load(cpu_cnt); send_registration_msg(SLURM_SUCCESS, false); } /* reconfigure energy */ acct_gather_energy_g_set_data(ENERGY_DATA_RECONFIG, NULL); /* * XXX: reopen slurmd port? */ }
/*
 * Populate a node registration message from the daemon configuration and the
 * job steps currently found under the spool directory.
 *
 * IN/OUT msg - message to fill in.  msg->startup is read to decide whether
 *              switch node info is built, and msg->energy is allocated here
 *              only if it is not already set.
 *
 * The hardware summary is logged with info() on the first call and with
 * debug3() on later calls.  slurmd_start_time is captured on the first call
 * and reused afterwards.
 */
static void _fill_registration_msg(slurm_node_registration_status_msg_t *msg)
{
	List steps;
	ListIterator i;
	step_loc_t *stepd;
	int n;
	char *arch, *os;
	struct utsname buf;
	static bool first_msg = true;
	static time_t slurmd_start_time = 0;
	Buf gres_info;

	msg->node_name   = xstrdup (conf->node_name);
	msg->cpus        = conf->cpus;
	msg->boards      = conf->boards;
	msg->sockets     = conf->sockets;
	msg->cores       = conf->cores;
	msg->threads     = conf->threads;
	msg->real_memory = conf->real_memory_size;
	msg->tmp_disk    = conf->tmp_disk_space;
	msg->hash_val    = slurm_get_hash_val();
	get_cpu_load(&msg->cpu_load);

	gres_info = init_buf(1024);
	if (gres_plugin_node_config_pack(gres_info) != SLURM_SUCCESS) {
		error("error packing gres configuration");
		/*
		 * The buffer is not handed over to msg on this path, so
		 * release it here instead of leaking it.
		 */
		free_buf(gres_info);
	} else {
		msg->gres_info = gres_info;
	}

	get_up_time(&conf->up_time);
	msg->up_time = conf->up_time;
	if (slurmd_start_time == 0)
		slurmd_start_time = time(NULL);
	msg->slurmd_start_time = slurmd_start_time;

	if (first_msg) {
		first_msg = false;
		info("CPUs=%u Boards=%u Sockets=%u Cores=%u Threads=%u "
		     "Memory=%u TmpDisk=%u Uptime=%u",
		     msg->cpus, msg->boards, msg->sockets, msg->cores,
		     msg->threads, msg->real_memory, msg->tmp_disk,
		     msg->up_time);
	} else {
		debug3("CPUs=%u Boards=%u Sockets=%u Cores=%u Threads=%u "
		       "Memory=%u TmpDisk=%u Uptime=%u",
		       msg->cpus, msg->boards, msg->sockets, msg->cores,
		       msg->threads, msg->real_memory, msg->tmp_disk,
		       msg->up_time);
	}

	/* SLURM_ARCH / SLURM_OS in the environment override uname() */
	uname(&buf);
	if ((arch = getenv("SLURM_ARCH")))
		msg->arch = xstrdup(arch);
	else
		msg->arch = xstrdup(buf.machine);
	if ((os = getenv("SLURM_OS")))
		msg->os = xstrdup(os);
	else
		msg->os = xstrdup(buf.sysname);

	if (msg->startup) {
		if (switch_g_alloc_node_info(&msg->switch_nodeinfo))
			error("switch_g_alloc_node_info: %m");
		if (switch_g_build_node_info(msg->switch_nodeinfo))
			error("switch_g_build_node_info: %m");
	}

	/*
	 * Size the arrays for every step found; job_count is decremented
	 * below for each step that turns out not to be running, so only the
	 * first job_count entries of each array are meaningful afterwards.
	 */
	steps = stepd_available(conf->spooldir, conf->node_name);
	msg->job_count = list_count(steps);
	msg->job_id    = xmalloc(msg->job_count * sizeof(*msg->job_id));
	/* Note: Running batch jobs will have step_id == NO_VAL */
	msg->step_id   = xmalloc(msg->job_count * sizeof(*msg->step_id));

	i = list_iterator_create(steps);
	n = 0;
	while ((stepd = list_next(i))) {
		int fd;

		fd = stepd_connect(stepd->directory, stepd->nodename,
				   stepd->jobid, stepd->stepid);
		if (fd == -1) {
			--(msg->job_count);
			continue;
		}

		if (stepd_state(fd) == SLURMSTEPD_NOT_RUNNING) {
			debug("stale domain socket for stepd %u.%u ",
			      stepd->jobid, stepd->stepid);
			--(msg->job_count);
			close(fd);
			continue;
		}

		close(fd);
		if (stepd->stepid == NO_VAL)
			debug("found apparently running job %u",
			      stepd->jobid);
		else
			debug("found apparently running step %u.%u",
			      stepd->jobid, stepd->stepid);
		msg->job_id[n]  = stepd->jobid;
		msg->step_id[n] = stepd->stepid;
		n++;
	}
	list_iterator_destroy(i);
	list_destroy(steps);

	if (!msg->energy)
		msg->energy = acct_gather_energy_alloc();
	acct_gather_energy_g_get_data(ENERGY_DATA_STRUCT, msg->energy);

	msg->timestamp = time(NULL);

	return;
}
/* * Send the termination signal to all of the unix domain socket files * for a given directory and nodename, and then unlink the files. * Returns SLURM_ERROR if any sockets could not be unlinked. */ int stepd_cleanup_sockets(const char *directory, const char *nodename) { DIR *dp; struct dirent *ent; regex_t re; struct stat stat_buf; int rc = SLURM_SUCCESS; _sockname_regex_init(&re, nodename); /* * Make sure that "directory" exists and is a directory. */ if (stat(directory, &stat_buf) < 0) { error("Domain socket directory %s: %m", directory); goto done; } else if (!S_ISDIR(stat_buf.st_mode)) { error("%s is not a directory", directory); goto done; } if ((dp = opendir(directory)) == NULL) { error("Unable to open directory: %m"); goto done; } while ((ent = readdir(dp)) != NULL) { uint32_t jobid, stepid; if (_sockname_regex(&re, ent->d_name, &jobid, &stepid) == 0) { char *path; int fd; path = NULL; xstrfmtcat(path, "%s/%s", directory, ent->d_name); verbose("Cleaning up stray job step %u.%u", jobid, stepid); /* signal the slurmstepd to terminate its step */ fd = stepd_connect((char *) directory, (char *) nodename, jobid, stepid); if (fd == -1) { debug("Unable to connect to socket %s", path); } else { stepd_signal_container(fd, SIGKILL); close(fd); } /* make sure that the socket has been removed */ if (unlink(path) == -1 && errno != ENOENT) { error("Unable to clean up stray socket %s: %m", path); rc = SLURM_ERROR; } xfree(path); } } closedir(dp); done: regfree(&re); return rc; }