/*
 * main() for slurmstepd - the per-step daemon spawned by slurmd.
 *
 * Protocol with the parent slurmd: job parameters arrive over STDIN_FILENO,
 * and a single success/failure byte is reported back over STDOUT_FILENO
 * before both fds are redirected to /dev/null (which slurmd pre-opened on
 * STDERR_FILENO for us).
 *
 * Returns the step's exit code (0 on success, SLURM_FAILURE on setup error).
 */
int main (int argc, char *argv[])
{
	slurm_addr_t *cli;	/* address of the client that launched the step */
	slurm_addr_t *self;	/* our own listening address */
	slurm_msg_t *msg;	/* launch request received from slurmd */
	slurmd_job_t *job;	/* step state record built from the launch msg */
	int ngids;		/* count of supplementary group IDs */
	gid_t *gids;		/* supplementary group IDs for the job user */
	int rc = 0;

	if (process_cmdline (argc, argv) < 0)
		fatal ("Error in slurmstepd command line");

	/* Block signals that must only be handled by the message thread. */
	xsignal_block(slurmstepd_blocked_signals);

	conf = xmalloc(sizeof(*conf));
	/* NOTE(review): storing addresses of main()'s parameters - valid only
	 * because main() never returns before conf is freed/abandoned. */
	conf->argv = &argv;
	conf->argc = &argc;
	init_setproctitle(argc, argv);
	if (slurm_select_init(1) != SLURM_SUCCESS )
		fatal( "failed to initialize node selection plugin" );

	/* Receive job parameters from the slurmd */
	_init_from_slurmd(STDIN_FILENO, argv, &cli, &self, &msg,
			  &ngids, &gids);

	/* Fancy way of closing stdin that keeps STDIN_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDIN_FILENO);

	/* Create the slurmd_job_t, mostly from info in a
	 * launch_tasks_request_msg_t or a batch_job_launch_msg_t */
	if (!(job = _step_setup(cli, self, msg))) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}
	job->ngids = ngids;
	job->gids = gids;

	/* fork handlers cause mutexes on some global data structures
	 * to be re-initialized after the fork. */
	list_install_fork_handlers();
	slurm_conf_install_fork_handlers();

	/* sets job->msg_handle and job->msgid */
	if (msg_thr_create(job) == SLURM_ERROR) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}

	/* Message thread is up - tell slurmd the step daemon is ready. */
	_send_ok_to_slurmd(STDOUT_FILENO);

	/* Fancy way of closing stdout that keeps STDOUT_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDOUT_FILENO);

	/* This does most of the stdio setup, then launches all the tasks,
	 * and blocks until the step is complete */
	rc = job_manager(job);

	/* signal the message thread to shutdown, and wait for it */
	eio_signal_shutdown(job->msg_handle);
	pthread_join(job->msgid, NULL);

	if (job->batch)
		batch_finish(job, rc); /* sends batch complete message */

ending:
	/* Teardown is only performed when hunting leaks; in production the
	 * process exits immediately and the kernel reclaims everything. */
#ifdef MEMORY_LEAK_DEBUG
	_step_cleanup(job, msg, rc);

	xfree(cli);
	xfree(self);

	xfree(conf->hostname);
	xfree(conf->block_map);
	xfree(conf->block_map_inv);
	xfree(conf->spooldir);
	xfree(conf->node_name);
	xfree(conf->node_topo_addr);
	xfree(conf->node_topo_pattern);
	xfree(conf->logfile);
	xfree(conf);
#endif
	info("done with job");
	return rc;
}
/*
 * main() for slurmd - the per-node compute daemon.
 *
 * Start-up order matters here: fds are closed before anything opens files,
 * privileges/identity are verified before daemonizing, daemon() runs before
 * create_pidfile() so the pidfile write-lock survives, and plugins are
 * initialized before the message engine starts accepting RPCs.
 *
 * Returns 0 on clean shutdown, SLURM_FAILURE if saved credential state
 * cannot be restored.
 */
int main (int argc, char *argv[])
{
	int i, pidfd;
	/* Signals ignored for the life of the daemon (zero-terminated list). */
	int blocked_signals[] = {SIGPIPE, 0};
	int cc;
	char *oom_value;
	uint32_t slurmd_uid = 0;	/* uid slurm.conf says we must run as */
	uint32_t curr_uid = 0;		/* uid we are actually running as */
	char time_stamp[256];
	log_options_t lopts = LOG_OPTS_INITIALIZER;

	/* NOTE: logfile is NULL at this point */
	log_init(argv[0], lopts, LOG_DAEMON, NULL);

	/*
	 * Make sure we have no extra open files which
	 * would be propagated to spawned tasks.
	 */
	cc = sysconf(_SC_OPEN_MAX);
	for (i = 3; i < cc; i++)
		close(i);

	/*
	 * Drop supplementary groups.
	 */
	if (geteuid() == 0) {
		if (setgroups(0, NULL) != 0) {
			fatal("Failed to drop supplementary groups, "
			      "setgroups: %m");
		}
	} else {
		debug("Not running as root. Can't drop supplementary groups");
	}

	/*
	 * Create and set default values for the slurmd global
	 * config variable "conf"
	 */
	conf = xmalloc(sizeof(slurmd_conf_t));
	_init_conf();
	conf->argv = &argv;
	conf->argc = &argc;

	if (_slurmd_init() < 0) {
		error( "slurmd initialization failed" );
		fflush( NULL );
		exit(1);
	}

	/* Refuse to run as any user other than the configured SlurmdUser. */
	slurmd_uid = slurm_get_slurmd_user_id();
	curr_uid = getuid();
	if (curr_uid != slurmd_uid) {
		struct passwd *pw = NULL;
		char *slurmd_user = NULL;
		char *curr_user = NULL;

		/* since when you do a getpwuid you get a pointer to a
		 * structure you have to do a xstrdup on the first
		 * call or your information will just get over
		 * written.  This is a memory leak, but a fatal is
		 * called right after so it isn't that big of a deal. */
		if ((pw=getpwuid(slurmd_uid)))
			slurmd_user = xstrdup(pw->pw_name);
		if ((pw=getpwuid(curr_uid)))
			curr_user = pw->pw_name;

		fatal("You are running slurmd as something "
		      "other than user %s(%d).  If you want to "
		      "run as this user add SlurmdUser=%s "
		      "to the slurm.conf file.",
		      slurmd_user, slurmd_uid, curr_user);
	}
	init_setproctitle(argc, argv);

	xsignal(SIGTERM, &_term_handler);
	xsignal(SIGINT,  &_term_handler);
	xsignal(SIGHUP,  &_hup_handler );
	xsignal_block(blocked_signals);

	debug3("slurmd initialization successful");

	/*
	 * Become a daemon if desired.
	 * Do not chdir("/") or close all fd's
	 */
	if (conf->daemonize) {
		if (daemon(1,1) == -1)
			error("Couldn't daemonize slurmd: %m");
	}
	test_core_limit();
	info("slurmd version %s started", SLURM_VERSION_STRING);
	debug3("finished daemonize");

	/* Allow the environment to tune how eagerly the OOM killer
	 * targets slurmd itself. */
	if ((oom_value = getenv("SLURMD_OOM_ADJ"))) {
		i = atoi(oom_value);
		debug("Setting slurmd oom_adj to %d", i);
		set_oom_adj(i);
	}

	/* Only one slurmd may run per node; terminate any predecessor. */
	_kill_old_slurmd();

	if (conf->mlock_pages) {
		/*
		 * Call mlockall() if available to ensure slurmd
		 * doesn't get swapped out
		 */
#ifdef _POSIX_MEMLOCK
		if (mlockall (MCL_FUTURE | MCL_CURRENT) < 0)
			error ("failed to mlock() slurmd pages: %m");
#else
		error ("mlockall() system call does not appear to be "
		       "available");
#endif /* _POSIX_MEMLOCK */
	}

	/*
	 * Restore any saved revoked credential information
	 */
	if (!conf->cleanstart && (_restore_cred_state(conf->vctx) < 0))
		return SLURM_FAILURE;

	if (jobacct_gather_init() != SLURM_SUCCESS)
		fatal("Unable to initialize jobacct_gather");
	if (job_container_init() < 0)
		fatal("Unable to initialize job_container plugin.");
	if (container_g_restore(conf->spooldir, !conf->cleanstart))
		error("Unable to restore job_container state.");
	if (switch_g_node_init() < 0)
		fatal("Unable to initialize interconnect.");
	if (conf->cleanstart && switch_g_clear_node_state())
		fatal("Unable to clear interconnect state.");
	switch_g_slurmd_init();

	_create_msg_socket();

	conf->pid = getpid();
	/* This has to happen after daemon(), which closes all fd's,
	 * so we keep the write lock of the pidfile. */
	pidfd = create_pidfile(conf->pidfile, 0);

	rfc2822_timestamp(time_stamp, sizeof(time_stamp));
	info("%s started on %s", slurm_prog_name, time_stamp);

	/* Fork handlers re-initialize mutexes on global structures so
	 * children don't inherit locked locks. */
	_install_fork_handlers();
	list_install_fork_handlers();
	slurm_conf_install_fork_handlers();

	/*
	 * Initialize any plugins
	 */
	if (slurmd_plugstack_init())
		fatal("failed to initialize slurmd_plugstack");

	_spawn_registration_engine();

	/* Main RPC loop - blocks until a shutdown signal is received. */
	_msg_engine();

	/*
	 * Close fd here, otherwise we'll deadlock since create_pidfile()
	 * flocks the pidfile.
	 */
	if (pidfd >= 0)			/* valid pidfd, non-error */
		(void) close(pidfd);	/* Ignore errors */
	if (unlink(conf->pidfile) < 0)
		error("Unable to remove pidfile `%s': %m", conf->pidfile);

	_wait_for_all_threads(120);
	_slurmd_fini();
	_destroy_conf();
	slurm_crypto_fini();	/* must be after _destroy_conf() */

	info("Slurmd shutdown completing");
	log_fini();
	return 0;
}
/*
 * main() for slurmstepd (newer generation, using stepd_step_rec_t).
 *
 * Differs from the older variant in that the startup handshake with slurmd
 * is a full round-trip (_send_ok_to_slurmd followed by _got_ack_from_slurmd)
 * before stdin/stdout are redirected, and the step daemon may optionally be
 * locked in memory so it survives a SLURM upgrade without risking SIGBUS
 * from paged-out, replaced executable text.
 *
 * Returns whatever stepd_cleanup() reports for the step.
 */
int main (int argc, char **argv)
{
	slurm_addr_t *cli;	/* address of the client that launched the step */
	slurm_addr_t *self;	/* our own listening address */
	slurm_msg_t *msg;	/* launch request received from slurmd */
	stepd_step_rec_t *job;	/* step state record built from the launch msg */
	int ngids;		/* count of supplementary group IDs */
	gid_t *gids;		/* supplementary group IDs for the job user */
	int rc = 0;
	char *launch_params;

	if (_process_cmdline (argc, argv) < 0)
		fatal ("Error in slurmstepd command line");

	/* Block signals that must only be handled by the message thread. */
	xsignal_block(slurmstepd_blocked_signals);

	conf = xmalloc(sizeof(*conf));
	/* NOTE(review): storing addresses of main()'s parameters - valid only
	 * because main() never returns before the process exits. */
	conf->argv = &argv;
	conf->argc = &argc;
	init_setproctitle(argc, argv);
	if (slurm_select_init(1) != SLURM_SUCCESS )
		fatal( "failed to initialize node selection plugin" );
	if (slurm_auth_init(NULL) != SLURM_SUCCESS)
		fatal( "failed to initialize authentication plugin" );

	/* Receive job parameters from the slurmd */
	_init_from_slurmd(STDIN_FILENO, argv, &cli, &self, &msg,
			  &ngids, &gids);

	/* Create the stepd_step_rec_t, mostly from info in a
	 * launch_tasks_request_msg_t or a batch_job_launch_msg_t */
	if (!(job = _step_setup(cli, self, msg))) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}
	job->ngids = ngids;
	job->gids = gids;

	/* fork handlers cause mutexes on some global data structures
	 * to be re-initialized after the fork. */
	list_install_fork_handlers();
	slurm_conf_install_fork_handlers();

	/* sets job->msg_handle and job->msgid */
	if (msg_thr_create(job) == SLURM_ERROR) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}

	/* Full handshake: report success, then wait for slurmd's ack before
	 * giving up the stdio channels. */
	_send_ok_to_slurmd(STDOUT_FILENO);
	_got_ack_from_slurmd(STDIN_FILENO);

	/* Fancy way of closing stdin that keeps STDIN_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDIN_FILENO);

	/* Fancy way of closing stdout that keeps STDOUT_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDOUT_FILENO);

	/* slurmstepd is the only daemon that should survive upgrade. If it
	 * had been swapped out before upgrade happened it could easily lead
	 * to SIGBUS at any time after upgrade. Avoid that by locking it
	 * in-memory. */
	launch_params = slurm_get_launch_params();
	if (launch_params && strstr(launch_params, "slurmstepd_memlock")) {
#ifdef _POSIX_MEMLOCK
		int flags = MCL_CURRENT;
		/* "..._memlock_all" additionally pins future mappings. */
		if (strstr(launch_params, "slurmstepd_memlock_all"))
			flags |= MCL_FUTURE;
		if (mlockall(flags) < 0)
			info("failed to mlock() slurmstepd pages: %m");
		else
			debug("slurmstepd locked in memory");
#else
		info("mlockall() system call does not appear to be available");
#endif
	}
	xfree(launch_params);

	/* This does most of the stdio setup, then launches all the tasks,
	 * and blocks until the step is complete */
	rc = job_manager(job);
	return stepd_cleanup(msg, job, cli, self, rc, 0);
ending:
	/* Setup failed: cleanup with the "only_mem" flag set (job may be
	 * partially constructed). */
	return stepd_cleanup(msg, job, cli, self, rc, 1);
}