Beispiel #1
0
extern int jobacct_gather_endpoll(void)
{
	int retval = SLURM_SUCCESS;

	if (jobacct_gather_init() < 0)
		return SLURM_ERROR;

	jobacct_shutdown = true;
	slurm_mutex_lock(&task_list_lock);
	FREE_NULL_LIST(task_list);

	retval = (*(ops.endpoll))();

	slurm_mutex_unlock(&task_list_lock);

	return retval;
}
Beispiel #2
0
extern int jobacct_gather_add_task(pid_t pid, jobacct_id_t *jobacct_id,
				   int poll)
{
	struct jobacctinfo *jobacct;

	if (jobacct_gather_init() < 0)
		return SLURM_ERROR;

	if (!plugin_polling)
		return SLURM_SUCCESS;

	if (_jobacct_shutdown_test())
		return SLURM_ERROR;

	jobacct = jobacctinfo_create(jobacct_id);

	slurm_mutex_lock(&task_list_lock);
	if (pid <= 0) {
		error("invalid pid given (%d) for task acct", pid);
		goto error;
	} else if (!task_list) {
		error("no task list created!");
		goto error;
	}

	jobacct->pid = pid;
	memcpy(&jobacct->id, jobacct_id, sizeof(jobacct_id_t));
	jobacct->min_cpu = 0;
	debug2("adding task %u pid %d on node %u to jobacct",
	       jobacct_id->taskid, pid, jobacct_id->nodeid);
	list_push(task_list, jobacct);
	slurm_mutex_unlock(&task_list_lock);

	(*(ops.add_task))(pid, jobacct_id);

	if (poll == 1)
		_poll_data(1);

	return SLURM_SUCCESS;
error:
	slurm_mutex_unlock(&task_list_lock);
	jobacctinfo_destroy(jobacct);
	return SLURM_ERROR;
}
Beispiel #3
0
extern int jobacct_gather_startpoll(uint16_t frequency)
{
	int retval = SLURM_SUCCESS;
	pthread_attr_t attr;
	pthread_t _watch_tasks_thread_id;

	if (!plugin_polling)
		return SLURM_SUCCESS;

	if (jobacct_gather_init() < 0)
		return SLURM_ERROR;

	if (!jobacct_shutdown) {
		error("jobacct_gather_startpoll: poll already started!");
		return retval;
	}

	jobacct_shutdown = false;

	freq = frequency;

	task_list = list_create(jobacctinfo_destroy);
	if (frequency == 0) {	/* don't want dynamic monitoring? */
		debug2("jobacct_gather dynamic logging disabled");
		return retval;
	}

	/* create polling thread */
	slurm_attr_init(&attr);
	if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED))
		error("pthread_attr_setdetachstate error %m");

	if  (pthread_create(&_watch_tasks_thread_id, &attr,
			    &_watch_tasks, NULL)) {
		debug("jobacct_gather failed to create _watch_tasks "
		      "thread: %m");
		frequency = 0;
	} else
		debug3("jobacct_gather dynamic logging enabled");
	slurm_attr_destroy(&attr);

	return retval;
}
Beispiel #4
0
extern int jobacct_gather_startpoll(uint16_t frequency)
{
	int retval = SLURM_SUCCESS;
	pthread_attr_t attr;

	if (!plugin_polling)
		return SLURM_SUCCESS;

	if (jobacct_gather_init() < 0)
		return SLURM_ERROR;

	if (!_jobacct_shutdown_test()) {
		error("jobacct_gather_startpoll: poll already started!");
		return retval;
	}
	slurm_mutex_lock(&jobacct_shutdown_mutex);
	jobacct_shutdown = false;
	slurm_mutex_unlock(&jobacct_shutdown_mutex);

	freq = frequency;

	task_list = list_create(jobacctinfo_destroy);
	if (frequency == 0) {	/* don't want dynamic monitoring? */
		debug2("jobacct_gather dynamic logging disabled");
		return retval;
	}

	/* create polling thread */
	slurm_attr_init(&attr);
	if  (pthread_create(&watch_tasks_thread_id, &attr,
			    &_watch_tasks, NULL)) {
		debug("jobacct_gather failed to create _watch_tasks "
		      "thread: %m");
	} else
		debug3("jobacct_gather dynamic logging enabled");
	slurm_attr_destroy(&attr);

	return retval;
}
Beispiel #5
0
extern int jobacctinfo_unpack(jobacctinfo_t **jobacct,
			      uint16_t rpc_version, uint16_t protocol_type,
			      Buf buffer, bool alloc)
{
	uint32_t uint32_tmp;
	uint8_t  uint8_tmp;

	if (jobacct_gather_init() < 0)
		return SLURM_ERROR;

	if (rpc_version >= SLURM_MIN_PROTOCOL_VERSION) {
		safe_unpack8(&uint8_tmp, buffer);
		if (uint8_tmp == (uint8_t) 0)
			return SLURM_SUCCESS;
		if (alloc)
			*jobacct = xmalloc(sizeof(struct jobacctinfo));
		safe_unpack32(&uint32_tmp, buffer);
		(*jobacct)->user_cpu_sec = uint32_tmp;
		safe_unpack32(&uint32_tmp, buffer);
		(*jobacct)->user_cpu_usec = uint32_tmp;
		safe_unpack32(&uint32_tmp, buffer);
		(*jobacct)->sys_cpu_sec = uint32_tmp;
		safe_unpack32(&uint32_tmp, buffer);
		(*jobacct)->sys_cpu_usec = uint32_tmp;
		safe_unpack64(&(*jobacct)->max_vsize, buffer);
		safe_unpack64(&(*jobacct)->tot_vsize, buffer);
		safe_unpack64(&(*jobacct)->max_rss, buffer);
		safe_unpack64(&(*jobacct)->tot_rss, buffer);
		safe_unpack64(&(*jobacct)->max_pages, buffer);
		safe_unpack64(&(*jobacct)->tot_pages, buffer);
		safe_unpack32(&(*jobacct)->min_cpu, buffer);
		safe_unpackdouble(&(*jobacct)->tot_cpu, buffer);
		safe_unpack32(&(*jobacct)->act_cpufreq, buffer);
		safe_unpack64(&(*jobacct)->energy.consumed_energy, buffer);

		safe_unpackdouble(&(*jobacct)->max_disk_read, buffer);
		safe_unpackdouble(&(*jobacct)->tot_disk_read, buffer);
		safe_unpackdouble(&(*jobacct)->max_disk_write, buffer);
		safe_unpackdouble(&(*jobacct)->tot_disk_write, buffer);

		if (_unpack_jobacct_id(&(*jobacct)->max_vsize_id, rpc_version,
			buffer) != SLURM_SUCCESS)
			goto unpack_error;
		if (_unpack_jobacct_id(&(*jobacct)->max_rss_id, rpc_version,
			buffer) != SLURM_SUCCESS)
			goto unpack_error;
		if (_unpack_jobacct_id(&(*jobacct)->max_pages_id, rpc_version,
			buffer) != SLURM_SUCCESS)
			goto unpack_error;
		if (_unpack_jobacct_id(&(*jobacct)->min_cpu_id, rpc_version,
			buffer) != SLURM_SUCCESS)
			goto unpack_error;
		if (_unpack_jobacct_id(&(*jobacct)->max_disk_read_id,
			rpc_version, buffer) != SLURM_SUCCESS)
			goto unpack_error;
		if (_unpack_jobacct_id(&(*jobacct)->max_disk_write_id,
			rpc_version, buffer) != SLURM_SUCCESS)
			goto unpack_error;
	} else {
		info("jobacctinfo_unpack version %u not supported",
		     rpc_version);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;

unpack_error:
	debug2("jobacctinfo_unpack: unpack_error: size_buf(buffer) %u",
	       size_buf(buffer));
	if (alloc)
		xfree(*jobacct);
       	return SLURM_ERROR;
}
Beispiel #6
0
int
main (int argc, char *argv[])
{
	int i, pidfd;
	int blocked_signals[] = {SIGPIPE, 0};
	int cc;
	char *oom_value;
	uint32_t slurmd_uid = 0;
	uint32_t curr_uid = 0;
	char time_stamp[256];
	log_options_t lopts = LOG_OPTS_INITIALIZER;

	/* NOTE: logfile is NULL at this point */
	log_init(argv[0], lopts, LOG_DAEMON, NULL);

	/*
	 * Make sure we have no extra open files which
	 * would be propagated to spawned tasks.
	 */
	cc = sysconf(_SC_OPEN_MAX);
	for (i = 3; i < cc; i++)
		close(i);

	/*
	 * Drop supplementary groups.
	 */
	if (geteuid() == 0) {
		if (setgroups(0, NULL) != 0) {
			fatal("Failed to drop supplementary groups, "
			      "setgroups: %m");
		}
	} else {
		debug("Not running as root. Can't drop supplementary groups");
	}

	/*
	 * Create and set default values for the slurmd global
	 * config variable "conf"
	 */
	conf = xmalloc(sizeof(slurmd_conf_t));
	_init_conf();
	conf->argv = &argv;
	conf->argc = &argc;

	if (_slurmd_init() < 0) {
		error( "slurmd initialization failed" );
		fflush( NULL );
		exit(1);
	}

	slurmd_uid = slurm_get_slurmd_user_id();
	curr_uid = getuid();
	if (curr_uid != slurmd_uid) {
		struct passwd *pw = NULL;
		char *slurmd_user = NULL;
		char *curr_user = NULL;

		/* since when you do a getpwuid you get a pointer to a
		 * structure you have to do a xstrdup on the first
		 * call or your information will just get over
		 * written.  This is a memory leak, but a fatal is
		 * called right after so it isn't that big of a deal.
		 */
		if ((pw=getpwuid(slurmd_uid)))
			slurmd_user = xstrdup(pw->pw_name);
		if ((pw=getpwuid(curr_uid)))
			curr_user = pw->pw_name;

		fatal("You are running slurmd as something "
		      "other than user %s(%d).  If you want to "
		      "run as this user add SlurmdUser=%s "
		      "to the slurm.conf file.",
		      slurmd_user, slurmd_uid, curr_user);
	}
	init_setproctitle(argc, argv);

	xsignal(SIGTERM, &_term_handler);
	xsignal(SIGINT,  &_term_handler);
	xsignal(SIGHUP,  &_hup_handler );
	xsignal_block(blocked_signals);

	debug3("slurmd initialization successful");

	/*
	 * Become a daemon if desired.
	 * Do not chdir("/") or close all fd's
	 */
	if (conf->daemonize) {
		if (daemon(1,1) == -1)
			error("Couldn't daemonize slurmd: %m");
	}
	test_core_limit();
	info("slurmd version %s started", SLURM_VERSION_STRING);
	debug3("finished daemonize");

	if ((oom_value = getenv("SLURMD_OOM_ADJ"))) {
		i = atoi(oom_value);
		debug("Setting slurmd oom_adj to %d", i);
		set_oom_adj(i);
	}

	_kill_old_slurmd();

	if (conf->mlock_pages) {
		/*
		 * Call mlockall() if available to ensure slurmd
		 *  doesn't get swapped out
		 */
#ifdef _POSIX_MEMLOCK
		if (mlockall (MCL_FUTURE | MCL_CURRENT) < 0)
			error ("failed to mlock() slurmd pages: %m");
#else
		error ("mlockall() system call does not appear to be available");
#endif /* _POSIX_MEMLOCK */
	}


	/*
	 * Restore any saved revoked credential information
	 */
	if (!conf->cleanstart && (_restore_cred_state(conf->vctx) < 0))
		return SLURM_FAILURE;

	if (jobacct_gather_init() != SLURM_SUCCESS)
		fatal("Unable to initialize jobacct_gather");
	if (job_container_init() < 0)
		fatal("Unable to initialize job_container plugin.");
	if (container_g_restore(conf->spooldir, !conf->cleanstart))
		error("Unable to restore job_container state.");
	if (switch_g_node_init() < 0)
		fatal("Unable to initialize interconnect.");
	if (conf->cleanstart && switch_g_clear_node_state())
		fatal("Unable to clear interconnect state.");
	switch_g_slurmd_init();

	_create_msg_socket();

	conf->pid = getpid();
	/* This has to happen after daemon(), which closes all fd's,
	   so we keep the write lock of the pidfile.
	*/
	pidfd = create_pidfile(conf->pidfile, 0);

	rfc2822_timestamp(time_stamp, sizeof(time_stamp));
	info("%s started on %s", slurm_prog_name, time_stamp);

	_install_fork_handlers();
	list_install_fork_handlers();
	slurm_conf_install_fork_handlers();

	/*
	 * Initialize any plugins
	 */
	if (slurmd_plugstack_init())
		fatal("failed to initialize slurmd_plugstack");

	_spawn_registration_engine();
	_msg_engine();

	/*
	 * Close fd here, otherwise we'll deadlock since create_pidfile()
	 * flocks the pidfile.
	 */
	if (pidfd >= 0)			/* valid pidfd, non-error */
		(void) close(pidfd);	/* Ignore errors */
	if (unlink(conf->pidfile) < 0)
		error("Unable to remove pidfile `%s': %m", conf->pidfile);

	_wait_for_all_threads(120);
	_slurmd_fini();
	_destroy_conf();
	slurm_crypto_fini();	/* must be after _destroy_conf() */

	info("Slurmd shutdown completing");
	log_fini();
       	return 0;
}
Beispiel #7
0
extern int jobacctinfo_unpack(jobacctinfo_t **jobacct,
			      uint16_t rpc_version, uint16_t protocol_type,
			      Buf buffer, bool alloc)
{
	uint32_t uint32_tmp;
	uint8_t  uint8_tmp;
	bool no_pack;

	jobacct_gather_init();
	no_pack = (!plugin_polling && (protocol_type != PROTOCOL_TYPE_DBD));

	/* The function can take calls from both DBD and from regular
	 * SLURM functions.  We choose to standardize on using the
	 * SLURM_PROTOCOL_VERSION here so if PROTOCOL_TYPE_DBD comes
	 * in we need to translate the DBD rpc_version to use the
	 * SLURM protocol_version.
	 *
	 * If this function ever changes make sure the
	 * slurmdbd_translate_rpc function has been updated with the
	 * new protocol version.
	 */
	if (protocol_type == PROTOCOL_TYPE_DBD)
		rpc_version = slurmdbd_translate_rpc(rpc_version);

	if (rpc_version >= SLURM_14_03_PROTOCOL_VERSION) {
		safe_unpack8(&uint8_tmp, buffer);
		if (uint8_tmp == (uint8_t) 0)
			return SLURM_SUCCESS;
		if (alloc)
			*jobacct = xmalloc(sizeof(struct jobacctinfo));
		safe_unpack32(&uint32_tmp, buffer);
		(*jobacct)->user_cpu_sec = uint32_tmp;
		safe_unpack32(&uint32_tmp, buffer);
		(*jobacct)->user_cpu_usec = uint32_tmp;
		safe_unpack32(&uint32_tmp, buffer);
		(*jobacct)->sys_cpu_sec = uint32_tmp;
		safe_unpack32(&uint32_tmp, buffer);
		(*jobacct)->sys_cpu_usec = uint32_tmp;
		safe_unpack64(&(*jobacct)->max_vsize, buffer);
		safe_unpack64(&(*jobacct)->tot_vsize, buffer);
		safe_unpack64(&(*jobacct)->max_rss, buffer);
		safe_unpack64(&(*jobacct)->tot_rss, buffer);
		safe_unpack64(&(*jobacct)->max_pages, buffer);
		safe_unpack64(&(*jobacct)->tot_pages, buffer);
		safe_unpack32(&(*jobacct)->min_cpu, buffer);
		safe_unpack32(&(*jobacct)->tot_cpu, buffer);
		safe_unpack32(&(*jobacct)->act_cpufreq, buffer);
		safe_unpack32(&(*jobacct)->energy.consumed_energy, buffer);

		safe_unpackdouble(&(*jobacct)->max_disk_read, buffer);
		safe_unpackdouble(&(*jobacct)->tot_disk_read, buffer);
		safe_unpackdouble(&(*jobacct)->max_disk_write, buffer);
		safe_unpackdouble(&(*jobacct)->tot_disk_write, buffer);

		if (_unpack_jobacct_id(&(*jobacct)->max_vsize_id, rpc_version,
			buffer) != SLURM_SUCCESS)
			goto unpack_error;
		if (_unpack_jobacct_id(&(*jobacct)->max_rss_id, rpc_version,
			buffer) != SLURM_SUCCESS)
			goto unpack_error;
		if (_unpack_jobacct_id(&(*jobacct)->max_pages_id, rpc_version,
			buffer) != SLURM_SUCCESS)
			goto unpack_error;
		if (_unpack_jobacct_id(&(*jobacct)->min_cpu_id, rpc_version,
			buffer) != SLURM_SUCCESS)
			goto unpack_error;
		if (_unpack_jobacct_id(&(*jobacct)->max_disk_read_id,
			rpc_version, buffer) != SLURM_SUCCESS)
			goto unpack_error;
		if (_unpack_jobacct_id(&(*jobacct)->max_disk_write_id,
			rpc_version, buffer) != SLURM_SUCCESS)
			goto unpack_error;
	} else if (rpc_version >= SLURM_2_6_PROTOCOL_VERSION) {
		if (no_pack)
			return SLURM_SUCCESS;
		if (alloc)
			*jobacct = xmalloc(sizeof(struct jobacctinfo));
		safe_unpack32(&uint32_tmp, buffer);
		(*jobacct)->user_cpu_sec = uint32_tmp;
		safe_unpack32(&uint32_tmp, buffer);
		(*jobacct)->user_cpu_usec = uint32_tmp;
		safe_unpack32(&uint32_tmp, buffer);
		(*jobacct)->sys_cpu_sec = uint32_tmp;
		safe_unpack32(&uint32_tmp, buffer);
		(*jobacct)->sys_cpu_usec = uint32_tmp;
		safe_unpack32((uint32_t *)&(*jobacct)->max_vsize, buffer);
		safe_unpack32((uint32_t *)&(*jobacct)->tot_vsize, buffer);
		safe_unpack32((uint32_t *)&(*jobacct)->max_rss, buffer);
		safe_unpack32((uint32_t *)&(*jobacct)->tot_rss, buffer);
		safe_unpack32((uint32_t *)&(*jobacct)->max_pages, buffer);
		safe_unpack32((uint32_t *)&(*jobacct)->tot_pages, buffer);
		safe_unpack32(&(*jobacct)->min_cpu, buffer);
		safe_unpack32(&(*jobacct)->tot_cpu, buffer);
		safe_unpack32(&(*jobacct)->act_cpufreq, buffer);
		safe_unpack32(&(*jobacct)->energy.consumed_energy, buffer);

		safe_unpackdouble(&(*jobacct)->max_disk_read, buffer);
		safe_unpackdouble(&(*jobacct)->tot_disk_read, buffer);
		safe_unpackdouble(&(*jobacct)->max_disk_write, buffer);
		safe_unpackdouble(&(*jobacct)->tot_disk_write, buffer);

		if (_unpack_jobacct_id(&(*jobacct)->max_vsize_id, rpc_version,
			buffer) != SLURM_SUCCESS)
			goto unpack_error;
		if (_unpack_jobacct_id(&(*jobacct)->max_rss_id, rpc_version,
			buffer) != SLURM_SUCCESS)
			goto unpack_error;
		if (_unpack_jobacct_id(&(*jobacct)->max_pages_id, rpc_version,
			buffer) != SLURM_SUCCESS)
			goto unpack_error;
		if (_unpack_jobacct_id(&(*jobacct)->min_cpu_id, rpc_version,
			buffer) != SLURM_SUCCESS)
			goto unpack_error;
		if (_unpack_jobacct_id(&(*jobacct)->max_disk_read_id,
			rpc_version, buffer) != SLURM_SUCCESS)
			goto unpack_error;
		if (_unpack_jobacct_id(&(*jobacct)->max_disk_write_id,
			rpc_version, buffer) != SLURM_SUCCESS)
			goto unpack_error;
	} else {
		info("jobacctinfo_unpack version %u not supported",
		     rpc_version);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;

unpack_error:
	debug2("jobacctinfo_unpack: unpack_error: size_buf(buffer) %u",
	       size_buf(buffer));
	if (alloc)
		xfree(*jobacct);
       	return SLURM_ERROR;
}