Example #1
extern int as_mysql_job_complete(mysql_conn_t *mysql_conn,
				 struct job_record *job_ptr)
{
	char *query = NULL;
	int rc = SLURM_SUCCESS, job_state;
	time_t submit_time, end_time;
	uint32_t exit_code = 0;

	if (!job_ptr->db_index
	    && ((!job_ptr->details || !job_ptr->details->submit_time)
		&& !job_ptr->resize_time)) {
		error("as_mysql_job_complete: "
		      "Not inputing this job, it has no submit time.");
		return SLURM_ERROR;
	}

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	debug2("as_mysql_slurmdb_job_complete() called");

	if (job_ptr->resize_time)
		submit_time = job_ptr->resize_time;
	else
		submit_time = job_ptr->details->submit_time;

	if (IS_JOB_RESIZING(job_ptr)) {
		end_time = job_ptr->resize_time;
		job_state = JOB_RESIZING;
	} else {
		/* If we get an error with this just fall through to avoid an
		 * infinite loop */
		if (job_ptr->end_time == 0) {
			debug("as_mysql_jobacct: job %u never started",
			      job_ptr->job_id);
			return SLURM_SUCCESS;
		}
		end_time = job_ptr->end_time;

		if (IS_JOB_REQUEUED(job_ptr))
			job_state = JOB_REQUEUE;
		else
			job_state = job_ptr->job_state & JOB_STATE_BASE;
	}

	slurm_mutex_lock(&rollup_lock);
	if (end_time < global_last_rollup) {
		global_last_rollup = job_ptr->end_time;
		slurm_mutex_unlock(&rollup_lock);

		query = xstrdup_printf("update \"%s_%s\" set "
				       "hourly_rollup=%ld, "
				       "daily_rollup=%ld, monthly_rollup=%ld",
				       mysql_conn->cluster_name,
				       last_ran_table, end_time,
				       end_time, end_time);
		if (debug_flags & DEBUG_FLAG_DB_JOB)
			DB_DEBUG(mysql_conn->conn, "query\n%s", query);
		(void) mysql_db_query(mysql_conn, query);
		xfree(query);
	} else
		slurm_mutex_unlock(&rollup_lock);

	if (!job_ptr->db_index) {
		if (!(job_ptr->db_index =
		      _get_db_index(mysql_conn,
				    submit_time,
				    job_ptr->job_id,
				    job_ptr->assoc_id))) {
			/* Comment is overloaded in job_start to be
			   the block_id, so we will need to store this
			   for later.
			*/
			char *comment = job_ptr->comment;
			job_ptr->comment = NULL;
			/* If we get an error with this just fall
			 * through to avoid an infinite loop
			 */
			if (as_mysql_job_start(
				    mysql_conn, job_ptr) == SLURM_ERROR) {
				job_ptr->comment = comment;
				error("couldn't add job %u at job completion",
				      job_ptr->job_id);
				return SLURM_SUCCESS;
			}
			job_ptr->comment = comment;
		}
	}

	/*
	 * make sure we handle any quotes that may be in the comment
	 */

	query = xstrdup_printf("update \"%s_%s\" set "
			       "time_end=%ld, state=%d",
			       mysql_conn->cluster_name, job_table,
			       end_time, job_state);

	if (job_ptr->derived_ec != NO_VAL)
		xstrfmtcat(query, ", derived_ec=%u", job_ptr->derived_ec);

	if (job_ptr->comment) {
		char *comment = slurm_add_slash_to_quotes(job_ptr->comment);
		xstrfmtcat(query, ", derived_es='%s'", comment);
		xfree(comment);
	}

	exit_code = job_ptr->exit_code;
	if (exit_code == 1) {
		/* This wasn't signalled, it was set by Slurm so don't
		 * treat it like a signal.
		 */
		exit_code = 256;
	}

	xstrfmtcat(query,
		   ", exit_code=%d, kill_requid=%d where job_db_inx=%d;",
		   exit_code, job_ptr->requid,
		   job_ptr->db_index);

	if (debug_flags & DEBUG_FLAG_DB_JOB)
		DB_DEBUG(mysql_conn->conn, "query\n%s", query);
	rc = mysql_db_query(mysql_conn, query);
	xfree(query);

	return rc;
}
Example #2
File: mpi.c Project: HDOD/slurm
int _mpi_init (char *mpi_type)
{
	int retval = SLURM_SUCCESS;
	char *plugin_type = "mpi";
	char *type = NULL;
	int got_default = 0;

	if (init_run && g_context)
		return retval;

	slurm_mutex_lock( &context_lock );

	if ( g_context )
		goto done;

	if (mpi_type == NULL) {
		mpi_type = slurm_get_mpi_default();
		got_default = 1;
	}
	if (mpi_type == NULL) {
		error("No MPI default set.");
		retval = SLURM_ERROR;
		goto done;
	}

	if (!xstrcmp(mpi_type, "list")) {
		char *plugin_dir;
		plugrack_t mpi_rack;

		mpi_rack = plugrack_create();
		if (!mpi_rack) {
			error("Unable to create a plugin manager");
			exit(0);
		}
		plugrack_set_major_type(mpi_rack, "mpi");
		plugin_dir = slurm_get_plugin_dir();
		plugrack_read_dir(mpi_rack, plugin_dir);
		plugrack_print_all_plugin(mpi_rack);
		exit(0);
	}

	setenvf(NULL, "SLURM_MPI_TYPE", "%s", mpi_type);

	type = xstrdup_printf("mpi/%s", mpi_type);

	g_context = plugin_context_create(
		plugin_type, type, (void **)&ops, syms, sizeof(syms));

	if (!g_context) {
		error("cannot create %s context for %s", plugin_type, type);
		retval = SLURM_ERROR;
		goto done;
	}
	init_run = true;

done:
	xfree(type);
	if (got_default)
		xfree(mpi_type);
	slurm_mutex_unlock( &context_lock );
	return retval;
}
Example #3
/*
 * Initialize the job container plugin.
 *
 * RET - slurm error code
 */
extern int job_container_init(void)
{
    int retval = SLURM_SUCCESS;
    char *plugin_type = "job_container";
    char *container_plugin_type = NULL;
    char *last = NULL, *job_container_plugin_list, *job_container = NULL;

    if (init_run && (g_container_context_num >= 0))
        return retval;

    slurm_mutex_lock(&g_container_context_lock);

    if (g_container_context_num >= 0)
        goto done;

    container_plugin_type = slurm_get_job_container_plugin();
    g_container_context_num = 0; /* mark it before anything else */
    if ((container_plugin_type == NULL) ||
            (container_plugin_type[0] == '\0'))
        goto done;

    job_container_plugin_list = container_plugin_type;
    while ((job_container =
                strtok_r(job_container_plugin_list, ",", &last))) {
        xrealloc(ops,
                 sizeof(job_container_ops_t) *
                 (g_container_context_num + 1));
        xrealloc(g_container_context, (sizeof(plugin_context_t *)
                                       * (g_container_context_num + 1)));
        if (strncmp(job_container, "job_container/", 14) == 0)
            job_container += 14; /* backward compatibility */
        job_container = xstrdup_printf("job_container/%s",
                                       job_container);
        g_container_context[g_container_context_num] =
            plugin_context_create(
                plugin_type, job_container,
                (void **)&ops[g_container_context_num],
                syms, sizeof(syms));
        if (!g_container_context[g_container_context_num]) {
            error("cannot create %s context for %s",
                  plugin_type, job_container);
            xfree(job_container);
            retval = SLURM_ERROR;
            break;
        }

        xfree(job_container);
        g_container_context_num++;
        job_container_plugin_list = NULL; /* for next iteration */
    }
    init_run = true;

done:
    slurm_mutex_unlock(&g_container_context_lock);
    xfree(container_plugin_type);

    if (retval != SLURM_SUCCESS)
        job_container_fini();

    return retval;
}
Example #4
/*
 * route_p_split_hostlist - logic to split an input hostlist into
 *                           a set of hostlists to forward to.
 *
 * IN: hl        - hostlist_t   - list of every node to send the message to;
 *                                will be empty on return
 * OUT: sp_hl    - hostlist_t** - the array of hostlists that will be malloced
 * OUT: count    - int*         - the count of created hostlists
 * RET: SLURM_SUCCESS - int
 *
 * Note: the created hostlists must be freed independently by the caller
 *       using hostlist_destroy.
 * Note: the hostlist_t array itself must be xfreed.
 */
extern int route_p_split_hostlist(hostlist_t hl,
				  hostlist_t** sp_hl,
				  int* count, uint16_t tree_width)
{
	int i, j, k, hl_ndx, msg_count, sw_count, lst_count;
	char  *buf;
	bitstr_t *nodes_bitmap = NULL;		/* nodes in message list */
	bitstr_t *fwd_bitmap = NULL;		/* nodes in forward list */

	msg_count = hostlist_count(hl);
	slurm_mutex_lock(&route_lock);
	if (switch_record_cnt == 0) {
		/* configs have not been processed yet */
		slurm_conf_init(NULL);
		if (init_node_conf()) {
			fatal("ROUTE: Failed to init slurm config");
		}
		if (build_all_nodeline_info(false, 0)) {
			fatal("ROUTE: Failed to build node config");
		}
		rehash_node();

		if (slurm_topo_build_config() != SLURM_SUCCESS) {
			fatal("ROUTE: Failed to build topology config");
		}
	}
	slurm_mutex_unlock(&route_lock);
	*sp_hl = (hostlist_t*) xmalloc(switch_record_cnt * sizeof(hostlist_t));
	/* create bitmap of nodes to send the message to */
	if (hostlist2bitmap (hl, false, &nodes_bitmap) != SLURM_SUCCESS) {
		buf = hostlist_ranged_string_xmalloc(hl);
		fatal("ROUTE: Failed to make bitmap from hostlist=%s.", buf);
	}

	/* Find lowest level switch containing all the nodes in the list */
	j = 0;
	for (i = 0; i <= switch_levels; i++) {
		for (j=0; j<switch_record_cnt; j++) {
			if (switch_record_table[j].level == i) {
				if (bit_super_set(nodes_bitmap,
						  switch_record_table[j].
						  node_bitmap)) {
					/* All nodes in message list are in
					 * this switch */
					break;
				}
			}
		}
		if (j < switch_record_cnt) {
			/* Got here via break after bit_super_set */
			break; // 'j' is our switch
		} /* else, no switches at this level reach all nodes */
	}
	if (i > switch_levels) {
		/* This can only happen if trying to schedule multiple physical
		 * clusters as a single logical cluster under the control of a
		 * single slurmctld daemon, and sending something like a
		 * node_registation request to all nodes.
		 * Revert to default behavior*/
		if (debug_flags & DEBUG_FLAG_ROUTE) {
			buf = hostlist_ranged_string_xmalloc(hl);
			debug("ROUTE: didn't find switch containing nodes=%s",
			      buf);
			xfree(buf);
		}
		FREE_NULL_BITMAP(nodes_bitmap);
		xfree(*sp_hl);
		return route_split_hostlist_treewidth(
			hl, sp_hl, count, tree_width);
	}
	if (switch_record_table[j].level == 0) {
		/* This is a leaf switch. Construct list based on TreeWidth */
		FREE_NULL_BITMAP(nodes_bitmap);
		xfree(*sp_hl);
		return route_split_hostlist_treewidth(
			hl, sp_hl, count, tree_width);
	}
	/* Loop through children, constructing a hostlist for each child
	 * switch with nodes in the message list */
	hl_ndx = 0;
	lst_count = 0;
	for (i=0; i < switch_record_table[j].num_switches; i++) {
		k = switch_record_table[j].switch_index[i];
		fwd_bitmap = bit_copy(switch_record_table[k].node_bitmap);
		bit_and(fwd_bitmap, nodes_bitmap);
		sw_count = bit_set_count(fwd_bitmap);
		if (sw_count == 0) {
			continue; /* no nodes on this switch in message list */
		}
		(*sp_hl)[hl_ndx] = bitmap2hostlist(fwd_bitmap);
		/* Now remove nodes from this switch from message list */
		bit_and_not(nodes_bitmap, fwd_bitmap);
		FREE_NULL_BITMAP(fwd_bitmap);
		if (debug_flags & DEBUG_FLAG_ROUTE) {
			buf = hostlist_ranged_string_xmalloc((*sp_hl)[hl_ndx]);
			debug("ROUTE: ... sublist[%d] switch=%s :: %s",
			      i, switch_record_table[k].name, buf);
			xfree(buf);
		}
		hl_ndx++;
		lst_count += sw_count;
		if (lst_count == msg_count)
			break; /* all nodes in message are in a child list */
	}
	FREE_NULL_BITMAP(nodes_bitmap);

	*count = hl_ndx;
	return SLURM_SUCCESS;

}
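A minimal caller sketch (not part of the original listing) illustrating the freeing contract from the header comment above: each created hostlist is released with hostlist_destroy() and the array itself with xfree(). The _forward_example() wrapper and its use of the result are hypothetical.

static void _forward_example(hostlist_t hl, uint16_t tree_width)
{
	hostlist_t *sp_hl = NULL;
	int i, count = 0;

	if (route_p_split_hostlist(hl, &sp_hl, &count, tree_width)
	    != SLURM_SUCCESS)
		return;

	for (i = 0; i < count; i++) {
		/* ... forward the message to the hosts in sp_hl[i] ... */
		hostlist_destroy(sp_hl[i]);	/* free each created hostlist */
	}
	xfree(sp_hl);	/* free the array itself */
}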
Example #5
static void *_node_update(void *args)
{
	char *node_name = (char *) args;
	char *argv[10], nid_str[32], *resp_msg;
	int i, nid = -1, status = 0;
	bool node_state_ok;

	for (i = 0; node_name[i]; i++) {
		if ((node_name[i] >= '0') && (node_name[i] <= '9')) {
			nid = strtol(node_name + i, NULL, 10);
			break;
		}
	}
	if (nid < 0) {
		error("%s: No valid NID: %s", log_file, node_name);
		goto fini;
	}
	snprintf(nid_str, sizeof(nid_str), "%d", nid);

	if (mcdram_mode) {
		/* Update MCDRAM mode.
		 * Example: "capmc set_mcdram_cfg -n 43 -m cache" */
		argv[0] = "capmc";
		argv[1] = "set_mcdram_cfg";
		argv[2] = "-n";
		argv[3] = nid_str;
		argv[4] = "-m";
		argv[5] = mcdram_mode;
		argv[6] = NULL;
		resp_msg = _run_script(argv, &status);
		if (status != 0) {
			error("%s: capmc(%s,%s,%s,%s,%s): %d %s", log_file,
			      argv[1], argv[2], argv[3], argv[4], argv[5],
			      status, resp_msg);
		}
		xfree(resp_msg);
	}

	if (numa_mode) {
		/* Update NUMA mode.
		 * Example: "capmc set_numa_cfg –n 43 –m a2a" */
		argv[0] = "capmc";
		argv[1] = "set_numa_cfg";
		argv[2] = "-n";
		argv[3] = nid_str;
		argv[4] = "-m";
		argv[5] = numa_mode;
		argv[6] = NULL;
		resp_msg = _run_script(argv, &status);
		if (status != 0) {
			error("%s: capmc(%s,%s,%s,%s,%s): %d %s", log_file,
			      argv[1], argv[2], argv[3], argv[4], argv[5],
			      status, resp_msg);
		}
		xfree(resp_msg);
	}

	/* Test if already in "off" state */
	node_state_ok = _check_node_state(nid, nid_str, "off");

	/* Request node power down.
	 * Example: "capmc node_off –n 43" */
	if (!node_state_ok) {
		argv[0] = "capmc";
		argv[1] = "node_off";
		argv[2] = "-n";
		argv[3] = nid_str;
		argv[4] = NULL;
		resp_msg = _run_script(argv, &status);
		if (status != 0) {
			error("%s: capmc(%s,%s,%s): %d %s", log_file,
			      argv[1], argv[2], argv[3], status, resp_msg);
		}
		xfree(resp_msg);
	}

	/* Wait for node in "off" state */
	while (!node_state_ok) {
		sleep(2);
		node_state_ok = _check_node_state(nid, nid_str, "off");
	}

	/* Request node power up.
	 * Example: "capmc node_up –n 43" */
	argv[0] = "capmc";
	argv[1] = "node_up";
	argv[2] = "-n";
	argv[3] = nid_str;
	argv[4] = NULL;
	resp_msg = _run_script(argv, &status);
	if (status != 0) {
		error("%s: capmc(%s,%s,%s): %d %s", log_file,
			argv[1], argv[2], argv[3], status, resp_msg);
	}
	xfree(resp_msg);

fini:	slurm_mutex_lock(&thread_cnt_mutex);
	thread_cnt--;
	pthread_cond_signal(&thread_cnt_cond);
	slurm_mutex_unlock(&thread_cnt_mutex);
	return NULL;
}
Example #6
int
main(int argc, char **argv)
{
	int debug_level, sig, srun_fd;
	struct sigaction sa;
	log_options_t logopt = LOG_OPTS_STDERR_ONLY;
	struct sockaddr_un ca;
	unsigned int ca_len = sizeof(ca);

	atexit(remove_listen_socket);

	/* copied from srun */
	slurm_conf_init(NULL);
	debug_level = _slurm_debug_env_val();
	logopt.stderr_level += debug_level;
	log_init(xbasename(argv[0]), logopt, 0, NULL);

	if (init_srun_argv(argc, argv)) {
		fatal("failed to initialize arguments for running srun");
	}

	if ((cr_id = cr_init()) < 0) {
		fatal("failed to initialize libcr: %s", cr_strerror(errno));
	}
	(void)cr_register_callback(cr_callback, NULL, CR_THREAD_CONTEXT);

	/* forward signals. copied from cr_restart */
	sa.sa_sigaction = signal_child;
	sa.sa_flags = SA_RESTART | SA_NODEFER | SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	for (sig = 0;  sig < _NSIG; sig ++) {
		if (sig == SIGSTOP ||
		    sig == SIGKILL ||
		    sig == SIGCHLD)
			continue;
		sigaction(sig, &sa, NULL);
	}
	sa.sa_sigaction = on_child_exit;
	sa.sa_flags = SA_RESTART | SA_SIGINFO | SA_NOCLDSTOP;
	sigaction(SIGCHLD, &sa, NULL);

	cr_enter_cs(cr_id); /* BEGIN CS: avoid race condition of whether srun is forked */
	if ( fork_exec_srun() ) {
		fatal("failed fork/exec/wait srun");
	}
	cr_leave_cs(cr_id); /* END CS */

	while (1) {
		slurm_mutex_lock(&step_launch_mutex);
		while (step_launched) {
			/* just avoid busy waiting */
			slurm_cond_wait(&step_launch_cond, &step_launch_mutex);
		}
		slurm_mutex_unlock(&step_launch_mutex);

		if (_wait_for_srun_connect() < 0)
			continue;

		cr_enter_cs(cr_id); /* BEGIN CS: checkpoint(callback) will be delayed */

		srun_fd = accept(listen_fd, (struct sockaddr*)&ca, &ca_len);
		if (srun_fd < 0) {
			/* restarted before entering CS; socket will not
			 * be restored */
			if (errno == EBADF) {
				cr_leave_cs(cr_id);
				continue;
			} else {
				fatal("failed to accept socket: %m");
			}
		}

		_read_info_from_srun(srun_fd);
		close(srun_fd);

		step_launched = 1;
		debug2("step launched");

		cr_leave_cs(cr_id); /* END CS */
	}

	return 0;
}
Example #7
static void * _service_connection(void *arg)
{
	slurmdbd_conn_t *conn = (slurmdbd_conn_t *) arg;
	uint32_t nw_size = 0, msg_size = 0, uid = NO_VAL;
	char *msg = NULL;
	ssize_t msg_read = 0, offset = 0;
	bool fini = false, first = true;
	Buf buffer = NULL;
	int rc = SLURM_SUCCESS;

	debug2("Opened connection %d from %s", conn->newsockfd, conn->ip);

	while (!fini) {
		if (!_fd_readable(conn->newsockfd))
			break;		/* problem with this socket */
		msg_read = read(conn->newsockfd, &nw_size, sizeof(nw_size));
		if (msg_read == 0)	/* EOF */
			break;
		if (msg_read != sizeof(nw_size)) {
			error("Could not read msg_size from "
			      "connection %d(%s) uid(%d)",
			      conn->newsockfd, conn->ip, uid);
			break;
		}
		msg_size = ntohl(nw_size);
		if ((msg_size < 2) || (msg_size > MAX_MSG_SIZE)) {
			error("Invalid msg_size (%u) from "
			      "connection %d(%s) uid(%d)",
			      msg_size, conn->newsockfd, conn->ip, uid);
			break;
		}

		msg = xmalloc(msg_size);
		offset = 0;
		while (msg_size > offset) {
			if (!_fd_readable(conn->newsockfd))
				break;		/* problem with this socket */
			msg_read = read(conn->newsockfd, (msg + offset),
					(msg_size - offset));
			if (msg_read <= 0) {
				error("read(%d): %m", conn->newsockfd);
				break;
			}
			offset += msg_read;
		}
		if (msg_size == offset) {
			rc = proc_req(
				conn, msg, msg_size, first, &buffer, &uid);
			first = false;
			if (rc != SLURM_SUCCESS && rc != ACCOUNTING_FIRST_REG) {
				error("Processing last message from "
				      "connection %d(%s) uid(%d)",
				      conn->newsockfd, conn->ip, uid);
				if (rc == ESLURM_ACCESS_DENIED
				    || rc == SLURM_PROTOCOL_VERSION_ERROR)
					fini = true;
			}
		} else {
			buffer = make_dbd_rc_msg(conn->rpc_version,
						 SLURM_ERROR, "Bad offset", 0);
			fini = true;
		}

		(void) _send_resp(conn->newsockfd, buffer);
		xfree(msg);
	}

	if (conn->ctld_port) {
		if (!shutdown_time) {
			slurmdb_cluster_rec_t cluster_rec;
			ListIterator itr;
			slurmdbd_conn_t *slurmdbd_conn;
			memset(&cluster_rec, 0, sizeof(slurmdb_cluster_rec_t));
			cluster_rec.name = conn->cluster_name;
			cluster_rec.control_host = conn->ip;
			cluster_rec.control_port = conn->ctld_port;
			cluster_rec.tres_str = conn->tres_str;
			debug("cluster %s has disconnected",
			      conn->cluster_name);

			clusteracct_storage_g_fini_ctld(
				conn->db_conn, &cluster_rec);

			slurm_mutex_lock(&registered_lock);
			itr = list_iterator_create(registered_clusters);
			while ((slurmdbd_conn = list_next(itr))) {
				if (conn == slurmdbd_conn) {
					list_delete_item(itr);
					break;
				}
			}
			list_iterator_destroy(itr);
			slurm_mutex_unlock(&registered_lock);
		}
		/* needs to be the last thing done */
		acct_storage_g_commit(conn->db_conn, 1);
	}

	acct_storage_g_close_connection(&conn->db_conn);
	if (slurm_close(conn->newsockfd) < 0)
		error("close(%d): %m(%s)",  conn->newsockfd, conn->ip);
	else
		debug2("Closed connection %d uid(%d)", conn->newsockfd, uid);

	xfree(conn->tres_str);
	xfree(conn->cluster_name);
	xfree(conn);
	_free_server_thread(pthread_self());
	return NULL;
}
Example #8
static int _handle_mult_rc_ret(void)
{
	Buf buffer;
	uint16_t msg_type;
	persist_rc_msg_t *msg = NULL;
	dbd_list_msg_t *list_msg = NULL;
	int rc = SLURM_ERROR;
	Buf out_buf = NULL;

	buffer = slurm_persist_recv_msg(slurmdbd_conn);
	if (buffer == NULL)
		return rc;

	safe_unpack16(&msg_type, buffer);
	switch (msg_type) {
	case DBD_GOT_MULT_MSG:
		if (slurmdbd_unpack_list_msg(
			    &list_msg, slurmdbd_conn->version,
			    DBD_GOT_MULT_MSG, buffer)
		    != SLURM_SUCCESS) {
			error("slurmdbd: unpack message error");
			break;
		}

		slurm_mutex_lock(&agent_lock);
		if (agent_list) {
			ListIterator itr =
				list_iterator_create(list_msg->my_list);
			while ((out_buf = list_next(itr))) {
				Buf b;
				if ((rc = _unpack_return_code(
					     slurmdbd_conn->version, out_buf))
				    != SLURM_SUCCESS)
					break;

				if ((b = list_dequeue(agent_list))) {
					free_buf(b);
				} else {
					error("slurmdbd: DBD_GOT_MULT_MSG "
					      "unpack message error");
				}
			}
			list_iterator_destroy(itr);
		}
		slurm_mutex_unlock(&agent_lock);
		slurmdbd_free_list_msg(list_msg);
		break;
	case PERSIST_RC:
		if (slurm_persist_unpack_rc_msg(
			    &msg, buffer, slurmdbd_conn->version)
		    == SLURM_SUCCESS) {
			rc = msg->rc;
			if (rc != SLURM_SUCCESS) {
				if (msg->ret_info == DBD_REGISTER_CTLD &&
				    slurm_get_accounting_storage_enforce()) {
					error("slurmdbd: PERSIST_RC is %d from "
					      "%s(%u): %s",
					      rc,
					      slurmdbd_msg_type_2_str(
						      msg->ret_info, 1),
					      msg->ret_info,
					      msg->comment);
					fatal("You need to add this cluster "
					      "to accounting if you want to "
					      "enforce associations, or no "
					      "jobs will ever run.");
				} else
					debug("slurmdbd: PERSIST_RC is %d from "
					      "%s(%u): %s",
					      rc,
					      slurmdbd_msg_type_2_str(
						      msg->ret_info, 1),
					      msg->ret_info,
					      msg->comment);
			}
			slurm_persist_free_rc_msg(msg);
		} else
			error("slurmdbd: unpack message error");
		break;
	default:
		error("slurmdbd: bad message type %d != PERSIST_RC", msg_type);
	}

unpack_error:
	free_buf(buffer);
	return rc;
}
Example #9
static void *_agent(void *x)
{
	int cnt, rc;
	Buf buffer;
	struct timespec abs_time;
	static time_t fail_time = 0;
	int sigarray[] = {SIGUSR1, 0};
	slurmdbd_msg_t list_req;
	dbd_list_msg_t list_msg;

	list_req.msg_type = DBD_SEND_MULT_MSG;
	list_req.data = &list_msg;
	memset(&list_msg, 0, sizeof(dbd_list_msg_t));
	/* DEF_TIMERS; */

	/* Prepare to catch SIGUSR1 to interrupt pending
	 * I/O and terminate in a timely fashion. */
	xsignal(SIGUSR1, _sig_handler);
	xsignal_unblock(sigarray);

	while (*slurmdbd_conn->shutdown == 0) {
		/* START_TIMER; */
		slurm_mutex_lock(&slurmdbd_lock);
		if (halt_agent)
			slurm_cond_wait(&slurmdbd_cond, &slurmdbd_lock);

		if ((slurmdbd_conn->fd < 0) &&
		    (difftime(time(NULL), fail_time) >= 10)) {
			/* The connection to Slurm DBD is not open */
			_open_slurmdbd_conn(1);
			if (slurmdbd_conn->fd < 0)
				fail_time = time(NULL);
		}

		slurm_mutex_lock(&agent_lock);
		if (agent_list && slurmdbd_conn->fd)
			cnt = list_count(agent_list);
		else
			cnt = 0;
		if ((cnt == 0) || (slurmdbd_conn->fd < 0) ||
		    (fail_time && (difftime(time(NULL), fail_time) < 10))) {
			slurm_mutex_unlock(&slurmdbd_lock);
			abs_time.tv_sec  = time(NULL) + 10;
			abs_time.tv_nsec = 0;
			slurm_cond_timedwait(&agent_cond, &agent_lock,
					     &abs_time);
			slurm_mutex_unlock(&agent_lock);
			continue;
		} else if ((cnt > 0) && ((cnt % 100) == 0))
			info("slurmdbd: agent queue size %u", cnt);
		/* Leave item on the queue until processing complete */
		if (agent_list) {
			int handle_agent_count = 1000;
			if (cnt > handle_agent_count) {
				int agent_count = 0;
				ListIterator agent_itr =
					list_iterator_create(agent_list);
				list_msg.my_list = list_create(NULL);
				while ((buffer = list_next(agent_itr))) {
					list_enqueue(list_msg.my_list, buffer);
					agent_count++;
					if (agent_count > handle_agent_count)
						break;
				}
				list_iterator_destroy(agent_itr);
				buffer = pack_slurmdbd_msg(
					&list_req, SLURM_PROTOCOL_VERSION);
			} else if (cnt > 1) {
				list_msg.my_list = agent_list;
				buffer = pack_slurmdbd_msg(
					&list_req, SLURM_PROTOCOL_VERSION);
			} else
				buffer = (Buf) list_peek(agent_list);
		} else
			buffer = NULL;
		slurm_mutex_unlock(&agent_lock);
		if (buffer == NULL) {
			slurm_mutex_unlock(&slurmdbd_lock);

			slurm_mutex_lock(&assoc_cache_mutex);
			if (slurmdbd_conn->fd >= 0 && running_cache)
				slurm_cond_signal(&assoc_cache_cond);
			slurm_mutex_unlock(&assoc_cache_mutex);

			continue;
		}

		/* NOTE: agent_lock is clear here, so we can add more
		 * requests to the queue while waiting for this RPC to
		 * complete. */
		rc = slurm_persist_send_msg(slurmdbd_conn, buffer);
		if (rc != SLURM_SUCCESS) {
			if (*slurmdbd_conn->shutdown) {
				slurm_mutex_unlock(&slurmdbd_lock);
				break;
			}
			error("slurmdbd: Failure sending message: %d: %m", rc);
		} else if (list_msg.my_list) {
			rc = _handle_mult_rc_ret();
		} else {
			rc = _get_return_code();
			if (rc == EAGAIN) {
				if (*slurmdbd_conn->shutdown) {
					slurm_mutex_unlock(&slurmdbd_lock);
					break;
				}
				error("slurmdbd: Failure with "
				      "message need to resend: %d: %m", rc);
			}
		}
		slurm_mutex_unlock(&slurmdbd_lock);
		slurm_mutex_lock(&assoc_cache_mutex);
		if (slurmdbd_conn->fd >= 0 && running_cache)
			slurm_cond_signal(&assoc_cache_cond);
		slurm_mutex_unlock(&assoc_cache_mutex);

		slurm_mutex_lock(&agent_lock);
		if (agent_list && (rc == SLURM_SUCCESS)) {
			/*
			 * If we sent a mult_msg we just need to free buffer,
			 * we don't need to requeue, just mark list_msg.my_list
			 * as NULL as that is the sign we sent a mult_msg.
			 */
			if (list_msg.my_list) {
				if (list_msg.my_list != agent_list)
					FREE_NULL_LIST(list_msg.my_list);
				list_msg.my_list = NULL;
			} else
				buffer = (Buf) list_dequeue(agent_list);

			free_buf(buffer);
			fail_time = 0;
		} else {
			/* We need to free a mult_msg even on failure */
			if (list_msg.my_list) {
				if (list_msg.my_list != agent_list)
					FREE_NULL_LIST(list_msg.my_list);
				list_msg.my_list = NULL;
				free_buf(buffer);
			}

			fail_time = time(NULL);
		}
		slurm_mutex_unlock(&agent_lock);
		/* END_TIMER; */
		/* info("at the end with %s", TIME_STR); */
	}

	slurm_mutex_lock(&agent_lock);
	_save_dbd_state();
	FREE_NULL_LIST(agent_list);
	slurm_mutex_unlock(&agent_lock);
	return NULL;
}
Example #10
/* un/lock semaphore used for saving state of slurmctld */
extern void lock_state_files(void)
{
	slurm_mutex_lock(&state_mutex);
}
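Only the lock half of the pair described in the comment appears in this example. A minimal sketch of the presumed matching unlock function (an assumption based on that comment, not copied from the source file):

/* unlock half of the pair; assumed counterpart, not from the listing */
extern void unlock_state_files(void)
{
	slurm_mutex_unlock(&state_mutex);
}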
Example #11
/* Send an RPC to the SlurmDBD. Do not wait for the reply. The RPC
 * will be queued and processed later if the SlurmDBD is not responding.
 * NOTE: slurm_open_slurmdbd_conn() must have been called with callbacks set
 *
 * Returns SLURM_SUCCESS or an error code */
extern int send_slurmdbd_msg(uint16_t rpc_version, slurmdbd_msg_t *req)
{
	Buf buffer;
	int cnt, rc = SLURM_SUCCESS;
	static time_t syslog_time = 0;
	static int max_agent_queue = 0;

	/*
	 * Use the larger of MAX_AGENT_QUEUE and (max job count * 2 +
	 * node count * 4) as the queue limit.
	 */
	if (!max_agent_queue)
		max_agent_queue =
			MAX(MAX_AGENT_QUEUE,
			    ((slurmctld_conf.max_job_cnt * 2) +
			     (node_record_count * 4)));

	buffer = slurm_persist_msg_pack(
		slurmdbd_conn, (persist_msg_t *)req);
	if (!buffer)	/* pack error */
		return SLURM_ERROR;

	slurm_mutex_lock(&agent_lock);
	if ((agent_tid == 0) || (agent_list == NULL)) {
		_create_agent();
		if ((agent_tid == 0) || (agent_list == NULL)) {
			slurm_mutex_unlock(&agent_lock);
			free_buf(buffer);
			return SLURM_ERROR;
		}
	}
	cnt = list_count(agent_list);
	if ((cnt >= (max_agent_queue / 2)) &&
	    (difftime(time(NULL), syslog_time) > 120)) {
		/* Record critical error every 120 seconds */
		syslog_time = time(NULL);
		error("slurmdbd: agent queue filling (%d), RESTART SLURMDBD NOW",
		      cnt);
		syslog(LOG_CRIT, "*** RESTART SLURMDBD NOW ***");
		if (slurmdbd_conn->trigger_callbacks.dbd_fail)
			(slurmdbd_conn->trigger_callbacks.dbd_fail)();
	}
	if (cnt == (max_agent_queue - 1))
		cnt -= _purge_step_req();
	if (cnt == (max_agent_queue - 1))
		cnt -= _purge_job_start_req();
	if (cnt < max_agent_queue) {
		if (list_enqueue(agent_list, buffer) == NULL)
			fatal("list_enqueue: memory allocation failure");
	} else {
		error("slurmdbd: agent queue is full (%u), discarding %s:%u request",
		      cnt,
		      slurmdbd_msg_type_2_str(req->msg_type, 1),
		      req->msg_type);
		if (slurmdbd_conn->trigger_callbacks.acct_full)
			(slurmdbd_conn->trigger_callbacks.acct_full)();
		free_buf(buffer);
		rc = SLURM_ERROR;
	}

	slurm_cond_broadcast(&agent_cond);
	slurm_mutex_unlock(&agent_lock);
	return rc;
}
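A hedged caller sketch (not in the original listing) of the fire-and-forget contract from the header comment: the call returns once the message is packed and queued, and the agent thread delivers or retries it later. The message type, payload, and _notify_example() wrapper are illustrative only.

static void _notify_example(dbd_job_start_msg_t *job_start)
{
	slurmdbd_msg_t req;

	req.msg_type = DBD_JOB_START;	/* illustrative request type */
	req.data = job_start;

	/* No reply is read here; a failure means the message could not
	 * even be queued (e.g. the agent queue is full). */
	if (send_slurmdbd_msg(SLURM_PROTOCOL_VERSION, &req) != SLURM_SUCCESS)
		error("could not queue job start record");
}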
Example #12
/*
 * init_power_save - Initialize the power save module. Started as a
 *	pthread. Terminates automatically at slurmctld shutdown time.
 *	Input and output are unused.
 */
static void *_init_power_save(void *arg)
{
	/* Locks: Read nodes */
	slurmctld_lock_t node_read_lock = {
		NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
	/* Locks: Write nodes */
	slurmctld_lock_t node_write_lock = {
		NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
	time_t now, boot_time = 0, last_power_scan = 0;

	if (power_save_config && !power_save_enabled) {
		debug("power_save mode not enabled");
		return NULL;
	}

	suspend_node_bitmap = bit_alloc(node_record_count);
	resume_node_bitmap  = bit_alloc(node_record_count);

	while (slurmctld_config.shutdown_time == 0) {
		sleep(1);

		if (_reap_procs() < 2) {
			debug("power_save programs getting backlogged");
			continue;
		}

		if ((last_config != slurmctld_conf.last_update) &&
		    (_init_power_config())) {
			info("power_save mode has been disabled due to "
			     "configuration changes");
			goto fini;
		}

		now = time(NULL);
		if (boot_time == 0)
			boot_time = now;

		/* Only run every 60 seconds or after a node state change,
		 *  whichever happens first */
		if ((last_node_update >= last_power_scan) ||
		    (now >= (last_power_scan + 60))) {
			lock_slurmctld(node_write_lock);
			_do_power_work(now);
			unlock_slurmctld(node_write_lock);
			last_power_scan = now;
		}

		if (slurmd_timeout &&
		    (now > (boot_time + (slurmd_timeout / 2)))) {
			lock_slurmctld(node_read_lock);
			_re_wake();
			unlock_slurmctld(node_read_lock);
			/* prevent additional executions */
			boot_time += (365 * 24 * 60 * 60);
			slurmd_timeout = 0;
		}
	}

fini:	_clear_power_config();
	FREE_NULL_BITMAP(suspend_node_bitmap);
	FREE_NULL_BITMAP(resume_node_bitmap);
	_shutdown_power();
	slurm_mutex_lock(&power_mutex);
	power_save_enabled = false;
	slurm_cond_signal(&power_cond);
	slurm_mutex_unlock(&power_mutex);
	pthread_exit(NULL);
	return NULL;
}
Example #13
extern int acct_gather_profile_startpoll(char *freq, char *freq_def)
{
	int retval = SLURM_SUCCESS;
	pthread_attr_t attr;
	int i;
	uint32_t profile = ACCT_GATHER_PROFILE_NOT_SET;

	if (acct_gather_profile_init() < 0)
		return SLURM_ERROR;

	slurm_mutex_lock(&profile_running_mutex);
	if (acct_gather_profile_running) {
		slurm_mutex_unlock(&profile_running_mutex);
		error("acct_gather_profile_startpoll: poll already started!");
		return retval;
	}
	acct_gather_profile_running = true;
	slurm_mutex_unlock(&profile_running_mutex);

	(*(ops.get))(ACCT_GATHER_PROFILE_RUNNING, &profile);
	xassert(profile != ACCT_GATHER_PROFILE_NOT_SET);

	for (i=0; i < PROFILE_CNT; i++) {
		memset(&acct_gather_profile_timer[i], 0,
		       sizeof(acct_gather_profile_timer_t));
		slurm_cond_init(&acct_gather_profile_timer[i].notify, NULL);
		slurm_mutex_init(&acct_gather_profile_timer[i].notify_mutex);

		switch (i) {
		case PROFILE_ENERGY:
			if (!(profile & ACCT_GATHER_PROFILE_ENERGY))
				break;
			_set_freq(i, freq, freq_def);

			acct_gather_energy_startpoll(
				acct_gather_profile_timer[i].freq);
			break;
		case PROFILE_TASK:
			/* Always set up the task profile (always first),
			   since it is used to control memory consumption
			   and the like.  The plugin checks the profile
			   setting internally.
			*/
			_set_freq(i, freq, freq_def);

			jobacct_gather_startpoll(
				acct_gather_profile_timer[i].freq);

			break;
		case PROFILE_FILESYSTEM:
			if (!(profile & ACCT_GATHER_PROFILE_LUSTRE))
				break;
			_set_freq(i, freq, freq_def);

			acct_gather_filesystem_startpoll(
				acct_gather_profile_timer[i].freq);
			break;
		case PROFILE_NETWORK:
			if (!(profile & ACCT_GATHER_PROFILE_NETWORK))
				break;
			_set_freq(i, freq, freq_def);

			acct_gather_interconnect_startpoll(
				acct_gather_profile_timer[i].freq);
			break;
		default:
			fatal("Unhandled profile option %d please update "
			      "slurm_acct_gather_profile.c "
			      "(acct_gather_profile_startpoll)", i);
		}
	}

	/* create polling thread */
	slurm_attr_init(&attr);

	if  (pthread_create(&timer_thread_id, &attr,
			    &_timer_thread, NULL)) {
		debug("acct_gather_profile_startpoll failed to create "
		      "_timer_thread: %m");
	} else
		debug3("acct_gather_profile_startpoll dynamic logging enabled");
	slurm_attr_destroy(&attr);

	return retval;
}
Example #14
static void *_timer_thread(void *args)
{
	int i, now, diff;

#if HAVE_SYS_PRCTL_H
	if (prctl(PR_SET_NAME, "acctg_prof", NULL, NULL, NULL) < 0) {
		error("%s: cannot set my name to %s %m",
		      __func__, "acctg_prof");
	}
#endif

	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

	DEF_TIMERS;
	while (init_run && acct_gather_profile_test()) {
		slurm_mutex_lock(&g_context_lock);
		START_TIMER;
		now = time(NULL);

		for (i=0; i<PROFILE_CNT; i++) {
			if (acct_gather_suspend_test()) {
				/* Handle suspended time as if it
				 * didn't happen */
				if (!acct_gather_profile_timer[i].freq)
					continue;
				if (acct_gather_profile_timer[i].last_notify)
					acct_gather_profile_timer[i].
						last_notify += SLEEP_TIME;
				else
					acct_gather_profile_timer[i].
						last_notify = now;
				continue;
			}

			diff = now - acct_gather_profile_timer[i].last_notify;
			/* info ("%d is %d and %d", i, */
			/*       acct_gather_profile_timer[i].freq, */
			/*       diff); */
			if (!acct_gather_profile_timer[i].freq
			    || (diff < acct_gather_profile_timer[i].freq))
				continue;
			if (!acct_gather_profile_test())
				break;	/* Shutting down */
			debug2("profile signalling type %s",
			       acct_gather_profile_type_t_name(i));

			/* signal poller to start */
			slurm_mutex_lock(&acct_gather_profile_timer[i].
					 notify_mutex);
			slurm_cond_signal(
				&acct_gather_profile_timer[i].notify);
			slurm_mutex_unlock(&acct_gather_profile_timer[i].
					   notify_mutex);
			acct_gather_profile_timer[i].last_notify = now;
		}
		END_TIMER;
		slurm_mutex_unlock(&g_context_lock);

		usleep(USLEEP_TIME - DELTA_TIMER);
	}

	return NULL;
}
Example #15
extern List as_mysql_get_wckeys(mysql_conn_t *mysql_conn, uid_t uid,
				slurmdb_wckey_cond_t *wckey_cond)
{
	//DEF_TIMERS;
	char *extra = NULL;
	char *tmp = NULL;
	char *cluster_name = NULL;
	List wckey_list = NULL;
	int i=0, is_admin=1;
	uint16_t private_data = 0;
	slurmdb_user_rec_t user;
	List use_cluster_list = as_mysql_cluster_list;
	ListIterator itr;

	if (!wckey_cond) {
		xstrcat(extra, " where deleted=0");
		goto empty;
	}

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return NULL;

	memset(&user, 0, sizeof(slurmdb_user_rec_t));
	user.uid = uid;

	private_data = slurm_get_private_data();
	if (private_data & PRIVATE_DATA_USERS) {
		if (!(is_admin = is_user_min_admin_level(
			      mysql_conn, uid, SLURMDB_ADMIN_OPERATOR))) {
			assoc_mgr_fill_in_user(
				mysql_conn, &user, 1, NULL);
		}
		if (!is_admin && !user.name) {
			debug("User %u has no associations, and is not admin, "
			      "so not returning any wckeys.", user.uid);
			return NULL;
		}
	}

	(void) _setup_wckey_cond_limits(wckey_cond, &extra);

	if (wckey_cond->cluster_list && list_count(wckey_cond->cluster_list))
		use_cluster_list = wckey_cond->cluster_list;
empty:
	xfree(tmp);
	xstrfmtcat(tmp, "t1.%s", wckey_req_inx[i]);
	for(i=1; i<WCKEY_REQ_COUNT; i++) {
		xstrfmtcat(tmp, ", t1.%s", wckey_req_inx[i]);
	}

	/* This makes sure we look only at this user's wckeys when the
	 * flag is set.  We also include any accounts they may be
	 * coordinator of.
	 */
	if (!is_admin && (private_data & PRIVATE_DATA_USERS))
		xstrfmtcat(extra, " && t1.user='******'", user.name);

	wckey_list = list_create(slurmdb_destroy_wckey_rec);

	if (use_cluster_list == as_mysql_cluster_list)
		slurm_mutex_lock(&as_mysql_cluster_list_lock);
	//START_TIMER;
	itr = list_iterator_create(use_cluster_list);
	while ((cluster_name = list_next(itr))) {
		if (_cluster_get_wckeys(mysql_conn, wckey_cond, tmp, extra,
					cluster_name, wckey_list)
		    != SLURM_SUCCESS) {
			FREE_NULL_LIST(wckey_list);
			wckey_list = NULL;
			break;
		}
	}
	list_iterator_destroy(itr);

	if (use_cluster_list == as_mysql_cluster_list)
		slurm_mutex_unlock(&as_mysql_cluster_list_lock);

	xfree(tmp);
	xfree(extra);

	//END_TIMER2("get_wckeys");
	return wckey_list;
}
Example #16
/* Send an RPC to the SlurmDBD and wait for an arbitrary reply message.
 * The RPC will not be queued if an error occurs.
 * The "resp" message must be freed by the caller.
 * Returns SLURM_SUCCESS or an error code */
extern int send_recv_slurmdbd_msg(uint16_t rpc_version,
				  slurmdbd_msg_t *req,
				  slurmdbd_msg_t *resp)
{
	int rc = SLURM_SUCCESS;
	Buf buffer;

	xassert(req);
	xassert(resp);

	/* Set halt_agent so this message is sent ahead of anything the
	   agent (which can send at any time) has queued; clear it once
	   we hold the mutex.
	*/
	halt_agent = 1;
	slurm_mutex_lock(&slurmdbd_lock);
	halt_agent = 0;
	if (!slurmdbd_conn || (slurmdbd_conn->fd < 0)) {
		/* Either slurm_open_slurmdbd_conn() was not executed or
		 * the connection to Slurm DBD has been closed */
		if (req->msg_type == DBD_GET_CONFIG)
			_open_slurmdbd_conn(0);
		else
			_open_slurmdbd_conn(1);
		if (!slurmdbd_conn || (slurmdbd_conn->fd < 0)) {
			rc = SLURM_ERROR;
			goto end_it;
		}
	}

	if (!(buffer = pack_slurmdbd_msg(req, rpc_version))) {
		rc = SLURM_ERROR;
		goto end_it;
	}

	rc = slurm_persist_send_msg(slurmdbd_conn, buffer);
	free_buf(buffer);
	if (rc != SLURM_SUCCESS) {
		error("slurmdbd: Sending message type %s: %d: %m",
		      rpc_num2string(req->msg_type), rc);
		goto end_it;
	}

	buffer = slurm_persist_recv_msg(slurmdbd_conn);
	if (buffer == NULL) {
		error("slurmdbd: Getting response to message type %u",
		      req->msg_type);
		rc = SLURM_ERROR;
		goto end_it;
	}

	rc = unpack_slurmdbd_msg(resp, rpc_version, buffer);
	/* check for the rc of the start job message */
	if (rc == SLURM_SUCCESS && resp->msg_type == DBD_ID_RC)
		rc = ((dbd_id_rc_msg_t *)resp->data)->return_code;

	free_buf(buffer);
end_it:
	slurm_cond_signal(&slurmdbd_cond);
	slurm_mutex_unlock(&slurmdbd_lock);

	return rc;
}
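A hedged usage sketch (not in the original listing) of the synchronous contract above: the caller supplies both request and response structures and must free the reply. The request type and the _query_example() wrapper are illustrative; the matching slurmdbd_free_* call depends on the reply's msg_type.

static int _query_example(void)
{
	slurmdbd_msg_t req, resp;
	int rc;

	memset(&req, 0, sizeof(req));
	req.msg_type = DBD_GET_CONFIG;	/* illustrative request type */
	req.data = NULL;

	rc = send_recv_slurmdbd_msg(SLURM_PROTOCOL_VERSION, &req, &resp);
	if (rc == SLURM_SUCCESS) {
		/* Inspect resp.msg_type, then release resp.data with the
		 * slurmdbd_free_* helper matching that type, per the
		 * "resp must be freed by the caller" note above. */
	}
	return rc;
}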
Example #17
static void _launch_app(srun_job_t *job, List srun_job_list, bool got_alloc)
{
	ListIterator opt_iter, job_iter;
	opt_t *opt_local = NULL;
	_launch_app_data_t *opts;
	int total_ntasks = 0, total_nnodes = 0, step_cnt = 0, node_offset = 0;
	pthread_mutex_t step_mutex = PTHREAD_MUTEX_INITIALIZER;
	pthread_cond_t step_cond   = PTHREAD_COND_INITIALIZER;
	srun_job_t *first_job = NULL;
	char *launch_type, *pack_node_list = NULL;
	bool need_mpir = false;
	uint16_t *tmp_task_cnt = NULL, *pack_task_cnts = NULL;
	uint32_t **tmp_tids = NULL, **pack_tids = NULL;

	launch_type = slurm_get_launch_type();
	if (launch_type && strstr(launch_type, "slurm"))
		need_mpir = true;
	xfree(launch_type);

	if (srun_job_list) {
		int pack_step_cnt = list_count(srun_job_list);
		first_job = (srun_job_t *) list_peek(srun_job_list);
		if (!opt_list) {
			if (first_job)
				fini_srun(first_job, got_alloc, &global_rc, 0);
			fatal("%s: have srun_job_list, but no opt_list",
			      __func__);
		}

		job_iter = list_iterator_create(srun_job_list);
		while ((job = (srun_job_t *) list_next(job_iter))) {
			char *node_list = NULL;
			int i, node_inx;
			total_ntasks += job->ntasks;
			total_nnodes += job->nhosts;

			xrealloc(pack_task_cnts, sizeof(uint16_t)*total_nnodes);
			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_TASKS,
						  &tmp_task_cnt);
			if (!tmp_task_cnt) {
				fatal("%s: job %u has NULL task array",
				      __func__, job->jobid);
				break;	/* To eliminate CLANG error */
			}
			memcpy(pack_task_cnts + node_offset, tmp_task_cnt,
			       sizeof(uint16_t) * job->nhosts);

			xrealloc(pack_tids, sizeof(uint32_t *) * total_nnodes);
			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_TIDS,
						  &tmp_tids);
			if (!tmp_tids) {
				fatal("%s: job %u has NULL task ID array",
				      __func__, job->jobid);
				break;	/* To eliminate CLANG error */
			}
			for (node_inx = 0; node_inx < job->nhosts; node_inx++) {
				uint32_t *node_tids;
				node_tids = xmalloc(sizeof(uint32_t) *
						    tmp_task_cnt[node_inx]);
				for (i = 0; i < tmp_task_cnt[node_inx]; i++) {
					node_tids[i] = tmp_tids[node_inx][i] +
						       job->pack_task_offset;
				}
				pack_tids[node_offset + node_inx] =
					node_tids;
			}

			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_NODE_LIST,
						  &node_list);
			if (!node_list) {
				fatal("%s: job %u has NULL hostname",
				      __func__, job->jobid);
			}
			if (pack_node_list)
				xstrfmtcat(pack_node_list, ",%s", node_list);
			else
				pack_node_list = xstrdup(node_list);
			xfree(node_list);
			node_offset += job->nhosts;
		}
		list_iterator_reset(job_iter);
		_reorder_pack_recs(&pack_node_list, &pack_task_cnts,
				   &pack_tids, total_nnodes);

		if (need_mpir)
			mpir_init(total_ntasks);

		opt_iter = list_iterator_create(opt_list);
		while ((opt_local = (opt_t *) list_next(opt_iter))) {
			job = (srun_job_t *) list_next(job_iter);
			if (!job) {
				slurm_mutex_lock(&step_mutex);
				while (step_cnt > 0)
					slurm_cond_wait(&step_cond,&step_mutex);
				slurm_mutex_unlock(&step_mutex);
				if (first_job) {
					fini_srun(first_job, got_alloc,
						  &global_rc, 0);
				}
				fatal("%s: job allocation count does not match request count (%d != %d)",
				      __func__, list_count(srun_job_list),
				      list_count(opt_list));
				break;	/* To eliminate CLANG error */
			}

			slurm_mutex_lock(&step_mutex);
			step_cnt++;
			slurm_mutex_unlock(&step_mutex);
			job->pack_node_list = xstrdup(pack_node_list);
			if ((pack_step_cnt > 1) && pack_task_cnts) {
				xassert(node_offset == job->pack_nnodes);
				job->pack_task_cnts = xmalloc(sizeof(uint16_t) *
							      job->pack_nnodes);
				memcpy(job->pack_task_cnts, pack_task_cnts,
				       sizeof(uint16_t) * job->pack_nnodes);
				job->pack_tids = xmalloc(sizeof(uint32_t *) *
							 job->pack_nnodes);
				memcpy(job->pack_tids, pack_tids,
				       sizeof(uint32_t *) * job->pack_nnodes);
			}
			opts = xmalloc(sizeof(_launch_app_data_t));
			opts->got_alloc   = got_alloc;
			opts->job         = job;
			opts->opt_local   = opt_local;
			opts->step_cond   = &step_cond;
			opts->step_cnt    = &step_cnt;
			opts->step_mutex  = &step_mutex;
			opt_local->pack_step_cnt = pack_step_cnt;

			slurm_thread_create_detached(NULL, _launch_one_app,
						     opts);
		}
		xfree(pack_node_list);
		xfree(pack_task_cnts);
		list_iterator_destroy(job_iter);
		list_iterator_destroy(opt_iter);
		slurm_mutex_lock(&step_mutex);
		while (step_cnt > 0)
			slurm_cond_wait(&step_cond, &step_mutex);
		slurm_mutex_unlock(&step_mutex);

		if (first_job)
			fini_srun(first_job, got_alloc, &global_rc, 0);
	} else {
		if (need_mpir)
			mpir_init(job->ntasks);
		opts = xmalloc(sizeof(_launch_app_data_t));
		opts->got_alloc   = got_alloc;
		opts->job         = job;
		opts->opt_local   = &opt;
		opt.pack_step_cnt = 1;
		_launch_one_app(opts);
		fini_srun(job, got_alloc, &global_rc, 0);
	}
}
Example #18
/* block_state_mutex must be unlocked before calling this. */
extern int put_block_in_error_state(bg_record_t *bg_record, char *reason)
{
	uid_t pw_uid;

	xassert(bg_record);

	/* Only check this if the blocks are created, meaning this
	   isn't at startup.
	*/
	if (blocks_are_created) {
		/* Since we are putting this block in an error state we need
		   to wait for the job to be removed.  We don't really
		   need to free the block though since we may just
		   want it to be in an error state for some reason. */
		while (bg_record->job_running > NO_JOB_RUNNING) {
			if (bg_record->magic != BLOCK_MAGIC) {
				error("While putting block %s in a error "
				      "state it was destroyed",
				      bg_record->bg_block_id);
				return SLURM_ERROR;
			}
			debug2("block %s is still running job %d",
			       bg_record->bg_block_id, bg_record->job_running);
			sleep(1);
		}
	}

	slurm_mutex_lock(&block_state_mutex);
	if (!block_ptr_exist_in_list(bg_lists->main, bg_record)) {
		slurm_mutex_unlock(&block_state_mutex);
		error("while trying to put block in "
		      "error state it disappeared");
		return SLURM_ERROR;
	}

	info("Setting Block %s to ERROR state. (reason: '%s')",
	     bg_record->bg_block_id, reason);
	/* we add the block to these lists so we don't try to schedule
	   on them. */
	if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record)) {
		list_push(bg_lists->job_running, bg_record);
		num_unused_cpus -= bg_record->cpu_cnt;
	}
	if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);

	bg_record->job_running = BLOCK_ERROR_STATE;
	bg_record->state |= BG_BLOCK_ERROR_FLAG;

	xfree(bg_record->user_name);
	xfree(bg_record->target_name);
	bg_record->user_name = xstrdup(bg_conf->slurm_user_name);
	bg_record->target_name = xstrdup(bg_conf->slurm_user_name);
	bg_record->reason = xstrdup(reason);

	if (uid_from_string (bg_record->user_name, &pw_uid) < 0)
		error("No such user: %s", bg_record->user_name);
	else
		bg_record->user_uid = pw_uid;

	/* Only send if reason is set.  If it isn't set then
	   accounting should already know about this error state */
	if (reason)
		_set_block_nodes_accounting(bg_record, reason);
	slurm_mutex_unlock(&block_state_mutex);

	trigger_block_error();
	return SLURM_SUCCESS;
}
Example #19
/*
 * Initialize context for node selection plugin
 */
extern int slurm_select_init(bool only_default)
{
	int retval = SLURM_SUCCESS;
	char *type = NULL;
	int i, j, len;
	DIR *dirp;
	struct dirent *e;
	char *dir_array = NULL, *head = NULL;
	char *plugin_type = "select";

	if ( init_run && select_context )
		return retval;

	slurm_mutex_lock( &select_context_lock );

	if ( select_context )
		goto done;

	type = slurm_get_select_type();
	if (working_cluster_rec) {
		/* just ignore warnings here */
	} else {
#ifdef HAVE_BG
		if (xstrcasecmp(type, "select/bluegene")) {
			error("%s is incompatible with BlueGene", type);
			fatal("Use SelectType=select/bluegene");
		}
#else
		if (!xstrcasecmp(type, "select/bluegene")) {
			fatal("Requested SelectType=select/bluegene "
			      "in slurm.conf, but not running on a BG[L|P|Q] "
			      "system.  If looking to emulate a BG[L|P|Q] "
			      "system use --enable-bgl-emulation or "
			      "--enable-bgp-emulation respectively.");
		}
#endif

#ifdef HAVE_ALPS_CRAY
		if (xstrcasecmp(type, "select/alps")) {
			error("%s is incompatible with Cray system "
			      "running alps", type);
			fatal("Use SelectType=select/alps");
		}
#else
		if (!xstrcasecmp(type, "select/alps")) {
			fatal("Requested SelectType=select/alps "
			      "in slurm.conf, but not running on a ALPS Cray "
			      "system.  If looking to emulate a Alps Cray "
			      "system use --enable-alps-cray-emulation.");
		}
#endif

#ifdef HAVE_NATIVE_CRAY
		if (xstrcasecmp(type, "select/cray")) {
			error("%s is incompatible with a native Cray system.",
			      type);
			fatal("Use SelectType=select/cray");
		}
#else
		/* if (!xstrcasecmp(type, "select/cray")) { */
		/* 	fatal("Requested SelectType=select/cray " */
		/* 	      "in slurm.conf, but not running on a native Cray " */
		/* 	      "system.  If looking to run on a Cray " */
		/* 	      "system natively use --enable-native-cray."); */
		/* } */
#endif
	}

	select_context_cnt = 0;
	if (only_default) {
		ops = xmalloc(sizeof(slurm_select_ops_t));
		select_context = xmalloc(sizeof(plugin_context_t));
		if ((select_context[0] = plugin_context_create(
			     plugin_type, type, (void **)&ops[0],
			     node_select_syms, sizeof(node_select_syms)))) {
			select_context_default = 0;
			select_context_cnt++;
		}
		goto skip_load_all;
	}

	if (!(dir_array = slurm_get_plugin_dir())) {
		error("plugin_load_and_link: No plugin dir given");
		goto done;
	}

	head = dir_array;
	for (i=0; ; i++) {
		bool got_colon = 0;
		if (dir_array[i] == ':') {
			dir_array[i] = '\0';
			got_colon = 1;
		} else if (dir_array[i] != '\0')
			continue;

		/* Open the directory. */
		if (!(dirp = opendir(head))) {
			error("cannot open plugin directory %s", head);
			goto done;
		}

		while (1) {
			char full_name[128];

			if (!(e = readdir( dirp )))
				break;
			/* Check only files with select_ in them. */
			if (xstrncmp(e->d_name, "select_", 7))
				continue;

			len = strlen(e->d_name);
#if defined(__CYGWIN__)
			len -= 4;
#else
			len -= 3;
#endif
			/* Check only shared object files */
			if (xstrcmp(e->d_name+len,
#if defined(__CYGWIN__)
				   ".dll"
#else
				   ".so"
#endif
				    ))
				continue;
			/* add one for the / */
			len++;
			xassert(len<sizeof(full_name));
			snprintf(full_name, len, "select/%s", e->d_name+7);
			for (j=0; j<select_context_cnt; j++) {
				if (!xstrcmp(full_name,
					     select_context[j]->type))
					break;
			}
			if (j >= select_context_cnt) {
				xrealloc(ops,
					 (sizeof(slurm_select_ops_t) *
					  (select_context_cnt + 1)));
				xrealloc(select_context,
					 (sizeof(plugin_context_t) *
					  (select_context_cnt + 1)));

				select_context[select_context_cnt] =
					plugin_context_create(
						plugin_type, full_name,
						(void **)&ops[
							select_context_cnt],
						node_select_syms,
						sizeof(node_select_syms));
				if (select_context[select_context_cnt]) {
					/* set the default */
					if (!xstrcmp(full_name, type))
						select_context_default =
							select_context_cnt;
					select_context_cnt++;
				}
			}
		}

		closedir(dirp);

		if (got_colon) {
			head = dir_array + i + 1;
		} else
			break;
	}

skip_load_all:
	if (select_context_default == -1)
		fatal("Can't find plugin for %s", type);

	/* Ensure that plugin_id is valid and unique */
	for (i=0; i<select_context_cnt; i++) {
		for (j=i+1; j<select_context_cnt; j++) {
			if (*(ops[i].plugin_id) !=
			    *(ops[j].plugin_id))
				continue;
			fatal("SelectPlugins: Duplicate plugin_id %u for "
			      "%s and %s",
			      *(ops[i].plugin_id),
			      select_context[i]->type,
			      select_context[j]->type);
		}
		if (*(ops[i].plugin_id) < 100) {
			fatal("SelectPlugins: Invalid plugin_id %u (<100) %s",
			      *(ops[i].plugin_id),
			      select_context[i]->type);
		}

	}
	init_run = true;
done:
	slurm_mutex_unlock( &select_context_lock );
	if (!working_cluster_rec) {
		if (select_running_linear_based()) {
			uint16_t cr_type = slurm_get_select_type_param();
			if (cr_type & (CR_CPU | CR_CORE | CR_SOCKET)) {
				fatal("Invalid SelectTypeParameters for "
				      "%s: %s (%u), it's can't contain "
				      "CR_(CPU|CORE|SOCKET).",
				      type,
				      select_type_param_string(cr_type),
				      cr_type);
			}
		}
	}

	xfree(type);
	xfree(dir_array);

	return retval;
}
Example #20
/*
 * This can take the node lock in the slurmctld via slurm_drain_node or
 * slurm_fail_job, so if slurmctld_locked is set we call the functions
 * without taking the locks again.
 */
extern int down_nodecard(char *mp_name, bitoff_t io_start,
			 bool slurmctld_locked)
{
	List requests = NULL;
	List delete_list = NULL;
	ListIterator itr = NULL;
	bg_record_t *bg_record = NULL, *found_record = NULL, tmp_record;
	bg_record_t *smallest_bg_record = NULL;
	struct node_record *node_ptr = NULL;
	int mp_bit = 0;
	static int io_cnt = NO_VAL;
	static int create_size = NO_VAL;
	static select_ba_request_t blockreq;
	int rc = SLURM_SUCCESS;
	char *reason = "select_bluegene: nodecard down";

	xassert(mp_name);

	if (io_cnt == NO_VAL) {
		io_cnt = 1;
		/* Translate 1 nodecard count to ionode count */
		if ((io_cnt *= bg_conf->io_ratio))
			io_cnt--;

		/* make sure we create something that is able to be
		   created */
		if (bg_conf->smallest_block < bg_conf->nodecard_cnode_cnt)
			create_size = bg_conf->nodecard_cnode_cnt;
		else
			create_size = bg_conf->smallest_block;
	}

	node_ptr = find_node_record(mp_name);
	if (!node_ptr) {
		error ("down_sub_node_blocks: invalid node specified '%s'",
		       mp_name);
		return EINVAL;
	}

	/* This is a sanity check to make sure we don't core on these
	   bits when we set them below. */
	if (io_start >= bg_conf->ionodes_per_mp
	    || (io_start+io_cnt) >= bg_conf->ionodes_per_mp) {
		debug("io %d-%d not configured on this "
		      "system, only %d ionodes per midplane",
		      io_start, io_start+io_cnt, bg_conf->ionodes_per_mp);
		return EINVAL;
	}
	mp_bit = (node_ptr - node_record_table_ptr);

	memset(&blockreq, 0, sizeof(select_ba_request_t));

	blockreq.conn_type[0] = SELECT_SMALL;
	blockreq.save_name = mp_name;

	debug3("here setting node %d of %d and ionodes %d-%d of %d",
	       mp_bit, node_record_count, io_start,
	       io_start+io_cnt, bg_conf->ionodes_per_mp);

	memset(&tmp_record, 0, sizeof(bg_record_t));
	tmp_record.mp_count = 1;
	tmp_record.cnode_cnt = bg_conf->nodecard_cnode_cnt;
	tmp_record.mp_bitmap = bit_alloc(node_record_count);
	bit_set(tmp_record.mp_bitmap, mp_bit);

	tmp_record.ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
	bit_nset(tmp_record.ionode_bitmap, io_start, io_start+io_cnt);

	slurm_mutex_lock(&block_state_mutex);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		if (!bit_test(bg_record->mp_bitmap, mp_bit))
			continue;

		if (!blocks_overlap(bg_record, &tmp_record))
			continue;

		if (bg_record->job_running > NO_JOB_RUNNING) {
			if (slurmctld_locked)
				job_fail(bg_record->job_running);
			else
				slurm_fail_job(bg_record->job_running);

		}
		/* If Running Dynamic mode and the block is
		   smaller than the create size just continue on.
		*/
		if ((bg_conf->layout_mode == LAYOUT_DYNAMIC)
		    && (bg_record->cnode_cnt < create_size)) {
			if (!delete_list)
				delete_list = list_create(NULL);
			list_append(delete_list, bg_record);
			continue;
		}

		/* keep track of the smallest size that is at least
		   the size of create_size. */
		if (!smallest_bg_record ||
		    (smallest_bg_record->cnode_cnt > bg_record->cnode_cnt))
			smallest_bg_record = bg_record;
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (bg_conf->layout_mode != LAYOUT_DYNAMIC) {
		debug3("running non-dynamic mode");
		/* This should never happen, but just in case... */
		if (delete_list)
			list_destroy(delete_list);

		/* If we found a block that is smaller or equal to a
		   midplane we will just mark it in an error state as
		   opposed to draining the node.
		*/
		if (smallest_bg_record
		    && (smallest_bg_record->cnode_cnt < bg_conf->mp_cnode_cnt)){
			if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) {
				rc = SLURM_NO_CHANGE_IN_DATA;
				goto cleanup;
			}

			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		debug("No block under 1 midplane available for this nodecard.  "
		      "Draining the whole node.");
		if (!node_already_down(mp_name)) {
			if (slurmctld_locked)
				drain_nodes(mp_name, reason,
					    slurm_get_slurm_user_id());
			else
				slurm_drain_nodes(mp_name, reason,
						  slurm_get_slurm_user_id());
		}
		rc = SLURM_SUCCESS;
		goto cleanup;
	}

	/* below is only for Dynamic mode */

	if (delete_list) {
		int cnt_set = 0;
		bitstr_t *iobitmap = bit_alloc(bg_conf->ionodes_per_mp);
		/* don't lock here since it is handled inside
		   the put_block_in_error_state
		*/
		itr = list_iterator_create(delete_list);
		while ((bg_record = list_next(itr))) {
			debug2("combining smaller than nodecard "
			       "dynamic block %s",
			       bg_record->bg_block_id);
			while (bg_record->job_running > NO_JOB_RUNNING)
				sleep(1);

			bit_or(iobitmap, bg_record->ionode_bitmap);
			cnt_set++;
		}
		list_iterator_destroy(itr);
		list_destroy(delete_list);
		if (!cnt_set) {
			FREE_NULL_BITMAP(iobitmap);
			rc = SLURM_ERROR;
			goto cleanup;
		}
		/* set the start to be the same as the start of the
		   ionode_bitmap.  If no ionodes set (not a small
		   block) set io_start = 0. */
		if ((io_start = bit_ffs(iobitmap)) == -1) {
			io_start = 0;
			if (create_size > bg_conf->nodecard_cnode_cnt)
				blockreq.small128 = 4;
			else
				blockreq.small32 = 16;
		} else if (create_size <= bg_conf->nodecard_cnode_cnt)
			blockreq.small32 = 1;
		else
			/* this should never happen */
			blockreq.small128 = 1;

		FREE_NULL_BITMAP(iobitmap);
	} else if (smallest_bg_record) {
		debug2("smallest dynamic block is %s",
		       smallest_bg_record->bg_block_id);
		if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) {
			rc = SLURM_NO_CHANGE_IN_DATA;
			goto cleanup;
		}

		while (smallest_bg_record->job_running > NO_JOB_RUNNING)
			sleep(1);

		if (smallest_bg_record->cnode_cnt == create_size) {
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		if (create_size > smallest_bg_record->cnode_cnt) {
			/* we should never get here.  This means we
			 * have a create_size that is bigger than a
			 * block that is already made.
			 */
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}
		debug3("node count is %d", smallest_bg_record->cnode_cnt);
		switch(smallest_bg_record->cnode_cnt) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small32 = 2;
			break;
		case 256:
			blockreq.small32 = 8;
			break;
#endif
		case 128:
			blockreq.small32 = 4;
			break;
		case 512:
		default:
			blockreq.small32 = 16;
			break;
		}

		if (create_size != bg_conf->nodecard_cnode_cnt) {
			blockreq.small128 = blockreq.small32 / 4;
			blockreq.small32 = 0;
			io_start = 0;
		} else if ((io_start =
			    bit_ffs(smallest_bg_record->ionode_bitmap)) == -1)
			/* set the start to be the same as the start of the
			   ionode_bitmap.  If no ionodes set (not a small
			   block) set io_start = 0. */
			io_start = 0;
	} else {
		switch(create_size) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small64 = 8;
			break;
		case 256:
			blockreq.small256 = 2;
			break;
#endif
		case 32:
			blockreq.small32 = 16;
			break;
		case 128:
			blockreq.small128 = 4;
			break;
		case 512:
			if (!node_already_down(mp_name)) {
				char *reason = "select_bluegene: nodecard down";
				if (slurmctld_locked)
					drain_nodes(mp_name, reason,
						    slurm_get_slurm_user_id());
				else
					slurm_drain_nodes(
						mp_name, reason,
						slurm_get_slurm_user_id());
			}
			rc = SLURM_SUCCESS;
			goto cleanup;
			break;
		default:
			error("Unknown create size of %d", create_size);
			break;
		}
		/* since we don't have a block in this midplane
		   we need to start at the beginning. */
		io_start = 0;
		/* we also need a bg_block to pretend to be the
		   smallest block that takes up the entire midplane. */
	}


	/* Here we need to add blocks that take up nodecards on this
	   midplane.  Since Slurm only keeps track of midplanes
	   natively, this is the only way to handle this case.
	*/
	requests = list_create(destroy_bg_record);
	add_bg_record(requests, NULL, &blockreq, 1, io_start);

	slurm_mutex_lock(&block_state_mutex);
	delete_list = list_create(NULL);
	while ((bg_record = list_pop(requests))) {
		itr = list_iterator_create(bg_lists->main);
		while ((found_record = list_next(itr))) {
			if (!blocks_overlap(bg_record, found_record))
				continue;
			list_push(delete_list, found_record);
			list_remove(itr);
		}
		list_iterator_destroy(itr);

		/* we need to add this record since it doesn't exist */
		if (bridge_block_create(bg_record) == SLURM_ERROR) {
			destroy_bg_record(bg_record);
			error("down_sub_node_blocks: "
			      "unable to configure block in api");
			continue;
		}

		debug("adding block %s to fill in small blocks "
		      "around bad nodecards",
		      bg_record->bg_block_id);
		print_bg_record(bg_record);
		list_append(bg_lists->main, bg_record);
		if (bit_overlap(bg_record->ionode_bitmap,
				tmp_record.ionode_bitmap)) {
			/* here we know the error block doesn't exist
			   so just set the state here */
			slurm_mutex_unlock(&block_state_mutex);
			rc = put_block_in_error_state(bg_record, reason);
			slurm_mutex_lock(&block_state_mutex);
		}
	}
	list_destroy(requests);

	if (delete_list) {
		slurm_mutex_unlock(&block_state_mutex);
		free_block_list(NO_VAL, delete_list, 0, 0);
		list_destroy(delete_list);
	}
	slurm_mutex_lock(&block_state_mutex);
	sort_bg_record_inc_size(bg_lists->main);
	slurm_mutex_unlock(&block_state_mutex);
	last_bg_update = time(NULL);

cleanup:
	FREE_NULL_BITMAP(tmp_record.mp_bitmap);
	FREE_NULL_BITMAP(tmp_record.ionode_bitmap);

	return rc;

}
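
The two switch statements in the function above (and the small128 = blockreq.small32 / 4 adjustment) encode plain integer division: a block of N c-nodes holds N/32 of the 32-c-node sub-blocks and N/128 of the 128-c-node ones. A minimal stand-alone sketch of that relationship, with illustrative names not taken from the plugin:

#include <stdio.h>

/* How many sub-blocks of sub_size c-nodes fit in a block of
 * cnode_cnt c-nodes; the switch above hard-codes these quotients
 * for the known BG block sizes (64, 128, 256, 512). */
static int sub_blocks(int cnode_cnt, int sub_size)
{
	return cnode_cnt / sub_size;
}

int main(void)
{
	int sizes[] = { 64, 128, 256, 512 };

	for (int i = 0; i < 4; i++)
		printf("%3d c-nodes -> %2d x 32, %d x 128\n", sizes[i],
		       sub_blocks(sizes[i], 32), sub_blocks(sizes[i], 128));
	return 0;
}
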
Example #21
/* Process incoming RPCs. Meant to execute as a pthread */
extern void *rpc_mgr(void *no_data)
{
	pthread_attr_t thread_attr_rpc_req;
	slurm_fd_t sockfd, newsockfd;
	int i, retry_cnt, sigarray[] = {SIGUSR1, 0};
	slurm_addr_t cli_addr;
	slurmdbd_conn_t *conn_arg = NULL;

	slurm_mutex_lock(&thread_count_lock);
	master_thread_id = pthread_self();
	slurm_mutex_unlock(&thread_count_lock);

	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

	/* threads to process individual RPC's are detached */
	slurm_attr_init(&thread_attr_rpc_req);
	if (pthread_attr_setdetachstate
	    (&thread_attr_rpc_req, PTHREAD_CREATE_DETACHED))
		fatal("pthread_attr_setdetachstate %m");

	/* initialize port for RPCs */
	if ((sockfd = slurm_init_msg_engine_port(get_dbd_port()))
	    == SLURM_SOCKET_ERROR)
		fatal("slurm_init_msg_engine_port error %m");

	/* Prepare to catch SIGUSR1 to interrupt accept().
	 * This signal is generated by the slurmdbd signal
	 * handler thread upon receipt of SIGABRT, SIGINT,
	 * or SIGTERM. That thread does all processing of
	 * all signals. */
	xsignal(SIGUSR1, _sig_handler);
	xsignal_unblock(sigarray);

	/*
	 * Process incoming RPCs until told to shutdown
	 */
	while ((i = _wait_for_server_thread()) >= 0) {
		/*
		 * accept() is needed for the stream implementation; in the
		 * message implementation it is a no-op that just passes
		 * sockfd through to newsockfd
		 */
		if ((newsockfd = slurm_accept_msg_conn(sockfd,
						       &cli_addr)) ==
		    SLURM_SOCKET_ERROR) {
			_free_server_thread((pthread_t) 0);
			if (errno != EINTR)
				error("slurm_accept_msg_conn: %m");
			continue;
		}
		fd_set_nonblocking(newsockfd);

		conn_arg = xmalloc(sizeof(slurmdbd_conn_t));
		conn_arg->newsockfd = newsockfd;
		slurm_get_ip_str(&cli_addr, &conn_arg->orig_port,
				 conn_arg->ip, sizeof(conn_arg->ip));
		retry_cnt = 0;
		while (pthread_create(&slave_thread_id[i],
				      &thread_attr_rpc_req,
				      _service_connection,
				      (void *) conn_arg)) {
			if (retry_cnt > 0) {
				error("pthread_create failure, "
				      "aborting RPC: %m");
				close(newsockfd);
				break;
			}
			error("pthread_create failure: %m");
			retry_cnt++;
			usleep(1000);	/* retry in 1 msec */
		}
	}

	debug3("rpc_mgr shutting down");
	slurm_attr_destroy(&thread_attr_rpc_req);
	(void) slurm_shutdown_msg_engine(sockfd);
	_wait_for_thread_fini();
	pthread_exit((void *) 0);
	return NULL;
}
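
rpc_mgr above is the classic detached thread-per-connection server: accept a connection, package its state, and hand it to a freshly created detached pthread so nothing ever has to join it. A stand-alone sketch of the same shape using raw POSIX sockets in place of Slurm's message engine; serve() is a hypothetical stand-in for _service_connection, and 6819 is slurmdbd's default port:

#include <errno.h>
#include <netinet/in.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <unistd.h>

static void *serve(void *arg)
{
	int fd = *(int *) arg;

	free(arg);
	/* ... read the RPC and reply here ... */
	close(fd);
	return NULL;
}

int main(void)
{
	struct sockaddr_in sin = { .sin_family = AF_INET,
				   .sin_port = htons(6819),
				   .sin_addr.s_addr = htonl(INADDR_ANY) };
	int sockfd = socket(AF_INET, SOCK_STREAM, 0);
	pthread_attr_t attr;

	if (sockfd < 0 || bind(sockfd, (struct sockaddr *) &sin,
			       sizeof(sin)) < 0 || listen(sockfd, 128) < 0) {
		perror("listen");
		return 1;
	}

	/* RPC threads are detached: their resources are reclaimed as
	 * soon as they exit, so no one joins them. */
	pthread_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);

	for (;;) {
		int *fd = malloc(sizeof(*fd));
		pthread_t tid;

		if (!fd)
			break;
		if ((*fd = accept(sockfd, NULL, NULL)) < 0) {
			free(fd);
			if (errno == EINTR)	/* interrupted: re-check */
				continue;
			perror("accept");
			break;
		}
		if (pthread_create(&tid, &attr, serve, fd)) {
			perror("pthread_create");
			close(*fd);
			free(fd);
		}
	}
	close(sockfd);
	pthread_attr_destroy(&attr);
	return 0;
}
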
Example #22
extern int create_full_system_block(List bg_found_block_list)
{
	int rc = SLURM_SUCCESS;
	ListIterator itr;
	bg_record_t *bg_record = NULL;
	char *name = NULL;
	List records = NULL;
	uint16_t geo[SYSTEM_DIMENSIONS];
	int i;
	select_ba_request_t blockreq;
	List results = NULL;
	struct part_record *part_ptr = NULL;
	bitstr_t *bitmap = bit_alloc(node_record_count);
	static int *dims = NULL;
	bool larger = 0;
	char start_char[SYSTEM_DIMENSIONS+1];
	char geo_char[SYSTEM_DIMENSIONS+1];

	if (!dims) {
		dims = select_g_ba_get_dims();
		memset(start_char, 0, sizeof(start_char));
		memset(geo_char, 0, sizeof(geo_char));
	}

	/* Locks are already in place to protect part_list here */
	itr = list_iterator_create(part_list);
	while ((part_ptr = list_next(itr))) {
		/* we only want to use mps that are in
		 * partitions
		 */
		if (!part_ptr->node_bitmap) {
			debug4("Partition %s doesn't have any nodes in it.",
			       part_ptr->name);
			continue;
		}
		bit_or(bitmap, part_ptr->node_bitmap);
	}
	list_iterator_destroy(itr);

	bit_not(bitmap);
	if (bit_ffs(bitmap) != -1) {
		error("We don't have the entire system covered by partitions, "
		      "can't create full system block");
		FREE_NULL_BITMAP(bitmap);
		return SLURM_ERROR;
	}
	FREE_NULL_BITMAP(bitmap);

	/* Here we are adding a block for the entire machine,
	   just in case it isn't in the bluegene.conf file.
	*/
	slurm_mutex_lock(&block_state_mutex);

	for (i=0; i<SYSTEM_DIMENSIONS; i++) {
		geo[i] = dims[i] - 1;
		if (geo[i] > 0)
			larger = 1;
		geo_char[i] = alpha_num[geo[i]];
		start_char[i] = alpha_num[0];
	}

	if (!larger)
		name = xstrdup_printf("%s%s",
				      bg_conf->slurm_node_prefix, start_char);
	else
		name = xstrdup_printf("%s[%sx%s]",
				      bg_conf->slurm_node_prefix,
				      start_char, geo_char);

	if (bg_found_block_list) {
		itr = list_iterator_create(bg_found_block_list);
		while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) {
			if (!strcmp(name, bg_record->mp_str)) {
				xfree(name);
				list_iterator_destroy(itr);
				/* don't create total already there */
				goto no_total;
			}
		}
		list_iterator_destroy(itr);
	} else {
		error("create_full_system_block: no bg_found_block_list 2");
	}

	if (bg_lists->main) {
		itr = list_iterator_create(bg_lists->main);
		while ((bg_record = (bg_record_t *) list_next(itr))
		       != NULL) {
			/* If we are deleting old blocks they will
			   have been added to the main list, so we
			   want to skip over them.
			*/
			if (bg_record->free_cnt)
				continue;

			if (!strcmp(name, bg_record->mp_str)) {
				xfree(name);
				list_iterator_destroy(itr);
				/* don't create total already there */
				goto no_total;
			}
		}
		list_iterator_destroy(itr);
	} else {
		xfree(name);
		error("create_overlapped_blocks: no bg_lists->main 3");
		rc = SLURM_ERROR;
		goto no_total;
	}

	records = list_create(destroy_bg_record);

	memset(&blockreq, 0, sizeof(select_ba_request_t));
	blockreq.save_name = name;
	for (i=0; i<SYSTEM_DIMENSIONS; i++)
		blockreq.conn_type[i] = SELECT_TORUS;

	add_bg_record(records, NULL, &blockreq, 0 , 0);
	xfree(name);

	bg_record = (bg_record_t *) list_pop(records);
	if (!bg_record) {
		error("Nothing was returned from full system create");
		rc = SLURM_ERROR;
		goto no_total;
	}
	reset_ba_system(false);
	for(i=0; i<SYSTEM_DIMENSIONS; i++) {
		geo_char[i] = alpha_num[bg_record->geo[i]];
		start_char[i] = alpha_num[bg_record->start[i]];
	}
	debug2("adding %s %s %s",  bg_record->mp_str, start_char, geo_char);
	if (bg_record->ba_mp_list)
		list_flush(bg_record->ba_mp_list);
	else
		bg_record->ba_mp_list = list_create(destroy_ba_mp);
#ifdef HAVE_BGQ
	results = list_create(destroy_ba_mp);
#else
	results = list_create(NULL);
#endif
	name = set_bg_block(results,
			    bg_record->start,
			    bg_record->geo,
			    bg_record->conn_type);
	if (!name) {
		error("I was unable to make the full system block.");
		list_destroy(results);
		slurm_mutex_unlock(&block_state_mutex);
		return SLURM_ERROR;
	}
	xfree(name);
	if (bg_record->ba_mp_list)
		list_destroy(bg_record->ba_mp_list);
#ifdef HAVE_BGQ
	bg_record->ba_mp_list = results;
	results = NULL;
#else
	bg_record->ba_mp_list = list_create(destroy_ba_mp);
	copy_node_path(results, &bg_record->ba_mp_list);
	list_destroy(results);
#endif
	if ((rc = bridge_block_create(bg_record)) == SLURM_ERROR) {
		error("create_full_system_block: "
		      "unable to configure block in api");
		destroy_bg_record(bg_record);
		goto no_total;
	}

	print_bg_record(bg_record);
	list_append(bg_lists->main, bg_record);

no_total:
	if (records)
		list_destroy(records);
	slurm_mutex_unlock(&block_state_mutex);
	return rc;
}
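
The name built above concatenates one alphanumeric digit per dimension: a bare "prefix000" for a single-midplane machine, "prefix[000xXYZ]" for anything larger. A small sketch of that formatting, with an assumed 4x4x4 machine and an illustrative node prefix; alpha_num mirrors the digit table the plugin uses:

#include <stdio.h>

static const char alpha_num[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";

int main(void)
{
	int dims[] = { 4, 4, 4 };	/* assumed machine size */
	char start[4] = "", geo[4] = "";
	int larger = 0;

	for (int i = 0; i < 3; i++) {
		int g = dims[i] - 1;	/* last coordinate per dimension */
		if (g > 0)
			larger = 1;
		geo[i] = alpha_num[g];
		start[i] = alpha_num[0];
	}
	if (!larger)
		printf("bg%s\n", start);
	else
		printf("bg[%sx%s]\n", start, geo);	/* e.g. bg[000x333] */
	return 0;
}
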
Example #23
extern int mysql_db_get_db_connection(mysql_conn_t *mysql_conn, char *db_name,
				      mysql_db_info_t *db_info)
{
	int rc = SLURM_SUCCESS;
	bool storage_init = false;
	char *db_host = db_info->host;

	xassert(mysql_conn);

	slurm_mutex_lock(&mysql_conn->lock);

	if (!(mysql_conn->db_conn = mysql_init(mysql_conn->db_conn))) {
		slurm_mutex_unlock(&mysql_conn->lock);
		fatal("mysql_init failed: %s",
		      mysql_error(mysql_conn->db_conn));
	} else {
		/* If this ever changes you will need to alter
		 * src/common/slurmdbd_defs.c function _send_init_msg to
		 * handle a different timeout when polling for the
		 * response.
		 */
		unsigned int my_timeout = 30;
#ifdef MYSQL_OPT_RECONNECT
		my_bool reconnect = 1;
		/* make sure reconnect is on */
		mysql_options(mysql_conn->db_conn, MYSQL_OPT_RECONNECT,
			      &reconnect);
#endif
		mysql_options(mysql_conn->db_conn, MYSQL_OPT_CONNECT_TIMEOUT,
			      (char *)&my_timeout);
		while (!storage_init) {
			if (!mysql_real_connect(mysql_conn->db_conn, db_host,
					        db_info->user, db_info->pass,
					        db_name, db_info->port, NULL,
					        CLIENT_MULTI_STATEMENTS)) {
				int err = mysql_errno(mysql_conn->db_conn);
				if (err == ER_BAD_DB_ERROR) {
					debug("Database %s not created.  "
					      "Creating", db_name);
					rc = _create_db(db_name, db_info);
				} else {
					const char *err_str = mysql_error(
						mysql_conn->db_conn);
					error("mysql_real_connect failed: "
					      "%d %s",
					      err, err_str);
					if ((db_host == db_info->host)
					    && db_info->backup) {
						db_host = db_info->backup;
						continue;
					}

					rc = ESLURM_DB_CONNECTION;
					mysql_close(mysql_conn->db_conn);
					mysql_conn->db_conn = NULL;
					break;
				}
			} else {
				storage_init = true;
				if (mysql_conn->rollback)
					mysql_autocommit(
						mysql_conn->db_conn, 0);
				rc = _mysql_query_internal(
					mysql_conn->db_conn,
					"SET session sql_mode='ANSI_QUOTES,"
					"NO_ENGINE_SUBSTITUTION';");
			}
		}
	}
	slurm_mutex_unlock(&mysql_conn->lock);
	errno = rc;
	return rc;
}
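
The connect loop above falls back to db_info->backup when the primary host refuses the connection. A condensed sketch of that failover shape against the stock MySQL C API; the helper name, host list, and credentials are illustrative:

#include <mysql/mysql.h>
#include <stdio.h>

static MYSQL *connect_with_failover(const char *hosts[], int nhosts,
				    const char *user, const char *pass,
				    const char *db, unsigned int port)
{
	unsigned int timeout = 30;	/* seconds, as in the plugin */
	MYSQL *conn = mysql_init(NULL);

	if (!conn)
		return NULL;
	mysql_options(conn, MYSQL_OPT_CONNECT_TIMEOUT, &timeout);

	for (int i = 0; i < nhosts; i++) {
		if (mysql_real_connect(conn, hosts[i], user, pass, db,
				       port, NULL, CLIENT_MULTI_STATEMENTS))
			return conn;	/* connected */
		fprintf(stderr, "connect to %s failed: %u %s\n", hosts[i],
			mysql_errno(conn), mysql_error(conn));
	}
	mysql_close(conn);	/* every host failed */
	return NULL;
}
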
Example #24
/*
 * create_defined_blocks - create the static blocks that will be used
 * for scheduling, all partitions must be able to be created and booted
 * at once.
 * IN - int overlapped, 1 if partitions are to be overlapped, 0 if they are
 * static.
 * RET - success of fitting all configurations
 */
extern int create_defined_blocks(bg_layout_t overlapped,
				 List bg_found_block_list)
{
	int rc = SLURM_SUCCESS;

	ListIterator itr;
	bg_record_t *bg_record = NULL;
	int i;
	uint16_t geo[SYSTEM_DIMENSIONS];
	char temp[256];
	struct part_record *part_ptr = NULL;
	bitstr_t *usable_mp_bitmap = bit_alloc(node_record_count);

	/* Locks are already in place to protect part_list here */
	itr = list_iterator_create(part_list);
	while ((part_ptr = list_next(itr))) {
		/* we only want to use mps that are in
		 * partitions
		 */
		if (!part_ptr->node_bitmap) {
			debug4("Partition %s doesn't have any nodes in it.",
			       part_ptr->name);
			continue;
		}
		bit_or(usable_mp_bitmap, part_ptr->node_bitmap);
	}
	list_iterator_destroy(itr);

	if (bit_ffs(usable_mp_bitmap) == -1) {
		fatal("We don't have any nodes in any partitions.  "
		      "Can't create blocks.  "
		      "Please check your slurm.conf.");
	}

	slurm_mutex_lock(&block_state_mutex);
	reset_ba_system(false);
	ba_set_removable_mps(usable_mp_bitmap, 1);
	if (bg_lists->main) {
		itr = list_iterator_create(bg_lists->main);
		while ((bg_record = list_next(itr))) {
			/* If we are deleting old blocks they will
			   have been added to the main list, so we
			   want to skip over them.
			*/
			if (bg_record->free_cnt)
				continue;
			if (bg_record->mp_count > 0
			    && !bg_record->full_block
			    && bg_record->cpu_cnt >= bg_conf->cpus_per_mp) {
				char *name = NULL;
				char start_char[SYSTEM_DIMENSIONS+1];
				char geo_char[SYSTEM_DIMENSIONS+1];

				if (overlapped == LAYOUT_OVERLAP) {
					reset_ba_system(false);
					ba_set_removable_mps(usable_mp_bitmap,
							     1);
				}

				/* we want the mps that aren't
				 * in this record to mark them as used
				 */
				if (ba_set_removable_mps(
					    bg_record->mp_bitmap, 1)
				    != SLURM_SUCCESS)
					fatal("It doesn't seem we have a "
					      "bitmap for %s",
					      bg_record->bg_block_id);

				for (i=0; i<SYSTEM_DIMENSIONS; i++) {
					geo[i] = bg_record->geo[i];
					start_char[i] = alpha_num[
						bg_record->start[i]];
					geo_char[i] = alpha_num[geo[i]];
				}
				start_char[i] = '\0';
				geo_char[i] = '\0';

				debug2("adding %s %s %s",
				       bg_record->mp_str,
				       start_char, geo_char);
				if (bg_record->ba_mp_list
				    && list_count(bg_record->ba_mp_list)) {
					if ((rc = check_and_set_mp_list(
						     bg_record->ba_mp_list))
					    != SLURM_SUCCESS) {
						error("Something happened in "
						      "the load of %s.  "
						      "Did you use smap to "
						      "make the "
						      "bluegene.conf file?",
						       bg_record->bg_block_id);
						break;
					}
					ba_reset_all_removed_mps();
				} else {
#ifdef HAVE_BGQ
					List results =
						list_create(destroy_ba_mp);
#else
					List results = list_create(NULL);
#endif
					name = set_bg_block(
						results,
						bg_record->start,
						geo,
						bg_record->conn_type);
					ba_reset_all_removed_mps();
					if (!name) {
						error("I was unable to "
						      "make the "
						      "requested block.");
						list_destroy(results);
						rc = SLURM_ERROR;
						break;
					}

					snprintf(temp, sizeof(temp), "%s%s",
						 bg_conf->slurm_node_prefix,
						 name);

					xfree(name);
					if (strcmp(temp, bg_record->mp_str)) {
						fatal("given list of %s "
						      "but allocated %s, "
						      "your order might be "
						      "wrong in bluegene.conf",
						      bg_record->mp_str,
						      temp);
					}
					if (bg_record->ba_mp_list)
						list_destroy(
							bg_record->ba_mp_list);
#ifdef HAVE_BGQ
					bg_record->ba_mp_list = results;
					results = NULL;
#else
					bg_record->ba_mp_list =
						list_create(destroy_ba_mp);
					copy_node_path(results,
						       &bg_record->ba_mp_list);
					list_destroy(results);
#endif
				}
			}
			if (!block_exist_in_list(
				    bg_found_block_list, bg_record)) {
				if (bg_record->full_block) {
					/* if this is defined we need
					   to remove it since we are
					   going to try to create it
					   later on overlap systems
					   this doesn't matter, but
					   since we don't clear the
					   table on static mode we
					   can't do it here or it just
					   won't work since other
					   wires will be or are
					   already set
					*/
					list_remove(itr);
					continue;
				}
				if ((rc = bridge_block_create(bg_record))
				    != SLURM_SUCCESS)
					break;
				print_bg_record(bg_record);
			}
		}
		list_iterator_destroy(itr);
		if (rc != SLURM_SUCCESS)
			goto end_it;
	} else {
		error("create_defined_blocks: no bg_lists->main 2");
		rc = SLURM_ERROR;
		goto end_it;
	}
	slurm_mutex_unlock(&block_state_mutex);
	create_full_system_block(bg_found_block_list);

	slurm_mutex_lock(&block_state_mutex);
	sort_bg_record_inc_size(bg_lists->main);

end_it:
	ba_reset_all_removed_mps();
	FREE_NULL_BITMAP(usable_mp_bitmap);
	slurm_mutex_unlock(&block_state_mutex);

#ifdef _PRINT_BLOCKS_AND_EXIT
	if (bg_lists->main) {
		itr = list_iterator_create(bg_lists->main);
		debug("\n\n");
		while ((found_record = (bg_record_t *) list_next(itr))
		       != NULL) {
			/* If we are deleting old blocks they will
			   have been added to the main list, so we
			   want to skip over them.
			*/
			if (found_record->free_cnt)
				continue;
			print_bg_record(found_record);
		}
		list_iterator_destroy(itr);
	} else {
		error("create_defined_blocks: no bg_lists->main 5");
	}
 	exit(0);
#endif	/* _PRINT_BLOCKS_AND_EXIT */
	return rc;
}
Example #25
int main(int argc, char *argv[])
{
	log_options_t log_opts = LOG_OPTS_INITIALIZER;
	char *features, *save_ptr = NULL, *tok;
	update_node_msg_t node_msg;
	int rc =  SLURM_SUCCESS;
	hostlist_t hl = NULL;
	char *node_name;
	pthread_attr_t attr_work;
	pthread_t thread_work = 0;

	prog_name = argv[0];
	_read_config();
	log_opts.stderr_level = LOG_LEVEL_QUIET;
	log_opts.syslog_level = LOG_LEVEL_QUIET;
	if (slurm_get_debug_flags() & DEBUG_FLAG_NODE_FEATURES)
		log_opts.logfile_level += 3;
	(void) log_init(argv[0], log_opts, LOG_DAEMON, log_file);

	/* Parse the MCDRAM and NUMA boot options */
	if (argc == 3) {
		features = xstrdup(argv[2]);
		tok = strtok_r(features, ",", &save_ptr);
		while (tok) {
			printf("%s\n", tok);
			if (!strcasecmp(tok, "a2a")  ||
			    !strcasecmp(tok, "hemi") ||
			    !strcasecmp(tok, "quad") ||
			    !strcasecmp(tok, "snc2") ||
			    !strcasecmp(tok, "snc4")) {
				xfree(mcdram_mode);
				mcdram_mode = xstrdup(tok);
			} else if (!strcasecmp(tok, "cache")  ||
				   !strcasecmp(tok, "equal") ||
				   !strcasecmp(tok, "flat")) {
				xfree(numa_mode);
				numa_mode = xstrdup(tok);
			}
			tok = strtok_r(NULL, ",", &save_ptr);
		}
		xfree(features);
	}

	/* Spawn threads to change MCDRAM and NUMA states and start node
	 * reboot process */
	if ((hl = hostlist_create(argv[1])) == NULL) {
		error("%s: Invalid hostlist (%s)", prog_name, argv[1]);
		exit(2);
	}
	node_bitmap = bit_alloc(100000);
	while ((node_name = hostlist_pop(hl))) {
		slurm_mutex_lock(&thread_cnt_mutex);
		while (1) {
			if (thread_cnt <= MAX_THREADS) {
				thread_cnt++;
				break;
			} else {	/* wait for state change and retry */
				pthread_cond_wait(&thread_cnt_cond,
						  &thread_cnt_mutex);
			}
		}
		slurm_mutex_unlock(&thread_cnt_mutex);

		slurm_attr_init(&attr_work);
		(void) pthread_attr_setdetachstate
			(&attr_work, PTHREAD_CREATE_DETACHED);
		if (pthread_create(&thread_work, &attr_work, _node_update,
				   (void *) node_name)) {
			_node_update((void *) node_name);
		}
		slurm_attr_destroy(&attr_work);
	}

	/* Wait for work threads to complete */
	slurm_mutex_lock(&thread_cnt_mutex);
	while (1) {
		if (thread_cnt == 0)
			break;
		else	/* wait for state change and retry */
			pthread_cond_wait(&thread_cnt_cond, &thread_cnt_mutex);
	}
	slurm_mutex_unlock(&thread_cnt_mutex);
	hostlist_destroy(hl);
	xfree(mcdram_mode);
	xfree(numa_mode);

	/* Wait for all nodes to change state to "on" */
	_wait_all_nodes_on();

	if ((argc == 3) && !syscfg_path) {
		slurm_init_update_node_msg(&node_msg);
		node_msg.node_names = argv[1];
		node_msg.features_act = argv[2];
		rc = slurm_update_node(&node_msg);
	}

	if (rc == SLURM_SUCCESS) {
		exit(0);
	} else {
		error("%s: slurm_update_node(\'%s\', \'%s\'): %s\n",
		      prog_name, argv[1], argv[2],
		      slurm_strerror(slurm_get_errno()));
		exit(1);
	}
}
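
The two while (1) blocks around thread_cnt above implement a bounded-concurrency throttle: a mutex-protected counter, a condition variable that finished workers signal, and a spawner that sleeps until a slot opens. A self-contained sketch of the pattern; the MAX_THREADS value and the per-node work are assumed:

#include <pthread.h>
#include <unistd.h>

#define MAX_THREADS 8

static pthread_mutex_t cnt_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cnt_cond  = PTHREAD_COND_INITIALIZER;
static int thread_cnt;

static void *worker(void *arg)
{
	sleep(1);	/* stand-in for the real per-node work */

	/* done: release the slot and wake anyone waiting on it */
	pthread_mutex_lock(&cnt_mutex);
	thread_cnt--;
	pthread_cond_broadcast(&cnt_cond);
	pthread_mutex_unlock(&cnt_mutex);
	return NULL;
}

int main(void)
{
	for (int i = 0; i < 100; i++) {
		pthread_t tid;

		/* block until a slot is free */
		pthread_mutex_lock(&cnt_mutex);
		while (thread_cnt >= MAX_THREADS)
			pthread_cond_wait(&cnt_cond, &cnt_mutex);
		thread_cnt++;
		pthread_mutex_unlock(&cnt_mutex);

		if (pthread_create(&tid, NULL, worker, NULL) == 0)
			pthread_detach(tid);
		else {
			pthread_mutex_lock(&cnt_mutex);
			thread_cnt--;	/* spawn failed: give slot back */
			pthread_mutex_unlock(&cnt_mutex);
		}
	}

	/* wait for the stragglers to finish */
	pthread_mutex_lock(&cnt_mutex);
	while (thread_cnt > 0)
		pthread_cond_wait(&cnt_cond, &cnt_mutex);
	pthread_mutex_unlock(&cnt_mutex);
	return 0;
}
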
Example #26
/* This needs to happen because the 2.1 code doesn't have enough
 * smarts to figure out that it isn't adding a default wckey when it is
 * just adding a new wckey for a user that has never been on the
 * cluster before.
 */
static int _make_sure_users_have_default(
	mysql_conn_t *mysql_conn, List user_list)
{
	char *query = NULL, *cluster = NULL, *user = NULL;
	ListIterator itr = NULL, clus_itr = NULL;
	int rc = SLURM_SUCCESS;

	if (!user_list)
		return SLURM_SUCCESS;

	slurm_mutex_lock(&as_mysql_cluster_list_lock);

	clus_itr = list_iterator_create(as_mysql_cluster_list);
	itr = list_iterator_create(user_list);

	while ((user = list_next(itr))) {
		while ((cluster = list_next(clus_itr))) {
			MYSQL_RES *result = NULL;
			MYSQL_ROW row;
			char *wckey = NULL;
			/* only look at non * and non deleted ones */
			query = xstrdup_printf(
				"select distinct is_def, wckey_name from "
				"\"%s_%s\" where user='******' and wckey_name "
				"not like '*%%' and deleted=0 FOR UPDATE;",
				cluster, wckey_table, user);
			debug4("%d(%s:%d) query\n%s",
			       mysql_conn->conn, THIS_FILE, __LINE__, query);
			if (!(result = mysql_db_query_ret(
				      mysql_conn, query, 0))) {
				xfree(query);
				error("couldn't query the database");
				rc = SLURM_ERROR;
				break;
			}
			xfree(query);
			/* Check to see if the user is even added to
			   the cluster.
			*/
			if (!mysql_num_rows(result)) {
				mysql_free_result(result);
				continue;
			}
			while ((row = mysql_fetch_row(result))) {
				if (row[0][0] == '1')
					break;
				if (!wckey)
					wckey = xstrdup(row[1]);
			}
			mysql_free_result(result);

			/* we found one so just continue */
			if (row || !wckey) {
				xfree(wckey);
				continue;
			}
			query = xstrdup_printf(
				"update \"%s_%s\" set is_def=1 where "
				"user='******' and wckey_name='%s';",
				cluster, wckey_table, user, wckey);
			xfree(wckey);
			if (debug_flags & DEBUG_FLAG_DB_WCKEY)
				DB_DEBUG(mysql_conn->conn, "query\n%s", query);
			rc = mysql_db_query(mysql_conn, query);
			xfree(query);
			if (rc != SLURM_SUCCESS) {
				error("problem with update query");
				rc = SLURM_ERROR;
				break;
			}
		}
		if (rc != SLURM_SUCCESS)
			break;
		list_iterator_reset(clus_itr);
	}
	list_iterator_destroy(itr);
	list_iterator_destroy(clus_itr);
	slurm_mutex_unlock(&as_mysql_cluster_list_lock);

	return rc;
}
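
The function above is a read-check-repair pass: lock the user's wckey rows with FOR UPDATE, and only issue the UPDATE when no row already has is_def set. A condensed sketch of that pattern against the plain MySQL C API; the helper and buffer sizes are illustrative, the table layout mirrors the queries above, and the double-quoted table names assume the ANSI_QUOTES sql_mode set up in Example #23:

#include <mysql/mysql.h>
#include <stdio.h>

static int ensure_default_wckey(MYSQL *db, const char *cluster,
				const char *user)
{
	char query[512];
	MYSQL_RES *result;
	MYSQL_ROW row;
	char wckey[64] = "";
	int have_def = 0;

	snprintf(query, sizeof(query),
		 "select is_def, wckey_name from \"%s_wckey_table\" "
		 "where user='%s' and wckey_name not like '*%%' "
		 "and deleted=0 FOR UPDATE;", cluster, user);
	if (mysql_query(db, query) || !(result = mysql_store_result(db)))
		return -1;

	while ((row = mysql_fetch_row(result))) {
		if (row[0][0] == '1') {		/* found a default */
			have_def = 1;
			break;
		}
		if (!wckey[0])			/* remember the first one */
			snprintf(wckey, sizeof(wckey), "%s", row[1]);
	}
	mysql_free_result(result);

	if (have_def || !wckey[0])
		return 0;	/* already has a default, or no wckeys */

	snprintf(query, sizeof(query),
		 "update \"%s_wckey_table\" set is_def=1 "
		 "where user='%s' and wckey_name='%s';",
		 cluster, user, wckey);
	return mysql_query(db, query) ? -1 : 0;
}
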
Example #27
extern int slurm_jobcomp_log_record ( struct job_record *job_ptr )
{
	int rc = SLURM_SUCCESS;
	char job_rec[1024];
	char usr_str[32], grp_str[32], start_str[32], end_str[32], lim_str[32];
	char select_buf[128], *state_string, *work_dir;
	size_t offset = 0, tot_size, wrote;
	uint32_t job_state;
	uint32_t time_limit;

	if ((log_name == NULL) || (job_comp_fd < 0)) {
		error("JobCompLoc log file %s not open", log_name);
		return SLURM_ERROR;
	}

	slurm_mutex_lock( &file_lock );
	_get_user_name(job_ptr->user_id, usr_str, sizeof(usr_str));
	_get_group_name(job_ptr->group_id, grp_str, sizeof(grp_str));

	if ((job_ptr->time_limit == NO_VAL) && job_ptr->part_ptr)
		time_limit = job_ptr->part_ptr->max_time;
	else
		time_limit = job_ptr->time_limit;
	if (time_limit == INFINITE)
		strcpy(lim_str, "UNLIMITED");
	else {
		snprintf(lim_str, sizeof(lim_str), "%lu",
			 (unsigned long) time_limit);
	}

	if (job_ptr->job_state & JOB_RESIZING) {
		time_t now = time(NULL);
		state_string = job_state_string(job_ptr->job_state);
		if (job_ptr->resize_time) {
			_make_time_str(&job_ptr->resize_time, start_str,
				       sizeof(start_str));
		} else {
			_make_time_str(&job_ptr->start_time, start_str,
				       sizeof(start_str));
		}
		_make_time_str(&now, end_str, sizeof(end_str));
	} else {
		/* Job state will typically have JOB_COMPLETING or JOB_RESIZING
		 * flag set when called. We remove the flags to get the eventual
		 * completion state: JOB_FAILED, JOB_TIMEOUT, etc. */
		job_state = job_ptr->job_state & JOB_STATE_BASE;
		state_string = job_state_string(job_state);
		if (job_ptr->resize_time) {
			_make_time_str(&job_ptr->resize_time, start_str,
				       sizeof(start_str));
		} else if (job_ptr->start_time > job_ptr->end_time) {
			/* Job cancelled while pending and
			 * expected start time is in the future. */
			snprintf(start_str, sizeof(start_str), "Unknown");
		} else {
			_make_time_str(&job_ptr->start_time, start_str,
				       sizeof(start_str));
		}
		_make_time_str(&job_ptr->end_time, end_str, sizeof(end_str));
	}

	if (job_ptr->details && job_ptr->details->work_dir)
		work_dir = job_ptr->details->work_dir;
	else
		work_dir = "unknown";

	select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
		select_buf, sizeof(select_buf), SELECT_PRINT_MIXED);

	snprintf(job_rec, sizeof(job_rec), JOB_FORMAT,
		 (unsigned long) job_ptr->job_id, usr_str,
		 (unsigned long) job_ptr->user_id, grp_str,
		 (unsigned long) job_ptr->group_id, job_ptr->name,
		 state_string, job_ptr->partition, lim_str, start_str,
		 end_str, job_ptr->nodes, job_ptr->node_cnt,
		 job_ptr->total_cpus, work_dir,
		 select_buf);
	tot_size = strlen(job_rec);

	while ( offset < tot_size ) {
		wrote = write(job_comp_fd, job_rec + offset,
			tot_size - offset);
		if (wrote == -1) {
			if (errno == EAGAIN)
				continue;
			else {
				plugin_errno = errno;
				rc = SLURM_ERROR;
				break;
			}
		}
		offset += wrote;
	}
	slurm_mutex_unlock( &file_lock );
	return rc;
}
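
The loop above handles short writes: write() may accept fewer bytes than requested, so the code advances an offset until the whole record is out and retries on EAGAIN. A minimal generic sketch of the same loop, using ssize_t for the return value so the error check does not lean on unsigned wraparound the way the original's size_t wrote does:

#include <errno.h>
#include <unistd.h>

/* Write all len bytes of buf to fd, retrying transient failures.
 * Returns 0 on success, -1 on a real error. */
static int write_all(int fd, const char *buf, size_t len)
{
	size_t off = 0;

	while (off < len) {
		ssize_t n = write(fd, buf + off, len - off);

		if (n < 0) {
			if (errno == EAGAIN || errno == EINTR)
				continue;	/* transient: retry */
			return -1;		/* real error */
		}
		off += (size_t) n;
	}
	return 0;
}
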
Example #28
extern List as_mysql_modify_wckeys(mysql_conn_t *mysql_conn,
				   uint32_t uid,
				   slurmdb_wckey_cond_t *wckey_cond,
				   slurmdb_wckey_rec_t *wckey)
{
	List ret_list = NULL;
	int rc = SLURM_SUCCESS;
	char *extra = NULL, *object = NULL, *vals = NULL;
	char *user_name = NULL;
	List use_cluster_list = as_mysql_cluster_list;
	ListIterator itr;

	if (!wckey_cond || !wckey) {
		error("we need something to change");
		return NULL;
	}

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return NULL;

	if (!is_user_min_admin_level(mysql_conn, uid, SLURMDB_ADMIN_OPERATOR)) {
		if (wckey_cond->user_list
		    && (list_count(wckey_cond->user_list) == 1)) {
			uid_t pw_uid;
			char *name;
			name = list_peek(wckey_cond->user_list);
		        if ((uid_from_string (name, &pw_uid) >= 0)
			    && (pw_uid == uid)) {
				/* Make sure they aren't trying to
				   change something else and then set
				   this association as a default.
				*/
				slurmdb_init_wckey_rec(wckey, 1);
				wckey->is_def = 1;
				goto is_same_user;
			}
		}

		error("Only admins can modify wckeys");
		errno = ESLURM_ACCESS_DENIED;
		return NULL;
	}
is_same_user:

	(void) _setup_wckey_cond_limits(wckey_cond, &extra);

	if (wckey->is_def == 1)
		xstrcat(vals, ", is_def=1");

	if (!extra || !vals) {
		error("Nothing to modify '%s' '%s'", extra, vals);
		return NULL;
	}

	if (wckey_cond->cluster_list && list_count(wckey_cond->cluster_list))
		use_cluster_list = wckey_cond->cluster_list;

	user_name = uid_to_string((uid_t) uid);

	if (use_cluster_list == as_mysql_cluster_list)
		slurm_mutex_lock(&as_mysql_cluster_list_lock);

	ret_list = list_create(slurm_destroy_char);
	itr = list_iterator_create(use_cluster_list);
	while ((object = list_next(itr))) {
		if ((rc = _cluster_modify_wckeys(
			     mysql_conn, wckey, object,
			     extra, vals, user_name, ret_list))
		    != SLURM_SUCCESS)
			break;
	}
	list_iterator_destroy(itr);
	xfree(extra);
	xfree(user_name);

	if (use_cluster_list == as_mysql_cluster_list)
		slurm_mutex_unlock(&as_mysql_cluster_list_lock);

	if (rc == SLURM_ERROR) {
		FREE_NULL_LIST(ret_list);
		ret_list = NULL;
	}

	return ret_list;
}
Example #29
/*
 * Run as pthread to keep saving slurmctld state information as needed,
 * Use schedule_job_save(),  schedule_node_save(), and schedule_part_save()
 * to queue state save of each data structure
 * no_data IN - unused
 * RET - NULL
 */
extern void *slurmctld_state_save(void *no_data)
{
	time_t last_save = 0, now;
	double save_delay;
	bool run_save;
	int save_count;

#if HAVE_SYS_PRCTL_H
	if (prctl(PR_SET_NAME, "slurmctld_sstate", NULL, NULL, NULL) < 0) {
		error("%s: cannot set my name to %s %m",
		      __func__, "slurmctld_sstate");
	}
#endif

	while (1) {
		/* wait for work to perform */
		slurm_mutex_lock(&state_save_lock);
		while (1) {
			save_count = save_jobs + save_nodes + save_parts +
				     save_front_end + save_resv +
				     save_triggers;
			now = time(NULL);
			save_delay = difftime(now, last_save);
			if (save_count &&
			    (!run_save_thread ||
			     (save_delay >= SAVE_MAX_WAIT))) {
				last_save = now;
				break;		/* do the work */
			} else if (!run_save_thread) {
				run_save_thread = true;
				slurm_mutex_unlock(&state_save_lock);
				return NULL;	/* shutdown */
			} else if (save_count) { /* wait for a timeout */
				struct timespec ts = {0, 0};
				ts.tv_sec = now + 1;
				pthread_cond_timedwait(&state_save_cond,
				           	       &state_save_lock, &ts);
			} else {		/* wait for more work */
				pthread_cond_wait(&state_save_cond,
				           	  &state_save_lock);
			}
		}

		/* save front_end node info if necessary */
		run_save = false;
		/* slurm_mutex_lock(&state_save_lock); done above */
		if (save_front_end) {
			run_save = true;
			save_front_end = 0;
		}
		slurm_mutex_unlock(&state_save_lock);
		if (run_save)
			(void)dump_all_front_end_state();

		/* save job info if necessary */
		run_save = false;
		slurm_mutex_lock(&state_save_lock);
		if (save_jobs) {
			run_save = true;
			save_jobs = 0;
		}
		slurm_mutex_unlock(&state_save_lock);
		if (run_save)
			(void)dump_all_job_state();

		/* save node info if necessary */
		run_save = false;
		slurm_mutex_lock(&state_save_lock);
		if (save_nodes) {
			run_save = true;
			save_nodes = 0;
		}
		slurm_mutex_unlock(&state_save_lock);
		if (run_save)
			(void)dump_all_node_state();

		/* save partition info if necessary */
		run_save = false;
		slurm_mutex_lock(&state_save_lock);
		if (save_parts) {
			run_save = true;
			save_parts = 0;
		}
		slurm_mutex_unlock(&state_save_lock);
		if (run_save)
			(void)dump_all_part_state();

		/* save reservation info if necessary */
		run_save = false;
		slurm_mutex_lock(&state_save_lock);
		if (save_resv) {
			run_save = true;
			save_resv = 0;
		}
		slurm_mutex_unlock(&state_save_lock);
		if (run_save)
			(void)dump_all_resv_state();

		/* save trigger info if necessary */
		run_save = false;
		slurm_mutex_lock(&state_save_lock);
		if (save_triggers) {
			run_save = true;
			save_triggers = 0;
		}
		slurm_mutex_unlock(&state_save_lock);
		if (run_save)
			(void)trigger_state_save();
	}
}
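
The inner while (1) above is a batching wait: sleep indefinitely while nothing is queued, poll with a one-second timed wait while work is pending but younger than SAVE_MAX_WAIT, and flush otherwise (or immediately at shutdown). A stripped-down sketch of that loop; the names and the SAVE_MAX_WAIT value are illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

#define SAVE_MAX_WAIT 5		/* seconds a dirty save may wait */

static pthread_mutex_t save_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  save_cond = PTHREAD_COND_INITIALIZER;
static int  pending;		/* bumped by the schedule_*_save() analogues */
static bool keep_running = true;	/* cleared by the shutdown path */

/* Returns the number of queued saves to flush, or -1 at shutdown. */
static int wait_for_save_work(void)
{
	static time_t last_save;
	int n;

	pthread_mutex_lock(&save_lock);
	for (;;) {
		time_t now = time(NULL);

		if (pending &&
		    (!keep_running ||
		     difftime(now, last_save) >= SAVE_MAX_WAIT)) {
			last_save = now;
			break;			/* flush now */
		}
		if (!keep_running) {
			pthread_mutex_unlock(&save_lock);
			return -1;		/* shutdown, nothing queued */
		}
		if (pending) {			/* wait, but with a timeout */
			struct timespec ts = { .tv_sec = now + 1 };

			pthread_cond_timedwait(&save_cond, &save_lock, &ts);
		} else {			/* idle: wait indefinitely */
			pthread_cond_wait(&save_cond, &save_lock);
		}
	}
	n = pending;
	pending = 0;
	pthread_mutex_unlock(&save_lock);
	return n;
}
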
Example #30
extern int as_mysql_job_start(mysql_conn_t *mysql_conn,
			      struct job_record *job_ptr)
{
	int rc=SLURM_SUCCESS;
	char *nodes = NULL, *jname = NULL, *node_inx = NULL;
	int track_steps = 0;
	char *block_id = NULL, *partition = NULL,
		*gres_req = NULL, *gres_alloc = NULL;
	char *query = NULL;
	int reinit = 0;
	time_t begin_time, check_time, start_time, submit_time;
	uint32_t wckeyid = 0;
	int job_state, node_cnt = 0;
	uint32_t job_db_inx = job_ptr->db_index;
	job_array_struct_t *array_recs = job_ptr->array_recs;

	if ((!job_ptr->details || !job_ptr->details->submit_time)
	    && !job_ptr->resize_time) {
		error("as_mysql_job_start: "
		      "Not inputing this job, it has no submit time.");
		return SLURM_ERROR;
	}

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	debug2("as_mysql_slurmdb_job_start() called");

	job_state = job_ptr->job_state;

	if (job_ptr->resize_time) {
		begin_time  = job_ptr->resize_time;
		submit_time = job_ptr->resize_time;
		start_time  = job_ptr->resize_time;
	} else {
		begin_time  = job_ptr->details->begin_time;
		submit_time = job_ptr->details->submit_time;
		start_time  = job_ptr->start_time;
	}

	/* If the reason is WAIT_ARRAY_TASK_LIMIT we don't want to
	 * give the pending jobs an eligible time, since that would add
	 * time to accounting even though these jobs aren't able to run
	 * until later, so mark it as such.
	 */
	if (job_ptr->state_reason == WAIT_ARRAY_TASK_LIMIT)
		begin_time = INFINITE;

	/* Since we need a new db_inx make sure the old db_inx is
	 * removed. This is most likely the only time we are going to
	 * be notified of the change, so also record the state without
	 * the resize flag. */
	if (IS_JOB_RESIZING(job_ptr)) {
		/* If we have a db_index lets end the previous record. */
		if (!job_ptr->db_index) {
			error("We don't have a db_index for job %u, "
			      "this should only happen when resizing "
			      "jobs and the database interface was down.",
			      job_ptr->job_id);
			job_ptr->db_index = _get_db_index(mysql_conn,
							  job_ptr->details->
							  submit_time,
							  job_ptr->job_id,
							  job_ptr->assoc_id);
		}

		if (job_ptr->db_index)
			as_mysql_job_complete(mysql_conn, job_ptr);

		job_state &= (~JOB_RESIZING);
		job_ptr->db_index = 0;
	}

	job_state &= JOB_STATE_BASE;

	/* See what we are hearing about here if there is no start
	 * time. If this job's latest time is before the last rollup we
	 * will need to reset it to look at this job. */
	if (start_time)
		check_time = start_time;
	else if (begin_time)
		check_time = begin_time;
	else
		check_time = submit_time;

	slurm_mutex_lock(&rollup_lock);
	if (check_time < global_last_rollup) {
		MYSQL_RES *result = NULL;
		MYSQL_ROW row;

		/* check to see if we are hearing about this time for the
		 * first time.
		 */
		query = xstrdup_printf("select job_db_inx "
				       "from \"%s_%s\" where id_job=%u and "
				       "time_submit=%ld and time_eligible=%ld "
				       "and time_start=%ld;",
				       mysql_conn->cluster_name,
				       job_table, job_ptr->job_id,
				       submit_time, begin_time, start_time);
		if (debug_flags & DEBUG_FLAG_DB_JOB)
			DB_DEBUG(mysql_conn->conn, "query\n%s", query);
		if (!(result =
		      mysql_db_query_ret(mysql_conn, query, 0))) {
			xfree(query);
			slurm_mutex_unlock(&rollup_lock);
			return SLURM_ERROR;
		}
		xfree(query);
		if ((row = mysql_fetch_row(result))) {
			mysql_free_result(result);
			debug4("revieved an update for a "
			       "job (%u) already known about",
			       job_ptr->job_id);
			slurm_mutex_unlock(&rollup_lock);
			goto no_rollup_change;
		}
		mysql_free_result(result);

		if (job_ptr->start_time)
			debug("Need to reroll usage from %s Job %u "
			      "from %s started then and we are just "
			      "now hearing about it.",
			      slurm_ctime2(&check_time),
			      job_ptr->job_id, mysql_conn->cluster_name);
		else if (begin_time)
			debug("Need to reroll usage from %s Job %u "
			      "from %s became eligible then and we are just "
			      "now hearing about it.",
			      slurm_ctime2(&check_time),
			      job_ptr->job_id, mysql_conn->cluster_name);
		else
			debug("Need to reroll usage from %s Job %u "
			      "from %s was submitted then and we are just "
			      "now hearing about it.",
			      slurm_ctime2(&check_time),
			      job_ptr->job_id, mysql_conn->cluster_name);

		global_last_rollup = check_time;
		slurm_mutex_unlock(&rollup_lock);

		/* If the times here are later than the daily_rollup
		   or monthly rollup it isn't a big deal since they
		   are always shrunk down to the beginning of each
		   time period.
		*/
		query = xstrdup_printf("update \"%s_%s\" set "
				       "hourly_rollup=%ld, "
				       "daily_rollup=%ld, monthly_rollup=%ld",
				       mysql_conn->cluster_name,
				       last_ran_table, check_time,
				       check_time, check_time);
		if (debug_flags & DEBUG_FLAG_DB_JOB)
			DB_DEBUG(mysql_conn->conn, "query\n%s", query);
		rc = mysql_db_query(mysql_conn, query);
		xfree(query);
	} else
		slurm_mutex_unlock(&rollup_lock);

no_rollup_change:

	if (job_ptr->name && job_ptr->name[0])
		jname = slurm_add_slash_to_quotes(job_ptr->name);
	else {
		jname = xstrdup("allocation");
		track_steps = 1;
	}

	if (job_ptr->nodes && job_ptr->nodes[0])
		nodes = job_ptr->nodes;
	else
		nodes = "None assigned";

	if (job_ptr->batch_flag)
		track_steps = 1;

	if (slurmdbd_conf) {
		block_id = xstrdup(job_ptr->comment);
		node_cnt = job_ptr->total_nodes;
		node_inx = job_ptr->network;
	} else {
		char temp_bit[BUF_SIZE];

		if (job_ptr->node_bitmap) {
			node_inx = bit_fmt(temp_bit, sizeof(temp_bit),
					   job_ptr->node_bitmap);
		}
#ifdef HAVE_BG
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_BLOCK_ID,
					    &block_id);
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &node_cnt);
#else
		node_cnt = job_ptr->total_nodes;
#endif
	}

	/* Grab the wckey once to make sure it is placed. */
	if (job_ptr->assoc_id && (!job_ptr->db_index || job_ptr->wckey))
		wckeyid = _get_wckeyid(mysql_conn, &job_ptr->wckey,
				       job_ptr->user_id,
				       mysql_conn->cluster_name,
				       job_ptr->assoc_id);

	if (!IS_JOB_PENDING(job_ptr) && job_ptr->part_ptr)
		partition = slurm_add_slash_to_quotes(job_ptr->part_ptr->name);
	else if (job_ptr->partition)
		partition = slurm_add_slash_to_quotes(job_ptr->partition);

	if (job_ptr->gres_req)
		gres_req = slurm_add_slash_to_quotes(job_ptr->gres_req);

	if (job_ptr->gres_alloc)
		gres_alloc = slurm_add_slash_to_quotes(job_ptr->gres_alloc);

	if (!job_ptr->db_index) {
		if (!begin_time)
			begin_time = submit_time;
		query = xstrdup_printf(
			"insert into \"%s_%s\" "
			"(id_job, id_array_job, id_array_task, "
			"id_assoc, id_qos, id_user, "
			"id_group, nodelist, id_resv, timelimit, "
			"time_eligible, time_submit, time_start, "
			"job_name, track_steps, state, priority, cpus_req, "
			"nodes_alloc, mem_req",
			mysql_conn->cluster_name, job_table);

		if (wckeyid)
			xstrcat(query, ", id_wckey");
		if (job_ptr->account)
			xstrcat(query, ", account");
		if (partition)
			xstrcat(query, ", `partition`");
		if (block_id)
			xstrcat(query, ", id_block");
		if (job_ptr->wckey)
			xstrcat(query, ", wckey");
		if (node_inx)
			xstrcat(query, ", node_inx");
		if (gres_req)
			xstrcat(query, ", gres_req");
		if (gres_alloc)
			xstrcat(query, ", gres_alloc");
		if (array_recs && array_recs->task_id_str)
			xstrcat(query, ", array_task_str, array_max_tasks, "
				"array_task_pending");
		else
			xstrcat(query, ", array_task_str, array_task_pending");

		if (job_ptr->tres_alloc_str)
			xstrcat(query, ", tres_alloc");

		xstrfmtcat(query,
			   ") values (%u, %u, %u, %u, %u, %u, %u, "
			   "'%s', %u, %u, %ld, %ld, %ld, "
			   "'%s', %u, %u, %u, %u, %u, %u",
			   job_ptr->job_id, job_ptr->array_job_id,
			   job_ptr->array_task_id, job_ptr->assoc_id,
			   job_ptr->qos_id,
			   job_ptr->user_id, job_ptr->group_id, nodes,
			   job_ptr->resv_id, job_ptr->time_limit,
			   begin_time, submit_time, start_time,
			   jname, track_steps, job_state,
			   job_ptr->priority, job_ptr->details->min_cpus,
			   node_cnt,
			   job_ptr->details->pn_min_memory);

		if (wckeyid)
			xstrfmtcat(query, ", %u", wckeyid);
		if (job_ptr->account)
			xstrfmtcat(query, ", '%s'", job_ptr->account);
		if (partition)
			xstrfmtcat(query, ", '%s'", partition);
		if (block_id)
			xstrfmtcat(query, ", '%s'", block_id);
		if (job_ptr->wckey)
			xstrfmtcat(query, ", '%s'", job_ptr->wckey);
		if (node_inx)
			xstrfmtcat(query, ", '%s'", node_inx);
		if (gres_req)
			xstrfmtcat(query, ", '%s'", gres_req);
		if (gres_alloc)
			xstrfmtcat(query, ", '%s'", gres_alloc);
		if (array_recs && array_recs->task_id_str)
			xstrfmtcat(query, ", '%s', %u, %u",
				   array_recs->task_id_str,
				   array_recs->max_run_tasks,
				   array_recs->task_cnt);
		else
			xstrcat(query, ", NULL, 0");

		if (job_ptr->tres_alloc_str)
			xstrfmtcat(query, ", '%s'", job_ptr->tres_alloc_str);

		xstrfmtcat(query,
			   ") on duplicate key update "
			   "job_db_inx=LAST_INSERT_ID(job_db_inx), "
			   "id_user=%u, id_group=%u, "
			   "nodelist='%s', id_resv=%u, timelimit=%u, "
			   "time_submit=%ld, time_eligible=%ld, "
			   "time_start=%ld, "
			   "job_name='%s', track_steps=%u, id_qos=%u, "
			   "state=greatest(state, %u), priority=%u, "
			   "cpus_req=%u, nodes_alloc=%u, "
			   "mem_req=%u, id_array_job=%u, id_array_task=%u",
			   job_ptr->user_id, job_ptr->group_id, nodes,
			   job_ptr->resv_id, job_ptr->time_limit,
			   submit_time, begin_time, start_time,
			   jname, track_steps, job_ptr->qos_id, job_state,
			   job_ptr->priority, job_ptr->details->min_cpus,
			   node_cnt,
			   job_ptr->details->pn_min_memory,
			   job_ptr->array_job_id,
			   job_ptr->array_task_id);

		if (wckeyid)
			xstrfmtcat(query, ", id_wckey=%u", wckeyid);
		if (job_ptr->account)
			xstrfmtcat(query, ", account='%s'", job_ptr->account);
		if (partition)
			xstrfmtcat(query, ", `partition`='%s'", partition);
		if (block_id)
			xstrfmtcat(query, ", id_block='%s'", block_id);
		if (job_ptr->wckey)
			xstrfmtcat(query, ", wckey='%s'", job_ptr->wckey);
		if (node_inx)
			xstrfmtcat(query, ", node_inx='%s'", node_inx);
		if (gres_req)
			xstrfmtcat(query, ", gres_req='%s'", gres_req);
		if (gres_alloc)
			xstrfmtcat(query, ", gres_alloc='%s'", gres_alloc);
		if (array_recs && array_recs->task_id_str)
			xstrfmtcat(query, ", array_task_str='%s', "
				   "array_max_tasks=%u, array_task_pending=%u",
				   array_recs->task_id_str,
				   array_recs->max_run_tasks,
				   array_recs->task_cnt);
		else
			xstrfmtcat(query, ", array_task_str=NULL, "
				   "array_task_pending=0");

		if (job_ptr->tres_alloc_str)
			xstrfmtcat(query, ", tres_alloc='%s'",
				   job_ptr->tres_alloc_str);

		if (debug_flags & DEBUG_FLAG_DB_JOB)
			DB_DEBUG(mysql_conn->conn, "query\n%s", query);
	try_again:
		if (!(job_ptr->db_index = mysql_db_insert_ret_id(
			      mysql_conn, query))) {
			if (!reinit) {
				error("It looks like the storage has gone "
				      "away trying to reconnect");
				mysql_db_close_db_connection(
					mysql_conn);
				/* reconnect */
				check_connection(mysql_conn);
				reinit = 1;
				goto try_again;
			} else
				rc = SLURM_ERROR;
		}
	} else {
		query = xstrdup_printf("update \"%s_%s\" set nodelist='%s', ",
				       mysql_conn->cluster_name,
				       job_table, nodes);

		if (wckeyid)
			xstrfmtcat(query, "id_wckey=%u, ", wckeyid);
		if (job_ptr->account)
			xstrfmtcat(query, "account='%s', ", job_ptr->account);
		if (partition)
			xstrfmtcat(query, "`partition`='%s', ", partition);
		if (block_id)
			xstrfmtcat(query, "id_block='%s', ", block_id);
		if (job_ptr->wckey)
			xstrfmtcat(query, "wckey='%s', ", job_ptr->wckey);
		if (node_inx)
			xstrfmtcat(query, "node_inx='%s', ", node_inx);
		if (gres_req)
			xstrfmtcat(query, "gres_req='%s', ", gres_req);
		if (gres_alloc)
			xstrfmtcat(query, "gres_alloc='%s', ", gres_alloc);
		if (array_recs && array_recs->task_id_str)
			xstrfmtcat(query, "array_task_str='%s', "
				   "array_max_tasks=%u, "
				   "array_task_pending=%u, ",
				   array_recs->task_id_str,
				   array_recs->max_run_tasks,
				   array_recs->task_cnt);
		else
			xstrfmtcat(query, "array_task_str=NULL, "
				   "array_task_pending=0, ");

		if (job_ptr->tres_alloc_str)
			xstrfmtcat(query, "tres_alloc='%s', ",
				   job_ptr->tres_alloc_str);

		xstrfmtcat(query, "time_start=%ld, job_name='%s', state=%u, "
			   "nodes_alloc=%u, id_qos=%u, "
			   "id_assoc=%u, id_resv=%u, "
			   "timelimit=%u, mem_req=%u, "
			   "id_array_job=%u, id_array_task=%u, "
			   "time_eligible=%ld where job_db_inx=%d",
			   start_time, jname, job_state,
			   node_cnt, job_ptr->qos_id,
			   job_ptr->assoc_id,
			   job_ptr->resv_id, job_ptr->time_limit,
			   job_ptr->details->pn_min_memory,
			   job_ptr->array_job_id,
			   job_ptr->array_task_id,
			   begin_time, job_ptr->db_index);

		if (debug_flags & DEBUG_FLAG_DB_JOB)
			DB_DEBUG(mysql_conn->conn, "query\n%s", query);
		rc = mysql_db_query(mysql_conn, query);
	}

	xfree(block_id);
	xfree(partition);
	xfree(gres_req);
	xfree(gres_alloc);
	xfree(jname);
	xfree(query);

	/* now we will reset all the steps */
	if (IS_JOB_RESIZING(job_ptr)) {
		/* FIXME : Verify this is still needed */
		if (IS_JOB_SUSPENDED(job_ptr))
			as_mysql_suspend(mysql_conn, job_db_inx, job_ptr);
	}

	return rc;
}
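
The "job_db_inx=LAST_INSERT_ID(job_db_inx)" assignment inside the ON DUPLICATE KEY UPDATE clause above is a standard MySQL idiom: when the insert collides with an existing row, it makes mysql_insert_id() return that row's auto-increment id, so a single statement both upserts the job record and yields db_index. A minimal sketch with an invented table:

#include <mysql/mysql.h>
#include <stdio.h>

/* Upsert a job row and return its auto-increment id either way;
 * 0 signals failure, as with mysql_db_insert_ret_id() above. */
static unsigned long long upsert_job(MYSQL *db, unsigned int job_id)
{
	char query[256];

	snprintf(query, sizeof(query),
		 "insert into job_table (id_job) values (%u) "
		 "on duplicate key update "
		 "job_db_inx=LAST_INSERT_ID(job_db_inx);", job_id);
	if (mysql_query(db, query))
		return 0;
	return mysql_insert_id(db);
}
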