예제 #1
0
/*
 * as_pg_add_clusters - add clusters
 *
 * IN pg_conn: database connection
 * IN uid: user performing the add operation
 * IN cluster_list: clusters to add
 * RET: error code
 */
extern int
as_pg_add_clusters(pgsql_conn_t *pg_conn, uint32_t uid,
		   List cluster_list)
{
	ListIterator itr = NULL;
	int rc = SLURM_SUCCESS, added = 0;
	slurmdb_cluster_rec_t *object = NULL;
	time_t now = time(NULL);
	List assoc_list = NULL;
	slurmdb_association_rec_t *assoc = NULL;
	char *txn_info = NULL, *query = NULL, *user_name = NULL;

	if (check_db_connection(pg_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	assoc_list = list_create(slurmdb_destroy_association_rec);
	user_name = uid_to_string((uid_t) uid);
	itr = list_iterator_create(cluster_list);
	while((object = list_next(itr))) {
		if(!object->name) {
			error("as/pg: add_clusters: We need a cluster "
			      "name to add.");
			rc = SLURM_ERROR;
			continue;
		}
		if (strchr(object->name, '.')) {
			error("as/pg: add_clusters: invalid cluster name %s",
			      object->name);
			rc = SLURM_ERROR;
			continue;
		}
		if (cluster_in_db(pg_conn, object->name)) {
			error("cluster %s already added", object->name);
			rc = SLURM_ERROR;
			continue;
		}

		query = xstrdup_printf(
			"SELECT public.add_cluster("
			"(%ld, %ld, 0, '%s', '', 0, 0, %u, 1, 0, 0));",
			(long)now, (long)now, object->name,
			object->classification);
		rc = DEF_QUERY_RET_RC;
		if(rc != SLURM_SUCCESS) {
			error("Couldn't add cluster %s", object->name);
			added = 0; /* rollback modification to DB */
			break;
		}

		rc = _create_cluster_tables(pg_conn, object->name);
		if (rc != SLURM_SUCCESS) {
			error("Failed creating cluster tables for %s",
			      object->name);
			added = 0;
			break;
		}

		/* add root account assoc: <'cluster', 'root', '', ''> */
		if (add_cluster_root_assoc(pg_conn, now, object, &txn_info)
		    != SLURM_SUCCESS) {
			added = 0;
			break;
		}

		if (add_txn(pg_conn, now, "", DBD_ADD_CLUSTERS, object->name,
			    user_name, txn_info) != SLURM_SUCCESS) {
			error("as/pg: add_cluster: couldn't add txn");
		} else {
			added ++;
		}
		xfree(txn_info);

		/* Add user root by default to run from the root
		 * association.  This gets popped off so we need to
		 * read it every time here.
		 */
		assoc = xmalloc(sizeof(slurmdb_association_rec_t));
		slurmdb_init_association_rec(assoc, 0);
		list_append(assoc_list, assoc);
		assoc->cluster = xstrdup(object->name);
		assoc->user = xstrdup("root");
		assoc->acct = xstrdup("root");
		if(acct_storage_p_add_associations(pg_conn, uid, assoc_list)
		   == SLURM_ERROR) {
			error("Problem adding root user association");
			rc = SLURM_ERROR;
		}
		list_flush(assoc_list); /* do not add it again, in case not popped */
	}
	list_iterator_destroy(itr);
	xfree(user_name);
	list_destroy(assoc_list);

	if (!added) {
		reset_pgsql_conn(pg_conn);
	} else {
		/* when loading sacctmgr cfg file,
		   get_assoc will be called before commit
		*/
		pg_conn->cluster_changed = 1;
	}

	return rc;
}
예제 #2
0
파일: as_pg_job.c 프로젝트: jsollom/slurm
/*
 * js_pg_job_start - load into the storage the start of a job
 *
 * IN pg_conn: database connection
 * IN cluster_name: cluster of the job
 * IN job_ptr: job just started
 * RET: error code
 */
extern int
js_pg_job_start(pgsql_conn_t *pg_conn,
		struct job_record *job_ptr)
{
	int rc=SLURM_SUCCESS, track_steps = 0, reinit = 0;
	char *jname = NULL, *nodes = NULL, *node_inx = NULL;
	char *block_id = NULL, *rec = NULL, *query = NULL;
	time_t begin_time, check_time, start_time, submit_time;
	int job_state, node_cnt = 0;
	uint32_t wckeyid = 0;

	if ((!job_ptr->details || !job_ptr->details->submit_time)
	    && !job_ptr->resize_time) {
		error("as/pg: job_start: Not inputing this job, "
		      "it has no submit time.");
		return SLURM_ERROR;
	}

	if (check_db_connection(pg_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	if (! cluster_in_db(pg_conn, pg_conn->cluster_name) ) {
		error("cluster %s not in db", pg_conn->cluster_name);
		return SLURM_ERROR;
	}

	debug3("as/pg: job_start() called");

	job_state = job_ptr->job_state;

	/* Since we need a new db_inx make sure the old db_inx
	 * removed. This is most likely the only time we are going to
	 * be notified of the change also so make the state without
	 * the resize. */
	if (IS_JOB_RESIZING(job_ptr)) {
		/* If we have a db_index lets end the previous record. */
		if (job_ptr->db_index)
			js_pg_job_complete(pg_conn, job_ptr);
		else
			error("We don't have a db_index for job %u, "
			      "this should never happen.", job_ptr->job_id);
		job_state &= (~JOB_RESIZING);
		job_ptr->db_index = 0;
	}

	job_state &= JOB_STATE_BASE;

	if (job_ptr->resize_time) {
		begin_time  = job_ptr->resize_time;
		submit_time = job_ptr->resize_time;
		start_time  = job_ptr->resize_time;
	} else {
		begin_time  = job_ptr->details->begin_time;
		submit_time = job_ptr->details->submit_time;
		start_time  = job_ptr->start_time;
	}

	/* See what we are hearing about here if no start time. If
	 * this job latest time is before the last roll up we will
	 * need to reset it to look at this job. */
	if (start_time)
		check_time = start_time;
	else if (begin_time)
		check_time = begin_time;
	else
		check_time = submit_time;

	slurm_mutex_lock(&usage_rollup_lock);
	if (check_time < global_last_rollup) {
		PGresult *result = NULL;
		/* check to see if we are hearing about this time for the
		 * first time.
		 */
		query = xstrdup_printf(
			"SELECT job_db_inx FROM %s.%s WHERE id_job=%u AND "
			"time_submit=%ld AND time_eligible=%ld AND time_start=%ld",
			pg_conn->cluster_name, job_table, job_ptr->job_id,
			submit_time, begin_time, start_time);
		result = DEF_QUERY_RET;
		if (!result) {
			 slurm_mutex_unlock(&usage_rollup_lock);
			return SLURM_ERROR;
		}
		if (PQntuples(result) != 0) {
			PQclear(result);
			debug4("revieved an update for a "
			       "job (%u) already known about",
			       job_ptr->job_id);
			 slurm_mutex_unlock(&usage_rollup_lock);
			goto no_rollup_change;
		}
		PQclear(result);

		if (job_ptr->start_time)
			debug("Need to reroll usage from %s Job %u "
			      "from %s started then and we are just "
			      "now hearing about it.",
			      ctime(&check_time),
			      job_ptr->job_id, pg_conn->cluster_name);
		else if (begin_time)
			debug("Need to reroll usage from %s Job %u "
			      "from %s became eligible then and we are just "
			      "now hearing about it.",
			      ctime(&check_time),
			      job_ptr->job_id, pg_conn->cluster_name);
		else
			debug("Need to reroll usage from %s Job %u "
			      "from %s was submitted then and we are just "
			      "now hearing about it.",
			      ctime(&check_time),
			      job_ptr->job_id, pg_conn->cluster_name);

		global_last_rollup = check_time;
		slurm_mutex_unlock(&usage_rollup_lock);

		query = xstrdup_printf("UPDATE %s.%s SET hourly_rollup=%ld, "
				       "daily_rollup=%ld, monthly_rollup=%ld",
				       pg_conn->cluster_name, last_ran_table,
				       check_time, check_time, check_time);
		rc = DEF_QUERY_RET_RC;
	} else
		slurm_mutex_unlock(&usage_rollup_lock);

no_rollup_change:

	if (job_ptr->name && job_ptr->name[0])
		jname = xstrdup(job_ptr->name);
	else {
		jname = xstrdup("allocation");
		track_steps = 1;
	}

	if (job_ptr->nodes && job_ptr->nodes[0])
		nodes = job_ptr->nodes;
	else
		nodes = "None assigned";

	if (job_ptr->batch_flag)
		track_steps = 1;

	if (slurmdbd_conf) {
		block_id = xstrdup(job_ptr->comment);
		node_cnt = job_ptr->total_nodes;
		node_inx = job_ptr->network;
	} else {
		char temp_bit[BUF_SIZE];

		if (job_ptr->node_bitmap) {
			node_inx = bit_fmt(temp_bit, sizeof(temp_bit),
					   job_ptr->node_bitmap);
		}
#ifdef HAVE_BG
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_BLOCK_ID,
					    &block_id);
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &node_cnt);
#else
		node_cnt = job_ptr->total_nodes;
#endif
	}

	/* If there is a start_time get the wckeyid.  If the job is
	 * cancelled before the job starts we also want to grab it. */
	if (job_ptr->assoc_id &&
	   (job_ptr->start_time || IS_JOB_CANCELLED(job_ptr)))
		wckeyid = get_wckeyid(pg_conn, &job_ptr->wckey,
				      job_ptr->user_id, pg_conn->cluster_name,
				      job_ptr->assoc_id);

	if (!job_ptr->db_index) {
		if (!begin_time)
			begin_time = submit_time;

		rec = xstrdup_printf(
			"(0, 0, '%s', '%s', %d, %d, 0, '%s', "
			"%d, '%s', %d, %d, %d, %d, %d, %d, 0, "
			"%d, %ld, %ld, %ld, 0, 0, "
			"%d, '%s', '%s', %d, %d, '%s', %d)",
			/* job_db_inx=0, not used */
			/* deleted=0 */
			job_ptr->account ?: "",     /* account */
			job_ptr->partition ?: "",   /* partition */
			(int)job_ptr->details->min_cpus, /* cpus_req */
			(int)job_ptr->total_cpus, /* cpus_alloc */
			/* exit_code=0 */
			jname,  /* job_name */

			(int)job_ptr->assoc_id, /* id_assoc */
			block_id ?: "", /* id_block */
			(int)job_ptr->job_id,   /* id_job */
			(int)job_ptr->qos_id,   /* id_qos */
			(int)job_ptr->resv_id,  /* id_resv */
			(int)wckeyid,           /* id_wckey */
			(int)job_ptr->user_id,  /* uid */
			(int)job_ptr->group_id, /* gid */
			/* kill_requid=0 */

			(int)job_ptr->time_limit, /* timelimit */
			submit_time,         /* time_submit */
			begin_time,          /* time_eligible */
			start_time,          /* time_start */
			/* time_end=0 */
			/* time_suspended=0 */

			(int)node_cnt, /* nodes_alloc */
			nodes ?: "",                    /* nodelist */
			node_inx ?: "",         /* node_inx */
			(int)job_ptr->priority, /* priority */
			(int)job_state,         /* state */
			job_ptr->wckey ?: "",   /* wckey */
			(int)track_steps);

		query = xstrdup_printf("SELECT %s.add_job_start(%s);",
				       pg_conn->cluster_name, rec);
		xfree(rec);

	try_again:
		DEBUG_QUERY;
		job_ptr->db_index = pgsql_query_ret_id(pg_conn->db_conn,
						       query);
		if (!job_ptr->db_index) {
			if (!reinit) {
				error("It looks like the storage has gone "
				      "away trying to reconnect");
				check_db_connection(pg_conn);
				reinit = 1;
				goto try_again;
			} else
				rc = SLURM_ERROR;
		}
		xfree(query);
	} else {