/*
 * load into the storage the end of a job
 */
extern int jobacct_storage_p_job_complete(void *db_conn,
					  struct job_record *job_ptr)
{
	char buf[BUFFER_SIZE];
	uint16_t job_state;
	int duration;
	uint32_t exit_code;

	if (!storage_init) {
		debug("jobacct init was not called or it failed");
		return SLURM_ERROR;
	}

	debug2("slurmdb_job_complete() called");
	if (IS_JOB_RESIZING(job_ptr)) {
		job_state = JOB_RESIZING;
		if (job_ptr->resize_time)
			duration = time(NULL) - job_ptr->resize_time;
		else
			duration = time(NULL) - job_ptr->start_time;
	} else {
		if (job_ptr->end_time == 0) {
			debug("jobacct: job %u never started", job_ptr->job_id);
			return SLURM_ERROR;
		}
		job_state = job_ptr->job_state & JOB_STATE_BASE;
		if (job_ptr->resize_time)
			duration = job_ptr->end_time - job_ptr->resize_time;
		else
			duration = job_ptr->end_time - job_ptr->start_time;
	}

	exit_code = job_ptr->exit_code;
	if (exit_code == 1) {
		/* This wasn't signalled, it was set by Slurm so don't
		 * treat it like a signal.
		 */
		exit_code = 256;
	}

	/* leave the requid as a %d since we want to see if it is -1
	   in stats */
	snprintf(buf, BUFFER_SIZE, "%d %d %u %u %u",
		 JOB_TERMINATED, duration,
		 job_state, job_ptr->requid, exit_code);

	return  _print_record(job_ptr, job_ptr->end_time, buf);
}
Esempio n. 2
0
static struct jobcomp_info * _jobcomp_info_create (struct job_record *job)
{
    enum job_states state;
    struct jobcomp_info * j = xmalloc (sizeof (*j));

    j->jobid = job->job_id;
    j->uid = job->user_id;
    j->gid = job->group_id;
    j->name = xstrdup (job->name);
    j->array_job_id = job->array_job_id;
    j->array_task_id = job->array_task_id;

    if (IS_JOB_RESIZING(job)) {
        state = JOB_RESIZING;
        j->jobstate = xstrdup (job_state_string (state));
        if (job->resize_time)
            j->start = job->resize_time;
        else
            j->start = job->start_time;
        j->end = time(NULL);
    } else {
        /* Job state will typically have JOB_COMPLETING or JOB_RESIZING
         * flag set when called. We remove the flags to get the eventual
         * completion state: JOB_FAILED, JOB_TIMEOUT, etc. */
        state = job->job_state & JOB_STATE_BASE;
        j->jobstate = xstrdup (job_state_string (state));
        if (job->resize_time)
            j->start = job->resize_time;
        else if (job->start_time > job->end_time) {
            /* Job cancelled while pending and
             * expected start time is in the future. */
            j->start = 0;
        } else
            j->start = job->start_time;
        j->end = job->end_time;
    }

    j->partition = xstrdup (job->partition);
    if ((job->time_limit == NO_VAL) && job->part_ptr)
        j->limit = job->part_ptr->max_time;
    else
        j->limit = job->time_limit;
    j->submit = job->details ? job->details->submit_time:job->start_time;
    j->batch_flag = job->batch_flag;
    j->nodes = xstrdup (job->nodes);
    j->nprocs = job->total_cpus;
    j->nnodes = job->node_cnt;
    j->account = job->account ? xstrdup (job->account) : NULL;
    if (job->details && job->details->work_dir)
        j->work_dir = xstrdup(job->details->work_dir);
    else
        j->work_dir = xstrdup("unknown");
    if (job->details) {
        if (job->details->std_in)
            j->std_in = xstrdup(job->details->std_in);
        if (job->details->std_out)
            j->std_out = xstrdup(job->details->std_out);
        if (job->details->std_err)
            j->std_err = xstrdup(job->details->std_err);
    }

#ifdef HAVE_BG
    j->connect_type = select_g_select_jobinfo_xstrdup(job->select_jobinfo,
                      SELECT_PRINT_CONNECTION);
    j->geometry = select_g_select_jobinfo_xstrdup(job->select_jobinfo,
                  SELECT_PRINT_GEOMETRY);
    j->blockid = select_g_select_jobinfo_xstrdup(job->select_jobinfo,
                 SELECT_PRINT_BG_ID);
#endif
    return (j);
}
Esempio n. 3
0
extern int slurm_jobcomp_log_record(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;
	char *usr_str = NULL, *grp_str = NULL, lim_str[32];
	char *connect_type = NULL, *reboot = NULL, *rotate = NULL,
		*geometry = NULL, *start = NULL,
		*blockid = NULL;
	enum job_states job_state;
	char *query = NULL;
	uint32_t time_limit, start_time, end_time;

	if(!jobcomp_mysql_conn || mysql_db_ping(jobcomp_mysql_conn) != 0) {
		char *loc = slurm_get_jobcomp_loc();
		if(slurm_jobcomp_set_location(loc) == SLURM_ERROR) {
			xfree(loc);
			return SLURM_ERROR;
		}
		xfree(loc);
	}

	usr_str = _get_user_name(job_ptr->user_id);
	grp_str = _get_group_name(job_ptr->group_id);

	if ((job_ptr->time_limit == NO_VAL) && job_ptr->part_ptr)
		time_limit = job_ptr->part_ptr->max_time;
	else
		time_limit = job_ptr->time_limit;
	if (time_limit == INFINITE)
		strcpy(lim_str, "UNLIMITED");
	else {
		snprintf(lim_str, sizeof(lim_str), "%lu",
			 (unsigned long) time_limit);
	}

	/* Job will typically be COMPLETING when this is called.
	 * We remove the flags to get the eventual completion state:
	 * JOB_FAILED, JOB_TIMEOUT, etc. */
	if (IS_JOB_RESIZING(job_ptr)) {
		job_state = JOB_RESIZING;
		if (job_ptr->resize_time)
			start_time = job_ptr->resize_time;
		else
			start_time = job_ptr->start_time;
		end_time = time(NULL);
	} else {
		job_state = job_ptr->job_state & JOB_STATE_BASE;
		if (job_ptr->resize_time)
			start_time = job_ptr->resize_time;
		else if (job_ptr->start_time > job_ptr->end_time) {
			/* Job cancelled while pending and
			 * expected start time is in the future. */
			start_time = 0;
		} else
			start_time = job_ptr->start_time;
		end_time = job_ptr->end_time;
	}

	connect_type = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo,
						       SELECT_PRINT_CONNECTION);
	reboot = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo,
						 SELECT_PRINT_REBOOT);
	rotate = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo,
						 SELECT_PRINT_ROTATE);
	geometry = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo,
						   SELECT_PRINT_GEOMETRY);
	start = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo,
						SELECT_PRINT_START);
#ifdef HAVE_BG
	blockid = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo,
						  SELECT_PRINT_BG_ID);
#else
	blockid = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo,
						  SELECT_PRINT_RESV_ID);
#endif
	query = xstrdup_printf(
		"insert into %s (jobid, uid, user_name, gid, group_name, "
		"name, state, proc_cnt, partition, timelimit, "
		"starttime, endtime, nodecnt",
		jobcomp_table);

	if(job_ptr->nodes)
		xstrcat(query, ", nodelist");
	if(connect_type)
		xstrcat(query, ", connect_type");
	if(reboot)
		xstrcat(query, ", reboot");
	if(rotate)
		xstrcat(query, ", rotate");
	if(job_ptr->details && (job_ptr->details->max_cpus != NO_VAL))
		xstrcat(query, ", maxprocs");
	if(geometry)
		xstrcat(query, ", geometry");
	if(start)
		xstrcat(query, ", start");
	if(blockid)
		xstrcat(query, ", blockid");
	xstrfmtcat(query, ") values (%u, %u, '%s', %u, '%s', \"%s\", %d, %u, "
		   "'%s', \"%s\", %u, %u, %u",
		   job_ptr->job_id, job_ptr->user_id, usr_str,
		   job_ptr->group_id, grp_str, job_ptr->name,
		   job_state, job_ptr->total_cpus, job_ptr->partition, lim_str,
		   start_time, end_time, job_ptr->node_cnt);

	if(job_ptr->nodes)
		xstrfmtcat(query, ", '%s'", job_ptr->nodes);

	if(connect_type) {
		xstrfmtcat(query, ", '%s'", connect_type);
		xfree(connect_type);
	}
	if(reboot) {
		xstrfmtcat(query, ", '%s'", reboot);
		xfree(reboot);
	}
	if(rotate) {
		xstrfmtcat(query, ", '%s'", rotate);
		xfree(rotate);
	}
	if(job_ptr->details && (job_ptr->details->max_cpus != NO_VAL))
		xstrfmtcat(query, ", '%u'", job_ptr->details->max_cpus);

	if(geometry) {
		xstrfmtcat(query, ", '%s'", geometry);
		xfree(geometry);
	}
	if(start) {
		xstrfmtcat(query, ", '%s'", start);
		xfree(start);
	}
	if(blockid) {
		xstrfmtcat(query, ", '%s'", blockid);
		xfree(blockid);
	}
	xstrcat(query, ")");
	//info("query = %s", query);
	rc = mysql_db_query(jobcomp_mysql_conn, query);
	xfree(usr_str);
	xfree(grp_str);

	return rc;
}
Esempio n. 4
0
extern int as_mysql_job_complete(mysql_conn_t *mysql_conn,
				 struct job_record *job_ptr)
{
	char *query = NULL;
	int rc = SLURM_SUCCESS, job_state;
	time_t submit_time, end_time;

	if (!job_ptr->db_index
	    && ((!job_ptr->details || !job_ptr->details->submit_time)
		&& !job_ptr->resize_time)) {
		error("as_mysql_job_complete: "
		      "Not inputing this job, it has no submit time.");
		return SLURM_ERROR;
	}

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	debug2("as_mysql_slurmdb_job_complete() called");

	if (job_ptr->resize_time)
		submit_time = job_ptr->resize_time;
	else
		submit_time = job_ptr->details->submit_time;

	if (IS_JOB_RESIZING(job_ptr)) {
		end_time = job_ptr->resize_time;
		job_state = JOB_RESIZING;
	} else {
		/* If we get an error with this just fall through to avoid an
		 * infinite loop */
		if (job_ptr->end_time == 0) {
			debug("as_mysql_jobacct: job %u never started",
			      job_ptr->job_id);
			return SLURM_SUCCESS;
		}
		end_time = job_ptr->end_time;
		job_state = job_ptr->job_state & JOB_STATE_BASE;
	}

	slurm_mutex_lock(&rollup_lock);
	if (end_time < global_last_rollup) {
		global_last_rollup = job_ptr->end_time;
		slurm_mutex_unlock(&rollup_lock);

		query = xstrdup_printf("update \"%s_%s\" set "
				       "hourly_rollup=%ld, "
				       "daily_rollup=%ld, monthly_rollup=%ld",
				       mysql_conn->cluster_name,
				       last_ran_table, end_time,
				       end_time, end_time);
		debug3("%d(%s:%d) query\n%s",
		       mysql_conn->conn, THIS_FILE, __LINE__, query);
		rc = mysql_db_query(mysql_conn, query);
		xfree(query);
	} else
		slurm_mutex_unlock(&rollup_lock);

	if (!job_ptr->db_index) {
		if (!(job_ptr->db_index =
		      _get_db_index(mysql_conn,
				    submit_time,
				    job_ptr->job_id,
				    job_ptr->assoc_id))) {
			/* Comment is overloaded in job_start to be
			   the block_id, so we will need to store this
			   for later.
			*/
			char *comment = job_ptr->comment;
			job_ptr->comment = NULL;
			/* If we get an error with this just fall
			 * through to avoid an infinite loop
			 */
			if (as_mysql_job_start(
				    mysql_conn, job_ptr) == SLURM_ERROR) {
				job_ptr->comment = comment;
				error("couldn't add job %u at job completion",
				      job_ptr->job_id);
				return SLURM_SUCCESS;
			}
			job_ptr->comment = comment;
		}
	}

	/*
	 * make sure we handle any quotes that may be in the comment
	 */

	query = xstrdup_printf("update \"%s_%s\" set "
			       "time_end=%ld, state=%d",
			       mysql_conn->cluster_name, job_table,
			       end_time, job_state);

	if (job_ptr->derived_ec != NO_VAL)
		xstrfmtcat(query, ", derived_ec=%u", job_ptr->derived_ec);

	if (job_ptr->comment) {
		char *comment = slurm_add_slash_to_quotes(job_ptr->comment);
		xstrfmtcat(query, ", derived_es='%s'", comment);
		xfree(comment);
	}

	xstrfmtcat(query,
		   ", exit_code=%d, kill_requid=%d where job_db_inx=%d;",
		   job_ptr->exit_code, job_ptr->requid,
		   job_ptr->db_index);

	debug3("%d(%s:%d) query\n%s",
	       mysql_conn->conn, THIS_FILE, __LINE__, query);
	rc = mysql_db_query(mysql_conn, query);
	xfree(query);

	return rc;
}
Esempio n. 5
0
extern int as_mysql_job_start(mysql_conn_t *mysql_conn,
			      struct job_record *job_ptr)
{
	int rc=SLURM_SUCCESS;
	char *nodes = NULL, *jname = NULL, *node_inx = NULL;
	int track_steps = 0;
	char *block_id = NULL, *partition = NULL,
		*gres_req = NULL, *gres_alloc = NULL;
	char *query = NULL;
	int reinit = 0;
	time_t begin_time, check_time, start_time, submit_time;
	uint32_t wckeyid = 0;
	int job_state, node_cnt = 0;
	uint32_t job_db_inx = job_ptr->db_index;

	if ((!job_ptr->details || !job_ptr->details->submit_time)
	    && !job_ptr->resize_time) {
		error("as_mysql_job_start: "
		      "Not inputing this job, it has no submit time.");
		return SLURM_ERROR;
	}

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	debug2("as_mysql_slurmdb_job_start() called");

	job_state = job_ptr->job_state;

	if (job_ptr->resize_time) {
		begin_time  = job_ptr->resize_time;
		submit_time = job_ptr->resize_time;
		start_time  = job_ptr->resize_time;
	} else {
		begin_time  = job_ptr->details->begin_time;
		submit_time = job_ptr->details->submit_time;
		start_time  = job_ptr->start_time;
	}

	/* Since we need a new db_inx make sure the old db_inx
	 * removed. This is most likely the only time we are going to
	 * be notified of the change also so make the state without
	 * the resize. */
	if (IS_JOB_RESIZING(job_ptr)) {
		/* If we have a db_index lets end the previous record. */
		if (!job_ptr->db_index) {
			error("We don't have a db_index for job %u, "
			      "this should only happen when resizing "
			      "jobs and the database interface was down.",
			      job_ptr->job_id);
			job_ptr->db_index = _get_db_index(mysql_conn,
							  job_ptr->details->
							  submit_time,
							  job_ptr->job_id,
							  job_ptr->assoc_id);
		}

		if (job_ptr->db_index)
			as_mysql_job_complete(mysql_conn, job_ptr);

		job_state &= (~JOB_RESIZING);
		job_ptr->db_index = 0;
	}

	job_state &= JOB_STATE_BASE;

	/* See what we are hearing about here if no start time. If
	 * this job latest time is before the last roll up we will
	 * need to reset it to look at this job. */
	if (start_time)
		check_time = start_time;
	else if (begin_time)
		check_time = begin_time;
	else
		check_time = submit_time;

	slurm_mutex_lock(&rollup_lock);
	if (check_time < global_last_rollup) {
		MYSQL_RES *result = NULL;
		MYSQL_ROW row;

		/* check to see if we are hearing about this time for the
		 * first time.
		 */
		query = xstrdup_printf("select job_db_inx "
				       "from \"%s_%s\" where id_job=%u and "
				       "time_submit=%ld and time_eligible=%ld "
				       "and time_start=%ld;",
				       mysql_conn->cluster_name,
				       job_table, job_ptr->job_id,
				       submit_time, begin_time, start_time);
		debug3("%d(%s:%d) query\n%s",
		       mysql_conn->conn, THIS_FILE, __LINE__, query);
		if (!(result =
		      mysql_db_query_ret(mysql_conn, query, 0))) {
			xfree(query);
			slurm_mutex_unlock(&rollup_lock);
			return SLURM_ERROR;
		}
		xfree(query);
		if ((row = mysql_fetch_row(result))) {
			mysql_free_result(result);
			debug4("revieved an update for a "
			       "job (%u) already known about",
			       job_ptr->job_id);
			slurm_mutex_unlock(&rollup_lock);
			goto no_rollup_change;
		}
		mysql_free_result(result);

		if (job_ptr->start_time)
			debug("Need to reroll usage from %sJob %u "
			      "from %s started then and we are just "
			      "now hearing about it.",
			      slurm_ctime(&check_time),
			      job_ptr->job_id, mysql_conn->cluster_name);
		else if (begin_time)
			debug("Need to reroll usage from %sJob %u "
			      "from %s became eligible then and we are just "
			      "now hearing about it.",
			      slurm_ctime(&check_time),
			      job_ptr->job_id, mysql_conn->cluster_name);
		else
			debug("Need to reroll usage from %sJob %u "
			      "from %s was submitted then and we are just "
			      "now hearing about it.",
			      slurm_ctime(&check_time),
			      job_ptr->job_id, mysql_conn->cluster_name);

		global_last_rollup = check_time;
		slurm_mutex_unlock(&rollup_lock);

		/* If the times here are later than the daily_rollup
		   or monthly rollup it isn't a big deal since they
		   are always shrunk down to the beginning of each
		   time period.
		*/
		query = xstrdup_printf("update \"%s_%s\" set "
				       "hourly_rollup=%ld, "
				       "daily_rollup=%ld, monthly_rollup=%ld",
				       mysql_conn->cluster_name,
				       last_ran_table, check_time,
				       check_time, check_time);
		debug3("%d(%s:%d) query\n%s",
		       mysql_conn->conn, THIS_FILE, __LINE__, query);
		rc = mysql_db_query(mysql_conn, query);
		xfree(query);
	} else
		slurm_mutex_unlock(&rollup_lock);

no_rollup_change:

	if (job_ptr->name && job_ptr->name[0])
		jname = slurm_add_slash_to_quotes(job_ptr->name);
	else {
		jname = xstrdup("allocation");
		track_steps = 1;
	}

	if (job_ptr->nodes && job_ptr->nodes[0])
		nodes = job_ptr->nodes;
	else
		nodes = "None assigned";

	if (job_ptr->batch_flag)
		track_steps = 1;

	if (slurmdbd_conf) {
		block_id = xstrdup(job_ptr->comment);
		node_cnt = job_ptr->total_nodes;
		node_inx = job_ptr->network;
	} else {
		char temp_bit[BUF_SIZE];

		if (job_ptr->node_bitmap) {
			node_inx = bit_fmt(temp_bit, sizeof(temp_bit),
					   job_ptr->node_bitmap);
		}
#ifdef HAVE_BG
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_BLOCK_ID,
					    &block_id);
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &node_cnt);
#else
		node_cnt = job_ptr->total_nodes;
#endif
	}

	/* If there is a start_time get the wckeyid.  If the job is
	 * cancelled before the job starts we also want to grab it. */
	if (job_ptr->assoc_id
	    && (job_ptr->start_time || IS_JOB_CANCELLED(job_ptr)))
		wckeyid = _get_wckeyid(mysql_conn, &job_ptr->wckey,
				       job_ptr->user_id,
				       mysql_conn->cluster_name,
				       job_ptr->assoc_id);

	if (job_ptr->partition)
		partition = slurm_add_slash_to_quotes(job_ptr->partition);

	if (job_ptr->gres_req)
		gres_req = slurm_add_slash_to_quotes(job_ptr->gres_req);

	if (job_ptr->gres_alloc)
		gres_alloc = slurm_add_slash_to_quotes(job_ptr->gres_alloc);

	if (!job_ptr->db_index) {
		if (!begin_time)
			begin_time = submit_time;
		query = xstrdup_printf(
			"insert into \"%s_%s\" "
			"(id_job, id_array_job, id_array_task, "
			"id_assoc, id_qos, id_wckey, id_user, "
			"id_group, nodelist, id_resv, timelimit, "
			"time_eligible, time_submit, time_start, "
			"job_name, track_steps, state, priority, cpus_req, "
			"cpus_alloc, nodes_alloc, mem_req",
			mysql_conn->cluster_name, job_table);

		if (job_ptr->account)
			xstrcat(query, ", account");
		if (partition)
			xstrcat(query, ", `partition`");
		if (block_id)
			xstrcat(query, ", id_block");
		if (job_ptr->wckey)
			xstrcat(query, ", wckey");
		if (node_inx)
			xstrcat(query, ", node_inx");
		if (gres_req)
			xstrcat(query, ", gres_req");
		if (gres_alloc)
			xstrcat(query, ", gres_alloc");

		xstrfmtcat(query,
			   ") values (%u, %u, %u, %u, %u, %u, %u, %u, "
			   "'%s', %u, %u, %ld, %ld, %ld, "
			   "'%s', %u, %u, %u, %u, %u, %u, %u",
			   job_ptr->job_id, job_ptr->array_job_id,
			   job_ptr->array_task_id, job_ptr->assoc_id,
			   job_ptr->qos_id, wckeyid,
			   job_ptr->user_id, job_ptr->group_id, nodes,
			   job_ptr->resv_id, job_ptr->time_limit,
			   begin_time, submit_time, start_time,
			   jname, track_steps, job_state,
			   job_ptr->priority, job_ptr->details->min_cpus,
			   job_ptr->total_cpus, node_cnt,
			   job_ptr->details->pn_min_memory);

		if (job_ptr->account)
			xstrfmtcat(query, ", '%s'", job_ptr->account);
		if (partition)
			xstrfmtcat(query, ", '%s'", partition);
		if (block_id)
			xstrfmtcat(query, ", '%s'", block_id);
		if (job_ptr->wckey)
			xstrfmtcat(query, ", '%s'", job_ptr->wckey);
		if (node_inx)
			xstrfmtcat(query, ", '%s'", node_inx);
		if (gres_req)
			xstrfmtcat(query, ", '%s'", gres_req);
		if (gres_alloc)
			xstrfmtcat(query, ", '%s'", gres_alloc);

		xstrfmtcat(query,
			   ") on duplicate key update "
			   "job_db_inx=LAST_INSERT_ID(job_db_inx), "
			   "id_wckey=%u, id_user=%u, id_group=%u, "
			   "nodelist='%s', id_resv=%u, timelimit=%u, "
			   "time_submit=%ld, time_start=%ld, "
			   "job_name='%s', track_steps=%u, id_qos=%u, "
			   "state=greatest(state, %u), priority=%u, "
			   "cpus_req=%u, cpus_alloc=%u, nodes_alloc=%u, "
			   "mem_req=%u, id_array_job=%u, id_array_task=%u",
			   wckeyid, job_ptr->user_id, job_ptr->group_id, nodes,
			   job_ptr->resv_id, job_ptr->time_limit,
			   submit_time, start_time,
			   jname, track_steps, job_ptr->qos_id, job_state,
			   job_ptr->priority, job_ptr->details->min_cpus,
			   job_ptr->total_cpus, node_cnt,
			   job_ptr->details->pn_min_memory,
			   job_ptr->array_job_id,
			   job_ptr->array_task_id);

		if (job_ptr->account)
			xstrfmtcat(query, ", account='%s'", job_ptr->account);
		if (partition)
			xstrfmtcat(query, ", `partition`='%s'", partition);
		if (block_id)
			xstrfmtcat(query, ", id_block='%s'", block_id);
		if (job_ptr->wckey)
			xstrfmtcat(query, ", wckey='%s'", job_ptr->wckey);
		if (node_inx)
			xstrfmtcat(query, ", node_inx='%s'", node_inx);
		if (gres_req)
			xstrfmtcat(query, ", gres_req='%s'", gres_req);
		if (gres_alloc)
			xstrfmtcat(query, ", gres_alloc='%s'", gres_alloc);

		debug3("%d(%s:%d) query\n%s",
		       mysql_conn->conn, THIS_FILE, __LINE__, query);
	try_again:
		if (!(job_ptr->db_index = mysql_db_insert_ret_id(
			      mysql_conn, query))) {
			if (!reinit) {
				error("It looks like the storage has gone "
				      "away trying to reconnect");
				mysql_db_close_db_connection(
					mysql_conn);
				/* reconnect */
				check_connection(mysql_conn);
				reinit = 1;
				goto try_again;
			} else
				rc = SLURM_ERROR;
		}
	} else {
		query = xstrdup_printf("update \"%s_%s\" set nodelist='%s', ",
				       mysql_conn->cluster_name,
				       job_table, nodes);

		if (job_ptr->account)
			xstrfmtcat(query, "account='%s', ", job_ptr->account);
		if (partition)
			xstrfmtcat(query, "`partition`='%s', ", partition);
		if (block_id)
			xstrfmtcat(query, "id_block='%s', ", block_id);
		if (job_ptr->wckey)
			xstrfmtcat(query, "wckey='%s', ", job_ptr->wckey);
		if (node_inx)
			xstrfmtcat(query, "node_inx='%s', ", node_inx);
		if (gres_req)
			xstrfmtcat(query, "gres_req='%s', ", gres_req);
		if (gres_alloc)
			xstrfmtcat(query, "gres_alloc='%s', ", gres_alloc);

		xstrfmtcat(query, "time_start=%ld, job_name='%s', state=%u, "
			   "cpus_alloc=%u, nodes_alloc=%u, id_qos=%u, "
			   "id_assoc=%u, id_wckey=%u, id_resv=%u, "
			   "timelimit=%u, mem_req=%u, "
			   "id_array_job=%u, id_array_task=%u, "
			   "time_eligible=%ld where job_db_inx=%d",
			   start_time, jname, job_state,
			   job_ptr->total_cpus, node_cnt, job_ptr->qos_id,
			   job_ptr->assoc_id, wckeyid,
			   job_ptr->resv_id, job_ptr->time_limit,
			   job_ptr->details->pn_min_memory,
			   job_ptr->array_job_id,
			   job_ptr->array_task_id,
			   begin_time, job_ptr->db_index);

		debug3("%d(%s:%d) query\n%s",
		       mysql_conn->conn, THIS_FILE, __LINE__, query);
		rc = mysql_db_query(mysql_conn, query);
	}

	xfree(block_id);
	xfree(partition);
	xfree(gres_req);
	xfree(gres_alloc);
	xfree(jname);
	xfree(query);

	/* now we will reset all the steps */
	if (IS_JOB_RESIZING(job_ptr)) {
		/* FIXME : Verify this is still needed */
		if (IS_JOB_SUSPENDED(job_ptr))
			as_mysql_suspend(mysql_conn, job_db_inx, job_ptr);
	}

	return rc;
}
Esempio n. 6
0
extern int as_mysql_suspend(mysql_conn_t *mysql_conn,
			    uint32_t old_db_inx,
			    struct job_record *job_ptr)
{
	char *query = NULL;
	int rc = SLURM_SUCCESS;
	time_t submit_time;
	uint32_t job_db_inx;

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	if (job_ptr->resize_time)
		submit_time = job_ptr->resize_time;
	else
		submit_time = job_ptr->details->submit_time;

	if (!job_ptr->db_index) {
		if (!(job_ptr->db_index =
		      _get_db_index(mysql_conn,
				    submit_time,
				    job_ptr->job_id,
				    job_ptr->assoc_id))) {
			/* If we get an error with this just fall
			 * through to avoid an infinite loop
			 */
			if (as_mysql_job_start(
				    mysql_conn, job_ptr) == SLURM_ERROR) {
				error("couldn't suspend job %u",
				      job_ptr->job_id);
				return SLURM_SUCCESS;
			}
		}
	}

	if (IS_JOB_RESIZING(job_ptr)) {
		if (!old_db_inx) {
			error("No old db inx given for job %u cluster %s, "
			      "can't update suspend table.",
			      job_ptr->job_id, mysql_conn->cluster_name);
			return SLURM_ERROR;
		}
		job_db_inx = old_db_inx;
		xstrfmtcat(query,
			   "update \"%s_%s\" set time_end=%d where "
			   "job_db_inx=%u && time_end=0;",
			   mysql_conn->cluster_name, suspend_table,
			   (int)job_ptr->suspend_time, job_db_inx);

	} else
		job_db_inx = job_ptr->db_index;

	/* use job_db_inx for this one since we want to update the
	   supend time of the job before it was resized.
	*/
	xstrfmtcat(query,
		   "update \"%s_%s\" set time_suspended=%d-time_suspended, "
		   "state=%d where job_db_inx=%d;",
		   mysql_conn->cluster_name, job_table,
		   (int)job_ptr->suspend_time,
		   job_ptr->job_state & JOB_STATE_BASE,
		   job_db_inx);
	if (IS_JOB_SUSPENDED(job_ptr))
		xstrfmtcat(query,
			   "insert into \"%s_%s\" (job_db_inx, id_assoc, "
			   "time_start, time_end) values (%u, %u, %d, 0);",
			   mysql_conn->cluster_name, suspend_table,
			   job_ptr->db_index, job_ptr->assoc_id,
			   (int)job_ptr->suspend_time);
	else
		xstrfmtcat(query,
			   "update \"%s_%s\" set time_end=%d where "
			   "job_db_inx=%u && time_end=0;",
			   mysql_conn->cluster_name, suspend_table,
			   (int)job_ptr->suspend_time, job_ptr->db_index);
	debug3("%d(%s:%d) query\n%s",
	       mysql_conn->conn, THIS_FILE, __LINE__, query);

	rc = mysql_db_query(mysql_conn, query);

	xfree(query);
	if (rc != SLURM_ERROR) {
		xstrfmtcat(query,
			   "update \"%s_%s\" set "
			   "time_suspended=%u-time_suspended, "
			   "state=%d where job_db_inx=%u and time_end=0",
			   mysql_conn->cluster_name, step_table,
			   (int)job_ptr->suspend_time,
			   job_ptr->job_state, job_ptr->db_index);
		rc = mysql_db_query(mysql_conn, query);
		xfree(query);
	}

	return rc;
}
Esempio n. 7
0
/*
 * js_pg_job_start - load into the storage the start of a job
 *
 * IN pg_conn: database connection
 * IN cluster_name: cluster of the job
 * IN job_ptr: job just started
 * RET: error code
 */
extern int
js_pg_job_start(pgsql_conn_t *pg_conn,
		struct job_record *job_ptr)
{
	int rc=SLURM_SUCCESS, track_steps = 0, reinit = 0;
	char *jname = NULL, *nodes = NULL, *node_inx = NULL;
	char *block_id = NULL, *rec = NULL, *query = NULL;
	time_t begin_time, check_time, start_time, submit_time;
	int job_state, node_cnt = 0;
	uint32_t wckeyid = 0;

	if ((!job_ptr->details || !job_ptr->details->submit_time)
	    && !job_ptr->resize_time) {
		error("as/pg: job_start: Not inputing this job, "
		      "it has no submit time.");
		return SLURM_ERROR;
	}

	if (check_db_connection(pg_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	if (! cluster_in_db(pg_conn, pg_conn->cluster_name) ) {
		error("cluster %s not in db", pg_conn->cluster_name);
		return SLURM_ERROR;
	}

	debug3("as/pg: job_start() called");

	job_state = job_ptr->job_state;

	/* Since we need a new db_inx make sure the old db_inx
	 * removed. This is most likely the only time we are going to
	 * be notified of the change also so make the state without
	 * the resize. */
	if (IS_JOB_RESIZING(job_ptr)) {
		/* If we have a db_index lets end the previous record. */
		if (job_ptr->db_index)
			js_pg_job_complete(pg_conn, job_ptr);
		else
			error("We don't have a db_index for job %u, "
			      "this should never happen.", job_ptr->job_id);
		job_state &= (~JOB_RESIZING);
		job_ptr->db_index = 0;
	}

	job_state &= JOB_STATE_BASE;

	if (job_ptr->resize_time) {
		begin_time  = job_ptr->resize_time;
		submit_time = job_ptr->resize_time;
		start_time  = job_ptr->resize_time;
	} else {
		begin_time  = job_ptr->details->begin_time;
		submit_time = job_ptr->details->submit_time;
		start_time  = job_ptr->start_time;
	}

	/* See what we are hearing about here if no start time. If
	 * this job latest time is before the last roll up we will
	 * need to reset it to look at this job. */
	if (start_time)
		check_time = start_time;
	else if (begin_time)
		check_time = begin_time;
	else
		check_time = submit_time;

	slurm_mutex_lock(&usage_rollup_lock);
	if (check_time < global_last_rollup) {
		PGresult *result = NULL;
		/* check to see if we are hearing about this time for the
		 * first time.
		 */
		query = xstrdup_printf(
			"SELECT job_db_inx FROM %s.%s WHERE id_job=%u AND "
			"time_submit=%ld AND time_eligible=%ld AND time_start=%ld",
			pg_conn->cluster_name, job_table, job_ptr->job_id,
			submit_time, begin_time, start_time);
		result = DEF_QUERY_RET;
		if (!result) {
			 slurm_mutex_unlock(&usage_rollup_lock);
			return SLURM_ERROR;
		}
		if (PQntuples(result) != 0) {
			PQclear(result);
			debug4("revieved an update for a "
			       "job (%u) already known about",
			       job_ptr->job_id);
			 slurm_mutex_unlock(&usage_rollup_lock);
			goto no_rollup_change;
		}
		PQclear(result);

		if (job_ptr->start_time)
			debug("Need to reroll usage from %s Job %u "
			      "from %s started then and we are just "
			      "now hearing about it.",
			      ctime(&check_time),
			      job_ptr->job_id, pg_conn->cluster_name);
		else if (begin_time)
			debug("Need to reroll usage from %s Job %u "
			      "from %s became eligible then and we are just "
			      "now hearing about it.",
			      ctime(&check_time),
			      job_ptr->job_id, pg_conn->cluster_name);
		else
			debug("Need to reroll usage from %s Job %u "
			      "from %s was submitted then and we are just "
			      "now hearing about it.",
			      ctime(&check_time),
			      job_ptr->job_id, pg_conn->cluster_name);

		global_last_rollup = check_time;
		slurm_mutex_unlock(&usage_rollup_lock);

		query = xstrdup_printf("UPDATE %s.%s SET hourly_rollup=%ld, "
				       "daily_rollup=%ld, monthly_rollup=%ld",
				       pg_conn->cluster_name, last_ran_table,
				       check_time, check_time, check_time);
		rc = DEF_QUERY_RET_RC;
	} else
		slurm_mutex_unlock(&usage_rollup_lock);

no_rollup_change:

	if (job_ptr->name && job_ptr->name[0])
		jname = xstrdup(job_ptr->name);
	else {
		jname = xstrdup("allocation");
		track_steps = 1;
	}

	if (job_ptr->nodes && job_ptr->nodes[0])
		nodes = job_ptr->nodes;
	else
		nodes = "None assigned";

	if (job_ptr->batch_flag)
		track_steps = 1;

	if (slurmdbd_conf) {
		block_id = xstrdup(job_ptr->comment);
		node_cnt = job_ptr->total_nodes;
		node_inx = job_ptr->network;
	} else {
		char temp_bit[BUF_SIZE];

		if (job_ptr->node_bitmap) {
			node_inx = bit_fmt(temp_bit, sizeof(temp_bit),
					   job_ptr->node_bitmap);
		}
#ifdef HAVE_BG
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_BLOCK_ID,
					    &block_id);
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &node_cnt);
#else
		node_cnt = job_ptr->total_nodes;
#endif
	}

	/* If there is a start_time get the wckeyid.  If the job is
	 * cancelled before the job starts we also want to grab it. */
	if (job_ptr->assoc_id &&
	   (job_ptr->start_time || IS_JOB_CANCELLED(job_ptr)))
		wckeyid = get_wckeyid(pg_conn, &job_ptr->wckey,
				      job_ptr->user_id, pg_conn->cluster_name,
				      job_ptr->assoc_id);

	if (!job_ptr->db_index) {
		if (!begin_time)
			begin_time = submit_time;

		rec = xstrdup_printf(
			"(0, 0, '%s', '%s', %d, %d, 0, '%s', "
			"%d, '%s', %d, %d, %d, %d, %d, %d, 0, "
			"%d, %ld, %ld, %ld, 0, 0, "
			"%d, '%s', '%s', %d, %d, '%s', %d)",
			/* job_db_inx=0, not used */
			/* deleted=0 */
			job_ptr->account ?: "",     /* account */
			job_ptr->partition ?: "",   /* partition */
			(int)job_ptr->details->min_cpus, /* cpus_req */
			(int)job_ptr->total_cpus, /* cpus_alloc */
			/* exit_code=0 */
			jname,  /* job_name */

			(int)job_ptr->assoc_id, /* id_assoc */
			block_id ?: "", /* id_block */
			(int)job_ptr->job_id,   /* id_job */
			(int)job_ptr->qos_id,   /* id_qos */
			(int)job_ptr->resv_id,  /* id_resv */
			(int)wckeyid,           /* id_wckey */
			(int)job_ptr->user_id,  /* uid */
			(int)job_ptr->group_id, /* gid */
			/* kill_requid=0 */

			(int)job_ptr->time_limit, /* timelimit */
			submit_time,         /* time_submit */
			begin_time,          /* time_eligible */
			start_time,          /* time_start */
			/* time_end=0 */
			/* time_suspended=0 */

			(int)node_cnt, /* nodes_alloc */
			nodes ?: "",                    /* nodelist */
			node_inx ?: "",         /* node_inx */
			(int)job_ptr->priority, /* priority */
			(int)job_state,         /* state */
			job_ptr->wckey ?: "",   /* wckey */
			(int)track_steps);

		query = xstrdup_printf("SELECT %s.add_job_start(%s);",
				       pg_conn->cluster_name, rec);
		xfree(rec);

	try_again:
		DEBUG_QUERY;
		job_ptr->db_index = pgsql_query_ret_id(pg_conn->db_conn,
						       query);
		if (!job_ptr->db_index) {
			if (!reinit) {
				error("It looks like the storage has gone "
				      "away trying to reconnect");
				check_db_connection(pg_conn);
				reinit = 1;
				goto try_again;
			} else
				rc = SLURM_ERROR;
		}
		xfree(query);
	} else {