/* * load into the storage the end of a job */ extern int jobacct_storage_p_job_complete(void *db_conn, struct job_record *job_ptr) { char buf[BUFFER_SIZE]; uint16_t job_state; int duration; uint32_t exit_code; if (!storage_init) { debug("jobacct init was not called or it failed"); return SLURM_ERROR; } debug2("slurmdb_job_complete() called"); if (IS_JOB_RESIZING(job_ptr)) { job_state = JOB_RESIZING; if (job_ptr->resize_time) duration = time(NULL) - job_ptr->resize_time; else duration = time(NULL) - job_ptr->start_time; } else { if (job_ptr->end_time == 0) { debug("jobacct: job %u never started", job_ptr->job_id); return SLURM_ERROR; } job_state = job_ptr->job_state & JOB_STATE_BASE; if (job_ptr->resize_time) duration = job_ptr->end_time - job_ptr->resize_time; else duration = job_ptr->end_time - job_ptr->start_time; } exit_code = job_ptr->exit_code; if (exit_code == 1) { /* This wasn't signalled, it was set by Slurm so don't * treat it like a signal. */ exit_code = 256; } /* leave the requid as a %d since we want to see if it is -1 in stats */ snprintf(buf, BUFFER_SIZE, "%d %d %u %u %u", JOB_TERMINATED, duration, job_state, job_ptr->requid, exit_code); return _print_record(job_ptr, job_ptr->end_time, buf); }
static struct jobcomp_info * _jobcomp_info_create (struct job_record *job) { enum job_states state; struct jobcomp_info * j = xmalloc (sizeof (*j)); j->jobid = job->job_id; j->uid = job->user_id; j->gid = job->group_id; j->name = xstrdup (job->name); j->array_job_id = job->array_job_id; j->array_task_id = job->array_task_id; if (IS_JOB_RESIZING(job)) { state = JOB_RESIZING; j->jobstate = xstrdup (job_state_string (state)); if (job->resize_time) j->start = job->resize_time; else j->start = job->start_time; j->end = time(NULL); } else { /* Job state will typically have JOB_COMPLETING or JOB_RESIZING * flag set when called. We remove the flags to get the eventual * completion state: JOB_FAILED, JOB_TIMEOUT, etc. */ state = job->job_state & JOB_STATE_BASE; j->jobstate = xstrdup (job_state_string (state)); if (job->resize_time) j->start = job->resize_time; else if (job->start_time > job->end_time) { /* Job cancelled while pending and * expected start time is in the future. */ j->start = 0; } else j->start = job->start_time; j->end = job->end_time; } j->partition = xstrdup (job->partition); if ((job->time_limit == NO_VAL) && job->part_ptr) j->limit = job->part_ptr->max_time; else j->limit = job->time_limit; j->submit = job->details ? job->details->submit_time:job->start_time; j->batch_flag = job->batch_flag; j->nodes = xstrdup (job->nodes); j->nprocs = job->total_cpus; j->nnodes = job->node_cnt; j->account = job->account ? 
xstrdup (job->account) : NULL; if (job->details && job->details->work_dir) j->work_dir = xstrdup(job->details->work_dir); else j->work_dir = xstrdup("unknown"); if (job->details) { if (job->details->std_in) j->std_in = xstrdup(job->details->std_in); if (job->details->std_out) j->std_out = xstrdup(job->details->std_out); if (job->details->std_err) j->std_err = xstrdup(job->details->std_err); } #ifdef HAVE_BG j->connect_type = select_g_select_jobinfo_xstrdup(job->select_jobinfo, SELECT_PRINT_CONNECTION); j->geometry = select_g_select_jobinfo_xstrdup(job->select_jobinfo, SELECT_PRINT_GEOMETRY); j->blockid = select_g_select_jobinfo_xstrdup(job->select_jobinfo, SELECT_PRINT_BG_ID); #endif return (j); }
extern int slurm_jobcomp_log_record(struct job_record *job_ptr) { int rc = SLURM_SUCCESS; char *usr_str = NULL, *grp_str = NULL, lim_str[32]; char *connect_type = NULL, *reboot = NULL, *rotate = NULL, *geometry = NULL, *start = NULL, *blockid = NULL; enum job_states job_state; char *query = NULL; uint32_t time_limit, start_time, end_time; if(!jobcomp_mysql_conn || mysql_db_ping(jobcomp_mysql_conn) != 0) { char *loc = slurm_get_jobcomp_loc(); if(slurm_jobcomp_set_location(loc) == SLURM_ERROR) { xfree(loc); return SLURM_ERROR; } xfree(loc); } usr_str = _get_user_name(job_ptr->user_id); grp_str = _get_group_name(job_ptr->group_id); if ((job_ptr->time_limit == NO_VAL) && job_ptr->part_ptr) time_limit = job_ptr->part_ptr->max_time; else time_limit = job_ptr->time_limit; if (time_limit == INFINITE) strcpy(lim_str, "UNLIMITED"); else { snprintf(lim_str, sizeof(lim_str), "%lu", (unsigned long) time_limit); } /* Job will typically be COMPLETING when this is called. * We remove the flags to get the eventual completion state: * JOB_FAILED, JOB_TIMEOUT, etc. */ if (IS_JOB_RESIZING(job_ptr)) { job_state = JOB_RESIZING; if (job_ptr->resize_time) start_time = job_ptr->resize_time; else start_time = job_ptr->start_time; end_time = time(NULL); } else { job_state = job_ptr->job_state & JOB_STATE_BASE; if (job_ptr->resize_time) start_time = job_ptr->resize_time; else if (job_ptr->start_time > job_ptr->end_time) { /* Job cancelled while pending and * expected start time is in the future. 
*/ start_time = 0; } else start_time = job_ptr->start_time; end_time = job_ptr->end_time; } connect_type = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo, SELECT_PRINT_CONNECTION); reboot = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo, SELECT_PRINT_REBOOT); rotate = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo, SELECT_PRINT_ROTATE); geometry = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo, SELECT_PRINT_GEOMETRY); start = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo, SELECT_PRINT_START); #ifdef HAVE_BG blockid = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo, SELECT_PRINT_BG_ID); #else blockid = select_g_select_jobinfo_xstrdup(job_ptr->select_jobinfo, SELECT_PRINT_RESV_ID); #endif query = xstrdup_printf( "insert into %s (jobid, uid, user_name, gid, group_name, " "name, state, proc_cnt, partition, timelimit, " "starttime, endtime, nodecnt", jobcomp_table); if(job_ptr->nodes) xstrcat(query, ", nodelist"); if(connect_type) xstrcat(query, ", connect_type"); if(reboot) xstrcat(query, ", reboot"); if(rotate) xstrcat(query, ", rotate"); if(job_ptr->details && (job_ptr->details->max_cpus != NO_VAL)) xstrcat(query, ", maxprocs"); if(geometry) xstrcat(query, ", geometry"); if(start) xstrcat(query, ", start"); if(blockid) xstrcat(query, ", blockid"); xstrfmtcat(query, ") values (%u, %u, '%s', %u, '%s', \"%s\", %d, %u, " "'%s', \"%s\", %u, %u, %u", job_ptr->job_id, job_ptr->user_id, usr_str, job_ptr->group_id, grp_str, job_ptr->name, job_state, job_ptr->total_cpus, job_ptr->partition, lim_str, start_time, end_time, job_ptr->node_cnt); if(job_ptr->nodes) xstrfmtcat(query, ", '%s'", job_ptr->nodes); if(connect_type) { xstrfmtcat(query, ", '%s'", connect_type); xfree(connect_type); } if(reboot) { xstrfmtcat(query, ", '%s'", reboot); xfree(reboot); } if(rotate) { xstrfmtcat(query, ", '%s'", rotate); xfree(rotate); } if(job_ptr->details && (job_ptr->details->max_cpus != NO_VAL)) xstrfmtcat(query, ", '%u'", 
job_ptr->details->max_cpus); if(geometry) { xstrfmtcat(query, ", '%s'", geometry); xfree(geometry); } if(start) { xstrfmtcat(query, ", '%s'", start); xfree(start); } if(blockid) { xstrfmtcat(query, ", '%s'", blockid); xfree(blockid); } xstrcat(query, ")"); //info("query = %s", query); rc = mysql_db_query(jobcomp_mysql_conn, query); xfree(usr_str); xfree(grp_str); return rc; }
/*
 * as_mysql_job_complete - record the end of a job in the accounting
 * database.  Also rewinds the usage-rollup markers when the completion
 * time is earlier than the last rollup, and (re)creates the job-start
 * record if the job is not yet in the database.
 *
 * IN mysql_conn: accounting storage connection
 * IN job_ptr: job that just completed (or is being resized)
 * RET: SLURM_SUCCESS, SLURM_ERROR, or ESLURM_DB_CONNECTION
 */
extern int as_mysql_job_complete(mysql_conn_t *mysql_conn,
				 struct job_record *job_ptr)
{
	char *query = NULL;
	int rc = SLURM_SUCCESS, job_state;
	time_t submit_time, end_time;

	if (!job_ptr->db_index
	    && ((!job_ptr->details || !job_ptr->details->submit_time)
		&& !job_ptr->resize_time)) {
		error("as_mysql_job_complete: "
		      "Not inputing this job, it has no submit time.");
		return SLURM_ERROR;
	}

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	debug2("as_mysql_slurmdb_job_complete() called");

	if (job_ptr->resize_time)
		submit_time = job_ptr->resize_time;
	else
		submit_time = job_ptr->details->submit_time;

	if (IS_JOB_RESIZING(job_ptr)) {
		/* The record being closed ends at the resize point. */
		end_time = job_ptr->resize_time;
		job_state = JOB_RESIZING;
	} else {
		/* If we get an error with this just fall through to avoid an
		 * infinite loop */
		if (job_ptr->end_time == 0) {
			debug("as_mysql_jobacct: job %u never started",
			      job_ptr->job_id);
			return SLURM_SUCCESS;
		}
		end_time = job_ptr->end_time;
		job_state = job_ptr->job_state & JOB_STATE_BASE;
	}

	/* If this completion predates the last rollup, pull the rollup
	 * markers back so usage is recomputed over this job. */
	slurm_mutex_lock(&rollup_lock);
	if (end_time < global_last_rollup) {
		/* Use the locally computed end_time (resize_time for a
		 * resizing job), not job_ptr->end_time, so the marker
		 * matches the times written below. */
		global_last_rollup = end_time;
		slurm_mutex_unlock(&rollup_lock);

		query = xstrdup_printf("update \"%s_%s\" set "
				       "hourly_rollup=%ld, "
				       "daily_rollup=%ld, monthly_rollup=%ld",
				       mysql_conn->cluster_name,
				       last_ran_table, end_time,
				       end_time, end_time);
		debug3("%d(%s:%d) query\n%s",
		       mysql_conn->conn, THIS_FILE, __LINE__, query);
		rc = mysql_db_query(mysql_conn, query);
		xfree(query);
	} else
		slurm_mutex_unlock(&rollup_lock);

	if (!job_ptr->db_index) {
		if (!(job_ptr->db_index =
		      _get_db_index(mysql_conn,
				    submit_time,
				    job_ptr->job_id,
				    job_ptr->assoc_id))) {
			/* Comment is overloaded in job_start to be
			   the block_id, so we will need to store this
			   for later.
			*/
			char *comment = job_ptr->comment;
			job_ptr->comment = NULL;
			/* If we get an error with this just fall
			 * through to avoid an infinite loop
			 */
			if (as_mysql_job_start(
				    mysql_conn, job_ptr) == SLURM_ERROR) {
				job_ptr->comment = comment;
				error("couldn't add job %u at job completion",
				      job_ptr->job_id);
				return SLURM_SUCCESS;
			}
			job_ptr->comment = comment;
		}
	}

	/*
	 * make sure we handle any quotes that may be in the comment
	 */
	query = xstrdup_printf("update \"%s_%s\" set "
			       "time_end=%ld, state=%d",
			       mysql_conn->cluster_name, job_table,
			       end_time, job_state);

	if (job_ptr->derived_ec != NO_VAL)
		xstrfmtcat(query, ", derived_ec=%u", job_ptr->derived_ec);

	if (job_ptr->comment) {
		char *comment = slurm_add_slash_to_quotes(job_ptr->comment);
		xstrfmtcat(query, ", derived_es='%s'", comment);
		xfree(comment);
	}

	xstrfmtcat(query,
		   ", exit_code=%d, kill_requid=%d where job_db_inx=%d;",
		   job_ptr->exit_code, job_ptr->requid,
		   job_ptr->db_index);

	debug3("%d(%s:%d) query\n%s",
	       mysql_conn->conn, THIS_FILE, __LINE__, query);
	rc = mysql_db_query(mysql_conn, query);
	xfree(query);

	return rc;
}
/*
 * as_mysql_job_start - record the start of a job in the accounting
 * database.  Handles resizing jobs (closes the old record and starts a
 * new one), rewinds the usage-rollup markers when hearing about an old
 * job for the first time, and either inserts a new row or updates the
 * existing one depending on whether a db_index is already known.
 *
 * IN mysql_conn: accounting storage connection
 * IN job_ptr: job that just started
 * RET: SLURM_SUCCESS, SLURM_ERROR, or ESLURM_DB_CONNECTION
 */
extern int as_mysql_job_start(mysql_conn_t *mysql_conn,
			      struct job_record *job_ptr)
{
	int rc=SLURM_SUCCESS;
	char *nodes = NULL, *jname = NULL, *node_inx = NULL;
	int track_steps = 0;
	char *block_id = NULL, *partition = NULL,
		*gres_req = NULL, *gres_alloc = NULL;
	char *query = NULL;
	int reinit = 0;
	time_t begin_time, check_time, start_time, submit_time;
	uint32_t wckeyid = 0;
	int job_state, node_cnt = 0;
	uint32_t job_db_inx = job_ptr->db_index;
	/* Function scope: node_inx may point into this buffer and is used
	 * well after the block where it is filled in (was block-scoped,
	 * which left node_inx dangling — undefined behavior). */
	char temp_bit[BUF_SIZE];

	if ((!job_ptr->details || !job_ptr->details->submit_time)
	    && !job_ptr->resize_time) {
		error("as_mysql_job_start: "
		      "Not inputing this job, it has no submit time.");
		return SLURM_ERROR;
	}

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	debug2("as_mysql_slurmdb_job_start() called");

	job_state = job_ptr->job_state;

	/* A resized job restarts all its times at the resize point. */
	if (job_ptr->resize_time) {
		begin_time  = job_ptr->resize_time;
		submit_time = job_ptr->resize_time;
		start_time  = job_ptr->resize_time;
	} else {
		begin_time  = job_ptr->details->begin_time;
		submit_time = job_ptr->details->submit_time;
		start_time  = job_ptr->start_time;
	}

	/* Since we need a new db_inx make sure the old db_inx
	 * removed. This is most likely the only time we are going to
	 * be notified of the change also so make the state without
	 * the resize. */
	if (IS_JOB_RESIZING(job_ptr)) {
		/* If we have a db_index lets end the previous record. */
		if (!job_ptr->db_index) {
			error("We don't have a db_index for job %u, "
			      "this should only happen when resizing "
			      "jobs and the database interface was down.",
			      job_ptr->job_id);
			job_ptr->db_index = _get_db_index(
				mysql_conn,
				job_ptr->details->submit_time,
				job_ptr->job_id,
				job_ptr->assoc_id);
		}

		if (job_ptr->db_index)
			as_mysql_job_complete(mysql_conn, job_ptr);

		job_state &= (~JOB_RESIZING);
		job_ptr->db_index = 0;
	}

	job_state &= JOB_STATE_BASE;

	/* See what we are hearing about here if no start time. If
	 * this job latest time is before the last roll up we will
	 * need to reset it to look at this job.
	 */
	if (start_time)
		check_time = start_time;
	else if (begin_time)
		check_time = begin_time;
	else
		check_time = submit_time;

	slurm_mutex_lock(&rollup_lock);
	if (check_time < global_last_rollup) {
		MYSQL_RES *result = NULL;
		MYSQL_ROW row;

		/* check to see if we are hearing about this time for the
		 * first time.
		 */
		query = xstrdup_printf("select job_db_inx "
				       "from \"%s_%s\" where id_job=%u and "
				       "time_submit=%ld and time_eligible=%ld "
				       "and time_start=%ld;",
				       mysql_conn->cluster_name,
				       job_table, job_ptr->job_id,
				       submit_time, begin_time, start_time);
		debug3("%d(%s:%d) query\n%s",
		       mysql_conn->conn, THIS_FILE, __LINE__, query);
		if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) {
			xfree(query);
			slurm_mutex_unlock(&rollup_lock);
			return SLURM_ERROR;
		}
		xfree(query);
		if ((row = mysql_fetch_row(result))) {
			mysql_free_result(result);
			debug4("revieved an update for a "
			       "job (%u) already known about",
			       job_ptr->job_id);
			slurm_mutex_unlock(&rollup_lock);
			goto no_rollup_change;
		}
		mysql_free_result(result);

		if (job_ptr->start_time)
			debug("Need to reroll usage from %sJob %u "
			      "from %s started then and we are just "
			      "now hearing about it.",
			      slurm_ctime(&check_time),
			      job_ptr->job_id, mysql_conn->cluster_name);
		else if (begin_time)
			debug("Need to reroll usage from %sJob %u "
			      "from %s became eligible then and we are just "
			      "now hearing about it.",
			      slurm_ctime(&check_time),
			      job_ptr->job_id, mysql_conn->cluster_name);
		else
			debug("Need to reroll usage from %sJob %u "
			      "from %s was submitted then and we are just "
			      "now hearing about it.",
			      slurm_ctime(&check_time),
			      job_ptr->job_id, mysql_conn->cluster_name);

		global_last_rollup = check_time;
		slurm_mutex_unlock(&rollup_lock);

		/* If the times here are later than the daily_rollup or
		   monthly rollup it isn't a big deal since they are always
		   shrunk down to the beginning of each time period.
		*/
		query = xstrdup_printf("update \"%s_%s\" set "
				       "hourly_rollup=%ld, "
				       "daily_rollup=%ld, monthly_rollup=%ld",
				       mysql_conn->cluster_name,
				       last_ran_table, check_time,
				       check_time, check_time);
		debug3("%d(%s:%d) query\n%s",
		       mysql_conn->conn, THIS_FILE, __LINE__, query);
		rc = mysql_db_query(mysql_conn, query);
		xfree(query);
	} else
		slurm_mutex_unlock(&rollup_lock);

no_rollup_change:

	if (job_ptr->name && job_ptr->name[0])
		jname = slurm_add_slash_to_quotes(job_ptr->name);
	else {
		jname = xstrdup("allocation");
		track_steps = 1;
	}

	if (job_ptr->nodes && job_ptr->nodes[0])
		nodes = job_ptr->nodes;
	else
		nodes = "None assigned";

	if (job_ptr->batch_flag)
		track_steps = 1;

	if (slurmdbd_conf) {
		/* When running inside slurmdbd the fields are already
		 * packed into the job record. */
		block_id = xstrdup(job_ptr->comment);
		node_cnt = job_ptr->total_nodes;
		node_inx = job_ptr->network;
	} else {
		if (job_ptr->node_bitmap) {
			node_inx = bit_fmt(temp_bit, sizeof(temp_bit),
					   job_ptr->node_bitmap);
		}
#ifdef HAVE_BG
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_BLOCK_ID,
					    &block_id);
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &node_cnt);
#else
		node_cnt = job_ptr->total_nodes;
#endif
	}

	/* If there is a start_time get the wckeyid. If the job is
	 * cancelled before the job starts we also want to grab it.
	 */
	if (job_ptr->assoc_id
	    && (job_ptr->start_time || IS_JOB_CANCELLED(job_ptr)))
		wckeyid = _get_wckeyid(mysql_conn, &job_ptr->wckey,
				       job_ptr->user_id,
				       mysql_conn->cluster_name,
				       job_ptr->assoc_id);

	if (job_ptr->partition)
		partition = slurm_add_slash_to_quotes(job_ptr->partition);

	if (job_ptr->gres_req)
		gres_req = slurm_add_slash_to_quotes(job_ptr->gres_req);

	if (job_ptr->gres_alloc)
		gres_alloc = slurm_add_slash_to_quotes(job_ptr->gres_alloc);

	if (!job_ptr->db_index) {
		if (!begin_time)
			begin_time = submit_time;
		/* Insert a new row; optional columns are appended only when
		 * a value exists, mirrored by the value lists below. */
		query = xstrdup_printf(
			"insert into \"%s_%s\" "
			"(id_job, id_array_job, id_array_task, "
			"id_assoc, id_qos, id_wckey, id_user, "
			"id_group, nodelist, id_resv, timelimit, "
			"time_eligible, time_submit, time_start, "
			"job_name, track_steps, state, priority, cpus_req, "
			"cpus_alloc, nodes_alloc, mem_req",
			mysql_conn->cluster_name, job_table);

		if (job_ptr->account)
			xstrcat(query, ", account");
		if (partition)
			xstrcat(query, ", `partition`");
		if (block_id)
			xstrcat(query, ", id_block");
		if (job_ptr->wckey)
			xstrcat(query, ", wckey");
		if (node_inx)
			xstrcat(query, ", node_inx");
		if (gres_req)
			xstrcat(query, ", gres_req");
		if (gres_alloc)
			xstrcat(query, ", gres_alloc");

		xstrfmtcat(query,
			   ") values (%u, %u, %u, %u, %u, %u, %u, %u, "
			   "'%s', %u, %u, %ld, %ld, %ld, "
			   "'%s', %u, %u, %u, %u, %u, %u, %u",
			   job_ptr->job_id, job_ptr->array_job_id,
			   job_ptr->array_task_id, job_ptr->assoc_id,
			   job_ptr->qos_id, wckeyid,
			   job_ptr->user_id, job_ptr->group_id, nodes,
			   job_ptr->resv_id, job_ptr->time_limit,
			   begin_time, submit_time, start_time,
			   jname, track_steps, job_state,
			   job_ptr->priority, job_ptr->details->min_cpus,
			   job_ptr->total_cpus, node_cnt,
			   job_ptr->details->pn_min_memory);

		if (job_ptr->account)
			xstrfmtcat(query, ", '%s'", job_ptr->account);
		if (partition)
			xstrfmtcat(query, ", '%s'", partition);
		if (block_id)
			xstrfmtcat(query, ", '%s'", block_id);
		if (job_ptr->wckey)
			xstrfmtcat(query, ", '%s'", job_ptr->wckey);
		if (node_inx)
			xstrfmtcat(query, ", '%s'", node_inx);
		if (gres_req)
			xstrfmtcat(query, ", '%s'", gres_req);
		if (gres_alloc)
			xstrfmtcat(query, ", '%s'", gres_alloc);

		/* The upsert keeps the row current if it already exists
		 * (e.g. duplicate notification of the same start). */
		xstrfmtcat(query,
			   ") on duplicate key update "
			   "job_db_inx=LAST_INSERT_ID(job_db_inx), "
			   "id_wckey=%u, id_user=%u, id_group=%u, "
			   "nodelist='%s', id_resv=%u, timelimit=%u, "
			   "time_submit=%ld, time_start=%ld, "
			   "job_name='%s', track_steps=%u, id_qos=%u, "
			   "state=greatest(state, %u), priority=%u, "
			   "cpus_req=%u, cpus_alloc=%u, nodes_alloc=%u, "
			   "mem_req=%u, id_array_job=%u, id_array_task=%u",
			   wckeyid, job_ptr->user_id, job_ptr->group_id,
			   nodes, job_ptr->resv_id, job_ptr->time_limit,
			   submit_time, start_time, jname, track_steps,
			   job_ptr->qos_id, job_state, job_ptr->priority,
			   job_ptr->details->min_cpus, job_ptr->total_cpus,
			   node_cnt, job_ptr->details->pn_min_memory,
			   job_ptr->array_job_id, job_ptr->array_task_id);

		if (job_ptr->account)
			xstrfmtcat(query, ", account='%s'", job_ptr->account);
		if (partition)
			xstrfmtcat(query, ", `partition`='%s'", partition);
		if (block_id)
			xstrfmtcat(query, ", id_block='%s'", block_id);
		if (job_ptr->wckey)
			xstrfmtcat(query, ", wckey='%s'", job_ptr->wckey);
		if (node_inx)
			xstrfmtcat(query, ", node_inx='%s'", node_inx);
		if (gres_req)
			xstrfmtcat(query, ", gres_req='%s'", gres_req);
		if (gres_alloc)
			xstrfmtcat(query, ", gres_alloc='%s'", gres_alloc);

		debug3("%d(%s:%d) query\n%s",
		       mysql_conn->conn, THIS_FILE, __LINE__, query);
	try_again:
		if (!(job_ptr->db_index = mysql_db_insert_ret_id(
			      mysql_conn, query))) {
			if (!reinit) {
				error("It looks like the storage has gone "
				      "away trying to reconnect");
				mysql_db_close_db_connection(mysql_conn);
				/* reconnect */
				check_connection(mysql_conn);
				reinit = 1;
				goto try_again;
			} else
				rc = SLURM_ERROR;
		}
	} else {
		/* We already know the row; just refresh it. */
		query = xstrdup_printf("update \"%s_%s\" set nodelist='%s', ",
				       mysql_conn->cluster_name,
				       job_table, nodes);

		if (job_ptr->account)
			xstrfmtcat(query, "account='%s', ", job_ptr->account);
		if (partition)
			xstrfmtcat(query, "`partition`='%s', ", partition);
		if (block_id)
			xstrfmtcat(query, "id_block='%s', ", block_id);
		if (job_ptr->wckey)
			xstrfmtcat(query, "wckey='%s', ", job_ptr->wckey);
		if (node_inx)
			xstrfmtcat(query, "node_inx='%s', ", node_inx);
		if (gres_req)
			xstrfmtcat(query, "gres_req='%s', ", gres_req);
		if (gres_alloc)
			xstrfmtcat(query, "gres_alloc='%s', ", gres_alloc);

		xstrfmtcat(query,
			   "time_start=%ld, job_name='%s', state=%u, "
			   "cpus_alloc=%u, nodes_alloc=%u, id_qos=%u, "
			   "id_assoc=%u, id_wckey=%u, id_resv=%u, "
			   "timelimit=%u, mem_req=%u, "
			   "id_array_job=%u, id_array_task=%u, "
			   "time_eligible=%ld where job_db_inx=%d",
			   start_time, jname, job_state,
			   job_ptr->total_cpus, node_cnt, job_ptr->qos_id,
			   job_ptr->assoc_id, wckeyid,
			   job_ptr->resv_id, job_ptr->time_limit,
			   job_ptr->details->pn_min_memory,
			   job_ptr->array_job_id, job_ptr->array_task_id,
			   begin_time, job_ptr->db_index);
		debug3("%d(%s:%d) query\n%s",
		       mysql_conn->conn, THIS_FILE, __LINE__, query);
		rc = mysql_db_query(mysql_conn, query);
	}

	xfree(block_id);
	xfree(partition);
	xfree(gres_req);
	xfree(gres_alloc);
	xfree(jname);
	xfree(query);

	/* now we will reset all the steps */
	if (IS_JOB_RESIZING(job_ptr)) {
		/* FIXME : Verify this is still needed */
		if (IS_JOB_SUSPENDED(job_ptr))
			as_mysql_suspend(mysql_conn, job_db_inx, job_ptr);
	}

	return rc;
}
/*
 * as_mysql_suspend - record a job suspend/resume transition in the
 * accounting database: adjust the job row's accumulated suspended time
 * and state, and open or close a row in the suspend table.
 *
 * IN mysql_conn: accounting storage connection
 * IN old_db_inx: pre-resize db index (required when the job is resizing,
 *                so the suspend row of the old record can be closed)
 * IN job_ptr: job being suspended or resumed
 * RET: SLURM_SUCCESS, SLURM_ERROR, or ESLURM_DB_CONNECTION
 */
extern int as_mysql_suspend(mysql_conn_t *mysql_conn, uint32_t old_db_inx,
			    struct job_record *job_ptr)
{
	char *query = NULL;
	int rc = SLURM_SUCCESS;
	time_t submit_time;
	uint32_t job_db_inx;

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	if (job_ptr->resize_time)
		submit_time = job_ptr->resize_time;
	else
		submit_time = job_ptr->details->submit_time;

	if (!job_ptr->db_index) {
		if (!(job_ptr->db_index =
		      _get_db_index(mysql_conn,
				    submit_time,
				    job_ptr->job_id,
				    job_ptr->assoc_id))) {
			/* If we get an error with this just fall
			 * through to avoid an infinite loop
			 */
			if (as_mysql_job_start(
				    mysql_conn, job_ptr) == SLURM_ERROR) {
				error("couldn't suspend job %u",
				      job_ptr->job_id);
				return SLURM_SUCCESS;
			}
		}
	}

	if (IS_JOB_RESIZING(job_ptr)) {
		if (!old_db_inx) {
			error("No old db inx given for job %u cluster %s, "
			      "can't update suspend table.",
			      job_ptr->job_id, mysql_conn->cluster_name);
			return SLURM_ERROR;
		}
		job_db_inx = old_db_inx;
		/* Close out any open suspend row of the pre-resize record. */
		xstrfmtcat(query,
			   "update \"%s_%s\" set time_end=%d where "
			   "job_db_inx=%u && time_end=0;",
			   mysql_conn->cluster_name, suspend_table,
			   (int)job_ptr->suspend_time, job_db_inx);

	} else
		job_db_inx = job_ptr->db_index;

	/* use job_db_inx for this one since we want to update the
	   supend time of the job before it was resized.
	 */
	/* FIX: argument is cast to int, so the conversion must be %d
	 * (was %u); job_db_inx is uint32_t, so use %u (was %d). */
	xstrfmtcat(query,
		   "update \"%s_%s\" set time_suspended=%d-time_suspended, "
		   "state=%d where job_db_inx=%u;",
		   mysql_conn->cluster_name, job_table,
		   (int)job_ptr->suspend_time,
		   job_ptr->job_state & JOB_STATE_BASE, job_db_inx);
	if (IS_JOB_SUSPENDED(job_ptr))
		xstrfmtcat(query,
			   "insert into \"%s_%s\" (job_db_inx, id_assoc, "
			   "time_start, time_end) values (%u, %u, %d, 0);",
			   mysql_conn->cluster_name, suspend_table,
			   job_ptr->db_index, job_ptr->assoc_id,
			   (int)job_ptr->suspend_time);
	else
		xstrfmtcat(query,
			   "update \"%s_%s\" set time_end=%d where "
			   "job_db_inx=%u && time_end=0;",
			   mysql_conn->cluster_name, suspend_table,
			   (int)job_ptr->suspend_time, job_ptr->db_index);
	debug3("%d(%s:%d) query\n%s",
	       mysql_conn->conn, THIS_FILE, __LINE__, query);

	rc = mysql_db_query(mysql_conn, query);
	xfree(query);
	if (rc != SLURM_ERROR) {
		/* Mirror the suspended-time adjustment onto the steps.
		 * NOTE(review): this uses the raw job_ptr->job_state
		 * (flags included) unlike the job update above which masks
		 * with JOB_STATE_BASE — confirm whether that is intended. */
		xstrfmtcat(query,
			   "update \"%s_%s\" set "
			   "time_suspended=%d-time_suspended, "
			   "state=%d where job_db_inx=%u and time_end=0",
			   mysql_conn->cluster_name, step_table,
			   (int)job_ptr->suspend_time,
			   job_ptr->job_state, job_ptr->db_index);
		rc = mysql_db_query(mysql_conn, query);
		xfree(query);
	}

	return rc;
}
/* * js_pg_job_start - load into the storage the start of a job * * IN pg_conn: database connection * IN cluster_name: cluster of the job * IN job_ptr: job just started * RET: error code */ extern int js_pg_job_start(pgsql_conn_t *pg_conn, struct job_record *job_ptr) { int rc=SLURM_SUCCESS, track_steps = 0, reinit = 0; char *jname = NULL, *nodes = NULL, *node_inx = NULL; char *block_id = NULL, *rec = NULL, *query = NULL; time_t begin_time, check_time, start_time, submit_time; int job_state, node_cnt = 0; uint32_t wckeyid = 0; if ((!job_ptr->details || !job_ptr->details->submit_time) && !job_ptr->resize_time) { error("as/pg: job_start: Not inputing this job, " "it has no submit time."); return SLURM_ERROR; } if (check_db_connection(pg_conn) != SLURM_SUCCESS) return ESLURM_DB_CONNECTION; if (! cluster_in_db(pg_conn, pg_conn->cluster_name) ) { error("cluster %s not in db", pg_conn->cluster_name); return SLURM_ERROR; } debug3("as/pg: job_start() called"); job_state = job_ptr->job_state; /* Since we need a new db_inx make sure the old db_inx * removed. This is most likely the only time we are going to * be notified of the change also so make the state without * the resize. */ if (IS_JOB_RESIZING(job_ptr)) { /* If we have a db_index lets end the previous record. */ if (job_ptr->db_index) js_pg_job_complete(pg_conn, job_ptr); else error("We don't have a db_index for job %u, " "this should never happen.", job_ptr->job_id); job_state &= (~JOB_RESIZING); job_ptr->db_index = 0; } job_state &= JOB_STATE_BASE; if (job_ptr->resize_time) { begin_time = job_ptr->resize_time; submit_time = job_ptr->resize_time; start_time = job_ptr->resize_time; } else { begin_time = job_ptr->details->begin_time; submit_time = job_ptr->details->submit_time; start_time = job_ptr->start_time; } /* See what we are hearing about here if no start time. If * this job latest time is before the last roll up we will * need to reset it to look at this job. 
*/ if (start_time) check_time = start_time; else if (begin_time) check_time = begin_time; else check_time = submit_time; slurm_mutex_lock(&usage_rollup_lock); if (check_time < global_last_rollup) { PGresult *result = NULL; /* check to see if we are hearing about this time for the * first time. */ query = xstrdup_printf( "SELECT job_db_inx FROM %s.%s WHERE id_job=%u AND " "time_submit=%ld AND time_eligible=%ld AND time_start=%ld", pg_conn->cluster_name, job_table, job_ptr->job_id, submit_time, begin_time, start_time); result = DEF_QUERY_RET; if (!result) { slurm_mutex_unlock(&usage_rollup_lock); return SLURM_ERROR; } if (PQntuples(result) != 0) { PQclear(result); debug4("revieved an update for a " "job (%u) already known about", job_ptr->job_id); slurm_mutex_unlock(&usage_rollup_lock); goto no_rollup_change; } PQclear(result); if (job_ptr->start_time) debug("Need to reroll usage from %s Job %u " "from %s started then and we are just " "now hearing about it.", ctime(&check_time), job_ptr->job_id, pg_conn->cluster_name); else if (begin_time) debug("Need to reroll usage from %s Job %u " "from %s became eligible then and we are just " "now hearing about it.", ctime(&check_time), job_ptr->job_id, pg_conn->cluster_name); else debug("Need to reroll usage from %s Job %u " "from %s was submitted then and we are just " "now hearing about it.", ctime(&check_time), job_ptr->job_id, pg_conn->cluster_name); global_last_rollup = check_time; slurm_mutex_unlock(&usage_rollup_lock); query = xstrdup_printf("UPDATE %s.%s SET hourly_rollup=%ld, " "daily_rollup=%ld, monthly_rollup=%ld", pg_conn->cluster_name, last_ran_table, check_time, check_time, check_time); rc = DEF_QUERY_RET_RC; } else slurm_mutex_unlock(&usage_rollup_lock); no_rollup_change: if (job_ptr->name && job_ptr->name[0]) jname = xstrdup(job_ptr->name); else { jname = xstrdup("allocation"); track_steps = 1; } if (job_ptr->nodes && job_ptr->nodes[0]) nodes = job_ptr->nodes; else nodes = "None assigned"; if 
(job_ptr->batch_flag) track_steps = 1; if (slurmdbd_conf) { block_id = xstrdup(job_ptr->comment); node_cnt = job_ptr->total_nodes; node_inx = job_ptr->network; } else { char temp_bit[BUF_SIZE]; if (job_ptr->node_bitmap) { node_inx = bit_fmt(temp_bit, sizeof(temp_bit), job_ptr->node_bitmap); } #ifdef HAVE_BG select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_BLOCK_ID, &block_id); select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &node_cnt); #else node_cnt = job_ptr->total_nodes; #endif } /* If there is a start_time get the wckeyid. If the job is * cancelled before the job starts we also want to grab it. */ if (job_ptr->assoc_id && (job_ptr->start_time || IS_JOB_CANCELLED(job_ptr))) wckeyid = get_wckeyid(pg_conn, &job_ptr->wckey, job_ptr->user_id, pg_conn->cluster_name, job_ptr->assoc_id); if (!job_ptr->db_index) { if (!begin_time) begin_time = submit_time; rec = xstrdup_printf( "(0, 0, '%s', '%s', %d, %d, 0, '%s', " "%d, '%s', %d, %d, %d, %d, %d, %d, 0, " "%d, %ld, %ld, %ld, 0, 0, " "%d, '%s', '%s', %d, %d, '%s', %d)", /* job_db_inx=0, not used */ /* deleted=0 */ job_ptr->account ?: "", /* account */ job_ptr->partition ?: "", /* partition */ (int)job_ptr->details->min_cpus, /* cpus_req */ (int)job_ptr->total_cpus, /* cpus_alloc */ /* exit_code=0 */ jname, /* job_name */ (int)job_ptr->assoc_id, /* id_assoc */ block_id ?: "", /* id_block */ (int)job_ptr->job_id, /* id_job */ (int)job_ptr->qos_id, /* id_qos */ (int)job_ptr->resv_id, /* id_resv */ (int)wckeyid, /* id_wckey */ (int)job_ptr->user_id, /* uid */ (int)job_ptr->group_id, /* gid */ /* kill_requid=0 */ (int)job_ptr->time_limit, /* timelimit */ submit_time, /* time_submit */ begin_time, /* time_eligible */ start_time, /* time_start */ /* time_end=0 */ /* time_suspended=0 */ (int)node_cnt, /* nodes_alloc */ nodes ?: "", /* nodelist */ node_inx ?: "", /* node_inx */ (int)job_ptr->priority, /* priority */ (int)job_state, /* state */ job_ptr->wckey ?: "", /* wckey */ 
(int)track_steps); query = xstrdup_printf("SELECT %s.add_job_start(%s);", pg_conn->cluster_name, rec); xfree(rec); try_again: DEBUG_QUERY; job_ptr->db_index = pgsql_query_ret_id(pg_conn->db_conn, query); if (!job_ptr->db_index) { if (!reinit) { error("It looks like the storage has gone " "away trying to reconnect"); check_db_connection(pg_conn); reinit = 1; goto try_again; } else rc = SLURM_ERROR; } xfree(query); } else {