/*
 * Reset usage_raw and grp_used_wall on all associations and QOS records.
 * This should be called every PriorityUsageResetPeriod.
 * RET: SLURM_SUCCESS on success, SLURM_ERROR else.
 */
static int _reset_usage(void)
{
	ListIterator itr = NULL;
	slurmdb_association_rec_t *assoc = NULL;
	slurmdb_qos_rec_t *qos = NULL;
	assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK,
				   WRITE_LOCK, NO_LOCK, NO_LOCK };

	if (!calc_fairshare)
		return SLURM_SUCCESS;

	assoc_mgr_lock(&locks);

	xassert(assoc_mgr_association_list);

	itr = list_iterator_create(assoc_mgr_association_list);
	/* We want to do this to all associations including root.
	 * All usage_raws are calculated from the bottom up. */
	while ((assoc = list_next(itr))) {
		assoc->usage->usage_raw = 0;
		assoc->usage->grp_used_wall = 0;
	}
	list_iterator_destroy(itr);

	itr = list_iterator_create(assoc_mgr_qos_list);
	while ((qos = list_next(itr))) {
		qos->usage->usage_raw = 0;
		qos->usage->grp_used_wall = 0;
	}
	list_iterator_destroy(itr);
	assoc_mgr_unlock(&locks);

	return SLURM_SUCCESS;
}
extern void priority_p_job_end(struct job_record *job_ptr)
{
	uint64_t unused_cpu_run_secs = 0;
	uint64_t time_limit_secs = (uint64_t)job_ptr->time_limit * 60;
	slurmdb_assoc_rec_t *assoc_ptr;
	assoc_mgr_lock_t locks = { NO_LOCK, WRITE_LOCK, NO_LOCK, WRITE_LOCK,
				   NO_LOCK, NO_LOCK, NO_LOCK };

	/* No unused cpu_run_secs if job ran past its time limit */
	if (job_ptr->end_time >= job_ptr->start_time + time_limit_secs)
		return;

	unused_cpu_run_secs = job_ptr->total_cpus *
		(job_ptr->start_time + time_limit_secs - job_ptr->end_time);

	assoc_mgr_lock(&locks);
	if (job_ptr->qos_ptr) {
		slurmdb_qos_rec_t *qos_ptr =
			(slurmdb_qos_rec_t *)job_ptr->qos_ptr;
		if (unused_cpu_run_secs >
		    qos_ptr->usage->grp_used_cpu_run_secs) {
			qos_ptr->usage->grp_used_cpu_run_secs = 0;
			debug2("acct_policy_job_fini: "
			       "grp_used_cpu_run_secs "
			       "underflow for qos %s", qos_ptr->name);
		} else
			qos_ptr->usage->grp_used_cpu_run_secs -=
				unused_cpu_run_secs;
	}
	assoc_ptr = (slurmdb_assoc_rec_t *)job_ptr->assoc_ptr;
	while (assoc_ptr) {
		/* If the job finished early remove the extra time now. */
		if (unused_cpu_run_secs >
		    assoc_ptr->usage->grp_used_cpu_run_secs) {
			assoc_ptr->usage->grp_used_cpu_run_secs = 0;
			debug2("acct_policy_job_fini: "
			       "grp_used_cpu_run_secs "
			       "underflow for account %s",
			       assoc_ptr->acct);
		} else {
			assoc_ptr->usage->grp_used_cpu_run_secs -=
				unused_cpu_run_secs;
			debug4("acct_policy_job_fini: job %u. "
			       "Removed %"PRIu64" unused seconds "
			       "from assoc %s "
			       "grp_used_cpu_run_secs = %"PRIu64"",
			       job_ptr->job_id, unused_cpu_run_secs,
			       assoc_ptr->acct,
			       assoc_ptr->usage->grp_used_cpu_run_secs);
		}
		/* now handle all the group limits of the parents */
		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
	}
	assoc_mgr_unlock(&locks);

	return;
}
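/*
 * Illustration of the accounting above (hypothetical numbers): a job
 * allocated total_cpus = 4 with a 60-minute time limit that ends 20
 * minutes early leaves
 *	unused_cpu_run_secs = 4 * (20 * 60) = 4800
 * cpu-run-seconds, which is then subtracted from grp_used_cpu_run_secs
 * for the job's QOS and for every association up the parent chain.
 */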
/*
 * job_ptr should already have the partition priority and such added to it
 * before we get here; the fairshare factor computed here is what will be
 * added on top of it.
 */
static double _get_fairshare_priority(struct job_record *job_ptr)
{
	slurmdb_association_rec_t *job_assoc =
		(slurmdb_association_rec_t *)job_ptr->assoc_ptr;
	slurmdb_association_rec_t *fs_assoc = NULL;
	double priority_fs = 0.0;
	assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK,
				   NO_LOCK, NO_LOCK, NO_LOCK };

	if (!calc_fairshare)
		return 0;

	if (!job_assoc) {
		error("Job %u has no association.  Unable to "
		      "compute fairshare.", job_ptr->job_id);
		return 0;
	}

	fs_assoc = job_assoc;

	assoc_mgr_lock(&locks);

	/* Use values from parent when FairShare=SLURMDB_FS_USE_PARENT */
	while ((fs_assoc->shares_raw == SLURMDB_FS_USE_PARENT) &&
	       fs_assoc->usage->parent_assoc_ptr &&
	       (fs_assoc != assoc_mgr_root_assoc)) {
		fs_assoc = fs_assoc->usage->parent_assoc_ptr;
	}

	if (fuzzy_equal(fs_assoc->usage->usage_efctv, NO_VAL))
		priority_p_set_assoc_usage(fs_assoc);

	/* Priority is 0 -> 1 */
	priority_fs = priority_p_calc_fs_factor(
		fs_assoc->usage->usage_efctv,
		(long double)fs_assoc->usage->shares_norm);

	if (priority_debug) {
		info("Fairshare priority of job %u for user %s in acct"
		     " %s is 2**(-%Lf/%f) = %f",
		     job_ptr->job_id, job_assoc->user, job_assoc->acct,
		     fs_assoc->usage->usage_efctv,
		     fs_assoc->usage->shares_norm, priority_fs);
	}
	assoc_mgr_unlock(&locks);

	return priority_fs;
}
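#include <math.h>

/*
 * Minimal sketch (illustrative only, hypothetical name) of the relationship
 * logged by the debug message above: fairshare factor = 2**(-usage_efctv /
 * shares_norm).  It is 1.0 with no usage, 0.5 when effective usage equals
 * the normalized share, and decays toward 0.0 as usage grows.  The real
 * priority_p_calc_fs_factor may also handle zero shares, bounds, and a
 * damping factor differently.
 */
static double example_fs_factor(long double usage_efctv,
				long double shares_norm)
{
	if (shares_norm <= 0.0L)
		return 0.0;	/* no shares -> no fairshare priority */
	return (double)powl(2.0L, -(usage_efctv / shares_norm));
}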
/*
 * Apply the decay factor to the usage_raw of all associations and QOS
 * records.
 * IN: decay_factor - decay to be applied to each association's used
 *     shares.  This should already be modified with the amount of delta
 *     time since the last application.
 * RET: SLURM_SUCCESS on success, SLURM_ERROR else.
 */
static int _apply_decay(double decay_factor)
{
	ListIterator itr = NULL;
	slurmdb_association_rec_t *assoc = NULL;
	slurmdb_qos_rec_t *qos = NULL;
	assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK,
				   WRITE_LOCK, NO_LOCK, NO_LOCK };

	/* A decay_factor of 0 would just zero everything out and a factor
	 * of 1 would be a no-op, so neither helps us at all; bail out on 0
	 * rather than waste time. */
	if (!decay_factor)
		return SLURM_ERROR;
	else if (!calc_fairshare)
		return SLURM_SUCCESS;

	assoc_mgr_lock(&locks);

	xassert(assoc_mgr_association_list);
	xassert(assoc_mgr_qos_list);

	itr = list_iterator_create(assoc_mgr_association_list);
	/* We want to do this to all associations including root.
	 * All usage_raws are calculated from the bottom up. */
	while ((assoc = list_next(itr))) {
		assoc->usage->usage_raw *= decay_factor;
		assoc->usage->grp_used_wall *= decay_factor;
	}
	list_iterator_destroy(itr);

	itr = list_iterator_create(assoc_mgr_qos_list);
	while ((qos = list_next(itr))) {
		qos->usage->usage_raw *= decay_factor;
		qos->usage->grp_used_wall *= decay_factor;
	}
	list_iterator_destroy(itr);
	assoc_mgr_unlock(&locks);

	return SLURM_SUCCESS;
}
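#include <math.h>

/*
 * Illustrative sketch (assumed names, not the actual decay-thread code):
 * one way to derive the factor passed to _apply_decay() from a usage
 * half-life and the seconds elapsed since the last application.  With
 * this scaling, recorded usage halves every half_life_secs of real time.
 */
static double example_decay_factor(time_t run_delta, time_t half_life_secs)
{
	if (half_life_secs <= 0)
		return 1.0;	/* no decay configured */
	return pow(0.5, (double)run_delta / (double)half_life_secs);
}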
/* Fair Tree code called from the decay thread loop */
extern void fair_tree_decay(List jobs, time_t start)
{
	slurmctld_lock_t job_write_lock =
		{ NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
	assoc_mgr_lock_t locks =
		{ WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };

	/* apply decayed usage */
	lock_slurmctld(job_write_lock);
	list_for_each(jobs, (ListForF) _ft_decay_apply_new_usage, &start);
	unlock_slurmctld(job_write_lock);

	/* calculate fs factor for associations */
	assoc_mgr_lock(&locks);
	_apply_priority_fs();
	assoc_mgr_unlock(&locks);

	/* assign job priorities */
	lock_slurmctld(job_write_lock);
	list_for_each(jobs, (ListForF) decay_apply_weighted_factors, &start);
	unlock_slurmctld(job_write_lock);
}
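/*
 * The casts to ListForF above assume the usual list_for_each callback
 * shape, roughly int (*)(void *x, void *arg), where x is the list item
 * and arg the caller-supplied pointer (&start here).  A hypothetical
 * callback of that shape (the real _ft_decay_apply_new_usage is defined
 * elsewhere in this plugin) might look like:
 */
static int example_ft_callback(void *x, void *arg)
{
	struct job_record *job_ptr = (struct job_record *) x;
	time_t *start = (time_t *) arg;

	/* ... update this job's decayed usage relative to *start ... */
	(void) job_ptr;
	(void) start;

	return 0;	/* a negative return would stop the iteration */
}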
extern void print_jag_prec(jag_prec_t *prec)
{
	int i;
	assoc_mgr_lock_t locks = { NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK,
				   READ_LOCK, NO_LOCK, NO_LOCK };

	info("pid %d (ppid %d)", prec->pid, prec->ppid);
	info("act_cpufreq\t%d", prec->act_cpufreq);
	info("ssec \t%f", prec->ssec);
	assoc_mgr_lock(&locks);
	for (i = 0; i < prec->tres_count; i++) {
		if (prec->tres_data[i].size_read == INFINITE64)
			continue;
		info("%s in/read \t%"PRIu64"",
		     assoc_mgr_tres_name_array[i],
		     prec->tres_data[i].size_read);
		info("%s out/write \t%"PRIu64"",
		     assoc_mgr_tres_name_array[i],
		     prec->tres_data[i].size_write);
	}
	assoc_mgr_unlock(&locks);
	info("usec \t%f", prec->usec);
}
extern List as_mysql_modify_qos(mysql_conn_t *mysql_conn, uint32_t uid, slurmdb_qos_cond_t *qos_cond, slurmdb_qos_rec_t *qos) { ListIterator itr = NULL; List ret_list = NULL; int rc = SLURM_SUCCESS; char *object = NULL; char *vals = NULL, *extra = NULL, *query = NULL, *name_char = NULL; time_t now = time(NULL); char *user_name = NULL; int set = 0, i; MYSQL_RES *result = NULL; MYSQL_ROW row; char *tmp_char1=NULL, *tmp_char2=NULL; bitstr_t *preempt_bitstr = NULL; char *added_preempt = NULL; uint32_t qos_cnt; assoc_mgr_lock_t locks = { NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; if (!qos_cond || !qos) { error("we need something to change"); return NULL; } if (check_connection(mysql_conn) != SLURM_SUCCESS) return NULL; if (!is_user_min_admin_level(mysql_conn, uid, SLURMDB_ADMIN_SUPER_USER)) { errno = ESLURM_ACCESS_DENIED; return NULL; } xstrcat(extra, "where deleted=0"); if (qos_cond->description_list && list_count(qos_cond->description_list)) { set = 0; xstrcat(extra, " && ("); itr = list_iterator_create(qos_cond->description_list); while ((object = list_next(itr))) { if (set) xstrcat(extra, " || "); xstrfmtcat(extra, "description='%s'", object); set = 1; } list_iterator_destroy(itr); xstrcat(extra, ")"); } if (qos_cond->id_list && list_count(qos_cond->id_list)) { set = 0; xstrcat(extra, " && ("); itr = list_iterator_create(qos_cond->id_list); while ((object = list_next(itr))) { if (set) xstrcat(extra, " || "); xstrfmtcat(extra, "id='%s'", object); set = 1; } list_iterator_destroy(itr); xstrcat(extra, ")"); } if (qos_cond->name_list && list_count(qos_cond->name_list)) { set = 0; xstrcat(extra, " && ("); itr = list_iterator_create(qos_cond->name_list); while ((object = list_next(itr))) { if (set) xstrcat(extra, " || "); xstrfmtcat(extra, "name='%s'", object); set = 1; } list_iterator_destroy(itr); xstrcat(extra, ")"); } _setup_qos_limits(qos, &tmp_char1, &tmp_char2, &vals, &added_preempt, 0); assoc_mgr_lock(&locks); qos_cnt = g_qos_count; assoc_mgr_unlock(&locks); if (added_preempt) { preempt_bitstr = bit_alloc(qos_cnt); bit_unfmt(preempt_bitstr, added_preempt+1); xfree(added_preempt); } xfree(tmp_char1); xfree(tmp_char2); if (!extra || !vals) { errno = SLURM_NO_CHANGE_IN_DATA; FREE_NULL_BITMAP(preempt_bitstr); error("Nothing to change"); return NULL; } object = xstrdup(mqos_req_inx[0]); for (i = 1; i < MQOS_COUNT; i++) xstrfmtcat(object, ", %s", mqos_req_inx[i]); query = xstrdup_printf("select %s from %s %s;", object, qos_table, extra); xfree(extra); xfree(object); if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) { xfree(query); FREE_NULL_BITMAP(preempt_bitstr); return NULL; } rc = 0; ret_list = list_create(slurm_destroy_char); while ((row = mysql_fetch_row(result))) { slurmdb_qos_rec_t *qos_rec = NULL; uint32_t id = slurm_atoul(row[MQOS_ID]); if (preempt_bitstr) { if (_preemption_loop(mysql_conn, id, preempt_bitstr)) break; } object = xstrdup(row[MQOS_NAME]); list_append(ret_list, object); if (!rc) { xstrfmtcat(name_char, "(name='%s'", object); rc = 1; } else { xstrfmtcat(name_char, " || name='%s'", object); } qos_rec = xmalloc(sizeof(slurmdb_qos_rec_t)); qos_rec->name = xstrdup(object); qos_rec->id = id; qos_rec->flags = qos->flags; qos_rec->grace_time = qos->grace_time; mod_tres_str(&qos_rec->grp_tres, qos->grp_tres, row[MQOS_GT], NULL, "grp_tres", &vals, qos_rec->id, 0); mod_tres_str(&qos_rec->grp_tres_mins, qos->grp_tres_mins, row[MQOS_GTM], NULL, "grp_tres_mins", &vals, qos_rec->id, 0); mod_tres_str(&qos_rec->grp_tres_run_mins, qos->grp_tres_run_mins, 
row[MQOS_GTRM], NULL, "grp_tres_run_mins", &vals, qos_rec->id, 0); qos_rec->grp_jobs = qos->grp_jobs; qos_rec->grp_submit_jobs = qos->grp_submit_jobs; qos_rec->grp_wall = qos->grp_wall; mod_tres_str(&qos_rec->max_tres_pa, qos->max_tres_pa, row[MQOS_MTPA], NULL, "max_tres_pa", &vals, qos_rec->id, 0); mod_tres_str(&qos_rec->max_tres_pj, qos->max_tres_pj, row[MQOS_MTPJ], NULL, "max_tres_pj", &vals, qos_rec->id, 0); mod_tres_str(&qos_rec->max_tres_pn, qos->max_tres_pn, row[MQOS_MTPN], NULL, "max_tres_pn", &vals, qos_rec->id, 0); mod_tres_str(&qos_rec->max_tres_pu, qos->max_tres_pu, row[MQOS_MTPU], NULL, "max_tres_pu", &vals, qos_rec->id, 0); mod_tres_str(&qos_rec->max_tres_mins_pj, qos->max_tres_mins_pj, row[MQOS_MTMPJ], NULL, "max_tres_mins_pj", &vals, qos_rec->id, 0); mod_tres_str(&qos_rec->max_tres_run_mins_pa, qos->max_tres_run_mins_pa, row[MQOS_MTRM], NULL, "max_tres_run_mins_pa", &vals, qos_rec->id, 0); mod_tres_str(&qos_rec->max_tres_run_mins_pu, qos->max_tres_run_mins_pu, row[MQOS_MTRM], NULL, "max_tres_run_mins_pu", &vals, qos_rec->id, 0); qos_rec->max_jobs_pa = qos->max_jobs_pa; qos_rec->max_jobs_pu = qos->max_jobs_pu; qos_rec->max_submit_jobs_pa = qos->max_submit_jobs_pa; qos_rec->max_submit_jobs_pu = qos->max_submit_jobs_pu; qos_rec->max_wall_pj = qos->max_wall_pj; mod_tres_str(&qos_rec->min_tres_pj, qos->min_tres_pj, row[MQOS_MITPJ], NULL, "min_tres_pj", &vals, qos_rec->id, 0); qos_rec->preempt_mode = qos->preempt_mode; qos_rec->priority = qos->priority; if (qos->preempt_list) { ListIterator new_preempt_itr = list_iterator_create(qos->preempt_list); char *new_preempt = NULL; bool cleared = 0; qos_rec->preempt_bitstr = bit_alloc(qos_cnt); if (row[MQOS_PREEMPT] && row[MQOS_PREEMPT][0]) bit_unfmt(qos_rec->preempt_bitstr, row[MQOS_PREEMPT]+1); while ((new_preempt = list_next(new_preempt_itr))) { if (new_preempt[0] == '-') { bit_clear(qos_rec->preempt_bitstr, atol(new_preempt+1)); } else if (new_preempt[0] == '+') { bit_set(qos_rec->preempt_bitstr, atol(new_preempt+1)); } else { if (!cleared) { cleared = 1; bit_nclear( qos_rec->preempt_bitstr, 0, qos_cnt-1); } bit_set(qos_rec->preempt_bitstr, atol(new_preempt)); } } list_iterator_destroy(new_preempt_itr); } qos_rec->usage_factor = qos->usage_factor; qos_rec->usage_thres = qos->usage_thres; if (addto_update_list(mysql_conn->update_list, SLURMDB_MODIFY_QOS, qos_rec) != SLURM_SUCCESS) slurmdb_destroy_qos_rec(qos_rec); } mysql_free_result(result); FREE_NULL_BITMAP(preempt_bitstr); if (row) { xfree(vals); xfree(name_char); xfree(query); FREE_NULL_LIST(ret_list); ret_list = NULL; errno = ESLURM_QOS_PREEMPTION_LOOP; return ret_list; } if (!list_count(ret_list)) { errno = SLURM_NO_CHANGE_IN_DATA; if (debug_flags & DEBUG_FLAG_DB_QOS) DB_DEBUG(mysql_conn->conn, "didn't effect anything\n%s", query); xfree(vals); xfree(query); return ret_list; } xfree(query); xstrcat(name_char, ")"); user_name = uid_to_string((uid_t) uid); rc = modify_common(mysql_conn, DBD_MODIFY_QOS, now, user_name, qos_table, name_char, vals, NULL); xfree(user_name); xfree(name_char); xfree(vals); if (rc == SLURM_ERROR) { error("Couldn't modify qos"); FREE_NULL_LIST(ret_list); ret_list = NULL; } return ret_list; }
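/*
 * Illustration of the preempt_list handling above (hypothetical values):
 * entries are numeric QOS ids, optionally prefixed with '+' or '-'.
 * Starting from an existing preempt bitmap {2,5} read from the row, the
 * list { "+7", "-2" } yields {5,7}, while an unprefixed entry such as
 * "4" first clears the whole bitmap and then sets only that bit,
 * yielding {4}.
 */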
extern int as_mysql_add_qos(mysql_conn_t *mysql_conn, uint32_t uid, List qos_list) { ListIterator itr = NULL; int rc = SLURM_SUCCESS; slurmdb_qos_rec_t *object = NULL; char *cols = NULL, *extra = NULL, *vals = NULL, *query = NULL, *tmp_extra = NULL; time_t now = time(NULL); char *user_name = NULL; int affect_rows = 0; int added = 0; char *added_preempt = NULL; uint32_t qos_cnt; assoc_mgr_lock_t locks = { NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; if (check_connection(mysql_conn) != SLURM_SUCCESS) return ESLURM_DB_CONNECTION; if (!is_user_min_admin_level(mysql_conn, uid, SLURMDB_ADMIN_SUPER_USER)) return ESLURM_ACCESS_DENIED; assoc_mgr_lock(&locks); qos_cnt = g_qos_count; assoc_mgr_unlock(&locks); user_name = uid_to_string((uid_t) uid); itr = list_iterator_create(qos_list); while ((object = list_next(itr))) { if (!object->name || !object->name[0]) { error("We need a qos name to add."); rc = SLURM_ERROR; continue; } xstrcat(cols, "creation_time, mod_time, name"); xstrfmtcat(vals, "%ld, %ld, '%s'", now, now, object->name); xstrfmtcat(extra, ", mod_time=%ld", now); _setup_qos_limits(object, &cols, &vals, &extra, &added_preempt, 1); if (added_preempt) { object->preempt_bitstr = bit_alloc(qos_cnt); bit_unfmt(object->preempt_bitstr, added_preempt+1); xfree(added_preempt); } xstrfmtcat(query, "insert into %s (%s) values (%s) " "on duplicate key update deleted=0, " "id=LAST_INSERT_ID(id)%s;", qos_table, cols, vals, extra); if (debug_flags & DEBUG_FLAG_DB_QOS) DB_DEBUG(mysql_conn->conn, "query\n%s", query); object->id = (uint32_t)mysql_db_insert_ret_id( mysql_conn, query); xfree(query); if (!object->id) { error("Couldn't add qos %s", object->name); added=0; xfree(cols); xfree(extra); xfree(vals); break; } affect_rows = last_affected_rows(mysql_conn); if (!affect_rows) { debug2("nothing changed %d", affect_rows); xfree(cols); xfree(extra); xfree(vals); continue; } /* we always have a ', ' as the first 2 chars */ tmp_extra = slurm_add_slash_to_quotes(extra+2); xstrfmtcat(query, "insert into %s " "(timestamp, action, name, actor, info) " "values (%ld, %u, '%s', '%s', '%s');", txn_table, now, DBD_ADD_QOS, object->name, user_name, tmp_extra); xfree(tmp_extra); xfree(cols); xfree(extra); xfree(vals); debug4("query\n%s",query); rc = mysql_db_query(mysql_conn, query); xfree(query); if (rc != SLURM_SUCCESS) { error("Couldn't add txn"); } else { if (addto_update_list(mysql_conn->update_list, SLURMDB_ADD_QOS, object) == SLURM_SUCCESS) list_remove(itr); added++; } } list_iterator_destroy(itr); xfree(user_name); if (!added) { reset_mysql_conn(mysql_conn); } return rc; }
extern List as_mysql_get_qos(mysql_conn_t *mysql_conn, uid_t uid, slurmdb_qos_cond_t *qos_cond) { char *query = NULL; char *extra = NULL; char *tmp = NULL; List qos_list = NULL; ListIterator itr = NULL; char *object = NULL; int set = 0; int i=0; MYSQL_RES *result = NULL; MYSQL_ROW row; uint32_t qos_cnt; assoc_mgr_lock_t locks = { NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; /* if this changes you will need to edit the corresponding enum */ char *qos_req_inx[] = { "name", "description", "id", "flags", "grace_time", "grp_tres_mins", "grp_tres_run_mins", "grp_tres", "grp_jobs", "grp_submit_jobs", "grp_wall", "max_tres_mins_pj", "max_tres_run_mins_pa", "max_tres_run_mins_pu", "max_tres_pa", "max_tres_pj", "max_tres_pn", "max_tres_pu", "max_jobs_pa", "max_jobs_per_user", "max_submit_jobs_pa", "max_submit_jobs_per_user", "max_wall_duration_per_job", "substr(preempt, 1, length(preempt) - 1)", "preempt_mode", "priority", "usage_factor", "usage_thres", "min_tres_pj", }; enum { QOS_REQ_NAME, QOS_REQ_DESC, QOS_REQ_ID, QOS_REQ_FLAGS, QOS_REQ_GRACE, QOS_REQ_GTM, QOS_REQ_GTRM, QOS_REQ_GT, QOS_REQ_GJ, QOS_REQ_GSJ, QOS_REQ_GW, QOS_REQ_MTMPJ, QOS_REQ_MTRMA, QOS_REQ_MTRM, QOS_REQ_MTPA, QOS_REQ_MTPJ, QOS_REQ_MTPN, QOS_REQ_MTPU, QOS_REQ_MJPA, QOS_REQ_MJPU, QOS_REQ_MSJPA, QOS_REQ_MSJPU, QOS_REQ_MWPJ, QOS_REQ_PREE, QOS_REQ_PREEM, QOS_REQ_PRIO, QOS_REQ_UF, QOS_REQ_UT, QOS_REQ_MITPJ, QOS_REQ_COUNT }; if (check_connection(mysql_conn) != SLURM_SUCCESS) return NULL; if (!qos_cond) { xstrcat(extra, "where deleted=0"); goto empty; } if (qos_cond->with_deleted) xstrcat(extra, "where (deleted=0 || deleted=1)"); else xstrcat(extra, "where deleted=0"); if (qos_cond->description_list && list_count(qos_cond->description_list)) { set = 0; xstrcat(extra, " && ("); itr = list_iterator_create(qos_cond->description_list); while ((object = list_next(itr))) { if (set) xstrcat(extra, " || "); xstrfmtcat(extra, "description='%s'", object); set = 1; } list_iterator_destroy(itr); xstrcat(extra, ")"); } if (qos_cond->id_list && list_count(qos_cond->id_list)) { set = 0; xstrcat(extra, " && ("); itr = list_iterator_create(qos_cond->id_list); while ((object = list_next(itr))) { if (set) xstrcat(extra, " || "); xstrfmtcat(extra, "id='%s'", object); set = 1; } list_iterator_destroy(itr); xstrcat(extra, ")"); } if (qos_cond->name_list && list_count(qos_cond->name_list)) { set = 0; xstrcat(extra, " && ("); itr = list_iterator_create(qos_cond->name_list); while ((object = list_next(itr))) { if (set) xstrcat(extra, " || "); xstrfmtcat(extra, "name='%s'", object); set = 1; } list_iterator_destroy(itr); xstrcat(extra, ")"); } empty: xfree(tmp); xstrfmtcat(tmp, "%s", qos_req_inx[i]); for(i=1; i<QOS_REQ_COUNT; i++) { xstrfmtcat(tmp, ", %s", qos_req_inx[i]); } query = xstrdup_printf("select %s from %s %s", tmp, qos_table, extra); xfree(tmp); xfree(extra); if (debug_flags & DEBUG_FLAG_DB_QOS) DB_DEBUG(mysql_conn->conn, "query\n%s", query); if (!(result = mysql_db_query_ret( mysql_conn, query, 0))) { xfree(query); return NULL; } xfree(query); qos_list = list_create(slurmdb_destroy_qos_rec); assoc_mgr_lock(&locks); qos_cnt = g_qos_count; assoc_mgr_unlock(&locks); while ((row = mysql_fetch_row(result))) { slurmdb_qos_rec_t *qos = xmalloc(sizeof(slurmdb_qos_rec_t)); list_append(qos_list, qos); if (row[QOS_REQ_DESC] && row[QOS_REQ_DESC][0]) qos->description = xstrdup(row[QOS_REQ_DESC]); qos->id = slurm_atoul(row[QOS_REQ_ID]); qos->flags = slurm_atoul(row[QOS_REQ_FLAGS]); if (row[QOS_REQ_NAME] && row[QOS_REQ_NAME][0]) qos->name = 
xstrdup(row[QOS_REQ_NAME]); if (row[QOS_REQ_GRACE]) qos->grace_time = slurm_atoul(row[QOS_REQ_GRACE]); if (row[QOS_REQ_GT][0]) qos->grp_tres = xstrdup(row[QOS_REQ_GT]); if (row[QOS_REQ_GTM][0]) qos->grp_tres_mins = xstrdup(row[QOS_REQ_GTM]); if (row[QOS_REQ_GTRM][0]) qos->grp_tres_run_mins = xstrdup(row[QOS_REQ_GTRM]); if (row[QOS_REQ_GJ]) qos->grp_jobs = slurm_atoul(row[QOS_REQ_GJ]); else qos->grp_jobs = INFINITE; if (row[QOS_REQ_GSJ]) qos->grp_submit_jobs = slurm_atoul(row[QOS_REQ_GSJ]); else qos->grp_submit_jobs = INFINITE; if (row[QOS_REQ_GW]) qos->grp_wall = slurm_atoul(row[QOS_REQ_GW]); else qos->grp_wall = INFINITE; if (row[QOS_REQ_MJPA]) qos->max_jobs_pa = slurm_atoul(row[QOS_REQ_MJPA]); else qos->max_jobs_pa = INFINITE; if (row[QOS_REQ_MJPU]) qos->max_jobs_pu = slurm_atoul(row[QOS_REQ_MJPU]); else qos->max_jobs_pu = INFINITE; if (row[QOS_REQ_MSJPA]) qos->max_submit_jobs_pa = slurm_atoul(row[QOS_REQ_MSJPA]); else qos->max_submit_jobs_pa = INFINITE; if (row[QOS_REQ_MSJPU]) qos->max_submit_jobs_pu = slurm_atoul(row[QOS_REQ_MSJPU]); else qos->max_submit_jobs_pu = INFINITE; if (row[QOS_REQ_MTPA][0]) qos->max_tres_pa = xstrdup(row[QOS_REQ_MTPA]); if (row[QOS_REQ_MTPJ][0]) qos->max_tres_pj = xstrdup(row[QOS_REQ_MTPJ]); if (row[QOS_REQ_MTPN][0]) qos->max_tres_pn = xstrdup(row[QOS_REQ_MTPN]); if (row[QOS_REQ_MTPU][0]) qos->max_tres_pu = xstrdup(row[QOS_REQ_MTPU]); if (row[QOS_REQ_MTMPJ][0]) qos->max_tres_mins_pj = xstrdup(row[QOS_REQ_MTMPJ]); if (row[QOS_REQ_MTRMA][0]) qos->max_tres_run_mins_pa = xstrdup(row[QOS_REQ_MTRMA]); if (row[QOS_REQ_MTRM][0]) qos->max_tres_run_mins_pu = xstrdup(row[QOS_REQ_MTRM]); if (row[QOS_REQ_MWPJ]) qos->max_wall_pj = slurm_atoul(row[QOS_REQ_MWPJ]); else qos->max_wall_pj = INFINITE; if (row[QOS_REQ_PREE] && row[QOS_REQ_PREE][0]) { if (!qos->preempt_bitstr) qos->preempt_bitstr = bit_alloc(qos_cnt); bit_unfmt(qos->preempt_bitstr, row[QOS_REQ_PREE]+1); } if (row[QOS_REQ_PREEM]) qos->preempt_mode = slurm_atoul(row[QOS_REQ_PREEM]); if (row[QOS_REQ_PRIO]) qos->priority = slurm_atoul(row[QOS_REQ_PRIO]); if (row[QOS_REQ_UF]) qos->usage_factor = atof(row[QOS_REQ_UF]); if (row[QOS_REQ_UT]) qos->usage_thres = atof(row[QOS_REQ_UT]); else qos->usage_thres = (double)INFINITE; if (row[QOS_REQ_MITPJ][0]) qos->min_tres_pj = xstrdup(row[QOS_REQ_MITPJ]); } mysql_free_result(result); return qos_list; }
/* * acct_policy_job_runnable - Determine of the specified job can execute * right now or not depending upon accounting policy (e.g. running * job limit for this association). If the association limits prevent * the job from ever running (lowered limits since job submission), * then cancel the job. */ extern bool acct_policy_job_runnable(struct job_record *job_ptr) { slurmdb_qos_rec_t *qos_ptr; slurmdb_association_rec_t *assoc_ptr; uint32_t time_limit; uint64_t cpu_time_limit; uint64_t job_cpu_time_limit; bool rc = true; uint64_t usage_mins; uint32_t wall_mins; bool cancel_job = 0; int parent = 0; /*flag to tell us if we are looking at the * parent or not */ assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; /* check to see if we are enforcing associations */ if (!accounting_enforce) return true; if (!_valid_job_assoc(job_ptr)) { _cancel_job(job_ptr); return false; } /* now see if we are enforcing limits */ if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) return true; /* clear old state reason */ if ((job_ptr->state_reason == WAIT_ASSOC_JOB_LIMIT) || (job_ptr->state_reason == WAIT_ASSOC_RESOURCE_LIMIT) || (job_ptr->state_reason == WAIT_ASSOC_TIME_LIMIT)) job_ptr->state_reason = WAIT_NO_REASON; job_cpu_time_limit = (uint64_t)job_ptr->time_limit * (uint64_t)job_ptr->details->min_cpus; assoc_mgr_lock(&locks); qos_ptr = job_ptr->qos_ptr; if(qos_ptr) { usage_mins = (uint64_t)(qos_ptr->usage->usage_raw / 60.0); wall_mins = qos_ptr->usage->grp_used_wall / 60; if ((qos_ptr->grp_cpu_mins != (uint64_t)INFINITE) && (usage_mins >= qos_ptr->grp_cpu_mins)) { job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT; xfree(job_ptr->state_desc); debug2("Job %u being held, " "the job is at or exceeds QOS %s's " "group max cpu minutes of %"PRIu64" " "with %"PRIu64"", job_ptr->job_id, qos_ptr->name, qos_ptr->grp_cpu_mins, usage_mins); rc = false; goto end_it; } if ((job_ptr->limit_set_min_cpus != ADMIN_SET_LIMIT) && qos_ptr->grp_cpus != INFINITE) { if (job_ptr->details->min_cpus > qos_ptr->grp_cpus) { info("job %u is being cancelled, " "min cpu request %u exceeds " "group max cpu limit %u for " "qos '%s'", job_ptr->job_id, job_ptr->details->min_cpus, qos_ptr->grp_cpus, qos_ptr->name); cancel_job = 1; rc = false; goto end_it; } if ((qos_ptr->usage->grp_used_cpus + job_ptr->details->min_cpus) > qos_ptr->grp_cpus) { job_ptr->state_reason = WAIT_ASSOC_RESOURCE_LIMIT; xfree(job_ptr->state_desc); debug2("job %u being held, " "the job is at or exceeds " "group max cpu limit %u " "with already used %u + requested %u " "for qos %s", job_ptr->job_id, qos_ptr->grp_cpus, qos_ptr->usage->grp_used_cpus, job_ptr->details->min_cpus, qos_ptr->name); rc = false; goto end_it; } } if ((qos_ptr->grp_jobs != INFINITE) && (qos_ptr->usage->grp_used_jobs >= qos_ptr->grp_jobs)) { job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT; xfree(job_ptr->state_desc); debug2("job %u being held, " "the job is at or exceeds " "group max jobs limit %u with %u for qos %s", job_ptr->job_id, qos_ptr->grp_jobs, qos_ptr->usage->grp_used_jobs, qos_ptr->name); rc = false; goto end_it; } if ((job_ptr->limit_set_min_nodes != ADMIN_SET_LIMIT) && qos_ptr->grp_nodes != INFINITE) { if (job_ptr->details->min_nodes > qos_ptr->grp_nodes) { info("job %u is being cancelled, " "min node request %u exceeds " "group max node limit %u for " "qos '%s'", job_ptr->job_id, job_ptr->details->min_nodes, qos_ptr->grp_nodes, qos_ptr->name); cancel_job = 1; rc = false; goto end_it; } if ((qos_ptr->usage->grp_used_nodes + job_ptr->details->min_nodes) > 
qos_ptr->grp_nodes) { job_ptr->state_reason = WAIT_ASSOC_RESOURCE_LIMIT; xfree(job_ptr->state_desc); debug2("job %u being held, " "the job is at or exceeds " "group max node limit %u " "with already used %u + requested %u " "for qos %s", job_ptr->job_id, qos_ptr->grp_nodes, qos_ptr->usage->grp_used_nodes, job_ptr->details->min_nodes, qos_ptr->name); rc = false; goto end_it; } } /* we don't need to check submit_jobs here */ if ((qos_ptr->grp_wall != INFINITE) && (wall_mins >= qos_ptr->grp_wall)) { job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT; xfree(job_ptr->state_desc); debug2("job %u being held, " "the job is at or exceeds " "group wall limit %u " "with %u for qos %s", job_ptr->job_id, qos_ptr->grp_wall, wall_mins, qos_ptr->name); rc = false; goto end_it; } if (qos_ptr->max_cpu_mins_pj != INFINITE) { cpu_time_limit = qos_ptr->max_cpu_mins_pj; if ((job_ptr->time_limit != NO_VAL) && (job_cpu_time_limit > cpu_time_limit)) { info("job %u being cancelled, " "cpu time limit %"PRIu64" exceeds " "qos max per job %"PRIu64"", job_ptr->job_id, job_cpu_time_limit, cpu_time_limit); cancel_job = 1; rc = false; goto end_it; } } if ((job_ptr->limit_set_min_cpus != ADMIN_SET_LIMIT) && qos_ptr->max_cpus_pj != INFINITE) { if (job_ptr->details->min_cpus > qos_ptr->max_cpus_pj) { info("job %u being cancelled, " "min cpu limit %u exceeds " "qos max %u", job_ptr->job_id, job_ptr->details->min_cpus, qos_ptr->max_cpus_pj); cancel_job = 1; rc = false; goto end_it; } } if (qos_ptr->max_jobs_pu != INFINITE) { slurmdb_used_limits_t *used_limits = NULL; if(qos_ptr->usage->user_limit_list) { ListIterator itr = list_iterator_create( qos_ptr->usage->user_limit_list); while((used_limits = list_next(itr))) { if(used_limits->uid == job_ptr->user_id) break; } list_iterator_destroy(itr); } if(used_limits && (used_limits->jobs >= qos_ptr->max_jobs_pu)) { debug2("job %u being held, " "the job is at or exceeds " "max jobs limit %u with %u for QOS %s", job_ptr->job_id, qos_ptr->max_jobs_pu, used_limits->jobs, qos_ptr->name); rc = false; goto end_it; } } if ((job_ptr->limit_set_min_nodes != ADMIN_SET_LIMIT) && qos_ptr->max_nodes_pj != INFINITE) { if (job_ptr->details->min_nodes > qos_ptr->max_nodes_pj) { info("job %u being cancelled, " "min node limit %u exceeds " "qos max %u", job_ptr->job_id, job_ptr->details->min_nodes, qos_ptr->max_nodes_pj); cancel_job = 1; rc = false; goto end_it; } } /* we don't need to check submit_jobs_pu here */ /* if the qos limits have changed since job * submission and job can not run, then kill it */ if ((job_ptr->limit_set_time != ADMIN_SET_LIMIT) && qos_ptr->max_wall_pj != INFINITE) { time_limit = qos_ptr->max_wall_pj; if ((job_ptr->time_limit != NO_VAL) && (job_ptr->time_limit > time_limit)) { info("job %u being cancelled, " "time limit %u exceeds qos " "max wall pj %u", job_ptr->job_id, job_ptr->time_limit, time_limit); cancel_job = 1; rc = false; goto end_it; } } } assoc_ptr = job_ptr->assoc_ptr; while(assoc_ptr) { usage_mins = (uint64_t)(assoc_ptr->usage->usage_raw / 60.0); wall_mins = assoc_ptr->usage->grp_used_wall / 60; #if _DEBUG info("acct_job_limits: %u of %u", assoc_ptr->usage->used_jobs, assoc_ptr->max_jobs); #endif if ((!qos_ptr || (qos_ptr && qos_ptr->grp_cpu_mins == (uint64_t)INFINITE)) && (assoc_ptr->grp_cpu_mins != (uint64_t)INFINITE) && (usage_mins >= assoc_ptr->grp_cpu_mins)) { job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT; xfree(job_ptr->state_desc); debug2("job %u being held, " "assoc %u is at or exceeds " "group max cpu minutes limit %"PRIu64" " "with %Lf for account %s", 
job_ptr->job_id, assoc_ptr->id, assoc_ptr->grp_cpu_mins, assoc_ptr->usage->usage_raw, assoc_ptr->acct); rc = false; goto end_it; } if ((job_ptr->limit_set_min_cpus != ADMIN_SET_LIMIT) && (!qos_ptr || (qos_ptr && qos_ptr->grp_cpus == INFINITE)) && (assoc_ptr->grp_cpus != INFINITE)) { if (job_ptr->details->min_cpus > assoc_ptr->grp_cpus) { info("job %u being cancelled, " "min cpu request %u exceeds " "group max cpu limit %u for " "account %s", job_ptr->job_id, job_ptr->details->min_cpus, assoc_ptr->grp_cpus, assoc_ptr->acct); cancel_job = 1; rc = false; goto end_it; } if ((assoc_ptr->usage->grp_used_cpus + job_ptr->details->min_cpus) > assoc_ptr->grp_cpus) { job_ptr->state_reason = WAIT_ASSOC_RESOURCE_LIMIT; xfree(job_ptr->state_desc); debug2("job %u being held, " "assoc %u is at or exceeds " "group max cpu limit %u " "with already used %u + requested %u " "for account %s", job_ptr->job_id, assoc_ptr->id, assoc_ptr->grp_cpus, assoc_ptr->usage->grp_used_cpus, job_ptr->details->min_cpus, assoc_ptr->acct); rc = false; goto end_it; } } if ((!qos_ptr || (qos_ptr && qos_ptr->grp_jobs == INFINITE)) && (assoc_ptr->grp_jobs != INFINITE) && (assoc_ptr->usage->used_jobs >= assoc_ptr->grp_jobs)) { job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT; xfree(job_ptr->state_desc); debug2("job %u being held, " "assoc %u is at or exceeds " "group max jobs limit %u with %u for account %s", job_ptr->job_id, assoc_ptr->id, assoc_ptr->grp_jobs, assoc_ptr->usage->used_jobs, assoc_ptr->acct); rc = false; goto end_it; } if ((job_ptr->limit_set_min_nodes != ADMIN_SET_LIMIT) && (!qos_ptr || (qos_ptr && qos_ptr->grp_nodes == INFINITE)) && (assoc_ptr->grp_nodes != INFINITE)) { if (job_ptr->details->min_nodes > assoc_ptr->grp_nodes) { info("job %u being cancelled, " "min node request %u exceeds " "group max node limit %u for " "account %s", job_ptr->job_id, job_ptr->details->min_nodes, assoc_ptr->grp_nodes, assoc_ptr->acct); cancel_job = 1; rc = false; goto end_it; } if ((assoc_ptr->usage->grp_used_nodes + job_ptr->details->min_nodes) > assoc_ptr->grp_nodes) { job_ptr->state_reason = WAIT_ASSOC_RESOURCE_LIMIT; xfree(job_ptr->state_desc); debug2("job %u being held, " "assoc %u is at or exceeds " "group max node limit %u " "with already used %u + requested %u " "for account %s", job_ptr->job_id, assoc_ptr->id, assoc_ptr->grp_nodes, assoc_ptr->usage->grp_used_nodes, job_ptr->details->min_nodes, assoc_ptr->acct); rc = false; goto end_it; } } /* we don't need to check submit_jobs here */ if ((!qos_ptr || (qos_ptr && qos_ptr->grp_wall == INFINITE)) && (assoc_ptr->grp_wall != INFINITE) && (wall_mins >= assoc_ptr->grp_wall)) { job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT; xfree(job_ptr->state_desc); debug2("job %u being held, " "assoc %u is at or exceeds " "group wall limit %u " "with %u for account %s", job_ptr->job_id, assoc_ptr->id, assoc_ptr->grp_wall, wall_mins, assoc_ptr->acct); rc = false; goto end_it; } /* We don't need to look at the regular limits for * parents since we have pre-propogated them, so just * continue with the next parent */ if(parent) { assoc_ptr = assoc_ptr->usage->parent_assoc_ptr; continue; } if ((!qos_ptr || (qos_ptr && qos_ptr->max_cpu_mins_pj == INFINITE)) && (assoc_ptr->max_cpu_mins_pj != INFINITE)) { cpu_time_limit = assoc_ptr->max_cpu_mins_pj; if ((job_ptr->time_limit != NO_VAL) && (job_cpu_time_limit > cpu_time_limit)) { info("job %u being cancelled, " "cpu time limit %"PRIu64" exceeds " "assoc max per job %"PRIu64"", job_ptr->job_id, job_cpu_time_limit, cpu_time_limit); cancel_job = 1; rc = false; 
goto end_it; } } if ((!qos_ptr || (qos_ptr && qos_ptr->max_cpus_pj == INFINITE)) && (assoc_ptr->max_cpus_pj != INFINITE)) { if (job_ptr->details->min_cpus > assoc_ptr->max_cpus_pj) { info("job %u being cancelled, " "min cpu limit %u exceeds " "account max %u", job_ptr->job_id, job_ptr->details->min_cpus, assoc_ptr->max_cpus_pj); cancel_job = 1; rc = false; goto end_it; } } if ((!qos_ptr || (qos_ptr && qos_ptr->max_jobs_pu == INFINITE)) && (assoc_ptr->max_jobs != INFINITE) && (assoc_ptr->usage->used_jobs >= assoc_ptr->max_jobs)) { job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT; xfree(job_ptr->state_desc); debug2("job %u being held, " "assoc %u is at or exceeds " "max jobs limit %u with %u for account %s", job_ptr->job_id, assoc_ptr->id, assoc_ptr->max_jobs, assoc_ptr->usage->used_jobs, assoc_ptr->acct); rc = false; goto end_it; } if ((!qos_ptr || (qos_ptr && qos_ptr->max_nodes_pj == INFINITE)) && (assoc_ptr->max_nodes_pj != INFINITE)) { if (job_ptr->details->min_nodes > assoc_ptr->max_nodes_pj) { info("job %u being cancelled, " "min node limit %u exceeds " "account max %u", job_ptr->job_id, job_ptr->details->min_nodes, assoc_ptr->max_nodes_pj); cancel_job = 1; rc = false; goto end_it; } } /* we don't need to check submit_jobs here */ /* if the association limits have changed since job * submission and job can not run, then kill it */ if ((job_ptr->limit_set_time != ADMIN_SET_LIMIT) && (!qos_ptr || (qos_ptr && qos_ptr->max_wall_pj == INFINITE)) && (assoc_ptr->max_wall_pj != INFINITE)) { time_limit = assoc_ptr->max_wall_pj; if ((job_ptr->time_limit != NO_VAL) && (job_ptr->time_limit > time_limit)) { info("job %u being cancelled, " "time limit %u exceeds account " "max %u", job_ptr->job_id, job_ptr->time_limit, time_limit); cancel_job = 1; rc = false; goto end_it; } } assoc_ptr = assoc_ptr->usage->parent_assoc_ptr; parent = 1; } end_it: assoc_mgr_unlock(&locks); if(cancel_job) _cancel_job(job_ptr); return rc; }
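/*
 * Illustration of the usage arithmetic above (hypothetical numbers):
 * usage_raw is accumulated in cpu-seconds while the grp_cpu_mins limit
 * is expressed in cpu-minutes, hence usage_mins = usage_raw / 60.  An
 * association with usage_raw = 7,200,000 cpu-seconds has consumed
 * 120,000 cpu-minutes and will hold any new job once its grp_cpu_mins
 * limit is 120000 or lower.
 */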
extern bool acct_policy_node_usable(struct job_record *job_ptr,
				    uint32_t used_cpus,
				    char *node_name, uint32_t node_cpus)
{
	slurmdb_qos_rec_t *qos_ptr;
	slurmdb_association_rec_t *assoc_ptr;
	bool rc = true;
	uint32_t total_cpus = used_cpus + node_cpus;
	bool cancel_job = 0;
	int parent = 0;	/* flag to tell us if we are looking at the
			 * parent or not */
	assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK,
				   READ_LOCK, NO_LOCK, NO_LOCK };

	/* check to see if we are enforcing associations */
	if (!accounting_enforce)
		return true;

	if (!_valid_job_assoc(job_ptr)) {
		_cancel_job(job_ptr);
		return false;
	}

	/* now see if we are enforcing limits */
	if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS))
		return true;

	/* clear old state reason */
	if ((job_ptr->state_reason == WAIT_ASSOC_JOB_LIMIT) ||
	    (job_ptr->state_reason == WAIT_ASSOC_RESOURCE_LIMIT) ||
	    (job_ptr->state_reason == WAIT_ASSOC_TIME_LIMIT))
		job_ptr->state_reason = WAIT_NO_REASON;

	assoc_mgr_lock(&locks);
	qos_ptr = job_ptr->qos_ptr;
	if (qos_ptr) {
		if (qos_ptr->grp_cpus != INFINITE) {
			if ((total_cpus + qos_ptr->usage->grp_used_cpus)
			    > qos_ptr->grp_cpus) {
				debug("Can't use %s, adding its %u cpus "
				      "exceeds "
				      "group max cpu limit %u for qos '%s'",
				      node_name, node_cpus,
				      qos_ptr->grp_cpus, qos_ptr->name);
				rc = false;
				goto end_it;
			}
		}

		if (qos_ptr->max_cpus_pj != INFINITE) {
			if (total_cpus > qos_ptr->max_cpus_pj) {
				debug("Can't use %s, adding its %u cpus "
				      "exceeds "
				      "max cpu limit %u for qos '%s'",
				      node_name, node_cpus,
				      qos_ptr->max_cpus_pj, qos_ptr->name);
				cancel_job = 1;
				rc = false;
				goto end_it;
			}
		}
	}

	assoc_ptr = job_ptr->assoc_ptr;
	while (assoc_ptr) {
		if ((!qos_ptr ||
		     (qos_ptr && qos_ptr->grp_cpus == INFINITE))
		    && (assoc_ptr->grp_cpus != INFINITE)) {
			if ((total_cpus + assoc_ptr->usage->grp_used_cpus)
			    > assoc_ptr->grp_cpus) {
				debug("Can't use %s, adding its %u cpus "
				      "exceeds "
				      "group max cpu limit %u for account '%s'",
				      node_name, node_cpus,
				      assoc_ptr->grp_cpus, assoc_ptr->acct);
				rc = false;
				goto end_it;
			}
		}

		/* We don't need to look at the regular limits for
		 * parents since we have pre-propagated them, so just
		 * continue with the next parent. */
		if (parent) {
			assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
			continue;
		}

		if ((!qos_ptr ||
		     (qos_ptr && qos_ptr->max_cpus_pj == INFINITE))
		    && (assoc_ptr->max_cpus_pj != INFINITE)) {
			if (job_ptr->details->min_cpus >
			    assoc_ptr->max_cpus_pj) {
				debug("Can't use %s, adding its %u cpus "
				      "exceeds "
				      "max cpu limit %u for account '%s'",
				      node_name, node_cpus,
				      assoc_ptr->max_cpus_pj,
				      assoc_ptr->acct);
				rc = false;
				goto end_it;
			}
		}
		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
		parent = 1;
	}
end_it:
	assoc_mgr_unlock(&locks);

	if (cancel_job)
		_cancel_job(job_ptr);

	return rc;
}
static int _job_modify(uint32_t jobid, char *bank_ptr, char *depend_ptr, char *new_hostlist, uint32_t new_node_cnt, char *part_name_ptr, uint32_t new_time_limit, char *name_ptr, char *start_ptr, char *feature_ptr, char *env_ptr, char *comment_ptr, char *gres_ptr, char *wckey_ptr) { struct job_record *job_ptr; time_t now = time(NULL); bool update_accounting = false; job_ptr = find_job_record(jobid); if (job_ptr == NULL) { error("wiki: MODIFYJOB has invalid jobid %u", jobid); return ESLURM_INVALID_JOB_ID; } if (IS_JOB_FINISHED(job_ptr) || (job_ptr->details == NULL)) { info("wiki: MODIFYJOB jobid %u is finished", jobid); return ESLURM_DISABLED; } if (comment_ptr) { info("wiki: change job %u comment %s", jobid, comment_ptr); xfree(job_ptr->comment); job_ptr->comment = xstrdup(comment_ptr); last_job_update = now; } if (depend_ptr) { int rc = update_job_dependency(job_ptr, depend_ptr); if (rc == SLURM_SUCCESS) { info("wiki: changed job %u dependency to %s", jobid, depend_ptr); } else { error("wiki: changing job %u dependency to %s", jobid, depend_ptr); return EINVAL; } } if (env_ptr) { bool have_equal = false; char old_sep[1]; int begin = 0, i; if (job_ptr->batch_flag == 0) { error("wiki: attempt to set environment variables " "for non-batch job %u", jobid); return ESLURM_DISABLED; } for (i=0; ; i++) { if (env_ptr[i] == '=') { if (have_equal) { error("wiki: setting job %u invalid " "environment variables: %s", jobid, env_ptr); return EINVAL; } have_equal = true; if (env_ptr[i+1] == '\"') { for (i+=2; ; i++) { if (env_ptr[i] == '\0') { error("wiki: setting job %u " "invalid environment " "variables: %s", jobid, env_ptr); return EINVAL; } if (env_ptr[i] == '\"') { i++; break; } if (env_ptr[i] == '\\') { i++; } } } else if (env_ptr[i+1] == '\'') { for (i+=2; ; i++) { if (env_ptr[i] == '\0') { error("wiki: setting job %u " "invalid environment " "variables: %s", jobid, env_ptr); return EINVAL; } if (env_ptr[i] == '\'') { i++; break; } if (env_ptr[i] == '\\') { i++; } } } } if (isspace(env_ptr[i]) || (env_ptr[i] == ',')) { if (!have_equal) { error("wiki: setting job %u invalid " "environment variables: %s", jobid, env_ptr); return EINVAL; } old_sep[0] = env_ptr[i]; env_ptr[i] = '\0'; xrealloc(job_ptr->details->env_sup, sizeof(char *) * (job_ptr->details->env_cnt+1)); job_ptr->details->env_sup [job_ptr->details->env_cnt++] = xstrdup(&env_ptr[begin]); info("wiki: for job %u add env: %s", jobid, &env_ptr[begin]); env_ptr[i] = old_sep[0]; if (isspace(old_sep[0])) break; begin = i + 1; have_equal = false; } } } if (new_time_limit) { time_t old_time = job_ptr->time_limit; job_ptr->time_limit = new_time_limit; info("wiki: change job %u time_limit to %u", jobid, new_time_limit); /* Update end_time based upon change * to preserve suspend time info */ job_ptr->end_time = job_ptr->end_time + ((job_ptr->time_limit - old_time) * 60); last_job_update = now; } if (bank_ptr && (update_job_account("wiki", job_ptr, bank_ptr) != SLURM_SUCCESS)) { return EINVAL; } if (feature_ptr) { if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) { info("wiki: change job %u features to %s", jobid, feature_ptr); job_ptr->details->features = xstrdup(feature_ptr); last_job_update = now; } else { error("wiki: MODIFYJOB features of non-pending " "job %u", jobid); return ESLURM_DISABLED; } } if (start_ptr) { char *end_ptr; uint32_t begin_time = strtol(start_ptr, &end_ptr, 10); if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) { info("wiki: change job %u begin time to %u", jobid, begin_time); job_ptr->details->begin_time = begin_time; 
last_job_update = now; update_accounting = true; } else { error("wiki: MODIFYJOB begin_time of non-pending " "job %u", jobid); return ESLURM_DISABLED; } } if (name_ptr) { if (IS_JOB_PENDING(job_ptr)) { info("wiki: change job %u name %s", jobid, name_ptr); xfree(job_ptr->name); job_ptr->name = xstrdup(name_ptr); last_job_update = now; update_accounting = true; } else { error("wiki: MODIFYJOB name of non-pending job %u", jobid); return ESLURM_DISABLED; } } if (new_hostlist) { int rc = 0, task_cnt; hostlist_t hl; char *tasklist; if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) { /* Job is done, nothing to reset */ if (new_hostlist == '\0') goto host_fini; error("wiki: MODIFYJOB hostlist of non-pending " "job %u", jobid); return ESLURM_DISABLED; } xfree(job_ptr->details->req_nodes); FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); if (new_hostlist == '\0') goto host_fini; tasklist = moab2slurm_task_list(new_hostlist, &task_cnt); if (tasklist == NULL) { rc = 1; goto host_fini; } hl = hostlist_create(tasklist); if (hl == 0) { rc = 1; goto host_fini; } hostlist_uniq(hl); hostlist_sort(hl); job_ptr->details->req_nodes = hostlist_ranged_string_xmalloc(hl); hostlist_destroy(hl); if (job_ptr->details->req_nodes == NULL) { rc = 1; goto host_fini; } if (node_name2bitmap(job_ptr->details->req_nodes, false, &job_ptr->details->req_node_bitmap)) { rc = 1; goto host_fini; } host_fini: if (rc) { info("wiki: change job %u invalid hostlist %s", jobid, new_hostlist); xfree(job_ptr->details->req_nodes); return EINVAL; } else { info("wiki: change job %u hostlist %s", jobid, new_hostlist); update_accounting = true; } } if (part_name_ptr) { struct part_record *part_ptr; if (!IS_JOB_PENDING(job_ptr)) { error("wiki: MODIFYJOB partition of non-pending " "job %u", jobid); return ESLURM_DISABLED; } part_ptr = find_part_record(part_name_ptr); if (part_ptr == NULL) { error("wiki: MODIFYJOB has invalid partition %s", part_name_ptr); return ESLURM_INVALID_PARTITION_NAME; } info("wiki: change job %u partition %s", jobid, part_name_ptr); xfree(job_ptr->partition); job_ptr->partition = xstrdup(part_name_ptr); job_ptr->part_ptr = part_ptr; last_job_update = now; update_accounting = true; } if (new_node_cnt) { job_desc_msg_t job_desc; #ifdef HAVE_BG uint16_t geometry[SYSTEM_DIMENSIONS] = {(uint16_t) NO_VAL}; static uint16_t cpus_per_node = 0; if (!cpus_per_node) { select_g_alter_node_cnt(SELECT_GET_NODE_CPU_CNT, &cpus_per_node); } #endif if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) { error("wiki: MODIFYJOB node count of non-pending " "job %u", jobid); return ESLURM_DISABLED; } memset(&job_desc, 0, sizeof(job_desc_msg_t)); job_desc.min_nodes = new_node_cnt; job_desc.max_nodes = NO_VAL; job_desc.select_jobinfo = select_g_select_jobinfo_alloc(); select_g_alter_node_cnt(SELECT_SET_NODE_CNT, &job_desc); select_g_select_jobinfo_free(job_desc.select_jobinfo); job_ptr->details->min_nodes = job_desc.min_nodes; if (job_ptr->details->max_nodes && (job_ptr->details->max_nodes < job_desc.min_nodes)) job_ptr->details->max_nodes = job_desc.min_nodes; info("wiki: change job %u min_nodes to %u", jobid, new_node_cnt); #ifdef HAVE_BG job_ptr->details->min_cpus = job_desc.min_cpus; job_ptr->details->max_cpus = job_desc.max_cpus; job_ptr->details->pn_min_cpus = job_desc.pn_min_cpus; new_node_cnt = job_ptr->details->min_cpus; if (cpus_per_node) new_node_cnt /= cpus_per_node; /* This is only set up so accounting is set up correctly */ select_g_select_jobinfo_set(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &new_node_cnt); /* reset geo 
since changing this makes any geo potentially invalid */ select_g_select_jobinfo_set(job_ptr->select_jobinfo, SELECT_JOBDATA_GEOMETRY, geometry); #endif last_job_update = now; update_accounting = true; } if (gres_ptr) { char *orig_gres; assoc_mgr_lock_t locks = { NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; if (!IS_JOB_PENDING(job_ptr)) { error("wiki: MODIFYJOB GRES of non-pending job %u", jobid); return ESLURM_DISABLED; } orig_gres = job_ptr->gres; job_ptr->gres = NULL; if (gres_ptr[0]) job_ptr->gres = xstrdup(gres_ptr); if (gres_plugin_job_state_validate(job_ptr->gres, &job_ptr->gres_list)) { error("wiki: MODIFYJOB Invalid GRES=%s", gres_ptr); xfree(job_ptr->gres); job_ptr->gres = orig_gres; return ESLURM_INVALID_GRES; } xfree(orig_gres); assoc_mgr_lock(&locks); gres_set_job_tres_cnt(job_ptr->gres_list, job_ptr->details ? job_ptr->details->min_nodes : 0, job_ptr->tres_req_cnt, true); xfree(job_ptr->tres_req_str); job_ptr->tres_req_str = assoc_mgr_make_tres_str_from_array( job_ptr->tres_req_cnt, TRES_STR_FLAG_SIMPLE, true); assoc_mgr_unlock(&locks); } if (wckey_ptr) { int rc = update_job_wckey("update_job", job_ptr, wckey_ptr); if (rc != SLURM_SUCCESS) { error("wiki: MODIFYJOB Invalid WCKEY=%s", wckey_ptr); return rc; } } if (update_accounting) { /* Update job record in accounting to reflect the changes */ jobacct_storage_job_start_direct(acct_db_conn, job_ptr); } return SLURM_SUCCESS; }
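/*
 * Illustration of the time-limit adjustment in _job_modify() above
 * (hypothetical numbers): raising a job's time_limit from 30 to 45
 * minutes moves end_time forward by (45 - 30) * 60 = 900 seconds, and
 * lowering it works the same way with a negative delta.  Adjusting
 * end_time by the delta rather than recomputing it from start_time
 * preserves any suspend time already folded into end_time.
 */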
/* assoc_mgr locks need to unlocked before you get here */ static int _get_cluster_usage(mysql_conn_t *mysql_conn, uid_t uid, slurmdb_cluster_rec_t *cluster_rec, slurmdbd_msg_type_t type, time_t start, time_t end) { int rc = SLURM_SUCCESS; int i=0; MYSQL_RES *result = NULL; MYSQL_ROW row; char *tmp = NULL; char *my_usage_table = cluster_day_table; char *query = NULL; assoc_mgr_lock_t locks = { NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; char *cluster_req_inx[] = { "id_tres", "alloc_secs", "down_secs", "pdown_secs", "idle_secs", "resv_secs", "over_secs", "count", "time_start", }; enum { CLUSTER_TRES, CLUSTER_ACPU, CLUSTER_DCPU, CLUSTER_PDCPU, CLUSTER_ICPU, CLUSTER_RCPU, CLUSTER_OCPU, CLUSTER_CNT, CLUSTER_START, CLUSTER_COUNT }; if (!cluster_rec->name || !cluster_rec->name[0]) { error("We need a cluster name to set data for"); return SLURM_ERROR; } if (set_usage_information(&my_usage_table, type, &start, &end) != SLURM_SUCCESS) { return SLURM_ERROR; } xfree(tmp); i=0; xstrfmtcat(tmp, "%s", cluster_req_inx[i]); for(i=1; i<CLUSTER_COUNT; i++) { xstrfmtcat(tmp, ", %s", cluster_req_inx[i]); } query = xstrdup_printf( "select %s from \"%s_%s\" where (time_start < %ld " "&& time_start >= %ld)", tmp, cluster_rec->name, my_usage_table, end, start); xfree(tmp); if (debug_flags & DEBUG_FLAG_DB_USAGE) DB_DEBUG(mysql_conn->conn, "query\n%s", query); if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) { xfree(query); return SLURM_ERROR; } xfree(query); if (!cluster_rec->accounting_list) cluster_rec->accounting_list = list_create(slurmdb_destroy_cluster_accounting_rec); assoc_mgr_lock(&locks); while ((row = mysql_fetch_row(result))) { slurmdb_tres_rec_t *tres_rec; slurmdb_cluster_accounting_rec_t *accounting_rec = xmalloc(sizeof(slurmdb_cluster_accounting_rec_t)); accounting_rec->tres_rec.id = slurm_atoul(row[CLUSTER_TRES]); accounting_rec->tres_rec.count = slurm_atoul(row[CLUSTER_CNT]); if ((tres_rec = list_find_first( assoc_mgr_tres_list, slurmdb_find_tres_in_list, &accounting_rec->tres_rec.id))) { accounting_rec->tres_rec.name = xstrdup(tres_rec->name); accounting_rec->tres_rec.type = xstrdup(tres_rec->type); } accounting_rec->alloc_secs = slurm_atoull(row[CLUSTER_ACPU]); accounting_rec->down_secs = slurm_atoull(row[CLUSTER_DCPU]); accounting_rec->pdown_secs = slurm_atoull(row[CLUSTER_PDCPU]); accounting_rec->idle_secs = slurm_atoull(row[CLUSTER_ICPU]); accounting_rec->over_secs = slurm_atoull(row[CLUSTER_OCPU]); accounting_rec->resv_secs = slurm_atoull(row[CLUSTER_RCPU]); accounting_rec->period_start = slurm_atoul(row[CLUSTER_START]); list_append(cluster_rec->accounting_list, accounting_rec); } assoc_mgr_unlock(&locks); mysql_free_result(result); return rc; }
/* assoc_mgr locks need to be unlocked before coming here */ static int _get_object_usage(mysql_conn_t *mysql_conn, slurmdbd_msg_type_t type, char *my_usage_table, char *cluster_name, char *id_str, time_t start, time_t end, List *usage_list) { char *tmp = NULL; int i = 0; MYSQL_RES *result = NULL; MYSQL_ROW row; char *query = NULL; assoc_mgr_lock_t locks = { NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; char *usage_req_inx[] = { "t3.id_assoc", "t1.id_tres", "t1.time_start", "t1.alloc_secs", }; enum { USAGE_ID, USAGE_TRES, USAGE_START, USAGE_ALLOC, USAGE_COUNT }; if (type == DBD_GET_WCKEY_USAGE) usage_req_inx[0] = "t1.id"; xstrfmtcat(tmp, "%s", usage_req_inx[i]); for (i=1; i<USAGE_COUNT; i++) { xstrfmtcat(tmp, ", %s", usage_req_inx[i]); } switch (type) { case DBD_GET_ASSOC_USAGE: query = xstrdup_printf( "select %s from \"%s_%s\" as t1, " "\"%s_%s\" as t2, \"%s_%s\" as t3 " "where (t1.time_start < %ld && t1.time_start >= %ld) " "&& t1.id=t2.id_assoc && (%s) && " "t2.lft between t3.lft and t3.rgt " "order by t3.id_assoc, time_start;", tmp, cluster_name, my_usage_table, cluster_name, assoc_table, cluster_name, assoc_table, end, start, id_str); break; case DBD_GET_WCKEY_USAGE: query = xstrdup_printf( "select %s from \"%s_%s\" as t1 " "where (time_start < %ld && time_start >= %ld) " "&& (%s) order by id, time_start;", tmp, cluster_name, my_usage_table, end, start, id_str); break; default: error("Unknown usage type %d", type); xfree(tmp); return SLURM_ERROR; break; } xfree(tmp); if (debug_flags & DEBUG_FLAG_DB_USAGE) DB_DEBUG(mysql_conn->conn, "query\n%s", query); result = mysql_db_query_ret(mysql_conn, query, 0); xfree(query); if (!result) return SLURM_ERROR; if (!(*usage_list)) (*usage_list) = list_create(slurmdb_destroy_accounting_rec); assoc_mgr_lock(&locks); while ((row = mysql_fetch_row(result))) { slurmdb_tres_rec_t *tres_rec; slurmdb_accounting_rec_t *accounting_rec = xmalloc(sizeof(slurmdb_accounting_rec_t)); accounting_rec->tres_rec.id = slurm_atoul(row[USAGE_TRES]); if ((tres_rec = list_find_first( assoc_mgr_tres_list, slurmdb_find_tres_in_list, &accounting_rec->tres_rec.id))) { accounting_rec->tres_rec.name = xstrdup(tres_rec->name); accounting_rec->tres_rec.type = xstrdup(tres_rec->type); } accounting_rec->id = slurm_atoul(row[USAGE_ID]); accounting_rec->period_start = slurm_atoul(row[USAGE_START]); accounting_rec->alloc_secs = slurm_atoull(row[USAGE_ALLOC]); list_append(*usage_list, accounting_rec); } assoc_mgr_unlock(&locks); mysql_free_result(result); return SLURM_SUCCESS; }
extern void priority_p_job_end(struct job_record *job_ptr)
{
	uint64_t time_limit_secs = (uint64_t)job_ptr->time_limit * 60;
	slurmdb_assoc_rec_t *assoc_ptr;
	int i;
	uint64_t *unused_tres_run_secs;
	assoc_mgr_lock_t locks = { NO_LOCK, WRITE_LOCK, NO_LOCK, WRITE_LOCK,
				   NO_LOCK, NO_LOCK, NO_LOCK };

	/* No decaying in basic priority.  Just remove the total secs. */
	unused_tres_run_secs = xmalloc(sizeof(uint64_t) * slurmctld_tres_cnt);
	for (i = 0; i < slurmctld_tres_cnt; i++) {
		unused_tres_run_secs[i] =
			job_ptr->tres_alloc_cnt[i] * time_limit_secs;
	}

	assoc_mgr_lock(&locks);
	if (job_ptr->qos_ptr) {
		slurmdb_qos_rec_t *qos_ptr = job_ptr->qos_ptr;
		for (i = 0; i < slurmctld_tres_cnt; i++) {
			if (unused_tres_run_secs[i] >
			    qos_ptr->usage->grp_used_tres_run_secs[i]) {
				qos_ptr->usage->grp_used_tres_run_secs[i] = 0;
				debug2("acct_policy_job_fini: "
				       "grp_used_tres_run_secs "
				       "underflow for qos %s tres %s",
				       qos_ptr->name,
				       assoc_mgr_tres_name_array[i]);
			} else
				qos_ptr->usage->grp_used_tres_run_secs[i] -=
					unused_tres_run_secs[i];
		}
	}
	assoc_ptr = job_ptr->assoc_ptr;
	while (assoc_ptr) {
		/* If the job finished early remove the extra time now. */
		for (i = 0; i < slurmctld_tres_cnt; i++) {
			if (unused_tres_run_secs[i] >
			    assoc_ptr->usage->grp_used_tres_run_secs[i]) {
				assoc_ptr->usage->grp_used_tres_run_secs[i] = 0;
				debug2("acct_policy_job_fini: "
				       "grp_used_tres_run_secs "
				       "underflow for account %s tres %s",
				       assoc_ptr->acct,
				       assoc_mgr_tres_name_array[i]);
			} else {
				assoc_ptr->usage->grp_used_tres_run_secs[i] -=
					unused_tres_run_secs[i];
				debug4("acct_policy_job_fini: job %u. "
				       "Removed %"PRIu64" unused seconds "
				       "from acct %s tres %s "
				       "grp_used_tres_run_secs = %"PRIu64"",
				       job_ptr->job_id,
				       unused_tres_run_secs[i],
				       assoc_ptr->acct,
				       assoc_mgr_tres_name_array[i],
				       assoc_ptr->usage->
				       grp_used_tres_run_secs[i]);
			}
		}
		/* now handle all the group limits of the parents */
		assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
	}
	assoc_mgr_unlock(&locks);
	xfree(unused_tres_run_secs);

	return;
}
/* * Read and process the bluegene.conf configuration file so to interpret what * blocks are static/dynamic, torus/mesh, etc. */ extern int read_bg_conf(void) { int i; bool tmp_bool = 0; int count = 0; s_p_hashtbl_t *tbl = NULL; char *tmp_char = NULL; select_ba_request_t **blockreq_array = NULL; image_t **image_array = NULL; image_t *image = NULL; static time_t last_config_update = (time_t) 0; struct stat config_stat; ListIterator itr = NULL; char* bg_conf_file = NULL; static int *dims = NULL; if (!dims) dims = select_g_ba_get_dims(); if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("Reading the bluegene.conf file"); /* check if config file has changed */ bg_conf_file = get_extra_conf_path("bluegene.conf"); if (stat(bg_conf_file, &config_stat) < 0) fatal("can't stat bluegene.conf file %s: %m", bg_conf_file); if (last_config_update) { _reopen_bridge_log(); if (last_config_update == config_stat.st_mtime) { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("%s unchanged", bg_conf_file); } else { info("Restart slurmctld for %s changes " "to take effect", bg_conf_file); } last_config_update = config_stat.st_mtime; xfree(bg_conf_file); return SLURM_SUCCESS; } last_config_update = config_stat.st_mtime; /* initialization */ /* bg_conf defined in bg_node_alloc.h */ if (!(tbl = config_make_tbl(bg_conf_file))) fatal("something wrong with opening/reading bluegene " "conf file"); xfree(bg_conf_file); #ifdef HAVE_BGL if (s_p_get_array((void ***)&image_array, &count, "AltBlrtsImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->blrts_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_blrtsimage, "BlrtsImage", tbl)) { if (!list_count(bg_conf->blrts_list)) fatal("BlrtsImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->blrts_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_blrtsimage = xstrdup(image->name); info("Warning: using %s as the default BlrtsImage. " "If this isn't correct please set BlrtsImage", bg_conf->default_blrtsimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default BlrtsImage %s", bg_conf->default_blrtsimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_blrtsimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->blrts_list, image); } if (s_p_get_array((void ***)&image_array, &count, "AltLinuxImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->linux_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_linuximage, "LinuxImage", tbl)) { if (!list_count(bg_conf->linux_list)) fatal("LinuxImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->linux_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_linuximage = xstrdup(image->name); info("Warning: using %s as the default LinuxImage. 
" "If this isn't correct please set LinuxImage", bg_conf->default_linuximage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default LinuxImage %s", bg_conf->default_linuximage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_linuximage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->linux_list, image); } if (s_p_get_array((void ***)&image_array, &count, "AltRamDiskImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->ramdisk_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_ramdiskimage, "RamDiskImage", tbl)) { if (!list_count(bg_conf->ramdisk_list)) fatal("RamDiskImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->ramdisk_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_ramdiskimage = xstrdup(image->name); info("Warning: using %s as the default RamDiskImage. " "If this isn't correct please set RamDiskImage", bg_conf->default_ramdiskimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default RamDiskImage %s", bg_conf->default_ramdiskimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_ramdiskimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->ramdisk_list, image); } #elif defined HAVE_BGP if (s_p_get_array((void ***)&image_array, &count, "AltCnloadImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->linux_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_linuximage, "CnloadImage", tbl)) { if (!list_count(bg_conf->linux_list)) fatal("CnloadImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->linux_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_linuximage = xstrdup(image->name); info("Warning: using %s as the default CnloadImage. " "If this isn't correct please set CnloadImage", bg_conf->default_linuximage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default CnloadImage %s", bg_conf->default_linuximage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_linuximage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->linux_list, image); } if (s_p_get_array((void ***)&image_array, &count, "AltIoloadImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->ramdisk_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_ramdiskimage, "IoloadImage", tbl)) { if (!list_count(bg_conf->ramdisk_list)) fatal("IoloadImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->ramdisk_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_ramdiskimage = xstrdup(image->name); info("Warning: using %s as the default IoloadImage. 
" "If this isn't correct please set IoloadImage", bg_conf->default_ramdiskimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default IoloadImage %s", bg_conf->default_ramdiskimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_ramdiskimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->ramdisk_list, image); } #endif if (s_p_get_array((void ***)&image_array, &count, "AltMloaderImage", tbl)) { for (i = 0; i < count; i++) { list_append(bg_conf->mloader_list, image_array[i]); image_array[i] = NULL; } } if (!s_p_get_string(&bg_conf->default_mloaderimage, "MloaderImage", tbl)) { if (!list_count(bg_conf->mloader_list)) fatal("MloaderImage not configured " "in bluegene.conf"); itr = list_iterator_create(bg_conf->mloader_list); image = list_next(itr); image->def = true; list_iterator_destroy(itr); bg_conf->default_mloaderimage = xstrdup(image->name); info("Warning: using %s as the default MloaderImage. " "If this isn't correct please set MloaderImage", bg_conf->default_mloaderimage); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("default MloaderImage %s", bg_conf->default_mloaderimage); image = xmalloc(sizeof(image_t)); image->name = xstrdup(bg_conf->default_mloaderimage); image->def = true; image->groups = NULL; /* we want it to be first */ list_push(bg_conf->mloader_list, image); } if (!s_p_get_uint16(&bg_conf->mp_cnode_cnt, "MidplaneNodeCnt", tbl)) { if (!s_p_get_uint16(&bg_conf->mp_cnode_cnt, "BasePartitionNodeCnt", tbl)) { error("MidplaneNodeCnt not configured in bluegene.conf " "defaulting to 512 as MidplaneNodeCnt"); bg_conf->mp_cnode_cnt = 512; } } if (bg_conf->mp_cnode_cnt <= 0) fatal("You should have more than 0 nodes " "per midplane"); bg_conf->actual_cnodes_per_mp = bg_conf->mp_cnode_cnt; bg_conf->quarter_cnode_cnt = bg_conf->mp_cnode_cnt/4; /* bg_conf->cpus_per_mp should had already been set from the * node_init */ if (bg_conf->cpus_per_mp < bg_conf->mp_cnode_cnt) { fatal("For some reason we have only %u cpus per mp, but " "have %u cnodes per mp. You need at least the same " "number of cpus as you have cnodes per mp. " "Check the NodeName CPUs= " "definition in the slurm.conf.", bg_conf->cpus_per_mp, bg_conf->mp_cnode_cnt); } bg_conf->cpu_ratio = bg_conf->cpus_per_mp/bg_conf->mp_cnode_cnt; if (!bg_conf->cpu_ratio) fatal("We appear to have less than 1 cpu on a cnode. 
" "You specified %u for MidplaneNodeCnt " "in the blugene.conf and %u cpus " "for each node in the slurm.conf", bg_conf->mp_cnode_cnt, bg_conf->cpus_per_mp); num_unused_cpus = 1; for (i = 0; i<SYSTEM_DIMENSIONS; i++) num_unused_cpus *= dims[i]; num_unused_cpus *= bg_conf->cpus_per_mp; num_possible_unused_cpus = num_unused_cpus; if (!s_p_get_uint16(&bg_conf->nodecard_cnode_cnt, "NodeBoardNodeCnt", tbl)) { if (!s_p_get_uint16(&bg_conf->nodecard_cnode_cnt, "NodeCardNodeCnt", tbl)) { error("NodeCardNodeCnt not configured in bluegene.conf " "defaulting to 32 as NodeCardNodeCnt"); bg_conf->nodecard_cnode_cnt = 32; } } if (bg_conf->nodecard_cnode_cnt <= 0) fatal("You should have more than 0 nodes per nodecard"); bg_conf->mp_nodecard_cnt = bg_conf->mp_cnode_cnt / bg_conf->nodecard_cnode_cnt; if (!s_p_get_uint16(&bg_conf->ionodes_per_mp, "IONodesPerMP", tbl)) if (!s_p_get_uint16(&bg_conf->ionodes_per_mp, "Numpsets", tbl)) fatal("Warning: IONodesPerMP not configured " "in bluegene.conf"); s_p_get_uint16(&bg_conf->max_block_err, "MaxBlockInError", tbl); tmp_bool = 0; s_p_get_boolean(&tmp_bool, "SubMidplaneSystem", tbl); bg_conf->sub_mp_sys = tmp_bool; #ifdef HAVE_BGQ tmp_bool = 0; s_p_get_boolean(&tmp_bool, "AllowSubBlockAllocations", tbl); bg_conf->sub_blocks = tmp_bool; /* You can only have 16 ionodes per midplane */ if (bg_conf->ionodes_per_mp > bg_conf->mp_nodecard_cnt) bg_conf->ionodes_per_mp = bg_conf->mp_nodecard_cnt; #endif for (i=0; i<SYSTEM_DIMENSIONS; i++) bg_conf->default_conn_type[i] = (uint16_t)NO_VAL; s_p_get_string(&tmp_char, "DefaultConnType", tbl); if (tmp_char) { verify_conn_type(tmp_char, bg_conf->default_conn_type); if ((bg_conf->default_conn_type[0] != SELECT_MESH) && (bg_conf->default_conn_type[0] != SELECT_TORUS)) fatal("Can't have a DefaultConnType of %s " "(only Mesh or Torus values are valid).", tmp_char); xfree(tmp_char); } else bg_conf->default_conn_type[0] = SELECT_TORUS; #ifndef HAVE_BG_L_P int first_conn_type = bg_conf->default_conn_type[0]; for (i=1; i<SYSTEM_DIMENSIONS; i++) { if (bg_conf->default_conn_type[i] == (uint16_t)NO_VAL) bg_conf->default_conn_type[i] = first_conn_type; else if (bg_conf->default_conn_type[i] >= SELECT_SMALL) fatal("Can't have a DefaultConnType of %s " "(only Mesh or Torus values are valid).", tmp_char); } #endif if (bg_conf->ionodes_per_mp) { bitstr_t *tmp_bitmap = NULL; int small_size = 1; /* THIS IS A HACK TO MAKE A 1 NODECARD SYSTEM WORK, * Sometime on a Q system the nodecard isn't in the 0 * spot so only do this if you know it is in that * spot. Otherwise say the whole midplane is there * and just make blocks over the whole thing. They * you can error out the blocks that aren't usable. 
*/ if (bg_conf->sub_mp_sys && bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt) { #ifdef HAVE_BGQ bg_conf->quarter_ionode_cnt = 1; bg_conf->nodecard_ionode_cnt = 1; #else bg_conf->quarter_ionode_cnt = 2; bg_conf->nodecard_ionode_cnt = 2; #endif } else { bg_conf->quarter_ionode_cnt = bg_conf->ionodes_per_mp/4; bg_conf->nodecard_ionode_cnt = bg_conf->quarter_ionode_cnt/4; } /* How many nodecards per ionode */ bg_conf->nc_ratio = ((double)bg_conf->mp_cnode_cnt / (double)bg_conf->nodecard_cnode_cnt) / (double)bg_conf->ionodes_per_mp; /* How many ionodes per nodecard */ bg_conf->io_ratio = (double)bg_conf->ionodes_per_mp / ((double)bg_conf->mp_cnode_cnt / (double)bg_conf->nodecard_cnode_cnt); /* How many cnodes per ionode */ bg_conf->ionode_cnode_cnt = bg_conf->nodecard_cnode_cnt * bg_conf->nc_ratio; //info("got %f %f", bg_conf->nc_ratio, bg_conf->io_ratio); /* figure out the smallest block we can have on the system */ #ifdef HAVE_BGL if (bg_conf->io_ratio >= 1) bg_conf->smallest_block=32; else bg_conf->smallest_block=128; #else if (bg_conf->io_ratio >= 2) bg_conf->smallest_block=16; else if (bg_conf->io_ratio == 1) bg_conf->smallest_block=32; else if (bg_conf->io_ratio == .5) bg_conf->smallest_block=64; else if (bg_conf->io_ratio == .25) bg_conf->smallest_block=128; else if (bg_conf->io_ratio == .125) bg_conf->smallest_block=256; else { error("unknown ioratio %f. Can't figure out " "smallest block size, setting it to midplane", bg_conf->io_ratio); bg_conf->smallest_block = 512; } #endif if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("Smallest block possible on this system is %u", bg_conf->smallest_block); /* below we are creating all the possible bitmaps for * each size of small block */ if ((int)bg_conf->nodecard_ionode_cnt < 1) { bg_conf->nodecard_ionode_cnt = 0; } else { bg_lists->valid_small32 = list_create(_destroy_bitmap); /* This is suppose to be = and not ==, we only want to decrement when small_size equals something. */ if ((small_size = bg_conf->nodecard_ionode_cnt)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small32, tmp_bitmap); } } /* If we only have 1 nodecard just jump to the end since this will never need to happen below. Pretty much a hack to avoid seg fault;). 
*/ if (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt) goto no_calc; bg_lists->valid_small128 = list_create(_destroy_bitmap); if ((small_size = bg_conf->quarter_ionode_cnt)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small128, tmp_bitmap); } #ifndef HAVE_BGL bg_lists->valid_small64 = list_create(_destroy_bitmap); if ((small_size = bg_conf->nodecard_ionode_cnt * 2)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small64, tmp_bitmap); } bg_lists->valid_small256 = list_create(_destroy_bitmap); if ((small_size = bg_conf->quarter_ionode_cnt * 2)) small_size--; i = 0; while (i<bg_conf->ionodes_per_mp) { tmp_bitmap = bit_alloc(bg_conf->ionodes_per_mp); bit_nset(tmp_bitmap, i, i+small_size); i += small_size+1; list_append(bg_lists->valid_small256, tmp_bitmap); } #endif } else { fatal("your ionodes_per_mp is 0"); } no_calc: if (!s_p_get_uint16(&bg_conf->bridge_api_verb, "BridgeAPIVerbose", tbl)) info("Warning: BridgeAPIVerbose not configured " "in bluegene.conf"); if (!s_p_get_string(&bg_conf->bridge_api_file, "BridgeAPILogFile", tbl)) info("BridgeAPILogFile not configured in bluegene.conf"); else _reopen_bridge_log(); if (s_p_get_string(&tmp_char, "DenyPassthrough", tbl)) { if (strstr(tmp_char, "A")) ba_deny_pass |= PASS_DENY_A; if (strstr(tmp_char, "X")) ba_deny_pass |= PASS_DENY_X; if (strstr(tmp_char, "Y")) ba_deny_pass |= PASS_DENY_Y; if (strstr(tmp_char, "Z")) ba_deny_pass |= PASS_DENY_Z; if (!xstrcasecmp(tmp_char, "ALL")) ba_deny_pass |= PASS_DENY_ALL; bg_conf->deny_pass = ba_deny_pass; xfree(tmp_char); } if (!s_p_get_string(&tmp_char, "LayoutMode", tbl)) { info("Warning: LayoutMode was not specified in bluegene.conf " "defaulting to STATIC partitioning"); bg_conf->layout_mode = LAYOUT_STATIC; } else { if (!xstrcasecmp(tmp_char,"STATIC")) bg_conf->layout_mode = LAYOUT_STATIC; else if (!xstrcasecmp(tmp_char,"OVERLAP")) bg_conf->layout_mode = LAYOUT_OVERLAP; else if (!xstrcasecmp(tmp_char,"DYNAMIC")) bg_conf->layout_mode = LAYOUT_DYNAMIC; else { fatal("I don't understand this LayoutMode = %s", tmp_char); } xfree(tmp_char); } /* add blocks defined in file */ if (bg_conf->layout_mode != LAYOUT_DYNAMIC) { if (!s_p_get_array((void ***)&blockreq_array, &count, "MPs", tbl)) { if (!s_p_get_array((void ***)&blockreq_array, &count, "BPs", tbl)) { info("WARNING: no blocks defined in " "bluegene.conf, " "only making full system block"); /* create_full_system_block(NULL); */ if (bg_conf->sub_mp_sys || (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt)) fatal("On a sub-midplane system you " "need to define the blocks you " "want on your system."); } } for (i = 0; i < count; i++) { add_bg_record(bg_lists->main, NULL, blockreq_array[i], 0, 0); } } else if (bg_conf->sub_mp_sys || (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt)) /* we can't do dynamic here on a sub-midplane system */ fatal("On a sub-midplane system we can only do OVERLAP or " "STATIC LayoutMode. 
Please update your bluegene.conf."); #ifdef HAVE_BGQ if ((bg_recover != NOT_FROM_CONTROLLER) && assoc_mgr_qos_list && s_p_get_string(&tmp_char, "RebootQOSList", tbl)) { bool valid; char *token, *last = NULL; slurmdb_qos_rec_t *qos = NULL; assoc_mgr_lock_t locks = { NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; /* Lock here to avoid g_qos_count changing under us */ assoc_mgr_lock(&locks); bg_conf->reboot_qos_bitmap = bit_alloc(g_qos_count); itr = list_iterator_create(assoc_mgr_qos_list); token = strtok_r(tmp_char, ",", &last); while (token) { valid = false; while((qos = list_next(itr))) { if (!xstrcasecmp(token, qos->name)) { bit_set(bg_conf->reboot_qos_bitmap, qos->id); valid = true; break; } } if (!valid) error("Invalid RebootQOSList value: %s", token); list_iterator_reset(itr); token = strtok_r(NULL, ",", &last); } list_iterator_destroy(itr); xfree(tmp_char); assoc_mgr_unlock(&locks); } #endif s_p_hashtbl_destroy(tbl); return SLURM_SUCCESS; }
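/*
 * Illustrative sketch, not part of the plugin: the io_ratio computed in
 * read_bg_conf() (ionodes per nodecard) determines the smallest block that
 * can be allocated.  The thresholds below mirror the non-BGL branch above;
 * the helper name is hypothetical and only makes the mapping explicit.
 */
static uint32_t _example_smallest_block(double io_ratio)
{
	if (io_ratio >= 2)		/* 2+ ionodes per nodecard  */
		return 16;
	else if (io_ratio == 1)		/* 1 ionode per nodecard    */
		return 32;
	else if (io_ratio == .5)	/* 1 ionode per 2 nodecards */
		return 64;
	else if (io_ratio == .25)	/* 1 ionode per quarter     */
		return 128;
	else if (io_ratio == .125)
		return 256;
	return 512;			/* fall back to a full midplane */
}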
extern int slurm_jobcomp_log_record(struct job_record *job_ptr) { int nwritten, B_SIZE = 1024; char usr_str[32], grp_str[32], start_str[32], end_str[32]; char submit_str[32], *cluster = NULL, *qos, *state_string; time_t elapsed_time, submit_time, eligible_time; enum job_states job_state; uint32_t time_limit; uint16_t ntasks_per_node; int i; char *buffer, tmp_str[256], *script_str, *script; struct job_node *jnode; if (list_count(jobslist) > MAX_JOBS) { error("%s: Limit of %d enqueued jobs in memory waiting to be " "indexed reached. Job %lu discarded", plugin_type, MAX_JOBS, (unsigned long)job_ptr->job_id); return SLURM_ERROR; } _get_user_name(job_ptr->user_id, usr_str, sizeof(usr_str)); _get_group_name(job_ptr->group_id, grp_str, sizeof(grp_str)); if ((job_ptr->time_limit == NO_VAL) && job_ptr->part_ptr) time_limit = job_ptr->part_ptr->max_time; else time_limit = job_ptr->time_limit; if (job_ptr->job_state & JOB_RESIZING) { time_t now = time(NULL); state_string = job_state_string(job_ptr->job_state); if (job_ptr->resize_time) { _make_time_str(&job_ptr->resize_time, start_str, sizeof(start_str)); } else { _make_time_str(&job_ptr->start_time, start_str, sizeof(start_str)); } _make_time_str(&now, end_str, sizeof(end_str)); } else { /* Job state will typically have JOB_COMPLETING or JOB_RESIZING * flag set when called. We remove the flags to get the eventual * completion state: JOB_FAILED, JOB_TIMEOUT, etc. */ job_state = job_ptr->job_state & JOB_STATE_BASE; state_string = job_state_string(job_state); if (job_ptr->resize_time) { _make_time_str(&job_ptr->resize_time, start_str, sizeof(start_str)); } else if (job_ptr->start_time > job_ptr->end_time) { /* Job cancelled while pending and * expected start time is in the future. */ snprintf(start_str, sizeof(start_str), "Unknown"); } else { _make_time_str(&job_ptr->start_time, start_str, sizeof(start_str)); } _make_time_str(&job_ptr->end_time, end_str, sizeof(end_str)); } elapsed_time = job_ptr->end_time - job_ptr->start_time; buffer = xmalloc(B_SIZE); nwritten = snprintf(buffer, B_SIZE, JOBCOMP_DATA_FORMAT, (unsigned long) job_ptr->job_id, usr_str, (unsigned long) job_ptr->user_id, grp_str, (unsigned long) job_ptr->group_id, start_str, end_str, (long) elapsed_time, job_ptr->partition, job_ptr->alloc_node, job_ptr->nodes, (unsigned long) job_ptr->total_cpus, (unsigned long) job_ptr->total_nodes, (unsigned long) job_ptr->derived_ec, (unsigned long) job_ptr->exit_code, state_string); if (nwritten >= B_SIZE) { B_SIZE += nwritten + 1; buffer = xrealloc(buffer, B_SIZE); nwritten = snprintf(buffer, B_SIZE, JOBCOMP_DATA_FORMAT, (unsigned long) job_ptr->job_id, usr_str, (unsigned long) job_ptr->user_id, grp_str, (unsigned long) job_ptr->group_id, start_str, end_str, (long) elapsed_time, job_ptr->partition, job_ptr->alloc_node, job_ptr->nodes, (unsigned long) job_ptr->total_cpus, (unsigned long) job_ptr->total_nodes, (unsigned long) job_ptr->derived_ec, (unsigned long) job_ptr->exit_code, state_string); if (nwritten >= B_SIZE) { error("%s: Job completion data truncated and lost", plugin_type); return SLURM_ERROR; } } snprintf(tmp_str, sizeof(tmp_str), ",\"cpu_hours\":%.6f", ((float) elapsed_time * (float) job_ptr->total_cpus) / (float) 3600); xstrcat(buffer, tmp_str); if (job_ptr->array_task_id != NO_VAL) { xstrfmtcat(buffer, ",\"array_job_id\":%lu", (unsigned long) job_ptr->array_job_id); xstrfmtcat(buffer, ",\"array_task_id\":%lu", (unsigned long) job_ptr->array_task_id); } if (job_ptr->details && (job_ptr->details->submit_time != NO_VAL)) { submit_time = 
job_ptr->details->submit_time; _make_time_str(&submit_time, submit_str, sizeof(submit_str)); xstrfmtcat(buffer, ",\"@submit\":\"%s\"", submit_str); } if (job_ptr->details && (job_ptr->details->begin_time != NO_VAL)) { eligible_time = job_ptr->start_time - job_ptr->details->begin_time; xstrfmtcat(buffer, ",\"eligible_time\":%lu", eligible_time); } if (job_ptr->details && (job_ptr->details->work_dir && job_ptr->details->work_dir[0])) { xstrfmtcat(buffer, ",\"work_dir\":\"%s\"", job_ptr->details->work_dir); } if (job_ptr->details && (job_ptr->details->std_err && job_ptr->details->std_err[0])) { xstrfmtcat(buffer, ",\"std_err\":\"%s\"", job_ptr->details->std_err); } if (job_ptr->details && (job_ptr->details->std_in && job_ptr->details->std_in[0])) { xstrfmtcat(buffer, ",\"std_in\":\"%s\"", job_ptr->details->std_in); } if (job_ptr->details && (job_ptr->details->std_out && job_ptr->details->std_out[0])) { xstrfmtcat(buffer, ",\"std_out\":\"%s\"", job_ptr->details->std_out); } if (job_ptr->assoc_ptr != NULL) { cluster = ((slurmdb_assoc_rec_t *) job_ptr->assoc_ptr)->cluster; xstrfmtcat(buffer, ",\"cluster\":\"%s\"", cluster); } if (job_ptr->qos_ptr != NULL) { slurmdb_qos_rec_t *assoc = (slurmdb_qos_rec_t *) job_ptr->qos_ptr; qos = assoc->name; xstrfmtcat(buffer, ",\"qos\":\"%s\"", qos); } if (job_ptr->details && (job_ptr->details->num_tasks != NO_VAL)) { xstrfmtcat(buffer, ",\"ntasks\":%hu", job_ptr->details->num_tasks); } if (job_ptr->details && (job_ptr->details->ntasks_per_node != NO_VAL)) { ntasks_per_node = job_ptr->details->ntasks_per_node; xstrfmtcat(buffer, ",\"ntasks_per_node\":%hu", ntasks_per_node); } if (job_ptr->details && (job_ptr->details->cpus_per_task != NO_VAL)) { xstrfmtcat(buffer, ",\"cpus_per_task\":%hu", job_ptr->details->cpus_per_task); } if (job_ptr->details && (job_ptr->details->orig_dependency && job_ptr->details->orig_dependency[0])) { xstrfmtcat(buffer, ",\"orig_dependency\":\"%s\"", job_ptr->details->orig_dependency); } if (job_ptr->details && (job_ptr->details->exc_nodes && job_ptr->details->exc_nodes[0])) { xstrfmtcat(buffer, ",\"excluded_nodes\":\"%s\"", job_ptr->details->exc_nodes); } if (time_limit != INFINITE) { xstrfmtcat(buffer, ",\"time_limit\":%lu", (unsigned long) time_limit * 60); } if (job_ptr->resv_name && job_ptr->resv_name[0]) { xstrfmtcat(buffer, ",\"reservation_name\":\"%s\"", job_ptr->resv_name); } if (job_ptr->gres_req && job_ptr->gres_req[0]) { xstrfmtcat(buffer, ",\"gres_req\":\"%s\"", job_ptr->gres_req); } if (job_ptr->gres_alloc && job_ptr->gres_alloc[0]) { xstrfmtcat(buffer, ",\"gres_alloc\":\"%s\"", job_ptr->gres_alloc); } if (job_ptr->account && job_ptr->account[0]) { xstrfmtcat(buffer, ",\"account\":\"%s\"", job_ptr->account); } script = get_job_script(job_ptr); if (script && script[0]) { script_str = _json_escape(script); xstrfmtcat(buffer, ",\"script\":\"%s\"", script_str); xfree(script_str); } xfree(script); if (job_ptr->assoc_ptr) { assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr; char *parent_accounts = NULL; char **acc_aux = NULL; int nparents = 0; assoc_mgr_lock(&locks); /* Start at the first parent and go up. When studying * this code it was slightly faster to do 2 loops on * the association linked list and only 1 xmalloc but * we opted for cleaner looking code and going with a * realloc. 
*/ while (assoc_ptr) { if (assoc_ptr->acct) { acc_aux = xrealloc(acc_aux, sizeof(char *) * (nparents + 1)); acc_aux[nparents++] = assoc_ptr->acct; } assoc_ptr = assoc_ptr->usage->parent_assoc_ptr; } for (i = nparents - 1; i >= 0; i--) xstrfmtcat(parent_accounts, "/%s", acc_aux[i]); xfree(acc_aux); xstrfmtcat(buffer, ",\"parent_accounts\":\"%s\"", parent_accounts); xfree(parent_accounts); assoc_mgr_unlock(&locks); } xstrcat(buffer, "}"); jnode = xmalloc(sizeof(struct job_node)); jnode->serialized_job = xstrdup(buffer); list_enqueue(jobslist, jnode); return SLURM_SUCCESS; }
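/*
 * Illustrative sketch, not part of the plugin: the grow-and-retry pattern
 * used above when snprintf() reports truncation.  snprintf() returns the
 * number of characters that *would* have been written, so a return value
 * >= the buffer size means the buffer must be enlarged and the format
 * repeated.  Function and variable names here are hypothetical.
 */
static char *_example_fmt_alloc(unsigned long job_id, const char *state)
{
	int size = 64;
	char *buf = xmalloc(size);
	int n = snprintf(buf, size, "{\"jobid\":%lu,\"state\":\"%s\"",
			 job_id, state);

	if (n >= size) {		/* truncated: grow and retry once */
		size = n + 1;
		buf = xrealloc(buf, size);
		(void) snprintf(buf, size, "{\"jobid\":%lu,\"state\":\"%s\"",
				job_id, state);
	}
	return buf;			/* caller xfree()s */
}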
/* Remove previously used time from qos and assocs grp_used_cpu_run_secs. When restarting slurmctld acct_policy_job_begin() is called for all running jobs. There, every job's total requested CPU time (total_cpus * time_limit) is added to grp_used_cpu_run_secs of assocs and qos. This function will subtract all cputime that was used until the decay thread last ran. This kludge is necessary as the decay thread last_ran variable can't be accessed from acct_policy_job_begin(). */ void _init_grp_used_cpu_run_secs(time_t last_ran) { struct job_record *job_ptr = NULL; ListIterator itr; assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; uint64_t delta; slurmdb_qos_rec_t *qos; slurmdb_association_rec_t *assoc; if (priority_debug) info("Initializing grp_used_cpu_run_secs"); if (!(job_list && list_count(job_list))) return; lock_slurmctld(job_read_lock); itr = list_iterator_create(job_list); if (itr == NULL) fatal("list_iterator_create: malloc failure"); assoc_mgr_lock(&locks); while ((job_ptr = list_next(itr))) { if (priority_debug) debug2("job: %u", job_ptr->job_id); qos = NULL; assoc = NULL; delta = 0; if (!IS_JOB_RUNNING(job_ptr)) continue; if (job_ptr->start_time > last_ran) continue; delta = job_ptr->total_cpus * (last_ran - job_ptr->start_time); qos = (slurmdb_qos_rec_t *) job_ptr->qos_ptr; assoc = (slurmdb_association_rec_t *) job_ptr->assoc_ptr; if (qos) { if (priority_debug) info("Subtracting %"PRIu64" from qos " "%u grp_used_cpu_run_secs " "%"PRIu64" = %"PRIu64"", delta, qos->id, qos->usage->grp_used_cpu_run_secs, qos->usage->grp_used_cpu_run_secs - delta); qos->usage->grp_used_cpu_run_secs -= delta; } while (assoc) { if (priority_debug) info("Subtracting %"PRIu64" from assoc %u " "grp_used_cpu_run_secs " "%"PRIu64" = %"PRIu64"", delta, assoc->id, assoc->usage->grp_used_cpu_run_secs, assoc->usage->grp_used_cpu_run_secs - delta); assoc->usage->grp_used_cpu_run_secs -= delta; assoc = assoc->usage->parent_assoc_ptr; } } assoc_mgr_unlock(&locks); list_iterator_destroy(itr); unlock_slurmctld(job_read_lock); }
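/*
 * Illustrative sketch, not part of the plugin: the per-job correction
 * applied above.  On restart, acct_policy_job_begin() re-added the full
 * total_cpus * time_limit to grp_used_cpu_run_secs, so the seconds the job
 * already consumed up to the decay thread's last run must be subtracted
 * again.  Names are hypothetical.
 */
static uint64_t _example_restart_correction(uint32_t total_cpus,
					    time_t start_time,
					    time_t last_ran)
{
	if (start_time > last_ran)	/* started after the last decay run */
		return 0;
	return (uint64_t)total_cpus * (uint64_t)(last_ran - start_time);
}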
static void _adjust_limit_usage(int type, struct job_record *job_ptr) { slurmdb_association_rec_t *assoc_ptr = NULL; assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) || !_valid_job_assoc(job_ptr)) return; assoc_mgr_lock(&locks); if (job_ptr->qos_ptr && (accounting_enforce & ACCOUNTING_ENFORCE_QOS)) { ListIterator itr = NULL; slurmdb_qos_rec_t *qos_ptr = NULL; slurmdb_used_limits_t *used_limits = NULL; qos_ptr = (slurmdb_qos_rec_t *)job_ptr->qos_ptr; if(!qos_ptr->usage->user_limit_list) qos_ptr->usage->user_limit_list = list_create(slurmdb_destroy_used_limits); itr = list_iterator_create(qos_ptr->usage->user_limit_list); while((used_limits = list_next(itr))) { if(used_limits->uid == job_ptr->user_id) break; } list_iterator_destroy(itr); if(!used_limits) { used_limits = xmalloc(sizeof(slurmdb_used_limits_t)); used_limits->uid = job_ptr->user_id; list_append(qos_ptr->usage->user_limit_list, used_limits); } switch(type) { case ACCT_POLICY_ADD_SUBMIT: qos_ptr->usage->grp_used_submit_jobs++; used_limits->submit_jobs++; break; case ACCT_POLICY_REM_SUBMIT: if(qos_ptr->usage->grp_used_submit_jobs) qos_ptr->usage->grp_used_submit_jobs--; else debug2("acct_policy_remove_job_submit: " "grp_submit_jobs underflow for qos %s", qos_ptr->name); if(used_limits->submit_jobs) used_limits->submit_jobs--; else debug2("acct_policy_remove_job_submit: " "used_submit_jobs underflow for " "qos %s user %d", qos_ptr->name, used_limits->uid); break; case ACCT_POLICY_JOB_BEGIN: qos_ptr->usage->grp_used_jobs++; qos_ptr->usage->grp_used_cpus += job_ptr->total_cpus; qos_ptr->usage->grp_used_nodes += job_ptr->node_cnt; used_limits->jobs++; break; case ACCT_POLICY_JOB_FINI: if(qos_ptr->usage->grp_used_jobs) qos_ptr->usage->grp_used_jobs--; else debug2("acct_policy_job_fini: used_jobs " "underflow for qos %s", qos_ptr->name); qos_ptr->usage->grp_used_cpus -= job_ptr->total_cpus; if((int32_t)qos_ptr->usage->grp_used_cpus < 0) { qos_ptr->usage->grp_used_cpus = 0; debug2("acct_policy_job_fini: grp_used_cpus " "underflow for qos %s", qos_ptr->name); } qos_ptr->usage->grp_used_nodes -= job_ptr->node_cnt; if((int32_t)qos_ptr->usage->grp_used_nodes < 0) { qos_ptr->usage->grp_used_nodes = 0; debug2("acct_policy_job_fini: grp_used_nodes " "underflow for qos %s", qos_ptr->name); } if(used_limits->jobs) used_limits->jobs--; else debug2("acct_policy_job_fini: used_jobs " "underflow for qos %s user %d", qos_ptr->name, used_limits->uid); break; default: error("acct_policy: qos unknown type %d", type); break; } } assoc_ptr = (slurmdb_association_rec_t *)job_ptr->assoc_ptr; while(assoc_ptr) { switch(type) { case ACCT_POLICY_ADD_SUBMIT: assoc_ptr->usage->used_submit_jobs++; break; case ACCT_POLICY_REM_SUBMIT: if (assoc_ptr->usage->used_submit_jobs) assoc_ptr->usage->used_submit_jobs--; else debug2("acct_policy_remove_job_submit: " "used_submit_jobs underflow for " "account %s", assoc_ptr->acct); break; case ACCT_POLICY_JOB_BEGIN: assoc_ptr->usage->used_jobs++; assoc_ptr->usage->grp_used_cpus += job_ptr->total_cpus; assoc_ptr->usage->grp_used_nodes += job_ptr->node_cnt; break; case ACCT_POLICY_JOB_FINI: if (assoc_ptr->usage->used_jobs) assoc_ptr->usage->used_jobs--; else debug2("acct_policy_job_fini: used_jobs " "underflow for account %s", assoc_ptr->acct); assoc_ptr->usage->grp_used_cpus -= job_ptr->total_cpus; if ((int32_t)assoc_ptr->usage->grp_used_cpus < 0) { assoc_ptr->usage->grp_used_cpus = 0; debug2("acct_policy_job_fini: grp_used_cpus " "underflow for 
account %s", assoc_ptr->acct); } assoc_ptr->usage->grp_used_nodes -= job_ptr->node_cnt; if ((int32_t)assoc_ptr->usage->grp_used_nodes < 0) { assoc_ptr->usage->grp_used_nodes = 0; debug2("acct_policy_job_fini: grp_used_nodes " "underflow for account %s", assoc_ptr->acct); } break; default: error("acct_policy: association unknown type %d", type); break; } /* now handle all the group limits of the parents */ assoc_ptr = assoc_ptr->usage->parent_assoc_ptr; } assoc_mgr_unlock(&locks); }
/* If the job is running then apply decay to the job. * * Return 0 if we don't need to process the job any further, 1 if * futher processing is needed. */ static int _apply_new_usage(struct job_record *job_ptr, double decay_factor, time_t start_period, time_t end_period) { slurmdb_qos_rec_t *qos; slurmdb_association_rec_t *assoc; int run_delta = 0; double run_decay = 0.0, real_decay = 0.0; uint64_t cpu_run_delta = 0; uint64_t job_time_limit_ends = 0; assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; assoc_mgr_lock_t qos_read_lock = { NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; /* If usage_factor is 0 just skip this since we don't add the usage. */ assoc_mgr_lock(&qos_read_lock); qos = (slurmdb_qos_rec_t *)job_ptr->qos_ptr; if (qos && !qos->usage_factor) { assoc_mgr_unlock(&qos_read_lock); return 0; } assoc_mgr_unlock(&qos_read_lock); if (job_ptr->start_time > start_period) start_period = job_ptr->start_time; if (job_ptr->end_time && (end_period > job_ptr->end_time)) end_period = job_ptr->end_time; run_delta = (int) (end_period - start_period); /* job already has been accounted for go to next */ if (run_delta < 1) return 0; /* cpu_run_delta will is used to decrease qos and assocs grp_used_cpu_run_secs values. When a job is started only seconds until start_time+time_limit is added, so for jobs running over their timelimit we should only subtract the used time until the time limit. */ job_time_limit_ends = (uint64_t)job_ptr->start_time + (uint64_t)job_ptr->time_limit * 60; if ((uint64_t)start_period >= job_time_limit_ends) cpu_run_delta = 0; else if (end_period > job_time_limit_ends) cpu_run_delta = job_ptr->total_cpus * (job_time_limit_ends - (uint64_t)start_period); else cpu_run_delta = job_ptr->total_cpus * run_delta; if (priority_debug) info("job %u ran for %d seconds on %u cpus", job_ptr->job_id, run_delta, job_ptr->total_cpus); /* get the time in decayed fashion */ run_decay = run_delta * pow(decay_factor, (double)run_delta); real_decay = run_decay * (double)job_ptr->total_cpus; assoc_mgr_lock(&locks); /* Just to make sure we don't make a window where the qos_ptr could of changed make sure we get it again here. */ qos = (slurmdb_qos_rec_t *)job_ptr->qos_ptr; assoc = (slurmdb_association_rec_t *)job_ptr->assoc_ptr; /* now apply the usage factor for this qos */ if (qos) { if (qos->usage_factor >= 0) { real_decay *= qos->usage_factor; run_decay *= qos->usage_factor; } qos->usage->grp_used_wall += run_decay; qos->usage->usage_raw += (long double)real_decay; if (qos->usage->grp_used_cpu_run_secs >= cpu_run_delta) { if (priority_debug) info("grp_used_cpu_run_secs is %"PRIu64", " "will subtract %"PRIu64"", qos->usage->grp_used_cpu_run_secs, cpu_run_delta); qos->usage->grp_used_cpu_run_secs -= cpu_run_delta; } else { if (priority_debug) info("jobid %u, qos %s: setting " "grp_used_cpu_run_secs " "to 0 because %"PRIu64" < %"PRIu64"", job_ptr->job_id, qos->name, qos->usage->grp_used_cpu_run_secs, cpu_run_delta); qos->usage->grp_used_cpu_run_secs = 0; } } /* We want to do this all the way up to and including root. This way we can keep track of how much usage has occured on the entire system and use that to normalize against. 
*/ while (assoc) { if (assoc->usage->grp_used_cpu_run_secs >= cpu_run_delta) { if(priority_debug) info("grp_used_cpu_run_secs is %"PRIu64", " "will subtract %"PRIu64"", assoc->usage->grp_used_cpu_run_secs, cpu_run_delta); assoc->usage->grp_used_cpu_run_secs -= cpu_run_delta; } else { if (priority_debug) info("jobid %u, assoc %u: setting " "grp_used_cpu_run_secs " "to 0 because %"PRIu64" < %"PRIu64"", job_ptr->job_id, assoc->id, assoc->usage->grp_used_cpu_run_secs, cpu_run_delta); assoc->usage->grp_used_cpu_run_secs = 0; } assoc->usage->grp_used_wall += run_decay; assoc->usage->usage_raw += (long double)real_decay; if (priority_debug) info("adding %f new usage to assoc %u (user='******' " "acct='%s') raw usage is now %Lf. Group wall " "added %f making it %f. GrpCPURunMins is " "%"PRIu64"", real_decay, assoc->id, assoc->user, assoc->acct, assoc->usage->usage_raw, run_decay, assoc->usage->grp_used_wall, assoc->usage->grp_used_cpu_run_secs/60); assoc = assoc->usage->parent_assoc_ptr; } assoc_mgr_unlock(&locks); return 1; }
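/*
 * Illustrative sketch, not part of the plugin: how a run interval is turned
 * into decayed usage in _apply_new_usage().  The run_delta seconds are first
 * scaled by decay_factor^run_delta, then multiplied by the allocated CPU
 * count and, when a QOS with a non-negative usage_factor is involved, by
 * that factor as well.  Names are hypothetical; <math.h> is assumed.
 */
static double _example_decayed_usage(int run_delta, double decay_factor,
				     uint32_t total_cpus, double usage_factor)
{
	double run_decay = run_delta * pow(decay_factor, (double)run_delta);

	return run_decay * (double)total_cpus * usage_factor;
}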
extern bool acct_policy_validate(job_desc_msg_t *job_desc, struct part_record *part_ptr, slurmdb_association_rec_t *assoc_in, slurmdb_qos_rec_t *qos_ptr, uint16_t *limit_set_max_cpus, uint16_t *limit_set_max_nodes, uint16_t *limit_set_time, bool update_call) { uint32_t time_limit; slurmdb_association_rec_t *assoc_ptr = assoc_in; int parent = 0; char *user_name = NULL; bool rc = true; assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; xassert(limit_set_max_cpus); xassert(limit_set_max_nodes); xassert(limit_set_time); if (!assoc_ptr) { error("_validate_acct_policy: no assoc_ptr given for job."); return false; } user_name = assoc_ptr->user; assoc_mgr_lock(&locks); if (qos_ptr) { /* for validation we don't need to look at * qos_ptr->grp_cpu_mins. */ if (((*limit_set_max_cpus) == ADMIN_SET_LIMIT) || (qos_ptr->grp_cpus == INFINITE) || (update_call && (job_desc->max_cpus == NO_VAL))) { /* no need to check/set */ } else if ((job_desc->min_cpus != NO_VAL) && (job_desc->min_cpus > qos_ptr->grp_cpus)) { info("job submit for user %s(%u): " "min cpu request %u exceeds " "group max cpu limit %u for qos '%s'", user_name, job_desc->user_id, job_desc->min_cpus, qos_ptr->grp_cpus, qos_ptr->name); rc = false; goto end_it; } else if ((job_desc->max_cpus == NO_VAL) || ((*limit_set_max_cpus) && (job_desc->max_cpus > qos_ptr->grp_cpus))) { job_desc->max_cpus = qos_ptr->grp_cpus; (*limit_set_max_cpus) = 1; } else if (job_desc->max_cpus > qos_ptr->grp_cpus) { info("job submit for user %s(%u): " "max cpu changed %u -> %u because " "of qos limit", user_name, job_desc->user_id, job_desc->max_cpus, qos_ptr->grp_cpus); if (job_desc->max_cpus == NO_VAL) (*limit_set_max_cpus) = 1; job_desc->max_cpus = qos_ptr->grp_cpus; } /* for validation we don't need to look at * qos_ptr->grp_jobs. */ if (((*limit_set_max_nodes) == ADMIN_SET_LIMIT) || (qos_ptr->grp_nodes == INFINITE) || (update_call && (job_desc->max_nodes == NO_VAL))) { /* no need to check/set */ } else if ((job_desc->min_nodes != NO_VAL) && (job_desc->min_nodes > qos_ptr->grp_nodes)) { info("job submit for user %s(%u): " "min node request %u exceeds " "group max node limit %u for qos '%s'", user_name, job_desc->user_id, job_desc->min_nodes, qos_ptr->grp_nodes, qos_ptr->name); rc = false; goto end_it; } else if ((job_desc->max_nodes == 0) || ((*limit_set_max_nodes) && (job_desc->max_nodes > qos_ptr->grp_nodes))) { job_desc->max_nodes = qos_ptr->grp_nodes; (*limit_set_max_nodes) = 1; } else if (job_desc->max_nodes > qos_ptr->grp_nodes) { info("job submit for user %s(%u): " "max node changed %u -> %u because " "of qos limit", user_name, job_desc->user_id, job_desc->max_nodes, qos_ptr->grp_nodes); if (job_desc->max_nodes == NO_VAL) (*limit_set_max_nodes) = 1; job_desc->max_nodes = qos_ptr->grp_nodes; } if ((qos_ptr->grp_submit_jobs != INFINITE) && (qos_ptr->usage->grp_used_submit_jobs >= qos_ptr->grp_submit_jobs)) { info("job submit for user %s(%u): " "group max submit job limit exceeded %u " "for qos '%s'", user_name, job_desc->user_id, qos_ptr->grp_submit_jobs, qos_ptr->name); rc = false; goto end_it; } /* for validation we don't need to look at * qos_ptr->grp_wall. It is checked while the job is running. */ /* for validation we don't need to look at * qos_ptr->max_cpu_mins_pj. It is checked while the * job is running. 
*/ if (((*limit_set_max_cpus) == ADMIN_SET_LIMIT) || (qos_ptr->max_cpus_pj == INFINITE) || (update_call && (job_desc->max_cpus == NO_VAL))) { /* no need to check/set */ } else if ((job_desc->min_cpus != NO_VAL) && (job_desc->min_cpus > qos_ptr->max_cpus_pj)) { info("job submit for user %s(%u): " "min cpu limit %u exceeds " "qos max %u", user_name, job_desc->user_id, job_desc->min_cpus, qos_ptr->max_cpus_pj); rc = false; goto end_it; } else if ((job_desc->max_cpus == NO_VAL) || ((*limit_set_max_cpus) && (job_desc->max_cpus > qos_ptr->max_cpus_pj))) { job_desc->max_cpus = qos_ptr->max_cpus_pj; (*limit_set_max_cpus) = 1; } else if (job_desc->max_cpus > qos_ptr->max_cpus_pj) { info("job submit for user %s(%u): " "max cpu changed %u -> %u because " "of qos limit", user_name, job_desc->user_id, job_desc->max_cpus, qos_ptr->max_cpus_pj); if (job_desc->max_cpus == NO_VAL) (*limit_set_max_cpus) = 1; job_desc->max_cpus = qos_ptr->max_cpus_pj; } /* for validation we don't need to look at * qos_ptr->max_jobs. */ if (((*limit_set_max_nodes) == ADMIN_SET_LIMIT) || (qos_ptr->max_nodes_pj == INFINITE) || (update_call && (job_desc->max_nodes == NO_VAL))) { /* no need to check/set */ } else if ((job_desc->min_nodes != NO_VAL) && (job_desc->min_nodes > qos_ptr->max_nodes_pj)) { info("job submit for user %s(%u): " "min node limit %u exceeds " "qos max %u", user_name, job_desc->user_id, job_desc->min_nodes, qos_ptr->max_nodes_pj); rc = false; goto end_it; } else if ((job_desc->max_nodes == 0) || ((*limit_set_max_nodes) && (job_desc->max_nodes > qos_ptr->max_nodes_pj))) { job_desc->max_nodes = qos_ptr->max_nodes_pj; (*limit_set_max_nodes) = 1; } else if (job_desc->max_nodes > qos_ptr->max_nodes_pj) { info("job submit for user %s(%u): " "max node changed %u -> %u because " "of qos limit", user_name, job_desc->user_id, job_desc->max_nodes, qos_ptr->max_nodes_pj); if (job_desc->max_nodes == NO_VAL) (*limit_set_max_nodes) = 1; job_desc->max_nodes = qos_ptr->max_nodes_pj; } if (qos_ptr->max_submit_jobs_pu != INFINITE) { slurmdb_used_limits_t *used_limits = NULL; if (qos_ptr->usage->user_limit_list) { ListIterator itr = list_iterator_create( qos_ptr->usage->user_limit_list); while((used_limits = list_next(itr))) { if (used_limits->uid == job_desc->user_id) break; } list_iterator_destroy(itr); } if (used_limits && (used_limits->submit_jobs >= qos_ptr->max_submit_jobs_pu)) { info("job submit for user %s(%u): " "qos max submit job limit exceeded %u", user_name, job_desc->user_id, qos_ptr->max_submit_jobs_pu); rc = false; goto end_it; } } if (((*limit_set_time) == ADMIN_SET_LIMIT) || (qos_ptr->max_wall_pj == INFINITE) || (update_call && (job_desc->time_limit == NO_VAL))) { /* no need to check/set */ } else { time_limit = qos_ptr->max_wall_pj; if (job_desc->time_limit == NO_VAL) { if (part_ptr->max_time == INFINITE) job_desc->time_limit = time_limit; else job_desc->time_limit = MIN(time_limit, part_ptr->max_time); (*limit_set_time) = 1; } else if ((*limit_set_time) && job_desc->time_limit > time_limit) { job_desc->time_limit = time_limit; } else if (job_desc->time_limit > time_limit) { info("job submit for user %s(%u): " "time limit %u exceeds qos max %u", user_name, job_desc->user_id, job_desc->time_limit, time_limit); rc = false; goto end_it; } } } while(assoc_ptr) { /* for validation we don't need to look at * assoc_ptr->grp_cpu_mins. 
*/ if (((*limit_set_max_cpus) == ADMIN_SET_LIMIT) || (qos_ptr && (qos_ptr->grp_cpus != INFINITE)) || (assoc_ptr->grp_cpus == INFINITE) || (update_call && (job_desc->max_cpus == NO_VAL))) { /* no need to check/set */ } else if ((job_desc->min_cpus != NO_VAL) && (job_desc->min_cpus > assoc_ptr->grp_cpus)) { info("job submit for user %s(%u): " "min cpu request %u exceeds " "group max cpu limit %u for account %s", user_name, job_desc->user_id, job_desc->min_cpus, assoc_ptr->grp_cpus, assoc_ptr->acct); rc = false; break; } else if ((job_desc->max_cpus == NO_VAL) || ((*limit_set_max_cpus) && (job_desc->max_cpus > assoc_ptr->grp_cpus))) { job_desc->max_cpus = assoc_ptr->grp_cpus; (*limit_set_max_cpus) = 1; } else if (job_desc->max_cpus > assoc_ptr->grp_cpus) { info("job submit for user %s(%u): " "max cpu changed %u -> %u because " "of account limit", user_name, job_desc->user_id, job_desc->max_cpus, assoc_ptr->grp_cpus); if (job_desc->max_cpus == NO_VAL) (*limit_set_max_cpus) = 1; job_desc->max_cpus = assoc_ptr->grp_cpus; } /* for validation we don't need to look at * assoc_ptr->grp_jobs. */ if (((*limit_set_max_nodes) == ADMIN_SET_LIMIT) || (qos_ptr && (qos_ptr->grp_nodes != INFINITE)) || (assoc_ptr->grp_nodes == INFINITE) || (update_call && (job_desc->max_nodes == NO_VAL))) { /* no need to check/set */ } else if ((job_desc->min_nodes != NO_VAL) && (job_desc->min_nodes > assoc_ptr->grp_nodes)) { info("job submit for user %s(%u): " "min node request %u exceeds " "group max node limit %u for account %s", user_name, job_desc->user_id, job_desc->min_nodes, assoc_ptr->grp_nodes, assoc_ptr->acct); rc = false; break; } else if ((job_desc->max_nodes == 0) || ((*limit_set_max_nodes) && (job_desc->max_nodes > assoc_ptr->grp_nodes))) { job_desc->max_nodes = assoc_ptr->grp_nodes; (*limit_set_max_nodes) = 1; } else if (job_desc->max_nodes > assoc_ptr->grp_nodes) { info("job submit for user %s(%u): " "max node changed %u -> %u because " "of account limit", user_name, job_desc->user_id, job_desc->max_nodes, assoc_ptr->grp_nodes); if (job_desc->max_nodes == NO_VAL) (*limit_set_max_nodes) = 1; job_desc->max_nodes = assoc_ptr->grp_nodes; } if ((!qos_ptr || (qos_ptr && qos_ptr->grp_submit_jobs == INFINITE)) && (assoc_ptr->grp_submit_jobs != INFINITE) && (assoc_ptr->usage->used_submit_jobs >= assoc_ptr->grp_submit_jobs)) { info("job submit for user %s(%u): " "group max submit job limit exceeded %u " "for account '%s'", user_name, job_desc->user_id, assoc_ptr->grp_submit_jobs, assoc_ptr->acct); rc = false; break; } /* for validation we don't need to look at * assoc_ptr->grp_wall. It is checked while the job is running. */ /* We don't need to look at the regular limits for * parents since we have pre-propogated them, so just * continue with the next parent */ if (parent) { assoc_ptr = assoc_ptr->usage->parent_assoc_ptr; continue; } /* for validation we don't need to look at * assoc_ptr->max_cpu_mins_pj. 
*/ if (((*limit_set_max_cpus) == ADMIN_SET_LIMIT) || (qos_ptr && (qos_ptr->max_cpus_pj != INFINITE)) || (assoc_ptr->max_cpus_pj == INFINITE) || (update_call && (job_desc->max_cpus == NO_VAL))) { /* no need to check/set */ } else if ((job_desc->min_cpus != NO_VAL) && (job_desc->min_cpus > assoc_ptr->max_cpus_pj)) { info("job submit for user %s(%u): " "min cpu limit %u exceeds " "account max %u", user_name, job_desc->user_id, job_desc->min_cpus, assoc_ptr->max_cpus_pj); rc = false; break; } else if (job_desc->max_cpus == NO_VAL || ((*limit_set_max_cpus) && (job_desc->max_cpus > assoc_ptr->max_cpus_pj))) { job_desc->max_cpus = assoc_ptr->max_cpus_pj; (*limit_set_max_cpus) = 1; } else if (job_desc->max_cpus > assoc_ptr->max_cpus_pj) { info("job submit for user %s(%u): " "max cpu changed %u -> %u because " "of account limit", user_name, job_desc->user_id, job_desc->max_cpus, assoc_ptr->max_cpus_pj); if (job_desc->max_cpus == NO_VAL) (*limit_set_max_cpus) = 1; job_desc->max_cpus = assoc_ptr->max_cpus_pj; } /* for validation we don't need to look at * assoc_ptr->max_jobs. */ if (((*limit_set_max_nodes) == ADMIN_SET_LIMIT) || (qos_ptr && (qos_ptr->max_nodes_pj != INFINITE)) || (assoc_ptr->max_nodes_pj == INFINITE) || (update_call && (job_desc->max_nodes == NO_VAL))) { /* no need to check/set */ } else if ((job_desc->min_nodes != NO_VAL) && (job_desc->min_nodes > assoc_ptr->max_nodes_pj)) { info("job submit for user %s(%u): " "min node limit %u exceeds " "account max %u", user_name, job_desc->user_id, job_desc->min_nodes, assoc_ptr->max_nodes_pj); rc = false; break; } else if (((job_desc->max_nodes == NO_VAL) || (job_desc->max_nodes == 0)) || ((*limit_set_max_nodes) && (job_desc->max_nodes > assoc_ptr->max_nodes_pj))) { job_desc->max_nodes = assoc_ptr->max_nodes_pj; (*limit_set_max_nodes) = 1; } else if (job_desc->max_nodes > assoc_ptr->max_nodes_pj) { info("job submit for user %s(%u): " "max node changed %u -> %u because " "of account limit", user_name, job_desc->user_id, job_desc->max_nodes, assoc_ptr->max_nodes_pj); if (job_desc->max_nodes == NO_VAL) (*limit_set_max_nodes) = 1; job_desc->max_nodes = assoc_ptr->max_nodes_pj; } if ((!qos_ptr || (qos_ptr && qos_ptr->max_submit_jobs_pu == INFINITE)) && (assoc_ptr->max_submit_jobs != INFINITE) && (assoc_ptr->usage->used_submit_jobs >= assoc_ptr->max_submit_jobs)) { info("job submit for user %s(%u): " "account max submit job limit exceeded %u", user_name, job_desc->user_id, assoc_ptr->max_submit_jobs); rc = false; break; } if (((*limit_set_time) == ADMIN_SET_LIMIT) || (qos_ptr && (qos_ptr->max_wall_pj != INFINITE)) || (assoc_ptr->max_wall_pj == INFINITE) || (update_call && (job_desc->time_limit == NO_VAL))) { /* no need to check/set */ } else { time_limit = assoc_ptr->max_wall_pj; if (job_desc->time_limit == NO_VAL) { if (part_ptr->max_time == INFINITE) job_desc->time_limit = time_limit; else job_desc->time_limit = MIN(time_limit, part_ptr->max_time); (*limit_set_time) = 1; } else if ((*limit_set_time) && job_desc->time_limit > time_limit) { job_desc->time_limit = time_limit; } else if (job_desc->time_limit > time_limit) { info("job submit for user %s(%u): " "time limit %u exceeds account max %u", user_name, job_desc->user_id, job_desc->time_limit, time_limit); rc = false; break; } } assoc_ptr = assoc_ptr->usage->parent_assoc_ptr; parent = 1; } end_it: assoc_mgr_unlock(&locks); return rc; }
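/*
 * Illustrative sketch, not part of the plugin: the clamp-or-reject pattern
 * repeated for every limit in acct_policy_validate().  A request whose
 * minimum already exceeds the limit is rejected; an unspecified or
 * previously limit-set maximum is quietly clamped; anything else is clamped
 * with a notice by the caller.  The helper name and return codes are
 * hypothetical.
 */
static int _example_apply_limit(uint32_t min_req, uint32_t *max_req,
				uint32_t limit, uint16_t *limit_set)
{
	if ((limit == INFINITE) || (*limit_set == ADMIN_SET_LIMIT))
		return 0;		/* nothing to enforce */
	if ((min_req != NO_VAL) && (min_req > limit))
		return -1;		/* reject: minimum cannot be met */
	if ((*max_req == NO_VAL) || (*limit_set && (*max_req > limit))) {
		*max_req = limit;	/* quietly inherit the limit */
		*limit_set = 1;
	} else if (*max_req > limit) {
		*max_req = limit;	/* clamp a user-supplied maximum */
	}
	return 0;
}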
static void *_decay_thread(void *no_data) { struct job_record *job_ptr = NULL; ListIterator itr; time_t start_time = time(NULL); time_t next_time; /* int sigarray[] = {SIGUSR1, 0}; */ struct tm tm; time_t last_ran = 0; time_t last_reset = 0, next_reset = 0; uint32_t calc_period = slurm_get_priority_calc_period(); double decay_hl = (double)slurm_get_priority_decay_hl(); double decay_factor = 1; uint16_t reset_period = slurm_get_priority_reset_period(); /* Write lock on jobs, read lock on nodes and partitions */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; if (decay_hl > 0) decay_factor = 1 - (0.693 / decay_hl); (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); (void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); if (!localtime_r(&start_time, &tm)) { fatal("_decay_thread: " "Couldn't get localtime for rollup handler %ld", (long)start_time); return NULL; } _read_last_decay_ran(&last_ran, &last_reset); if (last_reset == 0) last_reset = start_time; _init_grp_used_cpu_run_secs(last_ran); while (1) { time_t now = time(NULL); int run_delta = 0; double real_decay = 0.0; slurm_mutex_lock(&decay_lock); running_decay = 1; /* If reconfig is called handle all that happens outside of the loop here */ if (reconfig) { /* if decay_hl is 0 or less that means no decay is to be had. This also means we flush the used time at a certain time set by PriorityUsageResetPeriod in the slurm.conf */ calc_period = slurm_get_priority_calc_period(); reset_period = slurm_get_priority_reset_period(); next_reset = 0; decay_hl = (double)slurm_get_priority_decay_hl(); if (decay_hl > 0) decay_factor = 1 - (0.693 / decay_hl); else decay_factor = 1; reconfig = 0; } /* this needs to be done right away so as to * incorporate it into the decay loop. */ switch(reset_period) { case PRIORITY_RESET_NONE: break; case PRIORITY_RESET_NOW: /* do once */ _reset_usage(); reset_period = PRIORITY_RESET_NONE; last_reset = now; break; case PRIORITY_RESET_DAILY: case PRIORITY_RESET_WEEKLY: case PRIORITY_RESET_MONTHLY: case PRIORITY_RESET_QUARTERLY: case PRIORITY_RESET_YEARLY: if (next_reset == 0) { next_reset = _next_reset(reset_period, last_reset); } if (now >= next_reset) { _reset_usage(); last_reset = next_reset; next_reset = _next_reset(reset_period, last_reset); } } /* now calculate all the normalized usage here */ assoc_mgr_lock(&locks); _set_children_usage_efctv( assoc_mgr_root_assoc->usage->childern_list); assoc_mgr_unlock(&locks); if (!last_ran) goto get_usage; else run_delta = (start_time - last_ran); if (run_delta <= 0) goto get_usage; real_decay = pow(decay_factor, (double)run_delta); if (priority_debug) info("Decay factor over %d seconds goes " "from %.15f -> %.15f", run_delta, decay_factor, real_decay); /* first apply decay to used time */ if (_apply_decay(real_decay) != SLURM_SUCCESS) { error("problem applying decay"); running_decay = 0; slurm_mutex_unlock(&decay_lock); break; } lock_slurmctld(job_write_lock); itr = list_iterator_create(job_list); while ((job_ptr = list_next(itr))) { /* apply new usage */ if (!IS_JOB_PENDING(job_ptr) && job_ptr->start_time && job_ptr->assoc_ptr) { if (!_apply_new_usage(job_ptr, decay_factor, last_ran, start_time)) continue; } /* * Priority 0 is reserved for held jobs. Also skip * priority calculation for non-pending jobs. 
*/ if ((job_ptr->priority == 0) || !IS_JOB_PENDING(job_ptr)) continue; job_ptr->priority = _get_priority_internal(start_time, job_ptr); last_job_update = time(NULL); debug2("priority for job %u is now %u", job_ptr->job_id, job_ptr->priority); } list_iterator_destroy(itr); unlock_slurmctld(job_write_lock); get_usage: last_ran = start_time; _write_last_decay_ran(last_ran, last_reset); running_decay = 0; slurm_mutex_unlock(&decay_lock); /* sleep for calc_period secs */ tm.tm_sec += calc_period; tm.tm_isdst = -1; next_time = mktime(&tm); sleep((next_time-start_time)); start_time = next_time; /* repeat ;) */ } return NULL; }
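/*
 * Illustrative sketch, not part of the plugin: the half-life approximation
 * used by _decay_thread().  With decay_factor = 1 - ln(2)/decay_hl, raising
 * the factor to decay_hl seconds leaves roughly half of the original usage,
 * which is what PriorityDecayHalfLife promises.  Names are hypothetical;
 * <math.h> is assumed.
 */
static double _example_remaining_fraction(double decay_hl, double seconds)
{
	double decay_factor = 1;

	if (decay_hl > 0)
		decay_factor = 1 - (0.693 / decay_hl);
	return pow(decay_factor, seconds); /* ~0.5 when seconds == decay_hl */
}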
extern List as_mysql_jobacct_process_get_jobs(mysql_conn_t *mysql_conn, uid_t uid, slurmdb_job_cond_t *job_cond) { char *extra = NULL; char *tmp = NULL, *tmp2 = NULL; ListIterator itr = NULL; int is_admin = 1; int i; List job_list = NULL; uint16_t private_data = 0; slurmdb_user_rec_t user; int only_pending = 0; List use_cluster_list = as_mysql_cluster_list; char *cluster_name; assoc_mgr_lock_t locks = { NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; memset(&user, 0, sizeof(slurmdb_user_rec_t)); user.uid = uid; private_data = slurm_get_private_data(); if (private_data & PRIVATE_DATA_JOBS) { if (!(is_admin = is_user_min_admin_level( mysql_conn, uid, SLURMDB_ADMIN_OPERATOR))) { /* Only fill in the coordinator accounts here; we will check them later when we actually try to get the jobs. */ is_user_any_coord(mysql_conn, &user); } if (!is_admin && !user.name) { debug("User %u has no associations, and is not admin, " "so not returning any jobs.", user.uid); return NULL; } } if (job_cond && job_cond->state_list && (list_count(job_cond->state_list) == 1) && (slurm_atoul(list_peek(job_cond->state_list)) == JOB_PENDING)) only_pending = 1; setup_job_cond_limits(job_cond, &extra); xfree(tmp); xstrfmtcat(tmp, "%s", job_req_inx[0]); for (i = 1; i < JOB_REQ_COUNT; i++) { xstrfmtcat(tmp, ", %s", job_req_inx[i]); } xfree(tmp2); xstrfmtcat(tmp2, "%s", step_req_inx[0]); for (i = 1; i < STEP_REQ_COUNT; i++) { xstrfmtcat(tmp2, ", %s", step_req_inx[i]); } if (job_cond && job_cond->cluster_list && list_count(job_cond->cluster_list)) use_cluster_list = job_cond->cluster_list; else slurm_mutex_lock(&as_mysql_cluster_list_lock); assoc_mgr_lock(&locks); job_list = list_create(slurmdb_destroy_job_rec); itr = list_iterator_create(use_cluster_list); while ((cluster_name = list_next(itr))) { int rc; if ((rc = _cluster_get_jobs(mysql_conn, &user, job_cond, cluster_name, tmp, tmp2, extra, is_admin, only_pending, job_list)) != SLURM_SUCCESS) error("Problem getting jobs for cluster %s", cluster_name); } list_iterator_destroy(itr); assoc_mgr_unlock(&locks); if (use_cluster_list == as_mysql_cluster_list) slurm_mutex_unlock(&as_mysql_cluster_list_lock); xfree(tmp); xfree(tmp2); xfree(extra); return job_list; }
static void _handle_stats(List prec_list, char *proc_stat_file, char *proc_io_file, char *proc_smaps_file, jag_callbacks_t *callbacks, int tres_count) { static int no_share_data = -1; static int use_pss = -1; FILE *stat_fp = NULL; FILE *io_fp = NULL; int fd, fd2, i; jag_prec_t *prec = NULL; if (no_share_data == -1) { char *acct_params = slurm_get_jobacct_gather_params(); if (acct_params && xstrcasestr(acct_params, "NoShare")) no_share_data = 1; else no_share_data = 0; if (acct_params && xstrcasestr(acct_params, "UsePss")) use_pss = 1; else use_pss = 0; xfree(acct_params); } if (!(stat_fp = fopen(proc_stat_file, "r"))) return; /* Assume the process went away */ /* * Close the file on exec() of user tasks. * * NOTE: If we fork() slurmstepd after the * fopen() above and before the fcntl() below, * then the user task may have this extra file * open, which can cause problems for * checkpoint/restart, but this should be a very rare * problem in practice. */ fd = fileno(stat_fp); if (fcntl(fd, F_SETFD, FD_CLOEXEC) == -1) error("%s: fcntl(%s): %m", __func__, proc_stat_file); prec = xmalloc(sizeof(jag_prec_t)); if (!tres_count) { assoc_mgr_lock_t locks = { NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; assoc_mgr_lock(&locks); tres_count = g_tres_count; assoc_mgr_unlock(&locks); } prec->tres_count = tres_count; prec->tres_data = xmalloc(prec->tres_count * sizeof(acct_gather_data_t)); /* Initialize read/writes */ for (i = 0; i < prec->tres_count; i++) { prec->tres_data[i].num_reads = INFINITE64; prec->tres_data[i].num_writes = INFINITE64; prec->tres_data[i].size_read = INFINITE64; prec->tres_data[i].size_write = INFINITE64; } if (!_get_process_data_line(fd, prec)) { xfree(prec->tres_data); xfree(prec); fclose(stat_fp); return; } fclose(stat_fp); if (acct_gather_filesystem_g_get_data(prec->tres_data) < 0) { debug2("problem retrieving filesystem data"); } if (acct_gather_interconnect_g_get_data(prec->tres_data) < 0) { debug2("problem retrieving interconnect data"); } /* Remove shared data from rss */ if (no_share_data) _remove_share_data(proc_stat_file, prec); /* Use PSS instead if RSS */ if (use_pss) { if (_get_pss(proc_smaps_file, prec) == -1) { xfree(prec->tres_data); xfree(prec); return; } } list_append(prec_list, prec); if ((io_fp = fopen(proc_io_file, "r"))) { fd2 = fileno(io_fp); if (fcntl(fd2, F_SETFD, FD_CLOEXEC) == -1) error("%s: fcntl: %m", __func__); _get_process_io_data_line(fd2, prec); fclose(io_fp); } }
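/*
 * Illustrative sketch, not part of the plugin: the open-then-FD_CLOEXEC
 * pattern used for every /proc file in _handle_stats(), so user tasks
 * forked from slurmstepd do not inherit the descriptor.  The helper name is
 * hypothetical; <stdio.h> and <fcntl.h> are assumed.
 */
static FILE *_example_fopen_cloexec(const char *path)
{
	FILE *fp = fopen(path, "r");

	if (!fp)
		return NULL;		/* e.g. the process went away */
	if (fcntl(fileno(fp), F_SETFD, FD_CLOEXEC) == -1)
		error("%s: fcntl(%s): %m", __func__, path);
	return fp;
}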