int main(int argc, char *argv[]) { log_options_t opts = LOG_OPTS_STDERR_ONLY; log_init("sbcast", opts, SYSLOG_FACILITY_DAEMON, NULL); #ifdef HAVE_ALPS_CRAY error("The sbcast command is not supported on Cray systems"); return 1; #endif #ifdef HAVE_BG error("The sbcast command is not supported on IBM BlueGene systems"); return 1; #endif slurm_conf_init(NULL); parse_command_line(argc, argv); if (params.verbose) { opts.stderr_level += params.verbose; log_alter(opts, SYSLOG_FACILITY_DAEMON, NULL); } /* validate the source file */ if ((fd = open(params.src_fname, O_RDONLY)) < 0) { error("Can't open `%s`: %s", params.src_fname, strerror(errno)); exit(1); } if (fstat(fd, &f_stat)) { error("Can't stat `%s`: %s", params.src_fname, strerror(errno)); exit(1); } verbose("modes = %o", (unsigned int) f_stat.st_mode); verbose("uid = %d", (int) f_stat.st_uid); verbose("gid = %d", (int) f_stat.st_gid); verbose("atime = %s", slurm_ctime(&f_stat.st_atime)); verbose("mtime = %s", slurm_ctime(&f_stat.st_mtime)); verbose("ctime = %s", slurm_ctime(&f_stat.st_ctime)); verbose("size = %ld", (long) f_stat.st_size); verbose("-----------------------------"); /* identify the nodes allocated to the job */ _get_job_info(); /* transmit the file */ _bcast_file(); /* slurm_free_sbcast_cred_msg(sbcast_cred); */ exit(0); }
/*
 * scontrol_pid_info - given a local process id, print the corresponding
 *	slurm job id and its expected end time
 * IN job_pid - the local process id of interest
 *
 * If either lookup fails, the global exit_code is set to 1, the failure is
 * reported through slurm_perror() unless quiet_flag is 1, and nothing is
 * printed.
 */
extern void scontrol_pid_info(pid_t job_pid)
{
	char *failure = NULL;	/* label of the lookup that failed, if any */
	uint32_t job_id;
	time_t end_time;
	long rem_time;

	/* The second lookup is only attempted when the first one succeeds */
	if (slurm_pid2jobid (job_pid, &job_id))
		failure = "slurm_pid2jobid error";
	else if (slurm_get_end_time(job_id, &end_time))
		failure = "slurm_get_end_time error";

	if (failure) {
		exit_code = 1;
		if (quiet_flag != 1)
			slurm_perror (failure);
		return;
	}

	printf("Slurm job id %u ends at %s\n", job_id,
	       slurm_ctime(&end_time));

	rem_time = slurm_get_rem_time(job_id);
	printf("slurm_get_rem_time is %ld\n", rem_time);
}
/*****************************************************************************
 * Global Print Functions
 *****************************************************************************/

/*
 * print_date - write the current time, as formatted by slurm_ctime(),
 *	to stdout.  No newline is added here.
 */
void print_date(void)
{
	time_t current = time(NULL);

	printf("%s", slurm_ctime(&current));
}
/*
 * Job has been notified of its approaching time limit.
 * Job will be killed shortly after timeout.
 * This RPC can arrive multiple times with the same or updated timeouts;
 * only a timeout that differs from the last one seen is logged.
 * FIXME: We may want to signal the job or perform other action for this.
 * FIXME: How much lead time do we want for this message? Some jobs may
 *	require tens of minutes to gracefully terminate.
 */
static void _timeout_handler(srun_timeout_msg_t *msg)
{
	/* most recent timeout already reported; persists across calls */
	static time_t last_timeout = 0;

	/* Repeat of a timeout we already logged: nothing to do */
	if (msg->timeout == last_timeout)
		return;

	last_timeout = msg->timeout;
	verbose("job time limit to be reached at %s",
		slurm_ctime(&msg->timeout));
}
/* _rollup_handler - Process rollup duties */ static void *_rollup_handler(void *db_conn) { time_t start_time = time(NULL); time_t next_time; /* int sigarray[] = {SIGUSR1, 0}; */ struct tm tm; (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); (void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); if (!localtime_r(&start_time, &tm)) { fatal("Couldn't get localtime for rollup handler %ld", (long)start_time); return NULL; } while (1) { if (!db_conn) break; /* run the roll up */ slurm_mutex_lock(&rollup_lock); running_rollup = 1; debug2("running rollup at %s", slurm_ctime(&start_time)); acct_storage_g_roll_usage(db_conn, 0, 0, 1); acct_storage_g_commit(db_conn, 1); running_rollup = 0; slurm_mutex_unlock(&rollup_lock); /* sleep for an hour */ tm.tm_sec = 0; tm.tm_min = 0; tm.tm_hour++; tm.tm_isdst = -1; next_time = mktime(&tm); /* get the time now we have rolled usage */ start_time = time(NULL); sleep((next_time-start_time)); start_time = time(NULL); if (!localtime_r(&start_time, &tm)) { fatal("Couldn't get localtime for rollup handler %ld", (long)start_time); return NULL; } /* Just in case some new uids were added to the system pick them up here. */ assoc_mgr_set_missing_uids(); /* repeat ;) */ } return NULL; }
/* set_idbuf()
 * Write in the input buffer the current time with microseconds,
 * the process id and the current thread id.
 *
 * idbuf must be large enough for the formatted text; no length is passed
 * in, so the write is not bounded here.
 */
static void set_idbuf(char *idbuf)
{
	struct timeval now;

	gettimeofday(&now, NULL);

	/* "+ 4" and "%.15s" keep 15 characters starting at offset 4 of the
	 * slurm_ctime() text (presumably ctime(3) layout, i.e. dropping the
	 * weekday and the year -- confirm against slurm_ctime()).
	 * The microseconds are zero padded to six digits so that they read
	 * as a true fraction of the second: 5 usec prints as ".000005",
	 * not ".5     ". */
	sprintf(idbuf, "%.15s.%06d %5d %p",
		slurm_ctime(&now.tv_sec) + 4,
		(int)now.tv_usec, (int)getpid(), (void *)pthread_self());
}
/*
 * print_date - show the current time.
 *
 * In command-line mode the slurm_ctime() text goes to stdout.  Otherwise
 * the slurm_ctime2() text is drawn in text_win at (main_ycord, main_xcord)
 * and main_ycord is advanced to the next row.
 */
extern void print_date(void)
{
	time_t now_time = time(NULL);

	if (!params.commandline) {
		mvwprintw(text_win, main_ycord, main_xcord,
			  "%s", slurm_ctime2(&now_time));
		main_ycord++;
		return;
	}

	printf("%s", slurm_ctime(&now_time));
}
/* set_idbuf()
 * Write in the input buffer the current time with microseconds,
 * the process id, the thread name (when available) and the current
 * thread id.
 *
 * idbuf must be large enough for the formatted text; no length is passed
 * in, so the write is not bounded here.
 */
static void set_idbuf(char *idbuf)
{
	struct timeval now;
	char thread_name[NAMELEN];
	int max_len = 12; /* handles current longest thread name */

	gettimeofday(&now, NULL);
#if HAVE_SYS_PRCTL_H
	if (prctl(PR_GET_NAME, thread_name, NULL, NULL, NULL) < 0) {
		error("failed to get thread name: %m");
		/* fall back to an empty, zero-width name column */
		max_len = 0;
		thread_name[0] = '\0';
	}
#else
	/* skip printing thread name if not available */
	max_len = 0;
	thread_name[0] = '\0';
#endif

	/* "+ 4" and "%.15s" keep 15 characters starting at offset 4 of the
	 * slurm_ctime() text (presumably ctime(3) layout, i.e. dropping the
	 * weekday and the year -- confirm against slurm_ctime()).
	 * The microseconds are zero padded to six digits so that they read
	 * as a true fraction of the second: 5 usec prints as ".000005",
	 * not ".5     ". */
	sprintf(idbuf, "%.15s.%06d %5d %-*s %p",
		slurm_ctime(&now.tv_sec) + 4,
		(int)now.tv_usec, (int)getpid(), max_len, thread_name,
		(void *)pthread_self());
}
static int _print_stats(void) { int i; if (!buf) { printf("No data available. Probably slurmctld is not working\n"); return -1; } printf("*******************************************************\n"); printf("sdiag output at %s", slurm_ctime(&buf->req_time)); printf("Data since %s", slurm_ctime(&buf->req_time_start)); printf("*******************************************************\n"); printf("Server thread count: %d\n", buf->server_thread_count); printf("Agent queue size: %d\n\n", buf->agent_queue_size); printf("Jobs submitted: %d\n", buf->jobs_submitted); printf("Jobs started: %d\n", buf->jobs_started); printf("Jobs completed: %d\n", buf->jobs_completed); printf("Jobs canceled: %d\n", buf->jobs_canceled); printf("Jobs failed: %d\n", buf->jobs_failed); printf("\nMain schedule statistics (microseconds):\n"); printf("\tLast cycle: %u\n", buf->schedule_cycle_last); printf("\tMax cycle: %u\n", buf->schedule_cycle_max); printf("\tTotal cycles: %u\n", buf->schedule_cycle_counter); if (buf->schedule_cycle_counter > 0) { printf("\tMean cycle: %u\n", buf->schedule_cycle_sum / buf->schedule_cycle_counter); printf("\tMean depth cycle: %u\n", buf->schedule_cycle_depth / buf->schedule_cycle_counter); } if ((buf->req_time - buf->req_time_start) > 60) { printf("\tCycles per minute: %u\n", (uint32_t) (buf->schedule_cycle_counter / ((buf->req_time - buf->req_time_start) / 60))); } printf("\tLast queue length: %u\n", buf->schedule_queue_len); if (buf->bf_active) { printf("\nBackfilling stats (WARNING: data obtained" " in the middle of backfilling execution.)\n"); } else printf("\nBackfilling stats\n"); printf("\tTotal backfilled jobs (since last slurm start): %u\n", buf->bf_backfilled_jobs); printf("\tTotal backfilled jobs (since last stats cycle start): %u\n", buf->bf_last_backfilled_jobs); printf("\tTotal cycles: %u\n", buf->bf_cycle_counter); printf("\tLast cycle when: %s", slurm_ctime(&buf->bf_when_last_cycle)); printf("\tLast cycle: %u\n", buf->bf_cycle_last); printf("\tMax 
cycle: %u\n", buf->bf_cycle_max); if (buf->bf_cycle_counter > 0) { printf("\tMean cycle: %"PRIu64"\n", buf->bf_cycle_sum / buf->bf_cycle_counter); } printf("\tLast depth cycle: %u\n", buf->bf_last_depth); printf("\tLast depth cycle (try sched): %u\n", buf->bf_last_depth_try); if (buf->bf_cycle_counter > 0) { printf("\tDepth Mean: %u\n", buf->bf_depth_sum / buf->bf_cycle_counter); printf("\tDepth Mean (try depth): %u\n", buf->bf_depth_try_sum / buf->bf_cycle_counter); } printf("\tLast queue length: %u\n", buf->bf_queue_len); if (buf->bf_cycle_counter > 0) { printf("\tQueue length mean: %u\n", buf->bf_queue_len_sum / buf->bf_cycle_counter); } printf("\nRemote Procedure Call statistics by message type\n"); for (i = 0; i < buf->rpc_type_size; i++) { printf("\t%-40s(%5u) count:%-6u " "ave_time:%-6u total_time:%"PRIu64"\n", rpc_num2string(buf->rpc_type_id[i]), buf->rpc_type_id[i], buf->rpc_type_cnt[i], rpc_type_ave_time[i], buf->rpc_type_time[i]); } printf("\nRemote Procedure Call statistics by user\n"); for (i = 0; i < buf->rpc_user_size; i++) { printf("\t%-16s(%8u) count:%-6u " "ave_time:%-6u total_time:%"PRIu64"\n", uid_to_string_cached((uid_t)buf->rpc_user_id[i]), buf->rpc_user_id[i], buf->rpc_user_cnt[i], rpc_user_ave_time[i], buf->rpc_user_time[i]); } return 0; }
/*
 * as_mysql_job_start - write a job's start information to the job table.
 *
 * When job_ptr->db_index is zero a row is inserted (or, on a duplicate key,
 * updated) and the resulting id is stored in job_ptr->db_index; otherwise
 * the existing row identified by db_index is updated.  If the job's
 * reference time is older than global_last_rollup and the job is not yet
 * known to the database, the rollup times are moved back so that usage is
 * rolled again from that time.
 *
 * IN mysql_conn - database connection
 * IN/OUT job_ptr - job being recorded; db_index may be modified
 * RET SLURM_ERROR if the job has neither a submit time nor a resize time
 *	or a query fails, ESLURM_DB_CONNECTION if check_connection() fails,
 *	otherwise the result of the last query issued
 */
extern int as_mysql_job_start(mysql_conn_t *mysql_conn,
			      struct job_record *job_ptr)
{
	int rc=SLURM_SUCCESS;
	char *nodes = NULL, *jname = NULL, *node_inx = NULL;
	int track_steps = 0;
	char *block_id = NULL, *partition = NULL,
		*gres_req = NULL, *gres_alloc = NULL;
	char *query = NULL;
	int reinit = 0;
	time_t begin_time, check_time, start_time, submit_time;
	uint32_t wckeyid = 0;
	int job_state, node_cnt = 0;
	/* remember the index we came in with; db_index may be reset below */
	uint32_t job_db_inx = job_ptr->db_index;
	/* Output buffer handed to bit_fmt().  node_inx is presumed to point
	 * into it and is read while the query is built much later, so the
	 * buffer has to live for the whole function rather than only inside
	 * the block that fills it. */
	char temp_bit[BUF_SIZE];

	if ((!job_ptr->details || !job_ptr->details->submit_time)
	    && !job_ptr->resize_time) {
		error("as_mysql_job_start: "
		      "Not inputing this job, it has no submit time.");
		return SLURM_ERROR;
	}

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	debug2("as_mysql_slurmdb_job_start() called");

	job_state = job_ptr->job_state;

	/* A resized job is recorded as if it began at the resize time */
	if (job_ptr->resize_time) {
		begin_time  = job_ptr->resize_time;
		submit_time = job_ptr->resize_time;
		start_time  = job_ptr->resize_time;
	} else {
		begin_time  = job_ptr->details->begin_time;
		submit_time = job_ptr->details->submit_time;
		start_time  = job_ptr->start_time;
	}

	/* Since we need a new db_inx make sure the old db_inx
	 * removed. This is most likely the only time we are going to
	 * be notified of the change also so make the state without
	 * the resize. */
	if (IS_JOB_RESIZING(job_ptr)) {
		/* If we have a db_index lets end the previous record. */
		if (!job_ptr->db_index) {
			error("We don't have a db_index for job %u, "
			      "this should only happen when resizing "
			      "jobs and the database interface was down.",
			      job_ptr->job_id);
			job_ptr->db_index = _get_db_index(mysql_conn,
							  job_ptr->details->
							  submit_time,
							  job_ptr->job_id,
							  job_ptr->assoc_id);
		}

		if (job_ptr->db_index)
			as_mysql_job_complete(mysql_conn, job_ptr);

		job_state &= (~JOB_RESIZING);
		job_ptr->db_index = 0;
	}

	job_state &= JOB_STATE_BASE;

	/* See what we are hearing about here if no start time. If
	 * this job latest time is before the last roll up we will
	 * need to reset it to look at this job. */
	if (start_time)
		check_time = start_time;
	else if (begin_time)
		check_time = begin_time;
	else
		check_time = submit_time;

	slurm_mutex_lock(&rollup_lock);
	if (check_time < global_last_rollup) {
		MYSQL_RES *result = NULL;
		MYSQL_ROW row;

		/* check to see if we are hearing about this time for the
		 * first time.
		 */
		query = xstrdup_printf("select job_db_inx "
				       "from \"%s_%s\" where id_job=%u and "
				       "time_submit=%ld and time_eligible=%ld "
				       "and time_start=%ld;",
				       mysql_conn->cluster_name, job_table,
				       job_ptr->job_id, submit_time,
				       begin_time, start_time);
		debug3("%d(%s:%d) query\n%s",
		       mysql_conn->conn, THIS_FILE, __LINE__, query);
		if (!(result =
		      mysql_db_query_ret(mysql_conn, query, 0))) {
			xfree(query);
			slurm_mutex_unlock(&rollup_lock);
			return SLURM_ERROR;
		}
		xfree(query);
		if ((row = mysql_fetch_row(result))) {
			mysql_free_result(result);
			debug4("revieved an update for a "
			       "job (%u) already known about",
			       job_ptr->job_id);
			slurm_mutex_unlock(&rollup_lock);
			goto no_rollup_change;
		}
		mysql_free_result(result);

		if (job_ptr->start_time)
			debug("Need to reroll usage from %sJob %u "
			      "from %s started then and we are just "
			      "now hearing about it.",
			      slurm_ctime(&check_time),
			      job_ptr->job_id, mysql_conn->cluster_name);
		else if (begin_time)
			debug("Need to reroll usage from %sJob %u "
			      "from %s became eligible then and we are just "
			      "now hearing about it.",
			      slurm_ctime(&check_time),
			      job_ptr->job_id, mysql_conn->cluster_name);
		else
			debug("Need to reroll usage from %sJob %u "
			      "from %s was submitted then and we are just "
			      "now hearing about it.",
			      slurm_ctime(&check_time),
			      job_ptr->job_id, mysql_conn->cluster_name);

		global_last_rollup = check_time;
		slurm_mutex_unlock(&rollup_lock);

		/* If the times here are later than the daily_rollup
		   or monthly rollup it isn't a big deal since they
		   are always shrunk down to the beginning of each
		   time period.
		*/
		query = xstrdup_printf("update \"%s_%s\" set "
				       "hourly_rollup=%ld, "
				       "daily_rollup=%ld, monthly_rollup=%ld",
				       mysql_conn->cluster_name,
				       last_ran_table, check_time,
				       check_time, check_time);
		debug3("%d(%s:%d) query\n%s",
		       mysql_conn->conn, THIS_FILE, __LINE__, query);
		rc = mysql_db_query(mysql_conn, query);
		xfree(query);
	} else
		slurm_mutex_unlock(&rollup_lock);

no_rollup_change:

	/* Unnamed jobs are stored as "allocation" and have steps tracked */
	if (job_ptr->name && job_ptr->name[0])
		jname = slurm_add_slash_to_quotes(job_ptr->name);
	else {
		jname = xstrdup("allocation");
		track_steps = 1;
	}

	if (job_ptr->nodes && job_ptr->nodes[0])
		nodes = job_ptr->nodes;
	else
		nodes = "None assigned";

	if (job_ptr->batch_flag)
		track_steps = 1;

	if (slurmdbd_conf) {
		block_id = xstrdup(job_ptr->comment);
		node_cnt = job_ptr->total_nodes;
		node_inx = job_ptr->network;
	} else {
		if (job_ptr->node_bitmap) {
			node_inx = bit_fmt(temp_bit, sizeof(temp_bit),
					   job_ptr->node_bitmap);
		}
#ifdef HAVE_BG
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_BLOCK_ID,
					    &block_id);
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &node_cnt);
#else
		node_cnt = job_ptr->total_nodes;
#endif
	}

	/* If there is a start_time get the wckeyid.  If the job is
	 * cancelled before the job starts we also want to grab it. */
	if (job_ptr->assoc_id
	    && (job_ptr->start_time || IS_JOB_CANCELLED(job_ptr)))
		wckeyid = _get_wckeyid(mysql_conn, &job_ptr->wckey,
				       job_ptr->user_id,
				       mysql_conn->cluster_name,
				       job_ptr->assoc_id);

	if (job_ptr->partition)
		partition = slurm_add_slash_to_quotes(job_ptr->partition);

	if (job_ptr->gres_req)
		gres_req = slurm_add_slash_to_quotes(job_ptr->gres_req);

	if (job_ptr->gres_alloc)
		gres_alloc = slurm_add_slash_to_quotes(job_ptr->gres_alloc);

	if (!job_ptr->db_index) {
		/* No record yet: insert one, or update it on duplicate key */
		if (!begin_time)
			begin_time = submit_time;
		query = xstrdup_printf(
			"insert into \"%s_%s\" "
			"(id_job, id_array_job, id_array_task, "
			"id_assoc, id_qos, id_wckey, id_user, "
			"id_group, nodelist, id_resv, timelimit, "
			"time_eligible, time_submit, time_start, "
			"job_name, track_steps, state, priority, cpus_req, "
			"cpus_alloc, nodes_alloc, mem_req",
			mysql_conn->cluster_name, job_table);

		/* Optional columns; the same tests, in the same order, are
		 * repeated for the values and the update list below */
		if (job_ptr->account)
			xstrcat(query, ", account");
		if (partition)
			xstrcat(query, ", `partition`");
		if (block_id)
			xstrcat(query, ", id_block");
		if (job_ptr->wckey)
			xstrcat(query, ", wckey");
		if (node_inx)
			xstrcat(query, ", node_inx");
		if (gres_req)
			xstrcat(query, ", gres_req");
		if (gres_alloc)
			xstrcat(query, ", gres_alloc");

		xstrfmtcat(query,
			   ") values (%u, %u, %u, %u, %u, %u, %u, %u, "
			   "'%s', %u, %u, %ld, %ld, %ld, "
			   "'%s', %u, %u, %u, %u, %u, %u, %u",
			   job_ptr->job_id, job_ptr->array_job_id,
			   job_ptr->array_task_id, job_ptr->assoc_id,
			   job_ptr->qos_id, wckeyid,
			   job_ptr->user_id, job_ptr->group_id, nodes,
			   job_ptr->resv_id, job_ptr->time_limit,
			   begin_time, submit_time, start_time,
			   jname, track_steps, job_state,
			   job_ptr->priority, job_ptr->details->min_cpus,
			   job_ptr->total_cpus, node_cnt,
			   job_ptr->details->pn_min_memory);

		if (job_ptr->account)
			xstrfmtcat(query, ", '%s'", job_ptr->account);
		if (partition)
			xstrfmtcat(query, ", '%s'", partition);
		if (block_id)
			xstrfmtcat(query, ", '%s'", block_id);
		if (job_ptr->wckey)
			xstrfmtcat(query, ", '%s'", job_ptr->wckey);
		if (node_inx)
			xstrfmtcat(query, ", '%s'", node_inx);
		if (gres_req)
			xstrfmtcat(query, ", '%s'", gres_req);
		if (gres_alloc)
			xstrfmtcat(query, ", '%s'", gres_alloc);

		xstrfmtcat(query,
			   ") on duplicate key update "
			   "job_db_inx=LAST_INSERT_ID(job_db_inx), "
			   "id_wckey=%u, id_user=%u, id_group=%u, "
			   "nodelist='%s', id_resv=%u, timelimit=%u, "
			   "time_submit=%ld, time_start=%ld, "
			   "job_name='%s', track_steps=%u, id_qos=%u, "
			   "state=greatest(state, %u), priority=%u, "
			   "cpus_req=%u, cpus_alloc=%u, nodes_alloc=%u, "
			   "mem_req=%u, id_array_job=%u, id_array_task=%u",
			   wckeyid, job_ptr->user_id, job_ptr->group_id, nodes,
			   job_ptr->resv_id, job_ptr->time_limit,
			   submit_time, start_time,
			   jname, track_steps, job_ptr->qos_id, job_state,
			   job_ptr->priority, job_ptr->details->min_cpus,
			   job_ptr->total_cpus, node_cnt,
			   job_ptr->details->pn_min_memory,
			   job_ptr->array_job_id,
			   job_ptr->array_task_id);

		if (job_ptr->account)
			xstrfmtcat(query, ", account='%s'", job_ptr->account);
		if (partition)
			xstrfmtcat(query, ", `partition`='%s'", partition);
		if (block_id)
			xstrfmtcat(query, ", id_block='%s'", block_id);
		if (job_ptr->wckey)
			xstrfmtcat(query, ", wckey='%s'", job_ptr->wckey);
		if (node_inx)
			xstrfmtcat(query, ", node_inx='%s'", node_inx);
		if (gres_req)
			xstrfmtcat(query, ", gres_req='%s'", gres_req);
		if (gres_alloc)
			xstrfmtcat(query, ", gres_alloc='%s'", gres_alloc);

		debug3("%d(%s:%d) query\n%s",
		       mysql_conn->conn, THIS_FILE, __LINE__, query);
	try_again:
		if (!(job_ptr->db_index = mysql_db_insert_ret_id(
			      mysql_conn, query))) {
			/* One reconnect-and-retry before giving up */
			if (!reinit) {
				error("It looks like the storage has gone "
				      "away trying to reconnect");
				mysql_db_close_db_connection(
					mysql_conn);
				/* reconnect */
				check_connection(mysql_conn);
				reinit = 1;
				goto try_again;
			} else
				rc = SLURM_ERROR;
		}
	} else {
		/* Record already exists: update it in place */
		query = xstrdup_printf("update \"%s_%s\" set nodelist='%s', ",
				       mysql_conn->cluster_name,
				       job_table, nodes);

		if (job_ptr->account)
			xstrfmtcat(query, "account='%s', ", job_ptr->account);
		if (partition)
			xstrfmtcat(query, "`partition`='%s', ", partition);
		if (block_id)
			xstrfmtcat(query, "id_block='%s', ", block_id);
		if (job_ptr->wckey)
			xstrfmtcat(query, "wckey='%s', ", job_ptr->wckey);
		if (node_inx)
			xstrfmtcat(query, "node_inx='%s', ", node_inx);
		if (gres_req)
			xstrfmtcat(query, "gres_req='%s', ", gres_req);
		if (gres_alloc)
			xstrfmtcat(query, "gres_alloc='%s', ", gres_alloc);

		xstrfmtcat(query, "time_start=%ld, job_name='%s', state=%u, "
			   "cpus_alloc=%u, nodes_alloc=%u, id_qos=%u, "
			   "id_assoc=%u, id_wckey=%u, id_resv=%u, "
			   "timelimit=%u, mem_req=%u, "
			   "id_array_job=%u, id_array_task=%u, "
			   "time_eligible=%ld where job_db_inx=%d",
			   start_time, jname, job_state,
			   job_ptr->total_cpus, node_cnt, job_ptr->qos_id,
			   job_ptr->assoc_id, wckeyid,
			   job_ptr->resv_id, job_ptr->time_limit,
			   job_ptr->details->pn_min_memory,
			   job_ptr->array_job_id,
			   job_ptr->array_task_id,
			   begin_time, job_ptr->db_index);

		debug3("%d(%s:%d) query\n%s",
		       mysql_conn->conn, THIS_FILE, __LINE__, query);
		rc = mysql_db_query(mysql_conn, query);
	}

	xfree(block_id);
	xfree(partition);
	xfree(gres_req);
	xfree(gres_alloc);
	xfree(jname);
	xfree(query);

	/* now we will reset all the steps */
	if (IS_JOB_RESIZING(job_ptr)) {
		/* FIXME : Verify this is still needed */
		if (IS_JOB_SUSPENDED(job_ptr))
			as_mysql_suspend(mysql_conn, job_db_inx, job_ptr);
	}

	return rc;
}