/*
 * msg_aggr_sender_init - initialize the global msg_collection state and
 *	start the background aggregation/sender thread.
 *
 * IN host        - node that aggregated messages are sent to
 * IN port        - port on that node
 * IN window      - aggregation window passed through to msg_collection
 * IN max_msg_cnt - max messages per batch; a value <= 1 disables
 *		    aggregation and makes this call a no-op
 *
 * No-op if the collection is already running.
 */
extern void msg_aggr_sender_init(char *host, uint16_t port, uint64_t window,
				 uint64_t max_msg_cnt)
{
	/* Already started, or aggregation effectively disabled */
	if (msg_collection.running || (max_msg_cnt <= 1))
		return;

	memset(&msg_collection, 0, sizeof(msg_collection_type_t));

	slurm_mutex_init(&msg_collection.aggr_mutex);
	slurm_mutex_init(&msg_collection.mutex);

	/* NOTE(review): both locks are taken before the sender thread is
	 * created below, so no other thread can touch msg_collection yet;
	 * the locking here looks redundant but is harmless. */
	slurm_mutex_lock(&msg_collection.mutex);
	slurm_mutex_lock(&msg_collection.aggr_mutex);
	slurm_cond_init(&msg_collection.cond, NULL);
	slurm_set_addr(&msg_collection.node_addr, port, host);
	msg_collection.window = window;
	msg_collection.max_msg_cnt = max_msg_cnt;
	msg_collection.msg_aggr_list = list_create(_msg_aggr_free);
	msg_collection.msg_list = list_create(slurm_free_comp_msg_list);
	msg_collection.max_msgs = false;
	msg_collection.debug_flags = slurm_get_debug_flags();
	slurm_mutex_unlock(&msg_collection.aggr_mutex);
	slurm_mutex_unlock(&msg_collection.mutex);

	/* msg_collection.running is not set here -- presumably set by the
	 * sender thread itself; TODO confirm in _msg_aggregation_sender() */
	slurm_thread_create(&msg_collection.thread_id,
			    &_msg_aggregation_sender, NULL);
}
/*
 * start_msg_tree - logic to begin the forward tree and
 *	accumulate the return codes from processes getting the
 *	forwarded message
 *
 * IN: hl      - hostlist_t - list of every node to send message to
 * IN: msg     - slurm_msg_t - message to send.
 * IN: timeout - int - how long to wait in milliseconds.
 * RET List    - List containing the responses of the children (if any) we
 *		 forwarded the message to. List containing type
 *		 (ret_data_info_t).
 */
extern List start_msg_tree(hostlist_t hl, slurm_msg_t *msg, int timeout)
{
	fwd_tree_t fwd_tree;
	pthread_mutex_t tree_mutex;
	pthread_cond_t notify;
	int count = 0;
	List ret_list = NULL;
	int thr_count = 0;
	int host_count = 0;
	hostlist_t* sp_hl;
	int hl_count = 0;

	xassert(hl);
	xassert(msg);

	hostlist_uniq(hl);
	host_count = hostlist_count(hl);

	if (route_g_split_hostlist(hl, &sp_hl, &hl_count,
				   msg->forward.tree_width)) {
		error("unable to split forward hostlist");
		return NULL;
	}
	slurm_mutex_init(&tree_mutex);
	/* FIX: restored "&notify" -- the text had been corrupted into the
	 * HTML entity sequence "&not;ify" everywhere this address was taken,
	 * which does not compile. */
	slurm_cond_init(&notify, NULL);

	ret_list = list_create(destroy_data_info);

	memset(&fwd_tree, 0, sizeof(fwd_tree));
	fwd_tree.orig_msg = msg;
	fwd_tree.ret_list = ret_list;
	fwd_tree.timeout = timeout;
	fwd_tree.notify = &notify;
	fwd_tree.p_thr_count = &thr_count;
	fwd_tree.tree_mutex = &tree_mutex;

	_start_msg_tree_internal(NULL, sp_hl, &fwd_tree, hl_count);

	xfree(sp_hl);

	slurm_mutex_lock(&tree_mutex);

	count = list_count(ret_list);
	debug2("Tree head got back %d looking for %d", count, host_count);
	/* Block until every forwarding thread has checked in */
	while (thr_count > 0) {
		slurm_cond_wait(&notify, &tree_mutex);
		count = list_count(ret_list);
		debug2("Tree head got back %d", count);
	}
	xassert(count >= host_count);	/* Tree head did not get all responses,
					 * but no more active fwd threads!*/
	slurm_mutex_unlock(&tree_mutex);

	slurm_mutex_destroy(&tree_mutex);
	slurm_cond_destroy(&notify);

	return ret_list;
}
static int _signal_job_by_str(void) { job_cancel_info_t *cancel_info; int err, i, rc = 0; pthread_t dummy; slurm_attr_init(&attr); if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED)) error("pthread_attr_setdetachstate error %m"); slurm_mutex_init(&num_active_threads_lock); slurm_cond_init(&num_active_threads_cond, NULL); for (i = 0; opt.job_list[i]; i++) { cancel_info = (job_cancel_info_t *) xmalloc(sizeof(job_cancel_info_t)); cancel_info->job_id_str = xstrdup(opt.job_list[i]); cancel_info->rc = &rc; cancel_info->sig = opt.signal; cancel_info->num_active_threads = &num_active_threads; cancel_info->num_active_threads_lock = &num_active_threads_lock; cancel_info->num_active_threads_cond = &num_active_threads_cond; slurm_mutex_lock(&num_active_threads_lock); num_active_threads++; while (num_active_threads > MAX_THREADS) { slurm_cond_wait(&num_active_threads_cond, &num_active_threads_lock); } slurm_mutex_unlock(&num_active_threads_lock); err = pthread_create(&dummy, &attr, _cancel_job_id,cancel_info); if (err) /* Run in-line if thread create fails */ _cancel_job_id(cancel_info); } /* Wait all spawned threads to finish */ slurm_mutex_lock( &num_active_threads_lock ); while (num_active_threads > 0) { slurm_cond_wait(&num_active_threads_cond, &num_active_threads_lock); } slurm_mutex_unlock(&num_active_threads_lock); slurm_attr_destroy(&attr); return rc; }
/* _cancel_jobs - filter then cancel jobs or job steps per request */ static int _cancel_jobs(int filter_cnt) { int rc = 0; slurm_attr_init(&attr); if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED)) error("pthread_attr_setdetachstate error %m"); slurm_mutex_init(&num_active_threads_lock); slurm_cond_init(&num_active_threads_cond, NULL); _cancel_jobs_by_state(JOB_PENDING, filter_cnt, &rc); /* Wait for any cancel of pending jobs to complete before starting * cancellation of running jobs so that we don't have a race condition * with pending jobs getting scheduled while running jobs are also * being cancelled. */ slurm_mutex_lock( &num_active_threads_lock ); while (num_active_threads > 0) { slurm_cond_wait(&num_active_threads_cond, &num_active_threads_lock); } slurm_mutex_unlock(&num_active_threads_lock); _cancel_jobs_by_state(JOB_END, filter_cnt, &rc); /* Wait for any spawned threads that have not finished */ slurm_mutex_lock( &num_active_threads_lock ); while (num_active_threads > 0) { slurm_cond_wait(&num_active_threads_cond, &num_active_threads_lock); } slurm_mutex_unlock(&num_active_threads_lock); slurm_attr_destroy(&attr); slurm_mutex_destroy(&num_active_threads_lock); slurm_cond_destroy(&num_active_threads_cond); return rc; }
/*
 * msg_aggr_add_msg - add a message to the current aggregation window.
 *
 * IN msg           - message to aggregate; its msg_index is assigned here
 * IN wait          - if true, block until a response for this message
 *		      arrives or the configured message timeout expires
 * IN resp_callback - invoked with the response message when it arrives
 *		      (only meaningful when wait is true)
 */
extern void msg_aggr_add_msg(slurm_msg_t *msg, bool wait,
			     void (*resp_callback) (slurm_msg_t *msg))
{
	int count;
	static uint16_t msg_index = 1;
	static uint32_t wait_count = 0;

	if (!msg_collection.running)
		return;

	slurm_mutex_lock(&msg_collection.mutex);
	if (msg_collection.max_msgs == true) {
		slurm_cond_wait(&msg_collection.cond, &msg_collection.mutex);
	}

	msg->msg_index = msg_index++;

	/* Add msg to message collection */
	list_append(msg_collection.msg_list, msg);

	count = list_count(msg_collection.msg_list);

	/* First msg in collection; initiate new window */
	if (count == 1)
		slurm_cond_signal(&msg_collection.cond);

	/* Max msgs reached; terminate window */
	if (count >= msg_collection.max_msg_cnt) {
		msg_collection.max_msgs = true;
		slurm_cond_signal(&msg_collection.cond);
	}
	slurm_mutex_unlock(&msg_collection.mutex);

	if (wait) {
		msg_aggr_t *msg_aggr = xmalloc(sizeof(msg_aggr_t));
		uint16_t msg_timeout;
		struct timeval now;
		struct timespec timeout;

		msg_aggr->msg_index = msg->msg_index;
		msg_aggr->resp_callback = resp_callback;
		slurm_cond_init(&msg_aggr->wait_cond, NULL);

		slurm_mutex_lock(&msg_collection.aggr_mutex);
		list_append(msg_collection.msg_aggr_list, msg_aggr);

		msg_timeout = slurm_get_msg_timeout();
		gettimeofday(&now, NULL);
		timeout.tv_sec = now.tv_sec + msg_timeout;
		timeout.tv_nsec = now.tv_usec * 1000;

		wait_count++;

		/* NOTE(review): a single timedwait cannot distinguish a
		 * spurious wakeup from a real signal; a predicate-guarded
		 * loop would be more robust -- confirm the response path
		 * signals wait_cond exactly once per waiter. */
		if (pthread_cond_timedwait(&msg_aggr->wait_cond,
					   &msg_collection.aggr_mutex,
					   &timeout) == ETIMEDOUT)
			_handle_msg_aggr_ret(msg_aggr->msg_index, 1);
		wait_count--;
		slurm_mutex_unlock(&msg_collection.aggr_mutex);
		/* FIX: removed a stray empty statement (";") that sat
		 * between the unlock and the check below */

		/* Last waiter after shutdown tears the aggr mutex down */
		if (!msg_collection.running && !wait_count)
			slurm_mutex_destroy(&msg_collection.aggr_mutex);

		_msg_aggr_free(msg_aggr);
	}
}
/*
 * batch_stepd_step_rec_create - build a stepd_step_rec_t for a batch job
 *	launch request (a single implicit task that runs the job script).
 *
 * IN msg - batch job launch request
 * RET new record, or NULL if the accounting sample frequency check fails.
 *
 * NOTE(review): this mutates msg (std_err may be set from std_out below),
 *	so callers must not assume msg is unchanged.
 */
extern stepd_step_rec_t *
batch_stepd_step_rec_create(batch_job_launch_msg_t *msg)
{
	stepd_step_rec_t *job;
	srun_info_t *srun = NULL;
	char *in_name;

	xassert(msg != NULL);

	debug3("entering batch_stepd_step_rec_create");

	/* Reject the job if its accounting gather frequency fails the
	 * configured check */
	if (acct_gather_check_acct_freq_task(msg->job_mem, msg->acctg_freq))
		return NULL;

	job = xmalloc(sizeof(stepd_step_rec_t));

	job->state = SLURMSTEPD_STEP_STARTING;
	slurm_cond_init(&job->state_cond, NULL);
	slurm_mutex_init(&job->state_mutex);
	if (msg->cpus_per_node)
		job->cpus = msg->cpus_per_node[0];
	job->node_tasks = 1;
	job->ntasks = msg->ntasks;
	job->jobid = msg->job_id;
	job->stepid = msg->step_id;
	job->array_job_id = msg->array_job_id;
	job->array_task_id = msg->array_task_id;
	job->pack_jobid = NO_VAL;	/* Used to set env vars */
	job->pack_nnodes = NO_VAL;	/* Used to set env vars */
	job->pack_ntasks = NO_VAL;	/* Used to set env vars */
	job->pack_offset = NO_VAL;	/* Used to set labels and env vars */
	job->job_core_spec = msg->job_core_spec;

	job->batch = true;
	job->node_name = xstrdup(conf->node_name);

	job->uid = (uid_t) msg->uid;
	job->gid = (gid_t) msg->gid;
	job->user_name = xstrdup(msg->user_name);
	job->ngids = (int) msg->ngids;
	job->gids = copy_gids(msg->ngids, msg->gids);

	job->profile = msg->profile;

	/* give them all to the 1 task */
	job->cpus_per_task = job->cpus;

	/* This needs to happen before acct_gather_profile_startpoll
	   and only really looks at the profile in the job.
	*/
	acct_gather_profile_g_node_step_start(job);
	/* needed for the jobacct_gather plugin to start */
	acct_gather_profile_startpoll(msg->acctg_freq,
				      conf->job_acct_gather_freq);

	job->open_mode = msg->open_mode;
	job->overcommit = (bool) msg->overcommit;

	job->cwd = xstrdup(msg->work_dir);

	job->ckpt_dir = xstrdup(msg->ckpt_dir);
	job->restart_dir = xstrdup(msg->restart_dir);

	job->env = _array_copy(msg->envc, msg->environment);
	job->eio = eio_handle_create(0);
	job->sruns = list_create((ListDelF) _srun_info_destructor);
	job->envtp = xmalloc(sizeof(env_t));
	/* -1 / 0 / NULL mark "not yet set" for the env template */
	job->envtp->jobid = -1;
	job->envtp->stepid = -1;
	job->envtp->procid = -1;
	job->envtp->localid = -1;
	job->envtp->nodeid = -1;
	job->envtp->distribution = 0;
	job->cpu_bind_type = msg->cpu_bind_type;
	job->cpu_bind = xstrdup(msg->cpu_bind);
	job->envtp->mem_bind_type = 0;
	job->envtp->mem_bind = NULL;
	job->envtp->ckpt_dir = NULL;
	job->envtp->restart_cnt = msg->restart_cnt;

	/* NOTE(review): redundant -- job->cpus was already assigned from
	 * cpus_per_node[0] near the top of this function */
	if (msg->cpus_per_node)
		job->cpus = msg->cpus_per_node[0];

	format_core_allocs(msg->cred, conf->node_name, conf->cpus,
			   &job->job_alloc_cores, &job->step_alloc_cores,
			   &job->job_mem, &job->step_mem);

	/* Apply the step limit if present, else the job limit -- only when
	 * memory-limit enforcement is enabled in the slurmd config */
	if (job->step_mem && conf->mem_limit_enforce)
		jobacct_gather_set_mem_limit(job->jobid, NO_VAL,
					     job->step_mem);
	else if (job->job_mem && conf->mem_limit_enforce)
		jobacct_gather_set_mem_limit(job->jobid, NO_VAL,
					     job->job_mem);

	get_cred_gres(msg->cred, conf->node_name,
		      &job->job_gres_list, &job->step_gres_list);

	srun = srun_info_create(NULL, NULL, NULL, NO_VAL16);

	list_append(job->sruns, (void *) srun);

	if (msg->argc) {
		job->argc = msg->argc;
		job->argv = _array_copy(job->argc, msg->argv);
	} else {
		job->argc = 1;
		/* job script has not yet been written out to disk --
		 * argv will be filled in later by _make_batch_script() */
		job->argv = (char **) xmalloc(2 * sizeof(char *));
	}

	/* Batch jobs have exactly one task */
	job->task = xmalloc(sizeof(stepd_step_task_info_t *));
	if (msg->std_err == NULL)
		msg->std_err = xstrdup(msg->std_out);

	if (msg->std_in == NULL)
		in_name = xstrdup("/dev/null");
	else
		in_name = fname_create(job, msg->std_in, 0);

	job->task[0] = _task_info_create(0, 0, in_name,
					 _batchfilename(job, msg->std_out),
					 _batchfilename(job, msg->std_err));
	job->task[0]->argc = job->argc;
	job->task[0]->argv = job->argv;

#ifdef HAVE_ALPS_CRAY
	select_g_select_jobinfo_get(msg->select_jobinfo,
				    SELECT_JOBDATA_RESV_ID, &job->resv_id);
#endif

	return job;
}
/*
 * stepd_step_rec_create - create a slurmd job structure from a launch
 *	tasks message.
 *
 * IN msg              - task launch request (retained in job->msg; also
 *			 mutated below: num_resp_port and flags may change)
 * IN protocol_version - protocol version of the sending srun
 * RET new record, or NULL if the accounting frequency check fails or this
 *	node is not in the message's node list.
 */
extern stepd_step_rec_t *stepd_step_rec_create(launch_tasks_request_msg_t *msg,
					       uint16_t protocol_version)
{
	stepd_step_rec_t *job = NULL;
	srun_info_t *srun = NULL;
	slurm_addr_t resp_addr;
	slurm_addr_t io_addr;
	int i, nodeid = NO_VAL;

	xassert(msg != NULL);
	xassert(msg->complete_nodelist != NULL);
	debug3("entering stepd_step_rec_create");

	/* Reject the step if its accounting gather frequency fails the
	 * configured check */
	if (acct_gather_check_acct_freq_task(msg->job_mem_lim,
					     msg->acctg_freq))
		return NULL;

	job = xmalloc(sizeof(stepd_step_rec_t));
	job->msg = msg;
#ifndef HAVE_FRONT_END
	nodeid = nodelist_find(msg->complete_nodelist, conf->node_name);
	job->node_name = xstrdup(conf->node_name);
#else
	nodeid = 0;
	job->node_name = xstrdup(msg->complete_nodelist);
#endif
	if (nodeid < 0) {
		error("couldn't find node %s in %s",
		      job->node_name, msg->complete_nodelist);
		stepd_step_rec_destroy(job);
		return NULL;
	}

	job->state = SLURMSTEPD_STEP_STARTING;
	slurm_cond_init(&job->state_cond, NULL);
	slurm_mutex_init(&job->state_mutex);
	job->node_tasks = msg->tasks_to_launch[nodeid];
	i = sizeof(uint16_t) * msg->nnodes;
	job->task_cnts = xmalloc(i);
	memcpy(job->task_cnts, msg->tasks_to_launch, i);
	job->ntasks = msg->ntasks;
	job->jobid = msg->job_id;
	job->stepid = msg->job_step_id;

	job->uid = (uid_t) msg->uid;
	job->gid = (gid_t) msg->gid;
	job->user_name = xstrdup(msg->user_name);
	job->ngids = (int) msg->ngids;
	job->gids = copy_gids(msg->ngids, msg->gids);

	job->cwd = xstrdup(msg->cwd);
	job->task_dist = msg->task_dist;

	job->cpu_bind_type = msg->cpu_bind_type;
	job->cpu_bind = xstrdup(msg->cpu_bind);
	job->mem_bind_type = msg->mem_bind_type;
	job->mem_bind = xstrdup(msg->mem_bind);
	job->cpu_freq_min = msg->cpu_freq_min;
	job->cpu_freq_max = msg->cpu_freq_max;
	job->cpu_freq_gov = msg->cpu_freq_gov;
	job->ckpt_dir = xstrdup(msg->ckpt_dir);
	job->restart_dir = xstrdup(msg->restart_dir);
	job->cpus_per_task = msg->cpus_per_task;

	job->env = _array_copy(msg->envc, msg->env);
	/* Defaults; possibly overridden from the environment below */
	job->array_job_id = msg->job_id;
	job->array_task_id = NO_VAL;
	job->node_offset = msg->node_offset;	/* Used for env vars */
	job->pack_jobid = msg->pack_jobid;	/* Used for env vars */
	job->pack_nnodes = msg->pack_nnodes;	/* Used for env vars */
	if (msg->pack_nnodes && msg->pack_ntasks && msg->pack_task_cnts) {
		job->pack_ntasks = msg->pack_ntasks; /* Used for env vars */
		i = sizeof(uint16_t) * msg->pack_nnodes;
		job->pack_task_cnts = xmalloc(i);
		memcpy(job->pack_task_cnts, msg->pack_task_cnts, i);
	}
	job->pack_offset = msg->pack_offset;	/* Used for env vars & labels */
	job->pack_task_offset = msg->pack_task_offset;	/* Used for env vars
							 * & labels */
	job->pack_node_list = xstrdup(msg->pack_node_list);
	/* Pull job-array ids out of the environment when present; the
	 * digit rulers below line up the prefix lengths */
	for (i = 0; i < msg->envc; i++) {
		/*                         1234567890123456789 */
		if (!xstrncmp(msg->env[i], "SLURM_ARRAY_JOB_ID=", 19))
			job->array_job_id = atoi(msg->env[i] + 19);
		/*                         12345678901234567890 */
		if (!xstrncmp(msg->env[i], "SLURM_ARRAY_TASK_ID=", 20))
			job->array_task_id = atoi(msg->env[i] + 20);
	}

	job->eio = eio_handle_create(0);
	job->sruns = list_create((ListDelF) _srun_info_destructor);

	/*
	 * Based on my testing the next 3 lists here could use the
	 * eio_obj_destroy, but if you do you can get an invalid read. Since
	 * these stay until the end of the job it isn't that big of a deal.
	 */
	job->clients = list_create(NULL); /* FIXME! Needs destructor */
	job->stdout_eio_objs = list_create(NULL); /* FIXME! Needs destructor */
	job->stderr_eio_objs = list_create(NULL); /* FIXME! Needs destructor */
	job->free_incoming = list_create(NULL); /* FIXME! Needs destructor */
	job->incoming_count = 0;
	job->free_outgoing = list_create(NULL); /* FIXME! Needs destructor */
	job->outgoing_count = 0;
	job->outgoing_cache = list_create(NULL); /* FIXME! Needs destructor */

	job->envtp = xmalloc(sizeof(env_t));
	/* -1 / 0 / NULL mark "not yet set" for the env template */
	job->envtp->jobid = -1;
	job->envtp->stepid = -1;
	job->envtp->procid = -1;
	job->envtp->localid = -1;
	job->envtp->nodeid = -1;

	job->envtp->distribution = 0;
	job->envtp->cpu_bind_type = 0;
	job->envtp->cpu_bind = NULL;
	job->envtp->mem_bind_type = 0;
	job->envtp->mem_bind = NULL;
	job->envtp->ckpt_dir = NULL;
	if (!msg->resp_port)
		msg->num_resp_port = 0;
	if (msg->num_resp_port) {
		/* Spread responses across srun's listening ports */
		job->envtp->comm_port =
			msg->resp_port[nodeid % msg->num_resp_port];
		memcpy(&resp_addr, &msg->orig_addr, sizeof(slurm_addr_t));
		slurm_set_addr(&resp_addr,
			       msg->resp_port[nodeid % msg->num_resp_port],
			       NULL);
	} else {
		memset(&resp_addr, 0, sizeof(slurm_addr_t));
	}
	/* No I/O ports implies the user manages task I/O directly */
	if (!msg->io_port)
		msg->flags |= LAUNCH_USER_MANAGED_IO;
	if ((msg->flags & LAUNCH_USER_MANAGED_IO) == 0) {
		memcpy(&io_addr, &msg->orig_addr, sizeof(slurm_addr_t));
		slurm_set_addr(&io_addr,
			       msg->io_port[nodeid % msg->num_io_port],
			       NULL);
	} else {
		memset(&io_addr, 0, sizeof(slurm_addr_t));
	}

	srun = srun_info_create(msg->cred, &resp_addr, &io_addr,
				protocol_version);

	job->profile = msg->profile;
	job->task_prolog = xstrdup(msg->task_prolog);
	job->task_epilog = xstrdup(msg->task_epilog);

	job->argc = msg->argc;
	job->argv = _array_copy(job->argc, msg->argv);

	job->nnodes = msg->nnodes;
	job->nodeid = nodeid;
	job->debug = msg->slurmd_debug;
	job->cpus = msg->node_cpus;
	job->job_core_spec = msg->job_core_spec;

	/* This needs to happen before acct_gather_profile_startpoll
	   and only really looks at the profile in the job.
	*/
	acct_gather_profile_g_node_step_start(job);

	acct_gather_profile_startpoll(msg->acctg_freq,
				      conf->job_acct_gather_freq);

	job->timelimit = (time_t) -1;
	job->flags = msg->flags;
	job->switch_job = msg->switch_job;
	job->open_mode = msg->open_mode;
	job->options = msg->options;

	format_core_allocs(msg->cred, conf->node_name, conf->cpus,
			   &job->job_alloc_cores, &job->step_alloc_cores,
			   &job->job_mem, &job->step_mem);

	/* If users have configured MemLimitEnforce=no
	 * in their slurm.conf keep going.
	 */
	if (job->step_mem && conf->mem_limit_enforce) {
		jobacct_gather_set_mem_limit(job->jobid, job->stepid,
					     job->step_mem);
	} else if (job->job_mem && conf->mem_limit_enforce) {
		jobacct_gather_set_mem_limit(job->jobid, job->stepid,
					     job->job_mem);
	}

#ifdef HAVE_ALPS_CRAY
	/* This is only used for Cray emulation mode where slurmd is used to
	 * launch job steps. On a real Cray system, ALPS is used to launch
	 * the tasks instead of SLURM. SLURM's task launch RPC does NOT
	 * contain the reservation ID, so just use some non-zero value here
	 * for testing purposes. */
	job->resv_id = 1;
	select_g_select_jobinfo_set(msg->select_jobinfo,
				    SELECT_JOBDATA_RESV_ID, &job->resv_id);
#endif

	/* only need these values on the extern step, don't copy otherwise */
	if ((msg->job_step_id == SLURM_EXTERN_CONT) && msg->x11) {
		job->x11 = msg->x11;
		job->x11_magic_cookie = xstrdup(msg->x11_magic_cookie);
		job->x11_target_host = xstrdup(msg->x11_target_host);
		job->x11_target_port = msg->x11_target_port;
	}

	get_cred_gres(msg->cred, conf->node_name,
		      &job->job_gres_list, &job->step_gres_list);

	list_append(job->sruns, (void *) srun);

	_job_init_task_info(job, msg->global_task_ids,
			    msg->ifname, msg->ofname, msg->efname);

	return job;
}
extern int acct_gather_profile_startpoll(char *freq, char *freq_def) { int retval = SLURM_SUCCESS; pthread_attr_t attr; int i; uint32_t profile = ACCT_GATHER_PROFILE_NOT_SET; if (acct_gather_profile_init() < 0) return SLURM_ERROR; if (acct_gather_profile_running) { error("acct_gather_profile_startpoll: poll already started!"); return retval; } acct_gather_profile_running = true; (*(ops.get))(ACCT_GATHER_PROFILE_RUNNING, &profile); xassert(profile != ACCT_GATHER_PROFILE_NOT_SET); for (i=0; i < PROFILE_CNT; i++) { memset(&acct_gather_profile_timer[i], 0, sizeof(acct_gather_profile_timer_t)); slurm_cond_init(&acct_gather_profile_timer[i].notify, NULL); slurm_mutex_init(&acct_gather_profile_timer[i].notify_mutex); switch (i) { case PROFILE_ENERGY: if (!(profile & ACCT_GATHER_PROFILE_ENERGY)) break; _set_freq(i, freq, freq_def); acct_gather_energy_startpoll( acct_gather_profile_timer[i].freq); break; case PROFILE_TASK: /* Always set up the task (always first) to be done since it is used to control memory consumption and such. It will check profile inside it's plugin. */ _set_freq(i, freq, freq_def); jobacct_gather_startpoll( acct_gather_profile_timer[i].freq); break; case PROFILE_FILESYSTEM: if (!(profile & ACCT_GATHER_PROFILE_LUSTRE)) break; _set_freq(i, freq, freq_def); acct_gather_filesystem_startpoll( acct_gather_profile_timer[i].freq); break; case PROFILE_NETWORK: if (!(profile & ACCT_GATHER_PROFILE_NETWORK)) break; _set_freq(i, freq, freq_def); acct_gather_infiniband_startpoll( acct_gather_profile_timer[i].freq); break; default: fatal("Unhandled profile option %d please update " "slurm_acct_gather_profile.c " "(acct_gather_profile_startpoll)", i); } } /* create polling thread */ slurm_attr_init(&attr); if (pthread_create(&timer_thread_id, &attr, &_timer_thread, NULL)) { debug("acct_gather_profile_startpoll failed to create " "_timer_thread: %m"); } else debug3("acct_gather_profile_startpoll dynamic logging enabled"); slurm_attr_destroy(&attr); return retval; }