void task_state_print (task_state_t ts, log_f fn)
{
    bitstr_t *unseen;

    if (!ts)    /* Not built yet */
        return;

    unseen = bit_alloc (ts->n_tasks);
    if (bit_set_count (ts->start_failed)) {
        _do_log_msg (ts->start_failed, fn, "failed to start");
        bit_or (unseen, ts->start_failed);
    }
    if (bit_set_count (ts->running)) {
        _do_log_msg (ts->running, fn, "running");
        bit_or (unseen, ts->running);
    }
    if (bit_set_count (ts->abnormal_exit)) {
        _do_log_msg (ts->abnormal_exit, fn, "exited abnormally");
        bit_or (unseen, ts->abnormal_exit);
    }
    if (bit_set_count (ts->normal_exit)) {
        _do_log_msg (ts->normal_exit, fn, "exited");
        bit_or (unseen, ts->normal_exit);
    }
    bit_not (unseen);
    if (bit_set_count (unseen))
        _do_log_msg (unseen, fn, "unknown");
    FREE_NULL_BITMAP(unseen);
}
/*
 * Update the state of a specific task ID in a specific task_state structure
 */
extern void task_state_update(task_state_t ts, int task_id,
                              task_state_type_t t)
{
    xassert(ts != NULL);
    xassert(task_id >= 0);
    xassert(task_id < ts->n_tasks);

    if (ts->pack_group == NO_VAL) {
        debug3("%s: step=%u.%u task_id=%d, %s",
               __func__, ts->job_id, ts->step_id, task_id,
               _task_state_type_str(t));
    } else {
        debug3("%s: step=%u.%u pack_group=%u task_id=%d, %s",
               __func__, ts->job_id, ts->step_id, ts->pack_group,
               task_id, _task_state_type_str(t));
    }

    switch (t) {
    case TS_START_SUCCESS:
        bit_set(ts->running, task_id);
        ts->n_started++;
        break;
    case TS_START_FAILURE:
        bit_set(ts->start_failed, task_id);
        break;
    case TS_NORMAL_EXIT:
        bit_clear(ts->running, task_id);
        if (bit_test(ts->normal_exit, task_id) ||
            bit_test(ts->abnormal_exit, task_id)) {
            error("Task %d reported exit for a second time.", task_id);
        } else {
            bit_set(ts->normal_exit, task_id);
            ts->n_exited++;
        }
        break;
    case TS_ABNORMAL_EXIT:
        bit_clear(ts->running, task_id);
        if (bit_test(ts->normal_exit, task_id) ||
            bit_test(ts->abnormal_exit, task_id)) {
            error("Task %d reported exit for a second time.", task_id);
        } else {
            bit_set(ts->abnormal_exit, task_id);
            ts->n_exited++;
            ts->n_abnormal++;
        }
        break;
    }

    xassert((bit_set_count(ts->abnormal_exit) +
             bit_set_count(ts->normal_exit)) == ts->n_exited);
}
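/*
 * A minimal standalone sketch (not SLURM code) of the bookkeeping pattern
 * used above: one bit per task in per-state sets, counters kept in
 * lock-step, and duplicate exit reports rejected.  It uses a plain uint64_t
 * mask instead of SLURM's bitstr_t, so it only handles up to 64 tasks; all
 * names are illustrative.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct mini_task_state {
    int n_tasks, n_exited, n_abnormal;
    uint64_t running, normal_exit, abnormal_exit;
};

static int popcount64(uint64_t v)
{
    int n = 0;
    for (; v; v &= v - 1)
        n++;
    return n;
}

static void mini_task_exit(struct mini_task_state *ts, int id, int abnormal)
{
    uint64_t bit = 1ULL << id;

    ts->running &= ~bit;
    if ((ts->normal_exit | ts->abnormal_exit) & bit) {
        fprintf(stderr, "Task %d reported exit twice\n", id);
        return;
    }
    if (abnormal) {
        ts->abnormal_exit |= bit;
        ts->n_abnormal++;
    } else
        ts->normal_exit |= bit;
    ts->n_exited++;
    /* same invariant the real code asserts */
    assert(popcount64(ts->normal_exit) + popcount64(ts->abnormal_exit) ==
           ts->n_exited);
}

int main(void)
{
    struct mini_task_state ts = { .n_tasks = 4, .running = 0xf };

    mini_task_exit(&ts, 2, 0);
    mini_task_exit(&ts, 2, 1);    /* rejected as a duplicate report */
    printf("exited=%d abnormal=%d\n", ts.n_exited, ts.n_abnormal);
    return 0;
}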
/* Select the best set of resources for the given job * IN: job_ptr - pointer to the job requesting resources * IN/OUT: node_map - bitmap of available nodes / bitmap of selected nodes * IN: cr_node_cnt - total number of nodes in the cluster * IN/OUT: core_map - bitmap of available cores / bitmap of selected cores * IN: cr_type - resource type * IN: test_only - ignore allocated memory check * RET - array with number of CPUs available per node or NULL if not runnable */ static uint16_t *_select_nodes(struct job_record *job_ptr, bitstr_t *node_map, uint32_t cr_node_cnt, bitstr_t *core_map, struct node_use_record *node_usage, uint16_t cr_type, bool test_only) { int node_inx; uint16_t *cpu_cnt, *cpus = NULL; if (bit_set_count(node_map) == 0) return NULL; /* get resource usage for this job from first available node */ node_inx = _get_res_usage(job_ptr, node_map, core_map, cr_node_cnt, node_usage, cr_type, &cpu_cnt, test_only); /* if successful, sync up the core_map with the node_map, and * create a cpus array */ if (node_inx >= 0) { cpus = xmalloc(sizeof(uint16_t)); cpus[0] = cpu_cnt[node_inx]; if (node_inx != 0) { bit_nclear(core_map, 0, (cr_get_coremap_offset(node_inx))-1); } if (node_inx < (cr_node_cnt - 1)) { bit_nclear(core_map, (cr_get_coremap_offset(node_inx + 1)), (cr_get_coremap_offset(cr_node_cnt) - 1)); } } xfree(cpu_cnt); return cpus; }
/*
 * Test if job can fit into the given full-length core_bitmap
 * IN job_resrcs_ptr - resources allocated to a job
 * IN full_bitmap - bitmap of available CPUs
 * IN bits_per_node - bits per node in the full_bitmap
 * RET 1 on success, 0 otherwise
 */
extern int job_fits_into_cores(job_resources_t *job_resrcs_ptr,
                               bitstr_t *full_bitmap,
                               const uint16_t *bits_per_node)
{
    int full_node_inx = 0, full_bit_inx = 0, job_bit_inx = 0, i;
    int job_node_cnt;

    if (!full_bitmap)
        return 1;

    job_node_cnt = bit_set_count(job_resrcs_ptr->node_bitmap);
    for (full_node_inx = bit_ffs(job_resrcs_ptr->node_bitmap);
         job_node_cnt > 0; full_node_inx++) {
        if (bit_test(job_resrcs_ptr->node_bitmap, full_node_inx)) {
            full_bit_inx = cr_node_cores_offset[full_node_inx];
            for (i = 0; i < bits_per_node[full_node_inx]; i++) {
                if (!bit_test(full_bitmap, full_bit_inx + i))
                    continue;
                if (job_resrcs_ptr->whole_node ||
                    bit_test(job_resrcs_ptr->core_bitmap,
                             job_bit_inx + i)) {
                    return 0;
                }
            }
            job_bit_inx += bits_per_node[full_node_inx];
            job_node_cnt--;
        }
    }
    return 1;
}
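/*
 * Standalone sketch (not SLURM's API) of the index arithmetic above: cores
 * of all nodes are packed into one flat vector, a prefix-sum table maps a
 * node index to its first core position, and a job "fits" only if none of
 * its cores are already marked busy in the system-wide map.  Byte-per-core
 * arrays stand in for bitstr_t here and all names are made up.
 */
#include <stdint.h>
#include <stdio.h>

#define NODES 3

static const uint16_t bits_per_node[NODES] = { 4, 8, 4 };

static int core_offset(int node)    /* prefix sum of bits_per_node */
{
    int i, off = 0;
    for (i = 0; i < node; i++)
        off += bits_per_node[i];
    return off;
}

/* return 1 if no core the job wants is already busy on the given node */
static int fits_on_node(const uint8_t *busy, const uint8_t *job_cores,
                        int node)
{
    int i, off = core_offset(node);

    for (i = 0; i < bits_per_node[node]; i++) {
        if (busy[off + i] && job_cores[i])
            return 0;    /* contention on this core */
    }
    return 1;
}

int main(void)
{
    uint8_t busy[16] = { 0 };    /* 4 + 8 + 4 cores, all idle */
    uint8_t job_cores[8] = { 1, 1, 0, 0, 0, 0, 0, 0 };

    busy[core_offset(1) + 1] = 1;    /* core 1 of node 1 is in use */
    printf("fits on node 1: %d\n", fits_on_node(busy, job_cores, 1));
    return 0;
}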
/*
 * Remove job from full-length core_bitmap
 * IN job_resrcs_ptr - resources allocated to a job
 * IN/OUT full_core_bitmap - bitmap of available CPUs, allocated as needed
 * IN bits_per_node - bits per node in the full_core_bitmap
 */
extern void remove_job_from_cores(job_resources_t *job_resrcs_ptr,
                                  bitstr_t **full_core_bitmap,
                                  const uint16_t *bits_per_node)
{
    int full_node_inx = 0, job_node_cnt;
    int job_bit_inx = 0, full_bit_inx = 0, i;

    if (!job_resrcs_ptr->core_bitmap)
        return;

    /* allocate the full-system core bitmap if needed */
    if (*full_core_bitmap == NULL) {
        uint32_t size = 0;
        for (i = 0; i < node_record_count; i++)
            size += bits_per_node[i];
        *full_core_bitmap = bit_alloc(size);
    }

    job_node_cnt = bit_set_count(job_resrcs_ptr->node_bitmap);
    for (full_node_inx = bit_ffs(job_resrcs_ptr->node_bitmap);
         job_node_cnt > 0; full_node_inx++) {
        if (bit_test(job_resrcs_ptr->node_bitmap, full_node_inx)) {
            full_bit_inx = cr_node_cores_offset[full_node_inx];
            for (i = 0; i < bits_per_node[full_node_inx]; i++) {
                if (!job_resrcs_ptr->whole_node &&
                    !bit_test(job_resrcs_ptr->core_bitmap,
                              job_bit_inx + i))
                    continue;
                bit_clear(*full_core_bitmap, full_bit_inx + i);
            }
            job_bit_inx += bits_per_node[full_node_inx];
            job_node_cnt--;
        }
    }
}
static void _print_jobs(struct gs_part *p_ptr)
{
    int i;

    if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
        info("gang: part %s has %u jobs, %u shadows:",
             p_ptr->part_name, p_ptr->num_jobs, p_ptr->num_shadows);
        for (i = 0; i < p_ptr->num_shadows; i++) {
            info("gang: shadow job %u row_s %s, sig_s %s",
                 p_ptr->shadow[i]->job_ptr->job_id,
                 _print_flag(p_ptr->shadow[i]->row_state),
                 _print_flag(p_ptr->shadow[i]->sig_state));
        }
        for (i = 0; i < p_ptr->num_jobs; i++) {
            info("gang: job %u row_s %s, sig_s %s",
                 p_ptr->job_list[i]->job_ptr->job_id,
                 _print_flag(p_ptr->job_list[i]->row_state),
                 _print_flag(p_ptr->job_list[i]->sig_state));
        }
        if (p_ptr->active_resmap) {
            int s = bit_size(p_ptr->active_resmap);
            i = bit_set_count(p_ptr->active_resmap);
            info("gang: active resmap has %d of %d bits set", i, s);
        }
    }
}
/* Return 1 if job fits in this row, else return 0 */
static int _job_fits_in_active_row(struct job_record *job_ptr,
                                   struct gs_part *p_ptr)
{
    job_resources_t *job_res = job_ptr->job_resrcs;
    int count;
    bitstr_t *job_map;
    uint16_t job_gr_type;

    if ((p_ptr->active_resmap == NULL) || (p_ptr->jobs_active == 0))
        return 1;

    job_gr_type = _get_part_gr_type(job_ptr->part_ptr);
    if ((job_gr_type == GS_CPU2) || (job_gr_type == GS_CORE) ||
        (job_gr_type == GS_SOCKET)) {
        return job_fits_into_cores(job_res, p_ptr->active_resmap,
                                   gs_bits_per_node);
    }

    /* job_gr_type == GS_NODE || job_gr_type == GS_CPU */
    job_map = bit_copy(job_res->node_bitmap);
    bit_and(job_map, p_ptr->active_resmap);
    /* any set bits indicate contention for the same resource */
    count = bit_set_count(job_map);
    if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
        info("gang: _job_fits_in_active_row: %d bits conflict", count);
    FREE_NULL_BITMAP(job_map);
    if (count == 0)
        return 1;
    if (job_gr_type == GS_CPU) {
        /* For GS_CPU we check the CPU arrays */
        return _can_cpus_fit(job_ptr, p_ptr);
    }
    return 0;
}
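/*
 * The node-level test above is simply "AND the two bitmaps and count the
 * surviving bits".  A standalone illustration with plain 64-bit masks
 * (helper and variable names here are assumptions, not SLURM's bitstring
 * API):
 */
#include <stdint.h>
#include <stdio.h>

static int popcount64(uint64_t v)
{
    int n = 0;
    for (; v; v &= v - 1)
        n++;
    return n;
}

int main(void)
{
    uint64_t active_nodes = 0x0f;    /* nodes 0-3 already in this row */
    uint64_t job_nodes    = 0x1c;    /* job wants nodes 2-4 */
    int conflicts = popcount64(active_nodes & job_nodes);

    /* 2 conflicting nodes (2 and 3), so the job does not fit */
    printf("%d bits conflict\n", conflicts);
    return 0;
}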
/* Reset the node_bitmap in a job_resources data structure
 * This is needed after a restart/reconfiguration since nodes can
 * be added or removed from the system resulting in changes to the
 * bitmap size or bit positions */
extern int reset_node_bitmap(job_resources_t *job_resrcs_ptr, uint32_t job_id)
{
    int i;

    if (!job_resrcs_ptr)
        return SLURM_SUCCESS;

    if (job_resrcs_ptr->node_bitmap)
        FREE_NULL_BITMAP(job_resrcs_ptr->node_bitmap);

    if (job_resrcs_ptr->nodes &&
        (node_name2bitmap(job_resrcs_ptr->nodes, false,
                          &job_resrcs_ptr->node_bitmap))) {
        error("Invalid nodes (%s) for job_id %u",
              job_resrcs_ptr->nodes, job_id);
        return SLURM_ERROR;
    } else if (job_resrcs_ptr->nodes == NULL) {
        job_resrcs_ptr->node_bitmap = bit_alloc(node_record_count);
    }

    i = bit_set_count(job_resrcs_ptr->node_bitmap);
    if (job_resrcs_ptr->nhosts != i) {
        error("Invalid change in resource allocation node count for "
              "job %u, %u to %d", job_id, job_resrcs_ptr->nhosts, i);
        return SLURM_ERROR;
    }
    return SLURM_SUCCESS;
}
void task_state_update (task_state_t ts, int taskid, task_state_type_t t)
{
    xassert (ts != NULL);
    xassert (taskid >= 0);
    xassert (taskid < ts->n_tasks);

    debug3("task_state_update(taskid=%d, %s)",
           taskid, _task_state_type_str (t));

    switch (t) {
    case TS_START_SUCCESS:
        bit_set (ts->running, taskid);
        ts->n_started++;
        break;
    case TS_START_FAILURE:
        bit_set (ts->start_failed, taskid);
        break;
    case TS_NORMAL_EXIT:
        bit_clear (ts->running, taskid);
        if (bit_test(ts->normal_exit, taskid)) {
            error("Task %d reported exit for a second time.", taskid);
        } else {
            bit_set (ts->normal_exit, taskid);
            ts->n_exited++;
        }
        break;
    case TS_ABNORMAL_EXIT:
        bit_clear (ts->running, taskid);
        if (bit_test(ts->abnormal_exit, taskid)) {
            error("Task %d reported exit for a second time.", taskid);
        } else {
            bit_set (ts->abnormal_exit, taskid);
            ts->n_exited++;
            ts->n_abnormal++;
        }
        break;
    }

    xassert ((bit_set_count(ts->abnormal_exit) +
              bit_set_count(ts->normal_exit)) == ts->n_exited);
}
static void _do_log_msg(task_state_t ts, bitstr_t *b, log_f fn,
                        const char *msg)
{
    char buf[4096];
    char *s = bit_set_count (b) == 1 ? "" : "s";

    if (ts->pack_group == NO_VAL) {
        (*fn) ("step:%u.%u task%s %s: %s",
               ts->job_id, ts->step_id, s,
               bit_fmt(buf, sizeof(buf), b), msg);
    } else {
        (*fn) ("step:%u.%u pack_group:%u task%s %s: %s",
               ts->job_id, ts->step_id, ts->pack_group, s,
               bit_fmt(buf, sizeof(buf), b), msg);
    }
}
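/*
 * bit_fmt() above renders a task set as a compact range string such as
 * "0-3,7".  A rough standalone equivalent over a boolean array, for
 * illustration only (the real formatter lives in SLURM's bitstring code and
 * differs in detail):
 */
#include <stdio.h>
#include <string.h>

static void fmt_ranges(const int *set, int n, char *buf, size_t len)
{
    int i = 0;

    buf[0] = '\0';
    while (i < n) {
        if (!set[i]) {
            i++;
            continue;
        }
        int start = i;
        while (i + 1 < n && set[i + 1])
            i++;
        char tmp[32];
        if (start == i)
            snprintf(tmp, sizeof(tmp), "%s%d", buf[0] ? "," : "", start);
        else
            snprintf(tmp, sizeof(tmp), "%s%d-%d",
                     buf[0] ? "," : "", start, i);
        strncat(buf, tmp, len - strlen(buf) - 1);
        i++;
    }
}

int main(void)
{
    int tasks[8] = { 1, 1, 1, 1, 0, 0, 0, 1 };
    char buf[64];

    fmt_ranges(tasks, 8, buf, sizeof(buf));
    printf("tasks %s\n", buf);    /* prints: tasks 0-3,7 */
    return 0;
}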
/* Rebuild active_feature_list for given node index
 * IN node_inx - Node index, if -1 then copy alloc_feature_list into
 *               active_feature_list, if -2 then log state */
extern void build_active_feature_list2(int node_inx, char *active_features)
{
    node_feature_t *feature_ptr;
    ListIterator feature_iter;
    char *tmp_str, *token, *last = NULL;

    if (node_inx == -1) {
        _copy_feature_list();
        return;
    }

    if (node_inx == -2) {
#if _DEBUG
        feature_iter = list_iterator_create(active_feature_list);
        while ((feature_ptr = (node_feature_t *) list_next(feature_iter))) {
            info("ACTIVE FEATURE: NAME:%s CNT:%d", feature_ptr->name,
                 bit_set_count(feature_ptr->node_bitmap));
        }
        list_iterator_destroy(feature_iter);
#endif
        return;
    }

    if ((node_inx < 0) || (node_inx >= node_record_count)) {
        error("%s: Invalid node_inx:%d", __func__, node_inx);
        return;
    }

    /* Clear this node from the feature_list record,
     * then restore as needed */
    feature_iter = list_iterator_create(active_feature_list);
    while ((feature_ptr = (node_feature_t *) list_next(feature_iter))) {
        bit_clear(feature_ptr->node_bitmap, node_inx);
    }
    list_iterator_destroy(feature_iter);

    if (active_features) {
        tmp_str = xstrdup(active_features);
        token = strtok_r(tmp_str, ",", &last);
        while (token) {
            _add_config_feature_inx(active_feature_list, token, node_inx);
            token = strtok_r(NULL, ",", &last);
        }
        xfree(tmp_str);
    }
}
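/*
 * The active feature string is a plain comma-separated list; the strtok_r()
 * walk above is the whole parser.  A standalone version of just that loop,
 * with illustrative data and plain strdup/free instead of xstrdup/xfree:
 */
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    const char *active_features = "haswell,ib,bigmem";
    char *tmp_str = strdup(active_features);
    char *last = NULL;
    char *token = strtok_r(tmp_str, ",", &last);

    while (token) {
        /* the real code records node_inx in a per-feature node bitmap */
        printf("feature: %s\n", token);
        token = strtok_r(NULL, ",", &last);
    }
    free(tmp_str);
    return 0;
}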
static void _dump_resv_port_info(void)
{
#if _DEBUG
    int i;
    char *tmp_char;

    for (i = 0; i < port_resv_cnt; i++) {
        if (bit_set_count(port_resv_table[i]) == 0)
            continue;

        tmp_char = bitmap2node_name(port_resv_table[i]);
        info("Port %d: %s", (i + port_resv_min), tmp_char);
        xfree(tmp_char);
    }
#endif
}
static void _build_select_struct(struct job_record *job_ptr, bitstr_t *bitmap, uint32_t node_cnt) { int i; uint32_t total_cpus = 0; job_resources_t *job_resrcs_ptr; xassert(job_ptr); if (job_ptr->job_resrcs) { error("select_p_job_test: already have select_job"); free_job_resources(&job_ptr->job_resrcs); } job_ptr->job_resrcs = job_resrcs_ptr = create_job_resources(); job_resrcs_ptr->cpu_array_reps = xmalloc(sizeof(uint32_t)); job_resrcs_ptr->cpu_array_value = xmalloc(sizeof(uint16_t)); job_resrcs_ptr->cpus = xmalloc(sizeof(uint16_t) * node_cnt); job_resrcs_ptr->cpus_used = xmalloc(sizeof(uint16_t) * node_cnt); /* job_resrcs_ptr->nhosts = node_cnt; */ job_resrcs_ptr->nhosts = bit_set_count(bitmap); job_resrcs_ptr->ncpus = job_ptr->details->min_cpus; job_resrcs_ptr->node_bitmap = bit_copy(bitmap); job_resrcs_ptr->nodes = bitmap2node_name(bitmap); if (job_resrcs_ptr->node_bitmap == NULL) fatal("bit_copy malloc failure"); job_resrcs_ptr->cpu_array_cnt = 1; if (job_ptr->details->min_cpus < bg_conf->cpus_per_mp) job_resrcs_ptr->cpu_array_value[0] = job_ptr->details->min_cpus; else job_resrcs_ptr->cpu_array_value[0] = bg_conf->cpus_per_mp; job_resrcs_ptr->cpu_array_reps[0] = node_cnt; total_cpus = bg_conf->cpu_ratio * node_cnt; for (i=0; i<node_cnt; i++) job_resrcs_ptr->cpus[i] = bg_conf->cpu_ratio; if (job_resrcs_ptr->ncpus != total_cpus) { error("select_p_job_test: ncpus mismatch %u != %u", job_resrcs_ptr->ncpus, total_cpus); } }
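/*
 * job_resources keeps per-node CPU counts twice: a flat cpus[] array and a
 * run-length encoded pair (cpu_array_value[i] repeated cpu_array_reps[i]
 * times).  A standalone sketch of how the RLE form expands back into the
 * flat form; the data is hypothetical and not BlueGene specific.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint16_t cpu_array_value[] = { 16, 8 };
    uint32_t cpu_array_reps[]  = { 3, 1 };    /* 3 nodes of 16, 1 of 8 */
    int cpu_array_cnt = 2;
    uint16_t cpus[8];
    int i, n = 0;
    uint32_t j, total_cpus = 0;

    for (i = 0; i < cpu_array_cnt; i++) {
        for (j = 0; j < cpu_array_reps[i]; j++) {
            cpus[n++] = cpu_array_value[i];
            total_cpus += cpu_array_value[i];
        }
    }
    for (i = 0; i < n; i++)
        printf("node %d: %u cpus\n", i, (unsigned) cpus[i]);
    printf("total_cpus=%u\n", total_cpus);    /* 56 */
    return 0;
}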
int _print_job_job_id(job_info_t * job, int width, bool right, char* suffix) { if (job == NULL) { /* Print the Header instead */ _print_str("JOBID", width, right, true); } else if ((job->array_task_id != NO_VAL) && !params.array_flag && IS_JOB_PENDING(job) && job->node_inx) { uint32_t i, local_width = width, max_task_id = 0; char *id, *task_str; bitstr_t *task_bits; for (i = 1; i <= job->node_inx[0]; i++) max_task_id = MAX(max_task_id, job->node_inx[i]); task_bits = bit_alloc(max_task_id + 1); for (i = 1; i <= job->node_inx[0]; i++) bit_set(task_bits, job->node_inx[i]); if (local_width == 0) { local_width = bit_set_count(task_bits) * FORMAT_STRING_SIZE; } id = xmalloc(local_width); task_str = xmalloc(local_width); bit_fmt(task_str, local_width, task_bits); snprintf(id, local_width, "%u_[%s]", job->array_job_id, task_str); _print_str(id, width, right, true); bit_free(task_bits); xfree(id); xfree(task_str); } else if (job->array_task_id != NO_VAL) { char id[FORMAT_STRING_SIZE]; snprintf(id, FORMAT_STRING_SIZE, "%u_%u", job->array_job_id, job->array_task_id); _print_str(id, width, right, true); } else { char id[FORMAT_STRING_SIZE]; snprintf(id, FORMAT_STRING_SIZE, "%u", job->job_id); _print_str(id, width, right, true); } if (suffix) printf("%s", suffix); return SLURM_SUCCESS; }
/* Wait for all identified compute nodes to enter "on" state */
static void _wait_all_nodes_on(void)
{
    char *argv[10], *resp_msg;
    int i, nid_cnt = 0, status = 0;
    json_object *j;
    uint32_t *nid_array;
    time_t start_time = time(NULL);

    while ((difftime(time(NULL), start_time) < (30 * 60)) &&
           (bit_set_count(node_bitmap) > 0)) {
        sleep(20);
        argv[0] = "capmc";
        argv[1] = "node_status";
        argv[2] = NULL;
        resp_msg = _run_script(argv, &status);
        if (status != 0) {
            error("%s: capmc(%s): %d %s",
                  log_file, argv[1], status, resp_msg);
            xfree(resp_msg);
            break;
        }
        j = json_tokener_parse(resp_msg);
        if (j == NULL) {
            error("%s: json parser failed on %s", log_file, resp_msg);
            xfree(resp_msg);
            break;
        }
        xfree(resp_msg);
        nid_cnt = 0;
        nid_array = _json_parse_nids(j, "on", &nid_cnt);
        json_object_put(j);    /* Frees json memory */
        for (i = 0; i < nid_cnt; i++) {
            bit_clear(node_bitmap, nid_array[i]);
        }
        xfree(nid_array);
    }
}
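/*
 * The control flow above is "poll until the bitmap empties or a deadline
 * passes".  A tiny standalone sketch of that loop shape, with a counter
 * standing in for the node bitmap and no capmc/JSON dependencies; all names
 * and timings here are illustrative.
 */
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
    int nodes_still_off = 3;
    time_t start_time = time(NULL);

    while ((difftime(time(NULL), start_time) < (30 * 60)) &&
           (nodes_still_off > 0)) {
        sleep(1);              /* the real loop sleeps 20 seconds */
        nodes_still_off--;     /* pretend one node powered on */
        printf("still waiting on %d node(s)\n", nodes_still_off);
    }
    return 0;
}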
/* Attempt to schedule a specific job on specific available nodes * IN job_ptr - job to schedule * IN/OUT avail_bitmap - nodes available/selected to use * IN exc_core_bitmap - cores which can not be used * RET SLURM_SUCCESS on success, otherwise an error code */ static int _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, bitstr_t *exc_core_bitmap) { bitstr_t *tmp_bitmap; int rc = SLURM_SUCCESS; int feat_cnt = _num_feature_count(job_ptr); List preemptee_candidates = NULL; if (feat_cnt) { /* Ideally schedule the job feature by feature, * but I don't want to add that complexity here * right now, so clear the feature counts and try * to schedule. This will work if there is only * one feature count. It should work fairly well * in cases where there are multiple feature * counts. */ struct job_details *detail_ptr = job_ptr->details; ListIterator feat_iter; struct feature_record *feat_ptr; int i = 0, list_size; uint16_t *feat_cnt_orig = NULL, high_cnt = 0; /* Clear the feature counts */ list_size = list_count(detail_ptr->feature_list); feat_cnt_orig = xmalloc(sizeof(uint16_t) * list_size); feat_iter = list_iterator_create(detail_ptr->feature_list); while ((feat_ptr = (struct feature_record *) list_next(feat_iter))) { high_cnt = MAX(high_cnt, feat_ptr->count); feat_cnt_orig[i++] = feat_ptr->count; feat_ptr->count = 0; } list_iterator_destroy(feat_iter); if ((job_req_node_filter(job_ptr, *avail_bitmap) != SLURM_SUCCESS) || (bit_set_count(*avail_bitmap) < high_cnt)) { rc = ESLURM_NODES_BUSY; } else { preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); rc = select_g_job_test(job_ptr, *avail_bitmap, high_cnt, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, preemptee_candidates, NULL, exc_core_bitmap); } /* Restore the feature counts */ i = 0; feat_iter = list_iterator_create(detail_ptr->feature_list); while ((feat_ptr = (struct feature_record *) list_next(feat_iter))) { feat_ptr->count = feat_cnt_orig[i++]; } list_iterator_destroy(feat_iter); xfree(feat_cnt_orig); } else { /* Try to schedule the job. First on dedicated nodes * then on shared nodes (if so configured). */ uint16_t orig_shared; time_t now = time(NULL); char str[100]; preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); orig_shared = job_ptr->details->shared; job_ptr->details->shared = 0; tmp_bitmap = bit_copy(*avail_bitmap); if (exc_core_bitmap) { bit_fmt(str, (sizeof(str) - 1), exc_core_bitmap); debug2(" _try_sched with exclude core bitmap: %s",str); } rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, preemptee_candidates, NULL, exc_core_bitmap); job_ptr->details->shared = orig_shared; if (((rc != SLURM_SUCCESS) || (job_ptr->start_time > now)) && (orig_shared != 0)) { FREE_NULL_BITMAP(*avail_bitmap); *avail_bitmap= tmp_bitmap; rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, preemptee_candidates, NULL, exc_core_bitmap); } else FREE_NULL_BITMAP(tmp_bitmap); } if (preemptee_candidates) list_destroy(preemptee_candidates); return rc; }
/* * Attempt to start a job * jobid (IN) - job id * task_cnt (IN) - total count of tasks to start * hostlist (IN) - SLURM hostlist expression with no repeated hostnames * tasklist (IN/OUT) - comma separated list of hosts with tasks to be started, * list hostname once per task to start * comment_ptr (IN) - new comment field for the job or NULL for no change * err_code (OUT) - Moab error code * err_msg (OUT) - Moab error message */ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist, char *tasklist, char *comment_ptr, int *err_code, char **err_msg) { int rc = 0, old_task_cnt = 1; struct job_record *job_ptr; /* Write lock on job info, read lock on node info */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK }; char *new_node_list = NULL; static char tmp_msg[128]; bitstr_t *new_bitmap = (bitstr_t *) NULL; bitstr_t *save_req_bitmap = (bitstr_t *) NULL; bitoff_t i, bsize; int ll; /* layout info index */ char *node_name, *node_idx, *node_cur, *save_req_nodes = NULL; size_t node_name_len; static uint32_t cr_test = 0, cr_enabled = 0; if (cr_test == 0) { select_g_get_info_from_plugin(SELECT_CR_PLUGIN, NULL, &cr_enabled); cr_test = 1; } lock_slurmctld(job_write_lock); job_ptr = find_job_record(jobid); if (job_ptr == NULL) { *err_code = -700; *err_msg = "No such job"; error("wiki: Failed to find job %u", jobid); rc = -1; goto fini; } if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) { *err_code = -700; *err_msg = "Job not pending, can't start"; error("wiki: Attempt to start job %u in state %s", jobid, job_state_string(job_ptr->job_state)); rc = -1; goto fini; } if (comment_ptr) { char *reserved = strstr(comment_ptr, "RESERVED:"); if (reserved) { reserved += 9; job_ptr->details->reserved_resources = strtol(reserved, NULL, 10); } xfree(job_ptr->comment); job_ptr->comment = xstrdup(comment_ptr); } if (task_cnt) { new_node_list = xstrdup(hostlist); if (node_name2bitmap(new_node_list, false, &new_bitmap) != 0) { *err_code = -700; *err_msg = "Invalid TASKLIST"; error("wiki: Attempt to set invalid node list for " "job %u, %s", jobid, hostlist); xfree(new_node_list); rc = -1; goto fini; } if (!bit_super_set(new_bitmap, avail_node_bitmap)) { /* Selected node is UP and not responding * or it just went DOWN */ *err_code = -700; *err_msg = "TASKLIST includes non-responsive node"; error("wiki: Attempt to use non-responsive nodes for " "job %u, %s", jobid, hostlist); xfree(new_node_list); FREE_NULL_BITMAP(new_bitmap); rc = -1; goto fini; } /* User excluded node list incompatible with Wiki * Exclude all nodes not explicitly requested */ FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); job_ptr->details->exc_node_bitmap = bit_copy(new_bitmap); bit_not(job_ptr->details->exc_node_bitmap); } /* Build layout information from tasklist (assuming that Moab * sends a non-bracketed list of nodes, repeated as many times * as cpus should be used per node); at this point, node names * are comma-separated. This is _not_ a fast algorithm as it * performs many string compares. 
*/ xfree(job_ptr->details->req_node_layout); if (task_cnt && cr_enabled) { uint16_t cpus_per_task = MAX(1, job_ptr->details->cpus_per_task); job_ptr->details->req_node_layout = (uint16_t *) xmalloc(bit_set_count(new_bitmap) * sizeof(uint16_t)); bsize = bit_size(new_bitmap); for (i = 0, ll = -1; i < bsize; i++) { if (!bit_test(new_bitmap, i)) continue; ll++; node_name = node_record_table_ptr[i].name; node_name_len = strlen(node_name); if (node_name_len == 0) continue; node_cur = tasklist; while (*node_cur) { if ((node_idx = strstr(node_cur, node_name))) { if ((node_idx[node_name_len] == ',') || (node_idx[node_name_len] == '\0')) { job_ptr->details-> req_node_layout[ll] += cpus_per_task; } node_cur = strchr(node_idx, ','); if (node_cur) continue; } break; } } } /* save and update job state to start now */ save_req_nodes = job_ptr->details->req_nodes; job_ptr->details->req_nodes = new_node_list; save_req_bitmap = job_ptr->details->req_node_bitmap; job_ptr->details->req_node_bitmap = new_bitmap; old_task_cnt = job_ptr->details->min_cpus; job_ptr->details->min_cpus = MAX(task_cnt, old_task_cnt); job_ptr->priority = 100000000; fini: unlock_slurmctld(job_write_lock); if (rc) return rc; /* No errors so far */ (void) schedule(INFINITE); /* provides own locking */ /* Check to insure the job was actually started */ lock_slurmctld(job_write_lock); if (job_ptr->job_id != jobid) job_ptr = find_job_record(jobid); if (job_ptr && (job_ptr->job_id == jobid) && (!IS_JOB_RUNNING(job_ptr))) { uint16_t wait_reason = 0; char *wait_string; if (IS_JOB_FAILED(job_ptr)) wait_string = "Invalid request, job aborted"; else { wait_reason = job_ptr->state_reason; if (wait_reason == WAIT_HELD) { /* some job is completing, slurmctld did * not even try to schedule this job */ wait_reason = WAIT_RESOURCES; } wait_string = job_reason_string(wait_reason); job_ptr->state_reason = WAIT_HELD; xfree(job_ptr->state_desc); } *err_code = -910 - wait_reason; snprintf(tmp_msg, sizeof(tmp_msg), "Could not start job %u(%s): %s", jobid, new_node_list, wait_string); *err_msg = tmp_msg; error("wiki: %s", tmp_msg); /* restore some of job state */ job_ptr->priority = 0; job_ptr->details->min_cpus = old_task_cnt; rc = -1; } if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details) { /* Restore required node list in case job requeued */ xfree(job_ptr->details->req_nodes); job_ptr->details->req_nodes = save_req_nodes; FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); job_ptr->details->req_node_bitmap = save_req_bitmap; FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); xfree(job_ptr->details->req_node_layout); } else { error("wiki: start_job(%u) job missing", jobid); xfree(save_req_nodes); FREE_NULL_BITMAP(save_req_bitmap); } unlock_slurmctld(job_write_lock); schedule_node_save(); /* provides own locking */ schedule_job_save(); /* provides own locking */ return rc; }
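/*
 * The layout pass above counts how many times each selected hostname occurs
 * in the comma-separated tasklist, taking care that "tux1" does not match
 * inside "tux10".  A standalone sketch of just that matching loop
 * (illustrative helper, not the Wiki/Moab plugin itself):
 */
#include <stdio.h>
#include <string.h>

static int count_host(const char *tasklist, const char *node_name)
{
    size_t len = strlen(node_name);
    const char *cur = tasklist;
    int count = 0;

    while (*cur) {
        const char *hit = strstr(cur, node_name);
        if (!hit)
            break;
        /* only a match if followed by ',' or end of string */
        if ((hit[len] == ',') || (hit[len] == '\0'))
            count++;
        cur = strchr(hit, ',');
        if (!cur)
            break;
        cur++;    /* step past the comma */
    }
    return count;
}

int main(void)
{
    const char *tasklist = "tux1,tux1,tux10,tux2";

    printf("tux1 appears %d times\n", count_host(tasklist, "tux1"));
    return 0;
}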
extern int select_nodeinfo_set_all(void) { ListIterator itr = NULL; struct node_record *node_ptr = NULL; int i=0; bg_record_t *bg_record = NULL; static time_t last_set_all = 0; ba_mp_t *ba_mp; node_subgrp_t *subgrp = NULL; int bit_count; //uint32_t cluster_flags = slurmdb_setup_cluster_flags(); if (!blocks_are_created) return SLURM_NO_CHANGE_IN_DATA; if (!g_bitmap_size) { /* if (cluster_flags & CLUSTER_FLAG_BGQ) */ /* g_bitmap_size = bg_conf->mp_cnode_cnt; */ /* else */ g_bitmap_size = bg_conf->ionodes_per_mp; } /* only set this once when the last_bg_update is newer than the last time we set things up. */ if (last_set_all && (last_bg_update-1 < last_set_all)) { debug2("Node select info for set all hasn't " "changed since %ld", last_set_all); return SLURM_NO_CHANGE_IN_DATA; } last_set_all = last_bg_update; /* set this here so we know things have changed */ last_node_update = time(NULL); slurm_mutex_lock(&block_state_mutex); for (i=0; i<node_record_count; i++) { select_nodeinfo_t *nodeinfo; node_ptr = &(node_record_table_ptr[i]); xassert(node_ptr->select_nodeinfo); nodeinfo = node_ptr->select_nodeinfo->data; xassert(nodeinfo); xassert(nodeinfo->subgrp_list); list_flush(nodeinfo->subgrp_list); if (nodeinfo->bitmap_size != g_bitmap_size) nodeinfo->bitmap_size = g_bitmap_size; } itr = list_iterator_create(bg_lists->main); while ((bg_record = list_next(itr))) { enum node_states state = NODE_STATE_UNKNOWN; select_nodeinfo_t *nodeinfo; bitstr_t *bitmap; ListIterator itr2 = NULL; /* Only mark unidle blocks */ if (bg_record->job_list && list_count(bg_record->job_list)) { struct job_record *job_ptr; select_jobinfo_t *jobinfo; ListIterator itr = list_iterator_create(bg_record->job_list); ba_mp = list_peek(bg_record->ba_mp_list); node_ptr = &(node_record_table_ptr[ba_mp->index]); xassert(node_ptr->select_nodeinfo); nodeinfo = node_ptr->select_nodeinfo->data; xassert(nodeinfo); xassert(nodeinfo->subgrp_list); if (ba_mp->cnode_err_bitmap && (bit_count = bit_set_count(ba_mp->cnode_err_bitmap))) { subgrp = _find_subgrp(nodeinfo->subgrp_list, NODE_STATE_ERROR, g_bitmap_size); /* FIXME: the subgrp->bitmap isn't set here. */ subgrp->cnode_cnt += bit_count; } subgrp = _find_subgrp(nodeinfo->subgrp_list, NODE_STATE_ALLOCATED, g_bitmap_size); while ((job_ptr = list_next(itr))) { jobinfo = job_ptr->select_jobinfo->data; /* FIXME: the subgrp->bitmap isn't set here. 
*/ subgrp->cnode_cnt += jobinfo->cnode_cnt; } list_iterator_destroy(itr); continue; } else if (bg_record->job_running == NO_JOB_RUNNING) continue; if (bg_record->state & BG_BLOCK_ERROR_FLAG) state = NODE_STATE_ERROR; else if (bg_record->job_running > NO_JOB_RUNNING) { /* we don't need to set the allocated here * since the whole midplane is allocated */ if (bg_record->conn_type[0] < SELECT_SMALL) continue; state = NODE_STATE_ALLOCATED; } else { error("not sure why we got here with block %s %s", bg_record->bg_block_id, bg_block_state_string(bg_record->state)); continue; } /* if ((cluster_flags & CLUSTER_FLAG_BGQ) */ /* && (state != NODE_STATE_ERROR)) */ /* bitmap = bg_record->cnodes_used_bitmap; */ /* else */ bitmap = bg_record->ionode_bitmap; itr2 = list_iterator_create(bg_record->ba_mp_list); while ((ba_mp = list_next(itr2))) { if (!ba_mp->used) continue; node_ptr = &(node_record_table_ptr[ba_mp->index]); xassert(node_ptr->select_nodeinfo); nodeinfo = node_ptr->select_nodeinfo->data; xassert(nodeinfo); xassert(nodeinfo->subgrp_list); if (ba_mp->cnode_err_bitmap && (state == NODE_STATE_ALLOCATED) && (bit_count = bit_set_count(ba_mp->cnode_err_bitmap))) { subgrp = _find_subgrp(nodeinfo->subgrp_list, NODE_STATE_ERROR, g_bitmap_size); /* FIXME: the subgrp->bitmap isn't set here. */ subgrp->cnode_cnt += bit_count; } subgrp = _find_subgrp(nodeinfo->subgrp_list, state, g_bitmap_size); if (subgrp->cnode_cnt < bg_conf->mp_cnode_cnt) { /* if (cluster_flags & CLUSTER_FLAG_BGQ) { */ /* bit_or(subgrp->bitmap, bitmap); */ /* subgrp->cnode_cnt += */ /* bit_set_count(bitmap); */ /* } else */ if (bg_record->cnode_cnt < bg_conf->mp_cnode_cnt) { bit_or(subgrp->bitmap, bitmap); subgrp->cnode_cnt += bg_record->cnode_cnt; } else { bit_nset(subgrp->bitmap, 0, (g_bitmap_size-1)); subgrp->cnode_cnt = bg_conf->mp_cnode_cnt; } } } list_iterator_destroy(itr2); } list_iterator_destroy(itr); slurm_mutex_unlock(&block_state_mutex); return SLURM_SUCCESS; }
/* * _task_layout_lllp_block * * task_layout_lllp_block will create a block distribution at the * lowest level of logical processor which is either socket, core or * thread depending on the system architecture. The Block algorithm * is the same as the Block distribution performed in srun. * * Distribution at the lllp: * -m hostfile|plane|block|cyclic:block|cyclic * * The first distribution "hostfile|plane|block|cyclic" is computed * in srun. The second distribution "plane|block|cyclic" is computed * locally by each slurmd. * * The input to the lllp distribution algorithms is the gids (tasks * ids) generated for the local node. * * The output is a mapping of the gids onto logical processors * (thread/core/socket) with is expressed cpu_bind masks. * */ static int _task_layout_lllp_block(launch_tasks_request_msg_t *req, uint32_t node_id, bitstr_t ***masks_p) { int c, i, size, last_taskcount = -1, taskcount = 0; uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0; int max_tasks = req->tasks_to_launch[(int)node_id]; int max_cpus = max_tasks * req->cpus_per_task; bitstr_t *avail_map; bitstr_t **masks = NULL; info("_task_layout_lllp_block "); avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads); if (!avail_map) { return SLURM_ERROR; } size = bit_set_count(avail_map); if (size < max_tasks) { error("task/affinity: only %d bits in avail_map for %d tasks!", size, max_tasks); FREE_NULL_BITMAP(avail_map); return SLURM_ERROR; } if (size < max_cpus) { /* Possible result of overcommit */ i = size / max_tasks; info("task/affinity: reset cpus_per_task from %d to %d", req->cpus_per_task, i); req->cpus_per_task = i; } size = bit_size(avail_map); *masks_p = xmalloc(max_tasks * sizeof(bitstr_t*)); masks = *masks_p; /* block distribution with oversubsciption */ c = 0; while(taskcount < max_tasks) { if (taskcount == last_taskcount) { fatal("_task_layout_lllp_block infinite loop"); } last_taskcount = taskcount; /* the abstract map is already laid out in block order, * so just iterate over it */ for (i = 0; i < size; i++) { /* skip unavailable resources */ if (bit_test(avail_map, i) == 0) continue; if (!masks[taskcount]) masks[taskcount] = bit_alloc( conf->block_map_size); //info("setting %d %d", taskcount, i); bit_set(masks[taskcount], i); /* skip unrequested threads */ if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) i += hw_threads-1; if (++c < req->cpus_per_task) continue; c = 0; if (++taskcount >= max_tasks) break; } } /* last step: expand the masks to bind each task * to the requested resource */ _expand_masks(req->cpu_bind_type, max_tasks, masks, hw_sockets, hw_cores, hw_threads, avail_map); FREE_NULL_BITMAP(avail_map); return SLURM_SUCCESS; }
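/*
 * A standalone sketch of the block-distribution loop above: walk the
 * available CPU slots in order, give each task cpus_per_task consecutive
 * available slots, and wrap around (oversubscribe) if tasks remain when the
 * slots run out.  Plain arrays replace the bitstr_t masks and all names are
 * illustrative.
 */
#include <stdio.h>

#define NCPUS  8
#define NTASKS 3

int main(void)
{
    int avail[NCPUS] = { 1, 1, 0, 1, 1, 1, 0, 1 };    /* usable CPU slots */
    int cpus_per_task = 2;
    int task_cpu[NTASKS][NCPUS] = { { 0 } };    /* per-task CPU mask */
    int task = 0, c = 0, i, t;

    while (task < NTASKS) {
        for (i = 0; i < NCPUS && task < NTASKS; i++) {
            if (!avail[i])
                continue;    /* skip unavailable slots */
            task_cpu[task][i] = 1;
            if (++c < cpus_per_task)
                continue;
            c = 0;
            task++;          /* this task has enough CPUs, next task */
        }
    }
    for (t = 0; t < NTASKS; t++) {
        printf("task %d:", t);
        for (i = 0; i < NCPUS; i++)
            if (task_cpu[t][i])
                printf(" cpu%d", i);
        printf("\n");
    }
    return 0;
}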
static int _attempt_backfill(void) { DEF_TIMERS; bool filter_root = false; List job_queue; job_queue_rec_t *job_queue_rec; slurmdb_qos_rec_t *qos_ptr = NULL; int i, j, node_space_recs; struct job_record *job_ptr; struct part_record *part_ptr, **bf_part_ptr = NULL; uint32_t end_time, end_reserve; uint32_t time_limit, comp_time_limit, orig_time_limit, part_time_limit; uint32_t min_nodes, max_nodes, req_nodes; bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; bitstr_t *exc_core_bitmap = NULL, *non_cg_bitmap = NULL; time_t now, sched_start, later_start, start_res, resv_end, window_end; node_space_map_t *node_space; struct timeval bf_time1, bf_time2; int rc = 0; int job_test_count = 0; uint32_t *uid = NULL, nuser = 0, bf_parts = 0, *bf_part_jobs = NULL; uint16_t *njobs = NULL; bool already_counted; uint32_t reject_array_job_id = 0; struct part_record *reject_array_part = NULL; uint32_t job_start_cnt = 0, start_time; time_t config_update = slurmctld_conf.last_update; time_t part_update = last_part_update; struct timeval start_tv; bf_last_yields = 0; #ifdef HAVE_ALPS_CRAY /* * Run a Basil Inventory immediately before setting up the schedule * plan, to avoid race conditions caused by ALPS node state change. * Needs to be done with the node-state lock taken. */ START_TIMER; if (select_g_reconfigure()) { debug4("backfill: not scheduling due to ALPS"); return SLURM_SUCCESS; } END_TIMER; if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: ALPS inventory completed, %s", TIME_STR); /* The Basil inventory can take a long time to complete. Process * pending RPCs before starting the backfill scheduling logic */ _yield_locks(1000000); #endif START_TIMER; if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: beginning"); else debug("backfill: beginning"); sched_start = now = time(NULL); gettimeofday(&start_tv, NULL); if (slurm_get_root_filter()) filter_root = true; job_queue = build_job_queue(true, true); if (list_count(job_queue) == 0) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: no jobs to backfill"); else debug("backfill: no jobs to backfill"); list_destroy(job_queue); return 0; } gettimeofday(&bf_time1, NULL); non_cg_bitmap = bit_copy(cg_node_bitmap); bit_not(non_cg_bitmap); slurmctld_diag_stats.bf_queue_len = list_count(job_queue); slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats. 
bf_queue_len; slurmctld_diag_stats.bf_last_depth = 0; slurmctld_diag_stats.bf_last_depth_try = 0; slurmctld_diag_stats.bf_when_last_cycle = now; slurmctld_diag_stats.bf_active = 1; node_space = xmalloc(sizeof(node_space_map_t) * (max_backfill_job_cnt * 2 + 1)); node_space[0].begin_time = sched_start; window_end = sched_start + backfill_window; node_space[0].end_time = window_end; node_space[0].avail_bitmap = bit_copy(avail_node_bitmap); node_space[0].next = 0; node_space_recs = 1; if (debug_flags & DEBUG_FLAG_BACKFILL_MAP) _dump_node_space_table(node_space); if (max_backfill_job_per_part) { ListIterator part_iterator; struct part_record *part_ptr; bf_parts = list_count(part_list); bf_part_ptr = xmalloc(sizeof(struct part_record *) * bf_parts); bf_part_jobs = xmalloc(sizeof(int) * bf_parts); part_iterator = list_iterator_create(part_list); i = 0; while ((part_ptr = (struct part_record *) list_next(part_iterator))) { bf_part_ptr[i++] = part_ptr; } list_iterator_destroy(part_iterator); } if (max_backfill_job_per_user) { uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t)); njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t)); } sort_job_queue(job_queue); while (1) { job_queue_rec = (job_queue_rec_t *) list_pop(job_queue); if (!job_queue_rec) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: reached end of job queue"); break; } if (slurmctld_config.shutdown_time) break; if (((defer_rpc_cnt > 0) && (slurmctld_config.server_thread_count >= defer_rpc_cnt)) || (_delta_tv(&start_tv) >= sched_timeout)) { if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; info("backfill: completed yielding locks " "after testing %u(%d) jobs, %s", slurmctld_diag_stats.bf_last_depth, job_test_count, TIME_STR); } if ((_yield_locks(yield_sleep) && !backfill_continue) || (slurmctld_conf.last_update != config_update) || (last_part_update != part_update)) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: system state changed, " "breaking out after testing " "%u(%d) jobs", slurmctld_diag_stats.bf_last_depth, job_test_count); } rc = 1; xfree(job_queue_rec); break; } /* cg_node_bitmap may be changed */ bit_copybits(non_cg_bitmap, cg_node_bitmap); bit_not(non_cg_bitmap); /* Reset backfill scheduling timers, resume testing */ sched_start = time(NULL); gettimeofday(&start_tv, NULL); job_test_count = 0; START_TIMER; } job_ptr = job_queue_rec->job_ptr; /* With bf_continue configured, the original job could have * been cancelled and purged. Validate pointer here. 
*/ if ((job_ptr->magic != JOB_MAGIC) || (job_ptr->job_id != job_queue_rec->job_id)) { xfree(job_queue_rec); continue; } orig_time_limit = job_ptr->time_limit; part_ptr = job_queue_rec->part_ptr; job_test_count++; slurmctld_diag_stats.bf_last_depth++; already_counted = false; xfree(job_queue_rec); if (!IS_JOB_PENDING(job_ptr)) continue; /* started in other partition */ if (!avail_front_end(job_ptr)) continue; /* No available frontend for this job */ if (job_ptr->array_task_id != NO_VAL) { if ((reject_array_job_id == job_ptr->array_job_id) && (reject_array_part == part_ptr)) continue; /* already rejected array element */ /* assume reject whole array for now, clear if OK */ reject_array_job_id = job_ptr->array_job_id; reject_array_part = part_ptr; } job_ptr->part_ptr = part_ptr; if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill test for JobID=%u Prio=%u Partition=%s", job_ptr->job_id, job_ptr->priority, job_ptr->part_ptr->name); } if (max_backfill_job_per_part) { bool skip_job = false; for (j = 0; j < bf_parts; j++) { if (bf_part_ptr[j] != job_ptr->part_ptr) continue; if (bf_part_jobs[j]++ >= max_backfill_job_per_part) skip_job = true; break; } if (skip_job) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: have already " "checked %u jobs for " "partition %s; skipping " "job %u", max_backfill_job_per_part, job_ptr->part_ptr->name, job_ptr->job_id); continue; } } if (max_backfill_job_per_user) { for (j = 0; j < nuser; j++) { if (job_ptr->user_id == uid[j]) { njobs[j]++; if (debug_flags & DEBUG_FLAG_BACKFILL) debug("backfill: user %u: " "#jobs %u", uid[j], njobs[j]); break; } } if (j == nuser) { /* user not found */ static bool bf_max_user_msg = true; if (nuser < BF_MAX_USERS) { uid[j] = job_ptr->user_id; njobs[j] = 1; nuser++; } else if (bf_max_user_msg) { bf_max_user_msg = false; error("backfill: too many users in " "queue. Consider increasing " "BF_MAX_USERS"); } if (debug_flags & DEBUG_FLAG_BACKFILL) debug2("backfill: found new user %u. 
" "Total #users now %u", job_ptr->user_id, nuser); } else { if (njobs[j] >= max_backfill_job_per_user) { /* skip job */ if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: have already " "checked %u jobs for " "user %u; skipping " "job %u", max_backfill_job_per_user, job_ptr->user_id, job_ptr->job_id); continue; } } } if (((part_ptr->state_up & PARTITION_SCHED) == 0) || (part_ptr->node_bitmap == NULL) || ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: partition %s not usable", job_ptr->part_ptr->name); continue; } if ((!job_independent(job_ptr, 0)) || (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS)) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: job %u not runable now", job_ptr->job_id); continue; } /* Determine minimum and maximum node counts */ min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes); if (job_ptr->details->max_nodes == 0) max_nodes = part_ptr->max_nodes; else max_nodes = MIN(job_ptr->details->max_nodes, part_ptr->max_nodes); max_nodes = MIN(max_nodes, 500000); /* prevent overflows */ if (job_ptr->details->max_nodes) req_nodes = max_nodes; else req_nodes = min_nodes; if (min_nodes > max_nodes) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: job %u node count too high", job_ptr->job_id); continue; } /* Determine job's expected completion time */ if (part_ptr->max_time == INFINITE) part_time_limit = 365 * 24 * 60; /* one year */ else part_time_limit = part_ptr->max_time; if (job_ptr->time_limit == NO_VAL) { time_limit = part_time_limit; } else { if (part_ptr->max_time == INFINITE) time_limit = job_ptr->time_limit; else time_limit = MIN(job_ptr->time_limit, part_time_limit); } comp_time_limit = time_limit; qos_ptr = job_ptr->qos_ptr; if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE) && slurm_get_preempt_mode()) time_limit = job_ptr->time_limit = 1; else if (job_ptr->time_min && (job_ptr->time_min < time_limit)) time_limit = job_ptr->time_limit = job_ptr->time_min; /* Determine impact of any resource reservations */ later_start = now; TRY_LATER: if (slurmctld_config.shutdown_time) break; if (((defer_rpc_cnt > 0) && (slurmctld_config.server_thread_count >= defer_rpc_cnt)) || (_delta_tv(&start_tv) >= sched_timeout)) { uint32_t save_job_id = job_ptr->job_id; uint32_t save_time_limit = job_ptr->time_limit; job_ptr->time_limit = orig_time_limit; if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; info("backfill: completed yielding locks " "after testing %u(%d) jobs, %s", slurmctld_diag_stats.bf_last_depth, job_test_count, TIME_STR); } if ((_yield_locks(yield_sleep) && !backfill_continue) || (slurmctld_conf.last_update != config_update) || (last_part_update != part_update)) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: system state changed, " "breaking out after testing " "%u(%d) jobs", slurmctld_diag_stats.bf_last_depth, job_test_count); } rc = 1; break; } /* cg_node_bitmap may be changed */ bit_copybits(non_cg_bitmap, cg_node_bitmap); bit_not(non_cg_bitmap); /* With bf_continue configured, the original job could * have been scheduled or cancelled and purged. * Revalidate job the record here. 
*/ if ((job_ptr->magic != JOB_MAGIC) || (job_ptr->job_id != save_job_id)) continue; if (!IS_JOB_PENDING(job_ptr)) continue; if (!avail_front_end(job_ptr)) continue; /* No available frontend */ job_ptr->time_limit = save_time_limit; /* Reset backfill scheduling timers, resume testing */ sched_start = time(NULL); gettimeofday(&start_tv, NULL); job_test_count = 1; START_TIMER; } FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); start_res = later_start; later_start = 0; j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap, &exc_core_bitmap); if (j != SLURM_SUCCESS) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: job %u reservation defer", job_ptr->job_id); job_ptr->time_limit = orig_time_limit; continue; } if (start_res > now) end_time = (time_limit * 60) + start_res; else end_time = (time_limit * 60) + now; resv_end = find_resv_end(start_res); /* Identify usable nodes for this job */ bit_and(avail_bitmap, part_ptr->node_bitmap); bit_and(avail_bitmap, up_node_bitmap); bit_and(avail_bitmap, non_cg_bitmap); for (j=0; ; ) { if ((node_space[j].end_time > start_res) && node_space[j].next && (later_start == 0)) later_start = node_space[j].end_time; if (node_space[j].end_time <= start_res) ; else if (node_space[j].begin_time <= end_time) { bit_and(avail_bitmap, node_space[j].avail_bitmap); } else break; if ((j = node_space[j].next) == 0) break; } if (resv_end && (++resv_end < window_end) && ((later_start == 0) || (resv_end < later_start))) { later_start = resv_end; } if (job_ptr->details->exc_node_bitmap) { bit_not(job_ptr->details->exc_node_bitmap); bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap); bit_not(job_ptr->details->exc_node_bitmap); } /* Test if insufficient nodes remain OR * required nodes missing OR * nodes lack features OR * no change since previously tested nodes (only changes * in other partition nodes) */ if ((bit_set_count(avail_bitmap) < min_nodes) || ((job_ptr->details->req_node_bitmap) && (!bit_super_set(job_ptr->details->req_node_bitmap, avail_bitmap))) || (job_req_node_filter(job_ptr, avail_bitmap))) { if (later_start) { job_ptr->start_time = 0; goto TRY_LATER; } /* Job can not start until too far in the future */ job_ptr->time_limit = orig_time_limit; job_ptr->start_time = sched_start + backfill_window; continue; } /* Identify nodes which are definitely off limits */ FREE_NULL_BITMAP(resv_bitmap); resv_bitmap = bit_copy(avail_bitmap); bit_not(resv_bitmap); /* this is the time consuming operation */ debug2("backfill: entering _try_sched for job %u.", job_ptr->job_id); if (!already_counted) { slurmctld_diag_stats.bf_last_depth_try++; already_counted = true; } if (debug_flags & DEBUG_FLAG_BACKFILL_MAP) _dump_job_test(job_ptr, avail_bitmap, start_res); j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes, req_nodes, exc_core_bitmap); now = time(NULL); if (j != SLURM_SUCCESS) { job_ptr->time_limit = orig_time_limit; job_ptr->start_time = 0; continue; /* not runable */ } if (start_res > job_ptr->start_time) { job_ptr->start_time = start_res; last_job_update = now; } if (job_ptr->start_time <= now) { /* Can start now */ uint32_t save_time_limit = job_ptr->time_limit; uint32_t hard_limit; bool reset_time = false; int rc = _start_job(job_ptr, resv_bitmap); if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) { if (orig_time_limit == NO_VAL) { acct_policy_alter_job( job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; } else { acct_policy_alter_job( job_ptr, orig_time_limit); job_ptr->time_limit = orig_time_limit; } } else if 
((rc == SLURM_SUCCESS) && job_ptr->time_min) { /* Set time limit as high as possible */ acct_policy_alter_job(job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; reset_time = true; } else if (orig_time_limit == NO_VAL) { acct_policy_alter_job(job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; } else { acct_policy_alter_job(job_ptr, orig_time_limit); job_ptr->time_limit = orig_time_limit; } if (job_ptr->time_limit == INFINITE) hard_limit = 365 * 24 * 60; /* one year */ else hard_limit = job_ptr->time_limit; job_ptr->end_time = job_ptr->start_time + (hard_limit * 60); if (reset_time) { _reset_job_time_limit(job_ptr, now, node_space); time_limit = job_ptr->time_limit; } if (rc == ESLURM_ACCOUNTING_POLICY) { /* Unknown future start time, just skip job */ job_ptr->start_time = 0; continue; } else if (rc != SLURM_SUCCESS) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: planned start of job %u" " failed: %s", job_ptr->job_id, slurm_strerror(rc)); } /* Drop through and reserve these resources. * Likely due to state changes during sleep. * Make best-effort based upon original state */ job_ptr->time_limit = orig_time_limit; later_start = 0; } else { /* Started this job, move to next one */ reject_array_job_id = 0; reject_array_part = NULL; /* Update the database if job time limit * changed and move to next job */ if (save_time_limit != job_ptr->time_limit) jobacct_storage_g_job_start(acct_db_conn, job_ptr); job_start_cnt++; if (max_backfill_jobs_start && (job_start_cnt >= max_backfill_jobs_start)){ if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: bf_max_job_start" " limit of %d reached", max_backfill_jobs_start); } break; } continue; } } else { job_ptr->time_limit = orig_time_limit; } start_time = job_ptr->start_time; end_reserve = job_ptr->start_time + (time_limit * 60); start_time = (start_time / backfill_resolution) * backfill_resolution; end_reserve = (end_reserve / backfill_resolution) * backfill_resolution; if (later_start && (start_time > later_start)) { /* Try later when some nodes currently reserved for * pending jobs are free */ job_ptr->start_time = 0; goto TRY_LATER; } if (job_ptr->start_time > (sched_start + backfill_window)) { /* Starts too far in the future to worry about */ if (debug_flags & DEBUG_FLAG_BACKFILL) _dump_job_sched(job_ptr, end_reserve, avail_bitmap); continue; } if (node_space_recs >= max_backfill_job_cnt) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: table size limit of %u reached", max_backfill_job_cnt); } break; } if ((job_ptr->start_time > now) && _test_resv_overlap(node_space, avail_bitmap, start_time, end_reserve)) { /* This job overlaps with an existing reservation for * job to be backfill scheduled, which the sched * plugin does not know about. Try again later. 
*/ later_start = job_ptr->start_time; job_ptr->start_time = 0; goto TRY_LATER; } /* * Add reservation to scheduling table if appropriate */ if (debug_flags & DEBUG_FLAG_BACKFILL) _dump_job_sched(job_ptr, end_reserve, avail_bitmap); if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) continue; reject_array_job_id = 0; reject_array_part = NULL; if (debug_flags & DEBUG_FLAG_BACKFILL) _dump_job_sched(job_ptr, end_reserve, avail_bitmap); xfree(job_ptr->sched_nodes); job_ptr->sched_nodes = bitmap2node_name(avail_bitmap); bit_not(avail_bitmap); _add_reservation(start_time, end_reserve, avail_bitmap, node_space, &node_space_recs); if (debug_flags & DEBUG_FLAG_BACKFILL_MAP) _dump_node_space_table(node_space); } xfree(bf_part_jobs); xfree(bf_part_ptr); xfree(uid); xfree(njobs); FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); FREE_NULL_BITMAP(resv_bitmap); FREE_NULL_BITMAP(non_cg_bitmap); for (i=0; ; ) { FREE_NULL_BITMAP(node_space[i].avail_bitmap); if ((i = node_space[i].next) == 0) break; } xfree(node_space); list_destroy(job_queue); gettimeofday(&bf_time2, NULL); _do_diag_stats(&bf_time1, &bf_time2, yield_sleep); if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; info("backfill: completed testing %u(%d) jobs, %s", slurmctld_diag_stats.bf_last_depth, job_test_count, TIME_STR); } return rc; }
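/*
 * Sketch of the node_space bookkeeping the backfill loop relies on: an
 * array-linked list of time windows, each holding the set of nodes still
 * free in that window.  A job's candidate node set is ANDed with every
 * record that overlaps its [start, end) window.  uint32_t masks stand in
 * for bitstr_t and the record layout is simplified; this is not the
 * slurmctld structure itself.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct mini_node_space {
    time_t begin_time, end_time;
    uint32_t avail_bitmap;
    int next;    /* index of next record, 0 terminates the chain */
};

static uint32_t avail_in_window(const struct mini_node_space *ns,
                                time_t start, time_t end,
                                uint32_t candidates)
{
    int j = 0;

    while (1) {
        if ((ns[j].end_time > start) && (ns[j].begin_time <= end))
            candidates &= ns[j].avail_bitmap;
        if ((j = ns[j].next) == 0)
            break;
    }
    return candidates;
}

int main(void)
{
    /* two windows: nodes 0-3 free now, only nodes 2-3 free later */
    struct mini_node_space ns[2] = {
        { .begin_time = 0,    .end_time = 1000, .avail_bitmap = 0x0f,
          .next = 1 },
        { .begin_time = 1000, .end_time = 2000, .avail_bitmap = 0x0c,
          .next = 0 },
    };
    uint32_t can_use = avail_in_window(ns, 500, 1500, 0x0f);

    printf("usable node mask: 0x%x\n", can_use);    /* 0xc */
    return 0;
}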
/* * Try to find resources for a given job request * IN job_ptr - pointer to job record in slurmctld * IN/OUT bitmap - nodes available for assignment to job, clear those not to * be used * IN min_nodes, max_nodes - minimum and maximum number of nodes to allocate * to this job (considers slurm block limits) * IN mode - SELECT_MODE_RUN_NOW: try to schedule job now * SELECT_MODE_TEST_ONLY: test if job can ever run * SELECT_MODE_WILL_RUN: determine when and where job can run * IN preemptee_candidates - List of pointers to jobs which can be preempted. * IN/OUT preemptee_job_list - Pointer to list of job pointers. These are the * jobs to be preempted to initiate the pending job. Not set * if mode=SELECT_MODE_TEST_ONLY or input pointer is NULL. * RET - SLURM_SUCCESS if job runnable now, error code otherwise */ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, uint16_t mode, List preemptee_candidates, List *preemptee_job_list) { int rc = SLURM_SUCCESS; bg_record_t* bg_record = NULL; char buf[256]; uint16_t conn_type[SYSTEM_DIMENSIONS]; List block_list = NULL; int blocks_added = 0; time_t starttime = time(NULL); uint16_t local_mode = mode; int avail_cpus = num_unused_cpus; int dim = 0; for (dim=0; dim<SYSTEM_DIMENSIONS; dim++) conn_type[dim] = (uint16_t)NO_VAL; if (preemptee_candidates && preemptee_job_list && list_count(preemptee_candidates)) local_mode |= SELECT_MODE_PREEMPT_FLAG; else local_mode |= SELECT_MODE_CHECK_FULL; if (bg_conf->layout_mode == LAYOUT_DYNAMIC) slurm_mutex_lock(&create_dynamic_mutex); slurm_mutex_lock(&block_state_mutex); block_list = copy_bg_list(bg_lists->main); slurm_mutex_unlock(&block_state_mutex); get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_CONN_TYPE, &conn_type); if (conn_type[0] == SELECT_NAV) { if (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt) conn_type[0] = SELECT_SMALL; else if (min_nodes > 1) { for (dim=0; dim<SYSTEM_DIMENSIONS; dim++) conn_type[dim] = SELECT_TORUS; } else if (job_ptr->details->min_cpus < bg_conf->cpus_per_mp) conn_type[0] = SELECT_SMALL; else { for (dim=1; dim<SYSTEM_DIMENSIONS; dim++) conn_type[dim] = SELECT_NAV; } set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_CONN_TYPE, &conn_type); } if (slurm_block_bitmap && !bit_set_count(slurm_block_bitmap)) { error("no nodes given to place job %u.", job_ptr->job_id); if (bg_conf->layout_mode == LAYOUT_DYNAMIC) slurm_mutex_unlock(&create_dynamic_mutex); return SLURM_ERROR; } sprint_select_jobinfo(job_ptr->select_jobinfo->data, buf, sizeof(buf), SELECT_PRINT_MIXED); debug("bluegene:submit_job: %u mode=%d %s nodes=%u-%u-%u", job_ptr->job_id, local_mode, buf, min_nodes, req_nodes, max_nodes); #ifdef HAVE_BG_L_P # ifdef HAVE_BGL sprint_select_jobinfo(job_ptr->select_jobinfo->data, buf, sizeof(buf), SELECT_PRINT_BLRTS_IMAGE); debug3("BlrtsImage=%s", buf); # endif sprint_select_jobinfo(job_ptr->select_jobinfo->data, buf, sizeof(buf), SELECT_PRINT_LINUX_IMAGE); # ifdef HAVE_BGL debug3("LinuxImage=%s", buf); # else debug3("ComputNodeImage=%s", buf); # endif sprint_select_jobinfo(job_ptr->select_jobinfo->data, buf, sizeof(buf), SELECT_PRINT_RAMDISK_IMAGE); # ifdef HAVE_BGL debug3("RamDiskImage=%s", buf); # else debug3("RamDiskIoLoadImage=%s", buf); # endif #endif sprint_select_jobinfo(job_ptr->select_jobinfo->data, buf, sizeof(buf), SELECT_PRINT_MLOADER_IMAGE); debug3("MloaderImage=%s", buf); /* First look at the empty space, and then remove the preemptable jobs and try again. 
*/ list_sort(block_list, (ListCmpF)bg_record_sort_aval_inc); rc = _find_best_block_match(block_list, &blocks_added, job_ptr, slurm_block_bitmap, min_nodes, max_nodes, req_nodes, &bg_record, local_mode, avail_cpus); if (rc == SLURM_SUCCESS && SELECT_IS_PREEMPT_SET(local_mode)) { ListIterator itr; ListIterator job_itr; bg_record_t *found_record; struct job_record *preempt_job_ptr; if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) info("doing preemption"); local_mode |= SELECT_MODE_CHECK_FULL; job_itr = list_iterator_create(preemptee_candidates); itr = list_iterator_create(block_list); while ((preempt_job_ptr = list_next(job_itr))) { while ((found_record = list_next(itr))) { if (found_record->job_ptr == preempt_job_ptr) { /* info("removing job %u running on %s", */ /* preempt_job_ptr->job_id, */ /* found_record->bg_block_id); */ found_record->job_ptr = NULL; found_record->job_running = NO_JOB_RUNNING; avail_cpus += found_record->cpu_cnt; break; } } if (!found_record) { list_iterator_reset(itr); error("Job %u wasn't found running anywhere, " "can't preempt", preempt_job_ptr->job_id); continue; } else if (job_ptr->details->min_cpus > avail_cpus) continue; list_sort(block_list, (ListCmpF)bg_record_sort_aval_inc); if ((rc = _find_best_block_match( block_list, &blocks_added, job_ptr, slurm_block_bitmap, min_nodes, max_nodes, req_nodes, &bg_record, local_mode, avail_cpus)) == SLURM_SUCCESS) break; list_iterator_reset(itr); } list_iterator_destroy(itr); list_iterator_destroy(job_itr); } if (rc == SLURM_SUCCESS) { if (!bg_record) fatal("we got a success, but no block back"); /* Here we see if there is a job running since * some jobs take awhile to finish we need to * make sure the time of the end is in the * future. If it isn't (meaning it is in the * past or current time) we add 5 seconds to * it so we don't use the block immediately. */ if (bg_record->job_ptr && bg_record->job_ptr->end_time) { if (bg_record->job_ptr->end_time <= starttime) starttime += 5; else starttime = bg_record->job_ptr->end_time; } else if (bg_record->job_running == BLOCK_ERROR_STATE) starttime = INFINITE; /* make sure the job is eligible to run */ if (job_ptr->details->begin_time > starttime) starttime = job_ptr->details->begin_time; job_ptr->start_time = starttime; set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_NODES, bg_record->mp_str); set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_IONODES, bg_record->ionode_str); if (!bg_record->bg_block_id) { debug("%d can start unassigned job %u " "at %ld on %s", local_mode, job_ptr->job_id, starttime, bg_record->mp_str); set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_BLOCK_PTR, NULL); set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_NODE_CNT, &bg_record->cnode_cnt); } else { if ((bg_record->ionode_str) && (job_ptr->part_ptr->max_share <= 1)) error("Small block used in " "non-shared partition"); debug("%d(%d) can start job %u " "at %ld on %s(%s) %d", local_mode, mode, job_ptr->job_id, starttime, bg_record->bg_block_id, bg_record->mp_str, SELECT_IS_MODE_RUN_NOW(local_mode)); if (SELECT_IS_MODE_RUN_NOW(local_mode)) { /* Set this up to be the correct pointer since we probably are working off a copy. 
*/ if (bg_record->original) bg_record = bg_record->original; set_select_jobinfo( job_ptr->select_jobinfo->data, SELECT_JOBDATA_BLOCK_PTR, bg_record); if (job_ptr) { bg_record->job_running = job_ptr->job_id; bg_record->job_ptr = job_ptr; job_ptr->job_state |= JOB_CONFIGURING; last_bg_update = time(NULL); } } else { set_select_jobinfo( job_ptr->select_jobinfo->data, SELECT_JOBDATA_BLOCK_PTR, NULL); /* Just to make sure we don't end up using this on another job, or we have to wait until preemption is done. */ bg_record->job_ptr = NULL; bg_record->job_running = NO_JOB_RUNNING; } set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_NODE_CNT, &bg_record->cnode_cnt); } if (SELECT_IS_MODE_RUN_NOW(local_mode)) _build_select_struct(job_ptr, slurm_block_bitmap, bg_record->cnode_cnt); /* set up the preempted job list */ if (SELECT_IS_PREEMPT_SET(local_mode)) { if (*preemptee_job_list) list_destroy(*preemptee_job_list); *preemptee_job_list = _get_preemptables( local_mode, bg_record, preemptee_candidates); } if (!bg_record->bg_block_id) { /* This is a fake record so we need to * destroy it after we get the info from * it. If it was just testing then * we added this record to the * block_list. If this is the case * it will be handled if se sync the * lists. But we don't want to do * that so we will set blocks_added to * 0 so it doesn't happen. */ if (!blocks_added) { destroy_bg_record(bg_record); bg_record = NULL; } blocks_added = 0; } last_job_update = time(NULL); } if (bg_conf->layout_mode == LAYOUT_DYNAMIC) { slurm_mutex_lock(&block_state_mutex); if (blocks_added) _sync_block_lists(block_list, bg_lists->main); slurm_mutex_unlock(&block_state_mutex); slurm_mutex_unlock(&create_dynamic_mutex); } list_destroy(block_list); return rc; }
/* * route_p_split_hostlist - logic to split an input hostlist into * a set of hostlists to forward to. * * IN: hl - hostlist_t - list of every node to send message to * will be empty on return; * OUT: sp_hl - hostlist_t** - the array of hostlists that will be malloced * OUT: count - int* - the count of created hostlists * RET: SLURM_SUCCESS - int * * Note: created hostlist will have to be freed independently using * hostlist_destroy by the caller. * Note: the hostlist_t array will have to be xfree. */ extern int route_p_split_hostlist(hostlist_t hl, hostlist_t** sp_hl, int* count) { int i, j, k, hl_ndx, msg_count, sw_count, lst_count; char *buf; bitstr_t *nodes_bitmap = NULL; /* nodes in message list */ bitstr_t *fwd_bitmap = NULL; /* nodes in forward list */ msg_count = hostlist_count(hl); if (switch_record_cnt == 0) { /* configs have not already been processed */ slurm_conf_init(NULL); if (init_node_conf()) { fatal("ROUTE: Failed to init slurm config"); } if (build_all_nodeline_info(false)) { fatal("ROUTE: Failed to build node config"); } rehash_node(); if (slurm_topo_build_config() != SLURM_SUCCESS) { fatal("ROUTE: Failed to build topology config"); } } *sp_hl = (hostlist_t*) xmalloc(switch_record_cnt * sizeof(hostlist_t)); /* create bitmap of nodes to send message too */ if (hostlist2bitmap (hl, false, &nodes_bitmap) != SLURM_SUCCESS) { buf = hostlist_ranged_string_xmalloc(hl); fatal("ROUTE: Failed to make bitmap from hostlist=%s.", buf); } /* Find lowest level switch containing all the nodes in the list */ j = 0; for (i = 0; i <= switch_levels; i++) { for (j=0; j<switch_record_cnt; j++) { if (switch_record_table[j].level == i) { if (bit_super_set(nodes_bitmap, switch_record_table[j]. node_bitmap)) { /* All nodes in message list are in * this switch */ break; } } } if (j < switch_record_cnt) { /* Got here via break after bit_super_set */ break; // 'j' is our switch } /* else, no switches at this level reach all nodes */ } if (i > switch_levels) { /* This can only happen if trying to schedule multiple physical * clusters as a single logical cluster under the control of a * single slurmctld daemon, and sending something like a * node_registation request to all nodes. * Revert to default behavior*/ if (debug_flags & DEBUG_FLAG_ROUTE) { buf = hostlist_ranged_string_xmalloc(hl); debug("ROUTE: didn't find switch containing nodes=%s", buf); xfree(buf); } FREE_NULL_BITMAP(nodes_bitmap); xfree(*sp_hl); return route_split_hostlist_treewidth(hl, sp_hl, count); } if (switch_record_table[j].level == 0) { /* This is a leaf switch. Construct list based on TreeWidth */ FREE_NULL_BITMAP(nodes_bitmap); xfree(*sp_hl); return route_split_hostlist_treewidth(hl, sp_hl, count); } /* loop through children, construction a hostlist for each child switch * with nodes in the message list */ hl_ndx = 0; lst_count = 0; for (i=0; i < switch_record_table[j].num_switches; i++) { k = switch_record_table[j].switch_index[i]; fwd_bitmap = bit_copy(switch_record_table[k].node_bitmap); bit_and(fwd_bitmap, nodes_bitmap); sw_count = bit_set_count(fwd_bitmap); if (sw_count == 0) { continue; /* no nodes on this switch in message list */ } (*sp_hl)[hl_ndx] = bitmap2hostlist(fwd_bitmap); /* Now remove nodes from this switch from message list */ bit_not(fwd_bitmap); bit_and(nodes_bitmap, fwd_bitmap); FREE_NULL_BITMAP(fwd_bitmap); if (debug_flags & DEBUG_FLAG_ROUTE) { buf = hostlist_ranged_string_xmalloc((*sp_hl)[hl_ndx]); debug("ROUTE: ... 
sublist[%d] switch=%s :: %s", i, switch_record_table[i].name, buf); xfree(buf); } hl_ndx++; lst_count += sw_count; if (lst_count == msg_count) break; /* all nodes in message are in a child list */ } FREE_NULL_BITMAP(nodes_bitmap); *count = hl_ndx; return SLURM_SUCCESS; }
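/*
 * A minimal standalone sketch (not the SLURM route/topology API) of the
 * partitioning step above, reduced to 64-bit node masks.  Each child switch
 * that covers at least one destination node receives the intersection of its
 * node set with the remaining destinations, and those nodes are then removed
 * from the destination set.  Names and the 64-node limit are illustrative.
 */
#include <stdint.h>

/* Returns how many non-empty forward sets were written to fwd[]. */
static int sketch_split_by_children(uint64_t dest_nodes,
				    const uint64_t *child_nodes, int nchildren,
				    uint64_t *fwd)
{
	int i, count = 0;

	for (i = 0; (i < nchildren) && dest_nodes; i++) {
		uint64_t overlap = dest_nodes & child_nodes[i];

		if (!overlap)
			continue;	/* no destinations under this child */
		fwd[count++] = overlap;
		dest_nodes &= ~overlap;	/* drop the nodes just covered */
	}
	return count;
}
/*
 * Example: dest_nodes = 0x0f with child_nodes = {0x03, 0x0c, 0x30} yields
 * two forward sets, 0x03 and 0x0c, mirroring one hostlist per child switch.
 */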
/* * _task_layout_lllp_block * * task_layout_lllp_block will create a block distribution at the * lowest level of logical processor which is either socket, core or * thread depending on the system architecture. The Block algorithm * is the same as the Block distribution performed in srun. * * Distribution at the lllp: * -m hostfile|plane|block|cyclic:block|cyclic * * The first distribution "hostfile|plane|block|cyclic" is computed * in srun. The second distribution "plane|block|cyclic" is computed * locally by each slurmd. * * The input to the lllp distribution algorithms is the gids (tasks * ids) generated for the local node. * * The output is a mapping of the gids onto logical processors * (thread/core/socket) with is expressed cpu_bind masks. * */ static int _task_layout_lllp_block(launch_tasks_request_msg_t *req, uint32_t node_id, bitstr_t ***masks_p) { int c, i, size, last_taskcount = -1, taskcount = 0; uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0; int max_tasks = req->tasks_to_launch[(int)node_id]; int max_cpus = max_tasks * req->cpus_per_task; bitstr_t *avail_map; bitstr_t **masks = NULL; int core_inx, pu_per_core, *core_tasks = NULL; int sock_inx, pu_per_socket, *socket_tasks = NULL; info("_task_layout_lllp_block "); avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads); if (!avail_map) { return SLURM_ERROR; } size = bit_set_count(avail_map); if (size < max_tasks) { error("task/affinity: only %d bits in avail_map for %d tasks!", size, max_tasks); FREE_NULL_BITMAP(avail_map); return SLURM_ERROR; } if (size < max_cpus) { /* Possible result of overcommit */ i = size / max_tasks; info("task/affinity: reset cpus_per_task from %d to %d", req->cpus_per_task, i); req->cpus_per_task = i; } size = bit_size(avail_map); if ((req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) && (max_cpus > (hw_sockets * hw_cores))) { /* More CPUs requested than available cores, * disable core-level binding */ req->cpu_bind_type &= (~CPU_BIND_ONE_THREAD_PER_CORE); } *masks_p = xmalloc(max_tasks * sizeof(bitstr_t*)); masks = *masks_p; pu_per_core = hw_threads; core_tasks = xmalloc(sizeof(int) * hw_sockets * hw_cores); pu_per_socket = hw_cores * hw_threads; socket_tasks = xmalloc(sizeof(int) * hw_sockets); /* block distribution with oversubsciption */ c = 0; while (taskcount < max_tasks) { if (taskcount == last_taskcount) fatal("_task_layout_lllp_block infinite loop"); if (taskcount > 0) { /* Clear counters to over-subscribe, if necessary */ memset(core_tasks, 0, (sizeof(int) * hw_sockets * hw_cores)); memset(socket_tasks, 0, (sizeof(int) * hw_sockets)); } last_taskcount = taskcount; /* the abstract map is already laid out in block order, * so just iterate over it */ for (i = 0; i < size; i++) { /* skip unavailable resources */ if (bit_test(avail_map, i) == 0) continue; core_inx = i / pu_per_core; if ((req->ntasks_per_core != 0) && (core_tasks[core_inx] >= req->ntasks_per_core)) continue; sock_inx = i / pu_per_socket; if ((req->ntasks_per_socket != 0) && (socket_tasks[sock_inx] >= req->ntasks_per_socket)) continue; socket_tasks[sock_inx]++; if (!masks[taskcount]) masks[taskcount] = bit_alloc( conf->block_map_size); //info("setting %d %d", taskcount, i); bit_set(masks[taskcount], i); /* skip unrequested threads */ if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) i += hw_threads - 1; if (++c < req->cpus_per_task) continue; core_tasks[core_inx]++; /* Binding to cores, skip remaining of the threads */ if (!(req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) && ((req->cpu_bind_type & 
CPU_BIND_TO_CORES) || (req->ntasks_per_core == 1))) { int threads_not_used; if (req->cpus_per_task < hw_threads) threads_not_used = hw_threads - req->cpus_per_task; else threads_not_used = req->cpus_per_task % hw_threads; i += threads_not_used; } c = 0; if (++taskcount >= max_tasks) break; } } xfree(core_tasks); xfree(socket_tasks); /* last step: expand the masks to bind each task * to the requested resource */ _expand_masks(req->cpu_bind_type, max_tasks, masks, hw_sockets, hw_cores, hw_threads, avail_map); FREE_NULL_BITMAP(avail_map); return SLURM_SUCCESS; }
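/*
 * A minimal standalone sketch (hypothetical names, not the task/affinity
 * plugin) of the block layout above: CPUs are scanned in index order, each
 * task receives cpus_per_task available CPUs, a per-core task limit is
 * honored, and the scan wraps (clearing the per-core counters) when tasks
 * remain, which is how oversubscription happens.  Thread skipping and the
 * per-socket limit are omitted.  task_of_cpu[i] records the task placed on
 * CPU i (-1 = none); core_tasks[] must hold ncpus / threads_per_core ints.
 */
#include <string.h>

static int sketch_layout_block(const char *avail, int ncpus,
			       int threads_per_core, int ntasks,
			       int cpus_per_task, int ntasks_per_core,
			       int *task_of_cpu, int *core_tasks)
{
	int c = 0, taskcount = 0, last_taskcount = -1, i;

	memset(task_of_cpu, -1, ncpus * sizeof(int));
	while (taskcount < ntasks) {
		if (taskcount == last_taskcount)
			return -1;	/* no progress; avoid an infinite loop */
		last_taskcount = taskcount;
		/* clear per-core counters so a new pass may oversubscribe */
		memset(core_tasks, 0,
		       (ncpus / threads_per_core) * sizeof(int));
		for (i = 0; (i < ncpus) && (taskcount < ntasks); i++) {
			int core = i / threads_per_core;

			if (!avail[i])
				continue;	/* CPU not in the allocation */
			if (ntasks_per_core &&
			    (core_tasks[core] >= ntasks_per_core))
				continue;	/* core already at its limit */
			task_of_cpu[i] = taskcount;
			if (++c < cpus_per_task)
				continue;	/* task still needs CPUs */
			core_tasks[core]++;
			c = 0;
			taskcount++;
		}
	}
	return 0;
}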
/* * _task_layout_lllp_block * * task_layout_lllp_block will create a block distribution at the * lowest level of logical processor which is either socket, core or * thread depending on the system architecture. The Block algorithm * is the same as the Block distribution performed in srun. * * Distribution at the lllp: * -m hostfile|plane|block|cyclic:block|cyclic * * The first distribution "hostfile|plane|block|cyclic" is computed * in srun. The second distribution "plane|block|cyclic" is computed * locally by each slurmd. * * The input to the lllp distribution algorithms is the gids (tasks * ids) generated for the local node. * * The output is a mapping of the gids onto logical processors * (thread/core/socket) with is expressed cpu_bind masks. * */ static int _task_layout_lllp_block(launch_tasks_request_msg_t *req, uint32_t node_id, bitstr_t ***masks_p) { int c, i, j, t, size, last_taskcount = -1, taskcount = 0; uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0; int max_tasks = req->tasks_to_launch[(int)node_id]; int max_cpus = max_tasks * req->cpus_per_task; int *task_array; bitstr_t *avail_map; bitstr_t **masks = NULL; info("_task_layout_lllp_block "); avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads); if (!avail_map) { return SLURM_ERROR; } size = bit_set_count(avail_map); if (size < max_tasks) { error("task/affinity: only %d bits in avail_map for %d tasks!", size, max_tasks); FREE_NULL_BITMAP(avail_map); return SLURM_ERROR; } if (size < max_cpus) { /* Possible result of overcommit */ i = size / max_tasks; info("task/affinity: reset cpus_per_task from %d to %d", req->cpus_per_task, i); req->cpus_per_task = i; } size = bit_size(avail_map); *masks_p = xmalloc(max_tasks * sizeof(bitstr_t*)); masks = *masks_p; task_array = xmalloc(size * sizeof(int)); if (!task_array) { error("In lllp_block: task_array memory error"); FREE_NULL_BITMAP(avail_map); return SLURM_ERROR; } /* block distribution with oversubsciption */ c = 0; while(taskcount < max_tasks) { if (taskcount == last_taskcount) { fatal("_task_layout_lllp_block infinite loop"); } last_taskcount = taskcount; /* the abstract map is already laid out in block order, * so just iterate over it */ for (i = 0; i < size; i++) { /* skip unrequested threads */ if (i%hw_threads >= hw_threads) continue; /* skip unavailable resources */ if (bit_test(avail_map, i) == 0) continue; /* if multiple CPUs per task, only * count the task on the first CPU */ if (c == 0) task_array[i] += 1; if (++c < req->cpus_per_task) continue; c = 0; if (++taskcount >= max_tasks) break; } } /* Distribute the tasks and create per-task masks that only * contain the first CPU. 
Note that unused resources * (task_array[i] == 0) will get skipped */ taskcount = 0; for (i = 0; i < size; i++) { for (t = 0; t < task_array[i]; t++) { if (masks[taskcount] == NULL) masks[taskcount] = (bitstr_t *)bit_alloc(conf->block_map_size); bit_set(masks[taskcount++], i); } } /* now set additional CPUs for cpus_per_task > 1 */ for (t=0; t<max_tasks && req->cpus_per_task>1; t++) { if (!masks[t]) continue; c = 0; for (i = 0; i < size && c<req->cpus_per_task; i++) { if (bit_test(masks[t], i) == 0) continue; for (j=i+1,c=1; j<size && c<req->cpus_per_task;j++) { if (bit_test(avail_map, j) == 0) continue; bit_set(masks[t], j); c++; } if (c < req->cpus_per_task) { /* we haven't found all of the CPUs for this * task, so we'll wrap the search to cover the * whole node */ for (j=0; j<i && c<req->cpus_per_task; j++) { if (bit_test(avail_map, j) == 0) continue; bit_set(masks[t], j); c++; } } } } xfree(task_array); /* last step: expand the masks to bind each task * to the requested resource */ _expand_masks(req->cpu_bind_type, max_tasks, masks, hw_sockets, hw_cores, hw_threads, avail_map); FREE_NULL_BITMAP(avail_map); return SLURM_SUCCESS; }
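/*
 * A minimal standalone sketch (hypothetical names) of the second pass above,
 * which completes a task's CPU set when cpus_per_task > 1: extra CPUs are
 * taken from the next available indices after the task's first CPU, and the
 * search wraps back to the start of the node if it reaches the end first.
 */
static int sketch_fill_task_cpus(const char *avail, int ncpus, char *task_mask,
				 int first_cpu, int cpus_per_task)
{
	int have = 1, j;

	task_mask[first_cpu] = 1;
	for (j = first_cpu + 1; (j < ncpus) && (have < cpus_per_task); j++) {
		if (!avail[j])
			continue;
		task_mask[j] = 1;
		have++;
	}
	for (j = 0; (j < first_cpu) && (have < cpus_per_task); j++) {
		if (!avail[j])
			continue;	/* wrap to cover the whole node */
		task_mask[j] = 1;
		have++;
	}
	return have;	/* may be < cpus_per_task on a very small node */
}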
/* * _task_layout_lllp_cyclic * * task_layout_lllp_cyclic creates a cyclic distribution at the * lowest level of logical processor which is either socket, core or * thread depending on the system architecture. The Cyclic algorithm * is the same as the Cyclic distribution performed in srun. * * Distribution at the lllp: * -m hostfile|plane|block|cyclic:block|cyclic * * The first distribution "hostfile|plane|block|cyclic" is computed * in srun. The second distribution "plane|block|cyclic" is computed * locally by each slurmd. * * The input to the lllp distribution algorithms is the gids (tasks * ids) generated for the local node. * * The output is a mapping of the gids onto logical processors * (thread/core/socket) with is expressed cpu_bind masks. * */ static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req, uint32_t node_id, bitstr_t ***masks_p) { int last_taskcount = -1, taskcount = 0; uint16_t c, i, s, t, hw_sockets = 0, hw_cores = 0, hw_threads = 0; int size, max_tasks = req->tasks_to_launch[(int)node_id]; int max_cpus = max_tasks * req->cpus_per_task; int avail_size; bitstr_t *avail_map; bitstr_t **masks = NULL; info ("_task_layout_lllp_cyclic "); avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads); if (!avail_map) return SLURM_ERROR; avail_size = bit_size(avail_map); *masks_p = xmalloc(max_tasks * sizeof(bitstr_t*)); masks = *masks_p; size = bit_set_count(avail_map); if (size < max_tasks) { error("task/affinity: only %d bits in avail_map for %d tasks!", size, max_tasks); FREE_NULL_BITMAP(avail_map); return SLURM_ERROR; } if (size < max_cpus) { /* Possible result of overcommit */ i = size / max_tasks; info("task/affinity: reset cpus_per_task from %d to %d", req->cpus_per_task, i); req->cpus_per_task = i; } i = 0; while (taskcount < max_tasks) { if (taskcount == last_taskcount) fatal("_task_layout_lllp_cyclic failure"); last_taskcount = taskcount; for (t = 0; t < hw_threads; t++) { for (c = 0; c < hw_cores; c++) { for (s = 0; s < hw_sockets; s++) { uint16_t bit = s*(hw_cores*hw_threads) + c*(hw_threads) + t; /* In case hardware and config differ */ bit %= avail_size; if (bit_test(avail_map, bit) == 0) continue; if (masks[taskcount] == NULL) { masks[taskcount] = (bitstr_t *) bit_alloc(conf-> block_map_size); } bit_set(masks[taskcount], bit); if (++i < req->cpus_per_task) continue; i = 0; if (++taskcount >= max_tasks) break; } if (taskcount >= max_tasks) break; } if (taskcount >= max_tasks) break; } } /* last step: expand the masks to bind each task * to the requested resource */ _expand_masks(req->cpu_bind_type, max_tasks, masks, hw_sockets, hw_cores, hw_threads, avail_map); FREE_NULL_BITMAP(avail_map); return SLURM_SUCCESS; }
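/*
 * A minimal standalone sketch of the index arithmetic behind the cyclic
 * layout above: a processing unit is addressed as (socket, core, thread) and
 * mapped into the flat block-ordered bitmap; visiting sockets in the
 * innermost loop produces the cyclic, round-robin-across-sockets order.
 * The function names are illustrative only.
 */
#include <stdio.h>

static int sketch_pu_index(int socket, int core, int thread,
			   int cores_per_socket, int threads_per_core)
{
	return socket * (cores_per_socket * threads_per_core) +
	       core * threads_per_core + thread;
}

static void sketch_print_cyclic_order(int sockets, int cores, int threads)
{
	int t, c, s;

	for (t = 0; t < threads; t++)
		for (c = 0; c < cores; c++)
			for (s = 0; s < sockets; s++)
				printf("%d ", sketch_pu_index(s, c, t,
							      cores, threads));
	printf("\n");
}
/* With 2 sockets x 2 cores x 1 thread this prints "0 2 1 3". */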
/* cr_job_test - does most of the real work for select_p_job_test(), which * includes contiguous selection, load-leveling and max_share logic * * PROCEDURE: * * Step 1: compare nodes in "avail" bitmap with current node state data * to find available nodes that match the job request * * Step 2: check resources in "avail" bitmap with allocated resources from * higher priority partitions (busy resources are UNavailable) * * Step 3: select resource usage on remaining resources in "avail" bitmap * for this job, with the placement influenced by existing * allocations */ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, int mode, uint16_t cr_type, enum node_cr_state job_node_req, uint32_t cr_node_cnt, struct part_res_record *cr_part_ptr, struct node_use_record *node_usage) { static int gang_mode = -1; int error_code = SLURM_SUCCESS; bitstr_t *orig_map, *avail_cores, *free_cores; bitstr_t *tmpcore = NULL; bool test_only; uint32_t c, i, j, k, n, csize, save_mem = 0; job_resources_t *job_res; struct job_details *details_ptr; struct part_res_record *p_ptr, *jp_ptr; uint16_t *cpu_count; if (gang_mode == -1) { if (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) gang_mode = 1; else gang_mode = 0; } details_ptr = job_ptr->details; free_job_resources(&job_ptr->job_resrcs); if (mode == SELECT_MODE_TEST_ONLY) test_only = true; else /* SELECT_MODE_RUN_NOW || SELECT_MODE_WILL_RUN */ test_only = false; /* check node_state and update the node bitmap as necessary */ if (!test_only) { error_code = _verify_node_state(cr_part_ptr, job_ptr, bitmap, cr_type, node_usage, job_node_req); if (error_code != SLURM_SUCCESS) return error_code; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: evaluating job %u on %u nodes", job_ptr->job_id, bit_set_count(bitmap)); } orig_map = bit_copy(bitmap); avail_cores = _make_core_bitmap(bitmap); /* test to make sure that this job can succeed with all avail_cores * if 'no' then return FAIL * if 'yes' then we will seek the optimal placement for this job * within avail_cores */ free_cores = bit_copy(avail_cores); cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (cpu_count == NULL) { /* job cannot fit */ FREE_NULL_BITMAP(orig_map); FREE_NULL_BITMAP(free_cores); FREE_NULL_BITMAP(avail_cores); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 0 fail: " "insufficient resources"); } return SLURM_ERROR; } else if (test_only) { FREE_NULL_BITMAP(orig_map); FREE_NULL_BITMAP(free_cores); FREE_NULL_BITMAP(avail_cores); xfree(cpu_count); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("select/serial: cr_job_test: test 0 pass: " "test_only"); return SLURM_SUCCESS; } if (cr_type == CR_MEMORY) { /* CR_MEMORY does not care about existing CPU allocations, * so we can jump right to job allocation from here */ goto alloc_job; } xfree(cpu_count); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 0 pass - " "job fits on given resources"); } /* now that we know that this job can run with the given resources, * let's factor in the existing allocations and seek the optimal set * of resources for this job. Here is the procedure: * * Step 1: Seek idle CPUs across all partitions. If successful then * place job and exit. If not successful, then continue. Two * related items to note: * 1. Jobs that don't share CPUs finish with step 1. * 2. The remaining steps assume sharing or preemption. 
* * Step 2: Remove resources that are in use by higher-priority * partitions, and test that job can still succeed. If not * then exit. * * Step 3: Seek idle nodes among the partitions with the same * priority as the job's partition. If successful then * goto Step 6. If not then continue: * * Step 4: Seek placement within the job's partition. Search * row-by-row. If no placement is found, then exit. If a row * is found, then continue: * * Step 5: Place job and exit. FIXME! Here is where we need a * placement algorithm that recognizes existing job * boundaries and tries to "overlap jobs" as efficiently * as possible. * * Step 6: Place job and exit. FIXME! Here is where we use a placement * algorithm similar to Step 5 on jobs from lower-priority * partitions. */ /*** Step 1 ***/ bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); /* remove all existing allocations from free_cores */ tmpcore = bit_copy(free_cores); for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) { if (!p_ptr->row) continue; for (i = 0; i < p_ptr->num_rows; i++) { if (!p_ptr->row[i].row_bitmap) continue; bit_copybits(tmpcore, p_ptr->row[i].row_bitmap); bit_not(tmpcore); /* set bits now "free" resources */ bit_and(free_cores, tmpcore); } } cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (cpu_count) { /* job fits! We're done. */ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 1 pass - " "idle resources found"); } goto alloc_job; } if ((gang_mode == 0) && (job_node_req == NODE_CR_ONE_ROW)) { /* This job CANNOT share CPUs regardless of priority, * so we fail here. Note that Shared=EXCLUSIVE was already * addressed in _verify_node_state() and job preemption * removes jobs from simulated resource allocation map * before this point. 
*/ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 1 fail - " "no idle resources available"); } goto alloc_job; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 1 fail - " "not enough idle resources"); } /*** Step 2 ***/ bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); for (jp_ptr = cr_part_ptr; jp_ptr; jp_ptr = jp_ptr->next) { if (jp_ptr->part_ptr == job_ptr->part_ptr) break; } if (!jp_ptr) { fatal("select/serial: could not find partition for job %u", job_ptr->job_id); return SLURM_ERROR; /* Fix CLANG false positive */ } /* remove existing allocations (jobs) from higher-priority partitions * from avail_cores */ for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) { if ((p_ptr->part_ptr->priority <= jp_ptr->part_ptr->priority) && (p_ptr->part_ptr->preempt_mode != PREEMPT_MODE_OFF)) continue; if (!p_ptr->row) continue; for (i = 0; i < p_ptr->num_rows; i++) { if (!p_ptr->row[i].row_bitmap) continue; bit_copybits(tmpcore, p_ptr->row[i].row_bitmap); bit_not(tmpcore); /* set bits now "free" resources */ bit_and(free_cores, tmpcore); } } /* make these changes permanent */ bit_copybits(avail_cores, free_cores); cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (!cpu_count) { /* job needs resources that are currently in use by * higher-priority jobs, so fail for now */ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 2 fail - " "resources busy with higher priority jobs"); } goto alloc_job; } xfree(cpu_count); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 2 pass - " "available resources for this priority"); } /*** Step 3 ***/ bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); /* remove existing allocations (jobs) from same-priority partitions * from avail_cores */ for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) { if (p_ptr->part_ptr->priority != jp_ptr->part_ptr->priority) continue; if (!p_ptr->row) continue; for (i = 0; i < p_ptr->num_rows; i++) { if (!p_ptr->row[i].row_bitmap) continue; bit_copybits(tmpcore, p_ptr->row[i].row_bitmap); bit_not(tmpcore); /* set bits now "free" resources */ bit_and(free_cores, tmpcore); } } cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (cpu_count) { /* jobs from low-priority partitions are the only thing left * in our way. for now we'll ignore them, but FIXME: we need * a good placement algorithm here that optimizes "job overlap" * between this job (in these idle nodes) and the low-priority * jobs */ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 3 pass - " "found resources"); } goto alloc_job; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 3 fail - " "not enough idle resources in same priority"); } /*** Step 4 ***/ /* try to fit the job into an existing row * * tmpcore = worker core_bitmap * free_cores = core_bitmap to be built * avail_cores = static core_bitmap of all available cores */ if (!jp_ptr || !jp_ptr->row) { /* there's no existing jobs in this partition, so place * the job in avail_cores. 
FIXME: still need a good * placement algorithm here that optimizes "job overlap" * between this job (in these idle nodes) and existing * jobs in the other partitions with <= priority to * this partition */ bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 4 pass - " "first row found"); } goto alloc_job; } cr_sort_part_rows(jp_ptr); c = jp_ptr->num_rows; if (job_node_req != NODE_CR_AVAILABLE) c = 1; for (i = 0; i < c; i++) { if (!jp_ptr->row[i].row_bitmap) break; bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); bit_copybits(tmpcore, jp_ptr->row[i].row_bitmap); bit_not(tmpcore); bit_and(free_cores, tmpcore); cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (cpu_count) { if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: " "test 4 pass - row %i", i); } break; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: " "test 4 fail - row %i", i); } } if ((i < c) && !jp_ptr->row[i].row_bitmap) { /* we've found an empty row, so use it */ bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: " "test 4 trying empty row %i",i); } cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); } if (!cpu_count) { /* job can't fit into any row, so exit */ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 4 fail - " "busy partition"); } goto alloc_job; } /*** CONSTRUCTION ZONE FOR STEPs 5 AND 6 *** * Note that while the job may have fit into a row, it should * still be run through a good placement algorithm here that * optimizes "job overlap" between this job (in these idle nodes) * and existing jobs in the other partitions with <= priority to * this partition */ alloc_job: /* at this point we've found a good set of * bits to allocate to this job: * - bitmap is the set of nodes to allocate * - free_cores is the set of allocated cores * - cpu_count is the number of cpus per allocated node * * Next steps are to cleanup the worker variables, * create the job_resources struct, * distribute the job on the bits, and exit */ FREE_NULL_BITMAP(orig_map); FREE_NULL_BITMAP(avail_cores); FREE_NULL_BITMAP(tmpcore); if (!cpu_count) { /* we were sent here to cleanup and exit */ FREE_NULL_BITMAP(free_cores); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: exiting cr_job_test with no " "allocation"); } return SLURM_ERROR; } /* At this point we have: * - a bitmap of selected nodes * - a free_cores bitmap of usable cores on each selected node * - a per-alloc-node cpu_count array */ if ((mode != SELECT_MODE_WILL_RUN) && (job_ptr->part_ptr == NULL)) error_code = EINVAL; if ((error_code == SLURM_SUCCESS) && (mode == SELECT_MODE_WILL_RUN)) job_ptr->total_cpus = 1; if ((error_code != SLURM_SUCCESS) || (mode != SELECT_MODE_RUN_NOW)) { FREE_NULL_BITMAP(free_cores); xfree(cpu_count); return error_code; } n = bit_ffs(bitmap); if (n < 0) { FREE_NULL_BITMAP(free_cores); xfree(cpu_count); return error_code; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: distributing job %u", job_ptr->job_id); } /** create the struct_job_res **/ job_res = 
create_job_resources(); job_res->node_bitmap = bit_copy(bitmap); job_res->nodes = bitmap2node_name(bitmap); job_res->nhosts = bit_set_count(bitmap); job_res->ncpus = job_res->nhosts; if (job_ptr->details->ntasks_per_node) job_res->ncpus *= details_ptr->ntasks_per_node; job_res->ncpus = MAX(job_res->ncpus, details_ptr->min_cpus); job_res->ncpus = MAX(job_res->ncpus, details_ptr->pn_min_cpus); job_res->node_req = job_node_req; job_res->cpus = cpu_count; job_res->cpus_used = xmalloc(job_res->nhosts * sizeof(uint16_t)); job_res->memory_allocated = xmalloc(job_res->nhosts * sizeof(uint32_t)); job_res->memory_used = xmalloc(job_res->nhosts * sizeof(uint32_t)); /* store the hardware data for the selected nodes */ error_code = build_job_resources(job_res, node_record_table_ptr, select_fast_schedule); if (error_code != SLURM_SUCCESS) { free_job_resources(&job_res); FREE_NULL_BITMAP(free_cores); return error_code; } c = 0; csize = bit_size(job_res->core_bitmap); j = cr_get_coremap_offset(n); k = cr_get_coremap_offset(n + 1); for (; j < k; j++, c++) { if (!bit_test(free_cores, j)) continue; if (c >= csize) { error("select/serial: cr_job_test " "core_bitmap index error on node %s", select_node_record[n].node_ptr->name); drain_nodes(select_node_record[n].node_ptr->name, "Bad core count", getuid()); free_job_resources(&job_res); FREE_NULL_BITMAP(free_cores); return SLURM_ERROR; } bit_set(job_res->core_bitmap, c); break; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: job %u ncpus %u cbits %u/%d " "nbits %u", job_ptr->job_id, job_res->ncpus, bit_set_count(free_cores), 1, job_res->nhosts); } FREE_NULL_BITMAP(free_cores); /* distribute the tasks and clear any unused cores */ job_ptr->job_resrcs = job_res; error_code = cr_dist(job_ptr, cr_type); if (error_code != SLURM_SUCCESS) { free_job_resources(&job_ptr->job_resrcs); return error_code; } /* translate job_res->cpus array into format with rep count */ job_ptr->total_cpus = build_job_resources_cpu_array(job_res); if (!(cr_type & CR_MEMORY)) return error_code; /* load memory allocated array */ save_mem = details_ptr->pn_min_memory; if (save_mem & MEM_PER_CPU) { /* memory is per-cpu */ save_mem &= (~MEM_PER_CPU); job_res->memory_allocated[0] = job_res->cpus[0] * save_mem; } else { /* memory is per-node */ job_res->memory_allocated[0] = save_mem; } return error_code; }
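/*
 * A minimal standalone sketch of the memory rule at the end of
 * cr_job_test(): SLURM stores per-CPU memory requests by setting a flag bit
 * in pn_min_memory; otherwise the value is a per-node amount.  The flag
 * value and function name used here are illustrative, not the real
 * MEM_PER_CPU definition.
 */
#include <stdint.h>

#define SKETCH_MEM_PER_CPU 0x80000000u	/* hypothetical flag bit */

static uint32_t sketch_node_mem(uint32_t pn_min_memory, uint16_t cpus_on_node)
{
	if (pn_min_memory & SKETCH_MEM_PER_CPU) {
		uint32_t per_cpu = pn_min_memory & ~SKETCH_MEM_PER_CPU;

		return per_cpu * cpus_on_node;	/* memory is per allocated CPU */
	}
	return pn_min_memory;			/* memory is per node */
}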
static int _attempt_backfill(void) { DEF_TIMERS; bool filter_root = false; List job_queue; job_queue_rec_t *job_queue_rec; slurmdb_qos_rec_t *qos_ptr = NULL; int i, j, node_space_recs; struct job_record *job_ptr; struct part_record *part_ptr; uint32_t end_time, end_reserve; uint32_t time_limit, comp_time_limit, orig_time_limit; uint32_t min_nodes, max_nodes, req_nodes; bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; bitstr_t *exc_core_bitmap = NULL; time_t now, sched_start, later_start, start_res, resv_end; node_space_map_t *node_space; struct timeval bf_time1, bf_time2; int sched_timeout = 2, yield_sleep = 1; int rc = 0; int job_test_count = 0; uint32_t *uid = NULL, nuser = 0; uint16_t *njobs = NULL; bool already_counted; uint32_t reject_array_job_id = 0; #ifdef HAVE_CRAY /* * Run a Basil Inventory immediately before setting up the schedule * plan, to avoid race conditions caused by ALPS node state change. * Needs to be done with the node-state lock taken. */ START_TIMER; if (select_g_reconfigure()) { debug4("backfill: not scheduling due to ALPS"); return SLURM_SUCCESS; } END_TIMER; if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: ALPS inventory completed, %s", TIME_STR); /* The Basil inventory can take a long time to complete. Process * pending RPCs before starting the backfill scheduling logic */ _yield_locks(1); #endif START_TIMER; if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: beginning"); sched_start = now = time(NULL); if (slurm_get_root_filter()) filter_root = true; job_queue = build_job_queue(true); if (list_count(job_queue) == 0) { debug("backfill: no jobs to backfill"); list_destroy(job_queue); return 0; } gettimeofday(&bf_time1, NULL); slurmctld_diag_stats.bf_queue_len = list_count(job_queue); slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats. 
bf_queue_len; slurmctld_diag_stats.bf_last_depth = 0; slurmctld_diag_stats.bf_last_depth_try = 0; slurmctld_diag_stats.bf_when_last_cycle = now; bf_last_yields = 0; slurmctld_diag_stats.bf_active = 1; node_space = xmalloc(sizeof(node_space_map_t) * (max_backfill_job_cnt + 3)); node_space[0].begin_time = sched_start; node_space[0].end_time = sched_start + backfill_window; node_space[0].avail_bitmap = bit_copy(avail_node_bitmap); node_space[0].next = 0; node_space_recs = 1; if (debug_flags & DEBUG_FLAG_BACKFILL) _dump_node_space_table(node_space); if (max_backfill_job_per_user) { uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t)); njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t)); } while ((job_queue_rec = (job_queue_rec_t *) list_pop_bottom(job_queue, sort_job_queue2))) { job_ptr = job_queue_rec->job_ptr; orig_time_limit = job_ptr->time_limit; if ((time(NULL) - sched_start) >= sched_timeout) { uint32_t save_time_limit = job_ptr->time_limit; job_ptr->time_limit = orig_time_limit; if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; info("backfill: completed yielding locks " "after testing %d jobs, %s", job_test_count, TIME_STR); } if (_yield_locks(yield_sleep) && !backfill_continue) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: system state changed, " "breaking out after testing %d " "jobs", job_test_count); } rc = 1; break; } job_ptr->time_limit = save_time_limit; /* Reset backfill scheduling timers, resume testing */ sched_start = time(NULL); job_test_count = 0; START_TIMER; } part_ptr = job_queue_rec->part_ptr; job_test_count++; xfree(job_queue_rec); if (!IS_JOB_PENDING(job_ptr)) continue; /* started in other partition */ if (!avail_front_end(job_ptr)) continue; /* No available frontend for this job */ if (job_ptr->array_task_id != (uint16_t) NO_VAL) { if (reject_array_job_id == job_ptr->array_job_id) continue; /* already rejected array element */ /* assume reject whole array for now, clear if OK */ reject_array_job_id = job_ptr->array_job_id; } job_ptr->part_ptr = part_ptr; if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill test for job %u", job_ptr->job_id); slurmctld_diag_stats.bf_last_depth++; already_counted = false; if (max_backfill_job_per_user) { for (j = 0; j < nuser; j++) { if (job_ptr->user_id == uid[j]) { njobs[j]++; if (debug_flags & DEBUG_FLAG_BACKFILL) debug("backfill: user %u: " "#jobs %u", uid[j], njobs[j]); break; } } if (j == nuser) { /* user not found */ if (nuser < BF_MAX_USERS) { uid[j] = job_ptr->user_id; njobs[j] = 1; nuser++; } else { error("backfill: too many users in " "queue. Consider increasing " "BF_MAX_USERS"); } if (debug_flags & DEBUG_FLAG_BACKFILL) debug2("backfill: found new user %u. 
" "Total #users now %u", job_ptr->user_id, nuser); } else { if (njobs[j] > max_backfill_job_per_user) { /* skip job */ if (debug_flags & DEBUG_FLAG_BACKFILL) debug("backfill: have already " "checked %u jobs for " "user %u; skipping " "job %u", max_backfill_job_per_user, job_ptr->user_id, job_ptr->job_id); continue; } } } if (((part_ptr->state_up & PARTITION_SCHED) == 0) || (part_ptr->node_bitmap == NULL)) continue; if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root) continue; if ((!job_independent(job_ptr, 0)) || (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS)) continue; /* Determine minimum and maximum node counts */ min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes); if (job_ptr->details->max_nodes == 0) max_nodes = part_ptr->max_nodes; else max_nodes = MIN(job_ptr->details->max_nodes, part_ptr->max_nodes); max_nodes = MIN(max_nodes, 500000); /* prevent overflows */ if (job_ptr->details->max_nodes) req_nodes = max_nodes; else req_nodes = min_nodes; if (min_nodes > max_nodes) { /* job's min_nodes exceeds partition's max_nodes */ continue; } /* Determine job's expected completion time */ if (job_ptr->time_limit == NO_VAL) { if (part_ptr->max_time == INFINITE) time_limit = 365 * 24 * 60; /* one year */ else time_limit = part_ptr->max_time; } else { if (part_ptr->max_time == INFINITE) time_limit = job_ptr->time_limit; else time_limit = MIN(job_ptr->time_limit, part_ptr->max_time); } comp_time_limit = time_limit; qos_ptr = job_ptr->qos_ptr; if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE) && slurm_get_preempt_mode()) time_limit = job_ptr->time_limit = 1; else if (job_ptr->time_min && (job_ptr->time_min < time_limit)) time_limit = job_ptr->time_limit = job_ptr->time_min; /* Determine impact of any resource reservations */ later_start = now; TRY_LATER: if ((time(NULL) - sched_start) >= sched_timeout) { uint32_t save_time_limit = job_ptr->time_limit; job_ptr->time_limit = orig_time_limit; if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; info("backfill: completed yielding locks 2" "after testing %d jobs, %s", job_test_count, TIME_STR); } if (_yield_locks(yield_sleep) && !backfill_continue) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: system state changed, " "breaking out after testing %d " "jobs", job_test_count); } rc = 1; break; } job_ptr->time_limit = save_time_limit; /* Reset backfill scheduling timers, resume testing */ sched_start = time(NULL); job_test_count = 1; START_TIMER; } FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); start_res = later_start; later_start = 0; j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap, &exc_core_bitmap); if (j != SLURM_SUCCESS) { job_ptr->time_limit = orig_time_limit; continue; } if (start_res > now) end_time = (time_limit * 60) + start_res; else end_time = (time_limit * 60) + now; resv_end = find_resv_end(start_res); /* Identify usable nodes for this job */ bit_and(avail_bitmap, part_ptr->node_bitmap); bit_and(avail_bitmap, up_node_bitmap); for (j=0; ; ) { if ((node_space[j].end_time > start_res) && node_space[j].next && (later_start == 0)) later_start = node_space[j].end_time; if (node_space[j].end_time <= start_res) ; else if (node_space[j].begin_time <= end_time) { bit_and(avail_bitmap, node_space[j].avail_bitmap); } else break; if ((j = node_space[j].next) == 0) break; } if ((resv_end++) && ((later_start == 0) || (resv_end < later_start))) { later_start = resv_end; } if (job_ptr->details->exc_node_bitmap) { bit_not(job_ptr->details->exc_node_bitmap); bit_and(avail_bitmap, 
job_ptr->details->exc_node_bitmap); bit_not(job_ptr->details->exc_node_bitmap); } /* Test if insufficient nodes remain OR * required nodes missing OR * nodes lack features */ if ((bit_set_count(avail_bitmap) < min_nodes) || ((job_ptr->details->req_node_bitmap) && (!bit_super_set(job_ptr->details->req_node_bitmap, avail_bitmap))) || (job_req_node_filter(job_ptr, avail_bitmap))) { if (later_start) { job_ptr->start_time = 0; goto TRY_LATER; } /* Job can not start until too far in the future */ job_ptr->time_limit = orig_time_limit; job_ptr->start_time = sched_start + backfill_window; continue; } /* Identify nodes which are definitely off limits */ FREE_NULL_BITMAP(resv_bitmap); resv_bitmap = bit_copy(avail_bitmap); bit_not(resv_bitmap); /* this is the time consuming operation */ debug2("backfill: entering _try_sched for job %u.", job_ptr->job_id); if (!already_counted) { slurmctld_diag_stats.bf_last_depth_try++; already_counted = true; } j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes, req_nodes, exc_core_bitmap); now = time(NULL); if (j != SLURM_SUCCESS) { job_ptr->time_limit = orig_time_limit; job_ptr->start_time = 0; continue; /* not runable */ } if (start_res > job_ptr->start_time) { job_ptr->start_time = start_res; last_job_update = now; } if (job_ptr->start_time <= now) { uint32_t save_time_limit = job_ptr->time_limit; int rc = _start_job(job_ptr, resv_bitmap); if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) { if (orig_time_limit == NO_VAL) job_ptr->time_limit = comp_time_limit; else job_ptr->time_limit = orig_time_limit; job_ptr->end_time = job_ptr->start_time + (job_ptr->time_limit * 60); } else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) { /* Set time limit as high as possible */ job_ptr->time_limit = comp_time_limit; job_ptr->end_time = job_ptr->start_time + (comp_time_limit * 60); _reset_job_time_limit(job_ptr, now, node_space); time_limit = job_ptr->time_limit; } else { job_ptr->time_limit = orig_time_limit; } if (rc == ESLURM_ACCOUNTING_POLICY) { /* Unknown future start time, just skip job */ job_ptr->start_time = 0; continue; } else if (rc != SLURM_SUCCESS) { /* Planned to start job, but something bad * happended. */ job_ptr->start_time = 0; break; } else { /* Started this job, move to next one */ reject_array_job_id = 0; /* Update the database if job time limit * changed and move to next job */ if (save_time_limit != job_ptr->time_limit) jobacct_storage_g_job_start(acct_db_conn, job_ptr); continue; } } else job_ptr->time_limit = orig_time_limit; if (later_start && (job_ptr->start_time > later_start)) { /* Try later when some nodes currently reserved for * pending jobs are free */ job_ptr->start_time = 0; goto TRY_LATER; } if (job_ptr->start_time > (sched_start + backfill_window)) { /* Starts too far in the future to worry about */ continue; } if (node_space_recs >= max_backfill_job_cnt) { /* Already have too many jobs to deal with */ break; } end_reserve = job_ptr->start_time + (time_limit * 60); if (_test_resv_overlap(node_space, avail_bitmap, job_ptr->start_time, end_reserve)) { /* This job overlaps with an existing reservation for * job to be backfill scheduled, which the sched * plugin does not know about. Try again later. 
*/ later_start = job_ptr->start_time; job_ptr->start_time = 0; goto TRY_LATER; } /* * Add reservation to scheduling table if appropriate */ if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) continue; reject_array_job_id = 0; bit_not(avail_bitmap); _add_reservation(job_ptr->start_time, end_reserve, avail_bitmap, node_space, &node_space_recs); if (debug_flags & DEBUG_FLAG_BACKFILL) _dump_node_space_table(node_space); } xfree(uid); xfree(njobs); FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); FREE_NULL_BITMAP(resv_bitmap); for (i=0; ; ) { FREE_NULL_BITMAP(node_space[i].avail_bitmap); if ((i = node_space[i].next) == 0) break; } xfree(node_space); list_destroy(job_queue); gettimeofday(&bf_time2, NULL); _do_diag_stats(&bf_time1, &bf_time2, yield_sleep); if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; info("backfill: completed testing %d jobs, %s", job_test_count, TIME_STR); } return rc; }
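/*
 * A minimal standalone sketch (hypothetical types, not the backfill plugin)
 * of the node_space lookup used above: the scheduler keeps a set of time
 * slices, each recording which nodes are still free during that slice; to
 * see what a job could use over [start, end), intersect its candidate node
 * set with every overlapping slice.  Inserting new slices when a future
 * start is reserved (as _add_reservation does) is omitted here.
 */
#include <stdint.h>
#include <time.h>

struct sketch_slice {
	time_t begin_time;
	time_t end_time;
	uint64_t avail_nodes;	/* stand-in for the per-slice node bitmap */
};

static uint64_t sketch_avail_in_window(const struct sketch_slice *map,
				       int nrecs, uint64_t candidates,
				       time_t start, time_t end)
{
	int i;

	for (i = 0; i < nrecs; i++) {
		if (map[i].end_time <= start)
			continue;	/* slice is entirely before the job */
		if (map[i].begin_time >= end)
			continue;	/* slice is entirely after the job */
		candidates &= map[i].avail_nodes;
	}
	return candidates;
}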
static void _do_log_msg (bitstr_t *b, log_f fn, const char *msg) { char buf [65536]; char *s = bit_set_count (b) == 1 ? "" : "s"; (*fn) ("task%s %s: %s", s, bit_fmt (buf, sizeof(buf), b), msg); }
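/*
 * _do_log_msg() emits "task 5: running" for one task and
 * "tasks 0-3,7: running" for several, relying on bit_fmt()'s range output.
 * A tiny standalone illustration of the same pluralization, using a plain
 * string in place of the bitstring (names here are hypothetical):
 */
#include <stdio.h>

static void sketch_log_task_msg(const char *id_list, int ntasks,
				const char *msg)
{
	printf("task%s %s: %s\n", (ntasks == 1) ? "" : "s", id_list, msg);
}
/* sketch_log_task_msg("0-3,7", 5, "running") prints "tasks 0-3,7: running" */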
/* * _task_layout_lllp_cyclic * * task_layout_lllp_cyclic creates a cyclic distribution at the * lowest level of logical processor which is either socket, core or * thread depending on the system architecture. The Cyclic algorithm * is the same as the Cyclic distribution performed in srun. * * Distribution at the lllp: * -m hostfile|block|cyclic:block|cyclic * * The first distribution "hostfile|block|cyclic" is computed * in srun. The second distribution "block|cyclic" is computed * locally by each slurmd. * * The input to the lllp distribution algorithms is the gids (tasks * ids) generated for the local node. * * The output is a mapping of the gids onto logical processors * (thread/core/socket) with is expressed cpu_bind masks. * * If a task asks for more than one CPU per task, put the tasks as * close as possible (fill core rather than going next socket for the * extra task) * */ static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req, uint32_t node_id, bitstr_t ***masks_p) { int last_taskcount = -1, taskcount = 0; uint16_t i, s, hw_sockets = 0, hw_cores = 0, hw_threads = 0; uint16_t offset = 0, p = 0; int size, max_tasks = req->tasks_to_launch[(int)node_id]; int max_cpus = max_tasks * req->cpus_per_task; bitstr_t *avail_map; bitstr_t **masks = NULL; int *socket_last_pu = NULL; int core_inx, pu_per_core, *core_tasks = NULL; info ("_task_layout_lllp_cyclic "); avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads); if (!avail_map) return SLURM_ERROR; size = bit_set_count(avail_map); if (size < max_tasks) { error("task/affinity: only %d bits in avail_map for %d tasks!", size, max_tasks); FREE_NULL_BITMAP(avail_map); return SLURM_ERROR; } if (size < max_cpus) { /* Possible result of overcommit */ i = size / max_tasks; info("task/affinity: reset cpus_per_task from %d to %d", req->cpus_per_task, i); req->cpus_per_task = i; } pu_per_core = hw_threads; core_tasks = xmalloc(sizeof(int) * hw_sockets * hw_cores); socket_last_pu = xmalloc(hw_sockets * sizeof(int)); *masks_p = xmalloc(max_tasks * sizeof(bitstr_t*)); masks = *masks_p; size = bit_size(avail_map); offset = hw_cores * hw_threads; s = 0; while (taskcount < max_tasks) { if (taskcount == last_taskcount) fatal("_task_layout_lllp_cyclic failure"); last_taskcount = taskcount; for (i = 0; i < size; i++) { bool already_switched = false; uint16_t bit; uint16_t orig_s = s; while (socket_last_pu[s] >= offset) { /* Switch to the next socket we have * ran out here. */ /* This only happens if the slurmctld * gave us an allocation that made a * task split sockets. Or if the * entire allocation is on one socket. */ s = (s + 1) % hw_sockets; if (orig_s == s) { /* This should rarely happen, * but is here for sanity sake. 
*/ debug("allocation is full, " "oversubscribing"); memset(core_tasks, 0, (sizeof(int) * hw_sockets * hw_cores)); memset(socket_last_pu, 0, sizeof(hw_sockets * sizeof(int))); } } bit = socket_last_pu[s] + (s * offset); /* In case hardware and config differ */ bit %= size; /* set up for the next one */ socket_last_pu[s]++; /* skip unrequested threads */ if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) socket_last_pu[s] += hw_threads - 1; if (!bit_test(avail_map, bit)) continue; core_inx = bit / pu_per_core; if ((req->ntasks_per_core != 0) && (core_tasks[core_inx] >= req->ntasks_per_core)) continue; if (!masks[taskcount]) masks[taskcount] = bit_alloc(conf->block_map_size); //info("setting %d %d", taskcount, bit); bit_set(masks[taskcount], bit); if (!already_switched && (((req->task_dist & SLURM_DIST_STATE_BASE) == SLURM_DIST_CYCLIC_CFULL) || ((req->task_dist & SLURM_DIST_STATE_BASE) == SLURM_DIST_BLOCK_CFULL))) { /* This means we are laying out cpus * within a task cyclically as well. */ s = (s + 1) % hw_sockets; already_switched = true; } if (++p < req->cpus_per_task) continue; core_tasks[core_inx]++; /* Binding to cores, skip remaining of the threads */ if (!(req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) && ((req->cpu_bind_type & CPU_BIND_TO_CORES) || (req->ntasks_per_core == 1))) { int threads_not_used; if (req->cpus_per_task < hw_threads) threads_not_used = hw_threads - req->cpus_per_task; else threads_not_used = req->cpus_per_task % hw_threads; socket_last_pu[s] += threads_not_used; } p = 0; if (!already_switched) { /* Now that we have finished a task, switch to * the next socket. */ s = (s + 1) % hw_sockets; } if (++taskcount >= max_tasks) break; } } /* last step: expand the masks to bind each task * to the requested resource */ _expand_masks(req->cpu_bind_type, max_tasks, masks, hw_sockets, hw_cores, hw_threads, avail_map); FREE_NULL_BITMAP(avail_map); xfree(core_tasks); xfree(socket_last_pu); return SLURM_SUCCESS; }
int main(int argc, char *argv[]) { note("Testing static decl"); { bitstr_t bit_decl(bs, 65); /*bitstr_t *bsp = bs;*/ bit_set(bs,9); bit_set(bs,14); TEST(bit_test(bs,9), "bit 9 set"); TEST(!bit_test(bs,12), "bit 12 not set"); TEST(bit_test(bs,14), "bit 14 set" ); /*bit_free(bsp);*/ /* triggers TEST in bit_free - OK */ } note("Testing basic vixie functions"); { bitstr_t *bs = bit_alloc(16), *bs2; /*bit_set(bs, 42);*/ /* triggers TEST in bit_set - OK */ bit_set(bs,9); bit_set(bs,14); TEST(bit_test(bs,9), "bit 9 set"); TEST(!bit_test(bs,12), "bit 12 not set" ); TEST(bit_test(bs,14), "bit 14 set"); bs2 = bit_copy(bs); bit_fill_gaps(bs2); TEST(bit_ffs(bs2) == 9, "first bit set = 9 "); TEST(bit_fls(bs2) == 14, "last bit set = 14"); TEST(bit_set_count(bs2) == 6, "bitstring"); TEST(bit_test(bs2,12), "bitstring"); TEST(bit_super_set(bs,bs2) == 1, "bitstring"); TEST(bit_super_set(bs2,bs) == 0, "bitstring"); bit_clear(bs,14); TEST(!bit_test(bs,14), "bitstring"); bit_nclear(bs,9,14); TEST(!bit_test(bs,9), "bitstring"); TEST(!bit_test(bs,12), "bitstring"); TEST(!bit_test(bs,14), "bitstring"); bit_nset(bs,9,14); TEST(bit_test(bs,9), "bitstring"); TEST(bit_test(bs,12), "bitstring"); TEST(bit_test(bs,14), "bitstring"); TEST(bit_ffs(bs) == 9, "ffs"); TEST(bit_ffc(bs) == 0, "ffc"); bit_nset(bs,0,8); TEST(bit_ffc(bs) == 15, "ffc"); bit_free(bs); /*bit_set(bs,9); */ /* triggers TEST in bit_set - OK */ } note("Testing and/or/not"); { bitstr_t *bs1 = bit_alloc(128); bitstr_t *bs2 = bit_alloc(128); bit_set(bs1, 100); bit_set(bs1, 104); bit_set(bs2, 100); bit_and(bs1, bs2); TEST(bit_test(bs1, 100), "and"); TEST(!bit_test(bs1, 104), "and"); bit_set(bs2, 110); bit_set(bs2, 111); bit_set(bs2, 112); bit_or(bs1, bs2); TEST(bit_test(bs1, 100), "or"); TEST(bit_test(bs1, 110), "or"); TEST(bit_test(bs1, 111), "or"); TEST(bit_test(bs1, 112), "or"); bit_not(bs1); TEST(!bit_test(bs1, 100), "not"); TEST(bit_test(bs1, 12), "not"); bit_free(bs1); bit_free(bs2); } note("testing bit selection"); { bitstr_t *bs1 = bit_alloc(128), *bs2; bit_set(bs1, 21); bit_set(bs1, 100); bit_fill_gaps(bs1); bs2 = bit_pick_cnt(bs1,20); if (bs2) { TEST(bit_set_count(bs2) == 20, "pick"); TEST(bit_ffs(bs2) == 21, "pick"); TEST(bit_fls(bs2) == 40, "pick"); bit_free(bs2); } else TEST(0, "alloc fail"); bit_free(bs1); } note("Testing realloc"); { bitstr_t *bs = bit_alloc(1); TEST(bit_ffs(bs) == -1, "bitstring"); bit_set(bs,0); /*bit_set(bs, 1000);*/ /* triggers TEST in bit_set - OK */ bs = bit_realloc(bs,1048576); bit_set(bs,1000); bit_set(bs,1048575); TEST(bit_test(bs, 0), "bitstring"); TEST(bit_test(bs, 1000), "bitstring"); TEST(bit_test(bs, 1048575), "bitstring"); TEST(bit_set_count(bs) == 3, "bitstring"); bit_clear(bs,0); bit_clear(bs,1000); TEST(bit_set_count(bs) == 1, "bitstring"); TEST(bit_ffs(bs) == 1048575, "bitstring"); bit_free(bs); } note("Testing bit_fmt"); { char tmpstr[1024]; bitstr_t *bs = bit_alloc(1024); TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr),bs), ""), "bitstring"); bit_set(bs,42); TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr),bs), "42"), "bitstring"); bit_set(bs,102); TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr),bs), "42,102"), "bitstring"); bit_nset(bs,9,14); TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr), bs), "9-14,42,102"), "bitstring"); } note("Testing bit_nffc/bit_nffs"); { bitstr_t *bs = bit_alloc(1024); bit_set(bs, 2); bit_set(bs, 6); bit_set(bs, 7); bit_nset(bs,12,1018); TEST(bit_nffc(bs, 2) == 0, "bitstring"); TEST(bit_nffc(bs, 3) == 3, "bitstring"); TEST(bit_nffc(bs, 4) == 8, "bitstring"); TEST(bit_nffc(bs, 5) == 1019, 
"bitstring"); TEST(bit_nffc(bs, 6) == -1, "bitstring"); TEST(bit_nffs(bs, 1) == 2, "bitstring"); TEST(bit_nffs(bs, 2) == 6, "bitstring"); TEST(bit_nffs(bs, 100) == 12, "bitstring"); TEST(bit_nffs(bs, 1023) == -1, "bitstring"); bit_free(bs); } note("Testing bit_unfmt"); { bitstr_t *bs = bit_alloc(1024); bitstr_t *bs2 = bit_alloc(1024); char tmpstr[4096]; bit_set(bs,1); bit_set(bs,3); bit_set(bs,30); bit_nset(bs,42,64); bit_nset(bs,97,1000); bit_fmt(tmpstr, sizeof(tmpstr), bs); TEST(bit_unfmt(bs2, tmpstr) != -1, "bitstring"); TEST(bit_equal(bs, bs2), "bitstring"); } totals(); return failed; }