/* * slurm_sprint_job_info - output information about a specific Slurm * job based upon message as loaded using slurm_load_jobs * IN job_ptr - an individual job information record pointer * IN one_liner - print as a single line if true * RET out - char * containing formatted output (must be freed after call) * NULL is returned on failure. */ extern char * slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) { int i, j, k; char time_str[32], *group_name, *user_name; char tmp1[128], tmp2[128]; char *tmp6_ptr; char tmp_line[1024 * 128]; char tmp_path[MAXPATHLEN]; char *ionodes = NULL; uint16_t exit_status = 0, term_sig = 0; job_resources_t *job_resrcs = job_ptr->job_resrcs; char *out = NULL; time_t run_time; uint32_t min_nodes, max_nodes = 0; char *nodelist = "NodeList"; bitstr_t *cpu_bitmap; char *host; int sock_inx, sock_reps, last; int abs_node_inx, rel_node_inx; int64_t nice; int bit_inx, bit_reps; uint32_t *last_mem_alloc_ptr = NULL; uint32_t last_mem_alloc = NO_VAL; char *last_hosts; hostlist_t hl, hl_last; char select_buf[122]; uint32_t cluster_flags = slurmdb_setup_cluster_flags(); uint32_t threads; char *line_end = (one_liner) ? " " : "\n "; if (cluster_flags & CLUSTER_FLAG_BG) { nodelist = "MidplaneList"; select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_IONODES, &ionodes); } /****** Line 1 ******/ xstrfmtcat(out, "JobId=%u ", job_ptr->job_id); if (job_ptr->array_job_id) { if (job_ptr->array_task_str) { xstrfmtcat(out, "ArrayJobId=%u ArrayTaskId=%s ", job_ptr->array_job_id, job_ptr->array_task_str); } else { xstrfmtcat(out, "ArrayJobId=%u ArrayTaskId=%u ", job_ptr->array_job_id, job_ptr->array_task_id); } } xstrfmtcat(out, "JobName=%s", job_ptr->name); xstrcat(out, line_end); /****** Line 2 ******/ user_name = uid_to_string((uid_t) job_ptr->user_id); group_name = gid_to_string((gid_t) job_ptr->group_id); xstrfmtcat(out, "UserId=%s(%u) GroupId=%s(%u) MCS_label=%s", user_name, job_ptr->user_id, group_name, job_ptr->group_id, (job_ptr->mcs_label==NULL) ? 
"N/A" : job_ptr->mcs_label); xfree(user_name); xfree(group_name); xstrcat(out, line_end); /****** Line 3 ******/ nice = ((int64_t)job_ptr->nice) - NICE_OFFSET; xstrfmtcat(out, "Priority=%u Nice=%"PRIi64" Account=%s QOS=%s", job_ptr->priority, nice, job_ptr->account, job_ptr->qos); if (slurm_get_track_wckey()) xstrfmtcat(out, " WCKey=%s", job_ptr->wckey); xstrcat(out, line_end); /****** Line 4 ******/ xstrfmtcat(out, "JobState=%s ", job_state_string(job_ptr->job_state)); if (job_ptr->state_desc) { /* Replace white space with underscore for easier parsing */ for (j=0; job_ptr->state_desc[j]; j++) { if (isspace((int)job_ptr->state_desc[j])) job_ptr->state_desc[j] = '_'; } xstrfmtcat(out, "Reason=%s ", job_ptr->state_desc); } else xstrfmtcat(out, "Reason=%s ", job_reason_string(job_ptr->state_reason)); xstrfmtcat(out, "Dependency=%s", job_ptr->dependency); xstrcat(out, line_end); /****** Line 5 ******/ xstrfmtcat(out, "Requeue=%u Restarts=%u BatchFlag=%u Reboot=%u ", job_ptr->requeue, job_ptr->restart_cnt, job_ptr->batch_flag, job_ptr->reboot); if (WIFSIGNALED(job_ptr->exit_code)) term_sig = WTERMSIG(job_ptr->exit_code); exit_status = WEXITSTATUS(job_ptr->exit_code); xstrfmtcat(out, "ExitCode=%u:%u", exit_status, term_sig); xstrcat(out, line_end); /****** Line 5a (optional) ******/ if (job_ptr->show_flags & SHOW_DETAIL) { if (WIFSIGNALED(job_ptr->derived_ec)) term_sig = WTERMSIG(job_ptr->derived_ec); else term_sig = 0; exit_status = WEXITSTATUS(job_ptr->derived_ec); xstrfmtcat(out, "DerivedExitCode=%u:%u", exit_status, term_sig); xstrcat(out, line_end); } /****** Line 6 ******/ if (IS_JOB_PENDING(job_ptr)) run_time = 0; else if (IS_JOB_SUSPENDED(job_ptr)) run_time = job_ptr->pre_sus_time; else { time_t end_time; if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0)) end_time = time(NULL); else end_time = job_ptr->end_time; if (job_ptr->suspend_time) { run_time = (time_t) (difftime(end_time, job_ptr->suspend_time) + job_ptr->pre_sus_time); } else run_time = (time_t) difftime(end_time, job_ptr->start_time); } secs2time_str(run_time, time_str, sizeof(time_str)); xstrfmtcat(out, "RunTime=%s ", time_str); if (job_ptr->time_limit == NO_VAL) xstrcat(out, "TimeLimit=Partition_Limit "); else { mins2time_str(job_ptr->time_limit, time_str, sizeof(time_str)); xstrfmtcat(out, "TimeLimit=%s ", time_str); } if (job_ptr->time_min == 0) xstrcat(out, "TimeMin=N/A"); else { mins2time_str(job_ptr->time_min, time_str, sizeof(time_str)); xstrfmtcat(out, "TimeMin=%s", time_str); } xstrcat(out, line_end); /****** Line 7 ******/ slurm_make_time_str(&job_ptr->submit_time, time_str, sizeof(time_str)); xstrfmtcat(out, "SubmitTime=%s ", time_str); slurm_make_time_str(&job_ptr->eligible_time, time_str, sizeof(time_str)); xstrfmtcat(out, "EligibleTime=%s", time_str); xstrcat(out, line_end); /****** Line 8 (optional) ******/ if (job_ptr->resize_time) { slurm_make_time_str(&job_ptr->resize_time, time_str, sizeof(time_str)); xstrfmtcat(out, "ResizeTime=%s", time_str); xstrcat(out, line_end); } /****** Line 9 ******/ slurm_make_time_str(&job_ptr->start_time, time_str, sizeof(time_str)); xstrfmtcat(out, "StartTime=%s ", time_str); if ((job_ptr->time_limit == INFINITE) && (job_ptr->end_time > time(NULL))) xstrcat(out, "EndTime=Unknown "); else { slurm_make_time_str(&job_ptr->end_time, time_str, sizeof(time_str)); xstrfmtcat(out, "EndTime=%s ", time_str); } if (job_ptr->deadline) { slurm_make_time_str(&job_ptr->deadline, time_str, sizeof(time_str)); xstrfmtcat(out, "Deadline=%s", time_str); } else { xstrcat(out, "Deadline=N/A"); 
} xstrcat(out, line_end); /****** Line 10 ******/ if (job_ptr->preempt_time == 0) xstrcat(out, "PreemptTime=None "); else { slurm_make_time_str(&job_ptr->preempt_time, time_str, sizeof(time_str)); xstrfmtcat(out, "PreemptTime=%s ", time_str); } if (job_ptr->suspend_time) { slurm_make_time_str(&job_ptr->suspend_time, time_str, sizeof(time_str)); xstrfmtcat(out, "SuspendTime=%s ", time_str); } else xstrcat(out, "SuspendTime=None "); xstrfmtcat(out, "SecsPreSuspend=%ld", (long int)job_ptr->pre_sus_time); xstrcat(out, line_end); /****** Line 11 ******/ xstrfmtcat(out, "Partition=%s AllocNode:Sid=%s:%u", job_ptr->partition, job_ptr->alloc_node, job_ptr->alloc_sid); xstrcat(out, line_end); /****** Line 12 ******/ xstrfmtcat(out, "Req%s=%s Exc%s=%s", nodelist, job_ptr->req_nodes, nodelist, job_ptr->exc_nodes); xstrcat(out, line_end); /****** Line 13 ******/ xstrfmtcat(out, "%s=%s", nodelist, job_ptr->nodes); if (job_ptr->nodes && ionodes) { xstrfmtcat(out, "[%s]", ionodes); xfree(ionodes); } if (job_ptr->sched_nodes) xstrfmtcat(out, " Sched%s=%s", nodelist, job_ptr->sched_nodes); xstrcat(out, line_end); /****** Line 14 (optional) ******/ if (job_ptr->batch_host) { xstrfmtcat(out, "BatchHost=%s", job_ptr->batch_host); xstrcat(out, line_end); } /****** Line 15 ******/ if (cluster_flags & CLUSTER_FLAG_BG) { select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &min_nodes); if ((min_nodes == 0) || (min_nodes == NO_VAL)) { min_nodes = job_ptr->num_nodes; max_nodes = job_ptr->max_nodes; } else if (job_ptr->max_nodes) max_nodes = min_nodes; } else if (IS_JOB_PENDING(job_ptr)) { min_nodes = job_ptr->num_nodes; max_nodes = job_ptr->max_nodes; if (max_nodes && (max_nodes < min_nodes)) min_nodes = max_nodes; } else { min_nodes = job_ptr->num_nodes; max_nodes = 0; } _sprint_range(tmp_line, sizeof(tmp_line), min_nodes, max_nodes); xstrfmtcat(out, "NumNodes=%s ", tmp_line); _sprint_range(tmp_line, sizeof(tmp_line), job_ptr->num_cpus, job_ptr->max_cpus); xstrfmtcat(out, "NumCPUs=%s ", tmp_line); xstrfmtcat(out, "NumTasks=%u ", job_ptr->num_tasks); xstrfmtcat(out, "CPUs/Task=%u ", job_ptr->cpus_per_task); if (job_ptr->boards_per_node == (uint16_t) NO_VAL) xstrcat(out, "ReqB:S:C:T=*:"); else xstrfmtcat(out, "ReqB:S:C:T=%u:", job_ptr->boards_per_node); if (job_ptr->sockets_per_board == (uint16_t) NO_VAL) xstrcat(out, "*:"); else xstrfmtcat(out, "%u:", job_ptr->sockets_per_board); if (job_ptr->cores_per_socket == (uint16_t) NO_VAL) xstrcat(out, "*:"); else xstrfmtcat(out, "%u:", job_ptr->cores_per_socket); if (job_ptr->threads_per_core == (uint16_t) NO_VAL) xstrcat(out, "*"); else xstrfmtcat(out, "%u", job_ptr->threads_per_core); xstrcat(out, line_end); /****** Line 16 ******/ /* Tres should already of been converted at this point from simple */ xstrfmtcat(out, "TRES=%s", job_ptr->tres_alloc_str ? 
job_ptr->tres_alloc_str : job_ptr->tres_req_str); xstrcat(out, line_end); /****** Line 17 ******/ if (job_ptr->sockets_per_node == (uint16_t) NO_VAL) xstrcat(out, "Socks/Node=* "); else xstrfmtcat(out, "Socks/Node=%u ", job_ptr->sockets_per_node); if (job_ptr->ntasks_per_node == (uint16_t) NO_VAL) xstrcat(out, "NtasksPerN:B:S:C=*:"); else xstrfmtcat(out, "NtasksPerN:B:S:C=%u:", job_ptr->ntasks_per_node); if (job_ptr->ntasks_per_board == (uint16_t) NO_VAL) xstrcat(out, "*:"); else xstrfmtcat(out, "%u:", job_ptr->ntasks_per_board); if ((job_ptr->ntasks_per_socket == (uint16_t) NO_VAL) || (job_ptr->ntasks_per_socket == (uint16_t) INFINITE)) xstrcat(out, "*:"); else xstrfmtcat(out, "%u:", job_ptr->ntasks_per_socket); if ((job_ptr->ntasks_per_core == (uint16_t) NO_VAL) || (job_ptr->ntasks_per_core == (uint16_t) INFINITE)) xstrcat(out, "* "); else xstrfmtcat(out, "%u ", job_ptr->ntasks_per_core); if (job_ptr->core_spec == (uint16_t) NO_VAL) xstrcat(out, "CoreSpec=*"); else if (job_ptr->core_spec & CORE_SPEC_THREAD) xstrfmtcat(out, "ThreadSpec=%d", (job_ptr->core_spec & (~CORE_SPEC_THREAD))); else xstrfmtcat(out, "CoreSpec=%u", job_ptr->core_spec); xstrcat(out, line_end); if (job_resrcs && cluster_flags & CLUSTER_FLAG_BG) { if ((job_resrcs->cpu_array_cnt > 0) && (job_resrcs->cpu_array_value) && (job_resrcs->cpu_array_reps)) { int length = 0; xstrcat(out, "CPUs="); for (i = 0; i < job_resrcs->cpu_array_cnt; i++) { /* only print 60 characters worth of this record */ if (length > 60) { /* skip to last CPU group entry */ if (i < job_resrcs->cpu_array_cnt - 1) { continue; } /* add ellipsis before last entry */ xstrcat(out, "...,"); } length += xstrfmtcat(out, "%d", job_resrcs->cpus[i]); if (job_resrcs->cpu_array_reps[i] > 1) { length += xstrfmtcat(out, "*%d", job_resrcs->cpu_array_reps[i]); } if (i < job_resrcs->cpu_array_cnt - 1) { xstrcat(out, ","); length++; } } xstrcat(out, line_end); } } else if (job_resrcs && job_resrcs->core_bitmap && ((last = bit_fls(job_resrcs->core_bitmap)) != -1)) { hl = hostlist_create(job_resrcs->nodes); if (!hl) { error("slurm_sprint_job_info: hostlist_create: %s", job_resrcs->nodes); return NULL; } hl_last = hostlist_create(NULL); if (!hl_last) { error("slurm_sprint_job_info: hostlist_create: NULL"); hostlist_destroy(hl); return NULL; } bit_inx = 0; i = sock_inx = sock_reps = 0; abs_node_inx = job_ptr->node_inx[i]; /* tmp1[] stores the current cpu(s) allocated */ tmp2[0] = '\0'; /* stores last cpu(s) allocated */ for (rel_node_inx=0; rel_node_inx < job_resrcs->nhosts; rel_node_inx++) { if (sock_reps >= job_resrcs->sock_core_rep_count[sock_inx]) { sock_inx++; sock_reps = 0; } sock_reps++; bit_reps = job_resrcs->sockets_per_node[sock_inx] * job_resrcs->cores_per_socket[sock_inx]; host = hostlist_shift(hl); threads = _threads_per_core(host); cpu_bitmap = bit_alloc(bit_reps * threads); for (j = 0; j < bit_reps; j++) { if (bit_test(job_resrcs->core_bitmap, bit_inx)){ for (k = 0; k < threads; k++) bit_set(cpu_bitmap, (j * threads) + k); } bit_inx++; } bit_fmt(tmp1, sizeof(tmp1), cpu_bitmap); FREE_NULL_BITMAP(cpu_bitmap); /* * If the allocation values for this host are not the * same as the last host, print the report of the last * group of hosts that had identical allocation values. 
*/ if (xstrcmp(tmp1, tmp2) || (last_mem_alloc_ptr != job_resrcs->memory_allocated) || (job_resrcs->memory_allocated && (last_mem_alloc != job_resrcs->memory_allocated[rel_node_inx]))) { if (hostlist_count(hl_last)) { last_hosts = hostlist_ranged_string_xmalloc( hl_last); xstrfmtcat(out, " Nodes=%s CPU_IDs=%s Mem=%u", last_hosts, tmp2, last_mem_alloc_ptr ? last_mem_alloc : 0); xfree(last_hosts); xstrcat(out, line_end); hostlist_destroy(hl_last); hl_last = hostlist_create(NULL); } strcpy(tmp2, tmp1); last_mem_alloc_ptr = job_resrcs->memory_allocated; if (last_mem_alloc_ptr) last_mem_alloc = job_resrcs-> memory_allocated[rel_node_inx]; else last_mem_alloc = NO_VAL; } hostlist_push_host(hl_last, host); free(host); if (bit_inx > last) break; if (abs_node_inx > job_ptr->node_inx[i+1]) { i += 2; abs_node_inx = job_ptr->node_inx[i]; } else { abs_node_inx++; } } if (hostlist_count(hl_last)) { last_hosts = hostlist_ranged_string_xmalloc(hl_last); xstrfmtcat(out, " Nodes=%s CPU_IDs=%s Mem=%u", last_hosts, tmp2, last_mem_alloc_ptr ? last_mem_alloc : 0); xfree(last_hosts); xstrcat(out, line_end); } hostlist_destroy(hl); hostlist_destroy(hl_last); } /****** Line 18 ******/ if (job_ptr->pn_min_memory & MEM_PER_CPU) { job_ptr->pn_min_memory &= (~MEM_PER_CPU); tmp6_ptr = "CPU"; } else tmp6_ptr = "Node"; if (cluster_flags & CLUSTER_FLAG_BG) { convert_num_unit((float)job_ptr->pn_min_cpus, tmp1, sizeof(tmp1), UNIT_NONE, NO_VAL, CONVERT_NUM_UNIT_EXACT); xstrfmtcat(out, "MinCPUsNode=%s ", tmp1); } else { xstrfmtcat(out, "MinCPUsNode=%u ", job_ptr->pn_min_cpus); } convert_num_unit((float)job_ptr->pn_min_memory, tmp1, sizeof(tmp1), UNIT_MEGA, NO_VAL, CONVERT_NUM_UNIT_EXACT); convert_num_unit((float)job_ptr->pn_min_tmp_disk, tmp2, sizeof(tmp2), UNIT_MEGA, NO_VAL, CONVERT_NUM_UNIT_EXACT); xstrfmtcat(out, "MinMemory%s=%s MinTmpDiskNode=%s", tmp6_ptr, tmp1, tmp2); xstrcat(out, line_end); /****** Line ******/ secs2time_str((time_t)job_ptr->delay_boot, tmp1, sizeof(tmp1)); xstrfmtcat(out, "Features=%s DelayBoot=%s", job_ptr->features, tmp1); xstrcat(out, line_end); /****** Line ******/ xstrfmtcat(out, "Gres=%s Reservation=%s", job_ptr->gres, job_ptr->resv_name); xstrcat(out, line_end); /****** Line 20 ******/ xstrfmtcat(out, "OverSubscribe=%s Contiguous=%d Licenses=%s Network=%s", job_share_string(job_ptr->shared), job_ptr->contiguous, job_ptr->licenses, job_ptr->network); xstrcat(out, line_end); /****** Line 21 ******/ xstrfmtcat(out, "Command=%s", job_ptr->command); xstrcat(out, line_end); /****** Line 22 ******/ xstrfmtcat(out, "WorkDir=%s", job_ptr->work_dir); if (cluster_flags & CLUSTER_FLAG_BG) { /****** Line 23 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_BG_ID); if (select_buf[0] != '\0') { xstrcat(out, line_end); xstrfmtcat(out, "Block_ID=%s", select_buf); } /****** Line 24 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_MIXED_SHORT); if (select_buf[0] != '\0') { xstrcat(out, line_end); xstrcat(out, select_buf); } if (cluster_flags & CLUSTER_FLAG_BGL) { /****** Line 25 (optional) ******/ select_g_select_jobinfo_sprint( job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_BLRTS_IMAGE); if (select_buf[0] != '\0') { xstrcat(out, line_end); xstrfmtcat(out, "BlrtsImage=%s", select_buf); } } /****** Line 26 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_LINUX_IMAGE); if 
(select_buf[0] != '\0') { xstrcat(out, line_end); if (cluster_flags & CLUSTER_FLAG_BGL) xstrfmtcat(out, "LinuxImage=%s", select_buf); else xstrfmtcat(out, "CnloadImage=%s", select_buf); } /****** Line 27 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_MLOADER_IMAGE); if (select_buf[0] != '\0') { xstrcat(out, line_end); xstrfmtcat(out, "MloaderImage=%s", select_buf); } /****** Line 28 (optional) ******/ select_g_select_jobinfo_sprint(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_RAMDISK_IMAGE); if (select_buf[0] != '\0') { xstrcat(out, line_end); if (cluster_flags & CLUSTER_FLAG_BGL) xstrfmtcat(out, "RamDiskImage=%s", select_buf); else xstrfmtcat(out, "IoloadImage=%s", select_buf); } } /****** Line (optional) ******/ if (job_ptr->admin_comment) { xstrcat(out, line_end); xstrfmtcat(out, "AdminComment=%s ", job_ptr->admin_comment); } /****** Line (optional) ******/ if (job_ptr->comment) { xstrcat(out, line_end); xstrfmtcat(out, "Comment=%s ", job_ptr->comment); } /****** Line 30 (optional) ******/ if (job_ptr->batch_flag) { xstrcat(out, line_end); slurm_get_job_stderr(tmp_path, sizeof(tmp_path), job_ptr); xstrfmtcat(out, "StdErr=%s", tmp_path); } /****** Line 31 (optional) ******/ if (job_ptr->batch_flag) { xstrcat(out, line_end); slurm_get_job_stdin(tmp_path, sizeof(tmp_path), job_ptr); xstrfmtcat(out, "StdIn=%s", tmp_path); } /****** Line 32 (optional) ******/ if (job_ptr->batch_flag) { xstrcat(out, line_end); slurm_get_job_stdout(tmp_path, sizeof(tmp_path), job_ptr); xstrfmtcat(out, "StdOut=%s", tmp_path); } /****** Line 33 (optional) ******/ if (job_ptr->batch_script) { xstrcat(out, line_end); xstrcat(out, "BatchScript=\n"); xstrcat(out, job_ptr->batch_script); } /****** Line 34 (optional) ******/ if (job_ptr->req_switch) { char time_buf[32]; xstrcat(out, line_end); secs2time_str((time_t) job_ptr->wait4switch, time_buf, sizeof(time_buf)); xstrfmtcat(out, "Switches=%u@%s\n", job_ptr->req_switch, time_buf); } /****** Line 35 (optional) ******/ if (job_ptr->burst_buffer) { xstrcat(out, line_end); xstrfmtcat(out, "BurstBuffer=%s", job_ptr->burst_buffer); } /****** Line (optional) ******/ if (job_ptr->burst_buffer_state) { xstrcat(out, line_end); xstrfmtcat(out, "BurstBufferState=%s", job_ptr->burst_buffer_state); } /****** Line 36 (optional) ******/ if (cpu_freq_debug(NULL, NULL, tmp1, sizeof(tmp1), job_ptr->cpu_freq_gov, job_ptr->cpu_freq_min, job_ptr->cpu_freq_max, NO_VAL) != 0) { xstrcat(out, line_end); xstrcat(out, tmp1); } /****** Line 37 ******/ xstrcat(out, line_end); xstrfmtcat(out, "Power=%s", power_flags_str(job_ptr->power_flags)); /****** Line 38 (optional) ******/ if (job_ptr->bitflags) { xstrcat(out, line_end); if (job_ptr->bitflags & KILL_INV_DEP) xstrcat(out, "KillOInInvalidDependent=Yes"); if (job_ptr->bitflags & NO_KILL_INV_DEP) xstrcat(out, "KillOInInvalidDependent=No"); if (job_ptr->bitflags & SPREAD_JOB) xstrcat(out, "SpreadJob=Yes"); } /****** END OF JOB RECORD ******/ if (one_liner) xstrcat(out, "\n"); else xstrcat(out, "\n\n"); return out; }
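/* * Usage sketch: the formatter above pairs with the public job-query API. A minimal caller, assuming the standard <slurm/slurm.h> interface (slurm_load_jobs, slurm_free_job_info_msg, SHOW_ALL) and linking with -lslurm; the program itself is illustrative only. */
#include <stdio.h>
#include <stdlib.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

int main(void)
{
	job_info_msg_t *job_msg = NULL;
	uint32_t i;

	/* Ask slurmctld for every job record, not just the caller's own */
	if (slurm_load_jobs((time_t) 0, &job_msg, SHOW_ALL) != SLURM_SUCCESS) {
		slurm_perror("slurm_load_jobs");
		return 1;
	}
	for (i = 0; i < job_msg->record_count; i++) {
		/* one_liner = 0 yields the multi-line "scontrol show job" layout */
		char *txt = slurm_sprint_job_info(&job_msg->job_array[i], 0);
		if (txt) {
			printf("%s", txt);
			free(txt);	/* RET string must be freed after call */
		}
	}
	slurm_free_job_info_msg(job_msg);
	return 0;
}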
/** * Check page (used or not) */ static inline bool bitmask_check(struct PagePool *pp, pageno_t n) { return bit_test(pp->bitmask, n); }
static int _grid_table_by_switch(button_processor_t *button_processor, List node_list) { int rc = SLURM_SUCCESS; int inx = 0, ii = 0; switch_record_bitmaps_t *sw_nodes_bitmaps_ptr = g_switch_nodes_maps; #if TOPO_DEBUG /* engage if want original display below switched */ ListIterator itr = list_iterator_create(node_list); sview_node_info_t *sview_node_info_ptr = NULL; #endif button_processor->inx = &inx; for (ii=0; ii<g_topo_info_msg_ptr->record_count; ii++, sw_nodes_bitmaps_ptr++) { int j = 0, first, last; if (g_topo_info_msg_ptr->topo_array[ii].level) continue; first = bit_ffs(sw_nodes_bitmaps_ptr->node_bitmap); if (first == -1) continue; last = bit_fls(sw_nodes_bitmaps_ptr->node_bitmap); button_processor->inx = &j; button_processor->force_row_break = FALSE; for (j = first; j <= last; j++) { if (TOPO_DEBUG) g_print("allocated node = %s button# %d\n", g_node_info_ptr->node_array[j].name, j); if (!bit_test(sw_nodes_bitmaps_ptr->node_bitmap, j)) continue; /* if (!working_sview_config.show_hidden) { */ /* if (!check_part_includes_node(j)) */ /* continue; */ /* } */ if (j == last) button_processor->force_row_break = TRUE; if ((rc = _add_button_to_list( &g_node_info_ptr->node_array[j], button_processor)) != SLURM_SUCCESS) break; button_processor->force_row_break = FALSE; } rc = _add_button_to_list(NULL, button_processor); } #if TOPO_DEBUG /* engage this if want original display below * switched grid */ button_processor->inx = &inx; while ((sview_node_info_ptr = list_next(itr))) { if ((rc = _add_button_to_list( sview_node_info_ptr->node_ptr, button_processor)) != SLURM_SUCCESS) break; inx++; } list_iterator_destroy(itr); #endif /* This is needed to get the correct width of the grid window. * If it is not given then we get a really narrow window. */ gtk_table_set_row_spacing(button_processor->table, (*button_processor->coord_y)? ((*button_processor->coord_y)-1):0, 1); return rc; }
/* * Determine which of these nodes are usable by this job * * Remove nodes from the bitmap that don't have enough memory or gres to * support the job. * * Return SLURM_ERROR if a required node can't be used. * * if node_state = NODE_CR_RESERVED, clear bitmap (if node is required * then should we return NODE_BUSY!?!) * * if node_state = NODE_CR_ONE_ROW, then this node can only be used by * another NODE_CR_ONE_ROW job * * if node_state = NODE_CR_AVAILABLE AND: * - job_node_req = NODE_CR_RESERVED, then we need idle nodes * - job_node_req = NODE_CR_ONE_ROW, then we need idle or non-sharing nodes */ static int _verify_node_state(struct part_res_record *cr_part_ptr, struct job_record *job_ptr, bitstr_t * bitmap, uint16_t cr_type, struct node_use_record *node_usage, enum node_cr_state job_node_req) { struct node_record *node_ptr; uint32_t i, free_mem, gres_cpus, min_mem; int i_first, i_last; List gres_list; if (job_ptr->details->pn_min_memory & MEM_PER_CPU) min_mem = job_ptr->details->pn_min_memory & (~MEM_PER_CPU); else min_mem = job_ptr->details->pn_min_memory; i_first = bit_ffs(bitmap); if (i_first >= 0) i_last = bit_fls(bitmap); else i_last = -2; for (i = i_first; i <= i_last; i++) { if (!bit_test(bitmap, i)) continue; node_ptr = select_node_record[i].node_ptr; /* node-level memory check */ if ((job_ptr->details->pn_min_memory) && (cr_type & CR_MEMORY)) { free_mem = select_node_record[i].real_memory; free_mem -= node_usage[i].alloc_memory; if (free_mem < min_mem) { debug3("select/serial: node %s no mem %u < %u", select_node_record[i].node_ptr->name, free_mem, min_mem); goto clear_bit; } } /* node-level gres check */ if (node_usage[i].gres_list) gres_list = node_usage[i].gres_list; else gres_list = node_ptr->gres_list; gres_cpus = gres_plugin_job_test(job_ptr->gres_list, gres_list, true, NULL, 0, 0, job_ptr->job_id, node_ptr->name); if (gres_cpus == 0) { debug3("select/serial: node %s lacks gres", node_ptr->name); goto clear_bit; } /* exclusive node check */ if (node_usage[i].node_state >= NODE_CR_RESERVED) { debug3("select/serial: node %s in exclusive use", node_ptr->name); goto clear_bit; /* non-resource-sharing node check */ } else if (node_usage[i].node_state >= NODE_CR_ONE_ROW) { if ((job_node_req == NODE_CR_RESERVED) || (job_node_req == NODE_CR_AVAILABLE)) { debug3("select/serial: node %s non-sharing", node_ptr->name); goto clear_bit; } /* cannot use this node if it is running jobs * in sharing partitions */ if (_is_node_busy(cr_part_ptr, i, 1, job_ptr->part_ptr)) { debug3("select/serial: node %s sharing?", node_ptr->name); goto clear_bit; } /* node is NODE_CR_AVAILABLE - check job request */ } else { if (job_node_req == NODE_CR_RESERVED) { if (_is_node_busy(cr_part_ptr, i, 0, job_ptr->part_ptr)) { debug3("select/serial: node %s busy", node_ptr->name); goto clear_bit; } } else if (job_node_req == NODE_CR_ONE_ROW) { /* cannot use this node if it is running jobs * in sharing partitions */ if (_is_node_busy(cr_part_ptr, i, 1, job_ptr->part_ptr)) { debug3("select/serial: node %s vbusy", node_ptr->name); goto clear_bit; } } } continue; /* node is usable, test next node */ clear_bit: /* This node is not usable by this job */ bit_clear(bitmap, i); if (job_ptr->details->req_node_bitmap && bit_test(job_ptr->details->req_node_bitmap, i)) { return SLURM_ERROR; } } return SLURM_SUCCESS; }
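/* * The filter above reduces to one idiom: sweep the set bits between bit_ffs() and bit_fls(), clear any bit whose node fails a check, and give up if a required node had to be cleared. An illustrative sketch of that idiom, assuming Slurm's bitstring API from src/common/bitstring.h; the helper name and predicate are hypothetical. */
static int _filter_usable_nodes(bitstr_t *bitmap, bitstr_t *required_bitmap,
				bool (*node_usable)(int node_inx))
{
	int i, i_first, i_last;

	i_first = bit_ffs(bitmap);
	i_last  = (i_first >= 0) ? bit_fls(bitmap) : -2;  /* empty map skips loop */

	for (i = i_first; i <= i_last; i++) {
		if (!bit_test(bitmap, i))
			continue;
		if (node_usable(i))
			continue;			/* keep this node */
		bit_clear(bitmap, i);			/* node can not run the job */
		if (required_bitmap && bit_test(required_bitmap, i))
			return SLURM_ERROR;		/* a required node was lost */
	}
	return SLURM_SUCCESS;
}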
/* cr_job_test - does most of the real work for select_p_job_test(), which * includes contiguous selection, load-leveling and max_share logic * * PROCEDURE: * * Step 1: compare nodes in "avail" bitmap with current node state data * to find available nodes that match the job request * * Step 2: check resources in "avail" bitmap with allocated resources from * higher priority partitions (busy resources are UNavailable) * * Step 3: select resource usage on remaining resources in "avail" bitmap * for this job, with the placement influenced by existing * allocations */ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, int mode, uint16_t cr_type, enum node_cr_state job_node_req, uint32_t cr_node_cnt, struct part_res_record *cr_part_ptr, struct node_use_record *node_usage) { static int gang_mode = -1; int error_code = SLURM_SUCCESS; bitstr_t *orig_map, *avail_cores, *free_cores; bitstr_t *tmpcore = NULL; bool test_only; uint32_t c, i, j, k, n, csize, save_mem = 0; job_resources_t *job_res; struct job_details *details_ptr; struct part_res_record *p_ptr, *jp_ptr; uint16_t *cpu_count; if (gang_mode == -1) { if (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) gang_mode = 1; else gang_mode = 0; } details_ptr = job_ptr->details; free_job_resources(&job_ptr->job_resrcs); if (mode == SELECT_MODE_TEST_ONLY) test_only = true; else /* SELECT_MODE_RUN_NOW || SELECT_MODE_WILL_RUN */ test_only = false; /* check node_state and update the node bitmap as necessary */ if (!test_only) { error_code = _verify_node_state(cr_part_ptr, job_ptr, bitmap, cr_type, node_usage, job_node_req); if (error_code != SLURM_SUCCESS) return error_code; } if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: evaluating job %u on %u nodes", job_ptr->job_id, bit_set_count(bitmap)); } orig_map = bit_copy(bitmap); avail_cores = _make_core_bitmap(bitmap); /* test to make sure that this job can succeed with all avail_cores * if 'no' then return FAIL * if 'yes' then we will seek the optimal placement for this job * within avail_cores */ free_cores = bit_copy(avail_cores); cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (cpu_count == NULL) { /* job cannot fit */ FREE_NULL_BITMAP(orig_map); FREE_NULL_BITMAP(free_cores); FREE_NULL_BITMAP(avail_cores); if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: test 0 fail: " "insufficient resources"); } return SLURM_ERROR; } else if (test_only) { FREE_NULL_BITMAP(orig_map); FREE_NULL_BITMAP(free_cores); FREE_NULL_BITMAP(avail_cores); xfree(cpu_count); if (select_debug_flags & DEBUG_FLAG_CPU_BIND) info("select/serial: cr_job_test: test 0 pass: " "test_only"); return SLURM_SUCCESS; } if (cr_type == CR_MEMORY) { /* CR_MEMORY does not care about existing CPU allocations, * so we can jump right to job allocation from here */ goto alloc_job; } xfree(cpu_count); if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: test 0 pass - " "job fits on given resources"); } /* now that we know that this job can run with the given resources, * let's factor in the existing allocations and seek the optimal set * of resources for this job. Here is the procedure: * * Step 1: Seek idle CPUs across all partitions. If successful then * place job and exit. If not successful, then continue. Two * related items to note: * 1. Jobs that don't share CPUs finish with step 1. * 2. The remaining steps assume sharing or preemption. 
* * Step 2: Remove resources that are in use by higher-priority * partitions, and test that job can still succeed. If not * then exit. * * Step 3: Seek idle nodes among the partitions with the same * priority as the job's partition. If successful then * goto Step 6. If not then continue: * * Step 4: Seek placement within the job's partition. Search * row-by-row. If no placement if found, then exit. If a row * is found, then continue: * * Step 5: Place job and exit. FIXME! Here is where we need a * placement algorithm that recognizes existing job * boundaries and tries to "overlap jobs" as efficiently * as possible. * * Step 6: Place job and exit. FIXME! here is we use a placement * algorithm similar to Step 5 on jobs from lower-priority * partitions. */ /*** Step 1 ***/ bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); /* remove all existing allocations from free_cores */ tmpcore = bit_copy(free_cores); for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) { if (!p_ptr->row) continue; for (i = 0; i < p_ptr->num_rows; i++) { if (!p_ptr->row[i].row_bitmap) continue; bit_copybits(tmpcore, p_ptr->row[i].row_bitmap); bit_not(tmpcore); /* set bits now "free" resources */ bit_and(free_cores, tmpcore); } } cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (cpu_count) { /* job fits! We're done. */ if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: test 1 pass - " "idle resources found"); } goto alloc_job; } if ((gang_mode == 0) && (job_node_req == NODE_CR_ONE_ROW)) { /* This job CANNOT share CPUs regardless of priority, * so we fail here. Note that Shared=EXCLUSIVE was already * addressed in _verify_node_state() and job preemption * removes jobs from simulated resource allocation map * before this point. 
*/ if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: test 1 fail - " "no idle resources available"); } goto alloc_job; } if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: test 1 fail - " "not enough idle resources"); } /*** Step 2 ***/ bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); for (jp_ptr = cr_part_ptr; jp_ptr; jp_ptr = jp_ptr->next) { if (jp_ptr->part_ptr == job_ptr->part_ptr) break; } if (!jp_ptr) { fatal("select/serial: could not find partition for job %u", job_ptr->job_id); } /* remove existing allocations (jobs) from higher-priority partitions * from avail_cores */ for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) { if (p_ptr->part_ptr->priority <= jp_ptr->part_ptr->priority) continue; if (!p_ptr->row) continue; for (i = 0; i < p_ptr->num_rows; i++) { if (!p_ptr->row[i].row_bitmap) continue; bit_copybits(tmpcore, p_ptr->row[i].row_bitmap); bit_not(tmpcore); /* set bits now "free" resources */ bit_and(free_cores, tmpcore); } } /* make these changes permanent */ bit_copybits(avail_cores, free_cores); cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (!cpu_count) { /* job needs resources that are currently in use by * higher-priority jobs, so fail for now */ if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: test 2 fail - " "resources busy with higher priority jobs"); } goto alloc_job; } xfree(cpu_count); if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: test 2 pass - " "available resources for this priority"); } /*** Step 3 ***/ bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); /* remove existing allocations (jobs) from same-priority partitions * from avail_cores */ for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) { if (p_ptr->part_ptr->priority != jp_ptr->part_ptr->priority) continue; if (!p_ptr->row) continue; for (i = 0; i < p_ptr->num_rows; i++) { if (!p_ptr->row[i].row_bitmap) continue; bit_copybits(tmpcore, p_ptr->row[i].row_bitmap); bit_not(tmpcore); /* set bits now "free" resources */ bit_and(free_cores, tmpcore); } } cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (cpu_count) { /* jobs from low-priority partitions are the only thing left * in our way. for now we'll ignore them, but FIXME: we need * a good placement algorithm here that optimizes "job overlap" * between this job (in these idle nodes) and the low-priority * jobs */ if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: test 3 pass - " "found resources"); } goto alloc_job; } if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: test 3 fail - " "not enough idle resources in same priority"); } /*** Step 4 ***/ /* try to fit the job into an existing row * * tmpcore = worker core_bitmap * free_cores = core_bitmap to be built * avail_cores = static core_bitmap of all available cores */ if (jp_ptr->row == NULL) { /* there's no existing jobs in this partition, so place * the job in avail_cores. 
FIXME: still need a good * placement algorithm here that optimizes "job overlap" * between this job (in these idle nodes) and existing * jobs in the other partitions with <= priority to * this partition */ bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: test 4 pass - " "first row found"); } goto alloc_job; } cr_sort_part_rows(jp_ptr); c = jp_ptr->num_rows; if (job_node_req != NODE_CR_AVAILABLE) c = 1; for (i = 0; i < c; i++) { if (!jp_ptr->row[i].row_bitmap) break; bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); bit_copybits(tmpcore, jp_ptr->row[i].row_bitmap); bit_not(tmpcore); bit_and(free_cores, tmpcore); cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (cpu_count) { if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: " "test 4 pass - row %i", i); } break; } if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: " "test 4 fail - row %i", i); } } if ((i < c) && !jp_ptr->row[i].row_bitmap) { /* we've found an empty row, so use it */ bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: " "test 4 trying empty row %i",i); } cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); } if (!cpu_count) { /* job can't fit into any row, so exit */ if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: test 4 fail - " "busy partition"); } goto alloc_job; } /*** CONSTRUCTION ZONE FOR STEPs 5 AND 6 *** * Note that while the job may have fit into a row, it should * still be run through a good placement algorithm here that * optimizes "job overlap" between this job (in these idle nodes) * and existing jobs in the other partitions with <= priority to * this partition */ alloc_job: /* at this point we've found a good set of * bits to allocate to this job: * - bitmap is the set of nodes to allocate * - free_cores is the set of allocated cores * - cpu_count is the number of cpus per allocated node * * Next steps are to cleanup the worker variables, * create the job_resources struct, * distribute the job on the bits, and exit */ FREE_NULL_BITMAP(orig_map); FREE_NULL_BITMAP(avail_cores); FREE_NULL_BITMAP(tmpcore); if (!cpu_count) { /* we were sent here to cleanup and exit */ FREE_NULL_BITMAP(free_cores); if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: exiting cr_job_test with no " "allocation"); } return SLURM_ERROR; } /* At this point we have: * - a bitmap of selected nodes * - a free_cores bitmap of usable cores on each selected node * - a per-alloc-node cpu_count array */ if ((mode != SELECT_MODE_WILL_RUN) && (job_ptr->part_ptr == NULL)) error_code = EINVAL; if ((error_code == SLURM_SUCCESS) && (mode == SELECT_MODE_WILL_RUN)) job_ptr->total_cpus = 1; if ((error_code != SLURM_SUCCESS) || (mode != SELECT_MODE_RUN_NOW)) { FREE_NULL_BITMAP(free_cores); xfree(cpu_count); return error_code; } n = bit_ffs(bitmap); if (n < 0) { FREE_NULL_BITMAP(free_cores); xfree(cpu_count); return error_code; } if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: distributing job %u", job_ptr->job_id); } /** create the struct_job_res **/ job_res = create_job_resources(); 
job_res->node_bitmap = bit_copy(bitmap); job_res->nodes = bitmap2node_name(bitmap); if (job_res->node_bitmap == NULL) fatal("bit_copy malloc failure"); job_res->nhosts = bit_set_count(bitmap); job_res->ncpus = job_res->nhosts; if (job_ptr->details->ntasks_per_node) job_res->ncpus *= details_ptr->ntasks_per_node; job_res->ncpus = MAX(job_res->ncpus, details_ptr->min_cpus); job_res->ncpus = MAX(job_res->ncpus, details_ptr->pn_min_cpus); job_res->node_req = job_node_req; job_res->cpus = cpu_count; job_res->cpus_used = xmalloc(job_res->nhosts * sizeof(uint16_t)); job_res->memory_allocated = xmalloc(job_res->nhosts * sizeof(uint32_t)); job_res->memory_used = xmalloc(job_res->nhosts * sizeof(uint32_t)); /* store the hardware data for the selected nodes */ error_code = build_job_resources(job_res, node_record_table_ptr, select_fast_schedule); if (error_code != SLURM_SUCCESS) { free_job_resources(&job_res); FREE_NULL_BITMAP(free_cores); return error_code; } c = 0; csize = bit_size(job_res->core_bitmap); j = cr_get_coremap_offset(n); k = cr_get_coremap_offset(n + 1); for (; j < k; j++, c++) { if (!bit_test(free_cores, j)) continue; if (c >= csize) { error("select/serial: cr_job_test " "core_bitmap index error on node %s", select_node_record[n].node_ptr->name); drain_nodes(select_node_record[n].node_ptr->name, "Bad core count", getuid()); free_job_resources(&job_res); FREE_NULL_BITMAP(free_cores); return SLURM_ERROR; } bit_set(job_res->core_bitmap, c); break; } if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("select/serial: cr_job_test: job %u ncpus %u cbits %u/%d " "nbits %u", job_ptr->job_id, job_res->ncpus, bit_set_count(free_cores), 1, job_res->nhosts); } FREE_NULL_BITMAP(free_cores); /* distribute the tasks and clear any unused cores */ job_ptr->job_resrcs = job_res; error_code = cr_dist(job_ptr, cr_type); if (error_code != SLURM_SUCCESS) { free_job_resources(&job_ptr->job_resrcs); return error_code; } /* translate job_res->cpus array into format with rep count */ job_ptr->total_cpus = build_job_resources_cpu_array(job_res); if (!(cr_type & CR_MEMORY)) return error_code; /* load memory allocated array */ save_mem = details_ptr->pn_min_memory; if (save_mem & MEM_PER_CPU) { /* memory is per-cpu */ save_mem &= (~MEM_PER_CPU); job_res->memory_allocated[0] = job_res->cpus[0] * save_mem; } else { /* memory is per-node */ job_res->memory_allocated[0] = save_mem; } return error_code; }
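/* * Steps 1-4 above repeatedly carve existing allocations out of the candidate core set with the same three calls: copy a row's core bitmap into a scratch buffer, invert it, and AND it into free_cores. A compact sketch of that idiom (assumes src/common/bitstring.h; the helper name is hypothetical). */
static void _subtract_row_cores(bitstr_t *free_cores, bitstr_t *tmpcore,
				bitstr_t *row_bitmap)
{
	bit_copybits(tmpcore, row_bitmap);	/* tmpcore = cores used by this row */
	bit_not(tmpcore);			/* tmpcore = cores NOT in use       */
	bit_and(free_cores, tmpcore);		/* keep only the still-free cores   */
}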
/* * Given a job step request, return an equivalent local bitmap for this node * IN req - The job step launch request * OUT hw_sockets - number of actual sockets on this node * OUT hw_cores - number of actual cores per socket on this node * OUT hw_threads - number of actual threads per core on this node * RET: bitmap of processors available to this job step on this node * OR NULL on error */ static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req, uint16_t *hw_sockets, uint16_t *hw_cores, uint16_t *hw_threads) { bitstr_t *req_map, *hw_map; slurm_cred_arg_t arg; uint16_t p, t, new_p, num_cpus, sockets, cores; int job_node_id; int start; char *str; *hw_sockets = conf->sockets; *hw_cores = conf->cores; *hw_threads = conf->threads; if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) { error("task/affinity: job lacks a credential"); return NULL; } /* we need this node's ID in relation to the whole * job allocation, not just this jobstep */ job_node_id = nodelist_find(arg.job_hostlist, conf->node_name); start = _get_local_node_info(&arg, job_node_id, &sockets, &cores); if (start < 0) { error("task/affinity: missing node %d in job credential", job_node_id); slurm_cred_free_args(&arg); return NULL; } debug3("task/affinity: slurmctld s %u c %u; hw s %u c %u t %u", sockets, cores, *hw_sockets, *hw_cores, *hw_threads); num_cpus = MIN((sockets * cores), ((*hw_sockets)*(*hw_cores))); req_map = (bitstr_t *) bit_alloc(num_cpus); hw_map = (bitstr_t *) bit_alloc(conf->block_map_size); /* Transfer core_bitmap data to local req_map. * The MOD function handles the case where fewer processes * physically exist than are configured (slurmd is out of * sync with the slurmctld daemon). */ for (p = 0; p < (sockets * cores); p++) { if (bit_test(arg.step_core_bitmap, start+p)) bit_set(req_map, (p % num_cpus)); } str = (char *)bit_fmt_hexmask(req_map); debug3("task/affinity: job %u.%u CPU mask from slurmctld: %s", req->job_id, req->job_step_id, str); xfree(str); for (p = 0; p < num_cpus; p++) { if (bit_test(req_map, p) == 0) continue; /* If we are pretending we have a larger system than we really have this is needed to make sure we don't bust the bank. */ new_p = p % conf->block_map_size; /* core_bitmap does not include threads, so we * add them here but limit them to what the job * requested */ for (t = 0; t < (*hw_threads); t++) { uint16_t bit = new_p * (*hw_threads) + t; bit %= conf->block_map_size; bit_set(hw_map, bit); } } str = (char *)bit_fmt_hexmask(hw_map); debug3("task/affinity: job %u.%u CPU final mask for local node: %s", req->job_id, req->job_step_id, str); xfree(str); FREE_NULL_BITMAP(req_map); slurm_cred_free_args(&arg); return hw_map; }
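/* * The mapping above folds the controller's core indices into the local block map with '%' and then expands each selected core into its hardware threads. A plain-C sketch of just that index arithmetic; function and parameter names are illustrative, and one byte per CPU stands in for the bitstring. */
static void mark_core_threads(unsigned char *hw_map, unsigned map_size,
			      unsigned core_inx, unsigned hw_threads)
{
	unsigned new_p = core_inx % map_size;	/* fold if slurmd has fewer CPUs */
	unsigned t;

	for (t = 0; t < hw_threads; t++) {
		unsigned bit = (new_p * hw_threads + t) % map_size;
		hw_map[bit] = 1;		/* mark this logical CPU as usable */
	}
}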
/* Execute control sequence. */ int input_csi_dispatch(struct input_ctx *ictx) { struct screen_write_ctx *sctx = &ictx->ctx; struct screen *s = sctx->s; struct input_table_entry *entry; int n, m; if (ictx->flags & INPUT_DISCARD) return (0); if (input_split(ictx) != 0) return (0); log_debug("%s: '%c' \"%s\" \"%s\"", __func__, ictx->ch, ictx->interm_buf, ictx->param_buf); entry = bsearch(ictx, input_csi_table, nitems(input_csi_table), sizeof input_csi_table[0], input_table_compare); if (entry == NULL) { log_debug("%s: unknown '%c'", __func__, ictx->ch); return (0); } switch (entry->type) { case INPUT_CSI_CBT: /* Find the previous tab point, n times. */ n = input_get(ictx, 0, 1, 1); while (s->cx > 0 && n-- > 0) { do s->cx--; while (s->cx > 0 && !bit_test(s->tabs, s->cx)); } break; case INPUT_CSI_CUB: screen_write_cursorleft(sctx, input_get(ictx, 0, 1, 1)); break; case INPUT_CSI_CUD: screen_write_cursordown(sctx, input_get(ictx, 0, 1, 1)); break; case INPUT_CSI_CUF: screen_write_cursorright(sctx, input_get(ictx, 0, 1, 1)); break; case INPUT_CSI_CUP: n = input_get(ictx, 0, 1, 1); m = input_get(ictx, 1, 1, 1); screen_write_cursormove(sctx, m - 1, n - 1); break; case INPUT_CSI_WINOPS: input_csi_dispatch_winops(ictx); break; case INPUT_CSI_CUU: screen_write_cursorup(sctx, input_get(ictx, 0, 1, 1)); break; case INPUT_CSI_CNL: screen_write_carriagereturn(sctx); screen_write_cursordown(sctx, input_get(ictx, 0, 1, 1)); break; case INPUT_CSI_CPL: screen_write_carriagereturn(sctx); screen_write_cursorup(sctx, input_get(ictx, 0, 1, 1)); break; case INPUT_CSI_DA: switch (input_get(ictx, 0, 0, 0)) { case 0: input_reply(ictx, "\033[?1;2c"); break; default: log_debug("%s: unknown '%c'", __func__, ictx->ch); break; } break; case INPUT_CSI_DA_TWO: switch (input_get(ictx, 0, 0, 0)) { case 0: input_reply(ictx, "\033[>84;0;0c"); break; default: log_debug("%s: unknown '%c'", __func__, ictx->ch); break; } break; case INPUT_CSI_ECH: screen_write_clearcharacter(sctx, input_get(ictx, 0, 1, 1)); break; case INPUT_CSI_DCH: screen_write_deletecharacter(sctx, input_get(ictx, 0, 1, 1)); break; case INPUT_CSI_DECSTBM: n = input_get(ictx, 0, 1, 1); m = input_get(ictx, 1, 1, screen_size_y(s)); screen_write_scrollregion(sctx, n - 1, m - 1); break; case INPUT_CSI_DL: screen_write_deleteline(sctx, input_get(ictx, 0, 1, 1)); break; case INPUT_CSI_DSR: switch (input_get(ictx, 0, 0, 0)) { case 5: input_reply(ictx, "\033[0n"); break; case 6: input_reply(ictx, "\033[%u;%uR", s->cy + 1, s->cx + 1); break; default: log_debug("%s: unknown '%c'", __func__, ictx->ch); break; } break; case INPUT_CSI_ED: switch (input_get(ictx, 0, 0, 0)) { case 0: screen_write_clearendofscreen(sctx); break; case 1: screen_write_clearstartofscreen(sctx); break; case 2: screen_write_clearscreen(sctx); break; case 3: switch (input_get(ictx, 1, 0, 0)) { case 0: /* * Linux console extension to clear history * (for example before locking the screen). 
*/ screen_write_clearhistory(sctx); break; } break; default: log_debug("%s: unknown '%c'", __func__, ictx->ch); break; } break; case INPUT_CSI_EL: switch (input_get(ictx, 0, 0, 0)) { case 0: screen_write_clearendofline(sctx); break; case 1: screen_write_clearstartofline(sctx); break; case 2: screen_write_clearline(sctx); break; default: log_debug("%s: unknown '%c'", __func__, ictx->ch); break; } break; case INPUT_CSI_HPA: n = input_get(ictx, 0, 1, 1); screen_write_cursormove(sctx, n - 1, s->cy); break; case INPUT_CSI_ICH: screen_write_insertcharacter(sctx, input_get(ictx, 0, 1, 1)); break; case INPUT_CSI_IL: screen_write_insertline(sctx, input_get(ictx, 0, 1, 1)); break; case INPUT_CSI_RCP: memcpy(&ictx->cell, &ictx->old_cell, sizeof ictx->cell); screen_write_cursormove(sctx, ictx->old_cx, ictx->old_cy); break; case INPUT_CSI_RM: input_csi_dispatch_rm(ictx); break; case INPUT_CSI_RM_PRIVATE: input_csi_dispatch_rm_private(ictx); break; case INPUT_CSI_SCP: memcpy(&ictx->old_cell, &ictx->cell, sizeof ictx->old_cell); ictx->old_cx = s->cx; ictx->old_cy = s->cy; break; case INPUT_CSI_SGR: input_csi_dispatch_sgr(ictx); break; case INPUT_CSI_SM: input_csi_dispatch_sm(ictx); break; case INPUT_CSI_SM_PRIVATE: input_csi_dispatch_sm_private(ictx); break; case INPUT_CSI_TBC: switch (input_get(ictx, 0, 0, 0)) { case 0: if (s->cx < screen_size_x(s)) bit_clear(s->tabs, s->cx); break; case 3: bit_nclear(s->tabs, 0, screen_size_x(s) - 1); break; default: log_debug("%s: unknown '%c'", __func__, ictx->ch); break; } break; case INPUT_CSI_VPA: n = input_get(ictx, 0, 1, 1); screen_write_cursormove(sctx, s->cx, n - 1); break; case INPUT_CSI_DECSCUSR: n = input_get(ictx, 0, 0, 0); screen_set_cursor_style(s, n); break; } return (0); }
/* Sync up the core_bitmap with the CPU array using cyclic distribution * * The CPU array contains the distribution of CPUs, which can include * virtual CPUs (hyperthreads) */ static int _cyclic_sync_core_bitmap(struct job_record *job_ptr, const uint16_t cr_type, bool preempt_mode) { uint32_t c, i, j, s, n, *sock_start, *sock_end, size, csize, core_cnt; uint16_t cps = 0, cpus, vpus, sockets, sock_size; job_resources_t *job_res = job_ptr->job_resrcs; bitstr_t *core_map; bool *sock_used, *sock_avoid; bool alloc_cores = false, alloc_sockets = false; uint16_t ntasks_per_socket = 0xffff; uint16_t ntasks_per_core = 0xffff; int error_code = SLURM_SUCCESS; int tmp_cpt = 0; /* cpus_per_task */ if ((job_res == NULL) || (job_res->core_bitmap == NULL) || (job_ptr->details == NULL)) return error_code; if (cr_type & CR_SOCKET) alloc_sockets = true; else if (cr_type & CR_CORE) alloc_cores = true; core_map = job_res->core_bitmap; if (job_ptr->details->mc_ptr) { multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr; if ((mc_ptr->ntasks_per_core != (uint16_t) INFINITE) && (mc_ptr->ntasks_per_core)) { ntasks_per_core = mc_ptr->ntasks_per_core; } if (mc_ptr->ntasks_per_socket) ntasks_per_socket = mc_ptr->ntasks_per_socket; } sock_size = select_node_record[0].sockets; sock_avoid = xmalloc(sock_size * sizeof(bool)); sock_start = xmalloc(sock_size * sizeof(uint32_t)); sock_end = xmalloc(sock_size * sizeof(uint32_t)); sock_used = xmalloc(sock_size * sizeof(bool)); size = bit_size(job_res->node_bitmap); csize = bit_size(core_map); for (c = 0, i = 0, n = 0; n < size; n++) { if (bit_test(job_res->node_bitmap, n) == 0) continue; sockets = select_node_record[n].sockets; cps = select_node_record[n].cores; vpus = cr_cpus_per_core(job_ptr->details, n); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("DEBUG: job %u node %s vpus %u cpus %u", job_ptr->job_id, select_node_record[n].node_ptr->name, vpus, job_res->cpus[i]); } if ((c + (sockets * cps)) > csize) fatal("cons_res: _cyclic_sync_core_bitmap index error"); if (sockets > sock_size) { sock_size = sockets; xrealloc(sock_avoid, sock_size * sizeof(bool)); xrealloc(sock_start, sock_size * sizeof(uint32_t)); xrealloc(sock_end, sock_size * sizeof(uint32_t)); xrealloc(sock_used, sock_size * sizeof(bool)); } for (s = 0; s < sockets; s++) { sock_start[s] = c + (s * cps); sock_end[s] = sock_start[s] + cps; sock_avoid[s] = false; sock_used[s] = false; } core_cnt = 0; cpus = job_res->cpus[i]; if (ntasks_per_socket != 0xffff) { int x_cpus, cpus_per_socket; uint32_t total_cpus = 0; uint32_t *cpus_cnt; cpus_per_socket = ntasks_per_socket * job_ptr->details->cpus_per_task; cpus_cnt = xmalloc(sizeof(uint32_t) * sockets); for (s = 0; s < sockets; s++) { for (j = sock_start[s]; j < sock_end[s]; j++) { if (bit_test(core_map, j)) cpus_cnt[s] += vpus; } total_cpus += cpus_cnt[s]; } for (s = 0; s < sockets && total_cpus > cpus; s++) { if (cpus_cnt[s] > cpus_per_socket) { x_cpus = cpus_cnt[s] - cpus_per_socket; cpus_cnt[s] = cpus_per_socket; total_cpus -= x_cpus; } } for (s = 0; s < sockets && total_cpus > cpus; s++) { if ((cpus_cnt[s] <= cpus_per_socket) && (total_cpus - cpus_cnt[s] >= cpus)) { sock_avoid[s] = true; total_cpus -= cpus_cnt[s]; } } xfree(cpus_cnt); } else if (job_ptr->details->cpus_per_task > 1) { /* CLANG false positive */ /* Try to pack all CPUs of each tasks on one socket. 
*/ uint32_t *cpus_cnt, cpus_per_task; cpus_per_task = job_ptr->details->cpus_per_task; cpus_cnt = xmalloc(sizeof(uint32_t) * sockets); for (s = 0; s < sockets; s++) { for (j = sock_start[s]; j < sock_end[s]; j++) { if (bit_test(core_map, j)) cpus_cnt[s] += vpus; } cpus_cnt[s] -= (cpus_cnt[s] % cpus_per_task); } tmp_cpt = cpus_per_task; for (s = 0; ((s < sockets) && (cpus > 0)); s++) { while ((sock_start[s] < sock_end[s]) && (cpus_cnt[s] > 0) && (cpus > 0)) { if (bit_test(core_map, sock_start[s])) { int used; sock_used[s] = true; core_cnt++; if ((ntasks_per_core == 1) && (cpus_per_task > vpus)) { used = MIN(tmp_cpt, vpus); if (tmp_cpt <= used) tmp_cpt = cpus_per_task; else tmp_cpt -= used; } else used = vpus; if (cpus_cnt[s] < vpus) cpus_cnt[s] = 0; else cpus_cnt[s] -= used; if (cpus < vpus) cpus = 0; else cpus -= used; } sock_start[s]++; } } xfree(cpus_cnt); } while (cpus > 0) { uint16_t prev_cpus = cpus; for (s = 0; s < sockets && cpus > 0; s++) { if (sock_avoid[s]) continue; while (sock_start[s] < sock_end[s]) { if (bit_test(core_map, sock_start[s])) { sock_used[s] = true; core_cnt++; break; } else sock_start[s]++; } if (sock_start[s] == sock_end[s]) /* this socket is unusable */ continue; if (cpus < vpus) cpus = 0; else cpus -= vpus; sock_start[s]++; } if (prev_cpus != cpus) continue; if (!preempt_mode) { /* we're stuck! */ job_ptr->priority = 0; job_ptr->state_reason = WAIT_HELD; error("%s: sync loop not progressing on node %s, holding job %u", __func__, select_node_record[n].node_ptr->name, job_ptr->job_id); } error_code = SLURM_ERROR; goto fini; } /* clear the rest of the cores in each socket * FIXME: do we need min_core/min_socket checks here? */ for (s = 0; s < sockets; s++) { if (sock_start[s] == sock_end[s]) continue; if (!alloc_sockets || !sock_used[s]) { bit_nclear(core_map, sock_start[s], sock_end[s]-1); } if ((select_node_record[n].vpus >= 1) && (alloc_sockets || alloc_cores) && sock_used[s]) { for (j=sock_start[s]; j<sock_end[s]; j++) { /* Mark all cores as used */ if (alloc_sockets) bit_set(core_map, j); if (bit_test(core_map, j)) core_cnt++; } } } if ((alloc_cores || alloc_sockets) && (select_node_record[n].vpus >= 1)) { job_res->cpus[i] = core_cnt * select_node_record[n].vpus; } i++; /* advance 'c' to the beginning of the next node */ c += sockets * cps; } fini: xfree(sock_avoid); xfree(sock_start); xfree(sock_end); xfree(sock_used); return error_code; }
/* To effectively deal with heterogeneous nodes, we fake a cyclic * distribution to figure out how many cpus are needed on each node. * * This routine is a slightly modified "version" of the routine * _task_layout_block in src/common/dist_tasks.c. We do not need to * assign tasks to job->hostid[] and job->tids[][] at this point so * the cpu allocation is the same for cyclic and block. * * For the consumable resources support we need to determine what * "node/CPU/Core/thread"-tuplets will be allocated for a given job. * In the past we assumed that we only allocated one task per CPU (at * that point the lowest level of logical processor) and didn't allow * the use of overcommit. We have changed this philosophy and are now * allowing people to overcommit their resources and expect the system * administrator to enable the task/affinity plug-in which will then * bind all of a job's tasks to its allocated resources thereby * avoiding interference between co-allocated running jobs. * * In the consumable resources environment we need to determine the * layout schema within slurmctld. * * We have a core_bitmap of all available cores. All we're doing here * is removing cores that are not needed based on the task count, and * the choice of cores to remove is based on the distribution: * - "cyclic" removes cores "evenly", starting from the last socket, * - "block" removes cores from the "last" socket(s) * - "plane" removes cores "in chunks" * * IN job_ptr - job to be allocated resources * IN cr_type - allocation type (sockets, cores, etc.) * IN preempt_mode - true if testing with simulated preempted jobs */ extern int cr_dist(struct job_record *job_ptr, const uint16_t cr_type, bool preempt_mode) { int error_code, cr_cpu = 1; if (job_ptr->details->core_spec != (uint16_t) NO_VAL) { /* The job has been allocated all non-specialized cores, * so we don't need to select specific CPUs. */ return SLURM_SUCCESS; } if ((job_ptr->job_resrcs->node_req == NODE_CR_RESERVED) || (job_ptr->details->whole_node == 1)) { int n, i; job_resources_t *job_res = job_ptr->job_resrcs; /* The job has been allocated an EXCLUSIVE set of nodes, * so it gets all of the bits in the core_bitmap and * all of the available CPUs in the cpus array. */ int size = bit_size(job_res->core_bitmap); bit_nset(job_res->core_bitmap, 0, size-1); /* Up to this point we might not have the job_res pointer have * the right cpu count. It is most likely a core count. We * will fix that so we can layout tasks correctly. 
*/ size = bit_size(job_res->node_bitmap); for (i = 0, n = bit_ffs(job_res->node_bitmap); n < size; n++) { if (bit_test(job_res->node_bitmap, n) == 0) continue; job_res->cpus[i++] = select_node_record[n].cpus; } return SLURM_SUCCESS; } _log_select_maps("cr_dist/start", job_ptr->job_resrcs->node_bitmap, job_ptr->job_resrcs->core_bitmap); if ((job_ptr->details->task_dist & SLURM_DIST_STATE_BASE) == SLURM_DIST_PLANE) { /* perform a plane distribution on the 'cpus' array */ error_code = _compute_plane_dist(job_ptr); if (error_code != SLURM_SUCCESS) { error("cons_res: cr_dist: Error in " "_compute_plane_dist"); return error_code; } } else { /* perform a cyclic distribution on the 'cpus' array */ error_code = _compute_c_b_task_dist(job_ptr); if (error_code != SLURM_SUCCESS) { error("cons_res: cr_dist: Error in " "_compute_c_b_task_dist"); return error_code; } } /* now sync up the core_bitmap with the allocated 'cpus' array * based on the given distribution AND resource setting */ if ((cr_type & CR_CORE) || (cr_type & CR_SOCKET)) cr_cpu = 0; if (cr_cpu) { _block_sync_core_bitmap(job_ptr, cr_type); return SLURM_SUCCESS; } /* * If SelectTypeParameters mentions to use a block distribution for * cores by default, use that kind of distribution if no particular * cores distribution specified. * Note : cyclic cores distribution, which is the default, is treated * by the next code block */ if ( slurmctld_conf.select_type_param & CR_CORE_DEFAULT_DIST_BLOCK ) { switch(job_ptr->details->task_dist & SLURM_DIST_NODEMASK) { case SLURM_DIST_ARBITRARY: case SLURM_DIST_BLOCK: case SLURM_DIST_CYCLIC: case SLURM_DIST_UNKNOWN: _block_sync_core_bitmap(job_ptr, cr_type); return SLURM_SUCCESS; } } /* Determine the number of logical processors per node needed * for this job. Make sure below matches the layouts in * lllp_distribution in plugins/task/affinity/dist_task.c (FIXME) */ switch(job_ptr->details->task_dist & SLURM_DIST_NODESOCKMASK) { case SLURM_DIST_BLOCK_BLOCK: case SLURM_DIST_CYCLIC_BLOCK: case SLURM_DIST_PLANE: _block_sync_core_bitmap(job_ptr, cr_type); break; case SLURM_DIST_ARBITRARY: case SLURM_DIST_BLOCK: case SLURM_DIST_CYCLIC: case SLURM_DIST_BLOCK_CYCLIC: case SLURM_DIST_CYCLIC_CYCLIC: case SLURM_DIST_BLOCK_CFULL: case SLURM_DIST_CYCLIC_CFULL: case SLURM_DIST_UNKNOWN: error_code = _cyclic_sync_core_bitmap(job_ptr, cr_type, preempt_mode); break; default: error("select/cons_res: invalid task_dist entry"); return SLURM_ERROR; } _log_select_maps("cr_dist/fini", job_ptr->job_resrcs->node_bitmap, job_ptr->job_resrcs->core_bitmap); return error_code; }
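/* * The header comment before cr_dist() contrasts "block" (fill one socket before moving to the next) with "cyclic" (round-robin over the sockets). The standalone sketch below prints the two pick orders for one node; it is illustrative only and shares no code with the plugin. */
#include <stdbool.h>
#include <stdio.h>

static void pick_cores(int sockets, int cores, int need, bool cyclic)
{
	int s = 0, c = 0, picked = 0;

	while ((picked < need) && (picked < sockets * cores)) {
		printf("core %d (socket %d)\n", (s * cores) + c, s);
		picked++;
		if (cyclic) {			/* next socket, same core slot */
			if (++s == sockets) {
				s = 0;
				c++;
			}
		} else {			/* same socket, next core slot */
			if (++c == cores) {
				c = 0;
				s++;
			}
		}
	}
}

int main(void)
{
	pick_cores(2, 4, 4, false);	/* block:  cores 0 1 2 3, all on socket 0 */
	pick_cores(2, 4, 4, true);	/* cyclic: cores 0 4 1 5, alternating     */
	return 0;
}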
static int retrieve_segment(struct segment * seg) { char proc[256]; snprintf(proc, 256, "rsg%u", g_nodeId); prctl(PR_SET_NAME, proc, 0, 0, 0); int rv = -1; log_print(g_log, "retrieve_segment: trying to retrieve segment %s[%d - %d].", seg->name->full_name, 0, seg->num_chunks-1); pthread_mutex_lock(&g_lock); int retries = g_interest_attempts; int timeout_ms = g_timeout_ms; pthread_mutex_unlock(&g_lock); int ttl = MAX_TTL; if ((seg->opts->mode & CCNFDNB_USE_RETRIES) == CCNFDNB_USE_RETRIES) { retries = seg->opts->retries; } if ((seg->opts->mode & CCNFDNB_USE_TIMEOUT) == CCNFDNB_USE_TIMEOUT) { timeout_ms = seg->opts->timeout_ms; } if ((seg->opts->mode & CCNFDNB_USE_TTL) == CCNFDNB_USE_TTL) { ttl = seg->opts->ttl; } struct chunk chunk_window[MAX_INTEREST_PIPELINE]; PENTRY _pit_handles[MAX_INTEREST_PIPELINE]; int pit_to_chunk[PIT_SIZE]; memset(&pit_to_chunk, 0, sizeof(pit_to_chunk)); struct bitmap * window = bit_create(MAX_INTEREST_PIPELINE); struct bitmap * missing = bit_create(seg->num_chunks); char str[MAX_NAME_LENGTH], comp[MAX_NAME_LENGTH]; strncpy(str, seg->name->full_name, seg->name->len); int rtt_est = timeout_ms; int cwnd = 1; int ssthresh = DEFAULT_INTEREST_PIPELINE; int fullfilled = 0; int min_rtt_est = 10; int current_chunk = 0; cc_state state = SLOW_START; int tx; _segment_q_t seg_q; pthread_mutex_init(&seg_q.mutex, NULL); pthread_cond_init(&seg_q.cond, NULL); seg_q.rcv_window = 0; seg_q.max_window = &cwnd; seg_q.rcv_chunks = linked_list_init(NULL); seg_q.base = seg->name; ccnfdnl_reg_segment(&seg_q); int i; window->num_bits = cwnd; while (!bit_allSet(missing)) { tx = cwnd; window->num_bits = cwnd; log_debug(g_log, "state = %d, cwnd = %d, ssthresh = %d rtt_est = %d", state, cwnd, ssthresh, rtt_est); while (tx && (current_chunk < seg->num_chunks)) { snprintf(comp, MAX_NAME_LENGTH - seg->name->len, "/%d", current_chunk); strncpy(str + seg->name->len, comp, seg->name->len); i = bit_find(window); if (i < 0 || i >= MAX_INTEREST_PIPELINE) { /* we must still be waiting for data */ break; } chunk_window[i].intr.name = content_name_create(str); chunk_window[i].intr.ttl = ttl; chunk_window[i].seq_no = current_chunk; chunk_window[i].retries = retries; _pit_handles[i] = PIT_get_handle(chunk_window[i].intr.name); if (!_pit_handles[i]) { bit_clear(window, i); break; } pit_to_chunk[_pit_handles[i]->index] = i; pthread_mutex_unlock(_pit_handles[i]->mutex); struct content_obj * co = CS_get(chunk_window[i].intr.name); if (!co) { log_debug(g_log, "expressing new interest: %s", chunk_window[i].intr.name->full_name); ccnfdnb_fwd_interest(&chunk_window[i].intr); tx--; } else { log_debug(g_log, "retrieved %s from CS", co->name->full_name); PENTRY pe = PIT_exact_match(chunk_window[i].intr.name); *pe->obj = co; pthread_mutex_unlock(pe->mutex); pthread_mutex_lock(&seg_q.mutex); linked_list_append(seg_q.rcv_chunks, pe); seg_q.rcv_window++; pthread_mutex_unlock(&seg_q.mutex); } current_chunk++; } log_debug(g_log, "tx window full"); pthread_mutex_lock(&seg_q.mutex); if (seg_q.rcv_chunks->len == 0) { struct timespec wait; ts_fromnow(&wait); ts_addms(&wait, 2 * rtt_est); rv = pthread_cond_timedwait(&seg_q.cond, &seg_q.mutex, &wait); if ((rv == ETIMEDOUT) && !seg_q.rcv_chunks->len) { /* we timed out, we need to rtx */ rtt_est += rtt_est / 2; if (rtt_est > PIT_LIFETIME_MS) rtt_est = PIT_LIFETIME_MS / 2; } } else { int pit_ages = 0; int pits_fulfilled = 0; while (seg_q.rcv_chunks->len > 0) { PENTRY pe = linked_list_remove(seg_q.rcv_chunks, 0); log_assert(g_log, pe != NULL, "invalid pit entry"); 
pthread_mutex_lock(pe->mutex); log_debug(g_log, "pit entry %s fulfilled", (*pe->obj)->name->full_name); int chunk_id = pit_to_chunk[pe->index]; log_assert(g_log, chunk_id >= 0, "invalid chunk id"); if (chunk_window[chunk_id].seq_no == 0) { seg->obj->publisher = (*pe->obj)->publisher; seg->obj->timestamp = (*pe->obj)->timestamp; seg->chunk_size = (*pe->obj)->size; } int offset = chunk_window[chunk_id].seq_no * seg->chunk_size; memcpy(&seg->obj->data[offset], (*pe->obj)->data, (*pe->obj)->size); content_obj_destroy(*pe->obj); struct timespec now; ts_fromnow(&now); ts_addms(&now, PIT_LIFETIME_MS); pit_ages += PIT_age(pe); pits_fulfilled++; pit_to_chunk[pe->index] = -1; PIT_release(pe); free(_pit_handles[chunk_id]); _pit_handles[chunk_id] = NULL; bit_clear(window, chunk_id); bit_set(missing, chunk_window[chunk_id].seq_no); log_debug(g_log, "retrieved chunk %s", chunk_window[chunk_id].intr.name->full_name); content_name_delete(chunk_window[chunk_id].intr.name); chunk_window[chunk_id].intr.name = NULL; cwnd++; if (state == CONG_AVOID) fullfilled++; } rtt_est -= floor(pit_ages / pits_fulfilled); if (rtt_est < min_rtt_est) rtt_est = min_rtt_est; } pthread_mutex_unlock(&seg_q.mutex); for (i = 0; i < MAX_INTEREST_PIPELINE; i++) { if (bit_test(window, i)) { if (!_pit_handles[i]) { continue; } pthread_mutex_lock(_pit_handles[i]->mutex); if (PIT_age(_pit_handles[i]) > (2 * rtt_est)) { PIT_refresh(_pit_handles[i]); chunk_window[i].retries--; ccnfdnb_fwd_interest(&chunk_window[i].intr); log_debug(g_log, "rtx interest: %s (rtt = %d)", chunk_window[i].intr.name->full_name, rtt_est); ssthresh = cwnd / 2 + 1; cwnd = 1; state = SLOW_START; } pthread_mutex_unlock(_pit_handles[i]->mutex); } } if ((cwnd >= ssthresh) && (state == SLOW_START)) state = CONG_AVOID; if (state == SLOW_START) fullfilled = 0; if ((fullfilled == cwnd) && (state == CONG_AVOID)) { cwnd++; fullfilled = 0; } if (cwnd > MAX_INTEREST_PIPELINE) cwnd = MAX_INTEREST_PIPELINE; /*log_debug(g_log, "cwnd = %d, ssthresh = %d", cwnd, ssthresh);*/ } log_debug(g_log, "retrieve_segment: finished for %s[%d-%d]", seg->name->full_name, 0, seg->num_chunks-1); rv = 0; PIT_print(); ccnfdnl_unreg_segment(&seg_q); pthread_mutex_destroy(&seg_q.mutex); pthread_cond_destroy(&seg_q.cond); while (seg_q.rcv_chunks->len) { PENTRY pe = linked_list_remove(seg_q.rcv_chunks, 0); content_obj_destroy(*pe->obj); PIT_release(pe); } bit_destroy(window); bit_destroy(missing); return rv; }
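/*
 * Illustrative sketch (not part of the ccnfd source): retrieve_segment grows
 * and shrinks its interest pipeline much like TCP congestion control. This
 * standalone sketch shows textbook slow-start / congestion-avoidance rules
 * only; the real loop above also bumps cwnd per fulfilled chunk and adjusts
 * the RTT estimate. Struct, helper names and example values are made up.
 */
#include <stdio.h>

enum cc_state { SLOW_START, CONG_AVOID };

struct cc { int cwnd, ssthresh, fulfilled; enum cc_state state; };

/* Called once per fulfilled interest (chunk received). */
static void cc_on_ack(struct cc *c, int max_window)
{
    if (c->state == SLOW_START) {
        c->cwnd++;                          /* exponential growth */
        if (c->cwnd >= c->ssthresh)
            c->state = CONG_AVOID;
    } else if (++c->fulfilled >= c->cwnd) {
        c->cwnd++;                          /* roughly +1 per full window */
        c->fulfilled = 0;
    }
    if (c->cwnd > max_window)
        c->cwnd = max_window;
}

/* Called when an interest times out and must be retransmitted. */
static void cc_on_timeout(struct cc *c)
{
    c->ssthresh = c->cwnd / 2 + 1;
    c->cwnd = 1;
    c->fulfilled = 0;
    c->state = SLOW_START;
}

int main(void)
{
    struct cc c = { 1, 8, 0, SLOW_START };
    for (int i = 0; i < 20; i++)
        cc_on_ack(&c, 32);
    printf("cwnd after 20 acks: %d\n", c.cwnd);
    cc_on_timeout(&c);
    printf("cwnd after timeout: %d (ssthresh %d)\n", c.cwnd, c.ssthresh);
    return 0;
}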
/* sync up core bitmap with new CPU count using a best-fit approach * on the available resources on each node * * "Best-fit" means: * 1st priority: Use smallest number of boards with sufficient * available CPUs * 2nd priority: Use smallest number of sockets with sufficient * available CPUs * 3rd priority: Use board combination with the smallest number * of available CPUs * 4th priority: Use higher-numbered boards/sockets/cores first * * The CPU array contains the distribution of CPUs, which can include * virtual CPUs (hyperthreads) */ static void _block_sync_core_bitmap(struct job_record *job_ptr, const uint16_t cr_type) { uint32_t c, s, i, j, n, b, z, size, csize, core_cnt; uint16_t cpus, num_bits, vpus = 1; uint16_t cpus_per_task = job_ptr->details->cpus_per_task; job_resources_t *job_res = job_ptr->job_resrcs; bool alloc_cores = false, alloc_sockets = false; uint16_t ntasks_per_core = 0xffff; int tmp_cpt = 0; int count, cpu_min, b_min, elig, s_min, comb_idx, sock_idx; int elig_idx, comb_brd_idx, sock_list_idx, comb_min, board_num; int* boards_core_cnt; int* sort_brds_core_cnt; int* board_combs; int* socket_list; int* elig_brd_combs; int* elig_core_cnt; bool* sockets_used; uint16_t boards_nb; uint16_t nboards_nb; uint16_t sockets_nb; uint16_t ncores_nb; uint16_t nsockets_nb; uint16_t sock_per_brd; uint16_t sock_per_comb; uint16_t req_cores,best_fit_cores = 0; uint32_t best_fit_location = 0; uint64_t ncomb_brd; bool sufficient, best_fit_sufficient; if (!job_res) return; if (!job_res->core_bitmap) { error("%s: core_bitmap for job %u is NULL", __func__, job_ptr->job_id); return; } if (bit_ffs(job_res->core_bitmap) == -1) { error("%s: core_bitmap for job %u has no bits set", __func__, job_ptr->job_id); return; } if (cr_type & CR_SOCKET) alloc_sockets = true; else if (cr_type & CR_CORE) alloc_cores = true; if (job_ptr->details && job_ptr->details->mc_ptr) { multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr; if ((mc_ptr->ntasks_per_core != (uint16_t) INFINITE) && (mc_ptr->ntasks_per_core)) { ntasks_per_core = mc_ptr->ntasks_per_core; } } size = bit_size(job_res->node_bitmap); csize = bit_size(job_res->core_bitmap); sockets_nb = select_node_record[0].sockets; sockets_core_cnt = xmalloc(sockets_nb * sizeof(int)); sockets_used = xmalloc(sockets_nb * sizeof(bool)); boards_nb = select_node_record[0].boards; boards_core_cnt = xmalloc(boards_nb * sizeof(int)); sort_brds_core_cnt = xmalloc(boards_nb * sizeof(int)); for (c = 0, i = 0, n = 0; n < size; n++) { if (bit_test(job_res->node_bitmap, n) == 0) continue; core_cnt = 0; ncores_nb = select_node_record[n].cores; nsockets_nb = select_node_record[n].sockets; nboards_nb = select_node_record[n].boards; num_bits = nsockets_nb * ncores_nb; if ((c + num_bits) > csize) fatal("cons_res: _block_sync_core_bitmap index error"); cpus = job_res->cpus[i]; vpus = cr_cpus_per_core(job_ptr->details, n); /* compute still required cores on the node */ req_cores = cpus / vpus; if ( cpus % vpus ) req_cores++; /* figure out core cnt if task requires more than one core and * tasks_per_core is 1 */ if ((ntasks_per_core == 1) && (cpus_per_task > vpus)) { /* how many cores a task will consume */ int cores_per_task = (cpus_per_task + vpus - 1) / vpus; int tasks = cpus / cpus_per_task; req_cores = tasks * cores_per_task; } if (nboards_nb > MAX_BOARDS) { debug3("cons_res: node[%u]: exceeds max boards; " "doing best-fit across sockets only", n); nboards_nb = 1; } if ( nsockets_nb > sockets_nb) { sockets_nb = nsockets_nb; xrealloc(sockets_core_cnt, sockets_nb * sizeof(int)); 
xrealloc(sockets_used,sockets_nb * sizeof(bool)); } if ( nboards_nb > boards_nb) { boards_nb = nboards_nb; xrealloc(boards_core_cnt, boards_nb * sizeof(int)); xrealloc(sort_brds_core_cnt, boards_nb * sizeof(int)); } /* Count available cores on each socket and board */ if (nsockets_nb >= nboards_nb) { sock_per_brd = nsockets_nb / nboards_nb; } else { error("Node socket count lower than board count " "(%u < %u), job %u node %s", nsockets_nb, nboards_nb, job_ptr->job_id, node_record_table_ptr[n].name); sock_per_brd = 1; } for (b = 0; b < nboards_nb; b++) { boards_core_cnt[b] = 0; sort_brds_core_cnt[b] = 0; } for (s = 0; s < nsockets_nb; s++) { sockets_core_cnt[s]=0; sockets_used[s]=false; b = s/sock_per_brd; for ( j = c + (s * ncores_nb) ; j < c + ((s+1) * ncores_nb) ; j++ ) { if ( bit_test(job_res->core_bitmap,j) ) { sockets_core_cnt[s]++; boards_core_cnt[b]++; sort_brds_core_cnt[b]++; } } } /* Sort boards in descending order of available core count */ qsort(sort_brds_core_cnt, nboards_nb, sizeof (int), _cmp_int_descend); /* Determine minimum number of boards required for the * allocation (b_min) */ count = 0; for (b = 0; b < nboards_nb; b++) { count+=sort_brds_core_cnt[b]; if (count >= req_cores) break; } b_min = b+1; sock_per_comb = b_min * sock_per_brd; /* Allocate space for list of board combinations */ ncomb_brd = comb_counts[nboards_nb-1][b_min-1]; board_combs = xmalloc(ncomb_brd * b_min * sizeof(int)); /* Generate all combinations of b_min boards on the node */ _gen_combs(board_combs, nboards_nb, b_min); /* Determine which combinations have enough available cores * for the allocation (eligible board combinations) */ elig_brd_combs = xmalloc(ncomb_brd * sizeof(int)); elig_core_cnt = xmalloc(ncomb_brd * sizeof(int)); elig = 0; for (comb_idx = 0; comb_idx < ncomb_brd; comb_idx++) { count = 0; for (comb_brd_idx = 0; comb_brd_idx < b_min; comb_brd_idx++) { board_num = board_combs[(comb_idx * b_min) + comb_brd_idx]; count += boards_core_cnt[board_num]; } if (count >= req_cores) { elig_brd_combs[elig] = comb_idx; elig_core_cnt[elig] = count; elig++; } } /* Allocate space for list of sockets for each eligible board * combination */ socket_list = xmalloc(elig * sock_per_comb * sizeof(int)); /* Generate sorted list of sockets for each eligible board * combination, and find combination with minimum number * of sockets and minimum number of cpus required for the * allocation */ s_min = sock_per_comb; comb_min = 0; cpu_min = sock_per_comb * ncores_nb; for (elig_idx = 0; elig_idx < elig; elig_idx++) { comb_idx = elig_brd_combs[elig_idx]; for (comb_brd_idx = 0; comb_brd_idx < b_min; comb_brd_idx++) { board_num = board_combs[(comb_idx * b_min) + comb_brd_idx]; sock_list_idx = (elig_idx * sock_per_comb) + (comb_brd_idx * sock_per_brd); for (sock_idx = 0; sock_idx < sock_per_brd; sock_idx++) { socket_list[sock_list_idx + sock_idx] = (board_num * sock_per_brd) + sock_idx; } } /* Sort this socket list in descending order of * available core count */ qsort(&socket_list[elig_idx*sock_per_comb], sock_per_comb, sizeof (int), _cmp_sock); /* Determine minimum number of sockets required for * the allocation from this socket list */ count = 0; for (b = 0; b < sock_per_comb; b++) { sock_idx = socket_list[(int)((elig_idx*sock_per_comb)+b)]; count+=sockets_core_cnt[sock_idx]; if (count >= req_cores) break; } b++; /* Use board combination with minimum number * of required sockets and minimum number of CPUs */ if ((b < s_min) || (b == s_min && elig_core_cnt[elig_idx] <= cpu_min)) { s_min = b; comb_min = elig_idx; 
cpu_min = elig_core_cnt[elig_idx]; } } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("cons_res: best_fit: node[%u]: " "required cpus: %u, min req boards: %u,", n, cpus, b_min); info("cons_res: best_fit: node[%u]: " "min req sockets: %u, min avail cores: %u", n, s_min, cpu_min); } /* Re-sort socket list for best-fit board combination in * ascending order of socket number */ qsort(&socket_list[comb_min * sock_per_comb], sock_per_comb, sizeof (int), _cmp_int_ascend); xfree(board_combs); xfree(elig_brd_combs); xfree(elig_core_cnt); /* select cores from the sockets of the best-fit board * combination using a best-fit approach */ tmp_cpt = cpus_per_task; while ( cpus > 0 ) { best_fit_cores = 0; best_fit_sufficient = false; /* search for the socket with best fit */ for ( z = 0; z < sock_per_comb; z++ ) { s = socket_list[(comb_min*sock_per_comb)+z]; sufficient = sockets_core_cnt[s] >= req_cores; if ( (best_fit_cores == 0) || (sufficient && !best_fit_sufficient ) || (sufficient && (sockets_core_cnt[s] < best_fit_cores)) || (!sufficient && (sockets_core_cnt[s] > best_fit_cores)) ) { best_fit_cores = sockets_core_cnt[s]; best_fit_location = s; best_fit_sufficient = sufficient; } } /* check that we have found a usable socket */ if ( best_fit_cores == 0 ) break; j = best_fit_location; if (sock_per_brd) j /= sock_per_brd; if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("cons_res: best_fit: using node[%u]: " "board[%u]: socket[%u]: %u cores " "available", n, j, best_fit_location, sockets_core_cnt[best_fit_location]); } sockets_used[best_fit_location] = true; for ( j = (c + (best_fit_location * ncores_nb)); j < (c + ((best_fit_location + 1) * ncores_nb)); j++ ) { /* * if no more cpus to select * release remaining cores unless * we are allocating whole sockets */ if (cpus == 0) { if (alloc_sockets) { bit_set(job_res->core_bitmap,j); core_cnt++; } else { bit_clear(job_res->core_bitmap,j); } continue; } /* * remove cores from socket count and * cpus count using hyperthreading requirement */ if ( bit_test(job_res->core_bitmap, j) ) { sockets_core_cnt[best_fit_location]--; core_cnt++; if (cpus < vpus) cpus = 0; else if ((ntasks_per_core == 1) && (cpus_per_task > vpus)) { int used = MIN(tmp_cpt, vpus); cpus -= used; if (tmp_cpt <= used) tmp_cpt = cpus_per_task; else tmp_cpt -= used; } else { cpus -= vpus; } } else if (alloc_sockets) { /* If the core is not used, add it * anyway if allocating whole sockets */ bit_set(job_res->core_bitmap, j); core_cnt++; } } /* loop again if more cpus required */ if ( cpus > 0 ) continue; /* release remaining cores of the unused sockets */ for (s = 0; s < nsockets_nb; s++) { if ( sockets_used[s] ) continue; bit_nclear(job_res->core_bitmap, c+(s*ncores_nb), c+((s+1)*ncores_nb)-1); } } xfree(socket_list); if (cpus > 0) { /* cpu count should NEVER be greater than the number * of set bits in the core bitmap for a given node */ fatal("cons_res: cpus computation error"); } /* adjust cpus count of the current node */ if ((alloc_cores || alloc_sockets) && (select_node_record[n].vpus >= 1)) { job_res->cpus[i] = core_cnt * select_node_record[n].vpus; } i++; /* move c to the next node in core_bitmap */ c += num_bits; } xfree(boards_core_cnt); xfree(sort_brds_core_cnt); xfree(sockets_core_cnt); xfree(sockets_used); }
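/*
 * Illustrative sketch (not part of the Slurm source): the "best fit" socket
 * choice above prefers the socket with the fewest free cores that still
 * satisfies the request, and otherwise falls back to the socket with the most
 * free cores. This standalone sketch isolates that selection rule; the helper
 * name and example counts are made up.
 */
#include <stdbool.h>
#include <stdio.h>

/* Returns the index of the best-fit socket, or -1 if all sockets are empty. */
static int best_fit_socket(const int *free_cores, int nsockets, int req_cores)
{
    int best = -1, best_cnt = 0;
    bool best_sufficient = false;

    for (int s = 0; s < nsockets; s++) {
        bool sufficient = (free_cores[s] >= req_cores);
        if (free_cores[s] == 0)
            continue;                       /* unusable socket */
        if ((best_cnt == 0) ||
            (sufficient && !best_sufficient) ||
            (sufficient && (free_cores[s] < best_cnt)) ||
            (!sufficient && (free_cores[s] > best_cnt))) {
            best = s;
            best_cnt = free_cores[s];
            best_sufficient = sufficient;
        }
    }
    return best;
}

int main(void)
{
    int free_cores[] = { 2, 6, 4, 0 };
    printf("req=3 -> socket %d\n", best_fit_socket(free_cores, 4, 3)); /* 2 */
    printf("req=8 -> socket %d\n", best_fit_socket(free_cores, 4, 8)); /* 1 */
    return 0;
}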
/* Perform any power change work to nodes */ static void _do_power_work(time_t now) { static time_t last_log = 0, last_work_scan = 0; int i, wake_cnt = 0, sleep_cnt = 0, susp_total = 0; time_t delta_t; uint32_t susp_state; bitstr_t *wake_node_bitmap = NULL, *sleep_node_bitmap = NULL; struct node_record *node_ptr; bool run_suspend = false; /* Set limit on counts of nodes to have state changed */ delta_t = now - last_work_scan; if (delta_t >= 60) { suspend_cnt_f = 0.0; resume_cnt_f = 0.0; } else { float rate = (60 - delta_t) / 60.0; suspend_cnt_f *= rate; resume_cnt_f *= rate; } suspend_cnt = (suspend_cnt_f + 0.5); resume_cnt = (resume_cnt_f + 0.5); if (now > (last_suspend + suspend_timeout)) { /* ready to start another round of node suspends */ run_suspend = true; if (last_suspend) { bit_nclear(suspend_node_bitmap, 0, (node_record_count - 1)); last_suspend = (time_t) 0; } } last_work_scan = now; /* Build bitmaps identifying each node which should change state */ for (i=0, node_ptr=node_record_table_ptr; i<node_record_count; i++, node_ptr++) { susp_state = IS_NODE_POWER_SAVE(node_ptr); if (susp_state) susp_total++; /* Resume nodes as appropriate */ if (susp_state && ((resume_rate == 0) || (resume_cnt < resume_rate)) && (bit_test(suspend_node_bitmap, i) == 0) && (IS_NODE_ALLOCATED(node_ptr) || (node_ptr->last_idle > (now - idle_time)))) { if (wake_node_bitmap == NULL) { wake_node_bitmap = bit_alloc(node_record_count); } wake_cnt++; resume_cnt++; resume_cnt_f++; node_ptr->node_state &= (~NODE_STATE_POWER_SAVE); node_ptr->node_state |= NODE_STATE_POWER_UP; node_ptr->node_state |= NODE_STATE_NO_RESPOND; bit_clear(power_node_bitmap, i); bit_clear(avail_node_bitmap, i); node_ptr->last_response = now + resume_timeout; bit_set(wake_node_bitmap, i); } /* Suspend nodes as appropriate */ if (run_suspend && (susp_state == 0) && ((suspend_rate == 0) || (suspend_cnt < suspend_rate)) && IS_NODE_IDLE(node_ptr) && (node_ptr->sus_job_cnt == 0) && (!IS_NODE_COMPLETING(node_ptr)) && (!IS_NODE_POWER_UP(node_ptr)) && (node_ptr->last_idle < (now - idle_time)) && ((exc_node_bitmap == NULL) || (bit_test(exc_node_bitmap, i) == 0))) { if (sleep_node_bitmap == NULL) { sleep_node_bitmap = bit_alloc(node_record_count); } sleep_cnt++; suspend_cnt++; suspend_cnt_f++; node_ptr->node_state |= NODE_STATE_POWER_SAVE; bit_set(power_node_bitmap, i); bit_set(sleep_node_bitmap, i); bit_set(suspend_node_bitmap, i); last_suspend = now; } } if (((now - last_log) > 600) && (susp_total > 0)) { info("Power save mode: %d nodes", susp_total); last_log = now; } if (sleep_node_bitmap) { char *nodes; nodes = bitmap2node_name(sleep_node_bitmap); if (nodes) _do_suspend(nodes); else error("power_save: bitmap2nodename"); xfree(nodes); FREE_NULL_BITMAP(sleep_node_bitmap); /* last_node_update could be changed already by another thread! last_node_update = now; */ } if (wake_node_bitmap) { char *nodes; nodes = bitmap2node_name(wake_node_bitmap); if (nodes) _do_resume(nodes); else error("power_save: bitmap2nodename"); xfree(nodes); FREE_NULL_BITMAP(wake_node_bitmap); /* last_node_update could be changed already by another thread! last_node_update = now; */ } }
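/*
 * Illustrative sketch (not part of the Slurm source): _do_power_work rate
 * limits suspends/resumes with per-minute counters that decay in proportion
 * to the time since the last scan. This standalone sketch isolates that decay
 * rule; the helper name and example values are made up.
 */
#include <stdio.h>
#include <time.h>

/* After a full 60 s the budget is fully replenished (counter reset to 0);
 * otherwise only a proportional fraction of the old count carries forward. */
static float decay_counter(float cnt, time_t last_scan, time_t now)
{
    time_t delta = now - last_scan;
    if (delta >= 60)
        return 0.0f;
    return cnt * (float) (60 - delta) / 60.0f;
}

int main(void)
{
    time_t now = time(NULL);
    float resumes = 10.0f;
    printf("after 15s: %.1f\n", decay_counter(resumes, now - 15, now)); /* 7.5 */
    printf("after 90s: %.1f\n", decay_counter(resumes, now - 90, now)); /* 0.0 */
    return 0;
}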
/* Sync up the core_bitmap with the CPU array using cyclic distribution * * The CPU array contains the distribution of CPUs, which can include * virtual CPUs (hyperthreads) */ static int _cyclic_sync_core_bitmap(struct job_record *job_ptr, const uint16_t cr_type) { uint32_t c, i, j, s, n, *sock_start, *sock_end, size, csize, core_cnt; uint16_t cps = 0, cpus, vpus, sockets, sock_size; job_resources_t *job_res = job_ptr->job_resrcs; bitstr_t *core_map; bool *sock_used, *sock_avoid; bool alloc_cores = false, alloc_sockets = false; uint16_t ntasks_per_core = 0xffff, ntasks_per_socket = 0xffff; int error_code = SLURM_SUCCESS; if ((job_res == NULL) || (job_res->core_bitmap == NULL)) return error_code; if (cr_type & CR_CORE) alloc_cores = true; if (slurmctld_conf.select_type_param & CR_ALLOCATE_FULL_SOCKET) { if (cr_type & CR_SOCKET) alloc_sockets = true; } else { if (cr_type & CR_SOCKET) alloc_cores = true; } core_map = job_res->core_bitmap; if (job_ptr->details && job_ptr->details->mc_ptr) { multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr; if (mc_ptr->ntasks_per_core) { ntasks_per_core = mc_ptr->ntasks_per_core; } if ((mc_ptr->threads_per_core != (uint16_t) NO_VAL) && (mc_ptr->threads_per_core < ntasks_per_core)) { ntasks_per_core = mc_ptr->threads_per_core; } if (mc_ptr->ntasks_per_socket) ntasks_per_socket = mc_ptr->ntasks_per_socket; } sock_size = select_node_record[0].sockets; sock_avoid = xmalloc(sock_size * sizeof(bool)); sock_start = xmalloc(sock_size * sizeof(uint32_t)); sock_end = xmalloc(sock_size * sizeof(uint32_t)); sock_used = xmalloc(sock_size * sizeof(bool)); size = bit_size(job_res->node_bitmap); csize = bit_size(core_map); for (c = 0, i = 0, n = 0; n < size; n++) { if (bit_test(job_res->node_bitmap, n) == 0) continue; sockets = select_node_record[n].sockets; cps = select_node_record[n].cores; vpus = MIN(select_node_record[n].vpus, ntasks_per_core); if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("DEBUG: job %u node %s vpus %u cpus %u", job_ptr->job_id, select_node_record[n].node_ptr->name, vpus, job_res->cpus[i]); } if ((c + (sockets * cps)) > csize) fatal("cons_res: _cyclic_sync_core_bitmap index error"); if (sockets > sock_size) { sock_size = sockets; xrealloc(sock_avoid, sock_size * sizeof(bool)); xrealloc(sock_start, sock_size * sizeof(uint32_t)); xrealloc(sock_end, sock_size * sizeof(uint32_t)); xrealloc(sock_used, sock_size * sizeof(bool)); } for (s = 0; s < sockets; s++) { sock_start[s] = c + (s * cps); sock_end[s] = sock_start[s] + cps; sock_avoid[s] = false; sock_used[s] = false; } core_cnt = 0; cpus = job_res->cpus[i]; if (ntasks_per_socket != 0xffff) { int x_cpus; uint32_t total_cpus = 0; uint32_t *cpus_cnt = xmalloc(sizeof(uint32_t)* sockets); for (s = 0; s < sockets; s++) { for (j = sock_start[s]; j < sock_end[s]; j++) { if (bit_test(core_map, j)) cpus_cnt[s] += vpus; } total_cpus += cpus_cnt[s]; } for (s = 0; s < sockets && total_cpus > cpus; s++) { if (cpus_cnt[s] > ntasks_per_socket) { x_cpus = cpus_cnt[s] -ntasks_per_socket; cpus_cnt[s] = ntasks_per_socket; total_cpus -= x_cpus; } } for (s = 0; s < sockets && total_cpus > cpus; s++) { if ((cpus_cnt[s] <= ntasks_per_socket) && (total_cpus - cpus_cnt[s] >= cpus)) { sock_avoid[s] = true; total_cpus -= cpus_cnt[s]; } } xfree(cpus_cnt); } while (cpus > 0) { uint16_t prev_cpus = cpus; for (s = 0; s < sockets && cpus > 0; s++) { if (sock_avoid[s]) continue; while (sock_start[s] < sock_end[s]) { if (bit_test(core_map,sock_start[s])) { sock_used[s] = true; core_cnt++; break; } else sock_start[s]++; } if 
(sock_start[s] == sock_end[s]) /* this socket is unusable */ continue; if (cpus < vpus) cpus = 0; else cpus -= vpus; sock_start[s]++; } if (prev_cpus == cpus) { /* we're stuck! */ job_ptr->priority = 0; job_ptr->state_reason = WAIT_HELD; error("cons_res: sync loop not progressing, " "holding job %u", job_ptr->job_id); error_code = SLURM_ERROR; goto fini; } } /* clear the rest of the cores in each socket * FIXME: do we need min_core/min_socket checks here? */ for (s = 0; s < sockets; s++) { if (sock_start[s] == sock_end[s]) continue; if (!alloc_sockets || !sock_used[s]) { bit_nclear(core_map, sock_start[s], sock_end[s]-1); } if ((select_node_record[n].vpus >= 1) && (alloc_sockets || alloc_cores) && sock_used[s]) { for (j=sock_start[s]; j<sock_end[s]; j++) { /* Mark all cores as used */ if (alloc_sockets) bit_set(core_map, j); if (bit_test(core_map, j)) core_cnt++; } } } if ((alloc_cores || alloc_sockets) && (select_node_record[n].vpus >= 1)) { job_res->cpus[i] = core_cnt * select_node_record[n].vpus; } i++; /* advance 'c' to the beginning of the next node */ c += sockets * cps; } fini: xfree(sock_avoid); xfree(sock_start); xfree(sock_end); xfree(sock_used); return error_code; }
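/*
 * Illustrative sketch (not part of the Slurm source): the cyclic sync above
 * sweeps the sockets round-robin, taking one core per socket per pass until
 * the job's CPU count is satisfied (each core delivers "vpus" CPUs). Plain
 * int arrays replace the core bitmap; the helper name and example values are
 * made up.
 */
#include <stdio.h>

static void cyclic_fill(int *used_per_socket, int nsockets,
                        const int *free_per_socket, int cpus, int vpus)
{
    while (cpus > 0) {
        int prev = cpus;
        for (int s = 0; s < nsockets && cpus > 0; s++) {
            if (used_per_socket[s] >= free_per_socket[s])
                continue;                   /* this socket is exhausted */
            used_per_socket[s]++;
            cpus = (cpus < vpus) ? 0 : cpus - vpus;
        }
        if (prev == cpus)
            break;                          /* no progress: stuck */
    }
}

int main(void)
{
    int used[2] = { 0, 0 };
    int freec[2] = { 4, 4 };
    cyclic_fill(used, 2, freec, 6, 2);      /* 6 CPUs, 2 threads per core */
    printf("cores used: socket0=%d socket1=%d\n", used[0], used[1]);
    return 0;
}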
/* Execute C0 control sequence. */
int input_c0_dispatch(struct input_ctx *ictx)
{
    struct screen_write_ctx *sctx = &ictx->ctx;
    struct window_pane *wp = ictx->wp;
    struct screen *s = sctx->s;
    u_int trigger;

    log_debug("%s: '%c'", __func__, ictx->ch);

    switch (ictx->ch) {
    case '\000':        /* NUL */
        break;
    case '\007':        /* BEL */
        wp->window->flags |= WINDOW_BELL;
        break;
    case '\010':        /* BS */
        screen_write_backspace(sctx);
        goto count_c0;
    case '\011':        /* HT */
        /* Don't tab beyond the end of the line. */
        if (s->cx >= screen_size_x(s) - 1)
            break;

        /* Find the next tab point, or use the last column if none. */
        do {
            s->cx++;
            if (bit_test(s->tabs, s->cx))
                break;
        } while (s->cx < screen_size_x(s) - 1);
        break;
    case '\012':        /* LF */
    case '\013':        /* VT */
    case '\014':        /* FF */
        screen_write_linefeed(sctx, 0);
        goto count_c0;
    case '\015':        /* CR */
        screen_write_carriagereturn(sctx);
        goto count_c0;
    case '\016':        /* SO */
        ictx->cell.set = 1;
        break;
    case '\017':        /* SI */
        ictx->cell.set = 0;
        break;
    default:
        log_debug("%s: unknown '%c'", __func__, ictx->ch);
        break;
    }

    return (0);

count_c0:
    trigger = options_get_number(&wp->window->options, "c0-change-trigger");
    if (trigger != 0 && ++wp->changes >= trigger) {
        wp->flags |= PANE_DROP;
        window_pane_timer_start(wp);
    }

    return (0);
}
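/*
 * Illustrative sketch (not part of the tmux source): the HT case above scans
 * the tab bitmap for the next stop and otherwise settles on the last column.
 * A plain bool array stands in for the screen's tab bitmap; the helper name
 * and example stops are made up.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned next_tab_stop(unsigned cx, unsigned width, const bool *tabs)
{
    if (cx >= width - 1)
        return cx;                  /* already at (or past) the last column */
    do {
        cx++;
        if (tabs[cx])
            break;
    } while (cx < width - 1);
    return cx;
}

int main(void)
{
    bool tabs[20] = { false };
    tabs[8] = tabs[16] = true;      /* stops every 8 columns */
    printf("%u\n", next_tab_stop(3, 20, tabs));     /* 8 */
    printf("%u\n", next_tab_stop(17, 20, tabs));    /* 19: no stop left */
    return 0;
}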
/* * batch_bind - Set the batch request message so as to bind the shell to the * proper resources */ void batch_bind(batch_job_launch_msg_t *req) { bitstr_t *req_map, *hw_map; slurm_cred_arg_t arg; uint16_t sockets=0, cores=0, num_cpus; int start, task_cnt=0; if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) { error("task/affinity: job lacks a credential"); return; } start = _get_local_node_info(&arg, 0, &sockets, &cores); if (start != 0) { error("task/affinity: missing node 0 in job credential"); slurm_cred_free_args(&arg); return; } if ((sockets * cores) == 0) { error("task/affinity: socket and core count both zero"); slurm_cred_free_args(&arg); return; } num_cpus = MIN((sockets * cores), (conf->sockets * conf->cores)); req_map = (bitstr_t *) bit_alloc(num_cpus); hw_map = (bitstr_t *) bit_alloc(conf->block_map_size); #ifdef HAVE_FRONT_END { /* Since the front-end nodes are a shared resource, we limit each job * to one CPU based upon monotonically increasing sequence number */ static int last_id = 0; bit_set(hw_map, ((last_id++) % conf->block_map_size)); task_cnt = 1; } #else { char *str; int t, p; /* Transfer core_bitmap data to local req_map. * The MOD function handles the case where fewer processes * physically exist than are configured (slurmd is out of * sync with the slurmctld daemon). */ for (p = 0; p < (sockets * cores); p++) { if (bit_test(arg.job_core_bitmap, p)) bit_set(req_map, (p % num_cpus)); } str = (char *)bit_fmt_hexmask(req_map); debug3("task/affinity: job %u CPU mask from slurmctld: %s", req->job_id, str); xfree(str); for (p = 0; p < num_cpus; p++) { if (bit_test(req_map, p) == 0) continue; /* core_bitmap does not include threads, so we * add them here but limit them to what the job * requested */ for (t = 0; t < conf->threads; t++) { uint16_t pos = p * conf->threads + t; if (pos >= conf->block_map_size) { info("more resources configured than exist"); p = num_cpus; break; } bit_set(hw_map, pos); task_cnt++; } } } #endif if (task_cnt) { req->cpu_bind_type = CPU_BIND_MASK; if (conf->task_plugin_param & CPU_BIND_VERBOSE) req->cpu_bind_type |= CPU_BIND_VERBOSE; req->cpu_bind = (char *)bit_fmt_hexmask(hw_map); info("task/affinity: job %u CPU input mask for node: %s", req->job_id, req->cpu_bind); /* translate abstract masks to actual hardware layout */ _lllp_map_abstract_masks(1, &hw_map); #ifdef HAVE_NUMA if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) { _match_masks_to_ldom(1, &hw_map); } #endif xfree(req->cpu_bind); req->cpu_bind = (char *)bit_fmt_hexmask(hw_map); info("task/affinity: job %u CPU final HW mask for node: %s", req->job_id, req->cpu_bind); } else { error("task/affinity: job %u allocated no CPUs", req->job_id); } FREE_NULL_BITMAP(hw_map); FREE_NULL_BITMAP(req_map); slurm_cred_free_args(&arg); }
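/*
 * Illustrative sketch (not part of the Slurm source): batch_bind folds the
 * controller's core indices onto the CPUs that physically exist with a MOD,
 * then expands each selected core into its hardware threads. Index arithmetic
 * only; the real code operates on bitstr_t bitmaps, and the helper name and
 * example numbers are made up.
 */
#include <stdio.h>

static int expand_core_to_threads(int core, int num_cpus, int threads,
                                  int block_map_size, int *out_pos)
{
    int n = 0;
    int local_core = core % num_cpus;   /* MOD handles config/hardware skew */

    for (int t = 0; t < threads; t++) {
        int pos = local_core * threads + t;
        if (pos >= block_map_size)
            break;                      /* more configured than exists */
        out_pos[n++] = pos;
    }
    return n;
}

int main(void)
{
    int pos[8];
    int n = expand_core_to_threads(5, 4, 2, 8, pos); /* core 5 folds onto 1 */
    for (int i = 0; i < n; i++)
        printf("hw bit %d\n", pos[i]);               /* bits 2 and 3 */
    return 0;
}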
/* Add the given job to the "active" structures of * the given partition and increment the run count */ static void _add_job_to_active(struct job_record *job_ptr, struct gs_part *p_ptr) { job_resources_t *job_res = job_ptr->job_resrcs; uint16_t job_gr_type; /* add job to active_resmap */ job_gr_type = _get_part_gr_type(job_ptr->part_ptr); if ((job_gr_type == GS_CPU2) || (job_gr_type == GS_CORE) || (job_gr_type == GS_SOCKET)) { if (p_ptr->jobs_active == 0 && p_ptr->active_resmap) { uint32_t size = bit_size(p_ptr->active_resmap); bit_nclear(p_ptr->active_resmap, 0, size-1); } add_job_to_cores(job_res, &(p_ptr->active_resmap), gs_bits_per_node); if (job_gr_type == GS_SOCKET) _fill_sockets(job_res->node_bitmap, p_ptr); } else { /* GS_NODE or GS_CPU */ if (!p_ptr->active_resmap) { if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) { info("gang: _add_job_to_active: job %u first", job_ptr->job_id); } p_ptr->active_resmap = bit_copy(job_res->node_bitmap); } else if (p_ptr->jobs_active == 0) { if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) { info("gang: _add_job_to_active: job %u copied", job_ptr->job_id); } bit_copybits(p_ptr->active_resmap, job_res->node_bitmap); } else { if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) { info("gang: _add_job_to_active: adding job %u", job_ptr->job_id); } bit_or(p_ptr->active_resmap, job_res->node_bitmap); } } /* add job to the active_cpus array */ if (job_gr_type == GS_CPU) { uint32_t i, a, sz = bit_size(p_ptr->active_resmap); if (!p_ptr->active_cpus) { /* create active_cpus array */ p_ptr->active_cpus = xmalloc(sz * sizeof(uint16_t)); } if (p_ptr->jobs_active == 0) { /* overwrite the existing values in active_cpus */ for (a = 0, i = 0; i < sz; i++) { if (bit_test(job_res->node_bitmap, i)) { p_ptr->active_cpus[i] = job_res->cpus[a++]; } else { p_ptr->active_cpus[i] = 0; } } } else { /* add job to existing jobs in the active cpus */ for (a = 0, i = 0; i < sz; i++) { if (bit_test(job_res->node_bitmap, i)) { uint16_t limit = _get_phys_bit_cnt(i); p_ptr->active_cpus[i] += job_res->cpus[a++]; /* when adding shadows, the resources * may get overcommitted */ if (p_ptr->active_cpus[i] > limit) p_ptr->active_cpus[i] = limit; } } } } p_ptr->jobs_active += 1; }
/* Determine which CPUs a job step can use. * OUT whole_<entity>_count - returns count of whole <entities> in this * allocation for this node * OUT part__<entity>_count - returns count of partial <entities> in this * allocation for this node * RET - a string representation of the available mask or NULL on error * NOTE: Caller must xfree() the return value. */ static char *_alloc_mask(launch_tasks_request_msg_t *req, int *whole_node_cnt, int *whole_socket_cnt, int *whole_core_cnt, int *whole_thread_cnt, int *part_socket_cnt, int *part_core_cnt) { uint16_t sockets, cores, threads; int c, s, t, i; int c_miss, s_miss, t_miss, c_hit, t_hit; bitstr_t *alloc_bitmap; char *str_mask; bitstr_t *alloc_mask; *whole_node_cnt = 0; *whole_socket_cnt = 0; *whole_core_cnt = 0; *whole_thread_cnt = 0; *part_socket_cnt = 0; *part_core_cnt = 0; alloc_bitmap = _get_avail_map(req, &sockets, &cores, &threads); if (!alloc_bitmap) return NULL; alloc_mask = bit_alloc(bit_size(alloc_bitmap)); i = 0; for (s=0, s_miss=false; s<sockets; s++) { for (c=0, c_hit=c_miss=false; c<cores; c++) { for (t=0, t_hit=t_miss=false; t<threads; t++) { /* If we are pretending we have a larger system than we really have this is needed to make sure we don't bust the bank. */ if (i >= bit_size(alloc_bitmap)) i = 0; if (bit_test(alloc_bitmap, i)) { bit_set(alloc_mask, i); (*whole_thread_cnt)++; t_hit = true; c_hit = true; } else t_miss = true; i++; } if (!t_miss) (*whole_core_cnt)++; else { if (t_hit) (*part_core_cnt)++; c_miss = true; } } if (!c_miss) (*whole_socket_cnt)++; else { if (c_hit) (*part_socket_cnt)++; s_miss = true; } } if (!s_miss) (*whole_node_cnt)++; FREE_NULL_BITMAP(alloc_bitmap); /* translate abstract masks to actual hardware layout */ _lllp_map_abstract_masks(1, &alloc_mask); #ifdef HAVE_NUMA if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) { _match_masks_to_ldom(1, &alloc_mask); } #endif str_mask = bit_fmt_hexmask(alloc_mask); FREE_NULL_BITMAP(alloc_mask); return str_mask; }
/*
 * topo_get_node_addr - build node address and the associated pattern
 *      based on the topology information
 *
 * example of output :
 *      address : s0.s4.s8.tux1
 *      pattern : switch.switch.switch.node
 */
extern int topo_get_node_addr(char* node_name, char** paddr, char** ppattern)
{
    struct node_record *node_ptr;
    int node_inx;
    hostlist_t sl = NULL;
    int s_max_level = 0;
    int i, j;

    /* no switches found, return */
    if (switch_record_cnt == 0) {
        *paddr = xstrdup(node_name);
        *ppattern = xstrdup("node");
        return SLURM_SUCCESS;
    }

    node_ptr = find_node_record(node_name);
    /* node not found in configuration */
    if (node_ptr == NULL)
        return SLURM_ERROR;
    node_inx = node_ptr - node_record_table_ptr;

    /* look for switches max level */
    for (i = 0; i < switch_record_cnt; i++) {
        if (switch_record_table[i].level > s_max_level)
            s_max_level = switch_record_table[i].level;
    }

    /* initialize output parameters */
    *paddr = xstrdup("");
    *ppattern = xstrdup("");

    /* build node topology address and the associated pattern */
    for (j = s_max_level; j >= 0; j--) {
        for (i = 0; i < switch_record_cnt; i++) {
            if (switch_record_table[i].level != j)
                continue;
            if (!bit_test(switch_record_table[i].node_bitmap, node_inx))
                continue;
            if (sl == NULL) {
                sl = hostlist_create(switch_record_table[i].name);
            } else {
                hostlist_push_host(sl, switch_record_table[i].name);
            }
        }
        if (sl) {
            char *buf = hostlist_ranged_string_xmalloc(sl);
            xstrcat(*paddr, buf);
            xfree(buf);
            hostlist_destroy(sl);
            sl = NULL;
        }
        xstrcat(*paddr, ".");
        xstrcat(*ppattern, "switch.");
    }

    /* append node name */
    xstrcat(*paddr, node_name);
    xstrcat(*ppattern, "node");

    return SLURM_SUCCESS;
}
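/*
 * Illustrative sketch (not part of the Slurm source): the loop above emits
 * one dotted component per switch level, top level first, then appends the
 * node name, producing the "s0.s4.s8.tux1" / "switch.switch.switch.node"
 * pair shown in the header comment. Plain strings replace the hostlist
 * machinery; the helper name and example switch names are made up.
 */
#include <stdio.h>
#include <string.h>

static void build_addr(const char *const *switch_per_level, int levels,
                       const char *node, char *addr, size_t addr_len,
                       char *pattern, size_t pat_len)
{
    addr[0] = pattern[0] = '\0';
    for (int j = levels - 1; j >= 0; j--) {     /* top level first */
        strncat(addr, switch_per_level[j], addr_len - strlen(addr) - 1);
        strncat(addr, ".", addr_len - strlen(addr) - 1);
        strncat(pattern, "switch.", pat_len - strlen(pattern) - 1);
    }
    strncat(addr, node, addr_len - strlen(addr) - 1);
    strncat(pattern, "node", pat_len - strlen(pattern) - 1);
}

int main(void)
{
    const char *sw[] = { "s8", "s4", "s0" };    /* level 0 = leaf, 2 = top */
    char addr[64], pattern[64];
    build_addr(sw, 3, "tux1", addr, sizeof(addr), pattern, sizeof(pattern));
    printf("%s\n%s\n", addr, pattern);
    return 0;
}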
/* * _task_layout_lllp_cyclic * * task_layout_lllp_cyclic creates a cyclic distribution at the * lowest level of logical processor which is either socket, core or * thread depending on the system architecture. The Cyclic algorithm * is the same as the Cyclic distribution performed in srun. * * Distribution at the lllp: * -m hostfile|block|cyclic:block|cyclic * * The first distribution "hostfile|block|cyclic" is computed * in srun. The second distribution "block|cyclic" is computed * locally by each slurmd. * * The input to the lllp distribution algorithms is the gids (tasks * ids) generated for the local node. * * The output is a mapping of the gids onto logical processors * (thread/core/socket) with is expressed cpu_bind masks. * * If a task asks for more than one CPU per task, put the tasks as * close as possible (fill core rather than going next socket for the * extra task) * */ static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req, uint32_t node_id, bitstr_t ***masks_p) { int last_taskcount = -1, taskcount = 0; uint16_t i, s, hw_sockets = 0, hw_cores = 0, hw_threads = 0; uint16_t offset = 0, p = 0; int size, max_tasks = req->tasks_to_launch[(int)node_id]; int max_cpus = max_tasks * req->cpus_per_task; bitstr_t *avail_map; bitstr_t **masks = NULL; int *socket_last_pu = NULL; info ("_task_layout_lllp_cyclic "); avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads); if (!avail_map) return SLURM_ERROR; size = bit_set_count(avail_map); if (size < max_tasks) { error("task/affinity: only %d bits in avail_map for %d tasks!", size, max_tasks); FREE_NULL_BITMAP(avail_map); return SLURM_ERROR; } if (size < max_cpus) { /* Possible result of overcommit */ i = size / max_tasks; info("task/affinity: reset cpus_per_task from %d to %d", req->cpus_per_task, i); req->cpus_per_task = i; } socket_last_pu = xmalloc(hw_sockets * sizeof(int)); *masks_p = xmalloc(max_tasks * sizeof(bitstr_t*)); masks = *masks_p; size = bit_size(avail_map); offset = hw_cores * hw_threads; s = 0; while (taskcount < max_tasks) { if (taskcount == last_taskcount) fatal("_task_layout_lllp_cyclic failure"); last_taskcount = taskcount; for (i = 0; i < size; i++) { bool already_switched = false; uint16_t bit = socket_last_pu[s] + (s * offset); /* In case hardware and config differ */ bit %= size; /* set up for the next one */ socket_last_pu[s]++; /* skip unrequested threads */ if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) socket_last_pu[s] += hw_threads-1; if (socket_last_pu[s] >= offset) { /* Switch to the next socket we have * ran out here. */ /* This only happens if the slurmctld gave us an allocation that made a task split sockets. */ s = (s + 1) % hw_sockets; already_switched = true; } if (!bit_test(avail_map, bit)) continue; if (!masks[taskcount]) masks[taskcount] = bit_alloc(conf->block_map_size); //info("setting %d %d", taskcount, bit); bit_set(masks[taskcount], bit); if (!already_switched && ((req->task_dist == SLURM_DIST_CYCLIC_CFULL) || (req->task_dist == SLURM_DIST_BLOCK_CFULL))) { /* This means we are laying out cpus * within a task cyclically as well. */ s = (s + 1) % hw_sockets; already_switched = true; } if (++p < req->cpus_per_task) continue; p = 0; if (!already_switched) { /* Now that we have finished a task, switch to * the next socket. 
*/ s = (s + 1) % hw_sockets; } if (++taskcount >= max_tasks) break; } } /* last step: expand the masks to bind each task * to the requested resource */ _expand_masks(req->cpu_bind_type, max_tasks, masks, hw_sockets, hw_cores, hw_threads, avail_map); FREE_NULL_BITMAP(avail_map); xfree(socket_last_pu); return SLURM_SUCCESS; }
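/*
 * Illustrative sketch (not part of the Slurm source): the cyclic lllp layout
 * above hands out one processing unit per socket in turn, remembering the
 * next free offset within each socket and moving to the next socket after
 * each task. The helper name, array layout, and example sizes are made up.
 */
#include <stdio.h>

static int next_pu(int *last_pu, int *s, int hw_sockets, int pus_per_socket)
{
    int tries = 0;
    while (tries++ < hw_sockets) {
        if (last_pu[*s] < pus_per_socket) {
            int bit = (*s) * pus_per_socket + last_pu[*s]++;
            *s = (*s + 1) % hw_sockets;     /* next task starts next socket */
            return bit;
        }
        *s = (*s + 1) % hw_sockets;         /* this socket is full */
    }
    return -1;                              /* nothing left anywhere */
}

int main(void)
{
    int last_pu[2] = { 0, 0 }, s = 0;
    for (int task = 0; task < 6; task++)
        printf("task %d -> PU %d\n", task, next_pu(last_pu, &s, 2, 4));
    /* expected order with 2 sockets x 4 PUs: 0, 4, 1, 5, 2, 6 */
    return 0;
}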
/** * do_basil_reserve - create a BASIL reservation. * IN job_ptr - pointer to job which has just been allocated resources * RET 0 or error code, job will abort or be requeued on failure */ extern int do_basil_reserve(struct job_record *job_ptr) { struct nodespec *ns_head = NULL; uint16_t mppwidth = 0, mppdepth, mppnppn; uint32_t mppmem = 0, node_min_mem = 0; uint32_t resv_id; int i, first_bit, last_bit; hostlist_t hl; long rc; char *user, batch_id[16]; if (!job_ptr->job_resrcs || job_ptr->job_resrcs->nhosts == 0) return SLURM_SUCCESS; debug3("job #%u: %u nodes = %s, cpus=%u" , job_ptr->job_id, job_ptr->job_resrcs->nhosts, job_ptr->job_resrcs->nodes, job_ptr->job_resrcs->ncpus ); if (job_ptr->job_resrcs->node_bitmap == NULL) { error("job %u node_bitmap not set", job_ptr->job_id); return SLURM_SUCCESS; } first_bit = bit_ffs(job_ptr->job_resrcs->node_bitmap); last_bit = bit_fls(job_ptr->job_resrcs->node_bitmap); if (first_bit == -1 || last_bit == -1) return SLURM_SUCCESS; /* no nodes allocated */ mppdepth = MAX(1, job_ptr->details->cpus_per_task); mppnppn = job_ptr->details->ntasks_per_node; /* mppmem */ if (job_ptr->details->pn_min_memory & MEM_PER_CPU) { /* Only honour --mem-per-cpu if --ntasks has been given */ if (job_ptr->details->num_tasks) mppmem = job_ptr->details->pn_min_memory & ~MEM_PER_CPU; } else if (job_ptr->details->pn_min_memory) { node_min_mem = job_ptr->details->pn_min_memory; } hl = hostlist_create(""); if (hl == NULL) fatal("hostlist_create: malloc error"); for (i = first_bit; i <= last_bit; i++) { struct node_record *node_ptr = node_record_table_ptr + i; uint32_t basil_node_id; if (!bit_test(job_ptr->job_resrcs->node_bitmap, i)) continue; if (!node_ptr->name || node_ptr->name[0] == '\0') continue; /* bad node */ if (sscanf(node_ptr->name, "nid%05u", &basil_node_id) != 1) fatal("can not read basil_node_id from %s", node_ptr->name); if (ns_add_node(&ns_head, basil_node_id) != 0) { error("can not add node %s (nid%05u)", node_ptr->name, basil_node_id); free_nodespec(ns_head); return SLURM_ERROR; } if (node_min_mem) { uint32_t node_cpus, node_mem; if (slurmctld_conf.fast_schedule) { node_cpus = node_ptr->config_ptr->cpus; node_mem = node_ptr->config_ptr->real_memory; } else { node_cpus = node_ptr->cpus; node_mem = node_ptr->real_memory; } /* * ALPS 'Processing Elements per Node' value (aprun -N), * which in slurm is --ntasks-per-node and 'mppnppn' in * PBS: if --ntasks is specified, default to the number * of cores per node (also the default for 'aprun -N'). */ node_mem /= mppnppn ? mppnppn : node_cpus; mppmem = node_min_mem = MIN(node_mem, node_min_mem); } } /* mppwidth */ for (i = 0; i < job_ptr->job_resrcs->nhosts; i++) { uint16_t node_tasks = job_ptr->job_resrcs->cpus[i] / mppdepth; if (mppnppn && mppnppn < node_tasks) node_tasks = mppnppn; mppwidth += node_tasks; } snprintf(batch_id, sizeof(batch_id), "%u", job_ptr->job_id); user = uid_to_string(job_ptr->user_id); rc = basil_reserve(user, batch_id, mppwidth, mppdepth, mppnppn, mppmem, ns_head); xfree(user); if (rc <= 0) { /* errno value will be resolved by select_g_job_begin() */ errno = is_transient_error(rc) ? EAGAIN : ECONNABORTED; return SLURM_ERROR; } resv_id = rc; if (_set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { /* * This is a fatal error since it means we will not be able to * confirm the reservation; no step will be able to run in it. 
*/ error("job %u: can not set resId %u", job_ptr->job_id, resv_id); basil_release(resv_id); return SLURM_ERROR; } info("ALPS RESERVATION #%u, JobId %u: BASIL -n %d -N %d -d %d -m %d", resv_id, job_ptr->job_id, mppwidth, mppnppn, mppdepth, mppmem); return SLURM_SUCCESS; }