Example #1
/*
 * slurm_sprint_job_info - output information about a specific Slurm
 *	job, based upon the message loaded by slurm_load_jobs
 * IN job_ptr - an individual job information record pointer
 * IN one_liner - print as a single line if true
 * RET out - char * containing formatted output (must be freed after call)
 *           NULL is returned on failure.
 */
extern char *
slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner )
{
	int i, j, k;
	char time_str[32], *group_name, *user_name;
	char tmp1[128], tmp2[128];
	char *tmp6_ptr;
	char tmp_line[1024 * 128];
	char tmp_path[MAXPATHLEN];
	char *ionodes = NULL;
	uint16_t exit_status = 0, term_sig = 0;
	job_resources_t *job_resrcs = job_ptr->job_resrcs;
	char *out = NULL;
	time_t run_time;
	uint32_t min_nodes, max_nodes = 0;
	char *nodelist = "NodeList";
	bitstr_t *cpu_bitmap;
	char *host;
	int sock_inx, sock_reps, last;
	int abs_node_inx, rel_node_inx;
	int64_t nice;
	int bit_inx, bit_reps;
	uint32_t *last_mem_alloc_ptr = NULL;
	uint32_t last_mem_alloc = NO_VAL;
	char *last_hosts;
	hostlist_t hl, hl_last;
	char select_buf[122];
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	uint32_t threads;
	char *line_end = (one_liner) ? " " : "\n   ";

	if (cluster_flags & CLUSTER_FLAG_BG) {
		nodelist = "MidplaneList";
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_IONODES,
					    &ionodes);
	}

	/****** Line 1 ******/
	xstrfmtcat(out, "JobId=%u ", job_ptr->job_id);

	if (job_ptr->array_job_id) {
		if (job_ptr->array_task_str) {
			xstrfmtcat(out, "ArrayJobId=%u ArrayTaskId=%s ",
				   job_ptr->array_job_id,
				   job_ptr->array_task_str);
		} else {
			xstrfmtcat(out, "ArrayJobId=%u ArrayTaskId=%u ",
				   job_ptr->array_job_id,
				   job_ptr->array_task_id);
		}
	}
	xstrfmtcat(out, "JobName=%s", job_ptr->name);
	xstrcat(out, line_end);

	/****** Line 2 ******/
	user_name = uid_to_string((uid_t) job_ptr->user_id);
	group_name = gid_to_string((gid_t) job_ptr->group_id);
	xstrfmtcat(out, "UserId=%s(%u) GroupId=%s(%u) MCS_label=%s",
		   user_name, job_ptr->user_id, group_name, job_ptr->group_id,
		   (job_ptr->mcs_label==NULL) ? "N/A" : job_ptr->mcs_label);
	xfree(user_name);
	xfree(group_name);
	xstrcat(out, line_end);

	/****** Line 3 ******/
	nice = ((int64_t)job_ptr->nice) - NICE_OFFSET;
	xstrfmtcat(out, "Priority=%u Nice=%"PRIi64" Account=%s QOS=%s",
		   job_ptr->priority, nice, job_ptr->account, job_ptr->qos);
	if (slurm_get_track_wckey())
		xstrfmtcat(out, " WCKey=%s", job_ptr->wckey);
	xstrcat(out, line_end);

	/****** Line 4 ******/
	xstrfmtcat(out, "JobState=%s ", job_state_string(job_ptr->job_state));

	if (job_ptr->state_desc) {
		/* Replace white space with underscore for easier parsing */
		for (j=0; job_ptr->state_desc[j]; j++) {
			if (isspace((int)job_ptr->state_desc[j]))
				job_ptr->state_desc[j] = '_';
		}
		xstrfmtcat(out, "Reason=%s ", job_ptr->state_desc);
	} else
		xstrfmtcat(out, "Reason=%s ", job_reason_string(job_ptr->state_reason));

	xstrfmtcat(out, "Dependency=%s", job_ptr->dependency);
	xstrcat(out, line_end);

	/****** Line 5 ******/
	xstrfmtcat(out, "Requeue=%u Restarts=%u BatchFlag=%u Reboot=%u ",
		 job_ptr->requeue, job_ptr->restart_cnt, job_ptr->batch_flag,
		 job_ptr->reboot);
	if (WIFSIGNALED(job_ptr->exit_code))
		term_sig = WTERMSIG(job_ptr->exit_code);
	exit_status = WEXITSTATUS(job_ptr->exit_code);
	xstrfmtcat(out, "ExitCode=%u:%u", exit_status, term_sig);
	xstrcat(out, line_end);

	/****** Line 5a (optional) ******/
	if (job_ptr->show_flags & SHOW_DETAIL) {
		if (WIFSIGNALED(job_ptr->derived_ec))
			term_sig = WTERMSIG(job_ptr->derived_ec);
		else
			term_sig = 0;
		exit_status = WEXITSTATUS(job_ptr->derived_ec);
		xstrfmtcat(out, "DerivedExitCode=%u:%u", exit_status, term_sig);
		xstrcat(out, line_end);
	}

	/****** Line 6 ******/
	if (IS_JOB_PENDING(job_ptr))
		run_time = 0;
	else if (IS_JOB_SUSPENDED(job_ptr))
		run_time = job_ptr->pre_sus_time;
	else {
		time_t end_time;
		if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0))
			end_time = time(NULL);
		else
			end_time = job_ptr->end_time;
		if (job_ptr->suspend_time) {
			run_time = (time_t)
				(difftime(end_time, job_ptr->suspend_time)
				 + job_ptr->pre_sus_time);
		} else
			run_time = (time_t)
				difftime(end_time, job_ptr->start_time);
	}
	secs2time_str(run_time, time_str, sizeof(time_str));
	xstrfmtcat(out, "RunTime=%s ", time_str);

	if (job_ptr->time_limit == NO_VAL)
		xstrcat(out, "TimeLimit=Partition_Limit ");
	else {
		mins2time_str(job_ptr->time_limit, time_str, sizeof(time_str));
		xstrfmtcat(out, "TimeLimit=%s ", time_str);
	}

	if (job_ptr->time_min == 0)
		xstrcat(out, "TimeMin=N/A");
	else {
		mins2time_str(job_ptr->time_min, time_str, sizeof(time_str));
		xstrfmtcat(out, "TimeMin=%s", time_str);
	}
	xstrcat(out, line_end);

	/****** Line 7 ******/
	slurm_make_time_str(&job_ptr->submit_time, time_str, sizeof(time_str));
	xstrfmtcat(out, "SubmitTime=%s ", time_str);

	slurm_make_time_str(&job_ptr->eligible_time, time_str, sizeof(time_str));
	xstrfmtcat(out, "EligibleTime=%s", time_str);

	xstrcat(out, line_end);

	/****** Line 8 (optional) ******/
	if (job_ptr->resize_time) {
		slurm_make_time_str(&job_ptr->resize_time, time_str, sizeof(time_str));
		xstrfmtcat(out, "ResizeTime=%s", time_str);
		xstrcat(out, line_end);
	}

	/****** Line 9 ******/
	slurm_make_time_str(&job_ptr->start_time, time_str, sizeof(time_str));
	xstrfmtcat(out, "StartTime=%s ", time_str);

	if ((job_ptr->time_limit == INFINITE) &&
	    (job_ptr->end_time > time(NULL)))
		xstrcat(out, "EndTime=Unknown ");
	else {
		slurm_make_time_str(&job_ptr->end_time, time_str, sizeof(time_str));
		xstrfmtcat(out, "EndTime=%s ", time_str);
	}

	if (job_ptr->deadline) {
		slurm_make_time_str(&job_ptr->deadline, time_str, sizeof(time_str));
		xstrfmtcat(out, "Deadline=%s", time_str);
	} else {
		xstrcat(out, "Deadline=N/A");
	}

	xstrcat(out, line_end);

	/****** Line 10 ******/
	if (job_ptr->preempt_time == 0)
		xstrcat(out, "PreemptTime=None ");
	else {
		slurm_make_time_str(&job_ptr->preempt_time, time_str, sizeof(time_str));
		xstrfmtcat(out, "PreemptTime=%s ", time_str);
	}

	if (job_ptr->suspend_time) {
		slurm_make_time_str(&job_ptr->suspend_time, time_str, sizeof(time_str));
		xstrfmtcat(out, "SuspendTime=%s ", time_str);
	} else
		xstrcat(out, "SuspendTime=None ");

	xstrfmtcat(out, "SecsPreSuspend=%ld", (long int)job_ptr->pre_sus_time);
	xstrcat(out, line_end);

	/****** Line 11 ******/
	xstrfmtcat(out, "Partition=%s AllocNode:Sid=%s:%u",
		   job_ptr->partition, job_ptr->alloc_node, job_ptr->alloc_sid);
	xstrcat(out, line_end);

	/****** Line 12 ******/
	xstrfmtcat(out, "Req%s=%s Exc%s=%s", nodelist, job_ptr->req_nodes,
		   nodelist, job_ptr->exc_nodes);
	xstrcat(out, line_end);

	/****** Line 13 ******/
	xstrfmtcat(out, "%s=%s", nodelist, job_ptr->nodes);
	if (job_ptr->nodes && ionodes) {
		xstrfmtcat(out, "[%s]", ionodes);
		xfree(ionodes);
	}
	if (job_ptr->sched_nodes)
		xstrfmtcat(out, " Sched%s=%s", nodelist, job_ptr->sched_nodes);

	xstrcat(out, line_end);

	/****** Line 14 (optional) ******/
	if (job_ptr->batch_host) {
		xstrfmtcat(out, "BatchHost=%s", job_ptr->batch_host);
		xstrcat(out, line_end);
	}

	/****** Line 15 ******/
	if (cluster_flags & CLUSTER_FLAG_BG) {
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &min_nodes);
		if ((min_nodes == 0) || (min_nodes == NO_VAL)) {
			min_nodes = job_ptr->num_nodes;
			max_nodes = job_ptr->max_nodes;
		} else if (job_ptr->max_nodes)
			max_nodes = min_nodes;
	} else if (IS_JOB_PENDING(job_ptr)) {
		min_nodes = job_ptr->num_nodes;
		max_nodes = job_ptr->max_nodes;
		if (max_nodes && (max_nodes < min_nodes))
			min_nodes = max_nodes;
	} else {
		min_nodes = job_ptr->num_nodes;
		max_nodes = 0;
	}

	_sprint_range(tmp_line, sizeof(tmp_line), min_nodes, max_nodes);
	xstrfmtcat(out, "NumNodes=%s ", tmp_line);
	_sprint_range(tmp_line, sizeof(tmp_line), job_ptr->num_cpus, job_ptr->max_cpus);
	xstrfmtcat(out, "NumCPUs=%s ", tmp_line);

	xstrfmtcat(out, "NumTasks=%u ", job_ptr->num_tasks);
	xstrfmtcat(out, "CPUs/Task=%u ", job_ptr->cpus_per_task);

	if (job_ptr->boards_per_node == (uint16_t) NO_VAL)
		xstrcat(out, "ReqB:S:C:T=*:");
	else
		xstrfmtcat(out, "ReqB:S:C:T=%u:", job_ptr->boards_per_node);

	if (job_ptr->sockets_per_board == (uint16_t) NO_VAL)
		xstrcat(out, "*:");
	else
		xstrfmtcat(out, "%u:", job_ptr->sockets_per_board);

	if (job_ptr->cores_per_socket == (uint16_t) NO_VAL)
		xstrcat(out, "*:");
	else
		xstrfmtcat(out, "%u:", job_ptr->cores_per_socket);

	if (job_ptr->threads_per_core == (uint16_t) NO_VAL)
		xstrcat(out, "*");
	else
		xstrfmtcat(out, "%u", job_ptr->threads_per_core);

	xstrcat(out, line_end);

	/****** Line 16 ******/
	/* TRES should already have been converted from its simple form at this point */
	xstrfmtcat(out, "TRES=%s",
		   job_ptr->tres_alloc_str ? job_ptr->tres_alloc_str
					   : job_ptr->tres_req_str);
	xstrcat(out, line_end);

	/****** Line 17 ******/
	if (job_ptr->sockets_per_node == (uint16_t) NO_VAL)
		xstrcat(out, "Socks/Node=* ");
	else
		xstrfmtcat(out, "Socks/Node=%u ", job_ptr->sockets_per_node);

	if (job_ptr->ntasks_per_node == (uint16_t) NO_VAL)
		xstrcat(out, "NtasksPerN:B:S:C=*:");
	else
		xstrfmtcat(out, "NtasksPerN:B:S:C=%u:", job_ptr->ntasks_per_node);

	if (job_ptr->ntasks_per_board == (uint16_t) NO_VAL)
		xstrcat(out, "*:");
	else
		xstrfmtcat(out, "%u:", job_ptr->ntasks_per_board);

	if ((job_ptr->ntasks_per_socket == (uint16_t) NO_VAL) ||
	    (job_ptr->ntasks_per_socket == (uint16_t) INFINITE))
		xstrcat(out, "*:");
	else
		xstrfmtcat(out, "%u:", job_ptr->ntasks_per_socket);

	if ((job_ptr->ntasks_per_core == (uint16_t) NO_VAL) ||
	    (job_ptr->ntasks_per_core == (uint16_t) INFINITE))
		xstrcat(out, "* ");
	else
		xstrfmtcat(out, "%u ", job_ptr->ntasks_per_core);

	if (job_ptr->core_spec == (uint16_t) NO_VAL)
		xstrcat(out, "CoreSpec=*");
	else if (job_ptr->core_spec & CORE_SPEC_THREAD)
		xstrfmtcat(out, "ThreadSpec=%d",
			   (job_ptr->core_spec & (~CORE_SPEC_THREAD)));
	else
		xstrfmtcat(out, "CoreSpec=%u", job_ptr->core_spec);

	xstrcat(out, line_end);

	if (job_resrcs && cluster_flags & CLUSTER_FLAG_BG) {
		if ((job_resrcs->cpu_array_cnt > 0) &&
		    (job_resrcs->cpu_array_value) &&
		    (job_resrcs->cpu_array_reps)) {
			int length = 0;
			xstrcat(out, "CPUs=");
			for (i = 0; i < job_resrcs->cpu_array_cnt; i++) {
				/* only print 60 characters worth of this record */
				if (length > 60) {
					/* skip to last CPU group entry */
					if (i < job_resrcs->cpu_array_cnt - 1) {
						continue;
					}
					/* add ellipsis before last entry */
					xstrcat(out, "...,");
				}

				length += xstrfmtcat(out, "%d", job_resrcs->cpus[i]);
				if (job_resrcs->cpu_array_reps[i] > 1) {
					length += xstrfmtcat(out, "*%d",
							     job_resrcs->cpu_array_reps[i]);
				}
				if (i < job_resrcs->cpu_array_cnt - 1) {
					xstrcat(out, ",");
					length++;
				}
			}
			xstrcat(out, line_end);
		}
	} else if (job_resrcs && job_resrcs->core_bitmap &&
		   ((last = bit_fls(job_resrcs->core_bitmap)) != -1)) {
		hl = hostlist_create(job_resrcs->nodes);
		if (!hl) {
			error("slurm_sprint_job_info: hostlist_create: %s",
			      job_resrcs->nodes);
			return NULL;
		}
		hl_last = hostlist_create(NULL);
		if (!hl_last) {
			error("slurm_sprint_job_info: hostlist_create: NULL");
			hostlist_destroy(hl);
			return NULL;
		}

		bit_inx = 0;
		i = sock_inx = sock_reps = 0;
		abs_node_inx = job_ptr->node_inx[i];

		/* tmp1[] stores the current cpu(s) allocated */
		tmp2[0] = '\0';	/* stores last cpu(s) allocated */
		for (rel_node_inx=0; rel_node_inx < job_resrcs->nhosts;
		     rel_node_inx++) {

			if (sock_reps >=
			    job_resrcs->sock_core_rep_count[sock_inx]) {
				sock_inx++;
				sock_reps = 0;
			}
			sock_reps++;

			bit_reps = job_resrcs->sockets_per_node[sock_inx] *
				   job_resrcs->cores_per_socket[sock_inx];
			host = hostlist_shift(hl);
			threads = _threads_per_core(host);
			cpu_bitmap = bit_alloc(bit_reps * threads);
			for (j = 0; j < bit_reps; j++) {
				if (bit_test(job_resrcs->core_bitmap, bit_inx)){
					for (k = 0; k < threads; k++)
						bit_set(cpu_bitmap,
							(j * threads) + k);
				}
				bit_inx++;
			}
			bit_fmt(tmp1, sizeof(tmp1), cpu_bitmap);
			FREE_NULL_BITMAP(cpu_bitmap);
			/*
			 * If the allocation values for this host are not the
			 * same as the last host, print the report of the last
			 * group of hosts that had identical allocation values.
			 */
			if (xstrcmp(tmp1, tmp2) ||
			    (last_mem_alloc_ptr != job_resrcs->memory_allocated) ||
			    (job_resrcs->memory_allocated &&
			     (last_mem_alloc !=
			      job_resrcs->memory_allocated[rel_node_inx]))) {
				if (hostlist_count(hl_last)) {
					last_hosts =
						hostlist_ranged_string_xmalloc(
						hl_last);
					xstrfmtcat(out,
						   "  Nodes=%s CPU_IDs=%s Mem=%u",
						   last_hosts, tmp2,
						   last_mem_alloc_ptr ?
						   last_mem_alloc : 0);
					xfree(last_hosts);
					xstrcat(out, line_end);

					hostlist_destroy(hl_last);
					hl_last = hostlist_create(NULL);
				}
				strcpy(tmp2, tmp1);
				last_mem_alloc_ptr = job_resrcs->memory_allocated;
				if (last_mem_alloc_ptr)
					last_mem_alloc = job_resrcs->
						memory_allocated[rel_node_inx];
				else
					last_mem_alloc = NO_VAL;
			}
			hostlist_push_host(hl_last, host);
			free(host);

			if (bit_inx > last)
				break;

			if (abs_node_inx > job_ptr->node_inx[i+1]) {
				i += 2;
				abs_node_inx = job_ptr->node_inx[i];
			} else {
				abs_node_inx++;
			}
		}

		if (hostlist_count(hl_last)) {
			last_hosts = hostlist_ranged_string_xmalloc(hl_last);
			xstrfmtcat(out, "  Nodes=%s CPU_IDs=%s Mem=%u",
				 last_hosts, tmp2,
				 last_mem_alloc_ptr ? last_mem_alloc : 0);
			xfree(last_hosts);
			xstrcat(out, line_end);
		}
		hostlist_destroy(hl);
		hostlist_destroy(hl_last);
	}
	/****** Line 18 ******/
	if (job_ptr->pn_min_memory & MEM_PER_CPU) {
		job_ptr->pn_min_memory &= (~MEM_PER_CPU);
		tmp6_ptr = "CPU";
	} else
		tmp6_ptr = "Node";

	if (cluster_flags & CLUSTER_FLAG_BG) {
		convert_num_unit((float)job_ptr->pn_min_cpus, tmp1,
				 sizeof(tmp1), UNIT_NONE, NO_VAL,
				 CONVERT_NUM_UNIT_EXACT);
		xstrfmtcat(out, "MinCPUsNode=%s ", tmp1);
	} else {
		xstrfmtcat(out, "MinCPUsNode=%u ", job_ptr->pn_min_cpus);
	}

	convert_num_unit((float)job_ptr->pn_min_memory, tmp1, sizeof(tmp1),
			 UNIT_MEGA, NO_VAL, CONVERT_NUM_UNIT_EXACT);
	convert_num_unit((float)job_ptr->pn_min_tmp_disk, tmp2, sizeof(tmp2),
			 UNIT_MEGA, NO_VAL, CONVERT_NUM_UNIT_EXACT);
	xstrfmtcat(out, "MinMemory%s=%s MinTmpDiskNode=%s", tmp6_ptr, tmp1, tmp2);
	xstrcat(out, line_end);

	/****** Line ******/
	secs2time_str((time_t)job_ptr->delay_boot, tmp1, sizeof(tmp1));
	xstrfmtcat(out, "Features=%s DelayBoot=%s", job_ptr->features, tmp1);
	xstrcat(out, line_end);

	/****** Line ******/
	xstrfmtcat(out, "Gres=%s Reservation=%s",
		   job_ptr->gres, job_ptr->resv_name);
	xstrcat(out, line_end);

	/****** Line 20 ******/
	xstrfmtcat(out, "OverSubscribe=%s Contiguous=%d Licenses=%s Network=%s",
		   job_share_string(job_ptr->shared), job_ptr->contiguous,
		   job_ptr->licenses, job_ptr->network);
	xstrcat(out, line_end);

	/****** Line 21 ******/
	xstrfmtcat(out, "Command=%s", job_ptr->command);
	xstrcat(out, line_end);

	/****** Line 22 ******/
	xstrfmtcat(out, "WorkDir=%s", job_ptr->work_dir);

	if (cluster_flags & CLUSTER_FLAG_BG) {
		/****** Line 23 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_BG_ID);
		if (select_buf[0] != '\0') {
			xstrcat(out, line_end);
			xstrfmtcat(out, "Block_ID=%s", select_buf);
		}

		/****** Line 24 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_MIXED_SHORT);
		if (select_buf[0] != '\0') {
			xstrcat(out, line_end);
			xstrcat(out, select_buf);
		}

		if (cluster_flags & CLUSTER_FLAG_BGL) {
			/****** Line 25 (optional) ******/
			select_g_select_jobinfo_sprint(
				job_ptr->select_jobinfo,
				select_buf, sizeof(select_buf),
				SELECT_PRINT_BLRTS_IMAGE);
			if (select_buf[0] != '\0') {
				xstrcat(out, line_end);
				xstrfmtcat(out, "BlrtsImage=%s", select_buf);
			}
		}
		/****** Line 26 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_LINUX_IMAGE);
		if (select_buf[0] != '\0') {
			xstrcat(out, line_end);
			if (cluster_flags & CLUSTER_FLAG_BGL)
				xstrfmtcat(out, "LinuxImage=%s", select_buf);
			else
				xstrfmtcat(out, "CnloadImage=%s", select_buf);
		}
		/****** Line 27 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_MLOADER_IMAGE);
		if (select_buf[0] != '\0') {
			xstrcat(out, line_end);
			xstrfmtcat(out, "MloaderImage=%s", select_buf);
		}
		/****** Line 28 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_RAMDISK_IMAGE);
		if (select_buf[0] != '\0') {
			xstrcat(out, line_end);
			if (cluster_flags & CLUSTER_FLAG_BGL)
				xstrfmtcat(out, "RamDiskImage=%s", select_buf);
			else
				xstrfmtcat(out, "IoloadImage=%s", select_buf);
		}
	}

	/****** Line (optional) ******/
	if (job_ptr->admin_comment) {
		xstrcat(out, line_end);
		xstrfmtcat(out, "AdminComment=%s ", job_ptr->admin_comment);
	}

	/****** Line (optional) ******/
	if (job_ptr->comment) {
		xstrcat(out, line_end);
		xstrfmtcat(out, "Comment=%s ", job_ptr->comment);
	}

	/****** Line 30 (optional) ******/
	if (job_ptr->batch_flag) {
		xstrcat(out, line_end);
		slurm_get_job_stderr(tmp_path, sizeof(tmp_path), job_ptr);
		xstrfmtcat(out, "StdErr=%s", tmp_path);
	}

	/****** Line 31 (optional) ******/
	if (job_ptr->batch_flag) {
		xstrcat(out, line_end);
		slurm_get_job_stdin(tmp_path, sizeof(tmp_path), job_ptr);
		xstrfmtcat(out, "StdIn=%s", tmp_path);
	}

	/****** Line 32 (optional) ******/
	if (job_ptr->batch_flag) {
		xstrcat(out, line_end);
		slurm_get_job_stdout(tmp_path, sizeof(tmp_path), job_ptr);
		xstrfmtcat(out, "StdOut=%s", tmp_path);
	}

	/****** Line 33 (optional) ******/
	if (job_ptr->batch_script) {
		xstrcat(out, line_end);
		xstrcat(out, "BatchScript=\n");
		xstrcat(out, job_ptr->batch_script);
	}

	/****** Line 34 (optional) ******/
	if (job_ptr->req_switch) {
		char time_buf[32];
		xstrcat(out, line_end);
		secs2time_str((time_t) job_ptr->wait4switch, time_buf,
			      sizeof(time_buf));
		xstrfmtcat(out, "Switches=%u@%s\n", job_ptr->req_switch, time_buf);
	}

	/****** Line 35 (optional) ******/
	if (job_ptr->burst_buffer) {
		xstrcat(out, line_end);
		xstrfmtcat(out, "BurstBuffer=%s", job_ptr->burst_buffer);
	}

	/****** Line (optional) ******/
	if (job_ptr->burst_buffer_state) {
		xstrcat(out, line_end);
		xstrfmtcat(out, "BurstBufferState=%s",
			   job_ptr->burst_buffer_state);
	}

	/****** Line 36 (optional) ******/
	if (cpu_freq_debug(NULL, NULL, tmp1, sizeof(tmp1),
			   job_ptr->cpu_freq_gov, job_ptr->cpu_freq_min,
			   job_ptr->cpu_freq_max, NO_VAL) != 0) {
		xstrcat(out, line_end);
		xstrcat(out, tmp1);
	}

	/****** Line 37 ******/
	xstrcat(out, line_end);
	xstrfmtcat(out, "Power=%s", power_flags_str(job_ptr->power_flags));

	/****** Line 38 (optional) ******/
	if (job_ptr->bitflags) {
		xstrcat(out, line_end);
		if (job_ptr->bitflags & KILL_INV_DEP)
			xstrcat(out, "KillOInInvalidDependent=Yes");
		if (job_ptr->bitflags & NO_KILL_INV_DEP)
			xstrcat(out, "KillOInInvalidDependent=No");
		if (job_ptr->bitflags & SPREAD_JOB)
			xstrcat(out, "SpreadJob=Yes");
	}

	/****** END OF JOB RECORD ******/
	if (one_liner)
		xstrcat(out, "\n");
	else
		xstrcat(out, "\n\n");

	return out;
}
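A minimal caller sketch for the function above, assuming the public slurm.h API (slurm_load_jobs, slurm_free_job_info_msg, slurm_perror) and freeing the returned string with free(), since the library allocates it on the heap:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <slurm/slurm.h>

/* Hypothetical driver: load all jobs and print each record. */
int main(void)
{
	job_info_msg_t *jobs = NULL;
	uint32_t i;

	if (slurm_load_jobs((time_t) 0, &jobs, SHOW_ALL) != 0) {
		slurm_perror("slurm_load_jobs");
		return 1;
	}
	for (i = 0; i < jobs->record_count; i++) {
		char *txt = slurm_sprint_job_info(&jobs->job_array[i], 0);
		if (txt) {
			printf("%s", txt);
			free(txt);	/* caller owns the formatted string */
		}
	}
	slurm_free_job_info_msg(jobs);
	return 0;
}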
Example #2
/**
 * Check page (used or not)
 */
static inline bool bitmask_check(struct PagePool *pp, pageno_t n) {
	return bit_test(pp->bitmask, n);
}
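For context, a self-contained sketch of the same used/free check over a plain word array; the PagePool layout here is illustrative, not the project's actual structure:

#include <stdbool.h>
#include <stdint.h>

typedef uint32_t pageno_t;

struct PagePool {
	uint64_t *bitmask;	/* one bit per page: 1 = used, 0 = free */
	pageno_t  page_count;
};

/* Equivalent of bit_test(): true when page n is marked as used. */
static inline bool pagepool_page_used(const struct PagePool *pp, pageno_t n)
{
	return (pp->bitmask[n / 64] >> (n % 64)) & 1u;
}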
Example #3
File: grid.c Project: IFCA/slurm
static int _grid_table_by_switch(button_processor_t *button_processor,
				 List node_list)
{
	int rc = SLURM_SUCCESS;
	int inx = 0, ii = 0;
	switch_record_bitmaps_t *sw_nodes_bitmaps_ptr = g_switch_nodes_maps;
#if TOPO_DEBUG
	/* enable this to show the original display below the switch view */
	ListIterator itr = list_iterator_create(node_list);
	sview_node_info_t *sview_node_info_ptr = NULL;
#endif
	button_processor->inx = &inx;
	for (ii=0; ii<g_topo_info_msg_ptr->record_count;
	     ii++, sw_nodes_bitmaps_ptr++) {
		int j = 0, first, last;
		if (g_topo_info_msg_ptr->topo_array[ii].level)
			continue;
		first = bit_ffs(sw_nodes_bitmaps_ptr->node_bitmap);
		if (first == -1)
			continue;
		last = bit_fls(sw_nodes_bitmaps_ptr->node_bitmap);
		button_processor->inx = &j;
		button_processor->force_row_break = FALSE;
		for (j = first; j <= last; j++) {
			if (TOPO_DEBUG)
				g_print("allocated node = %s button# %d\n",
					g_node_info_ptr->node_array[j].name,
					j);
			if (!bit_test(sw_nodes_bitmaps_ptr->node_bitmap, j))
				continue;
			/* if (!working_sview_config.show_hidden) { */
			/* 	if (!check_part_includes_node(j)) */
			/* 		continue; */
			/* } */
			if (j == last)
				button_processor->force_row_break = TRUE;
			if ((rc = _add_button_to_list(
				     &g_node_info_ptr->node_array[j],
				     button_processor)) != SLURM_SUCCESS)
				break;
			button_processor->force_row_break = FALSE;
		}
		rc = _add_button_to_list(NULL, button_processor);
	}

#if TOPO_DEBUG
	/* enable this to show the original display below the
	 * switched grid */
	 button_processor->inx = &inx;
	 while ((sview_node_info_ptr = list_next(itr))) {
		 if ((rc = _add_button_to_list(
				sview_node_info_ptr->node_ptr,
	 			button_processor)) != SLURM_SUCCESS)
			 break;
	 	inx++;
	 }
	 list_iterator_destroy(itr);
#endif

	/* This is needed to get the correct width of the grid window.
	 * If it is not given then we get a really narrow window. */
	gtk_table_set_row_spacing(button_processor->table,
				  (*button_processor->coord_y)?
				  ((*button_processor->coord_y)-1):0, 1);

	return rc;

}
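A condensed, standalone sketch of the first/last/test scan used above, with single-word stand-ins for bit_ffs, bit_fls and bit_test (GCC/Clang builtins); the real bitstr_t maps are arbitrarily long:

#include <stdint.h>
#include <stdio.h>

static int word_ffs(uint64_t w) { return w ? __builtin_ctzll(w) : -1; }
static int word_fls(uint64_t w) { return w ? 63 - __builtin_clzll(w) : -1; }

static void visit_switch_nodes(uint64_t node_bitmap)
{
	int first = word_ffs(node_bitmap);
	if (first == -1)
		return;			/* this switch has no nodes, skip it */
	int last = word_fls(node_bitmap);
	for (int j = first; j <= last; j++) {
		if (!(node_bitmap >> j & 1))
			continue;	/* node not attached to this switch */
		printf("node %d belongs to this switch\n", j);
	}
}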
Example #4
/*
 * Determine which of these nodes are usable by this job
 *
 * Remove nodes from the bitmap that don't have enough memory or gres to
 * support the job. 
 *
 * Return SLURM_ERROR if a required node can't be used.
 *
 * if node_state = NODE_CR_RESERVED, clear bitmap (if node is required
 *                                   then should we return NODE_BUSY!?!)
 *
 * if node_state = NODE_CR_ONE_ROW, then this node can only be used by
 *                                  another NODE_CR_ONE_ROW job
 *
 * if node_state = NODE_CR_AVAILABLE AND:
 *  - job_node_req = NODE_CR_RESERVED, then we need idle nodes
 *  - job_node_req = NODE_CR_ONE_ROW, then we need idle or non-sharing nodes
 */
static int _verify_node_state(struct part_res_record *cr_part_ptr,
			      struct job_record *job_ptr, bitstr_t * bitmap,
			      uint16_t cr_type,
			      struct node_use_record *node_usage,
			      enum node_cr_state job_node_req)
{
	struct node_record *node_ptr;
	uint32_t i, free_mem, gres_cpus, min_mem;
	int i_first, i_last;
	List gres_list;

	if (job_ptr->details->pn_min_memory & MEM_PER_CPU)
		min_mem = job_ptr->details->pn_min_memory & (~MEM_PER_CPU);
	else
		min_mem = job_ptr->details->pn_min_memory;
	i_first = bit_ffs(bitmap);
	if (i_first >= 0)
		i_last  = bit_fls(bitmap);
	else
		i_last = -2;
	for (i = i_first; i <= i_last; i++) {
		if (!bit_test(bitmap, i))
			continue;
		node_ptr = select_node_record[i].node_ptr;

		/* node-level memory check */
		if ((job_ptr->details->pn_min_memory) &&
		    (cr_type & CR_MEMORY)) {
			free_mem  = select_node_record[i].real_memory;
			free_mem -= node_usage[i].alloc_memory;
			if (free_mem < min_mem) {
				debug3("select/serial: node %s no mem %u < %u",
					select_node_record[i].node_ptr->name,
					free_mem, min_mem);
				goto clear_bit;
			}
		}

		/* node-level gres check */
		if (node_usage[i].gres_list)
			gres_list = node_usage[i].gres_list;
		else
			gres_list = node_ptr->gres_list;
		gres_cpus = gres_plugin_job_test(job_ptr->gres_list, 
						 gres_list, true,
						 NULL, 0, 0, job_ptr->job_id,
						 node_ptr->name);
		if (gres_cpus == 0) {
			debug3("select/serial: node %s lacks gres",
			       node_ptr->name);
			goto clear_bit;
		}

		/* exclusive node check */
		if (node_usage[i].node_state >= NODE_CR_RESERVED) {
			debug3("select/serial: node %s in exclusive use",
			       node_ptr->name);
			goto clear_bit;

		/* non-resource-sharing node check */
		} else if (node_usage[i].node_state >= NODE_CR_ONE_ROW) {
			if ((job_node_req == NODE_CR_RESERVED) ||
			    (job_node_req == NODE_CR_AVAILABLE)) {
				debug3("select/serial: node %s non-sharing",
				       node_ptr->name);
				goto clear_bit;
			}
			/* cannot use this node if it is running jobs
			 * in sharing partitions */
			if (_is_node_busy(cr_part_ptr, i, 1,
					  job_ptr->part_ptr)) {
				debug3("select/serial: node %s sharing?",
				       node_ptr->name);
				goto clear_bit;
			}

		/* node is NODE_CR_AVAILABLE - check job request */
		} else {
			if (job_node_req == NODE_CR_RESERVED) {
				if (_is_node_busy(cr_part_ptr, i, 0,
						  job_ptr->part_ptr)) {
					debug3("select/serial:  node %s busy",
					       node_ptr->name);
					goto clear_bit;
				}
			} else if (job_node_req == NODE_CR_ONE_ROW) {
				/* cannot use this node if it is running jobs
				 * in sharing partitions */
				if (_is_node_busy(cr_part_ptr, i, 1, 
						  job_ptr->part_ptr)) {
					debug3("select/serial: node %s vbusy",
					       node_ptr->name);
					goto clear_bit;
				}
			}
		}
		continue;	/* node is usable, test next node */

clear_bit:	/* This node is not usable by this job */
		bit_clear(bitmap, i);
		if (job_ptr->details->req_node_bitmap &&
		    bit_test(job_ptr->details->req_node_bitmap, i)) {
			return SLURM_ERROR;
		}
	}

	return SLURM_SUCCESS;
}
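Both this check and slurm_sprint_job_info above decode pn_min_memory the same way: a high-bit flag selects per-CPU versus per-node semantics. A minimal sketch with an illustrative flag value (the real constant is MEM_PER_CPU from slurm.h):

#include <stdint.h>
#include <stdio.h>

#define EX_MEM_PER_CPU 0x80000000u	/* illustrative, not the slurm.h value */

static void print_min_mem(uint32_t pn_min_memory)
{
	if (pn_min_memory & EX_MEM_PER_CPU)
		printf("MinMemoryCPU=%u\n", pn_min_memory & ~EX_MEM_PER_CPU);
	else
		printf("MinMemoryNode=%u\n", pn_min_memory);
}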
Example #5
/* cr_job_test - does most of the real work for select_p_job_test(), which
 *	includes contiguous selection, load-leveling and max_share logic
 *
 * PROCEDURE:
 *
 * Step 1: compare nodes in "avail" bitmap with current node state data
 *         to find available nodes that match the job request
 *
 * Step 2: check resources in "avail" bitmap with allocated resources from
 *         higher priority partitions (busy resources are UNavailable)
 *
 * Step 3: select resource usage on remaining resources in "avail" bitmap
 *         for this job, with the placement influenced by existing
 *         allocations
 */
extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, int mode,
			uint16_t cr_type, enum node_cr_state job_node_req, 
			uint32_t cr_node_cnt,
			struct part_res_record *cr_part_ptr,
			struct node_use_record *node_usage)
{
	static int gang_mode = -1;
	int error_code = SLURM_SUCCESS;
	bitstr_t *orig_map, *avail_cores, *free_cores;
	bitstr_t *tmpcore = NULL;
	bool test_only;
	uint32_t c, i, j, k, n, csize, save_mem = 0;
	job_resources_t *job_res;
	struct job_details *details_ptr;
	struct part_res_record *p_ptr, *jp_ptr;
	uint16_t *cpu_count;

	if (gang_mode == -1) {
		if (slurm_get_preempt_mode() & PREEMPT_MODE_GANG)
			gang_mode = 1;
		else
			gang_mode = 0;
	}

	details_ptr = job_ptr->details;

	free_job_resources(&job_ptr->job_resrcs);

	if (mode == SELECT_MODE_TEST_ONLY)
		test_only = true;
	else	/* SELECT_MODE_RUN_NOW || SELECT_MODE_WILL_RUN  */
		test_only = false;

	/* check node_state and update the node bitmap as necessary */
	if (!test_only) {
		error_code = _verify_node_state(cr_part_ptr, job_ptr,
						bitmap, cr_type, node_usage,
						job_node_req);
		if (error_code != SLURM_SUCCESS)
			return error_code;
	}

	if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
		info("select/serial: evaluating job %u on %u nodes",
		     job_ptr->job_id, bit_set_count(bitmap));
	}

	orig_map = bit_copy(bitmap);
	avail_cores = _make_core_bitmap(bitmap);

	/* test to make sure that this job can succeed with all avail_cores
	 * if 'no' then return FAIL
	 * if 'yes' then we will seek the optimal placement for this job
	 *          within avail_cores
	 */
	free_cores = bit_copy(avail_cores);
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count == NULL) {
		/* job cannot fit */
		FREE_NULL_BITMAP(orig_map);
		FREE_NULL_BITMAP(free_cores);
		FREE_NULL_BITMAP(avail_cores);
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("select/serial: cr_job_test: test 0 fail: "
			     "insufficient resources");
		}
		return SLURM_ERROR;
	} else if (test_only) {
		FREE_NULL_BITMAP(orig_map);
		FREE_NULL_BITMAP(free_cores);
		FREE_NULL_BITMAP(avail_cores);
		xfree(cpu_count);
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND)
			info("select/serial: cr_job_test: test 0 pass: "******"test_only");
		return SLURM_SUCCESS;
	}
	if (cr_type == CR_MEMORY) {
		/* CR_MEMORY does not care about existing CPU allocations,
		 * so we can jump right to job allocation from here */
		goto alloc_job;
	}
	xfree(cpu_count);
	if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
		info("select/serial: cr_job_test: test 0 pass - "
		     "job fits on given resources");
	}

	/* now that we know that this job can run with the given resources,
	 * let's factor in the existing allocations and seek the optimal set
	 * of resources for this job. Here is the procedure:
	 *
	 * Step 1: Seek idle CPUs across all partitions. If successful then
	 *         place job and exit. If not successful, then continue. Two
	 *         related items to note:
	 *          1. Jobs that don't share CPUs finish with step 1.
	 *          2. The remaining steps assume sharing or preemption.
	 *
	 * Step 2: Remove resources that are in use by higher-priority
	 *         partitions, and test that job can still succeed. If not
	 *         then exit.
	 *
	 * Step 3: Seek idle nodes among the partitions with the same
	 *         priority as the job's partition. If successful then
	 *         goto Step 6. If not then continue:
	 *
	 * Step 4: Seek placement within the job's partition. Search
	 *         row-by-row. If no placement if found, then exit. If a row
	 *         is found, then continue:
	 *
	 * Step 5: Place job and exit. FIXME! Here is where we need a
	 *         placement algorithm that recognizes existing job
	 *         boundaries and tries to "overlap jobs" as efficiently
	 *         as possible.
	 *
	 * Step 6: Place job and exit. FIXME! here is we use a placement
	 *         algorithm similar to Step 5 on jobs from lower-priority
	 *         partitions.
	 */


	/*** Step 1 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	/* remove all existing allocations from free_cores */
	tmpcore = bit_copy(free_cores);
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count) {
		/* job fits! We're done. */
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("select/serial: cr_job_test: test 1 pass - "
			     "idle resources found");
		}
		goto alloc_job;
	}

	if ((gang_mode == 0) && (job_node_req == NODE_CR_ONE_ROW)) {
		/* This job CANNOT share CPUs regardless of priority,
		 * so we fail here. Note that Shared=EXCLUSIVE was already
		 * addressed in _verify_node_state() and job preemption
		 * removes jobs from simulated resource allocation map
		 * before this point. */
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("select/serial: cr_job_test: test 1 fail - "
			     "no idle resources available");
		}
		goto alloc_job;
	}
	if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
		info("select/serial: cr_job_test: test 1 fail - "
		     "not enough idle resources");
	}

	/*** Step 2 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	for (jp_ptr = cr_part_ptr; jp_ptr; jp_ptr = jp_ptr->next) {
		if (jp_ptr->part_ptr == job_ptr->part_ptr)
			break;
	}
	if (!jp_ptr) {
		fatal("select/serial: could not find partition for job %u",
			job_ptr->job_id);
	}

	/* remove existing allocations (jobs) from higher-priority partitions
	 * from avail_cores */
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (p_ptr->part_ptr->priority <= jp_ptr->part_ptr->priority)
			continue;
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	/* make these changes permanent */
	bit_copybits(avail_cores, free_cores);
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (!cpu_count) {
		/* job needs resources that are currently in use by
		 * higher-priority jobs, so fail for now */
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("select/serial: cr_job_test: test 2 fail - "
			     "resources busy with higher priority jobs");
		}
		goto alloc_job;
	}
	xfree(cpu_count);
	if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
		info("select/serial: cr_job_test: test 2 pass - "
		     "available resources for this priority");
	}

	/*** Step 3 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	/* remove existing allocations (jobs) from same-priority partitions
	 * from avail_cores */
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (p_ptr->part_ptr->priority != jp_ptr->part_ptr->priority)
			continue;
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count) {
		/* jobs from low-priority partitions are the only thing left
		 * in our way. for now we'll ignore them, but FIXME: we need
		 * a good placement algorithm here that optimizes "job overlap"
		 * between this job (in these idle nodes) and the low-priority
		 * jobs */
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("select/serial: cr_job_test: test 3 pass - "
			     "found resources");
		}
		goto alloc_job;
	}
	if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
		info("select/serial: cr_job_test: test 3 fail - "
		     "not enough idle resources in same priority");
	}


	/*** Step 4 ***/
	/* try to fit the job into an existing row
	 *
	 * tmpcore = worker core_bitmap
	 * free_cores = core_bitmap to be built
	 * avail_cores = static core_bitmap of all available cores
	 */

	if (jp_ptr->row == NULL) {
		/* there's no existing jobs in this partition, so place
		 * the job in avail_cores. FIXME: still need a good
		 * placement algorithm here that optimizes "job overlap"
		 * between this job (in these idle nodes) and existing
		 * jobs in the other partitions with <= priority to
		 * this partition */
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("select/serial: cr_job_test: test 4 pass - "
			     "first row found");
		}
		goto alloc_job;
	}

	cr_sort_part_rows(jp_ptr);
	c = jp_ptr->num_rows;
	if (job_node_req != NODE_CR_AVAILABLE)
		c = 1;
	for (i = 0; i < c; i++) {
		if (!jp_ptr->row[i].row_bitmap)
			break;
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		bit_copybits(tmpcore, jp_ptr->row[i].row_bitmap);
		bit_not(tmpcore);
		bit_and(free_cores, tmpcore);
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
		if (cpu_count) {
			if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
				info("select/serial: cr_job_test: "
				     "test 4 pass - row %i", i);
			}
			break;
		}
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("select/serial: cr_job_test: "
			     "test 4 fail - row %i", i);
		}
	}

	if ((i < c) && !jp_ptr->row[i].row_bitmap) {
		/* we've found an empty row, so use it */
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("select/serial: cr_job_test: "
			     "test 4 trying empty row %i",i);
		}
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
	}

	if (!cpu_count) {
		/* job can't fit into any row, so exit */
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("select/serial: cr_job_test: test 4 fail - "
			     "busy partition");
		}
		goto alloc_job;

	}

	/*** CONSTRUCTION ZONE FOR STEPs 5 AND 6 ***
	 * Note that while the job may have fit into a row, it should
	 * still be run through a good placement algorithm here that
	 * optimizes "job overlap" between this job (in these idle nodes)
	 * and existing jobs in the other partitions with <= priority to
	 * this partition */

alloc_job:
	/* at this point we've found a good set of
	 * bits to allocate to this job:
	 * - bitmap is the set of nodes to allocate
	 * - free_cores is the set of allocated cores
	 * - cpu_count is the number of cpus per allocated node
	 *
	 * Next steps are to cleanup the worker variables,
	 * create the job_resources struct,
	 * distribute the job on the bits, and exit
	 */
	FREE_NULL_BITMAP(orig_map);
	FREE_NULL_BITMAP(avail_cores);
	FREE_NULL_BITMAP(tmpcore);
	if (!cpu_count) {
		/* we were sent here to cleanup and exit */
		FREE_NULL_BITMAP(free_cores);
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("select/serial: exiting cr_job_test with no "
			     "allocation");
		}
		return SLURM_ERROR;
	}

	/* At this point we have:
	 * - a bitmap of selected nodes
	 * - a free_cores bitmap of usable cores on each selected node
	 * - a per-alloc-node cpu_count array
	 */

	if ((mode != SELECT_MODE_WILL_RUN) && (job_ptr->part_ptr == NULL))
		error_code = EINVAL;
	if ((error_code == SLURM_SUCCESS) && (mode == SELECT_MODE_WILL_RUN))
		job_ptr->total_cpus = 1;
	if ((error_code != SLURM_SUCCESS) || (mode != SELECT_MODE_RUN_NOW)) {
		FREE_NULL_BITMAP(free_cores);
		xfree(cpu_count);
		return error_code;
	}

	n = bit_ffs(bitmap);
	if ((int32_t) n < 0) {	/* n is unsigned; bit_ffs() returns -1 if no bit is set */
		FREE_NULL_BITMAP(free_cores);
		xfree(cpu_count);
		return error_code;
	}

	if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
		info("select/serial: cr_job_test: distributing job %u",
		     job_ptr->job_id);
	}

	/** create the struct_job_res  **/
	job_res                   = create_job_resources();
	job_res->node_bitmap      = bit_copy(bitmap);
	job_res->nodes            = bitmap2node_name(bitmap);
	if (job_res->node_bitmap == NULL)
		fatal("bit_copy malloc failure");
	job_res->nhosts           = bit_set_count(bitmap);
	job_res->ncpus            = job_res->nhosts;
	if (job_ptr->details->ntasks_per_node)
		job_res->ncpus   *= details_ptr->ntasks_per_node;
	job_res->ncpus            = MAX(job_res->ncpus,
					details_ptr->min_cpus);
	job_res->ncpus            = MAX(job_res->ncpus,
					details_ptr->pn_min_cpus);
	job_res->node_req         = job_node_req;
	job_res->cpus             = cpu_count;
	job_res->cpus_used        = xmalloc(job_res->nhosts *
					    sizeof(uint16_t));
	job_res->memory_allocated = xmalloc(job_res->nhosts *
					    sizeof(uint32_t));
	job_res->memory_used      = xmalloc(job_res->nhosts *
					    sizeof(uint32_t));

	/* store the hardware data for the selected nodes */
	error_code = build_job_resources(job_res, node_record_table_ptr,
					  select_fast_schedule);
	if (error_code != SLURM_SUCCESS) {
		free_job_resources(&job_res);
		FREE_NULL_BITMAP(free_cores);
		return error_code;
	}

	c = 0;
	csize = bit_size(job_res->core_bitmap);
	j = cr_get_coremap_offset(n);
	k = cr_get_coremap_offset(n + 1);
	for (; j < k; j++, c++) {
		if (!bit_test(free_cores, j))
			continue;
		if (c >= csize)	{
			error("select/serial: cr_job_test "
			      "core_bitmap index error on node %s", 
			      select_node_record[n].node_ptr->name);
			drain_nodes(select_node_record[n].node_ptr->name,
				    "Bad core count", getuid());
			free_job_resources(&job_res);
			FREE_NULL_BITMAP(free_cores);
			return SLURM_ERROR;
		}
		bit_set(job_res->core_bitmap, c);
		break;
	}

	if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
		info("select/serial: cr_job_test: job %u ncpus %u cbits %u/%d "
		     "nbits %u", job_ptr->job_id,
		     job_res->ncpus, bit_set_count(free_cores), 1,
		     job_res->nhosts);
	}
	FREE_NULL_BITMAP(free_cores);

	/* distribute the tasks and clear any unused cores */
	job_ptr->job_resrcs = job_res;
	error_code = cr_dist(job_ptr, cr_type);
	if (error_code != SLURM_SUCCESS) {
		free_job_resources(&job_ptr->job_resrcs);
		return error_code;
	}

	/* translate job_res->cpus array into format with rep count */
	job_ptr->total_cpus = build_job_resources_cpu_array(job_res);

	if (!(cr_type & CR_MEMORY))
		return error_code;

	/* load memory allocated array */
	save_mem = details_ptr->pn_min_memory;
	if (save_mem & MEM_PER_CPU) {
		/* memory is per-cpu */
		save_mem &= (~MEM_PER_CPU);
		job_res->memory_allocated[0] = job_res->cpus[0] * save_mem;
	} else {
		/* memory is per-node */
		job_res->memory_allocated[0] = save_mem;
	}
	return error_code;
}
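The masking step repeated in tests 1 through 4 above reduces to free_cores &= ~row_bitmap. A standalone word-array sketch of that operation (the real code uses bit_copybits, bit_not and bit_and on bitstr_t maps):

#include <stddef.h>
#include <stdint.h>

/* Clear every core that an existing row allocation already occupies. */
static void mask_out_allocated(uint64_t *free_cores,
			       const uint64_t *row_bitmap, size_t nwords)
{
	for (size_t i = 0; i < nwords; i++)
		free_cores[i] &= ~row_bitmap[i];
}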
Example #6
/*
 * Given a job step request, return an equivalent local bitmap for this node
 * IN req          - The job step launch request
 * OUT hw_sockets  - number of actual sockets on this node
 * OUT hw_cores    - number of actual cores per socket on this node
 * OUT hw_threads  - number of actual threads per core on this node
 * RET: bitmap of processors available to this job step on this node
 *      OR NULL on error
 */
static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req,
				uint16_t *hw_sockets, uint16_t *hw_cores,
				uint16_t *hw_threads)
{
	bitstr_t *req_map, *hw_map;
	slurm_cred_arg_t arg;
	uint16_t p, t, new_p, num_cpus, sockets, cores;
	int job_node_id;
	int start;
	char *str;

	*hw_sockets = conf->sockets;
	*hw_cores   = conf->cores;
	*hw_threads = conf->threads;

	if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) {
		error("task/affinity: job lacks a credential");
		return NULL;
	}

	/* we need this node's ID in relation to the whole
	 * job allocation, not just this jobstep */
	job_node_id = nodelist_find(arg.job_hostlist, conf->node_name);
	start = _get_local_node_info(&arg, job_node_id, &sockets, &cores);
	if (start < 0) {
		error("task/affinity: missing node %d in job credential",
		      job_node_id);
		slurm_cred_free_args(&arg);
		return NULL;
	}
	debug3("task/affinity: slurmctld s %u c %u; hw s %u c %u t %u",
	       sockets, cores, *hw_sockets, *hw_cores, *hw_threads);

	num_cpus = MIN((sockets * cores), ((*hw_sockets)*(*hw_cores)));
	req_map = (bitstr_t *) bit_alloc(num_cpus);
	hw_map  = (bitstr_t *) bit_alloc(conf->block_map_size);

	/* Transfer core_bitmap data to local req_map.
	 * The MOD function handles the case where fewer processes
	 * physically exist than are configured (slurmd is out of
	 * sync with the slurmctld daemon). */
	for (p = 0; p < (sockets * cores); p++) {
		if (bit_test(arg.step_core_bitmap, start+p))
			bit_set(req_map, (p % num_cpus));
	}

	str = (char *)bit_fmt_hexmask(req_map);
	debug3("task/affinity: job %u.%u CPU mask from slurmctld: %s",
		req->job_id, req->job_step_id, str);
	xfree(str);

	for (p = 0; p < num_cpus; p++) {
		if (bit_test(req_map, p) == 0)
			continue;
		/* If we are pretending we have a larger system than
		   we really have this is needed to make sure we
		   don't bust the bank.
		*/
		new_p = p % conf->block_map_size;
		/* core_bitmap does not include threads, so we
		 * add them here but limit them to what the job
		 * requested */
		for (t = 0; t < (*hw_threads); t++) {
			uint16_t bit = new_p * (*hw_threads) + t;
			bit %= conf->block_map_size;
			bit_set(hw_map, bit);
		}
	}

	str = (char *)bit_fmt_hexmask(hw_map);
	debug3("task/affinity: job %u.%u CPU final mask for local node: %s",
		req->job_id, req->job_step_id, str);
	xfree(str);

	FREE_NULL_BITMAP(req_map);
	slurm_cred_free_args(&arg);
	return hw_map;
}
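A condensed, self-contained sketch of the core-to-thread expansion performed above, using a single-word bitmap (so it assumes block_map_size <= 64) instead of bitstr_t:

#include <stdint.h>

/* For each requested core p, mark all of its hardware threads usable in the
 * local CPU mask, wrapping with modulo in case slurmd sees fewer CPUs than
 * the controller allocated (mirrors the wrap-around above). */
static uint64_t expand_cores_to_threads(uint64_t req_core_map,
					unsigned num_cores,
					unsigned hw_threads,
					unsigned block_map_size)
{
	uint64_t hw_map = 0;

	for (unsigned p = 0; p < num_cores; p++) {
		if (!(req_core_map >> p & 1))
			continue;
		unsigned new_p = p % block_map_size;
		for (unsigned t = 0; t < hw_threads; t++)
			hw_map |= 1ULL << ((new_p * hw_threads + t)
					   % block_map_size);
	}
	return hw_map;
}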
Example #7
/* Execute control sequence. */
int
input_csi_dispatch(struct input_ctx *ictx)
{
	struct screen_write_ctx	       *sctx = &ictx->ctx;
	struct screen		       *s = sctx->s;
	struct input_table_entry       *entry;
	int				n, m;

	if (ictx->flags & INPUT_DISCARD)
		return (0);
	if (input_split(ictx) != 0)
		return (0);
	log_debug("%s: '%c' \"%s\" \"%s\"",
	    __func__, ictx->ch, ictx->interm_buf, ictx->param_buf);

	entry = bsearch(ictx, input_csi_table, nitems(input_csi_table),
	    sizeof input_csi_table[0], input_table_compare);
	if (entry == NULL) {
		log_debug("%s: unknown '%c'", __func__, ictx->ch);
		return (0);
	}

	switch (entry->type) {
	case INPUT_CSI_CBT:
		/* Find the previous tab point, n times. */
		n = input_get(ictx, 0, 1, 1);
		while (s->cx > 0 && n-- > 0) {
			do
				s->cx--;
			while (s->cx > 0 && !bit_test(s->tabs, s->cx));
		}
		break;
	case INPUT_CSI_CUB:
		screen_write_cursorleft(sctx, input_get(ictx, 0, 1, 1));
		break;
	case INPUT_CSI_CUD:
		screen_write_cursordown(sctx, input_get(ictx, 0, 1, 1));
		break;
	case INPUT_CSI_CUF:
		screen_write_cursorright(sctx, input_get(ictx, 0, 1, 1));
		break;
	case INPUT_CSI_CUP:
		n = input_get(ictx, 0, 1, 1);
		m = input_get(ictx, 1, 1, 1);
		screen_write_cursormove(sctx, m - 1, n - 1);
		break;
	case INPUT_CSI_WINOPS:
		input_csi_dispatch_winops(ictx);
		break;
	case INPUT_CSI_CUU:
		screen_write_cursorup(sctx, input_get(ictx, 0, 1, 1));
		break;
	case INPUT_CSI_CNL:
		screen_write_carriagereturn(sctx);
		screen_write_cursordown(sctx, input_get(ictx, 0, 1, 1));
		break;
	case INPUT_CSI_CPL:
		screen_write_carriagereturn(sctx);
		screen_write_cursorup(sctx, input_get(ictx, 0, 1, 1));
		break;
	case INPUT_CSI_DA:
		switch (input_get(ictx, 0, 0, 0)) {
		case 0:
			input_reply(ictx, "\033[?1;2c");
			break;
		default:
			log_debug("%s: unknown '%c'", __func__, ictx->ch);
			break;
		}
		break;
	case INPUT_CSI_DA_TWO:
		switch (input_get(ictx, 0, 0, 0)) {
		case 0:
			input_reply(ictx, "\033[>84;0;0c");
			break;
		default:
			log_debug("%s: unknown '%c'", __func__, ictx->ch);
			break;
		}
		break;
	case INPUT_CSI_ECH:
		screen_write_clearcharacter(sctx, input_get(ictx, 0, 1, 1));
		break;
	case INPUT_CSI_DCH:
		screen_write_deletecharacter(sctx, input_get(ictx, 0, 1, 1));
		break;
	case INPUT_CSI_DECSTBM:
		n = input_get(ictx, 0, 1, 1);
		m = input_get(ictx, 1, 1, screen_size_y(s));
		screen_write_scrollregion(sctx, n - 1, m - 1);
		break;
	case INPUT_CSI_DL:
		screen_write_deleteline(sctx, input_get(ictx, 0, 1, 1));
		break;
	case INPUT_CSI_DSR:
		switch (input_get(ictx, 0, 0, 0)) {
		case 5:
			input_reply(ictx, "\033[0n");
			break;
		case 6:
			input_reply(ictx, "\033[%u;%uR", s->cy + 1, s->cx + 1);
			break;
		default:
			log_debug("%s: unknown '%c'", __func__, ictx->ch);
			break;
		}
		break;
	case INPUT_CSI_ED:
		switch (input_get(ictx, 0, 0, 0)) {
		case 0:
			screen_write_clearendofscreen(sctx);
			break;
		case 1:
			screen_write_clearstartofscreen(sctx);
			break;
		case 2:
			screen_write_clearscreen(sctx);
			break;
		case 3:
			switch (input_get(ictx, 1, 0, 0)) {
			case 0:
				/*
				 * Linux console extension to clear history
				 * (for example before locking the screen).
				 */
				screen_write_clearhistory(sctx);
				break;
			}
			break;
		default:
			log_debug("%s: unknown '%c'", __func__, ictx->ch);
			break;
		}
		break;
	case INPUT_CSI_EL:
		switch (input_get(ictx, 0, 0, 0)) {
		case 0:
			screen_write_clearendofline(sctx);
			break;
		case 1:
			screen_write_clearstartofline(sctx);
			break;
		case 2:
			screen_write_clearline(sctx);
			break;
		default:
			log_debug("%s: unknown '%c'", __func__, ictx->ch);
			break;
		}
		break;
	case INPUT_CSI_HPA:
		n = input_get(ictx, 0, 1, 1);
		screen_write_cursormove(sctx, n - 1, s->cy);
		break;
	case INPUT_CSI_ICH:
		screen_write_insertcharacter(sctx, input_get(ictx, 0, 1, 1));
		break;
	case INPUT_CSI_IL:
		screen_write_insertline(sctx, input_get(ictx, 0, 1, 1));
		break;
	case INPUT_CSI_RCP:
		memcpy(&ictx->cell, &ictx->old_cell, sizeof ictx->cell);
		screen_write_cursormove(sctx, ictx->old_cx, ictx->old_cy);
		break;
	case INPUT_CSI_RM:
		input_csi_dispatch_rm(ictx);
		break;
	case INPUT_CSI_RM_PRIVATE:
		input_csi_dispatch_rm_private(ictx);
		break;
	case INPUT_CSI_SCP:
		memcpy(&ictx->old_cell, &ictx->cell, sizeof ictx->old_cell);
		ictx->old_cx = s->cx;
		ictx->old_cy = s->cy;
		break;
	case INPUT_CSI_SGR:
		input_csi_dispatch_sgr(ictx);
		break;
	case INPUT_CSI_SM:
		input_csi_dispatch_sm(ictx);
		break;
	case INPUT_CSI_SM_PRIVATE:
		input_csi_dispatch_sm_private(ictx);
		break;
	case INPUT_CSI_TBC:
		switch (input_get(ictx, 0, 0, 0)) {
		case 0:
			if (s->cx < screen_size_x(s))
				bit_clear(s->tabs, s->cx);
			break;
		case 3:
			bit_nclear(s->tabs, 0, screen_size_x(s) - 1);
			break;
		default:
			log_debug("%s: unknown '%c'", __func__, ictx->ch);
			break;
		}
		break;
	case INPUT_CSI_VPA:
		n = input_get(ictx, 0, 1, 1);
		screen_write_cursormove(sctx, s->cx, n - 1);
		break;
	case INPUT_CSI_DECSCUSR:
		n = input_get(ictx, 0, 0, 0);
		screen_set_cursor_style(s, n);
		break;
	}

	return (0);
}
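The dispatch above hinges on bsearch over a table kept sorted by the CSI final character. A minimal standalone sketch of that idiom; the struct layout is illustrative, not tmux's actual input_table_entry:

#include <stdlib.h>

struct csi_entry {
	char ch;	/* final character of the sequence */
	int  type;	/* INPUT_CSI_* style dispatch code */
};

static int csi_compare(const void *key, const void *elem)
{
	return *(const char *)key - ((const struct csi_entry *)elem)->ch;
}

/* Return the dispatch code for ch, or -1 if the sequence is unknown. */
static int csi_lookup(const struct csi_entry *table, size_t n, char ch)
{
	const struct csi_entry *e =
		bsearch(&ch, table, n, sizeof(*table), csi_compare);
	return e ? e->type : -1;
}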
/* Sync up the core_bitmap with the CPU array using cyclic distribution
 *
 * The CPU array contains the distribution of CPUs, which can include
 * virtual CPUs (hyperthreads)
 */
static int _cyclic_sync_core_bitmap(struct job_record *job_ptr,
				     const uint16_t cr_type, bool preempt_mode)
{
	uint32_t c, i, j, s, n, *sock_start, *sock_end, size, csize, core_cnt;
	uint16_t cps = 0, cpus, vpus, sockets, sock_size;
	job_resources_t *job_res = job_ptr->job_resrcs;
	bitstr_t *core_map;
	bool *sock_used, *sock_avoid;
	bool alloc_cores = false, alloc_sockets = false;
	uint16_t ntasks_per_socket = 0xffff;
	uint16_t ntasks_per_core = 0xffff;
	int error_code = SLURM_SUCCESS;
	int tmp_cpt = 0; /* cpus_per_task */

	if ((job_res == NULL) || (job_res->core_bitmap == NULL) ||
	    (job_ptr->details == NULL))
		return error_code;

	if (cr_type & CR_SOCKET)
		alloc_sockets = true;
	else if (cr_type & CR_CORE)
		alloc_cores = true;

	core_map = job_res->core_bitmap;
	if (job_ptr->details->mc_ptr) {
		multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr;
		if ((mc_ptr->ntasks_per_core != (uint16_t) INFINITE) &&
		    (mc_ptr->ntasks_per_core)) {
			ntasks_per_core = mc_ptr->ntasks_per_core;
		}

		if (mc_ptr->ntasks_per_socket)
			ntasks_per_socket = mc_ptr->ntasks_per_socket;
	}

	sock_size  = select_node_record[0].sockets;
	sock_avoid = xmalloc(sock_size * sizeof(bool));
	sock_start = xmalloc(sock_size * sizeof(uint32_t));
	sock_end   = xmalloc(sock_size * sizeof(uint32_t));
	sock_used  = xmalloc(sock_size * sizeof(bool));

	size  = bit_size(job_res->node_bitmap);
	csize = bit_size(core_map);
	for (c = 0, i = 0, n = 0; n < size; n++) {
		if (bit_test(job_res->node_bitmap, n) == 0)
			continue;
		sockets = select_node_record[n].sockets;
		cps     = select_node_record[n].cores;
		vpus    = cr_cpus_per_core(job_ptr->details, n);

		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("DEBUG: job %u node %s vpus %u cpus %u",
			     job_ptr->job_id,
			     select_node_record[n].node_ptr->name,
			     vpus, job_res->cpus[i]);
		}

		if ((c + (sockets * cps)) > csize)
			fatal("cons_res: _cyclic_sync_core_bitmap index error");

		if (sockets > sock_size) {
			sock_size = sockets;
			xrealloc(sock_avoid, sock_size * sizeof(bool));
			xrealloc(sock_start, sock_size * sizeof(uint32_t));
			xrealloc(sock_end,   sock_size * sizeof(uint32_t));
			xrealloc(sock_used,  sock_size * sizeof(bool));
		}

		for (s = 0; s < sockets; s++) {
			sock_start[s] = c + (s * cps);
			sock_end[s]   = sock_start[s] + cps;
			sock_avoid[s] = false;
			sock_used[s]  = false;
		}
		core_cnt = 0;
		cpus  = job_res->cpus[i];

		if (ntasks_per_socket != 0xffff) {
			int x_cpus, cpus_per_socket;
			uint32_t total_cpus = 0;
			uint32_t *cpus_cnt;

			cpus_per_socket = ntasks_per_socket *
					  job_ptr->details->cpus_per_task;
			cpus_cnt = xmalloc(sizeof(uint32_t) * sockets);
			for (s = 0; s < sockets; s++) {
				for (j = sock_start[s]; j < sock_end[s]; j++) {
					if (bit_test(core_map, j))
						cpus_cnt[s] += vpus;
				}
				total_cpus += cpus_cnt[s];
			}
			for (s = 0; s < sockets && total_cpus > cpus; s++) {
				if (cpus_cnt[s] > cpus_per_socket) {
					x_cpus = cpus_cnt[s] - cpus_per_socket;
					cpus_cnt[s] = cpus_per_socket;
					total_cpus -= x_cpus;
				}
			}
			for (s = 0; s < sockets && total_cpus > cpus; s++) {
				if ((cpus_cnt[s] <= cpus_per_socket) &&
				    (total_cpus - cpus_cnt[s] >= cpus)) {
					sock_avoid[s] = true;
					total_cpus -= cpus_cnt[s];
				}
			}
			xfree(cpus_cnt);
		} else if (job_ptr->details->cpus_per_task > 1) {
			/* CLANG false positive */
			/* Try to pack all CPUs of each task onto one socket. */
			uint32_t *cpus_cnt, cpus_per_task;

			cpus_per_task = job_ptr->details->cpus_per_task;
			cpus_cnt = xmalloc(sizeof(uint32_t) * sockets);
			for (s = 0; s < sockets; s++) {
				for (j = sock_start[s]; j < sock_end[s]; j++) {
					if (bit_test(core_map, j))
						cpus_cnt[s] += vpus;
				}
				cpus_cnt[s] -= (cpus_cnt[s] % cpus_per_task);
			}
			tmp_cpt = cpus_per_task;
			for (s = 0; ((s < sockets) && (cpus > 0)); s++) {
				while ((sock_start[s] < sock_end[s]) &&
				       (cpus_cnt[s] > 0) && (cpus > 0)) {
					if (bit_test(core_map, sock_start[s])) {
						int used;
						sock_used[s] = true;
						core_cnt++;

						if ((ntasks_per_core == 1) &&
						    (cpus_per_task > vpus)) {
							used = MIN(tmp_cpt,
								   vpus);
							if (tmp_cpt <= used)
								tmp_cpt = cpus_per_task;
							else
								tmp_cpt -= used;
						} else
							used = vpus;

						if (cpus_cnt[s] < vpus)
							cpus_cnt[s] = 0;
						else
							cpus_cnt[s] -= used;
						if (cpus < vpus)
							cpus = 0;
						else
							cpus -= used;
					}
					sock_start[s]++;
				}
			}
			xfree(cpus_cnt);
		}

		while (cpus > 0) {
			uint16_t prev_cpus = cpus;
			for (s = 0; s < sockets && cpus > 0; s++) {
				if (sock_avoid[s])
					continue;
				while (sock_start[s] < sock_end[s]) {
					if (bit_test(core_map, sock_start[s])) {
						sock_used[s] = true;
						core_cnt++;
						break;
					} else
						sock_start[s]++;
				}
				if (sock_start[s] == sock_end[s])
					/* this socket is unusable */
					continue;
				if (cpus < vpus)
					cpus = 0;
				else
					cpus -= vpus;
				sock_start[s]++;
			}
			if (prev_cpus != cpus)
				 continue;
			if (!preempt_mode) {
				/* we're stuck! */
				job_ptr->priority = 0;
				job_ptr->state_reason = WAIT_HELD;
				error("%s: sync loop not progressing on node %s, holding job %u",
				      __func__,
				      select_node_record[n].node_ptr->name,
				      job_ptr->job_id);
			}
			error_code = SLURM_ERROR;
			goto fini;
		}

		/* clear the rest of the cores in each socket
		 * FIXME: do we need min_core/min_socket checks here? */
		for (s = 0; s < sockets; s++) {
			if (sock_start[s] == sock_end[s])
				continue;
			if (!alloc_sockets || !sock_used[s]) {
				bit_nclear(core_map, sock_start[s],
					   sock_end[s]-1);
			}
			if ((select_node_record[n].vpus >= 1) &&
			    (alloc_sockets || alloc_cores) && sock_used[s]) {
				for (j=sock_start[s]; j<sock_end[s]; j++) {
					/* Mark all cores as used */
					if (alloc_sockets)
						bit_set(core_map, j);
					if (bit_test(core_map, j))
						core_cnt++;
				}
			}
		}
		if ((alloc_cores || alloc_sockets) &&
		    (select_node_record[n].vpus >= 1)) {
			job_res->cpus[i] = core_cnt *
					   select_node_record[n].vpus;
		}
		i++;
		/* advance 'c' to the beginning of the next node */
		c += sockets * cps;
	}
fini:	xfree(sock_avoid);
	xfree(sock_start);
	xfree(sock_end);
	xfree(sock_used);
	return error_code;
}
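A toy version of the round-robin pass in the while (cpus > 0) loop above: claim one free core per socket in cyclic order until the request is covered, and report failure when no socket can make progress (the "stuck" case handled above by holding the job):

#include <stddef.h>

/* cores_free[s]: free cores on socket s; cores_used[] receives the picks.
 * Assumes one usable CPU per core for simplicity. */
static int distribute_cyclic(unsigned *cores_free, unsigned *cores_used,
			     size_t sockets, unsigned cpus_needed)
{
	for (size_t s = 0; s < sockets; s++)
		cores_used[s] = 0;

	while (cpus_needed > 0) {
		int progressed = 0;
		for (size_t s = 0; s < sockets && cpus_needed > 0; s++) {
			if (cores_free[s] == 0)
				continue;	/* socket exhausted */
			cores_free[s]--;
			cores_used[s]++;
			cpus_needed--;
			progressed = 1;
		}
		if (!progressed)
			return -1;	/* stuck: not enough free cores */
	}
	return 0;
}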
/* To effectively deal with heterogeneous nodes, we fake a cyclic
 * distribution to figure out how many cpus are needed on each node.
 *
 * This routine is a slightly modified "version" of the routine
 * _task_layout_block in src/common/dist_tasks.c. We do not need to
 * assign tasks to job->hostid[] and job->tids[][] at this point so
 * the cpu allocation is the same for cyclic and block.
 *
 * For the consumable resources support we need to determine what
 * "node/CPU/Core/thread"-tuplets will be allocated for a given job.
 * In the past we assumed that we only allocated one task per CPU (at
 * that point the lowest level of logical processor) and didn't allow
 * the use of overcommit. We have changed this philosophy and are now
 * allowing people to overcommit their resources and expect the system
 * administrator to enable the task/affinity plug-in which will then
 * bind all of a job's tasks to its allocated resources thereby
 * avoiding interference between co-allocated running jobs.
 *
 * In the consumable resources environment we need to determine the
 * layout schema within slurmctld.
 *
 * We have a core_bitmap of all available cores. All we're doing here
 * is removing cores that are not needed based on the task count, and
 * the choice of cores to remove is based on the distribution:
 * - "cyclic" removes cores "evenly", starting from the last socket,
 * - "block" removes cores from the "last" socket(s)
 * - "plane" removes cores "in chunks"
 *
 * IN job_ptr - job to be allocated resources
 * IN cr_type - allocation type (sockets, cores, etc.)
 * IN preempt_mode - true if testing with simulated preempted jobs
 */
extern int cr_dist(struct job_record *job_ptr, const uint16_t cr_type,
		   bool preempt_mode)
{
	int error_code, cr_cpu = 1;

	if (job_ptr->details->core_spec != (uint16_t) NO_VAL) {
		/* The job has been allocated all non-specialized cores,
		 * so we don't need to select specific CPUs. */
		return SLURM_SUCCESS;
	}

	if ((job_ptr->job_resrcs->node_req == NODE_CR_RESERVED) ||
	    (job_ptr->details->whole_node == 1)) {
		int n, i;
		job_resources_t *job_res = job_ptr->job_resrcs;
		/* The job has been allocated an EXCLUSIVE set of nodes,
		 * so it gets all of the bits in the core_bitmap and
		 * all of the available CPUs in the cpus array. */
		int size = bit_size(job_res->core_bitmap);
		bit_nset(job_res->core_bitmap, 0, size-1);

		/* Up to this point job_res may not hold the right CPU count;
		 * it is most likely a core count.  Fix that here so tasks
		 * can be laid out correctly.
		 */
		size = bit_size(job_res->node_bitmap);
		for (i = 0, n = bit_ffs(job_res->node_bitmap); n < size; n++) {
			if (bit_test(job_res->node_bitmap, n) == 0)
				continue;
			job_res->cpus[i++] = select_node_record[n].cpus;
		}
		return SLURM_SUCCESS;
	}

	_log_select_maps("cr_dist/start", job_ptr->job_resrcs->node_bitmap,
			 job_ptr->job_resrcs->core_bitmap);
	if ((job_ptr->details->task_dist & SLURM_DIST_STATE_BASE) ==
	    SLURM_DIST_PLANE) {
		/* perform a plane distribution on the 'cpus' array */
		error_code = _compute_plane_dist(job_ptr);
		if (error_code != SLURM_SUCCESS) {
			error("cons_res: cr_dist: Error in "
			      "_compute_plane_dist");
			return error_code;
		}
	} else {
		/* perform a cyclic distribution on the 'cpus' array */
		error_code = _compute_c_b_task_dist(job_ptr);
		if (error_code != SLURM_SUCCESS) {
			error("cons_res: cr_dist: Error in "
			      "_compute_c_b_task_dist");
			return error_code;
		}
	}

	/* now sync up the core_bitmap with the allocated 'cpus' array
	 * based on the given distribution AND resource setting */
	if ((cr_type & CR_CORE) || (cr_type & CR_SOCKET))
		cr_cpu = 0;

	if (cr_cpu) {
		_block_sync_core_bitmap(job_ptr, cr_type);
		return SLURM_SUCCESS;
	}

	/*
	 * If SelectTypeParameters mentions to use a block distribution for
	 * cores by default, use that kind of distribution if no particular
	 * cores distribution specified.
	 * Note : cyclic cores distribution, which is the default, is treated
	 * by the next code block
	 */
	if ( slurmctld_conf.select_type_param & CR_CORE_DEFAULT_DIST_BLOCK ) {
		switch(job_ptr->details->task_dist & SLURM_DIST_NODEMASK) {
		case SLURM_DIST_ARBITRARY:
		case SLURM_DIST_BLOCK:
		case SLURM_DIST_CYCLIC:
		case SLURM_DIST_UNKNOWN:
			_block_sync_core_bitmap(job_ptr, cr_type);
			return SLURM_SUCCESS;
		}
	}

	/* Determine the number of logical processors per node needed
	 * for this job. Make sure below matches the layouts in
	 * lllp_distribution in plugins/task/affinity/dist_task.c (FIXME) */
	switch(job_ptr->details->task_dist & SLURM_DIST_NODESOCKMASK) {
	case SLURM_DIST_BLOCK_BLOCK:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_PLANE:
		_block_sync_core_bitmap(job_ptr, cr_type);
		break;
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_BLOCK_CFULL:
	case SLURM_DIST_CYCLIC_CFULL:
	case SLURM_DIST_UNKNOWN:
		error_code = _cyclic_sync_core_bitmap(job_ptr, cr_type,
						      preempt_mode);
		break;
	default:
		error("select/cons_res: invalid task_dist entry");
		return SLURM_ERROR;
	}

	_log_select_maps("cr_dist/fini", job_ptr->job_resrcs->node_bitmap,
			 job_ptr->job_resrcs->core_bitmap);
	return error_code;
}
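
As a rough illustration of the cyclic/block core selection described in the
cr_dist comment above, the following stand-alone toy program (not Slurm code;
plain int arrays stand in for the bitstr_t core bitmap, and the 2x4 node
geometry is invented) shows which cores each policy would keep for a job that
needs 5 of the 8 cores on one node:

#include <stdio.h>

#define SOCKETS 2
#define CORES   4

/* block: fill socket 0 completely before touching socket 1 */
static void pick_block(int keep[SOCKETS][CORES], int ncores)
{
	int s, c;
	for (s = 0; s < SOCKETS; s++)
		for (c = 0; c < CORES; c++)
			keep[s][c] = (ncores-- > 0);
}

/* cyclic: hand out one core per socket in turn */
static void pick_cyclic(int keep[SOCKETS][CORES], int ncores)
{
	int s, c;
	for (c = 0; c < CORES; c++)
		for (s = 0; s < SOCKETS; s++)
			keep[s][c] = (ncores-- > 0);
}

static void show(const char *label, int keep[SOCKETS][CORES])
{
	int s, c;
	printf("%s:\n", label);
	for (s = 0; s < SOCKETS; s++) {
		printf("  socket %d:", s);
		for (c = 0; c < CORES; c++)
			printf(" %c", keep[s][c] ? 'X' : '.');
		printf("\n");
	}
}

int main(void)
{
	int keep[SOCKETS][CORES];

	pick_block(keep, 5);	/* job needs 5 of the 8 cores */
	show("block", keep);
	pick_cyclic(keep, 5);
	show("cyclic", keep);
	return 0;
}
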
Example #10
static int retrieve_segment(struct segment * seg)
{
    char proc[256];
    snprintf(proc, 256, "rsg%u", g_nodeId);
    prctl(PR_SET_NAME, proc, 0, 0, 0);
    int rv = -1;
    log_print(g_log, "retrieve_segment: trying to retrieve segment %s[%d - %d].",
              seg->name->full_name, 0, seg->num_chunks-1);

    pthread_mutex_lock(&g_lock);
    int retries = g_interest_attempts;
    int timeout_ms = g_timeout_ms;
    pthread_mutex_unlock(&g_lock);
    int ttl = MAX_TTL;

    if ((seg->opts->mode & CCNFDNB_USE_RETRIES) == CCNFDNB_USE_RETRIES) {
        retries = seg->opts->retries;
    }
    if ((seg->opts->mode & CCNFDNB_USE_TIMEOUT) == CCNFDNB_USE_TIMEOUT) {
        timeout_ms = seg->opts->timeout_ms;
    }
    if ((seg->opts->mode & CCNFDNB_USE_TTL) == CCNFDNB_USE_TTL) {
        ttl = seg->opts->ttl;
    }

    struct chunk chunk_window[MAX_INTEREST_PIPELINE];
    PENTRY _pit_handles[MAX_INTEREST_PIPELINE];
    int pit_to_chunk[PIT_SIZE];
    memset(&pit_to_chunk, 0, sizeof(pit_to_chunk));
    struct bitmap * window = bit_create(MAX_INTEREST_PIPELINE);
    struct bitmap * missing = bit_create(seg->num_chunks);

    char str[MAX_NAME_LENGTH], comp[MAX_NAME_LENGTH];
    strncpy(str, seg->name->full_name, seg->name->len);

    int rtt_est = timeout_ms;
    int cwnd = 1;
    int ssthresh = DEFAULT_INTEREST_PIPELINE;
    int fullfilled = 0;
    int min_rtt_est = 10;

    int current_chunk = 0;
    cc_state state = SLOW_START;
    int tx;

    _segment_q_t seg_q;
    pthread_mutex_init(&seg_q.mutex, NULL);
    pthread_cond_init(&seg_q.cond, NULL);
    seg_q.rcv_window = 0;
    seg_q.max_window = &cwnd;
    seg_q.rcv_chunks = linked_list_init(NULL);
    seg_q.base = seg->name;
    ccnfdnl_reg_segment(&seg_q);

    int i;
    window->num_bits = cwnd;
    while (!bit_allSet(missing)) {
        tx = cwnd;
        window->num_bits = cwnd;

        log_debug(g_log, "state = %d, cwnd = %d, ssthresh = %d rtt_est = %d",
                  state, cwnd, ssthresh, rtt_est);

        while (tx && (current_chunk < seg->num_chunks)) {
            snprintf(comp, MAX_NAME_LENGTH - seg->name->len, "/%d", current_chunk);
            strncpy(str + seg->name->len, comp,
                    MAX_NAME_LENGTH - seg->name->len);
            i = bit_find(window);
            if (i < 0 || i >= MAX_INTEREST_PIPELINE) {
                /* we must still be waiting for data */
                break;
            }
            chunk_window[i].intr.name = content_name_create(str);
            chunk_window[i].intr.ttl = ttl;
            chunk_window[i].seq_no = current_chunk;
            chunk_window[i].retries = retries;

            _pit_handles[i] = PIT_get_handle(chunk_window[i].intr.name);
            if (!_pit_handles[i]) {
                bit_clear(window, i);
                break;
            }
            pit_to_chunk[_pit_handles[i]->index] = i;
            pthread_mutex_unlock(_pit_handles[i]->mutex);

            struct content_obj * co = CS_get(chunk_window[i].intr.name);
            if (!co) {
                log_debug(g_log, "expressing new interest: %s", chunk_window[i].intr.name->full_name);
                ccnfdnb_fwd_interest(&chunk_window[i].intr);
                tx--;
            } else {
                log_debug(g_log, "retrieved %s from CS", co->name->full_name);

                PENTRY pe = PIT_exact_match(chunk_window[i].intr.name);
                *pe->obj = co;
                pthread_mutex_unlock(pe->mutex);

                pthread_mutex_lock(&seg_q.mutex);

                linked_list_append(seg_q.rcv_chunks, pe);
                seg_q.rcv_window++;

                pthread_mutex_unlock(&seg_q.mutex);
            }

            current_chunk++;
        }
        log_debug(g_log, "tx window full");

        pthread_mutex_lock(&seg_q.mutex);

        if (seg_q.rcv_chunks->len == 0) {
            struct timespec wait;
            ts_fromnow(&wait);
            ts_addms(&wait, 2 * rtt_est);

            rv = pthread_cond_timedwait(&seg_q.cond, &seg_q.mutex, &wait);
            if ((rv == ETIMEDOUT) && !seg_q.rcv_chunks->len) {
                /* we timed out, we need to rtx */
                rtt_est += rtt_est / 2;
                if (rtt_est > PIT_LIFETIME_MS)
                    rtt_est = PIT_LIFETIME_MS / 2;
            }
        } else {
            int pit_ages = 0;
            int pits_fulfilled = 0;
            while (seg_q.rcv_chunks->len > 0) {
                PENTRY pe = linked_list_remove(seg_q.rcv_chunks, 0);
                log_assert(g_log, pe != NULL, "invalid pit entry");

                pthread_mutex_lock(pe->mutex);
                log_debug(g_log, "pit entry %s fulfilled", (*pe->obj)->name->full_name);

                int chunk_id = pit_to_chunk[pe->index];
                log_assert(g_log, chunk_id >= 0, "invalid chunk id");

                if (chunk_window[chunk_id].seq_no == 0) {
                    seg->obj->publisher = (*pe->obj)->publisher;
                    seg->obj->timestamp = (*pe->obj)->timestamp;
                    seg->chunk_size = (*pe->obj)->size;
                }
                int offset = chunk_window[chunk_id].seq_no * seg->chunk_size;
                memcpy(&seg->obj->data[offset], (*pe->obj)->data, (*pe->obj)->size);
                content_obj_destroy(*pe->obj);

                struct timespec now;
                ts_fromnow(&now);
                ts_addms(&now, PIT_LIFETIME_MS);
                pit_ages += PIT_age(pe);
                pits_fulfilled++;

                pit_to_chunk[pe->index] = -1;
                PIT_release(pe);
                free(_pit_handles[chunk_id]);
                _pit_handles[chunk_id] = NULL;
                bit_clear(window, chunk_id);
                bit_set(missing, chunk_window[chunk_id].seq_no);
                log_debug(g_log, "retrieved chunk %s", chunk_window[chunk_id].intr.name->full_name);
                content_name_delete(chunk_window[chunk_id].intr.name);
                chunk_window[chunk_id].intr.name = NULL;
                cwnd++;
                if (state == CONG_AVOID)
                    fullfilled++;
            }

            rtt_est -= floor(pit_ages / pits_fulfilled);
            if (rtt_est < min_rtt_est)
                rtt_est = min_rtt_est;
        }

        pthread_mutex_unlock(&seg_q.mutex);

        for (i = 0; i < MAX_INTEREST_PIPELINE; i++) {
            if (bit_test(window, i)) {
                if (!_pit_handles[i]) {
                    continue;
                }
                pthread_mutex_lock(_pit_handles[i]->mutex);
                if (PIT_age(_pit_handles[i]) > (2 * rtt_est)) {
                    PIT_refresh(_pit_handles[i]);
                    chunk_window[i].retries--;
                    ccnfdnb_fwd_interest(&chunk_window[i].intr);
                    log_debug(g_log, "rtx interest: %s (rtt = %d)",
                              chunk_window[i].intr.name->full_name, rtt_est);
                    ssthresh = cwnd / 2 + 1;
                    cwnd = 1;
                    state = SLOW_START;
                }
                pthread_mutex_unlock(_pit_handles[i]->mutex);
            }
        }

        if ((cwnd >= ssthresh) && (state == SLOW_START))
            state = CONG_AVOID;
        if (state == SLOW_START)
            fullfilled = 0;
        if ((fullfilled == cwnd) && (state == CONG_AVOID)) {
            cwnd++;
            fullfilled = 0;
        }
        if (cwnd > MAX_INTEREST_PIPELINE)
            cwnd = MAX_INTEREST_PIPELINE;

        /*log_debug(g_log, "cwnd = %d, ssthresh = %d", cwnd, ssthresh);*/
    }

    log_debug(g_log, "retrieve_segment: finished for %s[%d-%d]",
              seg->name->full_name, 0, seg->num_chunks-1);

    rv = 0;

    PIT_print();
    ccnfdnl_unreg_segment(&seg_q);
    pthread_mutex_destroy(&seg_q.mutex);
    pthread_cond_destroy(&seg_q.cond);
    while (seg_q.rcv_chunks->len) {
        PENTRY pe = linked_list_remove(seg_q.rcv_chunks, 0);
        content_obj_destroy(*pe->obj);
        PIT_release(pe);
    }

    bit_destroy(window);
    bit_destroy(missing);

    return rv;
}
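
retrieve_segment() above drives its interest pipeline with TCP-style
cwnd/ssthresh bookkeeping. The sketch below is not part of the CCNF code and
is simplified (the original also grows cwnd once per fulfilled chunk); it only
isolates the slow-start, congestion-avoidance and timeout transitions so they
can be read in one place:

#include <stdio.h>

enum cc_state { SLOW_START, CONG_AVOID };

struct cc {
	int cwnd, ssthresh, fulfilled;
	enum cc_state state;
};

static void on_fulfilled(struct cc *cc)
{
	if (cc->state == SLOW_START) {
		cc->cwnd++;			/* grow fast until ssthresh */
		if (cc->cwnd >= cc->ssthresh)
			cc->state = CONG_AVOID;
	} else if (++cc->fulfilled == cc->cwnd) {
		cc->cwnd++;			/* +1 per full window */
		cc->fulfilled = 0;
	}
}

static void on_timeout(struct cc *cc)
{
	cc->ssthresh = cc->cwnd / 2 + 1;	/* back off */
	cc->cwnd = 1;
	cc->fulfilled = 0;
	cc->state = SLOW_START;
}

int main(void)
{
	struct cc cc = { 1, 8, 0, SLOW_START };
	int i;

	for (i = 0; i < 12; i++)
		on_fulfilled(&cc);
	printf("after 12 fulfilled interests: cwnd=%d (%s)\n", cc.cwnd,
	       cc.state == SLOW_START ? "slow start" : "congestion avoidance");
	on_timeout(&cc);
	printf("after a retransmission:       cwnd=%d ssthresh=%d\n",
	       cc.cwnd, cc.ssthresh);
	return 0;
}
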
/* sync up core bitmap with new CPU count using a best-fit approach
 * on the available resources on each node
 *
 * "Best-fit" means:
 * 1st priority: Use smallest number of boards with sufficient
 *               available CPUs
 * 2nd priority: Use smallest number of sockets with sufficient
 *               available CPUs
 * 3rd priority: Use board combination with the smallest number
 *               of available CPUs
 * 4th priority: Use higher-numbered boards/sockets/cores first
 *
 * The CPU array contains the distribution of CPUs, which can include
 * virtual CPUs (hyperthreads)
 */
static void _block_sync_core_bitmap(struct job_record *job_ptr,
				    const uint16_t cr_type)
{
	uint32_t c, s, i, j, n, b, z, size, csize, core_cnt;
	uint16_t cpus, num_bits, vpus = 1;
	uint16_t cpus_per_task = job_ptr->details->cpus_per_task;
	job_resources_t *job_res = job_ptr->job_resrcs;
	bool alloc_cores = false, alloc_sockets = false;
	uint16_t ntasks_per_core = 0xffff;
	int tmp_cpt = 0;
	int count, cpu_min, b_min, elig, s_min, comb_idx, sock_idx;
	int elig_idx, comb_brd_idx, sock_list_idx, comb_min, board_num;
	int* boards_core_cnt;
	int* sockets_core_cnt;
	int* sort_brds_core_cnt;
	int* board_combs;
	int* socket_list;
	int* elig_brd_combs;
	int* elig_core_cnt;
	bool* sockets_used;
	uint16_t boards_nb;
	uint16_t nboards_nb;
	uint16_t sockets_nb;
	uint16_t ncores_nb;
	uint16_t nsockets_nb;
	uint16_t sock_per_brd;
	uint16_t sock_per_comb;
	uint16_t req_cores,best_fit_cores = 0;
	uint32_t best_fit_location = 0;
	uint64_t ncomb_brd;
	bool sufficient, best_fit_sufficient;

	if (!job_res)
		return;
	if (!job_res->core_bitmap) {
		error("%s: core_bitmap for job %u is NULL",
		      __func__, job_ptr->job_id);
		return;
	}
	if (bit_ffs(job_res->core_bitmap) == -1) {
		error("%s: core_bitmap for job %u has no bits set",
		      __func__, job_ptr->job_id);
		return;
	}

	if (cr_type & CR_SOCKET)
		alloc_sockets = true;
	else if (cr_type & CR_CORE)
		alloc_cores = true;

	if (job_ptr->details && job_ptr->details->mc_ptr) {
		multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr;
		if ((mc_ptr->ntasks_per_core != (uint16_t) INFINITE) &&
		    (mc_ptr->ntasks_per_core)) {
			ntasks_per_core = mc_ptr->ntasks_per_core;
		}
	}

	size  = bit_size(job_res->node_bitmap);
	csize = bit_size(job_res->core_bitmap);

	sockets_nb  = select_node_record[0].sockets;
	sockets_core_cnt = xmalloc(sockets_nb * sizeof(int));
	sockets_used = xmalloc(sockets_nb * sizeof(bool));
	boards_nb = select_node_record[0].boards;
	boards_core_cnt = xmalloc(boards_nb * sizeof(int));
	sort_brds_core_cnt = xmalloc(boards_nb * sizeof(int));

	for (c = 0, i = 0, n = 0; n < size; n++) {

		if (bit_test(job_res->node_bitmap, n) == 0)
			continue;

		core_cnt = 0;
		ncores_nb = select_node_record[n].cores;
		nsockets_nb = select_node_record[n].sockets;
		nboards_nb = select_node_record[n].boards;
		num_bits =  nsockets_nb * ncores_nb;

		if ((c + num_bits) > csize)
			fatal("cons_res: _block_sync_core_bitmap index error");

		cpus  = job_res->cpus[i];
		vpus = cr_cpus_per_core(job_ptr->details, n);

		/* compute still required cores on the node */
		req_cores = cpus / vpus;
		if ( cpus % vpus )
			req_cores++;

		/* figure out core cnt if task requires more than one core and
		 * tasks_per_core is 1 */
		if ((ntasks_per_core == 1) &&
		    (cpus_per_task > vpus)) {
			/* how many cores a task will consume */
			int cores_per_task = (cpus_per_task + vpus - 1) / vpus;
			int tasks = cpus / cpus_per_task;
			req_cores = tasks * cores_per_task;
		}

		if (nboards_nb > MAX_BOARDS) {
			debug3("cons_res: node[%u]: exceeds max boards; "
				"doing best-fit across sockets only", n);
			nboards_nb = 1;
		}

		if ( nsockets_nb > sockets_nb) {
			sockets_nb = nsockets_nb;
			xrealloc(sockets_core_cnt, sockets_nb * sizeof(int));
			xrealloc(sockets_used,sockets_nb * sizeof(bool));
		}

		if ( nboards_nb > boards_nb) {
			boards_nb = nboards_nb;
			xrealloc(boards_core_cnt, boards_nb * sizeof(int));
			xrealloc(sort_brds_core_cnt, boards_nb * sizeof(int));
		}

		/* Count available cores on each socket and board */
		if (nsockets_nb >= nboards_nb) {
			sock_per_brd = nsockets_nb / nboards_nb;
		} else {
			error("Node socket count lower than board count "
			      "(%u < %u), job %u node %s",
			      nsockets_nb, nboards_nb, job_ptr->job_id,
			      node_record_table_ptr[n].name);
			sock_per_brd = 1;
		}
		for (b = 0; b < nboards_nb; b++) {
			boards_core_cnt[b] = 0;
			sort_brds_core_cnt[b] = 0;
		}
		for (s = 0; s < nsockets_nb; s++) {
			sockets_core_cnt[s]=0;
			sockets_used[s]=false;
			b = s/sock_per_brd;
			for ( j = c + (s * ncores_nb) ;
			      j < c + ((s+1) * ncores_nb) ;
			      j++ ) {
				if ( bit_test(job_res->core_bitmap,j) ) {
					sockets_core_cnt[s]++;
					boards_core_cnt[b]++;
					sort_brds_core_cnt[b]++;
				}
			}
		}

		/* Sort boards in descending order of available core count */
		qsort(sort_brds_core_cnt, nboards_nb, sizeof (int),
				_cmp_int_descend);
		/* Determine minimum number of boards required for the
		 * allocation (b_min) */
		count = 0;
		for (b = 0; b < nboards_nb; b++) {
			count+=sort_brds_core_cnt[b];
			if (count >= req_cores)
				break;
		}
		b_min = b+1;
		sock_per_comb = b_min * sock_per_brd;

		/* Allocate space for list of board combinations */
		ncomb_brd = comb_counts[nboards_nb-1][b_min-1];
		board_combs = xmalloc(ncomb_brd * b_min * sizeof(int));
		/* Generate all combinations of b_min boards on the node */
		_gen_combs(board_combs, nboards_nb, b_min);

		/* Determine which combinations have enough available cores
		 * for the allocation (eligible board combinations)
		 */
		elig_brd_combs = xmalloc(ncomb_brd * sizeof(int));
		elig_core_cnt = xmalloc(ncomb_brd * sizeof(int));
		elig = 0;
		for (comb_idx = 0; comb_idx < ncomb_brd; comb_idx++) {
			count = 0;
			for (comb_brd_idx = 0; comb_brd_idx < b_min;
				comb_brd_idx++) {
				board_num = board_combs[(comb_idx * b_min)
							+ comb_brd_idx];
				count += boards_core_cnt[board_num];
			}
			if (count >= req_cores) {
				elig_brd_combs[elig] = comb_idx;
				elig_core_cnt[elig] = count;
				elig++;
			}
		}

		/* Allocate space for list of sockets for each eligible board
		 * combination */
		socket_list = xmalloc(elig * sock_per_comb * sizeof(int));

		/* Generate sorted list of sockets for each eligible board
		 * combination, and find combination with minimum number
		 * of sockets and minimum number of cpus required for the
		 * allocation
		 */
		s_min = sock_per_comb;
		comb_min = 0;
		cpu_min = sock_per_comb * ncores_nb;
		for (elig_idx = 0; elig_idx < elig; elig_idx++) {
			comb_idx = elig_brd_combs[elig_idx];
			for (comb_brd_idx = 0; comb_brd_idx < b_min;
							comb_brd_idx++) {
				board_num = board_combs[(comb_idx * b_min)
							+ comb_brd_idx];
				sock_list_idx = (elig_idx * sock_per_comb) +
					(comb_brd_idx * sock_per_brd);
				for (sock_idx = 0; sock_idx < sock_per_brd;
								sock_idx++) {
					socket_list[sock_list_idx + sock_idx]
						= (board_num * sock_per_brd)
							+ sock_idx;
				}
			}
			/* Sort this socket list in descending order of
			 * available core count */
			qsort(&socket_list[elig_idx*sock_per_comb],
				sock_per_comb, sizeof (int), _cmp_sock);
			/* Determine minimum number of sockets required for
			 * the allocation from this socket list */
			count = 0;
			for (b = 0; b < sock_per_comb; b++) {
				sock_idx =
				socket_list[(int)((elig_idx*sock_per_comb)+b)];
				count+=sockets_core_cnt[sock_idx];
				if (count >= req_cores)
					break;
			}
			b++;
			/* Use board combination with minimum number
			 * of required sockets and minimum number of CPUs
			 */
			if ((b < s_min) ||
				(b == s_min && elig_core_cnt[elig_idx]
							    <= cpu_min)) {
				s_min = b;
				comb_min = elig_idx;
				cpu_min = elig_core_cnt[elig_idx];
			}
		}
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("cons_res: best_fit: node[%u]: "
			     "required cpus: %u, min req boards: %u,",
			     n, cpus, b_min);
			info("cons_res: best_fit: node[%u]: "
			     "min req sockets: %u, min avail cores: %u",
			     n, s_min, cpu_min);
		}
		/* Re-sort socket list for best-fit board combination in
		 * ascending order of socket number */
		qsort(&socket_list[comb_min * sock_per_comb], sock_per_comb,
				sizeof (int), _cmp_int_ascend);

		xfree(board_combs);
		xfree(elig_brd_combs);
		xfree(elig_core_cnt);

		/* select cores from the sockets of the best-fit board
		 * combination using a best-fit approach */
		tmp_cpt = cpus_per_task;
		while ( cpus > 0 ) {

			best_fit_cores = 0;
			best_fit_sufficient = false;

			/* search for the socket with best fit */
			for ( z = 0; z < sock_per_comb; z++ ) {
				s = socket_list[(comb_min*sock_per_comb)+z];
				sufficient = sockets_core_cnt[s] >= req_cores;
				if ( (best_fit_cores == 0) ||
				     (sufficient && !best_fit_sufficient ) ||
				     (sufficient && (sockets_core_cnt[s] <
						     best_fit_cores)) ||
				     (!sufficient && (sockets_core_cnt[s] >
						      best_fit_cores)) ) {
					best_fit_cores = sockets_core_cnt[s];
					best_fit_location = s;
					best_fit_sufficient = sufficient;
				}
			}

			/* check that we have found a usable socket */
			if ( best_fit_cores == 0 )
				break;

			j = best_fit_location;
			if (sock_per_brd)
				j /= sock_per_brd;
			if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
				info("cons_res: best_fit: using node[%u]: "
				     "board[%u]: socket[%u]: %u cores "
				     "available", n, j, best_fit_location,
				     sockets_core_cnt[best_fit_location]);
			}

			sockets_used[best_fit_location] = true;
			for ( j = (c + (best_fit_location * ncores_nb));
			      j < (c + ((best_fit_location + 1) * ncores_nb));
			      j++ ) {
				/*
				 * if no more cpus to select
				 * release remaining cores unless
				 * we are allocating whole sockets
				 */
				if (cpus == 0) {
					if (alloc_sockets) {
						bit_set(job_res->core_bitmap,j);
						core_cnt++;
					} else {
						bit_clear(job_res->core_bitmap,j);
					}
					continue;
				}

				/*
				 * remove cores from socket count and
				 * cpus count using hyperthreading requirement
				 */
				if ( bit_test(job_res->core_bitmap, j) ) {
					sockets_core_cnt[best_fit_location]--;
					core_cnt++;
					if (cpus < vpus)
						cpus = 0;
					else if ((ntasks_per_core == 1) &&
						 (cpus_per_task > vpus)) {
						int used = MIN(tmp_cpt, vpus);
						cpus -= used;

						if (tmp_cpt <= used)
							tmp_cpt = cpus_per_task;
						else
							tmp_cpt -= used;
					} else {
						cpus -= vpus;
					}
				} else if (alloc_sockets) {
					/* If the core is not used, add it
					 * anyway if allocating whole sockets */
					bit_set(job_res->core_bitmap, j);
					core_cnt++;
				}
			}

			/* loop again if more cpus required */
			if ( cpus > 0 )
				continue;

			/* release remaining cores of the unused sockets */
			for (s = 0; s < nsockets_nb; s++) {
				if ( sockets_used[s] )
					continue;
				bit_nclear(job_res->core_bitmap,
					   c+(s*ncores_nb),
					   c+((s+1)*ncores_nb)-1);
			}

		}

		xfree(socket_list);
		if (cpus > 0) {
			/* cpu count should NEVER be greater than the number
			 * of set bits in the core bitmap for a given node */
			fatal("cons_res: cpus computation error");
		}

		/* adjust cpus count of the current node */
		if ((alloc_cores || alloc_sockets) &&
		    (select_node_record[n].vpus >= 1)) {
			job_res->cpus[i] = core_cnt *
				select_node_record[n].vpus;
		}
		i++;

		/* move c to the next node in core_bitmap */
		c += num_bits;

	}

	xfree(boards_core_cnt);
	xfree(sort_brds_core_cnt);
	xfree(sockets_core_cnt);
	xfree(sockets_used);
}
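
The socket-selection test in the while (cpus > 0) loop of
_block_sync_core_bitmap() above packs four conditions into one if statement.
The following self-contained sketch (not Slurm code; the free-core counts are
invented) restates that rule: prefer the smallest socket that still satisfies
the remaining request, otherwise fall back to the socket with the most free
cores:

#include <stdio.h>

static int best_fit_socket(const int *free_cores, int nsockets, int req_cores)
{
	int s, best = 0;
	int best_cores = 0, best_sufficient = 0;

	for (s = 0; s < nsockets; s++) {
		int sufficient = (free_cores[s] >= req_cores);
		if ((best_cores == 0) ||
		    (sufficient && !best_sufficient) ||
		    (sufficient && (free_cores[s] < best_cores)) ||
		    (!sufficient && (free_cores[s] > best_cores))) {
			best = s;
			best_cores = free_cores[s];
			best_sufficient = sufficient;
		}
	}
	if (best_cores == 0)
		return -1;	/* no socket has a free core left */
	return best;
}

int main(void)
{
	int free_cores[] = { 2, 6, 4, 0 };

	/* 3 cores needed: sockets 1 and 2 suffice, socket 2 is smaller */
	printf("req=3 -> socket %d\n", best_fit_socket(free_cores, 4, 3));
	/* 8 cores needed: nothing suffices, take the largest (socket 1) */
	printf("req=8 -> socket %d\n", best_fit_socket(free_cores, 4, 8));
	return 0;
}
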
Example #12
/* Perform any power change work to nodes */
static void _do_power_work(time_t now)
{
	static time_t last_log = 0, last_work_scan = 0;
	int i, wake_cnt = 0, sleep_cnt = 0, susp_total = 0;
	time_t delta_t;
	uint32_t susp_state;
	bitstr_t *wake_node_bitmap = NULL, *sleep_node_bitmap = NULL;
	struct node_record *node_ptr;
	bool run_suspend = false;

	/* Set limits on the number of nodes whose state may change this cycle */
	delta_t = now - last_work_scan;
	if (delta_t >= 60) {
		suspend_cnt_f = 0.0;
		resume_cnt_f  = 0.0;
	} else {
		float rate = (60 - delta_t) / 60.0;
		suspend_cnt_f *= rate;
		resume_cnt_f  *= rate;
	}
	suspend_cnt = (suspend_cnt_f + 0.5);
	resume_cnt  = (resume_cnt_f  + 0.5);

	if (now > (last_suspend + suspend_timeout)) {
		/* ready to start another round of node suspends */
		run_suspend = true;
		if (last_suspend) {
			bit_nclear(suspend_node_bitmap, 0,
				   (node_record_count - 1));
			last_suspend = (time_t) 0;
		}
	}

	last_work_scan = now;

	/* Build bitmaps identifying each node which should change state */
	for (i=0, node_ptr=node_record_table_ptr;
	     i<node_record_count; i++, node_ptr++) {
		susp_state = IS_NODE_POWER_SAVE(node_ptr);

		if (susp_state)
			susp_total++;

		/* Resume nodes as appropriate */
		if (susp_state &&
		    ((resume_rate == 0) || (resume_cnt < resume_rate))	&&
		    (bit_test(suspend_node_bitmap, i) == 0)		&&
		    (IS_NODE_ALLOCATED(node_ptr) ||
		     (node_ptr->last_idle > (now - idle_time)))) {
			if (wake_node_bitmap == NULL) {
				wake_node_bitmap =
					bit_alloc(node_record_count);
			}
			wake_cnt++;
			resume_cnt++;
			resume_cnt_f++;
			node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
			node_ptr->node_state |=   NODE_STATE_POWER_UP;
			node_ptr->node_state |=   NODE_STATE_NO_RESPOND;
			bit_clear(power_node_bitmap, i);
			bit_clear(avail_node_bitmap, i);
			node_ptr->last_response = now + resume_timeout;
			bit_set(wake_node_bitmap, i);
		}

		/* Suspend nodes as appropriate */
		if (run_suspend 					&&
		    (susp_state == 0)					&&
		    ((suspend_rate == 0) || (suspend_cnt < suspend_rate)) &&
		    IS_NODE_IDLE(node_ptr)				&&
		    (node_ptr->sus_job_cnt == 0)			&&
		    (!IS_NODE_COMPLETING(node_ptr))			&&
		    (!IS_NODE_POWER_UP(node_ptr))			&&
		    (node_ptr->last_idle < (now - idle_time))		&&
		    ((exc_node_bitmap == NULL) ||
		     (bit_test(exc_node_bitmap, i) == 0))) {
			if (sleep_node_bitmap == NULL) {
				sleep_node_bitmap =
					bit_alloc(node_record_count);
			}
			sleep_cnt++;
			suspend_cnt++;
			suspend_cnt_f++;
			node_ptr->node_state |= NODE_STATE_POWER_SAVE;
			bit_set(power_node_bitmap, i);
			bit_set(sleep_node_bitmap,   i);
			bit_set(suspend_node_bitmap, i);
			last_suspend = now;
		}
	}
	if (((now - last_log) > 600) && (susp_total > 0)) {
		info("Power save mode: %d nodes", susp_total);
		last_log = now;
	}

	if (sleep_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(sleep_node_bitmap);
		if (nodes)
			_do_suspend(nodes);
		else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		FREE_NULL_BITMAP(sleep_node_bitmap);
		/* last_node_update could be changed already by another thread!
		last_node_update = now; */
	}

	if (wake_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(wake_node_bitmap);
		if (nodes)
			_do_resume(nodes);
		else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		FREE_NULL_BITMAP(wake_node_bitmap);
		/* last_node_update could be changed already by another thread!
		last_node_update = now; */
	}
}
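
The rate limiting at the top of _do_power_work() decays the floating-point
suspend/resume counters linearly over a 60-second window and then rounds with
+0.5, so that SuspendRate/ResumeRate act as per-minute limits. A minimal
sketch of just that arithmetic (not Slurm code; the counter value is made up):

#include <stdio.h>

int main(void)
{
	float suspend_cnt_f = 30.0;	/* suspends issued in the last window */
	int delta_t;

	for (delta_t = 0; delta_t <= 75; delta_t += 15) {
		float f = suspend_cnt_f;
		if (delta_t >= 60)
			f = 0.0;			/* window fully expired */
		else
			f *= (60 - delta_t) / 60.0;	/* linear decay */
		printf("delta_t=%2ds -> counted suspends %d\n",
		       delta_t, (int)(f + 0.5));
	}
	return 0;
}
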
Example #13
/* Sync up the core_bitmap with the CPU array using cyclic distribution
 *
 * The CPU array contains the distribution of CPUs, which can include
 * virtual CPUs (hyperthreads)
 */
static int _cyclic_sync_core_bitmap(struct job_record *job_ptr,
				     const uint16_t cr_type)
{
	uint32_t c, i, j, s, n, *sock_start, *sock_end, size, csize, core_cnt;
	uint16_t cps = 0, cpus, vpus, sockets, sock_size;
	job_resources_t *job_res = job_ptr->job_resrcs;
	bitstr_t *core_map;
	bool *sock_used, *sock_avoid;
	bool alloc_cores = false, alloc_sockets = false;
	uint16_t ntasks_per_core = 0xffff, ntasks_per_socket = 0xffff;
	int error_code = SLURM_SUCCESS;

	if ((job_res == NULL) || (job_res->core_bitmap == NULL))
		return error_code;

	if (cr_type & CR_CORE)
		alloc_cores = true;
	if (slurmctld_conf.select_type_param & CR_ALLOCATE_FULL_SOCKET) {
		if (cr_type & CR_SOCKET)
			alloc_sockets = true;
	} else {
		if (cr_type & CR_SOCKET)
			alloc_cores = true;
	}
	core_map = job_res->core_bitmap;
	if (job_ptr->details && job_ptr->details->mc_ptr) {
		multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr;
		if (mc_ptr->ntasks_per_core) {
			ntasks_per_core = mc_ptr->ntasks_per_core;
		}

		if ((mc_ptr->threads_per_core != (uint16_t) NO_VAL) &&
		    (mc_ptr->threads_per_core <  ntasks_per_core)) {
			ntasks_per_core = mc_ptr->threads_per_core;
		}

		if (mc_ptr->ntasks_per_socket)
			ntasks_per_socket = mc_ptr->ntasks_per_socket;
	}

	sock_size  = select_node_record[0].sockets;
	sock_avoid = xmalloc(sock_size * sizeof(bool));
	sock_start = xmalloc(sock_size * sizeof(uint32_t));
	sock_end   = xmalloc(sock_size * sizeof(uint32_t));
	sock_used  = xmalloc(sock_size * sizeof(bool));

	size  = bit_size(job_res->node_bitmap);
	csize = bit_size(core_map);
	for (c = 0, i = 0, n = 0; n < size; n++) {

		if (bit_test(job_res->node_bitmap, n) == 0)
			continue;
		sockets = select_node_record[n].sockets;
		cps     = select_node_record[n].cores;
		vpus    = MIN(select_node_record[n].vpus, ntasks_per_core);

		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("DEBUG: job %u node %s vpus %u cpus %u",
			     job_ptr->job_id,
			     select_node_record[n].node_ptr->name,
			     vpus, job_res->cpus[i]);
		}

		if ((c + (sockets * cps)) > csize)
			fatal("cons_res: _cyclic_sync_core_bitmap index error");

		if (sockets > sock_size) {
			sock_size = sockets;
			xrealloc(sock_avoid, sock_size * sizeof(bool));
			xrealloc(sock_start, sock_size * sizeof(uint32_t));
			xrealloc(sock_end,   sock_size * sizeof(uint32_t));
			xrealloc(sock_used,  sock_size * sizeof(bool));
		}

		for (s = 0; s < sockets; s++) {
			sock_start[s] = c + (s * cps);
			sock_end[s]   = sock_start[s] + cps;
			sock_avoid[s] = false;
			sock_used[s]  = false;
		}
		core_cnt = 0;
		cpus  = job_res->cpus[i];

		if (ntasks_per_socket != 0xffff) {
			int x_cpus;
			uint32_t total_cpus = 0;
			uint32_t *cpus_cnt = xmalloc(sizeof(uint32_t)* sockets);
			for (s = 0; s < sockets; s++) {
				for (j = sock_start[s]; j < sock_end[s]; j++) {
					if (bit_test(core_map, j))
						cpus_cnt[s] += vpus;
				}
				total_cpus += cpus_cnt[s];
			}
			for (s = 0; s < sockets && total_cpus > cpus; s++) {
				if (cpus_cnt[s] > ntasks_per_socket) {
					x_cpus = cpus_cnt[s] -ntasks_per_socket;
					cpus_cnt[s] = ntasks_per_socket;
					total_cpus -= x_cpus;
				}
			}
			for (s = 0; s < sockets && total_cpus > cpus; s++) {
				if ((cpus_cnt[s] <= ntasks_per_socket) &&
				    (total_cpus - cpus_cnt[s] >= cpus)) {
					sock_avoid[s] = true;
					total_cpus -= cpus_cnt[s];
				}
			}
			xfree(cpus_cnt);
		}

		while (cpus > 0) {
			uint16_t prev_cpus = cpus;
			for (s = 0; s < sockets && cpus > 0; s++) {
				if (sock_avoid[s])
					continue;
				while (sock_start[s] < sock_end[s]) {
					if (bit_test(core_map,sock_start[s])) {
						sock_used[s] = true;
						core_cnt++;
						break;
					} else
						sock_start[s]++;
				}

				if (sock_start[s] == sock_end[s])
					/* this socket is unusable */
					continue;
				if (cpus < vpus)
					cpus = 0;
				else
					cpus -= vpus;
				sock_start[s]++;
			}
			if (prev_cpus == cpus) {
				/* we're stuck! */
				job_ptr->priority = 0;
				job_ptr->state_reason = WAIT_HELD;
				error("cons_res: sync loop not progressing, "
				      "holding job %u", job_ptr->job_id);
				error_code = SLURM_ERROR;
				goto fini;
			}
		}

		/* clear the rest of the cores in each socket
		 * FIXME: do we need min_core/min_socket checks here? */
		for (s = 0; s < sockets; s++) {
			if (sock_start[s] == sock_end[s])
				continue;
			if (!alloc_sockets || !sock_used[s]) {
				bit_nclear(core_map, sock_start[s],
					   sock_end[s]-1);
			}
			if ((select_node_record[n].vpus >= 1) &&
			    (alloc_sockets || alloc_cores) && sock_used[s]) {
				for (j=sock_start[s]; j<sock_end[s]; j++) {
					/* Mark all cores as used */
					if (alloc_sockets)
						bit_set(core_map, j);
					if (bit_test(core_map, j))
						core_cnt++;
				}
			}
		}
		if ((alloc_cores || alloc_sockets) &&
		    (select_node_record[n].vpus >= 1)) {
			job_res->cpus[i] = core_cnt *
					   select_node_record[n].vpus;
		}
		i++;
		/* advance 'c' to the beginning of the next node */
		c += sockets * cps;
	}
fini:	xfree(sock_avoid);
	xfree(sock_start);
	xfree(sock_end);
	xfree(sock_used);
	return error_code;
}
Example #14
/* Execute C0 control sequence. */
int
input_c0_dispatch(struct input_ctx *ictx)
{
	struct screen_write_ctx	*sctx = &ictx->ctx;
	struct window_pane	*wp = ictx->wp;
	struct screen		*s = sctx->s;
	u_int			 trigger;

	log_debug("%s: '%c", __func__, ictx->ch);

	switch (ictx->ch) {
	case '\000':	/* NUL */
		break;
	case '\007':	/* BEL */
		wp->window->flags |= WINDOW_BELL;
		break;
	case '\010':	/* BS */
		screen_write_backspace(sctx);
		goto count_c0;
	case '\011':	/* HT */
		/* Don't tab beyond the end of the line. */
		if (s->cx >= screen_size_x(s) - 1)
			break;

		/* Find the next tab point, or use the last column if none. */
		do {
			s->cx++;
			if (bit_test(s->tabs, s->cx))
				break;
		} while (s->cx < screen_size_x(s) - 1);
		break;
	case '\012':	/* LF */
	case '\013':	/* VT */
	case '\014':	/* FF */
		screen_write_linefeed(sctx, 0);
		goto count_c0;
	case '\015':	/* CR */
		screen_write_carriagereturn(sctx);
		goto count_c0;
	case '\016':	/* SO */
		ictx->cell.set = 1;
		break;
	case '\017':	/* SI */
		ictx->cell.set = 0;
		break;
	default:
		log_debug("%s: unknown '%c'", __func__, ictx->ch);
		break;
	}

	return (0);

count_c0:
	trigger = options_get_number(&wp->window->options, "c0-change-trigger");
	if (trigger != 0 && ++wp->changes >= trigger) {
		wp->flags |= PANE_DROP;
		window_pane_timer_start(wp);
	}

	return (0);
}
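
The HT ('\011') case in input_c0_dispatch() above walks a tab-stop bitmap to
find the next stop, falling back to the last column. A stand-alone sketch of
the same search (not tmux code; a char array replaces the bitstr_t, and the
classic stops every 8 columns are assumed):

#include <stdio.h>

#define COLS 20

static int next_tab_stop(const char *tabs, int cx)
{
	/* mirror the do/while above: always advance at least one column,
	 * never move past the last one */
	if (cx >= COLS - 1)
		return cx;
	do {
		cx++;
		if (tabs[cx])
			break;
	} while (cx < COLS - 1);
	return cx;
}

int main(void)
{
	char tabs[COLS] = { 0 };
	int c;

	for (c = 8; c < COLS; c += 8)	/* tab stops at columns 8 and 16 */
		tabs[c] = 1;

	printf("from 0  -> %d\n", next_tab_stop(tabs, 0));	/* 8 */
	printf("from 9  -> %d\n", next_tab_stop(tabs, 9));	/* 16 */
	printf("from 17 -> %d\n", next_tab_stop(tabs, 17));	/* 19 */
	return 0;
}
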
Example #15
/*
 * batch_bind - Set the batch request message so as to bind the shell to the
 *	proper resources
 */
void batch_bind(batch_job_launch_msg_t *req)
{
	bitstr_t *req_map, *hw_map;
	slurm_cred_arg_t arg;
	uint16_t sockets=0, cores=0, num_cpus;
	int start, task_cnt=0;

	if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) {
		error("task/affinity: job lacks a credential");
		return;
	}
	start = _get_local_node_info(&arg, 0, &sockets, &cores);
	if (start != 0) {
		error("task/affinity: missing node 0 in job credential");
		slurm_cred_free_args(&arg);
		return;
	}
	if ((sockets * cores) == 0) {
		error("task/affinity: socket and core count both zero");
		slurm_cred_free_args(&arg);
		return;
	}

	num_cpus  = MIN((sockets * cores),
			 (conf->sockets * conf->cores));
	req_map = (bitstr_t *) bit_alloc(num_cpus);
	hw_map  = (bitstr_t *) bit_alloc(conf->block_map_size);

#ifdef HAVE_FRONT_END
{
	/* Since the front-end nodes are a shared resource, we limit each job
	 * to one CPU based upon monotonically increasing sequence number */
	static int last_id = 0;
	bit_set(hw_map, ((last_id++) % conf->block_map_size));
	task_cnt = 1;
}
#else
{
	char *str;
	int t, p;

	/* Transfer core_bitmap data to local req_map.
	 * The MOD function handles the case where fewer processes
	 * physically exist than are configured (slurmd is out of
	 * sync with the slurmctld daemon). */
	for (p = 0; p < (sockets * cores); p++) {
		if (bit_test(arg.job_core_bitmap, p))
			bit_set(req_map, (p % num_cpus));
	}

	str = (char *)bit_fmt_hexmask(req_map);
	debug3("task/affinity: job %u CPU mask from slurmctld: %s",
		req->job_id, str);
	xfree(str);

	for (p = 0; p < num_cpus; p++) {
		if (bit_test(req_map, p) == 0)
			continue;
		/* core_bitmap does not include threads, so we
		 * add them here but limit them to what the job
		 * requested */
		for (t = 0; t < conf->threads; t++) {
			uint16_t pos = p * conf->threads + t;
			if (pos >= conf->block_map_size) {
				info("more resources configured than exist");
				p = num_cpus;
				break;
			}
			bit_set(hw_map, pos);
			task_cnt++;
		}
	}
}
#endif
	if (task_cnt) {
		req->cpu_bind_type = CPU_BIND_MASK;
		if (conf->task_plugin_param & CPU_BIND_VERBOSE)
			req->cpu_bind_type |= CPU_BIND_VERBOSE;
		req->cpu_bind = (char *)bit_fmt_hexmask(hw_map);
		info("task/affinity: job %u CPU input mask for node: %s",
		     req->job_id, req->cpu_bind);
		/* translate abstract masks to actual hardware layout */
		_lllp_map_abstract_masks(1, &hw_map);
#ifdef HAVE_NUMA
		if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
			_match_masks_to_ldom(1, &hw_map);
		}
#endif
		xfree(req->cpu_bind);
		req->cpu_bind = (char *)bit_fmt_hexmask(hw_map);
		info("task/affinity: job %u CPU final HW mask for node: %s",
		     req->job_id, req->cpu_bind);
	} else {
		error("task/affinity: job %u allocated no CPUs",
		      req->job_id);
	}
	FREE_NULL_BITMAP(hw_map);
	FREE_NULL_BITMAP(req_map);
	slurm_cred_free_args(&arg);
}
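
batch_bind() ultimately hands slurmd a hexadecimal CPU mask string in
req->cpu_bind. A rough sketch of how such a mask is derived from a per-CPU
availability map (this is not the Slurm bit_fmt_hexmask() implementation, and
the CPU layout is invented):

#include <stdio.h>

#define NCPUS 8

int main(void)
{
	int cpu_map[NCPUS] = { 1, 1, 0, 0, 1, 1, 0, 0 };  /* CPUs 0,1,4,5 */
	unsigned int mask = 0;
	int i;

	/* bit i of the mask corresponds to logical CPU i */
	for (i = 0; i < NCPUS; i++)
		if (cpu_map[i])
			mask |= 1u << i;

	printf("cpu_bind mask: 0x%x\n", mask);	/* prints 0x33 */
	return 0;
}
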
Example #16
/* Add the given job to the "active" structures of
 * the given partition and increment the run count */
static void _add_job_to_active(struct job_record *job_ptr,
			       struct gs_part *p_ptr)
{
	job_resources_t *job_res = job_ptr->job_resrcs;
	uint16_t job_gr_type;

	/* add job to active_resmap */
	job_gr_type = _get_part_gr_type(job_ptr->part_ptr);
	if ((job_gr_type == GS_CPU2) || (job_gr_type == GS_CORE) ||
	    (job_gr_type == GS_SOCKET)) {
		if (p_ptr->jobs_active == 0 && p_ptr->active_resmap) {
			uint32_t size = bit_size(p_ptr->active_resmap);
			bit_nclear(p_ptr->active_resmap, 0, size-1);
		}
		add_job_to_cores(job_res, &(p_ptr->active_resmap),
				 gs_bits_per_node);
		if (job_gr_type == GS_SOCKET)
			_fill_sockets(job_res->node_bitmap, p_ptr);
	} else { /* GS_NODE or GS_CPU */
		if (!p_ptr->active_resmap) {
			if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
				info("gang: _add_job_to_active: job %u first",
				     job_ptr->job_id);
			}
			p_ptr->active_resmap = bit_copy(job_res->node_bitmap);
		} else if (p_ptr->jobs_active == 0) {
			if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
				info("gang: _add_job_to_active: job %u copied",
				     job_ptr->job_id);
			}
			bit_copybits(p_ptr->active_resmap,
				     job_res->node_bitmap);
		} else {
			if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
				info("gang: _add_job_to_active: adding job %u",
				     job_ptr->job_id);
			}
			bit_or(p_ptr->active_resmap, job_res->node_bitmap);
		}
	}

	/* add job to the active_cpus array */
	if (job_gr_type == GS_CPU) {
		uint32_t i, a, sz = bit_size(p_ptr->active_resmap);
		if (!p_ptr->active_cpus) {
			/* create active_cpus array */
			p_ptr->active_cpus = xmalloc(sz * sizeof(uint16_t));
		}
		if (p_ptr->jobs_active == 0) {
			/* overwrite the existing values in active_cpus */
			for (a = 0, i = 0; i < sz; i++) {
				if (bit_test(job_res->node_bitmap, i)) {
					p_ptr->active_cpus[i] =
						job_res->cpus[a++];
				} else {
					p_ptr->active_cpus[i] = 0;
				}
			}
		} else {
			/* add job to existing jobs in the active cpus */
			for (a = 0, i = 0; i < sz; i++) {
				if (bit_test(job_res->node_bitmap, i)) {
					uint16_t limit = _get_phys_bit_cnt(i);
					p_ptr->active_cpus[i] +=
						job_res->cpus[a++];
					/* when adding shadows, the resources
					 * may get overcommitted */
					if (p_ptr->active_cpus[i] > limit)
						p_ptr->active_cpus[i] = limit;
				}
			}
		}
	}
	p_ptr->jobs_active += 1;
}
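
In the GS_CPU branch of _add_job_to_active() above, per-node CPU counts of the
new job are added to the partition's active_cpus array and clamped to each
node's physical CPU count, since gang-scheduled "shadow" jobs may overcommit.
A toy version of that bookkeeping (not Slurm code; the node counts are made
up):

#include <stdio.h>

#define NODES 3

int main(void)
{
	int active_cpus[NODES] = { 4, 6, 0 };	/* CPUs already in use */
	int job_cpus[NODES]    = { 4, 4, 4 };	/* CPUs of the new job */
	int phys_cpus[NODES]   = { 8, 8, 8 };	/* physical limit per node */
	int i;

	for (i = 0; i < NODES; i++) {
		active_cpus[i] += job_cpus[i];
		if (active_cpus[i] > phys_cpus[i])
			active_cpus[i] = phys_cpus[i];	/* overcommit clamp */
		printf("node %d: active %d / %d\n",
		       i, active_cpus[i], phys_cpus[i]);
	}
	return 0;
}
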
Example #17
/* Determine which CPUs a job step can use.
 * OUT whole_<entity>_count - returns count of whole <entities> in this
 *                            allocation for this node
 * OUT part_<entity>_count  - returns count of partial <entities> in this
 *                            allocation for this node
 * RET - a string representation of the available mask or NULL on error
 * NOTE: Caller must xfree() the return value. */
static char *_alloc_mask(launch_tasks_request_msg_t *req,
			 int *whole_node_cnt,  int *whole_socket_cnt,
			 int *whole_core_cnt,  int *whole_thread_cnt,
			 int *part_socket_cnt, int *part_core_cnt)
{
	uint16_t sockets, cores, threads;
	int c, s, t, i;
	int c_miss, s_miss, t_miss, c_hit, t_hit;
	bitstr_t *alloc_bitmap;
	char *str_mask;
	bitstr_t *alloc_mask;

	*whole_node_cnt   = 0;
	*whole_socket_cnt = 0;
	*whole_core_cnt   = 0;
	*whole_thread_cnt = 0;
	*part_socket_cnt  = 0;
	*part_core_cnt    = 0;

	alloc_bitmap = _get_avail_map(req, &sockets, &cores, &threads);
	if (!alloc_bitmap)
		return NULL;

	alloc_mask = bit_alloc(bit_size(alloc_bitmap));

	i = 0;
	for (s=0, s_miss=false; s<sockets; s++) {
		for (c=0, c_hit=c_miss=false; c<cores; c++) {
			for (t=0, t_hit=t_miss=false; t<threads; t++) {
				/* If we are pretending we have a
				   larger system than we really have
				   this is needed to make sure we
				   don't bust the bank.
				*/
				if (i >= bit_size(alloc_bitmap))
					i = 0;
				if (bit_test(alloc_bitmap, i)) {
					bit_set(alloc_mask, i);
					(*whole_thread_cnt)++;
					t_hit = true;
					c_hit = true;
				} else
					t_miss = true;
				i++;
			}
			if (!t_miss)
				(*whole_core_cnt)++;
			else {
				if (t_hit)
					(*part_core_cnt)++;
				c_miss = true;
			}
		}
		if (!c_miss)
			(*whole_socket_cnt)++;
		else {
			if (c_hit)
				(*part_socket_cnt)++;
			s_miss = true;
		}
	}
	if (!s_miss)
		(*whole_node_cnt)++;
	FREE_NULL_BITMAP(alloc_bitmap);

	/* translate abstract masks to actual hardware layout */
	_lllp_map_abstract_masks(1, &alloc_mask);

#ifdef HAVE_NUMA
	if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
		_match_masks_to_ldom(1, &alloc_mask);
	}
#endif

	str_mask = bit_fmt_hexmask(alloc_mask);
	FREE_NULL_BITMAP(alloc_mask);
	return str_mask;
}
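
_alloc_mask() classifies nodes, sockets and cores as "whole" or "partial" by
checking every lower-level unit for availability. A self-contained sketch of
the core-level classification (not Slurm code; the 3x2 availability table is
invented):

#include <stdio.h>

#define CORES   3
#define THREADS 2

int main(void)
{
	/* availability per thread, indexed [core][thread] */
	int avail[CORES][THREADS] = { {1, 1}, {1, 0}, {0, 0} };
	int whole = 0, part = 0;
	int c, t;

	for (c = 0; c < CORES; c++) {
		int hit = 0, miss = 0;
		for (t = 0; t < THREADS; t++) {
			if (avail[c][t])
				hit = 1;	/* at least one thread usable */
			else
				miss = 1;	/* at least one thread missing */
		}
		if (!miss)
			whole++;
		else if (hit)
			part++;
	}
	printf("whole cores: %d, partial cores: %d\n", whole, part);	/* 1, 1 */
	return 0;
}
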
Example #18
/*
 * topo_get_node_addr - build node address and the associated pattern
 *      based on the topology information
 *
 * example of output :
 *      address : s0.s4.s8.tux1
 *      pattern : switch.switch.switch.node
 */
extern int topo_get_node_addr(char* node_name, char** paddr, char** ppattern)
{
	struct node_record *node_ptr;
	int node_inx;
	hostlist_t sl = NULL;

	int s_max_level = 0;
	int i, j;

	/* no switches found, return */
	if ( switch_record_cnt == 0 ) {
		*paddr = xstrdup(node_name);
		*ppattern = xstrdup("node");
		return SLURM_SUCCESS;
	}

	node_ptr = find_node_record(node_name);
	/* node not found in configuration */
	if ( node_ptr == NULL )
		return SLURM_ERROR;
	node_inx = node_ptr - node_record_table_ptr;

	/* look for switches max level */
	for (i=0; i<switch_record_cnt; i++) {
		if ( switch_record_table[i].level > s_max_level )
			s_max_level = switch_record_table[i].level;
	}

	/* initialize output parameters */
	*paddr = xstrdup("");
	*ppattern = xstrdup("");

	/* build node topology address and the associated pattern */
	for (j=s_max_level; j>=0; j--) {
		for (i=0; i<switch_record_cnt; i++) {
			if ( switch_record_table[i].level != j )
				continue;
			if ( !bit_test(switch_record_table[i].node_bitmap,
				       node_inx) )
				continue;
			if ( sl == NULL ) {
				sl = hostlist_create(switch_record_table[i].
						     name);
			} else {
				hostlist_push_host(sl,
						   switch_record_table[i].
						   name);
			}
		}
		if ( sl ) {
			char *buf = hostlist_ranged_string_xmalloc(sl);
			xstrcat(*paddr,buf);
			xfree(buf);
			hostlist_destroy(sl);
			sl = NULL;
		}
		xstrcat(*paddr, ".");
		xstrcat(*ppattern, "switch.");
	}

	/* append node name */
	xstrcat(*paddr, node_name);
	xstrcat(*ppattern, "node");

	return SLURM_SUCCESS;
}
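
The address/pattern strings documented at the top of topo_get_node_addr() are
simple dot-joined lists, highest switch level first, ending with the node
name. A stand-alone sketch reproducing the example from the comment (not
Slurm code; the switch names are hard-coded):

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *switches[] = { "s0", "s4", "s8" };	/* highest level first */
	const char *node = "tux1";
	char addr[128] = "", pattern[128] = "";
	int i;

	for (i = 0; i < 3; i++) {
		strcat(addr, switches[i]);
		strcat(addr, ".");
		strcat(pattern, "switch.");
	}
	strcat(addr, node);
	strcat(pattern, "node");

	printf("address : %s\n", addr);		/* s0.s4.s8.tux1 */
	printf("pattern : %s\n", pattern);	/* switch.switch.switch.node */
	return 0;
}
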
Example #19
/*
 * _task_layout_lllp_cyclic
 *
 * task_layout_lllp_cyclic creates a cyclic distribution at the
 * lowest level of logical processor which is either socket, core or
 * thread depending on the system architecture. The Cyclic algorithm
 * is the same as the Cyclic distribution performed in srun.
 *
 *  Distribution at the lllp:
 *  -m hostfile|block|cyclic:block|cyclic
 *
 * The first distribution "hostfile|block|cyclic" is computed
 * in srun. The second distribution "block|cyclic" is computed
 * locally by each slurmd.
 *
 * The input to the lllp distribution algorithms is the gids (tasks
 * ids) generated for the local node.
 *
 * The output is a mapping of the gids onto logical processors
 * (thread/core/socket), which is expressed as cpu_bind masks.
 *
 * If a task asks for more than one CPU per task, put the tasks as
 * close as possible (fill core rather than going next socket for the
 * extra task)
 *
 */
static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
				    uint32_t node_id, bitstr_t ***masks_p)
{
	int last_taskcount = -1, taskcount = 0;
	uint16_t i, s, hw_sockets = 0, hw_cores = 0, hw_threads = 0;
	uint16_t offset = 0, p = 0;
	int size, max_tasks = req->tasks_to_launch[(int)node_id];
	int max_cpus = max_tasks * req->cpus_per_task;
	bitstr_t *avail_map;
	bitstr_t **masks = NULL;
	int *socket_last_pu = NULL;

	info ("_task_layout_lllp_cyclic ");

	avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
	if (!avail_map)
		return SLURM_ERROR;

	size = bit_set_count(avail_map);
	if (size < max_tasks) {
		error("task/affinity: only %d bits in avail_map for %d tasks!",
		      size, max_tasks);
		FREE_NULL_BITMAP(avail_map);
		return SLURM_ERROR;
	}
	if (size < max_cpus) {
		/* Possible result of overcommit */
		i = size / max_tasks;
		info("task/affinity: reset cpus_per_task from %d to %d",
		     req->cpus_per_task, i);
		req->cpus_per_task = i;
	}

	socket_last_pu = xmalloc(hw_sockets * sizeof(int));

	*masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
	masks = *masks_p;

	size = bit_size(avail_map);

	offset = hw_cores * hw_threads;
	s = 0;
	while (taskcount < max_tasks) {
		if (taskcount == last_taskcount)
			fatal("_task_layout_lllp_cyclic failure");
		last_taskcount = taskcount;
		for (i = 0; i < size; i++) {
			bool already_switched = false;
			uint16_t bit = socket_last_pu[s] + (s * offset);

			/* In case hardware and config differ */
			bit %= size;

			/* set up for the next one */
			socket_last_pu[s]++;
			/* skip unrequested threads */
			if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
				socket_last_pu[s] += hw_threads-1;
			if (socket_last_pu[s] >= offset) {
				/* Switch to the next socket; we have
				 * run out of PUs on this one. */

				/* This only happens if the slurmctld
				   gave us an allocation that made a
				   task split sockets.
				*/
				s = (s + 1) % hw_sockets;
				already_switched = true;
			}

			if (!bit_test(avail_map, bit))
				continue;

			if (!masks[taskcount])
				masks[taskcount] =
					bit_alloc(conf->block_map_size);

			//info("setting %d %d", taskcount, bit);
			bit_set(masks[taskcount], bit);

			if (!already_switched
			    && ((req->task_dist == SLURM_DIST_CYCLIC_CFULL) ||
				(req->task_dist == SLURM_DIST_BLOCK_CFULL))) {
				/* This means we are laying out cpus
				 * within a task cyclically as well. */
				s = (s + 1) % hw_sockets;
				already_switched = true;
			}

			if (++p < req->cpus_per_task)
				continue;
			p = 0;

			if (!already_switched) {
				/* Now that we have finished a task, switch to
				 * the next socket. */
				s = (s + 1) % hw_sockets;
			}

			if (++taskcount >= max_tasks)
				break;
		}
	}

	/* last step: expand the masks to bind each task
	 * to the requested resource */
	_expand_masks(req->cpu_bind_type, max_tasks, masks,
		      hw_sockets, hw_cores, hw_threads, avail_map);
	FREE_NULL_BITMAP(avail_map);
	xfree(socket_last_pu);

	return SLURM_SUCCESS;
}
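
_task_layout_lllp_cyclic() hands tasks to sockets round-robin, each task
taking the next unused processing unit (PU) on its socket. A toy version of
that placement (not Slurm code; 2 sockets with 4 PUs each and 6 tasks are
assumed, with enough PUs for every task):

#include <stdio.h>

#define SOCKETS 2
#define PUS     4	/* PUs per socket */
#define TASKS   6

int main(void)
{
	int next_pu[SOCKETS] = { 0, 0 };
	int s = 0, task;

	for (task = 0; task < TASKS; task++) {
		int pu = s * PUS + next_pu[s]++;
		printf("task %d -> socket %d, PU %d\n", task, s, pu);
		s = (s + 1) % SOCKETS;	/* next task goes to the next socket */
	}
	return 0;
}
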
Example #20
/**
 * do_basil_reserve - create a BASIL reservation.
 * IN job_ptr - pointer to job which has just been allocated resources
 * RET 0 or error code, job will abort or be requeued on failure
 */
extern int do_basil_reserve(struct job_record *job_ptr)
{
    struct nodespec *ns_head = NULL;
    uint16_t mppwidth = 0, mppdepth, mppnppn;
    uint32_t mppmem = 0, node_min_mem = 0;
    uint32_t resv_id;
    int i, first_bit, last_bit;
    hostlist_t hl;
    long rc;
    char *user, batch_id[16];

    if (!job_ptr->job_resrcs || job_ptr->job_resrcs->nhosts == 0)
        return SLURM_SUCCESS;

    debug3("job #%u: %u nodes = %s, cpus=%u" , job_ptr->job_id,
           job_ptr->job_resrcs->nhosts,
           job_ptr->job_resrcs->nodes,
           job_ptr->job_resrcs->ncpus
          );

    if (job_ptr->job_resrcs->node_bitmap == NULL) {
        error("job %u node_bitmap not set", job_ptr->job_id);
        return SLURM_SUCCESS;
    }

    first_bit = bit_ffs(job_ptr->job_resrcs->node_bitmap);
    last_bit  = bit_fls(job_ptr->job_resrcs->node_bitmap);
    if (first_bit == -1 || last_bit == -1)
        return SLURM_SUCCESS;		/* no nodes allocated */

    mppdepth = MAX(1, job_ptr->details->cpus_per_task);
    mppnppn  = job_ptr->details->ntasks_per_node;

    /* mppmem */
    if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
        /* Only honour --mem-per-cpu if --ntasks has been given */
        if (job_ptr->details->num_tasks)
            mppmem = job_ptr->details->pn_min_memory & ~MEM_PER_CPU;
    } else if (job_ptr->details->pn_min_memory) {
        node_min_mem = job_ptr->details->pn_min_memory;
    }

    hl = hostlist_create("");
    if (hl == NULL)
        fatal("hostlist_create: malloc error");

    for (i = first_bit; i <= last_bit; i++) {
        struct node_record *node_ptr = node_record_table_ptr + i;
        uint32_t basil_node_id;

        if (!bit_test(job_ptr->job_resrcs->node_bitmap, i))
            continue;

        if (!node_ptr->name || node_ptr->name[0] == '\0')
            continue;	/* bad node */

        if (sscanf(node_ptr->name, "nid%05u", &basil_node_id) != 1)
            fatal("can not read basil_node_id from %s", node_ptr->name);

        if (ns_add_node(&ns_head, basil_node_id) != 0) {
            error("can not add node %s (nid%05u)", node_ptr->name,
                  basil_node_id);
            free_nodespec(ns_head);
            return SLURM_ERROR;
        }

        if (node_min_mem) {
            uint32_t node_cpus, node_mem;

            if (slurmctld_conf.fast_schedule) {
                node_cpus = node_ptr->config_ptr->cpus;
                node_mem  = node_ptr->config_ptr->real_memory;
            } else {
                node_cpus = node_ptr->cpus;
                node_mem  = node_ptr->real_memory;
            }
            /*
             * ALPS 'Processing Elements per Node' value (aprun -N),
             * which in slurm is --ntasks-per-node and 'mppnppn' in
             * PBS: if --ntasks is specified, default to the number
             * of cores per node (also the default for 'aprun -N').
             */
            node_mem /= mppnppn ? mppnppn : node_cpus;

            mppmem = node_min_mem = MIN(node_mem, node_min_mem);
        }
    }

    /* mppwidth */
    for (i = 0; i < job_ptr->job_resrcs->nhosts; i++) {
        uint16_t node_tasks = job_ptr->job_resrcs->cpus[i] / mppdepth;

        if (mppnppn && mppnppn < node_tasks)
            node_tasks = mppnppn;
        mppwidth += node_tasks;
    }

    snprintf(batch_id, sizeof(batch_id), "%u", job_ptr->job_id);
    user = uid_to_string(job_ptr->user_id);
    rc   = basil_reserve(user, batch_id, mppwidth,
                         mppdepth, mppnppn, mppmem, ns_head);
    xfree(user);
    if (rc <= 0) {
        /* errno value will be resolved by select_g_job_begin() */
        errno = is_transient_error(rc) ? EAGAIN : ECONNABORTED;
        return SLURM_ERROR;
    }

    resv_id	= rc;
    if (_set_select_jobinfo(job_ptr->select_jobinfo->data,
                            SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) {
        /*
         * This is a fatal error since it means we will not be able to
         * confirm the reservation; no step will be able to run in it.
         */
        error("job %u: can not set resId %u", job_ptr->job_id, resv_id);
        basil_release(resv_id);
        return SLURM_ERROR;
    }

    info("ALPS RESERVATION #%u, JobId %u: BASIL -n %d -N %d -d %d -m %d",
         resv_id, job_ptr->job_id, mppwidth, mppnppn, mppdepth, mppmem);

    return SLURM_SUCCESS;
}
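
The mppwidth loop near the end of do_basil_reserve() sums, for each allocated
node, the smaller of cpus/mppdepth and mppnppn. A small worked example (not
Slurm/ALPS code; the allocation numbers are invented):

#include <stdio.h>

int main(void)
{
	int cpus[] = { 16, 16, 8 };	/* CPUs allocated on each node */
	int nhosts = 3;
	int mppdepth = 4;		/* CPUs per task  (aprun -d) */
	int mppnppn = 3;		/* tasks per node (aprun -N) */
	int mppwidth = 0, i;

	for (i = 0; i < nhosts; i++) {
		int node_tasks = cpus[i] / mppdepth;
		if (mppnppn && mppnppn < node_tasks)
			node_tasks = mppnppn;	/* cap at --ntasks-per-node */
		mppwidth += node_tasks;
	}
	printf("mppwidth (aprun -n) = %d\n", mppwidth);	/* 3 + 3 + 2 = 8 */
	return 0;
}
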