Esempio n. 1
0
extern int slurm_jobcomp_log_record(struct job_record *job_ptr)
{
	int nwritten, B_SIZE = 1024;
	char usr_str[32], grp_str[32], start_str[32], end_str[32];
	char submit_str[32], *cluster = NULL, *qos, *state_string;
	time_t elapsed_time, submit_time, eligible_time;
	enum job_states job_state;
	uint32_t time_limit;
	uint16_t ntasks_per_node;
	int i;
	char *buffer, tmp_str[256], *script_str, *script;
	struct job_node *jnode;

	if (list_count(jobslist) > MAX_JOBS) {
		error("%s: Limit of %d enqueued jobs in memory waiting to be "
		      "indexed reached. Job %lu discarded", plugin_type,
		      MAX_JOBS, (unsigned long)job_ptr->job_id);
		return SLURM_ERROR;
	}

	_get_user_name(job_ptr->user_id, usr_str, sizeof(usr_str));
	_get_group_name(job_ptr->group_id, grp_str, sizeof(grp_str));

	if ((job_ptr->time_limit == NO_VAL) && job_ptr->part_ptr)
		time_limit = job_ptr->part_ptr->max_time;
	else
		time_limit = job_ptr->time_limit;

	if (job_ptr->job_state & JOB_RESIZING) {
		time_t now = time(NULL);
		state_string = job_state_string(job_ptr->job_state);
		if (job_ptr->resize_time) {
			_make_time_str(&job_ptr->resize_time, start_str,
				       sizeof(start_str));
		} else {
			_make_time_str(&job_ptr->start_time, start_str,
				       sizeof(start_str));
		}
		_make_time_str(&now, end_str, sizeof(end_str));
	} else {
		/* Job state will typically have JOB_COMPLETING or JOB_RESIZING
		 * flag set when called. We remove the flags to get the eventual
		 * completion state: JOB_FAILED, JOB_TIMEOUT, etc. */
		job_state = job_ptr->job_state & JOB_STATE_BASE;
		state_string = job_state_string(job_state);
		if (job_ptr->resize_time) {
			_make_time_str(&job_ptr->resize_time, start_str,
				       sizeof(start_str));
		} else if (job_ptr->start_time > job_ptr->end_time) {
			/* Job cancelled while pending and
			 * expected start time is in the future. */
			snprintf(start_str, sizeof(start_str), "Unknown");
		} else {
			_make_time_str(&job_ptr->start_time, start_str,
				       sizeof(start_str));
		}
		_make_time_str(&job_ptr->end_time, end_str, sizeof(end_str));
	}

	elapsed_time = job_ptr->end_time - job_ptr->start_time;

	buffer = xmalloc(B_SIZE);

	nwritten = snprintf(buffer, B_SIZE, JOBCOMP_DATA_FORMAT,
			    (unsigned long) job_ptr->job_id, usr_str,
			    (unsigned long) job_ptr->user_id, grp_str,
			    (unsigned long) job_ptr->group_id, start_str,
			    end_str, (long) elapsed_time,
			    job_ptr->partition, job_ptr->alloc_node,
			    job_ptr->nodes, (unsigned long) job_ptr->total_cpus,
			    (unsigned long) job_ptr->total_nodes,
			    (unsigned long) job_ptr->derived_ec,
			    (unsigned long) job_ptr->exit_code, state_string);

	if (nwritten >= B_SIZE) {
		B_SIZE += nwritten + 1;
		buffer = xrealloc(buffer, B_SIZE);

		nwritten = snprintf(buffer, B_SIZE, JOBCOMP_DATA_FORMAT,
				    (unsigned long) job_ptr->job_id, usr_str,
				    (unsigned long) job_ptr->user_id, grp_str,
				    (unsigned long) job_ptr->group_id,
				    start_str, end_str, (long) elapsed_time,
				    job_ptr->partition, job_ptr->alloc_node,
				    job_ptr->nodes,
				    (unsigned long) job_ptr->total_cpus,
				    (unsigned long) job_ptr->total_nodes,
				    (unsigned long) job_ptr->derived_ec,
				    (unsigned long) job_ptr->exit_code,
				    state_string);

		if (nwritten >= B_SIZE) {
			error("%s: Job completion data truncated and lost",
			      plugin_type);
			return SLURM_ERROR;
		}
	}

	snprintf(tmp_str, sizeof(tmp_str), ",\"cpu_hours\":%.6f",
		 ((float) elapsed_time * (float) job_ptr->total_cpus) /
		  (float) 3600);
	xstrcat(buffer, tmp_str);

	if (job_ptr->array_task_id != NO_VAL) {
		xstrfmtcat(buffer, ",\"array_job_id\":%lu",
			   (unsigned long) job_ptr->array_job_id);
		xstrfmtcat(buffer, ",\"array_task_id\":%lu",
			   (unsigned long) job_ptr->array_task_id);
	}

	if (job_ptr->details && (job_ptr->details->submit_time != NO_VAL)) {
		submit_time = job_ptr->details->submit_time;
		_make_time_str(&submit_time, submit_str, sizeof(submit_str));
		xstrfmtcat(buffer, ",\"@submit\":\"%s\"", submit_str);
	}

	if (job_ptr->details && (job_ptr->details->begin_time != NO_VAL)) {
		eligible_time = job_ptr->start_time -
				job_ptr->details->begin_time;
		xstrfmtcat(buffer, ",\"eligible_time\":%lu", eligible_time);
	}

	if (job_ptr->details
	    && (job_ptr->details->work_dir && job_ptr->details->work_dir[0])) {
		xstrfmtcat(buffer, ",\"work_dir\":\"%s\"",
			   job_ptr->details->work_dir);
	}

	if (job_ptr->details
	    && (job_ptr->details->std_err && job_ptr->details->std_err[0])) {
		xstrfmtcat(buffer, ",\"std_err\":\"%s\"",
			   job_ptr->details->std_err);
	}

	if (job_ptr->details
	    && (job_ptr->details->std_in && job_ptr->details->std_in[0])) {
		xstrfmtcat(buffer, ",\"std_in\":\"%s\"",
			   job_ptr->details->std_in);
	}

	if (job_ptr->details
	    && (job_ptr->details->std_out && job_ptr->details->std_out[0])) {
		xstrfmtcat(buffer, ",\"std_out\":\"%s\"",
			   job_ptr->details->std_out);
	}

	if (job_ptr->assoc_ptr != NULL) {
		cluster = ((slurmdb_assoc_rec_t *) job_ptr->assoc_ptr)->cluster;
		xstrfmtcat(buffer, ",\"cluster\":\"%s\"", cluster);
	}

	if (job_ptr->qos_ptr != NULL) {
		slurmdb_qos_rec_t *assoc =
			(slurmdb_qos_rec_t *) job_ptr->qos_ptr;
		qos = assoc->name;
		xstrfmtcat(buffer, ",\"qos\":\"%s\"", qos);
	}

	if (job_ptr->details && (job_ptr->details->num_tasks != NO_VAL)) {
		xstrfmtcat(buffer, ",\"ntasks\":%hu",
			   job_ptr->details->num_tasks);
	}

	if (job_ptr->details && (job_ptr->details->ntasks_per_node != NO_VAL)) {
		ntasks_per_node = job_ptr->details->ntasks_per_node;
		xstrfmtcat(buffer, ",\"ntasks_per_node\":%hu", ntasks_per_node);
	}

	if (job_ptr->details && (job_ptr->details->cpus_per_task != NO_VAL)) {
		xstrfmtcat(buffer, ",\"cpus_per_task\":%hu",
			   job_ptr->details->cpus_per_task);
	}

	if (job_ptr->details
	    && (job_ptr->details->orig_dependency
		&& job_ptr->details->orig_dependency[0])) {
		xstrfmtcat(buffer, ",\"orig_dependency\":\"%s\"",
			   job_ptr->details->orig_dependency);
	}

	if (job_ptr->details
	    && (job_ptr->details->exc_nodes
		&& job_ptr->details->exc_nodes[0])) {
		xstrfmtcat(buffer, ",\"excluded_nodes\":\"%s\"",
			   job_ptr->details->exc_nodes);
	}

	if (time_limit != INFINITE) {
		xstrfmtcat(buffer, ",\"time_limit\":%lu",
			   (unsigned long) time_limit * 60);
	}

	if (job_ptr->resv_name && job_ptr->resv_name[0]) {
		xstrfmtcat(buffer, ",\"reservation_name\":\"%s\"",
			   job_ptr->resv_name);
	}

	if (job_ptr->gres_req && job_ptr->gres_req[0]) {
		xstrfmtcat(buffer, ",\"gres_req\":\"%s\"", job_ptr->gres_req);
	}

	if (job_ptr->gres_alloc && job_ptr->gres_alloc[0]) {
		xstrfmtcat(buffer, ",\"gres_alloc\":\"%s\"",
			   job_ptr->gres_alloc);
	}

	if (job_ptr->account && job_ptr->account[0]) {
		xstrfmtcat(buffer, ",\"account\":\"%s\"", job_ptr->account);
	}

	script = get_job_script(job_ptr);
	if (script && script[0]) {
		script_str = _json_escape(script);
		xstrfmtcat(buffer, ",\"script\":\"%s\"", script_str);
		xfree(script_str);
	}
	xfree(script);

	if (job_ptr->assoc_ptr) {
		assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK,
					   NO_LOCK, NO_LOCK, NO_LOCK };
		slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr;
		char *parent_accounts = NULL;
		char **acc_aux = NULL;
		int nparents = 0;

		assoc_mgr_lock(&locks);

		/* Start at the first parent and go up.  When studying
		 * this code it was slightly faster to do 2 loops on
		 * the association linked list and only 1 xmalloc but
		 * we opted for cleaner looking code and going with a
		 * realloc. */
		while (assoc_ptr) {
			if (assoc_ptr->acct) {
				acc_aux = xrealloc(acc_aux,
						   sizeof(char *) *
						   (nparents + 1));
				acc_aux[nparents++] = assoc_ptr->acct;
			}
			assoc_ptr = assoc_ptr->usage->parent_assoc_ptr;
		}

		for (i = nparents - 1; i >= 0; i--)
			xstrfmtcat(parent_accounts, "/%s", acc_aux[i]);
		xfree(acc_aux);

		xstrfmtcat(buffer, ",\"parent_accounts\":\"%s\"",
			   parent_accounts);

		xfree(parent_accounts);

		assoc_mgr_unlock(&locks);
	}

	xstrcat(buffer, "}");
	jnode = xmalloc(sizeof(struct job_node));
	jnode->serialized_job = xstrdup(buffer);
	list_enqueue(jobslist, jnode);

	return SLURM_SUCCESS;
}
Esempio n. 2
0
extern int slurm_jobcomp_log_record ( struct job_record *job_ptr )
{
	int rc = SLURM_SUCCESS;
	char job_rec[1024];
	char usr_str[32], grp_str[32], start_str[32], end_str[32], lim_str[32];
	char select_buf[128], *state_string, *work_dir;
	size_t offset = 0, tot_size, wrote;
	enum job_states job_state;
	uint32_t time_limit;

	if ((log_name == NULL) || (job_comp_fd < 0)) {
		error("JobCompLoc log file %s not open", log_name);
		return SLURM_ERROR;
	}

	slurm_mutex_lock( &file_lock );
	_get_user_name(job_ptr->user_id, usr_str, sizeof(usr_str));
	_get_group_name(job_ptr->group_id, grp_str, sizeof(grp_str));

	if ((job_ptr->time_limit == NO_VAL) && job_ptr->part_ptr)
		time_limit = job_ptr->part_ptr->max_time;
	else
		time_limit = job_ptr->time_limit;
	if (time_limit == INFINITE)
		strcpy(lim_str, "UNLIMITED");
	else {
		snprintf(lim_str, sizeof(lim_str), "%lu",
			 (unsigned long) time_limit);
	}

	if (job_ptr->job_state & JOB_RESIZING) {
		time_t now = time(NULL);
		state_string = job_state_string(job_ptr->job_state);
		if (job_ptr->resize_time) {
			_make_time_str(&job_ptr->resize_time, start_str,
				       sizeof(start_str));
		} else {
			_make_time_str(&job_ptr->start_time, start_str,
				       sizeof(start_str));
		}
		_make_time_str(&now, end_str, sizeof(end_str));
	} else {
		/* Job state will typically have JOB_COMPLETING or JOB_RESIZING
		 * flag set when called. We remove the flags to get the eventual
		 * completion state: JOB_FAILED, JOB_TIMEOUT, etc. */
		job_state = job_ptr->job_state & JOB_STATE_BASE;
		state_string = job_state_string(job_state);
		if (job_ptr->resize_time) {
			_make_time_str(&job_ptr->resize_time, start_str,
				       sizeof(start_str));
		} else if (job_ptr->start_time > job_ptr->end_time) {
			/* Job cancelled while pending and
			 * expected start time is in the future. */
			snprintf(start_str, sizeof(start_str), "Unknown");
		} else {
			_make_time_str(&job_ptr->start_time, start_str,
				       sizeof(start_str));
		}
		_make_time_str(&job_ptr->end_time, end_str, sizeof(end_str));
	}

	if (job_ptr->details && job_ptr->details->work_dir)
		work_dir = job_ptr->details->work_dir;
	else
		work_dir = "unknown";

	select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
		select_buf, sizeof(select_buf), SELECT_PRINT_MIXED);

	snprintf(job_rec, sizeof(job_rec), JOB_FORMAT,
		 (unsigned long) job_ptr->job_id, usr_str,
		 (unsigned long) job_ptr->user_id, grp_str,
		 (unsigned long) job_ptr->group_id, job_ptr->name,
		 state_string, job_ptr->partition, lim_str, start_str,
		 end_str, job_ptr->nodes, job_ptr->node_cnt,
		 job_ptr->total_cpus, work_dir,
		 select_buf);
	tot_size = strlen(job_rec);

	while ( offset < tot_size ) {
		wrote = write(job_comp_fd, job_rec + offset,
			tot_size - offset);
		if (wrote == -1) {
			if (errno == EAGAIN)
				continue;
			else {
				plugin_errno = errno;
				rc = SLURM_ERROR;
				break;
			}
		}
		offset += wrote;
	}
	slurm_mutex_unlock( &file_lock );
	return rc;
}
Esempio n. 3
0
extern int slurm_jobcomp_log_record ( struct job_record *job_ptr )
{
	int rc = SLURM_SUCCESS;
	char job_rec[1024];
	char usr_str[32], grp_str[32], start_str[32], end_str[32], lim_str[32];
	char *resv_name, *gres, *account, *qos, *wckey, *cluster;
	char submit_time[32], eligible_time[32], array_id[64], pack_id[64];
	char select_buf[128], *state_string, *work_dir;
	size_t offset = 0, tot_size, wrote;
	uint32_t job_state;
	uint32_t time_limit;

	if ((log_name == NULL) || (job_comp_fd < 0)) {
		error("JobCompLoc log file %s not open", log_name);
		return SLURM_ERROR;
	}

	slurm_mutex_lock( &file_lock );
	_get_user_name(job_ptr->user_id, usr_str, sizeof(usr_str));
	_get_group_name(job_ptr->group_id, grp_str, sizeof(grp_str));

	if ((job_ptr->time_limit == NO_VAL) && job_ptr->part_ptr)
		time_limit = job_ptr->part_ptr->max_time;
	else
		time_limit = job_ptr->time_limit;
	if (time_limit == INFINITE)
		strcpy(lim_str, "UNLIMITED");
	else {
		snprintf(lim_str, sizeof(lim_str), "%lu",
			 (unsigned long) time_limit);
	}

	if (job_ptr->job_state & JOB_RESIZING) {
		time_t now = time(NULL);
		state_string = job_state_string(job_ptr->job_state);
		if (job_ptr->resize_time) {
			_make_time_str(&job_ptr->resize_time, start_str,
				       sizeof(start_str));
		} else {
			_make_time_str(&job_ptr->start_time, start_str,
				       sizeof(start_str));
		}
		_make_time_str(&now, end_str, sizeof(end_str));
	} else {
		/* Job state will typically have JOB_COMPLETING or JOB_RESIZING
		 * flag set when called. We remove the flags to get the eventual
		 * completion state: JOB_FAILED, JOB_TIMEOUT, etc. */
		job_state = job_ptr->job_state & JOB_STATE_BASE;
		state_string = job_state_string(job_state);
		if (job_ptr->resize_time) {
			_make_time_str(&job_ptr->resize_time, start_str,
				       sizeof(start_str));
		} else if (job_ptr->start_time > job_ptr->end_time) {
			/* Job cancelled while pending and
			 * expected start time is in the future. */
			snprintf(start_str, sizeof(start_str), "Unknown");
		} else {
			_make_time_str(&job_ptr->start_time, start_str,
				       sizeof(start_str));
		}
		_make_time_str(&job_ptr->end_time, end_str, sizeof(end_str));
	}

	if (job_ptr->details && job_ptr->details->work_dir)
		work_dir = job_ptr->details->work_dir;
	else
		work_dir = "unknown";

	if (job_ptr->resv_name && job_ptr->resv_name[0])
		resv_name = job_ptr->resv_name;
	else
		resv_name = "";

	if (job_ptr->gres_req && job_ptr->gres_req[0])
		gres = job_ptr->gres_req;
	else
		gres = "";

	if (job_ptr->account && job_ptr->account[0])
		account = job_ptr->account;
	else
		account = "";

	if (job_ptr->qos_ptr != NULL) {
		qos = job_ptr->qos_ptr->name;
	} else
		qos = "";

	if (job_ptr->wckey && job_ptr->wckey[0])
		wckey = job_ptr->wckey;
	else
		wckey = "";

	if (job_ptr->assoc_ptr != NULL)
		cluster = job_ptr->assoc_ptr->cluster;
	else
		cluster = "unknown";

	if (job_ptr->details && job_ptr->details->submit_time) {
		_make_time_str(&job_ptr->details->submit_time,
			       submit_time, sizeof(submit_time));
	} else {
		snprintf(submit_time, sizeof(submit_time), "unknown");
	}

	if (job_ptr->details && job_ptr->details->begin_time) {
		_make_time_str(&job_ptr->details->begin_time,
			       eligible_time, sizeof(eligible_time));
	} else {
		snprintf(eligible_time, sizeof(eligible_time), "unknown");
	}

	if (job_ptr->array_task_id != NO_VAL) {
		snprintf(array_id, sizeof(array_id),
			 " ArrayJobId=%u ArrayTaskId=%u",
			 job_ptr->array_job_id, job_ptr->array_task_id);
	} else {
		array_id[0] = '\0';
	}

	if (job_ptr->pack_job_id) {
		snprintf(pack_id, sizeof(pack_id),
			 " PackJobId=%u PackJobOffset=%u",
			 job_ptr->pack_job_id, job_ptr->pack_job_offset);
	} else {
		pack_id[0] = '\0';
	}

	select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
		select_buf, sizeof(select_buf), SELECT_PRINT_MIXED);

	snprintf(job_rec, sizeof(job_rec), JOB_FORMAT,
		 (unsigned long) job_ptr->job_id, usr_str,
		 (unsigned long) job_ptr->user_id, grp_str,
		 (unsigned long) job_ptr->group_id, job_ptr->name,
		 state_string, job_ptr->partition, lim_str, start_str,
		 end_str, job_ptr->nodes, job_ptr->node_cnt,
		 job_ptr->total_cpus, work_dir, resv_name, gres, account, qos,
		 wckey, cluster, submit_time, eligible_time, array_id, pack_id,
		 job_ptr->derived_ec, job_ptr->exit_code, select_buf);
	tot_size = strlen(job_rec);

	while (offset < tot_size) {
		wrote = write(job_comp_fd, job_rec + offset,
			tot_size - offset);
		if (wrote == -1) {
			if (errno == EAGAIN)
				continue;
			else {
				plugin_errno = errno;
				rc = SLURM_ERROR;
				break;
			}
		}
		offset += wrote;
	}
	slurm_mutex_unlock( &file_lock );
	return rc;
}