Esempio n. 1
0
/*
 * load into the storage a suspention of a job
 */
extern int jobacct_storage_p_suspend(void *db_conn,
				     struct job_record *job_ptr)
{
	char buf[BUFFER_SIZE];
	static time_t	now = 0;
	static time_t	temp = 0;
	int elapsed;
	if (!storage_init) {
		debug("jobacct init was not called or it failed");
		return SLURM_ERROR;
	}

	/* tell what time has passed */
	if (!now)
		now = job_ptr->start_time;
	temp = now;
	now = time(NULL);

	if ((elapsed=now-temp) < 0)
		elapsed=0;	/* For *very* short jobs, if clock is wrong */

	/* here we are really just going for a marker in time to tell when
	 * the process was suspended or resumed (check job state), we don't
	 * really need to keep track of anything else */
	snprintf(buf, BUFFER_SIZE, "%d %d %d",
		 JOB_SUSPEND,
		 elapsed,
		 job_ptr->job_state & JOB_STATE_BASE);/* job status */

	return _print_record(job_ptr, now, buf);
}
Esempio n. 2
0
/*
 * load into the storage the start of a job
 */
extern int jobacct_storage_p_job_start(void *db_conn,
				       struct job_record *job_ptr)
{
	int	rc = SLURM_SUCCESS;
	char	buf[BUFFER_SIZE], *account, *nodes;
	char    *jname = NULL;
	long	priority;
	int track_steps = 0;

	if (!storage_init) {
		debug("jobacct init was not called or it failed");
		return SLURM_ERROR;
	}

	debug2("slurmdb_job_start() called");

	if (job_ptr->start_time == 0) {
		/* This function is called when a job becomes elligible to run
		 * in order to record reserved time (a measure of system
		 * over-subscription). We only use this with database
		 * plugins. */
		return rc;
	}

	priority = (job_ptr->priority == NO_VAL) ?
		   -1L : (long) job_ptr->priority;

	if (job_ptr->name && job_ptr->name[0]) {
		jname = _safe_dup(job_ptr->name);
	} else {
		jname = xstrdup("allocation");
		track_steps = 1;
	}

	account= _safe_dup(job_ptr->account);
	if (job_ptr->nodes && job_ptr->nodes[0])
		nodes = job_ptr->nodes;
	else
		nodes = "(null)";

	if (job_ptr->batch_flag)
		track_steps = 1;

	job_ptr->requid = -1; /* force to -1 for stats to know this
			       * hasn't been set yet */

	snprintf(buf, BUFFER_SIZE,
		 "%d %s %d %ld %u %s %s",
		 JOB_START, jname,
		 track_steps, priority, job_ptr->total_cpus,
		 nodes, account);

	rc = _print_record(job_ptr, job_ptr->start_time, buf);
	xfree(account);
	xfree(jname);
	return rc;
}
/*
 * load into the storage the end of a job
 */
extern int jobacct_storage_p_job_complete(void *db_conn,
					  struct job_record *job_ptr)
{
	char buf[BUFFER_SIZE];
	uint16_t job_state;
	int duration;
	uint32_t exit_code;

	if (!storage_init) {
		debug("jobacct init was not called or it failed");
		return SLURM_ERROR;
	}

	debug2("slurmdb_job_complete() called");
	if (IS_JOB_RESIZING(job_ptr)) {
		job_state = JOB_RESIZING;
		if (job_ptr->resize_time)
			duration = time(NULL) - job_ptr->resize_time;
		else
			duration = time(NULL) - job_ptr->start_time;
	} else {
		if (job_ptr->end_time == 0) {
			debug("jobacct: job %u never started", job_ptr->job_id);
			return SLURM_ERROR;
		}
		job_state = job_ptr->job_state & JOB_STATE_BASE;
		if (job_ptr->resize_time)
			duration = job_ptr->end_time - job_ptr->resize_time;
		else
			duration = job_ptr->end_time - job_ptr->start_time;
	}

	exit_code = job_ptr->exit_code;
	if (exit_code == 1) {
		/* This wasn't signalled, it was set by Slurm so don't
		 * treat it like a signal.
		 */
		exit_code = 256;
	}

	/* leave the requid as a %d since we want to see if it is -1
	   in stats */
	snprintf(buf, BUFFER_SIZE, "%d %d %u %u %u",
		 JOB_TERMINATED, duration,
		 job_state, job_ptr->requid, exit_code);

	return  _print_record(job_ptr, job_ptr->end_time, buf);
}
Esempio n. 4
0
void cdb_print_records(cdb_t *cdb, cdb_request_t *request, FILE *fh, const char *date_format) {

    uint64_t i = 0;
    uint64_t num_recs = 0;

    cdb_record_t *records = NULL;

    if (_cdb_read_records(cdb, request, &num_recs, &records) == CDB_SUCCESS) {

        for (i = 0; i < num_recs; i++) {

            _print_record(fh, records[i].time, records[i].value, date_format);
        }
    }

    free(records);
}
Esempio n. 5
0
void cdb_print_aggregate_records(cdb_t **cdbs, int num_cdbs, cdb_request_t *request, FILE *fh, const char *date_format) {

    uint64_t i = 0;
    uint64_t num_recs = 0;

    cdb_record_t *records = NULL;
    cdb_range_t *range = calloc(1, sizeof(cdb_range_t));

    cdb_read_aggregate_records(cdbs, num_cdbs, request, &num_recs, &records, range);

    for (i = 0; i < num_recs; i++) {

        _print_record(fh, records[i].time, records[i].value, date_format);
    }

    free(range);
    free(records);
}
Esempio n. 6
0
/*
 * load into the storage the end of a job step
 */
extern int jobacct_storage_p_step_complete(void *db_conn,
					   struct step_record *step_ptr)
{
	char buf[BUFFER_SIZE];
	time_t now;
	int elapsed;
	int comp_status;
	int cpus = 0, rc;
	char node_list[BUFFER_SIZE];
	struct jobacctinfo *jobacct = (struct jobacctinfo *)step_ptr->jobacct;
	struct jobacctinfo dummy_jobacct;
#ifdef HAVE_BG
	char *ionodes = NULL;
#endif
	float ave_vsize = 0, ave_rss = 0, ave_pages = 0;
	float ave_cpu = 0;
	uint32_t ave_cpu2 = 0;
	char *account, *step_name;
	uint32_t exit_code;

	if (!storage_init) {
		debug("jobacct init was not called or it failed");
		return SLURM_ERROR;
	}

	now = time(NULL);

	if (jobacct == NULL) {
		/* JobAcctGather=slurmdb_gather/none, no data to process */
		memset(&dummy_jobacct, 0, sizeof(dummy_jobacct));
		jobacct = &dummy_jobacct;
	}

	if ((elapsed=now-step_ptr->start_time)<0)
		elapsed=0;	/* For *very* short jobs, if clock is wrong */

	exit_code = step_ptr->exit_code;
	if (exit_code == NO_VAL) {
		comp_status = JOB_CANCELLED;
		exit_code = 0;
	} else if (exit_code)
		comp_status = JOB_FAILED;
	else
		comp_status = JOB_COMPLETE;

#ifdef HAVE_BG
	if (step_ptr->job_ptr->details)
		cpus = step_ptr->job_ptr->details->min_cpus;
	else
		cpus = step_ptr->job_ptr->cpu_cnt;
	select_g_select_jobinfo_get(step_ptr->job_ptr->select_jobinfo,
			     SELECT_JOBDATA_IONODES,
			     &ionodes);
	if (ionodes) {
		snprintf(node_list, BUFFER_SIZE,
			 "%s[%s]", step_ptr->job_ptr->nodes, ionodes);
		xfree(ionodes);
	} else
		snprintf(node_list, BUFFER_SIZE, "%s",
			 step_ptr->job_ptr->nodes);

#else
	if (!step_ptr->step_layout || !step_ptr->step_layout->task_cnt) {
		cpus = step_ptr->job_ptr->total_cpus;
		snprintf(node_list, BUFFER_SIZE, "%s", step_ptr->job_ptr->nodes);

	} else {
		cpus = step_ptr->step_layout->task_cnt;
		snprintf(node_list, BUFFER_SIZE, "%s",
			 step_ptr->step_layout->node_list);
	}
#endif
	/* figure out the ave of the totals sent */
	if (cpus > 0) {
		ave_vsize = jobacct->tot_vsize;
		ave_vsize /= cpus;
		ave_rss = jobacct->tot_rss;
		ave_rss /= cpus;
		ave_pages = jobacct->tot_pages;
		ave_pages /= cpus;
		ave_cpu = jobacct->tot_cpu;
		ave_cpu /= cpus;
	}

	if (jobacct->min_cpu != (uint32_t)NO_VAL) {
		ave_cpu2 = jobacct->min_cpu;
	}

	account   = _safe_dup(step_ptr->job_ptr->account);
	step_name = _safe_dup(step_ptr->name);

	snprintf(buf, BUFFER_SIZE, _jobstep_format,
		 JOB_STEP,
		 step_ptr->step_id,	/* stepid */
		 comp_status,		/* completion status */
		 exit_code,	/* completion code */
		 cpus,          	/* number of tasks */
		 cpus,                  /* number of cpus */
		 elapsed,	        /* elapsed seconds */
		 /* total cputime seconds */
		 jobacct->user_cpu_sec
		 + jobacct->sys_cpu_sec,
		 /* total cputime seconds */
		 jobacct->user_cpu_usec
		 + jobacct->sys_cpu_usec,
		 jobacct->user_cpu_sec,	/* user seconds */
		 jobacct->user_cpu_usec,/* user microseconds */
		 jobacct->sys_cpu_sec,	/* system seconds */
		 jobacct->sys_cpu_usec,/* system microsecs */
		 0,	/* max rss */
		 0,	/* max ixrss */
		 0,	/* max idrss */
		 0,	/* max isrss */
		 0,	/* max minflt */
		 0,	/* max majflt */
		 0,	/* max nswap */
		 0,	/* total inblock */
		 0,	/* total outblock */
		 0,	/* total msgsnd */
		 0,	/* total msgrcv */
		 0,	/* total nsignals */
		 0,	/* total nvcsw */
		 0,	/* total nivcsw */
		 jobacct->max_vsize,	/* max vsize */
		 jobacct->max_vsize_id.taskid,	/* max vsize node */
		 ave_vsize,	/* ave vsize */
		 jobacct->max_rss,	/* max vsize */
		 jobacct->max_rss_id.taskid,	/* max rss node */
		 ave_rss,	/* ave rss */
		 jobacct->max_pages,	/* max pages */
		 jobacct->max_pages_id.taskid,	/* max pages node */
		 ave_pages,	/* ave pages */
		 ave_cpu2,	/* min cpu */
		 jobacct->min_cpu_id.taskid,	/* min cpu node */
		 ave_cpu,	/* ave cpu */
		 step_name,	/* step exe name */
		 node_list, /* name of nodes step running on */
		 jobacct->max_vsize_id.nodeid,	/* max vsize task */
		 jobacct->max_rss_id.nodeid,	/* max rss task */
		 jobacct->max_pages_id.nodeid,	/* max pages task */
		 jobacct->min_cpu_id.nodeid,	/* min cpu task */
		 account,
		 step_ptr->job_ptr->requid); /* requester user id */

	rc = _print_record(step_ptr->job_ptr, now, buf);
	xfree(account);
	xfree(step_name);
	return rc;
}
Esempio n. 7
0
/*
 * load into the storage the start of a job step
 */
extern int jobacct_storage_p_step_start(void *db_conn,
					struct step_record *step_ptr)
{
	char buf[BUFFER_SIZE];
	int cpus = 0, rc;
	char node_list[BUFFER_SIZE];
#ifdef HAVE_BG
	char *ionodes = NULL;
#endif
	float float_tmp = 0;
	char *account, *step_name;

	if (!storage_init) {
		debug("jobacct init was not called or it failed");
		return SLURM_ERROR;
	}

#ifdef HAVE_BG
	if (step_ptr->job_ptr->details)
		cpus = step_ptr->job_ptr->details->min_cpus;
	else
		cpus = step_ptr->job_ptr->cpu_cnt;
	select_g_select_jobinfo_get(step_ptr->job_ptr->select_jobinfo,
			     SELECT_JOBDATA_IONODES,
			     &ionodes);
	if (ionodes) {
		snprintf(node_list, BUFFER_SIZE,
			 "%s[%s]", step_ptr->job_ptr->nodes, ionodes);
		xfree(ionodes);
	} else
		snprintf(node_list, BUFFER_SIZE, "%s",
			 step_ptr->job_ptr->nodes);

#else
	if (!step_ptr->step_layout || !step_ptr->step_layout->task_cnt) {
		cpus = step_ptr->job_ptr->total_cpus;
		snprintf(node_list, BUFFER_SIZE, "%s", step_ptr->job_ptr->nodes);
	} else {
		cpus = step_ptr->step_layout->task_cnt;
		snprintf(node_list, BUFFER_SIZE, "%s",
			 step_ptr->step_layout->node_list);
	}
#endif
	account   = _safe_dup(step_ptr->job_ptr->account);
	step_name = _safe_dup(step_ptr->name);

	step_ptr->job_ptr->requid = -1; /* force to -1 for stats to know this
				     * hasn't been set yet  */

	snprintf(buf, BUFFER_SIZE, _jobstep_format,
		 JOB_STEP,
		 step_ptr->step_id,	/* stepid */
		 JOB_RUNNING,		/* completion status */
		 0,     		/* completion code */
		 cpus,          	/* number of tasks */
		 cpus,                  /* number of cpus */
		 0,	        	/* elapsed seconds */
		 0,                    /* total cputime seconds */
		 0,    		/* total cputime seconds */
		 0,	/* user seconds */
		 0,	/* user microseconds */
		 0,	/* system seconds */
		 0,	/* system microsecs */
		 0,	/* max rss */
		 0,	/* max ixrss */
		 0,	/* max idrss */
		 0,	/* max isrss */
		 0,	/* max minflt */
		 0,	/* max majflt */
		 0,	/* max nswap */
		 0,	/* total inblock */
		 0,	/* total outblock */
		 0,	/* total msgsnd */
		 0,	/* total msgrcv */
		 0,	/* total nsignals */
		 0,	/* total nvcsw */
		 0,	/* total nivcsw */
		 0,	/* max vsize */
		 0,	/* max vsize task */
		 float_tmp,	/* ave vsize */
		 0,	/* max rss */
		 0,	/* max rss task */
		 float_tmp,	/* ave rss */
		 0,	/* max pages */
		 0,	/* max pages task */
		 float_tmp,	/* ave pages */
		 0,	/* min cpu */
		 0,	/* min cpu task */
		 float_tmp,	/* ave cpu */
		 step_name,	/* step exe name */
		 node_list,     /* name of nodes step running on */
		 0,	/* max vsize node */
		 0,	/* max rss node */
		 0,	/* max pages node */
		 0,	/* min cpu node */
		 account,
		 step_ptr->job_ptr->requid); /* requester user id */

	rc = _print_record(step_ptr->job_ptr, step_ptr->start_time, buf);
	xfree(account);
	xfree(step_name);
	return rc;
}