Ejemplo n.º 1
0
/*
 * Translate the SLURM hostlist expression stored in opt_local->nodelist
 * into the node index (NID) expression built by cray_nodelist2nids().
 *
 * opt_local IN/OUT - reads nodelist, nodes_set_opt and min_nodes;
 *                    min_nodes is overwritten with the number of hosts left
 *                    after hostlist_uniq() and nodes_set is forced to 1.
 * RET the string returned by cray_nodelist2nids(), or NULL when no nodelist
 *     was supplied or it could not be parsed into a hostlist.
 *     NOTE(review): the result is presumably heap-allocated and owned by
 *     the caller (xfree) - confirm against cray_nodelist2nids().
 */
static char *_get_nids(opt_t *opt_local)
{
	hostlist_t hosts;
	char *nid_expr;
	int host_cnt;

	if (opt_local->nodelist == NULL)
		return NULL;

	hosts = hostlist_create(opt_local->nodelist);
	if (hosts == NULL) {
		error("Invalid hostlist: %s", opt_local->nodelist);
		return NULL;
	}

	hostlist_uniq(hosts);
	host_cnt = hostlist_count(hosts);

	/*
	 * aprun needs the node count to match the size of the hostlist
	 * exactly.  If the user explicitly asked for a different count,
	 * report the mismatch; either way the host count wins.
	 */
	if (opt_local->nodes_set_opt && (opt_local->min_nodes != host_cnt)) {
		error("You requested %d nodes and %d hosts.  These numbers "
		      "must be the same, so setting number of nodes to %d",
		      opt_local->min_nodes, host_cnt, host_cnt);
	}
	opt_local->min_nodes = host_cnt;
	opt_local->nodes_set = 1;

	nid_expr = cray_nodelist2nids(hosts, NULL);
	hostlist_destroy(hosts);

	return nid_expr;
}
Ejemplo n.º 2
0
/*
 * Run the Cray node health check cleanup program (xtcleanup_after) for a
 * job reservation or an application (step) and wait for it to complete.
 *
 * nhc_info IN - fields read: apid, exit_code, jobid, nodelist, step.
 * RET on native Cray builds: the wait status filled in by waitpid() (0 when
 *     the program exited cleanly).  The initial value 1 is returned when the
 *     program was never run (no nodelist / no NIDs, or fork() failed) or
 *     when waitpid() failed before storing a status.
 *     On other builds: always 0, after a simulated two second delay.
 */
static int _run_nhc(nhc_info_t *nhc_info)
{
#ifdef HAVE_NATIVE_CRAY
	int argc = 11, status = 1, wait_rc, i = 0;
	char *argv[argc];
	pid_t cpid;
	char *jobid_char = NULL, *apid_char = NULL, *nodelist_nids = NULL,
		*exit_char = NULL;
	DEF_TIMERS;

	START_TIMER;

	/* Numeric arguments must be passed to the program as strings. */
	apid_char = xstrdup_printf("%"PRIu64"", nhc_info->apid);
	exit_char = xstrdup_printf("%u", nhc_info->exit_code);
	jobid_char = xstrdup_printf("%u", nhc_info->jobid);
	nodelist_nids = cray_nodelist2nids(NULL, nhc_info->nodelist);

	/* Exactly argc (11) slots are filled, including the terminator. */
	argv[i++] = "/opt/cray/nodehealth/default/bin/xtcleanup_after";
	argv[i++] = "-a";
	argv[i++] = apid_char;
	argv[i++] = "-e";
	argv[i++] = exit_char;
	argv[i++] = "-r";
	argv[i++] = jobid_char;
	argv[i++] = "-m";
	argv[i++] = nhc_info->step ? "application" : "reservation";
	argv[i++] = nodelist_nids;
	argv[i++] = NULL;

	if (debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("Calling NHC for jobid %u and apid %"PRIu64" "
		     "on nodes %s(%s) exit code %u",
		     nhc_info->jobid, nhc_info->apid,
		     nhc_info->nodelist, nodelist_nids,
		     nhc_info->exit_code);
	}

	if (!nhc_info->nodelist || !nodelist_nids) {
		/* already done */
		goto fini;
	}

	if ((cpid = fork()) < 0) {
		error("_run_nhc fork error: %m");
		goto fini;
	}
	if (cpid == 0) {
		/*
		 * Child: become a process group leader so the parent can
		 * later signal the whole group with killpg().
		 */
#ifdef SETPGRP_TWO_ARGS
		setpgrp(0, 0);
#else
		setpgrp();
#endif
		execvp(argv[0], argv);
		/*
		 * Only reached when exec failed.  Use _exit() rather than
		 * exit(): this is a forked copy of the parent, so its
		 * atexit() handlers must not run and its inherited stdio
		 * buffers must not be flushed a second time.
		 */
		_exit(127);
	}

	/* Parent: wait for the child, retrying if interrupted by a signal. */
	while (1) {
		wait_rc = waitpid(cpid, &status, 0);
		if (wait_rc < 0) {
			if (errno == EINTR)
				continue;
			error("_run_nhc waitpid error: %m");
			break;
		} else if (wait_rc > 0) {
			killpg(cpid, SIGKILL);	/* kill children too */
			break;
		}
	}
	END_TIMER;
	if (status != 0) {
		error("_run_nhc jobid %u and apid %"PRIu64" exit "
		      "status %u:%u took: %s",
		      nhc_info->jobid, nhc_info->apid, WEXITSTATUS(status),
		      WTERMSIG(status), TIME_STR);
	} else if (debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_run_nhc jobid %u and apid %"PRIu64" completed took: %s",
		     nhc_info->jobid, nhc_info->apid, TIME_STR);

 fini:
	/* Release every string allocated above (unset ones are still NULL). */
	xfree(apid_char);
	xfree(exit_char);
	xfree(jobid_char);
	xfree(nodelist_nids);

	return status;
#else
	if (debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("simluating calling NHC for jobid %u "
		     "and apid %"PRIu64" on nodes %s",
		     nhc_info->jobid, nhc_info->apid, nhc_info->nodelist);

	/* simulate sleeping */
	sleep(2);
	return 0;
#endif
}
Ejemplo n.º 3
0
/*
 * Run the Cray node health check cleanup program (xtcleanup_after) for a
 * job or a step and wait for it to complete.
 *
 * id       IN - identifier passed to the program; sent with "-a" when step
 *               is true and with "-r" otherwise.
 * nodelist IN - node list, converted with cray_nodelist2nids() and passed
 *               as the final argument.
 * step     IN - true when id refers to a step, false when it refers to a job.
 * RET on native Cray builds: the wait status filled in by waitpid() (0 when
 *     the program exited cleanly); the initial value 1 is returned when
 *     fork() failed or waitpid() failed before storing a status.
 *     On other builds: always 0, after a simulated two second delay.
 */
static int _run_nhc(uint64_t id, char *nodelist, bool step)
{
#ifdef HAVE_NATIVE_CRAY
	int argc = 5, status = 1, wait_rc;
	char *argv[argc];
	pid_t cpid;
	DEF_TIMERS;

	START_TIMER;
	argv[0] = "/opt/cray/nodehealth/default/bin/xtcleanup_after";
	if (step)
		argv[1] = "-a";
	else
		argv[1] = "-r";
	/* argv[2] and argv[3] are allocated here and released at fini. */
	argv[2] = xstrdup_printf("%"PRIu64"", id);
	/*
	 * NOTE(review): if cray_nodelist2nids() returns NULL, argv is simply
	 * terminated one slot early and the program runs without a node
	 * list - confirm that this is intended.
	 */
	argv[3] = cray_nodelist2nids(NULL, nodelist);
	argv[4] = NULL;

	if (debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("Calling NHC for id %"PRIu64" on nodes %s(%s)",
		     id, nodelist, argv[3]);

	if ((cpid = fork()) < 0) {
		error("_run_nhc fork error: %m");
		goto fini;
	}
	if (cpid == 0) {
		/*
		 * Child: become a process group leader so the parent can
		 * later signal the whole group with killpg().
		 */
#ifdef SETPGRP_TWO_ARGS
		setpgrp(0, 0);
#else
		setpgrp();
#endif
		execvp(argv[0], argv);
		/*
		 * Only reached when exec failed.  Use _exit() rather than
		 * exit(): this is a forked copy of the parent, so its
		 * atexit() handlers must not run and its inherited stdio
		 * buffers must not be flushed a second time.
		 */
		_exit(127);
	}

	/* Parent: wait for the child, retrying if interrupted by a signal. */
	while (1) {
		wait_rc = waitpid(cpid, &status, 0);
		if (wait_rc < 0) {
			if (errno == EINTR)
				continue;
			error("_run_nhc waitpid error: %m");
			break;
		} else if (wait_rc > 0) {
			killpg(cpid, SIGKILL);	/* kill children too */
			break;
		}
	}
	END_TIMER;
	if (status != 0) {
		error("_run_nhc %s %"PRIu64" exit status %u:%u took: %s",
		      step ? "step" : "job", id,
		      WEXITSTATUS(status), WTERMSIG(status), TIME_STR);
	} else if (debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_run_nhc %s %"PRIu64" completed took: %s",
		     step ? "step" : "job", id, TIME_STR);

 fini:
	xfree(argv[2]);
	xfree(argv[3]);
	return status;
#else
	if (debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("simluating calling NHC for id %"PRIu64" on nodes %s",
		     id, nodelist);

	/* simulate sleeping */
	sleep(2);
	return 0;
#endif
}