예제 #1
0
파일: setup.c 프로젝트: Cray/slurm
/*
 * Thread body: wait for the tasks to be launched, then report the outcome
 * to the spawner srun.
 *
 * Polls _tasks_launched() every 50 ms and gives up once more than 600
 * seconds have elapsed. The response carries rc = 0 when the tasks were
 * seen as launched and rc = 1 on timeout.
 *
 * IN unused - not referenced
 * RET always NULL
 */
static void *
_task_launch_detection(void *unused)
{
	spawn_resp_t *reply;
	time_t began;
	int timed_out = 0;

	/*
	 * mpir_init() is called in plugins/launch/slurm/launch_slurm.c before
	 * mpi_hook_client_prelaunch() is called in api/step_launch.c
	 */
	began = time(NULL);
	for (;;) {
		if (_tasks_launched() != 0)
			break;
		usleep(1000*50);
		if (time(NULL) - began > 600) {
			timed_out = 1;
			break;
		}
	}

	/* build the response for the spawner srun and send it */
	reply = spawn_resp_new();
	reply->rc = timed_out;
	reply->error_cnt = 0;	/* TODO */
	reply->seq = job_info.spawn_seq;
	reply->pmi_port = tree_info.pmi_port;
	reply->jobid = xstrdup(job_info.pmi_jobid);

	spawn_resp_send_to_srun(reply);
	spawn_resp_free(reply);

	return NULL;
}
예제 #2
0
파일: spawn.c 프로젝트: corburn/slurm
/*
 * Unpack a spawn response from buf into a newly allocated spawn_resp_t.
 *
 * Fields are read in this order: seq, rc, pmi_port, jobid, error_cnt,
 * followed by error_cnt error codes.
 *
 * OUT resp_ptr - the unpacked response on success; NULL on failure
 * IN buf - buffer to read from
 * RET SLURM_SUCCESS, or SLURM_ERROR when the unpack_error path is taken
 *     (the partially built response is released with spawn_resp_free())
 */
extern int
spawn_resp_unpack(spawn_resp_t **resp_ptr, Buf buf)
{
	spawn_resp_t *resp = NULL;
	uint32_t temp32;
	uint32_t i;	/* same type as error_cnt: no signed/unsigned compare */

	/* never leave the caller holding a stale pointer on failure */
	*resp_ptr = NULL;

	resp = xmalloc(sizeof(spawn_resp_t));

	/* NOTE(review): safe_unpack* presumably jump to unpack_error on failure */
	safe_unpack32(&resp->seq, buf);
	safe_unpack32((uint32_t *)&resp->rc, buf);
	safe_unpack16((uint16_t *)&resp->pmi_port, buf);
	safe_unpackstr_xmalloc(&resp->jobid, &temp32, buf);
	safe_unpack32(&resp->error_cnt, buf);
	if (resp->error_cnt > 0) {
		/*
		 * error_cnt was just read from buf; refuse a count whose
		 * byte size would wrap around (possible when size_t is
		 * 32 bits wide).
		 */
		if (resp->error_cnt > SIZE_MAX / sizeof(int))
			goto unpack_error;
		resp->error_codes = xmalloc(resp->error_cnt * sizeof(int));
		for (i = 0; i < resp->error_cnt; i++) {
			safe_unpack32((uint32_t *)&(resp->error_codes[i]), buf);
		}
	}
	*resp_ptr = resp;
	return SLURM_SUCCESS;

unpack_error:
	spawn_resp_free(resp);
	return SLURM_ERROR;
}
예제 #3
0
파일: spawn.c 프로젝트: corburn/slurm
/*
 * Build the environment for a spawned srun and exec it.
 *
 * The environment is a copy of job_info.job_env with the job id, the PMI2
 * spawner/job ids, the spawn sequence number, the spawner port and the
 * preput key/value pairs overwritten. One exec helper is chosen depending
 * on whether the request holds exactly one sub-command.
 *
 * The code after the exec helpers is reached only when they return (the
 * helpers do not return on success). In that case a response carrying the
 * helper's return code is sent to the srun at 127.0.0.1:tree_info.pmi_port
 * and the process exits.
 *
 * IN req - spawn request to execute
 * Does not return: always ends in exit().
 */
static void
_setup_exec_srun(spawn_req_t *req)
{
	char **env, env_key[32];
	int i, rc, exec_errno;
	spawn_resp_t *resp;

	debug3("mpi/pmi2: in _setup_exec_srun");

	/* setup environments */
	env = env_array_copy((const char **)job_info.job_env);
	/* TODO: unset some env-vars */

	env_array_overwrite_fmt(&env, "SLURM_JOB_ID", "%u", job_info.jobid);
	env_array_overwrite_fmt(&env, PMI2_SPAWNER_JOBID_ENV, "%s",
				job_info.pmi_jobid);
	env_array_overwrite_fmt(&env, PMI2_PMI_JOBID_ENV, "%s-%u",
				job_info.pmi_jobid, req->seq);
	env_array_overwrite_fmt(&env, PMI2_SPAWN_SEQ_ENV, "%u", req->seq);
	env_array_overwrite_fmt(&env, PMI2_SPAWNER_PORT_ENV, "%hu",
				tree_info.pmi_port);
	/* preput kvs */
	env_array_overwrite_fmt(&env, PMI2_PREPUT_CNT_ENV, "%d",
				req->preput_cnt);
	for (i = 0; i < req->preput_cnt; i ++) {
		/* size taken from the array so the bound cannot drift */
		snprintf(env_key, sizeof(env_key), PMI2_PPKEY_ENV"%d", i);
		env_array_overwrite_fmt(&env, env_key, "%s", req->pp_keys[i]);
		snprintf(env_key, sizeof(env_key), PMI2_PPVAL_ENV"%d", i);
		env_array_overwrite_fmt(&env, env_key, "%s", req->pp_vals[i]);
	}

	if (req->subcmd_cnt == 1) {
		/* no return if success */
		rc = _exec_srun_single(req, env);
	} else {
		/* no return if success */
		rc = _exec_srun_multiple(req, env);
	}
	/*
	 * Capture errno now: the allocation, address setup and send below
	 * may overwrite it before it is used as the exit status.
	 */
	exec_errno = errno;

	resp = spawn_resp_new();
	resp->seq = req->seq;
	xstrfmtcat(resp->jobid, "%s-%u", job_info.pmi_jobid, req->seq);
	resp->error_cnt = 0;
	resp->rc = rc;

	/* fake a srun address */
	tree_info.srun_addr = xmalloc(sizeof(slurm_addr_t));
	slurm_set_addr(tree_info.srun_addr, tree_info.pmi_port,
		       "127.0.0.1");
	spawn_resp_send_to_srun(resp);
	spawn_resp_free(resp);
	exit(exec_errno);
}
예제 #4
0
/*
 * Handle one "mcmd" spawn message from a client task.
 *
 * Each message carries one spawn sub-command plus its 1-based position
 * (SPAWNSSOFAR_KEY). The first message (position 1) also creates the
 * pending request pmi1_spawn and reads the total number of sub-commands
 * and the preput key/value pairs. Once the position equals the total,
 * the whole request is sent to srun:
 *   - on failure a SPAWNRESP_CMD carrying the error code is written to fd;
 *   - on success the pending spawn is queued with spawn_psr_enqueue().
 * In both cases the pending request is released and pmi1_spawn reset.
 *
 * IN fd - descriptor a failure response is written to
 * IN lrank - rank passed on to spawn_psr_enqueue()
 * IN req - client request to parse
 * RET SLURM_ERROR when the position is unusable or the spawn failed,
 *     otherwise the value returned by spawn_req_send_to_srun() (or
 *     SLURM_SUCCESS while sub-commands are still being collected)
 */
static int
_handle_mcmd(int fd, int lrank, client_req_t *req)
{
    spawn_subcmd_t *subcmd = NULL;
    spawn_resp_t *spawn_resp = NULL;
    client_resp_t *task_resp = NULL;
    int spawnssofar = 0, rc = SLURM_SUCCESS, i;
    int resp_rc;
    char buf[64];

    debug3("mpi/pmi2: in _handle_mcmd");

    client_req_parse_body(req);
    subcmd = client_req_parse_spawn_subcmd(req);

    debug3("mpi/pmi2: got subcmd");

    client_req_get_int(req, SPAWNSSOFAR_KEY, &spawnssofar);
    if (spawnssofar == 1) {
        pmi1_spawn = spawn_req_new();
        client_req_get_int(req, TOTSPAWNS_KEY,
                           (int *)&pmi1_spawn->subcmd_cnt);
        pmi1_spawn->subcmds = xmalloc(pmi1_spawn->subcmd_cnt *
                                      sizeof(spawn_subcmd_t *));
        client_req_get_int(req, PREPUTNUM_KEY,
                           (int *)&pmi1_spawn->preput_cnt);
        pmi1_spawn->pp_keys =
            xmalloc(pmi1_spawn->preput_cnt * sizeof(char *));
        pmi1_spawn->pp_vals =
            xmalloc(pmi1_spawn->preput_cnt * sizeof(char *));
        for (i = 0; i < pmi1_spawn->preput_cnt; i ++) {
            snprintf(buf, sizeof(buf), PREPUTKEY_KEY"%d", i);
            client_req_get_str(req, buf, &pmi1_spawn->pp_keys[i]);
            snprintf(buf, sizeof(buf), PREPUTVAL_KEY"%d", i);
            client_req_get_str(req, buf, &pmi1_spawn->pp_vals[i]);
        }
    }

    /*
     * The position comes from the client message. Without a pending
     * request, or with a position outside [1, subcmd_cnt], the store
     * below would dereference NULL or write outside the subcmds array.
     */
    if (pmi1_spawn == NULL || spawnssofar < 1 ||
        spawnssofar > pmi1_spawn->subcmd_cnt) {
        /*
         * NOTE(review): subcmd (and any partially filled pmi1_spawn) is
         * not released here; no sub-command destructor is visible from
         * this function -- confirm and free if one exists.
         */
        error("mpi/pmi2: invalid spawn sub-command sequence");
        rc = SLURM_ERROR;
        goto out;
    }
    pmi1_spawn->subcmds[spawnssofar - 1] = subcmd;

    if (spawnssofar == pmi1_spawn->subcmd_cnt) {
        debug3("mpi/pmi2: got whole spawn req");
        /* a resp will be send back from srun.
           this will not be forwarded to the tasks */
        rc = spawn_req_send_to_srun(pmi1_spawn, &spawn_resp);
        /* no response at all is a failure too; never dereference NULL */
        if (spawn_resp == NULL || spawn_resp->rc != SLURM_SUCCESS) {
            if (spawn_resp != NULL) {
                resp_rc = spawn_resp->rc;
            } else if (rc != SLURM_SUCCESS) {
                resp_rc = rc;
            } else {
                resp_rc = SLURM_ERROR;
            }
            task_resp = client_resp_new();
            client_resp_append(task_resp, CMD_KEY"="SPAWNRESP_CMD";"
                               RC_KEY"=%d;"
                               ERRMSG_KEY"=spawn failed;",
                               resp_rc);
            client_resp_send(task_resp, fd);
            client_resp_free(task_resp);

            if (spawn_resp != NULL) {
                spawn_resp_free(spawn_resp);
            }
            spawn_req_free(pmi1_spawn);
            pmi1_spawn = NULL;
            error("mpi/pmi2: spawn failed");
            rc = SLURM_ERROR;
            goto out;
        }

        debug("mpi/pmi2: spawn request sent to srun");
        spawn_psr_enqueue(spawn_resp->seq, fd, lrank, NULL);

        spawn_resp_free(spawn_resp);
        spawn_req_free(pmi1_spawn);
        pmi1_spawn = NULL;
    }
out:
    debug3("mpi/pmi2: out _handle_mcmd");
    return rc;
}