static void * _task_launch_detection(void *unused) { spawn_resp_t *resp; time_t start; int rc = 0; /* * mpir_init() is called in plugins/launch/slurm/launch_slurm.c before * mpi_hook_client_prelaunch() is called in api/step_launch.c */ start = time(NULL); while (_tasks_launched() == 0) { usleep(1000*50); if (time(NULL) - start > 600) { rc = 1; break; } } /* send a resp to spawner srun */ resp = spawn_resp_new(); resp->seq = job_info.spawn_seq; resp->jobid = xstrdup(job_info.pmi_jobid); resp->error_cnt = 0; /* TODO */ resp->rc = rc; resp->pmi_port = tree_info.pmi_port; spawn_resp_send_to_srun(resp); spawn_resp_free(resp); return NULL; }
/*
 * Unpack a spawn response from a buffer.
 *
 * Fields are read in this order: seq, rc, pmi_port, jobid, error_cnt,
 * followed by error_cnt error codes.
 *
 * OUT resp_ptr - on success, set to the newly allocated response (release
 *                it with spawn_resp_free()); on failure, set to NULL so the
 *                caller never sees a stale or indeterminate pointer
 * IN  buf      - buffer to unpack from
 * RET SLURM_SUCCESS, or SLURM_ERROR if unpacking failed
 */
extern int
spawn_resp_unpack(spawn_resp_t **resp_ptr, Buf buf)
{
	spawn_resp_t *resp;
	uint32_t temp32;
	uint32_t i;	/* same type as error_cnt: no signed/unsigned compare */

	resp = xmalloc(sizeof(spawn_resp_t));

	/*
	 * The safe_unpack*() macros are expected to jump to unpack_error
	 * when the buffer cannot satisfy the read.
	 */
	safe_unpack32(&resp->seq, buf);
	safe_unpack32((uint32_t *)&resp->rc, buf);
	safe_unpack16((uint16_t *)&resp->pmi_port, buf);
	safe_unpackstr_xmalloc(&resp->jobid, &temp32, buf);
	safe_unpack32(&resp->error_cnt, buf);
	if (resp->error_cnt > 0) {
		resp->error_codes = xmalloc(resp->error_cnt * sizeof(int));
		for (i = 0; i < resp->error_cnt; i++) {
			safe_unpack32((uint32_t *)&(resp->error_codes[i]),
				      buf);
		}
	}
	*resp_ptr = resp;
	return SLURM_SUCCESS;

unpack_error:
	spawn_resp_free(resp);
	/* do not leave the caller's pointer untouched on failure */
	*resp_ptr = NULL;
	return SLURM_ERROR;
}
/*
 * Build the environment for a spawned srun and exec it.
 *
 * The environment is a copy of job_info.job_env with the job id, the PMI2
 * spawner/job identifiers, the spawn sequence number, the spawner port and
 * the pre-put key/value pairs added. The exec helpers do not return on
 * success; if one does return, a response carrying its return code is sent
 * back through spawn_resp_send_to_srun() and the process exits.
 *
 * This function never returns: it always ends in exit(). The exit status is
 * the errno observed right after the exec helper returned, or EXIT_FAILURE
 * if errno was 0 at that point.
 *
 * IN req - spawn request describing what to launch
 */
static void
_setup_exec_srun(spawn_req_t *req)
{
	char **env, env_key[32];
	int i, rc, exec_errno;
	spawn_resp_t *resp;

	debug3("mpi/pmi2: in _setup_exec_srun");

	/* setup environments */
	env = env_array_copy((const char **)job_info.job_env);
	/* TODO: unset some env-vars */

	env_array_overwrite_fmt(&env, "SLURM_JOB_ID", "%u", job_info.jobid);
	env_array_overwrite_fmt(&env, PMI2_SPAWNER_JOBID_ENV, "%s",
				job_info.pmi_jobid);
	env_array_overwrite_fmt(&env, PMI2_PMI_JOBID_ENV, "%s-%u",
				job_info.pmi_jobid, req->seq);
	env_array_overwrite_fmt(&env, PMI2_SPAWN_SEQ_ENV, "%u", req->seq);
	env_array_overwrite_fmt(&env, PMI2_SPAWNER_PORT_ENV, "%hu",
				tree_info.pmi_port);

	/* preput kvs */
	env_array_overwrite_fmt(&env, PMI2_PREPUT_CNT_ENV, "%d",
				req->preput_cnt);
	for (i = 0; i < req->preput_cnt; i++) {
		snprintf(env_key, sizeof(env_key), PMI2_PPKEY_ENV"%d", i);
		env_array_overwrite_fmt(&env, env_key, "%s", req->pp_keys[i]);
		snprintf(env_key, sizeof(env_key), PMI2_PPVAL_ENV"%d", i);
		env_array_overwrite_fmt(&env, env_key, "%s", req->pp_vals[i]);
	}

	if (req->subcmd_cnt == 1) {
		/* no return if success */
		rc = _exec_srun_single(req, env);
	} else {
		/* no return if success */
		rc = _exec_srun_multiple(req, env);
	}
	/*
	 * Capture errno now: the allocation, address setup and send calls
	 * below may overwrite it before we reach exit().
	 */
	exec_errno = errno;

	/* only reached when the exec helper returned, i.e. on failure */
	resp = spawn_resp_new();
	resp->seq = req->seq;
	xstrfmtcat(resp->jobid, "%s-%u", job_info.pmi_jobid, req->seq);
	resp->error_cnt = 0;
	resp->rc = rc;

	/* fake a srun address */
	tree_info.srun_addr = xmalloc(sizeof(slurm_addr_t));
	slurm_set_addr(tree_info.srun_addr, tree_info.pmi_port,
		       "127.0.0.1");
	spawn_resp_send_to_srun(resp);
	spawn_resp_free(resp);

	/* never report success from this failure path */
	exit(exec_errno ? exec_errno : EXIT_FAILURE);
}
/*
 * Handle one "mcmd" message of a multi-part spawn request from a task.
 *
 * Each message carries one spawn sub-command plus its position
 * (SPAWNSSOFAR_KEY). The message numbered 1 also carries the total number of
 * sub-commands and the pre-put key/value pairs, and starts a new request in
 * pmi1_spawn. When the last sub-command has arrived the whole request is
 * sent to srun. On failure a spawn response with the error code is written
 * to fd; on success the pending request is queued with spawn_psr_enqueue()
 * and the accumulated request is released.
 *
 * The counts and the position come from the client and are validated before
 * they are used to size or index arrays.
 *
 * IN fd    - descriptor the failure response is written to; also recorded
 *            with the pending spawn request
 * IN lrank - recorded with the pending spawn request
 * IN req   - client request to parse
 * RET SLURM_SUCCESS, or an error code if the message is malformed or the
 *     spawn failed
 */
static int
_handle_mcmd(int fd, int lrank, client_req_t *req)
{
	spawn_subcmd_t *subcmd = NULL;
	spawn_resp_t *spawn_resp = NULL;
	client_resp_t *task_resp = NULL;
	int spawnssofar = 0, totspawns = 0, preputnum = 0;
	int rc = SLURM_SUCCESS, failed_rc, i;
	char buf[64];

	debug3("mpi/pmi2: in _handle_mcmd");

	client_req_parse_body(req);
	subcmd = client_req_parse_spawn_subcmd(req);

	debug3("mpi/pmi2: got subcmd");

	client_req_get_int(req, SPAWNSSOFAR_KEY, &spawnssofar);
	if (spawnssofar == 1) {
		/*
		 * Read the counts into locals first so that they can be
		 * checked before any allocation is sized from them.
		 */
		client_req_get_int(req, TOTSPAWNS_KEY, &totspawns);
		client_req_get_int(req, PREPUTNUM_KEY, &preputnum);
		if ((totspawns < 1) || (preputnum < 0)) {
			/*
			 * NOTE(review): subcmd is not released on this path;
			 * no sub-command destructor is visible in this part
			 * of the file -- TODO free it here.
			 */
			error("mpi/pmi2: invalid spawn counts in mcmd");
			rc = SLURM_ERROR;
			goto out;
		}

		/*
		 * NOTE(review): a previous, still incomplete request in
		 * pmi1_spawn is overwritten here, as before -- confirm
		 * whether it should be freed first.
		 */
		pmi1_spawn = spawn_req_new();
		pmi1_spawn->subcmd_cnt = totspawns;
		pmi1_spawn->subcmds = xmalloc(pmi1_spawn->subcmd_cnt *
					      sizeof(spawn_subcmd_t *));
		pmi1_spawn->preput_cnt = preputnum;
		pmi1_spawn->pp_keys = xmalloc(pmi1_spawn->preput_cnt *
					      sizeof(char *));
		pmi1_spawn->pp_vals = xmalloc(pmi1_spawn->preput_cnt *
					      sizeof(char *));
		for (i = 0; i < preputnum; i++) {
			snprintf(buf, sizeof(buf), PREPUTKEY_KEY"%d", i);
			client_req_get_str(req, buf, &pmi1_spawn->pp_keys[i]);
			snprintf(buf, sizeof(buf), PREPUTVAL_KEY"%d", i);
			client_req_get_str(req, buf, &pmi1_spawn->pp_vals[i]);
		}
	}

	/*
	 * Reject a sub-command that arrives before message 1 or whose
	 * position lies outside the subcmds array (a missing key leaves
	 * spawnssofar at 0, which would index element -1).
	 */
	if ((pmi1_spawn == NULL) || (spawnssofar < 1) ||
	    (spawnssofar > (int) pmi1_spawn->subcmd_cnt)) {
		/* NOTE(review): subcmd is not released here either -- TODO */
		error("mpi/pmi2: unexpected or out-of-range mcmd");
		rc = SLURM_ERROR;
		goto out;
	}
	pmi1_spawn->subcmds[spawnssofar - 1] = subcmd;

	if (spawnssofar == pmi1_spawn->subcmd_cnt) {
		debug3("mpi/pmi2: got whole spawn req");
		/* a resp will be send back from srun.
		   this will not be forwarded to the tasks */
		rc = spawn_req_send_to_srun(pmi1_spawn, &spawn_resp);
		if ((rc != SLURM_SUCCESS) || (spawn_resp == NULL) ||
		    (spawn_resp->rc != SLURM_SUCCESS)) {
			/*
			 * Report the send error if there was one, otherwise
			 * the code srun put in its response.
			 */
			if (rc != SLURM_SUCCESS)
				failed_rc = rc;
			else if (spawn_resp != NULL)
				failed_rc = spawn_resp->rc;
			else
				failed_rc = SLURM_ERROR;

			task_resp = client_resp_new();
			client_resp_append(task_resp, CMD_KEY"="SPAWNRESP_CMD";"
					   RC_KEY"=%d;"
					   ERRMSG_KEY"=spawn failed;",
					   failed_rc);
			client_resp_send(task_resp, fd);
			client_resp_free(task_resp);

			if (spawn_resp != NULL)
				spawn_resp_free(spawn_resp);
			spawn_req_free(pmi1_spawn);
			pmi1_spawn = NULL;
			error("mpi/pmi2: spawn failed");
			rc = SLURM_ERROR;
			goto out;
		}

		debug("mpi/pmi2: spawn request sent to srun");
		spawn_psr_enqueue(spawn_resp->seq, fd, lrank, NULL);

		spawn_resp_free(spawn_resp);
		spawn_req_free(pmi1_spawn);
		pmi1_spawn = NULL;
	}
out:
	debug3("mpi/pmi2: out _handle_mcmd");
	return rc;
}