int p_mpi_hook_slurmstepd_task (const mpi_plugin_task_info_t *job, char ***env) { int i; env_array_overwrite_fmt(env, "PMI_FD", "%u", TASK_PMI_SOCK(job->ltaskid)); env_array_overwrite_fmt(env, "PMI_JOBID", "%s", job_info.pmi_jobid); env_array_overwrite_fmt(env, "PMI_RANK", "%u", job->gtaskid); env_array_overwrite_fmt(env, "PMI_SIZE", "%u", job->ntasks); if (job_info.spawn_seq) { /* PMI1.1 needs this env-var */ env_array_overwrite_fmt(env, "PMI_SPAWNED", "%u", 1); } /* close unused sockets in task */ close(tree_sock); tree_sock = 0; for (i = 0; i < job->ltasks; i ++) { close(STEPD_PMI_SOCK(i)); STEPD_PMI_SOCK(i) = 0; if (i != job->ltaskid) { close(TASK_PMI_SOCK(i)); TASK_PMI_SOCK(i) = 0; } } return SLURM_SUCCESS; }
/* * main loop of agent thread */ static void * _agent(void * unused) { eio_handle_t *pmi2_handle; eio_obj_t *tree_listen_obj, *task_obj; int i; pmi2_handle = eio_handle_create(); //fd_set_nonblocking(tree_sock); tree_listen_obj = eio_obj_create(tree_sock, &tree_listen_ops, (void *)(-1)); eio_new_initial_obj(pmi2_handle, tree_listen_obj); /* for stepd, add the sockets to tasks */ if (in_stepd()) { for (i = 0; i < job_info.ltasks; i ++) { task_obj = eio_obj_create(STEPD_PMI_SOCK(i), &task_ops, (void*)(long)(i)); eio_new_initial_obj(pmi2_handle, task_obj); } initialized = xmalloc(job_info.ltasks * sizeof(int)); finalized = xmalloc(job_info.ltasks * sizeof(int)); } eio_handle_mainloop(pmi2_handle); debug("mpi/pmi2: agent thread exit"); eio_handle_destroy(pmi2_handle); return NULL; }
/* send fence_resp/barrier_out to tasks */ extern int send_kvs_fence_resp_to_clients(int rc, char *errmsg) { int i = 0; client_resp_t *resp; char *msg; resp = client_resp_new(); if ( is_pmi11() ) { if (rc != 0 && errmsg != NULL) { // XXX: pmi1.1 does not check the rc msg = _str_replace(errmsg, ' ', '_'); client_resp_append(resp, CMD_KEY"="BARRIEROUT_CMD" " RC_KEY"=%d "MSG_KEY"=%s\n", rc, msg); xfree(msg); } else { client_resp_append(resp, CMD_KEY"="BARRIEROUT_CMD" " RC_KEY"=%d\n", rc); } } else if (is_pmi20()) { if (rc != 0 && errmsg != NULL) { // TODO: pmi2.0 accept escaped ';' (";;") msg = _str_replace(errmsg, ';', '_'); client_resp_append(resp, CMD_KEY"="KVSFENCERESP_CMD";" RC_KEY"=%d;"ERRMSG_KEY"=%s;", rc, msg); xfree(msg); } else { client_resp_append(resp, CMD_KEY"="KVSFENCERESP_CMD";" RC_KEY"=%d;", rc); } } for (i = 0; i < job_info.ltasks; i ++) { rc = client_resp_send(resp, STEPD_PMI_SOCK(i)); } client_resp_free(resp); return rc; }
/* ring_out messages come in from our parent, * we process this and send ring_out messages to each of our children: * count - starting rank for our leftmost application process * left - left value for leftmost application process in our subtree * right - right value for rightmost application process in our subtree */ int pmix_ring_out(int count, char* left, char* right) { int rc = SLURM_SUCCESS; debug3("mpi/pmi2: in pmix_ring_out rank=%d count=%d left=%s right=%s", pmix_stepd_rank, count, left, right); /* our parent will send us a pmix_ring_out message, the count value * contained in this message will be the rank of the first process * in our subtree, the left value will be the left value for the * first process in the subtree, and the right value will be the * right value for the last process in our subtree */ /* allocate a structure to compute values to send to each child */ pmix_ring_msg* outmsgs = (pmix_ring_msg*) xmalloc(pmix_ring_children * sizeof(pmix_ring_msg)); /* initialize messages to all children */ int i; for (i = 0; i < pmix_ring_children; i++) { outmsgs[i].count = 0; outmsgs[i].left = NULL; outmsgs[i].right = NULL; } /* iterate over all msgs and set count and left neighbor */ for (i = 0; i < pmix_ring_children; i++) { /* store current count in output message */ outmsgs[i].count = count; /* add count for this child to our running total */ count += pmix_ring_msgs[i].count; /* set left value for this child */ outmsgs[i].left = left; /* get right value from child, if it exists, * it will be the left neighbor of the next child, * otherwise, reuse the current left value */ char* next = pmix_ring_msgs[i].right; if (next != NULL) { left = next; } } /* now set all right values (iterate backwards through children) */ for (i = (pmix_ring_children - 1); i >= 0; i--) { /* set right value for this child */ outmsgs[i].right = right; /* get left value from child, if it exists, * it will be the right neighbor of the next child, * otherwise, reuse the current right value */ char* next = pmix_ring_msgs[i].left; if (next != NULL) { right = next; } } /* send messages to children in stepd tree, * we do this first to get the message down the tree quickly */ for (i = 0; i < pmix_stepd_children; i++) { /* get pointer to message data for this child */ int ring_id = pmix_app_children + i; pmix_ring_msg* msg = &outmsgs[ring_id]; /* TODO: do we need hton translation? */ /* construct message */ Buf buf = init_buf(1024); pack16(TREE_CMD_RING_RESP, buf); /* specify message type (RING_OUT) */ pack32((uint32_t) msg->count, buf); /* send count value */ packstr(msg->left, buf); /* send left value */ packstr(msg->right, buf); /* send right value */ /* get global rank of our i-th child stepd */ int rank = pmix_stepd_rank_child(i); debug3("mpi/pmi2: rank=%d sending RING_OUT to rank=%d count=%d left=%s right=%s", pmix_stepd_rank, rank, msg->count, msg->left, msg->right); /* send message to child */ rc = pmix_stepd_send(get_buf_data(buf), (uint32_t) size_buf(buf), rank); /* TODO: use tmp_rc here to catch any failure */ /* free message */ free_buf(buf); } /* now send messages to children app procs, * and set their state back to normal */ for (i = 0; i < pmix_app_children; i++) { /* get pointer to message data for this child */ pmix_ring_msg* msg = &outmsgs[i]; /* TODO: want to catch send failure here? */ /* construct message and send to client */ client_resp_t *resp = client_resp_new(); client_resp_append(resp, "%s=%s;%s=%d;%s=%d;%s=%s;%s=%s;", CMD_KEY, RINGRESP_CMD, RC_KEY, 0, RING_COUNT_KEY, msg->count, RING_LEFT_KEY, msg->left, RING_RIGHT_KEY, msg->right); client_resp_send(resp, STEPD_PMI_SOCK(i)); client_resp_free(resp); } /* delete messages, note that we don't need to free * left and right strings in each message since they * are pointers to strings allocated in pmix_ring_msgs */ xfree(outmsgs); /* clear the pmix_ring_in messages for next ring operation */ for (i = 0; i < pmix_ring_children; i++) { pmix_ring_msg* msg = &pmix_ring_msgs[i]; msg->count = 0; if (msg->left != NULL) { xfree(msg->left); msg->left = NULL; } if (msg->right != NULL) { xfree(msg->right); msg->right = NULL; } } /* reset our ring count */ pmix_ring_count = 0; debug3("mpi/pmi2: out pmix_ring_out"); return rc; }