static void vm_cmd(int status, orte_process_name_t *sender,
                   orcm_pnp_tag_t tag,
                   struct iovec *msg, int count,
                   opal_buffer_t *buffer,
                   void *cbdata)
{
    int rc, n;
    uint16_t jfam;
    orte_process_name_t generator;

    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                         "%s GOT COMMAND FROM %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* if this isn't intended for me, ignore it */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jfam, &n, OPAL_UINT16))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    if (jfam != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                             "%s GOT COMMAND FOR DVM %d - NOT FOR ME!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jfam));
        return;
    }

    ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, buffer,
                       ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);
}
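/* A minimal sketch, grounded only in the framing visible above: whatever
 * transport delivers a command to vm_cmd(), the sender must pack the job
 * family as an OPAL_UINT16 ahead of the payload so the receiver can filter
 * out commands aimed at other DVMs. vm_cmd() itself is a PNP callback whose
 * send API is not shown in this section, so the RML send below is borrowed
 * from the other snippets purely for illustration; the function name is
 * hypothetical. */
static int send_vm_cmd_sketch(orte_process_name_t *target, opal_buffer_t *payload)
{
    int rc;
    uint16_t jfam = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
    opal_buffer_t buf;

    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    /* job family goes first - the receiver unpacks and checks it before
     * looking at anything else */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jfam, 1, OPAL_UINT16))) {
        ORTE_ERROR_LOG(rc);
        goto done;
    }
    /* then the actual command payload - a non-destructive copy */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, payload))) {
        ORTE_ERROR_LOG(rc);
        goto done;
    }
    /* illustrative send - the real path here would use the PNP channel */
    if (0 > (rc = orte_rml.send_buffer(target, &buf, ORTE_RML_TAG_DAEMON, 0))) {
        ORTE_ERROR_LOG(rc);
    } else {
        rc = ORTE_SUCCESS;
    }
done:
    OBJ_DESTRUCT(&buf);
    return rc;
}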
void orte_grpcomm_base_daemon_coll_recv(int status, orte_process_name_t* sender,
                                        opal_buffer_t* buffer, orte_rml_tag_t tag,
                                        void* cbdata)
{
    int rc;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:base:daemon_coll:receive got message from %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* don't process this right away - we need to get out of the recv before
     * we process the message as it may ask us to do something that involves
     * more messaging! Instead, setup an event so that the message gets processed
     * as soon as we leave the recv.
     *
     * The macro makes a copy of the buffer, which is released once the message
     * has been processed - the incoming buffer, however, is NOT released here,
     * although its payload IS transferred to the message buffer for later
     * processing
     */
    ORTE_MESSAGE_EVENT(sender, buffer, tag, process_msg);

    /* reissue the recv */
    if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                      ORTE_RML_TAG_DAEMON_COLLECTIVE,
                                                      ORTE_RML_NON_PERSISTENT,
                                                      orte_grpcomm_base_daemon_coll_recv,
                                                      cbdata))) {
        ORTE_ERROR_LOG(rc);
    }
    return;
}
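/* For context, a minimal sketch of how this recv is presumably first posted
 * during grpcomm startup - the registration below simply mirrors the reissue
 * at the bottom of the handler; the enclosing function name is hypothetical. */
static int daemon_coll_recv_start_sketch(void)
{
    int rc;
    /* post the initial non-persistent recv; the handler re-posts it
     * each time it fires */
    if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                      ORTE_RML_TAG_DAEMON_COLLECTIVE,
                                                      ORTE_RML_NON_PERSISTENT,
                                                      orte_grpcomm_base_daemon_coll_recv,
                                                      NULL))) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}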
static void orte_rml_base_recv(int status, orte_process_name_t* sender,
                               opal_buffer_t* buffer, orte_rml_tag_t tag,
                               void* cbdata)
{
    int rc;

    /* don't process this right away - we need to get out of the recv before
     * we process the message as it may ask us to do something that involves
     * more messaging! Instead, setup an event so that the message gets processed
     * as soon as we leave the recv.
     *
     * The macro makes a copy of the buffer, which is released once the message
     * has been processed - the incoming buffer, however, is NOT released here,
     * although its payload IS transferred to the message buffer for later
     * processing
     */
    ORTE_MESSAGE_EVENT(sender, buffer, tag, process_message);

    /* reissue the recv */
    if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                      ORTE_RML_TAG_RML_INFO_UPDATE,
                                                      ORTE_RML_NON_PERSISTENT,
                                                      orte_rml_base_recv,
                                                      NULL))) {
        ORTE_ERROR_LOG(rc);
    }
}
static void sstore_central_local_recv(int status, orte_process_name_t* sender,
                                      opal_buffer_t* buffer, orte_rml_tag_t tag,
                                      void* cbdata)
{
    if (ORTE_RML_TAG_SSTORE_INTERNAL != tag) {
        return;
    }

    ORTE_MESSAGE_EVENT(sender, buffer, tag, orte_sstore_central_local_process_cmd);
    return;
}
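/* Unlike the grpcomm and RML handlers above, this handler never reissues its
 * recv, which suggests it was registered as a persistent recv. A minimal
 * sketch of that registration, assuming an ORTE_RML_PERSISTENT flag as the
 * counterpart of the ORTE_RML_NON_PERSISTENT flag used elsewhere in this
 * section; the enclosing function name is hypothetical. */
static int sstore_central_local_recv_start_sketch(void)
{
    int rc;
    /* persistent: the RML keeps this recv posted across deliveries, so the
     * handler above does not need to re-post it */
    if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                      ORTE_RML_TAG_SSTORE_INTERNAL,
                                                      ORTE_RML_PERSISTENT,
                                                      sstore_central_local_recv,
                                                      NULL))) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}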
static void recv_ack(int status, orte_process_name_t* sender,
                     opal_buffer_t* buffer, orte_rml_tag_t tag,
                     void* cbdata)
{
    /* don't process this right away - we need to get out of the recv before
     * we process the message as it may ask us to do something that involves
     * more messaging! Instead, setup an event so that the message gets processed
     * as soon as we leave the recv.
     *
     * The macro makes a copy of the buffer, which is released once the message
     * has been processed - the incoming buffer, however, is NOT released here,
     * although its payload IS transferred to the message buffer for later
     * processing
     */
    ORTE_MESSAGE_EVENT(sender, buffer, tag, release_ack);
}
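/* A sketch of what a callback handed to ORTE_MESSAGE_EVENT (release_ack
 * above, process_msg and friends elsewhere) presumably looks like. It assumes
 * the macro wraps the copied buffer in an orte_message_event_t carrying
 * sender/buffer/tag and fires it through a zero-time, libevent-style
 * callback; those structural details and the handler body are assumptions,
 * not taken from this section. */
static void release_ack_sketch(int fd, short event, void *data)
{
    orte_message_event_t *mev = (orte_message_event_t*)data;

    /* safe to message other procs from here - we are outside the recv;
     * act on mev->sender, mev->buffer, and mev->tag as needed */

    /* release the event, which also releases the copied buffer */
    OBJ_RELEASE(mev);
}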
static int xcast(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag)
{
    int rc = ORTE_SUCCESS;
    opal_buffer_t buf;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:xcast sent to job %s tag %ld",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job), (long)tag));

    /* if there is no message to send, then just return ok */
    if (NULL == buffer) {
        return ORTE_SUCCESS;
    }

    /* prep the output buffer */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_app_pack_xcast(ORTE_DAEMON_PROCESS_AND_RELAY_CMD,
                                                               job, &buf, buffer, tag))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* if I am the HNP, just set things up so the cmd processor gets called.
     * We don't want to message ourselves as this can create circular logic
     * in the RML. Instead, this macro will set a zero-time event which will
     * cause the buffer to be processed by the cmd processor - probably will
     * fire right away, but that's okay
     * The macro makes a copy of the buffer, so it's okay to release it here
     */
    if (ORTE_PROC_IS_HNP) {
        ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &buf, ORTE_RML_TAG_DAEMON,
                           orte_daemon_cmd_processor);
    } else {
        /* otherwise, send it to the HNP for relay */
        if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf,
                                           ORTE_RML_TAG_DAEMON, 0))) {
            ORTE_ERROR_LOG(rc);
            goto CLEANUP;
        }
        rc = ORTE_SUCCESS;
    }

CLEANUP:
    OBJ_DESTRUCT(&buf);
    return rc;
}
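/* A hedged usage sketch: how a caller might drive the xcast above through
 * the grpcomm module interface. Treating orte_grpcomm.xcast as the public
 * entry point is an assumption based on the naming in this section, and the
 * payload is illustrative only. */
static int xcast_usage_sketch(orte_jobid_t job)
{
    int rc;
    opal_buffer_t buf;

    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    /* ... pack whatever every daemon in the job should receive ... */
    if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(job, &buf, ORTE_RML_TAG_DAEMON))) {
        ORTE_ERROR_LOG(rc);
    }
    /* xcast copies the payload non-destructively, so the caller still
     * owns - and must destruct - its own buffer */
    OBJ_DESTRUCT(&buf);
    return rc;
}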
void orte_grpcomm_base_coll_recv(int status, orte_process_name_t* sender,
                                 opal_buffer_t* buffer, orte_rml_tag_t tag,
                                 void* cbdata)
{
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:coll:receive got message from %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* don't process this right away - we need to get out of the recv before
     * we process the message as it may ask us to do something that involves
     * more messaging! Instead, setup an event so that the message gets processed
     * as soon as we leave the recv.
     *
     * The macro makes a copy of the buffer, which is released once the message
     * has been processed - the incoming buffer, however, is NOT released here,
     * although its payload IS transferred to the message buffer for later
     * processing
     */
    ORTE_MESSAGE_EVENT(sender, buffer, tag, process_coll_msg);
    return;
}
int orte_global_comm(orte_process_name_t *recipient,
                     opal_buffer_t *buf, orte_rml_tag_t tag,
                     orte_default_cbfunc_t cbfunc)
{
    int ret;
    orte_ns_cmp_bitmask_t mask;

    mask = ORTE_NS_CMP_ALL;

    if (OPAL_EQUAL == orte_util_compare_name_fields(mask, recipient, ORTE_PROC_MY_NAME) &&
        NULL != cbfunc) {
        /* if I am the recipient and a direct fn is provided, use a message event */
        ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, buf, tag, cbfunc);
        ret = ORTE_SUCCESS;
    } else {
        /* go ahead and send it */
        if (0 > (ret = orte_rml.send_buffer(recipient, buf, tag, 0))) {
            ORTE_ERROR_LOG(ret);
        } else {
            ret = ORTE_SUCCESS;
        }
    }
    return ret;
}
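/* A usage sketch for orte_global_comm: the same call works whether the
 * recipient is remote or ourselves, with the self case short-circuited into
 * a message event rather than an RML self-send. The handler below assumes
 * orte_default_cbfunc_t has the libevent-style (fd, event, data) signature
 * used by ORTE_MESSAGE_EVENT callbacks - that signature, the handler body,
 * and both function names are assumptions for illustration. */
static void my_handler_sketch(int fd, short event, void *data)
{
    orte_message_event_t *mev = (orte_message_event_t*)data;
    /* ... process mev->buffer locally ... */
    OBJ_RELEASE(mev);
}

static int notify_sketch(orte_process_name_t *peer, opal_buffer_t *buf)
{
    /* if peer happens to be ORTE_PROC_MY_NAME, my_handler_sketch fires via
     * a zero-time event; otherwise the buffer goes out over the RML */
    return orte_global_comm(peer, buf, ORTE_RML_TAG_DAEMON, my_handler_sketch);
}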
static int xcast(orte_jobid_t job, opal_buffer_t *buffer, orte_rml_tag_t tag)
{
    int rc = ORTE_SUCCESS;
    opal_buffer_t buf;
    orte_daemon_cmd_flag_t command;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:xcast sent to job %s tag %ld",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job), (long)tag));

    /* if there is no message to send, then just return ok */
    if (NULL == buffer) {
        return ORTE_SUCCESS;
    }

    /* setup a buffer to handle the xcast command */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    /* all we need to do is send this to the HNP - the relay logic
     * will ensure everyone else gets it! So tell the HNP to
     * process and relay it. The HNP will use the routed.get_routing_tree
     * to find out who it should relay the message to.
     */
    command = ORTE_DAEMON_PROCESS_AND_RELAY_CMD;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* pack the target jobid and tag for use in relay */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &job, 1, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &tag, 1, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* if this isn't intended for the daemon command tag, then we better
     * tell the daemon to deliver it to the procs, and what job is supposed
     * to get it - this occurs when a caller just wants to send something
     * to all the procs in a job. In that use-case, the caller doesn't know
     * anything about inserting daemon commands or what routing algo might
     * be used, so we have to help them out a little. Functions that are
     * sending commands to the daemons themselves are smart enough to know
     * what they need to do.
     */
    if (ORTE_RML_TAG_DAEMON != tag) {
        command = ORTE_DAEMON_MESSAGE_LOCAL_PROCS;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_DAEMON_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto CLEANUP;
        }
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &job, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            goto CLEANUP;
        }
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &tag, 1, ORTE_RML_TAG))) {
            ORTE_ERROR_LOG(rc);
            goto CLEANUP;
        }
    }

    /* copy the payload into the new buffer - this is non-destructive, so our
     * caller is still responsible for releasing any memory in the buffer they
     * gave to us
     */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, buffer))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* if I am the HNP, just set things up so the cmd processor gets called.
     * We don't want to message ourselves as this can create circular logic
     * in the RML. Instead, this macro will set a zero-time event which will
     * cause the buffer to be processed by the cmd processor - probably will
     * fire right away, but that's okay
     * The macro makes a copy of the buffer, so it's okay to release it here
     */
    if (orte_process_info.hnp) {
        ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &buf, ORTE_RML_TAG_DAEMON,
                           orte_daemon_cmd_processor);
    } else {
        /* otherwise, send it to the HNP for relay */
        if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf,
                                           ORTE_RML_TAG_DAEMON, 0))) {
            ORTE_ERROR_LOG(rc);
            goto CLEANUP;
        }
        rc = ORTE_SUCCESS;
    }

CLEANUP:
    OBJ_DESTRUCT(&buf);
    return rc;
}
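/* A minimal sketch of the receiving side implied by the pack order above:
 * the daemon command processor must unpack the command, target jobid, and
 * tag in exactly the order xcast packed them. The function name and control
 * flow are hypothetical; only the unpack sequence is derived from the code
 * above. */
static int relay_unpack_sketch(opal_buffer_t *buffer)
{
    int rc, n;
    orte_daemon_cmd_flag_t command;
    orte_jobid_t job;
    orte_rml_tag_t tag;

    /* unpack in the same order xcast packed: command, jobid, tag */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &tag, &n, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* for ORTE_DAEMON_PROCESS_AND_RELAY_CMD: relay per the routing tree,
     * then act on the remainder of the buffer locally */
    return ORTE_SUCCESS;
}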
/* When working in this function, ALWAYS jump to "cleanup" if
 * you encounter an error so that orterun will be woken up and
 * the job can cleanly terminate
 */
static int plm_slurm_launch_job(orte_job_t *jdata)
{
    orte_app_context_t **apps;
    orte_node_t **nodes;
    orte_std_cntr_t n;
    orte_job_map_t *map;
    char *jobid_string = NULL;
    char *param;
    char **argv = NULL;
    int argc;
    int rc;
    char *tmp;
    char **env = NULL;
    char *var;
    char *nodelist_flat;
    char **nodelist_argv;
    char *name_string;
    char **custom_strings;
    int num_args, i;
    char *cur_prefix;
    struct timeval launchstart, launchstop;
    int proc_vpid_index;
    orte_jobid_t failed_job;
    bool failed_launch = true;
    bool using_regexp = false;

    if (NULL == jdata) {
        /* just launching debugger daemons */
        active_job = ORTE_JOBID_INVALID;
        goto launch_apps;
    }

    if (jdata->controls & ORTE_JOB_CONTROL_DEBUGGER_DAEMON) {
        /* debugger daemons */
        active_job = jdata->jobid;
        goto launch_apps;
    }

    if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
        /* if this is a request to launch a local slave,
         * then we will not be launching an orted - we will
         * directly ssh the slave process itself. No mapping
         * is performed to support this - the caller must
         * provide all the info required to launch the job,
         * including the target hosts
         */
        if (!local_launch_available) {
            /* if we can't support this, then abort */
            orte_show_help("help-plm-slurm.txt", "no-local-slave-support", true);
            return ORTE_ERR_FAILED_TO_START;
        }
        return orte_plm_base_local_slave_launch(jdata);
    }

    /* if we are timing, record the start time */
    if (orte_timing) {
        gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL);
    }

    /* flag the daemons as failing by default */
    failed_job = ORTE_PROC_MY_NAME->jobid;

    if (orte_timing) {
        if (0 != gettimeofday(&launchstart, NULL)) {
            opal_output(0, "plm_slurm: could not obtain job start time");
            launchstart.tv_sec = 0;
            launchstart.tv_usec = 0;
        }
    }

    /* indicate the state of the launch */
    launching_daemons = true;

    /* setup the job */
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:slurm: launching job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* set the active jobid */
    active_job = jdata->jobid;

    /* Get the map for this job */
    if (NULL == (map = orte_rmaps.get_job_map(active_job))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }
    apps = (orte_app_context_t**)jdata->apps->addr;
    nodes = (orte_node_t**)map->nodes->addr;

    if (0 == map->num_new_daemons) {
        /* no new daemons required - just launch apps */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:slurm: no new daemons to launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto launch_apps;
    }

    /* need integer value for command line parameter */
    asprintf(&jobid_string, "%lu", (unsigned long) jdata->jobid);

    /*
     * start building argv array
     */
    argv = NULL;
    argc = 0;

    /*
     * SLURM srun OPTIONS
     */

    /* add the srun command */
    opal_argv_append(&argc, &argv, "srun");

    /* Append user defined arguments to srun */
    if (NULL != mca_plm_slurm_component.custom_args) {
        custom_strings = opal_argv_split(mca_plm_slurm_component.custom_args, ' ');
        num_args = opal_argv_count(custom_strings);
        for (i = 0; i < num_args; ++i) {
            opal_argv_append(&argc, &argv, custom_strings[i]);
        }
        opal_argv_free(custom_strings);
    }

    asprintf(&tmp, "--nodes=%lu", (unsigned long) map->num_new_daemons);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);

    asprintf(&tmp, "--ntasks=%lu", (unsigned long) map->num_new_daemons);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);

    /* alert us if any orteds die during startup */
    opal_argv_append(&argc, &argv, "--kill-on-bad-exit");

    /* create nodelist */
    nodelist_argv = NULL;
    for (n = 0; n < map->num_nodes; n++) {
        /* if the daemon already exists on this node, then
         * don't include it
         */
        if (nodes[n]->daemon_launched) {
            continue;
        }
        /* otherwise, add it to the list of nodes upon which
         * we need to launch a daemon
         */
        opal_argv_append_nosize(&nodelist_argv, nodes[n]->name);
    }
    if (0 == opal_argv_count(nodelist_argv)) {
        orte_show_help("help-plm-slurm.txt", "no-hosts-in-list", true);
        rc = ORTE_ERR_FAILED_TO_START;
        goto cleanup;
    }
    nodelist_flat = opal_argv_join(nodelist_argv, ',');
    opal_argv_free(nodelist_argv);
    asprintf(&tmp, "--nodelist=%s", nodelist_flat);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);

    OPAL_OUTPUT_VERBOSE((2, orte_plm_globals.output,
                         "%s plm:slurm: launching on nodes %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat));

    /*
     * ORTED OPTIONS
     */

    /* add the daemon command (as specified by user) */
    orte_plm_base_setup_orted_cmd(&argc, &argv);

    /* Add basic orted command line options, including debug flags */
    orte_plm_base_orted_append_basic_args(&argc, &argv,
                                          "slurm", &proc_vpid_index,
                                          false, nodelist_flat);
    free(nodelist_flat);

    /* tell the new daemons the base of the name list so they can compute
     * their own name on the other end
     */
    rc = orte_util_convert_vpid_to_string(&name_string, map->daemon_vpid_start);
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "plm_slurm: unable to get daemon vpid as string");
        goto cleanup;
    }

    free(argv[proc_vpid_index]);
    argv[proc_vpid_index] = strdup(name_string);
    free(name_string);

    /* Copy the prefix-directory specified in the
     * corresponding app_context. If there are multiple,
     * different prefixes in the app context, complain (i.e., only
     * allow one --prefix option for the entire slurm run -- we
     * don't support different --prefix values for different nodes in
     * the SLURM plm)
     */
    cur_prefix = NULL;
    for (n = 0; n < jdata->num_apps; n++) {
        char *app_prefix_dir = apps[n]->prefix_dir;
        /* Check for already set cur_prefix_dir -- if different, complain */
        if (NULL != app_prefix_dir) {
            if (NULL != cur_prefix &&
                0 != strcmp(cur_prefix, app_prefix_dir)) {
                orte_show_help("help-plm-slurm.txt", "multiple-prefixes",
                               true, cur_prefix, app_prefix_dir);
                return ORTE_ERR_FATAL;
            }

            /* If not yet set, copy it; if already set, then it's the
             * same anyway
             */
            if (NULL == cur_prefix) {
                cur_prefix = strdup(app_prefix_dir);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                     "%s plm:slurm: Set prefix:%s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     cur_prefix));
            }
        }
    }

    /* setup environment */
    env = opal_argv_copy(orte_launch_environ);

    /* enable local launch by the orteds */
    var = mca_base_param_environ_variable("plm", NULL, NULL);
    opal_setenv(var, "rsh", true, &env);
    free(var);

    /* if we can do it, use the regexp to launch the apps - this
     * requires that the user requested this mode, that we were
     * provided with static ports, and that we only have one
     * app_context
     */
    if (orte_use_regexp && orte_static_ports && jdata->num_apps < 2) {
        char *regexp;
        regexp = orte_regex_encode_maps(jdata);
        opal_argv_append(&argc, &argv, "--launch");
        opal_argv_append(&argc, &argv, regexp);
        free(regexp);
        using_regexp = true;
    }

    if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
        param = opal_argv_join(argv, ' ');
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:slurm: final top-level argv:\n\t%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == param) ? "NULL" : param));
        if (NULL != param) free(param);
    }

    /* exec the daemon(s) */
    if (ORTE_SUCCESS != (rc = plm_slurm_start_proc(argc, argv, env, cur_prefix))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* do NOT wait for srun to complete. Srun only completes when the processes
     * it starts - in this case, the orteds - complete. Instead, we'll catch
     * any srun failures and deal with them elsewhere
     */

    /* wait for daemons to callback */
    if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:slurm: daemon launch failed for job %s on error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
        goto cleanup;
    }

launch_apps:
    /* get here if daemons launch okay - any failures now by apps */
    launching_daemons = false;
    failed_job = active_job;
    if (using_regexp) {
        /* daemons already have launch cmd - just wait for them to
         * report back
         */
        opal_buffer_t launch;
        int8_t flag;
        orte_daemon_cmd_flag_t command = ORTE_DAEMON_ADD_LOCAL_PROCS;
        OBJ_CONSTRUCT(&launch, opal_buffer_t);
        opal_dss.pack(&launch, &command, 1, ORTE_DAEMON_CMD);
        flag = 1;
        opal_dss.pack(&launch, &flag, 1, OPAL_INT8);
        opal_dss.pack(&launch, &orted_launch_cmd, 1, OPAL_STRING);
        ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &launch, ORTE_RML_TAG_DAEMON,
                           orte_daemon_cmd_processor);
        OBJ_DESTRUCT(&launch);

        if (ORTE_SUCCESS != (rc = orte_plm_base_report_launched(jdata->jobid))) {
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s plm:slurm:launch failed for job %s on error %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
            goto cleanup;
        }
    } else {
        if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
            OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                 "%s plm:slurm: launch of apps failed for job %s on error %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
            goto cleanup;
        }
    }

    /* declare the launch a success */
    failed_launch = false;

    if (orte_timing) {
        if (0 != gettimeofday(&launchstop, NULL)) {
            opal_output(0, "plm_slurm: could not obtain stop time");
        } else {
            opal_output(0, "plm_slurm: total job launch time is %ld usec",
                        (launchstop.tv_sec - launchstart.tv_sec) * 1000000 +
                        (launchstop.tv_usec - launchstart.tv_usec));
        }
    }

    if (ORTE_SUCCESS != rc) {
        opal_output(0, "plm:slurm: start_procs returned error %d", rc);
        goto cleanup;
    }

cleanup:
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }
    if (NULL != jobid_string) {
        free(jobid_string);
    }

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE,
                                    ORTE_JOB_STATE_FAILED_TO_START);
    }

    return rc;
}
/* this is the read handler for my own child procs. In this case,
 * the data is going nowhere - I just output it myself
 */
void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
{
    orte_iof_read_event_t *rev = (orte_iof_read_event_t*)cbdata;
    unsigned char data[ORTE_IOF_BASE_MSG_MAX];
    int32_t numbytes;
    opal_list_item_t *item;
    orte_iof_proc_t *proct;
    int rc;

    OPAL_THREAD_LOCK(&mca_iof_hnp_component.lock);

    /* read up to the fragment size */
#if !defined(__WINDOWS__)
    numbytes = read(fd, data, sizeof(data));
#else
    {
        DWORD readed;
        HANDLE handle = (HANDLE)_get_osfhandle(fd);
        ReadFile(handle, data, sizeof(data), &readed, NULL);
        numbytes = (int)readed;
    }
#endif  /* !defined(__WINDOWS__) */

    if (numbytes < 0) {
        /* either we have a connection error or it was a non-blocking read */

        /* non-blocking, retry */
        if (EAGAIN == errno || EINTR == errno) {
            opal_event_add(&rev->ev, 0);
            OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
            return;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                             "%s iof:hnp:read handler %s Error on connection:%d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&rev->name), fd));
        /* Un-recoverable error. Allow the code to flow as usual in order
         * to send the zero bytes message up the stream, and then close the
         * file descriptor and delete the event.
         */
        numbytes = 0;
    }

    /* is this read from our stdin? */
    if (ORTE_IOF_STDIN & rev->tag) {
        /* if job termination has been ordered, just ignore the
         * data and delete the read event
         */
        if (orte_job_term_ordered) {
            OBJ_RELEASE(mca_iof_hnp_component.stdinev);
            OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
            return;
        }
        /* cycle through our list of sinks */
        for (item = opal_list_get_first(&mca_iof_hnp_component.sinks);
             item != opal_list_get_end(&mca_iof_hnp_component.sinks);
             item = opal_list_get_next(item)) {
            orte_iof_sink_t* sink = (orte_iof_sink_t*)item;

            /* only look at stdin sinks */
            if (!(ORTE_IOF_STDIN & sink->tag)) {
                continue;
            }

            /* if the daemon is me, then this is a local sink */
            if (ORTE_PROC_MY_NAME->jobid == sink->daemon.jobid &&
                ORTE_PROC_MY_NAME->vpid == sink->daemon.vpid) {
                OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                     "%s read %d bytes from stdin - writing to %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                                     ORTE_NAME_PRINT(&rev->name)));
                /* send the bytes down the pipe - we even send 0 byte events
                 * down the pipe so it forces out any preceding data before
                 * closing the output stream
                 */
                if (NULL != sink->wev) {
                    if (ORTE_IOF_MAX_INPUT_BUFFERS <
                        orte_iof_base_write_output(&rev->name, rev->tag, data,
                                                   numbytes, sink->wev)) {
                        /* getting too backed up - stop the read event for now
                         * if it is still active
                         */
                        if (mca_iof_hnp_component.stdinev->active) {
                            OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                                 "buffer backed up - holding"));
                            mca_iof_hnp_component.stdinev->active = false;
                        }
                        OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
                        return;
                    }
                }
            } else {
                OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                     "%s sending %d bytes from stdin to daemon %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                                     ORTE_NAME_PRINT(&sink->daemon)));
                /* send the data to the daemon so it can
                 * write it to the proc's fd - in this case,
                 * we pass sink->name to indicate who is to
                 * receive the data. If the connection closed,
                 * numbytes will be zero so zero bytes will be
                 * sent - this will tell the daemon to close
                 * the fd for stdin to that proc
                 */
                orte_iof_hnp_send_data_to_endpoint(&sink->daemon, &sink->name,
                                                   ORTE_IOF_STDIN, data, numbytes);
            }
        }

        /* if num_bytes was zero, then we need to terminate the event */
        if (0 == numbytes) {
            /* this will also close our stdin file descriptor */
            OBJ_RELEASE(mca_iof_hnp_component.stdinev);
        } else {
            /* if we are looking at a tty, then we just go ahead and restart the
             * read event assuming we are not backgrounded
             */
            if (orte_iof_hnp_stdin_check(fd)) {
                restart_stdin(fd, 0, NULL);
            } else {
                /* delay for awhile and then restart */
                ORTE_TIMER_EVENT(0, 10000, restart_stdin);
            }
        }
        /* nothing more to do */
        OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
        return;
    }

    /* this must be output from one of my local procs - see
     * if anyone else has requested a copy of this info
     */
    for (item = opal_list_get_first(&mca_iof_hnp_component.sinks);
         item != opal_list_get_end(&mca_iof_hnp_component.sinks);
         item = opal_list_get_next(item)) {
        orte_iof_sink_t *sink = (orte_iof_sink_t*)item;
        /* if the target isn't set, then this sink is for another purpose - ignore it */
        if (ORTE_JOBID_INVALID == sink->daemon.jobid) {
            continue;
        }
        if ((sink->tag & rev->tag) &&
            sink->name.jobid == rev->name.jobid &&
            (ORTE_VPID_WILDCARD == sink->name.vpid ||
             sink->name.vpid == rev->name.vpid)) {
            /* need to send the data to the remote endpoint - if
             * the connection closed, numbytes will be zero, so
             * the remote endpoint will know to close its local fd.
             * In this case, we pass rev->name to indicate who the
             * data came from.
             */
            OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                 "%s sending data to tool %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&sink->daemon)));
            orte_iof_hnp_send_data_to_endpoint(&sink->daemon, &rev->name,
                                               rev->tag, data, numbytes);
        }
    }

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s read %d bytes from %s of %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                         (ORTE_IOF_STDOUT & rev->tag) ? "stdout" :
                         ((ORTE_IOF_STDERR & rev->tag) ? "stderr" : "stddiag"),
                         ORTE_NAME_PRINT(&rev->name)));

    if (0 == numbytes) {
        /* if we read 0 bytes from the stdout/err/diag, there is
         * nothing to output - find this proc on our list and
         * release the appropriate event. This will delete the
         * read event and close the file descriptor
         */
        for (item = opal_list_get_first(&mca_iof_hnp_component.procs);
             item != opal_list_get_end(&mca_iof_hnp_component.procs);
             item = opal_list_get_next(item)) {
            proct = (orte_iof_proc_t*)item;
            if (proct->name.jobid == rev->name.jobid &&
                proct->name.vpid == rev->name.vpid) {
                /* found it - release corresponding event. This deletes
                 * the read event and closes the file descriptor
                 */
                if (rev->tag & ORTE_IOF_STDOUT) {
                    OBJ_RELEASE(proct->revstdout);
                } else if (rev->tag & ORTE_IOF_STDERR) {
                    OBJ_RELEASE(proct->revstderr);
                } else if (rev->tag & ORTE_IOF_STDDIAG) {
                    OBJ_RELEASE(proct->revstddiag);
                }
                /* check to see if they are all done */
                if (NULL == proct->revstdout &&
                    NULL == proct->revstderr &&
                    NULL == proct->revstddiag) {
                    opal_buffer_t cmdbuf;
                    orte_daemon_cmd_flag_t command;
                    /* this proc's iof is complete */
                    opal_list_remove_item(&mca_iof_hnp_component.procs, item);
                    /* setup a cmd to notify that the iof is complete */
                    OBJ_CONSTRUCT(&cmdbuf, opal_buffer_t);
                    command = ORTE_DAEMON_IOF_COMPLETE;
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &command, 1, ORTE_DAEMON_CMD))) {
                        ORTE_ERROR_LOG(rc);
                        goto CLEANUP;
                    }
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &proct->name, 1, ORTE_NAME))) {
                        ORTE_ERROR_LOG(rc);
                        goto CLEANUP;
                    }
                    ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmdbuf, ORTE_RML_TAG_DAEMON,
                                       orte_daemon_cmd_processor);
                CLEANUP:
                    OBJ_DESTRUCT(&cmdbuf);
                    OBJ_RELEASE(proct);
                }
                break;
            }
        }
        OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
        return;
    }

    /* see if the user wanted the output directed to files */
    if (NULL != orte_output_filename) {
        /* find the sink for this rank */
        for (item = opal_list_get_first(&mca_iof_hnp_component.sinks);
             item != opal_list_get_end(&mca_iof_hnp_component.sinks);
             item = opal_list_get_next(item)) {
            orte_iof_sink_t *sink = (orte_iof_sink_t*)item;
            /* if the target is set, then this sink is for another purpose - ignore it */
            if (ORTE_JOBID_INVALID != sink->daemon.jobid) {
                continue;
            }
            /* if this sink isn't for output, ignore it */
            if (ORTE_IOF_STDIN & sink->tag) {
                continue;
            }
            /* is this the desired proc? */
            if (sink->name.jobid == rev->name.jobid &&
                sink->name.vpid == rev->name.vpid) {
                /* output to the corresponding file */
                orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev);
                /* done */
                break;
            }
        }
    } else {
        /* output this to our local output */
        if (ORTE_IOF_STDOUT & rev->tag || orte_xml_output) {
            orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes,
                                       orte_iof_base.iof_write_stdout->wev);
        } else {
            orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes,
                                       orte_iof_base.iof_write_stderr->wev);
        }
    }

    /* re-add the event */
    opal_event_add(&rev->ev, 0);

    OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
    return;
}
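/* The sink-matching test above (tag overlap, same jobid, exact vpid or
 * wildcard) is the heart of the forwarding decision; a hypothetical helper
 * capturing it might read as follows. This is a sketch extracted for
 * clarity, not a function present in the source. */
static bool sink_wants_output_sketch(orte_iof_sink_t *sink,
                                     orte_iof_read_event_t *rev)
{
    return (sink->tag & rev->tag) &&
           sink->name.jobid == rev->name.jobid &&
           (ORTE_VPID_WILDCARD == sink->name.vpid ||
            sink->name.vpid == rev->name.vpid);
}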
void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
{
    orte_iof_read_event_t *rev = (orte_iof_read_event_t*)cbdata;
    unsigned char data[ORTE_IOF_BASE_MSG_MAX];
    opal_buffer_t *buf = NULL;
    int rc;
    int32_t numbytes;
    opal_list_item_t *item;
    orte_iof_proc_t *proct;

    OPAL_THREAD_LOCK(&mca_iof_orted_component.lock);

    /* read up to the fragment size */
#if !defined(__WINDOWS__)
    numbytes = read(fd, data, sizeof(data));
#else
    {
        DWORD readed;
        HANDLE handle = (HANDLE)_get_osfhandle(fd);
        ReadFile(handle, data, sizeof(data), &readed, NULL);
        numbytes = (int)readed;
    }
#endif  /* !defined(__WINDOWS__) */

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s iof:orted:read handler read %d bytes from %s, fd %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                         ORTE_NAME_PRINT(&rev->name), fd));

    if (numbytes <= 0) {
        if (0 > numbytes) {
            /* either we have a connection error or it was a non-blocking read */
            if (EAGAIN == errno || EINTR == errno) {
                /* non-blocking, retry */
                opal_event_add(&rev->ev, 0);
                OPAL_THREAD_UNLOCK(&mca_iof_orted_component.lock);
                return;
            }

            OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                 "%s iof:orted:read handler %s Error on connection:%d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&rev->name), fd));
        }
        /* numbytes must have been zero, so go down and close the fd etc */
        goto CLEAN_RETURN;
    }

    /* see if the user wanted the output directed to files */
    if (NULL != orte_output_filename) {
        /* find the sink for this rank */
        for (item = opal_list_get_first(&mca_iof_orted_component.sinks);
             item != opal_list_get_end(&mca_iof_orted_component.sinks);
             item = opal_list_get_next(item)) {
            orte_iof_sink_t *sink = (orte_iof_sink_t*)item;
            /* if the target is set, then this sink is for another purpose - ignore it */
            if (ORTE_JOBID_INVALID != sink->daemon.jobid) {
                continue;
            }
            /* if this sink isn't for output, ignore it */
            if (ORTE_IOF_STDIN & sink->tag) {
                continue;
            }
            /* is this the desired proc? */
            if (sink->name.jobid == rev->name.jobid &&
                sink->name.vpid == rev->name.vpid) {
                /* output to the corresponding file */
                orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev);
                /* done */
                break;
            }
        }
        goto RESTART;
    }

    /* prep the buffer */
    buf = OBJ_NEW(opal_buffer_t);

    /* pack the stream first - we do this so that flow control messages can
     * consist solely of the tag
     */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rev->tag, 1, ORTE_IOF_TAG))) {
        ORTE_ERROR_LOG(rc);
        goto CLEAN_RETURN;
    }

    /* pack name of process that gave us this data */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rev->name, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        goto CLEAN_RETURN;
    }

    /* pack the data - only pack the #bytes we read! */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &data, numbytes, OPAL_BYTE))) {
        ORTE_ERROR_LOG(rc);
        goto CLEAN_RETURN;
    }

    /* start non-blocking RML call to forward received data */
    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s iof:orted:read handler sending %d bytes to HNP",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes));

    orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP,
                            0, send_cb, NULL);

RESTART:
    /* re-add the event */
    opal_event_add(&rev->ev, 0);

    OPAL_THREAD_UNLOCK(&mca_iof_orted_component.lock);
    return;

CLEAN_RETURN:
    /* must be an error, or zero bytes were read indicating that the
     * proc terminated this IOF channel - either way, find this proc
     * on our list and clean up
     */
    for (item = opal_list_get_first(&mca_iof_orted_component.procs);
         item != opal_list_get_end(&mca_iof_orted_component.procs);
         item = opal_list_get_next(item)) {
        proct = (orte_iof_proc_t*)item;
        if (proct->name.jobid == rev->name.jobid &&
            proct->name.vpid == rev->name.vpid) {
            /* found it - release corresponding event. This deletes
             * the read event and closes the file descriptor
             */
            if (rev->tag & ORTE_IOF_STDOUT) {
                OBJ_RELEASE(proct->revstdout);
            } else if (rev->tag & ORTE_IOF_STDERR) {
                OBJ_RELEASE(proct->revstderr);
            } else if (rev->tag & ORTE_IOF_STDDIAG) {
                OBJ_RELEASE(proct->revstddiag);
            }
            /* check to see if they are all done */
            if (NULL == proct->revstdout &&
                NULL == proct->revstderr &&
                NULL == proct->revstddiag) {
                opal_buffer_t cmdbuf;
                orte_daemon_cmd_flag_t command;
                /* this proc's iof is complete */
                opal_list_remove_item(&mca_iof_orted_component.procs, item);
                /* setup a cmd to notify that the iof is complete */
                OBJ_CONSTRUCT(&cmdbuf, opal_buffer_t);
                command = ORTE_DAEMON_IOF_COMPLETE;
                if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                if (ORTE_SUCCESS != (rc = opal_dss.pack(&cmdbuf, &proct->name, 1, ORTE_NAME))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmdbuf, ORTE_RML_TAG_DAEMON,
                                   orte_daemon_cmd_processor);
            CLEANUP:
                OBJ_DESTRUCT(&cmdbuf);
                OBJ_RELEASE(proct);
            }
            break;
        }
    }
    if (NULL != buf) {
        OBJ_RELEASE(buf);
    }
    OPAL_THREAD_UNLOCK(&mca_iof_orted_component.lock);
    return;
}
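/* A minimal sketch of the HNP side implied by the pack order above: the
 * handler for ORTE_RML_TAG_IOF_HNP must unpack the stream tag, the source
 * process name, and then the payload bytes, in that order. The function
 * name and buffer handling are hypothetical; only the unpack sequence
 * mirrors the packing in orte_iof_orted_read_handler, and the in/out use
 * of the count argument for OPAL_BYTE is an assumption. */
static int iof_hnp_unpack_sketch(opal_buffer_t *buffer)
{
    int rc, n;
    orte_iof_tag_t stream;
    orte_process_name_t origin;
    unsigned char data[ORTE_IOF_BASE_MSG_MAX];
    int32_t numbytes;

    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &stream, &n, ORTE_IOF_TAG))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &origin, &n, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* the rest is payload; zero bytes signals the source closed the stream */
    numbytes = ORTE_IOF_BASE_MSG_MAX;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, data, &numbytes, OPAL_BYTE))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* ... write the data locally or forward it to the matching sink ... */
    return ORTE_SUCCESS;
}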