void Kickoff_PBS(const Node_info *ddinodes,const Cmdline_info *info) { char ddiinfo[] = "-ddi"; char procid[8]; char portid[8]; char nodeid[8]; char snodes[8]; char sprocs[8]; char **rargs; char **argv = info->argv; int i,j,r,iarg,nargs = info->ddiarg + info->nnodes + 8; int inode,ncpus,np = info->nprocs; int ntests; if(info->nnodes == 1) return; int tm_errno; tm_task_id *tid; tm_event_t *spawn; tm_event_t polled; struct tm_roots roots; tm_node_id *nodelist; /* ---------------------------------- *\ Initialize PBS Task Management API \* ---------------------------------- */ if(tm_init(0, &roots) != TM_SUCCESS) { fprintf(stderr, " ddikick.x: tm_init failed\n"); Fatal_error(911); } if(tm_nodeinfo(&nodelist, &np) != TM_SUCCESS) { fprintf(stderr, " ddikick.x: tm_nodeinfo failed.\n"); Fatal_error(911); } tid = (tm_task_id *) Malloc(2*np*sizeof(tm_task_id)); spawn = (tm_event_t *) Malloc(2*np*sizeof(tm_event_t)); for(i=0; i<2*np; i++) { *(tid + i) = TM_NULL_TASK; *(spawn + i) = TM_NULL_EVENT; } /* ----------------------------------------- *\ Initialize arguments to kickoff DDI tasks \* ----------------------------------------- */ rargs = (char **) Malloc(nargs*sizeof(char*)); sprintf(portid, "%d", info->kickoffport); sprintf(snodes, "%d", info->nnodes); sprintf(sprocs, "%d", info->nprocs); for(i=1,r=0; i<info->ddiarg-1; i++) rargs[r++] = argv[i]; rargs[r++] = ddiinfo; rargs[r++] = info->kickoffhost; /* kickoff host name */ rargs[r++] = portid; /* kickoff port number */ rargs[r++] = nodeid; /* rank of this node */ rargs[r++] = procid; /* rank of this process */ rargs[r++] = snodes; /* number of nodes */ rargs[r++] = sprocs; /* number of processors */ for(i=0,iarg=info->nodearg; i<info->nnodes; i++,iarg++) { rargs[r++] = argv[iarg]; } rargs[r] = NULL; /* ------------------------ *\ Spawn DDI tasks to nodes \* ------------------------ */ ncpus=ddinodes[0].cpus+ddinodes[1].cpus; for(i=ddinodes[0].cpus,inode=1; i<np; i++) { if(i == ncpus) ncpus += ddinodes[++inode].cpus; sprintf(nodeid,"%d",inode); sprintf(procid,"%d",i); # if DDI_DEBUG DEBUG_START(DEBUG_MAX) fprintf(stdout,"DDI Process %i PBS tm_spawn arguments: ",i); for(iarg=0; iarg<r; iarg++) fprintf(stdout,"%s ",rargs[iarg]); fprintf(stdout,"\n"); DEBUG_END() # endif /* ------------------------- *\ Spawn DDI Compute Process \* ------------------------- */ if(tm_spawn(r,rargs,NULL,*(nodelist+i),(tid+i),spawn+i) != TM_SUCCESS) { fprintf(stderr," ddikick.x: tm_spawn failed.\n"); Fatal_error(911); } /* ---------------------------------- *\ No data server on single node runs \* ---------------------------------- */ if(info->nnodes == 1) continue; # if DDI_DEBUG DEBUG_START(DEBUG_MAX) fprintf(stdout,"DDI Process %i PBS tm_spawn arguments: ",j); for(iarg=0; iarg<r; iarg++) fprintf(stdout,"%s ",rargs[iarg]); fprintf(stdout,"\n"); DEBUG_END() # endif j = i+np; sprintf(procid,"%d",j); /* --------------------- *\ Spawn DDI Data Server \* --------------------- */ if(tm_spawn(r,rargs,NULL,*(nodelist+i),(tid+j),spawn+j) != TM_SUCCESS) { fprintf(stderr," ddikick.x: tm_spawn failed.\n"); Fatal_error(911); } } /* -------------------------------------------------------- *\ Poll PBS to ensure each DDI process started successfully \* -------------------------------------------------------- */ ntests = np-ddinodes[0].cpus; if(USING_DATA_SERVERS()) ntests *= 2; for(i=ntests; i--; ) { if(tm_poll(TM_NULL_EVENT,&polled,1,&tm_errno) != TM_SUCCESS) { fprintf(stderr," ddikick.x: tm_poll failed.\n"); Fatal_error(911); } for(j=0; j<np; j++) { if(polled == *(spawn+j)) { if(tm_errno) { fprintf(stderr," ddikick.x: error spawning DDI task %i.\n",j); Fatal_error(911); } else { # if DDI_DEBUG DEBUG_START(DEBUG_MAX) fprintf(stdout," ddikick.x: DDI task %i started.\n",j); DEBUG_END() # endif } } if(info->nnodes == 1) continue; if(polled == *(spawn+j+np)) { if(tm_errno) { fprintf(stderr," ddikick.x: error spawning DDI task %i.\n",j+np); Fatal_error(911); } else { # if DDI_DEBUG DEBUG_START(DEBUG_MAX) fprintf(stdout," ddikick.x: DDI task %i started.\n",j+np); DEBUG_END() # endif } } } } /* -------------------------------------- *\ Close the link to the PBS Task Manager \* -------------------------------------- */ tm_finalize(); /* ---------------- *\ Free used memory \* ---------------- */ free(tid); free(spawn); free(rargs); }
int main( int argc, char *argv[]) { int c; int err = 0; int ncopies = -1; int onenode = -1; int rc; struct tm_roots rootrot; int nspawned = 0; tm_node_id *nodelist; int start; int stop; int sync = 0; int pernode = 0; char *targethost = NULL; char *allnodes; struct sigaction act; char **ioenv; extern int optind; extern char *optarg; int posixly_correct_set_by_caller = 0; char *envstr; id = malloc(60 * sizeof(char)); if (id == NULL) { fprintf(stderr, "%s: malloc failed, (%d)\n", id, errno); return(1); } sprintf(id, "pbsdsh%s", ((getenv("PBSDEBUG") != NULL) && (getenv("PBS_TASKNUM") != NULL)) ? getenv("PBS_TASKNUM") : ""); #ifdef __GNUC__ /* If it's already set, we won't unset it later */ if (getenv("POSIXLY_CORRECT") != NULL) posixly_correct_set_by_caller = 1; envstr = strdup("POSIXLY_CORRECT=1"); putenv(envstr); #endif while ((c = getopt(argc, argv, "c:n:h:osuv")) != EOF) { switch (c) { case 'c': ncopies = atoi(optarg); if (ncopies <= 0) { err = 1; } break; case 'h': targethost = strdup(optarg); /* run on this 1 hostname */ break; case 'n': onenode = atoi(optarg); if (onenode < 0) { err = 1; } break; case 'o': grabstdio = 1; break; case 's': sync = 1; /* force synchronous spawns */ break; case 'u': pernode = 1; /* run once per node (unique hostnames) */ break; case 'v': verbose = 1; /* turn on verbose output */ break; default: err = 1; break; } /* END switch (c) */ } /* END while ((c = getopt()) != EOF) */ if ((err != 0) || ((onenode >= 0) && (ncopies >= 1))) { fprintf(stderr, "Usage: %s [-c copies][-o][-s][-u][-v] program [args]...]\n", argv[0]); fprintf(stderr, " %s [-n nodenumber][-o][-s][-u][-v] program [args]...\n", argv[0]); fprintf(stderr, " %s [-h hostname][-o][-v] program [args]...\n", argv[0]); fprintf(stderr, "Where -c copies = run copy of \"args\" on the first \"copies\" nodes,\n"); fprintf(stderr, " -n nodenumber = run a copy of \"args\" on the \"nodenumber\"-th node,\n"); fprintf(stderr, " -o = capture stdout of processes,\n"); fprintf(stderr, " -s = forces synchronous execution,\n"); fprintf(stderr, " -u = run on unique hostnames,\n"); fprintf(stderr, " -h = run on this specific hostname,\n"); fprintf(stderr, " -v = forces verbose output.\n"); exit(1); } #ifdef __GNUC__ if (!posixly_correct_set_by_caller) { putenv("POSIXLY_CORRECT"); free(envstr); } #endif if (getenv("PBS_ENVIRONMENT") == NULL) { fprintf(stderr, "%s: not executing under PBS\n", id); return(1); } /* * Set up interface to the Task Manager */ if ((rc = tm_init(0, &rootrot)) != TM_SUCCESS) { fprintf(stderr, "%s: tm_init failed, rc = %s (%d)\n", id, get_ecname(rc), rc); return(1); } sigemptyset(&allsigs); sigaddset(&allsigs, SIGHUP); sigaddset(&allsigs, SIGINT); sigaddset(&allsigs, SIGTERM); act.sa_mask = allsigs; act.sa_flags = 0; /* We want to abort system calls and call a function. */ #ifdef SA_INTERRUPT act.sa_flags |= SA_INTERRUPT; #endif act.sa_handler = bailout; sigaction(SIGHUP, &act, NULL); sigaction(SIGINT, &act, NULL); sigaction(SIGTERM, &act, NULL); #ifdef DEBUG if (rootrot.tm_parent == TM_NULL_TASK) { fprintf(stderr, "%s: I am the mother of all tasks\n", id); } else { fprintf(stderr, "%s: I am but a child in the scheme of things\n", id); } #endif /* DEBUG */ if ((rc = tm_nodeinfo(&nodelist, &numnodes)) != TM_SUCCESS) { fprintf(stderr, "%s: tm_nodeinfo failed, rc = %s (%d)\n", id, get_ecname(rc), rc); return(1); } /* nifty unique/hostname code */ if (pernode || targethost) { allnodes = gethostnames(nodelist); if (targethost) { onenode = findtargethost(allnodes, targethost); } else { numnodes = uniquehostlist(nodelist, allnodes); } free(allnodes); if (targethost) free(targethost); } /* We already checked the lower bounds in the argument processing, now we check the upper bounds */ if ((onenode >= numnodes) || (ncopies > numnodes)) { fprintf(stderr, "%s: only %d nodes available\n", id, numnodes); return(1); } /* malloc space for various arrays based on number of nodes/tasks */ tid = (tm_task_id *)calloc(numnodes, sizeof(tm_task_id)); events_spawn = (tm_event_t *)calloc(numnodes, sizeof(tm_event_t)); events_obit = (tm_event_t *)calloc(numnodes, sizeof(tm_event_t)); ev = (int *)calloc(numnodes, sizeof(int)); if ((tid == NULL) || (events_spawn == NULL) || (events_obit == NULL) || (ev == NULL)) { /* FAILURE - cannot alloc memory */ fprintf(stderr, "%s: memory alloc of task ids failed\n", id); return(1); } for (c = 0; c < numnodes; c++) { *(tid + c) = TM_NULL_TASK; *(events_spawn + c) = TM_NULL_EVENT; *(events_obit + c) = TM_NULL_EVENT; *(ev + c) = 0; } /* END for (c) */ /* Now spawn the program to where it goes */ if (onenode >= 0) { /* Spawning one copy onto logical node "onenode" */ start = onenode; stop = onenode + 1; } else if (ncopies >= 0) { /* Spawn a copy of the program to the first "ncopies" nodes */ start = 0; stop = ncopies; } else { /* Spawn a copy on all nodes */ start = 0; stop = numnodes; } if ((ioenv = calloc(2, sizeof(char *)))==NULL) { /* FAILURE - cannot alloc memory */ fprintf(stderr,"%s: memory alloc of ioenv failed\n", id); return(1); } if (grabstdio != 0) { stdoutfd = build_listener(&stdoutport); if ((*ioenv = calloc(50,sizeof(char *))) == NULL) { /* FAILURE - cannot alloc memory */ fprintf(stderr,"%s: memory alloc of *ioenv failed\n", id); return(1); } snprintf(*ioenv,49,"TM_STDOUT_PORT=%d", stdoutport); FD_ZERO(&permrfsd); } sigprocmask(SIG_BLOCK, &allsigs, NULL); for (c = start; c < stop; ++c) { if ((rc = tm_spawn( argc - optind, argv + optind, ioenv, *(nodelist + c), tid + c, events_spawn + c)) != TM_SUCCESS) { fprintf(stderr, "%s: spawn failed on node %d err %s\n", id, c, get_ecname(rc)); } else { if (verbose) fprintf(stderr, "%s: spawned task %d\n", id, c); ++nspawned; if (sync) wait_for_task(&nspawned); /* one at a time */ } } /* END for (c) */ if (sync == 0) wait_for_task(&nspawned); /* wait for all to finish */ /* * Terminate interface with Task Manager */ tm_finalize(); return 0; } /* END main() */
/* When working in this function, ALWAYS jump to "cleanup" if * you encounter an error so that orterun will be woken up and * the job can cleanly terminate */ static void launch_daemons(int fd, short args, void *cbdata) { orte_job_map_t *map = NULL; orte_app_context_t *app; orte_node_t *node; int proc_vpid_index; char *param; char **env = NULL; char *var; char **argv = NULL; char **nodeargv; int argc = 0; int rc; orte_std_cntr_t i; char *bin_base = NULL, *lib_base = NULL; tm_event_t *tm_events = NULL; tm_task_id *tm_task_ids = NULL; bool failed_launch = true; mode_t current_umask; char *nodelist; char* vpid_string; orte_job_t *daemons, *jdata; orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; int32_t launchid, *ldptr; char *prefix_dir = NULL; jdata = state->jdata; /* if we are launching debugger daemons, then just go * do it - no new daemons will be launched */ if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } /* setup the virtual machine */ daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* if we don't want to launch, then don't attempt to * launch the daemons - the user really wants to just * look at the proposed process map */ if (orte_do_not_launch) { /* set the state to indicate the daemons reported - this * will trigger the daemons_reported event and cause the * job to move to the following step */ jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } /* Get the map for this job */ if (NULL == (map = daemons->map)) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); rc = ORTE_ERR_NOT_FOUND; goto cleanup; } if (0 == map->num_new_daemons) { /* set the state to indicate the daemons reported - this * will trigger the daemons_reported event and cause the * job to move to the following step */ jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:tm: launching vm", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* Allocate a bunch of TM events to use for tm_spawn()ing */ tm_events = malloc(sizeof(tm_event_t) * map->num_new_daemons); if (NULL == tm_events) { rc = ORTE_ERR_OUT_OF_RESOURCE; ORTE_ERROR_LOG(rc); goto cleanup; } tm_task_ids = malloc(sizeof(tm_task_id) * map->num_new_daemons); if (NULL == tm_task_ids) { rc = ORTE_ERR_OUT_OF_RESOURCE; ORTE_ERROR_LOG(rc); goto cleanup; } /* add the daemon command (as specified by user) */ orte_plm_base_setup_orted_cmd(&argc, &argv); /* create a list of nodes in this launch */ nodeargv = NULL; for (i = 0; i < map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { continue; } /* if this daemon already exists, don't launch it! */ if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) { continue; } /* add to list */ opal_argv_append_nosize(&nodeargv, node->name); } nodelist = opal_argv_join(nodeargv, ','); opal_argv_free(nodeargv); /* Add basic orted command line options */ orte_plm_base_orted_append_basic_args(&argc, &argv, "tm", &proc_vpid_index, nodelist); free(nodelist); if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) { param = opal_argv_join(argv, ' '); OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:tm: final top-level argv:\n\t%s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == param) ? "NULL" : param)); if (NULL != param) free(param); } if (!connected) { if (ORTE_SUCCESS != plm_tm_connect()) { goto cleanup; } connected = true; } /* Figure out the basenames for the libdir and bindir. There is a lengthy comment about this in plm_rsh_module.c explaining all the rationale for how / why we're doing this. */ lib_base = opal_basename(opal_install_dirs.libdir); bin_base = opal_basename(opal_install_dirs.bindir); /* setup environment */ env = opal_argv_copy(orte_launch_environ); /* enable local launch by the orteds */ (void) mca_base_var_env_name ("plm", &var); opal_setenv(var, "rsh", true, &env); free(var); /* add our umask -- see big note in orted.c */ current_umask = umask(0); umask(current_umask); (void)asprintf(&var, "0%o", current_umask); opal_setenv("ORTE_DAEMON_UMASK_VALUE", var, true, &env); free(var); /* If we have a prefix, then modify the PATH and LD_LIBRARY_PATH environment variables. We only allow a single prefix to be specified. Since there will always be at least one app_context, we take it from there */ app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0); orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&prefix_dir, OPAL_STRING); if (NULL != prefix_dir) { char *newenv; for (i = 0; NULL != env && NULL != env[i]; ++i) { /* Reset PATH */ if (0 == strncmp("PATH=", env[i], 5)) { (void)asprintf(&newenv, "%s/%s:%s", prefix_dir, bin_base, env[i] + 5); OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:tm: resetting PATH: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), newenv)); opal_setenv("PATH", newenv, true, &env); free(newenv); } /* Reset LD_LIBRARY_PATH */ else if (0 == strncmp("LD_LIBRARY_PATH=", env[i], 16)) { (void)asprintf(&newenv, "%s/%s:%s", prefix_dir, lib_base, env[i] + 16); OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:tm: resetting LD_LIBRARY_PATH: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), newenv)); opal_setenv("LD_LIBRARY_PATH", newenv, true, &env); free(newenv); } } free(prefix_dir); } /* Iterate through each of the nodes and spin * up a daemon. */ ldptr = &launchid; for (i = 0; i < map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { continue; } /* if this daemon already exists, don't launch it! */ if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) { continue; } OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:tm: launching on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); /* setup process name */ rc = orte_util_convert_vpid_to_string(&vpid_string, node->daemon->name.vpid); if (ORTE_SUCCESS != rc) { opal_output(0, "plm:tm: unable to get daemon vpid as string"); exit(-1); } free(argv[proc_vpid_index]); argv[proc_vpid_index] = strdup(vpid_string); free(vpid_string); /* exec the daemon */ if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) { param = opal_argv_join(argv, ' '); OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:tm: executing:\n\t%s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == param) ? "NULL" : param)); if (NULL != param) free(param); } launchid = 0; if (!orte_get_attribute(&node->attributes, ORTE_NODE_LAUNCH_ID, (void**)&ldptr, OPAL_INT32)) { orte_show_help("help-plm-tm.txt", "tm-spawn-failed", true, argv[0], node->name, 0); rc = ORTE_ERROR; goto cleanup; } rc = tm_spawn(argc, argv, env, launchid, tm_task_ids + launched, tm_events + launched); if (TM_SUCCESS != rc) { orte_show_help("help-plm-tm.txt", "tm-spawn-failed", true, argv[0], node->name, launchid); rc = ORTE_ERROR; goto cleanup; } launched++; } /* indicate that the daemons for this job were launched */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; /* flag that launch was successful, so far as we currently know */ failed_launch = false; OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:tm:launch: finished spawning orteds", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); cleanup: /* cleanup */ OBJ_RELEASE(state); /* check for failed launch - if so, force terminate */ if (failed_launch) { ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_FAILED_TO_START); } }
/* * Start the tasks, with much stuff in the environment. If concurrent * master, this could be on behalf of some other mpiexec to which we * will forward any event/error results. */ int start_tasks(int spawn) { int i, ret = 0; char *nargv[3]; char pwd[PATH_MAX]; char *cp; int conns[3]; /* expected connections to the stdio process */ int master_port = 0; const char *user_shell; growstr_t *g; int gmpi_port[2]; int pmi_fd; int task_start, task_end; const char *mpiexec_redir_helper_path; char *psm_uuid = NULL; int tv_port = 0; /* for looping from 0..numtasks in the case of MPI_Spawn */ task_start = spawns[spawn].task_start; task_end = spawns[spawn].task_end; /* * Get the pwd. Probably can trust libc not to overflow this, * but who knows. */ if (!getcwd(pwd, sizeof(pwd))) error("%s: no current working directory", __func__); pwd[sizeof(pwd)-1] = '\0'; /* * Eventually use the user's preferred shell. */ if ((cp = getenv("SHELL"))) user_shell = cp; else if (pswd->pw_shell) user_shell = pswd->pw_shell; else user_shell = "/bin/sh"; /* assume again */ /* * Rewrite argv to go through user's shell, just like rsh. * $SHELL, "-c", "cd <path>; exec <argv0> <argv1>..." * But to change the working dir and not frighten weak shells like tcsh, * we must detect that the dir actually exists on the far side before * committing to the cd. Use /bin/sh for this task, hoping it exists * everywhere we'll be. Then there's also a bit of quoting nightmare * to handle too. So we'll end up with: * rsh node "/bin/sh -c 'if test -d $dir ; then cd $dir ; fi ; $SHELL -c * \'exec argv0 argv1 ...\''" * but with argv* (including the executable, argv0) changed to replace * all occurrences of ' with '\''. */ nargv[0] = strsave("/bin/sh"); /* assume this exists everywhere */ nargv[1] = strsave("-c"); /* exec_line constructed for each process */ g = growstr_init(); /* * Start stdio stream handler process, if anybody gets stdin, * or !nostdout. */ if (cl_args->which_stdin == STDIN_NONE) conns[0] = 0; else if (cl_args->which_stdin == STDIN_ONE) { if (spawn == 0) conns[0] = 1; else conns[0] = 0; /* already connected the single stdin */ } else if (cl_args->which_stdin == STDIN_ALL) { /* total processes which connect stdin */ conns[0] = 0; for (i=task_start; i<task_end; i++) conns[0] += tasks[i].num_copies; } if (cl_args->nostdout) conns[1] = conns[2] = 0; else /* even for p4 and shmem, not with multiplicity */ conns[1] = conns[2] = task_end - task_start; /* * Initialize listener sockets for gm and ib, since these will be * used to implement MPI_Abort in the stdio listener later. */ if (cl_args->comm == COMM_MPICH_GM) { prepare_gm_startup_ports(gmpi_port); } else if (cl_args->comm == COMM_MPICH_IB) { master_port = prepare_ib_startup_port(&gmpi_fd[0]); gmpi_fd[1] = -1; } else if (cl_args->comm == COMM_MPICH_PSM) { master_port = prepare_psm_startup_port(&gmpi_fd[0]); gmpi_fd[1] = -1; } else if (cl_args->comm == COMM_MPICH_RAI) { master_port = prepare_rai_startup_port(); gmpi_fd[0] = -1; gmpi_fd[1] = -1; } else { gmpi_fd[0] = -1; gmpi_fd[1] = -1; } pmi_fd = -1; if (cl_args->comm == COMM_MPICH2_PMI) { /* stdio listener handles all PMI activity, even startup */ if (spawn == 0) master_port = prepare_pmi_startup_port(&pmi_fd); else master_port = stdio_msg_parent_say_more_tasks( task_end - task_start, conns); } /* flush output buffer, else forked child will have the output too */ fflush(stdout); /* fork the listener (unless we're just spawning more tasks) */ if (spawn == 0) stdio_fork(conns, gmpi_fd, pmi_fd); if (pmi_fd >= 0) close(pmi_fd); /* child has it now */ numtasks_waiting_start = 0; if (cl_args->comm == COMM_NONE) /* do not complain if they exit before all other tasks are up */ startup_complete = 1; else startup_complete = 0; /* * Start signal handling _after_ stdio child is up. */ handle_signals(0, 0, killall); /* * environment variables common to all tasks */ env_init(); /* override user env with these */ if (cl_args->comm == COMM_MPICH_GM) { env_add_int("GMPI_MAGIC", atoi(jobid)); /* PBS always gives us the "mother superior" node first in the list */ env_add("GMPI_MASTER", nodes[0].name); env_add_int("GMPI_PORT", gmpi_port[0]); /* 1.2.5..10 */ env_add_int("GMPI_PORT1", gmpi_port[0]); /* 1.2.4..8a */ env_add_int("GMPI_PORT2", gmpi_port[1]); env_add_int("GMPI_NP", numtasks); env_add_int("GMPI_BOARD", -1); /* ditto for new MX version */ env_add_int("MXMPI_MAGIC", atoi(jobid)); env_add("MXMPI_MASTER", nodes[0].name); env_add_int("MXMPI_PORT", gmpi_port[0]); env_add_int("MXMPI_NP", numtasks); env_add_int("MXMPI_BOARD", -1); /* for MACOSX to override default malloc */ env_add_int("DYLD_FORCE_FLAT_NAMESPACE", 1); } if (cl_args->comm == COMM_EMP) { growstr_t *emphosts = growstr_init(); for (i=0; i<numtasks; i++) growstr_printf(emphosts, "%s%s", (i > 0 ? " " : ""), nodes[tasks[i].node].mpname); env_add("EMPHOSTS", emphosts->s); growstr_free(emphosts); } if (cl_args->comm == COMM_MPICH_IB || cl_args->comm == COMM_MPICH_RAI) { int len; char *cq, *cr; env_add("MPIRUN_HOST", nodes[0].name); /* master address */ env_add_int("MPIRUN_PORT", master_port); env_add_int("MPIRUN_NPROCS", numtasks); env_add_int("MPIRUN_ID", atoi(jobid)); /* global job id */ /* * pmgr_version >= 3 needs this terribly long string in every task. * Since it may be quite large, we do the allocation by hand and * skip some growstr overhead. */ len = numtasks; /* separating colons and terminal \0 */ for (i=0; i<numtasks; i++) len += strlen(nodes[tasks[i].node].name); cq = cp = Malloc(len); for (i=0; i<numtasks; i++) { for (cr=nodes[tasks[i].node].name; *cr; cr++) *cq++ = *cr; *cq++ = ':'; } --cq; *cq = '\0'; env_add("MPIRUN_PROCESSES", cp); free(cp); } if (cl_args->comm == COMM_MPICH2_PMI) { growstr_t *hp = growstr_init(); growstr_printf(hp, "%s:%d", nodes[0].name, master_port); env_add("PMI_PORT", hp->s); growstr_free(hp); if (spawn > 0) env_add_int("PMI_SPAWNED", 1); } if (cl_args->comm == COMM_PORTALS) { growstr_t *nidmap = growstr_init(); growstr_t *pidmap = growstr_init(); portals_build_nidpid_maps(spawn, nidmap, pidmap); env_add("PTL_NIDMAP", nidmap->s); env_add("PTL_PIDMAP", pidmap->s); growstr_free(nidmap); growstr_free(pidmap); env_add("PTL_IFACE", "eth0"); /* XXX: no way to know */ } if (cl_args->comm == COMM_MPICH_P4 && numtasks > 1) master_port = prepare_p4_master_port(); if (cl_args->comm == COMM_MPICH_PSM) { /* We need to generate a uuid of the form * 9dea0f22-39a4-462a-80c9-b60b28cdfd38. If /usr/bin/uuidgen exists, * we should probably just use that. * 4bytes-2bytes-2bytes-2bytes-6bytes */ char uuid_packed[16]; unsigned char *p = (unsigned char *) uuid_packed; int fd, rret; fd = open("/dev/urandom", O_RDONLY); if (fd < 0) error_errno("%s: open /dev/urandom", __func__); rret = read_full_ret(fd, uuid_packed, sizeof(uuid_packed)); if (rret < 0) error_errno("%s: read /dev/urandom", __func__); if (rret != sizeof(uuid_packed)) error("%s: short read /dev/urandom", __func__); close(fd); psm_uuid = Malloc(37); /* 16 * 2 + 4 + 1 */ snprintf(psm_uuid, 37, "%02x%02x%02x%02x-%02x%02x-%02x%02x-" "%02x%02x-%02x%02x%02x%02x%02x%02x", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); psm_uuid[36] = '\0'; } /* * Ports on which to talk to listener process for stdout/stderr * connection (if !-nostdout). */ if (stdio_port(1) >= 0) env_add_int("MPIEXEC_STDOUT_PORT", stdio_port(1)); if (stdio_port(2) >= 0) env_add_int("MPIEXEC_STDERR_PORT", stdio_port(2)); /* * Add our hostname too, for use by the redir-helper. And resolve * it now via the user's path for use by the spawns. */ if (HAVE_PBSPRO_HELPER) { env_add("MPIEXEC_HOST", nodes[0].name); mpiexec_redir_helper_path = resolve_exe("mpiexec-redir-helper", 1); } /* now the env as given from pbs */ env_add_environ(); /* if pbs did not give us these, put in some defaults */ env_add_if_not("PATH", _PATH_DEFPATH); env_add_if_not("USER", pswd->pw_name); /* * Set up for totalview attach. Returns local port number that will be * used in command startup to tell processes how to find us. These two * env vars are necessary in all processes. The first tells them to * consume the tv_ready message. The second is checked in MPI_Init to * determine if they should wait for all processes to be attached by * totalview. */ if (cl_args->tview && cl_args->comm == COMM_MPICH2_PMI) { env_add_int("PMI_TOTALVIEW", 1); env_add_int("MPIEXEC_DEBUG", 1); tv_port = tv_startup(task_end - task_start); } /* * Spawn each task, adding its private env vars. * numspawned set to zero earlier before signal handler setup; * both it and i walk the iterations in the loop. */ for (i=task_start; i<task_end; i++) { env_push(); if (cl_args->comm == COMM_MPICH_GM) { /* build proc-specific gmpi_opts in envp */ env_add_int("GMPI_ID", i); env_add_int("MXMPI_ID", i); env_add("GMPI_SLAVE", nodes[tasks[i].node].name); /* 1.2.5..10 */ } if (cl_args->comm == COMM_SHMEM) { /* earlier in get_hosts we checked that there is only one task */ env_add_int("MPICH_NP", tasks[0].num_copies); } if (cl_args->comm == COMM_MPICH_IB || cl_args->comm == COMM_MPICH_RAI) env_add_int("MPIRUN_RANK", i); if (cl_args->comm == COMM_MPICH_IB) { /* hack for topspin adaptation of mvapich 0.9.2 */ env_add("MPIRUN_NODENAME", nodes[tasks[i].node].name); } if (cl_args->comm == COMM_MPICH2_PMI) { /* task id is always 0-based, even for spawn */ env_add_int("PMI_ID", i - task_start); if (strcmp(nodes[tasks[i].node].mpname, nodes[tasks[i].node].name) != 0) env_add("MPICH_INTERFACE_HOSTNAME", nodes[tasks[i].node].mpname); } if (cl_args->comm == COMM_MPICH_PSM) { /* build one big string with everything in it */ char buf[2048]; snprintf(buf, sizeof(buf) - 1, "%d %d %s %d %d %d %d %d %d %d %s", 0, /* protocol version */ 0x4, /* protocol flags, ASYNC_SHUTDOWN=0x4 */ nodes[0].name, /* spawner host */ master_port, /* spawner port */ atoi(jobid), /* spawner jobid */ numtasks, /* COMM_WORLD size */ i - task_start, /* COMM_WORLD rank for this process */ nodes[tasks[i].node].numcpu, /* num local ranks */ tasks[i].cpu_index[0], /* my local rank */ 60, /* timeout... */ psm_uuid); buf[sizeof(buf) - 1] = '\0'; env_add("MPI_SPAWNER", buf); } if (cl_args->comm == COMM_PORTALS) env_add_int("PTL_MY_RID", i); if (cl_args->comm == COMM_NONE) env_add_int("MPIEXEC_RANK", i); /* either no stdin, or just to proc #0, or to all of them */ if (cl_args->which_stdin == STDIN_ONE && i == 0) { env_add_int("MPIEXEC_STDIN_PORT", stdio_port(0)); /* do not add _HOST for p4, since we don't want * the children of the big or remote master to * connect. This _PORT is just for PBS, not for MPICH. */ } if (cl_args->which_stdin == STDIN_ALL) { env_add_int("MPIEXEC_STDIN_PORT", stdio_port(0)); if (cl_args->comm == COMM_MPICH_P4) /* slave processes need to be told which host, as the stdin * connection happens not in pbs_mom, but in mpich/p4 library * code when it spawns each of the other tasks. */ env_add("MPIEXEC_STDIN_HOST", nodes[0].name); } env_terminate(); /* build proc-specific command line */ growstr_zero(g); g->translate_single_quote = 0; /* * Totalview is a bit odd, even hackish perhaps. Send the pid * the just-starting process to ourselves via /dev/tcp, some sort * of virtual device that makes a TCP connection as told and sends * the echoed data. */ if (cl_args->tview && cl_args->comm == COMM_MPICH2_PMI) growstr_printf(g, "if hash nc > /dev/null; then printf %%10d $$ | nc %s %d; else printf %%10d $$ > /dev/tcp/%s/%d; fi; " "if test -d \"%s\"; then cd \"%s\"; fi; exec %s -c ", nodes[0].name, tv_port, nodes[0].name, tv_port, pwd, pwd, user_shell); else growstr_printf(g, "if test -d \"%s\"; then cd \"%s\"; fi; exec %s -c ", pwd, pwd, user_shell); growstr_append(g, "'exec "); g->translate_single_quote = 1; /* * PBSPro environments do not know how to redirect standard streams. * So we fork a helper program that lives in the user's PATH, hopefully * the same place as mpiexec, that does the redirection then execs the * actual executable. This will break on OpenPBS or Torque, although * I guess the redir helper could unset the env vars, but I'd rather * people just didn't use the redir helper in that case. */ if (HAVE_PBSPRO_HELPER) growstr_printf(g, "%s ", mpiexec_redir_helper_path); /* * The executable, or a debugger wrapper around it. In the mpich2 * case we don't need any special args. */ if (cl_args->tview && cl_args->comm != COMM_MPICH2_PMI) { if (i == 0) growstr_printf(g, "%s %s -a -mpichtv", tvname, tasks[i].conf->exe); else growstr_printf(g, "%s -mpichtv", tasks[i].conf->exe); } else growstr_printf(g, "%s", tasks[i].conf->exe); /* process arguments _before_ p4 arguments to allow xterm/gdb hack */ if (tasks[i].conf->args) growstr_printf(g, " %s", tasks[i].conf->args); if (cl_args->comm == COMM_MPICH_P4) { /* * Pass the cwd to ch_p4, else it tries to chdir(exedir). Thanks * to Ben Webb <*****@*****.**> for fixing this. */ growstr_printf(g, " -p4wd %s", pwd); /* The actual flag names are just for debugging; they're not used * but the order is important. */ growstr_printf(g, " -execer_id mpiexec"); growstr_printf(g, " -master_host %s", nodes[tasks[0].node].mpname); growstr_printf(g, " -my_hostname %s", nodes[tasks[i].node].mpname); growstr_printf(g, " -my_nodenum %d", i); growstr_printf(g, " -my_numprocs %d", tasks[i].num_copies); growstr_printf(g, " -total_numnodes %d", numtasks); growstr_printf(g, " -master_port %d", master_port); if (i == 0 && numtasks > 1) { int j; /* list of: <hostname> <procs-on-that-node> */ growstr_printf(g, " -remote_info"); for (j=1; j<numtasks; j++) growstr_printf(g, " %s %d", nodes[tasks[j].node].mpname, tasks[j].num_copies); } } g->translate_single_quote = 0; growstr_printf(g, "'"); /* close quote for 'exec myjob ...' */ nargv[2] = g->s; /* * Dump all the info if sufficiently verbose. */ debug(2, "%s: command to %d/%d %s: %s", __func__, i, numtasks, nodes[tasks[i].node].name, nargv[2]); if (cl_args->verbose > 2) { int j; debug(3, "%s: environment to %d/%d %s", __func__, i, numtasks, nodes[tasks[i].node].name); for (j=0; (cp = envp[j]); j++) printf("env %2d %s\n", j, cp); } if (concurrent_master) { tm_event_t evt; int err; /* Note, would like to add obit immediately, but that is * not allowed until the START message is polled. */ err = tm_spawn(list_count(nargv), nargv, envp, nodes[tasks[i].node].ids[tasks[i].cpu_index[0]], &tasks[i].tid, &evt); if (err != TM_SUCCESS) error_tm(err, "%s: tm_spawn task %d", __func__, i); evt_add(evt, -1, i, EVT_START); } else { concurrent_request_spawn(i, list_count(nargv), nargv, envp, nodes[tasks[i].node].ids[tasks[i].cpu_index[0]]); } tasks[i].done = DONE_NOT; /* has now been started */ env_pop(); ++numspawned; ++numtasks_waiting_start; if (cl_args->comm == COMM_MPICH_P4 && i == 0 && numtasks > 1) { ret = wait_task_start(); if (ret) break; /* don't bother trying to start the rest */ ret = read_p4_master_port(&master_port); if (ret) break; } /* * Pay attention to incoming tasks so they don't time out while * we're starting up all the others, non blocking. */ if (cl_args->comm == COMM_MPICH_IB) { int one = 1; for (;;) { ret = service_ib_startup(one); one = 0; /* only report the new task that first time */ if (ret < 0) { ret = 1; goto out; } if (ret == 0) /* nothing accomplished */ break; } } if (cl_args->comm == COMM_MPICH_GM) { int one = 1; for (;;) { ret = service_gm_startup(one); one = 0; /* only report the new task that first time */ if (ret < 0) { ret = 1; goto out; } if (ret == 0) /* nothing accomplished */ break; } } if (cl_args->comm == COMM_MPICH_PSM) { int one = 1; for (;;) { ret = service_psm_startup(one); one = 0; /* only report the new task that first time */ if (ret < 0) { ret = 1; goto out; } if (ret == 0) /* nothing accomplished */ break; } } if (cl_args->tview && cl_args->comm == COMM_MPICH2_PMI) tv_accept_one(i); } if (cl_args->tview && cl_args->comm == COMM_MPICH2_PMI) tv_complete(); /* don't need these anymore */ free(nargv[0]); free(nargv[1]); growstr_free(g); if (cl_args->comm == COMM_MPICH_PSM) free(psm_uuid); if (ret) goto out; /* * Wait for spawn events and submit obit requests. */ while (numtasks_waiting_start) { ret = wait_task_start(); if (ret) goto out; } debug(1, "All %d task%s (spawn %d) started", task_end - task_start, task_end - task_start > 1 ? "s": "", spawn); /* * Finalize mpi-specific startup protocal, e.g. wait for all tasks to * checkin, perform barrier, etc. */ if (cl_args->comm == COMM_MPICH_GM) ret = read_gm_startup_ports(); if (cl_args->comm == COMM_MPICH_IB) ret = read_ib_startup_ports(); if (cl_args->comm == COMM_MPICH_PSM) ret = read_psm_startup_ports(); if (cl_args->comm == COMM_MPICH_RAI) ret = read_rai_startup_ports(); if (ret == 0) startup_complete = 1; out: return ret; }