Example #1
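/* ddikick.x: spawn one DDI compute process per CPU, plus a matching data
   server, onto the nodes of a PBS job, then poll until every tm_spawn()
   event has completed successfully. */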
   void Kickoff_PBS(const Node_info *ddinodes,const Cmdline_info *info) {
      char ddiinfo[] = "-ddi";
      char procid[8];
      char portid[8];
      char nodeid[8];
      char snodes[8];
      char sprocs[8];
      char **rargs;
      char **argv = info->argv;
      int i,j,r,iarg,nargs = info->ddiarg + info->nnodes + 8;
      int inode,ncpus,np = info->nprocs;
      int ntests;

      if(info->nnodes == 1) return;

      int tm_errno;
      tm_task_id *tid;
      tm_event_t *spawn;
      tm_event_t polled;
      struct tm_roots roots;
      tm_node_id *nodelist;


   /* ---------------------------------- *\
      Initialize PBS Task Management API
   \* ---------------------------------- */
      if(tm_init(0, &roots) != TM_SUCCESS) {
         fprintf(stderr, " ddikick.x: tm_init failed\n");
         Fatal_error(911);
      }

      if(tm_nodeinfo(&nodelist, &np) != TM_SUCCESS) {
         fprintf(stderr, " ddikick.x: tm_nodeinfo failed.\n");
         Fatal_error(911);
      }

      tid   = (tm_task_id *) Malloc(2*np*sizeof(tm_task_id)); 
      spawn = (tm_event_t *) Malloc(2*np*sizeof(tm_event_t));

      for(i=0; i<2*np; i++) {
         *(tid + i)   = TM_NULL_TASK;
         *(spawn + i) = TM_NULL_EVENT;
      }


   /* ----------------------------------------- *\
      Initialize arguments to kickoff DDI tasks
   \* ----------------------------------------- */
      rargs = (char **) Malloc(nargs*sizeof(char*));

      sprintf(portid, "%d", info->kickoffport);
      sprintf(snodes, "%d", info->nnodes);
      sprintf(sprocs, "%d", info->nprocs);     

      for(i=1,r=0; i<info->ddiarg-1; i++) rargs[r++] = argv[i];

      rargs[r++] = ddiinfo;
      rargs[r++] = info->kickoffhost;    /*   kickoff host name     */
      rargs[r++] = portid;               /*   kickoff port number   */
      rargs[r++] = nodeid;               /*   rank of this node     */
      rargs[r++] = procid;               /*   rank of this process  */
      rargs[r++] = snodes;               /*   number of nodes       */
      rargs[r++] = sprocs;               /*   number of processors  */
  
      for(i=0,iarg=info->nodearg; i<info->nnodes; i++,iarg++) {
         rargs[r++] = argv[iarg];
      }   
          
      rargs[r] = NULL;


   /* ------------------------ *\
      Spawn DDI tasks to nodes
   \* ------------------------ */
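   /* i walks the global process ranks, starting past node 0's CPUs;
      ncpus is the first rank beyond node "inode", so inode advances
      each time i crosses that boundary. */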
      ncpus=ddinodes[0].cpus+ddinodes[1].cpus;
      for(i=ddinodes[0].cpus,inode=1; i<np; i++) {
         
         if(i == ncpus) ncpus += ddinodes[++inode].cpus;
         
         sprintf(nodeid,"%d",inode);
         sprintf(procid,"%d",i);

       # if DDI_DEBUG
         DEBUG_START(DEBUG_MAX)
         fprintf(stdout,"DDI Process %i PBS tm_spawn arguments: ",i);
         for(iarg=0; iarg<r; iarg++) fprintf(stdout,"%s ",rargs[iarg]);
         fprintf(stdout,"\n");
         DEBUG_END()
       # endif

      /* ------------------------- *\
         Spawn DDI Compute Process
      \* ------------------------- */
         if(tm_spawn(r,rargs,NULL,*(nodelist+i),(tid+i),spawn+i) != TM_SUCCESS) {
            fprintf(stderr," ddikick.x: tm_spawn failed.\n");
            Fatal_error(911);
         }


      /* ---------------------------------- *\
         No data server on single node runs
      \* ---------------------------------- */
         if(info->nnodes == 1) continue;


      /* set the data server's rank (and procid within rargs) before it
         is printed or passed to tm_spawn */
         j = i+np;
         sprintf(procid,"%d",j);

       # if DDI_DEBUG
         DEBUG_START(DEBUG_MAX)
         fprintf(stdout,"DDI Process %i PBS tm_spawn arguments: ",j);
         for(iarg=0; iarg<r; iarg++) fprintf(stdout,"%s ",rargs[iarg]);
         fprintf(stdout,"\n");
         DEBUG_END()
       # endif
         
      /* --------------------- *\
         Spawn DDI Data Server
      \* --------------------- */
         if(tm_spawn(r,rargs,NULL,*(nodelist+i),(tid+j),spawn+j) != TM_SUCCESS) {
            fprintf(stderr," ddikick.x: tm_spawn failed.\n");
            Fatal_error(911);
      }  }


   /* -------------------------------------------------------- *\
      Poll PBS to ensure each DDI process started successfully
   \* -------------------------------------------------------- */
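   /* ntests counts one event per tm_spawn() issued above; ranks below
      ddinodes[0].cpus were not spawned here and are not polled for. */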
      ntests = np-ddinodes[0].cpus;
      if(USING_DATA_SERVERS())  ntests *= 2;

      for(i=ntests; i--; ) {
         if(tm_poll(TM_NULL_EVENT,&polled,1,&tm_errno) != TM_SUCCESS) {
            fprintf(stderr," ddikick.x: tm_poll failed.\n");
            Fatal_error(911);
         }
         
         for(j=0; j<np; j++) {
            if(polled == *(spawn+j)) {
               if(tm_errno) {
                  fprintf(stderr," ddikick.x: error spawning DDI task %i.\n",j);
                  Fatal_error(911);
               } else {
                # if DDI_DEBUG
                  DEBUG_START(DEBUG_MAX)
                  fprintf(stdout," ddikick.x: DDI task %i started.\n",j);
                  DEBUG_END()
                # endif
            }  }

            if(info->nnodes == 1) continue;

            if(polled == *(spawn+j+np)) {
               if(tm_errno) {
                  fprintf(stderr," ddikick.x: error spawning DDI task %i.\n",j+np);
                  Fatal_error(911);
               } else {
                # if DDI_DEBUG
                  DEBUG_START(DEBUG_MAX)
                  fprintf(stdout," ddikick.x: DDI task %i started.\n",j+np);
                  DEBUG_END()
                # endif
      }  }  }  }

      
   /* -------------------------------------- *\
      Close the link to the PBS Task Manager
   \* -------------------------------------- */
      tm_finalize();


   /* ---------------- *\
      Free used memory
   \* ---------------- */
      free(tid);
      free(spawn);
      free(rargs);      
   }
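
All four examples follow the same TM call sequence: tm_init() attaches to
the job's pbs_mom, tm_nodeinfo() returns the job's node ids, each
tm_spawn() starts one remote task and hands back an event, tm_poll()
retires those events, and tm_finalize() closes the interface. A minimal
sketch of that lifecycle (a hypothetical standalone kicker, with error
handling reduced to exit(1)):

#include <stdlib.h>
#include "tm.h"      /* PBS Task Management API */

int main(int argc, char *argv[])
{
    struct tm_roots roots;
    tm_node_id *nodelist;
    tm_task_id tid;
    tm_event_t spawned, polled;
    int nnodes, tm_errno;

    if (tm_init(0, &roots) != TM_SUCCESS)               /* attach to pbs_mom */
        exit(1);
    if (tm_nodeinfo(&nodelist, &nnodes) != TM_SUCCESS)  /* job's node ids */
        exit(1);

    /* run argv[1..] as a single task on the first node of the job */
    if (argc < 2 ||
        tm_spawn(argc - 1, argv + 1, NULL, nodelist[0], &tid, &spawned) != TM_SUCCESS)
        exit(1);

    /* block until the spawn event completes, then check its error code */
    if (tm_poll(TM_NULL_EVENT, &polled, 1, &tm_errno) != TM_SUCCESS || tm_errno != 0)
        exit(1);

    tm_finalize();
    return 0;
}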
Example #2
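/* pbsdsh: parse the command-line options, then use tm_spawn() to run the
   given program on some or all nodes of the PBS job, waiting for the
   spawned tasks to finish. */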
int main(

    int   argc,
    char *argv[])

{
    int c;
    int err = 0;
    int ncopies = -1;
    int onenode = -1;
    int rc;

    struct tm_roots rootrot;
    int  nspawned = 0;
    tm_node_id *nodelist;
    int start;
    int stop;
    int sync = 0;

    int pernode = 0;
    char *targethost = NULL;
    char *allnodes;

    struct sigaction act;

    char **ioenv;

    extern int   optind;
    extern char *optarg;

    int posixly_correct_set_by_caller = 0;
    char *envstr;

    id = malloc(60 * sizeof(char));

    if (id == NULL)
    {
        fprintf(stderr, "%s: malloc failed, (%d)\n",
                id,
                errno);

        return(1);
    }

    sprintf(id, "pbsdsh%s",
            ((getenv("PBSDEBUG") != NULL) && (getenv("PBS_TASKNUM") != NULL))
            ? getenv("PBS_TASKNUM")
            : "");

#ifdef __GNUC__
    /* If it's already set, we won't unset it later */

    if (getenv("POSIXLY_CORRECT") != NULL)
        posixly_correct_set_by_caller = 1;

    envstr = strdup("POSIXLY_CORRECT=1");

    putenv(envstr);

#endif

    while ((c = getopt(argc, argv, "c:n:h:osuv")) != EOF)
    {
        switch (c)
        {

        case 'c':

            ncopies = atoi(optarg);

            if (ncopies <= 0)
            {
                err = 1;
            }

            break;

        case 'h':

            targethost = strdup(optarg); /* run on this 1 hostname */

            break;

        case 'n':

            onenode = atoi(optarg);

            if (onenode < 0)
            {
                err = 1;
            }

            break;

        case 'o':

            grabstdio = 1;

            break;

        case 's':

            sync = 1; /* force synchronous spawns */

            break;

        case 'u':

            pernode = 1; /* run once per node (unique hostnames) */

            break;

        case 'v':

            verbose = 1; /* turn on verbose output */

            break;

        default:

            err = 1;

            break;
        }  /* END switch (c) */

    }    /* END while ((c = getopt()) != EOF) */

    if ((err != 0) || ((onenode >= 0) && (ncopies >= 1)))
    {
        fprintf(stderr, "Usage: %s [-c copies][-o][-s][-u][-v] program [args]...]\n",
                argv[0]);

        fprintf(stderr, "       %s [-n nodenumber][-o][-s][-u][-v] program [args]...\n",
                argv[0]);

        fprintf(stderr, "       %s [-h hostname][-o][-v] program [args]...\n",
                argv[0]);

        fprintf(stderr, "Where -c copies =  run  copy of \"args\" on the first \"copies\" nodes,\n");
        fprintf(stderr, "      -n nodenumber = run a copy of \"args\" on the \"nodenumber\"-th node,\n");
        fprintf(stderr, "      -o = capture stdout of processes,\n");
        fprintf(stderr, "      -s = forces synchronous execution,\n");
        fprintf(stderr, "      -u = run on unique hostnames,\n");
        fprintf(stderr, "      -h = run on this specific hostname,\n");
        fprintf(stderr, "      -v = forces verbose output.\n");

        exit(1);
    }

#ifdef __GNUC__
    if (!posixly_correct_set_by_caller)
    {
        putenv("POSIXLY_CORRECT");
        free(envstr);
    }

#endif


    if (getenv("PBS_ENVIRONMENT") == NULL)
    {
        fprintf(stderr, "%s: not executing under PBS\n",
                id);

        return(1);
    }


    /*
     * Set up interface to the Task Manager
     */

    if ((rc = tm_init(0, &rootrot)) != TM_SUCCESS)
    {
        fprintf(stderr, "%s: tm_init failed, rc = %s (%d)\n",
                id,
                get_ecname(rc),
                rc);

        return(1);
    }

    sigemptyset(&allsigs);

    sigaddset(&allsigs, SIGHUP);
    sigaddset(&allsigs, SIGINT);
    sigaddset(&allsigs, SIGTERM);

    act.sa_mask = allsigs;
    act.sa_flags = 0;

    /* We want to abort system calls and call a function. */

#ifdef SA_INTERRUPT
    act.sa_flags |= SA_INTERRUPT;
#endif
    act.sa_handler = bailout;
    sigaction(SIGHUP, &act, NULL);
    sigaction(SIGINT, &act, NULL);
    sigaction(SIGTERM, &act, NULL);

#ifdef DEBUG

    if (rootrot.tm_parent == TM_NULL_TASK)
    {
        fprintf(stderr, "%s: I am the mother of all tasks\n",
                id);
    }
    else
    {
        fprintf(stderr, "%s: I am but a child in the scheme of things\n",
                id);
    }

#endif /* DEBUG */

    if ((rc = tm_nodeinfo(&nodelist, &numnodes)) != TM_SUCCESS)
    {
        fprintf(stderr, "%s: tm_nodeinfo failed, rc = %s (%d)\n",
                id,
                get_ecname(rc),
                rc);

        return(1);
    }

    /* nifty unique/hostname code */
    if (pernode || targethost)
    {
        allnodes = gethostnames(nodelist);

        if (targethost)
        {
            onenode = findtargethost(allnodes, targethost);
        }
        else
        {
            numnodes = uniquehostlist(nodelist, allnodes);
        }

        free(allnodes);

        if (targethost)
            free(targethost);
    }

    /* We already checked the lower bounds in the argument processing,
       now we check the upper bounds */

    if ((onenode >= numnodes) || (ncopies > numnodes))
    {
        fprintf(stderr, "%s: only %d nodes available\n",
                id,
                numnodes);

        return(1);
    }

    /* malloc space for various arrays based on number of nodes/tasks */

    tid = (tm_task_id *)calloc(numnodes, sizeof(tm_task_id));

    events_spawn = (tm_event_t *)calloc(numnodes, sizeof(tm_event_t));

    events_obit  = (tm_event_t *)calloc(numnodes, sizeof(tm_event_t));

    ev = (int *)calloc(numnodes, sizeof(int));

    if ((tid == NULL) ||
            (events_spawn == NULL) ||
            (events_obit == NULL) ||
            (ev == NULL))
    {
        /* FAILURE - cannot alloc memory */

        fprintf(stderr, "%s: memory alloc of task ids failed\n",
                id);

        return(1);
    }

    for (c = 0; c < numnodes; c++)
    {
        *(tid + c)          = TM_NULL_TASK;
        *(events_spawn + c) = TM_NULL_EVENT;
        *(events_obit  + c) = TM_NULL_EVENT;
        *(ev + c)           = 0;
    }  /* END for (c) */

    /* Now spawn the program to where it goes */

    if (onenode >= 0)
    {
        /* Spawning one copy onto logical node "onenode" */

        start = onenode;
        stop  = onenode + 1;
    }
    else if (ncopies >= 0)
    {
        /* Spawn a copy of the program to the first "ncopies" nodes */

        start = 0;
        stop  = ncopies;
    }
    else
    {
        /* Spawn a copy on all nodes */

        start = 0;
        stop  = numnodes;
    }

    if ((ioenv = calloc(2, sizeof(char *)))==NULL)
    {
        /* FAILURE - cannot alloc memory */

        fprintf(stderr,"%s: memory alloc of ioenv failed\n",
                id);

        return(1);
    }

    if (grabstdio != 0)
    {
        stdoutfd = build_listener(&stdoutport);

        if ((*ioenv = calloc(50, sizeof(char))) == NULL)  /* 50-char buffer for the port string */
        {
            /* FAILURE - cannot alloc memory */

            fprintf(stderr,"%s: memory alloc of *ioenv failed\n",
                    id);

            return(1);
        }

        snprintf(*ioenv,49,"TM_STDOUT_PORT=%d",
                 stdoutport);

        FD_ZERO(&permrfsd);
    }

    sigprocmask(SIG_BLOCK, &allsigs, NULL);

    for (c = start; c < stop; ++c)
    {
        if ((rc = tm_spawn(
                      argc - optind,
                      argv + optind,
                      ioenv,
                      *(nodelist + c),
                      tid + c,
                      events_spawn + c)) != TM_SUCCESS)
        {
            fprintf(stderr, "%s: spawn failed on node %d err %s\n",
                    id,
                    c,
                    get_ecname(rc));
        }
        else
        {
            if (verbose)
                fprintf(stderr, "%s: spawned task %d\n",
                        id,
                        c);

            ++nspawned;

            if (sync)
                wait_for_task(&nspawned); /* one at a time */
        }

    }    /* END for (c) */

    if (sync == 0)
        wait_for_task(&nspawned); /* wait for all to finish */


    /*
     * Terminate interface with Task Manager
     */

    tm_finalize();

    return 0;
}  /* END main() */
Example #3
/* When working in this function, ALWAYS jump to "cleanup" if
 * you encounter an error so that orterun will be woken up and
 * the job can cleanly terminate
 */
static void launch_daemons(int fd, short args, void *cbdata)
{
    orte_job_map_t *map = NULL;
    orte_app_context_t *app;
    orte_node_t *node;
    int proc_vpid_index;
    char *param;
    char **env = NULL;
    char *var;
    char **argv = NULL;
    char **nodeargv;
    int argc = 0;
    int rc;
    orte_std_cntr_t i; 
    char *bin_base = NULL, *lib_base = NULL;
    tm_event_t *tm_events = NULL;
    tm_task_id *tm_task_ids = NULL;
    bool failed_launch = true;
    mode_t current_umask;
    char *nodelist;
    char* vpid_string;
    orte_job_t *daemons, *jdata;
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
    int32_t launchid, *ldptr;
    char *prefix_dir = NULL;

    jdata = state->jdata;

    /* if we are launching debugger daemons, then just go
     * do it - no new daemons will be launched
     */
    if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
        jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* setup the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
     */
    if (orte_do_not_launch) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }
    
    /* Get the map for this job */
    if (NULL == (map = daemons->map)) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    if (0 == map->num_new_daemons) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }
    
    OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                         "%s plm:tm: launching vm",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* Allocate a bunch of TM events to use for tm_spawn()ing */
    tm_events = malloc(sizeof(tm_event_t) * map->num_new_daemons);
    if (NULL == tm_events) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    tm_task_ids = malloc(sizeof(tm_task_id) * map->num_new_daemons);
    if (NULL == tm_task_ids) {
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* add the daemon command (as specified by user) */
    orte_plm_base_setup_orted_cmd(&argc, &argv);

    /* create a list of nodes in this launch */
    nodeargv = NULL;
    for (i = 0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        
        /* if this daemon already exists, don't launch it! */
        if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
            continue;
        }
        
        /* add to list */
        opal_argv_append_nosize(&nodeargv, node->name);
    }
    nodelist = opal_argv_join(nodeargv, ',');
    opal_argv_free(nodeargv);
    
    /* Add basic orted command line options */
    orte_plm_base_orted_append_basic_args(&argc, &argv, "tm",
                                          &proc_vpid_index,
                                          nodelist);
    free(nodelist);
    
    if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
        param = opal_argv_join(argv, ' ');
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:tm: final top-level argv:\n\t%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == param) ? "NULL" : param));
        if (NULL != param) free(param);
    }

    if (!connected) {
        if (ORTE_SUCCESS != plm_tm_connect()) {
            goto cleanup;
        }
        connected = true;
    }

    /* Figure out the basenames for the libdir and bindir.  There is a
       lengthy comment about this in plm_rsh_module.c explaining all
       the rationale for how / why we're doing this. */
    lib_base = opal_basename(opal_install_dirs.libdir);
    bin_base = opal_basename(opal_install_dirs.bindir);

    /* setup environment */
    env = opal_argv_copy(orte_launch_environ);

    /* enable local launch by the orteds */
    (void) mca_base_var_env_name ("plm", &var);
    opal_setenv(var, "rsh", true, &env);
    free(var);
    
    /* add our umask -- see big note in orted.c */
    current_umask = umask(0);
    umask(current_umask);
    (void)asprintf(&var, "0%o", current_umask);
    opal_setenv("ORTE_DAEMON_UMASK_VALUE", var, true, &env);
    free(var);
    
    /* If we have a prefix, then modify the PATH and
       LD_LIBRARY_PATH environment variables. We only allow
       a single prefix to be specified. Since there will
       always be at least one app_context, we take it from
       there
    */
    app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
    orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&prefix_dir, OPAL_STRING);
    if (NULL != prefix_dir) {
        char *newenv;
        
        for (i = 0; NULL != env && NULL != env[i]; ++i) {
            /* Reset PATH */
            if (0 == strncmp("PATH=", env[i], 5)) {
                (void)asprintf(&newenv, "%s/%s:%s", 
                               prefix_dir, bin_base, env[i] + 5);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                     "%s plm:tm: resetting PATH: %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     newenv));
                opal_setenv("PATH", newenv, true, &env);
                free(newenv);
            } 
            
            /* Reset LD_LIBRARY_PATH */
            else if (0 == strncmp("LD_LIBRARY_PATH=", env[i], 16)) {
                (void)asprintf(&newenv, "%s/%s:%s", 
                               prefix_dir, lib_base, env[i] + 16);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                     "%s plm:tm: resetting LD_LIBRARY_PATH: %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     newenv));
                opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
                free(newenv);
            } 
        }
        free(prefix_dir);
    }
    
    /* Iterate through each of the nodes and spin
     * up a daemon.
     */
    ldptr = &launchid;
    for (i = 0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        /* if this daemon already exists, don't launch it! */
        if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
            continue;
        }
 
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:tm: launching on node %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             node->name));
        
        /* setup process name */
        rc = orte_util_convert_vpid_to_string(&vpid_string, node->daemon->name.vpid);
        if (ORTE_SUCCESS != rc) {
            opal_output(0, "plm:tm: unable to get daemon vpid as string");
            /* per the comment at the top of this function, jump to
               "cleanup" instead of exiting so orterun is woken up */
            goto cleanup;
        }
        free(argv[proc_vpid_index]);
        argv[proc_vpid_index] = strdup(vpid_string);
        free(vpid_string);
        
        /* exec the daemon */
        if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
            param = opal_argv_join(argv, ' ');
            OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                 "%s plm:tm: executing:\n\t%s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (NULL == param) ? "NULL" : param));
            if (NULL != param) free(param);
        }
        
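        /* the node id that tm_spawn() needs as its "where" argument is
           carried on the node object as its ORTE_NODE_LAUNCH_ID attribute */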
        launchid = 0;
        if (!orte_get_attribute(&node->attributes, ORTE_NODE_LAUNCH_ID, (void**)&ldptr, OPAL_INT32)) {
            orte_show_help("help-plm-tm.txt", "tm-spawn-failed", true, argv[0], node->name, 0);
            rc = ORTE_ERROR;
            goto cleanup;
        }
        rc = tm_spawn(argc, argv, env, launchid, tm_task_ids + launched, tm_events + launched);
        if (TM_SUCCESS != rc) {
            orte_show_help("help-plm-tm.txt", "tm-spawn-failed", true, argv[0], node->name, launchid);
            rc = ORTE_ERROR;
            goto cleanup;
        }
        
        launched++;
    }

    /* indicate that the daemons for this job were launched */
    state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
    daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;

    /* flag that launch was successful, so far as we currently know */
    failed_launch = false;

    OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                         "%s plm:tm:launch: finished spawning orteds",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

 cleanup:
    /* cleanup */
    OBJ_RELEASE(state);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_FAILED_TO_START);
    }
}
Example #4
/*
 * Start the tasks, with much stuff in the environment.  If concurrent
 * master, this could be on behalf of some other mpiexec to which we
 * will forward any event/error results.
 */
int
start_tasks(int spawn)
{
    int i, ret = 0;
    char *nargv[3];
    char pwd[PATH_MAX];
    char *cp;
    int conns[3];  /* expected connections to the stdio process */
    int master_port = 0;
    const char *user_shell;
    growstr_t *g;
    int gmpi_port[2];
    int pmi_fd;
    int task_start, task_end;
    const char *mpiexec_redir_helper_path;
    char *psm_uuid = NULL;
    int tv_port = 0;

    /* for looping from 0..numtasks in the case of MPI_Spawn */
    task_start = spawns[spawn].task_start;
    task_end = spawns[spawn].task_end;

    /*
     * Get the pwd.  Probably can trust libc not to overflow this,
     * but who knows.
     */
    if (!getcwd(pwd, sizeof(pwd)))
	error("%s: no current working directory", __func__);
    pwd[sizeof(pwd)-1] = '\0';

    /*
     * Eventually use the user's preferred shell.
     */
    if ((cp = getenv("SHELL")))
	user_shell = cp;
    else if (pswd->pw_shell)
	user_shell = pswd->pw_shell;
    else
	user_shell = "/bin/sh";  /* assume again */

    /*
     * Rewrite argv to go through user's shell, just like rsh.
     *   $SHELL, "-c", "cd <path>; exec <argv0> <argv1>..."
     * But to change the working dir and not frighten weak shells like tcsh,
     * we must detect that the dir actually exists on the far side before
     * committing to the cd.  Use /bin/sh for this task, hoping it exists
     * everywhere we'll be.  Then there's also a bit of quoting nightmare
     * to handle too.  So we'll end up with:
     * rsh node "/bin/sh -c 'if test -d $dir ; then cd $dir ; fi ; $SHELL -c
     *         \'exec argv0 argv1 ...\''"
     * but with argv* (including the executable, argv0) changed to replace
     * all occurrences of ' with '\''.
     */
    nargv[0] = strsave("/bin/sh");  /* assume this exists everywhere */
    nargv[1] = strsave("-c");

    /* exec_line constructed for each process */
    g = growstr_init();

    /*
     * Start stdio stream handler process, if anybody gets stdin,
     * or !nostdout.
     */
    if (cl_args->which_stdin == STDIN_NONE)
	conns[0] = 0;
    else if (cl_args->which_stdin == STDIN_ONE) {
	if (spawn == 0)
	    conns[0] = 1;
	else
	    conns[0] = 0;  /* already connected the single stdin */
    } else if (cl_args->which_stdin == STDIN_ALL) {
	/* total processes which connect stdin */
	conns[0] = 0;
	for (i=task_start; i<task_end; i++)
	    conns[0] += tasks[i].num_copies;
    }

    if (cl_args->nostdout)
	conns[1] = conns[2] = 0;
    else
	/* even for p4 and shmem, not with multiplicity */
  	conns[1] = conns[2] = task_end - task_start;

    /*
     * Initialize listener sockets for gm and ib, since these will be
     * used to implement MPI_Abort in the stdio listener later.
     */
    if (cl_args->comm == COMM_MPICH_GM) {
	prepare_gm_startup_ports(gmpi_port);
    } else if (cl_args->comm == COMM_MPICH_IB) {
	master_port = prepare_ib_startup_port(&gmpi_fd[0]);
	gmpi_fd[1] = -1;
    } else if (cl_args->comm == COMM_MPICH_PSM) {
	master_port = prepare_psm_startup_port(&gmpi_fd[0]);
	gmpi_fd[1] = -1;
    } else if (cl_args->comm == COMM_MPICH_RAI) {
	master_port = prepare_rai_startup_port();
	gmpi_fd[0] = -1;
	gmpi_fd[1] = -1;
    } else {
	gmpi_fd[0] = -1;
	gmpi_fd[1] = -1;
    }

    pmi_fd = -1;
    if (cl_args->comm == COMM_MPICH2_PMI) {
	/* stdio listener handles all PMI activity, even startup */
	if (spawn == 0)
	    master_port = prepare_pmi_startup_port(&pmi_fd);
	else
	    master_port = stdio_msg_parent_say_more_tasks(
	                    task_end - task_start, conns);
    }

    /* flush output buffer, else forked child will have the output too */
    fflush(stdout);

    /* fork the listener (unless we're just spawning more tasks) */
    if (spawn == 0)
	stdio_fork(conns, gmpi_fd, pmi_fd);

    if (pmi_fd >= 0)
	close(pmi_fd);  /* child has it now */

    numtasks_waiting_start = 0;
    if (cl_args->comm == COMM_NONE)
	/* do not complain if they exit before all other tasks are up */
	startup_complete = 1;
    else
	startup_complete = 0;

    /*
     * Start signal handling _after_ stdio child is up.
     */
    handle_signals(0, 0, killall);

    /*
     * environment variables common to all tasks
     */
    env_init();

    /* override user env with these */
    if (cl_args->comm == COMM_MPICH_GM) {
	env_add_int("GMPI_MAGIC", atoi(jobid));
	/* PBS always gives us the "mother superior" node first in the list */
	env_add("GMPI_MASTER", nodes[0].name);
	env_add_int("GMPI_PORT", gmpi_port[0]);   /* 1.2.5..10 */
	env_add_int("GMPI_PORT1", gmpi_port[0]);  /* 1.2.4..8a */
	env_add_int("GMPI_PORT2", gmpi_port[1]);
	env_add_int("GMPI_NP", numtasks);
	env_add_int("GMPI_BOARD", -1);

	/* ditto for new MX version */
	env_add_int("MXMPI_MAGIC", atoi(jobid));
	env_add("MXMPI_MASTER", nodes[0].name);
	env_add_int("MXMPI_PORT", gmpi_port[0]);
	env_add_int("MXMPI_NP", numtasks);
	env_add_int("MXMPI_BOARD", -1);

	/* for MACOSX to override default malloc */
	env_add_int("DYLD_FORCE_FLAT_NAMESPACE", 1);
    }

    if (cl_args->comm == COMM_EMP) {
	growstr_t *emphosts = growstr_init();
	for (i=0; i<numtasks; i++)
	    growstr_printf(emphosts, "%s%s", (i > 0 ? " " : ""),
	      nodes[tasks[i].node].mpname);
	env_add("EMPHOSTS", emphosts->s);
	growstr_free(emphosts);
    }
    
    if (cl_args->comm == COMM_MPICH_IB || cl_args->comm == COMM_MPICH_RAI) {
	int len;
	char *cq, *cr;
	env_add("MPIRUN_HOST", nodes[0].name);  /* master address */
	env_add_int("MPIRUN_PORT", master_port);
	env_add_int("MPIRUN_NPROCS", numtasks);
	env_add_int("MPIRUN_ID", atoi(jobid));  /* global job id */
	/*
	 * pmgr_version >= 3 needs this terribly long string in every task.
	 * Since it may be quite large, we do the allocation by hand and
	 * skip some growstr overhead.
	 */
	len = numtasks;  /* separating colons and terminal \0 */
	for (i=0; i<numtasks; i++)
	    len += strlen(nodes[tasks[i].node].name);
	cq = cp = Malloc(len);
	for (i=0; i<numtasks; i++) {
	    for (cr=nodes[tasks[i].node].name; *cr; cr++)
		*cq++ = *cr;
	    *cq++ = ':';
	}
	--cq;
	*cq = '\0';
	env_add("MPIRUN_PROCESSES", cp);
	free(cp);
    }

    if (cl_args->comm == COMM_MPICH2_PMI) {
	growstr_t *hp = growstr_init();
	growstr_printf(hp, "%s:%d", nodes[0].name, master_port);
	env_add("PMI_PORT", hp->s);
	growstr_free(hp);
	if (spawn > 0)
	    env_add_int("PMI_SPAWNED", 1);
    }

    if (cl_args->comm == COMM_PORTALS) {
	growstr_t *nidmap = growstr_init();
	growstr_t *pidmap = growstr_init();
	portals_build_nidpid_maps(spawn, nidmap, pidmap);
	env_add("PTL_NIDMAP", nidmap->s);
	env_add("PTL_PIDMAP", pidmap->s);
	growstr_free(nidmap);
	growstr_free(pidmap);
    	env_add("PTL_IFACE", "eth0");  /* XXX: no way to know */
    }

    if (cl_args->comm == COMM_MPICH_P4 && numtasks > 1)
	master_port = prepare_p4_master_port();

    if (cl_args->comm == COMM_MPICH_PSM) {
	/* We need to generate a uuid of the form
	 * 9dea0f22-39a4-462a-80c9-b60b28cdfd38.  If /usr/bin/uuidgen exists,
	 * we should probably just use that.
	 * 4bytes-2bytes-2bytes-2bytes-6bytes
	 */
	char uuid_packed[16];
	unsigned char *p = (unsigned char *) uuid_packed;
	int fd, rret;
	
	fd = open("/dev/urandom", O_RDONLY);
	if (fd < 0)
	    error_errno("%s: open /dev/urandom", __func__);
	rret = read_full_ret(fd, uuid_packed, sizeof(uuid_packed));
	if (rret < 0)
	    error_errno("%s: read /dev/urandom", __func__);
	if (rret != sizeof(uuid_packed))
	    error("%s: short read /dev/urandom", __func__);
	close(fd);
	psm_uuid = Malloc(37);  /* 16 * 2 + 4 + 1 */
	snprintf(psm_uuid, 37,
		 "%02x%02x%02x%02x-%02x%02x-%02x%02x-"
		 "%02x%02x-%02x%02x%02x%02x%02x%02x",
		 p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
		 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
	psm_uuid[36] = '\0';
    }

    /*
     * Ports on which to talk to listener process for stdout/stderr
     * connection (if !-nostdout).
     */
    if (stdio_port(1) >= 0)
	env_add_int("MPIEXEC_STDOUT_PORT", stdio_port(1));
    if (stdio_port(2) >= 0)
	env_add_int("MPIEXEC_STDERR_PORT", stdio_port(2));

    /*
     * Add our hostname too, for use by the redir-helper.  And resolve
     * it now via the user's path for use by the spawns.
     */
    if (HAVE_PBSPRO_HELPER) {
	env_add("MPIEXEC_HOST", nodes[0].name);
	mpiexec_redir_helper_path = resolve_exe("mpiexec-redir-helper", 1);
    }


    /* now the env as given from pbs */
    env_add_environ();

    /* if pbs did not give us these, put in some defaults */
    env_add_if_not("PATH", _PATH_DEFPATH);
    env_add_if_not("USER", pswd->pw_name);


    /*
     * Set up for totalview attach.  Returns local port number that will be
     * used in command startup to tell processes how to find us.  These two
     * env vars are necessary in all processes.  The first tells them to
     * consume the tv_ready message.  The second is checked in MPI_Init to
     * determine if they should wait for all processes to be attached by
     * totalview.
     */
    if (cl_args->tview && cl_args->comm == COMM_MPICH2_PMI) {
	env_add_int("PMI_TOTALVIEW", 1);
	env_add_int("MPIEXEC_DEBUG", 1);
	tv_port = tv_startup(task_end - task_start);
    }

    /*
     * Spawn each task, adding its private env vars.
     * numspawned set to zero earlier before signal handler setup;
     * both it and i walk the iterations in the loop.
     */
    for (i=task_start; i<task_end; i++) {
	env_push();
	if (cl_args->comm == COMM_MPICH_GM) {
	    /* build proc-specific gmpi_opts in envp */
	    env_add_int("GMPI_ID", i);
	    env_add_int("MXMPI_ID", i);
	    env_add("GMPI_SLAVE", nodes[tasks[i].node].name);  /* 1.2.5..10 */
	}
	if (cl_args->comm == COMM_SHMEM) {
	    /* earlier in get_hosts we checked that there is only one task */
	    env_add_int("MPICH_NP", tasks[0].num_copies);
	}

	if (cl_args->comm == COMM_MPICH_IB || cl_args->comm == COMM_MPICH_RAI)
	    env_add_int("MPIRUN_RANK", i);

	if (cl_args->comm == COMM_MPICH_IB) {
	    /* hack for topspin adaptation of mvapich 0.9.2 */
	    env_add("MPIRUN_NODENAME", nodes[tasks[i].node].name);
	}

	if (cl_args->comm == COMM_MPICH2_PMI) {
	    /* task id is always 0-based, even for spawn  */
	    env_add_int("PMI_ID", i - task_start);
	    if (strcmp(nodes[tasks[i].node].mpname,
	               nodes[tasks[i].node].name) != 0)
		env_add("MPICH_INTERFACE_HOSTNAME",
		        nodes[tasks[i].node].mpname);
	}

	if (cl_args->comm == COMM_MPICH_PSM) {
	    /* build one big string with everything in it */
	    char buf[2048];
	    snprintf(buf, sizeof(buf) - 1,
		     "%d %d %s %d %d %d %d %d %d %d %s",
		     0,    /* protocol version */
		     0x4,  /* protocol flags, ASYNC_SHUTDOWN=0x4 */
		     nodes[0].name,  /* spawner host */
		     master_port,    /* spawner port */
		     atoi(jobid),    /* spawner jobid */
		     numtasks,       /* COMM_WORLD size */
		     i - task_start, /* COMM_WORLD rank for this process */
		     nodes[tasks[i].node].numcpu, /* num local ranks */
		     tasks[i].cpu_index[0],       /* my local rank */
		     60, /* timeout... */
		     psm_uuid);
	    buf[sizeof(buf) - 1] = '\0';
	    env_add("MPI_SPAWNER", buf);
	}

	if (cl_args->comm == COMM_PORTALS)
	    env_add_int("PTL_MY_RID", i);

	if (cl_args->comm == COMM_NONE)
	    env_add_int("MPIEXEC_RANK", i);
        
	/* either no stdin, or just to proc #0, or to all of them */
	if (cl_args->which_stdin == STDIN_ONE && i == 0) {
	    env_add_int("MPIEXEC_STDIN_PORT", stdio_port(0));
	    /* do not add _HOST for p4, since we don't want
	     * the children of the big or remote master to
	     * connect.  This _PORT is just for PBS, not for MPICH.  */
	}
	if (cl_args->which_stdin == STDIN_ALL) {
	    env_add_int("MPIEXEC_STDIN_PORT", stdio_port(0));
	    if (cl_args->comm == COMM_MPICH_P4)
		/* slave processes need to be told which host, as the stdin
		 * connection happens not in pbs_mom, but in mpich/p4 library
		 * code when it spawns each of the other tasks. */
		env_add("MPIEXEC_STDIN_HOST", nodes[0].name);
	}

	env_terminate();

	/* build proc-specific command line */
	growstr_zero(g);
	g->translate_single_quote = 0;

	/*
	 * Totalview is a bit odd, even hackish perhaps.  Send the pid
	 * the just-starting process to ourselves via /dev/tcp, some sort
	 * of virtual device that makes a TCP connection as told and sends
	 * the echoed data.
	 */
	if (cl_args->tview && cl_args->comm == COMM_MPICH2_PMI)
	    growstr_printf(g, "if hash nc > /dev/null; then printf %%10d $$ | nc %s %d; else printf %%10d $$ > /dev/tcp/%s/%d; fi; "
	    		   "if test -d \"%s\"; then cd \"%s\"; fi; exec %s -c ",
			   nodes[0].name, tv_port,
			   nodes[0].name, tv_port,
			   pwd, pwd, user_shell);
	else
	    growstr_printf(g,
			   "if test -d \"%s\"; then cd \"%s\"; fi; exec %s -c ",
			   pwd, pwd, user_shell);
	growstr_append(g, "'exec ");
	g->translate_single_quote = 1;

	/*
	 * PBSPro environments do not know how to redirect standard streams.
	 * So we fork a helper program that lives in the user's PATH, hopefully
	 * the same place as mpiexec, that does the redirection then execs the
	 * actual executable.  This will break on OpenPBS or Torque, although
	 * I guess the redir helper could unset the env vars, but I'd rather
	 * people just didn't use the redir helper in that case.
	 */
	if (HAVE_PBSPRO_HELPER)
	    growstr_printf(g, "%s ", mpiexec_redir_helper_path);

	/*
	 * The executable, or a debugger wrapper around it.  In the mpich2
	 * case we don't need any special args.
	 */
	if (cl_args->tview && cl_args->comm != COMM_MPICH2_PMI) {
	    if (i == 0)
		growstr_printf(g, "%s %s -a -mpichtv", tvname,
			       tasks[i].conf->exe);
	    else
		growstr_printf(g, "%s -mpichtv", tasks[i].conf->exe);
	} else
	    growstr_printf(g, "%s", tasks[i].conf->exe);

	/* process arguments _before_ p4 arguments to allow xterm/gdb hack */
	if (tasks[i].conf->args)
	    growstr_printf(g, " %s", tasks[i].conf->args);

	if (cl_args->comm == COMM_MPICH_P4) {
	    /*
	     * Pass the cwd to ch_p4, else it tries to chdir(exedir).  Thanks
	     * to Ben Webb <*****@*****.**> for fixing this.
	     */
	    growstr_printf(g, " -p4wd %s", pwd);

	    /* The actual flag names are just for debugging; they're not used
	     * but the order is important. */
	    growstr_printf(g, " -execer_id mpiexec");
	    growstr_printf(g, " -master_host %s", nodes[tasks[0].node].mpname);
	    growstr_printf(g, " -my_hostname %s", nodes[tasks[i].node].mpname);
	    growstr_printf(g, " -my_nodenum %d", i);
	    growstr_printf(g, " -my_numprocs %d", tasks[i].num_copies);
	    growstr_printf(g, " -total_numnodes %d", numtasks);
	    growstr_printf(g, " -master_port %d", master_port);
	    if (i == 0 && numtasks > 1) {
		int j;
		/* list of: <hostname> <procs-on-that-node> */
		growstr_printf(g, " -remote_info");
		for (j=1; j<numtasks; j++)
		    growstr_printf(g, " %s %d",
		      nodes[tasks[j].node].mpname, tasks[j].num_copies);
	    }
	}

	g->translate_single_quote = 0;
	growstr_printf(g, "'");  /* close quote for 'exec myjob ...' */
	nargv[2] = g->s;

	/*
	 * Dump all the info if sufficiently verbose.
	 */
	debug(2, "%s: command to %d/%d %s: %s", __func__, i, numtasks,
	  nodes[tasks[i].node].name, nargv[2]);
	if (cl_args->verbose > 2) {
	    int j;
	    debug(3, "%s: environment to %d/%d %s", __func__, i,
	      numtasks, nodes[tasks[i].node].name);
	    for (j=0; (cp = envp[j]); j++)
		printf("env %2d %s\n", j, cp);
	}

	if (concurrent_master) {
	    tm_event_t evt;
	    int err;

	    /* Note, would like to add obit immediately, but that is
	     * not allowed until the START message is polled.
	     */
	    err = tm_spawn(list_count(nargv), nargv, envp,
	                   nodes[tasks[i].node].ids[tasks[i].cpu_index[0]],
			   &tasks[i].tid, &evt);
	    if (err != TM_SUCCESS)
		error_tm(err, "%s: tm_spawn task %d", __func__, i);
	    evt_add(evt, -1, i, EVT_START);
	} else {
	    concurrent_request_spawn(i, list_count(nargv), nargv, envp,
	      nodes[tasks[i].node].ids[tasks[i].cpu_index[0]]);
	}
	tasks[i].done = DONE_NOT;  /* has now been started */
	env_pop();
	++numspawned;
	++numtasks_waiting_start;

	if (cl_args->comm == COMM_MPICH_P4 && i == 0 && numtasks > 1) {
	    ret = wait_task_start();
	    if (ret)
		break;  /* don't bother trying to start the rest */
	    ret = read_p4_master_port(&master_port);
	    if (ret)
		break;
	}

	/*
	 * Pay attention to incoming tasks so they don't time out while
	 * we're starting up all the others, non blocking.
	 */
	if (cl_args->comm == COMM_MPICH_IB) {
	    int one = 1;
	    for (;;) {
		ret = service_ib_startup(one);
		one = 0;  /* only report the new task that first time */
		if (ret < 0) {
		    ret = 1;
		    goto out;
		}
		if (ret == 0)  /* nothing accomplished */
		    break;
	    }
	}
	if (cl_args->comm == COMM_MPICH_GM) {
	    int one = 1;
	    for (;;) {
		ret = service_gm_startup(one);
		one = 0;  /* only report the new task that first time */
		if (ret < 0) {
		    ret = 1;
		    goto out;
		}
		if (ret == 0)  /* nothing accomplished */
		    break;
	    }
	}
	if (cl_args->comm == COMM_MPICH_PSM) {
	    int one = 1;
	    for (;;) {
		ret = service_psm_startup(one);
		one = 0;  /* only report the new task that first time */
		if (ret < 0) {
		    ret = 1;
		    goto out;
		}
		if (ret == 0)  /* nothing accomplished */
		    break;
	    }
	}
	if (cl_args->tview && cl_args->comm == COMM_MPICH2_PMI)
	    tv_accept_one(i);
    }

    if (cl_args->tview && cl_args->comm == COMM_MPICH2_PMI)
       tv_complete();

    /* don't need these anymore */
    free(nargv[0]);
    free(nargv[1]);
    growstr_free(g);

    if (cl_args->comm == COMM_MPICH_PSM)
	free(psm_uuid);
    if (ret)
	goto out;

    /*
     * Wait for spawn events and submit obit requests.
     */
    while (numtasks_waiting_start) {
	ret = wait_task_start();
	if (ret)
	    goto out;
    }

    debug(1, "All %d task%s (spawn %d) started", task_end - task_start,
          task_end - task_start > 1 ? "s": "", spawn);

    /*
     * Finalize mpi-specific startup protocol, e.g. wait for all tasks to
     * check in, perform barrier, etc.
     */
    if (cl_args->comm == COMM_MPICH_GM)
	ret = read_gm_startup_ports();

    if (cl_args->comm == COMM_MPICH_IB)
	ret = read_ib_startup_ports();

    if (cl_args->comm == COMM_MPICH_PSM)
	ret = read_psm_startup_ports();

    if (cl_args->comm == COMM_MPICH_RAI)
	ret = read_rai_startup_ports();

    if (ret == 0)
	startup_complete = 1;

  out:
    return ret;
}