Beispiel #1
0
   /* -------------------------------------------------------------------- *\
      Kickoff_PBS(ddinodes,info)
      ==========================
      [IN] ddinodes - node table; only the .cpus field of each entry is
                      read here.
      [IN] info     - parsed command line (argv, ddiarg, nodearg, nnodes,
                      nprocs, kickoffhost, kickoffport are read).

      Uses the PBS Task Management API to spawn the DDI processes that do
      not belong to node 0: for every process slot i >= ddinodes[0].cpus
      one compute process (rank i) and one data server (rank i+np) are
      spawned on the same PBS node, then tm_poll is used to collect the
      spawn events.  Returns immediately on single-node runs.  Any TM
      failure is reported on stderr and passed to Fatal_error(911).
   \* -------------------------------------------------------------------- */
   void Kickoff_PBS(const Node_info *ddinodes,const Cmdline_info *info) {
      char ddiinfo[] = "-ddi";
      char procid[8];
      char portid[8];
      char nodeid[8];
      char snodes[8];
      char sprocs[8];
      char **rargs;
      char **argv = info->argv;
      int i,j,r,iarg,nargs = info->ddiarg + info->nnodes + 8;
      int inode,ncpus,np = info->nprocs;
      int ntests;

      int tm_errno;
      tm_task_id *tid;
      tm_event_t *spawn;
      tm_event_t polled;
      struct tm_roots roots;
      tm_node_id *nodelist;

   /* ------------------------------------------------ *\
      Nothing to spawn remotely when only one node is
      used; every later step assumes nnodes > 1.
   \* ------------------------------------------------ */
      if(info->nnodes == 1) return;


   /* ---------------------------------- *\
      Initialize PBS Task Management API
   \* ---------------------------------- */
      if(tm_init(0, &roots) != TM_SUCCESS) {
         fprintf(stderr, " ddikick.x: tm_init failed\n");
         Fatal_error(911);
      }

   /* NOTE: np is overwritten here with the node count reported by PBS */
      if(tm_nodeinfo(&nodelist, &np) != TM_SUCCESS) {
         fprintf(stderr, " ddikick.x: tm_nodeinfo failed.\n");
         Fatal_error(911);
      }

   /* Slots [0,np) hold compute processes, [np,2*np) hold data servers */
      tid   = (tm_task_id *) Malloc(2*np*sizeof(tm_task_id));
      spawn = (tm_event_t *) Malloc(2*np*sizeof(tm_event_t));

      for(i=0; i<2*np; i++) {
         tid[i]   = TM_NULL_TASK;
         spawn[i] = TM_NULL_EVENT;
      }


   /* ----------------------------------------- *\
      Initialize arguments to kickoff DDI tasks
   \* ----------------------------------------- */
      rargs = (char **) Malloc(nargs*sizeof(char*));

   /* Bounded formatting: the rank/port buffers are only 8 bytes wide */
      snprintf(portid, sizeof(portid), "%d", info->kickoffport);
      snprintf(snodes, sizeof(snodes), "%d", info->nnodes);
      snprintf(sprocs, sizeof(sprocs), "%d", info->nprocs);

      for(i=1,r=0; i<info->ddiarg-1; i++) rargs[r++] = argv[i];

      rargs[r++] = ddiinfo;
      rargs[r++] = info->kickoffhost;    /*   kickoff host name     */
      rargs[r++] = portid;               /*   kickoff port number   */
      rargs[r++] = nodeid;               /*   rank of this node     */
      rargs[r++] = procid;               /*   rank of this process  */
      rargs[r++] = snodes;               /*   number of nodes       */
      rargs[r++] = sprocs;               /*   number of processors  */

      for(i=0,iarg=info->nodearg; i<info->nnodes; i++,iarg++) {
         rargs[r++] = argv[iarg];
      }

      rargs[r] = NULL;


   /* ------------------------------------------------------- *\
      Spawn DDI tasks to nodes
      nodeid/procid are referenced by rargs, so rewriting the
      buffers changes the arguments of the next tm_spawn call.
      ncpus is the first process index belonging to the node
      after 'inode'.
   \* ------------------------------------------------------- */
      ncpus=ddinodes[0].cpus+ddinodes[1].cpus;
      for(i=ddinodes[0].cpus,inode=1; i<np; i++) {

         if(i == ncpus) ncpus += ddinodes[++inode].cpus;

         snprintf(nodeid,sizeof(nodeid),"%d",inode);
         snprintf(procid,sizeof(procid),"%d",i);

       # if DDI_DEBUG
         DEBUG_START(DEBUG_MAX)
         fprintf(stdout,"DDI Process %i PBS tm_spawn arguments: ",i);
         for(iarg=0; iarg<r; iarg++) fprintf(stdout,"%s ",rargs[iarg]);
         fprintf(stdout,"\n");
         DEBUG_END()
       # endif

      /* ------------------------- *\
         Spawn DDI Compute Process
      \* ------------------------- */
         if(tm_spawn(r,rargs,NULL,*(nodelist+i),(tid+i),spawn+i) != TM_SUCCESS) {
            fprintf(stderr," ddikick.x: tm_spawn failed.\n");
            Fatal_error(911);
         }

      /* ----------------------------------------------------- *\
         Data server rank: set j and procid BEFORE the debug
         dump so the dump shows the arguments actually spawned
      \* ----------------------------------------------------- */
         j = i+np;
         snprintf(procid,sizeof(procid),"%d",j);

       # if DDI_DEBUG
         DEBUG_START(DEBUG_MAX)
         fprintf(stdout,"DDI Process %i PBS tm_spawn arguments: ",j);
         for(iarg=0; iarg<r; iarg++) fprintf(stdout,"%s ",rargs[iarg]);
         fprintf(stdout,"\n");
         DEBUG_END()
       # endif

      /* --------------------- *\
         Spawn DDI Data Server
      \* --------------------- */
         if(tm_spawn(r,rargs,NULL,*(nodelist+i),(tid+j),spawn+j) != TM_SUCCESS) {
            fprintf(stderr," ddikick.x: tm_spawn failed.\n");
            Fatal_error(911);
         }
      }


   /* -------------------------------------------------------- *\
      Poll PBS to ensure each DDI process started successfully
   \* -------------------------------------------------------- */
      ntests = np-ddinodes[0].cpus;
      if(USING_DATA_SERVERS())  ntests *= 2;

      for(i=ntests; i--; ) {
         if(tm_poll(TM_NULL_EVENT,&polled,1,&tm_errno) != TM_SUCCESS) {
            fprintf(stderr," ddikick.x: tm_poll failed.\n");
            Fatal_error(911);
         }

      /* Match the polled event against both the compute and data server slots */
         for(j=0; j<np; j++) {
            if(polled == *(spawn+j)) {
               if(tm_errno) {
                  fprintf(stderr," ddikick.x: error spawning DDI task %i.\n",j);
                  Fatal_error(911);
               } else {
                # if DDI_DEBUG
                  DEBUG_START(DEBUG_MAX)
                  fprintf(stdout," ddikick.x: DDI task %i started.\n",j);
                  DEBUG_END()
                # endif
               }
            }

            if(polled == *(spawn+j+np)) {
               if(tm_errno) {
                  fprintf(stderr," ddikick.x: error spawning DDI task %i.\n",j+np);
                  Fatal_error(911);
               } else {
                # if DDI_DEBUG
                  DEBUG_START(DEBUG_MAX)
                  fprintf(stdout," ddikick.x: DDI task %i started.\n",j+np);
                  DEBUG_END()
                # endif
               }
            }
         }
      }


   /* -------------------------------------- *\
      Close the link to the PBS Task Manager
   \* -------------------------------------- */
      tm_finalize();


   /* -------------------------------------------------------- *\
      Free used memory
      nodelist is the malloc'ed array returned by tm_nodeinfo
      (per the tm(3) manual page) and is owned by the caller.
   \* -------------------------------------------------------- */
      free(nodelist);
      free(tid);
      free(spawn);
      free(rargs);
   }
Beispiel #2
0
/*
 * pbsdsh entry point: spawn a program on the nodes of the current PBS job
 * through the Task Management (TM) interface.
 *
 * Options (see the usage text below):
 *   -c copies   spawn on the first "copies" nodes
 *   -n node     spawn on the single logical node "node"
 *   -h host     spawn on the node matching this hostname
 *   -o          capture stdout of the spawned processes
 *   -s          synchronous spawns (wait after each one)
 *   -u          run once per unique hostname
 *   -v          verbose output
 *
 * Returns 0 after all spawned tasks have been waited for, 1 on setup
 * failures; usage errors terminate via exit(1).
 *
 * NOTE(review): id, grabstdio, verbose, numnodes, allsigs, tid,
 * events_spawn, events_obit, ev, stdoutfd, stdoutport and permrfsd are not
 * declared in this function - presumably file-scope variables shared with
 * the helpers (bailout, wait_for_task, build_listener, ...).
 */
int main(

    int   argc,
    char *argv[])

{
    /* buffer sizes for the program tag and the TM_STDOUT_PORT entry */
    enum { ID_BUFLEN = 60, IOENV_BUFLEN = 50 };

    int c;
    int err = 0;
    int ncopies = -1;
    int onenode = -1;
    int rc;

    struct tm_roots rootrot;
    int  nspawned = 0;
    tm_node_id *nodelist;
    int start;
    int stop;
    int sync = 0;

    int pernode = 0;
    char *targethost = NULL;
    char *allnodes;

    struct sigaction act;

    char **ioenv;

    extern int   optind;
    extern char *optarg;

    int posixly_correct_set_by_caller = 0;
    char *envstr;

    id = malloc(ID_BUFLEN * sizeof(char));

    if (id == NULL)
    {
        /* id is NULL here, so it must not be handed to %s */
        fprintf(stderr, "pbsdsh: malloc failed, (%d)\n",
                errno);

        return(1);
    }

    /* PBS_TASKNUM comes from the environment: bound the copy to the buffer */
    snprintf(id, ID_BUFLEN, "pbsdsh%s",
             ((getenv("PBSDEBUG") != NULL) && (getenv("PBS_TASKNUM") != NULL))
             ? getenv("PBS_TASKNUM")
             : "");

#ifdef __GNUC__
    /* If it's already set, we won't unset it later */

    if (getenv("POSIXLY_CORRECT") != NULL)
        posixly_correct_set_by_caller = 1;

    /* POSIXLY_CORRECT is set only for the duration of option parsing */
    envstr = strdup("POSIXLY_CORRECT=1");

    putenv(envstr);

#endif

    while ((c = getopt(argc, argv, "c:n:h:osuv")) != EOF)
    {
        switch (c)
        {

        case 'c':

            ncopies = atoi(optarg);

            if (ncopies <= 0)
            {
                err = 1;
            }

            break;

        case 'h':

            targethost = strdup(optarg); /* run on this 1 hostname */

            break;

        case 'n':

            onenode = atoi(optarg);

            if (onenode < 0)
            {
                err = 1;
            }

            break;

        case 'o':

            grabstdio = 1;

            break;

        case 's':

            sync = 1; /* force synchronous spawns */

            break;

        case 'u':

            pernode = 1; /* run once per node (unique hostnames) */

            break;

        case 'v':

            verbose = 1; /* turn on verbose output */

            break;

        default:

            err = 1;

            break;
        }  /* END switch (c) */

    }    /* END while ((c = getopt()) != EOF) */

    /* -n and -c are mutually exclusive */
    if ((err != 0) || ((onenode >= 0) && (ncopies >= 1)))
    {
        fprintf(stderr, "Usage: %s [-c copies][-o][-s][-u][-v] program [args]...]\n",
                argv[0]);

        fprintf(stderr, "       %s [-n nodenumber][-o][-s][-u][-v] program [args]...\n",
                argv[0]);

        fprintf(stderr, "       %s [-h hostname][-o][-v] program [args]...\n",
                argv[0]);

        fprintf(stderr, "Where -c copies =  run  copy of \"args\" on the first \"copies\" nodes,\n");
        fprintf(stderr, "      -n nodenumber = run a copy of \"args\" on the \"nodenumber\"-th node,\n");
        fprintf(stderr, "      -o = capture stdout of processes,\n");
        fprintf(stderr, "      -s = forces synchronous execution,\n");
        fprintf(stderr, "      -u = run on unique hostnames,\n");
        fprintf(stderr, "      -h = run on this specific hostname,\n");
        fprintf(stderr, "      -v = forces verbose output.\n");

        exit(1);
    }

#ifdef __GNUC__
    if (!posixly_correct_set_by_caller)
    {
        putenv("POSIXLY_CORRECT");
        free(envstr);
    }

#endif


    if (getenv("PBS_ENVIRONMENT") == NULL)
    {
        fprintf(stderr, "%s: not executing under PBS\n",
                id);

        return(1);
    }


    /*
     * Set up interface to the Task Manager
     */

    if ((rc = tm_init(0, &rootrot)) != TM_SUCCESS)
    {
        fprintf(stderr, "%s: tm_init failed, rc = %s (%d)\n",
                id,
                get_ecname(rc),
                rc);

        return(1);
    }

    /* HUP/INT/TERM are routed to bailout(); the same set is blocked
       around the spawn loop further down */
    sigemptyset(&allsigs);

    sigaddset(&allsigs, SIGHUP);
    sigaddset(&allsigs, SIGINT);
    sigaddset(&allsigs, SIGTERM);

    act.sa_mask = allsigs;
    act.sa_flags = 0;

    /* We want to abort system calls and call a function. */

#ifdef SA_INTERRUPT
    act.sa_flags |= SA_INTERRUPT;
#endif
    act.sa_handler = bailout;
    sigaction(SIGHUP, &act, NULL);
    sigaction(SIGINT, &act, NULL);
    sigaction(SIGTERM, &act, NULL);

#ifdef DEBUG

    if (rootrot.tm_parent == TM_NULL_TASK)
    {
        fprintf(stderr, "%s: I am the mother of all tasks\n",
                id);
    }
    else
    {
        fprintf(stderr, "%s: I am but a child in the scheme of things\n",
                id);
    }

#endif /* DEBUG */

    if ((rc = tm_nodeinfo(&nodelist, &numnodes)) != TM_SUCCESS)
    {
        fprintf(stderr, "%s: tm_nodeinfo failed, rc = %s (%d)\n",
                id,
                get_ecname(rc),
                rc);

        return(1);
    }

    /* nifty unique/hostname code */
    if (pernode || targethost)
    {
        allnodes = gethostnames(nodelist);

        if (targethost)
        {
            /* -h: translate the hostname into a logical node index */
            onenode = findtargethost(allnodes, targethost);
        }
        else
        {
            /* -u: numnodes becomes the value returned by uniquehostlist */
            numnodes = uniquehostlist(nodelist, allnodes);
        }

        free(allnodes);

        free(targethost);
    }

    /* We already checked the lower bounds in the argument processing,
       now we check the upper bounds */

    if ((onenode >= numnodes) || (ncopies > numnodes))
    {
        fprintf(stderr, "%s: only %d nodes available\n",
                id,
                numnodes);

        return(1);
    }

    /* malloc space for various arrays based on number of nodes/tasks */

    tid = (tm_task_id *)calloc(numnodes, sizeof(tm_task_id));

    events_spawn = (tm_event_t *)calloc(numnodes, sizeof(tm_event_t));

    events_obit  = (tm_event_t *)calloc(numnodes, sizeof(tm_event_t));

    ev = (int *)calloc(numnodes, sizeof(int));

    if ((tid == NULL) ||
            (events_spawn == NULL) ||
            (events_obit == NULL) ||
            (ev == NULL))
    {
        /* FAILURE - cannot alloc memory */

        fprintf(stderr, "%s: memory alloc of task ids failed\n",
                id);

        return(1);
    }

    for (c = 0; c < numnodes; c++)
    {
        *(tid + c)          = TM_NULL_TASK;
        *(events_spawn + c) = TM_NULL_EVENT;
        *(events_obit  + c) = TM_NULL_EVENT;
        *(ev + c)           = 0;
    }  /* END for (c) */

    /* Now spawn the program to where it goes */

    if (onenode >= 0)
    {
        /* Spawning one copy onto logical node "onenode" */

        start = onenode;
        stop  = onenode + 1;
    }
    else if (ncopies >= 0)
    {
        /* Spawn a copy of the program to the first "ncopies" nodes */

        start = 0;
        stop  = ncopies;
    }
    else
    {
        /* Spawn a copy on all nodes */

        start = 0;
        stop  = numnodes;
    }

    /* NULL-terminated environment list handed to tm_spawn: at most one
       entry (TM_STDOUT_PORT) followed by the terminator */
    if ((ioenv = calloc(2, sizeof(char *)))==NULL)
    {
        /* FAILURE - cannot alloc memory */

        fprintf(stderr,"%s: memory alloc of ioenv failed\n",
                id);

        return(1);
    }

    if (grabstdio != 0)
    {
        stdoutfd = build_listener(&stdoutport);

        /* character buffer: element size is sizeof(char), not sizeof(char *) */
        if ((*ioenv = calloc(IOENV_BUFLEN, sizeof(char))) == NULL)
        {
            /* FAILURE - cannot alloc memory */

            fprintf(stderr,"%s: memory alloc of *ioenv failed\n",
                    id);

            return(1);
        }

        snprintf(*ioenv, IOENV_BUFLEN, "TM_STDOUT_PORT=%d",
                 stdoutport);

        FD_ZERO(&permrfsd);
    }

    sigprocmask(SIG_BLOCK, &allsigs, NULL);

    for (c = start; c < stop; ++c)
    {
        if ((rc = tm_spawn(
                      argc - optind,
                      argv + optind,
                      ioenv,
                      *(nodelist + c),
                      tid + c,
                      events_spawn + c)) != TM_SUCCESS)
        {
            /* a failed spawn is reported but does not stop the loop */
            fprintf(stderr, "%s: spawn failed on node %d err %s\n",
                    id,
                    c,
                    get_ecname(rc));
        }
        else
        {
            if (verbose)
                fprintf(stderr, "%s: spawned task %d\n",
                        id,
                        c);

            ++nspawned;

            if (sync)
                wait_for_task(&nspawned); /* one at a time */
        }

    }    /* END for (c) */

    if (sync == 0)
        wait_for_task(&nspawned); /* wait for all to finish */


    /*
     * Terminate interface with Task Manager
     */

    tm_finalize();

    return 0;
}  /* END main() */