Beispiel #1
0
/*
 * Event callback: collect the TM result of every daemon spawn.
 *
 * fd, args  unused by this function.
 * cbdata    the orte_state_caddy_t for this launch; this function releases it
 *           before returning.
 *
 * Calls tm_poll() once per spawned daemon ("launched" is defined outside this
 * function). If any poll fails, or TM reports a spawn error, the job is moved
 * to ORTE_JOB_STATE_FAILED_TO_START.
 */
static void poll_spawns(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
    int i, rc;
    bool failed_launch = true;
    int local_err;
    tm_event_t event;

    /* TM poll for all the spawns */
    for (i = 0; i < launched; ++i) {
        /* third argument 1: same blocking flag as the original call */
        rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
        if (TM_SUCCESS != rc) {
            opal_output(0, "plm:tm: failed to poll for a spawned daemon, return status = %d", rc);
            goto cleanup;
        }
        if (TM_SUCCESS != local_err) {
            opal_output(0, "plm:tm: failed to spawn daemon, error code = %d", local_err );
            goto cleanup;
        }
    }
    failed_launch = false;

 cleanup:
    /* check for failed launch - if so, flag the job as failed to start.
     * This has to happen BEFORE the caddy is released: it reads state->jdata,
     * and the caddy must not be touched once OBJ_RELEASE has been called on it.
     */
    if (failed_launch) {
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_FAILED_TO_START);
    }

    /* cleanup - last use of the caddy */
    OBJ_RELEASE(state);
}
Beispiel #2
0
static void poll_spawns(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
    int i, rc;
    bool failed_launch = true;
    int local_err;
    tm_event_t event;

    /* TM poll for all the spawns */
    for (i = 0; i < launched; ++i) {
        rc = tm_poll(TM_NULL_EVENT, &event, 1, &local_err);
        if (TM_SUCCESS != rc) {
            opal_output(0, "plm:tm: failed to poll for a spawned daemon, return status = %d", rc);
            goto cleanup;
        }
        if (TM_SUCCESS != local_err) {
            opal_output(0, "plm:tm: failed to spawn daemon, error code = %d", local_err );
            goto cleanup;
        }
    }
    failed_launch = false;

#if 0
    /* set a timer to tell us if one or more daemon's fails to start - use the
     * millisec/daemon timeout provided by the user to compute time
     */
    if (0 < orte_startup_timeout) {
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:tm: setting startup timer for %d milliseconds",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             orte_startup_timeout));
        ORTE_DETECT_TIMEOUT(map->num_new_daemons,
                            orte_startup_timeout*1000,
                            -1, failed_start, state->jdata);
    }
#endif
    
 cleanup:
    /* cleanup */
    OBJ_RELEASE(state);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
}
Beispiel #3
0
   /*
    * Launch the remote DDI processes of a multi-node run through the PBS Task
    * Management (TM) interface and wait until TM has reported on each spawn.
    *
    * ddinodes  per-node table; only the .cpus member is read here.
    * info      parsed command line (argv, ddiarg, nodearg, nnodes, nprocs,
    *           kickoffhost, kickoffport are read).
    *
    * Does nothing when info->nnodes == 1. Every TM failure is reported on
    * stderr and handed to Fatal_error(911).
    */
   void Kickoff_PBS(const Node_info *ddinodes,const Cmdline_info *info) {
      char ddiinfo[] = "-ddi";
      char procid[8];
      char portid[8];
      char nodeid[8];
      char snodes[8];
      char sprocs[8];
      char **rargs;
      char **argv = info->argv;
      int i,j,r,iarg,nargs = info->ddiarg + info->nnodes + 8;
      int inode,ncpus,np = info->nprocs;
      int ntests;

   /* Single-node runs are not launched through TM. Because of this early
      return, nnodes is never 1 below, so no further single-node checks
      are needed in the spawn and poll loops. */
      if(info->nnodes == 1) return;

      int tm_errno;
      tm_task_id *tid;
      tm_event_t *spawn;
      tm_event_t polled;
      struct tm_roots roots;
      tm_node_id *nodelist;


   /* ---------------------------------- *\
      Initialize PBS Task Management API
   \* ---------------------------------- */
      if(tm_init(0, &roots) != TM_SUCCESS) {
         fprintf(stderr, " ddikick.x: tm_init failed\n");
         Fatal_error(911);
      }

   /* NOTE(review): tm_nodeinfo stores its own count into np, replacing
      info->nprocs; the loops below assume that count is consistent with
      the ddinodes table - confirm. */
      if(tm_nodeinfo(&nodelist, &np) != TM_SUCCESS) {
         fprintf(stderr, " ddikick.x: tm_nodeinfo failed.\n");
         Fatal_error(911);
      }

   /* Slots [0,np) belong to compute processes, [np,2*np) to data servers */
      tid   = (tm_task_id *) Malloc(2*np*sizeof(tm_task_id)); 
      spawn = (tm_event_t *) Malloc(2*np*sizeof(tm_event_t));

      for(i=0; i<2*np; i++) {
         *(tid + i)   = TM_NULL_TASK;
         *(spawn + i) = TM_NULL_EVENT;
      }


   /* ----------------------------------------- *\
      Initialize arguments to kickoff DDI tasks
   \* ----------------------------------------- */
      rargs = (char **) Malloc(nargs*sizeof(char*));

   /* snprintf bounds every write to the 8-byte scratch buffers */
      snprintf(portid, sizeof portid, "%d", info->kickoffport);
      snprintf(snodes, sizeof snodes, "%d", info->nnodes);
      snprintf(sprocs, sizeof sprocs, "%d", info->nprocs);

      for(i=1,r=0; i<info->ddiarg-1; i++) rargs[r++] = argv[i];

      rargs[r++] = ddiinfo;
      rargs[r++] = info->kickoffhost;    /*   kickoff host name     */
      rargs[r++] = portid;               /*   kickoff port number   */
      rargs[r++] = nodeid;               /*   rank of this node     */
      rargs[r++] = procid;               /*   rank of this process  */
      rargs[r++] = snodes;               /*   number of nodes       */
      rargs[r++] = sprocs;               /*   number of processors  */
  
      for(i=0,iarg=info->nodearg; i<info->nnodes; i++,iarg++) {
         rargs[r++] = argv[iarg];
      }   
          
      rargs[r] = NULL;


   /* ------------------------ *\
      Spawn DDI tasks to nodes
   \* ------------------------ */
   /* Starts at process ddinodes[0].cpus on node 1: the processes of node 0
      are not spawned here. nodeid/procid are rewritten in place before each
      tm_spawn; rargs already points at those buffers. */
      ncpus=ddinodes[0].cpus+ddinodes[1].cpus;
      for(i=ddinodes[0].cpus,inode=1; i<np; i++) {
         
         if(i == ncpus) ncpus += ddinodes[++inode].cpus;
         
         snprintf(nodeid,sizeof nodeid,"%d",inode);
         snprintf(procid,sizeof procid,"%d",i);

       # if DDI_DEBUG
         DEBUG_START(DEBUG_MAX)
         fprintf(stdout,"DDI Process %i PBS tm_spawn arguments: ",i);
         for(iarg=0; iarg<r; iarg++) fprintf(stdout,"%s ",rargs[iarg]);
         fprintf(stdout,"\n");
         DEBUG_END()
       # endif

      /* ------------------------- *\
         Spawn DDI Compute Process
      \* ------------------------- */
         if(tm_spawn(r,rargs,NULL,*(nodelist+i),(tid+i),spawn+i) != TM_SUCCESS) {
            fprintf(stderr," ddikick.x: tm_spawn failed.\n");
            Fatal_error(911);
         }


      /* The data server rank must be computed and written into procid
         BEFORE the debug trace, otherwise the trace reads j unassigned
         and shows the compute process rank. */
         j = i+np;
         snprintf(procid,sizeof procid,"%d",j);

       # if DDI_DEBUG
         DEBUG_START(DEBUG_MAX)
         fprintf(stdout,"DDI Process %i PBS tm_spawn arguments: ",j);
         for(iarg=0; iarg<r; iarg++) fprintf(stdout,"%s ",rargs[iarg]);
         fprintf(stdout,"\n");
         DEBUG_END()
       # endif
         
      /* --------------------- *\
         Spawn DDI Data Server
      \* --------------------- */
         if(tm_spawn(r,rargs,NULL,*(nodelist+i),(tid+j),spawn+j) != TM_SUCCESS) {
            fprintf(stderr," ddikick.x: tm_spawn failed.\n");
            Fatal_error(911);
      }  }


   /* -------------------------------------------------------- *\
      Poll PBS to ensure each DDI process started successfully
   \* -------------------------------------------------------- */
   /* NOTE(review): data servers are spawned unconditionally above, but the
      number of polls is doubled only when USING_DATA_SERVERS() - confirm
      the two always agree. */
      ntests = np-ddinodes[0].cpus;
      if(USING_DATA_SERVERS())  ntests *= 2;

      for(i=ntests; i--; ) {
         if(tm_poll(TM_NULL_EVENT,&polled,1,&tm_errno) != TM_SUCCESS) {
            fprintf(stderr," ddikick.x: tm_poll failed.\n");
            Fatal_error(911);
         }
         
      /* Match the polled event against both the compute (j) and the
         data server (j+np) spawn slots */
         for(j=0; j<np; j++) {
            if(polled == *(spawn+j)) {
               if(tm_errno) {
                  fprintf(stderr," ddikick.x: error spawning DDI task %i.\n",j);
                  Fatal_error(911);
               } else {
                # if DDI_DEBUG
                  DEBUG_START(DEBUG_MAX)
                  fprintf(stdout," ddikick.x: DDI task %i started.\n",j);
                  DEBUG_END()
                # endif
            }  }

            if(polled == *(spawn+j+np)) {
               if(tm_errno) {
                  fprintf(stderr," ddikick.x: error spawning DDI task %i.\n",j+np);
                  Fatal_error(911);
               } else {
                # if DDI_DEBUG
                  DEBUG_START(DEBUG_MAX)
                  fprintf(stdout," ddikick.x: DDI task %i started.\n",j+np);
                  DEBUG_END()
                # endif
      }  }  }  }

      
   /* -------------------------------------- *\
      Close the link to the PBS Task Manager
   \* -------------------------------------- */
      tm_finalize();


   /* ---------------- *\
      Free used memory
   \* ---------------- */
      free(tid);
      free(spawn);
      free(rargs);      
   }
Beispiel #4
0
/*
 * Look up the host name of every node in nodelist through TM resource
 * queries.
 *
 * nodelist  array of node ids; numnodes entries are read (numnodes, id and
 *           verbose are defined outside this function).
 *
 * Returns a calloc'ed buffer holding one NUL-terminated name per node, the
 * name of node j starting at offset j*PBS_MAXNODENAME. The buffer is not
 * freed here. On any allocation or TM failure the function prints a
 * message, calls tm_finalize() and exits with status 1.
 */
char *gethostnames(

    tm_node_id *nodelist)

{
    char *allnodes;
    char *rescinfo;
    tm_event_t *rescevent;
    tm_event_t resultevent;
    char *hoststart;
    int rc, tm_errno, i, j;
    int copied;

    allnodes = calloc(numnodes, PBS_MAXNODENAME + 1 + sizeof(char));
    rescinfo = calloc(numnodes, RESCSTRLEN + 1 + sizeof(char));
    rescevent = calloc(numnodes, sizeof(tm_event_t));

    if (!allnodes || !rescinfo || !rescevent)
    {
        fprintf(stderr, "%s: malloc failed!\n",
                id);
        tm_finalize();

        exit(1);
    }

    /* submit resource requests: one per node, each with its own
     * RESCSTRLEN-strided result slot and its own event */

    for (i = 0; i < numnodes; i++)
    {
        if (tm_rescinfo(
                    nodelist[i],
                    rescinfo + (i*RESCSTRLEN),
                    RESCSTRLEN - 1,
                    rescevent + i) != TM_SUCCESS)
        {
            fprintf(stderr, "%s: error from tm_rescinfo()\n", id);

            tm_finalize();

            exit(1);
        }
    }

    /* read back resource requests: results may arrive in any order, so each
     * polled event is mapped back to its request slot j */

    for (i = 0; i < numnodes; i++)
    {
        rc = tm_poll(TM_NULL_EVENT, &resultevent, 1, &tm_errno);

        if ((rc != TM_SUCCESS) || (tm_errno != TM_SUCCESS))
        {
            /* report whichever code actually signalled the failure;
             * tm_errno is only read when the poll itself succeeded */
            fprintf(stderr, "%s: error from tm_poll() %d\n",
                    id,
                    (rc != TM_SUCCESS) ? rc : tm_errno);

            tm_finalize();

            exit(1);
        }

        for (j = 0; j < numnodes; j++)
        {
            if (*(rescevent + j) == resultevent)
                break;
        }

        if (j == numnodes)
        {
            fprintf(stderr, "%s: unknown resource result\n", id);
            tm_finalize();
            exit(1);
        }

        if (verbose)
            fprintf(stderr, "%s: rescinfo from %d: %s\n", id, j, rescinfo + (j*RESCSTRLEN));

        /* the host name is the second space-separated token of the result */
        strtok(rescinfo + (j*RESCSTRLEN), " ");

        hoststart = strtok(NULL, " ");

        if (hoststart == NULL)
        {
            fprintf(stderr, "%s: can't find a hostname in resource result\n", id);
            tm_finalize();
            exit(1);
        }

        /* Bounded copy: slots are PBS_MAXNODENAME bytes apart, so a longer
         * token must not spill into the next node's slot (which may already
         * hold a name) or past the end of the buffer. */
        copied = snprintf(allnodes + (j*PBS_MAXNODENAME), PBS_MAXNODENAME,
                          "%s", hoststart);

        if ((copied < 0) || (copied >= PBS_MAXNODENAME))
        {
            fprintf(stderr, "%s: hostname for node %d truncated\n", id, j);
        }
    }

    free(rescinfo);

    free(rescevent);

    return(allnodes);
}
Beispiel #5
0
/*
 * Event loop that runs until every spawn has been reported and every
 * registered obituary has arrived.
 *
 * nspawned  in/out: number of spawn events still outstanding; decremented
 *           each time a spawn event is received.
 *
 * Relies on state defined outside this function: tid, events_spawn,
 * events_obit, ev (indexed by task slot), numnodes, grabstdio, verbose,
 * fire_phasers, allsigs and id. If fire_phasers is set, all live tasks are
 * signalled with that value and the process exits with status 1. A poll
 * failure other than TM_ENOTCONNECTED exits with status 2.
 */
void wait_for_task(

    int *nspawned) /* number of tasks spawned */

{
    int     c;
    tm_event_t  eventpolled;
    int     nobits = 0;
    int     rc;
    int     tm_errno;

    while (*nspawned || nobits)
    {
        if (grabstdio)
            getstdout();

        if (fire_phasers)
        {
            tm_event_t event;

            for (c = 0; c < numnodes; c++)
            {
                if (*(tid + c) == TM_NULL_TASK)
                    continue;

                fprintf(stderr, "%s: killing task %u signal %d\n",
                        id,
                        *(tid + c),
                        fire_phasers);

                tm_kill(*(tid + c), fire_phasers, &event);
            }

            tm_finalize();

            exit(1);
        }

        /* signals are unblocked only for the duration of the poll */
        sigprocmask(SIG_UNBLOCK, &allsigs, NULL);

        rc = tm_poll(TM_NULL_EVENT, &eventpolled, !grabstdio, &tm_errno);

        sigprocmask(SIG_BLOCK, &allsigs, NULL);

        if (rc != TM_SUCCESS)
        {
            fprintf(stderr, "%s: Event poll failed, error %s\n",
                    id,
                    get_ecname(rc));

            if (rc == TM_ENOTCONNECTED)
            {
                mom_reconnect();

                /* The failed poll delivered no event: eventpolled is either
                 * unset or left over from an earlier iteration, so it must
                 * not be processed. Poll again on the new connection. */
                continue;
            }
            else
            {
                exit(2);
            }
        }

        if (eventpolled == TM_NULL_EVENT)
            continue;

        for (c = 0; c < numnodes; ++c)
        {
            if (eventpolled == *(events_spawn + c))
            {
                /* spawn event returned - register obit */

                if (verbose)
                {
                    fprintf(stderr, "%s: spawn event returned: %d (%d spawns and %d obits outstanding)\n",
                            id,
                            c,
                            *nspawned,
                            nobits);
                }

                (*nspawned)--;

                if (tm_errno)
                {
                    fprintf(stderr, "%s: error %d on spawn\n",
                            id,
                            tm_errno);

                    /* failed spawn: no obit to wait for; keep scanning */
                    continue;
                }

                rc = obit_submit(c);

                if (rc == TM_SUCCESS)
                {
                    /* only a real obit event is counted as outstanding */
                    if ((*(events_obit + c) != TM_NULL_EVENT) &&
                            (*(events_obit + c) != TM_ERROR_EVENT))
                    {
                        nobits++;
                    }
                }
            }
            else if (eventpolled == *(events_obit + c))
            {
                /* obit event, let's check it out */

                if (tm_errno == TM_ESYSTEM)
                {
                    if (verbose)
                    {
                        fprintf(stderr, "%s: error TM_ESYSTEM on obit (resubmitting)\n",
                                id);
                    }

                    sleep(2);  /* Give the world a second to take a breath */

                    obit_submit(c);

                    /* obit resubmitted and still outstanding: nobits is
                     * left unchanged. This only advances the slot scan;
                     * the next poll happens once the scan has finished. */
                    continue;
                }

                if (tm_errno != 0)
                {
                    fprintf(stderr, "%s: error %d on obit for task %d\n",
                            id,
                            tm_errno,
                            c);
                }

                /* task exited */

                if (verbose)
                {
                    fprintf(stderr, "%s: obit event returned: %d (%d spawns and %d obits outstanding)\n",
                            id,
                            c,
                            *nspawned,
                            nobits);
                }

                nobits--;

                /* mark the slot as no longer running / no longer awaited */
                *(tid + c) = TM_NULL_TASK;

                *(events_obit + c) = TM_NULL_EVENT;

                if ((verbose != 0) || (*(ev + c) != 0))
                {
                    fprintf(stderr, "%s: task %d exit status %d\n",
                            id,
                            c,
                            *(ev + c));
                }
            }
        }
    }

    return;
}  /* END wait_for_task() */
Beispiel #6
0
/*
 * Register an obituary request with TM for every spawned task, then poll
 * until an obit event has been seen for each of them.
 *
 * timeout  when > 0, the wait is abandoned with HYD_TIMED_OUT once more than
 *          this many whole seconds have elapsed since registration started;
 *          values <= 0 disable the check.
 *
 * Returns HYD_SUCCESS when all obits were collected, HYD_TIMED_OUT on
 * timeout, or the status set by HYDU_MALLOC / HYDU_ERR_SETANDJUMP on error.
 *
 * The poll is issued with a wait flag of 0, so this loop spins rather than
 * sleeping between polls.
 */
HYD_status HYDT_bscd_pbs_wait_for_completion(int timeout)
{
    HYD_status status = HYD_SUCCESS;
    struct timeval t_begin, t_now;
    int ntasks, ndone;
    int slot, rc;
    int elapsed_sec;

    HYDU_FUNC_ENTER();

    /* Exit-status storage, one int per task */
    HYDU_MALLOC(HYDT_bscd_pbs_sys->taskobits, int *,
                HYDT_bscd_pbs_sys->size * sizeof(int), status);
    ntasks = HYDT_bscd_pbs_sys->spawned_count;

    /*
     * FIXME: gettimeofday is used unconditionally; the available timer type
     * should be detected instead (arguably MPL functionality, not Hydra's).
     */
    gettimeofday(&t_begin, NULL);

    /*
     * Ask TM for one obit event per task. A task whose event comes back as
     * TM_NULL_EVENT has already exited and is counted as done right away.
     */
    ndone = 0;
    for (slot = 0; slot < ntasks; slot++) {
        rc = tm_obit(HYDT_bscd_pbs_sys->taskIDs[slot],
                     &HYDT_bscd_pbs_sys->taskobits[slot],
                     &HYDT_bscd_pbs_sys->events[slot]);
        if (rc != TM_SUCCESS) {
            HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                                "tm_obit() fails with TM err=%d.\n", rc);
        }
        if (HYDT_bscd_pbs_sys->events[slot] == TM_ERROR_EVENT) {
            HYDU_error_printf("tm_obit(Task %d) returns error.\n",
                              HYDT_bscd_pbs_sys->taskIDs[slot]);
        }
        if (HYDT_bscd_pbs_sys->events[slot] == TM_NULL_EVENT) {
            HYDU_error_printf("Task %d already exits with status %d\n",
                              HYDT_bscd_pbs_sys->taskIDs[slot],
                              HYDT_bscd_pbs_sys->taskobits[slot]);
            ndone++;
        }
    }

    /* Keep polling until every remaining task has delivered its obit */
    while (ndone < ntasks) {
        tm_event_t polled = -1;
        int poll_err;

        rc = tm_poll(TM_NULL_EVENT, &polled, 0, &poll_err);
        if (rc != TM_SUCCESS) {
            HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                                "tm_poll(obit_event) fails with err=%d.\n", rc);
        }

        if (polled != TM_NULL_EVENT) {
            /* find the first task slot that owns this event, if any */
            slot = 0;
            while (slot < ntasks && HYDT_bscd_pbs_sys->events[slot] != polled) {
                slot++;
            }

            if (slot < ntasks) {
                if (HYDT_bsci_info.debug) {
                    HYDU_dump(stdout,
                              "PBS_DEBUG: Event %d received, task %d exits with status %d.\n",
                              polled, HYDT_bscd_pbs_sys->taskIDs[slot],
                              HYDT_bscd_pbs_sys->taskobits[slot]);
                }
                ndone++;
            }
        }

        /* Give up once the caller's deadline has passed */
        if (timeout > 0) {
            gettimeofday(&t_now, NULL);
            elapsed_sec = t_now.tv_sec - t_begin.tv_sec;
            if (elapsed_sec > timeout) {
                status = HYD_TIMED_OUT;
                goto fn_exit;
            }
        }
    }

    if (HYDT_bsci_info.debug) {
        HYDU_dump(stdout, "\nPBS_DEBUG: Done with polling obit events!\n");
    }

fn_exit:
    HYDU_FUNC_EXIT();
    return status;

fn_fail:
    goto fn_exit;
}