示例#1
0
/*
 * Find an entry for the resources for the requested host in the list of
 * existing resources, or create a new one for that host and return it.
 */
Resources *
schd_get_resources(char *exechost)
  {
  char   *id = "schd_get_resources";
  Resources *rptr, *new_rsrcs;
  int     rm;

  char   *response;
  int     badreply   = 0;
  int     cpus_avail = 0;
  int     cpus_tot   = 0;

  struct sigaction act, oact;

  unsigned int remain; /* Time remaining in any old alarm(). */
  time_t then;  /* When this alarm() was started. */

  /*
   * Check for a local copy of the resources being available already.
   * If so, just return a reference to that Resources structure.
   */

  if (schd_RsrcsList != NULL)
    {
    for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next)
      if (strcmp(rptr->exechost, exechost) == 0)
        return (rptr);
    }

  schd_timestamp("get_rsrcs");

  /*
   * No cached resource information for 'exechost'.  Need to query the
   * host for its information.
   */

  if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL)
    {
    (void)sprintf(log_buffer, "Unable to alloc space for Resources.");
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));

    return (NULL); /* Can't get the information - nowhere to store it. */
    }

  memset((void *)new_rsrcs, 0, sizeof(Resources));

  act.sa_flags = 0;
  act.sa_handler = connect_interrupt;
  sigemptyset(&act.sa_mask);
  remain = 0;
  then = 0;

  /*
   * Set the alarm, and maintain some idea of how long was left on any
   * previously set alarm.
   */

  if (sigaction(SIGALRM, &act, &oact) == 0)
    {
    remain = alarm(GETRSRCS_CONNECT_TIME);
    then = time(NULL);
    }

  if ((rm = openrm(exechost, 0)) == -1)
    {
    (void)sprintf(log_buffer,
                  "Unable to contact resmom@%s (%d)", exechost, pbs_errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    badreply = 1;
    goto bail;
    }

  /*
   * Turn off full response.  Responses will be received in the order in
   * which they are sent.
   */
  fullresp(0);

  /* Build a list of all the resources about which we want information. */

  addreq(rm, "mppe_app");

  addreq(rm, "mppe_avail");

  /* Get the values back from the resource monitor, and round up. */

  /* Receive MPPE_APP response from resource monitor. */
  /* returns the total number of Application PEs configured */
  response = getreq(rm);

  if (response != NULL)
    {
    cpus_tot = atoi(response) * schd_FAKE_MACH_MULT;
    }
  else
    {
    (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d",
                  pbs_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

  /* Receive MPPE_AVAIL response from resource monitor. */
  /* returns the largest contiguous block of APP PEs */
  response = getreq(rm);

  if (response != NULL)
    {
    cpus_avail = atoi(response) * schd_FAKE_MACH_MULT;
    }
  else
    {
    (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d",
                  pbs_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

  new_rsrcs->freemem = MB_PER_NODE * schd_FAKE_MACH_MULT;

bail:
  /* Disconnect from the resource monitor. */

  if (rm)
    closerm(rm);

  /* And unset the alarm and handler. */
  alarm(0);

  sigaction(SIGALRM, &oact, &act);

  /* Reset the old alarm, taking into account how much time has passed. */
  if (remain)
    {
    DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id,
           remain, (time(NULL) - then)));
    /* How much time remains even after the time spent above? */
    remain -= (time(NULL) - then);

    /*
     * Would the previous time have already expired?  If so, schedule
     * an alarm call in 1 second (close enough, hopefully).
     */

    if (remain < 1)
      remain = 1;

    DBPRT(("reset to %d secs\n", remain));

    alarm(remain);
    }

  /*
   * Verify all the data came back as expected; if not, abort this
   * iteration of the scheduler.
   */
  if (badreply)
    {
    (void)sprintf(log_buffer,
                  "Got bad info from mom@%s - aborting sched run", exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));

    free(new_rsrcs);
    return (NULL);
    }

  /* Make a copy of the hostname for the resources struct. */
  new_rsrcs->exechost = schd_strdup(exechost);

  if (new_rsrcs->exechost == NULL)
    {
    (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs",
                  exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));

    free(new_rsrcs);
    return (NULL);
    }

  new_rsrcs->nodes_total = cpus_tot;

  new_rsrcs->nodes_alloc = cpus_tot - cpus_avail;

  if (schd_RsrcsList == NULL)
    {
    schd_RsrcsList  = new_rsrcs; /* Start the list. */
    }
  else
    {
    for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next)
      /* Find the last element in the list. */ ;

    rptr->next = new_rsrcs;
    }

  /* Next pointer for the tail of the list points to nothing. */
  new_rsrcs->next = NULL;

  return (new_rsrcs);
  }
示例#2
0
文件: getrsrcs.c 项目: CESNET/torque
/*
 * Find an entry for the resources for the requested host in the list of
 * existing resources, or create a new one for that host and return it.
 */
Resources *
schd_get_resources(char *exechost)
{
    char   *id = "schd_get_resources";
    Resources *rptr, *new_rsrcs;
    int     rm;

    char   *response = NULL;
    int     badreply   = 0;
    int     cpus_avail = 0;
    size_t  pmem_avail = 0;

    char    hpm_ctl[64];

    struct sigaction act, oact;

    unsigned int remain; /* Time remaining in any old alarm(). */
    time_t then;  /* When this alarm() was started. */

#ifdef NODEMASK
    Bitfield cpy;
    int     i, j;
#endif /* NODEMASK */

    /*
     * Check for a local copy of the resources being available already.
     * If so, just return a reference to that Resources structure.
     */

    if (schd_RsrcsList != NULL)
    {
        for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next)
            if (strcmp(rptr->exechost, exechost) == 0)
                return (rptr);
    }

    schd_timestamp("get_rsrcs");

    /*
     * No cached resource information for 'exechost'.  Need to query the
     * host for its information.
     */

    if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL)
    {
        (void)sprintf(log_buffer, "Unable to alloc space for Resources.");
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));

        return (NULL); /* Can't get the information - nowhere to store it. */
    }

    memset((void *)new_rsrcs, 0, sizeof(Resources));

    act.sa_flags = 0;
    act.sa_handler = connect_interrupt;
    sigemptyset(&act.sa_mask);
    remain = 0;
    then = 0;

    /*
     * Set the alarm, and maintain some idea of how long was left on any
     * previously set alarm.
     */

    if (sigaction(SIGALRM, &act, &oact) == 0)
    {
        remain = alarm(GETRSRCS_CONNECT_TIME);
        then = time(NULL);
    }

    if ((rm = openrm(exechost, 0)) == -1)
    {
        (void)sprintf(log_buffer,
                      "Unable to contact resmom@%s (%d)", exechost, pbs_errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

        badreply = 1;
        goto bail;
    }

    /*
     * Turn off full response.  Responses will be received in the order in
     * which they are sent.
     */
    fullresp(0);

    /* Build a list of all the resources about which we want information. */

    addreq(rm, "loadave");

    addreq(rm, "availmem");

    addreq(rm, "physmem");

    addreq(rm, "ncpus");

#ifdef NODEMASK
    addreq(rm, "availmask");

#endif /* NODEMASK */

    if (schd_MANAGE_HPM)
    {
        (void)sprintf(hpm_ctl, HPM_CTL_FORMAT_STR, HPM_CTL_QUERY_STR);
        addreq(rm, hpm_ctl);
    }

    /* Get the values back from the resource monitor, and round up. */

    /* Receive LOADAVE response from resource monitor. */
    response = getreq(rm);

    if (response != NULL)
    {
        new_rsrcs->loadave = atof(response) * schd_FAKE_MACH_MULT;
        (void)free(response);
    }
    else
    {
        (void)sprintf(log_buffer, "bad return from getreq(loadave), %d, %d",
                      pbs_errno, errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        badreply = 1;
        goto bail;
    }

    /* Receive AVAILMEM response from resource monitor. */
    response = getreq(rm);

    if (response != NULL)
    {
        new_rsrcs->freemem = schd_val2byte(response);
        new_rsrcs->freemem *= schd_FAKE_MACH_MULT;
        (void)free(response);
    }
    else
    {
        (void)sprintf(log_buffer, "bad return from getreq(freemem), %d, %d",
                      pbs_errno, errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        badreply = 1;
        goto bail;
    }

    /* Receive PHYSMEM response from resource monitor. */
    response = getreq(rm);

    if (response != NULL)
    {
        pmem_avail = schd_val2byte(response);
        pmem_avail *= schd_FAKE_MACH_MULT;
        (void)free(response);
    }
    else
    {
        (void)sprintf(log_buffer, "bad return from getreq(realmem), %d, %d",
                      pbs_errno, errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        badreply = 1;
        goto bail;
    }

    /* Receive NCPUS response from resource monitor. */
    response = getreq(rm);

    if (response != NULL)
    {
        cpus_avail = atoi(response) * schd_FAKE_MACH_MULT;
        (void)free(response);
    }
    else
    {
        (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d",
                      pbs_errno, errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        badreply = 1;
        goto bail;
    }

#ifdef NODEMASK
    /* Receive available nodes from resource monitor. */
    response = getreq(rm);

    if (response == NULL)
    {
        (void)sprintf(log_buffer, "bad return from getreq(availmask), %d, %d",
                      pbs_errno, errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        badreply = 1;
        goto bail;
    }
    else
    {
        if (schd_bits2mask(response, &new_rsrcs->availmask) != 0)
        {
            if (schd_str2mask(response, &new_rsrcs->availmask) != 0)
            {
                (void)sprintf(log_buffer, "can't parse availmask '%s'", response);
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
                badreply = 1;
                goto bail;
            }
        }

        (void)free(response);
    }

#endif /* NODEMASK */

    if (schd_MANAGE_HPM)
    {
        /* Receive HPM_CTL response from resource monitor. */
        response = getreq(rm);

        if (response != NULL)
        {
            if (strcmp(response, HPM_CTL_USERMODE_STR) == 0)
                new_rsrcs->flags |= RSRCS_FLAGS_HPM_USER;
            else if (strcmp(response, HPM_CTL_GLOBALMODE_STR) == 0)
                new_rsrcs->flags &= ~RSRCS_FLAGS_HPM_USER;
            else
            {
                (void)sprintf(log_buffer, "bad response '%s' for '%s@%s'",
                              response, hpm_ctl, exechost);
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                           log_buffer);
                badreply = 1;
                goto bail;
            }
        }
        else
        {
            (void)sprintf(log_buffer, "bad return from getreq(%s), %d, %d",
                          hpm_ctl, pbs_errno, errno);
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
            badreply = 1;
            goto bail;
        }
    }

    /*
     * NOTE: response will be free()'d in bail.  Be sure to explicitly free()
     * response if more getreq() calls are added before the code below.
     */

bail:
    if (response != NULL)
        (void)free(response);

    /* Disconnect from the resource monitor. */
    if (rm >= 0)  /* resmom handle "0" is valid in RPP. */
        closerm(rm);

    /* And unset the alarm and handler. */
    alarm(0);

    sigaction(SIGALRM, &oact, &act);

    /* Reset the old alarm, taking into account how much time has passed. */
    if (remain)
    {
        DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id,
               remain, (time(NULL) - then)));
        /* How much time remains even after the time spent above? */
        remain -= (time(NULL) - then);

        /*
         * Would the previous time have already expired?  If so, schedule
         * an alarm call in 1 second (close enough, hopefully).
         */

        if (remain < 1)
            remain = 1;

        DBPRT(("reset to %d secs\n", remain));

        alarm(remain);
    }

    /*
     * Verify all the data came back as expected; if not, abort this
     * iteration of the scheduler.
     */
    if (badreply)
    {
        (void)sprintf(log_buffer,
                      "Got bad info from mom@%s - aborting sched run", exechost);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));

        free(new_rsrcs);
        return (NULL);
    }

    /* Make a copy of the hostname for the resources struct. */
    new_rsrcs->exechost = schd_strdup(exechost);

    if (new_rsrcs->exechost == NULL)
    {
        (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs",
                      exechost);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));

        free(new_rsrcs);
        return (NULL);
    }

    new_rsrcs->nodes_total = NODES_REQD(cpus_avail, pmem_avail);

#ifdef NODEMASK
    /* Copy the availmask schd_FAKE_MACH_MULT times to match avail cpus. */
    BITFIELD_CPY(&cpy, &(new_rsrcs->availmask));

    for (i = 2; i <= schd_FAKE_MACH_MULT; i++)
    {
        for (j = 0; j < (cpus_avail / schd_FAKE_MACH_MULT / 2); j++)
            BITFIELD_SHIFTL(&cpy);

        BITFIELD_SETM(&(new_rsrcs->availmask), &cpy);
    }

#endif /* NODEMASK */

    if (schd_RsrcsList == NULL)
    {
        schd_RsrcsList  = new_rsrcs; /* Start the list. */
    }
    else
    {
        for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next)
            /* Find the last element in the list. */ ;

        rptr->next = new_rsrcs;
    }

    /* Next pointer for the tail of the list points to nothing. */
    new_rsrcs->next = NULL;

    return (new_rsrcs);
}
示例#3
0
/*
 * Find an entry for the resources for the requested host in the list of
 * existing resources, or create a new one for that host and return it.
 */
Resources *
schd_get_resources(char *exechost)
  {
  char *id = "schd_get_resources";
  Resources *rptr, *new_rsrcs;
  int  rm;
  char *response = NULL;
  int  badreply   = 0;
  int  local_errno = 0;

  struct sigaction act, oact;
  unsigned int remain;  /* Time remaining in any old alarm(). */
  time_t  then;  /* When this alarm() was started. */

  /*
   * Check for a local copy of the resources being available already.
   * If so, just return a reference to that Resources structure.
   */

  if (schd_RsrcsList != NULL)
    {
    for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next)
      if (strcmp(rptr->exechost, exechost) == 0)
        return (rptr);
    }

  schd_timestamp("get_rsrcs");

  /*
   * No cached resource information for 'exechost'.  Need to query the
   * host for its information.
   */

  if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL)
    {
    (void)sprintf(log_buffer, "Unable to alloc space for Resources.");
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));

    return (NULL); /* Can't get the information - nowhere to store it. */
    }

  memset((void *)new_rsrcs, 0, sizeof(Resources));

  act.sa_flags = 0;
  act.sa_handler = connect_interrupt;
  sigemptyset(&act.sa_mask);
  remain = 0;
  then = 0;

  /*
   * Set the alarm, and maintain some idea of how long was left on any
   * previously set alarm.
   */

  if (sigaction(SIGALRM, &act, &oact) == 0)
    {
    remain = alarm(GETRSRCS_CONNECT_TIME);
    then = time(NULL);
    }

  if ((rm = openrm(exechost, 0)) == -1)
    {
    (void)sprintf(log_buffer,
                  "Unable to contact resmom@%s ", exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    badreply = 1;
    goto bail;
    }

  /*
   * Turn off full response.  Responses will be received in the order in
   * which they are sent.
   */
  fullresp(0);

  /* Build a list of all the resources about which we want information. */

  addreq(rm, "loadave");

  addreq(rm, "availmem");

  addreq(rm, "physmem");

  addreq(rm, "ncpus");

  addreq(rm, "tmpdir");

  addreq(rm, "arch");

  /* Get the values back from the resource monitor, and round up. */

  /* Receive LOADAVE response from resource monitor. */
  response = getreq_err(&local_errno, rm);

  if (response != NULL)
    {
    new_rsrcs->loadave = atof(response);
    (void)free(response);
    }
  else
    {
    (void)sprintf(log_buffer, "bad return from getreq(loadave), %d, %d",
                  local_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

  /* Receive AVAILMEM response from resource monitor. */
  response = getreq_err(&local_errno, rm);

  if (response != NULL)
    {
    new_rsrcs->freemem = schd_val2byte(response);
    (void)free(response);
    }
  else
    {
    (void)sprintf(log_buffer, "bad return from getreq(freemem), %d, %d",
                  local_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

  /* Receive PHYSMEM response from resource monitor. */
  response = getreq_err(&local_errno, rm);

  if (response != NULL)
    {
    new_rsrcs->mem_total = schd_val2byte(response);
    (void)free(response);
    }
  else
    {
    (void)sprintf(log_buffer, "bad return from getreq(realmem), %d, %d",
                  local_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

  /* Receive NCPUS response from resource monitor. */
  response = getreq_err(&local_errno, rm);

  if (response != NULL)
    {
    new_rsrcs->ncpus_total = atoi(response);
    (void)free(response);
    }
  else
    {
    (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d",
                  local_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

  /* Receive TMPDIR response from resource monitor. */
  response = getreq_err(&local_errno, rm);

  if (response != NULL)
    {
    new_rsrcs->tmpdir = schd_val2byte(response);
    (void)free(response);
    }
  else
    {
    (void)sprintf(log_buffer, "bad return from getreq(tmpdir), %d, %d",
                  local_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

  /* Receive ARCH response from resource monitor. */
  response = getreq_err(&local_errno, rm);

  if (response != NULL)
    {
    new_rsrcs->arch = schd_strdup(response);
    (void)free(response);
    }
  else
    {
    (void)sprintf(log_buffer, "bad return from getreq(arch), %d, %d",
                  local_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

bail:

  /* Disconnect from the resource monitor. */

  if (rm >= 0)  /* resmom handle "0" is valid in RPP. */
    closerm(rm);

  /* And unset the alarm and handler. */
  alarm(0);

  sigaction(SIGALRM, &oact, &act);

  /* Reset the old alarm, taking into account how much time has passed. */
  if (remain)
    {
    DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id,
           remain, (time(NULL) - then)));

    /* How much time remains even after the time spent above? */
    remain -= (time(NULL) - then);

    /*
     * Would the previous time have already expired?  If so, schedule
     * an alarm call in 1 second (close enough, hopefully).
     */

    if (remain < 1)
      remain = 1;

    DBPRT(("reset to %d secs\n", remain));

    alarm(remain);
    }

  /*
   * Verify all the data came back as expected; if not, abort this
   * iteration of the scheduler.
   */

  if (badreply)
    {
    (void)sprintf(log_buffer,
                  "Got bad info from mom@%s - skipping this node", exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    free(new_rsrcs);
    return (NULL);
    }

  /* Make a copy of the hostname for the resources struct. */
  new_rsrcs->exechost = schd_strdup(exechost);

  if (new_rsrcs->exechost == NULL)
    {
    (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs",
                  exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    free(new_rsrcs);
    return (NULL);
    }

  if (schd_RsrcsList == NULL)
    {
    schd_RsrcsList  = new_rsrcs; /* Start the list. */
    }
  else
    {
    for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next)
      /* Find the last element in the list. */ ;

    rptr->next = new_rsrcs;
    }

  /* Next pointer for the tail of the list points to nothing. */
  new_rsrcs->next = NULL;

  return (new_rsrcs);
  }