Пример #1
0
/*
 * Attempt to set the state of the hpm counters on the host associated
 * with the given Resources.  Mode must be one of HPM_SETUP_USERMODE,
 * HPM_SETUP_GLOBALMODE or HPM_SETUP_REVOKE.
 *
 * If rsrcs->flags already reflects the requested state, nothing is sent
 * to the host and 0 is returned.  Otherwise the control request is sent
 * to the resource monitor on rsrcs->exechost; when the monitor answers
 * with HPM_CTL_OKAY_STR, RSRCS_FLAGS_HPM_USER is set (user mode) or
 * cleared (global mode / revoke) in rsrcs->flags.
 *
 * Return 0 on success, non-zero otherwise (unknown mode, unable to reach
 * the resource monitor, no reply, or an error/unparseable reply).
 */
static int
setup_hpm(Resources *rsrcs, int mode)
  {
  char   *id = "setup_hpm";
  char   *reply;            /* String handed back by getreq_err(); freed here. */
  char   *response, *value; /* Cursors into 'reply' - never freed directly. */
  char    hpm_ctl[64];
  int     rm;
  int     local_errno = 0;

  switch (mode)
    {

    case HPM_SETUP_USERMODE:

      /* Sanity check - is the host already in the requested mode? */

      if (rsrcs->flags & RSRCS_FLAGS_HPM_USER)
        {
        DBPRT(("%s: hpm user mode requested for %s, but already set!\n",
               id, rsrcs->exechost));
        return (0);
        }

      (void)snprintf(hpm_ctl, sizeof(hpm_ctl), HPM_CTL_FORMAT_STR,
                     HPM_CTL_USERMODE_STR);
      break;

    case HPM_SETUP_GLOBALMODE:

      /* Sanity check - is the host already in the requested mode? */

      if (!(rsrcs->flags & RSRCS_FLAGS_HPM_USER))
        {
        DBPRT(("%s: hpm global mode requested for %s, but already set!\n",
               id, rsrcs->exechost));
        return (0);
        }

      (void)snprintf(hpm_ctl, sizeof(hpm_ctl), HPM_CTL_FORMAT_STR,
                     HPM_CTL_GLOBALMODE_STR);
      break;

    case HPM_SETUP_REVOKE:

      /* Sanity check - nothing to revoke if the host is not in user mode. */

      if (!(rsrcs->flags & RSRCS_FLAGS_HPM_USER))
        {
        DBPRT(("%s: hpm revocation requested for %s, but already global!\n",
               id, rsrcs->exechost));
        return (0);
        }

      (void)snprintf(hpm_ctl, sizeof(hpm_ctl), HPM_CTL_FORMAT_STR,
                     HPM_CTL_REVOKE_STR);
      break;

    default:
      DBPRT(("%s: Bogus mode %d - bailing.\n", id, mode));
      return (1);
    }

  DBPRT(("%s: '%s' @ %s\n", id, hpm_ctl, rsrcs->exechost));

  if ((rm = openrm(rsrcs->exechost, 0)) < 0)
    {
    (void)sprintf(log_buffer,
                  "Unable to contact resmom@%s", rsrcs->exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    return (1);
    }

  /* Ask the resource monitor on the remote host to set the mode for us. */
  reply = NULL;

  if (addreq_err(rm, &local_errno, hpm_ctl) == 0)
    reply = getreq_err(&local_errno, rm);

  closerm(rm);

  if (reply == NULL)
    {
    (void)sprintf(log_buffer, "bad return from getreq(%s) @%s, %d",
                  hpm_ctl, rsrcs->exechost, local_errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    return (1);
    }

  /*
   * If a full response was received, move forward to the first character
   * of the value (following the '=' in the attribute-value pair).  The
   * original pointer is kept in 'reply' so that it can be released later.
   */
  response = reply;

  if ((value = strchr(reply, '=')) != NULL)
    response = value + 1;

  /*
   * If the hpm_ctl request succeeded, log the fact, and set the flag in
   * the resources for this host to indicate that it is now in the other
   * state.
   */
  if (strcmp(response, HPM_CTL_OKAY_STR) == 0)
    {
    if (mode == HPM_SETUP_USERMODE)
      rsrcs->flags |= RSRCS_FLAGS_HPM_USER;
    else
      rsrcs->flags &= ~RSRCS_FLAGS_HPM_USER;

    (void)sprintf(log_buffer, "%s on %s succeeded", hpm_ctl,
                  rsrcs->exechost);

    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    DBPRT(("%s: %s\n", id, log_buffer));

    free(reply);

    return (0);
    }

  /* If it's an error string, just report the error message returned. */
  if (strncmp(response, HPM_CTL_ERROR_STR, strlen(HPM_CTL_ERROR_STR)) == 0)
    {

    response += strlen(HPM_CTL_ERROR_STR); /* Skip the error string. */

    while (*response == ' ')  /* Skip leading whitespace. */
      ++response;

    /* And generate the log message from the request and the response. */
    (void)sprintf(log_buffer, "%s: %s (%s)", hpm_ctl, response,
                  rsrcs->exechost);
    }
  else
    {
    (void)sprintf(log_buffer, "cannot parse response %s to request %s@%s",
                  response, hpm_ctl, rsrcs->exechost);
    }

  /* The text has been copied into log_buffer; the reply can go now. */
  free(reply);

  log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

  DBPRT(("%s: %s\n", id, log_buffer));
  return (1);
  }
Пример #2
0
/*
 * talk_with_mom - query the resource monitor (mom) on a node for the
 *                 resources named in res_to_get[] and store the answers
 *                 in the node_info structure.
 *
 *   ninfo - the node to query.  Nothing is done if it is NULL or if the
 *           node is marked down or offline.
 *
 * Numeric answers that do not parse completely (including empty answers)
 * fall back to a default: ncpus of the node for max_load and ideal_load,
 * 1 for ncpus, -1.0 for loadave.
 *
 * The "arch" answer is stored directly in ninfo->arch, which then owns
 * the string; every other answer is freed here.
 * NOTE(review): a previous ninfo->arch value is overwritten without being
 * freed - confirm that callers start from a fresh node_info.
 *
 * returns 1 if the connection to mom could not be opened, 0 otherwise
 *         (including when the node is skipped or begin_rm_req() fails).
 */
int talk_with_mom(

    node_info *ninfo)

{
    int mom_sd;   /* connection descriptor to mom */
    char *mom_ans;  /* the answer from mom - getreq() */
    char *endp;   /* used with strtol() / strtod() */
    double testd;   /* used to convert string -> double */
    int testi;   /* used to convert string -> int */
    char errbuf[256];
    int i;
    int keep_ans;   /* non-zero when ownership of mom_ans was handed off */
    int local_errno = 0;

    if ((ninfo == NULL) || ninfo->is_down || ninfo->is_offline)
        return 0;

    if ((mom_sd = openrm(ninfo -> name, pbs_rm_port)) < 0)
    {
        sched_log(PBSEVENT_SYSTEM, PBS_EVENTCLASS_REQUEST, ninfo -> name, "Can not open connection to mom");
        return 1;
    }

    if (begin_rm_req(mom_sd, &local_errno, num_resget) != 0)
    {
        closerm_err(&local_errno, mom_sd);
        return 0;
    }

    for (i = 0; i < num_resget; i++)
        addreq_err(mom_sd, &local_errno, (char *) res_to_get[i]);

    /* Answers come back in the order in which the requests were added. */
    for (i = 0; i < num_resget && (mom_ans = getreq_err(&local_errno, (mom_sd))) != NULL; i++)
    {
        keep_ans = 0;

        if (!strcmp(res_to_get[i], "max_load"))
        {
            testd = strtod(mom_ans, &endp);

            /* endp == mom_ans means nothing was converted at all */
            if ((endp != mom_ans) && (*endp == '\0'))
                ninfo -> max_load = testd;
            else
                ninfo -> max_load = ninfo -> ncpus;
        }
        else if (!strcmp(res_to_get[i], "ideal_load"))
        {
            testd = strtod(mom_ans, &endp);

            if ((endp != mom_ans) && (*endp == '\0'))
                ninfo -> ideal_load = testd;
            else
                ninfo -> ideal_load = ninfo -> ncpus;
        }
        else if (!strcmp(res_to_get[i], "arch"))
        {
            /* the node keeps the string - it must not be freed below */
            ninfo -> arch = mom_ans;
            keep_ans = 1;
        }
        else if (!strcmp(res_to_get[i], "ncpus"))
        {
            testi = strtol(mom_ans, &endp, 10);

            if ((endp != mom_ans) && (*endp == '\0'))
                ninfo -> ncpus = testi;
            else
                ninfo -> ncpus = 1;
        }
        else if (!strcmp(res_to_get[i], "physmem"))
        {
            ninfo -> physmem = res_to_num(mom_ans);
        }
        else if (!strcmp(res_to_get[i], "loadave"))
        {
            testd = strtod(mom_ans, &endp);

            if ((endp != mom_ans) && (*endp == '\0'))
                ninfo -> loadave = testd;
            else
                ninfo -> loadave = -1.0;
        }
        else
        {
            /* the answer comes from the network - never let it overrun errbuf */
            snprintf(errbuf, sizeof(errbuf), "Unknown resource value[%d]: %s", i, mom_ans);
            sched_log(PBSEVENT_SCHED, PBS_EVENTCLASS_NODE, ninfo -> name, errbuf);
        }

        if (!keep_ans)
            free(mom_ans);
    }

    closerm_err(&local_errno, mom_sd);

    return 0;
}
Пример #3
0
/*
 * Find an entry for the resources for the requested host in the list of
 * existing resources, or create a new one for that host and return it.
 *
 * When no cached entry exists, the resource monitor on 'exechost' is asked
 * for "loadave", "physmem", "ncpus" and "arch".  The query is guarded by a
 * SIGALRM alarm of GETRSRCS_CONNECT_TIME seconds; any alarm that was
 * pending on entry is re-armed on exit with the elapsed time subtracted
 * (but never less than 1 second).
 *
 * Returns a pointer to the (cached or newly appended) Resources entry, or
 * NULL if memory could not be allocated, the resource monitor could not be
 * contacted, or any of the four replies was missing.
 */
Resources *
schd_get_resources(char *exechost)
  {
  char   *id = "schd_get_resources";
  Resources *rptr, *new_rsrcs;
  int     rm;

  char   *response = NULL;
  int     badreply   = 0;
  int     local_errno = 0;

  struct sigaction act, oact;
  int     handler_set = 0; /* Non-zero once our SIGALRM handler is installed. */
  unsigned int remain; /* Time remaining in any old alarm(). */
  unsigned int elapsed; /* Seconds spent talking to the resmom. */
  time_t then;  /* When this alarm() was started. */
  time_t now;

  /*
   * Check for a local copy of the resources being available already.
   * If so, just return a reference to that Resources structure.
   */

  for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next)
    if (strcmp(rptr->exechost, exechost) == 0)
      return (rptr);

  schd_timestamp("get_rsrcs");

  /*
   * No cached resource information for 'exechost'.  Need to query the
   * host for its information.
   */

  if ((new_rsrcs = (Resources *)calloc(1, sizeof(Resources))) == NULL)
    {
    (void)sprintf(log_buffer, "Unable to alloc space for Resources.");
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));

    return (NULL); /* Can't get the information - nowhere to store it. */
    }

  act.sa_flags = 0;
  act.sa_handler = connect_interrupt;
  sigemptyset(&act.sa_mask);
  remain = 0;
  then = 0;

  /*
   * Set the alarm, and maintain some idea of how long was left on any
   * previously set alarm.  If the handler cannot be installed, no alarm
   * is touched at all.
   */

  if (sigaction(SIGALRM, &act, &oact) == 0)
    {
    handler_set = 1;
    remain = alarm(GETRSRCS_CONNECT_TIME);
    then = time(NULL);
    }

  /* Any negative handle is a failure (the cleanup below tests rm >= 0). */
  if ((rm = openrm(exechost, pbs_rm_port)) < 0)
    {
    (void)sprintf(log_buffer,
                  "Unable to contact resmom@%s", exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    badreply = 1;
    goto bail;
    }

  /*
   * Turn off full response.  Responses will be received in the order in
   * which they are sent.
   */
  fullresp(0);

  /* Build a list of all the resources about which we want information. */

  addreq(rm, "loadave");

  addreq(rm, "physmem");

  addreq(rm, "ncpus");

  addreq(rm, "arch");

  /* Get the values back from the resource monitor, and round up. */

  /* Receive LOADAVE response from resource monitor. */
  response = getreq_err(&local_errno, rm);

  if (response != NULL)
    {
    new_rsrcs->loadave = atof(response);
    (void)free(response);
    }
  else
    {
    (void)sprintf(log_buffer, "bad return from getreq(loadave), %d, %d",
                  local_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

  /* Receive PHYSMEM response from resource monitor. */
  response = getreq_err(&local_errno, rm);

  if (response != NULL)
    {
    new_rsrcs->mem_total = schd_val2byte(response);
    (void)free(response);
    }
  else
    {
    (void)sprintf(log_buffer, "bad return from getreq(physmem), %d, %d",
                  local_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

  /* Receive NCPUS response from resource monitor. */
  response = getreq_err(&local_errno, rm);

  if (response != NULL)
    {
    new_rsrcs->ncpus_total = atoi(response);
    (void)free(response);
    }
  else
    {
    (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d",
                  local_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

  /* Receive ARCH response from resource monitor. */
  response = getreq_err(&local_errno, rm);

  if (response != NULL)
    {
    new_rsrcs->arch = schd_strdup(response);
    (void)free(response);
    }
  else
    {
    (void)sprintf(log_buffer, "bad return from getreq(arch), %d, %d",
                  local_errno, errno);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    badreply = 1;
    goto bail;
    }

bail:

  /* Disconnect from the resource monitor. */

  if (rm >= 0)  /* resmom handle "0" is valid in RPP. */
    closerm(rm);

  /*
   * And unset the alarm and handler - but only if they were installed
   * above.  Otherwise 'oact' was never filled in, and any pending alarm
   * belongs to somebody else.
   */
  if (handler_set)
    {
    alarm(0);

    sigaction(SIGALRM, &oact, NULL);
    }

  /* Reset the old alarm, taking into account how much time has passed. */
  if (remain)
    {
    now = time(NULL);
    elapsed = (now > then) ? (unsigned int)(now - then) : 0;

    DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id,
           (int)remain, (int)elapsed));

    /*
     * How much time remains even after the time spent above?  Would the
     * previous time have already expired?  If so, schedule an alarm call
     * in 1 second (close enough, hopefully).  'remain' is unsigned, so
     * the comparison must happen before the subtraction to avoid
     * wrapping around to a huge timeout.
     */

    if (elapsed < remain)
      remain -= elapsed;
    else
      remain = 1;

    DBPRT(("reset to %d secs\n", (int)remain));

    alarm(remain);
    }

  /*
   * Verify all the data came back as expected; if not, abort this
   * iteration of the scheduler.
   */

  if (badreply)
    {
    (void)sprintf(log_buffer,
                  "Got bad info from mom@%s - skipping this node", exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));
    free(new_rsrcs);
    return (NULL);
    }

  /* Make a copy of the hostname for the resources struct. */
  new_rsrcs->exechost = schd_strdup(exechost);

  if (new_rsrcs->exechost == NULL)
    {
    (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs",
                  exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
    DBPRT(("%s: %s\n", id, log_buffer));

    /*
     * Release the arch copy made above as well.
     * NOTE(review): assumes schd_strdup() returns malloc()ed storage.
     */
    free(new_rsrcs->arch);
    free(new_rsrcs);
    return (NULL);
    }

  /* Next pointer for the tail of the list points to nothing. */
  new_rsrcs->next = NULL;

  if (schd_RsrcsList == NULL)
    {
    schd_RsrcsList  = new_rsrcs; /* Start the list. */
    }
  else
    {
    for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next)
      /* Find the last element in the list. */ ;

    rptr->next = new_rsrcs;
    }

  return (new_rsrcs);
  }