Example #1
0
/**
 * @brief
 * 	teardown_cpuset()
 * 	Main interface used by mom to revoke cpusets from jobs.
 *
 * @par	Functionality:
 *	Attempt to tear down the cpuset assigned to this job.  If unable to do
 *	so immediately, place the cpuset on the queue of "stuck" cpusets.  This
 * 	list is periodically traversed, and any cpusets that have become
 * 	"unstuck" are freed and returned to the global pool.
 *
 * @par Note:
 * 	A cpuset can become "stuck" if all of the processes in the cpuset are
 * 	not killed before attempting to delete the cpuset.  This is usually a
 * 	symptom of user code attempting to dump core to an NFS filesystem on a
 * 	fileserver that is temporarily unreachable (i.e. crashed).
 *
 * @par Note:
 *	A failure other than "nonexistent" or EBUSY is logged with log_err()
 *	and is then handled exactly like a busy cpuset (queued for retry).
 *
 * @param[in] qname - cpuset name
 * @param[in] nodesp - pointer to nodes bitfield
 *
 * @return	int
 * @retval	0	cpuset destroyed; its nodes were added back to nodepool
 * @retval	1	cpuset did not exist (ESRCH/ENOENT), or it could not be
 *			added to the list of stuck cpusets
 * @retval	-1	cpuset could not be deleted at this time; it was queued
 *			on the stuck list and its nodes were marked stuck
 *
 */
int
teardown_cpuset(char *qname, Bitfield *nodesp)
{
	int	destroy_errno;

	/*
	 * Attempt to destroy the cpuset named by qname.  If it succeeds, that
	 * is all that needs to be done.  Return the nodes to the nodepool.
	 */
	if (destroy_cpuset(qname) == 0) {
		(void)sprintf(log_buffer, "destroyed cpuset %s", qname);
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, LOG_DEBUG, __func__, log_buffer);

		BITFIELD_SETM(&nodepool, nodesp);

#ifdef	DEBUG
		(void)sprintf(log_buffer, "nodepool now %s", bitfield2hex(&nodepool));
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, LOG_DEBUG, __func__, log_buffer);
#endif	/* DEBUG */

		return 0;
	}

	/*
	 * Capture errno before any other library call gets a chance to
	 * overwrite it; every decision below is about the destroy failure.
	 */
	destroy_errno = errno;

	/*
	 * The cpuset was not destroyed.  If it was a real system error, log
	 * it.  If it didn't exist, there is nothing to tear down or retry.
	 * If it's busy, put it on the list of cpusets to attempt to reclaim
	 * later.
	 */
	if (destroy_errno == ESRCH || destroy_errno == ENOENT) {
		(void)sprintf(log_buffer, "can't delete nonexistent cpuset '%s'", qname);
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, LOG_INFO, __func__, log_buffer);
		return 1;
	}
	if (destroy_errno != EBUSY) {
		(void)sprintf(log_buffer, "failed to destroy cpuset '%s'", qname);
		log_err(destroy_errno, __func__, log_buffer);
	}

	/*
	 * The cpuset is "busy" (or failed for another reason logged above).
	 * At some point in the future it should become empty and be
	 * revocable.  Arrange to occasionally check it and clean it up if
	 * possible.
	 */

	if (add_to_cpusetlist(&stuckcpusets, qname, nodesp, NULL)) {
		/* errno here is whatever add_to_cpusetlist() left behind. */
		(void)sprintf(log_buffer, "failed to add cpuset %s to stuck list",
			qname);
		log_err(errno, __func__, log_buffer);
		return 1;
	}

	/* Note that the nodes are stuck and unavailable. */
	BITFIELD_SETM(&stucknodes, nodesp);
	mom_update_resources();

	(void)sprintf(log_buffer,
		"can't destroy cpuset '%s' - retry later", qname);
	log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, LOG_INFO, __func__, log_buffer);

	return -1;		/* Cpuset cannot be deleted at this time. */
}
Example #2
0
/**
 * @brief
 * 	reclaim_cpusets()
 * 	Retry destruction of every cpuset named on the supplied list.  Each
 * 	cpuset that can now be destroyed has its nodes cleared from the
 * 	optional mask, added back to the global nodepool, and its element
 * 	removed from the list.  Cpusets that still cannot be destroyed are
 * 	left on the list for a later attempt.
 *
 * @param[in,out] listp - address of the head pointer of the cpuset list
 * @param[in,out] maskp - bitfield to clear reclaimed nodes from (may be NULL)
 *
 * @par Note:
 *	If removing a reclaimed element from the list fails, the walk stops
 *	immediately and that cpuset is not included in the returned count.
 *
 * @return	int
 * @retval	number of cpusets reclaimed and removed from the list
 * @retval	0	nothing could be reclaimed
 *
 */
int
reclaim_cpusets(cpusetlist **listp, Bitfield *maskp)
{
	cpusetlist	*entry;
	cpusetlist	*following;
	int		reclaimed = 0;

	entry = *listp;
	while (entry != NULL) {
		/*
		 * Fetch the successor up front: the current element is handed
		 * to remove_from_cpusetlist() below and is not touched again.
		 */
		following = entry->next;

		/* Still not destroyable?  Leave it queued and move on. */
		if (destroy_cpuset(entry->name) != 0) {
			log_err(0, __func__, "could not destroy cpuset");
			entry = following;
			continue;
		}

		/* The nodes leave the caller's mask (if any) and rejoin the pool. */
		if (maskp != NULL) {
			BITFIELD_CLRM(maskp, &(entry->nodes));
		}
		BITFIELD_SETM(&nodepool, &(entry->nodes));

		(void)sprintf(log_buffer, "stuck cpuset %s reclaimed", entry->name);
		log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, LOG_INFO, __func__, log_buffer);

#ifdef	DEBUG
		(void)sprintf(log_buffer, "nodepool now %s", bitfield2hex(&nodepool));
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB, LOG_INFO, __func__, log_buffer);
#endif	/* DEBUG */

		/* Drop the element from the list; give up on list trouble. */
		if (remove_from_cpusetlist(listp, NULL, entry->name, NULL) != 0)
			break;

		reclaimed++;
		entry = following;
	}

	/*
	 * Consistency check: once the list is empty, a supplied mask should
	 * have no bits left set.  Complain if it does.
	 */
	if (maskp != NULL && *listp == NULL) {
		if (!BITFIELD_IS_ZERO(maskp))
			log_err(-1, __func__, "NULL cpusetlist but mask not empty!");
	}

	return reclaimed;
}
Example #3
0
/*
 * find_nodemasks - recompute the nodes a queue may still hand out.
 *
 * queue->availmask is rebuilt as (queue->queuemask AND the availmask of
 * queue->rsrcs), minus the nodemask of every job on the queue whose state
 * is 'R'.  The nodes held by those running jobs are also OR'ed into
 * rsrcs->nodes_used.  The job list is only walked when queue->running is
 * nonzero.
 *
 * Always returns 0.
 */
static int
find_nodemasks(Queue *queue, Resources *rsrcs)
  {
  Bitfield held;   /* Union of the nodemasks of running jobs. */
  Job *cur;

  /* Nodes assigned to this queue that the queue's resources report available. */
  BITFIELD_CPY(&queue->availmask, &(queue->queuemask));
  BITFIELD_ANDM(&queue->availmask, &(queue->rsrcs->availmask));

  /* Gather the nodes claimed by running jobs, if the queue has any. */
  BITFIELD_CLRALL(&held);

  cur = queue->running ? queue->jobs : NULL;

  while (cur != NULL)
    {
    if (cur->state == 'R')
      {
      BITFIELD_SETM(&held, &(cur->nodemask));
      }

    cur = cur->next;
    }

  /* Claimed nodes are no longer available, and count as used. */
  BITFIELD_CLRM(&queue->availmask, &held);
  BITFIELD_SETM(&rsrcs->nodes_used, &held);

  return 0;
  }
Example #4
0
/**
 * @brief
 * 	query_cpusets()
 * 	Ask for a list of cpusets currently running on the system.  When a
 * 	list pointer is supplied, each cpuset found is appended to that list
 * 	and, if a Bitfield pointer was also given, the nodes of that cpuset
 * 	are OR'ed into the bitfield.  The input bitfield is not cleared.
 *
 * @param[in,out] listp - list to append cpusets to (may be NULL)
 * @param[in,out] maskp - bit mask to add cpuset nodes to (may be NULL)
 *
 * @par Note:
 *	Only names whose status is CPUSET_QUEUE_NAME are considered.
 *	The mask is only updated when listp is non-NULL, because the nodes of
 *	a cpuset are only looked up while adding it to the list.  With a NULL
 *	listp the function simply counts the cpusets.
 *
 * @par Note:
 *	When listp is non-NULL, a cpuset whose nodes cannot be looked up, or
 *	for which add_to_cpusetlist() returns a negative value, is skipped and
 *	not counted.  A positive return from add_to_cpusetlist() stops the
 *	scan; the count accumulated so far is returned.
 *
 * @return	int
 * @retval	-1			error(with errno left as set by sysmp())
 * @retval	count of cpuset
 */
int
query_cpusets(cpusetlist **listp, Bitfield *maskp)
{
	cpuset_NameList_t *names;
	char		qname[QNAME_STRING_LEN + 1];
	int			i, ret, count = 0;
	Bitfield		nodes;

	if (sysmp(MP_NPROCS) < 1) {
		log_err(errno, __func__, "sysmp(MP_NPROCS)");
		return -1;			/* "This can't happen." */
	}

	/* Get the list of names else print error & exit */
	if ((names = cpusetGetNameList()) == NULL) {
		log_err(errno, __func__, "cpusetGetNameList");
		return (-1);
	}

	for (i = 0; i < names->count; i++) {

		if (names->status[i] != CPUSET_QUEUE_NAME)
			continue;

		if (listp) {	/* Add to supplied list? */

			/* Bounded copy; terminate explicitly since strncpy may not. */
			(void)strncpy(qname, names->list[i], QNAME_STRING_LEN);
			qname[QNAME_STRING_LEN] = '\0';

			/* Query the kernel for the nodes for this cpuset. */
			if (cpuset2bitfield(qname, &nodes))
				continue;

			ret = add_to_cpusetlist(listp, qname, &nodes, NULL);

			if (ret < 0)	/* Cpuset not found -- race condition? */
				continue;

			if (ret > 0)	/* Error in list manipulation - give up. */
				break;

			/* Add the nodes for this cpuset into the specified bitmask. */
			if (maskp)
				BITFIELD_SETM(maskp, &nodes);
		}

		count ++;
	}

	/*
	 * The name list was allocated by cpusetGetNameList(); release it on
	 * every exit path from the loop so repeated queries do not leak.
	 */
	cpusetFreeNameList(names);

	return count;
}
Example #5
0
/*
 * Find an entry for the resources for the requested host in the list of
 * existing resources, or create a new one for that host and return it.
 *
 * When no cached entry exists, the resource monitor on 'exechost' is
 * queried for loadave, availmem, physmem and ncpus (plus availmask when
 * built with NODEMASK, and the HPM mode when schd_MANAGE_HPM is set).
 * While talking to the resource monitor, connect_interrupt is installed
 * as the SIGALRM handler and an alarm of GETRSRCS_CONNECT_TIME seconds is
 * armed; the previous handler and any previously pending alarm are
 * restored afterwards.  The new entry is appended to schd_RsrcsList.
 *
 * Returns the cached or newly created Resources, or NULL if memory could
 * not be allocated, the resource monitor could not be contacted, or any
 * reply was missing or unparseable.
 */
Resources *
schd_get_resources(char *exechost)
{
    char   *id = "schd_get_resources";
    Resources *rptr, *new_rsrcs;
    int     rm;

    char   *response = NULL; /* Outstanding reply; NULL when none is owned. */
    int     badreply   = 0;
    int     cpus_avail = 0;
    size_t  pmem_avail = 0;

    char    hpm_ctl[64];

    struct sigaction act, oact;
    int     handler_set = 0; /* Nonzero once our SIGALRM handler is installed. */

    unsigned int remain; /* Time remaining in any old alarm(). */
    time_t then;  /* When this alarm() was started. */
    time_t elapsed; /* Seconds that passed since 'then'. */

#ifdef NODEMASK
    Bitfield cpy;
    int     i, j;
#endif /* NODEMASK */

    /*
     * Check for a local copy of the resources being available already.
     * If so, just return a reference to that Resources structure.
     */

    for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next)
    {
        if (strcmp(rptr->exechost, exechost) == 0)
            return (rptr);
    }

    schd_timestamp("get_rsrcs");

    /*
     * No cached resource information for 'exechost'.  Need to query the
     * host for its information.
     */

    if ((new_rsrcs = (Resources *)malloc(sizeof(Resources))) == NULL)
    {
        (void)sprintf(log_buffer, "Unable to alloc space for Resources.");
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));

        return (NULL); /* Can't get the information - nowhere to store it. */
    }

    memset((void *)new_rsrcs, 0, sizeof(Resources));

    act.sa_flags = 0;
    act.sa_handler = connect_interrupt;
    sigemptyset(&act.sa_mask);
    remain = 0;
    then = 0;

    /*
     * Set the alarm, and maintain some idea of how long was left on any
     * previously set alarm.  Remember whether the handler was actually
     * installed so that cleanup only undoes what was done here.
     */

    if (sigaction(SIGALRM, &act, &oact) == 0)
    {
        handler_set = 1;
        remain = alarm(GETRSRCS_CONNECT_TIME);
        then = time(NULL);
    }

    if ((rm = openrm(exechost, 0)) == -1)
    {
        (void)sprintf(log_buffer,
                      "Unable to contact resmom@%s (%d)", exechost, pbs_errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

        badreply = 1;
        goto bail;
    }

    /*
     * Turn off full response.  Responses will be received in the order in
     * which they are sent.
     */
    fullresp(0);

    /* Build a list of all the resources about which we want information. */

    addreq(rm, "loadave");

    addreq(rm, "availmem");

    addreq(rm, "physmem");

    addreq(rm, "ncpus");

#ifdef NODEMASK
    addreq(rm, "availmask");

#endif /* NODEMASK */

    if (schd_MANAGE_HPM)
    {
        (void)sprintf(hpm_ctl, HPM_CTL_FORMAT_STR, HPM_CTL_QUERY_STR);
        addreq(rm, hpm_ctl);
    }

    /*
     * Get the values back from the resource monitor, and round up.
     *
     * Ownership rule: every reply is free()'d as soon as it has been
     * consumed and 'response' is reset to NULL, so the single free() at
     * 'bail' can never release the same buffer twice.
     */

    /* Receive LOADAVE response from resource monitor. */
    response = getreq(rm);

    if (response != NULL)
    {
        new_rsrcs->loadave = atof(response) * schd_FAKE_MACH_MULT;
        (void)free(response);
        response = NULL;
    }
    else
    {
        (void)sprintf(log_buffer, "bad return from getreq(loadave), %d, %d",
                      pbs_errno, errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        badreply = 1;
        goto bail;
    }

    /* Receive AVAILMEM response from resource monitor. */
    response = getreq(rm);

    if (response != NULL)
    {
        new_rsrcs->freemem = schd_val2byte(response);
        new_rsrcs->freemem *= schd_FAKE_MACH_MULT;
        (void)free(response);
        response = NULL;
    }
    else
    {
        (void)sprintf(log_buffer, "bad return from getreq(freemem), %d, %d",
                      pbs_errno, errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        badreply = 1;
        goto bail;
    }

    /* Receive PHYSMEM response from resource monitor. */
    response = getreq(rm);

    if (response != NULL)
    {
        pmem_avail = schd_val2byte(response);
        pmem_avail *= schd_FAKE_MACH_MULT;
        (void)free(response);
        response = NULL;
    }
    else
    {
        (void)sprintf(log_buffer, "bad return from getreq(realmem), %d, %d",
                      pbs_errno, errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        badreply = 1;
        goto bail;
    }

    /* Receive NCPUS response from resource monitor. */
    response = getreq(rm);

    if (response != NULL)
    {
        cpus_avail = atoi(response) * schd_FAKE_MACH_MULT;
        (void)free(response);
        response = NULL;
    }
    else
    {
        (void)sprintf(log_buffer, "bad return from getreq(ncpus), %d, %d",
                      pbs_errno, errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        badreply = 1;
        goto bail;
    }

#ifdef NODEMASK
    /* Receive available nodes from resource monitor. */
    response = getreq(rm);

    if (response == NULL)
    {
        (void)sprintf(log_buffer, "bad return from getreq(availmask), %d, %d",
                      pbs_errno, errno);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        badreply = 1;
        goto bail;
    }
    else
    {
        /* Try the "bits" format first, then fall back to the string form. */
        if (schd_bits2mask(response, &new_rsrcs->availmask) != 0)
        {
            if (schd_str2mask(response, &new_rsrcs->availmask) != 0)
            {
                (void)sprintf(log_buffer, "can't parse availmask '%s'", response);
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
                badreply = 1;
                goto bail; /* 'response' is released at bail. */
            }
        }

        (void)free(response);
        response = NULL;
    }

#endif /* NODEMASK */

    if (schd_MANAGE_HPM)
    {
        /* Receive HPM_CTL response from resource monitor. */
        response = getreq(rm);

        if (response != NULL)
        {
            if (strcmp(response, HPM_CTL_USERMODE_STR) == 0)
                new_rsrcs->flags |= RSRCS_FLAGS_HPM_USER;
            else if (strcmp(response, HPM_CTL_GLOBALMODE_STR) == 0)
                new_rsrcs->flags &= ~RSRCS_FLAGS_HPM_USER;
            else
            {
                (void)sprintf(log_buffer, "bad response '%s' for '%s@%s'",
                              response, hpm_ctl, exechost);
                log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id,
                           log_buffer);
                badreply = 1;
                goto bail;
            }
        }
        else
        {
            (void)sprintf(log_buffer, "bad return from getreq(%s), %d, %d",
                          hpm_ctl, pbs_errno, errno);
            log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
            badreply = 1;
            goto bail;
        }
    }

    /*
     * NOTE: a reply still held in 'response' (the HPM reply, or one that
     * failed to parse) is free()'d in bail.
     */

bail:
    free(response); /* free(NULL) is a no-op. */
    response = NULL;

    /* Disconnect from the resource monitor. */
    if (rm >= 0)  /* resmom handle "0" is valid in RPP. */
        closerm(rm);

    /*
     * Unset the alarm and handler, but only if they were set up above.
     * If sigaction() failed, 'oact' was never filled in and any pending
     * alarm belongs to someone else, so leave both alone.
     */
    if (handler_set)
    {
        alarm(0);

        sigaction(SIGALRM, &oact, &act);
    }

    /* Reset the old alarm, taking into account how much time has passed. */
    if (remain)
    {
        elapsed = time(NULL) - then;

        DBPRT(("%s: old alarm had %d secs remaining, %d elapsed, ", id,
               (int)remain, (int)elapsed));

        if (elapsed < 0)
            elapsed = 0; /* Clock went backwards; treat as no time passed. */

        /*
         * How much time remains even after the time spent above?  'remain'
         * is unsigned, so compare before subtracting rather than letting
         * it wrap.  Would the previous time have already expired?  If so,
         * schedule an alarm call in 1 second (close enough, hopefully).
         */
        if ((time_t)remain > elapsed)
            remain -= (unsigned int)elapsed;
        else
            remain = 1;

        DBPRT(("reset to %d secs\n", (int)remain));

        alarm(remain);
    }

    /*
     * Verify all the data came back as expected; if not, abort this
     * iteration of the scheduler.
     */
    if (badreply)
    {
        (void)sprintf(log_buffer,
                      "Got bad info from mom@%s - aborting sched run", exechost);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));

        free(new_rsrcs);
        return (NULL);
    }

    /* Make a copy of the hostname for the resources struct. */
    new_rsrcs->exechost = schd_strdup(exechost);

    if (new_rsrcs->exechost == NULL)
    {
        (void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs",
                      exechost);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
        DBPRT(("%s: %s\n", id, log_buffer));

        free(new_rsrcs);
        return (NULL);
    }

    new_rsrcs->nodes_total = NODES_REQD(cpus_avail, pmem_avail);

#ifdef NODEMASK
    /* Copy the availmask schd_FAKE_MACH_MULT times to match avail cpus. */
    BITFIELD_CPY(&cpy, &(new_rsrcs->availmask));

    for (i = 2; i <= schd_FAKE_MACH_MULT; i++)
    {
        for (j = 0; j < (cpus_avail / schd_FAKE_MACH_MULT / 2); j++)
            BITFIELD_SHIFTL(&cpy);

        BITFIELD_SETM(&(new_rsrcs->availmask), &cpy);
    }

#endif /* NODEMASK */

    if (schd_RsrcsList == NULL)
    {
        schd_RsrcsList  = new_rsrcs; /* Start the list. */
    }
    else
    {
        for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next)
            /* Find the last element in the list. */ ;

        rptr->next = new_rsrcs;
    }

    /* Next pointer for the tail of the list points to nothing. */
    new_rsrcs->next = NULL;

    return (new_rsrcs);
}
Example #6
0
/*
 * schd_alloc_nodes - pick 'request' nodes out of a queue's available mask.
 *
 * Works on a private copy of queue->availmask; the queue itself is not
 * modified.  While nodes are still needed, the widest run of contiguous
 * bits (no wider than the number still needed) that is fully available is
 * searched for, sliding a window from the most significant available bit
 * down towards the least significant one.  The first fitting window is
 * claimed, and the search restarts for whatever is still missing.
 *
 * On success the chosen bits are copied into *maskp and the number of
 * nodes allocated (equal to 'request') is returned.  Returns 0, leaving
 * *maskp untouched, if 'request' is not positive, exceeds the number of
 * available nodes, or could not be satisfied.
 */
int schd_alloc_nodes(int request, Queue *queue, Bitfield *maskp)
  {
  char   *id = "schd_alloc_nodes";
  Bitfield free_nodes;   /* Scratch copy of the queue's available nodes. */
  Bitfield chosen;       /* Nodes picked so far. */
  Bitfield window;       /* Sliding run of contiguous candidate bits. */
  int needed;            /* Nodes still to be allocated. */
  int granted;           /* Nodes allocated so far. */
  int hi_bit;
  int lo_bit;
  int run_len;
  int slot;
  int slots;

  /* Reject requests that can never be met. */
  if (request <= 0)
    return 0;

  if (request > BITFIELD_NUM_ONES(&(queue->availmask)))
    return 0;

  BITFIELD_CPY(&free_nodes, &(queue->availmask));
  BITFIELD_CLRALL(&chosen);

  granted = 0;
  needed  = request;

  while (needed > 0)
    {
    /* Bounds of the region that still contains available nodes. */
    hi_bit = BITFIELD_MS_ONE(&free_nodes);
    lo_bit = BITFIELD_LS_ONE(&free_nodes);

    /* Try the widest useful run first, then progressively narrower ones. */
    run_len = needed;

    while (run_len > 0)
      {
      /* Build a run of 'run_len' bits anchored at the highest free bit. */
      BITFIELD_CLRALL(&window);

      for (slot = 0; slot < run_len; slot++)
        BITFIELD_SETB(&window, hi_bit - slot);

      /*
       * Number of distinct positions the window can occupy between
       * hi_bit and lo_bit inclusive, counting the starting position.
       */
      slots = (hi_bit + 1 - lo_bit) - run_len + 1;

      /* Slide right one bit at a time until the window fits entirely. */
      for (slot = 0; slot < slots; slot++)
        {
        if (BITFIELD_TSTALLM(&free_nodes, &window))
          {
          break;
          }

        BITFIELD_SHIFTR(&window);
        }

      if (slot < slots)
        {
        /* Fit found: claim these nodes and go look for the remainder. */
        BITFIELD_SETM(&chosen,     &window);
        BITFIELD_CLRM(&free_nodes, &window);

        granted += run_len;
        needed  -= run_len;

        break;
        }

      run_len--;
      }

    /* Not even a single bit could be placed - stop trying. */
    if (run_len == 0)
      {
      DBPRT(("%s: couldn't find any contiguous bits (even one!)\n", id));
      break;
      }
    }

  /* Anything short of a complete allocation counts as failure. */
  if (needed != 0 || granted != request)
    return 0;

  BITFIELD_CPY(maskp, &chosen);
  DBPRT(("%s: mask %s\n", id,
         schd_format_nodemask(&queue->queuemask, maskp)));

  return granted;
  }