Exemple #1
0
/*
 * set_proc_ct - recompute the "procct" resource from "procs" and "nodes".
 *
 * "procct" (created in pattr when missing) is set to the long value held by
 * pprocsp, plus count_proc() of the "nodes" spec when pattr has a "nodes"
 * entry.
 *
 * pprocsp  (I) resource entry supplying the "procs" count (at_long)
 * pattr    (I) attribute whose resource entries are searched and updated
 * actmode  (I) action mode; ATR_ACTION_RECOV returns immediately
 *
 * Returns 0 on success, PBSE_SYSTEM when a resource definition cannot be
 * found or the "procct" entry cannot be added.
 */
int set_proc_ct(

  resource      *pprocsp,  /* I */
  pbs_attribute *pattr,    /* I */
  int            actmode)  /* I */

  {
  resource *pnodesp;
  resource_def *pndef;
  resource *ppct;
  resource_def *ppdef;

  if (actmode == ATR_ACTION_RECOV)
    {
    /* SUCCESS - nothing to recompute when recovering */

    return(0);
    }

  /* set "procct" to count of processors in "nodes" plus "procs" */

  ppdef = find_resc_def(svr_resc_def, "procct", svr_resc_size);

  if (ppdef == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((ppct = find_resc_entry(pattr, ppdef)) == NULL)
    {
    if ((ppct = add_resource_entry(pattr, ppdef)) == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }

  pndef = find_resc_def(svr_resc_def, "nodes", svr_resc_size);

  if (pndef == NULL)
    {
    return(PBSE_SYSTEM);
    }

  /* start from the "procs" value ... */
  ppct->rs_value.at_val.at_long = pprocsp->rs_value.at_val.at_long;

  if ((pnodesp = find_resc_entry(pattr, pndef)) != NULL)
    {
    /* ... and add the processors requested through the "nodes" spec.
     * The count_proc() result used to be discarded here, so "procct"
     * never reflected the nodes request. */
    ppct->rs_value.at_val.at_long +=
      count_proc(pnodesp->rs_value.at_val.at_str);
    }

  ppct->rs_value.at_flags |= ATR_VFLAG_SET;

  return(0);
  }  /* END set_proc_ct() */
/*
 * set_resc_assigned - apply a job's tracked resources to the server and
 * queue "resources assigned" attributes.
 *
 * For every resource in the job's resource list whose definition carries
 * ATR_DFLAG_RASSN and whose value is set, the definition's rs_set routine is
 * invoked with `op` on the matching server entry and on the matching queue
 * entry (entries are created when missing).  The JOB_SVFLG_RescAssn flag on
 * the job guards against applying the same operation twice.
 *
 * pjob  (I) job whose resources are applied; NULL is ignored
 * op    (I) INCR or DECR; any other value is rejected
 *
 * The queue returned by get_jobs_queue() is released with unlock_queue() on
 * every path that obtained it.
 */
void set_resc_assigned(

  job *pjob,         /* I */
  enum batch_op op)  /* INCR or DECR */

  {
  pbs_attribute *srv_assn;
  pbs_attribute *que_assn;
  resource      *cur;
  resource      *target;
  resource_def  *def;
  pbs_queue     *pque;
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  if (pjob == NULL)
    return;

  pque = get_jobs_queue(&pjob);

  if (pque == NULL)
    {
    /* get_jobs_queue() receives &pjob and may have cleared it */
    if (pjob == NULL)
      {
      log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 9");
      }

    return;
    }

  if (pque->qu_qs.qu_type != QTYPE_Execution)
    {
    snprintf(log_buf,sizeof(log_buf),
      "job %s isn't in an execution queue, can't modify resources\njob is in queue %s",
      pjob->ji_qs.ji_jobid,
      pque->qu_qs.qu_name);
    log_err(-1, __func__, log_buf);

    unlock_queue(pque, __func__, NULL, LOGLEVEL);
    return;
    }

  switch (op)
    {
    case INCR:

      if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_RescAssn)
        {
        /* already added in */
        unlock_queue(pque, __func__, NULL, LOGLEVEL);
        return;
        }

      pjob->ji_qs.ji_svrflags |= JOB_SVFLG_RescAssn;

      break;

    case DECR:

      if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_RescAssn) == 0)
        {
        /* not currently included */
        unlock_queue(pque, __func__, NULL, LOGLEVEL);
        return;
        }

      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_RescAssn;

      break;

    default:

      /* invalid op */
      unlock_queue(pque, __func__, NULL, LOGLEVEL);
      return;
    }

  srv_assn = &server.sv_attr[SRV_ATR_resource_assn];
  que_assn = &pque->qu_attr[QE_ATR_ResourceAssn];

  for (cur = (resource *)GET_NEXT(pjob->ji_wattr[JOB_ATR_resource].at_val.at_list);
       cur != NULL;
       cur = (resource *)GET_NEXT(cur->rs_link))
    {
    def = cur->rs_defin;

    /* skip resources that are not tracked or carry no value */
    if (((def->rs_flags & ATR_DFLAG_RASSN) == 0) ||
        ((cur->rs_value.at_flags & ATR_VFLAG_SET) == 0))
      {
      continue;
      }

    /* server-wide total */
    target = find_resc_entry(srv_assn, def);

    if (target == NULL)
      {
      target = add_resource_entry(srv_assn, def);
      }

    if (target == NULL)
      {
      unlock_queue(pque, __func__, "sysru", LOGLEVEL);
      return;
      }

    def->rs_set(&target->rs_value, &cur->rs_value, op);

    /* per-queue total */
    target = find_resc_entry(que_assn, def);

    if (target == NULL)
      {
      target = add_resource_entry(que_assn, def);
      }

    if (target == NULL)
      {
      unlock_queue(pque, __func__, "queru", LOGLEVEL);
      return;
      }

    def->rs_set(&target->rs_value, &cur->rs_value, op);
    }  /* END for (cur) */

  unlock_queue(pque, __func__, "success", LOGLEVEL);

  return;
  }  /* END set_resc_assigned() */
Exemple #3
0
/*
 * set_mppnodect - derive the "mppnodect" resource from "mppwidth" and
 * "mppnppn".
 *
 * When "mppnppn" exceeds a non-zero "mppwidth" it is clamped to the width
 * (and written back).  "mppnodect" is created if missing and set to
 * ceil(width / nppn) when nppn > 1, to width otherwise, or to -1 when either
 * source resource is absent.
 *
 * Returns PBSE_NONE on success, PBSE_SYSTEM when "mppnodect" has no
 * definition or its entry cannot be added.
 */
int set_mppnodect(

  resource      * UNUSED(res),
  pbs_attribute *attr,
  int             UNUSED(op))

  {
  resource_def *def;
  resource     *entry = NULL;
  int           width = 0;
  int           nppn = 0;
  int           got_width = 0;
  int           got_nppn = 0;
  int           nodect;

  /* current width, if any */

  def = find_resc_def(svr_resc_def,"mppwidth",svr_resc_size);

  if (def != NULL)
    {
    entry = find_resc_entry(attr,def);

    if (entry != NULL)
      {
      width = entry->rs_value.at_val.at_long;
      got_width = 1;
      }
    }

  /* current procs-per-node, if any */

  def = find_resc_def(svr_resc_def,"mppnppn",svr_resc_size);

  if (def != NULL)
    {
    entry = find_resc_entry(attr,def);

    if (entry != NULL)
      {
      nppn = entry->rs_value.at_val.at_long;
      got_nppn = 1;

      /* a request narrower than one node caps nppn at the width */
      if ((width != 0) && (nppn > width))
        {
        nppn = width;
        entry->rs_value.at_val.at_long = nppn;
        entry->rs_value.at_flags |= ATR_VFLAG_SET;
        }
      }
    }

  /* estimate of nodes needed: round up width / nppn */

  nodect = (nppn > 1) ? ((width + nppn - 1) / nppn) : width;

  /* locate or create the "mppnodect" entry */

  def = find_resc_def(svr_resc_def,"mppnodect",svr_resc_size);

  if (def == NULL)
    {
    return(PBSE_SYSTEM);
    }

  entry = find_resc_entry(attr,def);

  if (entry == NULL)
    {
    entry = add_resource_entry(attr,def);

    if (entry == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }

  /* store the estimate, or -1 when it could not be computed */

  entry->rs_value.at_val.at_long = (got_width && got_nppn) ? nodect : -1;
  entry->rs_value.at_flags |= ATR_VFLAG_SET;

  return(PBSE_NONE);
  } /* END set_mppnodect() */
Exemple #4
0
/*
 * set_node_ct - refresh the resources derived from a "nodes" request.
 *
 * From the spec string held by pnodesp this sets, in pattr:
 *   "nodect"    = ctnodes(spec)
 *   "neednodes" = spec, decoded through the definition's rs_decode
 *   "procct"    = count_proc(spec), plus the "procs" value when present
 * Missing entries are created.
 *
 * pnodesp  (I) the "nodes" resource entry (at_str holds the spec)
 * pattr    (I) attribute whose resource entries are searched and updated
 * actmode  (I) action mode; ATR_ACTION_RECOV returns immediately
 *
 * Returns 0 on success, PBSE_SYSTEM when a definition is missing or an entry
 * cannot be added.
 */
int set_node_ct(

  resource      *pnodesp,  /* I */
  pbs_attribute *pattr,    /* I */
  int            actmode)  /* I */

  {
  resource_def *def;
  resource     *entry;
  resource_def *procct_def;
  resource     *procct = NULL;
  resource_def *procs_def;
  resource     *procs;
  long          procs_requested;

  if (actmode == ATR_ACTION_RECOV)
    return(0);

  /* ---- "nodect": number of nodes named by the spec ---- */

  def = find_resc_def(svr_resc_def, "nodect", svr_resc_size);

  if (def == NULL)
    return(PBSE_SYSTEM);

  entry = find_resc_entry(pattr, def);

  if (entry == NULL)
    {
    entry = add_resource_entry(pattr, def);

    if (entry == NULL)
      return(PBSE_SYSTEM);
    }

  entry->rs_value.at_val.at_long = ctnodes(pnodesp->rs_value.at_val.at_str);
  entry->rs_value.at_flags |= ATR_VFLAG_SET;

  /* ---- "neednodes": copy of "nodes", may be altered by scheduler ---- */

  def = find_resc_def(svr_resc_def, "neednodes", svr_resc_size);

  if (def == NULL)
    return(PBSE_SYSTEM);

  entry = find_resc_entry(pattr, def);

  if (entry != NULL)
    {
    /* release the previous value before decoding the new one */
    def->rs_free(&entry->rs_value);
    }
  else
    {
    entry = add_resource_entry(pattr, def);

    if (entry == NULL)
      return(PBSE_SYSTEM);
    }

  def->rs_decode(&entry->rs_value, NULL, NULL, pnodesp->rs_value.at_val.at_str, ATR_DFLAG_ACCESS);
  entry->rs_value.at_flags |= ATR_VFLAG_SET;

  /* ---- "procct": processors in "nodes" plus "procs" ---- */

  procct_def = find_resc_def(svr_resc_def, "procct", svr_resc_size);

  if (procct_def == NULL)
    return(PBSE_SYSTEM);

  procct = find_resc_entry(pattr, procct_def);

  if (procct == NULL)
    {
    procct = add_resource_entry(pattr, procct_def);

    if (procct == NULL)
      return(PBSE_SYSTEM);
    }

  procs_def = find_resc_def(svr_resc_def, "procs", svr_resc_size);

  if (procs_def == NULL)
    return(PBSE_SYSTEM);

  procs = find_resc_entry(pattr, procs_def);

  procs_requested = (procs != NULL) ? procs->rs_value.at_val.at_long : 0;

  procct->rs_value.at_val.at_long =
    procs_requested + count_proc(pnodesp->rs_value.at_val.at_str);
  procct->rs_value.at_flags |= ATR_VFLAG_SET;

  return(0);
  }  /* END set_node_ct() */
Exemple #5
0
/*
 * procct_failure - log "<id>: <msg>" as a job event for pjob, set pbs_errno
 * to PBSE_INTERNAL and hand back ROUTE_PERM_FAILURE for the caller to return.
 */
static int procct_failure(job *pjob, char *id, char *msg)
{
    sprintf(log_buffer, "%s: %s", id, msg);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
    pbs_errno = PBSE_INTERNAL;
    return(ROUTE_PERM_FAILURE);
}

/* int initialize_procct - set the procct resource in the job's
 * Resource_List
 *
 * procct is set to count_proc() of the "nodes" spec (when a nodes entry
 * exists) plus the value of "procs" (when a procs entry exists).
 *
 * When neither nodes nor procs is present, an already existing procct entry
 * is left untouched and the function succeeds; previously execution fell
 * through and overwrote that entry with 0.
 *
 * Assumes the nodes resource has been set on the Resource_List. This should
 * have been done in req_quejob with the set_nodes_attr() function or in
 * set_node_ct and/or set_proc_ct.
 *
 * Returns PBSE_NONE (0) on success. On failure the reason is logged,
 * pbs_errno is set to PBSE_INTERNAL and ROUTE_PERM_FAILURE is returned.
 */
int initialize_procct(job *pjob)
{
    char id[] = "initialize_procct";
    resource     *pnodesp = NULL;
    resource_def *pnodes_def = NULL;
    resource     *pprocsp = NULL;
    resource_def *pprocs_def = NULL;
    resource     *procctp = NULL;
    resource_def *procct_def = NULL;
    pbs_attribute    *pattr = NULL;

    /* The address of an array element is never NULL, so only the SET flag
       needs checking */
    pattr = &pjob->ji_wattr[JOB_ATR_resource];

    if((pattr->at_flags & ATR_VFLAG_SET) == 0)
    {
        /* Something is really wrong. ji_wattr[JOB_ATR_resource] should always be set
           by the time this function is called */
        return(procct_failure(pjob, id, "Resource_List not set. Cannot proceed"));
    }

    /* get the node spec from the nodes resource */
    pnodes_def = find_resc_def(svr_resc_def, "nodes", svr_resc_size);
    if(pnodes_def == NULL)
    {
        return(procct_failure(pjob, id, "Could not get nodes resource definition. Cannot proceed"));
    }
    pnodesp = find_resc_entry(pattr, pnodes_def);

    /* Get the procs entry if the procs resource is defined. A missing
       definition or entry is not an error; pprocsp simply stays NULL */
    pprocs_def = find_resc_def(svr_resc_def, "procs", svr_resc_size);
    if(pprocs_def != NULL)
    {
        pprocsp = find_resc_entry(pattr, pprocs_def);
    }

    /* Both remaining paths need the procct definition and current entry */
    procct_def = find_resc_def(svr_resc_def, "procct", svr_resc_size);
    if(procct_def == NULL)
    {
        return(procct_failure(pjob, id, "Could not get procct resource definition. Cannot proceed"));
    }
    procctp = find_resc_entry(pattr, procct_def);

    /* if neither pnodesp nor pprocsp are set, terminate */
    if(pnodesp == NULL && pprocsp == NULL)
    {
        /* nodes and procs were not set. Hopefully req_quejob set procct to 1 for us already */
        if(procctp == NULL)
        {
            return(procct_failure(pjob, id, "Could not get nodes nor procs entry from Resource_List. Cannot proceed"));
        }

        /* There is nothing to recompute from; keep the existing procct value */
        return(PBSE_NONE);
    }

    if(procctp == NULL)
    {
        procctp = add_resource_entry(pattr, procct_def);
        if(procctp == NULL)
        {
            return(procct_failure(pjob, id, "Could not add procct resource. Cannot proceed"));
        }
    }

    /* Finally the moment of truth. We have the nodes and/or procs resources.
       Add them up into the procct resource */
    procctp->rs_value.at_val.at_long = 0;
    if(pnodesp != NULL)
    {
        procctp->rs_value.at_val.at_long = count_proc(pnodesp->rs_value.at_val.at_str);
    }

    if(pprocsp != NULL)
    {
        procctp->rs_value.at_val.at_long += pprocsp->rs_value.at_val.at_long;
    }
    procctp->rs_value.at_flags |= ATR_VFLAG_SET;

    return(PBSE_NONE);
} /* END initialize_procct */
Exemple #6
0
/**
 * @brief
 *      Update the job attribute for resources used.
 *
 *      The first time this is called for a job, set up resource entries for
 *      each resource that can be reported for this machine (ncpus, cput,
 *      cpupercent, vmem, walltime, mem).  On every call, refresh cput, vmem,
 *      walltime and mem; cput, vmem and mem never decrease because the new
 *      sample is combined with the stored value through MAX().
 *
 *      Nothing is updated while the job has JOB_SVFLG_Suspend set.
 *
 *      Assumes that the session ID attribute has already been set.
 *
 * @param[in,out] pjob - job whose JOB_ATR_resc_used attribute is updated;
 *                       must not be NULL (asserted)
 *
 * @return int
 * @retval PBSE_NONE    for success.
 */
int 
mom_set_use(job *pjob)
{
	resource		*pres;
	resource		*pres_req;	/* requested "ncpus" entry */
	attribute		*at;
	attribute		*at_req;	/* list of resources requested */
	resource_def		*rd;
	u_Long 			*lp_sz, lnum_sz;
	unsigned long		*lp, lnum, oldcput;
	long			 dur;
	long                     ncpus_req;


	assert(pjob != NULL);
	at = &pjob->ji_wattr[(int)JOB_ATR_resc_used];
	assert(at->at_type == ATR_TYPE_RESC);

	if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) != 0)
		return (PBSE_NONE);	/* job suspended, don't track it */

	DBPRT(("%s: entered %s\n", __func__, pjob->ji_qs.ji_jobid))

	at->at_flags |= ATR_VFLAG_MODIFY;
	if ((at->at_flags & ATR_VFLAG_SET) == 0) {
		/* first call for this job: create every reported entry */
		at->at_flags |= ATR_VFLAG_SET;

		rd = find_resc_def(svr_resc_def, "ncpus", svr_resc_size);
		assert(rd != NULL);
		pres = add_resource_entry(at, rd);
		assert(pres != NULL);
		pres->rs_value.at_flags |= ATR_VFLAG_SET;
		pres->rs_value.at_type = ATR_TYPE_LONG;
		/*
		 * get pointer to list of resources *requested* for the job
		 * so the ncpus used can be set to ncpus requested
		 */
		at_req = &pjob->ji_wattr[(int)JOB_ATR_resource];
		/* NOTE(review): this re-checks `at`; `at_req` was probably intended */
		assert(at->at_type == ATR_TYPE_RESC);

		pres_req = find_resc_entry(at_req, rd);
		if ((pres_req != NULL) &&
			((ncpus_req=pres_req->rs_value.at_val.at_long) !=0))
				pres->rs_value.at_val.at_long = ncpus_req;
		else
			pres->rs_value.at_val.at_long = 0;


		rd = find_resc_def(svr_resc_def, "cput", svr_resc_size);
		assert(rd != NULL);
		pres = add_resource_entry(at, rd);
		assert(pres != NULL);
		pres->rs_value.at_flags |= ATR_VFLAG_SET;
		pres->rs_value.at_type = ATR_TYPE_LONG;
		pres->rs_value.at_val.at_long = 0;

		rd = find_resc_def(svr_resc_def, "cpupercent", svr_resc_size);
		assert(rd != NULL);
		pres = add_resource_entry(at, rd);
		assert(pres != NULL);
		pres->rs_value.at_flags |= ATR_VFLAG_SET;
		pres->rs_value.at_type = ATR_TYPE_LONG;
		pres->rs_value.at_val.at_long = 0;

		rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size);
		assert(rd != NULL);
		pres = add_resource_entry(at, rd);
		assert(pres != NULL);
		pres->rs_value.at_flags |= ATR_VFLAG_SET;
		pres->rs_value.at_type = ATR_TYPE_SIZE;
		pres->rs_value.at_val.at_size.atsv_shift = 10; /* in KB */
		pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ;

		rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size);
		assert(rd != NULL);
		pres = add_resource_entry(at, rd);
		assert(pres != NULL);
		pres->rs_value.at_flags |= ATR_VFLAG_SET;
		pres->rs_value.at_type = ATR_TYPE_LONG;

		rd = find_resc_def(svr_resc_def, "mem", svr_resc_size);
		assert(rd != NULL);
		pres = add_resource_entry(at, rd);
		assert(pres != NULL);
		pres->rs_value.at_flags |= ATR_VFLAG_SET;
		pres->rs_value.at_type = ATR_TYPE_SIZE;
		pres->rs_value.at_val.at_size.atsv_shift = 10; /* in KB */
		pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ;
	}

	/* cput: keep the larger of the stored value and the fresh sum */
	rd = find_resc_def(svr_resc_def, "cput", svr_resc_size);
	assert(rd != NULL);
	pres = find_resc_entry(at, rd);
	assert(pres != NULL);
	lp = (unsigned long *)&pres->rs_value.at_val.at_long;
	oldcput = *lp;
	lnum = MAX(*lp, cput_sum(pjob));
	*lp = lnum;

	/* now calculate weight moving average cpu usage percentage */

	if ((dur = sampletime_ceil+1 - pjob->ji_sampletim) > PBS_MIN_CPUPERCENT_PERIOD) {
		calc_cpupercent(pjob, oldcput, lnum, dur, at);
	}
	pjob->ji_sampletim = sampletime_floor;

	/* vmem: high-water mark, rounded up to KB */
	rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size);
	assert(rd != NULL);
	pres = find_resc_entry(at, rd);
	assert(pres != NULL);
	lp_sz = &pres->rs_value.at_val.at_size.atsv_num;
	lnum_sz = (mem_sum(pjob) + 1023) >> 10;	/* as KB */
	*lp_sz = MAX(*lp_sz, lnum_sz);

	/* walltime: elapsed time since ji_stime scaled by wallfactor */
	rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size);
	assert(rd != NULL);
	pres = find_resc_entry(at, rd);
	assert(pres != NULL);
	pres->rs_value.at_val.at_long = (long)((double)(time_now - pjob->ji_qs.ji_stime) * wallfactor);

	/* mem: high-water mark, rounded up to KB */
	rd = find_resc_def(svr_resc_def, "mem", svr_resc_size);
	assert(rd != NULL);
	pres = find_resc_entry(at, rd);
	assert(pres != NULL);
	lp_sz = &pres->rs_value.at_val.at_size.atsv_num;
	lnum_sz = (resi_sum(pjob) + 1023) >> 10;	/* in KB */
	*lp_sz = MAX(*lp_sz, lnum_sz);

	return (PBSE_NONE);
}
/*
 * set_node_ct - refresh the resources derived from a "nodes" request.
 *
 * The spec string is copied out of pnodesp up front; every later step works
 * from that copy.  In pattr this sets:
 *   "nodect"    = ctnodes(spec)
 *   "neednodes" = spec, decoded through the definition's rs_decode
 *   "procct"    = count_proc(spec), plus the "procs" value when present
 * Missing entries are created.
 *
 * pnodesp  (I) the "nodes" resource entry (at_str holds the spec)
 * pattr    (I) attribute whose resource entries are searched and updated
 * actmode  (I) action mode; ATR_ACTION_RECOV returns immediately
 *
 * Returns 0 on success, PBSE_SYSTEM when a definition is missing or an entry
 * cannot be added.
 */
int set_node_ct(

  resource      *pnodesp,  /* I */
  pbs_attribute *pattr,    /* I */
  int            actmode)  /* I */

  {
  resource_def *def;
  resource     *entry;
  resource_def *procct_def;
  resource     *procct = NULL;
  resource_def *procs_def;
  resource     *procs;

  if (actmode == ATR_ACTION_RECOV)
    return(0);

  // WARNING: add_resource_entry() may re-size the underlying vector, after
  // which pnodesp can be an invalid pointer. Take a private copy of the spec
  // now and never touch pnodesp again.
  std::string nodes_val(pnodesp->rs_value.at_val.at_str);
  const char *spec = nodes_val.c_str();

  /* ---- "nodect": number of nodes named by the spec ---- */

  def = find_resc_def(svr_resc_def, "nodect", svr_resc_size);

  if (def == NULL)
    return(PBSE_SYSTEM);

  entry = find_resc_entry(pattr, def);

  if (entry == NULL)
    {
    entry = add_resource_entry(pattr, def);

    if (entry == NULL)
      return(PBSE_SYSTEM);
    }

  entry->rs_value.at_val.at_long = ctnodes(spec);
  entry->rs_value.at_flags |= ATR_VFLAG_SET;

  /* ---- "neednodes": copy of "nodes", may be altered by scheduler ---- */

  def = find_resc_def(svr_resc_def, "neednodes", svr_resc_size);

  if (def == NULL)
    return(PBSE_SYSTEM);

  entry = find_resc_entry(pattr, def);

  if (entry != NULL)
    {
    /* release the previous value before decoding the new one */
    def->rs_free(&entry->rs_value);
    }
  else
    {
    entry = add_resource_entry(pattr, def);

    if (entry == NULL)
      return(PBSE_SYSTEM);
    }

  def->rs_decode(&entry->rs_value, NULL, NULL, spec, ATR_DFLAG_ACCESS);
  entry->rs_value.at_flags |= ATR_VFLAG_SET;

  /* ---- "procct": processors in "nodes" plus "procs" ---- */

  procct_def = find_resc_def(svr_resc_def, "procct", svr_resc_size);

  if (procct_def == NULL)
    return(PBSE_SYSTEM);

  procct = find_resc_entry(pattr, procct_def);

  if (procct == NULL)
    {
    procct = add_resource_entry(pattr, procct_def);

    if (procct == NULL)
      return(PBSE_SYSTEM);
    }

  procs_def = find_resc_def(svr_resc_def, "procs", svr_resc_size);

  if (procs_def == NULL)
    return(PBSE_SYSTEM);

  procs = find_resc_entry(pattr, procs_def);

  if (procs != NULL)
    {
    procct->rs_value.at_val.at_long = procs->rs_value.at_val.at_long;
    procct->rs_value.at_val.at_long += count_proc(spec);
    }
  else
    {
    procct->rs_value.at_val.at_long = count_proc(spec);
    }

  procct->rs_value.at_flags |= ATR_VFLAG_SET;

  return(0);
  }  /* END set_node_ct() */
static job *chk_job_torun(

  struct batch_request *preq,  /* I */
  int                   setnn) /* I */

  {
  static char *id = "chk_job_torun";

  job              *pjob;

  struct rq_runjob *prun;
  int               rc;

  char              EMsg[1024];
  char              FailHost[1024];
  char              exec_host[1024];
  char              *ptr;

  prun = &preq->rq_ind.rq_run;

  if ((pjob = chk_job_request(prun->rq_jid, preq)) == 0)
    {
    /* FAILURE */

    return(NULL);
    }

  if ((pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) ||
      (pjob->ji_qs.ji_state == JOB_STATE_EXITING) ||
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_STAGEGO) ||
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)  ||
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING))
    {
    /* FAILURE - job already started */

    req_reject(PBSE_BADSTATE, 0, preq, NULL, "job already running");

    return(NULL);
    }

  if (preq->rq_type == PBS_BATCH_StageIn)
    {
    if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_STAGEIN)
      {
      /* FAILURE */

      req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

      return(NULL);
      }
    }

  if ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0)
    {
    /* FAILURE - run request not authorized */

    req_reject(PBSE_PERM, 0, preq, NULL, NULL);

    return(NULL);
    }

  if (pjob->ji_qhdr->qu_qs.qu_type != QTYPE_Execution)
    {
    /* FAILURE - job must be in execution queue */

    log_err(-1, id, "attempt to start job in non-execution queue");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "job not in execution queue");

    return(NULL);
    }

  /* where to execute the job */

#ifdef ENABLE_BLCR
  if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
#else
  if (pjob->ji_qs.ji_svrflags & (JOB_SVFLG_CHECKPOINT_FILE | JOB_SVFLG_StagedIn))
#endif
    {
    /* job has been checkpointed or files already staged in */
    /* in this case, exec_host must be already set          */

    if (prun->rq_destin && *prun->rq_destin) /* If a destination has been specified */
      {
      /* specified destination must match exec_host */

      strcpy(exec_host, pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str);

      if ((ptr = strchr(exec_host, '/')))
        * ptr = 0; /* For some reason, node name has "/0" on the end (i.e. "node0001/0"). */

      if (strcmp(prun->rq_destin, exec_host) != 0)
        {
        /* FAILURE */

        if (pjob->ji_qs.ji_svrflags & (JOB_SVFLG_CHECKPOINT_FILE))
          req_reject(PBSE_EXECTHERE, 0, preq, NULL, "allocated nodes must match checkpoint location");
        else
          req_reject(PBSE_EXECTHERE, 0, preq, NULL, "allocated nodes must match input file stagein location");

        return(NULL);
        }
      }

    if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HasNodes) == 0)
      {
      /* re-reserve nodes and leave exec_host as is */

      if ((rc = assign_hosts(  /* inside chk_job_torun() */
                  pjob,
                  pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str,
                  0,
                  FailHost,
                  EMsg)) != 0)   /* O */
        {
        req_reject(PBSE_EXECTHERE, 0, preq, FailHost, EMsg);

        return(NULL);
        }
      }
    }    /* END if (pjob->ji_qs.ji_svrflags & (JOB_SVFLG_CHECKPOINT_FILE|JOB_SVFLG_StagedIn)) */
  else
    {
    /* make sure exec gpus is clear */
    if (((pjob->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) &&
      (pjob->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL))
      {
      job_attr_def[(int)JOB_ATR_exec_gpus].at_free(
        &pjob->ji_wattr[JOB_ATR_exec_gpus]);
      }

    /* job has not run before or need not run there again */
    /* reserve nodes and set new exec_host */
    if ((prun->rq_destin == NULL) || (prun->rq_destin[0] == '\0'))
      {
      /* it is possible for the scheduler to pass a hostlist using the 
       * rq_extend field--we should use it as the given list
       * as an alternative to rq_destin */

      rc = assign_hosts(pjob, preq->rq_extend, 1, FailHost, EMsg);  /* inside chk_job_torun() */
      }
    else
      {
      rc = assign_hosts(pjob, prun->rq_destin, 1, FailHost, EMsg);  /* inside chk_job_torun() */
      }

    if (rc != 0)
      {
      /* FAILURE - cannot essign correct hosts */

      req_reject(rc, 0, preq, FailHost, EMsg);

      return(NULL);
      }
    }

  if (setnn == 1)
    {
#ifdef TDEV
    /* what should neednodes be set to? */

    resource_def *DRes;  /* resource definition */

    resource *JRes;      /* resource on job */

    attribute *Attr;     /* 'neednodes' attribute */

    Attr = &pjob->ji_wattr[(int)JOB_ATR_resource];

    DRes = find_resc_def(svr_resc_def, "neednodes", svr_resc_size);

    JRes = find_resc_entry(Attr, DRes);

    if ((JRes == NULL) ||
        ((JRes->rs_value.at_flags & ATR_VFLAG_SET) == 0))
      {
      /* resource does not exist or value is not set */

      if (JRes == NULL)
        {
        JRes = add_resource_entry(Attr, DRes);
        }

      if (JRes != NULL)
        {
        if (DRes->rs_defin->rs_set(
              &JRes->rs_value,
              &DRes->rs_value,
              SET) == 0)
          {
          JRes->rs_value.at_flags |= ATR_VFLAG_SET;
          }
        }
      }

#endif /* TDEV */
    }    /* END if (setnn == 1) */

  return(pjob);
  }  /* END chk_job_torun() */
Exemple #9
0
int CPACreatePartition(

  job              *pjob,   /* I */
  struct var_table *vtab)   /* I */

  {
  char id[] = "CPACreatePartition";

  cpa_node_req_t *NodeReq;

  int rc;

  char *Value;

  char *Spec;

  int   PPN;
  int   Flags;
  int   Size = 0;
  int   UID;
  char *AcctID = NULL;
  char *JobID;
  char *HostList = NULL;  /* scheduler specified list of hosts to allocate (optional) */

  unsigned long      ParID;       /* O - partition id */
  unsigned long long AdminCookie; /* O - admin cookie */
  unsigned long long AllocCookie; /* O - alloc cookie */
  char longbuf[1024];

  resource            *presc;         /* Requested Resource List */
  resource_def        *prd;
  attribute           *pattr;

  int                  rc;

  cpa_nid_list_t       Wanted = NULL;

  /* first, get the size, uid, jobid, and subnodelist from the job */

  pattr = &pjob->ji_wattr[JOB_ATR_resource];
  prd = find_resc_def(svr_resc_def, "size", svr_resc_size);
  presc = find_resc_entry(pattr, prd);

  if (presc != NULL)
    {
    Size = presc->rs_value.at_val.at_long;
    }

  UID = pjob->ji_qs.ji_un.ji_momt.ji_exuid;

  if ((Size <= 0) || (UID < 0))
    {
    /* FAILURE */

    sprintf(log_buffer, "ERROR:  invalid parameters:  Size: %d  UID: %d  \n",
            Size,
            UID);

    log_err(-1, id, log_buffer);

    return(1);
    }

  pattr = &pjob->ji_wattr[JOB_ATR_resource];

  prd = find_resc_def(svr_resc_def, "subnode_list", svr_resc_size);
  presc = find_resc_entry(pattr, prd);

  if (presc != NULL)
    {
    HostList = presc->rs_value.at_val.at_string;
    }

  if (pjob->ji_wattr[JOB_ATR_account].at_flags & ATR_VFLAG_SET)
    {
    AcctID = pjob->ji_wattr[JOB_ATR_account].at_val.at_str;
    }

  JobID = pjob->ji_qs.ji_jobid;

  PPN = 1;       /* NOTE: not really supported w/in CPA, always use 1 */
  Flags = 0;     /* NOTE: only allocate compute hosts, always use 0 */
  Spec = NULL;   /* NOTE: required node specification, not used */

  if (HostList != NULL)
    {
    char tmpBuffer[256000];
    int  index;

    rc = nid_list_create(
           0,
           MaxListSize,  /* max count */
           0,
           MaxNID,       /* max value */
           &Wanted);     /* O */

    if (rc != 0)
      {
      /* FAILURE */

      printf("nid_list_create: rc=%d (%s)\n",
             rc,
             cpa_rc2str(rc));

      return(1);
      }

    strncpy(tmpBuffer, HostList, sizeof(tmpBuffer));

    tmpBuffer[sizeof(tmpBuffer) - 1] = '\0';

    for (index = 0;tmpBuffer[index] != '\0';index++)
      {
      if (tmpBuffer[index] == ':')
        tmpBuffer[index] = ',';
      }

    rc = nid_list_destringify(tmpBuffer, Wanted);

    if (rc != 0)
      {
      /* FAILURE */

      printf("nid_list_destringify: rc=%d (%s)\n",
             rc,
             cpa_rc2str(rc));

      nid_list_destroy(Wanted);

      return(1);
      }

    if (loglevel >= 3)
      {
      char *buf = NULL;
      int   bufsize = 0;

      rc = nid_list_stringify(Wanted, &buf, &bufsize);

      if (rc == 0)
        {
        snprintf(log_buffer, sizeof(log_buffer), "CPANodeList: %s\n",
                 buf);
        }
      else
        {
        snprintf(log_buffer, sizeof(log_buffer), "CPA nid_list_stringify: rc=%d\n",
                 rc);
        }

      log_record(

        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);

      free(buf);
      }
    }
  else
    {
    Wanted = NULL;
    }

  NodeReq = cpa_new_node_req(

              Size, /* number of procs/nodes required by job */
              PPN,
              Flags,
              Spec,
              Wanted);  /* I */

  if (NodeReq == NULL)
    {
    /* FAILURE:  cannot alloc memory for node req */

    sprintf(log_buffer, "cpa_new_node_req: NULL\n");

    log_err(-1, id, log_buffer);

    nid_list_destroy(Wanted);

    return(1);
    }

  rc = cpa_create_partition(

         NodeReq,
         CPA_BATCH,
         CPA_NOT_SPECIFIED,
         UID,
         (AcctID != NULL) ? AcctID : "DEFAULT",
         (cpa_partition_id_t *) & ParID, /* O */
         (cpa_cookie_t *) & AdminCookie, /* O */
         (cpa_cookie_t *) & AllocCookie);  /* O */

  if (rc != 0)
    {
    /* FAILURE */

    sprintf(log_buffer, "cpa_create_partition: rc=%d (%s)\n",
            rc,
            cpa_rc2str(rc));

    log_err(-1, id, log_buffer);

    nid_list_destroy(Wanted);

    return(1);
    }

  rc = cpa_assign_partition(

         (cpa_partition_id_t)ParID,
         (cpa_cookie_t)AdminCookie,
         JobID,
         1);     /* NOT CURRENTLY USED - should be set to NID of 'master host' */

  /* free memory, nid list no longer required */

  nid_list_destroy(Wanted);

  if (rc != 0)
    {
    /* FAILURE */

    sprintf(log_buffer, "cpa_assign_partition: rc=%d (%s)\n",
            rc,
            cpa_rc2str(rc));

    log_err(-1, id, log_buffer);

    return(1);
    }

  /* save the partition and cookies in the job and vtab */

  prd = find_resc_def(svr_resc_def, "cpapartition", svr_resc_size);

  if (prd == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((presc = find_resc_entry(pattr, prd)) == NULL)
    {
    if ((presc = add_resource_entry(pattr, prd)) == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }
  else
    {
    prd->rs_free(&presc->rs_value);
    }

  snprintf(longbuf, 1023, "%lu", ParID);

  prd->rs_decode(&presc->rs_value, NULL, NULL, longbuf);
  presc->rs_value.at_flags |= ATR_VFLAG_SET;
  bld_env_variables(vtab, "BATCH_PARTITION_ID", longbuf);

  prd = find_resc_def(svr_resc_def, "cpaadmincookie", svr_resc_size);

  if (prd == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((presc = find_resc_entry(pattr, prd)) == NULL)
    {
    if ((presc = add_resource_entry(pattr, prd)) == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }
  else
    {
    prd->rs_free(&presc->rs_value);
    }

  snprintf(longbuf, 1023, "%llu", AdminCookie);

  prd->rs_decode(&presc->rs_value, NULL, NULL, longbuf);
  presc->rs_value.at_flags |= ATR_VFLAG_SET;
  /* admincookie doesn't go into job env */

  prd = find_resc_def(svr_resc_def, "cpaalloccookie", svr_resc_size);

  if (prd == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((presc = find_resc_entry(pattr, prd)) == NULL)
    {
    if ((presc = add_resource_entry(pattr, prd)) == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }
  else
    {
    prd->rs_free(&presc->rs_value);
    }

  snprintf(longbuf, 1023, "%llu", AllocCookie);

  prd->rs_decode(&presc->rs_value, NULL, NULL, longbuf);
  presc->rs_value.at_flags |= ATR_VFLAG_SET;
  bld_env_variables(vtab, "BATCH_ALLOC_COOKIE", longbuf);

  bld_env_variables(vtab, "BATCH_JOBID", JobID);

  return(0);
  }  /* END CPACreatePartition() */
Exemple #10
0
/**
 * @brief
 *	Derive the "nodect" resource from the "nodes" resource and reconcile
 *	"ncpus" with the cpu count found in the node specification.
 *
 * @param[in]     pnodesp - the "nodes" resource entry; its string value is
 *			    the node specification that is validated/counted
 * @param[in,out] pattr   - attribute holding the resource list in which
 *			    "nodect" and "ncpus" are looked up or created
 * @param[in]     pobj    - unused by this routine
 * @param[in]     type    - unused by this routine
 * @param[in]     actmode - action mode; ATR_ACTION_RECOV makes this a no-op,
 *			    ATR_ACTION_NEW enables the ncpus consistency check
 *
 * @return	int
 * @retval	0		success, or nothing to do (recovery, "nodes"
 *				not set, or built with PBS_MOM defined)
 * @retval	PBSE_SYSTEM	a resource definition or entry is unavailable
 * @retval	PBSE_BADATVAL	an explicitly set "ncpus" conflicts with the
 *				node specification
 * @retval	other nonzero	value returned by validate_nodespec()
 */
int
set_node_ct(resource *pnodesp, attribute *pattr, void *pobj, int type, int actmode)
{
#ifndef PBS_MOM
	int		 nn;		/* validation result, then num of nodes */
	int		 nt;		/* cpu/task count returned by ctcpus() */
	int		 hcpp = 0;	/* passed to ctcpus(); nonzero: has :cpp in string */
	long		 nc;		/* explicitly requested ncpus value */
	resource	*pnct;
	resource	*pncpus;
	resource_def	*pndef;

	if ((actmode == ATR_ACTION_RECOV) ||
		((pnodesp->rs_value.at_flags & ATR_VFLAG_SET) == 0))
		return (0);

	/* first validate the spec; a nonzero result is passed to the caller */

	if ((nn = validate_nodespec(pnodesp->rs_value.at_val.at_str)) != 0)
		return nn;

	/* Set "nodect" to count of nodes in "nodes" */

	pndef = find_resc_def(svr_resc_def, "nodect", svr_resc_size);
	if (pndef == NULL)
		return (PBSE_SYSTEM);

	if ((pnct = find_resc_entry(pattr, pndef)) == NULL) {
		if ((pnct = add_resource_entry(pattr, pndef)) == NULL)
			return (PBSE_SYSTEM);
	}

	nn = ctnodes(pnodesp->rs_value.at_val.at_str);
	pnct->rs_value.at_val.at_long = nn;
	pnct->rs_value.at_flags |= ATR_VFLAG_SET|ATR_VFLAG_MODCACHE;

	/* find the number of cpus specified in the node string */

	nt = ctcpus(pnodesp->rs_value.at_val.at_str, &hcpp);

	/* Is "ncpus" set as a separate resource? */

	pndef = find_resc_def(svr_resc_def, "ncpus", svr_resc_size);
	if (pndef == NULL)
		return (PBSE_SYSTEM);
	if ((pncpus = find_resc_entry(pattr, pndef)) == NULL) {
		if ((pncpus = add_resource_entry(pattr, pndef)) == NULL)
			return (PBSE_SYSTEM);
	}

	if (((pncpus->rs_value.at_flags & (ATR_VFLAG_SET | ATR_VFLAG_DEFLT)) ==
		ATR_VFLAG_SET) && (actmode == ATR_ACTION_NEW)) {
		/* ncpus is already set and not a default and new job */

		nc = pncpus->rs_value.at_val.at_long;
		if (hcpp && (nt != nc)) {
			/* if cpp string specified, this is an error */
			return (PBSE_BADATVAL);
		}
		if (nt == 0) {
			/*
			 * a zero count would make the remainder below
			 * undefined (divide by zero); reject the request
			 * instead of crashing
			 */
			return (PBSE_BADATVAL);
		}
		if ((nc % nt) != 0) {
			/* ncpus must be multiple of number of tasks */
			return (PBSE_BADATVAL);
		}

	} else {
		/* ncpus is not set or not a new job (qalter being done) */
		/* force ncpus to the correct thing */
		pncpus->rs_value.at_val.at_long = nt;
		pncpus->rs_value.at_flags |= (ATR_VFLAG_SET|ATR_VFLAG_MODCACHE);
	}

#endif	/* not MOM */
	return (0);
}
Exemple #11
0
/*
 * decode_resc - decode a single resource value into the resource list
 * kept in a pbs_attribute.
 *
 * The resource definition is looked up by rescn; when no definition
 * matches, the last entry of svr_resc_def is used instead and
 * PBSE_UNKRESC is remembered so the caller can still refuse unknown
 * resources.  The matching resource entry is found or created, write
 * permission is checked, the attribute is flagged as set and modified,
 * and the definition's rs_decode routine is invoked on the entry.
 *
 * Returns:
 *   PBSE_INTERNAL  patr is NULL
 *   PBSE_UNKRESC   rescn is NULL, or the name was unknown and the decode
 *                  routine itself returned 0
 *   PBSE_SYSTEM    a new resource entry could not be added
 *   PBSE_ATTRRO    the resource is not writable with the given perm
 *   otherwise      the nonzero code from rs_decode, or 0 when the
 *                  resource was known and rs_decode returned 0
 */
int decode_resc(

  pbs_attribute *patr,  /* I/O - attribute whose resource list is updated */
  char          *name,  /* I - attribute name, handed to rs_decode */
  char          *rescn, /* I - resource name, used for the definition lookup */
  char          *val,   /* I - resource value, handed to rs_decode */
  int            perm)  /* I - access permissions */

  {
  resource_def *def;
  resource     *entry;
  int           unknown_rc = 0;
  int           decode_rc;

  if (patr == NULL)
    return(PBSE_INTERNAL);

  if (rescn == NULL)
    return(PBSE_UNKRESC);

  /* an attribute that is not yet set gets its list head cleared first */

  if ((patr->at_flags & ATR_VFLAG_SET) == 0)
    CLEAR_HEAD(patr->at_val.at_list);

  def = find_resc_def(svr_resc_def, rescn, svr_resc_size);

  if (def == NULL)
    {
    /*
     * no definition with a matching name: fall back to the last entry
     * of the definition table, but keep PBSE_UNKRESC to hand back in
     * case the caller does not wish to accept unknown resources
     */

    unknown_rc = PBSE_UNKRESC;

    def = &svr_resc_def[svr_resc_size - 1];
    }

  entry = find_resc_entry(patr, def);

  if (entry == NULL)
    {
    /* no current resource entry, add it */

    entry = add_resource_entry(patr, def);

    if (entry == NULL)
      return(PBSE_SYSTEM);
    }

  /* note special use of ATR_DFLAG_ACCESS, see server/attr_recov() */

  if (((entry->rs_defin->rs_flags & perm & ATR_DFLAG_WRACC) == 0) &&
      (perm != ATR_DFLAG_ACCESS))
    return(PBSE_ATTRRO);

  patr->at_flags |= ATR_VFLAG_SET | ATR_VFLAG_MODIFY;

  decode_rc = def->rs_decode(&entry->rs_value, name, rescn, val, perm);

  /*
   * a nonzero code from the decode routine is returned as is; when the
   * decode routine returned 0 the result is 0 for a known resource or
   * PBSE_UNKRESC for an unknown one
   */

  return((decode_rc != 0) ? decode_rc : unknown_rc);
  }